name: CI
-# on: [push, pull_request]
-on: [pull_request]
+on: [push, pull_request]
jobs:
direct:
strategy:
matrix:
os: [ubuntu, macos]
- python-version: [2.7, 3.6, 3.7, 3.8, 3.9]
+ python-version: ['2.7', '3.6', '3.7', '3.8', '3.9', '3.10']
exclude:
  # Run only the latest 2.x and 3.x on macOS.
  # Versions are quoted so each entry has the same YAML type (string) as the
  # quoted python-version values in the matrix.
  - os: macos
    python-version: '3.7'
  - os: macos
    python-version: '3.8'
+  - os: macos
+    python-version: '3.9'
steps:
- name: Checkout pysam
strategy:
matrix:
os: [ubuntu, macos]
- python-version: [3.9]
+ python-version: ['3.10']
steps:
- name: Checkout pysam
strategy:
matrix:
os: [ubuntu]
- python-version: [3.7]
+ python-version: ['3.7']
defaults:
run:
shell: bash -l {0} # needed for conda activation
runs-on: ${{ matrix.os }}
strategy:
matrix:
- os: [ubuntu-20.04, macos-10.15] # windows-2019,
-
+ os: [ubuntu-latest, macos-10.15] # windows-2019,
+ cibw_archs: ["auto"]
+ # include:
+ # - os: ubuntu-latest
+ # cibw_archs: "aarch64"
+
steps:
- name: Checkout pysam
uses: actions/checkout@v2
uses: actions/setup-python@v2
with:
python-version: '3.8'
-
+
- name: Install prerequisite Python libraries
run: |
- python -m pip install --upgrade pip
+ python -m pip install --upgrade pip
pip install cython pytest pytest-pep8
- name: Build wheels for linux
if: runner.os == 'Linux'
- uses: pypa/cibuildwheel@v2.1.2
+ uses: pypa/cibuildwheel@v2.2.2
env:
- CIBW_BUILD: cp36-* cp37-* cp38-* cp39-*
+ CIBW_BUILD: cp36-* cp37-* cp38-* cp39-* cp310-*
+ CIBW_SKIP: "*musllinux*"
CIBW_BEFORE_BUILD: yum install -y libcurl-devel zlib-devel bzip2-devel xz-devel && pip install cython
- CIBW_MANYLINUX_X86_64_IMAGE: manylinux1
- CIBW_MANYLINUX_I686_IMAGE: manylinux1
+ CIBW_ARCHS: ${{ matrix.cibw_archs }}
- name: Build wheels for macos
if: runner.os != 'Linux'
- uses: pypa/cibuildwheel@v2.1.2
+ uses: pypa/cibuildwheel@v2.2.2
env:
- CIBW_BUILD: cp36-* cp37-* cp38-* cp39-*
+ CIBW_BUILD: cp36-* cp37-* cp38-* cp39-* cp310-*
CIBW_BEFORE_BUILD: pip install cython
+ CIBW_ARCHS: ${{ matrix.cibw_archs }}
- name: Upload artifacts
uses: actions/upload-artifact@v2
if: github.event_name == 'release' && github.event.action == 'published'
uses: pypa/gh-action-pypi-publish@master
with:
- user: __token__
password: ${{ secrets.PYPI_API_TOKEN }}
--- /dev/null
+FROM ubi8:latest
+
+# Install Python 3 tooling and pigz, expose python3 as `python` under
+# /usr/local/bin, upgrade pip, and drop the yum caches in the same layer.
+# `yum update` needs -y: an image build cannot answer the confirmation prompt,
+# so without it the step aborts as soon as any update is pending.
+RUN yum update -y \
+ && yum install -y python3-pip python3-devel pigz \
+ && cd /usr/local/bin \
+ && ln -s /usr/bin/python3 python \
+ && pip3 --no-cache-dir install --upgrade pip \
+ && yum clean all \
+ && echo "system packages installed"
+
+# Install pysam from PyPI using the upgraded pip
+RUN python -m pip install pysam
+
+WORKDIR /opt/
compilation options. Especially for OS X this will potentially save a
lot of trouble.
-The current version of pysam wraps 3rd-party code from htslib-1.13, samtools-1.13, and bcftools-1.13.
+The current version of pysam wraps 3rd-party code from htslib-1.14, samtools-1.14, and bcftools-1.14.
Pysam is available through `pypi
<https://pypi.python.org/pypi/pysam>`_. To install, type::
bcf_update_filter(buf->out_hdr, out, rec->d.flt, rec->d.n_flt);
}
}
+int copy_string_field(char *src, int isrc, int src_len, kstring_t *dst, int idst);
static void _split_table_set_info(abuf_t *buf, bcf_info_t *info, merge_rule_t mode)
{
const char *tag = bcf_hdr_int2id(buf->hdr,BCF_DT_ID,info->key);
int type = bcf_hdr_id2type(buf->hdr,BCF_HL_INFO,info->key);
int len = bcf_hdr_id2length(buf->hdr,BCF_HL_INFO,info->key);
if ( len==BCF_VL_G ) return; // todo: Number=G INFO tags
- if ( type==BCF_HT_STR && len!=BCF_VL_FIXED && len!=BCF_VL_VAR ) return; // todo: Number=A,R,G for strings
if ( type==BCF_HT_LONG ) return; // todo: 64bit integers
bcf1_t *rec = buf->split.rec;
// Check for incorrect number of values. Note this check does not consider all values missing
// and will remove annotations that don't pass.
- if ( (len==BCF_VL_A && nval != rec->n_allele - 1) || (len==BCF_VL_R && nval != rec->n_allele) ) return;
+ if ( type==BCF_HT_INT || type==BCF_HT_REAL )
+ {
+ if ( (len==BCF_VL_A && nval != rec->n_allele - 1) || (len==BCF_VL_R && nval != rec->n_allele) ) return;
+ }
if ( buf->mtmp2 < buf->mtmp )
{
buf->mtmp2 = buf->mtmp;
}
+ const int num_size = 4;
+ assert( num_size==sizeof(int32_t) && num_size==sizeof(float) );
int32_t missing = bcf_int32_missing;
void *missing_ptr = (void*)&missing;
if ( type==BCF_HT_REAL ) bcf_float_set_missing(*((float*)missing_ptr));
+ int32_t vector_end = bcf_int32_vector_end;
+ void *vector_end_ptr = (void*)&vector_end;
+ if ( type==BCF_HT_REAL ) bcf_float_set_vector_end(*((float*)vector_end_ptr));
int iout,i;
for (iout=0; iout<buf->split.nout; iout++)
int ret = 0;
if ( len==BCF_VL_FIXED || len==BCF_VL_VAR )
ret = bcf_update_info(buf->out_hdr, out, tag, type==BCF_HT_FLAG ? NULL : buf->tmp, nval, type);
- else if ( len==BCF_VL_A )
+ else if ( len==BCF_VL_A && type!=BCF_HT_STR )
{
int iori = buf->split.atoms[iout]->ial - 1;
assert( iori<nval );
- memcpy(buf->tmp2,buf->tmp+4*iori,4);
+ if ( !memcmp(vector_end_ptr,buf->tmp+num_size*iori,num_size) )
+ memcpy(buf->tmp2,missing_ptr,num_size);
+ else
+ memcpy(buf->tmp2,buf->tmp+num_size*iori,num_size);
if ( star_allele )
- memcpy(buf->tmp2+4,missing_ptr,4);
+ memcpy(buf->tmp2+num_size,missing_ptr,num_size);
ret = bcf_update_info(buf->out_hdr, out, tag, buf->tmp2, 1 + star_allele, type);
}
- else if ( len==BCF_VL_R )
+ else if ( len==BCF_VL_A && type==BCF_HT_STR )
{
- memcpy(buf->tmp2,buf->tmp,4); // REF contributes to all records
+ int iori = buf->split.atoms[iout]->ial - 1;
+ kstring_t dst;
+ dst.l = 0; dst.m = buf->mtmp2; dst.s = (char*)buf->tmp2;
+ kputc('.',&dst);
+ if ( star_allele ) kputs(",.",&dst);
+ copy_string_field(buf->tmp, iori, nval, &dst, 0);
+ if ( star_allele ) copy_string_field(".", 0, 1, &dst, 1);
+ buf->mtmp2 = dst.m;
+ buf->tmp2 = dst.s;
+ ret = bcf_update_info(buf->out_hdr, out, tag, buf->tmp2, dst.l, type);
+ }
+ else if ( len==BCF_VL_R && type!=BCF_HT_STR )
+ {
+ memcpy(buf->tmp2,buf->tmp,num_size); // REF contributes to all records
int iori = buf->split.atoms[iout]->ial;
assert( iori<nval && iori<=buf->split.nori );
- memcpy(buf->tmp2+4,buf->tmp+4*iori,4);
+ if ( !memcmp(vector_end_ptr,buf->tmp+num_size*iori,num_size) )
+ memcpy(buf->tmp2+num_size,missing_ptr,num_size);
+ else
+ memcpy(buf->tmp2+num_size,buf->tmp+num_size*iori,num_size);
if ( type==BCF_HT_INT && mode==M_SUM )
{
uint8_t *tbl = buf->split.tbl + iout*buf->split.nori;
}
}
if ( star_allele )
- memcpy(buf->tmp2+8,missing_ptr,4);
+ memcpy(buf->tmp2+2*num_size,missing_ptr,num_size);
ret = bcf_update_info(buf->out_hdr, out, tag, buf->tmp2, 2 + star_allele, type);
}
+ else if ( len==BCF_VL_R && type==BCF_HT_STR )
+ {
+ int iori = buf->split.atoms[iout]->ial - 1;
+ kstring_t dst;
+ dst.l = 0; dst.m = buf->mtmp2; dst.s = (char*)buf->tmp2;
+ kputs(".,.",&dst);
+ if ( star_allele ) kputs(",.",&dst);
+ copy_string_field(buf->tmp, 0, nval, &dst, 0);
+ copy_string_field(buf->tmp, iori+1, nval, &dst, 1);
+ if ( star_allele ) copy_string_field(".", 0, 1, &dst, 2);
+ buf->mtmp2 = dst.m;
+ buf->tmp2 = dst.s;
+ ret = bcf_update_info(buf->out_hdr, out, tag, buf->tmp2, dst.l, type);
+ }
if ( ret!=0 ) error("An error occurred while updating INFO/%s\n",tag);
}
}
int type = bcf_hdr_id2type(buf->hdr,BCF_HL_FMT,fmt->id);
int len = bcf_hdr_id2length(buf->hdr,BCF_HL_FMT,fmt->id);
- if ( type==BCF_HT_STR && len!=BCF_VL_FIXED && len!=BCF_VL_VAR ) return; // todo: Number=A,R,G for strings
+ if ( type==BCF_HT_STR && len==BCF_VL_G ) return; // possible todo: Number=G for strings
if ( type==BCF_HT_LONG ) return; // todo: 64bit integers
const int num_size = 4;
int32_t missing = bcf_int32_missing;
void *missing_ptr = (void*)&missing;
if ( type==BCF_HT_REAL ) bcf_float_set_missing(*((float*)missing_ptr));
+ int32_t vector_end = bcf_int32_vector_end;
+ void *vector_end_ptr = (void*)&vector_end;
+ if ( type==BCF_HT_REAL ) bcf_float_set_vector_end(*((float*)vector_end_ptr));
bcf1_t *rec = buf->split.rec;
int mtmp = ( type==BCF_HT_INT || type==BCF_HT_REAL ) ? buf->mtmp/num_size : buf->mtmp; // number of items
int nval = bcf_get_format_values(buf->hdr,rec,tag,&buf->tmp,&mtmp,type);
if ( type==BCF_HT_INT || type==BCF_HT_REAL ) buf->mtmp = mtmp*num_size; // number of bytes
- if ( len==BCF_VL_G && nval!=nsmpl*rec->n_allele && nval!=nsmpl*rec->n_allele*(rec->n_allele+1)/2 ) return; // not haploid nor diploid
+ if ( type==BCF_HT_INT || type==BCF_HT_REAL )
+ {
+ if ( len==BCF_VL_G && nval!=nsmpl*rec->n_allele && nval!=nsmpl*rec->n_allele*(rec->n_allele+1)/2 ) return; // not haploid nor diploid
- // Check for incorrect number of values. Note this check does not consider all values missing
- // and will remove annotations that don't pass.
- if ( (len==BCF_VL_A && nval != nsmpl*(rec->n_allele - 1)) || (len==BCF_VL_R && nval != nsmpl*rec->n_allele) ) return;
+ // Check for incorrect number of values. Note this check does not consider all values missing
+ // and will remove annotations that don't pass.
+ if ( (len==BCF_VL_A && nval != nsmpl*(rec->n_allele - 1)) || (len==BCF_VL_R && nval != nsmpl*rec->n_allele) ) return;
+ }
// Increase buffer size to accommodate star allele
int nval1 = nval / nsmpl;
mtmp = buf->mtmp;
- if ( (len==BCF_VL_A || len==BCF_VL_R) && mtmp < num_size*nsmpl*(nval1+1) ) mtmp = num_size*nsmpl*(nval1+1); // +1 for the possibility of the star allele
- else if ( len==BCF_VL_G && mtmp < num_size*nsmpl*(nval1+3) ) mtmp = num_size*nsmpl*(nval1+3);
+ if ( type==BCF_HT_INT || type==BCF_HT_REAL )
+ {
+ if ( (len==BCF_VL_A || len==BCF_VL_R) && mtmp < num_size*nsmpl*(nval1+1) ) mtmp = num_size*nsmpl*(nval1+1); // +1 for the possibility of the star allele
+ else if ( len==BCF_VL_G && mtmp < num_size*nsmpl*(nval1+3) ) mtmp = num_size*nsmpl*(nval1+3);
+ }
+ else if ( type==BCF_HT_STR )
+ {
+ if ( (len==BCF_VL_A || len==BCF_VL_R) && mtmp < nsmpl*(nval1+2) ) mtmp = nsmpl*(nval1+2); // +2 for the possibility of the star allele, ",."
+ else if ( len==BCF_VL_G && mtmp < nsmpl*(nval1+6) ) mtmp = nsmpl*(nval1+6);
+ }
if ( buf->mtmp2 < mtmp )
{
int ret = 0;
if ( len==BCF_VL_FIXED || len==BCF_VL_VAR )
ret = bcf_update_format(buf->out_hdr, out, tag, buf->tmp, nval, type);
- else if ( len==BCF_VL_A )
+ else if ( len==BCF_VL_A && type!=BCF_HT_STR )
{
int iori = buf->split.atoms[iout]->ial - 1;
assert( iori<nval );
{
void *src = buf->tmp + nval1*num_size*i;
void *dst = buf->tmp2 + num_size*i*(star_allele+1);
- memcpy(dst,src+iori*num_size,num_size);
+ if ( !memcmp(vector_end_ptr,src+iori*num_size,num_size) )
+ memcpy(dst,missing_ptr,num_size);
+ else
+ memcpy(dst,src+iori*num_size,num_size);
if ( star_allele )
memcpy(dst+num_size,missing_ptr,num_size);
}
ret = bcf_update_format(buf->out_hdr, out, tag, buf->tmp2, nsmpl*(star_allele+1), type);
}
- else if ( len==BCF_VL_R )
+ else if ( (len==BCF_VL_A || len==BCF_VL_R) && type==BCF_HT_STR )
+ {
+ int ioff = len==BCF_VL_R ? 1 : 0;
+ int iori = buf->split.atoms[iout]->ial - 1;
+ int nval1_dst = star_allele ? nval1 + 2 : nval1;
+ memset(buf->tmp2,0,nval1_dst*nsmpl);
+ for (i=0; i<nsmpl; i++)
+ {
+ kstring_t dst;
+ dst.l = 0; dst.m = nval1_dst; dst.s = (char*)buf->tmp2 + nval1_dst*i;
+ kputc_('.',&dst);
+ if ( star_allele ) kputsn_(",.",2,&dst);
+ if ( len==BCF_VL_R )
+ {
+ kputsn_(",.",2,&dst);
+ copy_string_field(buf->tmp+nval1*i, 0, nval1, &dst, 0);
+ }
+ copy_string_field(buf->tmp+nval1*i, iori+ioff, nval1, &dst, 0+ioff);
+ if ( star_allele ) copy_string_field(".", 0, 1, &dst, 1+ioff);
+ }
+ ret = bcf_update_format(buf->out_hdr, out, tag, buf->tmp2, nval1_dst*nsmpl, type);
+ }
+ else if ( len==BCF_VL_R && type!=BCF_HT_STR )
{
int iori = buf->split.atoms[iout]->ial;
assert( iori<=nval );
void *dst = buf->tmp2 + num_size*i*(star_allele+2);
memcpy(dst,src,num_size);
memcpy(dst+num_size,src+iori*num_size,num_size);
-
if ( type==BCF_HT_INT && mode==M_SUM )
{
uint8_t *tbl = buf->split.tbl + iout*buf->split.nori;
}
ret = bcf_update_format(buf->out_hdr, out, tag, buf->tmp2, nsmpl*(star_allele+2), type);
}
- else if ( len==BCF_VL_G )
+ else if ( len==BCF_VL_G && type!=BCF_HT_STR )
{
int iori = buf->split.atoms[iout]->ial;
int i01 = bcf_alleles2gt(0,iori);
_split_table_set_format(buf, &rec->d.fmt[i], mode);
}
+
+ // Check that at least one FORMAT field was added, if not, the number of samples must be set manually
+ for (i=0; i<buf->split.nout; i++)
+ {
+ bcf1_t *out = buf->vcf[rbuf_kth(&buf->rbuf,i)];
+ if ( !out->n_sample ) out->n_sample = rec->n_sample;
+ }
}
void abuf_push(abuf_t *buf, bcf1_t *rec)
bcf_update_filter(buf->out_hdr, out, rec->d.flt, rec->d.n_flt);
}
}
+int copy_string_field(char *src, int isrc, int src_len, kstring_t *dst, int idst);
static void _split_table_set_info(abuf_t *buf, bcf_info_t *info, merge_rule_t mode)
{
const char *tag = bcf_hdr_int2id(buf->hdr,BCF_DT_ID,info->key);
int type = bcf_hdr_id2type(buf->hdr,BCF_HL_INFO,info->key);
int len = bcf_hdr_id2length(buf->hdr,BCF_HL_INFO,info->key);
if ( len==BCF_VL_G ) return; // todo: Number=G INFO tags
- if ( type==BCF_HT_STR && len!=BCF_VL_FIXED && len!=BCF_VL_VAR ) return; // todo: Number=A,R,G for strings
if ( type==BCF_HT_LONG ) return; // todo: 64bit integers
bcf1_t *rec = buf->split.rec;
// Check for incorrect number of values. Note this check does not consider all values missing
// and will remove annotations that don't pass.
- if ( (len==BCF_VL_A && nval != rec->n_allele - 1) || (len==BCF_VL_R && nval != rec->n_allele) ) return;
+ if ( type==BCF_HT_INT || type==BCF_HT_REAL )
+ {
+ if ( (len==BCF_VL_A && nval != rec->n_allele - 1) || (len==BCF_VL_R && nval != rec->n_allele) ) return;
+ }
if ( buf->mtmp2 < buf->mtmp )
{
buf->mtmp2 = buf->mtmp;
}
+ const int num_size = 4;
+ assert( num_size==sizeof(int32_t) && num_size==sizeof(float) );
int32_t missing = bcf_int32_missing;
void *missing_ptr = (void*)&missing;
if ( type==BCF_HT_REAL ) bcf_float_set_missing(*((float*)missing_ptr));
+ int32_t vector_end = bcf_int32_vector_end;
+ void *vector_end_ptr = (void*)&vector_end;
+ if ( type==BCF_HT_REAL ) bcf_float_set_vector_end(*((float*)vector_end_ptr));
int iout,i;
for (iout=0; iout<buf->split.nout; iout++)
int ret = 0;
if ( len==BCF_VL_FIXED || len==BCF_VL_VAR )
ret = bcf_update_info(buf->out_hdr, out, tag, type==BCF_HT_FLAG ? NULL : buf->tmp, nval, type);
- else if ( len==BCF_VL_A )
+ else if ( len==BCF_VL_A && type!=BCF_HT_STR )
{
int iori = buf->split.atoms[iout]->ial - 1;
assert( iori<nval );
- memcpy(buf->tmp2,buf->tmp+4*iori,4);
+ if ( !memcmp(vector_end_ptr,buf->tmp+num_size*iori,num_size) )
+ memcpy(buf->tmp2,missing_ptr,num_size);
+ else
+ memcpy(buf->tmp2,buf->tmp+num_size*iori,num_size);
if ( star_allele )
- memcpy(buf->tmp2+4,missing_ptr,4);
+ memcpy(buf->tmp2+num_size,missing_ptr,num_size);
ret = bcf_update_info(buf->out_hdr, out, tag, buf->tmp2, 1 + star_allele, type);
}
- else if ( len==BCF_VL_R )
+ else if ( len==BCF_VL_A && type==BCF_HT_STR )
{
- memcpy(buf->tmp2,buf->tmp,4); // REF contributes to all records
+ int iori = buf->split.atoms[iout]->ial - 1;
+ kstring_t dst;
+ dst.l = 0; dst.m = buf->mtmp2; dst.s = (char*)buf->tmp2;
+ kputc('.',&dst);
+ if ( star_allele ) kputs(",.",&dst);
+ copy_string_field(buf->tmp, iori, nval, &dst, 0);
+ if ( star_allele ) copy_string_field(".", 0, 1, &dst, 1);
+ buf->mtmp2 = dst.m;
+ buf->tmp2 = dst.s;
+ ret = bcf_update_info(buf->out_hdr, out, tag, buf->tmp2, dst.l, type);
+ }
+ else if ( len==BCF_VL_R && type!=BCF_HT_STR )
+ {
+ memcpy(buf->tmp2,buf->tmp,num_size); // REF contributes to all records
int iori = buf->split.atoms[iout]->ial;
assert( iori<nval && iori<=buf->split.nori );
- memcpy(buf->tmp2+4,buf->tmp+4*iori,4);
+ if ( !memcmp(vector_end_ptr,buf->tmp+num_size*iori,num_size) )
+ memcpy(buf->tmp2+num_size,missing_ptr,num_size);
+ else
+ memcpy(buf->tmp2+num_size,buf->tmp+num_size*iori,num_size);
if ( type==BCF_HT_INT && mode==M_SUM )
{
uint8_t *tbl = buf->split.tbl + iout*buf->split.nori;
}
}
if ( star_allele )
- memcpy(buf->tmp2+8,missing_ptr,4);
+ memcpy(buf->tmp2+2*num_size,missing_ptr,num_size);
ret = bcf_update_info(buf->out_hdr, out, tag, buf->tmp2, 2 + star_allele, type);
}
+ else if ( len==BCF_VL_R && type==BCF_HT_STR )
+ {
+ int iori = buf->split.atoms[iout]->ial - 1;
+ kstring_t dst;
+ dst.l = 0; dst.m = buf->mtmp2; dst.s = (char*)buf->tmp2;
+ kputs(".,.",&dst);
+ if ( star_allele ) kputs(",.",&dst);
+ copy_string_field(buf->tmp, 0, nval, &dst, 0);
+ copy_string_field(buf->tmp, iori+1, nval, &dst, 1);
+ if ( star_allele ) copy_string_field(".", 0, 1, &dst, 2);
+ buf->mtmp2 = dst.m;
+ buf->tmp2 = dst.s;
+ ret = bcf_update_info(buf->out_hdr, out, tag, buf->tmp2, dst.l, type);
+ }
if ( ret!=0 ) error("An error occurred while updating INFO/%s\n",tag);
}
}
int type = bcf_hdr_id2type(buf->hdr,BCF_HL_FMT,fmt->id);
int len = bcf_hdr_id2length(buf->hdr,BCF_HL_FMT,fmt->id);
- if ( type==BCF_HT_STR && len!=BCF_VL_FIXED && len!=BCF_VL_VAR ) return; // todo: Number=A,R,G for strings
+ if ( type==BCF_HT_STR && len==BCF_VL_G ) return; // possible todo: Number=G for strings
if ( type==BCF_HT_LONG ) return; // todo: 64bit integers
const int num_size = 4;
int32_t missing = bcf_int32_missing;
void *missing_ptr = (void*)&missing;
if ( type==BCF_HT_REAL ) bcf_float_set_missing(*((float*)missing_ptr));
+ int32_t vector_end = bcf_int32_vector_end;
+ void *vector_end_ptr = (void*)&vector_end;
+ if ( type==BCF_HT_REAL ) bcf_float_set_vector_end(*((float*)vector_end_ptr));
bcf1_t *rec = buf->split.rec;
int mtmp = ( type==BCF_HT_INT || type==BCF_HT_REAL ) ? buf->mtmp/num_size : buf->mtmp; // number of items
int nval = bcf_get_format_values(buf->hdr,rec,tag,&buf->tmp,&mtmp,type);
if ( type==BCF_HT_INT || type==BCF_HT_REAL ) buf->mtmp = mtmp*num_size; // number of bytes
- if ( len==BCF_VL_G && nval!=nsmpl*rec->n_allele && nval!=nsmpl*rec->n_allele*(rec->n_allele+1)/2 ) return; // not haploid nor diploid
+ if ( type==BCF_HT_INT || type==BCF_HT_REAL )
+ {
+ if ( len==BCF_VL_G && nval!=nsmpl*rec->n_allele && nval!=nsmpl*rec->n_allele*(rec->n_allele+1)/2 ) return; // not haploid nor diploid
- // Check for incorrect number of values. Note this check does not consider all values missing
- // and will remove annotations that don't pass.
- if ( (len==BCF_VL_A && nval != nsmpl*(rec->n_allele - 1)) || (len==BCF_VL_R && nval != nsmpl*rec->n_allele) ) return;
+ // Check for incorrect number of values. Note this check does not consider all values missing
+ // and will remove annotations that don't pass.
+ if ( (len==BCF_VL_A && nval != nsmpl*(rec->n_allele - 1)) || (len==BCF_VL_R && nval != nsmpl*rec->n_allele) ) return;
+ }
// Increase buffer size to accommodate star allele
int nval1 = nval / nsmpl;
mtmp = buf->mtmp;
- if ( (len==BCF_VL_A || len==BCF_VL_R) && mtmp < num_size*nsmpl*(nval1+1) ) mtmp = num_size*nsmpl*(nval1+1); // +1 for the possibility of the star allele
- else if ( len==BCF_VL_G && mtmp < num_size*nsmpl*(nval1+3) ) mtmp = num_size*nsmpl*(nval1+3);
+ if ( type==BCF_HT_INT || type==BCF_HT_REAL )
+ {
+ if ( (len==BCF_VL_A || len==BCF_VL_R) && mtmp < num_size*nsmpl*(nval1+1) ) mtmp = num_size*nsmpl*(nval1+1); // +1 for the possibility of the star allele
+ else if ( len==BCF_VL_G && mtmp < num_size*nsmpl*(nval1+3) ) mtmp = num_size*nsmpl*(nval1+3);
+ }
+ else if ( type==BCF_HT_STR )
+ {
+ if ( (len==BCF_VL_A || len==BCF_VL_R) && mtmp < nsmpl*(nval1+2) ) mtmp = nsmpl*(nval1+2); // +2 for the possibility of the star allele, ",."
+ else if ( len==BCF_VL_G && mtmp < nsmpl*(nval1+6) ) mtmp = nsmpl*(nval1+6);
+ }
if ( buf->mtmp2 < mtmp )
{
int ret = 0;
if ( len==BCF_VL_FIXED || len==BCF_VL_VAR )
ret = bcf_update_format(buf->out_hdr, out, tag, buf->tmp, nval, type);
- else if ( len==BCF_VL_A )
+ else if ( len==BCF_VL_A && type!=BCF_HT_STR )
{
int iori = buf->split.atoms[iout]->ial - 1;
assert( iori<nval );
{
void *src = buf->tmp + nval1*num_size*i;
void *dst = buf->tmp2 + num_size*i*(star_allele+1);
- memcpy(dst,src+iori*num_size,num_size);
+ if ( !memcmp(vector_end_ptr,src+iori*num_size,num_size) )
+ memcpy(dst,missing_ptr,num_size);
+ else
+ memcpy(dst,src+iori*num_size,num_size);
if ( star_allele )
memcpy(dst+num_size,missing_ptr,num_size);
}
ret = bcf_update_format(buf->out_hdr, out, tag, buf->tmp2, nsmpl*(star_allele+1), type);
}
- else if ( len==BCF_VL_R )
+ else if ( (len==BCF_VL_A || len==BCF_VL_R) && type==BCF_HT_STR )
+ {
+ int ioff = len==BCF_VL_R ? 1 : 0;
+ int iori = buf->split.atoms[iout]->ial - 1;
+ int nval1_dst = star_allele ? nval1 + 2 : nval1;
+ memset(buf->tmp2,0,nval1_dst*nsmpl);
+ for (i=0; i<nsmpl; i++)
+ {
+ kstring_t dst;
+ dst.l = 0; dst.m = nval1_dst; dst.s = (char*)buf->tmp2 + nval1_dst*i;
+ kputc_('.',&dst);
+ if ( star_allele ) kputsn_(",.",2,&dst);
+ if ( len==BCF_VL_R )
+ {
+ kputsn_(",.",2,&dst);
+ copy_string_field(buf->tmp+nval1*i, 0, nval1, &dst, 0);
+ }
+ copy_string_field(buf->tmp+nval1*i, iori+ioff, nval1, &dst, 0+ioff);
+ if ( star_allele ) copy_string_field(".", 0, 1, &dst, 1+ioff);
+ }
+ ret = bcf_update_format(buf->out_hdr, out, tag, buf->tmp2, nval1_dst*nsmpl, type);
+ }
+ else if ( len==BCF_VL_R && type!=BCF_HT_STR )
{
int iori = buf->split.atoms[iout]->ial;
assert( iori<=nval );
void *dst = buf->tmp2 + num_size*i*(star_allele+2);
memcpy(dst,src,num_size);
memcpy(dst+num_size,src+iori*num_size,num_size);
-
if ( type==BCF_HT_INT && mode==M_SUM )
{
uint8_t *tbl = buf->split.tbl + iout*buf->split.nori;
}
ret = bcf_update_format(buf->out_hdr, out, tag, buf->tmp2, nsmpl*(star_allele+2), type);
}
- else if ( len==BCF_VL_G )
+ else if ( len==BCF_VL_G && type!=BCF_HT_STR )
{
int iori = buf->split.atoms[iout]->ial;
int i01 = bcf_alleles2gt(0,iori);
_split_table_set_format(buf, &rec->d.fmt[i], mode);
}
+
+ // Check that at least one FORMAT field was added, if not, the number of samples must be set manually
+ for (i=0; i<buf->split.nout; i++)
+ {
+ bcf1_t *out = buf->vcf[rbuf_kth(&buf->rbuf,i)];
+ if ( !out->n_sample ) out->n_sample = rec->n_sample;
+ }
}
void abuf_push(abuf_t *buf, bcf1_t *rec)
// Compensate for AD not being counted on low quality REF indel matches.
if ( r->ADF && bca->ambig_reads==B2B_INC_AD0 )
{
- for (i=0; i<4; i++) // verify: are the counters ever non-zero for i!=0?
+ for (i=0; i<4; i++)
{
- r->ADR[i] += ADR_ref_missed[i];
- r->ADF[i] += ADF_ref_missed[i];
+ r->ADR[0] += ADR_ref_missed[i];
+ r->ADF[0] += ADF_ref_missed[i];
}
}
else if ( r->ADF && bca->ambig_reads==B2B_INC_AD )
double calc_mwu_bias_cdf(int *a, int *b, int n)
{
int na = 0, nb = 0, i;
- double U = 0, ties = 0;
+ double U = 0;
+ //double ties = 0;
for (i=0; i<n; i++)
{
na += a[i];
U += a[i] * (nb + b[i]*0.5);
nb += b[i];
- if ( a[i] && b[i] )
- {
- double tie = a[i] + b[i];
- ties += (tie*tie-1)*tie;
- }
+ // if ( a[i] && b[i] )
+ // {
+ // double tie = a[i] + b[i];
+ // ties += (tie*tie-1)*tie;
+ // }
}
if ( !na || !nb ) return HUGE_VAL;
double calc_mwu_bias(int *a, int *b, int n, int left)
{
int na = 0, nb = 0, i;
- double U = 0, ties = 0;
+ double U = 0;
+ // double ties = 0;
for (i=0; i<n; i++)
{
if (!a[i]) {
na += a[i];
U += a[i] * (nb + b[i]*0.5);
nb += b[i];
- double tie = a[i] + b[i];
- ties += (tie*tie-1)*tie;
+ // double tie = a[i] + b[i];
+ // ties += (tie*tie-1)*tie;
}
}
if ( !na || !nb ) return HUGE_VAL;
}
}
- if (na+nb <= 1)
+ if (!na || !nb)
return HUGE_VAL;
double U, m;
double var2 = (na*nb)/12.0 * ((na+nb+1) - t/(double)((na+nb)*(na+nb-1)));
// var = na*nb*(na+nb+1)/12.0; // simpler; minus tie adjustment
if (var2 <= 0)
- return HUGE_VAL;
+ return do_Z ? 0 : 1;
if (do_Z) {
// S.D. normalised Z-score
// Compensate for AD not being counted on low quality REF indel matches.
if ( r->ADF && bca->ambig_reads==B2B_INC_AD0 )
{
- for (i=0; i<4; i++) // verify: are the counters ever non-zero for i!=0?
+ for (i=0; i<4; i++)
{
- r->ADR[i] += ADR_ref_missed[i];
- r->ADF[i] += ADF_ref_missed[i];
+ r->ADR[0] += ADR_ref_missed[i];
+ r->ADF[0] += ADF_ref_missed[i];
}
}
else if ( r->ADF && bca->ambig_reads==B2B_INC_AD )
double calc_mwu_bias_cdf(int *a, int *b, int n)
{
int na = 0, nb = 0, i;
- double U = 0, ties = 0;
+ double U = 0;
+ //double ties = 0;
for (i=0; i<n; i++)
{
na += a[i];
U += a[i] * (nb + b[i]*0.5);
nb += b[i];
- if ( a[i] && b[i] )
- {
- double tie = a[i] + b[i];
- ties += (tie*tie-1)*tie;
- }
+ // if ( a[i] && b[i] )
+ // {
+ // double tie = a[i] + b[i];
+ // ties += (tie*tie-1)*tie;
+ // }
}
if ( !na || !nb ) return HUGE_VAL;
double calc_mwu_bias(int *a, int *b, int n, int left)
{
int na = 0, nb = 0, i;
- double U = 0, ties = 0;
+ double U = 0;
+ // double ties = 0;
for (i=0; i<n; i++)
{
if (!a[i]) {
na += a[i];
U += a[i] * (nb + b[i]*0.5);
nb += b[i];
- double tie = a[i] + b[i];
- ties += (tie*tie-1)*tie;
+ // double tie = a[i] + b[i];
+ // ties += (tie*tie-1)*tie;
}
}
if ( !na || !nb ) return HUGE_VAL;
}
}
- if (na+nb <= 1)
+ if (!na || !nb)
return HUGE_VAL;
double U, m;
double var2 = (na*nb)/12.0 * ((na+nb+1) - t/(double)((na+nb)*(na+nb-1)));
// var = na*nb*(na+nb+1)/12.0; // simpler; minus tie adjustment
if (var2 <= 0)
- return HUGE_VAL;
+ return do_Z ? 0 : 1;
if (do_Z) {
// S.D. normalised Z-score
// for internal uses
int max_bases;
int indel_types[4]; // indel lengths
+ int indel_win_size;
int maxins, indelreg;
int read_len;
char *inscns;
KSORT_INIT_GENERIC(uint32_t)
#define MINUS_CONST 0x10000000
-#define INDEL_WINDOW_SIZE 110
#define MAX_TYPES 64
// To prevent long stretches of N's to be mistaken for indels
// (sometimes thousands of bases), check the number of N's in the
// sequence and skip places where half or more reference bases are Ns.
- int nN=0, i_end = pos + (2*INDEL_WINDOW_SIZE < max_rd_len
- ?2*INDEL_WINDOW_SIZE : max_rd_len);
+ int nN=0, i_end = pos + (2*bca->indel_win_size < max_rd_len
+ ?2*bca->indel_win_size : max_rd_len);
for (i=pos; i<i_end && ref[i]; i++)
nN += ref[i] == 'N';
if ( nN*2>(i-pos) ) {
// calculate left and right boundary
- left = pos > INDEL_WINDOW_SIZE? pos - INDEL_WINDOW_SIZE : 0;
- right = pos + INDEL_WINDOW_SIZE;
+ left = pos > bca->indel_win_size ? pos - bca->indel_win_size : 0;
+ right = pos + bca->indel_win_size;
if (types[0] < 0) right -= types[0];
// in case the alignments stand out the reference
// long read data needs less context. It also tends to
// have many more candidate indels to investigate so
// speed here matters more.
- if (pos - left >= INDEL_WINDOW_SIZE)
- left2 += INDEL_WINDOW_SIZE/2;
- if (right-pos >= INDEL_WINDOW_SIZE)
- right2 -= INDEL_WINDOW_SIZE/2;
+ if (pos - left >= bca->indel_win_size)
+ left2 += bca->indel_win_size/2;
+ if (right-pos >= bca->indel_win_size)
+ right2 -= bca->indel_win_size/2;
}
int r_start = p->b->core.pos;
KSORT_INIT_GENERIC(uint32_t)
#define MINUS_CONST 0x10000000
-#define INDEL_WINDOW_SIZE 110
#define MAX_TYPES 64
// To prevent long stretches of N's to be mistaken for indels
// (sometimes thousands of bases), check the number of N's in the
// sequence and skip places where half or more reference bases are Ns.
- int nN=0, i_end = pos + (2*INDEL_WINDOW_SIZE < max_rd_len
- ?2*INDEL_WINDOW_SIZE : max_rd_len);
+ int nN=0, i_end = pos + (2*bca->indel_win_size < max_rd_len
+ ?2*bca->indel_win_size : max_rd_len);
for (i=pos; i<i_end && ref[i]; i++)
nN += ref[i] == 'N';
if ( nN*2>(i-pos) ) {
// calculate left and right boundary
- left = pos > INDEL_WINDOW_SIZE? pos - INDEL_WINDOW_SIZE : 0;
- right = pos + INDEL_WINDOW_SIZE;
+ left = pos > bca->indel_win_size ? pos - bca->indel_win_size : 0;
+ right = pos + bca->indel_win_size;
if (types[0] < 0) right -= types[0];
// in case the alignments stand out the reference
// long read data needs less context. It also tends to
// have many more candidate indels to investigate so
// speed here matters more.
- if (pos - left >= INDEL_WINDOW_SIZE)
- left2 += INDEL_WINDOW_SIZE/2;
- if (right-pos >= INDEL_WINDOW_SIZE)
- right2 -= INDEL_WINDOW_SIZE/2;
+ if (pos - left >= bca->indel_win_size)
+ left2 += bca->indel_win_size/2;
+ if (right-pos >= bca->indel_win_size)
+ right2 -= bca->indel_win_size/2;
}
int r_start = p->b->core.pos;
void bcf_hdr_append_version(bcf_hdr_t *hdr, int argc, char **argv, const char *cmd);
const char *hts_bcf_wmode(int file_type);
const char *hts_bcf_wmode2(int file_type, char *fname);
+void set_wmode(char dst[8], int file_type, char *fname, int compression_level); // clevel: 0-9 with or zb type, -1 unset
char *init_tmp_prefix(const char *prefix);
void *smalloc(size_t size); // safe malloc
// determine if uppercase or lowercase is used in this fasta file
if ( args->fa_case==-1 ) args->fa_case = toupper(str.s[0])==str.s[0] ? 1 : 0;
- if ( args->mask && args->rid>=0) mask_region(args, str.s, str.l);
+ if ( args->mask ) mask_region(args, str.s, str.l);
kputs(str.s, &args->fa_buf);
bcf1_t **rec_ptr = NULL;
// determine if uppercase or lowercase is used in this fasta file
if ( args->fa_case==-1 ) args->fa_case = toupper(str.s[0])==str.s[0] ? 1 : 0;
- if ( args->mask && args->rid>=0) mask_region(args, str.s, str.l);
+ if ( args->mask ) mask_region(args, str.s, str.l);
kputs(str.s, &args->fa_buf);
bcf1_t **rec_ptr = NULL;
#include <htslib/synced_bcf_reader.h>
#include <htslib/vcfutils.h>
#include <htslib/kfunc.h>
+#include <htslib/khash_str2int.h>
#include "bcftools.h"
#include "variantkey.h"
#include "convert.h"
void *dat;
int ndat;
char *undef_info_tag;
+ void *used_tags_hash;
+ char **used_tags_list;
+ int nused_tags;
int allow_undef_tags;
uint8_t **subset_samples;
};
n /= convert->nsamples;
for (i=0; i<convert->nsamples; i++)
{
- float sum = 0, *ptr = (float*)convert->dat + i*n;
+ float *ptr = (float*)convert->dat + i*n;
int j;
for (j=0; j<n; j++)
{
if ( bcf_float_is_vector_end(ptr[j]) ) break;
if ( bcf_float_is_missing(ptr[j]) ) { ptr[j]=0; continue; }
if ( ptr[j]<0 || ptr[j]>1 ) error("[%s:%"PRId64":%f] GP value outside range [0,1]; bcftools convert expects the VCF4.3+ spec for the GP field encoding genotype posterior probabilities", bcf_seqname(convert->header,line),(int64_t) line->pos+1,ptr[j]);
- sum+=ptr[j];
}
if ( j==line->n_allele )
ksprintf(str," %f %f %f",ptr[0],0.,ptr[1]); // haploid
kputc('.', str);
}
-static fmt_t *register_tag(convert_t *convert, int type, char *key, int is_gtf)
+// Record that the tag "INFO/<key>" or "FORMAT/<key>" (chosen by `type`) is
+// referenced by the format expression. The composed name is inserted into
+// convert->used_tags_hash and appended to convert->used_tags_list; the string
+// is not freed here, the list keeps the pointer.
+static void _used_tags_add(convert_t *convert, int type, char *key)
+{
+    kstring_t str = {0,0,0};
+    ksprintf(&str,"%s/%s",type==T_INFO?"INFO":"FORMAT",key);
+    khash_str2int_inc(convert->used_tags_hash,str.s);
+
+    // Grow the list through a temporary pointer so that a failed realloc is
+    // reported instead of losing the old list and writing through NULL
+    char **list = (char**)realloc(convert->used_tags_list,sizeof(*convert->used_tags_list)*(convert->nused_tags+1));
+    if ( !list ) error("Could not allocate memory for the list of used tags\n");
+    convert->used_tags_list = list;
+    convert->used_tags_list[convert->nused_tags++] = str.s;
+}
+
+
+// Dispatch on a tag name: when `key` equals one of the names listed below, the
+// macro expands to function(<extra args>, T_xxx) with the matching T_* constant.
+// The expansion is an if / else-if chain with no final else, so the caller can
+// continue it with further `else if` branches for names not handled here.
+#define _SET_NON_FORMAT_TAGS(function,key,...) \
+ if ( !strcmp("CHROM",key) ) { function(__VA_ARGS__, T_CHROM); } \
+ else if ( !strcmp("POS",key) ) { function(__VA_ARGS__, T_POS); } \
+ else if ( !strcmp("POS0",key) ) { function(__VA_ARGS__, T_POS0); } \
+ else if ( !strcmp("END",key) ) { function(__VA_ARGS__, T_END); } \
+ else if ( !strcmp("END0",key) ) { function(__VA_ARGS__, T_END0); } \
+ else if ( !strcmp("ID",key) ) { function(__VA_ARGS__, T_ID); } \
+ else if ( !strcmp("REF",key) ) { function(__VA_ARGS__, T_REF); } \
+ else if ( !strcmp("FIRST_ALT",key) ) { function(__VA_ARGS__, T_FIRST_ALT); } \
+ else if ( !strcmp("QUAL",key) ) { function(__VA_ARGS__, T_QUAL); } \
+ else if ( !strcmp("TYPE",key) ) { function(__VA_ARGS__, T_TYPE); } \
+ else if ( !strcmp("FILTER",key) ) { function(__VA_ARGS__, T_FILTER); } \
+ else if ( !strcmp("IS_TS",key) ) { function(__VA_ARGS__, T_IS_TS); } \
+ else if ( !strcmp("MASK",key) ) { function(__VA_ARGS__, T_MASK); } \
+ else if ( !strcmp("LINE",key) ) { function(__VA_ARGS__, T_LINE); }
+
+// Callback passed to _SET_NON_FORMAT_TAGS: store the resolved tag type in fmt
+static void set_type(fmt_t *fmt, int type)
+{
+    fmt->type = type;
+}
+static fmt_t *register_tag(convert_t *convert, char *key, int is_gtf, int type)
{
convert->nfmt++;
if ( convert->nfmt > convert->mfmt )
int id = bcf_hdr_id2int(convert->header, BCF_DT_ID, key);
if ( fmt->type==T_FORMAT && !bcf_hdr_idinfo_exists(convert->header,BCF_HL_FMT,id) )
{
- if ( !strcmp("CHROM",key) ) { fmt->type = T_CHROM; }
- else if ( !strcmp("POS",key) ) { fmt->type = T_POS; }
- else if ( !strcmp("POS0",key) ) { fmt->type = T_POS0; }
- else if ( !strcmp("END",key) ) { fmt->type = T_END; }
- else if ( !strcmp("END0",key) ) { fmt->type = T_END0; }
- else if ( !strcmp("ID",key) ) { fmt->type = T_ID; }
- else if ( !strcmp("REF",key) ) { fmt->type = T_REF; }
- else if ( !strcmp("ALT",key) ) { fmt->type = T_ALT; }
- else if ( !strcmp("FIRST_ALT",key) ) { fmt->type = T_FIRST_ALT; }
- else if ( !strcmp("QUAL",key) ) { fmt->type = T_QUAL; }
- else if ( !strcmp("FILTER",key) ) { fmt->type = T_FILTER; }
- else if ( !strcmp("_CHROM_POS_ID",key) ) { fmt->type = T_CHROM_POS_ID; }
+ _SET_NON_FORMAT_TAGS(set_type,key,fmt)
+ else if ( !strcmp("ALT",key) ) { fmt->type = T_ALT; }
+ else if ( !strcmp("_CHROM_POS_ID",key) ) { fmt->type = T_CHROM_POS_ID; }
else if ( !strcmp("RSX",key) ) { fmt->type = T_RSX; }
else if ( !strcmp("VKX",key) ) { fmt->type = T_VKX; }
- else if ( id>=0 && bcf_hdr_idinfo_exists(convert->header,BCF_HL_INFO,id) ) { fmt->type = T_INFO; }
+ else if ( id>=0 && bcf_hdr_idinfo_exists(convert->header,BCF_HL_INFO,id) )
+ {
+ fmt->type = T_INFO;
+ _used_tags_add(convert,T_INFO,key);
+ }
}
else if ( fmt->type==T_PBINOM )
{
fmt->id = bcf_hdr_id2int(convert->header, BCF_DT_ID, fmt->key);
if ( !bcf_hdr_idinfo_exists(convert->header,BCF_HL_FMT, fmt->id) ) error("No such FORMAT tag defined in the header: %s\n", fmt->key);
+ _used_tags_add(convert,T_FORMAT,key);
}
else if ( fmt->type==T_NPASS )
{
kputsn(p, q-p, &str);
if ( is_gtf )
{
- if ( !strcmp(str.s, "SAMPLE") ) register_tag(convert, T_SAMPLE, "SAMPLE", is_gtf);
- else if ( !strcmp(str.s, "GT") ) register_tag(convert, T_GT, "GT", is_gtf);
- else if ( !strcmp(str.s, "TGT") ) register_tag(convert, T_TGT, "GT", is_gtf);
+ if ( !strcmp(str.s, "SAMPLE") ) register_tag(convert, "SAMPLE", is_gtf, T_SAMPLE);
+ else if ( !strcmp(str.s, "GT") ) register_tag(convert, "GT", is_gtf, T_GT);
+ else if ( !strcmp(str.s, "TGT") ) register_tag(convert, "GT", is_gtf, T_TGT);
else if ( !strcmp(str.s, "TBCSQ") )
{
- fmt_t *fmt = register_tag(convert, T_TBCSQ, "BCSQ", is_gtf);
+ fmt_t *fmt = register_tag(convert, "BCSQ", is_gtf, T_TBCSQ);
fmt->subscript = parse_subscript(&q);
if ( fmt->subscript==-1 )
{
}
else fmt->subscript++;
}
- else if ( !strcmp(str.s, "IUPACGT") ) register_tag(convert, T_IUPAC_GT, "GT", is_gtf);
+ else if ( !strcmp(str.s, "IUPACGT") ) register_tag(convert, "GT", is_gtf, T_IUPAC_GT);
else if ( !strcmp(str.s, "INFO") )
{
if ( *q!='/' )
while ( *q && (isalnum(*q) || *q=='_' || *q=='.') ) q++;
if ( q-p==0 ) error("Could not parse format string: %s\n", convert->format_str);
kputsn(p, q-p, &str);
- fmt_t *fmt = register_tag(convert, T_INFO, str.s, is_gtf);
+ fmt_t *fmt = register_tag(convert, str.s, is_gtf, T_INFO);
fmt->subscript = parse_subscript(&q);
+ _used_tags_add(convert,T_INFO,str.s);
}
else if ( !strcmp(str.s,"PBINOM") )
{
while ( *q && *q!=')' ) q++;
if ( q-p==0 ) error("Could not parse format string: %s\n", convert->format_str);
kputsn(p, q-p, &str);
- register_tag(convert, T_PBINOM, str.s, is_gtf);
+ register_tag(convert, str.s, is_gtf, T_PBINOM);
q++;
}
else if ( !strcmp(str.s,"N_PASS") )
error("N_PASS() must be placed outside the square brackets\n");
else
{
- fmt_t *fmt = register_tag(convert, T_FORMAT, str.s, is_gtf);
+ fmt_t *fmt = register_tag(convert, str.s, is_gtf, T_FORMAT);
fmt->subscript = parse_subscript(&q);
}
}
else
{
- if ( !strcmp(str.s, "CHROM") ) register_tag(convert, T_CHROM, str.s, is_gtf);
- else if ( !strcmp(str.s, "POS") ) register_tag(convert, T_POS, str.s, is_gtf);
- else if ( !strcmp(str.s, "POS0") ) register_tag(convert, T_POS0, str.s, is_gtf);
- else if ( !strcmp(str.s, "END") ) register_tag(convert, T_END, str.s, is_gtf);
- else if ( !strcmp(str.s, "END0") ) register_tag(convert, T_END0, str.s, is_gtf);
- else if ( !strcmp(str.s, "ID") ) register_tag(convert, T_ID, str.s, is_gtf);
- else if ( !strcmp(str.s, "REF") ) register_tag(convert, T_REF, str.s, is_gtf);
+ _SET_NON_FORMAT_TAGS(register_tag, str.s, convert, str.s, is_gtf)
else if ( !strcmp(str.s, "ALT") )
{
- fmt_t *fmt = register_tag(convert, T_ALT, str.s, is_gtf);
+ fmt_t *fmt = register_tag(convert, str.s, is_gtf, T_ALT);
fmt->subscript = parse_subscript(&q);
}
- else if ( !strcmp(str.s, "FIRST_ALT") ) register_tag(convert, T_FIRST_ALT, str.s, is_gtf);
- else if ( !strcmp(str.s, "QUAL") ) register_tag(convert, T_QUAL, str.s, is_gtf);
- else if ( !strcmp(str.s, "FILTER") ) register_tag(convert, T_FILTER, str.s, is_gtf);
- else if ( !strcmp(str.s, "QUAL") ) register_tag(convert, T_QUAL, str.s, is_gtf);
- else if ( !strcmp(str.s, "IS_TS") ) register_tag(convert, T_IS_TS, str.s, is_gtf);
- else if ( !strcmp(str.s, "TYPE") ) register_tag(convert, T_TYPE, str.s, is_gtf);
- else if ( !strcmp(str.s, "MASK") ) register_tag(convert, T_MASK, str.s, is_gtf);
- else if ( !strcmp(str.s, "LINE") ) register_tag(convert, T_LINE, str.s, is_gtf);
- else if ( !strcmp(str.s, "_CHROM_POS_ID") ) register_tag(convert, T_CHROM_POS_ID, str.s, is_gtf);
- else if ( !strcmp(str.s, "_GT_TO_PROB3") ) register_tag(convert, T_GT_TO_PROB3, str.s, is_gtf);
- else if ( !strcmp(str.s, "_PL_TO_PROB3") ) register_tag(convert, T_PL_TO_PROB3, str.s, is_gtf);
- else if ( !strcmp(str.s, "_GP_TO_PROB3") ) register_tag(convert, T_GP_TO_PROB3, str.s, is_gtf);
- else if ( !strcmp(str.s, "_GT_TO_HAP") ) register_tag(convert, T_GT_TO_HAP, str.s, is_gtf);
- else if ( !strcmp(str.s, "_GT_TO_HAP2") ) register_tag(convert, T_GT_TO_HAP2, str.s, is_gtf);
- else if ( !strcmp(str.s, "RSX") ) register_tag(convert, T_RSX, str.s, is_gtf);
- else if ( !strcmp(str.s, "VKX") ) register_tag(convert, T_VKX, str.s, is_gtf);
+ else if ( !strcmp(str.s, "_CHROM_POS_ID") ) register_tag(convert, str.s, is_gtf, T_CHROM_POS_ID);
+ else if ( !strcmp(str.s, "_GT_TO_PROB3") ) register_tag(convert, str.s, is_gtf, T_GT_TO_PROB3);
+ else if ( !strcmp(str.s, "_PL_TO_PROB3") ) register_tag(convert, str.s, is_gtf, T_PL_TO_PROB3);
+ else if ( !strcmp(str.s, "_GP_TO_PROB3") ) register_tag(convert, str.s, is_gtf, T_GP_TO_PROB3);
+ else if ( !strcmp(str.s, "_GT_TO_HAP") ) register_tag(convert, str.s, is_gtf, T_GT_TO_HAP);
+ else if ( !strcmp(str.s, "_GT_TO_HAP2") ) register_tag(convert, str.s, is_gtf, T_GT_TO_HAP2);
+ else if ( !strcmp(str.s, "RSX") ) register_tag(convert, str.s, is_gtf, T_RSX);
+ else if ( !strcmp(str.s, "VKX") ) register_tag(convert, str.s, is_gtf, T_VKX);
else if ( !strcmp(str.s,"PBINOM") ) error("Error: PBINOM() is currently supported only with FORMAT tags. (todo)\n");
else if ( !strcmp(str.s, "INFO") )
{
while ( *q && (isalnum(*q) || *q=='_' || *q=='.') ) q++;
if ( q-p==0 ) error("Could not parse format string: %s\n", convert->format_str);
kputsn(p, q-p, &str);
- fmt_t *fmt = register_tag(convert, T_INFO, str.s, is_gtf);
+ fmt_t *fmt = register_tag(convert, str.s, is_gtf, T_INFO);
fmt->subscript = parse_subscript(&q);
+ _used_tags_add(convert,T_INFO,str.s);
}
else
- register_tag(convert, T_INFO, NULL, is_gtf); // the whole INFO
+ register_tag(convert, NULL, is_gtf, T_INFO); // the whole INFO
}
else if ( !strcmp(str.s, "FORMAT") )
- register_tag(convert, T_FORMAT, NULL, 0);
+ register_tag(convert, NULL, 0, T_FORMAT);
else if ( !strcmp(str.s,"N_PASS") )
{
if ( *q!='(' ) error("Could not parse the expression: %s\n", convert->format_str);
}
if ( q-p==0 || nopen ) error("Could not parse format string: %s\n", convert->format_str);
kputsn(p, q-p-1, &str);
- register_tag(convert, T_NPASS, str.s, is_gtf);
+ register_tag(convert, str.s, is_gtf, T_NPASS);
}
else
{
- fmt_t *fmt = register_tag(convert, T_INFO, str.s, is_gtf);
+ fmt_t *fmt = register_tag(convert, str.s, is_gtf, T_INFO);
fmt->subscript = parse_subscript(&q);
+ _used_tags_add(convert,T_INFO,str.s);
}
}
free(str.s);
q++;
}
if ( !str.l ) error("Could not parse format string: %s\n", convert->format_str);
- register_tag(convert, T_SEP, str.s, is_gtf);
+ register_tag(convert, str.s, is_gtf, T_SEP);
free(str.s);
return q;
}
convert->header = hdr;
convert->format_str = strdup(format_str);
convert->max_unpack = BCF_UN_STR;
+ convert->used_tags_hash = khash_str2int_init();
int i, is_gtf = 0;
char *p = convert->format_str;
switch (*p)
{
case '[': is_gtf = 1; p++; break;
- case ']': is_gtf = 0; register_tag(convert, T_SEP, NULL, 0); p++; break;
+ case ']': is_gtf = 0; register_tag(convert, NULL, 0, T_SEP); p++; break;
case '%': p = parse_tag(convert, p, is_gtf); break;
default: p = parse_sep(convert, p, is_gtf); break;
}
if ( convert->fmt[i].destroy ) convert->fmt[i].destroy(convert->fmt[i].usr);
free(convert->fmt[i].key);
}
+ if ( convert->nused_tags )
+ {
+ for (i=0; i<convert->nused_tags; i++) free(convert->used_tags_list[i]);
+ free(convert->used_tags_list);
+ }
+ khash_str2int_destroy(convert->used_tags_hash);
free(convert->fmt);
free(convert->undef_info_tag);
free(convert->dat);
return convert->max_unpack;
}
+int convert_is_tag_used(convert_t *convert, char *tag)  // query whether `tag` was recorded while parsing the format string; recorded keys have the form "INFO/<name>" or "FORMAT/<name>"
+{
+ return khash_str2int_has_key(convert->used_tags_hash, tag);  // returns the lookup result unchanged
+}
+const char **convert_list_used_tags(convert_t *convert, int *ntags)  // expose the list of recorded tag names; *ntags receives the number of entries
+{
+ *ntags = convert->nused_tags;
+ return (const char **)convert->used_tags_list;  // the internal array itself, not a copy; NOTE(review): presumably callers must not free or modify it - confirm
+}
+
#include <htslib/synced_bcf_reader.h>
#include <htslib/vcfutils.h>
#include <htslib/kfunc.h>
+#include <htslib/khash_str2int.h>
#include "bcftools.h"
#include "variantkey.h"
#include "convert.h"
void *dat;
int ndat;
char *undef_info_tag;
+ void *used_tags_hash;
+ char **used_tags_list;
+ int nused_tags;
int allow_undef_tags;
uint8_t **subset_samples;
};
n /= convert->nsamples;
for (i=0; i<convert->nsamples; i++)
{
- float sum = 0, *ptr = (float*)convert->dat + i*n;
+ float *ptr = (float*)convert->dat + i*n;
int j;
for (j=0; j<n; j++)
{
if ( bcf_float_is_vector_end(ptr[j]) ) break;
if ( bcf_float_is_missing(ptr[j]) ) { ptr[j]=0; continue; }
if ( ptr[j]<0 || ptr[j]>1 ) error("[%s:%"PRId64":%f] GP value outside range [0,1]; bcftools convert expects the VCF4.3+ spec for the GP field encoding genotype posterior probabilities", bcf_seqname(convert->header,line),(int64_t) line->pos+1,ptr[j]);
- sum+=ptr[j];
}
if ( j==line->n_allele )
ksprintf(str," %f %f %f",ptr[0],0.,ptr[1]); // haploid
kputc('.', str);
}
-static fmt_t *register_tag(convert_t *convert, int type, char *key, int is_gtf)
+static void _used_tags_add(convert_t *convert, int type, char *key)  // record that the format string references the tag "INFO/<key>" or "FORMAT/<key>"
+{
+ kstring_t str = {0,0,0};
+ ksprintf(&str,"%s/%s",type==T_INFO?"INFO":"FORMAT",key);  // any type other than T_INFO gets the "FORMAT" prefix
+ khash_str2int_inc(convert->used_tags_hash,str.s);  // register the prefixed name in the hash that convert_is_tag_used() queries
+ convert->nused_tags++;
+ convert->used_tags_list = (char**)realloc(convert->used_tags_list,sizeof(*convert->used_tags_list)*convert->nused_tags);  // grows by one slot per call; NOTE(review): the realloc result is not checked
+ convert->used_tags_list[convert->nused_tags-1] = str.s;  // appended unconditionally (a tag referenced twice is listed twice); str.s is handed to the list and not freed here
+}
+
+
+#define _SET_NON_FORMAT_TAGS(function,key,...) /* dispatch on key: for a recognised built-in keyword call function(__VA_ARGS__, T_<keyword>) */ \
+ if ( !strcmp("CHROM",key) ) { function(__VA_ARGS__, T_CHROM); } \
+ else if ( !strcmp("POS",key) ) { function(__VA_ARGS__, T_POS); } \
+ else if ( !strcmp("POS0",key) ) { function(__VA_ARGS__, T_POS0); } \
+ else if ( !strcmp("END",key) ) { function(__VA_ARGS__, T_END); } \
+ else if ( !strcmp("END0",key) ) { function(__VA_ARGS__, T_END0); } \
+ else if ( !strcmp("ID",key) ) { function(__VA_ARGS__, T_ID); } \
+ else if ( !strcmp("REF",key) ) { function(__VA_ARGS__, T_REF); } \
+ else if ( !strcmp("FIRST_ALT",key) ) { function(__VA_ARGS__, T_FIRST_ALT); } \
+ else if ( !strcmp("QUAL",key) ) { function(__VA_ARGS__, T_QUAL); } \
+ else if ( !strcmp("TYPE",key) ) { function(__VA_ARGS__, T_TYPE); } \
+ else if ( !strcmp("FILTER",key) ) { function(__VA_ARGS__, T_FILTER); } \
+ else if ( !strcmp("IS_TS",key) ) { function(__VA_ARGS__, T_IS_TS); } \
+ else if ( !strcmp("MASK",key) ) { function(__VA_ARGS__, T_MASK); } \
+ else if ( !strcmp("LINE",key) ) { function(__VA_ARGS__, T_LINE); }  /* deliberately no final else and no trailing semicolon: call sites continue the chain with their own "else if" branches */
+
+static void set_type(fmt_t *fmt, int type) { fmt->type = type; }  // function-call form of "fmt->type = type" so that _SET_NON_FORMAT_TAGS can be used to assign the type
+static fmt_t *register_tag(convert_t *convert, char *key, int is_gtf, int type)
{
convert->nfmt++;
if ( convert->nfmt > convert->mfmt )
int id = bcf_hdr_id2int(convert->header, BCF_DT_ID, key);
if ( fmt->type==T_FORMAT && !bcf_hdr_idinfo_exists(convert->header,BCF_HL_FMT,id) )
{
- if ( !strcmp("CHROM",key) ) { fmt->type = T_CHROM; }
- else if ( !strcmp("POS",key) ) { fmt->type = T_POS; }
- else if ( !strcmp("POS0",key) ) { fmt->type = T_POS0; }
- else if ( !strcmp("END",key) ) { fmt->type = T_END; }
- else if ( !strcmp("END0",key) ) { fmt->type = T_END0; }
- else if ( !strcmp("ID",key) ) { fmt->type = T_ID; }
- else if ( !strcmp("REF",key) ) { fmt->type = T_REF; }
- else if ( !strcmp("ALT",key) ) { fmt->type = T_ALT; }
- else if ( !strcmp("FIRST_ALT",key) ) { fmt->type = T_FIRST_ALT; }
- else if ( !strcmp("QUAL",key) ) { fmt->type = T_QUAL; }
- else if ( !strcmp("FILTER",key) ) { fmt->type = T_FILTER; }
- else if ( !strcmp("_CHROM_POS_ID",key) ) { fmt->type = T_CHROM_POS_ID; }
+ _SET_NON_FORMAT_TAGS(set_type,key,fmt)
+ else if ( !strcmp("ALT",key) ) { fmt->type = T_ALT; }
+ else if ( !strcmp("_CHROM_POS_ID",key) ) { fmt->type = T_CHROM_POS_ID; }
else if ( !strcmp("RSX",key) ) { fmt->type = T_RSX; }
else if ( !strcmp("VKX",key) ) { fmt->type = T_VKX; }
- else if ( id>=0 && bcf_hdr_idinfo_exists(convert->header,BCF_HL_INFO,id) ) { fmt->type = T_INFO; }
+ else if ( id>=0 && bcf_hdr_idinfo_exists(convert->header,BCF_HL_INFO,id) )
+ {
+ fmt->type = T_INFO;
+ _used_tags_add(convert,T_INFO,key);
+ }
}
else if ( fmt->type==T_PBINOM )
{
fmt->id = bcf_hdr_id2int(convert->header, BCF_DT_ID, fmt->key);
if ( !bcf_hdr_idinfo_exists(convert->header,BCF_HL_FMT, fmt->id) ) error("No such FORMAT tag defined in the header: %s\n", fmt->key);
+ _used_tags_add(convert,T_FORMAT,key);
}
else if ( fmt->type==T_NPASS )
{
kputsn(p, q-p, &str);
if ( is_gtf )
{
- if ( !strcmp(str.s, "SAMPLE") ) register_tag(convert, T_SAMPLE, "SAMPLE", is_gtf);
- else if ( !strcmp(str.s, "GT") ) register_tag(convert, T_GT, "GT", is_gtf);
- else if ( !strcmp(str.s, "TGT") ) register_tag(convert, T_TGT, "GT", is_gtf);
+ if ( !strcmp(str.s, "SAMPLE") ) register_tag(convert, "SAMPLE", is_gtf, T_SAMPLE);
+ else if ( !strcmp(str.s, "GT") ) register_tag(convert, "GT", is_gtf, T_GT);
+ else if ( !strcmp(str.s, "TGT") ) register_tag(convert, "GT", is_gtf, T_TGT);
else if ( !strcmp(str.s, "TBCSQ") )
{
- fmt_t *fmt = register_tag(convert, T_TBCSQ, "BCSQ", is_gtf);
+ fmt_t *fmt = register_tag(convert, "BCSQ", is_gtf, T_TBCSQ);
fmt->subscript = parse_subscript(&q);
if ( fmt->subscript==-1 )
{
}
else fmt->subscript++;
}
- else if ( !strcmp(str.s, "IUPACGT") ) register_tag(convert, T_IUPAC_GT, "GT", is_gtf);
+ else if ( !strcmp(str.s, "IUPACGT") ) register_tag(convert, "GT", is_gtf, T_IUPAC_GT);
else if ( !strcmp(str.s, "INFO") )
{
if ( *q!='/' )
while ( *q && (isalnum(*q) || *q=='_' || *q=='.') ) q++;
if ( q-p==0 ) error("Could not parse format string: %s\n", convert->format_str);
kputsn(p, q-p, &str);
- fmt_t *fmt = register_tag(convert, T_INFO, str.s, is_gtf);
+ fmt_t *fmt = register_tag(convert, str.s, is_gtf, T_INFO);
fmt->subscript = parse_subscript(&q);
+ _used_tags_add(convert,T_INFO,str.s);
}
else if ( !strcmp(str.s,"PBINOM") )
{
while ( *q && *q!=')' ) q++;
if ( q-p==0 ) error("Could not parse format string: %s\n", convert->format_str);
kputsn(p, q-p, &str);
- register_tag(convert, T_PBINOM, str.s, is_gtf);
+ register_tag(convert, str.s, is_gtf, T_PBINOM);
q++;
}
else if ( !strcmp(str.s,"N_PASS") )
error("N_PASS() must be placed outside the square brackets\n");
else
{
- fmt_t *fmt = register_tag(convert, T_FORMAT, str.s, is_gtf);
+ fmt_t *fmt = register_tag(convert, str.s, is_gtf, T_FORMAT);
fmt->subscript = parse_subscript(&q);
}
}
else
{
- if ( !strcmp(str.s, "CHROM") ) register_tag(convert, T_CHROM, str.s, is_gtf);
- else if ( !strcmp(str.s, "POS") ) register_tag(convert, T_POS, str.s, is_gtf);
- else if ( !strcmp(str.s, "POS0") ) register_tag(convert, T_POS0, str.s, is_gtf);
- else if ( !strcmp(str.s, "END") ) register_tag(convert, T_END, str.s, is_gtf);
- else if ( !strcmp(str.s, "END0") ) register_tag(convert, T_END0, str.s, is_gtf);
- else if ( !strcmp(str.s, "ID") ) register_tag(convert, T_ID, str.s, is_gtf);
- else if ( !strcmp(str.s, "REF") ) register_tag(convert, T_REF, str.s, is_gtf);
+ _SET_NON_FORMAT_TAGS(register_tag, str.s, convert, str.s, is_gtf)
else if ( !strcmp(str.s, "ALT") )
{
- fmt_t *fmt = register_tag(convert, T_ALT, str.s, is_gtf);
+ fmt_t *fmt = register_tag(convert, str.s, is_gtf, T_ALT);
fmt->subscript = parse_subscript(&q);
}
- else if ( !strcmp(str.s, "FIRST_ALT") ) register_tag(convert, T_FIRST_ALT, str.s, is_gtf);
- else if ( !strcmp(str.s, "QUAL") ) register_tag(convert, T_QUAL, str.s, is_gtf);
- else if ( !strcmp(str.s, "FILTER") ) register_tag(convert, T_FILTER, str.s, is_gtf);
- else if ( !strcmp(str.s, "QUAL") ) register_tag(convert, T_QUAL, str.s, is_gtf);
- else if ( !strcmp(str.s, "IS_TS") ) register_tag(convert, T_IS_TS, str.s, is_gtf);
- else if ( !strcmp(str.s, "TYPE") ) register_tag(convert, T_TYPE, str.s, is_gtf);
- else if ( !strcmp(str.s, "MASK") ) register_tag(convert, T_MASK, str.s, is_gtf);
- else if ( !strcmp(str.s, "LINE") ) register_tag(convert, T_LINE, str.s, is_gtf);
- else if ( !strcmp(str.s, "_CHROM_POS_ID") ) register_tag(convert, T_CHROM_POS_ID, str.s, is_gtf);
- else if ( !strcmp(str.s, "_GT_TO_PROB3") ) register_tag(convert, T_GT_TO_PROB3, str.s, is_gtf);
- else if ( !strcmp(str.s, "_PL_TO_PROB3") ) register_tag(convert, T_PL_TO_PROB3, str.s, is_gtf);
- else if ( !strcmp(str.s, "_GP_TO_PROB3") ) register_tag(convert, T_GP_TO_PROB3, str.s, is_gtf);
- else if ( !strcmp(str.s, "_GT_TO_HAP") ) register_tag(convert, T_GT_TO_HAP, str.s, is_gtf);
- else if ( !strcmp(str.s, "_GT_TO_HAP2") ) register_tag(convert, T_GT_TO_HAP2, str.s, is_gtf);
- else if ( !strcmp(str.s, "RSX") ) register_tag(convert, T_RSX, str.s, is_gtf);
- else if ( !strcmp(str.s, "VKX") ) register_tag(convert, T_VKX, str.s, is_gtf);
+ else if ( !strcmp(str.s, "_CHROM_POS_ID") ) register_tag(convert, str.s, is_gtf, T_CHROM_POS_ID);
+ else if ( !strcmp(str.s, "_GT_TO_PROB3") ) register_tag(convert, str.s, is_gtf, T_GT_TO_PROB3);
+ else if ( !strcmp(str.s, "_PL_TO_PROB3") ) register_tag(convert, str.s, is_gtf, T_PL_TO_PROB3);
+ else if ( !strcmp(str.s, "_GP_TO_PROB3") ) register_tag(convert, str.s, is_gtf, T_GP_TO_PROB3);
+ else if ( !strcmp(str.s, "_GT_TO_HAP") ) register_tag(convert, str.s, is_gtf, T_GT_TO_HAP);
+ else if ( !strcmp(str.s, "_GT_TO_HAP2") ) register_tag(convert, str.s, is_gtf, T_GT_TO_HAP2);
+ else if ( !strcmp(str.s, "RSX") ) register_tag(convert, str.s, is_gtf, T_RSX);
+ else if ( !strcmp(str.s, "VKX") ) register_tag(convert, str.s, is_gtf, T_VKX);
else if ( !strcmp(str.s,"PBINOM") ) error("Error: PBINOM() is currently supported only with FORMAT tags. (todo)\n");
else if ( !strcmp(str.s, "INFO") )
{
while ( *q && (isalnum(*q) || *q=='_' || *q=='.') ) q++;
if ( q-p==0 ) error("Could not parse format string: %s\n", convert->format_str);
kputsn(p, q-p, &str);
- fmt_t *fmt = register_tag(convert, T_INFO, str.s, is_gtf);
+ fmt_t *fmt = register_tag(convert, str.s, is_gtf, T_INFO);
fmt->subscript = parse_subscript(&q);
+ _used_tags_add(convert,T_INFO,str.s);
}
else
- register_tag(convert, T_INFO, NULL, is_gtf); // the whole INFO
+ register_tag(convert, NULL, is_gtf, T_INFO); // the whole INFO
}
else if ( !strcmp(str.s, "FORMAT") )
- register_tag(convert, T_FORMAT, NULL, 0);
+ register_tag(convert, NULL, 0, T_FORMAT);
else if ( !strcmp(str.s,"N_PASS") )
{
if ( *q!='(' ) error("Could not parse the expression: %s\n", convert->format_str);
}
if ( q-p==0 || nopen ) error("Could not parse format string: %s\n", convert->format_str);
kputsn(p, q-p-1, &str);
- register_tag(convert, T_NPASS, str.s, is_gtf);
+ register_tag(convert, str.s, is_gtf, T_NPASS);
}
else
{
- fmt_t *fmt = register_tag(convert, T_INFO, str.s, is_gtf);
+ fmt_t *fmt = register_tag(convert, str.s, is_gtf, T_INFO);
fmt->subscript = parse_subscript(&q);
+ _used_tags_add(convert,T_INFO,str.s);
}
}
free(str.s);
q++;
}
if ( !str.l ) error("Could not parse format string: %s\n", convert->format_str);
- register_tag(convert, T_SEP, str.s, is_gtf);
+ register_tag(convert, str.s, is_gtf, T_SEP);
free(str.s);
return q;
}
convert->header = hdr;
convert->format_str = strdup(format_str);
convert->max_unpack = BCF_UN_STR;
+ convert->used_tags_hash = khash_str2int_init();
int i, is_gtf = 0;
char *p = convert->format_str;
switch (*p)
{
case '[': is_gtf = 1; p++; break;
- case ']': is_gtf = 0; register_tag(convert, T_SEP, NULL, 0); p++; break;
+ case ']': is_gtf = 0; register_tag(convert, NULL, 0, T_SEP); p++; break;
case '%': p = parse_tag(convert, p, is_gtf); break;
default: p = parse_sep(convert, p, is_gtf); break;
}
if ( convert->fmt[i].destroy ) convert->fmt[i].destroy(convert->fmt[i].usr);
free(convert->fmt[i].key);
}
+ if ( convert->nused_tags )
+ {
+ for (i=0; i<convert->nused_tags; i++) free(convert->used_tags_list[i]);
+ free(convert->used_tags_list);
+ }
+ khash_str2int_destroy(convert->used_tags_hash);
free(convert->fmt);
free(convert->undef_info_tag);
free(convert->dat);
return convert->max_unpack;
}
+int convert_is_tag_used(convert_t *convert, char *tag)  // query whether `tag` was recorded while parsing the format string; recorded keys have the form "INFO/<name>" or "FORMAT/<name>"
+{
+ return khash_str2int_has_key(convert->used_tags_hash, tag);  // returns the lookup result unchanged
+}
+const char **convert_list_used_tags(convert_t *convert, int *ntags)  // expose the list of recorded tag names; *ntags receives the number of entries
+{
+ *ntags = convert->nused_tags;
+ return (const char **)convert->used_tags_list;  // the internal array itself, not a copy; NOTE(review): presumably callers must not free or modify it - confirm
+}
+
/* convert.h -- functions for converting between VCF/BCF and related formats.
- Copyright (C) 2014 Genome Research Ltd.
+ Copyright (C) 2014-2021 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
int convert_header(convert_t *convert, kstring_t *str);
int convert_line(convert_t *convert, bcf1_t *rec, kstring_t *str);
int convert_max_unpack(convert_t *convert);
+int convert_is_tag_used(convert_t *convert, char *tag);
+const char **convert_list_used_tags(convert_t *convert, int *ntags);
#endif
vcsq_t
information required to assemble consequence lines such as "inframe_deletion|XYZ|ENST01|+|5TY>5I|121ACG>A+124TA>T"
- vcrec_t
+ vrec_t
single VCF record and csq tied to this record. (Haplotype can have multiple
consequences in several VCF records. Each record can have multiple consequences
from multiple haplotypes.)
uint32_t strand:1,
type:31; // one of CSQ_* types
uint32_t trid;
+ uint32_t vcf_ial;
uint32_t biotype; // one of GF_* types
char *gene; // gene name
bcf1_t *ref; // if type&CSQ_PRINTED_UPSTREAM, ref consequence "@1234"
typedef struct
{
bcf1_t *line;
- uint32_t *smpl; // bitmask of sample consequences with first/second haplotype interleaved
+ uint32_t *fmt_bm; // bitmask of sample consequences with first/second haplotype interleaved
uint32_t nfmt:4, // the bitmask size (the number of integers per sample)
nvcsq:28, mvcsq;
vcsq_t *vcsq; // there can be multiple consequences for a single VCF record
hap_node_t **child, *prev; // children haplotypes and previous coding node
int nchild, mchild;
bcf1_t *cur_rec, *rec; // current VCF record and node's VCF record
+ int vcf_ial; // which VCF allele generated this node
uint32_t nend; // number of haplotypes ending in this node
int *cur_child, mcur_child; // mapping from the allele to the currently active child
- csq_t *csq_list; // list of haplotype's consequences, broken by position
+ csq_t *csq_list; // list of haplotype's consequences, broken by position (each corresponds to a VCF record)
int ncsq_list, mcsq_list;
};
struct _tscript_t
char *outdir, **argv, *fa_fname, *gff_fname, *output_fname;
char *bcsq_tag;
- int argc, output_type;
+ int argc, output_type, clevel;
int phase, verbosity, local_csq, record_cmd_line;
int ncsq2_max, nfmt_bcsq; // maximum number of csq per site that can be accessed from FORMAT/BCSQ (*2 and 1 bit skipped to avoid BCF missing values)
int ncsq2_small_warned;
new_chr = strdup(chr_beg+3); // gff has the prefix, faidx does not
else
{
- new_chr = malloc(len+3); // gff does not have the prefix, faidx has
+ new_chr = malloc(len+4); // gff does not have the prefix, faidx has
memcpy(new_chr,"chr",3);
memcpy(new_chr+3,chr_beg,len);
new_chr[len+3] = 0;
}
else
{
- args->out_fh = hts_open(args->output_fname? args->output_fname : "-",hts_bcf_wmode2(args->output_type,args->output_fname));
+ char wmode[8];
+ set_wmode(wmode,args->output_type,args->output_fname,args->clevel);
+ args->out_fh = hts_open(args->output_fname ? args->output_fname : "-", wmode);
if ( args->out_fh == NULL ) error("[%s] Error: cannot write to %s: %s\n", __func__,args->output_fname? args->output_fname : "standard output", strerror(errno));
if ( args->n_threads > 0)
hts_set_opt(args->out_fh, HTS_OPT_THREAD_POOL, args->sr->p);
{
if ( !vbuf->vrec[j] ) continue;
if ( vbuf->vrec[j]->line ) bcf_destroy(vbuf->vrec[j]->line);
- free(vbuf->vrec[j]->smpl);
+ free(vbuf->vrec[j]->fmt_bm);
free(vbuf->vrec[j]->vcsq);
free(vbuf->vrec[j]);
}
{
tscript_t *tr;
struct {
- int32_t pos, rlen, alen;
+ int32_t pos, rlen, alen, ial;
char *ref, *alt;
bcf1_t *rec;
} vcf;
#endif
}
void csq_stage(args_t *args, csq_t *csq, bcf1_t *rec);
-static inline int csq_stage_utr(args_t *args, regitr_t *itr, bcf1_t *rec, uint32_t trid, uint32_t type)
+static inline int csq_stage_utr(args_t *args, regitr_t *itr, bcf1_t *rec, uint32_t trid, uint32_t type, int ial)
{
while ( regitr_overlap(itr) )
{
csq.type.biotype = tr->type;
csq.type.strand = tr->strand;
csq.type.trid = tr->id;
+ csq.type.vcf_ial = ial;
csq.type.gene = tr->gene->name;
csq_stage(args, &csq, rec);
return csq.type.type;
}
return 0;
}
-static inline void csq_stage_splice(args_t *args, bcf1_t *rec, tscript_t *tr, uint32_t type)
+static inline void csq_stage_splice(args_t *args, bcf1_t *rec, tscript_t *tr, uint32_t type, int ial)
{
#if XDBG
fprintf(stderr,"csq_stage_splice %d: type=%d\n",rec->pos+1,type);
csq.type.biotype = tr->type;
csq.type.strand = tr->strand;
csq.type.trid = tr->id;
+ csq.type.vcf_ial = ial;
csq.type.gene = tr->gene->name;
csq_stage(args, &csq, rec);
}
const char *chr = bcf_seqname(args->hdr,splice->vcf.rec);
if ( regidx_overlap(args->idx_utr,chr,splice->ref_beg+1,splice->ref_beg+1, itr) ) // adjacent utr
{
- ret = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq);
+ ret = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial);
if ( ret!=0 )
{
regitr_destroy(itr);
if ( splice->check_acceptor && splice->tr->strand==STRAND_REV ) splice->csq |= CSQ_SPLICE_ACCEPTOR;
if ( ref && !strncmp(ref,alt,N_SPLICE_DONOR) ) splice->csq |= CSQ_SYNONYMOUS_VARIANT;
}
- csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq);
+ csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq, splice->vcf.ial);
return SPLICE_OUTSIDE;
}
if ( splice->ref_end < ex_beg || (splice->ref_end == ex_beg && !splice->check_region_beg) ) // fully outside, before the exon
const char *chr = bcf_seqname(args->hdr,splice->vcf.rec);
if ( regidx_overlap(args->idx_utr,chr,splice->ref_end-1,splice->ref_end-1, itr) ) // adjacent utr
{
- ret = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq);
+ ret = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial);
if ( ret!=0 )
{
regitr_destroy(itr);
if ( splice->check_acceptor && splice->tr->strand==STRAND_FWD ) splice->csq |= CSQ_SPLICE_ACCEPTOR;
if ( ref && !strncmp(ref+N_SPLICE_REGION_INTRON-N_SPLICE_DONOR,alt+N_SPLICE_REGION_INTRON-N_SPLICE_DONOR,N_SPLICE_DONOR) ) splice->csq |= CSQ_SYNONYMOUS_VARIANT;
}
- csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq);
+ csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq, splice->vcf.ial);
return SPLICE_OUTSIDE;
}
// overlaps the exon or inside the exon
splice->vcf.rlen -= splice->tbeg + splice->tend - 1;
if ( splice->kref.l > splice->vcf.rlen ) { splice->kref.l = splice->vcf.rlen; splice->kref.s[splice->kref.l] = 0; }
}
- csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq);
+ csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq, splice->vcf.ial);
return SPLICE_INSIDE;
}
regitr_t *itr = regitr_init(NULL);
const char *chr = bcf_seqname(args->hdr,splice->vcf.rec);
if ( regidx_overlap(args->idx_utr,chr,splice->ref_beg,ex_beg-1, itr) ) // adjacent utr
- csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq);
+ csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial);
regitr_destroy(itr);
}
if ( !csq )
regitr_t *itr = regitr_init(NULL);
const char *chr = bcf_seqname(args->hdr,splice->vcf.rec);
if ( regidx_overlap(args->idx_utr,chr,ex_end+1,splice->ref_end, itr) ) // adjacent utr
- csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq);
+ csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial);
regitr_destroy(itr);
}
if ( !csq )
}
if ( splice->ref_end < ex_beg || splice->ref_beg >= ex_end )
{
- csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq);
+ csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq, splice->vcf.ial);
return SPLICE_OUTSIDE;
}
if ( splice->ref_beg < ex_beg + 2 ) // ref_beg is off by -1
return SPLICE_OVERLAP;
}
}
- csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq);
+ csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq, splice->vcf.ial);
return SPLICE_INSIDE;
}
regitr_t *itr = regitr_init(NULL);
const char *chr = bcf_seqname(args->hdr,splice->vcf.rec);
if ( regidx_overlap(args->idx_utr,chr,splice->ref_beg,ex_beg-1, itr) ) // adjacent utr
- csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq);
+ csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial);
regitr_destroy(itr);
}
if ( !csq )
regitr_t *itr = regitr_init(NULL);
const char *chr = bcf_seqname(args->hdr,splice->vcf.rec);
if ( regidx_overlap(args->idx_utr,chr,ex_end+1,splice->ref_end, itr) ) // adjacent utr
- csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq);
+ csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial);
regitr_destroy(itr);
}
if ( !csq )
}
if ( splice->ref_end < ex_beg || splice->ref_beg > ex_end )
{
- csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq);
+ csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq, splice->vcf.ial);
return SPLICE_OUTSIDE;
}
splice->kref.l = 0; kputsn(splice->vcf.ref + splice->tbeg, splice->vcf.rlen, &splice->kref);
splice->kalt.l = 0; kputsn(splice->vcf.alt + splice->tbeg, splice->vcf.rlen, &splice->kalt);
}
- csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq);
+ csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq, splice->vcf.ial);
return SPLICE_INSIDE;
}
static inline int splice_csq(args_t *args, splice_t *splice, uint32_t ex_beg, uint32_t ex_end)
kstring_t str = {0,0,0};
tscript_t *tr = cds->tr;
child->icds = cds->icds; // index of cds in the tscript's list of exons
+ child->vcf_ial = ial;
splice_t splice;
splice_init(&splice, rec);
splice.tr = tr;
+ splice.vcf.ial = ial;
splice.vcf.alt = rec->d.allele[ial];
splice.check_acceptor = splice.check_donor = splice.set_refalt = splice.check_utr = 1;
if ( !(tr->trim & TRIM_5PRIME) )
if ( csq->type.trid != vrec->vcsq[i].trid && (csq->type.type|vrec->vcsq[i].type)&CSQ_PRN_TSCRIPT ) continue;
if ( csq->type.biotype != vrec->vcsq[i].biotype ) continue;
if ( csq->type.gene != vrec->vcsq[i].gene ) continue;
+ if ( csq->type.vcf_ial != vrec->vcsq[i].vcf_ial ) continue;
+ if ( (csq->type.type&CSQ_UPSTREAM_STOP)^(vrec->vcsq[i].type&CSQ_UPSTREAM_STOP) ) continue; // both must or mustn't have upstream_stop
if ( csq->type.vstr.s || vrec->vcsq[i].vstr.s )
{
// This is a bit hacky, but we want a simpler and more predictable output. The splice_csq() function
}
// no such csq yet in this vcf record
csq->vrec = vrec;
- csq->idx = i;
- vrec->nvcsq++;
+ csq->idx = vrec->nvcsq++;
hts_expand0(vcsq_t, vrec->nvcsq, vrec->mvcsq, vrec->vcsq);
vrec->vcsq[i] = csq->type;
return 0;
csq_t *csq = &node->csq_list[icsq];
csq->pos = hap->stack[ref_node].node->rec->pos;
csq->type.trid = tr->id;
+ csq->type.vcf_ial = node->vcf_ial;
csq->type.gene = tr->gene->name;
csq->type.strand = tr->strand;
csq->type.biotype = tr->type;
csq->type.type |= CSQ_INFRAME_DELETION;
else
csq->type.type |= CSQ_INFRAME_INSERTION;
+ if ( hap->tref.s[hap->tref.l-1]!='*' && hap->tseq.s[hap->tseq.l-1]=='*' )
+ csq->type.type |= CSQ_STOP_GAINED;
}
else
{
csq->type.type |= CSQ_MISSENSE_VARIANT;
}
}
+ // Check if compound inframe variants are real inframes, or if the stop codon occurs before the frameshift can be restored
+ if ( ibeg!=iend && (csq->type.type & (CSQ_INFRAME_DELETION|CSQ_INFRAME_INSERTION|CSQ_INFRAME_ALTERING)) && hap->tseq.s[hap->tseq.l-1]=='*' )
+ {
+ rm_csq |= CSQ_INFRAME_DELETION | CSQ_INFRAME_INSERTION | CSQ_INFRAME_ALTERING;
+ csq->type.type |= CSQ_FRAMESHIFT_VARIANT | CSQ_STOP_GAINED;
+ }
if ( has_upstream_stop ) csq->type.type |= CSQ_UPSTREAM_STOP;
csq->type.type &= ~rm_csq;
csq_t *tmp_csq = &node->csq_list[node->ncsq_list - 1];
tmp_csq->pos = hap->stack[i].node->rec->pos;
tmp_csq->type.trid = tr->id;
+ // TODO(review): tmp_csq->type.vcf_ial is not assigned here (candidate value: node->vcf_ial); this should not be needed for non-compound variants -- confirm
tmp_csq->type.gene = tr->gene->name;
tmp_csq->type.strand = tr->strand;
tmp_csq->type.type = hap->stack[i].node->csq & ~CSQ_COMPOUND & ~rm_csq;
csq_t *tmp_csq = &node->csq_list[node->ncsq_list - 1];
tmp_csq->pos = hap->stack[i].node->rec->pos;
tmp_csq->type.trid = tr->id;
+ // TODO(review): tmp_csq->type.vcf_ial is not assigned here (candidate value: node->vcf_ial); this should not be needed for non-compound variants -- confirm
tmp_csq->type.gene = tr->gene->name;
tmp_csq->type.strand = tr->strand;
tmp_csq->type.type = CSQ_PRINTED_UPSTREAM | hap->stack[i].node->csq;
if ( ibeg==-1 ) ibeg = i;
continue;
}
+ // the last base of the current variant vs the first base of the next
+ // variant: are they in the same codon? (forward strand)
int icur = node2sbeg(i);
int inext = node2sbeg(i+1);
+ if ( hap->stack[i].node->dlen > 0 ) icur += hap->stack[i].node->dlen;
+ else if ( hap->stack[i].node->dlen < 0 ) icur++;
if ( icur/3 == inext/3 ) // in the same codon, can't be flushed yet
{
if ( ibeg==-1 ) ibeg = i;
if ( ibeg==-1 ) ibeg = i;
continue;
}
+ // the last base of the current variant vs the first base of the next
+ // variant: are they in the same codon? (reverse strand)
int icur = sseq.m - 1 - node2sbeg(i);
int inext = sseq.m - 1 - node2sbeg(i-1);
+ if ( hap->stack[i].node->dlen > 0 ) icur += hap->stack[i].node->dlen - 1;
+ else if ( hap->stack[i].node->dlen < 0 ) icur -= hap->stack[i].node->dlen;
+ if ( hap->stack[i-1].node->dlen > 0 ) inext -= hap->stack[i-1].node->dlen;
if ( icur/3 == inext/3 )
{
if ( ibeg==-1 ) ibeg = i;
int ival, ibit;
icsq2_to_bit(icsq2, &ival,&ibit);
if ( vrec->nfmt < 1 + ival ) vrec->nfmt = 1 + ival;
- vrec->smpl[ismpl*args->nfmt_bcsq + ival] |= 1 << ibit;
+ vrec->fmt_bm[ismpl*args->nfmt_bcsq + ival] |= 1 << ibit;
}
}
vrec_t *vrec = vbuf->vrec[vbuf->n - 1];
if ( args->phase!=PHASE_DROP_GT && args->smpl->n )
{
- if ( !vrec->smpl ) vrec->smpl = (uint32_t*) calloc(args->hdr_nsmpl,sizeof(*vrec->smpl) * args->nfmt_bcsq);
- else memset(vrec->smpl,0,args->hdr_nsmpl*sizeof(*vrec->smpl) * args->nfmt_bcsq);
+ if ( !vrec->fmt_bm ) vrec->fmt_bm = (uint32_t*) calloc(args->hdr_nsmpl,sizeof(*vrec->fmt_bm) * args->nfmt_bcsq);
+ else memset(vrec->fmt_bm,0,args->hdr_nsmpl*sizeof(*vrec->fmt_bm) * args->nfmt_bcsq);
}
if ( !vrec->line ) vrec->line = bcf_init1();
SWAP(bcf1_t*, (*rec_ptr), vrec->line);
{
if ( vrec->nfmt < args->nfmt_bcsq )
for (j=1; j<args->hdr_nsmpl; j++)
- memmove(&vrec->smpl[j*vrec->nfmt], &vrec->smpl[j*args->nfmt_bcsq], vrec->nfmt*sizeof(*vrec->smpl));
- bcf_update_format_int32(args->hdr, vrec->line, args->bcsq_tag, vrec->smpl, args->hdr_nsmpl*vrec->nfmt);
+ memmove(&vrec->fmt_bm[j*vrec->nfmt], &vrec->fmt_bm[j*args->nfmt_bcsq], vrec->nfmt*sizeof(*vrec->fmt_bm));
+ bcf_update_format_int32(args->hdr, vrec->line, args->bcsq_tag, vrec->fmt_bm, args->hdr_nsmpl*vrec->nfmt);
}
vrec->nvcsq = 0;
if ( bcf_write(args->out_fh, args->hdr, vrec->line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname?args->output_fname:"standard output");
csq.type.biotype = tr->type;
csq.type.strand = tr->strand;
csq.type.trid = tr->id;
+ csq.type.vcf_ial = i;
csq.type.gene = tr->gene->name;
int csq_type = node.csq;
- // code repetition: it would be nice to reuse the code from hap_add_csq, needs have refactoring though
+ // code repetition: it would be nice to reuse the code from hap_add_csq, needs refactoring though
if ( node.type == HAP_SSS )
{
csq.type.type = csq_type;
csq_type |= CSQ_INFRAME_DELETION;
else
csq_type |= CSQ_INFRAME_INSERTION;
+ if ( tref->s[tref->l-1]!='*' && tseq->s[tseq->l-1]=='*' )
+ csq_type |= CSQ_STOP_GAINED;
}
else
{
csq.type.biotype = tr->type;
csq.type.strand = tr->strand;
csq.type.trid = tr->id;
+ csq.type.vcf_ial = 1;
csq.type.gene = tr->gene->name;
csq.type.type = child->csq;
csq_stage(args, &csq, rec);
csq.type.biotype = tr->type;
csq.type.strand = tr->strand;
csq.type.trid = tr->id;
+ csq.type.vcf_ial = ial;
csq.type.gene = tr->gene->name;
csq.type.type = child->csq;
csq_stage(args, &csq, rec);
// known issues: tab output leads to unsorted output. This is because
// coding haplotypes are printed in one go and buffering is not used
// with tab output. VCF output is OK though.
- if ( csq_push(args, csq, rec)!=0 ) return; // the consequence already exists
+ if ( csq_push(args, csq, rec)!=0 && args->phase==PHASE_DROP_GT ) return; // the consequence already exists
int i,j,ngt = 0;
if ( args->phase!=PHASE_DROP_GT )
int32_t *gt = args->gt_arr + args->smpl->idx[i]*ngt;
for (j=0; j<ngt; j++)
{
- if ( gt[j]==bcf_gt_missing || gt[j]==bcf_int32_vector_end || !bcf_gt_allele(gt[j]) ) continue;
+ if ( gt[j]==bcf_gt_missing || gt[j]==bcf_int32_vector_end ) continue;
+ int ial = bcf_gt_allele(gt[j]);
+ if ( !ial || ial!=csq->type.vcf_ial ) continue;
csq_print_text(args, csq, args->smpl->idx[i],j+1);
}
}
int32_t *gt = args->gt_arr + args->smpl->idx[i]*ngt;
for (j=0; j<ngt; j++)
{
- if ( gt[j]==bcf_gt_missing || gt[j]==bcf_int32_vector_end || !bcf_gt_allele(gt[j]) ) continue;
+ if ( gt[j]==bcf_gt_missing || gt[j]==bcf_int32_vector_end ) continue;
+ int ial = bcf_gt_allele(gt[j]);
+ if ( !ial || ial!=csq->type.vcf_ial ) continue;
int icsq2 = 2*csq->idx + j;
if ( icsq2 >= args->ncsq2_max ) // more than ncsq_max consequences, so can't fit it in FMT
int ival, ibit;
icsq2_to_bit(icsq2, &ival,&ibit);
if ( vrec->nfmt < 1 + ival ) vrec->nfmt = 1 + ival;
- vrec->smpl[i*args->nfmt_bcsq + ival] |= 1 << ibit;
+ vrec->fmt_bm[i*args->nfmt_bcsq + ival] |= 1 << ibit;
}
}
}
csq.type.biotype = tr->type;
csq.type.strand = tr->strand;
csq.type.trid = tr->id;
+ csq.type.vcf_ial = i;
csq.type.gene = tr->gene->name;
csq_stage(args, &csq, rec);
ret = 1;
bcf1_t *rec = *rec_ptr;
static int32_t prev_rid = -1, prev_pos = -1;
- if ( prev_rid!=rec->rid ) { prev_rid = rec->rid; prev_pos = rec->pos; }
+ if ( prev_rid!=rec->rid )
+ {
+ prev_rid = rec->rid;
+ prev_pos = rec->pos;
+
+ // A common error is to use different naming conventions in the fasta and the VCF (e.g. X vs chrX).
+ // Perform a simple sanity check (which does not catch much): the chromosome must be present in the
+ // reference file.
+ if ( !faidx_has_seq(args->fai,bcf_seqname(args->hdr,rec)) )
+ error("Error: the chromosome \"%s\" is not present in %s\n",bcf_seqname(args->hdr,rec),args->fa_fname);
+ }
if ( prev_pos > rec->pos )
error("Error: The file is not sorted, %s:%d comes before %s:%"PRId64"\n",bcf_seqname(args->hdr,rec),prev_pos+1,bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1);
"Usage: bcftools csq [OPTIONS] in.vcf\n"
"\n"
"Required options:\n"
- " -f, --fasta-ref FILE reference file in fasta format\n"
- " -g, --gff-annot FILE gff3 annotation file\n"
+ " -f, --fasta-ref FILE Reference file in fasta format\n"
+ " -g, --gff-annot FILE GFF3 annotation file\n"
"\n"
"CSQ options:\n"
- " -B, --trim-protein-seq INT abbreviate protein-changing predictions to max INT aminoacids\n"
- " -c, --custom-tag STRING use this tag instead of the default BCSQ\n"
- " -l, --local-csq localized predictions, consider only one VCF record at a time\n"
- " -n, --ncsq INT maximum number of per-haplotype consequences to consider for each site [15]\n"
- " -p, --phase a|m|r|R|s how to handle unphased heterozygous genotypes: [r]\n"
- " a: take GTs as is, create haplotypes regardless of phase (0/1 -> 0|1)\n"
- " m: merge *all* GTs into a single haplotype (0/1 -> 1, 1/2 -> 1)\n"
- " r: require phased GTs, throw an error on unphased het GTs\n"
- " R: create non-reference haplotypes if possible (0/1 -> 1|1, 1/2 -> 1|2)\n"
- " s: skip unphased hets\n"
+ " -B, --trim-protein-seq INT Abbreviate protein-changing predictions to max INT aminoacids\n"
+ " -c, --custom-tag STRING Use this tag instead of the default BCSQ\n"
+ " -l, --local-csq Localized predictions, consider only one VCF record at a time\n"
+ " -n, --ncsq INT Maximum number of per-haplotype consequences to consider for each site [15]\n"
+ " -p, --phase a|m|r|R|s How to handle unphased heterozygous genotypes: [r]\n"
+ " a: take GTs as is, create haplotypes regardless of phase (0/1 -> 0|1)\n"
+ " m: merge *all* GTs into a single haplotype (0/1 -> 1, 1/2 -> 1)\n"
+ " r: require phased GTs, throw an error on unphased het GTs\n"
+ " R: create non-reference haplotypes if possible (0/1 -> 1|1, 1/2 -> 1|2)\n"
+ " s: skip unphased hets\n"
"Options:\n"
- " -e, --exclude EXPR exclude sites for which the expression is true\n"
- " --force run even if some sanity checks fail\n"
- " -i, --include EXPR select sites for which the expression is true\n"
- " --no-version do not append version and command line to the header\n"
- " -o, --output FILE write output to a file [standard output]\n"
- " -O, --output-type b|u|z|v|t b: compressed BCF, u: uncompressed BCF, z: compressed VCF\n"
- " v: uncompressed VCF, t: plain tab-delimited text output [v]\n"
- " -r, --regions REGION restrict to comma-separated list of regions\n"
- " -R, --regions-file FILE restrict to regions listed in a file\n"
- " -s, --samples -|LIST samples to include or \"-\" to apply all variants and ignore samples\n"
- " -S, --samples-file FILE samples to include\n"
- " -t, --targets REGION similar to -r but streams rather than index-jumps\n"
- " -T, --targets-file FILE similar to -R but streams rather than index-jumps\n"
- " --threads INT use multithreading with <int> worker threads [0]\n"
- " -v, --verbose INT verbosity level 0-2 [1]\n"
+ " -e, --exclude EXPR Exclude sites for which the expression is true\n"
+ " --force Run even if some sanity checks fail\n"
+ " -i, --include EXPR Select sites for which the expression is true\n"
+ " --no-version Do not append version and command line to the header\n"
+ " -o, --output FILE Write output to a file [standard output]\n"
+ " -O, --output-type b|u|z|v|t[0-9] b: compressed BCF, u: uncompressed BCF, z: compressed VCF\n"
+ " v: uncompressed VCF, t: plain tab-delimited text output, 0-9: compression level [v]\n"
+ " -r, --regions REGION Restrict to comma-separated list of regions\n"
+ " -R, --regions-file FILE Restrict to regions listed in a file\n"
+ " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n"
+ " -s, --samples -|LIST Samples to include or \"-\" to apply all variants and ignore samples\n"
+ " -S, --samples-file FILE Samples to include\n"
+ " -t, --targets REGION Similar to -r but streams rather than index-jumps\n"
+ " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n"
+ " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n"
+ " --threads INT Use multithreading with <int> worker threads [0]\n"
+ " -v, --verbose INT Verbosity level 0-2 [1]\n"
"\n"
"Example:\n"
" bcftools csq -f hs37d5.fa -g Homo_sapiens.GRCh37.82.gff3.gz in.vcf\n"
args->ncsq2_max = 2*(16-1); // 1 bit is reserved for BCF missing values
args->verbosity = 1;
args->record_cmd_line = 1;
+ args->clevel = -1;
static struct option loptions[] =
{
{"verbose",1,0,'v'},
{"regions",1,0,'r'},
{"regions-file",1,0,'R'},
+ {"regions-overlap",required_argument,NULL,4},
{"samples",1,0,'s'},
{"samples-file",1,0,'S'},
{"targets",1,0,'t'},
{"targets-file",1,0,'T'},
+ {"targets-overlap",required_argument,NULL,5},
{"no-version",no_argument,NULL,3},
{0,0,0,0}
};
int c, targets_is_file = 0, regions_is_file = 0;
+ int regions_overlap = 1;
+ int targets_overlap = 0;
char *targets_list = NULL, *regions_list = NULL, *tmp;
while ((c = getopt_long(argc, argv, "?hr:R:t:T:i:e:f:o:O:g:s:S:p:qc:ln:bB:v:",loptions,NULL)) >= 0)
{
case 'u': args->output_type = FT_BCF; break;
case 'z': args->output_type = FT_VCF_GZ; break;
case 'v': args->output_type = FT_VCF; break;
- default: error("The output type \"%s\" not recognised\n", optarg);
+ default:
+ {
+ args->clevel = strtol(optarg,&tmp,10);
+ if ( *tmp || args->clevel<0 || args->clevel>9 ) error("The output type \"%s\" not recognised\n", optarg);
+ }
+ }
+ if ( optarg[1] )
+ {
+ args->clevel = strtol(optarg+1,&tmp,10);
+ if ( *tmp || args->clevel<0 || args->clevel>9 ) error("Could not parse argument: --output-type %s\n", optarg+1);
}
break;
case 'e':
case 'S': args->sample_list = optarg; args->sample_is_file = 1; break;
case 't': targets_list = optarg; break;
case 'T': targets_list = optarg; targets_is_file = 1; break;
+ case 4 :
+ if ( !strcasecmp(optarg,"0") ) regions_overlap = 0;
+ else if ( !strcasecmp(optarg,"1") ) regions_overlap = 1;
+ else if ( !strcasecmp(optarg,"2") ) regions_overlap = 2;
+ else error("Could not parse: --regions-overlap %s\n",optarg);
+ break;
+ case 5 :
+ if ( !strcasecmp(optarg,"0") ) targets_overlap = 0;
+ else if ( !strcasecmp(optarg,"1") ) targets_overlap = 1;
+ else if ( !strcasecmp(optarg,"2") ) targets_overlap = 2;
+ else error("Could not parse: --targets-overlap %s\n",optarg);
+ break;
case 'h':
case '?': error("%s",usage());
default: error("The option not recognised: %s\n\n", optarg); break;
if ( !args->fa_fname ) error("Missing the --fa-ref option\n");
if ( !args->gff_fname ) error("Missing the --gff option\n");
args->sr = bcf_sr_init();
- if ( targets_list && bcf_sr_set_targets(args->sr, targets_list, targets_is_file, 0)<0 )
- error("Failed to read the targets: %s\n", targets_list);
- if ( regions_list && bcf_sr_set_regions(args->sr, regions_list, regions_is_file)<0 )
- error("Failed to read the regions: %s\n", regions_list);
+ if ( targets_list )
+ {
+ bcf_sr_set_opt(args->sr,BCF_SR_TARGETS_OVERLAP,targets_overlap);
+ if ( bcf_sr_set_targets(args->sr, targets_list, targets_is_file, 0)<0 )
+ error("Failed to read the targets: %s\n", targets_list);
+ }
+ if ( regions_list )
+ {
+ bcf_sr_set_opt(args->sr,BCF_SR_REGIONS_OVERLAP,regions_overlap);
+ if ( bcf_sr_set_regions(args->sr, regions_list, regions_is_file)<0 )
+ error("Failed to read the regions: %s\n", regions_list);
+ }
if ( bcf_sr_set_threads(args->sr, args->n_threads)<0 ) error("Failed to create %d extra threads\n", args->n_threads);
if ( !bcf_sr_add_reader(args->sr, fname) )
error("Failed to read from %s: %s\n", !strcmp("-",fname)?"standard input":fname,bcf_sr_strerror(args->sr->errnum));
vcsq_t
information required to assemble consequence lines such as "inframe_deletion|XYZ|ENST01|+|5TY>5I|121ACG>A+124TA>T"
- vcrec_t
+ vrec_t
single VCF record and csq tied to this record. (Haplotype can have multiple
consequences in several VCF records. Each record can have multiple consequences
from multiple haplotypes.)
uint32_t strand:1,
type:31; // one of CSQ_* types
uint32_t trid;
+ uint32_t vcf_ial;
uint32_t biotype; // one of GF_* types
char *gene; // gene name
bcf1_t *ref; // if type&CSQ_PRINTED_UPSTREAM, ref consequence "@1234"
typedef struct
{
bcf1_t *line;
- uint32_t *smpl; // bitmask of sample consequences with first/second haplotype interleaved
+ uint32_t *fmt_bm; // bitmask of sample consequences with first/second haplotype interleaved
uint32_t nfmt:4, // the bitmask size (the number of integers per sample)
nvcsq:28, mvcsq;
vcsq_t *vcsq; // there can be multiple consequences for a single VCF record
hap_node_t **child, *prev; // children haplotypes and previous coding node
int nchild, mchild;
bcf1_t *cur_rec, *rec; // current VCF record and node's VCF record
+ int vcf_ial; // which VCF allele generated this node
uint32_t nend; // number of haplotypes ending in this node
int *cur_child, mcur_child; // mapping from the allele to the currently active child
- csq_t *csq_list; // list of haplotype's consequences, broken by position
+ csq_t *csq_list; // list of haplotype's consequences, broken by position (each corresponds to a VCF record)
int ncsq_list, mcsq_list;
};
struct _tscript_t
char *outdir, **argv, *fa_fname, *gff_fname, *output_fname;
char *bcsq_tag;
- int argc, output_type;
+ int argc, output_type, clevel;
int phase, verbosity, local_csq, record_cmd_line;
int ncsq2_max, nfmt_bcsq; // maximum number of csq per site that can be accessed from FORMAT/BCSQ (*2 and 1 bit skipped to avoid BCF missing values)
int ncsq2_small_warned;
new_chr = strdup(chr_beg+3); // gff has the prefix, faidx does not
else
{
- new_chr = malloc(len+3); // gff does not have the prefix, faidx has
+ new_chr = malloc(len+4); // gff does not have the prefix, faidx has
memcpy(new_chr,"chr",3);
memcpy(new_chr+3,chr_beg,len);
new_chr[len+3] = 0;
}
else
{
- args->out_fh = hts_open(args->output_fname? args->output_fname : "-",hts_bcf_wmode2(args->output_type,args->output_fname));
+ char wmode[8];
+ set_wmode(wmode,args->output_type,args->output_fname,args->clevel);
+ args->out_fh = hts_open(args->output_fname ? args->output_fname : "-", wmode);
if ( args->out_fh == NULL ) error("[%s] Error: cannot write to %s: %s\n", __func__,args->output_fname? args->output_fname : "standard output", strerror(errno));
if ( args->n_threads > 0)
hts_set_opt(args->out_fh, HTS_OPT_THREAD_POOL, args->sr->p);
{
if ( !vbuf->vrec[j] ) continue;
if ( vbuf->vrec[j]->line ) bcf_destroy(vbuf->vrec[j]->line);
- free(vbuf->vrec[j]->smpl);
+ free(vbuf->vrec[j]->fmt_bm);
free(vbuf->vrec[j]->vcsq);
free(vbuf->vrec[j]);
}
{
tscript_t *tr;
struct {
- int32_t pos, rlen, alen;
+ int32_t pos, rlen, alen, ial;
char *ref, *alt;
bcf1_t *rec;
} vcf;
#endif
}
void csq_stage(args_t *args, csq_t *csq, bcf1_t *rec);
-static inline int csq_stage_utr(args_t *args, regitr_t *itr, bcf1_t *rec, uint32_t trid, uint32_t type)
+static inline int csq_stage_utr(args_t *args, regitr_t *itr, bcf1_t *rec, uint32_t trid, uint32_t type, int ial)
{
while ( regitr_overlap(itr) )
{
csq.type.biotype = tr->type;
csq.type.strand = tr->strand;
csq.type.trid = tr->id;
+ csq.type.vcf_ial = ial;
csq.type.gene = tr->gene->name;
csq_stage(args, &csq, rec);
return csq.type.type;
}
return 0;
}
-static inline void csq_stage_splice(args_t *args, bcf1_t *rec, tscript_t *tr, uint32_t type)
+static inline void csq_stage_splice(args_t *args, bcf1_t *rec, tscript_t *tr, uint32_t type, int ial)
{
#if XDBG
fprintf(bcftools_stderr,"csq_stage_splice %d: type=%d\n",rec->pos+1,type);
csq.type.biotype = tr->type;
csq.type.strand = tr->strand;
csq.type.trid = tr->id;
+ csq.type.vcf_ial = ial;
csq.type.gene = tr->gene->name;
csq_stage(args, &csq, rec);
}
const char *chr = bcf_seqname(args->hdr,splice->vcf.rec);
if ( regidx_overlap(args->idx_utr,chr,splice->ref_beg+1,splice->ref_beg+1, itr) ) // adjacent utr
{
- ret = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq);
+ ret = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial);
if ( ret!=0 )
{
regitr_destroy(itr);
if ( splice->check_acceptor && splice->tr->strand==STRAND_REV ) splice->csq |= CSQ_SPLICE_ACCEPTOR;
if ( ref && !strncmp(ref,alt,N_SPLICE_DONOR) ) splice->csq |= CSQ_SYNONYMOUS_VARIANT;
}
- csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq);
+ csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq, splice->vcf.ial);
return SPLICE_OUTSIDE;
}
if ( splice->ref_end < ex_beg || (splice->ref_end == ex_beg && !splice->check_region_beg) ) // fully outside, before the exon
const char *chr = bcf_seqname(args->hdr,splice->vcf.rec);
if ( regidx_overlap(args->idx_utr,chr,splice->ref_end-1,splice->ref_end-1, itr) ) // adjacent utr
{
- ret = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq);
+ ret = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial);
if ( ret!=0 )
{
regitr_destroy(itr);
if ( splice->check_acceptor && splice->tr->strand==STRAND_FWD ) splice->csq |= CSQ_SPLICE_ACCEPTOR;
if ( ref && !strncmp(ref+N_SPLICE_REGION_INTRON-N_SPLICE_DONOR,alt+N_SPLICE_REGION_INTRON-N_SPLICE_DONOR,N_SPLICE_DONOR) ) splice->csq |= CSQ_SYNONYMOUS_VARIANT;
}
- csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq);
+ csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq, splice->vcf.ial);
return SPLICE_OUTSIDE;
}
// overlaps the exon or inside the exon
splice->vcf.rlen -= splice->tbeg + splice->tend - 1;
if ( splice->kref.l > splice->vcf.rlen ) { splice->kref.l = splice->vcf.rlen; splice->kref.s[splice->kref.l] = 0; }
}
- csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq);
+ csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq, splice->vcf.ial);
return SPLICE_INSIDE;
}
regitr_t *itr = regitr_init(NULL);
const char *chr = bcf_seqname(args->hdr,splice->vcf.rec);
if ( regidx_overlap(args->idx_utr,chr,splice->ref_beg,ex_beg-1, itr) ) // adjacent utr
- csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq);
+ csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial);
regitr_destroy(itr);
}
if ( !csq )
regitr_t *itr = regitr_init(NULL);
const char *chr = bcf_seqname(args->hdr,splice->vcf.rec);
if ( regidx_overlap(args->idx_utr,chr,ex_end+1,splice->ref_end, itr) ) // adjacent utr
- csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq);
+ csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial);
regitr_destroy(itr);
}
if ( !csq )
}
if ( splice->ref_end < ex_beg || splice->ref_beg >= ex_end )
{
- csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq);
+ csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq, splice->vcf.ial);
return SPLICE_OUTSIDE;
}
if ( splice->ref_beg < ex_beg + 2 ) // ref_beg is off by -1
return SPLICE_OVERLAP;
}
}
- csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq);
+ csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq, splice->vcf.ial);
return SPLICE_INSIDE;
}
regitr_t *itr = regitr_init(NULL);
const char *chr = bcf_seqname(args->hdr,splice->vcf.rec);
if ( regidx_overlap(args->idx_utr,chr,splice->ref_beg,ex_beg-1, itr) ) // adjacent utr
- csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq);
+ csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial);
regitr_destroy(itr);
}
if ( !csq )
regitr_t *itr = regitr_init(NULL);
const char *chr = bcf_seqname(args->hdr,splice->vcf.rec);
if ( regidx_overlap(args->idx_utr,chr,ex_end+1,splice->ref_end, itr) ) // adjacent utr
- csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq);
+ csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial);
regitr_destroy(itr);
}
if ( !csq )
}
if ( splice->ref_end < ex_beg || splice->ref_beg > ex_end )
{
- csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq);
+ csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq, splice->vcf.ial);
return SPLICE_OUTSIDE;
}
splice->kref.l = 0; kputsn(splice->vcf.ref + splice->tbeg, splice->vcf.rlen, &splice->kref);
splice->kalt.l = 0; kputsn(splice->vcf.alt + splice->tbeg, splice->vcf.rlen, &splice->kalt);
}
- csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq);
+ csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq, splice->vcf.ial);
return SPLICE_INSIDE;
}
static inline int splice_csq(args_t *args, splice_t *splice, uint32_t ex_beg, uint32_t ex_end)
kstring_t str = {0,0,0};
tscript_t *tr = cds->tr;
child->icds = cds->icds; // index of cds in the tscript's list of exons
+ child->vcf_ial = ial;
splice_t splice;
splice_init(&splice, rec);
splice.tr = tr;
+ splice.vcf.ial = ial;
splice.vcf.alt = rec->d.allele[ial];
splice.check_acceptor = splice.check_donor = splice.set_refalt = splice.check_utr = 1;
if ( !(tr->trim & TRIM_5PRIME) )
if ( csq->type.trid != vrec->vcsq[i].trid && (csq->type.type|vrec->vcsq[i].type)&CSQ_PRN_TSCRIPT ) continue;
if ( csq->type.biotype != vrec->vcsq[i].biotype ) continue;
if ( csq->type.gene != vrec->vcsq[i].gene ) continue;
+ if ( csq->type.vcf_ial != vrec->vcsq[i].vcf_ial ) continue;
+ if ( (csq->type.type&CSQ_UPSTREAM_STOP)^(vrec->vcsq[i].type&CSQ_UPSTREAM_STOP) ) continue; // both must or mustn't have upstream_stop
if ( csq->type.vstr.s || vrec->vcsq[i].vstr.s )
{
// This is a bit hacky, but we want a simpler and more predictable output. The splice_csq() function
}
// no such csq yet in this vcf record
csq->vrec = vrec;
- csq->idx = i;
- vrec->nvcsq++;
+ csq->idx = vrec->nvcsq++;
hts_expand0(vcsq_t, vrec->nvcsq, vrec->mvcsq, vrec->vcsq);
vrec->vcsq[i] = csq->type;
return 0;
csq_t *csq = &node->csq_list[icsq];
csq->pos = hap->stack[ref_node].node->rec->pos;
csq->type.trid = tr->id;
+ csq->type.vcf_ial = node->vcf_ial;
csq->type.gene = tr->gene->name;
csq->type.strand = tr->strand;
csq->type.biotype = tr->type;
csq->type.type |= CSQ_INFRAME_DELETION;
else
csq->type.type |= CSQ_INFRAME_INSERTION;
+ if ( hap->tref.s[hap->tref.l-1]!='*' && hap->tseq.s[hap->tseq.l-1]=='*' )
+ csq->type.type |= CSQ_STOP_GAINED;
}
else
{
csq->type.type |= CSQ_MISSENSE_VARIANT;
}
}
+ // Check if compound inframe variants are real inframes, or if the stop codon occurs before the frameshift can be restored
+ if ( ibeg!=iend && (csq->type.type & (CSQ_INFRAME_DELETION|CSQ_INFRAME_INSERTION|CSQ_INFRAME_ALTERING)) && hap->tseq.s[hap->tseq.l-1]=='*' )
+ {
+ rm_csq |= CSQ_INFRAME_DELETION | CSQ_INFRAME_INSERTION | CSQ_INFRAME_ALTERING;
+ csq->type.type |= CSQ_FRAMESHIFT_VARIANT | CSQ_STOP_GAINED;
+ }
if ( has_upstream_stop ) csq->type.type |= CSQ_UPSTREAM_STOP;
csq->type.type &= ~rm_csq;
csq_t *tmp_csq = &node->csq_list[node->ncsq_list - 1];
tmp_csq->pos = hap->stack[i].node->rec->pos;
tmp_csq->type.trid = tr->id;
+ // TODO(review): tmp_csq->type.vcf_ial is not assigned here (candidate value: node->vcf_ial); this should not be needed for non-compound variants -- confirm
tmp_csq->type.gene = tr->gene->name;
tmp_csq->type.strand = tr->strand;
tmp_csq->type.type = hap->stack[i].node->csq & ~CSQ_COMPOUND & ~rm_csq;
csq_t *tmp_csq = &node->csq_list[node->ncsq_list - 1];
tmp_csq->pos = hap->stack[i].node->rec->pos;
tmp_csq->type.trid = tr->id;
+ // TODO(review): tmp_csq->type.vcf_ial is not assigned here (candidate value: node->vcf_ial); this should not be needed for non-compound variants -- confirm
tmp_csq->type.gene = tr->gene->name;
tmp_csq->type.strand = tr->strand;
tmp_csq->type.type = CSQ_PRINTED_UPSTREAM | hap->stack[i].node->csq;
if ( ibeg==-1 ) ibeg = i;
continue;
}
+ // the last base of the current variant vs the first base of the next
+ // variant: are they in the same codon? (forward strand)
int icur = node2sbeg(i);
int inext = node2sbeg(i+1);
+ if ( hap->stack[i].node->dlen > 0 ) icur += hap->stack[i].node->dlen;
+ else if ( hap->stack[i].node->dlen < 0 ) icur++;
if ( icur/3 == inext/3 ) // in the same codon, can't be flushed yet
{
if ( ibeg==-1 ) ibeg = i;
if ( ibeg==-1 ) ibeg = i;
continue;
}
+ // the last base of the current variant vs the first base of the next
+ // variant: are they in the same codon? (reverse strand)
int icur = sseq.m - 1 - node2sbeg(i);
int inext = sseq.m - 1 - node2sbeg(i-1);
+ if ( hap->stack[i].node->dlen > 0 ) icur += hap->stack[i].node->dlen - 1;
+ else if ( hap->stack[i].node->dlen < 0 ) icur -= hap->stack[i].node->dlen;
+ if ( hap->stack[i-1].node->dlen > 0 ) inext -= hap->stack[i-1].node->dlen;
if ( icur/3 == inext/3 )
{
if ( ibeg==-1 ) ibeg = i;
int ival, ibit;
icsq2_to_bit(icsq2, &ival,&ibit);
if ( vrec->nfmt < 1 + ival ) vrec->nfmt = 1 + ival;
- vrec->smpl[ismpl*args->nfmt_bcsq + ival] |= 1 << ibit;
+ vrec->fmt_bm[ismpl*args->nfmt_bcsq + ival] |= 1 << ibit;
}
}
vrec_t *vrec = vbuf->vrec[vbuf->n - 1];
if ( args->phase!=PHASE_DROP_GT && args->smpl->n )
{
- if ( !vrec->smpl ) vrec->smpl = (uint32_t*) calloc(args->hdr_nsmpl,sizeof(*vrec->smpl) * args->nfmt_bcsq);
- else memset(vrec->smpl,0,args->hdr_nsmpl*sizeof(*vrec->smpl) * args->nfmt_bcsq);
+ if ( !vrec->fmt_bm ) vrec->fmt_bm = (uint32_t*) calloc(args->hdr_nsmpl,sizeof(*vrec->fmt_bm) * args->nfmt_bcsq);
+ else memset(vrec->fmt_bm,0,args->hdr_nsmpl*sizeof(*vrec->fmt_bm) * args->nfmt_bcsq);
}
if ( !vrec->line ) vrec->line = bcf_init1();
SWAP(bcf1_t*, (*rec_ptr), vrec->line);
{
if ( vrec->nfmt < args->nfmt_bcsq )
for (j=1; j<args->hdr_nsmpl; j++)
- memmove(&vrec->smpl[j*vrec->nfmt], &vrec->smpl[j*args->nfmt_bcsq], vrec->nfmt*sizeof(*vrec->smpl));
- bcf_update_format_int32(args->hdr, vrec->line, args->bcsq_tag, vrec->smpl, args->hdr_nsmpl*vrec->nfmt);
+ memmove(&vrec->fmt_bm[j*vrec->nfmt], &vrec->fmt_bm[j*args->nfmt_bcsq], vrec->nfmt*sizeof(*vrec->fmt_bm));
+ bcf_update_format_int32(args->hdr, vrec->line, args->bcsq_tag, vrec->fmt_bm, args->hdr_nsmpl*vrec->nfmt);
}
vrec->nvcsq = 0;
if ( bcf_write(args->out_fh, args->hdr, vrec->line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname?args->output_fname:"standard output");
csq.type.biotype = tr->type;
csq.type.strand = tr->strand;
csq.type.trid = tr->id;
+ csq.type.vcf_ial = i;
csq.type.gene = tr->gene->name;
int csq_type = node.csq;
- // code repetition: it would be nice to reuse the code from hap_add_csq, needs have refactoring though
+ // code repetition: it would be nice to reuse the code from hap_add_csq, needs refactoring though
if ( node.type == HAP_SSS )
{
csq.type.type = csq_type;
csq_type |= CSQ_INFRAME_DELETION;
else
csq_type |= CSQ_INFRAME_INSERTION;
+ if ( tref->s[tref->l-1]!='*' && tseq->s[tseq->l-1]=='*' )
+ csq_type |= CSQ_STOP_GAINED;
}
else
{
csq.type.biotype = tr->type;
csq.type.strand = tr->strand;
csq.type.trid = tr->id;
+ csq.type.vcf_ial = 1;
csq.type.gene = tr->gene->name;
csq.type.type = child->csq;
csq_stage(args, &csq, rec);
csq.type.biotype = tr->type;
csq.type.strand = tr->strand;
csq.type.trid = tr->id;
+ csq.type.vcf_ial = ial;
csq.type.gene = tr->gene->name;
csq.type.type = child->csq;
csq_stage(args, &csq, rec);
// known issues: tab output leads to unsorted output. This is because
// coding haplotypes are printed in one go and buffering is not used
// with tab output. VCF output is OK though.
- if ( csq_push(args, csq, rec)!=0 ) return; // the consequence already exists
+ if ( csq_push(args, csq, rec)!=0 && args->phase==PHASE_DROP_GT ) return; // the consequence already exists
int i,j,ngt = 0;
if ( args->phase!=PHASE_DROP_GT )
int32_t *gt = args->gt_arr + args->smpl->idx[i]*ngt;
for (j=0; j<ngt; j++)
{
- if ( gt[j]==bcf_gt_missing || gt[j]==bcf_int32_vector_end || !bcf_gt_allele(gt[j]) ) continue;
+ if ( gt[j]==bcf_gt_missing || gt[j]==bcf_int32_vector_end ) continue;
+ int ial = bcf_gt_allele(gt[j]);
+ if ( !ial || ial!=csq->type.vcf_ial ) continue;
csq_print_text(args, csq, args->smpl->idx[i],j+1);
}
}
int32_t *gt = args->gt_arr + args->smpl->idx[i]*ngt;
for (j=0; j<ngt; j++)
{
- if ( gt[j]==bcf_gt_missing || gt[j]==bcf_int32_vector_end || !bcf_gt_allele(gt[j]) ) continue;
+ if ( gt[j]==bcf_gt_missing || gt[j]==bcf_int32_vector_end ) continue;
+ int ial = bcf_gt_allele(gt[j]);
+ if ( !ial || ial!=csq->type.vcf_ial ) continue;
int icsq2 = 2*csq->idx + j;
if ( icsq2 >= args->ncsq2_max ) // more than ncsq_max consequences, so can't fit it in FMT
int ival, ibit;
icsq2_to_bit(icsq2, &ival,&ibit);
if ( vrec->nfmt < 1 + ival ) vrec->nfmt = 1 + ival;
- vrec->smpl[i*args->nfmt_bcsq + ival] |= 1 << ibit;
+ vrec->fmt_bm[i*args->nfmt_bcsq + ival] |= 1 << ibit;
}
}
}
csq.type.biotype = tr->type;
csq.type.strand = tr->strand;
csq.type.trid = tr->id;
+ csq.type.vcf_ial = i;
csq.type.gene = tr->gene->name;
csq_stage(args, &csq, rec);
ret = 1;
bcf1_t *rec = *rec_ptr;
static int32_t prev_rid = -1, prev_pos = -1;
- if ( prev_rid!=rec->rid ) { prev_rid = rec->rid; prev_pos = rec->pos; }
+ if ( prev_rid!=rec->rid )
+ {
+ prev_rid = rec->rid;
+ prev_pos = rec->pos;
+
+ // A common error is to use different naming conventions in the fasta and the VCF (e.g. X vs chrX).
+ // Perform a simple sanity check (which does not catch much): the chromosome must be present in the
+ // reference file.
+ if ( !faidx_has_seq(args->fai,bcf_seqname(args->hdr,rec)) )
+ error("Error: the chromosome \"%s\" is not present in %s\n",bcf_seqname(args->hdr,rec),args->fa_fname);
+ }
if ( prev_pos > rec->pos )
error("Error: The file is not sorted, %s:%d comes before %s:%"PRId64"\n",bcf_seqname(args->hdr,rec),prev_pos+1,bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1);
"Usage: bcftools csq [OPTIONS] in.vcf\n"
"\n"
"Required options:\n"
- " -f, --fasta-ref FILE reference file in fasta format\n"
- " -g, --gff-annot FILE gff3 annotation file\n"
+ " -f, --fasta-ref FILE Reference file in fasta format\n"
+ " -g, --gff-annot FILE GFF3 annotation file\n"
"\n"
"CSQ options:\n"
- " -B, --trim-protein-seq INT abbreviate protein-changing predictions to max INT aminoacids\n"
- " -c, --custom-tag STRING use this tag instead of the default BCSQ\n"
- " -l, --local-csq localized predictions, consider only one VCF record at a time\n"
- " -n, --ncsq INT maximum number of per-haplotype consequences to consider for each site [15]\n"
- " -p, --phase a|m|r|R|s how to handle unphased heterozygous genotypes: [r]\n"
- " a: take GTs as is, create haplotypes regardless of phase (0/1 -> 0|1)\n"
- " m: merge *all* GTs into a single haplotype (0/1 -> 1, 1/2 -> 1)\n"
- " r: require phased GTs, throw an error on unphased het GTs\n"
- " R: create non-reference haplotypes if possible (0/1 -> 1|1, 1/2 -> 1|2)\n"
- " s: skip unphased hets\n"
+ " -B, --trim-protein-seq INT Abbreviate protein-changing predictions to max INT aminoacids\n"
+ " -c, --custom-tag STRING Use this tag instead of the default BCSQ\n"
+ " -l, --local-csq Localized predictions, consider only one VCF record at a time\n"
+ " -n, --ncsq INT Maximum number of per-haplotype consequences to consider for each site [15]\n"
+ " -p, --phase a|m|r|R|s How to handle unphased heterozygous genotypes: [r]\n"
+ " a: take GTs as is, create haplotypes regardless of phase (0/1 -> 0|1)\n"
+ " m: merge *all* GTs into a single haplotype (0/1 -> 1, 1/2 -> 1)\n"
+ " r: require phased GTs, throw an error on unphased het GTs\n"
+ " R: create non-reference haplotypes if possible (0/1 -> 1|1, 1/2 -> 1|2)\n"
+ " s: skip unphased hets\n"
"Options:\n"
- " -e, --exclude EXPR exclude sites for which the expression is true\n"
- " --force run even if some sanity checks fail\n"
- " -i, --include EXPR select sites for which the expression is true\n"
- " --no-version do not append version and command line to the header\n"
- " -o, --output FILE write output to a file [standard output]\n"
- " -O, --output-type b|u|z|v|t b: compressed BCF, u: uncompressed BCF, z: compressed VCF\n"
- " v: uncompressed VCF, t: plain tab-delimited text output [v]\n"
- " -r, --regions REGION restrict to comma-separated list of regions\n"
- " -R, --regions-file FILE restrict to regions listed in a file\n"
- " -s, --samples -|LIST samples to include or \"-\" to apply all variants and ignore samples\n"
- " -S, --samples-file FILE samples to include\n"
- " -t, --targets REGION similar to -r but streams rather than index-jumps\n"
- " -T, --targets-file FILE similar to -R but streams rather than index-jumps\n"
- " --threads INT use multithreading with <int> worker threads [0]\n"
- " -v, --verbose INT verbosity level 0-2 [1]\n"
+ " -e, --exclude EXPR Exclude sites for which the expression is true\n"
+ " --force Run even if some sanity checks fail\n"
+ " -i, --include EXPR Select sites for which the expression is true\n"
+ " --no-version Do not append version and command line to the header\n"
+ " -o, --output FILE Write output to a file [standard output]\n"
+ " -O, --output-type b|u|z|v|t[0-9] b: compressed BCF, u: uncompressed BCF, z: compressed VCF\n"
+ " v: uncompressed VCF, t: plain tab-delimited text output, 0-9: compression level [v]\n"
+ " -r, --regions REGION Restrict to comma-separated list of regions\n"
+ " -R, --regions-file FILE Restrict to regions listed in a file\n"
+ " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n"
+ " -s, --samples -|LIST Samples to include or \"-\" to apply all variants and ignore samples\n"
+ " -S, --samples-file FILE Samples to include\n"
+ " -t, --targets REGION Similar to -r but streams rather than index-jumps\n"
+ " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n"
+ " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n"
+ " --threads INT Use multithreading with <int> worker threads [0]\n"
+ " -v, --verbose INT Verbosity level 0-2 [1]\n"
"\n"
"Example:\n"
" bcftools csq -f hs37d5.fa -g Homo_sapiens.GRCh37.82.gff3.gz in.vcf\n"
args->ncsq2_max = 2*(16-1); // 1 bit is reserved for BCF missing values
args->verbosity = 1;
args->record_cmd_line = 1;
+ args->clevel = -1;
static struct option loptions[] =
{
{"verbose",1,0,'v'},
{"regions",1,0,'r'},
{"regions-file",1,0,'R'},
+ {"regions-overlap",required_argument,NULL,4},
{"samples",1,0,'s'},
{"samples-file",1,0,'S'},
{"targets",1,0,'t'},
{"targets-file",1,0,'T'},
+ {"targets-overlap",required_argument,NULL,5},
{"no-version",no_argument,NULL,3},
{0,0,0,0}
};
int c, targets_is_file = 0, regions_is_file = 0;
+ int regions_overlap = 1;
+ int targets_overlap = 0;
char *targets_list = NULL, *regions_list = NULL, *tmp;
while ((c = getopt_long(argc, argv, "?hr:R:t:T:i:e:f:o:O:g:s:S:p:qc:ln:bB:v:",loptions,NULL)) >= 0)
{
case 'u': args->output_type = FT_BCF; break;
case 'z': args->output_type = FT_VCF_GZ; break;
case 'v': args->output_type = FT_VCF; break;
- default: error("The output type \"%s\" not recognised\n", optarg);
+ default:
+ {
+ args->clevel = strtol(optarg,&tmp,10);
+ if ( *tmp || args->clevel<0 || args->clevel>9 ) error("The output type \"%s\" not recognised\n", optarg);
+ }
+ }
+ if ( optarg[1] )
+ {
+ args->clevel = strtol(optarg+1,&tmp,10);
+ if ( *tmp || args->clevel<0 || args->clevel>9 ) error("Could not parse argument: --output-type %s\n", optarg+1);
}
break;
case 'e':
case 'S': args->sample_list = optarg; args->sample_is_file = 1; break;
case 't': targets_list = optarg; break;
case 'T': targets_list = optarg; targets_is_file = 1; break;
+ case 4 :
+ if ( !strcasecmp(optarg,"0") ) regions_overlap = 0;
+ else if ( !strcasecmp(optarg,"1") ) regions_overlap = 1;
+ else if ( !strcasecmp(optarg,"2") ) regions_overlap = 2;
+ else error("Could not parse: --regions-overlap %s\n",optarg);
+ break;
+ case 5 :
+ if ( !strcasecmp(optarg,"0") ) targets_overlap = 0;
+ else if ( !strcasecmp(optarg,"1") ) targets_overlap = 1;
+ else if ( !strcasecmp(optarg,"2") ) targets_overlap = 2;
+ else error("Could not parse: --targets-overlap %s\n",optarg);
+ break;
case 'h':
case '?': error("%s",usage());
default: error("The option not recognised: %s\n\n", optarg); break;
if ( !args->fa_fname ) error("Missing the --fa-ref option\n");
if ( !args->gff_fname ) error("Missing the --gff option\n");
args->sr = bcf_sr_init();
- if ( targets_list && bcf_sr_set_targets(args->sr, targets_list, targets_is_file, 0)<0 )
- error("Failed to read the targets: %s\n", targets_list);
- if ( regions_list && bcf_sr_set_regions(args->sr, regions_list, regions_is_file)<0 )
- error("Failed to read the regions: %s\n", regions_list);
+ if ( targets_list )
+ {
+ bcf_sr_set_opt(args->sr,BCF_SR_TARGETS_OVERLAP,targets_overlap);
+ if ( bcf_sr_set_targets(args->sr, targets_list, targets_is_file, 0)<0 )
+ error("Failed to read the targets: %s\n", targets_list);
+ }
+ if ( regions_list )
+ {
+ bcf_sr_set_opt(args->sr,BCF_SR_REGIONS_OVERLAP,regions_overlap);
+ if ( bcf_sr_set_regions(args->sr, regions_list, regions_is_file)<0 )
+ error("Failed to read the regions: %s\n", regions_list);
+ }
if ( bcf_sr_set_threads(args->sr, args->n_threads)<0 ) error("Failed to create %d extra threads\n", args->n_threads);
if ( !bcf_sr_add_reader(args->sr, fname) )
error("Failed to read from %s: %s\n", !strcmp("-",fname)?"standard input":fname,bcf_sr_strerror(args->sr->errnum));
call->als[nals] = call->tgt_als->allele[i];
j = vcmp_find_allele(call->vcmp, rec->d.allele+1, rec->n_allele - 1, call->tgt_als->allele[i]);
- if ( j+1==*unseen )
- {
- fprintf(stderr,"Fixme? Cannot constrain to %d-th allele (%s). VCF=",i,call->tgt_als->allele[i]);
- int k;
- for (k=0; k<rec->n_allele; k++) fprintf(stderr,"%s%s",k==0?"":",",rec->d.allele[k]);
- fprintf(stderr,"\tTAB=");
- for (k=0; k<call->tgt_als->n; k++) fprintf(stderr,"%s%s",k==0?"":",",call->tgt_als->allele[k]);
- fprintf(stderr,"\n");
- return -1;
- }
+ // if ( j+1==*unseen )
+ // {
+ // fprintf(stderr,"Fixme? Cannot constrain to %d-th allele (%s); j=%d,unseen=%d. VCF=",i,call->tgt_als->allele[i],j,*unseen);
+ // int k;
+ // for (k=0; k<rec->n_allele; k++) fprintf(stderr,"%s%s",k==0?"":",",rec->d.allele[k]);
+ // fprintf(stderr,"\tTAB=");
+ // for (k=0; k<call->tgt_als->n; k++) fprintf(stderr,"%s%s",k==0?"":",",call->tgt_als->allele[k]);
+ // fprintf(stderr,"\n");
+ // return -1;
+ // }
if ( j>=0 )
{
call->als[nals] = call->tgt_als->allele[i];
j = vcmp_find_allele(call->vcmp, rec->d.allele+1, rec->n_allele - 1, call->tgt_als->allele[i]);
- if ( j+1==*unseen )
- {
- fprintf(bcftools_stderr,"Fixme? Cannot constrain to %d-th allele (%s). VCF=",i,call->tgt_als->allele[i]);
- int k;
- for (k=0; k<rec->n_allele; k++) fprintf(bcftools_stderr,"%s%s",k==0?"":",",rec->d.allele[k]);
- fprintf(bcftools_stderr,"\tTAB=");
- for (k=0; k<call->tgt_als->n; k++) fprintf(bcftools_stderr,"%s%s",k==0?"":",",call->tgt_als->allele[k]);
- fprintf(bcftools_stderr,"\n");
- return -1;
- }
+ // if ( j+1==*unseen )
+ // {
+ // fprintf(bcftools_stderr,"Fixme? Cannot constrain to %d-th allele (%s); j=%d,unseen=%d. VCF=",i,call->tgt_als->allele[i],j,*unseen);
+ // int k;
+ // for (k=0; k<rec->n_allele; k++) fprintf(bcftools_stderr,"%s%s",k==0?"":",",rec->d.allele[k]);
+ // fprintf(bcftools_stderr,"\tTAB=");
+ // for (k=0; k<call->tgt_als->n; k++) fprintf(bcftools_stderr,"%s%s",k==0?"":",",call->tgt_als->allele[k]);
+ // fprintf(bcftools_stderr,"\n");
+ // return -1;
+ // }
if ( j>=0 )
{
int min_mq, flag, min_baseQ, max_baseQ, delta_baseQ, capQ_thres, max_depth,
max_indel_depth, max_read_len, fmt_flag, ambig_reads;
int rflag_require, rflag_filter, output_type;
- int openQ, extQ, tandemQ, min_support; // for indels
+ int openQ, extQ, tandemQ, min_support, indel_win_size; // for indels
double min_frac; // for indels
double indel_bias;
char *reg_fname, *pl_list, *fai_fname, *output_fname;
- int reg_is_file, record_cmd_line, n_threads;
+ int reg_is_file, record_cmd_line, n_threads, clevel;
faidx_t *fai;
regidx_t *bed, *reg; // bed: skipping regions, reg: index-jump to regions
regitr_t *bed_itr, *reg_itr;
}
if (ma->conf->flag & MPLP_REALN) {
- int i, tot_ins = 0;
+ int i;
+ // int tot_ins = 0;
+ // int p = 0;
uint32_t *cigar = bam_get_cigar(b);
- int p = 0;
for (i=0; i<b->core.n_cigar; i++) {
int cig = cigar[i] & BAM_CIGAR_MASK;
- if (bam_cigar_type(cig) & 2)
- p += cigar[i] >> BAM_CIGAR_SHIFT;
+ // if (bam_cigar_type(cig) & 2)
+ // p += cigar[i] >> BAM_CIGAR_SHIFT;
if (cig == BAM_CINS || cig == BAM_CDEL || cig == BAM_CREF_SKIP) {
- tot_ins += cigar[i] >> BAM_CIGAR_SHIFT;
+ // tot_ins += cigar[i] >> BAM_CIGAR_SHIFT;
// Possible further optimsation, check tot_ins==1 later
// (and remove break) so we can detect single bp indels.
// We may want to focus BAQ on more complex regions only.
fprintf(stderr, "[%s] %d samples in %d input files\n", __func__, conf->gplp->n, conf->nfiles);
// write the VCF header
- conf->bcf_fp = hts_open(conf->output_fname?conf->output_fname:"-", hts_bcf_wmode2(conf->output_type,conf->output_fname));
+ char wmode[8];
+ set_wmode(wmode,conf->output_type,conf->output_fname,conf->clevel);
+ conf->bcf_fp = hts_open(conf->output_fname ? conf->output_fname : "-", wmode);
if (conf->bcf_fp == NULL) {
fprintf(stderr, "[%s] failed to write to %s: %s\n", __func__, conf->output_fname? conf->output_fname : "standard output", strerror(errno));
exit(EXIT_FAILURE);
conf->bca->per_sample_flt = conf->flag & MPLP_PER_SAMPLE;
conf->bca->fmt_flag = conf->fmt_flag;
conf->bca->ambig_reads = conf->ambig_reads;
+ conf->bca->indel_win_size = conf->indel_win_size;
conf->bc.bcf_hdr = conf->bcf_hdr;
conf->bc.n = nsmpl;
"Usage: bcftools mpileup [options] in1.bam [in2.bam [...]]\n"
"\n"
"Input options:\n"
- " -6, --illumina1.3+ quality is in the Illumina-1.3+ encoding\n"
- " -A, --count-orphans do not discard anomalous read pairs\n"
- " -b, --bam-list FILE list of input BAM filenames, one per line\n"
- " -B, --no-BAQ disable BAQ (per-Base Alignment Quality)\n"
- " -C, --adjust-MQ INT adjust mapping quality [0]\n"
+ " -6, --illumina1.3+ Quality is in the Illumina-1.3+ encoding\n"
+ " -A, --count-orphans Do not discard anomalous read pairs\n"
+ " -b, --bam-list FILE List of input BAM filenames, one per line\n"
+ " -B, --no-BAQ Disable BAQ (per-Base Alignment Quality)\n"
+ " -C, --adjust-MQ INT Adjust mapping quality [0]\n"
" -D, --full-BAQ Apply BAQ everywhere, not just in problematic regions\n"
- " -d, --max-depth INT max raw per-file depth; avoids excessive memory usage [%d]\n", mplp->max_depth);
+ " -d, --max-depth INT Max raw per-file depth; avoids excessive memory usage [%d]\n", mplp->max_depth);
fprintf(fp,
- " -E, --redo-BAQ recalculate BAQ on the fly, ignore existing BQs\n"
- " -f, --fasta-ref FILE faidx indexed reference sequence file\n"
- " --no-reference do not require fasta reference file\n"
- " -G, --read-groups FILE select or exclude read groups listed in the file\n"
- " -q, --min-MQ INT skip alignments with mapQ smaller than INT [%d]\n", mplp->min_mq);
+ " -E, --redo-BAQ Recalculate BAQ on the fly, ignore existing BQs\n"
+ " -f, --fasta-ref FILE Faidx indexed reference sequence file\n"
+ " --no-reference Do not require fasta reference file\n"
+ " -G, --read-groups FILE Select or exclude read groups listed in the file\n"
+ " -q, --min-MQ INT Skip alignments with mapQ smaller than INT [%d]\n", mplp->min_mq);
fprintf(fp,
- " -Q, --min-BQ INT skip bases with baseQ/BAQ smaller than INT [%d]\n", mplp->min_baseQ);
+ " -Q, --min-BQ INT Skip bases with baseQ/BAQ smaller than INT [%d]\n", mplp->min_baseQ);
fprintf(fp,
- " --max-BQ INT limit baseQ/BAQ to no more than INT [%d]\n", mplp->max_baseQ);
+ " --max-BQ INT Limit baseQ/BAQ to no more than INT [%d]\n", mplp->max_baseQ);
fprintf(fp,
" --delta-BQ INT Use neighbour_qual + INT if less than qual [%d]\n", mplp->delta_baseQ);
fprintf(fp,
- " -r, --regions REG[,...] comma separated list of regions in which pileup is generated\n"
- " -R, --regions-file FILE restrict to regions listed in a file\n"
- " --ignore-RG ignore RG tags (one BAM = one sample)\n"
- " --rf, --incl-flags STR|INT required flags: skip reads with mask bits unset [%s]\n", tmp_require);
+ " -r, --regions REG[,...] Comma separated list of regions in which pileup is generated\n"
+ " -R, --regions-file FILE Restrict to regions listed in a file\n"
+ " --ignore-RG Ignore RG tags (one BAM = one sample)\n"
+ " --rf, --incl-flags STR|INT Required flags: skip reads with mask bits unset [%s]\n", tmp_require);
fprintf(fp,
- " --ff, --excl-flags STR|INT filter flags: skip reads with mask bits set\n"
+ " --ff, --excl-flags STR|INT Filter flags: skip reads with mask bits set\n"
" [%s]\n", tmp_filter);
fprintf(fp,
- " -s, --samples LIST comma separated list of samples to include\n"
- " -S, --samples-file FILE file of samples to include\n"
- " -t, --targets REG[,...] similar to -r but streams rather than index-jumps\n"
- " -T, --targets-file FILE similar to -R but streams rather than index-jumps\n"
- " -x, --ignore-overlaps disable read-pair overlap detection\n"
- " --seed INT random number seed used for sampling deep regions [0]\n"
+ " -s, --samples LIST Comma separated list of samples to include\n"
+ " -S, --samples-file FILE File of samples to include\n"
+ " -t, --targets REG[,...] Similar to -r but streams rather than index-jumps\n"
+ " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n"
+ " -x, --ignore-overlaps Disable read-pair overlap detection\n"
+ " --seed INT Random number seed used for sampling deep regions [0]\n"
"\n"
"Output options:\n"
- " -a, --annotate LIST optional tags to output; '?' to list available tags []\n"
- " -g, --gvcf INT[,...] group non-variant sites into gVCF blocks according\n"
- " to minimum per-sample DP\n"
- " --no-version do not append version and command line to the header\n"
- " -o, --output FILE write output to FILE [standard output]\n"
+ " -a, --annotate LIST Optional tags to output; '?' to list available tags []\n"
+ " -g, --gvcf INT[,...] Group non-variant sites into gVCF blocks according\n"
+ " to minimum per-sample DP\n"
+ " --no-version Do not append version and command line to the header\n"
+ " -o, --output FILE Write output to FILE [standard output]\n"
" -O, --output-type TYPE 'b' compressed BCF; 'u' uncompressed BCF;\n"
- " 'z' compressed VCF; 'v' uncompressed VCF [v]\n"
- " -U, --mwu-u use older probability scale for Mann-Whitney U test\n"
- " --threads INT use multithreading with INT worker threads [0]\n"
+ " 'z' compressed VCF; 'v' uncompressed VCF; 0-9 compression level [v]\n"
+ " -U, --mwu-u Use older probability scale for Mann-Whitney U test\n"
+ " --threads INT Use multithreading with INT worker threads [0]\n"
"\n"
"SNP/INDEL genotype likelihoods options:\n"
" -X, --config STR Specify platform specific profiles (see below)\n"
" -e, --ext-prob INT Phred-scaled gap extension seq error probability [%d]\n", mplp->extQ);
fprintf(fp,
- " -F, --gap-frac FLOAT minimum fraction of gapped reads [%g]\n", mplp->min_frac);
+ " -F, --gap-frac FLOAT Minimum fraction of gapped reads [%g]\n", mplp->min_frac);
fprintf(fp,
- " -h, --tandem-qual INT coefficient for homopolymer errors [%d]\n", mplp->tandemQ);
+ " -h, --tandem-qual INT Coefficient for homopolymer errors [%d]\n", mplp->tandemQ);
fprintf(fp,
- " -I, --skip-indels do not perform indel calling\n"
- " -L, --max-idepth INT maximum per-file depth for INDEL calling [%d]\n", mplp->max_indel_depth);
+ " -I, --skip-indels Do not perform indel calling\n"
+ " -L, --max-idepth INT Maximum per-file depth for INDEL calling [%d]\n", mplp->max_indel_depth);
fprintf(fp,
- " -m, --min-ireads INT minimum number gapped reads for indel candidates [%d]\n", mplp->min_support);
+ " -m, --min-ireads INT Minimum number gapped reads for indel candidates [%d]\n", mplp->min_support);
fprintf(fp,
- " -M, --max-read-len INT maximum length of read to pass to BAQ algorithm [%d]\n", mplp->max_read_len);
+ " -M, --max-read-len INT Maximum length of read to pass to BAQ algorithm [%d]\n", mplp->max_read_len);
fprintf(fp,
" -o, --open-prob INT Phred-scaled gap open seq error probability [%d]\n", mplp->openQ);
fprintf(fp,
- " -p, --per-sample-mF apply -m and -F per-sample for increased sensitivity\n"
- " -P, --platforms STR comma separated list of platforms for indels [all]\n"
+ " -p, --per-sample-mF Apply -m and -F per-sample for increased sensitivity\n"
+ " -P, --platforms STR Comma separated list of platforms for indels [all]\n"
" --ar, --ambig-reads STR What to do with ambiguous indel reads: drop,incAD,incAD0 [drop]\n");
fprintf(fp,
" --indel-bias FLOAT Raise to favour recall over precision [%.2f]\n", mplp->indel_bias);
+ fprintf(fp,
+ " --indel-size INT Approximate maximum indel size considered [%d]\n", mplp->indel_win_size);
fprintf(fp,"\n");
fprintf(fp,
"Configuration profiles activated with -X, --config:\n"
mplp.fmt_flag = B2B_INFO_VDB|B2B_INFO_RPB|B2B_INFO_SCB|B2B_INFO_ZSCORE;
mplp.max_read_len = 500;
mplp.ambig_reads = B2B_DROP;
+ mplp.indel_win_size = 110;
+ mplp.clevel = -1;
hts_srand48(0);
static const struct option lopts[] =
{"ext-prob", required_argument, NULL, 'e'},
{"gap-frac", required_argument, NULL, 'F'},
{"indel-bias", required_argument, NULL, 10},
+ {"indel-size", required_argument, NULL, 15},
{"tandem-qual", required_argument, NULL, 'h'},
{"skip-indels", no_argument, NULL, 'I'},
{"max-idepth", required_argument, NULL, 'L'},
case 'u': mplp.output_type = FT_BCF; break;
case 'z': mplp.output_type = FT_VCF_GZ; break;
case 'v': mplp.output_type = FT_VCF; break;
- default: error("[error] The option \"-O\" changed meaning when mpileup moved to bcftools. Did you mean: \"bcftools mpileup --output-type\" or \"samtools mpileup --output-BP\"?\n");
+ default:
+ {
+ char *tmp;
+ mplp.clevel = strtol(optarg,&tmp,10);
+ if ( *tmp || mplp.clevel<0 || mplp.clevel>9 ) error("The output type \"%s\" not recognised\n", optarg);
+ }
+ }
+ if ( optarg[1] )
+ {
+ char *tmp;
+ mplp.clevel = strtol(optarg+1,&tmp,10);
+ if ( *tmp || mplp.clevel<0 || mplp.clevel>9 ) error("Could not parse argument: --output-type %s\n", optarg+1);
}
break;
case 'C': mplp.capQ_thres = atoi(optarg); break;
else
mplp.indel_bias = 1/atof(optarg);
break;
+ case 15: {
+ char *tmp;
+ mplp.indel_win_size = strtol(optarg,&tmp,10);
+ if ( *tmp ) error("Could not parse argument: --indel-size %s\n", optarg);
+ if ( mplp.indel_win_size < 110 )
+ {
+ mplp.indel_win_size = 110;
+ fprintf(stderr,"Warning: running with --indel-size %d, the requested value is too small\n",mplp.indel_win_size);
+ }
+ }
+ break;
case 'A': use_orphan = 1; break;
case 'F': mplp.min_frac = atof(optarg); break;
case 'm': mplp.min_support = atoi(optarg); break;
int min_mq, flag, min_baseQ, max_baseQ, delta_baseQ, capQ_thres, max_depth,
max_indel_depth, max_read_len, fmt_flag, ambig_reads;
int rflag_require, rflag_filter, output_type;
- int openQ, extQ, tandemQ, min_support; // for indels
+ int openQ, extQ, tandemQ, min_support, indel_win_size; // for indels
double min_frac; // for indels
double indel_bias;
char *reg_fname, *pl_list, *fai_fname, *output_fname;
- int reg_is_file, record_cmd_line, n_threads;
+ int reg_is_file, record_cmd_line, n_threads, clevel;
faidx_t *fai;
regidx_t *bed, *reg; // bed: skipping regions, reg: index-jump to regions
regitr_t *bed_itr, *reg_itr;
}
if (ma->conf->flag & MPLP_REALN) {
- int i, tot_ins = 0;
+ int i;
+ // int tot_ins = 0;
+ // int p = 0;
uint32_t *cigar = bam_get_cigar(b);
- int p = 0;
for (i=0; i<b->core.n_cigar; i++) {
int cig = cigar[i] & BAM_CIGAR_MASK;
- if (bam_cigar_type(cig) & 2)
- p += cigar[i] >> BAM_CIGAR_SHIFT;
+ // if (bam_cigar_type(cig) & 2)
+ // p += cigar[i] >> BAM_CIGAR_SHIFT;
if (cig == BAM_CINS || cig == BAM_CDEL || cig == BAM_CREF_SKIP) {
- tot_ins += cigar[i] >> BAM_CIGAR_SHIFT;
+ // tot_ins += cigar[i] >> BAM_CIGAR_SHIFT;
// Possible further optimsation, check tot_ins==1 later
// (and remove break) so we can detect single bp indels.
// We may want to focus BAQ on more complex regions only.
fprintf(bcftools_stderr, "[%s] %d samples in %d input files\n", __func__, conf->gplp->n, conf->nfiles);
// write the VCF header
- conf->bcf_fp = hts_open(conf->output_fname?conf->output_fname:"-", hts_bcf_wmode2(conf->output_type,conf->output_fname));
+ char wmode[8];
+ set_wmode(wmode,conf->output_type,conf->output_fname,conf->clevel);
+ conf->bcf_fp = hts_open(conf->output_fname ? conf->output_fname : "-", wmode);
if (conf->bcf_fp == NULL) {
fprintf(bcftools_stderr, "[%s] failed to write to %s: %s\n", __func__, conf->output_fname? conf->output_fname : "standard output", strerror(errno));
bcftools_exit(EXIT_FAILURE);
conf->bca->per_sample_flt = conf->flag & MPLP_PER_SAMPLE;
conf->bca->fmt_flag = conf->fmt_flag;
conf->bca->ambig_reads = conf->ambig_reads;
+ conf->bca->indel_win_size = conf->indel_win_size;
conf->bc.bcf_hdr = conf->bcf_hdr;
conf->bc.n = nsmpl;
"Usage: bcftools mpileup [options] in1.bam [in2.bam [...]]\n"
"\n"
"Input options:\n"
- " -6, --illumina1.3+ quality is in the Illumina-1.3+ encoding\n"
- " -A, --count-orphans do not discard anomalous read pairs\n"
- " -b, --bam-list FILE list of input BAM filenames, one per line\n"
- " -B, --no-BAQ disable BAQ (per-Base Alignment Quality)\n"
- " -C, --adjust-MQ INT adjust mapping quality [0]\n"
+ " -6, --illumina1.3+ Quality is in the Illumina-1.3+ encoding\n"
+ " -A, --count-orphans Do not discard anomalous read pairs\n"
+ " -b, --bam-list FILE List of input BAM filenames, one per line\n"
+ " -B, --no-BAQ Disable BAQ (per-Base Alignment Quality)\n"
+ " -C, --adjust-MQ INT Adjust mapping quality [0]\n"
" -D, --full-BAQ Apply BAQ everywhere, not just in problematic regions\n"
- " -d, --max-depth INT max raw per-file depth; avoids excessive memory usage [%d]\n", mplp->max_depth);
+ " -d, --max-depth INT Max raw per-file depth; avoids excessive memory usage [%d]\n", mplp->max_depth);
fprintf(fp,
- " -E, --redo-BAQ recalculate BAQ on the fly, ignore existing BQs\n"
- " -f, --fasta-ref FILE faidx indexed reference sequence file\n"
- " --no-reference do not require fasta reference file\n"
- " -G, --read-groups FILE select or exclude read groups listed in the file\n"
- " -q, --min-MQ INT skip alignments with mapQ smaller than INT [%d]\n", mplp->min_mq);
+ " -E, --redo-BAQ Recalculate BAQ on the fly, ignore existing BQs\n"
+ " -f, --fasta-ref FILE Faidx indexed reference sequence file\n"
+ " --no-reference Do not require fasta reference file\n"
+ " -G, --read-groups FILE Select or exclude read groups listed in the file\n"
+ " -q, --min-MQ INT Skip alignments with mapQ smaller than INT [%d]\n", mplp->min_mq);
fprintf(fp,
- " -Q, --min-BQ INT skip bases with baseQ/BAQ smaller than INT [%d]\n", mplp->min_baseQ);
+ " -Q, --min-BQ INT Skip bases with baseQ/BAQ smaller than INT [%d]\n", mplp->min_baseQ);
fprintf(fp,
- " --max-BQ INT limit baseQ/BAQ to no more than INT [%d]\n", mplp->max_baseQ);
+ " --max-BQ INT Limit baseQ/BAQ to no more than INT [%d]\n", mplp->max_baseQ);
fprintf(fp,
" --delta-BQ INT Use neighbour_qual + INT if less than qual [%d]\n", mplp->delta_baseQ);
fprintf(fp,
- " -r, --regions REG[,...] comma separated list of regions in which pileup is generated\n"
- " -R, --regions-file FILE restrict to regions listed in a file\n"
- " --ignore-RG ignore RG tags (one BAM = one sample)\n"
- " --rf, --incl-flags STR|INT required flags: skip reads with mask bits unset [%s]\n", tmp_require);
+ " -r, --regions REG[,...] Comma separated list of regions in which pileup is generated\n"
+ " -R, --regions-file FILE Restrict to regions listed in a file\n"
+ " --ignore-RG Ignore RG tags (one BAM = one sample)\n"
+ " --rf, --incl-flags STR|INT Required flags: skip reads with mask bits unset [%s]\n", tmp_require);
fprintf(fp,
- " --ff, --excl-flags STR|INT filter flags: skip reads with mask bits set\n"
+ " --ff, --excl-flags STR|INT Filter flags: skip reads with mask bits set\n"
" [%s]\n", tmp_filter);
fprintf(fp,
- " -s, --samples LIST comma separated list of samples to include\n"
- " -S, --samples-file FILE file of samples to include\n"
- " -t, --targets REG[,...] similar to -r but streams rather than index-jumps\n"
- " -T, --targets-file FILE similar to -R but streams rather than index-jumps\n"
- " -x, --ignore-overlaps disable read-pair overlap detection\n"
- " --seed INT random number seed used for sampling deep regions [0]\n"
+ " -s, --samples LIST Comma separated list of samples to include\n"
+ " -S, --samples-file FILE File of samples to include\n"
+ " -t, --targets REG[,...] Similar to -r but streams rather than index-jumps\n"
+ " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n"
+ " -x, --ignore-overlaps Disable read-pair overlap detection\n"
+ " --seed INT Random number seed used for sampling deep regions [0]\n"
"\n"
"Output options:\n"
- " -a, --annotate LIST optional tags to output; '?' to list available tags []\n"
- " -g, --gvcf INT[,...] group non-variant sites into gVCF blocks according\n"
- " to minimum per-sample DP\n"
- " --no-version do not append version and command line to the header\n"
- " -o, --output FILE write output to FILE [standard output]\n"
+ " -a, --annotate LIST Optional tags to output; '?' to list available tags []\n"
+ " -g, --gvcf INT[,...] Group non-variant sites into gVCF blocks according\n"
+ " to minimum per-sample DP\n"
+ " --no-version Do not append version and command line to the header\n"
+ " -o, --output FILE Write output to FILE [standard output]\n"
" -O, --output-type TYPE 'b' compressed BCF; 'u' uncompressed BCF;\n"
- " 'z' compressed VCF; 'v' uncompressed VCF [v]\n"
- " -U, --mwu-u use older probability scale for Mann-Whitney U test\n"
- " --threads INT use multithreading with INT worker threads [0]\n"
+ " 'z' compressed VCF; 'v' uncompressed VCF; 0-9 compression level [v]\n"
+ " -U, --mwu-u Use older probability scale for Mann-Whitney U test\n"
+ " --threads INT Use multithreading with INT worker threads [0]\n"
"\n"
"SNP/INDEL genotype likelihoods options:\n"
" -X, --config STR Specify platform specific profiles (see below)\n"
" -e, --ext-prob INT Phred-scaled gap extension seq error probability [%d]\n", mplp->extQ);
fprintf(fp,
- " -F, --gap-frac FLOAT minimum fraction of gapped reads [%g]\n", mplp->min_frac);
+ " -F, --gap-frac FLOAT Minimum fraction of gapped reads [%g]\n", mplp->min_frac);
fprintf(fp,
- " -h, --tandem-qual INT coefficient for homopolymer errors [%d]\n", mplp->tandemQ);
+ " -h, --tandem-qual INT Coefficient for homopolymer errors [%d]\n", mplp->tandemQ);
fprintf(fp,
- " -I, --skip-indels do not perform indel calling\n"
- " -L, --max-idepth INT maximum per-file depth for INDEL calling [%d]\n", mplp->max_indel_depth);
+ " -I, --skip-indels Do not perform indel calling\n"
+ " -L, --max-idepth INT Maximum per-file depth for INDEL calling [%d]\n", mplp->max_indel_depth);
fprintf(fp,
- " -m, --min-ireads INT minimum number gapped reads for indel candidates [%d]\n", mplp->min_support);
+ " -m, --min-ireads INT Minimum number gapped reads for indel candidates [%d]\n", mplp->min_support);
fprintf(fp,
- " -M, --max-read-len INT maximum length of read to pass to BAQ algorithm [%d]\n", mplp->max_read_len);
+ " -M, --max-read-len INT Maximum length of read to pass to BAQ algorithm [%d]\n", mplp->max_read_len);
fprintf(fp,
" -o, --open-prob INT Phred-scaled gap open seq error probability [%d]\n", mplp->openQ);
fprintf(fp,
- " -p, --per-sample-mF apply -m and -F per-sample for increased sensitivity\n"
- " -P, --platforms STR comma separated list of platforms for indels [all]\n"
+ " -p, --per-sample-mF Apply -m and -F per-sample for increased sensitivity\n"
+ " -P, --platforms STR Comma separated list of platforms for indels [all]\n"
" --ar, --ambig-reads STR What to do with ambiguous indel reads: drop,incAD,incAD0 [drop]\n");
fprintf(fp,
" --indel-bias FLOAT Raise to favour recall over precision [%.2f]\n", mplp->indel_bias);
+ fprintf(fp,
+ " --indel-size INT Approximate maximum indel size considered [%d]\n", mplp->indel_win_size);
fprintf(fp,"\n");
fprintf(fp,
"Configuration profiles activated with -X, --config:\n"
mplp.fmt_flag = B2B_INFO_VDB|B2B_INFO_RPB|B2B_INFO_SCB|B2B_INFO_ZSCORE;
mplp.max_read_len = 500;
mplp.ambig_reads = B2B_DROP;
+ mplp.indel_win_size = 110;
+ mplp.clevel = -1;
hts_srand48(0);
static const struct option lopts[] =
{"ext-prob", required_argument, NULL, 'e'},
{"gap-frac", required_argument, NULL, 'F'},
{"indel-bias", required_argument, NULL, 10},
+ {"indel-size", required_argument, NULL, 15},
{"tandem-qual", required_argument, NULL, 'h'},
{"skip-indels", no_argument, NULL, 'I'},
{"max-idepth", required_argument, NULL, 'L'},
case 'u': mplp.output_type = FT_BCF; break;
case 'z': mplp.output_type = FT_VCF_GZ; break;
case 'v': mplp.output_type = FT_VCF; break;
- default: error("[error] The option \"-O\" changed meaning when mpileup moved to bcftools. Did you mean: \"bcftools mpileup --output-type\" or \"samtools mpileup --output-BP\"?\n");
+ default:
+ {
+ char *tmp;
+ mplp.clevel = strtol(optarg,&tmp,10);
+ if ( *tmp || mplp.clevel<0 || mplp.clevel>9 ) error("The output type \"%s\" not recognised\n", optarg);
+ }
+ }
+ if ( optarg[1] )
+ {
+ char *tmp;
+ mplp.clevel = strtol(optarg+1,&tmp,10);
+ if ( *tmp || mplp.clevel<0 || mplp.clevel>9 ) error("Could not parse argument: --output-type %s\n", optarg+1);
}
break;
case 'C': mplp.capQ_thres = atoi(optarg); break;
else
mplp.indel_bias = 1/atof(optarg);
break;
+ case 15: {
+ char *tmp;
+ mplp.indel_win_size = strtol(optarg,&tmp,10);
+ if ( *tmp ) error("Could not parse argument: --indel-size %s\n", optarg);
+ if ( mplp.indel_win_size < 110 )
+ {
+ mplp.indel_win_size = 110;
+ fprintf(bcftools_stderr,"Warning: running with --indel-size %d, the requested value is too small\n",mplp.indel_win_size);
+ }
+ }
+ break;
case 'A': use_orphan = 1; break;
case 'F': mplp.min_frac = atof(optarg); break;
case 'm': mplp.min_support = atoi(optarg); break;
}
annot_line_t;
-#define REPLACE_MISSING 0 // replace only missing values
-#define REPLACE_ALL 1 // replace both missing and existing values
-#define REPLACE_NON_MISSING 2 // replace only if tgt is not missing
-#define SET_OR_APPEND 3 // set new value if missing or non-existent, append otherwise
-#define MATCH_VALUE 4 // do not set, just match the value -c ~ID
+#define REPLACE_MISSING (1<<0) // -c +TAG .. replace only missing values
+#define REPLACE_ALL (1<<1) // -c TAG .. replace both missing and existing values
+#define REPLACE_NON_MISSING (1<<2) // -c -TAG .. replace only if tgt is not missing
+#define SET_OR_APPEND (1<<3) // -c =TAG .. set new value if missing or non-existent, append otherwise
+#define MATCH_VALUE (1<<4) // -c ~ID .. do not set, just match the value
+#define CARRY_OVER_MISSING (1<<5) // -c .TAG .. carry over source missing values as well
#define MM_FIRST 0 // if multiple annotation lines overlap a VCF record, use the first, discarding the rest
#define MM_APPEND 1 // append, possibly multiple times
#define MM_UNIQUE 2 // append, only unique values
bcf_srs_t *files;
bcf_hdr_t *hdr, *hdr_out, *tgts_hdr;
htsFile *out_fh;
- int output_type, n_threads;
+ int output_type, n_threads, clevel;
bcf_sr_regions_t *tgts;
regidx_t *tgt_idx; // keep everything in memory only with .tab annotation file and -c BEG,END columns
annot_col_t *cols; // column indexes and setters
int ncols;
int match_id; // set iff `-c ~ID` given
+ int match_end; // set iff `-c ~INFO/END` is given
char *set_ids_fmt;
convert_t *set_ids;
void *keep = khash_str2int_init();
kstring_t str = {0,0,0};
char *ss = args->remove_annots;
+
+ int i, ntags, needs_info = 0;
+ if ( args->set_ids )
+ {
+ const char **tags = convert_list_used_tags(args->set_ids,&ntags);
+ for (i=0; i<ntags; i++)
+ if ( !strncmp("INFO/",tags[i],4) ) needs_info = 1;
+ }
+
while ( *ss )
{
args->nrm++;
fprintf(stderr,"Warning: The tag \"%s\" not defined in the header\n", str.s);
tag->key = strdup(str.s);
- if ( type==BCF_HL_INFO ) tag->handler = remove_info_tag;
+ if ( type==BCF_HL_INFO )
+ {
+ tag->handler = remove_info_tag;
+ if ( needs_info ) error("Error: `--remove INFO/%s` is executed first, cannot combine with `--set-id %s`\n",tag->key,args->set_ids_fmt);
+ }
else if ( type==BCF_HL_FMT ) tag->handler = remove_format_tag;
}
else if ( (type==BCF_HL_FMT && keep_fmt) || (type==BCF_HL_INFO && keep_info) )
else
{
tag->key = strdup(str.s);
- if ( type==BCF_HL_INFO ) tag->handler = remove_info_tag;
+ if ( type==BCF_HL_INFO )
+ {
+ tag->handler = remove_info_tag;
+ if ( needs_info ) error("Error: `--remove INFO/%s` is executed first, cannot combine with `--set-id %s`\n",tag->key,args->set_ids_fmt);
+ }
else if ( type==BCF_HL_FMT ) tag->handler = remove_format_tag;
if ( !args->keep_sites ) bcf_hdr_remove(args->hdr_out,type,tag->key);
}
else if ( !strcasecmp("QUAL",str.s) ) tag->handler = remove_qual;
else if ( !strcasecmp("INFO",str.s) )
{
+ if ( needs_info ) error("Error: `--remove INFO` is executed first, cannot combine with `--set-id %s`\n",args->set_ids_fmt);
tag->handler = remove_info;
if ( !args->keep_sites ) remove_hdr_lines(args->hdr_out,BCF_HL_INFO);
}
// note: so far this works only with one filter, not a list of filters
annot_line_t *tab = (annot_line_t*) data;
- if ( tab->cols[col->icol] && tab->cols[col->icol][0]=='.' && !tab->cols[col->icol][1] ) return 0; // don't replace with "."
+ if ( tab->cols[col->icol][0]=='.' && !tab->cols[col->icol][1] ) // don't overwrite with a missing value unless asked
+ {
+ if ( (col->replace & CARRY_OVER_MISSING) && (col->replace & (REPLACE_ALL|REPLACE_NON_MISSING)) ) bcf_update_filter(args->hdr_out,line,NULL,0);
+ return 0;
+ }
hts_expand(int,1,args->mtmpi,args->tmpi);
args->tmpi[0] = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, tab->cols[col->icol]);
if ( args->tmpi[0]<0 ) error("The FILTER \"%s\" is not defined in the header, was the -h option provided?\n", tab->cols[col->icol]);
- if ( col->replace==SET_OR_APPEND ) return bcf_add_filter(args->hdr_out,line,args->tmpi[0]);
- if ( col->replace!=REPLACE_MISSING )
+ if ( col->replace & SET_OR_APPEND ) return bcf_add_filter(args->hdr_out,line,args->tmpi[0]);
+ if ( !(col->replace & REPLACE_MISSING) )
{
bcf_update_filter(args->hdr_out,line,NULL,0);
return bcf_update_filter(args->hdr_out,line,args->tmpi,1);
bcf1_t *rec = (bcf1_t*) data;
if ( !(rec->unpacked & BCF_UN_FLT) ) bcf_unpack(rec, BCF_UN_FLT);
if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT);
- if ( !rec->d.n_flt ) return 0; // don't overwrite with a missing value
- if ( col->replace==SET_OR_APPEND || col->replace==REPLACE_MISSING )
+ if ( !rec->d.n_flt ) // don't overwrite with a missing value unless asked
{
- if ( col->replace==REPLACE_MISSING && line->d.n_flt ) return 0; // only update missing FILTER
+ if ( (col->replace & CARRY_OVER_MISSING) && (col->replace & (REPLACE_ALL|REPLACE_NON_MISSING)) ) bcf_update_filter(args->hdr_out,line,NULL,0);
+ return 0;
+ }
+ if ( col->replace & (SET_OR_APPEND|REPLACE_MISSING) )
+ {
+ if ( (col->replace & REPLACE_MISSING) && line->d.n_flt ) return 0; // only update missing FILTER
for (i=0; i<rec->d.n_flt; i++)
{
const char *flt = bcf_hdr_int2id(args->files->readers[1].header, BCF_DT_ID, rec->d.flt[i]);
bcf_update_filter(args->hdr_out,line,NULL,0);
return bcf_update_filter(args->hdr_out,line,args->tmpi,rec->d.n_flt);
}
+// Setter for the ~POS column of a tab-delimited annotation file: overwrites
+// the position of the VCF record with the value found in the annotation row.
+//   args .. only its header is used, to name the sequence in the error message
+//   line .. the record being modified
+//   col  .. column descriptor, col->icol indexes into the annotation row
+//   data .. the current annotation row (annot_line_t*)
+// A column value of "." leaves the record untouched. The column is read as a
+// 1-based coordinate and stored 0-based in line->pos. Returns 0; a value
+// with no leading digits is reported through error().
+static int setter_pos(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
+{
+    annot_line_t *tab = (annot_line_t*) data;
+    char *str = tab->cols[col->icol];
+    if ( str && str[0]=='.' && !str[1] ) return 0; // don't replace with "."
+    char *tmp;
+    // Parse into a 64-bit integer so that large coordinates are not truncated
+    // through a plain int before being assigned to line->pos.
+    // NOTE(review): trailing characters after the number are still ignored, as before.
+    int64_t pos = strtoll(str, &tmp, 10);
+    if ( tmp==str )
+        error("Could not parse ~POS at %s:%"PRId64" .. [%s]\n",bcf_seqname(args->hdr,line),(int64_t)line->pos+1,str);
+    line->pos = pos - 1;
+    return 0;
+}
static int setter_id(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
{
if ( !data ) error("Error: the --merge-logic option cannot be used with ID (yet?)\n");
- if ( col->replace==MATCH_VALUE ) return 0;
+ if ( col->replace & MATCH_VALUE ) return 0;
// possible cases:
// IN ANNOT OUT ACHIEVED_BY
//
annot_line_t *tab = (annot_line_t*) data;
if ( tab->cols[col->icol] && tab->cols[col->icol][0]=='.' && !tab->cols[col->icol][1] ) return 0; // don't replace with "."
- if ( col->replace==SET_OR_APPEND ) return bcf_add_id(args->hdr_out,line,tab->cols[col->icol]);
- if ( col->replace!=REPLACE_MISSING ) return bcf_update_id(args->hdr_out,line,tab->cols[col->icol]);
+ if ( col->replace & SET_OR_APPEND ) return bcf_add_id(args->hdr_out,line,tab->cols[col->icol]);
+ if ( !(col->replace & REPLACE_MISSING) ) return bcf_update_id(args->hdr_out,line,tab->cols[col->icol]);
// running with +ID, only update missing ids
if ( !line->d.id || (line->d.id[0]=='.' && !line->d.id[1]) )
}
static int vcf_setter_id(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
{
- if ( col->replace==MATCH_VALUE ) return 0;
+ if ( col->replace & MATCH_VALUE ) return 0;
bcf1_t *rec = (bcf1_t*) data;
if ( rec->d.id && rec->d.id[0]=='.' && !rec->d.id[1] ) return 0; // don't replace with "."
id = rec->d.id;
}
- if ( col->replace==SET_OR_APPEND ) return bcf_add_id(args->hdr_out,line,id);
- if ( col->replace!=REPLACE_MISSING ) return bcf_update_id(args->hdr_out,line,id);
+ if ( col->replace & SET_OR_APPEND ) return bcf_add_id(args->hdr_out,line,id);
+ if ( !(col->replace & REPLACE_MISSING) ) return bcf_update_id(args->hdr_out,line,id);
// running with +ID, only update missing ids
if ( !line->d.id || (line->d.id[0]=='.' && !line->d.id[1]) )
annot_line_t *tab = (annot_line_t*) data;
char *str = tab->cols[col->icol];
- if ( str[0]=='.' && str[1]==0 ) return 0; // empty
-
- if ( col->replace==REPLACE_MISSING && !bcf_float_is_missing(line->qual) ) return 0;
+ if ( str[0]=='.' && str[1]==0 ) // don't overwrite with a missing value unless asked
+ {
+ if ( (col->replace & CARRY_OVER_MISSING) && (col->replace & (REPLACE_ALL|REPLACE_NON_MISSING)) ) bcf_float_set_missing(line->qual);
+ return 0;
+ }
+ if ( (col->replace & REPLACE_MISSING) && !bcf_float_is_missing(line->qual) ) return 0;
line->qual = strtod(str, &str);
if ( str == tab->cols[col->icol] )
static int vcf_setter_qual(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
{
bcf1_t *rec = (bcf1_t*) data;
- if ( bcf_float_is_missing(rec->qual) ) return 0;
- if ( col->replace==REPLACE_MISSING && !bcf_float_is_missing(line->qual) ) return 0;
+ if ( bcf_float_is_missing(rec->qual) ) // don't overwrite with a missing value unless asked
+ {
+ if ( (col->replace & CARRY_OVER_MISSING) && (col->replace & (REPLACE_ALL|REPLACE_NON_MISSING)) ) bcf_float_set_missing(line->qual);
+ return 0;
+ }
+ if ( (col->replace & REPLACE_MISSING) && !bcf_float_is_missing(line->qual) ) return 0;
line->qual = rec->qual;
return 0;
}
annot_line_t *tab = (annot_line_t*) data;
char *str = tab->cols[col->icol];
- if ( str[0]=='.' && str[1]==0 ) return 0;
+ if ( str[0]=='.' && str[1]==0 ) // don't overwrite with a missing value unless asked
+ {
+ if ( (col->replace & CARRY_OVER_MISSING) && (col->replace & (REPLACE_ALL|REPLACE_NON_MISSING)) ) bcf_update_info_flag(args->hdr_out,line,col->hdr_key_dst,NULL,0);
+ return 0;
+ }
if ( str[0]=='1' && str[1]==0 ) return bcf_update_info_flag(args->hdr_out,line,col->hdr_key_dst,NULL,1);
if ( str[0]=='0' && str[1]==0 ) return bcf_update_info_flag(args->hdr_out,line,col->hdr_key_dst,NULL,0);
if ( ntmpi2 < ndst ) args->tmpi2[i] = bcf_int32_missing;
continue;
}
- if ( ntmpi2==ndst && col->replace==REPLACE_MISSING
+ if ( ntmpi2==ndst && (col->replace & REPLACE_MISSING)
&& args->tmpi2[i]!=bcf_int32_missing
&& args->tmpi2[i]!=bcf_int32_vector_end ) continue;
// This is a bit hacky, only to reuse existing code with minimal changes:
// -c =TAG will now behave as -l TAG:APPEND for integers
- if ( col->replace==SET_OR_APPEND ) col->merge_method=MM_APPEND;
+ if ( col->replace & SET_OR_APPEND ) col->merge_method=MM_APPEND;
if ( !tab )
{
}
int i,ntmpi = 0;
+ if ( (col->replace & SET_OR_APPEND) && !col->mm_dbl_nused )
+ {
+ ntmpi = bcf_get_info_int32(args->hdr, line, col->hdr_key_dst, &args->tmpi, &args->mtmpi);
+ if ( ntmpi>0 && (args->tmpi[0]!=bcf_int32_missing || (col->replace & CARRY_OVER_MISSING)) )
+ {
+ col->mm_dbl_nused = col->mm_dbl_ndat = ntmpi;
+ hts_expand(double,col->mm_dbl_nused,col->mm_dbl_nalloc,col->mm_dbl);
+ for (i=0; i<ntmpi; i++)
+ col->mm_dbl[i] = args->tmpi[i];
+ col->mm_dbl_ndat = 1;
+ }
+ ntmpi = 0;
+ }
if ( tab ) // has data, not flushing yet
{
char *str = tab->cols[col->icol], *end = str;
- if ( str[0]=='.' && str[1]==0 && col->merge_method!=MM_APPEND_MISSING ) return 1;
+ if ( str[0]=='.' && str[1]==0 && col->merge_method!=MM_APPEND_MISSING && !(col->replace & CARRY_OVER_MISSING) ) return 1;
while ( *end )
{
hts_expand(int32_t,ntmpi,args->mtmpi,args->tmpi);
if ( str[0]=='.' && (str[1]==0 || str[1]==',') )
{
- if ( col->merge_method==MM_APPEND_MISSING )
+ if ( col->merge_method==MM_APPEND_MISSING || (col->replace & CARRY_OVER_MISSING) )
args->tmpi[ntmpi-1] = bcf_int32_missing;
else
ntmpi--;
if ( col->number==BCF_VL_A || col->number==BCF_VL_R )
return setter_ARinfo_int32(args,line,col,tab->nals,tab->als,ntmpi);
- if ( col->replace==REPLACE_MISSING )
+ if ( col->replace & REPLACE_MISSING )
{
int ret = bcf_get_info_int32(args->hdr, line, col->hdr_key_dst, &args->tmpi2, &args->mtmpi2);
if ( ret>0 && args->tmpi2[0]!=bcf_int32_missing ) return 0;
}
-
return bcf_update_info_int32(args->hdr_out,line,col->hdr_key_dst,args->tmpi,ntmpi);
}
static int vcf_setter_info_int(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
if ( col->number==BCF_VL_A || col->number==BCF_VL_R )
return setter_ARinfo_int32(args,line,col,rec->n_allele,rec->d.allele,ntmpi);
- if ( col->replace==REPLACE_MISSING )
+ if ( col->replace & REPLACE_MISSING )
{
int ret = bcf_get_info_int32(args->hdr, line, col->hdr_key_dst, &args->tmpi2, &args->mtmpi2);
if ( ret>0 && args->tmpi2[0]!=bcf_int32_missing ) return 0;
if ( ntmpf2 < ndst ) bcf_float_set_missing(args->tmpf2[i]);
continue;
}
- if ( ntmpf2==ndst && col->replace==REPLACE_MISSING
+ if ( ntmpf2==ndst && (col->replace & REPLACE_MISSING)
&& !bcf_float_is_missing(args->tmpf2[i])
&& !bcf_float_is_vector_end(args->tmpf2[i]) ) continue;
// This is a bit hacky, only to reuse existing code with minimal changes:
// -c =TAG will now behave as -l TAG:APPEND for floats
- if ( col->replace==SET_OR_APPEND ) col->merge_method=MM_APPEND;
+ if ( col->replace & SET_OR_APPEND ) col->merge_method=MM_APPEND;
if ( !tab )
{
}
int i,ntmpf = 0;
- if ( tab )
+ if ( (col->replace & SET_OR_APPEND) && !col->mm_dbl_nused )
+ {
+ ntmpf = bcf_get_info_float(args->hdr, line, col->hdr_key_dst, &args->tmpf, &args->mtmpf);
+ if ( ntmpf>0 && (!bcf_float_is_missing(args->tmpf[0]) || (col->replace & CARRY_OVER_MISSING)) )
+ {
+ col->mm_dbl_nused = ntmpf;
+ hts_expand(double,col->mm_dbl_nused,col->mm_dbl_nalloc,col->mm_dbl);
+ for (i=0; i<ntmpf; i++)
+ if ( bcf_float_is_missing(args->tmpf[i]) )
+ bcf_double_set_missing(col->mm_dbl[i]);
+ else
+ col->mm_dbl[i] = args->tmpf[i];
+ col->mm_dbl_ndat = 1;
+ }
+ ntmpf = 0;
+ }
+ if ( tab ) // data row, not just flushing
{
char *str = tab->cols[col->icol], *end = str;
- if ( str[0]=='.' && str[1]==0 && col->merge_method!=MM_APPEND_MISSING ) return 1;
+ if ( str[0]=='.' && str[1]==0 && col->merge_method!=MM_APPEND_MISSING && !(col->replace & CARRY_OVER_MISSING) ) return 1;
while ( *end )
{
hts_expand(float,ntmpf,args->mtmpf,args->tmpf);
if ( str[0]=='.' && (str[1]==0 || str[1]==',') )
{
- if ( col->merge_method==MM_APPEND_MISSING )
+ if ( col->merge_method==MM_APPEND_MISSING || (col->replace & CARRY_OVER_MISSING) )
bcf_float_set_missing(args->tmpf[ntmpf-1]);
else
ntmpf--;
if ( col->number==BCF_VL_A || col->number==BCF_VL_R )
return setter_ARinfo_real(args,line,col,tab->nals,tab->als,ntmpf);
- if ( col->replace==REPLACE_MISSING )
+ if ( col->replace & REPLACE_MISSING )
{
int ret = bcf_get_info_float(args->hdr, line, col->hdr_key_dst, &args->tmpf2, &args->mtmpf2);
if ( ret>0 && !bcf_float_is_missing(args->tmpf2[0]) ) return 0;
if ( col->number==BCF_VL_A || col->number==BCF_VL_R )
return setter_ARinfo_real(args,line,col,rec->n_allele,rec->d.allele,ntmpf);
- if ( col->replace==REPLACE_MISSING )
+ if ( col->replace & REPLACE_MISSING )
{
int ret = bcf_get_info_float(args->hdr, line, col->hdr_key_dst, &args->tmpf2, &args->mtmpf2);
if ( ret>0 && !bcf_float_is_missing(args->tmpf2[0]) ) return 0;
if ( empty ) copy_string_field(".",0,1,&args->tmpks,i);
continue;
}
- if ( col->replace==REPLACE_MISSING )
+ if ( col->replace & REPLACE_MISSING )
{
// Do not replace filled values. The field must be looked up again because
// of realloc in copy_string_field
}
static int setter_info_str(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
{
- if ( col->replace==REPLACE_MISSING && col->number!=BCF_VL_A && col->number!=BCF_VL_R )
+ if ( (col->replace & REPLACE_MISSING) && col->number!=BCF_VL_A && col->number!=BCF_VL_R )
{
int ret = bcf_get_info_string(args->hdr, line, col->hdr_key_dst, &args->tmps2, &args->mtmps2);
if ( ret>0 && (args->tmps2[0]!='.' || args->tmps2[1]!=0) ) return 0;
// This is a bit hacky, only to reuse existing code with minimal changes:
// -c =TAG will now behave as -l TAG:unique for strings
- if ( col->replace==SET_OR_APPEND ) col->merge_method=MM_UNIQUE;
+ if ( col->replace & SET_OR_APPEND ) col->merge_method=MM_UNIQUE;
annot_line_t *tab = (annot_line_t*) data;
{
len = strlen(tab->cols[col->icol]);
if ( !len ) return 0;
- if ( len==1 && tab->cols[col->icol][0]=='.' && col->merge_method!=MM_APPEND_MISSING ) return 1;
+ if ( len==1 && tab->cols[col->icol][0]=='.' && col->merge_method!=MM_APPEND_MISSING && !(col->replace & CARRY_OVER_MISSING) ) return 1;
}
if ( col->merge_method!=MM_FIRST )
khash_str2int_inc(col->mm_str_hash, strdup(tab->cols[col->icol]));
}
+ if ( (col->replace & SET_OR_APPEND) && !col->mm_kstr.l )
+ {
+ int m = col->mm_kstr.m;
+ int n = bcf_get_info_string(args->hdr, line, col->hdr_key_dst, &col->mm_kstr.s, &m);
+ col->mm_kstr.m = m;
+ if ( n>0 && ((col->replace & CARRY_OVER_MISSING) || col->mm_kstr.s[0]!='.' || col->mm_kstr.s[1]) ) col->mm_kstr.l = n;
+ }
+
if ( col->mm_kstr.l ) kputc(',',&col->mm_kstr);
kputs(tab->cols[col->icol], &col->mm_kstr);
return 1;
if ( col->number==BCF_VL_A || col->number==BCF_VL_R )
return setter_ARinfo_string(args,line,col,rec->n_allele,rec->d.allele);
- if ( col->replace==REPLACE_MISSING )
+ if ( col->replace & REPLACE_MISSING )
{
int ret = bcf_get_info_string(args->hdr, line, col->hdr_key_dst, &args->tmps2, &args->mtmps2);
if ( ret>0 && (args->tmps2[0]!='.' || args->tmps2[1]!=0) ) return 0;
nsrc /= bcf_hdr_nsamples(args->files->readers[1].header);
if ( ndst<=0 ) // field not present in dst file
{
- if ( col->replace==REPLACE_NON_MISSING ) return 0;
+ if ( col->replace & REPLACE_NON_MISSING ) return 0;
hts_expand(int32_t, nsrc*bcf_hdr_nsamples(args->hdr_out), args->mtmpi2, args->tmpi2);
for (i=0; i<bcf_hdr_nsamples(args->hdr_out); i++)
{
if ( args->sample_map[i]==-1 ) continue;
int32_t *src = args->tmpi + nsrc*args->sample_map[i];
int32_t *dst = args->tmpi2 + ndst*i;
- if ( col->replace==REPLACE_NON_MISSING && bcf_gt_is_missing(dst[0]) ) continue;
- if ( col->replace==REPLACE_MISSING && !bcf_gt_is_missing(dst[0]) ) continue;
+ if ( (col->replace & REPLACE_NON_MISSING) && bcf_gt_is_missing(dst[0]) ) continue;
+ if ( (col->replace & REPLACE_MISSING) && !bcf_gt_is_missing(dst[0]) ) continue;
for (j=0; j<nsrc; j++) dst[j] = src[j];
for (; j<ndst; j++) dst[j] = bcf_int32_vector_end;
}
int32_t *dst = args->tmpi3 + nsrc*i;
int keep_ori = 0;
if ( args->sample_map[i]==-1 ) keep_ori = 1;
- else if ( col->replace==REPLACE_NON_MISSING && bcf_gt_is_missing(ori[0]) ) keep_ori = 1;
- else if ( col->replace==REPLACE_MISSING && !bcf_gt_is_missing(ori[0]) ) keep_ori = 1;
+ else if ( (col->replace & REPLACE_NON_MISSING) && bcf_gt_is_missing(ori[0]) ) keep_ori = 1;
+ else if ( (col->replace & REPLACE_MISSING) && !bcf_gt_is_missing(ori[0]) ) keep_ori = 1;
if ( keep_ori )
{
for (j=0; j<ndst; j++) dst[j] = ori[j];
if ( ndst > 0 ) ndst /= bcf_hdr_nsamples(args->hdr_out);
if ( ndst<=0 )
{
- if ( col->replace==REPLACE_NON_MISSING ) return 0; // overwrite only if present
+ if ( col->replace & REPLACE_NON_MISSING ) return 0; // overwrite only if present
hts_expand(int32_t, nvals*bcf_hdr_nsamples(args->hdr_out), args->mtmpi2, args->tmpi2);
for (i=0; i<bcf_hdr_nsamples(args->hdr_out); i++)
{
// . y y TAG,+TAG,-TAG .. REPLACE_ALL, REPLACE_MISSING, REPLACE_NON_MISSING
// x . x TAG,+TAG .. REPLACE_ALL, REPLACE_MISSING
// x . . -TAG .. REPLACE_NON_MISSING
- if ( col->replace==REPLACE_NON_MISSING ) { if ( dst[0]==bcf_int32_missing ) continue; }
- else if ( col->replace==REPLACE_MISSING ) { if ( dst[0]!=bcf_int32_missing ) continue; }
- else if ( col->replace==REPLACE_ALL ) { if ( src[0]==bcf_int32_missing ) continue; }
+ if ( col->replace & REPLACE_NON_MISSING ) { if ( dst[0]==bcf_int32_missing ) continue; }
+ else if ( col->replace & REPLACE_MISSING ) { if ( dst[0]!=bcf_int32_missing ) continue; }
+ else if ( col->replace & REPLACE_ALL ) { if ( src[0]==bcf_int32_missing ) continue; }
for (j=0; j<nvals; j++) dst[j] = src[j];
for (; j<ndst; j++) dst[j] = bcf_int32_vector_end;
}
int32_t *dst = args->tmpi3 + nvals*i; // expanded buffer
int use_new_ann = 1;
if ( args->sample_map[i]==-1 ) use_new_ann = 0;
- else if ( col->replace==REPLACE_NON_MISSING ) { if ( ori[0]==bcf_int32_missing ) use_new_ann = 0; }
- else if ( col->replace==REPLACE_MISSING ) { if ( ori[0]!=bcf_int32_missing ) use_new_ann = 0; }
- else if ( col->replace==REPLACE_ALL ) { if ( ann[0]==bcf_int32_missing ) use_new_ann = 0; }
+ else if ( col->replace & REPLACE_NON_MISSING ) { if ( ori[0]==bcf_int32_missing ) use_new_ann = 0; }
+ else if ( col->replace & REPLACE_MISSING ) { if ( ori[0]!=bcf_int32_missing ) use_new_ann = 0; }
+ else if ( col->replace & REPLACE_ALL ) { if ( ann[0]==bcf_int32_missing ) use_new_ann = 0; }
if ( !use_new_ann )
{
for (j=0; j<ndst; j++) dst[j] = ori[j];
if ( ndst > 0 ) ndst /= bcf_hdr_nsamples(args->hdr_out);
if ( ndst<=0 )
{
- if ( col->replace==REPLACE_NON_MISSING ) return 0; // overwrite only if present
+ if ( col->replace & REPLACE_NON_MISSING ) return 0; // overwrite only if present
hts_expand(float, nvals*bcf_hdr_nsamples(args->hdr_out), args->mtmpf2, args->tmpf2);
for (i=0; i<bcf_hdr_nsamples(args->hdr_out); i++)
{
if ( args->sample_map[i]==-1 ) continue;
float *src = vals + nvals*args->sample_map[i];
float *dst = args->tmpf2 + ndst*i;
- if ( col->replace==REPLACE_NON_MISSING ) { if ( bcf_float_is_missing(dst[0]) ) continue; }
- else if ( col->replace==REPLACE_MISSING ) { if ( !bcf_float_is_missing(dst[0]) ) continue; }
- else if ( col->replace==REPLACE_ALL ) { if ( bcf_float_is_missing(src[0]) ) continue; }
+ if ( col->replace & REPLACE_NON_MISSING ) { if ( bcf_float_is_missing(dst[0]) ) continue; }
+ else if ( col->replace & REPLACE_MISSING ) { if ( !bcf_float_is_missing(dst[0]) ) continue; }
+ else if ( col->replace & REPLACE_ALL ) { if ( bcf_float_is_missing(src[0]) ) continue; }
for (j=0; j<nvals; j++) dst[j] = src[j];
for (; j<ndst; j++) bcf_float_set_vector_end(dst[j]);
}
float *dst = args->tmpf3 + nvals*i; // expanded buffer
int use_new_ann = 1;
if ( args->sample_map[i]==-1 ) use_new_ann = 0;
- else if ( col->replace==REPLACE_NON_MISSING ) { if ( bcf_float_is_missing(ori[0]) ) use_new_ann = 0; }
- else if ( col->replace==REPLACE_MISSING ) { if ( !bcf_float_is_missing(ori[0]) ) use_new_ann = 0; }
- else if ( col->replace==REPLACE_ALL ) { if ( bcf_float_is_missing(ann[0]) ) use_new_ann = 0; }
+ else if ( col->replace & REPLACE_NON_MISSING ) { if ( bcf_float_is_missing(ori[0]) ) use_new_ann = 0; }
+ else if ( col->replace & REPLACE_MISSING ) { if ( !bcf_float_is_missing(ori[0]) ) use_new_ann = 0; }
+ else if ( col->replace & REPLACE_ALL ) { if ( bcf_float_is_missing(ann[0]) ) use_new_ann = 0; }
if ( !use_new_ann )
{
for (j=0; j<ndst; j++) dst[j] = ori[j];
char **src = vals + args->sample_map[i];
char **dst = args->tmpp2 + i;
- if ( col->replace==REPLACE_NON_MISSING ) { if ( (*dst)[0]=='.' && (*dst)[1]==0 ) continue; }
- else if ( col->replace==REPLACE_MISSING ) { if ( (*dst)[0]!='.' || (*dst)[1]!=0 ) continue; }
- else if ( col->replace==REPLACE_ALL ) { if ( (*src)[0]=='.' && (*src)[1]==0 ) continue; }
+ if ( col->replace & REPLACE_NON_MISSING ) { if ( (*dst)[0]=='.' && (*dst)[1]==0 ) continue; }
+ else if ( col->replace & REPLACE_MISSING ) { if ( (*dst)[0]!='.' || (*dst)[1]!=0 ) continue; }
+ else if ( col->replace & REPLACE_ALL ) { if ( (*src)[0]=='.' && (*src)[1]==0 ) continue; }
*dst = *src;
}
return bcf_update_format_string(args->hdr_out,line,col->hdr_key_dst,(const char**)args->tmpp2,nsmpl);
int ndst1 = ndst / nsmpl_dst;
if ( ndst <= 0 )
{
- if ( col->replace==REPLACE_NON_MISSING ) return 0; // overwrite only if present
+ if ( col->replace & REPLACE_NON_MISSING ) return 0; // overwrite only if present
if ( col->number==BCF_VL_G )
ndst1 = line->n_allele*(line->n_allele+1)/2;
else
int ndst1 = ndst / nsmpl_dst;
if ( ndst <= 0 )
{
- if ( col->replace==REPLACE_NON_MISSING ) return 0; // overwrite only if present
+ if ( col->replace & REPLACE_NON_MISSING ) return 0; // overwrite only if present
if ( col->number==BCF_VL_G )
ndst1 = line->n_allele*(line->n_allele+1)/2;
else
}
ksprintf(str,">\n");
}
+// Consume the leading mode characters of a -c column specification and
+// translate them into a bitmask of replace flags:
+//   '+' .. REPLACE_MISSING
+//   '-' .. REPLACE_NON_MISSING
+//   '=' .. SET_OR_APPEND
+//   '.' .. CARRY_OVER_MISSING
+// Several prefixes may be given and are OR-ed together; with no prefix the
+// mode is REPLACE_ALL. The combination of flags is not validated here.
+// The result is stored in *replace, the return value points to the first
+// character following the prefixes.
+static char *set_replace_mode(char *ss, int *replace)
+{
+    int flags = 0;
+    for (;; ss++)
+    {
+        switch ( *ss )
+        {
+            case '+': flags |= REPLACE_MISSING;     continue;
+            case '-': flags |= REPLACE_NON_MISSING; continue;
+            case '=': flags |= SET_OR_APPEND;       continue;
+            case '.': flags |= CARRY_OVER_MISSING;  continue;
+        }
+        break;  // end of string or the first non-prefix character
+    }
+    *replace = flags ? flags : REPLACE_ALL;
+    return ss;
+}
static void init_columns(args_t *args)
{
int need_sample_map = 0;
while ( *ss )
{
if ( *se && *se!=',' ) { se++; continue; }
- int replace = REPLACE_ALL;
- if ( *ss=='+' ) { replace = REPLACE_MISSING; ss++; }
- else if ( *ss=='-' ) { replace = REPLACE_NON_MISSING; ss++; }
- else if ( *ss=='=' ) { replace = SET_OR_APPEND; ss++; }
+ int replace;
+ ss = set_replace_mode(ss, &replace);
icol++;
str.l = 0;
kputsn(ss, se-ss, &str);
}
else if ( !strcasecmp("ID",str.s) || !strcasecmp("~ID",str.s) )
{
- if ( replace==REPLACE_NON_MISSING ) error("Apologies, the -ID feature has not been implemented yet.\n");
+ if ( replace & REPLACE_NON_MISSING ) error("Apologies, the -ID feature has not been implemented yet.\n");
if ( str.s[0]=='~' ) replace = MATCH_VALUE;
- if ( args->tgts_is_vcf && replace==MATCH_VALUE ) error("todo: -c ~ID with -a VCF?\n");
+ if ( args->tgts_is_vcf && (replace & MATCH_VALUE) ) error("todo: -c ~ID with -a VCF?\n");
args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
annot_col_t *col = &args->cols[args->ncols-1];
memset(col,0,sizeof(*col));
col->setter = args->tgts_is_vcf ? vcf_setter_id : setter_id;
col->hdr_key_src = strdup(str.s);
col->hdr_key_dst = strdup(str.s);
- if ( replace==MATCH_VALUE ) args->match_id = icol;
+ if ( replace & MATCH_VALUE ) args->match_id = icol;
+ }
+ else if ( !strcasecmp("~INFO/END",str.s) && !args->tgts_is_vcf )
+ {
+ replace = MATCH_VALUE;
+ args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
+ annot_col_t *col = &args->cols[args->ncols-1];
+ memset(col,0,sizeof(*col));
+ col->icol = icol;
+ col->replace = replace;
+ col->setter = NULL;
+ col->hdr_key_src = strdup(str.s);
+ col->hdr_key_dst = strdup(str.s);
+ args->match_end = icol;
+ }
+ else if ( !strcasecmp("~POS",str.s) && !args->tgts_is_vcf )
+ {
+ if ( args->tgts_is_vcf ) error("Error: cannot use ~POS, position can be replaced only from a tab-delimited file\n");
+ args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
+ annot_col_t *col = &args->cols[args->ncols-1];
+ memset(col,0,sizeof(*col));
+ col->icol = icol;
+ col->replace = replace;
+ col->setter = setter_pos;
+ col->hdr_key_src = strdup(str.s);
+ col->hdr_key_dst = strdup(str.s);
+ args->match_end = icol;
}
else if ( !strncasecmp("ID:=",str.s,4) ) // transfer a tag from INFO to ID column
{
if ( !args->tgts_is_vcf ) error("The annotation source must be a VCF for \"%s\"\n",str.s);
- if ( replace==REPLACE_NON_MISSING ) error("Apologies, the -ID feature has not been implemented yet.\n");
+ if ( replace & REPLACE_NON_MISSING ) error("Apologies, the -ID feature has not been implemented yet.\n");
args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
annot_col_t *col = &args->cols[args->ncols-1];
memset(col,0,sizeof(*col));
}
else if ( !strcasecmp("FILTER",str.s) )
{
- if ( replace==REPLACE_NON_MISSING ) error("Apologies, the -FILTER feature has not been implemented yet.\n");
+ if ( replace & REPLACE_NON_MISSING ) error("Apologies, the -FILTER feature has not been implemented yet.\n");
args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
annot_col_t *col = &args->cols[args->ncols-1];
memset(col,0,sizeof(*col));
}
else if ( !strcasecmp("QUAL",str.s) )
{
- if ( replace==REPLACE_NON_MISSING ) error("Apologies, the -QUAL feature has not been implemented yet.\n");
- if ( replace==SET_OR_APPEND ) error("Apologies, the =QUAL feature has not been implemented yet.\n");
+ if ( replace & REPLACE_NON_MISSING ) error("Apologies, the -QUAL feature has not been implemented yet.\n");
+ if ( replace & SET_OR_APPEND ) error("Apologies, the =QUAL feature has not been implemented yet.\n");
args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
annot_col_t *col = &args->cols[args->ncols-1];
memset(col,0,sizeof(*col));
}
else if ( args->tgts_is_vcf && !strcasecmp("INFO",str.s) ) // All INFO fields
{
- if ( replace==REPLACE_NON_MISSING ) error("Apologies, the -INFO/TAG feature has not been implemented yet.\n");
- if ( replace==SET_OR_APPEND ) error("Apologies, the =INFO feature has not been implemented yet.\n");
+ if ( replace & REPLACE_NON_MISSING ) error("Apologies, the -INFO/TAG feature has not been implemented yet.\n");
+ if ( replace & SET_OR_APPEND ) error("Apologies, the =INFO feature has not been implemented yet.\n");
bcf_hdr_t *tgts_hdr = args->files->readers[1].header;
int j;
for (j=0; j<tgts_hdr->nhrec; j++)
}
else
{
- if ( replace==REPLACE_NON_MISSING ) error("Apologies, the -INFO/TAG feature has not been implemented yet.\n");
- if ( replace==SET_OR_APPEND )
+ if ( replace & REPLACE_NON_MISSING ) error("Apologies, the -INFO/TAG feature has not been implemented yet.\n");
+ if ( replace & SET_OR_APPEND )
{
if ( args->tgts_is_vcf )
error("Error: the =INFO/TAG feature is currently supported only with TAB annotation files and has limitations\n"
key_dst = str.s + 5;
explicit_dst_info = 1;
}
+ else if ( !strcasecmp("~INFO/END",str.s) )
+ {
+ key_dst = str.s + 6;
+ explicit_dst_info = 1;
+ }
else
key_dst = str.s;
char *key_src = strstr(key_dst,":=");
case BCF_HT_STR: col->setter = args->tgts_is_vcf ? vcf_setter_info_str : setter_info_str; break;
default: error("The type of %s not recognised (%d)\n", str.s,bcf_hdr_id2type(args->hdr_out,BCF_HL_INFO,hdr_id));
}
- if ( replace==SET_OR_APPEND ) // change to Number=.
+ if ( replace & SET_OR_APPEND ) // change to Number=.
{
bcf_hrec_t *hrec = bcf_hdr_get_hrec(args->hdr_out, BCF_HL_INFO, "ID", key_dst, NULL);
if ( !hrec ) error("Uh, could not find the new tag \"%s\" in the header\n", key_dst);
args->hdr = args->files->readers[0].header;
args->hdr_out = bcf_hdr_dup(args->hdr);
+ if ( args->set_ids_fmt )
+ {
+ if ( args->set_ids_fmt[0]=='+' ) { args->set_ids_replace = 0; args->set_ids_fmt++; }
+ args->set_ids = convert_init(args->hdr_out, NULL, 0, args->set_ids_fmt);
+ }
if ( args->remove_annots ) init_remove_annots(args);
if ( args->header_fname ) init_header_lines(args);
if ( args->targets_fname && args->tgts_is_vcf )
if ( args->filter_str )
args->filter = filter_init(args->hdr, args->filter_str);
- if ( args->set_ids_fmt )
- {
- if ( args->set_ids_fmt[0]=='+' ) { args->set_ids_replace = 0; args->set_ids_fmt++; }
- args->set_ids = convert_init(args->hdr_out, NULL, 0, args->set_ids_fmt);
- }
-
if ( args->mark_sites )
{
if ( !args->targets_fname ) error("The -a option not given\n");
args->mark_sites,args->mark_sites_logic==MARK_LISTED?"":"not ",args->mark_sites);
}
- if (args->record_cmd_line) bcf_hdr_append_version(args->hdr_out, args->argc, args->argv, "bcftools_annotate");
+ if (args->record_cmd_line) bcf_hdr_append_version(args->hdr_out, args->argc, args->argv, "bcftools_annotate");
if ( !args->drop_header )
{
if ( args->rename_chrs ) rename_chrs(args, args->rename_chrs);
if ( args->rename_annots ) rename_annots(args, args->rename_annots);
- args->out_fh = hts_open(args->output_fname,hts_bcf_wmode2(args->output_type,args->output_fname));
+ char wmode[8];
+ set_wmode(wmode,args->output_type,args->output_fname,args->clevel);
+ args->out_fh = hts_open(args->output_fname ? args->output_fname : "-", wmode);
if ( args->out_fh == NULL ) error("[%s] Error: cannot write to \"%s\": %s\n", __func__,args->output_fname, strerror(errno));
if ( args->n_threads )
hts_set_opt(args->out_fh, HTS_OPT_THREAD_POOL, args->files->p);
while ( !bcf_sr_regions_overlap(args->tgts, bcf_seqname(args->hdr,line), start_pos,end_pos) )
{
+ if ( args->nalines + 1 == 0xffff ) break; // likely a symbolic allele, don't let the buffer overflow
args->nalines++;
hts_expand0(annot_line_t,args->nalines,args->malines,args->alines);
annot_line_t *tmp = &args->alines[args->nalines-1];
for (j=0; j<args->ncols; j++)
{
if ( args->cols[j].done==1 || args->cols[j].merge_method == MM_FIRST ) continue;
+ if ( !args->cols[j].setter ) continue;
if ( args->cols[j].setter(args,line,&args->cols[j],NULL) < 0 )
error("fixme: Could not set %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
}
if ( args->nalines >= 0xffff || line->n_allele >= 0xffff )
error("Error: too many alleles or annotation lines in the buffer at %s:%"PRId64" (todo:skip?)\n",bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
+ kstring_t match_end = {0,0,0};
+ if ( args->match_end>=0 && bcf_get_info_int32(args->hdr,line,"END",&args->tmpi,&args->mtmpi)==1 )
+ kputw(args->tmpi[0],&match_end);
+
// Find matching lines
for (i=0; i<args->nalines; i++)
{
ialt++;
}
if ( args->match_id>=0 && !strstr_match(line->d.id,args->alines[i].cols[args->match_id]) ) continue;
+ if ( match_end.l && strcmp(match_end.s,args->alines[i].cols[args->match_end]) ) continue;
args->srt_alines[args->nsrt_alines++] = (ialt<<16) | i;
has_overlap = 1;
break;
has_overlap = 1;
}
}
+
+ free(match_end.s);
+
// Sort lines if needed
if ( args->has_append_mode )
{
{
if ( args->cols[j].merge_method != MM_APPEND_MISSING ) continue;
if ( args->cols[j].done==1 ) continue;
+ if ( !args->cols[j].setter ) continue;
int ret = args->cols[j].setter(args,line,&args->cols[j],args->aline_missing);
if ( ret < 0 )
error("fixme: Could not set missing %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
for (j=0; j<args->ncols; j++)
{
if ( args->cols[j].done==1 ) continue;
+ if ( !args->cols[j].setter ) continue;
int ret = args->cols[j].setter(args,line,&args->cols[j],&args->alines[ilin]);
if ( ret < 0 )
error("fixme: Could not set %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
{
if ( args->cols[j].merge_method != MM_APPEND_MISSING ) continue;
if ( args->cols[j].done==1 ) continue;
+ if ( !args->cols[j].setter ) continue;
int ret = args->cols[j].setter(args,line,&args->cols[j],args->aline_missing);
if ( ret < 0 )
error("fixme: Could not set missing %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
for (j=0; j<args->ncols; j++)
{
if ( args->cols[j].done==1 || args->cols[j].merge_method == MM_FIRST ) continue;
+ if ( !args->cols[j].setter ) continue;
int ret = args->cols[j].setter(args,line,&args->cols[j],NULL);
if ( ret < 0 )
error("fixme: Could not set %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
{
bcf1_t *aline = bcf_sr_get_line(args->files,1);
for (j=0; j<args->ncols; j++)
+ {
+ if ( !args->cols[j].setter ) continue;
if ( args->cols[j].setter(args,line,&args->cols[j],aline) )
error("fixme: Could not set %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
+ }
has_overlap = 1;
}
fprintf(stderr, "Usage: bcftools annotate [options] <in.vcf.gz>\n");
fprintf(stderr, "\n");
fprintf(stderr, "Options:\n");
- fprintf(stderr, " -a, --annotations FILE VCF file or tabix-indexed FILE with annotations: CHR\\tPOS[\\tVALUE]+\n");
- fprintf(stderr, " --collapse STR matching records by <snps|indels|both|all|some|none>, see man page for details [some]\n");
- fprintf(stderr, " -c, --columns LIST list of columns in the annotation file, e.g. CHROM,POS,REF,ALT,-,INFO/TAG. See man page for details\n");
- fprintf(stderr, " -C, --columns-file FILE read -c columns from FILE, one name per row, with optional --merge-logic TYPE: NAME[ TYPE]\n");
- fprintf(stderr, " -e, --exclude EXPR exclude sites for which the expression is true (see man page for details)\n");
- fprintf(stderr, " --force continue despite parsing error (at your own risk!)\n");
- fprintf(stderr, " -h, --header-lines FILE lines which should be appended to the VCF header\n");
- fprintf(stderr, " -I, --set-id [+]FORMAT set ID column using a `bcftools query`-like expression, see man page for details\n");
- fprintf(stderr, " -i, --include EXPR select sites for which the expression is true (see man page for details)\n");
- fprintf(stderr, " -k, --keep-sites leave -i/-e sites unchanged instead of discarding them\n");
- fprintf(stderr, " -l, --merge-logic TAG:TYPE merge logic for multiple overlapping regions (see man page for details), EXPERIMENTAL\n");
- fprintf(stderr, " -m, --mark-sites [+-]TAG add INFO/TAG flag to sites which are (\"+\") or are not (\"-\") listed in the -a file\n");
- fprintf(stderr, " --no-version do not append version and command line to the header\n");
- fprintf(stderr, " -o, --output FILE write output to a file [standard output]\n");
- fprintf(stderr, " -O, --output-type [b|u|z|v] b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n");
- fprintf(stderr, " -r, --regions REGION restrict to comma-separated list of regions\n");
- fprintf(stderr, " -R, --regions-file FILE restrict to regions listed in FILE\n");
- fprintf(stderr, " --rename-annots FILE rename annotations: TYPE/old\\tnew, where TYPE is one of FILTER,INFO,FORMAT\n");
- fprintf(stderr, " --rename-chrs FILE rename sequences according to the mapping: old\\tnew\n");
- fprintf(stderr, " -s, --samples [^]LIST comma separated list of samples to annotate (or exclude with \"^\" prefix)\n");
- fprintf(stderr, " -S, --samples-file [^]FILE file of samples to annotate (or exclude with \"^\" prefix)\n");
- fprintf(stderr, " --single-overlaps keep memory low by avoiding complexities arising from handling multiple overlapping intervals\n");
- fprintf(stderr, " -x, --remove LIST list of annotations (e.g. ID,INFO/DP,FORMAT/DP,FILTER) to remove (or keep with \"^\" prefix). See man page for details\n");
- fprintf(stderr, " --threads INT number of extra output compression threads [0]\n");
+ fprintf(stderr, " -a, --annotations FILE VCF file or tabix-indexed FILE with annotations: CHR\\tPOS[\\tVALUE]+\n");
+ fprintf(stderr, " --collapse STR Matching records by <snps|indels|both|all|some|none>, see man page for details [some]\n");
+ fprintf(stderr, " -c, --columns LIST List of columns in the annotation file, e.g. CHROM,POS,REF,ALT,-,INFO/TAG. See man page for details\n");
+ fprintf(stderr, " -C, --columns-file FILE Read -c columns from FILE, one name per row, with optional --merge-logic TYPE: NAME[ TYPE]\n");
+ fprintf(stderr, " -e, --exclude EXPR Exclude sites for which the expression is true (see man page for details)\n");
+ fprintf(stderr, " --force Continue despite parsing error (at your own risk!)\n");
+ fprintf(stderr, " -h, --header-lines FILE Lines which should be appended to the VCF header\n");
+ fprintf(stderr, " -I, --set-id [+]FORMAT Set ID column using a `bcftools query`-like expression, see man page for details\n");
+ fprintf(stderr, " -i, --include EXPR Select sites for which the expression is true (see man page for details)\n");
+ fprintf(stderr, " -k, --keep-sites Leave -i/-e sites unchanged instead of discarding them\n");
+ fprintf(stderr, " -l, --merge-logic TAG:TYPE Merge logic for multiple overlapping regions (see man page for details), EXPERIMENTAL\n");
+ fprintf(stderr, " -m, --mark-sites [+-]TAG Add INFO/TAG flag to sites which are (\"+\") or are not (\"-\") listed in the -a file\n");
+ fprintf(stderr, " --no-version Do not append version and command line to the header\n");
+ fprintf(stderr, " -o, --output FILE Write output to a file [standard output]\n");
+ fprintf(stderr, " -O, --output-type u|b|v|z[0-9] u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level [v]\n");
+ fprintf(stderr, " -r, --regions REGION Restrict to comma-separated list of regions\n");
+ fprintf(stderr, " -R, --regions-file FILE Restrict to regions listed in FILE\n");
+ fprintf(stderr, " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n");
+ fprintf(stderr, " --rename-annots FILE Rename annotations: TYPE/old\\tnew, where TYPE is one of FILTER,INFO,FORMAT\n");
+ fprintf(stderr, " --rename-chrs FILE Rename sequences according to the mapping: old\\tnew\n");
+ fprintf(stderr, " -s, --samples [^]LIST Comma separated list of samples to annotate (or exclude with \"^\" prefix)\n");
+ fprintf(stderr, " -S, --samples-file [^]FILE File of samples to annotate (or exclude with \"^\" prefix)\n");
+ fprintf(stderr, " --single-overlaps Keep memory low by avoiding complexities arising from handling multiple overlapping intervals\n");
+ fprintf(stderr, " -x, --remove LIST List of annotations (e.g. ID,INFO/DP,FORMAT/DP,FILTER) to remove (or keep with \"^\" prefix). See man page for details\n");
+ fprintf(stderr, " --threads INT Number of extra output compression threads [0]\n");
+ fprintf(stderr, "\n");
+ fprintf(stderr, "Examples:\n");
+ fprintf(stderr, " http://samtools.github.io/bcftools/howtos/annotate.html\n");
fprintf(stderr, "\n");
exit(1);
}
args->ref_idx = args->alt_idx = args->chr_idx = args->beg_idx = args->end_idx = -1;
args->set_ids_replace = 1;
args->match_id = -1;
+ args->clevel = -1;
int regions_is_file = 0, collapse = 0;
+ int regions_overlap = 1;
static struct option loptions[] =
{
{"exclude",required_argument,NULL,'e'},
{"regions",required_argument,NULL,'r'},
{"regions-file",required_argument,NULL,'R'},
+ {"regions-overlap",required_argument,NULL,3},
{"remove",required_argument,NULL,'x'},
{"columns-file",required_argument,NULL,'C'},
{"columns",required_argument,NULL,'c'},
{"force",no_argument,NULL,'f'},
{NULL,0,NULL,0}
};
+ char *tmp;
while ((c = getopt_long(argc, argv, "h:?o:O:r:R:a:x:c:C:i:e:S:s:I:m:kl:f",loptions,NULL)) >= 0)
{
switch (c) {
case 'u': args->output_type = FT_BCF; break;
case 'z': args->output_type = FT_VCF_GZ; break;
case 'v': args->output_type = FT_VCF; break;
- default: error("The output type \"%s\" not recognised\n", optarg);
+ default:
+ {
+ args->clevel = strtol(optarg,&tmp,10);
+ if ( *tmp || args->clevel<0 || args->clevel>9 ) error("The output type \"%s\" not recognised\n", optarg);
+ }
};
+ if ( optarg[1] )
+ {
+ args->clevel = strtol(optarg+1,&tmp,10);
+ if ( *tmp || args->clevel<0 || args->clevel>9 ) error("Could not parse argument: --compression-level %s\n", optarg+1);
+ }
break;
case 'e':
if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
else if ( !strcmp(optarg,"none") ) collapse = COLLAPSE_NONE;
else error("The --collapse string \"%s\" not recognised.\n", optarg);
break;
+ case 3 :
+ if ( !strcasecmp(optarg,"0") ) regions_overlap = 0;
+ else if ( !strcasecmp(optarg,"1") ) regions_overlap = 1;
+ else if ( !strcasecmp(optarg,"2") ) regions_overlap = 2;
+ else error("Could not parse: --regions-overlap %s\n",optarg);
+ break;
case 9 : args->n_threads = strtol(optarg, 0, 0); break;
case 8 : args->record_cmd_line = 0; break;
case 10 : args->single_overlaps = 1; break;
if ( args->regions_list )
{
+ bcf_sr_set_opt(args->files,BCF_SR_REGIONS_OVERLAP,regions_overlap);
if ( bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 )
error("Failed to read the regions: %s\n", args->regions_list);
}
}
annot_line_t;
-#define REPLACE_MISSING 0 // replace only missing values
-#define REPLACE_ALL 1 // replace both missing and existing values
-#define REPLACE_NON_MISSING 2 // replace only if tgt is not missing
-#define SET_OR_APPEND 3 // set new value if missing or non-existent, append otherwise
-#define MATCH_VALUE 4 // do not set, just match the value -c ~ID
+#define REPLACE_MISSING (1<<0) // -c +TAG .. replace only missing values; each mode is a single bit, tested as (col->replace & FLAG)
+#define REPLACE_ALL (1<<1) // -c TAG .. replace both missing and existing values
+#define REPLACE_NON_MISSING (1<<2) // -c -TAG .. replace only if tgt is not missing
+#define SET_OR_APPEND (1<<3) // -c =TAG .. set new value if missing or non-existent, append otherwise
+#define MATCH_VALUE (1<<4) // -c ~ID .. do not set, just match the value
+#define CARRY_OVER_MISSING (1<<5) // -c .TAG .. carry over source missing values as well; the setters test it together with REPLACE_ALL|REPLACE_NON_MISSING
#define MM_FIRST 0 // if multiple annotation lines overlap a VCF record, use the first, discarding the rest
#define MM_APPEND 1 // append, possibly multiple times
#define MM_UNIQUE 2 // append, only unique values
bcf_srs_t *files;
bcf_hdr_t *hdr, *hdr_out, *tgts_hdr;
htsFile *out_fh;
- int output_type, n_threads;
+ int output_type, n_threads, clevel;
bcf_sr_regions_t *tgts;
regidx_t *tgt_idx; // keep everything in memory only with .tab annotation file and -c BEG,END columns
annot_col_t *cols; // column indexes and setters
int ncols;
int match_id; // set iff `-c ~ID` given
+ int match_end; // set iff `-c ~INFO/END` is given
char *set_ids_fmt;
convert_t *set_ids;
void *keep = khash_str2int_init();
kstring_t str = {0,0,0};
char *ss = args->remove_annots;
+
+    int i, ntags = 0, needs_info = 0;                   // needs_info: does the --set-id expression use any INFO tag?
+    if ( args->set_ids )
+    {
+        const char **tags = convert_list_used_tags(args->set_ids,&ntags);
+        for (i=0; i<ntags; i++)
+            if ( !strncmp("INFO/",tags[i],5) ) needs_info = 1;   // compare the whole "INFO/" prefix, slash included
+    }
+
while ( *ss )
{
args->nrm++;
fprintf(bcftools_stderr,"Warning: The tag \"%s\" not defined in the header\n", str.s);
tag->key = strdup(str.s);
- if ( type==BCF_HL_INFO ) tag->handler = remove_info_tag;
+ if ( type==BCF_HL_INFO )
+ {
+ tag->handler = remove_info_tag;
+ if ( needs_info ) error("Error: `--remove INFO/%s` is executed first, cannot combine with `--set-id %s`\n",tag->key,args->set_ids_fmt);
+ }
else if ( type==BCF_HL_FMT ) tag->handler = remove_format_tag;
}
else if ( (type==BCF_HL_FMT && keep_fmt) || (type==BCF_HL_INFO && keep_info) )
else
{
tag->key = strdup(str.s);
- if ( type==BCF_HL_INFO ) tag->handler = remove_info_tag;
+ if ( type==BCF_HL_INFO )
+ {
+ tag->handler = remove_info_tag;
+ if ( needs_info ) error("Error: `--remove INFO/%s` is executed first, cannot combine with `--set-id %s`\n",tag->key,args->set_ids_fmt);
+ }
else if ( type==BCF_HL_FMT ) tag->handler = remove_format_tag;
if ( !args->keep_sites ) bcf_hdr_remove(args->hdr_out,type,tag->key);
}
else if ( !strcasecmp("QUAL",str.s) ) tag->handler = remove_qual;
else if ( !strcasecmp("INFO",str.s) )
{
+ if ( needs_info ) error("Error: `--remove INFO` is executed first, cannot combine with `--set-id %s`\n",args->set_ids_fmt);
tag->handler = remove_info;
if ( !args->keep_sites ) remove_hdr_lines(args->hdr_out,BCF_HL_INFO);
}
// note: so far this works only with one filter, not a list of filters
annot_line_t *tab = (annot_line_t*) data;
- if ( tab->cols[col->icol] && tab->cols[col->icol][0]=='.' && !tab->cols[col->icol][1] ) return 0; // don't replace with "."
+ if ( tab->cols[col->icol][0]=='.' && !tab->cols[col->icol][1] ) // don't overwrite with a missing value unless asked
+ {
+ if ( (col->replace & CARRY_OVER_MISSING) && (col->replace & (REPLACE_ALL|REPLACE_NON_MISSING)) ) bcf_update_filter(args->hdr_out,line,NULL,0);
+ return 0;
+ }
hts_expand(int,1,args->mtmpi,args->tmpi);
args->tmpi[0] = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, tab->cols[col->icol]);
if ( args->tmpi[0]<0 ) error("The FILTER \"%s\" is not defined in the header, was the -h option provided?\n", tab->cols[col->icol]);
- if ( col->replace==SET_OR_APPEND ) return bcf_add_filter(args->hdr_out,line,args->tmpi[0]);
- if ( col->replace!=REPLACE_MISSING )
+ if ( col->replace & SET_OR_APPEND ) return bcf_add_filter(args->hdr_out,line,args->tmpi[0]);
+ if ( !(col->replace & REPLACE_MISSING) )
{
bcf_update_filter(args->hdr_out,line,NULL,0);
return bcf_update_filter(args->hdr_out,line,args->tmpi,1);
bcf1_t *rec = (bcf1_t*) data;
if ( !(rec->unpacked & BCF_UN_FLT) ) bcf_unpack(rec, BCF_UN_FLT);
if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT);
- if ( !rec->d.n_flt ) return 0; // don't overwrite with a missing value
- if ( col->replace==SET_OR_APPEND || col->replace==REPLACE_MISSING )
+ if ( !rec->d.n_flt ) // don't overwrite with a missing value unless asked
{
- if ( col->replace==REPLACE_MISSING && line->d.n_flt ) return 0; // only update missing FILTER
+ if ( (col->replace & CARRY_OVER_MISSING) && (col->replace & (REPLACE_ALL|REPLACE_NON_MISSING)) ) bcf_update_filter(args->hdr_out,line,NULL,0);
+ return 0;
+ }
+ if ( col->replace & (SET_OR_APPEND|REPLACE_MISSING) )
+ {
+ if ( (col->replace & REPLACE_MISSING) && line->d.n_flt ) return 0; // only update missing FILTER
for (i=0; i<rec->d.n_flt; i++)
{
const char *flt = bcf_hdr_int2id(args->files->readers[1].header, BCF_DT_ID, rec->d.flt[i]);
bcf_update_filter(args->hdr_out,line,NULL,0);
return bcf_update_filter(args->hdr_out,line,args->tmpi,rec->d.n_flt);
}
+// Setter for the ~POS column: overwrite line->pos with the 1-based coordinate read from annotation column col->icol
+static int setter_pos(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
+{
+    char *tmp, *str = ((annot_line_t*) data)->cols[col->icol];
+    if ( !str || (str[0]=='.' && !str[1]) ) return 0;   // empty or "." column: nothing to transfer, keep the record's POS
+    int64_t pos = strtoll(str, &tmp, 10);               // parse as 64-bit so large coordinates are not truncated through int
+    if ( tmp==str )                                     // no digits consumed
+        error("Could not parse ~POS at %s:%"PRId64" .. [%s]\n",bcf_seqname(args->hdr,line),(int64_t)line->pos+1,str);
+    line->pos = pos - 1;                                // the record stores the coordinate 0-based
+    return 0;
+}
static int setter_id(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
{
if ( !data ) error("Error: the --merge-logic option cannot be used with ID (yet?)\n");
- if ( col->replace==MATCH_VALUE ) return 0;
+ if ( col->replace & MATCH_VALUE ) return 0;
// possible cases:
// IN ANNOT OUT ACHIEVED_BY
//
annot_line_t *tab = (annot_line_t*) data;
if ( tab->cols[col->icol] && tab->cols[col->icol][0]=='.' && !tab->cols[col->icol][1] ) return 0; // don't replace with "."
- if ( col->replace==SET_OR_APPEND ) return bcf_add_id(args->hdr_out,line,tab->cols[col->icol]);
- if ( col->replace!=REPLACE_MISSING ) return bcf_update_id(args->hdr_out,line,tab->cols[col->icol]);
+ if ( col->replace & SET_OR_APPEND ) return bcf_add_id(args->hdr_out,line,tab->cols[col->icol]);
+ if ( !(col->replace & REPLACE_MISSING) ) return bcf_update_id(args->hdr_out,line,tab->cols[col->icol]);
// running with +ID, only update missing ids
if ( !line->d.id || (line->d.id[0]=='.' && !line->d.id[1]) )
}
static int vcf_setter_id(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
{
- if ( col->replace==MATCH_VALUE ) return 0;
+ if ( col->replace & MATCH_VALUE ) return 0;
bcf1_t *rec = (bcf1_t*) data;
if ( rec->d.id && rec->d.id[0]=='.' && !rec->d.id[1] ) return 0; // don't replace with "."
id = rec->d.id;
}
- if ( col->replace==SET_OR_APPEND ) return bcf_add_id(args->hdr_out,line,id);
- if ( col->replace!=REPLACE_MISSING ) return bcf_update_id(args->hdr_out,line,id);
+ if ( col->replace & SET_OR_APPEND ) return bcf_add_id(args->hdr_out,line,id);
+ if ( !(col->replace & REPLACE_MISSING) ) return bcf_update_id(args->hdr_out,line,id);
// running with +ID, only update missing ids
if ( !line->d.id || (line->d.id[0]=='.' && !line->d.id[1]) )
annot_line_t *tab = (annot_line_t*) data;
char *str = tab->cols[col->icol];
- if ( str[0]=='.' && str[1]==0 ) return 0; // empty
-
- if ( col->replace==REPLACE_MISSING && !bcf_float_is_missing(line->qual) ) return 0;
+ if ( str[0]=='.' && str[1]==0 ) // don't overwrite with a missing value unless asked
+ {
+ if ( (col->replace & CARRY_OVER_MISSING) && (col->replace & (REPLACE_ALL|REPLACE_NON_MISSING)) ) bcf_float_set_missing(line->qual);
+ return 0;
+ }
+ if ( (col->replace & REPLACE_MISSING) && !bcf_float_is_missing(line->qual) ) return 0;
line->qual = strtod(str, &str);
if ( str == tab->cols[col->icol] )
static int vcf_setter_qual(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
{
bcf1_t *rec = (bcf1_t*) data;
- if ( bcf_float_is_missing(rec->qual) ) return 0;
- if ( col->replace==REPLACE_MISSING && !bcf_float_is_missing(line->qual) ) return 0;
+ if ( bcf_float_is_missing(rec->qual) ) // don't overwrite with a missing value unless asked
+ {
+ if ( (col->replace & CARRY_OVER_MISSING) && (col->replace & (REPLACE_ALL|REPLACE_NON_MISSING)) ) bcf_float_set_missing(line->qual);
+ return 0;
+ }
+ if ( (col->replace & REPLACE_MISSING) && !bcf_float_is_missing(line->qual) ) return 0;
line->qual = rec->qual;
return 0;
}
annot_line_t *tab = (annot_line_t*) data;
char *str = tab->cols[col->icol];
- if ( str[0]=='.' && str[1]==0 ) return 0;
+ if ( str[0]=='.' && str[1]==0 ) // don't overwrite with a missing value unless asked
+ {
+ if ( (col->replace & CARRY_OVER_MISSING) && (col->replace & (REPLACE_ALL|REPLACE_NON_MISSING)) ) bcf_update_info_flag(args->hdr_out,line,col->hdr_key_dst,NULL,0);
+ return 0;
+ }
if ( str[0]=='1' && str[1]==0 ) return bcf_update_info_flag(args->hdr_out,line,col->hdr_key_dst,NULL,1);
if ( str[0]=='0' && str[1]==0 ) return bcf_update_info_flag(args->hdr_out,line,col->hdr_key_dst,NULL,0);
if ( ntmpi2 < ndst ) args->tmpi2[i] = bcf_int32_missing;
continue;
}
- if ( ntmpi2==ndst && col->replace==REPLACE_MISSING
+ if ( ntmpi2==ndst && (col->replace & REPLACE_MISSING)
&& args->tmpi2[i]!=bcf_int32_missing
&& args->tmpi2[i]!=bcf_int32_vector_end ) continue;
// This is a bit hacky, only to reuse existing code with minimal changes:
// -c =TAG will now behave as -l TAG:APPEND for integers
- if ( col->replace==SET_OR_APPEND ) col->merge_method=MM_APPEND;
+ if ( col->replace & SET_OR_APPEND ) col->merge_method=MM_APPEND;
if ( !tab )
{
}
int i,ntmpi = 0;
+    if ( (col->replace & SET_OR_APPEND) && !col->mm_dbl_nused )     // -c =TAG and nothing buffered yet: seed the merge buffer from the record
+    {
+        ntmpi = bcf_get_info_int32(args->hdr, line, col->hdr_key_dst, &args->tmpi, &args->mtmpi);
+        if ( ntmpi>0 && (args->tmpi[0]!=bcf_int32_missing || (col->replace & CARRY_OVER_MISSING)) )
+        {
+            col->mm_dbl_nused = ntmpi;      // mm_dbl_ndat is set once below, as in the float setter
+            hts_expand(double,col->mm_dbl_nused,col->mm_dbl_nalloc,col->mm_dbl);
+            for (i=0; i<ntmpi; i++)
+                col->mm_dbl[i] = args->tmpi[i];
+            col->mm_dbl_ndat = 1;
+        }
+        ntmpi = 0;      // args->tmpi is reused for parsing the annotation value, start counting from zero again
+    }
if ( tab ) // has data, not flushing yet
{
char *str = tab->cols[col->icol], *end = str;
- if ( str[0]=='.' && str[1]==0 && col->merge_method!=MM_APPEND_MISSING ) return 1;
+ if ( str[0]=='.' && str[1]==0 && col->merge_method!=MM_APPEND_MISSING && !(col->replace & CARRY_OVER_MISSING) ) return 1;
while ( *end )
{
hts_expand(int32_t,ntmpi,args->mtmpi,args->tmpi);
if ( str[0]=='.' && (str[1]==0 || str[1]==',') )
{
- if ( col->merge_method==MM_APPEND_MISSING )
+ if ( col->merge_method==MM_APPEND_MISSING || (col->replace & CARRY_OVER_MISSING) )
args->tmpi[ntmpi-1] = bcf_int32_missing;
else
ntmpi--;
if ( col->number==BCF_VL_A || col->number==BCF_VL_R )
return setter_ARinfo_int32(args,line,col,tab->nals,tab->als,ntmpi);
- if ( col->replace==REPLACE_MISSING )
+ if ( col->replace & REPLACE_MISSING )
{
int ret = bcf_get_info_int32(args->hdr, line, col->hdr_key_dst, &args->tmpi2, &args->mtmpi2);
if ( ret>0 && args->tmpi2[0]!=bcf_int32_missing ) return 0;
}
-
return bcf_update_info_int32(args->hdr_out,line,col->hdr_key_dst,args->tmpi,ntmpi);
}
static int vcf_setter_info_int(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
if ( col->number==BCF_VL_A || col->number==BCF_VL_R )
return setter_ARinfo_int32(args,line,col,rec->n_allele,rec->d.allele,ntmpi);
- if ( col->replace==REPLACE_MISSING )
+ if ( col->replace & REPLACE_MISSING )
{
int ret = bcf_get_info_int32(args->hdr, line, col->hdr_key_dst, &args->tmpi2, &args->mtmpi2);
if ( ret>0 && args->tmpi2[0]!=bcf_int32_missing ) return 0;
if ( ntmpf2 < ndst ) bcf_float_set_missing(args->tmpf2[i]);
continue;
}
- if ( ntmpf2==ndst && col->replace==REPLACE_MISSING
+ if ( ntmpf2==ndst && (col->replace & REPLACE_MISSING)
&& !bcf_float_is_missing(args->tmpf2[i])
&& !bcf_float_is_vector_end(args->tmpf2[i]) ) continue;
// This is a bit hacky, only to reuse existing code with minimal changes:
// -c =TAG will now behave as -l TAG:APPEND for floats
- if ( col->replace==SET_OR_APPEND ) col->merge_method=MM_APPEND;
+ if ( col->replace & SET_OR_APPEND ) col->merge_method=MM_APPEND;
if ( !tab )
{
}
int i,ntmpf = 0;
- if ( tab )
+ if ( (col->replace & SET_OR_APPEND) && !col->mm_dbl_nused )
+ {
+ ntmpf = bcf_get_info_float(args->hdr, line, col->hdr_key_dst, &args->tmpf, &args->mtmpf);
+ if ( ntmpf>0 && (!bcf_float_is_missing(args->tmpf[0]) || (col->replace & CARRY_OVER_MISSING)) )
+ {
+ col->mm_dbl_nused = ntmpf;
+ hts_expand(double,col->mm_dbl_nused,col->mm_dbl_nalloc,col->mm_dbl);
+ for (i=0; i<ntmpf; i++)
+ if ( bcf_float_is_missing(args->tmpf[i]) )
+ bcf_double_set_missing(col->mm_dbl[i]);
+ else
+ col->mm_dbl[i] = args->tmpf[i];
+ col->mm_dbl_ndat = 1;
+ }
+ ntmpf = 0;
+ }
+ if ( tab ) // data row, not just flushing
{
char *str = tab->cols[col->icol], *end = str;
- if ( str[0]=='.' && str[1]==0 && col->merge_method!=MM_APPEND_MISSING ) return 1;
+ if ( str[0]=='.' && str[1]==0 && col->merge_method!=MM_APPEND_MISSING && !(col->replace & CARRY_OVER_MISSING) ) return 1;
while ( *end )
{
hts_expand(float,ntmpf,args->mtmpf,args->tmpf);
if ( str[0]=='.' && (str[1]==0 || str[1]==',') )
{
- if ( col->merge_method==MM_APPEND_MISSING )
+ if ( col->merge_method==MM_APPEND_MISSING || (col->replace & CARRY_OVER_MISSING) )
bcf_float_set_missing(args->tmpf[ntmpf-1]);
else
ntmpf--;
if ( col->number==BCF_VL_A || col->number==BCF_VL_R )
return setter_ARinfo_real(args,line,col,tab->nals,tab->als,ntmpf);
- if ( col->replace==REPLACE_MISSING )
+ if ( col->replace & REPLACE_MISSING )
{
int ret = bcf_get_info_float(args->hdr, line, col->hdr_key_dst, &args->tmpf2, &args->mtmpf2);
if ( ret>0 && !bcf_float_is_missing(args->tmpf2[0]) ) return 0;
if ( col->number==BCF_VL_A || col->number==BCF_VL_R )
return setter_ARinfo_real(args,line,col,rec->n_allele,rec->d.allele,ntmpf);
- if ( col->replace==REPLACE_MISSING )
+ if ( col->replace & REPLACE_MISSING )
{
int ret = bcf_get_info_float(args->hdr, line, col->hdr_key_dst, &args->tmpf2, &args->mtmpf2);
if ( ret>0 && !bcf_float_is_missing(args->tmpf2[0]) ) return 0;
if ( empty ) copy_string_field(".",0,1,&args->tmpks,i);
continue;
}
- if ( col->replace==REPLACE_MISSING )
+ if ( col->replace & REPLACE_MISSING )
{
// Do not replace filled values. The field must be looked up again because
// of realloc in copy_string_field
}
static int setter_info_str(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
{
- if ( col->replace==REPLACE_MISSING && col->number!=BCF_VL_A && col->number!=BCF_VL_R )
+ if ( (col->replace & REPLACE_MISSING) && col->number!=BCF_VL_A && col->number!=BCF_VL_R )
{
int ret = bcf_get_info_string(args->hdr, line, col->hdr_key_dst, &args->tmps2, &args->mtmps2);
if ( ret>0 && (args->tmps2[0]!='.' || args->tmps2[1]!=0) ) return 0;
// This is a bit hacky, only to reuse existing code with minimal changes:
// -c =TAG will now behave as -l TAG:unique for strings
- if ( col->replace==SET_OR_APPEND ) col->merge_method=MM_UNIQUE;
+ if ( col->replace & SET_OR_APPEND ) col->merge_method=MM_UNIQUE;
annot_line_t *tab = (annot_line_t*) data;
{
len = strlen(tab->cols[col->icol]);
if ( !len ) return 0;
- if ( len==1 && tab->cols[col->icol][0]=='.' && col->merge_method!=MM_APPEND_MISSING ) return 1;
+ if ( len==1 && tab->cols[col->icol][0]=='.' && col->merge_method!=MM_APPEND_MISSING && !(col->replace & CARRY_OVER_MISSING) ) return 1;
}
if ( col->merge_method!=MM_FIRST )
khash_str2int_inc(col->mm_str_hash, strdup(tab->cols[col->icol]));
}
+ if ( (col->replace & SET_OR_APPEND) && !col->mm_kstr.l )
+ {
+ int m = col->mm_kstr.m;
+ int n = bcf_get_info_string(args->hdr, line, col->hdr_key_dst, &col->mm_kstr.s, &m);
+ col->mm_kstr.m = m;
+ if ( n>0 && ((col->replace & CARRY_OVER_MISSING) || col->mm_kstr.s[0]!='.' || col->mm_kstr.s[1]) ) col->mm_kstr.l = n;
+ }
+
if ( col->mm_kstr.l ) kputc(',',&col->mm_kstr);
kputs(tab->cols[col->icol], &col->mm_kstr);
return 1;
if ( col->number==BCF_VL_A || col->number==BCF_VL_R )
return setter_ARinfo_string(args,line,col,rec->n_allele,rec->d.allele);
- if ( col->replace==REPLACE_MISSING )
+ if ( col->replace & REPLACE_MISSING )
{
int ret = bcf_get_info_string(args->hdr, line, col->hdr_key_dst, &args->tmps2, &args->mtmps2);
if ( ret>0 && (args->tmps2[0]!='.' || args->tmps2[1]!=0) ) return 0;
nsrc /= bcf_hdr_nsamples(args->files->readers[1].header);
if ( ndst<=0 ) // field not present in dst file
{
- if ( col->replace==REPLACE_NON_MISSING ) return 0;
+ if ( col->replace & REPLACE_NON_MISSING ) return 0;
hts_expand(int32_t, nsrc*bcf_hdr_nsamples(args->hdr_out), args->mtmpi2, args->tmpi2);
for (i=0; i<bcf_hdr_nsamples(args->hdr_out); i++)
{
if ( args->sample_map[i]==-1 ) continue;
int32_t *src = args->tmpi + nsrc*args->sample_map[i];
int32_t *dst = args->tmpi2 + ndst*i;
- if ( col->replace==REPLACE_NON_MISSING && bcf_gt_is_missing(dst[0]) ) continue;
- if ( col->replace==REPLACE_MISSING && !bcf_gt_is_missing(dst[0]) ) continue;
+ if ( (col->replace & REPLACE_NON_MISSING) && bcf_gt_is_missing(dst[0]) ) continue;
+ if ( (col->replace & REPLACE_MISSING) && !bcf_gt_is_missing(dst[0]) ) continue;
for (j=0; j<nsrc; j++) dst[j] = src[j];
for (; j<ndst; j++) dst[j] = bcf_int32_vector_end;
}
int32_t *dst = args->tmpi3 + nsrc*i;
int keep_ori = 0;
if ( args->sample_map[i]==-1 ) keep_ori = 1;
- else if ( col->replace==REPLACE_NON_MISSING && bcf_gt_is_missing(ori[0]) ) keep_ori = 1;
- else if ( col->replace==REPLACE_MISSING && !bcf_gt_is_missing(ori[0]) ) keep_ori = 1;
+ else if ( (col->replace & REPLACE_NON_MISSING) && bcf_gt_is_missing(ori[0]) ) keep_ori = 1;
+ else if ( (col->replace & REPLACE_MISSING) && !bcf_gt_is_missing(ori[0]) ) keep_ori = 1;
if ( keep_ori )
{
for (j=0; j<ndst; j++) dst[j] = ori[j];
if ( ndst > 0 ) ndst /= bcf_hdr_nsamples(args->hdr_out);
if ( ndst<=0 )
{
- if ( col->replace==REPLACE_NON_MISSING ) return 0; // overwrite only if present
+ if ( col->replace & REPLACE_NON_MISSING ) return 0; // overwrite only if present
hts_expand(int32_t, nvals*bcf_hdr_nsamples(args->hdr_out), args->mtmpi2, args->tmpi2);
for (i=0; i<bcf_hdr_nsamples(args->hdr_out); i++)
{
// . y y TAG,+TAG,-TAG .. REPLACE_ALL, REPLACE_MISSING, REPLACE_NON_MISSING
// x . x TAG,+TAG .. REPLACE_ALL, REPLACE_MISSING
// x . . -TAG .. REPLACE_NON_MISSING
- if ( col->replace==REPLACE_NON_MISSING ) { if ( dst[0]==bcf_int32_missing ) continue; }
- else if ( col->replace==REPLACE_MISSING ) { if ( dst[0]!=bcf_int32_missing ) continue; }
- else if ( col->replace==REPLACE_ALL ) { if ( src[0]==bcf_int32_missing ) continue; }
+ if ( col->replace & REPLACE_NON_MISSING ) { if ( dst[0]==bcf_int32_missing ) continue; }
+ else if ( col->replace & REPLACE_MISSING ) { if ( dst[0]!=bcf_int32_missing ) continue; }
+ else if ( col->replace & REPLACE_ALL ) { if ( src[0]==bcf_int32_missing ) continue; }
for (j=0; j<nvals; j++) dst[j] = src[j];
for (; j<ndst; j++) dst[j] = bcf_int32_vector_end;
}
int32_t *dst = args->tmpi3 + nvals*i; // expanded buffer
int use_new_ann = 1;
if ( args->sample_map[i]==-1 ) use_new_ann = 0;
- else if ( col->replace==REPLACE_NON_MISSING ) { if ( ori[0]==bcf_int32_missing ) use_new_ann = 0; }
- else if ( col->replace==REPLACE_MISSING ) { if ( ori[0]!=bcf_int32_missing ) use_new_ann = 0; }
- else if ( col->replace==REPLACE_ALL ) { if ( ann[0]==bcf_int32_missing ) use_new_ann = 0; }
+ else if ( col->replace & REPLACE_NON_MISSING ) { if ( ori[0]==bcf_int32_missing ) use_new_ann = 0; }
+ else if ( col->replace & REPLACE_MISSING ) { if ( ori[0]!=bcf_int32_missing ) use_new_ann = 0; }
+ else if ( col->replace & REPLACE_ALL ) { if ( ann[0]==bcf_int32_missing ) use_new_ann = 0; }
if ( !use_new_ann )
{
for (j=0; j<ndst; j++) dst[j] = ori[j];
if ( ndst > 0 ) ndst /= bcf_hdr_nsamples(args->hdr_out);
if ( ndst<=0 )
{
- if ( col->replace==REPLACE_NON_MISSING ) return 0; // overwrite only if present
+ if ( col->replace & REPLACE_NON_MISSING ) return 0; // overwrite only if present
hts_expand(float, nvals*bcf_hdr_nsamples(args->hdr_out), args->mtmpf2, args->tmpf2);
for (i=0; i<bcf_hdr_nsamples(args->hdr_out); i++)
{
if ( args->sample_map[i]==-1 ) continue;
float *src = vals + nvals*args->sample_map[i];
float *dst = args->tmpf2 + ndst*i;
- if ( col->replace==REPLACE_NON_MISSING ) { if ( bcf_float_is_missing(dst[0]) ) continue; }
- else if ( col->replace==REPLACE_MISSING ) { if ( !bcf_float_is_missing(dst[0]) ) continue; }
- else if ( col->replace==REPLACE_ALL ) { if ( bcf_float_is_missing(src[0]) ) continue; }
+ if ( col->replace & REPLACE_NON_MISSING ) { if ( bcf_float_is_missing(dst[0]) ) continue; }
+ else if ( col->replace & REPLACE_MISSING ) { if ( !bcf_float_is_missing(dst[0]) ) continue; }
+ else if ( col->replace & REPLACE_ALL ) { if ( bcf_float_is_missing(src[0]) ) continue; }
for (j=0; j<nvals; j++) dst[j] = src[j];
for (; j<ndst; j++) bcf_float_set_vector_end(dst[j]);
}
float *dst = args->tmpf3 + nvals*i; // expanded buffer
int use_new_ann = 1;
if ( args->sample_map[i]==-1 ) use_new_ann = 0;
- else if ( col->replace==REPLACE_NON_MISSING ) { if ( bcf_float_is_missing(ori[0]) ) use_new_ann = 0; }
- else if ( col->replace==REPLACE_MISSING ) { if ( !bcf_float_is_missing(ori[0]) ) use_new_ann = 0; }
- else if ( col->replace==REPLACE_ALL ) { if ( bcf_float_is_missing(ann[0]) ) use_new_ann = 0; }
+ else if ( col->replace & REPLACE_NON_MISSING ) { if ( bcf_float_is_missing(ori[0]) ) use_new_ann = 0; }
+ else if ( col->replace & REPLACE_MISSING ) { if ( !bcf_float_is_missing(ori[0]) ) use_new_ann = 0; }
+ else if ( col->replace & REPLACE_ALL ) { if ( bcf_float_is_missing(ann[0]) ) use_new_ann = 0; }
if ( !use_new_ann )
{
for (j=0; j<ndst; j++) dst[j] = ori[j];
char **src = vals + args->sample_map[i];
char **dst = args->tmpp2 + i;
- if ( col->replace==REPLACE_NON_MISSING ) { if ( (*dst)[0]=='.' && (*dst)[1]==0 ) continue; }
- else if ( col->replace==REPLACE_MISSING ) { if ( (*dst)[0]!='.' || (*dst)[1]!=0 ) continue; }
- else if ( col->replace==REPLACE_ALL ) { if ( (*src)[0]=='.' && (*src)[1]==0 ) continue; }
+ if ( col->replace & REPLACE_NON_MISSING ) { if ( (*dst)[0]=='.' && (*dst)[1]==0 ) continue; }
+ else if ( col->replace & REPLACE_MISSING ) { if ( (*dst)[0]!='.' || (*dst)[1]!=0 ) continue; }
+ else if ( col->replace & REPLACE_ALL ) { if ( (*src)[0]=='.' && (*src)[1]==0 ) continue; }
*dst = *src;
}
return bcf_update_format_string(args->hdr_out,line,col->hdr_key_dst,(const char**)args->tmpp2,nsmpl);
int ndst1 = ndst / nsmpl_dst;
if ( ndst <= 0 )
{
- if ( col->replace==REPLACE_NON_MISSING ) return 0; // overwrite only if present
+ if ( col->replace & REPLACE_NON_MISSING ) return 0; // overwrite only if present
if ( col->number==BCF_VL_G )
ndst1 = line->n_allele*(line->n_allele+1)/2;
else
int ndst1 = ndst / nsmpl_dst;
if ( ndst <= 0 )
{
- if ( col->replace==REPLACE_NON_MISSING ) return 0; // overwrite only if present
+ if ( col->replace & REPLACE_NON_MISSING ) return 0; // overwrite only if present
if ( col->number==BCF_VL_G )
ndst1 = line->n_allele*(line->n_allele+1)/2;
else
}
ksprintf(str,">\n");
}
+static char *set_replace_mode(char *ss, int *replace) // Parses the run of mode characters ('+', '-', '=', '.') at the start of ss.
+{ // Stores the OR of the matching flags in *replace (REPLACE_ALL when none is present) and returns a pointer to the first character that is not a mode character.
+ int mode = 0; // flags collected so far
+ while (*ss)
+ {
+ if ( *ss=='+' ) mode |= REPLACE_MISSING;
+ else if ( *ss=='-' ) mode |= REPLACE_NON_MISSING;
+ else if ( *ss=='=' ) mode |= SET_OR_APPEND;
+ else if ( *ss=='.' ) mode |= CARRY_OVER_MISSING;
+ else break; // first character that is not a mode character ends the scan
+ ss++;
+ }
+ if ( !mode ) mode = REPLACE_ALL; // no mode character was given
+// Disabled check: it would test whether exactly one bit is set; as written, any combination of the flags above is accepted.
+// if ( mode && !(mode && ((mode & mode-1) == 0)) )
+ *replace = mode;
+ return ss; // the caller continues parsing from here
+}
static void init_columns(args_t *args)
{
int need_sample_map = 0;
while ( *ss )
{
if ( *se && *se!=',' ) { se++; continue; }
- int replace = REPLACE_ALL;
- if ( *ss=='+' ) { replace = REPLACE_MISSING; ss++; }
- else if ( *ss=='-' ) { replace = REPLACE_NON_MISSING; ss++; }
- else if ( *ss=='=' ) { replace = SET_OR_APPEND; ss++; }
+ int replace;
+ ss = set_replace_mode(ss, &replace);
icol++;
str.l = 0;
kputsn(ss, se-ss, &str);
}
else if ( !strcasecmp("ID",str.s) || !strcasecmp("~ID",str.s) )
{
- if ( replace==REPLACE_NON_MISSING ) error("Apologies, the -ID feature has not been implemented yet.\n");
+ if ( replace & REPLACE_NON_MISSING ) error("Apologies, the -ID feature has not been implemented yet.\n");
if ( str.s[0]=='~' ) replace = MATCH_VALUE;
- if ( args->tgts_is_vcf && replace==MATCH_VALUE ) error("todo: -c ~ID with -a VCF?\n");
+ if ( args->tgts_is_vcf && (replace & MATCH_VALUE) ) error("todo: -c ~ID with -a VCF?\n");
args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
annot_col_t *col = &args->cols[args->ncols-1];
memset(col,0,sizeof(*col));
col->setter = args->tgts_is_vcf ? vcf_setter_id : setter_id;
col->hdr_key_src = strdup(str.s);
col->hdr_key_dst = strdup(str.s);
- if ( replace==MATCH_VALUE ) args->match_id = icol;
+ if ( replace & MATCH_VALUE ) args->match_id = icol;
+ }
+ else if ( !strcasecmp("~INFO/END",str.s) && !args->tgts_is_vcf )
+ {
+ replace = MATCH_VALUE;
+ args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
+ annot_col_t *col = &args->cols[args->ncols-1];
+ memset(col,0,sizeof(*col));
+ col->icol = icol;
+ col->replace = replace;
+ col->setter = NULL;
+ col->hdr_key_src = strdup(str.s);
+ col->hdr_key_dst = strdup(str.s);
+ args->match_end = icol;
+ }
+ else if ( !strcasecmp("~POS",str.s) )
+ {
+ // The source-type test has to live inside the branch. When it was part of the branch
+ // condition (&& !args->tgts_is_vcf), the error below could never fire and a ~POS column
+ // given with a VCF annotation source fell through to a later branch of this chain.
+ if ( args->tgts_is_vcf ) error("Error: cannot use ~POS, position can be replaced only from a tab-delimited file\n");
+ // Append a zero-initialised column descriptor handled by setter_pos
+ args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
+ annot_col_t *col = &args->cols[args->ncols-1];
+ memset(col,0,sizeof(*col));
+ col->icol = icol;
+ col->replace = replace;
+ col->setter = setter_pos;
+ col->hdr_key_src = strdup(str.s);
+ col->hdr_key_dst = strdup(str.s);
+ // NOTE(review): args->match_end is the column index the ~INFO/END branch sets for matching
+ // records by END; confirm that ~POS is meant to set it as well and that this is not a copy-paste leftover.
+ args->match_end = icol;
}
else if ( !strncasecmp("ID:=",str.s,4) ) // transfer a tag from INFO to ID column
{
if ( !args->tgts_is_vcf ) error("The annotation source must be a VCF for \"%s\"\n",str.s);
- if ( replace==REPLACE_NON_MISSING ) error("Apologies, the -ID feature has not been implemented yet.\n");
+ if ( replace & REPLACE_NON_MISSING ) error("Apologies, the -ID feature has not been implemented yet.\n");
args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
annot_col_t *col = &args->cols[args->ncols-1];
memset(col,0,sizeof(*col));
}
else if ( !strcasecmp("FILTER",str.s) )
{
- if ( replace==REPLACE_NON_MISSING ) error("Apologies, the -FILTER feature has not been implemented yet.\n");
+ if ( replace & REPLACE_NON_MISSING ) error("Apologies, the -FILTER feature has not been implemented yet.\n");
args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
annot_col_t *col = &args->cols[args->ncols-1];
memset(col,0,sizeof(*col));
}
else if ( !strcasecmp("QUAL",str.s) )
{
- if ( replace==REPLACE_NON_MISSING ) error("Apologies, the -QUAL feature has not been implemented yet.\n");
- if ( replace==SET_OR_APPEND ) error("Apologies, the =QUAL feature has not been implemented yet.\n");
+ if ( replace & REPLACE_NON_MISSING ) error("Apologies, the -QUAL feature has not been implemented yet.\n");
+ if ( replace & SET_OR_APPEND ) error("Apologies, the =QUAL feature has not been implemented yet.\n");
args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
annot_col_t *col = &args->cols[args->ncols-1];
memset(col,0,sizeof(*col));
}
else if ( args->tgts_is_vcf && !strcasecmp("INFO",str.s) ) // All INFO fields
{
- if ( replace==REPLACE_NON_MISSING ) error("Apologies, the -INFO/TAG feature has not been implemented yet.\n");
- if ( replace==SET_OR_APPEND ) error("Apologies, the =INFO feature has not been implemented yet.\n");
+ if ( replace & REPLACE_NON_MISSING ) error("Apologies, the -INFO/TAG feature has not been implemented yet.\n");
+ if ( replace & SET_OR_APPEND ) error("Apologies, the =INFO feature has not been implemented yet.\n");
bcf_hdr_t *tgts_hdr = args->files->readers[1].header;
int j;
for (j=0; j<tgts_hdr->nhrec; j++)
}
else
{
- if ( replace==REPLACE_NON_MISSING ) error("Apologies, the -INFO/TAG feature has not been implemented yet.\n");
- if ( replace==SET_OR_APPEND )
+ if ( replace & REPLACE_NON_MISSING ) error("Apologies, the -INFO/TAG feature has not been implemented yet.\n");
+ if ( replace & SET_OR_APPEND )
{
if ( args->tgts_is_vcf )
error("Error: the =INFO/TAG feature is currently supported only with TAB annotation files and has limitations\n"
key_dst = str.s + 5;
explicit_dst_info = 1;
}
+ else if ( !strcasecmp("~INFO/END",str.s) )
+ {
+ key_dst = str.s + 6;
+ explicit_dst_info = 1;
+ }
else
key_dst = str.s;
char *key_src = strstr(key_dst,":=");
case BCF_HT_STR: col->setter = args->tgts_is_vcf ? vcf_setter_info_str : setter_info_str; break;
default: error("The type of %s not recognised (%d)\n", str.s,bcf_hdr_id2type(args->hdr_out,BCF_HL_INFO,hdr_id));
}
- if ( replace==SET_OR_APPEND ) // change to Number=.
+ if ( replace & SET_OR_APPEND ) // change to Number=.
{
bcf_hrec_t *hrec = bcf_hdr_get_hrec(args->hdr_out, BCF_HL_INFO, "ID", key_dst, NULL);
if ( !hrec ) error("Uh, could not find the new tag \"%s\" in the header\n", key_dst);
args->hdr = args->files->readers[0].header;
args->hdr_out = bcf_hdr_dup(args->hdr);
+ if ( args->set_ids_fmt )
+ {
+ if ( args->set_ids_fmt[0]=='+' ) { args->set_ids_replace = 0; args->set_ids_fmt++; }
+ args->set_ids = convert_init(args->hdr_out, NULL, 0, args->set_ids_fmt);
+ }
if ( args->remove_annots ) init_remove_annots(args);
if ( args->header_fname ) init_header_lines(args);
if ( args->targets_fname && args->tgts_is_vcf )
if ( args->filter_str )
args->filter = filter_init(args->hdr, args->filter_str);
- if ( args->set_ids_fmt )
- {
- if ( args->set_ids_fmt[0]=='+' ) { args->set_ids_replace = 0; args->set_ids_fmt++; }
- args->set_ids = convert_init(args->hdr_out, NULL, 0, args->set_ids_fmt);
- }
-
if ( args->mark_sites )
{
if ( !args->targets_fname ) error("The -a option not given\n");
args->mark_sites,args->mark_sites_logic==MARK_LISTED?"":"not ",args->mark_sites);
}
- if (args->record_cmd_line) bcf_hdr_append_version(args->hdr_out, args->argc, args->argv, "bcftools_annotate");
+ if (args->record_cmd_line) bcf_hdr_append_version(args->hdr_out, args->argc, args->argv, "bcftools_annotate");
if ( !args->drop_header )
{
if ( args->rename_chrs ) rename_chrs(args, args->rename_chrs);
if ( args->rename_annots ) rename_annots(args, args->rename_annots);
- args->out_fh = hts_open(args->output_fname,hts_bcf_wmode2(args->output_type,args->output_fname));
+ char wmode[8];
+ set_wmode(wmode,args->output_type,args->output_fname,args->clevel);
+ args->out_fh = hts_open(args->output_fname ? args->output_fname : "-", wmode);
if ( args->out_fh == NULL ) error("[%s] Error: cannot write to \"%s\": %s\n", __func__,args->output_fname, strerror(errno));
if ( args->n_threads )
hts_set_opt(args->out_fh, HTS_OPT_THREAD_POOL, args->files->p);
while ( !bcf_sr_regions_overlap(args->tgts, bcf_seqname(args->hdr,line), start_pos,end_pos) )
{
+ if ( args->nalines + 1 == 0xffff ) break; // likely a symbolic allele, don't let the buffer overflow
args->nalines++;
hts_expand0(annot_line_t,args->nalines,args->malines,args->alines);
annot_line_t *tmp = &args->alines[args->nalines-1];
for (j=0; j<args->ncols; j++)
{
if ( args->cols[j].done==1 || args->cols[j].merge_method == MM_FIRST ) continue;
+ if ( !args->cols[j].setter ) continue;
if ( args->cols[j].setter(args,line,&args->cols[j],NULL) < 0 )
error("fixme: Could not set %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
}
if ( args->nalines >= 0xffff || line->n_allele >= 0xffff )
error("Error: too many alleles or annotation lines in the buffer at %s:%"PRId64" (todo:skip?)\n",bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
+ kstring_t match_end = {0,0,0};
+ if ( args->match_end>=0 && bcf_get_info_int32(args->hdr,line,"END",&args->tmpi,&args->mtmpi)==1 )
+ kputw(args->tmpi[0],&match_end);
+
// Find matching lines
for (i=0; i<args->nalines; i++)
{
ialt++;
}
if ( args->match_id>=0 && !strstr_match(line->d.id,args->alines[i].cols[args->match_id]) ) continue;
+ if ( match_end.l && strcmp(match_end.s,args->alines[i].cols[args->match_end]) ) continue;
args->srt_alines[args->nsrt_alines++] = (ialt<<16) | i;
has_overlap = 1;
break;
has_overlap = 1;
}
}
+
+ free(match_end.s);
+
// Sort lines if needed
if ( args->has_append_mode )
{
{
if ( args->cols[j].merge_method != MM_APPEND_MISSING ) continue;
if ( args->cols[j].done==1 ) continue;
+ if ( !args->cols[j].setter ) continue;
int ret = args->cols[j].setter(args,line,&args->cols[j],args->aline_missing);
if ( ret < 0 )
error("fixme: Could not set missing %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
for (j=0; j<args->ncols; j++)
{
if ( args->cols[j].done==1 ) continue;
+ if ( !args->cols[j].setter ) continue;
int ret = args->cols[j].setter(args,line,&args->cols[j],&args->alines[ilin]);
if ( ret < 0 )
error("fixme: Could not set %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
{
if ( args->cols[j].merge_method != MM_APPEND_MISSING ) continue;
if ( args->cols[j].done==1 ) continue;
+ if ( !args->cols[j].setter ) continue;
int ret = args->cols[j].setter(args,line,&args->cols[j],args->aline_missing);
if ( ret < 0 )
error("fixme: Could not set missing %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
for (j=0; j<args->ncols; j++)
{
if ( args->cols[j].done==1 || args->cols[j].merge_method == MM_FIRST ) continue;
+ if ( !args->cols[j].setter ) continue;
int ret = args->cols[j].setter(args,line,&args->cols[j],NULL);
if ( ret < 0 )
error("fixme: Could not set %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
{
bcf1_t *aline = bcf_sr_get_line(args->files,1);
for (j=0; j<args->ncols; j++)
+ {
+ if ( !args->cols[j].setter ) continue;
if ( args->cols[j].setter(args,line,&args->cols[j],aline) )
error("fixme: Could not set %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1);
+ }
has_overlap = 1;
}
fprintf(bcftools_stderr, "Usage: bcftools annotate [options] <in.vcf.gz>\n");
fprintf(bcftools_stderr, "\n");
fprintf(bcftools_stderr, "Options:\n");
- fprintf(bcftools_stderr, " -a, --annotations FILE VCF file or tabix-indexed FILE with annotations: CHR\\tPOS[\\tVALUE]+\n");
- fprintf(bcftools_stderr, " --collapse STR matching records by <snps|indels|both|all|some|none>, see man page for details [some]\n");
- fprintf(bcftools_stderr, " -c, --columns LIST list of columns in the annotation file, e.g. CHROM,POS,REF,ALT,-,INFO/TAG. See man page for details\n");
- fprintf(bcftools_stderr, " -C, --columns-file FILE read -c columns from FILE, one name per row, with optional --merge-logic TYPE: NAME[ TYPE]\n");
- fprintf(bcftools_stderr, " -e, --exclude EXPR exclude sites for which the expression is true (see man page for details)\n");
- fprintf(bcftools_stderr, " --force continue despite parsing error (at your own risk!)\n");
- fprintf(bcftools_stderr, " -h, --header-lines FILE lines which should be appended to the VCF header\n");
- fprintf(bcftools_stderr, " -I, --set-id [+]FORMAT set ID column using a `bcftools query`-like expression, see man page for details\n");
- fprintf(bcftools_stderr, " -i, --include EXPR select sites for which the expression is true (see man page for details)\n");
- fprintf(bcftools_stderr, " -k, --keep-sites leave -i/-e sites unchanged instead of discarding them\n");
- fprintf(bcftools_stderr, " -l, --merge-logic TAG:TYPE merge logic for multiple overlapping regions (see man page for details), EXPERIMENTAL\n");
- fprintf(bcftools_stderr, " -m, --mark-sites [+-]TAG add INFO/TAG flag to sites which are (\"+\") or are not (\"-\") listed in the -a file\n");
- fprintf(bcftools_stderr, " --no-version do not append version and command line to the header\n");
- fprintf(bcftools_stderr, " -o, --output FILE write output to a file [standard output]\n");
- fprintf(bcftools_stderr, " -O, --output-type [b|u|z|v] b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n");
- fprintf(bcftools_stderr, " -r, --regions REGION restrict to comma-separated list of regions\n");
- fprintf(bcftools_stderr, " -R, --regions-file FILE restrict to regions listed in FILE\n");
- fprintf(bcftools_stderr, " --rename-annots FILE rename annotations: TYPE/old\\tnew, where TYPE is one of FILTER,INFO,FORMAT\n");
- fprintf(bcftools_stderr, " --rename-chrs FILE rename sequences according to the mapping: old\\tnew\n");
- fprintf(bcftools_stderr, " -s, --samples [^]LIST comma separated list of samples to annotate (or exclude with \"^\" prefix)\n");
- fprintf(bcftools_stderr, " -S, --samples-file [^]FILE file of samples to annotate (or exclude with \"^\" prefix)\n");
- fprintf(bcftools_stderr, " --single-overlaps keep memory low by avoiding complexities arising from handling multiple overlapping intervals\n");
- fprintf(bcftools_stderr, " -x, --remove LIST list of annotations (e.g. ID,INFO/DP,FORMAT/DP,FILTER) to remove (or keep with \"^\" prefix). See man page for details\n");
- fprintf(bcftools_stderr, " --threads INT number of extra output compression threads [0]\n");
+ fprintf(bcftools_stderr, " -a, --annotations FILE VCF file or tabix-indexed FILE with annotations: CHR\\tPOS[\\tVALUE]+\n");
+ fprintf(bcftools_stderr, " --collapse STR Matching records by <snps|indels|both|all|some|none>, see man page for details [some]\n");
+ fprintf(bcftools_stderr, " -c, --columns LIST List of columns in the annotation file, e.g. CHROM,POS,REF,ALT,-,INFO/TAG. See man page for details\n");
+ fprintf(bcftools_stderr, " -C, --columns-file FILE Read -c columns from FILE, one name per row, with optional --merge-logic TYPE: NAME[ TYPE]\n");
+ fprintf(bcftools_stderr, " -e, --exclude EXPR Exclude sites for which the expression is true (see man page for details)\n");
+ fprintf(bcftools_stderr, " --force Continue despite parsing error (at your own risk!)\n");
+ fprintf(bcftools_stderr, " -h, --header-lines FILE Lines which should be appended to the VCF header\n");
+ fprintf(bcftools_stderr, " -I, --set-id [+]FORMAT Set ID column using a `bcftools query`-like expression, see man page for details\n");
+ fprintf(bcftools_stderr, " -i, --include EXPR Select sites for which the expression is true (see man page for details)\n");
+ fprintf(bcftools_stderr, " -k, --keep-sites Leave -i/-e sites unchanged instead of discarding them\n");
+ fprintf(bcftools_stderr, " -l, --merge-logic TAG:TYPE Merge logic for multiple overlapping regions (see man page for details), EXPERIMENTAL\n");
+ fprintf(bcftools_stderr, " -m, --mark-sites [+-]TAG Add INFO/TAG flag to sites which are (\"+\") or are not (\"-\") listed in the -a file\n");
+ fprintf(bcftools_stderr, " --no-version Do not append version and command line to the header\n");
+ fprintf(bcftools_stderr, " -o, --output FILE Write output to a file [standard output]\n");
+ fprintf(bcftools_stderr, " -O, --output-type u|b|v|z[0-9] u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level [v]\n");
+ fprintf(bcftools_stderr, " -r, --regions REGION Restrict to comma-separated list of regions\n");
+ fprintf(bcftools_stderr, " -R, --regions-file FILE Restrict to regions listed in FILE\n");
+ fprintf(bcftools_stderr, " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n");
+ fprintf(bcftools_stderr, " --rename-annots FILE Rename annotations: TYPE/old\\tnew, where TYPE is one of FILTER,INFO,FORMAT\n");
+ fprintf(bcftools_stderr, " --rename-chrs FILE Rename sequences according to the mapping: old\\tnew\n");
+ fprintf(bcftools_stderr, " -s, --samples [^]LIST Comma separated list of samples to annotate (or exclude with \"^\" prefix)\n");
+ fprintf(bcftools_stderr, " -S, --samples-file [^]FILE File of samples to annotate (or exclude with \"^\" prefix)\n");
+ fprintf(bcftools_stderr, " --single-overlaps Keep memory low by avoiding complexities arising from handling multiple overlapping intervals\n");
+ fprintf(bcftools_stderr, " -x, --remove LIST List of annotations (e.g. ID,INFO/DP,FORMAT/DP,FILTER) to remove (or keep with \"^\" prefix). See man page for details\n");
+ fprintf(bcftools_stderr, " --threads INT Number of extra output compression threads [0]\n");
+ fprintf(bcftools_stderr, "\n");
+ fprintf(bcftools_stderr, "Examples:\n");
+ fprintf(bcftools_stderr, " http://samtools.github.io/bcftools/howtos/annotate.html\n");
fprintf(bcftools_stderr, "\n");
bcftools_exit(1);
}
args->ref_idx = args->alt_idx = args->chr_idx = args->beg_idx = args->end_idx = -1;
args->set_ids_replace = 1;
args->match_id = -1;
+ args->clevel = -1;
int regions_is_file = 0, collapse = 0;
+ int regions_overlap = 1;
static struct option loptions[] =
{
{"exclude",required_argument,NULL,'e'},
{"regions",required_argument,NULL,'r'},
{"regions-file",required_argument,NULL,'R'},
+ {"regions-overlap",required_argument,NULL,3},
{"remove",required_argument,NULL,'x'},
{"columns-file",required_argument,NULL,'C'},
{"columns",required_argument,NULL,'c'},
{"force",no_argument,NULL,'f'},
{NULL,0,NULL,0}
};
+ char *tmp;
while ((c = getopt_long(argc, argv, "h:?o:O:r:R:a:x:c:C:i:e:S:s:I:m:kl:f",loptions,NULL)) >= 0)
{
switch (c) {
case 'u': args->output_type = FT_BCF; break;
case 'z': args->output_type = FT_VCF_GZ; break;
case 'v': args->output_type = FT_VCF; break;
- default: error("The output type \"%s\" not recognised\n", optarg);
+ default:
+ {
+ args->clevel = strtol(optarg,&tmp,10);
+ if ( *tmp || args->clevel<0 || args->clevel>9 ) error("The output type \"%s\" not recognised\n", optarg);
+ }
};
+ if ( optarg[1] )
+ {
+ args->clevel = strtol(optarg+1,&tmp,10);
+ if ( *tmp || args->clevel<0 || args->clevel>9 ) error("Could not parse argument: --compression-level %s\n", optarg+1);
+ }
break;
case 'e':
if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
else if ( !strcmp(optarg,"none") ) collapse = COLLAPSE_NONE;
else error("The --collapse string \"%s\" not recognised.\n", optarg);
break;
+ case 3 :
+ if ( !strcasecmp(optarg,"0") ) regions_overlap = 0;
+ else if ( !strcasecmp(optarg,"1") ) regions_overlap = 1;
+ else if ( !strcasecmp(optarg,"2") ) regions_overlap = 2;
+ else error("Could not parse: --regions-overlap %s\n",optarg);
+ break;
case 9 : args->n_threads = strtol(optarg, 0, 0); break;
case 8 : args->record_cmd_line = 0; break;
case 10 : args->single_overlaps = 1; break;
if ( args->regions_list )
{
+ bcf_sr_set_opt(args->files,BCF_SR_REGIONS_OVERLAP,regions_overlap);
if ( bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 )
error("Failed to read the regions: %s\n", args->regions_list);
}
typedef struct
{
int flag; // combination of CF_* flags above
- int output_type, n_threads, record_cmd_line;
+ int output_type, n_threads, record_cmd_line, clevel;
htsFile *bcf_in, *out_fh;
char *bcf_fname, *output_fname;
char **samples; // for subsampling and ploidy
int nsamples, *samples_map; // mapping from output sample names to original VCF
char *regions, *targets; // regions to process
- int regions_is_file, targets_is_file;
+ int regions_is_file, targets_is_file, regions_overlap;
regidx_t *tgt_idx;
regitr_t *tgt_itr, *tgt_itr_prev, *tgt_itr_tmp;
vcfbuf_t *vcfbuf;
if ( args->regions )
{
+ bcf_sr_set_opt(args->aux.srs,BCF_SR_REGIONS_OVERLAP,args->regions_overlap);
if ( bcf_sr_set_regions(args->aux.srs, args->regions, args->regions_is_file)<0 )
error("Failed to read the regions: %s\n", args->regions);
}
if ( args->aux.flag & CALL_CONSTR_ALLELES )
args->vcfbuf = vcfbuf_init(args->aux.hdr, 0);
- args->out_fh = hts_open(args->output_fname, hts_bcf_wmode2(args->output_type,args->output_fname));
+ char wmode[8];
+ set_wmode(wmode,args->output_type,args->output_fname,args->clevel);
+ args->out_fh = hts_open(args->output_fname ? args->output_fname : "-", wmode);
if ( args->out_fh == NULL ) error("Error: cannot write to \"%s\": %s\n", args->output_fname, strerror(errno));
if ( args->n_threads ) hts_set_threads(args->out_fh, args->n_threads);
fprintf(stderr, "Usage: bcftools call [options] <in.vcf.gz>\n");
fprintf(stderr, "\n");
fprintf(stderr, "File format options:\n");
- fprintf(stderr, " --no-version Do not append version and command line to the header\n");
- fprintf(stderr, " -o, --output FILE Write output to a file [standard output]\n");
- fprintf(stderr, " -O, --output-type b|u|z|v Output type: 'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]\n");
- fprintf(stderr, " --ploidy ASSEMBLY[?] Predefined ploidy, 'list' to print available settings, append '?' for details [2]\n");
- fprintf(stderr, " --ploidy-file FILE Space/tab-delimited list of CHROM,FROM,TO,SEX,PLOIDY\n");
- fprintf(stderr, " -r, --regions REGION Restrict to comma-separated list of regions\n");
- fprintf(stderr, " -R, --regions-file FILE Restrict to regions listed in a file\n");
- fprintf(stderr, " -s, --samples LIST List of samples to include [all samples]\n");
- fprintf(stderr, " -S, --samples-file FILE PED file or a file with an optional column with sex (see man page for details) [all samples]\n");
- fprintf(stderr, " -t, --targets REGION Similar to -r but streams rather than index-jumps\n");
- fprintf(stderr, " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n");
- fprintf(stderr, " --threads INT Use multithreading with INT worker threads [0]\n");
+ fprintf(stderr, " --no-version Do not append version and command line to the header\n");
+ fprintf(stderr, " -o, --output FILE Write output to a file [standard output]\n");
+ fprintf(stderr, " -O, --output-type u|b|v|z[0-9] u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level [v]\n");
+ fprintf(stderr, " --ploidy ASSEMBLY[?] Predefined ploidy, 'list' to print available settings, append '?' for details [2]\n");
+ fprintf(stderr, " --ploidy-file FILE Space/tab-delimited list of CHROM,FROM,TO,SEX,PLOIDY\n");
+ fprintf(stderr, " -r, --regions REGION Restrict to comma-separated list of regions\n");
+ fprintf(stderr, " -R, --regions-file FILE Restrict to regions listed in a file\n");
+ fprintf(stderr, " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n");
+ fprintf(stderr, " -s, --samples LIST List of samples to include [all samples]\n");
+ fprintf(stderr, " -S, --samples-file FILE PED file or a file with an optional column with sex (see man page for details) [all samples]\n");
+ fprintf(stderr, " -t, --targets REGION Similar to -r but streams rather than index-jumps\n");
+ fprintf(stderr, " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n");
+ fprintf(stderr, " --threads INT Use multithreading with INT worker threads [0]\n");
fprintf(stderr, "\n");
fprintf(stderr, "Input/output options:\n");
- fprintf(stderr, " -A, --keep-alts Keep all possible alternate alleles at variant sites\n");
- fprintf(stderr, " -a, --annotate LIST Optional tags to output (lowercase allowed); '?' to list available tags\n");
-//todo?
-// fprintf(stderr, " -a, --annots LIST Add annotations: GQ,GP,PV4 (lowercase allowed). Prefixed with ^ indicates a request for\n");
-// fprintf(stderr, " tag removal [^I16,^QS,^FMT/QS]\n");
- fprintf(stderr, " -F, --prior-freqs AN,AC Use prior allele frequencies, determined from these pre-filled tags\n");
- fprintf(stderr, " -G, --group-samples FILE|- Group samples by population (file with \"sample\\tgroup\") or \"-\" for single-sample calling.\n");
- fprintf(stderr, " This requires FORMAT/QS or other Number=R,Type=Integer tag such as FORMAT/AD\n");
- fprintf(stderr, " --group-samples-tag TAG The tag to use with -G, by default FORMAT/QS and FORMAT/AD are checked automatically\n");
- fprintf(stderr, " -g, --gvcf INT,[...] Group non-variant sites into gVCF blocks by minimum per-sample DP\n");
- fprintf(stderr, " -i, --insert-missed Output also sites missed by mpileup but present in -T\n");
- fprintf(stderr, " -M, --keep-masked-ref Keep sites with masked reference allele (REF=N)\n");
- fprintf(stderr, " -V, --skip-variants TYPE Skip indels/snps\n");
- fprintf(stderr, " -v, --variants-only Output variant sites only\n");
+ fprintf(stderr, " -A, --keep-alts Keep all possible alternate alleles at variant sites\n");
+ fprintf(stderr, " -a, --annotate LIST Optional tags to output (lowercase allowed); '?' to list available tags\n");
+ fprintf(stderr, " -F, --prior-freqs AN,AC Use prior allele frequencies, determined from these pre-filled tags\n");
+ fprintf(stderr, " -G, --group-samples FILE|- Group samples by population (file with \"sample\\tgroup\") or \"-\" for single-sample calling.\n");
+ fprintf(stderr, " This requires FORMAT/QS or other Number=R,Type=Integer tag such as FORMAT/AD\n");
+ fprintf(stderr, " --group-samples-tag TAG The tag to use with -G, by default FORMAT/QS and FORMAT/AD are checked automatically\n");
+ fprintf(stderr, " -g, --gvcf INT,[...] Group non-variant sites into gVCF blocks by minimum per-sample DP\n");
+ fprintf(stderr, " -i, --insert-missed Output also sites missed by mpileup but present in -T\n");
+ fprintf(stderr, " -M, --keep-masked-ref Keep sites with masked reference allele (REF=N)\n");
+ fprintf(stderr, " -V, --skip-variants TYPE Skip indels/snps\n");
+ fprintf(stderr, " -v, --variants-only Output variant sites only\n");
fprintf(stderr, "\n");
fprintf(stderr, "Consensus/variant calling options:\n");
- fprintf(stderr, " -c, --consensus-caller The original calling method (conflicts with -m)\n");
- fprintf(stderr, " -C, --constrain STR One of: alleles, trio (see manual)\n");
- fprintf(stderr, " -m, --multiallelic-caller Alternative model for multiallelic and rare-variant calling (conflicts with -c)\n");
- fprintf(stderr, " -n, --novel-rate FLOAT,[...] Likelihood of novel mutation for constrained trio calling, see man page for details [1e-8,1e-9,1e-9]\n");
- fprintf(stderr, " -p, --pval-threshold FLOAT Variant if P(ref|D)<FLOAT with -c [0.5]\n");
- fprintf(stderr, " -P, --prior FLOAT Mutation rate (use bigger for greater sensitivity), use with -m [1.1e-3]\n");
+ fprintf(stderr, " -c, --consensus-caller The original calling method (conflicts with -m)\n");
+ fprintf(stderr, " -C, --constrain STR One of: alleles, trio (see manual)\n");
+ fprintf(stderr, " -m, --multiallelic-caller Alternative model for multiallelic and rare-variant calling (conflicts with -c)\n");
+ fprintf(stderr, " -n, --novel-rate FLOAT,[...] Likelihood of novel mutation for constrained trio calling, see man page for details [1e-8,1e-9,1e-9]\n");
+ fprintf(stderr, " -p, --pval-threshold FLOAT Variant if P(ref|D)<FLOAT with -c [0.5]\n");
+ fprintf(stderr, " -P, --prior FLOAT Mutation rate (use bigger for greater sensitivity), use with -m [1.1e-3]\n");
fprintf(stderr, "\n");
fprintf(stderr, "Example:\n");
fprintf(stderr, " # See also http://samtools.github.io/bcftools/howtos/variant-calling.html\n");
args.record_cmd_line = 1;
args.aux.trio_Pm_SNPs = 1 - 1e-8;
args.aux.trio_Pm_ins = args.aux.trio_Pm_del = 1 - 1e-9;
+ args.regions_overlap = 1;
+ args.clevel = -1;
int c;
static struct option loptions[] =
{"output-type",required_argument,NULL,'O'},
{"regions",required_argument,NULL,'r'},
{"regions-file",required_argument,NULL,'R'},
+ {"regions-overlap",required_argument,NULL,4},
{"samples",required_argument,NULL,'s'},
{"samples-file",required_argument,NULL,'S'},
{"targets",required_argument,NULL,'t'},
case 'u': args.output_type = FT_BCF; break;
case 'z': args.output_type = FT_VCF_GZ; break;
case 'v': args.output_type = FT_VCF; break;
- default: error("The output type \"%s\" not recognised\n", optarg);
+ default:
+ {
+ args.clevel = strtol(optarg,&tmp,10);
+ if ( *tmp || args.clevel<0 || args.clevel>9 ) error("The output type \"%s\" not recognised\n", optarg);
+ }
+ }
+ if ( optarg[1] )
+ {
+ args.clevel = strtol(optarg+1,&tmp,10);
+ if ( *tmp || args.clevel<0 || args.clevel>9 ) error("Could not parse argument: --compression-level %s\n", optarg+1);
}
break;
case 'C':
case 'S': args.samples_fname = optarg; args.samples_is_file = 1; break;
case 9 : args.n_threads = strtol(optarg, 0, 0); break;
case 8 : args.record_cmd_line = 0; break;
+ case 4 :
+ if ( !strcasecmp(optarg,"0") ) args.regions_overlap = 0;
+ else if ( !strcasecmp(optarg,"1") ) args.regions_overlap = 1;
+ else if ( !strcasecmp(optarg,"2") ) args.regions_overlap = 2;
+ else error("Could not parse: --regions-overlap %s\n",optarg);
+ break;
default: usage(&args);
}
}
typedef struct
{
int flag; // combination of CF_* flags above
- int output_type, n_threads, record_cmd_line;
+ int output_type, n_threads, record_cmd_line, clevel;
htsFile *bcf_in, *out_fh;
char *bcf_fname, *output_fname;
char **samples; // for subsampling and ploidy
int nsamples, *samples_map; // mapping from output sample names to original VCF
char *regions, *targets; // regions to process
- int regions_is_file, targets_is_file;
+ int regions_is_file, targets_is_file, regions_overlap;
regidx_t *tgt_idx;
regitr_t *tgt_itr, *tgt_itr_prev, *tgt_itr_tmp;
vcfbuf_t *vcfbuf;
if ( args->regions )
{
+ bcf_sr_set_opt(args->aux.srs,BCF_SR_REGIONS_OVERLAP,args->regions_overlap);
if ( bcf_sr_set_regions(args->aux.srs, args->regions, args->regions_is_file)<0 )
error("Failed to read the regions: %s\n", args->regions);
}
if ( args->aux.flag & CALL_CONSTR_ALLELES )
args->vcfbuf = vcfbuf_init(args->aux.hdr, 0);
- args->out_fh = hts_open(args->output_fname, hts_bcf_wmode2(args->output_type,args->output_fname));
+ char wmode[8];
+ set_wmode(wmode,args->output_type,args->output_fname,args->clevel);
+ args->out_fh = hts_open(args->output_fname ? args->output_fname : "-", wmode);
if ( args->out_fh == NULL ) error("Error: cannot write to \"%s\": %s\n", args->output_fname, strerror(errno));
if ( args->n_threads ) hts_set_threads(args->out_fh, args->n_threads);
fprintf(bcftools_stderr, "Usage: bcftools call [options] <in.vcf.gz>\n");
fprintf(bcftools_stderr, "\n");
fprintf(bcftools_stderr, "File format options:\n");
- fprintf(bcftools_stderr, " --no-version Do not append version and command line to the header\n");
- fprintf(bcftools_stderr, " -o, --output FILE Write output to a file [standard output]\n");
- fprintf(bcftools_stderr, " -O, --output-type b|u|z|v Output type: 'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]\n");
- fprintf(bcftools_stderr, " --ploidy ASSEMBLY[?] Predefined ploidy, 'list' to print available settings, append '?' for details [2]\n");
- fprintf(bcftools_stderr, " --ploidy-file FILE Space/tab-delimited list of CHROM,FROM,TO,SEX,PLOIDY\n");
- fprintf(bcftools_stderr, " -r, --regions REGION Restrict to comma-separated list of regions\n");
- fprintf(bcftools_stderr, " -R, --regions-file FILE Restrict to regions listed in a file\n");
- fprintf(bcftools_stderr, " -s, --samples LIST List of samples to include [all samples]\n");
- fprintf(bcftools_stderr, " -S, --samples-file FILE PED file or a file with an optional column with sex (see man page for details) [all samples]\n");
- fprintf(bcftools_stderr, " -t, --targets REGION Similar to -r but streams rather than index-jumps\n");
- fprintf(bcftools_stderr, " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n");
- fprintf(bcftools_stderr, " --threads INT Use multithreading with INT worker threads [0]\n");
+ fprintf(bcftools_stderr, " --no-version Do not append version and command line to the header\n");
+ fprintf(bcftools_stderr, " -o, --output FILE Write output to a file [standard output]\n");
+ fprintf(bcftools_stderr, " -O, --output-type b|u|z|v Output type: 'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]\n");
+ fprintf(bcftools_stderr, " -O, --output-type u|b|v|z[0-9] u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level [v]\n");
+ fprintf(bcftools_stderr, " --ploidy ASSEMBLY[?] Predefined ploidy, 'list' to print available settings, append '?' for details [2]\n");
+ fprintf(bcftools_stderr, " --ploidy-file FILE Space/tab-delimited list of CHROM,FROM,TO,SEX,PLOIDY\n");
+ fprintf(bcftools_stderr, " -r, --regions REGION Restrict to comma-separated list of regions\n");
+ fprintf(bcftools_stderr, " -R, --regions-file FILE Restrict to regions listed in a file\n");
+ fprintf(bcftools_stderr, " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n");
+ fprintf(bcftools_stderr, " -s, --samples LIST List of samples to include [all samples]\n");
+ fprintf(bcftools_stderr, " -S, --samples-file FILE PED file or a file with an optional column with sex (see man page for details) [all samples]\n");
+ fprintf(bcftools_stderr, " -t, --targets REGION Similar to -r but streams rather than index-jumps\n");
+ fprintf(bcftools_stderr, " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n");
+ fprintf(bcftools_stderr, " --threads INT Use multithreading with INT worker threads [0]\n");
fprintf(bcftools_stderr, "\n");
fprintf(bcftools_stderr, "Input/output options:\n");
- fprintf(bcftools_stderr, " -A, --keep-alts Keep all possible alternate alleles at variant sites\n");
- fprintf(bcftools_stderr, " -a, --annotate LIST Optional tags to output (lowercase allowed); '?' to list available tags\n");
-//todo?
-// fprintf(bcftools_stderr, " -a, --annots LIST Add annotations: GQ,GP,PV4 (lowercase allowed). Prefixed with ^ indicates a request for\n");
-// fprintf(bcftools_stderr, " tag removal [^I16,^QS,^FMT/QS]\n");
- fprintf(bcftools_stderr, " -F, --prior-freqs AN,AC Use prior allele frequencies, determined from these pre-filled tags\n");
- fprintf(bcftools_stderr, " -G, --group-samples FILE|- Group samples by population (file with \"sample\\tgroup\") or \"-\" for single-sample calling.\n");
- fprintf(bcftools_stderr, " This requires FORMAT/QS or other Number=R,Type=Integer tag such as FORMAT/AD\n");
- fprintf(bcftools_stderr, " --group-samples-tag TAG The tag to use with -G, by default FORMAT/QS and FORMAT/AD are checked automatically\n");
- fprintf(bcftools_stderr, " -g, --gvcf INT,[...] Group non-variant sites into gVCF blocks by minimum per-sample DP\n");
- fprintf(bcftools_stderr, " -i, --insert-missed Output also sites missed by mpileup but present in -T\n");
- fprintf(bcftools_stderr, " -M, --keep-masked-ref Keep sites with masked reference allele (REF=N)\n");
- fprintf(bcftools_stderr, " -V, --skip-variants TYPE Skip indels/snps\n");
- fprintf(bcftools_stderr, " -v, --variants-only Output variant sites only\n");
+ fprintf(bcftools_stderr, " -A, --keep-alts Keep all possible alternate alleles at variant sites\n");
+ fprintf(bcftools_stderr, " -a, --annotate LIST Optional tags to output (lowercase allowed); '?' to list available tags\n");
+ fprintf(bcftools_stderr, " -F, --prior-freqs AN,AC Use prior allele frequencies, determined from these pre-filled tags\n");
+ fprintf(bcftools_stderr, " -G, --group-samples FILE|- Group samples by population (file with \"sample\\tgroup\") or \"-\" for single-sample calling.\n");
+ fprintf(bcftools_stderr, " This requires FORMAT/QS or other Number=R,Type=Integer tag such as FORMAT/AD\n");
+ fprintf(bcftools_stderr, " --group-samples-tag TAG The tag to use with -G, by default FORMAT/QS and FORMAT/AD are checked automatically\n");
+ fprintf(bcftools_stderr, " -g, --gvcf INT,[...] Group non-variant sites into gVCF blocks by minimum per-sample DP\n");
+ fprintf(bcftools_stderr, " -i, --insert-missed Output also sites missed by mpileup but present in -T\n");
+ fprintf(bcftools_stderr, " -M, --keep-masked-ref Keep sites with masked reference allele (REF=N)\n");
+ fprintf(bcftools_stderr, " -V, --skip-variants TYPE Skip indels/snps\n");
+ fprintf(bcftools_stderr, " -v, --variants-only Output variant sites only\n");
fprintf(bcftools_stderr, "\n");
fprintf(bcftools_stderr, "Consensus/variant calling options:\n");
- fprintf(bcftools_stderr, " -c, --consensus-caller The original calling method (conflicts with -m)\n");
- fprintf(bcftools_stderr, " -C, --constrain STR One of: alleles, trio (see manual)\n");
- fprintf(bcftools_stderr, " -m, --multiallelic-caller Alternative model for multiallelic and rare-variant calling (conflicts with -c)\n");
- fprintf(bcftools_stderr, " -n, --novel-rate FLOAT,[...] Likelihood of novel mutation for constrained trio calling, see man page for details [1e-8,1e-9,1e-9]\n");
- fprintf(bcftools_stderr, " -p, --pval-threshold FLOAT Variant if P(ref|D)<FLOAT with -c [0.5]\n");
- fprintf(bcftools_stderr, " -P, --prior FLOAT Mutation rate (use bigger for greater sensitivity), use with -m [1.1e-3]\n");
+ fprintf(bcftools_stderr, " -c, --consensus-caller The original calling method (conflicts with -m)\n");
+ fprintf(bcftools_stderr, " -C, --constrain STR One of: alleles, trio (see manual)\n");
+ fprintf(bcftools_stderr, " -m, --multiallelic-caller Alternative model for multiallelic and rare-variant calling (conflicts with -c)\n");
+ fprintf(bcftools_stderr, " -n, --novel-rate FLOAT,[...] Likelihood of novel mutation for constrained trio calling, see man page for details [1e-8,1e-9,1e-9]\n");
+ fprintf(bcftools_stderr, " -p, --pval-threshold FLOAT Variant if P(ref|D)<FLOAT with -c [0.5]\n");
+ fprintf(bcftools_stderr, " -P, --prior FLOAT Mutation rate (use bigger for greater sensitivity), use with -m [1.1e-3]\n");
fprintf(bcftools_stderr, "\n");
fprintf(bcftools_stderr, "Example:\n");
fprintf(bcftools_stderr, " # See also http://samtools.github.io/bcftools/howtos/variant-calling.html\n");
args.record_cmd_line = 1;
args.aux.trio_Pm_SNPs = 1 - 1e-8;
args.aux.trio_Pm_ins = args.aux.trio_Pm_del = 1 - 1e-9;
+ args.regions_overlap = 1;
+ args.clevel = -1;
int c;
static struct option loptions[] =
{"output-type",required_argument,NULL,'O'},
{"regions",required_argument,NULL,'r'},
{"regions-file",required_argument,NULL,'R'},
+ {"regions-overlap",required_argument,NULL,4},
{"samples",required_argument,NULL,'s'},
{"samples-file",required_argument,NULL,'S'},
{"targets",required_argument,NULL,'t'},
case 'u': args.output_type = FT_BCF; break;
case 'z': args.output_type = FT_VCF_GZ; break;
case 'v': args.output_type = FT_VCF; break;
- default: error("The output type \"%s\" not recognised\n", optarg);
+ default:
+ {
+ args.clevel = strtol(optarg,&tmp,10);
+ if ( *tmp || args.clevel<0 || args.clevel>9 ) error("The output type \"%s\" not recognised\n", optarg);
+ }
+ }
+ if ( optarg[1] )
+ {
+ args.clevel = strtol(optarg+1,&tmp,10);
+ if ( *tmp || args.clevel<0 || args.clevel>9 ) error("Could not parse argument: --compression-level %s\n", optarg+1);
}
break;
case 'C':
case 'S': args.samples_fname = optarg; args.samples_is_file = 1; break;
case 9 : args.n_threads = strtol(optarg, 0, 0); break;
case 8 : args.record_cmd_line = 0; break;
+ case 4 :
+ if ( !strcasecmp(optarg,"0") ) args.regions_overlap = 0;
+ else if ( !strcasecmp(optarg,"1") ) args.regions_overlap = 1;
+ else if ( !strcasecmp(optarg,"2") ) args.regions_overlap = 2;
+ else error("Could not parse: --regions-overlap %s\n",optarg);
+ break;
default: usage(&args);
}
}
/* The MIT License
- Copyright (c) 2014-2018 Genome Research Ltd.
+ Copyright (c) 2014-2021 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
" chr = row[0]\n"
" if chr[0]=='#': continue\n"
" if chr not in dat: dat[chr] = []\n"
- " dat[chr].append([row[1], float(row[2]), float(row[3])])\n"
+ " dat[chr].append([int(row[1]), float(row[2]), float(row[3])])\n"
"\n"
"cnv = {}\n"
"with open('%s', 'r') as f:\n"
" if chr[0]=='#': continue\n"
" if chr not in cnv: cnv[chr] = []\n"
" row[2] = int(row[2]) + 0.5\n"
+ " row[1] = int(row[1])\n"
" cnv[chr].append(row[1:])\n"
"\n"
"for chr in dat:\n"
" heat[1][x] = cn_dat[x][3]\n"
" heat[2][x] = cn_dat[x][4]\n"
" heat[3][x] = cn_dat[x][5]\n"
- " mesh = ax3.pcolormesh(xgrid, ygrid, heat, cmap='bwr_r')\n"
+ " mesh = ax3.pcolormesh(xgrid, ygrid, heat, cmap='bwr_r', shading='auto', alpha=0)\n"
" mesh.set_clim(vmin=-1,vmax=1)\n"
" ax3.plot([x[0] for x in cn_dat],[x[1] for x in cn_dat],'.-',ms=3,color='black')\n"
" fig.suptitle('%s (chr '+chr+')')\n"
" for row in reader:\n"
" chr = row[0]\n"
" if chr != plot_chr: continue\n"
- " dat.append([row[1], float(row[2]), float(row[3])])\n"
+ " dat.append([int(row[1]), float(row[2]), float(row[3])])\n"
"def read_cnv(file,cnv,plot_chr):\n"
" with open(file, 'r') as f:\n"
" reader = csv.reader(f, 'tab')\n"
" chr = row[0]\n"
" if chr != plot_chr: continue\n"
" row[2] = int(row[2]) + 0.5\n"
+ " row[1] = int(row[1])\n"
" cnv.append(row[1:])\n"
"def find_diffs(a,b):\n"
" out = []\n"
" heat[1][x] = cn_dat[x][3]\n"
" heat[2][x] = cn_dat[x][4]\n"
" heat[3][x] = cn_dat[x][5]\n"
- " mesh = ax3.pcolormesh(xgrid, ygrid, heat, cmap='bwr')\n"
+ " mesh = ax3.pcolormesh(xgrid, ygrid, heat, cmap='bwr', shading='auto', alpha=0)\n"
" mesh.set_clim(vmin=-1,vmax=1)\n"
" ax3.plot([x[0] for x in cn_dat],[x[1] for x in cn_dat],'-',ms=3,color='black',lw=1.7)\n"
"\n"
fprintf(stderr, "About: Copy number variation caller, requires Illumina's B-allele frequency (BAF) and Log R\n");
fprintf(stderr, " Ratio intensity (LRR). The HMM considers the following copy number states: CN 2\n");
fprintf(stderr, " (normal), 1 (single-copy loss), 0 (complete loss), 3 (single-copy gain)\n");
- fprintf(stderr, "Usage: bcftools cnv [OPTIONS] <file.vcf>\n");
+ fprintf(stderr, "Usage: bcftools cnv [OPTIONS] FILE.vcf\n");
fprintf(stderr, "General Options:\n");
- fprintf(stderr, " -c, --control-sample <string> optional control sample name to highlight differences\n");
- fprintf(stderr, " -f, --AF-file <file> read allele frequencies from file (CHR\\tPOS\\tREF,ALT\\tAF)\n");
- fprintf(stderr, " -o, --output-dir <path> \n");
- fprintf(stderr, " -p, --plot-threshold <float> plot aberrant chromosomes with quality at least 'float'\n");
- fprintf(stderr, " -r, --regions <region> restrict to comma-separated list of regions\n");
- fprintf(stderr, " -R, --regions-file <file> restrict to regions listed in a file\n");
- fprintf(stderr, " -s, --query-sample <string> query samply name\n");
- fprintf(stderr, " -t, --targets <region> similar to -r but streams rather than index-jumps\n");
- fprintf(stderr, " -T, --targets-file <file> similar to -R but streams rather than index-jumps\n");
+ fprintf(stderr, " -c, --control-sample STRING Optional control sample name to highlight differences\n");
+ fprintf(stderr, " -f, --AF-file FILE Read allele frequencies from file (CHR\\tPOS\\tREF,ALT\\tAF)\n");
+ fprintf(stderr, " -o, --output-dir PATH \n");
+ fprintf(stderr, " -p, --plot-threshold FLOAT Plot aberrant chromosomes with quality at least FLOAT\n");
+ fprintf(stderr, " -r, --regions REGION Restrict to comma-separated list of regions\n");
+ fprintf(stderr, " -R, --regions-file FILE Restrict to regions listed in a file\n");
+ fprintf(stderr, " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n");
+ fprintf(stderr, " -s, --query-sample STRING Query sample name\n");
+ fprintf(stderr, " -t, --targets REGION Similar to -r but streams rather than index-jumps\n");
+ fprintf(stderr, " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n");
+ fprintf(stderr, " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n");
fprintf(stderr, "HMM Options:\n");
- fprintf(stderr, " -a, --aberrant <float[,float]> fraction of aberrant cells in query and control [1.0,1.0]\n");
- fprintf(stderr, " -b, --BAF-weight <float> relative contribution from BAF [1]\n");
- fprintf(stderr, " -d, --BAF-dev <float[,float]> expected BAF deviation in query and control [0.04,0.04]\n"); // experimental
- fprintf(stderr, " -e, --err-prob <float> uniform error probability [1e-4]\n");
- fprintf(stderr, " -k, --LRR-dev <float[,float]> expected LRR deviation [0.2,0.2]\n"); // experimental
- fprintf(stderr, " -l, --LRR-weight <float> relative contribution from LRR [0.2]\n");
- fprintf(stderr, " -L, --LRR-smooth-win <int> window of LRR moving average smoothing [10]\n");
- fprintf(stderr, " -O, --optimize <float> estimate fraction of aberrant cells down to <float> [1.0]\n");
- fprintf(stderr, " -P, --same-prob <float> prior probability of -s/-c being the same [0.5]\n");
- fprintf(stderr, " -x, --xy-prob <float> P(x|y) transition probability [1e-9]\n");
+ fprintf(stderr, " -a, --aberrant FLOAT[,FLOAT] Fraction of aberrant cells in query and control [1.0,1.0]\n");
+ fprintf(stderr, " -b, --BAF-weight FLOAT Relative contribution from BAF [1]\n");
+ fprintf(stderr, " -d, --BAF-dev FLOAT[,FLOAT] Expected BAF deviation in query and control [0.04,0.04]\n"); // experimental
+ fprintf(stderr, " -e, --err-prob FLOAT Uniform error probability [1e-4]\n");
+ fprintf(stderr, " -k, --LRR-dev FLOAT[,FLOAT] Expected LRR deviation [0.2,0.2]\n"); // experimental
+ fprintf(stderr, " -l, --LRR-weight FLOAT Relative contribution from LRR [0.2]\n");
+ fprintf(stderr, " -L, --LRR-smooth-win INT Window of LRR moving average smoothing [10]\n");
+ fprintf(stderr, " -O, --optimize FLOAT Estimate fraction of aberrant cells down to FLOAT [1.0]\n");
+ fprintf(stderr, " -P, --same-prob FLOAT Prior probability of -s/-c being the same [0.5]\n");
+ fprintf(stderr, " -x, --xy-prob FLOAT P(x|y) transition probability [1e-9]\n");
fprintf(stderr, "\n");
exit(1);
}
args->query_sample.lrr_dev2 = args->control_sample.lrr_dev2 = 0.2*0.2; //0.20*0.20; // illumina: 0.18
int regions_is_file = 0, targets_is_file = 0;
+ int regions_overlap = 1;
+ int targets_overlap = 0;
+
static struct option loptions[] =
{
{"BAF-dev",1,0,'d'},
{"control-sample",1,0,'c'},
{"targets",1,0,'t'},
{"targets-file",1,0,'T'},
+ {"targets-overlap",required_argument,NULL,4},
{"regions",1,0,'r'},
{"regions-file",1,0,'R'},
+ {"regions-overlap",required_argument,NULL,3},
{"plot-threshold",1,0,'p'},
{"output-dir",1,0,'o'},
{0,0,0,0}
case 'T': args->targets_list = optarg; targets_is_file = 1; break;
case 'r': args->regions_list = optarg; break;
case 'R': args->regions_list = optarg; regions_is_file = 1; break;
+ case 3 :
+ if ( !strcasecmp(optarg,"0") ) regions_overlap = 0;
+ else if ( !strcasecmp(optarg,"1") ) regions_overlap = 1;
+ else if ( !strcasecmp(optarg,"2") ) regions_overlap = 2;
+ else error("Could not parse: --regions-overlap %s\n",optarg);
+ break;
+ case 4 :
+ if ( !strcasecmp(optarg,"0") ) targets_overlap = 0;
+ else if ( !strcasecmp(optarg,"1") ) targets_overlap = 1;
+ else if ( !strcasecmp(optarg,"2") ) targets_overlap = 2;
+ else error("Could not parse: --targets-overlap %s\n",optarg);
+ break;
case 'h':
case '?': usage(args); break;
default: error("Unknown argument: %s\n", optarg);
if ( !args->output_dir ) error("Expected -o option\n");
if ( args->regions_list )
{
+ bcf_sr_set_opt(args->files,BCF_SR_REGIONS_OVERLAP,regions_overlap);
if ( bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 )
error("Failed to read the regions: %s\n", args->regions_list);
}
if ( args->targets_list )
{
+ bcf_sr_set_opt(args->files,BCF_SR_TARGETS_OVERLAP,targets_overlap);
if ( bcf_sr_set_targets(args->files, args->targets_list, targets_is_file, 0)<0 )
error("Failed to read the targets: %s\n", args->targets_list);
}
/* The MIT License
- Copyright (c) 2014-2018 Genome Research Ltd.
+ Copyright (c) 2014-2021 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
" chr = row[0]\n"
" if chr[0]=='#': continue\n"
" if chr not in dat: dat[chr] = []\n"
- " dat[chr].append([row[1], float(row[2]), float(row[3])])\n"
+ " dat[chr].append([int(row[1]), float(row[2]), float(row[3])])\n"
"\n"
"cnv = {}\n"
"with open('%s', 'r') as f:\n"
" if chr[0]=='#': continue\n"
" if chr not in cnv: cnv[chr] = []\n"
" row[2] = int(row[2]) + 0.5\n"
+ " row[1] = int(row[1])\n"
" cnv[chr].append(row[1:])\n"
"\n"
"for chr in dat:\n"
" heat[1][x] = cn_dat[x][3]\n"
" heat[2][x] = cn_dat[x][4]\n"
" heat[3][x] = cn_dat[x][5]\n"
- " mesh = ax3.pcolormesh(xgrid, ygrid, heat, cmap='bwr_r')\n"
+ " mesh = ax3.pcolormesh(xgrid, ygrid, heat, cmap='bwr_r', shading='auto', alpha=0)\n"
" mesh.set_clim(vmin=-1,vmax=1)\n"
" ax3.plot([x[0] for x in cn_dat],[x[1] for x in cn_dat],'.-',ms=3,color='black')\n"
" fig.suptitle('%s (chr '+chr+')')\n"
" for row in reader:\n"
" chr = row[0]\n"
" if chr != plot_chr: continue\n"
- " dat.append([row[1], float(row[2]), float(row[3])])\n"
+ " dat.append([int(row[1]), float(row[2]), float(row[3])])\n"
"def read_cnv(file,cnv,plot_chr):\n"
" with open(file, 'r') as f:\n"
" reader = csv.reader(f, 'tab')\n"
" chr = row[0]\n"
" if chr != plot_chr: continue\n"
" row[2] = int(row[2]) + 0.5\n"
+ " row[1] = int(row[1])\n"
" cnv.append(row[1:])\n"
"def find_diffs(a,b):\n"
" out = []\n"
" heat[1][x] = cn_dat[x][3]\n"
" heat[2][x] = cn_dat[x][4]\n"
" heat[3][x] = cn_dat[x][5]\n"
- " mesh = ax3.pcolormesh(xgrid, ygrid, heat, cmap='bwr')\n"
+ " mesh = ax3.pcolormesh(xgrid, ygrid, heat, cmap='bwr', shading='auto', alpha=0)\n"
" mesh.set_clim(vmin=-1,vmax=1)\n"
" ax3.plot([x[0] for x in cn_dat],[x[1] for x in cn_dat],'-',ms=3,color='black',lw=1.7)\n"
"\n"
fprintf(bcftools_stderr, "About: Copy number variation caller, requires Illumina's B-allele frequency (BAF) and Log R\n");
fprintf(bcftools_stderr, " Ratio intensity (LRR). The HMM considers the following copy number states: CN 2\n");
fprintf(bcftools_stderr, " (normal), 1 (single-copy loss), 0 (complete loss), 3 (single-copy gain)\n");
- fprintf(bcftools_stderr, "Usage: bcftools cnv [OPTIONS] <file.vcf>\n");
+ fprintf(bcftools_stderr, "Usage: bcftools cnv [OPTIONS] FILE.vcf\n");
fprintf(bcftools_stderr, "General Options:\n");
- fprintf(bcftools_stderr, " -c, --control-sample <string> optional control sample name to highlight differences\n");
- fprintf(bcftools_stderr, " -f, --AF-file <file> read allele frequencies from file (CHR\\tPOS\\tREF,ALT\\tAF)\n");
- fprintf(bcftools_stderr, " -o, --output-dir <path> \n");
- fprintf(bcftools_stderr, " -p, --plot-threshold <float> plot aberrant chromosomes with quality at least 'float'\n");
- fprintf(bcftools_stderr, " -r, --regions <region> restrict to comma-separated list of regions\n");
- fprintf(bcftools_stderr, " -R, --regions-file <file> restrict to regions listed in a file\n");
- fprintf(bcftools_stderr, " -s, --query-sample <string> query samply name\n");
- fprintf(bcftools_stderr, " -t, --targets <region> similar to -r but streams rather than index-jumps\n");
- fprintf(bcftools_stderr, " -T, --targets-file <file> similar to -R but streams rather than index-jumps\n");
+ fprintf(bcftools_stderr, " -c, --control-sample STRING Optional control sample name to highlight differences\n");
+ fprintf(bcftools_stderr, " -f, --AF-file FILE Read allele frequencies from file (CHR\\tPOS\\tREF,ALT\\tAF)\n");
+ fprintf(bcftools_stderr, " -o, --output-dir PATH \n");
+ fprintf(bcftools_stderr, " -p, --plot-threshold FLOAT Plot aberrant chromosomes with quality at least FLOAT\n");
+ fprintf(bcftools_stderr, " -r, --regions REGION Restrict to comma-separated list of regions\n");
+ fprintf(bcftools_stderr, " -R, --regions-file FILE Restrict to regions listed in a file\n");
+ fprintf(bcftools_stderr, " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n");
+ fprintf(bcftools_stderr, " -s, --query-sample STRING Query sample name\n");
+ fprintf(bcftools_stderr, " -t, --targets REGION Similar to -r but streams rather than index-jumps\n");
+ fprintf(bcftools_stderr, " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n");
+ fprintf(bcftools_stderr, " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n");
fprintf(bcftools_stderr, "HMM Options:\n");
- fprintf(bcftools_stderr, " -a, --aberrant <float[,float]> fraction of aberrant cells in query and control [1.0,1.0]\n");
- fprintf(bcftools_stderr, " -b, --BAF-weight <float> relative contribution from BAF [1]\n");
- fprintf(bcftools_stderr, " -d, --BAF-dev <float[,float]> expected BAF deviation in query and control [0.04,0.04]\n"); // experimental
- fprintf(bcftools_stderr, " -e, --err-prob <float> uniform error probability [1e-4]\n");
- fprintf(bcftools_stderr, " -k, --LRR-dev <float[,float]> expected LRR deviation [0.2,0.2]\n"); // experimental
- fprintf(bcftools_stderr, " -l, --LRR-weight <float> relative contribution from LRR [0.2]\n");
- fprintf(bcftools_stderr, " -L, --LRR-smooth-win <int> window of LRR moving average smoothing [10]\n");
- fprintf(bcftools_stderr, " -O, --optimize <float> estimate fraction of aberrant cells down to <float> [1.0]\n");
- fprintf(bcftools_stderr, " -P, --same-prob <float> prior probability of -s/-c being the same [0.5]\n");
- fprintf(bcftools_stderr, " -x, --xy-prob <float> P(x|y) transition probability [1e-9]\n");
+ fprintf(bcftools_stderr, " -a, --aberrant FLOAT[,FLOAT] Fraction of aberrant cells in query and control [1.0,1.0]\n");
+ fprintf(bcftools_stderr, " -b, --BAF-weight FLOAT Relative contribution from BAF [1]\n");
+ fprintf(bcftools_stderr, " -d, --BAF-dev FLOAT[,FLOAT] Expected BAF deviation in query and control [0.04,0.04]\n"); // experimental
+ fprintf(bcftools_stderr, " -e, --err-prob FLOAT Uniform error probability [1e-4]\n");
+ fprintf(bcftools_stderr, " -k, --LRR-dev FLOAT[,FLOAT] Expected LRR deviation [0.2,0.2]\n"); // experimental
+ fprintf(bcftools_stderr, " -l, --LRR-weight FLOAT Relative contribution from LRR [0.2]\n");
+ fprintf(bcftools_stderr, " -L, --LRR-smooth-win INT Window of LRR moving average smoothing [10]\n");
+ fprintf(bcftools_stderr, " -O, --optimize FLOAT Estimate fraction of aberrant cells down to FLOAT [1.0]\n");
+ fprintf(bcftools_stderr, " -P, --same-prob FLOAT Prior probability of -s/-c being the same [0.5]\n");
+ fprintf(bcftools_stderr, " -x, --xy-prob FLOAT P(x|y) transition probability [1e-9]\n");
fprintf(bcftools_stderr, "\n");
bcftools_exit(1);
}
args->query_sample.lrr_dev2 = args->control_sample.lrr_dev2 = 0.2*0.2; //0.20*0.20; // illumina: 0.18
int regions_is_file = 0, targets_is_file = 0;
+ int regions_overlap = 1;
+ int targets_overlap = 0;
+
static struct option loptions[] =
{
{"BAF-dev",1,0,'d'},
{"control-sample",1,0,'c'},
{"targets",1,0,'t'},
{"targets-file",1,0,'T'},
+ {"targets-overlap",required_argument,NULL,4},
{"regions",1,0,'r'},
{"regions-file",1,0,'R'},
+ {"regions-overlap",required_argument,NULL,3},
{"plot-threshold",1,0,'p'},
{"output-dir",1,0,'o'},
{0,0,0,0}
case 'T': args->targets_list = optarg; targets_is_file = 1; break;
case 'r': args->regions_list = optarg; break;
case 'R': args->regions_list = optarg; regions_is_file = 1; break;
+ case 3 :
+ if ( !strcasecmp(optarg,"0") ) regions_overlap = 0;
+ else if ( !strcasecmp(optarg,"1") ) regions_overlap = 1;
+ else if ( !strcasecmp(optarg,"2") ) regions_overlap = 2;
+ else error("Could not parse: --regions-overlap %s\n",optarg);
+ break;
+ case 4 :
+ if ( !strcasecmp(optarg,"0") ) targets_overlap = 0;
+ else if ( !strcasecmp(optarg,"1") ) targets_overlap = 1;
+ else if ( !strcasecmp(optarg,"2") ) targets_overlap = 2;
+ else error("Could not parse: --targets-overlap %s\n",optarg);
+ break;
case 'h':
case '?': usage(args); break;
default: error("Unknown argument: %s\n", optarg);
if ( !args->output_dir ) error("Expected -o option\n");
if ( args->regions_list )
{
+ bcf_sr_set_opt(args->files,BCF_SR_REGIONS_OVERLAP,regions_overlap);
if ( bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 )
error("Failed to read the regions: %s\n", args->regions_list);
}
if ( args->targets_list )
{
+ bcf_sr_set_opt(args->files,BCF_SR_TARGETS_OVERLAP,targets_overlap);
if ( bcf_sr_set_targets(args->files, args->targets_list, targets_is_file, 0)<0 )
error("Failed to read the targets: %s\n", args->targets_list);
}
{
bcf_srs_t *files;
htsFile *out_fh;
- int output_type, n_threads, record_cmd_line;
+ int output_type, n_threads, record_cmd_line, clevel;
bcf_hdr_t *out_hdr;
int *seen_seq;
int *start_pos, start_tid, ifname;
int *swap_phase, nswap, *nmatch, *nmism;
bcf1_t **buf;
+ uint8_t *buf_mask;
int nbuf, mbuf, prev_chr, min_PQ, prev_pos_check;
int32_t *GTa, *GTb, mGTa, mGTb, *phase_qual, *phase_set;
char **argv, *output_fname, *file_list, **fnames, *remove_dups, *regions_list;
- int argc, nfnames, allow_overlaps, phased_concat, regions_is_file;
+ int argc, nfnames, allow_overlaps, phased_concat, regions_is_file, regions_overlap;
int compact_PS, phase_set_changed, naive_concat, naive_concat_trust_headers;
- int verbose;
+ int verbose, explicit_output_type, ligate_force, ligate_warn;
htsThreadPool *tpool;
}
args_t;
bcf_hdr_append(args->out_hdr,"##FORMAT=<ID=PS,Number=1,Type=Integer,Description=\"Phase Set\">");
}
if (args->record_cmd_line) bcf_hdr_append_version(args->out_hdr, args->argc, args->argv, "bcftools_concat");
- args->out_fh = hts_open(args->output_fname,hts_bcf_wmode2(args->output_type,args->output_fname));
+ char wmode[8];
+ set_wmode(wmode,args->output_type,args->output_fname,args->clevel);
+ args->out_fh = hts_open(args->output_fname ? args->output_fname : "-", wmode);
if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno));
if ( args->allow_overlaps || args->phased_concat )
{
{
if ( args->regions_list )
{
+ bcf_sr_set_opt(args->files,BCF_SR_REGIONS_OVERLAP,args->regions_overlap);
if ( bcf_sr_set_regions(args->files, args->regions_list, args->regions_is_file)<0 )
error("Failed to read the regions: %s\n", args->regions_list);
}
free(args->swap_phase);
for (i=0; i<args->mbuf; i++) bcf_destroy(args->buf[i]);
free(args->buf);
+ free(args->buf_mask);
free(args->GTa);
free(args->GTb);
free(args->nmatch);
int i, j, nsmpl = bcf_hdr_nsamples(args->out_hdr);
static int gt_absent_warned = 0;
-
for (i=0; i<args->nbuf; i+=2)
{
+ if ( args->buf_mask[i/2]!=3 ) continue;
+
bcf1_t *arec = args->buf[i];
bcf1_t *brec = args->buf[i+1];
}
for (i=0; i<args->nbuf/2; i+=2)
{
- bcf1_t *arec = args->buf[i];
- bcf_translate(args->out_hdr, args->files->readers[0].header, arec);
- if ( args->nswap )
- phase_update(args, args->out_hdr, arec);
+ bcf1_t *rec;
+ bcf_hdr_t *hdr;
+ int mask = args->buf_mask[i/2];
+ if ( mask & 1 ) { rec = args->buf[i]; hdr = args->files->readers[0].header; }
+ else { rec = args->buf[i+1]; hdr = args->files->readers[1].header; }
+ bcf_translate(args->out_hdr, hdr, rec);
+ if ( args->nswap && (mask&1) )
+ phase_update(args, args->out_hdr, rec);
if ( !args->compact_PS || args->phase_set_changed )
{
- bcf_update_format_int32(args->out_hdr,arec,"PS",args->phase_set,nsmpl);
+ bcf_update_format_int32(args->out_hdr,rec,"PS",args->phase_set,nsmpl);
args->phase_set_changed = 0;
}
- if ( bcf_write(args->out_fh, args->out_hdr, arec)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname);
+ if ( bcf_write(args->out_fh, args->out_hdr, rec)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname);
- if ( arec->pos < args->prev_pos_check ) error("FIXME, disorder: %s:%"PRId64" vs %d [1]\n", bcf_seqname(args->files->readers[0].header,arec),(int64_t) arec->pos+1,args->prev_pos_check+1);
- args->prev_pos_check = arec->pos;
+ if ( rec->pos < args->prev_pos_check ) error("FIXME, disorder: %s:%"PRId64" vs %d [1]\n", bcf_seqname(hdr,rec),(int64_t)rec->pos+1,args->prev_pos_check+1);
+ args->prev_pos_check = rec->pos;
}
args->nswap = 0;
for (j=0; j<nsmpl; j++)
int PQ_printed = 0;
for (; i<args->nbuf; i+=2)
{
- bcf1_t *brec = args->buf[i+1];
- bcf_translate(args->out_hdr, args->files->readers[1].header, brec);
- if ( !PQ_printed )
+ bcf1_t *rec;
+ bcf_hdr_t *hdr;
+ int mask = args->buf_mask[i/2];
+ if ( mask & 2 ) { rec = args->buf[i+1]; hdr = args->files->readers[1].header; }
+ else { rec = args->buf[i]; hdr = args->files->readers[0].header; }
+ bcf_translate(args->out_hdr, hdr, rec);
+ if ( !PQ_printed && mask==3 )
{
- bcf_update_format_int32(args->out_hdr,brec,"PQ",args->phase_qual,nsmpl);
+ bcf_update_format_int32(args->out_hdr,rec,"PQ",args->phase_qual,nsmpl);
PQ_printed = 1;
for (j=0; j<nsmpl; j++)
if ( args->phase_qual[j] < args->min_PQ )
{
- args->phase_set[j] = brec->pos+1;
+ args->phase_set[j] = rec->pos+1;
args->phase_set_changed = 1;
}
else if ( args->compact_PS ) args->phase_set[j] = bcf_int32_missing;
}
if ( args->nswap )
- phase_update(args, args->out_hdr, brec);
+ phase_update(args, args->out_hdr, rec);
if ( !args->compact_PS || args->phase_set_changed )
{
- bcf_update_format_int32(args->out_hdr,brec,"PS",args->phase_set,nsmpl);
+ bcf_update_format_int32(args->out_hdr,rec,"PS",args->phase_set,nsmpl);
args->phase_set_changed = 0;
}
- if ( bcf_write(args->out_fh, args->out_hdr, brec)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname);
+ if ( bcf_write(args->out_fh, args->out_hdr, rec)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname);
- if ( brec->pos < args->prev_pos_check ) error("FIXME, disorder: %s:%"PRId64" vs %d [2]\n", bcf_seqname(args->files->readers[1].header,brec),(int64_t) brec->pos+1,args->prev_pos_check+1);
- args->prev_pos_check = brec->pos;
+ if ( rec->pos < args->prev_pos_check ) error("FIXME, disorder: %s:%"PRId64" vs %d [2]\n", bcf_seqname(hdr,rec),(int64_t)rec->pos+1,args->prev_pos_check+1);
+ args->prev_pos_check = rec->pos;
}
args->nbuf = 0;
}
-static void phased_push(args_t *args, bcf1_t *arec, bcf1_t *brec)
+static void phased_push(args_t *args, bcf1_t *arec, bcf1_t *brec, int is_overlap)  // arec/brec: records from readers 0/1, either may be NULL; is_overlap==0 writes arec at once, otherwise the pair is buffered
{
+ bcf_hdr_t *ahdr = arec ? bcf_sr_get_header(args->files,0) : NULL;  // header is looked up only when the reader-0 record exists
+ bcf_hdr_t *bhdr = brec ? bcf_sr_get_header(args->files,1) : NULL;  // likewise for the reader-1 record
+
if ( arec && arec->errcode )
- error("Parse error at %s:%"PRId64", cannot proceed: %s\n", bcf_seqname(args->files->readers[0].header,arec),(int64_t) arec->pos+1, args->files->readers[0].fname);
+ error("Parse error at %s:%"PRId64", cannot proceed: %s\n", bcf_seqname(ahdr,arec),(int64_t) arec->pos+1, args->files->readers[0].fname);
if ( brec && brec->errcode )
- error("Parse error at %s:%"PRId64", cannot proceed: %s\n", bcf_seqname(args->files->readers[1].header,brec),(int64_t) brec->pos+1, args->files->readers[1].fname);
+ error("Parse error at %s:%"PRId64", cannot proceed: %s\n", bcf_seqname(bhdr,brec),(int64_t) brec->pos+1, args->files->readers[1].fname);
int i, nsmpl = bcf_hdr_nsamples(args->out_hdr);
- int chr_id = bcf_hdr_name2id(args->out_hdr, bcf_seqname(args->files->readers[0].header,arec));
+ int chr_id = arec ? bcf_hdr_name2id(args->out_hdr,bcf_seqname(ahdr,arec)) : bcf_hdr_name2id(args->out_hdr,bcf_seqname(bhdr,brec));  // sequence id in the output header, taken from arec when present, else from brec
if ( args->prev_chr<0 || args->prev_chr!=chr_id )
{
if ( args->prev_chr>=0 ) phased_flush(args);
for (i=0; i<nsmpl; i++)
- args->phase_set[i] = arec->pos+1;
+ args->phase_set[i] = arec ? arec->pos+1 : brec->pos+1;  // new sequence: phase set starts at the 1-based position of whichever record is present
args->phase_set_changed = 1;
- if ( args->seen_seq[chr_id] ) error("The chromosome block %s is not contiguous\n", bcf_seqname(args->files->readers[0].header,arec));
+ if ( args->seen_seq[chr_id] ) error("The chromosome block %s is not contiguous\n", arec ? bcf_seqname(ahdr,arec) : bcf_seqname(bhdr,brec));
args->seen_seq[chr_id] = 1;
args->prev_chr = chr_id;
args->prev_pos_check = -1;
}
- if ( !brec )
+ if ( !is_overlap )  // caller says we are outside an overlap: write arec directly and return without buffering
{
- bcf_translate(args->out_hdr, args->files->readers[0].header, arec);
+ assert(arec);  // everything below in this branch dereferences arec unconditionally
+
+ bcf_translate(args->out_hdr, ahdr, arec);
if ( args->nswap )
phase_update(args, args->out_hdr, arec);
if ( !args->compact_PS || args->phase_set_changed )
if ( bcf_write(args->out_fh, args->out_hdr, arec)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname);
if ( arec->pos < args->prev_pos_check )
- error("FIXME, disorder: %s:%"PRId64" in %s vs %d written [3]\n", bcf_seqname(args->files->readers[0].header,arec), (int64_t) arec->pos+1,args->files->readers[0].fname, args->prev_pos_check+1);
+ error("FIXME, disorder: %s:%"PRId64" in %s vs %d written [3]\n", bcf_seqname(ahdr,arec), (int64_t) arec->pos+1,args->files->readers[0].fname, args->prev_pos_check+1);
args->prev_pos_check = arec->pos;
return;
}
int m = args->mbuf;
args->nbuf += 2;
hts_expand(bcf1_t*,args->nbuf,args->mbuf,args->buf);
+ if ( m < args->mbuf ) args->buf_mask = (uint8_t*)realloc(args->buf_mask,sizeof(*args->buf_mask)*args->mbuf);  // mbuf grew across hts_expand, resize buf_mask to match; NOTE(review): realloc result is not checked
for (i=m; i<args->mbuf; i++)
args->buf[i] = bcf_init1();
- SWAP(bcf1_t*, args->files->readers[0].buffer[0], args->buf[args->nbuf-2]);
- SWAP(bcf1_t*, args->files->readers[1].buffer[0], args->buf[args->nbuf-1]);
+ if ( arec ) SWAP(bcf1_t*, args->files->readers[0].buffer[0], args->buf[args->nbuf-2]);  // even slot of the pair takes the reader-0 record; left untouched when arec is NULL
+ if ( brec ) SWAP(bcf1_t*, args->files->readers[1].buffer[0], args->buf[args->nbuf-1]);  // odd slot of the pair takes the reader-1 record; left untouched when brec is NULL
+ args->buf_mask[args->nbuf/2-1] = (arec?1:0) | (brec?2:0);  // one mask per buffered pair: bit value 1 = reader-0 record present, 2 = reader-1 record present
+}
+
+static int _get_active_index(bcf_srs_t *sr)  // returns the lowest reader index for which bcf_sr_has_line() is true, or -1 if there is none
+{
+ int i;
+ for (i=0; i<sr->nreaders; i++)  // readers are scanned in ascending index order
+ if ( bcf_sr_has_line(sr,i) ) return i;
+ return -1;  // NOTE(review): callers in concat() pass the result straight to bcf_sr_get_line()/bcf_sr_get_header(); confirm -1 cannot reach them
}
static void concat(args_t *args)
else if ( new_file )
bcf_sr_seek(args->files,NULL,0); // set to start
- int nret;
+ int nret, ir;
while ( (nret = bcf_sr_next_line(args->files)) )
{
+ int is_overlap = args->files->nreaders==1 ? 0 : 1;
if ( !bcf_sr_has_line(args->files,0) ) // no input from the first reader
{
// We are assuming that there is a perfect overlap, sites which are not present in both files are dropped
- if ( ! bcf_sr_region_done(args->files,0) )
+ if ( bcf_sr_region_done(args->files,0) )
+ {
+ phased_flush(args);
+ bcf_sr_remove_reader(args->files, 0);
+ is_overlap = 0;
+ }
+ else if ( args->ligate_warn )
{
if ( !site_drop_warned )
{
+ ir = _get_active_index(args->files);
fprintf(stderr,
"Warning: Dropping the site %s:%"PRId64". The --ligate option is intended for VCFs with perfect\n"
" overlap, sites in overlapping regions present in one but missing in other are dropped.\n"
" This warning is printed only once.\n",
- bcf_seqname(bcf_sr_get_header(args->files,1),bcf_sr_get_line(args->files,1)), (int64_t) bcf_sr_get_line(args->files,1)->pos+1
- );
+ bcf_seqname(bcf_sr_get_header(args->files,ir),bcf_sr_get_line(args->files,ir)), (int64_t) bcf_sr_get_line(args->files,ir)->pos+1);
site_drop_warned = 1;
}
continue;
}
- phased_flush(args);
- bcf_sr_remove_reader(args->files, 0);
+ else if ( !args->ligate_force )
+ {
+ ir = _get_active_index(args->files);
+ error("Error: The --ligate option is intended for VCFs with perfect overlap, the site %s:%"PRId64" breaks the assumption\n",
+ bcf_seqname(bcf_sr_get_header(args->files,ir),bcf_sr_get_line(args->files,ir)), (int64_t) bcf_sr_get_line(args->files,ir)->pos+1);
+ }
}
// Get a line to learn about current position
- for (i=0; i<args->files->nreaders; i++)
- if ( bcf_sr_has_line(args->files,i) ) break;
- bcf1_t *line = bcf_sr_get_line(args->files,i);
+ ir = _get_active_index(args->files);
+ bcf1_t *line = bcf_sr_get_line(args->files,ir);
// This can happen after bcf_sr_seek: indel may start before the coordinate which we seek to.
- if ( seek_chr>=0 && seek_pos>line->pos && seek_chr==bcf_hdr_name2id(args->out_hdr, bcf_seqname(args->files->readers[i].header,line)) ) continue;
+ if ( seek_chr>=0 && seek_pos>line->pos && seek_chr==bcf_hdr_name2id(args->out_hdr, bcf_seqname(args->files->readers[ir].header,line)) ) continue;
seek_pos = seek_chr = -1;
// Check if the position overlaps with the next, yet unopened, reader
}
if ( must_seek )
{
- bcf_sr_seek(args->files, bcf_seqname(args->files->readers[i].header,line), line->pos);
+ bcf_sr_seek(args->files, bcf_seqname(args->files->readers[ir].header,line), line->pos);
seek_pos = line->pos;
- seek_chr = bcf_hdr_name2id(args->out_hdr, bcf_seqname(args->files->readers[i].header,line));
+ seek_chr = bcf_hdr_name2id(args->out_hdr, bcf_seqname(args->files->readers[ir].header,line));
continue;
}
// We are assuming that there is a perfect overlap, sites which are not present in both files are dropped
- if ( args->files->nreaders>1 && !bcf_sr_has_line(args->files,1) && !bcf_sr_region_done(args->files,1) ) continue;
+ if ( args->files->nreaders>1 && !bcf_sr_has_line(args->files,1) && !bcf_sr_region_done(args->files,1) && !args->ligate_force )
+ {
+ if ( args->ligate_warn && !site_drop_warned )
+ {
+ ir = _get_active_index(args->files);
+ fprintf(stderr,
+ "Warning: Dropping the site %s:%"PRId64". The --ligate option is intended for VCFs with perfect\n"
+ " overlap, sites in overlapping regions present in one but missing in other are dropped.\n"
+ " This warning is printed only once.\n",
+ bcf_seqname(bcf_sr_get_header(args->files,ir),line), (int64_t) line->pos+1);
+ site_drop_warned = 1;
+ }
+ else if ( !args->ligate_warn )
+ {
+ ir = _get_active_index(args->files);
+ error("Error: The --ligate option is intended for VCFs with perfect overlap, the site %s:%"PRId64" breaks the assumption\n",
+ bcf_seqname(bcf_sr_get_header(args->files,ir),bcf_sr_get_line(args->files,ir)), (int64_t) bcf_sr_get_line(args->files,ir)->pos+1);
+ }
+ continue;
+ }
- phased_push(args, bcf_sr_get_line(args->files,0), args->files->nreaders>1 ? bcf_sr_get_line(args->files,1) : NULL);
+ bcf1_t *line0 = bcf_sr_get_line(args->files,0);
+ bcf1_t *line1 = args->files->nreaders > 1 ? bcf_sr_get_line(args->files,1) : NULL;
+ phased_push(args, line0, line1, is_overlap);
}
if ( args->files->nreaders )
// if BCF, check if tag IDs are consistent in the dictionary of strings
if ( type.compression!=bgzf )
error("The --naive option works only for compressed BCFs or VCFs, sorry :-/\n");
- if ( type.format==vcf )
- {
- bcf_hdr_destroy(hdr);
- continue;
- }
_check_hrecs(hdr0,hdr,args->fnames[0],args->fnames[i]);
_check_hrecs(hdr,hdr0,args->fnames[i],args->fnames[0]);
// only compressed BCF atm
BGZF *bgzf_out = bgzf_open(args->output_fname,"w");;
+ htsFormat output_type;
+ output_type.format = (args->output_type & FT_VCF) ? vcf : bcf;
+ output_type.compression = (args->output_type & FT_GZ) ? bgzf : no_compression;
+
struct timeval t0, t1;
const size_t page_size = BGZF_MAX_BLOCK_SIZE;
uint8_t *buf = (uint8_t*) malloc(page_size);
htsFormat type = *hts_get_format(hts_fp);
if ( type.compression!=bgzf )
- error("\nThe --naive option works only for compressed BCFs or VCFs, sorry :-/\n");
+ error("\nThe --naive option works only for compressed BCFs or VCFs\n");
file_types |= type.format==vcf ? 1 : 2;
if ( file_types==3 )
- error("\nThe --naive option works only for compressed files of the same type, all BCFs or all VCFs :-/\n");
+ error("\nThe --naive option works only for compressed files of the same type, all BCFs or all VCFs\n");
+ if ( args->explicit_output_type )
+ {
+ if ( output_type.format!=type.format )
+ error("\nThe --naive option works only for the output of the same type, all BCFs or all VCFs\n");
+ if ( output_type.compression!=type.compression )
+ error("\nThe --naive option works only for the output of the same compression type\n");
+ }
BGZF *fp = hts_get_bgzfp(hts_fp);
if ( !fp || bgzf_read_block(fp) != 0 || !fp->block_length )
fprintf(stderr, "Options:\n");
fprintf(stderr, " -a, --allow-overlaps First coordinate of the next file can precede last record of the current file.\n");
fprintf(stderr, " -c, --compact-PS Do not output PS tag at each site, only at the start of a new phase set block.\n");
- fprintf(stderr, " -d, --rm-dups <string> Output duplicate records present in multiple files only once: <snps|indels|both|all|exact>\n");
+ fprintf(stderr, " -d, --rm-dups STRING Output duplicate records present in multiple files only once: <snps|indels|both|all|exact>\n");
fprintf(stderr, " -D, --remove-duplicates Alias for -d exact\n");
- fprintf(stderr, " -f, --file-list <file> Read the list of files from a file.\n");
+ fprintf(stderr, " -f, --file-list FILE Read the list of files from a file.\n");
fprintf(stderr, " -l, --ligate Ligate phased VCFs by matching phase at overlapping haplotypes\n");
+ fprintf(stderr, " --ligate-force Ligate even non-overlapping chunks, keep all sites\n");
+ fprintf(stderr, " --ligate-warn Drop sites in imperfect overlaps\n");
fprintf(stderr, " --no-version Do not append version and command line to the header\n");
fprintf(stderr, " -n, --naive Concatenate files without recompression, a header check compatibility is performed\n");
fprintf(stderr, " --naive-force Same as --naive, but header compatibility is not checked. Dangerous, use with caution.\n");
- fprintf(stderr, " -o, --output <file> Write output to a file [standard output]\n");
- fprintf(stderr, " -O, --output-type <b|u|z|v> b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n");
- fprintf(stderr, " -q, --min-PQ <int> Break phase set if phasing quality is lower than <int> [30]\n");
- fprintf(stderr, " -r, --regions <region> Restrict to comma-separated list of regions\n");
- fprintf(stderr, " -R, --regions-file <file> Restrict to regions listed in a file\n");
- fprintf(stderr, " --threads <int> Use multithreading with <int> worker threads [0]\n");
- fprintf(stderr, " -v, --verbose <0|1> Set verbosity level [1]\n");
+ fprintf(stderr, " -o, --output FILE Write output to a file [standard output]\n");
+ fprintf(stderr, " -O, --output-type u|b|v|z[0-9] u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level [v]\n");
+ fprintf(stderr, " -q, --min-PQ INT Break phase set if phasing quality is lower than <int> [30]\n");
+ fprintf(stderr, " -r, --regions REGION Restrict to comma-separated list of regions\n");
+ fprintf(stderr, " -R, --regions-file FILE Restrict to regions listed in a file\n");
+ fprintf(stderr, " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n");
+ fprintf(stderr, " --threads INT Use multithreading with <int> worker threads [0]\n");
+ fprintf(stderr, " -v, --verbose 0|1 Set verbosity level [1]\n");
fprintf(stderr, "\n");
exit(1);
}
args->record_cmd_line = 1;
args->min_PQ = 30;
args->verbose = 1;
+ args->clevel = -1;
static struct option loptions[] =
{
{"compact-PS",no_argument,NULL,'c'},
{"regions",required_argument,NULL,'r'},
{"regions-file",required_argument,NULL,'R'},
+ {"regions-overlap",required_argument,NULL,12},
{"remove-duplicates",no_argument,NULL,'D'},
{"rm-dups",required_argument,NULL,'d'},
{"allow-overlaps",no_argument,NULL,'a'},
{"ligate",no_argument,NULL,'l'},
+ {"ligate-force",no_argument,NULL,10},
+ {"ligate-warn",no_argument,NULL,11},
{"output",required_argument,NULL,'o'},
{"output-type",required_argument,NULL,'O'},
{"threads",required_argument,NULL,9},
case 'f': args->file_list = optarg; break;
case 'o': args->output_fname = optarg; break;
case 'O':
+ args->explicit_output_type = 1;
switch (optarg[0]) {
case 'b': args->output_type = FT_BCF_GZ; break;
case 'u': args->output_type = FT_BCF; break;
case 'z': args->output_type = FT_VCF_GZ; break;
case 'v': args->output_type = FT_VCF; break;
- default: error("The output type \"%s\" not recognised\n", optarg);
+ default:
+ {
+ args->clevel = strtol(optarg,&tmp,10);
+ if ( *tmp || args->clevel<0 || args->clevel>9 ) error("The output type \"%s\" not recognised\n", optarg);
+ }
};
+ if ( optarg[1] )
+ {
+ args->clevel = strtol(optarg+1,&tmp,10);
+ if ( *tmp || args->clevel<0 || args->clevel>9 ) error("Could not parse argument: --compression-level %s\n", optarg+1);
+ }
break;
+ case 10 : args->ligate_force = 1; break;
+ case 11 : args->ligate_warn = 1; break;
case 9 : args->n_threads = strtol(optarg, 0, 0); break;
case 8 : args->record_cmd_line = 0; break;
case 7 : args->naive_concat = 1; args->naive_concat_trust_headers = 1; break;
+ case 12 :
+ if ( !strcasecmp(optarg,"0") ) args->regions_overlap = 0;
+ else if ( !strcasecmp(optarg,"1") ) args->regions_overlap = 1;
+ else if ( !strcasecmp(optarg,"2") ) args->regions_overlap = 2;
+ else error("Could not parse: --regions-overlap %s\n",optarg);
+ break;
case 'v':
args->verbose = strtol(optarg, 0, 0);
error("Error: currently only --verbose 0 or --verbose 1 is supported\n");
args->fnames[args->nfnames-1] = strdup(argv[optind]);
optind++;
}
+ if ( args->ligate_force && args->ligate_warn ) error("The options cannot be combined: --ligate-force and --ligate-warn\n");
if ( args->allow_overlaps && args->phased_concat ) error("The options -a and -l should not be combined. Please run with -l only.\n");
if ( args->compact_PS && !args->phased_concat ) error("The -c option is intended only with -l\n");
if ( args->file_list )
{
bcf_srs_t *files;
htsFile *out_fh;
- int output_type, n_threads, record_cmd_line;
+ int output_type, n_threads, record_cmd_line, clevel;
bcf_hdr_t *out_hdr;
int *seen_seq;
int *start_pos, start_tid, ifname;
int *swap_phase, nswap, *nmatch, *nmism;
bcf1_t **buf;
+ uint8_t *buf_mask;
int nbuf, mbuf, prev_chr, min_PQ, prev_pos_check;
int32_t *GTa, *GTb, mGTa, mGTb, *phase_qual, *phase_set;
char **argv, *output_fname, *file_list, **fnames, *remove_dups, *regions_list;
- int argc, nfnames, allow_overlaps, phased_concat, regions_is_file;
+ int argc, nfnames, allow_overlaps, phased_concat, regions_is_file, regions_overlap;
int compact_PS, phase_set_changed, naive_concat, naive_concat_trust_headers;
- int verbose;
+ int verbose, explicit_output_type, ligate_force, ligate_warn;
htsThreadPool *tpool;
}
args_t;
bcf_hdr_append(args->out_hdr,"##FORMAT=<ID=PS,Number=1,Type=Integer,Description=\"Phase Set\">");
}
if (args->record_cmd_line) bcf_hdr_append_version(args->out_hdr, args->argc, args->argv, "bcftools_concat");
- args->out_fh = hts_open(args->output_fname,hts_bcf_wmode2(args->output_type,args->output_fname));
+ char wmode[8];
+ set_wmode(wmode,args->output_type,args->output_fname,args->clevel);
+ args->out_fh = hts_open(args->output_fname ? args->output_fname : "-", wmode);
if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno));
if ( args->allow_overlaps || args->phased_concat )
{
{
if ( args->regions_list )
{
+ bcf_sr_set_opt(args->files,BCF_SR_REGIONS_OVERLAP,args->regions_overlap);
if ( bcf_sr_set_regions(args->files, args->regions_list, args->regions_is_file)<0 )
error("Failed to read the regions: %s\n", args->regions_list);
}
free(args->swap_phase);
for (i=0; i<args->mbuf; i++) bcf_destroy(args->buf[i]);
free(args->buf);
+ free(args->buf_mask);
free(args->GTa);
free(args->GTb);
free(args->nmatch);
int i, j, nsmpl = bcf_hdr_nsamples(args->out_hdr);
static int gt_absent_warned = 0;
-
for (i=0; i<args->nbuf; i+=2)
{
+ if ( args->buf_mask[i/2]!=3 ) continue;
+
bcf1_t *arec = args->buf[i];
bcf1_t *brec = args->buf[i+1];
}
for (i=0; i<args->nbuf/2; i+=2)
{
- bcf1_t *arec = args->buf[i];
- bcf_translate(args->out_hdr, args->files->readers[0].header, arec);
- if ( args->nswap )
- phase_update(args, args->out_hdr, arec);
+ bcf1_t *rec;
+ bcf_hdr_t *hdr;
+ int mask = args->buf_mask[i/2];
+ if ( mask & 1 ) { rec = args->buf[i]; hdr = args->files->readers[0].header; }
+ else { rec = args->buf[i+1]; hdr = args->files->readers[1].header; }
+ bcf_translate(args->out_hdr, hdr, rec);
+ if ( args->nswap && (mask&1) )
+ phase_update(args, args->out_hdr, rec);
if ( !args->compact_PS || args->phase_set_changed )
{
- bcf_update_format_int32(args->out_hdr,arec,"PS",args->phase_set,nsmpl);
+ bcf_update_format_int32(args->out_hdr,rec,"PS",args->phase_set,nsmpl);
args->phase_set_changed = 0;
}
- if ( bcf_write(args->out_fh, args->out_hdr, arec)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname);
+ if ( bcf_write(args->out_fh, args->out_hdr, rec)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname);
- if ( arec->pos < args->prev_pos_check ) error("FIXME, disorder: %s:%"PRId64" vs %d [1]\n", bcf_seqname(args->files->readers[0].header,arec),(int64_t) arec->pos+1,args->prev_pos_check+1);
- args->prev_pos_check = arec->pos;
+ if ( rec->pos < args->prev_pos_check ) error("FIXME, disorder: %s:%"PRId64" vs %d [1]\n", bcf_seqname(hdr,rec),(int64_t)rec->pos+1,args->prev_pos_check+1);
+ args->prev_pos_check = rec->pos;
}
args->nswap = 0;
for (j=0; j<nsmpl; j++)
int PQ_printed = 0;
for (; i<args->nbuf; i+=2)
{
- bcf1_t *brec = args->buf[i+1];
- bcf_translate(args->out_hdr, args->files->readers[1].header, brec);
- if ( !PQ_printed )
+ bcf1_t *rec;
+ bcf_hdr_t *hdr;
+ int mask = args->buf_mask[i/2];
+ if ( mask & 2 ) { rec = args->buf[i+1]; hdr = args->files->readers[1].header; }
+ else { rec = args->buf[i]; hdr = args->files->readers[0].header; }
+ bcf_translate(args->out_hdr, hdr, rec);
+ if ( !PQ_printed && mask==3 )
{
- bcf_update_format_int32(args->out_hdr,brec,"PQ",args->phase_qual,nsmpl);
+ bcf_update_format_int32(args->out_hdr,rec,"PQ",args->phase_qual,nsmpl);
PQ_printed = 1;
for (j=0; j<nsmpl; j++)
if ( args->phase_qual[j] < args->min_PQ )
{
- args->phase_set[j] = brec->pos+1;
+ args->phase_set[j] = rec->pos+1;
args->phase_set_changed = 1;
}
else if ( args->compact_PS ) args->phase_set[j] = bcf_int32_missing;
}
if ( args->nswap )
- phase_update(args, args->out_hdr, brec);
+ phase_update(args, args->out_hdr, rec);
if ( !args->compact_PS || args->phase_set_changed )
{
- bcf_update_format_int32(args->out_hdr,brec,"PS",args->phase_set,nsmpl);
+ bcf_update_format_int32(args->out_hdr,rec,"PS",args->phase_set,nsmpl);
args->phase_set_changed = 0;
}
- if ( bcf_write(args->out_fh, args->out_hdr, brec)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname);
+ if ( bcf_write(args->out_fh, args->out_hdr, rec)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname);
- if ( brec->pos < args->prev_pos_check ) error("FIXME, disorder: %s:%"PRId64" vs %d [2]\n", bcf_seqname(args->files->readers[1].header,brec),(int64_t) brec->pos+1,args->prev_pos_check+1);
- args->prev_pos_check = brec->pos;
+ if ( rec->pos < args->prev_pos_check ) error("FIXME, disorder: %s:%"PRId64" vs %d [2]\n", bcf_seqname(hdr,rec),(int64_t)rec->pos+1,args->prev_pos_check+1);
+ args->prev_pos_check = rec->pos;
}
args->nbuf = 0;
}
-static void phased_push(args_t *args, bcf1_t *arec, bcf1_t *brec)
+static void phased_push(args_t *args, bcf1_t *arec, bcf1_t *brec, int is_overlap)  // arec/brec: records from readers 0/1, either may be NULL; is_overlap==0 writes arec at once, otherwise the pair is buffered
{
+ bcf_hdr_t *ahdr = arec ? bcf_sr_get_header(args->files,0) : NULL;  // header is looked up only when the reader-0 record exists
+ bcf_hdr_t *bhdr = brec ? bcf_sr_get_header(args->files,1) : NULL;  // likewise for the reader-1 record
+
if ( arec && arec->errcode )
- error("Parse error at %s:%"PRId64", cannot proceed: %s\n", bcf_seqname(args->files->readers[0].header,arec),(int64_t) arec->pos+1, args->files->readers[0].fname);
+ error("Parse error at %s:%"PRId64", cannot proceed: %s\n", bcf_seqname(ahdr,arec),(int64_t) arec->pos+1, args->files->readers[0].fname);
if ( brec && brec->errcode )
- error("Parse error at %s:%"PRId64", cannot proceed: %s\n", bcf_seqname(args->files->readers[1].header,brec),(int64_t) brec->pos+1, args->files->readers[1].fname);
+ error("Parse error at %s:%"PRId64", cannot proceed: %s\n", bcf_seqname(bhdr,brec),(int64_t) brec->pos+1, args->files->readers[1].fname);
int i, nsmpl = bcf_hdr_nsamples(args->out_hdr);
- int chr_id = bcf_hdr_name2id(args->out_hdr, bcf_seqname(args->files->readers[0].header,arec));
+ int chr_id = arec ? bcf_hdr_name2id(args->out_hdr,bcf_seqname(ahdr,arec)) : bcf_hdr_name2id(args->out_hdr,bcf_seqname(bhdr,brec));  // sequence id in the output header, taken from arec when present, else from brec
if ( args->prev_chr<0 || args->prev_chr!=chr_id )
{
if ( args->prev_chr>=0 ) phased_flush(args);
for (i=0; i<nsmpl; i++)
- args->phase_set[i] = arec->pos+1;
+ args->phase_set[i] = arec ? arec->pos+1 : brec->pos+1;  // new sequence: phase set starts at the 1-based position of whichever record is present
args->phase_set_changed = 1;
- if ( args->seen_seq[chr_id] ) error("The chromosome block %s is not contiguous\n", bcf_seqname(args->files->readers[0].header,arec));
+ if ( args->seen_seq[chr_id] ) error("The chromosome block %s is not contiguous\n", arec ? bcf_seqname(ahdr,arec) : bcf_seqname(bhdr,brec));
args->seen_seq[chr_id] = 1;
args->prev_chr = chr_id;
args->prev_pos_check = -1;
}
- if ( !brec )
+ if ( !is_overlap )  // caller says we are outside an overlap: write arec directly and return without buffering
{
- bcf_translate(args->out_hdr, args->files->readers[0].header, arec);
+ assert(arec);  // everything below in this branch dereferences arec unconditionally
+
+ bcf_translate(args->out_hdr, ahdr, arec);
if ( args->nswap )
phase_update(args, args->out_hdr, arec);
if ( !args->compact_PS || args->phase_set_changed )
if ( bcf_write(args->out_fh, args->out_hdr, arec)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname);
if ( arec->pos < args->prev_pos_check )
- error("FIXME, disorder: %s:%"PRId64" in %s vs %d written [3]\n", bcf_seqname(args->files->readers[0].header,arec), (int64_t) arec->pos+1,args->files->readers[0].fname, args->prev_pos_check+1);
+ error("FIXME, disorder: %s:%"PRId64" in %s vs %d written [3]\n", bcf_seqname(ahdr,arec), (int64_t) arec->pos+1,args->files->readers[0].fname, args->prev_pos_check+1);
args->prev_pos_check = arec->pos;
return;
}
int m = args->mbuf;
args->nbuf += 2;
hts_expand(bcf1_t*,args->nbuf,args->mbuf,args->buf);
+ if ( m < args->mbuf ) args->buf_mask = (uint8_t*)realloc(args->buf_mask,sizeof(*args->buf_mask)*args->mbuf);  // mbuf grew across hts_expand, resize buf_mask to match; NOTE(review): realloc result is not checked
for (i=m; i<args->mbuf; i++)
args->buf[i] = bcf_init1();
- SWAP(bcf1_t*, args->files->readers[0].buffer[0], args->buf[args->nbuf-2]);
- SWAP(bcf1_t*, args->files->readers[1].buffer[0], args->buf[args->nbuf-1]);
+ if ( arec ) SWAP(bcf1_t*, args->files->readers[0].buffer[0], args->buf[args->nbuf-2]);  // even slot of the pair takes the reader-0 record; left untouched when arec is NULL
+ if ( brec ) SWAP(bcf1_t*, args->files->readers[1].buffer[0], args->buf[args->nbuf-1]);  // odd slot of the pair takes the reader-1 record; left untouched when brec is NULL
+ args->buf_mask[args->nbuf/2-1] = (arec?1:0) | (brec?2:0);  // one mask per buffered pair: bit value 1 = reader-0 record present, 2 = reader-1 record present
+}
+
+static int _get_active_index(bcf_srs_t *sr)  // returns the lowest reader index for which bcf_sr_has_line() is true, or -1 if there is none
+{
+ int i;
+ for (i=0; i<sr->nreaders; i++)  // readers are scanned in ascending index order
+ if ( bcf_sr_has_line(sr,i) ) return i;
+ return -1;  // NOTE(review): callers in concat() pass the result straight to bcf_sr_get_line()/bcf_sr_get_header(); confirm -1 cannot reach them
}
static void concat(args_t *args)
else if ( new_file )
bcf_sr_seek(args->files,NULL,0); // set to start
- int nret;
+ int nret, ir;
while ( (nret = bcf_sr_next_line(args->files)) )
{
+ int is_overlap = args->files->nreaders==1 ? 0 : 1;
if ( !bcf_sr_has_line(args->files,0) ) // no input from the first reader
{
// We are assuming that there is a perfect overlap, sites which are not present in both files are dropped
- if ( ! bcf_sr_region_done(args->files,0) )
+ if ( bcf_sr_region_done(args->files,0) )
+ {
+ phased_flush(args);
+ bcf_sr_remove_reader(args->files, 0);
+ is_overlap = 0;
+ }
+ else if ( args->ligate_warn )
{
if ( !site_drop_warned )
{
+ ir = _get_active_index(args->files);
fprintf(bcftools_stderr,
"Warning: Dropping the site %s:%"PRId64". The --ligate option is intended for VCFs with perfect\n"
" overlap, sites in overlapping regions present in one but missing in other are dropped.\n"
" This warning is printed only once.\n",
- bcf_seqname(bcf_sr_get_header(args->files,1),bcf_sr_get_line(args->files,1)), (int64_t) bcf_sr_get_line(args->files,1)->pos+1
- );
+ bcf_seqname(bcf_sr_get_header(args->files,ir),bcf_sr_get_line(args->files,ir)), (int64_t) bcf_sr_get_line(args->files,ir)->pos+1);
site_drop_warned = 1;
}
continue;
}
- phased_flush(args);
- bcf_sr_remove_reader(args->files, 0);
+ else if ( !args->ligate_force )
+ {
+ ir = _get_active_index(args->files);
+ error("Error: The --ligate option is intended for VCFs with perfect overlap, the site %s:%"PRId64" breaks the assumption\n",
+ bcf_seqname(bcf_sr_get_header(args->files,ir),bcf_sr_get_line(args->files,ir)), (int64_t) bcf_sr_get_line(args->files,ir)->pos+1);
+ }
}
// Get a line to learn about current position
- for (i=0; i<args->files->nreaders; i++)
- if ( bcf_sr_has_line(args->files,i) ) break;
- bcf1_t *line = bcf_sr_get_line(args->files,i);
+ ir = _get_active_index(args->files);
+ bcf1_t *line = bcf_sr_get_line(args->files,ir);
// This can happen after bcf_sr_seek: indel may start before the coordinate which we seek to.
- if ( seek_chr>=0 && seek_pos>line->pos && seek_chr==bcf_hdr_name2id(args->out_hdr, bcf_seqname(args->files->readers[i].header,line)) ) continue;
+ if ( seek_chr>=0 && seek_pos>line->pos && seek_chr==bcf_hdr_name2id(args->out_hdr, bcf_seqname(args->files->readers[ir].header,line)) ) continue;
seek_pos = seek_chr = -1;
// Check if the position overlaps with the next, yet unopened, reader
}
if ( must_seek )
{
- bcf_sr_seek(args->files, bcf_seqname(args->files->readers[i].header,line), line->pos);
+ bcf_sr_seek(args->files, bcf_seqname(args->files->readers[ir].header,line), line->pos);
seek_pos = line->pos;
- seek_chr = bcf_hdr_name2id(args->out_hdr, bcf_seqname(args->files->readers[i].header,line));
+ seek_chr = bcf_hdr_name2id(args->out_hdr, bcf_seqname(args->files->readers[ir].header,line));
continue;
}
// We are assuming that there is a perfect overlap, sites which are not present in both files are dropped
- if ( args->files->nreaders>1 && !bcf_sr_has_line(args->files,1) && !bcf_sr_region_done(args->files,1) ) continue;
+ if ( args->files->nreaders>1 && !bcf_sr_has_line(args->files,1) && !bcf_sr_region_done(args->files,1) && !args->ligate_force )
+ {
+ if ( args->ligate_warn && !site_drop_warned )
+ {
+ ir = _get_active_index(args->files);
+ fprintf(bcftools_stderr,
+ "Warning: Dropping the site %s:%"PRId64". The --ligate option is intended for VCFs with perfect\n"
+ " overlap, sites in overlapping regions present in one but missing in other are dropped.\n"
+ " This warning is printed only once.\n",
+ bcf_seqname(bcf_sr_get_header(args->files,ir),line), (int64_t) line->pos+1);
+ site_drop_warned = 1;
+ }
+ else if ( !args->ligate_warn )
+ {
+ ir = _get_active_index(args->files);
+ error("Error: The --ligate option is intended for VCFs with perfect overlap, the site %s:%"PRId64" breaks the assumption\n",
+ bcf_seqname(bcf_sr_get_header(args->files,ir),bcf_sr_get_line(args->files,ir)), (int64_t) bcf_sr_get_line(args->files,ir)->pos+1);
+ }
+ continue;
+ }
- phased_push(args, bcf_sr_get_line(args->files,0), args->files->nreaders>1 ? bcf_sr_get_line(args->files,1) : NULL);
+ bcf1_t *line0 = bcf_sr_get_line(args->files,0);
+ bcf1_t *line1 = args->files->nreaders > 1 ? bcf_sr_get_line(args->files,1) : NULL;
+ phased_push(args, line0, line1, is_overlap);
}
if ( args->files->nreaders )
// if BCF, check if tag IDs are consistent in the dictionary of strings
if ( type.compression!=bgzf )
error("The --naive option works only for compressed BCFs or VCFs, sorry :-/\n");
- if ( type.format==vcf )
- {
- bcf_hdr_destroy(hdr);
- continue;
- }
_check_hrecs(hdr0,hdr,args->fnames[0],args->fnames[i]);
_check_hrecs(hdr,hdr0,args->fnames[i],args->fnames[0]);
// only compressed BCF atm
BGZF *bgzf_out = bgzf_open(args->output_fname,"w");;
+ htsFormat output_type;
+ output_type.format = (args->output_type & FT_VCF) ? vcf : bcf;
+ output_type.compression = (args->output_type & FT_GZ) ? bgzf : no_compression;
+
struct timeval t0, t1;
const size_t page_size = BGZF_MAX_BLOCK_SIZE;
uint8_t *buf = (uint8_t*) malloc(page_size);
htsFormat type = *hts_get_format(hts_fp);
if ( type.compression!=bgzf )
- error("\nThe --naive option works only for compressed BCFs or VCFs, sorry :-/\n");
+ error("\nThe --naive option works only for compressed BCFs or VCFs\n");
file_types |= type.format==vcf ? 1 : 2;
if ( file_types==3 )
- error("\nThe --naive option works only for compressed files of the same type, all BCFs or all VCFs :-/\n");
+ error("\nThe --naive option works only for compressed files of the same type, all BCFs or all VCFs\n");
+ if ( args->explicit_output_type )
+ {
+ if ( output_type.format!=type.format )
+ error("\nThe --naive option works only for the output of the same type, all BCFs or all VCFs\n");
+ if ( output_type.compression!=type.compression )
+ error("\nThe --naive option works only for the output of the same compression type\n");
+ }
BGZF *fp = hts_get_bgzfp(hts_fp);
if ( !fp || bgzf_read_block(fp) != 0 || !fp->block_length )
fprintf(bcftools_stderr, "Options:\n");
fprintf(bcftools_stderr, " -a, --allow-overlaps First coordinate of the next file can precede last record of the current file.\n");
fprintf(bcftools_stderr, " -c, --compact-PS Do not output PS tag at each site, only at the start of a new phase set block.\n");
- fprintf(bcftools_stderr, " -d, --rm-dups <string> Output duplicate records present in multiple files only once: <snps|indels|both|all|exact>\n");
+ fprintf(bcftools_stderr, " -d, --rm-dups STRING Output duplicate records present in multiple files only once: <snps|indels|both|all|exact>\n");
fprintf(bcftools_stderr, " -D, --remove-duplicates Alias for -d exact\n");
- fprintf(bcftools_stderr, " -f, --file-list <file> Read the list of files from a file.\n");
+ fprintf(bcftools_stderr, " -f, --file-list FILE Read the list of files from a file.\n");
fprintf(bcftools_stderr, " -l, --ligate Ligate phased VCFs by matching phase at overlapping haplotypes\n");
+ fprintf(bcftools_stderr, " --ligate-force Ligate even non-overlapping chunks, keep all sites\n");
+ fprintf(bcftools_stderr, " --ligate-warn Drop sites in imperfect overlaps\n");
fprintf(bcftools_stderr, " --no-version Do not append version and command line to the header\n");
fprintf(bcftools_stderr, " -n, --naive Concatenate files without recompression, a header check compatibility is performed\n");
fprintf(bcftools_stderr, " --naive-force Same as --naive, but header compatibility is not checked. Dangerous, use with caution.\n");
- fprintf(bcftools_stderr, " -o, --output <file> Write output to a file [standard output]\n");
- fprintf(bcftools_stderr, " -O, --output-type <b|u|z|v> b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n");
- fprintf(bcftools_stderr, " -q, --min-PQ <int> Break phase set if phasing quality is lower than <int> [30]\n");
- fprintf(bcftools_stderr, " -r, --regions <region> Restrict to comma-separated list of regions\n");
- fprintf(bcftools_stderr, " -R, --regions-file <file> Restrict to regions listed in a file\n");
- fprintf(bcftools_stderr, " --threads <int> Use multithreading with <int> worker threads [0]\n");
- fprintf(bcftools_stderr, " -v, --verbose <0|1> Set verbosity level [1]\n");
+ fprintf(bcftools_stderr, " -o, --output FILE Write output to a file [standard output]\n");
+ fprintf(bcftools_stderr, " -O, --output-type u|b|v|z[0-9] u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level [v]\n");
+ fprintf(bcftools_stderr, " -q, --min-PQ INT Break phase set if phasing quality is lower than INT [30]\n");
+ fprintf(bcftools_stderr, " -r, --regions REGION Restrict to comma-separated list of regions\n");
+ fprintf(bcftools_stderr, " -R, --regions-file FILE Restrict to regions listed in a file\n");
+ fprintf(bcftools_stderr, " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n");
+ fprintf(bcftools_stderr, " --threads INT Use multithreading with INT worker threads [0]\n");
+ fprintf(bcftools_stderr, " -v, --verbose 0|1 Set verbosity level [1]\n");
fprintf(bcftools_stderr, "\n");
bcftools_exit(1);
}
args->record_cmd_line = 1;
args->min_PQ = 30;
args->verbose = 1;
+ args->clevel = -1;
static struct option loptions[] =
{
{"compact-PS",no_argument,NULL,'c'},
{"regions",required_argument,NULL,'r'},
{"regions-file",required_argument,NULL,'R'},
+ {"regions-overlap",required_argument,NULL,12},
{"remove-duplicates",no_argument,NULL,'D'},
{"rm-dups",required_argument,NULL,'d'},
{"allow-overlaps",no_argument,NULL,'a'},
{"ligate",no_argument,NULL,'l'},
+ {"ligate-force",no_argument,NULL,10},
+ {"ligate-warn",no_argument,NULL,11},
{"output",required_argument,NULL,'o'},
{"output-type",required_argument,NULL,'O'},
{"threads",required_argument,NULL,9},
case 'f': args->file_list = optarg; break;
case 'o': args->output_fname = optarg; break;
case 'O':
+ args->explicit_output_type = 1;
switch (optarg[0]) {
case 'b': args->output_type = FT_BCF_GZ; break;
case 'u': args->output_type = FT_BCF; break;
case 'z': args->output_type = FT_VCF_GZ; break;
case 'v': args->output_type = FT_VCF; break;
- default: error("The output type \"%s\" not recognised\n", optarg);
+ default:
+ {
+ // A bare digit selects the compression level only. strtol() leaves tmp==optarg when
+ // no digits were consumed, so an empty argument must be rejected here: otherwise it
+ // would pass as level 0 and the optarg[1] test that follows would read past the string.
+ args->clevel = strtol(optarg,&tmp,10);
+ if ( tmp==optarg || *tmp || args->clevel<0 || args->clevel>9 ) error("The output type \"%s\" not recognised\n", optarg);
+ }
};
+ if ( optarg[1] )
+ {
+ args->clevel = strtol(optarg+1,&tmp,10);
+ if ( *tmp || args->clevel<0 || args->clevel>9 ) error("Could not parse argument: --compression-level %s\n", optarg+1);
+ }
break;
+ case 10 : args->ligate_force = 1; break;
+ case 11 : args->ligate_warn = 1; break;
case 9 : args->n_threads = strtol(optarg, 0, 0); break;
case 8 : args->record_cmd_line = 0; break;
case 7 : args->naive_concat = 1; args->naive_concat_trust_headers = 1; break;
+ case 12 :
+ if ( !strcasecmp(optarg,"0") ) args->regions_overlap = 0;
+ else if ( !strcasecmp(optarg,"1") ) args->regions_overlap = 1;
+ else if ( !strcasecmp(optarg,"2") ) args->regions_overlap = 2;
+ else error("Could not parse: --regions-overlap %s\n",optarg);
+ break;
case 'v':
args->verbose = strtol(optarg, 0, 0);
error("Error: currently only --verbose 0 or --verbose 1 is supported\n");
args->fnames[args->nfnames-1] = strdup(argv[optind]);
optind++;
}
+ if ( args->ligate_force && args->ligate_warn ) error("The options cannot be combined: --ligate-force and --ligate-warn\n");
if ( args->allow_overlaps && args->phased_concat ) error("The options -a and -l should not be combined. Please run with -l only.\n");
if ( args->compact_PS && !args->phased_concat ) error("The -c option is intended only with -l\n");
if ( args->file_list )
float *flt;
int rev_als, output_vcf_ids, hap2dip, output_chrom_first_col;
int nsamples, *samples, sample_is_file, targets_is_file, regions_is_file, output_type;
+ int regions_overlap, targets_overlap;
char **argv, *sample_list, *targets_list, *regions_list, *tag, *columns;
char *outfname, *infname, *ref_fname, *sex_fname;
- int argc, n_threads, record_cmd_line, keep_duplicates;
+ int argc, n_threads, record_cmd_line, keep_duplicates, clevel;
};
static void destroy_data(args_t *args)
if ( args->regions_list )
{
+ bcf_sr_set_opt(args->files,BCF_SR_REGIONS_OVERLAP,args->regions_overlap);
if ( bcf_sr_set_regions(args->files, args->regions_list, args->regions_is_file)<0 )
error("Failed to read the regions: %s\n", args->regions_list);
}
if ( args->targets_list )
{
+ bcf_sr_set_opt(args->files,BCF_SR_TARGETS_OVERLAP,args->targets_overlap);
if ( bcf_sr_set_targets(args->files, args->targets_list, args->targets_is_file, 0)<0 )
error("Failed to read the targets: %s\n", args->targets_list);
}
for (i=0; i<nsamples; i++) free(samples[i]);
free(samples);
- htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode2(args->output_type,args->outfname));
+ char wmode[8];
+ set_wmode(wmode,args->output_type,args->outfname,args->clevel);
+ htsFile *out_fh = hts_open(args->outfname ? args->outfname : "-", wmode);
if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno));
if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads);
if ( bcf_hdr_write(out_fh,args->header)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->outfname);
for (i=0; i<nrows; i++) free(samples[i]);
free(samples);
- htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode2(args->output_type,args->outfname));
+ char wmode[8];
+ set_wmode(wmode,args->output_type,args->outfname,args->clevel);
+ htsFile *out_fh = hts_open(args->outfname ? args->outfname : "-", wmode);
if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno));
if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads);
if ( bcf_hdr_write(out_fh,args->header)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->outfname);
for (i=0; i<nsamples; i++) free(samples[i]);
free(samples);
- htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode2(args->output_type,args->outfname));
+ char wmode[8];
+ set_wmode(wmode,args->output_type,args->outfname,args->clevel);
+ htsFile *out_fh = hts_open(args->outfname ? args->outfname : "-", wmode);
if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno));
if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads);
if ( bcf_hdr_write(out_fh,args->header)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname);
bcf_hdr_add_sample(args->header, NULL);
args->gts = (int32_t *) malloc(sizeof(int32_t)*n*2);
- htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode2(args->output_type,args->outfname));
+ char wmode[8];
+ set_wmode(wmode,args->output_type,args->outfname,args->clevel);
+ htsFile *out_fh = hts_open(args->outfname ? args->outfname : "-", wmode);
if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno));
if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads);
if ( bcf_hdr_write(out_fh,args->header)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname);
static void vcf_to_vcf(args_t *args)
{
open_vcf(args,NULL);
- htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode2(args->output_type,args->outfname));
+ char wmode[8];
+ set_wmode(wmode,args->output_type,args->outfname,args->clevel);
+ htsFile *out_fh = hts_open(args->outfname ? args->outfname : "-", wmode);
if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno));
if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads);
if ( !args->ref ) error("Could not load the fai index for reference %s\n", args->ref_fname);
open_vcf(args,NULL);
- htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode2(args->output_type,args->outfname));
+ char wmode[8];
+ set_wmode(wmode,args->output_type,args->outfname,args->clevel);
+ htsFile *out_fh = hts_open(args->outfname ? args->outfname : "-", wmode);
if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno));
if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads);
fprintf(stderr, "\n");
fprintf(stderr, "About: Converts VCF/BCF to other formats and back. See man page for file\n");
fprintf(stderr, " formats details. When specifying output files explicitly instead\n");
- fprintf(stderr, " of with <prefix>, one can use '-' for stdout and '.' to suppress.\n");
- fprintf(stderr, "Usage: bcftools convert [OPTIONS] <input_file>\n");
+ fprintf(stderr, " of with PREFIX, one can use '-' for stdout and '.' to suppress.\n");
+ fprintf(stderr, "Usage: bcftools convert [OPTIONS] INPUT_FILE\n");
fprintf(stderr, "\n");
fprintf(stderr, "VCF input options:\n");
- fprintf(stderr, " -e, --exclude <expr> exclude sites for which the expression is true\n");
- fprintf(stderr, " -i, --include <expr> select sites for which the expression is true\n");
- fprintf(stderr, " -r, --regions <region> restrict to comma-separated list of regions\n");
- fprintf(stderr, " -R, --regions-file <file> restrict to regions listed in a file\n");
- fprintf(stderr, " -s, --samples <list> list of samples to include\n");
- fprintf(stderr, " -S, --samples-file <file> file of samples to include\n");
- fprintf(stderr, " -t, --targets <region> similar to -r but streams rather than index-jumps\n");
- fprintf(stderr, " -T, --targets-file <file> similar to -R but streams rather than index-jumps\n");
+ fprintf(stderr, " -e, --exclude EXPR Exclude sites for which the expression is true\n");
+ fprintf(stderr, " -i, --include EXPR Select sites for which the expression is true\n");
+ fprintf(stderr, " -r, --regions REGION Restrict to comma-separated list of regions\n");
+ fprintf(stderr, " -R, --regions-file FILE Restrict to regions listed in a file\n");
+ fprintf(stderr, " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n");
+ fprintf(stderr, " -s, --samples LIST List of samples to include\n");
+ fprintf(stderr, " -S, --samples-file FILE File of samples to include\n");
+ fprintf(stderr, " -t, --targets REGION Similar to -r but streams rather than index-jumps\n");
+ fprintf(stderr, " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n");
+ fprintf(stderr, " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n");
fprintf(stderr, "\n");
fprintf(stderr, "VCF output options:\n");
- fprintf(stderr, " --no-version do not append version and command line to the header\n");
- fprintf(stderr, " -o, --output <file> output file name [stdout]\n");
- fprintf(stderr, " -O, --output-type <b|u|z|v> b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n");
- fprintf(stderr, " --threads <int> use multithreading with <int> worker threads [0]\n");
+ fprintf(stderr, " --no-version Do not append version and command line to the header\n");
+ fprintf(stderr, " -o, --output FILE Output file name [stdout]\n");
+ fprintf(stderr, " -O, --output-type u|b|v|z[0-9] u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level [v]\n");
+ fprintf(stderr, " --threads INT Use multithreading with INT worker threads [0]\n");
fprintf(stderr, "\n");
fprintf(stderr, "GEN/SAMPLE conversion (input/output from IMPUTE2):\n");
- fprintf(stderr, " -G, --gensample2vcf <...> <prefix>|<gen-file>,<sample-file>\n");
- fprintf(stderr, " -g, --gensample <...> <prefix>|<gen-file>,<sample-file>\n");
- fprintf(stderr, " --tag <string> tag to take values for .gen file: GT,PL,GL,GP [GT]\n");
- fprintf(stderr, " --chrom output chromosome in first column instead of CHROM:POS_REF_ALT\n");
- fprintf(stderr, " --keep-duplicates keep duplicate positions\n");
- fprintf(stderr, " --sex <file> output sex column in the sample-file, input format is: Sample\\t[MF]\n");
- fprintf(stderr, " --vcf-ids output VCF IDs in second column instead of CHROM:POS_REF_ALT\n");
+ fprintf(stderr, " -G, --gensample2vcf ... <PREFIX>|<GEN-FILE>,<SAMPLE-FILE>\n");
+ fprintf(stderr, " -g, --gensample ... <PREFIX>|<GEN-FILE>,<SAMPLE-FILE>\n");
+ fprintf(stderr, " --tag STRING Tag to take values for .gen file: GT,PL,GL,GP [GT]\n");
+ fprintf(stderr, " --chrom Output chromosome in first column instead of CHROM:POS_REF_ALT\n");
+ fprintf(stderr, " --keep-duplicates Keep duplicate positions\n");
+ fprintf(stderr, " --sex FILE Output sex column in the sample-file, input format is: Sample\\t[MF]\n");
+ fprintf(stderr, " --vcf-ids Output VCF IDs in second column instead of CHROM:POS_REF_ALT\n");
fprintf(stderr, "\n");
fprintf(stderr, "gVCF conversion:\n");
- fprintf(stderr, " --gvcf2vcf expand gVCF reference blocks\n");
- fprintf(stderr, " -f, --fasta-ref <file> reference sequence in fasta format\n");
+ fprintf(stderr, " --gvcf2vcf Expand gVCF reference blocks\n");
+ fprintf(stderr, " -f, --fasta-ref FILE Reference sequence in fasta format\n");
fprintf(stderr, "\n");
fprintf(stderr, "HAP/SAMPLE conversion (output from SHAPEIT):\n");
- fprintf(stderr, " --hapsample2vcf <...> <prefix>|<hap-file>,<sample-file>\n");
- fprintf(stderr, " --hapsample <...> <prefix>|<hap-file>,<sample-file>\n");
- fprintf(stderr, " --haploid2diploid convert haploid genotypes to diploid homozygotes\n");
- fprintf(stderr, " --sex <file> output sex column in the sample-file, input format is: Sample\\t[MF]\n");
- fprintf(stderr, " --vcf-ids output VCF IDs instead of CHROM:POS_REF_ALT\n");
+ fprintf(stderr, " --hapsample2vcf ... <PREFIX>|<HAP-FILE>,<SAMPLE-FILE>\n");
+ fprintf(stderr, " --hapsample ... <PREFIX>|<HAP-FILE>,<SAMPLE-FILE>\n");
+ fprintf(stderr, " --haploid2diploid Convert haploid genotypes to diploid homozygotes\n");
+ fprintf(stderr, " --sex FILE Output sex column in the sample-file, input format is: Sample\\t[MF]\n");
+ fprintf(stderr, " --vcf-ids Output VCF IDs instead of CHROM:POS_REF_ALT\n");
fprintf(stderr, "\n");
fprintf(stderr, "HAP/LEGEND/SAMPLE conversion:\n");
- fprintf(stderr, " -H, --haplegendsample2vcf <...> <prefix>|<hap-file>,<legend-file>,<sample-file>\n");
- fprintf(stderr, " -h, --haplegendsample <...> <prefix>|<hap-file>,<legend-file>,<sample-file>\n");
- fprintf(stderr, " --haploid2diploid convert haploid genotypes to diploid homozygotes\n");
- fprintf(stderr, " --sex <file> output sex column in the sample-file, input format is: Sample\\t[MF]\n");
- fprintf(stderr, " --vcf-ids output VCF IDs instead of CHROM:POS_REF_ALT\n");
+ fprintf(stderr, " -H, --haplegendsample2vcf ... <PREFIX>|<HAP-FILE>,<LEGEND-FILE>,<SAMPLE-FILE>\n");
+ fprintf(stderr, " -h, --haplegendsample ... <PREFIX>|<HAP-FILE>,<LEGEND-FILE>,<SAMPLE-FILE>\n");
+ fprintf(stderr, " --haploid2diploid Convert haploid genotypes to diploid homozygotes\n");
+ fprintf(stderr, " --sex FILE Output sex column in the sample-file, input format is: Sample\\t[MF]\n");
+ fprintf(stderr, " --vcf-ids Output VCF IDs instead of CHROM:POS_REF_ALT\n");
fprintf(stderr, "\n");
fprintf(stderr, "TSV conversion:\n");
- fprintf(stderr, " --tsv2vcf <file> \n");
- fprintf(stderr, " -c, --columns <string> columns of the input tsv file [ID,CHROM,POS,AA]\n");
- fprintf(stderr, " -f, --fasta-ref <file> reference sequence in fasta format\n");
- fprintf(stderr, " -s, --samples <list> list of sample names\n");
- fprintf(stderr, " -S, --samples-file <file> file of sample names\n");
+ fprintf(stderr, " --tsv2vcf FILE\n");
+ fprintf(stderr, " -c, --columns STRING Columns of the input tsv file [ID,CHROM,POS,AA]\n");
+ fprintf(stderr, " -f, --fasta-ref FILE Reference sequence in fasta format\n");
+ fprintf(stderr, " -s, --samples LIST List of sample names\n");
+ fprintf(stderr, " -S, --samples-file FILE File of sample names\n");
fprintf(stderr, "\n");
// fprintf(stderr, "PLINK options:\n");
// fprintf(stderr, " -p, --plink <prefix>|<ped>,<map>,<fam>|<bed>,<bim>,<fam>|<tped>,<tfam>\n");
args->output_type = FT_VCF;
args->n_threads = 0;
args->record_cmd_line = 1;
+ args->regions_overlap = 1;
+ args->targets_overlap = 0;
+ args->clevel = -1;
static struct option loptions[] =
{
{"threads",required_argument,NULL,9},
{"regions",required_argument,NULL,'r'},
{"regions-file",required_argument,NULL,'R'},
+ {"regions-overlap",required_argument,NULL,13},
{"targets",required_argument,NULL,'t'},
{"targets-file",required_argument,NULL,'T'},
+ {"targets-overlap",required_argument,NULL,14},
{"samples",required_argument,NULL,'s'},
{"samples-file",required_argument,NULL,'S'},
{"sex",required_argument,NULL,11},
{"keep-duplicates",no_argument,NULL,12},
{NULL,0,NULL,0}
};
+ char *tmp;
while ((c = getopt_long(argc, argv, "?h:r:R:s:S:t:T:i:e:g:G:o:O:c:f:H:",loptions,NULL)) >= 0) {
switch (c) {
case 'e':
case 'u': args->output_type = FT_BCF; break;
case 'z': args->output_type = FT_VCF_GZ; break;
case 'v': args->output_type = FT_VCF; break;
- default: error("The output type \"%s\" not recognised\n", optarg);
+ default:
+ {
+ // A bare digit selects the compression level only. strtol() leaves tmp==optarg when
+ // no digits were consumed, so an empty argument must be rejected here: otherwise it
+ // would pass as level 0 and the optarg[1] test that follows would read past the string.
+ args->clevel = strtol(optarg,&tmp,10);
+ if ( tmp==optarg || *tmp || args->clevel<0 || args->clevel>9 ) error("The output type \"%s\" not recognised\n", optarg);
+ }
+ }
+ if ( optarg[1] )
+ {
+ args->clevel = strtol(optarg+1,&tmp,10);
+ if ( *tmp || args->clevel<0 || args->clevel>9 ) error("Could not parse argument: --compression-level %s\n", optarg+1);
}
break;
case 'h': args->convert_func = vcf_to_haplegendsample; args->outfname = optarg; break;
case 10 : args->record_cmd_line = 0; break;
case 11 : args->sex_fname = optarg; break;
case 12 : args->keep_duplicates = 1; break;
+ case 13 :
+ if ( !strcasecmp(optarg,"0") ) args->regions_overlap = 0;
+ else if ( !strcasecmp(optarg,"1") ) args->regions_overlap = 1;
+ else if ( !strcasecmp(optarg,"2") ) args->regions_overlap = 2;
+ else error("Could not parse: --regions-overlap %s\n",optarg);
+ break;
+ case 14 :
+ if ( !strcasecmp(optarg,"0") ) args->targets_overlap = 0;
+ else if ( !strcasecmp(optarg,"1") ) args->targets_overlap = 1;
+ else if ( !strcasecmp(optarg,"2") ) args->targets_overlap = 2;
+ else error("Could not parse: --targets-overlap %s\n",optarg);
+ break;
case '?': usage(); break;
default: error("Unknown argument: %s\n", optarg);
}
float *flt;
int rev_als, output_vcf_ids, hap2dip, output_chrom_first_col;
int nsamples, *samples, sample_is_file, targets_is_file, regions_is_file, output_type;
+ int regions_overlap, targets_overlap;
char **argv, *sample_list, *targets_list, *regions_list, *tag, *columns;
char *outfname, *infname, *ref_fname, *sex_fname;
- int argc, n_threads, record_cmd_line, keep_duplicates;
+ int argc, n_threads, record_cmd_line, keep_duplicates, clevel;
};
static void destroy_data(args_t *args)
if ( args->regions_list )
{
+ bcf_sr_set_opt(args->files,BCF_SR_REGIONS_OVERLAP,args->regions_overlap);
if ( bcf_sr_set_regions(args->files, args->regions_list, args->regions_is_file)<0 )
error("Failed to read the regions: %s\n", args->regions_list);
}
if ( args->targets_list )
{
+ bcf_sr_set_opt(args->files,BCF_SR_TARGETS_OVERLAP,args->targets_overlap);
if ( bcf_sr_set_targets(args->files, args->targets_list, args->targets_is_file, 0)<0 )
error("Failed to read the targets: %s\n", args->targets_list);
}
for (i=0; i<nsamples; i++) free(samples[i]);
free(samples);
- htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode2(args->output_type,args->outfname));
+ char wmode[8];
+ set_wmode(wmode,args->output_type,args->outfname,args->clevel);
+ htsFile *out_fh = hts_open(args->outfname ? args->outfname : "-", wmode);
if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno));
if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads);
if ( bcf_hdr_write(out_fh,args->header)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->outfname);
for (i=0; i<nrows; i++) free(samples[i]);
free(samples);
- htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode2(args->output_type,args->outfname));
+ char wmode[8];
+ set_wmode(wmode,args->output_type,args->outfname,args->clevel);
+ htsFile *out_fh = hts_open(args->outfname ? args->outfname : "-", wmode);
if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno));
if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads);
if ( bcf_hdr_write(out_fh,args->header)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->outfname);
for (i=0; i<nsamples; i++) free(samples[i]);
free(samples);
- htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode2(args->output_type,args->outfname));
+ char wmode[8];
+ set_wmode(wmode,args->output_type,args->outfname,args->clevel);
+ htsFile *out_fh = hts_open(args->outfname ? args->outfname : "-", wmode);
if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno));
if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads);
if ( bcf_hdr_write(out_fh,args->header)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname);
bcf_hdr_add_sample(args->header, NULL);
args->gts = (int32_t *) malloc(sizeof(int32_t)*n*2);
- htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode2(args->output_type,args->outfname));
+ char wmode[8];
+ set_wmode(wmode,args->output_type,args->outfname,args->clevel);
+ htsFile *out_fh = hts_open(args->outfname ? args->outfname : "-", wmode);
if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno));
if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads);
if ( bcf_hdr_write(out_fh,args->header)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname);
static void vcf_to_vcf(args_t *args)
{
open_vcf(args,NULL);
- htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode2(args->output_type,args->outfname));
+ char wmode[8];
+ set_wmode(wmode,args->output_type,args->outfname,args->clevel);
+ htsFile *out_fh = hts_open(args->outfname ? args->outfname : "-", wmode);
if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno));
if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads);
if ( !args->ref ) error("Could not load the fai index for reference %s\n", args->ref_fname);
open_vcf(args,NULL);
- htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode2(args->output_type,args->outfname));
+ char wmode[8];
+ set_wmode(wmode,args->output_type,args->outfname,args->clevel);
+ htsFile *out_fh = hts_open(args->outfname ? args->outfname : "-", wmode);
if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno));
if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads);
fprintf(bcftools_stderr, "\n");
fprintf(bcftools_stderr, "About: Converts VCF/BCF to other formats and back. See man page for file\n");
fprintf(bcftools_stderr, " formats details. When specifying output files explicitly instead\n");
- fprintf(bcftools_stderr, " of with <prefix>, one can use '-' for bcftools_stdout and '.' to suppress.\n");
- fprintf(bcftools_stderr, "Usage: bcftools convert [OPTIONS] <input_file>\n");
+ fprintf(bcftools_stderr, " of with PREFIX, one can use '-' for bcftools_stdout and '.' to suppress.\n");
+ fprintf(bcftools_stderr, "Usage: bcftools convert [OPTIONS] INPUT_FILE\n");
fprintf(bcftools_stderr, "\n");
fprintf(bcftools_stderr, "VCF input options:\n");
- fprintf(bcftools_stderr, " -e, --exclude <expr> exclude sites for which the expression is true\n");
- fprintf(bcftools_stderr, " -i, --include <expr> select sites for which the expression is true\n");
- fprintf(bcftools_stderr, " -r, --regions <region> restrict to comma-separated list of regions\n");
- fprintf(bcftools_stderr, " -R, --regions-file <file> restrict to regions listed in a file\n");
- fprintf(bcftools_stderr, " -s, --samples <list> list of samples to include\n");
- fprintf(bcftools_stderr, " -S, --samples-file <file> file of samples to include\n");
- fprintf(bcftools_stderr, " -t, --targets <region> similar to -r but streams rather than index-jumps\n");
- fprintf(bcftools_stderr, " -T, --targets-file <file> similar to -R but streams rather than index-jumps\n");
+ fprintf(bcftools_stderr, " -e, --exclude EXPR Exclude sites for which the expression is true\n");
+ fprintf(bcftools_stderr, " -i, --include EXPR Select sites for which the expression is true\n");
+ fprintf(bcftools_stderr, " -r, --regions REGION Restrict to comma-separated list of regions\n");
+ fprintf(bcftools_stderr, " -R, --regions-file FILE Restrict to regions listed in a file\n");
+ fprintf(bcftools_stderr, " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n");
+ fprintf(bcftools_stderr, " -s, --samples LIST List of samples to include\n");
+ fprintf(bcftools_stderr, " -S, --samples-file FILE File of samples to include\n");
+ fprintf(bcftools_stderr, " -t, --targets REGION Similar to -r but streams rather than index-jumps\n");
+ fprintf(bcftools_stderr, " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n");
+ fprintf(bcftools_stderr, " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n");
fprintf(bcftools_stderr, "\n");
fprintf(bcftools_stderr, "VCF output options:\n");
- fprintf(bcftools_stderr, " --no-version do not append version and command line to the header\n");
- fprintf(bcftools_stderr, " -o, --output <file> output file name [bcftools_stdout]\n");
- fprintf(bcftools_stderr, " -O, --output-type <b|u|z|v> b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n");
- fprintf(bcftools_stderr, " --threads <int> use multithreading with <int> worker threads [0]\n");
+ fprintf(bcftools_stderr, " --no-version Do not append version and command line to the header\n");
+ fprintf(bcftools_stderr, " -o, --output FILE Output file name [bcftools_stdout]\n");
+ fprintf(bcftools_stderr, " -O, --output-type u|b|v|z[0-9] u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level [v]\n");
+ fprintf(bcftools_stderr, " --threads INT Use multithreading with INT worker threads [0]\n");
fprintf(bcftools_stderr, "\n");
fprintf(bcftools_stderr, "GEN/SAMPLE conversion (input/output from IMPUTE2):\n");
- fprintf(bcftools_stderr, " -G, --gensample2vcf <...> <prefix>|<gen-file>,<sample-file>\n");
- fprintf(bcftools_stderr, " -g, --gensample <...> <prefix>|<gen-file>,<sample-file>\n");
- fprintf(bcftools_stderr, " --tag <string> tag to take values for .gen file: GT,PL,GL,GP [GT]\n");
- fprintf(bcftools_stderr, " --chrom output chromosome in first column instead of CHROM:POS_REF_ALT\n");
- fprintf(bcftools_stderr, " --keep-duplicates keep duplicate positions\n");
- fprintf(bcftools_stderr, " --sex <file> output sex column in the sample-file, input format is: Sample\\t[MF]\n");
- fprintf(bcftools_stderr, " --vcf-ids output VCF IDs in second column instead of CHROM:POS_REF_ALT\n");
+ fprintf(bcftools_stderr, " -G, --gensample2vcf ... <PREFIX>|<GEN-FILE>,<SAMPLE-FILE>\n");
+ fprintf(bcftools_stderr, " -g, --gensample ... <PREFIX>|<GEN-FILE>,<SAMPLE-FILE>\n");
+ fprintf(bcftools_stderr, " --tag STRING Tag to take values for .gen file: GT,PL,GL,GP [GT]\n");
+ fprintf(bcftools_stderr, " --chrom Output chromosome in first column instead of CHROM:POS_REF_ALT\n");
+ fprintf(bcftools_stderr, " --keep-duplicates Keep duplicate positions\n");
+ fprintf(bcftools_stderr, " --sex FILE Output sex column in the sample-file, input format is: Sample\\t[MF]\n");
+ fprintf(bcftools_stderr, " --vcf-ids Output VCF IDs in second column instead of CHROM:POS_REF_ALT\n");
fprintf(bcftools_stderr, "\n");
fprintf(bcftools_stderr, "gVCF conversion:\n");
- fprintf(bcftools_stderr, " --gvcf2vcf expand gVCF reference blocks\n");
- fprintf(bcftools_stderr, " -f, --fasta-ref <file> reference sequence in fasta format\n");
+ fprintf(bcftools_stderr, " --gvcf2vcf Expand gVCF reference blocks\n");
+ fprintf(bcftools_stderr, " -f, --fasta-ref FILE Reference sequence in fasta format\n");
fprintf(bcftools_stderr, "\n");
fprintf(bcftools_stderr, "HAP/SAMPLE conversion (output from SHAPEIT):\n");
- fprintf(bcftools_stderr, " --hapsample2vcf <...> <prefix>|<hap-file>,<sample-file>\n");
- fprintf(bcftools_stderr, " --hapsample <...> <prefix>|<hap-file>,<sample-file>\n");
- fprintf(bcftools_stderr, " --haploid2diploid convert haploid genotypes to diploid homozygotes\n");
- fprintf(bcftools_stderr, " --sex <file> output sex column in the sample-file, input format is: Sample\\t[MF]\n");
- fprintf(bcftools_stderr, " --vcf-ids output VCF IDs instead of CHROM:POS_REF_ALT\n");
+ fprintf(bcftools_stderr, " --hapsample2vcf ... <PREFIX>|<HAP-FILE>,<SAMPLE-FILE>\n");
+ fprintf(bcftools_stderr, " --hapsample ... <PREFIX>|<HAP-FILE>,<SAMPLE-FILE>\n");
+ fprintf(bcftools_stderr, " --haploid2diploid Convert haploid genotypes to diploid homozygotes\n");
+ fprintf(bcftools_stderr, " --sex FILE Output sex column in the sample-file, input format is: Sample\\t[MF]\n");
+ fprintf(bcftools_stderr, " --vcf-ids Output VCF IDs instead of CHROM:POS_REF_ALT\n");
fprintf(bcftools_stderr, "\n");
fprintf(bcftools_stderr, "HAP/LEGEND/SAMPLE conversion:\n");
- fprintf(bcftools_stderr, " -H, --haplegendsample2vcf <...> <prefix>|<hap-file>,<legend-file>,<sample-file>\n");
- fprintf(bcftools_stderr, " -h, --haplegendsample <...> <prefix>|<hap-file>,<legend-file>,<sample-file>\n");
- fprintf(bcftools_stderr, " --haploid2diploid convert haploid genotypes to diploid homozygotes\n");
- fprintf(bcftools_stderr, " --sex <file> output sex column in the sample-file, input format is: Sample\\t[MF]\n");
- fprintf(bcftools_stderr, " --vcf-ids output VCF IDs instead of CHROM:POS_REF_ALT\n");
+ fprintf(bcftools_stderr, " -H, --haplegendsample2vcf ... <PREFIX>|<HAP-FILE>,<LEGEND-FILE>,<SAMPLE-FILE>\n");
+ fprintf(bcftools_stderr, " -h, --haplegendsample ... <PREFIX>|<HAP-FILE>,<LEGEND-FILE>,<SAMPLE-FILE>\n");
+ fprintf(bcftools_stderr, " --haploid2diploid Convert haploid genotypes to diploid homozygotes\n");
+ fprintf(bcftools_stderr, " --sex FILE Output sex column in the sample-file, input format is: Sample\\t[MF]\n");
+ fprintf(bcftools_stderr, " --vcf-ids Output VCF IDs instead of CHROM:POS_REF_ALT\n");
fprintf(bcftools_stderr, "\n");
fprintf(bcftools_stderr, "TSV conversion:\n");
- fprintf(bcftools_stderr, " --tsv2vcf <file> \n");
- fprintf(bcftools_stderr, " -c, --columns <string> columns of the input tsv file [ID,CHROM,POS,AA]\n");
- fprintf(bcftools_stderr, " -f, --fasta-ref <file> reference sequence in fasta format\n");
- fprintf(bcftools_stderr, " -s, --samples <list> list of sample names\n");
- fprintf(bcftools_stderr, " -S, --samples-file <file> file of sample names\n");
+ fprintf(bcftools_stderr, " --tsv2vcf FILE\n");
+ fprintf(bcftools_stderr, " -c, --columns STRING Columns of the input tsv file [ID,CHROM,POS,AA]\n");
+ fprintf(bcftools_stderr, " -f, --fasta-ref FILE Reference sequence in fasta format\n");
+ fprintf(bcftools_stderr, " -s, --samples LIST List of sample names\n");
+ fprintf(bcftools_stderr, " -S, --samples-file FILE File of sample names\n");
fprintf(bcftools_stderr, "\n");
// fprintf(bcftools_stderr, "PLINK options:\n");
// fprintf(bcftools_stderr, " -p, --plink <prefix>|<ped>,<map>,<fam>|<bed>,<bim>,<fam>|<tped>,<tfam>\n");
args->output_type = FT_VCF;
args->n_threads = 0;
args->record_cmd_line = 1;
+ args->regions_overlap = 1;
+ args->targets_overlap = 0;
+ args->clevel = -1;
static struct option loptions[] =
{
{"threads",required_argument,NULL,9},
{"regions",required_argument,NULL,'r'},
{"regions-file",required_argument,NULL,'R'},
+ {"regions-overlap",required_argument,NULL,13},
{"targets",required_argument,NULL,'t'},
{"targets-file",required_argument,NULL,'T'},
+ {"targets-overlap",required_argument,NULL,14},
{"samples",required_argument,NULL,'s'},
{"samples-file",required_argument,NULL,'S'},
{"sex",required_argument,NULL,11},
{"keep-duplicates",no_argument,NULL,12},
{NULL,0,NULL,0}
};
+ char *tmp;
while ((c = getopt_long(argc, argv, "?h:r:R:s:S:t:T:i:e:g:G:o:O:c:f:H:",loptions,NULL)) >= 0) {
switch (c) {
case 'e':
case 'u': args->output_type = FT_BCF; break;
case 'z': args->output_type = FT_VCF_GZ; break;
case 'v': args->output_type = FT_VCF; break;
- default: error("The output type \"%s\" not recognised\n", optarg);
+ default:
+ {
+ args->clevel = strtol(optarg,&tmp,10);
+ if ( *tmp || args->clevel<0 || args->clevel>9 ) error("The output type \"%s\" not recognised\n", optarg);
+ }
+ }
+ if ( optarg[1] )
+ {
+ args->clevel = strtol(optarg+1,&tmp,10);
+ if ( *tmp || args->clevel<0 || args->clevel>9 ) error("Could not parse argument: --compression-level %s\n", optarg+1);
}
break;
case 'h': args->convert_func = vcf_to_haplegendsample; args->outfname = optarg; break;
case 10 : args->record_cmd_line = 0; break;
case 11 : args->sex_fname = optarg; break;
case 12 : args->keep_duplicates = 1; break;
+ case 13 :
+ if ( !strcasecmp(optarg,"0") ) args->regions_overlap = 0;
+ else if ( !strcasecmp(optarg,"1") ) args->regions_overlap = 1;
+ else if ( !strcasecmp(optarg,"2") ) args->regions_overlap = 2;
+ else error("Could not parse: --regions-overlap %s\n",optarg);
+ break;
+ case 14 :
+ if ( !strcasecmp(optarg,"0") ) args->targets_overlap = 0;
+ else if ( !strcasecmp(optarg,"1") ) args->targets_overlap = 1;
+ else if ( !strcasecmp(optarg,"2") ) args->targets_overlap = 2;
+ else error("Could not parse: --targets-overlap %s\n",optarg);
+ break;
case '?': usage(); break;
default: error("Unknown argument: %s\n", optarg);
}
bcf_srs_t *files;
bcf_hdr_t *hdr;
htsFile *out_fh;
- int output_type, n_threads;
+ int output_type, n_threads, clevel;
char **argv, *output_fname, *targets_list, *regions_list;
int argc, record_cmd_line;
static void init_data(args_t *args)
{
- args->out_fh = hts_open(args->output_fname,hts_bcf_wmode2(args->output_type,args->output_fname));
+ char wmode[8];
+ set_wmode(wmode,args->output_type,args->output_fname,args->clevel);
+ args->out_fh = hts_open(args->output_fname ? args->output_fname : "-", wmode);
if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno));
if ( args->n_threads ) hts_set_threads(args->out_fh, args->n_threads);
fprintf(stderr, "Usage: bcftools filter [options] <in.vcf.gz>\n");
fprintf(stderr, "\n");
fprintf(stderr, "Options:\n");
- fprintf(stderr, " -e, --exclude <expr> exclude sites for which the expression is true (see man page for details)\n");
- fprintf(stderr, " -g, --SnpGap <int>[:type] filter SNPs within <int> base pairs of an indel (the default) or any combination of indel,mnp,bnd,other,overlap\n");
- fprintf(stderr, " -G, --IndelGap <int> filter clusters of indels separated by <int> or fewer base pairs allowing only one to pass\n");
- fprintf(stderr, " -i, --include <expr> include only sites for which the expression is true (see man page for details\n");
- fprintf(stderr, " -m, --mode [+x] \"+\": do not replace but add to existing FILTER; \"x\": reset filters at sites which pass\n");
- fprintf(stderr, " --no-version do not append version and command line to the header\n");
- fprintf(stderr, " -o, --output <file> write output to a file [standard output]\n");
- fprintf(stderr, " -O, --output-type <b|u|z|v> b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n");
- fprintf(stderr, " -r, --regions <region> restrict to comma-separated list of regions\n");
- fprintf(stderr, " -R, --regions-file <file> restrict to regions listed in a file\n");
- fprintf(stderr, " -s, --soft-filter <string> annotate FILTER column with <string> or unique filter name (\"Filter%%d\") made up by the program (\"+\")\n");
- fprintf(stderr, " -S, --set-GTs <.|0> set genotypes of failed samples to missing (.) or ref (0)\n");
- fprintf(stderr, " -t, --targets <region> similar to -r but streams rather than index-jumps\n");
- fprintf(stderr, " -T, --targets-file <file> similar to -R but streams rather than index-jumps\n");
- fprintf(stderr, " --threads <int> use multithreading with <int> worker threads [0]\n");
+ fprintf(stderr, " -e, --exclude EXPR Exclude sites for which the expression is true (see man page for details)\n");
+ fprintf(stderr, " -g, --SnpGap INT[:TYPE] Filter SNPs within <int> base pairs of an indel (the default) or any combination of indel,mnp,bnd,other,overlap\n");
+ fprintf(stderr, " -G, --IndelGap INT Filter clusters of indels separated by <int> or fewer base pairs allowing only one to pass\n");
+ fprintf(stderr, " -i, --include EXPR Include only sites for which the expression is true (see man page for details\n");
+ fprintf(stderr, " -m, --mode [+x] \"+\": do not replace but add to existing FILTER; \"x\": reset filters at sites which pass\n");
+ fprintf(stderr, " --no-version Do not append version and command line to the header\n");
+ fprintf(stderr, " -o, --output FILE Write output to a file [standard output]\n");
+ fprintf(stderr, " -O, --output-type u|b|v|z[0-9] u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level [v]\n");
+ fprintf(stderr, " -r, --regions REGION Restrict to comma-separated list of regions\n");
+ fprintf(stderr, " -R, --regions-file FILE Restrict to regions listed in a file\n");
+ fprintf(stderr, " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n");
+ fprintf(stderr, " -s, --soft-filter STRING Annotate FILTER column with <string> or unique filter name (\"Filter%%d\") made up by the program (\"+\")\n");
+ fprintf(stderr, " -S, --set-GTs .|0 Set genotypes of failed samples to missing (.) or ref (0)\n");
+ fprintf(stderr, " -t, --targets REGION Similar to -r but streams rather than index-jumps\n");
+ fprintf(stderr, " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n");
+ fprintf(stderr, " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n");
+ fprintf(stderr, " --threads INT Use multithreading with <int> worker threads [0]\n");
fprintf(stderr, "\n");
exit(1);
}
args->output_type = FT_VCF;
args->n_threads = 0;
args->record_cmd_line = 1;
+ args->clevel = -1;
int regions_is_file = 0, targets_is_file = 0;
+ int regions_overlap = 1;
+ int targets_overlap = 0;
static struct option loptions[] =
{
{"include",required_argument,NULL,'i'},
{"targets",required_argument,NULL,'t'},
{"targets-file",required_argument,NULL,'T'},
+ {"targets-overlap",required_argument,NULL,4},
{"regions",required_argument,NULL,'r'},
{"regions-file",required_argument,NULL,'R'},
+ {"regions-overlap",required_argument,NULL,3},
{"output",required_argument,NULL,'o'},
{"output-type",required_argument,NULL,'O'},
{"threads",required_argument,NULL,9},
case 'u': args->output_type = FT_BCF; break;
case 'z': args->output_type = FT_VCF_GZ; break;
case 'v': args->output_type = FT_VCF; break;
- default: error("The output type \"%s\" not recognised\n", optarg);
+ default:
+ {
+ args->clevel = strtol(optarg,&tmp,10);
+ if ( *tmp || args->clevel<0 || args->clevel>9 ) error("The output type \"%s\" not recognised\n", optarg);
+ }
+ }
+ if ( optarg[1] )
+ {
+ args->clevel = strtol(optarg+1,&tmp,10);
+ if ( *tmp || args->clevel<0 || args->clevel>9 ) error("Could not parse argument: --compression-level %s\n", optarg+1);
}
break;
case 's': args->soft_filter = optarg; break;
break;
case 9 : args->n_threads = strtol(optarg, 0, 0); break;
case 8 : args->record_cmd_line = 0; break;
+ case 3 :
+ if ( !strcasecmp(optarg,"0") ) regions_overlap = 0;
+ else if ( !strcasecmp(optarg,"1") ) regions_overlap = 1;
+ else if ( !strcasecmp(optarg,"2") ) regions_overlap = 2;
+ else error("Could not parse: --regions-overlap %s\n",optarg);
+ break;
+ case 4 :
+ if ( !strcasecmp(optarg,"0") ) targets_overlap = 0;
+ else if ( !strcasecmp(optarg,"1") ) targets_overlap = 1;
+ else if ( !strcasecmp(optarg,"2") ) targets_overlap = 2;
+ else error("Could not parse: --targets-overlap %s\n",optarg);
+ break;
case 'h':
case '?': usage(args); break;
default: error("Unknown argument: %s\n", optarg);
if ( args->regions_list )
{
args->files->require_index = 1;
+ bcf_sr_set_opt(args->files,BCF_SR_REGIONS_OVERLAP,regions_overlap);
if ( bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 )
error("Failed to read the regions: %s\n", args->regions_list);
}
kputs(argv[optind+1],&tmp);
for (i=optind+2; i<argc; i++) { kputc(',',&tmp); kputs(argv[i],&tmp); }
args->files->require_index = 1;
+ bcf_sr_set_opt(args->files,BCF_SR_REGIONS_OVERLAP,regions_overlap);
if ( bcf_sr_set_regions(args->files, tmp.s, regions_is_file)<0 )
error("Failed to read the regions: %s\n", args->regions_list);
free(tmp.s);
}
if ( args->targets_list )
{
+ bcf_sr_set_opt(args->files,BCF_SR_TARGETS_OVERLAP,targets_overlap);
if ( bcf_sr_set_targets(args->files, args->targets_list,targets_is_file, 0)<0 )
error("Failed to read the targets: %s\n", args->targets_list);
}
bcf_srs_t *files;
bcf_hdr_t *hdr;
htsFile *out_fh;
- int output_type, n_threads;
+ int output_type, n_threads, clevel;
char **argv, *output_fname, *targets_list, *regions_list;
int argc, record_cmd_line;
static void init_data(args_t *args)
{
- args->out_fh = hts_open(args->output_fname,hts_bcf_wmode2(args->output_type,args->output_fname));
+ char wmode[8];
+ set_wmode(wmode,args->output_type,args->output_fname,args->clevel);
+ args->out_fh = hts_open(args->output_fname ? args->output_fname : "-", wmode);
if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno));
if ( args->n_threads ) hts_set_threads(args->out_fh, args->n_threads);
fprintf(bcftools_stderr, "Usage: bcftools filter [options] <in.vcf.gz>\n");
fprintf(bcftools_stderr, "\n");
fprintf(bcftools_stderr, "Options:\n");
- fprintf(bcftools_stderr, " -e, --exclude <expr> exclude sites for which the expression is true (see man page for details)\n");
- fprintf(bcftools_stderr, " -g, --SnpGap <int>[:type] filter SNPs within <int> base pairs of an indel (the default) or any combination of indel,mnp,bnd,other,overlap\n");
- fprintf(bcftools_stderr, " -G, --IndelGap <int> filter clusters of indels separated by <int> or fewer base pairs allowing only one to pass\n");
- fprintf(bcftools_stderr, " -i, --include <expr> include only sites for which the expression is true (see man page for details\n");
- fprintf(bcftools_stderr, " -m, --mode [+x] \"+\": do not replace but add to existing FILTER; \"x\": reset filters at sites which pass\n");
- fprintf(bcftools_stderr, " --no-version do not append version and command line to the header\n");
- fprintf(bcftools_stderr, " -o, --output <file> write output to a file [standard output]\n");
- fprintf(bcftools_stderr, " -O, --output-type <b|u|z|v> b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n");
- fprintf(bcftools_stderr, " -r, --regions <region> restrict to comma-separated list of regions\n");
- fprintf(bcftools_stderr, " -R, --regions-file <file> restrict to regions listed in a file\n");
- fprintf(bcftools_stderr, " -s, --soft-filter <string> annotate FILTER column with <string> or unique filter name (\"Filter%%d\") made up by the program (\"+\")\n");
- fprintf(bcftools_stderr, " -S, --set-GTs <.|0> set genotypes of failed samples to missing (.) or ref (0)\n");
- fprintf(bcftools_stderr, " -t, --targets <region> similar to -r but streams rather than index-jumps\n");
- fprintf(bcftools_stderr, " -T, --targets-file <file> similar to -R but streams rather than index-jumps\n");
- fprintf(bcftools_stderr, " --threads <int> use multithreading with <int> worker threads [0]\n");
+ fprintf(bcftools_stderr, " -e, --exclude EXPR Exclude sites for which the expression is true (see man page for details)\n");
+ fprintf(bcftools_stderr, " -g, --SnpGap INT[:TYPE] Filter SNPs within <int> base pairs of an indel (the default) or any combination of indel,mnp,bnd,other,overlap\n");
+ fprintf(bcftools_stderr, " -G, --IndelGap INT Filter clusters of indels separated by <int> or fewer base pairs allowing only one to pass\n");
+ fprintf(bcftools_stderr, " -i, --include EXPR Include only sites for which the expression is true (see man page for details\n");
+ fprintf(bcftools_stderr, " -m, --mode [+x] \"+\": do not replace but add to existing FILTER; \"x\": reset filters at sites which pass\n");
+ fprintf(bcftools_stderr, " --no-version Do not append version and command line to the header\n");
+ fprintf(bcftools_stderr, " -o, --output FILE Write output to a file [standard output]\n");
+ fprintf(bcftools_stderr, " -O, --output-type u|b|v|z[0-9] u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level [v]\n");
+ fprintf(bcftools_stderr, " -r, --regions REGION Restrict to comma-separated list of regions\n");
+ fprintf(bcftools_stderr, " -R, --regions-file FILE Restrict to regions listed in a file\n");
+ fprintf(bcftools_stderr, " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n");
+ fprintf(bcftools_stderr, " -s, --soft-filter STRING Annotate FILTER column with <string> or unique filter name (\"Filter%%d\") made up by the program (\"+\")\n");
+ fprintf(bcftools_stderr, " -S, --set-GTs .|0 Set genotypes of failed samples to missing (.) or ref (0)\n");
+ fprintf(bcftools_stderr, " -t, --targets REGION Similar to -r but streams rather than index-jumps\n");
+ fprintf(bcftools_stderr, " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n");
+ fprintf(bcftools_stderr, " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n");
+ fprintf(bcftools_stderr, " --threads INT Use multithreading with <int> worker threads [0]\n");
fprintf(bcftools_stderr, "\n");
bcftools_exit(1);
}
args->output_type = FT_VCF;
args->n_threads = 0;
args->record_cmd_line = 1;
+ args->clevel = -1;
int regions_is_file = 0, targets_is_file = 0;
+ int regions_overlap = 1;
+ int targets_overlap = 0;
static struct option loptions[] =
{
{"include",required_argument,NULL,'i'},
{"targets",required_argument,NULL,'t'},
{"targets-file",required_argument,NULL,'T'},
+ {"targets-overlap",required_argument,NULL,4},
{"regions",required_argument,NULL,'r'},
{"regions-file",required_argument,NULL,'R'},
+ {"regions-overlap",required_argument,NULL,3},
{"output",required_argument,NULL,'o'},
{"output-type",required_argument,NULL,'O'},
{"threads",required_argument,NULL,9},
case 'u': args->output_type = FT_BCF; break;
case 'z': args->output_type = FT_VCF_GZ; break;
case 'v': args->output_type = FT_VCF; break;
- default: error("The output type \"%s\" not recognised\n", optarg);
+ default:
+ {
+ args->clevel = strtol(optarg,&tmp,10);
+ if ( *tmp || args->clevel<0 || args->clevel>9 ) error("The output type \"%s\" not recognised\n", optarg);
+ }
+ }
+ if ( optarg[1] )
+ {
+ args->clevel = strtol(optarg+1,&tmp,10);
+ if ( *tmp || args->clevel<0 || args->clevel>9 ) error("Could not parse argument: --compression-level %s\n", optarg+1);
}
break;
case 's': args->soft_filter = optarg; break;
break;
case 9 : args->n_threads = strtol(optarg, 0, 0); break;
case 8 : args->record_cmd_line = 0; break;
+ case 3 :
+ if ( !strcasecmp(optarg,"0") ) regions_overlap = 0;
+ else if ( !strcasecmp(optarg,"1") ) regions_overlap = 1;
+ else if ( !strcasecmp(optarg,"2") ) regions_overlap = 2;
+ else error("Could not parse: --regions-overlap %s\n",optarg);
+ break;
+ case 4 :
+ if ( !strcasecmp(optarg,"0") ) targets_overlap = 0;
+ else if ( !strcasecmp(optarg,"1") ) targets_overlap = 1;
+ else if ( !strcasecmp(optarg,"2") ) targets_overlap = 2;
+ else error("Could not parse: --targets-overlap %s\n",optarg);
+ break;
case 'h':
case '?': usage(args); break;
default: error("Unknown argument: %s\n", optarg);
if ( args->regions_list )
{
args->files->require_index = 1;
+ bcf_sr_set_opt(args->files,BCF_SR_REGIONS_OVERLAP,regions_overlap);
if ( bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 )
error("Failed to read the regions: %s\n", args->regions_list);
}
kputs(argv[optind+1],&tmp);
for (i=optind+2; i<argc; i++) { kputc(',',&tmp); kputs(argv[i],&tmp); }
args->files->require_index = 1;
+ bcf_sr_set_opt(args->files,BCF_SR_REGIONS_OVERLAP,regions_overlap);
if ( bcf_sr_set_regions(args->files, tmp.s, regions_is_file)<0 )
error("Failed to read the regions: %s\n", args->regions_list);
free(tmp.s);
}
if ( args->targets_list )
{
+ bcf_sr_set_opt(args->files,BCF_SR_TARGETS_OVERLAP,targets_overlap);
if ( bcf_sr_set_targets(args->files, args->targets_list,targets_is_file, 0)<0 )
error("Failed to read the targets: %s\n", args->targets_list);
}
bcf_hdr_t *gt_hdr, *qry_hdr; // VCF with genotypes to compare against and the query VCF
char *cwd, **argv, *gt_samples, *qry_samples, *regions, *targets, *qry_fname, *gt_fname, *pair_samples;
int argc, gt_samples_is_file, qry_samples_is_file, regions_is_file, targets_is_file, pair_samples_is_file;
+ int regions_overlap, targets_overlap;
int qry_use_GT,gt_use_GT, nqry_smpl,ngt_smpl, *qry_smpl,*gt_smpl;
double *pdiff, *qry_prob, *gt_prob;
uint32_t *ndiff,*ncnt,ncmp, npairs;
hts_srand48(0);
args->files = bcf_sr_init();
- if ( args->regions && bcf_sr_set_regions(args->files, args->regions, args->regions_is_file)<0 ) error("Failed to read the regions: %s\n", args->regions);
- if ( args->targets && bcf_sr_set_targets(args->files, args->targets, args->targets_is_file, 0)<0 ) error("Failed to read the targets: %s\n", args->targets);
+ if ( args->regions )
+ {
+ bcf_sr_set_opt(args->files,BCF_SR_REGIONS_OVERLAP,args->regions_overlap);
+ if ( bcf_sr_set_regions(args->files, args->regions, args->regions_is_file)<0 ) error("Failed to read the regions: %s\n", args->regions);
+ }
+ if ( args->targets )
+ {
+ bcf_sr_set_opt(args->files,BCF_SR_TARGETS_OVERLAP,args->targets_overlap);
+ if ( bcf_sr_set_targets(args->files, args->targets, args->targets_is_file, 0)<0 ) error("Failed to read the targets: %s\n", args->targets);
+ }
if ( args->gt_fname ) bcf_sr_set_opt(args->files, BCF_SR_REQUIRE_IDX);
if ( !bcf_sr_add_reader(args->files,args->qry_fname) ) error("Failed to open %s: %s\n", args->qry_fname,bcf_sr_strerror(args->files->errnum));
fprintf(stderr, " -P, --pairs-file FILE File with tab-delimited sample pairs to compare (qry,gt with -g or qry,qry w/o)\n");
fprintf(stderr, " -r, --regions REGION Restrict to comma-separated list of regions\n");
fprintf(stderr, " -R, --regions-file FILE Restrict to regions listed in a file\n");
+ fprintf(stderr, " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n");
fprintf(stderr, " -s, --samples [qry|gt]:LIST List of query or -g samples, \"-\" to select all samples (by default all samples are compared)\n");
fprintf(stderr, " -S, --samples-file [qry|gt]:FILE File with the query or -g samples to compare\n");
fprintf(stderr, " -t, --targets REGION Similar to -r but streams rather than index-jumps\n");
fprintf(stderr, " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n");
+ fprintf(stderr, " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n");
fprintf(stderr, " -u, --use TAG1[,TAG2] Which tag to use in the query file (TAG1) and the -g file (TAG2) [PL,GT]\n");
fprintf(stderr, "Examples:\n");
fprintf(stderr, " # Check discordance of all samples from B against all sample in A\n");
args->gt_use_GT = -1;
args->calc_hwe_prob = 1;
args->use_PLs = 40;
+ args->regions_overlap = 1;
+ args->targets_overlap = 0;
// external sort for --distinctive-sites
#ifdef _WIN32
{"distinctive-sites",1,0,6},
{"regions",1,0,'r'},
{"regions-file",1,0,'R'},
+ {"regions-overlap",required_argument,NULL,7},
{"targets",1,0,'t'},
{"targets-file",1,0,'T'},
+ {"targets-overlap",required_argument,NULL,8},
{"pairs",1,0,'p'},
{"pairs-file",1,0,'P'},
{0,0,0,0}
case 'R': args->regions = optarg; args->regions_is_file = 1; break;
case 't': args->targets = optarg; break;
case 'T': args->targets = optarg; args->targets_is_file = 1; break;
+ case 7 :
+ if ( !strcasecmp(optarg,"0") ) args->regions_overlap = 0;
+ else if ( !strcasecmp(optarg,"1") ) args->regions_overlap = 1;
+ else if ( !strcasecmp(optarg,"2") ) args->regions_overlap = 2;
+ else error("Could not parse: --regions-overlap %s\n",optarg);
+ break;
+ case 8 :
+ if ( !strcasecmp(optarg,"0") ) args->targets_overlap = 0;
+ else if ( !strcasecmp(optarg,"1") ) args->targets_overlap = 1;
+ else if ( !strcasecmp(optarg,"2") ) args->targets_overlap = 2;
+ else error("Could not parse: --targets-overlap %s\n",optarg);
+ break;
case 'h':
case '?': usage(); break;
default: error("Unknown argument: %s\n", optarg);
bcf_hdr_t *gt_hdr, *qry_hdr; // VCF with genotypes to compare against and the query VCF
char *cwd, **argv, *gt_samples, *qry_samples, *regions, *targets, *qry_fname, *gt_fname, *pair_samples;
int argc, gt_samples_is_file, qry_samples_is_file, regions_is_file, targets_is_file, pair_samples_is_file;
+ int regions_overlap, targets_overlap;
int qry_use_GT,gt_use_GT, nqry_smpl,ngt_smpl, *qry_smpl,*gt_smpl;
double *pdiff, *qry_prob, *gt_prob;
uint32_t *ndiff,*ncnt,ncmp, npairs;
hts_srand48(0);
args->files = bcf_sr_init();
- if ( args->regions && bcf_sr_set_regions(args->files, args->regions, args->regions_is_file)<0 ) error("Failed to read the regions: %s\n", args->regions);
- if ( args->targets && bcf_sr_set_targets(args->files, args->targets, args->targets_is_file, 0)<0 ) error("Failed to read the targets: %s\n", args->targets);
+ if ( args->regions )
+ {
+ bcf_sr_set_opt(args->files,BCF_SR_REGIONS_OVERLAP,args->regions_overlap);
+ if ( bcf_sr_set_regions(args->files, args->regions, args->regions_is_file)<0 ) error("Failed to read the regions: %s\n", args->regions);
+ }
+ if ( args->targets )
+ {
+ bcf_sr_set_opt(args->files,BCF_SR_TARGETS_OVERLAP,args->targets_overlap);
+ if ( bcf_sr_set_targets(args->files, args->targets, args->targets_is_file, 0)<0 ) error("Failed to read the targets: %s\n", args->targets);
+ }
if ( args->gt_fname ) bcf_sr_set_opt(args->files, BCF_SR_REQUIRE_IDX);
if ( !bcf_sr_add_reader(args->files,args->qry_fname) ) error("Failed to open %s: %s\n", args->qry_fname,bcf_sr_strerror(args->files->errnum));
fprintf(bcftools_stderr, " -P, --pairs-file FILE File with tab-delimited sample pairs to compare (qry,gt with -g or qry,qry w/o)\n");
fprintf(bcftools_stderr, " -r, --regions REGION Restrict to comma-separated list of regions\n");
fprintf(bcftools_stderr, " -R, --regions-file FILE Restrict to regions listed in a file\n");
+ fprintf(bcftools_stderr, " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n");
fprintf(bcftools_stderr, " -s, --samples [qry|gt]:LIST List of query or -g samples, \"-\" to select all samples (by default all samples are compared)\n");
fprintf(bcftools_stderr, " -S, --samples-file [qry|gt]:FILE File with the query or -g samples to compare\n");
fprintf(bcftools_stderr, " -t, --targets REGION Similar to -r but streams rather than index-jumps\n");
fprintf(bcftools_stderr, " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n");
+ fprintf(bcftools_stderr, " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n");
fprintf(bcftools_stderr, " -u, --use TAG1[,TAG2] Which tag to use in the query file (TAG1) and the -g file (TAG2) [PL,GT]\n");
fprintf(bcftools_stderr, "Examples:\n");
fprintf(bcftools_stderr, " # Check discordance of all samples from B against all sample in A\n");
args->gt_use_GT = -1;
args->calc_hwe_prob = 1;
args->use_PLs = 40;
+ args->regions_overlap = 1;
+ args->targets_overlap = 0;
// external sort for --distinctive-sites
#ifdef _WIN32
{"distinctive-sites",1,0,6},
{"regions",1,0,'r'},
{"regions-file",1,0,'R'},
+ {"regions-overlap",required_argument,NULL,7},
{"targets",1,0,'t'},
{"targets-file",1,0,'T'},
+ {"targets-overlap",required_argument,NULL,8},
{"pairs",1,0,'p'},
{"pairs-file",1,0,'P'},
{0,0,0,0}
case 'R': args->regions = optarg; args->regions_is_file = 1; break;
case 't': args->targets = optarg; break;
case 'T': args->targets = optarg; args->targets_is_file = 1; break;
+ case 7 :
+ if ( !strcasecmp(optarg,"0") ) args->regions_overlap = 0;
+ else if ( !strcasecmp(optarg,"1") ) args->regions_overlap = 1;
+ else if ( !strcasecmp(optarg,"2") ) args->regions_overlap = 2;
+ else error("Could not parse: --regions-overlap %s\n",optarg);
+ break;
+ case 8 :
+ if ( !strcasecmp(optarg,"0") ) args->targets_overlap = 0;
+ else if ( !strcasecmp(optarg,"1") ) args->targets_overlap = 1;
+ else if ( !strcasecmp(optarg,"2") ) args->targets_overlap = 2;
+ else error("Could not parse: --targets-overlap %s\n",optarg);
+ break;
case 'h':
case '?': usage(); break;
default: error("Unknown argument: %s\n", optarg);
* the total number of records.
*/
int len = strlen(fname);
+ int idx_only = 0;
if ( (fnidx = strstr(fname, HTS_IDX_DELIM)) != NULL ) {
fntemp = strdup(fname);
if ( !fntemp ) return 1;
fntemp = strdup(fname);
fname = fntemp;
fname[len-4] = 0;
+ idx_only = 1;
}
if ( stats&per_contig )
{
- fp = hts_open(fname,"r");
- if ( !fp ) {
- fprintf(stderr,"Could not read %s\n", fname);
- ret = 1; goto cleanup;
+ if ( idx_only )
+ {
+ struct stat buf;
+ if ( stat(fname, &buf)==0 ) idx_only = 0;
}
- hdr = bcf_hdr_read(fp);
- if ( !hdr ) {
- fprintf(stderr,"Could not read the header: %s\n", fname);
- ret = 1; goto cleanup;
+
+ enum htsExactFormat fmt;
+ if ( !idx_only )
+ {
+ fp = hts_open(fname,"r");
+ if ( !fp ) {
+ fprintf(stderr,"Could not read %s\n", fname);
+ ret = 1; goto cleanup;
+ }
+ hdr = bcf_hdr_read(fp);
+ if ( !hdr ) {
+ fprintf(stderr,"Could not read the header: %s\n", fname);
+ ret = 1; goto cleanup;
+ }
+ fmt = hts_get_format(fp)->format;
+ }
+ else
+ {
+ int len = strlen(fnidx);
+ if ( !strcasecmp(".tbi",fnidx+len-4) ) fmt = vcf;
+ else fmt = bcf;
}
- if ( hts_get_format(fp)->format==vcf )
+ if ( fmt==vcf )
{
tbx = tbx_index_load2(fname, fnidx);
if ( !tbx ) { fprintf(stderr,"Could not load index for VCF: %s\n", fname); return 1; }
}
- else if ( hts_get_format(fp)->format==bcf )
+ else if ( fmt==bcf )
{
idx = bcf_index_load2(fname, fnidx);
if ( !idx ) { fprintf(stderr,"Could not load index for BCF file: %s\n", fname); return 1; }
} else {
nseq = hts_idx_nseq(idx);
}
-
+ if ( !tbx && !hdr ) fprintf(stderr,"Warning: cannot determine contig names given the .csi index alone\n");
for (tid=0; tid<nseq; tid++)
{
uint64_t records, v;
hts_idx_get_stat(tbx ? tbx->idx : idx, tid, &records, &v);
sum += records;
if ( (stats&total) || !records ) continue;
- const char *ctg_name = tbx ? seq[tid] : hdr ? bcf_hdr_id2name(hdr, tid) : NULL;
- if ( ctg_name ) {
- bcf_hrec_t *hrec = hdr ? bcf_hdr_get_hrec(hdr, BCF_HL_CTG, "ID", ctg_name, NULL) : NULL;
- int hkey = hrec ? bcf_hrec_find_key(hrec, "length") : -1;
- printf("%s\t%s\t%" PRIu64 "\n", ctg_name, hkey<0?".":hrec->vals[hkey], records);
- }
+ const char *ctg_name = tbx ? seq[tid] : hdr ? bcf_hdr_id2name(hdr, tid) : "n/a";
+ bcf_hrec_t *hrec = hdr ? bcf_hdr_get_hrec(hdr, BCF_HL_CTG, "ID", ctg_name, NULL) : NULL;
+ int hkey = hrec ? bcf_hrec_find_key(hrec, "length") : -1;
+ printf("%s\t%s\t%" PRIu64 "\n", ctg_name, hkey<0?".":hrec->vals[hkey], records);
}
if ( !sum )
{
* the total number of records.
*/
int len = strlen(fname);
+ int idx_only = 0;
if ( (fnidx = strstr(fname, HTS_IDX_DELIM)) != NULL ) {
fntemp = strdup(fname);
if ( !fntemp ) return 1;
fntemp = strdup(fname);
fname = fntemp;
fname[len-4] = 0;
+ idx_only = 1;
}
if ( stats&per_contig )
{
- fp = hts_open(fname,"r");
- if ( !fp ) {
- fprintf(bcftools_stderr,"Could not read %s\n", fname);
- ret = 1; goto cleanup;
+ if ( idx_only )
+ {
+ struct stat buf;
+ if ( stat(fname, &buf)==0 ) idx_only = 0;
}
- hdr = bcf_hdr_read(fp);
- if ( !hdr ) {
- fprintf(bcftools_stderr,"Could not read the header: %s\n", fname);
- ret = 1; goto cleanup;
+
+ enum htsExactFormat fmt;
+ if ( !idx_only )
+ {
+ fp = hts_open(fname,"r");
+ if ( !fp ) {
+ fprintf(bcftools_stderr,"Could not read %s\n", fname);
+ ret = 1; goto cleanup;
+ }
+ hdr = bcf_hdr_read(fp);
+ if ( !hdr ) {
+ fprintf(bcftools_stderr,"Could not read the header: %s\n", fname);
+ ret = 1; goto cleanup;
+ }
+ fmt = hts_get_format(fp)->format;
+ }
+ else
+ {
+ int len = strlen(fnidx);
+ if ( !strcasecmp(".tbi",fnidx+len-4) ) fmt = vcf;
+ else fmt = bcf;
}
- if ( hts_get_format(fp)->format==vcf )
+ if ( fmt==vcf )
{
tbx = tbx_index_load2(fname, fnidx);
if ( !tbx ) { fprintf(bcftools_stderr,"Could not load index for VCF: %s\n", fname); return 1; }
}
- else if ( hts_get_format(fp)->format==bcf )
+ else if ( fmt==bcf )
{
idx = bcf_index_load2(fname, fnidx);
if ( !idx ) { fprintf(bcftools_stderr,"Could not load index for BCF file: %s\n", fname); return 1; }
} else {
nseq = hts_idx_nseq(idx);
}
-
+ if ( !tbx && !hdr ) fprintf(bcftools_stderr,"Warning: cannot determine contig names given the .csi index alone\n");
for (tid=0; tid<nseq; tid++)
{
uint64_t records, v;
hts_idx_get_stat(tbx ? tbx->idx : idx, tid, &records, &v);
sum += records;
if ( (stats&total) || !records ) continue;
- const char *ctg_name = tbx ? seq[tid] : hdr ? bcf_hdr_id2name(hdr, tid) : NULL;
- if ( ctg_name ) {
- bcf_hrec_t *hrec = hdr ? bcf_hdr_get_hrec(hdr, BCF_HL_CTG, "ID", ctg_name, NULL) : NULL;
- int hkey = hrec ? bcf_hrec_find_key(hrec, "length") : -1;
- fprintf(bcftools_stdout, "%s\t%s\t%" PRIu64 "\n", ctg_name, hkey<0?".":hrec->vals[hkey], records);
- }
+ const char *ctg_name = tbx ? seq[tid] : hdr ? bcf_hdr_id2name(hdr, tid) : "n/a";
+ bcf_hrec_t *hrec = hdr ? bcf_hdr_get_hrec(hdr, BCF_HL_CTG, "ID", ctg_name, NULL) : NULL;
+ int hkey = hrec ? bcf_hrec_find_key(hrec, "length") : -1;
+ fprintf(bcftools_stdout, "%s\t%s\t%" PRIu64 "\n", ctg_name, hkey<0?".":hrec->vals[hkey], records);
}
if ( !sum )
{
typedef struct
{
- int isec_op, isec_n, *write, iwrite, nwrite, output_type, n_threads;
+ int isec_op, isec_n, *write, iwrite, nwrite, output_type, n_threads, clevel;
int nflt, *flt_logic;
filter_t **flt;
char **flt_expr;
if ( args->targets_list && files->nreaders==1 ) out_std = 1;
if ( out_std )
{
- out_fh = hts_open(args->output_fname? args->output_fname : "-",hts_bcf_wmode2(args->output_type,args->output_fname));
+ char wmode[8];
+ set_wmode(wmode,args->output_type,args->output_fname,args->clevel);
+ out_fh = hts_open(args->output_fname ? args->output_fname : "-", wmode);
if ( out_fh == NULL ) error("Can't write to %s: %s\n", args->output_fname? args->output_fname : "standard output", strerror(errno));
if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads);
if (args->record_cmd_line) bcf_hdr_append_version(files->readers[args->iwrite].header,args->argc,args->argv,"bcftools_isec");
#define OPEN_FILE(i,j) { \
open_file(&args->fnames[i], NULL, "%s/%04d.%s", args->prefix, i, suffix); \
- args->fh_out[i] = hts_open(args->fnames[i], hts_bcf_wmode2(args->output_type,args->fnames[i])); \
+ char wmode[8]; \
+ set_wmode(wmode,args->output_type,args->fnames[i],args->clevel); \
+ args->fh_out[i] = hts_open(args->fnames[i], wmode); \
if ( !args->fh_out[i] ) error("Could not open %s\n", args->fnames[i]); \
if ( args->n_threads ) hts_set_threads(args->fh_out[i], args->n_threads); \
if (args->record_cmd_line) bcf_hdr_append_version(args->files->readers[j].header,args->argc,args->argv,"bcftools_isec"); \
fprintf(stderr, "Usage: bcftools isec [options] <A.vcf.gz> <B.vcf.gz> [...]\n");
fprintf(stderr, "\n");
fprintf(stderr, "Options:\n");
- fprintf(stderr, " -c, --collapse <string> treat as identical records with <snps|indels|both|all|some|none>, see man page for details [none]\n");
- fprintf(stderr, " -C, --complement output positions present only in the first file but missing in the others\n");
- fprintf(stderr, " -e, --exclude <expr> exclude sites for which the expression is true\n");
- fprintf(stderr, " -f, --apply-filters <list> require at least one of the listed FILTER strings (e.g. \"PASS,.\")\n");
- fprintf(stderr, " -i, --include <expr> include only sites for which the expression is true\n");
- fprintf(stderr, " --no-version do not append version and command line to the header\n");
- fprintf(stderr, " -n, --nfiles [+-=~]<int> output positions present in this many (=), this many or more (+), this many or fewer (-), the exact (~) files\n");
- fprintf(stderr, " -o, --output <file> write output to a file [standard output]\n");
- fprintf(stderr, " -O, --output-type <b|u|z|v> b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n");
- fprintf(stderr, " -p, --prefix <dir> if given, subset each of the input files accordingly, see also -w\n");
- fprintf(stderr, " -r, --regions <region> restrict to comma-separated list of regions\n");
- fprintf(stderr, " -R, --regions-file <file> restrict to regions listed in a file\n");
- fprintf(stderr, " -t, --targets <region> similar to -r but streams rather than index-jumps\n");
- fprintf(stderr, " -T, --targets-file <file> similar to -R but streams rather than index-jumps\n");
- fprintf(stderr, " --threads <int> use multithreading with <int> worker threads [0]\n");
- fprintf(stderr, " -w, --write <list> list of files to write with -p given as 1-based indexes. By default, all files are written\n");
+ fprintf(stderr, " -c, --collapse STRING Treat as identical records with <snps|indels|both|all|some|none>, see man page for details [none]\n");
+ fprintf(stderr, " -C, --complement Output positions present only in the first file but missing in the others\n");
+ fprintf(stderr, " -e, --exclude EXPR Exclude sites for which the expression is true\n");
+ fprintf(stderr, " -f, --apply-filters LIST Require at least one of the listed FILTER strings (e.g. \"PASS,.\")\n");
+ fprintf(stderr, " -i, --include EXPR Include only sites for which the expression is true\n");
+ fprintf(stderr, " --no-version Do not append version and command line to the header\n");
+ fprintf(stderr, " -n, --nfiles [+-=~]INT Output positions present in this many (=), this many or more (+), this many or fewer (-), the exact (~) files\n");
+ fprintf(stderr, " -o, --output FILE Write output to a file [standard output]\n");
+ fprintf(stderr, " -O, --output-type u|b|v|z[0-9] u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level [v]\n");
+ fprintf(stderr, " -p, --prefix DIR If given, subset each of the input files accordingly, see also -w\n");
+ fprintf(stderr, " -r, --regions REGION Restrict to comma-separated list of regions\n");
+ fprintf(stderr, " -R, --regions-file FILE Restrict to regions listed in a file\n");
+ fprintf(stderr, " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n");
+ fprintf(stderr, " -t, --targets REGION Similar to -r but streams rather than index-jumps\n");
+ fprintf(stderr, " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n");
+ fprintf(stderr, " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n");
+ fprintf(stderr, " --threads INT Use multithreading with <int> worker threads [0]\n");
+ fprintf(stderr, " -w, --write LIST List of files to write with -p given as 1-based indexes. By default, all files are written\n");
fprintf(stderr, "\n");
fprintf(stderr, "Examples:\n");
fprintf(stderr, " # Create intersection and complements of two sets saving the output in dir/*\n");
args->output_type = FT_VCF;
args->n_threads = 0;
args->record_cmd_line = 1;
+ args->clevel = -1;
int targets_is_file = 0, regions_is_file = 0;
+ int regions_overlap = 1;
+ int targets_overlap = 0;
static struct option loptions[] =
{
{"write",required_argument,NULL,'w'},
{"targets",required_argument,NULL,'t'},
{"targets-file",required_argument,NULL,'T'},
+ {"targets-overlap",required_argument,NULL,4},
{"regions",required_argument,NULL,'r'},
{"regions-file",required_argument,NULL,'R'},
+ {"regions-overlap",required_argument,NULL,3},
{"output",required_argument,NULL,'o'},
{"output-type",required_argument,NULL,'O'},
{"threads",required_argument,NULL,9},
{"no-version",no_argument,NULL,8},
{NULL,0,NULL,0}
};
+ char *tmp;
while ((c = getopt_long(argc, argv, "hc:r:R:p:n:w:t:T:Cf:o:O:i:e:",loptions,NULL)) >= 0) {
switch (c) {
case 'o': args->output_fname = optarg; break;
case 'u': args->output_type = FT_BCF; break;
case 'z': args->output_type = FT_VCF_GZ; break;
case 'v': args->output_type = FT_VCF; break;
- default: error("The output type \"%s\" not recognised\n", optarg);
+ default:
+ {
+ args->clevel = strtol(optarg,&tmp,10);
+ if ( *tmp || args->clevel<0 || args->clevel>9 ) error("The output type \"%s\" not recognised\n", optarg);
+ }
+ }
+ if ( optarg[1] )
+ {
+ args->clevel = strtol(optarg+1,&tmp,10);
+ if ( *tmp || args->clevel<0 || args->clevel>9 ) error("Could not parse argument: --compression-level %s\n", optarg+1);
}
break;
case 'c':
else if ( sscanf(p,"%d",&args->isec_n)!=1 ) error("Could not parse --nfiles %s\n", optarg);
}
break;
+ case 3 :
+ if ( !strcasecmp(optarg,"0") ) regions_overlap = 0;
+ else if ( !strcasecmp(optarg,"1") ) regions_overlap = 1;
+ else if ( !strcasecmp(optarg,"2") ) regions_overlap = 2;
+ else error("Could not parse: --regions-overlap %s\n",optarg);
+ break;
+ case 4 :
+ if ( !strcasecmp(optarg,"0") ) targets_overlap = 0;
+ else if ( !strcasecmp(optarg,"1") ) targets_overlap = 1;
+ else if ( !strcasecmp(optarg,"2") ) targets_overlap = 2;
+ else error("Could not parse: --targets-overlap %s\n",optarg);
+ break;
case 9 : args->n_threads = strtol(optarg, 0, 0); break;
case 8 : args->record_cmd_line = 0; break;
case 'h':
}
}
if ( argc-optind<1 ) usage(); // no file given
- if ( args->targets_list && bcf_sr_set_targets(args->files, args->targets_list, targets_is_file,0)<0 )
- error("Failed to read the targets: %s\n", args->targets_list);
- if ( args->regions_list && bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 )
- error("Failed to read the regions: %s\n", args->regions_list);
+ if ( args->targets_list )
+ {
+ bcf_sr_set_opt(args->files,BCF_SR_TARGETS_OVERLAP,targets_overlap);
+ if ( bcf_sr_set_targets(args->files, args->targets_list, targets_is_file,0)<0 )
+ error("Failed to read the targets: %s\n", args->targets_list);
+ }
+ if ( args->regions_list )
+ {
+ bcf_sr_set_opt(args->files,BCF_SR_REGIONS_OVERLAP,regions_overlap);
+ if ( bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 )
+ error("Failed to read the regions: %s\n", args->regions_list);
+ }
if ( argc-optind==2 && !args->isec_op )
{
args->isec_op = OP_VENN;
typedef struct
{
- int isec_op, isec_n, *write, iwrite, nwrite, output_type, n_threads;
+ int isec_op, isec_n, *write, iwrite, nwrite, output_type, n_threads, clevel;
int nflt, *flt_logic;
filter_t **flt;
char **flt_expr;
if ( args->targets_list && files->nreaders==1 ) out_std = 1;
if ( out_std )
{
- out_fh = hts_open(args->output_fname? args->output_fname : "-",hts_bcf_wmode2(args->output_type,args->output_fname));
+ char wmode[8];
+ set_wmode(wmode,args->output_type,args->output_fname,args->clevel);
+ out_fh = hts_open(args->output_fname ? args->output_fname : "-", wmode);
if ( out_fh == NULL ) error("Can't write to %s: %s\n", args->output_fname? args->output_fname : "standard output", strerror(errno));
if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads);
if (args->record_cmd_line) bcf_hdr_append_version(files->readers[args->iwrite].header,args->argc,args->argv,"bcftools_isec");
#define OPEN_FILE(i,j) { \
open_file(&args->fnames[i], NULL, "%s/%04d.%s", args->prefix, i, suffix); \
- args->fh_out[i] = hts_open(args->fnames[i], hts_bcf_wmode2(args->output_type,args->fnames[i])); \
+ char wmode[8]; \
+ set_wmode(wmode,args->output_type,args->fnames[i],args->clevel); \
+ args->fh_out[i] = hts_open(args->fnames[i], wmode); \
if ( !args->fh_out[i] ) error("Could not open %s\n", args->fnames[i]); \
if ( args->n_threads ) hts_set_threads(args->fh_out[i], args->n_threads); \
if (args->record_cmd_line) bcf_hdr_append_version(args->files->readers[j].header,args->argc,args->argv,"bcftools_isec"); \
fprintf(bcftools_stderr, "Usage: bcftools isec [options] <A.vcf.gz> <B.vcf.gz> [...]\n");
fprintf(bcftools_stderr, "\n");
fprintf(bcftools_stderr, "Options:\n");
- fprintf(bcftools_stderr, " -c, --collapse <string> treat as identical records with <snps|indels|both|all|some|none>, see man page for details [none]\n");
- fprintf(bcftools_stderr, " -C, --complement output positions present only in the first file but missing in the others\n");
- fprintf(bcftools_stderr, " -e, --exclude <expr> exclude sites for which the expression is true\n");
- fprintf(bcftools_stderr, " -f, --apply-filters <list> require at least one of the listed FILTER strings (e.g. \"PASS,.\")\n");
- fprintf(bcftools_stderr, " -i, --include <expr> include only sites for which the expression is true\n");
- fprintf(bcftools_stderr, " --no-version do not append version and command line to the header\n");
- fprintf(bcftools_stderr, " -n, --nfiles [+-=~]<int> output positions present in this many (=), this many or more (+), this many or fewer (-), the exact (~) files\n");
- fprintf(bcftools_stderr, " -o, --output <file> write output to a file [standard output]\n");
- fprintf(bcftools_stderr, " -O, --output-type <b|u|z|v> b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n");
- fprintf(bcftools_stderr, " -p, --prefix <dir> if given, subset each of the input files accordingly, see also -w\n");
- fprintf(bcftools_stderr, " -r, --regions <region> restrict to comma-separated list of regions\n");
- fprintf(bcftools_stderr, " -R, --regions-file <file> restrict to regions listed in a file\n");
- fprintf(bcftools_stderr, " -t, --targets <region> similar to -r but streams rather than index-jumps\n");
- fprintf(bcftools_stderr, " -T, --targets-file <file> similar to -R but streams rather than index-jumps\n");
- fprintf(bcftools_stderr, " --threads <int> use multithreading with <int> worker threads [0]\n");
- fprintf(bcftools_stderr, " -w, --write <list> list of files to write with -p given as 1-based indexes. By default, all files are written\n");
+ fprintf(bcftools_stderr, " -c, --collapse STRING Treat as identical records with <snps|indels|both|all|some|none>, see man page for details [none]\n");
+ fprintf(bcftools_stderr, " -C, --complement Output positions present only in the first file but missing in the others\n");
+ fprintf(bcftools_stderr, " -e, --exclude EXPR Exclude sites for which the expression is true\n");
+ fprintf(bcftools_stderr, " -f, --apply-filters LIST Require at least one of the listed FILTER strings (e.g. \"PASS,.\")\n");
+ fprintf(bcftools_stderr, " -i, --include EXPR Include only sites for which the expression is true\n");
+ fprintf(bcftools_stderr, " --no-version Do not append version and command line to the header\n");
+ fprintf(bcftools_stderr, " -n, --nfiles [+-=~]INT Output positions present in this many (=), this many or more (+), this many or fewer (-), the exact (~) files\n");
+ fprintf(bcftools_stderr, " -o, --output FILE Write output to a file [standard output]\n");
+ fprintf(bcftools_stderr, " -O, --output-type u|b|v|z[0-9] u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level [v]\n");
+ fprintf(bcftools_stderr, " -p, --prefix DIR If given, subset each of the input files accordingly, see also -w\n");
+ fprintf(bcftools_stderr, " -r, --regions REGION Restrict to comma-separated list of regions\n");
+ fprintf(bcftools_stderr, " -R, --regions-file FILE Restrict to regions listed in a file\n");
+ fprintf(bcftools_stderr, " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n");
+ fprintf(bcftools_stderr, " -t, --targets REGION Similar to -r but streams rather than index-jumps\n");
+ fprintf(bcftools_stderr, " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n");
+ fprintf(bcftools_stderr, " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n");
+ fprintf(bcftools_stderr, " --threads INT Use multithreading with <int> worker threads [0]\n");
+ fprintf(bcftools_stderr, " -w, --write LIST List of files to write with -p given as 1-based indexes. By default, all files are written\n");
fprintf(bcftools_stderr, "\n");
fprintf(bcftools_stderr, "Examples:\n");
fprintf(bcftools_stderr, " # Create intersection and complements of two sets saving the output in dir/*\n");
args->output_type = FT_VCF;
args->n_threads = 0;
args->record_cmd_line = 1;
+ args->clevel = -1;
int targets_is_file = 0, regions_is_file = 0;
+ int regions_overlap = 1;
+ int targets_overlap = 0;
static struct option loptions[] =
{
{"write",required_argument,NULL,'w'},
{"targets",required_argument,NULL,'t'},
{"targets-file",required_argument,NULL,'T'},
+ {"targets-overlap",required_argument,NULL,4},
{"regions",required_argument,NULL,'r'},
{"regions-file",required_argument,NULL,'R'},
+ {"regions-overlap",required_argument,NULL,3},
{"output",required_argument,NULL,'o'},
{"output-type",required_argument,NULL,'O'},
{"threads",required_argument,NULL,9},
{"no-version",no_argument,NULL,8},
{NULL,0,NULL,0}
};
+ char *tmp;
while ((c = getopt_long(argc, argv, "hc:r:R:p:n:w:t:T:Cf:o:O:i:e:",loptions,NULL)) >= 0) {
switch (c) {
case 'o': args->output_fname = optarg; break;
case 'u': args->output_type = FT_BCF; break;
case 'z': args->output_type = FT_VCF_GZ; break;
case 'v': args->output_type = FT_VCF; break;
- default: error("The output type \"%s\" not recognised\n", optarg);
+ default:
+ {
+ args->clevel = strtol(optarg,&tmp,10);
+ if ( *tmp || args->clevel<0 || args->clevel>9 ) error("The output type \"%s\" not recognised\n", optarg);
+ }
+ }
+ if ( optarg[1] )
+ {
+ args->clevel = strtol(optarg+1,&tmp,10);
+ if ( *tmp || args->clevel<0 || args->clevel>9 ) error("Could not parse argument: --compression-level %s\n", optarg+1);
}
break;
case 'c':
else if ( sscanf(p,"%d",&args->isec_n)!=1 ) error("Could not parse --nfiles %s\n", optarg);
}
break;
+ case 3 :
+ if ( !strcasecmp(optarg,"0") ) regions_overlap = 0;
+ else if ( !strcasecmp(optarg,"1") ) regions_overlap = 1;
+ else if ( !strcasecmp(optarg,"2") ) regions_overlap = 2;
+ else error("Could not parse: --regions-overlap %s\n",optarg);
+ break;
+ case 4 :
+ if ( !strcasecmp(optarg,"0") ) targets_overlap = 0;
+ else if ( !strcasecmp(optarg,"1") ) targets_overlap = 1;
+ else if ( !strcasecmp(optarg,"2") ) targets_overlap = 2;
+ else error("Could not parse: --targets-overlap %s\n",optarg);
+ break;
case 9 : args->n_threads = strtol(optarg, 0, 0); break;
case 8 : args->record_cmd_line = 0; break;
case 'h':
}
}
if ( argc-optind<1 ) usage(); // no file given
- if ( args->targets_list && bcf_sr_set_targets(args->files, args->targets_list, targets_is_file,0)<0 )
- error("Failed to read the targets: %s\n", args->targets_list);
- if ( args->regions_list && bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 )
- error("Failed to read the regions: %s\n", args->regions_list);
+ if ( args->targets_list )
+ {
+ bcf_sr_set_opt(args->files,BCF_SR_TARGETS_OVERLAP,targets_overlap);
+ if ( bcf_sr_set_targets(args->files, args->targets_list, targets_is_file,0)<0 )
+ error("Failed to read the targets: %s\n", args->targets_list);
+ }
+ if ( args->regions_list )
+ {
+ bcf_sr_set_opt(args->files,BCF_SR_REGIONS_OVERLAP,regions_overlap);
+ if ( bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 )
+ error("Failed to read the regions: %s\n", args->regions_list);
+ }
if ( argc-optind==2 && !args->isec_op )
{
args->isec_op = OP_VENN;
htsFile *out_fh;
bcf_hdr_t *out_hdr;
char **argv;
- int argc, n_threads, record_cmd_line;
+ int argc, n_threads, record_cmd_line, clevel;
int local_alleles; // the value of -L option
+ int keep_AC_AN;
}
args_t;
if ( str.l ) kputc(',',&str);
kputs("IMF:max",&str);
}
+ if ( !bcf_hdr_nsamples(args->out_hdr) )
+ {
+ if ( bcf_hdr_idinfo_exists(args->out_hdr,BCF_HL_INFO,bcf_hdr_id2int(args->out_hdr, BCF_DT_ID, "AN")) )
+ {
+ if ( str.l ) kputc(',',&str);
+ kputs("AN:sum",&str);
+ }
+ if ( bcf_hdr_idinfo_exists(args->out_hdr,BCF_HL_INFO,bcf_hdr_id2int(args->out_hdr, BCF_DT_ID, "AC")) )
+ {
+ if ( str.l ) kputc(',',&str);
+ kputs("AC:sum",&str);
+ }
+ }
if ( !str.l ) return;
args->info_rules = str.s;
else if ( rule->type==BCF_HT_STR ) rule->type_size = sizeof(char);
else error("The INFO rule \"%s\" is not supported; the tag \"%s\" type is %d\n", ss,rule->hdr_tag,rule->type);
+ if ( !strcmp(rule->hdr_tag,"AC") || !strcmp(rule->hdr_tag,"AN") ) args->keep_AC_AN = 1;
+
ss = strchr(ss, '\0'); ss++;
if ( !*ss ) error("Could not parse INFO rules, missing logic of \"%s\"\n", rule->hdr_tag);
* @param src: source string
* @param isrc: index of the field to copy
* @param src_len: length of source string (excluding the terminating \0)
- * @param dst: destination kstring (must be initialized)
+ * @param dst: destination kstring (must be initialized with missing values, e.g. as ".")
* @param idst: index of the destination field
*/
int copy_string_field(char *src, int isrc, int src_len, kstring_t *dst, int idst)
bcf_info_t *inf = &line->d.info[j];
const char *key = hdr->id[BCF_DT_ID][inf->key].key;
- if ( !strcmp("AC",key) || !strcmp("AN",key) ) continue; // AC and AN are done in merge_format() after genotypes are done
+ if ( !args->keep_AC_AN && (!strcmp("AC",key) || !strcmp("AN",key)) ) continue; // AC and AN are done in merge_format() after genotypes are done
int id = bcf_hdr_id2int(out_hdr, BCF_DT_ID, key);
if ( id==-1 ) error("Error: The INFO field is not defined in the header: %s\n", key);
out->n_sample = bcf_hdr_nsamples(out_hdr);
if ( has_GT )
merge_GT(args, ma->fmt_map, out);
- update_AN_AC(out_hdr, out);
+ if ( !args->keep_AC_AN )
+ update_AN_AC(out_hdr, out);
for (i=1; i<=max_ifmt; i++)
merge_format_field(args, &ma->fmt_map[i*files->nreaders], out);
}
void merge_vcf(args_t *args)
{
- args->out_fh = hts_open(args->output_fname, hts_bcf_wmode2(args->output_type,args->output_fname));
+ char wmode[8];
+ set_wmode(wmode,args->output_type,args->output_fname,args->clevel);
+ args->out_fh = hts_open(args->output_fname ? args->output_fname : "-", wmode);
if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno));
if ( args->n_threads ) hts_set_opt(args->out_fh, HTS_OPT_THREAD_POOL, args->files->p); //hts_set_threads(args->out_fh, args->n_threads);
args->out_hdr = bcf_hdr_init("w");
fprintf(stderr, "Usage: bcftools merge [options] <A.vcf.gz> <B.vcf.gz> [...]\n");
fprintf(stderr, "\n");
fprintf(stderr, "Options:\n");
- fprintf(stderr, " --force-samples resolve duplicate sample names\n");
- fprintf(stderr, " --print-header print only the merged header and exit\n");
- fprintf(stderr, " --use-header <file> use the provided header\n");
- fprintf(stderr, " -0 --missing-to-ref assume genotypes at missing sites are 0/0\n");
- fprintf(stderr, " -f, --apply-filters <list> require at least one of the listed FILTER strings (e.g. \"PASS,.\")\n");
- fprintf(stderr, " -F, --filter-logic <x|+> remove filters if some input is PASS (\"x\"), or apply all filters (\"+\") [+]\n");
- fprintf(stderr, " -g, --gvcf <-|ref.fa> merge gVCF blocks, INFO/END tag is expected. Implies -i QS:sum,MinDP:min,I16:sum,IDV:max,IMF:max\n");
- fprintf(stderr, " -i, --info-rules <tag:method,..> rules for merging INFO fields (method is one of sum,avg,min,max,join) or \"-\" to turn off the default [DP:sum,DP4:sum]\n");
- fprintf(stderr, " -l, --file-list <file> read file names from the file\n");
- fprintf(stderr, " -L, --local-alleles <int> EXPERIMENTAL: if more than <int> ALT alleles are encountered, drop FMT/PL and output LAA+LPL instead; 0=unlimited [0]\n");
- fprintf(stderr, " -m, --merge <string> allow multiallelic records for <snps|indels|both|all|none|id>, see man page for details [both]\n");
- fprintf(stderr, " --no-index merge unindexed files, the same chromosomal order is required and -r/-R are not allowed\n");
- fprintf(stderr, " --no-version do not append version and command line to the header\n");
- fprintf(stderr, " -o, --output <file> write output to a file [standard output]\n");
- fprintf(stderr, " -O, --output-type <b|u|z|v> 'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]\n");
- fprintf(stderr, " -r, --regions <region> restrict to comma-separated list of regions\n");
- fprintf(stderr, " -R, --regions-file <file> restrict to regions listed in a file\n");
- fprintf(stderr, " --threads <int> use multithreading with <int> worker threads [0]\n");
+ fprintf(stderr, " --force-samples Resolve duplicate sample names\n");
+ fprintf(stderr, " --print-header Print only the merged header and exit\n");
+ fprintf(stderr, " --use-header FILE Use the provided header\n");
+ fprintf(stderr, " -0 --missing-to-ref Assume genotypes at missing sites are 0/0\n");
+ fprintf(stderr, " -f, --apply-filters LIST Require at least one of the listed FILTER strings (e.g. \"PASS,.\")\n");
+ fprintf(stderr, " -F, --filter-logic x|+ Remove filters if some input is PASS (\"x\"), or apply all filters (\"+\") [+]\n");
+ fprintf(stderr, " -g, --gvcf -|REF.FA Merge gVCF blocks, INFO/END tag is expected. Implies -i QS:sum,MinDP:min,I16:sum,IDV:max,IMF:max\n");
+ fprintf(stderr, " -i, --info-rules TAG:METHOD,.. Rules for merging INFO fields (method is one of sum,avg,min,max,join) or \"-\" to turn off the default [DP:sum,DP4:sum]\n");
+ fprintf(stderr, " -l, --file-list FILE Read file names from the file\n");
+ fprintf(stderr, " -L, --local-alleles INT EXPERIMENTAL: if more than <int> ALT alleles are encountered, drop FMT/PL and output LAA+LPL instead; 0=unlimited [0]\n");
+ fprintf(stderr, " -m, --merge STRING Allow multiallelic records for <snps|indels|both|all|none|id>, see man page for details [both]\n");
+ fprintf(stderr, " --no-index Merge unindexed files, the same chromosomal order is required and -r/-R are not allowed\n");
+ fprintf(stderr, " --no-version Do not append version and command line to the header\n");
+ fprintf(stderr, " -o, --output FILE Write output to a file [standard output]\n");
+ fprintf(stderr, " -O, --output-type u|b|v|z[0-9] u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level [v]\n");
+ fprintf(stderr, " -r, --regions REGION Restrict to comma-separated list of regions\n");
+ fprintf(stderr, " -R, --regions-file FILE Restrict to regions listed in a file\n");
+ fprintf(stderr, " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n");
+ fprintf(stderr, " --threads INT Use multithreading with <int> worker threads [0]\n");
fprintf(stderr, "\n");
exit(1);
}
args->n_threads = 0;
args->record_cmd_line = 1;
args->collapse = COLLAPSE_BOTH;
+ args->clevel = -1;
int regions_is_file = 0;
+ int regions_overlap = 1;
static struct option loptions[] =
{
{"threads",required_argument,NULL,9},
{"regions",required_argument,NULL,'r'},
{"regions-file",required_argument,NULL,'R'},
+ {"regions-overlap",required_argument,NULL,4},
{"info-rules",required_argument,NULL,'i'},
{"no-version",no_argument,NULL,8},
{"no-index",no_argument,NULL,10},
case 'u': args->output_type = FT_BCF; break;
case 'z': args->output_type = FT_VCF_GZ; break;
case 'v': args->output_type = FT_VCF; break;
- default: error("The output type \"%s\" not recognised\n", optarg);
+ default:
+ {
+ args->clevel = strtol(optarg,&tmp,10);
+ if ( *tmp || args->clevel<0 || args->clevel>9 ) error("The output type \"%s\" not recognised\n", optarg);
+ }
+ }
+ if ( optarg[1] )
+ {
+ args->clevel = strtol(optarg+1,&tmp,10);
+ if ( *tmp || args->clevel<0 || args->clevel>9 ) error("Could not parse argument: --compression-level %s\n", optarg+1);
}
break;
case 'm':
case 1 : args->header_fname = optarg; break;
case 2 : args->header_only = 1; break;
case 3 : args->force_samples = 1; break;
+ case 4 :
+ if ( !strcasecmp(optarg,"0") ) regions_overlap = 0;
+ else if ( !strcasecmp(optarg,"1") ) regions_overlap = 1;
+ else if ( !strcasecmp(optarg,"2") ) regions_overlap = 2;
+ else error("Could not parse: --regions-overlap %s\n",optarg);
+ break;
case 9 : args->n_threads = strtol(optarg, 0, 0); break;
case 8 : args->record_cmd_line = 0; break;
case 10 : args->no_index = 1; break;
bcf_sr_set_opt(args->files,BCF_SR_REQUIRE_IDX);
if ( args->regions_list )
{
+ bcf_sr_set_opt(args->files,BCF_SR_REGIONS_OVERLAP,regions_overlap);
if ( bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 )
error("Failed to read the regions: %s\n", args->regions_list);
if ( regions_is_file )
htsFile *out_fh;
bcf_hdr_t *out_hdr;
char **argv;
- int argc, n_threads, record_cmd_line;
+ int argc, n_threads, record_cmd_line, clevel;
int local_alleles; // the value of -L option
+ int keep_AC_AN;
}
args_t;
if ( str.l ) kputc(',',&str);
kputs("IMF:max",&str);
}
+ if ( !bcf_hdr_nsamples(args->out_hdr) )
+ {
+ if ( bcf_hdr_idinfo_exists(args->out_hdr,BCF_HL_INFO,bcf_hdr_id2int(args->out_hdr, BCF_DT_ID, "AN")) )
+ {
+ if ( str.l ) kputc(',',&str);
+ kputs("AN:sum",&str);
+ }
+ if ( bcf_hdr_idinfo_exists(args->out_hdr,BCF_HL_INFO,bcf_hdr_id2int(args->out_hdr, BCF_DT_ID, "AC")) )
+ {
+ if ( str.l ) kputc(',',&str);
+ kputs("AC:sum",&str);
+ }
+ }
if ( !str.l ) return;
args->info_rules = str.s;
else if ( rule->type==BCF_HT_STR ) rule->type_size = sizeof(char);
else error("The INFO rule \"%s\" is not supported; the tag \"%s\" type is %d\n", ss,rule->hdr_tag,rule->type);
+ if ( !strcmp(rule->hdr_tag,"AC") || !strcmp(rule->hdr_tag,"AN") ) args->keep_AC_AN = 1;
+
ss = strchr(ss, '\0'); ss++;
if ( !*ss ) error("Could not parse INFO rules, missing logic of \"%s\"\n", rule->hdr_tag);
* @param src: source string
* @param isrc: index of the field to copy
* @param src_len: length of source string (excluding the terminating \0)
- * @param dst: destination kstring (must be initialized)
+ * @param dst: destination kstring (must be initialized with missing values, e.g. as ".")
* @param idst: index of the destination field
*/
int copy_string_field(char *src, int isrc, int src_len, kstring_t *dst, int idst)
bcf_info_t *inf = &line->d.info[j];
const char *key = hdr->id[BCF_DT_ID][inf->key].key;
- if ( !strcmp("AC",key) || !strcmp("AN",key) ) continue; // AC and AN are done in merge_format() after genotypes are done
+ if ( !args->keep_AC_AN && (!strcmp("AC",key) || !strcmp("AN",key)) ) continue; // AC and AN are done in merge_format() after genotypes are done
int id = bcf_hdr_id2int(out_hdr, BCF_DT_ID, key);
if ( id==-1 ) error("Error: The INFO field is not defined in the header: %s\n", key);
out->n_sample = bcf_hdr_nsamples(out_hdr);
if ( has_GT )
merge_GT(args, ma->fmt_map, out);
- update_AN_AC(out_hdr, out);
+ if ( !args->keep_AC_AN )
+ update_AN_AC(out_hdr, out);
for (i=1; i<=max_ifmt; i++)
merge_format_field(args, &ma->fmt_map[i*files->nreaders], out);
}
void merge_vcf(args_t *args)
{
- args->out_fh = hts_open(args->output_fname, hts_bcf_wmode2(args->output_type,args->output_fname));
+ char wmode[8];
+ set_wmode(wmode,args->output_type,args->output_fname,args->clevel);
+ args->out_fh = hts_open(args->output_fname ? args->output_fname : "-", wmode);
if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno));
if ( args->n_threads ) hts_set_opt(args->out_fh, HTS_OPT_THREAD_POOL, args->files->p); //hts_set_threads(args->out_fh, args->n_threads);
args->out_hdr = bcf_hdr_init("w");
fprintf(bcftools_stderr, "Usage: bcftools merge [options] <A.vcf.gz> <B.vcf.gz> [...]\n");
fprintf(bcftools_stderr, "\n");
fprintf(bcftools_stderr, "Options:\n");
- fprintf(bcftools_stderr, " --force-samples resolve duplicate sample names\n");
- fprintf(bcftools_stderr, " --print-header print only the merged header and exit\n");
- fprintf(bcftools_stderr, " --use-header <file> use the provided header\n");
- fprintf(bcftools_stderr, " -0 --missing-to-ref assume genotypes at missing sites are 0/0\n");
- fprintf(bcftools_stderr, " -f, --apply-filters <list> require at least one of the listed FILTER strings (e.g. \"PASS,.\")\n");
- fprintf(bcftools_stderr, " -F, --filter-logic <x|+> remove filters if some input is PASS (\"x\"), or apply all filters (\"+\") [+]\n");
- fprintf(bcftools_stderr, " -g, --gvcf <-|ref.fa> merge gVCF blocks, INFO/END tag is expected. Implies -i QS:sum,MinDP:min,I16:sum,IDV:max,IMF:max\n");
- fprintf(bcftools_stderr, " -i, --info-rules <tag:method,..> rules for merging INFO fields (method is one of sum,avg,min,max,join) or \"-\" to turn off the default [DP:sum,DP4:sum]\n");
- fprintf(bcftools_stderr, " -l, --file-list <file> read file names from the file\n");
- fprintf(bcftools_stderr, " -L, --local-alleles <int> EXPERIMENTAL: if more than <int> ALT alleles are encountered, drop FMT/PL and output LAA+LPL instead; 0=unlimited [0]\n");
- fprintf(bcftools_stderr, " -m, --merge <string> allow multiallelic records for <snps|indels|both|all|none|id>, see man page for details [both]\n");
- fprintf(bcftools_stderr, " --no-index merge unindexed files, the same chromosomal order is required and -r/-R are not allowed\n");
- fprintf(bcftools_stderr, " --no-version do not append version and command line to the header\n");
- fprintf(bcftools_stderr, " -o, --output <file> write output to a file [standard output]\n");
- fprintf(bcftools_stderr, " -O, --output-type <b|u|z|v> 'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]\n");
- fprintf(bcftools_stderr, " -r, --regions <region> restrict to comma-separated list of regions\n");
- fprintf(bcftools_stderr, " -R, --regions-file <file> restrict to regions listed in a file\n");
- fprintf(bcftools_stderr, " --threads <int> use multithreading with <int> worker threads [0]\n");
+ fprintf(bcftools_stderr, " --force-samples Resolve duplicate sample names\n");
+ fprintf(bcftools_stderr, " --print-header Print only the merged header and exit\n");
+ fprintf(bcftools_stderr, " --use-header FILE Use the provided header\n");
+ fprintf(bcftools_stderr, " -0 --missing-to-ref Assume genotypes at missing sites are 0/0\n");
+ fprintf(bcftools_stderr, " -f, --apply-filters LIST Require at least one of the listed FILTER strings (e.g. \"PASS,.\")\n");
+ fprintf(bcftools_stderr, " -F, --filter-logic x|+ Remove filters if some input is PASS (\"x\"), or apply all filters (\"+\") [+]\n");
+ fprintf(bcftools_stderr, " -g, --gvcf -|REF.FA Merge gVCF blocks, INFO/END tag is expected. Implies -i QS:sum,MinDP:min,I16:sum,IDV:max,IMF:max\n");
+ fprintf(bcftools_stderr, " -i, --info-rules TAG:METHOD,.. Rules for merging INFO fields (method is one of sum,avg,min,max,join) or \"-\" to turn off the default [DP:sum,DP4:sum]\n");
+ fprintf(bcftools_stderr, " -l, --file-list FILE Read file names from the file\n");
+ fprintf(bcftools_stderr, " -L, --local-alleles INT EXPERIMENTAL: if more than <int> ALT alleles are encountered, drop FMT/PL and output LAA+LPL instead; 0=unlimited [0]\n");
+ fprintf(bcftools_stderr, " -m, --merge STRING Allow multiallelic records for <snps|indels|both|all|none|id>, see man page for details [both]\n");
+ fprintf(bcftools_stderr, " --no-index Merge unindexed files, the same chromosomal order is required and -r/-R are not allowed\n");
+ fprintf(bcftools_stderr, " --no-version Do not append version and command line to the header\n");
+ fprintf(bcftools_stderr, " -o, --output FILE Write output to a file [standard output]\n");
+ fprintf(bcftools_stderr, " -O, --output-type u|b|v|z[0-9] u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level [v]\n");
+ fprintf(bcftools_stderr, " -r, --regions REGION Restrict to comma-separated list of regions\n");
+ fprintf(bcftools_stderr, " -R, --regions-file FILE Restrict to regions listed in a file\n");
+ fprintf(bcftools_stderr, " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n");
+ fprintf(bcftools_stderr, " --threads INT Use multithreading with <int> worker threads [0]\n");
fprintf(bcftools_stderr, "\n");
bcftools_exit(1);
}
args->n_threads = 0;
args->record_cmd_line = 1;
args->collapse = COLLAPSE_BOTH;
+ args->clevel = -1;
int regions_is_file = 0;
+ int regions_overlap = 1;
static struct option loptions[] =
{
{"threads",required_argument,NULL,9},
{"regions",required_argument,NULL,'r'},
{"regions-file",required_argument,NULL,'R'},
+ {"regions-overlap",required_argument,NULL,4},
{"info-rules",required_argument,NULL,'i'},
{"no-version",no_argument,NULL,8},
{"no-index",no_argument,NULL,10},
case 'u': args->output_type = FT_BCF; break;
case 'z': args->output_type = FT_VCF_GZ; break;
case 'v': args->output_type = FT_VCF; break;
- default: error("The output type \"%s\" not recognised\n", optarg);
+ default:
+ {
+ args->clevel = strtol(optarg,&tmp,10);
+ if ( *tmp || args->clevel<0 || args->clevel>9 ) error("The output type \"%s\" not recognised\n", optarg);
+ }
+ }
+ if ( optarg[1] )
+ {
+ args->clevel = strtol(optarg+1,&tmp,10);
+ if ( *tmp || args->clevel<0 || args->clevel>9 ) error("Could not parse argument: --compression-level %s\n", optarg+1);
}
break;
case 'm':
case 1 : args->header_fname = optarg; break;
case 2 : args->header_only = 1; break;
case 3 : args->force_samples = 1; break;
+ case 4 :
+ if ( !strcasecmp(optarg,"0") ) regions_overlap = 0;
+ else if ( !strcasecmp(optarg,"1") ) regions_overlap = 1;
+ else if ( !strcasecmp(optarg,"2") ) regions_overlap = 2;
+ else error("Could not parse: --regions-overlap %s\n",optarg);
+ break;
case 9 : args->n_threads = strtol(optarg, 0, 0); break;
case 8 : args->record_cmd_line = 0; break;
case 10 : args->no_index = 1; break;
bcf_sr_set_opt(args->files,BCF_SR_REQUIRE_IDX);
if ( args->regions_list )
{
+ bcf_sr_set_opt(args->files,BCF_SR_REGIONS_OVERLAP,regions_overlap);
if ( bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 )
error("Failed to read the regions: %s\n", args->regions_list);
if ( regions_is_file )
faidx_t *fai;
struct { int tot, set, swap; } nref;
char **argv, *output_fname, *ref_fname, *vcf_fname, *region, *targets;
- int argc, rmdup, output_type, n_threads, check_ref, strict_filter, do_indels;
+ int argc, rmdup, output_type, n_threads, check_ref, strict_filter, do_indels, clevel;
int nchanged, nskipped, nsplit, ntotal, mrows_op, mrows_collapse, parsimonious;
int record_cmd_line, force, force_warned, keep_sum_ad;
abuf_t *abuf;
int nsmpl = bcf_hdr_nsamples(args->hdr);
ngts /= nsmpl;
- int i, j, k;
+ int i, j, k,k2;
for (i=1; i<nlines; i++)
{
int ntmp2 = args->ntmp_arr2 / 4;
int32_t *gt2 = (int32_t*) args->tmp_arr2;
for (j=0; j<nsmpl; j++)
{
- for (k=0; k<ngts; k++)
+ for (k2=0; k2<ngts2; k2++)
{
- if ( gt2[k]==bcf_int32_vector_end ) break;
- if ( bcf_gt_is_missing(gt2[k]) || bcf_gt_allele(gt2[k])==0 ) continue;
- if ( gt2[k]==0 ) gt[k] = 0; // missing genotype
- else
+ if ( gt2[k2]==bcf_int32_vector_end ) break;
+ if ( bcf_gt_is_missing(gt2[k2]) ) continue;
+ int ial2 = bcf_gt_allele(gt2[k2]);
+ if ( ial2==0 ) continue; // never overwrite with ref
+ if ( ial2>=args->maps[i].nals ) error("Error at %s:%"PRId64": incorrect allele index %d\n",bcf_seqname(args->hdr,lines[i]),(int64_t) lines[i]->pos+1,ial2);
+ int ial = args->maps[i].map[ial2];
+ for (k=0; k<ngts; k++)
+ if ( gt[k]==bcf_int32_vector_end || bcf_gt_is_missing(gt[k]) || !bcf_gt_allele(gt[k]) ) break;
+ if ( k<ngts )
{
- int ial = bcf_gt_allele(gt2[k]);
- if ( ial>=args->maps[i].nals ) error("Error at %s:%"PRId64": incorrect allele index %d\n",bcf_seqname(args->hdr,lines[i]),(int64_t) lines[i]->pos+1,ial);
- gt[k] = bcf_gt_unphased( args->maps[i].map[ial] ) | bcf_gt_is_phased(gt[k]);
+ gt[k] = bcf_gt_unphased(ial);
}
}
gt += ngts;
}
static void normalize_vcf(args_t *args)
{
- args->out = hts_open(args->output_fname, hts_bcf_wmode2(args->output_type,args->output_fname));
+ char wmode[8];
+ set_wmode(wmode,args->output_type,args->output_fname,args->clevel);
+ args->out = hts_open(args->output_fname ? args->output_fname : "-", wmode);
if ( args->out == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno));
if ( args->n_threads )
hts_set_opt(args->out, HTS_OPT_THREAD_POOL, args->files->p);
fprintf(stderr, " -N, --do-not-normalize Do not normalize indels (with -m or -c s)\n");
fprintf(stderr, " --old-rec-tag STR Annotate modified records with INFO/STR indicating the original variant\n");
fprintf(stderr, " -o, --output FILE Write output to a file [standard output]\n");
- fprintf(stderr, " -O, --output-type TYPE 'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]\n");
+ fprintf(stderr, " -O, --output-type u|b|v|z[0-9] u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level [v]\n");
fprintf(stderr, " -r, --regions REGION Restrict to comma-separated list of regions\n");
fprintf(stderr, " -R, --regions-file FILE Restrict to regions listed in a file\n");
+ fprintf(stderr, " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n");
fprintf(stderr, " -s, --strict-filter When merging (-m+), merged site is PASS only if all sites being merged PASS\n");
fprintf(stderr, " -t, --targets REGION Similar to -r but streams rather than index-jumps\n");
fprintf(stderr, " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n");
+ fprintf(stderr, " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n");
fprintf(stderr, " --threads INT Use multithreading with <int> worker threads [0]\n");
fprintf(stderr, " -w, --site-win INT Buffer for sorting lines which changed position during realignment [1000]\n");
fprintf(stderr, "\n");
args->buf_win = 1000;
args->mrows_collapse = COLLAPSE_BOTH;
args->do_indels = 1;
+ args->clevel = -1;
int region_is_file = 0;
int targets_is_file = 0;
args->use_star_allele = 1;
+ int regions_overlap = 1;
+ int targets_overlap = 0;
static struct option loptions[] =
{
{"multiallelics",required_argument,NULL,'m'},
{"regions",required_argument,NULL,'r'},
{"regions-file",required_argument,NULL,'R'},
+ {"regions-overlap",required_argument,NULL,1},
{"targets",required_argument,NULL,'t'},
{"targets-file",required_argument,NULL,'T'},
+ {"targets-overlap",required_argument,NULL,2},
{"site-win",required_argument,NULL,'w'},
{"remove-duplicates",no_argument,NULL,'D'},
{"rm-dup",required_argument,NULL,'d'},
case 'u': args->output_type = FT_BCF; break;
case 'z': args->output_type = FT_VCF_GZ; break;
case 'v': args->output_type = FT_VCF; break;
- default: error("The output type \"%s\" not recognised\n", optarg);
+ default:
+ {
+ args->clevel = strtol(optarg,&tmp,10);
+ if ( *tmp || args->clevel<0 || args->clevel>9 ) error("The output type \"%s\" not recognised\n", optarg);
+ }
+ }
+ if ( optarg[1] )
+ {
+ args->clevel = strtol(optarg+1,&tmp,10);
+ if ( *tmp || args->clevel<0 || args->clevel>9 ) error("Could not parse argument: --compression-level %s\n", optarg+1);
}
break;
case 'o': args->output_fname = optarg; break;
case 9 : args->n_threads = strtol(optarg, 0, 0); break;
case 8 : args->record_cmd_line = 0; break;
case 7 : args->force = 1; break;
+ case 1 :
+ if ( !strcasecmp(optarg,"0") ) regions_overlap = 0;
+ else if ( !strcasecmp(optarg,"1") ) regions_overlap = 1;
+ else if ( !strcasecmp(optarg,"2") ) regions_overlap = 2;
+ else error("Could not parse: --regions-overlap %s\n",optarg);
+ break;
+ case 2 :
+ if ( !strcasecmp(optarg,"0") ) targets_overlap = 0;
+ else if ( !strcasecmp(optarg,"1") ) targets_overlap = 1;
+ else if ( !strcasecmp(optarg,"2") ) targets_overlap = 2;
+ else error("Could not parse: --targets-overlap %s\n",optarg);
+ break;
case 'h':
case '?': usage(); break;
default: error("Unknown argument: %s\n", optarg);
if ( args->region )
{
+ bcf_sr_set_opt(args->files,BCF_SR_REGIONS_OVERLAP,regions_overlap);
if ( bcf_sr_set_regions(args->files, args->region,region_is_file)<0 )
error("Failed to read the regions: %s\n", args->region);
}
if ( args->targets )
{
+ bcf_sr_set_opt(args->files,BCF_SR_TARGETS_OVERLAP,targets_overlap);
if ( bcf_sr_set_targets(args->files, args->targets,targets_is_file, 0)<0 )
error("Failed to read the targets: %s\n", args->targets);
}
faidx_t *fai;
struct { int tot, set, swap; } nref;
char **argv, *output_fname, *ref_fname, *vcf_fname, *region, *targets;
- int argc, rmdup, output_type, n_threads, check_ref, strict_filter, do_indels;
+ int argc, rmdup, output_type, n_threads, check_ref, strict_filter, do_indels, clevel;
int nchanged, nskipped, nsplit, ntotal, mrows_op, mrows_collapse, parsimonious;
int record_cmd_line, force, force_warned, keep_sum_ad;
abuf_t *abuf;
int nsmpl = bcf_hdr_nsamples(args->hdr);
ngts /= nsmpl;
- int i, j, k;
+ int i, j, k,k2;
for (i=1; i<nlines; i++)
{
int ntmp2 = args->ntmp_arr2 / 4;
int32_t *gt2 = (int32_t*) args->tmp_arr2;
for (j=0; j<nsmpl; j++)
{
- for (k=0; k<ngts; k++)
+ for (k2=0; k2<ngts2; k2++)
{
- if ( gt2[k]==bcf_int32_vector_end ) break;
- if ( bcf_gt_is_missing(gt2[k]) || bcf_gt_allele(gt2[k])==0 ) continue;
- if ( gt2[k]==0 ) gt[k] = 0; // missing genotype
- else
+ if ( gt2[k2]==bcf_int32_vector_end ) break;
+ if ( bcf_gt_is_missing(gt2[k2]) ) continue;
+ int ial2 = bcf_gt_allele(gt2[k2]);
+ if ( ial2==0 ) continue; // never overwrite with ref
+ if ( ial2>=args->maps[i].nals ) error("Error at %s:%"PRId64": incorrect allele index %d\n",bcf_seqname(args->hdr,lines[i]),(int64_t) lines[i]->pos+1,ial2);
+ int ial = args->maps[i].map[ial2];
+ for (k=0; k<ngts; k++)
+ if ( gt[k]==bcf_int32_vector_end || bcf_gt_is_missing(gt[k]) || !bcf_gt_allele(gt[k]) ) break;
+ if ( k<ngts )
{
- int ial = bcf_gt_allele(gt2[k]);
- if ( ial>=args->maps[i].nals ) error("Error at %s:%"PRId64": incorrect allele index %d\n",bcf_seqname(args->hdr,lines[i]),(int64_t) lines[i]->pos+1,ial);
- gt[k] = bcf_gt_unphased( args->maps[i].map[ial] ) | bcf_gt_is_phased(gt[k]);
+ gt[k] = bcf_gt_unphased(ial);
}
}
gt += ngts;
}
static void normalize_vcf(args_t *args)
{
- args->out = hts_open(args->output_fname, hts_bcf_wmode2(args->output_type,args->output_fname));
+ char wmode[8];
+ set_wmode(wmode,args->output_type,args->output_fname,args->clevel);
+ args->out = hts_open(args->output_fname ? args->output_fname : "-", wmode);
if ( args->out == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno));
if ( args->n_threads )
hts_set_opt(args->out, HTS_OPT_THREAD_POOL, args->files->p);
fprintf(bcftools_stderr, " -N, --do-not-normalize Do not normalize indels (with -m or -c s)\n");
fprintf(bcftools_stderr, " --old-rec-tag STR Annotate modified records with INFO/STR indicating the original variant\n");
fprintf(bcftools_stderr, " -o, --output FILE Write output to a file [standard output]\n");
- fprintf(bcftools_stderr, " -O, --output-type TYPE 'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]\n");
+ fprintf(bcftools_stderr, " -O, --output-type u|b|v|z[0-9] u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level [v]\n");
fprintf(bcftools_stderr, " -r, --regions REGION Restrict to comma-separated list of regions\n");
fprintf(bcftools_stderr, " -R, --regions-file FILE Restrict to regions listed in a file\n");
+ fprintf(bcftools_stderr, " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n");
fprintf(bcftools_stderr, " -s, --strict-filter When merging (-m+), merged site is PASS only if all sites being merged PASS\n");
fprintf(bcftools_stderr, " -t, --targets REGION Similar to -r but streams rather than index-jumps\n");
fprintf(bcftools_stderr, " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n");
+ fprintf(bcftools_stderr, " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n");
fprintf(bcftools_stderr, " --threads INT Use multithreading with <int> worker threads [0]\n");
fprintf(bcftools_stderr, " -w, --site-win INT Buffer for sorting lines which changed position during realignment [1000]\n");
fprintf(bcftools_stderr, "\n");
args->buf_win = 1000;
args->mrows_collapse = COLLAPSE_BOTH;
args->do_indels = 1;
+ args->clevel = -1;
int region_is_file = 0;
int targets_is_file = 0;
args->use_star_allele = 1;
+ int regions_overlap = 1;
+ int targets_overlap = 0;
static struct option loptions[] =
{
{"multiallelics",required_argument,NULL,'m'},
{"regions",required_argument,NULL,'r'},
{"regions-file",required_argument,NULL,'R'},
+ {"regions-overlap",required_argument,NULL,1},
{"targets",required_argument,NULL,'t'},
{"targets-file",required_argument,NULL,'T'},
+ {"targets-overlap",required_argument,NULL,2},
{"site-win",required_argument,NULL,'w'},
{"remove-duplicates",no_argument,NULL,'D'},
{"rm-dup",required_argument,NULL,'d'},
case 'u': args->output_type = FT_BCF; break;
case 'z': args->output_type = FT_VCF_GZ; break;
case 'v': args->output_type = FT_VCF; break;
- default: error("The output type \"%s\" not recognised\n", optarg);
+ default:
+ {
+ args->clevel = strtol(optarg,&tmp,10);
+ if ( *tmp || args->clevel<0 || args->clevel>9 ) error("The output type \"%s\" not recognised\n", optarg);
+ }
+ }
+ if ( optarg[1] )
+ {
+ args->clevel = strtol(optarg+1,&tmp,10);
+ if ( *tmp || args->clevel<0 || args->clevel>9 ) error("Could not parse argument: --compression-level %s\n", optarg+1);
}
break;
case 'o': args->output_fname = optarg; break;
case 9 : args->n_threads = strtol(optarg, 0, 0); break;
case 8 : args->record_cmd_line = 0; break;
case 7 : args->force = 1; break;
+ case 1 :
+ if ( !strcasecmp(optarg,"0") ) regions_overlap = 0;
+ else if ( !strcasecmp(optarg,"1") ) regions_overlap = 1;
+ else if ( !strcasecmp(optarg,"2") ) regions_overlap = 2;
+ else error("Could not parse: --regions-overlap %s\n",optarg);
+ break;
+ case 2 :
+ if ( !strcasecmp(optarg,"0") ) targets_overlap = 0;
+ else if ( !strcasecmp(optarg,"1") ) targets_overlap = 1;
+ else if ( !strcasecmp(optarg,"2") ) targets_overlap = 2;
+ else error("Could not parse: --targets-overlap %s\n",optarg);
+ break;
case 'h':
case '?': usage(); break;
default: error("Unknown argument: %s\n", optarg);
if ( args->region )
{
+ bcf_sr_set_opt(args->files,BCF_SR_REGIONS_OVERLAP,regions_overlap);
if ( bcf_sr_set_regions(args->files, args->region,region_is_file)<0 )
error("Failed to read the regions: %s\n", args->region);
}
if ( args->targets )
{
+ bcf_sr_set_opt(args->files,BCF_SR_TARGETS_OVERLAP,targets_overlap);
if ( bcf_sr_set_targets(args->files, args->targets,targets_is_file, 0)<0 )
error("Failed to read the targets: %s\n", args->targets);
}
bcf_srs_t *files;
bcf_hdr_t *hdr, *hdr_out;
htsFile *out_fh;
- int output_type, n_threads;
+ int output_type, n_threads, clevel;
filter_t *filter;
char *filter_str;
if (args->record_cmd_line) bcf_hdr_append_version(args->hdr_out, args->argc, args->argv, "bcftools_plugin");
if ( !args->drop_header )
{
- args->out_fh = hts_open(args->output_fname,hts_bcf_wmode2(args->output_type,args->output_fname));
+ char wmode[8];
+ set_wmode(wmode,args->output_type,args->output_fname,args->clevel);
+ args->out_fh = hts_open(args->output_fname ? args->output_fname : "-", wmode);
if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno));
if ( args->n_threads ) hts_set_threads(args->out_fh, args->n_threads);
if ( bcf_hdr_write(args->out_fh, args->hdr_out)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname);
fprintf(stderr, " bcftools +name [OPTIONS] <file> [-- PLUGIN_OPTIONS]\n");
fprintf(stderr, "\n");
fprintf(stderr, "VCF input options:\n");
- fprintf(stderr, " -e, --exclude <expr> exclude sites for which the expression is true\n");
- fprintf(stderr, " -i, --include <expr> select sites for which the expression is true\n");
- fprintf(stderr, " -r, --regions <region> restrict to comma-separated list of regions\n");
- fprintf(stderr, " -R, --regions-file <file> restrict to regions listed in a file\n");
- fprintf(stderr, " -t, --targets <region> similar to -r but streams rather than index-jumps\n");
- fprintf(stderr, " -T, --targets-file <file> similar to -R but streams rather than index-jumps\n");
+ fprintf(stderr, " -e, --exclude EXPR Exclude sites for which the expression is true\n");
+ fprintf(stderr, " -i, --include EXPR Select sites for which the expression is true\n");
+ fprintf(stderr, " -r, --regions REGION Restrict to comma-separated list of regions\n");
+ fprintf(stderr, " -R, --regions-file FILE Restrict to regions listed in a file\n");
+ fprintf(stderr, " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n");
+ fprintf(stderr, " -t, --targets REGION Similar to -r but streams rather than index-jumps\n");
+ fprintf(stderr, " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n");
+ fprintf(stderr, " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n");
fprintf(stderr, "VCF output options:\n");
- fprintf(stderr, " --no-version do not append version and command line to the header\n");
- fprintf(stderr, " -o, --output <file> write output to a file [standard output]\n");
- fprintf(stderr, " -O, --output-type <type> 'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]\n");
- fprintf(stderr, " --threads <int> use multithreading with <int> worker threads [0]\n");
+ fprintf(stderr, " --no-version Do not append version and command line to the header\n");
+ fprintf(stderr, " -o, --output FILE Write output to a file [standard output]\n");
+ fprintf(stderr, " -O, --output-type u|b|v|z[0-9] u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level [v]\n");
+ fprintf(stderr, " --threads INT Use multithreading with <int> worker threads [0]\n");
fprintf(stderr, "Plugin options:\n");
- fprintf(stderr, " -h, --help list plugin's options\n");
- fprintf(stderr, " -l, --list-plugins list available plugins. See BCFTOOLS_PLUGINS environment variable and man page for details\n");
- fprintf(stderr, " -v, --verbose print verbose information, -vv increases verbosity\n");
- fprintf(stderr, " -V, --version print version string and exit\n");
+ fprintf(stderr, " -h, --help List plugin's options\n");
+ fprintf(stderr, " -l, --list-plugins List available plugins. See BCFTOOLS_PLUGINS environment variable and man page for details\n");
+ fprintf(stderr, " -v, --verbose Print verbose information, -vv increases verbosity\n");
+ fprintf(stderr, " -V, --version Print version string and exit\n");
fprintf(stderr, "\n");
exit(1);
}
args->n_threads = 0;
args->record_cmd_line = 1;
args->nplugin_paths = -1;
+ args->clevel = -1;
int regions_is_file = 0, targets_is_file = 0, usage_only = 0, version_only = 0;
+ int regions_overlap = 1;
+ int targets_overlap = 0;
if ( argc==1 ) usage(args);
char *plugin_name = NULL;
{"exclude",required_argument,NULL,'e'},
{"regions",required_argument,NULL,'r'},
{"regions-file",required_argument,NULL,'R'},
+ {"regions-overlap",required_argument,NULL,1},
{"targets",required_argument,NULL,'t'},
{"targets-file",required_argument,NULL,'T'},
+ {"targets-overlap",required_argument,NULL,2},
{"no-version",no_argument,NULL,8},
{NULL,0,NULL,0}
};
+ char *tmp;
while ((c = getopt_long(argc, argv, "h?o:O:r:R:t:T:li:e:vV",loptions,NULL)) >= 0)
{
switch (c) {
case 'u': args->output_type = FT_BCF; break;
case 'z': args->output_type = FT_VCF_GZ; break;
case 'v': args->output_type = FT_VCF; break;
- default: error("The output type \"%s\" not recognised\n", optarg);
+ default:
+ {
+ args->clevel = strtol(optarg,&tmp,10);
+ if ( *tmp || args->clevel<0 || args->clevel>9 ) error("The output type \"%s\" not recognised\n", optarg);
+ }
};
+ if ( optarg[1] )
+ {
+ args->clevel = strtol(optarg+1,&tmp,10);
+ if ( *tmp || args->clevel<0 || args->clevel>9 ) error("Could not parse argument: --compression-level %s\n", optarg+1);
+ }
break;
case 'e':
if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
case 't': args->targets_list = optarg; break;
case 'T': args->targets_list = optarg; targets_is_file = 1; break;
case 'l': args->plist_only = 1; break;
+ case 1 :
+ if ( !strcasecmp(optarg,"0") ) regions_overlap = 0;
+ else if ( !strcasecmp(optarg,"1") ) regions_overlap = 1;
+ else if ( !strcasecmp(optarg,"2") ) regions_overlap = 2;
+ else error("Could not parse: --regions-overlap %s\n",optarg);
+ break;
+ case 2 :
+ if ( !strcasecmp(optarg,"0") ) targets_overlap = 0;
+ else if ( !strcasecmp(optarg,"1") ) targets_overlap = 1;
+ else if ( !strcasecmp(optarg,"2") ) targets_overlap = 2;
+ else error("Could not parse: --targets-overlap %s\n",optarg);
+ break;
case 9 : args->n_threads = strtol(optarg, 0, 0); break;
case 8 : args->record_cmd_line = 0; break;
case '?':
args->files = bcf_sr_init();
if ( args->regions_list )
{
+ bcf_sr_set_opt(args->files,BCF_SR_REGIONS_OVERLAP,regions_overlap);
if ( bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 )
error("Failed to read the regions: %s\n", args->regions_list);
}
if ( args->targets_list )
{
+ bcf_sr_set_opt(args->files,BCF_SR_TARGETS_OVERLAP,targets_overlap);
if ( bcf_sr_set_targets(args->files, args->targets_list, targets_is_file, 0)<0 )
error("Failed to read the targets: %s\n", args->targets_list);
args->files->collapse |= COLLAPSE_SOME;
bcf_srs_t *files;
bcf_hdr_t *hdr, *hdr_out;
htsFile *out_fh;
- int output_type, n_threads;
+ int output_type, n_threads, clevel;
filter_t *filter;
char *filter_str;
if (args->record_cmd_line) bcf_hdr_append_version(args->hdr_out, args->argc, args->argv, "bcftools_plugin");
if ( !args->drop_header )
{
- args->out_fh = hts_open(args->output_fname,hts_bcf_wmode2(args->output_type,args->output_fname));
+ char wmode[8];
+ set_wmode(wmode,args->output_type,args->output_fname,args->clevel);
+ args->out_fh = hts_open(args->output_fname ? args->output_fname : "-", wmode);
if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno));
if ( args->n_threads ) hts_set_threads(args->out_fh, args->n_threads);
if ( bcf_hdr_write(args->out_fh, args->hdr_out)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname);
fprintf(bcftools_stderr, " bcftools +name [OPTIONS] <file> [-- PLUGIN_OPTIONS]\n");
fprintf(bcftools_stderr, "\n");
fprintf(bcftools_stderr, "VCF input options:\n");
- fprintf(bcftools_stderr, " -e, --exclude <expr> exclude sites for which the expression is true\n");
- fprintf(bcftools_stderr, " -i, --include <expr> select sites for which the expression is true\n");
- fprintf(bcftools_stderr, " -r, --regions <region> restrict to comma-separated list of regions\n");
- fprintf(bcftools_stderr, " -R, --regions-file <file> restrict to regions listed in a file\n");
- fprintf(bcftools_stderr, " -t, --targets <region> similar to -r but streams rather than index-jumps\n");
- fprintf(bcftools_stderr, " -T, --targets-file <file> similar to -R but streams rather than index-jumps\n");
+ fprintf(bcftools_stderr, " -e, --exclude EXPR Exclude sites for which the expression is true\n");
+ fprintf(bcftools_stderr, " -i, --include EXPR Select sites for which the expression is true\n");
+ fprintf(bcftools_stderr, " -r, --regions REGION Restrict to comma-separated list of regions\n");
+ fprintf(bcftools_stderr, " -R, --regions-file FILE Restrict to regions listed in a file\n");
+ fprintf(bcftools_stderr, " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n");
+ fprintf(bcftools_stderr, " -t, --targets REGION Similar to -r but streams rather than index-jumps\n");
+ fprintf(bcftools_stderr, " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n");
+ fprintf(bcftools_stderr, " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n");
fprintf(bcftools_stderr, "VCF output options:\n");
- fprintf(bcftools_stderr, " --no-version do not append version and command line to the header\n");
- fprintf(bcftools_stderr, " -o, --output <file> write output to a file [standard output]\n");
- fprintf(bcftools_stderr, " -O, --output-type <type> 'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]\n");
- fprintf(bcftools_stderr, " --threads <int> use multithreading with <int> worker threads [0]\n");
+ fprintf(bcftools_stderr, " --no-version Do not append version and command line to the header\n");
+ fprintf(bcftools_stderr, " -o, --output FILE Write output to a file [standard output]\n");
+ fprintf(bcftools_stderr, " -O, --output-type u|b|v|z[0-9] u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level [v]\n");
+ fprintf(bcftools_stderr, " --threads INT Use multithreading with <int> worker threads [0]\n");
fprintf(bcftools_stderr, "Plugin options:\n");
- fprintf(bcftools_stderr, " -h, --help list plugin's options\n");
- fprintf(bcftools_stderr, " -l, --list-plugins list available plugins. See BCFTOOLS_PLUGINS environment variable and man page for details\n");
- fprintf(bcftools_stderr, " -v, --verbose print verbose information, -vv increases verbosity\n");
- fprintf(bcftools_stderr, " -V, --version print version string and exit\n");
+ fprintf(bcftools_stderr, " -h, --help List plugin's options\n");
+ fprintf(bcftools_stderr, " -l, --list-plugins List available plugins. See BCFTOOLS_PLUGINS environment variable and man page for details\n");
+ fprintf(bcftools_stderr, " -v, --verbose Print verbose information, -vv increases verbosity\n");
+ fprintf(bcftools_stderr, " -V, --version Print version string and exit\n");
fprintf(bcftools_stderr, "\n");
bcftools_exit(1);
}
args->n_threads = 0;
args->record_cmd_line = 1;
args->nplugin_paths = -1;
+ args->clevel = -1;
int regions_is_file = 0, targets_is_file = 0, usage_only = 0, version_only = 0;
+ int regions_overlap = 1;
+ int targets_overlap = 0;
if ( argc==1 ) usage(args);
char *plugin_name = NULL;
{"exclude",required_argument,NULL,'e'},
{"regions",required_argument,NULL,'r'},
{"regions-file",required_argument,NULL,'R'},
+ {"regions-overlap",required_argument,NULL,1},
{"targets",required_argument,NULL,'t'},
{"targets-file",required_argument,NULL,'T'},
+ {"targets-overlap",required_argument,NULL,2},
{"no-version",no_argument,NULL,8},
{NULL,0,NULL,0}
};
+ char *tmp;
while ((c = getopt_long(argc, argv, "h?o:O:r:R:t:T:li:e:vV",loptions,NULL)) >= 0)
{
switch (c) {
case 'u': args->output_type = FT_BCF; break;
case 'z': args->output_type = FT_VCF_GZ; break;
case 'v': args->output_type = FT_VCF; break;
- default: error("The output type \"%s\" not recognised\n", optarg);
+ default:
+ {
+ args->clevel = strtol(optarg,&tmp,10);
+ if ( *tmp || args->clevel<0 || args->clevel>9 ) error("The output type \"%s\" not recognised\n", optarg);
+ }
};
+ if ( optarg[1] )
+ {
+ args->clevel = strtol(optarg+1,&tmp,10);
+ if ( *tmp || args->clevel<0 || args->clevel>9 ) error("Could not parse argument: --compression-level %s\n", optarg+1);
+ }
break;
case 'e':
if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
case 't': args->targets_list = optarg; break;
case 'T': args->targets_list = optarg; targets_is_file = 1; break;
case 'l': args->plist_only = 1; break;
+ case 1 :
+ if ( !strcasecmp(optarg,"0") ) regions_overlap = 0;
+ else if ( !strcasecmp(optarg,"1") ) regions_overlap = 1;
+ else if ( !strcasecmp(optarg,"2") ) regions_overlap = 2;
+ else error("Could not parse: --regions-overlap %s\n",optarg);
+ break;
+ case 2 :
+ if ( !strcasecmp(optarg,"0") ) targets_overlap = 0;
+ else if ( !strcasecmp(optarg,"1") ) targets_overlap = 1;
+ else if ( !strcasecmp(optarg,"2") ) targets_overlap = 2;
+ else error("Could not parse: --targets-overlap %s\n",optarg);
+ break;
case 9 : args->n_threads = strtol(optarg, 0, 0); break;
case 8 : args->record_cmd_line = 0; break;
case '?':
args->files = bcf_sr_init();
if ( args->regions_list )
{
+ bcf_sr_set_opt(args->files,BCF_SR_REGIONS_OVERLAP,regions_overlap);
if ( bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 )
error("Failed to read the regions: %s\n", args->regions_list);
}
if ( args->targets_list )
{
+ bcf_sr_set_opt(args->files,BCF_SR_TARGETS_OVERLAP,targets_overlap);
if ( bcf_sr_set_targets(args->files, args->targets_list, targets_is_file, 0)<0 )
error("Failed to read the targets: %s\n", args->targets_list);
args->files->collapse |= COLLAPSE_SOME;
fprintf(stderr, "Usage: bcftools query [options] <A.vcf.gz> [<B.vcf.gz> [...]]\n");
fprintf(stderr, "\n");
fprintf(stderr, "Options:\n");
- fprintf(stderr, " -e, --exclude <expr> exclude sites for which the expression is true (see man page for details)\n");
- fprintf(stderr, " -f, --format <string> see man page for details\n");
- fprintf(stderr, " -H, --print-header print header\n");
- fprintf(stderr, " -i, --include <expr> select sites for which the expression is true (see man page for details)\n");
- fprintf(stderr, " -l, --list-samples print the list of samples and exit\n");
- fprintf(stderr, " -o, --output <file> output file name [stdout]\n");
- fprintf(stderr, " -r, --regions <region> restrict to comma-separated list of regions\n");
- fprintf(stderr, " -R, --regions-file <file> restrict to regions listed in a file\n");
- fprintf(stderr, " -s, --samples <list> list of samples to include\n");
- fprintf(stderr, " -S, --samples-file <file> file of samples to include\n");
- fprintf(stderr, " -t, --targets <region> similar to -r but streams rather than index-jumps\n");
- fprintf(stderr, " -T, --targets-file <file> similar to -R but streams rather than index-jumps\n");
- fprintf(stderr, " -u, --allow-undef-tags print \".\" for undefined tags\n");
- fprintf(stderr, " -v, --vcf-list <file> process multiple VCFs listed in the file\n");
+ fprintf(stderr, " -e, --exclude EXPR Exclude sites for which the expression is true (see man page for details)\n");
+ fprintf(stderr, " -f, --format STRING See man page for details\n");
+ fprintf(stderr, " -H, --print-header Print header\n");
+ fprintf(stderr, " -i, --include EXPR Select sites for which the expression is true (see man page for details)\n");
+ fprintf(stderr, " -l, --list-samples Print the list of samples and exit\n");
+ fprintf(stderr, " -o, --output FILE Output file name [stdout]\n");
+ fprintf(stderr, " -r, --regions REGION Restrict to comma-separated list of regions\n");
+ fprintf(stderr, " -R, --regions-file FILE Restrict to regions listed in a file\n");
+ fprintf(stderr, " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n");
+ fprintf(stderr, " -s, --samples LIST List of samples to include\n");
+ fprintf(stderr, " -S, --samples-file FILE File of samples to include\n");
+ fprintf(stderr, " -t, --targets REGION Similar to -r but streams rather than index-jumps\n");
+ fprintf(stderr, " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n");
+ fprintf(stderr, " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n");
+ fprintf(stderr, " -u, --allow-undef-tags Print \".\" for undefined tags\n");
+ fprintf(stderr, " -v, --vcf-list FILE Process multiple VCFs listed in the file\n");
fprintf(stderr, "\n");
fprintf(stderr, "Examples:\n");
fprintf(stderr, "\tbcftools query -f '%%CHROM\\t%%POS\\t%%REF\\t%%ALT[\\t%%SAMPLE=%%GT]\\n' file.vcf.gz\n");
args_t *args = (args_t*) calloc(1,sizeof(args_t));
args->argc = argc; args->argv = argv;
int regions_is_file = 0, targets_is_file = 0;
+ int regions_overlap = 1;
+ int targets_overlap = 0;
static struct option loptions[] =
{
{"output",1,0,'o'},
{"regions",1,0,'r'},
{"regions-file",1,0,'R'},
+ {"regions-overlap",required_argument,NULL,1},
{"targets",1,0,'t'},
{"targets-file",1,0,'T'},
+ {"targets-overlap",required_argument,NULL,2},
{"annots",1,0,'a'},
{"samples",1,0,'s'},
{"samples-file",1,0,'S'},
case 'u': args->allow_undef_tags = 1; break;
case 's': args->sample_list = optarg; break;
case 'S': args->sample_list = optarg; args->sample_is_file = 1; break;
+ case 1 :
+ if ( !strcasecmp(optarg,"0") ) regions_overlap = 0;
+ else if ( !strcasecmp(optarg,"1") ) regions_overlap = 1;
+ else if ( !strcasecmp(optarg,"2") ) regions_overlap = 2;
+ else error("Could not parse: --regions-overlap %s\n",optarg);
+ break;
+ case 2 :
+ if ( !strcasecmp(optarg,"0") ) targets_overlap = 0;
+ else if ( !strcasecmp(optarg,"1") ) targets_overlap = 1;
+ else if ( !strcasecmp(optarg,"2") ) targets_overlap = 2;
+ else error("Could not parse: --targets-overlap %s\n",optarg);
+ break;
case 'h':
case '?': usage(); break;
default: error("Unknown argument: %s\n", optarg);
if ( !fname ) usage();
args->files = bcf_sr_init();
if ( optind+1 < argc ) args->files->require_index = 1;
- if ( args->regions_list && bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 )
- error("Failed to read the regions: %s\n", args->regions_list);
+ if ( args->regions_list )
+ {
+ bcf_sr_set_opt(args->files,BCF_SR_REGIONS_OVERLAP,regions_overlap);
+ if ( bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 )
+ error("Failed to read the regions: %s\n", args->regions_list);
+ }
if ( args->targets_list )
{
+ bcf_sr_set_opt(args->files,BCF_SR_TARGETS_OVERLAP,targets_overlap);
if ( bcf_sr_set_targets(args->files, args->targets_list, targets_is_file, 0)<0 )
error("Failed to read the targets: %s\n", args->targets_list);
}
fprintf(bcftools_stderr, "Usage: bcftools query [options] <A.vcf.gz> [<B.vcf.gz> [...]]\n");
fprintf(bcftools_stderr, "\n");
fprintf(bcftools_stderr, "Options:\n");
- fprintf(bcftools_stderr, " -e, --exclude <expr> exclude sites for which the expression is true (see man page for details)\n");
- fprintf(bcftools_stderr, " -f, --format <string> see man page for details\n");
- fprintf(bcftools_stderr, " -H, --print-header print header\n");
- fprintf(bcftools_stderr, " -i, --include <expr> select sites for which the expression is true (see man page for details)\n");
- fprintf(bcftools_stderr, " -l, --list-samples print the list of samples and exit\n");
- fprintf(bcftools_stderr, " -o, --output <file> output file name [bcftools_stdout]\n");
- fprintf(bcftools_stderr, " -r, --regions <region> restrict to comma-separated list of regions\n");
- fprintf(bcftools_stderr, " -R, --regions-file <file> restrict to regions listed in a file\n");
- fprintf(bcftools_stderr, " -s, --samples <list> list of samples to include\n");
- fprintf(bcftools_stderr, " -S, --samples-file <file> file of samples to include\n");
- fprintf(bcftools_stderr, " -t, --targets <region> similar to -r but streams rather than index-jumps\n");
- fprintf(bcftools_stderr, " -T, --targets-file <file> similar to -R but streams rather than index-jumps\n");
- fprintf(bcftools_stderr, " -u, --allow-undef-tags print \".\" for undefined tags\n");
- fprintf(bcftools_stderr, " -v, --vcf-list <file> process multiple VCFs listed in the file\n");
+ fprintf(bcftools_stderr, " -e, --exclude EXPR Exclude sites for which the expression is true (see man page for details)\n");
+ fprintf(bcftools_stderr, " -f, --format STRING See man page for details\n");
+ fprintf(bcftools_stderr, " -H, --print-header Print header\n");
+ fprintf(bcftools_stderr, " -i, --include EXPR Select sites for which the expression is true (see man page for details)\n");
+ fprintf(bcftools_stderr, " -l, --list-samples Print the list of samples and exit\n");
+ fprintf(bcftools_stderr, " -o, --output FILE Output file name [bcftools_stdout]\n");
+ fprintf(bcftools_stderr, " -r, --regions REGION Restrict to comma-separated list of regions\n");
+ fprintf(bcftools_stderr, " -R, --regions-file FILE Restrict to regions listed in a file\n");
+ fprintf(bcftools_stderr, " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n");
+ fprintf(bcftools_stderr, " -s, --samples LIST List of samples to include\n");
+ fprintf(bcftools_stderr, " -S, --samples-file FILE File of samples to include\n");
+ fprintf(bcftools_stderr, " -t, --targets REGION Similar to -r but streams rather than index-jumps\n");
+ fprintf(bcftools_stderr, " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n");
+ fprintf(bcftools_stderr, " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n");
+ fprintf(bcftools_stderr, " -u, --allow-undef-tags Print \".\" for undefined tags\n");
+ fprintf(bcftools_stderr, " -v, --vcf-list FILE Process multiple VCFs listed in the file\n");
fprintf(bcftools_stderr, "\n");
fprintf(bcftools_stderr, "Examples:\n");
fprintf(bcftools_stderr, "\tbcftools query -f '%%CHROM\\t%%POS\\t%%REF\\t%%ALT[\\t%%SAMPLE=%%GT]\\n' file.vcf.gz\n");
args_t *args = (args_t*) calloc(1,sizeof(args_t));
args->argc = argc; args->argv = argv;
int regions_is_file = 0, targets_is_file = 0;
+ int regions_overlap = 1;
+ int targets_overlap = 0;
static struct option loptions[] =
{
{"output",1,0,'o'},
{"regions",1,0,'r'},
{"regions-file",1,0,'R'},
+ {"regions-overlap",required_argument,NULL,1},
{"targets",1,0,'t'},
{"targets-file",1,0,'T'},
+ {"targets-overlap",required_argument,NULL,2},
{"annots",1,0,'a'},
{"samples",1,0,'s'},
{"samples-file",1,0,'S'},
case 'u': args->allow_undef_tags = 1; break;
case 's': args->sample_list = optarg; break;
case 'S': args->sample_list = optarg; args->sample_is_file = 1; break;
+ case 1 :
+ if ( !strcasecmp(optarg,"0") ) regions_overlap = 0;
+ else if ( !strcasecmp(optarg,"1") ) regions_overlap = 1;
+ else if ( !strcasecmp(optarg,"2") ) regions_overlap = 2;
+ else error("Could not parse: --regions-overlap %s\n",optarg);
+ break;
+ case 2 :
+ if ( !strcasecmp(optarg,"0") ) targets_overlap = 0;
+ else if ( !strcasecmp(optarg,"1") ) targets_overlap = 1;
+ else if ( !strcasecmp(optarg,"2") ) targets_overlap = 2;
+ else error("Could not parse: --targets-overlap %s\n",optarg);
+ break;
case 'h':
case '?': usage(); break;
default: error("Unknown argument: %s\n", optarg);
if ( !fname ) usage();
args->files = bcf_sr_init();
if ( optind+1 < argc ) args->files->require_index = 1;
- if ( args->regions_list && bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 )
- error("Failed to read the regions: %s\n", args->regions_list);
+ if ( args->regions_list )
+ {
+ bcf_sr_set_opt(args->files,BCF_SR_REGIONS_OVERLAP,regions_overlap);
+ if ( bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 )
+ error("Failed to read the regions: %s\n", args->regions_list);
+ }
if ( args->targets_list )
{
+ bcf_sr_set_opt(args->files,BCF_SR_TARGETS_OVERLAP,targets_overlap);
if ( bcf_sr_set_targets(args->files, args->targets_list, targets_is_file, 0)<0 )
error("Failed to read the targets: %s\n", args->targets_list);
}
fprintf(stderr, " -O, --output-type [srz] output s:per-site, r:regions, z:compressed [sr]\n");
fprintf(stderr, " -r, --regions <region> restrict to comma-separated list of regions\n");
fprintf(stderr, " -R, --regions-file <file> restrict to regions listed in a file\n");
+ fprintf(stderr, " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n");
fprintf(stderr, " -s, --samples <list> list of samples to analyze [all samples]\n");
fprintf(stderr, " -S, --samples-file <file> file of samples to analyze [all samples]\n");
fprintf(stderr, " -t, --targets <region> similar to -r but streams rather than index-jumps\n");
fprintf(stderr, " -T, --targets-file <file> similar to -R but streams rather than index-jumps\n");
+ fprintf(stderr, " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n");
fprintf(stderr, " --threads <int> use multithreading with <int> worker threads [0]\n");
fprintf(stderr, "\n");
fprintf(stderr, "HMM Options:\n");
args->t2HW = 5e-9;
args->rec_rate = 0;
int regions_is_file = 0, targets_is_file = 0;
+ int regions_overlap = 1;
+ int targets_overlap = 0;
static struct option loptions[] =
{
{"viterbi-training",1,0,'V'},
{"targets",1,0,'t'},
{"targets-file",1,0,'T'},
+ {"targets-overlap",required_argument,NULL,6},
{"regions",1,0,'r'},
{"regions-file",1,0,'R'},
+ {"regions-overlap",required_argument,NULL,7},
{"genetic-map",1,0,'m'},
{"rec-rate",1,0,'M'},
{"skip-indels",0,0,'I'},
case 'T': args->targets_list = optarg; targets_is_file = 1; break;
case 'r': args->regions_list = optarg; break;
case 'R': args->regions_list = optarg; regions_is_file = 1; break;
+ case 6 :
+ if ( !strcasecmp(optarg,"0") ) targets_overlap = 0;
+ else if ( !strcasecmp(optarg,"1") ) targets_overlap = 1;
+ else if ( !strcasecmp(optarg,"2") ) targets_overlap = 2;
+ else error("Could not parse: --targets-overlap %s\n",optarg);
+ break;
+ case 7 :
+ if ( !strcasecmp(optarg,"0") ) regions_overlap = 0;
+ else if ( !strcasecmp(optarg,"1") ) regions_overlap = 1;
+ else if ( !strcasecmp(optarg,"2") ) regions_overlap = 2;
+ else error("Could not parse: --regions-overlap %s\n",optarg);
+ break;
case 9 : args->n_threads = strtol(optarg, 0, 0); break;
case 'V':
args->vi_training = 1;
if ( args->af_fname && args->targets_list ) error("Error: The options --AF-file and -t are mutually exclusive\n");
if ( args->regions_list )
{
+ bcf_sr_set_opt(args->files,BCF_SR_REGIONS_OVERLAP,regions_overlap);
if ( bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 )
error("Failed to read the regions: %s\n", args->regions_list);
}
if ( args->targets_list )
{
+ bcf_sr_set_opt(args->files,BCF_SR_TARGETS_OVERLAP,targets_overlap);
if ( bcf_sr_set_targets(args->files, args->targets_list, targets_is_file, 0)<0 )
error("Failed to read the targets: %s\n", args->targets_list);
}
fprintf(bcftools_stderr, " -O, --output-type [srz] output s:per-site, r:regions, z:compressed [sr]\n");
fprintf(bcftools_stderr, " -r, --regions <region> restrict to comma-separated list of regions\n");
fprintf(bcftools_stderr, " -R, --regions-file <file> restrict to regions listed in a file\n");
+ fprintf(bcftools_stderr, " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n");
fprintf(bcftools_stderr, " -s, --samples <list> list of samples to analyze [all samples]\n");
fprintf(bcftools_stderr, " -S, --samples-file <file> file of samples to analyze [all samples]\n");
fprintf(bcftools_stderr, " -t, --targets <region> similar to -r but streams rather than index-jumps\n");
fprintf(bcftools_stderr, " -T, --targets-file <file> similar to -R but streams rather than index-jumps\n");
+ fprintf(bcftools_stderr, " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n");
fprintf(bcftools_stderr, " --threads <int> use multithreading with <int> worker threads [0]\n");
fprintf(bcftools_stderr, "\n");
fprintf(bcftools_stderr, "HMM Options:\n");
args->t2HW = 5e-9;
args->rec_rate = 0;
int regions_is_file = 0, targets_is_file = 0;
+ int regions_overlap = 1;
+ int targets_overlap = 0;
static struct option loptions[] =
{
{"viterbi-training",1,0,'V'},
{"targets",1,0,'t'},
{"targets-file",1,0,'T'},
+ {"targets-overlap",required_argument,NULL,6},
{"regions",1,0,'r'},
{"regions-file",1,0,'R'},
+ {"regions-overlap",required_argument,NULL,7},
{"genetic-map",1,0,'m'},
{"rec-rate",1,0,'M'},
{"skip-indels",0,0,'I'},
case 'T': args->targets_list = optarg; targets_is_file = 1; break;
case 'r': args->regions_list = optarg; break;
case 'R': args->regions_list = optarg; regions_is_file = 1; break;
+ case 6 :
+ if ( !strcasecmp(optarg,"0") ) targets_overlap = 0;
+ else if ( !strcasecmp(optarg,"1") ) targets_overlap = 1;
+ else if ( !strcasecmp(optarg,"2") ) targets_overlap = 2;
+ else error("Could not parse: --targets-overlap %s\n",optarg);
+ break;
+ case 7 :
+ if ( !strcasecmp(optarg,"0") ) regions_overlap = 0;
+ else if ( !strcasecmp(optarg,"1") ) regions_overlap = 1;
+ else if ( !strcasecmp(optarg,"2") ) regions_overlap = 2;
+ else error("Could not parse: --regions-overlap %s\n",optarg);
+ break;
case 9 : args->n_threads = strtol(optarg, 0, 0); break;
case 'V':
args->vi_training = 1;
if ( args->af_fname && args->targets_list ) error("Error: The options --AF-file and -t are mutually exclusive\n");
if ( args->regions_list )
{
+ bcf_sr_set_opt(args->files,BCF_SR_REGIONS_OVERLAP,regions_overlap);
if ( bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 )
error("Failed to read the regions: %s\n", args->regions_list);
}
if ( args->targets_list )
{
+ bcf_sr_set_opt(args->files,BCF_SR_TARGETS_OVERLAP,targets_overlap);
if ( bcf_sr_set_targets(args->files, args->targets_list, targets_is_file, 0)<0 )
error("Failed to read the targets: %s\n", args->targets_list);
}
{
bcf_hdr_t *hdr;
char **argv, *fname, *output_fname, *tmp_dir;
- int argc, output_type;
+ int argc, output_type, clevel;
size_t max_mem, mem;
bcf1_t **buf;
+ uint8_t *mem_block;
size_t nbuf, mbuf, nblk;
blk_t *blk;
}
// This will be called rarely so should not slow the sorting down
// noticeably.
- if ( !a->unpacked ) bcf_unpack(a, BCF_UN_STR);
- if ( !b->unpacked ) bcf_unpack(b, BCF_UN_STR);
int i;
for (i=0; i<a->n_allele; i++)
{
for (i=0; i<args->nbuf; i++)
{
if ( bcf_write(fh, args->hdr, args->buf[i])!=0 ) clean_files_and_throw(args, "[%s] Error: cannot write to %s\n", __func__,blk->fname);
- bcf_destroy(args->buf[i]);
}
if ( hts_close(fh)!=0 ) clean_files_and_throw(args, "[%s] Error: close failed .. %s\n", __func__,blk->fname);
args->mem = 0;
}
+
+static inline uint8_t *_align_up(uint8_t *ptr)
+{
+ return (uint8_t*)(((size_t)ptr + 8 - 1) & ~((size_t)(8 - 1)));
+}
+
void buf_push(args_t *args, bcf1_t *rec)
{
- int delta = sizeof(bcf1_t) + rec->shared.l + rec->indiv.l + sizeof(bcf1_t*);
- if ( args->mem + delta > args->max_mem ) buf_flush(args);
+ size_t delta = sizeof(bcf1_t) + rec->shared.l + rec->indiv.l + rec->unpack_size[0] + rec->unpack_size[1]
+ + sizeof(*rec->d.allele)*rec->d.m_allele
+ + sizeof(bcf1_t*) // args->buf
+ + 8; // the number of _align_up() calls
+
+ if ( delta > args->max_mem - args->mem )
+ {
+ args->nbuf++;
+ hts_expand(bcf1_t*, args->nbuf, args->mbuf, args->buf);
+ args->buf[args->nbuf-1] = rec;
+ buf_flush(args);
+ bcf_destroy(rec);
+ return;
+ }
+
+ // make sure nothing has changed in htslib
+ assert( rec->unpacked==BCF_UN_STR && !rec->d.flt && !rec->d.info && !rec->d.fmt && !rec->d.var );
+
+ uint8_t *ptr_beg = args->mem_block + args->mem;
+ uint8_t *ptr = _align_up(ptr_beg);
+ bcf1_t *new_rec = (bcf1_t*)ptr;
+ memcpy(new_rec,rec,sizeof(*rec));
+ ptr += sizeof(*rec);
+
+ // The array of allele pointers does not need alignment as bcf1_t is already padded to the biggest
+ // data type in the structure
+ char **allele = (char**)ptr;
+ ptr += rec->n_allele*sizeof(*allele);
+
+ // This is just to prevent valgrind from complaining about memcpy, unpack_size is a high-water mark
+ // and the end may be uninitialized
+ delta = rec->d.allele[rec->n_allele-1] - rec->d.allele[0];
+ while ( delta < rec->unpack_size[1] ) if ( !rec->d.als[delta++] ) break;
+ memcpy(ptr,rec->d.als,delta);
+ new_rec->d.als = (char*)ptr;
+ ptr = ptr + delta;
+
+ int i;
+ for (i=0; i<rec->n_allele; i++) allele[i] = new_rec->d.als + (ptrdiff_t)(rec->d.allele[i] - rec->d.allele[0]);
+ new_rec->d.allele = allele;
+
+ memcpy(ptr,rec->shared.s,rec->shared.l);
+ new_rec->shared.s = (char*)ptr;
+ new_rec->shared.m = rec->shared.l;
+ ptr += rec->shared.l;
+
+ memcpy(ptr,rec->indiv.s,rec->indiv.l);
+ new_rec->indiv.s = (char*)ptr;
+ new_rec->indiv.m = rec->indiv.l;
+ ptr += rec->indiv.l;
+
+ // This is just to prevent valgrind from complaining about memcpy, unpack_size is a high-water mark
+ // and the end may be uninitialized
+ i = 0;
+ while ( i < rec->unpack_size[0] ) if ( !rec->d.id[i++] ) break;
+ memcpy(ptr,rec->d.id,i);
+ new_rec->d.id = (char*)ptr;
+ ptr += i;
+
args->nbuf++;
- args->mem += delta;
hts_expand(bcf1_t*, args->nbuf, args->mbuf, args->buf);
- args->buf[args->nbuf-1] = rec;
+ args->buf[args->nbuf-1] = new_rec;
+
+ delta = ptr - ptr_beg;
+ args->mem += delta;
+
+ assert( args->mem <= args->max_mem );
+
+ bcf_destroy(rec);
}
void sort_blocks(args_t *args)
break;
}
if ( rec->errcode ) clean_files_and_throw(args,"Error encountered while parsing the input at %s:%d\n",bcf_seqname(args->hdr,rec),rec->pos+1);
+ bcf_unpack(rec, BCF_UN_STR);
buf_push(args, rec);
}
buf_flush(args);
blk->fh = 0;
return;
}
+ bcf_unpack(blk->rec, BCF_UN_STR);
khp_insert(blk, bhp, &blk);
}
void merge_blocks(args_t *args)
{
fprintf(stderr,"Merging %d temporary files\n", (int)args->nblk);
-
khp_blk_t *bhp = khp_init(blk);
int i;
blk_read(args, bhp, args->hdr, blk);
}
- htsFile *out = hts_open(args->output_fname, hts_bcf_wmode2(args->output_type,args->output_fname));
+ char wmode[8];
+ set_wmode(wmode,args->output_type,args->output_fname,args->clevel);
+ htsFile *out = hts_open(args->output_fname ? args->output_fname : "-", wmode);
if ( bcf_hdr_write(out, args->hdr)!=0 ) clean_files_and_throw(args, "[%s] Error: cannot write to %s\n", __func__,args->output_fname);
while ( bhp->ndat )
{
fprintf(stderr, "Usage: bcftools sort [OPTIONS] <FILE.vcf>\n");
fprintf(stderr, "\n");
fprintf(stderr, "Options:\n");
- fprintf(stderr, " -m, --max-mem FLOAT[kMG] maximum memory to use [768M]\n"); // using metric units, 1M=1e6
- fprintf(stderr, " -o, --output FILE output file name [stdout]\n");
- fprintf(stderr, " -O, --output-type b|u|z|v b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n");
+ fprintf(stderr, " -m, --max-mem FLOAT[kMG] maximum memory to use [768M]\n"); // using metric units, 1M=1e6
+ fprintf(stderr, " -o, --output FILE output file name [stdout]\n");
+ fprintf(stderr, " -O, --output-type b|u|z|v b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n");
+ fprintf(stderr, " -O, --output-type u|b|v|z[0-9] u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level [v]\n");
+
#ifdef _WIN32
- fprintf(stderr, " -T, --temp-dir DIR temporary files [/bcftools.XXXXXX]\n");
+ fprintf(stderr, " -T, --temp-dir DIR temporary files [/bcftools.XXXXXX]\n");
#else
- fprintf(stderr, " -T, --temp-dir DIR temporary files [/tmp/bcftools.XXXXXX]\n");
+ fprintf(stderr, " -T, --temp-dir DIR temporary files [/tmp/bcftools.XXXXXX]\n");
#endif
fprintf(stderr, "\n");
exit(1);
void mkdir_p(const char *fmt, ...);
static void init(args_t *args)
{
+ args->max_mem *= 0.9;
+ args->mem_block = malloc(args->max_mem);
+ args->mem = 0;
+
args->tmp_dir = init_tmp_prefix(args->tmp_dir);
#ifdef _WIN32
static void destroy(args_t *args)
{
bcf_hdr_destroy(args->hdr);
+ free(args->mem_block);
free(args->tmp_dir);
free(args);
}
args->argc = argc; args->argv = argv;
args->max_mem = 768*1000*1000;
args->output_fname = "-";
+ args->clevel = -1;
static struct option loptions[] =
{
{"help",no_argument,NULL,'h'},
{0,0,0,0}
};
+ char *tmp;
while ((c = getopt_long(argc, argv, "m:T:O:o:h?",loptions,NULL)) >= 0)
{
switch (c)
case 'u': args->output_type = FT_BCF; break;
case 'z': args->output_type = FT_VCF_GZ; break;
case 'v': args->output_type = FT_VCF; break;
- default: error("The output type \"%s\" not recognised\n", optarg);
+ default:
+ {
+ args->clevel = strtol(optarg,&tmp,10);
+ if ( *tmp || args->clevel<0 || args->clevel>9 ) error("The output type \"%s\" not recognised\n", optarg);
+ }
};
+ if ( optarg[1] )
+ {
+ args->clevel = strtol(optarg+1,&tmp,10);
+ if ( *tmp || args->clevel<0 || args->clevel>9 ) error("Could not parse argument: --compression-level %s\n", optarg+1);
+ }
break;
case 'h':
case '?': usage(args); break;
{
bcf_hdr_t *hdr;
char **argv, *fname, *output_fname, *tmp_dir;
- int argc, output_type;
+ int argc, output_type, clevel;
size_t max_mem, mem;
bcf1_t **buf;
+ uint8_t *mem_block;
size_t nbuf, mbuf, nblk;
blk_t *blk;
}
// This will be called rarely so should not slow the sorting down
// noticeably.
- if ( !a->unpacked ) bcf_unpack(a, BCF_UN_STR);
- if ( !b->unpacked ) bcf_unpack(b, BCF_UN_STR);
int i;
for (i=0; i<a->n_allele; i++)
{
for (i=0; i<args->nbuf; i++)
{
if ( bcf_write(fh, args->hdr, args->buf[i])!=0 ) clean_files_and_throw(args, "[%s] Error: cannot write to %s\n", __func__,blk->fname);
- bcf_destroy(args->buf[i]);
}
if ( hts_close(fh)!=0 ) clean_files_and_throw(args, "[%s] Error: close failed .. %s\n", __func__,blk->fname);
args->mem = 0;
}
+
+static inline uint8_t *_align_up(uint8_t *ptr)
+{
+ return (uint8_t*)(((size_t)ptr + 8 - 1) & ~((size_t)(8 - 1)));
+}
+
void buf_push(args_t *args, bcf1_t *rec)
{
- int delta = sizeof(bcf1_t) + rec->shared.l + rec->indiv.l + sizeof(bcf1_t*);
- if ( args->mem + delta > args->max_mem ) buf_flush(args);
+ size_t delta = sizeof(bcf1_t) + rec->shared.l + rec->indiv.l + rec->unpack_size[0] + rec->unpack_size[1]
+ + sizeof(*rec->d.allele)*rec->d.m_allele
+ + sizeof(bcf1_t*) // args->buf
+ + 8; // the number of _align_up() calls
+
+ if ( delta > args->max_mem - args->mem )
+ {
+ args->nbuf++;
+ hts_expand(bcf1_t*, args->nbuf, args->mbuf, args->buf);
+ args->buf[args->nbuf-1] = rec;
+ buf_flush(args);
+ bcf_destroy(rec);
+ return;
+ }
+
+ // make sure nothing has changed in htslib
+ assert( rec->unpacked==BCF_UN_STR && !rec->d.flt && !rec->d.info && !rec->d.fmt && !rec->d.var );
+
+ uint8_t *ptr_beg = args->mem_block + args->mem;
+ uint8_t *ptr = _align_up(ptr_beg);
+ bcf1_t *new_rec = (bcf1_t*)ptr;
+ memcpy(new_rec,rec,sizeof(*rec));
+ ptr += sizeof(*rec);
+
+ // The array of allele pointers does not need alignment as bcf1_t is already padded to the biggest
+ // data type in the structure
+ char **allele = (char**)ptr;
+ ptr += rec->n_allele*sizeof(*allele);
+
+ // This is just to prevent valgrind from complaining about memcpy, unpack_size is a high-water mark
+ // and the end may be uninitialized
+ delta = rec->d.allele[rec->n_allele-1] - rec->d.allele[0];
+ while ( delta < rec->unpack_size[1] ) if ( !rec->d.als[delta++] ) break;
+ memcpy(ptr,rec->d.als,delta);
+ new_rec->d.als = (char*)ptr;
+ ptr = ptr + delta;
+
+ int i;
+ for (i=0; i<rec->n_allele; i++) allele[i] = new_rec->d.als + (ptrdiff_t)(rec->d.allele[i] - rec->d.allele[0]);
+ new_rec->d.allele = allele;
+
+ memcpy(ptr,rec->shared.s,rec->shared.l);
+ new_rec->shared.s = (char*)ptr;
+ new_rec->shared.m = rec->shared.l;
+ ptr += rec->shared.l;
+
+ memcpy(ptr,rec->indiv.s,rec->indiv.l);
+ new_rec->indiv.s = (char*)ptr;
+ new_rec->indiv.m = rec->indiv.l;
+ ptr += rec->indiv.l;
+
+ // This is just to prevent valgrind from complaining about memcpy, unpack_size is a high-water mark
+ // and the end may be uninitialized
+ i = 0;
+ while ( i < rec->unpack_size[0] ) if ( !rec->d.id[i++] ) break;
+ memcpy(ptr,rec->d.id,i);
+ new_rec->d.id = (char*)ptr;
+ ptr += i;
+
args->nbuf++;
- args->mem += delta;
hts_expand(bcf1_t*, args->nbuf, args->mbuf, args->buf);
- args->buf[args->nbuf-1] = rec;
+ args->buf[args->nbuf-1] = new_rec;
+
+ delta = ptr - ptr_beg;
+ args->mem += delta;
+
+ assert( args->mem <= args->max_mem );
+
+ bcf_destroy(rec);
}
void sort_blocks(args_t *args)
break;
}
if ( rec->errcode ) clean_files_and_throw(args,"Error encountered while parsing the input at %s:%d\n",bcf_seqname(args->hdr,rec),rec->pos+1);
+ bcf_unpack(rec, BCF_UN_STR);
buf_push(args, rec);
}
buf_flush(args);
blk->fh = 0;
return;
}
+ bcf_unpack(blk->rec, BCF_UN_STR);
khp_insert(blk, bhp, &blk);
}
void merge_blocks(args_t *args)
{
fprintf(bcftools_stderr,"Merging %d temporary files\n", (int)args->nblk);
-
khp_blk_t *bhp = khp_init(blk);
int i;
blk_read(args, bhp, args->hdr, blk);
}
- htsFile *out = hts_open(args->output_fname, hts_bcf_wmode2(args->output_type,args->output_fname));
+ char wmode[8];
+ set_wmode(wmode,args->output_type,args->output_fname,args->clevel);
+ htsFile *out = hts_open(args->output_fname ? args->output_fname : "-", wmode);
if ( bcf_hdr_write(out, args->hdr)!=0 ) clean_files_and_throw(args, "[%s] Error: cannot write to %s\n", __func__,args->output_fname);
while ( bhp->ndat )
{
fprintf(bcftools_stderr, "Usage: bcftools sort [OPTIONS] <FILE.vcf>\n");
fprintf(bcftools_stderr, "\n");
fprintf(bcftools_stderr, "Options:\n");
- fprintf(bcftools_stderr, " -m, --max-mem FLOAT[kMG] maximum memory to use [768M]\n"); // using metric units, 1M=1e6
- fprintf(bcftools_stderr, " -o, --output FILE output file name [bcftools_stdout]\n");
- fprintf(bcftools_stderr, " -O, --output-type b|u|z|v b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n");
+ fprintf(bcftools_stderr, " -m, --max-mem FLOAT[kMG] maximum memory to use [768M]\n"); // using metric units, 1M=1e6
+ fprintf(bcftools_stderr, " -o, --output FILE output file name [bcftools_stdout]\n");
+ fprintf(bcftools_stderr, " -O, --output-type b|u|z|v b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n");
+ fprintf(bcftools_stderr, " -O, --output-type u|b|v|z[0-9] u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level [v]\n");
+
#ifdef _WIN32
- fprintf(bcftools_stderr, " -T, --temp-dir DIR temporary files [/bcftools.XXXXXX]\n");
+ fprintf(bcftools_stderr, " -T, --temp-dir DIR temporary files [/bcftools.XXXXXX]\n");
#else
- fprintf(bcftools_stderr, " -T, --temp-dir DIR temporary files [/tmp/bcftools.XXXXXX]\n");
+ fprintf(bcftools_stderr, " -T, --temp-dir DIR temporary files [/tmp/bcftools.XXXXXX]\n");
#endif
fprintf(bcftools_stderr, "\n");
bcftools_exit(1);
void mkdir_p(const char *fmt, ...);
static void init(args_t *args)
{
+ args->max_mem *= 0.9;
+ args->mem_block = malloc(args->max_mem);
+ args->mem = 0;
+
args->tmp_dir = init_tmp_prefix(args->tmp_dir);
#ifdef _WIN32
static void destroy(args_t *args)
{
bcf_hdr_destroy(args->hdr);
+ free(args->mem_block);
free(args->tmp_dir);
free(args);
}
args->argc = argc; args->argv = argv;
args->max_mem = 768*1000*1000;
args->output_fname = "-";
+ args->clevel = -1;
static struct option loptions[] =
{
{"help",no_argument,NULL,'h'},
{0,0,0,0}
};
+ char *tmp;
while ((c = getopt_long(argc, argv, "m:T:O:o:h?",loptions,NULL)) >= 0)
{
switch (c)
case 'u': args->output_type = FT_BCF; break;
case 'z': args->output_type = FT_VCF_GZ; break;
case 'v': args->output_type = FT_VCF; break;
- default: error("The output type \"%s\" not recognised\n", optarg);
+ default:
+ {
+ args->clevel = strtol(optarg,&tmp,10);
+ if ( *tmp || args->clevel<0 || args->clevel>9 ) error("The output type \"%s\" not recognised\n", optarg);
+ }
};
+ if ( optarg[1] )
+ {
+ args->clevel = strtol(optarg+1,&tmp,10);
+ if ( *tmp || args->clevel<0 || args->clevel>9 ) error("Could not parse argument: --compression-level %s\n", optarg+1);
+ }
break;
case 'h':
case '?': usage(args); break;
fprintf(stderr, " -I, --split-by-ID Collect stats for sites with ID separately (known vs novel)\n");
fprintf(stderr, " -r, --regions REGION Restrict to comma-separated list of regions\n");
fprintf(stderr, " -R, --regions-file FILE Restrict to regions listed in a file\n");
+ fprintf(stderr, " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n");
fprintf(stderr, " -s, --samples LIST List of samples for sample stats, \"-\" to include all samples\n");
fprintf(stderr, " -S, --samples-file FILE File of samples to include\n");
fprintf(stderr, " -t, --targets REGION Similar to -r but streams rather than index-jumps\n");
fprintf(stderr, " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n");
+ fprintf(stderr, " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n");
fprintf(stderr, " -u, --user-tstv TAG[:min:max:n] Collect Ts/Tv stats for any tag using the given binning [0:1:100]\n");
fprintf(stderr, " A subfield can be selected as e.g. 'PV4[0]', here the first value of the PV4 tag\n");
fprintf(stderr, " --threads INT Use multithreading with <int> worker threads [0]\n");
args->argc = argc; args->argv = argv;
args->dp_min = 0; args->dp_max = 500; args->dp_step = 1;
int regions_is_file = 0, targets_is_file = 0;
+ int regions_overlap = 1;
+ int targets_overlap = 0;
static struct option loptions[] =
{
{"collapse",1,0,'c'},
{"regions",1,0,'r'},
{"regions-file",1,0,'R'},
+ {"regions-overlap",required_argument,NULL,3},
{"verbose",0,0,'v'},
{"depth",1,0,'d'},
{"apply-filters",1,0,'f'},
{"split-by-ID",0,0,'I'},
{"targets",1,0,'t'},
{"targets-file",1,0,'T'},
+ {"targets-overlap",required_argument,NULL,4},
{"fasta-ref",1,0,'F'},
{"user-tstv",1,0,'u'},
{"threads",1,0,9},
case 'i':
if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
+ case 3 :
+ if ( !strcasecmp(optarg,"0") ) regions_overlap = 0;
+ else if ( !strcasecmp(optarg,"1") ) regions_overlap = 1;
+ else if ( !strcasecmp(optarg,"2") ) regions_overlap = 2;
+ else error("Could not parse: --regions-overlap %s\n",optarg);
+ break;
+ case 4 :
+ if ( !strcasecmp(optarg,"0") ) targets_overlap = 0;
+ else if ( !strcasecmp(optarg,"1") ) targets_overlap = 1;
+ else if ( !strcasecmp(optarg,"2") ) targets_overlap = 2;
+ else error("Could not parse: --targets-overlap %s\n",optarg);
+ break;
case 9 : args->n_threads = strtol(optarg, 0, 0); break;
case 'h':
case '?': usage(); break;
if ( args->split_by_id ) error("Only one file can be given with -i.\n");
}
if ( !args->samples_list ) args->files->max_unpack = BCF_UN_INFO;
- if ( args->targets_list && bcf_sr_set_targets(args->files, args->targets_list, targets_is_file, 0)<0 )
- error("Failed to read the targets: %s\n", args->targets_list);
- if ( args->regions_list && bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 )
- error("Failed to read the regions: %s\n", args->regions_list);
+ if ( args->targets_list )
+ {
+ bcf_sr_set_opt(args->files,BCF_SR_TARGETS_OVERLAP,targets_overlap);
+ if ( bcf_sr_set_targets(args->files, args->targets_list, targets_is_file, 0)<0 )
+ error("Failed to read the targets: %s\n", args->targets_list);
+ }
+ if ( args->regions_list)
+ {
+ bcf_sr_set_opt(args->files,BCF_SR_REGIONS_OVERLAP,regions_overlap);
+ if ( bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 )
+ error("Failed to read the regions: %s\n", args->regions_list);
+ }
if ( args->n_threads && bcf_sr_set_threads(args->files, args->n_threads)<0)
error("Failed to create threads\n");
fprintf(bcftools_stderr, " -I, --split-by-ID Collect stats for sites with ID separately (known vs novel)\n");
fprintf(bcftools_stderr, " -r, --regions REGION Restrict to comma-separated list of regions\n");
fprintf(bcftools_stderr, " -R, --regions-file FILE Restrict to regions listed in a file\n");
+ fprintf(bcftools_stderr, " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n");
fprintf(bcftools_stderr, " -s, --samples LIST List of samples for sample stats, \"-\" to include all samples\n");
fprintf(bcftools_stderr, " -S, --samples-file FILE File of samples to include\n");
fprintf(bcftools_stderr, " -t, --targets REGION Similar to -r but streams rather than index-jumps\n");
fprintf(bcftools_stderr, " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n");
+ fprintf(bcftools_stderr, " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n");
fprintf(bcftools_stderr, " -u, --user-tstv TAG[:min:max:n] Collect Ts/Tv stats for any tag using the given binning [0:1:100]\n");
fprintf(bcftools_stderr, " A subfield can be selected as e.g. 'PV4[0]', here the first value of the PV4 tag\n");
fprintf(bcftools_stderr, " --threads INT Use multithreading with <int> worker threads [0]\n");
args->argc = argc; args->argv = argv;
args->dp_min = 0; args->dp_max = 500; args->dp_step = 1;
int regions_is_file = 0, targets_is_file = 0;
+ int regions_overlap = 1;
+ int targets_overlap = 0;
static struct option loptions[] =
{
{"collapse",1,0,'c'},
{"regions",1,0,'r'},
{"regions-file",1,0,'R'},
+ {"regions-overlap",required_argument,NULL,3},
{"verbose",0,0,'v'},
{"depth",1,0,'d'},
{"apply-filters",1,0,'f'},
{"split-by-ID",0,0,'I'},
{"targets",1,0,'t'},
{"targets-file",1,0,'T'},
+ {"targets-overlap",required_argument,NULL,4},
{"fasta-ref",1,0,'F'},
{"user-tstv",1,0,'u'},
{"threads",1,0,9},
case 'i':
if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
+ case 3 :
+ if ( !strcasecmp(optarg,"0") ) regions_overlap = 0;
+ else if ( !strcasecmp(optarg,"1") ) regions_overlap = 1;
+ else if ( !strcasecmp(optarg,"2") ) regions_overlap = 2;
+ else error("Could not parse: --regions-overlap %s\n",optarg);
+ break;
+ case 4 :
+ if ( !strcasecmp(optarg,"0") ) targets_overlap = 0;
+ else if ( !strcasecmp(optarg,"1") ) targets_overlap = 1;
+ else if ( !strcasecmp(optarg,"2") ) targets_overlap = 2;
+ else error("Could not parse: --targets-overlap %s\n",optarg);
+ break;
case 9 : args->n_threads = strtol(optarg, 0, 0); break;
case 'h':
case '?': usage(); break;
if ( args->split_by_id ) error("Only one file can be given with -i.\n");
}
if ( !args->samples_list ) args->files->max_unpack = BCF_UN_INFO;
- if ( args->targets_list && bcf_sr_set_targets(args->files, args->targets_list, targets_is_file, 0)<0 )
- error("Failed to read the targets: %s\n", args->targets_list);
- if ( args->regions_list && bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 )
- error("Failed to read the regions: %s\n", args->regions_list);
+ if ( args->targets_list )
+ {
+ bcf_sr_set_opt(args->files,BCF_SR_TARGETS_OVERLAP,targets_overlap);
+ if ( bcf_sr_set_targets(args->files, args->targets_list, targets_is_file, 0)<0 )
+ error("Failed to read the targets: %s\n", args->targets_list);
+ }
+ if ( args->regions_list)
+ {
+ bcf_sr_set_opt(args->files,BCF_SR_REGIONS_OVERLAP,regions_overlap);
+ if ( bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 )
+ error("Failed to read the regions: %s\n", args->regions_list);
+ }
if ( args->n_threads && bcf_sr_set_threads(args->files, args->n_threads)<0)
error("Failed to create threads\n");
bcf_srs_t *files;
bcf_hdr_t *hdr, *hnull, *hsub; // original header, sites-only header, subset header
char **argv, *format, *sample_names, *subset_fname, *targets_list, *regions_list;
+ int regions_overlap, targets_overlap;
int argc, clevel, n_threads, output_type, print_header, update_info, header_only, n_samples, *imap, calc_ac;
int trim_alts, sites_only, known, novel, min_alleles, max_alleles, private_vars, uncalled, phased;
int min_ac, min_ac_type, max_ac, max_ac_type, min_af_type, max_af_type, gt_type;
free(type_list);
}
- // setup output
- const char *tmp = hts_bcf_wmode2(args->output_type,args->fn_out);
- char modew[8];
- strcpy(modew,tmp);
- if (args->clevel >= 0 && args->clevel <= 9) sprintf(modew + 1, "%d", args->clevel);
- args->out = hts_open(args->fn_out ? args->fn_out : "-", modew);
+ char wmode[8];
+ set_wmode(wmode,args->output_type,args->fn_out,args->clevel);
+ args->out = hts_open(args->fn_out ? args->fn_out : "-", wmode);
if ( !args->out ) error("%s: %s\n", args->fn_out,strerror(errno));
if ( args->n_threads > 0)
hts_set_opt(args->out, HTS_OPT_THREAD_POOL, args->files->p);
fprintf(stderr, "Usage: bcftools view [options] <in.vcf.gz> [region1 [...]]\n");
fprintf(stderr, "\n");
fprintf(stderr, "Output options:\n");
- fprintf(stderr, " -G, --drop-genotypes drop individual genotype information (after subsetting if -s option set)\n");
- fprintf(stderr, " -h/H, --header-only/--no-header print the header only/suppress the header in VCF output\n");
- fprintf(stderr, " -l, --compression-level [0-9] compression level: 0 uncompressed, 1 best speed, 9 best compression [%d]\n", args->clevel);
- fprintf(stderr, " --no-version do not append version and command line to the header\n");
- fprintf(stderr, " -o, --output <file> output file name [stdout]\n");
- fprintf(stderr, " -O, --output-type <b|u|z|v> b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n");
- fprintf(stderr, " -r, --regions <region> restrict to comma-separated list of regions\n");
- fprintf(stderr, " -R, --regions-file <file> restrict to regions listed in a file\n");
- fprintf(stderr, " -t, --targets [^]<region> similar to -r but streams rather than index-jumps. Exclude regions with \"^\" prefix\n");
- fprintf(stderr, " -T, --targets-file [^]<file> similar to -R but streams rather than index-jumps. Exclude regions with \"^\" prefix\n");
- fprintf(stderr, " --threads <int> use multithreading with <int> worker threads [0]\n");
+ fprintf(stderr, " -G, --drop-genotypes Drop individual genotype information (after subsetting if -s option set)\n");
+ fprintf(stderr, " -h, --header-only Print only the header in VCF output (equivalent to bcftools head)\n");
+ fprintf(stderr, " -H, --no-header Suppress the header in VCF output\n");
+ fprintf(stderr, " --with-header Print both header and records in VCF output [default]\n");
+ fprintf(stderr, " -l, --compression-level [0-9] Compression level: 0 uncompressed, 1 best speed, 9 best compression [%d]\n", args->clevel);
+ fprintf(stderr, " --no-version Do not append version and command line to the header\n");
+ fprintf(stderr, " -o, --output FILE Output file name [stdout]\n");
+ fprintf(stderr, " -O, --output-type u|b|v|z[0-9] u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level [v]\n");
+ fprintf(stderr, " -r, --regions REGION Restrict to comma-separated list of regions\n");
+ fprintf(stderr, " -R, --regions-file FILE Restrict to regions listed in FILE\n");
+ fprintf(stderr, " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n");
+ fprintf(stderr, " -t, --targets [^]REGION Similar to -r but streams rather than index-jumps. Exclude regions with \"^\" prefix\n");
+ fprintf(stderr, " -T, --targets-file [^]FILE Similar to -R but streams rather than index-jumps. Exclude regions with \"^\" prefix\n");
+ fprintf(stderr, " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n");
+ fprintf(stderr, " --threads INT Use multithreading with INT worker threads [0]\n");
fprintf(stderr, "\n");
fprintf(stderr, "Subset options:\n");
- fprintf(stderr, " -a, --trim-alt-alleles trim ALT alleles not seen in the genotype fields (or their subset with -s/-S)\n");
- fprintf(stderr, " -I, --no-update do not (re)calculate INFO fields for the subset (currently INFO/AC and INFO/AN)\n");
- fprintf(stderr, " -s, --samples [^]<list> comma separated list of samples to include (or exclude with \"^\" prefix)\n");
- fprintf(stderr, " -S, --samples-file [^]<file> file of samples to include (or exclude with \"^\" prefix)\n");
- fprintf(stderr, " --force-samples only warn about unknown subset samples\n");
+ fprintf(stderr, " -a, --trim-alt-alleles Trim ALT alleles not seen in the genotype fields (or their subset with -s/-S)\n");
+ fprintf(stderr, " -I, --no-update Do not (re)calculate INFO fields for the subset (currently INFO/AC and INFO/AN)\n");
+ fprintf(stderr, " -s, --samples [^]LIST Comma separated list of samples to include (or exclude with \"^\" prefix)\n");
+ fprintf(stderr, " -S, --samples-file [^]FILE File of samples to include (or exclude with \"^\" prefix)\n");
+ fprintf(stderr, " --force-samples Only warn about unknown subset samples\n");
fprintf(stderr, "\n");
fprintf(stderr, "Filter options:\n");
- fprintf(stderr, " -c/C, --min-ac/--max-ac <int>[:<type>] minimum/maximum count for non-reference (nref), 1st alternate (alt1), least frequent\n");
- fprintf(stderr, " (minor), most frequent (major) or sum of all but most frequent (nonmajor) alleles [nref]\n");
- fprintf(stderr, " -f, --apply-filters <list> require at least one of the listed FILTER strings (e.g. \"PASS,.\")\n");
- fprintf(stderr, " -g, --genotype [^]<hom|het|miss> require one or more hom/het/missing genotype or, if prefixed with \"^\", exclude sites with hom/het/missing genotypes\n");
- fprintf(stderr, " -i/e, --include/--exclude <expr> select/exclude sites for which the expression is true (see man page for details)\n");
- fprintf(stderr, " -k/n, --known/--novel select known/novel sites only (ID is not/is '.')\n");
- fprintf(stderr, " -m/M, --min-alleles/--max-alleles <int> minimum/maximum number of alleles listed in REF and ALT (e.g. -m2 -M2 for biallelic sites)\n");
- fprintf(stderr, " -p/P, --phased/--exclude-phased select/exclude sites where all samples are phased\n");
- fprintf(stderr, " -q/Q, --min-af/--max-af <float>[:<type>] minimum/maximum frequency for non-reference (nref), 1st alternate (alt1), least frequent\n");
- fprintf(stderr, " (minor), most frequent (major) or sum of all but most frequent (nonmajor) alleles [nref]\n");
- fprintf(stderr, " -u/U, --uncalled/--exclude-uncalled select/exclude sites without a called genotype\n");
- fprintf(stderr, " -v/V, --types/--exclude-types <list> select/exclude comma-separated list of variant types: snps,indels,mnps,ref,bnd,other [null]\n");
- fprintf(stderr, " -x/X, --private/--exclude-private select/exclude sites where the non-reference alleles are exclusive (private) to the subset samples\n");
+ fprintf(stderr, " -c/C, --min-ac/--max-ac INT[:TYPE] Minimum/maximum count for non-reference (nref), 1st alternate (alt1), least frequent\n");
+ fprintf(stderr, " (minor), most frequent (major) or sum of all but most frequent (nonmajor) alleles [nref]\n");
+ fprintf(stderr, " -f, --apply-filters LIST Require at least one of the listed FILTER strings (e.g. \"PASS,.\")\n");
+ fprintf(stderr, " -g, --genotype [^]hom|het|miss Require one or more hom/het/missing genotype or, if prefixed with \"^\", exclude such sites\n");
+ fprintf(stderr, " -i/e, --include/--exclude EXPR Select/exclude sites for which the expression is true (see man page for details)\n");
+ fprintf(stderr, " -k/n, --known/--novel Select known/novel sites only (ID is not/is '.')\n");
+ fprintf(stderr, " -m/M, --min-alleles/--max-alleles INT Minimum/maximum number of alleles listed in REF and ALT (e.g. -m2 -M2 for biallelic sites)\n");
+ fprintf(stderr, " -p/P, --phased/--exclude-phased Select/exclude sites where all samples are phased\n");
+ fprintf(stderr, " -q/Q, --min-af/--max-af FLOAT[:TYPE] Minimum/maximum frequency for non-reference (nref), 1st alternate (alt1), least frequent\n");
+ fprintf(stderr, " (minor), most frequent (major) or sum of all but most frequent (nonmajor) alleles [nref]\n");
+ fprintf(stderr, " -u/U, --uncalled/--exclude-uncalled Select/exclude sites without a called genotype\n");
+ fprintf(stderr, " -v/V, --types/--exclude-types LIST Select/exclude comma-separated list of variant types: snps,indels,mnps,ref,bnd,other [null]\n");
+ fprintf(stderr, " -x/X, --private/--exclude-private Select/exclude sites where the non-reference alleles are exclusive (private) to the subset samples\n");
fprintf(stderr, "\n");
exit(1);
}
args->n_threads = 0;
args->record_cmd_line = 1;
args->min_ac = args->max_ac = args->min_af = args->max_af = -1;
+ args->regions_overlap = 1;
+ args->targets_overlap = 0;
int targets_is_file = 0, regions_is_file = 0;
static struct option loptions[] =
{"threads",required_argument,NULL,9},
{"header-only",no_argument,NULL,'h'},
{"no-header",no_argument,NULL,'H'},
+ {"with-header",no_argument,NULL,4},
{"exclude",required_argument,NULL,'e'},
{"include",required_argument,NULL,'i'},
{"trim-alt-alleles",no_argument,NULL,'a'},
{"exclude-types",required_argument,NULL,'V'},
{"targets",required_argument,NULL,'t'},
{"targets-file",required_argument,NULL,'T'},
+ {"targets-overlap",required_argument,NULL,2},
{"regions",required_argument,NULL,'r'},
{"regions-file",required_argument,NULL,'R'},
+ {"regions-overlap",required_argument,NULL,3},
{"min-ac",required_argument,NULL,'c'},
{"max-ac",required_argument,NULL,'C'},
{"min-af",required_argument,NULL,'q'},
case 'u': args->output_type = FT_BCF; break;
case 'z': args->output_type = FT_VCF_GZ; break;
case 'v': args->output_type = FT_VCF; break;
- default: error("The output type \"%s\" not recognised\n", optarg);
+ default:
+ {
+ args->clevel = strtol(optarg,&tmp,10);
+ if ( *tmp || args->clevel<0 || args->clevel>9 ) error("The output type \"%s\" not recognised\n", optarg);
+ }
};
+ if ( optarg[1] )
+ {
+ args->clevel = strtol(optarg+1,&tmp,10);
+ if ( *tmp || args->clevel<0 || args->clevel>9 ) error("Could not parse argument: --compression-level %s\n", optarg+1);
+ }
break;
case 'l':
args->clevel = strtol(optarg,&tmp,10);
case 'o': args->fn_out = optarg; break;
case 'H': args->print_header = 0; break;
case 'h': args->header_only = 1; break;
+ case 4 : args->print_header = 1; args->header_only = 0; break;
case 't': args->targets_list = optarg; break;
case 'T': args->targets_list = optarg; targets_is_file = 1; break;
else error("The argument to -g not recognised. Expected one of hom/het/miss/^hom/^het/^miss, got \"%s\".\n", optarg);
break;
}
+ case 2 :
+ if ( !strcasecmp(optarg,"0") ) args->targets_overlap = 0;
+ else if ( !strcasecmp(optarg,"1") ) args->targets_overlap = 1;
+ else if ( !strcasecmp(optarg,"2") ) args->targets_overlap = 2;
+ else error("Could not parse: --targets-overlap %s\n",optarg);
+ break;
+ case 3 :
+ if ( !strcasecmp(optarg,"0") ) args->regions_overlap = 0;
+ else if ( !strcasecmp(optarg,"1") ) args->regions_overlap = 1;
+ else if ( !strcasecmp(optarg,"2") ) args->regions_overlap = 2;
+ else error("Could not parse: --regions-overlap %s\n",optarg);
+ break;
case 9 : args->n_threads = strtol(optarg, 0, 0); break;
case 8 : args->record_cmd_line = 0; break;
case '?': usage(args); break;
// read in the regions from the command line
if ( args->regions_list )
{
+ bcf_sr_set_opt(args->files,BCF_SR_REGIONS_OVERLAP,args->regions_overlap);
if ( bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 )
error("Failed to read the regions: %s\n", args->regions_list);
}
}
if ( args->targets_list )
{
+ bcf_sr_set_opt(args->files,BCF_SR_TARGETS_OVERLAP,args->targets_overlap);
if ( bcf_sr_set_targets(args->files, args->targets_list, targets_is_file, 0)<0 )
error("Failed to read the targets: %s\n", args->targets_list);
}
bcf_srs_t *files;
bcf_hdr_t *hdr, *hnull, *hsub; // original header, sites-only header, subset header
char **argv, *format, *sample_names, *subset_fname, *targets_list, *regions_list;
+ int regions_overlap, targets_overlap;
int argc, clevel, n_threads, output_type, print_header, update_info, header_only, n_samples, *imap, calc_ac;
int trim_alts, sites_only, known, novel, min_alleles, max_alleles, private_vars, uncalled, phased;
int min_ac, min_ac_type, max_ac, max_ac_type, min_af_type, max_af_type, gt_type;
free(type_list);
}
- // setup output
- const char *tmp = hts_bcf_wmode2(args->output_type,args->fn_out);
- char modew[8];
- strcpy(modew,tmp);
- if (args->clevel >= 0 && args->clevel <= 9) sprintf(modew + 1, "%d", args->clevel);
- args->out = hts_open(args->fn_out ? args->fn_out : "-", modew);
+ char wmode[8];
+ set_wmode(wmode,args->output_type,args->fn_out,args->clevel);
+ args->out = hts_open(args->fn_out ? args->fn_out : "-", wmode);
if ( !args->out ) error("%s: %s\n", args->fn_out,strerror(errno));
if ( args->n_threads > 0)
hts_set_opt(args->out, HTS_OPT_THREAD_POOL, args->files->p);
fprintf(bcftools_stderr, "Usage: bcftools view [options] <in.vcf.gz> [region1 [...]]\n");
fprintf(bcftools_stderr, "\n");
fprintf(bcftools_stderr, "Output options:\n");
- fprintf(bcftools_stderr, " -G, --drop-genotypes drop individual genotype information (after subsetting if -s option set)\n");
- fprintf(bcftools_stderr, " -h/H, --header-only/--no-header print the header only/suppress the header in VCF output\n");
- fprintf(bcftools_stderr, " -l, --compression-level [0-9] compression level: 0 uncompressed, 1 best speed, 9 best compression [%d]\n", args->clevel);
- fprintf(bcftools_stderr, " --no-version do not append version and command line to the header\n");
- fprintf(bcftools_stderr, " -o, --output <file> output file name [bcftools_stdout]\n");
- fprintf(bcftools_stderr, " -O, --output-type <b|u|z|v> b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n");
- fprintf(bcftools_stderr, " -r, --regions <region> restrict to comma-separated list of regions\n");
- fprintf(bcftools_stderr, " -R, --regions-file <file> restrict to regions listed in a file\n");
- fprintf(bcftools_stderr, " -t, --targets [^]<region> similar to -r but streams rather than index-jumps. Exclude regions with \"^\" prefix\n");
- fprintf(bcftools_stderr, " -T, --targets-file [^]<file> similar to -R but streams rather than index-jumps. Exclude regions with \"^\" prefix\n");
- fprintf(bcftools_stderr, " --threads <int> use multithreading with <int> worker threads [0]\n");
+ fprintf(bcftools_stderr, " -G, --drop-genotypes Drop individual genotype information (after subsetting if -s option set)\n");
+ fprintf(bcftools_stderr, " -h, --header-only Print only the header in VCF output (equivalent to bcftools head)\n");
+ fprintf(bcftools_stderr, " -H, --no-header Suppress the header in VCF output\n");
+ fprintf(bcftools_stderr, " --with-header Print both header and records in VCF output [default]\n");
+ fprintf(bcftools_stderr, " -l, --compression-level [0-9] Compression level: 0 uncompressed, 1 best speed, 9 best compression [%d]\n", args->clevel);
+ fprintf(bcftools_stderr, " --no-version Do not append version and command line to the header\n");
+ fprintf(bcftools_stderr, " -o, --output FILE Output file name [bcftools_stdout]\n");
+ fprintf(bcftools_stderr, " -O, --output-type u|b|v|z[0-9] u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level [v]\n");
+ fprintf(bcftools_stderr, " -r, --regions REGION Restrict to comma-separated list of regions\n");
+ fprintf(bcftools_stderr, " -R, --regions-file FILE Restrict to regions listed in FILE\n");
+ fprintf(bcftools_stderr, " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n");
+ fprintf(bcftools_stderr, " -t, --targets [^]REGION Similar to -r but streams rather than index-jumps. Exclude regions with \"^\" prefix\n");
+ fprintf(bcftools_stderr, " -T, --targets-file [^]FILE Similar to -R but streams rather than index-jumps. Exclude regions with \"^\" prefix\n");
+ fprintf(bcftools_stderr, " --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n");
+ fprintf(bcftools_stderr, " --threads INT Use multithreading with INT worker threads [0]\n");
fprintf(bcftools_stderr, "\n");
fprintf(bcftools_stderr, "Subset options:\n");
- fprintf(bcftools_stderr, " -a, --trim-alt-alleles trim ALT alleles not seen in the genotype fields (or their subset with -s/-S)\n");
- fprintf(bcftools_stderr, " -I, --no-update do not (re)calculate INFO fields for the subset (currently INFO/AC and INFO/AN)\n");
- fprintf(bcftools_stderr, " -s, --samples [^]<list> comma separated list of samples to include (or exclude with \"^\" prefix)\n");
- fprintf(bcftools_stderr, " -S, --samples-file [^]<file> file of samples to include (or exclude with \"^\" prefix)\n");
- fprintf(bcftools_stderr, " --force-samples only warn about unknown subset samples\n");
+ fprintf(bcftools_stderr, " -a, --trim-alt-alleles Trim ALT alleles not seen in the genotype fields (or their subset with -s/-S)\n");
+ fprintf(bcftools_stderr, " -I, --no-update Do not (re)calculate INFO fields for the subset (currently INFO/AC and INFO/AN)\n");
+ fprintf(bcftools_stderr, " -s, --samples [^]LIST Comma separated list of samples to include (or exclude with \"^\" prefix)\n");
+ fprintf(bcftools_stderr, " -S, --samples-file [^]FILE File of samples to include (or exclude with \"^\" prefix)\n");
+ fprintf(bcftools_stderr, " --force-samples Only warn about unknown subset samples\n");
fprintf(bcftools_stderr, "\n");
fprintf(bcftools_stderr, "Filter options:\n");
- fprintf(bcftools_stderr, " -c/C, --min-ac/--max-ac <int>[:<type>] minimum/maximum count for non-reference (nref), 1st alternate (alt1), least frequent\n");
- fprintf(bcftools_stderr, " (minor), most frequent (major) or sum of all but most frequent (nonmajor) alleles [nref]\n");
- fprintf(bcftools_stderr, " -f, --apply-filters <list> require at least one of the listed FILTER strings (e.g. \"PASS,.\")\n");
- fprintf(bcftools_stderr, " -g, --genotype [^]<hom|het|miss> require one or more hom/het/missing genotype or, if prefixed with \"^\", exclude sites with hom/het/missing genotypes\n");
- fprintf(bcftools_stderr, " -i/e, --include/--exclude <expr> select/exclude sites for which the expression is true (see man page for details)\n");
- fprintf(bcftools_stderr, " -k/n, --known/--novel select known/novel sites only (ID is not/is '.')\n");
- fprintf(bcftools_stderr, " -m/M, --min-alleles/--max-alleles <int> minimum/maximum number of alleles listed in REF and ALT (e.g. -m2 -M2 for biallelic sites)\n");
- fprintf(bcftools_stderr, " -p/P, --phased/--exclude-phased select/exclude sites where all samples are phased\n");
- fprintf(bcftools_stderr, " -q/Q, --min-af/--max-af <float>[:<type>] minimum/maximum frequency for non-reference (nref), 1st alternate (alt1), least frequent\n");
- fprintf(bcftools_stderr, " (minor), most frequent (major) or sum of all but most frequent (nonmajor) alleles [nref]\n");
- fprintf(bcftools_stderr, " -u/U, --uncalled/--exclude-uncalled select/exclude sites without a called genotype\n");
- fprintf(bcftools_stderr, " -v/V, --types/--exclude-types <list> select/exclude comma-separated list of variant types: snps,indels,mnps,ref,bnd,other [null]\n");
- fprintf(bcftools_stderr, " -x/X, --private/--exclude-private select/exclude sites where the non-reference alleles are exclusive (private) to the subset samples\n");
+ fprintf(bcftools_stderr, " -c/C, --min-ac/--max-ac INT[:TYPE] Minimum/maximum count for non-reference (nref), 1st alternate (alt1), least frequent\n");
+ fprintf(bcftools_stderr, " (minor), most frequent (major) or sum of all but most frequent (nonmajor) alleles [nref]\n");
+ fprintf(bcftools_stderr, " -f, --apply-filters LIST Require at least one of the listed FILTER strings (e.g. \"PASS,.\")\n");
+ fprintf(bcftools_stderr, " -g, --genotype [^]hom|het|miss Require one or more hom/het/missing genotype or, if prefixed with \"^\", exclude such sites\n");
+ fprintf(bcftools_stderr, " -i/e, --include/--exclude EXPR Select/exclude sites for which the expression is true (see man page for details)\n");
+ fprintf(bcftools_stderr, " -k/n, --known/--novel Select known/novel sites only (ID is not/is '.')\n");
+ fprintf(bcftools_stderr, " -m/M, --min-alleles/--max-alleles INT Minimum/maximum number of alleles listed in REF and ALT (e.g. -m2 -M2 for biallelic sites)\n");
+ fprintf(bcftools_stderr, " -p/P, --phased/--exclude-phased Select/exclude sites where all samples are phased\n");
+ fprintf(bcftools_stderr, " -q/Q, --min-af/--max-af FLOAT[:TYPE] Minimum/maximum frequency for non-reference (nref), 1st alternate (alt1), least frequent\n");
+ fprintf(bcftools_stderr, " (minor), most frequent (major) or sum of all but most frequent (nonmajor) alleles [nref]\n");
+ fprintf(bcftools_stderr, " -u/U, --uncalled/--exclude-uncalled Select/exclude sites without a called genotype\n");
+ fprintf(bcftools_stderr, " -v/V, --types/--exclude-types LIST Select/exclude comma-separated list of variant types: snps,indels,mnps,ref,bnd,other [null]\n");
+ fprintf(bcftools_stderr, " -x/X, --private/--exclude-private Select/exclude sites where the non-reference alleles are exclusive (private) to the subset samples\n");
fprintf(bcftools_stderr, "\n");
bcftools_exit(1);
}
args->n_threads = 0;
args->record_cmd_line = 1;
args->min_ac = args->max_ac = args->min_af = args->max_af = -1;
+ args->regions_overlap = 1;
+ args->targets_overlap = 0;
int targets_is_file = 0, regions_is_file = 0;
static struct option loptions[] =
{"threads",required_argument,NULL,9},
{"header-only",no_argument,NULL,'h'},
{"no-header",no_argument,NULL,'H'},
+ {"with-header",no_argument,NULL,4},
{"exclude",required_argument,NULL,'e'},
{"include",required_argument,NULL,'i'},
{"trim-alt-alleles",no_argument,NULL,'a'},
{"exclude-types",required_argument,NULL,'V'},
{"targets",required_argument,NULL,'t'},
{"targets-file",required_argument,NULL,'T'},
+ {"targets-overlap",required_argument,NULL,2},
{"regions",required_argument,NULL,'r'},
{"regions-file",required_argument,NULL,'R'},
+ {"regions-overlap",required_argument,NULL,3},
{"min-ac",required_argument,NULL,'c'},
{"max-ac",required_argument,NULL,'C'},
{"min-af",required_argument,NULL,'q'},
case 'u': args->output_type = FT_BCF; break;
case 'z': args->output_type = FT_VCF_GZ; break;
case 'v': args->output_type = FT_VCF; break;
- default: error("The output type \"%s\" not recognised\n", optarg);
+ default:
+ {
+ args->clevel = strtol(optarg,&tmp,10);
+ if ( *tmp || args->clevel<0 || args->clevel>9 ) error("The output type \"%s\" not recognised\n", optarg);
+ }
};
+ if ( optarg[1] )
+ {
+ args->clevel = strtol(optarg+1,&tmp,10);
+ if ( *tmp || args->clevel<0 || args->clevel>9 ) error("Could not parse argument: --compression-level %s\n", optarg+1);
+ }
break;
case 'l':
args->clevel = strtol(optarg,&tmp,10);
case 'o': args->fn_out = optarg; break;
case 'H': args->print_header = 0; break;
case 'h': args->header_only = 1; break;
+ case 4 : args->print_header = 1; args->header_only = 0; break;
case 't': args->targets_list = optarg; break;
case 'T': args->targets_list = optarg; targets_is_file = 1; break;
else error("The argument to -g not recognised. Expected one of hom/het/miss/^hom/^het/^miss, got \"%s\".\n", optarg);
break;
}
+ case 2 :
+ if ( !strcasecmp(optarg,"0") ) args->targets_overlap = 0;
+ else if ( !strcasecmp(optarg,"1") ) args->targets_overlap = 1;
+ else if ( !strcasecmp(optarg,"2") ) args->targets_overlap = 2;
+ else error("Could not parse: --targets-overlap %s\n",optarg);
+ break;
+ case 3 :
+ if ( !strcasecmp(optarg,"0") ) args->regions_overlap = 0;
+ else if ( !strcasecmp(optarg,"1") ) args->regions_overlap = 1;
+ else if ( !strcasecmp(optarg,"2") ) args->regions_overlap = 2;
+ else error("Could not parse: --regions-overlap %s\n",optarg);
+ break;
case 9 : args->n_threads = strtol(optarg, 0, 0); break;
case 8 : args->record_cmd_line = 0; break;
case '?': usage(args); break;
// read in the regions from the command line
if ( args->regions_list )
{
+ bcf_sr_set_opt(args->files,BCF_SR_REGIONS_OVERLAP,args->regions_overlap);
if ( bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 )
error("Failed to read the regions: %s\n", args->regions_list);
}
}
if ( args->targets_list )
{
+ bcf_sr_set_opt(args->files,BCF_SR_TARGETS_OVERLAP,args->targets_overlap);
if ( bcf_sr_set_targets(args->files, args->targets_list, targets_is_file, 0)<0 )
error("Failed to read the targets: %s\n", args->targets_list);
}
return hts_bcf_wmode(file_type);
}
+/**
+ *  set_wmode() - build the hts_open() write-mode string for an output file
+ *  @dst:        8-byte output buffer receiving the NUL-terminated mode string
+ *  @file_type:  FT_* output type, consulted only when @fname has no
+ *               recognised suffix
+ *  @fname:      output file name; may be NULL
+ *  @clevel:     compression level 0-9 to append to the mode; any other value
+ *               leaves the mode string without a level
+ *
+ *  A recognised file name suffix (.bcf, .vcf, .vcf.gz or .vcf.bgz, compared
+ *  case-insensitively) takes precedence over @file_type.  A compression level
+ *  requested for a mode string containing 'v' or 'u' is reported via error().
+ */
+void set_wmode(char dst[8], int file_type, char *fname, int clevel)
+{
+    const char *ret = NULL;
+    int len = fname ? strlen(fname) : 0;
+    if ( len >= 4 && !strcasecmp(".bcf",fname+len-4) ) ret = hts_bcf_wmode(FT_BCF|FT_GZ);
+    else if ( len >= 4 && !strcasecmp(".vcf",fname+len-4) ) ret = hts_bcf_wmode(FT_VCF);
+    else if ( len >= 7 && !strcasecmp(".vcf.gz",fname+len-7) ) ret = hts_bcf_wmode(FT_VCF|FT_GZ);
+    else if ( len >= 8 && !strcasecmp(".vcf.bgz",fname+len-8) ) ret = hts_bcf_wmode(FT_VCF|FT_GZ);
+    else ret = hts_bcf_wmode(file_type);
+    if ( clevel>=0 && clevel<=9 )
+    {
+        // fname is NULL when no output file name was given; passing NULL to %s
+        // is undefined behaviour, so print "-" instead
+        if ( strchr(ret,'v') || strchr(ret,'u') )
+            error("Error: compression level (%d) cannot be set on uncompressed streams (%s)\n",clevel,fname ? fname : "-");
+        len = strlen(ret);
+        // the mode string plus one digit plus the terminating NUL must fit in dst[8]
+        if ( len>6 ) error("Fixme: %s\n", ret);
+        sprintf(dst, "%s%d", ret, clevel);
+    }
+    else
+        strcpy(dst, ret);
+}
+
return hts_bcf_wmode(file_type);
}
+/**
+ *  set_wmode() - build the hts_open() write-mode string for an output file
+ *  @dst:        8-byte output buffer receiving the NUL-terminated mode string
+ *  @file_type:  FT_* output type, consulted only when @fname has no
+ *               recognised suffix
+ *  @fname:      output file name; may be NULL
+ *  @clevel:     compression level 0-9 to append to the mode; any other value
+ *               leaves the mode string without a level
+ *
+ *  A recognised file name suffix (.bcf, .vcf, .vcf.gz or .vcf.bgz, compared
+ *  case-insensitively) takes precedence over @file_type.  A compression level
+ *  requested for a mode string containing 'v' or 'u' is reported via error().
+ */
+void set_wmode(char dst[8], int file_type, char *fname, int clevel)
+{
+    const char *ret = NULL;
+    int len = fname ? strlen(fname) : 0;
+    if ( len >= 4 && !strcasecmp(".bcf",fname+len-4) ) ret = hts_bcf_wmode(FT_BCF|FT_GZ);
+    else if ( len >= 4 && !strcasecmp(".vcf",fname+len-4) ) ret = hts_bcf_wmode(FT_VCF);
+    else if ( len >= 7 && !strcasecmp(".vcf.gz",fname+len-7) ) ret = hts_bcf_wmode(FT_VCF|FT_GZ);
+    else if ( len >= 8 && !strcasecmp(".vcf.bgz",fname+len-8) ) ret = hts_bcf_wmode(FT_VCF|FT_GZ);
+    else ret = hts_bcf_wmode(file_type);
+    if ( clevel>=0 && clevel<=9 )
+    {
+        // fname is NULL when no output file name was given; passing NULL to %s
+        // is undefined behaviour, so print "-" instead
+        if ( strchr(ret,'v') || strchr(ret,'u') )
+            error("Error: compression level (%d) cannot be set on uncompressed streams (%s)\n",clevel,fname ? fname : "-");
+        len = strlen(ret);
+        // the mode string plus one digit plus the terminating NUL must fit in dst[8]
+        if ( len>6 ) error("Fixme: %s\n", ret);
+        sprintf(dst, "%s%d", ret, clevel);
+    }
+    else
+        strcpy(dst, ret);
+}
+
# DEALINGS IN THE SOFTWARE.
# Master version, for use in tarballs or non-git source copies
-VERSION=1.13
+VERSION=1.14
# If we have a git clone, then check against the current tag
if [ -e .git ]
if basename == "samtools":
lines = re.sub(r"main_(reheader)\(",
r"samtools_main_\1(", lines)
+ lines = re.sub(r"\b({}_stdout)\b".format(basename), r"\1_internal", lines)
lines = re.sub(r"\bexit\(", "{}_exit(".format(basename), lines)
- lines = re.sub("stderr", "{}_stderr".format(basename), lines)
- lines = re.sub("stdout", "{}_stdout".format(basename), lines)
+ lines = re.sub(r"\bstderr\b", "{}_stderr".format(basename), lines)
+ lines = re.sub(r"\bstdout\b", "{}_stdout".format(basename), lines)
lines = re.sub(r" printf\(", " fprintf({}_stdout, ".format(basename), lines)
lines = re.sub(r"([^kf])puts\(", r"\1{}_puts(".format(basename), lines)
lines = re.sub(r"putchar\(([^)]+)\)",
using cython and a high-level, pythonic API for convenient access to
the data within genomic file formats.
-The current version wraps *htslib-1.13*, *samtools-1.13*, and *bcftools-1.13*.
+The current version wraps *htslib-1.14*, *samtools-1.14*, and *bcftools-1.14*.
To install the latest release, type::
Release notes
=============
+Release 0.18.0
+==============
+
+This release wraps htslib/samtools/bcftools version 1.14.
+
+* [#1048] and [#1060], clarify documentation of index statistics with CRAM files
+* Prevent "retval may be used uninitialised" warning.
+* Add new "samples" subcommand to pysam/samtools.py
+* Introduce TupleProxyIterator iterator object class
+
Release 0.17.0
==============
"""int with total number of mapped alignments according to the
statistics recorded in the index. This is a read-only
attribute.
+ (This will be 0 for a CRAM file indexed by a .crai index, as that
+ index format does not record these statistics.)
"""
def __get__(self):
self.check_index()
"""int with total number of unmapped reads according to the statistics
recorded in the index. This number of reads includes the number of reads
without coordinates. This is a read-only attribute.
+ (This will be 0 for a CRAM file indexed by a .crai index, as that
+ index format does not record these statistics.)
"""
def __get__(self):
self.check_index()
"""int with total number of reads without coordinates according to the
statistics recorded in the index, i.e., the statistic printed for "*"
by the ``samtools idxstats`` command. This is a read-only attribute.
+ (This will be 0 for a CRAM file indexed by a .crai index, as that
+ index format does not record these statistics.)
"""
def __get__(self):
self.check_index()
they are stored in the index, similarly to the statistics printed
by the ``samtools idxstats`` command.
+ CRAI indexes do not record these statistics, so for a CRAM file
+ with a .crai index the returned statistics will all be 0.
+
Returns:
list :
a list of records for each chromosome. Each record has the
char * data
char ** fields
int nfields
- int index
int nbytes
int offset
bint is_modified
cdef update(self, char * buffer, size_t nbytes)
+# Iterator object handed out by TupleProxy.__iter__(); it carries its own
+# position instead of storing it on the proxy.
+cdef class TupleProxyIterator:
+ # The TupleProxy whose fields are being iterated.
+ cdef TupleProxy proxy
+ # Index of the next field to return.
+ cdef int index
+
+
cdef class NamedTupleProxy(TupleProxy):
pass
def __cinit__(self, encoding="ascii"):
self.data = NULL
self.fields = NULL
- self.index = 0
self.nbytes = 0
self.is_modified = 0
self.nfields = 0
return self.nfields
def __iter__(self):
- self.index = 0
- return self
-
- def __next__(self):
- """python version of next().
- """
- if self.index >= self.nfields:
- raise StopIteration
- cdef char * retval = self.fields[self.index]
- self.index += 1
- if retval == NULL:
- return None
- else:
- return force_str(retval, self.encoding)
+ return TupleProxyIterator(self)
def __str__(self):
'''return original data'''
r = result.decode(self.encoding)
return r
+
+# Iterator over the fields of a TupleProxy, as returned by
+# TupleProxy.__iter__(). The current position is kept on the iterator
+# object itself rather than on the proxy.
+cdef class TupleProxyIterator:
+ def __init__(self, proxy):
+ # Remember the proxy being iterated and start at its first field.
+ self.proxy = proxy
+ self.index = 0
+
+ def __iter__(self):
+ # An iterator is its own iterable.
+ return self
+
+ def __next__(self):
+ # Stop once every one of the proxy's nfields fields has been returned.
+ if self.index >= self.proxy.nfields:
+ raise StopIteration
+ cdef char *retval = self.proxy.fields[self.index]
+ self.index += 1
+ # A NULL field pointer is reported as None; any other field is converted
+ # with force_str() using the proxy's encoding.
+ return force_str(retval, self.proxy.encoding) if retval != NULL else None
+
+
def toDot(v):
'''convert value to '.' if None'''
if v is None:
retval = bcftools_dispatch(n + 2, cargs)
bcftools_close_stdout()
bcftools_close_stderr()
+ else:
+ # unknown -- just return a Unix shell's "command not found" exit status
+ retval = 127
for i from 0 <= i < n:
free(cargs[i + 2])
"ampliconstats": ("ampliconstats", None),
"version": ("version", None),
"fqimport": ("import", None),
+ "samples": ("samples", None),
}
# instantiate samtools commands as python functions
// Version information used while compiling samtools, bcftools, and htslib
-#define SAMTOOLS_VERSION "1.13 (pysam)"
-#define BCFTOOLS_VERSION "1.13 (pysam)"
-#define HTS_VERSION_TEXT "1.13 (pysam)"
+#define SAMTOOLS_VERSION "1.14 (pysam)"
+#define BCFTOOLS_VERSION "1.14 (pysam)"
+#define HTS_VERSION_TEXT "1.14 (pysam)"
# pysam versioning information
-__version__ = "0.17.0"
+__version__ = "0.18.0"
-__samtools_version__ = "1.13"
-__bcftools_version__ = "1.13"
-__htslib_version__ = "1.13"
+__samtools_version__ = "1.14"
+__bcftools_version__ = "1.14"
+__htslib_version__ = "1.14"
The typical simple case of building Samtools using the HTSlib bundled within
this Samtools release tarball is done as follows:
- cd .../samtools-1.13 # Within the unpacked release directory
+ cd .../samtools-1.14 # Within the unpacked release directory
./configure
make
installation using the HTSlib bundled within this Samtools release tarball,
and building the various HTSlib utilities such as bgzip is done as follows:
- cd .../samtools-1.13 # Within the unpacked release directory
+ cd .../samtools-1.14 # Within the unpacked release directory
./configure --prefix=/path/to/location
make all all-htslib
make install install-htslib
To build with plug-ins, you need to use the --enable-plugins configure option
as follows:
- cd .../samtools-1.13 # Within the unpacked release directory
+ cd .../samtools-1.14 # Within the unpacked release directory
./configure --enable-plugins --prefix=/path/to/location
make all all-htslib
make install install-htslib
the source distribution instead of installing the package. In that case
you can use:
- cd .../samtools-1.13 # Within the unpacked release directory
- ./configure --enable-plugins --with-plugin-path=$PWD/htslib-1.13
+ cd .../samtools-1.14 # Within the unpacked release directory
+ ./configure --enable-plugins --with-plugin-path=$PWD/htslib-1.14
make all all-htslib
It is possible to override the built-in search path using the HTS_PATH
-/* bam.c -- BAM format.
+/* bam.c -- miscellaneous BAM functions.
Copyright (C) 2008-2013, 2015, 2019-2020 Genome Research Ltd.
Portions copyright (C) 2009-2012 Broad Institute.
#include "bam.h"
#include "htslib/kstring.h"
-char *bam_format1(const bam_header_t *header, const bam1_t *b)
-{
- kstring_t str;
- str.l = str.m = 0; str.s = NULL;
- if (sam_format1(header, b, &str) < 0) {
- free(str.s);
- str.s = NULL;
- return NULL;
- }
- return str.s;
-}
-
-int bam_view1(const bam_header_t *header, const bam1_t *b)
-{
- char *s = bam_format1(header, b);
- int ret = -1;
- if (!s) return -1;
- if (puts(s) != EOF) ret = 0;
- free(s);
- return ret;
-}
-
-int bam_validate1(const bam_header_t *header, const bam1_t *b)
-{
- char *s;
-
- if (b->core.tid < -1 || b->core.mtid < -1) return 0;
- if (header && (b->core.tid >= sam_hdr_nref(header) || b->core.mtid >= sam_hdr_nref(header))) return 0;
-
- if (b->data_len < b->core.l_qname) return 0;
- s = memchr(bam1_qname(b), '\0', b->core.l_qname);
- if (s != &bam1_qname(b)[b->core.l_qname-1]) return 0;
-
- // FIXME: Other fields could also be checked, especially the auxiliary data
-
- return 1;
-}
-
-#ifndef MIN
-#define MIN(a,b) ((a)<(b)?(a):(b))
-#endif
-
// FIXME: we should also check the LB tag associated with each alignment
-const char *bam_get_library(bam_header_t *h, const bam1_t *b)
+const char *bam_get_library(sam_hdr_t *h, const bam1_t *b)
{
const char *rg;
kstring_t lib = { 0, 0, NULL };
return LB_text;
}
-int bam_fetch(bamFile fp, const bam_index_t *idx, int tid, int beg, int end, void *data, bam_fetch_f func)
-{
- int ret;
- bam_iter_t iter;
- bam1_t *b;
- b = bam_init1();
- iter = bam_iter_query(idx, tid, beg, end);
- while ((ret = bam_iter_read(fp, iter, b)) >= 0) func(b, data);
- bam_iter_destroy(iter);
- bam_destroy1(b);
- return (ret == -1)? 0 : ret;
-}
-
/************
* Remove B *
************/
#include "samtools.pysam.h"
-/* bam.c -- BAM format.
+/* bam.c -- miscellaneous BAM functions.
Copyright (C) 2008-2013, 2015, 2019-2020 Genome Research Ltd.
Portions copyright (C) 2009-2012 Broad Institute.
#include "bam.h"
#include "htslib/kstring.h"
-char *bam_format1(const bam_header_t *header, const bam1_t *b)
-{
- kstring_t str;
- str.l = str.m = 0; str.s = NULL;
- if (sam_format1(header, b, &str) < 0) {
- free(str.s);
- str.s = NULL;
- return NULL;
- }
- return str.s;
-}
-
-int bam_view1(const bam_header_t *header, const bam1_t *b)
-{
- char *s = bam_format1(header, b);
- int ret = -1;
- if (!s) return -1;
- if (samtools_puts(s) != EOF) ret = 0;
- free(s);
- return ret;
-}
-
-int bam_validate1(const bam_header_t *header, const bam1_t *b)
-{
- char *s;
-
- if (b->core.tid < -1 || b->core.mtid < -1) return 0;
- if (header && (b->core.tid >= sam_hdr_nref(header) || b->core.mtid >= sam_hdr_nref(header))) return 0;
-
- if (b->data_len < b->core.l_qname) return 0;
- s = memchr(bam1_qname(b), '\0', b->core.l_qname);
- if (s != &bam1_qname(b)[b->core.l_qname-1]) return 0;
-
- // FIXME: Other fields could also be checked, especially the auxiliary data
-
- return 1;
-}
-
-#ifndef MIN
-#define MIN(a,b) ((a)<(b)?(a):(b))
-#endif
-
// FIXME: we should also check the LB tag associated with each alignment
-const char *bam_get_library(bam_header_t *h, const bam1_t *b)
+const char *bam_get_library(sam_hdr_t *h, const bam1_t *b)
{
const char *rg;
kstring_t lib = { 0, 0, NULL };
return LB_text;
}
-int bam_fetch(bamFile fp, const bam_index_t *idx, int tid, int beg, int end, void *data, bam_fetch_f func)
-{
- int ret;
- bam_iter_t iter;
- bam1_t *b;
- b = bam_init1();
- iter = bam_iter_query(idx, tid, beg, end);
- while ((ret = bam_iter_read(fp, iter, b)) >= 0) func(b, data);
- bam_iter_destroy(iter);
- bam_destroy1(b);
- return (ret == -1)? 0 : ret;
-}
-
/************
* Remove B *
************/
-/* bam.h -- BAM API.
+/* bam.h -- miscellaneous BAM functions.
Copyright (C) 2008-2014, 2019 Genome Research Ltd.
- Portions copyright (C) 2010-2012 Broad Institute.
Author: Heng Li <lh3@sanger.ac.uk>
#ifndef BAM_BAM_H
#define BAM_BAM_H
-/*!
- @header
-
- BAM library provides I/O and various operations on manipulating files
- in the BAM (Binary Alignment/Mapping) or SAM (Sequence Alignment/Map)
- format. It now supports importing from or exporting to SAM, sorting,
- merging, generating pileup, and quickly retrieval of reads overlapped
- with a specified region.
-
- @copyright Genome Research Ltd.
- */
-
-#define BAM_VERSION "1.13"
-
-#include <stdint.h>
-#include <stdlib.h>
-#include <string.h>
-#include <stdio.h>
-
-#include "htslib/bgzf.h"
#include "htslib/sam.h"
-/*! @abstract BAM file handler */
-typedef BGZF *bamFile;
-#define bam_open(fn, mode) bgzf_open(fn, mode)
-#define bam_dopen(fd, mode) bgzf_fdopen(fd, mode)
-#define bam_close(fp) bgzf_close(fp)
-#define bam_tell(fp) bgzf_tell(fp)
-#define bam_seek(fp, pos, dir) bgzf_seek(fp, pos, dir)
-
-/*! @typedef
- @abstract Structure for the alignment header.
- @field n_targets number of reference sequences
- @field target_name names of the reference sequences
- @field target_len lengths of the referene sequences
- @field dict header dictionary
- @field hash hash table for fast name lookup
- @field rg2lib hash table for @RG-ID -> LB lookup
- @field l_text length of the plain text in the header
- @field text plain text
-
- @discussion Field hash points to null by default. It is a private
- member.
- */
-typedef bam_hdr_t bam_header_t;
-
-// TODO This flag-formatting functionality does not currently exist in htslib
-#define BAM_OFDEC 0
-#define BAM_OFHEX 1
-#define BAM_OFSTR 2
-
-/*! @abstract default mask for pileup */
-#define BAM_DEF_MASK (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP)
-
-/*! @typedef
- @abstract Structure for core alignment information.
- @field tid chromosome ID, defined by bam_header_t
- @field pos 0-based leftmost coordinate
- @field bin bin calculated by bam_reg2bin()
- @field qual mapping quality
- @field l_qname length of the query name
- @field flag bitwise flag
- @field n_cigar number of CIGAR operations
- @field l_qseq length of the query sequence (read)
- */
-// typedef struct { ... } bam1_core_t;
-
-/*! @typedef
- @abstract Structure for one alignment.
- @field core core information about the alignment
- @field l_aux length of auxiliary data
- @field data_len current length of bam1_t::data
- @field m_data maximum length of bam1_t::data
- @field data all variable-length data, concatenated; structure: qname-cigar-seq-qual-aux
-
- @discussion Notes:
-
- 1. qname is zero tailing and core.l_qname includes the tailing '\0'.
- 2. l_qseq is calculated from the total length of an alignment block
- on reading or from CIGAR.
- 3. cigar data is encoded 4 bytes per CIGAR operation.
- 4. seq is nybble-encoded according to bam_nt16_table.
- */
-// typedef struct { ... } bam1_t;
-// NOTE htslib version doesn't have l_aux; use bam_get_l_aux(b) instead
-#ifndef SAMTOOLS_HTSLIB_SUPPRESS_HACKS
-// NOTE htslib also renames data_len to l_data; this macro may help or hinder
-#define data_len l_data
-#endif
-
-typedef hts_itr_t *bam_iter_t;
-
-#define bam1_strand(b) (bam_is_rev((b)))
-#define bam1_mstrand(b) (bam_is_mrev((b)))
-
-/*! @function
- @abstract Get the CIGAR array
- @param b pointer to an alignment
- @return pointer to the CIGAR array
-
- @discussion In the CIGAR array, each element is a 32-bit integer. The
- lower 4 bits gives a CIGAR operation and the higher 28 bits keep the
- length of a CIGAR.
- */
-#define bam1_cigar(b) (bam_get_cigar((b)))
-
-/*! @function
- @abstract Get the name of the query
- @param b pointer to an alignment
- @return pointer to the name string, null terminated
- */
-#define bam1_qname(b) (bam_get_qname((b)))
-
-/*! @function
- @abstract Get query sequence
- @param b pointer to an alignment
- @return pointer to sequence
-
- @discussion Each base is encoded in 4 bits: 1 for A, 2 for C, 4 for G,
- 8 for T and 15 for N. Two bases are packed in one byte with the base
- at the higher 4 bits having smaller coordinate on the read. It is
- recommended to use bam1_seqi() macro to get the base.
- */
-#define bam1_seq(b) (bam_get_seq((b)))
-
-/*! @function
- @abstract Get query quality
- @param b pointer to an alignment
- @return pointer to quality string
- */
-#define bam1_qual(b) (bam_get_qual((b)))
-
-/*! @function
- @abstract Get a base on read
- @param s Query sequence returned by bam1_seq()
- @param i The i-th position, 0-based
- @return 4-bit integer representing the base.
- */
-#define bam1_seqi(s, i) (bam_seqi((s), (i)))
-
-/*! @function
- @abstract Get auxiliary data
- @param b pointer to an alignment
- @return pointer to the concatenated auxiliary data
- */
-#define bam1_aux(b) (bam_get_aux((b)))
-
-/*!
- @abstract Verbose level between 0 and 3; 0 is supposed to disable all
- debugging information, though this may not have been implemented.
- */
-#define bam_verbose hts_verbose
-
-/*! @abstract Table for converting a nucleotide character to the 4-bit encoding. */
-#define bam_nt16_table seq_nt16_table
-
-/*! @abstract Table for converting a 4-bit encoded nucleotide to a letter. */
-#define bam_nt16_rev_table seq_nt16_str
-
-/*! @abstract Table for converting a 4-bit encoded nucleotide to ~2 bits. */
-#define bam_nt16_nt4_table seq_nt16_int
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
- /*********************
- * Low-level SAM I/O *
- *********************/
-
- /*! @abstract TAM file handler */
- typedef samFile *tamFile;
-
- /*!
- @abstract Open a SAM file for reading, either uncompressed or compressed by gzip/zlib.
- @param fn SAM file name
- @return SAM file handler
- */
- static inline tamFile samtools_sam_open(const char *fn) { return sam_open(fn, "r"); }
- #undef sam_open
- #define sam_open samtools_sam_open
-
- /*!
- @abstract Close a SAM file handler
- @param fp SAM file handler
- */
- // void sam_close(tamFile fp);
-
- /*!
- @abstract Read one alignment from a SAM file handler
- @param fp SAM file handler
- @param header header information (ordered names of chromosomes)
- @param b read alignment; all members in b will be updated
- @return 0 if successful; otherwise negative
- */
- // int sam_read1(tamFile fp, bam_header_t *header, bam1_t *b);
-
- /*!
- @abstract Read header from a SAM file (if present)
- @param fp SAM file handler
- @return pointer to header struct; 0 if no @SQ lines available
- */
- static inline bam_header_t *sam_header_read(tamFile fp) { return sam_hdr_read(fp); }
-
- // Note the distressing cast -- bam_name2id is not thread-safe
- static inline int32_t bam_get_tid(const bam_header_t *header, const char *seq_name) { return bam_name2id((bam_header_t *)header, seq_name); }
-
-
- /*********************
- * Low-level BAM I/O *
- *********************/
-
- /*!
- @abstract Initialize a header structure.
- @return the pointer to the header structure
- */
- static inline bam_header_t *bam_header_init(void) { return sam_hdr_init(); }
-
- /*!
- @abstract Destroy a header structure.
- @param header pointer to the header
- */
- static inline void bam_header_destroy(bam_header_t *header) { sam_hdr_destroy(header); }
-
- /*!
- @abstract Read a header structure from BAM.
- @param fp BAM file handler, opened by bam_open()
- @return pointer to the header structure
-
- @discussion The file position indicator must be placed at the
- beginning of the file. Upon success, the position indicator will
- be set at the start of the first alignment.
- */
- static inline bam_header_t *bam_header_read(bamFile fp) { return bam_hdr_read(fp); }
-
- /*!
- @abstract Write a header structure to BAM.
- @param fp BAM file handler
- @param header pointer to the header structure
- @return always 0 currently
- */
- static inline int bam_header_write(bamFile fp, bam_header_t *header) { return bam_hdr_write(fp, header); }
-
- /*!
- @abstract Read an alignment from BAM.
- @param fp BAM file handler
- @param b read alignment; all members are updated.
- @return number of bytes read from the file
-
- @discussion The file position indicator must be
- placed right before an alignment. Upon success, this function
- will set the position indicator to the start of the next
- alignment. This function is not affected by the machine
- endianness.
- */
- // int bam_read1(bamFile fp, bam1_t *b);
-
- int bam_remove_B(bam1_t *b);
-
- /*!
- @abstract Write an alignment to BAM.
- @param fp BAM file handler
- @param b alignment to write
- @return number of bytes written to the file
- */
- // int bam_write1(bamFile fp, const bam1_t *b);
-
- /*! @function
- @abstract Initiate a pointer to bam1_t struct
- */
-//#define bam_init1()
-
- /*! @function
- @abstract Free the memory allocated for an alignment.
- @param b pointer to an alignment
- */
-//#define bam_destroy1(b)
-
- /*!
- @abstract Format a BAM record in the SAM format
- @param header pointer to the header structure
- @param b alignment to print
- @return a pointer to the SAM string
- */
- char *bam_format1(const bam_header_t *header, const bam1_t *b);
-
- /*!
- @abstract Formats a BAM record and writes it and \n to stdout
- @return 0 if successful, -1 on error
- */
- int bam_view1(const bam_header_t *header, const bam1_t *b);
-
- /*!
- @abstract Check whether a BAM record is plausibly valid
- @param header associated header structure, or NULL if unavailable
- @param b alignment to validate
- @return 0 if the alignment is invalid; non-zero otherwise
-
- @discussion Simple consistency check of some of the fields of the
- alignment record. If the header is provided, several additional checks
- are made. Not all fields are checked, so a non-zero result is not a
- guarantee that the record is valid. However it is usually good enough
- to detect when bam_seek() has been called with a virtual file offset
- that is not the offset of an alignment record.
- */
- int bam_validate1(const bam_header_t *header, const bam1_t *b);
-
- // TODO Parses headers, so not yet implemented in terms of htslib
- const char *bam_get_library(bam_header_t *header, const bam1_t *b);
-
-
- /***************
- * pileup APIs *
- ***************/
-
- /*! @typedef
- @abstract Structure for one alignment covering the pileup position.
- @field b pointer to the alignment
- @field qpos position of the read base at the pileup site, 0-based
- @field indel indel length; 0 for no indel, positive for ins and negative for del
- @field is_del 1 iff the base on the padded read is a deletion
- @field level the level of the read in the "viewer" mode
-
- @discussion See also bam_plbuf_push() and bam_lplbuf_push(). The
- difference between the two functions is that the former does not
- set bam_pileup1_t::level, while the later does. Level helps the
- implementation of alignment viewers, but calculating this has some
- overhead.
- */
- // typedef struct { ... } bam_pileup1_t;
-
- // typedef int (*bam_plp_auto_f)(void *data, bam1_t *b);
-
- // typedef struct incomplete *bam_plp_t;
-
- // bam_plp_t bam_plp_init(bam_plp_auto_f read, void *data);
- // int bam_plp_push(bam_plp_t iter, const bam1_t *b);
- // const bam_pileup1_t *bam_plp_next(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp);
- // const bam_pileup1_t *bam_plp_auto(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp);
- // void bam_plp_set_maxcnt(bam_plp_t iter, int maxcnt);
- // void bam_plp_reset(bam_plp_t iter);
- // void bam_plp_destroy(bam_plp_t iter);
-
- // typedef struct incomplete *bam_mplp_t;
-
- // bam_mplp_t bam_mplp_init(int n, bam_plp_auto_f func, void **data);
- // void bam_mplp_destroy(bam_mplp_t iter);
- // void bam_mplp_set_maxcnt(bam_mplp_t iter, int maxcnt);
- // int bam_mplp_auto(bam_mplp_t iter, int *_tid, int *_pos, int *n_plp, const bam_pileup1_t **plp);
-
- /*! @typedef
- @abstract Type of function to be called by bam_plbuf_push().
- @param tid chromosome ID as is defined in the header
- @param pos start coordinate of the alignment, 0-based
- @param n number of elements in pl array
- @param pl array of alignments
- @param data user provided data
- @discussion See also bam_plbuf_push(), bam_plbuf_init() and bam_pileup1_t.
- */
- typedef int (*bam_pileup_f)(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void *data);
-
- typedef struct {
- bam_plp_t iter;
- bam_pileup_f func;
- void *data;
- } bam_plbuf_t;
-
- void bam_plbuf_reset(bam_plbuf_t *buf);
- bam_plbuf_t *bam_plbuf_init(bam_pileup_f func, void *data);
- void bam_plbuf_destroy(bam_plbuf_t *buf);
- int bam_plbuf_push(const bam1_t *b, bam_plbuf_t *buf);
-
- int bam_pileup_file(bamFile fp, int mask, bam_pileup_f func, void *func_data);
-
- struct __bam_lplbuf_t;
- typedef struct __bam_lplbuf_t bam_lplbuf_t;
-
- void bam_lplbuf_reset(bam_lplbuf_t *buf);
-
- /*! @abstract bam_plbuf_init() equivalent with level calculated. */
- bam_lplbuf_t *bam_lplbuf_init(bam_pileup_f func, void *data);
-
- /*! @abstract bam_plbuf_destroy() equivalent with level calculated. */
- void bam_lplbuf_destroy(bam_lplbuf_t *tv);
-
- /*! @abstract bam_plbuf_push() equivalent with level calculated. */
- int bam_lplbuf_push(const bam1_t *b, bam_lplbuf_t *buf);
-
-
- /*********************
- * BAM indexing APIs *
- *********************/
-
- typedef hts_idx_t bam_index_t;
-
- /*!
- @abstract Build index for a BAM file.
- @discussion Index file "fn.bai" will be created.
- @param fn name of the BAM file
- @return always 0 currently
- */
- static inline int samtools_bam_index_build(const char *fn) { return bam_index_build(fn, 0); }
- #undef bam_index_build
- #define bam_index_build samtools_bam_index_build
-
- /*!
- @abstract Load index from file "fn.bai".
- @param fn name of the BAM file (NOT the index file)
- @return pointer to the index structure
- */
- // bam_index_t *bam_index_load(const char *fn);
-
- /*!
- @abstract Destroy an index structure.
- @param idx pointer to the index structure
- */
- static inline void bam_index_destroy(bam_index_t *idx) { hts_idx_destroy(idx); }
-
- /*! @typedef
- @abstract Type of function to be called by bam_fetch().
- @param b the alignment
- @param data user provided data
- */
- typedef int (*bam_fetch_f)(const bam1_t *b, void *data);
-
- /*!
- @abstract Retrieve the alignments that are overlapped with the
- specified region. (For BAM files only; see also samfetch() in sam.h.)
-
- @discussion A user defined function will be called for each
- retrieved alignment ordered by its start position.
-
- @param fp BAM file handler
- @param idx pointer to the alignment index
- @param tid chromosome ID as is defined in the header
- @param beg start coordinate, 0-based
- @param end end coordinate, 0-based
- @param data user provided data (will be transferred to func)
- @param func user defined function
- */
- int bam_fetch(bamFile fp, const bam_index_t *idx, int tid, int beg, int end, void *data, bam_fetch_f func);
-
- static inline bam_iter_t bam_iter_query(const bam_index_t *idx, int tid, int beg, int end) { return bam_itr_queryi(idx, tid, beg, end); }
- static inline int bam_iter_read(bamFile fp, bam_iter_t iter, bam1_t *b) { return iter? hts_itr_next(fp, iter, b, 0) : bam_read1(fp, b); }
- static inline void bam_iter_destroy(bam_iter_t iter) { bam_itr_destroy(iter); }
-
- /*!
- @abstract Parse a region in the format: "chr2:100,000-200,000".
- @discussion bam_header_t::hash will be initialized if empty.
- @param header pointer to the header structure
- @param str string to be parsed
- @param ref_id the returned chromosome ID
- @param begin the returned start coordinate
- @param end the returned end coordinate
- @return 0 on success; -1 on failure
- */
- int bam_parse_region(bam_header_t *header, const char *str, int *ref_id, int *begin, int *end);
-
-
- /**************************
- * APIs for optional tags *
- **************************/
-
- /*!
- @abstract Retrieve data of a tag
- @param b pointer to an alignment struct
- @param tag two-character tag to be retrieved
-
- @return pointer to the type and data. The first character is the
- type that can be 'iIsScCdfAZH'.
-
- @discussion Use bam_aux2?() series to convert the returned data to
- the corresponding type.
- */
- // uint8_t *bam_aux_get(const bam1_t *b, const char tag[2]);
-
- // int32_t bam_aux2i(const uint8_t *s);
- // float bam_aux2f(const uint8_t *s);
- #define bam_aux2d(s) (bam_aux2f((s)))
- // char bam_aux2A(const uint8_t *s);
- // char *bam_aux2Z(const uint8_t *s);
-
- // int bam_aux_del(bam1_t *b, uint8_t *s);
- // void bam_aux_append(bam1_t *b, const char tag[2], char type, int len, uint8_t *data);
- static inline uint8_t *bam_aux_get_core(bam1_t *b, const char tag[2]) { return bam_aux_get(b, tag); } // an alias of bam_aux_get()
-
-
- /*****************
- * Miscellaneous *
- *****************/
-
- /*!
- @abstract Calculate the rightmost coordinate of an alignment on the
- reference genome.
-
- @param c pointer to the bam1_core_t structure
- @param cigar the corresponding CIGAR array (from bam1_t::cigar)
- @return the rightmost coordinate, 0-based
- */
- static inline uint32_t bam_calend(const bam1_core_t *c, const uint32_t *cigar) { return c->pos + (c->n_cigar? bam_cigar2rlen(c->n_cigar, cigar) : 1); }
-
- /*!
- @abstract Calculate the length of the query sequence from CIGAR.
- @param c pointer to the bam1_core_t structure
- @param cigar the corresponding CIGAR array (from bam1_t::cigar)
- @return length of the query sequence
- */
- static inline int32_t samtools_bam_cigar2qlen(const bam1_core_t *c, const uint32_t *cigar) { return bam_cigar2qlen(c->n_cigar, cigar); }
- #undef bam_cigar2qlen
- #define bam_cigar2qlen samtools_bam_cigar2qlen
-
-#ifdef __cplusplus
-}
-#endif
-
-/*!
- @abstract Calculate the minimum bin that contains a region [beg,end).
- @param beg start of the region, 0-based
- @param end end of the region, 0-based
- @return bin
- */
-static inline int bam_reg2bin(uint32_t beg, uint32_t end)
-{
- return hts_reg2bin(beg, end, 14, 5);
-}
-
-/*!
- @abstract Copy an alignment
- @param bdst destination alignment struct
- @param bsrc source alignment struct
- @return pointer to the destination alignment struct
- */
-// bam1_t *bam_copy1(bam1_t *bdst, const bam1_t *bsrc)
+int bam_remove_B(bam1_t *b);
-/*!
- @abstract Duplicate an alignment
- @param src source alignment struct
- @return pointer to the destination alignment struct
- */
-// bam1_t *bam_dup1(const bam1_t *src)
+const char *bam_get_library(sam_hdr_t *header, const bam1_t *b);
#endif
/* bam2bcf.c -- variant calling.
Copyright (C) 2010-2012 Broad Institute.
- Copyright (C) 2012-2015 Genome Research Ltd.
+ Copyright (C) 2012-2015, 2021 Genome Research Ltd.
Author: Heng Li <lh3@sanger.ac.uk>
double calc_mwu_bias_cdf(int *a, int *b, int n)
{
int na = 0, nb = 0, i;
- double U = 0, ties = 0;
+ double U = 0;
for (i=0; i<n; i++)
{
na += a[i];
U += a[i] * (nb + b[i]*0.5);
nb += b[i];
- if ( a[i] && b[i] )
- {
- double tie = a[i] + b[i];
- ties += (tie*tie-1)*tie;
- }
}
if ( !na || !nb ) return HUGE_VAL;
if ( na>=8 || nb>=8 )
{
double mean = ((double)na*nb)*0.5;
- // Correction for ties:
- // double N = na+nb;
- // double var2 = (N*N-1)*N-ties;
- // if ( var2==0 ) return 1.0;
- // var2 *= ((double)na*nb)/N/(N-1)/12.0;
- // No correction for ties:
double var2 = ((double)na*nb)*(na+nb+1)/12.0;
double z = (U_min - mean)/sqrt(2*var2); // z is N(0,1)
return 2.0 - kf_erfc(z); // which is 1 + erf(z)
double calc_mwu_bias(int *a, int *b, int n)
{
int na = 0, nb = 0, i;
- double U = 0, ties = 0;
+ double U = 0;
for (i=0; i<n; i++)
{
na += a[i];
U += a[i] * (nb + b[i]*0.5);
nb += b[i];
- if ( a[i] && b[i] )
- {
- double tie = a[i] + b[i];
- ties += (tie*tie-1)*tie;
- }
}
if ( !na || !nb ) return HUGE_VAL;
if ( na==1 || nb==1 ) return 1.0; // Flat probability, all U values are equally likely
// Linear approximation
return U>mean ? (2.0*mean-U)/mean : U/mean;
}
- // Correction for ties:
- // double N = na+nb;
- // double var2 = (N*N-1)*N-ties;
- // if ( var2==0 ) return 1.0;
- // var2 *= ((double)na*nb)/N/(N-1)/12.0;
- // No correction for ties:
double var2 = ((double)na*nb)*(na+nb+1)/12.0;
if ( na>=8 || nb>=8 )
{
/* bam2bcf.c -- variant calling.
Copyright (C) 2010-2012 Broad Institute.
- Copyright (C) 2012-2015 Genome Research Ltd.
+ Copyright (C) 2012-2015, 2021 Genome Research Ltd.
Author: Heng Li <lh3@sanger.ac.uk>
double calc_mwu_bias_cdf(int *a, int *b, int n)
{
int na = 0, nb = 0, i;
- double U = 0, ties = 0;
+ double U = 0;
for (i=0; i<n; i++)
{
na += a[i];
U += a[i] * (nb + b[i]*0.5);
nb += b[i];
- if ( a[i] && b[i] )
- {
- double tie = a[i] + b[i];
- ties += (tie*tie-1)*tie;
- }
}
if ( !na || !nb ) return HUGE_VAL;
if ( na>=8 || nb>=8 )
{
double mean = ((double)na*nb)*0.5;
- // Correction for ties:
- // double N = na+nb;
- // double var2 = (N*N-1)*N-ties;
- // if ( var2==0 ) return 1.0;
- // var2 *= ((double)na*nb)/N/(N-1)/12.0;
- // No correction for ties:
double var2 = ((double)na*nb)*(na+nb+1)/12.0;
double z = (U_min - mean)/sqrt(2*var2); // z is N(0,1)
return 2.0 - kf_erfc(z); // which is 1 + erf(z)
double calc_mwu_bias(int *a, int *b, int n)
{
int na = 0, nb = 0, i;
- double U = 0, ties = 0;
+ double U = 0;
for (i=0; i<n; i++)
{
na += a[i];
U += a[i] * (nb + b[i]*0.5);
nb += b[i];
- if ( a[i] && b[i] )
- {
- double tie = a[i] + b[i];
- ties += (tie*tie-1)*tie;
- }
}
if ( !na || !nb ) return HUGE_VAL;
if ( na==1 || nb==1 ) return 1.0; // Flat probability, all U values are equally likely
// Linear approximation
return U>mean ? (2.0*mean-U)/mean : U/mean;
}
- // Correction for ties:
- // double N = na+nb;
- // double var2 = (N*N-1)*N-ties;
- // if ( var2==0 ) return 1.0;
- // var2 *= ((double)na*nb)/N/(N-1)/12.0;
- // No correction for ties:
double var2 = ((double)na*nb)*(na+nb+1)/12.0;
if ( na>=8 || nb>=8 )
{
sam_hdr_t **header;
int c, has_index_file = 0;
char *file_list = NULL, **fn = NULL;
+ char *out_file = NULL;
depth_opt opt = {
.flag = BAM_FUNMAP | BAM_FSECONDARY | BAM_FDUP | BAM_FQCFAIL,
.min_qual = 0,
case 'o':
if (opt.out != stdout)
break;
- opt.out = fopen(optarg, "w");
+ opt.out = fopen(out_file = optarg, "w");
if (!opt.out) {
print_error_errno("depth", "Cannot open \"%s\" for writing.",
optarg);
if (opt.bed)
bed_destroy(opt.bed);
sam_global_args_free(&ga);
- if (opt.out != stdout) fclose(opt.out);
+ if (opt.out != stdout) {
+ if (fclose(opt.out) != 0 && ret == 0) {
+ print_error_errno("depth", "error on closing \"%s\"", out_file);
+ ret = 1;
+ }
+ }
+
return ret;
}
sam_hdr_t **header;
int c, has_index_file = 0;
char *file_list = NULL, **fn = NULL;
+ char *out_file = NULL;
depth_opt opt = {
.flag = BAM_FUNMAP | BAM_FSECONDARY | BAM_FDUP | BAM_FQCFAIL,
.min_qual = 0,
case 'o':
if (opt.out != samtools_stdout)
break;
- opt.out = fopen(optarg, "w");
+ opt.out = fopen(out_file = optarg, "w");
if (!opt.out) {
print_error_errno("depth", "Cannot open \"%s\" for writing.",
optarg);
if (opt.bed)
bed_destroy(opt.bed);
sam_global_args_free(&ga);
- if (opt.out != samtools_stdout) fclose(opt.out);
+ if (opt.out != samtools_stdout) {
+ if (fclose(opt.out) != 0 && ret == 0) {
+ print_error_errno("depth", "error on closing \"%s\"", out_file);
+ ret = 1;
+ }
+ }
+
return ret;
}
#include <config.h>
#include <ctype.h>
-#include <limits.h>
-#include "bam.h"
+#include "htslib/sam.h"
static inline int bam_aux_type2size(int x)
{
}
return 0;
}
-
-// Only here due to libbam.a being used by some applications.
-int bam_parse_region(bam_header_t *header, const char *str, int *ref_id, int *beg, int *end)
-{
- hts_pos_t beg64, end64;
- int r;
- r = sam_parse_region(header, str, ref_id, &beg64, &end64, 0) ? 0 : -1;
- if (beg64 > INT_MAX || end64 > INT_MAX)
- return -1;
- *beg = beg64;
- *end = end64;
- return r;
-}
#include <config.h>
#include <ctype.h>
-#include <limits.h>
-#include "bam.h"
+#include "htslib/sam.h"
static inline int bam_aux_type2size(int x)
{
}
return 0;
}
-
-// Only here due to libbam.a being used by some applications.
-int bam_parse_region(bam_header_t *header, const char *str, int *ref_id, int *beg, int *end)
-{
- hts_pos_t beg64, end64;
- int r;
- r = sam_parse_region(header, str, ref_id, &beg64, &end64, 0) ? 0 : -1;
- if (beg64 > INT_MAX || end64 > INT_MAX)
- return -1;
- *beg = beg64;
- *end = end64;
- return r;
-}
+++ /dev/null
-/* bam_endian.h -- endianness conversion functions.
-
- Copyright (C) 2008 Genome Research Ltd.
-
- Author: Heng Li <lh3@sanger.ac.uk>
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-DEALINGS IN THE SOFTWARE. */
-
-#ifndef BAM_ENDIAN_H
-#define BAM_ENDIAN_H
-
-#include <stdint.h>
-
-static inline int bam_is_big_endian()
-{
- long one= 1;
- return !(*((char *)(&one)));
-}
-static inline uint16_t bam_swap_endian_2(uint16_t v)
-{
- return (uint16_t)(((v & 0x00FF00FFU) << 8) | ((v & 0xFF00FF00U) >> 8));
-}
-static inline void *bam_swap_endian_2p(void *x)
-{
- *(uint16_t*)x = bam_swap_endian_2(*(uint16_t*)x);
- return x;
-}
-static inline uint32_t bam_swap_endian_4(uint32_t v)
-{
- v = ((v & 0x0000FFFFU) << 16) | (v >> 16);
- return ((v & 0x00FF00FFU) << 8) | ((v & 0xFF00FF00U) >> 8);
-}
-static inline void *bam_swap_endian_4p(void *x)
-{
- *(uint32_t*)x = bam_swap_endian_4(*(uint32_t*)x);
- return x;
-}
-static inline uint64_t bam_swap_endian_8(uint64_t v)
-{
- v = ((v & 0x00000000FFFFFFFFLLU) << 32) | (v >> 32);
- v = ((v & 0x0000FFFF0000FFFFLLU) << 16) | ((v & 0xFFFF0000FFFF0000LLU) >> 16);
- return ((v & 0x00FF00FF00FF00FFLLU) << 8) | ((v & 0xFF00FF00FF00FF00LLU) >> 8);
-}
-static inline void *bam_swap_endian_8p(void *x)
-{
- *(uint64_t*)x = bam_swap_endian_8(*(uint64_t*)x);
- return x;
-}
-
-#endif
return false;
}
set_sam_opts(state->hstdout, state, opts);
+ autoflush_if_stdout(state->hstdout, "-");
}
state->fpr[i] = state->hstdout;
}
}
}
if (state->hstdout) {
+ release_autoflush(state->hstdout);
if (sam_close(state->hstdout) < 0) {
print_error_errno("bam2fq", "Error closing STDOUT");
valid = false;
int output_index(bam1_t *b1, bam1_t *b2, bam2fq_state_t *state,
bam2fq_opts_t* opts) {
- bam1_t *b[2] = {b1, b2};
+ bam1_t *b = b1 ? b1 : b2;
char *ifmt = opts->index_format;
if (!ifmt)
break;
case 'i':
- if (write_index_rec(state->fpi[inum], b[inum], state, opts,
+ if (write_index_rec(state->fpi[inum], b, state, opts,
bc, bc_end-bc, qt, qt_end-qt) < 0)
return -1;
bc = bc_end + (len==0);
while (true) {
int res = sam_read1(state->fp, state->h, b[n]);
if (res < -1) {
- fprintf(stderr, "[bam2fq_mainloop] Failed to read bam record.\n");
+ print_error("bam2fq", "Failed to read bam record");
goto err;
}
at_eof = res < 0;
samFile *fpse;
samFile *fpr[3];
samFile *fpi[3];
- samFile *hsamtools_stdout;
+ samFile *hstdout;
sam_hdr_t *h;
bool has12, use_oq, copy_tags, illumina_tag;
int flag_on, flag_off, flag_alloff;
state->filetype = opts->filetype;
state->def_qual = opts->def_qual;
state->index_sequence = NULL;
- state->hsamtools_stdout = NULL;
+ state->hstdout = NULL;
state->compression_level = opts->compression_level;
state->fp = sam_open(opts->fn_input, "r");
state->fpr[i] = state->fpr[j];
}
} else {
- if (!state->hsamtools_stdout) {
- if (!(state->hsamtools_stdout = sam_open_z("-", mode, state))) {
+ if (!state->hstdout) {
+ if (!(state->hstdout = sam_open_z("-", mode, state))) {
print_error_errno("bam2fq", "Cannot open STDOUT");
free(state);
return false;
}
- set_sam_opts(state->hsamtools_stdout, state, opts);
+ set_sam_opts(state->hstdout, state, opts);
+ autoflush_if_stdout(state->hstdout, "-");
}
- state->fpr[i] = state->hsamtools_stdout;
+ state->fpr[i] = state->hstdout;
}
}
int i, j;
for (i = 0; i < 3; ++i) {
- if (state->fpr[i] != state->hsamtools_stdout) {
+ if (state->fpr[i] != state->hstdout) {
for (j = 0; j < i; j++)
if (state->fpr[i] == state->fpr[j])
break;
}
}
}
- if (state->hsamtools_stdout) {
- if (sam_close(state->hsamtools_stdout) < 0) {
+ if (state->hstdout) {
+ release_autoflush(state->hstdout);
+ if (sam_close(state->hstdout) < 0) {
print_error_errno("bam2fq", "Error closing STDOUT");
valid = false;
}
int output_index(bam1_t *b1, bam1_t *b2, bam2fq_state_t *state,
bam2fq_opts_t* opts) {
- bam1_t *b[2] = {b1, b2};
+ bam1_t *b = b1 ? b1 : b2;
char *ifmt = opts->index_format;
if (!ifmt)
break;
case 'i':
- if (write_index_rec(state->fpi[inum], b[inum], state, opts,
+ if (write_index_rec(state->fpi[inum], b, state, opts,
bc, bc_end-bc, qt, qt_end-qt) < 0)
return -1;
bc = bc_end + (len==0);
while (true) {
int res = sam_read1(state->fp, state->h, b[n]);
if (res < -1) {
- fprintf(samtools_stderr, "[bam2fq_mainloop] Failed to read bam record.\n");
+ print_error("bam2fq", "Failed to read bam record");
goto err;
}
at_eof = res < 0;
fprintf(fp, " Tag to use with barcode sequences [BC]\n");
fprintf(fp, " --quality-tag TAG\n");
fprintf(fp, " Tag to use with barcode qualities [QT]\n");
+ fprintf(fp, " -N, --name2 Use 2nd field as read name (SRA format)\n");
fprintf(fp, " -r STRING Build up a complete @RG line\n");
fprintf(fp, " -R STRING Add a simple RG line of \"@RG\\tID:STRING\"\n");
fprintf(fp, " -T TAGLIST Parse tags in SAM format; list of '*' for all\n");
char *order;
int compress_level;
htsThreadPool p;
+ int name2;
} opts_t;
// Append a sequence and quality string from a BAM record to a BC:Z and
hts_set_thread_pool(fp_in[i], &opts->p);
ids[n++] = i;
+ if (opts->name2)
+ hts_set_opt(fp_in[i], FASTQ_OPT_NAME2, 1);
if (opts->casava)
hts_set_opt(fp_in[i], FASTQ_OPT_CASAVA, 1);
if (opts->barcode_seq) // for auto-CASAVA parsing
perror(opts->fn_out);
goto err;
}
+ autoflush_if_stdout(fp_out, opts->fn_out);
if (opts->p.pool)
hts_set_thread_pool(fp_out, &opts->p);
ks_free(&index_str);
ks_free(&read_str);
if (fp_out) {
+ release_autoflush(fp_out);
if (sam_close(fp_out) < 0) {
perror(opts->fn_out);
ret |= -1;
.rg_line = NULL,
.order = NULL,
.compress_level = -1,
+ .name2 = 0,
};
kstring_t rg = {0};
{"order", required_argument, NULL, 3},
{"barcode-tag", required_argument, NULL, 4},
{"quality-tag", required_argument, NULL, 5},
+ {"name2", no_argument, NULL, 'N'},
{ NULL, 0, NULL, 0 }
};
- while ((c = getopt_long(argc, argv, "1:2:s:0:bhiT:r:R:o:O:u@:", lopts, NULL)) >= 0) {
+ while ((c = getopt_long(argc, argv, "1:2:s:0:bhiT:r:R:o:O:u@:N", lopts, NULL)) >= 0) {
switch (c) {
case 'b': opts.idx_both = 1; break;
case '0': opts.fn[FQ_R0] = optarg; break;
opts.rg_line = rg.s;
break;
+ case 'N': opts.name2 = 1; break;
+
case 9: opts.no_pg = 1; break;
case 3: opts.order = optarg; break;
fprintf(fp, " Tag to use with barcode sequences [BC]\n");
fprintf(fp, " --quality-tag TAG\n");
fprintf(fp, " Tag to use with barcode qualities [QT]\n");
+ fprintf(fp, " -N, --name2 Use 2nd field as read name (SRA format)\n");
fprintf(fp, " -r STRING Build up a complete @RG line\n");
fprintf(fp, " -R STRING Add a simple RG line of \"@RG\\tID:STRING\"\n");
fprintf(fp, " -T TAGLIST Parse tags in SAM format; list of '*' for all\n");
char *order;
int compress_level;
htsThreadPool p;
+ int name2;
} opts_t;
// Append a sequence and quality string from a BAM record to a BC:Z and
hts_set_thread_pool(fp_in[i], &opts->p);
ids[n++] = i;
+ if (opts->name2)
+ hts_set_opt(fp_in[i], FASTQ_OPT_NAME2, 1);
if (opts->casava)
hts_set_opt(fp_in[i], FASTQ_OPT_CASAVA, 1);
if (opts->barcode_seq) // for auto-CASAVA parsing
perror(opts->fn_out);
goto err;
}
+ autoflush_if_stdout(fp_out, opts->fn_out);
if (opts->p.pool)
hts_set_thread_pool(fp_out, &opts->p);
ks_free(&index_str);
ks_free(&read_str);
if (fp_out) {
+ release_autoflush(fp_out);
if (sam_close(fp_out) < 0) {
perror(opts->fn_out);
ret |= -1;
.rg_line = NULL,
.order = NULL,
.compress_level = -1,
+ .name2 = 0,
};
kstring_t rg = {0};
{"order", required_argument, NULL, 3},
{"barcode-tag", required_argument, NULL, 4},
{"quality-tag", required_argument, NULL, 5},
+ {"name2", no_argument, NULL, 'N'},
{ NULL, 0, NULL, 0 }
};
- while ((c = getopt_long(argc, argv, "1:2:s:0:bhiT:r:R:o:O:u@:", lopts, NULL)) >= 0) {
+ while ((c = getopt_long(argc, argv, "1:2:s:0:bhiT:r:R:o:O:u@:N", lopts, NULL)) >= 0) {
switch (c) {
case 'b': opts.idx_both = 1; break;
case '0': opts.fn[FQ_R0] = optarg; break;
opts.rg_line = rg.s;
break;
+ case 'N': opts.name2 = 1; break;
+
case 9: opts.no_pg = 1; break;
case 3: opts.order = optarg; break;
/* bam_markdup.c -- Mark duplicates from a coord sorted file that has gone
through fixmates with the mate scoring option on.
- Copyright (C) 2017-2020 Genome Research Ltd.
+ Copyright (C) 2017-2021 Genome Research Ltd.
Author: Andrew Whitwham <aw7@sanger.ac.uk>
c->y = dy;
c->xpos = dxpos;
- if (ret) {
- c->opt = ret;
- }
-
return ret;
}
/* bam_markdup.c -- Mark duplicates from a coord sorted file that has gone
through fixmates with the mate scoring option on.
- Copyright (C) 2017-2020 Genome Research Ltd.
+ Copyright (C) 2017-2021 Genome Research Ltd.
Author: Andrew Whitwham <aw7@sanger.ac.uk>
c->y = dy;
c->xpos = dxpos;
- if (ret) {
- c->opt = ret;
- }
-
return ret;
}
#include <htslib/kstring.h>
#include <htslib/klist.h>
#include <htslib/khash_str2int.h>
+#include <htslib/cram.h>
#include "samtools.h"
#include "bedidx.h"
#include "sam_opts.h"
}
static inline int pileup_seq(FILE *fp, const bam_pileup1_t *p, hts_pos_t pos,
- hts_pos_t ref_len, const char *ref, kstring_t *ks,
- int rev_del)
+ hts_pos_t ref_len, const char *ref, kstring_t *ks,
+ int rev_del, int no_ins, int no_ins_mods,
+ int no_del, int no_ends)
{
+ no_ins_mods |= no_ins;
int j;
- if (p->is_head) {
+ hts_base_mod_state *m = p->cd.p;
+ if (!no_ends && p->is_head) {
putc('^', fp);
putc(p->b->core.qual > 93? 126 : p->b->core.qual + 33, fp);
}
else c = bam_is_rev(p->b)? tolower(c) : toupper(c);
}
putc(c, fp);
+ if (m) {
+ int nm;
+ hts_base_mod mod[256];
+ if ((nm = bam_mods_at_qpos(p->b, p->qpos, m, mod, 256)) > 0) {
+ putc('[', fp);
+ int j;
+ for (j = 0; j < nm && j < 256; j++) {
+ char qual[20];
+ if (mod[j].qual >= 0)
+ sprintf(qual, "%d", mod[j].qual);
+ else
+ *qual = 0;
+ if (mod[j].modified_base < 0)
+ // ChEBI
+ fprintf(fp, "%c(%d)%s", "+-"[mod[j].strand],
+ -mod[j].modified_base, qual);
+ else
+ fprintf(fp, "%c%c%s", "+-"[mod[j].strand],
+ mod[j].modified_base, qual);
+ }
+ putc(']', fp);
+ }
+ }
} else putc(p->is_refskip? (bam_is_rev(p->b)? '<' : '>') : ((bam_is_rev(p->b) && rev_del) ? '#' : '*'), fp);
int del_len = -p->indel;
if (p->indel > 0) {
- int len = bam_plp_insertion(p, ks, &del_len);
+ int len = bam_plp_insertion_mod(p, m && !no_ins_mods ? m : NULL,
+ ks, &del_len);
if (len < 0) {
print_error("mpileup", "bam_plp_insertion() failed");
return -1;
}
- putc('+', fp); printw(len, fp);
- if (bam_is_rev(p->b)) {
- char pad = rev_del ? '#' : '*';
- for (j = 0; j < len; j++)
- putc(ks->s[j] != '*' ? tolower(ks->s[j]) : pad, fp);
- } else {
- for (j = 0; j < len; j++)
- putc(toupper(ks->s[j]), fp);
+ if (no_ins < 2) {
+ putc('+', fp);
+ printw(len, fp);
+ }
+ if (!no_ins) {
+ if (bam_is_rev(p->b)) {
+ char pad = rev_del ? '#' : '*';
+ int in_mod = 0;
+ for (j = 0; j < ks->l; j++) {
+ if (ks->s[j] == '[') in_mod = 1;
+ else if (ks->s[j] == ']') in_mod = 0;
+ putc(ks->s[j] != '*'
+ ? (in_mod ? ks->s[j] : tolower(ks->s[j]))
+ : pad, fp);
+ }
+ } else {
+ int in_mod = 0;
+ for (j = 0; j < ks->l; j++) {
+ if (ks->s[j] == '[') in_mod = 1;
+ if (ks->s[j] == ']') in_mod = 0;
+ putc(in_mod ? ks->s[j] : toupper(ks->s[j]), fp);
+ }
+ }
}
}
if (del_len > 0) {
- printw(-del_len, fp);
- for (j = 1; j <= del_len; ++j) {
- int c = (ref && (int)pos+j < ref_len)? ref[pos+j] : 'N';
- putc(bam_is_rev(p->b)? tolower(c) : toupper(c), fp);
+ if (no_del < 2)
+ printw(-del_len, fp);
+ if (!no_del) {
+ for (j = 1; j <= del_len; ++j) {
+ int c = (ref && (int)pos+j < ref_len)? ref[pos+j] : 'N';
+ putc(bam_is_rev(p->b)? tolower(c) : toupper(c), fp);
+ }
}
}
- if (p->is_tail) putc('$', fp);
+ if (!no_ends && p->is_tail) putc('$', fp);
return 0;
}
#define MPLP_PRINT_TLEN (1<<21)
#define MPLP_PRINT_SEQ (1<<22)
#define MPLP_PRINT_QUAL (1<<23)
+#define MPLP_PRINT_MODS (1<<24)
+#define MPLP_PRINT_QPOS5 (1<<25)
+
+#define MPLP_PRINT_LAST (1<<26) // terminator for loop
#define MPLP_MAX_DEPTH 8000
#define MPLP_MAX_INDEL_DEPTH 250
void *bed, *rghash, *auxlist;
int argc;
char **argv;
- char sep, empty;
+ char sep, empty, no_ins, no_ins_mods, no_del, no_ends;
sam_global_args ga;
} mplp_conf_t;
return 1;
}
+// Per-read callbacks that create and destroy the base modification state
+// carried in each pileup entry's bam_pileup_cd.  They are called as each
+// new read is added to or removed from the pileups.
+
+// Constructor: allocate a base modification state for read b, fill it in
+// with bam_parse_basemod() and store it in cd->p.  The data argument is
+// unused.
+//
+// Returns the bam_parse_basemod() result, or -1 if the state could not be
+// allocated.  In the latter case cd->p is set to NULL so that neither the
+// destructor nor any reader of cd->p sees an uninitialised pointer.
+static
+int pileup_cd_create(void *data, const bam1_t *b, bam_pileup_cd *cd) {
+    hts_base_mod_state *m = hts_base_mod_state_alloc();
+    if (!m) {
+        // Never hand a NULL state to bam_parse_basemod(); report the
+        // allocation failure to the pileup engine instead.
+        cd->p = NULL;
+        return -1;
+    }
+
+    int ret = bam_parse_basemod(b, m);
+    // Stored even when parsing fails, so the destructor can still free it.
+    cd->p = m;
+    return ret;
+}
+
+// Destructor: release the state stored in cd->p by pileup_cd_create().
+// The data and b arguments are unused.  Always returns 0.
+static
+int pileup_cd_destroy(void *data, const bam1_t *b, bam_pileup_cd *cd) {
+    hts_base_mod_state_free(cd->p);
+    return 0;
+}
+
static void
print_empty_pileup(FILE *fp, const mplp_conf_t *conf, const char *tname,
hts_pos_t pos, int n, const char *ref, hts_pos_t ref_len)
for (i = 0; i < n; ++i) {
fputs("\t0\t*\t*", fp);
int flag_value = MPLP_PRINT_MAPQ_CHAR;
- while(flag_value < MPLP_PRINT_QUAL + 1) {
- if (conf->flag & flag_value)
+ while(flag_value < MPLP_PRINT_LAST) {
+ if (flag_value != MPLP_PRINT_MODS && (conf->flag & flag_value))
fputs("\t*", fp);
flag_value <<= 1;
}
}
// read the header of each file in the list and initialize data
+ refs_t *refs = NULL;
for (i = 0; i < n; ++i) {
sam_hdr_t *h_tmp;
data[i] = calloc(1, sizeof(mplp_aux_t));
fprintf(stderr, "Failed to set CRAM_OPT_DECODE_MD value\n");
exit(EXIT_FAILURE);
}
- if (conf->fai_fname && hts_set_fai_filename(data[i]->fp, conf->fai_fname) != 0) {
- fprintf(stderr, "[%s] failed to process %s: %s\n",
- __func__, conf->fai_fname, strerror(errno));
- exit(EXIT_FAILURE);
+
+ if (!refs && conf->fai_fname) {
+ if (hts_set_fai_filename(data[i]->fp, conf->fai_fname) != 0) {
+ fprintf(stderr, "[%s] failed to process %s: %s\n",
+ __func__, conf->fai_fname, strerror(errno));
+ exit(EXIT_FAILURE);
+ }
+ refs = cram_get_refs(data[i]->fp);
+ } else if (conf->fai_fname) {
+ if (hts_set_opt(data[i]->fp, CRAM_OPT_SHARED_REF, refs) != 0) {
+ fprintf(stderr, "[%s] failed to process %s: %s\n",
+ __func__, conf->fai_fname, strerror(errno));
+ exit(EXIT_FAILURE);
+ }
}
+
data[i]->conf = conf;
data[i]->ref = &mp_ref;
h_tmp = sam_hdr_read(data[i]->fp);
fprintf(stderr, "[%s] failed to write to %s: %s\n", __func__, conf->output_fname? conf->output_fname : "standard output", strerror(errno));
exit(EXIT_FAILURE);
}
+ autoflush_if_stdout(bcf_fp, conf->output_fname);
// BCF header creation
bcf_hdr = bcf_hdr_init("w");
// init pileup
iter = bam_mplp_init(n, mplp_func, (void**)data);
+ if (conf->flag & MPLP_PRINT_MODS) {
+ bam_mplp_constructor(iter, pileup_cd_create);
+ bam_mplp_destructor(iter, pileup_cd_destroy);
+ }
if ( conf->flag & MPLP_SMART_OVERLAPS ) bam_mplp_init_overlaps(iter);
if ( !conf->max_depth ) {
max_depth = INT_MAX;
if (n_plp[i] == 0) {
fputs("*\t*", pileup_fp);
int flag_value = MPLP_PRINT_MAPQ_CHAR;
- while(flag_value < MPLP_PRINT_QUAL + 1) {
- if (conf->flag & flag_value)
+ while(flag_value < MPLP_PRINT_LAST) {
+ if (flag_value != MPLP_PRINT_MODS
+ && (conf->flag & flag_value))
fputs("\t*", pileup_fp);
flag_value <<= 1;
}
: 0;
if (c >= conf->min_baseQ) {
n++;
- if (pileup_seq(pileup_fp, plp[i] + j, pos, ref_len, ref, &ks, conf->rev_del) < 0) {
+ if (pileup_seq(pileup_fp, plp[i] + j, pos, ref_len,
+ ref, &ks, conf->rev_del,
+ conf->no_ins, conf->no_ins_mods,
+ conf->no_del, conf->no_ends) < 0) {
ret = 1;
goto fail;
}
/* Print selected columns */
int flag_value = MPLP_PRINT_MAPQ_CHAR;
- while(flag_value < MPLP_PRINT_QUAL + 1) {
- if (conf->flag & flag_value) {
+ while(flag_value < MPLP_PRINT_LAST) {
+ if (flag_value != MPLP_PRINT_MODS
+ && (conf->flag & flag_value)) {
n = 0;
putc('\t', pileup_fp);
for (j = 0; j < n_plp[i]; ++j) {
putc(c, pileup_fp);
break;
case MPLP_PRINT_QPOS:
+ // query position in current orientation
fprintf(pileup_fp, "%d", p->qpos + 1);
break;
+ case MPLP_PRINT_QPOS5: {
+ // query position in 5' to 3' orientation
+ int pos5 = bam_is_rev(p->b)
+ ? p->b->core.l_qseq-p->qpos + p->is_del
+ : p->qpos + 1;
+ fprintf(pileup_fp, "%d", pos5);
+ break;
+ }
case MPLP_PRINT_QNAME:
fputs(bam_get_qname(p->b), pileup_fp);
break;
bcf_destroy1(bcf_rec);
if (bcf_fp)
{
+ release_autoflush(bcf_fp);
hts_close(bcf_fp);
bcf_hdr_destroy(bcf_hdr);
bcf_call_destroy(bca);
" -X, --customized-index use customized index files\n" // -X flag for index filename
"\n"
"Output options:\n"
-" -o, --output FILE write output to FILE [standard output]\n"
-" -O, --output-BP output base positions on reads\n"
-" -s, --output-MQ output mapping quality\n"
-" --output-QNAME output read names\n"
-" --output-extra STR output extra read fields and read tag values\n"
-" --output-sep CHAR set the separator character for tag lists [,]\n"
-" --output-empty CHAR set the no value character for tag lists [*]\n"
-" --reverse-del use '#' character for deletions on the reverse strand\n"
-" -a output all positions (including zero depth)\n"
-" -a -a (or -aa) output absolutely all positions, including unused ref. sequences\n"
+" -o, --output FILE write output to FILE [standard output]\n"
+" -O, --output-BP output base positions on reads, current orientation\n"
+" --output-BP-5 output base positions on reads, 5' to 3' orientation\n"
+" -M, --output-mods output base modifications\n"
+" -s, --output-MQ output mapping quality\n"
+" --output-QNAME output read names\n"
+" --output-extra STR output extra read fields and read tag values\n"
+" --output-sep CHAR set the separator character for tag lists [,]\n"
+" --output-empty CHAR set the no value character for tag lists [*]\n"
+" --no-output-ins skip insertion sequence after +NUM\n"
+" Use twice for complete insertion removal\n"
+" --no-output-ins-mods don't display base modifications within insertions\n"
+" --no-output-del skip deletion sequence after -NUM\n"
+" Use twice for complete deletion removal\n"
+" --no-output-ends remove ^MQUAL and $ markup in sequence column\n"
+" --reverse-del use '#' character for deletions on the reverse strand\n"
+" -a output all positions (including zero depth)\n"
+" -a -a (or -aa) output absolutely all positions, including unused ref. sequences\n"
"\n"
"Generic options:\n");
sam_global_opt_help(fp, "-.--.--.");
{"bcf", no_argument, NULL, 'g'},
{"VCF", no_argument, NULL, 'v'},
{"vcf", no_argument, NULL, 'v'},
+ {"output-mods", no_argument, NULL, 'M'},
{"output-BP", no_argument, NULL, 'O'},
{"output-bp", no_argument, NULL, 'O'},
+ {"output-BP-5", no_argument, NULL, 14},
+ {"output-bp-5", no_argument, NULL, 14},
{"output-MQ", no_argument, NULL, 's'},
{"output-mq", no_argument, NULL, 's'},
{"output-tags", required_argument, NULL, 't'},
{"output-extra", required_argument, NULL, 7},
{"output-sep", required_argument, NULL, 8},
{"output-empty", required_argument, NULL, 9},
+ {"no-output-ins", no_argument, NULL, 10},
+ {"no-output-ins-mods", no_argument, NULL, 11},
+ {"no-output-del", no_argument, NULL, 12},
+ {"no-output-ends", no_argument, NULL, 13},
{NULL, 0, NULL, 0}
};
- while ((c = getopt_long(argc, argv, "Agf:r:l:q:Q:uRC:BDSd:L:b:P:po:e:h:Im:F:EG:6OsVvxXt:a",lopts,NULL)) >= 0) {
+ while ((c = getopt_long(argc, argv, "Agf:r:l:q:Q:uRC:BDSd:L:b:P:po:e:h:Im:F:EG:6OsVvxXt:aM",lopts,NULL)) >= 0) {
switch (c) {
case 'x': mplp.flag &= ~MPLP_SMART_OVERLAPS; break;
case 1 :
break;
case 8: mplp.sep = optarg[0]; break;
case 9: mplp.empty = optarg[0]; break;
+ case 10: mplp.no_ins++; break;
+ case 11: mplp.no_ins_mods = 1; break;
+ case 12: mplp.no_del++; break;
+ case 13: mplp.no_ends = 1; break;
case 'f':
mplp.fai = fai_load(optarg);
if (mplp.fai == NULL) return 1;
case '6': mplp.flag |= MPLP_ILLUMINA13; break;
case 'R': mplp.flag |= MPLP_IGNORE_RG; break;
case 's': mplp.flag |= MPLP_PRINT_MAPQ_CHAR; break;
- case 'O': mplp.flag |= MPLP_PRINT_QPOS; break;
+ case 'O':
+ if (!(mplp.flag & MPLP_PRINT_QPOS5))
+ mplp.flag |= MPLP_PRINT_QPOS;
+ break;
+ case 14:
+ mplp.flag |= MPLP_PRINT_QPOS5;
+ mplp.flag &= ~MPLP_PRINT_QPOS;
+ break;
+ case 'M': mplp.flag |= MPLP_PRINT_MODS; break;
case 'C': mplp.capQ_thres = atoi(optarg); break;
case 'q': mplp.min_mq = atoi(optarg); break;
case 'Q': mplp.min_baseQ = atoi(optarg); break;
#include <htslib/kstring.h>
#include <htslib/klist.h>
#include <htslib/khash_str2int.h>
+#include <htslib/cram.h>
#include "samtools.h"
#include "bedidx.h"
#include "sam_opts.h"
}
static inline int pileup_seq(FILE *fp, const bam_pileup1_t *p, hts_pos_t pos,
- hts_pos_t ref_len, const char *ref, kstring_t *ks,
- int rev_del)
+ hts_pos_t ref_len, const char *ref, kstring_t *ks,
+ int rev_del, int no_ins, int no_ins_mods,
+ int no_del, int no_ends)
{
+ no_ins_mods |= no_ins;
int j;
- if (p->is_head) {
+ hts_base_mod_state *m = p->cd.p;
+ if (!no_ends && p->is_head) {
putc('^', fp);
putc(p->b->core.qual > 93? 126 : p->b->core.qual + 33, fp);
}
else c = bam_is_rev(p->b)? tolower(c) : toupper(c);
}
putc(c, fp);
+ if (m) {
+ int nm;
+ hts_base_mod mod[256];
+ if ((nm = bam_mods_at_qpos(p->b, p->qpos, m, mod, 256)) > 0) {
+ putc('[', fp);
+ int j;
+ for (j = 0; j < nm && j < 256; j++) {
+ char qual[20];
+ if (mod[j].qual >= 0)
+ sprintf(qual, "%d", mod[j].qual);
+ else
+ *qual = 0;
+ if (mod[j].modified_base < 0)
+ // ChEBI
+ fprintf(fp, "%c(%d)%s", "+-"[mod[j].strand],
+ -mod[j].modified_base, qual);
+ else
+ fprintf(fp, "%c%c%s", "+-"[mod[j].strand],
+ mod[j].modified_base, qual);
+ }
+ putc(']', fp);
+ }
+ }
} else putc(p->is_refskip? (bam_is_rev(p->b)? '<' : '>') : ((bam_is_rev(p->b) && rev_del) ? '#' : '*'), fp);
int del_len = -p->indel;
if (p->indel > 0) {
- int len = bam_plp_insertion(p, ks, &del_len);
+ int len = bam_plp_insertion_mod(p, m && !no_ins_mods ? m : NULL,
+ ks, &del_len);
if (len < 0) {
print_error("mpileup", "bam_plp_insertion() failed");
return -1;
}
- putc('+', fp); printw(len, fp);
- if (bam_is_rev(p->b)) {
- char pad = rev_del ? '#' : '*';
- for (j = 0; j < len; j++)
- putc(ks->s[j] != '*' ? tolower(ks->s[j]) : pad, fp);
- } else {
- for (j = 0; j < len; j++)
- putc(toupper(ks->s[j]), fp);
+ if (no_ins < 2) {
+ putc('+', fp);
+ printw(len, fp);
+ }
+ if (!no_ins) {
+ if (bam_is_rev(p->b)) {
+ char pad = rev_del ? '#' : '*';
+ int in_mod = 0;
+ for (j = 0; j < ks->l; j++) {
+ if (ks->s[j] == '[') in_mod = 1;
+ else if (ks->s[j] == ']') in_mod = 0;
+ putc(ks->s[j] != '*'
+ ? (in_mod ? ks->s[j] : tolower(ks->s[j]))
+ : pad, fp);
+ }
+ } else {
+ int in_mod = 0;
+ for (j = 0; j < ks->l; j++) {
+ if (ks->s[j] == '[') in_mod = 1;
+ if (ks->s[j] == ']') in_mod = 0;
+ putc(in_mod ? ks->s[j] : toupper(ks->s[j]), fp);
+ }
+ }
}
}
if (del_len > 0) {
- printw(-del_len, fp);
- for (j = 1; j <= del_len; ++j) {
- int c = (ref && (int)pos+j < ref_len)? ref[pos+j] : 'N';
- putc(bam_is_rev(p->b)? tolower(c) : toupper(c), fp);
+ if (no_del < 2)
+ printw(-del_len, fp);
+ if (!no_del) {
+ for (j = 1; j <= del_len; ++j) {
+ int c = (ref && (int)pos+j < ref_len)? ref[pos+j] : 'N';
+ putc(bam_is_rev(p->b)? tolower(c) : toupper(c), fp);
+ }
}
}
- if (p->is_tail) putc('$', fp);
+ if (!no_ends && p->is_tail) putc('$', fp);
return 0;
}
#define MPLP_PRINT_TLEN (1<<21)
#define MPLP_PRINT_SEQ (1<<22)
#define MPLP_PRINT_QUAL (1<<23)
+#define MPLP_PRINT_MODS (1<<24)
+#define MPLP_PRINT_QPOS5 (1<<25)
+
+#define MPLP_PRINT_LAST (1<<26) // terminator for loop
#define MPLP_MAX_DEPTH 8000
#define MPLP_MAX_INDEL_DEPTH 250
void *bed, *rghash, *auxlist;
int argc;
char **argv;
- char sep, empty;
+ char sep, empty, no_ins, no_ins_mods, no_del, no_ends;
sam_global_args ga;
} mplp_conf_t;
return 1;
}
+// Per-read callbacks that create and destroy the base modification state
+// carried in each pileup entry's bam_pileup_cd.  They are called as each
+// new read is added to or removed from the pileups.
+
+// Constructor: allocate a base modification state for read b, fill it in
+// with bam_parse_basemod() and store it in cd->p.  The data argument is
+// unused.
+//
+// Returns the bam_parse_basemod() result, or -1 if the state could not be
+// allocated.  In the latter case cd->p is set to NULL so that neither the
+// destructor nor any reader of cd->p sees an uninitialised pointer.
+static
+int pileup_cd_create(void *data, const bam1_t *b, bam_pileup_cd *cd) {
+    hts_base_mod_state *m = hts_base_mod_state_alloc();
+    if (!m) {
+        // Never hand a NULL state to bam_parse_basemod(); report the
+        // allocation failure to the pileup engine instead.
+        cd->p = NULL;
+        return -1;
+    }
+
+    int ret = bam_parse_basemod(b, m);
+    // Stored even when parsing fails, so the destructor can still free it.
+    cd->p = m;
+    return ret;
+}
+
+// Destructor: release the state stored in cd->p by pileup_cd_create().
+// The data and b arguments are unused.  Always returns 0.
+static
+int pileup_cd_destroy(void *data, const bam1_t *b, bam_pileup_cd *cd) {
+    hts_base_mod_state_free(cd->p);
+    return 0;
+}
+
static void
print_empty_pileup(FILE *fp, const mplp_conf_t *conf, const char *tname,
hts_pos_t pos, int n, const char *ref, hts_pos_t ref_len)
for (i = 0; i < n; ++i) {
fputs("\t0\t*\t*", fp);
int flag_value = MPLP_PRINT_MAPQ_CHAR;
- while(flag_value < MPLP_PRINT_QUAL + 1) {
- if (conf->flag & flag_value)
+ while(flag_value < MPLP_PRINT_LAST) {
+ if (flag_value != MPLP_PRINT_MODS && (conf->flag & flag_value))
fputs("\t*", fp);
flag_value <<= 1;
}
}
// read the header of each file in the list and initialize data
+ refs_t *refs = NULL;
for (i = 0; i < n; ++i) {
sam_hdr_t *h_tmp;
data[i] = calloc(1, sizeof(mplp_aux_t));
fprintf(samtools_stderr, "Failed to set CRAM_OPT_DECODE_MD value\n");
samtools_exit(EXIT_FAILURE);
}
- if (conf->fai_fname && hts_set_fai_filename(data[i]->fp, conf->fai_fname) != 0) {
- fprintf(samtools_stderr, "[%s] failed to process %s: %s\n",
- __func__, conf->fai_fname, strerror(errno));
- samtools_exit(EXIT_FAILURE);
+
+ if (!refs && conf->fai_fname) {
+ if (hts_set_fai_filename(data[i]->fp, conf->fai_fname) != 0) {
+ fprintf(samtools_stderr, "[%s] failed to process %s: %s\n",
+ __func__, conf->fai_fname, strerror(errno));
+ samtools_exit(EXIT_FAILURE);
+ }
+ refs = cram_get_refs(data[i]->fp);
+ } else if (conf->fai_fname) {
+ if (hts_set_opt(data[i]->fp, CRAM_OPT_SHARED_REF, refs) != 0) {
+ fprintf(samtools_stderr, "[%s] failed to process %s: %s\n",
+ __func__, conf->fai_fname, strerror(errno));
+ samtools_exit(EXIT_FAILURE);
+ }
}
+
data[i]->conf = conf;
data[i]->ref = &mp_ref;
h_tmp = sam_hdr_read(data[i]->fp);
fprintf(samtools_stderr, "[%s] failed to write to %s: %s\n", __func__, conf->output_fname? conf->output_fname : "standard output", strerror(errno));
samtools_exit(EXIT_FAILURE);
}
+ autoflush_if_stdout(bcf_fp, conf->output_fname);
// BCF header creation
bcf_hdr = bcf_hdr_init("w");
// init pileup
iter = bam_mplp_init(n, mplp_func, (void**)data);
+ if (conf->flag & MPLP_PRINT_MODS) {
+ bam_mplp_constructor(iter, pileup_cd_create);
+ bam_mplp_destructor(iter, pileup_cd_destroy);
+ }
if ( conf->flag & MPLP_SMART_OVERLAPS ) bam_mplp_init_overlaps(iter);
if ( !conf->max_depth ) {
max_depth = INT_MAX;
if (n_plp[i] == 0) {
fputs("*\t*", pileup_fp);
int flag_value = MPLP_PRINT_MAPQ_CHAR;
- while(flag_value < MPLP_PRINT_QUAL + 1) {
- if (conf->flag & flag_value)
+ while(flag_value < MPLP_PRINT_LAST) {
+ if (flag_value != MPLP_PRINT_MODS
+ && (conf->flag & flag_value))
fputs("\t*", pileup_fp);
flag_value <<= 1;
}
: 0;
if (c >= conf->min_baseQ) {
n++;
- if (pileup_seq(pileup_fp, plp[i] + j, pos, ref_len, ref, &ks, conf->rev_del) < 0) {
+ if (pileup_seq(pileup_fp, plp[i] + j, pos, ref_len,
+ ref, &ks, conf->rev_del,
+ conf->no_ins, conf->no_ins_mods,
+ conf->no_del, conf->no_ends) < 0) {
ret = 1;
goto fail;
}
/* Print selected columns */
int flag_value = MPLP_PRINT_MAPQ_CHAR;
- while(flag_value < MPLP_PRINT_QUAL + 1) {
- if (conf->flag & flag_value) {
+ while(flag_value < MPLP_PRINT_LAST) {
+ if (flag_value != MPLP_PRINT_MODS
+ && (conf->flag & flag_value)) {
n = 0;
putc('\t', pileup_fp);
for (j = 0; j < n_plp[i]; ++j) {
putc(c, pileup_fp);
break;
case MPLP_PRINT_QPOS:
+ // query position in current orientation
fprintf(pileup_fp, "%d", p->qpos + 1);
break;
+ case MPLP_PRINT_QPOS5: {
+ // query position in 5' to 3' orientation
+ int pos5 = bam_is_rev(p->b)
+ ? p->b->core.l_qseq-p->qpos + p->is_del
+ : p->qpos + 1;
+ fprintf(pileup_fp, "%d", pos5);
+ break;
+ }
case MPLP_PRINT_QNAME:
fputs(bam_get_qname(p->b), pileup_fp);
break;
bcf_destroy1(bcf_rec);
if (bcf_fp)
{
+ release_autoflush(bcf_fp);
hts_close(bcf_fp);
bcf_hdr_destroy(bcf_hdr);
bcf_call_destroy(bca);
" -X, --customized-index use customized index files\n" // -X flag for index filename
"\n"
"Output options:\n"
-" -o, --output FILE write output to FILE [standard output]\n"
-" -O, --output-BP output base positions on reads\n"
-" -s, --output-MQ output mapping quality\n"
-" --output-QNAME output read names\n"
-" --output-extra STR output extra read fields and read tag values\n"
-" --output-sep CHAR set the separator character for tag lists [,]\n"
-" --output-empty CHAR set the no value character for tag lists [*]\n"
-" --reverse-del use '#' character for deletions on the reverse strand\n"
-" -a output all positions (including zero depth)\n"
-" -a -a (or -aa) output absolutely all positions, including unused ref. sequences\n"
+" -o, --output FILE write output to FILE [standard output]\n"
+" -O, --output-BP output base positions on reads, current orientation\n"
+" --output-BP-5 output base positions on reads, 5' to 3' orientation\n"
+" -M, --output-mods output base modifications\n"
+" -s, --output-MQ output mapping quality\n"
+" --output-QNAME output read names\n"
+" --output-extra STR output extra read fields and read tag values\n"
+" --output-sep CHAR set the separator character for tag lists [,]\n"
+" --output-empty CHAR set the no value character for tag lists [*]\n"
+" --no-output-ins skip insertion sequence after +NUM\n"
+" Use twice for complete insertion removal\n"
+" --no-output-ins-mods don't display base modifications within insertions\n"
+" --no-output-del skip deletion sequence after -NUM\n"
+" Use twice for complete deletion removal\n"
+" --no-output-ends remove ^MQUAL and $ markup in sequence column\n"
+" --reverse-del use '#' character for deletions on the reverse strand\n"
+" -a output all positions (including zero depth)\n"
+" -a -a (or -aa) output absolutely all positions, including unused ref. sequences\n"
"\n"
"Generic options:\n");
sam_global_opt_help(fp, "-.--.--.");
{"bcf", no_argument, NULL, 'g'},
{"VCF", no_argument, NULL, 'v'},
{"vcf", no_argument, NULL, 'v'},
+ {"output-mods", no_argument, NULL, 'M'},
{"output-BP", no_argument, NULL, 'O'},
{"output-bp", no_argument, NULL, 'O'},
+ {"output-BP-5", no_argument, NULL, 14},
+ {"output-bp-5", no_argument, NULL, 14},
{"output-MQ", no_argument, NULL, 's'},
{"output-mq", no_argument, NULL, 's'},
{"output-tags", required_argument, NULL, 't'},
{"output-extra", required_argument, NULL, 7},
{"output-sep", required_argument, NULL, 8},
{"output-empty", required_argument, NULL, 9},
+ {"no-output-ins", no_argument, NULL, 10},
+ {"no-output-ins-mods", no_argument, NULL, 11},
+ {"no-output-del", no_argument, NULL, 12},
+ {"no-output-ends", no_argument, NULL, 13},
{NULL, 0, NULL, 0}
};
- while ((c = getopt_long(argc, argv, "Agf:r:l:q:Q:uRC:BDSd:L:b:P:po:e:h:Im:F:EG:6OsVvxXt:a",lopts,NULL)) >= 0) {
+ while ((c = getopt_long(argc, argv, "Agf:r:l:q:Q:uRC:BDSd:L:b:P:po:e:h:Im:F:EG:6OsVvxXt:aM",lopts,NULL)) >= 0) {
switch (c) {
case 'x': mplp.flag &= ~MPLP_SMART_OVERLAPS; break;
case 1 :
break;
case 8: mplp.sep = optarg[0]; break;
case 9: mplp.empty = optarg[0]; break;
+ case 10: mplp.no_ins++; break;
+ case 11: mplp.no_ins_mods = 1; break;
+ case 12: mplp.no_del++; break;
+ case 13: mplp.no_ends = 1; break;
case 'f':
mplp.fai = fai_load(optarg);
if (mplp.fai == NULL) return 1;
case '6': mplp.flag |= MPLP_ILLUMINA13; break;
case 'R': mplp.flag |= MPLP_IGNORE_RG; break;
case 's': mplp.flag |= MPLP_PRINT_MAPQ_CHAR; break;
- case 'O': mplp.flag |= MPLP_PRINT_QPOS; break;
+ case 'O':
+ if (!(mplp.flag & MPLP_PRINT_QPOS5))
+ mplp.flag |= MPLP_PRINT_QPOS;
+ break;
+ case 14:
+ mplp.flag |= MPLP_PRINT_QPOS5;
+ mplp.flag &= ~MPLP_PRINT_QPOS;
+ break;
+ case 'M': mplp.flag |= MPLP_PRINT_MODS; break;
case 'C': mplp.capQ_thres = atoi(optarg); break;
case 'q': mplp.min_mq = atoi(optarg); break;
case 'Q': mplp.min_baseQ = atoi(optarg); break;
cram_block_append(b, (void *)sam_hdr_str(cram_h), header_len);
cram_block_update_size(b);
- cram_compress_block(fd, b, NULL, -1, -1);
+ cram_compress_block(fd, b, NULL, -1, 9);
if (hseek(cram_fd_get_fp(fd), 26, SEEK_SET) != 26)
goto err;
cram_block_append(b, (void *)sam_hdr_str(cram_h), header_len);
cram_block_update_size(b);
- cram_compress_block(fd, b, NULL, -1, -1);
+ cram_compress_block(fd, b, NULL, -1, 9);
if (hseek(cram_fd_get_fp(fd), 26, SEEK_SET) != 26)
goto err;
--- /dev/null
+/* bam_samples -- print samples in a set of BAM files
+
+ Copyright (C) 2021 Pierre Lindenbaum
+ Institut du Thorax. u1087 Nantes. France.
+ @yokofakun
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE. */
+
+#include <config.h>
+#include <htslib/hts.h>
+#include <htslib/sam.h>
+#include <htslib/faidx.h>
+#include <htslib/khash.h>
+#include <htslib/kseq.h>
+#include <samtools.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <getopt.h>
+
+KHASH_MAP_INIT_STR(sm, int)
+
+/** Node of a singly linked ("chained") list pairing a FASTA filename with
+ its loaded faidx index; the index is compared with the @SQ lines in the
+ SAM header
+ */
+typedef struct FaidxPath {
+ /** path to reference (heap copy owned by this node) */
+ char* filename;
+ /** fasta index */
+ faidx_t* faidx;
+ /** next node in the list, or NULL at the end of the list */
+ struct FaidxPath* next;
+} FaidxPath;
+
+/** program parameters shared by the functions of the "samples" command */
+typedef struct Params {
+ /** output stream */
+ FILE* out;
+ /** two-letter tag looked up in each @RG line, NUL-terminated. default is "SM" */
+ char tag[3];
+ /** first faidx/path in chained list, NULL when no reference was loaded */
+ FaidxPath* faidx;
+ /** non-zero to show whether the bam is indexed */
+ int test_index;
+} Params;
+
+/** Print the command-line help of "samtools samples".
+
+    @param write_to  stream receiving the help text
+ */
+static void usage_samples(FILE *write_to) {
+    /* In the find(1) examples the escaped parentheses must be separate
+       arguments, hence the space after "\(". */
+    fprintf(write_to,
+        "Usage: samtools samples [options] <input> [...]\n"
+        " samtools samples [options] -X f1.bam f2.bam f1.bam.bai f2.bai \n"
+        " find dir1 dir2 -type f \\( -name \"*.bam\" -o -name \"*.cram\" \\) | samtools samples [options]\n"
+        " find dir1 dir2 -type f \\( -name \"*.bam\" -o -name \"*.bai\" \\) | sort | paste - - | samtools samples -X [options]\n"
+        "\n"
+        "Options:\n"
+        " -? print help and exit\n"
+        " -h add the columns header before printing the results\n"
+        " -i test if the file is indexed.\n"
+        " -T <tag> provide the sample tag name from the @RG line [SM].\n"
+        " -o <file> output file [stdout].\n"
+        " -f <file.fa> load an indexed fasta file in the collection of references. Can be used multiple times.\n"
+        " -F <file.txt> read a file containing the paths to indexed fasta files. One path per line.\n"
+        " -X use a custom index file.\n"
+        "\n"
+        " Using -f or -F will add a column containing the path to the reference or \".\" if the reference was not found.\n"
+        "\n"
+        );
+}
+
+
+/** Load the fasta index of one FASTA file and push it onto the front of
+    the params->faidx list.
+
+    @param params    program parameters owning the list of references
+    @param filename  path to the FASTA file; a private copy is stored
+    @return EXIT_SUCCESS, or EXIT_FAILURE (after reporting the error) when
+            an allocation fails or the index cannot be loaded
+ */
+static int load_dictionary(struct Params* params, const char* filename) {
+    FaidxPath* node = (FaidxPath*)malloc(sizeof(FaidxPath));
+
+    if (node != NULL) {
+        node->filename = strdup(filename);
+        if (node->filename != NULL) {
+            node->faidx = fai_load(filename);
+            if (node->faidx != NULL) {
+                /* success: the new node becomes the head of the list */
+                node->next = params->faidx;
+                params->faidx = node;
+                return EXIT_SUCCESS;
+            }
+            free(node->filename);
+            free(node);
+            print_error_errno("samples", "Cannot load index from \"%s\"", filename);
+            return EXIT_FAILURE;
+        }
+        free(node);
+    }
+    /* reached when either malloc() or strdup() failed */
+    print_error_errno("samples", "Out of memory");
+    return EXIT_FAILURE;
+}
+
+/** Read a text file listing one indexed FASTA path per line and load each
+    of them into params->faidx with load_dictionary().
+
+    Empty lines are skipped. Loading stops at the first reference that
+    cannot be loaded.
+
+    @param params    program parameters receiving the references
+    @param filename  path to the list file
+    @return EXIT_SUCCESS when the whole file was read and every listed
+            reference was loaded, EXIT_FAILURE (after reporting the error)
+            otherwise
+ */
+static int load_dictionaries(Params* params, const char* filename) {
+    int ret;
+    int status = EXIT_SUCCESS;
+    kstring_t ks = KS_INITIALIZE;
+    htsFile* in = hts_open(filename, "r");
+
+    if (in == NULL) {
+        print_error_errno("samples", "Cannot open \"%s\"", filename);
+        return EXIT_FAILURE;
+    }
+
+    while ((ret = hts_getline(in, KS_SEP_LINE, &ks)) >= 0) {
+        /* tolerate blank lines instead of trying to load "" as a reference */
+        if (ks.l == 0) continue;
+        if (load_dictionary(params, ks_str(&ks)) != EXIT_SUCCESS) {
+            status = EXIT_FAILURE;
+            break;
+        }
+    }
+    /* hts_getline() returns -1 at end of file and a value below -1 on a
+       read error; the latter must not be mistaken for a complete read */
+    if (ret < -1) {
+        print_error("samples", "Error reading \"%s\"", filename);
+        status = EXIT_FAILURE;
+    }
+
+    ks_free(&ks);
+    hts_close(in);
+    return status;
+}
+
+/** Write one tab-separated output line for a (sample, file) pair.
+
+    Columns: sample, file name, then 'Y'/'N' for the index when
+    params->test_index is set, then - when references were loaded - the
+    filename of the first reference whose sequence names and lengths match
+    the header targets in the same order, or "." when none matches.
+
+    @param params     program parameters (output stream, reference list)
+    @param header     header of the file being reported
+    @param has_index  non-zero when an index was found for the file
+    @param sample     sample name to print
+    @param fname      path of the file being reported
+    @return always 0
+ */
+static int print_sample(
+        Params* params,
+        sam_hdr_t *header,
+        int has_index,
+        const char* sample,
+        const char* fname) {
+    FILE* out = params->out;
+
+    fputs(sample, out);
+    fputc('\t', out);
+    fputs(fname, out);
+    if (params->test_index) {
+        fprintf(out, "\t%c", has_index ? 'Y' : 'N');
+    }
+
+    if (params->faidx != NULL) {
+        const FaidxPath* match = NULL;
+        const FaidxPath* node;
+
+        /* walk the list until the first reference agreeing with the header */
+        for (node = params->faidx; node != NULL && match == NULL; node = node->next) {
+            int i;
+            int n_seq = faidx_nseq(node->faidx);
+
+            if (n_seq != header->n_targets) continue;
+            for (i = 0; i < n_seq; i++) {
+                const char* seq_name = faidx_iseq(node->faidx, i);
+                /* same name at the same rank... */
+                if (strcmp(seq_name, header->target_name[i]) != 0) break;
+                /* ...and same length */
+                if (faidx_seq_len(node->faidx, seq_name) != header->target_len[i]) break;
+            }
+            /* the inner loop ran to completion: every sequence matched */
+            if (i == n_seq) match = node;
+        }
+
+        fputc('\t', out);
+        fputs(match != NULL ? match->filename : ".", out);
+    }
+
+    fputc('\n', out);
+    return 0;
+}
+
+/** Open one alignment file, collect the distinct values of params->tag
+    found in its @RG header lines and print one line per value with
+    print_sample(). When no value is found a single line is printed with
+    "." as the sample.
+
+    @param params    program parameters
+    @param fname     path of the file to open
+    @param baifname  path of its index, or NULL to look for the default
+                     index; only used when params->test_index is set
+    @return EXIT_SUCCESS, or EXIT_FAILURE (after reporting the error) when
+            the file or its header cannot be read or memory runs out
+ */
+static int print_samples(Params* params, const char* fname, const char* baifname) {
+    samFile *in = NULL;
+    sam_hdr_t *header = NULL;
+    khash_t(sm) *sample_set = NULL;
+    /* declared at function scope so that every exit path releases it */
+    kstring_t sm_val = KS_INITIALIZE;
+    khint_t k;
+    int status = EXIT_SUCCESS;
+    int n_rg, i;
+    int count_samples = 0;
+    int has_index = 0;
+
+    if ((sample_set = kh_init(sm)) == NULL) {
+        print_error("samples", "Failed to initialise sample hash");
+        status = EXIT_FAILURE;
+        goto end_print;
+    }
+
+    if ((in = sam_open_format(fname, "r", NULL)) == NULL) {
+        print_error_errno("samples", "Failed to open \"%s\" for reading", fname);
+        status = EXIT_FAILURE;
+        goto end_print;
+    }
+    if ((header = sam_hdr_read(in)) == NULL) {
+        print_error("samples", "Failed to read the header from \"%s\"", fname);
+        status = EXIT_FAILURE;
+        goto end_print;
+    }
+
+    /* try to load the index if required; a missing index is not an error,
+       the samples and references are reported regardless */
+    if (params->test_index) {
+        /* baifname is the user-supplied index path, or NULL for the default index */
+        hts_idx_t *bam_idx = sam_index_load3(in, fname, baifname, HTS_IDX_SILENT_FAIL);
+        has_index = bam_idx != NULL;
+        if (bam_idx != NULL) hts_idx_destroy(bam_idx);
+    }
+
+    /* scan the @RG lines; the loop does not run when there is none */
+    n_rg = sam_hdr_count_lines(header, "RG");
+    for (i = 0; i < n_rg; i++) {
+        int ret;
+        char* sample;
+
+        /* this @RG line has no such tag */
+        if (sam_hdr_find_tag_pos(header, "RG", i, params->tag, &sm_val) < 0) continue;
+        /* value already seen */
+        k = kh_get(sm, sample_set, ks_str(&sm_val));
+        if (k != kh_end(sample_set)) continue;
+
+        /* the hash keeps the pointer, so it needs its own copy of the value */
+        sample = strdup(ks_str(&sm_val));
+        if (sample == NULL) {
+            print_error_errno("samples", "Out of memory");
+            status = EXIT_FAILURE;
+            goto end_print;
+        }
+        kh_put(sm, sample_set, sample, &ret);
+        if (ret < 0) {
+            print_error("samples", "Failed to insert key '%s' into sample_set", sample);
+            free(sample);
+            status = EXIT_FAILURE;
+            goto end_print;
+        }
+        ++count_samples;
+    }
+
+    if (count_samples == 0) {
+        print_sample(params, header, has_index, ".", fname);
+    } else {
+        for (k = kh_begin(sample_set); k != kh_end(sample_set); ++k) {
+            if (kh_exist(sample_set, k)) {
+                char* sample = (char*)kh_key(sample_set, k);
+                print_sample(params, header, has_index, sample, fname);
+            }
+        }
+    }
+
+end_print:
+    ks_free(&sm_val);
+    if (sample_set != NULL) {
+        /* the keys were allocated with strdup() above */
+        for (k = kh_begin(sample_set); k != kh_end(sample_set); ++k) {
+            if (kh_exist(sample_set, k)) {
+                char* sample = (char*)kh_key(sample_set, k);
+                free(sample);
+            }
+        }
+        kh_destroy(sm, sample_set);
+    }
+    if (header != NULL) sam_hdr_destroy(header);
+    if (in != NULL) sam_close(in);
+
+    return status;
+}
+
+
+/** Entry point of "samtools samples".
+
+    Parses the options, then reports the samples of every input file. The
+    inputs are taken from the remaining command-line arguments or, when
+    there is none, read from standard input (one path per line; with -X
+    each line holds the path and its index separated by a tab).
+
+    @param argc  number of arguments
+    @param argv  arguments, argv[0] being the command name
+    @return EXIT_SUCCESS or EXIT_FAILURE
+ */
+int main_samples(int argc, char** argv) {
+    int status = EXIT_SUCCESS;
+    int print_header = 0;
+    int has_index_file = 0;
+    Params params;
+    char* out_filename = NULL;
+    FaidxPath* fai;
+    int opt;
+
+    strcpy(params.tag, "SM");
+    /* stays NULL until the output is opened, so cleanup knows what to do */
+    params.out = NULL;
+    params.faidx = NULL;
+    params.test_index = 0;
+
+    while ((opt = getopt_long(argc, argv, "?hiXo:f:F:T:", NULL, NULL)) != -1) {
+        switch (opt) {
+        case 'h':
+            print_header = 1;
+            break;
+        case 'o':
+            out_filename = optarg;
+            break;
+        case 'i':
+            params.test_index = 1;
+            break;
+        case 'f':
+            if (load_dictionary(&params, optarg) != EXIT_SUCCESS) {
+                status = EXIT_FAILURE;
+                goto cleanup;
+            }
+            break;
+        case 'F':
+            if (load_dictionaries(&params, optarg) != EXIT_SUCCESS) {
+                status = EXIT_FAILURE;
+                goto cleanup;
+            }
+            break;
+        case 'T':
+            /* params.tag holds exactly two characters plus the NUL */
+            if (strlen(optarg) != 2) {
+                print_error("samples", "Length of tag \"%s\" is not 2.", optarg);
+                status = EXIT_FAILURE;
+                goto cleanup;
+            }
+            strcpy(params.tag, optarg);
+            break;
+        case '?':
+            /* NOTE(review): getopt also returns '?' for an unrecognised
+               option, which therefore prints the help and exits with success */
+            usage_samples(stdout);
+            goto cleanup;
+        case 'X':
+            has_index_file = 1;
+            break;
+        default:
+            usage_samples(stderr);
+            status = EXIT_FAILURE;
+            goto cleanup;
+        }
+    }
+
+    /* if no file was provided and input is the terminal, print the usage and exit */
+    if (argc == optind && isatty(STDIN_FILENO)) {
+        usage_samples(stderr);
+        status = EXIT_FAILURE;
+        goto cleanup;
+    }
+
+    if (out_filename != NULL) {
+        params.out = fopen(out_filename, "w");
+        if (params.out == NULL) {
+            print_error_errno("samples", "Cannot open \"%s\" for writing", out_filename);
+            status = EXIT_FAILURE;
+            goto cleanup;
+        }
+    } else {
+        params.out = stdout;
+    }
+
+    if (print_header) {
+        fprintf(params.out, "#%s\tPATH", params.tag);
+        if (params.test_index) fprintf(params.out, "\tINDEX");
+        if (params.faidx != NULL) fprintf(params.out, "\tREFERENCE");
+        fprintf(params.out, "\n");
+    }
+
+    /* no file was provided, input is stdin, each line contains the path to a bam file */
+    if (argc == optind) {
+        htsFile* fp = hts_open("-", "r");
+        if (fp == NULL) {
+            print_error_errno("samples", "Cannot read from stdin");
+            status = EXIT_FAILURE;
+        } else {
+            kstring_t ks = KS_INITIALIZE;
+            int ret;
+            while ((ret = hts_getline(fp, KS_SEP_LINE, &ks)) >= 0) {
+                char* bai_path = NULL;
+                if (has_index_file) {
+                    /* bam path and bam index file are separated by a tab */
+                    char* tab = strchr(ks_str(&ks), '\t');
+                    if (tab == NULL || *(tab+1) == '\0') {
+                        /* a format problem, errno carries no information here */
+                        print_error("samples", "Expected path-to-bam(tab)path-to-index but got \"%s\"", ks_str(&ks));
+                        status = EXIT_FAILURE;
+                        break;
+                    }
+                    /* split the line in place into two C strings */
+                    *tab = 0;
+                    bai_path = (tab + 1);
+                }
+                if (print_samples(&params, ks_str(&ks), bai_path) != EXIT_SUCCESS) {
+                    status = EXIT_FAILURE;
+                    break;
+                }
+            }
+            /* hts_getline() returns -1 at end of file, below -1 on error */
+            if (ret < -1) {
+                print_error("samples", "Error reading from stdin");
+                status = EXIT_FAILURE;
+            }
+            ks_free(&ks);
+            hts_close(fp);
+        }
+    }
+    /* -X with files on the command line: the first half of the arguments
+       are the alignment files, the second half their indexes in the same order */
+    else if (has_index_file) {
+        if ((argc - optind) % 2 != 0) {
+            print_error("samples","Odd number of filenames detected! Each BAM file should have an index file");
+            status = EXIT_FAILURE;
+        } else {
+            int i;
+            int n = (argc - optind ) / 2;
+            for (i = 0; i < n; i++) {
+                if (print_samples(&params, argv[optind+i], argv[optind+i+n]) != EXIT_SUCCESS) {
+                    status = EXIT_FAILURE;
+                    break;
+                }
+            }
+        }
+    } else {
+        int i;
+        for (i = optind; i < argc; i++) {
+            if (print_samples(&params, argv[i], NULL) != EXIT_SUCCESS) {
+                status = EXIT_FAILURE;
+                break;
+            }
+        }
+    }
+
+cleanup:
+    /* release the references on every exit path */
+    fai = params.faidx;
+    while (fai != NULL) {
+        FaidxPath* next = fai->next;
+        free(fai->filename);
+        fai_destroy(fai->faidx);
+        free(fai);
+        fai = next;
+    }
+
+    if (params.out != NULL) {
+        if (fflush(params.out) != 0) {
+            print_error_errno("samples", "Cannot flush output");
+            status = EXIT_FAILURE;
+        }
+        /* only close what was opened here, never the standard output */
+        if (out_filename != NULL && fclose(params.out) != 0) {
+            print_error_errno("samples", "Cannot close \"%s\"", out_filename);
+            status = EXIT_FAILURE;
+        }
+    }
+
+    return status;
+}
--- /dev/null
+#include "samtools.pysam.h"
+
+/* bam_samples -- print samples in a set of BAM files
+
+ Copyright (C) 2021 Pierre Lindenbaum
+ Institut du Thorax. u1087 Nantes. France.
+ @yokofakun
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE. */
+
+#include <config.h>
+#include <htslib/hts.h>
+#include <htslib/sam.h>
+#include <htslib/faidx.h>
+#include <htslib/khash.h>
+#include <htslib/kseq.h>
+#include <samtools.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <getopt.h>
+
+KHASH_MAP_INIT_STR(sm, int)
+
+/** Node of a singly linked ("chained") list pairing a FASTA filename with
+ its loaded faidx index; the index is compared with the @SQ lines in the
+ SAM header
+ */
+typedef struct FaidxPath {
+ /** path to reference (heap copy owned by this node) */
+ char* filename;
+ /** fasta index */
+ faidx_t* faidx;
+ /** next node in the list, or NULL at the end of the list */
+ struct FaidxPath* next;
+} FaidxPath;
+
+/** program parameters shared by the functions of the "samples" command */
+typedef struct Params {
+ /** output stream */
+ FILE* out;
+ /** two-letter tag looked up in each @RG line, NUL-terminated. default is "SM" */
+ char tag[3];
+ /** first faidx/path in chained list, NULL when no reference was loaded */
+ FaidxPath* faidx;
+ /** non-zero to show whether the bam is indexed */
+ int test_index;
+} Params;
+
+/** Print the command-line help of "samtools samples".
+
+    @param write_to  stream receiving the help text
+ */
+static void usage_samples(FILE *write_to) {
+    /* In the find(1) examples the escaped parentheses must be separate
+       arguments, hence the space after "\(". The default of -o is shown to
+       the user as "stdout", whatever stream backs it in this build. */
+    fprintf(write_to,
+        "Usage: samtools samples [options] <input> [...]\n"
+        " samtools samples [options] -X f1.bam f2.bam f1.bam.bai f2.bai \n"
+        " find dir1 dir2 -type f \\( -name \"*.bam\" -o -name \"*.cram\" \\) | samtools samples [options]\n"
+        " find dir1 dir2 -type f \\( -name \"*.bam\" -o -name \"*.bai\" \\) | sort | paste - - | samtools samples -X [options]\n"
+        "\n"
+        "Options:\n"
+        " -? print help and exit\n"
+        " -h add the columns header before printing the results\n"
+        " -i test if the file is indexed.\n"
+        " -T <tag> provide the sample tag name from the @RG line [SM].\n"
+        " -o <file> output file [stdout].\n"
+        " -f <file.fa> load an indexed fasta file in the collection of references. Can be used multiple times.\n"
+        " -F <file.txt> read a file containing the paths to indexed fasta files. One path per line.\n"
+        " -X use a custom index file.\n"
+        "\n"
+        " Using -f or -F will add a column containing the path to the reference or \".\" if the reference was not found.\n"
+        "\n"
+        );
+}
+
+
+/** Load the fasta index of one FASTA file and push it onto the front of
+    the params->faidx list.
+
+    @param params    program parameters owning the list of references
+    @param filename  path to the FASTA file; a private copy is stored
+    @return EXIT_SUCCESS, or EXIT_FAILURE (after reporting the error) when
+            an allocation fails or the index cannot be loaded
+ */
+static int load_dictionary(struct Params* params, const char* filename) {
+    FaidxPath* node = (FaidxPath*)malloc(sizeof(FaidxPath));
+
+    if (node != NULL) {
+        node->filename = strdup(filename);
+        if (node->filename != NULL) {
+            node->faidx = fai_load(filename);
+            if (node->faidx != NULL) {
+                /* success: the new node becomes the head of the list */
+                node->next = params->faidx;
+                params->faidx = node;
+                return EXIT_SUCCESS;
+            }
+            free(node->filename);
+            free(node);
+            print_error_errno("samples", "Cannot load index from \"%s\"", filename);
+            return EXIT_FAILURE;
+        }
+        free(node);
+    }
+    /* reached when either malloc() or strdup() failed */
+    print_error_errno("samples", "Out of memory");
+    return EXIT_FAILURE;
+}
+
+/** Read a text file listing one indexed FASTA path per line and load each
+    of them into params->faidx with load_dictionary().
+
+    Empty lines are skipped. Loading stops at the first reference that
+    cannot be loaded.
+
+    @param params    program parameters receiving the references
+    @param filename  path to the list file
+    @return EXIT_SUCCESS when the whole file was read and every listed
+            reference was loaded, EXIT_FAILURE (after reporting the error)
+            otherwise
+ */
+static int load_dictionaries(Params* params, const char* filename) {
+    int ret;
+    int status = EXIT_SUCCESS;
+    kstring_t ks = KS_INITIALIZE;
+    htsFile* in = hts_open(filename, "r");
+
+    if (in == NULL) {
+        print_error_errno("samples", "Cannot open \"%s\"", filename);
+        return EXIT_FAILURE;
+    }
+
+    while ((ret = hts_getline(in, KS_SEP_LINE, &ks)) >= 0) {
+        /* tolerate blank lines instead of trying to load "" as a reference */
+        if (ks.l == 0) continue;
+        if (load_dictionary(params, ks_str(&ks)) != EXIT_SUCCESS) {
+            status = EXIT_FAILURE;
+            break;
+        }
+    }
+    /* hts_getline() returns -1 at end of file and a value below -1 on a
+       read error; the latter must not be mistaken for a complete read */
+    if (ret < -1) {
+        print_error("samples", "Error reading \"%s\"", filename);
+        status = EXIT_FAILURE;
+    }
+
+    ks_free(&ks);
+    hts_close(in);
+    return status;
+}
+
+/** Write one tab-separated output line for a (sample, file) pair.
+
+    Columns: sample, file name, then 'Y'/'N' for the index when
+    params->test_index is set, then - when references were loaded - the
+    filename of the first reference whose sequence names and lengths match
+    the header targets in the same order, or "." when none matches.
+
+    @param params     program parameters (output stream, reference list)
+    @param header     header of the file being reported
+    @param has_index  non-zero when an index was found for the file
+    @param sample     sample name to print
+    @param fname      path of the file being reported
+    @return always 0
+ */
+static int print_sample(
+        Params* params,
+        sam_hdr_t *header,
+        int has_index,
+        const char* sample,
+        const char* fname) {
+    FILE* out = params->out;
+
+    fputs(sample, out);
+    fputc('\t', out);
+    fputs(fname, out);
+    if (params->test_index) {
+        fprintf(out, "\t%c", has_index ? 'Y' : 'N');
+    }
+
+    if (params->faidx != NULL) {
+        const FaidxPath* match = NULL;
+        const FaidxPath* node;
+
+        /* walk the list until the first reference agreeing with the header */
+        for (node = params->faidx; node != NULL && match == NULL; node = node->next) {
+            int i;
+            int n_seq = faidx_nseq(node->faidx);
+
+            if (n_seq != header->n_targets) continue;
+            for (i = 0; i < n_seq; i++) {
+                const char* seq_name = faidx_iseq(node->faidx, i);
+                /* same name at the same rank... */
+                if (strcmp(seq_name, header->target_name[i]) != 0) break;
+                /* ...and same length */
+                if (faidx_seq_len(node->faidx, seq_name) != header->target_len[i]) break;
+            }
+            /* the inner loop ran to completion: every sequence matched */
+            if (i == n_seq) match = node;
+        }
+
+        fputc('\t', out);
+        fputs(match != NULL ? match->filename : ".", out);
+    }
+
+    fputc('\n', out);
+    return 0;
+}
+
+/** Open one alignment file, collect the distinct values of params->tag
+    found in its @RG header lines and print one line per value with
+    print_sample(). When no value is found a single line is printed with
+    "." as the sample.
+
+    @param params    program parameters
+    @param fname     path of the file to open
+    @param baifname  path of its index, or NULL to look for the default
+                     index; only used when params->test_index is set
+    @return EXIT_SUCCESS, or EXIT_FAILURE (after reporting the error) when
+            the file or its header cannot be read or memory runs out
+ */
+static int print_samples(Params* params, const char* fname, const char* baifname) {
+    samFile *in = NULL;
+    sam_hdr_t *header = NULL;
+    khash_t(sm) *sample_set = NULL;
+    /* declared at function scope so that every exit path releases it */
+    kstring_t sm_val = KS_INITIALIZE;
+    khint_t k;
+    int status = EXIT_SUCCESS;
+    int n_rg, i;
+    int count_samples = 0;
+    int has_index = 0;
+
+    if ((sample_set = kh_init(sm)) == NULL) {
+        print_error("samples", "Failed to initialise sample hash");
+        status = EXIT_FAILURE;
+        goto end_print;
+    }
+
+    if ((in = sam_open_format(fname, "r", NULL)) == NULL) {
+        print_error_errno("samples", "Failed to open \"%s\" for reading", fname);
+        status = EXIT_FAILURE;
+        goto end_print;
+    }
+    if ((header = sam_hdr_read(in)) == NULL) {
+        print_error("samples", "Failed to read the header from \"%s\"", fname);
+        status = EXIT_FAILURE;
+        goto end_print;
+    }
+
+    /* try to load the index if required; a missing index is not an error,
+       the samples and references are reported regardless */
+    if (params->test_index) {
+        /* baifname is the user-supplied index path, or NULL for the default index */
+        hts_idx_t *bam_idx = sam_index_load3(in, fname, baifname, HTS_IDX_SILENT_FAIL);
+        has_index = bam_idx != NULL;
+        if (bam_idx != NULL) hts_idx_destroy(bam_idx);
+    }
+
+    /* scan the @RG lines; the loop does not run when there is none */
+    n_rg = sam_hdr_count_lines(header, "RG");
+    for (i = 0; i < n_rg; i++) {
+        int ret;
+        char* sample;
+
+        /* this @RG line has no such tag */
+        if (sam_hdr_find_tag_pos(header, "RG", i, params->tag, &sm_val) < 0) continue;
+        /* value already seen */
+        k = kh_get(sm, sample_set, ks_str(&sm_val));
+        if (k != kh_end(sample_set)) continue;
+
+        /* the hash keeps the pointer, so it needs its own copy of the value */
+        sample = strdup(ks_str(&sm_val));
+        if (sample == NULL) {
+            print_error_errno("samples", "Out of memory");
+            status = EXIT_FAILURE;
+            goto end_print;
+        }
+        kh_put(sm, sample_set, sample, &ret);
+        if (ret < 0) {
+            print_error("samples", "Failed to insert key '%s' into sample_set", sample);
+            free(sample);
+            status = EXIT_FAILURE;
+            goto end_print;
+        }
+        ++count_samples;
+    }
+
+    if (count_samples == 0) {
+        print_sample(params, header, has_index, ".", fname);
+    } else {
+        for (k = kh_begin(sample_set); k != kh_end(sample_set); ++k) {
+            if (kh_exist(sample_set, k)) {
+                char* sample = (char*)kh_key(sample_set, k);
+                print_sample(params, header, has_index, sample, fname);
+            }
+        }
+    }
+
+end_print:
+    ks_free(&sm_val);
+    if (sample_set != NULL) {
+        /* the keys were allocated with strdup() above */
+        for (k = kh_begin(sample_set); k != kh_end(sample_set); ++k) {
+            if (kh_exist(sample_set, k)) {
+                char* sample = (char*)kh_key(sample_set, k);
+                free(sample);
+            }
+        }
+        kh_destroy(sm, sample_set);
+    }
+    if (header != NULL) sam_hdr_destroy(header);
+    if (in != NULL) sam_close(in);
+
+    return status;
+}
+
+
+/** Entry point of "samtools samples".
+
+    Parses the options, then reports the samples of every input file. The
+    inputs are taken from the remaining command-line arguments or, when
+    there is none, read from standard input (one path per line; with -X
+    each line holds the path and its index separated by a tab).
+
+    @param argc  number of arguments
+    @param argv  arguments, argv[0] being the command name
+    @return EXIT_SUCCESS or EXIT_FAILURE
+ */
+int main_samples(int argc, char** argv) {
+    int status = EXIT_SUCCESS;
+    int print_header = 0;
+    int has_index_file = 0;
+    Params params;
+    char* out_filename = NULL;
+    FaidxPath* fai;
+    int opt;
+
+    strcpy(params.tag, "SM");
+    /* stays NULL until the output is opened, so cleanup knows what to do */
+    params.out = NULL;
+    params.faidx = NULL;
+    params.test_index = 0;
+
+    while ((opt = getopt_long(argc, argv, "?hiXo:f:F:T:", NULL, NULL)) != -1) {
+        switch (opt) {
+        case 'h':
+            print_header = 1;
+            break;
+        case 'o':
+            out_filename = optarg;
+            break;
+        case 'i':
+            params.test_index = 1;
+            break;
+        case 'f':
+            if (load_dictionary(&params, optarg) != EXIT_SUCCESS) {
+                status = EXIT_FAILURE;
+                goto cleanup;
+            }
+            break;
+        case 'F':
+            if (load_dictionaries(&params, optarg) != EXIT_SUCCESS) {
+                status = EXIT_FAILURE;
+                goto cleanup;
+            }
+            break;
+        case 'T':
+            /* params.tag holds exactly two characters plus the NUL */
+            if (strlen(optarg) != 2) {
+                print_error("samples", "Length of tag \"%s\" is not 2.", optarg);
+                status = EXIT_FAILURE;
+                goto cleanup;
+            }
+            strcpy(params.tag, optarg);
+            break;
+        case '?':
+            /* NOTE(review): getopt also returns '?' for an unrecognised
+               option, which therefore prints the help and exits with success */
+            usage_samples(samtools_stdout);
+            goto cleanup;
+        case 'X':
+            has_index_file = 1;
+            break;
+        default:
+            usage_samples(samtools_stderr);
+            status = EXIT_FAILURE;
+            goto cleanup;
+        }
+    }
+
+    /* if no file was provided and input is the terminal, print the usage and exit */
+    if (argc == optind && isatty(STDIN_FILENO)) {
+        usage_samples(samtools_stderr);
+        status = EXIT_FAILURE;
+        goto cleanup;
+    }
+
+    if (out_filename != NULL) {
+        params.out = fopen(out_filename, "w");
+        if (params.out == NULL) {
+            print_error_errno("samples", "Cannot open \"%s\" for writing", out_filename);
+            status = EXIT_FAILURE;
+            goto cleanup;
+        }
+    } else {
+        params.out = samtools_stdout;
+    }
+
+    if (print_header) {
+        fprintf(params.out, "#%s\tPATH", params.tag);
+        if (params.test_index) fprintf(params.out, "\tINDEX");
+        if (params.faidx != NULL) fprintf(params.out, "\tREFERENCE");
+        fprintf(params.out, "\n");
+    }
+
+    /* no file was provided, input is stdin, each line contains the path to a bam file */
+    if (argc == optind) {
+        htsFile* fp = hts_open("-", "r");
+        if (fp == NULL) {
+            print_error_errno("samples", "Cannot read from stdin");
+            status = EXIT_FAILURE;
+        } else {
+            kstring_t ks = KS_INITIALIZE;
+            int ret;
+            while ((ret = hts_getline(fp, KS_SEP_LINE, &ks)) >= 0) {
+                char* bai_path = NULL;
+                if (has_index_file) {
+                    /* bam path and bam index file are separated by a tab */
+                    char* tab = strchr(ks_str(&ks), '\t');
+                    if (tab == NULL || *(tab+1) == '\0') {
+                        /* a format problem, errno carries no information here */
+                        print_error("samples", "Expected path-to-bam(tab)path-to-index but got \"%s\"", ks_str(&ks));
+                        status = EXIT_FAILURE;
+                        break;
+                    }
+                    /* split the line in place into two C strings */
+                    *tab = 0;
+                    bai_path = (tab + 1);
+                }
+                if (print_samples(&params, ks_str(&ks), bai_path) != EXIT_SUCCESS) {
+                    status = EXIT_FAILURE;
+                    break;
+                }
+            }
+            /* hts_getline() returns -1 at end of file, below -1 on error */
+            if (ret < -1) {
+                print_error("samples", "Error reading from stdin");
+                status = EXIT_FAILURE;
+            }
+            ks_free(&ks);
+            hts_close(fp);
+        }
+    }
+    /* -X with files on the command line: the first half of the arguments
+       are the alignment files, the second half their indexes in the same order */
+    else if (has_index_file) {
+        if ((argc - optind) % 2 != 0) {
+            print_error("samples","Odd number of filenames detected! Each BAM file should have an index file");
+            status = EXIT_FAILURE;
+        } else {
+            int i;
+            int n = (argc - optind ) / 2;
+            for (i = 0; i < n; i++) {
+                if (print_samples(&params, argv[optind+i], argv[optind+i+n]) != EXIT_SUCCESS) {
+                    status = EXIT_FAILURE;
+                    break;
+                }
+            }
+        }
+    } else {
+        int i;
+        for (i = optind; i < argc; i++) {
+            if (print_samples(&params, argv[i], NULL) != EXIT_SUCCESS) {
+                status = EXIT_FAILURE;
+                break;
+            }
+        }
+    }
+
+cleanup:
+    /* release the references on every exit path */
+    fai = params.faidx;
+    while (fai != NULL) {
+        FaidxPath* next = fai->next;
+        free(fai->filename);
+        fai_destroy(fai->faidx);
+        free(fai);
+        fai = next;
+    }
+
+    if (params.out != NULL) {
+        if (fflush(params.out) != 0) {
+            print_error_errno("samples", "Cannot flush output");
+            status = EXIT_FAILURE;
+        }
+        /* only close what was opened here, never the shared output stream */
+        if (out_filename != NULL && fclose(params.out) != 0) {
+            print_error_errno("samples", "Cannot close \"%s\"", out_filename);
+            status = EXIT_FAILURE;
+        }
+    }
+
+    return status;
+}
#include "htslib/kstring.h"
#include "htslib/sam.h"
#include "htslib/hts_endian.h"
+#include "htslib/cram.h"
#include "sam_opts.h"
#include "samtools.h"
#include "bedidx.h"
hts_reglist_t *lreg = NULL;
merged_header_t *merged_hdr = init_merged_header();
if (!merged_hdr) return -1;
+ refs_t *refs = NULL;
// Is there a specified pre-prepared header to use for output?
if (headers) {
flag & MERGE_COMBINE_RG, flag & MERGE_COMBINE_PG,
(flag & MERGE_FIRST_CO)? (i == 0) : true,
RG[i]))
- return -1; // FIXME: memory leak
+ goto fail;
hdr[i] = hin;
+ int order_ok = 1;
if ((translation_tbl+i)->lost_coord_sort && !by_qname) {
fprintf(stderr, "[bam_merge_core] Order of targets in file %s caused coordinate sort to be lost\n", fn[i]);
+ order_ok = 0;
}
- // Potential future improvement is to share headers between CRAM files for
- // samtools sort (where all headers are identical.
- // Eg:
- //
- // if (i > 1) {
- // sam_hdr_free(cram_fd_get_header(fp[i]->fp.cram));
- // cram_fd_set_header(fp[i]->fp.cram, cram_fd_get_header(fp[0]->fp.cram));
- // sam_hdr_incr_ref(cram_fd_get_header(fp[0]->fp.cram));
- // }
+ if (!refs)
+ refs = cram_get_refs(fp[i]);
+
+ if (order_ok && refs && hts_set_opt(fp[i], CRAM_OPT_SHARED_REF, refs))
+ goto fail;
}
// Did we get an @HD line?
goto fail;
hout = merged_hdr->hdr;
- if (!hout) return -1; // FIXME: memory leak
+ if (!hout)
+ goto fail;
// If we're only merging a specified region move our iters to start at that point
int tid, nreg;
}
if (!(flag & MERGE_UNCOMP)) hts_set_threads(fpout, n_threads);
+ if (refs && hts_set_opt(fpout, CRAM_OPT_SHARED_REF, refs))
+ goto fail;
+
// Begin the actual merge
ks_heapmake(heap, n, heap);
while (heap->pos != HEAP_EMPTY) {
const char *prefix;
bam1_tag *buf;
const sam_hdr_t *h;
+ char *tmpfile_name;
int index;
int error;
int no_save;
+ int large_pos;
} worker_t;
// Returns 0 for success
{
worker_t *w = (worker_t*)data;
char *name;
+ size_t name_len;
w->error = 0;
+ w->tmpfile_name = NULL;
if (!g_is_by_qname && !g_is_by_tag && !g_is_by_minhash) {
if (ks_radixsort(w->buf_len, w->buf, w->h) < 0) {
if (w->no_save)
return 0;
- name = (char*)calloc(strlen(w->prefix) + 20, 1);
+ name_len = strlen(w->prefix) + 30;
+ name = (char*)calloc(name_len, 1);
if (!name) { w->error = errno; return 0; }
- sprintf(name, "%s.%.4d.bam", w->prefix, w->index);
-
- uint32_t max_ncigar = 0;
- int i;
- for (i = 0; i < w->buf_len; i++) {
- uint32_t nc = w->buf[i].bam_record->core.n_cigar;
- if (max_ncigar < nc)
- max_ncigar = nc;
- }
+ const int MAX_TRIES = 1000;
+ int tries = 0;
+ for (;;) {
+ if (tries) {
+ snprintf(name, name_len, "%s.%.4d-%.3d.bam",
+ w->prefix, w->index, tries);
+ } else {
+ snprintf(name, name_len, "%s.%.4d.bam", w->prefix, w->index);
+ }
- if (max_ncigar > 65535) {
- htsFormat fmt;
- memset(&fmt, 0, sizeof(fmt));
- if (hts_parse_format(&fmt, "cram,version=3.0,no_ref,seqs_per_slice=1000") < 0) {
+ if (write_buffer(name, w->large_pos ? "wzx1" : "wbx1",
+ w->buf_len, w->buf, w->h, 0, NULL, 0, NULL, 1, 0) == 0) {
+ break;
+ }
+ if (errno == EEXIST && tries < MAX_TRIES) {
+ tries++;
+ } else {
w->error = errno;
- free(name);
- return 0;
+ break;
}
+ }
- if (write_buffer(name, "wcx1", w->buf_len, w->buf, w->h, 0, &fmt, 0, NULL, 1, 0) < 0)
- w->error = errno;
+ if (w->error) {
+ free(name);
} else {
- if (write_buffer(name, "wbx1", w->buf_len, w->buf, w->h, 0, NULL, 0, NULL, 1, 0) < 0)
- w->error = errno;
+ w->tmpfile_name = name;
}
-
- free(name);
return 0;
}
static int sort_blocks(int n_files, size_t k, bam1_tag *buf, const char *prefix,
- const sam_hdr_t *h, int n_threads, buf_region *in_mem)
+ const sam_hdr_t *h, int n_threads, buf_region *in_mem,
+ int large_pos, char **fns, size_t fns_size)
{
int i;
size_t pos, rest;
w[i].prefix = prefix;
w[i].h = h;
w[i].index = n_files + i;
+ w[i].tmpfile_name = NULL;
+ w[i].large_pos = large_pos;
if (in_mem) {
w[i].no_save = 1;
in_mem[i].from = pos;
}
for (i = 0; i < n_threads; ++i) {
pthread_join(tid[i], 0);
+ if (!in_mem) {
+ assert(w[i].index >= 0 && w[i].index < fns_size);
+ fns[w[i].index] = w[i].tmpfile_name;
+ }
if (w[i].error != 0) {
errno = w[i].error;
print_error_errno("sort", "failed to create temporary file \"%s.%.4d.bam\"", prefix, w[i].index);
n_failed++;
}
}
+ if (n_failed && !in_mem) {
+ // Clean up any temporary files that did get made, as we're
+ // about to lose track of them
+ for (i = 0; i < n_threads; ++i) {
+ if (fns[w[i].index]) {
+ unlink(fns[w[i].index]);
+ free(fns[w[i].index]);
+ fns[w[i].index] = NULL;
+ }
+ }
+ }
free(tid); free(w);
if (n_failed) return -1;
if (in_mem) return n_threads;
const htsFormat *in_fmt, const htsFormat *out_fmt,
char *arg_list, int no_pg, int write_index)
{
- int ret = -1, res, i, n_files = 0;
+ int ret = -1, res, i, nref, n_files = 0;
size_t max_k, k, max_mem, bam_mem_offset;
sam_hdr_t *header = NULL;
samFile *fp;
bam1_t *b = bam_init1();
uint8_t *bam_mem = NULL;
char **fns = NULL;
+ size_t fns_size = 0;
const char *new_so;
buf_region *in_mem = NULL;
int num_in_mem = 0;
+ int large_pos = 0;
if (!b) {
print_error("sort", "couldn't allocate memory for bam record");
goto err;
}
+ // Inspect the header looking for long chromosomes
+ // If there is one, we need to write temporary files in SAM format
+ nref = sam_hdr_nref(header);
+ for (i = 0; i < nref; i++) {
+ if (sam_hdr_tid2len(header, i) > INT32_MAX)
+ large_pos = 1;
+ }
+
+ // Also check the output format is large position compatible
+ if (large_pos) {
+ int compatible = (out_fmt->format == sam
+ || (out_fmt->format == cram
+ && out_fmt->version.major >= 4)
+ || (out_fmt->format == unknown_format
+ && modeout[0] == 'w'
+ && (modeout[1] == 'z' || modeout[1] == '\0')));
+ if (!compatible) {
+ print_error("sort", "output format is not compatible with very large references");
+ goto err;
+ }
+ }
+
if (sort_by_tag != NULL)
new_so = "unknown";
else if (is_by_qname)
++k;
if (mem_full) {
- n_files = sort_blocks(n_files, k, buf, prefix, header, n_threads,
- NULL);
- if (n_files < 0) {
+ if (hts_resize(char *, n_files + (n_threads > 0 ? n_threads : 1),
+ &fns_size, &fns, 0) < 0)
+ goto err;
+ int new_n = sort_blocks(n_files, k, buf, prefix, header, n_threads,
+ NULL, large_pos, fns, fns_size);
+ if (new_n < 0) {
goto err;
+ } else {
+ n_files = new_n;
}
k = 0;
bam_mem_offset = 0;
in_mem = calloc(n_threads > 0 ? n_threads : 1, sizeof(in_mem[0]));
if (!in_mem) goto err;
num_in_mem = sort_blocks(n_files, k, buf, prefix, header, n_threads,
- in_mem);
+ in_mem, large_pos, fns, fns_size);
if (num_in_mem < 0) goto err;
} else {
num_in_mem = 0;
fprintf(stderr,
"[bam_sort_core] merging from %d files and %d in-memory blocks...\n",
n_files, num_in_mem);
- fns = (char**)calloc(n_files, sizeof(char*));
- if (!fns) goto err;
+ // Paranoia check - all temporary files should have a name
for (i = 0; i < n_files; ++i) {
- fns[i] = (char*)calloc(strlen(prefix) + 20, 1);
- if (!fns[i]) goto err;
- sprintf(fns[i], "%s.%.4d.bam", prefix, i);
+ if (!fns[i]) {
+ print_error("sort",
+ "BUG: no name stored for temporary file %d", i);
+ abort();
+ }
}
if (bam_merge_simple(is_by_qname, sort_by_tag, fnout, modeout, header,
n_files, fns, num_in_mem, in_mem, buf,
if (level >= 0) sprintf(strchr(modeout, '\0'), "%d", level < 9? level : 9);
if (tmpprefix.l == 0) {
- if (strcmp(fnout, "-") != 0) ksprintf(&tmpprefix, "%s.tmp", fnout);
- else kputc('.', &tmpprefix);
+ if (strcmp(fnout, "-") != 0) {
+ char *idx = strstr(fnout, HTS_IDX_DELIM);
+ kputsn(fnout, idx ? idx - fnout : strlen(fnout), &tmpprefix);
+ kputs(".tmp", &tmpprefix);
+ } else {
+ kputc('.', &tmpprefix);
+ }
}
if (stat(tmpprefix.s, &st) == 0 && S_ISDIR(st.st_mode)) {
unsigned t = ((unsigned) time(NULL)) ^ ((unsigned) clock());
#include "htslib/kstring.h"
#include "htslib/sam.h"
#include "htslib/hts_endian.h"
+#include "htslib/cram.h"
#include "sam_opts.h"
#include "samtools.h"
#include "bedidx.h"
hts_reglist_t *lreg = NULL;
merged_header_t *merged_hdr = init_merged_header();
if (!merged_hdr) return -1;
+ refs_t *refs = NULL;
// Is there a specified pre-prepared header to use for output?
if (headers) {
flag & MERGE_COMBINE_RG, flag & MERGE_COMBINE_PG,
(flag & MERGE_FIRST_CO)? (i == 0) : true,
RG[i]))
- return -1; // FIXME: memory leak
+ goto fail;
hdr[i] = hin;
+ int order_ok = 1;
if ((translation_tbl+i)->lost_coord_sort && !by_qname) {
fprintf(samtools_stderr, "[bam_merge_core] Order of targets in file %s caused coordinate sort to be lost\n", fn[i]);
+ order_ok = 0;
}
- // Potential future improvement is to share headers between CRAM files for
- // samtools sort (where all headers are identical.
- // Eg:
- //
- // if (i > 1) {
- // sam_hdr_free(cram_fd_get_header(fp[i]->fp.cram));
- // cram_fd_set_header(fp[i]->fp.cram, cram_fd_get_header(fp[0]->fp.cram));
- // sam_hdr_incr_ref(cram_fd_get_header(fp[0]->fp.cram));
- // }
+ if (!refs)
+ refs = cram_get_refs(fp[i]);
+
+ if (order_ok && refs && hts_set_opt(fp[i], CRAM_OPT_SHARED_REF, refs))
+ goto fail;
}
// Did we get an @HD line?
goto fail;
hout = merged_hdr->hdr;
- if (!hout) return -1; // FIXME: memory leak
+ if (!hout)
+ goto fail;
// If we're only merging a specified region move our iters to start at that point
int tid, nreg;
}
if (!(flag & MERGE_UNCOMP)) hts_set_threads(fpout, n_threads);
+ if (refs && hts_set_opt(fpout, CRAM_OPT_SHARED_REF, refs))
+ goto fail;
+
// Begin the actual merge
ks_heapmake(heap, n, heap);
while (heap->pos != HEAP_EMPTY) {
const char *prefix;
bam1_tag *buf;
const sam_hdr_t *h;
+ char *tmpfile_name;
int index;
int error;
int no_save;
+ int large_pos;
} worker_t;
// Returns 0 for success
{
worker_t *w = (worker_t*)data;
char *name;
+ size_t name_len;
w->error = 0;
+ w->tmpfile_name = NULL;
if (!g_is_by_qname && !g_is_by_tag && !g_is_by_minhash) {
if (ks_radixsort(w->buf_len, w->buf, w->h) < 0) {
if (w->no_save)
return 0;
- name = (char*)calloc(strlen(w->prefix) + 20, 1);
+ name_len = strlen(w->prefix) + 30;
+ name = (char*)calloc(name_len, 1);
if (!name) { w->error = errno; return 0; }
- sprintf(name, "%s.%.4d.bam", w->prefix, w->index);
-
- uint32_t max_ncigar = 0;
- int i;
- for (i = 0; i < w->buf_len; i++) {
- uint32_t nc = w->buf[i].bam_record->core.n_cigar;
- if (max_ncigar < nc)
- max_ncigar = nc;
- }
+ const int MAX_TRIES = 1000;
+ int tries = 0;
+ for (;;) {
+ if (tries) {
+ snprintf(name, name_len, "%s.%.4d-%.3d.bam",
+ w->prefix, w->index, tries);
+ } else {
+ snprintf(name, name_len, "%s.%.4d.bam", w->prefix, w->index);
+ }
- if (max_ncigar > 65535) {
- htsFormat fmt;
- memset(&fmt, 0, sizeof(fmt));
- if (hts_parse_format(&fmt, "cram,version=3.0,no_ref,seqs_per_slice=1000") < 0) {
+ if (write_buffer(name, w->large_pos ? "wzx1" : "wbx1",
+ w->buf_len, w->buf, w->h, 0, NULL, 0, NULL, 1, 0) == 0) {
+ break;
+ }
+ if (errno == EEXIST && tries < MAX_TRIES) {
+ tries++;
+ } else {
w->error = errno;
- free(name);
- return 0;
+ break;
}
+ }
- if (write_buffer(name, "wcx1", w->buf_len, w->buf, w->h, 0, &fmt, 0, NULL, 1, 0) < 0)
- w->error = errno;
+ if (w->error) {
+ free(name);
} else {
- if (write_buffer(name, "wbx1", w->buf_len, w->buf, w->h, 0, NULL, 0, NULL, 1, 0) < 0)
- w->error = errno;
+ w->tmpfile_name = name;
}
-
- free(name);
return 0;
}
static int sort_blocks(int n_files, size_t k, bam1_tag *buf, const char *prefix,
- const sam_hdr_t *h, int n_threads, buf_region *in_mem)
+ const sam_hdr_t *h, int n_threads, buf_region *in_mem,
+ int large_pos, char **fns, size_t fns_size)
{
int i;
size_t pos, rest;
w[i].prefix = prefix;
w[i].h = h;
w[i].index = n_files + i;
+ w[i].tmpfile_name = NULL;
+ w[i].large_pos = large_pos;
if (in_mem) {
w[i].no_save = 1;
in_mem[i].from = pos;
}
for (i = 0; i < n_threads; ++i) {
pthread_join(tid[i], 0);
+ if (!in_mem) {
+ assert(w[i].index >= 0 && w[i].index < fns_size);
+ fns[w[i].index] = w[i].tmpfile_name;
+ }
if (w[i].error != 0) {
errno = w[i].error;
print_error_errno("sort", "failed to create temporary file \"%s.%.4d.bam\"", prefix, w[i].index);
n_failed++;
}
}
+ if (n_failed && !in_mem) {
+ // Clean up any temporary files that did get made, as we're
+ // about to lose track of them
+ for (i = 0; i < n_threads; ++i) {
+ if (fns[w[i].index]) {
+ unlink(fns[w[i].index]);
+ free(fns[w[i].index]);
+ fns[w[i].index] = NULL;
+ }
+ }
+ }
free(tid); free(w);
if (n_failed) return -1;
if (in_mem) return n_threads;
const htsFormat *in_fmt, const htsFormat *out_fmt,
char *arg_list, int no_pg, int write_index)
{
- int ret = -1, res, i, n_files = 0;
+ int ret = -1, res, i, nref, n_files = 0;
size_t max_k, k, max_mem, bam_mem_offset;
sam_hdr_t *header = NULL;
samFile *fp;
bam1_t *b = bam_init1();
uint8_t *bam_mem = NULL;
char **fns = NULL;
+ size_t fns_size = 0;
const char *new_so;
buf_region *in_mem = NULL;
int num_in_mem = 0;
+ int large_pos = 0;
if (!b) {
print_error("sort", "couldn't allocate memory for bam record");
goto err;
}
+ // Inspect the header looking for long chromosomes
+ // If there is one, we need to write temporary files in SAM format
+ nref = sam_hdr_nref(header);
+ for (i = 0; i < nref; i++) {
+ if (sam_hdr_tid2len(header, i) > INT32_MAX)
+ large_pos = 1;
+ }
+
+ // Also check the output format is large position compatible
+ if (large_pos) {
+ int compatible = (out_fmt->format == sam
+ || (out_fmt->format == cram
+ && out_fmt->version.major >= 4)
+ || (out_fmt->format == unknown_format
+ && modeout[0] == 'w'
+ && (modeout[1] == 'z' || modeout[1] == '\0')));
+ if (!compatible) {
+ print_error("sort", "output format is not compatible with very large references");
+ goto err;
+ }
+ }
+
if (sort_by_tag != NULL)
new_so = "unknown";
else if (is_by_qname)
++k;
if (mem_full) {
- n_files = sort_blocks(n_files, k, buf, prefix, header, n_threads,
- NULL);
- if (n_files < 0) {
+ if (hts_resize(char *, n_files + (n_threads > 0 ? n_threads : 1),
+ &fns_size, &fns, 0) < 0)
+ goto err;
+ int new_n = sort_blocks(n_files, k, buf, prefix, header, n_threads,
+ NULL, large_pos, fns, fns_size);
+ if (new_n < 0) {
goto err;
+ } else {
+ n_files = new_n;
}
k = 0;
bam_mem_offset = 0;
in_mem = calloc(n_threads > 0 ? n_threads : 1, sizeof(in_mem[0]));
if (!in_mem) goto err;
num_in_mem = sort_blocks(n_files, k, buf, prefix, header, n_threads,
- in_mem);
+ in_mem, large_pos, fns, fns_size);
if (num_in_mem < 0) goto err;
} else {
num_in_mem = 0;
fprintf(samtools_stderr,
"[bam_sort_core] merging from %d files and %d in-memory blocks...\n",
n_files, num_in_mem);
- fns = (char**)calloc(n_files, sizeof(char*));
- if (!fns) goto err;
+ // Paranoia check - all temporary files should have a name
for (i = 0; i < n_files; ++i) {
- fns[i] = (char*)calloc(strlen(prefix) + 20, 1);
- if (!fns[i]) goto err;
- sprintf(fns[i], "%s.%.4d.bam", prefix, i);
+ if (!fns[i]) {
+ print_error("sort",
+ "BUG: no name stored for temporary file %d", i);
+ abort();
+ }
}
if (bam_merge_simple(is_by_qname, sort_by_tag, fnout, modeout, header,
n_files, fns, num_in_mem, in_mem, buf,
if (level >= 0) sprintf(strchr(modeout, '\0'), "%d", level < 9? level : 9);
if (tmpprefix.l == 0) {
- if (strcmp(fnout, "-") != 0) ksprintf(&tmpprefix, "%s.tmp", fnout);
- else kputc('.', &tmpprefix);
+ if (strcmp(fnout, "-") != 0) {
+ char *idx = strstr(fnout, HTS_IDX_DELIM);
+ kputsn(fnout, idx ? idx - fnout : strlen(fnout), &tmpprefix);
+ kputs(".tmp", &tmpprefix);
+ } else {
+ kputc('.', &tmpprefix);
+ }
}
if (stat(tmpprefix.s, &st) == 0 && S_ISDIR(st.st_mode)) {
unsigned t = ((unsigned) time(NULL)) ^ ((unsigned) clock());
static int bamshuf(const char *fn, int n_files, const char *pre, int clevel,
- int is_samtools_stdout, const char *output_file, int fast, int store_max, sam_global_args *ga, char *arg_list, int no_pg)
+ int is_stdout, const char *output_file, int fast, int store_max, sam_global_args *ga, char *arg_list, int no_pg)
{
samFile *fp, *fpw = NULL, **fpt = NULL;
char **fnt = NULL, modew[8];
sprintf(modew, "wb%d", (clevel >= 0 && clevel <= 9)? clevel : DEF_CLEVEL);
- if (!is_samtools_stdout && !output_file) { // output to a file (name based on prefix)
+ if (!is_stdout && !output_file) { // output to a file (name based on prefix)
char *fnw = (char*)calloc(l + 5, 1);
if (!fnw) goto mem_fail;
if (ga->out.format == unknown_format)
fpw = sam_open_format(output_file, modew, &ga->out);
} else fpw = sam_open_format("-", modew, &ga->out); // output to samtools_stdout
if (fpw == NULL) {
- if (is_samtools_stdout) print_error_errno("collate", "Cannot open standard output");
+ if (is_stdout) print_error_errno("collate", "Cannot open standard output");
else print_error_errno("collate", "Cannot open output file \"%s.bam\"", pre);
goto fail;
}
int main_bamshuf(int argc, char *argv[])
{
- int c, n_files = 64, clevel = DEF_CLEVEL, is_samtools_stdout = 0, is_un = 0, fast_coll = 0, reads_store = 10000, ret, pre_mem = 0, no_pg = 0;
+ int c, n_files = 64, clevel = DEF_CLEVEL, is_stdout = 0, is_un = 0, fast_coll = 0, reads_store = 10000, ret, pre_mem = 0, no_pg = 0;
const char *output_file = NULL;
char *prefix = NULL, *arg_list = NULL;
sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
case 'n': n_files = atoi(optarg); break;
case 'l': clevel = atoi(optarg); break;
case 'u': is_un = 1; break;
- case 'O': is_samtools_stdout = 1; break;
+ case 'O': is_stdout = 1; break;
case 'o': output_file = optarg; break;
case 'f': fast_coll = 1; break;
case 'r': reads_store = atoi(optarg); break;
}
if (is_un) clevel = 0;
if (argc >= optind + 2) prefix = argv[optind+1];
- if (!(prefix || is_samtools_stdout || output_file))
+ if (!(prefix || is_stdout || output_file))
return usage(samtools_stderr, n_files, reads_store);
- if (is_samtools_stdout && output_file) {
+ if (is_stdout && output_file) {
fprintf(samtools_stderr, "collate: -o and -O options cannot be used together.\n");
return usage(samtools_stderr, n_files, reads_store);
}
return 1;
}
- ret = bamshuf(argv[optind], n_files, prefix, clevel, is_samtools_stdout,
+ ret = bamshuf(argv[optind], n_files, prefix, clevel, is_stdout,
output_file, fast_coll, reads_store, &ga, arg_list, no_pg);
if (pre_mem) free(prefix);
int amplicon_clip_main(int argc, char *argv[]);
int main_ampliconstats(int argc, char *argv[]);
int main_import(int argc, char *argv[]);
+int main_samples(int argc, char *argv[]);
const char *samtools_version()
{
" tview text alignment viewer\n"
" view SAM<->BAM<->CRAM conversion\n"
" depad convert padded BAM to unpadded BAM\n"
+" samples list the samples in a set of SAM/BAM/CRAM files\n"
"\n"
" -- Misc\n"
" help [cmd] display this help message or help for [cmd]\n"
}
else if (strcmp(argv[1], "tview") == 0) ret = bam_tview_main(argc-1, argv+1);
else if (strcmp(argv[1], "ampliconstats") == 0) ret = main_ampliconstats(argc-1, argv+1);
+ else if (strcmp(argv[1], "samples") == 0) ret = main_samples(argc-1, argv+1);
else if (strcmp(argv[1], "version") == 0 || \
strcmp(argv[1], "--version") == 0) {
long_version();
int amplicon_clip_main(int argc, char *argv[]);
int main_ampliconstats(int argc, char *argv[]);
int main_import(int argc, char *argv[]);
+int main_samples(int argc, char *argv[]);
const char *samtools_version()
{
" tview text alignment viewer\n"
" view SAM<->BAM<->CRAM conversion\n"
" depad convert padded BAM to unpadded BAM\n"
+" samples list the samples in a set of SAM/BAM/CRAM files\n"
"\n"
" -- Misc\n"
" help [cmd] display this help message or help for [cmd]\n"
}
//else if (strcmp(argv[1], "tview") == 0) ret = bam_tview_main(argc-1, argv+1);
else if (strcmp(argv[1], "ampliconstats") == 0) ret = main_ampliconstats(argc-1, argv+1);
+ else if (strcmp(argv[1], "samples") == 0) ret = main_samples(argc-1, argv+1);
else if (strcmp(argv[1], "version") == 0 || \
strcmp(argv[1], "--version") == 0) {
long_version();
" endpos End position (or sequence length)\n"
" numreads Number reads aligned to the region (after filtering)\n"
" covbases Number of covered bases with depth >= 1\n"
- " coverage Proportion of covered bases [0..1]\n"
+ " coverage Percentage of covered bases [0..100]\n"
" meandepth Mean depth of coverage\n"
" meanbaseq Mean baseQ in covered region\n"
" meanmapq Mean mapQ of selected reads\n"
" endpos End position (or sequence length)\n"
" numreads Number reads aligned to the region (after filtering)\n"
" covbases Number of covered bases with depth >= 1\n"
- " coverage Proportion of covered bases [0..1]\n"
+ " coverage Percentage of covered bases [0..100]\n"
" meandepth Mean depth of coverage\n"
" meanbaseq Mean baseQ in covered region\n"
" meanmapq Mean mapQ of selected reads\n"
+++ /dev/null
-/* sam.c -- format-neutral SAM/BAM API.
-
- Copyright (C) 2009, 2012-2016 Genome Research Ltd.
- Portions copyright (C) 2011 Broad Institute.
-
- Author: Heng Li <lh3@sanger.ac.uk>
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-DEALINGS IN THE SOFTWARE. */
-
-#include <config.h>
-
-#include <string.h>
-#include <unistd.h>
-#include "htslib/faidx.h"
-#include "sam.h"
-
-int samthreads(samfile_t *fp, int n_threads, int n_sub_blks)
-{
- if (hts_get_format(fp->file)->format != bam || !fp->is_write) return -1;
- if (bgzf_mt(fp->x.bam, n_threads, n_sub_blks) < 0) return -1;
- return 0;
-}
-
-samfile_t *samopen(const char *fn, const char *mode, const void *aux)
-{
- // hts_open() is really sam_open(), except for #define games
- samFile *hts_fp = hts_open(fn, mode);
- if (hts_fp == NULL) return NULL;
-
- samfile_t *fp = malloc(sizeof (samfile_t));
- if (!fp) {
- sam_close(hts_fp);
- return NULL;
- }
- fp->file = hts_fp;
- fp->x.bam = hts_fp->fp.bgzf;
- if (strchr(mode, 'r')) {
- if (aux) {
- if (hts_set_fai_filename(fp->file, aux) != 0) {
- sam_close(hts_fp);
- free(fp);
- return NULL;
- }
- }
- fp->header = sam_hdr_read(fp->file); // samclose() will free this
- if (fp->header == NULL) {
- sam_close(hts_fp);
- free(fp);
- return NULL;
- }
- fp->is_write = 0;
- if (sam_hdr_nref(fp->header) == 0 && bam_verbose >= 1)
- fprintf(stderr, "[samopen] no @SQ lines in the header.\n");
- }
- else {
- enum htsExactFormat fmt = hts_get_format(fp->file)->format;
- fp->header = (sam_hdr_t *)aux; // For writing, we won't free it
- fp->is_write = 1;
- if (!(fmt == text_format || fmt == sam) || strchr(mode, 'h')) {
- if (sam_hdr_write(fp->file, fp->header) < 0) {
- if (bam_verbose >= 1)
- fprintf(stderr, "[samopen] Couldn't write header\n");
- sam_close(hts_fp);
- free(fp);
- return NULL;
- }
- }
- }
-
- return fp;
-}
-
-void samclose(samfile_t *fp)
-{
- if (fp) {
- if (!fp->is_write && fp->header) sam_hdr_destroy(fp->header);
- sam_close(fp->file);
- free(fp);
- }
-}
-
-int samfetch(samfile_t *fp, const bam_index_t *idx, int tid, int beg, int end, void *data, bam_fetch_f func)
-{
- bam1_t *b = bam_init1();
- hts_itr_t *iter = sam_itr_queryi(idx, tid, beg, end);
- int ret;
- while ((ret = sam_itr_next(fp->file, iter, b)) >= 0) func(b, data);
- hts_itr_destroy(iter);
- bam_destroy1(b);
- return (ret == -1)? 0 : ret;
-}
-
-int sampileup(samfile_t *fp, int mask, bam_pileup_f func, void *func_data)
-{
- bam_plbuf_t *buf;
- int ret;
- bam1_t *b;
- b = bam_init1();
- buf = bam_plbuf_init(func, func_data);
- if (mask < 0) mask = BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP;
- else mask |= BAM_FUNMAP;
- while ((ret = samread(fp, b)) >= 0) {
- // bam_plp_push() itself now filters out unmapped reads only
- if (b->core.flag & mask) b->core.flag |= BAM_FUNMAP;
- bam_plbuf_push(b, buf);
- }
- bam_plbuf_push(0, buf);
- bam_plbuf_destroy(buf);
- bam_destroy1(b);
- return 0;
-}
-
-char *samfaipath(const char *fn_ref)
-{
- char *fn_list = 0;
- if (fn_ref == 0) return 0;
- fn_list = calloc(strlen(fn_ref) + 5, 1);
- strcat(strcpy(fn_list, fn_ref), ".fai");
- if (access(fn_list, R_OK) == -1) { // fn_list is unreadable
- if (access(fn_ref, R_OK) == -1) {
- fprintf(stderr, "[samfaipath] fail to read file %s.\n", fn_ref);
- } else {
- if (bam_verbose >= 3) fprintf(stderr, "[samfaipath] build FASTA index...\n");
- if (fai_build(fn_ref) == -1) {
- fprintf(stderr, "[samfaipath] fail to build FASTA index.\n");
- free(fn_list); fn_list = 0;
- }
- }
- }
- return fn_list;
-}
+++ /dev/null
-#include "samtools.pysam.h"
-
-/* sam.c -- format-neutral SAM/BAM API.
-
- Copyright (C) 2009, 2012-2016 Genome Research Ltd.
- Portions copyright (C) 2011 Broad Institute.
-
- Author: Heng Li <lh3@sanger.ac.uk>
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-DEALINGS IN THE SOFTWARE. */
-
-#include <config.h>
-
-#include <string.h>
-#include <unistd.h>
-#include "htslib/faidx.h"
-#include "sam.h"
-
-int samthreads(samfile_t *fp, int n_threads, int n_sub_blks)
-{
- if (hts_get_format(fp->file)->format != bam || !fp->is_write) return -1;
- if (bgzf_mt(fp->x.bam, n_threads, n_sub_blks) < 0) return -1;
- return 0;
-}
-
-samfile_t *samopen(const char *fn, const char *mode, const void *aux)
-{
- // hts_open() is really sam_open(), except for #define games
- samFile *hts_fp = hts_open(fn, mode);
- if (hts_fp == NULL) return NULL;
-
- samfile_t *fp = malloc(sizeof (samfile_t));
- if (!fp) {
- sam_close(hts_fp);
- return NULL;
- }
- fp->file = hts_fp;
- fp->x.bam = hts_fp->fp.bgzf;
- if (strchr(mode, 'r')) {
- if (aux) {
- if (hts_set_fai_filename(fp->file, aux) != 0) {
- sam_close(hts_fp);
- free(fp);
- return NULL;
- }
- }
- fp->header = sam_hdr_read(fp->file); // samclose() will free this
- if (fp->header == NULL) {
- sam_close(hts_fp);
- free(fp);
- return NULL;
- }
- fp->is_write = 0;
- if (sam_hdr_nref(fp->header) == 0 && bam_verbose >= 1)
- fprintf(samtools_stderr, "[samopen] no @SQ lines in the header.\n");
- }
- else {
- enum htsExactFormat fmt = hts_get_format(fp->file)->format;
- fp->header = (sam_hdr_t *)aux; // For writing, we won't free it
- fp->is_write = 1;
- if (!(fmt == text_format || fmt == sam) || strchr(mode, 'h')) {
- if (sam_hdr_write(fp->file, fp->header) < 0) {
- if (bam_verbose >= 1)
- fprintf(samtools_stderr, "[samopen] Couldn't write header\n");
- sam_close(hts_fp);
- free(fp);
- return NULL;
- }
- }
- }
-
- return fp;
-}
-
-void samclose(samfile_t *fp)
-{
- if (fp) {
- if (!fp->is_write && fp->header) sam_hdr_destroy(fp->header);
- sam_close(fp->file);
- free(fp);
- }
-}
-
-int samfetch(samfile_t *fp, const bam_index_t *idx, int tid, int beg, int end, void *data, bam_fetch_f func)
-{
- bam1_t *b = bam_init1();
- hts_itr_t *iter = sam_itr_queryi(idx, tid, beg, end);
- int ret;
- while ((ret = sam_itr_next(fp->file, iter, b)) >= 0) func(b, data);
- hts_itr_destroy(iter);
- bam_destroy1(b);
- return (ret == -1)? 0 : ret;
-}
-
-int sampileup(samfile_t *fp, int mask, bam_pileup_f func, void *func_data)
-{
- bam_plbuf_t *buf;
- int ret;
- bam1_t *b;
- b = bam_init1();
- buf = bam_plbuf_init(func, func_data);
- if (mask < 0) mask = BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP;
- else mask |= BAM_FUNMAP;
- while ((ret = samread(fp, b)) >= 0) {
- // bam_plp_push() itself now filters out unmapped reads only
- if (b->core.flag & mask) b->core.flag |= BAM_FUNMAP;
- bam_plbuf_push(b, buf);
- }
- bam_plbuf_push(0, buf);
- bam_plbuf_destroy(buf);
- bam_destroy1(b);
- return 0;
-}
-
-char *samfaipath(const char *fn_ref)
-{
- char *fn_list = 0;
- if (fn_ref == 0) return 0;
- fn_list = calloc(strlen(fn_ref) + 5, 1);
- strcat(strcpy(fn_list, fn_ref), ".fai");
- if (access(fn_list, R_OK) == -1) { // fn_list is unreadable
- if (access(fn_ref, R_OK) == -1) {
- fprintf(samtools_stderr, "[samfaipath] fail to read file %s.\n", fn_ref);
- } else {
- if (bam_verbose >= 3) fprintf(samtools_stderr, "[samfaipath] build FASTA index...\n");
- if (fai_build(fn_ref) == -1) {
- fprintf(samtools_stderr, "[samfaipath] fail to build FASTA index.\n");
- free(fn_list); fn_list = 0;
- }
- }
- }
- return fn_list;
-}
+++ /dev/null
-/* sam.h -- format-neutral SAM/BAM API.
-
- Copyright (C) 2009, 2013-2015, 2019 Genome Research Ltd.
-
- Author: Heng Li <lh3@sanger.ac.uk>
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-DEALINGS IN THE SOFTWARE. */
-
-#ifndef BAM_SAM_H
-#define BAM_SAM_H
-
-#include "htslib/sam.h"
-#include "bam.h"
-
-/*!
- @header
-
- This file provides higher level of I/O routines and unifies the APIs
- for SAM and BAM formats. These APIs are more convenient and
- recommended.
-
- @copyright Genome Research Ltd.
- */
-
-/*! @typedef
- @abstract SAM/BAM file handler
- @field type type of the handler; bit 1 for BAM, 2 for reading and bit 3-4 for flag format
- @field bam BAM file handler; valid if (type&1) == 1
- @field tamr SAM file handler for reading; valid if type == 2
- @field tamw SAM file handler for writing; valid if type == 0
- @field header header struct
- */
-typedef struct {
- samFile *file;
- struct { BGZF *bam; } x; // Hack so that fp->x.bam still works
- sam_hdr_t *header;
- unsigned short is_write:1;
-} samfile_t;
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
- /*!
- @abstract Open a SAM/BAM file
-
- @param fn SAM/BAM file name; "-" is recognized as stdin (for
- reading) or stdout (for writing).
-
- @param mode open mode /[rw](b?)(u?)(h?)([xX]?)/: 'r' for reading,
- 'w' for writing, 'b' for BAM I/O, 'u' for uncompressed BAM output,
- 'h' for outputing header in SAM, 'x' for HEX flag and 'X' for
- string flag. If 'b' present, it must immediately follow 'r' or
- 'w'. Valid modes are "r", "w", "wh", "wx", "whx", "wX", "whX",
- "rb", "wb" and "wbu" exclusively.
-
- @param aux auxiliary data; if mode[0]=='w', aux points to
- bam_header_t; if strcmp(mode, "rb")!=0 and @SQ header lines in SAM
- are absent, aux points the file name of the list of the reference;
- aux is not used otherwise. If @SQ header lines are present in SAM,
- aux is not used, either.
-
- @return SAM/BAM file handler
- */
- samfile_t *samopen(const char *fn, const char *mode, const void *aux);
-
- /*!
- @abstract Close a SAM/BAM handler
- @param fp file handler to be closed
- */
- void samclose(samfile_t *fp);
-
- /*!
- @abstract Read one alignment
- @param fp file handler
- @param b alignment
- @return bytes read
- */
- static inline int samread(samfile_t *fp, bam1_t *b) { return sam_read1(fp->file, fp->header, b); }
-
- /*!
- @abstract Write one alignment
- @param fp file handler
- @param b alignment
- @return bytes written
- */
- static inline int samwrite(samfile_t *fp, const bam1_t *b) { return sam_write1(fp->file, fp->header, b); }
-
- /*!
- @abstract Load BAM/CRAM index for use with samfetch() with supporting the use of index file
- @param fp file handler
- @param fn name of the BAM or CRAM file (NOT the index file)
- @param fnidx name of the index file
- @return pointer to the index structure
- */
- static inline bam_index_t *samtools_sam_index_load(samfile_t *fp, const char *fn, const char *fnidx) {
- if (fnidx != NULL) {
- return sam_index_load2(fp->file, fn, fnidx);
- }
- return sam_index_load(fp->file, fn);
- }
- #undef sam_index_load
- #define sam_index_load(fp,fn,fnidx) (samtools_sam_index_load((fp), (fn), (fnidx)))
-
- /*!
- @abstract Retrieve the alignments overlapping the specified region.
- @discussion A user defined function will be called for each
- retrieved alignment ordered by its start position.
- @param fp file handler
- @param idx index returned by sam_index_load()
- @param tid chromosome ID as is defined in the header
- @param beg start coordinate, 0-based
- @param end end coordinate, 0-based
- @param data user provided data (will be transferred to func)
- @param func user defined function
- */
- int samfetch(samfile_t *fp, const bam_index_t *idx, int tid, int beg, int end, void *data, bam_fetch_f func);
-
- /*!
- @abstract Get the pileup for a whole alignment file
- @param fp file handler
- @param mask mask transferred to bam_plbuf_set_mask()
- @param func user defined function called in the pileup process
- #param data user provided data for func()
- */
- int sampileup(samfile_t *fp, int mask, bam_pileup_f func, void *data);
-
- char *samfaipath(const char *fn_ref);
- int samthreads(samfile_t *fp, int n_threads, int n_sub_blks);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
#include "samtools.h"
+static htsFile *samtools_stdout = NULL; // output handle registered by autoflush_if_stdout(); NULL when nothing is registered
+
+void autoflush_if_stdout(htsFile *fp, const char *fname) { // register FP so the error printer can hts_flush() it before writing a message
+ if (fname == NULL || strcmp(fname, "-") == 0) samtools_stdout = fp; // a NULL or "-" file name is treated as standard output
+}
+
+void release_autoflush(htsFile *fp) { // drop FP's registration; check_sam_close() calls this just before sam_close()
+ if (samtools_stdout == fp) samtools_stdout = NULL; // only clear the slot when FP is the handle currently registered
+}
+
static void vprint_error_core(const char *subcommand, const char *format, va_list args, const char *extra)
{
fflush(stdout);
+ if (samtools_stdout) hts_flush(samtools_stdout);
+
if (subcommand && *subcommand) fprintf(stderr, "samtools %s: ", subcommand);
else fprintf(stderr, "samtools: ");
vfprintf(stderr, format, args);
void check_sam_close(const char *subcmd, samFile *fp, const char *fname, const char *null_fname, int *retp)
{
+ release_autoflush(fp);
int r = sam_close(fp);
if (r >= 0) return;
#include "samtools.h"
+static htsFile *samtools_stdout_internal = NULL; // output handle registered by autoflush_if_stdout(); NULL when nothing is registered
+
+void autoflush_if_stdout(htsFile *fp, const char *fname) { // register FP so the error printer can hts_flush() it before writing a message
+ if (fname == NULL || strcmp(fname, "-") == 0) samtools_stdout_internal = fp; // a NULL or "-" file name is treated as standard output
+}
+
+void release_autoflush(htsFile *fp) { // drop FP's registration; check_sam_close() calls this just before sam_close()
+ if (samtools_stdout_internal == fp) samtools_stdout_internal = NULL; // only clear the slot when FP is the handle currently registered
+}
+
static void vprint_error_core(const char *subcommand, const char *format, va_list args, const char *extra)
{
fflush(samtools_stdout);
+ if (samtools_stdout_internal) hts_flush(samtools_stdout_internal);
+
if (subcommand && *subcommand) fprintf(samtools_stderr, "samtools %s: ", subcommand);
else fprintf(samtools_stderr, "samtools: ");
vfprintf(samtools_stderr, format, args);
void check_sam_close(const char *subcmd, samFile *fp, const char *fname, const char *null_fname, int *retp)
{
+ release_autoflush(fp);
int r = sam_close(fp);
if (r >= 0) return;
#include "htslib/hts_expr.h"
#include "samtools.h"
#include "sam_opts.h"
+#include "bam.h" // for bam_get_library and bam_remove_B
#include "bedidx.h"
KHASH_SET_INIT_STR(str)
-
typedef khash_t(str) *strhash_t;
+KHASH_SET_INIT_INT(aux_exists)
+typedef khash_t(aux_exists) *auxhash_t;
+
// This structure contains the settings for a samview run
typedef struct samview_settings {
strhash_t rghash;
strhash_t rnhash;
strhash_t tvhash;
int min_mapQ;
- int flag_on;
- int flag_off;
- int flag_alloff;
+
+ // Described here in the same terms as the usage statement.
+ // The code however always negates to "reject if" keep if:
+ int flag_on; // keep if (FLAG & N) == N (all on)
+ int flag_off; // keep if (FLAG & N) == 0 (all off)
+ int flag_anyon; // keep if (FLAG & N) != 0 (any on)
+ int flag_alloff; // reject if (FLAG & N) == N (any off)
+
int min_qlen;
int remove_B;
uint32_t subsam_seed;
hts_filter_t *filter;
int remove_flag;
int add_flag;
+ int unmap;
+ auxhash_t remove_tag;
+ auxhash_t keep_tag;
} samview_settings_t;
+// Copied from htslib/sam.c.
+// TODO: we need a proper interface to find the length of an aux tag,
+// or at the very least make exportable versions of these in htslib.
+static inline int aux_type2size(uint8_t type) // map an aux type code to the byte size of its value
+{
+ switch (type) {
+ case 'A': case 'c': case 'C':
+ return 1;
+ case 's': case 'S':
+ return 2;
+ case 'i': case 'I': case 'f':
+ return 4;
+ case 'd':
+ return 8;
+ case 'Z': case 'H': case 'B':
+ return type; // variable-length types: return the type code itself so the caller can dispatch on it
+ default:
+ return 0; // unrecognised type code
+ }
+}
-// TODO Add declarations of these to a viable htslib or samtools header
-extern const char *bam_get_library(sam_hdr_t *header, const bam1_t *b);
-extern int bam_remove_B(bam1_t *b);
+// Copied from htslib/sam.c.
+static inline uint8_t *skip_aux(uint8_t *s, uint8_t *end) // S points at a field's type byte; returns the position just past the field, or NULL when it cannot be parsed
+{
+ int size;
+ uint32_t n;
+ if (s >= end) return end; // nothing left to skip
+ size = aux_type2size(*s); ++s; // skip type
+ switch (size) {
+ case 'Z':
+ case 'H':
+ while (s < end && *s) ++s; // scan forward to the terminating NUL byte
+ return s < end ? s + 1 : end; // step over the NUL, or stop at END when none was found
+ case 'B':
+ if (end - s < 5) return NULL; // need one element-type byte plus a 4-byte count
+ size = aux_type2size(*s); ++s;
+ n = le_to_u32(s);
+ s += 4;
+ if (size == 0 || end - s < size * n) return NULL; // NOTE(review): size * n looks like 32-bit unsigned arithmetic, so a very large count could wrap -- confirm
+ return s + size * n;
+ case 0:
+ return NULL; // unknown type code
+ default:
+ if (end - s < size) return NULL; // fixed-size value does not fit in the remaining bytes
+ return s + size;
+ }
+}
// Returns 0 to indicate read should be output 1 otherwise
static int process_aln(const sam_hdr_t *h, bam1_t *b, samview_settings_t* settings)
{
+ if (settings->filter && sam_passes_filter(h, b, settings->filter) < 1)
+ return 1;
+
if (settings->remove_B) bam_remove_B(b);
if (settings->min_qlen > 0) {
int k, qlen = 0;
return 1;
if (settings->flag_alloff && ((b->core.flag & settings->flag_alloff) == settings->flag_alloff))
return 1;
+ if (settings->flag_anyon && ((b->core.flag & settings->flag_anyon) == 0))
+ return 1;
if (!settings->multi_region && settings->bed && (b->core.tid < 0 || !bed_overlap(settings->bed, sam_hdr_tid2name(h, b->core.tid), b->core.pos, bam_endpos(b))))
return 1;
if (settings->subsam_frac > 0.) {
const char *p = bam_get_library((sam_hdr_t*)h, b);
if (!p || strcmp(p, settings->library) != 0) return 1;
}
- if (settings->remove_aux_len) {
- size_t i;
- for (i = 0; i < settings->remove_aux_len; ++i) {
- uint8_t *s = bam_aux_get(b, settings->remove_aux[i]);
- if (s) {
- bam_aux_del(b, s);
+ if (settings->keep_tag) {
+ uint8_t *s_from, *s_to, *end = b->data + b->l_data;
+ auxhash_t h = settings->keep_tag;
+
+ s_from = s_to = bam_get_aux(b);
+ while (s_from < end) {
+ int x = (int)s_from[0]<<8 | s_from[1];
+ uint8_t *s = skip_aux(s_from+2, end);
+ if (s == NULL) {
+ print_error("view", "malformed aux data for record \"%s\"",
+ bam_get_qname(b));
+ break;
}
+
+ if (kh_get(aux_exists, h, x) != kh_end(h) ) {
+ if (s_to != s_from) memmove(s_to, s_from, s - s_from);
+ s_to += s - s_from;
+ }
+ s_from = s;
}
- }
+ b->l_data = s_to - b->data;
+
+ } else if (settings->remove_tag) {
+ uint8_t *s_from, *s_to, *end = b->data + b->l_data;
+ auxhash_t h = settings->remove_tag;
+
+ s_from = s_to = bam_get_aux(b);
+ while (s_from < end) {
+ int x = (int)s_from[0]<<8 | s_from[1];
+ uint8_t *s = skip_aux(s_from+2, end);
+ if (s == NULL) {
+ print_error("view", "malformed aux data for record \"%s\"",
+ bam_get_qname(b));
+ break;
+ }
- if (settings->filter && sam_passes_filter(h, b, settings->filter) < 1)
- return 1;
+ if (kh_get(aux_exists, h, x) == kh_end(h) ) {
+ if (s_to != s_from) memmove(s_to, s_from, s - s_from);
+ s_to += s - s_from;
+ }
+ s_from = s;
+ }
+ b->l_data = s_to - b->data;
+ }
return 0;
}
b->core.flag &= ~settings->remove_flag;
}
+int parse_aux_list(auxhash_t *h, char *optarg) { // add every two-character tag in the comma-separated OPTARG to the set *H; returns 0 on success, -1 on failure
+ if (!*h)
+ *h = kh_init(aux_exists); // created on first use so repeated options add to the same set; NOTE(review): the result is not checked for NULL
+
+ while (strlen(optarg) >= 2) {
+ int x = optarg[0]<<8 | optarg[1]; // pack the two tag characters into a single integer key
+ int ret = 0;
+ kh_put(aux_exists, *h, x, &ret);
+ if (ret < 0)
+ return -1;
+
+ optarg += 2;
+ if (*optarg == ',') // allow white-space too for easy `cat file`?
+ optarg++;
+ else if (*optarg != 0)
+ break; // something other than ',' follows the tag; reported by the check below
+ }
+
+ if (strlen(optarg) != 0) { // leftover text means some tag was not exactly two characters long
+ fprintf(stderr, "main_samview: Error parsing option, "
+ "auxiliary tags should be exactly two characters long.\n");
+ return -1;
+ }
+
+ return 0;
+}
+
// Make mnemonic distinct values for longoption-only options
#define LONGOPT(c) ((c) + 128)
.flag_on = 0,
.flag_off = 0,
.flag_alloff = 0,
+ .flag_anyon = 0,
.min_qlen = 0,
.remove_B = 0,
.subsam_seed = 0,
.tag = NULL,
.filter = NULL,
.remove_flag = 0,
- .add_flag = 0
+ .add_flag = 0,
+ .keep_tag = NULL,
+ .remove_tag = NULL,
+ .unmap = 0,
};
static const struct option lopts[] = {
{"fast", no_argument, NULL, '1'},
{"header-only", no_argument, NULL, 'H'},
{"help", no_argument, NULL, LONGOPT('?')},
+ {"incl-flags", required_argument, NULL, LONGOPT('g')},
+ {"include-flags", required_argument, NULL, LONGOPT('g')},
+ {"rf", required_argument, NULL, LONGOPT('g')}, // aka incl-flags
+ {"keep-tag", required_argument, NULL, LONGOPT('x') },
{"library", required_argument, NULL, 'l'},
{"min-mapq", required_argument, NULL, 'q'},
{"min-MQ", required_argument, NULL, 'q'},
{"target-file", required_argument, NULL, 'L'},
{"targets-file", required_argument, NULL, 'L'},
{"uncompressed", no_argument, NULL, 'u'},
+ {"unmap", no_argument, NULL, 'p'},
{"unoutput", required_argument, NULL, 'U'},
{"use-index", no_argument, NULL, 'M'},
{"with-header", no_argument, NULL, 'h'},
- { NULL, 0, NULL, 0 }
};
/* parse command-line options */
opterr = 0;
while ((c = getopt_long(argc, argv,
- "SbBcCt:h1Ho:O:q:f:F:G:ul:r:T:R:N:d:D:L:s:@:m:x:U:MXe:",
+ "SbBcCt:h1Ho:O:q:f:F:G:ul:r:T:R:N:d:D:L:s:@:m:x:U:MXe:p",
lopts, NULL)) >= 0) {
switch (c) {
case 's':
case 'X': has_index_file = 1; break;
case 'f': settings.flag_on |= bam_str2flag(optarg); break;
case 'F': settings.flag_off |= bam_str2flag(optarg); break;
+ case LONGOPT('g'):
+ settings.flag_anyon |= bam_str2flag(optarg); break;
case 'G': settings.flag_alloff |= bam_str2flag(optarg); break;
case 'q': settings.min_mapQ = atoi(optarg); break;
case 'u': compress_level = 0; break;
case '1': compress_level = 1; break;
case 'l': settings.library = strdup(optarg); break;
+ case 'p': settings.unmap = 1; break;
case LONGOPT('L'):
settings.multi_region = 1;
// fall through
return usage(stderr, EXIT_FAILURE, 0);
}
case 'B': settings.remove_B = 1; break;
- case 'x':
- {
- if (strlen(optarg) != 2) {
- print_error("main_samview", "Error parsing -x auxiliary tags should be exactly two characters long.");
- return usage(stderr, EXIT_FAILURE, 0);
- }
- settings.remove_aux = (char**)realloc(settings.remove_aux, sizeof(char*) * (++settings.remove_aux_len));
- settings.remove_aux[settings.remove_aux_len-1] = optarg;
- }
- break;
+
case 'M': settings.multi_region = 1; break;
case LONGOPT('P'): no_pg = 1; break;
case 'e':
break;
case LONGOPT('r'): settings.remove_flag |= bam_str2flag(optarg); break;
case LONGOPT('a'): settings.add_flag |= bam_str2flag(optarg); break;
+
+ case 'x':
+ if (*optarg == '^') {
+ if (parse_aux_list(&settings.keep_tag, optarg+1))
+ return usage(stderr, EXIT_FAILURE, 0);
+ } else {
+ if (parse_aux_list(&settings.remove_tag, optarg))
+ return usage(stderr, EXIT_FAILURE, 0);
+ }
+ break;
+
+ case LONGOPT('x'):
+ if (parse_aux_list(&settings.keep_tag, optarg))
+ return usage(stderr, EXIT_FAILURE, 0);
+ break;
+
default:
if (parse_sam_global_opt(c, optarg, lopts, &ga) != 0)
return usage(stderr, EXIT_FAILURE, 0);
print_error("view", "No input provided or missing option argument.");
return usage(stderr, EXIT_FAILURE, 0); // potential memory leak...
}
+
+ if (settings.unmap && fn_un_out) {
+ print_error("view", "Options --unoutput and --unmap are mutually exclusive.");
+ ret = 1;
+ goto view_end;
+ }
+
if (settings.subsam_seed != 0) {
// Convert likely user input 1,2,... to pseudo-random
// values with more entropy and more bits set
goto view_end;
}
}
+ autoflush_if_stdout(out, fn_out);
if (!no_pg) {
if (!(arg_list = stringify_argv(argc+1, argv-1))) {
goto view_end;
}
}
+ autoflush_if_stdout(un_out, fn_un_out);
if (*out_format || is_header ||
out_un_mode[1] == 'b' || out_un_mode[1] == 'c' ||
(ga.out.format != sam && ga.out.format != unknown_format)) {
if (check_sam_write1(out, header, b, fn_out, &ret) < 0) break;
}
count++;
+ } else if (settings.unmap) {
+ b->core.flag |= BAM_FUNMAP;
+ if (check_sam_write1(out, header, b, fn_out, &ret) < 0) break;
} else {
if (un_out) { if (check_sam_write1(un_out, header, b, fn_un_out, &ret) < 0) break; }
}
}
if (result < -1) {
- fprintf(stderr, "[main_samview] retrieval of region %d failed due to truncated file or corrupt BAM index file\n", iter->curr_tid);
+ print_error("view", "retrieval of region %d failed due to truncated file or corrupt BAM index file", iter->curr_tid);
ret = 1;
}
if (check_sam_write1(out, header, b, fn_out, &ret) < 0) break;
}
count++;
+ } else if (settings.unmap) {
+ b->core.flag |= BAM_FUNMAP;
+ if (check_sam_write1(out, header, b, fn_out, &ret) < 0) break;
} else {
if (un_out) { if (check_sam_write1(un_out, header, b, fn_un_out, &ret) < 0) break; }
}
if (check_sam_write1(out, header, b, fn_out, &ret) < 0) break;
}
count++;
+ } else if (settings.unmap) {
+ b->core.flag |= BAM_FUNMAP;
+ if (check_sam_write1(out, header, b, fn_out, &ret) < 0) break;
} else {
if (un_out) { if (check_sam_write1(un_out, header, b, fn_un_out, &ret) < 0) break; }
}
}
hts_itr_destroy(iter);
if (result < -1) {
- fprintf(stderr, "[main_samview] retrieval of region \"%s\" failed due to truncated file or corrupt BAM index file\n", argv[i]);
+ print_error("view", "retrieval of region \"%s\" failed due to truncated file or corrupt BAM index file", argv[i]);
ret = 1;
break;
}
if (kh_exist(settings.tvhash, k)) free((char*)kh_key(settings.tvhash, k));
kh_destroy(str, settings.tvhash);
}
+
if (settings.remove_aux_len) {
free(settings.remove_aux);
}
free(fn_un_out_idx);
free(arg_list);
+ if (settings.keep_tag)
+ kh_destroy(aux_exists, settings.keep_tag);
+ if (settings.remove_tag)
+ kh_destroy(aux_exists, settings.remove_tag);
+
return ret;
}
"\n"
"Usage: samtools view [options] <in.bam>|<in.sam>|<in.cram> [region ...]\n"
"\n"
+
"Output options:\n"
" -b, --bam Output BAM\n"
" -C, --cram Output CRAM (requires -T)\n"
" -o, --output FILE Write output to FILE [standard output]\n"
" -U, --unoutput FILE, --output-unselected FILE\n"
" Output reads not selected by filters to FILE\n"
+" -p, --unmap Set flag to UNMAP on reads not selected\n"
+" then write to output file.\n"
"Input options:\n"
" -t, --fai-reference FILE FILE listing reference names and lengths\n"
" -M, --use-index Use index and multi-region iterator for regions\n"
"Processing options:\n"
" --add-flags FLAG Add FLAGs to reads\n"
" --remove-flags FLAG Remove FLAGs from reads\n"
-" -x, --remove-tag STR Strip tag STR from reads (option may be repeated)\n"
+" -x, --remove-tag STR\n"
+" Comma-separated read tags to strip (repeatable) [null]\n"
+" --keep-tag STR\n"
+" Comma-separated read tags to preserve (repeatable) [null].\n"
+" Equivalent to \"-x ^STR\"\n"
" -B, --remove-B Collapse the backward CIGAR operation\n"
"\n"
"General options:\n"
#include "htslib/hts_expr.h"
#include "samtools.h"
#include "sam_opts.h"
+#include "bam.h" // for bam_get_library and bam_remove_B
#include "bedidx.h"
KHASH_SET_INIT_STR(str)
-
typedef khash_t(str) *strhash_t;
+KHASH_SET_INIT_INT(aux_exists)
+typedef khash_t(aux_exists) *auxhash_t;
+
// This structure contains the settings for a samview run
typedef struct samview_settings {
strhash_t rghash;
strhash_t rnhash;
strhash_t tvhash;
int min_mapQ;
- int flag_on;
- int flag_off;
- int flag_alloff;
+
+ // Described here in the same terms as the usage statement.
+ // The code however always negates to "reject if" keep if:
+ int flag_on; // keep if (FLAG & N) == N (all on)
+ int flag_off; // keep if (FLAG & N) == 0 (all off)
+ int flag_anyon; // keep if (FLAG & N) != 0 (any on)
+ int flag_alloff; // reject if (FLAG & N) == N (any off)
+
int min_qlen;
int remove_B;
uint32_t subsam_seed;
hts_filter_t *filter;
int remove_flag;
int add_flag;
+ int unmap;
+ auxhash_t remove_tag;
+ auxhash_t keep_tag;
} samview_settings_t;
+// Copied from htslib/sam.c.
+// TODO: we need a proper interface to find the length of an aux tag,
+// or at the very least make exportable versions of these in htslib.
+static inline int aux_type2size(uint8_t type) // map an aux type code to the byte size of its value
+{
+ switch (type) {
+ case 'A': case 'c': case 'C':
+ return 1;
+ case 's': case 'S':
+ return 2;
+ case 'i': case 'I': case 'f':
+ return 4;
+ case 'd':
+ return 8;
+ case 'Z': case 'H': case 'B':
+ return type; // variable-length types: return the type code itself so the caller can dispatch on it
+ default:
+ return 0; // unrecognised type code
+ }
+}
-// TODO Add declarations of these to a viable htslib or samtools header
-extern const char *bam_get_library(sam_hdr_t *header, const bam1_t *b);
-extern int bam_remove_B(bam1_t *b);
+// Copied from htslib/sam.c.
+static inline uint8_t *skip_aux(uint8_t *s, uint8_t *end) // S points at a field's type byte; returns the position just past the field, or NULL when it cannot be parsed
+{
+ int size;
+ uint32_t n;
+ if (s >= end) return end; // nothing left to skip
+ size = aux_type2size(*s); ++s; // skip type
+ switch (size) {
+ case 'Z':
+ case 'H':
+ while (s < end && *s) ++s; // scan forward to the terminating NUL byte
+ return s < end ? s + 1 : end; // step over the NUL, or stop at END when none was found
+ case 'B':
+ if (end - s < 5) return NULL; // need one element-type byte plus a 4-byte count
+ size = aux_type2size(*s); ++s;
+ n = le_to_u32(s);
+ s += 4;
+ if (size == 0 || end - s < size * n) return NULL; // NOTE(review): size * n looks like 32-bit unsigned arithmetic, so a very large count could wrap -- confirm
+ return s + size * n;
+ case 0:
+ return NULL; // unknown type code
+ default:
+ if (end - s < size) return NULL; // fixed-size value does not fit in the remaining bytes
+ return s + size;
+ }
+}
// Returns 0 to indicate read should be output 1 otherwise
static int process_aln(const sam_hdr_t *h, bam1_t *b, samview_settings_t* settings)
{
+ if (settings->filter && sam_passes_filter(h, b, settings->filter) < 1)
+ return 1;
+
if (settings->remove_B) bam_remove_B(b);
if (settings->min_qlen > 0) {
int k, qlen = 0;
return 1;
if (settings->flag_alloff && ((b->core.flag & settings->flag_alloff) == settings->flag_alloff))
return 1;
+ if (settings->flag_anyon && ((b->core.flag & settings->flag_anyon) == 0))
+ return 1;
if (!settings->multi_region && settings->bed && (b->core.tid < 0 || !bed_overlap(settings->bed, sam_hdr_tid2name(h, b->core.tid), b->core.pos, bam_endpos(b))))
return 1;
if (settings->subsam_frac > 0.) {
const char *p = bam_get_library((sam_hdr_t*)h, b);
if (!p || strcmp(p, settings->library) != 0) return 1;
}
- if (settings->remove_aux_len) {
- size_t i;
- for (i = 0; i < settings->remove_aux_len; ++i) {
- uint8_t *s = bam_aux_get(b, settings->remove_aux[i]);
- if (s) {
- bam_aux_del(b, s);
+ if (settings->keep_tag) {
+ uint8_t *s_from, *s_to, *end = b->data + b->l_data;
+ auxhash_t h = settings->keep_tag;
+
+ s_from = s_to = bam_get_aux(b);
+ while (s_from < end) {
+ int x = (int)s_from[0]<<8 | s_from[1];
+ uint8_t *s = skip_aux(s_from+2, end);
+ if (s == NULL) {
+ print_error("view", "malformed aux data for record \"%s\"",
+ bam_get_qname(b));
+ break;
}
+
+ if (kh_get(aux_exists, h, x) != kh_end(h) ) {
+ if (s_to != s_from) memmove(s_to, s_from, s - s_from);
+ s_to += s - s_from;
+ }
+ s_from = s;
}
- }
+ b->l_data = s_to - b->data;
+
+ } else if (settings->remove_tag) {
+ uint8_t *s_from, *s_to, *end = b->data + b->l_data;
+ auxhash_t h = settings->remove_tag;
+
+ s_from = s_to = bam_get_aux(b);
+ while (s_from < end) {
+ int x = (int)s_from[0]<<8 | s_from[1];
+ uint8_t *s = skip_aux(s_from+2, end);
+ if (s == NULL) {
+ print_error("view", "malformed aux data for record \"%s\"",
+ bam_get_qname(b));
+ break;
+ }
- if (settings->filter && sam_passes_filter(h, b, settings->filter) < 1)
- return 1;
+ if (kh_get(aux_exists, h, x) == kh_end(h) ) {
+ if (s_to != s_from) memmove(s_to, s_from, s - s_from);
+ s_to += s - s_from;
+ }
+ s_from = s;
+ }
+ b->l_data = s_to - b->data;
+ }
return 0;
}
b->core.flag &= ~settings->remove_flag;
}
+int parse_aux_list(auxhash_t *h, char *optarg) { // add every two-character tag in the comma-separated OPTARG to the set *H; returns 0 on success, -1 on failure
+ if (!*h)
+ *h = kh_init(aux_exists); // created on first use so repeated options add to the same set; NOTE(review): the result is not checked for NULL
+
+ while (strlen(optarg) >= 2) {
+ int x = optarg[0]<<8 | optarg[1]; // pack the two tag characters into a single integer key
+ int ret = 0;
+ kh_put(aux_exists, *h, x, &ret);
+ if (ret < 0)
+ return -1;
+
+ optarg += 2;
+ if (*optarg == ',') // allow white-space too for easy `cat file`?
+ optarg++;
+ else if (*optarg != 0)
+ break; // something other than ',' follows the tag; reported by the check below
+ }
+
+ if (strlen(optarg) != 0) { // leftover text means some tag was not exactly two characters long
+ fprintf(samtools_stderr, "main_samview: Error parsing option, "
+ "auxiliary tags should be exactly two characters long.\n");
+ return -1;
+ }
+
+ return 0;
+}
+
// Make mnemonic distinct values for longoption-only options
#define LONGOPT(c) ((c) + 128)
.flag_on = 0,
.flag_off = 0,
.flag_alloff = 0,
+ .flag_anyon = 0,
.min_qlen = 0,
.remove_B = 0,
.subsam_seed = 0,
.tag = NULL,
.filter = NULL,
.remove_flag = 0,
- .add_flag = 0
+ .add_flag = 0,
+ .keep_tag = NULL,
+ .remove_tag = NULL,
+ .unmap = 0,
};
static const struct option lopts[] = {
{"fast", no_argument, NULL, '1'},
{"header-only", no_argument, NULL, 'H'},
{"help", no_argument, NULL, LONGOPT('?')},
+ {"incl-flags", required_argument, NULL, LONGOPT('g')},
+ {"include-flags", required_argument, NULL, LONGOPT('g')},
+ {"rf", required_argument, NULL, LONGOPT('g')}, // aka incl-flags
+ {"keep-tag", required_argument, NULL, LONGOPT('x') },
{"library", required_argument, NULL, 'l'},
{"min-mapq", required_argument, NULL, 'q'},
{"min-MQ", required_argument, NULL, 'q'},
{"target-file", required_argument, NULL, 'L'},
{"targets-file", required_argument, NULL, 'L'},
{"uncompressed", no_argument, NULL, 'u'},
+ {"unmap", no_argument, NULL, 'p'},
{"unoutput", required_argument, NULL, 'U'},
{"use-index", no_argument, NULL, 'M'},
{"with-header", no_argument, NULL, 'h'},
- { NULL, 0, NULL, 0 }
};
/* parse command-line options */
opterr = 0;
while ((c = getopt_long(argc, argv,
- "SbBcCt:h1Ho:O:q:f:F:G:ul:r:T:R:N:d:D:L:s:@:m:x:U:MXe:",
+ "SbBcCt:h1Ho:O:q:f:F:G:ul:r:T:R:N:d:D:L:s:@:m:x:U:MXe:p",
lopts, NULL)) >= 0) {
switch (c) {
case 's':
case 'X': has_index_file = 1; break;
case 'f': settings.flag_on |= bam_str2flag(optarg); break;
case 'F': settings.flag_off |= bam_str2flag(optarg); break;
+ case LONGOPT('g'):
+ settings.flag_anyon |= bam_str2flag(optarg); break;
case 'G': settings.flag_alloff |= bam_str2flag(optarg); break;
case 'q': settings.min_mapQ = atoi(optarg); break;
case 'u': compress_level = 0; break;
case '1': compress_level = 1; break;
case 'l': settings.library = strdup(optarg); break;
+ case 'p': settings.unmap = 1; break;
case LONGOPT('L'):
settings.multi_region = 1;
// fall through
return usage(samtools_stderr, EXIT_FAILURE, 0);
}
case 'B': settings.remove_B = 1; break;
- case 'x':
- {
- if (strlen(optarg) != 2) {
- print_error("main_samview", "Error parsing -x auxiliary tags should be exactly two characters long.");
- return usage(samtools_stderr, EXIT_FAILURE, 0);
- }
- settings.remove_aux = (char**)realloc(settings.remove_aux, sizeof(char*) * (++settings.remove_aux_len));
- settings.remove_aux[settings.remove_aux_len-1] = optarg;
- }
- break;
+
case 'M': settings.multi_region = 1; break;
case LONGOPT('P'): no_pg = 1; break;
case 'e':
break;
case LONGOPT('r'): settings.remove_flag |= bam_str2flag(optarg); break;
case LONGOPT('a'): settings.add_flag |= bam_str2flag(optarg); break;
+
+ case 'x':
+ if (*optarg == '^') {
+ if (parse_aux_list(&settings.keep_tag, optarg+1))
+ return usage(samtools_stderr, EXIT_FAILURE, 0);
+ } else {
+ if (parse_aux_list(&settings.remove_tag, optarg))
+ return usage(samtools_stderr, EXIT_FAILURE, 0);
+ }
+ break;
+
+ case LONGOPT('x'):
+ if (parse_aux_list(&settings.keep_tag, optarg))
+ return usage(samtools_stderr, EXIT_FAILURE, 0);
+ break;
+
default:
if (parse_sam_global_opt(c, optarg, lopts, &ga) != 0)
return usage(samtools_stderr, EXIT_FAILURE, 0);
print_error("view", "No input provided or missing option argument.");
return usage(samtools_stderr, EXIT_FAILURE, 0); // potential memory leak...
}
+
+ if (settings.unmap && fn_un_out) {
+ print_error("view", "Options --unoutput and --unmap are mutually exclusive.");
+ ret = 1;
+ goto view_end;
+ }
+
if (settings.subsam_seed != 0) {
// Convert likely user input 1,2,... to pseudo-random
// values with more entropy and more bits set
goto view_end;
}
}
+ autoflush_if_stdout(out, fn_out);
if (!no_pg) {
if (!(arg_list = stringify_argv(argc+1, argv-1))) {
goto view_end;
}
}
+ autoflush_if_stdout(un_out, fn_un_out);
if (*out_format || is_header ||
out_un_mode[1] == 'b' || out_un_mode[1] == 'c' ||
(ga.out.format != sam && ga.out.format != unknown_format)) {
if (check_sam_write1(out, header, b, fn_out, &ret) < 0) break;
}
count++;
+ } else if (settings.unmap) {
+ b->core.flag |= BAM_FUNMAP;
+ if (check_sam_write1(out, header, b, fn_out, &ret) < 0) break;
} else {
if (un_out) { if (check_sam_write1(un_out, header, b, fn_un_out, &ret) < 0) break; }
}
}
if (result < -1) {
- fprintf(samtools_stderr, "[main_samview] retrieval of region %d failed due to truncated file or corrupt BAM index file\n", iter->curr_tid);
+ print_error("view", "retrieval of region %d failed due to truncated file or corrupt BAM index file", iter->curr_tid);
ret = 1;
}
if (check_sam_write1(out, header, b, fn_out, &ret) < 0) break;
}
count++;
+ } else if (settings.unmap) {
+ b->core.flag |= BAM_FUNMAP;
+ if (check_sam_write1(out, header, b, fn_out, &ret) < 0) break;
} else {
if (un_out) { if (check_sam_write1(un_out, header, b, fn_un_out, &ret) < 0) break; }
}
if (check_sam_write1(out, header, b, fn_out, &ret) < 0) break;
}
count++;
+ } else if (settings.unmap) {
+ b->core.flag |= BAM_FUNMAP;
+ if (check_sam_write1(out, header, b, fn_out, &ret) < 0) break;
} else {
if (un_out) { if (check_sam_write1(un_out, header, b, fn_un_out, &ret) < 0) break; }
}
}
hts_itr_destroy(iter);
if (result < -1) {
- fprintf(samtools_stderr, "[main_samview] retrieval of region \"%s\" failed due to truncated file or corrupt BAM index file\n", argv[i]);
+ print_error("view", "retrieval of region \"%s\" failed due to truncated file or corrupt BAM index file", argv[i]);
ret = 1;
break;
}
if (kh_exist(settings.tvhash, k)) free((char*)kh_key(settings.tvhash, k));
kh_destroy(str, settings.tvhash);
}
+
if (settings.remove_aux_len) {
free(settings.remove_aux);
}
free(fn_un_out_idx);
free(arg_list);
+ if (settings.keep_tag)
+ kh_destroy(aux_exists, settings.keep_tag);
+ if (settings.remove_tag)
+ kh_destroy(aux_exists, settings.remove_tag);
+
return ret;
}
"\n"
"Usage: samtools view [options] <in.bam>|<in.sam>|<in.cram> [region ...]\n"
"\n"
+
"Output options:\n"
" -b, --bam Output BAM\n"
" -C, --cram Output CRAM (requires -T)\n"
" -o, --output FILE Write output to FILE [standard output]\n"
" -U, --unoutput FILE, --output-unselected FILE\n"
" Output reads not selected by filters to FILE\n"
+" -p, --unmap Set flag to UNMAP on reads not selected\n"
+" then write to output file.\n"
"Input options:\n"
" -t, --fai-reference FILE FILE listing reference names and lengths\n"
" -M, --use-index Use index and multi-region iterator for regions\n"
"Processing options:\n"
" --add-flags FLAG Add FLAGs to reads\n"
" --remove-flags FLAG Remove FLAGs from reads\n"
-" -x, --remove-tag STR Strip tag STR from reads (option may be repeated)\n"
+" -x, --remove-tag STR\n"
+" Comma-separated read tags to strip (repeatable) [null]\n"
+" --keep-tag STR\n"
+" Comma-separated read tags to preserve (repeatable) [null].\n"
+" Equivalent to \"-x ^STR\"\n"
" -B, --remove-B Collapse the backward CIGAR operation\n"
"\n"
"General options:\n"
void check_sam_close(const char *subcmd, samFile *fp, const char *fname, const char *null_fname, int *retp);
+/* Utility functions to register an output htsFile/samFile/vcfFile that
+ * might be stdout. If FNAME is "-" or NULL, records FP so that print_error()
+ * et al can automatically flush it before printing an error message.
+ */
+void autoflush_if_stdout(htsFile *fp, const char *fname);
+
+/* Call this before closing FP; check_sam_close() does this automatically.
+ */
+void release_autoflush(htsFile *fp);
+
/*
* Utility function to add an index to a file we've opened for write.
* NB: Call this after writing the header and before writing sequences.
#include <htslib/sam.h>
#include <htslib/hts.h>
#include <htslib/hts_defs.h>
-#include <htslib/khash_str2int.h>
#include "samtools.h"
#include <htslib/khash.h>
#include <htslib/kstring.h>
} pair_t;
KHASH_MAP_INIT_STR(qn2pair, pair_t*)
+KHASH_SET_INIT_STR(rg)
+
static void HTS_NORETURN error(const char *format, ...);
int is_in_regions(bam1_t *bam_line, stats_t *stats);
{
const uint8_t *rg = bam_aux_get(bam_line, "RG");
if ( !rg ) return; // certain read groups were requested but this record has none
- if ( !khash_str2int_has_key(stats->rg_hash, (const char*)(rg + 1)) ) return;
+ khint_t k = kh_get(rg, stats->rg_hash, (const char*)(rg + 1));
+ if ( k == kh_end((kh_rg_t *)stats->rg_hash) ) return;
}
if ( stats->info->flag_require && (bam_line->core.flag & stats->info->flag_require)!=stats->info->flag_require )
{
if ( stats->gcd[igcd].depth )
stats->gcd[igcd].gc = rint(100. * stats->gcd[igcd].gc / stats->gcd[igcd].depth);
}
- qsort(stats->gcd, stats->igcd+1, sizeof(gc_depth_t), gcd_cmp);
+ if ( stats->ngcd )
+ qsort(stats->gcd, stats->igcd+1, sizeof(gc_depth_t), gcd_cmp);
igcd = 0;
while ( igcd < stats->igcd )
{
return 0;
}
-void init_group_id(stats_t *stats, const char *id)
+static void init_group_id(stats_t *stats, stats_info_t *info, const char *id) // fill stats->rg_hash with the names of the @RG header lines whose name or SM tag equals ID
{
-#if 0
- if ( !stats->sam_header->dict )
- stats->sam_header->dict = sam_header_parse2(stats->sam_header->text);
- void *iter = stats->sam_header->dict;
- const char *key, *val;
- int n = 0;
- stats->rg_hash = khash_str2int_init();
- while ( (iter = sam_header2key_val(iter, "RG","ID","SM", &key, &val)) )
- {
- if ( !strcmp(id,key) || (val && !strcmp(id,val)) )
- {
- khiter_t k = kh_get(kh_rg, stats->rg_hash, key);
- if ( k != kh_end(stats->rg_hash) )
- fprintf(stderr, "[init_group_id] The group ID not unique: \"%s\"\n", key);
- int ret;
- k = kh_put(kh_rg, stats->rg_hash, key, &ret);
- kh_value(stats->rg_hash, k) = val;
- n++;
+ stats->rg_hash = kh_init(rg);
+ if (!stats->rg_hash) error("Could not initialise RG set\n");
+ sam_hdr_t *hdr = info->sam_header;
+ const char *key;
+ kstring_t sm = KS_INITIALIZE; // scratch buffer for each line's SM value, released with ks_free() on every exit path
+ int i, ret, nrg = sam_hdr_count_lines(hdr, "RG");
+ if (nrg < 0) error("Could not parse header\n");
+
+ for (i=0; i<nrg; i++) {
+ key = sam_hdr_line_name(hdr, "RG", i); // NOTE(review): presumably the ID of the i-th @RG line; it is passed to strcmp() without a NULL check -- confirm
+ if (!strcmp(key, id)) {
+ kh_put(rg, stats->rg_hash, key, &ret);
+ if (ret == -1) { ks_free(&sm); error("Could not add key \"%s\" to RG set\n", key); } // free sm first: error() is declared HTS_NORETURN
+ } else { /* Check for SM name, as per manual */
+ if (!sam_hdr_find_tag_pos(hdr, "RG", i, "SM", &sm)) { // a zero return is treated as "SM value stored in sm"
+ if (!strcmp(ks_c_str(&sm), id)) {
+ kh_put(rg, stats->rg_hash, key, &ret); // the set stores the line's name, not the sample name
+ if (ret == -1) { ks_free(&sm); error("Could not add key \"%s\" to RG set\n", key); }
+ }
+ }
}
}
- if ( !n )
- error("The sample or read group \"%s\" not present.\n", id);
-#else
- fprintf(stderr, "Samtools-htslib: init_group_id() header parsing not yet implemented\n");
- abort();
-#endif
+
+ ks_free(&sm);
}
if (stats->quals_barcode) free(stats->quals_barcode);
free(stats->tags_barcode);
destroy_regions(stats);
- if ( stats->rg_hash ) khash_str2int_destroy(stats->rg_hash);
+ if ( stats->rg_hash ) kh_destroy(rg, stats->rg_hash);
free(stats->split_name);
free(stats);
}
stats->cov_rbuf.size = stats->nbases*5;
stats->cov_rbuf.buffer = calloc(sizeof(int32_t),stats->cov_rbuf.size);
if (!stats->cov_rbuf.buffer) goto nomem;
- if ( group_id ) init_group_id(stats, group_id);
+ if ( group_id ) init_group_id(stats, info, group_id);
// .. arrays
stats->quals_1st = calloc(stats->nquals*stats->nbases,sizeof(uint64_t));
if (!stats->quals_1st) goto nomem;
#include <htslib/sam.h>
#include <htslib/hts.h>
#include <htslib/hts_defs.h>
-#include <htslib/khash_str2int.h>
#include "samtools.h"
#include <htslib/khash.h>
#include <htslib/kstring.h>
} pair_t;
KHASH_MAP_INIT_STR(qn2pair, pair_t*)
+KHASH_SET_INIT_STR(rg)
+
static void HTS_NORETURN error(const char *format, ...);
int is_in_regions(bam1_t *bam_line, stats_t *stats);
{
const uint8_t *rg = bam_aux_get(bam_line, "RG");
if ( !rg ) return; // certain read groups were requested but this record has none
- if ( !khash_str2int_has_key(stats->rg_hash, (const char*)(rg + 1)) ) return;
+ khint_t k = kh_get(rg, stats->rg_hash, (const char*)(rg + 1));
+ if ( k == kh_end((kh_rg_t *)stats->rg_hash) ) return;
}
if ( stats->info->flag_require && (bam_line->core.flag & stats->info->flag_require)!=stats->info->flag_require )
{
if ( stats->gcd[igcd].depth )
stats->gcd[igcd].gc = rint(100. * stats->gcd[igcd].gc / stats->gcd[igcd].depth);
}
- qsort(stats->gcd, stats->igcd+1, sizeof(gc_depth_t), gcd_cmp);
+ if ( stats->ngcd )
+ qsort(stats->gcd, stats->igcd+1, sizeof(gc_depth_t), gcd_cmp);
igcd = 0;
while ( igcd < stats->igcd )
{
return 0;
}
-void init_group_id(stats_t *stats, const char *id)
+static void init_group_id(stats_t *stats, stats_info_t *info, const char *id) // fill stats->rg_hash with the names of the @RG header lines whose name or SM tag equals ID
{
-#if 0
- if ( !stats->sam_header->dict )
- stats->sam_header->dict = sam_header_parse2(stats->sam_header->text);
- void *iter = stats->sam_header->dict;
- const char *key, *val;
- int n = 0;
- stats->rg_hash = khash_str2int_init();
- while ( (iter = sam_header2key_val(iter, "RG","ID","SM", &key, &val)) )
- {
- if ( !strcmp(id,key) || (val && !strcmp(id,val)) )
- {
- khiter_t k = kh_get(kh_rg, stats->rg_hash, key);
- if ( k != kh_end(stats->rg_hash) )
- fprintf(samtools_stderr, "[init_group_id] The group ID not unique: \"%s\"\n", key);
- int ret;
- k = kh_put(kh_rg, stats->rg_hash, key, &ret);
- kh_value(stats->rg_hash, k) = val;
- n++;
+ stats->rg_hash = kh_init(rg);
+ if (!stats->rg_hash) error("Could not initialise RG set\n");
+ sam_hdr_t *hdr = info->sam_header;
+ const char *key;
+ kstring_t sm = KS_INITIALIZE; // scratch buffer for each line's SM value, released with ks_free() on every exit path
+ int i, ret, nrg = sam_hdr_count_lines(hdr, "RG");
+ if (nrg < 0) error("Could not parse header\n");
+
+ for (i=0; i<nrg; i++) {
+ key = sam_hdr_line_name(hdr, "RG", i); // NOTE(review): presumably the ID of the i-th @RG line; it is passed to strcmp() without a NULL check -- confirm
+ if (!strcmp(key, id)) {
+ kh_put(rg, stats->rg_hash, key, &ret);
+ if (ret == -1) { ks_free(&sm); error("Could not add key \"%s\" to RG set\n", key); } // free sm first: error() is declared HTS_NORETURN
+ } else { /* Check for SM name, as per manual */
+ if (!sam_hdr_find_tag_pos(hdr, "RG", i, "SM", &sm)) { // a zero return is treated as "SM value stored in sm"
+ if (!strcmp(ks_c_str(&sm), id)) {
+ kh_put(rg, stats->rg_hash, key, &ret); // the set stores the line's name, not the sample name
+ if (ret == -1) { ks_free(&sm); error("Could not add key \"%s\" to RG set\n", key); }
+ }
+ }
}
}
- if ( !n )
- error("The sample or read group \"%s\" not present.\n", id);
-#else
- fprintf(samtools_stderr, "Samtools-htslib: init_group_id() header parsing not yet implemented\n");
- abort();
-#endif
+
+ ks_free(&sm);
}
if (stats->quals_barcode) free(stats->quals_barcode);
free(stats->tags_barcode);
destroy_regions(stats);
- if ( stats->rg_hash ) khash_str2int_destroy(stats->rg_hash);
+ if ( stats->rg_hash ) kh_destroy(rg, stats->rg_hash);
free(stats->split_name);
free(stats);
}
stats->cov_rbuf.size = stats->nbases*5;
stats->cov_rbuf.buffer = calloc(sizeof(int32_t),stats->cov_rbuf.size);
if (!stats->cov_rbuf.buffer) goto nomem;
- if ( group_id ) init_group_id(stats, group_id);
+ if ( group_id ) init_group_id(stats, info, group_id);
// .. arrays
stats->quals_1st = calloc(stats->nquals*stats->nbases,sizeof(uint64_t));
if (!stats->quals_1st) goto nomem;
# DEALINGS IN THE SOFTWARE.
# Master version, for use in tarballs or non-git source copies
-VERSION=1.13
+VERSION=1.14
# If we have a git clone, then check against the current tag
if [ -e .git ]
all: all.stamp
-all.stamp:
+DERIVED_FILES = \
+ empty.bed.gz.tbi \
+ example.bed.gz example.bed.gz.tbi \
+ example.gff2.gz.tbi \
+ example.gff3.gz.tbi \
+ example.sam.gz.tbi \
+ example.vcf.gz.tbi \
+ example_badcomments.bed.gz.tbi \
+ example_badcomments.gtf.gz.tbi \
+ example_badcomments.sam.gz.tbi \
+ example_badcomments.vcf.gz.tbi \
+ example_comments.bed.gz.tbi \
+ example_comments.gtf.gz.tbi \
+ example_comments.sam.gz.tbi \
+ example_comments.vcf.gz.tbi \
+ example_large.bed.gz.tbi \
+ fivecolumns.bed.gz fivecolumns.bed.gz.tbi
+
+all.stamp: $(DERIVED_FILES)
touch $@
+%.gz: %
+ bgzip -c $< > $@
+
+%.gff2.gz.tbi: %.gff2.gz
+ tabix -p gff $<
+
+%.gff3.gz.tbi: %.gff3.gz
+ tabix -p gff $<
+
+%.gtf.gz.tbi: %.gtf.gz
+ tabix -p gff $<
+
+%.gz.tbi: %.gz
+ tabix -p $(subst .,,$(suffix $*)) $<
+
clean:
- -rm -f all.stamp
+ -rm -f all.stamp $(DERIVED_FILES)
--- /dev/null
+chr1 100 200 one apple
+chr1 300 400 two banana
+chr2 100 600 three carrot
+chr2 700 800 four durian
##source_20110031.1=/nfs/users/nfs_p/pd3/cvs/vcftools/perl/vcf-annotate -d /nfs/users/nfs_p/pd3/sandbox/hapmap/dbSNP-b132/non-1kg-vld.desc -a /nfs/users/nfs_p/pd3/sandbox/hapmap/dbSNP-b132/non-1kg-vld.tab.gz -c CHROM,FROM,INFO/VLD,INFO/KGPilot123,INFO/dbSNP
##vcfCTools=filter
##vcfCtools=merge freebayes.20:0-100000.baq.20110328.vcf, freebayes.20:100000-200000.baq.20110328.vcf, freebayes.20:200000-300000.baq.20110328.vcf, freebayes.20:300000-400000.baq.20110328.vcf, freebayes.20:400000-500000.baq.20110328.vcf, freebayes.20:500000-600000.baq.20110328.vcf, freebayes.20:600000-700000.baq.20110328.vcf, freebayes.20:700000-800000.baq.20110328.vcf, freebayes.20:800000-900000.baq.20110328.vcf, freebayes.20:900000-1000000.baq.20110328.vcf, freebayes.20:1000000-1100000.baq.20110328.vcf, freebayes.20:1100000-1200000.baq.20110328.vcf, freebayes.20:1200000-1300000.baq.20110328.vcf, freebayes.20:1300000-1400000.baq.20110328.vcf, freebayes.20:1400000-1500000.baq.20110328.vcf, freebayes.20:1500000-1600000.baq.20110328.vcf, freebayes.20:1600000-1700000.baq.20110328.vcf, freebayes.20:1700000-1800000.baq.20110328.vcf, freebayes.20:1800000-1900000.baq.20110328.vcf, freebayes.20:1900000-2000000.baq.20110328.vcf, freebayes.20:2000000-2100000.baq.20110328.vcf, freebayes.20:2100000-2200000.baq.20110328.vcf, freebayes.20:2200000-2300000.baq.20110328.vcf, freebayes.20:2300000-2400000.baq.20110328.vcf, freebayes.20:2400000-2500000.baq.20110328.vcf, freebayes.20:2500000-2600000.baq.20110328.vcf, freebayes.20:2600000-2700000.baq.20110328.vcf, freebayes.20:2700000-2800000.baq.20110328.vcf, freebayes.20:2800000-2900000.baq.20110328.vcf, freebayes.20:2900000-3000000.baq.20110328.vcf, freebayes.20:3000000-3100000.baq.20110328.vcf, freebayes.20:3100000-3200000.baq.20110328.vcf, freebayes.20:3200000-3300000.baq.20110328.vcf, freebayes.20:3300000-3400000.baq.20110328.vcf, freebayes.20:3400000-3500000.baq.20110328.vcf, freebayes.20:3500000-3600000.baq.20110328.vcf, freebayes.20:3600000-3700000.baq.20110328.vcf, freebayes.20:3700000-3800000.baq.20110328.vcf, freebayes.20:3800000-3900000.baq.20110328.vcf, freebayes.20:3900000-4000000.baq.20110328.vcf, freebayes.20:4000000-4100000.baq.20110328.vcf, freebayes.20:4100000-4200000.baq.20110328.vcf, 
freebayes.20:4200000-4300000.baq.20110328.vcf, freebayes.20:4300000-4400000.baq.20110328.vcf, freebayes.20:4400000-4500000.baq.20110328.vcf, freebayes.20:4500000-4600000.baq.20110328.vcf, freebayes.20:4600000-4700000.baq.20110328.vcf, freebayes.20:4700000-4800000.baq.20110328.vcf, freebayes.20:4800000-4900000.baq.20110328.vcf, freebayes.20:4900000-5000000.baq.20110328.vcf, freebayes.20:5000000-5100000.baq.20110328.vcf, freebayes.20:5100000-5200000.baq.20110328.vcf, freebayes.20:5200000-5300000.baq.20110328.vcf, freebayes.20:5300000-5400000.baq.20110328.vcf, freebayes.20:5400000-5500000.baq.20110328.vcf, freebayes.20:5500000-5600000.baq.20110328.vcf, freebayes.20:5600000-5700000.baq.20110328.vcf, freebayes.20:5700000-5800000.baq.20110328.vcf, freebayes.20:5800000-5900000.baq.20110328.vcf, freebayes.20:5900000-6000000.baq.20110328.vcf, freebayes.20:6000000-6100000.baq.20110328.vcf, freebayes.20:6100000-6200000.baq.20110328.vcf, freebayes.20:6200000-6300000.baq.20110328.vcf, freebayes.20:6300000-6400000.baq.20110328.vcf, freebayes.20:6400000-6500000.baq.20110328.vcf, freebayes.20:6500000-6600000.baq.20110328.vcf, freebayes.20:6600000-6700000.baq.20110328.vcf, freebayes.20:6700000-6800000.baq.20110328.vcf, freebayes.20:6800000-6900000.baq.20110328.vcf, freebayes.20:6900000-7000000.baq.20110328.vcf, freebayes.20:7000000-7100000.baq.20110328.vcf, freebayes.20:7100000-7200000.baq.20110328.vcf, freebayes.20:7200000-7300000.baq.20110328.vcf, freebayes.20:7300000-7400000.baq.20110328.vcf, freebayes.20:7400000-7500000.baq.20110328.vcf, freebayes.20:7500000-7600000.baq.20110328.vcf, freebayes.20:7600000-7700000.baq.20110328.vcf, freebayes.20:7700000-7800000.baq.20110328.vcf, freebayes.20:7800000-7900000.baq.20110328.vcf, freebayes.20:7900000-8000000.baq.20110328.vcf, freebayes.20:8000000-8100000.baq.20110328.vcf, freebayes.20:8100000-8200000.baq.20110328.vcf, freebayes.20:8200000-8300000.baq.20110328.vcf, freebayes.20:8300000-8400000.baq.20110328.vcf, 
freebayes.20:8400000-8500000.baq.20110328.vcf, freebayes.20:8500000-8600000.baq.20110328.vcf, freebayes.20:8600000-8700000.baq.20110328.vcf, freebayes.20:8700000-8800000.baq.20110328.vcf, freebayes.20:8800000-8900000.baq.20110328.vcf, freebayes.20:8900000-9000000.baq.20110328.vcf, freebayes.20:9000000-9100000.baq.20110328.vcf, freebayes.20:9100000-9200000.baq.20110328.vcf, freebayes.20:9200000-9300000.baq.20110328.vcf, freebayes.20:9300000-9400000.baq.20110328.vcf, freebayes.20:9400000-9500000.baq.20110328.vcf, freebayes.20:9500000-9600000.baq.20110328.vcf, freebayes.20:9600000-9700000.baq.20110328.vcf, freebayes.20:9700000-9800000.baq.20110328.vcf, freebayes.20:9800000-9900000.baq.20110328.vcf, freebayes.20:9900000-10000000.baq.20110328.vcf, freebayes.20:10000000-10100000.baq.20110328.vcf, freebayes.20:10100000-10200000.baq.20110328.vcf, freebayes.20:10200000-10300000.baq.20110328.vcf, freebayes.20:10300000-10400000.baq.20110328.vcf, freebayes.20:10400000-10500000.baq.20110328.vcf, freebayes.20:10500000-10600000.baq.20110328.vcf, freebayes.20:10600000-10700000.baq.20110328.vcf, freebayes.20:10700000-10800000.baq.20110328.vcf, freebayes.20:10800000-10900000.baq.20110328.vcf, freebayes.20:10900000-11000000.baq.20110328.vcf, freebayes.20:11000000-11100000.baq.20110328.vcf, freebayes.20:11100000-11200000.baq.20110328.vcf, freebayes.20:11200000-11300000.baq.20110328.vcf, freebayes.20:11300000-11400000.baq.20110328.vcf, freebayes.20:11400000-11500000.baq.20110328.vcf, freebayes.20:11500000-11600000.baq.20110328.vcf, freebayes.20:11600000-11700000.baq.20110328.vcf, freebayes.20:11700000-11800000.baq.20110328.vcf, freebayes.20:11800000-11900000.baq.20110328.vcf, freebayes.20:11900000-12000000.baq.20110328.vcf, freebayes.20:12000000-12100000.baq.20110328.vcf, freebayes.20:12100000-12200000.baq.20110328.vcf, freebayes.20:12200000-12300000.baq.20110328.vcf, freebayes.20:12300000-12400000.baq.20110328.vcf, freebayes.20:12400000-12500000.baq.20110328.vcf, 
freebayes.20:12500000-12600000.baq.20110328.vcf, freebayes.20:12600000-12700000.baq.20110328.vcf, freebayes.20:12700000-12800000.baq.20110328.vcf, freebayes.20:12800000-12900000.baq.20110328.vcf, freebayes.20:12900000-13000000.baq.20110328.vcf, freebayes.20:13000000-13100000.baq.20110328.vcf, freebayes.20:13100000-13200000.baq.20110328.vcf, freebayes.20:13200000-13300000.baq.20110328.vcf, freebayes.20:13300000-13400000.baq.20110328.vcf, freebayes.20:13400000-13500000.baq.20110328.vcf, freebayes.20:13500000-13600000.baq.20110328.vcf, freebayes.20:13600000-13700000.baq.20110328.vcf, freebayes.20:13700000-13800000.baq.20110328.vcf, freebayes.20:13800000-13900000.baq.20110328.vcf, freebayes.20:13900000-14000000.baq.20110328.vcf, freebayes.20:14000000-14100000.baq.20110328.vcf, freebayes.20:14100000-14200000.baq.20110328.vcf, freebayes.20:14200000-14300000.baq.20110328.vcf, freebayes.20:14300000-14400000.baq.20110328.vcf, freebayes.20:14400000-14500000.baq.20110328.vcf, freebayes.20:14500000-14600000.baq.20110328.vcf, freebayes.20:14600000-14700000.baq.20110328.vcf, freebayes.20:14700000-14800000.baq.20110328.vcf, freebayes.20:14800000-14900000.baq.20110328.vcf, freebayes.20:14900000-15000000.baq.20110328.vcf, freebayes.20:15000000-15100000.baq.20110328.vcf, freebayes.20:15100000-15200000.baq.20110328.vcf, freebayes.20:15200000-15300000.baq.20110328.vcf, freebayes.20:15300000-15400000.baq.20110328.vcf, freebayes.20:15400000-15500000.baq.20110328.vcf, freebayes.20:15500000-15600000.baq.20110328.vcf, freebayes.20:15600000-15700000.baq.20110328.vcf, freebayes.20:15700000-15800000.baq.20110328.vcf, freebayes.20:15800000-15900000.baq.20110328.vcf, freebayes.20:15900000-16000000.baq.20110328.vcf, freebayes.20:16000000-16100000.baq.20110328.vcf, freebayes.20:16100000-16200000.baq.20110328.vcf, freebayes.20:16200000-16300000.baq.20110328.vcf, freebayes.20:16300000-16400000.baq.20110328.vcf, freebayes.20:16400000-16500000.baq.20110328.vcf, 
freebayes.20:16500000-16600000.baq.20110328.vcf, freebayes.20:16600000-16700000.baq.20110328.vcf, freebayes.20:16700000-16800000.baq.20110328.vcf, freebayes.20:16800000-16900000.baq.20110328.vcf, freebayes.20:16900000-17000000.baq.20110328.vcf, freebayes.20:17000000-17100000.baq.20110328.vcf, freebayes.20:17100000-17200000.baq.20110328.vcf, freebayes.20:17200000-17300000.baq.20110328.vcf, freebayes.20:17300000-17400000.baq.20110328.vcf, freebayes.20:17400000-17500000.baq.20110328.vcf, freebayes.20:17500000-17600000.baq.20110328.vcf, freebayes.20:17600000-17700000.baq.20110328.vcf, freebayes.20:17700000-17800000.baq.20110328.vcf, freebayes.20:17800000-17900000.baq.20110328.vcf, freebayes.20:17900000-18000000.baq.20110328.vcf, freebayes.20:18000000-18100000.baq.20110328.vcf, freebayes.20:18100000-18200000.baq.20110328.vcf, freebayes.20:18200000-18300000.baq.20110328.vcf, freebayes.20:18300000-18400000.baq.20110328.vcf, freebayes.20:18400000-18500000.baq.20110328.vcf, freebayes.20:18500000-18600000.baq.20110328.vcf, freebayes.20:18600000-18700000.baq.20110328.vcf, freebayes.20:18700000-18800000.baq.20110328.vcf, freebayes.20:18800000-18900000.baq.20110328.vcf, freebayes.20:18900000-19000000.baq.20110328.vcf, freebayes.20:19000000-19100000.baq.20110328.vcf, freebayes.20:19100000-19200000.baq.20110328.vcf, freebayes.20:19200000-19300000.baq.20110328.vcf, freebayes.20:19300000-19400000.baq.20110328.vcf, freebayes.20:19400000-19500000.baq.20110328.vcf, freebayes.20:19500000-19600000.baq.20110328.vcf, freebayes.20:19600000-19700000.baq.20110328.vcf, freebayes.20:19700000-19800000.baq.20110328.vcf, freebayes.20:19800000-19900000.baq.20110328.vcf, freebayes.20:19900000-20000000.baq.20110328.vcf, freebayes.20:20000000-20100000.baq.20110328.vcf, freebayes.20:20100000-20200000.baq.20110328.vcf, freebayes.20:20200000-20300000.baq.20110328.vcf, freebayes.20:20300000-20400000.baq.20110328.vcf, freebayes.20:20400000-20500000.baq.20110328.vcf, 
freebayes.20:20500000-20600000.baq.20110328.vcf, freebayes.20:20600000-20700000.baq.20110328.vcf, freebayes.20:20700000-20800000.baq.20110328.vcf, freebayes.20:20800000-20900000.baq.20110328.vcf, freebayes.20:20900000-21000000.baq.20110328.vcf, freebayes.20:21000000-21100000.baq.20110328.vcf, freebayes.20:21100000-21200000.baq.20110328.vcf, freebayes.20:21200000-21300000.baq.20110328.vcf, freebayes.20:21300000-21400000.baq.20110328.vcf, freebayes.20:21400000-21500000.baq.20110328.vcf, freebayes.20:21500000-21600000.baq.20110328.vcf, freebayes.20:21600000-21700000.baq.20110328.vcf, freebayes.20:21700000-21800000.baq.20110328.vcf, freebayes.20:21800000-21900000.baq.20110328.vcf, freebayes.20:21900000-22000000.baq.20110328.vcf, freebayes.20:22000000-22100000.baq.20110328.vcf, freebayes.20:22100000-22200000.baq.20110328.vcf, freebayes.20:22200000-22300000.baq.20110328.vcf, freebayes.20:22300000-22400000.baq.20110328.vcf, freebayes.20:22400000-22500000.baq.20110328.vcf, freebayes.20:22500000-22600000.baq.20110328.vcf, freebayes.20:22600000-22700000.baq.20110328.vcf, freebayes.20:22700000-22800000.baq.20110328.vcf, freebayes.20:22800000-22900000.baq.20110328.vcf, freebayes.20:22900000-23000000.baq.20110328.vcf, freebayes.20:23000000-23100000.baq.20110328.vcf, freebayes.20:23100000-23200000.baq.20110328.vcf, freebayes.20:23200000-23300000.baq.20110328.vcf, freebayes.20:23300000-23400000.baq.20110328.vcf, freebayes.20:23400000-23500000.baq.20110328.vcf, freebayes.20:23500000-23600000.baq.20110328.vcf, freebayes.20:23600000-23700000.baq.20110328.vcf, freebayes.20:23700000-23800000.baq.20110328.vcf, freebayes.20:23800000-23900000.baq.20110328.vcf, freebayes.20:23900000-24000000.baq.20110328.vcf, freebayes.20:24000000-24100000.baq.20110328.vcf, freebayes.20:24100000-24200000.baq.20110328.vcf, freebayes.20:24200000-24300000.baq.20110328.vcf, freebayes.20:24300000-24400000.baq.20110328.vcf, freebayes.20:24400000-24500000.baq.20110328.vcf, 
freebayes.20:24500000-24600000.baq.20110328.vcf, freebayes.20:24600000-24700000.baq.20110328.vcf, freebayes.20:24700000-24800000.baq.20110328.vcf, freebayes.20:24800000-24900000.baq.20110328.vcf, freebayes.20:24900000-25000000.baq.20110328.vcf, freebayes.20:25000000-25100000.baq.20110328.vcf, freebayes.20:25100000-25200000.baq.20110328.vcf, freebayes.20:25200000-25300000.baq.20110328.vcf, freebayes.20:25300000-25400000.baq.20110328.vcf, freebayes.20:25400000-25500000.baq.20110328.vcf, freebayes.20:25500000-25600000.baq.20110328.vcf, freebayes.20:25600000-25700000.baq.20110328.vcf, freebayes.20:25700000-25800000.baq.20110328.vcf, freebayes.20:25800000-25900000.baq.20110328.vcf, freebayes.20:25900000-26000000.baq.20110328.vcf, freebayes.20:26000000-26100000.baq.20110328.vcf, freebayes.20:26100000-26200000.baq.20110328.vcf, freebayes.20:26200000-26300000.baq.20110328.vcf, freebayes.20:26300000-26400000.baq.20110328.vcf, freebayes.20:26400000-26500000.baq.20110328.vcf, freebayes.20:26500000-26600000.baq.20110328.vcf, freebayes.20:26600000-26700000.baq.20110328.vcf, freebayes.20:26700000-26800000.baq.20110328.vcf, freebayes.20:26800000-26900000.baq.20110328.vcf, freebayes.20:26900000-27000000.baq.20110328.vcf, freebayes.20:27000000-27100000.baq.20110328.vcf, freebayes.20:27100000-27200000.baq.20110328.vcf, freebayes.20:27200000-27300000.baq.20110328.vcf, freebayes.20:27300000-27400000.baq.20110328.vcf, freebayes.20:27400000-27500000.baq.20110328.vcf, freebayes.20:27500000-27600000.baq.20110328.vcf, freebayes.20:27600000-27700000.baq.20110328.vcf, freebayes.20:27700000-27800000.baq.20110328.vcf, freebayes.20:27800000-27900000.baq.20110328.vcf, freebayes.20:27900000-28000000.baq.20110328.vcf, freebayes.20:28000000-28100000.baq.20110328.vcf, freebayes.20:28100000-28200000.baq.20110328.vcf, freebayes.20:28200000-28300000.baq.20110328.vcf, freebayes.20:28300000-28400000.baq.20110328.vcf, freebayes.20:28400000-28500000.baq.20110328.vcf, 
freebayes.20:28500000-28600000.baq.20110328.vcf, freebayes.20:28600000-28700000.baq.20110328.vcf, freebayes.20:28700000-28800000.baq.20110328.vcf, freebayes.20:28800000-28900000.baq.20110328.vcf, freebayes.20:28900000-29000000.baq.20110328.vcf, freebayes.20:29000000-29100000.baq.20110328.vcf, freebayes.20:29100000-29200000.baq.20110328.vcf, freebayes.20:29200000-29300000.baq.20110328.vcf, freebayes.20:29300000-29400000.baq.20110328.vcf, freebayes.20:29400000-29500000.baq.20110328.vcf, freebayes.20:29500000-29600000.baq.20110328.vcf, freebayes.20:29600000-29700000.baq.20110328.vcf, freebayes.20:29700000-29800000.baq.20110328.vcf, freebayes.20:29800000-29900000.baq.20110328.vcf, freebayes.20:29900000-30000000.baq.20110328.vcf, freebayes.20:30000000-30100000.baq.20110328.vcf, freebayes.20:30100000-30200000.baq.20110328.vcf, freebayes.20:30200000-30300000.baq.20110328.vcf, freebayes.20:30300000-30400000.baq.20110328.vcf, freebayes.20:30400000-30500000.baq.20110328.vcf, freebayes.20:30500000-30600000.baq.20110328.vcf, freebayes.20:30600000-30700000.baq.20110328.vcf, freebayes.20:30700000-30800000.baq.20110328.vcf, freebayes.20:30800000-30900000.baq.20110328.vcf, freebayes.20:30900000-31000000.baq.20110328.vcf, freebayes.20:31000000-31100000.baq.20110328.vcf, freebayes.20:31100000-31200000.baq.20110328.vcf, freebayes.20:31200000-31300000.baq.20110328.vcf, freebayes.20:31300000-31400000.baq.20110328.vcf, freebayes.20:31400000-31500000.baq.20110328.vcf, freebayes.20:31500000-31600000.baq.20110328.vcf, freebayes.20:31600000-31700000.baq.20110328.vcf, freebayes.20:31700000-31800000.baq.20110328.vcf, freebayes.20:31800000-31900000.baq.20110328.vcf, freebayes.20:31900000-32000000.baq.20110328.vcf, freebayes.20:32000000-32100000.baq.20110328.vcf, freebayes.20:32100000-32200000.baq.20110328.vcf, freebayes.20:32200000-32300000.baq.20110328.vcf, freebayes.20:32300000-32400000.baq.20110328.vcf, freebayes.20:32400000-32500000.baq.20110328.vcf, 
freebayes.20:32500000-32600000.baq.20110328.vcf, freebayes.20:32600000-32700000.baq.20110328.vcf, freebayes.20:32700000-32800000.baq.20110328.vcf, freebayes.20:32800000-32900000.baq.20110328.vcf, freebayes.20:32900000-33000000.baq.20110328.vcf, freebayes.20:33000000-33100000.baq.20110328.vcf, freebayes.20:33100000-33200000.baq.20110328.vcf, freebayes.20:33200000-33300000.baq.20110328.vcf, freebayes.20:33300000-33400000.baq.20110328.vcf, freebayes.20:33400000-33500000.baq.20110328.vcf, freebayes.20:33500000-33600000.baq.20110328.vcf, freebayes.20:33600000-33700000.baq.20110328.vcf, freebayes.20:33700000-33800000.baq.20110328.vcf, freebayes.20:33800000-33900000.baq.20110328.vcf, freebayes.20:33900000-34000000.baq.20110328.vcf, freebayes.20:34000000-34100000.baq.20110328.vcf, freebayes.20:34100000-34200000.baq.20110328.vcf, freebayes.20:34200000-34300000.baq.20110328.vcf, freebayes.20:34300000-34400000.baq.20110328.vcf, freebayes.20:34400000-34500000.baq.20110328.vcf, freebayes.20:34500000-34600000.baq.20110328.vcf, freebayes.20:34600000-34700000.baq.20110328.vcf, freebayes.20:34700000-34800000.baq.20110328.vcf, freebayes.20:34800000-34900000.baq.20110328.vcf, freebayes.20:34900000-35000000.baq.20110328.vcf, freebayes.20:35000000-35100000.baq.20110328.vcf, freebayes.20:35100000-35200000.baq.20110328.vcf, freebayes.20:35200000-35300000.baq.20110328.vcf, freebayes.20:35300000-35400000.baq.20110328.vcf, freebayes.20:35400000-35500000.baq.20110328.vcf, freebayes.20:35500000-35600000.baq.20110328.vcf, freebayes.20:35600000-35700000.baq.20110328.vcf, freebayes.20:35700000-35800000.baq.20110328.vcf, freebayes.20:35800000-35900000.baq.20110328.vcf, freebayes.20:35900000-36000000.baq.20110328.vcf, freebayes.20:36000000-36100000.baq.20110328.vcf, freebayes.20:36100000-36200000.baq.20110328.vcf, freebayes.20:36200000-36300000.baq.20110328.vcf, freebayes.20:36300000-36400000.baq.20110328.vcf, freebayes.20:36400000-36500000.baq.20110328.vcf, 
freebayes.20:36500000-36600000.baq.20110328.vcf, freebayes.20:36600000-36700000.baq.20110328.vcf, freebayes.20:36700000-36800000.baq.20110328.vcf, freebayes.20:36800000-36900000.baq.20110328.vcf, freebayes.20:36900000-37000000.baq.20110328.vcf, freebayes.20:37000000-37100000.baq.20110328.vcf, freebayes.20:37100000-37200000.baq.20110328.vcf, freebayes.20:37200000-37300000.baq.20110328.vcf, freebayes.20:37300000-37400000.baq.20110328.vcf, freebayes.20:37400000-37500000.baq.20110328.vcf, freebayes.20:37500000-37600000.baq.20110328.vcf, freebayes.20:37600000-37700000.baq.20110328.vcf, freebayes.20:37700000-37800000.baq.20110328.vcf, freebayes.20:37800000-37900000.baq.20110328.vcf, freebayes.20:37900000-38000000.baq.20110328.vcf, freebayes.20:38000000-38100000.baq.20110328.vcf, freebayes.20:38100000-38200000.baq.20110328.vcf, freebayes.20:38200000-38300000.baq.20110328.vcf, freebayes.20:38300000-38400000.baq.20110328.vcf, freebayes.20:38400000-38500000.baq.20110328.vcf, freebayes.20:38500000-38600000.baq.20110328.vcf, freebayes.20:38600000-38700000.baq.20110328.vcf, freebayes.20:38700000-38800000.baq.20110328.vcf, freebayes.20:38800000-38900000.baq.20110328.vcf, freebayes.20:38900000-39000000.baq.20110328.vcf, freebayes.20:39000000-39100000.baq.20110328.vcf, freebayes.20:39100000-39200000.baq.20110328.vcf, freebayes.20:39200000-39300000.baq.20110328.vcf, freebayes.20:39300000-39400000.baq.20110328.vcf, freebayes.20:39400000-39500000.baq.20110328.vcf, freebayes.20:39500000-39600000.baq.20110328.vcf, freebayes.20:39600000-39700000.baq.20110328.vcf, freebayes.20:39700000-39800000.baq.20110328.vcf, freebayes.20:39800000-39900000.baq.20110328.vcf, freebayes.20:39900000-40000000.baq.20110328.vcf, freebayes.20:40000000-40100000.baq.20110328.vcf, freebayes.20:40100000-40200000.baq.20110328.vcf, freebayes.20:40200000-40300000.baq.20110328.vcf, freebayes.20:40300000-40400000.baq.20110328.vcf, freebayes.20:40400000-40500000.baq.20110328.vcf, 
freebayes.20:40500000-40600000.baq.20110328.vcf, freebayes.20:40600000-40700000.baq.20110328.vcf, freebayes.20:40700000-40800000.baq.20110328.vcf, freebayes.20:40800000-40900000.baq.20110328.vcf, freebayes.20:40900000-41000000.baq.20110328.vcf, freebayes.20:41000000-41100000.baq.20110328.vcf, freebayes.20:41100000-41200000.baq.20110328.vcf, freebayes.20:41200000-41300000.baq.20110328.vcf, freebayes.20:41300000-41400000.baq.20110328.vcf, freebayes.20:41400000-41500000.baq.20110328.vcf, freebayes.20:41500000-41600000.baq.20110328.vcf, freebayes.20:41600000-41700000.baq.20110328.vcf, freebayes.20:41700000-41800000.baq.20110328.vcf, freebayes.20:41800000-41900000.baq.20110328.vcf, freebayes.20:41900000-42000000.baq.20110328.vcf, freebayes.20:42000000-42100000.baq.20110328.vcf, freebayes.20:42100000-42200000.baq.20110328.vcf, freebayes.20:42200000-42300000.baq.20110328.vcf, freebayes.20:42300000-42400000.baq.20110328.vcf, freebayes.20:42400000-42500000.baq.20110328.vcf, freebayes.20:42500000-42600000.baq.20110328.vcf, freebayes.20:42600000-42700000.baq.20110328.vcf, freebayes.20:42700000-42800000.baq.20110328.vcf, freebayes.20:42800000-42900000.baq.20110328.vcf, freebayes.20:42900000-43000000.baq.20110328.vcf, freebayes.20:43000000-43100000.baq.20110328.vcf, freebayes.20:43100000-43200000.baq.20110328.vcf, freebayes.20:43200000-43300000.baq.20110328.vcf, freebayes.20:43300000-43400000.baq.20110328.vcf, freebayes.20:43400000-43500000.baq.20110328.vcf, freebayes.20:43500000-43600000.baq.20110328.vcf, freebayes.20:43600000-43700000.baq.20110328.vcf, freebayes.20:43700000-43800000.baq.20110328.vcf, freebayes.20:43800000-43900000.baq.20110328.vcf, freebayes.20:43900000-44000000.baq.20110328.vcf, freebayes.20:44000000-44100000.baq.20110328.vcf, freebayes.20:44100000-44200000.baq.20110328.vcf, freebayes.20:44200000-44300000.baq.20110328.vcf, freebayes.20:44300000-44400000.baq.20110328.vcf, freebayes.20:44400000-44500000.baq.20110328.vcf, 
freebayes.20:44500000-44600000.baq.20110328.vcf, freebayes.20:44600000-44700000.baq.20110328.vcf, freebayes.20:44700000-44800000.baq.20110328.vcf, freebayes.20:44800000-44900000.baq.20110328.vcf, freebayes.20:44900000-45000000.baq.20110328.vcf, freebayes.20:45000000-45100000.baq.20110328.vcf, freebayes.20:45100000-45200000.baq.20110328.vcf, freebayes.20:45200000-45300000.baq.20110328.vcf, freebayes.20:45300000-45400000.baq.20110328.vcf, freebayes.20:45400000-45500000.baq.20110328.vcf, freebayes.20:45500000-45600000.baq.20110328.vcf, freebayes.20:45600000-45700000.baq.20110328.vcf, freebayes.20:45700000-45800000.baq.20110328.vcf, freebayes.20:45800000-45900000.baq.20110328.vcf, freebayes.20:45900000-46000000.baq.20110328.vcf, freebayes.20:46000000-46100000.baq.20110328.vcf, freebayes.20:46100000-46200000.baq.20110328.vcf, freebayes.20:46200000-46300000.baq.20110328.vcf, freebayes.20:46300000-46400000.baq.20110328.vcf, freebayes.20:46400000-46500000.baq.20110328.vcf, freebayes.20:46500000-46600000.baq.20110328.vcf, freebayes.20:46600000-46700000.baq.20110328.vcf, freebayes.20:46700000-46800000.baq.20110328.vcf, freebayes.20:46800000-46900000.baq.20110328.vcf, freebayes.20:46900000-47000000.baq.20110328.vcf, freebayes.20:47000000-47100000.baq.20110328.vcf, freebayes.20:47100000-47200000.baq.20110328.vcf, freebayes.20:47200000-47300000.baq.20110328.vcf, freebayes.20:47300000-47400000.baq.20110328.vcf, freebayes.20:47400000-47500000.baq.20110328.vcf, freebayes.20:47500000-47600000.baq.20110328.vcf, freebayes.20:47600000-47700000.baq.20110328.vcf, freebayes.20:47700000-47800000.baq.20110328.vcf, freebayes.20:47800000-47900000.baq.20110328.vcf, freebayes.20:47900000-48000000.baq.20110328.vcf, freebayes.20:48000000-48100000.baq.20110328.vcf, freebayes.20:48100000-48200000.baq.20110328.vcf, freebayes.20:48200000-48300000.baq.20110328.vcf, freebayes.20:48300000-48400000.baq.20110328.vcf, freebayes.20:48400000-48500000.baq.20110328.vcf, 
freebayes.20:48500000-48600000.baq.20110328.vcf, freebayes.20:48600000-48700000.baq.20110328.vcf, freebayes.20:48700000-48800000.baq.20110328.vcf, freebayes.20:48800000-48900000.baq.20110328.vcf, freebayes.20:48900000-49000000.baq.20110328.vcf, freebayes.20:49000000-49100000.baq.20110328.vcf, freebayes.20:49100000-49200000.baq.20110328.vcf, freebayes.20:49200000-49300000.baq.20110328.vcf, freebayes.20:49300000-49400000.baq.20110328.vcf, freebayes.20:49400000-49500000.baq.20110328.vcf, freebayes.20:49500000-49600000.baq.20110328.vcf, freebayes.20:49600000-49700000.baq.20110328.vcf, freebayes.20:49700000-49800000.baq.20110328.vcf, freebayes.20:49800000-49900000.baq.20110328.vcf, freebayes.20:49900000-50000000.baq.20110328.vcf, freebayes.20:50000000-50100000.baq.20110328.vcf, freebayes.20:50100000-50200000.baq.20110328.vcf, freebayes.20:50200000-50300000.baq.20110328.vcf, freebayes.20:50300000-50400000.baq.20110328.vcf, freebayes.20:50400000-50500000.baq.20110328.vcf, freebayes.20:50500000-50600000.baq.20110328.vcf, freebayes.20:50600000-50700000.baq.20110328.vcf, freebayes.20:50700000-50800000.baq.20110328.vcf, freebayes.20:50800000-50900000.baq.20110328.vcf, freebayes.20:50900000-51000000.baq.20110328.vcf, freebayes.20:51000000-51100000.baq.20110328.vcf, freebayes.20:51100000-51200000.baq.20110328.vcf, freebayes.20:51200000-51300000.baq.20110328.vcf, freebayes.20:51300000-51400000.baq.20110328.vcf, freebayes.20:51400000-51500000.baq.20110328.vcf, freebayes.20:51500000-51600000.baq.20110328.vcf, freebayes.20:51600000-51700000.baq.20110328.vcf, freebayes.20:51700000-51800000.baq.20110328.vcf, freebayes.20:51800000-51900000.baq.20110328.vcf, freebayes.20:51900000-52000000.baq.20110328.vcf, freebayes.20:52000000-52100000.baq.20110328.vcf, freebayes.20:52100000-52200000.baq.20110328.vcf, freebayes.20:52200000-52300000.baq.20110328.vcf, freebayes.20:52300000-52400000.baq.20110328.vcf, freebayes.20:52400000-52500000.baq.20110328.vcf, 
freebayes.20:52500000-52600000.baq.20110328.vcf, freebayes.20:52600000-52700000.baq.20110328.vcf, freebayes.20:52700000-52800000.baq.20110328.vcf, freebayes.20:52800000-52900000.baq.20110328.vcf, freebayes.20:52900000-53000000.baq.20110328.vcf, freebayes.20:53000000-53100000.baq.20110328.vcf, freebayes.20:53100000-53200000.baq.20110328.vcf, freebayes.20:53200000-53300000.baq.20110328.vcf, freebayes.20:53300000-53400000.baq.20110328.vcf, freebayes.20:53400000-53500000.baq.20110328.vcf, freebayes.20:53500000-53600000.baq.20110328.vcf, freebayes.20:53600000-53700000.baq.20110328.vcf, freebayes.20:53700000-53800000.baq.20110328.vcf, freebayes.20:53800000-53900000.baq.20110328.vcf, freebayes.20:53900000-54000000.baq.20110328.vcf, freebayes.20:54000000-54100000.baq.20110328.vcf, freebayes.20:54100000-54200000.baq.20110328.vcf, freebayes.20:54200000-54300000.baq.20110328.vcf, freebayes.20:54300000-54400000.baq.20110328.vcf, freebayes.20:54400000-54500000.baq.20110328.vcf, freebayes.20:54500000-54600000.baq.20110328.vcf, freebayes.20:54600000-54700000.baq.20110328.vcf, freebayes.20:54700000-54800000.baq.20110328.vcf, freebayes.20:54800000-54900000.baq.20110328.vcf, freebayes.20:54900000-55000000.baq.20110328.vcf, freebayes.20:55000000-55100000.baq.20110328.vcf, freebayes.20:55100000-55200000.baq.20110328.vcf, freebayes.20:55200000-55300000.baq.20110328.vcf, freebayes.20:55300000-55400000.baq.20110328.vcf, freebayes.20:55400000-55500000.baq.20110328.vcf, freebayes.20:55500000-55600000.baq.20110328.vcf, freebayes.20:55600000-55700000.baq.20110328.vcf, freebayes.20:55700000-55800000.baq.20110328.vcf, freebayes.20:55800000-55900000.baq.20110328.vcf, freebayes.20:55900000-56000000.baq.20110328.vcf, freebayes.20:56000000-56100000.baq.20110328.vcf, freebayes.20:56100000-56200000.baq.20110328.vcf, freebayes.20:56200000-56300000.baq.20110328.vcf, freebayes.20:56300000-56400000.baq.20110328.vcf, freebayes.20:56400000-56500000.baq.20110328.vcf, 
freebayes.20:56500000-56600000.baq.20110328.vcf, freebayes.20:56600000-56700000.baq.20110328.vcf, freebayes.20:56700000-56800000.baq.20110328.vcf, freebayes.20:56800000-56900000.baq.20110328.vcf, freebayes.20:56900000-57000000.baq.20110328.vcf, freebayes.20:57000000-57100000.baq.20110328.vcf, freebayes.20:57100000-57200000.baq.20110328.vcf, freebayes.20:57200000-57300000.baq.20110328.vcf, freebayes.20:57300000-57400000.baq.20110328.vcf, freebayes.20:57400000-57500000.baq.20110328.vcf, freebayes.20:57500000-57600000.baq.20110328.vcf, freebayes.20:57600000-57700000.baq.20110328.vcf, freebayes.20:57700000-57800000.baq.20110328.vcf, freebayes.20:57800000-57900000.baq.20110328.vcf, freebayes.20:57900000-58000000.baq.20110328.vcf, freebayes.20:58000000-58100000.baq.20110328.vcf, freebayes.20:58100000-58200000.baq.20110328.vcf, freebayes.20:58200000-58300000.baq.20110328.vcf, freebayes.20:58300000-58400000.baq.20110328.vcf, freebayes.20:58400000-58500000.baq.20110328.vcf, freebayes.20:58500000-58600000.baq.20110328.vcf, freebayes.20:58600000-58700000.baq.20110328.vcf, freebayes.20:58700000-58800000.baq.20110328.vcf, freebayes.20:58800000-58900000.baq.20110328.vcf, freebayes.20:58900000-59000000.baq.20110328.vcf, freebayes.20:59000000-59100000.baq.20110328.vcf, freebayes.20:59100000-59200000.baq.20110328.vcf, freebayes.20:59200000-59300000.baq.20110328.vcf, freebayes.20:59300000-59400000.baq.20110328.vcf, freebayes.20:59400000-59500000.baq.20110328.vcf, freebayes.20:59500000-59600000.baq.20110328.vcf, freebayes.20:59600000-59700000.baq.20110328.vcf, freebayes.20:59700000-59800000.baq.20110328.vcf, freebayes.20:59800000-59900000.baq.20110328.vcf, freebayes.20:59900000-60000000.baq.20110328.vcf, freebayes.20:60000000-60100000.baq.20110328.vcf, freebayes.20:60100000-60200000.baq.20110328.vcf, freebayes.20:60200000-60300000.baq.20110328.vcf, freebayes.20:60300000-60400000.baq.20110328.vcf, freebayes.20:60400000-60500000.baq.20110328.vcf, 
freebayes.20:60500000-60600000.baq.20110328.vcf, freebayes.20:60600000-60700000.baq.20110328.vcf, freebayes.20:60700000-60800000.baq.20110328.vcf, freebayes.20:60800000-60900000.baq.20110328.vcf, freebayes.20:60900000-61000000.baq.20110328.vcf, freebayes.20:61000000-61100000.baq.20110328.vcf, freebayes.20:61100000-61200000.baq.20110328.vcf, freebayes.20:61200000-61300000.baq.20110328.vcf, freebayes.20:61300000-61400000.baq.20110328.vcf, freebayes.20:61400000-61500000.baq.20110328.vcf, freebayes.20:61500000-61600000.baq.20110328.vcf, freebayes.20:61600000-61700000.baq.20110328.vcf, freebayes.20:61700000-61800000.baq.20110328.vcf, freebayes.20:61800000-61900000.baq.20110328.vcf, freebayes.20:61900000-62000000.baq.20110328.vcf, freebayes.20:62000000-62100000.baq.20110328.vcf, freebayes.20:62100000-62200000.baq.20110328.vcf, freebayes.20:62200000-62300000.baq.20110328.vcf, freebayes.20:62300000-62400000.baq.20110328.vcf, freebayes.20:62400000-62500000.baq.20110328.vcf, freebayes.20:62500000-62600000.baq.20110328.vcf, freebayes.20:62600000-62700000.baq.20110328.vcf, freebayes.20:62700000-62800000.baq.20110328.vcf, freebayes.20:62800000-62900000.baq.20110328.vcf, freebayes.20:62900000-63000000.baq.20110328.vcf, freebayes.20:63000000-63025520.baq.20110328.vcf
-#CHROM POS ID REF ALT QUAL FILTER INFO
+#CHROM POS ID REF ALT QUAL FILTER INFO
20 458502 . G GA 4567.01 PASS AA=20;AB=0.61111;ABA=14;ABP=6.8707;ABR=22;AC=38;AF=0.0544;AN=698;BL=374;BR=1129;BVAR;BaseQRankSum=13.364;DP=15979;DP4=1882,2188,45,37;Dels=0.00;EL=5;EPP=13.868;ER=15;FR;FS=6.503;HETAR=11;HOMA=2;HOMR=985;HP=1;HPLen=2;HR=2;HRun=0;HU=G;INDEL;INS;InbreedingCoeff=0.0157;LEN=1;LRB=0.50233;LRBP=826.56;MQ=66.16;MQ0Fraction=0.0110;MQM=70.5;MQRankSum=-3.158;NF;NR;NS=998;PP;PV4=0.15,1,0.42,0.15;RA=3173;RL=1;RPP=38.188;RR=19;RUN=1;ReadPosRankSum=-2.346;SAB=0.7;SAF=14;SAP=9.959;SAR=6;SC=GGGCGTGGTGGTGCATGTAAT;SRB=0.50047;SRF=1588;SRP=3.0165;SRR=1585;TC;TR=9;TU=GGT;VQSLOD=10.0079;set=Intersection;sumGLbyD=23.94
20 539571 . TG T 18546 PASS AA=71;AB=0.92482;ABA=63;ABP=1316.6;ABR=775;AC=42;AF=0.03512;AN=1196;BL=3915;BR=252;BVAR;BaseQRankSum=0.556;DEL;DP=10073;Dels=0.01;EL=47;EPP=19.189;ER=24;FS=2.124;HETAR=290;HOMA=156;HOMR=570;HRun=1;InbreedingCoeff=0.0620;LEN=1;LRB=0.87905;LRBP=6995.1;MQ0=0;MQ0Fraction=0.0000;MQM=127.99;MQRankSum=0.410;NS=1016;RA=3090;RL=71;RPP=157.18;RR=0;RUN=1;ReadPosRankSum=-11.038;SAB=0.66197;SAF=47;SAP=19.189;SAR=24;SRB=0.55016;SRF=1700;SRP=70.544;SRR=1390;VQSLOD=2.6772;set=filterInVQSR-2of5;sumGLbyD=4.71
20 573764 . TA T 591.51 PASS AC=91;AF=0.1987;AN=458;BaseQRankSum=0.137;DP=519;FS=3.153;HRun=1;HaplotypeScore=14.0744;InbreedingCoeff=0.1460;MQ=48.16;MQ0=26;MQ0Fraction=0.0501;MQRankSum=-1.636;QD=3.63;ReadPosRankSum=-4.140;SB=-408.14;VQSLOD=5.2458;set=VQSR
##source_20110031.1=/nfs/users/nfs_p/pd3/cvs/vcftools/perl/vcf-annotate -d /nfs/users/nfs_p/pd3/sandbox/hapmap/dbSNP-b132/non-1kg-vld.desc -a /nfs/users/nfs_p/pd3/sandbox/hapmap/dbSNP-b132/non-1kg-vld.tab.gz -c CHROM,FROM,INFO/VLD,INFO/KGPilot123,INFO/dbSNP
##vcfCTools=filter
##vcfCtools=merge freebayes.20:0-100000.baq.20110328.vcf, freebayes.20:100000-200000.baq.20110328.vcf, freebayes.20:200000-300000.baq.20110328.vcf, freebayes.20:300000-400000.baq.20110328.vcf, freebayes.20:400000-500000.baq.20110328.vcf, freebayes.20:500000-600000.baq.20110328.vcf, freebayes.20:600000-700000.baq.20110328.vcf, freebayes.20:700000-800000.baq.20110328.vcf, freebayes.20:800000-900000.baq.20110328.vcf, freebayes.20:900000-1000000.baq.20110328.vcf, freebayes.20:1000000-1100000.baq.20110328.vcf, freebayes.20:1100000-1200000.baq.20110328.vcf, freebayes.20:1200000-1300000.baq.20110328.vcf, freebayes.20:1300000-1400000.baq.20110328.vcf, freebayes.20:1400000-1500000.baq.20110328.vcf, freebayes.20:1500000-1600000.baq.20110328.vcf, freebayes.20:1600000-1700000.baq.20110328.vcf, freebayes.20:1700000-1800000.baq.20110328.vcf, freebayes.20:1800000-1900000.baq.20110328.vcf, freebayes.20:1900000-2000000.baq.20110328.vcf, freebayes.20:2000000-2100000.baq.20110328.vcf, freebayes.20:2100000-2200000.baq.20110328.vcf, freebayes.20:2200000-2300000.baq.20110328.vcf, freebayes.20:2300000-2400000.baq.20110328.vcf, freebayes.20:2400000-2500000.baq.20110328.vcf, freebayes.20:2500000-2600000.baq.20110328.vcf, freebayes.20:2600000-2700000.baq.20110328.vcf, freebayes.20:2700000-2800000.baq.20110328.vcf, freebayes.20:2800000-2900000.baq.20110328.vcf, freebayes.20:2900000-3000000.baq.20110328.vcf, freebayes.20:3000000-3100000.baq.20110328.vcf, freebayes.20:3100000-3200000.baq.20110328.vcf, freebayes.20:3200000-3300000.baq.20110328.vcf, freebayes.20:3300000-3400000.baq.20110328.vcf, freebayes.20:3400000-3500000.baq.20110328.vcf, freebayes.20:3500000-3600000.baq.20110328.vcf, freebayes.20:3600000-3700000.baq.20110328.vcf, freebayes.20:3700000-3800000.baq.20110328.vcf, freebayes.20:3800000-3900000.baq.20110328.vcf, freebayes.20:3900000-4000000.baq.20110328.vcf, freebayes.20:4000000-4100000.baq.20110328.vcf, freebayes.20:4100000-4200000.baq.20110328.vcf, 
freebayes.20:4200000-4300000.baq.20110328.vcf, freebayes.20:4300000-4400000.baq.20110328.vcf, freebayes.20:4400000-4500000.baq.20110328.vcf, freebayes.20:4500000-4600000.baq.20110328.vcf, freebayes.20:4600000-4700000.baq.20110328.vcf, freebayes.20:4700000-4800000.baq.20110328.vcf, freebayes.20:4800000-4900000.baq.20110328.vcf, freebayes.20:4900000-5000000.baq.20110328.vcf, freebayes.20:5000000-5100000.baq.20110328.vcf, freebayes.20:5100000-5200000.baq.20110328.vcf, freebayes.20:5200000-5300000.baq.20110328.vcf, freebayes.20:5300000-5400000.baq.20110328.vcf, freebayes.20:5400000-5500000.baq.20110328.vcf, freebayes.20:5500000-5600000.baq.20110328.vcf, freebayes.20:5600000-5700000.baq.20110328.vcf, freebayes.20:5700000-5800000.baq.20110328.vcf, freebayes.20:5800000-5900000.baq.20110328.vcf, freebayes.20:5900000-6000000.baq.20110328.vcf, freebayes.20:6000000-6100000.baq.20110328.vcf, freebayes.20:6100000-6200000.baq.20110328.vcf, freebayes.20:6200000-6300000.baq.20110328.vcf, freebayes.20:6300000-6400000.baq.20110328.vcf, freebayes.20:6400000-6500000.baq.20110328.vcf, freebayes.20:6500000-6600000.baq.20110328.vcf, freebayes.20:6600000-6700000.baq.20110328.vcf, freebayes.20:6700000-6800000.baq.20110328.vcf, freebayes.20:6800000-6900000.baq.20110328.vcf, freebayes.20:6900000-7000000.baq.20110328.vcf, freebayes.20:7000000-7100000.baq.20110328.vcf, freebayes.20:7100000-7200000.baq.20110328.vcf, freebayes.20:7200000-7300000.baq.20110328.vcf, freebayes.20:7300000-7400000.baq.20110328.vcf, freebayes.20:7400000-7500000.baq.20110328.vcf, freebayes.20:7500000-7600000.baq.20110328.vcf, freebayes.20:7600000-7700000.baq.20110328.vcf, freebayes.20:7700000-7800000.baq.20110328.vcf, freebayes.20:7800000-7900000.baq.20110328.vcf, freebayes.20:7900000-8000000.baq.20110328.vcf, freebayes.20:8000000-8100000.baq.20110328.vcf, freebayes.20:8100000-8200000.baq.20110328.vcf, freebayes.20:8200000-8300000.baq.20110328.vcf, freebayes.20:8300000-8400000.baq.20110328.vcf, 
freebayes.20:8400000-8500000.baq.20110328.vcf, freebayes.20:8500000-8600000.baq.20110328.vcf, freebayes.20:8600000-8700000.baq.20110328.vcf, freebayes.20:8700000-8800000.baq.20110328.vcf, freebayes.20:8800000-8900000.baq.20110328.vcf, freebayes.20:8900000-9000000.baq.20110328.vcf, freebayes.20:9000000-9100000.baq.20110328.vcf, freebayes.20:9100000-9200000.baq.20110328.vcf, freebayes.20:9200000-9300000.baq.20110328.vcf, freebayes.20:9300000-9400000.baq.20110328.vcf, freebayes.20:9400000-9500000.baq.20110328.vcf, freebayes.20:9500000-9600000.baq.20110328.vcf, freebayes.20:9600000-9700000.baq.20110328.vcf, freebayes.20:9700000-9800000.baq.20110328.vcf, freebayes.20:9800000-9900000.baq.20110328.vcf, freebayes.20:9900000-10000000.baq.20110328.vcf, freebayes.20:10000000-10100000.baq.20110328.vcf, freebayes.20:10100000-10200000.baq.20110328.vcf, freebayes.20:10200000-10300000.baq.20110328.vcf, freebayes.20:10300000-10400000.baq.20110328.vcf, freebayes.20:10400000-10500000.baq.20110328.vcf, freebayes.20:10500000-10600000.baq.20110328.vcf, freebayes.20:10600000-10700000.baq.20110328.vcf, freebayes.20:10700000-10800000.baq.20110328.vcf, freebayes.20:10800000-10900000.baq.20110328.vcf, freebayes.20:10900000-11000000.baq.20110328.vcf, freebayes.20:11000000-11100000.baq.20110328.vcf, freebayes.20:11100000-11200000.baq.20110328.vcf, freebayes.20:11200000-11300000.baq.20110328.vcf, freebayes.20:11300000-11400000.baq.20110328.vcf, freebayes.20:11400000-11500000.baq.20110328.vcf, freebayes.20:11500000-11600000.baq.20110328.vcf, freebayes.20:11600000-11700000.baq.20110328.vcf, freebayes.20:11700000-11800000.baq.20110328.vcf, freebayes.20:11800000-11900000.baq.20110328.vcf, freebayes.20:11900000-12000000.baq.20110328.vcf, freebayes.20:12000000-12100000.baq.20110328.vcf, freebayes.20:12100000-12200000.baq.20110328.vcf, freebayes.20:12200000-12300000.baq.20110328.vcf, freebayes.20:12300000-12400000.baq.20110328.vcf, freebayes.20:12400000-12500000.baq.20110328.vcf, 
freebayes.20:12500000-12600000.baq.20110328.vcf, freebayes.20:12600000-12700000.baq.20110328.vcf, freebayes.20:12700000-12800000.baq.20110328.vcf, freebayes.20:12800000-12900000.baq.20110328.vcf, freebayes.20:12900000-13000000.baq.20110328.vcf, freebayes.20:13000000-13100000.baq.20110328.vcf, freebayes.20:13100000-13200000.baq.20110328.vcf, freebayes.20:13200000-13300000.baq.20110328.vcf, freebayes.20:13300000-13400000.baq.20110328.vcf, freebayes.20:13400000-13500000.baq.20110328.vcf, freebayes.20:13500000-13600000.baq.20110328.vcf, freebayes.20:13600000-13700000.baq.20110328.vcf, freebayes.20:13700000-13800000.baq.20110328.vcf, freebayes.20:13800000-13900000.baq.20110328.vcf, freebayes.20:13900000-14000000.baq.20110328.vcf, freebayes.20:14000000-14100000.baq.20110328.vcf, freebayes.20:14100000-14200000.baq.20110328.vcf, freebayes.20:14200000-14300000.baq.20110328.vcf, freebayes.20:14300000-14400000.baq.20110328.vcf, freebayes.20:14400000-14500000.baq.20110328.vcf, freebayes.20:14500000-14600000.baq.20110328.vcf, freebayes.20:14600000-14700000.baq.20110328.vcf, freebayes.20:14700000-14800000.baq.20110328.vcf, freebayes.20:14800000-14900000.baq.20110328.vcf, freebayes.20:14900000-15000000.baq.20110328.vcf, freebayes.20:15000000-15100000.baq.20110328.vcf, freebayes.20:15100000-15200000.baq.20110328.vcf, freebayes.20:15200000-15300000.baq.20110328.vcf, freebayes.20:15300000-15400000.baq.20110328.vcf, freebayes.20:15400000-15500000.baq.20110328.vcf, freebayes.20:15500000-15600000.baq.20110328.vcf, freebayes.20:15600000-15700000.baq.20110328.vcf, freebayes.20:15700000-15800000.baq.20110328.vcf, freebayes.20:15800000-15900000.baq.20110328.vcf, freebayes.20:15900000-16000000.baq.20110328.vcf, freebayes.20:16000000-16100000.baq.20110328.vcf, freebayes.20:16100000-16200000.baq.20110328.vcf, freebayes.20:16200000-16300000.baq.20110328.vcf, freebayes.20:16300000-16400000.baq.20110328.vcf, freebayes.20:16400000-16500000.baq.20110328.vcf, 
freebayes.20:16500000-16600000.baq.20110328.vcf, freebayes.20:16600000-16700000.baq.20110328.vcf, freebayes.20:16700000-16800000.baq.20110328.vcf, freebayes.20:16800000-16900000.baq.20110328.vcf, freebayes.20:16900000-17000000.baq.20110328.vcf, freebayes.20:17000000-17100000.baq.20110328.vcf, freebayes.20:17100000-17200000.baq.20110328.vcf, freebayes.20:17200000-17300000.baq.20110328.vcf, freebayes.20:17300000-17400000.baq.20110328.vcf, freebayes.20:17400000-17500000.baq.20110328.vcf, freebayes.20:17500000-17600000.baq.20110328.vcf, freebayes.20:17600000-17700000.baq.20110328.vcf, freebayes.20:17700000-17800000.baq.20110328.vcf, freebayes.20:17800000-17900000.baq.20110328.vcf, freebayes.20:17900000-18000000.baq.20110328.vcf, freebayes.20:18000000-18100000.baq.20110328.vcf, freebayes.20:18100000-18200000.baq.20110328.vcf, freebayes.20:18200000-18300000.baq.20110328.vcf, freebayes.20:18300000-18400000.baq.20110328.vcf, freebayes.20:18400000-18500000.baq.20110328.vcf, freebayes.20:18500000-18600000.baq.20110328.vcf, freebayes.20:18600000-18700000.baq.20110328.vcf, freebayes.20:18700000-18800000.baq.20110328.vcf, freebayes.20:18800000-18900000.baq.20110328.vcf, freebayes.20:18900000-19000000.baq.20110328.vcf, freebayes.20:19000000-19100000.baq.20110328.vcf, freebayes.20:19100000-19200000.baq.20110328.vcf, freebayes.20:19200000-19300000.baq.20110328.vcf, freebayes.20:19300000-19400000.baq.20110328.vcf, freebayes.20:19400000-19500000.baq.20110328.vcf, freebayes.20:19500000-19600000.baq.20110328.vcf, freebayes.20:19600000-19700000.baq.20110328.vcf, freebayes.20:19700000-19800000.baq.20110328.vcf, freebayes.20:19800000-19900000.baq.20110328.vcf, freebayes.20:19900000-20000000.baq.20110328.vcf, freebayes.20:20000000-20100000.baq.20110328.vcf, freebayes.20:20100000-20200000.baq.20110328.vcf, freebayes.20:20200000-20300000.baq.20110328.vcf, freebayes.20:20300000-20400000.baq.20110328.vcf, freebayes.20:20400000-20500000.baq.20110328.vcf, 
freebayes.20:20500000-20600000.baq.20110328.vcf, freebayes.20:20600000-20700000.baq.20110328.vcf, freebayes.20:20700000-20800000.baq.20110328.vcf, freebayes.20:20800000-20900000.baq.20110328.vcf, freebayes.20:20900000-21000000.baq.20110328.vcf, freebayes.20:21000000-21100000.baq.20110328.vcf, freebayes.20:21100000-21200000.baq.20110328.vcf, freebayes.20:21200000-21300000.baq.20110328.vcf, freebayes.20:21300000-21400000.baq.20110328.vcf, freebayes.20:21400000-21500000.baq.20110328.vcf, freebayes.20:21500000-21600000.baq.20110328.vcf, freebayes.20:21600000-21700000.baq.20110328.vcf, freebayes.20:21700000-21800000.baq.20110328.vcf, freebayes.20:21800000-21900000.baq.20110328.vcf, freebayes.20:21900000-22000000.baq.20110328.vcf, freebayes.20:22000000-22100000.baq.20110328.vcf, freebayes.20:22100000-22200000.baq.20110328.vcf, freebayes.20:22200000-22300000.baq.20110328.vcf, freebayes.20:22300000-22400000.baq.20110328.vcf, freebayes.20:22400000-22500000.baq.20110328.vcf, freebayes.20:22500000-22600000.baq.20110328.vcf, freebayes.20:22600000-22700000.baq.20110328.vcf, freebayes.20:22700000-22800000.baq.20110328.vcf, freebayes.20:22800000-22900000.baq.20110328.vcf, freebayes.20:22900000-23000000.baq.20110328.vcf, freebayes.20:23000000-23100000.baq.20110328.vcf, freebayes.20:23100000-23200000.baq.20110328.vcf, freebayes.20:23200000-23300000.baq.20110328.vcf, freebayes.20:23300000-23400000.baq.20110328.vcf, freebayes.20:23400000-23500000.baq.20110328.vcf, freebayes.20:23500000-23600000.baq.20110328.vcf, freebayes.20:23600000-23700000.baq.20110328.vcf, freebayes.20:23700000-23800000.baq.20110328.vcf, freebayes.20:23800000-23900000.baq.20110328.vcf, freebayes.20:23900000-24000000.baq.20110328.vcf, freebayes.20:24000000-24100000.baq.20110328.vcf, freebayes.20:24100000-24200000.baq.20110328.vcf, freebayes.20:24200000-24300000.baq.20110328.vcf, freebayes.20:24300000-24400000.baq.20110328.vcf, freebayes.20:24400000-24500000.baq.20110328.vcf, 
freebayes.20:24500000-24600000.baq.20110328.vcf, freebayes.20:24600000-24700000.baq.20110328.vcf, freebayes.20:24700000-24800000.baq.20110328.vcf, freebayes.20:24800000-24900000.baq.20110328.vcf, freebayes.20:24900000-25000000.baq.20110328.vcf, freebayes.20:25000000-25100000.baq.20110328.vcf, freebayes.20:25100000-25200000.baq.20110328.vcf, freebayes.20:25200000-25300000.baq.20110328.vcf, freebayes.20:25300000-25400000.baq.20110328.vcf, freebayes.20:25400000-25500000.baq.20110328.vcf, freebayes.20:25500000-25600000.baq.20110328.vcf, freebayes.20:25600000-25700000.baq.20110328.vcf, freebayes.20:25700000-25800000.baq.20110328.vcf, freebayes.20:25800000-25900000.baq.20110328.vcf, freebayes.20:25900000-26000000.baq.20110328.vcf, freebayes.20:26000000-26100000.baq.20110328.vcf, freebayes.20:26100000-26200000.baq.20110328.vcf, freebayes.20:26200000-26300000.baq.20110328.vcf, freebayes.20:26300000-26400000.baq.20110328.vcf, freebayes.20:26400000-26500000.baq.20110328.vcf, freebayes.20:26500000-26600000.baq.20110328.vcf, freebayes.20:26600000-26700000.baq.20110328.vcf, freebayes.20:26700000-26800000.baq.20110328.vcf, freebayes.20:26800000-26900000.baq.20110328.vcf, freebayes.20:26900000-27000000.baq.20110328.vcf, freebayes.20:27000000-27100000.baq.20110328.vcf, freebayes.20:27100000-27200000.baq.20110328.vcf, freebayes.20:27200000-27300000.baq.20110328.vcf, freebayes.20:27300000-27400000.baq.20110328.vcf, freebayes.20:27400000-27500000.baq.20110328.vcf, freebayes.20:27500000-27600000.baq.20110328.vcf, freebayes.20:27600000-27700000.baq.20110328.vcf, freebayes.20:27700000-27800000.baq.20110328.vcf, freebayes.20:27800000-27900000.baq.20110328.vcf, freebayes.20:27900000-28000000.baq.20110328.vcf, freebayes.20:28000000-28100000.baq.20110328.vcf, freebayes.20:28100000-28200000.baq.20110328.vcf, freebayes.20:28200000-28300000.baq.20110328.vcf, freebayes.20:28300000-28400000.baq.20110328.vcf, freebayes.20:28400000-28500000.baq.20110328.vcf, 
freebayes.20:28500000-28600000.baq.20110328.vcf, freebayes.20:28600000-28700000.baq.20110328.vcf, freebayes.20:28700000-28800000.baq.20110328.vcf, freebayes.20:28800000-28900000.baq.20110328.vcf, freebayes.20:28900000-29000000.baq.20110328.vcf, freebayes.20:29000000-29100000.baq.20110328.vcf, freebayes.20:29100000-29200000.baq.20110328.vcf, freebayes.20:29200000-29300000.baq.20110328.vcf, freebayes.20:29300000-29400000.baq.20110328.vcf, freebayes.20:29400000-29500000.baq.20110328.vcf, freebayes.20:29500000-29600000.baq.20110328.vcf, freebayes.20:29600000-29700000.baq.20110328.vcf, freebayes.20:29700000-29800000.baq.20110328.vcf, freebayes.20:29800000-29900000.baq.20110328.vcf, freebayes.20:29900000-30000000.baq.20110328.vcf, freebayes.20:30000000-30100000.baq.20110328.vcf, freebayes.20:30100000-30200000.baq.20110328.vcf, freebayes.20:30200000-30300000.baq.20110328.vcf, freebayes.20:30300000-30400000.baq.20110328.vcf, freebayes.20:30400000-30500000.baq.20110328.vcf, freebayes.20:30500000-30600000.baq.20110328.vcf, freebayes.20:30600000-30700000.baq.20110328.vcf, freebayes.20:30700000-30800000.baq.20110328.vcf, freebayes.20:30800000-30900000.baq.20110328.vcf, freebayes.20:30900000-31000000.baq.20110328.vcf, freebayes.20:31000000-31100000.baq.20110328.vcf, freebayes.20:31100000-31200000.baq.20110328.vcf, freebayes.20:31200000-31300000.baq.20110328.vcf, freebayes.20:31300000-31400000.baq.20110328.vcf, freebayes.20:31400000-31500000.baq.20110328.vcf, freebayes.20:31500000-31600000.baq.20110328.vcf, freebayes.20:31600000-31700000.baq.20110328.vcf, freebayes.20:31700000-31800000.baq.20110328.vcf, freebayes.20:31800000-31900000.baq.20110328.vcf, freebayes.20:31900000-32000000.baq.20110328.vcf, freebayes.20:32000000-32100000.baq.20110328.vcf, freebayes.20:32100000-32200000.baq.20110328.vcf, freebayes.20:32200000-32300000.baq.20110328.vcf, freebayes.20:32300000-32400000.baq.20110328.vcf, freebayes.20:32400000-32500000.baq.20110328.vcf, 
freebayes.20:32500000-32600000.baq.20110328.vcf, freebayes.20:32600000-32700000.baq.20110328.vcf, freebayes.20:32700000-32800000.baq.20110328.vcf, freebayes.20:32800000-32900000.baq.20110328.vcf, freebayes.20:32900000-33000000.baq.20110328.vcf, freebayes.20:33000000-33100000.baq.20110328.vcf, freebayes.20:33100000-33200000.baq.20110328.vcf, freebayes.20:33200000-33300000.baq.20110328.vcf, freebayes.20:33300000-33400000.baq.20110328.vcf, freebayes.20:33400000-33500000.baq.20110328.vcf, freebayes.20:33500000-33600000.baq.20110328.vcf, freebayes.20:33600000-33700000.baq.20110328.vcf, freebayes.20:33700000-33800000.baq.20110328.vcf, freebayes.20:33800000-33900000.baq.20110328.vcf, freebayes.20:33900000-34000000.baq.20110328.vcf, freebayes.20:34000000-34100000.baq.20110328.vcf, freebayes.20:34100000-34200000.baq.20110328.vcf, freebayes.20:34200000-34300000.baq.20110328.vcf, freebayes.20:34300000-34400000.baq.20110328.vcf, freebayes.20:34400000-34500000.baq.20110328.vcf, freebayes.20:34500000-34600000.baq.20110328.vcf, freebayes.20:34600000-34700000.baq.20110328.vcf, freebayes.20:34700000-34800000.baq.20110328.vcf, freebayes.20:34800000-34900000.baq.20110328.vcf, freebayes.20:34900000-35000000.baq.20110328.vcf, freebayes.20:35000000-35100000.baq.20110328.vcf, freebayes.20:35100000-35200000.baq.20110328.vcf, freebayes.20:35200000-35300000.baq.20110328.vcf, freebayes.20:35300000-35400000.baq.20110328.vcf, freebayes.20:35400000-35500000.baq.20110328.vcf, freebayes.20:35500000-35600000.baq.20110328.vcf, freebayes.20:35600000-35700000.baq.20110328.vcf, freebayes.20:35700000-35800000.baq.20110328.vcf, freebayes.20:35800000-35900000.baq.20110328.vcf, freebayes.20:35900000-36000000.baq.20110328.vcf, freebayes.20:36000000-36100000.baq.20110328.vcf, freebayes.20:36100000-36200000.baq.20110328.vcf, freebayes.20:36200000-36300000.baq.20110328.vcf, freebayes.20:36300000-36400000.baq.20110328.vcf, freebayes.20:36400000-36500000.baq.20110328.vcf, 
freebayes.20:36500000-36600000.baq.20110328.vcf, freebayes.20:36600000-36700000.baq.20110328.vcf, freebayes.20:36700000-36800000.baq.20110328.vcf, freebayes.20:36800000-36900000.baq.20110328.vcf, freebayes.20:36900000-37000000.baq.20110328.vcf, freebayes.20:37000000-37100000.baq.20110328.vcf, freebayes.20:37100000-37200000.baq.20110328.vcf, freebayes.20:37200000-37300000.baq.20110328.vcf, freebayes.20:37300000-37400000.baq.20110328.vcf, freebayes.20:37400000-37500000.baq.20110328.vcf, freebayes.20:37500000-37600000.baq.20110328.vcf, freebayes.20:37600000-37700000.baq.20110328.vcf, freebayes.20:37700000-37800000.baq.20110328.vcf, freebayes.20:37800000-37900000.baq.20110328.vcf, freebayes.20:37900000-38000000.baq.20110328.vcf, freebayes.20:38000000-38100000.baq.20110328.vcf, freebayes.20:38100000-38200000.baq.20110328.vcf, freebayes.20:38200000-38300000.baq.20110328.vcf, freebayes.20:38300000-38400000.baq.20110328.vcf, freebayes.20:38400000-38500000.baq.20110328.vcf, freebayes.20:38500000-38600000.baq.20110328.vcf, freebayes.20:38600000-38700000.baq.20110328.vcf, freebayes.20:38700000-38800000.baq.20110328.vcf, freebayes.20:38800000-38900000.baq.20110328.vcf, freebayes.20:38900000-39000000.baq.20110328.vcf, freebayes.20:39000000-39100000.baq.20110328.vcf, freebayes.20:39100000-39200000.baq.20110328.vcf, freebayes.20:39200000-39300000.baq.20110328.vcf, freebayes.20:39300000-39400000.baq.20110328.vcf, freebayes.20:39400000-39500000.baq.20110328.vcf, freebayes.20:39500000-39600000.baq.20110328.vcf, freebayes.20:39600000-39700000.baq.20110328.vcf, freebayes.20:39700000-39800000.baq.20110328.vcf, freebayes.20:39800000-39900000.baq.20110328.vcf, freebayes.20:39900000-40000000.baq.20110328.vcf, freebayes.20:40000000-40100000.baq.20110328.vcf, freebayes.20:40100000-40200000.baq.20110328.vcf, freebayes.20:40200000-40300000.baq.20110328.vcf, freebayes.20:40300000-40400000.baq.20110328.vcf, freebayes.20:40400000-40500000.baq.20110328.vcf, 
freebayes.20:40500000-40600000.baq.20110328.vcf, freebayes.20:40600000-40700000.baq.20110328.vcf, freebayes.20:40700000-40800000.baq.20110328.vcf, freebayes.20:40800000-40900000.baq.20110328.vcf, freebayes.20:40900000-41000000.baq.20110328.vcf, freebayes.20:41000000-41100000.baq.20110328.vcf, freebayes.20:41100000-41200000.baq.20110328.vcf, freebayes.20:41200000-41300000.baq.20110328.vcf, freebayes.20:41300000-41400000.baq.20110328.vcf, freebayes.20:41400000-41500000.baq.20110328.vcf, freebayes.20:41500000-41600000.baq.20110328.vcf, freebayes.20:41600000-41700000.baq.20110328.vcf, freebayes.20:41700000-41800000.baq.20110328.vcf, freebayes.20:41800000-41900000.baq.20110328.vcf, freebayes.20:41900000-42000000.baq.20110328.vcf, freebayes.20:42000000-42100000.baq.20110328.vcf, freebayes.20:42100000-42200000.baq.20110328.vcf, freebayes.20:42200000-42300000.baq.20110328.vcf, freebayes.20:42300000-42400000.baq.20110328.vcf, freebayes.20:42400000-42500000.baq.20110328.vcf, freebayes.20:42500000-42600000.baq.20110328.vcf, freebayes.20:42600000-42700000.baq.20110328.vcf, freebayes.20:42700000-42800000.baq.20110328.vcf, freebayes.20:42800000-42900000.baq.20110328.vcf, freebayes.20:42900000-43000000.baq.20110328.vcf, freebayes.20:43000000-43100000.baq.20110328.vcf, freebayes.20:43100000-43200000.baq.20110328.vcf, freebayes.20:43200000-43300000.baq.20110328.vcf, freebayes.20:43300000-43400000.baq.20110328.vcf, freebayes.20:43400000-43500000.baq.20110328.vcf, freebayes.20:43500000-43600000.baq.20110328.vcf, freebayes.20:43600000-43700000.baq.20110328.vcf, freebayes.20:43700000-43800000.baq.20110328.vcf, freebayes.20:43800000-43900000.baq.20110328.vcf, freebayes.20:43900000-44000000.baq.20110328.vcf, freebayes.20:44000000-44100000.baq.20110328.vcf, freebayes.20:44100000-44200000.baq.20110328.vcf, freebayes.20:44200000-44300000.baq.20110328.vcf, freebayes.20:44300000-44400000.baq.20110328.vcf, freebayes.20:44400000-44500000.baq.20110328.vcf, 
freebayes.20:44500000-44600000.baq.20110328.vcf, freebayes.20:44600000-44700000.baq.20110328.vcf, freebayes.20:44700000-44800000.baq.20110328.vcf, freebayes.20:44800000-44900000.baq.20110328.vcf, freebayes.20:44900000-45000000.baq.20110328.vcf, freebayes.20:45000000-45100000.baq.20110328.vcf, freebayes.20:45100000-45200000.baq.20110328.vcf, freebayes.20:45200000-45300000.baq.20110328.vcf, freebayes.20:45300000-45400000.baq.20110328.vcf, freebayes.20:45400000-45500000.baq.20110328.vcf, freebayes.20:45500000-45600000.baq.20110328.vcf, freebayes.20:45600000-45700000.baq.20110328.vcf, freebayes.20:45700000-45800000.baq.20110328.vcf, freebayes.20:45800000-45900000.baq.20110328.vcf, freebayes.20:45900000-46000000.baq.20110328.vcf, freebayes.20:46000000-46100000.baq.20110328.vcf, freebayes.20:46100000-46200000.baq.20110328.vcf, freebayes.20:46200000-46300000.baq.20110328.vcf, freebayes.20:46300000-46400000.baq.20110328.vcf, freebayes.20:46400000-46500000.baq.20110328.vcf, freebayes.20:46500000-46600000.baq.20110328.vcf, freebayes.20:46600000-46700000.baq.20110328.vcf, freebayes.20:46700000-46800000.baq.20110328.vcf, freebayes.20:46800000-46900000.baq.20110328.vcf, freebayes.20:46900000-47000000.baq.20110328.vcf, freebayes.20:47000000-47100000.baq.20110328.vcf, freebayes.20:47100000-47200000.baq.20110328.vcf, freebayes.20:47200000-47300000.baq.20110328.vcf, freebayes.20:47300000-47400000.baq.20110328.vcf, freebayes.20:47400000-47500000.baq.20110328.vcf, freebayes.20:47500000-47600000.baq.20110328.vcf, freebayes.20:47600000-47700000.baq.20110328.vcf, freebayes.20:47700000-47800000.baq.20110328.vcf, freebayes.20:47800000-47900000.baq.20110328.vcf, freebayes.20:47900000-48000000.baq.20110328.vcf, freebayes.20:48000000-48100000.baq.20110328.vcf, freebayes.20:48100000-48200000.baq.20110328.vcf, freebayes.20:48200000-48300000.baq.20110328.vcf, freebayes.20:48300000-48400000.baq.20110328.vcf, freebayes.20:48400000-48500000.baq.20110328.vcf, 
freebayes.20:48500000-48600000.baq.20110328.vcf, freebayes.20:48600000-48700000.baq.20110328.vcf, freebayes.20:48700000-48800000.baq.20110328.vcf, freebayes.20:48800000-48900000.baq.20110328.vcf, freebayes.20:48900000-49000000.baq.20110328.vcf, freebayes.20:49000000-49100000.baq.20110328.vcf, freebayes.20:49100000-49200000.baq.20110328.vcf, freebayes.20:49200000-49300000.baq.20110328.vcf, freebayes.20:49300000-49400000.baq.20110328.vcf, freebayes.20:49400000-49500000.baq.20110328.vcf, freebayes.20:49500000-49600000.baq.20110328.vcf, freebayes.20:49600000-49700000.baq.20110328.vcf, freebayes.20:49700000-49800000.baq.20110328.vcf, freebayes.20:49800000-49900000.baq.20110328.vcf, freebayes.20:49900000-50000000.baq.20110328.vcf, freebayes.20:50000000-50100000.baq.20110328.vcf, freebayes.20:50100000-50200000.baq.20110328.vcf, freebayes.20:50200000-50300000.baq.20110328.vcf, freebayes.20:50300000-50400000.baq.20110328.vcf, freebayes.20:50400000-50500000.baq.20110328.vcf, freebayes.20:50500000-50600000.baq.20110328.vcf, freebayes.20:50600000-50700000.baq.20110328.vcf, freebayes.20:50700000-50800000.baq.20110328.vcf, freebayes.20:50800000-50900000.baq.20110328.vcf, freebayes.20:50900000-51000000.baq.20110328.vcf, freebayes.20:51000000-51100000.baq.20110328.vcf, freebayes.20:51100000-51200000.baq.20110328.vcf, freebayes.20:51200000-51300000.baq.20110328.vcf, freebayes.20:51300000-51400000.baq.20110328.vcf, freebayes.20:51400000-51500000.baq.20110328.vcf, freebayes.20:51500000-51600000.baq.20110328.vcf, freebayes.20:51600000-51700000.baq.20110328.vcf, freebayes.20:51700000-51800000.baq.20110328.vcf, freebayes.20:51800000-51900000.baq.20110328.vcf, freebayes.20:51900000-52000000.baq.20110328.vcf, freebayes.20:52000000-52100000.baq.20110328.vcf, freebayes.20:52100000-52200000.baq.20110328.vcf, freebayes.20:52200000-52300000.baq.20110328.vcf, freebayes.20:52300000-52400000.baq.20110328.vcf, freebayes.20:52400000-52500000.baq.20110328.vcf, 
freebayes.20:52500000-52600000.baq.20110328.vcf, freebayes.20:52600000-52700000.baq.20110328.vcf, freebayes.20:52700000-52800000.baq.20110328.vcf, freebayes.20:52800000-52900000.baq.20110328.vcf, freebayes.20:52900000-53000000.baq.20110328.vcf, freebayes.20:53000000-53100000.baq.20110328.vcf, freebayes.20:53100000-53200000.baq.20110328.vcf, freebayes.20:53200000-53300000.baq.20110328.vcf, freebayes.20:53300000-53400000.baq.20110328.vcf, freebayes.20:53400000-53500000.baq.20110328.vcf, freebayes.20:53500000-53600000.baq.20110328.vcf, freebayes.20:53600000-53700000.baq.20110328.vcf, freebayes.20:53700000-53800000.baq.20110328.vcf, freebayes.20:53800000-53900000.baq.20110328.vcf, freebayes.20:53900000-54000000.baq.20110328.vcf, freebayes.20:54000000-54100000.baq.20110328.vcf, freebayes.20:54100000-54200000.baq.20110328.vcf, freebayes.20:54200000-54300000.baq.20110328.vcf, freebayes.20:54300000-54400000.baq.20110328.vcf, freebayes.20:54400000-54500000.baq.20110328.vcf, freebayes.20:54500000-54600000.baq.20110328.vcf, freebayes.20:54600000-54700000.baq.20110328.vcf, freebayes.20:54700000-54800000.baq.20110328.vcf, freebayes.20:54800000-54900000.baq.20110328.vcf, freebayes.20:54900000-55000000.baq.20110328.vcf, freebayes.20:55000000-55100000.baq.20110328.vcf, freebayes.20:55100000-55200000.baq.20110328.vcf, freebayes.20:55200000-55300000.baq.20110328.vcf, freebayes.20:55300000-55400000.baq.20110328.vcf, freebayes.20:55400000-55500000.baq.20110328.vcf, freebayes.20:55500000-55600000.baq.20110328.vcf, freebayes.20:55600000-55700000.baq.20110328.vcf, freebayes.20:55700000-55800000.baq.20110328.vcf, freebayes.20:55800000-55900000.baq.20110328.vcf, freebayes.20:55900000-56000000.baq.20110328.vcf, freebayes.20:56000000-56100000.baq.20110328.vcf, freebayes.20:56100000-56200000.baq.20110328.vcf, freebayes.20:56200000-56300000.baq.20110328.vcf, freebayes.20:56300000-56400000.baq.20110328.vcf, freebayes.20:56400000-56500000.baq.20110328.vcf, 
freebayes.20:56500000-56600000.baq.20110328.vcf, freebayes.20:56600000-56700000.baq.20110328.vcf, freebayes.20:56700000-56800000.baq.20110328.vcf, freebayes.20:56800000-56900000.baq.20110328.vcf, freebayes.20:56900000-57000000.baq.20110328.vcf, freebayes.20:57000000-57100000.baq.20110328.vcf, freebayes.20:57100000-57200000.baq.20110328.vcf, freebayes.20:57200000-57300000.baq.20110328.vcf, freebayes.20:57300000-57400000.baq.20110328.vcf, freebayes.20:57400000-57500000.baq.20110328.vcf, freebayes.20:57500000-57600000.baq.20110328.vcf, freebayes.20:57600000-57700000.baq.20110328.vcf, freebayes.20:57700000-57800000.baq.20110328.vcf, freebayes.20:57800000-57900000.baq.20110328.vcf, freebayes.20:57900000-58000000.baq.20110328.vcf, freebayes.20:58000000-58100000.baq.20110328.vcf, freebayes.20:58100000-58200000.baq.20110328.vcf, freebayes.20:58200000-58300000.baq.20110328.vcf, freebayes.20:58300000-58400000.baq.20110328.vcf, freebayes.20:58400000-58500000.baq.20110328.vcf, freebayes.20:58500000-58600000.baq.20110328.vcf, freebayes.20:58600000-58700000.baq.20110328.vcf, freebayes.20:58700000-58800000.baq.20110328.vcf, freebayes.20:58800000-58900000.baq.20110328.vcf, freebayes.20:58900000-59000000.baq.20110328.vcf, freebayes.20:59000000-59100000.baq.20110328.vcf, freebayes.20:59100000-59200000.baq.20110328.vcf, freebayes.20:59200000-59300000.baq.20110328.vcf, freebayes.20:59300000-59400000.baq.20110328.vcf, freebayes.20:59400000-59500000.baq.20110328.vcf, freebayes.20:59500000-59600000.baq.20110328.vcf, freebayes.20:59600000-59700000.baq.20110328.vcf, freebayes.20:59700000-59800000.baq.20110328.vcf, freebayes.20:59800000-59900000.baq.20110328.vcf, freebayes.20:59900000-60000000.baq.20110328.vcf, freebayes.20:60000000-60100000.baq.20110328.vcf, freebayes.20:60100000-60200000.baq.20110328.vcf, freebayes.20:60200000-60300000.baq.20110328.vcf, freebayes.20:60300000-60400000.baq.20110328.vcf, freebayes.20:60400000-60500000.baq.20110328.vcf, 
freebayes.20:60500000-60600000.baq.20110328.vcf, freebayes.20:60600000-60700000.baq.20110328.vcf, freebayes.20:60700000-60800000.baq.20110328.vcf, freebayes.20:60800000-60900000.baq.20110328.vcf, freebayes.20:60900000-61000000.baq.20110328.vcf, freebayes.20:61000000-61100000.baq.20110328.vcf, freebayes.20:61100000-61200000.baq.20110328.vcf, freebayes.20:61200000-61300000.baq.20110328.vcf, freebayes.20:61300000-61400000.baq.20110328.vcf, freebayes.20:61400000-61500000.baq.20110328.vcf, freebayes.20:61500000-61600000.baq.20110328.vcf, freebayes.20:61600000-61700000.baq.20110328.vcf, freebayes.20:61700000-61800000.baq.20110328.vcf, freebayes.20:61800000-61900000.baq.20110328.vcf, freebayes.20:61900000-62000000.baq.20110328.vcf, freebayes.20:62000000-62100000.baq.20110328.vcf, freebayes.20:62100000-62200000.baq.20110328.vcf, freebayes.20:62200000-62300000.baq.20110328.vcf, freebayes.20:62300000-62400000.baq.20110328.vcf, freebayes.20:62400000-62500000.baq.20110328.vcf, freebayes.20:62500000-62600000.baq.20110328.vcf, freebayes.20:62600000-62700000.baq.20110328.vcf, freebayes.20:62700000-62800000.baq.20110328.vcf, freebayes.20:62800000-62900000.baq.20110328.vcf, freebayes.20:62900000-63000000.baq.20110328.vcf, freebayes.20:63000000-63025520.baq.20110328.vcf
-#CHROM POS ID REF ALT QUAL FILTER INFO
+#CHROM POS ID REF ALT QUAL FILTER INFO
20 458502 . G GA 4567.01 PASS AA=20;AB=0.61111;ABA=14;ABP=6.8707;ABR=22;AC=38;AF=0.0544;AN=698;BL=374;BR=1129;BVAR;BaseQRankSum=13.364;DP=15979;DP4=1882,2188,45,37;Dels=0.00;EL=5;EPP=13.868;ER=15;FR;FS=6.503;HETAR=11;HOMA=2;HOMR=985;HP=1;HPLen=2;HR=2;HRun=0;HU=G;INDEL;INS;InbreedingCoeff=0.0157;IndelType=INS.NOVEL_1.Novel_A.;LEN=1;LRB=0.50233;LRBP=826.56;MQ=66.16;MQ0Fraction=0.0110;MQM=70.5;MQRankSum=-3.158;NF;NR;NS=998;PP;PV4=0.15,1,0.42,0.15;RA=3173;RL=1;RPP=38.188;RR=19;RUN=1;ReadPosRankSum=-2.346;SAB=0.7;SAF=14;SAP=9.959;SAR=6;SC=GGGCGTGGTGGTGCATGTAAT;SRB=0.50047;SRF=1588;SRP=3.0165;SRR=1585;TC;TR=9;TU=GGT;VQSLOD=10.0079;set=Intersection;sumGLbyD=23.94
20 573764 . TA T 591.51 PASS AC=91;AF=0.1987;AN=458;BaseQRankSum=0.137;DP=519;FS=3.153;HRun=1;HaplotypeScore=14.0744;InbreedingCoeff=0.1460;IndelType=DEL.NumRepetitions_1.EventLength_1.RepeatExpansion_A.;MQ=48.16;MQ0=26;MQ0Fraction=0.0501;MQRankSum=-1.636;QD=3.63;ReadPosRankSum=-4.140;SB=-408.14;VQSLOD=5.2458;set=VQSR
20 766143 . C CATCTGGTA 5521.70 PASS AA=24;AB=0.5;ABA=18;ABP=3.0103;ABR=18;AC=14;AF=0.0289;AF1=0.02038;AN=484;BL=655;BR=1542;BVAR;BaseQRankSum=3.801;CI95=0.01549,0.02655;DP=11749;DP4=2222,1998,14,8;Dels=0.00;EL=9;EPP=6.2675;ER=15;FQ=999;FR;FS=2.941;HETAR=9;HOMA=4;HOMR=901;HP=2;HPLen=2;HR=1;HRun=0;HU=A;INDEL;INS;InbreedingCoeff=0.0515;IndelType=INS.NumRepetitions_1.EventLength_8.;LEN=8;LRB=0.40373;LRBP=780.64;MQ=56.81;MQ0Fraction=0.0253;MQM=22.167;MQRankSum=-4.809;NF;NR;NS=914;PP;PV4=0.39,1,5.8e-07,1;RA=3093;RL=6;RPP=16.039;RR=18;RUN=1;ReadPosRankSum=-2.827;SAB=0.625;SAF=15;SAP=6.2675;SAR=9;SC=GCTTTAAATTCATCTGGTACT;SRB=0.61623;SRF=1906;SRP=365.95;SRR=1187;TC;TR=1;TU=A;VQSLOD=7.0268;set=Intersection;sumGLbyD=50.23
##INFO=<ID=Sanger,Number=1,Type=String,Description="Status of Sanger Sequencing for this site">\r
##INFO=<ID=NotCalledInValidationSamples,Number=.,Type=Flag,Description="Was not called polymorphic in sequencing for any of the passing validation samples; could still be polymorphic">\r
##reference=file:///humgen/1kg/reference/human_g1k_v37.fasta\r
-#CHROM POS ID REF ALT QUAL FILTER INFO \r
+#CHROM POS ID REF ALT QUAL FILTER INFO\r
20 207414 . G A . . PacBio=NoCall;Sqnm=NoCall\r
20 792106 . C G . . PacBio=Poly;Sqnm=NoCall\r
20 894031 . G A . . PacBio=Poly;Sqnm=Poly\r
##source=SelectVariants
##source_20110031.1=/nfs/users/nfs_p/pd3/cvs/vcftools/perl/vcf-annotate -d /nfs/users/nfs_p/pd3/sandbox/hapmap/dbSNP-b132/non-1kg-vld.desc -a /nfs/users/nfs_p/pd3/sandbox/hapmap/dbSNP-b132/non-1kg-vld.tab.gz -c CHROM,FROM,INFO/VLD,INFO/KGPilot123,INFO/dbSNP
##vcfCTools=filter
-#CHROM POS ID REF ALT QUAL FILTER INFO
+#CHROM POS ID REF ALT QUAL FILTER INFO
20 458502 . G GA 4567.01 PASS AA=20;AB=0.61111;ABA=14;ABP=6.8707;ABR=22;AC=38;AF=0.0544;AN=698;BL=374;BR=1129;BVAR;BaseQRankSum=13.364;DP=15979;DP4=1882,2188,45,37;Dels=0.00;EL=5;EPP=13.868;ER=15;FR;FS=6.503;HETAR=11;HOMA=2;HOMR=985;HP=1;HPLen=2;HR=2;HRun=0;HU=G;INDEL;INS;InbreedingCoeff=0.0157;IndelType=INS.NOVEL_1.Novel_A.;LEN=1;LRB=0.50233;LRBP=826.56;MQ=66.16;MQ0Fraction=0.0110;MQM=70.5;MQRankSum=-3.158;NF;NR;NS=998;PP;PV4=0.15,1,0.42,0.15;RA=3173;RL=1;RPP=38.188;RR=19;RUN=1;ReadPosRankSum=-2.346;SAB=0.7;SAF=14;SAP=9.959;SAR=6;SC=GGGCGTGGTGGTGCATGTAAT;SET_INTEGRATION;SET_WGVQSR;SRB=0.50047;SRF=1588;SRP=3.0165;SRR=1585;TC;TR=9;TU=GGT;VQSLOD=10.0079;set=Intersection;sumGLbyD=23.94
20 573764 . TA T 591.51 PASS AC=91;AF=0.1987;AN=458;BaseQRankSum=0.137;DP=519;FS=3.153;HRun=1;HaplotypeScore=14.0744;InbreedingCoeff=0.1460;IndelType=DEL.NumRepetitions_1.EventLength_1.RepeatExpansion_A.;MQ=48.16;MQ0=26;MQ0Fraction=0.0501;MQRankSum=-1.636;QD=3.63;ReadPosRankSum=-4.140;SB=-408.14;SET_INTEGRATION;SET_WGVQSR;VQSLOD=5.2458;set=VQSR
20 766143 . C CATCTGGTA 5521.70 PASS AA=24;AB=0.5;ABA=18;ABP=3.0103;ABR=18;AC=14;AF=0.0289;AF1=0.02038;AN=484;BL=655;BR=1542;BVAR;BaseQRankSum=3.801;CI95=0.01549,0.02655;DP=11749;DP4=2222,1998,14,8;Dels=0.00;EL=9;EPP=6.2675;ER=15;FQ=999;FR;FS=2.941;HETAR=9;HOMA=4;HOMR=901;HP=2;HPLen=2;HR=1;HRun=0;HU=A;INDEL;INS;InbreedingCoeff=0.0515;IndelType=INS.NumRepetitions_1.EventLength_8.;LEN=8;LRB=0.40373;LRBP=780.64;MQ=56.81;MQ0Fraction=0.0253;MQM=22.167;MQRankSum=-4.809;NF;NR;NS=914;PP;PV4=0.39,1,5.8e-07,1;RA=3093;RL=6;RPP=16.039;RR=18;RUN=1;ReadPosRankSum=-2.827;SAB=0.625;SAF=15;SAP=6.2675;SAR=9;SC=GCTTTAAATTCATCTGGTACT;SET_INTEGRATION;SET_WGVQSR;SRB=0.61623;SRF=1906;SRP=365.95;SRR=1187;TC;TR=1;TU=A;VQSLOD=7.0268;set=Intersection;sumGLbyD=50.23
"filter", "info", "format")
fail_on_parsing = [
- (24, "Could not parse the header, sample line not found"),
+ (24, 'Could not parse the "#CHROM.." line'),
("issue85", "empty VCF"),
]
fail_on_opening = [
- (24, "Could not parse the header, sample line not found"),
+ (24, 'Could not parse the "#CHROM.." line'),
("issue85", "empty VCF"),
]
coordinate_offset = 0
import re
import copy
import gzip
-from TestUtils import load_and_convert, make_data_files, TABIX_DATADIR
+from TestUtils import load_and_convert, make_data_files, TABIX_DATADIR, IS_PYTHON3
def setUpModule():
make_data_files(TABIX_DATADIR)
+# Exercises starred ("extended iterable") unpacking of the rows yielded by
+# TabixFile.fetch() with the asTuple parser, using a BED fixture.
+# The whole class is skipped unless IS_PYTHON3 is true, because
+# `a, *rest = seq` is not valid Python 2 syntax.
+@unittest.skipUnless(IS_PYTHON3, "Requires Python 3 Extended Iterable Unpacking")
+class TestBED(unittest.TestCase):
+
+    # Tabix-indexed BED fixture located under TABIX_DATADIR.
+    filename = os.path.join(TABIX_DATADIR, "fivecolumns.bed.gz")
+
+    # Open the fixture before each test...
+    def setUp(self):
+        self.tabix = pysam.TabixFile(self.filename)
+
+    # ...and close it again afterwards.
+    def tearDown(self):
+        self.tabix.close()
+
+    # Unpack every fetched row with a starred target and check that the
+    # starred name receives all the columns not bound to a plain name.
+    def testAssignmentToTargetList(self):
+        # TODO: once Python 2 support is dropped, replace the exec() calls and
+        # the `my` namespace dict with plain starred assignment statements.
+        # NOTE(review): the statements are presumably kept inside strings so
+        # that this module still compiles under Python 2 -- confirm.
+        # `my` is passed to exec() as the local namespace: it supplies `row`
+        # to the executed statement and receives the names that it binds.
+        my = {}
+        for row in self.tabix.fetch(parser=pysam.asTuple()):
+            my['row'] = row
+
+            # Test that *others gets the right columns...
+            exec('contig, start, end, *others = row', globals(), my)
+            self.assertEqual(3 + len(my['others']), len(row))
+
+            # ...and that a TupleProxy can be assigned from more than once
+            # (the same `row` object is unpacked a second time here).
+            exec('contig, *others = row', globals(), my)
+            self.assertEqual(1 + len(my['others']), len(row))
+
+
class TestParser(unittest.TestCase):
filename = os.path.join(TABIX_DATADIR, "example.gtf.gz")
self.assertEqual("\t".join(map(str, c)),
str(r))
+    # Unpack each fetched row with a starred target in the middle of the
+    # target list and check that the pieces add up to the full row length.
+    # Skipped unless IS_PYTHON3 is true, since starred assignment is Python 3
+    # only syntax.
+    @unittest.skipUnless(IS_PYTHON3, "Requires Python 3 Extended Iterable Unpacking")
+    def testAssignmentToTargetList(self):
+        # The enumerate() index `x` is not used in this test.
+        for x, r in enumerate(self.tabix.fetch(parser=pysam.asTuple())):
+            # `my` is the local namespace for exec(): it supplies `r` and
+            # receives the names bound by the executed assignment.
+            my = { 'r': r }
+            # NOTE(review): the statement is presumably wrapped in exec() so
+            # the module still compiles under Python 2 -- confirm.
+            exec('col1, col2, *others, colN = r', globals(), my)
+            # Two leading names + the starred remainder + one trailing name
+            # must account for every column of the row.
+            self.assertEqual(2 + len(my['others']) + 1, len(r))
+
def testWrite(self):
for x, r in enumerate(self.tabix.fetch(parser=pysam.asTuple())):