From 6136085e9b8025bfc1a5e65c5b39d3cd98aceed1 Mon Sep 17 00:00:00 2001 From: Afif Elghraoui Date: Sun, 22 Apr 2018 18:30:02 -0400 Subject: [PATCH] New upstream version 0.14.1+ds --- NEWS | 12 + bcftools/bcftools.h | 17 + bcftools/consensus.c | 13 +- bcftools/consensus.c.pysam.c | 13 +- bcftools/convert.c | 30 +- bcftools/convert.c.pysam.c | 30 +- bcftools/convert.h | 3 +- bcftools/csq.c | 19 +- bcftools/csq.c.pysam.c | 19 +- bcftools/filter.c | 1398 ++++++++++------- bcftools/filter.c.pysam.c | 1398 ++++++++++------- bcftools/main.c.pysam.c | 4 +- bcftools/regidx.c | 40 +- bcftools/regidx.c.pysam.c | 40 +- bcftools/regidx.h | 8 + bcftools/reheader.c | 4 +- bcftools/reheader.c.pysam.c | 4 +- bcftools/vcfannotate.c | 39 +- bcftools/vcfannotate.c.pysam.c | 39 +- bcftools/vcfcnv.c | 12 +- bcftools/vcfcnv.c.pysam.c | 12 +- bcftools/vcfconvert.c | 2 +- bcftools/vcfconvert.c.pysam.c | 2 +- bcftools/vcfgtcheck.c | 6 +- bcftools/vcfgtcheck.c.pysam.c | 6 +- bcftools/vcfnorm.c | 114 +- bcftools/vcfnorm.c.pysam.c | 114 +- bcftools/vcfplugin.c | 8 +- bcftools/vcfplugin.c.pysam.c | 8 +- bcftools/vcfquery.c | 44 +- bcftools/vcfquery.c.pysam.c | 44 +- bcftools/vcfroh.c | 4 +- bcftools/vcfroh.c.pysam.c | 4 +- bcftools/vcfsom.c | 2 +- bcftools/vcfsom.c.pysam.c | 2 +- bcftools/vcfstats.c | 23 +- bcftools/vcfstats.c.pysam.c | 23 +- bcftools/vcfview.c | 9 +- bcftools/vcfview.c.pysam.c | 9 +- bcftools/version.h | 2 +- doc/release.rst | 13 + pysam/libcalignedsegment.pyx | 15 +- pysam/libcalignmentfile.pyx | 45 +- pysam/version.py | 2 +- tests/AlignmentFileHeader_test.py | 17 +- tests/AlignmentFilePileup_test.py | 36 +- tests/AlignmentFile_test.py | 18 +- tests/TestUtils.py | 2 +- tests/faidx_test.py | 6 +- .../pysam_data/example_reverse_complement.sam | 28 + tests/tabix_test.py | 6 +- 51 files changed, 2349 insertions(+), 1419 deletions(-) create mode 100644 tests/pysam_data/example_reverse_complement.sam diff --git a/NEWS b/NEWS index bca0d0c..0e2cb71 100644 --- a/NEWS +++ b/NEWS @@ -5,6 +5,18 @@ http://pysam.readthedocs.io/en/latest/release.html Release notes ============= +Release 0.14.1 +============== + +This is mostly a bugfix release, though bcftools has now also been +upgraded to 1.7.0. + +* [#621] Add a warning to count_coverage when an alignment has an + empty QUAL field +* [#635] Speed-up of AlignedSegment.find_intro() +* treat border case of all bases in pileup column below quality score +* [#634] Fix access to pileup reference_sequence + Release 0.14.0 ============== diff --git a/bcftools/bcftools.h b/bcftools/bcftools.h index dde3ab0..d3ba814 100644 --- a/bcftools/bcftools.h +++ b/bcftools/bcftools.h @@ -63,6 +63,23 @@ static inline char gt2iupac(char a, char b) return iupac[(int)a][(int)b]; } +static inline int iupac_consistent(char iupac, char nt) +{ + static const char iupac_mask[90] = { + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,14,2, + 13,0,0,4,11,0,0,12,0,3,15,0,0,0,5,6,8,0,7,9,0,10 + }; + if ( iupac > 89 ) return 0; + if ( nt > 90 ) nt -= 32; // lowercase + if ( nt=='A' ) nt = 1; + else if ( nt=='C' ) nt = 2; + else if ( nt=='G' ) nt = 4; + else if ( nt=='T' ) nt = 8; + return iupac_mask[(int)iupac] & nt ? 1 : 0; +} + static inline char nt_to_upper(char nt) { if ( nt < 97 ) return nt; diff --git a/bcftools/consensus.c b/bcftools/consensus.c index 544eca6..d67a052 100644 --- a/bcftools/consensus.c +++ b/bcftools/consensus.c @@ -77,6 +77,7 @@ typedef struct rbuf_t vcf_rbuf; bcf1_t **vcf_buf; int nvcf_buf, rid; + char *chr; regidx_t *mask; regitr_t *itr; @@ -121,6 +122,8 @@ static void destroy_chain(args_t *args) free(chain->block_lengths); free(chain); chain = NULL; + free(args->chr); + args->chr = NULL; } static void print_chain(args_t *args) @@ -162,7 +165,7 @@ static void print_chain(args_t *args) score += chain->block_lengths[n]; } score += last_block_size; - fprintf(args->fp_chain, "chain %d %s %d + %d %d %s %d + %d %d %d\n", score, bcf_hdr_id2name(args->hdr,args->rid), ref_end_pos, chain->ori_pos, ref_end_pos, bcf_hdr_id2name(args->hdr,args->rid), alt_end_pos, chain->ori_pos, alt_end_pos, ++args->chain_id); + fprintf(args->fp_chain, "chain %d %s %d + %d %d %s %d + %d %d %d\n", score, args->chr, ref_end_pos, chain->ori_pos, ref_end_pos, args->chr, alt_end_pos, chain->ori_pos, alt_end_pos, ++args->chain_id); for (n=0; nnum; n++) { fprintf(args->fp_chain, "%d %d %d\n", chain->block_lengths[n], chain->ref_gaps[n], chain->alt_gaps[n]); } @@ -248,6 +251,7 @@ static void destroy_data(args_t *args) if ( args->vcf_buf[i] ) bcf_destroy1(args->vcf_buf[i]); free(args->vcf_buf); free(args->fa_buf.s); + free(args->chr); if ( args->mask ) regidx_destroy(args->mask); if ( args->itr ) regitr_destroy(args->itr); if ( args->chain_fname ) @@ -276,6 +280,7 @@ static void init_region(args_t *args, char *line) else to--; } } + args->chr = strdup(line); args->rid = bcf_hdr_name2id(args->hdr,line); if ( args->rid<0 ) fprintf(stderr,"Warning: Sequence \"%s\" not in %s\n", line,args->fname); args->fa_buf.l = 0; @@ -380,7 +385,7 @@ static void apply_variant(args_t *args, bcf1_t *rec) if ( regidx_overlap(args->mask, chr,start,end,NULL) ) return; } - int i, ialt = 1; + int i, ialt = 1; // the alternate allele if ( args->isample >= 0 ) { bcf_unpack(rec, BCF_UN_FMT); @@ -417,6 +422,7 @@ static void apply_variant(args_t *args, bcf1_t *rec) { char ial = rec->d.allele[ialt][0]; char jal = rec->d.allele[jalt][0]; + if ( !ialt ) ialt = jalt; // only ialt is used, make sure 0/1 is not ignored rec->d.allele[ialt][0] = gt2iupac(ial,jal); } } @@ -565,11 +571,10 @@ static void apply_variant(args_t *args, bcf1_t *rec) static void mask_region(args_t *args, char *seq, int len) { - char *chr = (char*)bcf_hdr_id2name(args->hdr,args->rid); int start = args->fa_src_pos - len; int end = args->fa_src_pos; - if ( !regidx_overlap(args->mask, chr,start,end, args->itr) ) return; + if ( !regidx_overlap(args->mask, args->chr,start,end, args->itr) ) return; int idx_start, idx_end, i; while ( regitr_overlap(args->itr) ) diff --git a/bcftools/consensus.c.pysam.c b/bcftools/consensus.c.pysam.c index 9ba5dd0..9015601 100644 --- a/bcftools/consensus.c.pysam.c +++ b/bcftools/consensus.c.pysam.c @@ -79,6 +79,7 @@ typedef struct rbuf_t vcf_rbuf; bcf1_t **vcf_buf; int nvcf_buf, rid; + char *chr; regidx_t *mask; regitr_t *itr; @@ -123,6 +124,8 @@ static void destroy_chain(args_t *args) free(chain->block_lengths); free(chain); chain = NULL; + free(args->chr); + args->chr = NULL; } static void print_chain(args_t *args) @@ -164,7 +167,7 @@ static void print_chain(args_t *args) score += chain->block_lengths[n]; } score += last_block_size; - fprintf(args->fp_chain, "chain %d %s %d + %d %d %s %d + %d %d %d\n", score, bcf_hdr_id2name(args->hdr,args->rid), ref_end_pos, chain->ori_pos, ref_end_pos, bcf_hdr_id2name(args->hdr,args->rid), alt_end_pos, chain->ori_pos, alt_end_pos, ++args->chain_id); + fprintf(args->fp_chain, "chain %d %s %d + %d %d %s %d + %d %d %d\n", score, args->chr, ref_end_pos, chain->ori_pos, ref_end_pos, args->chr, alt_end_pos, chain->ori_pos, alt_end_pos, ++args->chain_id); for (n=0; nnum; n++) { fprintf(args->fp_chain, "%d %d %d\n", chain->block_lengths[n], chain->ref_gaps[n], chain->alt_gaps[n]); } @@ -250,6 +253,7 @@ static void destroy_data(args_t *args) if ( args->vcf_buf[i] ) bcf_destroy1(args->vcf_buf[i]); free(args->vcf_buf); free(args->fa_buf.s); + free(args->chr); if ( args->mask ) regidx_destroy(args->mask); if ( args->itr ) regitr_destroy(args->itr); if ( args->chain_fname ) @@ -278,6 +282,7 @@ static void init_region(args_t *args, char *line) else to--; } } + args->chr = strdup(line); args->rid = bcf_hdr_name2id(args->hdr,line); if ( args->rid<0 ) fprintf(bcftools_stderr,"Warning: Sequence \"%s\" not in %s\n", line,args->fname); args->fa_buf.l = 0; @@ -382,7 +387,7 @@ static void apply_variant(args_t *args, bcf1_t *rec) if ( regidx_overlap(args->mask, chr,start,end,NULL) ) return; } - int i, ialt = 1; + int i, ialt = 1; // the alternate allele if ( args->isample >= 0 ) { bcf_unpack(rec, BCF_UN_FMT); @@ -419,6 +424,7 @@ static void apply_variant(args_t *args, bcf1_t *rec) { char ial = rec->d.allele[ialt][0]; char jal = rec->d.allele[jalt][0]; + if ( !ialt ) ialt = jalt; // only ialt is used, make sure 0/1 is not ignored rec->d.allele[ialt][0] = gt2iupac(ial,jal); } } @@ -567,11 +573,10 @@ static void apply_variant(args_t *args, bcf1_t *rec) static void mask_region(args_t *args, char *seq, int len) { - char *chr = (char*)bcf_hdr_id2name(args->hdr,args->rid); int start = args->fa_src_pos - len; int end = args->fa_src_pos; - if ( !regidx_overlap(args->mask, chr,start,end, args->itr) ) return; + if ( !regidx_overlap(args->mask, args->chr,start,end, args->itr) ) return; int idx_start, idx_end, i; while ( regitr_overlap(args->itr) ) diff --git a/bcftools/convert.c b/bcftools/convert.c index 05dce01..d7ed0ab 100644 --- a/bcftools/convert.c +++ b/bcftools/convert.c @@ -1,6 +1,6 @@ /* convert.c -- functions for converting between VCF/BCF and related formats. - Copyright (C) 2013-2017 Genome Research Ltd. + Copyright (C) 2013-2018 Genome Research Ltd. Author: Petr Danecek @@ -92,6 +92,7 @@ struct _convert_t int ndat; char *undef_info_tag; int allow_undef_tags; + uint8_t **subset_samples; }; typedef struct @@ -174,6 +175,24 @@ static inline int32_t bcf_array_ivalue(void *bcf_array, int type, int idx) } return ((int32_t*)bcf_array)[idx]; } +static inline void _copy_field(char *src, uint32_t len, int idx, kstring_t *str) +{ + int n = 0, ibeg = 0; + while ( src[ibeg] && ibegibeg ) + kputsn(src+ibeg, iend-ibeg, str); + else + kputc('.', str); +} static void process_info(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) { if ( fmt->id<0 ) @@ -232,6 +251,7 @@ static void process_info(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isamp case BCF_BT_INT16: BRANCH(int16_t, val==bcf_int16_missing, val==bcf_int16_vector_end, kputw(val, str)); break; case BCF_BT_INT32: BRANCH(int32_t, val==bcf_int32_missing, val==bcf_int32_vector_end, kputw(val, str)); break; case BCF_BT_FLOAT: BRANCH(float, bcf_float_is_missing(val), bcf_float_is_vector_end(val), kputd(val, str)); break; + case BCF_BT_CHAR: _copy_field((char*)info->vptr, info->vptr_len, fmt->subscript, str); break; default: fprintf(stderr,"todo: type %d\n", info->type); exit(1); break; } #undef BRANCH @@ -288,6 +308,8 @@ static void process_format(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isa else kputw(ival, str); } + else if ( fmt->fmt->type == BCF_BT_CHAR ) + _copy_field((char*)(fmt->fmt->p + isample*fmt->fmt->size), fmt->fmt->size, fmt->subscript, str); else error("TODO: %s:%d .. fmt->type=%d\n", __FILE__,__LINE__, fmt->fmt->type); } else @@ -1312,6 +1334,9 @@ int convert_line(convert_t *convert, bcf1_t *line, kstring_t *str) } for (js=0; jsnsamples; js++) { + // Skip samples when filtering was requested + if ( *convert->subset_samples && !(*convert->subset_samples)[js] ) continue; + // Here comes a hack designed for TBCSQ. When running on large files, // such as 1000GP, there are too many empty fields in the output and // it's very very slow. Therefore in case the handler does not add @@ -1362,6 +1387,9 @@ int convert_set_option(convert_t *convert, enum convert_option opt, ...) case allow_undef_tags: convert->allow_undef_tags = va_arg(args, int); break; + case subset_samples: + convert->subset_samples = va_arg(args, uint8_t**); + break; default: ret = -1; } diff --git a/bcftools/convert.c.pysam.c b/bcftools/convert.c.pysam.c index db7b6c1..66373bc 100644 --- a/bcftools/convert.c.pysam.c +++ b/bcftools/convert.c.pysam.c @@ -2,7 +2,7 @@ /* convert.c -- functions for converting between VCF/BCF and related formats. - Copyright (C) 2013-2017 Genome Research Ltd. + Copyright (C) 2013-2018 Genome Research Ltd. Author: Petr Danecek @@ -94,6 +94,7 @@ struct _convert_t int ndat; char *undef_info_tag; int allow_undef_tags; + uint8_t **subset_samples; }; typedef struct @@ -176,6 +177,24 @@ static inline int32_t bcf_array_ivalue(void *bcf_array, int type, int idx) } return ((int32_t*)bcf_array)[idx]; } +static inline void _copy_field(char *src, uint32_t len, int idx, kstring_t *str) +{ + int n = 0, ibeg = 0; + while ( src[ibeg] && ibegibeg ) + kputsn(src+ibeg, iend-ibeg, str); + else + kputc('.', str); +} static void process_info(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) { if ( fmt->id<0 ) @@ -234,6 +253,7 @@ static void process_info(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isamp case BCF_BT_INT16: BRANCH(int16_t, val==bcf_int16_missing, val==bcf_int16_vector_end, kputw(val, str)); break; case BCF_BT_INT32: BRANCH(int32_t, val==bcf_int32_missing, val==bcf_int32_vector_end, kputw(val, str)); break; case BCF_BT_FLOAT: BRANCH(float, bcf_float_is_missing(val), bcf_float_is_vector_end(val), kputd(val, str)); break; + case BCF_BT_CHAR: _copy_field((char*)info->vptr, info->vptr_len, fmt->subscript, str); break; default: fprintf(bcftools_stderr,"todo: type %d\n", info->type); exit(1); break; } #undef BRANCH @@ -290,6 +310,8 @@ static void process_format(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isa else kputw(ival, str); } + else if ( fmt->fmt->type == BCF_BT_CHAR ) + _copy_field((char*)(fmt->fmt->p + isample*fmt->fmt->size), fmt->fmt->size, fmt->subscript, str); else error("TODO: %s:%d .. fmt->type=%d\n", __FILE__,__LINE__, fmt->fmt->type); } else @@ -1314,6 +1336,9 @@ int convert_line(convert_t *convert, bcf1_t *line, kstring_t *str) } for (js=0; jsnsamples; js++) { + // Skip samples when filtering was requested + if ( *convert->subset_samples && !(*convert->subset_samples)[js] ) continue; + // Here comes a hack designed for TBCSQ. When running on large files, // such as 1000GP, there are too many empty fields in the output and // it's very very slow. Therefore in case the handler does not add @@ -1364,6 +1389,9 @@ int convert_set_option(convert_t *convert, enum convert_option opt, ...) case allow_undef_tags: convert->allow_undef_tags = va_arg(args, int); break; + case subset_samples: + convert->subset_samples = va_arg(args, uint8_t**); + break; default: ret = -1; } diff --git a/bcftools/convert.h b/bcftools/convert.h index 3712338..11e892d 100644 --- a/bcftools/convert.h +++ b/bcftools/convert.h @@ -30,7 +30,8 @@ THE SOFTWARE. */ typedef struct _convert_t convert_t; enum convert_option { - allow_undef_tags + allow_undef_tags, + subset_samples, }; convert_t *convert_init(bcf_hdr_t *hdr, int *samples, int nsamples, const char *str); diff --git a/bcftools/csq.c b/bcftools/csq.c index 94ac442..b36ebc3 100644 --- a/bcftools/csq.c +++ b/bcftools/csq.c @@ -1,6 +1,6 @@ /* The MIT License - Copyright (c) 2016 Genome Research Ltd. + Copyright (c) 2016-2018 Genome Research Ltd. Author: Petr Danecek @@ -726,7 +726,7 @@ static inline uint32_t gff_id_parse(id_tbl_t *tbl, const char *line, const char id = tbl->nstr++; hts_expand(char*, tbl->nstr, tbl->mstr, tbl->str); tbl->str[id] = strdup(ss); - int ret = khash_str2int_set(tbl->str2id, tbl->str[id], id); + khash_str2int_set(tbl->str2id, tbl->str[id], id); } *se = tmp; @@ -2713,6 +2713,7 @@ void hap_add_csq(args_t *args, hap_t *hap, hap_node_t *node, int tlen, int ibeg, } } + void hap_finalize(args_t *args, hap_t *hap) { tscript_t *tr = hap->tr; @@ -2784,7 +2785,10 @@ void hap_finalize(args_t *args, hap_t *hap) } dlen += hap->stack[i].node->dlen; if ( hap->stack[i].node->dlen ) indel = 1; - if ( istack[i+1].node->type != HAP_SSS ) { if ( dlen%3 ) // frameshift { @@ -3388,7 +3392,7 @@ int test_cds(args_t *args, bcf1_t *rec) int32_t *gt = args->gt_arr + args->smpl->idx[ismpl]*ngts; if ( gt[0]==bcf_gt_missing ) continue; - if ( ngts>1 && gt[0]!=gt[1] && gt[1]!=bcf_gt_missing && gt[1]!=bcf_int32_vector_end ) + if ( ngts>1 && gt[1]!=bcf_gt_missing && gt[1]!=bcf_int32_vector_end && bcf_gt_allele(gt[0])!=bcf_gt_allele(gt[1]) ) { if ( args->phase==PHASE_MERGE ) { @@ -3397,7 +3401,7 @@ int test_cds(args_t *args, bcf1_t *rec) if ( !bcf_gt_is_phased(gt[0]) && !bcf_gt_is_phased(gt[1]) ) { if ( args->phase==PHASE_REQUIRE ) - error("Unphased genotype at %s:%d, sample %s. See the --phase option.\n", chr,rec->pos+1,args->hdr->samples[args->smpl->idx[ismpl]]); + error("Unphased heterozygous genotype at %s:%d, sample %s. See the --phase option.\n", chr,rec->pos+1,args->hdr->samples[args->smpl->idx[ismpl]]); if ( args->phase==PHASE_SKIP ) continue; if ( args->phase==PHASE_NON_REF ) @@ -3648,6 +3652,7 @@ void process(args_t *args, bcf1_t **rec_ptr) int call_csq = 1; if ( !rec->n_allele ) call_csq = 0; // no alternate allele else if ( rec->n_allele==2 && (rec->d.allele[1][0]=='<' || rec->d.allele[1][0]=='*') ) call_csq = 0; // gVCF, no alt allele + else if ( rec->d.allele[1][0]=='<' && rec->d.allele[1][0]!='*') call_csq = 0; // a symbolic allele, not ready for CNVs etc else if ( args->filter ) { call_csq = filter_test(args->filter, rec, NULL); @@ -3695,12 +3700,12 @@ static const char *usage(void) " -c, --custom-tag use this tag instead of the default BCSQ\n" " -l, --local-csq localized predictions, consider only one VCF record at a time\n" " -n, --ncsq maximum number of consequences to consider per site [16]\n" - " -p, --phase how to construct haplotypes and how to deal with unphased data: [r]\n" + " -p, --phase how to handle unphased heterozygous genotypes: [r]\n" " a: take GTs as is, create haplotypes regardless of phase (0/1 -> 0|1)\n" " m: merge *all* GTs into a single haplotype (0/1 -> 1, 1/2 -> 1)\n" " r: require phased GTs, throw an error on unphased het GTs\n" " R: create non-reference haplotypes if possible (0/1 -> 1|1, 1/2 -> 1|2)\n" - " s: skip unphased GTs\n" + " s: skip unphased hets\n" "Options:\n" " -e, --exclude exclude sites for which the expression is true\n" " -i, --include select sites for which the expression is true\n" diff --git a/bcftools/csq.c.pysam.c b/bcftools/csq.c.pysam.c index 978bd59..e4afa8e 100644 --- a/bcftools/csq.c.pysam.c +++ b/bcftools/csq.c.pysam.c @@ -2,7 +2,7 @@ /* The MIT License - Copyright (c) 2016 Genome Research Ltd. + Copyright (c) 2016-2018 Genome Research Ltd. Author: Petr Danecek @@ -728,7 +728,7 @@ static inline uint32_t gff_id_parse(id_tbl_t *tbl, const char *line, const char id = tbl->nstr++; hts_expand(char*, tbl->nstr, tbl->mstr, tbl->str); tbl->str[id] = strdup(ss); - int ret = khash_str2int_set(tbl->str2id, tbl->str[id], id); + khash_str2int_set(tbl->str2id, tbl->str[id], id); } *se = tmp; @@ -2715,6 +2715,7 @@ void hap_add_csq(args_t *args, hap_t *hap, hap_node_t *node, int tlen, int ibeg, } } + void hap_finalize(args_t *args, hap_t *hap) { tscript_t *tr = hap->tr; @@ -2786,7 +2787,10 @@ void hap_finalize(args_t *args, hap_t *hap) } dlen += hap->stack[i].node->dlen; if ( hap->stack[i].node->dlen ) indel = 1; - if ( istack[i+1].node->type != HAP_SSS ) { if ( dlen%3 ) // frameshift { @@ -3390,7 +3394,7 @@ int test_cds(args_t *args, bcf1_t *rec) int32_t *gt = args->gt_arr + args->smpl->idx[ismpl]*ngts; if ( gt[0]==bcf_gt_missing ) continue; - if ( ngts>1 && gt[0]!=gt[1] && gt[1]!=bcf_gt_missing && gt[1]!=bcf_int32_vector_end ) + if ( ngts>1 && gt[1]!=bcf_gt_missing && gt[1]!=bcf_int32_vector_end && bcf_gt_allele(gt[0])!=bcf_gt_allele(gt[1]) ) { if ( args->phase==PHASE_MERGE ) { @@ -3399,7 +3403,7 @@ int test_cds(args_t *args, bcf1_t *rec) if ( !bcf_gt_is_phased(gt[0]) && !bcf_gt_is_phased(gt[1]) ) { if ( args->phase==PHASE_REQUIRE ) - error("Unphased genotype at %s:%d, sample %s. See the --phase option.\n", chr,rec->pos+1,args->hdr->samples[args->smpl->idx[ismpl]]); + error("Unphased heterozygous genotype at %s:%d, sample %s. See the --phase option.\n", chr,rec->pos+1,args->hdr->samples[args->smpl->idx[ismpl]]); if ( args->phase==PHASE_SKIP ) continue; if ( args->phase==PHASE_NON_REF ) @@ -3650,6 +3654,7 @@ void process(args_t *args, bcf1_t **rec_ptr) int call_csq = 1; if ( !rec->n_allele ) call_csq = 0; // no alternate allele else if ( rec->n_allele==2 && (rec->d.allele[1][0]=='<' || rec->d.allele[1][0]=='*') ) call_csq = 0; // gVCF, no alt allele + else if ( rec->d.allele[1][0]=='<' && rec->d.allele[1][0]!='*') call_csq = 0; // a symbolic allele, not ready for CNVs etc else if ( args->filter ) { call_csq = filter_test(args->filter, rec, NULL); @@ -3697,12 +3702,12 @@ static const char *usage(void) " -c, --custom-tag use this tag instead of the default BCSQ\n" " -l, --local-csq localized predictions, consider only one VCF record at a time\n" " -n, --ncsq maximum number of consequences to consider per site [16]\n" - " -p, --phase how to construct haplotypes and how to deal with unphased data: [r]\n" + " -p, --phase how to handle unphased heterozygous genotypes: [r]\n" " a: take GTs as is, create haplotypes regardless of phase (0/1 -> 0|1)\n" " m: merge *all* GTs into a single haplotype (0/1 -> 1, 1/2 -> 1)\n" " r: require phased GTs, throw an error on unphased het GTs\n" " R: create non-reference haplotypes if possible (0/1 -> 1|1, 1/2 -> 1|2)\n" - " s: skip unphased GTs\n" + " s: skip unphased hets\n" "Options:\n" " -e, --exclude exclude sites for which the expression is true\n" " -i, --include select sites for which the expression is true\n" diff --git a/bcftools/filter.c b/bcftools/filter.c index 3dc91a7..53e8dc7 100644 --- a/bcftools/filter.c +++ b/bcftools/filter.c @@ -1,6 +1,6 @@ /* filter.c -- filter expressions. - Copyright (C) 2013-2015 Genome Research Ltd. + Copyright (C) 2013-2018 Genome Research Ltd. Author: Petr Danecek @@ -67,11 +67,15 @@ typedef struct _token_t char *tag; // for debugging and printout only, VCF tag name double threshold; // filtering threshold int hdr_id, type; // BCF header lookup ID and one of BCF_HT_* types - int idx; // 0-based index to VCF vectors, -1: not a vector, + int idx; // 0-based index to VCF vectors, // -2: list (e.g. [0,1,2] or [1..3] or [1..] or any field[*], which is equivalent to [0..]) - int *idxs, nidxs; // set indexes to 0 to exclude, to 1 to include, and last element negative if unlimited + int *idxs; // set indexes to 0 to exclude, to 1 to include, and last element negative if unlimited + int nidxs, nuidxs; // size of idxs array and the number of elements set to 1 + uint8_t *usmpl; // bitmask of used samples as set by idx + int nsamples; // number of samples for format fields, 0 for info and other fields void (*setter)(filter_t *, bcf1_t *, struct _token_t *); - int (*comparator)(struct _token_t *, struct _token_t *, int op_type, bcf1_t *); + int (*func)(filter_t *, bcf1_t *, struct _token_t *rtok, struct _token_t **stack, int nstack); + void (*comparator)(struct _token_t *, struct _token_t *, struct _token_t *rtok, bcf1_t *); void *hash; // test presence of str value in the hash via comparator regex_t *regex; // precompiled regex for string comparison @@ -81,9 +85,8 @@ typedef struct _token_t int is_str, is_missing; // is_missing is set only for constants, variables are controled via nvalues int pass_site; // -1 not applicable, 0 fails, >0 pass uint8_t *pass_samples; // status of individual samples - int nsamples; // number of samples int nvalues, mvalues; // number of used values: n=0 for missing values, n=1 for scalars, for strings n=str_value.l - int nstr1; // per-sample string length, set only with str_value.l>0 && nsamples>1 + int nval1; // number of per-sample fields or string length } token_t; @@ -126,10 +129,11 @@ struct _filter_t #define TOK_ABS 23 #define TOK_LEN 24 #define TOK_FUNC 25 +#define TOK_CNT 26 -// 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 -// ( ) [ < = > ] ! | & + - * / M m a A O ~ ^ S . l -static int op_prec[] = {0,1,1,5,5,5,5,5,5,2,3, 6, 6, 7, 7, 8, 8, 8, 3, 2, 5, 5, 8, 8, 8}; +// 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 +// ( ) [ < = > ] ! | & + - * / M m a A O ~ ^ S . l +static int op_prec[] = {0,1,1,5,5,5,5,5,5,2,3, 6, 6, 7, 7, 8, 8, 8, 3, 2, 5, 5, 8, 8, 8, 8, 8}; #define TOKEN_STRING "x()[<=>]!|&+-*/MmaAO~^f" static int filters_next_token(char **str, int *len) @@ -156,6 +160,7 @@ static int filters_next_token(char **str, int *len) if ( !strncasecmp(tmp,"AVG(",4) ) { (*str) += 3; return TOK_AVG; } if ( !strncasecmp(tmp,"SUM(",4) ) { (*str) += 3; return TOK_SUM; } if ( !strncasecmp(tmp,"ABS(",4) ) { (*str) += 3; return TOK_ABS; } + if ( !strncasecmp(tmp,"COUNT(",4) ) { (*str) += 5; return TOK_CNT; } if ( !strncasecmp(tmp,"STRLEN(",7) ) { (*str) += 6; return TOK_LEN; } if ( !strncasecmp(tmp,"%MAX(",5) ) { (*str) += 4; return TOK_MAX; } // for backward compatibility if ( !strncasecmp(tmp,"%MIN(",5) ) { (*str) += 4; return TOK_MIN; } // for backward compatibility @@ -249,12 +254,10 @@ static void filters_set_qual(filter_t *flt, bcf1_t *line, token_t *tok) { float *ptr = &line->qual; if ( bcf_float_is_missing(*ptr) ) - tok->nvalues = 0; + bcf_double_set_missing(tok->values[0]); else - { tok->values[0] = (double)line->qual; - tok->nvalues = 1; - } + tok->nvalues = 1; } static void filters_set_type(filter_t *flt, bcf1_t *line, token_t *tok) { @@ -271,7 +274,7 @@ static void filters_set_info(filter_t *flt, bcf1_t *line, token_t *tok) if ( line->d.info[i].key == tok->hdr_id ) break; if ( i==line->n_info ) - tok->nvalues = 0; + tok->nvalues = tok->str_value.l = 0; else if ( line->d.info[i].type==BCF_BT_CHAR ) { int n = line->d.info[i].len; @@ -308,40 +311,51 @@ static void filters_set_info(filter_t *flt, bcf1_t *line, token_t *tok) } } } -static int filters_cmp_bit_and(token_t *atok, token_t *btok, int op_type, bcf1_t *line) +static void filters_cmp_bit_and(token_t *atok, token_t *btok, token_t *rtok, bcf1_t *line) { int a = (int)(atok->nvalues?atok->values[0]:atok->threshold); int b = (int)(btok->nvalues?btok->values[0]:btok->threshold); - if ( op_type==TOK_LIKE ) return a&b ? 1 : 0; - return a&b ? 0 : 1; + if ( rtok->tok_type==TOK_LIKE ) + rtok->pass_site = a&b ? 1 : 0; + else + rtok->pass_site = a&b ? 0 : 1; } -static int filters_cmp_filter(token_t *atok, token_t *btok, int op_type, bcf1_t *line) +static void filters_cmp_filter(token_t *atok, token_t *btok, token_t *rtok, bcf1_t *line) { int i; - if ( op_type==TOK_NE ) // AND logic: none of the filters can match + if ( rtok->tok_type==TOK_NE ) // AND logic: none of the filters can match { if ( !line->d.n_flt ) { - if ( atok->hdr_id==-1 ) return 0; // missing value - return 1; // no filter present, eval to true + if ( atok->hdr_id==-1 ) return; // missing value + rtok->pass_site = 1; + return; // no filter present, eval to true } for (i=0; id.n_flt; i++) - if ( atok->hdr_id==line->d.flt[i] ) return 0; - return 1; + if ( atok->hdr_id==line->d.flt[i] ) return; + rtok->pass_site = 1; + return; } - // TOK_EQ with OR logic: at least one of the filters must match - if ( !line->d.n_flt ) + else if ( rtok->tok_type==TOK_EQ ) // OR logic: at least one of the filters must match { - if ( atok->hdr_id==-1 ) return 1; - return 0; // no filter present, eval to false + if ( !line->d.n_flt ) + { + if ( atok->hdr_id==-1 ) { rtok->pass_site = 1; return; } + return; // no filter present, eval to false + } + for (i=0; id.n_flt; i++) + if ( atok->hdr_id==line->d.flt[i] ) { rtok->pass_site = 1; return; } + return; } - for (i=0; id.n_flt; i++) - if ( atok->hdr_id==line->d.flt[i] ) return 1; - return 0; + else + error("Only == and != operators are supported for FILTER\n"); + return; } -static int filters_cmp_id(token_t *atok, token_t *btok, int op_type, bcf1_t *line) +static void filters_cmp_id(token_t *atok, token_t *btok, token_t *rtok, bcf1_t *line) { // multiple IDs not supported yet (easy to add though) + if ( rtok->tok_type!=TOK_EQ && rtok->tok_type!=TOK_NE ) + error("Only == and != operators are supported for ID\n"); if ( btok->hash ) { @@ -350,12 +364,15 @@ static int filters_cmp_id(token_t *atok, token_t *btok, int op_type, bcf1_t *lin if ( atok->hash ) { int ret = khash_str2int_has_key(atok->hash, line->d.id); - if ( op_type==TOK_EQ ) return ret; - return ret ? 0 : 1; + if ( rtok->tok_type==TOK_NE ) ret = ret ? 0 : 1; + rtok->pass_site = ret; + return; } - if ( op_type==TOK_EQ ) return strcmp(btok->str_value.s,line->d.id) ? 0 : 1; - return strcmp(btok->str_value.s,line->d.id) ? 1 : 0; + if ( rtok->tok_type==TOK_EQ ) + rtok->pass_site = strcmp(btok->str_value.s,line->d.id) ? 0 : 1; + else + rtok->pass_site = strcmp(btok->str_value.s,line->d.id) ? 1 : 0; } /** @@ -431,7 +448,7 @@ static void filters_set_info_int(filter_t *flt, bcf1_t *line, token_t *tok) } else { - int64_t value; + int64_t value = 0; if ( bcf_get_info_value(line,tok->hdr_id,tok->idx,&value) <= 0 ) tok->nvalues = 0; else @@ -510,14 +527,13 @@ static void filters_set_info_string(filter_t *flt, bcf1_t *line, token_t *tok) { flt->tmps.l = 0; ks_resize(&flt->tmps, n); - int i, end = tok->idxs[tok->nidxs-1] < 0 ? n - 1 : tok->nidxs - 1; - if ( end >= n ) end = n - 1; + int i, iend = tok->idxs[tok->nidxs-1] < 0 ? n - 1 : tok->nidxs - 1; + if ( iend >= n ) iend = n - 1; char *beg = tok->str_value.s, *dst = flt->tmps.s; - for (i=0; i<=end; i++) + for (i=0; i<=iend; i++) { char *end = beg; while ( *end && *end!=',' ) end++; - if ( i>=tok->nidxs || tok->idxs[i] ) { memcpy(dst, beg, end - beg); @@ -525,7 +541,6 @@ static void filters_set_info_string(filter_t *flt, bcf1_t *line, token_t *tok) dst[0] = ','; dst++; } - beg = end+1; } dst[0] = 0; @@ -549,190 +564,161 @@ static void filters_set_info_flag(filter_t *flt, bcf1_t *line, token_t *tok) static void filters_set_format_int(filter_t *flt, bcf1_t *line, token_t *tok) { - int i; - if ( (tok->nvalues=bcf_get_format_int32(flt->hdr,line,tok->tag,&flt->tmpi,&flt->mtmpi))<0 ) + if ( line->n_sample != tok->nsamples ) + error("Incorrect number of FORMAT fields at %s:%d .. %s, %d vs %d\n", bcf_seqname(flt->hdr,line),line->pos+1,tok->tag,line->n_sample,tok->nsamples); + + int nvals; + if ( (nvals=bcf_get_format_int32(flt->hdr,line,tok->tag,&flt->tmpi,&flt->mtmpi))<0 ) { - tok->nvalues = tok->nsamples = 0; + tok->nvalues = 0; return; } - if ( tok->idx >= -1 ) // scalar or vector index + int i, nsrc1 = nvals / tok->nsamples; + tok->nval1 = tok->idx >= 0 ? 1 : (tok->nuidxs ? tok->nuidxs : nsrc1); + tok->nvalues = tok->nval1*tok->nsamples; + hts_expand(double, tok->nvalues, tok->mvalues, tok->values); + + if ( tok->idx >= 0 ) // scalar or vector index { - hts_expand(double,flt->nsamples,tok->mvalues,tok->values); - int nvals = tok->nvalues / line->n_sample; - int idx = tok->idx >= 0 ? tok->idx : 0; - int is_missing = 1; - int32_t *ptr = flt->tmpi; - for (i=0; in_sample; i++) + for (i=0; insamples; i++) { - if ( ptr[idx]==bcf_int32_missing || ptr[idx]==bcf_int32_vector_end ) + if ( !tok->usmpl[i] ) continue; + int32_t *ptr = flt->tmpi + i*nsrc1; + if ( tok->idx>=nsrc1 || ptr[tok->idx]==bcf_int32_missing || ptr[tok->idx]==bcf_int32_vector_end ) bcf_double_set_missing(tok->values[i]); else - { - tok->values[i] = ptr[idx]; - is_missing = 0; - } - ptr += nvals; + tok->values[i] = ptr[tok->idx]; } - if ( is_missing ) tok->nvalues = 0; - else tok->nvalues = line->n_sample; - tok->nsamples = tok->nvalues; - return; } - if ( tok->idx == -2 ) + else { - hts_expand(double,tok->nvalues,tok->mvalues,tok->values); - int nvals = tok->nvalues / line->n_sample; - int idx = tok->idx >= 0 ? tok->idx : 0; - int is_missing = 1; - int k, j = 0, end = tok->idxs[tok->nidxs-1] < 0 ? nvals - 1 : tok->nidxs - 1; - if ( end >= nvals ) end = nvals - 1; - int32_t *ptr = flt->tmpi; - for (i=0; in_sample; i++) + int kend = tok->idxs[tok->nidxs-1] < 0 ? tok->nval1 : tok->nidxs; + for (i=0; insamples; i++) { - for (k=0; k<=end; k++) - if ( k>=tok->nidxs || tok->idxs[k] ) - { - if ( ptr[k]==bcf_int32_missing || ptr[k]==bcf_int32_vector_end ) - bcf_double_set_missing(tok->values[j]); - else - { - tok->values[j] = ptr[k]; - is_missing = 0; - } - j++; - } - ptr += nvals; - } - if ( is_missing ) tok->nvalues = tok->nsamples = 0; - else - { - tok->nsamples = line->n_sample; - tok->nvalues = j; + if ( !tok->usmpl[i] ) continue; + int32_t *src = flt->tmpi + i*nsrc1; + double *dst = tok->values + i*tok->nval1; + int k, j = 0; + for (k=0; knidxs && !tok->idxs[k] ) continue; + if ( src[k]==bcf_int32_missing || src[k]==bcf_int32_vector_end ) + bcf_double_set_missing(dst[j]); + else + dst[j] = src[k]; + j++; + } + while (j < tok->nval1) + { + bcf_double_set_missing(dst[j]); + j++; + } } - return; } } static void filters_set_format_float(filter_t *flt, bcf1_t *line, token_t *tok) { - int i; - if ( (tok->nvalues=bcf_get_format_float(flt->hdr,line,tok->tag,&flt->tmpf,&flt->mtmpf))<0 ) + if ( line->n_sample != tok->nsamples ) + error("Incorrect number of FORMAT fields at %s:%d .. %s, %d vs %d\n", bcf_seqname(flt->hdr,line),line->pos+1,tok->tag,line->n_sample,tok->nsamples); + + int nvals; + if ( (nvals=bcf_get_format_float(flt->hdr,line,tok->tag,&flt->tmpf,&flt->mtmpf))<0 ) { - tok->nvalues = tok->nsamples = 0; + tok->nvalues = 0; return; } - if ( tok->idx >= -1 ) // scalar or vector index + int i, nsrc1 = nvals / tok->nsamples; + tok->nval1 = tok->idx >= 0 ? 1 : (tok->nuidxs ? tok->nuidxs : nsrc1); + tok->nvalues = tok->nval1*tok->nsamples; + hts_expand(double, tok->nvalues, tok->mvalues, tok->values); + + if ( tok->idx >= 0 ) // scalar or vector index { - hts_expand(double,flt->nsamples,tok->mvalues,tok->values); - int nvals = tok->nvalues / line->n_sample; - int idx = tok->idx >= 0 ? tok->idx : 0; - int is_missing = 1; - float *ptr = flt->tmpf; - for (i=0; in_sample; i++) + for (i=0; insamples; i++) { - if ( bcf_float_is_missing(ptr[idx]) || bcf_float_is_vector_end(ptr[idx]) ) + if ( !tok->usmpl[i] ) continue; + float *ptr = flt->tmpf + i*nsrc1; + if ( tok->idx>=nsrc1 || bcf_float_is_missing(ptr[tok->idx]) || bcf_float_is_vector_end(ptr[tok->idx]) ) bcf_double_set_missing(tok->values[i]); else - { - tok->values[i] = ptr[idx]; - is_missing = 0; - } - ptr += nvals; + tok->values[i] = ptr[tok->idx]; } - if ( is_missing ) tok->nvalues = 0; - else tok->nvalues = line->n_sample; - tok->nsamples = tok->nvalues; - return; } - if ( tok->idx == -2 ) + else { - hts_expand(double,tok->nvalues,tok->mvalues,tok->values); - int nvals = tok->nvalues / line->n_sample; - int idx = tok->idx >= 0 ? tok->idx : 0; - int is_missing = 1; - int k, j = 0, end = tok->idxs[tok->nidxs-1] < 0 ? nvals - 1 : tok->nidxs - 1; - if ( end >= nvals ) end = nvals - 1; - float *ptr = flt->tmpf; - for (i=0; in_sample; i++) + int kend = tok->idxs[tok->nidxs-1] < 0 ? tok->nval1 : tok->nidxs; + for (i=0; insamples; i++) { - for (k=0; k<=end; k++) - if ( k>=tok->nidxs || tok->idxs[k] ) - { - if ( bcf_float_is_missing(ptr[k]) || bcf_float_is_vector_end(ptr[k]) ) - bcf_double_set_missing(tok->values[j]); - else - { - tok->values[j] = ptr[k]; - is_missing = 0; - } - j++; - } - ptr += nvals; - } - if ( is_missing ) tok->nvalues = tok->nsamples = 0; - else - { - tok->nsamples = line->n_sample; - tok->nvalues = j; + if ( !tok->usmpl[i] ) continue; + float *src = flt->tmpf + i*nsrc1; + double *dst = tok->values + i*tok->nval1; + int k, j = 0; + for (k=0; knidxs && !tok->idxs[k] ) continue; + if ( bcf_float_is_missing(src[k]) || bcf_float_is_vector_end(src[k]) ) + bcf_double_set_missing(dst[j]); + else + dst[j] = src[k]; + j++; + } + while (j < tok->nval1) + { + bcf_double_set_missing(dst[j]); + j++; + } } - return; } } static void filters_set_format_string(filter_t *flt, bcf1_t *line, token_t *tok) { - tok->str_value.l = tok->nvalues = 0; - if ( !line->n_sample ) return; + if ( line->n_sample != tok->nsamples ) + error("Incorrect number of FORMAT fields at %s:%d .. %s, %d vs %d\n", bcf_seqname(flt->hdr,line),line->pos+1,tok->tag,line->n_sample,tok->nsamples); - int ndim = tok->str_value.m; + int i, ndim = tok->str_value.m; int nstr = bcf_get_format_char(flt->hdr, line, tok->tag, &tok->str_value.s, &ndim); tok->str_value.m = ndim; + tok->str_value.l = tok->nvalues = 0; - if ( nstr<=0 ) return; - - if ( tok->idx == -1 || (tok->idx==-2 && tok->idxs[0]==-1) ) // scalar or keep all values of a vector: TAG[*] - { - tok->nsamples = line->n_sample; - tok->nstr1 = ndim / line->n_sample; - tok->nvalues = tok->str_value.l = nstr; - return; - } - - int nstr1 = nstr / line->n_sample; + if ( nstr<0 ) return; - // vector, one or multiple indices - int i; - for (i=0; in_sample; i++) + tok->nvalues = tok->str_value.l = nstr; + tok->nval1 = nstr / tok->nsamples; + for (i=0; insamples; i++) { - char *dst = tok->str_value.s + i*nstr1, *str = dst; - int nval = 0, ibeg = 0; - while ( ibeg < nstr1 ) + if ( !tok->usmpl[i] ) continue; + char *src = tok->str_value.s + i*tok->nval1, *dst = src; + int ibeg = 0, idx = 0; + while ( ibeg < tok->nval1 ) { - int iend = ibeg + 1; - while ( iend < nstr1 && str[iend] && str[iend]!=',' ) iend++; + int iend = ibeg; + while ( iend < tok->nval1 && src[iend] && src[iend]!=',' ) iend++; int keep = 0; - if ( tok->idx >=0 ) - keep = tok->idx==nval ? 1 : 0; - else if ( nval < tok->nidxs ) - keep = tok->idxs[nval] ? 1 : 0; + if ( tok->idx >= 0 ) + { + if ( tok->idx==idx ) keep = 1; + } + else if ( idx < tok->nidxs ) + { + if ( tok->idxs[idx] != 0 ) keep = 1; + } else if ( tok->idxs[tok->nidxs-1] < 0 ) keep = 1; if ( keep ) { - if ( ibeg>0 ) memmove(dst, str+ibeg, iend-ibeg+1); + if ( ibeg!=0 ) memmove(dst, src+ibeg, iend-ibeg+1); dst += iend - ibeg + 1; if ( tok->idx>=0 ) break; } - if ( !str[iend] ) break; + if ( !src[iend] ) break; ibeg = iend + 1; - nval++; + idx++; } - if ( dst==str ) { dst[0] = '.'; dst+=2; } - if ( dst - str < nstr1 ) memset(dst-1, 0, nstr1 - (dst - str)); + if ( dst==src ) { dst[0] = '.'; dst+=2; } + if ( dst - src < tok->nval1 ) memset(dst-1, 0, tok->nval1 - (dst - src)); } - tok->nvalues = tok->str_value.l = nstr; - tok->nstr1 = nstr1; - tok->nsamples = line->n_sample; } static void _filters_set_genotype(filter_t *flt, bcf1_t *line, token_t *tok, int type) { @@ -743,10 +729,10 @@ static void _filters_set_genotype(filter_t *flt, bcf1_t *line, token_t *tok, int return; } - int i,j, nsmpl = bcf_hdr_nsamples(flt->hdr), nvals = type==2 ? 3 : 4; - if ( tok->str_value.m <= nvals*nsmpl ) + int i,j, nsmpl = bcf_hdr_nsamples(flt->hdr), nvals1 = type==2 ? 3 : 4; + if ( tok->str_value.m <= nvals1*nsmpl ) { - tok->str_value.m = nvals*nsmpl + 1; + tok->str_value.m = nvals1*nsmpl + 1; tok->str_value.s = (char*)realloc(tok->str_value.s, tok->str_value.m); } @@ -768,8 +754,15 @@ static void _filters_set_genotype(filter_t *flt, bcf1_t *line, token_t *tok, int if ( bcf_gt_allele(ial)!=bcf_gt_allele(jal) ) is_het = 1; \ } \ } \ - char *dst = &tok->str_value.s[nvals*i]; \ - if ( !j || missing ) dst[0]='.', dst[1]=0; /* ., missing genotype */ \ + char *dst = &tok->str_value.s[nvals1*i]; \ + if ( type==4 ) \ + { \ + if ( !j || missing ) dst[0]='m', dst[1]='i', dst[2]='s', dst[3] = 0; /* mis, missing genotype */ \ + else if ( !has_ref ) dst[0]='a', dst[1]='l', dst[2]='t', dst[3] = 0; /* alt, no ref, must have alt allele */ \ + else if ( !is_het ) dst[0]='r', dst[1]='e', dst[2]='f', dst[3] = 0; /* ref, must be ref-only, no alt alelle */ \ + else dst[0]='a', dst[1]='l', dst[2]='t', dst[3] = 0; /* alt, is het, has alt allele */ \ + } \ + else if ( !j || missing ) dst[0]='.', dst[1]=0; /* ., missing genotype */ \ else if ( type==3 ) \ { \ if ( j==1 ) dst[0]='h', dst[1]='a', dst[2]='p', dst[3] = 0; /* hap, haploid */ \ @@ -803,31 +796,30 @@ static void _filters_set_genotype(filter_t *flt, bcf1_t *line, token_t *tok, int default: error("The GT type is not lineognised: %d at %s:%d\n",fmt->type, bcf_seqname(flt->hdr,line),line->pos+1); break; } #undef BRANCH_INT - tok->nsamples = nsmpl; - tok->nvalues = tok->str_value.l = nvals*nsmpl; + assert( tok->nsamples == nsmpl ); + tok->nvalues = tok->str_value.l = nvals1*nsmpl; tok->str_value.s[tok->str_value.l] = 0; - tok->nstr1 = nvals; + tok->nval1 = nvals1; } static void filters_set_genotype2(filter_t *flt, bcf1_t *line, token_t *tok) { _filters_set_genotype(flt, line, tok, 2); } static void filters_set_genotype3(filter_t *flt, bcf1_t *line, token_t *tok) { _filters_set_genotype(flt, line, tok, 3); } +static void filters_set_genotype4(filter_t *flt, bcf1_t *line, token_t *tok) { _filters_set_genotype(flt, line, tok, 4); } static void filters_set_genotype_string(filter_t *flt, bcf1_t *line, token_t *tok) { bcf_fmt_t *fmt = bcf_get_fmt(flt->hdr, line, "GT"); if ( !fmt ) { - tok->nvalues = tok->nsamples = 0; + tok->nvalues = 0; return; } - int i, blen = 4, nsmpl = bcf_hdr_nsamples(flt->hdr); - kstring_t str; + int i, blen = 4, nsmpl = line->n_sample; gt_length_too_big: tok->str_value.l = 0; for (i=0; istr_value.l; - + size_t plen = tok->str_value.l; bcf_format_gt(fmt, i, &tok->str_value); kputc_(0, &tok->str_value); if ( tok->str_value.l - plen > blen ) @@ -845,9 +837,9 @@ gt_length_too_big: plen++; } } - tok->nsamples = nsmpl; + assert( tok->nsamples == nsmpl ); tok->nvalues = tok->str_value.l; - tok->nstr1 = blen; + tok->nval1 = blen; } static void filters_set_ref_string(filter_t *flt, bcf1_t *line, token_t *tok) { @@ -868,7 +860,7 @@ static void filters_set_alt_string(filter_t *flt, bcf1_t *line, token_t *tok) } else if ( tok->idx==-2 ) { - int i, j = 0, end = tok->idxs[tok->nidxs-1] < 0 ? line->n_allele - 1 : tok->nidxs - 1; + int i, end = tok->nuidxs ? tok->nuidxs : line->n_allele - 1; if ( end >= line->n_allele - 1 ) end = line->n_allele - 2; for (i=0; i<=end; i++) if ( i>=tok->nidxs || tok->idxs[i] ) @@ -998,59 +990,111 @@ static void filters_set_maf(filter_t *flt, bcf1_t *line, token_t *tok) } } -static void set_max(filter_t *flt, bcf1_t *line, token_t *tok) +static int func_max(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack) { + token_t *tok = stack[nstack - 1]; + rtok->nvalues = 0; + if ( !tok->nvalues ) return 1; double val = -HUGE_VAL; - int i; + int i, has_value = 0; for (i=0; invalues; i++) { - if ( !bcf_double_is_missing(tok->values[i]) && val < tok->values[i] ) val = tok->values[i]; + if ( bcf_double_is_missing(tok->values[i]) || bcf_double_is_vector_end(tok->values[i]) ) continue; + has_value = 1; + if ( val < tok->values[i] ) val = tok->values[i]; + } + if ( has_value ) + { + rtok->values[0] = val; + rtok->nvalues = has_value; } - tok->values[0] = val; - tok->nvalues = 1; - tok->nsamples = 0; + return 1; } -static void set_min(filter_t *flt, bcf1_t *line, token_t *tok) +static int func_min(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack) { + token_t *tok = stack[nstack - 1]; + rtok->nvalues = 0; + if ( !tok->nvalues ) return 1; double val = HUGE_VAL; - int i; + int i, has_value = 0; for (i=0; invalues; i++) - if ( !bcf_double_is_missing(tok->values[i]) && val > tok->values[i] ) val = tok->values[i]; - tok->values[0] = val; - tok->nvalues = 1; - tok->nsamples = 0; + { + if ( bcf_double_is_missing(tok->values[i]) || bcf_double_is_vector_end(tok->values[i]) ) continue; + has_value = 1; + if ( val > tok->values[i] ) val = tok->values[i]; + } + if ( has_value ) + { + rtok->values[0] = val; + rtok->nvalues = has_value; + } + return 1; } -static void set_avg(filter_t *flt, bcf1_t *line, token_t *tok) +static int func_avg(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack) { + token_t *tok = stack[nstack - 1]; + rtok->nvalues = 0; + if ( !tok->nvalues ) return 1; double val = 0; int i, n = 0; for (i=0; invalues; i++) if ( !bcf_double_is_missing(tok->values[i]) ) { val += tok->values[i]; n++; } - tok->values[0] = n ? val / n : 0; - tok->nvalues = 1; - tok->nsamples = 0; + if ( n ) + { + rtok->values[0] = val / n; + rtok->nvalues = 1; + } + return 1; } -static void set_sum(filter_t *flt, bcf1_t *line, token_t *tok) +static int func_sum(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack) { + rtok->nvalues = 0; + token_t *tok = stack[nstack - 1]; + if ( !tok->nvalues ) return 1; double val = 0; int i, n = 0; for (i=0; invalues; i++) if ( !bcf_double_is_missing(tok->values[i]) ) { val += tok->values[i]; n++; } - tok->values[0] = val; - tok->nvalues = 1; - tok->nsamples = 0; + if ( n ) + { + rtok->values[0] = val; + rtok->nvalues = 1; + } + return 1; } -static void set_abs(filter_t *flt, bcf1_t *line, token_t *tok) +static int func_abs(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack) { + token_t *tok = stack[nstack - 1]; if ( tok->is_str ) error("ABS() can be applied only on numeric values\n"); + + rtok->nvalues = tok->nvalues; + if ( !tok->nvalues ) return 1; + hts_expand(double, rtok->nvalues, rtok->mvalues, rtok->values); int i; for (i=0; invalues; i++) - tok->values[i] = fabs(tok->values[i]); + if ( bcf_double_is_missing(tok->values[i]) ) bcf_double_set_missing(rtok->values[i]); + else rtok->values[i] = fabs(tok->values[i]); + return 1; +} +static int func_count(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack) +{ + token_t *tok = stack[nstack - 1]; + if ( !tok->nsamples ) error("COUNT() can be applied only on FORMAT fields\n"); + + int i, cnt = 0; + for (i=0; insamples; i++) + if ( tok->pass_samples[i] ) cnt++; + + rtok->nvalues = 1; + rtok->values[0] = cnt; + return 1; } -static void set_strlen(filter_t *flt, bcf1_t *line, token_t *tok) +static int func_strlen(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack) { - tok->is_str = 0; - if ( !tok->str_value.l ) return; + token_t *tok = stack[nstack - 1]; + rtok->nvalues = rtok->str_value.l = 0; + if ( !tok->str_value.l ) return 1; + if ( tok->idx==-2 ) { int i = 0; @@ -1059,202 +1103,193 @@ static void set_strlen(filter_t *flt, bcf1_t *line, token_t *tok) { char *se = ss; while ( *se && *se!=',' ) se++; - hts_expand(double, i+1, tok->mvalues, tok->values); - if ( !*se ) tok->values[i] = strlen(ss); + hts_expand(double, i+1, rtok->mvalues, rtok->values); + if ( !*se ) rtok->values[i] = strlen(ss); else { *se = 0; - tok->values[i] = strlen(ss); + rtok->values[i] = strlen(ss); *se = ','; } ss = *se ? se + 1 : se; i++; } - tok->nvalues = i; + rtok->nvalues = i; } else { - tok->values[0] = strlen(tok->str_value.s); - tok->nvalues = 1; + rtok->values[0] = strlen(tok->str_value.s); + rtok->nvalues = 1; } - tok->str_value.l = 0; + return 1; +} +inline static void tok_init_values(token_t *atok, token_t *btok, token_t *rtok) +{ + token_t *tok = atok->nvalues > btok->nvalues ? atok : btok; + rtok->nvalues = tok->nvalues; + rtok->nval1 = tok->nval1; + hts_expand(double*, rtok->nvalues, rtok->mvalues, rtok->values); } -#define VECTOR_ARITHMETICS(atok,btok,AOP) \ +inline static void tok_init_samples(token_t *atok, token_t *btok, token_t *rtok) +{ + if ( (atok->nsamples || btok->nsamples) && !rtok->nsamples ) + { + rtok->nsamples = atok->nsamples ? atok->nsamples : btok->nsamples; + rtok->usmpl = (uint8_t*) calloc(rtok->nsamples,1); + int i; + for (i=0; insamples; i++) rtok->usmpl[i] |= atok->usmpl[i]; + for (i=0; insamples; i++) rtok->usmpl[i] |= btok->usmpl[i]; + } + memset(rtok->pass_samples, 0, rtok->nsamples); +} + +#define VECTOR_ARITHMETICS(atok,btok,_rtok,AOP) \ { \ + token_t *rtok = _rtok; \ int i, has_values = 0; \ - if ( !(atok)->nvalues || !(btok)->nvalues ) /* missing values */ \ + if ( atok->nvalues && btok->nvalues ) \ { \ - (atok)->nvalues = 0; (atok)->nsamples = 0; \ - } \ - else \ - { \ - if ( ((atok)->nsamples && (btok)->nsamples) || (!(atok)->nsamples && !(btok)->nsamples)) \ - { \ - for (i=0; i<(atok)->nvalues; i++) \ - { \ - if ( bcf_double_is_missing((atok)->values[i]) ) continue; \ - if ( bcf_double_is_missing((btok)->values[i]) ) { bcf_double_set_missing((atok)->values[i]); continue; } \ - has_values = 1; \ - (atok)->values[i] = (atok)->values[i] AOP (btok)->values[i]; \ - } \ - } \ - else if ( (btok)->nsamples ) \ + tok_init_values(atok, btok, rtok); \ + tok_init_samples(atok, btok, rtok); \ + if ( (atok->nsamples && btok->nsamples) || (!atok->nsamples && !btok->nsamples)) \ { \ - hts_expand(double,(btok)->nvalues,(atok)->mvalues,(atok)->values); \ - for (i=0; i<(btok)->nvalues; i++) \ + assert( atok->nsamples==btok->nsamples ); \ + for (i=0; invalues; i++) \ { \ - if ( bcf_double_is_missing((atok)->values[0]) || bcf_double_is_missing((btok)->values[i]) ) \ + if ( bcf_double_is_missing(atok->values[i]) || bcf_double_is_missing(btok->values[i]) ) \ { \ - bcf_double_set_missing((atok)->values[i]); \ + bcf_double_set_missing(rtok->values[i]); \ continue; \ } \ has_values = 1; \ - (atok)->values[i] = (atok)->values[0] AOP (btok)->values[i]; \ + rtok->values[i] = atok->values[i] AOP btok->values[i]; \ } \ - (atok)->nvalues = (btok)->nvalues; \ - (atok)->nsamples = (btok)->nsamples; \ } \ - else if ( (atok)->nsamples ) \ + else \ { \ - for (i=0; i<(atok)->nvalues; i++) \ + token_t *xtok = atok->nsamples ? atok : btok; \ + token_t *ytok = atok->nsamples ? btok : atok; \ + assert( ytok->nvalues==1 ); \ + if ( !bcf_double_is_missing(ytok->values[0]) ) \ { \ - if ( bcf_double_is_missing((atok)->values[i]) || bcf_double_is_missing((btok)->values[0]) ) \ + for (i=0; invalues; i++) \ { \ - bcf_double_set_missing((atok)->values[i]); \ - continue; \ + if ( bcf_double_is_missing(xtok->values[i]) ) \ + { \ + bcf_double_set_missing(rtok->values[i]); \ + continue; \ + } \ + has_values = 1; \ + rtok->values[i] = xtok->values[i] AOP ytok->values[0]; \ } \ - has_values = 1; \ - (atok)->values[i] = (atok)->values[i] AOP (btok)->values[0]; \ } \ } \ } \ - if ( !has_values ) { (atok)->nvalues = 0; (atok)->nsamples = 0; } \ + if ( !has_values ) rtok->nvalues = 0; \ } -static int vector_logic_and(token_t *atok, token_t *btok, int and_type) +static int vector_logic_or(filter_t *filter, bcf1_t *line, token_t *rtok, token_t **stack, int nstack) { - // We are comparing either two scalars (result of INFO tag vs a threshold), two vectors (two FORMAT fields), - // or a vector and a scalar (FORMAT field vs threshold) - int i, pass_site = 0; - if ( !atok->nvalues || !btok->nvalues ) - { - atok->nvalues = atok->nsamples = 0; - return 0; - } - if ( !atok->nsamples && !btok->nsamples ) return atok->pass_site && btok->pass_site; - if ( atok->nsamples && btok->nsamples ) + token_t *atok = stack[nstack-2]; + token_t *btok = stack[nstack-1]; + tok_init_samples(atok, btok, rtok); + + if ( !atok->pass_site && !btok->pass_site ) return 2; + + rtok->pass_site = 1; + if ( !atok->nsamples && !btok->nsamples ) return 2; + + int i; + if ( rtok->tok_type==TOK_OR_VEC ) // ||, select all samples if one is true { - if ( and_type==TOK_AND ) + if ( (!atok->nsamples && !atok->pass_site) || (!btok->nsamples && !btok->pass_site) ) { - // perform AND within a sample - for (i=0; insamples; i++) + // These two conditions are to ensure the following does not set all samples + // at sites with QUAL<=30: + // QUAL>30 || FMT/GQ>30 + + token_t *tok = atok->nsamples ? atok : btok; + for (i=0; insamples; i++) { - atok->pass_samples[i] = atok->pass_samples[i] && btok->pass_samples[i]; - if ( !pass_site && atok->pass_samples[i] ) pass_site = 1; + if ( !rtok->usmpl[i] ) continue; + rtok->pass_samples[i] = tok->pass_samples[i]; } } else { - // perform AND across samples - int pass_a = 0, pass_b = 0; - for (i=0; insamples; i++) - { - if ( atok->pass_samples[i] ) pass_a = 1; - atok->pass_samples[i] = atok->pass_samples[i] && btok->pass_samples[i]; - } - for (i=0; insamples; i++) + for (i=0; insamples; i++) { - if ( btok->pass_samples[i] ) { pass_b = 1; break; } + if ( !rtok->usmpl[i] ) continue; + rtok->pass_samples[i] = 1; } - pass_site = pass_a && pass_b; } - return pass_site; + return 2; } - if ( btok->nsamples ) + + // |, only select samples which are actually true + + if ( !atok->nsamples || !btok->nsamples ) { - for (i=0; insamples; i++) + token_t *tok = atok->nsamples ? atok : btok; + for (i=0; insamples; i++) { - atok->pass_samples[i] = atok->pass_site && btok->pass_samples[i]; - if ( !pass_site && atok->pass_samples[i] ) pass_site = 1; + if ( !rtok->usmpl[i] ) continue; + rtok->pass_samples[i] = tok->pass_samples[i]; } - atok->nsamples = btok->nsamples; - return pass_site; + return 2; } - /* atok->nsamples!=0 */ - for (i=0; insamples; i++) + + assert( atok->nsamples==btok->nsamples ); + + for (i=0; insamples; i++) { - atok->pass_samples[i] = atok->pass_samples[i] && btok->pass_site; - if ( !pass_site && atok->pass_samples[i] ) pass_site = 1; + if ( !rtok->usmpl[i] ) continue; + rtok->pass_samples[i] = atok->pass_samples[i] | btok->pass_samples[i]; } - return pass_site; + return 2; } -static int vector_logic_or(token_t *atok, token_t *btok, int or_type) +static int vector_logic_and(filter_t *filter, bcf1_t *line, token_t *rtok, token_t **stack, int nstack) { - int i, pass_site = 0; - if ( !atok->nvalues && !btok->nvalues ) // missing sites in both - { - atok->nvalues = atok->nsamples = 0; - return 0; - } - if ( !atok->nvalues ) // missing value in a - { - for (i=0; insamples; i++) - atok->pass_samples[i] = btok->pass_samples[i]; - atok->nsamples = btok->nsamples; - atok->nvalues = 1; - return btok->pass_site; - } - if ( !btok->nvalues ) // missing value in b - { - btok->nvalues = 1; - return atok->pass_site; - } + token_t *atok = stack[nstack-2]; + token_t *btok = stack[nstack-1]; + tok_init_samples(atok, btok, rtok); - if ( !atok->nsamples && !btok->nsamples ) return atok->pass_site || btok->pass_site; - if ( !atok->nsamples ) + if ( !atok->pass_site || !btok->pass_site ) return 2; + if ( !atok->nsamples && !btok->nsamples ) { rtok->pass_site = 1; return 2; } + + int i; + if ( !atok->nsamples || !btok->nsamples ) { - if ( or_type==TOK_OR ) + token_t *tok = atok->nsamples ? atok : btok; + for (i=0; insamples; i++) { - for (i=0; insamples; i++) - { - atok->pass_samples[i] = btok->pass_samples[i]; - if ( atok->pass_site || atok->pass_samples[i] ) pass_site = 1; - } + if ( !rtok->usmpl[i] ) continue; + rtok->pass_samples[i] = tok->pass_samples[i]; } - else - { - for (i=0; insamples; i++) - { - atok->pass_samples[i] = atok->pass_site || btok->pass_samples[i]; - if ( atok->pass_samples[i] ) pass_site = 1; - } - } - atok->nsamples = btok->nsamples; - return pass_site; + rtok->pass_site = 1; + return 2; } - if ( !btok->nsamples ) // vector vs site + + assert( atok->nsamples==btok->nsamples ); + if ( rtok->tok_type==TOK_AND_VEC ) // &&, can be true in different samples { - if ( or_type==TOK_OR ) + for (i=0; insamples; i++) { - for (i=0; insamples; i++) - if ( btok->pass_site || atok->pass_samples[i] ) pass_site = 1; + if ( !rtok->usmpl[i] ) continue; + rtok->pass_samples[i] = atok->pass_samples[i] | btok->pass_samples[i]; } - else - { - for (i=0; insamples; i++) - { - atok->pass_samples[i] = atok->pass_samples[i] || btok->pass_site; - if ( atok->pass_samples[i] ) pass_site = 1; - } - } - return pass_site; + rtok->pass_site = 1; } - for (i=0; insamples; i++) + else // &, must be true within one sample { - atok->pass_samples[i] = atok->pass_samples[i] || btok->pass_samples[i]; - if ( !pass_site && atok->pass_samples[i] ) pass_site = 1; + for (i=0; insamples; i++) + { + if ( !rtok->usmpl[i] ) continue; + rtok->pass_samples[i] = atok->pass_samples[i] & btok->pass_samples[i]; + if ( rtok->pass_samples[i] ) rtok->pass_site = 1; + } } - return pass_site; + return 2; } #define CMP_MISSING(atok,btok,CMP_OP,ret) \ @@ -1265,224 +1300,394 @@ static int vector_logic_or(token_t *atok, token_t *btok, int or_type) tok->nvalues = 1; \ } -#define CMP_VECTORS(atok,btok,CMP_OP,ret) \ +#define CMP_VECTORS(atok,btok,_rtok,CMP_OP,missing_logic) \ { \ - int i, j, has_values = 0, pass_site = 0; \ - if ( !(atok)->nvalues || !(btok)->nvalues ) { (atok)->nvalues = 0; (atok)->nsamples = 0; (ret) = 0; } \ - else \ + token_t *rtok = _rtok; \ + int i, j, k; \ + assert( !atok->nsamples || !btok->nsamples ); \ + tok_init_samples(atok, btok, rtok); \ + if ( !atok->nsamples && !btok->nsamples ) \ { \ - if ( (atok)->idx<=-2 || (btok)->idx<=-2 ) \ + if ( !atok->nvalues && !btok->nvalues ) { rtok->pass_site = missing_logic[2]; } \ + else if ( !atok->nvalues || !btok->nvalues ) \ { \ - /* any field can match: [*] */ \ - for (i=0; i<(atok)->nvalues; i++) \ + token_t *tok = atok->nvalues ? atok : btok; \ + for (j=0; jnvalues; j++) \ { \ - for (j=0; j<(btok)->nvalues; j++) \ - if ( (atok)->values[i] CMP_OP (btok)->values[j] ) { pass_site = 1; i = (atok)->nvalues; break; } \ + if ( bcf_double_is_missing(tok->values[j]) ) \ + { \ + if ( missing_logic[2] ) { rtok->pass_site = 1; break; } \ + } \ + else if ( missing_logic[1] ) { rtok->pass_site = 1; break; } \ } \ } \ - else if ( (atok)->nsamples && (btok)->nsamples ) \ + else \ { \ - for (i=0; i<(atok)->nsamples; i++) \ + for (i=0; invalues; i++) \ { \ - has_values = 1; \ - if ( (atok)->values[i] CMP_OP (btok)->values[i] ) { (atok)->pass_samples[i] = 1; pass_site = 1; } \ - else (atok)->pass_samples[i] = 0; \ + int amiss = bcf_double_is_missing(atok->values[i]) ? 1 : 0; \ + for (j=0; jnvalues; j++) \ + { \ + int nmiss = amiss + (bcf_double_is_missing(btok->values[j]) ? 1 : 0); \ + if ( nmiss ) \ + { \ + if ( missing_logic[nmiss] ) { rtok->pass_site = 1; i = atok->nvalues; break; } \ + } \ + else if ( atok->values[i] CMP_OP btok->values[j] ) { rtok->pass_site = 1; i = atok->nvalues; break; } \ + } \ } \ - if ( !has_values ) (atok)->nvalues = 0; \ } \ - else if ( (atok)->nsamples ) \ + } \ + else \ + { \ + if ( !atok->nvalues && !btok->nvalues ) \ { \ - for (i=0; i<(atok)->nsamples; i++) \ + if ( missing_logic[2] ) \ { \ - /*if ( bcf_double_is_missing((atok)->values[i]) ) { (atok)->pass_samples[i] = 0; continue; }*/ \ - has_values = 1; \ - if ( (atok)->values[i] CMP_OP (btok)->values[0] ) { (atok)->pass_samples[i] = 1; pass_site = 1; } \ - else (atok)->pass_samples[i] = 0; \ + for (i=0; insamples; i++) \ + if ( rtok->usmpl[i] ) { rtok->pass_samples[i] = missing_logic[2]; rtok->pass_site = 1; } \ } \ - if ( !has_values ) (atok)->nvalues = 0; \ } \ - else if ( (btok)->nsamples ) \ + else if ( !atok->nvalues || !btok->nvalues ) \ { \ - for (i=0; i<(btok)->nsamples; i++) \ + token_t *tok = atok->nvalues ? atok : btok; \ + if ( !tok->nsamples ) \ { \ - if ( bcf_double_is_missing((btok)->values[i]) ) { (atok)->pass_samples[i] = 0; continue; } \ - has_values = 1; \ - if ( (atok)->values[0] CMP_OP (btok)->values[i] ) { (atok)->pass_samples[i] = 1; pass_site = 1; } \ - else (atok)->pass_samples[i] = 0; \ + int miss = 0; \ + for (j=0; jnvalues; j++) \ + miss |= bcf_double_is_missing(tok->values[j]) ? 1 : 0; \ + if ( missing_logic[++miss] ) \ + { \ + for (i=0; insamples; i++) \ + if ( rtok->usmpl[i] ) { rtok->pass_samples[i] = missing_logic[miss]; rtok->pass_site = 1; } \ + } \ } \ - (atok)->nvalues = (btok)->nvalues; \ - (atok)->nsamples = (btok)->nsamples; \ - if ( !has_values ) (atok)->nvalues = 0; \ + else \ + for (i=0; insamples; i++) \ + { \ + if ( !rtok->usmpl[i] ) continue; \ + double *ptr = tok->values + i*tok->nval1; \ + int miss = 0; \ + for (j=0; jnval1; j++) \ + miss |= bcf_double_is_missing(ptr[j]) ? 1 : 0; \ + if ( missing_logic[++miss] ) { rtok->pass_samples[i] = missing_logic[miss]; rtok->pass_site = 1; } \ + } \ } \ else \ { \ - if ( (atok)->values[0] CMP_OP (btok)->values[0] ) { pass_site = 1; } \ + token_t *xtok = atok->nsamples ? atok : btok; \ + token_t *ytok = atok->nsamples ? btok : atok; \ + for (i=0; insamples; i++) \ + { \ + if ( !rtok->usmpl[i] ) continue; \ + double *xptr = xtok->values + i*xtok->nval1; \ + double *yptr = ytok->values + i*ytok->nval1; \ + for (j=0; jnval1; j++) \ + { \ + int miss = bcf_double_is_missing(xptr[j]) ? 1 : 0; \ + if ( miss && !missing_logic[0] ) continue; /* any is missing => result is false */ \ + for (k=0; knvalues; k++) \ + { \ + int nmiss = miss + (bcf_double_is_missing(yptr[k]) ? 1 : 0); \ + if ( nmiss ) \ + { \ + if ( missing_logic[nmiss] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; j = xtok->nval1; break; } \ + } \ + else if ( xptr[j] CMP_OP yptr[k] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; j = xtok->nval1; break; } \ + } \ + } \ + } \ } \ - /*fprintf(stderr,"pass=%d\n", pass_site);*/ \ - (ret) = pass_site; \ } \ } -static int cmp_vector_strings(token_t *atok, token_t *btok, int logic) // logic: TOK_EQ or TOK_NE +static int _regex_vector_strings(regex_t *regex, char *str, size_t len, int logic, int *missing_logic) { - if ( !atok->str_value.l ) { return 0; } - if ( !btok->str_value.l ) { atok->str_value.l = 0; return 0; } - int i, pass_site = 0; - if ( atok->nsamples && atok->nsamples==btok->nsamples ) - { - for (i=0; insamples; i++) - { - char *astr = atok->str_value.s + i*atok->nstr1; - char *bstr = btok->str_value.s + i*btok->nstr1; - char *aend = astr + atok->str_value.l, *a = astr; - while ( astr_value.l, *b = bstr; - while ( bpass_samples[i] = 0; - else atok->pass_samples[i] = strncmp(astr,bstr,a-astr)==0 ? 1 : 0; - if ( logic!=TOK_EQ ) - atok->pass_samples[i] = atok->pass_samples[i] ? 0 : 1; - pass_site |= atok->pass_samples[i]; - } - if ( !atok->nsamples ) atok->nsamples = btok->nsamples; - } - else if ( !atok->nsamples && !btok->nsamples ) - { - if ( atok->idx==-2 || btok->idx==-2 ) - { - // any field can match: [*] - if ( atok->idx==-2 && btok->idx==-2 ) - error("fixme: Expected at least one scalar value [%s %s %s]\n", atok->tag ? atok->tag : btok->tag, atok->str_value,btok->str_value); - token_t *xtok, *ytok; // xtok is scalar, ytok array - if ( btok->idx==-2 ) { xtok = atok; ytok = btok; } - else { xtok = btok; ytok = atok; } - char *xstr = xtok->str_value.s, *xend = xstr + xtok->str_value.l; - char *ystr = ytok->str_value.s, *yend = ystr + ytok->str_value.l, *y = ystr; - while ( y<=yend ) - { - if ( y==yend || *y==',' ) - { - if ( y-ystr==xend-xstr && !strncmp(xstr,ystr,xend-xstr) ) - { - pass_site = 1; - break; - } - ystr = y+1; - } - y++; - } - } - else - pass_site = strcmp(atok->str_value.s,btok->str_value.s) ? 0 : 1; - if ( logic!=TOK_EQ ) pass_site = pass_site ? 0 : 1; + char *end = str + len; + while ( str < end && *str ) + { + char *mid = str; + while ( mid < end && *mid && *mid!=',' ) mid++; + int miss = mid - str == 1 && str[0]=='.' ? 1 : 0; + if ( miss && missing_logic[miss] ) return 1; + char tmp = *mid; *mid = 0; + int match = regexec(regex, str, 0,NULL,0) ? 0 : 1; + *mid = tmp; + if ( logic==TOK_NLIKE ) match = match ? 0 : 1; + if ( match ) return 1; + if ( !*mid ) break; + str = mid + 1; } - else + return 0; +} +static inline int _has_missing_string(char *beg) +{ + while ( *beg ) { - token_t *xtok, *ytok; - if ( !atok->nsamples ) { xtok = atok; ytok = btok; } - else { xtok = btok; ytok = atok; } - char *xstr = xtok->str_value.s; - char *xend = xstr + xtok->str_value.l, *x = xstr; - while ( xnsamples; i++) - { - char *ystr = ytok->str_value.s + i*ytok->nstr1; - char *ybeg = ystr, *yend = ystr + ytok->nstr1; - int pass = 0; - while ( ybeg < yend ) + char *end = beg; + while ( *end && *end!=',' ) end++; + if ( end-beg==1 && beg[0]=='.' ) return 1; + if ( !*end ) break; + beg = end + 1; + } + return 0; +} + +// Compare two strings with multiple fields, for example "A,B,.,C"=="X,Y,A". +// Returns 1 if any field matches, otherwise returns 0 +static inline int _match_vector_strings(char *abeg, size_t alen, char *bstr, size_t blen, int logic, int *missing_logic) +{ + char *aend = abeg + alen; + char *bend = bstr + blen; + while ( abeg < aend && *abeg ) + { + char *amid = abeg; + while ( amid < aend && *amid && *amid!=',' ) amid++; + int miss = amid - abeg == 1 && abeg[0]=='.' ? 1 : 0; + char *bbeg = bstr; + while ( bbeg < bend && *bbeg ) + { + char *bmid = bbeg; + while ( bmid < bend && *bmid && *bmid!=',' ) bmid++; + int nmiss = miss + (bmid - bbeg == 1 && bbeg[0]=='.' ? 1 : 0); + if ( nmiss ) { - char *y = ybeg; - while ( ypass_samples[i] = pass; - pass_site |= pass; + else + { + int match = amid-abeg==bmid-bbeg && !strncmp(abeg,bbeg,amid-abeg) ? 1 : 0; + if ( logic==TOK_NE ) match = match==1 ? 0 : 1; + if ( match ) return 1; + } + if ( !*bmid ) break; + bbeg = bmid + 1; } - if ( !atok->nsamples ) - atok->nvalues = atok->nsamples = btok->nsamples; // is it a bug? not sure if atok->nvalues should be set + if ( !*amid ) break; + abeg = amid + 1; } - return pass_site; + return 0; } -static int regex_vector_strings(token_t *atok, token_t *btok, int negate) +static void cmp_vector_strings(token_t *atok, token_t *btok, token_t *rtok) { - int i, pass_site = 0; - if ( atok->nsamples ) + tok_init_samples(atok, btok, rtok); + + int i, logic = rtok->tok_type; // TOK_EQ, TOK_NE, TOK_LIKE, TOK_NLIKE + regex_t *regex = atok->regex ? atok->regex : (btok->regex ? btok->regex : NULL); + + assert( atok->nvalues==atok->str_value.l && btok->nvalues==btok->str_value.l ); + assert( !atok->nsamples || !btok->nsamples ); + assert( (!regex && (logic==TOK_EQ || logic==TOK_NE)) || (regex && (logic==TOK_LIKE || logic==TOK_NLIKE)) ); + + int missing_logic[] = {0,0,0}; + if ( logic==TOK_EQ || logic==TOK_LIKE ) missing_logic[0] = missing_logic[2] = 1; + else if ( logic==TOK_NE || logic==TOK_NLIKE ) missing_logic[0] = missing_logic[1] = 1; + + if ( !atok->nsamples && !btok->nsamples ) + { + if ( !atok->nvalues && !btok->nvalues ) { rtok->pass_site = missing_logic[2]; return; } + if ( !atok->nvalues || !btok->nvalues ) + { + int miss = _has_missing_string(atok->nvalues ? atok->str_value.s : btok->str_value.s); + if ( missing_logic[miss+1] ) rtok->pass_site = 1; + return; + } + if ( !regex ) + rtok->pass_site = _match_vector_strings(atok->str_value.s, atok->str_value.l, btok->str_value.s, btok->str_value.l, logic, missing_logic); + else + { + token_t *tok = atok->regex ? btok : atok; + rtok->pass_site = _regex_vector_strings(regex, tok->str_value.s, tok->str_value.l, logic, missing_logic); + } + return; + } + + // The case of (!atok->nsamples || !btok->nsamples) + + if ( !atok->nvalues && !btok->nvalues ) { - for (i=0; insamples; i++) + if ( missing_logic[2] ) { - char *ptr = atok->str_value.s + i*atok->nstr1; - atok->pass_samples[i] = regexec(btok->regex, ptr, 0,NULL,0) ? 0 : 1; - if ( negate ) atok->pass_samples[i] = atok->pass_samples[i] ? 0 : 1; - pass_site |= atok->pass_samples[i]; + for (i=0; insamples; i++) + if ( rtok->usmpl[i] ) { rtok->pass_samples[i] = missing_logic[2]; rtok->pass_site = 1; } } - return pass_site; + return; + } + if ( !atok->nvalues || !btok->nvalues ) + { + token_t *tok = atok->nvalues ? atok : btok; + if ( !tok->nsamples ) + { + int miss = _has_missing_string(tok->str_value.s); + if ( !missing_logic[miss+1] ) return; + for (i=0; insamples; i++) + if ( rtok->usmpl[i] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; } + } + else + for (i=0; insamples; i++) + { + if ( !rtok->usmpl[i] ) continue; + int miss = _has_missing_string(tok->str_value.s + i*tok->nval1); + if ( missing_logic[miss+1] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; } + } + return; + } + + // The case of (!atok->nsamples || !btok->nsamples) && (atok->nvalues && btok->nvalues) + token_t *xtok = atok->nsamples ? atok : btok; + token_t *ytok = atok->nsamples ? btok : atok; + assert( regex==ytok->regex ); + for (i=0; insamples; i++) + { + if ( !rtok->usmpl[i] ) continue; + int match; + if ( regex ) + match = _regex_vector_strings(regex, xtok->str_value.s + i*xtok->nval1, xtok->nval1, logic, missing_logic); + else + match = _match_vector_strings(xtok->str_value.s + i*xtok->nval1, xtok->nval1, ytok->str_value.s, ytok->str_value.l, logic, missing_logic); + if ( match ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; } } - pass_site = regexec(btok->regex, atok->str_value.s, 0,NULL,0) ? 0 : 1; - if ( negate ) pass_site = pass_site ? 0 : 1; - return pass_site; } -static void parse_tag_idx(char *tag, char *tag_idx, token_t *tok) // tag_idx points just after "TAG[" +static int parse_idxs(char *tag_idx, int **idxs, int *nidxs, int *idx) { - // TAG[*] .. any field - if ( !strncmp("*]", tag_idx, 3) ) + // TAG[], TAG[*] .. any field; sets idx=-2, idxs[0]=-1 + if ( *tag_idx==0 || !strcmp("*", tag_idx) ) { - tok->idxs = (int*) malloc(sizeof(int)); - tok->idxs[0] = -1; - tok->nidxs = 1; - tok->idx = -2; - return; + *idxs = (int*) malloc(sizeof(int)); + (*idxs)[0] = -1; + *nidxs = 1; + *idx = -2; + return 0; } - // TAG[integer] .. one field + // TAG[integer] .. one field; idx positive char *end, *beg = tag_idx; - tok->idx = strtol(tag_idx, &end, 10); - if ( tok->idx >= 0 && *end==']' ) return; + *idx = strtol(tag_idx, &end, 10); + if ( *idx >= 0 && *end==0 ) return 0; - - // TAG[0,1] or TAG[0-2] or [1-] etc + // TAG[0,1] or TAG[0-2] or [1-] etc; idx=-2, idxs[...]=0,0,1,1,.. int i, ibeg = -1; - while ( *beg && *beg!=']' ) + while ( *beg ) { - int idx = strtol(beg, &end, 10); + int num = strtol(beg, &end, 10); if ( end[0]==',' ) beg = end + 1; - else if ( end[0]==']' ) beg = end; - else if ( end[0]=='-' ) { beg = end + 1; ibeg = idx; continue; } - else error("Could not parse the index: %s[%s\n", tag, tag_idx+1); - if ( idx >= tok->nidxs ) + else if ( end[0]==0 ) beg = end; + else if ( end[0]=='-' ) { beg = end + 1; ibeg = num; continue; } + else return -1; + if ( num >= *nidxs ) { - tok->idxs = (int*) realloc(tok->idxs, sizeof(int)*(idx+1)); - memset(tok->idxs + tok->nidxs, 0, sizeof(int)*(idx - tok->nidxs + 1)); - tok->nidxs = idx + 1; + *idxs = (int*) realloc(*idxs, sizeof(int)*(num+1)); + memset(*idxs + *nidxs, 0, sizeof(int)*(num - *nidxs + 1)); + *nidxs = num + 1; } if ( ibeg>=0 ) { - for (i=ibeg; i<=idx; i++) tok->idxs[i] = 1; + for (i=ibeg; i<=num; i++) (*idxs)[i] = 1; ibeg = -1; } - tok->idxs[idx] = 1; + (*idxs)[num] = 1; } if ( ibeg >=0 ) { - if ( ibeg >= tok->nidxs ) + if ( ibeg >= *nidxs ) { - tok->idxs = (int*) realloc(tok->idxs, sizeof(int)*(ibeg+1)); - memset(tok->idxs + tok->nidxs, 0, sizeof(int)*(ibeg - tok->nidxs + 1)); - tok->nidxs = ibeg + 1; + *idxs = (int*) realloc(*idxs, sizeof(int)*(ibeg+1)); + memset(*idxs + *nidxs, 0, sizeof(int)*(ibeg - *nidxs + 1)); + *nidxs = ibeg + 1; + } + (*idxs)[ibeg] = -1; + } + *idx = -2; + return 0; +} + +static void parse_tag_idx(bcf_hdr_t *hdr, int is_fmt, char *tag, char *tag_idx, token_t *tok) // tag_idx points just after "TAG[" +{ + int i, len = strlen(tag_idx); + if ( tag_idx[len-1] == ']' ) tag_idx[len-1] = 0; + char *ori = strdup(tag_idx); + + assert( !tok->idxs && !tok->usmpl ); + int *idxs1 = NULL, nidxs1 = 0, idx1 = 0; + int *idxs2 = NULL, nidxs2 = 0, idx2 = 0; + + int set_samples = 0; + char *colon = index(tag_idx, ':'); + if ( colon ) + { + *colon = 0; + if ( parse_idxs(tag_idx, &idxs1, &nidxs1, &idx1) != 0 ) error("Could not parse the index: %s\n", ori); + if ( parse_idxs(colon+1, &idxs2, &nidxs2, &idx2) != 0 ) error("Could not parse the index: %s\n", ori); + tok->idxs = idxs2; + tok->nidxs = nidxs2; + tok->idx = idx2; + set_samples = 1; + } + else + { + if ( parse_idxs(tag_idx, &idxs1, &nidxs1, &idx1) != 0 ) error("Could not parse the index: %s\n", ori); + if ( is_fmt ) + { + if ( nidxs1==1 && idxs1[0]==-1 ) + { + tok->idxs = (int*) malloc(sizeof(int)); + tok->idxs[0] = -1; + tok->nidxs = 1; + tok->idx = -2; + } + else if ( bcf_hdr_id2number(hdr,BCF_HL_FMT,tok->hdr_id)!=1 ) + error("The FORMAT tag %s can have multiple subfields, run as %s[sample:subfield]\n", tag,tag); + else + tok->idx = 0; + set_samples = 1; + } + else + { + tok->idxs = idxs1; + tok->nidxs = nidxs1; + tok->idx = idx1; + } + } + + if ( set_samples ) + { + tok->nsamples = bcf_hdr_nsamples(hdr); + tok->usmpl = (uint8_t*) calloc(tok->nsamples,1); + if ( idx1>=0 ) + { + if ( idx1 >= bcf_hdr_nsamples(hdr) ) error("The sample index is too large: %s\n", ori); + tok->usmpl[idx1] = 1; + } + else if ( idx1==-2 ) + { + for (i=0; i= bcf_hdr_nsamples(hdr) ) error("The sample index is too large: %s\n", ori); + tok->usmpl[i] = 1; + } + if ( nidxs1 && idxs1[nidxs1-1]==-1 ) // open range, such as "7-" + { + for (; insamples; i++) tok->usmpl[i] = 1; + } } - tok->idxs[ibeg] = -1; + else error("todo: %s:%d .. %d\n", __FILE__,__LINE__, idx2); + free(idxs1); + } + free(ori); + + if ( tok->nidxs && tok->idxs[tok->nidxs-1]!=-1 ) + { + for (i=0; inidxs; i++) if ( tok->idxs[i] ) tok->nuidxs++; } - tok->idx = -2; } static int filters_init1(filter_t *filter, char *str, int len, token_t *tok) { tok->tok_type = TOK_VAL; tok->hdr_id = -1; tok->pass_site = -1; - tok->idx = -1; + tok->idx = 0; // is this a string constant? if ( str[0]=='"' || str[0]=='\'' ) @@ -1577,6 +1782,10 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok) tok->setter = &filters_set_alt_string; tok->is_str = 1; tok->tag = strdup("ALT"); + tok->idxs = (int*) malloc(sizeof(int)); + tok->idxs[0] = -1; + tok->nidxs = 1; + tok->idx = -2; return 0; } else if ( !strncasecmp(str,"N_ALT",len) ) @@ -1614,8 +1823,6 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok) int i; for (i=0; ihdr_id = bcf_hdr_id2int(filter->hdr,BCF_DT_ID,tmp.s); if ( is_fmt==-1 ) @@ -1627,6 +1834,16 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok) } if ( is_fmt==-1 ) is_fmt = 0; } + if ( is_array ) + parse_tag_idx(filter->hdr, is_fmt, tmp.s, tmp.s+is_array, tok); + else if ( is_fmt && !tok->nsamples ) + { + int i; + tok->nsamples = bcf_hdr_nsamples(filter->hdr); + tok->usmpl = (uint8_t*) malloc(tok->nsamples); + for (i=0; insamples; i++) tok->usmpl[i] = 1; + } + tok->type = is_fmt ? BCF_HL_FMT : BCF_HL_INFO; if ( is_fmt ) filter->max_unpack |= BCF_UN_FMT; if ( tok->hdr_id>=0 ) @@ -1640,7 +1857,12 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok) if ( !bcf_hdr_idinfo_exists(filter->hdr,BCF_HL_FMT,tok->hdr_id) ) error("No such FORMAT field: %s\n", tmp.s); if ( bcf_hdr_id2number(filter->hdr,BCF_HL_FMT,tok->hdr_id)!=1 && !is_array ) - error("Error: FORMAT vectors must be subscripted, e.g. %s[0] or %s[*]\n", tmp.s, tmp.s); + { + tok->idxs = (int*) malloc(sizeof(int)); + tok->idxs[0] = -1; + tok->nidxs = 1; + tok->idx = -2; + } switch ( bcf_hdr_id2type(filter->hdr,BCF_HL_FMT,tok->hdr_id) ) { case BCF_HT_INT: tok->setter = &filters_set_format_int; break; @@ -1738,7 +1960,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok) { errno = 0; tok->threshold = strtof(tmp.s, &end); // float? - if ( errno!=0 || end!=tmp.s+len ) error("[%s:%d %s] Error: the tag \"INFO/%s\" is not defined in the VCF header\n", __FILE__,__LINE__,__FUNCTION__,tmp.s); + if ( errno!=0 || end!=tmp.s+len ) error("[%s:%d %s] Error: the tag \"%s\" is not defined in the VCF header\n", __FILE__,__LINE__,__FUNCTION__,tmp.s); } if ( tmp.s ) free(tmp.s); @@ -1874,16 +2096,26 @@ filter_t *filter_init(bcf_hdr_t *hdr, const char *str) int i; for (i=0; i0 && out[j].is_str && out[j].key && !strcmp(".",out[j].key) ) + if ( out[j].is_str && out[j].key && !strcmp(".",out[j].key) ) { - int type = bcf_hdr_id2type(filter->hdr,out[k].type,out[k].hdr_id); - if ( type==BCF_HT_INT ) { out[j].is_str = 0; out[j].is_missing = 1; bcf_double_set_missing(out[j].values[0]); } - if ( type==BCF_HT_REAL ) { out[j].is_str = 0; out[j].is_missing = 1; bcf_double_set_missing(out[j].values[0]); } + int set_missing = 0; + if ( out[k].hdr_id>0 ) + { + int type = bcf_hdr_id2type(filter->hdr,out[k].type,out[k].hdr_id); + if ( type==BCF_HT_INT ) set_missing = 1; + else if ( type==BCF_HT_REAL ) set_missing = 1; + } + else if ( !strcmp("QUAL",out[k].tag) ) set_missing = 1; + if ( set_missing ) { out[j].is_str = 0; out[j].is_missing = 1; bcf_double_set_missing(out[j].values[0]); } } } if ( out[i].tok_type==TOK_LIKE || out[i].tok_type==TOK_NLIKE ) @@ -1935,10 +2167,15 @@ filter_t *filter_init(bcf_hdr_t *hdr, const char *str) else if ( out[i+2].tok_type==TOK_LIKE || out[i+2].tok_type==TOK_NLIKE ) ival = i + 1; else error("[%s:%d %s] Could not parse the expression: %s\n", __FILE__,__LINE__,__FUNCTION__, filter->str); + if ( !out[ival].key ) error("Comparison between samples is not supported, sorry!\n"); + // assign correct setters and unify expressions, eg ar->ra, HOM->hom, etc if ( !strcasecmp(out[ival].key,"hom") ) { out[i].setter = filters_set_genotype3; str_to_lower(out[ival].key); } else if ( !strcasecmp(out[ival].key,"het") ) { out[i].setter = filters_set_genotype3; str_to_lower(out[ival].key); } else if ( !strcasecmp(out[ival].key,"hap") ) { out[i].setter = filters_set_genotype3; str_to_lower(out[ival].key); } + else if ( !strcasecmp(out[ival].key,"mis") ) { out[i].setter = filters_set_genotype4; str_to_lower(out[ival].key); } + else if ( !strcasecmp(out[ival].key,"ref") ) { out[i].setter = filters_set_genotype4; str_to_lower(out[ival].key); } + else if ( !strcasecmp(out[ival].key,"alt") ) { out[i].setter = filters_set_genotype4; str_to_lower(out[ival].key); } else if ( !strcasecmp(out[ival].key,"rr") ) { out[i].setter = filters_set_genotype2; str_to_lower(out[ival].key); } else if ( !strcasecmp(out[ival].key,"ra") || !strcasecmp(out[ival].key,"ar") ) { out[i].setter = filters_set_genotype2; out[ival].key[0]='r'; out[ival].key[1]='a'; } // ra else if ( !strcmp(out[ival].key,"aA") || !strcmp(out[ival].key,"Aa") ) { out[i].setter = filters_set_genotype2; out[ival].key[0]='a'; out[ival].key[1]='A'; } // aA @@ -1976,18 +2213,19 @@ filter_t *filter_init(bcf_hdr_t *hdr, const char *str) filter->nsamples = filter->max_unpack&BCF_UN_FMT ? bcf_hdr_nsamples(filter->hdr) : 0; for (i=0; insamples ) { out[i].pass_samples = (uint8_t*)malloc(filter->nsamples); int j; - for (j=0; jnsamples; j++) out[i].pass_samples[j] = 1; + for (j=0; jnsamples; j++) out[i].pass_samples[j] = 0; } } @@ -2009,6 +2247,7 @@ void filter_destroy(filter_t *filter) free(filter->filters[i].str_value.s); free(filter->filters[i].tag); free(filter->filters[i].idxs); + free(filter->filters[i].usmpl); free(filter->filters[i].values); free(filter->filters[i].pass_samples); if (filter->filters[i].hash) khash_str2int_destroy_free(filter->filters[i].hash); @@ -2034,9 +2273,7 @@ int filter_test(filter_t *filter, bcf1_t *line, const uint8_t **samples) int i, nstack = 0; for (i=0; infilters; i++) { - filter->filters[i].nsamples = 0; - filter->filters[i].nvalues = 0; - filter->filters[i].pass_site = -1; + filter->filters[i].pass_site = 0; if ( filter->filters[i].tok_type == TOK_VAL ) { @@ -2056,9 +2293,11 @@ int filter_test(filter_t *filter, bcf1_t *line, const uint8_t **samples) filter->flt_stack[nstack++] = &filter->filters[i]; continue; } - else if ( filter->filters[i].tok_type == TOK_FUNC ) // all functions take only one argument + else if ( filter->filters[i].func ) { - filter->filters[i].setter(filter, line, filter->flt_stack[nstack-1]); + int nargs = filter->filters[i].func(filter, line, &filter->filters[i], filter->flt_stack, nstack); + filter->flt_stack[nstack-nargs] = &filter->filters[i]; + if ( --nargs > 0 ) nstack -= nargs; continue; } if ( nstack<2 ) @@ -2066,111 +2305,76 @@ int filter_test(filter_t *filter, bcf1_t *line, const uint8_t **samples) int is_str = filter->flt_stack[nstack-1]->is_str + filter->flt_stack[nstack-2]->is_str; - if ( filter->filters[i].tok_type == TOK_OR || filter->filters[i].tok_type == TOK_OR_VEC ) - { - if ( filter->flt_stack[nstack-1]->pass_site<0 || filter->flt_stack[nstack-2]->pass_site<0 ) - error("Error occurred while processing the filter \"%s\" (%d %d OR)\n", filter->str,filter->flt_stack[nstack-2]->pass_site,filter->flt_stack[nstack-1]->pass_site); - filter->flt_stack[nstack-2]->pass_site = vector_logic_or(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1], filter->filters[i].tok_type); - nstack--; - continue; - } - if ( filter->filters[i].tok_type == TOK_AND || filter->filters[i].tok_type == TOK_AND_VEC ) - { - if ( filter->flt_stack[nstack-1]->pass_site<0 || filter->flt_stack[nstack-2]->pass_site<0 ) - error("Error occurred while processing the filter \"%s\" (%d %d AND)\n", filter->str,filter->flt_stack[nstack-2]->pass_site,filter->flt_stack[nstack-1]->pass_site); - filter->flt_stack[nstack-2]->pass_site = vector_logic_and(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1], filter->filters[i].tok_type); - nstack--; - continue; - } - if ( filter->filters[i].tok_type == TOK_ADD ) { - VECTOR_ARITHMETICS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],+); + VECTOR_ARITHMETICS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],+); + filter->flt_stack[nstack-2] = &filter->filters[i]; nstack--; continue; } else if ( filter->filters[i].tok_type == TOK_SUB ) { - VECTOR_ARITHMETICS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],-); + VECTOR_ARITHMETICS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],-); + filter->flt_stack[nstack-2] = &filter->filters[i]; nstack--; continue; } else if ( filter->filters[i].tok_type == TOK_MULT ) { - VECTOR_ARITHMETICS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],*); + VECTOR_ARITHMETICS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],*); + filter->flt_stack[nstack-2] = &filter->filters[i]; nstack--; continue; } else if ( filter->filters[i].tok_type == TOK_DIV ) { - VECTOR_ARITHMETICS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],/); + VECTOR_ARITHMETICS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],/); + filter->flt_stack[nstack-2] = &filter->filters[i]; nstack--; continue; } - int is_true = 0; + // ideally, these comparators would become func, but this would require more work in init1() if ( filter->filters[i].comparator ) - is_true = filter->filters[i].comparator(filter->flt_stack[nstack-1],filter->flt_stack[nstack-2],filter->filters[i].tok_type,line); - else if ( !filter->flt_stack[nstack-1]->nvalues || !filter->flt_stack[nstack-2]->nvalues ) { - int skip = 0; - if ( !filter->flt_stack[nstack-2]->is_missing && !filter->flt_stack[nstack-1]->is_missing ) skip = 1; - if ( filter->filters[i].tok_type != TOK_EQ && filter->filters[i].tok_type != TOK_NE ) skip = 1; - - if ( skip ) - filter->flt_stack[nstack-2]->nvalues = filter->flt_stack[nstack-2]->nsamples = 0; - else if ( filter->filters[i].tok_type == TOK_EQ ) - CMP_MISSING(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],==,is_true) - else if ( filter->filters[i].tok_type == TOK_NE ) - CMP_MISSING(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],!=,is_true) - } - else if ( filter->filters[i].tok_type == TOK_EQ ) - { - if ( filter->flt_stack[nstack-1]->comparator ) - is_true = filter->flt_stack[nstack-1]->comparator(filter->flt_stack[nstack-1],filter->flt_stack[nstack-2],TOK_EQ,line); - else if ( filter->flt_stack[nstack-2]->comparator ) - is_true = filter->flt_stack[nstack-2]->comparator(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],TOK_EQ,line); - else if ( is_str==2 ) // both are strings - is_true = cmp_vector_strings(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],TOK_EQ); - else if ( is_str==1 ) - error("Comparing string to numeric value: %s\n", filter->str); - else - CMP_VECTORS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],==,is_true); - } - else if ( filter->filters[i].tok_type == TOK_NE ) - { - if ( filter->flt_stack[nstack-1]->comparator ) - is_true = filter->flt_stack[nstack-1]->comparator(filter->flt_stack[nstack-1],filter->flt_stack[nstack-2],TOK_NE,line); - else if ( filter->flt_stack[nstack-2]->comparator ) - is_true = filter->flt_stack[nstack-2]->comparator(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],TOK_NE,line); - else if ( is_str==2 ) - is_true = cmp_vector_strings(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],TOK_NE); - else if ( is_str==1 ) - error("Comparing string to numeric value: %s\n", filter->str); - else - CMP_VECTORS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],!=,is_true); + filter->filters[i].comparator(filter->flt_stack[nstack-1],filter->flt_stack[nstack-2],&filter->filters[i],line); } - else if ( filter->filters[i].tok_type == TOK_LIKE || filter->filters[i].tok_type == TOK_NLIKE ) + else if ( filter->flt_stack[nstack-1]->comparator ) { - if ( is_str==2 ) - is_true = regex_vector_strings(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1], filter->filters[i].tok_type == TOK_LIKE ? 0 : 1); - else - error("The regex operator can be used on strings only: %s\n", filter->str); - } - else if ( is_str>0 ) - error("Wrong operator in string comparison: %s [%s,%s]\n", filter->str, filter->flt_stack[nstack-1]->str_value, filter->flt_stack[nstack-2]->str_value); - else if ( filter->filters[i].tok_type == TOK_LE ) - CMP_VECTORS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],<=,is_true) - else if ( filter->filters[i].tok_type == TOK_LT ) - CMP_VECTORS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],<,is_true) - else if ( filter->filters[i].tok_type == TOK_BT ) - CMP_VECTORS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],>,is_true) - else if ( filter->filters[i].tok_type == TOK_BE ) - CMP_VECTORS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],>=,is_true) + filter->flt_stack[nstack-1]->comparator(filter->flt_stack[nstack-1],filter->flt_stack[nstack-2],&filter->filters[i],line); + } + else if ( filter->flt_stack[nstack-2]->comparator ) + { + filter->flt_stack[nstack-2]->comparator(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],line); + } + else if ( is_str==2 ) + { + cmp_vector_strings(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i]); + } else - error("FIXME: did not expect this .. tok_type %d = %d\n", i, filter->filters[i].tok_type); + { + // Determine what to do with one [1] or both [2] sides missing. The first field [0] gives [1]|[2] + int missing_logic[] = {0,0,0}; + if ( filter->filters[i].tok_type == TOK_EQ ) { missing_logic[0] = missing_logic[2] = 1; } + if ( filter->filters[i].tok_type == TOK_NE ) { missing_logic[0] = missing_logic[1] = 1; } - filter->flt_stack[nstack-2]->pass_site = is_true; + if ( filter->filters[i].tok_type == TOK_EQ ) + CMP_VECTORS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],==,missing_logic) + else if ( filter->filters[i].tok_type == TOK_NE ) + CMP_VECTORS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],!=,missing_logic) + else if ( filter->filters[i].tok_type == TOK_LE ) + CMP_VECTORS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],<=,missing_logic) + else if ( filter->filters[i].tok_type == TOK_LT ) + CMP_VECTORS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],<,missing_logic) + else if ( filter->filters[i].tok_type == TOK_BT ) + CMP_VECTORS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],>,missing_logic) + else if ( filter->filters[i].tok_type == TOK_BE ) + CMP_VECTORS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],>=,missing_logic) + else + error("todo: %s:%d .. type=%d\n", __FILE__,__LINE__,filter->filters[i].tok_type); + + } + filter->flt_stack[nstack-2] = &filter->filters[i]; nstack--; } if ( nstack>1 ) error("Error occurred while processing the filter \"%s\" (2:%d)\n", filter->str,nstack); // too few values left on the stack diff --git a/bcftools/filter.c.pysam.c b/bcftools/filter.c.pysam.c index 0beb592..83f49e0 100644 --- a/bcftools/filter.c.pysam.c +++ b/bcftools/filter.c.pysam.c @@ -2,7 +2,7 @@ /* filter.c -- filter expressions. - Copyright (C) 2013-2015 Genome Research Ltd. + Copyright (C) 2013-2018 Genome Research Ltd. Author: Petr Danecek @@ -69,11 +69,15 @@ typedef struct _token_t char *tag; // for debugging and printout only, VCF tag name double threshold; // filtering threshold int hdr_id, type; // BCF header lookup ID and one of BCF_HT_* types - int idx; // 0-based index to VCF vectors, -1: not a vector, + int idx; // 0-based index to VCF vectors, // -2: list (e.g. [0,1,2] or [1..3] or [1..] or any field[*], which is equivalent to [0..]) - int *idxs, nidxs; // set indexes to 0 to exclude, to 1 to include, and last element negative if unlimited + int *idxs; // set indexes to 0 to exclude, to 1 to include, and last element negative if unlimited + int nidxs, nuidxs; // size of idxs array and the number of elements set to 1 + uint8_t *usmpl; // bitmask of used samples as set by idx + int nsamples; // number of samples for format fields, 0 for info and other fields void (*setter)(filter_t *, bcf1_t *, struct _token_t *); - int (*comparator)(struct _token_t *, struct _token_t *, int op_type, bcf1_t *); + int (*func)(filter_t *, bcf1_t *, struct _token_t *rtok, struct _token_t **stack, int nstack); + void (*comparator)(struct _token_t *, struct _token_t *, struct _token_t *rtok, bcf1_t *); void *hash; // test presence of str value in the hash via comparator regex_t *regex; // precompiled regex for string comparison @@ -83,9 +87,8 @@ typedef struct _token_t int is_str, is_missing; // is_missing is set only for constants, variables are controled via nvalues int pass_site; // -1 not applicable, 0 fails, >0 pass uint8_t *pass_samples; // status of individual samples - int nsamples; // number of samples int nvalues, mvalues; // number of used values: n=0 for missing values, n=1 for scalars, for strings n=str_value.l - int nstr1; // per-sample string length, set only with str_value.l>0 && nsamples>1 + int nval1; // number of per-sample fields or string length } token_t; @@ -128,10 +131,11 @@ struct _filter_t #define TOK_ABS 23 #define TOK_LEN 24 #define TOK_FUNC 25 +#define TOK_CNT 26 -// 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 -// ( ) [ < = > ] ! | & + - * / M m a A O ~ ^ S . l -static int op_prec[] = {0,1,1,5,5,5,5,5,5,2,3, 6, 6, 7, 7, 8, 8, 8, 3, 2, 5, 5, 8, 8, 8}; +// 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 +// ( ) [ < = > ] ! | & + - * / M m a A O ~ ^ S . l +static int op_prec[] = {0,1,1,5,5,5,5,5,5,2,3, 6, 6, 7, 7, 8, 8, 8, 3, 2, 5, 5, 8, 8, 8, 8, 8}; #define TOKEN_STRING "x()[<=>]!|&+-*/MmaAO~^f" static int filters_next_token(char **str, int *len) @@ -158,6 +162,7 @@ static int filters_next_token(char **str, int *len) if ( !strncasecmp(tmp,"AVG(",4) ) { (*str) += 3; return TOK_AVG; } if ( !strncasecmp(tmp,"SUM(",4) ) { (*str) += 3; return TOK_SUM; } if ( !strncasecmp(tmp,"ABS(",4) ) { (*str) += 3; return TOK_ABS; } + if ( !strncasecmp(tmp,"COUNT(",4) ) { (*str) += 5; return TOK_CNT; } if ( !strncasecmp(tmp,"STRLEN(",7) ) { (*str) += 6; return TOK_LEN; } if ( !strncasecmp(tmp,"%MAX(",5) ) { (*str) += 4; return TOK_MAX; } // for backward compatibility if ( !strncasecmp(tmp,"%MIN(",5) ) { (*str) += 4; return TOK_MIN; } // for backward compatibility @@ -251,12 +256,10 @@ static void filters_set_qual(filter_t *flt, bcf1_t *line, token_t *tok) { float *ptr = &line->qual; if ( bcf_float_is_missing(*ptr) ) - tok->nvalues = 0; + bcf_double_set_missing(tok->values[0]); else - { tok->values[0] = (double)line->qual; - tok->nvalues = 1; - } + tok->nvalues = 1; } static void filters_set_type(filter_t *flt, bcf1_t *line, token_t *tok) { @@ -273,7 +276,7 @@ static void filters_set_info(filter_t *flt, bcf1_t *line, token_t *tok) if ( line->d.info[i].key == tok->hdr_id ) break; if ( i==line->n_info ) - tok->nvalues = 0; + tok->nvalues = tok->str_value.l = 0; else if ( line->d.info[i].type==BCF_BT_CHAR ) { int n = line->d.info[i].len; @@ -310,40 +313,51 @@ static void filters_set_info(filter_t *flt, bcf1_t *line, token_t *tok) } } } -static int filters_cmp_bit_and(token_t *atok, token_t *btok, int op_type, bcf1_t *line) +static void filters_cmp_bit_and(token_t *atok, token_t *btok, token_t *rtok, bcf1_t *line) { int a = (int)(atok->nvalues?atok->values[0]:atok->threshold); int b = (int)(btok->nvalues?btok->values[0]:btok->threshold); - if ( op_type==TOK_LIKE ) return a&b ? 1 : 0; - return a&b ? 0 : 1; + if ( rtok->tok_type==TOK_LIKE ) + rtok->pass_site = a&b ? 1 : 0; + else + rtok->pass_site = a&b ? 0 : 1; } -static int filters_cmp_filter(token_t *atok, token_t *btok, int op_type, bcf1_t *line) +static void filters_cmp_filter(token_t *atok, token_t *btok, token_t *rtok, bcf1_t *line) { int i; - if ( op_type==TOK_NE ) // AND logic: none of the filters can match + if ( rtok->tok_type==TOK_NE ) // AND logic: none of the filters can match { if ( !line->d.n_flt ) { - if ( atok->hdr_id==-1 ) return 0; // missing value - return 1; // no filter present, eval to true + if ( atok->hdr_id==-1 ) return; // missing value + rtok->pass_site = 1; + return; // no filter present, eval to true } for (i=0; id.n_flt; i++) - if ( atok->hdr_id==line->d.flt[i] ) return 0; - return 1; + if ( atok->hdr_id==line->d.flt[i] ) return; + rtok->pass_site = 1; + return; } - // TOK_EQ with OR logic: at least one of the filters must match - if ( !line->d.n_flt ) + else if ( rtok->tok_type==TOK_EQ ) // OR logic: at least one of the filters must match { - if ( atok->hdr_id==-1 ) return 1; - return 0; // no filter present, eval to false + if ( !line->d.n_flt ) + { + if ( atok->hdr_id==-1 ) { rtok->pass_site = 1; return; } + return; // no filter present, eval to false + } + for (i=0; id.n_flt; i++) + if ( atok->hdr_id==line->d.flt[i] ) { rtok->pass_site = 1; return; } + return; } - for (i=0; id.n_flt; i++) - if ( atok->hdr_id==line->d.flt[i] ) return 1; - return 0; + else + error("Only == and != operators are supported for FILTER\n"); + return; } -static int filters_cmp_id(token_t *atok, token_t *btok, int op_type, bcf1_t *line) +static void filters_cmp_id(token_t *atok, token_t *btok, token_t *rtok, bcf1_t *line) { // multiple IDs not supported yet (easy to add though) + if ( rtok->tok_type!=TOK_EQ && rtok->tok_type!=TOK_NE ) + error("Only == and != operators are supported for ID\n"); if ( btok->hash ) { @@ -352,12 +366,15 @@ static int filters_cmp_id(token_t *atok, token_t *btok, int op_type, bcf1_t *lin if ( atok->hash ) { int ret = khash_str2int_has_key(atok->hash, line->d.id); - if ( op_type==TOK_EQ ) return ret; - return ret ? 0 : 1; + if ( rtok->tok_type==TOK_NE ) ret = ret ? 0 : 1; + rtok->pass_site = ret; + return; } - if ( op_type==TOK_EQ ) return strcmp(btok->str_value.s,line->d.id) ? 0 : 1; - return strcmp(btok->str_value.s,line->d.id) ? 1 : 0; + if ( rtok->tok_type==TOK_EQ ) + rtok->pass_site = strcmp(btok->str_value.s,line->d.id) ? 0 : 1; + else + rtok->pass_site = strcmp(btok->str_value.s,line->d.id) ? 1 : 0; } /** @@ -433,7 +450,7 @@ static void filters_set_info_int(filter_t *flt, bcf1_t *line, token_t *tok) } else { - int64_t value; + int64_t value = 0; if ( bcf_get_info_value(line,tok->hdr_id,tok->idx,&value) <= 0 ) tok->nvalues = 0; else @@ -512,14 +529,13 @@ static void filters_set_info_string(filter_t *flt, bcf1_t *line, token_t *tok) { flt->tmps.l = 0; ks_resize(&flt->tmps, n); - int i, end = tok->idxs[tok->nidxs-1] < 0 ? n - 1 : tok->nidxs - 1; - if ( end >= n ) end = n - 1; + int i, iend = tok->idxs[tok->nidxs-1] < 0 ? n - 1 : tok->nidxs - 1; + if ( iend >= n ) iend = n - 1; char *beg = tok->str_value.s, *dst = flt->tmps.s; - for (i=0; i<=end; i++) + for (i=0; i<=iend; i++) { char *end = beg; while ( *end && *end!=',' ) end++; - if ( i>=tok->nidxs || tok->idxs[i] ) { memcpy(dst, beg, end - beg); @@ -527,7 +543,6 @@ static void filters_set_info_string(filter_t *flt, bcf1_t *line, token_t *tok) dst[0] = ','; dst++; } - beg = end+1; } dst[0] = 0; @@ -551,190 +566,161 @@ static void filters_set_info_flag(filter_t *flt, bcf1_t *line, token_t *tok) static void filters_set_format_int(filter_t *flt, bcf1_t *line, token_t *tok) { - int i; - if ( (tok->nvalues=bcf_get_format_int32(flt->hdr,line,tok->tag,&flt->tmpi,&flt->mtmpi))<0 ) + if ( line->n_sample != tok->nsamples ) + error("Incorrect number of FORMAT fields at %s:%d .. %s, %d vs %d\n", bcf_seqname(flt->hdr,line),line->pos+1,tok->tag,line->n_sample,tok->nsamples); + + int nvals; + if ( (nvals=bcf_get_format_int32(flt->hdr,line,tok->tag,&flt->tmpi,&flt->mtmpi))<0 ) { - tok->nvalues = tok->nsamples = 0; + tok->nvalues = 0; return; } - if ( tok->idx >= -1 ) // scalar or vector index + int i, nsrc1 = nvals / tok->nsamples; + tok->nval1 = tok->idx >= 0 ? 1 : (tok->nuidxs ? tok->nuidxs : nsrc1); + tok->nvalues = tok->nval1*tok->nsamples; + hts_expand(double, tok->nvalues, tok->mvalues, tok->values); + + if ( tok->idx >= 0 ) // scalar or vector index { - hts_expand(double,flt->nsamples,tok->mvalues,tok->values); - int nvals = tok->nvalues / line->n_sample; - int idx = tok->idx >= 0 ? tok->idx : 0; - int is_missing = 1; - int32_t *ptr = flt->tmpi; - for (i=0; in_sample; i++) + for (i=0; insamples; i++) { - if ( ptr[idx]==bcf_int32_missing || ptr[idx]==bcf_int32_vector_end ) + if ( !tok->usmpl[i] ) continue; + int32_t *ptr = flt->tmpi + i*nsrc1; + if ( tok->idx>=nsrc1 || ptr[tok->idx]==bcf_int32_missing || ptr[tok->idx]==bcf_int32_vector_end ) bcf_double_set_missing(tok->values[i]); else - { - tok->values[i] = ptr[idx]; - is_missing = 0; - } - ptr += nvals; + tok->values[i] = ptr[tok->idx]; } - if ( is_missing ) tok->nvalues = 0; - else tok->nvalues = line->n_sample; - tok->nsamples = tok->nvalues; - return; } - if ( tok->idx == -2 ) + else { - hts_expand(double,tok->nvalues,tok->mvalues,tok->values); - int nvals = tok->nvalues / line->n_sample; - int idx = tok->idx >= 0 ? tok->idx : 0; - int is_missing = 1; - int k, j = 0, end = tok->idxs[tok->nidxs-1] < 0 ? nvals - 1 : tok->nidxs - 1; - if ( end >= nvals ) end = nvals - 1; - int32_t *ptr = flt->tmpi; - for (i=0; in_sample; i++) + int kend = tok->idxs[tok->nidxs-1] < 0 ? tok->nval1 : tok->nidxs; + for (i=0; insamples; i++) { - for (k=0; k<=end; k++) - if ( k>=tok->nidxs || tok->idxs[k] ) - { - if ( ptr[k]==bcf_int32_missing || ptr[k]==bcf_int32_vector_end ) - bcf_double_set_missing(tok->values[j]); - else - { - tok->values[j] = ptr[k]; - is_missing = 0; - } - j++; - } - ptr += nvals; - } - if ( is_missing ) tok->nvalues = tok->nsamples = 0; - else - { - tok->nsamples = line->n_sample; - tok->nvalues = j; + if ( !tok->usmpl[i] ) continue; + int32_t *src = flt->tmpi + i*nsrc1; + double *dst = tok->values + i*tok->nval1; + int k, j = 0; + for (k=0; knidxs && !tok->idxs[k] ) continue; + if ( src[k]==bcf_int32_missing || src[k]==bcf_int32_vector_end ) + bcf_double_set_missing(dst[j]); + else + dst[j] = src[k]; + j++; + } + while (j < tok->nval1) + { + bcf_double_set_missing(dst[j]); + j++; + } } - return; } } static void filters_set_format_float(filter_t *flt, bcf1_t *line, token_t *tok) { - int i; - if ( (tok->nvalues=bcf_get_format_float(flt->hdr,line,tok->tag,&flt->tmpf,&flt->mtmpf))<0 ) + if ( line->n_sample != tok->nsamples ) + error("Incorrect number of FORMAT fields at %s:%d .. %s, %d vs %d\n", bcf_seqname(flt->hdr,line),line->pos+1,tok->tag,line->n_sample,tok->nsamples); + + int nvals; + if ( (nvals=bcf_get_format_float(flt->hdr,line,tok->tag,&flt->tmpf,&flt->mtmpf))<0 ) { - tok->nvalues = tok->nsamples = 0; + tok->nvalues = 0; return; } - if ( tok->idx >= -1 ) // scalar or vector index + int i, nsrc1 = nvals / tok->nsamples; + tok->nval1 = tok->idx >= 0 ? 1 : (tok->nuidxs ? tok->nuidxs : nsrc1); + tok->nvalues = tok->nval1*tok->nsamples; + hts_expand(double, tok->nvalues, tok->mvalues, tok->values); + + if ( tok->idx >= 0 ) // scalar or vector index { - hts_expand(double,flt->nsamples,tok->mvalues,tok->values); - int nvals = tok->nvalues / line->n_sample; - int idx = tok->idx >= 0 ? tok->idx : 0; - int is_missing = 1; - float *ptr = flt->tmpf; - for (i=0; in_sample; i++) + for (i=0; insamples; i++) { - if ( bcf_float_is_missing(ptr[idx]) || bcf_float_is_vector_end(ptr[idx]) ) + if ( !tok->usmpl[i] ) continue; + float *ptr = flt->tmpf + i*nsrc1; + if ( tok->idx>=nsrc1 || bcf_float_is_missing(ptr[tok->idx]) || bcf_float_is_vector_end(ptr[tok->idx]) ) bcf_double_set_missing(tok->values[i]); else - { - tok->values[i] = ptr[idx]; - is_missing = 0; - } - ptr += nvals; + tok->values[i] = ptr[tok->idx]; } - if ( is_missing ) tok->nvalues = 0; - else tok->nvalues = line->n_sample; - tok->nsamples = tok->nvalues; - return; } - if ( tok->idx == -2 ) + else { - hts_expand(double,tok->nvalues,tok->mvalues,tok->values); - int nvals = tok->nvalues / line->n_sample; - int idx = tok->idx >= 0 ? tok->idx : 0; - int is_missing = 1; - int k, j = 0, end = tok->idxs[tok->nidxs-1] < 0 ? nvals - 1 : tok->nidxs - 1; - if ( end >= nvals ) end = nvals - 1; - float *ptr = flt->tmpf; - for (i=0; in_sample; i++) + int kend = tok->idxs[tok->nidxs-1] < 0 ? tok->nval1 : tok->nidxs; + for (i=0; insamples; i++) { - for (k=0; k<=end; k++) - if ( k>=tok->nidxs || tok->idxs[k] ) - { - if ( bcf_float_is_missing(ptr[k]) || bcf_float_is_vector_end(ptr[k]) ) - bcf_double_set_missing(tok->values[j]); - else - { - tok->values[j] = ptr[k]; - is_missing = 0; - } - j++; - } - ptr += nvals; - } - if ( is_missing ) tok->nvalues = tok->nsamples = 0; - else - { - tok->nsamples = line->n_sample; - tok->nvalues = j; + if ( !tok->usmpl[i] ) continue; + float *src = flt->tmpf + i*nsrc1; + double *dst = tok->values + i*tok->nval1; + int k, j = 0; + for (k=0; knidxs && !tok->idxs[k] ) continue; + if ( bcf_float_is_missing(src[k]) || bcf_float_is_vector_end(src[k]) ) + bcf_double_set_missing(dst[j]); + else + dst[j] = src[k]; + j++; + } + while (j < tok->nval1) + { + bcf_double_set_missing(dst[j]); + j++; + } } - return; } } static void filters_set_format_string(filter_t *flt, bcf1_t *line, token_t *tok) { - tok->str_value.l = tok->nvalues = 0; - if ( !line->n_sample ) return; + if ( line->n_sample != tok->nsamples ) + error("Incorrect number of FORMAT fields at %s:%d .. %s, %d vs %d\n", bcf_seqname(flt->hdr,line),line->pos+1,tok->tag,line->n_sample,tok->nsamples); - int ndim = tok->str_value.m; + int i, ndim = tok->str_value.m; int nstr = bcf_get_format_char(flt->hdr, line, tok->tag, &tok->str_value.s, &ndim); tok->str_value.m = ndim; + tok->str_value.l = tok->nvalues = 0; - if ( nstr<=0 ) return; - - if ( tok->idx == -1 || (tok->idx==-2 && tok->idxs[0]==-1) ) // scalar or keep all values of a vector: TAG[*] - { - tok->nsamples = line->n_sample; - tok->nstr1 = ndim / line->n_sample; - tok->nvalues = tok->str_value.l = nstr; - return; - } - - int nstr1 = nstr / line->n_sample; + if ( nstr<0 ) return; - // vector, one or multiple indices - int i; - for (i=0; in_sample; i++) + tok->nvalues = tok->str_value.l = nstr; + tok->nval1 = nstr / tok->nsamples; + for (i=0; insamples; i++) { - char *dst = tok->str_value.s + i*nstr1, *str = dst; - int nval = 0, ibeg = 0; - while ( ibeg < nstr1 ) + if ( !tok->usmpl[i] ) continue; + char *src = tok->str_value.s + i*tok->nval1, *dst = src; + int ibeg = 0, idx = 0; + while ( ibeg < tok->nval1 ) { - int iend = ibeg + 1; - while ( iend < nstr1 && str[iend] && str[iend]!=',' ) iend++; + int iend = ibeg; + while ( iend < tok->nval1 && src[iend] && src[iend]!=',' ) iend++; int keep = 0; - if ( tok->idx >=0 ) - keep = tok->idx==nval ? 1 : 0; - else if ( nval < tok->nidxs ) - keep = tok->idxs[nval] ? 1 : 0; + if ( tok->idx >= 0 ) + { + if ( tok->idx==idx ) keep = 1; + } + else if ( idx < tok->nidxs ) + { + if ( tok->idxs[idx] != 0 ) keep = 1; + } else if ( tok->idxs[tok->nidxs-1] < 0 ) keep = 1; if ( keep ) { - if ( ibeg>0 ) memmove(dst, str+ibeg, iend-ibeg+1); + if ( ibeg!=0 ) memmove(dst, src+ibeg, iend-ibeg+1); dst += iend - ibeg + 1; if ( tok->idx>=0 ) break; } - if ( !str[iend] ) break; + if ( !src[iend] ) break; ibeg = iend + 1; - nval++; + idx++; } - if ( dst==str ) { dst[0] = '.'; dst+=2; } - if ( dst - str < nstr1 ) memset(dst-1, 0, nstr1 - (dst - str)); + if ( dst==src ) { dst[0] = '.'; dst+=2; } + if ( dst - src < tok->nval1 ) memset(dst-1, 0, tok->nval1 - (dst - src)); } - tok->nvalues = tok->str_value.l = nstr; - tok->nstr1 = nstr1; - tok->nsamples = line->n_sample; } static void _filters_set_genotype(filter_t *flt, bcf1_t *line, token_t *tok, int type) { @@ -745,10 +731,10 @@ static void _filters_set_genotype(filter_t *flt, bcf1_t *line, token_t *tok, int return; } - int i,j, nsmpl = bcf_hdr_nsamples(flt->hdr), nvals = type==2 ? 3 : 4; - if ( tok->str_value.m <= nvals*nsmpl ) + int i,j, nsmpl = bcf_hdr_nsamples(flt->hdr), nvals1 = type==2 ? 3 : 4; + if ( tok->str_value.m <= nvals1*nsmpl ) { - tok->str_value.m = nvals*nsmpl + 1; + tok->str_value.m = nvals1*nsmpl + 1; tok->str_value.s = (char*)realloc(tok->str_value.s, tok->str_value.m); } @@ -770,8 +756,15 @@ static void _filters_set_genotype(filter_t *flt, bcf1_t *line, token_t *tok, int if ( bcf_gt_allele(ial)!=bcf_gt_allele(jal) ) is_het = 1; \ } \ } \ - char *dst = &tok->str_value.s[nvals*i]; \ - if ( !j || missing ) dst[0]='.', dst[1]=0; /* ., missing genotype */ \ + char *dst = &tok->str_value.s[nvals1*i]; \ + if ( type==4 ) \ + { \ + if ( !j || missing ) dst[0]='m', dst[1]='i', dst[2]='s', dst[3] = 0; /* mis, missing genotype */ \ + else if ( !has_ref ) dst[0]='a', dst[1]='l', dst[2]='t', dst[3] = 0; /* alt, no ref, must have alt allele */ \ + else if ( !is_het ) dst[0]='r', dst[1]='e', dst[2]='f', dst[3] = 0; /* ref, must be ref-only, no alt alelle */ \ + else dst[0]='a', dst[1]='l', dst[2]='t', dst[3] = 0; /* alt, is het, has alt allele */ \ + } \ + else if ( !j || missing ) dst[0]='.', dst[1]=0; /* ., missing genotype */ \ else if ( type==3 ) \ { \ if ( j==1 ) dst[0]='h', dst[1]='a', dst[2]='p', dst[3] = 0; /* hap, haploid */ \ @@ -805,31 +798,30 @@ static void _filters_set_genotype(filter_t *flt, bcf1_t *line, token_t *tok, int default: error("The GT type is not lineognised: %d at %s:%d\n",fmt->type, bcf_seqname(flt->hdr,line),line->pos+1); break; } #undef BRANCH_INT - tok->nsamples = nsmpl; - tok->nvalues = tok->str_value.l = nvals*nsmpl; + assert( tok->nsamples == nsmpl ); + tok->nvalues = tok->str_value.l = nvals1*nsmpl; tok->str_value.s[tok->str_value.l] = 0; - tok->nstr1 = nvals; + tok->nval1 = nvals1; } static void filters_set_genotype2(filter_t *flt, bcf1_t *line, token_t *tok) { _filters_set_genotype(flt, line, tok, 2); } static void filters_set_genotype3(filter_t *flt, bcf1_t *line, token_t *tok) { _filters_set_genotype(flt, line, tok, 3); } +static void filters_set_genotype4(filter_t *flt, bcf1_t *line, token_t *tok) { _filters_set_genotype(flt, line, tok, 4); } static void filters_set_genotype_string(filter_t *flt, bcf1_t *line, token_t *tok) { bcf_fmt_t *fmt = bcf_get_fmt(flt->hdr, line, "GT"); if ( !fmt ) { - tok->nvalues = tok->nsamples = 0; + tok->nvalues = 0; return; } - int i, blen = 4, nsmpl = bcf_hdr_nsamples(flt->hdr); - kstring_t str; + int i, blen = 4, nsmpl = line->n_sample; gt_length_too_big: tok->str_value.l = 0; for (i=0; istr_value.l; - + size_t plen = tok->str_value.l; bcf_format_gt(fmt, i, &tok->str_value); kputc_(0, &tok->str_value); if ( tok->str_value.l - plen > blen ) @@ -847,9 +839,9 @@ gt_length_too_big: plen++; } } - tok->nsamples = nsmpl; + assert( tok->nsamples == nsmpl ); tok->nvalues = tok->str_value.l; - tok->nstr1 = blen; + tok->nval1 = blen; } static void filters_set_ref_string(filter_t *flt, bcf1_t *line, token_t *tok) { @@ -870,7 +862,7 @@ static void filters_set_alt_string(filter_t *flt, bcf1_t *line, token_t *tok) } else if ( tok->idx==-2 ) { - int i, j = 0, end = tok->idxs[tok->nidxs-1] < 0 ? line->n_allele - 1 : tok->nidxs - 1; + int i, end = tok->nuidxs ? tok->nuidxs : line->n_allele - 1; if ( end >= line->n_allele - 1 ) end = line->n_allele - 2; for (i=0; i<=end; i++) if ( i>=tok->nidxs || tok->idxs[i] ) @@ -1000,59 +992,111 @@ static void filters_set_maf(filter_t *flt, bcf1_t *line, token_t *tok) } } -static void set_max(filter_t *flt, bcf1_t *line, token_t *tok) +static int func_max(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack) { + token_t *tok = stack[nstack - 1]; + rtok->nvalues = 0; + if ( !tok->nvalues ) return 1; double val = -HUGE_VAL; - int i; + int i, has_value = 0; for (i=0; invalues; i++) { - if ( !bcf_double_is_missing(tok->values[i]) && val < tok->values[i] ) val = tok->values[i]; + if ( bcf_double_is_missing(tok->values[i]) || bcf_double_is_vector_end(tok->values[i]) ) continue; + has_value = 1; + if ( val < tok->values[i] ) val = tok->values[i]; + } + if ( has_value ) + { + rtok->values[0] = val; + rtok->nvalues = has_value; } - tok->values[0] = val; - tok->nvalues = 1; - tok->nsamples = 0; + return 1; } -static void set_min(filter_t *flt, bcf1_t *line, token_t *tok) +static int func_min(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack) { + token_t *tok = stack[nstack - 1]; + rtok->nvalues = 0; + if ( !tok->nvalues ) return 1; double val = HUGE_VAL; - int i; + int i, has_value = 0; for (i=0; invalues; i++) - if ( !bcf_double_is_missing(tok->values[i]) && val > tok->values[i] ) val = tok->values[i]; - tok->values[0] = val; - tok->nvalues = 1; - tok->nsamples = 0; + { + if ( bcf_double_is_missing(tok->values[i]) || bcf_double_is_vector_end(tok->values[i]) ) continue; + has_value = 1; + if ( val > tok->values[i] ) val = tok->values[i]; + } + if ( has_value ) + { + rtok->values[0] = val; + rtok->nvalues = has_value; + } + return 1; } -static void set_avg(filter_t *flt, bcf1_t *line, token_t *tok) +static int func_avg(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack) { + token_t *tok = stack[nstack - 1]; + rtok->nvalues = 0; + if ( !tok->nvalues ) return 1; double val = 0; int i, n = 0; for (i=0; invalues; i++) if ( !bcf_double_is_missing(tok->values[i]) ) { val += tok->values[i]; n++; } - tok->values[0] = n ? val / n : 0; - tok->nvalues = 1; - tok->nsamples = 0; + if ( n ) + { + rtok->values[0] = val / n; + rtok->nvalues = 1; + } + return 1; } -static void set_sum(filter_t *flt, bcf1_t *line, token_t *tok) +static int func_sum(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack) { + rtok->nvalues = 0; + token_t *tok = stack[nstack - 1]; + if ( !tok->nvalues ) return 1; double val = 0; int i, n = 0; for (i=0; invalues; i++) if ( !bcf_double_is_missing(tok->values[i]) ) { val += tok->values[i]; n++; } - tok->values[0] = val; - tok->nvalues = 1; - tok->nsamples = 0; + if ( n ) + { + rtok->values[0] = val; + rtok->nvalues = 1; + } + return 1; } -static void set_abs(filter_t *flt, bcf1_t *line, token_t *tok) +static int func_abs(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack) { + token_t *tok = stack[nstack - 1]; if ( tok->is_str ) error("ABS() can be applied only on numeric values\n"); + + rtok->nvalues = tok->nvalues; + if ( !tok->nvalues ) return 1; + hts_expand(double, rtok->nvalues, rtok->mvalues, rtok->values); int i; for (i=0; invalues; i++) - tok->values[i] = fabs(tok->values[i]); + if ( bcf_double_is_missing(tok->values[i]) ) bcf_double_set_missing(rtok->values[i]); + else rtok->values[i] = fabs(tok->values[i]); + return 1; +} +static int func_count(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack) +{ + token_t *tok = stack[nstack - 1]; + if ( !tok->nsamples ) error("COUNT() can be applied only on FORMAT fields\n"); + + int i, cnt = 0; + for (i=0; insamples; i++) + if ( tok->pass_samples[i] ) cnt++; + + rtok->nvalues = 1; + rtok->values[0] = cnt; + return 1; } -static void set_strlen(filter_t *flt, bcf1_t *line, token_t *tok) +static int func_strlen(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack) { - tok->is_str = 0; - if ( !tok->str_value.l ) return; + token_t *tok = stack[nstack - 1]; + rtok->nvalues = rtok->str_value.l = 0; + if ( !tok->str_value.l ) return 1; + if ( tok->idx==-2 ) { int i = 0; @@ -1061,202 +1105,193 @@ static void set_strlen(filter_t *flt, bcf1_t *line, token_t *tok) { char *se = ss; while ( *se && *se!=',' ) se++; - hts_expand(double, i+1, tok->mvalues, tok->values); - if ( !*se ) tok->values[i] = strlen(ss); + hts_expand(double, i+1, rtok->mvalues, rtok->values); + if ( !*se ) rtok->values[i] = strlen(ss); else { *se = 0; - tok->values[i] = strlen(ss); + rtok->values[i] = strlen(ss); *se = ','; } ss = *se ? se + 1 : se; i++; } - tok->nvalues = i; + rtok->nvalues = i; } else { - tok->values[0] = strlen(tok->str_value.s); - tok->nvalues = 1; + rtok->values[0] = strlen(tok->str_value.s); + rtok->nvalues = 1; } - tok->str_value.l = 0; + return 1; +} +inline static void tok_init_values(token_t *atok, token_t *btok, token_t *rtok) +{ + token_t *tok = atok->nvalues > btok->nvalues ? atok : btok; + rtok->nvalues = tok->nvalues; + rtok->nval1 = tok->nval1; + hts_expand(double*, rtok->nvalues, rtok->mvalues, rtok->values); } -#define VECTOR_ARITHMETICS(atok,btok,AOP) \ +inline static void tok_init_samples(token_t *atok, token_t *btok, token_t *rtok) +{ + if ( (atok->nsamples || btok->nsamples) && !rtok->nsamples ) + { + rtok->nsamples = atok->nsamples ? atok->nsamples : btok->nsamples; + rtok->usmpl = (uint8_t*) calloc(rtok->nsamples,1); + int i; + for (i=0; insamples; i++) rtok->usmpl[i] |= atok->usmpl[i]; + for (i=0; insamples; i++) rtok->usmpl[i] |= btok->usmpl[i]; + } + memset(rtok->pass_samples, 0, rtok->nsamples); +} + +#define VECTOR_ARITHMETICS(atok,btok,_rtok,AOP) \ { \ + token_t *rtok = _rtok; \ int i, has_values = 0; \ - if ( !(atok)->nvalues || !(btok)->nvalues ) /* missing values */ \ + if ( atok->nvalues && btok->nvalues ) \ { \ - (atok)->nvalues = 0; (atok)->nsamples = 0; \ - } \ - else \ - { \ - if ( ((atok)->nsamples && (btok)->nsamples) || (!(atok)->nsamples && !(btok)->nsamples)) \ - { \ - for (i=0; i<(atok)->nvalues; i++) \ - { \ - if ( bcf_double_is_missing((atok)->values[i]) ) continue; \ - if ( bcf_double_is_missing((btok)->values[i]) ) { bcf_double_set_missing((atok)->values[i]); continue; } \ - has_values = 1; \ - (atok)->values[i] = (atok)->values[i] AOP (btok)->values[i]; \ - } \ - } \ - else if ( (btok)->nsamples ) \ + tok_init_values(atok, btok, rtok); \ + tok_init_samples(atok, btok, rtok); \ + if ( (atok->nsamples && btok->nsamples) || (!atok->nsamples && !btok->nsamples)) \ { \ - hts_expand(double,(btok)->nvalues,(atok)->mvalues,(atok)->values); \ - for (i=0; i<(btok)->nvalues; i++) \ + assert( atok->nsamples==btok->nsamples ); \ + for (i=0; invalues; i++) \ { \ - if ( bcf_double_is_missing((atok)->values[0]) || bcf_double_is_missing((btok)->values[i]) ) \ + if ( bcf_double_is_missing(atok->values[i]) || bcf_double_is_missing(btok->values[i]) ) \ { \ - bcf_double_set_missing((atok)->values[i]); \ + bcf_double_set_missing(rtok->values[i]); \ continue; \ } \ has_values = 1; \ - (atok)->values[i] = (atok)->values[0] AOP (btok)->values[i]; \ + rtok->values[i] = atok->values[i] AOP btok->values[i]; \ } \ - (atok)->nvalues = (btok)->nvalues; \ - (atok)->nsamples = (btok)->nsamples; \ } \ - else if ( (atok)->nsamples ) \ + else \ { \ - for (i=0; i<(atok)->nvalues; i++) \ + token_t *xtok = atok->nsamples ? atok : btok; \ + token_t *ytok = atok->nsamples ? btok : atok; \ + assert( ytok->nvalues==1 ); \ + if ( !bcf_double_is_missing(ytok->values[0]) ) \ { \ - if ( bcf_double_is_missing((atok)->values[i]) || bcf_double_is_missing((btok)->values[0]) ) \ + for (i=0; invalues; i++) \ { \ - bcf_double_set_missing((atok)->values[i]); \ - continue; \ + if ( bcf_double_is_missing(xtok->values[i]) ) \ + { \ + bcf_double_set_missing(rtok->values[i]); \ + continue; \ + } \ + has_values = 1; \ + rtok->values[i] = xtok->values[i] AOP ytok->values[0]; \ } \ - has_values = 1; \ - (atok)->values[i] = (atok)->values[i] AOP (btok)->values[0]; \ } \ } \ } \ - if ( !has_values ) { (atok)->nvalues = 0; (atok)->nsamples = 0; } \ + if ( !has_values ) rtok->nvalues = 0; \ } -static int vector_logic_and(token_t *atok, token_t *btok, int and_type) +static int vector_logic_or(filter_t *filter, bcf1_t *line, token_t *rtok, token_t **stack, int nstack) { - // We are comparing either two scalars (result of INFO tag vs a threshold), two vectors (two FORMAT fields), - // or a vector and a scalar (FORMAT field vs threshold) - int i, pass_site = 0; - if ( !atok->nvalues || !btok->nvalues ) - { - atok->nvalues = atok->nsamples = 0; - return 0; - } - if ( !atok->nsamples && !btok->nsamples ) return atok->pass_site && btok->pass_site; - if ( atok->nsamples && btok->nsamples ) + token_t *atok = stack[nstack-2]; + token_t *btok = stack[nstack-1]; + tok_init_samples(atok, btok, rtok); + + if ( !atok->pass_site && !btok->pass_site ) return 2; + + rtok->pass_site = 1; + if ( !atok->nsamples && !btok->nsamples ) return 2; + + int i; + if ( rtok->tok_type==TOK_OR_VEC ) // ||, select all samples if one is true { - if ( and_type==TOK_AND ) + if ( (!atok->nsamples && !atok->pass_site) || (!btok->nsamples && !btok->pass_site) ) { - // perform AND within a sample - for (i=0; insamples; i++) + // These two conditions are to ensure the following does not set all samples + // at sites with QUAL<=30: + // QUAL>30 || FMT/GQ>30 + + token_t *tok = atok->nsamples ? atok : btok; + for (i=0; insamples; i++) { - atok->pass_samples[i] = atok->pass_samples[i] && btok->pass_samples[i]; - if ( !pass_site && atok->pass_samples[i] ) pass_site = 1; + if ( !rtok->usmpl[i] ) continue; + rtok->pass_samples[i] = tok->pass_samples[i]; } } else { - // perform AND across samples - int pass_a = 0, pass_b = 0; - for (i=0; insamples; i++) - { - if ( atok->pass_samples[i] ) pass_a = 1; - atok->pass_samples[i] = atok->pass_samples[i] && btok->pass_samples[i]; - } - for (i=0; insamples; i++) + for (i=0; insamples; i++) { - if ( btok->pass_samples[i] ) { pass_b = 1; break; } + if ( !rtok->usmpl[i] ) continue; + rtok->pass_samples[i] = 1; } - pass_site = pass_a && pass_b; } - return pass_site; + return 2; } - if ( btok->nsamples ) + + // |, only select samples which are actually true + + if ( !atok->nsamples || !btok->nsamples ) { - for (i=0; insamples; i++) + token_t *tok = atok->nsamples ? atok : btok; + for (i=0; insamples; i++) { - atok->pass_samples[i] = atok->pass_site && btok->pass_samples[i]; - if ( !pass_site && atok->pass_samples[i] ) pass_site = 1; + if ( !rtok->usmpl[i] ) continue; + rtok->pass_samples[i] = tok->pass_samples[i]; } - atok->nsamples = btok->nsamples; - return pass_site; + return 2; } - /* atok->nsamples!=0 */ - for (i=0; insamples; i++) + + assert( atok->nsamples==btok->nsamples ); + + for (i=0; insamples; i++) { - atok->pass_samples[i] = atok->pass_samples[i] && btok->pass_site; - if ( !pass_site && atok->pass_samples[i] ) pass_site = 1; + if ( !rtok->usmpl[i] ) continue; + rtok->pass_samples[i] = atok->pass_samples[i] | btok->pass_samples[i]; } - return pass_site; + return 2; } -static int vector_logic_or(token_t *atok, token_t *btok, int or_type) +static int vector_logic_and(filter_t *filter, bcf1_t *line, token_t *rtok, token_t **stack, int nstack) { - int i, pass_site = 0; - if ( !atok->nvalues && !btok->nvalues ) // missing sites in both - { - atok->nvalues = atok->nsamples = 0; - return 0; - } - if ( !atok->nvalues ) // missing value in a - { - for (i=0; insamples; i++) - atok->pass_samples[i] = btok->pass_samples[i]; - atok->nsamples = btok->nsamples; - atok->nvalues = 1; - return btok->pass_site; - } - if ( !btok->nvalues ) // missing value in b - { - btok->nvalues = 1; - return atok->pass_site; - } + token_t *atok = stack[nstack-2]; + token_t *btok = stack[nstack-1]; + tok_init_samples(atok, btok, rtok); - if ( !atok->nsamples && !btok->nsamples ) return atok->pass_site || btok->pass_site; - if ( !atok->nsamples ) + if ( !atok->pass_site || !btok->pass_site ) return 2; + if ( !atok->nsamples && !btok->nsamples ) { rtok->pass_site = 1; return 2; } + + int i; + if ( !atok->nsamples || !btok->nsamples ) { - if ( or_type==TOK_OR ) + token_t *tok = atok->nsamples ? atok : btok; + for (i=0; insamples; i++) { - for (i=0; insamples; i++) - { - atok->pass_samples[i] = btok->pass_samples[i]; - if ( atok->pass_site || atok->pass_samples[i] ) pass_site = 1; - } + if ( !rtok->usmpl[i] ) continue; + rtok->pass_samples[i] = tok->pass_samples[i]; } - else - { - for (i=0; insamples; i++) - { - atok->pass_samples[i] = atok->pass_site || btok->pass_samples[i]; - if ( atok->pass_samples[i] ) pass_site = 1; - } - } - atok->nsamples = btok->nsamples; - return pass_site; + rtok->pass_site = 1; + return 2; } - if ( !btok->nsamples ) // vector vs site + + assert( atok->nsamples==btok->nsamples ); + if ( rtok->tok_type==TOK_AND_VEC ) // &&, can be true in different samples { - if ( or_type==TOK_OR ) + for (i=0; insamples; i++) { - for (i=0; insamples; i++) - if ( btok->pass_site || atok->pass_samples[i] ) pass_site = 1; + if ( !rtok->usmpl[i] ) continue; + rtok->pass_samples[i] = atok->pass_samples[i] | btok->pass_samples[i]; } - else - { - for (i=0; insamples; i++) - { - atok->pass_samples[i] = atok->pass_samples[i] || btok->pass_site; - if ( atok->pass_samples[i] ) pass_site = 1; - } - } - return pass_site; + rtok->pass_site = 1; } - for (i=0; insamples; i++) + else // &, must be true within one sample { - atok->pass_samples[i] = atok->pass_samples[i] || btok->pass_samples[i]; - if ( !pass_site && atok->pass_samples[i] ) pass_site = 1; + for (i=0; insamples; i++) + { + if ( !rtok->usmpl[i] ) continue; + rtok->pass_samples[i] = atok->pass_samples[i] & btok->pass_samples[i]; + if ( rtok->pass_samples[i] ) rtok->pass_site = 1; + } } - return pass_site; + return 2; } #define CMP_MISSING(atok,btok,CMP_OP,ret) \ @@ -1267,224 +1302,394 @@ static int vector_logic_or(token_t *atok, token_t *btok, int or_type) tok->nvalues = 1; \ } -#define CMP_VECTORS(atok,btok,CMP_OP,ret) \ +#define CMP_VECTORS(atok,btok,_rtok,CMP_OP,missing_logic) \ { \ - int i, j, has_values = 0, pass_site = 0; \ - if ( !(atok)->nvalues || !(btok)->nvalues ) { (atok)->nvalues = 0; (atok)->nsamples = 0; (ret) = 0; } \ - else \ + token_t *rtok = _rtok; \ + int i, j, k; \ + assert( !atok->nsamples || !btok->nsamples ); \ + tok_init_samples(atok, btok, rtok); \ + if ( !atok->nsamples && !btok->nsamples ) \ { \ - if ( (atok)->idx<=-2 || (btok)->idx<=-2 ) \ + if ( !atok->nvalues && !btok->nvalues ) { rtok->pass_site = missing_logic[2]; } \ + else if ( !atok->nvalues || !btok->nvalues ) \ { \ - /* any field can match: [*] */ \ - for (i=0; i<(atok)->nvalues; i++) \ + token_t *tok = atok->nvalues ? atok : btok; \ + for (j=0; jnvalues; j++) \ { \ - for (j=0; j<(btok)->nvalues; j++) \ - if ( (atok)->values[i] CMP_OP (btok)->values[j] ) { pass_site = 1; i = (atok)->nvalues; break; } \ + if ( bcf_double_is_missing(tok->values[j]) ) \ + { \ + if ( missing_logic[2] ) { rtok->pass_site = 1; break; } \ + } \ + else if ( missing_logic[1] ) { rtok->pass_site = 1; break; } \ } \ } \ - else if ( (atok)->nsamples && (btok)->nsamples ) \ + else \ { \ - for (i=0; i<(atok)->nsamples; i++) \ + for (i=0; invalues; i++) \ { \ - has_values = 1; \ - if ( (atok)->values[i] CMP_OP (btok)->values[i] ) { (atok)->pass_samples[i] = 1; pass_site = 1; } \ - else (atok)->pass_samples[i] = 0; \ + int amiss = bcf_double_is_missing(atok->values[i]) ? 1 : 0; \ + for (j=0; jnvalues; j++) \ + { \ + int nmiss = amiss + (bcf_double_is_missing(btok->values[j]) ? 1 : 0); \ + if ( nmiss ) \ + { \ + if ( missing_logic[nmiss] ) { rtok->pass_site = 1; i = atok->nvalues; break; } \ + } \ + else if ( atok->values[i] CMP_OP btok->values[j] ) { rtok->pass_site = 1; i = atok->nvalues; break; } \ + } \ } \ - if ( !has_values ) (atok)->nvalues = 0; \ } \ - else if ( (atok)->nsamples ) \ + } \ + else \ + { \ + if ( !atok->nvalues && !btok->nvalues ) \ { \ - for (i=0; i<(atok)->nsamples; i++) \ + if ( missing_logic[2] ) \ { \ - /*if ( bcf_double_is_missing((atok)->values[i]) ) { (atok)->pass_samples[i] = 0; continue; }*/ \ - has_values = 1; \ - if ( (atok)->values[i] CMP_OP (btok)->values[0] ) { (atok)->pass_samples[i] = 1; pass_site = 1; } \ - else (atok)->pass_samples[i] = 0; \ + for (i=0; insamples; i++) \ + if ( rtok->usmpl[i] ) { rtok->pass_samples[i] = missing_logic[2]; rtok->pass_site = 1; } \ } \ - if ( !has_values ) (atok)->nvalues = 0; \ } \ - else if ( (btok)->nsamples ) \ + else if ( !atok->nvalues || !btok->nvalues ) \ { \ - for (i=0; i<(btok)->nsamples; i++) \ + token_t *tok = atok->nvalues ? atok : btok; \ + if ( !tok->nsamples ) \ { \ - if ( bcf_double_is_missing((btok)->values[i]) ) { (atok)->pass_samples[i] = 0; continue; } \ - has_values = 1; \ - if ( (atok)->values[0] CMP_OP (btok)->values[i] ) { (atok)->pass_samples[i] = 1; pass_site = 1; } \ - else (atok)->pass_samples[i] = 0; \ + int miss = 0; \ + for (j=0; jnvalues; j++) \ + miss |= bcf_double_is_missing(tok->values[j]) ? 1 : 0; \ + if ( missing_logic[++miss] ) \ + { \ + for (i=0; insamples; i++) \ + if ( rtok->usmpl[i] ) { rtok->pass_samples[i] = missing_logic[miss]; rtok->pass_site = 1; } \ + } \ } \ - (atok)->nvalues = (btok)->nvalues; \ - (atok)->nsamples = (btok)->nsamples; \ - if ( !has_values ) (atok)->nvalues = 0; \ + else \ + for (i=0; insamples; i++) \ + { \ + if ( !rtok->usmpl[i] ) continue; \ + double *ptr = tok->values + i*tok->nval1; \ + int miss = 0; \ + for (j=0; jnval1; j++) \ + miss |= bcf_double_is_missing(ptr[j]) ? 1 : 0; \ + if ( missing_logic[++miss] ) { rtok->pass_samples[i] = missing_logic[miss]; rtok->pass_site = 1; } \ + } \ } \ else \ { \ - if ( (atok)->values[0] CMP_OP (btok)->values[0] ) { pass_site = 1; } \ + token_t *xtok = atok->nsamples ? atok : btok; \ + token_t *ytok = atok->nsamples ? btok : atok; \ + for (i=0; insamples; i++) \ + { \ + if ( !rtok->usmpl[i] ) continue; \ + double *xptr = xtok->values + i*xtok->nval1; \ + double *yptr = ytok->values + i*ytok->nval1; \ + for (j=0; jnval1; j++) \ + { \ + int miss = bcf_double_is_missing(xptr[j]) ? 1 : 0; \ + if ( miss && !missing_logic[0] ) continue; /* any is missing => result is false */ \ + for (k=0; knvalues; k++) \ + { \ + int nmiss = miss + (bcf_double_is_missing(yptr[k]) ? 1 : 0); \ + if ( nmiss ) \ + { \ + if ( missing_logic[nmiss] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; j = xtok->nval1; break; } \ + } \ + else if ( xptr[j] CMP_OP yptr[k] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; j = xtok->nval1; break; } \ + } \ + } \ + } \ } \ - /*fprintf(bcftools_stderr,"pass=%d\n", pass_site);*/ \ - (ret) = pass_site; \ } \ } -static int cmp_vector_strings(token_t *atok, token_t *btok, int logic) // logic: TOK_EQ or TOK_NE +static int _regex_vector_strings(regex_t *regex, char *str, size_t len, int logic, int *missing_logic) { - if ( !atok->str_value.l ) { return 0; } - if ( !btok->str_value.l ) { atok->str_value.l = 0; return 0; } - int i, pass_site = 0; - if ( atok->nsamples && atok->nsamples==btok->nsamples ) - { - for (i=0; insamples; i++) - { - char *astr = atok->str_value.s + i*atok->nstr1; - char *bstr = btok->str_value.s + i*btok->nstr1; - char *aend = astr + atok->str_value.l, *a = astr; - while ( astr_value.l, *b = bstr; - while ( bpass_samples[i] = 0; - else atok->pass_samples[i] = strncmp(astr,bstr,a-astr)==0 ? 1 : 0; - if ( logic!=TOK_EQ ) - atok->pass_samples[i] = atok->pass_samples[i] ? 0 : 1; - pass_site |= atok->pass_samples[i]; - } - if ( !atok->nsamples ) atok->nsamples = btok->nsamples; - } - else if ( !atok->nsamples && !btok->nsamples ) - { - if ( atok->idx==-2 || btok->idx==-2 ) - { - // any field can match: [*] - if ( atok->idx==-2 && btok->idx==-2 ) - error("fixme: Expected at least one scalar value [%s %s %s]\n", atok->tag ? atok->tag : btok->tag, atok->str_value,btok->str_value); - token_t *xtok, *ytok; // xtok is scalar, ytok array - if ( btok->idx==-2 ) { xtok = atok; ytok = btok; } - else { xtok = btok; ytok = atok; } - char *xstr = xtok->str_value.s, *xend = xstr + xtok->str_value.l; - char *ystr = ytok->str_value.s, *yend = ystr + ytok->str_value.l, *y = ystr; - while ( y<=yend ) - { - if ( y==yend || *y==',' ) - { - if ( y-ystr==xend-xstr && !strncmp(xstr,ystr,xend-xstr) ) - { - pass_site = 1; - break; - } - ystr = y+1; - } - y++; - } - } - else - pass_site = strcmp(atok->str_value.s,btok->str_value.s) ? 0 : 1; - if ( logic!=TOK_EQ ) pass_site = pass_site ? 0 : 1; + char *end = str + len; + while ( str < end && *str ) + { + char *mid = str; + while ( mid < end && *mid && *mid!=',' ) mid++; + int miss = mid - str == 1 && str[0]=='.' ? 1 : 0; + if ( miss && missing_logic[miss] ) return 1; + char tmp = *mid; *mid = 0; + int match = regexec(regex, str, 0,NULL,0) ? 0 : 1; + *mid = tmp; + if ( logic==TOK_NLIKE ) match = match ? 0 : 1; + if ( match ) return 1; + if ( !*mid ) break; + str = mid + 1; } - else + return 0; +} +static inline int _has_missing_string(char *beg) +{ + while ( *beg ) { - token_t *xtok, *ytok; - if ( !atok->nsamples ) { xtok = atok; ytok = btok; } - else { xtok = btok; ytok = atok; } - char *xstr = xtok->str_value.s; - char *xend = xstr + xtok->str_value.l, *x = xstr; - while ( xnsamples; i++) - { - char *ystr = ytok->str_value.s + i*ytok->nstr1; - char *ybeg = ystr, *yend = ystr + ytok->nstr1; - int pass = 0; - while ( ybeg < yend ) + char *end = beg; + while ( *end && *end!=',' ) end++; + if ( end-beg==1 && beg[0]=='.' ) return 1; + if ( !*end ) break; + beg = end + 1; + } + return 0; +} + +// Compare two strings with multiple fields, for example "A,B,.,C"=="X,Y,A". +// Returns 1 if any field matches, otherwise returns 0 +static inline int _match_vector_strings(char *abeg, size_t alen, char *bstr, size_t blen, int logic, int *missing_logic) +{ + char *aend = abeg + alen; + char *bend = bstr + blen; + while ( abeg < aend && *abeg ) + { + char *amid = abeg; + while ( amid < aend && *amid && *amid!=',' ) amid++; + int miss = amid - abeg == 1 && abeg[0]=='.' ? 1 : 0; + char *bbeg = bstr; + while ( bbeg < bend && *bbeg ) + { + char *bmid = bbeg; + while ( bmid < bend && *bmid && *bmid!=',' ) bmid++; + int nmiss = miss + (bmid - bbeg == 1 && bbeg[0]=='.' ? 1 : 0); + if ( nmiss ) { - char *y = ybeg; - while ( ypass_samples[i] = pass; - pass_site |= pass; + else + { + int match = amid-abeg==bmid-bbeg && !strncmp(abeg,bbeg,amid-abeg) ? 1 : 0; + if ( logic==TOK_NE ) match = match==1 ? 0 : 1; + if ( match ) return 1; + } + if ( !*bmid ) break; + bbeg = bmid + 1; } - if ( !atok->nsamples ) - atok->nvalues = atok->nsamples = btok->nsamples; // is it a bug? not sure if atok->nvalues should be set + if ( !*amid ) break; + abeg = amid + 1; } - return pass_site; + return 0; } -static int regex_vector_strings(token_t *atok, token_t *btok, int negate) +static void cmp_vector_strings(token_t *atok, token_t *btok, token_t *rtok) { - int i, pass_site = 0; - if ( atok->nsamples ) + tok_init_samples(atok, btok, rtok); + + int i, logic = rtok->tok_type; // TOK_EQ, TOK_NE, TOK_LIKE, TOK_NLIKE + regex_t *regex = atok->regex ? atok->regex : (btok->regex ? btok->regex : NULL); + + assert( atok->nvalues==atok->str_value.l && btok->nvalues==btok->str_value.l ); + assert( !atok->nsamples || !btok->nsamples ); + assert( (!regex && (logic==TOK_EQ || logic==TOK_NE)) || (regex && (logic==TOK_LIKE || logic==TOK_NLIKE)) ); + + int missing_logic[] = {0,0,0}; + if ( logic==TOK_EQ || logic==TOK_LIKE ) missing_logic[0] = missing_logic[2] = 1; + else if ( logic==TOK_NE || logic==TOK_NLIKE ) missing_logic[0] = missing_logic[1] = 1; + + if ( !atok->nsamples && !btok->nsamples ) + { + if ( !atok->nvalues && !btok->nvalues ) { rtok->pass_site = missing_logic[2]; return; } + if ( !atok->nvalues || !btok->nvalues ) + { + int miss = _has_missing_string(atok->nvalues ? atok->str_value.s : btok->str_value.s); + if ( missing_logic[miss+1] ) rtok->pass_site = 1; + return; + } + if ( !regex ) + rtok->pass_site = _match_vector_strings(atok->str_value.s, atok->str_value.l, btok->str_value.s, btok->str_value.l, logic, missing_logic); + else + { + token_t *tok = atok->regex ? btok : atok; + rtok->pass_site = _regex_vector_strings(regex, tok->str_value.s, tok->str_value.l, logic, missing_logic); + } + return; + } + + // The case of (!atok->nsamples || !btok->nsamples) + + if ( !atok->nvalues && !btok->nvalues ) { - for (i=0; insamples; i++) + if ( missing_logic[2] ) { - char *ptr = atok->str_value.s + i*atok->nstr1; - atok->pass_samples[i] = regexec(btok->regex, ptr, 0,NULL,0) ? 0 : 1; - if ( negate ) atok->pass_samples[i] = atok->pass_samples[i] ? 0 : 1; - pass_site |= atok->pass_samples[i]; + for (i=0; insamples; i++) + if ( rtok->usmpl[i] ) { rtok->pass_samples[i] = missing_logic[2]; rtok->pass_site = 1; } } - return pass_site; + return; + } + if ( !atok->nvalues || !btok->nvalues ) + { + token_t *tok = atok->nvalues ? atok : btok; + if ( !tok->nsamples ) + { + int miss = _has_missing_string(tok->str_value.s); + if ( !missing_logic[miss+1] ) return; + for (i=0; insamples; i++) + if ( rtok->usmpl[i] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; } + } + else + for (i=0; insamples; i++) + { + if ( !rtok->usmpl[i] ) continue; + int miss = _has_missing_string(tok->str_value.s + i*tok->nval1); + if ( missing_logic[miss+1] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; } + } + return; + } + + // The case of (!atok->nsamples || !btok->nsamples) && (atok->nvalues && btok->nvalues) + token_t *xtok = atok->nsamples ? atok : btok; + token_t *ytok = atok->nsamples ? btok : atok; + assert( regex==ytok->regex ); + for (i=0; insamples; i++) + { + if ( !rtok->usmpl[i] ) continue; + int match; + if ( regex ) + match = _regex_vector_strings(regex, xtok->str_value.s + i*xtok->nval1, xtok->nval1, logic, missing_logic); + else + match = _match_vector_strings(xtok->str_value.s + i*xtok->nval1, xtok->nval1, ytok->str_value.s, ytok->str_value.l, logic, missing_logic); + if ( match ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; } } - pass_site = regexec(btok->regex, atok->str_value.s, 0,NULL,0) ? 0 : 1; - if ( negate ) pass_site = pass_site ? 0 : 1; - return pass_site; } -static void parse_tag_idx(char *tag, char *tag_idx, token_t *tok) // tag_idx points just after "TAG[" +static int parse_idxs(char *tag_idx, int **idxs, int *nidxs, int *idx) { - // TAG[*] .. any field - if ( !strncmp("*]", tag_idx, 3) ) + // TAG[], TAG[*] .. any field; sets idx=-2, idxs[0]=-1 + if ( *tag_idx==0 || !strcmp("*", tag_idx) ) { - tok->idxs = (int*) malloc(sizeof(int)); - tok->idxs[0] = -1; - tok->nidxs = 1; - tok->idx = -2; - return; + *idxs = (int*) malloc(sizeof(int)); + (*idxs)[0] = -1; + *nidxs = 1; + *idx = -2; + return 0; } - // TAG[integer] .. one field + // TAG[integer] .. one field; idx positive char *end, *beg = tag_idx; - tok->idx = strtol(tag_idx, &end, 10); - if ( tok->idx >= 0 && *end==']' ) return; + *idx = strtol(tag_idx, &end, 10); + if ( *idx >= 0 && *end==0 ) return 0; - - // TAG[0,1] or TAG[0-2] or [1-] etc + // TAG[0,1] or TAG[0-2] or [1-] etc; idx=-2, idxs[...]=0,0,1,1,.. int i, ibeg = -1; - while ( *beg && *beg!=']' ) + while ( *beg ) { - int idx = strtol(beg, &end, 10); + int num = strtol(beg, &end, 10); if ( end[0]==',' ) beg = end + 1; - else if ( end[0]==']' ) beg = end; - else if ( end[0]=='-' ) { beg = end + 1; ibeg = idx; continue; } - else error("Could not parse the index: %s[%s\n", tag, tag_idx+1); - if ( idx >= tok->nidxs ) + else if ( end[0]==0 ) beg = end; + else if ( end[0]=='-' ) { beg = end + 1; ibeg = num; continue; } + else return -1; + if ( num >= *nidxs ) { - tok->idxs = (int*) realloc(tok->idxs, sizeof(int)*(idx+1)); - memset(tok->idxs + tok->nidxs, 0, sizeof(int)*(idx - tok->nidxs + 1)); - tok->nidxs = idx + 1; + *idxs = (int*) realloc(*idxs, sizeof(int)*(num+1)); + memset(*idxs + *nidxs, 0, sizeof(int)*(num - *nidxs + 1)); + *nidxs = num + 1; } if ( ibeg>=0 ) { - for (i=ibeg; i<=idx; i++) tok->idxs[i] = 1; + for (i=ibeg; i<=num; i++) (*idxs)[i] = 1; ibeg = -1; } - tok->idxs[idx] = 1; + (*idxs)[num] = 1; } if ( ibeg >=0 ) { - if ( ibeg >= tok->nidxs ) + if ( ibeg >= *nidxs ) { - tok->idxs = (int*) realloc(tok->idxs, sizeof(int)*(ibeg+1)); - memset(tok->idxs + tok->nidxs, 0, sizeof(int)*(ibeg - tok->nidxs + 1)); - tok->nidxs = ibeg + 1; + *idxs = (int*) realloc(*idxs, sizeof(int)*(ibeg+1)); + memset(*idxs + *nidxs, 0, sizeof(int)*(ibeg - *nidxs + 1)); + *nidxs = ibeg + 1; + } + (*idxs)[ibeg] = -1; + } + *idx = -2; + return 0; +} + +static void parse_tag_idx(bcf_hdr_t *hdr, int is_fmt, char *tag, char *tag_idx, token_t *tok) // tag_idx points just after "TAG[" +{ + int i, len = strlen(tag_idx); + if ( tag_idx[len-1] == ']' ) tag_idx[len-1] = 0; + char *ori = strdup(tag_idx); + + assert( !tok->idxs && !tok->usmpl ); + int *idxs1 = NULL, nidxs1 = 0, idx1 = 0; + int *idxs2 = NULL, nidxs2 = 0, idx2 = 0; + + int set_samples = 0; + char *colon = index(tag_idx, ':'); + if ( colon ) + { + *colon = 0; + if ( parse_idxs(tag_idx, &idxs1, &nidxs1, &idx1) != 0 ) error("Could not parse the index: %s\n", ori); + if ( parse_idxs(colon+1, &idxs2, &nidxs2, &idx2) != 0 ) error("Could not parse the index: %s\n", ori); + tok->idxs = idxs2; + tok->nidxs = nidxs2; + tok->idx = idx2; + set_samples = 1; + } + else + { + if ( parse_idxs(tag_idx, &idxs1, &nidxs1, &idx1) != 0 ) error("Could not parse the index: %s\n", ori); + if ( is_fmt ) + { + if ( nidxs1==1 && idxs1[0]==-1 ) + { + tok->idxs = (int*) malloc(sizeof(int)); + tok->idxs[0] = -1; + tok->nidxs = 1; + tok->idx = -2; + } + else if ( bcf_hdr_id2number(hdr,BCF_HL_FMT,tok->hdr_id)!=1 ) + error("The FORMAT tag %s can have multiple subfields, run as %s[sample:subfield]\n", tag,tag); + else + tok->idx = 0; + set_samples = 1; + } + else + { + tok->idxs = idxs1; + tok->nidxs = nidxs1; + tok->idx = idx1; + } + } + + if ( set_samples ) + { + tok->nsamples = bcf_hdr_nsamples(hdr); + tok->usmpl = (uint8_t*) calloc(tok->nsamples,1); + if ( idx1>=0 ) + { + if ( idx1 >= bcf_hdr_nsamples(hdr) ) error("The sample index is too large: %s\n", ori); + tok->usmpl[idx1] = 1; + } + else if ( idx1==-2 ) + { + for (i=0; i= bcf_hdr_nsamples(hdr) ) error("The sample index is too large: %s\n", ori); + tok->usmpl[i] = 1; + } + if ( nidxs1 && idxs1[nidxs1-1]==-1 ) // open range, such as "7-" + { + for (; insamples; i++) tok->usmpl[i] = 1; + } } - tok->idxs[ibeg] = -1; + else error("todo: %s:%d .. %d\n", __FILE__,__LINE__, idx2); + free(idxs1); + } + free(ori); + + if ( tok->nidxs && tok->idxs[tok->nidxs-1]!=-1 ) + { + for (i=0; inidxs; i++) if ( tok->idxs[i] ) tok->nuidxs++; } - tok->idx = -2; } static int filters_init1(filter_t *filter, char *str, int len, token_t *tok) { tok->tok_type = TOK_VAL; tok->hdr_id = -1; tok->pass_site = -1; - tok->idx = -1; + tok->idx = 0; // is this a string constant? if ( str[0]=='"' || str[0]=='\'' ) @@ -1579,6 +1784,10 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok) tok->setter = &filters_set_alt_string; tok->is_str = 1; tok->tag = strdup("ALT"); + tok->idxs = (int*) malloc(sizeof(int)); + tok->idxs[0] = -1; + tok->nidxs = 1; + tok->idx = -2; return 0; } else if ( !strncasecmp(str,"N_ALT",len) ) @@ -1616,8 +1825,6 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok) int i; for (i=0; ihdr_id = bcf_hdr_id2int(filter->hdr,BCF_DT_ID,tmp.s); if ( is_fmt==-1 ) @@ -1629,6 +1836,16 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok) } if ( is_fmt==-1 ) is_fmt = 0; } + if ( is_array ) + parse_tag_idx(filter->hdr, is_fmt, tmp.s, tmp.s+is_array, tok); + else if ( is_fmt && !tok->nsamples ) + { + int i; + tok->nsamples = bcf_hdr_nsamples(filter->hdr); + tok->usmpl = (uint8_t*) malloc(tok->nsamples); + for (i=0; insamples; i++) tok->usmpl[i] = 1; + } + tok->type = is_fmt ? BCF_HL_FMT : BCF_HL_INFO; if ( is_fmt ) filter->max_unpack |= BCF_UN_FMT; if ( tok->hdr_id>=0 ) @@ -1642,7 +1859,12 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok) if ( !bcf_hdr_idinfo_exists(filter->hdr,BCF_HL_FMT,tok->hdr_id) ) error("No such FORMAT field: %s\n", tmp.s); if ( bcf_hdr_id2number(filter->hdr,BCF_HL_FMT,tok->hdr_id)!=1 && !is_array ) - error("Error: FORMAT vectors must be subscripted, e.g. %s[0] or %s[*]\n", tmp.s, tmp.s); + { + tok->idxs = (int*) malloc(sizeof(int)); + tok->idxs[0] = -1; + tok->nidxs = 1; + tok->idx = -2; + } switch ( bcf_hdr_id2type(filter->hdr,BCF_HL_FMT,tok->hdr_id) ) { case BCF_HT_INT: tok->setter = &filters_set_format_int; break; @@ -1740,7 +1962,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok) { errno = 0; tok->threshold = strtof(tmp.s, &end); // float? - if ( errno!=0 || end!=tmp.s+len ) error("[%s:%d %s] Error: the tag \"INFO/%s\" is not defined in the VCF header\n", __FILE__,__LINE__,__FUNCTION__,tmp.s); + if ( errno!=0 || end!=tmp.s+len ) error("[%s:%d %s] Error: the tag \"%s\" is not defined in the VCF header\n", __FILE__,__LINE__,__FUNCTION__,tmp.s); } if ( tmp.s ) free(tmp.s); @@ -1876,16 +2098,26 @@ filter_t *filter_init(bcf_hdr_t *hdr, const char *str) int i; for (i=0; i0 && out[j].is_str && out[j].key && !strcmp(".",out[j].key) ) + if ( out[j].is_str && out[j].key && !strcmp(".",out[j].key) ) { - int type = bcf_hdr_id2type(filter->hdr,out[k].type,out[k].hdr_id); - if ( type==BCF_HT_INT ) { out[j].is_str = 0; out[j].is_missing = 1; bcf_double_set_missing(out[j].values[0]); } - if ( type==BCF_HT_REAL ) { out[j].is_str = 0; out[j].is_missing = 1; bcf_double_set_missing(out[j].values[0]); } + int set_missing = 0; + if ( out[k].hdr_id>0 ) + { + int type = bcf_hdr_id2type(filter->hdr,out[k].type,out[k].hdr_id); + if ( type==BCF_HT_INT ) set_missing = 1; + else if ( type==BCF_HT_REAL ) set_missing = 1; + } + else if ( !strcmp("QUAL",out[k].tag) ) set_missing = 1; + if ( set_missing ) { out[j].is_str = 0; out[j].is_missing = 1; bcf_double_set_missing(out[j].values[0]); } } } if ( out[i].tok_type==TOK_LIKE || out[i].tok_type==TOK_NLIKE ) @@ -1937,10 +2169,15 @@ filter_t *filter_init(bcf_hdr_t *hdr, const char *str) else if ( out[i+2].tok_type==TOK_LIKE || out[i+2].tok_type==TOK_NLIKE ) ival = i + 1; else error("[%s:%d %s] Could not parse the expression: %s\n", __FILE__,__LINE__,__FUNCTION__, filter->str); + if ( !out[ival].key ) error("Comparison between samples is not supported, sorry!\n"); + // assign correct setters and unify expressions, eg ar->ra, HOM->hom, etc if ( !strcasecmp(out[ival].key,"hom") ) { out[i].setter = filters_set_genotype3; str_to_lower(out[ival].key); } else if ( !strcasecmp(out[ival].key,"het") ) { out[i].setter = filters_set_genotype3; str_to_lower(out[ival].key); } else if ( !strcasecmp(out[ival].key,"hap") ) { out[i].setter = filters_set_genotype3; str_to_lower(out[ival].key); } + else if ( !strcasecmp(out[ival].key,"mis") ) { out[i].setter = filters_set_genotype4; str_to_lower(out[ival].key); } + else if ( !strcasecmp(out[ival].key,"ref") ) { out[i].setter = filters_set_genotype4; str_to_lower(out[ival].key); } + else if ( !strcasecmp(out[ival].key,"alt") ) { out[i].setter = filters_set_genotype4; str_to_lower(out[ival].key); } else if ( !strcasecmp(out[ival].key,"rr") ) { out[i].setter = filters_set_genotype2; str_to_lower(out[ival].key); } else if ( !strcasecmp(out[ival].key,"ra") || !strcasecmp(out[ival].key,"ar") ) { out[i].setter = filters_set_genotype2; out[ival].key[0]='r'; out[ival].key[1]='a'; } // ra else if ( !strcmp(out[ival].key,"aA") || !strcmp(out[ival].key,"Aa") ) { out[i].setter = filters_set_genotype2; out[ival].key[0]='a'; out[ival].key[1]='A'; } // aA @@ -1978,18 +2215,19 @@ filter_t *filter_init(bcf_hdr_t *hdr, const char *str) filter->nsamples = filter->max_unpack&BCF_UN_FMT ? bcf_hdr_nsamples(filter->hdr) : 0; for (i=0; insamples ) { out[i].pass_samples = (uint8_t*)malloc(filter->nsamples); int j; - for (j=0; jnsamples; j++) out[i].pass_samples[j] = 1; + for (j=0; jnsamples; j++) out[i].pass_samples[j] = 0; } } @@ -2011,6 +2249,7 @@ void filter_destroy(filter_t *filter) free(filter->filters[i].str_value.s); free(filter->filters[i].tag); free(filter->filters[i].idxs); + free(filter->filters[i].usmpl); free(filter->filters[i].values); free(filter->filters[i].pass_samples); if (filter->filters[i].hash) khash_str2int_destroy_free(filter->filters[i].hash); @@ -2036,9 +2275,7 @@ int filter_test(filter_t *filter, bcf1_t *line, const uint8_t **samples) int i, nstack = 0; for (i=0; infilters; i++) { - filter->filters[i].nsamples = 0; - filter->filters[i].nvalues = 0; - filter->filters[i].pass_site = -1; + filter->filters[i].pass_site = 0; if ( filter->filters[i].tok_type == TOK_VAL ) { @@ -2058,9 +2295,11 @@ int filter_test(filter_t *filter, bcf1_t *line, const uint8_t **samples) filter->flt_stack[nstack++] = &filter->filters[i]; continue; } - else if ( filter->filters[i].tok_type == TOK_FUNC ) // all functions take only one argument + else if ( filter->filters[i].func ) { - filter->filters[i].setter(filter, line, filter->flt_stack[nstack-1]); + int nargs = filter->filters[i].func(filter, line, &filter->filters[i], filter->flt_stack, nstack); + filter->flt_stack[nstack-nargs] = &filter->filters[i]; + if ( --nargs > 0 ) nstack -= nargs; continue; } if ( nstack<2 ) @@ -2068,111 +2307,76 @@ int filter_test(filter_t *filter, bcf1_t *line, const uint8_t **samples) int is_str = filter->flt_stack[nstack-1]->is_str + filter->flt_stack[nstack-2]->is_str; - if ( filter->filters[i].tok_type == TOK_OR || filter->filters[i].tok_type == TOK_OR_VEC ) - { - if ( filter->flt_stack[nstack-1]->pass_site<0 || filter->flt_stack[nstack-2]->pass_site<0 ) - error("Error occurred while processing the filter \"%s\" (%d %d OR)\n", filter->str,filter->flt_stack[nstack-2]->pass_site,filter->flt_stack[nstack-1]->pass_site); - filter->flt_stack[nstack-2]->pass_site = vector_logic_or(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1], filter->filters[i].tok_type); - nstack--; - continue; - } - if ( filter->filters[i].tok_type == TOK_AND || filter->filters[i].tok_type == TOK_AND_VEC ) - { - if ( filter->flt_stack[nstack-1]->pass_site<0 || filter->flt_stack[nstack-2]->pass_site<0 ) - error("Error occurred while processing the filter \"%s\" (%d %d AND)\n", filter->str,filter->flt_stack[nstack-2]->pass_site,filter->flt_stack[nstack-1]->pass_site); - filter->flt_stack[nstack-2]->pass_site = vector_logic_and(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1], filter->filters[i].tok_type); - nstack--; - continue; - } - if ( filter->filters[i].tok_type == TOK_ADD ) { - VECTOR_ARITHMETICS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],+); + VECTOR_ARITHMETICS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],+); + filter->flt_stack[nstack-2] = &filter->filters[i]; nstack--; continue; } else if ( filter->filters[i].tok_type == TOK_SUB ) { - VECTOR_ARITHMETICS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],-); + VECTOR_ARITHMETICS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],-); + filter->flt_stack[nstack-2] = &filter->filters[i]; nstack--; continue; } else if ( filter->filters[i].tok_type == TOK_MULT ) { - VECTOR_ARITHMETICS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],*); + VECTOR_ARITHMETICS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],*); + filter->flt_stack[nstack-2] = &filter->filters[i]; nstack--; continue; } else if ( filter->filters[i].tok_type == TOK_DIV ) { - VECTOR_ARITHMETICS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],/); + VECTOR_ARITHMETICS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],/); + filter->flt_stack[nstack-2] = &filter->filters[i]; nstack--; continue; } - int is_true = 0; + // ideally, these comparators would become func, but this would require more work in init1() if ( filter->filters[i].comparator ) - is_true = filter->filters[i].comparator(filter->flt_stack[nstack-1],filter->flt_stack[nstack-2],filter->filters[i].tok_type,line); - else if ( !filter->flt_stack[nstack-1]->nvalues || !filter->flt_stack[nstack-2]->nvalues ) { - int skip = 0; - if ( !filter->flt_stack[nstack-2]->is_missing && !filter->flt_stack[nstack-1]->is_missing ) skip = 1; - if ( filter->filters[i].tok_type != TOK_EQ && filter->filters[i].tok_type != TOK_NE ) skip = 1; - - if ( skip ) - filter->flt_stack[nstack-2]->nvalues = filter->flt_stack[nstack-2]->nsamples = 0; - else if ( filter->filters[i].tok_type == TOK_EQ ) - CMP_MISSING(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],==,is_true) - else if ( filter->filters[i].tok_type == TOK_NE ) - CMP_MISSING(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],!=,is_true) - } - else if ( filter->filters[i].tok_type == TOK_EQ ) - { - if ( filter->flt_stack[nstack-1]->comparator ) - is_true = filter->flt_stack[nstack-1]->comparator(filter->flt_stack[nstack-1],filter->flt_stack[nstack-2],TOK_EQ,line); - else if ( filter->flt_stack[nstack-2]->comparator ) - is_true = filter->flt_stack[nstack-2]->comparator(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],TOK_EQ,line); - else if ( is_str==2 ) // both are strings - is_true = cmp_vector_strings(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],TOK_EQ); - else if ( is_str==1 ) - error("Comparing string to numeric value: %s\n", filter->str); - else - CMP_VECTORS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],==,is_true); - } - else if ( filter->filters[i].tok_type == TOK_NE ) - { - if ( filter->flt_stack[nstack-1]->comparator ) - is_true = filter->flt_stack[nstack-1]->comparator(filter->flt_stack[nstack-1],filter->flt_stack[nstack-2],TOK_NE,line); - else if ( filter->flt_stack[nstack-2]->comparator ) - is_true = filter->flt_stack[nstack-2]->comparator(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],TOK_NE,line); - else if ( is_str==2 ) - is_true = cmp_vector_strings(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],TOK_NE); - else if ( is_str==1 ) - error("Comparing string to numeric value: %s\n", filter->str); - else - CMP_VECTORS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],!=,is_true); + filter->filters[i].comparator(filter->flt_stack[nstack-1],filter->flt_stack[nstack-2],&filter->filters[i],line); } - else if ( filter->filters[i].tok_type == TOK_LIKE || filter->filters[i].tok_type == TOK_NLIKE ) + else if ( filter->flt_stack[nstack-1]->comparator ) { - if ( is_str==2 ) - is_true = regex_vector_strings(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1], filter->filters[i].tok_type == TOK_LIKE ? 0 : 1); - else - error("The regex operator can be used on strings only: %s\n", filter->str); - } - else if ( is_str>0 ) - error("Wrong operator in string comparison: %s [%s,%s]\n", filter->str, filter->flt_stack[nstack-1]->str_value, filter->flt_stack[nstack-2]->str_value); - else if ( filter->filters[i].tok_type == TOK_LE ) - CMP_VECTORS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],<=,is_true) - else if ( filter->filters[i].tok_type == TOK_LT ) - CMP_VECTORS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],<,is_true) - else if ( filter->filters[i].tok_type == TOK_BT ) - CMP_VECTORS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],>,is_true) - else if ( filter->filters[i].tok_type == TOK_BE ) - CMP_VECTORS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],>=,is_true) + filter->flt_stack[nstack-1]->comparator(filter->flt_stack[nstack-1],filter->flt_stack[nstack-2],&filter->filters[i],line); + } + else if ( filter->flt_stack[nstack-2]->comparator ) + { + filter->flt_stack[nstack-2]->comparator(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],line); + } + else if ( is_str==2 ) + { + cmp_vector_strings(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i]); + } else - error("FIXME: did not expect this .. tok_type %d = %d\n", i, filter->filters[i].tok_type); + { + // Determine what to do with one [1] or both [2] sides missing. The first field [0] gives [1]|[2] + int missing_logic[] = {0,0,0}; + if ( filter->filters[i].tok_type == TOK_EQ ) { missing_logic[0] = missing_logic[2] = 1; } + if ( filter->filters[i].tok_type == TOK_NE ) { missing_logic[0] = missing_logic[1] = 1; } - filter->flt_stack[nstack-2]->pass_site = is_true; + if ( filter->filters[i].tok_type == TOK_EQ ) + CMP_VECTORS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],==,missing_logic) + else if ( filter->filters[i].tok_type == TOK_NE ) + CMP_VECTORS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],!=,missing_logic) + else if ( filter->filters[i].tok_type == TOK_LE ) + CMP_VECTORS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],<=,missing_logic) + else if ( filter->filters[i].tok_type == TOK_LT ) + CMP_VECTORS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],<,missing_logic) + else if ( filter->filters[i].tok_type == TOK_BT ) + CMP_VECTORS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],>,missing_logic) + else if ( filter->filters[i].tok_type == TOK_BE ) + CMP_VECTORS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],>=,missing_logic) + else + error("todo: %s:%d .. type=%d\n", __FILE__,__LINE__,filter->filters[i].tok_type); + + } + filter->flt_stack[nstack-2] = &filter->filters[i]; nstack--; } if ( nstack>1 ) error("Error occurred while processing the filter \"%s\" (2:%d)\n", filter->str,nstack); // too few values left on the stack diff --git a/bcftools/main.c.pysam.c b/bcftools/main.c.pysam.c index 019adc0..d6b51a0 100644 --- a/bcftools/main.c.pysam.c +++ b/bcftools/main.c.pysam.c @@ -49,7 +49,7 @@ int main_vcfcall(int argc, char *argv[]); int main_vcfannotate(int argc, char *argv[]); int main_vcfroh(int argc, char *argv[]); int main_vcfconcat(int argc, char *argv[]); -int main_bcftools_reheader(int argc, char *argv[]); +int main_reheader(int argc, char *argv[]); int main_vcfconvert(int argc, char *argv[]); int main_vcfcnv(int argc, char *argv[]); #if USE_GPL @@ -125,7 +125,7 @@ static cmd_t cmds[] = .alias = "query", .help = "transform VCF/BCF into user-defined formats" }, - { .func = main_bcftools_reheader, + { .func = main_reheader, .alias = "reheader", .help = "modify VCF/BCF header, change sample names" }, diff --git a/bcftools/regidx.c b/bcftools/regidx.c index 84646a8..9b2c66d 100644 --- a/bcftools/regidx.c +++ b/bcftools/regidx.c @@ -1,5 +1,5 @@ /* - Copyright (C) 2014-2016 Genome Research Ltd. + Copyright (C) 2014-2017 Genome Research Ltd. Author: Petr Danecek @@ -189,6 +189,35 @@ int regidx_insert(regidx_t *idx, char *line) return 0; } +regidx_t *regidx_init_string(const char *str, regidx_parse_f parser, regidx_free_f free_f, size_t payload_size, void *usr_dat) +{ + regidx_t *idx = (regidx_t*) calloc(1,sizeof(regidx_t)); + if ( !idx ) return NULL; + + idx->free = free_f; + idx->parse = parser ? parser : regidx_parse_tab; + idx->usr = usr_dat; + idx->seq2regs = khash_str2int_init(); + idx->payload_size = payload_size; + if ( payload_size ) idx->payload = malloc(payload_size); + + kstring_t tmp = {0,0,0}; + const char *ss = str; + while ( *ss ) + { + while ( *ss && isspace(*ss) ) ss++; + const char *se = ss; + while ( *se && *se!='\r' && *se!='\n' ) se++; + tmp.l = 0; + kputsn(ss, se-ss, &tmp); + regidx_insert(idx,tmp.s); + while ( *se && isspace(*se) ) se++; + ss = se; + } + free(tmp.s); + return idx; +} + regidx_t *regidx_init(const char *fname, regidx_parse_f parser, regidx_free_f free_f, size_t payload_size, void *usr_dat) { if ( !parser ) @@ -595,4 +624,13 @@ int regitr_loop(regitr_t *regitr) } +void regitr_copy(regitr_t *dst, regitr_t *src) +{ + _itr_t *dst_itr = (_itr_t*) dst->itr; + _itr_t *src_itr = (_itr_t*) src->itr; + *dst_itr = *src_itr; + *dst = *src; + dst->itr = dst_itr; +} + diff --git a/bcftools/regidx.c.pysam.c b/bcftools/regidx.c.pysam.c index 62ef61f..1082e4a 100644 --- a/bcftools/regidx.c.pysam.c +++ b/bcftools/regidx.c.pysam.c @@ -1,7 +1,7 @@ #include "bcftools.pysam.h" /* - Copyright (C) 2014-2016 Genome Research Ltd. + Copyright (C) 2014-2017 Genome Research Ltd. Author: Petr Danecek @@ -191,6 +191,35 @@ int regidx_insert(regidx_t *idx, char *line) return 0; } +regidx_t *regidx_init_string(const char *str, regidx_parse_f parser, regidx_free_f free_f, size_t payload_size, void *usr_dat) +{ + regidx_t *idx = (regidx_t*) calloc(1,sizeof(regidx_t)); + if ( !idx ) return NULL; + + idx->free = free_f; + idx->parse = parser ? parser : regidx_parse_tab; + idx->usr = usr_dat; + idx->seq2regs = khash_str2int_init(); + idx->payload_size = payload_size; + if ( payload_size ) idx->payload = malloc(payload_size); + + kstring_t tmp = {0,0,0}; + const char *ss = str; + while ( *ss ) + { + while ( *ss && isspace(*ss) ) ss++; + const char *se = ss; + while ( *se && *se!='\r' && *se!='\n' ) se++; + tmp.l = 0; + kputsn(ss, se-ss, &tmp); + regidx_insert(idx,tmp.s); + while ( *se && isspace(*se) ) se++; + ss = se; + } + free(tmp.s); + return idx; +} + regidx_t *regidx_init(const char *fname, regidx_parse_f parser, regidx_free_f free_f, size_t payload_size, void *usr_dat) { if ( !parser ) @@ -597,4 +626,13 @@ int regitr_loop(regitr_t *regitr) } +void regitr_copy(regitr_t *dst, regitr_t *src) +{ + _itr_t *dst_itr = (_itr_t*) dst->itr; + _itr_t *src_itr = (_itr_t*) src->itr; + *dst_itr = *src_itr; + *dst = *src; + dst->itr = dst_itr; +} + diff --git a/bcftools/regidx.h b/bcftools/regidx.h index 8e25fe1..fe0a897 100644 --- a/bcftools/regidx.h +++ b/bcftools/regidx.h @@ -109,6 +109,8 @@ int regidx_parse_reg(const char*,char**,char**,uint32_t*,uint32_t*,void*,void*); /* * regidx_init() - creates new index + * regidx_init_string() - creates new index, from a string rather than from a file + * * @param fname: input file name or NULL if regions will be added one-by-one via regidx_insert() * @param parsef: regidx_parse_bed, regidx_parse_tab or see description of regidx_parse_f. If NULL, * the format will be autodected, currently either regidx_parse_tab (the default) or @@ -121,6 +123,7 @@ int regidx_parse_reg(const char*,char**,char**,uint32_t*,uint32_t*,void*,void*); * Returns index on success or NULL on error. */ regidx_t *regidx_init(const char *fname, regidx_parse_f parsef, regidx_free_f freef, size_t payload_size, void *usr); +regidx_t *regidx_init_string(const char *string, regidx_parse_f parsef, regidx_free_f freef, size_t payload_size, void *usr); /* * regidx_destroy() - free memory allocated by regidx_init @@ -184,6 +187,11 @@ int regitr_overlap(regitr_t *itr); */ int regitr_loop(regitr_t *itr); +/* + * regitr_copy() - create a copy of an iterator for a repeated iteration with regitr_loop + */ +void regitr_copy(regitr_t *dst, regitr_t *src); + #ifdef __cplusplus } #endif diff --git a/bcftools/reheader.c b/bcftools/reheader.c index 30a441c..2776069 100644 --- a/bcftools/reheader.c +++ b/bcftools/reheader.c @@ -1,6 +1,6 @@ /* reheader.c -- reheader subcommand. - Copyright (C) 2014,2016 Genome Research Ltd. + Copyright (C) 2014-2017 Genome Research Ltd. Author: Petr Danecek @@ -467,7 +467,7 @@ static void usage(args_t *args) exit(1); } -int main_bcftools_reheader(int argc, char *argv[]) +int main_reheader(int argc, char *argv[]) { int c; args_t *args = (args_t*) calloc(1,sizeof(args_t)); diff --git a/bcftools/reheader.c.pysam.c b/bcftools/reheader.c.pysam.c index 803b483..11c0870 100644 --- a/bcftools/reheader.c.pysam.c +++ b/bcftools/reheader.c.pysam.c @@ -2,7 +2,7 @@ /* reheader.c -- reheader subcommand. - Copyright (C) 2014,2016 Genome Research Ltd. + Copyright (C) 2014-2017 Genome Research Ltd. Author: Petr Danecek @@ -469,7 +469,7 @@ static void usage(args_t *args) exit(1); } -int main_bcftools_reheader(int argc, char *argv[]) +int main_reheader(int argc, char *argv[]) { int c; args_t *args = (args_t*) calloc(1,sizeof(args_t)); diff --git a/bcftools/vcfannotate.c b/bcftools/vcfannotate.c index e6efda9..abea98f 100644 --- a/bcftools/vcfannotate.c +++ b/bcftools/vcfannotate.c @@ -1,6 +1,6 @@ /* vcfannotate.c -- Annotate and edit VCF/BCF files. - Copyright (C) 2013-2016 Genome Research Ltd. + Copyright (C) 2013-2018 Genome Research Ltd. Author: Petr Danecek @@ -95,6 +95,7 @@ typedef struct _args_t filter_t *filter; char *filter_str; int filter_logic; // include or exclude sites which match the filters? One of FLT_INCLUDE/FLT_EXCLUDE + int keep_sites; rm_tag_t *rm; // tags scheduled for removal int nrm; @@ -263,7 +264,7 @@ static void init_remove_annots(args_t *args) tag->key = strdup(str.s); tag->hdr_id = bcf_hdr_id2int(args->hdr, BCF_DT_ID, tag->key); if ( !bcf_hdr_idinfo_exists(args->hdr,BCF_HL_FLT,tag->hdr_id) ) error("Cannot remove %s, not defined in the header.\n", str.s); - bcf_hdr_remove(args->hdr_out,BCF_HL_FLT,tag->key); + if ( !args->keep_sites ) bcf_hdr_remove(args->hdr_out,BCF_HL_FLT,tag->key); } else { @@ -293,32 +294,35 @@ static void init_remove_annots(args_t *args) tag->key = strdup(str.s); if ( type==BCF_HL_INFO ) tag->handler = remove_info_tag; else if ( type==BCF_HL_FMT ) tag->handler = remove_format_tag; - bcf_hdr_remove(args->hdr_out,type,tag->key); + if ( !args->keep_sites ) bcf_hdr_remove(args->hdr_out,type,tag->key); } } else if ( !strcasecmp("ID",str.s) ) tag->handler = remove_id; else if ( !strcasecmp("FILTER",str.s) ) { tag->handler = remove_filter; - remove_hdr_lines(args->hdr_out,BCF_HL_FLT); + if ( !args->keep_sites ) remove_hdr_lines(args->hdr_out,BCF_HL_FLT); } else if ( !strcasecmp("QUAL",str.s) ) tag->handler = remove_qual; else if ( !strcasecmp("INFO",str.s) ) { tag->handler = remove_info; - remove_hdr_lines(args->hdr_out,BCF_HL_INFO); + if ( !args->keep_sites ) remove_hdr_lines(args->hdr_out,BCF_HL_INFO); } else if ( !strcasecmp("FMT",str.s) || !strcasecmp("FORMAT",str.s) ) { tag->handler = remove_format; - remove_hdr_lines(args->hdr_out,BCF_HL_FMT); + if ( !args->keep_sites ) remove_hdr_lines(args->hdr_out,BCF_HL_FMT); } else if ( str.l ) { - if ( str.s[0]=='#' && str.s[1]=='#' ) - bcf_hdr_remove(args->hdr_out,BCF_HL_GEN,str.s+2); - else - bcf_hdr_remove(args->hdr_out,BCF_HL_STR,str.s); + if ( !args->keep_sites ) + { + if ( str.s[0]=='#' && str.s[1]=='#' ) + bcf_hdr_remove(args->hdr_out,BCF_HL_GEN,str.s+2); + else + bcf_hdr_remove(args->hdr_out,BCF_HL_STR,str.s); + } args->nrm--; } @@ -354,7 +358,7 @@ static void init_remove_annots(args_t *args) tag->hdr_id = bcf_hdr_id2int(args->hdr, BCF_DT_ID, hrec->vals[k]); } tag->key = strdup(hrec->vals[k]); - bcf_hdr_remove(args->hdr_out,hrec->type,tag->key); + if ( !args->keep_sites ) bcf_hdr_remove(args->hdr_out,hrec->type,tag->key); } } khash_str2int_destroy_free(keep); @@ -1870,6 +1874,7 @@ static void usage(args_t *args) fprintf(stderr, " -h, --header-lines lines which should be appended to the VCF header\n"); fprintf(stderr, " -I, --set-id [+] set ID column, see man page for details\n"); fprintf(stderr, " -i, --include select sites for which the expression is true (see man page for details)\n"); + fprintf(stderr, " -k, --keep-sites leave -i/-e sites unchanged instead of discarding them\n"); fprintf(stderr, " -m, --mark-sites [+-] add INFO/tag flag to sites which are (\"+\") or are not (\"-\") listed in the -a file\n"); fprintf(stderr, " --no-version do not append version and command line to the header\n"); fprintf(stderr, " -o, --output write output to a file [standard output]\n"); @@ -1879,7 +1884,7 @@ static void usage(args_t *args) fprintf(stderr, " --rename-chrs rename sequences according to map file: from\\tto\n"); fprintf(stderr, " -s, --samples [^] comma separated list of samples to annotate (or exclude with \"^\" prefix)\n"); fprintf(stderr, " -S, --samples-file [^] file of samples to annotate (or exclude with \"^\" prefix)\n"); - fprintf(stderr, " -x, --remove list of annotations to remove (e.g. ID,INFO/DP,FORMAT/DP,FILTER). See man page for details\n"); + fprintf(stderr, " -x, --remove list of annotations (e.g. ID,INFO/DP,FORMAT/DP,FILTER) to remove (or keep with \"^\" prefix). See man page for details\n"); fprintf(stderr, " --threads number of extra output compression threads [0]\n"); fprintf(stderr, "\n"); exit(1); @@ -1901,6 +1906,7 @@ int main_vcfannotate(int argc, char *argv[]) static struct option loptions[] = { + {"keep-sites",required_argument,NULL,'k'}, {"mark-sites",required_argument,NULL,'m'}, {"set-id",required_argument,NULL,'I'}, {"output",required_argument,NULL,'o'}, @@ -1921,9 +1927,10 @@ int main_vcfannotate(int argc, char *argv[]) {"no-version",no_argument,NULL,8}, {NULL,0,NULL,0} }; - while ((c = getopt_long(argc, argv, "h:?o:O:r:R:a:x:c:i:e:S:s:I:m:",loptions,NULL)) >= 0) + while ((c = getopt_long(argc, argv, "h:?o:O:r:R:a:x:c:i:e:S:s:I:m:k",loptions,NULL)) >= 0) { switch (c) { + case 'k': args->keep_sites = 1; break; case 'm': args->mark_sites_logic = MARK_LISTED; if ( optarg[0]=='+' ) args->mark_sites = optarg+1; @@ -2008,7 +2015,11 @@ int main_vcfannotate(int argc, char *argv[]) { int pass = filter_test(args->filter, line, NULL); if ( args->filter_logic & FLT_EXCLUDE ) pass = pass ? 0 : 1; - if ( !pass ) continue; + if ( !pass ) + { + if ( args->keep_sites ) bcf_write1(args->out_fh, args->hdr_out, line); + continue; + } } annotate(args, line); bcf_write1(args->out_fh, args->hdr_out, line); diff --git a/bcftools/vcfannotate.c.pysam.c b/bcftools/vcfannotate.c.pysam.c index 72824f7..87a6cc4 100644 --- a/bcftools/vcfannotate.c.pysam.c +++ b/bcftools/vcfannotate.c.pysam.c @@ -2,7 +2,7 @@ /* vcfannotate.c -- Annotate and edit VCF/BCF files. - Copyright (C) 2013-2016 Genome Research Ltd. + Copyright (C) 2013-2018 Genome Research Ltd. Author: Petr Danecek @@ -97,6 +97,7 @@ typedef struct _args_t filter_t *filter; char *filter_str; int filter_logic; // include or exclude sites which match the filters? One of FLT_INCLUDE/FLT_EXCLUDE + int keep_sites; rm_tag_t *rm; // tags scheduled for removal int nrm; @@ -265,7 +266,7 @@ static void init_remove_annots(args_t *args) tag->key = strdup(str.s); tag->hdr_id = bcf_hdr_id2int(args->hdr, BCF_DT_ID, tag->key); if ( !bcf_hdr_idinfo_exists(args->hdr,BCF_HL_FLT,tag->hdr_id) ) error("Cannot remove %s, not defined in the header.\n", str.s); - bcf_hdr_remove(args->hdr_out,BCF_HL_FLT,tag->key); + if ( !args->keep_sites ) bcf_hdr_remove(args->hdr_out,BCF_HL_FLT,tag->key); } else { @@ -295,32 +296,35 @@ static void init_remove_annots(args_t *args) tag->key = strdup(str.s); if ( type==BCF_HL_INFO ) tag->handler = remove_info_tag; else if ( type==BCF_HL_FMT ) tag->handler = remove_format_tag; - bcf_hdr_remove(args->hdr_out,type,tag->key); + if ( !args->keep_sites ) bcf_hdr_remove(args->hdr_out,type,tag->key); } } else if ( !strcasecmp("ID",str.s) ) tag->handler = remove_id; else if ( !strcasecmp("FILTER",str.s) ) { tag->handler = remove_filter; - remove_hdr_lines(args->hdr_out,BCF_HL_FLT); + if ( !args->keep_sites ) remove_hdr_lines(args->hdr_out,BCF_HL_FLT); } else if ( !strcasecmp("QUAL",str.s) ) tag->handler = remove_qual; else if ( !strcasecmp("INFO",str.s) ) { tag->handler = remove_info; - remove_hdr_lines(args->hdr_out,BCF_HL_INFO); + if ( !args->keep_sites ) remove_hdr_lines(args->hdr_out,BCF_HL_INFO); } else if ( !strcasecmp("FMT",str.s) || !strcasecmp("FORMAT",str.s) ) { tag->handler = remove_format; - remove_hdr_lines(args->hdr_out,BCF_HL_FMT); + if ( !args->keep_sites ) remove_hdr_lines(args->hdr_out,BCF_HL_FMT); } else if ( str.l ) { - if ( str.s[0]=='#' && str.s[1]=='#' ) - bcf_hdr_remove(args->hdr_out,BCF_HL_GEN,str.s+2); - else - bcf_hdr_remove(args->hdr_out,BCF_HL_STR,str.s); + if ( !args->keep_sites ) + { + if ( str.s[0]=='#' && str.s[1]=='#' ) + bcf_hdr_remove(args->hdr_out,BCF_HL_GEN,str.s+2); + else + bcf_hdr_remove(args->hdr_out,BCF_HL_STR,str.s); + } args->nrm--; } @@ -356,7 +360,7 @@ static void init_remove_annots(args_t *args) tag->hdr_id = bcf_hdr_id2int(args->hdr, BCF_DT_ID, hrec->vals[k]); } tag->key = strdup(hrec->vals[k]); - bcf_hdr_remove(args->hdr_out,hrec->type,tag->key); + if ( !args->keep_sites ) bcf_hdr_remove(args->hdr_out,hrec->type,tag->key); } } khash_str2int_destroy_free(keep); @@ -1872,6 +1876,7 @@ static void usage(args_t *args) fprintf(bcftools_stderr, " -h, --header-lines lines which should be appended to the VCF header\n"); fprintf(bcftools_stderr, " -I, --set-id [+] set ID column, see man page for details\n"); fprintf(bcftools_stderr, " -i, --include select sites for which the expression is true (see man page for details)\n"); + fprintf(bcftools_stderr, " -k, --keep-sites leave -i/-e sites unchanged instead of discarding them\n"); fprintf(bcftools_stderr, " -m, --mark-sites [+-] add INFO/tag flag to sites which are (\"+\") or are not (\"-\") listed in the -a file\n"); fprintf(bcftools_stderr, " --no-version do not append version and command line to the header\n"); fprintf(bcftools_stderr, " -o, --output write output to a file [standard output]\n"); @@ -1881,7 +1886,7 @@ static void usage(args_t *args) fprintf(bcftools_stderr, " --rename-chrs rename sequences according to map file: from\\tto\n"); fprintf(bcftools_stderr, " -s, --samples [^] comma separated list of samples to annotate (or exclude with \"^\" prefix)\n"); fprintf(bcftools_stderr, " -S, --samples-file [^] file of samples to annotate (or exclude with \"^\" prefix)\n"); - fprintf(bcftools_stderr, " -x, --remove list of annotations to remove (e.g. ID,INFO/DP,FORMAT/DP,FILTER). See man page for details\n"); + fprintf(bcftools_stderr, " -x, --remove list of annotations (e.g. ID,INFO/DP,FORMAT/DP,FILTER) to remove (or keep with \"^\" prefix). See man page for details\n"); fprintf(bcftools_stderr, " --threads number of extra output compression threads [0]\n"); fprintf(bcftools_stderr, "\n"); exit(1); @@ -1903,6 +1908,7 @@ int main_vcfannotate(int argc, char *argv[]) static struct option loptions[] = { + {"keep-sites",required_argument,NULL,'k'}, {"mark-sites",required_argument,NULL,'m'}, {"set-id",required_argument,NULL,'I'}, {"output",required_argument,NULL,'o'}, @@ -1923,9 +1929,10 @@ int main_vcfannotate(int argc, char *argv[]) {"no-version",no_argument,NULL,8}, {NULL,0,NULL,0} }; - while ((c = getopt_long(argc, argv, "h:?o:O:r:R:a:x:c:i:e:S:s:I:m:",loptions,NULL)) >= 0) + while ((c = getopt_long(argc, argv, "h:?o:O:r:R:a:x:c:i:e:S:s:I:m:k",loptions,NULL)) >= 0) { switch (c) { + case 'k': args->keep_sites = 1; break; case 'm': args->mark_sites_logic = MARK_LISTED; if ( optarg[0]=='+' ) args->mark_sites = optarg+1; @@ -2010,7 +2017,11 @@ int main_vcfannotate(int argc, char *argv[]) { int pass = filter_test(args->filter, line, NULL); if ( args->filter_logic & FLT_EXCLUDE ) pass = pass ? 0 : 1; - if ( !pass ) continue; + if ( !pass ) + { + if ( args->keep_sites ) bcf_write1(args->out_fh, args->hdr_out, line); + continue; + } } annotate(args, line); bcf_write1(args->out_fh, args->hdr_out, line); diff --git a/bcftools/vcfcnv.c b/bcftools/vcfcnv.c index 11c55bd..abfef54 100644 --- a/bcftools/vcfcnv.c +++ b/bcftools/vcfcnv.c @@ -1,6 +1,6 @@ /* The MIT License - Copyright (c) 2014-2015 Genome Research Ltd. + Copyright (c) 2014-2018 Genome Research Ltd. Author: Petr Danecek @@ -338,7 +338,7 @@ static void plot_sample(args_t *args, sample_t *smpl) "csv.register_dialect('tab', delimiter='\\t', quoting=csv.QUOTE_NONE)\n" "\n" "dat = {}\n" - "with open('%s', 'rb') as f:\n" + "with open('%s', 'r') as f:\n" " reader = csv.reader(f, 'tab')\n" " for row in reader:\n" " chr = row[0]\n" @@ -347,7 +347,7 @@ static void plot_sample(args_t *args, sample_t *smpl) " dat[chr].append([row[1], float(row[2]), float(row[3])])\n" "\n" "cnv = {}\n" - "with open('%s', 'rb') as f:\n" + "with open('%s', 'r') as f:\n" " reader = csv.reader(f, 'tab')\n" " for row in reader:\n" " chr = row[0]\n" @@ -429,7 +429,7 @@ static void create_plots(args_t *args) "\n" "def chroms_to_plot(th):\n" " dat = {}\n" - " with open('%s/summary.tab', 'rb') as f:\n" + " with open('%s/summary.tab', 'r') as f:\n" " reader = csv.reader(f, 'tab')\n" " for row in reader:\n" " if row[0]!='RG': continue\n" @@ -451,14 +451,14 @@ static void create_plots(args_t *args) " plot_chroms = chroms_to_plot(args.plot_threshold)\n" "\n" "def read_dat(file,dat,plot_chr):\n" - " with open(file, 'rb') as f:\n" + " with open(file, 'r') as f:\n" " reader = csv.reader(f, 'tab')\n" " for row in reader:\n" " chr = row[0]\n" " if chr != plot_chr: continue\n" " dat.append([row[1], float(row[2]), float(row[3])])\n" "def read_cnv(file,cnv,plot_chr):\n" - " with open(file, 'rb') as f:\n" + " with open(file, 'r') as f:\n" " reader = csv.reader(f, 'tab')\n" " for row in reader:\n" " chr = row[0]\n" diff --git a/bcftools/vcfcnv.c.pysam.c b/bcftools/vcfcnv.c.pysam.c index db4dffc..29b0cb5 100644 --- a/bcftools/vcfcnv.c.pysam.c +++ b/bcftools/vcfcnv.c.pysam.c @@ -2,7 +2,7 @@ /* The MIT License - Copyright (c) 2014-2015 Genome Research Ltd. + Copyright (c) 2014-2018 Genome Research Ltd. Author: Petr Danecek @@ -340,7 +340,7 @@ static void plot_sample(args_t *args, sample_t *smpl) "csv.register_dialect('tab', delimiter='\\t', quoting=csv.QUOTE_NONE)\n" "\n" "dat = {}\n" - "with open('%s', 'rb') as f:\n" + "with open('%s', 'r') as f:\n" " reader = csv.reader(f, 'tab')\n" " for row in reader:\n" " chr = row[0]\n" @@ -349,7 +349,7 @@ static void plot_sample(args_t *args, sample_t *smpl) " dat[chr].append([row[1], float(row[2]), float(row[3])])\n" "\n" "cnv = {}\n" - "with open('%s', 'rb') as f:\n" + "with open('%s', 'r') as f:\n" " reader = csv.reader(f, 'tab')\n" " for row in reader:\n" " chr = row[0]\n" @@ -431,7 +431,7 @@ static void create_plots(args_t *args) "\n" "def chroms_to_plot(th):\n" " dat = {}\n" - " with open('%s/summary.tab', 'rb') as f:\n" + " with open('%s/summary.tab', 'r') as f:\n" " reader = csv.reader(f, 'tab')\n" " for row in reader:\n" " if row[0]!='RG': continue\n" @@ -453,14 +453,14 @@ static void create_plots(args_t *args) " plot_chroms = chroms_to_plot(args.plot_threshold)\n" "\n" "def read_dat(file,dat,plot_chr):\n" - " with open(file, 'rb') as f:\n" + " with open(file, 'r') as f:\n" " reader = csv.reader(f, 'tab')\n" " for row in reader:\n" " chr = row[0]\n" " if chr != plot_chr: continue\n" " dat.append([row[1], float(row[2]), float(row[3])])\n" "def read_cnv(file,cnv,plot_chr):\n" - " with open(file, 'rb') as f:\n" + " with open(file, 'r') as f:\n" " reader = csv.reader(f, 'tab')\n" " for row in reader:\n" " chr = row[0]\n" diff --git a/bcftools/vcfconvert.c b/bcftools/vcfconvert.c index 1e28ad8..f815e25 100644 --- a/bcftools/vcfconvert.c +++ b/bcftools/vcfconvert.c @@ -1,6 +1,6 @@ /* vcfconvert.c -- convert between VCF/BCF and related formats. - Copyright (C) 2013-2014 Genome Research Ltd. + Copyright (C) 2013-2017 Genome Research Ltd. Author: Petr Danecek diff --git a/bcftools/vcfconvert.c.pysam.c b/bcftools/vcfconvert.c.pysam.c index a054ca8..a0ec8b9 100644 --- a/bcftools/vcfconvert.c.pysam.c +++ b/bcftools/vcfconvert.c.pysam.c @@ -2,7 +2,7 @@ /* vcfconvert.c -- convert between VCF/BCF and related formats. - Copyright (C) 2013-2014 Genome Research Ltd. + Copyright (C) 2013-2017 Genome Research Ltd. Author: Petr Danecek diff --git a/bcftools/vcfgtcheck.c b/bcftools/vcfgtcheck.c index 8835db3..c4d77e1 100644 --- a/bcftools/vcfgtcheck.c +++ b/bcftools/vcfgtcheck.c @@ -1,6 +1,6 @@ /* vcfgtcheck.c -- Check sample identity. - Copyright (C) 2013-2014 Genome Research Ltd. + Copyright (C) 2013-2018 Genome Research Ltd. Author: Petr Danecek @@ -81,7 +81,7 @@ static void plot_check(args_t *args, char *target_sample, char *query_sample) "sample_ids = False\n" "\n" "dat = []\n" - "with open('%s.tab', 'rb') as f:\n" + "with open('%s.tab', 'r') as f:\n" " reader = csv.reader(f, 'tab')\n" " for row in reader:\n" " if row[0][0]=='#': continue\n" @@ -153,7 +153,7 @@ static void plot_cross_check(args_t *args) "dat = None\n" "min = None\n" "max = None\n" - "with open('%s.tab', 'rb') as f:\n" + "with open('%s.tab', 'r') as f:\n" " reader = csv.reader(f, 'tab')\n" " i = 0\n" " for row in reader:\n" diff --git a/bcftools/vcfgtcheck.c.pysam.c b/bcftools/vcfgtcheck.c.pysam.c index 6a6fa58..462e404 100644 --- a/bcftools/vcfgtcheck.c.pysam.c +++ b/bcftools/vcfgtcheck.c.pysam.c @@ -2,7 +2,7 @@ /* vcfgtcheck.c -- Check sample identity. - Copyright (C) 2013-2014 Genome Research Ltd. + Copyright (C) 2013-2018 Genome Research Ltd. Author: Petr Danecek @@ -83,7 +83,7 @@ static void plot_check(args_t *args, char *target_sample, char *query_sample) "sample_ids = False\n" "\n" "dat = []\n" - "with open('%s.tab', 'rb') as f:\n" + "with open('%s.tab', 'r') as f:\n" " reader = csv.reader(f, 'tab')\n" " for row in reader:\n" " if row[0][0]=='#': continue\n" @@ -155,7 +155,7 @@ static void plot_cross_check(args_t *args) "dat = None\n" "min = None\n" "max = None\n" - "with open('%s.tab', 'rb') as f:\n" + "with open('%s.tab', 'r') as f:\n" " reader = csv.reader(f, 'tab')\n" " i = 0\n" " for row in reader:\n" diff --git a/bcftools/vcfnorm.c b/bcftools/vcfnorm.c index bc51018..ead1e89 100644 --- a/bcftools/vcfnorm.c +++ b/bcftools/vcfnorm.c @@ -1,6 +1,6 @@ /* vcfnorm.c -- Left-align and normalize indels. - Copyright (C) 2013-2016 Genome Research Ltd. + Copyright (C) 2013-2017 Genome Research Ltd. Author: Petr Danecek @@ -34,6 +34,7 @@ THE SOFTWARE. */ #include #include #include +#include #include "bcftools.h" #include "rbuf.h" @@ -53,6 +54,15 @@ typedef struct } map_t; +// primitive comparison of two records' alleles via hashes; normalized alleles assumed +typedef struct +{ + int n; // number of alleles + char *ref, *alt; + void *hash; +} +cmpals_t; + typedef struct { char *tseq, *seq; @@ -72,6 +82,8 @@ typedef struct int aln_win; // the realignment window size (maximum repeat size) bcf_srs_t *files; // using the synced reader only for -r option bcf_hdr_t *hdr; + cmpals_t *cmpals; + int ncmpals, mcmpals; faidx_t *fai; struct { int tot, set, swap; } nref; char **argv, *output_fname, *ref_fname, *vcf_fname, *region, *targets; @@ -94,7 +106,7 @@ static inline int replace_iupac_codes(char *seq, int nseq) } static inline int has_non_acgtn(char *seq, int nseq) { - char *end = nseq ? seq + nseq : seq + UINT32_MAX; // arbitrary large number + char *end = seq + nseq; while ( *seq && seqd.allele[i][0]=='<' ) return ERR_SYMBOLIC; // symbolic allele if ( line->d.allele[i][0]=='*' ) return ERR_SPANNING_DELETION; // spanning deletion - if ( has_non_acgtn(line->d.allele[i],0) ) + if ( has_non_acgtn(line->d.allele[i],line->shared.l) ) { if ( args->check_ref==CHECK_REF_EXIT ) error("Non-ACGTN alternate allele at %s:%d .. REF_SEQ:'%s' vs VCF:'%s'\n", bcf_seqname(args->hdr,line),line->pos+1,ref,line->d.allele[i]); @@ -1510,6 +1522,69 @@ static bcf1_t *mrows_flush(args_t *args) } return NULL; } +static void cmpals_add(args_t *args, bcf1_t *rec) +{ + args->ncmpals++; + hts_expand0(cmpals_t, args->ncmpals, args->mcmpals, args->cmpals); + cmpals_t *cmpals = args->cmpals + args->ncmpals - 1; + free(cmpals->ref); + cmpals->ref = strdup(rec->d.allele[0]); + cmpals->n = rec->n_allele; + if ( rec->n_allele==2 ) + { + free(cmpals->alt); + cmpals->alt = strdup(rec->d.allele[1]); + } + else + { + if ( cmpals->hash ) khash_str2int_destroy_free(cmpals->hash); + cmpals->hash = khash_str2int_init(); + int i; + for (i=1; in_allele; i++) + khash_str2int_inc(cmpals->hash, strdup(rec->d.allele[i])); + } +} +static int cmpals_match(args_t *args, bcf1_t *rec) +{ + int i, j; + for (i=0; incmpals; i++) + { + cmpals_t *cmpals = args->cmpals + i; + if ( rec->n_allele != cmpals->n ) continue; + + // NB. assuming both are normalized + if ( strcmp(rec->d.allele[0], cmpals->ref) ) continue; + + // the most frequent case + if ( rec->n_allele==2 ) + { + if ( strcmp(rec->d.allele[1], cmpals->alt) ) continue; + return 1; + } + + khash_t(str2int) *hash = (khash_t(str2int)*) cmpals->hash; + for (j=1; jn_allele; j++) + if ( !khash_str2int_has_key(hash, rec->d.allele[j]) ) break; + if ( jn_allele ) continue; + return 1; + } + cmpals_add(args, rec); + return 0; +} +static void cmpals_reset(args_t *args) { args->ncmpals = 0; } +static void cmpals_destroy(args_t *args) +{ + int i; + for (i=0; imcmpals; i++) + { + cmpals_t *cmpals = args->cmpals + i; + free(cmpals->ref); + free(cmpals->alt); + if ( cmpals->hash ) khash_str2int_destroy_free(cmpals->hash); + } + free(args->cmpals); +} + static void flush_buffer(args_t *args, htsFile *file, int n) { bcf1_t *line; @@ -1540,17 +1615,20 @@ static void flush_buffer(args_t *args, htsFile *file, int n) int line_type = bcf_get_variant_types(args->lines[k]); if ( prev_rid>=0 && prev_rid==args->lines[k]->rid && prev_pos==args->lines[k]->pos ) { - if ( (args->rmdup>>1)&COLLAPSE_ANY ) continue; - if ( (args->rmdup>>1)&COLLAPSE_SNPS && line_type&(VCF_SNP|VCF_MNP) && prev_type&(VCF_SNP|VCF_MNP) ) continue; - if ( (args->rmdup>>1)&COLLAPSE_INDELS && line_type&(VCF_INDEL) && prev_type&(VCF_INDEL) ) continue; + if ( args->rmdup & BCF_SR_PAIR_ANY ) continue; // rmdup by position only + if ( args->rmdup & BCF_SR_PAIR_SNPS && line_type&(VCF_SNP|VCF_MNP) && prev_type&(VCF_SNP|VCF_MNP) ) continue; + if ( args->rmdup & BCF_SR_PAIR_INDELS && line_type&(VCF_INDEL) && prev_type&(VCF_INDEL) ) continue; + if ( args->rmdup & BCF_SR_PAIR_EXACT && cmpals_match(args, args->lines[k]) ) continue; } else { prev_rid = args->lines[k]->rid; prev_pos = args->lines[k]->pos; prev_type = 0; + if ( args->rmdup & BCF_SR_PAIR_EXACT ) cmpals_reset(args); } prev_type |= line_type; + if ( args->rmdup & BCF_SR_PAIR_EXACT ) cmpals_add(args, args->lines[k]); } bcf_write1(file, args->hdr, args->lines[k]); } @@ -1580,6 +1658,7 @@ static void init_data(args_t *args) static void destroy_data(args_t *args) { + cmpals_destroy(args); int i; for (i=0; irbuf.m; i++) if ( args->lines[i] ) bcf_destroy1(args->lines[i]); @@ -1676,17 +1755,20 @@ static void normalize_vcf(args_t *args) int line_type = bcf_get_variant_types(line); if ( prev_rid>=0 && prev_rid==line->rid && prev_pos==line->pos ) { - if ( (args->rmdup>>1)&COLLAPSE_ANY ) continue; - if ( (args->rmdup>>1)&COLLAPSE_SNPS && line_type&(VCF_SNP|VCF_MNP) && prev_type&(VCF_SNP|VCF_MNP) ) continue; - if ( (args->rmdup>>1)&COLLAPSE_INDELS && line_type&(VCF_INDEL) && prev_type&(VCF_INDEL) ) continue; + if ( args->rmdup & BCF_SR_PAIR_ANY ) continue; // rmdup by position only + if ( args->rmdup & BCF_SR_PAIR_SNPS && line_type&(VCF_SNP|VCF_MNP) && prev_type&(VCF_SNP|VCF_MNP) ) continue; + if ( args->rmdup & BCF_SR_PAIR_INDELS && line_type&(VCF_INDEL) && prev_type&(VCF_INDEL) ) continue; + if ( args->rmdup & BCF_SR_PAIR_EXACT && cmpals_match(args, line) ) continue; } else { prev_rid = line->rid; prev_pos = line->pos; prev_type = 0; + if ( args->rmdup & BCF_SR_PAIR_EXACT ) cmpals_reset(args); } prev_type |= line_type; + if ( args->rmdup & BCF_SR_PAIR_EXACT ) cmpals_add(args, line); } // still on the same chromosome? @@ -1743,7 +1825,7 @@ static void usage(void) fprintf(stderr, "Options:\n"); fprintf(stderr, " -c, --check-ref check REF alleles and exit (e), warn (w), exclude (x), or set (s) bad sites [e]\n"); fprintf(stderr, " -D, --remove-duplicates remove duplicate lines of the same type.\n"); - fprintf(stderr, " -d, --rm-dup remove duplicate snps|indels|both|any\n"); + fprintf(stderr, " -d, --rm-dup remove duplicate snps|indels|both|all|none\n"); fprintf(stderr, " -f, --fasta-ref reference sequence (MANDATORY)\n"); fprintf(stderr, " -m, --multiallelics <-|+>[type] split multiallelics (-) or join biallelics (+), type: snps|indels|both|any [both]\n"); fprintf(stderr, " --no-version do not append version and command line to the header\n"); @@ -1804,10 +1886,12 @@ int main_vcfnorm(int argc, char *argv[]) switch (c) { case 'N': args->do_indels = 0; break; case 'd': - if ( !strcmp("snps",optarg) ) args->rmdup = COLLAPSE_SNPS<<1; - else if ( !strcmp("indels",optarg) ) args->rmdup = COLLAPSE_INDELS<<1; - else if ( !strcmp("both",optarg) ) args->rmdup = COLLAPSE_BOTH<<1; - else if ( !strcmp("any",optarg) ) args->rmdup = COLLAPSE_ANY<<1; + if ( !strcmp("snps",optarg) ) args->rmdup = BCF_SR_PAIR_SNPS; + else if ( !strcmp("indels",optarg) ) args->rmdup = BCF_SR_PAIR_INDELS; + else if ( !strcmp("both",optarg) ) args->rmdup = BCF_SR_PAIR_BOTH; + else if ( !strcmp("all",optarg) ) args->rmdup = BCF_SR_PAIR_ANY; + else if ( !strcmp("any",optarg) ) args->rmdup = BCF_SR_PAIR_ANY; + else if ( !strcmp("none",optarg) ) args->rmdup = BCF_SR_PAIR_EXACT; else error("The argument to -d not recognised: %s\n", optarg); break; case 'm': @@ -1841,7 +1925,7 @@ int main_vcfnorm(int argc, char *argv[]) case 'o': args->output_fname = optarg; break; case 'D': fprintf(stderr,"Warning: `-D` is functional but deprecated, replaced by `-d both`.\n"); - args->rmdup = COLLAPSE_NONE<<1; + args->rmdup = BCF_SR_PAIR_EXACT; break; case 's': args->strict_filter = 1; break; case 'f': args->ref_fname = optarg; break; diff --git a/bcftools/vcfnorm.c.pysam.c b/bcftools/vcfnorm.c.pysam.c index 5e3a5fb..d204de4 100644 --- a/bcftools/vcfnorm.c.pysam.c +++ b/bcftools/vcfnorm.c.pysam.c @@ -2,7 +2,7 @@ /* vcfnorm.c -- Left-align and normalize indels. - Copyright (C) 2013-2016 Genome Research Ltd. + Copyright (C) 2013-2017 Genome Research Ltd. Author: Petr Danecek @@ -36,6 +36,7 @@ THE SOFTWARE. */ #include #include #include +#include #include "bcftools.h" #include "rbuf.h" @@ -55,6 +56,15 @@ typedef struct } map_t; +// primitive comparison of two records' alleles via hashes; normalized alleles assumed +typedef struct +{ + int n; // number of alleles + char *ref, *alt; + void *hash; +} +cmpals_t; + typedef struct { char *tseq, *seq; @@ -74,6 +84,8 @@ typedef struct int aln_win; // the realignment window size (maximum repeat size) bcf_srs_t *files; // using the synced reader only for -r option bcf_hdr_t *hdr; + cmpals_t *cmpals; + int ncmpals, mcmpals; faidx_t *fai; struct { int tot, set, swap; } nref; char **argv, *output_fname, *ref_fname, *vcf_fname, *region, *targets; @@ -96,7 +108,7 @@ static inline int replace_iupac_codes(char *seq, int nseq) } static inline int has_non_acgtn(char *seq, int nseq) { - char *end = nseq ? seq + nseq : seq + UINT32_MAX; // arbitrary large number + char *end = seq + nseq; while ( *seq && seqd.allele[i][0]=='<' ) return ERR_SYMBOLIC; // symbolic allele if ( line->d.allele[i][0]=='*' ) return ERR_SPANNING_DELETION; // spanning deletion - if ( has_non_acgtn(line->d.allele[i],0) ) + if ( has_non_acgtn(line->d.allele[i],line->shared.l) ) { if ( args->check_ref==CHECK_REF_EXIT ) error("Non-ACGTN alternate allele at %s:%d .. REF_SEQ:'%s' vs VCF:'%s'\n", bcf_seqname(args->hdr,line),line->pos+1,ref,line->d.allele[i]); @@ -1512,6 +1524,69 @@ static bcf1_t *mrows_flush(args_t *args) } return NULL; } +static void cmpals_add(args_t *args, bcf1_t *rec) +{ + args->ncmpals++; + hts_expand0(cmpals_t, args->ncmpals, args->mcmpals, args->cmpals); + cmpals_t *cmpals = args->cmpals + args->ncmpals - 1; + free(cmpals->ref); + cmpals->ref = strdup(rec->d.allele[0]); + cmpals->n = rec->n_allele; + if ( rec->n_allele==2 ) + { + free(cmpals->alt); + cmpals->alt = strdup(rec->d.allele[1]); + } + else + { + if ( cmpals->hash ) khash_str2int_destroy_free(cmpals->hash); + cmpals->hash = khash_str2int_init(); + int i; + for (i=1; in_allele; i++) + khash_str2int_inc(cmpals->hash, strdup(rec->d.allele[i])); + } +} +static int cmpals_match(args_t *args, bcf1_t *rec) +{ + int i, j; + for (i=0; incmpals; i++) + { + cmpals_t *cmpals = args->cmpals + i; + if ( rec->n_allele != cmpals->n ) continue; + + // NB. assuming both are normalized + if ( strcmp(rec->d.allele[0], cmpals->ref) ) continue; + + // the most frequent case + if ( rec->n_allele==2 ) + { + if ( strcmp(rec->d.allele[1], cmpals->alt) ) continue; + return 1; + } + + khash_t(str2int) *hash = (khash_t(str2int)*) cmpals->hash; + for (j=1; jn_allele; j++) + if ( !khash_str2int_has_key(hash, rec->d.allele[j]) ) break; + if ( jn_allele ) continue; + return 1; + } + cmpals_add(args, rec); + return 0; +} +static void cmpals_reset(args_t *args) { args->ncmpals = 0; } +static void cmpals_destroy(args_t *args) +{ + int i; + for (i=0; imcmpals; i++) + { + cmpals_t *cmpals = args->cmpals + i; + free(cmpals->ref); + free(cmpals->alt); + if ( cmpals->hash ) khash_str2int_destroy_free(cmpals->hash); + } + free(args->cmpals); +} + static void flush_buffer(args_t *args, htsFile *file, int n) { bcf1_t *line; @@ -1542,17 +1617,20 @@ static void flush_buffer(args_t *args, htsFile *file, int n) int line_type = bcf_get_variant_types(args->lines[k]); if ( prev_rid>=0 && prev_rid==args->lines[k]->rid && prev_pos==args->lines[k]->pos ) { - if ( (args->rmdup>>1)&COLLAPSE_ANY ) continue; - if ( (args->rmdup>>1)&COLLAPSE_SNPS && line_type&(VCF_SNP|VCF_MNP) && prev_type&(VCF_SNP|VCF_MNP) ) continue; - if ( (args->rmdup>>1)&COLLAPSE_INDELS && line_type&(VCF_INDEL) && prev_type&(VCF_INDEL) ) continue; + if ( args->rmdup & BCF_SR_PAIR_ANY ) continue; // rmdup by position only + if ( args->rmdup & BCF_SR_PAIR_SNPS && line_type&(VCF_SNP|VCF_MNP) && prev_type&(VCF_SNP|VCF_MNP) ) continue; + if ( args->rmdup & BCF_SR_PAIR_INDELS && line_type&(VCF_INDEL) && prev_type&(VCF_INDEL) ) continue; + if ( args->rmdup & BCF_SR_PAIR_EXACT && cmpals_match(args, args->lines[k]) ) continue; } else { prev_rid = args->lines[k]->rid; prev_pos = args->lines[k]->pos; prev_type = 0; + if ( args->rmdup & BCF_SR_PAIR_EXACT ) cmpals_reset(args); } prev_type |= line_type; + if ( args->rmdup & BCF_SR_PAIR_EXACT ) cmpals_add(args, args->lines[k]); } bcf_write1(file, args->hdr, args->lines[k]); } @@ -1582,6 +1660,7 @@ static void init_data(args_t *args) static void destroy_data(args_t *args) { + cmpals_destroy(args); int i; for (i=0; irbuf.m; i++) if ( args->lines[i] ) bcf_destroy1(args->lines[i]); @@ -1678,17 +1757,20 @@ static void normalize_vcf(args_t *args) int line_type = bcf_get_variant_types(line); if ( prev_rid>=0 && prev_rid==line->rid && prev_pos==line->pos ) { - if ( (args->rmdup>>1)&COLLAPSE_ANY ) continue; - if ( (args->rmdup>>1)&COLLAPSE_SNPS && line_type&(VCF_SNP|VCF_MNP) && prev_type&(VCF_SNP|VCF_MNP) ) continue; - if ( (args->rmdup>>1)&COLLAPSE_INDELS && line_type&(VCF_INDEL) && prev_type&(VCF_INDEL) ) continue; + if ( args->rmdup & BCF_SR_PAIR_ANY ) continue; // rmdup by position only + if ( args->rmdup & BCF_SR_PAIR_SNPS && line_type&(VCF_SNP|VCF_MNP) && prev_type&(VCF_SNP|VCF_MNP) ) continue; + if ( args->rmdup & BCF_SR_PAIR_INDELS && line_type&(VCF_INDEL) && prev_type&(VCF_INDEL) ) continue; + if ( args->rmdup & BCF_SR_PAIR_EXACT && cmpals_match(args, line) ) continue; } else { prev_rid = line->rid; prev_pos = line->pos; prev_type = 0; + if ( args->rmdup & BCF_SR_PAIR_EXACT ) cmpals_reset(args); } prev_type |= line_type; + if ( args->rmdup & BCF_SR_PAIR_EXACT ) cmpals_add(args, line); } // still on the same chromosome? @@ -1745,7 +1827,7 @@ static void usage(void) fprintf(bcftools_stderr, "Options:\n"); fprintf(bcftools_stderr, " -c, --check-ref check REF alleles and exit (e), warn (w), exclude (x), or set (s) bad sites [e]\n"); fprintf(bcftools_stderr, " -D, --remove-duplicates remove duplicate lines of the same type.\n"); - fprintf(bcftools_stderr, " -d, --rm-dup remove duplicate snps|indels|both|any\n"); + fprintf(bcftools_stderr, " -d, --rm-dup remove duplicate snps|indels|both|all|none\n"); fprintf(bcftools_stderr, " -f, --fasta-ref reference sequence (MANDATORY)\n"); fprintf(bcftools_stderr, " -m, --multiallelics <-|+>[type] split multiallelics (-) or join biallelics (+), type: snps|indels|both|any [both]\n"); fprintf(bcftools_stderr, " --no-version do not append version and command line to the header\n"); @@ -1806,10 +1888,12 @@ int main_vcfnorm(int argc, char *argv[]) switch (c) { case 'N': args->do_indels = 0; break; case 'd': - if ( !strcmp("snps",optarg) ) args->rmdup = COLLAPSE_SNPS<<1; - else if ( !strcmp("indels",optarg) ) args->rmdup = COLLAPSE_INDELS<<1; - else if ( !strcmp("both",optarg) ) args->rmdup = COLLAPSE_BOTH<<1; - else if ( !strcmp("any",optarg) ) args->rmdup = COLLAPSE_ANY<<1; + if ( !strcmp("snps",optarg) ) args->rmdup = BCF_SR_PAIR_SNPS; + else if ( !strcmp("indels",optarg) ) args->rmdup = BCF_SR_PAIR_INDELS; + else if ( !strcmp("both",optarg) ) args->rmdup = BCF_SR_PAIR_BOTH; + else if ( !strcmp("all",optarg) ) args->rmdup = BCF_SR_PAIR_ANY; + else if ( !strcmp("any",optarg) ) args->rmdup = BCF_SR_PAIR_ANY; + else if ( !strcmp("none",optarg) ) args->rmdup = BCF_SR_PAIR_EXACT; else error("The argument to -d not recognised: %s\n", optarg); break; case 'm': @@ -1843,7 +1927,7 @@ int main_vcfnorm(int argc, char *argv[]) case 'o': args->output_fname = optarg; break; case 'D': fprintf(bcftools_stderr,"Warning: `-D` is functional but deprecated, replaced by `-d both`.\n"); - args->rmdup = COLLAPSE_NONE<<1; + args->rmdup = BCF_SR_PAIR_EXACT; break; case 's': args->strict_filter = 1; break; case 'f': args->ref_fname = optarg; break; diff --git a/bcftools/vcfplugin.c b/bcftools/vcfplugin.c index a53ac3c..90292f4 100644 --- a/bcftools/vcfplugin.c +++ b/bcftools/vcfplugin.c @@ -1,6 +1,6 @@ /* vcfplugin.c -- plugin modules for operating on VCF/BCF files. - Copyright (C) 2013-2016 Genome Research Ltd. + Copyright (C) 2013-2017 Genome Research Ltd. Author: Petr Danecek @@ -637,7 +637,11 @@ int main_plugin(int argc, char *argv[]) if ( !pass ) continue; } line = args->plugin.process(line); - if ( line ) bcf_write1(args->out_fh, args->hdr_out, line); + if ( line ) + { + if ( line->errcode ) error("[E::main_plugin] Unchecked error (%d), exiting\n",line->errcode); + bcf_write1(args->out_fh, args->hdr_out, line); + } } destroy_data(args); bcf_sr_destroy(args->files); diff --git a/bcftools/vcfplugin.c.pysam.c b/bcftools/vcfplugin.c.pysam.c index f8e393b..606e245 100644 --- a/bcftools/vcfplugin.c.pysam.c +++ b/bcftools/vcfplugin.c.pysam.c @@ -2,7 +2,7 @@ /* vcfplugin.c -- plugin modules for operating on VCF/BCF files. - Copyright (C) 2013-2016 Genome Research Ltd. + Copyright (C) 2013-2017 Genome Research Ltd. Author: Petr Danecek @@ -639,7 +639,11 @@ int main_plugin(int argc, char *argv[]) if ( !pass ) continue; } line = args->plugin.process(line); - if ( line ) bcf_write1(args->out_fh, args->hdr_out, line); + if ( line ) + { + if ( line->errcode ) error("[E::main_plugin] Unchecked error (%d), exiting\n",line->errcode); + bcf_write1(args->out_fh, args->hdr_out, line); + } } destroy_data(args); bcf_sr_destroy(args->files); diff --git a/bcftools/vcfquery.c b/bcftools/vcfquery.c index 04554f8..29c4a9d 100644 --- a/bcftools/vcfquery.c +++ b/bcftools/vcfquery.c @@ -1,6 +1,6 @@ /* vcfquery.c -- Extracts fields from VCF/BCF file. - Copyright (C) 2013-2014 Genome Research Ltd. + Copyright (C) 2013-2017 Genome Research Ltd. Author: Petr Danecek @@ -48,6 +48,7 @@ typedef struct filter_t *filter; char *filter_str; int filter_logic; // include or exclude sites which match the filters? One of FLT_INCLUDE/FLT_EXCLUDE + uint8_t *smpl_pass; convert_t *convert; bcf_srs_t *files; bcf_hdr_t *header; @@ -99,6 +100,7 @@ static void init_data(args_t *args) } } args->convert = convert_init(args->header, samples, nsamples, args->format_str); + convert_set_option(args->convert, subset_samples, &args->smpl_pass); if ( args->allow_undef_tags ) convert_set_option(args->convert, allow_undef_tags, 1); free(samples); @@ -129,6 +131,7 @@ static void query_vcf(args_t *args) fwrite(str.s, str.l, 1, args->out); } + int i,max_convert_unpack = convert_max_unpack(args->convert); while ( bcf_sr_next_line(args->files) ) { if ( !bcf_sr_has_line(args->files,0) ) continue; @@ -137,9 +140,30 @@ static void query_vcf(args_t *args) if ( args->filter ) { - int pass = filter_test(args->filter, line, NULL); - if ( args->filter_logic & FLT_EXCLUDE ) pass = pass ? 0 : 1; - if ( !pass ) continue; + int pass = filter_test(args->filter, line, (const uint8_t**) &args->smpl_pass); + if ( args->filter_logic & FLT_EXCLUDE ) + { + // This code addresses this problem: + // -i can include a site but exclude a sample + // -e exclude a site but include a sample + + if ( pass ) + { + if ( !args->smpl_pass ) continue; + if ( !(max_convert_unpack & BCF_UN_FMT) ) continue; + + pass = 0; + for (i=0; in_sample; i++) + { + if ( args->smpl_pass[i] ) args->smpl_pass[i] = 0; + else { args->smpl_pass[i] = 1; pass = 1; } + } + if ( !pass ) continue; + } + else if ( args->smpl_pass ) + for (i=0; in_sample; i++) args->smpl_pass[i] = 1; + } + else if ( !pass ) continue; } str.l = 0; @@ -198,7 +222,6 @@ static void usage(void) fprintf(stderr, "Usage: bcftools query [options] [ [...]]\n"); fprintf(stderr, "\n"); fprintf(stderr, "Options:\n"); - fprintf(stderr, " -c, --collapse collapse lines with duplicate positions for , see man page [none]\n"); fprintf(stderr, " -e, --exclude exclude sites for which the expression is true (see man page for details)\n"); fprintf(stderr, " -f, --format see man page for details\n"); fprintf(stderr, " -H, --print-header print header\n"); @@ -254,14 +277,8 @@ int main_vcfquery(int argc, char *argv[]) case 'f': args->format_str = strdup(optarg); break; case 'H': args->print_header = 1; break; case 'v': args->vcf_list = optarg; break; - case 'c': - if ( !strcmp(optarg,"snps") ) collapse |= COLLAPSE_SNPS; - else if ( !strcmp(optarg,"indels") ) collapse |= COLLAPSE_INDELS; - else if ( !strcmp(optarg,"both") ) collapse |= COLLAPSE_SNPS | COLLAPSE_INDELS; - else if ( !strcmp(optarg,"any") ) collapse |= COLLAPSE_ANY; - else if ( !strcmp(optarg,"all") ) collapse |= COLLAPSE_ANY; - else if ( !strcmp(optarg,"some") ) collapse |= COLLAPSE_SOME; - else error("The --collapse string \"%s\" not recognised.\n", optarg); + case 'c': + error("The --collapse option is obsolete, pipe through `bcftools norm -c` instead.\n", optarg); break; case 'a': { @@ -322,7 +339,6 @@ int main_vcfquery(int argc, char *argv[]) { if ( !fname ) usage(); args->files = bcf_sr_init(); - args->files->collapse = collapse; if ( optind+1 < argc ) args->files->require_index = 1; if ( args->regions_list && bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 ) error("Failed to read the regions: %s\n", args->regions_list); diff --git a/bcftools/vcfquery.c.pysam.c b/bcftools/vcfquery.c.pysam.c index e9100e6..8186a59 100644 --- a/bcftools/vcfquery.c.pysam.c +++ b/bcftools/vcfquery.c.pysam.c @@ -2,7 +2,7 @@ /* vcfquery.c -- Extracts fields from VCF/BCF file. - Copyright (C) 2013-2014 Genome Research Ltd. + Copyright (C) 2013-2017 Genome Research Ltd. Author: Petr Danecek @@ -50,6 +50,7 @@ typedef struct filter_t *filter; char *filter_str; int filter_logic; // include or exclude sites which match the filters? One of FLT_INCLUDE/FLT_EXCLUDE + uint8_t *smpl_pass; convert_t *convert; bcf_srs_t *files; bcf_hdr_t *header; @@ -101,6 +102,7 @@ static void init_data(args_t *args) } } args->convert = convert_init(args->header, samples, nsamples, args->format_str); + convert_set_option(args->convert, subset_samples, &args->smpl_pass); if ( args->allow_undef_tags ) convert_set_option(args->convert, allow_undef_tags, 1); free(samples); @@ -131,6 +133,7 @@ static void query_vcf(args_t *args) fwrite(str.s, str.l, 1, args->out); } + int i,max_convert_unpack = convert_max_unpack(args->convert); while ( bcf_sr_next_line(args->files) ) { if ( !bcf_sr_has_line(args->files,0) ) continue; @@ -139,9 +142,30 @@ static void query_vcf(args_t *args) if ( args->filter ) { - int pass = filter_test(args->filter, line, NULL); - if ( args->filter_logic & FLT_EXCLUDE ) pass = pass ? 0 : 1; - if ( !pass ) continue; + int pass = filter_test(args->filter, line, (const uint8_t**) &args->smpl_pass); + if ( args->filter_logic & FLT_EXCLUDE ) + { + // This code addresses this problem: + // -i can include a site but exclude a sample + // -e exclude a site but include a sample + + if ( pass ) + { + if ( !args->smpl_pass ) continue; + if ( !(max_convert_unpack & BCF_UN_FMT) ) continue; + + pass = 0; + for (i=0; in_sample; i++) + { + if ( args->smpl_pass[i] ) args->smpl_pass[i] = 0; + else { args->smpl_pass[i] = 1; pass = 1; } + } + if ( !pass ) continue; + } + else if ( args->smpl_pass ) + for (i=0; in_sample; i++) args->smpl_pass[i] = 1; + } + else if ( !pass ) continue; } str.l = 0; @@ -200,7 +224,6 @@ static void usage(void) fprintf(bcftools_stderr, "Usage: bcftools query [options] [ [...]]\n"); fprintf(bcftools_stderr, "\n"); fprintf(bcftools_stderr, "Options:\n"); - fprintf(bcftools_stderr, " -c, --collapse collapse lines with duplicate positions for , see man page [none]\n"); fprintf(bcftools_stderr, " -e, --exclude exclude sites for which the expression is true (see man page for details)\n"); fprintf(bcftools_stderr, " -f, --format see man page for details\n"); fprintf(bcftools_stderr, " -H, --print-header print header\n"); @@ -256,14 +279,8 @@ int main_vcfquery(int argc, char *argv[]) case 'f': args->format_str = strdup(optarg); break; case 'H': args->print_header = 1; break; case 'v': args->vcf_list = optarg; break; - case 'c': - if ( !strcmp(optarg,"snps") ) collapse |= COLLAPSE_SNPS; - else if ( !strcmp(optarg,"indels") ) collapse |= COLLAPSE_INDELS; - else if ( !strcmp(optarg,"both") ) collapse |= COLLAPSE_SNPS | COLLAPSE_INDELS; - else if ( !strcmp(optarg,"any") ) collapse |= COLLAPSE_ANY; - else if ( !strcmp(optarg,"all") ) collapse |= COLLAPSE_ANY; - else if ( !strcmp(optarg,"some") ) collapse |= COLLAPSE_SOME; - else error("The --collapse string \"%s\" not recognised.\n", optarg); + case 'c': + error("The --collapse option is obsolete, pipe through `bcftools norm -c` instead.\n", optarg); break; case 'a': { @@ -324,7 +341,6 @@ int main_vcfquery(int argc, char *argv[]) { if ( !fname ) usage(); args->files = bcf_sr_init(); - args->files->collapse = collapse; if ( optind+1 < argc ) args->files->require_index = 1; if ( args->regions_list && bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 ) error("Failed to read the regions: %s\n", args->regions_list); diff --git a/bcftools/vcfroh.c b/bcftools/vcfroh.c index 8c1d055..626f975 100644 --- a/bcftools/vcfroh.c +++ b/bcftools/vcfroh.c @@ -1,6 +1,6 @@ /* vcfroh.c -- HMM model for detecting runs of autozygosity. - Copyright (C) 2013-2015 Genome Research Ltd. + Copyright (C) 2013-2017 Genome Research Ltd. Author: Petr Danecek @@ -526,6 +526,8 @@ static void flush_viterbi(args_t *args, int ismpl) smpl->rg.state = 1; smpl->rg.beg = smpl->sites[i]; smpl->rg.rid = args->prev_rid; + smpl->rg.qual = qual; + smpl->rg.nqual = 1; } } else if ( state ) diff --git a/bcftools/vcfroh.c.pysam.c b/bcftools/vcfroh.c.pysam.c index b303a40..77d2f4f 100644 --- a/bcftools/vcfroh.c.pysam.c +++ b/bcftools/vcfroh.c.pysam.c @@ -2,7 +2,7 @@ /* vcfroh.c -- HMM model for detecting runs of autozygosity. - Copyright (C) 2013-2015 Genome Research Ltd. + Copyright (C) 2013-2017 Genome Research Ltd. Author: Petr Danecek @@ -528,6 +528,8 @@ static void flush_viterbi(args_t *args, int ismpl) smpl->rg.state = 1; smpl->rg.beg = smpl->sites[i]; smpl->rg.rid = args->prev_rid; + smpl->rg.qual = qual; + smpl->rg.nqual = 1; } } else if ( state ) diff --git a/bcftools/vcfsom.c b/bcftools/vcfsom.c index 03181e9..434dc16 100644 --- a/bcftools/vcfsom.c +++ b/bcftools/vcfsom.c @@ -453,7 +453,7 @@ static void create_eval_plot(args_t *args) "import csv\n" "csv.register_dialect('tab', delimiter='\\t', quoting=csv.QUOTE_NONE)\n" "dat = []\n" - "with open('%s.eval', 'rb') as f:\n" + "with open('%s.eval', 'r') as f:\n" "\treader = csv.reader(f, 'tab')\n" "\tfor row in reader:\n" "\t\tif row[0][0]!='#': dat.append(row)\n" diff --git a/bcftools/vcfsom.c.pysam.c b/bcftools/vcfsom.c.pysam.c index d806c01..092d37a 100644 --- a/bcftools/vcfsom.c.pysam.c +++ b/bcftools/vcfsom.c.pysam.c @@ -455,7 +455,7 @@ static void create_eval_plot(args_t *args) "import csv\n" "csv.register_dialect('tab', delimiter='\\t', quoting=csv.QUOTE_NONE)\n" "dat = []\n" - "with open('%s.eval', 'rb') as f:\n" + "with open('%s.eval', 'r') as f:\n" "\treader = csv.reader(f, 'tab')\n" "\tfor row in reader:\n" "\t\tif row[0][0]!='#': dat.append(row)\n" diff --git a/bcftools/vcfstats.c b/bcftools/vcfstats.c index 3b73173..4f7765e 100644 --- a/bcftools/vcfstats.c +++ b/bcftools/vcfstats.c @@ -1,6 +1,6 @@ /* vcfstats.c -- Produces stats which can be plotted using plot-vcfstats. - Copyright (C) 2012-2016 Genome Research Ltd. + Copyright (C) 2012-2017 Genome Research Ltd. Author: Petr Danecek @@ -87,7 +87,7 @@ typedef struct int in_frame, out_frame, na_frame, in_frame_alt1, out_frame_alt1, na_frame_alt1; int subst[15]; int *smpl_hets, *smpl_homRR, *smpl_homAA, *smpl_ts, *smpl_tv, *smpl_indels, *smpl_ndp, *smpl_sngl; - int *smpl_hapRef, *smpl_hapAlt; + int *smpl_hapRef, *smpl_hapAlt, *smpl_missing; int *smpl_indel_hets, *smpl_indel_homs; int *smpl_frm_shifts; // not-applicable, in-frame, out-frame unsigned long int *smpl_dp; @@ -305,7 +305,7 @@ int indel_ctx_type(indel_ctx_t *ctx, char *chr, int pos, char *ref, char *alt, i // Sanity check: the reference sequence must match the REF allele for (i=0; ifiles->n_smpl ) { + stats->smpl_missing = (int *) calloc(args->files->n_smpl,sizeof(int)); stats->smpl_hets = (int *) calloc(args->files->n_smpl,sizeof(int)); stats->smpl_homAA = (int *) calloc(args->files->n_smpl,sizeof(int)); stats->smpl_homRR = (int *) calloc(args->files->n_smpl,sizeof(int)); @@ -551,6 +552,7 @@ static void destroy_stats(args_t *args) #endif free(stats->insertions); free(stats->deletions); + free(stats->smpl_missing); free(stats->smpl_hets); free(stats->smpl_homAA); free(stats->smpl_homRR); @@ -858,7 +860,11 @@ static void do_sample_stats(args_t *args, stats_t *stats, bcf_sr_t *reader, int { int ial, jal; int gt = bcf_gt_type(fmt_ptr, reader->samples[is], &ial, &jal); - if ( gt==GT_UNKN ) continue; + if ( gt==GT_UNKN ) + { + stats->smpl_missing[is]++; + continue; + } if ( gt==GT_HAPL_R || gt==GT_HAPL_A ) { if ( line_type&VCF_INDEL && stats->smpl_frm_shifts ) @@ -1495,18 +1501,19 @@ static void print_stats(args_t *args) if ( args->files->n_smpl ) { - printf("# PSC, Per-sample counts. Note that the ref/het/hom counts include only SNPs, for indels see PSI. Haploid counts include both SNPs and indels.\n"); + printf("# PSC, Per-sample counts. Note that the ref/het/hom counts include only SNPs, for indels see PSI. The rest include both SNPs and indels.\n"); printf("# PSC\t[2]id\t[3]sample\t[4]nRefHom\t[5]nNonRefHom\t[6]nHets\t[7]nTransitions\t[8]nTransversions\t[9]nIndels\t[10]average depth\t[11]nSingletons" - "\t[12]nHapRef\t[13]nHapAlt\n"); + "\t[12]nHapRef\t[13]nHapAlt\t[14]nMissing\n"); for (id=0; idnstats; id++) { stats_t *stats = &args->stats[id]; for (i=0; ifiles->n_smpl; i++) { float dp = stats->smpl_ndp[i] ? stats->smpl_dp[i]/(float)stats->smpl_ndp[i] : 0; - printf("PSC\t%d\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%.1f\t%d\t%d\t%d\n", id,args->files->samples[i], + printf("PSC\t%d\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%.1f\t%d\t%d\t%d\t%d\n", id,args->files->samples[i], stats->smpl_homRR[i], stats->smpl_homAA[i], stats->smpl_hets[i], stats->smpl_ts[i], - stats->smpl_tv[i], stats->smpl_indels[i],dp, stats->smpl_sngl[i], stats->smpl_hapRef[i], stats->smpl_hapAlt[i]); + stats->smpl_tv[i], stats->smpl_indels[i],dp, stats->smpl_sngl[i], stats->smpl_hapRef[i], + stats->smpl_hapAlt[i], stats->smpl_missing[i]); } } diff --git a/bcftools/vcfstats.c.pysam.c b/bcftools/vcfstats.c.pysam.c index 875dd6a..46a051f 100644 --- a/bcftools/vcfstats.c.pysam.c +++ b/bcftools/vcfstats.c.pysam.c @@ -2,7 +2,7 @@ /* vcfstats.c -- Produces stats which can be plotted using plot-vcfstats. - Copyright (C) 2012-2016 Genome Research Ltd. + Copyright (C) 2012-2017 Genome Research Ltd. Author: Petr Danecek @@ -89,7 +89,7 @@ typedef struct int in_frame, out_frame, na_frame, in_frame_alt1, out_frame_alt1, na_frame_alt1; int subst[15]; int *smpl_hets, *smpl_homRR, *smpl_homAA, *smpl_ts, *smpl_tv, *smpl_indels, *smpl_ndp, *smpl_sngl; - int *smpl_hapRef, *smpl_hapAlt; + int *smpl_hapRef, *smpl_hapAlt, *smpl_missing; int *smpl_indel_hets, *smpl_indel_homs; int *smpl_frm_shifts; // not-applicable, in-frame, out-frame unsigned long int *smpl_dp; @@ -307,7 +307,7 @@ int indel_ctx_type(indel_ctx_t *ctx, char *chr, int pos, char *ref, char *alt, i // Sanity check: the reference sequence must match the REF allele for (i=0; ifiles->n_smpl ) { + stats->smpl_missing = (int *) calloc(args->files->n_smpl,sizeof(int)); stats->smpl_hets = (int *) calloc(args->files->n_smpl,sizeof(int)); stats->smpl_homAA = (int *) calloc(args->files->n_smpl,sizeof(int)); stats->smpl_homRR = (int *) calloc(args->files->n_smpl,sizeof(int)); @@ -553,6 +554,7 @@ static void destroy_stats(args_t *args) #endif free(stats->insertions); free(stats->deletions); + free(stats->smpl_missing); free(stats->smpl_hets); free(stats->smpl_homAA); free(stats->smpl_homRR); @@ -860,7 +862,11 @@ static void do_sample_stats(args_t *args, stats_t *stats, bcf_sr_t *reader, int { int ial, jal; int gt = bcf_gt_type(fmt_ptr, reader->samples[is], &ial, &jal); - if ( gt==GT_UNKN ) continue; + if ( gt==GT_UNKN ) + { + stats->smpl_missing[is]++; + continue; + } if ( gt==GT_HAPL_R || gt==GT_HAPL_A ) { if ( line_type&VCF_INDEL && stats->smpl_frm_shifts ) @@ -1497,18 +1503,19 @@ static void print_stats(args_t *args) if ( args->files->n_smpl ) { - fprintf(bcftools_stdout, "# PSC, Per-sample counts. Note that the ref/het/hom counts include only SNPs, for indels see PSI. Haploid counts include both SNPs and indels.\n"); + fprintf(bcftools_stdout, "# PSC, Per-sample counts. Note that the ref/het/hom counts include only SNPs, for indels see PSI. The rest include both SNPs and indels.\n"); fprintf(bcftools_stdout, "# PSC\t[2]id\t[3]sample\t[4]nRefHom\t[5]nNonRefHom\t[6]nHets\t[7]nTransitions\t[8]nTransversions\t[9]nIndels\t[10]average depth\t[11]nSingletons" - "\t[12]nHapRef\t[13]nHapAlt\n"); + "\t[12]nHapRef\t[13]nHapAlt\t[14]nMissing\n"); for (id=0; idnstats; id++) { stats_t *stats = &args->stats[id]; for (i=0; ifiles->n_smpl; i++) { float dp = stats->smpl_ndp[i] ? stats->smpl_dp[i]/(float)stats->smpl_ndp[i] : 0; - fprintf(bcftools_stdout, "PSC\t%d\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%.1f\t%d\t%d\t%d\n", id,args->files->samples[i], + fprintf(bcftools_stdout, "PSC\t%d\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%.1f\t%d\t%d\t%d\t%d\n", id,args->files->samples[i], stats->smpl_homRR[i], stats->smpl_homAA[i], stats->smpl_hets[i], stats->smpl_ts[i], - stats->smpl_tv[i], stats->smpl_indels[i],dp, stats->smpl_sngl[i], stats->smpl_hapRef[i], stats->smpl_hapAlt[i]); + stats->smpl_tv[i], stats->smpl_indels[i],dp, stats->smpl_sngl[i], stats->smpl_hapRef[i], + stats->smpl_hapAlt[i], stats->smpl_missing[i]); } } diff --git a/bcftools/vcfview.c b/bcftools/vcfview.c index 645cc8a..5d523f1 100644 --- a/bcftools/vcfview.c +++ b/bcftools/vcfview.c @@ -1,6 +1,6 @@ /* vcfview.c -- VCF/BCF conversion, view, subset and filter VCF/BCF files. - Copyright (C) 2013-2014 Genome Research Ltd. + Copyright (C) 2013-2018 Genome Research Ltd. Author: Shane McCarthy @@ -342,11 +342,14 @@ int subset_vcf(args_t *args, bcf1_t *line) an += args->ac[i]; } + int update_ac = args->calc_ac; if (args->n_samples) { int non_ref_ac_sub = 0, *ac_sub = (int*) calloc(line->n_allele,sizeof(int)); bcf_subset(args->hdr, line, args->n_samples, args->imap); - if (args->calc_ac) { + if ( args->calc_ac && !bcf_get_fmt(args->hdr,line,"GT") ) update_ac = 0; + if ( update_ac ) + { bcf_calc_ac(args->hsub, line, ac_sub, BCF_UN_FMT); // recalculate AC and AN an = 0; for (i=0; in_allele; i++) { @@ -442,7 +445,7 @@ int subset_vcf(args_t *args, bcf1_t *line) if (args->uncalled == FLT_INCLUDE && an > 0) return 0; // select uncalled if (args->uncalled == FLT_EXCLUDE && an == 0) return 0; // skip if uncalled } - if (args->calc_ac && args->update_info) { + if (update_ac && args->update_info) { bcf_update_info_int32(args->hdr, line, "AC", &args->ac[1], line->n_allele-1); bcf_update_info_int32(args->hdr, line, "AN", &an, 1); } diff --git a/bcftools/vcfview.c.pysam.c b/bcftools/vcfview.c.pysam.c index 4fbe35a..4de3ac4 100644 --- a/bcftools/vcfview.c.pysam.c +++ b/bcftools/vcfview.c.pysam.c @@ -2,7 +2,7 @@ /* vcfview.c -- VCF/BCF conversion, view, subset and filter VCF/BCF files. - Copyright (C) 2013-2014 Genome Research Ltd. + Copyright (C) 2013-2018 Genome Research Ltd. Author: Shane McCarthy @@ -344,11 +344,14 @@ int subset_vcf(args_t *args, bcf1_t *line) an += args->ac[i]; } + int update_ac = args->calc_ac; if (args->n_samples) { int non_ref_ac_sub = 0, *ac_sub = (int*) calloc(line->n_allele,sizeof(int)); bcf_subset(args->hdr, line, args->n_samples, args->imap); - if (args->calc_ac) { + if ( args->calc_ac && !bcf_get_fmt(args->hdr,line,"GT") ) update_ac = 0; + if ( update_ac ) + { bcf_calc_ac(args->hsub, line, ac_sub, BCF_UN_FMT); // recalculate AC and AN an = 0; for (i=0; in_allele; i++) { @@ -444,7 +447,7 @@ int subset_vcf(args_t *args, bcf1_t *line) if (args->uncalled == FLT_INCLUDE && an > 0) return 0; // select uncalled if (args->uncalled == FLT_EXCLUDE && an == 0) return 0; // skip if uncalled } - if (args->calc_ac && args->update_info) { + if (update_ac && args->update_info) { bcf_update_info_int32(args->hdr, line, "AC", &args->ac[1], line->n_allele-1); bcf_update_info_int32(args->hdr, line, "AN", &an, 1); } diff --git a/bcftools/version.h b/bcftools/version.h index eb2074c..29d2017 100644 --- a/bcftools/version.h +++ b/bcftools/version.h @@ -1 +1 @@ -#define BCFTOOLS_VERSION "1.6" +#define BCFTOOLS_VERSION "1.7" diff --git a/doc/release.rst b/doc/release.rst index d0ece25..1a255b1 100644 --- a/doc/release.rst +++ b/doc/release.rst @@ -2,6 +2,19 @@ Release notes ============= +Release 0.14.1 +============== + +This is mostly a bugfix release, though bcftools has now also been +upgraded to 1.7.0. + +* [#621] Add a warning to count_coverage when an alignment has an + empty QUAL field +* [#635] Speed-up of AlignedSegment.find_intro() +* treat border case of all bases in pileup column below quality score +* [#634] Fix access to pileup reference_sequence + + Release 0.14.0 ============== diff --git a/pysam/libcalignedsegment.pyx b/pysam/libcalignedsegment.pyx index e94db54..edb5eaa 100644 --- a/pysam/libcalignedsegment.pyx +++ b/pysam/libcalignedsegment.pyx @@ -2804,8 +2804,8 @@ cdef class PileupColumn: property reference_name: """:term:`reference` name (None if no AlignmentFile is associated)""" def __get__(self): - if self._alignment_file is not None: - return self._alignment_file.getrname(self.tid) + if self.header is not None: + return self.header.get_reference_name(self.tid) return None property nsegments: @@ -2966,7 +2966,6 @@ cdef class PileupColumn: buf[n] = p.b.core.qual + 33 n += 1 assert n < MAX_PILEUP_BUFFER_SIZE - if not p.is_del: if p.qpos < p.b.core.l_qseq: cc = seq_nt16_str[bam_seqi(bam_get_seq(p.b), p.qpos)] @@ -3032,9 +3031,13 @@ cdef class PileupColumn: n += 1 assert n < MAX_PILEUP_BUFFER_SIZE - # quicker to ensemble all and split than to encode all separately. - # ignore last ":" - return force_str(PyBytes_FromStringAndSize(buf, n-1)).split(":") + if n == 0: + # could be zero if all qualities are too low + return "" + else: + # quicker to ensemble all and split than to encode all separately. + # ignore last ":" + return force_str(PyBytes_FromStringAndSize(buf, n-1)).split(":") def get_query_qualities(self): """query base quality scores at pileup column position. diff --git a/pysam/libcalignmentfile.pyx b/pysam/libcalignmentfile.pyx index 439cc55..739013d 100644 --- a/pysam/libcalignmentfile.pyx +++ b/pysam/libcalignmentfile.pyx @@ -1,3 +1,4 @@ + # cython: embedsignature=True # cython: profile=True ######################################################## @@ -1508,7 +1509,6 @@ cdef class AlignmentFile(HTSFile): """ - cdef uint32_t contig_length = self.get_reference_length(contig) cdef int _start = start if start is not None else 0 cdef int _stop = stop if stop is not None else contig_length @@ -1537,12 +1537,14 @@ cdef class AlignmentFile(HTSFile): cdef int refpos cdef int c = 0 cdef int filter_method = 0 + + if read_callback == "all": filter_method = 1 elif read_callback == "nofilter": filter_method = 2 - cdef int _threshold = quality_threshold + cdef int _threshold = quality_threshold or 0 for read in self.fetch(contig=contig, reference=reference, start=start, @@ -1564,10 +1566,13 @@ cdef class AlignmentFile(HTSFile): # count seq = read.seq quality = read.query_qualities + for qpos, refpos in read.get_aligned_pairs(True): if qpos is not None and refpos is not None and \ _start <= refpos < _stop: - if quality[qpos] >= quality_threshold: + + # only check base quality if _threshold > 0 + if (_threshold and quality and quality[qpos] >= _threshold) or not _threshold: if seq[qpos] == 'A': count_a.data.as_ulongs[refpos - _start] += 1 if seq[qpos] == 'C': @@ -1579,7 +1584,7 @@ cdef class AlignmentFile(HTSFile): return count_a, count_c, count_g, count_t - def find_introns(self, read_iterator): + def find_introns_slow(self, read_iterator): """Return a dictionary {(start, stop): count} Listing the intronic sites in the reads (identified by 'N' in the cigar strings), and their support ( = number of reads ). @@ -1604,6 +1609,38 @@ cdef class AlignmentFile(HTSFile): last_read_pos = read_loc return res + def find_introns(self, read_iterator): + """Return a dictionary {(start, stop): count} + Listing the intronic sites in the reads (identified by 'N' in the cigar strings), + and their support ( = number of reads ). + + read_iterator can be the result of a .fetch(...) call. + Or it can be a generator filtering such reads. Example + samfile.find_introns((read for read in samfile.fetch(...) if read.is_reverse) + """ + cdef: + uint32_t base_position, junc_start, nt + int op + AlignedSegment r + int BAM_CREF_SKIP = 3 #BAM_CREF_SKIP + + import collections + res = collections.Counter() + + match_or_deletion = {0, 2, 7, 8} # only M/=/X (0/7/8) and D (2) are related to genome position + for r in read_iterator: + base_position = r.pos + + for op, nt in r.cigartuples: + if op in match_or_deletion: + base_position += nt + elif op == BAM_CREF_SKIP: + junc_start = base_position + base_position += nt + res[(junc_start, base_position)] += 1 + return res + + def close(self): '''closes the :class:`pysam.AlignmentFile`.''' diff --git a/pysam/version.py b/pysam/version.py index 43da562..8a68bcc 100644 --- a/pysam/version.py +++ b/pysam/version.py @@ -1,5 +1,5 @@ # pysam versioning information -__version__ = "0.14" +__version__ = "0.14.1" # TODO: upgrade number __samtools_version__ = "1.7" diff --git a/tests/AlignmentFileHeader_test.py b/tests/AlignmentFileHeader_test.py index 1cdbb69..1cf733e 100644 --- a/tests/AlignmentFileHeader_test.py +++ b/tests/AlignmentFileHeader_test.py @@ -7,28 +7,19 @@ and data files located there. import unittest import os -import shutil import sys import re import copy -import collections from collections import OrderedDict as odict -import subprocess -import logging -import array +import pysam +import pysam.samtools +from TestUtils import get_temp_filename, BAM_DATADIR + if sys.version_info.major >= 3: from io import StringIO else: from StringIO import StringIO -from functools import partial - -import pysam -import pysam.samtools -from TestUtils import checkBinaryEqual, checkURL, \ - check_samtools_view_equal, checkFieldEqual, force_str, \ - get_temp_filename, BAM_DATADIR - class TestHeaderConstruction(unittest.TestCase): """testing header construction.""" diff --git a/tests/AlignmentFilePileup_test.py b/tests/AlignmentFilePileup_test.py index 1851da8..32738fe 100644 --- a/tests/AlignmentFilePileup_test.py +++ b/tests/AlignmentFilePileup_test.py @@ -1,9 +1,9 @@ """Benchmarking module for AlignmentFile functionality""" import os -import re +import pysam import unittest from TestUtils import BAM_DATADIR, IS_PYTHON3, force_str, flatten_nested_list -from PileupTestUtils import * +import PileupTestUtils class TestPileupReadSelection(unittest.TestCase): @@ -162,6 +162,10 @@ class TestPileupObjects(unittest.TestCase): pcolumn1.reference_id, 0, "chromosome/target id mismatch in position 1: %s != %s" % (pcolumn1.reference_id, 0)) + self.assertEqual( + pcolumn1.reference_name, "chr1", + "chromosome mismatch in position 1: %s != %s" % + (pcolumn1.reference_name, "chr1")) self.assertEqual( pcolumn1.reference_pos, 105 - 1, "position mismatch in position 1: %s != %s" % @@ -182,6 +186,10 @@ class TestPileupObjects(unittest.TestCase): pcolumn2.reference_id, 1, "chromosome/target id mismatch in position 1: %s != %s" % (pcolumn2.reference_id, 1)) + self.assertEqual( + pcolumn2.reference_name, "chr2", + "chromosome mismatch in position 1: %s != %s" % + (pcolumn2.reference_name, "chr2")) self.assertEqual( pcolumn2.reference_pos, 1480 - 1, "position mismatch in position 1: %s != %s" % @@ -338,32 +346,32 @@ class PileUpColumnTests(unittest.TestCase): fn_fasta = os.path.join(BAM_DATADIR, "ex1.fa") def test_pileup_depths_are_equal(self): - samtools_result = build_depth_with_samtoolspipe(self.fn) - pysam_result = build_depth_with_filter_with_pysam(self.fn) + samtools_result = PileupTestUtils.build_depth_with_samtoolspipe(self.fn) + pysam_result = PileupTestUtils.build_depth_with_filter_with_pysam(self.fn) self.assertEqual(pysam_result, samtools_result) def test_pileup_query_bases_without_reference_are_equal(self): - samtools_result = build_query_bases_with_samtoolspipe(self.fn) - pysam_result = build_query_bases_with_pysam(self.fn) + samtools_result = PileupTestUtils.build_query_bases_with_samtoolspipe(self.fn) + pysam_result = PileupTestUtils.build_query_bases_with_pysam(self.fn) self.assertEqual(["".join(x) for x in pysam_result], samtools_result) def test_pileup_query_bases_with_reference_are_equal(self): - samtools_result = build_query_bases_with_samtoolspipe(self.fn, "-f", self.fn_fasta) + samtools_result = PileupTestUtils.build_query_bases_with_samtoolspipe(self.fn, "-f", self.fn_fasta) with pysam.FastaFile(self.fn_fasta) as fasta: - pysam_result = build_query_bases_with_pysam(self.fn, fastafile=fasta, stepper="samtools") + pysam_result = PileupTestUtils.build_query_bases_with_pysam(self.fn, fastafile=fasta, stepper="samtools") self.assertEqual(["".join(x) for x in pysam_result], samtools_result) def test_pileup_query_qualities_are_equal(self): - samtools_result = build_query_qualities_with_samtoolspipe(self.fn) - pysam_result = build_query_qualities_with_pysam(self.fn) + samtools_result = PileupTestUtils.build_query_qualities_with_samtoolspipe(self.fn) + pysam_result = PileupTestUtils.build_query_qualities_with_pysam(self.fn) pysam_result = [ [chr(min(126, x + 33)) for x in l] for l in pysam_result] self.assertEqual("".join(flatten_nested_list(pysam_result)), "".join(flatten_nested_list(samtools_result))) def test_pileup_mapping_qualities_are_equal(self): - samtools_result = build_mapping_qualities_with_samtoolspipe(self.fn) - pysam_result = build_mapping_qualities_with_pysam(self.fn) + samtools_result = PileupTestUtils.build_mapping_qualities_with_samtoolspipe(self.fn) + pysam_result = PileupTestUtils.build_mapping_qualities_with_pysam(self.fn) # convert to chars pysam_result = [ [chr(min(126, x + 33)) for x in l] for l in pysam_result] @@ -372,8 +380,8 @@ class PileUpColumnTests(unittest.TestCase): "".join(flatten_nested_list(samtools_result))) def test_pileup_query_qualities_from_pileups_are_equal(self): - samtools_result = build_query_qualities_with_samtoolspipe(self.fn) - pysam_result = build_query_qualities_with_pysam_pileups(self.fn) + samtools_result = PileupTestUtils.build_query_qualities_with_samtoolspipe(self.fn) + pysam_result = PileupTestUtils.build_query_qualities_with_pysam_pileups(self.fn) pysam_result = [ "".join([chr(min(126, x + 33)) for x in l]) for l in pysam_result] diff --git a/tests/AlignmentFile_test.py b/tests/AlignmentFile_test.py index 700018c..73a7b0a 100644 --- a/tests/AlignmentFile_test.py +++ b/tests/AlignmentFile_test.py @@ -9,8 +9,6 @@ import unittest import os import shutil import sys -import re -import copy import collections import subprocess import logging @@ -24,7 +22,7 @@ from functools import partial import pysam import pysam.samtools -from TestUtils import checkBinaryEqual, checkURL, \ +from TestUtils import checkBinaryEqual, check_url, \ check_samtools_view_equal, checkFieldEqual, force_str, \ get_temp_filename, BAM_DATADIR @@ -1572,7 +1570,7 @@ class TestRemoteFileFTP(unittest.TestCase): def testFTPView(self): return - if not checkURL(self.url): + if not check_url(self.url): return result = pysam.samtools.view(self.url, self.region) @@ -1580,7 +1578,7 @@ class TestRemoteFileFTP(unittest.TestCase): def testFTPFetch(self): return - if not checkURL(self.url): + if not check_url(self.url): return samfile = pysam.AlignmentFile(self.url, "rb") @@ -1595,7 +1593,7 @@ class TestRemoteFileHTTP(unittest.TestCase): local = os.path.join(BAM_DATADIR, "ex1.bam") def testView(self): - if not checkURL(self.url): + if not check_url(self.url): return samfile_local = pysam.AlignmentFile(self.local, "rb") @@ -1605,7 +1603,7 @@ class TestRemoteFileHTTP(unittest.TestCase): self.assertEqual(len(result.splitlines()), len(ref)) def testFetch(self): - if not checkURL(self.url): + if not check_url(self.url): return with pysam.AlignmentFile(self.url, "rb") as samfile: @@ -1619,7 +1617,7 @@ class TestRemoteFileHTTP(unittest.TestCase): self.assertEqual(x.compare(y), 0) def testFetchAll(self): - if not checkURL(self.url): + if not check_url(self.url): return with pysam.AlignmentFile(self.url, "rb") as samfile: @@ -2206,7 +2204,7 @@ class TestExplicitIndex(unittest.TestCase): samfile.fetch("chr1") def testRemoteExplicitIndexBAM(self): - if not checkURL( + if not check_url( "http://genserv.anat.ox.ac.uk/downloads/pysam/test/noindex.bam"): return @@ -2261,7 +2259,7 @@ class TestHeader1000Genomes(unittest.TestCase): def testRead(self): - if not checkURL(self.bamfile): + if not check_url(self.bamfile): return f = pysam.AlignmentFile(self.bamfile, "rb") diff --git a/tests/TestUtils.py b/tests/TestUtils.py index f4fe8e3..7811b9e 100644 --- a/tests/TestUtils.py +++ b/tests/TestUtils.py @@ -132,7 +132,7 @@ def check_samtools_view_equal( return True -def checkURL(url): +def check_url(url): '''return True if URL is available. A URL might not be available if it is the wrong URL diff --git a/tests/faidx_test.py b/tests/faidx_test.py index c618a92..3126e23 100644 --- a/tests/faidx_test.py +++ b/tests/faidx_test.py @@ -5,7 +5,7 @@ import gzip import copy import shutil -from TestUtils import checkURL, BAM_DATADIR, get_temp_filename +from TestUtils import check_url, BAM_DATADIR, get_temp_filename class TestFastaFile(unittest.TestCase): @@ -220,7 +220,7 @@ class TestRemoteFileFTP(unittest.TestCase): "GRCh38_reference_genome/GRCh38_full_analysis_set_plus_decoy_hla.fa") def testFTPView(self): - if not checkURL(self.url): + if not check_url(self.url): return with pysam.Fastafile(self.url) as f: @@ -229,7 +229,7 @@ class TestRemoteFileFTP(unittest.TestCase): 1000) def test_sequence_lengths_are_available(self): - if not checkURL(self.url): + if not check_url(self.url): return with pysam.Fastafile(self.url) as f: diff --git a/tests/pysam_data/example_reverse_complement.sam b/tests/pysam_data/example_reverse_complement.sam new file mode 100644 index 0000000..795d19c --- /dev/null +++ b/tests/pysam_data/example_reverse_complement.sam @@ -0,0 +1,28 @@ +@SQ SN:chr1 LN:249250621 +@SQ SN:chr10 LN:135534747 +@SQ SN:chr11 LN:135006516 +@SQ SN:chr12 LN:133851895 +@SQ SN:chr13 LN:115169878 +@SQ SN:chr14 LN:107349540 +@SQ SN:chr15 LN:102531392 +@SQ SN:chr16 LN:90354753 +@SQ SN:chr17 LN:81195210 +@SQ SN:chr18 LN:78077248 +@SQ SN:chr19 LN:59128983 +@SQ SN:chr2 LN:243199373 +@SQ SN:chr20 LN:63025520 +@SQ SN:chr21 LN:48129895 +@SQ SN:chr22 LN:51304566 +@SQ SN:chr3 LN:198022430 +@SQ SN:chr4 LN:191154276 +@SQ SN:chr5 LN:180915260 +@SQ SN:chr6 LN:171115067 +@SQ SN:chr7 LN:159138663 +@SQ SN:chr8 LN:146364022 +@SQ SN:chr9 LN:141213431 +@SQ SN:chrM LN:16571 +@SQ SN:chrX LN:155270560 +@SQ SN:chrY LN:59373566 +@PG ID:bwa PN:bwa VN:0.7.12-r1039 CL:bwa mem /ifs/mirror/genomes/bwa-0.7.5a/hg19 both.fq +chr22:25000000-25000500 0 chr22 25000000 60 501M * 0 0 GCTCTGTGCCGCAAGGGTGGAAACTGTGAGAGACAGATTCCAACTCCACGTCTGGGTAGTAAGCATCCAGTCCAGGGCTGTAGGCAGTCCTGGGGAAGACGCCAGAGATCTGTGCATTCTCATATCGAGGGATAGCGACTCCAGGCTGGGGGCTGGCAGGGTAAGGGGTGGGTGGGTCCTGGGCTTACCCGCAGGTCTGCAGACTTCCTGGGGCCAGCTGACCTCGTAAAATCCCTTTTGTCTAAGCTTCAGTTTCCTGCCTGTGAATGGGGTTGGGGCTGTGCTCTGGTTTCACCCTTGTGGCTCTGGGGTTGTGGTGACAAAGCCATCAAGCTGGGTTGAAGGATTAACCAGGAAACTTCAGACTGGCTGCCGTGTCTACCTCTTCCTCATACTCCTCTCTCTGCTGCATCCTGGGAAGCTGCTCTGCTCAGCCTAAATGAGGCTCAGTTGTGTGTGTGCGCACGTGCTTGCACGTGTGTTGGAAGTGGGTGATACTGA GCTCTGTGCCGCAAGGGTGGAAACTGTGAGAGACAGATTCCAACTCCACGTCTGGGTAGTAAGCATCCAGTCCAGGGCTGTAGGCAGTCCTGGGGAAGACGCCAGAGATCTGTGCATTCTCATATCGAGGGATAGCGACTCCAGGCTGGGGGCTGGCAGGGTAAGGGGTGGGTGGGTCCTGGGCTTACCCGCAGGTCTGCAGACTTCCTGGGGCCAGCTGACCTCGTAAAATCCCTTTTGTCTAAGCTTCAGTTTCCTGCCTGTGAATGGGGTTGGGGCTGTGCTCTGGTTTCACCCTTGTGGCTCTGGGGTTGTGGTGACAAAGCCATCAAGCTGGGTTGAAGGATTAACCAGGAAACTTCAGACTGGCTGCCGTGTCTACCTCTTCCTCATACTCCTCTCTCTGCTGCATCCTGGGAAGCTGCTCTGCTCAGCCTAAATGAGGCTCAGTTGTGTGTGTGCGCACGTGCTTGCACGTGTGTTGGAAGTGGGTGATACTGA NM:i:0 MD:Z:501 AS:i:501 XS:i:343 +chr22:25000000-25000500 16 chr22 25000000 60 501M * 0 0 GCTCTGTGCCGCAAGGGTGGAAACTGTGAGAGACAGATTCCAACTCCACGTCTGGGTAGTAAGCATCCAGTCCAGGGCTGTAGGCAGTCCTGGGGAAGACGCCAGAGATCTGTGCATTCTCATATCGAGGGATAGCGACTCCAGGCTGGGGGCTGGCAGGGTAAGGGGTGGGTGGGTCCTGGGCTTACCCGCAGGTCTGCAGACTTCCTGGGGCCAGCTGACCTCGTAAAATCCCTTTTGTCTAAGCTTCAGTTTCCTGCCTGTGAATGGGGTTGGGGCTGTGCTCTGGTTTCACCCTTGTGGCTCTGGGGTTGTGGTGACAAAGCCATCAAGCTGGGTTGAAGGATTAACCAGGAAACTTCAGACTGGCTGCCGTGTCTACCTCTTCCTCATACTCCTCTCTCTGCTGCATCCTGGGAAGCTGCTCTGCTCAGCCTAAATGAGGCTCAGTTGTGTGTGTGCGCACGTGCTTGCACGTGTGTTGGAAGTGGGTGATACTGA CGAGACACGGCGTTCCCACCTTTGACACTCTCTGTCTAAGGTTGAGGTGCAGACCCATCATTCGTAGGTCAGGTCCCGACATCCGTCAGGACCCCTTCTGCGGTCTCTAGACACGTAAGAGTATAGCTCCCTATCGCTGAGGTCCGACCCCCGACCGTCCCATTCCCCACCCACCCAGGACCCGAATGGGCGTCCAGACGTCTGAAGGACCCCGGTCGACTGGAGCATTTTAGGGAAAACAGATTCGAAGTCAAAGGACGGACACTTACCCCAACCCCGACACGAGACCAAAGTGGGAACACCGAGACCCCAACACCACTGTTTCGGTAGTTCGACCCAACTTCCTAATTGGTCCTTTGAAGTCTGACCGACGGCACAGATGGAGAAGGAGTATGAGGAGAGAGACGACGTAGGACCCTTCGACGAGACGAGTCGGATTTACTCCGAGTCAACACACACACGCGTGCACGAACGTGCACACAACCTTCACCCACTATGACT NM:i:0 MD:Z:501 AS:i:501 XS:i:343 diff --git a/tests/tabix_test.py b/tests/tabix_test.py index 013ff86..265c60a 100644 --- a/tests/tabix_test.py +++ b/tests/tabix_test.py @@ -16,7 +16,7 @@ import glob import re import copy import tempfile -from TestUtils import checkURL, load_and_convert, TABIX_DATADIR, get_temp_filename +from TestUtils import check_url, load_and_convert, TABIX_DATADIR, get_temp_filename IS_PYTHON3 = sys.version_info[0] >= 3 @@ -1046,7 +1046,7 @@ class TestRemoteFileHTTP(unittest.TestCase): local = os.path.join(TABIX_DATADIR, "example.gtf.gz") def setUp(self): - if not pysam.config.HAVE_LIBCURL or not checkURL(self.url): + if not pysam.config.HAVE_LIBCURL or not check_url(self.url): self.remote_file = None else: self.remote_file = pysam.TabixFile(self.url, "r") @@ -1085,7 +1085,7 @@ class TestRemoteFileHTTPWithHeader(TestRemoteFileHTTP): local = os.path.join(TABIX_DATADIR, "example_comments.gtf.gz") def setUp(self): - if not pysam.config.HAVE_LIBCURL or not checkURL(self.url): + if not pysam.config.HAVE_LIBCURL or not check_url(self.url): self.remote_file = None else: self.remote_file = pysam.TabixFile(self.url, "r") -- 2.30.2