Release notes
=============
+Release 0.14.1
+==============
+
+This is mostly a bugfix release, though bcftools has now also been
+upgraded to 1.7.0.
+
+* [#621] Add a warning to count_coverage when an alignment has an
+ empty QUAL field
+* [#635] Speed-up of AlignedSegment.find_intro()
+* treat border case of all bases in pileup column below quality score
+* [#634] Fix access to pileup reference_sequence
+
Release 0.14.0
==============
return iupac[(int)a][(int)b];
}
+static inline int iupac_consistent(char iupac, char nt)
+{
+ static const char iupac_mask[90] = {
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,14,2,
+ 13,0,0,4,11,0,0,12,0,3,15,0,0,0,5,6,8,0,7,9,0,10
+ };
+ if ( iupac > 89 ) return 0;
+ if ( nt > 90 ) nt -= 32; // lowercase
+ if ( nt=='A' ) nt = 1;
+ else if ( nt=='C' ) nt = 2;
+ else if ( nt=='G' ) nt = 4;
+ else if ( nt=='T' ) nt = 8;
+ return iupac_mask[(int)iupac] & nt ? 1 : 0;
+}
+
static inline char nt_to_upper(char nt)
{
if ( nt < 97 ) return nt;
rbuf_t vcf_rbuf;
bcf1_t **vcf_buf;
int nvcf_buf, rid;
+ char *chr;
regidx_t *mask;
regitr_t *itr;
free(chain->block_lengths);
free(chain);
chain = NULL;
+ free(args->chr);
+ args->chr = NULL;
}
static void print_chain(args_t *args)
score += chain->block_lengths[n];
}
score += last_block_size;
- fprintf(args->fp_chain, "chain %d %s %d + %d %d %s %d + %d %d %d\n", score, bcf_hdr_id2name(args->hdr,args->rid), ref_end_pos, chain->ori_pos, ref_end_pos, bcf_hdr_id2name(args->hdr,args->rid), alt_end_pos, chain->ori_pos, alt_end_pos, ++args->chain_id);
+ fprintf(args->fp_chain, "chain %d %s %d + %d %d %s %d + %d %d %d\n", score, args->chr, ref_end_pos, chain->ori_pos, ref_end_pos, args->chr, alt_end_pos, chain->ori_pos, alt_end_pos, ++args->chain_id);
for (n=0; n<chain->num; n++) {
fprintf(args->fp_chain, "%d %d %d\n", chain->block_lengths[n], chain->ref_gaps[n], chain->alt_gaps[n]);
}
if ( args->vcf_buf[i] ) bcf_destroy1(args->vcf_buf[i]);
free(args->vcf_buf);
free(args->fa_buf.s);
+ free(args->chr);
if ( args->mask ) regidx_destroy(args->mask);
if ( args->itr ) regitr_destroy(args->itr);
if ( args->chain_fname )
else to--;
}
}
+ args->chr = strdup(line);
args->rid = bcf_hdr_name2id(args->hdr,line);
if ( args->rid<0 ) fprintf(stderr,"Warning: Sequence \"%s\" not in %s\n", line,args->fname);
args->fa_buf.l = 0;
if ( regidx_overlap(args->mask, chr,start,end,NULL) ) return;
}
- int i, ialt = 1;
+ int i, ialt = 1; // the alternate allele
if ( args->isample >= 0 )
{
bcf_unpack(rec, BCF_UN_FMT);
{
char ial = rec->d.allele[ialt][0];
char jal = rec->d.allele[jalt][0];
+ if ( !ialt ) ialt = jalt; // only ialt is used, make sure 0/1 is not ignored
rec->d.allele[ialt][0] = gt2iupac(ial,jal);
}
}
static void mask_region(args_t *args, char *seq, int len)
{
- char *chr = (char*)bcf_hdr_id2name(args->hdr,args->rid);
int start = args->fa_src_pos - len;
int end = args->fa_src_pos;
- if ( !regidx_overlap(args->mask, chr,start,end, args->itr) ) return;
+ if ( !regidx_overlap(args->mask, args->chr,start,end, args->itr) ) return;
int idx_start, idx_end, i;
while ( regitr_overlap(args->itr) )
rbuf_t vcf_rbuf;
bcf1_t **vcf_buf;
int nvcf_buf, rid;
+ char *chr;
regidx_t *mask;
regitr_t *itr;
free(chain->block_lengths);
free(chain);
chain = NULL;
+ free(args->chr);
+ args->chr = NULL;
}
static void print_chain(args_t *args)
score += chain->block_lengths[n];
}
score += last_block_size;
- fprintf(args->fp_chain, "chain %d %s %d + %d %d %s %d + %d %d %d\n", score, bcf_hdr_id2name(args->hdr,args->rid), ref_end_pos, chain->ori_pos, ref_end_pos, bcf_hdr_id2name(args->hdr,args->rid), alt_end_pos, chain->ori_pos, alt_end_pos, ++args->chain_id);
+ fprintf(args->fp_chain, "chain %d %s %d + %d %d %s %d + %d %d %d\n", score, args->chr, ref_end_pos, chain->ori_pos, ref_end_pos, args->chr, alt_end_pos, chain->ori_pos, alt_end_pos, ++args->chain_id);
for (n=0; n<chain->num; n++) {
fprintf(args->fp_chain, "%d %d %d\n", chain->block_lengths[n], chain->ref_gaps[n], chain->alt_gaps[n]);
}
if ( args->vcf_buf[i] ) bcf_destroy1(args->vcf_buf[i]);
free(args->vcf_buf);
free(args->fa_buf.s);
+ free(args->chr);
if ( args->mask ) regidx_destroy(args->mask);
if ( args->itr ) regitr_destroy(args->itr);
if ( args->chain_fname )
else to--;
}
}
+ args->chr = strdup(line);
args->rid = bcf_hdr_name2id(args->hdr,line);
if ( args->rid<0 ) fprintf(bcftools_stderr,"Warning: Sequence \"%s\" not in %s\n", line,args->fname);
args->fa_buf.l = 0;
if ( regidx_overlap(args->mask, chr,start,end,NULL) ) return;
}
- int i, ialt = 1;
+ int i, ialt = 1; // the alternate allele
if ( args->isample >= 0 )
{
bcf_unpack(rec, BCF_UN_FMT);
{
char ial = rec->d.allele[ialt][0];
char jal = rec->d.allele[jalt][0];
+ if ( !ialt ) ialt = jalt; // only ialt is used, make sure 0/1 is not ignored
rec->d.allele[ialt][0] = gt2iupac(ial,jal);
}
}
static void mask_region(args_t *args, char *seq, int len)
{
- char *chr = (char*)bcf_hdr_id2name(args->hdr,args->rid);
int start = args->fa_src_pos - len;
int end = args->fa_src_pos;
- if ( !regidx_overlap(args->mask, chr,start,end, args->itr) ) return;
+ if ( !regidx_overlap(args->mask, args->chr,start,end, args->itr) ) return;
int idx_start, idx_end, i;
while ( regitr_overlap(args->itr) )
/* convert.c -- functions for converting between VCF/BCF and related formats.
- Copyright (C) 2013-2017 Genome Research Ltd.
+ Copyright (C) 2013-2018 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
int ndat;
char *undef_info_tag;
int allow_undef_tags;
+ uint8_t **subset_samples;
};
typedef struct
}
return ((int32_t*)bcf_array)[idx];
}
+static inline void _copy_field(char *src, uint32_t len, int idx, kstring_t *str)
+{
+ int n = 0, ibeg = 0;
+ while ( src[ibeg] && ibeg<len && n < idx )
+ {
+ if ( src[ibeg]==',' ) n++;
+ ibeg++;
+ }
+ if ( ibeg==len ) { kputc('.', str); return; }
+
+ int iend = ibeg;
+ while ( src[iend] && src[iend]!=',' && iend<len ) iend++;
+
+ if ( iend>ibeg )
+ kputsn(src+ibeg, iend-ibeg, str);
+ else
+ kputc('.', str);
+}
static void process_info(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str)
{
if ( fmt->id<0 )
case BCF_BT_INT16: BRANCH(int16_t, val==bcf_int16_missing, val==bcf_int16_vector_end, kputw(val, str)); break;
case BCF_BT_INT32: BRANCH(int32_t, val==bcf_int32_missing, val==bcf_int32_vector_end, kputw(val, str)); break;
case BCF_BT_FLOAT: BRANCH(float, bcf_float_is_missing(val), bcf_float_is_vector_end(val), kputd(val, str)); break;
+ case BCF_BT_CHAR: _copy_field((char*)info->vptr, info->vptr_len, fmt->subscript, str); break;
default: fprintf(stderr,"todo: type %d\n", info->type); exit(1); break;
}
#undef BRANCH
else
kputw(ival, str);
}
+ else if ( fmt->fmt->type == BCF_BT_CHAR )
+ _copy_field((char*)(fmt->fmt->p + isample*fmt->fmt->size), fmt->fmt->size, fmt->subscript, str);
else error("TODO: %s:%d .. fmt->type=%d\n", __FILE__,__LINE__, fmt->fmt->type);
}
else
}
for (js=0; js<convert->nsamples; js++)
{
+ // Skip samples when filtering was requested
+ if ( *convert->subset_samples && !(*convert->subset_samples)[js] ) continue;
+
// Here comes a hack designed for TBCSQ. When running on large files,
// such as 1000GP, there are too many empty fields in the output and
// it's very very slow. Therefore in case the handler does not add
case allow_undef_tags:
convert->allow_undef_tags = va_arg(args, int);
break;
+ case subset_samples:
+ convert->subset_samples = va_arg(args, uint8_t**);
+ break;
default:
ret = -1;
}
/* convert.c -- functions for converting between VCF/BCF and related formats.
- Copyright (C) 2013-2017 Genome Research Ltd.
+ Copyright (C) 2013-2018 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
int ndat;
char *undef_info_tag;
int allow_undef_tags;
+ uint8_t **subset_samples;
};
typedef struct
}
return ((int32_t*)bcf_array)[idx];
}
+static inline void _copy_field(char *src, uint32_t len, int idx, kstring_t *str)
+{
+ int n = 0, ibeg = 0;
+ while ( src[ibeg] && ibeg<len && n < idx )
+ {
+ if ( src[ibeg]==',' ) n++;
+ ibeg++;
+ }
+ if ( ibeg==len ) { kputc('.', str); return; }
+
+ int iend = ibeg;
+ while ( src[iend] && src[iend]!=',' && iend<len ) iend++;
+
+ if ( iend>ibeg )
+ kputsn(src+ibeg, iend-ibeg, str);
+ else
+ kputc('.', str);
+}
static void process_info(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str)
{
if ( fmt->id<0 )
case BCF_BT_INT16: BRANCH(int16_t, val==bcf_int16_missing, val==bcf_int16_vector_end, kputw(val, str)); break;
case BCF_BT_INT32: BRANCH(int32_t, val==bcf_int32_missing, val==bcf_int32_vector_end, kputw(val, str)); break;
case BCF_BT_FLOAT: BRANCH(float, bcf_float_is_missing(val), bcf_float_is_vector_end(val), kputd(val, str)); break;
+ case BCF_BT_CHAR: _copy_field((char*)info->vptr, info->vptr_len, fmt->subscript, str); break;
default: fprintf(bcftools_stderr,"todo: type %d\n", info->type); exit(1); break;
}
#undef BRANCH
else
kputw(ival, str);
}
+ else if ( fmt->fmt->type == BCF_BT_CHAR )
+ _copy_field((char*)(fmt->fmt->p + isample*fmt->fmt->size), fmt->fmt->size, fmt->subscript, str);
else error("TODO: %s:%d .. fmt->type=%d\n", __FILE__,__LINE__, fmt->fmt->type);
}
else
}
for (js=0; js<convert->nsamples; js++)
{
+ // Skip samples when filtering was requested
+ if ( *convert->subset_samples && !(*convert->subset_samples)[js] ) continue;
+
// Here comes a hack designed for TBCSQ. When running on large files,
// such as 1000GP, there are too many empty fields in the output and
// it's very very slow. Therefore in case the handler does not add
case allow_undef_tags:
convert->allow_undef_tags = va_arg(args, int);
break;
+ case subset_samples:
+ convert->subset_samples = va_arg(args, uint8_t**);
+ break;
default:
ret = -1;
}
typedef struct _convert_t convert_t;
enum convert_option
{
- allow_undef_tags
+ allow_undef_tags,
+ subset_samples,
};
convert_t *convert_init(bcf_hdr_t *hdr, int *samples, int nsamples, const char *str);
/* The MIT License
- Copyright (c) 2016 Genome Research Ltd.
+ Copyright (c) 2016-2018 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
id = tbl->nstr++;
hts_expand(char*, tbl->nstr, tbl->mstr, tbl->str);
tbl->str[id] = strdup(ss);
- int ret = khash_str2int_set(tbl->str2id, tbl->str[id], id);
+ khash_str2int_set(tbl->str2id, tbl->str[id], id);
}
*se = tmp;
}
}
+
void hap_finalize(args_t *args, hap_t *hap)
{
tscript_t *tr = hap->tr;
}
dlen += hap->stack[i].node->dlen;
if ( hap->stack[i].node->dlen ) indel = 1;
- if ( i<istack )
+
+ // This condition extends compound variants. Note that s/s/s sites are forced out to always break
+ // a compound block. See ENST00000271583/splice-acceptor.vcf for motivation.
+ if ( i<istack && hap->stack[i+1].node->type != HAP_SSS )
{
if ( dlen%3 ) // frameshift
{
int32_t *gt = args->gt_arr + args->smpl->idx[ismpl]*ngts;
if ( gt[0]==bcf_gt_missing ) continue;
- if ( ngts>1 && gt[0]!=gt[1] && gt[1]!=bcf_gt_missing && gt[1]!=bcf_int32_vector_end )
+ if ( ngts>1 && gt[1]!=bcf_gt_missing && gt[1]!=bcf_int32_vector_end && bcf_gt_allele(gt[0])!=bcf_gt_allele(gt[1]) )
{
if ( args->phase==PHASE_MERGE )
{
if ( !bcf_gt_is_phased(gt[0]) && !bcf_gt_is_phased(gt[1]) )
{
if ( args->phase==PHASE_REQUIRE )
- error("Unphased genotype at %s:%d, sample %s. See the --phase option.\n", chr,rec->pos+1,args->hdr->samples[args->smpl->idx[ismpl]]);
+ error("Unphased heterozygous genotype at %s:%d, sample %s. See the --phase option.\n", chr,rec->pos+1,args->hdr->samples[args->smpl->idx[ismpl]]);
if ( args->phase==PHASE_SKIP )
continue;
if ( args->phase==PHASE_NON_REF )
int call_csq = 1;
if ( !rec->n_allele ) call_csq = 0; // no alternate allele
else if ( rec->n_allele==2 && (rec->d.allele[1][0]=='<' || rec->d.allele[1][0]=='*') ) call_csq = 0; // gVCF, no alt allele
+ else if ( rec->d.allele[1][0]=='<' && rec->d.allele[1][0]!='*') call_csq = 0; // a symbolic allele, not ready for CNVs etc
else if ( args->filter )
{
call_csq = filter_test(args->filter, rec, NULL);
" -c, --custom-tag <string> use this tag instead of the default BCSQ\n"
" -l, --local-csq localized predictions, consider only one VCF record at a time\n"
" -n, --ncsq <int> maximum number of consequences to consider per site [16]\n"
- " -p, --phase <a|m|r|R|s> how to construct haplotypes and how to deal with unphased data: [r]\n"
+ " -p, --phase <a|m|r|R|s> how to handle unphased heterozygous genotypes: [r]\n"
" a: take GTs as is, create haplotypes regardless of phase (0/1 -> 0|1)\n"
" m: merge *all* GTs into a single haplotype (0/1 -> 1, 1/2 -> 1)\n"
" r: require phased GTs, throw an error on unphased het GTs\n"
" R: create non-reference haplotypes if possible (0/1 -> 1|1, 1/2 -> 1|2)\n"
- " s: skip unphased GTs\n"
+ " s: skip unphased hets\n"
"Options:\n"
" -e, --exclude <expr> exclude sites for which the expression is true\n"
" -i, --include <expr> select sites for which the expression is true\n"
/* The MIT License
- Copyright (c) 2016 Genome Research Ltd.
+ Copyright (c) 2016-2018 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
id = tbl->nstr++;
hts_expand(char*, tbl->nstr, tbl->mstr, tbl->str);
tbl->str[id] = strdup(ss);
- int ret = khash_str2int_set(tbl->str2id, tbl->str[id], id);
+ khash_str2int_set(tbl->str2id, tbl->str[id], id);
}
*se = tmp;
}
}
+
void hap_finalize(args_t *args, hap_t *hap)
{
tscript_t *tr = hap->tr;
}
dlen += hap->stack[i].node->dlen;
if ( hap->stack[i].node->dlen ) indel = 1;
- if ( i<istack )
+
+ // This condition extends compound variants. Note that s/s/s sites are forced out to always break
+ // a compound block. See ENST00000271583/splice-acceptor.vcf for motivation.
+ if ( i<istack && hap->stack[i+1].node->type != HAP_SSS )
{
if ( dlen%3 ) // frameshift
{
int32_t *gt = args->gt_arr + args->smpl->idx[ismpl]*ngts;
if ( gt[0]==bcf_gt_missing ) continue;
- if ( ngts>1 && gt[0]!=gt[1] && gt[1]!=bcf_gt_missing && gt[1]!=bcf_int32_vector_end )
+ if ( ngts>1 && gt[1]!=bcf_gt_missing && gt[1]!=bcf_int32_vector_end && bcf_gt_allele(gt[0])!=bcf_gt_allele(gt[1]) )
{
if ( args->phase==PHASE_MERGE )
{
if ( !bcf_gt_is_phased(gt[0]) && !bcf_gt_is_phased(gt[1]) )
{
if ( args->phase==PHASE_REQUIRE )
- error("Unphased genotype at %s:%d, sample %s. See the --phase option.\n", chr,rec->pos+1,args->hdr->samples[args->smpl->idx[ismpl]]);
+ error("Unphased heterozygous genotype at %s:%d, sample %s. See the --phase option.\n", chr,rec->pos+1,args->hdr->samples[args->smpl->idx[ismpl]]);
if ( args->phase==PHASE_SKIP )
continue;
if ( args->phase==PHASE_NON_REF )
int call_csq = 1;
if ( !rec->n_allele ) call_csq = 0; // no alternate allele
else if ( rec->n_allele==2 && (rec->d.allele[1][0]=='<' || rec->d.allele[1][0]=='*') ) call_csq = 0; // gVCF, no alt allele
+ else if ( rec->d.allele[1][0]=='<' && rec->d.allele[1][0]!='*') call_csq = 0; // a symbolic allele, not ready for CNVs etc
else if ( args->filter )
{
call_csq = filter_test(args->filter, rec, NULL);
" -c, --custom-tag <string> use this tag instead of the default BCSQ\n"
" -l, --local-csq localized predictions, consider only one VCF record at a time\n"
" -n, --ncsq <int> maximum number of consequences to consider per site [16]\n"
- " -p, --phase <a|m|r|R|s> how to construct haplotypes and how to deal with unphased data: [r]\n"
+ " -p, --phase <a|m|r|R|s> how to handle unphased heterozygous genotypes: [r]\n"
" a: take GTs as is, create haplotypes regardless of phase (0/1 -> 0|1)\n"
" m: merge *all* GTs into a single haplotype (0/1 -> 1, 1/2 -> 1)\n"
" r: require phased GTs, throw an error on unphased het GTs\n"
" R: create non-reference haplotypes if possible (0/1 -> 1|1, 1/2 -> 1|2)\n"
- " s: skip unphased GTs\n"
+ " s: skip unphased hets\n"
"Options:\n"
" -e, --exclude <expr> exclude sites for which the expression is true\n"
" -i, --include <expr> select sites for which the expression is true\n"
/* filter.c -- filter expressions.
- Copyright (C) 2013-2015 Genome Research Ltd.
+ Copyright (C) 2013-2018 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
char *tag; // for debugging and printout only, VCF tag name
double threshold; // filtering threshold
int hdr_id, type; // BCF header lookup ID and one of BCF_HT_* types
- int idx; // 0-based index to VCF vectors, -1: not a vector,
+ int idx; // 0-based index to VCF vectors,
// -2: list (e.g. [0,1,2] or [1..3] or [1..] or any field[*], which is equivalent to [0..])
- int *idxs, nidxs; // set indexes to 0 to exclude, to 1 to include, and last element negative if unlimited
+ int *idxs; // set indexes to 0 to exclude, to 1 to include, and last element negative if unlimited
+ int nidxs, nuidxs; // size of idxs array and the number of elements set to 1
+ uint8_t *usmpl; // bitmask of used samples as set by idx
+ int nsamples; // number of samples for format fields, 0 for info and other fields
void (*setter)(filter_t *, bcf1_t *, struct _token_t *);
- int (*comparator)(struct _token_t *, struct _token_t *, int op_type, bcf1_t *);
+ int (*func)(filter_t *, bcf1_t *, struct _token_t *rtok, struct _token_t **stack, int nstack);
+ void (*comparator)(struct _token_t *, struct _token_t *, struct _token_t *rtok, bcf1_t *);
void *hash; // test presence of str value in the hash via comparator
regex_t *regex; // precompiled regex for string comparison
int is_str, is_missing; // is_missing is set only for constants, variables are controled via nvalues
int pass_site; // -1 not applicable, 0 fails, >0 pass
uint8_t *pass_samples; // status of individual samples
- int nsamples; // number of samples
int nvalues, mvalues; // number of used values: n=0 for missing values, n=1 for scalars, for strings n=str_value.l
- int nstr1; // per-sample string length, set only with str_value.l>0 && nsamples>1
+ int nval1; // number of per-sample fields or string length
}
token_t;
#define TOK_ABS 23
#define TOK_LEN 24
#define TOK_FUNC 25
+#define TOK_CNT 26
-// 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
-// ( ) [ < = > ] ! | & + - * / M m a A O ~ ^ S . l
-static int op_prec[] = {0,1,1,5,5,5,5,5,5,2,3, 6, 6, 7, 7, 8, 8, 8, 3, 2, 5, 5, 8, 8, 8};
+// 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26
+// ( ) [ < = > ] ! | & + - * / M m a A O ~ ^ S . l
+static int op_prec[] = {0,1,1,5,5,5,5,5,5,2,3, 6, 6, 7, 7, 8, 8, 8, 3, 2, 5, 5, 8, 8, 8, 8, 8};
#define TOKEN_STRING "x()[<=>]!|&+-*/MmaAO~^f"
static int filters_next_token(char **str, int *len)
if ( !strncasecmp(tmp,"AVG(",4) ) { (*str) += 3; return TOK_AVG; }
if ( !strncasecmp(tmp,"SUM(",4) ) { (*str) += 3; return TOK_SUM; }
if ( !strncasecmp(tmp,"ABS(",4) ) { (*str) += 3; return TOK_ABS; }
+ if ( !strncasecmp(tmp,"COUNT(",4) ) { (*str) += 5; return TOK_CNT; }
if ( !strncasecmp(tmp,"STRLEN(",7) ) { (*str) += 6; return TOK_LEN; }
if ( !strncasecmp(tmp,"%MAX(",5) ) { (*str) += 4; return TOK_MAX; } // for backward compatibility
if ( !strncasecmp(tmp,"%MIN(",5) ) { (*str) += 4; return TOK_MIN; } // for backward compatibility
{
float *ptr = &line->qual;
if ( bcf_float_is_missing(*ptr) )
- tok->nvalues = 0;
+ bcf_double_set_missing(tok->values[0]);
else
- {
tok->values[0] = (double)line->qual;
- tok->nvalues = 1;
- }
+ tok->nvalues = 1;
}
static void filters_set_type(filter_t *flt, bcf1_t *line, token_t *tok)
{
if ( line->d.info[i].key == tok->hdr_id ) break;
if ( i==line->n_info )
- tok->nvalues = 0;
+ tok->nvalues = tok->str_value.l = 0;
else if ( line->d.info[i].type==BCF_BT_CHAR )
{
int n = line->d.info[i].len;
}
}
}
-static int filters_cmp_bit_and(token_t *atok, token_t *btok, int op_type, bcf1_t *line)
+static void filters_cmp_bit_and(token_t *atok, token_t *btok, token_t *rtok, bcf1_t *line)
{
int a = (int)(atok->nvalues?atok->values[0]:atok->threshold);
int b = (int)(btok->nvalues?btok->values[0]:btok->threshold);
- if ( op_type==TOK_LIKE ) return a&b ? 1 : 0;
- return a&b ? 0 : 1;
+ if ( rtok->tok_type==TOK_LIKE )
+ rtok->pass_site = a&b ? 1 : 0;
+ else
+ rtok->pass_site = a&b ? 0 : 1;
}
-static int filters_cmp_filter(token_t *atok, token_t *btok, int op_type, bcf1_t *line)
+static void filters_cmp_filter(token_t *atok, token_t *btok, token_t *rtok, bcf1_t *line)
{
int i;
- if ( op_type==TOK_NE ) // AND logic: none of the filters can match
+ if ( rtok->tok_type==TOK_NE ) // AND logic: none of the filters can match
{
if ( !line->d.n_flt )
{
- if ( atok->hdr_id==-1 ) return 0; // missing value
- return 1; // no filter present, eval to true
+ if ( atok->hdr_id==-1 ) return; // missing value
+ rtok->pass_site = 1;
+ return; // no filter present, eval to true
}
for (i=0; i<line->d.n_flt; i++)
- if ( atok->hdr_id==line->d.flt[i] ) return 0;
- return 1;
+ if ( atok->hdr_id==line->d.flt[i] ) return;
+ rtok->pass_site = 1;
+ return;
}
- // TOK_EQ with OR logic: at least one of the filters must match
- if ( !line->d.n_flt )
+ else if ( rtok->tok_type==TOK_EQ ) // OR logic: at least one of the filters must match
{
- if ( atok->hdr_id==-1 ) return 1;
- return 0; // no filter present, eval to false
+ if ( !line->d.n_flt )
+ {
+ if ( atok->hdr_id==-1 ) { rtok->pass_site = 1; return; }
+ return; // no filter present, eval to false
+ }
+ for (i=0; i<line->d.n_flt; i++)
+ if ( atok->hdr_id==line->d.flt[i] ) { rtok->pass_site = 1; return; }
+ return;
}
- for (i=0; i<line->d.n_flt; i++)
- if ( atok->hdr_id==line->d.flt[i] ) return 1;
- return 0;
+ else
+ error("Only == and != operators are supported for FILTER\n");
+ return;
}
-static int filters_cmp_id(token_t *atok, token_t *btok, int op_type, bcf1_t *line)
+static void filters_cmp_id(token_t *atok, token_t *btok, token_t *rtok, bcf1_t *line)
{
// multiple IDs not supported yet (easy to add though)
+ if ( rtok->tok_type!=TOK_EQ && rtok->tok_type!=TOK_NE )
+ error("Only == and != operators are supported for ID\n");
if ( btok->hash )
{
if ( atok->hash )
{
int ret = khash_str2int_has_key(atok->hash, line->d.id);
- if ( op_type==TOK_EQ ) return ret;
- return ret ? 0 : 1;
+ if ( rtok->tok_type==TOK_NE ) ret = ret ? 0 : 1;
+ rtok->pass_site = ret;
+ return;
}
- if ( op_type==TOK_EQ ) return strcmp(btok->str_value.s,line->d.id) ? 0 : 1;
- return strcmp(btok->str_value.s,line->d.id) ? 1 : 0;
+ if ( rtok->tok_type==TOK_EQ )
+ rtok->pass_site = strcmp(btok->str_value.s,line->d.id) ? 0 : 1;
+ else
+ rtok->pass_site = strcmp(btok->str_value.s,line->d.id) ? 1 : 0;
}
/**
}
else
{
- int64_t value;
+ int64_t value = 0;
if ( bcf_get_info_value(line,tok->hdr_id,tok->idx,&value) <= 0 )
tok->nvalues = 0;
else
{
flt->tmps.l = 0;
ks_resize(&flt->tmps, n);
- int i, end = tok->idxs[tok->nidxs-1] < 0 ? n - 1 : tok->nidxs - 1;
- if ( end >= n ) end = n - 1;
+ int i, iend = tok->idxs[tok->nidxs-1] < 0 ? n - 1 : tok->nidxs - 1;
+ if ( iend >= n ) iend = n - 1;
char *beg = tok->str_value.s, *dst = flt->tmps.s;
- for (i=0; i<=end; i++)
+ for (i=0; i<=iend; i++)
{
char *end = beg;
while ( *end && *end!=',' ) end++;
-
if ( i>=tok->nidxs || tok->idxs[i] )
{
memcpy(dst, beg, end - beg);
dst[0] = ',';
dst++;
}
-
beg = end+1;
}
dst[0] = 0;
static void filters_set_format_int(filter_t *flt, bcf1_t *line, token_t *tok)
{
- int i;
- if ( (tok->nvalues=bcf_get_format_int32(flt->hdr,line,tok->tag,&flt->tmpi,&flt->mtmpi))<0 )
+ if ( line->n_sample != tok->nsamples )
+ error("Incorrect number of FORMAT fields at %s:%d .. %s, %d vs %d\n", bcf_seqname(flt->hdr,line),line->pos+1,tok->tag,line->n_sample,tok->nsamples);
+
+ int nvals;
+ if ( (nvals=bcf_get_format_int32(flt->hdr,line,tok->tag,&flt->tmpi,&flt->mtmpi))<0 )
{
- tok->nvalues = tok->nsamples = 0;
+ tok->nvalues = 0;
return;
}
- if ( tok->idx >= -1 ) // scalar or vector index
+ int i, nsrc1 = nvals / tok->nsamples;
+ tok->nval1 = tok->idx >= 0 ? 1 : (tok->nuidxs ? tok->nuidxs : nsrc1);
+ tok->nvalues = tok->nval1*tok->nsamples;
+ hts_expand(double, tok->nvalues, tok->mvalues, tok->values);
+
+ if ( tok->idx >= 0 ) // scalar or vector index
{
- hts_expand(double,flt->nsamples,tok->mvalues,tok->values);
- int nvals = tok->nvalues / line->n_sample;
- int idx = tok->idx >= 0 ? tok->idx : 0;
- int is_missing = 1;
- int32_t *ptr = flt->tmpi;
- for (i=0; i<line->n_sample; i++)
+ for (i=0; i<tok->nsamples; i++)
{
- if ( ptr[idx]==bcf_int32_missing || ptr[idx]==bcf_int32_vector_end )
+ if ( !tok->usmpl[i] ) continue;
+ int32_t *ptr = flt->tmpi + i*nsrc1;
+ if ( tok->idx>=nsrc1 || ptr[tok->idx]==bcf_int32_missing || ptr[tok->idx]==bcf_int32_vector_end )
bcf_double_set_missing(tok->values[i]);
else
- {
- tok->values[i] = ptr[idx];
- is_missing = 0;
- }
- ptr += nvals;
+ tok->values[i] = ptr[tok->idx];
}
- if ( is_missing ) tok->nvalues = 0;
- else tok->nvalues = line->n_sample;
- tok->nsamples = tok->nvalues;
- return;
}
- if ( tok->idx == -2 )
+ else
{
- hts_expand(double,tok->nvalues,tok->mvalues,tok->values);
- int nvals = tok->nvalues / line->n_sample;
- int idx = tok->idx >= 0 ? tok->idx : 0;
- int is_missing = 1;
- int k, j = 0, end = tok->idxs[tok->nidxs-1] < 0 ? nvals - 1 : tok->nidxs - 1;
- if ( end >= nvals ) end = nvals - 1;
- int32_t *ptr = flt->tmpi;
- for (i=0; i<line->n_sample; i++)
+ int kend = tok->idxs[tok->nidxs-1] < 0 ? tok->nval1 : tok->nidxs;
+ for (i=0; i<tok->nsamples; i++)
{
- for (k=0; k<=end; k++)
- if ( k>=tok->nidxs || tok->idxs[k] )
- {
- if ( ptr[k]==bcf_int32_missing || ptr[k]==bcf_int32_vector_end )
- bcf_double_set_missing(tok->values[j]);
- else
- {
- tok->values[j] = ptr[k];
- is_missing = 0;
- }
- j++;
- }
- ptr += nvals;
- }
- if ( is_missing ) tok->nvalues = tok->nsamples = 0;
- else
- {
- tok->nsamples = line->n_sample;
- tok->nvalues = j;
+ if ( !tok->usmpl[i] ) continue;
+ int32_t *src = flt->tmpi + i*nsrc1;
+ double *dst = tok->values + i*tok->nval1;
+ int k, j = 0;
+ for (k=0; k<kend; k++)
+ {
+ if ( k<tok->nidxs && !tok->idxs[k] ) continue;
+ if ( src[k]==bcf_int32_missing || src[k]==bcf_int32_vector_end )
+ bcf_double_set_missing(dst[j]);
+ else
+ dst[j] = src[k];
+ j++;
+ }
+ while (j < tok->nval1)
+ {
+ bcf_double_set_missing(dst[j]);
+ j++;
+ }
}
- return;
}
}
static void filters_set_format_float(filter_t *flt, bcf1_t *line, token_t *tok)
{
- int i;
- if ( (tok->nvalues=bcf_get_format_float(flt->hdr,line,tok->tag,&flt->tmpf,&flt->mtmpf))<0 )
+ if ( line->n_sample != tok->nsamples )
+ error("Incorrect number of FORMAT fields at %s:%d .. %s, %d vs %d\n", bcf_seqname(flt->hdr,line),line->pos+1,tok->tag,line->n_sample,tok->nsamples);
+
+ int nvals;
+ if ( (nvals=bcf_get_format_float(flt->hdr,line,tok->tag,&flt->tmpf,&flt->mtmpf))<0 )
{
- tok->nvalues = tok->nsamples = 0;
+ tok->nvalues = 0;
return;
}
- if ( tok->idx >= -1 ) // scalar or vector index
+ int i, nsrc1 = nvals / tok->nsamples;
+ tok->nval1 = tok->idx >= 0 ? 1 : (tok->nuidxs ? tok->nuidxs : nsrc1);
+ tok->nvalues = tok->nval1*tok->nsamples;
+ hts_expand(double, tok->nvalues, tok->mvalues, tok->values);
+
+ if ( tok->idx >= 0 ) // scalar or vector index
{
- hts_expand(double,flt->nsamples,tok->mvalues,tok->values);
- int nvals = tok->nvalues / line->n_sample;
- int idx = tok->idx >= 0 ? tok->idx : 0;
- int is_missing = 1;
- float *ptr = flt->tmpf;
- for (i=0; i<line->n_sample; i++)
+ for (i=0; i<tok->nsamples; i++)
{
- if ( bcf_float_is_missing(ptr[idx]) || bcf_float_is_vector_end(ptr[idx]) )
+ if ( !tok->usmpl[i] ) continue;
+ float *ptr = flt->tmpf + i*nsrc1;
+ if ( tok->idx>=nsrc1 || bcf_float_is_missing(ptr[tok->idx]) || bcf_float_is_vector_end(ptr[tok->idx]) )
bcf_double_set_missing(tok->values[i]);
else
- {
- tok->values[i] = ptr[idx];
- is_missing = 0;
- }
- ptr += nvals;
+ tok->values[i] = ptr[tok->idx];
}
- if ( is_missing ) tok->nvalues = 0;
- else tok->nvalues = line->n_sample;
- tok->nsamples = tok->nvalues;
- return;
}
- if ( tok->idx == -2 )
+ else
{
- hts_expand(double,tok->nvalues,tok->mvalues,tok->values);
- int nvals = tok->nvalues / line->n_sample;
- int idx = tok->idx >= 0 ? tok->idx : 0;
- int is_missing = 1;
- int k, j = 0, end = tok->idxs[tok->nidxs-1] < 0 ? nvals - 1 : tok->nidxs - 1;
- if ( end >= nvals ) end = nvals - 1;
- float *ptr = flt->tmpf;
- for (i=0; i<line->n_sample; i++)
+ int kend = tok->idxs[tok->nidxs-1] < 0 ? tok->nval1 : tok->nidxs;
+ for (i=0; i<tok->nsamples; i++)
{
- for (k=0; k<=end; k++)
- if ( k>=tok->nidxs || tok->idxs[k] )
- {
- if ( bcf_float_is_missing(ptr[k]) || bcf_float_is_vector_end(ptr[k]) )
- bcf_double_set_missing(tok->values[j]);
- else
- {
- tok->values[j] = ptr[k];
- is_missing = 0;
- }
- j++;
- }
- ptr += nvals;
- }
- if ( is_missing ) tok->nvalues = tok->nsamples = 0;
- else
- {
- tok->nsamples = line->n_sample;
- tok->nvalues = j;
+ if ( !tok->usmpl[i] ) continue;
+ float *src = flt->tmpf + i*nsrc1;
+ double *dst = tok->values + i*tok->nval1;
+ int k, j = 0;
+ for (k=0; k<kend; k++)
+ {
+ if ( k<tok->nidxs && !tok->idxs[k] ) continue;
+ if ( bcf_float_is_missing(src[k]) || bcf_float_is_vector_end(src[k]) )
+ bcf_double_set_missing(dst[j]);
+ else
+ dst[j] = src[k];
+ j++;
+ }
+ while (j < tok->nval1)
+ {
+ bcf_double_set_missing(dst[j]);
+ j++;
+ }
}
- return;
}
}
static void filters_set_format_string(filter_t *flt, bcf1_t *line, token_t *tok)
{
- tok->str_value.l = tok->nvalues = 0;
- if ( !line->n_sample ) return;
+ if ( line->n_sample != tok->nsamples )
+ error("Incorrect number of FORMAT fields at %s:%d .. %s, %d vs %d\n", bcf_seqname(flt->hdr,line),line->pos+1,tok->tag,line->n_sample,tok->nsamples);
- int ndim = tok->str_value.m;
+ int i, ndim = tok->str_value.m;
int nstr = bcf_get_format_char(flt->hdr, line, tok->tag, &tok->str_value.s, &ndim);
tok->str_value.m = ndim;
+ tok->str_value.l = tok->nvalues = 0;
- if ( nstr<=0 ) return;
-
- if ( tok->idx == -1 || (tok->idx==-2 && tok->idxs[0]==-1) ) // scalar or keep all values of a vector: TAG[*]
- {
- tok->nsamples = line->n_sample;
- tok->nstr1 = ndim / line->n_sample;
- tok->nvalues = tok->str_value.l = nstr;
- return;
- }
-
- int nstr1 = nstr / line->n_sample;
+ if ( nstr<0 ) return;
- // vector, one or multiple indices
- int i;
- for (i=0; i<line->n_sample; i++)
+ tok->nvalues = tok->str_value.l = nstr;
+ tok->nval1 = nstr / tok->nsamples;
+ for (i=0; i<tok->nsamples; i++)
{
- char *dst = tok->str_value.s + i*nstr1, *str = dst;
- int nval = 0, ibeg = 0;
- while ( ibeg < nstr1 )
+ if ( !tok->usmpl[i] ) continue;
+ char *src = tok->str_value.s + i*tok->nval1, *dst = src;
+ int ibeg = 0, idx = 0;
+ while ( ibeg < tok->nval1 )
{
- int iend = ibeg + 1;
- while ( iend < nstr1 && str[iend] && str[iend]!=',' ) iend++;
+ int iend = ibeg;
+ while ( iend < tok->nval1 && src[iend] && src[iend]!=',' ) iend++;
int keep = 0;
- if ( tok->idx >=0 )
- keep = tok->idx==nval ? 1 : 0;
- else if ( nval < tok->nidxs )
- keep = tok->idxs[nval] ? 1 : 0;
+ if ( tok->idx >= 0 )
+ {
+ if ( tok->idx==idx ) keep = 1;
+ }
+ else if ( idx < tok->nidxs )
+ {
+ if ( tok->idxs[idx] != 0 ) keep = 1;
+ }
else if ( tok->idxs[tok->nidxs-1] < 0 )
keep = 1;
if ( keep )
{
- if ( ibeg>0 ) memmove(dst, str+ibeg, iend-ibeg+1);
+ if ( ibeg!=0 ) memmove(dst, src+ibeg, iend-ibeg+1);
dst += iend - ibeg + 1;
if ( tok->idx>=0 ) break;
}
- if ( !str[iend] ) break;
+ if ( !src[iend] ) break;
ibeg = iend + 1;
- nval++;
+ idx++;
}
- if ( dst==str ) { dst[0] = '.'; dst+=2; }
- if ( dst - str < nstr1 ) memset(dst-1, 0, nstr1 - (dst - str));
+ if ( dst==src ) { dst[0] = '.'; dst+=2; }
+ if ( dst - src < tok->nval1 ) memset(dst-1, 0, tok->nval1 - (dst - src));
}
- tok->nvalues = tok->str_value.l = nstr;
- tok->nstr1 = nstr1;
- tok->nsamples = line->n_sample;
}
static void _filters_set_genotype(filter_t *flt, bcf1_t *line, token_t *tok, int type)
{
return;
}
- int i,j, nsmpl = bcf_hdr_nsamples(flt->hdr), nvals = type==2 ? 3 : 4;
- if ( tok->str_value.m <= nvals*nsmpl )
+ int i,j, nsmpl = bcf_hdr_nsamples(flt->hdr), nvals1 = type==2 ? 3 : 4;
+ if ( tok->str_value.m <= nvals1*nsmpl )
{
- tok->str_value.m = nvals*nsmpl + 1;
+ tok->str_value.m = nvals1*nsmpl + 1;
tok->str_value.s = (char*)realloc(tok->str_value.s, tok->str_value.m);
}
if ( bcf_gt_allele(ial)!=bcf_gt_allele(jal) ) is_het = 1; \
} \
} \
- char *dst = &tok->str_value.s[nvals*i]; \
- if ( !j || missing ) dst[0]='.', dst[1]=0; /* ., missing genotype */ \
+ char *dst = &tok->str_value.s[nvals1*i]; \
+ if ( type==4 ) \
+ { \
+ if ( !j || missing ) dst[0]='m', dst[1]='i', dst[2]='s', dst[3] = 0; /* mis, missing genotype */ \
+ else if ( !has_ref ) dst[0]='a', dst[1]='l', dst[2]='t', dst[3] = 0; /* alt, no ref, must have alt allele */ \
+ else if ( !is_het ) dst[0]='r', dst[1]='e', dst[2]='f', dst[3] = 0; /* ref, must be ref-only, no alt alelle */ \
+ else dst[0]='a', dst[1]='l', dst[2]='t', dst[3] = 0; /* alt, is het, has alt allele */ \
+ } \
+ else if ( !j || missing ) dst[0]='.', dst[1]=0; /* ., missing genotype */ \
else if ( type==3 ) \
{ \
if ( j==1 ) dst[0]='h', dst[1]='a', dst[2]='p', dst[3] = 0; /* hap, haploid */ \
default: error("The GT type is not lineognised: %d at %s:%d\n",fmt->type, bcf_seqname(flt->hdr,line),line->pos+1); break;
}
#undef BRANCH_INT
- tok->nsamples = nsmpl;
- tok->nvalues = tok->str_value.l = nvals*nsmpl;
+ assert( tok->nsamples == nsmpl );
+ tok->nvalues = tok->str_value.l = nvals1*nsmpl;
tok->str_value.s[tok->str_value.l] = 0;
- tok->nstr1 = nvals;
+ tok->nval1 = nvals1;
}
static void filters_set_genotype2(filter_t *flt, bcf1_t *line, token_t *tok) { _filters_set_genotype(flt, line, tok, 2); }
static void filters_set_genotype3(filter_t *flt, bcf1_t *line, token_t *tok) { _filters_set_genotype(flt, line, tok, 3); }
+static void filters_set_genotype4(filter_t *flt, bcf1_t *line, token_t *tok) { _filters_set_genotype(flt, line, tok, 4); }
static void filters_set_genotype_string(filter_t *flt, bcf1_t *line, token_t *tok)
{
bcf_fmt_t *fmt = bcf_get_fmt(flt->hdr, line, "GT");
if ( !fmt )
{
- tok->nvalues = tok->nsamples = 0;
+ tok->nvalues = 0;
return;
}
- int i, blen = 4, nsmpl = bcf_hdr_nsamples(flt->hdr);
- kstring_t str;
+ int i, blen = 4, nsmpl = line->n_sample;
gt_length_too_big:
tok->str_value.l = 0;
for (i=0; i<nsmpl; i++)
{
- int plen = tok->str_value.l;
-
+ size_t plen = tok->str_value.l;
bcf_format_gt(fmt, i, &tok->str_value);
kputc_(0, &tok->str_value);
if ( tok->str_value.l - plen > blen )
plen++;
}
}
- tok->nsamples = nsmpl;
+ assert( tok->nsamples == nsmpl );
tok->nvalues = tok->str_value.l;
- tok->nstr1 = blen;
+ tok->nval1 = blen;
}
static void filters_set_ref_string(filter_t *flt, bcf1_t *line, token_t *tok)
{
}
else if ( tok->idx==-2 )
{
- int i, j = 0, end = tok->idxs[tok->nidxs-1] < 0 ? line->n_allele - 1 : tok->nidxs - 1;
+ int i, end = tok->nuidxs ? tok->nuidxs : line->n_allele - 1;
if ( end >= line->n_allele - 1 ) end = line->n_allele - 2;
for (i=0; i<=end; i++)
if ( i>=tok->nidxs || tok->idxs[i] )
}
}
-static void set_max(filter_t *flt, bcf1_t *line, token_t *tok)
+static int func_max(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
{
+ token_t *tok = stack[nstack - 1];
+ rtok->nvalues = 0;
+ if ( !tok->nvalues ) return 1;
double val = -HUGE_VAL;
- int i;
+ int i, has_value = 0;
for (i=0; i<tok->nvalues; i++)
{
- if ( !bcf_double_is_missing(tok->values[i]) && val < tok->values[i] ) val = tok->values[i];
+ if ( bcf_double_is_missing(tok->values[i]) || bcf_double_is_vector_end(tok->values[i]) ) continue;
+ has_value = 1;
+ if ( val < tok->values[i] ) val = tok->values[i];
+ }
+ if ( has_value )
+ {
+ rtok->values[0] = val;
+ rtok->nvalues = has_value;
}
- tok->values[0] = val;
- tok->nvalues = 1;
- tok->nsamples = 0;
+ return 1;
}
-static void set_min(filter_t *flt, bcf1_t *line, token_t *tok)
+static int func_min(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
{
+ token_t *tok = stack[nstack - 1];
+ rtok->nvalues = 0;
+ if ( !tok->nvalues ) return 1;
double val = HUGE_VAL;
- int i;
+ int i, has_value = 0;
for (i=0; i<tok->nvalues; i++)
- if ( !bcf_double_is_missing(tok->values[i]) && val > tok->values[i] ) val = tok->values[i];
- tok->values[0] = val;
- tok->nvalues = 1;
- tok->nsamples = 0;
+ {
+ if ( bcf_double_is_missing(tok->values[i]) || bcf_double_is_vector_end(tok->values[i]) ) continue;
+ has_value = 1;
+ if ( val > tok->values[i] ) val = tok->values[i];
+ }
+ if ( has_value )
+ {
+ rtok->values[0] = val;
+ rtok->nvalues = has_value;
+ }
+ return 1;
}
-static void set_avg(filter_t *flt, bcf1_t *line, token_t *tok)
+static int func_avg(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
{
+ token_t *tok = stack[nstack - 1];
+ rtok->nvalues = 0;
+ if ( !tok->nvalues ) return 1;
double val = 0;
int i, n = 0;
for (i=0; i<tok->nvalues; i++)
if ( !bcf_double_is_missing(tok->values[i]) ) { val += tok->values[i]; n++; }
- tok->values[0] = n ? val / n : 0;
- tok->nvalues = 1;
- tok->nsamples = 0;
+ if ( n )
+ {
+ rtok->values[0] = val / n;
+ rtok->nvalues = 1;
+ }
+ return 1;
}
-static void set_sum(filter_t *flt, bcf1_t *line, token_t *tok)
+static int func_sum(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
{
+ rtok->nvalues = 0;
+ token_t *tok = stack[nstack - 1];
+ if ( !tok->nvalues ) return 1;
double val = 0;
int i, n = 0;
for (i=0; i<tok->nvalues; i++)
if ( !bcf_double_is_missing(tok->values[i]) ) { val += tok->values[i]; n++; }
- tok->values[0] = val;
- tok->nvalues = 1;
- tok->nsamples = 0;
+ if ( n )
+ {
+ rtok->values[0] = val;
+ rtok->nvalues = 1;
+ }
+ return 1;
}
-static void set_abs(filter_t *flt, bcf1_t *line, token_t *tok)
+static int func_abs(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
{
+ token_t *tok = stack[nstack - 1];
if ( tok->is_str ) error("ABS() can be applied only on numeric values\n");
+
+ rtok->nvalues = tok->nvalues;
+ if ( !tok->nvalues ) return 1;
+ hts_expand(double, rtok->nvalues, rtok->mvalues, rtok->values);
int i;
for (i=0; i<tok->nvalues; i++)
- tok->values[i] = fabs(tok->values[i]);
+ if ( bcf_double_is_missing(tok->values[i]) ) bcf_double_set_missing(rtok->values[i]);
+ else rtok->values[i] = fabs(tok->values[i]);
+ return 1;
+}
+static int func_count(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
+{
+ token_t *tok = stack[nstack - 1];
+ if ( !tok->nsamples ) error("COUNT() can be applied only on FORMAT fields\n");
+
+ int i, cnt = 0;
+ for (i=0; i<tok->nsamples; i++)
+ if ( tok->pass_samples[i] ) cnt++;
+
+ rtok->nvalues = 1;
+ rtok->values[0] = cnt;
+ return 1;
}
-static void set_strlen(filter_t *flt, bcf1_t *line, token_t *tok)
+static int func_strlen(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
{
- tok->is_str = 0;
- if ( !tok->str_value.l ) return;
+ token_t *tok = stack[nstack - 1];
+ rtok->nvalues = rtok->str_value.l = 0;
+ if ( !tok->str_value.l ) return 1;
+
if ( tok->idx==-2 )
{
int i = 0;
{
char *se = ss;
while ( *se && *se!=',' ) se++;
- hts_expand(double, i+1, tok->mvalues, tok->values);
- if ( !*se ) tok->values[i] = strlen(ss);
+ hts_expand(double, i+1, rtok->mvalues, rtok->values);
+ if ( !*se ) rtok->values[i] = strlen(ss);
else
{
*se = 0;
- tok->values[i] = strlen(ss);
+ rtok->values[i] = strlen(ss);
*se = ',';
}
ss = *se ? se + 1 : se;
i++;
}
- tok->nvalues = i;
+ rtok->nvalues = i;
}
else
{
- tok->values[0] = strlen(tok->str_value.s);
- tok->nvalues = 1;
+ rtok->values[0] = strlen(tok->str_value.s);
+ rtok->nvalues = 1;
}
- tok->str_value.l = 0;
+ return 1;
+}
+inline static void tok_init_values(token_t *atok, token_t *btok, token_t *rtok)
+{
+ token_t *tok = atok->nvalues > btok->nvalues ? atok : btok;
+ rtok->nvalues = tok->nvalues;
+ rtok->nval1 = tok->nval1;
+ hts_expand(double*, rtok->nvalues, rtok->mvalues, rtok->values);
}
-#define VECTOR_ARITHMETICS(atok,btok,AOP) \
+inline static void tok_init_samples(token_t *atok, token_t *btok, token_t *rtok)
+{
+ if ( (atok->nsamples || btok->nsamples) && !rtok->nsamples )
+ {
+ rtok->nsamples = atok->nsamples ? atok->nsamples : btok->nsamples;
+ rtok->usmpl = (uint8_t*) calloc(rtok->nsamples,1);
+ int i;
+ for (i=0; i<atok->nsamples; i++) rtok->usmpl[i] |= atok->usmpl[i];
+ for (i=0; i<btok->nsamples; i++) rtok->usmpl[i] |= btok->usmpl[i];
+ }
+ memset(rtok->pass_samples, 0, rtok->nsamples);
+}
+
+#define VECTOR_ARITHMETICS(atok,btok,_rtok,AOP) \
{ \
+ token_t *rtok = _rtok; \
int i, has_values = 0; \
- if ( !(atok)->nvalues || !(btok)->nvalues ) /* missing values */ \
+ if ( atok->nvalues && btok->nvalues ) \
{ \
- (atok)->nvalues = 0; (atok)->nsamples = 0; \
- } \
- else \
- { \
- if ( ((atok)->nsamples && (btok)->nsamples) || (!(atok)->nsamples && !(btok)->nsamples)) \
- { \
- for (i=0; i<(atok)->nvalues; i++) \
- { \
- if ( bcf_double_is_missing((atok)->values[i]) ) continue; \
- if ( bcf_double_is_missing((btok)->values[i]) ) { bcf_double_set_missing((atok)->values[i]); continue; } \
- has_values = 1; \
- (atok)->values[i] = (atok)->values[i] AOP (btok)->values[i]; \
- } \
- } \
- else if ( (btok)->nsamples ) \
+ tok_init_values(atok, btok, rtok); \
+ tok_init_samples(atok, btok, rtok); \
+ if ( (atok->nsamples && btok->nsamples) || (!atok->nsamples && !btok->nsamples)) \
{ \
- hts_expand(double,(btok)->nvalues,(atok)->mvalues,(atok)->values); \
- for (i=0; i<(btok)->nvalues; i++) \
+ assert( atok->nsamples==btok->nsamples ); \
+ for (i=0; i<atok->nvalues; i++) \
{ \
- if ( bcf_double_is_missing((atok)->values[0]) || bcf_double_is_missing((btok)->values[i]) ) \
+ if ( bcf_double_is_missing(atok->values[i]) || bcf_double_is_missing(btok->values[i]) ) \
{ \
- bcf_double_set_missing((atok)->values[i]); \
+ bcf_double_set_missing(rtok->values[i]); \
continue; \
} \
has_values = 1; \
- (atok)->values[i] = (atok)->values[0] AOP (btok)->values[i]; \
+ rtok->values[i] = atok->values[i] AOP btok->values[i]; \
} \
- (atok)->nvalues = (btok)->nvalues; \
- (atok)->nsamples = (btok)->nsamples; \
} \
- else if ( (atok)->nsamples ) \
+ else \
{ \
- for (i=0; i<(atok)->nvalues; i++) \
+ token_t *xtok = atok->nsamples ? atok : btok; \
+ token_t *ytok = atok->nsamples ? btok : atok; \
+ assert( ytok->nvalues==1 ); \
+ if ( !bcf_double_is_missing(ytok->values[0]) ) \
{ \
- if ( bcf_double_is_missing((atok)->values[i]) || bcf_double_is_missing((btok)->values[0]) ) \
+ for (i=0; i<xtok->nvalues; i++) \
{ \
- bcf_double_set_missing((atok)->values[i]); \
- continue; \
+ if ( bcf_double_is_missing(xtok->values[i]) ) \
+ { \
+ bcf_double_set_missing(rtok->values[i]); \
+ continue; \
+ } \
+ has_values = 1; \
+ rtok->values[i] = xtok->values[i] AOP ytok->values[0]; \
} \
- has_values = 1; \
- (atok)->values[i] = (atok)->values[i] AOP (btok)->values[0]; \
} \
} \
} \
- if ( !has_values ) { (atok)->nvalues = 0; (atok)->nsamples = 0; } \
+ if ( !has_values ) rtok->nvalues = 0; \
}
-static int vector_logic_and(token_t *atok, token_t *btok, int and_type)
+static int vector_logic_or(filter_t *filter, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
{
- // We are comparing either two scalars (result of INFO tag vs a threshold), two vectors (two FORMAT fields),
- // or a vector and a scalar (FORMAT field vs threshold)
- int i, pass_site = 0;
- if ( !atok->nvalues || !btok->nvalues )
- {
- atok->nvalues = atok->nsamples = 0;
- return 0;
- }
- if ( !atok->nsamples && !btok->nsamples ) return atok->pass_site && btok->pass_site;
- if ( atok->nsamples && btok->nsamples )
+ token_t *atok = stack[nstack-2];
+ token_t *btok = stack[nstack-1];
+ tok_init_samples(atok, btok, rtok);
+
+ if ( !atok->pass_site && !btok->pass_site ) return 2;
+
+ rtok->pass_site = 1;
+ if ( !atok->nsamples && !btok->nsamples ) return 2;
+
+ int i;
+ if ( rtok->tok_type==TOK_OR_VEC ) // ||, select all samples if one is true
{
- if ( and_type==TOK_AND )
+ if ( (!atok->nsamples && !atok->pass_site) || (!btok->nsamples && !btok->pass_site) )
{
- // perform AND within a sample
- for (i=0; i<atok->nsamples; i++)
+ // These two conditions are to ensure the following does not set all samples
+ // at sites with QUAL<=30:
+ // QUAL>30 || FMT/GQ>30
+
+ token_t *tok = atok->nsamples ? atok : btok;
+ for (i=0; i<rtok->nsamples; i++)
{
- atok->pass_samples[i] = atok->pass_samples[i] && btok->pass_samples[i];
- if ( !pass_site && atok->pass_samples[i] ) pass_site = 1;
+ if ( !rtok->usmpl[i] ) continue;
+ rtok->pass_samples[i] = tok->pass_samples[i];
}
}
else
{
- // perform AND across samples
- int pass_a = 0, pass_b = 0;
- for (i=0; i<atok->nsamples; i++)
- {
- if ( atok->pass_samples[i] ) pass_a = 1;
- atok->pass_samples[i] = atok->pass_samples[i] && btok->pass_samples[i];
- }
- for (i=0; i<btok->nsamples; i++)
+ for (i=0; i<rtok->nsamples; i++)
{
- if ( btok->pass_samples[i] ) { pass_b = 1; break; }
+ if ( !rtok->usmpl[i] ) continue;
+ rtok->pass_samples[i] = 1;
}
- pass_site = pass_a && pass_b;
}
- return pass_site;
+ return 2;
}
- if ( btok->nsamples )
+
+ // |, only select samples which are actually true
+
+ if ( !atok->nsamples || !btok->nsamples )
{
- for (i=0; i<btok->nsamples; i++)
+ token_t *tok = atok->nsamples ? atok : btok;
+ for (i=0; i<rtok->nsamples; i++)
{
- atok->pass_samples[i] = atok->pass_site && btok->pass_samples[i];
- if ( !pass_site && atok->pass_samples[i] ) pass_site = 1;
+ if ( !rtok->usmpl[i] ) continue;
+ rtok->pass_samples[i] = tok->pass_samples[i];
}
- atok->nsamples = btok->nsamples;
- return pass_site;
+ return 2;
}
- /* atok->nsamples!=0 */
- for (i=0; i<atok->nsamples; i++)
+
+ assert( atok->nsamples==btok->nsamples );
+
+ for (i=0; i<rtok->nsamples; i++)
{
- atok->pass_samples[i] = atok->pass_samples[i] && btok->pass_site;
- if ( !pass_site && atok->pass_samples[i] ) pass_site = 1;
+ if ( !rtok->usmpl[i] ) continue;
+ rtok->pass_samples[i] = atok->pass_samples[i] | btok->pass_samples[i];
}
- return pass_site;
+ return 2;
}
-static int vector_logic_or(token_t *atok, token_t *btok, int or_type)
+static int vector_logic_and(filter_t *filter, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
{
- int i, pass_site = 0;
- if ( !atok->nvalues && !btok->nvalues ) // missing sites in both
- {
- atok->nvalues = atok->nsamples = 0;
- return 0;
- }
- if ( !atok->nvalues ) // missing value in a
- {
- for (i=0; i<btok->nsamples; i++)
- atok->pass_samples[i] = btok->pass_samples[i];
- atok->nsamples = btok->nsamples;
- atok->nvalues = 1;
- return btok->pass_site;
- }
- if ( !btok->nvalues ) // missing value in b
- {
- btok->nvalues = 1;
- return atok->pass_site;
- }
+ token_t *atok = stack[nstack-2];
+ token_t *btok = stack[nstack-1];
+ tok_init_samples(atok, btok, rtok);
- if ( !atok->nsamples && !btok->nsamples ) return atok->pass_site || btok->pass_site;
- if ( !atok->nsamples )
+ if ( !atok->pass_site || !btok->pass_site ) return 2;
+ if ( !atok->nsamples && !btok->nsamples ) { rtok->pass_site = 1; return 2; }
+
+ int i;
+ if ( !atok->nsamples || !btok->nsamples )
{
- if ( or_type==TOK_OR )
+ token_t *tok = atok->nsamples ? atok : btok;
+ for (i=0; i<rtok->nsamples; i++)
{
- for (i=0; i<btok->nsamples; i++)
- {
- atok->pass_samples[i] = btok->pass_samples[i];
- if ( atok->pass_site || atok->pass_samples[i] ) pass_site = 1;
- }
+ if ( !rtok->usmpl[i] ) continue;
+ rtok->pass_samples[i] = tok->pass_samples[i];
}
- else
- {
- for (i=0; i<btok->nsamples; i++)
- {
- atok->pass_samples[i] = atok->pass_site || btok->pass_samples[i];
- if ( atok->pass_samples[i] ) pass_site = 1;
- }
- }
- atok->nsamples = btok->nsamples;
- return pass_site;
+ rtok->pass_site = 1;
+ return 2;
}
- if ( !btok->nsamples ) // vector vs site
+
+ assert( atok->nsamples==btok->nsamples );
+ if ( rtok->tok_type==TOK_AND_VEC ) // &&, can be true in different samples
{
- if ( or_type==TOK_OR )
+ for (i=0; i<rtok->nsamples; i++)
{
- for (i=0; i<atok->nsamples; i++)
- if ( btok->pass_site || atok->pass_samples[i] ) pass_site = 1;
+ if ( !rtok->usmpl[i] ) continue;
+ rtok->pass_samples[i] = atok->pass_samples[i] | btok->pass_samples[i];
}
- else
- {
- for (i=0; i<atok->nsamples; i++)
- {
- atok->pass_samples[i] = atok->pass_samples[i] || btok->pass_site;
- if ( atok->pass_samples[i] ) pass_site = 1;
- }
- }
- return pass_site;
+ rtok->pass_site = 1;
}
- for (i=0; i<atok->nsamples; i++)
+ else // &, must be true within one sample
{
- atok->pass_samples[i] = atok->pass_samples[i] || btok->pass_samples[i];
- if ( !pass_site && atok->pass_samples[i] ) pass_site = 1;
+ for (i=0; i<rtok->nsamples; i++)
+ {
+ if ( !rtok->usmpl[i] ) continue;
+ rtok->pass_samples[i] = atok->pass_samples[i] & btok->pass_samples[i];
+ if ( rtok->pass_samples[i] ) rtok->pass_site = 1;
+ }
}
- return pass_site;
+ return 2;
}
#define CMP_MISSING(atok,btok,CMP_OP,ret) \
tok->nvalues = 1; \
}
-#define CMP_VECTORS(atok,btok,CMP_OP,ret) \
+#define CMP_VECTORS(atok,btok,_rtok,CMP_OP,missing_logic) \
{ \
- int i, j, has_values = 0, pass_site = 0; \
- if ( !(atok)->nvalues || !(btok)->nvalues ) { (atok)->nvalues = 0; (atok)->nsamples = 0; (ret) = 0; } \
- else \
+ token_t *rtok = _rtok; \
+ int i, j, k; \
+ assert( !atok->nsamples || !btok->nsamples ); \
+ tok_init_samples(atok, btok, rtok); \
+ if ( !atok->nsamples && !btok->nsamples ) \
{ \
- if ( (atok)->idx<=-2 || (btok)->idx<=-2 ) \
+ if ( !atok->nvalues && !btok->nvalues ) { rtok->pass_site = missing_logic[2]; } \
+ else if ( !atok->nvalues || !btok->nvalues ) \
{ \
- /* any field can match: [*] */ \
- for (i=0; i<(atok)->nvalues; i++) \
+ token_t *tok = atok->nvalues ? atok : btok; \
+ for (j=0; j<tok->nvalues; j++) \
{ \
- for (j=0; j<(btok)->nvalues; j++) \
- if ( (atok)->values[i] CMP_OP (btok)->values[j] ) { pass_site = 1; i = (atok)->nvalues; break; } \
+ if ( bcf_double_is_missing(tok->values[j]) ) \
+ { \
+ if ( missing_logic[2] ) { rtok->pass_site = 1; break; } \
+ } \
+ else if ( missing_logic[1] ) { rtok->pass_site = 1; break; } \
} \
} \
- else if ( (atok)->nsamples && (btok)->nsamples ) \
+ else \
{ \
- for (i=0; i<(atok)->nsamples; i++) \
+ for (i=0; i<atok->nvalues; i++) \
{ \
- has_values = 1; \
- if ( (atok)->values[i] CMP_OP (btok)->values[i] ) { (atok)->pass_samples[i] = 1; pass_site = 1; } \
- else (atok)->pass_samples[i] = 0; \
+ int amiss = bcf_double_is_missing(atok->values[i]) ? 1 : 0; \
+ for (j=0; j<btok->nvalues; j++) \
+ { \
+ int nmiss = amiss + (bcf_double_is_missing(btok->values[j]) ? 1 : 0); \
+ if ( nmiss ) \
+ { \
+ if ( missing_logic[nmiss] ) { rtok->pass_site = 1; i = atok->nvalues; break; } \
+ } \
+ else if ( atok->values[i] CMP_OP btok->values[j] ) { rtok->pass_site = 1; i = atok->nvalues; break; } \
+ } \
} \
- if ( !has_values ) (atok)->nvalues = 0; \
} \
- else if ( (atok)->nsamples ) \
+ } \
+ else \
+ { \
+ if ( !atok->nvalues && !btok->nvalues ) \
{ \
- for (i=0; i<(atok)->nsamples; i++) \
+ if ( missing_logic[2] ) \
{ \
- /*if ( bcf_double_is_missing((atok)->values[i]) ) { (atok)->pass_samples[i] = 0; continue; }*/ \
- has_values = 1; \
- if ( (atok)->values[i] CMP_OP (btok)->values[0] ) { (atok)->pass_samples[i] = 1; pass_site = 1; } \
- else (atok)->pass_samples[i] = 0; \
+ for (i=0; i<rtok->nsamples; i++) \
+ if ( rtok->usmpl[i] ) { rtok->pass_samples[i] = missing_logic[2]; rtok->pass_site = 1; } \
} \
- if ( !has_values ) (atok)->nvalues = 0; \
} \
- else if ( (btok)->nsamples ) \
+ else if ( !atok->nvalues || !btok->nvalues ) \
{ \
- for (i=0; i<(btok)->nsamples; i++) \
+ token_t *tok = atok->nvalues ? atok : btok; \
+ if ( !tok->nsamples ) \
{ \
- if ( bcf_double_is_missing((btok)->values[i]) ) { (atok)->pass_samples[i] = 0; continue; } \
- has_values = 1; \
- if ( (atok)->values[0] CMP_OP (btok)->values[i] ) { (atok)->pass_samples[i] = 1; pass_site = 1; } \
- else (atok)->pass_samples[i] = 0; \
+ int miss = 0; \
+ for (j=0; j<tok->nvalues; j++) \
+ miss |= bcf_double_is_missing(tok->values[j]) ? 1 : 0; \
+ if ( missing_logic[++miss] ) \
+ { \
+ for (i=0; i<rtok->nsamples; i++) \
+ if ( rtok->usmpl[i] ) { rtok->pass_samples[i] = missing_logic[miss]; rtok->pass_site = 1; } \
+ } \
} \
- (atok)->nvalues = (btok)->nvalues; \
- (atok)->nsamples = (btok)->nsamples; \
- if ( !has_values ) (atok)->nvalues = 0; \
+ else \
+ for (i=0; i<tok->nsamples; i++) \
+ { \
+ if ( !rtok->usmpl[i] ) continue; \
+ double *ptr = tok->values + i*tok->nval1; \
+ int miss = 0; \
+ for (j=0; j<tok->nval1; j++) \
+ miss |= bcf_double_is_missing(ptr[j]) ? 1 : 0; \
+ if ( missing_logic[++miss] ) { rtok->pass_samples[i] = missing_logic[miss]; rtok->pass_site = 1; } \
+ } \
} \
else \
{ \
- if ( (atok)->values[0] CMP_OP (btok)->values[0] ) { pass_site = 1; } \
+ token_t *xtok = atok->nsamples ? atok : btok; \
+ token_t *ytok = atok->nsamples ? btok : atok; \
+ for (i=0; i<xtok->nsamples; i++) \
+ { \
+ if ( !rtok->usmpl[i] ) continue; \
+ double *xptr = xtok->values + i*xtok->nval1; \
+ double *yptr = ytok->values + i*ytok->nval1; \
+ for (j=0; j<xtok->nval1; j++) \
+ { \
+ int miss = bcf_double_is_missing(xptr[j]) ? 1 : 0; \
+ if ( miss && !missing_logic[0] ) continue; /* any is missing => result is false */ \
+ for (k=0; k<ytok->nvalues; k++) \
+ { \
+ int nmiss = miss + (bcf_double_is_missing(yptr[k]) ? 1 : 0); \
+ if ( nmiss ) \
+ { \
+ if ( missing_logic[nmiss] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; j = xtok->nval1; break; } \
+ } \
+ else if ( xptr[j] CMP_OP yptr[k] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; j = xtok->nval1; break; } \
+ } \
+ } \
+ } \
} \
- /*fprintf(stderr,"pass=%d\n", pass_site);*/ \
- (ret) = pass_site; \
} \
}
-static int cmp_vector_strings(token_t *atok, token_t *btok, int logic) // logic: TOK_EQ or TOK_NE
+static int _regex_vector_strings(regex_t *regex, char *str, size_t len, int logic, int *missing_logic)
{
- if ( !atok->str_value.l ) { return 0; }
- if ( !btok->str_value.l ) { atok->str_value.l = 0; return 0; }
- int i, pass_site = 0;
- if ( atok->nsamples && atok->nsamples==btok->nsamples )
- {
- for (i=0; i<atok->nsamples; i++)
- {
- char *astr = atok->str_value.s + i*atok->nstr1;
- char *bstr = btok->str_value.s + i*btok->nstr1;
- char *aend = astr + atok->str_value.l, *a = astr;
- while ( a<aend && *a ) a++;
- char *bend = bstr + btok->str_value.l, *b = bstr;
- while ( b<bend && *b ) b++;
- if ( a-astr != b-bstr ) atok->pass_samples[i] = 0;
- else atok->pass_samples[i] = strncmp(astr,bstr,a-astr)==0 ? 1 : 0;
- if ( logic!=TOK_EQ )
- atok->pass_samples[i] = atok->pass_samples[i] ? 0 : 1;
- pass_site |= atok->pass_samples[i];
- }
- if ( !atok->nsamples ) atok->nsamples = btok->nsamples;
- }
- else if ( !atok->nsamples && !btok->nsamples )
- {
- if ( atok->idx==-2 || btok->idx==-2 )
- {
- // any field can match: [*]
- if ( atok->idx==-2 && btok->idx==-2 )
- error("fixme: Expected at least one scalar value [%s %s %s]\n", atok->tag ? atok->tag : btok->tag, atok->str_value,btok->str_value);
- token_t *xtok, *ytok; // xtok is scalar, ytok array
- if ( btok->idx==-2 ) { xtok = atok; ytok = btok; }
- else { xtok = btok; ytok = atok; }
- char *xstr = xtok->str_value.s, *xend = xstr + xtok->str_value.l;
- char *ystr = ytok->str_value.s, *yend = ystr + ytok->str_value.l, *y = ystr;
- while ( y<=yend )
- {
- if ( y==yend || *y==',' )
- {
- if ( y-ystr==xend-xstr && !strncmp(xstr,ystr,xend-xstr) )
- {
- pass_site = 1;
- break;
- }
- ystr = y+1;
- }
- y++;
- }
- }
- else
- pass_site = strcmp(atok->str_value.s,btok->str_value.s) ? 0 : 1;
- if ( logic!=TOK_EQ ) pass_site = pass_site ? 0 : 1;
+ char *end = str + len;
+ while ( str < end && *str )
+ {
+ char *mid = str;
+ while ( mid < end && *mid && *mid!=',' ) mid++;
+ int miss = mid - str == 1 && str[0]=='.' ? 1 : 0;
+ if ( miss && missing_logic[miss] ) return 1;
+ char tmp = *mid; *mid = 0;
+ int match = regexec(regex, str, 0,NULL,0) ? 0 : 1;
+ *mid = tmp;
+ if ( logic==TOK_NLIKE ) match = match ? 0 : 1;
+ if ( match ) return 1;
+ if ( !*mid ) break;
+ str = mid + 1;
}
- else
+ return 0;
+}
+static inline int _has_missing_string(char *beg)
+{
+ while ( *beg )
{
- token_t *xtok, *ytok;
- if ( !atok->nsamples ) { xtok = atok; ytok = btok; }
- else { xtok = btok; ytok = atok; }
- char *xstr = xtok->str_value.s;
- char *xend = xstr + xtok->str_value.l, *x = xstr;
- while ( x<xend && *x ) x++;
- for (i=0; i<ytok->nsamples; i++)
- {
- char *ystr = ytok->str_value.s + i*ytok->nstr1;
- char *ybeg = ystr, *yend = ystr + ytok->nstr1;
- int pass = 0;
- while ( ybeg < yend )
+ char *end = beg;
+ while ( *end && *end!=',' ) end++;
+ if ( end-beg==1 && beg[0]=='.' ) return 1;
+ if ( !*end ) break;
+ beg = end + 1;
+ }
+ return 0;
+}
+
+// Compare two strings with multiple fields, for example "A,B,.,C"=="X,Y,A".
+// Returns 1 if any field matches, otherwise returns 0
+static inline int _match_vector_strings(char *abeg, size_t alen, char *bstr, size_t blen, int logic, int *missing_logic)
+{
+ char *aend = abeg + alen;
+ char *bend = bstr + blen;
+ while ( abeg < aend && *abeg )
+ {
+ char *amid = abeg;
+ while ( amid < aend && *amid && *amid!=',' ) amid++;
+ int miss = amid - abeg == 1 && abeg[0]=='.' ? 1 : 0;
+ char *bbeg = bstr;
+ while ( bbeg < bend && *bbeg )
+ {
+ char *bmid = bbeg;
+ while ( bmid < bend && *bmid && *bmid!=',' ) bmid++;
+ int nmiss = miss + (bmid - bbeg == 1 && bbeg[0]=='.' ? 1 : 0);
+ if ( nmiss )
{
- char *y = ybeg;
- while ( y<yend && *y && *y!=',' ) y++;
- if ( y - ybeg != x - xstr ) pass = 0;
- else pass = strncmp(xstr,ybeg,x-xstr)==0 ? 1 : 0;
- if ( logic!=TOK_EQ ) pass = pass ? 0 : 1;
- if ( pass || !*y ) break;
- ybeg = y+1;
+ if ( missing_logic[nmiss] ) return 1;
}
- atok->pass_samples[i] = pass;
- pass_site |= pass;
+ else
+ {
+ int match = amid-abeg==bmid-bbeg && !strncmp(abeg,bbeg,amid-abeg) ? 1 : 0;
+ if ( logic==TOK_NE ) match = match==1 ? 0 : 1;
+ if ( match ) return 1;
+ }
+ if ( !*bmid ) break;
+ bbeg = bmid + 1;
}
- if ( !atok->nsamples )
- atok->nvalues = atok->nsamples = btok->nsamples; // is it a bug? not sure if atok->nvalues should be set
+ if ( !*amid ) break;
+ abeg = amid + 1;
}
- return pass_site;
+ return 0;
}
-static int regex_vector_strings(token_t *atok, token_t *btok, int negate)
+static void cmp_vector_strings(token_t *atok, token_t *btok, token_t *rtok)
{
- int i, pass_site = 0;
- if ( atok->nsamples )
+ tok_init_samples(atok, btok, rtok);
+
+ int i, logic = rtok->tok_type; // TOK_EQ, TOK_NE, TOK_LIKE, TOK_NLIKE
+ regex_t *regex = atok->regex ? atok->regex : (btok->regex ? btok->regex : NULL);
+
+ assert( atok->nvalues==atok->str_value.l && btok->nvalues==btok->str_value.l );
+ assert( !atok->nsamples || !btok->nsamples );
+ assert( (!regex && (logic==TOK_EQ || logic==TOK_NE)) || (regex && (logic==TOK_LIKE || logic==TOK_NLIKE)) );
+
+ int missing_logic[] = {0,0,0};
+ if ( logic==TOK_EQ || logic==TOK_LIKE ) missing_logic[0] = missing_logic[2] = 1;
+ else if ( logic==TOK_NE || logic==TOK_NLIKE ) missing_logic[0] = missing_logic[1] = 1;
+
+ if ( !atok->nsamples && !btok->nsamples )
+ {
+ if ( !atok->nvalues && !btok->nvalues ) { rtok->pass_site = missing_logic[2]; return; }
+ if ( !atok->nvalues || !btok->nvalues )
+ {
+ int miss = _has_missing_string(atok->nvalues ? atok->str_value.s : btok->str_value.s);
+ if ( missing_logic[miss+1] ) rtok->pass_site = 1;
+ return;
+ }
+ if ( !regex )
+ rtok->pass_site = _match_vector_strings(atok->str_value.s, atok->str_value.l, btok->str_value.s, btok->str_value.l, logic, missing_logic);
+ else
+ {
+ token_t *tok = atok->regex ? btok : atok;
+ rtok->pass_site = _regex_vector_strings(regex, tok->str_value.s, tok->str_value.l, logic, missing_logic);
+ }
+ return;
+ }
+
+ // The case of (!atok->nsamples || !btok->nsamples)
+
+ if ( !atok->nvalues && !btok->nvalues )
{
- for (i=0; i<atok->nsamples; i++)
+ if ( missing_logic[2] )
{
- char *ptr = atok->str_value.s + i*atok->nstr1;
- atok->pass_samples[i] = regexec(btok->regex, ptr, 0,NULL,0) ? 0 : 1;
- if ( negate ) atok->pass_samples[i] = atok->pass_samples[i] ? 0 : 1;
- pass_site |= atok->pass_samples[i];
+ for (i=0; i<rtok->nsamples; i++)
+ if ( rtok->usmpl[i] ) { rtok->pass_samples[i] = missing_logic[2]; rtok->pass_site = 1; }
}
- return pass_site;
+ return;
+ }
+ if ( !atok->nvalues || !btok->nvalues )
+ {
+ token_t *tok = atok->nvalues ? atok : btok;
+ if ( !tok->nsamples )
+ {
+ int miss = _has_missing_string(tok->str_value.s);
+ if ( !missing_logic[miss+1] ) return;
+ for (i=0; i<rtok->nsamples; i++)
+ if ( rtok->usmpl[i] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; }
+ }
+ else
+ for (i=0; i<tok->nsamples; i++)
+ {
+ if ( !rtok->usmpl[i] ) continue;
+ int miss = _has_missing_string(tok->str_value.s + i*tok->nval1);
+ if ( missing_logic[miss+1] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; }
+ }
+ return;
+ }
+
+ // The case of (!atok->nsamples || !btok->nsamples) && (atok->nvalues && btok->nvalues)
+ token_t *xtok = atok->nsamples ? atok : btok;
+ token_t *ytok = atok->nsamples ? btok : atok;
+ assert( regex==ytok->regex );
+ for (i=0; i<xtok->nsamples; i++)
+ {
+ if ( !rtok->usmpl[i] ) continue;
+ int match;
+ if ( regex )
+ match = _regex_vector_strings(regex, xtok->str_value.s + i*xtok->nval1, xtok->nval1, logic, missing_logic);
+ else
+ match = _match_vector_strings(xtok->str_value.s + i*xtok->nval1, xtok->nval1, ytok->str_value.s, ytok->str_value.l, logic, missing_logic);
+ if ( match ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; }
}
- pass_site = regexec(btok->regex, atok->str_value.s, 0,NULL,0) ? 0 : 1;
- if ( negate ) pass_site = pass_site ? 0 : 1;
- return pass_site;
}
-static void parse_tag_idx(char *tag, char *tag_idx, token_t *tok) // tag_idx points just after "TAG["
+static int parse_idxs(char *tag_idx, int **idxs, int *nidxs, int *idx)
{
- // TAG[*] .. any field
- if ( !strncmp("*]", tag_idx, 3) )
+ // TAG[], TAG[*] .. any field; sets idx=-2, idxs[0]=-1
+ if ( *tag_idx==0 || !strcmp("*", tag_idx) )
{
- tok->idxs = (int*) malloc(sizeof(int));
- tok->idxs[0] = -1;
- tok->nidxs = 1;
- tok->idx = -2;
- return;
+ *idxs = (int*) malloc(sizeof(int));
+ (*idxs)[0] = -1;
+ *nidxs = 1;
+ *idx = -2;
+ return 0;
}
- // TAG[integer] .. one field
+ // TAG[integer] .. one field; idx positive
char *end, *beg = tag_idx;
- tok->idx = strtol(tag_idx, &end, 10);
- if ( tok->idx >= 0 && *end==']' ) return;
+ *idx = strtol(tag_idx, &end, 10);
+ if ( *idx >= 0 && *end==0 ) return 0;
-
- // TAG[0,1] or TAG[0-2] or [1-] etc
+ // TAG[0,1] or TAG[0-2] or [1-] etc; idx=-2, idxs[...]=0,0,1,1,..
int i, ibeg = -1;
- while ( *beg && *beg!=']' )
+ while ( *beg )
{
- int idx = strtol(beg, &end, 10);
+ int num = strtol(beg, &end, 10);
if ( end[0]==',' ) beg = end + 1;
- else if ( end[0]==']' ) beg = end;
- else if ( end[0]=='-' ) { beg = end + 1; ibeg = idx; continue; }
- else error("Could not parse the index: %s[%s\n", tag, tag_idx+1);
- if ( idx >= tok->nidxs )
+ else if ( end[0]==0 ) beg = end;
+ else if ( end[0]=='-' ) { beg = end + 1; ibeg = num; continue; }
+ else return -1;
+ if ( num >= *nidxs )
{
- tok->idxs = (int*) realloc(tok->idxs, sizeof(int)*(idx+1));
- memset(tok->idxs + tok->nidxs, 0, sizeof(int)*(idx - tok->nidxs + 1));
- tok->nidxs = idx + 1;
+ *idxs = (int*) realloc(*idxs, sizeof(int)*(num+1));
+ memset(*idxs + *nidxs, 0, sizeof(int)*(num - *nidxs + 1));
+ *nidxs = num + 1;
}
if ( ibeg>=0 )
{
- for (i=ibeg; i<=idx; i++) tok->idxs[i] = 1;
+ for (i=ibeg; i<=num; i++) (*idxs)[i] = 1;
ibeg = -1;
}
- tok->idxs[idx] = 1;
+ (*idxs)[num] = 1;
}
if ( ibeg >=0 )
{
- if ( ibeg >= tok->nidxs )
+ if ( ibeg >= *nidxs )
{
- tok->idxs = (int*) realloc(tok->idxs, sizeof(int)*(ibeg+1));
- memset(tok->idxs + tok->nidxs, 0, sizeof(int)*(ibeg - tok->nidxs + 1));
- tok->nidxs = ibeg + 1;
+ *idxs = (int*) realloc(*idxs, sizeof(int)*(ibeg+1));
+ memset(*idxs + *nidxs, 0, sizeof(int)*(ibeg - *nidxs + 1));
+ *nidxs = ibeg + 1;
+ }
+ (*idxs)[ibeg] = -1;
+ }
+ *idx = -2;
+ return 0;
+}
+
+static void parse_tag_idx(bcf_hdr_t *hdr, int is_fmt, char *tag, char *tag_idx, token_t *tok) // tag_idx points just after "TAG["
+{
+ int i, len = strlen(tag_idx);
+ if ( tag_idx[len-1] == ']' ) tag_idx[len-1] = 0;
+ char *ori = strdup(tag_idx);
+
+ assert( !tok->idxs && !tok->usmpl );
+ int *idxs1 = NULL, nidxs1 = 0, idx1 = 0;
+ int *idxs2 = NULL, nidxs2 = 0, idx2 = 0;
+
+ int set_samples = 0;
+ char *colon = index(tag_idx, ':');
+ if ( colon )
+ {
+ *colon = 0;
+ if ( parse_idxs(tag_idx, &idxs1, &nidxs1, &idx1) != 0 ) error("Could not parse the index: %s\n", ori);
+ if ( parse_idxs(colon+1, &idxs2, &nidxs2, &idx2) != 0 ) error("Could not parse the index: %s\n", ori);
+ tok->idxs = idxs2;
+ tok->nidxs = nidxs2;
+ tok->idx = idx2;
+ set_samples = 1;
+ }
+ else
+ {
+ if ( parse_idxs(tag_idx, &idxs1, &nidxs1, &idx1) != 0 ) error("Could not parse the index: %s\n", ori);
+ if ( is_fmt )
+ {
+ if ( nidxs1==1 && idxs1[0]==-1 )
+ {
+ tok->idxs = (int*) malloc(sizeof(int));
+ tok->idxs[0] = -1;
+ tok->nidxs = 1;
+ tok->idx = -2;
+ }
+ else if ( bcf_hdr_id2number(hdr,BCF_HL_FMT,tok->hdr_id)!=1 )
+ error("The FORMAT tag %s can have multiple subfields, run as %s[sample:subfield]\n", tag,tag);
+ else
+ tok->idx = 0;
+ set_samples = 1;
+ }
+ else
+ {
+ tok->idxs = idxs1;
+ tok->nidxs = nidxs1;
+ tok->idx = idx1;
+ }
+ }
+
+ if ( set_samples )
+ {
+ tok->nsamples = bcf_hdr_nsamples(hdr);
+ tok->usmpl = (uint8_t*) calloc(tok->nsamples,1);
+ if ( idx1>=0 )
+ {
+ if ( idx1 >= bcf_hdr_nsamples(hdr) ) error("The sample index is too large: %s\n", ori);
+ tok->usmpl[idx1] = 1;
+ }
+ else if ( idx1==-2 )
+ {
+ for (i=0; i<nidxs1; i++)
+ {
+ if ( idxs1[i]==0 ) continue;
+ if ( idxs1[i]==-1 ) break;
+ if ( i >= bcf_hdr_nsamples(hdr) ) error("The sample index is too large: %s\n", ori);
+ tok->usmpl[i] = 1;
+ }
+ if ( nidxs1 && idxs1[nidxs1-1]==-1 ) // open range, such as "7-"
+ {
+ for (; i<tok->nsamples; i++) tok->usmpl[i] = 1;
+ }
}
- tok->idxs[ibeg] = -1;
+ else error("todo: %s:%d .. %d\n", __FILE__,__LINE__, idx2);
+ free(idxs1);
+ }
+ free(ori);
+
+ if ( tok->nidxs && tok->idxs[tok->nidxs-1]!=-1 )
+ {
+ for (i=0; i<tok->nidxs; i++) if ( tok->idxs[i] ) tok->nuidxs++;
}
- tok->idx = -2;
}
static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
{
tok->tok_type = TOK_VAL;
tok->hdr_id = -1;
tok->pass_site = -1;
- tok->idx = -1;
+ tok->idx = 0;
// is this a string constant?
if ( str[0]=='"' || str[0]=='\'' )
tok->setter = &filters_set_alt_string;
tok->is_str = 1;
tok->tag = strdup("ALT");
+ tok->idxs = (int*) malloc(sizeof(int));
+ tok->idxs[0] = -1;
+ tok->nidxs = 1;
+ tok->idx = -2;
return 0;
}
else if ( !strncasecmp(str,"N_ALT",len) )
int i;
for (i=0; i<tmp.l; i++)
if ( tmp.s[i]=='[' ) { tmp.s[i] = 0; is_array = i+1; break; }
- if ( is_array )
- parse_tag_idx(tmp.s, tmp.s+is_array, tok);
}
tok->hdr_id = bcf_hdr_id2int(filter->hdr,BCF_DT_ID,tmp.s);
if ( is_fmt==-1 )
}
if ( is_fmt==-1 ) is_fmt = 0;
}
+ if ( is_array )
+ parse_tag_idx(filter->hdr, is_fmt, tmp.s, tmp.s+is_array, tok);
+ else if ( is_fmt && !tok->nsamples )
+ {
+ int i;
+ tok->nsamples = bcf_hdr_nsamples(filter->hdr);
+ tok->usmpl = (uint8_t*) malloc(tok->nsamples);
+ for (i=0; i<tok->nsamples; i++) tok->usmpl[i] = 1;
+ }
+
tok->type = is_fmt ? BCF_HL_FMT : BCF_HL_INFO;
if ( is_fmt ) filter->max_unpack |= BCF_UN_FMT;
if ( tok->hdr_id>=0 )
if ( !bcf_hdr_idinfo_exists(filter->hdr,BCF_HL_FMT,tok->hdr_id) )
error("No such FORMAT field: %s\n", tmp.s);
if ( bcf_hdr_id2number(filter->hdr,BCF_HL_FMT,tok->hdr_id)!=1 && !is_array )
- error("Error: FORMAT vectors must be subscripted, e.g. %s[0] or %s[*]\n", tmp.s, tmp.s);
+ {
+ tok->idxs = (int*) malloc(sizeof(int));
+ tok->idxs[0] = -1;
+ tok->nidxs = 1;
+ tok->idx = -2;
+ }
switch ( bcf_hdr_id2type(filter->hdr,BCF_HL_FMT,tok->hdr_id) )
{
case BCF_HT_INT: tok->setter = &filters_set_format_int; break;
{
errno = 0;
tok->threshold = strtof(tmp.s, &end); // float?
- if ( errno!=0 || end!=tmp.s+len ) error("[%s:%d %s] Error: the tag \"INFO/%s\" is not defined in the VCF header\n", __FILE__,__LINE__,__FUNCTION__,tmp.s);
+ if ( errno!=0 || end!=tmp.s+len ) error("[%s:%d %s] Error: the tag \"%s\" is not defined in the VCF header\n", __FILE__,__LINE__,__FUNCTION__,tmp.s);
}
if ( tmp.s ) free(tmp.s);
int i;
for (i=0; i<nout; i++)
{
+ if ( out[i].tok_type==TOK_OR || out[i].tok_type==TOK_OR_VEC )
+ out[i].func = vector_logic_or;
+ if ( out[i].tok_type==TOK_AND || out[i].tok_type==TOK_AND_VEC )
+ out[i].func = vector_logic_and;
if ( out[i].tok_type==TOK_EQ || out[i].tok_type==TOK_NE )
{
// Look for j="." and k numeric type
int j = i-1, k = i-2;
if ( !out[j].is_str ) { k = i-1, j = i-2; }
- if ( out[k].hdr_id>0 && out[j].is_str && out[j].key && !strcmp(".",out[j].key) )
+ if ( out[j].is_str && out[j].key && !strcmp(".",out[j].key) )
{
- int type = bcf_hdr_id2type(filter->hdr,out[k].type,out[k].hdr_id);
- if ( type==BCF_HT_INT ) { out[j].is_str = 0; out[j].is_missing = 1; bcf_double_set_missing(out[j].values[0]); }
- if ( type==BCF_HT_REAL ) { out[j].is_str = 0; out[j].is_missing = 1; bcf_double_set_missing(out[j].values[0]); }
+ int set_missing = 0;
+ if ( out[k].hdr_id>0 )
+ {
+ int type = bcf_hdr_id2type(filter->hdr,out[k].type,out[k].hdr_id);
+ if ( type==BCF_HT_INT ) set_missing = 1;
+ else if ( type==BCF_HT_REAL ) set_missing = 1;
+ }
+ else if ( !strcmp("QUAL",out[k].tag) ) set_missing = 1;
+ if ( set_missing ) { out[j].is_str = 0; out[j].is_missing = 1; bcf_double_set_missing(out[j].values[0]); }
}
}
if ( out[i].tok_type==TOK_LIKE || out[i].tok_type==TOK_NLIKE )
else if ( out[i+2].tok_type==TOK_LIKE || out[i+2].tok_type==TOK_NLIKE ) ival = i + 1;
else error("[%s:%d %s] Could not parse the expression: %s\n", __FILE__,__LINE__,__FUNCTION__, filter->str);
+ if ( !out[ival].key ) error("Comparison between samples is not supported, sorry!\n");
+
// assign correct setters and unify expressions, eg ar->ra, HOM->hom, etc
if ( !strcasecmp(out[ival].key,"hom") ) { out[i].setter = filters_set_genotype3; str_to_lower(out[ival].key); }
else if ( !strcasecmp(out[ival].key,"het") ) { out[i].setter = filters_set_genotype3; str_to_lower(out[ival].key); }
else if ( !strcasecmp(out[ival].key,"hap") ) { out[i].setter = filters_set_genotype3; str_to_lower(out[ival].key); }
+ else if ( !strcasecmp(out[ival].key,"mis") ) { out[i].setter = filters_set_genotype4; str_to_lower(out[ival].key); }
+ else if ( !strcasecmp(out[ival].key,"ref") ) { out[i].setter = filters_set_genotype4; str_to_lower(out[ival].key); }
+ else if ( !strcasecmp(out[ival].key,"alt") ) { out[i].setter = filters_set_genotype4; str_to_lower(out[ival].key); }
else if ( !strcasecmp(out[ival].key,"rr") ) { out[i].setter = filters_set_genotype2; str_to_lower(out[ival].key); }
else if ( !strcasecmp(out[ival].key,"ra") || !strcasecmp(out[ival].key,"ar") ) { out[i].setter = filters_set_genotype2; out[ival].key[0]='r'; out[ival].key[1]='a'; } // ra
else if ( !strcmp(out[ival].key,"aA") || !strcmp(out[ival].key,"Aa") ) { out[i].setter = filters_set_genotype2; out[ival].key[0]='a'; out[ival].key[1]='A'; } // aA
filter->nsamples = filter->max_unpack&BCF_UN_FMT ? bcf_hdr_nsamples(filter->hdr) : 0;
for (i=0; i<nout; i++)
{
- if ( out[i].tok_type==TOK_MAX ) { out[i].setter = set_max; out[i].tok_type = TOK_FUNC; }
- else if ( out[i].tok_type==TOK_MIN ) { out[i].setter = set_min; out[i].tok_type = TOK_FUNC; }
- else if ( out[i].tok_type==TOK_AVG ) { out[i].setter = set_avg; out[i].tok_type = TOK_FUNC; }
- else if ( out[i].tok_type==TOK_SUM ) { out[i].setter = set_sum; out[i].tok_type = TOK_FUNC; }
- else if ( out[i].tok_type==TOK_ABS ) { out[i].setter = set_abs; out[i].tok_type = TOK_FUNC; }
- else if ( out[i].tok_type==TOK_LEN ) { out[i].setter = set_strlen; out[i].tok_type = TOK_FUNC; }
+ if ( out[i].tok_type==TOK_MAX ) { out[i].func = func_max; out[i].tok_type = TOK_FUNC; }
+ else if ( out[i].tok_type==TOK_MIN ) { out[i].func = func_min; out[i].tok_type = TOK_FUNC; }
+ else if ( out[i].tok_type==TOK_AVG ) { out[i].func = func_avg; out[i].tok_type = TOK_FUNC; }
+ else if ( out[i].tok_type==TOK_SUM ) { out[i].func = func_sum; out[i].tok_type = TOK_FUNC; }
+ else if ( out[i].tok_type==TOK_ABS ) { out[i].func = func_abs; out[i].tok_type = TOK_FUNC; }
+ else if ( out[i].tok_type==TOK_CNT ) { out[i].func = func_count; out[i].tok_type = TOK_FUNC; }
+ else if ( out[i].tok_type==TOK_LEN ) { out[i].func = func_strlen; out[i].tok_type = TOK_FUNC; }
hts_expand0(double,1,out[i].mvalues,out[i].values);
if ( filter->nsamples )
{
out[i].pass_samples = (uint8_t*)malloc(filter->nsamples);
int j;
- for (j=0; j<filter->nsamples; j++) out[i].pass_samples[j] = 1;
+ for (j=0; j<filter->nsamples; j++) out[i].pass_samples[j] = 0;
}
}
free(filter->filters[i].str_value.s);
free(filter->filters[i].tag);
free(filter->filters[i].idxs);
+ free(filter->filters[i].usmpl);
free(filter->filters[i].values);
free(filter->filters[i].pass_samples);
if (filter->filters[i].hash) khash_str2int_destroy_free(filter->filters[i].hash);
int i, nstack = 0;
for (i=0; i<filter->nfilters; i++)
{
- filter->filters[i].nsamples = 0;
- filter->filters[i].nvalues = 0;
- filter->filters[i].pass_site = -1;
+ filter->filters[i].pass_site = 0;
if ( filter->filters[i].tok_type == TOK_VAL )
{
filter->flt_stack[nstack++] = &filter->filters[i];
continue;
}
- else if ( filter->filters[i].tok_type == TOK_FUNC ) // all functions take only one argument
+ else if ( filter->filters[i].func )
{
- filter->filters[i].setter(filter, line, filter->flt_stack[nstack-1]);
+ int nargs = filter->filters[i].func(filter, line, &filter->filters[i], filter->flt_stack, nstack);
+ filter->flt_stack[nstack-nargs] = &filter->filters[i];
+ if ( --nargs > 0 ) nstack -= nargs;
continue;
}
if ( nstack<2 )
int is_str = filter->flt_stack[nstack-1]->is_str + filter->flt_stack[nstack-2]->is_str;
- if ( filter->filters[i].tok_type == TOK_OR || filter->filters[i].tok_type == TOK_OR_VEC )
- {
- if ( filter->flt_stack[nstack-1]->pass_site<0 || filter->flt_stack[nstack-2]->pass_site<0 )
- error("Error occurred while processing the filter \"%s\" (%d %d OR)\n", filter->str,filter->flt_stack[nstack-2]->pass_site,filter->flt_stack[nstack-1]->pass_site);
- filter->flt_stack[nstack-2]->pass_site = vector_logic_or(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1], filter->filters[i].tok_type);
- nstack--;
- continue;
- }
- if ( filter->filters[i].tok_type == TOK_AND || filter->filters[i].tok_type == TOK_AND_VEC )
- {
- if ( filter->flt_stack[nstack-1]->pass_site<0 || filter->flt_stack[nstack-2]->pass_site<0 )
- error("Error occurred while processing the filter \"%s\" (%d %d AND)\n", filter->str,filter->flt_stack[nstack-2]->pass_site,filter->flt_stack[nstack-1]->pass_site);
- filter->flt_stack[nstack-2]->pass_site = vector_logic_and(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1], filter->filters[i].tok_type);
- nstack--;
- continue;
- }
-
if ( filter->filters[i].tok_type == TOK_ADD )
{
- VECTOR_ARITHMETICS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],+);
+ VECTOR_ARITHMETICS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],+);
+ filter->flt_stack[nstack-2] = &filter->filters[i];
nstack--;
continue;
}
else if ( filter->filters[i].tok_type == TOK_SUB )
{
- VECTOR_ARITHMETICS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],-);
+ VECTOR_ARITHMETICS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],-);
+ filter->flt_stack[nstack-2] = &filter->filters[i];
nstack--;
continue;
}
else if ( filter->filters[i].tok_type == TOK_MULT )
{
- VECTOR_ARITHMETICS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],*);
+ VECTOR_ARITHMETICS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],*);
+ filter->flt_stack[nstack-2] = &filter->filters[i];
nstack--;
continue;
}
else if ( filter->filters[i].tok_type == TOK_DIV )
{
- VECTOR_ARITHMETICS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],/);
+ VECTOR_ARITHMETICS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],/);
+ filter->flt_stack[nstack-2] = &filter->filters[i];
nstack--;
continue;
}
- int is_true = 0;
+ // ideally, these comparators would become func, but this would require more work in init1()
if ( filter->filters[i].comparator )
- is_true = filter->filters[i].comparator(filter->flt_stack[nstack-1],filter->flt_stack[nstack-2],filter->filters[i].tok_type,line);
- else if ( !filter->flt_stack[nstack-1]->nvalues || !filter->flt_stack[nstack-2]->nvalues )
{
- int skip = 0;
- if ( !filter->flt_stack[nstack-2]->is_missing && !filter->flt_stack[nstack-1]->is_missing ) skip = 1;
- if ( filter->filters[i].tok_type != TOK_EQ && filter->filters[i].tok_type != TOK_NE ) skip = 1;
-
- if ( skip )
- filter->flt_stack[nstack-2]->nvalues = filter->flt_stack[nstack-2]->nsamples = 0;
- else if ( filter->filters[i].tok_type == TOK_EQ )
- CMP_MISSING(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],==,is_true)
- else if ( filter->filters[i].tok_type == TOK_NE )
- CMP_MISSING(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],!=,is_true)
- }
- else if ( filter->filters[i].tok_type == TOK_EQ )
- {
- if ( filter->flt_stack[nstack-1]->comparator )
- is_true = filter->flt_stack[nstack-1]->comparator(filter->flt_stack[nstack-1],filter->flt_stack[nstack-2],TOK_EQ,line);
- else if ( filter->flt_stack[nstack-2]->comparator )
- is_true = filter->flt_stack[nstack-2]->comparator(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],TOK_EQ,line);
- else if ( is_str==2 ) // both are strings
- is_true = cmp_vector_strings(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],TOK_EQ);
- else if ( is_str==1 )
- error("Comparing string to numeric value: %s\n", filter->str);
- else
- CMP_VECTORS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],==,is_true);
- }
- else if ( filter->filters[i].tok_type == TOK_NE )
- {
- if ( filter->flt_stack[nstack-1]->comparator )
- is_true = filter->flt_stack[nstack-1]->comparator(filter->flt_stack[nstack-1],filter->flt_stack[nstack-2],TOK_NE,line);
- else if ( filter->flt_stack[nstack-2]->comparator )
- is_true = filter->flt_stack[nstack-2]->comparator(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],TOK_NE,line);
- else if ( is_str==2 )
- is_true = cmp_vector_strings(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],TOK_NE);
- else if ( is_str==1 )
- error("Comparing string to numeric value: %s\n", filter->str);
- else
- CMP_VECTORS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],!=,is_true);
+ filter->filters[i].comparator(filter->flt_stack[nstack-1],filter->flt_stack[nstack-2],&filter->filters[i],line);
}
- else if ( filter->filters[i].tok_type == TOK_LIKE || filter->filters[i].tok_type == TOK_NLIKE )
+ else if ( filter->flt_stack[nstack-1]->comparator )
{
- if ( is_str==2 )
- is_true = regex_vector_strings(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1], filter->filters[i].tok_type == TOK_LIKE ? 0 : 1);
- else
- error("The regex operator can be used on strings only: %s\n", filter->str);
- }
- else if ( is_str>0 )
- error("Wrong operator in string comparison: %s [%s,%s]\n", filter->str, filter->flt_stack[nstack-1]->str_value, filter->flt_stack[nstack-2]->str_value);
- else if ( filter->filters[i].tok_type == TOK_LE )
- CMP_VECTORS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],<=,is_true)
- else if ( filter->filters[i].tok_type == TOK_LT )
- CMP_VECTORS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],<,is_true)
- else if ( filter->filters[i].tok_type == TOK_BT )
- CMP_VECTORS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],>,is_true)
- else if ( filter->filters[i].tok_type == TOK_BE )
- CMP_VECTORS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],>=,is_true)
+ filter->flt_stack[nstack-1]->comparator(filter->flt_stack[nstack-1],filter->flt_stack[nstack-2],&filter->filters[i],line);
+ }
+ else if ( filter->flt_stack[nstack-2]->comparator )
+ {
+ filter->flt_stack[nstack-2]->comparator(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],line);
+ }
+ else if ( is_str==2 )
+ {
+ cmp_vector_strings(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i]);
+ }
else
- error("FIXME: did not expect this .. tok_type %d = %d\n", i, filter->filters[i].tok_type);
+ {
+ // Determine what to do with one [1] or both [2] sides missing. The first field [0] gives [1]|[2]
+ int missing_logic[] = {0,0,0};
+ if ( filter->filters[i].tok_type == TOK_EQ ) { missing_logic[0] = missing_logic[2] = 1; }
+ if ( filter->filters[i].tok_type == TOK_NE ) { missing_logic[0] = missing_logic[1] = 1; }
- filter->flt_stack[nstack-2]->pass_site = is_true;
+ if ( filter->filters[i].tok_type == TOK_EQ )
+ CMP_VECTORS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],==,missing_logic)
+ else if ( filter->filters[i].tok_type == TOK_NE )
+ CMP_VECTORS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],!=,missing_logic)
+ else if ( filter->filters[i].tok_type == TOK_LE )
+ CMP_VECTORS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],<=,missing_logic)
+ else if ( filter->filters[i].tok_type == TOK_LT )
+ CMP_VECTORS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],<,missing_logic)
+ else if ( filter->filters[i].tok_type == TOK_BT )
+ CMP_VECTORS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],>,missing_logic)
+ else if ( filter->filters[i].tok_type == TOK_BE )
+ CMP_VECTORS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],>=,missing_logic)
+ else
+ error("todo: %s:%d .. type=%d\n", __FILE__,__LINE__,filter->filters[i].tok_type);
+
+ }
+ filter->flt_stack[nstack-2] = &filter->filters[i];
nstack--;
}
if ( nstack>1 ) error("Error occurred while processing the filter \"%s\" (2:%d)\n", filter->str,nstack); // too few values left on the stack
/* filter.c -- filter expressions.
- Copyright (C) 2013-2015 Genome Research Ltd.
+ Copyright (C) 2013-2018 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
char *tag; // for debugging and printout only, VCF tag name
double threshold; // filtering threshold
int hdr_id, type; // BCF header lookup ID and one of BCF_HT_* types
- int idx; // 0-based index to VCF vectors, -1: not a vector,
+ int idx; // 0-based index to VCF vectors,
// -2: list (e.g. [0,1,2] or [1..3] or [1..] or any field[*], which is equivalent to [0..])
- int *idxs, nidxs; // set indexes to 0 to exclude, to 1 to include, and last element negative if unlimited
+ int *idxs; // set indexes to 0 to exclude, to 1 to include, and last element negative if unlimited
+ int nidxs, nuidxs; // size of idxs array and the number of elements set to 1
+ uint8_t *usmpl; // bitmask of used samples as set by idx
+ int nsamples; // number of samples for format fields, 0 for info and other fields
void (*setter)(filter_t *, bcf1_t *, struct _token_t *);
- int (*comparator)(struct _token_t *, struct _token_t *, int op_type, bcf1_t *);
+ int (*func)(filter_t *, bcf1_t *, struct _token_t *rtok, struct _token_t **stack, int nstack);
+ void (*comparator)(struct _token_t *, struct _token_t *, struct _token_t *rtok, bcf1_t *);
void *hash; // test presence of str value in the hash via comparator
regex_t *regex; // precompiled regex for string comparison
int is_str, is_missing; // is_missing is set only for constants, variables are controled via nvalues
int pass_site; // -1 not applicable, 0 fails, >0 pass
uint8_t *pass_samples; // status of individual samples
- int nsamples; // number of samples
int nvalues, mvalues; // number of used values: n=0 for missing values, n=1 for scalars, for strings n=str_value.l
- int nstr1; // per-sample string length, set only with str_value.l>0 && nsamples>1
+ int nval1; // number of per-sample fields or string length
}
token_t;
#define TOK_ABS 23
#define TOK_LEN 24
#define TOK_FUNC 25
+#define TOK_CNT 26
-// 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
-// ( ) [ < = > ] ! | & + - * / M m a A O ~ ^ S . l
-static int op_prec[] = {0,1,1,5,5,5,5,5,5,2,3, 6, 6, 7, 7, 8, 8, 8, 3, 2, 5, 5, 8, 8, 8};
+// 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26
+// ( ) [ < = > ] ! | & + - * / M m a A O ~ ^ S . l
+static int op_prec[] = {0,1,1,5,5,5,5,5,5,2,3, 6, 6, 7, 7, 8, 8, 8, 3, 2, 5, 5, 8, 8, 8, 8, 8};
#define TOKEN_STRING "x()[<=>]!|&+-*/MmaAO~^f"
static int filters_next_token(char **str, int *len)
if ( !strncasecmp(tmp,"AVG(",4) ) { (*str) += 3; return TOK_AVG; }
if ( !strncasecmp(tmp,"SUM(",4) ) { (*str) += 3; return TOK_SUM; }
if ( !strncasecmp(tmp,"ABS(",4) ) { (*str) += 3; return TOK_ABS; }
+ if ( !strncasecmp(tmp,"COUNT(",4) ) { (*str) += 5; return TOK_CNT; }
if ( !strncasecmp(tmp,"STRLEN(",7) ) { (*str) += 6; return TOK_LEN; }
if ( !strncasecmp(tmp,"%MAX(",5) ) { (*str) += 4; return TOK_MAX; } // for backward compatibility
if ( !strncasecmp(tmp,"%MIN(",5) ) { (*str) += 4; return TOK_MIN; } // for backward compatibility
{
float *ptr = &line->qual;
if ( bcf_float_is_missing(*ptr) )
- tok->nvalues = 0;
+ bcf_double_set_missing(tok->values[0]);
else
- {
tok->values[0] = (double)line->qual;
- tok->nvalues = 1;
- }
+ tok->nvalues = 1;
}
static void filters_set_type(filter_t *flt, bcf1_t *line, token_t *tok)
{
if ( line->d.info[i].key == tok->hdr_id ) break;
if ( i==line->n_info )
- tok->nvalues = 0;
+ tok->nvalues = tok->str_value.l = 0;
else if ( line->d.info[i].type==BCF_BT_CHAR )
{
int n = line->d.info[i].len;
}
}
}
-static int filters_cmp_bit_and(token_t *atok, token_t *btok, int op_type, bcf1_t *line)
+static void filters_cmp_bit_and(token_t *atok, token_t *btok, token_t *rtok, bcf1_t *line)
{
int a = (int)(atok->nvalues?atok->values[0]:atok->threshold);
int b = (int)(btok->nvalues?btok->values[0]:btok->threshold);
- if ( op_type==TOK_LIKE ) return a&b ? 1 : 0;
- return a&b ? 0 : 1;
+ if ( rtok->tok_type==TOK_LIKE )
+ rtok->pass_site = a&b ? 1 : 0;
+ else
+ rtok->pass_site = a&b ? 0 : 1;
}
-static int filters_cmp_filter(token_t *atok, token_t *btok, int op_type, bcf1_t *line)
+static void filters_cmp_filter(token_t *atok, token_t *btok, token_t *rtok, bcf1_t *line)
{
int i;
- if ( op_type==TOK_NE ) // AND logic: none of the filters can match
+ if ( rtok->tok_type==TOK_NE ) // AND logic: none of the filters can match
{
if ( !line->d.n_flt )
{
- if ( atok->hdr_id==-1 ) return 0; // missing value
- return 1; // no filter present, eval to true
+ if ( atok->hdr_id==-1 ) return; // missing value
+ rtok->pass_site = 1;
+ return; // no filter present, eval to true
}
for (i=0; i<line->d.n_flt; i++)
- if ( atok->hdr_id==line->d.flt[i] ) return 0;
- return 1;
+ if ( atok->hdr_id==line->d.flt[i] ) return;
+ rtok->pass_site = 1;
+ return;
}
- // TOK_EQ with OR logic: at least one of the filters must match
- if ( !line->d.n_flt )
+ else if ( rtok->tok_type==TOK_EQ ) // OR logic: at least one of the filters must match
{
- if ( atok->hdr_id==-1 ) return 1;
- return 0; // no filter present, eval to false
+ if ( !line->d.n_flt )
+ {
+ if ( atok->hdr_id==-1 ) { rtok->pass_site = 1; return; }
+ return; // no filter present, eval to false
+ }
+ for (i=0; i<line->d.n_flt; i++)
+ if ( atok->hdr_id==line->d.flt[i] ) { rtok->pass_site = 1; return; }
+ return;
}
- for (i=0; i<line->d.n_flt; i++)
- if ( atok->hdr_id==line->d.flt[i] ) return 1;
- return 0;
+ else
+ error("Only == and != operators are supported for FILTER\n");
+ return;
}
-static int filters_cmp_id(token_t *atok, token_t *btok, int op_type, bcf1_t *line)
+static void filters_cmp_id(token_t *atok, token_t *btok, token_t *rtok, bcf1_t *line)
{
// multiple IDs not supported yet (easy to add though)
+ if ( rtok->tok_type!=TOK_EQ && rtok->tok_type!=TOK_NE )
+ error("Only == and != operators are supported for ID\n");
if ( btok->hash )
{
if ( atok->hash )
{
int ret = khash_str2int_has_key(atok->hash, line->d.id);
- if ( op_type==TOK_EQ ) return ret;
- return ret ? 0 : 1;
+ if ( rtok->tok_type==TOK_NE ) ret = ret ? 0 : 1;
+ rtok->pass_site = ret;
+ return;
}
- if ( op_type==TOK_EQ ) return strcmp(btok->str_value.s,line->d.id) ? 0 : 1;
- return strcmp(btok->str_value.s,line->d.id) ? 1 : 0;
+ if ( rtok->tok_type==TOK_EQ )
+ rtok->pass_site = strcmp(btok->str_value.s,line->d.id) ? 0 : 1;
+ else
+ rtok->pass_site = strcmp(btok->str_value.s,line->d.id) ? 1 : 0;
}
/**
}
else
{
- int64_t value;
+ int64_t value = 0;
if ( bcf_get_info_value(line,tok->hdr_id,tok->idx,&value) <= 0 )
tok->nvalues = 0;
else
{
flt->tmps.l = 0;
ks_resize(&flt->tmps, n);
- int i, end = tok->idxs[tok->nidxs-1] < 0 ? n - 1 : tok->nidxs - 1;
- if ( end >= n ) end = n - 1;
+ int i, iend = tok->idxs[tok->nidxs-1] < 0 ? n - 1 : tok->nidxs - 1;
+ if ( iend >= n ) iend = n - 1;
char *beg = tok->str_value.s, *dst = flt->tmps.s;
- for (i=0; i<=end; i++)
+ for (i=0; i<=iend; i++)
{
char *end = beg;
while ( *end && *end!=',' ) end++;
-
if ( i>=tok->nidxs || tok->idxs[i] )
{
memcpy(dst, beg, end - beg);
dst[0] = ',';
dst++;
}
-
beg = end+1;
}
dst[0] = 0;
static void filters_set_format_int(filter_t *flt, bcf1_t *line, token_t *tok)
{
- int i;
- if ( (tok->nvalues=bcf_get_format_int32(flt->hdr,line,tok->tag,&flt->tmpi,&flt->mtmpi))<0 )
+ if ( line->n_sample != tok->nsamples )
+ error("Incorrect number of FORMAT fields at %s:%d .. %s, %d vs %d\n", bcf_seqname(flt->hdr,line),line->pos+1,tok->tag,line->n_sample,tok->nsamples);
+
+ int nvals;
+ if ( (nvals=bcf_get_format_int32(flt->hdr,line,tok->tag,&flt->tmpi,&flt->mtmpi))<0 )
{
- tok->nvalues = tok->nsamples = 0;
+ tok->nvalues = 0;
return;
}
- if ( tok->idx >= -1 ) // scalar or vector index
+ int i, nsrc1 = nvals / tok->nsamples;
+ tok->nval1 = tok->idx >= 0 ? 1 : (tok->nuidxs ? tok->nuidxs : nsrc1);
+ tok->nvalues = tok->nval1*tok->nsamples;
+ hts_expand(double, tok->nvalues, tok->mvalues, tok->values);
+
+ if ( tok->idx >= 0 ) // scalar or vector index
{
- hts_expand(double,flt->nsamples,tok->mvalues,tok->values);
- int nvals = tok->nvalues / line->n_sample;
- int idx = tok->idx >= 0 ? tok->idx : 0;
- int is_missing = 1;
- int32_t *ptr = flt->tmpi;
- for (i=0; i<line->n_sample; i++)
+ for (i=0; i<tok->nsamples; i++)
{
- if ( ptr[idx]==bcf_int32_missing || ptr[idx]==bcf_int32_vector_end )
+ if ( !tok->usmpl[i] ) continue;
+ int32_t *ptr = flt->tmpi + i*nsrc1;
+ if ( tok->idx>=nsrc1 || ptr[tok->idx]==bcf_int32_missing || ptr[tok->idx]==bcf_int32_vector_end )
bcf_double_set_missing(tok->values[i]);
else
- {
- tok->values[i] = ptr[idx];
- is_missing = 0;
- }
- ptr += nvals;
+ tok->values[i] = ptr[tok->idx];
}
- if ( is_missing ) tok->nvalues = 0;
- else tok->nvalues = line->n_sample;
- tok->nsamples = tok->nvalues;
- return;
}
- if ( tok->idx == -2 )
+ else
{
- hts_expand(double,tok->nvalues,tok->mvalues,tok->values);
- int nvals = tok->nvalues / line->n_sample;
- int idx = tok->idx >= 0 ? tok->idx : 0;
- int is_missing = 1;
- int k, j = 0, end = tok->idxs[tok->nidxs-1] < 0 ? nvals - 1 : tok->nidxs - 1;
- if ( end >= nvals ) end = nvals - 1;
- int32_t *ptr = flt->tmpi;
- for (i=0; i<line->n_sample; i++)
+ int kend = tok->idxs[tok->nidxs-1] < 0 ? tok->nval1 : tok->nidxs;
+ for (i=0; i<tok->nsamples; i++)
{
- for (k=0; k<=end; k++)
- if ( k>=tok->nidxs || tok->idxs[k] )
- {
- if ( ptr[k]==bcf_int32_missing || ptr[k]==bcf_int32_vector_end )
- bcf_double_set_missing(tok->values[j]);
- else
- {
- tok->values[j] = ptr[k];
- is_missing = 0;
- }
- j++;
- }
- ptr += nvals;
- }
- if ( is_missing ) tok->nvalues = tok->nsamples = 0;
- else
- {
- tok->nsamples = line->n_sample;
- tok->nvalues = j;
+ if ( !tok->usmpl[i] ) continue;
+ int32_t *src = flt->tmpi + i*nsrc1;
+ double *dst = tok->values + i*tok->nval1;
+ int k, j = 0;
+ for (k=0; k<kend; k++)
+ {
+ if ( k<tok->nidxs && !tok->idxs[k] ) continue;
+ if ( src[k]==bcf_int32_missing || src[k]==bcf_int32_vector_end )
+ bcf_double_set_missing(dst[j]);
+ else
+ dst[j] = src[k];
+ j++;
+ }
+ while (j < tok->nval1)
+ {
+ bcf_double_set_missing(dst[j]);
+ j++;
+ }
}
- return;
}
}
static void filters_set_format_float(filter_t *flt, bcf1_t *line, token_t *tok)
{
- int i;
- if ( (tok->nvalues=bcf_get_format_float(flt->hdr,line,tok->tag,&flt->tmpf,&flt->mtmpf))<0 )
+ if ( line->n_sample != tok->nsamples )
+ error("Incorrect number of FORMAT fields at %s:%d .. %s, %d vs %d\n", bcf_seqname(flt->hdr,line),line->pos+1,tok->tag,line->n_sample,tok->nsamples);
+
+ int nvals;
+ if ( (nvals=bcf_get_format_float(flt->hdr,line,tok->tag,&flt->tmpf,&flt->mtmpf))<0 )
{
- tok->nvalues = tok->nsamples = 0;
+ tok->nvalues = 0;
return;
}
- if ( tok->idx >= -1 ) // scalar or vector index
+ int i, nsrc1 = nvals / tok->nsamples;
+ tok->nval1 = tok->idx >= 0 ? 1 : (tok->nuidxs ? tok->nuidxs : nsrc1);
+ tok->nvalues = tok->nval1*tok->nsamples;
+ hts_expand(double, tok->nvalues, tok->mvalues, tok->values);
+
+ if ( tok->idx >= 0 ) // scalar or vector index
{
- hts_expand(double,flt->nsamples,tok->mvalues,tok->values);
- int nvals = tok->nvalues / line->n_sample;
- int idx = tok->idx >= 0 ? tok->idx : 0;
- int is_missing = 1;
- float *ptr = flt->tmpf;
- for (i=0; i<line->n_sample; i++)
+ for (i=0; i<tok->nsamples; i++)
{
- if ( bcf_float_is_missing(ptr[idx]) || bcf_float_is_vector_end(ptr[idx]) )
+ if ( !tok->usmpl[i] ) continue;
+ float *ptr = flt->tmpf + i*nsrc1;
+ if ( tok->idx>=nsrc1 || bcf_float_is_missing(ptr[tok->idx]) || bcf_float_is_vector_end(ptr[tok->idx]) )
bcf_double_set_missing(tok->values[i]);
else
- {
- tok->values[i] = ptr[idx];
- is_missing = 0;
- }
- ptr += nvals;
+ tok->values[i] = ptr[tok->idx];
}
- if ( is_missing ) tok->nvalues = 0;
- else tok->nvalues = line->n_sample;
- tok->nsamples = tok->nvalues;
- return;
}
- if ( tok->idx == -2 )
+ else
{
- hts_expand(double,tok->nvalues,tok->mvalues,tok->values);
- int nvals = tok->nvalues / line->n_sample;
- int idx = tok->idx >= 0 ? tok->idx : 0;
- int is_missing = 1;
- int k, j = 0, end = tok->idxs[tok->nidxs-1] < 0 ? nvals - 1 : tok->nidxs - 1;
- if ( end >= nvals ) end = nvals - 1;
- float *ptr = flt->tmpf;
- for (i=0; i<line->n_sample; i++)
+ int kend = tok->idxs[tok->nidxs-1] < 0 ? tok->nval1 : tok->nidxs;
+ for (i=0; i<tok->nsamples; i++)
{
- for (k=0; k<=end; k++)
- if ( k>=tok->nidxs || tok->idxs[k] )
- {
- if ( bcf_float_is_missing(ptr[k]) || bcf_float_is_vector_end(ptr[k]) )
- bcf_double_set_missing(tok->values[j]);
- else
- {
- tok->values[j] = ptr[k];
- is_missing = 0;
- }
- j++;
- }
- ptr += nvals;
- }
- if ( is_missing ) tok->nvalues = tok->nsamples = 0;
- else
- {
- tok->nsamples = line->n_sample;
- tok->nvalues = j;
+ if ( !tok->usmpl[i] ) continue;
+ float *src = flt->tmpf + i*nsrc1;
+ double *dst = tok->values + i*tok->nval1;
+ int k, j = 0;
+ for (k=0; k<kend; k++)
+ {
+ if ( k<tok->nidxs && !tok->idxs[k] ) continue;
+ if ( bcf_float_is_missing(src[k]) || bcf_float_is_vector_end(src[k]) )
+ bcf_double_set_missing(dst[j]);
+ else
+ dst[j] = src[k];
+ j++;
+ }
+ while (j < tok->nval1)
+ {
+ bcf_double_set_missing(dst[j]);
+ j++;
+ }
}
- return;
}
}
static void filters_set_format_string(filter_t *flt, bcf1_t *line, token_t *tok)
{
- tok->str_value.l = tok->nvalues = 0;
- if ( !line->n_sample ) return;
+ if ( line->n_sample != tok->nsamples )
+ error("Incorrect number of FORMAT fields at %s:%d .. %s, %d vs %d\n", bcf_seqname(flt->hdr,line),line->pos+1,tok->tag,line->n_sample,tok->nsamples);
- int ndim = tok->str_value.m;
+ int i, ndim = tok->str_value.m;
int nstr = bcf_get_format_char(flt->hdr, line, tok->tag, &tok->str_value.s, &ndim);
tok->str_value.m = ndim;
+ tok->str_value.l = tok->nvalues = 0;
- if ( nstr<=0 ) return;
-
- if ( tok->idx == -1 || (tok->idx==-2 && tok->idxs[0]==-1) ) // scalar or keep all values of a vector: TAG[*]
- {
- tok->nsamples = line->n_sample;
- tok->nstr1 = ndim / line->n_sample;
- tok->nvalues = tok->str_value.l = nstr;
- return;
- }
-
- int nstr1 = nstr / line->n_sample;
+ if ( nstr<0 ) return;
- // vector, one or multiple indices
- int i;
- for (i=0; i<line->n_sample; i++)
+ tok->nvalues = tok->str_value.l = nstr;
+ tok->nval1 = nstr / tok->nsamples;
+ for (i=0; i<tok->nsamples; i++)
{
- char *dst = tok->str_value.s + i*nstr1, *str = dst;
- int nval = 0, ibeg = 0;
- while ( ibeg < nstr1 )
+ if ( !tok->usmpl[i] ) continue;
+ char *src = tok->str_value.s + i*tok->nval1, *dst = src;
+ int ibeg = 0, idx = 0;
+ while ( ibeg < tok->nval1 )
{
- int iend = ibeg + 1;
- while ( iend < nstr1 && str[iend] && str[iend]!=',' ) iend++;
+ int iend = ibeg;
+ while ( iend < tok->nval1 && src[iend] && src[iend]!=',' ) iend++;
int keep = 0;
- if ( tok->idx >=0 )
- keep = tok->idx==nval ? 1 : 0;
- else if ( nval < tok->nidxs )
- keep = tok->idxs[nval] ? 1 : 0;
+ if ( tok->idx >= 0 )
+ {
+ if ( tok->idx==idx ) keep = 1;
+ }
+ else if ( idx < tok->nidxs )
+ {
+ if ( tok->idxs[idx] != 0 ) keep = 1;
+ }
else if ( tok->idxs[tok->nidxs-1] < 0 )
keep = 1;
if ( keep )
{
- if ( ibeg>0 ) memmove(dst, str+ibeg, iend-ibeg+1);
+ if ( ibeg!=0 ) memmove(dst, src+ibeg, iend-ibeg+1);
dst += iend - ibeg + 1;
if ( tok->idx>=0 ) break;
}
- if ( !str[iend] ) break;
+ if ( !src[iend] ) break;
ibeg = iend + 1;
- nval++;
+ idx++;
}
- if ( dst==str ) { dst[0] = '.'; dst+=2; }
- if ( dst - str < nstr1 ) memset(dst-1, 0, nstr1 - (dst - str));
+ if ( dst==src ) { dst[0] = '.'; dst+=2; }
+ if ( dst - src < tok->nval1 ) memset(dst-1, 0, tok->nval1 - (dst - src));
}
- tok->nvalues = tok->str_value.l = nstr;
- tok->nstr1 = nstr1;
- tok->nsamples = line->n_sample;
}
static void _filters_set_genotype(filter_t *flt, bcf1_t *line, token_t *tok, int type)
{
return;
}
- int i,j, nsmpl = bcf_hdr_nsamples(flt->hdr), nvals = type==2 ? 3 : 4;
- if ( tok->str_value.m <= nvals*nsmpl )
+ int i,j, nsmpl = bcf_hdr_nsamples(flt->hdr), nvals1 = type==2 ? 3 : 4;
+ if ( tok->str_value.m <= nvals1*nsmpl )
{
- tok->str_value.m = nvals*nsmpl + 1;
+ tok->str_value.m = nvals1*nsmpl + 1;
tok->str_value.s = (char*)realloc(tok->str_value.s, tok->str_value.m);
}
if ( bcf_gt_allele(ial)!=bcf_gt_allele(jal) ) is_het = 1; \
} \
} \
- char *dst = &tok->str_value.s[nvals*i]; \
- if ( !j || missing ) dst[0]='.', dst[1]=0; /* ., missing genotype */ \
+ char *dst = &tok->str_value.s[nvals1*i]; \
+ if ( type==4 ) \
+ { \
+ if ( !j || missing ) dst[0]='m', dst[1]='i', dst[2]='s', dst[3] = 0; /* mis, missing genotype */ \
+ else if ( !has_ref ) dst[0]='a', dst[1]='l', dst[2]='t', dst[3] = 0; /* alt, no ref, must have alt allele */ \
+ else if ( !is_het ) dst[0]='r', dst[1]='e', dst[2]='f', dst[3] = 0; /* ref, must be ref-only, no alt alelle */ \
+ else dst[0]='a', dst[1]='l', dst[2]='t', dst[3] = 0; /* alt, is het, has alt allele */ \
+ } \
+ else if ( !j || missing ) dst[0]='.', dst[1]=0; /* ., missing genotype */ \
else if ( type==3 ) \
{ \
if ( j==1 ) dst[0]='h', dst[1]='a', dst[2]='p', dst[3] = 0; /* hap, haploid */ \
default: error("The GT type is not lineognised: %d at %s:%d\n",fmt->type, bcf_seqname(flt->hdr,line),line->pos+1); break;
}
#undef BRANCH_INT
- tok->nsamples = nsmpl;
- tok->nvalues = tok->str_value.l = nvals*nsmpl;
+ assert( tok->nsamples == nsmpl );
+ tok->nvalues = tok->str_value.l = nvals1*nsmpl;
tok->str_value.s[tok->str_value.l] = 0;
- tok->nstr1 = nvals;
+ tok->nval1 = nvals1;
}
static void filters_set_genotype2(filter_t *flt, bcf1_t *line, token_t *tok) { _filters_set_genotype(flt, line, tok, 2); }
static void filters_set_genotype3(filter_t *flt, bcf1_t *line, token_t *tok) { _filters_set_genotype(flt, line, tok, 3); }
+static void filters_set_genotype4(filter_t *flt, bcf1_t *line, token_t *tok) { _filters_set_genotype(flt, line, tok, 4); }
static void filters_set_genotype_string(filter_t *flt, bcf1_t *line, token_t *tok)
{
bcf_fmt_t *fmt = bcf_get_fmt(flt->hdr, line, "GT");
if ( !fmt )
{
- tok->nvalues = tok->nsamples = 0;
+ tok->nvalues = 0;
return;
}
- int i, blen = 4, nsmpl = bcf_hdr_nsamples(flt->hdr);
- kstring_t str;
+ int i, blen = 4, nsmpl = line->n_sample;
gt_length_too_big:
tok->str_value.l = 0;
for (i=0; i<nsmpl; i++)
{
- int plen = tok->str_value.l;
-
+ size_t plen = tok->str_value.l;
bcf_format_gt(fmt, i, &tok->str_value);
kputc_(0, &tok->str_value);
if ( tok->str_value.l - plen > blen )
plen++;
}
}
- tok->nsamples = nsmpl;
+ assert( tok->nsamples == nsmpl );
tok->nvalues = tok->str_value.l;
- tok->nstr1 = blen;
+ tok->nval1 = blen;
}
static void filters_set_ref_string(filter_t *flt, bcf1_t *line, token_t *tok)
{
}
else if ( tok->idx==-2 )
{
- int i, j = 0, end = tok->idxs[tok->nidxs-1] < 0 ? line->n_allele - 1 : tok->nidxs - 1;
+ int i, end = tok->nuidxs ? tok->nuidxs : line->n_allele - 1;
if ( end >= line->n_allele - 1 ) end = line->n_allele - 2;
for (i=0; i<=end; i++)
if ( i>=tok->nidxs || tok->idxs[i] )
}
}
-static void set_max(filter_t *flt, bcf1_t *line, token_t *tok)
+static int func_max(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
{
+ token_t *tok = stack[nstack - 1];
+ rtok->nvalues = 0;
+ if ( !tok->nvalues ) return 1;
double val = -HUGE_VAL;
- int i;
+ int i, has_value = 0;
for (i=0; i<tok->nvalues; i++)
{
- if ( !bcf_double_is_missing(tok->values[i]) && val < tok->values[i] ) val = tok->values[i];
+ if ( bcf_double_is_missing(tok->values[i]) || bcf_double_is_vector_end(tok->values[i]) ) continue;
+ has_value = 1;
+ if ( val < tok->values[i] ) val = tok->values[i];
+ }
+ if ( has_value )
+ {
+ rtok->values[0] = val;
+ rtok->nvalues = has_value;
}
- tok->values[0] = val;
- tok->nvalues = 1;
- tok->nsamples = 0;
+ return 1;
}
-static void set_min(filter_t *flt, bcf1_t *line, token_t *tok)
+static int func_min(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
{
+ token_t *tok = stack[nstack - 1];
+ rtok->nvalues = 0;
+ if ( !tok->nvalues ) return 1;
double val = HUGE_VAL;
- int i;
+ int i, has_value = 0;
for (i=0; i<tok->nvalues; i++)
- if ( !bcf_double_is_missing(tok->values[i]) && val > tok->values[i] ) val = tok->values[i];
- tok->values[0] = val;
- tok->nvalues = 1;
- tok->nsamples = 0;
+ {
+ if ( bcf_double_is_missing(tok->values[i]) || bcf_double_is_vector_end(tok->values[i]) ) continue;
+ has_value = 1;
+ if ( val > tok->values[i] ) val = tok->values[i];
+ }
+ if ( has_value )
+ {
+ rtok->values[0] = val;
+ rtok->nvalues = has_value;
+ }
+ return 1;
}
-static void set_avg(filter_t *flt, bcf1_t *line, token_t *tok)
+static int func_avg(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
{
+ token_t *tok = stack[nstack - 1];
+ rtok->nvalues = 0;
+ if ( !tok->nvalues ) return 1;
double val = 0;
int i, n = 0;
for (i=0; i<tok->nvalues; i++)
if ( !bcf_double_is_missing(tok->values[i]) ) { val += tok->values[i]; n++; }
- tok->values[0] = n ? val / n : 0;
- tok->nvalues = 1;
- tok->nsamples = 0;
+ if ( n )
+ {
+ rtok->values[0] = val / n;
+ rtok->nvalues = 1;
+ }
+ return 1;
}
-static void set_sum(filter_t *flt, bcf1_t *line, token_t *tok)
+static int func_sum(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
{
+ rtok->nvalues = 0;
+ token_t *tok = stack[nstack - 1];
+ if ( !tok->nvalues ) return 1;
double val = 0;
int i, n = 0;
for (i=0; i<tok->nvalues; i++)
if ( !bcf_double_is_missing(tok->values[i]) ) { val += tok->values[i]; n++; }
- tok->values[0] = val;
- tok->nvalues = 1;
- tok->nsamples = 0;
+ if ( n )
+ {
+ rtok->values[0] = val;
+ rtok->nvalues = 1;
+ }
+ return 1;
}
-static void set_abs(filter_t *flt, bcf1_t *line, token_t *tok)
+static int func_abs(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
{
+ token_t *tok = stack[nstack - 1];
if ( tok->is_str ) error("ABS() can be applied only on numeric values\n");
+
+ rtok->nvalues = tok->nvalues;
+ if ( !tok->nvalues ) return 1;
+ hts_expand(double, rtok->nvalues, rtok->mvalues, rtok->values);
int i;
for (i=0; i<tok->nvalues; i++)
- tok->values[i] = fabs(tok->values[i]);
+ if ( bcf_double_is_missing(tok->values[i]) ) bcf_double_set_missing(rtok->values[i]);
+ else rtok->values[i] = fabs(tok->values[i]);
+ return 1;
+}
+static int func_count(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
+{
+ token_t *tok = stack[nstack - 1];
+ if ( !tok->nsamples ) error("COUNT() can be applied only on FORMAT fields\n");
+
+ int i, cnt = 0;
+ for (i=0; i<tok->nsamples; i++)
+ if ( tok->pass_samples[i] ) cnt++;
+
+ rtok->nvalues = 1;
+ rtok->values[0] = cnt;
+ return 1;
}
-static void set_strlen(filter_t *flt, bcf1_t *line, token_t *tok)
+static int func_strlen(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
{
- tok->is_str = 0;
- if ( !tok->str_value.l ) return;
+ token_t *tok = stack[nstack - 1];
+ rtok->nvalues = rtok->str_value.l = 0;
+ if ( !tok->str_value.l ) return 1;
+
if ( tok->idx==-2 )
{
int i = 0;
{
char *se = ss;
while ( *se && *se!=',' ) se++;
- hts_expand(double, i+1, tok->mvalues, tok->values);
- if ( !*se ) tok->values[i] = strlen(ss);
+ hts_expand(double, i+1, rtok->mvalues, rtok->values);
+ if ( !*se ) rtok->values[i] = strlen(ss);
else
{
*se = 0;
- tok->values[i] = strlen(ss);
+ rtok->values[i] = strlen(ss);
*se = ',';
}
ss = *se ? se + 1 : se;
i++;
}
- tok->nvalues = i;
+ rtok->nvalues = i;
}
else
{
- tok->values[0] = strlen(tok->str_value.s);
- tok->nvalues = 1;
+ rtok->values[0] = strlen(tok->str_value.s);
+ rtok->nvalues = 1;
}
- tok->str_value.l = 0;
+ return 1;
+}
+inline static void tok_init_values(token_t *atok, token_t *btok, token_t *rtok)
+{
+ token_t *tok = atok->nvalues > btok->nvalues ? atok : btok;
+ rtok->nvalues = tok->nvalues;
+ rtok->nval1 = tok->nval1;
+ hts_expand(double*, rtok->nvalues, rtok->mvalues, rtok->values);
}
-#define VECTOR_ARITHMETICS(atok,btok,AOP) \
+inline static void tok_init_samples(token_t *atok, token_t *btok, token_t *rtok)
+{
+ if ( (atok->nsamples || btok->nsamples) && !rtok->nsamples )
+ {
+ rtok->nsamples = atok->nsamples ? atok->nsamples : btok->nsamples;
+ rtok->usmpl = (uint8_t*) calloc(rtok->nsamples,1);
+ int i;
+ for (i=0; i<atok->nsamples; i++) rtok->usmpl[i] |= atok->usmpl[i];
+ for (i=0; i<btok->nsamples; i++) rtok->usmpl[i] |= btok->usmpl[i];
+ }
+ memset(rtok->pass_samples, 0, rtok->nsamples);
+}
+
+#define VECTOR_ARITHMETICS(atok,btok,_rtok,AOP) \
{ \
+ token_t *rtok = _rtok; \
int i, has_values = 0; \
- if ( !(atok)->nvalues || !(btok)->nvalues ) /* missing values */ \
+ if ( atok->nvalues && btok->nvalues ) \
{ \
- (atok)->nvalues = 0; (atok)->nsamples = 0; \
- } \
- else \
- { \
- if ( ((atok)->nsamples && (btok)->nsamples) || (!(atok)->nsamples && !(btok)->nsamples)) \
- { \
- for (i=0; i<(atok)->nvalues; i++) \
- { \
- if ( bcf_double_is_missing((atok)->values[i]) ) continue; \
- if ( bcf_double_is_missing((btok)->values[i]) ) { bcf_double_set_missing((atok)->values[i]); continue; } \
- has_values = 1; \
- (atok)->values[i] = (atok)->values[i] AOP (btok)->values[i]; \
- } \
- } \
- else if ( (btok)->nsamples ) \
+ tok_init_values(atok, btok, rtok); \
+ tok_init_samples(atok, btok, rtok); \
+ if ( (atok->nsamples && btok->nsamples) || (!atok->nsamples && !btok->nsamples)) \
{ \
- hts_expand(double,(btok)->nvalues,(atok)->mvalues,(atok)->values); \
- for (i=0; i<(btok)->nvalues; i++) \
+ assert( atok->nsamples==btok->nsamples ); \
+ for (i=0; i<atok->nvalues; i++) \
{ \
- if ( bcf_double_is_missing((atok)->values[0]) || bcf_double_is_missing((btok)->values[i]) ) \
+ if ( bcf_double_is_missing(atok->values[i]) || bcf_double_is_missing(btok->values[i]) ) \
{ \
- bcf_double_set_missing((atok)->values[i]); \
+ bcf_double_set_missing(rtok->values[i]); \
continue; \
} \
has_values = 1; \
- (atok)->values[i] = (atok)->values[0] AOP (btok)->values[i]; \
+ rtok->values[i] = atok->values[i] AOP btok->values[i]; \
} \
- (atok)->nvalues = (btok)->nvalues; \
- (atok)->nsamples = (btok)->nsamples; \
} \
- else if ( (atok)->nsamples ) \
+ else \
{ \
- for (i=0; i<(atok)->nvalues; i++) \
+ token_t *xtok = atok->nsamples ? atok : btok; \
+ token_t *ytok = atok->nsamples ? btok : atok; \
+ assert( ytok->nvalues==1 ); \
+ if ( !bcf_double_is_missing(ytok->values[0]) ) \
{ \
- if ( bcf_double_is_missing((atok)->values[i]) || bcf_double_is_missing((btok)->values[0]) ) \
+ for (i=0; i<xtok->nvalues; i++) \
{ \
- bcf_double_set_missing((atok)->values[i]); \
- continue; \
+ if ( bcf_double_is_missing(xtok->values[i]) ) \
+ { \
+ bcf_double_set_missing(rtok->values[i]); \
+ continue; \
+ } \
+ has_values = 1; \
+ rtok->values[i] = xtok->values[i] AOP ytok->values[0]; \
} \
- has_values = 1; \
- (atok)->values[i] = (atok)->values[i] AOP (btok)->values[0]; \
} \
} \
} \
- if ( !has_values ) { (atok)->nvalues = 0; (atok)->nsamples = 0; } \
+ if ( !has_values ) rtok->nvalues = 0; \
}
-static int vector_logic_and(token_t *atok, token_t *btok, int and_type)
+static int vector_logic_or(filter_t *filter, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
{
- // We are comparing either two scalars (result of INFO tag vs a threshold), two vectors (two FORMAT fields),
- // or a vector and a scalar (FORMAT field vs threshold)
- int i, pass_site = 0;
- if ( !atok->nvalues || !btok->nvalues )
- {
- atok->nvalues = atok->nsamples = 0;
- return 0;
- }
- if ( !atok->nsamples && !btok->nsamples ) return atok->pass_site && btok->pass_site;
- if ( atok->nsamples && btok->nsamples )
+ token_t *atok = stack[nstack-2];
+ token_t *btok = stack[nstack-1];
+ tok_init_samples(atok, btok, rtok);
+
+ if ( !atok->pass_site && !btok->pass_site ) return 2;
+
+ rtok->pass_site = 1;
+ if ( !atok->nsamples && !btok->nsamples ) return 2;
+
+ int i;
+ if ( rtok->tok_type==TOK_OR_VEC ) // ||, select all samples if one is true
{
- if ( and_type==TOK_AND )
+ if ( (!atok->nsamples && !atok->pass_site) || (!btok->nsamples && !btok->pass_site) )
{
- // perform AND within a sample
- for (i=0; i<atok->nsamples; i++)
+ // These two conditions are to ensure the following does not set all samples
+ // at sites with QUAL<=30:
+ // QUAL>30 || FMT/GQ>30
+
+ token_t *tok = atok->nsamples ? atok : btok;
+ for (i=0; i<rtok->nsamples; i++)
{
- atok->pass_samples[i] = atok->pass_samples[i] && btok->pass_samples[i];
- if ( !pass_site && atok->pass_samples[i] ) pass_site = 1;
+ if ( !rtok->usmpl[i] ) continue;
+ rtok->pass_samples[i] = tok->pass_samples[i];
}
}
else
{
- // perform AND across samples
- int pass_a = 0, pass_b = 0;
- for (i=0; i<atok->nsamples; i++)
- {
- if ( atok->pass_samples[i] ) pass_a = 1;
- atok->pass_samples[i] = atok->pass_samples[i] && btok->pass_samples[i];
- }
- for (i=0; i<btok->nsamples; i++)
+ for (i=0; i<rtok->nsamples; i++)
{
- if ( btok->pass_samples[i] ) { pass_b = 1; break; }
+ if ( !rtok->usmpl[i] ) continue;
+ rtok->pass_samples[i] = 1;
}
- pass_site = pass_a && pass_b;
}
- return pass_site;
+ return 2;
}
- if ( btok->nsamples )
+
+ // |, only select samples which are actually true
+
+ if ( !atok->nsamples || !btok->nsamples )
{
- for (i=0; i<btok->nsamples; i++)
+ token_t *tok = atok->nsamples ? atok : btok;
+ for (i=0; i<rtok->nsamples; i++)
{
- atok->pass_samples[i] = atok->pass_site && btok->pass_samples[i];
- if ( !pass_site && atok->pass_samples[i] ) pass_site = 1;
+ if ( !rtok->usmpl[i] ) continue;
+ rtok->pass_samples[i] = tok->pass_samples[i];
}
- atok->nsamples = btok->nsamples;
- return pass_site;
+ return 2;
}
- /* atok->nsamples!=0 */
- for (i=0; i<atok->nsamples; i++)
+
+ assert( atok->nsamples==btok->nsamples );
+
+ for (i=0; i<rtok->nsamples; i++)
{
- atok->pass_samples[i] = atok->pass_samples[i] && btok->pass_site;
- if ( !pass_site && atok->pass_samples[i] ) pass_site = 1;
+ if ( !rtok->usmpl[i] ) continue;
+ rtok->pass_samples[i] = atok->pass_samples[i] | btok->pass_samples[i];
}
- return pass_site;
+ return 2;
}
-static int vector_logic_or(token_t *atok, token_t *btok, int or_type)
+static int vector_logic_and(filter_t *filter, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
{
- int i, pass_site = 0;
- if ( !atok->nvalues && !btok->nvalues ) // missing sites in both
- {
- atok->nvalues = atok->nsamples = 0;
- return 0;
- }
- if ( !atok->nvalues ) // missing value in a
- {
- for (i=0; i<btok->nsamples; i++)
- atok->pass_samples[i] = btok->pass_samples[i];
- atok->nsamples = btok->nsamples;
- atok->nvalues = 1;
- return btok->pass_site;
- }
- if ( !btok->nvalues ) // missing value in b
- {
- btok->nvalues = 1;
- return atok->pass_site;
- }
+ token_t *atok = stack[nstack-2];
+ token_t *btok = stack[nstack-1];
+ tok_init_samples(atok, btok, rtok);
- if ( !atok->nsamples && !btok->nsamples ) return atok->pass_site || btok->pass_site;
- if ( !atok->nsamples )
+ if ( !atok->pass_site || !btok->pass_site ) return 2;
+ if ( !atok->nsamples && !btok->nsamples ) { rtok->pass_site = 1; return 2; }
+
+ int i;
+ if ( !atok->nsamples || !btok->nsamples )
{
- if ( or_type==TOK_OR )
+ token_t *tok = atok->nsamples ? atok : btok;
+ for (i=0; i<rtok->nsamples; i++)
{
- for (i=0; i<btok->nsamples; i++)
- {
- atok->pass_samples[i] = btok->pass_samples[i];
- if ( atok->pass_site || atok->pass_samples[i] ) pass_site = 1;
- }
+ if ( !rtok->usmpl[i] ) continue;
+ rtok->pass_samples[i] = tok->pass_samples[i];
}
- else
- {
- for (i=0; i<btok->nsamples; i++)
- {
- atok->pass_samples[i] = atok->pass_site || btok->pass_samples[i];
- if ( atok->pass_samples[i] ) pass_site = 1;
- }
- }
- atok->nsamples = btok->nsamples;
- return pass_site;
+ rtok->pass_site = 1;
+ return 2;
}
- if ( !btok->nsamples ) // vector vs site
+
+ assert( atok->nsamples==btok->nsamples );
+ if ( rtok->tok_type==TOK_AND_VEC ) // &&, can be true in different samples
{
- if ( or_type==TOK_OR )
+ for (i=0; i<rtok->nsamples; i++)
{
- for (i=0; i<atok->nsamples; i++)
- if ( btok->pass_site || atok->pass_samples[i] ) pass_site = 1;
+ if ( !rtok->usmpl[i] ) continue;
+ rtok->pass_samples[i] = atok->pass_samples[i] | btok->pass_samples[i];
}
- else
- {
- for (i=0; i<atok->nsamples; i++)
- {
- atok->pass_samples[i] = atok->pass_samples[i] || btok->pass_site;
- if ( atok->pass_samples[i] ) pass_site = 1;
- }
- }
- return pass_site;
+ rtok->pass_site = 1;
}
- for (i=0; i<atok->nsamples; i++)
+ else // &, must be true within one sample
{
- atok->pass_samples[i] = atok->pass_samples[i] || btok->pass_samples[i];
- if ( !pass_site && atok->pass_samples[i] ) pass_site = 1;
+ for (i=0; i<rtok->nsamples; i++)
+ {
+ if ( !rtok->usmpl[i] ) continue;
+ rtok->pass_samples[i] = atok->pass_samples[i] & btok->pass_samples[i];
+ if ( rtok->pass_samples[i] ) rtok->pass_site = 1;
+ }
}
- return pass_site;
+ return 2;
}
#define CMP_MISSING(atok,btok,CMP_OP,ret) \
tok->nvalues = 1; \
}
-#define CMP_VECTORS(atok,btok,CMP_OP,ret) \
+#define CMP_VECTORS(atok,btok,_rtok,CMP_OP,missing_logic) \
{ \
- int i, j, has_values = 0, pass_site = 0; \
- if ( !(atok)->nvalues || !(btok)->nvalues ) { (atok)->nvalues = 0; (atok)->nsamples = 0; (ret) = 0; } \
- else \
+ token_t *rtok = _rtok; \
+ int i, j, k; \
+ assert( !atok->nsamples || !btok->nsamples ); \
+ tok_init_samples(atok, btok, rtok); \
+ if ( !atok->nsamples && !btok->nsamples ) \
{ \
- if ( (atok)->idx<=-2 || (btok)->idx<=-2 ) \
+ if ( !atok->nvalues && !btok->nvalues ) { rtok->pass_site = missing_logic[2]; } \
+ else if ( !atok->nvalues || !btok->nvalues ) \
{ \
- /* any field can match: [*] */ \
- for (i=0; i<(atok)->nvalues; i++) \
+ token_t *tok = atok->nvalues ? atok : btok; \
+ for (j=0; j<tok->nvalues; j++) \
{ \
- for (j=0; j<(btok)->nvalues; j++) \
- if ( (atok)->values[i] CMP_OP (btok)->values[j] ) { pass_site = 1; i = (atok)->nvalues; break; } \
+ if ( bcf_double_is_missing(tok->values[j]) ) \
+ { \
+ if ( missing_logic[2] ) { rtok->pass_site = 1; break; } \
+ } \
+ else if ( missing_logic[1] ) { rtok->pass_site = 1; break; } \
} \
} \
- else if ( (atok)->nsamples && (btok)->nsamples ) \
+ else \
{ \
- for (i=0; i<(atok)->nsamples; i++) \
+ for (i=0; i<atok->nvalues; i++) \
{ \
- has_values = 1; \
- if ( (atok)->values[i] CMP_OP (btok)->values[i] ) { (atok)->pass_samples[i] = 1; pass_site = 1; } \
- else (atok)->pass_samples[i] = 0; \
+ int amiss = bcf_double_is_missing(atok->values[i]) ? 1 : 0; \
+ for (j=0; j<btok->nvalues; j++) \
+ { \
+ int nmiss = amiss + (bcf_double_is_missing(btok->values[j]) ? 1 : 0); \
+ if ( nmiss ) \
+ { \
+ if ( missing_logic[nmiss] ) { rtok->pass_site = 1; i = atok->nvalues; break; } \
+ } \
+ else if ( atok->values[i] CMP_OP btok->values[j] ) { rtok->pass_site = 1; i = atok->nvalues; break; } \
+ } \
} \
- if ( !has_values ) (atok)->nvalues = 0; \
} \
- else if ( (atok)->nsamples ) \
+ } \
+ else \
+ { \
+ if ( !atok->nvalues && !btok->nvalues ) \
{ \
- for (i=0; i<(atok)->nsamples; i++) \
+ if ( missing_logic[2] ) \
{ \
- /*if ( bcf_double_is_missing((atok)->values[i]) ) { (atok)->pass_samples[i] = 0; continue; }*/ \
- has_values = 1; \
- if ( (atok)->values[i] CMP_OP (btok)->values[0] ) { (atok)->pass_samples[i] = 1; pass_site = 1; } \
- else (atok)->pass_samples[i] = 0; \
+ for (i=0; i<rtok->nsamples; i++) \
+ if ( rtok->usmpl[i] ) { rtok->pass_samples[i] = missing_logic[2]; rtok->pass_site = 1; } \
} \
- if ( !has_values ) (atok)->nvalues = 0; \
} \
- else if ( (btok)->nsamples ) \
+ else if ( !atok->nvalues || !btok->nvalues ) \
{ \
- for (i=0; i<(btok)->nsamples; i++) \
+ token_t *tok = atok->nvalues ? atok : btok; \
+ if ( !tok->nsamples ) \
{ \
- if ( bcf_double_is_missing((btok)->values[i]) ) { (atok)->pass_samples[i] = 0; continue; } \
- has_values = 1; \
- if ( (atok)->values[0] CMP_OP (btok)->values[i] ) { (atok)->pass_samples[i] = 1; pass_site = 1; } \
- else (atok)->pass_samples[i] = 0; \
+ int miss = 0; \
+ for (j=0; j<tok->nvalues; j++) \
+ miss |= bcf_double_is_missing(tok->values[j]) ? 1 : 0; \
+ if ( missing_logic[++miss] ) \
+ { \
+ for (i=0; i<rtok->nsamples; i++) \
+ if ( rtok->usmpl[i] ) { rtok->pass_samples[i] = missing_logic[miss]; rtok->pass_site = 1; } \
+ } \
} \
- (atok)->nvalues = (btok)->nvalues; \
- (atok)->nsamples = (btok)->nsamples; \
- if ( !has_values ) (atok)->nvalues = 0; \
+ else \
+ for (i=0; i<tok->nsamples; i++) \
+ { \
+ if ( !rtok->usmpl[i] ) continue; \
+ double *ptr = tok->values + i*tok->nval1; \
+ int miss = 0; \
+ for (j=0; j<tok->nval1; j++) \
+ miss |= bcf_double_is_missing(ptr[j]) ? 1 : 0; \
+ if ( missing_logic[++miss] ) { rtok->pass_samples[i] = missing_logic[miss]; rtok->pass_site = 1; } \
+ } \
} \
else \
{ \
- if ( (atok)->values[0] CMP_OP (btok)->values[0] ) { pass_site = 1; } \
+ token_t *xtok = atok->nsamples ? atok : btok; \
+ token_t *ytok = atok->nsamples ? btok : atok; \
+ for (i=0; i<xtok->nsamples; i++) \
+ { \
+ if ( !rtok->usmpl[i] ) continue; \
+ double *xptr = xtok->values + i*xtok->nval1; \
+ double *yptr = ytok->values + i*ytok->nval1; \
+ for (j=0; j<xtok->nval1; j++) \
+ { \
+ int miss = bcf_double_is_missing(xptr[j]) ? 1 : 0; \
+ if ( miss && !missing_logic[0] ) continue; /* any is missing => result is false */ \
+ for (k=0; k<ytok->nvalues; k++) \
+ { \
+ int nmiss = miss + (bcf_double_is_missing(yptr[k]) ? 1 : 0); \
+ if ( nmiss ) \
+ { \
+ if ( missing_logic[nmiss] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; j = xtok->nval1; break; } \
+ } \
+ else if ( xptr[j] CMP_OP yptr[k] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; j = xtok->nval1; break; } \
+ } \
+ } \
+ } \
} \
- /*fprintf(bcftools_stderr,"pass=%d\n", pass_site);*/ \
- (ret) = pass_site; \
} \
}
-static int cmp_vector_strings(token_t *atok, token_t *btok, int logic) // logic: TOK_EQ or TOK_NE
+static int _regex_vector_strings(regex_t *regex, char *str, size_t len, int logic, int *missing_logic)
{
- if ( !atok->str_value.l ) { return 0; }
- if ( !btok->str_value.l ) { atok->str_value.l = 0; return 0; }
- int i, pass_site = 0;
- if ( atok->nsamples && atok->nsamples==btok->nsamples )
- {
- for (i=0; i<atok->nsamples; i++)
- {
- char *astr = atok->str_value.s + i*atok->nstr1;
- char *bstr = btok->str_value.s + i*btok->nstr1;
- char *aend = astr + atok->str_value.l, *a = astr;
- while ( a<aend && *a ) a++;
- char *bend = bstr + btok->str_value.l, *b = bstr;
- while ( b<bend && *b ) b++;
- if ( a-astr != b-bstr ) atok->pass_samples[i] = 0;
- else atok->pass_samples[i] = strncmp(astr,bstr,a-astr)==0 ? 1 : 0;
- if ( logic!=TOK_EQ )
- atok->pass_samples[i] = atok->pass_samples[i] ? 0 : 1;
- pass_site |= atok->pass_samples[i];
- }
- if ( !atok->nsamples ) atok->nsamples = btok->nsamples;
- }
- else if ( !atok->nsamples && !btok->nsamples )
- {
- if ( atok->idx==-2 || btok->idx==-2 )
- {
- // any field can match: [*]
- if ( atok->idx==-2 && btok->idx==-2 )
- error("fixme: Expected at least one scalar value [%s %s %s]\n", atok->tag ? atok->tag : btok->tag, atok->str_value,btok->str_value);
- token_t *xtok, *ytok; // xtok is scalar, ytok array
- if ( btok->idx==-2 ) { xtok = atok; ytok = btok; }
- else { xtok = btok; ytok = atok; }
- char *xstr = xtok->str_value.s, *xend = xstr + xtok->str_value.l;
- char *ystr = ytok->str_value.s, *yend = ystr + ytok->str_value.l, *y = ystr;
- while ( y<=yend )
- {
- if ( y==yend || *y==',' )
- {
- if ( y-ystr==xend-xstr && !strncmp(xstr,ystr,xend-xstr) )
- {
- pass_site = 1;
- break;
- }
- ystr = y+1;
- }
- y++;
- }
- }
- else
- pass_site = strcmp(atok->str_value.s,btok->str_value.s) ? 0 : 1;
- if ( logic!=TOK_EQ ) pass_site = pass_site ? 0 : 1;
+ char *end = str + len;
+ while ( str < end && *str )
+ {
+ char *mid = str;
+ while ( mid < end && *mid && *mid!=',' ) mid++;
+ int miss = mid - str == 1 && str[0]=='.' ? 1 : 0;
+ if ( miss && missing_logic[miss] ) return 1;
+ char tmp = *mid; *mid = 0;
+ int match = regexec(regex, str, 0,NULL,0) ? 0 : 1;
+ *mid = tmp;
+ if ( logic==TOK_NLIKE ) match = match ? 0 : 1;
+ if ( match ) return 1;
+ if ( !*mid ) break;
+ str = mid + 1;
}
- else
+ return 0;
+}
+static inline int _has_missing_string(char *beg)
+{
+ while ( *beg )
{
- token_t *xtok, *ytok;
- if ( !atok->nsamples ) { xtok = atok; ytok = btok; }
- else { xtok = btok; ytok = atok; }
- char *xstr = xtok->str_value.s;
- char *xend = xstr + xtok->str_value.l, *x = xstr;
- while ( x<xend && *x ) x++;
- for (i=0; i<ytok->nsamples; i++)
- {
- char *ystr = ytok->str_value.s + i*ytok->nstr1;
- char *ybeg = ystr, *yend = ystr + ytok->nstr1;
- int pass = 0;
- while ( ybeg < yend )
+ char *end = beg;
+ while ( *end && *end!=',' ) end++;
+ if ( end-beg==1 && beg[0]=='.' ) return 1;
+ if ( !*end ) break;
+ beg = end + 1;
+ }
+ return 0;
+}
+
+// Compare two strings with multiple fields, for example "A,B,.,C"=="X,Y,A".
+// Returns 1 if any field matches, otherwise returns 0
+static inline int _match_vector_strings(char *abeg, size_t alen, char *bstr, size_t blen, int logic, int *missing_logic)
+{
+ char *aend = abeg + alen;
+ char *bend = bstr + blen;
+ while ( abeg < aend && *abeg )
+ {
+ char *amid = abeg;
+ while ( amid < aend && *amid && *amid!=',' ) amid++;
+ int miss = amid - abeg == 1 && abeg[0]=='.' ? 1 : 0;
+ char *bbeg = bstr;
+ while ( bbeg < bend && *bbeg )
+ {
+ char *bmid = bbeg;
+ while ( bmid < bend && *bmid && *bmid!=',' ) bmid++;
+ int nmiss = miss + (bmid - bbeg == 1 && bbeg[0]=='.' ? 1 : 0);
+ if ( nmiss )
{
- char *y = ybeg;
- while ( y<yend && *y && *y!=',' ) y++;
- if ( y - ybeg != x - xstr ) pass = 0;
- else pass = strncmp(xstr,ybeg,x-xstr)==0 ? 1 : 0;
- if ( logic!=TOK_EQ ) pass = pass ? 0 : 1;
- if ( pass || !*y ) break;
- ybeg = y+1;
+ if ( missing_logic[nmiss] ) return 1;
}
- atok->pass_samples[i] = pass;
- pass_site |= pass;
+ else
+ {
+ int match = amid-abeg==bmid-bbeg && !strncmp(abeg,bbeg,amid-abeg) ? 1 : 0;
+ if ( logic==TOK_NE ) match = match==1 ? 0 : 1;
+ if ( match ) return 1;
+ }
+ if ( !*bmid ) break;
+ bbeg = bmid + 1;
}
- if ( !atok->nsamples )
- atok->nvalues = atok->nsamples = btok->nsamples; // is it a bug? not sure if atok->nvalues should be set
+ if ( !*amid ) break;
+ abeg = amid + 1;
}
- return pass_site;
+ return 0;
}
-static int regex_vector_strings(token_t *atok, token_t *btok, int negate)
+static void cmp_vector_strings(token_t *atok, token_t *btok, token_t *rtok)
{
- int i, pass_site = 0;
- if ( atok->nsamples )
+ tok_init_samples(atok, btok, rtok);
+
+ int i, logic = rtok->tok_type; // TOK_EQ, TOK_NE, TOK_LIKE, TOK_NLIKE
+ regex_t *regex = atok->regex ? atok->regex : (btok->regex ? btok->regex : NULL);
+
+ assert( atok->nvalues==atok->str_value.l && btok->nvalues==btok->str_value.l );
+ assert( !atok->nsamples || !btok->nsamples );
+ assert( (!regex && (logic==TOK_EQ || logic==TOK_NE)) || (regex && (logic==TOK_LIKE || logic==TOK_NLIKE)) );
+
+ int missing_logic[] = {0,0,0};
+ if ( logic==TOK_EQ || logic==TOK_LIKE ) missing_logic[0] = missing_logic[2] = 1;
+ else if ( logic==TOK_NE || logic==TOK_NLIKE ) missing_logic[0] = missing_logic[1] = 1;
+
+ if ( !atok->nsamples && !btok->nsamples )
+ {
+ if ( !atok->nvalues && !btok->nvalues ) { rtok->pass_site = missing_logic[2]; return; }
+ if ( !atok->nvalues || !btok->nvalues )
+ {
+ int miss = _has_missing_string(atok->nvalues ? atok->str_value.s : btok->str_value.s);
+ if ( missing_logic[miss+1] ) rtok->pass_site = 1;
+ return;
+ }
+ if ( !regex )
+ rtok->pass_site = _match_vector_strings(atok->str_value.s, atok->str_value.l, btok->str_value.s, btok->str_value.l, logic, missing_logic);
+ else
+ {
+ token_t *tok = atok->regex ? btok : atok;
+ rtok->pass_site = _regex_vector_strings(regex, tok->str_value.s, tok->str_value.l, logic, missing_logic);
+ }
+ return;
+ }
+
+ // The case of (!atok->nsamples || !btok->nsamples)
+
+ if ( !atok->nvalues && !btok->nvalues )
{
- for (i=0; i<atok->nsamples; i++)
+ if ( missing_logic[2] )
{
- char *ptr = atok->str_value.s + i*atok->nstr1;
- atok->pass_samples[i] = regexec(btok->regex, ptr, 0,NULL,0) ? 0 : 1;
- if ( negate ) atok->pass_samples[i] = atok->pass_samples[i] ? 0 : 1;
- pass_site |= atok->pass_samples[i];
+ for (i=0; i<rtok->nsamples; i++)
+ if ( rtok->usmpl[i] ) { rtok->pass_samples[i] = missing_logic[2]; rtok->pass_site = 1; }
}
- return pass_site;
+ return;
+ }
+ if ( !atok->nvalues || !btok->nvalues )
+ {
+ token_t *tok = atok->nvalues ? atok : btok;
+ if ( !tok->nsamples )
+ {
+ int miss = _has_missing_string(tok->str_value.s);
+ if ( !missing_logic[miss+1] ) return;
+ for (i=0; i<rtok->nsamples; i++)
+ if ( rtok->usmpl[i] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; }
+ }
+ else
+ for (i=0; i<tok->nsamples; i++)
+ {
+ if ( !rtok->usmpl[i] ) continue;
+ int miss = _has_missing_string(tok->str_value.s + i*tok->nval1);
+ if ( missing_logic[miss+1] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; }
+ }
+ return;
+ }
+
+ // The case of (!atok->nsamples || !btok->nsamples) && (atok->nvalues && btok->nvalues)
+ token_t *xtok = atok->nsamples ? atok : btok;
+ token_t *ytok = atok->nsamples ? btok : atok;
+ assert( regex==ytok->regex );
+ for (i=0; i<xtok->nsamples; i++)
+ {
+ if ( !rtok->usmpl[i] ) continue;
+ int match;
+ if ( regex )
+ match = _regex_vector_strings(regex, xtok->str_value.s + i*xtok->nval1, xtok->nval1, logic, missing_logic);
+ else
+ match = _match_vector_strings(xtok->str_value.s + i*xtok->nval1, xtok->nval1, ytok->str_value.s, ytok->str_value.l, logic, missing_logic);
+ if ( match ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; }
}
- pass_site = regexec(btok->regex, atok->str_value.s, 0,NULL,0) ? 0 : 1;
- if ( negate ) pass_site = pass_site ? 0 : 1;
- return pass_site;
}
-static void parse_tag_idx(char *tag, char *tag_idx, token_t *tok) // tag_idx points just after "TAG["
+static int parse_idxs(char *tag_idx, int **idxs, int *nidxs, int *idx)
{
- // TAG[*] .. any field
- if ( !strncmp("*]", tag_idx, 3) )
+ // TAG[], TAG[*] .. any field; sets idx=-2, idxs[0]=-1
+ if ( *tag_idx==0 || !strcmp("*", tag_idx) )
{
- tok->idxs = (int*) malloc(sizeof(int));
- tok->idxs[0] = -1;
- tok->nidxs = 1;
- tok->idx = -2;
- return;
+ *idxs = (int*) malloc(sizeof(int));
+ (*idxs)[0] = -1;
+ *nidxs = 1;
+ *idx = -2;
+ return 0;
}
- // TAG[integer] .. one field
+ // TAG[integer] .. one field; idx positive
char *end, *beg = tag_idx;
- tok->idx = strtol(tag_idx, &end, 10);
- if ( tok->idx >= 0 && *end==']' ) return;
+ *idx = strtol(tag_idx, &end, 10);
+ if ( *idx >= 0 && *end==0 ) return 0;
-
- // TAG[0,1] or TAG[0-2] or [1-] etc
+ // TAG[0,1] or TAG[0-2] or [1-] etc; idx=-2, idxs[...]=0,0,1,1,..
int i, ibeg = -1;
- while ( *beg && *beg!=']' )
+ while ( *beg )
{
- int idx = strtol(beg, &end, 10);
+ int num = strtol(beg, &end, 10);
if ( end[0]==',' ) beg = end + 1;
- else if ( end[0]==']' ) beg = end;
- else if ( end[0]=='-' ) { beg = end + 1; ibeg = idx; continue; }
- else error("Could not parse the index: %s[%s\n", tag, tag_idx+1);
- if ( idx >= tok->nidxs )
+ else if ( end[0]==0 ) beg = end;
+ else if ( end[0]=='-' ) { beg = end + 1; ibeg = num; continue; }
+ else return -1;
+ if ( num >= *nidxs )
{
- tok->idxs = (int*) realloc(tok->idxs, sizeof(int)*(idx+1));
- memset(tok->idxs + tok->nidxs, 0, sizeof(int)*(idx - tok->nidxs + 1));
- tok->nidxs = idx + 1;
+ *idxs = (int*) realloc(*idxs, sizeof(int)*(num+1));
+ memset(*idxs + *nidxs, 0, sizeof(int)*(num - *nidxs + 1));
+ *nidxs = num + 1;
}
if ( ibeg>=0 )
{
- for (i=ibeg; i<=idx; i++) tok->idxs[i] = 1;
+ for (i=ibeg; i<=num; i++) (*idxs)[i] = 1;
ibeg = -1;
}
- tok->idxs[idx] = 1;
+ (*idxs)[num] = 1;
}
if ( ibeg >=0 )
{
- if ( ibeg >= tok->nidxs )
+ if ( ibeg >= *nidxs )
{
- tok->idxs = (int*) realloc(tok->idxs, sizeof(int)*(ibeg+1));
- memset(tok->idxs + tok->nidxs, 0, sizeof(int)*(ibeg - tok->nidxs + 1));
- tok->nidxs = ibeg + 1;
+ *idxs = (int*) realloc(*idxs, sizeof(int)*(ibeg+1));
+ memset(*idxs + *nidxs, 0, sizeof(int)*(ibeg - *nidxs + 1));
+ *nidxs = ibeg + 1;
+ }
+ (*idxs)[ibeg] = -1;
+ }
+ *idx = -2;
+ return 0;
+}
+
+static void parse_tag_idx(bcf_hdr_t *hdr, int is_fmt, char *tag, char *tag_idx, token_t *tok) // tag_idx points just after "TAG["
+{
+ int i, len = strlen(tag_idx);
+ if ( tag_idx[len-1] == ']' ) tag_idx[len-1] = 0;
+ char *ori = strdup(tag_idx);
+
+ assert( !tok->idxs && !tok->usmpl );
+ int *idxs1 = NULL, nidxs1 = 0, idx1 = 0;
+ int *idxs2 = NULL, nidxs2 = 0, idx2 = 0;
+
+ int set_samples = 0;
+ char *colon = index(tag_idx, ':');
+ if ( colon )
+ {
+ *colon = 0;
+ if ( parse_idxs(tag_idx, &idxs1, &nidxs1, &idx1) != 0 ) error("Could not parse the index: %s\n", ori);
+ if ( parse_idxs(colon+1, &idxs2, &nidxs2, &idx2) != 0 ) error("Could not parse the index: %s\n", ori);
+ tok->idxs = idxs2;
+ tok->nidxs = nidxs2;
+ tok->idx = idx2;
+ set_samples = 1;
+ }
+ else
+ {
+ if ( parse_idxs(tag_idx, &idxs1, &nidxs1, &idx1) != 0 ) error("Could not parse the index: %s\n", ori);
+ if ( is_fmt )
+ {
+ if ( nidxs1==1 && idxs1[0]==-1 )
+ {
+ tok->idxs = (int*) malloc(sizeof(int));
+ tok->idxs[0] = -1;
+ tok->nidxs = 1;
+ tok->idx = -2;
+ }
+ else if ( bcf_hdr_id2number(hdr,BCF_HL_FMT,tok->hdr_id)!=1 )
+ error("The FORMAT tag %s can have multiple subfields, run as %s[sample:subfield]\n", tag,tag);
+ else
+ tok->idx = 0;
+ set_samples = 1;
+ }
+ else
+ {
+ tok->idxs = idxs1;
+ tok->nidxs = nidxs1;
+ tok->idx = idx1;
+ }
+ }
+
+ if ( set_samples )
+ {
+ tok->nsamples = bcf_hdr_nsamples(hdr);
+ tok->usmpl = (uint8_t*) calloc(tok->nsamples,1);
+ if ( idx1>=0 )
+ {
+ if ( idx1 >= bcf_hdr_nsamples(hdr) ) error("The sample index is too large: %s\n", ori);
+ tok->usmpl[idx1] = 1;
+ }
+ else if ( idx1==-2 )
+ {
+ for (i=0; i<nidxs1; i++)
+ {
+ if ( idxs1[i]==0 ) continue;
+ if ( idxs1[i]==-1 ) break;
+ if ( i >= bcf_hdr_nsamples(hdr) ) error("The sample index is too large: %s\n", ori);
+ tok->usmpl[i] = 1;
+ }
+ if ( nidxs1 && idxs1[nidxs1-1]==-1 ) // open range, such as "7-"
+ {
+ for (; i<tok->nsamples; i++) tok->usmpl[i] = 1;
+ }
}
- tok->idxs[ibeg] = -1;
+ else error("todo: %s:%d .. %d\n", __FILE__,__LINE__, idx2);
+ free(idxs1);
+ }
+ free(ori);
+
+ if ( tok->nidxs && tok->idxs[tok->nidxs-1]!=-1 )
+ {
+ for (i=0; i<tok->nidxs; i++) if ( tok->idxs[i] ) tok->nuidxs++;
}
- tok->idx = -2;
}
static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
{
tok->tok_type = TOK_VAL;
tok->hdr_id = -1;
tok->pass_site = -1;
- tok->idx = -1;
+ tok->idx = 0;
// is this a string constant?
if ( str[0]=='"' || str[0]=='\'' )
tok->setter = &filters_set_alt_string;
tok->is_str = 1;
tok->tag = strdup("ALT");
+ tok->idxs = (int*) malloc(sizeof(int));
+ tok->idxs[0] = -1;
+ tok->nidxs = 1;
+ tok->idx = -2;
return 0;
}
else if ( !strncasecmp(str,"N_ALT",len) )
int i;
for (i=0; i<tmp.l; i++)
if ( tmp.s[i]=='[' ) { tmp.s[i] = 0; is_array = i+1; break; }
- if ( is_array )
- parse_tag_idx(tmp.s, tmp.s+is_array, tok);
}
tok->hdr_id = bcf_hdr_id2int(filter->hdr,BCF_DT_ID,tmp.s);
if ( is_fmt==-1 )
}
if ( is_fmt==-1 ) is_fmt = 0;
}
+ if ( is_array )
+ parse_tag_idx(filter->hdr, is_fmt, tmp.s, tmp.s+is_array, tok);
+ else if ( is_fmt && !tok->nsamples )
+ {
+ int i;
+ tok->nsamples = bcf_hdr_nsamples(filter->hdr);
+ tok->usmpl = (uint8_t*) malloc(tok->nsamples);
+ for (i=0; i<tok->nsamples; i++) tok->usmpl[i] = 1;
+ }
+
tok->type = is_fmt ? BCF_HL_FMT : BCF_HL_INFO;
if ( is_fmt ) filter->max_unpack |= BCF_UN_FMT;
if ( tok->hdr_id>=0 )
if ( !bcf_hdr_idinfo_exists(filter->hdr,BCF_HL_FMT,tok->hdr_id) )
error("No such FORMAT field: %s\n", tmp.s);
if ( bcf_hdr_id2number(filter->hdr,BCF_HL_FMT,tok->hdr_id)!=1 && !is_array )
- error("Error: FORMAT vectors must be subscripted, e.g. %s[0] or %s[*]\n", tmp.s, tmp.s);
+ {
+ tok->idxs = (int*) malloc(sizeof(int));
+ tok->idxs[0] = -1;
+ tok->nidxs = 1;
+ tok->idx = -2;
+ }
switch ( bcf_hdr_id2type(filter->hdr,BCF_HL_FMT,tok->hdr_id) )
{
case BCF_HT_INT: tok->setter = &filters_set_format_int; break;
{
errno = 0;
tok->threshold = strtof(tmp.s, &end); // float?
- if ( errno!=0 || end!=tmp.s+len ) error("[%s:%d %s] Error: the tag \"INFO/%s\" is not defined in the VCF header\n", __FILE__,__LINE__,__FUNCTION__,tmp.s);
+ if ( errno!=0 || end!=tmp.s+len ) error("[%s:%d %s] Error: the tag \"%s\" is not defined in the VCF header\n", __FILE__,__LINE__,__FUNCTION__,tmp.s);
}
if ( tmp.s ) free(tmp.s);
int i;
for (i=0; i<nout; i++)
{
+ if ( out[i].tok_type==TOK_OR || out[i].tok_type==TOK_OR_VEC )
+ out[i].func = vector_logic_or;
+ if ( out[i].tok_type==TOK_AND || out[i].tok_type==TOK_AND_VEC )
+ out[i].func = vector_logic_and;
if ( out[i].tok_type==TOK_EQ || out[i].tok_type==TOK_NE )
{
// Look for j="." and k numeric type
int j = i-1, k = i-2;
if ( !out[j].is_str ) { k = i-1, j = i-2; }
- if ( out[k].hdr_id>0 && out[j].is_str && out[j].key && !strcmp(".",out[j].key) )
+ if ( out[j].is_str && out[j].key && !strcmp(".",out[j].key) )
{
- int type = bcf_hdr_id2type(filter->hdr,out[k].type,out[k].hdr_id);
- if ( type==BCF_HT_INT ) { out[j].is_str = 0; out[j].is_missing = 1; bcf_double_set_missing(out[j].values[0]); }
- if ( type==BCF_HT_REAL ) { out[j].is_str = 0; out[j].is_missing = 1; bcf_double_set_missing(out[j].values[0]); }
+ int set_missing = 0;
+ if ( out[k].hdr_id>0 )
+ {
+ int type = bcf_hdr_id2type(filter->hdr,out[k].type,out[k].hdr_id);
+ if ( type==BCF_HT_INT ) set_missing = 1;
+ else if ( type==BCF_HT_REAL ) set_missing = 1;
+ }
+ else if ( !strcmp("QUAL",out[k].tag) ) set_missing = 1;
+ if ( set_missing ) { out[j].is_str = 0; out[j].is_missing = 1; bcf_double_set_missing(out[j].values[0]); }
}
}
if ( out[i].tok_type==TOK_LIKE || out[i].tok_type==TOK_NLIKE )
else if ( out[i+2].tok_type==TOK_LIKE || out[i+2].tok_type==TOK_NLIKE ) ival = i + 1;
else error("[%s:%d %s] Could not parse the expression: %s\n", __FILE__,__LINE__,__FUNCTION__, filter->str);
+ if ( !out[ival].key ) error("Comparison between samples is not supported, sorry!\n");
+
// assign correct setters and unify expressions, eg ar->ra, HOM->hom, etc
if ( !strcasecmp(out[ival].key,"hom") ) { out[i].setter = filters_set_genotype3; str_to_lower(out[ival].key); }
else if ( !strcasecmp(out[ival].key,"het") ) { out[i].setter = filters_set_genotype3; str_to_lower(out[ival].key); }
else if ( !strcasecmp(out[ival].key,"hap") ) { out[i].setter = filters_set_genotype3; str_to_lower(out[ival].key); }
+ else if ( !strcasecmp(out[ival].key,"mis") ) { out[i].setter = filters_set_genotype4; str_to_lower(out[ival].key); }
+ else if ( !strcasecmp(out[ival].key,"ref") ) { out[i].setter = filters_set_genotype4; str_to_lower(out[ival].key); }
+ else if ( !strcasecmp(out[ival].key,"alt") ) { out[i].setter = filters_set_genotype4; str_to_lower(out[ival].key); }
else if ( !strcasecmp(out[ival].key,"rr") ) { out[i].setter = filters_set_genotype2; str_to_lower(out[ival].key); }
else if ( !strcasecmp(out[ival].key,"ra") || !strcasecmp(out[ival].key,"ar") ) { out[i].setter = filters_set_genotype2; out[ival].key[0]='r'; out[ival].key[1]='a'; } // ra
else if ( !strcmp(out[ival].key,"aA") || !strcmp(out[ival].key,"Aa") ) { out[i].setter = filters_set_genotype2; out[ival].key[0]='a'; out[ival].key[1]='A'; } // aA
filter->nsamples = filter->max_unpack&BCF_UN_FMT ? bcf_hdr_nsamples(filter->hdr) : 0;
for (i=0; i<nout; i++)
{
- if ( out[i].tok_type==TOK_MAX ) { out[i].setter = set_max; out[i].tok_type = TOK_FUNC; }
- else if ( out[i].tok_type==TOK_MIN ) { out[i].setter = set_min; out[i].tok_type = TOK_FUNC; }
- else if ( out[i].tok_type==TOK_AVG ) { out[i].setter = set_avg; out[i].tok_type = TOK_FUNC; }
- else if ( out[i].tok_type==TOK_SUM ) { out[i].setter = set_sum; out[i].tok_type = TOK_FUNC; }
- else if ( out[i].tok_type==TOK_ABS ) { out[i].setter = set_abs; out[i].tok_type = TOK_FUNC; }
- else if ( out[i].tok_type==TOK_LEN ) { out[i].setter = set_strlen; out[i].tok_type = TOK_FUNC; }
+ if ( out[i].tok_type==TOK_MAX ) { out[i].func = func_max; out[i].tok_type = TOK_FUNC; }
+ else if ( out[i].tok_type==TOK_MIN ) { out[i].func = func_min; out[i].tok_type = TOK_FUNC; }
+ else if ( out[i].tok_type==TOK_AVG ) { out[i].func = func_avg; out[i].tok_type = TOK_FUNC; }
+ else if ( out[i].tok_type==TOK_SUM ) { out[i].func = func_sum; out[i].tok_type = TOK_FUNC; }
+ else if ( out[i].tok_type==TOK_ABS ) { out[i].func = func_abs; out[i].tok_type = TOK_FUNC; }
+ else if ( out[i].tok_type==TOK_CNT ) { out[i].func = func_count; out[i].tok_type = TOK_FUNC; }
+ else if ( out[i].tok_type==TOK_LEN ) { out[i].func = func_strlen; out[i].tok_type = TOK_FUNC; }
hts_expand0(double,1,out[i].mvalues,out[i].values);
if ( filter->nsamples )
{
out[i].pass_samples = (uint8_t*)malloc(filter->nsamples);
int j;
- for (j=0; j<filter->nsamples; j++) out[i].pass_samples[j] = 1;
+ for (j=0; j<filter->nsamples; j++) out[i].pass_samples[j] = 0;
}
}
free(filter->filters[i].str_value.s);
free(filter->filters[i].tag);
free(filter->filters[i].idxs);
+ free(filter->filters[i].usmpl);
free(filter->filters[i].values);
free(filter->filters[i].pass_samples);
if (filter->filters[i].hash) khash_str2int_destroy_free(filter->filters[i].hash);
int i, nstack = 0;
for (i=0; i<filter->nfilters; i++)
{
- filter->filters[i].nsamples = 0;
- filter->filters[i].nvalues = 0;
- filter->filters[i].pass_site = -1;
+ filter->filters[i].pass_site = 0;
if ( filter->filters[i].tok_type == TOK_VAL )
{
filter->flt_stack[nstack++] = &filter->filters[i];
continue;
}
- else if ( filter->filters[i].tok_type == TOK_FUNC ) // all functions take only one argument
+ else if ( filter->filters[i].func )
{
- filter->filters[i].setter(filter, line, filter->flt_stack[nstack-1]);
+ int nargs = filter->filters[i].func(filter, line, &filter->filters[i], filter->flt_stack, nstack);
+ filter->flt_stack[nstack-nargs] = &filter->filters[i];
+ if ( --nargs > 0 ) nstack -= nargs;
continue;
}
if ( nstack<2 )
int is_str = filter->flt_stack[nstack-1]->is_str + filter->flt_stack[nstack-2]->is_str;
- if ( filter->filters[i].tok_type == TOK_OR || filter->filters[i].tok_type == TOK_OR_VEC )
- {
- if ( filter->flt_stack[nstack-1]->pass_site<0 || filter->flt_stack[nstack-2]->pass_site<0 )
- error("Error occurred while processing the filter \"%s\" (%d %d OR)\n", filter->str,filter->flt_stack[nstack-2]->pass_site,filter->flt_stack[nstack-1]->pass_site);
- filter->flt_stack[nstack-2]->pass_site = vector_logic_or(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1], filter->filters[i].tok_type);
- nstack--;
- continue;
- }
- if ( filter->filters[i].tok_type == TOK_AND || filter->filters[i].tok_type == TOK_AND_VEC )
- {
- if ( filter->flt_stack[nstack-1]->pass_site<0 || filter->flt_stack[nstack-2]->pass_site<0 )
- error("Error occurred while processing the filter \"%s\" (%d %d AND)\n", filter->str,filter->flt_stack[nstack-2]->pass_site,filter->flt_stack[nstack-1]->pass_site);
- filter->flt_stack[nstack-2]->pass_site = vector_logic_and(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1], filter->filters[i].tok_type);
- nstack--;
- continue;
- }
-
if ( filter->filters[i].tok_type == TOK_ADD )
{
- VECTOR_ARITHMETICS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],+);
+ VECTOR_ARITHMETICS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],+);
+ filter->flt_stack[nstack-2] = &filter->filters[i];
nstack--;
continue;
}
else if ( filter->filters[i].tok_type == TOK_SUB )
{
- VECTOR_ARITHMETICS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],-);
+ VECTOR_ARITHMETICS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],-);
+ filter->flt_stack[nstack-2] = &filter->filters[i];
nstack--;
continue;
}
else if ( filter->filters[i].tok_type == TOK_MULT )
{
- VECTOR_ARITHMETICS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],*);
+ VECTOR_ARITHMETICS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],*);
+ filter->flt_stack[nstack-2] = &filter->filters[i];
nstack--;
continue;
}
else if ( filter->filters[i].tok_type == TOK_DIV )
{
- VECTOR_ARITHMETICS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],/);
+ VECTOR_ARITHMETICS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],/);
+ filter->flt_stack[nstack-2] = &filter->filters[i];
nstack--;
continue;
}
- int is_true = 0;
+ // ideally, these comparators would become func, but this would require more work in init1()
if ( filter->filters[i].comparator )
- is_true = filter->filters[i].comparator(filter->flt_stack[nstack-1],filter->flt_stack[nstack-2],filter->filters[i].tok_type,line);
- else if ( !filter->flt_stack[nstack-1]->nvalues || !filter->flt_stack[nstack-2]->nvalues )
{
- int skip = 0;
- if ( !filter->flt_stack[nstack-2]->is_missing && !filter->flt_stack[nstack-1]->is_missing ) skip = 1;
- if ( filter->filters[i].tok_type != TOK_EQ && filter->filters[i].tok_type != TOK_NE ) skip = 1;
-
- if ( skip )
- filter->flt_stack[nstack-2]->nvalues = filter->flt_stack[nstack-2]->nsamples = 0;
- else if ( filter->filters[i].tok_type == TOK_EQ )
- CMP_MISSING(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],==,is_true)
- else if ( filter->filters[i].tok_type == TOK_NE )
- CMP_MISSING(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],!=,is_true)
- }
- else if ( filter->filters[i].tok_type == TOK_EQ )
- {
- if ( filter->flt_stack[nstack-1]->comparator )
- is_true = filter->flt_stack[nstack-1]->comparator(filter->flt_stack[nstack-1],filter->flt_stack[nstack-2],TOK_EQ,line);
- else if ( filter->flt_stack[nstack-2]->comparator )
- is_true = filter->flt_stack[nstack-2]->comparator(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],TOK_EQ,line);
- else if ( is_str==2 ) // both are strings
- is_true = cmp_vector_strings(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],TOK_EQ);
- else if ( is_str==1 )
- error("Comparing string to numeric value: %s\n", filter->str);
- else
- CMP_VECTORS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],==,is_true);
- }
- else if ( filter->filters[i].tok_type == TOK_NE )
- {
- if ( filter->flt_stack[nstack-1]->comparator )
- is_true = filter->flt_stack[nstack-1]->comparator(filter->flt_stack[nstack-1],filter->flt_stack[nstack-2],TOK_NE,line);
- else if ( filter->flt_stack[nstack-2]->comparator )
- is_true = filter->flt_stack[nstack-2]->comparator(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],TOK_NE,line);
- else if ( is_str==2 )
- is_true = cmp_vector_strings(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],TOK_NE);
- else if ( is_str==1 )
- error("Comparing string to numeric value: %s\n", filter->str);
- else
- CMP_VECTORS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],!=,is_true);
+ filter->filters[i].comparator(filter->flt_stack[nstack-1],filter->flt_stack[nstack-2],&filter->filters[i],line);
}
- else if ( filter->filters[i].tok_type == TOK_LIKE || filter->filters[i].tok_type == TOK_NLIKE )
+ else if ( filter->flt_stack[nstack-1]->comparator )
{
- if ( is_str==2 )
- is_true = regex_vector_strings(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1], filter->filters[i].tok_type == TOK_LIKE ? 0 : 1);
- else
- error("The regex operator can be used on strings only: %s\n", filter->str);
- }
- else if ( is_str>0 )
- error("Wrong operator in string comparison: %s [%s,%s]\n", filter->str, filter->flt_stack[nstack-1]->str_value, filter->flt_stack[nstack-2]->str_value);
- else if ( filter->filters[i].tok_type == TOK_LE )
- CMP_VECTORS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],<=,is_true)
- else if ( filter->filters[i].tok_type == TOK_LT )
- CMP_VECTORS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],<,is_true)
- else if ( filter->filters[i].tok_type == TOK_BT )
- CMP_VECTORS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],>,is_true)
- else if ( filter->filters[i].tok_type == TOK_BE )
- CMP_VECTORS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],>=,is_true)
+ filter->flt_stack[nstack-1]->comparator(filter->flt_stack[nstack-1],filter->flt_stack[nstack-2],&filter->filters[i],line);
+ }
+ else if ( filter->flt_stack[nstack-2]->comparator )
+ {
+ filter->flt_stack[nstack-2]->comparator(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],line);
+ }
+ else if ( is_str==2 )
+ {
+ cmp_vector_strings(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i]);
+ }
else
- error("FIXME: did not expect this .. tok_type %d = %d\n", i, filter->filters[i].tok_type);
+ {
+ // Determine what to do with one [1] or both [2] sides missing. The first field [0] gives [1]|[2]
+ int missing_logic[] = {0,0,0};
+ if ( filter->filters[i].tok_type == TOK_EQ ) { missing_logic[0] = missing_logic[2] = 1; }
+ if ( filter->filters[i].tok_type == TOK_NE ) { missing_logic[0] = missing_logic[1] = 1; }
- filter->flt_stack[nstack-2]->pass_site = is_true;
+ if ( filter->filters[i].tok_type == TOK_EQ )
+ CMP_VECTORS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],==,missing_logic)
+ else if ( filter->filters[i].tok_type == TOK_NE )
+ CMP_VECTORS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],!=,missing_logic)
+ else if ( filter->filters[i].tok_type == TOK_LE )
+ CMP_VECTORS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],<=,missing_logic)
+ else if ( filter->filters[i].tok_type == TOK_LT )
+ CMP_VECTORS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],<,missing_logic)
+ else if ( filter->filters[i].tok_type == TOK_BT )
+ CMP_VECTORS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],>,missing_logic)
+ else if ( filter->filters[i].tok_type == TOK_BE )
+ CMP_VECTORS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],>=,missing_logic)
+ else
+ error("todo: %s:%d .. type=%d\n", __FILE__,__LINE__,filter->filters[i].tok_type);
+
+ }
+ filter->flt_stack[nstack-2] = &filter->filters[i];
nstack--;
}
if ( nstack>1 ) error("Error occurred while processing the filter \"%s\" (2:%d)\n", filter->str,nstack); // too few values left on the stack
int main_vcfannotate(int argc, char *argv[]);
int main_vcfroh(int argc, char *argv[]);
int main_vcfconcat(int argc, char *argv[]);
-int main_bcftools_reheader(int argc, char *argv[]);
+int main_reheader(int argc, char *argv[]);
int main_vcfconvert(int argc, char *argv[]);
int main_vcfcnv(int argc, char *argv[]);
#if USE_GPL
.alias = "query",
.help = "transform VCF/BCF into user-defined formats"
},
- { .func = main_bcftools_reheader,
+ { .func = main_reheader,
.alias = "reheader",
.help = "modify VCF/BCF header, change sample names"
},
/*
- Copyright (C) 2014-2016 Genome Research Ltd.
+ Copyright (C) 2014-2017 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
return 0;
}
+regidx_t *regidx_init_string(const char *str, regidx_parse_f parser, regidx_free_f free_f, size_t payload_size, void *usr_dat)
+{
+ regidx_t *idx = (regidx_t*) calloc(1,sizeof(regidx_t));
+ if ( !idx ) return NULL;
+
+ idx->free = free_f;
+ idx->parse = parser ? parser : regidx_parse_tab;
+ idx->usr = usr_dat;
+ idx->seq2regs = khash_str2int_init();
+ idx->payload_size = payload_size;
+ if ( payload_size ) idx->payload = malloc(payload_size);
+
+ kstring_t tmp = {0,0,0};
+ const char *ss = str;
+ while ( *ss )
+ {
+ while ( *ss && isspace(*ss) ) ss++;
+ const char *se = ss;
+ while ( *se && *se!='\r' && *se!='\n' ) se++;
+ tmp.l = 0;
+ kputsn(ss, se-ss, &tmp);
+ regidx_insert(idx,tmp.s);
+ while ( *se && isspace(*se) ) se++;
+ ss = se;
+ }
+ free(tmp.s);
+ return idx;
+}
+
regidx_t *regidx_init(const char *fname, regidx_parse_f parser, regidx_free_f free_f, size_t payload_size, void *usr_dat)
{
if ( !parser )
}
+void regitr_copy(regitr_t *dst, regitr_t *src)
+{
+ _itr_t *dst_itr = (_itr_t*) dst->itr;
+ _itr_t *src_itr = (_itr_t*) src->itr;
+ *dst_itr = *src_itr;
+ *dst = *src;
+ dst->itr = dst_itr;
+}
+
#include "bcftools.pysam.h"
/*
- Copyright (C) 2014-2016 Genome Research Ltd.
+ Copyright (C) 2014-2017 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
return 0;
}
+regidx_t *regidx_init_string(const char *str, regidx_parse_f parser, regidx_free_f free_f, size_t payload_size, void *usr_dat)
+{
+ regidx_t *idx = (regidx_t*) calloc(1,sizeof(regidx_t));
+ if ( !idx ) return NULL;
+
+ idx->free = free_f;
+ idx->parse = parser ? parser : regidx_parse_tab;
+ idx->usr = usr_dat;
+ idx->seq2regs = khash_str2int_init();
+ idx->payload_size = payload_size;
+ if ( payload_size ) idx->payload = malloc(payload_size);
+
+ kstring_t tmp = {0,0,0};
+ const char *ss = str;
+ while ( *ss )
+ {
+ while ( *ss && isspace(*ss) ) ss++;
+ const char *se = ss;
+ while ( *se && *se!='\r' && *se!='\n' ) se++;
+ tmp.l = 0;
+ kputsn(ss, se-ss, &tmp);
+ regidx_insert(idx,tmp.s);
+ while ( *se && isspace(*se) ) se++;
+ ss = se;
+ }
+ free(tmp.s);
+ return idx;
+}
+
regidx_t *regidx_init(const char *fname, regidx_parse_f parser, regidx_free_f free_f, size_t payload_size, void *usr_dat)
{
if ( !parser )
}
+void regitr_copy(regitr_t *dst, regitr_t *src)
+{
+ _itr_t *dst_itr = (_itr_t*) dst->itr;
+ _itr_t *src_itr = (_itr_t*) src->itr;
+ *dst_itr = *src_itr;
+ *dst = *src;
+ dst->itr = dst_itr;
+}
+
/*
* regidx_init() - creates new index
+ * regidx_init_string() - creates new index, from a string rather than from a file
+ *
* @param fname: input file name or NULL if regions will be added one-by-one via regidx_insert()
* @param parsef: regidx_parse_bed, regidx_parse_tab or see description of regidx_parse_f. If NULL,
* the format will be autodected, currently either regidx_parse_tab (the default) or
* Returns index on success or NULL on error.
*/
regidx_t *regidx_init(const char *fname, regidx_parse_f parsef, regidx_free_f freef, size_t payload_size, void *usr);
+regidx_t *regidx_init_string(const char *string, regidx_parse_f parsef, regidx_free_f freef, size_t payload_size, void *usr);
/*
* regidx_destroy() - free memory allocated by regidx_init
*/
int regitr_loop(regitr_t *itr);
+/*
+ * regitr_copy() - create a copy of an iterator for a repeated iteration with regitr_loop
+ */
+void regitr_copy(regitr_t *dst, regitr_t *src);
+
#ifdef __cplusplus
}
#endif
/* reheader.c -- reheader subcommand.
- Copyright (C) 2014,2016 Genome Research Ltd.
+ Copyright (C) 2014-2017 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
exit(1);
}
-int main_bcftools_reheader(int argc, char *argv[])
+int main_reheader(int argc, char *argv[])
{
int c;
args_t *args = (args_t*) calloc(1,sizeof(args_t));
/* reheader.c -- reheader subcommand.
- Copyright (C) 2014,2016 Genome Research Ltd.
+ Copyright (C) 2014-2017 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
exit(1);
}
-int main_bcftools_reheader(int argc, char *argv[])
+int main_reheader(int argc, char *argv[])
{
int c;
args_t *args = (args_t*) calloc(1,sizeof(args_t));
/* vcfannotate.c -- Annotate and edit VCF/BCF files.
- Copyright (C) 2013-2016 Genome Research Ltd.
+ Copyright (C) 2013-2018 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
filter_t *filter;
char *filter_str;
int filter_logic; // include or exclude sites which match the filters? One of FLT_INCLUDE/FLT_EXCLUDE
+ int keep_sites;
rm_tag_t *rm; // tags scheduled for removal
int nrm;
tag->key = strdup(str.s);
tag->hdr_id = bcf_hdr_id2int(args->hdr, BCF_DT_ID, tag->key);
if ( !bcf_hdr_idinfo_exists(args->hdr,BCF_HL_FLT,tag->hdr_id) ) error("Cannot remove %s, not defined in the header.\n", str.s);
- bcf_hdr_remove(args->hdr_out,BCF_HL_FLT,tag->key);
+ if ( !args->keep_sites ) bcf_hdr_remove(args->hdr_out,BCF_HL_FLT,tag->key);
}
else
{
tag->key = strdup(str.s);
if ( type==BCF_HL_INFO ) tag->handler = remove_info_tag;
else if ( type==BCF_HL_FMT ) tag->handler = remove_format_tag;
- bcf_hdr_remove(args->hdr_out,type,tag->key);
+ if ( !args->keep_sites ) bcf_hdr_remove(args->hdr_out,type,tag->key);
}
}
else if ( !strcasecmp("ID",str.s) ) tag->handler = remove_id;
else if ( !strcasecmp("FILTER",str.s) )
{
tag->handler = remove_filter;
- remove_hdr_lines(args->hdr_out,BCF_HL_FLT);
+ if ( !args->keep_sites ) remove_hdr_lines(args->hdr_out,BCF_HL_FLT);
}
else if ( !strcasecmp("QUAL",str.s) ) tag->handler = remove_qual;
else if ( !strcasecmp("INFO",str.s) )
{
tag->handler = remove_info;
- remove_hdr_lines(args->hdr_out,BCF_HL_INFO);
+ if ( !args->keep_sites ) remove_hdr_lines(args->hdr_out,BCF_HL_INFO);
}
else if ( !strcasecmp("FMT",str.s) || !strcasecmp("FORMAT",str.s) )
{
tag->handler = remove_format;
- remove_hdr_lines(args->hdr_out,BCF_HL_FMT);
+ if ( !args->keep_sites ) remove_hdr_lines(args->hdr_out,BCF_HL_FMT);
}
else if ( str.l )
{
- if ( str.s[0]=='#' && str.s[1]=='#' )
- bcf_hdr_remove(args->hdr_out,BCF_HL_GEN,str.s+2);
- else
- bcf_hdr_remove(args->hdr_out,BCF_HL_STR,str.s);
+ if ( !args->keep_sites )
+ {
+ if ( str.s[0]=='#' && str.s[1]=='#' )
+ bcf_hdr_remove(args->hdr_out,BCF_HL_GEN,str.s+2);
+ else
+ bcf_hdr_remove(args->hdr_out,BCF_HL_STR,str.s);
+ }
args->nrm--;
}
tag->hdr_id = bcf_hdr_id2int(args->hdr, BCF_DT_ID, hrec->vals[k]);
}
tag->key = strdup(hrec->vals[k]);
- bcf_hdr_remove(args->hdr_out,hrec->type,tag->key);
+ if ( !args->keep_sites ) bcf_hdr_remove(args->hdr_out,hrec->type,tag->key);
}
}
khash_str2int_destroy_free(keep);
fprintf(stderr, " -h, --header-lines <file> lines which should be appended to the VCF header\n");
fprintf(stderr, " -I, --set-id [+]<format> set ID column, see man page for details\n");
fprintf(stderr, " -i, --include <expr> select sites for which the expression is true (see man page for details)\n");
+ fprintf(stderr, " -k, --keep-sites leave -i/-e sites unchanged instead of discarding them\n");
fprintf(stderr, " -m, --mark-sites [+-]<tag> add INFO/tag flag to sites which are (\"+\") or are not (\"-\") listed in the -a file\n");
fprintf(stderr, " --no-version do not append version and command line to the header\n");
fprintf(stderr, " -o, --output <file> write output to a file [standard output]\n");
fprintf(stderr, " --rename-chrs <file> rename sequences according to map file: from\\tto\n");
fprintf(stderr, " -s, --samples [^]<list> comma separated list of samples to annotate (or exclude with \"^\" prefix)\n");
fprintf(stderr, " -S, --samples-file [^]<file> file of samples to annotate (or exclude with \"^\" prefix)\n");
- fprintf(stderr, " -x, --remove <list> list of annotations to remove (e.g. ID,INFO/DP,FORMAT/DP,FILTER). See man page for details\n");
+ fprintf(stderr, " -x, --remove <list> list of annotations (e.g. ID,INFO/DP,FORMAT/DP,FILTER) to remove (or keep with \"^\" prefix). See man page for details\n");
fprintf(stderr, " --threads <int> number of extra output compression threads [0]\n");
fprintf(stderr, "\n");
exit(1);
static struct option loptions[] =
{
+ {"keep-sites",required_argument,NULL,'k'},
{"mark-sites",required_argument,NULL,'m'},
{"set-id",required_argument,NULL,'I'},
{"output",required_argument,NULL,'o'},
{"no-version",no_argument,NULL,8},
{NULL,0,NULL,0}
};
- while ((c = getopt_long(argc, argv, "h:?o:O:r:R:a:x:c:i:e:S:s:I:m:",loptions,NULL)) >= 0)
+ while ((c = getopt_long(argc, argv, "h:?o:O:r:R:a:x:c:i:e:S:s:I:m:k",loptions,NULL)) >= 0)
{
switch (c) {
+ case 'k': args->keep_sites = 1; break;
case 'm':
args->mark_sites_logic = MARK_LISTED;
if ( optarg[0]=='+' ) args->mark_sites = optarg+1;
{
int pass = filter_test(args->filter, line, NULL);
if ( args->filter_logic & FLT_EXCLUDE ) pass = pass ? 0 : 1;
- if ( !pass ) continue;
+ if ( !pass )
+ {
+ if ( args->keep_sites ) bcf_write1(args->out_fh, args->hdr_out, line);
+ continue;
+ }
}
annotate(args, line);
bcf_write1(args->out_fh, args->hdr_out, line);
/* vcfannotate.c -- Annotate and edit VCF/BCF files.
- Copyright (C) 2013-2016 Genome Research Ltd.
+ Copyright (C) 2013-2018 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
filter_t *filter;
char *filter_str;
int filter_logic; // include or exclude sites which match the filters? One of FLT_INCLUDE/FLT_EXCLUDE
+ int keep_sites;
rm_tag_t *rm; // tags scheduled for removal
int nrm;
tag->key = strdup(str.s);
tag->hdr_id = bcf_hdr_id2int(args->hdr, BCF_DT_ID, tag->key);
if ( !bcf_hdr_idinfo_exists(args->hdr,BCF_HL_FLT,tag->hdr_id) ) error("Cannot remove %s, not defined in the header.\n", str.s);
- bcf_hdr_remove(args->hdr_out,BCF_HL_FLT,tag->key);
+ if ( !args->keep_sites ) bcf_hdr_remove(args->hdr_out,BCF_HL_FLT,tag->key);
}
else
{
tag->key = strdup(str.s);
if ( type==BCF_HL_INFO ) tag->handler = remove_info_tag;
else if ( type==BCF_HL_FMT ) tag->handler = remove_format_tag;
- bcf_hdr_remove(args->hdr_out,type,tag->key);
+ if ( !args->keep_sites ) bcf_hdr_remove(args->hdr_out,type,tag->key);
}
}
else if ( !strcasecmp("ID",str.s) ) tag->handler = remove_id;
else if ( !strcasecmp("FILTER",str.s) )
{
tag->handler = remove_filter;
- remove_hdr_lines(args->hdr_out,BCF_HL_FLT);
+ if ( !args->keep_sites ) remove_hdr_lines(args->hdr_out,BCF_HL_FLT);
}
else if ( !strcasecmp("QUAL",str.s) ) tag->handler = remove_qual;
else if ( !strcasecmp("INFO",str.s) )
{
tag->handler = remove_info;
- remove_hdr_lines(args->hdr_out,BCF_HL_INFO);
+ if ( !args->keep_sites ) remove_hdr_lines(args->hdr_out,BCF_HL_INFO);
}
else if ( !strcasecmp("FMT",str.s) || !strcasecmp("FORMAT",str.s) )
{
tag->handler = remove_format;
- remove_hdr_lines(args->hdr_out,BCF_HL_FMT);
+ if ( !args->keep_sites ) remove_hdr_lines(args->hdr_out,BCF_HL_FMT);
}
else if ( str.l )
{
- if ( str.s[0]=='#' && str.s[1]=='#' )
- bcf_hdr_remove(args->hdr_out,BCF_HL_GEN,str.s+2);
- else
- bcf_hdr_remove(args->hdr_out,BCF_HL_STR,str.s);
+ if ( !args->keep_sites )
+ {
+ if ( str.s[0]=='#' && str.s[1]=='#' )
+ bcf_hdr_remove(args->hdr_out,BCF_HL_GEN,str.s+2);
+ else
+ bcf_hdr_remove(args->hdr_out,BCF_HL_STR,str.s);
+ }
args->nrm--;
}
tag->hdr_id = bcf_hdr_id2int(args->hdr, BCF_DT_ID, hrec->vals[k]);
}
tag->key = strdup(hrec->vals[k]);
- bcf_hdr_remove(args->hdr_out,hrec->type,tag->key);
+ if ( !args->keep_sites ) bcf_hdr_remove(args->hdr_out,hrec->type,tag->key);
}
}
khash_str2int_destroy_free(keep);
fprintf(bcftools_stderr, " -h, --header-lines <file> lines which should be appended to the VCF header\n");
fprintf(bcftools_stderr, " -I, --set-id [+]<format> set ID column, see man page for details\n");
fprintf(bcftools_stderr, " -i, --include <expr> select sites for which the expression is true (see man page for details)\n");
+ fprintf(bcftools_stderr, " -k, --keep-sites leave -i/-e sites unchanged instead of discarding them\n");
fprintf(bcftools_stderr, " -m, --mark-sites [+-]<tag> add INFO/tag flag to sites which are (\"+\") or are not (\"-\") listed in the -a file\n");
fprintf(bcftools_stderr, " --no-version do not append version and command line to the header\n");
fprintf(bcftools_stderr, " -o, --output <file> write output to a file [standard output]\n");
fprintf(bcftools_stderr, " --rename-chrs <file> rename sequences according to map file: from\\tto\n");
fprintf(bcftools_stderr, " -s, --samples [^]<list> comma separated list of samples to annotate (or exclude with \"^\" prefix)\n");
fprintf(bcftools_stderr, " -S, --samples-file [^]<file> file of samples to annotate (or exclude with \"^\" prefix)\n");
- fprintf(bcftools_stderr, " -x, --remove <list> list of annotations to remove (e.g. ID,INFO/DP,FORMAT/DP,FILTER). See man page for details\n");
+ fprintf(bcftools_stderr, " -x, --remove <list> list of annotations (e.g. ID,INFO/DP,FORMAT/DP,FILTER) to remove (or keep with \"^\" prefix). See man page for details\n");
fprintf(bcftools_stderr, " --threads <int> number of extra output compression threads [0]\n");
fprintf(bcftools_stderr, "\n");
exit(1);
static struct option loptions[] =
{
+ {"keep-sites",required_argument,NULL,'k'},
{"mark-sites",required_argument,NULL,'m'},
{"set-id",required_argument,NULL,'I'},
{"output",required_argument,NULL,'o'},
{"no-version",no_argument,NULL,8},
{NULL,0,NULL,0}
};
- while ((c = getopt_long(argc, argv, "h:?o:O:r:R:a:x:c:i:e:S:s:I:m:",loptions,NULL)) >= 0)
+ while ((c = getopt_long(argc, argv, "h:?o:O:r:R:a:x:c:i:e:S:s:I:m:k",loptions,NULL)) >= 0)
{
switch (c) {
+ case 'k': args->keep_sites = 1; break;
case 'm':
args->mark_sites_logic = MARK_LISTED;
if ( optarg[0]=='+' ) args->mark_sites = optarg+1;
{
int pass = filter_test(args->filter, line, NULL);
if ( args->filter_logic & FLT_EXCLUDE ) pass = pass ? 0 : 1;
- if ( !pass ) continue;
+ if ( !pass )
+ {
+ if ( args->keep_sites ) bcf_write1(args->out_fh, args->hdr_out, line);
+ continue;
+ }
}
annotate(args, line);
bcf_write1(args->out_fh, args->hdr_out, line);
/* The MIT License
- Copyright (c) 2014-2015 Genome Research Ltd.
+ Copyright (c) 2014-2018 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
"csv.register_dialect('tab', delimiter='\\t', quoting=csv.QUOTE_NONE)\n"
"\n"
"dat = {}\n"
- "with open('%s', 'rb') as f:\n"
+ "with open('%s', 'r') as f:\n"
" reader = csv.reader(f, 'tab')\n"
" for row in reader:\n"
" chr = row[0]\n"
" dat[chr].append([row[1], float(row[2]), float(row[3])])\n"
"\n"
"cnv = {}\n"
- "with open('%s', 'rb') as f:\n"
+ "with open('%s', 'r') as f:\n"
" reader = csv.reader(f, 'tab')\n"
" for row in reader:\n"
" chr = row[0]\n"
"\n"
"def chroms_to_plot(th):\n"
" dat = {}\n"
- " with open('%s/summary.tab', 'rb') as f:\n"
+ " with open('%s/summary.tab', 'r') as f:\n"
" reader = csv.reader(f, 'tab')\n"
" for row in reader:\n"
" if row[0]!='RG': continue\n"
" plot_chroms = chroms_to_plot(args.plot_threshold)\n"
"\n"
"def read_dat(file,dat,plot_chr):\n"
- " with open(file, 'rb') as f:\n"
+ " with open(file, 'r') as f:\n"
" reader = csv.reader(f, 'tab')\n"
" for row in reader:\n"
" chr = row[0]\n"
" if chr != plot_chr: continue\n"
" dat.append([row[1], float(row[2]), float(row[3])])\n"
"def read_cnv(file,cnv,plot_chr):\n"
- " with open(file, 'rb') as f:\n"
+ " with open(file, 'r') as f:\n"
" reader = csv.reader(f, 'tab')\n"
" for row in reader:\n"
" chr = row[0]\n"
/* The MIT License
- Copyright (c) 2014-2015 Genome Research Ltd.
+ Copyright (c) 2014-2018 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
"csv.register_dialect('tab', delimiter='\\t', quoting=csv.QUOTE_NONE)\n"
"\n"
"dat = {}\n"
- "with open('%s', 'rb') as f:\n"
+ "with open('%s', 'r') as f:\n"
" reader = csv.reader(f, 'tab')\n"
" for row in reader:\n"
" chr = row[0]\n"
" dat[chr].append([row[1], float(row[2]), float(row[3])])\n"
"\n"
"cnv = {}\n"
- "with open('%s', 'rb') as f:\n"
+ "with open('%s', 'r') as f:\n"
" reader = csv.reader(f, 'tab')\n"
" for row in reader:\n"
" chr = row[0]\n"
"\n"
"def chroms_to_plot(th):\n"
" dat = {}\n"
- " with open('%s/summary.tab', 'rb') as f:\n"
+ " with open('%s/summary.tab', 'r') as f:\n"
" reader = csv.reader(f, 'tab')\n"
" for row in reader:\n"
" if row[0]!='RG': continue\n"
" plot_chroms = chroms_to_plot(args.plot_threshold)\n"
"\n"
"def read_dat(file,dat,plot_chr):\n"
- " with open(file, 'rb') as f:\n"
+ " with open(file, 'r') as f:\n"
" reader = csv.reader(f, 'tab')\n"
" for row in reader:\n"
" chr = row[0]\n"
" if chr != plot_chr: continue\n"
" dat.append([row[1], float(row[2]), float(row[3])])\n"
"def read_cnv(file,cnv,plot_chr):\n"
- " with open(file, 'rb') as f:\n"
+ " with open(file, 'r') as f:\n"
" reader = csv.reader(f, 'tab')\n"
" for row in reader:\n"
" chr = row[0]\n"
/* vcfconvert.c -- convert between VCF/BCF and related formats.
- Copyright (C) 2013-2014 Genome Research Ltd.
+ Copyright (C) 2013-2017 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
/* vcfconvert.c -- convert between VCF/BCF and related formats.
- Copyright (C) 2013-2014 Genome Research Ltd.
+ Copyright (C) 2013-2017 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
/* vcfgtcheck.c -- Check sample identity.
- Copyright (C) 2013-2014 Genome Research Ltd.
+ Copyright (C) 2013-2018 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
"sample_ids = False\n"
"\n"
"dat = []\n"
- "with open('%s.tab', 'rb') as f:\n"
+ "with open('%s.tab', 'r') as f:\n"
" reader = csv.reader(f, 'tab')\n"
" for row in reader:\n"
" if row[0][0]=='#': continue\n"
"dat = None\n"
"min = None\n"
"max = None\n"
- "with open('%s.tab', 'rb') as f:\n"
+ "with open('%s.tab', 'r') as f:\n"
" reader = csv.reader(f, 'tab')\n"
" i = 0\n"
" for row in reader:\n"
/* vcfgtcheck.c -- Check sample identity.
- Copyright (C) 2013-2014 Genome Research Ltd.
+ Copyright (C) 2013-2018 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
"sample_ids = False\n"
"\n"
"dat = []\n"
- "with open('%s.tab', 'rb') as f:\n"
+ "with open('%s.tab', 'r') as f:\n"
" reader = csv.reader(f, 'tab')\n"
" for row in reader:\n"
" if row[0][0]=='#': continue\n"
"dat = None\n"
"min = None\n"
"max = None\n"
- "with open('%s.tab', 'rb') as f:\n"
+ "with open('%s.tab', 'r') as f:\n"
" reader = csv.reader(f, 'tab')\n"
" i = 0\n"
" for row in reader:\n"
/* vcfnorm.c -- Left-align and normalize indels.
- Copyright (C) 2013-2016 Genome Research Ltd.
+ Copyright (C) 2013-2017 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
#include <htslib/vcf.h>
#include <htslib/synced_bcf_reader.h>
#include <htslib/faidx.h>
+#include <htslib/khash_str2int.h>
#include "bcftools.h"
#include "rbuf.h"
}
map_t;
+// primitive comparison of two records' alleles via hashes; normalized alleles assumed
+typedef struct
+{
+ int n; // number of alleles
+ char *ref, *alt;
+ void *hash;
+}
+cmpals_t;
+
typedef struct
{
char *tseq, *seq;
int aln_win; // the realignment window size (maximum repeat size)
bcf_srs_t *files; // using the synced reader only for -r option
bcf_hdr_t *hdr;
+ cmpals_t *cmpals;
+ int ncmpals, mcmpals;
faidx_t *fai;
struct { int tot, set, swap; } nref;
char **argv, *output_fname, *ref_fname, *vcf_fname, *region, *targets;
}
static inline int has_non_acgtn(char *seq, int nseq)
{
- char *end = nseq ? seq + nseq : seq + UINT32_MAX; // arbitrary large number
+ char *end = seq + nseq;
while ( *seq && seq<end )
{
char c = toupper(*seq);
{
if ( line->d.allele[i][0]=='<' ) return ERR_SYMBOLIC; // symbolic allele
if ( line->d.allele[i][0]=='*' ) return ERR_SPANNING_DELETION; // spanning deletion
- if ( has_non_acgtn(line->d.allele[i],0) )
+ if ( has_non_acgtn(line->d.allele[i],line->shared.l) )
{
if ( args->check_ref==CHECK_REF_EXIT )
error("Non-ACGTN alternate allele at %s:%d .. REF_SEQ:'%s' vs VCF:'%s'\n", bcf_seqname(args->hdr,line),line->pos+1,ref,line->d.allele[i]);
}
return NULL;
}
+static void cmpals_add(args_t *args, bcf1_t *rec)
+{
+ args->ncmpals++;
+ hts_expand0(cmpals_t, args->ncmpals, args->mcmpals, args->cmpals);
+ cmpals_t *cmpals = args->cmpals + args->ncmpals - 1;
+ free(cmpals->ref);
+ cmpals->ref = strdup(rec->d.allele[0]);
+ cmpals->n = rec->n_allele;
+ if ( rec->n_allele==2 )
+ {
+ free(cmpals->alt);
+ cmpals->alt = strdup(rec->d.allele[1]);
+ }
+ else
+ {
+ if ( cmpals->hash ) khash_str2int_destroy_free(cmpals->hash);
+ cmpals->hash = khash_str2int_init();
+ int i;
+ for (i=1; i<rec->n_allele; i++)
+ khash_str2int_inc(cmpals->hash, strdup(rec->d.allele[i]));
+ }
+}
+static int cmpals_match(args_t *args, bcf1_t *rec)
+{
+ int i, j;
+ for (i=0; i<args->ncmpals; i++)
+ {
+ cmpals_t *cmpals = args->cmpals + i;
+ if ( rec->n_allele != cmpals->n ) continue;
+
+ // NB. assuming both are normalized
+ if ( strcmp(rec->d.allele[0], cmpals->ref) ) continue;
+
+ // the most frequent case
+ if ( rec->n_allele==2 )
+ {
+ if ( strcmp(rec->d.allele[1], cmpals->alt) ) continue;
+ return 1;
+ }
+
+ khash_t(str2int) *hash = (khash_t(str2int)*) cmpals->hash;
+ for (j=1; j<rec->n_allele; j++)
+ if ( !khash_str2int_has_key(hash, rec->d.allele[j]) ) break;
+ if ( j<rec->n_allele ) continue;
+ return 1;
+ }
+ cmpals_add(args, rec);
+ return 0;
+}
+static void cmpals_reset(args_t *args) { args->ncmpals = 0; }
+static void cmpals_destroy(args_t *args)
+{
+ int i;
+ for (i=0; i<args->mcmpals; i++)
+ {
+ cmpals_t *cmpals = args->cmpals + i;
+ free(cmpals->ref);
+ free(cmpals->alt);
+ if ( cmpals->hash ) khash_str2int_destroy_free(cmpals->hash);
+ }
+ free(args->cmpals);
+}
+
static void flush_buffer(args_t *args, htsFile *file, int n)
{
bcf1_t *line;
int line_type = bcf_get_variant_types(args->lines[k]);
if ( prev_rid>=0 && prev_rid==args->lines[k]->rid && prev_pos==args->lines[k]->pos )
{
- if ( (args->rmdup>>1)&COLLAPSE_ANY ) continue;
- if ( (args->rmdup>>1)&COLLAPSE_SNPS && line_type&(VCF_SNP|VCF_MNP) && prev_type&(VCF_SNP|VCF_MNP) ) continue;
- if ( (args->rmdup>>1)&COLLAPSE_INDELS && line_type&(VCF_INDEL) && prev_type&(VCF_INDEL) ) continue;
+ if ( args->rmdup & BCF_SR_PAIR_ANY ) continue; // rmdup by position only
+ if ( args->rmdup & BCF_SR_PAIR_SNPS && line_type&(VCF_SNP|VCF_MNP) && prev_type&(VCF_SNP|VCF_MNP) ) continue;
+ if ( args->rmdup & BCF_SR_PAIR_INDELS && line_type&(VCF_INDEL) && prev_type&(VCF_INDEL) ) continue;
+ if ( args->rmdup & BCF_SR_PAIR_EXACT && cmpals_match(args, args->lines[k]) ) continue;
}
else
{
prev_rid = args->lines[k]->rid;
prev_pos = args->lines[k]->pos;
prev_type = 0;
+ if ( args->rmdup & BCF_SR_PAIR_EXACT ) cmpals_reset(args);
}
prev_type |= line_type;
+ if ( args->rmdup & BCF_SR_PAIR_EXACT ) cmpals_add(args, args->lines[k]);
}
bcf_write1(file, args->hdr, args->lines[k]);
}
static void destroy_data(args_t *args)
{
+ cmpals_destroy(args);
int i;
for (i=0; i<args->rbuf.m; i++)
if ( args->lines[i] ) bcf_destroy1(args->lines[i]);
int line_type = bcf_get_variant_types(line);
if ( prev_rid>=0 && prev_rid==line->rid && prev_pos==line->pos )
{
- if ( (args->rmdup>>1)&COLLAPSE_ANY ) continue;
- if ( (args->rmdup>>1)&COLLAPSE_SNPS && line_type&(VCF_SNP|VCF_MNP) && prev_type&(VCF_SNP|VCF_MNP) ) continue;
- if ( (args->rmdup>>1)&COLLAPSE_INDELS && line_type&(VCF_INDEL) && prev_type&(VCF_INDEL) ) continue;
+ if ( args->rmdup & BCF_SR_PAIR_ANY ) continue; // rmdup by position only
+ if ( args->rmdup & BCF_SR_PAIR_SNPS && line_type&(VCF_SNP|VCF_MNP) && prev_type&(VCF_SNP|VCF_MNP) ) continue;
+ if ( args->rmdup & BCF_SR_PAIR_INDELS && line_type&(VCF_INDEL) && prev_type&(VCF_INDEL) ) continue;
+ if ( args->rmdup & BCF_SR_PAIR_EXACT && cmpals_match(args, line) ) continue;
}
else
{
prev_rid = line->rid;
prev_pos = line->pos;
prev_type = 0;
+ if ( args->rmdup & BCF_SR_PAIR_EXACT ) cmpals_reset(args);
}
prev_type |= line_type;
+ if ( args->rmdup & BCF_SR_PAIR_EXACT ) cmpals_add(args, line);
}
// still on the same chromosome?
fprintf(stderr, "Options:\n");
fprintf(stderr, " -c, --check-ref <e|w|x|s> check REF alleles and exit (e), warn (w), exclude (x), or set (s) bad sites [e]\n");
fprintf(stderr, " -D, --remove-duplicates remove duplicate lines of the same type.\n");
- fprintf(stderr, " -d, --rm-dup <type> remove duplicate snps|indels|both|any\n");
+ fprintf(stderr, " -d, --rm-dup <type> remove duplicate snps|indels|both|all|none\n");
fprintf(stderr, " -f, --fasta-ref <file> reference sequence (MANDATORY)\n");
fprintf(stderr, " -m, --multiallelics <-|+>[type] split multiallelics (-) or join biallelics (+), type: snps|indels|both|any [both]\n");
fprintf(stderr, " --no-version do not append version and command line to the header\n");
switch (c) {
case 'N': args->do_indels = 0; break;
case 'd':
- if ( !strcmp("snps",optarg) ) args->rmdup = COLLAPSE_SNPS<<1;
- else if ( !strcmp("indels",optarg) ) args->rmdup = COLLAPSE_INDELS<<1;
- else if ( !strcmp("both",optarg) ) args->rmdup = COLLAPSE_BOTH<<1;
- else if ( !strcmp("any",optarg) ) args->rmdup = COLLAPSE_ANY<<1;
+ if ( !strcmp("snps",optarg) ) args->rmdup = BCF_SR_PAIR_SNPS;
+ else if ( !strcmp("indels",optarg) ) args->rmdup = BCF_SR_PAIR_INDELS;
+ else if ( !strcmp("both",optarg) ) args->rmdup = BCF_SR_PAIR_BOTH;
+ else if ( !strcmp("all",optarg) ) args->rmdup = BCF_SR_PAIR_ANY;
+ else if ( !strcmp("any",optarg) ) args->rmdup = BCF_SR_PAIR_ANY;
+ else if ( !strcmp("none",optarg) ) args->rmdup = BCF_SR_PAIR_EXACT;
else error("The argument to -d not recognised: %s\n", optarg);
break;
case 'm':
case 'o': args->output_fname = optarg; break;
case 'D':
fprintf(stderr,"Warning: `-D` is functional but deprecated, replaced by `-d both`.\n");
- args->rmdup = COLLAPSE_NONE<<1;
+ args->rmdup = BCF_SR_PAIR_EXACT;
break;
case 's': args->strict_filter = 1; break;
case 'f': args->ref_fname = optarg; break;
/* vcfnorm.c -- Left-align and normalize indels.
- Copyright (C) 2013-2016 Genome Research Ltd.
+ Copyright (C) 2013-2017 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
#include <htslib/vcf.h>
#include <htslib/synced_bcf_reader.h>
#include <htslib/faidx.h>
+#include <htslib/khash_str2int.h>
#include "bcftools.h"
#include "rbuf.h"
}
map_t;
+// primitive comparison of two records' alleles via hashes; normalized alleles assumed
+typedef struct
+{
+ int n; // number of alleles
+ char *ref, *alt;
+ void *hash;
+}
+cmpals_t;
+
typedef struct
{
char *tseq, *seq;
int aln_win; // the realignment window size (maximum repeat size)
bcf_srs_t *files; // using the synced reader only for -r option
bcf_hdr_t *hdr;
+ cmpals_t *cmpals;
+ int ncmpals, mcmpals;
faidx_t *fai;
struct { int tot, set, swap; } nref;
char **argv, *output_fname, *ref_fname, *vcf_fname, *region, *targets;
}
static inline int has_non_acgtn(char *seq, int nseq)
{
- char *end = nseq ? seq + nseq : seq + UINT32_MAX; // arbitrary large number
+ char *end = seq + nseq;
while ( *seq && seq<end )
{
char c = toupper(*seq);
{
if ( line->d.allele[i][0]=='<' ) return ERR_SYMBOLIC; // symbolic allele
if ( line->d.allele[i][0]=='*' ) return ERR_SPANNING_DELETION; // spanning deletion
- if ( has_non_acgtn(line->d.allele[i],0) )
+ if ( has_non_acgtn(line->d.allele[i],line->shared.l) )
{
if ( args->check_ref==CHECK_REF_EXIT )
error("Non-ACGTN alternate allele at %s:%d .. REF_SEQ:'%s' vs VCF:'%s'\n", bcf_seqname(args->hdr,line),line->pos+1,ref,line->d.allele[i]);
}
return NULL;
}
+static void cmpals_add(args_t *args, bcf1_t *rec)
+{
+ args->ncmpals++;
+ hts_expand0(cmpals_t, args->ncmpals, args->mcmpals, args->cmpals);
+ cmpals_t *cmpals = args->cmpals + args->ncmpals - 1;
+ free(cmpals->ref);
+ cmpals->ref = strdup(rec->d.allele[0]);
+ cmpals->n = rec->n_allele;
+ if ( rec->n_allele==2 )
+ {
+ free(cmpals->alt);
+ cmpals->alt = strdup(rec->d.allele[1]);
+ }
+ else
+ {
+ if ( cmpals->hash ) khash_str2int_destroy_free(cmpals->hash);
+ cmpals->hash = khash_str2int_init();
+ int i;
+ for (i=1; i<rec->n_allele; i++)
+ khash_str2int_inc(cmpals->hash, strdup(rec->d.allele[i]));
+ }
+}
+static int cmpals_match(args_t *args, bcf1_t *rec)
+{
+ int i, j;
+ for (i=0; i<args->ncmpals; i++)
+ {
+ cmpals_t *cmpals = args->cmpals + i;
+ if ( rec->n_allele != cmpals->n ) continue;
+
+ // NB. assuming both are normalized
+ if ( strcmp(rec->d.allele[0], cmpals->ref) ) continue;
+
+ // the most frequent case
+ if ( rec->n_allele==2 )
+ {
+ if ( strcmp(rec->d.allele[1], cmpals->alt) ) continue;
+ return 1;
+ }
+
+ khash_t(str2int) *hash = (khash_t(str2int)*) cmpals->hash;
+ for (j=1; j<rec->n_allele; j++)
+ if ( !khash_str2int_has_key(hash, rec->d.allele[j]) ) break;
+ if ( j<rec->n_allele ) continue;
+ return 1;
+ }
+ cmpals_add(args, rec);
+ return 0;
+}
+static void cmpals_reset(args_t *args) { args->ncmpals = 0; }
+static void cmpals_destroy(args_t *args)
+{
+ int i;
+ for (i=0; i<args->mcmpals; i++)
+ {
+ cmpals_t *cmpals = args->cmpals + i;
+ free(cmpals->ref);
+ free(cmpals->alt);
+ if ( cmpals->hash ) khash_str2int_destroy_free(cmpals->hash);
+ }
+ free(args->cmpals);
+}
+
static void flush_buffer(args_t *args, htsFile *file, int n)
{
bcf1_t *line;
int line_type = bcf_get_variant_types(args->lines[k]);
if ( prev_rid>=0 && prev_rid==args->lines[k]->rid && prev_pos==args->lines[k]->pos )
{
- if ( (args->rmdup>>1)&COLLAPSE_ANY ) continue;
- if ( (args->rmdup>>1)&COLLAPSE_SNPS && line_type&(VCF_SNP|VCF_MNP) && prev_type&(VCF_SNP|VCF_MNP) ) continue;
- if ( (args->rmdup>>1)&COLLAPSE_INDELS && line_type&(VCF_INDEL) && prev_type&(VCF_INDEL) ) continue;
+ if ( args->rmdup & BCF_SR_PAIR_ANY ) continue; // rmdup by position only
+ if ( args->rmdup & BCF_SR_PAIR_SNPS && line_type&(VCF_SNP|VCF_MNP) && prev_type&(VCF_SNP|VCF_MNP) ) continue;
+ if ( args->rmdup & BCF_SR_PAIR_INDELS && line_type&(VCF_INDEL) && prev_type&(VCF_INDEL) ) continue;
+ if ( args->rmdup & BCF_SR_PAIR_EXACT && cmpals_match(args, args->lines[k]) ) continue;
}
else
{
prev_rid = args->lines[k]->rid;
prev_pos = args->lines[k]->pos;
prev_type = 0;
+ if ( args->rmdup & BCF_SR_PAIR_EXACT ) cmpals_reset(args);
}
prev_type |= line_type;
+ if ( args->rmdup & BCF_SR_PAIR_EXACT ) cmpals_add(args, args->lines[k]);
}
bcf_write1(file, args->hdr, args->lines[k]);
}
static void destroy_data(args_t *args)
{
+ cmpals_destroy(args);
int i;
for (i=0; i<args->rbuf.m; i++)
if ( args->lines[i] ) bcf_destroy1(args->lines[i]);
int line_type = bcf_get_variant_types(line);
if ( prev_rid>=0 && prev_rid==line->rid && prev_pos==line->pos )
{
- if ( (args->rmdup>>1)&COLLAPSE_ANY ) continue;
- if ( (args->rmdup>>1)&COLLAPSE_SNPS && line_type&(VCF_SNP|VCF_MNP) && prev_type&(VCF_SNP|VCF_MNP) ) continue;
- if ( (args->rmdup>>1)&COLLAPSE_INDELS && line_type&(VCF_INDEL) && prev_type&(VCF_INDEL) ) continue;
+ if ( args->rmdup & BCF_SR_PAIR_ANY ) continue; // rmdup by position only
+ if ( args->rmdup & BCF_SR_PAIR_SNPS && line_type&(VCF_SNP|VCF_MNP) && prev_type&(VCF_SNP|VCF_MNP) ) continue;
+ if ( args->rmdup & BCF_SR_PAIR_INDELS && line_type&(VCF_INDEL) && prev_type&(VCF_INDEL) ) continue;
+ if ( args->rmdup & BCF_SR_PAIR_EXACT && cmpals_match(args, line) ) continue;
}
else
{
prev_rid = line->rid;
prev_pos = line->pos;
prev_type = 0;
+ if ( args->rmdup & BCF_SR_PAIR_EXACT ) cmpals_reset(args);
}
prev_type |= line_type;
+ if ( args->rmdup & BCF_SR_PAIR_EXACT ) cmpals_add(args, line);
}
// still on the same chromosome?
fprintf(bcftools_stderr, "Options:\n");
fprintf(bcftools_stderr, " -c, --check-ref <e|w|x|s> check REF alleles and exit (e), warn (w), exclude (x), or set (s) bad sites [e]\n");
fprintf(bcftools_stderr, " -D, --remove-duplicates remove duplicate lines of the same type.\n");
- fprintf(bcftools_stderr, " -d, --rm-dup <type> remove duplicate snps|indels|both|any\n");
+ fprintf(bcftools_stderr, " -d, --rm-dup <type> remove duplicate snps|indels|both|all|none\n");
fprintf(bcftools_stderr, " -f, --fasta-ref <file> reference sequence (MANDATORY)\n");
fprintf(bcftools_stderr, " -m, --multiallelics <-|+>[type] split multiallelics (-) or join biallelics (+), type: snps|indels|both|any [both]\n");
fprintf(bcftools_stderr, " --no-version do not append version and command line to the header\n");
switch (c) {
case 'N': args->do_indels = 0; break;
case 'd':
- if ( !strcmp("snps",optarg) ) args->rmdup = COLLAPSE_SNPS<<1;
- else if ( !strcmp("indels",optarg) ) args->rmdup = COLLAPSE_INDELS<<1;
- else if ( !strcmp("both",optarg) ) args->rmdup = COLLAPSE_BOTH<<1;
- else if ( !strcmp("any",optarg) ) args->rmdup = COLLAPSE_ANY<<1;
+ if ( !strcmp("snps",optarg) ) args->rmdup = BCF_SR_PAIR_SNPS;
+ else if ( !strcmp("indels",optarg) ) args->rmdup = BCF_SR_PAIR_INDELS;
+ else if ( !strcmp("both",optarg) ) args->rmdup = BCF_SR_PAIR_BOTH;
+ else if ( !strcmp("all",optarg) ) args->rmdup = BCF_SR_PAIR_ANY;
+ else if ( !strcmp("any",optarg) ) args->rmdup = BCF_SR_PAIR_ANY;
+ else if ( !strcmp("none",optarg) ) args->rmdup = BCF_SR_PAIR_EXACT;
else error("The argument to -d not recognised: %s\n", optarg);
break;
case 'm':
case 'o': args->output_fname = optarg; break;
case 'D':
fprintf(bcftools_stderr,"Warning: `-D` is functional but deprecated, replaced by `-d both`.\n");
- args->rmdup = COLLAPSE_NONE<<1;
+ args->rmdup = BCF_SR_PAIR_EXACT;
break;
case 's': args->strict_filter = 1; break;
case 'f': args->ref_fname = optarg; break;
/* vcfplugin.c -- plugin modules for operating on VCF/BCF files.
- Copyright (C) 2013-2016 Genome Research Ltd.
+ Copyright (C) 2013-2017 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
if ( !pass ) continue;
}
line = args->plugin.process(line);
- if ( line ) bcf_write1(args->out_fh, args->hdr_out, line);
+ if ( line )
+ {
+ if ( line->errcode ) error("[E::main_plugin] Unchecked error (%d), exiting\n",line->errcode);
+ bcf_write1(args->out_fh, args->hdr_out, line);
+ }
}
destroy_data(args);
bcf_sr_destroy(args->files);
/* vcfplugin.c -- plugin modules for operating on VCF/BCF files.
- Copyright (C) 2013-2016 Genome Research Ltd.
+ Copyright (C) 2013-2017 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
if ( !pass ) continue;
}
line = args->plugin.process(line);
- if ( line ) bcf_write1(args->out_fh, args->hdr_out, line);
+ if ( line )
+ {
+ if ( line->errcode ) error("[E::main_plugin] Unchecked error (%d), exiting\n",line->errcode);
+ bcf_write1(args->out_fh, args->hdr_out, line);
+ }
}
destroy_data(args);
bcf_sr_destroy(args->files);
/* vcfquery.c -- Extracts fields from VCF/BCF file.
- Copyright (C) 2013-2014 Genome Research Ltd.
+ Copyright (C) 2013-2017 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
filter_t *filter;
char *filter_str;
int filter_logic; // include or exclude sites which match the filters? One of FLT_INCLUDE/FLT_EXCLUDE
+ uint8_t *smpl_pass;
convert_t *convert;
bcf_srs_t *files;
bcf_hdr_t *header;
}
}
args->convert = convert_init(args->header, samples, nsamples, args->format_str);
+ convert_set_option(args->convert, subset_samples, &args->smpl_pass);
if ( args->allow_undef_tags ) convert_set_option(args->convert, allow_undef_tags, 1);
free(samples);
fwrite(str.s, str.l, 1, args->out);
}
+ int i,max_convert_unpack = convert_max_unpack(args->convert);
while ( bcf_sr_next_line(args->files) )
{
if ( !bcf_sr_has_line(args->files,0) ) continue;
if ( args->filter )
{
- int pass = filter_test(args->filter, line, NULL);
- if ( args->filter_logic & FLT_EXCLUDE ) pass = pass ? 0 : 1;
- if ( !pass ) continue;
+ int pass = filter_test(args->filter, line, (const uint8_t**) &args->smpl_pass);
+ if ( args->filter_logic & FLT_EXCLUDE )
+ {
+ // This code addresses this problem:
+ // -i can include a site but exclude a sample
+ // -e exclude a site but include a sample
+
+ if ( pass )
+ {
+ if ( !args->smpl_pass ) continue;
+ if ( !(max_convert_unpack & BCF_UN_FMT) ) continue;
+
+ pass = 0;
+ for (i=0; i<line->n_sample; i++)
+ {
+ if ( args->smpl_pass[i] ) args->smpl_pass[i] = 0;
+ else { args->smpl_pass[i] = 1; pass = 1; }
+ }
+ if ( !pass ) continue;
+ }
+ else if ( args->smpl_pass )
+ for (i=0; i<line->n_sample; i++) args->smpl_pass[i] = 1;
+ }
+ else if ( !pass ) continue;
}
str.l = 0;
fprintf(stderr, "Usage: bcftools query [options] <A.vcf.gz> [<B.vcf.gz> [...]]\n");
fprintf(stderr, "\n");
fprintf(stderr, "Options:\n");
- fprintf(stderr, " -c, --collapse <string> collapse lines with duplicate positions for <snps|indels|both|all|some|none>, see man page [none]\n");
fprintf(stderr, " -e, --exclude <expr> exclude sites for which the expression is true (see man page for details)\n");
fprintf(stderr, " -f, --format <string> see man page for details\n");
fprintf(stderr, " -H, --print-header print header\n");
case 'f': args->format_str = strdup(optarg); break;
case 'H': args->print_header = 1; break;
case 'v': args->vcf_list = optarg; break;
- case 'c':
- if ( !strcmp(optarg,"snps") ) collapse |= COLLAPSE_SNPS;
- else if ( !strcmp(optarg,"indels") ) collapse |= COLLAPSE_INDELS;
- else if ( !strcmp(optarg,"both") ) collapse |= COLLAPSE_SNPS | COLLAPSE_INDELS;
- else if ( !strcmp(optarg,"any") ) collapse |= COLLAPSE_ANY;
- else if ( !strcmp(optarg,"all") ) collapse |= COLLAPSE_ANY;
- else if ( !strcmp(optarg,"some") ) collapse |= COLLAPSE_SOME;
- else error("The --collapse string \"%s\" not recognised.\n", optarg);
+ case 'c':
+ error("The --collapse option is obsolete, pipe through `bcftools norm -c` instead.\n", optarg);
break;
case 'a':
{
{
if ( !fname ) usage();
args->files = bcf_sr_init();
- args->files->collapse = collapse;
if ( optind+1 < argc ) args->files->require_index = 1;
if ( args->regions_list && bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 )
error("Failed to read the regions: %s\n", args->regions_list);
/* vcfquery.c -- Extracts fields from VCF/BCF file.
- Copyright (C) 2013-2014 Genome Research Ltd.
+ Copyright (C) 2013-2017 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
filter_t *filter;
char *filter_str;
int filter_logic; // include or exclude sites which match the filters? One of FLT_INCLUDE/FLT_EXCLUDE
+ uint8_t *smpl_pass;
convert_t *convert;
bcf_srs_t *files;
bcf_hdr_t *header;
}
}
args->convert = convert_init(args->header, samples, nsamples, args->format_str);
+ convert_set_option(args->convert, subset_samples, &args->smpl_pass);
if ( args->allow_undef_tags ) convert_set_option(args->convert, allow_undef_tags, 1);
free(samples);
fwrite(str.s, str.l, 1, args->out);
}
+ int i,max_convert_unpack = convert_max_unpack(args->convert);
while ( bcf_sr_next_line(args->files) )
{
if ( !bcf_sr_has_line(args->files,0) ) continue;
if ( args->filter )
{
- int pass = filter_test(args->filter, line, NULL);
- if ( args->filter_logic & FLT_EXCLUDE ) pass = pass ? 0 : 1;
- if ( !pass ) continue;
+ int pass = filter_test(args->filter, line, (const uint8_t**) &args->smpl_pass);
+ if ( args->filter_logic & FLT_EXCLUDE )
+ {
+ // This code addresses this problem:
+ // -i can include a site but exclude a sample
+ // -e exclude a site but include a sample
+
+ if ( pass )
+ {
+ if ( !args->smpl_pass ) continue;
+ if ( !(max_convert_unpack & BCF_UN_FMT) ) continue;
+
+ pass = 0;
+ for (i=0; i<line->n_sample; i++)
+ {
+ if ( args->smpl_pass[i] ) args->smpl_pass[i] = 0;
+ else { args->smpl_pass[i] = 1; pass = 1; }
+ }
+ if ( !pass ) continue;
+ }
+ else if ( args->smpl_pass )
+ for (i=0; i<line->n_sample; i++) args->smpl_pass[i] = 1;
+ }
+ else if ( !pass ) continue;
}
str.l = 0;
fprintf(bcftools_stderr, "Usage: bcftools query [options] <A.vcf.gz> [<B.vcf.gz> [...]]\n");
fprintf(bcftools_stderr, "\n");
fprintf(bcftools_stderr, "Options:\n");
- fprintf(bcftools_stderr, " -c, --collapse <string> collapse lines with duplicate positions for <snps|indels|both|all|some|none>, see man page [none]\n");
fprintf(bcftools_stderr, " -e, --exclude <expr> exclude sites for which the expression is true (see man page for details)\n");
fprintf(bcftools_stderr, " -f, --format <string> see man page for details\n");
fprintf(bcftools_stderr, " -H, --print-header print header\n");
case 'f': args->format_str = strdup(optarg); break;
case 'H': args->print_header = 1; break;
case 'v': args->vcf_list = optarg; break;
- case 'c':
- if ( !strcmp(optarg,"snps") ) collapse |= COLLAPSE_SNPS;
- else if ( !strcmp(optarg,"indels") ) collapse |= COLLAPSE_INDELS;
- else if ( !strcmp(optarg,"both") ) collapse |= COLLAPSE_SNPS | COLLAPSE_INDELS;
- else if ( !strcmp(optarg,"any") ) collapse |= COLLAPSE_ANY;
- else if ( !strcmp(optarg,"all") ) collapse |= COLLAPSE_ANY;
- else if ( !strcmp(optarg,"some") ) collapse |= COLLAPSE_SOME;
- else error("The --collapse string \"%s\" not recognised.\n", optarg);
+ case 'c':
+ error("The --collapse option is obsolete, pipe through `bcftools norm -c` instead.\n", optarg);
break;
case 'a':
{
{
if ( !fname ) usage();
args->files = bcf_sr_init();
- args->files->collapse = collapse;
if ( optind+1 < argc ) args->files->require_index = 1;
if ( args->regions_list && bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 )
error("Failed to read the regions: %s\n", args->regions_list);
/* vcfroh.c -- HMM model for detecting runs of autozygosity.
- Copyright (C) 2013-2015 Genome Research Ltd.
+ Copyright (C) 2013-2017 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
smpl->rg.state = 1;
smpl->rg.beg = smpl->sites[i];
smpl->rg.rid = args->prev_rid;
+ smpl->rg.qual = qual;
+ smpl->rg.nqual = 1;
}
}
else if ( state )
/* vcfroh.c -- HMM model for detecting runs of autozygosity.
- Copyright (C) 2013-2015 Genome Research Ltd.
+ Copyright (C) 2013-2017 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
smpl->rg.state = 1;
smpl->rg.beg = smpl->sites[i];
smpl->rg.rid = args->prev_rid;
+ smpl->rg.qual = qual;
+ smpl->rg.nqual = 1;
}
}
else if ( state )
"import csv\n"
"csv.register_dialect('tab', delimiter='\\t', quoting=csv.QUOTE_NONE)\n"
"dat = []\n"
- "with open('%s.eval', 'rb') as f:\n"
+ "with open('%s.eval', 'r') as f:\n"
"\treader = csv.reader(f, 'tab')\n"
"\tfor row in reader:\n"
"\t\tif row[0][0]!='#': dat.append(row)\n"
"import csv\n"
"csv.register_dialect('tab', delimiter='\\t', quoting=csv.QUOTE_NONE)\n"
"dat = []\n"
- "with open('%s.eval', 'rb') as f:\n"
+ "with open('%s.eval', 'r') as f:\n"
"\treader = csv.reader(f, 'tab')\n"
"\tfor row in reader:\n"
"\t\tif row[0][0]!='#': dat.append(row)\n"
/* vcfstats.c -- Produces stats which can be plotted using plot-vcfstats.
- Copyright (C) 2012-2016 Genome Research Ltd.
+ Copyright (C) 2012-2017 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
int in_frame, out_frame, na_frame, in_frame_alt1, out_frame_alt1, na_frame_alt1;
int subst[15];
int *smpl_hets, *smpl_homRR, *smpl_homAA, *smpl_ts, *smpl_tv, *smpl_indels, *smpl_ndp, *smpl_sngl;
- int *smpl_hapRef, *smpl_hapAlt;
+ int *smpl_hapRef, *smpl_hapAlt, *smpl_missing;
int *smpl_indel_hets, *smpl_indel_homs;
int *smpl_frm_shifts; // not-applicable, in-frame, out-frame
unsigned long int *smpl_dp;
// Sanity check: the reference sequence must match the REF allele
for (i=0; i<fai_ref_len && i<ref_len; i++)
- if ( ref[i] != fai_ref[i] && ref[i] - 32 != fai_ref[i] )
+ if ( ref[i] != fai_ref[i] && ref[i] - 32 != fai_ref[i] && !iupac_consistent(fai_ref[i], ref[i]) )
error("\nSanity check failed, the reference sequence differs: %s:%d+%d .. %c vs %c\n", chr, pos, i, ref[i],fai_ref[i]);
// Count occurrences of all possible kmers
#endif
if ( args->files->n_smpl )
{
+ stats->smpl_missing = (int *) calloc(args->files->n_smpl,sizeof(int));
stats->smpl_hets = (int *) calloc(args->files->n_smpl,sizeof(int));
stats->smpl_homAA = (int *) calloc(args->files->n_smpl,sizeof(int));
stats->smpl_homRR = (int *) calloc(args->files->n_smpl,sizeof(int));
#endif
free(stats->insertions);
free(stats->deletions);
+ free(stats->smpl_missing);
free(stats->smpl_hets);
free(stats->smpl_homAA);
free(stats->smpl_homRR);
{
int ial, jal;
int gt = bcf_gt_type(fmt_ptr, reader->samples[is], &ial, &jal);
- if ( gt==GT_UNKN ) continue;
+ if ( gt==GT_UNKN )
+ {
+ stats->smpl_missing[is]++;
+ continue;
+ }
if ( gt==GT_HAPL_R || gt==GT_HAPL_A )
{
if ( line_type&VCF_INDEL && stats->smpl_frm_shifts )
if ( args->files->n_smpl )
{
- printf("# PSC, Per-sample counts. Note that the ref/het/hom counts include only SNPs, for indels see PSI. Haploid counts include both SNPs and indels.\n");
+ printf("# PSC, Per-sample counts. Note that the ref/het/hom counts include only SNPs, for indels see PSI. The rest include both SNPs and indels.\n");
printf("# PSC\t[2]id\t[3]sample\t[4]nRefHom\t[5]nNonRefHom\t[6]nHets\t[7]nTransitions\t[8]nTransversions\t[9]nIndels\t[10]average depth\t[11]nSingletons"
- "\t[12]nHapRef\t[13]nHapAlt\n");
+ "\t[12]nHapRef\t[13]nHapAlt\t[14]nMissing\n");
for (id=0; id<args->nstats; id++)
{
stats_t *stats = &args->stats[id];
for (i=0; i<args->files->n_smpl; i++)
{
float dp = stats->smpl_ndp[i] ? stats->smpl_dp[i]/(float)stats->smpl_ndp[i] : 0;
- printf("PSC\t%d\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%.1f\t%d\t%d\t%d\n", id,args->files->samples[i],
+ printf("PSC\t%d\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%.1f\t%d\t%d\t%d\t%d\n", id,args->files->samples[i],
stats->smpl_homRR[i], stats->smpl_homAA[i], stats->smpl_hets[i], stats->smpl_ts[i],
- stats->smpl_tv[i], stats->smpl_indels[i],dp, stats->smpl_sngl[i], stats->smpl_hapRef[i], stats->smpl_hapAlt[i]);
+ stats->smpl_tv[i], stats->smpl_indels[i],dp, stats->smpl_sngl[i], stats->smpl_hapRef[i],
+ stats->smpl_hapAlt[i], stats->smpl_missing[i]);
}
}
/* vcfstats.c -- Produces stats which can be plotted using plot-vcfstats.
- Copyright (C) 2012-2016 Genome Research Ltd.
+ Copyright (C) 2012-2017 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
int in_frame, out_frame, na_frame, in_frame_alt1, out_frame_alt1, na_frame_alt1;
int subst[15];
int *smpl_hets, *smpl_homRR, *smpl_homAA, *smpl_ts, *smpl_tv, *smpl_indels, *smpl_ndp, *smpl_sngl;
- int *smpl_hapRef, *smpl_hapAlt;
+ int *smpl_hapRef, *smpl_hapAlt, *smpl_missing;
int *smpl_indel_hets, *smpl_indel_homs;
int *smpl_frm_shifts; // not-applicable, in-frame, out-frame
unsigned long int *smpl_dp;
// Sanity check: the reference sequence must match the REF allele
for (i=0; i<fai_ref_len && i<ref_len; i++)
- if ( ref[i] != fai_ref[i] && ref[i] - 32 != fai_ref[i] )
+ if ( ref[i] != fai_ref[i] && ref[i] - 32 != fai_ref[i] && !iupac_consistent(fai_ref[i], ref[i]) )
error("\nSanity check failed, the reference sequence differs: %s:%d+%d .. %c vs %c\n", chr, pos, i, ref[i],fai_ref[i]);
// Count occurrences of all possible kmers
#endif
if ( args->files->n_smpl )
{
+ stats->smpl_missing = (int *) calloc(args->files->n_smpl,sizeof(int));
stats->smpl_hets = (int *) calloc(args->files->n_smpl,sizeof(int));
stats->smpl_homAA = (int *) calloc(args->files->n_smpl,sizeof(int));
stats->smpl_homRR = (int *) calloc(args->files->n_smpl,sizeof(int));
#endif
free(stats->insertions);
free(stats->deletions);
+ free(stats->smpl_missing);
free(stats->smpl_hets);
free(stats->smpl_homAA);
free(stats->smpl_homRR);
{
int ial, jal;
int gt = bcf_gt_type(fmt_ptr, reader->samples[is], &ial, &jal);
- if ( gt==GT_UNKN ) continue;
+ if ( gt==GT_UNKN )
+ {
+ stats->smpl_missing[is]++;
+ continue;
+ }
if ( gt==GT_HAPL_R || gt==GT_HAPL_A )
{
if ( line_type&VCF_INDEL && stats->smpl_frm_shifts )
if ( args->files->n_smpl )
{
- fprintf(bcftools_stdout, "# PSC, Per-sample counts. Note that the ref/het/hom counts include only SNPs, for indels see PSI. Haploid counts include both SNPs and indels.\n");
+ fprintf(bcftools_stdout, "# PSC, Per-sample counts. Note that the ref/het/hom counts include only SNPs, for indels see PSI. The rest include both SNPs and indels.\n");
fprintf(bcftools_stdout, "# PSC\t[2]id\t[3]sample\t[4]nRefHom\t[5]nNonRefHom\t[6]nHets\t[7]nTransitions\t[8]nTransversions\t[9]nIndels\t[10]average depth\t[11]nSingletons"
- "\t[12]nHapRef\t[13]nHapAlt\n");
+ "\t[12]nHapRef\t[13]nHapAlt\t[14]nMissing\n");
for (id=0; id<args->nstats; id++)
{
stats_t *stats = &args->stats[id];
for (i=0; i<args->files->n_smpl; i++)
{
float dp = stats->smpl_ndp[i] ? stats->smpl_dp[i]/(float)stats->smpl_ndp[i] : 0;
- fprintf(bcftools_stdout, "PSC\t%d\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%.1f\t%d\t%d\t%d\n", id,args->files->samples[i],
+ fprintf(bcftools_stdout, "PSC\t%d\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%.1f\t%d\t%d\t%d\t%d\n", id,args->files->samples[i],
stats->smpl_homRR[i], stats->smpl_homAA[i], stats->smpl_hets[i], stats->smpl_ts[i],
- stats->smpl_tv[i], stats->smpl_indels[i],dp, stats->smpl_sngl[i], stats->smpl_hapRef[i], stats->smpl_hapAlt[i]);
+ stats->smpl_tv[i], stats->smpl_indels[i],dp, stats->smpl_sngl[i], stats->smpl_hapRef[i],
+ stats->smpl_hapAlt[i], stats->smpl_missing[i]);
}
}
/* vcfview.c -- VCF/BCF conversion, view, subset and filter VCF/BCF files.
- Copyright (C) 2013-2014 Genome Research Ltd.
+ Copyright (C) 2013-2018 Genome Research Ltd.
Author: Shane McCarthy <sm15@sanger.ac.uk>
an += args->ac[i];
}
+ int update_ac = args->calc_ac;
if (args->n_samples)
{
int non_ref_ac_sub = 0, *ac_sub = (int*) calloc(line->n_allele,sizeof(int));
bcf_subset(args->hdr, line, args->n_samples, args->imap);
- if (args->calc_ac) {
+ if ( args->calc_ac && !bcf_get_fmt(args->hdr,line,"GT") ) update_ac = 0;
+ if ( update_ac )
+ {
bcf_calc_ac(args->hsub, line, ac_sub, BCF_UN_FMT); // recalculate AC and AN
an = 0;
for (i=0; i<line->n_allele; i++) {
if (args->uncalled == FLT_INCLUDE && an > 0) return 0; // select uncalled
if (args->uncalled == FLT_EXCLUDE && an == 0) return 0; // skip if uncalled
}
- if (args->calc_ac && args->update_info) {
+ if (update_ac && args->update_info) {
bcf_update_info_int32(args->hdr, line, "AC", &args->ac[1], line->n_allele-1);
bcf_update_info_int32(args->hdr, line, "AN", &an, 1);
}
/* vcfview.c -- VCF/BCF conversion, view, subset and filter VCF/BCF files.
- Copyright (C) 2013-2014 Genome Research Ltd.
+ Copyright (C) 2013-2018 Genome Research Ltd.
Author: Shane McCarthy <sm15@sanger.ac.uk>
an += args->ac[i];
}
+ int update_ac = args->calc_ac;
if (args->n_samples)
{
int non_ref_ac_sub = 0, *ac_sub = (int*) calloc(line->n_allele,sizeof(int));
bcf_subset(args->hdr, line, args->n_samples, args->imap);
- if (args->calc_ac) {
+ if ( args->calc_ac && !bcf_get_fmt(args->hdr,line,"GT") ) update_ac = 0;
+ if ( update_ac )
+ {
bcf_calc_ac(args->hsub, line, ac_sub, BCF_UN_FMT); // recalculate AC and AN
an = 0;
for (i=0; i<line->n_allele; i++) {
if (args->uncalled == FLT_INCLUDE && an > 0) return 0; // select uncalled
if (args->uncalled == FLT_EXCLUDE && an == 0) return 0; // skip if uncalled
}
- if (args->calc_ac && args->update_info) {
+ if (update_ac && args->update_info) {
bcf_update_info_int32(args->hdr, line, "AC", &args->ac[1], line->n_allele-1);
bcf_update_info_int32(args->hdr, line, "AN", &an, 1);
}
-#define BCFTOOLS_VERSION "1.6"
+#define BCFTOOLS_VERSION "1.7"
Release notes
=============
+Release 0.14.1
+==============
+
+This is mostly a bugfix release, though bcftools has now also been
+upgraded to 1.7.0.
+
+* [#621] Add a warning to count_coverage when an alignment has an
+ empty QUAL field
+* [#635] Speed-up of AlignedSegment.find_intro()
+* treat border case of all bases in pileup column below quality score
+* [#634] Fix access to pileup reference_sequence
+
+
Release 0.14.0
==============
property reference_name:
""":term:`reference` name (None if no AlignmentFile is associated)"""
def __get__(self):
- if self._alignment_file is not None:
- return self._alignment_file.getrname(self.tid)
+ if self.header is not None:
+ return self.header.get_reference_name(self.tid)
return None
property nsegments:
buf[n] = p.b.core.qual + 33
n += 1
assert n < MAX_PILEUP_BUFFER_SIZE
-
if not p.is_del:
if p.qpos < p.b.core.l_qseq:
cc = <uint8_t>seq_nt16_str[bam_seqi(bam_get_seq(p.b), p.qpos)]
n += 1
assert n < MAX_PILEUP_BUFFER_SIZE
- # quicker to ensemble all and split than to encode all separately.
- # ignore last ":"
- return force_str(PyBytes_FromStringAndSize(<char*>buf, n-1)).split(":")
+ if n == 0:
+ # could be zero if all qualities are too low
+ return ""
+ else:
+ # quicker to ensemble all and split than to encode all separately.
+ # ignore last ":"
+ return force_str(PyBytes_FromStringAndSize(<char*>buf, n-1)).split(":")
def get_query_qualities(self):
"""query base quality scores at pileup column position.
+
# cython: embedsignature=True
# cython: profile=True
########################################################
"""
-
cdef uint32_t contig_length = self.get_reference_length(contig)
cdef int _start = start if start is not None else 0
cdef int _stop = stop if stop is not None else contig_length
cdef int refpos
cdef int c = 0
cdef int filter_method = 0
+
+
if read_callback == "all":
filter_method = 1
elif read_callback == "nofilter":
filter_method = 2
- cdef int _threshold = quality_threshold
+ cdef int _threshold = quality_threshold or 0
for read in self.fetch(contig=contig,
reference=reference,
start=start,
# count
seq = read.seq
quality = read.query_qualities
+
for qpos, refpos in read.get_aligned_pairs(True):
if qpos is not None and refpos is not None and \
_start <= refpos < _stop:
- if quality[qpos] >= quality_threshold:
+
+ # only check base quality if _threshold > 0
+ if (_threshold and quality and quality[qpos] >= _threshold) or not _threshold:
if seq[qpos] == 'A':
count_a.data.as_ulongs[refpos - _start] += 1
if seq[qpos] == 'C':
return count_a, count_c, count_g, count_t
- def find_introns(self, read_iterator):
+ def find_introns_slow(self, read_iterator):
"""Return a dictionary {(start, stop): count}
Listing the intronic sites in the reads (identified by 'N' in the cigar strings),
and their support ( = number of reads ).
last_read_pos = read_loc
return res
+ def find_introns(self, read_iterator):
+ """Return a dictionary {(start, stop): count}
+ Listing the intronic sites in the reads (identified by 'N' in the cigar strings),
+ and their support ( = number of reads ).
+
+ read_iterator can be the result of a .fetch(...) call.
+ Or it can be a generator filtering such reads. Example
+ samfile.find_introns((read for read in samfile.fetch(...) if read.is_reverse)
+ """
+ cdef:
+ uint32_t base_position, junc_start, nt
+ int op
+ AlignedSegment r
+ int BAM_CREF_SKIP = 3 #BAM_CREF_SKIP
+
+ import collections
+ res = collections.Counter()
+
+ match_or_deletion = {0, 2, 7, 8} # only M/=/X (0/7/8) and D (2) are related to genome position
+ for r in read_iterator:
+ base_position = r.pos
+
+ for op, nt in r.cigartuples:
+ if op in match_or_deletion:
+ base_position += nt
+ elif op == BAM_CREF_SKIP:
+ junc_start = base_position
+ base_position += nt
+ res[(junc_start, base_position)] += 1
+ return res
+
+
def close(self):
'''closes the :class:`pysam.AlignmentFile`.'''
# pysam versioning information
-__version__ = "0.14"
+__version__ = "0.14.1"
# TODO: upgrade number
__samtools_version__ = "1.7"
import unittest
import os
-import shutil
import sys
import re
import copy
-import collections
from collections import OrderedDict as odict
-import subprocess
-import logging
-import array
+import pysam
+import pysam.samtools
+from TestUtils import get_temp_filename, BAM_DATADIR
+
if sys.version_info.major >= 3:
from io import StringIO
else:
from StringIO import StringIO
-from functools import partial
-
-import pysam
-import pysam.samtools
-from TestUtils import checkBinaryEqual, checkURL, \
- check_samtools_view_equal, checkFieldEqual, force_str, \
- get_temp_filename, BAM_DATADIR
-
class TestHeaderConstruction(unittest.TestCase):
"""testing header construction."""
"""Benchmarking module for AlignmentFile functionality"""
import os
-import re
+import pysam
import unittest
from TestUtils import BAM_DATADIR, IS_PYTHON3, force_str, flatten_nested_list
-from PileupTestUtils import *
+import PileupTestUtils
class TestPileupReadSelection(unittest.TestCase):
pcolumn1.reference_id, 0,
"chromosome/target id mismatch in position 1: %s != %s" %
(pcolumn1.reference_id, 0))
+ self.assertEqual(
+ pcolumn1.reference_name, "chr1",
+ "chromosome mismatch in position 1: %s != %s" %
+ (pcolumn1.reference_name, "chr1"))
self.assertEqual(
pcolumn1.reference_pos, 105 - 1,
"position mismatch in position 1: %s != %s" %
pcolumn2.reference_id, 1,
"chromosome/target id mismatch in position 1: %s != %s" %
(pcolumn2.reference_id, 1))
+ self.assertEqual(
+ pcolumn2.reference_name, "chr2",
+ "chromosome mismatch in position 1: %s != %s" %
+ (pcolumn2.reference_name, "chr2"))
self.assertEqual(
pcolumn2.reference_pos, 1480 - 1,
"position mismatch in position 1: %s != %s" %
fn_fasta = os.path.join(BAM_DATADIR, "ex1.fa")
def test_pileup_depths_are_equal(self):
- samtools_result = build_depth_with_samtoolspipe(self.fn)
- pysam_result = build_depth_with_filter_with_pysam(self.fn)
+ samtools_result = PileupTestUtils.build_depth_with_samtoolspipe(self.fn)
+ pysam_result = PileupTestUtils.build_depth_with_filter_with_pysam(self.fn)
self.assertEqual(pysam_result, samtools_result)
def test_pileup_query_bases_without_reference_are_equal(self):
- samtools_result = build_query_bases_with_samtoolspipe(self.fn)
- pysam_result = build_query_bases_with_pysam(self.fn)
+ samtools_result = PileupTestUtils.build_query_bases_with_samtoolspipe(self.fn)
+ pysam_result = PileupTestUtils.build_query_bases_with_pysam(self.fn)
self.assertEqual(["".join(x) for x in pysam_result], samtools_result)
def test_pileup_query_bases_with_reference_are_equal(self):
- samtools_result = build_query_bases_with_samtoolspipe(self.fn, "-f", self.fn_fasta)
+ samtools_result = PileupTestUtils.build_query_bases_with_samtoolspipe(self.fn, "-f", self.fn_fasta)
with pysam.FastaFile(self.fn_fasta) as fasta:
- pysam_result = build_query_bases_with_pysam(self.fn, fastafile=fasta, stepper="samtools")
+ pysam_result = PileupTestUtils.build_query_bases_with_pysam(self.fn, fastafile=fasta, stepper="samtools")
self.assertEqual(["".join(x) for x in pysam_result], samtools_result)
def test_pileup_query_qualities_are_equal(self):
- samtools_result = build_query_qualities_with_samtoolspipe(self.fn)
- pysam_result = build_query_qualities_with_pysam(self.fn)
+ samtools_result = PileupTestUtils.build_query_qualities_with_samtoolspipe(self.fn)
+ pysam_result = PileupTestUtils.build_query_qualities_with_pysam(self.fn)
pysam_result = [
[chr(min(126, x + 33)) for x in l] for l in pysam_result]
self.assertEqual("".join(flatten_nested_list(pysam_result)),
"".join(flatten_nested_list(samtools_result)))
def test_pileup_mapping_qualities_are_equal(self):
- samtools_result = build_mapping_qualities_with_samtoolspipe(self.fn)
- pysam_result = build_mapping_qualities_with_pysam(self.fn)
+ samtools_result = PileupTestUtils.build_mapping_qualities_with_samtoolspipe(self.fn)
+ pysam_result = PileupTestUtils.build_mapping_qualities_with_pysam(self.fn)
# convert to chars
pysam_result = [
[chr(min(126, x + 33)) for x in l] for l in pysam_result]
"".join(flatten_nested_list(samtools_result)))
def test_pileup_query_qualities_from_pileups_are_equal(self):
- samtools_result = build_query_qualities_with_samtoolspipe(self.fn)
- pysam_result = build_query_qualities_with_pysam_pileups(self.fn)
+ samtools_result = PileupTestUtils.build_query_qualities_with_samtoolspipe(self.fn)
+ pysam_result = PileupTestUtils.build_query_qualities_with_pysam_pileups(self.fn)
pysam_result = [
"".join([chr(min(126, x + 33)) for x in l]) for l in pysam_result]
import os
import shutil
import sys
-import re
-import copy
import collections
import subprocess
import logging
import pysam
import pysam.samtools
-from TestUtils import checkBinaryEqual, checkURL, \
+from TestUtils import checkBinaryEqual, check_url, \
check_samtools_view_equal, checkFieldEqual, force_str, \
get_temp_filename, BAM_DATADIR
def testFTPView(self):
return
- if not checkURL(self.url):
+ if not check_url(self.url):
return
result = pysam.samtools.view(self.url, self.region)
def testFTPFetch(self):
return
- if not checkURL(self.url):
+ if not check_url(self.url):
return
samfile = pysam.AlignmentFile(self.url, "rb")
local = os.path.join(BAM_DATADIR, "ex1.bam")
def testView(self):
- if not checkURL(self.url):
+ if not check_url(self.url):
return
samfile_local = pysam.AlignmentFile(self.local, "rb")
self.assertEqual(len(result.splitlines()), len(ref))
def testFetch(self):
- if not checkURL(self.url):
+ if not check_url(self.url):
return
with pysam.AlignmentFile(self.url, "rb") as samfile:
self.assertEqual(x.compare(y), 0)
def testFetchAll(self):
- if not checkURL(self.url):
+ if not check_url(self.url):
return
with pysam.AlignmentFile(self.url, "rb") as samfile:
samfile.fetch("chr1")
def testRemoteExplicitIndexBAM(self):
- if not checkURL(
+ if not check_url(
"http://genserv.anat.ox.ac.uk/downloads/pysam/test/noindex.bam"):
return
def testRead(self):
- if not checkURL(self.bamfile):
+ if not check_url(self.bamfile):
return
f = pysam.AlignmentFile(self.bamfile, "rb")
return True
-def checkURL(url):
+def check_url(url):
'''return True if URL is available.
A URL might not be available if it is the wrong URL
import copy
import shutil
-from TestUtils import checkURL, BAM_DATADIR, get_temp_filename
+from TestUtils import check_url, BAM_DATADIR, get_temp_filename
class TestFastaFile(unittest.TestCase):
"GRCh38_reference_genome/GRCh38_full_analysis_set_plus_decoy_hla.fa")
def testFTPView(self):
- if not checkURL(self.url):
+ if not check_url(self.url):
return
with pysam.Fastafile(self.url) as f:
1000)
def test_sequence_lengths_are_available(self):
- if not checkURL(self.url):
+ if not check_url(self.url):
return
with pysam.Fastafile(self.url) as f:
--- /dev/null
+@SQ SN:chr1 LN:249250621
+@SQ SN:chr10 LN:135534747
+@SQ SN:chr11 LN:135006516
+@SQ SN:chr12 LN:133851895
+@SQ SN:chr13 LN:115169878
+@SQ SN:chr14 LN:107349540
+@SQ SN:chr15 LN:102531392
+@SQ SN:chr16 LN:90354753
+@SQ SN:chr17 LN:81195210
+@SQ SN:chr18 LN:78077248
+@SQ SN:chr19 LN:59128983
+@SQ SN:chr2 LN:243199373
+@SQ SN:chr20 LN:63025520
+@SQ SN:chr21 LN:48129895
+@SQ SN:chr22 LN:51304566
+@SQ SN:chr3 LN:198022430
+@SQ SN:chr4 LN:191154276
+@SQ SN:chr5 LN:180915260
+@SQ SN:chr6 LN:171115067
+@SQ SN:chr7 LN:159138663
+@SQ SN:chr8 LN:146364022
+@SQ SN:chr9 LN:141213431
+@SQ SN:chrM LN:16571
+@SQ SN:chrX LN:155270560
+@SQ SN:chrY LN:59373566
+@PG ID:bwa PN:bwa VN:0.7.12-r1039 CL:bwa mem /ifs/mirror/genomes/bwa-0.7.5a/hg19 both.fq
+chr22:25000000-25000500 0 chr22 25000000 60 501M * 0 0 GCTCTGTGCCGCAAGGGTGGAAACTGTGAGAGACAGATTCCAACTCCACGTCTGGGTAGTAAGCATCCAGTCCAGGGCTGTAGGCAGTCCTGGGGAAGACGCCAGAGATCTGTGCATTCTCATATCGAGGGATAGCGACTCCAGGCTGGGGGCTGGCAGGGTAAGGGGTGGGTGGGTCCTGGGCTTACCCGCAGGTCTGCAGACTTCCTGGGGCCAGCTGACCTCGTAAAATCCCTTTTGTCTAAGCTTCAGTTTCCTGCCTGTGAATGGGGTTGGGGCTGTGCTCTGGTTTCACCCTTGTGGCTCTGGGGTTGTGGTGACAAAGCCATCAAGCTGGGTTGAAGGATTAACCAGGAAACTTCAGACTGGCTGCCGTGTCTACCTCTTCCTCATACTCCTCTCTCTGCTGCATCCTGGGAAGCTGCTCTGCTCAGCCTAAATGAGGCTCAGTTGTGTGTGTGCGCACGTGCTTGCACGTGTGTTGGAAGTGGGTGATACTGA GCTCTGTGCCGCAAGGGTGGAAACTGTGAGAGACAGATTCCAACTCCACGTCTGGGTAGTAAGCATCCAGTCCAGGGCTGTAGGCAGTCCTGGGGAAGACGCCAGAGATCTGTGCATTCTCATATCGAGGGATAGCGACTCCAGGCTGGGGGCTGGCAGGGTAAGGGGTGGGTGGGTCCTGGGCTTACCCGCAGGTCTGCAGACTTCCTGGGGCCAGCTGACCTCGTAAAATCCCTTTTGTCTAAGCTTCAGTTTCCTGCCTGTGAATGGGGTTGGGGCTGTGCTCTGGTTTCACCCTTGTGGCTCTGGGGTTGTGGTGACAAAGCCATCAAGCTGGGTTGAAGGATTAACCAGGAAACTTCAGACTGGCTGCCGTGTCTACCTCTTCCTCATACTCCTCTCTCTGCTGCATCCTGGGAAGCTGCTCTGCTCAGCCTAAATGAGGCTCAGTTGTGTGTGTGCGCACGTGCTTGCACGTGTGTTGGAAGTGGGTGATACTGA NM:i:0 MD:Z:501 AS:i:501 XS:i:343
+chr22:25000000-25000500 16 chr22 25000000 60 501M * 0 0 GCTCTGTGCCGCAAGGGTGGAAACTGTGAGAGACAGATTCCAACTCCACGTCTGGGTAGTAAGCATCCAGTCCAGGGCTGTAGGCAGTCCTGGGGAAGACGCCAGAGATCTGTGCATTCTCATATCGAGGGATAGCGACTCCAGGCTGGGGGCTGGCAGGGTAAGGGGTGGGTGGGTCCTGGGCTTACCCGCAGGTCTGCAGACTTCCTGGGGCCAGCTGACCTCGTAAAATCCCTTTTGTCTAAGCTTCAGTTTCCTGCCTGTGAATGGGGTTGGGGCTGTGCTCTGGTTTCACCCTTGTGGCTCTGGGGTTGTGGTGACAAAGCCATCAAGCTGGGTTGAAGGATTAACCAGGAAACTTCAGACTGGCTGCCGTGTCTACCTCTTCCTCATACTCCTCTCTCTGCTGCATCCTGGGAAGCTGCTCTGCTCAGCCTAAATGAGGCTCAGTTGTGTGTGTGCGCACGTGCTTGCACGTGTGTTGGAAGTGGGTGATACTGA CGAGACACGGCGTTCCCACCTTTGACACTCTCTGTCTAAGGTTGAGGTGCAGACCCATCATTCGTAGGTCAGGTCCCGACATCCGTCAGGACCCCTTCTGCGGTCTCTAGACACGTAAGAGTATAGCTCCCTATCGCTGAGGTCCGACCCCCGACCGTCCCATTCCCCACCCACCCAGGACCCGAATGGGCGTCCAGACGTCTGAAGGACCCCGGTCGACTGGAGCATTTTAGGGAAAACAGATTCGAAGTCAAAGGACGGACACTTACCCCAACCCCGACACGAGACCAAAGTGGGAACACCGAGACCCCAACACCACTGTTTCGGTAGTTCGACCCAACTTCCTAATTGGTCCTTTGAAGTCTGACCGACGGCACAGATGGAGAAGGAGTATGAGGAGAGAGACGACGTAGGACCCTTCGACGAGACGAGTCGGATTTACTCCGAGTCAACACACACACGCGTGCACGAACGTGCACACAACCTTCACCCACTATGACT NM:i:0 MD:Z:501 AS:i:501 XS:i:343
import re
import copy
import tempfile
-from TestUtils import checkURL, load_and_convert, TABIX_DATADIR, get_temp_filename
+from TestUtils import check_url, load_and_convert, TABIX_DATADIR, get_temp_filename
IS_PYTHON3 = sys.version_info[0] >= 3
local = os.path.join(TABIX_DATADIR, "example.gtf.gz")
def setUp(self):
- if not pysam.config.HAVE_LIBCURL or not checkURL(self.url):
+ if not pysam.config.HAVE_LIBCURL or not check_url(self.url):
self.remote_file = None
else:
self.remote_file = pysam.TabixFile(self.url, "r")
local = os.path.join(TABIX_DATADIR, "example_comments.gtf.gz")
def setUp(self):
- if not pysam.config.HAVE_LIBCURL or not checkURL(self.url):
+ if not pysam.config.HAVE_LIBCURL or not check_url(self.url):
self.remote_file = None
else:
self.remote_file = pysam.TabixFile(self.url, "r")