New upstream version 0.14.1+ds

author Afif Elghraoui <afif@debian.org>

Sun, 22 Apr 2018 22:30:02 +0000 (18:30 -0400)

committer Afif Elghraoui <afif@debian.org>

Sun, 22 Apr 2018 22:30:02 +0000 (18:30 -0400)
author Afif Elghraoui <afif@debian.org>
Sun, 22 Apr 2018 22:30:02 +0000 (18:30 -0400)
committer Afif Elghraoui <afif@debian.org>
Sun, 22 Apr 2018 22:30:02 +0000 (18:30 -0400)
diff --git a/NEWS b/NEWS

index bca0d0c834307dc4f031df9e681123f3c743dcba..0e2cb71c790c3d68e8acbb8125fbd46ea83ac16b 100644 (file)
--- a/NEWS
+++ b/NEWS
@@ -5,6 +5,18 @@ http://pysam.readthedocs.io/en/latest/release.html
  Release notes
  =============
  
+Release 0.14.1
+==============
+
+This is mostly a bugfix release, though bcftools has now also been
+upgraded to 1.7.0.
+
+* [#621] Add a warning to count_coverage when an alignment has an
+  empty QUAL field
+* [#635] Speed-up of AlignedSegment.find_intro()
+* treat border case of all bases in pileup column below quality score
+* [#634] Fix access to pileup reference_sequence
+
  Release 0.14.0
  ==============
  
diff --git a/bcftools/bcftools.h b/bcftools/bcftools.h

index dde3ab0101757b138d7de2c1e6b881b353c2abbc..d3ba814197826231d28173e63731a3f0809625f4 100644 (file)
--- a/bcftools/bcftools.h
+++ b/bcftools/bcftools.h
@@ -63,6 +63,23 @@ static inline char gt2iupac(char a, char b)
      return iupac[(int)a][(int)b];
  }
  
+static inline int iupac_consistent(char iupac, char nt)
+{
+    static const char iupac_mask[90] = {
+        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,14,2,
+        13,0,0,4,11,0,0,12,0,3,15,0,0,0,5,6,8,0,7,9,0,10
+    };
+    if ( iupac > 89 ) return 0;
+    if ( nt > 90 ) nt -=  32;  // lowercase 
+    if ( nt=='A' ) nt = 1;
+    else if ( nt=='C' ) nt = 2;
+    else if ( nt=='G' ) nt = 4;
+    else if ( nt=='T' ) nt = 8;
+    return iupac_mask[(int)iupac] & nt ? 1 : 0;
+}
+
  static inline char nt_to_upper(char nt)
  {
      if ( nt < 97 ) return nt;
diff --git a/bcftools/consensus.c b/bcftools/consensus.c

index 544eca6248b1be4477c8610a8d320331c1f7649c..d67a052197e6c69c4cbb59ac948a2dfee5d48be9 100644 (file)
--- a/bcftools/consensus.c
+++ b/bcftools/consensus.c
@@ -77,6 +77,7 @@ typedef struct
      rbuf_t vcf_rbuf;
      bcf1_t **vcf_buf;
      int nvcf_buf, rid;
+    char *chr;
  
      regidx_t *mask;
      regitr_t *itr;
@@ -121,6 +122,8 @@ static void destroy_chain(args_t *args)
      free(chain->block_lengths);
      free(chain);
      chain = NULL;
+    free(args->chr);
+    args->chr = NULL;
  }
  
  static void print_chain(args_t *args)
@@ -162,7 +165,7 @@ static void print_chain(args_t *args)
          score += chain->block_lengths[n];
      }
      score += last_block_size;
-    fprintf(args->fp_chain, "chain %d %s %d + %d %d %s %d + %d %d %d\n", score, bcf_hdr_id2name(args->hdr,args->rid), ref_end_pos, chain->ori_pos, ref_end_pos, bcf_hdr_id2name(args->hdr,args->rid), alt_end_pos, chain->ori_pos, alt_end_pos, ++args->chain_id);
+    fprintf(args->fp_chain, "chain %d %s %d + %d %d %s %d + %d %d %d\n", score, args->chr, ref_end_pos, chain->ori_pos, ref_end_pos, args->chr, alt_end_pos, chain->ori_pos, alt_end_pos, ++args->chain_id);
      for (n=0; n<chain->num; n++) {
          fprintf(args->fp_chain, "%d %d %d\n", chain->block_lengths[n], chain->ref_gaps[n], chain->alt_gaps[n]);
      }
@@ -248,6 +251,7 @@ static void destroy_data(args_t *args)
          if ( args->vcf_buf[i] ) bcf_destroy1(args->vcf_buf[i]);
      free(args->vcf_buf);
      free(args->fa_buf.s);
+    free(args->chr);
      if ( args->mask ) regidx_destroy(args->mask);
      if ( args->itr ) regitr_destroy(args->itr);
      if ( args->chain_fname )
@@ -276,6 +280,7 @@ static void init_region(args_t *args, char *line)
              else to--;
          }
      }
+    args->chr = strdup(line);
      args->rid = bcf_hdr_name2id(args->hdr,line);
      if ( args->rid<0 ) fprintf(stderr,"Warning: Sequence \"%s\" not in %s\n", line,args->fname);
      args->fa_buf.l = 0;
@@ -380,7 +385,7 @@ static void apply_variant(args_t *args, bcf1_t *rec)
          if ( regidx_overlap(args->mask, chr,start,end,NULL) ) return;
      }
  
-    int i, ialt = 1;
+    int i, ialt = 1;    // the alternate allele
      if ( args->isample >= 0 )
      {
          bcf_unpack(rec, BCF_UN_FMT);
@@ -417,6 +422,7 @@ static void apply_variant(args_t *args, bcf1_t *rec)
              {
                  char ial = rec->d.allele[ialt][0];
                  char jal = rec->d.allele[jalt][0];
+                if ( !ialt ) ialt = jalt;   // only ialt is used, make sure 0/1 is not ignored
                  rec->d.allele[ialt][0] = gt2iupac(ial,jal);
              }
          }
@@ -565,11 +571,10 @@ static void apply_variant(args_t *args, bcf1_t *rec)
  
  static void mask_region(args_t *args, char *seq, int len)
  {
-    char *chr = (char*)bcf_hdr_id2name(args->hdr,args->rid);
      int start = args->fa_src_pos - len;
      int end   = args->fa_src_pos;
  
-    if ( !regidx_overlap(args->mask, chr,start,end, args->itr) ) return;
+    if ( !regidx_overlap(args->mask, args->chr,start,end, args->itr) ) return;
  
      int idx_start, idx_end, i;
      while ( regitr_overlap(args->itr) )
diff --git a/bcftools/consensus.c.pysam.c b/bcftools/consensus.c.pysam.c

index 9ba5dd0d39af64aabf81f350615ed0d370b96b59..901560181adda490c01a7454d281c922a1131b34 100644 (file)
--- a/bcftools/consensus.c.pysam.c
+++ b/bcftools/consensus.c.pysam.c
@@ -79,6 +79,7 @@ typedef struct
      rbuf_t vcf_rbuf;
      bcf1_t **vcf_buf;
      int nvcf_buf, rid;
+    char *chr;
  
      regidx_t *mask;
      regitr_t *itr;
@@ -123,6 +124,8 @@ static void destroy_chain(args_t *args)
      free(chain->block_lengths);
      free(chain);
      chain = NULL;
+    free(args->chr);
+    args->chr = NULL;
  }
  
  static void print_chain(args_t *args)
@@ -164,7 +167,7 @@ static void print_chain(args_t *args)
          score += chain->block_lengths[n];
      }
      score += last_block_size;
-    fprintf(args->fp_chain, "chain %d %s %d + %d %d %s %d + %d %d %d\n", score, bcf_hdr_id2name(args->hdr,args->rid), ref_end_pos, chain->ori_pos, ref_end_pos, bcf_hdr_id2name(args->hdr,args->rid), alt_end_pos, chain->ori_pos, alt_end_pos, ++args->chain_id);
+    fprintf(args->fp_chain, "chain %d %s %d + %d %d %s %d + %d %d %d\n", score, args->chr, ref_end_pos, chain->ori_pos, ref_end_pos, args->chr, alt_end_pos, chain->ori_pos, alt_end_pos, ++args->chain_id);
      for (n=0; n<chain->num; n++) {
          fprintf(args->fp_chain, "%d %d %d\n", chain->block_lengths[n], chain->ref_gaps[n], chain->alt_gaps[n]);
      }
@@ -250,6 +253,7 @@ static void destroy_data(args_t *args)
          if ( args->vcf_buf[i] ) bcf_destroy1(args->vcf_buf[i]);
      free(args->vcf_buf);
      free(args->fa_buf.s);
+    free(args->chr);
      if ( args->mask ) regidx_destroy(args->mask);
      if ( args->itr ) regitr_destroy(args->itr);
      if ( args->chain_fname )
@@ -278,6 +282,7 @@ static void init_region(args_t *args, char *line)
              else to--;
          }
      }
+    args->chr = strdup(line);
      args->rid = bcf_hdr_name2id(args->hdr,line);
      if ( args->rid<0 ) fprintf(bcftools_stderr,"Warning: Sequence \"%s\" not in %s\n", line,args->fname);
      args->fa_buf.l = 0;
@@ -382,7 +387,7 @@ static void apply_variant(args_t *args, bcf1_t *rec)
          if ( regidx_overlap(args->mask, chr,start,end,NULL) ) return;
      }
  
-    int i, ialt = 1;
+    int i, ialt = 1;    // the alternate allele
      if ( args->isample >= 0 )
      {
          bcf_unpack(rec, BCF_UN_FMT);
@@ -419,6 +424,7 @@ static void apply_variant(args_t *args, bcf1_t *rec)
              {
                  char ial = rec->d.allele[ialt][0];
                  char jal = rec->d.allele[jalt][0];
+                if ( !ialt ) ialt = jalt;   // only ialt is used, make sure 0/1 is not ignored
                  rec->d.allele[ialt][0] = gt2iupac(ial,jal);
              }
          }
@@ -567,11 +573,10 @@ static void apply_variant(args_t *args, bcf1_t *rec)
  
  static void mask_region(args_t *args, char *seq, int len)
  {
-    char *chr = (char*)bcf_hdr_id2name(args->hdr,args->rid);
      int start = args->fa_src_pos - len;
      int end   = args->fa_src_pos;
  
-    if ( !regidx_overlap(args->mask, chr,start,end, args->itr) ) return;
+    if ( !regidx_overlap(args->mask, args->chr,start,end, args->itr) ) return;
  
      int idx_start, idx_end, i;
      while ( regitr_overlap(args->itr) )
diff --git a/bcftools/convert.c b/bcftools/convert.c

index 05dce01513ed6eab24995588419e3f340106d65a..d7ed0abc10bfc63685903d4f20e5d8b6af4753ca 100644 (file)
--- a/bcftools/convert.c
+++ b/bcftools/convert.c
@@ -1,6 +1,6 @@
  /*  convert.c -- functions for converting between VCF/BCF and related formats.
  
-    Copyright (C) 2013-2017 Genome Research Ltd.
+    Copyright (C) 2013-2018 Genome Research Ltd.
  
      Author: Petr Danecek <pd3@sanger.ac.uk>
  
@@ -92,6 +92,7 @@ struct _convert_t
      int ndat;
      char *undef_info_tag;
      int allow_undef_tags;
+    uint8_t **subset_samples;
  };
  
  typedef struct
@@ -174,6 +175,24 @@ static inline int32_t bcf_array_ivalue(void *bcf_array, int type, int idx)
      }
      return ((int32_t*)bcf_array)[idx];
  }
+static inline void _copy_field(char *src, uint32_t len, int idx, kstring_t *str)
+{
+    int n = 0, ibeg = 0;
+    while ( src[ibeg] && ibeg<len && n < idx )
+    {
+        if ( src[ibeg]==',' ) n++;
+        ibeg++;
+    }
+    if ( ibeg==len ) { kputc('.', str); return; }
+
+    int iend = ibeg;
+    while ( src[iend] && src[iend]!=',' && iend<len ) iend++;
+
+    if ( iend>ibeg )
+        kputsn(src+ibeg, iend-ibeg, str);
+    else
+        kputc('.', str);
+}
  static void process_info(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str)
  {
      if ( fmt->id<0 )
@@ -232,6 +251,7 @@ static void process_info(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isamp
              case BCF_BT_INT16: BRANCH(int16_t, val==bcf_int16_missing, val==bcf_int16_vector_end, kputw(val, str)); break;
              case BCF_BT_INT32: BRANCH(int32_t, val==bcf_int32_missing, val==bcf_int32_vector_end, kputw(val, str)); break;
              case BCF_BT_FLOAT: BRANCH(float,   bcf_float_is_missing(val), bcf_float_is_vector_end(val), kputd(val, str)); break;
+            case BCF_BT_CHAR:  _copy_field((char*)info->vptr, info->vptr_len, fmt->subscript, str); break;
              default: fprintf(stderr,"todo: type %d\n", info->type); exit(1); break;
          }
          #undef BRANCH
@@ -288,6 +308,8 @@ static void process_format(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isa
              else
                  kputw(ival, str);
          }
+        else if ( fmt->fmt->type == BCF_BT_CHAR )
+            _copy_field((char*)(fmt->fmt->p + isample*fmt->fmt->size), fmt->fmt->size, fmt->subscript, str);
          else error("TODO: %s:%d .. fmt->type=%d\n", __FILE__,__LINE__, fmt->fmt->type);
      }
      else
@@ -1312,6 +1334,9 @@ int convert_line(convert_t *convert, bcf1_t *line, kstring_t *str)
              }
              for (js=0; js<convert->nsamples; js++)
              {
+                // Skip samples when filtering was requested
+                if ( *convert->subset_samples && !(*convert->subset_samples)[js] ) continue;
+
                  // Here comes a hack designed for TBCSQ. When running on large files,
                  // such as 1000GP, there are too many empty fields in the output and
                  // it's very very slow. Therefore in case the handler does not add
@@ -1362,6 +1387,9 @@ int convert_set_option(convert_t *convert, enum convert_option opt, ...)
          case allow_undef_tags:
              convert->allow_undef_tags = va_arg(args, int);
              break;
+        case subset_samples:
+            convert->subset_samples = va_arg(args, uint8_t**);
+            break;
          default:
              ret = -1;
      }
diff --git a/bcftools/convert.c.pysam.c b/bcftools/convert.c.pysam.c

index db7b6c1ec638e70e442a1196652580d9b7b36710..66373bcda447b2b7b45dd8201eac331ddfbf651e 100644 (file)
--- a/bcftools/convert.c.pysam.c
+++ b/bcftools/convert.c.pysam.c
@@ -2,7 +2,7 @@
  
  /*  convert.c -- functions for converting between VCF/BCF and related formats.
  
-    Copyright (C) 2013-2017 Genome Research Ltd.
+    Copyright (C) 2013-2018 Genome Research Ltd.
  
      Author: Petr Danecek <pd3@sanger.ac.uk>
  
@@ -94,6 +94,7 @@ struct _convert_t
      int ndat;
      char *undef_info_tag;
      int allow_undef_tags;
+    uint8_t **subset_samples;
  };
  
  typedef struct
@@ -176,6 +177,24 @@ static inline int32_t bcf_array_ivalue(void *bcf_array, int type, int idx)
      }
      return ((int32_t*)bcf_array)[idx];
  }
+static inline void _copy_field(char *src, uint32_t len, int idx, kstring_t *str)
+{
+    int n = 0, ibeg = 0;
+    while ( src[ibeg] && ibeg<len && n < idx )
+    {
+        if ( src[ibeg]==',' ) n++;
+        ibeg++;
+    }
+    if ( ibeg==len ) { kputc('.', str); return; }
+
+    int iend = ibeg;
+    while ( src[iend] && src[iend]!=',' && iend<len ) iend++;
+
+    if ( iend>ibeg )
+        kputsn(src+ibeg, iend-ibeg, str);
+    else
+        kputc('.', str);
+}
  static void process_info(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str)
  {
      if ( fmt->id<0 )
@@ -234,6 +253,7 @@ static void process_info(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isamp
              case BCF_BT_INT16: BRANCH(int16_t, val==bcf_int16_missing, val==bcf_int16_vector_end, kputw(val, str)); break;
              case BCF_BT_INT32: BRANCH(int32_t, val==bcf_int32_missing, val==bcf_int32_vector_end, kputw(val, str)); break;
              case BCF_BT_FLOAT: BRANCH(float,   bcf_float_is_missing(val), bcf_float_is_vector_end(val), kputd(val, str)); break;
+            case BCF_BT_CHAR:  _copy_field((char*)info->vptr, info->vptr_len, fmt->subscript, str); break;
              default: fprintf(bcftools_stderr,"todo: type %d\n", info->type); exit(1); break;
          }
          #undef BRANCH
@@ -290,6 +310,8 @@ static void process_format(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isa
              else
                  kputw(ival, str);
          }
+        else if ( fmt->fmt->type == BCF_BT_CHAR )
+            _copy_field((char*)(fmt->fmt->p + isample*fmt->fmt->size), fmt->fmt->size, fmt->subscript, str);
          else error("TODO: %s:%d .. fmt->type=%d\n", __FILE__,__LINE__, fmt->fmt->type);
      }
      else
@@ -1314,6 +1336,9 @@ int convert_line(convert_t *convert, bcf1_t *line, kstring_t *str)
              }
              for (js=0; js<convert->nsamples; js++)
              {
+                // Skip samples when filtering was requested
+                if ( *convert->subset_samples && !(*convert->subset_samples)[js] ) continue;
+
                  // Here comes a hack designed for TBCSQ. When running on large files,
                  // such as 1000GP, there are too many empty fields in the output and
                  // it's very very slow. Therefore in case the handler does not add
@@ -1364,6 +1389,9 @@ int convert_set_option(convert_t *convert, enum convert_option opt, ...)
          case allow_undef_tags:
              convert->allow_undef_tags = va_arg(args, int);
              break;
+        case subset_samples:
+            convert->subset_samples = va_arg(args, uint8_t**);
+            break;
          default:
              ret = -1;
      }
diff --git a/bcftools/convert.h b/bcftools/convert.h

index 37123388d22a1cfaabaec91ae7958147faa460ff..11e892de3f99677f6f622fe5c16ebe947003d7a5 100644 (file)
--- a/bcftools/convert.h
+++ b/bcftools/convert.h
@@ -30,7 +30,8 @@ THE SOFTWARE.  */
  typedef struct _convert_t convert_t;
  enum convert_option
  {
-    allow_undef_tags
+    allow_undef_tags,
+    subset_samples,
  };
  
  convert_t *convert_init(bcf_hdr_t *hdr, int *samples, int nsamples, const char *str);
diff --git a/bcftools/csq.c b/bcftools/csq.c

index 94ac442f20f1608116a95dff836f8a7ad18b04c6..b36ebc32d98ba13c953f1d0b4fd813b00931d645 100644 (file)
--- a/bcftools/csq.c
+++ b/bcftools/csq.c
@@ -1,6 +1,6 @@
  /* The MIT License
  
-   Copyright (c) 2016 Genome Research Ltd.
+   Copyright (c) 2016-2018 Genome Research Ltd.
  
     Author: Petr Danecek <pd3@sanger.ac.uk>
     
@@ -726,7 +726,7 @@ static inline uint32_t gff_id_parse(id_tbl_t *tbl, const char *line, const char
          id = tbl->nstr++;
          hts_expand(char*, tbl->nstr, tbl->mstr, tbl->str);
          tbl->str[id] = strdup(ss);
-        int ret = khash_str2int_set(tbl->str2id, tbl->str[id], id);
+        khash_str2int_set(tbl->str2id, tbl->str[id], id);
      }
      *se = tmp;
  
@@ -2713,6 +2713,7 @@ void hap_add_csq(args_t *args, hap_t *hap, hap_node_t *node, int tlen, int ibeg,
      }
  }
  
+
  void hap_finalize(args_t *args, hap_t *hap)
  {
      tscript_t *tr = hap->tr;
@@ -2784,7 +2785,10 @@ void hap_finalize(args_t *args, hap_t *hap)
                  }
                  dlen += hap->stack[i].node->dlen;
                  if ( hap->stack[i].node->dlen ) indel = 1;
-                if ( i<istack )
+
+                // This condition extends compound variants. Note that s/s/s sites are forced out to always break
+                // a compound block. See ENST00000271583/splice-acceptor.vcf for motivation.
+                if ( i<istack && hap->stack[i+1].node->type != HAP_SSS )
                  {
                      if ( dlen%3 )   // frameshift
                      {
@@ -3388,7 +3392,7 @@ int test_cds(args_t *args, bcf1_t *rec)
              int32_t *gt = args->gt_arr + args->smpl->idx[ismpl]*ngts;
              if ( gt[0]==bcf_gt_missing ) continue;
  
-            if ( ngts>1 && gt[0]!=gt[1] && gt[1]!=bcf_gt_missing && gt[1]!=bcf_int32_vector_end )
+            if ( ngts>1 && gt[1]!=bcf_gt_missing && gt[1]!=bcf_int32_vector_end && bcf_gt_allele(gt[0])!=bcf_gt_allele(gt[1]) )
              {
                  if ( args->phase==PHASE_MERGE )
                  {
@@ -3397,7 +3401,7 @@ int test_cds(args_t *args, bcf1_t *rec)
                  if ( !bcf_gt_is_phased(gt[0]) && !bcf_gt_is_phased(gt[1]) )
                  {
                      if ( args->phase==PHASE_REQUIRE )
-                        error("Unphased genotype at %s:%d, sample %s. See the --phase option.\n", chr,rec->pos+1,args->hdr->samples[args->smpl->idx[ismpl]]);
+                        error("Unphased heterozygous genotype at %s:%d, sample %s. See the --phase option.\n", chr,rec->pos+1,args->hdr->samples[args->smpl->idx[ismpl]]);
                      if ( args->phase==PHASE_SKIP )
                          continue;
                      if ( args->phase==PHASE_NON_REF )
@@ -3648,6 +3652,7 @@ void process(args_t *args, bcf1_t **rec_ptr)
      int call_csq = 1;
      if ( !rec->n_allele ) call_csq = 0;   // no alternate allele
      else if ( rec->n_allele==2 && (rec->d.allele[1][0]=='<' || rec->d.allele[1][0]=='*') ) call_csq = 0;     // gVCF, no alt allele
+    else if ( rec->d.allele[1][0]=='<' && rec->d.allele[1][0]!='*') call_csq = 0;                            // a symbolic allele, not ready for CNVs etc
      else if ( args->filter )
      {
          call_csq = filter_test(args->filter, rec, NULL);
@@ -3695,12 +3700,12 @@ static const char *usage(void)
          "   -c, --custom-tag <string>       use this tag instead of the default BCSQ\n"
          "   -l, --local-csq                 localized predictions, consider only one VCF record at a time\n"
          "   -n, --ncsq <int>                maximum number of consequences to consider per site [16]\n"
-        "   -p, --phase <a|m|r|R|s>         how to construct haplotypes and how to deal with unphased data: [r]\n"
+        "   -p, --phase <a|m|r|R|s>         how to handle unphased heterozygous genotypes: [r]\n"
          "                                     a: take GTs as is, create haplotypes regardless of phase (0/1 -> 0|1)\n"
          "                                     m: merge *all* GTs into a single haplotype (0/1 -> 1, 1/2 -> 1)\n"
          "                                     r: require phased GTs, throw an error on unphased het GTs\n"
          "                                     R: create non-reference haplotypes if possible (0/1 -> 1|1, 1/2 -> 1|2)\n"
-        "                                     s: skip unphased GTs\n"
+        "                                     s: skip unphased hets\n"
          "Options:\n"
          "   -e, --exclude <expr>            exclude sites for which the expression is true\n"
          "   -i, --include <expr>            select sites for which the expression is true\n"
diff --git a/bcftools/csq.c.pysam.c b/bcftools/csq.c.pysam.c

index 978bd59a3d6be18904f60c6c47bdd8790a39504f..e4afa8ea3e66ebc898b2f72210cd27d76fe6f775 100644 (file)
--- a/bcftools/csq.c.pysam.c
+++ b/bcftools/csq.c.pysam.c
@@ -2,7 +2,7 @@
  
  /* The MIT License
  
-   Copyright (c) 2016 Genome Research Ltd.
+   Copyright (c) 2016-2018 Genome Research Ltd.
  
     Author: Petr Danecek <pd3@sanger.ac.uk>
     
@@ -728,7 +728,7 @@ static inline uint32_t gff_id_parse(id_tbl_t *tbl, const char *line, const char
          id = tbl->nstr++;
          hts_expand(char*, tbl->nstr, tbl->mstr, tbl->str);
          tbl->str[id] = strdup(ss);
-        int ret = khash_str2int_set(tbl->str2id, tbl->str[id], id);
+        khash_str2int_set(tbl->str2id, tbl->str[id], id);
      }
      *se = tmp;
  
@@ -2715,6 +2715,7 @@ void hap_add_csq(args_t *args, hap_t *hap, hap_node_t *node, int tlen, int ibeg,
      }
  }
  
+
  void hap_finalize(args_t *args, hap_t *hap)
  {
      tscript_t *tr = hap->tr;
@@ -2786,7 +2787,10 @@ void hap_finalize(args_t *args, hap_t *hap)
                  }
                  dlen += hap->stack[i].node->dlen;
                  if ( hap->stack[i].node->dlen ) indel = 1;
-                if ( i<istack )
+
+                // This condition extends compound variants. Note that s/s/s sites are forced out to always break
+                // a compound block. See ENST00000271583/splice-acceptor.vcf for motivation.
+                if ( i<istack && hap->stack[i+1].node->type != HAP_SSS )
                  {
                      if ( dlen%3 )   // frameshift
                      {
@@ -3390,7 +3394,7 @@ int test_cds(args_t *args, bcf1_t *rec)
              int32_t *gt = args->gt_arr + args->smpl->idx[ismpl]*ngts;
              if ( gt[0]==bcf_gt_missing ) continue;
  
-            if ( ngts>1 && gt[0]!=gt[1] && gt[1]!=bcf_gt_missing && gt[1]!=bcf_int32_vector_end )
+            if ( ngts>1 && gt[1]!=bcf_gt_missing && gt[1]!=bcf_int32_vector_end && bcf_gt_allele(gt[0])!=bcf_gt_allele(gt[1]) )
              {
                  if ( args->phase==PHASE_MERGE )
                  {
@@ -3399,7 +3403,7 @@ int test_cds(args_t *args, bcf1_t *rec)
                  if ( !bcf_gt_is_phased(gt[0]) && !bcf_gt_is_phased(gt[1]) )
                  {
                      if ( args->phase==PHASE_REQUIRE )
-                        error("Unphased genotype at %s:%d, sample %s. See the --phase option.\n", chr,rec->pos+1,args->hdr->samples[args->smpl->idx[ismpl]]);
+                        error("Unphased heterozygous genotype at %s:%d, sample %s. See the --phase option.\n", chr,rec->pos+1,args->hdr->samples[args->smpl->idx[ismpl]]);
                      if ( args->phase==PHASE_SKIP )
                          continue;
                      if ( args->phase==PHASE_NON_REF )
@@ -3650,6 +3654,7 @@ void process(args_t *args, bcf1_t **rec_ptr)
      int call_csq = 1;
      if ( !rec->n_allele ) call_csq = 0;   // no alternate allele
      else if ( rec->n_allele==2 && (rec->d.allele[1][0]=='<' || rec->d.allele[1][0]=='*') ) call_csq = 0;     // gVCF, no alt allele
+    else if ( rec->d.allele[1][0]=='<' && rec->d.allele[1][0]!='*') call_csq = 0;                            // a symbolic allele, not ready for CNVs etc
      else if ( args->filter )
      {
          call_csq = filter_test(args->filter, rec, NULL);
@@ -3697,12 +3702,12 @@ static const char *usage(void)
          "   -c, --custom-tag <string>       use this tag instead of the default BCSQ\n"
          "   -l, --local-csq                 localized predictions, consider only one VCF record at a time\n"
          "   -n, --ncsq <int>                maximum number of consequences to consider per site [16]\n"
-        "   -p, --phase <a|m|r|R|s>         how to construct haplotypes and how to deal with unphased data: [r]\n"
+        "   -p, --phase <a|m|r|R|s>         how to handle unphased heterozygous genotypes: [r]\n"
          "                                     a: take GTs as is, create haplotypes regardless of phase (0/1 -> 0|1)\n"
          "                                     m: merge *all* GTs into a single haplotype (0/1 -> 1, 1/2 -> 1)\n"
          "                                     r: require phased GTs, throw an error on unphased het GTs\n"
          "                                     R: create non-reference haplotypes if possible (0/1 -> 1|1, 1/2 -> 1|2)\n"
-        "                                     s: skip unphased GTs\n"
+        "                                     s: skip unphased hets\n"
          "Options:\n"
          "   -e, --exclude <expr>            exclude sites for which the expression is true\n"
          "   -i, --include <expr>            select sites for which the expression is true\n"
diff --git a/bcftools/filter.c b/bcftools/filter.c

index 3dc91a73d1d6454abe81dc82bf4ec4049857b599..53e8dc7747678de89381e6a6364f49b5b85bac07 100644 (file)
--- a/bcftools/filter.c
+++ b/bcftools/filter.c
@@ -1,6 +1,6 @@
  /*  filter.c -- filter expressions.
  
-    Copyright (C) 2013-2015 Genome Research Ltd.
+    Copyright (C) 2013-2018 Genome Research Ltd.
  
      Author: Petr Danecek <pd3@sanger.ac.uk>
  
@@ -67,11 +67,15 @@ typedef struct _token_t
      char *tag;          // for debugging and printout only, VCF tag name
      double threshold;   // filtering threshold
      int hdr_id, type;   // BCF header lookup ID and one of BCF_HT_* types
-    int idx;            // 0-based index to VCF vectors, -1: not a vector, 
+    int idx;            // 0-based index to VCF vectors,
                          //  -2: list (e.g. [0,1,2] or [1..3] or [1..] or any field[*], which is equivalent to [0..])
-    int *idxs, nidxs;   // set indexes to 0 to exclude, to 1 to include, and last element negative if unlimited
+    int *idxs;          // set indexes to 0 to exclude, to 1 to include, and last element negative if unlimited
+    int nidxs, nuidxs;  // size of idxs array and the number of elements set to 1
+    uint8_t *usmpl;     // bitmask of used samples as set by idx
+    int nsamples;       // number of samples for format fields, 0 for info and other fields
      void (*setter)(filter_t *, bcf1_t *, struct _token_t *);
-    int (*comparator)(struct _token_t *, struct _token_t *, int op_type, bcf1_t *);
+    int (*func)(filter_t *, bcf1_t *, struct _token_t *rtok, struct _token_t **stack, int nstack);
+    void (*comparator)(struct _token_t *, struct _token_t *, struct _token_t *rtok, bcf1_t *);
      void *hash;         // test presence of str value in the hash via comparator
      regex_t *regex;     // precompiled regex for string comparison
  
@@ -81,9 +85,8 @@ typedef struct _token_t
      int is_str, is_missing; // is_missing is set only for constants, variables are controled via nvalues
      int pass_site;          // -1 not applicable, 0 fails, >0 pass
      uint8_t *pass_samples;  // status of individual samples
-    int nsamples;           // number of samples
      int nvalues, mvalues;   // number of used values: n=0 for missing values, n=1 for scalars, for strings n=str_value.l
-    int nstr1;              // per-sample string length, set only with str_value.l>0 && nsamples>1
+    int nval1;              // number of per-sample fields or string length
  }
  token_t;
  
@@ -126,10 +129,11 @@ struct _filter_t
  #define TOK_ABS     23
  #define TOK_LEN     24
  #define TOK_FUNC    25
+#define TOK_CNT     26
  
-//                      0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
-//                        ( ) [ < = > ] ! | &  +  -  *  /  M  m  a  A  O  ~  ^  S  .  l
-static int op_prec[] = {0,1,1,5,5,5,5,5,5,2,3, 6, 6, 7, 7, 8, 8, 8, 3, 2, 5, 5, 8, 8, 8};
+//                      0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26
+//                        ( ) [ < = > ] ! | &  +  -  *  /  M  m  a  A  O  ~  ^  S  .  l  
+static int op_prec[] = {0,1,1,5,5,5,5,5,5,2,3, 6, 6, 7, 7, 8, 8, 8, 3, 2, 5, 5, 8, 8, 8, 8, 8};
  #define TOKEN_STRING "x()[<=>]!|&+-*/MmaAO~^f"
  
  static int filters_next_token(char **str, int *len)
@@ -156,6 +160,7 @@ static int filters_next_token(char **str, int *len)
      if ( !strncasecmp(tmp,"AVG(",4) ) { (*str) += 3; return TOK_AVG; }
      if ( !strncasecmp(tmp,"SUM(",4) ) { (*str) += 3; return TOK_SUM; }
      if ( !strncasecmp(tmp,"ABS(",4) ) { (*str) += 3; return TOK_ABS; }
+    if ( !strncasecmp(tmp,"COUNT(",4) ) { (*str) += 5; return TOK_CNT; }
      if ( !strncasecmp(tmp,"STRLEN(",7) ) { (*str) += 6; return TOK_LEN; }
      if ( !strncasecmp(tmp,"%MAX(",5) ) { (*str) += 4; return TOK_MAX; } // for backward compatibility
      if ( !strncasecmp(tmp,"%MIN(",5) ) { (*str) += 4; return TOK_MIN; } // for backward compatibility
@@ -249,12 +254,10 @@ static void filters_set_qual(filter_t *flt, bcf1_t *line, token_t *tok)
  {
      float *ptr = &line->qual;
      if ( bcf_float_is_missing(*ptr) )
-        tok->nvalues = 0;
+        bcf_double_set_missing(tok->values[0]);
      else
-    {
          tok->values[0] = (double)line->qual;
-        tok->nvalues = 1;
-    }
+    tok->nvalues  = 1;
  }
  static void filters_set_type(filter_t *flt, bcf1_t *line, token_t *tok)
  {
@@ -271,7 +274,7 @@ static void filters_set_info(filter_t *flt, bcf1_t *line, token_t *tok)
          if ( line->d.info[i].key == tok->hdr_id ) break;
  
      if ( i==line->n_info )
-        tok->nvalues = 0;
+        tok->nvalues = tok->str_value.l = 0;
      else if ( line->d.info[i].type==BCF_BT_CHAR )
      {
          int n = line->d.info[i].len;
@@ -308,40 +311,51 @@ static void filters_set_info(filter_t *flt, bcf1_t *line, token_t *tok)
          }
      }
  }
-static int filters_cmp_bit_and(token_t *atok, token_t *btok, int op_type, bcf1_t *line)
+static void filters_cmp_bit_and(token_t *atok, token_t *btok, token_t *rtok, bcf1_t *line)
  {
      int a = (int)(atok->nvalues?atok->values[0]:atok->threshold);
      int b = (int)(btok->nvalues?btok->values[0]:btok->threshold);
-    if ( op_type==TOK_LIKE ) return a&b ? 1 : 0;
-    return a&b ? 0 : 1;
+    if ( rtok->tok_type==TOK_LIKE )
+        rtok->pass_site = a&b ? 1 : 0;
+    else
+        rtok->pass_site = a&b ? 0 : 1;
  }
-static int filters_cmp_filter(token_t *atok, token_t *btok, int op_type, bcf1_t *line)
+static void filters_cmp_filter(token_t *atok, token_t *btok, token_t *rtok, bcf1_t *line)
  {
      int i;
-    if ( op_type==TOK_NE )  // AND logic: none of the filters can match
+    if ( rtok->tok_type==TOK_NE )  // AND logic: none of the filters can match
      {
          if ( !line->d.n_flt )
          {
-            if ( atok->hdr_id==-1 ) return 0;   // missing value
-            return 1; // no filter present, eval to true
+            if ( atok->hdr_id==-1 ) return;   // missing value
+            rtok->pass_site = 1;
+            return; // no filter present, eval to true
          }
          for (i=0; i<line->d.n_flt; i++)
-            if ( atok->hdr_id==line->d.flt[i] ) return 0;
-        return 1;
+            if ( atok->hdr_id==line->d.flt[i] ) return;
+        rtok->pass_site = 1;
+        return;
      }
-    // TOK_EQ with OR logic: at least one of the filters must match
-    if ( !line->d.n_flt )
+    else if ( rtok->tok_type==TOK_EQ ) // OR logic: at least one of the filters must match
      {
-        if ( atok->hdr_id==-1 ) return 1;
-        return 0; // no filter present, eval to false
+        if ( !line->d.n_flt )
+        {
+            if ( atok->hdr_id==-1 ) { rtok->pass_site = 1; return; }
+            return; // no filter present, eval to false
+        }
+        for (i=0; i<line->d.n_flt; i++)
+            if ( atok->hdr_id==line->d.flt[i] ) { rtok->pass_site = 1; return; }
+        return;
      }
-    for (i=0; i<line->d.n_flt; i++)
-        if ( atok->hdr_id==line->d.flt[i] ) return 1;
-    return 0;
+    else 
+        error("Only == and != operators are supported for FILTER\n");
+    return;
  }
-static int filters_cmp_id(token_t *atok, token_t *btok, int op_type, bcf1_t *line)
+static void filters_cmp_id(token_t *atok, token_t *btok, token_t *rtok, bcf1_t *line)
  {
      // multiple IDs not supported yet (easy to add though)
+    if ( rtok->tok_type!=TOK_EQ && rtok->tok_type!=TOK_NE )
+        error("Only == and != operators are supported for ID\n");
  
      if ( btok->hash )
      {
@@ -350,12 +364,15 @@ static int filters_cmp_id(token_t *atok, token_t *btok, int op_type, bcf1_t *lin
      if ( atok->hash )
      {
          int ret = khash_str2int_has_key(atok->hash, line->d.id);
-        if ( op_type==TOK_EQ ) return ret;
-        return ret ? 0 : 1;
+        if ( rtok->tok_type==TOK_NE ) ret = ret ? 0 : 1;
+        rtok->pass_site = ret;
+        return;
      }
  
-    if ( op_type==TOK_EQ ) return strcmp(btok->str_value.s,line->d.id) ? 0 : 1;
-    return strcmp(btok->str_value.s,line->d.id) ? 1 : 0;
+    if ( rtok->tok_type==TOK_EQ ) 
+        rtok->pass_site = strcmp(btok->str_value.s,line->d.id) ? 0 : 1;
+    else
+        rtok->pass_site = strcmp(btok->str_value.s,line->d.id) ? 1 : 0;
  }
  
  /**
@@ -431,7 +448,7 @@ static void filters_set_info_int(filter_t *flt, bcf1_t *line, token_t *tok)
      }
      else
      {
-        int64_t value;
+        int64_t value = 0;
          if ( bcf_get_info_value(line,tok->hdr_id,tok->idx,&value) <= 0 )
              tok->nvalues = 0;
          else
@@ -510,14 +527,13 @@ static void filters_set_info_string(filter_t *flt, bcf1_t *line, token_t *tok)
      {
          flt->tmps.l = 0;
          ks_resize(&flt->tmps, n);
-        int i, end = tok->idxs[tok->nidxs-1] < 0 ? n - 1 : tok->nidxs - 1;
-        if ( end >= n ) end = n - 1;
+        int i, iend = tok->idxs[tok->nidxs-1] < 0 ? n - 1 : tok->nidxs - 1;
+        if ( iend >= n ) iend = n - 1;
          char *beg = tok->str_value.s, *dst = flt->tmps.s;
-        for (i=0; i<=end; i++)
+        for (i=0; i<=iend; i++)
          {
              char *end = beg;
              while ( *end && *end!=',' ) end++;
-
              if ( i>=tok->nidxs || tok->idxs[i] )
              {
                  memcpy(dst, beg, end - beg);
@@ -525,7 +541,6 @@ static void filters_set_info_string(filter_t *flt, bcf1_t *line, token_t *tok)
                  dst[0] = ',';
                  dst++;
              }
-
              beg = end+1;
          }
          dst[0] = 0;
@@ -549,190 +564,161 @@ static void filters_set_info_flag(filter_t *flt, bcf1_t *line, token_t *tok)
  
  static void filters_set_format_int(filter_t *flt, bcf1_t *line, token_t *tok)
  {
-    int i;
-    if ( (tok->nvalues=bcf_get_format_int32(flt->hdr,line,tok->tag,&flt->tmpi,&flt->mtmpi))<0 )
+    if ( line->n_sample != tok->nsamples )
+        error("Incorrect number of FORMAT fields at %s:%d .. %s, %d vs %d\n", bcf_seqname(flt->hdr,line),line->pos+1,tok->tag,line->n_sample,tok->nsamples);
+
+    int nvals;
+    if ( (nvals=bcf_get_format_int32(flt->hdr,line,tok->tag,&flt->tmpi,&flt->mtmpi))<0 )
      {
-        tok->nvalues = tok->nsamples = 0;
+        tok->nvalues = 0;
          return;
      }
-    if ( tok->idx >= -1 )    // scalar or vector index
+    int i, nsrc1 = nvals / tok->nsamples;
+    tok->nval1 = tok->idx >= 0 ? 1 : (tok->nuidxs ? tok->nuidxs : nsrc1);
+    tok->nvalues = tok->nval1*tok->nsamples;
+    hts_expand(double, tok->nvalues, tok->mvalues, tok->values);
+
+    if ( tok->idx >= 0 )    // scalar or vector index
      {
-        hts_expand(double,flt->nsamples,tok->mvalues,tok->values);
-        int nvals = tok->nvalues / line->n_sample;
-        int idx = tok->idx >= 0 ? tok->idx : 0;
-        int is_missing = 1;
-        int32_t *ptr = flt->tmpi;
-        for (i=0; i<line->n_sample; i++)
+        for (i=0; i<tok->nsamples; i++)
          {
-            if ( ptr[idx]==bcf_int32_missing || ptr[idx]==bcf_int32_vector_end ) 
+            if ( !tok->usmpl[i] ) continue;
+            int32_t *ptr = flt->tmpi + i*nsrc1;
+            if ( tok->idx>=nsrc1 || ptr[tok->idx]==bcf_int32_missing || ptr[tok->idx]==bcf_int32_vector_end )
                  bcf_double_set_missing(tok->values[i]);
              else
-            {
-                tok->values[i] = ptr[idx];
-                is_missing = 0;
-            }
-            ptr += nvals;
+                tok->values[i] = ptr[tok->idx];
          }
-        if ( is_missing ) tok->nvalues = 0;
-        else tok->nvalues = line->n_sample;
-        tok->nsamples = tok->nvalues;
-        return;
      }
-    if ( tok->idx == -2 )
+    else
      {
-        hts_expand(double,tok->nvalues,tok->mvalues,tok->values);
-        int nvals = tok->nvalues / line->n_sample;
-        int idx = tok->idx >= 0 ? tok->idx : 0;
-        int is_missing = 1;
-        int k, j = 0, end = tok->idxs[tok->nidxs-1] < 0 ? nvals - 1 : tok->nidxs - 1;
-        if ( end >= nvals ) end = nvals - 1;
-        int32_t *ptr = flt->tmpi;
-        for (i=0; i<line->n_sample; i++)
+        int kend = tok->idxs[tok->nidxs-1] < 0 ? tok->nval1 : tok->nidxs;
+        for (i=0; i<tok->nsamples; i++)
          {
-            for (k=0; k<=end; k++)
-                if ( k>=tok->nidxs || tok->idxs[k] )
-                {
-                    if ( ptr[k]==bcf_int32_missing || ptr[k]==bcf_int32_vector_end )
-                        bcf_double_set_missing(tok->values[j]);
-                    else
-                    {
-                        tok->values[j] = ptr[k];
-                        is_missing = 0;
-                    }
-                    j++;
-                }
-            ptr += nvals;
-        }
-        if ( is_missing ) tok->nvalues = tok->nsamples = 0;
-        else
-        {
-            tok->nsamples = line->n_sample;
-            tok->nvalues = j;
+            if ( !tok->usmpl[i] ) continue;
+            int32_t *src = flt->tmpi + i*nsrc1;
+            double  *dst = tok->values + i*tok->nval1;
+            int k, j = 0;
+            for (k=0; k<kend; k++)
+            {
+                if ( k<tok->nidxs && !tok->idxs[k] ) continue;
+                if ( src[k]==bcf_int32_missing || src[k]==bcf_int32_vector_end )
+                    bcf_double_set_missing(dst[j]);
+                else
+                    dst[j] = src[k];
+                j++;
+            }
+            while (j < tok->nval1)
+            {
+                bcf_double_set_missing(dst[j]);
+                j++;
+            }
          }
-        return;
      }
  }
  static void filters_set_format_float(filter_t *flt, bcf1_t *line, token_t *tok)
  {
-    int i;
-    if ( (tok->nvalues=bcf_get_format_float(flt->hdr,line,tok->tag,&flt->tmpf,&flt->mtmpf))<0 )
+    if ( line->n_sample != tok->nsamples )
+        error("Incorrect number of FORMAT fields at %s:%d .. %s, %d vs %d\n", bcf_seqname(flt->hdr,line),line->pos+1,tok->tag,line->n_sample,tok->nsamples);
+
+    int nvals;
+    if ( (nvals=bcf_get_format_float(flt->hdr,line,tok->tag,&flt->tmpf,&flt->mtmpf))<0 )
      {
-        tok->nvalues = tok->nsamples = 0;
+        tok->nvalues = 0;
          return;
      }
-    if ( tok->idx >= -1 )    // scalar or vector index
+    int i, nsrc1 = nvals / tok->nsamples;
+    tok->nval1 = tok->idx >= 0 ? 1 : (tok->nuidxs ? tok->nuidxs : nsrc1);
+    tok->nvalues = tok->nval1*tok->nsamples;
+    hts_expand(double, tok->nvalues, tok->mvalues, tok->values);
+
+    if ( tok->idx >= 0 )    // scalar or vector index
      {
-        hts_expand(double,flt->nsamples,tok->mvalues,tok->values);
-        int nvals = tok->nvalues / line->n_sample;
-        int idx = tok->idx >= 0 ? tok->idx : 0;
-        int is_missing = 1;
-        float *ptr = flt->tmpf;
-        for (i=0; i<line->n_sample; i++)
+        for (i=0; i<tok->nsamples; i++)
          {
-            if ( bcf_float_is_missing(ptr[idx]) || bcf_float_is_vector_end(ptr[idx]) )
+            if ( !tok->usmpl[i] ) continue;
+            float *ptr = flt->tmpf + i*nsrc1;
+            if ( tok->idx>=nsrc1 || bcf_float_is_missing(ptr[tok->idx]) || bcf_float_is_vector_end(ptr[tok->idx]) )
                  bcf_double_set_missing(tok->values[i]);
              else
-            {
-                tok->values[i] = ptr[idx];
-                is_missing = 0;
-            }
-            ptr += nvals;
+                tok->values[i] = ptr[tok->idx];
          }
-        if ( is_missing ) tok->nvalues = 0;
-        else tok->nvalues = line->n_sample;
-        tok->nsamples = tok->nvalues;
-        return;
      }
-    if ( tok->idx == -2 )
+    else
      {
-        hts_expand(double,tok->nvalues,tok->mvalues,tok->values);
-        int nvals = tok->nvalues / line->n_sample;
-        int idx = tok->idx >= 0 ? tok->idx : 0;
-        int is_missing = 1;
-        int k, j = 0, end = tok->idxs[tok->nidxs-1] < 0 ? nvals - 1 : tok->nidxs - 1;
-        if ( end >= nvals ) end = nvals - 1;
-        float *ptr = flt->tmpf;
-        for (i=0; i<line->n_sample; i++)
+        int kend = tok->idxs[tok->nidxs-1] < 0 ? tok->nval1 : tok->nidxs;
+        for (i=0; i<tok->nsamples; i++)
          {
-            for (k=0; k<=end; k++)
-                if ( k>=tok->nidxs || tok->idxs[k] )
-                {
-                    if ( bcf_float_is_missing(ptr[k]) || bcf_float_is_vector_end(ptr[k]) )
-                        bcf_double_set_missing(tok->values[j]);
-                    else
-                    {
-                        tok->values[j] = ptr[k];
-                        is_missing = 0;
-                    }
-                    j++;
-                }
-            ptr += nvals;
-        }
-        if ( is_missing ) tok->nvalues = tok->nsamples = 0;
-        else
-        {
-            tok->nsamples = line->n_sample;
-            tok->nvalues = j;
+            if ( !tok->usmpl[i] ) continue;
+            float *src = flt->tmpf + i*nsrc1;
+            double  *dst = tok->values + i*tok->nval1;
+            int k, j = 0;
+            for (k=0; k<kend; k++)
+            {
+                if ( k<tok->nidxs && !tok->idxs[k] ) continue;
+                if ( bcf_float_is_missing(src[k]) || bcf_float_is_vector_end(src[k]) )
+                    bcf_double_set_missing(dst[j]);
+                else
+                    dst[j] = src[k];
+                j++;
+            }
+            while (j < tok->nval1)
+            {
+                bcf_double_set_missing(dst[j]);
+                j++;
+            }
          }
-        return;
      }
  }
  static void filters_set_format_string(filter_t *flt, bcf1_t *line, token_t *tok)
  {
-    tok->str_value.l = tok->nvalues = 0;
-    if ( !line->n_sample ) return;
+    if ( line->n_sample != tok->nsamples )
+        error("Incorrect number of FORMAT fields at %s:%d .. %s, %d vs %d\n", bcf_seqname(flt->hdr,line),line->pos+1,tok->tag,line->n_sample,tok->nsamples);
  
-    int ndim = tok->str_value.m;
+    int i, ndim = tok->str_value.m;
      int nstr = bcf_get_format_char(flt->hdr, line, tok->tag, &tok->str_value.s, &ndim);
      tok->str_value.m = ndim;
+    tok->str_value.l = tok->nvalues = 0;
  
-    if ( nstr<=0 ) return;
-
-    if ( tok->idx == -1 || (tok->idx==-2 && tok->idxs[0]==-1) ) // scalar or keep all values of a vector: TAG[*]
-    {
-        tok->nsamples = line->n_sample;
-        tok->nstr1 = ndim / line->n_sample;
-        tok->nvalues = tok->str_value.l = nstr;
-        return;
-    }
-
-    int nstr1 = nstr / line->n_sample;
+    if ( nstr<0 ) return;
  
-    // vector, one or multiple indices
-    int i;
-    for (i=0; i<line->n_sample; i++)
+    tok->nvalues = tok->str_value.l = nstr;
+    tok->nval1   = nstr / tok->nsamples;
+    for (i=0; i<tok->nsamples; i++)
      {
-        char *dst = tok->str_value.s + i*nstr1, *str = dst;
-        int nval = 0, ibeg = 0;
-        while ( ibeg < nstr1 )
+        if ( !tok->usmpl[i] ) continue;
+        char *src = tok->str_value.s + i*tok->nval1, *dst = src;
+        int ibeg = 0, idx = 0;
+        while ( ibeg < tok->nval1 )
          {
-            int iend = ibeg + 1;
-            while ( iend < nstr1 && str[iend] && str[iend]!=',' ) iend++;
+            int iend = ibeg;
+            while ( iend < tok->nval1 && src[iend] && src[iend]!=',' ) iend++;
  
              int keep = 0;
-            if ( tok->idx >=0 )
-                keep = tok->idx==nval ? 1 : 0;
-            else if ( nval < tok->nidxs )
-                keep = tok->idxs[nval] ? 1 : 0;
+            if ( tok->idx >= 0 )
+            {
+                if ( tok->idx==idx ) keep = 1;
+            }
+            else if ( idx < tok->nidxs )
+            {
+                if ( tok->idxs[idx] != 0 ) keep = 1;
+            }
              else if ( tok->idxs[tok->nidxs-1] < 0 )
                  keep = 1;
  
              if ( keep )
              {
-                if ( ibeg>0 ) memmove(dst, str+ibeg, iend-ibeg+1);
+                if ( ibeg!=0 ) memmove(dst, src+ibeg, iend-ibeg+1);
                  dst += iend - ibeg + 1;
                  if ( tok->idx>=0 ) break;
              }
-            if ( !str[iend] ) break;
+            if ( !src[iend] ) break;
              ibeg = iend + 1;
-            nval++;
+            idx++;
          }
-        if ( dst==str ) { dst[0] = '.'; dst+=2; }
-        if ( dst - str < nstr1 ) memset(dst-1, 0, nstr1 - (dst - str));
+        if ( dst==src ) { dst[0] = '.'; dst+=2; }
+        if ( dst - src < tok->nval1 ) memset(dst-1, 0, tok->nval1 - (dst - src));
      }
-    tok->nvalues = tok->str_value.l = nstr;
-    tok->nstr1 = nstr1;
-    tok->nsamples = line->n_sample;
  }
  static void _filters_set_genotype(filter_t *flt, bcf1_t *line, token_t *tok, int type)
  {
@@ -743,10 +729,10 @@ static void _filters_set_genotype(filter_t *flt, bcf1_t *line, token_t *tok, int
          return;
      }
      
-    int i,j, nsmpl = bcf_hdr_nsamples(flt->hdr), nvals = type==2 ? 3 : 4;
-    if ( tok->str_value.m <= nvals*nsmpl )
+    int i,j, nsmpl = bcf_hdr_nsamples(flt->hdr), nvals1 = type==2 ? 3 : 4;
+    if ( tok->str_value.m <= nvals1*nsmpl )
      {
-        tok->str_value.m = nvals*nsmpl + 1;
+        tok->str_value.m = nvals1*nsmpl + 1;
          tok->str_value.s = (char*)realloc(tok->str_value.s, tok->str_value.m);
      }
  
@@ -768,8 +754,15 @@ static void _filters_set_genotype(filter_t *flt, bcf1_t *line, token_t *tok, int
                      if ( bcf_gt_allele(ial)!=bcf_gt_allele(jal) ) is_het = 1; \
                  } \
              } \
-            char *dst = &tok->str_value.s[nvals*i]; \
-            if ( !j || missing ) dst[0]='.', dst[1]=0; /* ., missing genotype */ \
+            char *dst = &tok->str_value.s[nvals1*i]; \
+            if ( type==4 ) \
+            { \
+                if ( !j || missing ) dst[0]='m', dst[1]='i', dst[2]='s', dst[3] = 0; /* mis, missing genotype */ \
+                else if ( !has_ref ) dst[0]='a', dst[1]='l', dst[2]='t', dst[3] = 0; /* alt, no ref, must have alt allele */ \
+                else if ( !is_het ) dst[0]='r', dst[1]='e', dst[2]='f', dst[3] = 0; /* ref, must be ref-only, no alt alelle */ \
+                else dst[0]='a', dst[1]='l', dst[2]='t', dst[3] = 0; /* alt, is het, has alt allele */ \
+            } \
+            else if ( !j || missing ) dst[0]='.', dst[1]=0; /* ., missing genotype */ \
              else if ( type==3 ) \
              { \
                  if ( j==1 ) dst[0]='h', dst[1]='a', dst[2]='p', dst[3] = 0; /* hap, haploid */ \
@@ -803,31 +796,30 @@ static void _filters_set_genotype(filter_t *flt, bcf1_t *line, token_t *tok, int
          default: error("The GT type is not lineognised: %d at %s:%d\n",fmt->type, bcf_seqname(flt->hdr,line),line->pos+1); break;
      }
  #undef BRANCH_INT
-    tok->nsamples = nsmpl;
-    tok->nvalues = tok->str_value.l = nvals*nsmpl;
+    assert( tok->nsamples == nsmpl );
+    tok->nvalues = tok->str_value.l = nvals1*nsmpl;
      tok->str_value.s[tok->str_value.l] = 0;
-    tok->nstr1 = nvals;
+    tok->nval1 = nvals1;
  }
  static void filters_set_genotype2(filter_t *flt, bcf1_t *line, token_t *tok) { _filters_set_genotype(flt, line, tok, 2); }
  static void filters_set_genotype3(filter_t *flt, bcf1_t *line, token_t *tok) { _filters_set_genotype(flt, line, tok, 3); }
+static void filters_set_genotype4(filter_t *flt, bcf1_t *line, token_t *tok) { _filters_set_genotype(flt, line, tok, 4); }
  
  static void filters_set_genotype_string(filter_t *flt, bcf1_t *line, token_t *tok)
  {
      bcf_fmt_t *fmt = bcf_get_fmt(flt->hdr, line, "GT");
      if ( !fmt )
      {
-        tok->nvalues = tok->nsamples = 0;
+        tok->nvalues = 0;
          return;
      }
-    int i, blen = 4, nsmpl = bcf_hdr_nsamples(flt->hdr);
-    kstring_t str;
+    int i, blen = 4, nsmpl = line->n_sample;
  
  gt_length_too_big:
      tok->str_value.l = 0;
      for (i=0; i<nsmpl; i++)
      {
-        int plen = tok->str_value.l;
-
+        size_t plen = tok->str_value.l;
          bcf_format_gt(fmt, i, &tok->str_value);
          kputc_(0, &tok->str_value);
          if ( tok->str_value.l - plen > blen )
@@ -845,9 +837,9 @@ gt_length_too_big:
              plen++;
          }
      }
-    tok->nsamples = nsmpl;
+    assert( tok->nsamples == nsmpl );
      tok->nvalues = tok->str_value.l;
-    tok->nstr1 = blen;
+    tok->nval1 = blen;
  }
  static void filters_set_ref_string(filter_t *flt, bcf1_t *line, token_t *tok)
  {
@@ -868,7 +860,7 @@ static void filters_set_alt_string(filter_t *flt, bcf1_t *line, token_t *tok)
      }
      else if ( tok->idx==-2 )
      {
-        int i, j = 0, end = tok->idxs[tok->nidxs-1] < 0 ? line->n_allele - 1 : tok->nidxs - 1;
+        int i, end = tok->nuidxs ? tok->nuidxs : line->n_allele - 1;
          if ( end >= line->n_allele - 1 ) end = line->n_allele - 2;
          for (i=0; i<=end; i++)
              if ( i>=tok->nidxs || tok->idxs[i] )
@@ -998,59 +990,111 @@ static void filters_set_maf(filter_t *flt, bcf1_t *line, token_t *tok)
      }
  }
  
-static void set_max(filter_t *flt, bcf1_t *line, token_t *tok)
+static int func_max(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
  {
+    token_t *tok = stack[nstack - 1];
+    rtok->nvalues = 0;
+    if ( !tok->nvalues ) return 1;
      double val = -HUGE_VAL;
-    int i;
+    int i, has_value = 0;
      for (i=0; i<tok->nvalues; i++)
      {
-        if ( !bcf_double_is_missing(tok->values[i]) && val < tok->values[i] ) val = tok->values[i];
+        if ( bcf_double_is_missing(tok->values[i]) || bcf_double_is_vector_end(tok->values[i]) ) continue;
+        has_value = 1;
+        if ( val < tok->values[i] ) val = tok->values[i];
+    }
+    if ( has_value )
+    {
+        rtok->values[0] = val;
+        rtok->nvalues = has_value;
      }
-    tok->values[0] = val;
-    tok->nvalues   = 1;
-    tok->nsamples  = 0;
+    return 1;
  }
-static void set_min(filter_t *flt, bcf1_t *line, token_t *tok)
+static int func_min(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
  {
+    token_t *tok = stack[nstack - 1];
+    rtok->nvalues = 0;
+    if ( !tok->nvalues ) return 1;
      double val = HUGE_VAL;
-    int i;
+    int i, has_value = 0;
      for (i=0; i<tok->nvalues; i++)
-        if ( !bcf_double_is_missing(tok->values[i]) && val > tok->values[i] ) val = tok->values[i];
-    tok->values[0] = val;
-    tok->nvalues   = 1;
-    tok->nsamples  = 0;
+    {
+        if ( bcf_double_is_missing(tok->values[i]) || bcf_double_is_vector_end(tok->values[i]) ) continue;
+        has_value = 1;
+        if ( val > tok->values[i] ) val = tok->values[i];
+    }
+    if ( has_value )
+    {
+        rtok->values[0] = val;
+        rtok->nvalues = has_value;
+    }
+    return 1;
  }
-static void set_avg(filter_t *flt, bcf1_t *line, token_t *tok)
+static int func_avg(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
  {
+    token_t *tok = stack[nstack - 1];
+    rtok->nvalues = 0;
+    if ( !tok->nvalues ) return 1;
      double val = 0;
      int i, n = 0;
      for (i=0; i<tok->nvalues; i++)
          if ( !bcf_double_is_missing(tok->values[i]) ) { val += tok->values[i]; n++; }
-    tok->values[0] = n ? val / n : 0;
-    tok->nvalues   = 1;
-    tok->nsamples  = 0;
+    if ( n )
+    {
+        rtok->values[0] = val / n;
+        rtok->nvalues   = 1;
+    }
+    return 1;
  }
-static void set_sum(filter_t *flt, bcf1_t *line, token_t *tok)
+static int func_sum(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
  {
+    rtok->nvalues = 0;
+    token_t *tok = stack[nstack - 1];
+    if ( !tok->nvalues ) return 1;
      double val = 0;
      int i, n = 0;
      for (i=0; i<tok->nvalues; i++)
          if ( !bcf_double_is_missing(tok->values[i]) ) { val += tok->values[i]; n++; }
-    tok->values[0] = val;
-    tok->nvalues   = 1;
-    tok->nsamples  = 0;
+    if ( n )
+    {
+        rtok->values[0] = val;
+        rtok->nvalues   = 1;
+    }
+    return 1;
  }
-static void set_abs(filter_t *flt, bcf1_t *line, token_t *tok)
+static int func_abs(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
  {
+    token_t *tok = stack[nstack - 1];
      if ( tok->is_str ) error("ABS() can be applied only on numeric values\n");
+
+    rtok->nvalues = tok->nvalues;
+    if ( !tok->nvalues ) return 1;
+    hts_expand(double, rtok->nvalues, rtok->mvalues, rtok->values);
      int i;
      for (i=0; i<tok->nvalues; i++)
-        tok->values[i] = fabs(tok->values[i]);
+        if ( bcf_double_is_missing(tok->values[i]) ) bcf_double_set_missing(rtok->values[i]);
+        else rtok->values[i] = fabs(tok->values[i]);
+    return 1;
+}
+static int func_count(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
+{
+    token_t *tok = stack[nstack - 1];
+    if ( !tok->nsamples ) error("COUNT() can be applied only on FORMAT fields\n");
+
+    int i, cnt = 0;
+    for (i=0; i<tok->nsamples; i++)
+        if ( tok->pass_samples[i] ) cnt++;
+
+    rtok->nvalues = 1;
+    rtok->values[0] = cnt;
+    return 1;
  }
-static void set_strlen(filter_t *flt, bcf1_t *line, token_t *tok)
+static int func_strlen(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
  {
-    tok->is_str = 0;
-    if ( !tok->str_value.l ) return;
+    token_t *tok = stack[nstack - 1];
+    rtok->nvalues = rtok->str_value.l = 0;
+    if ( !tok->str_value.l ) return 1;
+
      if ( tok->idx==-2 )
      {
          int i = 0;
@@ -1059,202 +1103,193 @@ static void set_strlen(filter_t *flt, bcf1_t *line, token_t *tok)
          {
              char *se = ss;
              while ( *se && *se!=',' ) se++;
-            hts_expand(double, i+1, tok->mvalues, tok->values);
-            if ( !*se ) tok->values[i] = strlen(ss);
+            hts_expand(double, i+1, rtok->mvalues, rtok->values);
+            if ( !*se ) rtok->values[i] = strlen(ss);
              else
              {
                  *se = 0;
-                tok->values[i] = strlen(ss);
+                rtok->values[i] = strlen(ss);
                  *se = ',';
              }
              ss = *se ? se + 1 : se;
              i++;
          }
-        tok->nvalues = i;
+        rtok->nvalues = i;
      }
      else
      {
-        tok->values[0] = strlen(tok->str_value.s);
-        tok->nvalues = 1;
+        rtok->values[0] = strlen(tok->str_value.s);
+        rtok->nvalues = 1;
      }
-    tok->str_value.l = 0;
+    return 1;
+}
+inline static void tok_init_values(token_t *atok, token_t *btok, token_t *rtok)
+{
+    token_t *tok = atok->nvalues > btok->nvalues ? atok : btok;
+    rtok->nvalues = tok->nvalues;
+    rtok->nval1   = tok->nval1;
+    hts_expand(double*, rtok->nvalues, rtok->mvalues, rtok->values);
  }
-#define VECTOR_ARITHMETICS(atok,btok,AOP) \
+inline static void tok_init_samples(token_t *atok, token_t *btok, token_t *rtok)
+{
+    if ( (atok->nsamples || btok->nsamples) && !rtok->nsamples )
+    {
+        rtok->nsamples = atok->nsamples ? atok->nsamples : btok->nsamples;
+        rtok->usmpl = (uint8_t*) calloc(rtok->nsamples,1); 
+        int i;
+        for (i=0; i<atok->nsamples; i++) rtok->usmpl[i] |= atok->usmpl[i];
+        for (i=0; i<btok->nsamples; i++) rtok->usmpl[i] |= btok->usmpl[i];
+    }
+    memset(rtok->pass_samples, 0, rtok->nsamples);
+}
+
+#define VECTOR_ARITHMETICS(atok,btok,_rtok,AOP) \
  { \
+    token_t *rtok = _rtok; \
      int i, has_values = 0; \
-    if ( !(atok)->nvalues || !(btok)->nvalues ) /* missing values */ \
+    if ( atok->nvalues && btok->nvalues ) \
      { \
-        (atok)->nvalues = 0; (atok)->nsamples = 0; \
-    } \
-    else \
-    { \
-        if ( ((atok)->nsamples && (btok)->nsamples) || (!(atok)->nsamples && !(btok)->nsamples)) \
-        { \
-            for (i=0; i<(atok)->nvalues; i++) \
-            { \
-                if ( bcf_double_is_missing((atok)->values[i]) ) continue; \
-                if ( bcf_double_is_missing((btok)->values[i]) ) { bcf_double_set_missing((atok)->values[i]); continue; } \
-                has_values = 1; \
-                (atok)->values[i] = (atok)->values[i] AOP (btok)->values[i]; \
-            } \
-        } \
-        else if ( (btok)->nsamples ) \
+        tok_init_values(atok, btok, rtok); \
+        tok_init_samples(atok, btok, rtok); \
+        if ( (atok->nsamples && btok->nsamples) || (!atok->nsamples && !btok->nsamples)) \
          { \
-            hts_expand(double,(btok)->nvalues,(atok)->mvalues,(atok)->values); \
-            for (i=0; i<(btok)->nvalues; i++) \
+            assert( atok->nsamples==btok->nsamples ); \
+            for (i=0; i<atok->nvalues; i++) \
              { \
-                if ( bcf_double_is_missing((atok)->values[0]) || bcf_double_is_missing((btok)->values[i]) ) \
+                if ( bcf_double_is_missing(atok->values[i]) || bcf_double_is_missing(btok->values[i]) ) \
                  { \
-                    bcf_double_set_missing((atok)->values[i]); \
+                    bcf_double_set_missing(rtok->values[i]); \
                      continue; \
                  } \
                  has_values = 1; \
-                (atok)->values[i] = (atok)->values[0] AOP (btok)->values[i]; \
+                rtok->values[i] = atok->values[i] AOP btok->values[i]; \
              } \
-            (atok)->nvalues  = (btok)->nvalues; \
-            (atok)->nsamples = (btok)->nsamples; \
          } \
-        else if ( (atok)->nsamples ) \
+        else \
          { \
-            for (i=0; i<(atok)->nvalues; i++) \
+            token_t *xtok = atok->nsamples ? atok : btok; \
+            token_t *ytok = atok->nsamples ? btok : atok; \
+            assert( ytok->nvalues==1 ); \
+            if ( !bcf_double_is_missing(ytok->values[0]) ) \
              { \
-                if ( bcf_double_is_missing((atok)->values[i]) || bcf_double_is_missing((btok)->values[0]) ) \
+                for (i=0; i<xtok->nvalues; i++) \
                  { \
-                    bcf_double_set_missing((atok)->values[i]); \
-                    continue; \
+                    if ( bcf_double_is_missing(xtok->values[i]) ) \
+                    { \
+                        bcf_double_set_missing(rtok->values[i]); \
+                        continue; \
+                    } \
+                    has_values = 1; \
+                    rtok->values[i] = xtok->values[i] AOP ytok->values[0]; \
                  } \
-                has_values = 1; \
-                (atok)->values[i] = (atok)->values[i] AOP (btok)->values[0]; \
              } \
          } \
      } \
-    if ( !has_values ) { (atok)->nvalues = 0; (atok)->nsamples = 0; } \
+    if ( !has_values ) rtok->nvalues = 0; \
  }
  
-static int vector_logic_and(token_t *atok, token_t *btok, int and_type)
+static int vector_logic_or(filter_t *filter, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
  {
-    // We are comparing either two scalars (result of INFO tag vs a threshold), two vectors (two FORMAT fields),
-    // or a vector and a scalar (FORMAT field vs threshold)
-    int i, pass_site = 0;
-    if ( !atok->nvalues || !btok->nvalues )
-    {
-        atok->nvalues = atok->nsamples = 0;
-        return 0;
-    }
-    if ( !atok->nsamples && !btok->nsamples ) return atok->pass_site && btok->pass_site;
-    if ( atok->nsamples && btok->nsamples )
+    token_t *atok = stack[nstack-2];
+    token_t *btok = stack[nstack-1];
+    tok_init_samples(atok, btok, rtok);
+
+    if ( !atok->pass_site && !btok->pass_site ) return 2;
+
+    rtok->pass_site = 1;
+    if ( !atok->nsamples && !btok->nsamples ) return 2;
+
+    int i;
+    if ( rtok->tok_type==TOK_OR_VEC )   // ||, select all samples if one is true
      {
-        if ( and_type==TOK_AND )
+        if ( (!atok->nsamples && !atok->pass_site) || (!btok->nsamples && !btok->pass_site) )
          {
-            // perform AND within a sample
-            for (i=0; i<atok->nsamples; i++)
+            // These two conditions are to ensure the following does not set all samples
+            // at sites with QUAL<=30:
+            //      QUAL>30 || FMT/GQ>30
+
+            token_t *tok = atok->nsamples ? atok : btok;
+            for (i=0; i<rtok->nsamples; i++)
              {
-                atok->pass_samples[i] = atok->pass_samples[i] && btok->pass_samples[i];
-                if ( !pass_site && atok->pass_samples[i] ) pass_site = 1;
+                if ( !rtok->usmpl[i] ) continue;
+                rtok->pass_samples[i] = tok->pass_samples[i];
              }
          }
          else
          {
-            // perform AND across samples
-            int pass_a = 0, pass_b = 0;
-            for (i=0; i<atok->nsamples; i++)
-            {
-                if ( atok->pass_samples[i] ) pass_a = 1;
-                atok->pass_samples[i] = atok->pass_samples[i] && btok->pass_samples[i];
-            }
-            for (i=0; i<btok->nsamples; i++)
+            for (i=0; i<rtok->nsamples; i++)
              {
-                if ( btok->pass_samples[i] ) { pass_b = 1; break; }
+                if ( !rtok->usmpl[i] ) continue;
+                rtok->pass_samples[i] = 1;
              }
-            pass_site = pass_a && pass_b;
          }
-        return pass_site;
+        return 2;
      }
-    if ( btok->nsamples )
+
+    //  |, only select samples which are actually true
+
+    if ( !atok->nsamples || !btok->nsamples )
      {
-        for (i=0; i<btok->nsamples; i++)
+        token_t *tok = atok->nsamples ? atok : btok;
+        for (i=0; i<rtok->nsamples; i++)
          {
-            atok->pass_samples[i] = atok->pass_site && btok->pass_samples[i];
-            if ( !pass_site && atok->pass_samples[i] ) pass_site = 1;
+            if ( !rtok->usmpl[i] ) continue;
+            rtok->pass_samples[i] = tok->pass_samples[i];
          }
-        atok->nsamples = btok->nsamples;
-        return pass_site;
+        return 2;
      }
-    /* atok->nsamples!=0 */
-    for (i=0; i<atok->nsamples; i++)
+
+    assert( atok->nsamples==btok->nsamples );
+
+    for (i=0; i<rtok->nsamples; i++)
      {
-        atok->pass_samples[i] = atok->pass_samples[i] && btok->pass_site;
-        if ( !pass_site && atok->pass_samples[i] ) pass_site = 1;
+        if ( !rtok->usmpl[i] ) continue;
+        rtok->pass_samples[i] = atok->pass_samples[i] | btok->pass_samples[i];
      }
-    return pass_site;
+    return 2;
  }
-static int vector_logic_or(token_t *atok, token_t *btok, int or_type)
+static int vector_logic_and(filter_t *filter, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
  {
-    int i, pass_site = 0;
-    if ( !atok->nvalues && !btok->nvalues )   // missing sites in both
-    {
-        atok->nvalues = atok->nsamples = 0;
-        return 0;
-    }
-    if ( !atok->nvalues ) // missing value in a
-    {
-        for (i=0; i<btok->nsamples; i++)
-            atok->pass_samples[i] = btok->pass_samples[i];
-        atok->nsamples = btok->nsamples;
-        atok->nvalues  = 1;
-        return btok->pass_site;
-    }
-    if ( !btok->nvalues ) // missing value in b
-    {
-        btok->nvalues = 1;
-        return atok->pass_site;
-    }
+    token_t *atok = stack[nstack-2];
+    token_t *btok = stack[nstack-1];
+    tok_init_samples(atok, btok, rtok);
  
-    if ( !atok->nsamples && !btok->nsamples ) return atok->pass_site || btok->pass_site;
-    if ( !atok->nsamples )
+    if ( !atok->pass_site || !btok->pass_site ) return 2;
+    if ( !atok->nsamples && !btok->nsamples ) { rtok->pass_site = 1; return 2; }
+
+    int i;
+    if ( !atok->nsamples || !btok->nsamples )
      {
-        if ( or_type==TOK_OR )
+        token_t *tok = atok->nsamples ? atok : btok;
+        for (i=0; i<rtok->nsamples; i++)
          {
-            for (i=0; i<btok->nsamples; i++)
-            {
-                atok->pass_samples[i] = btok->pass_samples[i];
-                if ( atok->pass_site || atok->pass_samples[i] ) pass_site = 1;
-            }
+            if ( !rtok->usmpl[i] ) continue;
+            rtok->pass_samples[i] = tok->pass_samples[i];  
          }
-        else
-        {
-            for (i=0; i<btok->nsamples; i++)
-            {
-                atok->pass_samples[i] = atok->pass_site || btok->pass_samples[i];
-                if ( atok->pass_samples[i] ) pass_site = 1;
-            }
-        }
-        atok->nsamples = btok->nsamples;
-        return pass_site;
+        rtok->pass_site = 1;
+        return 2;
      }
-    if ( !btok->nsamples )  // vector vs site
+
+    assert( atok->nsamples==btok->nsamples );
+    if ( rtok->tok_type==TOK_AND_VEC )  // &&, can be true in different samples
      {
-        if ( or_type==TOK_OR )
+        for (i=0; i<rtok->nsamples; i++)
          {
-            for (i=0; i<atok->nsamples; i++)
-                if ( btok->pass_site || atok->pass_samples[i] ) pass_site = 1;
+            if ( !rtok->usmpl[i] ) continue;
+            rtok->pass_samples[i] = atok->pass_samples[i] | btok->pass_samples[i];
          }
-        else
-        {
-            for (i=0; i<atok->nsamples; i++)
-            {
-                atok->pass_samples[i] = atok->pass_samples[i] || btok->pass_site;
-                if ( atok->pass_samples[i] ) pass_site = 1;
-            }
-        }
-        return pass_site;
+        rtok->pass_site = 1;
      }
-    for (i=0; i<atok->nsamples; i++)
+    else    // &, must be true within one sample
      {
-        atok->pass_samples[i] = atok->pass_samples[i] || btok->pass_samples[i];
-        if ( !pass_site && atok->pass_samples[i] ) pass_site = 1;
+        for (i=0; i<rtok->nsamples; i++)
+        {
+            if ( !rtok->usmpl[i] ) continue;
+            rtok->pass_samples[i] = atok->pass_samples[i] & btok->pass_samples[i];
+            if ( rtok->pass_samples[i] ) rtok->pass_site = 1;
+        }
      }
-    return pass_site;
+    return 2;
  }
  
  #define CMP_MISSING(atok,btok,CMP_OP,ret) \
@@ -1265,224 +1300,394 @@ static int vector_logic_or(token_t *atok, token_t *btok, int or_type)
      tok->nvalues = 1; \
  }
  
-#define CMP_VECTORS(atok,btok,CMP_OP,ret) \
+#define CMP_VECTORS(atok,btok,_rtok,CMP_OP,missing_logic) \
  { \
-    int i, j, has_values = 0, pass_site = 0; \
-    if ( !(atok)->nvalues || !(btok)->nvalues ) { (atok)->nvalues = 0; (atok)->nsamples = 0; (ret) = 0; } \
-    else \
+    token_t *rtok = _rtok; \
+    int i, j, k; \
+    assert( !atok->nsamples || !btok->nsamples ); \
+    tok_init_samples(atok, btok, rtok); \
+    if ( !atok->nsamples && !btok->nsamples ) \
      { \
-        if ( (atok)->idx<=-2 || (btok)->idx<=-2 ) \
+        if ( !atok->nvalues && !btok->nvalues ) { rtok->pass_site = missing_logic[2]; } \
+        else if ( !atok->nvalues || !btok->nvalues ) \
          { \
-            /* any field can match: [*] */ \
-            for (i=0; i<(atok)->nvalues; i++) \
+            token_t *tok = atok->nvalues ? atok : btok; \
+            for (j=0; j<tok->nvalues; j++) \
              { \
-                for (j=0; j<(btok)->nvalues; j++) \
-                    if ( (atok)->values[i] CMP_OP (btok)->values[j] ) { pass_site = 1; i = (atok)->nvalues; break; } \
+                if ( bcf_double_is_missing(tok->values[j]) ) \
+                { \
+                    if ( missing_logic[2] ) { rtok->pass_site = 1; break; } \
+                } \
+                else if ( missing_logic[1] ) {  rtok->pass_site = 1; break; } \
              } \
          } \
-        else if ( (atok)->nsamples && (btok)->nsamples ) \
+        else \
          { \
-            for (i=0; i<(atok)->nsamples; i++) \
+            for (i=0; i<atok->nvalues; i++) \
              { \
-                has_values = 1; \
-                if ( (atok)->values[i] CMP_OP (btok)->values[i] ) { (atok)->pass_samples[i] = 1; pass_site = 1; } \
-                else (atok)->pass_samples[i] = 0; \
+                int amiss = bcf_double_is_missing(atok->values[i]) ? 1 : 0; \
+                for (j=0; j<btok->nvalues; j++) \
+                { \
+                    int nmiss = amiss + (bcf_double_is_missing(btok->values[j]) ? 1 : 0); \
+                    if ( nmiss ) \
+                    { \
+                        if ( missing_logic[nmiss] ) { rtok->pass_site = 1; i = atok->nvalues; break; } \
+                    } \
+                    else if ( atok->values[i] CMP_OP btok->values[j] ) { rtok->pass_site = 1; i = atok->nvalues; break; } \
+                } \
              } \
-            if ( !has_values ) (atok)->nvalues = 0; \
          } \
-        else if ( (atok)->nsamples ) \
+    } \
+    else \
+    { \
+        if ( !atok->nvalues && !btok->nvalues ) \
          { \
-            for (i=0; i<(atok)->nsamples; i++) \
+            if ( missing_logic[2] ) \
              { \
-                /*if ( bcf_double_is_missing((atok)->values[i]) ) { (atok)->pass_samples[i] = 0; continue; }*/ \
-                has_values = 1; \
-                if ( (atok)->values[i] CMP_OP (btok)->values[0] ) { (atok)->pass_samples[i] = 1; pass_site = 1; } \
-                else (atok)->pass_samples[i] = 0; \
+                for (i=0; i<rtok->nsamples; i++) \
+                    if ( rtok->usmpl[i] ) { rtok->pass_samples[i] = missing_logic[2]; rtok->pass_site = 1; } \
              } \
-            if ( !has_values ) (atok)->nvalues = 0; \
          } \
-        else if ( (btok)->nsamples ) \
+        else if ( !atok->nvalues || !btok->nvalues ) \
          { \
-            for (i=0; i<(btok)->nsamples; i++) \
+            token_t *tok = atok->nvalues ? atok : btok; \
+            if ( !tok->nsamples ) \
              { \
-                if ( bcf_double_is_missing((btok)->values[i]) ) { (atok)->pass_samples[i] = 0; continue; } \
-                has_values = 1; \
-                if ( (atok)->values[0] CMP_OP (btok)->values[i] ) { (atok)->pass_samples[i] = 1; pass_site = 1; } \
-                else (atok)->pass_samples[i] = 0; \
+                int miss = 0; \
+                for (j=0; j<tok->nvalues; j++) \
+                    miss |= bcf_double_is_missing(tok->values[j]) ? 1 : 0; \
+                if ( missing_logic[++miss] ) \
+                { \
+                    for (i=0; i<rtok->nsamples; i++) \
+                        if ( rtok->usmpl[i] ) { rtok->pass_samples[i] = missing_logic[miss]; rtok->pass_site = 1; } \
+                } \
              } \
-            (atok)->nvalues  = (btok)->nvalues; \
-            (atok)->nsamples = (btok)->nsamples; \
-            if ( !has_values ) (atok)->nvalues = 0; \
+            else \
+                for (i=0; i<tok->nsamples; i++) \
+                { \
+                    if ( !rtok->usmpl[i] ) continue; \
+                    double *ptr = tok->values + i*tok->nval1; \
+                    int miss = 0; \
+                    for (j=0; j<tok->nval1; j++) \
+                        miss |= bcf_double_is_missing(ptr[j]) ? 1 : 0; \
+                    if ( missing_logic[++miss] ) { rtok->pass_samples[i] = missing_logic[miss]; rtok->pass_site = 1; } \
+                } \
          } \
          else \
          { \
-            if ( (atok)->values[0] CMP_OP (btok)->values[0] ) { pass_site = 1; } \
+            token_t *xtok = atok->nsamples ? atok : btok; \
+            token_t *ytok = atok->nsamples ? btok : atok; \
+            for (i=0; i<xtok->nsamples; i++) \
+            { \
+                if ( !rtok->usmpl[i] ) continue; \
+                double *xptr = xtok->values + i*xtok->nval1; \
+                double *yptr = ytok->values + i*ytok->nval1; \
+                for (j=0; j<xtok->nval1; j++) \
+                { \
+                    int miss = bcf_double_is_missing(xptr[j]) ? 1 : 0; \
+                    if ( miss && !missing_logic[0] ) continue; /* any is missing => result is false */ \
+                    for (k=0; k<ytok->nvalues; k++) \
+                    { \
+                        int nmiss = miss + (bcf_double_is_missing(yptr[k]) ? 1 : 0); \
+                        if ( nmiss ) \
+                        { \
+                            if ( missing_logic[nmiss] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; j = xtok->nval1; break; } \
+                        } \
+                        else if ( xptr[j] CMP_OP yptr[k] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; j = xtok->nval1; break; } \
+                    } \
+                } \
+            } \
          } \
-        /*fprintf(stderr,"pass=%d\n", pass_site);*/ \
-        (ret) = pass_site; \
      } \
  }
-static int cmp_vector_strings(token_t *atok, token_t *btok, int logic)    // logic: TOK_EQ or TOK_NE
+static int _regex_vector_strings(regex_t *regex, char *str, size_t len, int logic, int *missing_logic)
  {
-    if ( !atok->str_value.l ) { return 0; }
-    if ( !btok->str_value.l ) { atok->str_value.l = 0; return 0; }
-    int i, pass_site = 0;
-    if ( atok->nsamples && atok->nsamples==btok->nsamples )
-    {
-        for (i=0; i<atok->nsamples; i++)
-        {
-            char *astr = atok->str_value.s + i*atok->nstr1;
-            char *bstr = btok->str_value.s + i*btok->nstr1;
-            char *aend = astr + atok->str_value.l, *a = astr;
-            while ( a<aend && *a ) a++;
-            char *bend = bstr + btok->str_value.l, *b = bstr;
-            while ( b<bend && *b ) b++;
-            if ( a-astr != b-bstr ) atok->pass_samples[i] = 0;
-            else atok->pass_samples[i] = strncmp(astr,bstr,a-astr)==0 ? 1 : 0;
-            if ( logic!=TOK_EQ )
-                atok->pass_samples[i] = atok->pass_samples[i] ? 0 : 1;
-            pass_site |= atok->pass_samples[i];
-        }
-        if ( !atok->nsamples ) atok->nsamples = btok->nsamples;
-    }
-    else if ( !atok->nsamples && !btok->nsamples )
-    {
-        if ( atok->idx==-2 || btok->idx==-2 )
-        {
-            // any field can match: [*]
-            if ( atok->idx==-2 && btok->idx==-2 )
-                error("fixme: Expected at least one scalar value [%s %s %s]\n", atok->tag ? atok->tag : btok->tag, atok->str_value,btok->str_value);
-            token_t *xtok, *ytok;   // xtok is scalar, ytok array
-            if ( btok->idx==-2 ) { xtok = atok; ytok = btok; }
-            else { xtok = btok; ytok = atok; }
-            char *xstr = xtok->str_value.s, *xend = xstr + xtok->str_value.l;
-            char *ystr = ytok->str_value.s, *yend = ystr + ytok->str_value.l, *y = ystr;
-            while ( y<=yend )
-            {
-                if ( y==yend || *y==',' )
-                {
-                    if ( y-ystr==xend-xstr && !strncmp(xstr,ystr,xend-xstr) )
-                    {
-                        pass_site = 1;
-                        break;
-                    }
-                    ystr = y+1;
-                }
-                y++;
-            }
-        }
-        else
-            pass_site = strcmp(atok->str_value.s,btok->str_value.s) ? 0 : 1;
-        if ( logic!=TOK_EQ ) pass_site = pass_site ? 0 : 1;
+    char *end = str + len;
+    while ( str < end && *str )
+    {
+        char *mid = str;
+        while ( mid < end && *mid && *mid!=',' ) mid++;
+        int miss = mid - str == 1 && str[0]=='.' ? 1 : 0;
+        if ( miss && missing_logic[miss] ) return 1;
+        char tmp = *mid; *mid = 0;
+        int match = regexec(regex, str, 0,NULL,0) ? 0 : 1;
+        *mid = tmp;
+        if ( logic==TOK_NLIKE ) match = match ? 0 : 1;
+        if ( match ) return 1;
+        if ( !*mid ) break;
+        str = mid + 1;
      }
-    else
+    return 0;
+}
+static inline int _has_missing_string(char *beg)
+{
+    while ( *beg )
      {
-        token_t *xtok, *ytok;
-        if ( !atok->nsamples ) { xtok = atok; ytok = btok; }
-        else { xtok = btok; ytok = atok; }
-        char *xstr = xtok->str_value.s;
-        char *xend = xstr + xtok->str_value.l, *x = xstr;
-        while ( x<xend && *x ) x++;
-        for (i=0; i<ytok->nsamples; i++)
-        {
-            char *ystr = ytok->str_value.s + i*ytok->nstr1;
-            char *ybeg = ystr, *yend = ystr + ytok->nstr1;
-            int pass = 0;
-            while ( ybeg < yend )
+        char *end = beg;
+        while ( *end && *end!=',' ) end++;
+        if ( end-beg==1 && beg[0]=='.' ) return 1;
+        if ( !*end ) break;
+        beg = end + 1;
+    }
+    return 0;
+}
+
+// Compare two strings with multiple fields, for example "A,B,.,C"=="X,Y,A".
+// Returns 1 if any field matches, otherwise returns 0
+static inline int _match_vector_strings(char *abeg, size_t alen, char *bstr, size_t blen, int logic, int *missing_logic)
+{
+    char *aend = abeg + alen;
+    char *bend = bstr + blen;
+    while ( abeg < aend && *abeg )
+    {
+        char *amid = abeg;
+        while ( amid < aend && *amid && *amid!=',' ) amid++;
+        int miss = amid - abeg == 1 && abeg[0]=='.' ? 1 : 0;
+        char *bbeg = bstr;
+        while ( bbeg < bend && *bbeg )
+        {
+            char *bmid = bbeg;
+            while ( bmid < bend && *bmid && *bmid!=',' ) bmid++;
+            int nmiss = miss + (bmid - bbeg == 1 && bbeg[0]=='.' ? 1 : 0);
+            if ( nmiss )
              {
-                char *y = ybeg;
-                while ( y<yend && *y && *y!=',' ) y++;
-                if ( y - ybeg != x - xstr ) pass = 0;
-                else pass = strncmp(xstr,ybeg,x-xstr)==0 ? 1 : 0;
-                if ( logic!=TOK_EQ ) pass = pass ? 0 : 1;
-                if ( pass || !*y ) break;
-                ybeg = y+1;
+                if ( missing_logic[nmiss] ) return 1;
              }
-            atok->pass_samples[i] = pass;
-            pass_site |= pass;
+            else
+            {
+                int match = amid-abeg==bmid-bbeg && !strncmp(abeg,bbeg,amid-abeg) ? 1 : 0;
+                if ( logic==TOK_NE ) match = match==1 ? 0 : 1;
+                if ( match ) return 1;
+            }
+            if ( !*bmid ) break;
+            bbeg = bmid + 1;
          }
-        if ( !atok->nsamples )
-            atok->nvalues = atok->nsamples = btok->nsamples; // is it a bug? not sure if atok->nvalues should be set
+        if ( !*amid ) break;
+        abeg = amid + 1;
      }
-    return pass_site;
+    return 0;
  }
-static int regex_vector_strings(token_t *atok, token_t *btok, int negate)
+static void cmp_vector_strings(token_t *atok, token_t *btok, token_t *rtok)
  {
-    int i, pass_site = 0;
-    if ( atok->nsamples )
+    tok_init_samples(atok, btok, rtok);
+
+    int i, logic = rtok->tok_type;     // TOK_EQ, TOK_NE, TOK_LIKE, TOK_NLIKE
+    regex_t *regex = atok->regex ? atok->regex : (btok->regex ? btok->regex : NULL);
+
+    assert( atok->nvalues==atok->str_value.l && btok->nvalues==btok->str_value.l );
+    assert( !atok->nsamples || !btok->nsamples );
+    assert( (!regex && (logic==TOK_EQ || logic==TOK_NE)) || (regex && (logic==TOK_LIKE || logic==TOK_NLIKE)) );
+
+    int missing_logic[] = {0,0,0};
+    if ( logic==TOK_EQ || logic==TOK_LIKE ) missing_logic[0] = missing_logic[2] = 1;
+    else if ( logic==TOK_NE || logic==TOK_NLIKE ) missing_logic[0] = missing_logic[1] = 1;
+
+    if ( !atok->nsamples && !btok->nsamples )
+    {
+        if ( !atok->nvalues && !btok->nvalues ) { rtok->pass_site = missing_logic[2]; return; }
+        if ( !atok->nvalues || !btok->nvalues )
+        {
+            int miss = _has_missing_string(atok->nvalues ? atok->str_value.s : btok->str_value.s);
+            if ( missing_logic[miss+1] ) rtok->pass_site = 1;
+            return;
+        }
+        if ( !regex )
+            rtok->pass_site = _match_vector_strings(atok->str_value.s, atok->str_value.l, btok->str_value.s, btok->str_value.l, logic, missing_logic); 
+        else
+        {
+            token_t *tok = atok->regex ? btok : atok;
+            rtok->pass_site = _regex_vector_strings(regex, tok->str_value.s, tok->str_value.l, logic, missing_logic);
+        }
+        return;
+    }
+
+    // The case of (!atok->nsamples || !btok->nsamples)
+
+    if ( !atok->nvalues && !btok->nvalues )
      {
-        for (i=0; i<atok->nsamples; i++)
+        if ( missing_logic[2] )
          {
-            char *ptr = atok->str_value.s + i*atok->nstr1;
-            atok->pass_samples[i] = regexec(btok->regex, ptr, 0,NULL,0) ? 0 : 1;
-            if ( negate ) atok->pass_samples[i] = atok->pass_samples[i] ? 0 : 1;
-            pass_site |= atok->pass_samples[i];
+            for (i=0; i<rtok->nsamples; i++) 
+                if ( rtok->usmpl[i] ) { rtok->pass_samples[i] = missing_logic[2]; rtok->pass_site = 1; }
          }
-        return pass_site;
+        return;
+    }
+    if ( !atok->nvalues || !btok->nvalues )
+    {
+        token_t *tok = atok->nvalues ? atok : btok;
+        if ( !tok->nsamples )
+        {
+            int miss = _has_missing_string(tok->str_value.s);
+            if ( !missing_logic[miss+1] ) return;
+            for (i=0; i<rtok->nsamples; i++)
+                if ( rtok->usmpl[i] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; }
+        }
+        else
+            for (i=0; i<tok->nsamples; i++)
+            {
+                if ( !rtok->usmpl[i] ) continue;
+                int miss = _has_missing_string(tok->str_value.s + i*tok->nval1);
+                if ( missing_logic[miss+1] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; }
+            }
+        return;
+    }
+
+    // The case of (!atok->nsamples || !btok->nsamples) && (atok->nvalues && btok->nvalues) 
+    token_t *xtok = atok->nsamples ? atok : btok;
+    token_t *ytok = atok->nsamples ? btok : atok;
+    assert( regex==ytok->regex );
+    for (i=0; i<xtok->nsamples; i++)
+    {
+        if ( !rtok->usmpl[i] ) continue;
+        int match;
+        if ( regex )
+            match = _regex_vector_strings(regex, xtok->str_value.s + i*xtok->nval1, xtok->nval1, logic, missing_logic);
+        else
+            match = _match_vector_strings(xtok->str_value.s + i*xtok->nval1, xtok->nval1, ytok->str_value.s, ytok->str_value.l, logic, missing_logic);
+        if ( match ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; }
      }
-    pass_site = regexec(btok->regex, atok->str_value.s, 0,NULL,0) ? 0 : 1;
-    if ( negate ) pass_site = pass_site ? 0 : 1;
-    return pass_site;
  }
  
-static void parse_tag_idx(char *tag, char *tag_idx, token_t *tok)   // tag_idx points just after "TAG["
+static int parse_idxs(char *tag_idx, int **idxs, int *nidxs, int *idx)
  {
-    // TAG[*] .. any field
-    if ( !strncmp("*]", tag_idx, 3) )
+    // TAG[], TAG[*] .. any field; sets idx=-2, idxs[0]=-1
+    if ( *tag_idx==0 || !strcmp("*", tag_idx) )
      {
-        tok->idxs = (int*) malloc(sizeof(int));
-        tok->idxs[0] = -1;
-        tok->nidxs   = 1;
-        tok->idx     = -2;
-        return;
+        *idxs = (int*) malloc(sizeof(int));
+        (*idxs)[0] = -1;
+        *nidxs     = 1;
+        *idx       = -2;
+        return 0;
      }
  
-    // TAG[integer] .. one field
+    // TAG[integer] .. one field; idx positive
      char *end, *beg = tag_idx;
-    tok->idx = strtol(tag_idx, &end, 10);
-    if ( tok->idx >= 0 && *end==']' ) return;
+    *idx = strtol(tag_idx, &end, 10);
+    if ( *idx >= 0 && *end==0 ) return 0;
  
-
-    // TAG[0,1] or TAG[0-2] or [1-] etc
+    // TAG[0,1] or TAG[0-2] or [1-] etc; idx=-2, idxs[...]=0,0,1,1,..
      int i, ibeg = -1;
-    while ( *beg && *beg!=']' )
+    while ( *beg )
      {
-        int idx = strtol(beg, &end, 10);
+        int num = strtol(beg, &end, 10);
          if ( end[0]==',' ) beg = end + 1;
-        else if ( end[0]==']' ) beg = end;
-        else if ( end[0]=='-' ) { beg = end + 1; ibeg = idx; continue; }
-        else error("Could not parse the index: %s[%s\n", tag, tag_idx+1);
-        if ( idx >= tok->nidxs )
+        else if ( end[0]==0 ) beg = end;
+        else if ( end[0]=='-' ) { beg = end + 1; ibeg = num; continue; }
+        else return -1;
+        if ( num >= *nidxs )
          {
-            tok->idxs = (int*) realloc(tok->idxs, sizeof(int)*(idx+1));
-            memset(tok->idxs + tok->nidxs, 0, sizeof(int)*(idx - tok->nidxs + 1));
-            tok->nidxs = idx + 1;
+            *idxs = (int*) realloc(*idxs, sizeof(int)*(num+1));
+            memset(*idxs + *nidxs, 0, sizeof(int)*(num - *nidxs + 1));
+            *nidxs = num + 1;
          }
          if ( ibeg>=0 )
          {
-            for (i=ibeg; i<=idx; i++) tok->idxs[i] = 1;
+            for (i=ibeg; i<=num; i++) (*idxs)[i] = 1;
              ibeg = -1;
          }
-        tok->idxs[idx] = 1;
+        (*idxs)[num] = 1;
      }
      if ( ibeg >=0 )
      {
-        if ( ibeg >= tok->nidxs )
+        if ( ibeg >= *nidxs )
          {
-            tok->idxs = (int*) realloc(tok->idxs, sizeof(int)*(ibeg+1));
-            memset(tok->idxs + tok->nidxs, 0, sizeof(int)*(ibeg - tok->nidxs + 1));
-            tok->nidxs = ibeg + 1;
+            *idxs = (int*) realloc(*idxs, sizeof(int)*(ibeg+1));
+            memset(*idxs + *nidxs, 0, sizeof(int)*(ibeg - *nidxs + 1));
+            *nidxs = ibeg + 1;
+        }
+        (*idxs)[ibeg] = -1;
+    }
+    *idx = -2;
+    return 0;
+}
+
+static void parse_tag_idx(bcf_hdr_t *hdr, int is_fmt, char *tag, char *tag_idx, token_t *tok)   // tag_idx points just after "TAG["
+{
+    int i, len = strlen(tag_idx);
+    if ( tag_idx[len-1] == ']' ) tag_idx[len-1] = 0;
+    char *ori = strdup(tag_idx);
+
+    assert( !tok->idxs && !tok->usmpl );
+    int *idxs1 = NULL, nidxs1 = 0, idx1 = 0;
+    int *idxs2 = NULL, nidxs2 = 0, idx2 = 0;
+
+    int set_samples = 0;
+    char *colon = index(tag_idx, ':');
+    if ( colon )
+    {
+        *colon = 0;
+        if ( parse_idxs(tag_idx, &idxs1, &nidxs1, &idx1) != 0 ) error("Could not parse the index: %s\n", ori);
+        if ( parse_idxs(colon+1, &idxs2, &nidxs2, &idx2) != 0 ) error("Could not parse the index: %s\n", ori);
+        tok->idxs  = idxs2;
+        tok->nidxs = nidxs2;
+        tok->idx   = idx2;
+        set_samples = 1;
+    }
+    else
+    {
+        if ( parse_idxs(tag_idx, &idxs1, &nidxs1, &idx1) != 0 ) error("Could not parse the index: %s\n", ori);
+        if ( is_fmt )
+        {
+            if ( nidxs1==1 && idxs1[0]==-1 )
+            {
+                tok->idxs = (int*) malloc(sizeof(int));
+                tok->idxs[0] = -1;
+                tok->nidxs   = 1;
+                tok->idx     = -2;
+            }
+            else if ( bcf_hdr_id2number(hdr,BCF_HL_FMT,tok->hdr_id)!=1 )
+                error("The FORMAT tag %s can have multiple subfields, run as %s[sample:subfield]\n", tag,tag);
+            else
+                tok->idx = 0;
+            set_samples = 1;
+        }
+        else
+        {
+            tok->idxs  = idxs1; 
+            tok->nidxs = nidxs1;
+            tok->idx   = idx1;
+        }
+    }
+
+    if ( set_samples )
+    {
+        tok->nsamples = bcf_hdr_nsamples(hdr);
+        tok->usmpl = (uint8_t*) calloc(tok->nsamples,1); 
+        if ( idx1>=0 )
+        {
+            if ( idx1 >= bcf_hdr_nsamples(hdr) ) error("The sample index is too large: %s\n", ori);
+            tok->usmpl[idx1] = 1;
+        }
+        else if ( idx1==-2 )
+        {
+            for (i=0; i<nidxs1; i++)
+            {
+                if ( idxs1[i]==0 ) continue;
+                if ( idxs1[i]==-1 ) break;
+                if ( i >= bcf_hdr_nsamples(hdr) ) error("The sample index is too large: %s\n", ori);
+                tok->usmpl[i] = 1;
+            }
+            if ( nidxs1 && idxs1[nidxs1-1]==-1 )    // open range, such as "7-"
+            {
+                for (; i<tok->nsamples; i++) tok->usmpl[i] = 1;
+            }
          }
-        tok->idxs[ibeg] = -1;
+        else error("todo: %s:%d .. %d\n", __FILE__,__LINE__, idx2);
+        free(idxs1);
+    }
+    free(ori);
+
+    if ( tok->nidxs && tok->idxs[tok->nidxs-1]!=-1 )
+    {
+        for (i=0; i<tok->nidxs; i++) if ( tok->idxs[i] ) tok->nuidxs++;
      }
-    tok->idx = -2;
  }
  static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
  {
      tok->tok_type  = TOK_VAL;
      tok->hdr_id    = -1;
      tok->pass_site = -1;
-    tok->idx       = -1;
+    tok->idx       = 0;
  
      // is this a string constant?
      if ( str[0]=='"' || str[0]=='\'' )
@@ -1577,6 +1782,10 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
              tok->setter = &filters_set_alt_string;
              tok->is_str = 1;
              tok->tag = strdup("ALT");
+            tok->idxs = (int*) malloc(sizeof(int));
+            tok->idxs[0] = -1;
+            tok->nidxs   = 1;
+            tok->idx     = -2;
              return 0;
          }
          else if ( !strncasecmp(str,"N_ALT",len) )
@@ -1614,8 +1823,6 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
          int i;
          for (i=0; i<tmp.l; i++)
              if ( tmp.s[i]=='[' ) { tmp.s[i] = 0; is_array = i+1; break; }
-        if ( is_array ) 
-            parse_tag_idx(tmp.s, tmp.s+is_array, tok);
      }
      tok->hdr_id = bcf_hdr_id2int(filter->hdr,BCF_DT_ID,tmp.s);
      if ( is_fmt==-1 )
@@ -1627,6 +1834,16 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
          }
          if ( is_fmt==-1 ) is_fmt = 0;
      }
+    if ( is_array ) 
+        parse_tag_idx(filter->hdr, is_fmt, tmp.s, tmp.s+is_array, tok);
+    else if ( is_fmt && !tok->nsamples ) 
+    {
+        int i;
+        tok->nsamples = bcf_hdr_nsamples(filter->hdr);
+        tok->usmpl = (uint8_t*) malloc(tok->nsamples); 
+        for (i=0; i<tok->nsamples; i++) tok->usmpl[i] = 1;
+    }
+
      tok->type = is_fmt ? BCF_HL_FMT : BCF_HL_INFO;
      if ( is_fmt ) filter->max_unpack |= BCF_UN_FMT;
      if ( tok->hdr_id>=0 )
@@ -1640,7 +1857,12 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
              if ( !bcf_hdr_idinfo_exists(filter->hdr,BCF_HL_FMT,tok->hdr_id) )
                  error("No such FORMAT field: %s\n", tmp.s);
              if ( bcf_hdr_id2number(filter->hdr,BCF_HL_FMT,tok->hdr_id)!=1 && !is_array )
-                error("Error: FORMAT vectors must be subscripted, e.g. %s[0] or %s[*]\n", tmp.s, tmp.s);
+            {
+                tok->idxs = (int*) malloc(sizeof(int));
+                tok->idxs[0] = -1;
+                tok->nidxs   = 1;
+                tok->idx     = -2;
+            }
              switch ( bcf_hdr_id2type(filter->hdr,BCF_HL_FMT,tok->hdr_id) )
              {
                  case BCF_HT_INT:  tok->setter = &filters_set_format_int; break;
@@ -1738,7 +1960,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
      {
          errno = 0;
          tok->threshold = strtof(tmp.s, &end);   // float?
-        if ( errno!=0 || end!=tmp.s+len ) error("[%s:%d %s] Error: the tag \"INFO/%s\" is not defined in the VCF header\n", __FILE__,__LINE__,__FUNCTION__,tmp.s);
+        if ( errno!=0 || end!=tmp.s+len ) error("[%s:%d %s] Error: the tag \"%s\" is not defined in the VCF header\n", __FILE__,__LINE__,__FUNCTION__,tmp.s);
      }
  
      if ( tmp.s ) free(tmp.s);
@@ -1874,16 +2096,26 @@ filter_t *filter_init(bcf_hdr_t *hdr, const char *str)
      int i;
      for (i=0; i<nout; i++)
      {
+        if ( out[i].tok_type==TOK_OR || out[i].tok_type==TOK_OR_VEC )
+            out[i].func = vector_logic_or;
+        if ( out[i].tok_type==TOK_AND || out[i].tok_type==TOK_AND_VEC )
+            out[i].func = vector_logic_and;
          if ( out[i].tok_type==TOK_EQ || out[i].tok_type==TOK_NE )
          {
              // Look for j="." and k numeric type
              int j = i-1, k = i-2;
              if ( !out[j].is_str ) { k = i-1, j = i-2; }
-            if ( out[k].hdr_id>0 && out[j].is_str && out[j].key && !strcmp(".",out[j].key) )
+            if ( out[j].is_str && out[j].key && !strcmp(".",out[j].key) )
              {
-                int type = bcf_hdr_id2type(filter->hdr,out[k].type,out[k].hdr_id);
-                if ( type==BCF_HT_INT ) { out[j].is_str = 0; out[j].is_missing = 1; bcf_double_set_missing(out[j].values[0]); }
-                if ( type==BCF_HT_REAL ) { out[j].is_str = 0; out[j].is_missing = 1; bcf_double_set_missing(out[j].values[0]); }
+                int set_missing = 0;
+                if ( out[k].hdr_id>0 )
+                {
+                    int type = bcf_hdr_id2type(filter->hdr,out[k].type,out[k].hdr_id);
+                    if ( type==BCF_HT_INT ) set_missing = 1;
+                    else if ( type==BCF_HT_REAL ) set_missing = 1;
+                }
+                else if ( !strcmp("QUAL",out[k].tag) ) set_missing = 1;
+                if ( set_missing ) { out[j].is_str = 0; out[j].is_missing = 1; bcf_double_set_missing(out[j].values[0]); }
              }
          }
          if ( out[i].tok_type==TOK_LIKE || out[i].tok_type==TOK_NLIKE )
@@ -1935,10 +2167,15 @@ filter_t *filter_init(bcf_hdr_t *hdr, const char *str)
              else if ( out[i+2].tok_type==TOK_LIKE || out[i+2].tok_type==TOK_NLIKE ) ival = i + 1;
              else error("[%s:%d %s] Could not parse the expression: %s\n",  __FILE__,__LINE__,__FUNCTION__, filter->str);
  
+            if ( !out[ival].key ) error("Comparison between samples is not supported, sorry!\n");
+
              // assign correct setters and unify expressions, eg ar->ra, HOM->hom, etc
              if ( !strcasecmp(out[ival].key,"hom") ) { out[i].setter = filters_set_genotype3;  str_to_lower(out[ival].key); }
              else if ( !strcasecmp(out[ival].key,"het") ) { out[i].setter = filters_set_genotype3;  str_to_lower(out[ival].key); }
              else if ( !strcasecmp(out[ival].key,"hap") ) { out[i].setter = filters_set_genotype3;  str_to_lower(out[ival].key); }
+            else if ( !strcasecmp(out[ival].key,"mis") ) { out[i].setter = filters_set_genotype4;  str_to_lower(out[ival].key); }
+            else if ( !strcasecmp(out[ival].key,"ref") ) { out[i].setter = filters_set_genotype4;  str_to_lower(out[ival].key); }
+            else if ( !strcasecmp(out[ival].key,"alt") ) { out[i].setter = filters_set_genotype4;  str_to_lower(out[ival].key); }
              else if ( !strcasecmp(out[ival].key,"rr") ) { out[i].setter = filters_set_genotype2;  str_to_lower(out[ival].key); }
              else if ( !strcasecmp(out[ival].key,"ra") || !strcasecmp(out[ival].key,"ar") ) { out[i].setter = filters_set_genotype2; out[ival].key[0]='r'; out[ival].key[1]='a'; }   // ra
              else if ( !strcmp(out[ival].key,"aA") || !strcmp(out[ival].key,"Aa") ) { out[i].setter = filters_set_genotype2; out[ival].key[0]='a'; out[ival].key[1]='A'; }   // aA
@@ -1976,18 +2213,19 @@ filter_t *filter_init(bcf_hdr_t *hdr, const char *str)
      filter->nsamples = filter->max_unpack&BCF_UN_FMT ? bcf_hdr_nsamples(filter->hdr) : 0;
      for (i=0; i<nout; i++)
      {
-        if ( out[i].tok_type==TOK_MAX )      { out[i].setter = set_max; out[i].tok_type = TOK_FUNC; }
-        else if ( out[i].tok_type==TOK_MIN ) { out[i].setter = set_min; out[i].tok_type = TOK_FUNC; }
-        else if ( out[i].tok_type==TOK_AVG ) { out[i].setter = set_avg; out[i].tok_type = TOK_FUNC; }
-        else if ( out[i].tok_type==TOK_SUM ) { out[i].setter = set_sum; out[i].tok_type = TOK_FUNC; }
-        else if ( out[i].tok_type==TOK_ABS ) { out[i].setter = set_abs; out[i].tok_type = TOK_FUNC; }
-        else if ( out[i].tok_type==TOK_LEN ) { out[i].setter = set_strlen; out[i].tok_type = TOK_FUNC; }
+        if ( out[i].tok_type==TOK_MAX )      { out[i].func = func_max; out[i].tok_type = TOK_FUNC; }
+        else if ( out[i].tok_type==TOK_MIN ) { out[i].func = func_min; out[i].tok_type = TOK_FUNC; }
+        else if ( out[i].tok_type==TOK_AVG ) { out[i].func = func_avg; out[i].tok_type = TOK_FUNC; }
+        else if ( out[i].tok_type==TOK_SUM ) { out[i].func = func_sum; out[i].tok_type = TOK_FUNC; }
+        else if ( out[i].tok_type==TOK_ABS ) { out[i].func = func_abs; out[i].tok_type = TOK_FUNC; }
+        else if ( out[i].tok_type==TOK_CNT ) { out[i].func = func_count; out[i].tok_type = TOK_FUNC; }
+        else if ( out[i].tok_type==TOK_LEN ) { out[i].func = func_strlen; out[i].tok_type = TOK_FUNC; }
          hts_expand0(double,1,out[i].mvalues,out[i].values);
          if ( filter->nsamples )
          {
              out[i].pass_samples = (uint8_t*)malloc(filter->nsamples);
              int j;
-            for (j=0; j<filter->nsamples; j++) out[i].pass_samples[j] = 1;
+            for (j=0; j<filter->nsamples; j++) out[i].pass_samples[j] = 0;
          }
      }
  
@@ -2009,6 +2247,7 @@ void filter_destroy(filter_t *filter)
          free(filter->filters[i].str_value.s);
          free(filter->filters[i].tag);
          free(filter->filters[i].idxs);
+        free(filter->filters[i].usmpl);
          free(filter->filters[i].values);
          free(filter->filters[i].pass_samples);
          if (filter->filters[i].hash) khash_str2int_destroy_free(filter->filters[i].hash);
@@ -2034,9 +2273,7 @@ int filter_test(filter_t *filter, bcf1_t *line, const uint8_t **samples)
      int i, nstack = 0;
      for (i=0; i<filter->nfilters; i++)
      {
-        filter->filters[i].nsamples  = 0;
-        filter->filters[i].nvalues   = 0;
-        filter->filters[i].pass_site = -1;
+        filter->filters[i].pass_site = 0;
  
          if ( filter->filters[i].tok_type == TOK_VAL )
          {
@@ -2056,9 +2293,11 @@ int filter_test(filter_t *filter, bcf1_t *line, const uint8_t **samples)
              filter->flt_stack[nstack++] = &filter->filters[i];
              continue;
          }
-        else if ( filter->filters[i].tok_type == TOK_FUNC ) // all functions take only one argument
+        else if ( filter->filters[i].func )
          {
-            filter->filters[i].setter(filter, line, filter->flt_stack[nstack-1]);
+            int nargs = filter->filters[i].func(filter, line, &filter->filters[i], filter->flt_stack, nstack);
+            filter->flt_stack[nstack-nargs] = &filter->filters[i];
+            if ( --nargs > 0 ) nstack -= nargs;
              continue;
          }
          if ( nstack<2 )
@@ -2066,111 +2305,76 @@ int filter_test(filter_t *filter, bcf1_t *line, const uint8_t **samples)
  
          int is_str  = filter->flt_stack[nstack-1]->is_str + filter->flt_stack[nstack-2]->is_str;
  
-        if ( filter->filters[i].tok_type == TOK_OR || filter->filters[i].tok_type == TOK_OR_VEC )
-        {
-            if ( filter->flt_stack[nstack-1]->pass_site<0 || filter->flt_stack[nstack-2]->pass_site<0 )
-                error("Error occurred while processing the filter \"%s\" (%d %d OR)\n", filter->str,filter->flt_stack[nstack-2]->pass_site,filter->flt_stack[nstack-1]->pass_site);
-            filter->flt_stack[nstack-2]->pass_site = vector_logic_or(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1], filter->filters[i].tok_type);
-            nstack--;
-            continue;
-        }
-        if ( filter->filters[i].tok_type == TOK_AND || filter->filters[i].tok_type == TOK_AND_VEC )
-        {
-            if ( filter->flt_stack[nstack-1]->pass_site<0 || filter->flt_stack[nstack-2]->pass_site<0 )
-                error("Error occurred while processing the filter \"%s\" (%d %d AND)\n", filter->str,filter->flt_stack[nstack-2]->pass_site,filter->flt_stack[nstack-1]->pass_site);
-            filter->flt_stack[nstack-2]->pass_site = vector_logic_and(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1], filter->filters[i].tok_type);
-            nstack--;
-            continue;
-        }
-
          if ( filter->filters[i].tok_type == TOK_ADD )
          {
-            VECTOR_ARITHMETICS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],+);
+            VECTOR_ARITHMETICS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],+);
+            filter->flt_stack[nstack-2] = &filter->filters[i];
              nstack--;
              continue;
          }
          else if ( filter->filters[i].tok_type == TOK_SUB )
          {
-            VECTOR_ARITHMETICS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],-);
+            VECTOR_ARITHMETICS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],-);
+            filter->flt_stack[nstack-2] = &filter->filters[i];
              nstack--;
              continue;
          }
          else if ( filter->filters[i].tok_type == TOK_MULT )
          {
-            VECTOR_ARITHMETICS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],*);
+            VECTOR_ARITHMETICS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],*);
+            filter->flt_stack[nstack-2] = &filter->filters[i];
              nstack--;
              continue;
          }
          else if ( filter->filters[i].tok_type == TOK_DIV )
          {
-            VECTOR_ARITHMETICS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],/);
+            VECTOR_ARITHMETICS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],/);
+            filter->flt_stack[nstack-2] = &filter->filters[i];
              nstack--;
              continue;
          }
  
-        int is_true = 0;
+        // ideally, these comparators would become func, but this would require more work in init1()
          if ( filter->filters[i].comparator )
-            is_true = filter->filters[i].comparator(filter->flt_stack[nstack-1],filter->flt_stack[nstack-2],filter->filters[i].tok_type,line);
-        else if ( !filter->flt_stack[nstack-1]->nvalues || !filter->flt_stack[nstack-2]->nvalues )
          {
-            int skip = 0;
-            if ( !filter->flt_stack[nstack-2]->is_missing && !filter->flt_stack[nstack-1]->is_missing ) skip = 1;
-            if ( filter->filters[i].tok_type != TOK_EQ  && filter->filters[i].tok_type != TOK_NE ) skip = 1;
-
-            if ( skip ) 
-                filter->flt_stack[nstack-2]->nvalues = filter->flt_stack[nstack-2]->nsamples = 0;
-            else if ( filter->filters[i].tok_type == TOK_EQ )
-                CMP_MISSING(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],==,is_true)
-            else if ( filter->filters[i].tok_type == TOK_NE )
-                CMP_MISSING(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],!=,is_true)
-        }
-        else if ( filter->filters[i].tok_type == TOK_EQ )
-        {
-            if ( filter->flt_stack[nstack-1]->comparator )
-                is_true = filter->flt_stack[nstack-1]->comparator(filter->flt_stack[nstack-1],filter->flt_stack[nstack-2],TOK_EQ,line);
-            else if ( filter->flt_stack[nstack-2]->comparator )
-                is_true = filter->flt_stack[nstack-2]->comparator(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],TOK_EQ,line);
-            else if ( is_str==2 )   // both are strings
-                is_true = cmp_vector_strings(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],TOK_EQ);
-            else if ( is_str==1 )
-                error("Comparing string to numeric value: %s\n", filter->str);
-            else
-                CMP_VECTORS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],==,is_true);
-        }
-        else if ( filter->filters[i].tok_type == TOK_NE )
-        {
-            if ( filter->flt_stack[nstack-1]->comparator )
-                is_true = filter->flt_stack[nstack-1]->comparator(filter->flt_stack[nstack-1],filter->flt_stack[nstack-2],TOK_NE,line);
-            else if ( filter->flt_stack[nstack-2]->comparator )
-                is_true = filter->flt_stack[nstack-2]->comparator(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],TOK_NE,line);
-            else if ( is_str==2 )
-                is_true = cmp_vector_strings(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],TOK_NE);
-            else if ( is_str==1 )
-                error("Comparing string to numeric value: %s\n", filter->str);
-            else
-                CMP_VECTORS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],!=,is_true);
+            filter->filters[i].comparator(filter->flt_stack[nstack-1],filter->flt_stack[nstack-2],&filter->filters[i],line);
          }
-        else if ( filter->filters[i].tok_type == TOK_LIKE || filter->filters[i].tok_type == TOK_NLIKE )
+        else if ( filter->flt_stack[nstack-1]->comparator )
          {
-            if ( is_str==2 )
-                is_true = regex_vector_strings(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1], filter->filters[i].tok_type == TOK_LIKE ? 0 : 1);
-            else
-                error("The regex operator can be used on strings only: %s\n", filter->str);
-        }
-        else if ( is_str>0 )
-            error("Wrong operator in string comparison: %s [%s,%s]\n", filter->str, filter->flt_stack[nstack-1]->str_value, filter->flt_stack[nstack-2]->str_value);
-        else if ( filter->filters[i].tok_type == TOK_LE )
-            CMP_VECTORS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],<=,is_true)
-        else if ( filter->filters[i].tok_type == TOK_LT )
-            CMP_VECTORS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],<,is_true)
-        else if ( filter->filters[i].tok_type == TOK_BT )
-            CMP_VECTORS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],>,is_true)
-        else if ( filter->filters[i].tok_type == TOK_BE )
-            CMP_VECTORS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],>=,is_true)
+            filter->flt_stack[nstack-1]->comparator(filter->flt_stack[nstack-1],filter->flt_stack[nstack-2],&filter->filters[i],line);
+        }
+        else if ( filter->flt_stack[nstack-2]->comparator )
+        {
+            filter->flt_stack[nstack-2]->comparator(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],line);
+        }
+        else if ( is_str==2 )
+        {
+            cmp_vector_strings(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i]);
+        }
          else
-            error("FIXME: did not expect this .. tok_type %d = %d\n", i, filter->filters[i].tok_type);
+        {
+            // Determine what to do with one [1] or both [2] sides missing. The first field [0] gives [1]|[2]
+            int missing_logic[] = {0,0,0};
+            if ( filter->filters[i].tok_type == TOK_EQ ) { missing_logic[0] = missing_logic[2] = 1; }
+            if ( filter->filters[i].tok_type == TOK_NE ) { missing_logic[0] = missing_logic[1] = 1; }
  
-        filter->flt_stack[nstack-2]->pass_site = is_true;
+            if ( filter->filters[i].tok_type == TOK_EQ )
+                CMP_VECTORS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],==,missing_logic)
+            else if ( filter->filters[i].tok_type == TOK_NE )
+                CMP_VECTORS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],!=,missing_logic)
+            else if ( filter->filters[i].tok_type == TOK_LE )
+                CMP_VECTORS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],<=,missing_logic)
+            else if ( filter->filters[i].tok_type == TOK_LT )
+                CMP_VECTORS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],<,missing_logic)
+            else if ( filter->filters[i].tok_type == TOK_BT )
+                CMP_VECTORS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],>,missing_logic)
+            else if ( filter->filters[i].tok_type == TOK_BE )
+                CMP_VECTORS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],>=,missing_logic)
+            else
+                error("todo: %s:%d .. type=%d\n", __FILE__,__LINE__,filter->filters[i].tok_type);
+
+        }
+        filter->flt_stack[nstack-2] = &filter->filters[i];
          nstack--;
      }
      if ( nstack>1 ) error("Error occurred while processing the filter \"%s\" (2:%d)\n", filter->str,nstack);    // too few values left on the stack
diff --git a/bcftools/filter.c.pysam.c b/bcftools/filter.c.pysam.c

index 0beb5926bf64c7ecf3e1ae6d92e321626b713e3f..83f49e0f69932bece89c7632eca48cf9f2fbe41d 100644 (file)
--- a/bcftools/filter.c.pysam.c
+++ b/bcftools/filter.c.pysam.c
@@ -2,7 +2,7 @@
  
  /*  filter.c -- filter expressions.
  
-    Copyright (C) 2013-2015 Genome Research Ltd.
+    Copyright (C) 2013-2018 Genome Research Ltd.
  
      Author: Petr Danecek <pd3@sanger.ac.uk>
  
@@ -69,11 +69,15 @@ typedef struct _token_t
      char *tag;          // for debugging and printout only, VCF tag name
      double threshold;   // filtering threshold
      int hdr_id, type;   // BCF header lookup ID and one of BCF_HT_* types
-    int idx;            // 0-based index to VCF vectors, -1: not a vector, 
+    int idx;            // 0-based index to VCF vectors,
                          //  -2: list (e.g. [0,1,2] or [1..3] or [1..] or any field[*], which is equivalent to [0..])
-    int *idxs, nidxs;   // set indexes to 0 to exclude, to 1 to include, and last element negative if unlimited
+    int *idxs;          // set indexes to 0 to exclude, to 1 to include, and last element negative if unlimited
+    int nidxs, nuidxs;  // size of idxs array and the number of elements set to 1
+    uint8_t *usmpl;     // bitmask of used samples as set by idx
+    int nsamples;       // number of samples for format fields, 0 for info and other fields
      void (*setter)(filter_t *, bcf1_t *, struct _token_t *);
-    int (*comparator)(struct _token_t *, struct _token_t *, int op_type, bcf1_t *);
+    int (*func)(filter_t *, bcf1_t *, struct _token_t *rtok, struct _token_t **stack, int nstack);
+    void (*comparator)(struct _token_t *, struct _token_t *, struct _token_t *rtok, bcf1_t *);
      void *hash;         // test presence of str value in the hash via comparator
      regex_t *regex;     // precompiled regex for string comparison
  
@@ -83,9 +87,8 @@ typedef struct _token_t
      int is_str, is_missing; // is_missing is set only for constants, variables are controled via nvalues
      int pass_site;          // -1 not applicable, 0 fails, >0 pass
      uint8_t *pass_samples;  // status of individual samples
-    int nsamples;           // number of samples
      int nvalues, mvalues;   // number of used values: n=0 for missing values, n=1 for scalars, for strings n=str_value.l
-    int nstr1;              // per-sample string length, set only with str_value.l>0 && nsamples>1
+    int nval1;              // number of per-sample fields or string length
  }
  token_t;
  
@@ -128,10 +131,11 @@ struct _filter_t
  #define TOK_ABS     23
  #define TOK_LEN     24
  #define TOK_FUNC    25
+#define TOK_CNT     26
  
-//                      0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
-//                        ( ) [ < = > ] ! | &  +  -  *  /  M  m  a  A  O  ~  ^  S  .  l
-static int op_prec[] = {0,1,1,5,5,5,5,5,5,2,3, 6, 6, 7, 7, 8, 8, 8, 3, 2, 5, 5, 8, 8, 8};
+//                      0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26
+//                        ( ) [ < = > ] ! | &  +  -  *  /  M  m  a  A  O  ~  ^  S  .  l  
+static int op_prec[] = {0,1,1,5,5,5,5,5,5,2,3, 6, 6, 7, 7, 8, 8, 8, 3, 2, 5, 5, 8, 8, 8, 8, 8};
  #define TOKEN_STRING "x()[<=>]!|&+-*/MmaAO~^f"
  
  static int filters_next_token(char **str, int *len)
@@ -158,6 +162,7 @@ static int filters_next_token(char **str, int *len)
      if ( !strncasecmp(tmp,"AVG(",4) ) { (*str) += 3; return TOK_AVG; }
      if ( !strncasecmp(tmp,"SUM(",4) ) { (*str) += 3; return TOK_SUM; }
      if ( !strncasecmp(tmp,"ABS(",4) ) { (*str) += 3; return TOK_ABS; }
+    if ( !strncasecmp(tmp,"COUNT(",4) ) { (*str) += 5; return TOK_CNT; }
      if ( !strncasecmp(tmp,"STRLEN(",7) ) { (*str) += 6; return TOK_LEN; }
      if ( !strncasecmp(tmp,"%MAX(",5) ) { (*str) += 4; return TOK_MAX; } // for backward compatibility
      if ( !strncasecmp(tmp,"%MIN(",5) ) { (*str) += 4; return TOK_MIN; } // for backward compatibility
@@ -251,12 +256,10 @@ static void filters_set_qual(filter_t *flt, bcf1_t *line, token_t *tok)
  {
      float *ptr = &line->qual;
      if ( bcf_float_is_missing(*ptr) )
-        tok->nvalues = 0;
+        bcf_double_set_missing(tok->values[0]);
      else
-    {
          tok->values[0] = (double)line->qual;
-        tok->nvalues = 1;
-    }
+    tok->nvalues  = 1;
  }
  static void filters_set_type(filter_t *flt, bcf1_t *line, token_t *tok)
  {
@@ -273,7 +276,7 @@ static void filters_set_info(filter_t *flt, bcf1_t *line, token_t *tok)
          if ( line->d.info[i].key == tok->hdr_id ) break;
  
      if ( i==line->n_info )
-        tok->nvalues = 0;
+        tok->nvalues = tok->str_value.l = 0;
      else if ( line->d.info[i].type==BCF_BT_CHAR )
      {
          int n = line->d.info[i].len;
@@ -310,40 +313,51 @@ static void filters_set_info(filter_t *flt, bcf1_t *line, token_t *tok)
          }
      }
  }
-static int filters_cmp_bit_and(token_t *atok, token_t *btok, int op_type, bcf1_t *line)
+static void filters_cmp_bit_and(token_t *atok, token_t *btok, token_t *rtok, bcf1_t *line)
  {
      int a = (int)(atok->nvalues?atok->values[0]:atok->threshold);
      int b = (int)(btok->nvalues?btok->values[0]:btok->threshold);
-    if ( op_type==TOK_LIKE ) return a&b ? 1 : 0;
-    return a&b ? 0 : 1;
+    if ( rtok->tok_type==TOK_LIKE )
+        rtok->pass_site = a&b ? 1 : 0;
+    else
+        rtok->pass_site = a&b ? 0 : 1;
  }
-static int filters_cmp_filter(token_t *atok, token_t *btok, int op_type, bcf1_t *line)
+static void filters_cmp_filter(token_t *atok, token_t *btok, token_t *rtok, bcf1_t *line)
  {
      int i;
-    if ( op_type==TOK_NE )  // AND logic: none of the filters can match
+    if ( rtok->tok_type==TOK_NE )  // AND logic: none of the filters can match
      {
          if ( !line->d.n_flt )
          {
-            if ( atok->hdr_id==-1 ) return 0;   // missing value
-            return 1; // no filter present, eval to true
+            if ( atok->hdr_id==-1 ) return;   // missing value
+            rtok->pass_site = 1;
+            return; // no filter present, eval to true
          }
          for (i=0; i<line->d.n_flt; i++)
-            if ( atok->hdr_id==line->d.flt[i] ) return 0;
-        return 1;
+            if ( atok->hdr_id==line->d.flt[i] ) return;
+        rtok->pass_site = 1;
+        return;
      }
-    // TOK_EQ with OR logic: at least one of the filters must match
-    if ( !line->d.n_flt )
+    else if ( rtok->tok_type==TOK_EQ ) // OR logic: at least one of the filters must match
      {
-        if ( atok->hdr_id==-1 ) return 1;
-        return 0; // no filter present, eval to false
+        if ( !line->d.n_flt )
+        {
+            if ( atok->hdr_id==-1 ) { rtok->pass_site = 1; return; }
+            return; // no filter present, eval to false
+        }
+        for (i=0; i<line->d.n_flt; i++)
+            if ( atok->hdr_id==line->d.flt[i] ) { rtok->pass_site = 1; return; }
+        return;
      }
-    for (i=0; i<line->d.n_flt; i++)
-        if ( atok->hdr_id==line->d.flt[i] ) return 1;
-    return 0;
+    else 
+        error("Only == and != operators are supported for FILTER\n");
+    return;
  }
-static int filters_cmp_id(token_t *atok, token_t *btok, int op_type, bcf1_t *line)
+static void filters_cmp_id(token_t *atok, token_t *btok, token_t *rtok, bcf1_t *line)
  {
      // multiple IDs not supported yet (easy to add though)
+    if ( rtok->tok_type!=TOK_EQ && rtok->tok_type!=TOK_NE )
+        error("Only == and != operators are supported for ID\n");
  
      if ( btok->hash )
      {
@@ -352,12 +366,15 @@ static int filters_cmp_id(token_t *atok, token_t *btok, int op_type, bcf1_t *lin
      if ( atok->hash )
      {
          int ret = khash_str2int_has_key(atok->hash, line->d.id);
-        if ( op_type==TOK_EQ ) return ret;
-        return ret ? 0 : 1;
+        if ( rtok->tok_type==TOK_NE ) ret = ret ? 0 : 1;
+        rtok->pass_site = ret;
+        return;
      }
  
-    if ( op_type==TOK_EQ ) return strcmp(btok->str_value.s,line->d.id) ? 0 : 1;
-    return strcmp(btok->str_value.s,line->d.id) ? 1 : 0;
+    if ( rtok->tok_type==TOK_EQ ) 
+        rtok->pass_site = strcmp(btok->str_value.s,line->d.id) ? 0 : 1;
+    else
+        rtok->pass_site = strcmp(btok->str_value.s,line->d.id) ? 1 : 0;
  }
  
  /**
@@ -433,7 +450,7 @@ static void filters_set_info_int(filter_t *flt, bcf1_t *line, token_t *tok)
      }
      else
      {
-        int64_t value;
+        int64_t value = 0;
          if ( bcf_get_info_value(line,tok->hdr_id,tok->idx,&value) <= 0 )
              tok->nvalues = 0;
          else
@@ -512,14 +529,13 @@ static void filters_set_info_string(filter_t *flt, bcf1_t *line, token_t *tok)
      {
          flt->tmps.l = 0;
          ks_resize(&flt->tmps, n);
-        int i, end = tok->idxs[tok->nidxs-1] < 0 ? n - 1 : tok->nidxs - 1;
-        if ( end >= n ) end = n - 1;
+        int i, iend = tok->idxs[tok->nidxs-1] < 0 ? n - 1 : tok->nidxs - 1;
+        if ( iend >= n ) iend = n - 1;
          char *beg = tok->str_value.s, *dst = flt->tmps.s;
-        for (i=0; i<=end; i++)
+        for (i=0; i<=iend; i++)
          {
              char *end = beg;
              while ( *end && *end!=',' ) end++;
-
              if ( i>=tok->nidxs || tok->idxs[i] )
              {
                  memcpy(dst, beg, end - beg);
@@ -527,7 +543,6 @@ static void filters_set_info_string(filter_t *flt, bcf1_t *line, token_t *tok)
                  dst[0] = ',';
                  dst++;
              }
-
              beg = end+1;
          }
          dst[0] = 0;
@@ -551,190 +566,161 @@ static void filters_set_info_flag(filter_t *flt, bcf1_t *line, token_t *tok)
  
  static void filters_set_format_int(filter_t *flt, bcf1_t *line, token_t *tok)
  {
-    int i;
-    if ( (tok->nvalues=bcf_get_format_int32(flt->hdr,line,tok->tag,&flt->tmpi,&flt->mtmpi))<0 )
+    if ( line->n_sample != tok->nsamples )
+        error("Incorrect number of FORMAT fields at %s:%d .. %s, %d vs %d\n", bcf_seqname(flt->hdr,line),line->pos+1,tok->tag,line->n_sample,tok->nsamples);
+
+    int nvals;
+    if ( (nvals=bcf_get_format_int32(flt->hdr,line,tok->tag,&flt->tmpi,&flt->mtmpi))<0 )
      {
-        tok->nvalues = tok->nsamples = 0;
+        tok->nvalues = 0;
          return;
      }
-    if ( tok->idx >= -1 )    // scalar or vector index
+    int i, nsrc1 = nvals / tok->nsamples;
+    tok->nval1 = tok->idx >= 0 ? 1 : (tok->nuidxs ? tok->nuidxs : nsrc1);
+    tok->nvalues = tok->nval1*tok->nsamples;
+    hts_expand(double, tok->nvalues, tok->mvalues, tok->values);
+
+    if ( tok->idx >= 0 )    // scalar or vector index
      {
-        hts_expand(double,flt->nsamples,tok->mvalues,tok->values);
-        int nvals = tok->nvalues / line->n_sample;
-        int idx = tok->idx >= 0 ? tok->idx : 0;
-        int is_missing = 1;
-        int32_t *ptr = flt->tmpi;
-        for (i=0; i<line->n_sample; i++)
+        for (i=0; i<tok->nsamples; i++)
          {
-            if ( ptr[idx]==bcf_int32_missing || ptr[idx]==bcf_int32_vector_end ) 
+            if ( !tok->usmpl[i] ) continue;
+            int32_t *ptr = flt->tmpi + i*nsrc1;
+            if ( tok->idx>=nsrc1 || ptr[tok->idx]==bcf_int32_missing || ptr[tok->idx]==bcf_int32_vector_end )
                  bcf_double_set_missing(tok->values[i]);
              else
-            {
-                tok->values[i] = ptr[idx];
-                is_missing = 0;
-            }
-            ptr += nvals;
+                tok->values[i] = ptr[tok->idx];
          }
-        if ( is_missing ) tok->nvalues = 0;
-        else tok->nvalues = line->n_sample;
-        tok->nsamples = tok->nvalues;
-        return;
      }
-    if ( tok->idx == -2 )
+    else
      {
-        hts_expand(double,tok->nvalues,tok->mvalues,tok->values);
-        int nvals = tok->nvalues / line->n_sample;
-        int idx = tok->idx >= 0 ? tok->idx : 0;
-        int is_missing = 1;
-        int k, j = 0, end = tok->idxs[tok->nidxs-1] < 0 ? nvals - 1 : tok->nidxs - 1;
-        if ( end >= nvals ) end = nvals - 1;
-        int32_t *ptr = flt->tmpi;
-        for (i=0; i<line->n_sample; i++)
+        int kend = tok->idxs[tok->nidxs-1] < 0 ? tok->nval1 : tok->nidxs;
+        for (i=0; i<tok->nsamples; i++)
          {
-            for (k=0; k<=end; k++)
-                if ( k>=tok->nidxs || tok->idxs[k] )
-                {
-                    if ( ptr[k]==bcf_int32_missing || ptr[k]==bcf_int32_vector_end )
-                        bcf_double_set_missing(tok->values[j]);
-                    else
-                    {
-                        tok->values[j] = ptr[k];
-                        is_missing = 0;
-                    }
-                    j++;
-                }
-            ptr += nvals;
-        }
-        if ( is_missing ) tok->nvalues = tok->nsamples = 0;
-        else
-        {
-            tok->nsamples = line->n_sample;
-            tok->nvalues = j;
+            if ( !tok->usmpl[i] ) continue;
+            int32_t *src = flt->tmpi + i*nsrc1;
+            double  *dst = tok->values + i*tok->nval1;
+            int k, j = 0;
+            for (k=0; k<kend; k++)
+            {
+                if ( k<tok->nidxs && !tok->idxs[k] ) continue;
+                if ( src[k]==bcf_int32_missing || src[k]==bcf_int32_vector_end )
+                    bcf_double_set_missing(dst[j]);
+                else
+                    dst[j] = src[k];
+                j++;
+            }
+            while (j < tok->nval1)
+            {
+                bcf_double_set_missing(dst[j]);
+                j++;
+            }
          }
-        return;
      }
  }
  static void filters_set_format_float(filter_t *flt, bcf1_t *line, token_t *tok)
  {
-    int i;
-    if ( (tok->nvalues=bcf_get_format_float(flt->hdr,line,tok->tag,&flt->tmpf,&flt->mtmpf))<0 )
+    if ( line->n_sample != tok->nsamples )
+        error("Incorrect number of FORMAT fields at %s:%d .. %s, %d vs %d\n", bcf_seqname(flt->hdr,line),line->pos+1,tok->tag,line->n_sample,tok->nsamples);
+
+    int nvals;
+    if ( (nvals=bcf_get_format_float(flt->hdr,line,tok->tag,&flt->tmpf,&flt->mtmpf))<0 )
      {
-        tok->nvalues = tok->nsamples = 0;
+        tok->nvalues = 0;
          return;
      }
-    if ( tok->idx >= -1 )    // scalar or vector index
+    int i, nsrc1 = nvals / tok->nsamples;
+    tok->nval1 = tok->idx >= 0 ? 1 : (tok->nuidxs ? tok->nuidxs : nsrc1);
+    tok->nvalues = tok->nval1*tok->nsamples;
+    hts_expand(double, tok->nvalues, tok->mvalues, tok->values);
+
+    if ( tok->idx >= 0 )    // scalar or vector index
      {
-        hts_expand(double,flt->nsamples,tok->mvalues,tok->values);
-        int nvals = tok->nvalues / line->n_sample;
-        int idx = tok->idx >= 0 ? tok->idx : 0;
-        int is_missing = 1;
-        float *ptr = flt->tmpf;
-        for (i=0; i<line->n_sample; i++)
+        for (i=0; i<tok->nsamples; i++)
          {
-            if ( bcf_float_is_missing(ptr[idx]) || bcf_float_is_vector_end(ptr[idx]) )
+            if ( !tok->usmpl[i] ) continue;
+            float *ptr = flt->tmpf + i*nsrc1;
+            if ( tok->idx>=nsrc1 || bcf_float_is_missing(ptr[tok->idx]) || bcf_float_is_vector_end(ptr[tok->idx]) )
                  bcf_double_set_missing(tok->values[i]);
              else
-            {
-                tok->values[i] = ptr[idx];
-                is_missing = 0;
-            }
-            ptr += nvals;
+                tok->values[i] = ptr[tok->idx];
          }
-        if ( is_missing ) tok->nvalues = 0;
-        else tok->nvalues = line->n_sample;
-        tok->nsamples = tok->nvalues;
-        return;
      }
-    if ( tok->idx == -2 )
+    else
      {
-        hts_expand(double,tok->nvalues,tok->mvalues,tok->values);
-        int nvals = tok->nvalues / line->n_sample;
-        int idx = tok->idx >= 0 ? tok->idx : 0;
-        int is_missing = 1;
-        int k, j = 0, end = tok->idxs[tok->nidxs-1] < 0 ? nvals - 1 : tok->nidxs - 1;
-        if ( end >= nvals ) end = nvals - 1;
-        float *ptr = flt->tmpf;
-        for (i=0; i<line->n_sample; i++)
+        int kend = tok->idxs[tok->nidxs-1] < 0 ? tok->nval1 : tok->nidxs;
+        for (i=0; i<tok->nsamples; i++)
          {
-            for (k=0; k<=end; k++)
-                if ( k>=tok->nidxs || tok->idxs[k] )
-                {
-                    if ( bcf_float_is_missing(ptr[k]) || bcf_float_is_vector_end(ptr[k]) )
-                        bcf_double_set_missing(tok->values[j]);
-                    else
-                    {
-                        tok->values[j] = ptr[k];
-                        is_missing = 0;
-                    }
-                    j++;
-                }
-            ptr += nvals;
-        }
-        if ( is_missing ) tok->nvalues = tok->nsamples = 0;
-        else
-        {
-            tok->nsamples = line->n_sample;
-            tok->nvalues = j;
+            if ( !tok->usmpl[i] ) continue;
+            float *src = flt->tmpf + i*nsrc1;
+            double  *dst = tok->values + i*tok->nval1;
+            int k, j = 0;
+            for (k=0; k<kend; k++)
+            {
+                if ( k<tok->nidxs && !tok->idxs[k] ) continue;
+                if ( bcf_float_is_missing(src[k]) || bcf_float_is_vector_end(src[k]) )
+                    bcf_double_set_missing(dst[j]);
+                else
+                    dst[j] = src[k];
+                j++;
+            }
+            while (j < tok->nval1)
+            {
+                bcf_double_set_missing(dst[j]);
+                j++;
+            }
          }
-        return;
      }
  }
  static void filters_set_format_string(filter_t *flt, bcf1_t *line, token_t *tok)
  {
-    tok->str_value.l = tok->nvalues = 0;
-    if ( !line->n_sample ) return;
+    if ( line->n_sample != tok->nsamples )
+        error("Incorrect number of FORMAT fields at %s:%d .. %s, %d vs %d\n", bcf_seqname(flt->hdr,line),line->pos+1,tok->tag,line->n_sample,tok->nsamples);
  
-    int ndim = tok->str_value.m;
+    int i, ndim = tok->str_value.m;
      int nstr = bcf_get_format_char(flt->hdr, line, tok->tag, &tok->str_value.s, &ndim);
      tok->str_value.m = ndim;
+    tok->str_value.l = tok->nvalues = 0;
  
-    if ( nstr<=0 ) return;
-
-    if ( tok->idx == -1 || (tok->idx==-2 && tok->idxs[0]==-1) ) // scalar or keep all values of a vector: TAG[*]
-    {
-        tok->nsamples = line->n_sample;
-        tok->nstr1 = ndim / line->n_sample;
-        tok->nvalues = tok->str_value.l = nstr;
-        return;
-    }
-
-    int nstr1 = nstr / line->n_sample;
+    if ( nstr<0 ) return;
  
-    // vector, one or multiple indices
-    int i;
-    for (i=0; i<line->n_sample; i++)
+    tok->nvalues = tok->str_value.l = nstr;
+    tok->nval1   = nstr / tok->nsamples;
+    for (i=0; i<tok->nsamples; i++)
      {
-        char *dst = tok->str_value.s + i*nstr1, *str = dst;
-        int nval = 0, ibeg = 0;
-        while ( ibeg < nstr1 )
+        if ( !tok->usmpl[i] ) continue;
+        char *src = tok->str_value.s + i*tok->nval1, *dst = src;
+        int ibeg = 0, idx = 0;
+        while ( ibeg < tok->nval1 )
          {
-            int iend = ibeg + 1;
-            while ( iend < nstr1 && str[iend] && str[iend]!=',' ) iend++;
+            int iend = ibeg;
+            while ( iend < tok->nval1 && src[iend] && src[iend]!=',' ) iend++;
  
              int keep = 0;
-            if ( tok->idx >=0 )
-                keep = tok->idx==nval ? 1 : 0;
-            else if ( nval < tok->nidxs )
-                keep = tok->idxs[nval] ? 1 : 0;
+            if ( tok->idx >= 0 )
+            {
+                if ( tok->idx==idx ) keep = 1;
+            }
+            else if ( idx < tok->nidxs )
+            {
+                if ( tok->idxs[idx] != 0 ) keep = 1;
+            }
              else if ( tok->idxs[tok->nidxs-1] < 0 )
                  keep = 1;
  
              if ( keep )
              {
-                if ( ibeg>0 ) memmove(dst, str+ibeg, iend-ibeg+1);
+                if ( ibeg!=0 ) memmove(dst, src+ibeg, iend-ibeg+1);
                  dst += iend - ibeg + 1;
                  if ( tok->idx>=0 ) break;
              }
-            if ( !str[iend] ) break;
+            if ( !src[iend] ) break;
              ibeg = iend + 1;
-            nval++;
+            idx++;
          }
-        if ( dst==str ) { dst[0] = '.'; dst+=2; }
-        if ( dst - str < nstr1 ) memset(dst-1, 0, nstr1 - (dst - str));
+        if ( dst==src ) { dst[0] = '.'; dst+=2; }
+        if ( dst - src < tok->nval1 ) memset(dst-1, 0, tok->nval1 - (dst - src));
      }
-    tok->nvalues = tok->str_value.l = nstr;
-    tok->nstr1 = nstr1;
-    tok->nsamples = line->n_sample;
  }
  static void _filters_set_genotype(filter_t *flt, bcf1_t *line, token_t *tok, int type)
  {
@@ -745,10 +731,10 @@ static void _filters_set_genotype(filter_t *flt, bcf1_t *line, token_t *tok, int
          return;
      }
      
-    int i,j, nsmpl = bcf_hdr_nsamples(flt->hdr), nvals = type==2 ? 3 : 4;
-    if ( tok->str_value.m <= nvals*nsmpl )
+    int i,j, nsmpl = bcf_hdr_nsamples(flt->hdr), nvals1 = type==2 ? 3 : 4;
+    if ( tok->str_value.m <= nvals1*nsmpl )
      {
-        tok->str_value.m = nvals*nsmpl + 1;
+        tok->str_value.m = nvals1*nsmpl + 1;
          tok->str_value.s = (char*)realloc(tok->str_value.s, tok->str_value.m);
      }
  
@@ -770,8 +756,15 @@ static void _filters_set_genotype(filter_t *flt, bcf1_t *line, token_t *tok, int
                      if ( bcf_gt_allele(ial)!=bcf_gt_allele(jal) ) is_het = 1; \
                  } \
              } \
-            char *dst = &tok->str_value.s[nvals*i]; \
-            if ( !j || missing ) dst[0]='.', dst[1]=0; /* ., missing genotype */ \
+            char *dst = &tok->str_value.s[nvals1*i]; \
+            if ( type==4 ) \
+            { \
+                if ( !j || missing ) dst[0]='m', dst[1]='i', dst[2]='s', dst[3] = 0; /* mis, missing genotype */ \
+                else if ( !has_ref ) dst[0]='a', dst[1]='l', dst[2]='t', dst[3] = 0; /* alt, no ref, must have alt allele */ \
+                else if ( !is_het ) dst[0]='r', dst[1]='e', dst[2]='f', dst[3] = 0; /* ref, must be ref-only, no alt alelle */ \
+                else dst[0]='a', dst[1]='l', dst[2]='t', dst[3] = 0; /* alt, is het, has alt allele */ \
+            } \
+            else if ( !j || missing ) dst[0]='.', dst[1]=0; /* ., missing genotype */ \
              else if ( type==3 ) \
              { \
                  if ( j==1 ) dst[0]='h', dst[1]='a', dst[2]='p', dst[3] = 0; /* hap, haploid */ \
@@ -805,31 +798,30 @@ static void _filters_set_genotype(filter_t *flt, bcf1_t *line, token_t *tok, int
          default: error("The GT type is not lineognised: %d at %s:%d\n",fmt->type, bcf_seqname(flt->hdr,line),line->pos+1); break;
      }
  #undef BRANCH_INT
-    tok->nsamples = nsmpl;
-    tok->nvalues = tok->str_value.l = nvals*nsmpl;
+    assert( tok->nsamples == nsmpl );
+    tok->nvalues = tok->str_value.l = nvals1*nsmpl;
      tok->str_value.s[tok->str_value.l] = 0;
-    tok->nstr1 = nvals;
+    tok->nval1 = nvals1;
  }
  static void filters_set_genotype2(filter_t *flt, bcf1_t *line, token_t *tok) { _filters_set_genotype(flt, line, tok, 2); }
  static void filters_set_genotype3(filter_t *flt, bcf1_t *line, token_t *tok) { _filters_set_genotype(flt, line, tok, 3); }
+static void filters_set_genotype4(filter_t *flt, bcf1_t *line, token_t *tok) { _filters_set_genotype(flt, line, tok, 4); }
  
  static void filters_set_genotype_string(filter_t *flt, bcf1_t *line, token_t *tok)
  {
      bcf_fmt_t *fmt = bcf_get_fmt(flt->hdr, line, "GT");
      if ( !fmt )
      {
-        tok->nvalues = tok->nsamples = 0;
+        tok->nvalues = 0;
          return;
      }
-    int i, blen = 4, nsmpl = bcf_hdr_nsamples(flt->hdr);
-    kstring_t str;
+    int i, blen = 4, nsmpl = line->n_sample;
  
  gt_length_too_big:
      tok->str_value.l = 0;
      for (i=0; i<nsmpl; i++)
      {
-        int plen = tok->str_value.l;
-
+        size_t plen = tok->str_value.l;
          bcf_format_gt(fmt, i, &tok->str_value);
          kputc_(0, &tok->str_value);
          if ( tok->str_value.l - plen > blen )
@@ -847,9 +839,9 @@ gt_length_too_big:
              plen++;
          }
      }
-    tok->nsamples = nsmpl;
+    assert( tok->nsamples == nsmpl );
      tok->nvalues = tok->str_value.l;
-    tok->nstr1 = blen;
+    tok->nval1 = blen;
  }
  static void filters_set_ref_string(filter_t *flt, bcf1_t *line, token_t *tok)
  {
@@ -870,7 +862,7 @@ static void filters_set_alt_string(filter_t *flt, bcf1_t *line, token_t *tok)
      }
      else if ( tok->idx==-2 )
      {
-        int i, j = 0, end = tok->idxs[tok->nidxs-1] < 0 ? line->n_allele - 1 : tok->nidxs - 1;
+        int i, end = tok->nuidxs ? tok->nuidxs : line->n_allele - 1;
          if ( end >= line->n_allele - 1 ) end = line->n_allele - 2;
          for (i=0; i<=end; i++)
              if ( i>=tok->nidxs || tok->idxs[i] )
@@ -1000,59 +992,111 @@ static void filters_set_maf(filter_t *flt, bcf1_t *line, token_t *tok)
      }
  }
  
-static void set_max(filter_t *flt, bcf1_t *line, token_t *tok)
+static int func_max(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
  {
+    token_t *tok = stack[nstack - 1];
+    rtok->nvalues = 0;
+    if ( !tok->nvalues ) return 1;
      double val = -HUGE_VAL;
-    int i;
+    int i, has_value = 0;
      for (i=0; i<tok->nvalues; i++)
      {
-        if ( !bcf_double_is_missing(tok->values[i]) && val < tok->values[i] ) val = tok->values[i];
+        if ( bcf_double_is_missing(tok->values[i]) || bcf_double_is_vector_end(tok->values[i]) ) continue;
+        has_value = 1;
+        if ( val < tok->values[i] ) val = tok->values[i];
+    }
+    if ( has_value )
+    {
+        rtok->values[0] = val;
+        rtok->nvalues = has_value;
      }
-    tok->values[0] = val;
-    tok->nvalues   = 1;
-    tok->nsamples  = 0;
+    return 1;
  }
-static void set_min(filter_t *flt, bcf1_t *line, token_t *tok)
+static int func_min(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
  {
+    token_t *tok = stack[nstack - 1];
+    rtok->nvalues = 0;
+    if ( !tok->nvalues ) return 1;
      double val = HUGE_VAL;
-    int i;
+    int i, has_value = 0;
      for (i=0; i<tok->nvalues; i++)
-        if ( !bcf_double_is_missing(tok->values[i]) && val > tok->values[i] ) val = tok->values[i];
-    tok->values[0] = val;
-    tok->nvalues   = 1;
-    tok->nsamples  = 0;
+    {
+        if ( bcf_double_is_missing(tok->values[i]) || bcf_double_is_vector_end(tok->values[i]) ) continue;
+        has_value = 1;
+        if ( val > tok->values[i] ) val = tok->values[i];
+    }
+    if ( has_value )
+    {
+        rtok->values[0] = val;
+        rtok->nvalues = has_value;
+    }
+    return 1;
  }
-static void set_avg(filter_t *flt, bcf1_t *line, token_t *tok)
+static int func_avg(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
  {
+    token_t *tok = stack[nstack - 1];
+    rtok->nvalues = 0;
+    if ( !tok->nvalues ) return 1;
      double val = 0;
      int i, n = 0;
      for (i=0; i<tok->nvalues; i++)
          if ( !bcf_double_is_missing(tok->values[i]) ) { val += tok->values[i]; n++; }
-    tok->values[0] = n ? val / n : 0;
-    tok->nvalues   = 1;
-    tok->nsamples  = 0;
+    if ( n )
+    {
+        rtok->values[0] = val / n;
+        rtok->nvalues   = 1;
+    }
+    return 1;
  }
-static void set_sum(filter_t *flt, bcf1_t *line, token_t *tok)
+static int func_sum(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
  {
+    rtok->nvalues = 0;
+    token_t *tok = stack[nstack - 1];
+    if ( !tok->nvalues ) return 1;
      double val = 0;
      int i, n = 0;
      for (i=0; i<tok->nvalues; i++)
          if ( !bcf_double_is_missing(tok->values[i]) ) { val += tok->values[i]; n++; }
-    tok->values[0] = val;
-    tok->nvalues   = 1;
-    tok->nsamples  = 0;
+    if ( n )
+    {
+        rtok->values[0] = val;
+        rtok->nvalues   = 1;
+    }
+    return 1;
  }
-static void set_abs(filter_t *flt, bcf1_t *line, token_t *tok)
+static int func_abs(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
  {
+    token_t *tok = stack[nstack - 1];
      if ( tok->is_str ) error("ABS() can be applied only on numeric values\n");
+
+    rtok->nvalues = tok->nvalues;
+    if ( !tok->nvalues ) return 1;
+    hts_expand(double, rtok->nvalues, rtok->mvalues, rtok->values);
      int i;
      for (i=0; i<tok->nvalues; i++)
-        tok->values[i] = fabs(tok->values[i]);
+        if ( bcf_double_is_missing(tok->values[i]) ) bcf_double_set_missing(rtok->values[i]);
+        else rtok->values[i] = fabs(tok->values[i]);
+    return 1;
+}
+static int func_count(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
+{
+    token_t *tok = stack[nstack - 1];
+    if ( !tok->nsamples ) error("COUNT() can be applied only on FORMAT fields\n");
+
+    int i, cnt = 0;
+    for (i=0; i<tok->nsamples; i++)
+        if ( tok->pass_samples[i] ) cnt++;
+
+    rtok->nvalues = 1;
+    rtok->values[0] = cnt;
+    return 1;
  }
-static void set_strlen(filter_t *flt, bcf1_t *line, token_t *tok)
+static int func_strlen(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
  {
-    tok->is_str = 0;
-    if ( !tok->str_value.l ) return;
+    token_t *tok = stack[nstack - 1];
+    rtok->nvalues = rtok->str_value.l = 0;
+    if ( !tok->str_value.l ) return 1;
+
      if ( tok->idx==-2 )
      {
          int i = 0;
@@ -1061,202 +1105,193 @@ static void set_strlen(filter_t *flt, bcf1_t *line, token_t *tok)
          {
              char *se = ss;
              while ( *se && *se!=',' ) se++;
-            hts_expand(double, i+1, tok->mvalues, tok->values);
-            if ( !*se ) tok->values[i] = strlen(ss);
+            hts_expand(double, i+1, rtok->mvalues, rtok->values);
+            if ( !*se ) rtok->values[i] = strlen(ss);
              else
              {
                  *se = 0;
-                tok->values[i] = strlen(ss);
+                rtok->values[i] = strlen(ss);
                  *se = ',';
              }
              ss = *se ? se + 1 : se;
              i++;
          }
-        tok->nvalues = i;
+        rtok->nvalues = i;
      }
      else
      {
-        tok->values[0] = strlen(tok->str_value.s);
-        tok->nvalues = 1;
+        rtok->values[0] = strlen(tok->str_value.s);
+        rtok->nvalues = 1;
      }
-    tok->str_value.l = 0;
+    return 1;
+}
+inline static void tok_init_values(token_t *atok, token_t *btok, token_t *rtok)
+{
+    token_t *tok = atok->nvalues > btok->nvalues ? atok : btok;
+    rtok->nvalues = tok->nvalues;
+    rtok->nval1   = tok->nval1;
+    hts_expand(double*, rtok->nvalues, rtok->mvalues, rtok->values);
  }
-#define VECTOR_ARITHMETICS(atok,btok,AOP) \
+inline static void tok_init_samples(token_t *atok, token_t *btok, token_t *rtok)
+{
+    if ( (atok->nsamples || btok->nsamples) && !rtok->nsamples )
+    {
+        rtok->nsamples = atok->nsamples ? atok->nsamples : btok->nsamples;
+        rtok->usmpl = (uint8_t*) calloc(rtok->nsamples,1); 
+        int i;
+        for (i=0; i<atok->nsamples; i++) rtok->usmpl[i] |= atok->usmpl[i];
+        for (i=0; i<btok->nsamples; i++) rtok->usmpl[i] |= btok->usmpl[i];
+    }
+    memset(rtok->pass_samples, 0, rtok->nsamples);
+}
+
+#define VECTOR_ARITHMETICS(atok,btok,_rtok,AOP) \
  { \
+    token_t *rtok = _rtok; \
      int i, has_values = 0; \
-    if ( !(atok)->nvalues || !(btok)->nvalues ) /* missing values */ \
+    if ( atok->nvalues && btok->nvalues ) \
      { \
-        (atok)->nvalues = 0; (atok)->nsamples = 0; \
-    } \
-    else \
-    { \
-        if ( ((atok)->nsamples && (btok)->nsamples) || (!(atok)->nsamples && !(btok)->nsamples)) \
-        { \
-            for (i=0; i<(atok)->nvalues; i++) \
-            { \
-                if ( bcf_double_is_missing((atok)->values[i]) ) continue; \
-                if ( bcf_double_is_missing((btok)->values[i]) ) { bcf_double_set_missing((atok)->values[i]); continue; } \
-                has_values = 1; \
-                (atok)->values[i] = (atok)->values[i] AOP (btok)->values[i]; \
-            } \
-        } \
-        else if ( (btok)->nsamples ) \
+        tok_init_values(atok, btok, rtok); \
+        tok_init_samples(atok, btok, rtok); \
+        if ( (atok->nsamples && btok->nsamples) || (!atok->nsamples && !btok->nsamples)) \
          { \
-            hts_expand(double,(btok)->nvalues,(atok)->mvalues,(atok)->values); \
-            for (i=0; i<(btok)->nvalues; i++) \
+            assert( atok->nsamples==btok->nsamples ); \
+            for (i=0; i<atok->nvalues; i++) \
              { \
-                if ( bcf_double_is_missing((atok)->values[0]) || bcf_double_is_missing((btok)->values[i]) ) \
+                if ( bcf_double_is_missing(atok->values[i]) || bcf_double_is_missing(btok->values[i]) ) \
                  { \
-                    bcf_double_set_missing((atok)->values[i]); \
+                    bcf_double_set_missing(rtok->values[i]); \
                      continue; \
                  } \
                  has_values = 1; \
-                (atok)->values[i] = (atok)->values[0] AOP (btok)->values[i]; \
+                rtok->values[i] = atok->values[i] AOP btok->values[i]; \
              } \
-            (atok)->nvalues  = (btok)->nvalues; \
-            (atok)->nsamples = (btok)->nsamples; \
          } \
-        else if ( (atok)->nsamples ) \
+        else \
          { \
-            for (i=0; i<(atok)->nvalues; i++) \
+            token_t *xtok = atok->nsamples ? atok : btok; \
+            token_t *ytok = atok->nsamples ? btok : atok; \
+            assert( ytok->nvalues==1 ); \
+            if ( !bcf_double_is_missing(ytok->values[0]) ) \
              { \
-                if ( bcf_double_is_missing((atok)->values[i]) || bcf_double_is_missing((btok)->values[0]) ) \
+                for (i=0; i<xtok->nvalues; i++) \
                  { \
-                    bcf_double_set_missing((atok)->values[i]); \
-                    continue; \
+                    if ( bcf_double_is_missing(xtok->values[i]) ) \
+                    { \
+                        bcf_double_set_missing(rtok->values[i]); \
+                        continue; \
+                    } \
+                    has_values = 1; \
+                    rtok->values[i] = xtok->values[i] AOP ytok->values[0]; \
                  } \
-                has_values = 1; \
-                (atok)->values[i] = (atok)->values[i] AOP (btok)->values[0]; \
              } \
          } \
      } \
-    if ( !has_values ) { (atok)->nvalues = 0; (atok)->nsamples = 0; } \
+    if ( !has_values ) rtok->nvalues = 0; \
  }
  
-static int vector_logic_and(token_t *atok, token_t *btok, int and_type)
+static int vector_logic_or(filter_t *filter, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
  {
-    // We are comparing either two scalars (result of INFO tag vs a threshold), two vectors (two FORMAT fields),
-    // or a vector and a scalar (FORMAT field vs threshold)
-    int i, pass_site = 0;
-    if ( !atok->nvalues || !btok->nvalues )
-    {
-        atok->nvalues = atok->nsamples = 0;
-        return 0;
-    }
-    if ( !atok->nsamples && !btok->nsamples ) return atok->pass_site && btok->pass_site;
-    if ( atok->nsamples && btok->nsamples )
+    token_t *atok = stack[nstack-2];
+    token_t *btok = stack[nstack-1];
+    tok_init_samples(atok, btok, rtok);
+
+    if ( !atok->pass_site && !btok->pass_site ) return 2;
+
+    rtok->pass_site = 1;
+    if ( !atok->nsamples && !btok->nsamples ) return 2;
+
+    int i;
+    if ( rtok->tok_type==TOK_OR_VEC )   // ||, select all samples if one is true
      {
-        if ( and_type==TOK_AND )
+        if ( (!atok->nsamples && !atok->pass_site) || (!btok->nsamples && !btok->pass_site) )
          {
-            // perform AND within a sample
-            for (i=0; i<atok->nsamples; i++)
+            // These two conditions are to ensure the following does not set all samples
+            // at sites with QUAL<=30:
+            //      QUAL>30 || FMT/GQ>30
+
+            token_t *tok = atok->nsamples ? atok : btok;
+            for (i=0; i<rtok->nsamples; i++)
              {
-                atok->pass_samples[i] = atok->pass_samples[i] && btok->pass_samples[i];
-                if ( !pass_site && atok->pass_samples[i] ) pass_site = 1;
+                if ( !rtok->usmpl[i] ) continue;
+                rtok->pass_samples[i] = tok->pass_samples[i];
              }
          }
          else
          {
-            // perform AND across samples
-            int pass_a = 0, pass_b = 0;
-            for (i=0; i<atok->nsamples; i++)
-            {
-                if ( atok->pass_samples[i] ) pass_a = 1;
-                atok->pass_samples[i] = atok->pass_samples[i] && btok->pass_samples[i];
-            }
-            for (i=0; i<btok->nsamples; i++)
+            for (i=0; i<rtok->nsamples; i++)
              {
-                if ( btok->pass_samples[i] ) { pass_b = 1; break; }
+                if ( !rtok->usmpl[i] ) continue;
+                rtok->pass_samples[i] = 1;
              }
-            pass_site = pass_a && pass_b;
          }
-        return pass_site;
+        return 2;
      }
-    if ( btok->nsamples )
+
+    //  |, only select samples which are actually true
+
+    if ( !atok->nsamples || !btok->nsamples )
      {
-        for (i=0; i<btok->nsamples; i++)
+        token_t *tok = atok->nsamples ? atok : btok;
+        for (i=0; i<rtok->nsamples; i++)
          {
-            atok->pass_samples[i] = atok->pass_site && btok->pass_samples[i];
-            if ( !pass_site && atok->pass_samples[i] ) pass_site = 1;
+            if ( !rtok->usmpl[i] ) continue;
+            rtok->pass_samples[i] = tok->pass_samples[i];
          }
-        atok->nsamples = btok->nsamples;
-        return pass_site;
+        return 2;
      }
-    /* atok->nsamples!=0 */
-    for (i=0; i<atok->nsamples; i++)
+
+    assert( atok->nsamples==btok->nsamples );
+
+    for (i=0; i<rtok->nsamples; i++)
      {
-        atok->pass_samples[i] = atok->pass_samples[i] && btok->pass_site;
-        if ( !pass_site && atok->pass_samples[i] ) pass_site = 1;
+        if ( !rtok->usmpl[i] ) continue;
+        rtok->pass_samples[i] = atok->pass_samples[i] | btok->pass_samples[i];
      }
-    return pass_site;
+    return 2;
  }
-static int vector_logic_or(token_t *atok, token_t *btok, int or_type)
+static int vector_logic_and(filter_t *filter, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
  {
-    int i, pass_site = 0;
-    if ( !atok->nvalues && !btok->nvalues )   // missing sites in both
-    {
-        atok->nvalues = atok->nsamples = 0;
-        return 0;
-    }
-    if ( !atok->nvalues ) // missing value in a
-    {
-        for (i=0; i<btok->nsamples; i++)
-            atok->pass_samples[i] = btok->pass_samples[i];
-        atok->nsamples = btok->nsamples;
-        atok->nvalues  = 1;
-        return btok->pass_site;
-    }
-    if ( !btok->nvalues ) // missing value in b
-    {
-        btok->nvalues = 1;
-        return atok->pass_site;
-    }
+    token_t *atok = stack[nstack-2];
+    token_t *btok = stack[nstack-1];
+    tok_init_samples(atok, btok, rtok);
  
-    if ( !atok->nsamples && !btok->nsamples ) return atok->pass_site || btok->pass_site;
-    if ( !atok->nsamples )
+    if ( !atok->pass_site || !btok->pass_site ) return 2;
+    if ( !atok->nsamples && !btok->nsamples ) { rtok->pass_site = 1; return 2; }
+
+    int i;
+    if ( !atok->nsamples || !btok->nsamples )
      {
-        if ( or_type==TOK_OR )
+        token_t *tok = atok->nsamples ? atok : btok;
+        for (i=0; i<rtok->nsamples; i++)
          {
-            for (i=0; i<btok->nsamples; i++)
-            {
-                atok->pass_samples[i] = btok->pass_samples[i];
-                if ( atok->pass_site || atok->pass_samples[i] ) pass_site = 1;
-            }
+            if ( !rtok->usmpl[i] ) continue;
+            rtok->pass_samples[i] = tok->pass_samples[i];  
          }
-        else
-        {
-            for (i=0; i<btok->nsamples; i++)
-            {
-                atok->pass_samples[i] = atok->pass_site || btok->pass_samples[i];
-                if ( atok->pass_samples[i] ) pass_site = 1;
-            }
-        }
-        atok->nsamples = btok->nsamples;
-        return pass_site;
+        rtok->pass_site = 1;
+        return 2;
      }
-    if ( !btok->nsamples )  // vector vs site
+
+    assert( atok->nsamples==btok->nsamples );
+    if ( rtok->tok_type==TOK_AND_VEC )  // &&, can be true in different samples
      {
-        if ( or_type==TOK_OR )
+        for (i=0; i<rtok->nsamples; i++)
          {
-            for (i=0; i<atok->nsamples; i++)
-                if ( btok->pass_site || atok->pass_samples[i] ) pass_site = 1;
+            if ( !rtok->usmpl[i] ) continue;
+            rtok->pass_samples[i] = atok->pass_samples[i] | btok->pass_samples[i];
          }
-        else
-        {
-            for (i=0; i<atok->nsamples; i++)
-            {
-                atok->pass_samples[i] = atok->pass_samples[i] || btok->pass_site;
-                if ( atok->pass_samples[i] ) pass_site = 1;
-            }
-        }
-        return pass_site;
+        rtok->pass_site = 1;
      }
-    for (i=0; i<atok->nsamples; i++)
+    else    // &, must be true within one sample
      {
-        atok->pass_samples[i] = atok->pass_samples[i] || btok->pass_samples[i];
-        if ( !pass_site && atok->pass_samples[i] ) pass_site = 1;
+        for (i=0; i<rtok->nsamples; i++)
+        {
+            if ( !rtok->usmpl[i] ) continue;
+            rtok->pass_samples[i] = atok->pass_samples[i] & btok->pass_samples[i];
+            if ( rtok->pass_samples[i] ) rtok->pass_site = 1;
+        }
      }
-    return pass_site;
+    return 2;
  }
  
  #define CMP_MISSING(atok,btok,CMP_OP,ret) \
@@ -1267,224 +1302,394 @@ static int vector_logic_or(token_t *atok, token_t *btok, int or_type)
      tok->nvalues = 1; \
  }
  
-#define CMP_VECTORS(atok,btok,CMP_OP,ret) \
+#define CMP_VECTORS(atok,btok,_rtok,CMP_OP,missing_logic) \
  { \
-    int i, j, has_values = 0, pass_site = 0; \
-    if ( !(atok)->nvalues || !(btok)->nvalues ) { (atok)->nvalues = 0; (atok)->nsamples = 0; (ret) = 0; } \
-    else \
+    token_t *rtok = _rtok; \
+    int i, j, k; \
+    assert( !atok->nsamples || !btok->nsamples ); \
+    tok_init_samples(atok, btok, rtok); \
+    if ( !atok->nsamples && !btok->nsamples ) \
      { \
-        if ( (atok)->idx<=-2 || (btok)->idx<=-2 ) \
+        if ( !atok->nvalues && !btok->nvalues ) { rtok->pass_site = missing_logic[2]; } \
+        else if ( !atok->nvalues || !btok->nvalues ) \
          { \
-            /* any field can match: [*] */ \
-            for (i=0; i<(atok)->nvalues; i++) \
+            token_t *tok = atok->nvalues ? atok : btok; \
+            for (j=0; j<tok->nvalues; j++) \
              { \
-                for (j=0; j<(btok)->nvalues; j++) \
-                    if ( (atok)->values[i] CMP_OP (btok)->values[j] ) { pass_site = 1; i = (atok)->nvalues; break; } \
+                if ( bcf_double_is_missing(tok->values[j]) ) \
+                { \
+                    if ( missing_logic[2] ) { rtok->pass_site = 1; break; } \
+                } \
+                else if ( missing_logic[1] ) {  rtok->pass_site = 1; break; } \
              } \
          } \
-        else if ( (atok)->nsamples && (btok)->nsamples ) \
+        else \
          { \
-            for (i=0; i<(atok)->nsamples; i++) \
+            for (i=0; i<atok->nvalues; i++) \
              { \
-                has_values = 1; \
-                if ( (atok)->values[i] CMP_OP (btok)->values[i] ) { (atok)->pass_samples[i] = 1; pass_site = 1; } \
-                else (atok)->pass_samples[i] = 0; \
+                int amiss = bcf_double_is_missing(atok->values[i]) ? 1 : 0; \
+                for (j=0; j<btok->nvalues; j++) \
+                { \
+                    int nmiss = amiss + (bcf_double_is_missing(btok->values[j]) ? 1 : 0); \
+                    if ( nmiss ) \
+                    { \
+                        if ( missing_logic[nmiss] ) { rtok->pass_site = 1; i = atok->nvalues; break; } \
+                    } \
+                    else if ( atok->values[i] CMP_OP btok->values[j] ) { rtok->pass_site = 1; i = atok->nvalues; break; } \
+                } \
              } \
-            if ( !has_values ) (atok)->nvalues = 0; \
          } \
-        else if ( (atok)->nsamples ) \
+    } \
+    else \
+    { \
+        if ( !atok->nvalues && !btok->nvalues ) \
          { \
-            for (i=0; i<(atok)->nsamples; i++) \
+            if ( missing_logic[2] ) \
              { \
-                /*if ( bcf_double_is_missing((atok)->values[i]) ) { (atok)->pass_samples[i] = 0; continue; }*/ \
-                has_values = 1; \
-                if ( (atok)->values[i] CMP_OP (btok)->values[0] ) { (atok)->pass_samples[i] = 1; pass_site = 1; } \
-                else (atok)->pass_samples[i] = 0; \
+                for (i=0; i<rtok->nsamples; i++) \
+                    if ( rtok->usmpl[i] ) { rtok->pass_samples[i] = missing_logic[2]; rtok->pass_site = 1; } \
              } \
-            if ( !has_values ) (atok)->nvalues = 0; \
          } \
-        else if ( (btok)->nsamples ) \
+        else if ( !atok->nvalues || !btok->nvalues ) \
          { \
-            for (i=0; i<(btok)->nsamples; i++) \
+            token_t *tok = atok->nvalues ? atok : btok; \
+            if ( !tok->nsamples ) \
              { \
-                if ( bcf_double_is_missing((btok)->values[i]) ) { (atok)->pass_samples[i] = 0; continue; } \
-                has_values = 1; \
-                if ( (atok)->values[0] CMP_OP (btok)->values[i] ) { (atok)->pass_samples[i] = 1; pass_site = 1; } \
-                else (atok)->pass_samples[i] = 0; \
+                int miss = 0; \
+                for (j=0; j<tok->nvalues; j++) \
+                    miss |= bcf_double_is_missing(tok->values[j]) ? 1 : 0; \
+                if ( missing_logic[++miss] ) \
+                { \
+                    for (i=0; i<rtok->nsamples; i++) \
+                        if ( rtok->usmpl[i] ) { rtok->pass_samples[i] = missing_logic[miss]; rtok->pass_site = 1; } \
+                } \
              } \
-            (atok)->nvalues  = (btok)->nvalues; \
-            (atok)->nsamples = (btok)->nsamples; \
-            if ( !has_values ) (atok)->nvalues = 0; \
+            else \
+                for (i=0; i<tok->nsamples; i++) \
+                { \
+                    if ( !rtok->usmpl[i] ) continue; \
+                    double *ptr = tok->values + i*tok->nval1; \
+                    int miss = 0; \
+                    for (j=0; j<tok->nval1; j++) \
+                        miss |= bcf_double_is_missing(ptr[j]) ? 1 : 0; \
+                    if ( missing_logic[++miss] ) { rtok->pass_samples[i] = missing_logic[miss]; rtok->pass_site = 1; } \
+                } \
          } \
          else \
          { \
-            if ( (atok)->values[0] CMP_OP (btok)->values[0] ) { pass_site = 1; } \
+            token_t *xtok = atok->nsamples ? atok : btok; \
+            token_t *ytok = atok->nsamples ? btok : atok; \
+            for (i=0; i<xtok->nsamples; i++) \
+            { \
+                if ( !rtok->usmpl[i] ) continue; \
+                double *xptr = xtok->values + i*xtok->nval1; \
+                double *yptr = ytok->values + i*ytok->nval1; \
+                for (j=0; j<xtok->nval1; j++) \
+                { \
+                    int miss = bcf_double_is_missing(xptr[j]) ? 1 : 0; \
+                    if ( miss && !missing_logic[0] ) continue; /* any is missing => result is false */ \
+                    for (k=0; k<ytok->nvalues; k++) \
+                    { \
+                        int nmiss = miss + (bcf_double_is_missing(yptr[k]) ? 1 : 0); \
+                        if ( nmiss ) \
+                        { \
+                            if ( missing_logic[nmiss] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; j = xtok->nval1; break; } \
+                        } \
+                        else if ( xptr[j] CMP_OP yptr[k] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; j = xtok->nval1; break; } \
+                    } \
+                } \
+            } \
          } \
-        /*fprintf(bcftools_stderr,"pass=%d\n", pass_site);*/ \
-        (ret) = pass_site; \
      } \
  }
-static int cmp_vector_strings(token_t *atok, token_t *btok, int logic)    // logic: TOK_EQ or TOK_NE
+static int _regex_vector_strings(regex_t *regex, char *str, size_t len, int logic, int *missing_logic)
  {
-    if ( !atok->str_value.l ) { return 0; }
-    if ( !btok->str_value.l ) { atok->str_value.l = 0; return 0; }
-    int i, pass_site = 0;
-    if ( atok->nsamples && atok->nsamples==btok->nsamples )
-    {
-        for (i=0; i<atok->nsamples; i++)
-        {
-            char *astr = atok->str_value.s + i*atok->nstr1;
-            char *bstr = btok->str_value.s + i*btok->nstr1;
-            char *aend = astr + atok->str_value.l, *a = astr;
-            while ( a<aend && *a ) a++;
-            char *bend = bstr + btok->str_value.l, *b = bstr;
-            while ( b<bend && *b ) b++;
-            if ( a-astr != b-bstr ) atok->pass_samples[i] = 0;
-            else atok->pass_samples[i] = strncmp(astr,bstr,a-astr)==0 ? 1 : 0;
-            if ( logic!=TOK_EQ )
-                atok->pass_samples[i] = atok->pass_samples[i] ? 0 : 1;
-            pass_site |= atok->pass_samples[i];
-        }
-        if ( !atok->nsamples ) atok->nsamples = btok->nsamples;
-    }
-    else if ( !atok->nsamples && !btok->nsamples )
-    {
-        if ( atok->idx==-2 || btok->idx==-2 )
-        {
-            // any field can match: [*]
-            if ( atok->idx==-2 && btok->idx==-2 )
-                error("fixme: Expected at least one scalar value [%s %s %s]\n", atok->tag ? atok->tag : btok->tag, atok->str_value,btok->str_value);
-            token_t *xtok, *ytok;   // xtok is scalar, ytok array
-            if ( btok->idx==-2 ) { xtok = atok; ytok = btok; }
-            else { xtok = btok; ytok = atok; }
-            char *xstr = xtok->str_value.s, *xend = xstr + xtok->str_value.l;
-            char *ystr = ytok->str_value.s, *yend = ystr + ytok->str_value.l, *y = ystr;
-            while ( y<=yend )
-            {
-                if ( y==yend || *y==',' )
-                {
-                    if ( y-ystr==xend-xstr && !strncmp(xstr,ystr,xend-xstr) )
-                    {
-                        pass_site = 1;
-                        break;
-                    }
-                    ystr = y+1;
-                }
-                y++;
-            }
-        }
-        else
-            pass_site = strcmp(atok->str_value.s,btok->str_value.s) ? 0 : 1;
-        if ( logic!=TOK_EQ ) pass_site = pass_site ? 0 : 1;
+    char *end = str + len;
+    while ( str < end && *str )
+    {
+        char *mid = str;
+        while ( mid < end && *mid && *mid!=',' ) mid++;
+        int miss = mid - str == 1 && str[0]=='.' ? 1 : 0;
+        if ( miss && missing_logic[miss] ) return 1;
+        char tmp = *mid; *mid = 0;
+        int match = regexec(regex, str, 0,NULL,0) ? 0 : 1;
+        *mid = tmp;
+        if ( logic==TOK_NLIKE ) match = match ? 0 : 1;
+        if ( match ) return 1;
+        if ( !*mid ) break;
+        str = mid + 1;
      }
-    else
+    return 0;
+}
+static inline int _has_missing_string(char *beg)
+{
+    while ( *beg )
      {
-        token_t *xtok, *ytok;
-        if ( !atok->nsamples ) { xtok = atok; ytok = btok; }
-        else { xtok = btok; ytok = atok; }
-        char *xstr = xtok->str_value.s;
-        char *xend = xstr + xtok->str_value.l, *x = xstr;
-        while ( x<xend && *x ) x++;
-        for (i=0; i<ytok->nsamples; i++)
-        {
-            char *ystr = ytok->str_value.s + i*ytok->nstr1;
-            char *ybeg = ystr, *yend = ystr + ytok->nstr1;
-            int pass = 0;
-            while ( ybeg < yend )
+        char *end = beg;
+        while ( *end && *end!=',' ) end++;
+        if ( end-beg==1 && beg[0]=='.' ) return 1;
+        if ( !*end ) break;
+        beg = end + 1;
+    }
+    return 0;
+}
+
+// Compare two strings with multiple fields, for example "A,B,.,C"=="X,Y,A".
+// Returns 1 if any field matches, otherwise returns 0
+static inline int _match_vector_strings(char *abeg, size_t alen, char *bstr, size_t blen, int logic, int *missing_logic)
+{
+    char *aend = abeg + alen;
+    char *bend = bstr + blen;
+    while ( abeg < aend && *abeg )
+    {
+        char *amid = abeg;
+        while ( amid < aend && *amid && *amid!=',' ) amid++;
+        int miss = amid - abeg == 1 && abeg[0]=='.' ? 1 : 0;
+        char *bbeg = bstr;
+        while ( bbeg < bend && *bbeg )
+        {
+            char *bmid = bbeg;
+            while ( bmid < bend && *bmid && *bmid!=',' ) bmid++;
+            int nmiss = miss + (bmid - bbeg == 1 && bbeg[0]=='.' ? 1 : 0);
+            if ( nmiss )
              {
-                char *y = ybeg;
-                while ( y<yend && *y && *y!=',' ) y++;
-                if ( y - ybeg != x - xstr ) pass = 0;
-                else pass = strncmp(xstr,ybeg,x-xstr)==0 ? 1 : 0;
-                if ( logic!=TOK_EQ ) pass = pass ? 0 : 1;
-                if ( pass || !*y ) break;
-                ybeg = y+1;
+                if ( missing_logic[nmiss] ) return 1;
              }
-            atok->pass_samples[i] = pass;
-            pass_site |= pass;
+            else
+            {
+                int match = amid-abeg==bmid-bbeg && !strncmp(abeg,bbeg,amid-abeg) ? 1 : 0;
+                if ( logic==TOK_NE ) match = match==1 ? 0 : 1;
+                if ( match ) return 1;
+            }
+            if ( !*bmid ) break;
+            bbeg = bmid + 1;
          }
-        if ( !atok->nsamples )
-            atok->nvalues = atok->nsamples = btok->nsamples; // is it a bug? not sure if atok->nvalues should be set
+        if ( !*amid ) break;
+        abeg = amid + 1;
      }
-    return pass_site;
+    return 0;
  }
-static int regex_vector_strings(token_t *atok, token_t *btok, int negate)
+static void cmp_vector_strings(token_t *atok, token_t *btok, token_t *rtok)
  {
-    int i, pass_site = 0;
-    if ( atok->nsamples )
+    tok_init_samples(atok, btok, rtok);
+
+    int i, logic = rtok->tok_type;     // TOK_EQ, TOK_NE, TOK_LIKE, TOK_NLIKE
+    regex_t *regex = atok->regex ? atok->regex : (btok->regex ? btok->regex : NULL);
+
+    assert( atok->nvalues==atok->str_value.l && btok->nvalues==btok->str_value.l );
+    assert( !atok->nsamples || !btok->nsamples );
+    assert( (!regex && (logic==TOK_EQ || logic==TOK_NE)) || (regex && (logic==TOK_LIKE || logic==TOK_NLIKE)) );
+
+    int missing_logic[] = {0,0,0};
+    if ( logic==TOK_EQ || logic==TOK_LIKE ) missing_logic[0] = missing_logic[2] = 1;
+    else if ( logic==TOK_NE || logic==TOK_NLIKE ) missing_logic[0] = missing_logic[1] = 1;
+
+    if ( !atok->nsamples && !btok->nsamples )
+    {
+        if ( !atok->nvalues && !btok->nvalues ) { rtok->pass_site = missing_logic[2]; return; }
+        if ( !atok->nvalues || !btok->nvalues )
+        {
+            int miss = _has_missing_string(atok->nvalues ? atok->str_value.s : btok->str_value.s);
+            if ( missing_logic[miss+1] ) rtok->pass_site = 1;
+            return;
+        }
+        if ( !regex )
+            rtok->pass_site = _match_vector_strings(atok->str_value.s, atok->str_value.l, btok->str_value.s, btok->str_value.l, logic, missing_logic); 
+        else
+        {
+            token_t *tok = atok->regex ? btok : atok;
+            rtok->pass_site = _regex_vector_strings(regex, tok->str_value.s, tok->str_value.l, logic, missing_logic);
+        }
+        return;
+    }
+
+    // The case of (!atok->nsamples || !btok->nsamples)
+
+    if ( !atok->nvalues && !btok->nvalues )
      {
-        for (i=0; i<atok->nsamples; i++)
+        if ( missing_logic[2] )
          {
-            char *ptr = atok->str_value.s + i*atok->nstr1;
-            atok->pass_samples[i] = regexec(btok->regex, ptr, 0,NULL,0) ? 0 : 1;
-            if ( negate ) atok->pass_samples[i] = atok->pass_samples[i] ? 0 : 1;
-            pass_site |= atok->pass_samples[i];
+            for (i=0; i<rtok->nsamples; i++) 
+                if ( rtok->usmpl[i] ) { rtok->pass_samples[i] = missing_logic[2]; rtok->pass_site = 1; }
          }
-        return pass_site;
+        return;
+    }
+    if ( !atok->nvalues || !btok->nvalues )
+    {
+        token_t *tok = atok->nvalues ? atok : btok;
+        if ( !tok->nsamples )
+        {
+            int miss = _has_missing_string(tok->str_value.s);
+            if ( !missing_logic[miss+1] ) return;
+            for (i=0; i<rtok->nsamples; i++)
+                if ( rtok->usmpl[i] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; }
+        }
+        else
+            for (i=0; i<tok->nsamples; i++)
+            {
+                if ( !rtok->usmpl[i] ) continue;
+                int miss = _has_missing_string(tok->str_value.s + i*tok->nval1);
+                if ( missing_logic[miss+1] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; }
+            }
+        return;
+    }
+
+    // The case of (!atok->nsamples || !btok->nsamples) && (atok->nvalues && btok->nvalues) 
+    token_t *xtok = atok->nsamples ? atok : btok;
+    token_t *ytok = atok->nsamples ? btok : atok;
+    assert( regex==ytok->regex );
+    for (i=0; i<xtok->nsamples; i++)
+    {
+        if ( !rtok->usmpl[i] ) continue;
+        int match;
+        if ( regex )
+            match = _regex_vector_strings(regex, xtok->str_value.s + i*xtok->nval1, xtok->nval1, logic, missing_logic);
+        else
+            match = _match_vector_strings(xtok->str_value.s + i*xtok->nval1, xtok->nval1, ytok->str_value.s, ytok->str_value.l, logic, missing_logic);
+        if ( match ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; }
      }
-    pass_site = regexec(btok->regex, atok->str_value.s, 0,NULL,0) ? 0 : 1;
-    if ( negate ) pass_site = pass_site ? 0 : 1;
-    return pass_site;
  }
  
-static void parse_tag_idx(char *tag, char *tag_idx, token_t *tok)   // tag_idx points just after "TAG["
+static int parse_idxs(char *tag_idx, int **idxs, int *nidxs, int *idx)
  {
-    // TAG[*] .. any field
-    if ( !strncmp("*]", tag_idx, 3) )
+    // TAG[], TAG[*] .. any field; sets idx=-2, idxs[0]=-1
+    if ( *tag_idx==0 || !strcmp("*", tag_idx) )
      {
-        tok->idxs = (int*) malloc(sizeof(int));
-        tok->idxs[0] = -1;
-        tok->nidxs   = 1;
-        tok->idx     = -2;
-        return;
+        *idxs = (int*) malloc(sizeof(int));
+        (*idxs)[0] = -1;
+        *nidxs     = 1;
+        *idx       = -2;
+        return 0;
      }
  
-    // TAG[integer] .. one field
+    // TAG[integer] .. one field; idx positive
      char *end, *beg = tag_idx;
-    tok->idx = strtol(tag_idx, &end, 10);
-    if ( tok->idx >= 0 && *end==']' ) return;
+    *idx = strtol(tag_idx, &end, 10);
+    if ( *idx >= 0 && *end==0 ) return 0;
  
-
-    // TAG[0,1] or TAG[0-2] or [1-] etc
+    // TAG[0,1] or TAG[0-2] or [1-] etc; idx=-2, idxs[...]=0,0,1,1,..
      int i, ibeg = -1;
-    while ( *beg && *beg!=']' )
+    while ( *beg )
      {
-        int idx = strtol(beg, &end, 10);
+        int num = strtol(beg, &end, 10);
          if ( end[0]==',' ) beg = end + 1;
-        else if ( end[0]==']' ) beg = end;
-        else if ( end[0]=='-' ) { beg = end + 1; ibeg = idx; continue; }
-        else error("Could not parse the index: %s[%s\n", tag, tag_idx+1);
-        if ( idx >= tok->nidxs )
+        else if ( end[0]==0 ) beg = end;
+        else if ( end[0]=='-' ) { beg = end + 1; ibeg = num; continue; }
+        else return -1;
+        if ( num >= *nidxs )
          {
-            tok->idxs = (int*) realloc(tok->idxs, sizeof(int)*(idx+1));
-            memset(tok->idxs + tok->nidxs, 0, sizeof(int)*(idx - tok->nidxs + 1));
-            tok->nidxs = idx + 1;
+            *idxs = (int*) realloc(*idxs, sizeof(int)*(num+1));
+            memset(*idxs + *nidxs, 0, sizeof(int)*(num - *nidxs + 1));
+            *nidxs = num + 1;
          }
          if ( ibeg>=0 )
          {
-            for (i=ibeg; i<=idx; i++) tok->idxs[i] = 1;
+            for (i=ibeg; i<=num; i++) (*idxs)[i] = 1;
              ibeg = -1;
          }
-        tok->idxs[idx] = 1;
+        (*idxs)[num] = 1;
      }
      if ( ibeg >=0 )
      {
-        if ( ibeg >= tok->nidxs )
+        if ( ibeg >= *nidxs )
          {
-            tok->idxs = (int*) realloc(tok->idxs, sizeof(int)*(ibeg+1));
-            memset(tok->idxs + tok->nidxs, 0, sizeof(int)*(ibeg - tok->nidxs + 1));
-            tok->nidxs = ibeg + 1;
+            *idxs = (int*) realloc(*idxs, sizeof(int)*(ibeg+1));
+            memset(*idxs + *nidxs, 0, sizeof(int)*(ibeg - *nidxs + 1));
+            *nidxs = ibeg + 1;
+        }
+        (*idxs)[ibeg] = -1;
+    }
+    *idx = -2;
+    return 0;
+}
+
+static void parse_tag_idx(bcf_hdr_t *hdr, int is_fmt, char *tag, char *tag_idx, token_t *tok)   // tag_idx points just after "TAG["
+{
+    int i, len = strlen(tag_idx);
+    if ( tag_idx[len-1] == ']' ) tag_idx[len-1] = 0;
+    char *ori = strdup(tag_idx);
+
+    assert( !tok->idxs && !tok->usmpl );
+    int *idxs1 = NULL, nidxs1 = 0, idx1 = 0;
+    int *idxs2 = NULL, nidxs2 = 0, idx2 = 0;
+
+    int set_samples = 0;
+    char *colon = index(tag_idx, ':');
+    if ( colon )
+    {
+        *colon = 0;
+        if ( parse_idxs(tag_idx, &idxs1, &nidxs1, &idx1) != 0 ) error("Could not parse the index: %s\n", ori);
+        if ( parse_idxs(colon+1, &idxs2, &nidxs2, &idx2) != 0 ) error("Could not parse the index: %s\n", ori);
+        tok->idxs  = idxs2;
+        tok->nidxs = nidxs2;
+        tok->idx   = idx2;
+        set_samples = 1;
+    }
+    else
+    {
+        if ( parse_idxs(tag_idx, &idxs1, &nidxs1, &idx1) != 0 ) error("Could not parse the index: %s\n", ori);
+        if ( is_fmt )
+        {
+            if ( nidxs1==1 && idxs1[0]==-1 )
+            {
+                tok->idxs = (int*) malloc(sizeof(int));
+                tok->idxs[0] = -1;
+                tok->nidxs   = 1;
+                tok->idx     = -2;
+            }
+            else if ( bcf_hdr_id2number(hdr,BCF_HL_FMT,tok->hdr_id)!=1 )
+                error("The FORMAT tag %s can have multiple subfields, run as %s[sample:subfield]\n", tag,tag);
+            else
+                tok->idx = 0;
+            set_samples = 1;
+        }
+        else
+        {
+            tok->idxs  = idxs1; 
+            tok->nidxs = nidxs1;
+            tok->idx   = idx1;
+        }
+    }
+
+    if ( set_samples )
+    {
+        tok->nsamples = bcf_hdr_nsamples(hdr);
+        tok->usmpl = (uint8_t*) calloc(tok->nsamples,1); 
+        if ( idx1>=0 )
+        {
+            if ( idx1 >= bcf_hdr_nsamples(hdr) ) error("The sample index is too large: %s\n", ori);
+            tok->usmpl[idx1] = 1;
+        }
+        else if ( idx1==-2 )
+        {
+            for (i=0; i<nidxs1; i++)
+            {
+                if ( idxs1[i]==0 ) continue;
+                if ( idxs1[i]==-1 ) break;
+                if ( i >= bcf_hdr_nsamples(hdr) ) error("The sample index is too large: %s\n", ori);
+                tok->usmpl[i] = 1;
+            }
+            if ( nidxs1 && idxs1[nidxs1-1]==-1 )    // open range, such as "7-"
+            {
+                for (; i<tok->nsamples; i++) tok->usmpl[i] = 1;
+            }
          }
-        tok->idxs[ibeg] = -1;
+        else error("todo: %s:%d .. %d\n", __FILE__,__LINE__, idx2);
+        free(idxs1);
+    }
+    free(ori);
+
+    if ( tok->nidxs && tok->idxs[tok->nidxs-1]!=-1 )
+    {
+        for (i=0; i<tok->nidxs; i++) if ( tok->idxs[i] ) tok->nuidxs++;
      }
-    tok->idx = -2;
  }
  static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
  {
      tok->tok_type  = TOK_VAL;
      tok->hdr_id    = -1;
      tok->pass_site = -1;
-    tok->idx       = -1;
+    tok->idx       = 0;
  
      // is this a string constant?
      if ( str[0]=='"' || str[0]=='\'' )
@@ -1579,6 +1784,10 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
              tok->setter = &filters_set_alt_string;
              tok->is_str = 1;
              tok->tag = strdup("ALT");
+            tok->idxs = (int*) malloc(sizeof(int));
+            tok->idxs[0] = -1;
+            tok->nidxs   = 1;
+            tok->idx     = -2;
              return 0;
          }
          else if ( !strncasecmp(str,"N_ALT",len) )
@@ -1616,8 +1825,6 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
          int i;
          for (i=0; i<tmp.l; i++)
              if ( tmp.s[i]=='[' ) { tmp.s[i] = 0; is_array = i+1; break; }
-        if ( is_array ) 
-            parse_tag_idx(tmp.s, tmp.s+is_array, tok);
      }
      tok->hdr_id = bcf_hdr_id2int(filter->hdr,BCF_DT_ID,tmp.s);
      if ( is_fmt==-1 )
@@ -1629,6 +1836,16 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
          }
          if ( is_fmt==-1 ) is_fmt = 0;
      }
+    if ( is_array ) 
+        parse_tag_idx(filter->hdr, is_fmt, tmp.s, tmp.s+is_array, tok);
+    else if ( is_fmt && !tok->nsamples ) 
+    {
+        int i;
+        tok->nsamples = bcf_hdr_nsamples(filter->hdr);
+        tok->usmpl = (uint8_t*) malloc(tok->nsamples); 
+        for (i=0; i<tok->nsamples; i++) tok->usmpl[i] = 1;
+    }
+
      tok->type = is_fmt ? BCF_HL_FMT : BCF_HL_INFO;
      if ( is_fmt ) filter->max_unpack |= BCF_UN_FMT;
      if ( tok->hdr_id>=0 )
@@ -1642,7 +1859,12 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
              if ( !bcf_hdr_idinfo_exists(filter->hdr,BCF_HL_FMT,tok->hdr_id) )
                  error("No such FORMAT field: %s\n", tmp.s);
              if ( bcf_hdr_id2number(filter->hdr,BCF_HL_FMT,tok->hdr_id)!=1 && !is_array )
-                error("Error: FORMAT vectors must be subscripted, e.g. %s[0] or %s[*]\n", tmp.s, tmp.s);
+            {
+                tok->idxs = (int*) malloc(sizeof(int));
+                tok->idxs[0] = -1;
+                tok->nidxs   = 1;
+                tok->idx     = -2;
+            }
              switch ( bcf_hdr_id2type(filter->hdr,BCF_HL_FMT,tok->hdr_id) )
              {
                  case BCF_HT_INT:  tok->setter = &filters_set_format_int; break;
@@ -1740,7 +1962,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
      {
          errno = 0;
          tok->threshold = strtof(tmp.s, &end);   // float?
-        if ( errno!=0 || end!=tmp.s+len ) error("[%s:%d %s] Error: the tag \"INFO/%s\" is not defined in the VCF header\n", __FILE__,__LINE__,__FUNCTION__,tmp.s);
+        if ( errno!=0 || end!=tmp.s+len ) error("[%s:%d %s] Error: the tag \"%s\" is not defined in the VCF header\n", __FILE__,__LINE__,__FUNCTION__,tmp.s);
      }
  
      if ( tmp.s ) free(tmp.s);
@@ -1876,16 +2098,26 @@ filter_t *filter_init(bcf_hdr_t *hdr, const char *str)
      int i;
      for (i=0; i<nout; i++)
      {
+        if ( out[i].tok_type==TOK_OR || out[i].tok_type==TOK_OR_VEC )
+            out[i].func = vector_logic_or;
+        if ( out[i].tok_type==TOK_AND || out[i].tok_type==TOK_AND_VEC )
+            out[i].func = vector_logic_and;
          if ( out[i].tok_type==TOK_EQ || out[i].tok_type==TOK_NE )
          {
              // Look for j="." and k numeric type
              int j = i-1, k = i-2;
              if ( !out[j].is_str ) { k = i-1, j = i-2; }
-            if ( out[k].hdr_id>0 && out[j].is_str && out[j].key && !strcmp(".",out[j].key) )
+            if ( out[j].is_str && out[j].key && !strcmp(".",out[j].key) )
              {
-                int type = bcf_hdr_id2type(filter->hdr,out[k].type,out[k].hdr_id);
-                if ( type==BCF_HT_INT ) { out[j].is_str = 0; out[j].is_missing = 1; bcf_double_set_missing(out[j].values[0]); }
-                if ( type==BCF_HT_REAL ) { out[j].is_str = 0; out[j].is_missing = 1; bcf_double_set_missing(out[j].values[0]); }
+                int set_missing = 0;
+                if ( out[k].hdr_id>0 )
+                {
+                    int type = bcf_hdr_id2type(filter->hdr,out[k].type,out[k].hdr_id);
+                    if ( type==BCF_HT_INT ) set_missing = 1;
+                    else if ( type==BCF_HT_REAL ) set_missing = 1;
+                }
+                else if ( !strcmp("QUAL",out[k].tag) ) set_missing = 1;
+                if ( set_missing ) { out[j].is_str = 0; out[j].is_missing = 1; bcf_double_set_missing(out[j].values[0]); }
              }
          }
          if ( out[i].tok_type==TOK_LIKE || out[i].tok_type==TOK_NLIKE )
@@ -1937,10 +2169,15 @@ filter_t *filter_init(bcf_hdr_t *hdr, const char *str)
              else if ( out[i+2].tok_type==TOK_LIKE || out[i+2].tok_type==TOK_NLIKE ) ival = i + 1;
              else error("[%s:%d %s] Could not parse the expression: %s\n",  __FILE__,__LINE__,__FUNCTION__, filter->str);
  
+            if ( !out[ival].key ) error("Comparison between samples is not supported, sorry!\n");
+
              // assign correct setters and unify expressions, eg ar->ra, HOM->hom, etc
              if ( !strcasecmp(out[ival].key,"hom") ) { out[i].setter = filters_set_genotype3;  str_to_lower(out[ival].key); }
              else if ( !strcasecmp(out[ival].key,"het") ) { out[i].setter = filters_set_genotype3;  str_to_lower(out[ival].key); }
              else if ( !strcasecmp(out[ival].key,"hap") ) { out[i].setter = filters_set_genotype3;  str_to_lower(out[ival].key); }
+            else if ( !strcasecmp(out[ival].key,"mis") ) { out[i].setter = filters_set_genotype4;  str_to_lower(out[ival].key); }
+            else if ( !strcasecmp(out[ival].key,"ref") ) { out[i].setter = filters_set_genotype4;  str_to_lower(out[ival].key); }
+            else if ( !strcasecmp(out[ival].key,"alt") ) { out[i].setter = filters_set_genotype4;  str_to_lower(out[ival].key); }
              else if ( !strcasecmp(out[ival].key,"rr") ) { out[i].setter = filters_set_genotype2;  str_to_lower(out[ival].key); }
              else if ( !strcasecmp(out[ival].key,"ra") || !strcasecmp(out[ival].key,"ar") ) { out[i].setter = filters_set_genotype2; out[ival].key[0]='r'; out[ival].key[1]='a'; }   // ra
              else if ( !strcmp(out[ival].key,"aA") || !strcmp(out[ival].key,"Aa") ) { out[i].setter = filters_set_genotype2; out[ival].key[0]='a'; out[ival].key[1]='A'; }   // aA
@@ -1978,18 +2215,19 @@ filter_t *filter_init(bcf_hdr_t *hdr, const char *str)
      filter->nsamples = filter->max_unpack&BCF_UN_FMT ? bcf_hdr_nsamples(filter->hdr) : 0;
      for (i=0; i<nout; i++)
      {
-        if ( out[i].tok_type==TOK_MAX )      { out[i].setter = set_max; out[i].tok_type = TOK_FUNC; }
-        else if ( out[i].tok_type==TOK_MIN ) { out[i].setter = set_min; out[i].tok_type = TOK_FUNC; }
-        else if ( out[i].tok_type==TOK_AVG ) { out[i].setter = set_avg; out[i].tok_type = TOK_FUNC; }
-        else if ( out[i].tok_type==TOK_SUM ) { out[i].setter = set_sum; out[i].tok_type = TOK_FUNC; }
-        else if ( out[i].tok_type==TOK_ABS ) { out[i].setter = set_abs; out[i].tok_type = TOK_FUNC; }
-        else if ( out[i].tok_type==TOK_LEN ) { out[i].setter = set_strlen; out[i].tok_type = TOK_FUNC; }
+        if ( out[i].tok_type==TOK_MAX )      { out[i].func = func_max; out[i].tok_type = TOK_FUNC; }
+        else if ( out[i].tok_type==TOK_MIN ) { out[i].func = func_min; out[i].tok_type = TOK_FUNC; }
+        else if ( out[i].tok_type==TOK_AVG ) { out[i].func = func_avg; out[i].tok_type = TOK_FUNC; }
+        else if ( out[i].tok_type==TOK_SUM ) { out[i].func = func_sum; out[i].tok_type = TOK_FUNC; }
+        else if ( out[i].tok_type==TOK_ABS ) { out[i].func = func_abs; out[i].tok_type = TOK_FUNC; }
+        else if ( out[i].tok_type==TOK_CNT ) { out[i].func = func_count; out[i].tok_type = TOK_FUNC; }
+        else if ( out[i].tok_type==TOK_LEN ) { out[i].func = func_strlen; out[i].tok_type = TOK_FUNC; }
          hts_expand0(double,1,out[i].mvalues,out[i].values);
          if ( filter->nsamples )
          {
              out[i].pass_samples = (uint8_t*)malloc(filter->nsamples);
              int j;
-            for (j=0; j<filter->nsamples; j++) out[i].pass_samples[j] = 1;
+            for (j=0; j<filter->nsamples; j++) out[i].pass_samples[j] = 0;
          }
      }
  
@@ -2011,6 +2249,7 @@ void filter_destroy(filter_t *filter)
          free(filter->filters[i].str_value.s);
          free(filter->filters[i].tag);
          free(filter->filters[i].idxs);
+        free(filter->filters[i].usmpl);
          free(filter->filters[i].values);
          free(filter->filters[i].pass_samples);
          if (filter->filters[i].hash) khash_str2int_destroy_free(filter->filters[i].hash);
@@ -2036,9 +2275,7 @@ int filter_test(filter_t *filter, bcf1_t *line, const uint8_t **samples)
      int i, nstack = 0;
      for (i=0; i<filter->nfilters; i++)
      {
-        filter->filters[i].nsamples  = 0;
-        filter->filters[i].nvalues   = 0;
-        filter->filters[i].pass_site = -1;
+        filter->filters[i].pass_site = 0;
  
          if ( filter->filters[i].tok_type == TOK_VAL )
          {
@@ -2058,9 +2295,11 @@ int filter_test(filter_t *filter, bcf1_t *line, const uint8_t **samples)
              filter->flt_stack[nstack++] = &filter->filters[i];
              continue;
          }
-        else if ( filter->filters[i].tok_type == TOK_FUNC ) // all functions take only one argument
+        else if ( filter->filters[i].func )
          {
-            filter->filters[i].setter(filter, line, filter->flt_stack[nstack-1]);
+            int nargs = filter->filters[i].func(filter, line, &filter->filters[i], filter->flt_stack, nstack);
+            filter->flt_stack[nstack-nargs] = &filter->filters[i];
+            if ( --nargs > 0 ) nstack -= nargs;
              continue;
          }
          if ( nstack<2 )
@@ -2068,111 +2307,76 @@ int filter_test(filter_t *filter, bcf1_t *line, const uint8_t **samples)
  
          int is_str  = filter->flt_stack[nstack-1]->is_str + filter->flt_stack[nstack-2]->is_str;
  
-        if ( filter->filters[i].tok_type == TOK_OR || filter->filters[i].tok_type == TOK_OR_VEC )
-        {
-            if ( filter->flt_stack[nstack-1]->pass_site<0 || filter->flt_stack[nstack-2]->pass_site<0 )
-                error("Error occurred while processing the filter \"%s\" (%d %d OR)\n", filter->str,filter->flt_stack[nstack-2]->pass_site,filter->flt_stack[nstack-1]->pass_site);
-            filter->flt_stack[nstack-2]->pass_site = vector_logic_or(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1], filter->filters[i].tok_type);
-            nstack--;
-            continue;
-        }
-        if ( filter->filters[i].tok_type == TOK_AND || filter->filters[i].tok_type == TOK_AND_VEC )
-        {
-            if ( filter->flt_stack[nstack-1]->pass_site<0 || filter->flt_stack[nstack-2]->pass_site<0 )
-                error("Error occurred while processing the filter \"%s\" (%d %d AND)\n", filter->str,filter->flt_stack[nstack-2]->pass_site,filter->flt_stack[nstack-1]->pass_site);
-            filter->flt_stack[nstack-2]->pass_site = vector_logic_and(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1], filter->filters[i].tok_type);
-            nstack--;
-            continue;
-        }
-
          if ( filter->filters[i].tok_type == TOK_ADD )
          {
-            VECTOR_ARITHMETICS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],+);
+            VECTOR_ARITHMETICS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],+);
+            filter->flt_stack[nstack-2] = &filter->filters[i];
              nstack--;
              continue;
          }
          else if ( filter->filters[i].tok_type == TOK_SUB )
          {
-            VECTOR_ARITHMETICS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],-);
+            VECTOR_ARITHMETICS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],-);
+            filter->flt_stack[nstack-2] = &filter->filters[i];
              nstack--;
              continue;
          }
          else if ( filter->filters[i].tok_type == TOK_MULT )
          {
-            VECTOR_ARITHMETICS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],*);
+            VECTOR_ARITHMETICS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],*);
+            filter->flt_stack[nstack-2] = &filter->filters[i];
              nstack--;
              continue;
          }
          else if ( filter->filters[i].tok_type == TOK_DIV )
          {
-            VECTOR_ARITHMETICS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],/);
+            VECTOR_ARITHMETICS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],/);
+            filter->flt_stack[nstack-2] = &filter->filters[i];
              nstack--;
              continue;
          }
  
-        int is_true = 0;
+        // ideally, these comparators would become func, but this would require more work in init1()
          if ( filter->filters[i].comparator )
-            is_true = filter->filters[i].comparator(filter->flt_stack[nstack-1],filter->flt_stack[nstack-2],filter->filters[i].tok_type,line);
-        else if ( !filter->flt_stack[nstack-1]->nvalues || !filter->flt_stack[nstack-2]->nvalues )
          {
-            int skip = 0;
-            if ( !filter->flt_stack[nstack-2]->is_missing && !filter->flt_stack[nstack-1]->is_missing ) skip = 1;
-            if ( filter->filters[i].tok_type != TOK_EQ  && filter->filters[i].tok_type != TOK_NE ) skip = 1;
-
-            if ( skip ) 
-                filter->flt_stack[nstack-2]->nvalues = filter->flt_stack[nstack-2]->nsamples = 0;
-            else if ( filter->filters[i].tok_type == TOK_EQ )
-                CMP_MISSING(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],==,is_true)
-            else if ( filter->filters[i].tok_type == TOK_NE )
-                CMP_MISSING(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],!=,is_true)
-        }
-        else if ( filter->filters[i].tok_type == TOK_EQ )
-        {
-            if ( filter->flt_stack[nstack-1]->comparator )
-                is_true = filter->flt_stack[nstack-1]->comparator(filter->flt_stack[nstack-1],filter->flt_stack[nstack-2],TOK_EQ,line);
-            else if ( filter->flt_stack[nstack-2]->comparator )
-                is_true = filter->flt_stack[nstack-2]->comparator(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],TOK_EQ,line);
-            else if ( is_str==2 )   // both are strings
-                is_true = cmp_vector_strings(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],TOK_EQ);
-            else if ( is_str==1 )
-                error("Comparing string to numeric value: %s\n", filter->str);
-            else
-                CMP_VECTORS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],==,is_true);
-        }
-        else if ( filter->filters[i].tok_type == TOK_NE )
-        {
-            if ( filter->flt_stack[nstack-1]->comparator )
-                is_true = filter->flt_stack[nstack-1]->comparator(filter->flt_stack[nstack-1],filter->flt_stack[nstack-2],TOK_NE,line);
-            else if ( filter->flt_stack[nstack-2]->comparator )
-                is_true = filter->flt_stack[nstack-2]->comparator(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],TOK_NE,line);
-            else if ( is_str==2 )
-                is_true = cmp_vector_strings(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],TOK_NE);
-            else if ( is_str==1 )
-                error("Comparing string to numeric value: %s\n", filter->str);
-            else
-                CMP_VECTORS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],!=,is_true);
+            filter->filters[i].comparator(filter->flt_stack[nstack-1],filter->flt_stack[nstack-2],&filter->filters[i],line);
          }
-        else if ( filter->filters[i].tok_type == TOK_LIKE || filter->filters[i].tok_type == TOK_NLIKE )
+        else if ( filter->flt_stack[nstack-1]->comparator )
          {
-            if ( is_str==2 )
-                is_true = regex_vector_strings(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1], filter->filters[i].tok_type == TOK_LIKE ? 0 : 1);
-            else
-                error("The regex operator can be used on strings only: %s\n", filter->str);
-        }
-        else if ( is_str>0 )
-            error("Wrong operator in string comparison: %s [%s,%s]\n", filter->str, filter->flt_stack[nstack-1]->str_value, filter->flt_stack[nstack-2]->str_value);
-        else if ( filter->filters[i].tok_type == TOK_LE )
-            CMP_VECTORS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],<=,is_true)
-        else if ( filter->filters[i].tok_type == TOK_LT )
-            CMP_VECTORS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],<,is_true)
-        else if ( filter->filters[i].tok_type == TOK_BT )
-            CMP_VECTORS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],>,is_true)
-        else if ( filter->filters[i].tok_type == TOK_BE )
-            CMP_VECTORS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],>=,is_true)
+            filter->flt_stack[nstack-1]->comparator(filter->flt_stack[nstack-1],filter->flt_stack[nstack-2],&filter->filters[i],line);
+        }
+        else if ( filter->flt_stack[nstack-2]->comparator )
+        {
+            filter->flt_stack[nstack-2]->comparator(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],line);
+        }
+        else if ( is_str==2 )
+        {
+            cmp_vector_strings(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i]);
+        }
          else
-            error("FIXME: did not expect this .. tok_type %d = %d\n", i, filter->filters[i].tok_type);
+        {
+            // Determine what to do with one [1] or both [2] sides missing. The first field [0] gives [1]|[2]
+            int missing_logic[] = {0,0,0};
+            if ( filter->filters[i].tok_type == TOK_EQ ) { missing_logic[0] = missing_logic[2] = 1; }
+            if ( filter->filters[i].tok_type == TOK_NE ) { missing_logic[0] = missing_logic[1] = 1; }
  
-        filter->flt_stack[nstack-2]->pass_site = is_true;
+            if ( filter->filters[i].tok_type == TOK_EQ )
+                CMP_VECTORS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],==,missing_logic)
+            else if ( filter->filters[i].tok_type == TOK_NE )
+                CMP_VECTORS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],!=,missing_logic)
+            else if ( filter->filters[i].tok_type == TOK_LE )
+                CMP_VECTORS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],<=,missing_logic)
+            else if ( filter->filters[i].tok_type == TOK_LT )
+                CMP_VECTORS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],<,missing_logic)
+            else if ( filter->filters[i].tok_type == TOK_BT )
+                CMP_VECTORS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],>,missing_logic)
+            else if ( filter->filters[i].tok_type == TOK_BE )
+                CMP_VECTORS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],>=,missing_logic)
+            else
+                error("todo: %s:%d .. type=%d\n", __FILE__,__LINE__,filter->filters[i].tok_type);
+
+        }
+        filter->flt_stack[nstack-2] = &filter->filters[i];
          nstack--;
      }
      if ( nstack>1 ) error("Error occurred while processing the filter \"%s\" (2:%d)\n", filter->str,nstack);    // too few values left on the stack
diff --git a/bcftools/main.c.pysam.c b/bcftools/main.c.pysam.c

index 019adc0faf6f463e7b5a40b9c47941267dfc373d..d6b51a04c71c142e102da45785a4257029d6940e 100644 (file)
--- a/bcftools/main.c.pysam.c
+++ b/bcftools/main.c.pysam.c
@@ -49,7 +49,7 @@ int main_vcfcall(int argc, char *argv[]);
  int main_vcfannotate(int argc, char *argv[]);
  int main_vcfroh(int argc, char *argv[]);
  int main_vcfconcat(int argc, char *argv[]);
-int main_bcftools_reheader(int argc, char *argv[]);
+int main_reheader(int argc, char *argv[]);
  int main_vcfconvert(int argc, char *argv[]);
  int main_vcfcnv(int argc, char *argv[]);
  #if USE_GPL
@@ -125,7 +125,7 @@ static cmd_t cmds[] =
        .alias = "query",
        .help  = "transform VCF/BCF into user-defined formats"
      },
-    { .func  = main_bcftools_reheader,
+    { .func  = main_reheader,
        .alias = "reheader",
        .help  = "modify VCF/BCF header, change sample names"
      },
diff --git a/bcftools/regidx.c b/bcftools/regidx.c

index 84646a843823615dd881249727c3309f05d6fe1d..9b2c66d71eae327872c98cecd1347621f4157a28 100644 (file)
--- a/bcftools/regidx.c
+++ b/bcftools/regidx.c
@@ -1,5 +1,5 @@
  /* 
-    Copyright (C) 2014-2016 Genome Research Ltd.
+    Copyright (C) 2014-2017 Genome Research Ltd.
  
      Author: Petr Danecek <pd3@sanger.ac.uk>
  
@@ -189,6 +189,35 @@ int regidx_insert(regidx_t *idx, char *line)
      return 0;
  }
  
+regidx_t *regidx_init_string(const char *str, regidx_parse_f parser, regidx_free_f free_f, size_t payload_size, void *usr_dat)
+{
+    regidx_t *idx = (regidx_t*) calloc(1,sizeof(regidx_t));
+    if ( !idx ) return NULL;
+
+    idx->free  = free_f;
+    idx->parse = parser ? parser : regidx_parse_tab;
+    idx->usr   = usr_dat;
+    idx->seq2regs = khash_str2int_init();
+    idx->payload_size = payload_size;
+    if ( payload_size ) idx->payload = malloc(payload_size);
+
+    kstring_t tmp = {0,0,0};
+    const char *ss = str;
+    while ( *ss )
+    {
+        while ( *ss && isspace(*ss) ) ss++;
+        const char *se = ss;
+        while ( *se && *se!='\r' && *se!='\n' ) se++;
+        tmp.l = 0;
+        kputsn(ss, se-ss, &tmp);
+        regidx_insert(idx,tmp.s);
+        while ( *se && isspace(*se) ) se++;
+        ss = se;
+    }
+    free(tmp.s);
+    return idx;
+}
+
  regidx_t *regidx_init(const char *fname, regidx_parse_f parser, regidx_free_f free_f, size_t payload_size, void *usr_dat)
  {
      if ( !parser )
@@ -595,4 +624,13 @@ int regitr_loop(regitr_t *regitr)
  }
  
  
+void regitr_copy(regitr_t *dst, regitr_t *src)
+{
+    _itr_t *dst_itr = (_itr_t*) dst->itr;
+    _itr_t *src_itr = (_itr_t*) src->itr;
+    *dst_itr = *src_itr;
+    *dst = *src;
+    dst->itr = dst_itr;
+}
+
  
diff --git a/bcftools/regidx.c.pysam.c b/bcftools/regidx.c.pysam.c

index 62ef61f8da570aabcabb8131313ce909120f4561..1082e4aa9a5ff94d3fa2630cfdca7f63bcae348c 100644 (file)
--- a/bcftools/regidx.c.pysam.c
+++ b/bcftools/regidx.c.pysam.c
@@ -1,7 +1,7 @@
  #include "bcftools.pysam.h"
  
  /* 
-    Copyright (C) 2014-2016 Genome Research Ltd.
+    Copyright (C) 2014-2017 Genome Research Ltd.
  
      Author: Petr Danecek <pd3@sanger.ac.uk>
  
@@ -191,6 +191,35 @@ int regidx_insert(regidx_t *idx, char *line)
      return 0;
  }
  
+regidx_t *regidx_init_string(const char *str, regidx_parse_f parser, regidx_free_f free_f, size_t payload_size, void *usr_dat)
+{
+    regidx_t *idx = (regidx_t*) calloc(1,sizeof(regidx_t));
+    if ( !idx ) return NULL;
+
+    idx->free  = free_f;
+    idx->parse = parser ? parser : regidx_parse_tab;
+    idx->usr   = usr_dat;
+    idx->seq2regs = khash_str2int_init();
+    idx->payload_size = payload_size;
+    if ( payload_size ) idx->payload = malloc(payload_size);
+
+    kstring_t tmp = {0,0,0};
+    const char *ss = str;
+    while ( *ss )
+    {
+        while ( *ss && isspace(*ss) ) ss++;
+        const char *se = ss;
+        while ( *se && *se!='\r' && *se!='\n' ) se++;
+        tmp.l = 0;
+        kputsn(ss, se-ss, &tmp);
+        regidx_insert(idx,tmp.s);
+        while ( *se && isspace(*se) ) se++;
+        ss = se;
+    }
+    free(tmp.s);
+    return idx;
+}
+
  regidx_t *regidx_init(const char *fname, regidx_parse_f parser, regidx_free_f free_f, size_t payload_size, void *usr_dat)
  {
      if ( !parser )
@@ -597,4 +626,13 @@ int regitr_loop(regitr_t *regitr)
  }
  
  
+void regitr_copy(regitr_t *dst, regitr_t *src)
+{
+    _itr_t *dst_itr = (_itr_t*) dst->itr;
+    _itr_t *src_itr = (_itr_t*) src->itr;
+    *dst_itr = *src_itr;
+    *dst = *src;
+    dst->itr = dst_itr;
+}
+
  
diff --git a/bcftools/regidx.h b/bcftools/regidx.h

index 8e25fe176052fbe5e20910a996db2c7a819777c3..fe0a897e4ae550f0602a2fee5af990aafdee686f 100644 (file)
--- a/bcftools/regidx.h
+++ b/bcftools/regidx.h
@@ -109,6 +109,8 @@ int regidx_parse_reg(const char*,char**,char**,uint32_t*,uint32_t*,void*,void*);
  
  /*
   *  regidx_init() - creates new index
+ *  regidx_init_string() - creates new index, from a string rather than from a file
+ *
   *  @param fname:  input file name or NULL if regions will be added one-by-one via regidx_insert()
   *  @param parsef: regidx_parse_bed, regidx_parse_tab or see description of regidx_parse_f. If NULL,
   *                 the format will be autodected, currently either regidx_parse_tab (the default) or
@@ -121,6 +123,7 @@ int regidx_parse_reg(const char*,char**,char**,uint32_t*,uint32_t*,void*,void*);
   *  Returns index on success or NULL on error.
   */
  regidx_t *regidx_init(const char *fname, regidx_parse_f parsef, regidx_free_f freef, size_t payload_size, void *usr);
+regidx_t *regidx_init_string(const char *string, regidx_parse_f parsef, regidx_free_f freef, size_t payload_size, void *usr);
  
  /*
   *  regidx_destroy() - free memory allocated by regidx_init
@@ -184,6 +187,11 @@ int regitr_overlap(regitr_t *itr);
   */
  int regitr_loop(regitr_t *itr);
  
+/*
+ *  regitr_copy() - create a copy of an iterator for a repeated iteration with regitr_loop
+ */
+void regitr_copy(regitr_t *dst, regitr_t *src);
+
  #ifdef __cplusplus
  }
  #endif
diff --git a/bcftools/reheader.c b/bcftools/reheader.c

index 30a441c059932bfba8726ed5650fefb9c9006991..2776069009a718d24b854cb891d884b143dd173c 100644 (file)
--- a/bcftools/reheader.c
+++ b/bcftools/reheader.c
@@ -1,6 +1,6 @@
  /*  reheader.c -- reheader subcommand.
  
-    Copyright (C) 2014,2016 Genome Research Ltd.
+    Copyright (C) 2014-2017 Genome Research Ltd.
  
      Author: Petr Danecek <pd3@sanger.ac.uk>
  
@@ -467,7 +467,7 @@ static void usage(args_t *args)
      exit(1);
  }
  
-int main_bcftools_reheader(int argc, char *argv[])
+int main_reheader(int argc, char *argv[])
  {
      int c;
      args_t *args  = (args_t*) calloc(1,sizeof(args_t));
diff --git a/bcftools/reheader.c.pysam.c b/bcftools/reheader.c.pysam.c

index 803b4833d6ef4620e4b0819eb8f79a36fe59e904..11c0870a98f41e4733b4522b70410a12fd707356 100644 (file)
--- a/bcftools/reheader.c.pysam.c
+++ b/bcftools/reheader.c.pysam.c
@@ -2,7 +2,7 @@
  
  /*  reheader.c -- reheader subcommand.
  
-    Copyright (C) 2014,2016 Genome Research Ltd.
+    Copyright (C) 2014-2017 Genome Research Ltd.
  
      Author: Petr Danecek <pd3@sanger.ac.uk>
  
@@ -469,7 +469,7 @@ static void usage(args_t *args)
      exit(1);
  }
  
-int main_bcftools_reheader(int argc, char *argv[])
+int main_reheader(int argc, char *argv[])
  {
      int c;
      args_t *args  = (args_t*) calloc(1,sizeof(args_t));
diff --git a/bcftools/vcfannotate.c b/bcftools/vcfannotate.c

index e6efda9463dee9fccd35505e56a7699f1c88ae1c..abea98f9099cd7ea69a9a8dd6696d6133837c0dc 100644 (file)
--- a/bcftools/vcfannotate.c
+++ b/bcftools/vcfannotate.c
@@ -1,6 +1,6 @@
  /*  vcfannotate.c -- Annotate and edit VCF/BCF files.
  
-    Copyright (C) 2013-2016 Genome Research Ltd.
+    Copyright (C) 2013-2018 Genome Research Ltd.
  
      Author: Petr Danecek <pd3@sanger.ac.uk>
  
@@ -95,6 +95,7 @@ typedef struct _args_t
      filter_t *filter;
      char *filter_str;
      int filter_logic;   // include or exclude sites which match the filters? One of FLT_INCLUDE/FLT_EXCLUDE
+    int keep_sites;
  
      rm_tag_t *rm;           // tags scheduled for removal
      int nrm;
@@ -263,7 +264,7 @@ static void init_remove_annots(args_t *args)
                  tag->key = strdup(str.s);
                  tag->hdr_id = bcf_hdr_id2int(args->hdr, BCF_DT_ID, tag->key);
                  if ( !bcf_hdr_idinfo_exists(args->hdr,BCF_HL_FLT,tag->hdr_id) ) error("Cannot remove %s, not defined in the header.\n", str.s);
-                bcf_hdr_remove(args->hdr_out,BCF_HL_FLT,tag->key);
+                if ( !args->keep_sites ) bcf_hdr_remove(args->hdr_out,BCF_HL_FLT,tag->key);
              }
              else
              {
@@ -293,32 +294,35 @@ static void init_remove_annots(args_t *args)
                  tag->key = strdup(str.s);
                  if ( type==BCF_HL_INFO ) tag->handler = remove_info_tag;
                  else if ( type==BCF_HL_FMT ) tag->handler = remove_format_tag;
-                bcf_hdr_remove(args->hdr_out,type,tag->key);
+                if ( !args->keep_sites ) bcf_hdr_remove(args->hdr_out,type,tag->key);
              }
          }
          else if ( !strcasecmp("ID",str.s) ) tag->handler = remove_id;
          else if ( !strcasecmp("FILTER",str.s) )
          {
              tag->handler = remove_filter;
-            remove_hdr_lines(args->hdr_out,BCF_HL_FLT);
+            if ( !args->keep_sites ) remove_hdr_lines(args->hdr_out,BCF_HL_FLT);
          }
          else if ( !strcasecmp("QUAL",str.s) ) tag->handler = remove_qual;
          else if ( !strcasecmp("INFO",str.s) ) 
          {
              tag->handler = remove_info;
-            remove_hdr_lines(args->hdr_out,BCF_HL_INFO);
+            if ( !args->keep_sites ) remove_hdr_lines(args->hdr_out,BCF_HL_INFO);
          }
          else if ( !strcasecmp("FMT",str.s) || !strcasecmp("FORMAT",str.s) )
          {
              tag->handler = remove_format;
-            remove_hdr_lines(args->hdr_out,BCF_HL_FMT);
+            if ( !args->keep_sites ) remove_hdr_lines(args->hdr_out,BCF_HL_FMT);
          }
          else if ( str.l )
          {
-            if ( str.s[0]=='#' && str.s[1]=='#' )
-                bcf_hdr_remove(args->hdr_out,BCF_HL_GEN,str.s+2);
-            else
-                bcf_hdr_remove(args->hdr_out,BCF_HL_STR,str.s);
+            if ( !args->keep_sites )
+            {
+                if ( str.s[0]=='#' && str.s[1]=='#' )
+                    bcf_hdr_remove(args->hdr_out,BCF_HL_GEN,str.s+2);
+                else
+                    bcf_hdr_remove(args->hdr_out,BCF_HL_STR,str.s);
+            }
              args->nrm--;
          }
  
@@ -354,7 +358,7 @@ static void init_remove_annots(args_t *args)
                  tag->hdr_id = bcf_hdr_id2int(args->hdr, BCF_DT_ID, hrec->vals[k]);
              }
              tag->key = strdup(hrec->vals[k]);
-            bcf_hdr_remove(args->hdr_out,hrec->type,tag->key);
+            if ( !args->keep_sites ) bcf_hdr_remove(args->hdr_out,hrec->type,tag->key);
          }
      }
      khash_str2int_destroy_free(keep);
@@ -1870,6 +1874,7 @@ static void usage(args_t *args)
      fprintf(stderr, "   -h, --header-lines <file>      lines which should be appended to the VCF header\n");
      fprintf(stderr, "   -I, --set-id [+]<format>       set ID column, see man page for details\n");
      fprintf(stderr, "   -i, --include <expr>           select sites for which the expression is true (see man page for details)\n");
+    fprintf(stderr, "   -k, --keep-sites               leave -i/-e sites unchanged instead of discarding them\n");
      fprintf(stderr, "   -m, --mark-sites [+-]<tag>     add INFO/tag flag to sites which are (\"+\") or are not (\"-\") listed in the -a file\n");
      fprintf(stderr, "       --no-version               do not append version and command line to the header\n");
      fprintf(stderr, "   -o, --output <file>            write output to a file [standard output]\n");
@@ -1879,7 +1884,7 @@ static void usage(args_t *args)
      fprintf(stderr, "       --rename-chrs <file>       rename sequences according to map file: from\\tto\n");
      fprintf(stderr, "   -s, --samples [^]<list>        comma separated list of samples to annotate (or exclude with \"^\" prefix)\n");
      fprintf(stderr, "   -S, --samples-file [^]<file>   file of samples to annotate (or exclude with \"^\" prefix)\n");
-    fprintf(stderr, "   -x, --remove <list>            list of annotations to remove (e.g. ID,INFO/DP,FORMAT/DP,FILTER). See man page for details\n");
+    fprintf(stderr, "   -x, --remove <list>            list of annotations (e.g. ID,INFO/DP,FORMAT/DP,FILTER) to remove (or keep with \"^\" prefix). See man page for details\n");
      fprintf(stderr, "       --threads <int>            number of extra output compression threads [0]\n");
      fprintf(stderr, "\n");
      exit(1);
@@ -1901,6 +1906,7 @@ int main_vcfannotate(int argc, char *argv[])
  
      static struct option loptions[] =
      {
+        {"keep-sites",required_argument,NULL,'k'},
          {"mark-sites",required_argument,NULL,'m'},
          {"set-id",required_argument,NULL,'I'},
          {"output",required_argument,NULL,'o'},
@@ -1921,9 +1927,10 @@ int main_vcfannotate(int argc, char *argv[])
          {"no-version",no_argument,NULL,8},
          {NULL,0,NULL,0}
      };
-    while ((c = getopt_long(argc, argv, "h:?o:O:r:R:a:x:c:i:e:S:s:I:m:",loptions,NULL)) >= 0)
+    while ((c = getopt_long(argc, argv, "h:?o:O:r:R:a:x:c:i:e:S:s:I:m:k",loptions,NULL)) >= 0)
      {
          switch (c) {
+            case 'k': args->keep_sites = 1; break;
              case 'm': 
                  args->mark_sites_logic = MARK_LISTED;
                  if ( optarg[0]=='+' ) args->mark_sites = optarg+1;
@@ -2008,7 +2015,11 @@ int main_vcfannotate(int argc, char *argv[])
          {
              int pass = filter_test(args->filter, line, NULL);
              if ( args->filter_logic & FLT_EXCLUDE ) pass = pass ? 0 : 1;
-            if ( !pass ) continue;
+            if ( !pass ) 
+            {
+                if ( args->keep_sites ) bcf_write1(args->out_fh, args->hdr_out, line);
+                continue;
+            }
          }
          annotate(args, line);
          bcf_write1(args->out_fh, args->hdr_out, line);
diff --git a/bcftools/vcfannotate.c.pysam.c b/bcftools/vcfannotate.c.pysam.c

index 72824f7b7810f6efa1d0af1999d5073fbc482570..87a6cc4efd81a69396030428697a782593606cc9 100644 (file)
--- a/bcftools/vcfannotate.c.pysam.c
+++ b/bcftools/vcfannotate.c.pysam.c
@@ -2,7 +2,7 @@
  
  /*  vcfannotate.c -- Annotate and edit VCF/BCF files.
  
-    Copyright (C) 2013-2016 Genome Research Ltd.
+    Copyright (C) 2013-2018 Genome Research Ltd.
  
      Author: Petr Danecek <pd3@sanger.ac.uk>
  
@@ -97,6 +97,7 @@ typedef struct _args_t
      filter_t *filter;
      char *filter_str;
      int filter_logic;   // include or exclude sites which match the filters? One of FLT_INCLUDE/FLT_EXCLUDE
+    int keep_sites;
  
      rm_tag_t *rm;           // tags scheduled for removal
      int nrm;
@@ -265,7 +266,7 @@ static void init_remove_annots(args_t *args)
                  tag->key = strdup(str.s);
                  tag->hdr_id = bcf_hdr_id2int(args->hdr, BCF_DT_ID, tag->key);
                  if ( !bcf_hdr_idinfo_exists(args->hdr,BCF_HL_FLT,tag->hdr_id) ) error("Cannot remove %s, not defined in the header.\n", str.s);
-                bcf_hdr_remove(args->hdr_out,BCF_HL_FLT,tag->key);
+                if ( !args->keep_sites ) bcf_hdr_remove(args->hdr_out,BCF_HL_FLT,tag->key);
              }
              else
              {
@@ -295,32 +296,35 @@ static void init_remove_annots(args_t *args)
                  tag->key = strdup(str.s);
                  if ( type==BCF_HL_INFO ) tag->handler = remove_info_tag;
                  else if ( type==BCF_HL_FMT ) tag->handler = remove_format_tag;
-                bcf_hdr_remove(args->hdr_out,type,tag->key);
+                if ( !args->keep_sites ) bcf_hdr_remove(args->hdr_out,type,tag->key);
              }
          }
          else if ( !strcasecmp("ID",str.s) ) tag->handler = remove_id;
          else if ( !strcasecmp("FILTER",str.s) )
          {
              tag->handler = remove_filter;
-            remove_hdr_lines(args->hdr_out,BCF_HL_FLT);
+            if ( !args->keep_sites ) remove_hdr_lines(args->hdr_out,BCF_HL_FLT);
          }
          else if ( !strcasecmp("QUAL",str.s) ) tag->handler = remove_qual;
          else if ( !strcasecmp("INFO",str.s) ) 
          {
              tag->handler = remove_info;
-            remove_hdr_lines(args->hdr_out,BCF_HL_INFO);
+            if ( !args->keep_sites ) remove_hdr_lines(args->hdr_out,BCF_HL_INFO);
          }
          else if ( !strcasecmp("FMT",str.s) || !strcasecmp("FORMAT",str.s) )
          {
              tag->handler = remove_format;
-            remove_hdr_lines(args->hdr_out,BCF_HL_FMT);
+            if ( !args->keep_sites ) remove_hdr_lines(args->hdr_out,BCF_HL_FMT);
          }
          else if ( str.l )
          {
-            if ( str.s[0]=='#' && str.s[1]=='#' )
-                bcf_hdr_remove(args->hdr_out,BCF_HL_GEN,str.s+2);
-            else
-                bcf_hdr_remove(args->hdr_out,BCF_HL_STR,str.s);
+            if ( !args->keep_sites )
+            {
+                if ( str.s[0]=='#' && str.s[1]=='#' )
+                    bcf_hdr_remove(args->hdr_out,BCF_HL_GEN,str.s+2);
+                else
+                    bcf_hdr_remove(args->hdr_out,BCF_HL_STR,str.s);
+            }
              args->nrm--;
          }
  
@@ -356,7 +360,7 @@ static void init_remove_annots(args_t *args)
                  tag->hdr_id = bcf_hdr_id2int(args->hdr, BCF_DT_ID, hrec->vals[k]);
              }
              tag->key = strdup(hrec->vals[k]);
-            bcf_hdr_remove(args->hdr_out,hrec->type,tag->key);
+            if ( !args->keep_sites ) bcf_hdr_remove(args->hdr_out,hrec->type,tag->key);
          }
      }
      khash_str2int_destroy_free(keep);
@@ -1872,6 +1876,7 @@ static void usage(args_t *args)
      fprintf(bcftools_stderr, "   -h, --header-lines <file>      lines which should be appended to the VCF header\n");
      fprintf(bcftools_stderr, "   -I, --set-id [+]<format>       set ID column, see man page for details\n");
      fprintf(bcftools_stderr, "   -i, --include <expr>           select sites for which the expression is true (see man page for details)\n");
+    fprintf(bcftools_stderr, "   -k, --keep-sites               leave -i/-e sites unchanged instead of discarding them\n");
      fprintf(bcftools_stderr, "   -m, --mark-sites [+-]<tag>     add INFO/tag flag to sites which are (\"+\") or are not (\"-\") listed in the -a file\n");
      fprintf(bcftools_stderr, "       --no-version               do not append version and command line to the header\n");
      fprintf(bcftools_stderr, "   -o, --output <file>            write output to a file [standard output]\n");
@@ -1881,7 +1886,7 @@ static void usage(args_t *args)
      fprintf(bcftools_stderr, "       --rename-chrs <file>       rename sequences according to map file: from\\tto\n");
      fprintf(bcftools_stderr, "   -s, --samples [^]<list>        comma separated list of samples to annotate (or exclude with \"^\" prefix)\n");
      fprintf(bcftools_stderr, "   -S, --samples-file [^]<file>   file of samples to annotate (or exclude with \"^\" prefix)\n");
-    fprintf(bcftools_stderr, "   -x, --remove <list>            list of annotations to remove (e.g. ID,INFO/DP,FORMAT/DP,FILTER). See man page for details\n");
+    fprintf(bcftools_stderr, "   -x, --remove <list>            list of annotations (e.g. ID,INFO/DP,FORMAT/DP,FILTER) to remove (or keep with \"^\" prefix). See man page for details\n");
      fprintf(bcftools_stderr, "       --threads <int>            number of extra output compression threads [0]\n");
      fprintf(bcftools_stderr, "\n");
      exit(1);
@@ -1903,6 +1908,7 @@ int main_vcfannotate(int argc, char *argv[])
  
      static struct option loptions[] =
      {
+        {"keep-sites",required_argument,NULL,'k'},
          {"mark-sites",required_argument,NULL,'m'},
          {"set-id",required_argument,NULL,'I'},
          {"output",required_argument,NULL,'o'},
@@ -1923,9 +1929,10 @@ int main_vcfannotate(int argc, char *argv[])
          {"no-version",no_argument,NULL,8},
          {NULL,0,NULL,0}
      };
-    while ((c = getopt_long(argc, argv, "h:?o:O:r:R:a:x:c:i:e:S:s:I:m:",loptions,NULL)) >= 0)
+    while ((c = getopt_long(argc, argv, "h:?o:O:r:R:a:x:c:i:e:S:s:I:m:k",loptions,NULL)) >= 0)
      {
          switch (c) {
+            case 'k': args->keep_sites = 1; break;
              case 'm': 
                  args->mark_sites_logic = MARK_LISTED;
                  if ( optarg[0]=='+' ) args->mark_sites = optarg+1;
@@ -2010,7 +2017,11 @@ int main_vcfannotate(int argc, char *argv[])
          {
              int pass = filter_test(args->filter, line, NULL);
              if ( args->filter_logic & FLT_EXCLUDE ) pass = pass ? 0 : 1;
-            if ( !pass ) continue;
+            if ( !pass ) 
+            {
+                if ( args->keep_sites ) bcf_write1(args->out_fh, args->hdr_out, line);
+                continue;
+            }
          }
          annotate(args, line);
          bcf_write1(args->out_fh, args->hdr_out, line);
diff --git a/bcftools/vcfcnv.c b/bcftools/vcfcnv.c

index 11c55bddf71ad9af61a64913ae9f7399b55cd058..abfef54015ce29a33bc58be1db5b9b883423c06b 100644 (file)
--- a/bcftools/vcfcnv.c
+++ b/bcftools/vcfcnv.c
@@ -1,6 +1,6 @@
  /* The MIT License
  
-   Copyright (c) 2014-2015 Genome Research Ltd.
+   Copyright (c) 2014-2018 Genome Research Ltd.
  
     Author: Petr Danecek <pd3@sanger.ac.uk>
     
@@ -338,7 +338,7 @@ static void plot_sample(args_t *args, sample_t *smpl)
              "csv.register_dialect('tab', delimiter='\\t', quoting=csv.QUOTE_NONE)\n"
              "\n"
              "dat = {}\n"
-            "with open('%s', 'rb') as f:\n"
+            "with open('%s', 'r') as f:\n"
              "    reader = csv.reader(f, 'tab')\n"
              "    for row in reader:\n"
              "        chr = row[0]\n"
@@ -347,7 +347,7 @@ static void plot_sample(args_t *args, sample_t *smpl)
              "        dat[chr].append([row[1], float(row[2]), float(row[3])])\n"
              "\n"
              "cnv = {}\n"
-            "with open('%s', 'rb') as f:\n"
+            "with open('%s', 'r') as f:\n"
              "    reader = csv.reader(f, 'tab')\n"
              "    for row in reader:\n"
              "        chr = row[0]\n"
@@ -429,7 +429,7 @@ static void create_plots(args_t *args)
              "\n"
              "def chroms_to_plot(th):\n"
              "   dat = {}\n"
-            "   with open('%s/summary.tab', 'rb') as f:\n"
+            "   with open('%s/summary.tab', 'r') as f:\n"
              "       reader = csv.reader(f, 'tab')\n"
              "       for row in reader:\n"
              "           if row[0]!='RG': continue\n"
@@ -451,14 +451,14 @@ static void create_plots(args_t *args)
              "   plot_chroms = chroms_to_plot(args.plot_threshold)\n"
              "\n"
              "def read_dat(file,dat,plot_chr):\n"
-            "   with open(file, 'rb') as f:\n"
+            "   with open(file, 'r') as f:\n"
              "       reader = csv.reader(f, 'tab')\n"
              "       for row in reader:\n"
              "           chr = row[0]\n"
              "           if chr != plot_chr: continue\n"
              "           dat.append([row[1], float(row[2]), float(row[3])])\n"
              "def read_cnv(file,cnv,plot_chr):\n"
-            "   with open(file, 'rb') as f:\n"
+            "   with open(file, 'r') as f:\n"
              "       reader = csv.reader(f, 'tab')\n"
              "       for row in reader:\n"
              "           chr = row[0]\n"
diff --git a/bcftools/vcfcnv.c.pysam.c b/bcftools/vcfcnv.c.pysam.c

index db4dffcb04613b5d6ac0737e8a3d87966a9484fc..29b0cb5f14edf8bc15413c5acab7be83485f0132 100644 (file)
--- a/bcftools/vcfcnv.c.pysam.c
+++ b/bcftools/vcfcnv.c.pysam.c
@@ -2,7 +2,7 @@
  
  /* The MIT License
  
-   Copyright (c) 2014-2015 Genome Research Ltd.
+   Copyright (c) 2014-2018 Genome Research Ltd.
  
     Author: Petr Danecek <pd3@sanger.ac.uk>
     
@@ -340,7 +340,7 @@ static void plot_sample(args_t *args, sample_t *smpl)
              "csv.register_dialect('tab', delimiter='\\t', quoting=csv.QUOTE_NONE)\n"
              "\n"
              "dat = {}\n"
-            "with open('%s', 'rb') as f:\n"
+            "with open('%s', 'r') as f:\n"
              "    reader = csv.reader(f, 'tab')\n"
              "    for row in reader:\n"
              "        chr = row[0]\n"
@@ -349,7 +349,7 @@ static void plot_sample(args_t *args, sample_t *smpl)
              "        dat[chr].append([row[1], float(row[2]), float(row[3])])\n"
              "\n"
              "cnv = {}\n"
-            "with open('%s', 'rb') as f:\n"
+            "with open('%s', 'r') as f:\n"
              "    reader = csv.reader(f, 'tab')\n"
              "    for row in reader:\n"
              "        chr = row[0]\n"
@@ -431,7 +431,7 @@ static void create_plots(args_t *args)
              "\n"
              "def chroms_to_plot(th):\n"
              "   dat = {}\n"
-            "   with open('%s/summary.tab', 'rb') as f:\n"
+            "   with open('%s/summary.tab', 'r') as f:\n"
              "       reader = csv.reader(f, 'tab')\n"
              "       for row in reader:\n"
              "           if row[0]!='RG': continue\n"
@@ -453,14 +453,14 @@ static void create_plots(args_t *args)
              "   plot_chroms = chroms_to_plot(args.plot_threshold)\n"
              "\n"
              "def read_dat(file,dat,plot_chr):\n"
-            "   with open(file, 'rb') as f:\n"
+            "   with open(file, 'r') as f:\n"
              "       reader = csv.reader(f, 'tab')\n"
              "       for row in reader:\n"
              "           chr = row[0]\n"
              "           if chr != plot_chr: continue\n"
              "           dat.append([row[1], float(row[2]), float(row[3])])\n"
              "def read_cnv(file,cnv,plot_chr):\n"
-            "   with open(file, 'rb') as f:\n"
+            "   with open(file, 'r') as f:\n"
              "       reader = csv.reader(f, 'tab')\n"
              "       for row in reader:\n"
              "           chr = row[0]\n"
diff --git a/bcftools/vcfconvert.c b/bcftools/vcfconvert.c

index 1e28ad85743bd7bf4d13c60d1d51d640436d992d..f815e25cb3426225e1391211207a96fe2980da37 100644 (file)
--- a/bcftools/vcfconvert.c
+++ b/bcftools/vcfconvert.c
@@ -1,6 +1,6 @@
  /*  vcfconvert.c -- convert between VCF/BCF and related formats.
  
-    Copyright (C) 2013-2014 Genome Research Ltd.
+    Copyright (C) 2013-2017 Genome Research Ltd.
  
      Author: Petr Danecek <pd3@sanger.ac.uk>
  
diff --git a/bcftools/vcfconvert.c.pysam.c b/bcftools/vcfconvert.c.pysam.c

index a054ca8b31f9530c0a3739e484781f768370ce11..a0ec8b9a5f934b2d49ac969bd52509a06cd0ab31 100644 (file)
--- a/bcftools/vcfconvert.c.pysam.c
+++ b/bcftools/vcfconvert.c.pysam.c
@@ -2,7 +2,7 @@
  
  /*  vcfconvert.c -- convert between VCF/BCF and related formats.
  
-    Copyright (C) 2013-2014 Genome Research Ltd.
+    Copyright (C) 2013-2017 Genome Research Ltd.
  
      Author: Petr Danecek <pd3@sanger.ac.uk>
  
diff --git a/bcftools/vcfgtcheck.c b/bcftools/vcfgtcheck.c

index 8835db367a40f55860c4c131086ae6950799fcc3..c4d77e19cd1a0f0324c49a039444801775f1dc6f 100644 (file)
--- a/bcftools/vcfgtcheck.c
+++ b/bcftools/vcfgtcheck.c
@@ -1,6 +1,6 @@
  /*  vcfgtcheck.c -- Check sample identity.
  
-    Copyright (C) 2013-2014 Genome Research Ltd.
+    Copyright (C) 2013-2018 Genome Research Ltd.
  
      Author: Petr Danecek <pd3@sanger.ac.uk>
  
@@ -81,7 +81,7 @@ static void plot_check(args_t *args, char *target_sample, char *query_sample)
              "sample_ids = False\n"
              "\n"
              "dat = []\n"
-            "with open('%s.tab', 'rb') as f:\n"
+            "with open('%s.tab', 'r') as f:\n"
              "    reader = csv.reader(f, 'tab')\n"
              "    for row in reader:\n"
              "        if row[0][0]=='#': continue\n"
@@ -153,7 +153,7 @@ static void plot_cross_check(args_t *args)
              "dat   = None\n"
              "min   = None\n"
              "max   = None\n"
-            "with open('%s.tab', 'rb') as f:\n"
+            "with open('%s.tab', 'r') as f:\n"
              "   reader = csv.reader(f, 'tab')\n"
              "   i = 0\n"
              "   for row in reader:\n"
diff --git a/bcftools/vcfgtcheck.c.pysam.c b/bcftools/vcfgtcheck.c.pysam.c

index 6a6fa58639c19884249e46e6a36a9cd4dfdebb5e..462e4041df5db9e1f01440dccefd829d78328e25 100644 (file)
--- a/bcftools/vcfgtcheck.c.pysam.c
+++ b/bcftools/vcfgtcheck.c.pysam.c
@@ -2,7 +2,7 @@
  
  /*  vcfgtcheck.c -- Check sample identity.
  
-    Copyright (C) 2013-2014 Genome Research Ltd.
+    Copyright (C) 2013-2018 Genome Research Ltd.
  
      Author: Petr Danecek <pd3@sanger.ac.uk>
  
@@ -83,7 +83,7 @@ static void plot_check(args_t *args, char *target_sample, char *query_sample)
              "sample_ids = False\n"
              "\n"
              "dat = []\n"
-            "with open('%s.tab', 'rb') as f:\n"
+            "with open('%s.tab', 'r') as f:\n"
              "    reader = csv.reader(f, 'tab')\n"
              "    for row in reader:\n"
              "        if row[0][0]=='#': continue\n"
@@ -155,7 +155,7 @@ static void plot_cross_check(args_t *args)
              "dat   = None\n"
              "min   = None\n"
              "max   = None\n"
-            "with open('%s.tab', 'rb') as f:\n"
+            "with open('%s.tab', 'r') as f:\n"
              "   reader = csv.reader(f, 'tab')\n"
              "   i = 0\n"
              "   for row in reader:\n"
diff --git a/bcftools/vcfnorm.c b/bcftools/vcfnorm.c

index bc51018f8552a886c3e1eb99325c865c09baf9c7..ead1e8951d9ae02fa30d7d8ad7880b88d87fcc4b 100644 (file)
--- a/bcftools/vcfnorm.c
+++ b/bcftools/vcfnorm.c
@@ -1,6 +1,6 @@
  /*  vcfnorm.c -- Left-align and normalize indels.
  
-    Copyright (C) 2013-2016 Genome Research Ltd.
+    Copyright (C) 2013-2017 Genome Research Ltd.
  
      Author: Petr Danecek <pd3@sanger.ac.uk>
  
@@ -34,6 +34,7 @@ THE SOFTWARE.  */
  #include <htslib/vcf.h>
  #include <htslib/synced_bcf_reader.h>
  #include <htslib/faidx.h>
+#include <htslib/khash_str2int.h>
  #include "bcftools.h"
  #include "rbuf.h"
  
@@ -53,6 +54,15 @@ typedef struct
  }
  map_t;
  
+// primitive comparison of two records' alleles via hashes; normalized alleles assumed
+typedef struct
+{
+    int n;  // number of alleles
+    char *ref, *alt;
+    void *hash;
+}
+cmpals_t;
+
  typedef struct
  {
      char *tseq, *seq;
@@ -72,6 +82,8 @@ typedef struct
      int aln_win;            // the realignment window size (maximum repeat size)
      bcf_srs_t *files;       // using the synced reader only for -r option
      bcf_hdr_t *hdr;
+    cmpals_t *cmpals;
+    int ncmpals, mcmpals;
      faidx_t *fai;
      struct { int tot, set, swap; } nref;
      char **argv, *output_fname, *ref_fname, *vcf_fname, *region, *targets;
@@ -94,7 +106,7 @@ static inline int replace_iupac_codes(char *seq, int nseq)
  }
  static inline int has_non_acgtn(char *seq, int nseq)
  {
-    char *end = nseq ? seq + nseq : seq + UINT32_MAX;   // arbitrary large number
+    char *end = seq + nseq;
      while ( *seq && seq<end )
      {
          char c = toupper(*seq);
@@ -326,7 +338,7 @@ static int realign(args_t *args, bcf1_t *line)
      {
          if ( line->d.allele[i][0]=='<' ) return ERR_SYMBOLIC;  // symbolic allele
          if ( line->d.allele[i][0]=='*' ) return ERR_SPANNING_DELETION;  // spanning deletion
-        if ( has_non_acgtn(line->d.allele[i],0) )
+        if ( has_non_acgtn(line->d.allele[i],line->shared.l) )
          {
              if ( args->check_ref==CHECK_REF_EXIT )
                  error("Non-ACGTN alternate allele at %s:%d .. REF_SEQ:'%s' vs VCF:'%s'\n", bcf_seqname(args->hdr,line),line->pos+1,ref,line->d.allele[i]);
@@ -1510,6 +1522,69 @@ static bcf1_t *mrows_flush(args_t *args)
      }
      return NULL;
  }
+static void cmpals_add(args_t *args, bcf1_t *rec)
+{
+    args->ncmpals++;
+    hts_expand0(cmpals_t, args->ncmpals, args->mcmpals, args->cmpals);
+    cmpals_t *cmpals = args->cmpals + args->ncmpals - 1;
+    free(cmpals->ref);
+    cmpals->ref = strdup(rec->d.allele[0]);
+    cmpals->n   = rec->n_allele;
+    if ( rec->n_allele==2 )
+    {
+        free(cmpals->alt);
+        cmpals->alt = strdup(rec->d.allele[1]);
+    }
+    else
+    {
+        if ( cmpals->hash ) khash_str2int_destroy_free(cmpals->hash);
+        cmpals->hash = khash_str2int_init();
+        int i;
+        for (i=1; i<rec->n_allele; i++)
+            khash_str2int_inc(cmpals->hash, strdup(rec->d.allele[i]));
+    }
+}
+static int cmpals_match(args_t *args, bcf1_t *rec)
+{
+    int i, j;
+    for (i=0; i<args->ncmpals; i++)
+    {
+        cmpals_t *cmpals = args->cmpals + i;
+        if ( rec->n_allele != cmpals->n ) continue;
+
+        // NB. assuming both are normalized
+        if ( strcmp(rec->d.allele[0], cmpals->ref) ) continue;
+
+        // the most frequent case
+        if ( rec->n_allele==2 )
+        {
+            if ( strcmp(rec->d.allele[1], cmpals->alt) ) continue;
+            return 1;
+        }
+
+        khash_t(str2int) *hash = (khash_t(str2int)*) cmpals->hash;
+        for (j=1; j<rec->n_allele; j++) 
+            if ( !khash_str2int_has_key(hash, rec->d.allele[j]) ) break;
+        if ( j<rec->n_allele ) continue;
+        return 1;
+    }
+    cmpals_add(args, rec);
+    return 0;
+}
+static void cmpals_reset(args_t *args) { args->ncmpals = 0; }
+static void cmpals_destroy(args_t *args)
+{
+    int i;
+    for (i=0; i<args->mcmpals; i++)
+    {
+        cmpals_t *cmpals = args->cmpals + i;
+        free(cmpals->ref);
+        free(cmpals->alt);
+        if ( cmpals->hash ) khash_str2int_destroy_free(cmpals->hash);
+    }
+    free(args->cmpals);
+}
+
  static void flush_buffer(args_t *args, htsFile *file, int n)
  {
      bcf1_t *line;
@@ -1540,17 +1615,20 @@ static void flush_buffer(args_t *args, htsFile *file, int n)
              int line_type = bcf_get_variant_types(args->lines[k]);
              if ( prev_rid>=0 && prev_rid==args->lines[k]->rid && prev_pos==args->lines[k]->pos )
              {
-                if ( (args->rmdup>>1)&COLLAPSE_ANY ) continue;
-                if ( (args->rmdup>>1)&COLLAPSE_SNPS && line_type&(VCF_SNP|VCF_MNP) && prev_type&(VCF_SNP|VCF_MNP) ) continue;
-                if ( (args->rmdup>>1)&COLLAPSE_INDELS && line_type&(VCF_INDEL) && prev_type&(VCF_INDEL) ) continue;
+                if ( args->rmdup & BCF_SR_PAIR_ANY ) continue;    // rmdup by position only
+                if ( args->rmdup & BCF_SR_PAIR_SNPS && line_type&(VCF_SNP|VCF_MNP) && prev_type&(VCF_SNP|VCF_MNP) ) continue;
+                if ( args->rmdup & BCF_SR_PAIR_INDELS && line_type&(VCF_INDEL) && prev_type&(VCF_INDEL) ) continue;
+                if ( args->rmdup & BCF_SR_PAIR_EXACT && cmpals_match(args, args->lines[k]) ) continue;
              }
              else
              {
                  prev_rid  = args->lines[k]->rid;
                  prev_pos  = args->lines[k]->pos;
                  prev_type = 0;
+                if ( args->rmdup & BCF_SR_PAIR_EXACT ) cmpals_reset(args);
              }
              prev_type |= line_type;
+            if ( args->rmdup & BCF_SR_PAIR_EXACT ) cmpals_add(args, args->lines[k]);
          }
          bcf_write1(file, args->hdr, args->lines[k]);
      }
@@ -1580,6 +1658,7 @@ static void init_data(args_t *args)
  
  static void destroy_data(args_t *args)
  {
+    cmpals_destroy(args);
      int i;
      for (i=0; i<args->rbuf.m; i++)
          if ( args->lines[i] ) bcf_destroy1(args->lines[i]);
@@ -1676,17 +1755,20 @@ static void normalize_vcf(args_t *args)
              int line_type = bcf_get_variant_types(line);
              if ( prev_rid>=0 && prev_rid==line->rid && prev_pos==line->pos )
              {
-                if ( (args->rmdup>>1)&COLLAPSE_ANY ) continue;
-                if ( (args->rmdup>>1)&COLLAPSE_SNPS && line_type&(VCF_SNP|VCF_MNP) && prev_type&(VCF_SNP|VCF_MNP) ) continue;
-                if ( (args->rmdup>>1)&COLLAPSE_INDELS && line_type&(VCF_INDEL) && prev_type&(VCF_INDEL) ) continue;
+                if ( args->rmdup & BCF_SR_PAIR_ANY ) continue;    // rmdup by position only
+                if ( args->rmdup & BCF_SR_PAIR_SNPS && line_type&(VCF_SNP|VCF_MNP) && prev_type&(VCF_SNP|VCF_MNP) ) continue;
+                if ( args->rmdup & BCF_SR_PAIR_INDELS && line_type&(VCF_INDEL) && prev_type&(VCF_INDEL) ) continue;
+                if ( args->rmdup & BCF_SR_PAIR_EXACT && cmpals_match(args, line) ) continue;
              }
              else
              {
                  prev_rid  = line->rid;
                  prev_pos  = line->pos;
                  prev_type = 0;
+                if ( args->rmdup & BCF_SR_PAIR_EXACT ) cmpals_reset(args);
              }
              prev_type |= line_type;
+            if ( args->rmdup & BCF_SR_PAIR_EXACT ) cmpals_add(args, line);
          }
  
          // still on the same chromosome?
@@ -1743,7 +1825,7 @@ static void usage(void)
      fprintf(stderr, "Options:\n");
      fprintf(stderr, "    -c, --check-ref <e|w|x|s>         check REF alleles and exit (e), warn (w), exclude (x), or set (s) bad sites [e]\n");
      fprintf(stderr, "    -D, --remove-duplicates           remove duplicate lines of the same type.\n");
-    fprintf(stderr, "    -d, --rm-dup <type>               remove duplicate snps|indels|both|any\n");
+    fprintf(stderr, "    -d, --rm-dup <type>               remove duplicate snps|indels|both|all|none\n");
      fprintf(stderr, "    -f, --fasta-ref <file>            reference sequence (MANDATORY)\n");
      fprintf(stderr, "    -m, --multiallelics <-|+>[type]   split multiallelics (-) or join biallelics (+), type: snps|indels|both|any [both]\n");
      fprintf(stderr, "        --no-version                  do not append version and command line to the header\n");
@@ -1804,10 +1886,12 @@ int main_vcfnorm(int argc, char *argv[])
          switch (c) {
              case 'N': args->do_indels = 0; break;
              case 'd':
-                if ( !strcmp("snps",optarg) ) args->rmdup = COLLAPSE_SNPS<<1;
-                else if ( !strcmp("indels",optarg) ) args->rmdup = COLLAPSE_INDELS<<1;
-                else if ( !strcmp("both",optarg) ) args->rmdup = COLLAPSE_BOTH<<1;
-                else if ( !strcmp("any",optarg) ) args->rmdup = COLLAPSE_ANY<<1;
+                if ( !strcmp("snps",optarg) ) args->rmdup = BCF_SR_PAIR_SNPS;
+                else if ( !strcmp("indels",optarg) ) args->rmdup = BCF_SR_PAIR_INDELS;
+                else if ( !strcmp("both",optarg) ) args->rmdup = BCF_SR_PAIR_BOTH;
+                else if ( !strcmp("all",optarg) ) args->rmdup = BCF_SR_PAIR_ANY;
+                else if ( !strcmp("any",optarg) ) args->rmdup = BCF_SR_PAIR_ANY;
+                else if ( !strcmp("none",optarg) ) args->rmdup = BCF_SR_PAIR_EXACT;
                  else error("The argument to -d not recognised: %s\n", optarg);
                  break;
              case 'm':
@@ -1841,7 +1925,7 @@ int main_vcfnorm(int argc, char *argv[])
              case 'o': args->output_fname = optarg; break;
              case 'D':
                  fprintf(stderr,"Warning: `-D` is functional but deprecated, replaced by `-d both`.\n"); 
-                args->rmdup = COLLAPSE_NONE<<1;
+                args->rmdup = BCF_SR_PAIR_EXACT;
                  break;
              case 's': args->strict_filter = 1; break;
              case 'f': args->ref_fname = optarg; break;
diff --git a/bcftools/vcfnorm.c.pysam.c b/bcftools/vcfnorm.c.pysam.c

index 5e3a5fbd6588e66c57d32c4a67aae0ac7e731108..d204de4fb1f343211e21d2f61095f6287009d18c 100644 (file)
--- a/bcftools/vcfnorm.c.pysam.c
+++ b/bcftools/vcfnorm.c.pysam.c
@@ -2,7 +2,7 @@
  
  /*  vcfnorm.c -- Left-align and normalize indels.
  
-    Copyright (C) 2013-2016 Genome Research Ltd.
+    Copyright (C) 2013-2017 Genome Research Ltd.
  
      Author: Petr Danecek <pd3@sanger.ac.uk>
  
@@ -36,6 +36,7 @@ THE SOFTWARE.  */
  #include <htslib/vcf.h>
  #include <htslib/synced_bcf_reader.h>
  #include <htslib/faidx.h>
+#include <htslib/khash_str2int.h>
  #include "bcftools.h"
  #include "rbuf.h"
  
@@ -55,6 +56,15 @@ typedef struct
  }
  map_t;
  
+// primitive comparison of two records' alleles via hashes; normalized alleles assumed
+typedef struct
+{
+    int n;  // number of alleles
+    char *ref, *alt;
+    void *hash;
+}
+cmpals_t;
+
  typedef struct
  {
      char *tseq, *seq;
@@ -74,6 +84,8 @@ typedef struct
      int aln_win;            // the realignment window size (maximum repeat size)
      bcf_srs_t *files;       // using the synced reader only for -r option
      bcf_hdr_t *hdr;
+    cmpals_t *cmpals;
+    int ncmpals, mcmpals;
      faidx_t *fai;
      struct { int tot, set, swap; } nref;
      char **argv, *output_fname, *ref_fname, *vcf_fname, *region, *targets;
@@ -96,7 +108,7 @@ static inline int replace_iupac_codes(char *seq, int nseq)
  }
  static inline int has_non_acgtn(char *seq, int nseq)
  {
-    char *end = nseq ? seq + nseq : seq + UINT32_MAX;   // arbitrary large number
+    char *end = seq + nseq;
      while ( *seq && seq<end )
      {
          char c = toupper(*seq);
@@ -328,7 +340,7 @@ static int realign(args_t *args, bcf1_t *line)
      {
          if ( line->d.allele[i][0]=='<' ) return ERR_SYMBOLIC;  // symbolic allele
          if ( line->d.allele[i][0]=='*' ) return ERR_SPANNING_DELETION;  // spanning deletion
-        if ( has_non_acgtn(line->d.allele[i],0) )
+        if ( has_non_acgtn(line->d.allele[i],line->shared.l) )
          {
              if ( args->check_ref==CHECK_REF_EXIT )
                  error("Non-ACGTN alternate allele at %s:%d .. REF_SEQ:'%s' vs VCF:'%s'\n", bcf_seqname(args->hdr,line),line->pos+1,ref,line->d.allele[i]);
@@ -1512,6 +1524,69 @@ static bcf1_t *mrows_flush(args_t *args)
      }
      return NULL;
  }
+static void cmpals_add(args_t *args, bcf1_t *rec)
+{
+    args->ncmpals++;
+    hts_expand0(cmpals_t, args->ncmpals, args->mcmpals, args->cmpals);
+    cmpals_t *cmpals = args->cmpals + args->ncmpals - 1;
+    free(cmpals->ref);
+    cmpals->ref = strdup(rec->d.allele[0]);
+    cmpals->n   = rec->n_allele;
+    if ( rec->n_allele==2 )
+    {
+        free(cmpals->alt);
+        cmpals->alt = strdup(rec->d.allele[1]);
+    }
+    else
+    {
+        if ( cmpals->hash ) khash_str2int_destroy_free(cmpals->hash);
+        cmpals->hash = khash_str2int_init();
+        int i;
+        for (i=1; i<rec->n_allele; i++)
+            khash_str2int_inc(cmpals->hash, strdup(rec->d.allele[i]));
+    }
+}
+static int cmpals_match(args_t *args, bcf1_t *rec)
+{
+    int i, j;
+    for (i=0; i<args->ncmpals; i++)
+    {
+        cmpals_t *cmpals = args->cmpals + i;
+        if ( rec->n_allele != cmpals->n ) continue;
+
+        // NB. assuming both are normalized
+        if ( strcmp(rec->d.allele[0], cmpals->ref) ) continue;
+
+        // the most frequent case
+        if ( rec->n_allele==2 )
+        {
+            if ( strcmp(rec->d.allele[1], cmpals->alt) ) continue;
+            return 1;
+        }
+
+        khash_t(str2int) *hash = (khash_t(str2int)*) cmpals->hash;
+        for (j=1; j<rec->n_allele; j++) 
+            if ( !khash_str2int_has_key(hash, rec->d.allele[j]) ) break;
+        if ( j<rec->n_allele ) continue;
+        return 1;
+    }
+    cmpals_add(args, rec);
+    return 0;
+}
+static void cmpals_reset(args_t *args) { args->ncmpals = 0; }
+static void cmpals_destroy(args_t *args)
+{
+    int i;
+    for (i=0; i<args->mcmpals; i++)
+    {
+        cmpals_t *cmpals = args->cmpals + i;
+        free(cmpals->ref);
+        free(cmpals->alt);
+        if ( cmpals->hash ) khash_str2int_destroy_free(cmpals->hash);
+    }
+    free(args->cmpals);
+}
+
  static void flush_buffer(args_t *args, htsFile *file, int n)
  {
      bcf1_t *line;
@@ -1542,17 +1617,20 @@ static void flush_buffer(args_t *args, htsFile *file, int n)
              int line_type = bcf_get_variant_types(args->lines[k]);
              if ( prev_rid>=0 && prev_rid==args->lines[k]->rid && prev_pos==args->lines[k]->pos )
              {
-                if ( (args->rmdup>>1)&COLLAPSE_ANY ) continue;
-                if ( (args->rmdup>>1)&COLLAPSE_SNPS && line_type&(VCF_SNP|VCF_MNP) && prev_type&(VCF_SNP|VCF_MNP) ) continue;
-                if ( (args->rmdup>>1)&COLLAPSE_INDELS && line_type&(VCF_INDEL) && prev_type&(VCF_INDEL) ) continue;
+                if ( args->rmdup & BCF_SR_PAIR_ANY ) continue;    // rmdup by position only
+                if ( args->rmdup & BCF_SR_PAIR_SNPS && line_type&(VCF_SNP|VCF_MNP) && prev_type&(VCF_SNP|VCF_MNP) ) continue;
+                if ( args->rmdup & BCF_SR_PAIR_INDELS && line_type&(VCF_INDEL) && prev_type&(VCF_INDEL) ) continue;
+                if ( args->rmdup & BCF_SR_PAIR_EXACT && cmpals_match(args, args->lines[k]) ) continue;
              }
              else
              {
                  prev_rid  = args->lines[k]->rid;
                  prev_pos  = args->lines[k]->pos;
                  prev_type = 0;
+                if ( args->rmdup & BCF_SR_PAIR_EXACT ) cmpals_reset(args);
              }
              prev_type |= line_type;
+            if ( args->rmdup & BCF_SR_PAIR_EXACT ) cmpals_add(args, args->lines[k]);
          }
          bcf_write1(file, args->hdr, args->lines[k]);
      }
@@ -1582,6 +1660,7 @@ static void init_data(args_t *args)
  
  static void destroy_data(args_t *args)
  {
+    cmpals_destroy(args);
      int i;
      for (i=0; i<args->rbuf.m; i++)
          if ( args->lines[i] ) bcf_destroy1(args->lines[i]);
@@ -1678,17 +1757,20 @@ static void normalize_vcf(args_t *args)
              int line_type = bcf_get_variant_types(line);
              if ( prev_rid>=0 && prev_rid==line->rid && prev_pos==line->pos )
              {
-                if ( (args->rmdup>>1)&COLLAPSE_ANY ) continue;
-                if ( (args->rmdup>>1)&COLLAPSE_SNPS && line_type&(VCF_SNP|VCF_MNP) && prev_type&(VCF_SNP|VCF_MNP) ) continue;
-                if ( (args->rmdup>>1)&COLLAPSE_INDELS && line_type&(VCF_INDEL) && prev_type&(VCF_INDEL) ) continue;
+                if ( args->rmdup & BCF_SR_PAIR_ANY ) continue;    // rmdup by position only
+                if ( args->rmdup & BCF_SR_PAIR_SNPS && line_type&(VCF_SNP|VCF_MNP) && prev_type&(VCF_SNP|VCF_MNP) ) continue;
+                if ( args->rmdup & BCF_SR_PAIR_INDELS && line_type&(VCF_INDEL) && prev_type&(VCF_INDEL) ) continue;
+                if ( args->rmdup & BCF_SR_PAIR_EXACT && cmpals_match(args, line) ) continue;
              }
              else
              {
                  prev_rid  = line->rid;
                  prev_pos  = line->pos;
                  prev_type = 0;
+                if ( args->rmdup & BCF_SR_PAIR_EXACT ) cmpals_reset(args);
              }
              prev_type |= line_type;
+            if ( args->rmdup & BCF_SR_PAIR_EXACT ) cmpals_add(args, line);
          }
  
          // still on the same chromosome?
@@ -1745,7 +1827,7 @@ static void usage(void)
      fprintf(bcftools_stderr, "Options:\n");
      fprintf(bcftools_stderr, "    -c, --check-ref <e|w|x|s>         check REF alleles and exit (e), warn (w), exclude (x), or set (s) bad sites [e]\n");
      fprintf(bcftools_stderr, "    -D, --remove-duplicates           remove duplicate lines of the same type.\n");
-    fprintf(bcftools_stderr, "    -d, --rm-dup <type>               remove duplicate snps|indels|both|any\n");
+    fprintf(bcftools_stderr, "    -d, --rm-dup <type>               remove duplicate snps|indels|both|all|none\n");
      fprintf(bcftools_stderr, "    -f, --fasta-ref <file>            reference sequence (MANDATORY)\n");
      fprintf(bcftools_stderr, "    -m, --multiallelics <-|+>[type]   split multiallelics (-) or join biallelics (+), type: snps|indels|both|any [both]\n");
      fprintf(bcftools_stderr, "        --no-version                  do not append version and command line to the header\n");
@@ -1806,10 +1888,12 @@ int main_vcfnorm(int argc, char *argv[])
          switch (c) {
              case 'N': args->do_indels = 0; break;
              case 'd':
-                if ( !strcmp("snps",optarg) ) args->rmdup = COLLAPSE_SNPS<<1;
-                else if ( !strcmp("indels",optarg) ) args->rmdup = COLLAPSE_INDELS<<1;
-                else if ( !strcmp("both",optarg) ) args->rmdup = COLLAPSE_BOTH<<1;
-                else if ( !strcmp("any",optarg) ) args->rmdup = COLLAPSE_ANY<<1;
+                if ( !strcmp("snps",optarg) ) args->rmdup = BCF_SR_PAIR_SNPS;
+                else if ( !strcmp("indels",optarg) ) args->rmdup = BCF_SR_PAIR_INDELS;
+                else if ( !strcmp("both",optarg) ) args->rmdup = BCF_SR_PAIR_BOTH;
+                else if ( !strcmp("all",optarg) ) args->rmdup = BCF_SR_PAIR_ANY;
+                else if ( !strcmp("any",optarg) ) args->rmdup = BCF_SR_PAIR_ANY;
+                else if ( !strcmp("none",optarg) ) args->rmdup = BCF_SR_PAIR_EXACT;
                  else error("The argument to -d not recognised: %s\n", optarg);
                  break;
              case 'm':
@@ -1843,7 +1927,7 @@ int main_vcfnorm(int argc, char *argv[])
              case 'o': args->output_fname = optarg; break;
              case 'D':
                  fprintf(bcftools_stderr,"Warning: `-D` is functional but deprecated, replaced by `-d both`.\n"); 
-                args->rmdup = COLLAPSE_NONE<<1;
+                args->rmdup = BCF_SR_PAIR_EXACT;
                  break;
              case 's': args->strict_filter = 1; break;
              case 'f': args->ref_fname = optarg; break;
diff --git a/bcftools/vcfplugin.c b/bcftools/vcfplugin.c

index a53ac3c958a620c5a98c6a8cf4f2d0e20b37aabf..90292f4e9e300468c41d4740f61ec646edddc6c5 100644 (file)
--- a/bcftools/vcfplugin.c
+++ b/bcftools/vcfplugin.c
@@ -1,6 +1,6 @@
  /*  vcfplugin.c -- plugin modules for operating on VCF/BCF files.
  
-    Copyright (C) 2013-2016 Genome Research Ltd.
+    Copyright (C) 2013-2017 Genome Research Ltd.
  
      Author: Petr Danecek <pd3@sanger.ac.uk>
  
@@ -637,7 +637,11 @@ int main_plugin(int argc, char *argv[])
              if ( !pass ) continue;
          }
          line = args->plugin.process(line);
-        if ( line ) bcf_write1(args->out_fh, args->hdr_out, line);
+        if ( line )
+        {
+            if ( line->errcode ) error("[E::main_plugin] Unchecked error (%d), exiting\n",line->errcode);
+            bcf_write1(args->out_fh, args->hdr_out, line);
+        }
      }
      destroy_data(args);
      bcf_sr_destroy(args->files);
diff --git a/bcftools/vcfplugin.c.pysam.c b/bcftools/vcfplugin.c.pysam.c

index f8e393b5092e2626c7d9eacec0897c22ae17b154..606e245838e4d10f425cff6a761d08bbee898e50 100644 (file)
--- a/bcftools/vcfplugin.c.pysam.c
+++ b/bcftools/vcfplugin.c.pysam.c
@@ -2,7 +2,7 @@
  
  /*  vcfplugin.c -- plugin modules for operating on VCF/BCF files.
  
-    Copyright (C) 2013-2016 Genome Research Ltd.
+    Copyright (C) 2013-2017 Genome Research Ltd.
  
      Author: Petr Danecek <pd3@sanger.ac.uk>
  
@@ -639,7 +639,11 @@ int main_plugin(int argc, char *argv[])
              if ( !pass ) continue;
          }
          line = args->plugin.process(line);
-        if ( line ) bcf_write1(args->out_fh, args->hdr_out, line);
+        if ( line )
+        {
+            if ( line->errcode ) error("[E::main_plugin] Unchecked error (%d), exiting\n",line->errcode);
+            bcf_write1(args->out_fh, args->hdr_out, line);
+        }
      }
      destroy_data(args);
      bcf_sr_destroy(args->files);
diff --git a/bcftools/vcfquery.c b/bcftools/vcfquery.c

index 04554f8d721a403565b703e8a0dac104fc3867c5..29c4a9df2c9558813df44273502f66679580c950 100644 (file)
--- a/bcftools/vcfquery.c
+++ b/bcftools/vcfquery.c
@@ -1,6 +1,6 @@
  /*  vcfquery.c -- Extracts fields from VCF/BCF file.
  
-    Copyright (C) 2013-2014 Genome Research Ltd.
+    Copyright (C) 2013-2017 Genome Research Ltd.
  
      Author: Petr Danecek <pd3@sanger.ac.uk>
  
@@ -48,6 +48,7 @@ typedef struct
      filter_t *filter;
      char *filter_str;
      int filter_logic;   // include or exclude sites which match the filters? One of FLT_INCLUDE/FLT_EXCLUDE
+    uint8_t *smpl_pass;
      convert_t *convert;
      bcf_srs_t *files;
      bcf_hdr_t *header;
@@ -99,6 +100,7 @@ static void init_data(args_t *args)
          }
      }
      args->convert = convert_init(args->header, samples, nsamples, args->format_str);
+    convert_set_option(args->convert, subset_samples, &args->smpl_pass);
      if ( args->allow_undef_tags ) convert_set_option(args->convert, allow_undef_tags, 1);
      free(samples);
  
@@ -129,6 +131,7 @@ static void query_vcf(args_t *args)
          fwrite(str.s, str.l, 1, args->out);
      }
  
+    int i,max_convert_unpack = convert_max_unpack(args->convert);
      while ( bcf_sr_next_line(args->files) )
      {
          if ( !bcf_sr_has_line(args->files,0) ) continue;
@@ -137,9 +140,30 @@ static void query_vcf(args_t *args)
  
          if ( args->filter )
          {
-            int pass = filter_test(args->filter, line, NULL);
-            if ( args->filter_logic & FLT_EXCLUDE ) pass = pass ? 0 : 1;
-            if ( !pass ) continue;
+            int pass = filter_test(args->filter, line, (const uint8_t**) &args->smpl_pass);
+            if ( args->filter_logic & FLT_EXCLUDE )
+            {
+                // This code addresses this problem:
+                //  -i can include a site but exclude a sample
+                //  -e exclude a site but include a sample
+
+                if ( pass )
+                {
+                    if ( !args->smpl_pass ) continue;
+                    if ( !(max_convert_unpack & BCF_UN_FMT) ) continue;
+
+                    pass = 0;
+                    for (i=0; i<line->n_sample; i++)
+                    {
+                        if ( args->smpl_pass[i] ) args->smpl_pass[i] = 0;
+                        else { args->smpl_pass[i] = 1; pass = 1; }
+                    }
+                    if ( !pass ) continue;
+                }
+                else if ( args->smpl_pass )
+                    for (i=0; i<line->n_sample; i++) args->smpl_pass[i] = 1;
+            }
+            else if ( !pass ) continue;
          }
  
          str.l = 0;
@@ -198,7 +222,6 @@ static void usage(void)
      fprintf(stderr, "Usage:   bcftools query [options] <A.vcf.gz> [<B.vcf.gz> [...]]\n");
      fprintf(stderr, "\n");
      fprintf(stderr, "Options:\n");
-    fprintf(stderr, "    -c, --collapse <string>           collapse lines with duplicate positions for <snps|indels|both|all|some|none>, see man page [none]\n");
      fprintf(stderr, "    -e, --exclude <expr>              exclude sites for which the expression is true (see man page for details)\n");
      fprintf(stderr, "    -f, --format <string>             see man page for details\n");
      fprintf(stderr, "    -H, --print-header                print header\n");
@@ -254,14 +277,8 @@ int main_vcfquery(int argc, char *argv[])
              case 'f': args->format_str = strdup(optarg); break;
              case 'H': args->print_header = 1; break;
              case 'v': args->vcf_list = optarg; break;
-            case 'c':
-                if ( !strcmp(optarg,"snps") ) collapse |= COLLAPSE_SNPS;
-                else if ( !strcmp(optarg,"indels") ) collapse |= COLLAPSE_INDELS;
-                else if ( !strcmp(optarg,"both") ) collapse |= COLLAPSE_SNPS | COLLAPSE_INDELS;
-                else if ( !strcmp(optarg,"any") ) collapse |= COLLAPSE_ANY;
-                else if ( !strcmp(optarg,"all") ) collapse |= COLLAPSE_ANY;
-                else if ( !strcmp(optarg,"some") ) collapse |= COLLAPSE_SOME;
-                else error("The --collapse string \"%s\" not recognised.\n", optarg);
+            case 'c': 
+                error("The --collapse option is obsolete, pipe through `bcftools norm -c` instead.\n", optarg);
                  break;
              case 'a':
                  {
@@ -322,7 +339,6 @@ int main_vcfquery(int argc, char *argv[])
      {
          if ( !fname ) usage();
          args->files = bcf_sr_init();
-        args->files->collapse = collapse;
          if ( optind+1 < argc ) args->files->require_index = 1;
          if ( args->regions_list && bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 )
              error("Failed to read the regions: %s\n", args->regions_list);
diff --git a/bcftools/vcfquery.c.pysam.c b/bcftools/vcfquery.c.pysam.c

index e9100e6c56551039bd64b8589f5b63d42bfeb5db..8186a59d970ccec91992ca307c830ffd662b55cd 100644 (file)
--- a/bcftools/vcfquery.c.pysam.c
+++ b/bcftools/vcfquery.c.pysam.c
@@ -2,7 +2,7 @@
  
  /*  vcfquery.c -- Extracts fields from VCF/BCF file.
  
-    Copyright (C) 2013-2014 Genome Research Ltd.
+    Copyright (C) 2013-2017 Genome Research Ltd.
  
      Author: Petr Danecek <pd3@sanger.ac.uk>
  
@@ -50,6 +50,7 @@ typedef struct
      filter_t *filter;
      char *filter_str;
      int filter_logic;   // include or exclude sites which match the filters? One of FLT_INCLUDE/FLT_EXCLUDE
+    uint8_t *smpl_pass;
      convert_t *convert;
      bcf_srs_t *files;
      bcf_hdr_t *header;
@@ -101,6 +102,7 @@ static void init_data(args_t *args)
          }
      }
      args->convert = convert_init(args->header, samples, nsamples, args->format_str);
+    convert_set_option(args->convert, subset_samples, &args->smpl_pass);
      if ( args->allow_undef_tags ) convert_set_option(args->convert, allow_undef_tags, 1);
      free(samples);
  
@@ -131,6 +133,7 @@ static void query_vcf(args_t *args)
          fwrite(str.s, str.l, 1, args->out);
      }
  
+    int i,max_convert_unpack = convert_max_unpack(args->convert);
      while ( bcf_sr_next_line(args->files) )
      {
          if ( !bcf_sr_has_line(args->files,0) ) continue;
@@ -139,9 +142,30 @@ static void query_vcf(args_t *args)
  
          if ( args->filter )
          {
-            int pass = filter_test(args->filter, line, NULL);
-            if ( args->filter_logic & FLT_EXCLUDE ) pass = pass ? 0 : 1;
-            if ( !pass ) continue;
+            int pass = filter_test(args->filter, line, (const uint8_t**) &args->smpl_pass);
+            if ( args->filter_logic & FLT_EXCLUDE )
+            {
+                // This code addresses this problem:
+                //  -i can include a site but exclude a sample
+                //  -e exclude a site but include a sample
+
+                if ( pass )
+                {
+                    if ( !args->smpl_pass ) continue;
+                    if ( !(max_convert_unpack & BCF_UN_FMT) ) continue;
+
+                    pass = 0;
+                    for (i=0; i<line->n_sample; i++)
+                    {
+                        if ( args->smpl_pass[i] ) args->smpl_pass[i] = 0;
+                        else { args->smpl_pass[i] = 1; pass = 1; }
+                    }
+                    if ( !pass ) continue;
+                }
+                else if ( args->smpl_pass )
+                    for (i=0; i<line->n_sample; i++) args->smpl_pass[i] = 1;
+            }
+            else if ( !pass ) continue;
          }
  
          str.l = 0;
@@ -200,7 +224,6 @@ static void usage(void)
      fprintf(bcftools_stderr, "Usage:   bcftools query [options] <A.vcf.gz> [<B.vcf.gz> [...]]\n");
      fprintf(bcftools_stderr, "\n");
      fprintf(bcftools_stderr, "Options:\n");
-    fprintf(bcftools_stderr, "    -c, --collapse <string>           collapse lines with duplicate positions for <snps|indels|both|all|some|none>, see man page [none]\n");
      fprintf(bcftools_stderr, "    -e, --exclude <expr>              exclude sites for which the expression is true (see man page for details)\n");
      fprintf(bcftools_stderr, "    -f, --format <string>             see man page for details\n");
      fprintf(bcftools_stderr, "    -H, --print-header                print header\n");
@@ -256,14 +279,8 @@ int main_vcfquery(int argc, char *argv[])
              case 'f': args->format_str = strdup(optarg); break;
              case 'H': args->print_header = 1; break;
              case 'v': args->vcf_list = optarg; break;
-            case 'c':
-                if ( !strcmp(optarg,"snps") ) collapse |= COLLAPSE_SNPS;
-                else if ( !strcmp(optarg,"indels") ) collapse |= COLLAPSE_INDELS;
-                else if ( !strcmp(optarg,"both") ) collapse |= COLLAPSE_SNPS | COLLAPSE_INDELS;
-                else if ( !strcmp(optarg,"any") ) collapse |= COLLAPSE_ANY;
-                else if ( !strcmp(optarg,"all") ) collapse |= COLLAPSE_ANY;
-                else if ( !strcmp(optarg,"some") ) collapse |= COLLAPSE_SOME;
-                else error("The --collapse string \"%s\" not recognised.\n", optarg);
+            case 'c': 
+                error("The --collapse option is obsolete, pipe through `bcftools norm -c` instead.\n", optarg);
                  break;
              case 'a':
                  {
@@ -324,7 +341,6 @@ int main_vcfquery(int argc, char *argv[])
      {
          if ( !fname ) usage();
          args->files = bcf_sr_init();
-        args->files->collapse = collapse;
          if ( optind+1 < argc ) args->files->require_index = 1;
          if ( args->regions_list && bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 )
              error("Failed to read the regions: %s\n", args->regions_list);
diff --git a/bcftools/vcfroh.c b/bcftools/vcfroh.c

index 8c1d055048c260fc337c1082ffe9fd1b4c95f1ee..626f97538c2ec651c0a4b8623fa7a7a56ce746cd 100644 (file)
--- a/bcftools/vcfroh.c
+++ b/bcftools/vcfroh.c
@@ -1,6 +1,6 @@
  /*  vcfroh.c -- HMM model for detecting runs of autozygosity.
  
-    Copyright (C) 2013-2015 Genome Research Ltd.
+    Copyright (C) 2013-2017 Genome Research Ltd.
  
      Author: Petr Danecek <pd3@sanger.ac.uk>
  
@@ -526,6 +526,8 @@ static void flush_viterbi(args_t *args, int ismpl)
                          smpl->rg.state = 1;
                          smpl->rg.beg = smpl->sites[i];
                          smpl->rg.rid = args->prev_rid;
+                        smpl->rg.qual  = qual;
+                        smpl->rg.nqual = 1;
                      }
                  }
                  else if ( state )
diff --git a/bcftools/vcfroh.c.pysam.c b/bcftools/vcfroh.c.pysam.c

index b303a40e3e46bc8d231e33e040df92636d228d64..77d2f4fdd801f751b118f8ffa68ec121cd7993b4 100644 (file)
--- a/bcftools/vcfroh.c.pysam.c
+++ b/bcftools/vcfroh.c.pysam.c
@@ -2,7 +2,7 @@
  
  /*  vcfroh.c -- HMM model for detecting runs of autozygosity.
  
-    Copyright (C) 2013-2015 Genome Research Ltd.
+    Copyright (C) 2013-2017 Genome Research Ltd.
  
      Author: Petr Danecek <pd3@sanger.ac.uk>
  
@@ -528,6 +528,8 @@ static void flush_viterbi(args_t *args, int ismpl)
                          smpl->rg.state = 1;
                          smpl->rg.beg = smpl->sites[i];
                          smpl->rg.rid = args->prev_rid;
+                        smpl->rg.qual  = qual;
+                        smpl->rg.nqual = 1;
                      }
                  }
                  else if ( state )
diff --git a/bcftools/vcfsom.c b/bcftools/vcfsom.c

index 03181e9b2c794725987caeed3fc4343ce24fb2f1..434dc16115f7bebade90fefef85ad5b453aca885 100644 (file)
--- a/bcftools/vcfsom.c
+++ b/bcftools/vcfsom.c
@@ -453,7 +453,7 @@ static void create_eval_plot(args_t *args)
              "import csv\n"
              "csv.register_dialect('tab', delimiter='\\t', quoting=csv.QUOTE_NONE)\n"
              "dat = []\n"
-            "with open('%s.eval', 'rb') as f:\n"
+            "with open('%s.eval', 'r') as f:\n"
              "\treader = csv.reader(f, 'tab')\n"
              "\tfor row in reader:\n"
              "\t\tif row[0][0]!='#': dat.append(row)\n"
diff --git a/bcftools/vcfsom.c.pysam.c b/bcftools/vcfsom.c.pysam.c

index d806c019c95ad4fe000caede5e958700caf651bb..092d37a580c95414c9499bfdc5faf975ab048dfe 100644 (file)
--- a/bcftools/vcfsom.c.pysam.c
+++ b/bcftools/vcfsom.c.pysam.c
@@ -455,7 +455,7 @@ static void create_eval_plot(args_t *args)
              "import csv\n"
              "csv.register_dialect('tab', delimiter='\\t', quoting=csv.QUOTE_NONE)\n"
              "dat = []\n"
-            "with open('%s.eval', 'rb') as f:\n"
+            "with open('%s.eval', 'r') as f:\n"
              "\treader = csv.reader(f, 'tab')\n"
              "\tfor row in reader:\n"
              "\t\tif row[0][0]!='#': dat.append(row)\n"
diff --git a/bcftools/vcfstats.c b/bcftools/vcfstats.c

index 3b73173f7d682818761cdbf8522daefa9b486724..4f7765e39fdb4078badd881ffc1dd7de3517ded0 100644 (file)
--- a/bcftools/vcfstats.c
+++ b/bcftools/vcfstats.c
@@ -1,6 +1,6 @@
  /*  vcfstats.c -- Produces stats which can be plotted using plot-vcfstats.
  
-    Copyright (C) 2012-2016 Genome Research Ltd.
+    Copyright (C) 2012-2017 Genome Research Ltd.
  
      Author: Petr Danecek <pd3@sanger.ac.uk>
  
@@ -87,7 +87,7 @@ typedef struct
      int in_frame, out_frame, na_frame, in_frame_alt1, out_frame_alt1, na_frame_alt1;
      int subst[15];
      int *smpl_hets, *smpl_homRR, *smpl_homAA, *smpl_ts, *smpl_tv, *smpl_indels, *smpl_ndp, *smpl_sngl;
-    int *smpl_hapRef, *smpl_hapAlt;
+    int *smpl_hapRef, *smpl_hapAlt, *smpl_missing;
      int *smpl_indel_hets, *smpl_indel_homs;
      int *smpl_frm_shifts; // not-applicable, in-frame, out-frame
      unsigned long int *smpl_dp;
@@ -305,7 +305,7 @@ int indel_ctx_type(indel_ctx_t *ctx, char *chr, int pos, char *ref, char *alt, i
  
      // Sanity check: the reference sequence must match the REF allele
      for (i=0; i<fai_ref_len && i<ref_len; i++)
-        if ( ref[i] != fai_ref[i] && ref[i] - 32 != fai_ref[i] )
+        if ( ref[i] != fai_ref[i] && ref[i] - 32 != fai_ref[i] && !iupac_consistent(fai_ref[i], ref[i]) )
              error("\nSanity check failed, the reference sequence differs: %s:%d+%d .. %c vs %c\n", chr, pos, i, ref[i],fai_ref[i]);
  
      // Count occurrences of all possible kmers
@@ -470,6 +470,7 @@ static void init_stats(args_t *args)
          #endif
          if ( args->files->n_smpl )
          {
+            stats->smpl_missing = (int *) calloc(args->files->n_smpl,sizeof(int));
              stats->smpl_hets   = (int *) calloc(args->files->n_smpl,sizeof(int));
              stats->smpl_homAA  = (int *) calloc(args->files->n_smpl,sizeof(int));
              stats->smpl_homRR  = (int *) calloc(args->files->n_smpl,sizeof(int));
@@ -551,6 +552,7 @@ static void destroy_stats(args_t *args)
          #endif
          free(stats->insertions);
          free(stats->deletions);
+        free(stats->smpl_missing);
          free(stats->smpl_hets);
          free(stats->smpl_homAA);
          free(stats->smpl_homRR);
@@ -858,7 +860,11 @@ static void do_sample_stats(args_t *args, stats_t *stats, bcf_sr_t *reader, int
          {
              int ial, jal;
              int gt = bcf_gt_type(fmt_ptr, reader->samples[is], &ial, &jal);
-            if ( gt==GT_UNKN ) continue;
+            if ( gt==GT_UNKN )
+            {
+                stats->smpl_missing[is]++;
+                continue;
+            }
              if ( gt==GT_HAPL_R || gt==GT_HAPL_A )
              {
                  if ( line_type&VCF_INDEL && stats->smpl_frm_shifts )
@@ -1495,18 +1501,19 @@ static void print_stats(args_t *args)
  
      if ( args->files->n_smpl )
      {
-        printf("# PSC, Per-sample counts. Note that the ref/het/hom counts include only SNPs, for indels see PSI. Haploid counts include both SNPs and indels.\n");
+        printf("# PSC, Per-sample counts. Note that the ref/het/hom counts include only SNPs, for indels see PSI. The rest include both SNPs and indels.\n");
          printf("# PSC\t[2]id\t[3]sample\t[4]nRefHom\t[5]nNonRefHom\t[6]nHets\t[7]nTransitions\t[8]nTransversions\t[9]nIndels\t[10]average depth\t[11]nSingletons"
-            "\t[12]nHapRef\t[13]nHapAlt\n");
+            "\t[12]nHapRef\t[13]nHapAlt\t[14]nMissing\n");
          for (id=0; id<args->nstats; id++)
          {
              stats_t *stats = &args->stats[id];
              for (i=0; i<args->files->n_smpl; i++)
              {
                  float dp = stats->smpl_ndp[i] ? stats->smpl_dp[i]/(float)stats->smpl_ndp[i] : 0;
-                printf("PSC\t%d\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%.1f\t%d\t%d\t%d\n", id,args->files->samples[i],
+                printf("PSC\t%d\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%.1f\t%d\t%d\t%d\t%d\n", id,args->files->samples[i],
                      stats->smpl_homRR[i], stats->smpl_homAA[i], stats->smpl_hets[i], stats->smpl_ts[i],
-                    stats->smpl_tv[i], stats->smpl_indels[i],dp, stats->smpl_sngl[i], stats->smpl_hapRef[i], stats->smpl_hapAlt[i]);
+                    stats->smpl_tv[i], stats->smpl_indels[i],dp, stats->smpl_sngl[i], stats->smpl_hapRef[i],
+                    stats->smpl_hapAlt[i], stats->smpl_missing[i]);
              }
          }
  
diff --git a/bcftools/vcfstats.c.pysam.c b/bcftools/vcfstats.c.pysam.c

index 875dd6ac26e10ffa90a55f38a7ec02349bf13d0f..46a051f63f91a19df6f4e918d371be65dab0f58c 100644 (file)
--- a/bcftools/vcfstats.c.pysam.c
+++ b/bcftools/vcfstats.c.pysam.c
@@ -2,7 +2,7 @@
  
  /*  vcfstats.c -- Produces stats which can be plotted using plot-vcfstats.
  
-    Copyright (C) 2012-2016 Genome Research Ltd.
+    Copyright (C) 2012-2017 Genome Research Ltd.
  
      Author: Petr Danecek <pd3@sanger.ac.uk>
  
@@ -89,7 +89,7 @@ typedef struct
      int in_frame, out_frame, na_frame, in_frame_alt1, out_frame_alt1, na_frame_alt1;
      int subst[15];
      int *smpl_hets, *smpl_homRR, *smpl_homAA, *smpl_ts, *smpl_tv, *smpl_indels, *smpl_ndp, *smpl_sngl;
-    int *smpl_hapRef, *smpl_hapAlt;
+    int *smpl_hapRef, *smpl_hapAlt, *smpl_missing;
      int *smpl_indel_hets, *smpl_indel_homs;
      int *smpl_frm_shifts; // not-applicable, in-frame, out-frame
      unsigned long int *smpl_dp;
@@ -307,7 +307,7 @@ int indel_ctx_type(indel_ctx_t *ctx, char *chr, int pos, char *ref, char *alt, i
  
      // Sanity check: the reference sequence must match the REF allele
      for (i=0; i<fai_ref_len && i<ref_len; i++)
-        if ( ref[i] != fai_ref[i] && ref[i] - 32 != fai_ref[i] )
+        if ( ref[i] != fai_ref[i] && ref[i] - 32 != fai_ref[i] && !iupac_consistent(fai_ref[i], ref[i]) )
              error("\nSanity check failed, the reference sequence differs: %s:%d+%d .. %c vs %c\n", chr, pos, i, ref[i],fai_ref[i]);
  
      // Count occurrences of all possible kmers
@@ -472,6 +472,7 @@ static void init_stats(args_t *args)
          #endif
          if ( args->files->n_smpl )
          {
+            stats->smpl_missing = (int *) calloc(args->files->n_smpl,sizeof(int));
              stats->smpl_hets   = (int *) calloc(args->files->n_smpl,sizeof(int));
              stats->smpl_homAA  = (int *) calloc(args->files->n_smpl,sizeof(int));
              stats->smpl_homRR  = (int *) calloc(args->files->n_smpl,sizeof(int));
@@ -553,6 +554,7 @@ static void destroy_stats(args_t *args)
          #endif
          free(stats->insertions);
          free(stats->deletions);
+        free(stats->smpl_missing);
          free(stats->smpl_hets);
          free(stats->smpl_homAA);
          free(stats->smpl_homRR);
@@ -860,7 +862,11 @@ static void do_sample_stats(args_t *args, stats_t *stats, bcf_sr_t *reader, int
          {
              int ial, jal;
              int gt = bcf_gt_type(fmt_ptr, reader->samples[is], &ial, &jal);
-            if ( gt==GT_UNKN ) continue;
+            if ( gt==GT_UNKN )
+            {
+                stats->smpl_missing[is]++;
+                continue;
+            }
              if ( gt==GT_HAPL_R || gt==GT_HAPL_A )
              {
                  if ( line_type&VCF_INDEL && stats->smpl_frm_shifts )
@@ -1497,18 +1503,19 @@ static void print_stats(args_t *args)
  
      if ( args->files->n_smpl )
      {
-        fprintf(bcftools_stdout, "# PSC, Per-sample counts. Note that the ref/het/hom counts include only SNPs, for indels see PSI. Haploid counts include both SNPs and indels.\n");
+        fprintf(bcftools_stdout, "# PSC, Per-sample counts. Note that the ref/het/hom counts include only SNPs, for indels see PSI. The rest include both SNPs and indels.\n");
          fprintf(bcftools_stdout, "# PSC\t[2]id\t[3]sample\t[4]nRefHom\t[5]nNonRefHom\t[6]nHets\t[7]nTransitions\t[8]nTransversions\t[9]nIndels\t[10]average depth\t[11]nSingletons"
-            "\t[12]nHapRef\t[13]nHapAlt\n");
+            "\t[12]nHapRef\t[13]nHapAlt\t[14]nMissing\n");
          for (id=0; id<args->nstats; id++)
          {
              stats_t *stats = &args->stats[id];
              for (i=0; i<args->files->n_smpl; i++)
              {
                  float dp = stats->smpl_ndp[i] ? stats->smpl_dp[i]/(float)stats->smpl_ndp[i] : 0;
-                fprintf(bcftools_stdout, "PSC\t%d\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%.1f\t%d\t%d\t%d\n", id,args->files->samples[i],
+                fprintf(bcftools_stdout, "PSC\t%d\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%.1f\t%d\t%d\t%d\t%d\n", id,args->files->samples[i],
                      stats->smpl_homRR[i], stats->smpl_homAA[i], stats->smpl_hets[i], stats->smpl_ts[i],
-                    stats->smpl_tv[i], stats->smpl_indels[i],dp, stats->smpl_sngl[i], stats->smpl_hapRef[i], stats->smpl_hapAlt[i]);
+                    stats->smpl_tv[i], stats->smpl_indels[i],dp, stats->smpl_sngl[i], stats->smpl_hapRef[i],
+                    stats->smpl_hapAlt[i], stats->smpl_missing[i]);
              }
          }
  
diff --git a/bcftools/vcfview.c b/bcftools/vcfview.c

index 645cc8a4c4414cc8e08066e0b9d3b55b1bdcb528..5d523f164c5571da709cc9053bb330efbfb542f0 100644 (file)
--- a/bcftools/vcfview.c
+++ b/bcftools/vcfview.c
@@ -1,6 +1,6 @@
  /*  vcfview.c -- VCF/BCF conversion, view, subset and filter VCF/BCF files.
  
-    Copyright (C) 2013-2014 Genome Research Ltd.
+    Copyright (C) 2013-2018 Genome Research Ltd.
  
      Author: Shane McCarthy <sm15@sanger.ac.uk>
  
@@ -342,11 +342,14 @@ int subset_vcf(args_t *args, bcf1_t *line)
              an += args->ac[i];
      }
  
+    int update_ac = args->calc_ac;
      if (args->n_samples)
      {
          int non_ref_ac_sub = 0, *ac_sub = (int*) calloc(line->n_allele,sizeof(int));
          bcf_subset(args->hdr, line, args->n_samples, args->imap);
-        if (args->calc_ac) {
+        if ( args->calc_ac && !bcf_get_fmt(args->hdr,line,"GT") ) update_ac = 0;
+        if ( update_ac )
+        {
              bcf_calc_ac(args->hsub, line, ac_sub, BCF_UN_FMT); // recalculate AC and AN
              an = 0;
              for (i=0; i<line->n_allele; i++) {
@@ -442,7 +445,7 @@ int subset_vcf(args_t *args, bcf1_t *line)
          if (args->uncalled == FLT_INCLUDE && an > 0) return 0; // select uncalled
          if (args->uncalled == FLT_EXCLUDE && an == 0) return 0; // skip if uncalled
      }
-    if (args->calc_ac && args->update_info) {
+    if (update_ac && args->update_info) {
          bcf_update_info_int32(args->hdr, line, "AC", &args->ac[1], line->n_allele-1);
          bcf_update_info_int32(args->hdr, line, "AN", &an, 1);
      }
diff --git a/bcftools/vcfview.c.pysam.c b/bcftools/vcfview.c.pysam.c

index 4fbe35a4b7e2329a3596c7d0977b9f13e59a9542..4de3ac4529566ecf396f4376c8850c18514f81b4 100644 (file)
--- a/bcftools/vcfview.c.pysam.c
+++ b/bcftools/vcfview.c.pysam.c
@@ -2,7 +2,7 @@
  
  /*  vcfview.c -- VCF/BCF conversion, view, subset and filter VCF/BCF files.
  
-    Copyright (C) 2013-2014 Genome Research Ltd.
+    Copyright (C) 2013-2018 Genome Research Ltd.
  
      Author: Shane McCarthy <sm15@sanger.ac.uk>
  
@@ -344,11 +344,14 @@ int subset_vcf(args_t *args, bcf1_t *line)
              an += args->ac[i];
      }
  
+    int update_ac = args->calc_ac;
      if (args->n_samples)
      {
          int non_ref_ac_sub = 0, *ac_sub = (int*) calloc(line->n_allele,sizeof(int));
          bcf_subset(args->hdr, line, args->n_samples, args->imap);
-        if (args->calc_ac) {
+        if ( args->calc_ac && !bcf_get_fmt(args->hdr,line,"GT") ) update_ac = 0;
+        if ( update_ac )
+        {
              bcf_calc_ac(args->hsub, line, ac_sub, BCF_UN_FMT); // recalculate AC and AN
              an = 0;
              for (i=0; i<line->n_allele; i++) {
@@ -444,7 +447,7 @@ int subset_vcf(args_t *args, bcf1_t *line)
          if (args->uncalled == FLT_INCLUDE && an > 0) return 0; // select uncalled
          if (args->uncalled == FLT_EXCLUDE && an == 0) return 0; // skip if uncalled
      }
-    if (args->calc_ac && args->update_info) {
+    if (update_ac && args->update_info) {
          bcf_update_info_int32(args->hdr, line, "AC", &args->ac[1], line->n_allele-1);
          bcf_update_info_int32(args->hdr, line, "AN", &an, 1);
      }
diff --git a/bcftools/version.h b/bcftools/version.h

index eb2074cfde9a03e6c757ea06d40492548bf5844b..29d20170f95e8477bdfec44e6e5683447bc2725d 100644 (file)
--- a/bcftools/version.h
+++ b/bcftools/version.h
@@ -1 +1 @@
-#define BCFTOOLS_VERSION "1.6"
+#define BCFTOOLS_VERSION "1.7"
diff --git a/doc/release.rst b/doc/release.rst

index d0ece25619911ca44d236b6119265a7f63741e29..1a255b1409b04fead2d9caae1cdba5f521946b8f 100644 (file)
--- a/doc/release.rst
+++ b/doc/release.rst
@@ -2,6 +2,19 @@
  Release notes
  =============
  
+Release 0.14.1
+==============
+
+This is mostly a bugfix release, though bcftools has now also been
+upgraded to 1.7.0.
+
+* [#621] Add a warning to count_coverage when an alignment has an
+  empty QUAL field
+* [#635] Speed-up of AlignedSegment.find_intro()
+* treat border case of all bases in pileup column below quality score
+* [#634] Fix access to pileup reference_sequence
+
+
  Release 0.14.0
  ==============
  
diff --git a/pysam/libcalignedsegment.pyx b/pysam/libcalignedsegment.pyx

index e94db54c245429318b11267729e252860e167f98..edb5eaa224310eb75bcb15af53cab53354b5f291 100644 (file)
--- a/pysam/libcalignedsegment.pyx
+++ b/pysam/libcalignedsegment.pyx
@@ -2804,8 +2804,8 @@ cdef class PileupColumn:
      property reference_name:
          """:term:`reference` name (None if no AlignmentFile is associated)"""
          def __get__(self):
-            if self._alignment_file is not None:
-                return self._alignment_file.getrname(self.tid)
+            if self.header is not None:
+                return self.header.get_reference_name(self.tid)
              return None
  
      property nsegments:
@@ -2966,7 +2966,6 @@ cdef class PileupColumn:
                      buf[n] = p.b.core.qual + 33
                  n += 1
                  assert n < MAX_PILEUP_BUFFER_SIZE
-                
              if not p.is_del:
                  if p.qpos < p.b.core.l_qseq:
                      cc = <uint8_t>seq_nt16_str[bam_seqi(bam_get_seq(p.b), p.qpos)]
@@ -3032,9 +3031,13 @@ cdef class PileupColumn:
              n += 1
              assert n < MAX_PILEUP_BUFFER_SIZE
  
-        # quicker to ensemble all and split than to encode all separately.
-        # ignore last ":"
-        return force_str(PyBytes_FromStringAndSize(<char*>buf, n-1)).split(":")
+        if n == 0:
+            # could be zero if all qualities are too low
+            return ""
+        else:
+            # quicker to ensemble all and split than to encode all separately.
+            # ignore last ":"
+            return force_str(PyBytes_FromStringAndSize(<char*>buf, n-1)).split(":")
  
      def get_query_qualities(self):
          """query base quality scores at pileup column position.
diff --git a/pysam/libcalignmentfile.pyx b/pysam/libcalignmentfile.pyx

index 439cc550153243d3697101a30cc60ebeadae636b..739013dd85c5e053adac02265561558bc863e5e3 100644 (file)
--- a/pysam/libcalignmentfile.pyx
+++ b/pysam/libcalignmentfile.pyx
@@ -1,3 +1,4 @@
+
  # cython: embedsignature=True
  # cython: profile=True
  ########################################################
@@ -1508,7 +1509,6 @@ cdef class AlignmentFile(HTSFile):
  
          """
  
-        
          cdef uint32_t contig_length = self.get_reference_length(contig)
          cdef int _start = start if start is not None else 0
          cdef int _stop = stop if stop is not None else contig_length
@@ -1537,12 +1537,14 @@ cdef class AlignmentFile(HTSFile):
          cdef int refpos
          cdef int c = 0
          cdef int filter_method = 0
+
+
          if read_callback == "all":
              filter_method = 1
          elif read_callback == "nofilter":
              filter_method = 2
  
-        cdef int _threshold = quality_threshold
+        cdef int _threshold = quality_threshold or 0
          for read in self.fetch(contig=contig,
                                 reference=reference,
                                 start=start,
@@ -1564,10 +1566,13 @@ cdef class AlignmentFile(HTSFile):
              # count
              seq = read.seq
              quality = read.query_qualities
+            
              for qpos, refpos in read.get_aligned_pairs(True):
                  if qpos is not None and refpos is not None and \
                     _start <= refpos < _stop:
-                    if quality[qpos] >= quality_threshold:
+
+                    # only check base quality if _threshold > 0
+                    if (_threshold and quality and quality[qpos] >= _threshold) or not _threshold:
                          if seq[qpos] == 'A':
                              count_a.data.as_ulongs[refpos - _start] += 1
                          if seq[qpos] == 'C':
@@ -1579,7 +1584,7 @@ cdef class AlignmentFile(HTSFile):
  
          return count_a, count_c, count_g, count_t
  
-    def find_introns(self, read_iterator):
+    def find_introns_slow(self, read_iterator):
          """Return a dictionary {(start, stop): count}
          Listing the intronic sites in the reads (identified by 'N' in the cigar strings),
          and their support ( = number of reads ).
@@ -1604,6 +1609,38 @@ cdef class AlignmentFile(HTSFile):
                      last_read_pos = read_loc
          return res
  
+    def find_introns(self, read_iterator):
+        """Return a dictionary {(start, stop): count}
+        Listing the intronic sites in the reads (identified by 'N' in the cigar strings),
+        and their support ( = number of reads ).
+
+        read_iterator can be the result of a .fetch(...) call.
+        Or it can be a generator filtering such reads. Example
+        samfile.find_introns((read for read in samfile.fetch(...) if read.is_reverse)
+        """
+        cdef:
+            uint32_t base_position, junc_start, nt
+            int op
+            AlignedSegment r
+            int BAM_CREF_SKIP = 3 #BAM_CREF_SKIP
+
+        import collections
+        res = collections.Counter()
+
+        match_or_deletion = {0, 2, 7, 8} # only M/=/X (0/7/8) and D (2) are related to genome position
+        for r in read_iterator:
+            base_position = r.pos
+
+            for op, nt in r.cigartuples:
+                if op in match_or_deletion: 
+                    base_position += nt
+                elif op == BAM_CREF_SKIP: 
+                    junc_start = base_position
+                    base_position += nt
+                    res[(junc_start, base_position)] += 1
+        return res
+ 
+
      def close(self):
          '''closes the :class:`pysam.AlignmentFile`.'''
  
diff --git a/pysam/version.py b/pysam/version.py

index 43da562db691f7815f7f962a43cb960c9961360a..8a68bcc6704d0e20f01346b9fccdd70ea2635f5f 100644 (file)
--- a/pysam/version.py
+++ b/pysam/version.py
@@ -1,5 +1,5 @@
  # pysam versioning information
-__version__ = "0.14"
+__version__ = "0.14.1"
  
  # TODO: upgrade number
  __samtools_version__ = "1.7"
diff --git a/tests/AlignmentFileHeader_test.py b/tests/AlignmentFileHeader_test.py

index 1cdbb69591c5804b388bd9f3ba59d816c27553b3..1cf733e0eff2e980405e46ef1f3201379aeddea1 100644 (file)
--- a/tests/AlignmentFileHeader_test.py
+++ b/tests/AlignmentFileHeader_test.py
@@ -7,28 +7,19 @@ and data files located there.
  
  import unittest
  import os
-import shutil
  import sys
  import re
  import copy
-import collections
  from collections import OrderedDict as odict
-import subprocess
-import logging
-import array
+import pysam
+import pysam.samtools
+from TestUtils import get_temp_filename, BAM_DATADIR
+
  if sys.version_info.major >= 3:
      from io import StringIO
  else:
      from StringIO import StringIO
  
-from functools import partial
-
-import pysam
-import pysam.samtools
-from TestUtils import checkBinaryEqual, checkURL, \
-    check_samtools_view_equal, checkFieldEqual, force_str, \
-    get_temp_filename, BAM_DATADIR
-
  
  class TestHeaderConstruction(unittest.TestCase):
      """testing header construction."""
diff --git a/tests/AlignmentFilePileup_test.py b/tests/AlignmentFilePileup_test.py

index 1851da8153d5b50e42b3ffd22957c22699b5b601..32738fe713d0088eec10f2eccee0ea59f6b0e5ad 100644 (file)
--- a/tests/AlignmentFilePileup_test.py
+++ b/tests/AlignmentFilePileup_test.py
@@ -1,9 +1,9 @@
  """Benchmarking module for AlignmentFile functionality"""
  import os
-import re
+import pysam
  import unittest
  from TestUtils import BAM_DATADIR, IS_PYTHON3, force_str, flatten_nested_list
-from PileupTestUtils import *
+import PileupTestUtils
  
  
  class TestPileupReadSelection(unittest.TestCase):
@@ -162,6 +162,10 @@ class TestPileupObjects(unittest.TestCase):
                      pcolumn1.reference_id, 0,
                      "chromosome/target id mismatch in position 1: %s != %s" %
                      (pcolumn1.reference_id, 0))
+                self.assertEqual(
+                    pcolumn1.reference_name, "chr1",
+                    "chromosome mismatch in position 1: %s != %s" %
+                    (pcolumn1.reference_name, "chr1"))
                  self.assertEqual(
                      pcolumn1.reference_pos, 105 - 1,
                      "position mismatch in position 1: %s != %s" %
@@ -182,6 +186,10 @@ class TestPileupObjects(unittest.TestCase):
                      pcolumn2.reference_id, 1,
                      "chromosome/target id mismatch in position 1: %s != %s" %
                      (pcolumn2.reference_id, 1))
+                self.assertEqual(
+                    pcolumn2.reference_name, "chr2",
+                    "chromosome mismatch in position 1: %s != %s" %
+                    (pcolumn2.reference_name, "chr2"))
                  self.assertEqual(
                      pcolumn2.reference_pos, 1480 - 1,
                      "position mismatch in position 1: %s != %s" %
@@ -338,32 +346,32 @@ class PileUpColumnTests(unittest.TestCase):
      fn_fasta = os.path.join(BAM_DATADIR, "ex1.fa")
      
      def test_pileup_depths_are_equal(self):
-        samtools_result = build_depth_with_samtoolspipe(self.fn)
-        pysam_result = build_depth_with_filter_with_pysam(self.fn)
+        samtools_result = PileupTestUtils.build_depth_with_samtoolspipe(self.fn)
+        pysam_result = PileupTestUtils.build_depth_with_filter_with_pysam(self.fn)
          self.assertEqual(pysam_result, samtools_result)
  
      def test_pileup_query_bases_without_reference_are_equal(self):
-        samtools_result = build_query_bases_with_samtoolspipe(self.fn)
-        pysam_result = build_query_bases_with_pysam(self.fn)
+        samtools_result = PileupTestUtils.build_query_bases_with_samtoolspipe(self.fn)
+        pysam_result = PileupTestUtils.build_query_bases_with_pysam(self.fn)
          self.assertEqual(["".join(x) for x in pysam_result], samtools_result)
  
      def test_pileup_query_bases_with_reference_are_equal(self):
-        samtools_result = build_query_bases_with_samtoolspipe(self.fn, "-f", self.fn_fasta)
+        samtools_result = PileupTestUtils.build_query_bases_with_samtoolspipe(self.fn, "-f", self.fn_fasta)
          with pysam.FastaFile(self.fn_fasta) as fasta:
-            pysam_result = build_query_bases_with_pysam(self.fn, fastafile=fasta, stepper="samtools")
+            pysam_result = PileupTestUtils.build_query_bases_with_pysam(self.fn, fastafile=fasta, stepper="samtools")
          self.assertEqual(["".join(x) for x in pysam_result], samtools_result)
          
      def test_pileup_query_qualities_are_equal(self):
-        samtools_result = build_query_qualities_with_samtoolspipe(self.fn)
-        pysam_result = build_query_qualities_with_pysam(self.fn)
+        samtools_result = PileupTestUtils.build_query_qualities_with_samtoolspipe(self.fn)
+        pysam_result = PileupTestUtils.build_query_qualities_with_pysam(self.fn)
          pysam_result = [
              [chr(min(126, x + 33)) for x in l] for l in pysam_result]
          self.assertEqual("".join(flatten_nested_list(pysam_result)),
                           "".join(flatten_nested_list(samtools_result)))
  
      def test_pileup_mapping_qualities_are_equal(self):
-        samtools_result = build_mapping_qualities_with_samtoolspipe(self.fn)
-        pysam_result = build_mapping_qualities_with_pysam(self.fn)
+        samtools_result = PileupTestUtils.build_mapping_qualities_with_samtoolspipe(self.fn)
+        pysam_result = PileupTestUtils.build_mapping_qualities_with_pysam(self.fn)
          # convert to chars
          pysam_result = [
              [chr(min(126, x + 33)) for x in l] for l in pysam_result]
@@ -372,8 +380,8 @@ class PileUpColumnTests(unittest.TestCase):
                           "".join(flatten_nested_list(samtools_result)))
  
      def test_pileup_query_qualities_from_pileups_are_equal(self):
-        samtools_result = build_query_qualities_with_samtoolspipe(self.fn)
-        pysam_result = build_query_qualities_with_pysam_pileups(self.fn)
+        samtools_result = PileupTestUtils.build_query_qualities_with_samtoolspipe(self.fn)
+        pysam_result = PileupTestUtils.build_query_qualities_with_pysam_pileups(self.fn)
          pysam_result = [
              "".join([chr(min(126, x + 33)) for x in l]) for l in pysam_result]
  
diff --git a/tests/AlignmentFile_test.py b/tests/AlignmentFile_test.py

index 700018cb4182ab9855091ed07ab82bb32cd3d5fb..73a7b0a81067eba7058c89e6288030a2d2eb2a62 100644 (file)
--- a/tests/AlignmentFile_test.py
+++ b/tests/AlignmentFile_test.py
@@ -9,8 +9,6 @@ import unittest
  import os
  import shutil
  import sys
-import re
-import copy
  import collections
  import subprocess
  import logging
@@ -24,7 +22,7 @@ from functools import partial
  
  import pysam
  import pysam.samtools
-from TestUtils import checkBinaryEqual, checkURL, \
+from TestUtils import checkBinaryEqual, check_url, \
      check_samtools_view_equal, checkFieldEqual, force_str, \
      get_temp_filename, BAM_DATADIR
  
@@ -1572,7 +1570,7 @@ class TestRemoteFileFTP(unittest.TestCase):
  
      def testFTPView(self):
          return
-        if not checkURL(self.url):
+        if not check_url(self.url):
              return
  
          result = pysam.samtools.view(self.url, self.region)
@@ -1580,7 +1578,7 @@ class TestRemoteFileFTP(unittest.TestCase):
  
      def testFTPFetch(self):
          return
-        if not checkURL(self.url):
+        if not check_url(self.url):
              return
  
          samfile = pysam.AlignmentFile(self.url, "rb")
@@ -1595,7 +1593,7 @@ class TestRemoteFileHTTP(unittest.TestCase):
      local = os.path.join(BAM_DATADIR, "ex1.bam")
  
      def testView(self):
-        if not checkURL(self.url):
+        if not check_url(self.url):
              return
  
          samfile_local = pysam.AlignmentFile(self.local, "rb")
@@ -1605,7 +1603,7 @@ class TestRemoteFileHTTP(unittest.TestCase):
          self.assertEqual(len(result.splitlines()), len(ref))
  
      def testFetch(self):
-        if not checkURL(self.url):
+        if not check_url(self.url):
              return
  
          with pysam.AlignmentFile(self.url, "rb") as samfile:
@@ -1619,7 +1617,7 @@ class TestRemoteFileHTTP(unittest.TestCase):
              self.assertEqual(x.compare(y), 0)
  
      def testFetchAll(self):
-        if not checkURL(self.url):
+        if not check_url(self.url):
              return
  
          with pysam.AlignmentFile(self.url, "rb") as samfile:
@@ -2206,7 +2204,7 @@ class TestExplicitIndex(unittest.TestCase):
              samfile.fetch("chr1")
  
      def testRemoteExplicitIndexBAM(self):
-        if not checkURL(
+        if not check_url(
                  "http://genserv.anat.ox.ac.uk/downloads/pysam/test/noindex.bam"):
              return
  
@@ -2261,7 +2259,7 @@ class TestHeader1000Genomes(unittest.TestCase):
  
      def testRead(self):
  
-        if not checkURL(self.bamfile):
+        if not check_url(self.bamfile):
              return
  
          f = pysam.AlignmentFile(self.bamfile, "rb")
diff --git a/tests/TestUtils.py b/tests/TestUtils.py

index f4fe8e36cd1eb56ed995efffd8eceddc5e8434fe..7811b9e1822070c6078967f185de927b275c7198 100644 (file)
--- a/tests/TestUtils.py
+++ b/tests/TestUtils.py
@@ -132,7 +132,7 @@ def check_samtools_view_equal(
      return True
  
  
-def checkURL(url):
+def check_url(url):
      '''return True if URL is available.
  
      A URL might not be available if it is the wrong URL
diff --git a/tests/faidx_test.py b/tests/faidx_test.py

index c618a92f07b2c72c00367bfc7279674645de4b95..3126e2388abd06871fec679842ec241017d6acc9 100644 (file)
--- a/tests/faidx_test.py
+++ b/tests/faidx_test.py
@@ -5,7 +5,7 @@ import gzip
  import copy
  import shutil
  
-from TestUtils import checkURL, BAM_DATADIR, get_temp_filename
+from TestUtils import check_url, BAM_DATADIR, get_temp_filename
  
  
  class TestFastaFile(unittest.TestCase):
@@ -220,7 +220,7 @@ class TestRemoteFileFTP(unittest.TestCase):
             "GRCh38_reference_genome/GRCh38_full_analysis_set_plus_decoy_hla.fa")
  
      def testFTPView(self):
-        if not checkURL(self.url):
+        if not check_url(self.url):
              return
  
          with pysam.Fastafile(self.url) as f:
@@ -229,7 +229,7 @@ class TestRemoteFileFTP(unittest.TestCase):
                  1000)
  
      def test_sequence_lengths_are_available(self):
-        if not checkURL(self.url):
+        if not check_url(self.url):
              return
  
          with pysam.Fastafile(self.url) as f:
diff --git a/tests/pysam_data/example_reverse_complement.sam b/tests/pysam_data/example_reverse_complement.sam

new file mode 100644 (file)

index 0000000..795d19c
--- /dev/null
+++ b/tests/pysam_data/example_reverse_complement.sam
@@ -0,0 +1,28 @@
+@SQ    SN:chr1 LN:249250621
+@SQ    SN:chr10        LN:135534747
+@SQ    SN:chr11        LN:135006516
+@SQ    SN:chr12        LN:133851895
+@SQ    SN:chr13        LN:115169878
+@SQ    SN:chr14        LN:107349540
+@SQ    SN:chr15        LN:102531392
+@SQ    SN:chr16        LN:90354753
+@SQ    SN:chr17        LN:81195210
+@SQ    SN:chr18        LN:78077248
+@SQ    SN:chr19        LN:59128983
+@SQ    SN:chr2 LN:243199373
+@SQ    SN:chr20        LN:63025520
+@SQ    SN:chr21        LN:48129895
+@SQ    SN:chr22        LN:51304566
+@SQ    SN:chr3 LN:198022430
+@SQ    SN:chr4 LN:191154276
+@SQ    SN:chr5 LN:180915260
+@SQ    SN:chr6 LN:171115067
+@SQ    SN:chr7 LN:159138663
+@SQ    SN:chr8 LN:146364022
+@SQ    SN:chr9 LN:141213431
+@SQ    SN:chrM LN:16571
+@SQ    SN:chrX LN:155270560
+@SQ    SN:chrY LN:59373566
+@PG    ID:bwa  PN:bwa  VN:0.7.12-r1039 CL:bwa mem /ifs/mirror/genomes/bwa-0.7.5a/hg19 both.fq
+chr22:25000000-25000500        0       chr22   25000000        60      501M    *       0       0       GCTCTGTGCCGCAAGGGTGGAAACTGTGAGAGACAGATTCCAACTCCACGTCTGGGTAGTAAGCATCCAGTCCAGGGCTGTAGGCAGTCCTGGGGAAGACGCCAGAGATCTGTGCATTCTCATATCGAGGGATAGCGACTCCAGGCTGGGGGCTGGCAGGGTAAGGGGTGGGTGGGTCCTGGGCTTACCCGCAGGTCTGCAGACTTCCTGGGGCCAGCTGACCTCGTAAAATCCCTTTTGTCTAAGCTTCAGTTTCCTGCCTGTGAATGGGGTTGGGGCTGTGCTCTGGTTTCACCCTTGTGGCTCTGGGGTTGTGGTGACAAAGCCATCAAGCTGGGTTGAAGGATTAACCAGGAAACTTCAGACTGGCTGCCGTGTCTACCTCTTCCTCATACTCCTCTCTCTGCTGCATCCTGGGAAGCTGCTCTGCTCAGCCTAAATGAGGCTCAGTTGTGTGTGTGCGCACGTGCTTGCACGTGTGTTGGAAGTGGGTGATACTGA   GCTCTGTGCCGCAAGGGTGGAAACTGTGAGAGACAGATTCCAACTCCACGTCTGGGTAGTAAGCATCCAGTCCAGGGCTGTAGGCAGTCCTGGGGAAGACGCCAGAGATCTGTGCATTCTCATATCGAGGGATAGCGACTCCAGGCTGGGGGCTGGCAGGGTAAGGGGTGGGTGGGTCCTGGGCTTACCCGCAGGTCTGCAGACTTCCTGGGGCCAGCTGACCTCGTAAAATCCCTTTTGTCTAAGCTTCAGTTTCCTGCCTGTGAATGGGGTTGGGGCTGTGCTCTGGTTTCACCCTTGTGGCTCTGGGGTTGTGGTGACAAAGCCATCAAGCTGGGTTGAAGGATTAACCAGGAAACTTCAGACTGGCTGCCGTGTCTACCTCTTCCTCATACTCCTCTCTCTGCTGCATCCTGGGAAGCTGCTCTGCTCAGCCTAAATGAGGCTCAGTTGTGTGTGTGCGCACGTGCTTGCACGTGTGTTGGAAGTGGGTGATACTGA   NM:i:0  MD:Z:501        AS:i:501        XS:i:343
+chr22:25000000-25000500        16      chr22   25000000        60      501M    *       0       0       GCTCTGTGCCGCAAGGGTGGAAACTGTGAGAGACAGATTCCAACTCCACGTCTGGGTAGTAAGCATCCAGTCCAGGGCTGTAGGCAGTCCTGGGGAAGACGCCAGAGATCTGTGCATTCTCATATCGAGGGATAGCGACTCCAGGCTGGGGGCTGGCAGGGTAAGGGGTGGGTGGGTCCTGGGCTTACCCGCAGGTCTGCAGACTTCCTGGGGCCAGCTGACCTCGTAAAATCCCTTTTGTCTAAGCTTCAGTTTCCTGCCTGTGAATGGGGTTGGGGCTGTGCTCTGGTTTCACCCTTGTGGCTCTGGGGTTGTGGTGACAAAGCCATCAAGCTGGGTTGAAGGATTAACCAGGAAACTTCAGACTGGCTGCCGTGTCTACCTCTTCCTCATACTCCTCTCTCTGCTGCATCCTGGGAAGCTGCTCTGCTCAGCCTAAATGAGGCTCAGTTGTGTGTGTGCGCACGTGCTTGCACGTGTGTTGGAAGTGGGTGATACTGA   CGAGACACGGCGTTCCCACCTTTGACACTCTCTGTCTAAGGTTGAGGTGCAGACCCATCATTCGTAGGTCAGGTCCCGACATCCGTCAGGACCCCTTCTGCGGTCTCTAGACACGTAAGAGTATAGCTCCCTATCGCTGAGGTCCGACCCCCGACCGTCCCATTCCCCACCCACCCAGGACCCGAATGGGCGTCCAGACGTCTGAAGGACCCCGGTCGACTGGAGCATTTTAGGGAAAACAGATTCGAAGTCAAAGGACGGACACTTACCCCAACCCCGACACGAGACCAAAGTGGGAACACCGAGACCCCAACACCACTGTTTCGGTAGTTCGACCCAACTTCCTAATTGGTCCTTTGAAGTCTGACCGACGGCACAGATGGAGAAGGAGTATGAGGAGAGAGACGACGTAGGACCCTTCGACGAGACGAGTCGGATTTACTCCGAGTCAACACACACACGCGTGCACGAACGTGCACACAACCTTCACCCACTATGACT   NM:i:0  MD:Z:501        AS:i:501        XS:i:343
diff --git a/tests/tabix_test.py b/tests/tabix_test.py

index 013ff8683aa43f620e9085b88862ccc58b656b90..265c60a41868b2736c506b63f830a48bcee4c5a4 100644 (file)
--- a/tests/tabix_test.py
+++ b/tests/tabix_test.py
@@ -16,7 +16,7 @@ import glob
  import re
  import copy
  import tempfile
-from TestUtils import checkURL, load_and_convert, TABIX_DATADIR, get_temp_filename
+from TestUtils import check_url, load_and_convert, TABIX_DATADIR, get_temp_filename
  
  IS_PYTHON3 = sys.version_info[0] >= 3
  
@@ -1046,7 +1046,7 @@ class TestRemoteFileHTTP(unittest.TestCase):
      local = os.path.join(TABIX_DATADIR, "example.gtf.gz")
  
      def setUp(self):
-        if not pysam.config.HAVE_LIBCURL or not checkURL(self.url):
+        if not pysam.config.HAVE_LIBCURL or not check_url(self.url):
              self.remote_file = None
          else:
              self.remote_file = pysam.TabixFile(self.url, "r")
@@ -1085,7 +1085,7 @@ class TestRemoteFileHTTPWithHeader(TestRemoteFileHTTP):
      local = os.path.join(TABIX_DATADIR, "example_comments.gtf.gz")
  
      def setUp(self):
-        if not pysam.config.HAVE_LIBCURL or not checkURL(self.url):
+        if not pysam.config.HAVE_LIBCURL or not check_url(self.url):
              self.remote_file = None
          else:
              self.remote_file = pysam.TabixFile(self.url, "r")
author	Afif Elghraoui <afif@debian.org>
	Sun, 22 Apr 2018 22:30:02 +0000 (18:30 -0400)
committer	Afif Elghraoui <afif@debian.org>
	Sun, 22 Apr 2018 22:30:02 +0000 (18:30 -0400)
NEWS		patch \| blob \| history
bcftools/bcftools.h		patch \| blob \| history
bcftools/consensus.c		patch \| blob \| history
bcftools/consensus.c.pysam.c		patch \| blob \| history
bcftools/convert.c		patch \| blob \| history
bcftools/convert.c.pysam.c		patch \| blob \| history
bcftools/convert.h		patch \| blob \| history
bcftools/csq.c		patch \| blob \| history
bcftools/csq.c.pysam.c		patch \| blob \| history
bcftools/filter.c		patch \| blob \| history
bcftools/filter.c.pysam.c		patch \| blob \| history
bcftools/main.c.pysam.c		patch \| blob \| history
bcftools/regidx.c		patch \| blob \| history
bcftools/regidx.c.pysam.c		patch \| blob \| history
bcftools/regidx.h		patch \| blob \| history
bcftools/reheader.c		patch \| blob \| history
bcftools/reheader.c.pysam.c		patch \| blob \| history
bcftools/vcfannotate.c		patch \| blob \| history
bcftools/vcfannotate.c.pysam.c		patch \| blob \| history
bcftools/vcfcnv.c		patch \| blob \| history
bcftools/vcfcnv.c.pysam.c		patch \| blob \| history
bcftools/vcfconvert.c		patch \| blob \| history
bcftools/vcfconvert.c.pysam.c		patch \| blob \| history
bcftools/vcfgtcheck.c		patch \| blob \| history
bcftools/vcfgtcheck.c.pysam.c		patch \| blob \| history
bcftools/vcfnorm.c		patch \| blob \| history
bcftools/vcfnorm.c.pysam.c		patch \| blob \| history
bcftools/vcfplugin.c		patch \| blob \| history
bcftools/vcfplugin.c.pysam.c		patch \| blob \| history
bcftools/vcfquery.c		patch \| blob \| history
bcftools/vcfquery.c.pysam.c		patch \| blob \| history
bcftools/vcfroh.c		patch \| blob \| history
bcftools/vcfroh.c.pysam.c		patch \| blob \| history
bcftools/vcfsom.c		patch \| blob \| history
bcftools/vcfsom.c.pysam.c		patch \| blob \| history
bcftools/vcfstats.c		patch \| blob \| history
bcftools/vcfstats.c.pysam.c		patch \| blob \| history
bcftools/vcfview.c		patch \| blob \| history
bcftools/vcfview.c.pysam.c		patch \| blob \| history
bcftools/version.h		patch \| blob \| history
doc/release.rst		patch \| blob \| history
pysam/libcalignedsegment.pyx		patch \| blob \| history
pysam/libcalignmentfile.pyx		patch \| blob \| history
pysam/version.py		patch \| blob \| history
tests/AlignmentFileHeader_test.py		patch \| blob \| history
tests/AlignmentFilePileup_test.py		patch \| blob \| history
tests/AlignmentFile_test.py		patch \| blob \| history
tests/TestUtils.py		patch \| blob \| history
tests/faidx_test.py		patch \| blob \| history
tests/pysam_data/example_reverse_complement.sam	[new file with mode: 0644]	patch \| blob
tests/tabix_test.py		patch \| blob \| history