- osx
language: c
-sudo: required
+sudo: false
env:
matrix:
- CONDA_PY=2.7
- - CONDA_PY=3.3
- CONDA_PY=3.4
- CONDA_PY=3.5
+ - CONDA_PY=3.6
addons:
apt:
while (*p && *p!=',') p++;
if ( *p==',' ) p++;
}
- if ( args->nwrite>1 && !args->prefix ) error("Expected -p when mutliple output files given: --write %s\n", args->write_files);
+ if ( args->nwrite>1 && !args->prefix ) error("Expected -p when multiple output files given: --write %s\n", args->write_files);
if ( args->isec_op==OP_COMPLEMENT && args->nwrite )
{
if ( args->nwrite>1 ) error("Multiple files to -w make no sense with -C\n");
--- /dev/null
+"""compute number of reads/alignments from BAM file
+===================================================
+
+This is a benchmarking utility script with limited functionality.
+
+Compute simple flag stats on a BAM-file using
+the pysam cython interface.
+
+"""
+
+import sys
+import pysam
+import pyximport
+pyximport.install()
+import _cython_flagstat
+
+assert len(sys.argv) == 2, "USAGE: {} filename.bam".format(sys.argv[0])
+
+is_paired, is_proper = _cython_flagstat.count(
+ pysam.AlignmentFile(sys.argv[1], "rb"))
+
+print ("there are alignments of %i paired reads" % is_paired)
+print ("there are %i proper paired alignments" % is_proper)
--- /dev/null
+"""compute number of reads/alignments from BAM file
+===================================================
+
+This is a benchmarking utility script with limited functionality.
+
+Compute simple flag stats on a BAM-file using
+the pysam python interface.
+"""
+
+import sys
+import pysam
+
+assert len(sys.argv) == 2, "USAGE: {} filename.bam".format(sys.argv[0])
+
+is_paired = 0
+is_proper = 0
+
+for read in pysam.AlignmentFile(sys.argv[1], "rb"):
+ is_paired += read.is_paired
+ is_proper += read.is_proper_pair
+
+print ("there are alignments of %i paired reads" % is_paired)
+print ("there are %i proper paired alignments" % is_proper)
--- /dev/null
+#!/bin/bash
+#
+# Build manylinux1 wheels for pysam. Based on the example at
+# <https://github.com/pypa/python-manylinux-demo>
+#
+# It is best to run this in a fresh clone of the repository!
+#
+# Run this within the repository root:
+# docker run --rm -v $(pwd):/io quay.io/pypa/manylinux1_x86_64 /io/buildwheels.sh
+#
+# The wheels will be put into the wheelhouse/ subdirectory.
+#
+# For interactive tests:
+# docker run -it -v $(pwd):/io quay.io/pypa/manylinux1_x86_64 /bin/bash
+
+set -xeuo pipefail
+
+# For convenience, if this script is called from outside of a docker container,
+# it starts a container and runs itself inside of it.
+if ! grep -q docker /proc/1/cgroup; then
+ # We are not inside a container
+ exec docker run --rm -v $(pwd):/io quay.io/pypa/manylinux1_x86_64 /io/$0
+fi
+
+yum install -y zlib-devel
+
+# Python 2.6 is not supported
+rm -r /opt/python/cp26*
+
+# Python 3.3 builds fail with:
+# /opt/rh/devtoolset-2/root/usr/libexec/gcc/x86_64-CentOS-linux/4.8.2/ld: cannot find -lchtslib
+rm -r /opt/python/cp33*
+
+# Without libcurl support, htslib can open files from HTTP and FTP URLs.
+# With libcurl support, it also supports HTTPS and S3 URLs, but libcurl needs a
+# current version of OpenSSL, and we do not want to be responsible for
+# updating the wheels as soon as there are any security issues. So disable
+# libcurl for now.
+# See also <https://github.com/pypa/manylinux/issues/74>.
+#
+export HTSLIB_CONFIGURE_OPTIONS="--disable-libcurl"
+
+PYBINS="/opt/python/*/bin"
+for PYBIN in ${PYBINS}; do
+ ${PYBIN}/pip install -r /io/requirements.txt
+ ${PYBIN}/pip wheel /io/ -w wheelhouse/
+done
+
+# Bundle external shared libraries into the wheels
+#
+# The '-L .' option is a workaround. By default, auditwheel puts all external
+# libraries (.so files) into a .libs directory and sets the RUNPATH to $ORIGIN/.libs.
+# When HTSLIB_MODE is 'shared' (now the default), all .so libraries that are
+# part of pysam require RUNPATH to be set to $ORIGIN (without the .libs), but
+# auditwheel seems to overwrite $ORIGIN with $ORIGIN/.libs. This workaround
+# makes auditwheel set the RUNPATH to "$ORIGIN/." so that it works as desired.
+#
+for whl in wheelhouse/*.whl; do
+ auditwheel repair -L . $whl -w /io/wheelhouse/
+done
+
+# Created files are owned by root, so fix permissions.
+chown -R --reference=/io/setup.py /io/wheelhouse/
+
+# TODO Install packages and test them
+#for PYBIN in ${PYBINS}; do
+# ${PYBIN}/pip install pysam --no-index -f /io/wheelhouse
+# (cd $HOME; ${PYBIN}/nosetests ...)
+#done
config_vars = get_config_vars()
config_vars['LDSHARED'] = config_vars['LDSHARED'].replace('-bundle', '')
config_vars['SHLIB_EXT'] = '.so'
- config_vars['SO'] = '.so'
def is_pip_install():
ext.library_dirs.append(os.path.join(self.build_lib, "pysam"))
if sys.platform == 'darwin':
-
relative_module_path = ext.name.replace(".", os.sep) + get_config_vars()["SO"]
if "develop" in sys.argv or "test" in sys.argv:
:members:
-.. autoclass:: pysam.cfaidx.FastqProxy
+.. autoclass:: pysam.FastqProxy
:members:
Release notes
=============
+Release 0.10.0
+==============
+
+This release implements further functionality in the VariantFile API
+and includes several bugfixes:
+
+* treat the -c option of samtools view as a special case: counts go to
+  stdout even if -o is given, fixes #315
+* permit reading BAM files with CSI index, closes #370
+* raise Error if query name exceeds maximum length, fixes #373
+* new method to compute hash value for AlignedSegment
+* AlignmentFile, VariantFile and TabixFile all inherit from HTSFile
+* Avoid segfault by detecting out of range reference_id and
+ next_reference in AlignedSegment.tostring
+* Issue #355: Implement streams using file descriptors for VariantFile
+* upgrade to htslib 1.3.2
+* fix compilation with musl libc
+* Issue #316, #360: Rename all Cython modules to have lib as a prefix
+* Issue #332, hardclipped bases in cigar included by
+ pysam.AlignedSegment.infer_query_length()
+* Added support for Python 3.6 filename encoding protocol
+* Issue #371, fix incorrect parsing of scalar INFO and FORMAT fields in VariantRecord
+* Issue #331, fix failure in VariantFile.reset() method
+* Issue #314, add VariantHeader.new_record(), VariantFile.new_record() and
+ VariantRecord.copy() methods to create new VariantRecord objects
+* Added VariantRecordFilter.add() method to allow setting new VariantRecord filters
+* Preliminary (potentially unsafe) support for removing and altering header metadata
+* Many minor fixes and improvements to VariantFile and related objects
+
Release 0.9.1
=============
for row in tbx.fetch("chr1", 1000, 2000):
print ("chromosome is", row[0])
-By providing a parser argument to :class:`~pysam.AlignmentFile.fetch`
+By providing a parser to :class:`~pysam.AlignmentFile.fetch`
or :class:`~pysam.TabixFile`, the data will be presented in parsed
-form:
+form::
for row in tbx.fetch("chr1", 1000, 2000, parser=pysam.asTuple()):
print ("chromosome is", row.contig)
+ print ("first field (chrom)=", row[0])
+
+Pre-built parsers are available for :term:`bed`
+(:class:`~pysam.asBed`) formatted files and :term:`gtf`
+(:class:`~pysam.asGTF`) formatted files. Thus, additional fields
+become available through named access, for example::
+
+ for row in tbx.fetch("chr1", 1000, 2000, parser=pysam.asBed()):
+ print ("name is", row.name)
+
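+A similar sketch for a tabix-indexed :term:`gtf` file (assuming
+``tbx`` now points at a GTF file; ``feature`` is one of the named
+fields provided by :class:`~pysam.asGTF`)::
+
+    for row in tbx.fetch("chr1", 1000, 2000, parser=pysam.asGTF()):
+        # named access to the GTF feature column
+        print ("feature is", row.feature)
+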
.. Currently inactivated as pileup deprecated
.. Using the samtools SNP caller
import sysconfig
from pysam.libchtslib import *
-from pysam.cutils import *
-import pysam.cutils as cutils
-import pysam.cfaidx as cfaidx
-from pysam.cfaidx import *
-import pysam.ctabix as ctabix
-from pysam.ctabix import *
-import pysam.csamfile as csamfile
-from pysam.csamfile import *
-import pysam.calignmentfile as calignmentfile
-from pysam.calignmentfile import *
-import pysam.calignedsegment as calignedsegment
-from pysam.calignedsegment import *
-import pysam.cvcf as cvcf
-from pysam.cvcf import *
-import pysam.cbcf as cbcf
-from pysam.cbcf import *
+from pysam.libcutils import *
+import pysam.libcutils as libcutils
+import pysam.libcfaidx as libcfaidx
+from pysam.libcfaidx import *
+import pysam.libctabix as libctabix
+from pysam.libctabix import *
+import pysam.libcsamfile as libcsamfile
+from pysam.libcsamfile import *
+import pysam.libcalignmentfile as libcalignmentfile
+from pysam.libcalignmentfile import *
+import pysam.libcalignedsegment as libcalignedsegment
+from pysam.libcalignedsegment import *
+import pysam.libcvcf as libcvcf
+from pysam.libcvcf import *
+import pysam.libcbcf as libcbcf
+from pysam.libcbcf import *
+import pysam.libcbgzf as libcbgzf
+from pysam.libcbgzf import *
from pysam.utils import SamtoolsError
import pysam.Pileup as Pileup
from pysam.samtools import *
# export all the symbols from separate modules
__all__ = \
libchtslib.__all__ +\
- cutils.__all__ +\
- ctabix.__all__ +\
- cvcf.__all__ +\
- cbcf.__all__ +\
- cfaidx.__all__ +\
- calignmentfile.__all__ +\
- calignedsegment.__all__ +\
- csamfile.__all__ +\
+ libcutils.__all__ +\
+ libctabix.__all__ +\
+ libcvcf.__all__ +\
+ libcbcf.__all__ +\
+ libcbgzf.__all__ +\
+ libcfaidx.__all__ +\
+ libcalignmentfile.__all__ +\
+ libcalignedsegment.__all__ +\
+ libcsamfile.__all__ +\
["SamtoolsError"] +\
["Pileup"]
def get_libraries():
'''return a list of libraries to link against.'''
- # Note that this list does not include csamtools.so as there are
+ # Note that this list does not include libcsamtools.so as there are
# numerous name conflicts with libchtslib.so.
dirname = os.path.abspath(os.path.join(os.path.dirname(__file__)))
- pysam_libs = ['ctabixproxies',
- 'cfaidx',
- 'csamfile',
- 'cvcf',
- 'cbcf',
- 'ctabix']
+ pysam_libs = ['libctabixproxies',
+ 'libcfaidx',
+ 'libcsamfile',
+ 'libcvcf',
+ 'libcbcf',
+ 'libctabix']
if pysam.config.HTSLIB == "builtin":
pysam_libs.append('libchtslib')
- if sys.version_info.major >= 3:
- if sys.version_info.minor >= 5:
- return [os.path.join(dirname, x + ".{}.so".format(
- sysconfig.get_config_var('SOABI'))) for x in pysam_libs]
- else:
- return [os.path.join(dirname, x + ".{}{}.so".format(
- sys.implementation.cache_tag,
- sys.abiflags)) for x in pysam_libs]
- else:
- return [os.path.join(dirname, x + ".so") for x in pysam_libs]
+ so = sysconfig.get_config_var('SO')
+ return [os.path.join(dirname, x + so) for x in pysam_libs]
+++ /dev/null
-from pysam.chtslib cimport *
-
-cdef extern from "htslib_util.h":
-
- # add *nbytes* into the variable length data of *src* at *pos*
- bam1_t * pysam_bam_update(bam1_t * b,
- size_t nbytes_old,
- size_t nbytes_new,
- uint8_t * pos)
-
- # now: static
- int aux_type2size(int)
-
- char * pysam_bam_get_qname(bam1_t * b)
- uint32_t * pysam_bam_get_cigar(bam1_t * b)
- uint8_t * pysam_bam_get_seq(bam1_t * b)
- uint8_t * pysam_bam_get_qual(bam1_t * b)
- uint8_t * pysam_bam_get_aux(bam1_t * b)
- int pysam_bam_get_l_aux(bam1_t * b)
- char pysam_bam_seqi(uint8_t * s, int i)
-
- uint16_t pysam_get_bin(bam1_t * b)
- uint8_t pysam_get_qual(bam1_t * b)
- uint8_t pysam_get_l_qname(bam1_t * b)
- uint16_t pysam_get_flag(bam1_t * b)
- uint16_t pysam_get_n_cigar(bam1_t * b)
- void pysam_set_bin(bam1_t * b, uint16_t v)
- void pysam_set_qual(bam1_t * b, uint8_t v)
- void pysam_set_l_qname(bam1_t * b, uint8_t v)
- void pysam_set_flag(bam1_t * b, uint16_t v)
- void pysam_set_n_cigar(bam1_t * b, uint16_t v)
- void pysam_update_flag(bam1_t * b, uint16_t v, uint16_t flag)
-
-
-from pysam.calignmentfile cimport AlignmentFile
-ctypedef AlignmentFile AlignmentFile_t
-
-
-# Note: need to declare all C fields and methods here
-cdef class AlignedSegment:
-
- # object that this AlignedSegment represents
- cdef bam1_t * _delegate
-
- # the file from which this AlignedSegment originates (can be None)
- cdef AlignmentFile _alignment_file
-
- # caching of array properties for quick access
- cdef object cache_query_qualities
- cdef object cache_query_alignment_qualities
- cdef object cache_query_sequence
- cdef object cache_query_alignment_sequence
-
- # add an alignment tag with value to the AlignedSegment
- # an existing tag of the same name will be replaced.
- cpdef set_tag(self, tag, value, value_type=?, replace=?)
-
- # add an alignment tag with value to the AlignedSegment
- # an existing tag of the same name will be replaced.
- cpdef get_tag(self, tag, with_value_type=?)
-
- # return true if tag exists
- cpdef has_tag(self, tag)
-
- # returns a valid sam alignment string
- cpdef tostring(self, AlignmentFile_t handle)
-
-
-cdef class PileupColumn:
- cdef bam_pileup1_t ** plp
- cdef int tid
- cdef int pos
- cdef int n_pu
- cdef AlignmentFile _alignment_file
-
-
-cdef class PileupRead:
- cdef AlignedSegment _alignment
- cdef int32_t _qpos
- cdef int _indel
- cdef int _level
- cdef uint32_t _is_del
- cdef uint32_t _is_head
- cdef uint32_t _is_tail
- cdef uint32_t _is_refskip
-
-# factory methods
-cdef makeAlignedSegment(bam1_t * src, AlignmentFile alignment_file)
-cdef makePileupColumn(bam_pileup1_t ** plp, int tid, int pos, int n_pu, AlignmentFile alignment_file)
-cdef inline makePileupRead(bam_pileup1_t * src, AlignmentFile alignment_file)
-cdef inline uint32_t get_alignment_length(bam1_t * src)
+++ /dev/null
-# cython: embedsignature=True
-# cython: profile=True
-###############################################################################
-###############################################################################
-# Cython wrapper for SAM/BAM/CRAM files based on htslib
-###############################################################################
-# The principal classes defined in this module are:
-#
-# class AlignedSegment an aligned segment (read)
-#
-# class PileupColumn a collection of segments (PileupRead) aligned to
-# a particular genomic position.
-#
-# class PileupRead an AlignedSegment aligned to a particular genomic
-# position. Contains additional attributes with respect
-# to this.
-#
-# Additionally this module defines numerous additional classes that are part
-# of the internal API. These are:
-#
-# Various iterator classes to iterate over alignments in sequential (IteratorRow)
-# or in a stacked fashion (IteratorColumn):
-#
-# class IteratorRow
-# class IteratorRowRegion
-# class IteratorRowHead
-# class IteratorRowAll
-# class IteratorRowAllRefs
-# class IteratorRowSelection
-#
-###############################################################################
-#
-# The MIT License
-#
-# Copyright (c) 2015 Andreas Heger
-#
-# Permission is hereby granted, free of charge, to any person obtaining a
-# copy of this software and associated documentation files (the "Software"),
-# to deal in the Software without restriction, including without limitation
-# the rights to use, copy, modify, merge, publish, distribute, sublicense,
-# and/or sell copies of the Software, and to permit persons to whom the
-# Software is furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-# DEALINGS IN THE SOFTWARE.
-#
-###############################################################################
-import re
-import array
-import ctypes
-import struct
-
-cimport cython
-from cpython cimport array as c_array
-from cpython.version cimport PY_MAJOR_VERSION
-from cpython cimport PyErr_SetString, PyBytes_FromStringAndSize
-from libc.string cimport strchr
-from cpython cimport array as c_array
-
-from pysam.cutils cimport force_bytes, force_str, \
- charptr_to_str, charptr_to_bytes
-from pysam.cutils cimport qualities_to_qualitystring, qualitystring_to_array, \
- array_to_qualitystring
-
-# Constants for binary tag conversion
-cdef char * htslib_types = 'cCsSiIf'
-cdef char * parray_types = 'bBhHiIf'
-
-# translation tables
-
-# cigar code to character and vice versa
-cdef char* CODE2CIGAR= "MIDNSHP=XB"
-cdef int NCIGAR_CODES = 10
-
-if PY_MAJOR_VERSION >= 3:
- CIGAR2CODE = dict([y, x] for x, y in enumerate(CODE2CIGAR))
-else:
- CIGAR2CODE = dict([ord(y), x] for x, y in enumerate(CODE2CIGAR))
-
-CIGAR_REGEX = re.compile("(\d+)([MIDNSHP=XB])")
-
-#####################################################################
-# typecode guessing
-cdef inline char map_typecode_htslib_to_python(uint8_t s):
- """map an htslib typecode to the corresponding python typecode
- to be used in the struct or array modules."""
-
- # map type from htslib to python array
- cdef char * f = strchr(htslib_types, s)
-
- if f == NULL:
- return 0
- return parray_types[f - htslib_types]
-
-cdef inline uint8_t map_typecode_python_to_htslib(char s):
- """determine value type from type code of array"""
- cdef char * f = strchr(parray_types, s)
- if f == NULL:
- return 0
- return htslib_types[f - parray_types]
-
-# optional tag data manipulation
-cdef convert_binary_tag(uint8_t * tag):
- """return bytesize, number of values and array of values
- in aux_data memory location pointed to by tag."""
- cdef uint8_t auxtype
- cdef uint8_t byte_size
- cdef int32_t nvalues
- # get byte size
- auxtype = tag[0]
- byte_size = aux_type2size(auxtype)
- tag += 1
- # get number of values in array
- nvalues = (<int32_t*>tag)[0]
- tag += 4
-
- # define python array
- cdef c_array.array c_values = array.array(
- chr(map_typecode_htslib_to_python(auxtype)))
- c_array.resize(c_values, nvalues)
-
- # copy data
- memcpy(c_values.data.as_voidptr, <uint8_t*>tag, nvalues * byte_size)
-
- # no need to check for endian-ness as bam1_core_t fields
- # and aux_data are in host endian-ness. See sam.c and calls
- # to swap_data
- return byte_size, nvalues, c_values
-
-
-cdef inline uint8_t get_value_code(value, value_type=None):
- '''guess type code for a *value*. If *value_type* is None,
- the type code will be inferred based on the Python type of
- *value*'''
- cdef uint8_t typecode
- cdef char * _char_type
-
- if value_type is None:
- if isinstance(value, int):
- typecode = 'i'
- elif isinstance(value, float):
- typecode = 'd'
- elif isinstance(value, str):
- typecode = 'Z'
- elif isinstance(value, bytes):
- typecode = 'Z'
- elif isinstance(value, array.array) or \
- isinstance(value, list) or \
- isinstance(value, tuple):
- typecode = 'B'
- else:
- return 0
- else:
- if value_type not in 'Zidf':
- return 0
- value_type = force_bytes(value_type)
- _char_type = value_type
- typecode = (<uint8_t*>_char_type)[0]
-
- return typecode
-
-
-cdef inline bytes getTypecode(value, maximum_value=None):
- '''returns the value typecode of a value.
-
-    If maximum_value is specified, the appropriate type is
- returned for a range where value is the minimum.
- '''
-
- if maximum_value is None:
- maximum_value = value
-
- cdef bytes valuetype
-
- t = type(value)
-
- if t is float:
- valuetype = b'f'
- elif t is int:
- # signed ints
- if value < 0:
- if value >= -128 and maximum_value < 128:
- valuetype = b'c'
- elif value >= -32768 and maximum_value < 32768:
- valuetype = b's'
- elif value < -2147483648 or maximum_value >= 2147483648:
- raise ValueError(
- "at least one signed integer out of range of "
- "BAM/SAM specification")
- else:
- valuetype = b'i'
- # unsigned ints
- else:
- if maximum_value < 256:
- valuetype = b'C'
- elif maximum_value < 65536:
- valuetype = b'S'
- elif maximum_value >= 4294967296:
- raise ValueError(
- "at least one integer out of range of BAM/SAM specification")
- else:
- valuetype = b'I'
- else:
- # Note: hex strings (H) are not supported yet
- if t is not bytes:
- value = value.encode('ascii')
- if len(value) == 1:
- valuetype = b'A'
- else:
- valuetype = b'Z'
-
- return valuetype
-
-
-cdef inline packTags(tags):
- """pack a list of tags. Each tag is a tuple of (tag, tuple).
-
- Values are packed into the most space efficient data structure
- possible unless the tag contains a third field with the typecode.
-
- Returns a format string and the associated list of arguments
- to be used in a call to struct.pack_into.
- """
- fmts, args = ["<"], []
-
- cdef char array_typecode
-
- datatype2format = {
- b'c': ('b', 1),
- b'C': ('B', 1),
- b's': ('h', 2),
- b'S': ('H', 2),
- b'i': ('i', 4),
- b'I': ('I', 4),
- b'f': ('f', 4),
- b'A': ('c', 1)}
-
- for tag in tags:
-
- if len(tag) == 2:
- pytag, value = tag
- valuetype = None
- elif len(tag) == 3:
- pytag, value, valuetype = tag
- else:
- raise ValueError("malformatted tag: %s" % str(tag))
-
- pytag = force_bytes(pytag)
- valuetype = force_bytes(valuetype)
- t = type(value)
-
- if t is tuple or t is list:
- # binary tags from tuples or lists
- if valuetype is None:
- # automatically determine value type - first value
- # determines type. If there is a mix of types, the
- # result is undefined.
- valuetype = getTypecode(min(value), max(value))
-
- if valuetype not in datatype2format:
- raise ValueError("invalid value type '%s'" % valuetype)
-
- datafmt = "2sccI%i%s" % (len(value), datatype2format[valuetype][0])
- args.extend([pytag[:2],
- b"B",
- valuetype,
- len(value)] + list(value))
-
- elif isinstance(value, array.array):
- # binary tags from arrays
- if valuetype is None:
- array_typecode = map_typecode_python_to_htslib(ord(value.typecode))
-
- if array_typecode == 0:
- raise ValueError("unsupported type code '{}'"
- .format(value.typecode))
-
- valuetype = force_bytes(chr(array_typecode))
-
- if valuetype not in datatype2format:
- raise ValueError("invalid value type '%s' (%s)" %
- (valuetype, type(valuetype)))
-
- # use array.tostring() to retrieve byte representation and
- # save as bytes
- datafmt = "2sccI%is" % (len(value) * datatype2format[valuetype][1])
- args.extend([pytag[:2],
- b"B",
- valuetype,
- len(value),
- force_bytes(value.tostring())])
-
- else:
- if valuetype is None:
- valuetype = getTypecode(value)
-
- if valuetype in b"AZ":
- value = force_bytes(value)
-
- if valuetype == b"Z":
- datafmt = "2sc%is" % (len(value)+1)
- else:
- datafmt = "2sc%s" % datatype2format[valuetype][0]
-
- args.extend([pytag[:2],
- valuetype,
- value])
-
- fmts.append(datafmt)
-
- return "".join(fmts), args
-
-
-cdef inline int32_t calculateQueryLength(bam1_t * src):
- """return query length computed from CIGAR alignment.
-
- Return 0 if there is no CIGAR alignment.
- """
-
- cdef uint32_t * cigar_p = pysam_bam_get_cigar(src)
-
- if cigar_p == NULL:
- return 0
-
- cdef uint32_t k, qpos
- cdef int op
- qpos = 0
-
- for k from 0 <= k < pysam_get_n_cigar(src):
- op = cigar_p[k] & BAM_CIGAR_MASK
-
- if op == BAM_CMATCH or op == BAM_CINS or \
- op == BAM_CSOFT_CLIP or \
- op == BAM_CEQUAL or op == BAM_CDIFF:
- qpos += cigar_p[k] >> BAM_CIGAR_SHIFT
-
- return qpos
-
-
-cdef inline int32_t getQueryStart(bam1_t *src) except -1:
- cdef uint32_t * cigar_p
- cdef uint32_t k, op
- cdef uint32_t start_offset = 0
-
- if pysam_get_n_cigar(src):
- cigar_p = pysam_bam_get_cigar(src);
- for k from 0 <= k < pysam_get_n_cigar(src):
- op = cigar_p[k] & BAM_CIGAR_MASK
- if op == BAM_CHARD_CLIP:
- if start_offset != 0 and start_offset != src.core.l_qseq:
- PyErr_SetString(ValueError, 'Invalid clipping in CIGAR string')
- return -1
- elif op == BAM_CSOFT_CLIP:
- start_offset += cigar_p[k] >> BAM_CIGAR_SHIFT
- else:
- break
-
- return start_offset
-
-
-cdef inline int32_t getQueryEnd(bam1_t *src) except -1:
- cdef uint32_t * cigar_p
- cdef uint32_t k, op
- cdef uint32_t end_offset = src.core.l_qseq
-
- # if there is no sequence, compute length from cigar string
- if end_offset == 0:
- end_offset = calculateQueryLength(src)
-
- # walk backwards in cigar string
- if pysam_get_n_cigar(src) > 1:
- cigar_p = pysam_bam_get_cigar(src);
- for k from pysam_get_n_cigar(src) > k >= 1:
- op = cigar_p[k] & BAM_CIGAR_MASK
- if op == BAM_CHARD_CLIP:
- if end_offset != 0 and end_offset != src.core.l_qseq:
- PyErr_SetString(ValueError,
- 'Invalid clipping in CIGAR string')
- return -1
- elif op == BAM_CSOFT_CLIP:
- end_offset -= cigar_p[k] >> BAM_CIGAR_SHIFT
- else:
- break
-
- return end_offset
-
-
-cdef inline bytes getSequenceInRange(bam1_t *src,
- uint32_t start,
- uint32_t end):
- """return python string of the sequence in a bam1_t object.
- """
-
- cdef uint8_t * p
- cdef uint32_t k
- cdef char * s
-
- if not src.core.l_qseq:
- return None
-
- seq = PyBytes_FromStringAndSize(NULL, end - start)
- s = <char*>seq
- p = pysam_bam_get_seq(src)
-
- for k from start <= k < end:
- # equivalent to seq_nt16_str[bam1_seqi(s, i)] (see bam.c)
- # note: do not use string literal as it will be a python string
- s[k-start] = seq_nt16_str[p[k/2] >> 4 * (1 - k%2) & 0xf]
-
- return charptr_to_bytes(seq)
-
-
-cdef inline object getQualitiesInRange(bam1_t *src,
- uint32_t start,
- uint32_t end):
- """return python array of quality values from a bam1_t object"""
-
- cdef uint8_t * p
- cdef uint32_t k
-
- p = pysam_bam_get_qual(src)
- if p[0] == 0xff:
- return None
-
- # 'B': unsigned char
- cdef c_array.array result = array.array('B', [0])
- c_array.resize(result, end - start)
-
- # copy data
- memcpy(result.data.as_voidptr, <void*>&p[start], end - start)
-
- return result
-
-
-#####################################################################
-## private factory methods
-cdef class AlignedSegment
-cdef makeAlignedSegment(bam1_t * src, AlignmentFile alignment_file):
- '''return an AlignedSegment object constructed from `src`'''
- # note that the following does not call __init__
- cdef AlignedSegment dest = AlignedSegment.__new__(AlignedSegment)
- dest._delegate = bam_dup1(src)
- dest._alignment_file = alignment_file
- return dest
-
-
-cdef class PileupColumn
-cdef makePileupColumn(bam_pileup1_t ** plp, int tid, int pos,
- int n_pu, AlignmentFile alignment_file):
- '''return a PileupColumn object constructed from pileup in `plp` and
- setting additional attributes.
-
- '''
- # note that the following does not call __init__
- cdef PileupColumn dest = PileupColumn.__new__(PileupColumn)
- dest._alignment_file = alignment_file
- dest.plp = plp
- dest.tid = tid
- dest.pos = pos
- dest.n_pu = n_pu
- return dest
-
-cdef class PileupRead
-cdef inline makePileupRead(bam_pileup1_t * src, AlignmentFile alignment_file):
-    '''return a PileupRead object constructed from a bam_pileup1_t * object.'''
- cdef PileupRead dest = PileupRead.__new__(PileupRead)
- dest._alignment = makeAlignedSegment(src.b, alignment_file)
- dest._qpos = src.qpos
- dest._indel = src.indel
- dest._level = src.level
- dest._is_del = src.is_del
- dest._is_head = src.is_head
- dest._is_tail = src.is_tail
- dest._is_refskip = src.is_refskip
- return dest
-
-
-cdef inline uint32_t get_alignment_length(bam1_t * src):
- cdef int k = 0
- cdef uint32_t l = 0
- if src == NULL:
- return 0
- cdef uint32_t * cigar_p = bam_get_cigar(src)
- if cigar_p == NULL:
- return 0
- cdef int op
- cdef int n = pysam_get_n_cigar(src)
- for k from 0 <= k < n:
- op = cigar_p[k] & BAM_CIGAR_MASK
- if op == BAM_CSOFT_CLIP or op == BAM_CHARD_CLIP:
- continue
- l += cigar_p[k] >> BAM_CIGAR_SHIFT
- return l
-
-
-# TODO: avoid string copying for getSequenceInRange, reconstituteSequenceFromMD, ...
-cdef inline bytes build_alignment_sequence(bam1_t * src):
- """return expanded sequence from MD tag.
-
- The sequence includes substitutions and both insertions in the
- reference as well as deletions to the reference sequence. Combine
- with the cigar string to reconstitute the query or the reference
- sequence.
-
- Positions corresponding to `N` (skipped region from the reference)
- in the CIGAR string will not appear in the returned sequence. The
- MD should correspondingly not contain these. Thus proper tags are::
-
- Deletion from the reference: cigar=5M1D5M MD=5^C5
- Skipped region from reference: cigar=5M1N5M MD=10
-
- Returns
- -------
-
- None, if no MD tag is present.
-
- """
- if src == NULL:
- return None
-
- cdef uint32_t start = getQueryStart(src)
- cdef uint32_t end = getQueryEnd(src)
- # get read sequence, taking into account soft-clipping
- r = getSequenceInRange(src, start, end)
- cdef char * read_sequence = r
- cdef uint32_t * cigar_p = pysam_bam_get_cigar(src)
- if cigar_p == NULL:
- return None
-
- cdef uint32_t r_idx = 0
- cdef int op
- cdef uint32_t k, i, l, x
- cdef int nmatches = 0
- cdef int s_idx = 0
-
- cdef uint32_t max_len = get_alignment_length(src)
- if max_len == 0:
- raise ValueError("could not determine alignment length")
-
- cdef char * s = <char*>calloc(max_len + 1, sizeof(char))
- if s == NULL:
-        raise ValueError(
-            "could not allocate sequence of length %i" % max_len)
-
- for k from 0 <= k < pysam_get_n_cigar(src):
- op = cigar_p[k] & BAM_CIGAR_MASK
- l = cigar_p[k] >> BAM_CIGAR_SHIFT
- if op == BAM_CMATCH or op == BAM_CEQUAL or op == BAM_CDIFF:
- for i from 0 <= i < l:
- s[s_idx] = read_sequence[r_idx]
- r_idx += 1
- s_idx += 1
- elif op == BAM_CDEL:
- for i from 0 <= i < l:
- s[s_idx] = '-'
- s_idx += 1
- elif op == BAM_CREF_SKIP:
- pass
- elif op == BAM_CINS:
- for i from 0 <= i < l:
- # encode insertions into reference as lowercase
- s[s_idx] = read_sequence[r_idx] + 32
- r_idx += 1
- s_idx += 1
- elif op == BAM_CSOFT_CLIP:
- pass
- elif op == BAM_CHARD_CLIP:
- pass # advances neither
- elif op == BAM_CPAD:
- raise NotImplementedError(
- "Padding (BAM_CPAD, 6) is currently not supported. "
- "Please implement. Sorry about that.")
-
- cdef uint8_t * md_tag_ptr = bam_aux_get(src, "MD")
- if md_tag_ptr == NULL:
- seq = PyBytes_FromStringAndSize(s, s_idx)
- free(s)
- return seq
-
- cdef char * md_tag = <char*>bam_aux2Z(md_tag_ptr)
- cdef int md_idx = 0
- s_idx = 0
-
- while md_tag[md_idx] != 0:
- # c is numerical
- if md_tag[md_idx] >= 48 and md_tag[md_idx] <= 57:
- nmatches *= 10
- nmatches += md_tag[md_idx] - 48
- md_idx += 1
- continue
- else:
- # save matches up to this point, skipping insertions
- for x from 0 <= x < nmatches:
- while s[s_idx] >= 'a':
- s_idx += 1
- s_idx += 1
- while s[s_idx] >= 'a':
- s_idx += 1
-
- r_idx += nmatches
- nmatches = 0
- if md_tag[md_idx] == '^':
- md_idx += 1
- while md_tag[md_idx] >= 65 and md_tag[md_idx] <= 90:
- assert s[s_idx] == '-'
- s[s_idx] = md_tag[md_idx]
- s_idx += 1
- md_idx += 1
- else:
- # save mismatch and change to lower case
- s[s_idx] = md_tag[md_idx] + 32
- s_idx += 1
- r_idx += 1
- md_idx += 1
-
- # save matches up to this point, skipping insertions
- for x from 0 <= x < nmatches:
- while s[s_idx] >= 'a':
- s_idx += 1
- s_idx += 1
- while s[s_idx] >= 'a':
- s_idx += 1
-
- seq = PyBytes_FromStringAndSize(s, s_idx)
- free(s)
-
- return seq
-
-
-cdef class AlignedSegment:
- '''Class representing an aligned segment.
-
- This class stores a handle to the samtools C-structure representing
- an aligned read. Member read access is forwarded to the C-structure
- and converted into python objects. This implementation should be fast,
- as only the data needed is converted.
-
- For write access, the C-structure is updated in-place. This is
- not the most efficient way to build BAM entries, as the variable
- length data is concatenated and thus needs to be resized if
- a field is updated. Furthermore, the BAM entry might be
- in an inconsistent state.
-
- One issue to look out for is that the sequence should always
- be set *before* the quality scores. Setting the sequence will
- also erase any quality scores that were set previously.
- '''
-
- # Now only called when instances are created from Python
- def __init__(self):
- # see bam_init1
- self._delegate = <bam1_t*>calloc(1, sizeof(bam1_t))
- # allocate some memory. If size is 0, calloc does not return a
- # pointer that can be passed to free() so allocate 40 bytes
- # for a new read
- self._delegate.m_data = 40
- self._delegate.data = <uint8_t *>calloc(
- self._delegate.m_data, 1)
- self._delegate.l_data = 0
-
- # caching for selected fields
- self.cache_query_qualities = None
- self.cache_query_alignment_qualities = None
- self.cache_query_sequence = None
- self.cache_query_alignment_sequence = None
-
- def __dealloc__(self):
- bam_destroy1(self._delegate)
-
- def __str__(self):
- """return string representation of alignment.
-
- The representation is an approximate :term:`SAM` format, because
- an aligned read might not be associated with a :term:`AlignmentFile`.
- As a result :term:`tid` is shown instead of the reference name.
- Similarly, the tags field is returned in its parsed state.
-
- To get a valid SAM record, use :meth:`tostring`.
- """
- # sam-parsing is done in sam.c/bam_format1_core which
- # requires a valid header.
- return "\t".join(map(str, (self.query_name,
- self.flag,
- self.reference_id,
- self.reference_start,
- self.mapping_quality,
- self.cigarstring,
- self.next_reference_id,
- self.next_reference_start,
- self.query_alignment_length,
- self.query_sequence,
- self.query_qualities,
- self.tags)))
-
- def __copy__(self):
- return makeAlignedSegment(self._delegate, self._alignment_file)
-
- def __deepcopy__(self, memo):
- return makeAlignedSegment(self._delegate, self._alignment_file)
-
- def compare(self, AlignedSegment other):
-        '''return -1, 0 or 1, if the binary contents of this read
-        are <, = or > those of *other*
-
- '''
-
- cdef int retval, x
- cdef bam1_t *t
- cdef bam1_t *o
-
- t = self._delegate
- o = other._delegate
-
- # uncomment for debugging purposes
- # cdef unsigned char * oo, * tt
- # tt = <unsigned char*>(&t.core)
- # oo = <unsigned char*>(&o.core)
- # for x from 0 <= x < sizeof( bam1_core_t): print x, tt[x], oo[x]
- # tt = <unsigned char*>(t.data)
- # oo = <unsigned char*>(o.data)
- # for x from 0 <= x < max(t.l_data, o.l_data): print x, tt[x], oo[x], chr(tt[x]), chr(oo[x])
-
- # Fast-path test for object identity
- if t == o:
- return 0
-
- retval = memcmp(&t.core, &o.core, sizeof(bam1_core_t))
-
- if retval:
- return retval
- # cmp(t.l_data, o.l_data)
- retval = (t.l_data > o.l_data) - (t.l_data < o.l_data)
- if retval:
- return retval
- return memcmp(t.data, o.data, t.l_data)
-
- def __richcmp__(self, AlignedSegment other, int op):
- if op == 2: # == operator
- return self.compare(other) == 0
- elif op == 3: # != operator
- return self.compare(other) != 0
- else:
- return NotImplemented
-
- def __hash__(self):
- cdef bam1_t * src
- src = self._delegate
- # shift and xor values in the core structure
- # make sure tid and mtid are shifted by different amounts
- # should variable length data be included?
- cdef uint32_t hash_value = src.core.tid << 24 ^ \
- src.core.pos << 16 ^ \
- src.core.qual << 8 ^ \
- src.core.flag ^ \
- src.core.isize << 24 ^ \
- src.core.mtid << 16 ^ \
- src.core.mpos << 8
-
- return hash_value
-
- cpdef tostring(self, AlignmentFile_t htsfile):
- """returns a string representation of the aligned segment.
-
- The output format is valid SAM format.
-
- Parameters
- ----------
-
-        htsfile -- AlignmentFile object to map numerical
-            identifiers to chromosome names.
- """
-
- cdef kstring_t line
- line.l = line.m = 0
- line.s = NULL
-
- if sam_format1(htsfile.header, self._delegate, &line) < 0:
- if line.m:
- free(line.s)
- raise ValueError('sam_format failed')
-
- ret = force_str(line.s[:line.l])
-
- if line.m:
- free(line.s)
-
- return ret
-
- ########################################################
- ## Basic attributes in order of appearance in SAM format
- property query_name:
- """the query template name (None if not present)"""
- def __get__(self):
- cdef bam1_t * src
- src = self._delegate
- if pysam_get_l_qname(src) == 0:
- return None
- return charptr_to_str(<char *>pysam_bam_get_qname(src))
-
- def __set__(self, qname):
- if qname is None or len(qname) == 0:
- return
- qname = force_bytes(qname)
- cdef bam1_t * src
- cdef int l
- cdef char * p
-
- src = self._delegate
- p = pysam_bam_get_qname(src)
-
- # the qname is \0 terminated
- l = len(qname) + 1
- pysam_bam_update(src,
- pysam_get_l_qname(src),
- l,
- <uint8_t*>p)
-
-
- pysam_set_l_qname(src, l)
-
- # re-acquire pointer to location in memory
- # as it might have moved
- p = pysam_bam_get_qname(src)
-
- strncpy(p, qname, l)
-
- property flag:
- """properties flag"""
- def __get__(self):
- return pysam_get_flag(self._delegate)
- def __set__(self, flag):
- pysam_set_flag(self._delegate, flag)
-
- property reference_name:
- """:term:`reference` name (None if no AlignmentFile is associated)"""
- def __get__(self):
- if self._alignment_file is not None:
- return self._alignment_file.getrname(self._delegate.core.tid)
- return None
-
- property reference_id:
- """:term:`reference` ID
-
- .. note::
-
- This field contains the index of the reference sequence in
- the sequence dictionary. To obtain the name of the
- reference sequence, use
- :meth:`pysam.AlignmentFile.getrname()`
-
- """
- def __get__(self): return self._delegate.core.tid
- def __set__(self, tid): self._delegate.core.tid = tid
-
- property reference_start:
- """0-based leftmost coordinate"""
- def __get__(self): return self._delegate.core.pos
- def __set__(self, pos):
- ## setting the position requires updating the "bin" attribute
- cdef bam1_t * src
- src = self._delegate
- src.core.pos = pos
- if pysam_get_n_cigar(src):
- pysam_set_bin(src,
- hts_reg2bin(
- src.core.pos,
- bam_endpos(src),
- 14,
- 5))
- else:
- pysam_set_bin(src,
- hts_reg2bin(
- src.core.pos,
- src.core.pos + 1,
- 14,
- 5))
-
- property mapping_quality:
- """mapping quality"""
- def __get__(self):
- return pysam_get_qual(self._delegate)
- def __set__(self, qual):
- pysam_set_qual(self._delegate, qual)
-
- property cigarstring:
- '''the :term:`cigar` alignment as a string.
-
- The cigar string is a string of alternating integers
- and characters denoting the length and the type of
- an operation.
-
- .. note::
- The order length,operation is specified in the
- SAM format. It is different from the order of
- the :attr:`cigar` property.
-
- Returns None if not present.
-
- To unset the cigarstring, assign None or the
- empty string.
- '''
- def __get__(self):
- c = self.cigartuples
- if c is None:
- return None
- # reverse order
- else:
- return "".join([ "%i%c" % (y,CODE2CIGAR[x]) for x,y in c])
-
- def __set__(self, cigar):
- if cigar is None or len(cigar) == 0:
- self.cigartuples = []
- else:
- parts = CIGAR_REGEX.findall(cigar)
- # reverse order
- self.cigartuples = [(CIGAR2CODE[ord(y)], int(x)) for x,y in parts]
-
- # TODO
- # property cigar:
- # """the cigar alignment"""
-
- property next_reference_id:
- """the :term:`reference` id of the mate/next read."""
- def __get__(self): return self._delegate.core.mtid
- def __set__(self, mtid):
- self._delegate.core.mtid = mtid
-
- property next_reference_name:
- """:term:`reference` name of the mate/next read (None if no
- AlignmentFile is associated)"""
- def __get__(self):
- if self._alignment_file is not None:
- return self._alignment_file.getrname(self._delegate.core.mtid)
- return None
-
- property next_reference_start:
- """the position of the mate/next read."""
- def __get__(self):
- return self._delegate.core.mpos
- def __set__(self, mpos):
- self._delegate.core.mpos = mpos
-
- property query_length:
- """the length of the query/read.
-
- This value corresponds to the length of the sequence supplied
- in the BAM/SAM file. The length of a query is 0 if there is no
- sequence in the BAM/SAM file. In those cases, the read length
- can be inferred from the CIGAR alignment, see
-        :meth:`pysam.AlignedSegment.infer_query_length`.
-
- The length includes soft-clipped bases and is equal to
- ``len(query_sequence)``.
-
-        This property is read-only; assigning a new sequence via
-        :attr:`query_sequence` updates it.
-
- Returns 0 if not available.
-
- """
- def __get__(self):
- return self._delegate.core.l_qseq
-
- property template_length:
- """the observed query template length"""
- def __get__(self):
- return self._delegate.core.isize
- def __set__(self, isize):
- self._delegate.core.isize = isize
-
- property query_sequence:
- """read sequence bases, including :term:`soft clipped` bases
- (None if not present).
-
- Note that assigning to seq will invalidate any quality scores.
- Thus, to in-place edit the sequence and quality scores, copies of
- the quality scores need to be taken. Consider trimming for example::
-
- q = read.query_qualities
-            read.query_sequence = read.query_sequence[5:10]
- read.query_qualities = q[5:10]
-
- The sequence is returned as it is stored in the BAM file. Some mappers
- might have stored a reverse complement of the original read
- sequence.
- """
- def __get__(self):
- if self.cache_query_sequence:
- return self.cache_query_sequence
-
- cdef bam1_t * src
- cdef char * s
- src = self._delegate
-
- if src.core.l_qseq == 0:
- return None
-
- self.cache_query_sequence = force_str(getSequenceInRange(
- src, 0, src.core.l_qseq))
- return self.cache_query_sequence
-
- def __set__(self, seq):
- # samtools manages sequence and quality length memory together
- # if no quality information is present, the first byte says 0xff.
- cdef bam1_t * src
- cdef uint8_t * p
- cdef char * s
- cdef int l, k
- cdef Py_ssize_t nbytes_new, nbytes_old
-
- if seq == None:
- l = 0
- else:
- l = len(seq)
- seq = force_bytes(seq)
-
- src = self._delegate
-
- # as the sequence is stored in half-bytes, the total length (sequence
- # plus quality scores) is (l+1)/2 + l
- nbytes_new = (l + 1) / 2 + l
- nbytes_old = (src.core.l_qseq + 1) / 2 + src.core.l_qseq
-
- # acquire pointer to location in memory
- p = pysam_bam_get_seq(src)
- src.core.l_qseq = l
-
- # change length of data field
- pysam_bam_update(src,
- nbytes_old,
- nbytes_new,
- p)
-
- if l > 0:
- # re-acquire pointer to location in memory
- # as it might have moved
- p = pysam_bam_get_seq(src)
- for k from 0 <= k < nbytes_new:
- p[k] = 0
- # convert to C string
- s = seq
- for k from 0 <= k < l:
- p[k/2] |= seq_nt16_table[<unsigned char>s[k]] << 4 * (1 - k % 2)
-
- # erase qualities
- p = pysam_bam_get_qual(src)
- p[0] = 0xff
-
- self.cache_query_sequence = force_str(seq)
-
- # clear cached values for quality values
- self.cache_query_qualities = None
- self.cache_query_alignment_qualities = None
-
- property query_qualities:
- """read sequence base qualities, including :term:`soft
- clipped` bases (None if not present).
-
- Quality scores are returned as a python array of unsigned
- chars. Note that this is not the ASCII-encoded value typically
- seen in FASTQ or SAM formatted files. Thus, no offset of 33
- needs to be subtracted.
-
- Note that to set quality scores the sequence has to be set
- beforehand as this will determine the expected length of the
- quality score array.
-
- This method raises a ValueError if the length of the
- quality scores and the sequence are not the same.
-
- """
- def __get__(self):
-
- if self.cache_query_qualities:
- return self.cache_query_qualities
-
- cdef bam1_t * src
- cdef char * q
-
- src = self._delegate
-
- if src.core.l_qseq == 0:
- return None
-
- self.cache_query_qualities = getQualitiesInRange(src, 0, src.core.l_qseq)
- return self.cache_query_qualities
-
- def __set__(self, qual):
-
- # note that memory is already allocated via setting the sequence
-        # hence only the length match between sequence and quality is checked.
- cdef bam1_t * src
- cdef uint8_t * p
- cdef int l
-
- src = self._delegate
- p = pysam_bam_get_qual(src)
- if qual is None or len(qual) == 0:
- # if absent and there is a sequence: set to 0xff
- if src.core.l_qseq != 0:
- p[0] = 0xff
- return
-
- # check for length match
- l = len(qual)
- if src.core.l_qseq != l:
- raise ValueError(
- "quality and sequence mismatch: %i != %i" %
- (l, src.core.l_qseq))
-
- # create a python array object filling it
- # with the quality scores
-
- # NB: should avoid this copying if qual is
- # already of the correct type.
- cdef c_array.array result = c_array.array('B', qual)
-
- # copy data
- memcpy(p, result.data.as_voidptr, l)
-
- # save in cache
- self.cache_query_qualities = qual
-
- property bin:
- """properties bin"""
- def __get__(self):
- return pysam_get_bin(self._delegate)
- def __set__(self, bin):
- pysam_set_bin(self._delegate, bin)
-
-
- ##########################################################
- # Derived simple attributes. These are simple attributes of
- # AlignedSegment getting and setting values.
- ##########################################################
- # 1. Flags
- ##########################################################
- property is_paired:
- """true if read is paired in sequencing"""
- def __get__(self):
- return (self.flag & BAM_FPAIRED) != 0
- def __set__(self,val):
- pysam_update_flag(self._delegate, val, BAM_FPAIRED)
-
- property is_proper_pair:
- """true if read is mapped in a proper pair"""
- def __get__(self):
- return (self.flag & BAM_FPROPER_PAIR) != 0
- def __set__(self,val):
- pysam_update_flag(self._delegate, val, BAM_FPROPER_PAIR)
- property is_unmapped:
- """true if read itself is unmapped"""
- def __get__(self):
- return (self.flag & BAM_FUNMAP) != 0
- def __set__(self, val):
- pysam_update_flag(self._delegate, val, BAM_FUNMAP)
- property mate_is_unmapped:
- """true if the mate is unmapped"""
- def __get__(self):
- return (self.flag & BAM_FMUNMAP) != 0
- def __set__(self,val):
- pysam_update_flag(self._delegate, val, BAM_FMUNMAP)
- property is_reverse:
- """true if read is mapped to reverse strand"""
- def __get__(self):
- return (self.flag & BAM_FREVERSE) != 0
- def __set__(self,val):
- pysam_update_flag(self._delegate, val, BAM_FREVERSE)
- property mate_is_reverse:
- """true is read is mapped to reverse strand"""
- def __get__(self):
- return (self.flag & BAM_FMREVERSE) != 0
- def __set__(self,val):
- pysam_update_flag(self._delegate, val, BAM_FMREVERSE)
- property is_read1:
- """true if this is read1"""
- def __get__(self):
- return (self.flag & BAM_FREAD1) != 0
- def __set__(self,val):
- pysam_update_flag(self._delegate, val, BAM_FREAD1)
- property is_read2:
- """true if this is read2"""
- def __get__(self):
- return (self.flag & BAM_FREAD2) != 0
- def __set__(self, val):
- pysam_update_flag(self._delegate, val, BAM_FREAD2)
- property is_secondary:
- """true if not primary alignment"""
- def __get__(self):
- return (self.flag & BAM_FSECONDARY) != 0
- def __set__(self, val):
- pysam_update_flag(self._delegate, val, BAM_FSECONDARY)
- property is_qcfail:
- """true if QC failure"""
- def __get__(self):
- return (self.flag & BAM_FQCFAIL) != 0
- def __set__(self, val):
- pysam_update_flag(self._delegate, val, BAM_FQCFAIL)
- property is_duplicate:
- """true if optical or PCR duplicate"""
- def __get__(self):
- return (self.flag & BAM_FDUP) != 0
- def __set__(self, val):
- pysam_update_flag(self._delegate, val, BAM_FDUP)
- property is_supplementary:
- """true if this is a supplementary alignment"""
- def __get__(self):
- return (self.flag & BAM_FSUPPLEMENTARY) != 0
- def __set__(self, val):
- pysam_update_flag(self._delegate, val, BAM_FSUPPLEMENTARY)
-
- # 2. Coordinates and lengths
- property reference_end:
- '''aligned reference position of the read on the reference genome.
-
- reference_end points to one past the last aligned residue.
- Returns None if not available (read is unmapped or no cigar
- alignment present).
-
- '''
- def __get__(self):
- cdef bam1_t * src
- src = self._delegate
- if (self.flag & BAM_FUNMAP) or pysam_get_n_cigar(src) == 0:
- return None
- return bam_endpos(src)
-
- property reference_length:
- '''aligned length of the read on the reference genome.
-
- This is equal to `aend - pos`. Returns None if not available.'''
- def __get__(self):
- cdef bam1_t * src
- src = self._delegate
- if (self.flag & BAM_FUNMAP) or pysam_get_n_cigar(src) == 0:
- return None
- return bam_endpos(src) - \
- self._delegate.core.pos
-
- property query_alignment_sequence:
- """aligned portion of the read.
-
- This is a substring of :attr:`seq` that excludes flanking
- bases that were :term:`soft clipped` (None if not present). It
- is equal to ``seq[qstart:qend]``.
-
- SAM/BAM files may include extra flanking bases that are not
- part of the alignment. These bases may be the result of the
- Smith-Waterman or other algorithms, which may not require
- alignments that begin at the first residue or end at the last.
- In addition, extra sequencing adapters, multiplex identifiers,
- and low-quality bases that were not considered for alignment
- may have been retained.
-
- """
-
- def __get__(self):
- if self.cache_query_alignment_sequence:
- return self.cache_query_alignment_sequence
-
- cdef bam1_t * src
- cdef uint32_t start, end
-
- src = self._delegate
-
- if src.core.l_qseq == 0:
- return None
-
- start = getQueryStart(src)
- end = getQueryEnd(src)
-
- self.cache_query_alignment_sequence = force_str(
- getSequenceInRange(src, start, end))
- return self.cache_query_alignment_sequence
-
- property query_alignment_qualities:
- """aligned query sequence quality values (None if not present). These
- are the quality values that correspond to :attr:`query`, that
- is, they exclude qualities of :term:`soft clipped` bases. This
- is equal to ``qual[qstart:qend]``.
-
- Quality scores are returned as a python array of unsigned
- chars. Note that this is not the ASCII-encoded value typically
- seen in FASTQ or SAM formatted files. Thus, no offset of 33
- needs to be subtracted.
-
- This property is read-only.
-
- """
- def __get__(self):
-
- if self.cache_query_alignment_qualities:
- return self.cache_query_alignment_qualities
-
- cdef bam1_t * src
- cdef uint32_t start, end
-
- src = self._delegate
-
- if src.core.l_qseq == 0:
- return None
-
- start = getQueryStart(src)
- end = getQueryEnd(src)
- self.cache_query_alignment_qualities = \
- getQualitiesInRange(src, start, end)
- return self.cache_query_alignment_qualities
-
- property query_alignment_start:
- """start index of the aligned query portion of the sequence (0-based,
- inclusive).
-
-        This is the index of the first base in :attr:`seq` that is not
- soft-clipped.
-
- """
- def __get__(self):
- return getQueryStart(self._delegate)
-
- property query_alignment_end:
- """end index of the aligned query portion of the sequence (0-based,
- exclusive)"""
- def __get__(self):
- return getQueryEnd(self._delegate)
-
- property query_alignment_length:
- """length of the aligned query sequence.
-
- This is equal to :attr:`qend` - :attr:`qstart`"""
- def __get__(self):
- cdef bam1_t * src
- src = self._delegate
- return getQueryEnd(src) - getQueryStart(src)
-
- #####################################################
- # Computed properties
-
- def get_reference_positions(self, full_length=False):
- """a list of reference positions that this read aligns to.
-
- By default, this method only returns positions in the
- reference that are within the alignment. If *full_length* is
- set, None values will be included for any soft-clipped or
- unaligned positions within the read. The returned list will
- thus be of the same length as the read.
-
- """
- cdef uint32_t k, i, pos
- cdef int op
- cdef uint32_t * cigar_p
- cdef bam1_t * src
- cdef bint _full = full_length
-
- src = self._delegate
- if pysam_get_n_cigar(src) == 0:
- return []
-
- result = []
- pos = src.core.pos
- cigar_p = pysam_bam_get_cigar(src)
-
- for k from 0 <= k < pysam_get_n_cigar(src):
- op = cigar_p[k] & BAM_CIGAR_MASK
- l = cigar_p[k] >> BAM_CIGAR_SHIFT
-
- if op == BAM_CSOFT_CLIP or op == BAM_CINS:
- if _full:
- for i from 0 <= i < l:
- result.append(None)
- elif op == BAM_CMATCH:
- for i from pos <= i < pos + l:
- result.append(i)
- pos += l
- elif op == BAM_CDEL or op == BAM_CREF_SKIP:
- pos += l
-
- return result
-
- def infer_query_length(self, always=True):
- """inferred read length from CIGAR string.
-
-        If *always* is set to True, the read length will always be
-        inferred from the CIGAR string. If set to False, the length
- of the read sequence will be returned if it is
- available.
-
- Returns None if CIGAR string is not present.
- """
-
- cdef uint32_t * cigar_p
- cdef bam1_t * src
-
- src = self._delegate
-
- if not always and src.core.l_qseq:
- return src.core.l_qseq
-
- return calculateQueryLength(src)
-
- def get_reference_sequence(self):
- """return the reference sequence.
-
- This method requires the MD tag to be set.
- """
- cdef uint32_t k, i
- cdef int op
- cdef bam1_t * src = self._delegate
- ref_seq = force_str(build_alignment_sequence(src))
- if ref_seq is None:
- raise ValueError("MD tag not present")
-
- cdef uint32_t * cigar_p = pysam_bam_get_cigar(src)
- cdef uint32_t r_idx = 0
- result = []
- for k from 0 <= k < pysam_get_n_cigar(src):
- op = cigar_p[k] & BAM_CIGAR_MASK
- l = cigar_p[k] >> BAM_CIGAR_SHIFT
- if op == BAM_CMATCH or op == BAM_CEQUAL or op == BAM_CDIFF:
- for i from 0 <= i < l:
- result.append(ref_seq[r_idx])
- r_idx += 1
- elif op == BAM_CDEL:
- for i from 0 <= i < l:
- result.append(ref_seq[r_idx])
- r_idx += 1
- elif op == BAM_CREF_SKIP:
- pass
- elif op == BAM_CINS:
- r_idx += l
- elif op == BAM_CSOFT_CLIP:
- pass
- elif op == BAM_CHARD_CLIP:
- pass # advances neither
- elif op == BAM_CPAD:
- raise NotImplementedError(
- "Padding (BAM_CPAD, 6) is currently not supported. "
- "Please implement. Sorry about that.")
-
- return "".join(result)
-
- def get_aligned_pairs(self, matches_only=False, with_seq=False):
- """a list of aligned read (query) and reference positions.
-
-        For insertions, deletions and skipped regions, either the query
-        or the reference position may be None.
-
- Padding is currently not supported and leads to an exception.
-
- Parameters
- ----------
-
- matches_only : bool
- If True, only matched bases are returned - no None on either
- side.
- with_seq : bool
- If True, return a third element in the tuple containing the
- reference sequence. Substitutions are lower-case. This option
- requires an MD tag to be present.
-
- Returns
- -------
-
- aligned_pairs : list of tuples
-
- """
- cdef uint32_t k, i, pos, qpos, r_idx, l
- cdef int op
- cdef uint32_t * cigar_p
- cdef bam1_t * src = self._delegate
- cdef bint _matches_only = bool(matches_only)
- cdef bint _with_seq = bool(with_seq)
-
- # TODO: this method performs no checking and assumes that
- # read sequence, cigar and MD tag are consistent.
-
- if _with_seq:
- ref_seq = force_str(self.get_reference_sequence())
- if ref_seq is None:
- raise ValueError("MD tag not present")
-
- r_idx = 0
-
- if pysam_get_n_cigar(src) == 0:
- return []
-
- result = []
- pos = src.core.pos
- qpos = 0
- cigar_p = pysam_bam_get_cigar(src)
- for k from 0 <= k < pysam_get_n_cigar(src):
- op = cigar_p[k] & BAM_CIGAR_MASK
- l = cigar_p[k] >> BAM_CIGAR_SHIFT
-
- if op == BAM_CMATCH or op == BAM_CEQUAL or op == BAM_CDIFF:
- if _with_seq:
- for i from pos <= i < pos + l:
- result.append((qpos, i, ref_seq[r_idx]))
- r_idx += 1
- qpos += 1
- else:
- for i from pos <= i < pos + l:
- result.append((qpos, i))
- qpos += 1
- pos += l
-
- elif op == BAM_CINS or op == BAM_CSOFT_CLIP:
- if not _matches_only:
- if _with_seq:
- for i from pos <= i < pos + l:
- result.append((qpos, None, None))
- qpos += 1
- else:
- for i from pos <= i < pos + l:
- result.append((qpos, None))
- qpos += 1
- else:
- qpos += l
-
- elif op == BAM_CDEL:
- if not _matches_only:
- if _with_seq:
- for i from pos <= i < pos + l:
- result.append((None, i, ref_seq[r_idx]))
- r_idx += 1
- else:
- for i from pos <= i < pos + l:
- result.append((None, i))
- pos += l
-
- elif op == BAM_CHARD_CLIP:
- pass # advances neither
-
- elif op == BAM_CREF_SKIP:
- if not _matches_only:
- if _with_seq:
- for i from pos <= i < pos + l:
- result.append((None, i, None))
- else:
- for i from pos <= i < pos + l:
- result.append((None, i))
-
- pos += l
-
- elif op == BAM_CPAD:
- raise NotImplementedError(
- "Padding (BAM_CPAD, 6) is currently not supported. "
- "Please implement. Sorry about that.")
-
- return result
-
- def get_blocks(self):
- """ a list of start and end positions of
- aligned gapless blocks.
-
- The start and end positions are in genomic
- coordinates.
-
- Blocks are not normalized, i.e. two blocks
- might be directly adjacent. This happens if
- the two blocks are separated by an insertion
- in the read.
- """
-
- cdef uint32_t k, pos, l
- cdef int op
- cdef uint32_t * cigar_p
- cdef bam1_t * src
-
- src = self._delegate
- if pysam_get_n_cigar(src) == 0:
- return []
-
- result = []
- pos = src.core.pos
- cigar_p = pysam_bam_get_cigar(src)
- l = 0
-
- for k from 0 <= k < pysam_get_n_cigar(src):
- op = cigar_p[k] & BAM_CIGAR_MASK
- l = cigar_p[k] >> BAM_CIGAR_SHIFT
- if op == BAM_CMATCH:
- result.append((pos, pos + l))
- pos += l
- elif op == BAM_CDEL or op == BAM_CREF_SKIP:
- pos += l
-
- return result
-
- def get_overlap(self, uint32_t start, uint32_t end):
- """return number of aligned bases of read overlapping the interval
- *start* and *end* on the reference sequence.
-
- Return None if cigar alignment is not available.
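-
- A sketch of typical use (the interval is illustrative)::
-
- n = read.get_overlap(1000, 2000)
- if n: # None without a cigar, 0 if no overlap
-     print("%i aligned bases in [1000, 2000)" % n)
-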
- """
- cdef uint32_t k, i, pos, overlap
- cdef int op, o
- cdef uint32_t * cigar_p
- cdef bam1_t * src
-
- overlap = 0
-
- src = self._delegate
- if pysam_get_n_cigar(src) == 0:
- return None
- pos = src.core.pos
- o = 0
-
- cigar_p = pysam_bam_get_cigar(src)
- for k from 0 <= k < pysam_get_n_cigar(src):
- op = cigar_p[k] & BAM_CIGAR_MASK
- l = cigar_p[k] >> BAM_CIGAR_SHIFT
-
- if op == BAM_CMATCH:
- o = min( pos + l, end) - max( pos, start )
- if o > 0: overlap += o
-
- if op == BAM_CMATCH or op == BAM_CDEL or op == BAM_CREF_SKIP:
- pos += l
-
- return overlap
-
- def get_cigar_stats(self):
- """summary of operations in cigar string.
-
- The output order in the array is "MIDNSHP=X" followed by a
- field for the NM tag. If the NM tag is not present, this
- field will always be 0.
-
- +-----+--------------+-----+
- |M |BAM_CMATCH |0 |
- +-----+--------------+-----+
- |I |BAM_CINS |1 |
- +-----+--------------+-----+
- |D |BAM_CDEL |2 |
- +-----+--------------+-----+
- |N |BAM_CREF_SKIP |3 |
- +-----+--------------+-----+
- |S |BAM_CSOFT_CLIP|4 |
- +-----+--------------+-----+
- |H |BAM_CHARD_CLIP|5 |
- +-----+--------------+-----+
- |P |BAM_CPAD |6 |
- +-----+--------------+-----+
- |= |BAM_CEQUAL |7 |
- +-----+--------------+-----+
- |X |BAM_CDIFF |8 |
- +-----+--------------+-----+
- |NM |NM tag |9 |
- +-----+--------------+-----+
-
- If no cigar string is present, empty arrays will be returned.
-
- Returns
- -------
-
- arrays : two arrays. The first contains the nucleotide counts within
- each cigar operation, the second contains the number of blocks for
- each cigar operation.
-
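- For example, a sketch that pulls out a few common counts::
-
- base_counts, block_counts = read.get_cigar_stats()
- matched = base_counts[0] # M = BAM_CMATCH = 0
- deleted = base_counts[2] # D = BAM_CDEL = 2
- edit_distance = base_counts[9] # NM tag, 0 if absent
-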
- """
-
- cdef int nfields = NCIGAR_CODES + 1
-
- cdef c_array.array base_counts = array.array(
- "I",
- [0] * nfields)
- cdef uint32_t [:] base_view = base_counts
- cdef c_array.array block_counts = array.array(
- "I",
- [0] * nfields)
- cdef uint32_t [:] block_view = block_counts
-
- cdef bam1_t * src = self._delegate
- cdef int op
- cdef uint32_t l
- cdef int32_t k
- cdef uint32_t * cigar_p = pysam_bam_get_cigar(src)
-
- if cigar_p == NULL:
- return None
-
- for k from 0 <= k < pysam_get_n_cigar(src):
- op = cigar_p[k] & BAM_CIGAR_MASK
- l = cigar_p[k] >> BAM_CIGAR_SHIFT
- base_view[op] += l
- block_view[op] += 1
-
- cdef uint8_t * v = bam_aux_get(src, 'NM')
- if v != NULL:
- base_view[nfields - 1] = <int32_t>bam_aux2i(v)
-
- return base_counts, block_counts
-
- #####################################################
- ## Unsorted as yet
- # TODO: capture in CIGAR object
- property cigartuples:
- """the :term:`cigar` alignment. The alignment
- is returned as a list of tuples of (operation, length).
-
- If the alignment is not present, None is returned.
-
- The operations are:
-
- +-----+--------------+-----+
- |M |BAM_CMATCH |0 |
- +-----+--------------+-----+
- |I |BAM_CINS |1 |
- +-----+--------------+-----+
- |D |BAM_CDEL |2 |
- +-----+--------------+-----+
- |N |BAM_CREF_SKIP |3 |
- +-----+--------------+-----+
- |S |BAM_CSOFT_CLIP|4 |
- +-----+--------------+-----+
- |H |BAM_CHARD_CLIP|5 |
- +-----+--------------+-----+
- |P |BAM_CPAD |6 |
- +-----+--------------+-----+
- |= |BAM_CEQUAL |7 |
- +-----+--------------+-----+
- |X |BAM_CDIFF |8 |
- +-----+--------------+-----+
-
- .. note::
- The output is a list of (operation, length) tuples, such as
- ``[(0, 30)]``.
- This is different from the SAM specification and
- the :attr:`cigarstring` property, which uses a
- (length, operation) order, for example: ``30M``.
-
- To unset the cigar property, assign an empty list
- or None.
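-
- For example, a sketch for a 30M2I68M alignment::
-
- read.cigartuples # [(0, 30), (1, 2), (0, 68)]
- read.cigartuples = [(0, 100)] # replace by 100M
-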
- """
- def __get__(self):
- cdef uint32_t * cigar_p
- cdef bam1_t * src
- cdef uint32_t op, l
- cdef int k
-
- src = self._delegate
- if pysam_get_n_cigar(src) == 0:
- return None
-
- cigar = []
-
- cigar_p = pysam_bam_get_cigar(src);
- for k from 0 <= k < pysam_get_n_cigar(src):
- op = cigar_p[k] & BAM_CIGAR_MASK
- l = cigar_p[k] >> BAM_CIGAR_SHIFT
- cigar.append((op, l))
- return cigar
-
- def __set__(self, values):
- cdef uint32_t * p
- cdef bam1_t * src
- cdef op, l
- cdef int k, ncigar
-
- k = 0
-
- src = self._delegate
-
- # get location of cigar string
- p = pysam_bam_get_cigar(src)
-
- # empty values for cigar string
- if values is None:
- values = []
-
- ncigar = len(values)
- # create space for cigar data within src.data
- pysam_bam_update(src,
- pysam_get_n_cigar(src) * 4,
- ncigar * 4,
- <uint8_t*>p)
-
- # length is number of cigar operations, not bytes
- pysam_set_n_cigar(src, ncigar)
-
- # re-acquire pointer to location in memory
- # as it might have moved
- p = pysam_bam_get_cigar(src)
-
- # insert cigar operations
- for op, l in values:
- p[k] = l << BAM_CIGAR_SHIFT | op
- k += 1
-
- ## setting the cigar string requires updating the bin
- pysam_set_bin(src,
- hts_reg2bin(
- src.core.pos,
- bam_endpos(src),
- 14,
- 5))
-
-
- cpdef set_tag(self,
- tag,
- value,
- value_type=None,
- replace=True):
- """sets a particular field *tag* to *value* in the optional alignment
- section.
-
- *value_type* describes the type of *value* that is to be entered
- into the alignment record. It can be set explicitly to one
- of the valid one-letter type codes. If unset, an appropriate
- type will be chosen automatically.
-
- An existing value of the same *tag* will be overwritten unless
- replace is set to False. This is usually not recommended as a
- tag may only appear once in the optional alignment section.
-
- If *value* is None, the tag will be deleted.
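-
- For example, a sketch (the tag name "XS" is illustrative)::
-
- read.set_tag("XS", 42, value_type="i") # add an integer tag
- read.set_tag("XS", None) # delete it again
-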
- """
-
- cdef int value_size
- cdef uint8_t * value_ptr
- cdef uint8_t *existing_ptr
- cdef uint8_t typecode
- cdef float float_value
- cdef double double_value
- cdef int32_t int_value
- cdef bam1_t * src = self._delegate
- cdef char * _value_type
- cdef c_array.array array_value
- cdef object buffer
-
- if len(tag) != 2:
- raise ValueError('Invalid tag: %s' % tag)
-
- tag = force_bytes(tag)
- if replace:
- existing_ptr = bam_aux_get(src, tag)
- if existing_ptr:
- bam_aux_del(src, existing_ptr)
-
- # setting value to None deletes a tag
- if value is None:
- return
-
- typecode = get_value_code(value, value_type)
- if typecode == 0:
- raise ValueError("can't guess type or invalid type code specified")
-
- # Not Endian-safe, but then again neither is samtools!
- if typecode == 'Z':
- value = force_bytes(value)
- value_ptr = <uint8_t*><char*>value
- value_size = len(value)+1
- elif typecode == 'i':
- int_value = value
- value_ptr = <uint8_t*>&int_value
- value_size = sizeof(int32_t)
- elif typecode == 'd':
- double_value = value
- value_ptr = <uint8_t*>&double_value
- value_size = sizeof(double)
- elif typecode == 'f':
- float_value = value
- value_ptr = <uint8_t*>&float_value
- value_size = sizeof(float)
- elif typecode == 'B':
- # the following goes through python, needs to be cleaned up
- # pack array using struct
- if value_type is None:
- fmt, args = packTags([(tag, value)])
- else:
- fmt, args = packTags([(tag, value, value_type)])
-
- # remove tag and type code as set by bam_aux_append
- # first four chars of format (<2sc)
- fmt = '<' + fmt[4:]
- # first two values to pack
- args = args[2:]
- value_size = struct.calcsize(fmt)
- # buffer will be freed when object goes out of scope
- buffer = ctypes.create_string_buffer(value_size)
- struct.pack_into(fmt, buffer, 0, *args)
- # bam_aux_append copies data from value_ptr
- bam_aux_append(src,
- tag,
- typecode,
- value_size,
- <uint8_t*>buffer.raw)
- return
- else:
- raise ValueError('unsupported value_type in set_tag')
-
- bam_aux_append(src,
- tag,
- typecode,
- value_size,
- value_ptr)
-
- cpdef has_tag(self, tag):
- """returns true if the optional alignment section
- contains a given *tag*."""
- cdef uint8_t * v
- cdef int nvalues
- btag = force_bytes(tag)
- v = bam_aux_get(self._delegate, btag)
- return v != NULL
-
- cpdef get_tag(self, tag, with_value_type=False):
- """
- retrieves data from the optional alignment section
- given a two-letter *tag* denoting the field.
-
- The returned value is cast into an appropriate python type.
-
- This method is the fastest way to access the optional
- alignment section if only few tags need to be retrieved.
-
- Parameters
- ----------
-
- tag :
- data tag.
-
- with_value_type : Optional[bool]
- if set to True, the return value is a tuple of (tag value, type code).
- (default False)
-
- Returns
- -------
-
- A python object with the value of the `tag`. The type of the
- object depends on the data type in the data record.
-
- Raises
- ------
-
- KeyError
- If `tag` is not present, a KeyError is raised.
-
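- A sketch of typical use, guarding against a missing tag::
-
- try:
-     nm = read.get_tag("NM")
- except KeyError:
-     nm = None
-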
- """
- cdef uint8_t * v
- cdef int nvalues
- btag = force_bytes(tag)
- v = bam_aux_get(self._delegate, btag)
- if v == NULL:
- raise KeyError("tag '%s' not present" % tag)
- if chr(v[0]) == "B":
- auxtype = chr(v[0]) + chr(v[1])
- else:
- auxtype = chr(v[0])
-
- if auxtype == 'c' or auxtype == 'C' or auxtype == 's' or auxtype == 'S':
- value = <int>bam_aux2i(v)
- elif auxtype == 'i' or auxtype == 'I':
- value = <int32_t>bam_aux2i(v)
- elif auxtype == 'f' or auxtype == 'F':
- value = <float>bam_aux2f(v)
- elif auxtype == 'd' or auxtype == 'D':
- value = <double>bam_aux2f(v)
- elif auxtype == 'A':
- # there might be a more efficient way
- # to convert a char into a string
- value = '%c' % <char>bam_aux2A(v)
- elif auxtype == 'Z':
- value = charptr_to_str(<char*>bam_aux2Z(v))
- elif auxtype[0] == 'B':
- bytesize, nvalues, values = convert_binary_tag(v + 1)
- value = values
- else:
- raise ValueError("unknown auxiliary type '%s'" % auxtype)
-
- if with_value_type:
- return (value, auxtype)
- else:
- return value
-
- def get_tags(self, with_value_type=False):
- """the fields in the optional alignment section.
-
- Returns a list of all fields in the optional
- alignment section. Values are converted to appropriate python
- values. For example:
-
- [(NM, 2), (RG, "GJP00TM04")]
-
- If *with_value_type* is set, the value type as encoded in
- the AlignedSegment record will be returned as well:
-
- [(NM, 2, "i"), (RG, "GJP00TM04", "Z")]
-
- This method will convert all values in the optional alignment
- section. When getting only one or few tags, please see
- :meth:`get_tag` for a quicker way to achieve this.
-
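- For example, a minimal sketch::
-
- for tag, value in read.get_tags():
-     print(tag, value)
-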
- """
-
- cdef char * ctag
- cdef bam1_t * src
- cdef uint8_t * s
- cdef char auxtag[3]
- cdef char auxtype
- cdef uint8_t byte_size
- cdef int32_t nvalues
-
- src = self._delegate
- if src.l_data == 0:
- return []
- s = pysam_bam_get_aux(src)
- result = []
- auxtag[2] = 0
- while s < (src.data + src.l_data):
- # get tag
- auxtag[0] = s[0]
- auxtag[1] = s[1]
- s += 2
- auxtype = s[0]
- if auxtype in ('c', 'C'):
- value = <int>bam_aux2i(s)
- s += 1
- elif auxtype in ('s', 'S'):
- value = <int>bam_aux2i(s)
- s += 2
- elif auxtype in ('i', 'I'):
- value = <int32_t>bam_aux2i(s)
- s += 4
- elif auxtype == 'f':
- value = <float>bam_aux2f(s)
- s += 4
- elif auxtype == 'd':
- value = <double>bam_aux2f(s)
- s += 8
- elif auxtype == 'A':
- value = "%c" % <char>bam_aux2A(s)
- s += 1
- elif auxtype in ('Z', 'H'):
- value = charptr_to_str(<char*>bam_aux2Z(s))
- # +1 for NULL terminated string
- s += len(value) + 1
- elif auxtype == 'B':
- s += 1
- byte_size, nvalues, value = convert_binary_tag(s)
- # 5 for 1 char and 1 int
- s += 5 + (nvalues * byte_size) - 1
- else:
- raise KeyError("unknown type '%s'" % auxtype)
-
- s += 1
-
- if with_value_type:
- result.append((charptr_to_str(auxtag), value, chr(auxtype)))
- else:
- result.append((charptr_to_str(auxtag), value))
-
- return result
-
- def set_tags(self, tags):
- """sets the fields in the optional alignment section with
- a list of (tag, value) tuples.
-
- The :term:`value type` of the values is determined from the
- python type. Optionally, a type may be given explicitly as
- a third value in the tuple. For example:
-
- x.set_tags([(NM, 2, "i"), (RG, "GJP00TM04", "Z")])
-
- This method will not enforce the rule that the same tag may appear
- only once in the optional alignment section.
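-
- For example, a sketch that copies all tags from one read to
- another, preserving the original type codes::
-
- other.set_tags(read.get_tags(with_value_type=True))
-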
- """
-
- cdef bam1_t * src
- cdef uint8_t * s
- cdef char * temp
- cdef int new_size = 0
- cdef int old_size
- src = self._delegate
-
- # convert and pack the data
- if tags is not None and len(tags) > 0:
- fmt, args = packTags(tags)
- new_size = struct.calcsize(fmt)
- buffer = ctypes.create_string_buffer(new_size)
- struct.pack_into(fmt,
- buffer,
- 0,
- *args)
-
- # delete the old data and allocate new space.
- # If total_size == 0, the aux field will be
- # empty
- old_size = pysam_bam_get_l_aux(src)
- pysam_bam_update(src,
- old_size,
- new_size,
- pysam_bam_get_aux(src))
-
- # copy data only if there is any
- if new_size > 0:
-
- # get location of new data
- s = pysam_bam_get_aux(src)
-
- # check if there is direct path from buffer.raw to tmp
- p = buffer.raw
- # create handle to make sure buffer stays alive long
- # enough for memcpy, see issue 129
- temp = p
- memcpy(s, temp, new_size)
-
-
- ########################################################
- # Compatibility Accessors
- # Functions, properties for compatibility with pysam < 0.8
- #
- # Several options
- # change the factory functions according to API
- # * requires code changes throughout, incl passing
- # handles to factory functions
- # subclass functions and add attributes at runtime
- # e.g.: AlignedSegment.qname = AlignedSegment.query_name
- # * will slow down the default interface
- # explicit declaration of getters/setters
- ########################################################
- property qname:
- """deprecated, use query_name instead"""
- def __get__(self): return self.query_name
- def __set__(self, v): self.query_name = v
- property tid:
- """deprecated, use reference_id instead"""
- def __get__(self): return self.reference_id
- def __set__(self, v): self.reference_id = v
- property pos:
- """deprecated, use reference_start instead"""
- def __get__(self): return self.reference_start
- def __set__(self, v): self.reference_start = v
- property mapq:
- """deprecated, use mapping_quality instead"""
- def __get__(self): return self.mapping_quality
- def __set__(self, v): self.mapping_quality = v
- property rnext:
- """deprecated, use next_reference_id instead"""
- def __get__(self): return self.next_reference_id
- def __set__(self, v): self.next_reference_id = v
- property pnext:
- """deprecated, use next_reference_start instead"""
- def __get__(self):
- return self.next_reference_start
- def __set__(self, v):
- self.next_reference_start = v
- property cigar:
- """deprecated, use cigartuples instead"""
- def __get__(self):
- r = self.cigartuples
- if r is None:
- r = []
- return r
- def __set__(self, v): self.cigartuples = v
- property tlen:
- """deprecated, use template_length instead"""
- def __get__(self):
- return self.template_length
- def __set__(self, v):
- self.template_length = v
- property seq:
- """deprecated, use query_sequence instead"""
- def __get__(self):
- return self.query_sequence
- def __set__(self, v):
- self.query_sequence = v
- property qual:
- """deprecated, use query_qualities instead"""
- def __get__(self):
- return array_to_qualitystring(self.query_qualities)
- def __set__(self, v):
- self.query_qualities = qualitystring_to_array(v)
- property alen:
- """deprecated, use reference_length instead"""
- def __get__(self):
- return self.reference_length
- def __set__(self, v):
- self.reference_length = v
- property aend:
- """deprecated, use reference_end instead"""
- def __get__(self):
- return self.reference_end
- def __set__(self, v):
- self.reference_end = v
- property rlen:
- """deprecated, use query_length instead"""
- def __get__(self):
- return self.query_length
- def __set__(self, v):
- self.query_length = v
- property query:
- """deprecated, use query_alignment_sequence instead"""
- def __get__(self):
- return self.query_alignment_sequence
- def __set__(self, v):
- self.query_alignment_sequence = v
- property qqual:
- """deprecated, use query_alignment_qualities instead"""
- def __get__(self):
- return array_to_qualitystring(self.query_alignment_qualities)
- def __set__(self, v):
- self.query_alignment_qualities = qualitystring_to_array(v)
- property qstart:
- """deprecated, use query_alignment_start instead"""
- def __get__(self):
- return self.query_alignment_start
- def __set__(self, v):
- self.query_alignment_start = v
- property qend:
- """deprecated, use query_alignment_end instead"""
- def __get__(self):
- return self.query_alignment_end
- def __set__(self, v):
- self.query_alignment_end = v
- property qlen:
- """deprecated, use query_alignment_length instead"""
- def __get__(self):
- return self.query_alignment_length
- def __set__(self, v):
- self.query_alignment_length = v
- property mrnm:
- """deprecated, use next_reference_id instead"""
- def __get__(self):
- return self.next_reference_id
- def __set__(self, v):
- self.next_reference_id = v
- property mpos:
- """deprecated, use next_reference_start instead"""
- def __get__(self):
- return self.next_reference_start
- def __set__(self, v):
- self.next_reference_start = v
- property rname:
- """deprecated, use reference_id instead"""
- def __get__(self):
- return self.reference_id
- def __set__(self, v):
- self.reference_id = v
- property isize:
- """deprecated, use template_length instead"""
- def __get__(self):
- return self.template_length
- def __set__(self, v):
- self.template_length = v
- property blocks:
- """deprecated, use get_blocks() instead"""
- def __get__(self):
- return self.get_blocks()
- property aligned_pairs:
- """deprecated, use get_aligned_pairs() instead"""
- def __get__(self):
- return self.get_aligned_pairs()
- property inferred_length:
- """deprecated, use infer_query_length() instead"""
- def __get__(self):
- return self.infer_query_length()
- property positions:
- """deprecated, use get_reference_positions() instead"""
- def __get__(self):
- return self.get_reference_positions()
- property tags:
- """deprecated, use get_tags() instead"""
- def __get__(self):
- return self.get_tags()
- def __set__(self, tags):
- self.set_tags(tags)
- def overlap(self):
- """deprecated, use get_overlap() instead"""
- return self.get_overlap()
- def opt(self, tag):
- """deprecated, use get_tag() instead"""
- return self.get_tag(tag)
- def setTag(self, tag, value, value_type=None, replace=True):
- """deprecated, use set_tag() instead"""
- return self.set_tag(tag, value, value_type, replace)
-
-
-cdef class PileupColumn:
- '''A pileup of reads at a particular reference sequence position
- (:term:`column`). A pileup column contains all the reads that map
- to a certain target base.
-
- This class is a proxy for results returned by the samtools pileup
- engine. If the underlying engine iterator advances, the results
- of this column will change.
-
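- A sketch of typical access via :meth:`AlignmentFile.pileup`
- (file and reference names are illustrative)::
-
- bam = pysam.AlignmentFile("ex1.bam", "rb")
- for column in bam.pileup("chr1", 100, 200):
-     print(column.reference_pos, column.nsegments)
-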
- '''
- def __init__(self):
- raise TypeError("this class cannot be instantiated from Python")
-
- def __str__(self):
- return "\t".join(map(str,
- (self.reference_id,
- self.reference_pos,
- self.nsegments))) +\
- "\n" +\
- "\n".join(map(str, self.pileups))
-
- property reference_id:
- '''the reference sequence number as defined in the header'''
- def __get__(self):
- return self.tid
-
- property reference_name:
- """:term:`reference` name (None if no AlignmentFile is associated)"""
- def __get__(self):
- if self._alignment_file is not None:
- return self._alignment_file.getrname(self.tid)
- return None
-
- property nsegments:
- '''number of reads mapping to this column.'''
- def __get__(self):
- return self.n_pu
- def __set__(self, n):
- self.n_pu = n
-
- property reference_pos:
- '''the position in the reference sequence (0-based).'''
- def __get__(self):
- return self.pos
-
- property pileups:
- '''list of reads (:class:`pysam.PileupRead`) aligned to this column'''
- def __get__(self):
- cdef int x
- pileups = []
-
- if self.plp == NULL or self.plp[0] == NULL:
- raise ValueError("PileupColumn accessed after iterator finished")
-
- # warning: there could be problems if self.n and self.buf are
- # out of sync.
- for x from 0 <= x < self.n_pu:
- pileups.append(makePileupRead(&(self.plp[0][x]),
- self._alignment_file))
- return pileups
-
- ########################################################
- # Compatibility Accessors
- # Functions, properties for compatibility with pysam < 0.8
- ########################################################
- property pos:
- def __get__(self):
- return self.reference_pos
- def __set__(self, v):
- self.reference_pos = v
-
- property tid:
- def __get__(self):
- return self.reference_id
- def __set__(self, v):
- self.reference_id = v
-
- property n:
- def __get__(self):
- return self.nsegments
- def __set__(self, v):
- self.nsegments = v
-
-
-cdef class PileupRead:
- '''Representation of a read aligned to a particular position in the
- reference sequence.
-
- '''
-
- def __init__(self):
- raise TypeError(
- "this class cannot be instantiated from Python")
-
- def __str__(self):
- return "\t".join(
- map(str,
- (self.alignment, self.query_position,
- self.indel, self.level,
- self.is_del, self.is_head,
- self.is_tail, self.is_refskip)))
-
- property alignment:
- """a :class:`pysam.AlignedSegment` object of the aligned read"""
- def __get__(self):
- return self._alignment
-
- property query_position:
- """position of the read base at the pileup site, 0-based.
- None if is_del or is_refskip is set.
-
- """
- def __get__(self):
- if self.is_del or self.is_refskip:
- return None
- else:
- return self._qpos
-
- property query_position_or_next:
- """position of the read base at the pileup site, 0-based.
-
- If the current position is a deletion, returns the next
- aligned base.
-
- """
- def __get__(self):
- return self._qpos
-
- property indel:
- """indel length for the position following the current pileup site.
-
- This quantity peeks ahead to the next cigar operation in this
- alignment. If the next operation is an insertion, indel will
- be positive. If the next operation is a deletion, it will be
- negative. It is 0 if the next operation is not an indel.
-
- """
- def __get__(self):
- return self._indel
-
- property level:
- """the level of the read in the "viewer" mode"""
- def __get__(self):
- return self._level
-
- property is_del:
- """1 iff the base on the padded read is a deletion"""
- def __get__(self):
- return self._is_del
-
- property is_head:
- """1 iff the base on the padded read is the left-most base."""
- def __get__(self):
- return self._is_head
-
- property is_tail:
- """1 iff the base on the padded read is the right-most base."""
- def __get__(self):
- return self._is_tail
-
- property is_refskip:
- """1 iff the base on the padded read is part of a CIGAR N
- (reference skip) operation."""
- def __get__(self):
- return self._is_refskip
-
-__all__ = [
- "AlignedSegment",
- "PileupColumn",
- "PileupRead"]
+++ /dev/null
-from libc.stdint cimport int8_t, int16_t, int32_t, int64_t
-from libc.stdint cimport uint8_t, uint16_t, uint32_t, uint64_t
-from libc.stdlib cimport malloc, calloc, realloc, free
-from libc.string cimport memcpy, memcmp, strncpy, strlen, strdup
-from libc.stdio cimport FILE, printf
-
-from pysam.cfaidx cimport faidx_t, Fastafile
-from pysam.calignedsegment cimport AlignedSegment
-from pysam.chtslib cimport *
-
-from cpython cimport array
-cimport cython
-
-cdef extern from *:
- ctypedef char* const_char_ptr "const char*"
-
-cdef extern from "htslib_util.h":
-
- char * pysam_bam_get_qname(bam1_t * b)
-
-cdef extern from "samfile_util.h":
-
- int bam_cap_mapQ(bam1_t *b, char *ref, int thres)
- int bam_prob_realn(bam1_t *b, const char *ref)
-
-####################################################################
-# Utility types
-
-ctypedef struct __iterdata:
- htsFile * htsfile
- bam_hdr_t * header
- hts_itr_t * iter
- faidx_t * fastafile
- int tid
- char * seq
- int seq_len
-
-
-cdef class AlignmentFile:
-
- cdef object _filename
- cdef object _reference_filename
-
- # pointer to htsFile structure
- cdef htsFile * htsfile
-
- # pointer to index
- cdef hts_idx_t *index
- # header structure
- cdef bam_hdr_t * header
- # true if file is bam format
- cdef readonly bint is_bam
- # true if file is cram format
- cdef readonly bint is_cram
- # true if not a file but a stream
- cdef readonly bint is_stream
- # true if file is not on the local filesystem
- cdef readonly bint is_remote
- # current read within iteration
- cdef bam1_t * b
- # file opening mode
- cdef char * mode
-
- # beginning of read section
- cdef int64_t start_offset
-
- cdef bam1_t * getCurrent(self)
- cdef int cnext(self)
-
- # write an aligned read
- cpdef int write(self, AlignedSegment read) except -1
-
-cdef class PileupColumn:
- cdef bam_pileup1_t ** plp
- cdef int tid
- cdef int pos
- cdef int n_pu
-
-cdef class PileupRead:
- cdef AlignedSegment _alignment
- cdef int32_t _qpos
- cdef int _indel
- cdef int _level
- cdef uint32_t _is_del
- cdef uint32_t _is_head
- cdef uint32_t _is_tail
- cdef uint32_t _is_refskip
-
-cdef class IteratorRow:
- cdef int retval
- cdef bam1_t * b
- cdef AlignmentFile samfile
- cdef htsFile * htsfile
- cdef bam_hdr_t * header
- cdef int owns_samfile
-
-cdef class IteratorRowRegion(IteratorRow):
- cdef hts_itr_t * iter
- cdef bam1_t * getCurrent(self)
- cdef int cnext(self)
-
-cdef class IteratorRowHead(IteratorRow):
- cdef int max_rows
- cdef int current_row
- cdef bam1_t * getCurrent(self)
- cdef int cnext(self)
-
-cdef class IteratorRowAll(IteratorRow):
- cdef bam1_t * getCurrent(self)
- cdef int cnext(self)
-
-cdef class IteratorRowAllRefs(IteratorRow):
- cdef int tid
- cdef IteratorRowRegion rowiter
-
-cdef class IteratorRowSelection(IteratorRow):
- cdef int current_pos
- cdef positions
- cdef bam1_t * getCurrent(self)
- cdef int cnext(self)
-
-cdef class IteratorColumn:
-
- # result of the last plbuf_push
- cdef IteratorRowRegion iter
- cdef int tid
- cdef int pos
- cdef int n_plp
- cdef int mask
- cdef bam_pileup1_t * plp
- cdef bam_plp_t pileup_iter
- cdef __iterdata iterdata
- cdef AlignmentFile samfile
- cdef Fastafile fastafile
- cdef stepper
- cdef int max_depth
-
- cdef int cnext(self)
- cdef char * getSequence(self)
- cdef setMask(self, mask)
- cdef setupIteratorData(self,
- int tid,
- int start,
- int end,
- int multiple_iterators=?)
-
- cdef reset(self, tid, start, end)
- cdef _free_pileup_iter(self)
-
-cdef class IteratorColumnRegion(IteratorColumn):
- cdef int start
- cdef int end
- cdef int truncate
-
-cdef class IteratorColumnAllRefs(IteratorColumn):
- pass
-
-cdef class IndexedReads:
- cdef AlignmentFile samfile
- cdef htsFile * htsfile
- cdef index
- cdef int owns_samfile
- cdef bam_hdr_t * header
-
+++ /dev/null
-# cython: embedsignature=True
-# cython: profile=True
-########################################################
-########################################################
-# Cython wrapper for SAM/BAM/CRAM files based on htslib
-########################################################
-# The principal classes defined in this module are:
-#
-# class AlignmentFile read/write access to SAM/BAM/CRAM formatted files
-#
-# class IndexedReads index a SAM/BAM/CRAM file by query name while keeping
-# the original sort order intact
-#
-# Additionally, this module defines numerous classes that
-# are part of the internal API. These are:
-#
-# Various iterator classes to iterate over alignments in sequential
-# (IteratorRow) or in a stacked fashion (IteratorColumn):
-#
-# class IteratorRow
-# class IteratorRowRegion
-# class IteratorRowHead
-# class IteratorRowAll
-# class IteratorRowAllRefs
-# class IteratorRowSelection
-# class IteratorColumn
-# class IteratorColumnRegion
-# class IteratorColumnAllRefs
-#
-########################################################
-#
-# The MIT License
-#
-# Copyright (c) 2015 Andreas Heger
-#
-# Permission is hereby granted, free of charge, to any person obtaining a
-# copy of this software and associated documentation files (the "Software"),
-# to deal in the Software without restriction, including without limitation
-# the rights to use, copy, modify, merge, publish, distribute, sublicense,
-# and/or sell copies of the Software, and to permit persons to whom the
-# Software is furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-# DEALINGS IN THE SOFTWARE.
-#
-########################################################
-import os
-import collections
-import re
-import warnings
-import array
-
-from cpython cimport array as c_array
-from cpython.version cimport PY_MAJOR_VERSION
-
-from pysam.cutils cimport force_bytes, force_str, charptr_to_str
-from pysam.cutils cimport encode_filename, from_string_and_size
-from pysam.calignedsegment cimport makeAlignedSegment, makePileupColumn
-from pysam.chtslib cimport hisremote
-
-if PY_MAJOR_VERSION >= 3:
- from io import StringIO
-else:
- from StringIO import StringIO
-
-cimport cython
-
-########################################################
-## Constants and global variables
-
-# defines imported from samtools
-DEF SEEK_SET = 0
-DEF SEEK_CUR = 1
-DEF SEEK_END = 2
-
-# maximum genomic coordinate
-cdef int MAX_POS = 2 << 29
-
-# valid types for SAM headers
-VALID_HEADER_TYPES = {"HD" : dict,
- "SQ" : list,
- "RG" : list,
- "PG" : list,
- "CO" : list}
-
-# order of records within SAM headers
-VALID_HEADERS = ("HD", "SQ", "RG", "PG", "CO")
-
-# default type conversions within SAM header records
-KNOWN_HEADER_FIELDS = {"HD" : {"VN" : str, "SO" : str, "GO" : str},
- "SQ" : {"SN" : str, "LN" : int, "AS" : str,
- "M5" : str, "SP" : str, "UR" : str,},
- "RG" : {"ID" : str, "CN" : str, "DS" : str,
- "DT" : str, "FO" : str, "KS" : str,
- "LB" : str, "PG" : str, "PI" : str,
- "PL" : str, "PM" : str, "PU" : str,
- "SM" : str,},
- "PG" : {"ID" : str, "PN" : str, "CL" : str,
- "PP" : str, "DS" : str, "VN" : str,},}
-
-# output order of fields within records. Ensure that CL is at
-# the end as parsing a CL will ignore any subsequent records.
-VALID_HEADER_ORDER = {"HD" : ("VN", "SO", "GO"),
- "SQ" : ("SN", "LN", "AS", "M5",
- "UR", "SP"),
- "RG" : ("ID", "CN", "SM", "LB",
- "PU", "PI", "DT", "DS",
- "PL", "FO", "KS", "PG",
- "PM"),
- "PG" : ("PN", "ID", "VN", "PP",
- "DS", "CL"),}
-
-
-def build_header_line(fields, record):
- '''build a header line from `fields` dictionary for `record`'''
-
- # TODO: add checking for field and sort order
- line = ["@%s" % record]
- # comment
- if record == "CO":
- line.append(fields)
- # user tags
- elif record.islower():
- for key in sorted(fields):
- line.append("%s:%s" % (key, str(fields[key])))
- # defined tags
- else:
- # write fields of the specification
- for key in VALID_HEADER_ORDER[record]:
- if key in fields:
- line.append("%s:%s" % (key, str(fields[key])))
- # write user fields
- for key in fields:
- if not key.isupper():
- line.append("%s:%s" % (key, str(fields[key])))
-
- return "\t".join(line)
-
-cdef bam_hdr_t * build_header(new_header):
- '''return a new header built from a dictionary in `new_header`.
-
- This method inserts the text field, target_name and target_len.
- '''
-
- lines = []
-
- # check if hash exists
-
- # create new header and copy old data
- cdef bam_hdr_t * dest
-
- dest = bam_hdr_init()
-
- # first: defined tags
- for record in VALID_HEADERS:
- if record in new_header:
- ttype = VALID_HEADER_TYPES[record]
- data = new_header[record]
- if type(data) != type(ttype()):
- raise ValueError(
- "invalid type for record %s: %s, expected %s" %
- (record, type(data), type(ttype())))
- if type(data) is dict:
- lines.append(build_header_line(data, record))
- else:
- for fields in new_header[record]:
- lines.append(build_header_line(fields, record))
-
- # then: user tags (lower case), sorted alphabetically
- for record, data in sorted(new_header.items()):
- if record in VALID_HEADERS: continue
- if type(data) is dict:
- lines.append(build_header_line(data, record))
- else:
- for fields in new_header[record]:
- lines.append(build_header_line(fields, record))
-
- text = "\n".join(lines) + "\n"
- if dest.text != NULL: free( dest.text )
- dest.text = <char*>calloc(len(text), sizeof(char))
- dest.l_text = len(text)
- cdef bytes btext = text.encode('ascii')
- strncpy(dest.text, btext, dest.l_text)
-
- cdef bytes bseqname
- # collect targets
- if "SQ" in new_header:
- seqs = []
- for fields in new_header["SQ"]:
- try:
- seqs.append( (fields["SN"], fields["LN"] ) )
- except KeyError:
- raise KeyError( "incomplete sequence information in '%s'" % str(fields))
-
- dest.n_targets = len(seqs)
- dest.target_name = <char**>calloc(dest.n_targets, sizeof(char*))
- dest.target_len = <uint32_t*>calloc(dest.n_targets, sizeof(uint32_t))
-
- for x from 0 <= x < dest.n_targets:
- seqname, seqlen = seqs[x]
- dest.target_name[x] = <char*>calloc(
- len(seqname) + 1, sizeof(char))
- bseqname = seqname.encode('ascii')
- strncpy(dest.target_name[x], bseqname,
- len(seqname) + 1)
- dest.target_len[x] = seqlen
-
- return dest
-
-
-cdef class AlignmentFile:
- """AlignmentFile(filepath_or_object, mode=None, template=None,
- reference_names=None, reference_lengths=None, text=NULL,
- header=None, add_sq_text=False, check_header=True, check_sq=True,
- reference_filename=None, filename=None)
-
- A :term:`SAM`/:term:`BAM` formatted file.
-
- If `filepath_or_object` is a string, the file is automatically
- opened. If `filepath_or_object` is a python File object, the
- already opened file will be used.
-
- If the file is opened for reading and an index exists (.bai),
- it will be opened automatically. Without an index, random
- access via :meth:`~pysam.AlignmentFile.fetch` and
- :meth:`~pysam.AlignmentFile.pileup` is disabled.
-
- For writing, the header of a :term:`SAM` file/:term:`BAM` file can
- be constituted from several sources (see also the samtools format
- specification):
-
- 1. If `template` is given, the header is copied from another
- `AlignmentFile` (`template` must be a
- :class:`~pysam.AlignmentFile`).
-
- 2. If `header` is given, the header is built from a
- multi-level dictionary.
-
- 3. If `text` is given, new header text is copied from raw
- text.
-
- 4. The names (`reference_names`) and lengths
- (`reference_lengths`) are supplied directly as lists.
-
- When reading or writing a CRAM file, the filename of a FASTA-formatted
- reference can be specified with `reference_filename`.
-
- By default, if a file is opened in mode 'r', it is checked
- for a valid header (`check_header` = True) and a definition of
- chromosome names (`check_sq` = True).
-
- Parameters
- ----------
- mode : string
- `mode` should be ``r`` for reading or ``w`` for writing. The
- default is text mode (:term:`SAM`). For binary (:term:`BAM`)
- I/O you should append ``b`` for compressed or ``u`` for
- uncompressed :term:`BAM` output. Use ``h`` to output header
- information in text (:term:`TAM`) mode. Use ``c`` for
- :term:`CRAM` formatted files.
-
- If ``b`` is present, it must immediately follow ``r`` or
- ``w``. Valid modes are ``r``, ``w``, ``wh``, ``rb``, ``wb``,
- ``wbu``, ``wb0``, ``rc`` and ``wc``. For instance, to open a
- :term:`BAM` formatted file for reading, type::
-
- f = pysam.AlignmentFile('ex1.bam','rb')
-
- If mode is not specified, the method will try to auto-detect
- in the order 'rb', 'r'; thus both of the following should work::
-
- f1 = pysam.AlignmentFile('ex1.bam')
- f2 = pysam.AlignmentFile('ex1.sam')
-
- template : AlignmentFile
- when writing, copy the header from `template`.
-
- header : dict
- when writing, build header from a multi-level dictionary. The
- first level are the four types ('HD', 'SQ', ...). The second
- level are a list of lines, with each line being a list of
- tag-value pairs. The header is constructed first from all the
- defined fields, followed by user tags in alphabetical order.
-
- text : string
- when writing, use the string provided as the header
-
- reference_names : list
- see reference_lengths
-
- reference_lengths : list
- when writing, build header from list of chromosome names and
- lengths. By default, 'SQ' and 'LN' tags will be added to the
- header text. This option can be changed by unsetting the flag
- `add_sq_text`.
-
- add_sq_text : bool
- do not add 'SQ' and 'LN' tags to header. This option permits
- construction of :term:`SAM` formatted files without a header.
-
- check_header : bool
- when reading, check if header is present (default=True)
-
- check_sq : bool
- when reading, check if SQ entries are present in header
- (default=True)
-
- reference_filename : string
- Path to a FASTA-formatted reference file. Valid only for CRAM files.
- When reading a CRAM file, this overrides both ``$REF_PATH`` and the URL
- specified in the header (``UR`` tag), which are normally used to find
- the reference.
-
- filename : string
- Alternative to filepath_or_object. Filename of the file
- to be opened.
-
- """
-
- def __cinit__(self, *args, **kwargs):
-
- self.htsfile = NULL
- self._filename = None
- self.is_bam = False
- self.is_stream = False
- self.is_cram = False
- self.is_remote = False
-
- if "filename" in kwargs:
- args = [kwargs["filename"]]
- del kwargs["filename"]
-
- self._open(*args, **kwargs)
-
- # allocate memory for iterator
- self.b = <bam1_t*>calloc(1, sizeof(bam1_t))
-
- def is_open(self):
- '''return true if htsfile has been opened.'''
- return self.htsfile != NULL
-
- def has_index(self):
- """return true if htsfile has an existing (and opened) index.
- """
- return self.index != NULL
-
- def check_index(self):
- """return True if index is present.
-
- Raises
- ------
-
- AttributeError
- if htsfile is :term:`SAM` formatted and thus has no index.
-
- ValueError
- if htsfile is closed or index could not be opened.
- """
-
- if not self.is_open():
- raise ValueError("I/O operation on closed file")
- if not self.is_bam and not self.is_cram:
- raise AttributeError(
- "check_index only available for bam and cram files")
- if self.index == NULL:
- raise ValueError(
- "mapping information not recorded in index "
- "or index not available")
- return True
-
- def _open(self,
- filepath_or_object,
- mode=None,
- AlignmentFile template=None,
- reference_names=None,
- reference_lengths=None,
- reference_filename=None,
- text=None,
- header=None,
- port=None,
- add_sq_text=True,
- check_header=True,
- check_sq=True,
- filepath_index=None,
- referencenames=None,
- referencelengths=None):
- '''open a sam, bam or cram formatted file.
-
- If _open is called on an existing file, the current file
- will be closed and a new file will be opened.
- '''
- cdef char *cfilename
- cdef char *creference_filename
- cdef char *cindexname
- cdef char *cmode
-
- # for backwards compatibility:
- if referencenames is not None:
- reference_names = referencenames
- if referencelengths is not None:
- reference_lengths = referencelengths
-
- # autodetection for read
- if mode is None:
- mode = "r"
-
- assert mode in ("r", "w", "rb", "wb", "wh",
- "wbu", "rU", "wb0",
- "rc", "wc"), \
- "invalid file opening mode `%s`" % mode
-
- # close a previously opened file
- if self.htsfile != NULL:
- self.close()
-
- # StringIO not supported
- if isinstance(filepath_or_object, StringIO):
- filename = "stringio"
- raise NotImplementedError(
- "access from StringIO objects not supported")
- if filepath_or_object.closed:
- raise ValueError('I/O operation on closed StringIO object')
- # check if we are working with a File object
- elif hasattr(filepath_or_object, "fileno"):
- filename = filepath_or_object.name
- if filepath_or_object.closed:
- raise ValueError('I/O operation on closed file')
- else:
- filename = filepath_or_object
-
- # for htslib, wbu seems to not work
- if mode == "wbu":
- mode = "wb0"
-
- cdef bytes bmode = mode.encode('ascii')
- self._filename = filename = encode_filename(filename)
- self._reference_filename = reference_filename = encode_filename(
- reference_filename)
-
- # FIXME: Use htsFormat when it is available
- self.is_stream = filename == b"-"
- self.is_remote = hisremote(filename)
-
- cdef char * ctext
- cdef hFILE * fp
- ctext = NULL
-
- if mode[0] == 'w':
- # open file for writing
-
- # header structure (used for writing)
- if template:
- self.header = bam_hdr_dup(template.header)
- elif header:
- self.header = build_header(header)
- else:
- # build header from target names and lengths
- assert reference_names and reference_lengths, \
- ("either supply options `template`, `header` "
- "or both `reference_names` and `reference_lengths` "
- "for writing")
- assert len(reference_names) == len(reference_lengths), \
- "unequal names and lengths of reference sequences"
-
- # allocate and fill header
- reference_names = [force_bytes(ref) for ref in reference_names]
- self.header = bam_hdr_init()
- self.header.n_targets = len(reference_names)
- n = 0
- for x in reference_names:
- n += len(x) + 1
- self.header.target_name = <char**>calloc(
- n, sizeof(char*))
- self.header.target_len = <uint32_t*>calloc(
- n, sizeof(uint32_t))
- for x from 0 <= x < self.header.n_targets:
- self.header.target_len[x] = reference_lengths[x]
- name = reference_names[x]
- self.header.target_name[x] = <char*>calloc(
- len(name) + 1, sizeof(char))
- strncpy(self.header.target_name[x], name, len(name))
-
- # Optionally, if there is no text, add a SAM
- # compatible header to output file.
- if text is None and add_sq_text:
- text = []
- for x from 0 <= x < self.header.n_targets:
- text.append("@SQ\tSN:%s\tLN:%s\n" % \
- (force_str(reference_names[x]),
- reference_lengths[x]))
- text = ''.join(text)
-
- if text is not None:
- # copy without \0
- text = force_bytes(text)
- ctext = text
- self.header.l_text = strlen(ctext)
- self.header.text = <char*>calloc(
- strlen(ctext), sizeof(char))
- memcpy(self.header.text, ctext, strlen(ctext))
-
- # open file (hts_open is a synonym for sam_open)
- cfilename, cmode = filename, bmode
- if hasattr(filepath_or_object, "fileno"):
- fp = hdopen(filepath_or_object.fileno(), cmode)
- with nogil:
- self.htsfile = hts_hopen(fp, cfilename, cmode)
- else:
- with nogil:
- self.htsfile = hts_open(cfilename, cmode)
-
- # htsfile.format does not get set until writing, so use
- # the format specifier explicitly given by the user.
- self.is_bam = "b" in mode
- self.is_cram = "c" in mode
-
- # set filename with reference sequences. If no filename
- # is given, the CRAM reference arrays will be built from
- # the @SQ lines in the header
- if self.is_cram and reference_filename:
- # note that fn_aux takes ownership, so create a copy
- self.htsfile.fn_aux = strdup(self._reference_filename)
-
- # write header to htsfile
- if self.is_bam or self.is_cram or "h" in mode:
- with nogil:
- sam_hdr_write(self.htsfile, self.header)
-
- elif mode[0] == "r":
- # open file for reading
- if (filename != b"-"
- and not self.is_remote
- and not os.path.exists(filename)):
- raise IOError("file `%s` not found" % filename)
-
- # open file (hts_open is a synonym for sam_open)
- cfilename, cmode = filename, bmode
- if hasattr(filepath_or_object, "fileno"):
- fp = hdopen(filepath_or_object.fileno(), cmode)
- with nogil:
- self.htsfile = hts_hopen(fp, cfilename, cmode)
- else:
- with nogil:
- self.htsfile = hts_open(cfilename, cmode)
-
- if self.htsfile == NULL:
- raise ValueError(
- "could not open file (mode='%s') - "
- "is it SAM/BAM format?" % mode)
-
- self.is_bam = self.htsfile.format.format == bam
- self.is_cram = self.htsfile.format.format == cram
-
- # bam files require a valid header
- if self.is_bam or self.is_cram:
- with nogil:
- self.header = sam_hdr_read(self.htsfile)
- if self.header == NULL:
- raise ValueError(
- "file does not have valid header (mode='%s') "
- "- is it BAM format?" % mode )
- else:
- # in sam files it is optional (htsfile full of
- # unmapped reads)
- if check_header:
- with nogil:
- self.header = sam_hdr_read(self.htsfile)
- if self.header == NULL:
- raise ValueError(
- "file does not have valid header (mode='%s') "
- "- is it SAM format?" % mode )
- # self.header.ignore_sam_err = True
-
- # set filename with reference sequences
- if self.is_cram and reference_filename:
- creference_filename = self._reference_filename
- hts_set_opt(self.htsfile,
- CRAM_OPT_REFERENCE,
- creference_filename)
-
- if check_sq and self.header.n_targets == 0:
- raise ValueError(
- ("file has no sequences defined (mode='%s') - "
- "is it SAM/BAM format? Consider opening with "
- "check_sq=False") % mode)
-
- if self.htsfile == NULL:
- raise IOError("could not open file `%s`" % filename )
-
- # check for index and open if present
- cdef int format_index = -1
- if self.is_bam:
- format_index = HTS_FMT_BAI
- elif self.is_cram:
- format_index = HTS_FMT_CRAI
-
- if mode[0] == "r" and (self.is_bam or self.is_cram):
-
- # open index for remote files
- if self.is_remote and not filepath_index:
- cfilename = filename
-
- with nogil:
- self.index = hts_idx_load(cfilename, format_index)
- if self.index == NULL:
- warnings.warn(
- "unable to open remote index for '%s'" % cfilename)
- else:
- has_index = True
- cfilename = filename
- if filepath_index:
- if not os.path.exists(filepath_index):
- warnings.warn(
- "unable to open index at %s" % filepath_index)
- self.index = NULL
- has_index = False
- else:
- if self.is_bam \
- and not os.path.exists(filename + b".bai") \
- and not os.path.exists(filename[:-4] + b".bai"):
- self.index = NULL
- has_index = False
- elif self.is_cram \
- and not os.path.exists(filename + b".crai") \
- and not os.path.exists(filename[:-5] + b".crai"):
- self.index = NULL
- has_index = False
-
- if has_index:
- # returns NULL if there is no index or index could
- # not be opened
- if filepath_index:
- cindexname = filepath_index = encode_filename(filepath_index)
- with nogil:
- self.index = sam_index_load2(self.htsfile,
- cfilename,
- cindexname)
-
- else:
- with nogil:
- self.index = sam_index_load(self.htsfile,
- cfilename)
- if self.index == NULL:
- raise IOError(
- "error while opening index for '%s'" %
- filename)
-
- # save start of data section
- if not self.is_stream:
- self.start_offset = self.tell()
-
- def get_tid(self, reference):
- """
- return the numerical :term:`tid` corresponding to
- :term:`reference`
-
- returns -1 if reference is not known.
- """
- if not self.is_open():
- raise ValueError("I/O operation on closed file")
- reference = force_bytes(reference)
- return bam_name2id(self.header, reference)
-
- def get_reference_name(self, tid):
- """
- return :term:`reference` name corresponding to numerical :term:`tid`
- """
- if not self.is_open():
- raise ValueError("I/O operation on closed file")
- if not 0 <= tid < self.header.n_targets:
- raise ValueError("reference_id %i out of range 0<=tid<%i" %
- (tid, self.header.n_targets))
- return charptr_to_str(self.header.target_name[tid])
-
- def reset(self):
- """reset file position to beginning of file just after
- the header.
-
- Returns
- -------
-
- The file position after moving the file pointer.
-
- """
- return self.seek(self.start_offset, 0)
-
- def seek(self, uint64_t offset, int where=0):
- """move file pointer to position `offset`, see
- :meth:`pysam.AlignmentFile.tell`.
-
- Parameters
- ----------
-
- offset : int
-
- position of the read/write pointer within the file.
-
- where : int
-
- optional and defaults to 0, which means absolute file
- positioning; other values are 1, which means seek relative to
- the current position, and 2, which means seek relative to the
- file's end.
-
- Returns
- -------
-
- the file position after moving the file pointer
-
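- A sketch pairing :meth:`tell` and :meth:`seek` to revisit a
- read (BAM files only)::
-
- pos = bam.tell()
- read = next(bam)
- bam.seek(pos) # rewind to just before the read
-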
- """
-
- if not self.is_open():
- raise ValueError("I/O operation on closed file")
- if not self.is_bam:
- raise NotImplementedError(
- "seek only available in bam files")
- if self.is_stream:
- raise OSError("seek not available in streams")
-
- cdef uint64_t pos
- with nogil:
- pos = bgzf_seek(hts_get_bgzfp(self.htsfile), offset, where)
- return pos
-
- def tell(self):
- """
- return current file position.
- """
- if not self.is_open():
- raise ValueError("I/O operation on closed file")
- if not (self.is_bam or self.is_cram):
- raise NotImplementedError(
- "tell only available in bam and cram files")
-
- cdef uint64_t pos
- with nogil:
- pos = bgzf_tell(hts_get_bgzfp(self.htsfile))
- return pos
-
- def parse_region(self,
- reference=None,
- start=None,
- end=None,
- region=None,
- tid=None):
- """parse alternative ways to specify a genomic region. A region can
- either be specified by :term:`reference`, `start` and
- `end`. `start` and `end` denote 0-based, half-open
- intervals.
-
- Alternatively, a samtools :term:`region` string can be
- supplied.
-
- If any of the coordinates are missing they will be replaced by the
- minimum (`start`) or maximum (`end`) coordinate.
-
- Note that region strings are 1-based, while `start` and `end` denote
- an interval in python coordinates.
-
- Returns
- -------
-
- tuple : a tuple of `flag`, :term:`tid`, `start` and `end`. The
- flag indicates whether no coordinates were supplied and the
- genomic region is the complete genomic space.
-
- Raises
- ------
-
- ValueError
- for invalid or out of bounds regions.
-
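- For example, the following two calls are equivalent (the
- reference name is illustrative)::
-
- bam.parse_region("chr1", 99, 200)
- bam.parse_region(region="chr1:100-200")
-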
- """
- cdef int rtid
- cdef long long rstart
- cdef long long rend
-
- rtid = -1
- rstart = 0
- rend = MAX_POS
- if start is not None:
- try:
- rstart = start
- except OverflowError:
- raise ValueError('start out of range (%i)' % start)
-
- if end is not None:
- try:
- rend = end
- except OverflowError:
- raise ValueError('end out of range (%i)' % end)
-
- if region:
- region = force_str(region)
- parts = re.split("[:-]", region)
- reference = parts[0]
- if len(parts) >= 2:
- rstart = int(parts[1]) - 1
- if len(parts) >= 3:
- rend = int(parts[2])
-
- if not reference:
- return 0, 0, 0, 0
-
- if tid is not None:
- rtid = tid
- else:
- rtid = self.gettid(reference)
-
- if rtid < 0:
- raise ValueError(
- "invalid reference `%s`" % reference)
- if rstart > rend:
- raise ValueError(
- 'invalid coordinates: start (%i) > end (%i)' % (rstart, rend))
- if not 0 <= rstart < MAX_POS:
- raise ValueError('start out of range (%i)' % rstart)
- if not 0 <= rend <= MAX_POS:
- raise ValueError('end out of range (%i)' % rend)
-
- return 1, rtid, rstart, rend
-
- def fetch(self,
- reference=None,
- start=None,
- end=None,
- region=None,
- tid=None,
- until_eof=False,
- multiple_iterators=False):
- """fetch reads aligned in a :term:`region`.
-
- See :meth:`AlignmentFile.parse_region` for more information
- on genomic regions.
-
- Without a `reference` or `region` all mapped reads in the file
- will be fetched. The reads will be returned ordered by reference
- sequence, which will not necessarily be the order within the
- file. This mode of iteration still requires an index. If there is
- no index, use `until_eof=True`.
-
- If only `reference` is set, all reads aligned to `reference`
- will be fetched.
-
- A :term:`SAM` file does not allow random access. If `region`
- or `reference` are given, an exception is raised.
-
- Parameters
- ----------
-
- until_eof : bool
-
- If `until_eof` is True, all reads from the current file
- position will be returned in order as they are within the
- file. Using this option will also fetch unmapped reads.
-
- multiple_iterators : bool
-
- If `multiple_iterators` is True, multiple
- iterators on the same file can be used at the same time. The
- iterator returned will receive its own copy of a filehandle to
- the file, effectively re-opening the file. Re-opening a file
- creates some overhead, so beware.
-
- Returns
- -------
-
- An iterator over a collection of reads.
-
- Raises
- ------
-
- ValueError
- if the genomic coordinates are out of range or invalid or the
- file does not permit random access to genomic coordinates.
-
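- For example, a sketch (file and reference names are
- illustrative)::
-
- bam = pysam.AlignmentFile("ex1.bam", "rb")
- for read in bam.fetch("chr1", 100, 200):
-     print(read.query_name)
-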
- """
- cdef int rtid, rstart, rend, has_coord
-
- if not self.is_open():
- raise ValueError( "I/O operation on closed file" )
-
- has_coord, rtid, rstart, rend = self.parse_region(
- reference,
- start,
- end,
- region,
- tid)
-
- # Turn off re-opening if htsfile is a stream
- if self.is_stream:
- multiple_iterators = False
-
- if self.is_bam or self.is_cram:
- if not until_eof and not self.is_remote:
- if not self.has_index():
- raise ValueError(
- "fetch called on bamfile without index")
-
- if has_coord:
- return IteratorRowRegion(
- self, rtid, rstart, rend,
- multiple_iterators=multiple_iterators)
- else:
- if until_eof:
- return IteratorRowAll(
- self,
- multiple_iterators=multiple_iterators)
- else:
- # AH: check - reason why no multiple_iterators for
- # AllRefs?
- return IteratorRowAllRefs(
- self,
- multiple_iterators=multiple_iterators)
- else:
- if has_coord:
- raise ValueError(
- "fetching by region is not available for sam files")
-
- if self.header == NULL:
- raise ValueError(
- "fetch called for htsfile without header")
-
- # check if targets are defined
- # give warning, sam_read1 segfaults
- if self.header.n_targets == 0:
- warnings.warn("fetch called for htsfile without header")
-
- return IteratorRowAll(self,
- multiple_iterators=multiple_iterators)
-
- def head(self, n, multiple_iterators=True):
- '''return an iterator over the first n alignments.
-
- This iterator is useful for inspecting the bam-file.
-
- Parameters
- ----------
-
- multiple_iterators : bool
-
- is set to True by default in order to
- avoid changing the current file position.
-
- Returns
- -------
-
- an iterator over a collection of reads
-
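- For example, a minimal sketch::
-
- for read in bam.head(10):
-     print(read.query_name)
-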
- '''
- return IteratorRowHead(self, n,
- multiple_iterators=multiple_iterators)
-
- def mate(self, AlignedSegment read):
- '''return the mate of :class:`~pysam.AlignedSegment` `read`.
-
- .. note::
-
- Calling this method will change the file position.
- This might interfere with any iterators that have
- not re-opened the file.
-
- .. note::
-
- This method is too slow for high-throughput processing.
- If a read needs to be processed with its mate, work
- from a read name sorted file or, better, cache reads.
-
- Returns
- -------
-
- :class:`~pysam.AlignedSegment` : the mate
-
- Raises
- ------
-
- ValueError
- if the read is unpaired or the mate is unmapped
-
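- A sketch that guards against a missing mate::
-
- try:
-     mate = bam.mate(read)
- except ValueError:
-     mate = None
-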
- '''
- cdef uint32_t flag = read._delegate.core.flag
-
- if flag & BAM_FPAIRED == 0:
- raise ValueError("read %s: is unpaired" %
- (read.query_name))
- if flag & BAM_FMUNMAP != 0:
- raise ValueError("mate %s: is unmapped" %
- (read.query_name))
-
- # xor flags to get the other mate
- cdef int x = BAM_FREAD1 + BAM_FREAD2
- flag = (flag ^ x) & x
-
- # Make sure to use a separate file to jump around
- # to mate as otherwise the original file position
- # will be lost
- # The following code is not using the C API and
- # could thus be made much quicker, for example
- # by using tell and seek.
- for mate in self.fetch(
- read._delegate.core.mpos,
- read._delegate.core.mpos + 1,
- tid=read._delegate.core.mtid,
- multiple_iterators=True):
- if mate.flag & flag != 0 and \
- mate.query_name == read.query_name:
- break
- else:
- raise ValueError("mate not found")
-
- return mate
-
- def pileup(self,
- reference=None,
- start=None,
- end=None,
- region=None,
- **kwargs):
- """perform a :term:`pileup` within a :term:`region`. The region is
- specified by :term:`reference`, 'start' and 'end' (using
- 0-based indexing). Alternatively, a samtools 'region' string
- can be supplied.
-
- Without 'reference' or 'region' all reads will be used for the
- pileup. The reads will be returned ordered by
- :term:`reference` sequence, which will not necessarily be the
- order within the file.
-
- Note that :term:`SAM` formatted files do not allow random
- access. In these files, if a 'region' or 'reference' are
- given an exception is raised.
-
- .. note::
-
- 'all' reads which overlap the region are returned. The
- first base returned will be the first base of the first
- read, 'not' necessarily the first base of the region used
- in the query.
-
- Parameters
- ----------
-
- stepper : string
- The stepper controls how the iterator advances.
- Possible options for the stepper are
-
- ``all``
- skip reads in which any of the following flags are set:
- BAM_FUNMAP, BAM_FSECONDARY, BAM_FQCFAIL, BAM_FDUP
-
- ``nofilter``
- uses every single read
-
- ``samtools``
- same filter and read processing as in :term:`csamtools`
- pileup. This requires a 'fastafile' to be given.
-
-
- fastafile : :class:`~pysam.FastaFile` object.
-
- This is required for some of the steppers.
-
- max_depth : int
- Maximum read depth permitted. The default limit is '8000'.
-
- truncate : bool
-
- By default, the samtools pileup engine outputs all reads
- overlapping a region. If truncate is True and a region is
- given, only columns in the exact region specified are
- returned.
-
- Returns
- -------
-
- an iterator over genomic positions.
-
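- A minimal usage sketch (file name and region are illustrative
- only)::
-
-     with pysam.AlignmentFile("ex1.bam", "rb") as samfile:
-         for column in samfile.pileup("chr1", 100, 120):
-             print(column.pos, column.n)
-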
- """
- cdef int rtid, rstart, rend, has_coord
-
- if not self.is_open():
- raise ValueError("I/O operation on closed file")
-
- has_coord, rtid, rstart, rend = self.parse_region(
- reference, start, end, region)
-
- if self.is_bam or self.is_cram:
- if not self.has_index():
- raise ValueError("no index available for pileup")
-
- if has_coord:
- return IteratorColumnRegion(self,
- tid=rtid,
- start=rstart,
- end=rend,
- **kwargs )
- else:
- return IteratorColumnAllRefs(self, **kwargs )
-
- else:
- raise NotImplementedError(
- "pileup of samfiles not implemented yet")
-
- def count(self,
- reference=None,
- start=None,
- end=None,
- region=None,
- until_eof=False,
- read_callback="nofilter"):
- '''count the number of reads in :term:`region`
-
- The region is specified by :term:`reference`, `start` and
- `end`. Alternatively, a :term:`samtools` :term:`region` string
- can be supplied.
-
- A :term:`SAM` file does not allow random access and if
- `region` or `reference` are given, an exception is raised.
-
- Parameters
- ----------
-
- reference : string
- reference_name of the genomic region (chromosome)
-
- start : int
- start of the genomic region
-
- end : int
- end of the genomic region
-
- region : string
- a region string in samtools format.
-
- until_eof : bool
- count until the end of the file, possibly including
- unmapped reads as well.
-
- read_callback: string or function
-
- select a call-back to ignore reads when counting. It can
- be either a string with the following values:
-
- ``all``
- skip reads in which any of the following
- flags are set: BAM_FUNMAP, BAM_FSECONDARY, BAM_FQCFAIL,
- BAM_FDUP
-
- ``nofilter``
- uses every single read
-
- Alternatively, `read_callback` can be a function
- ``check_read(read)`` that should return True only for
- those reads that shall be included in the counting.
-
- Raises
- ------
-
- ValueError
- if the genomic coordinates are out of range or invalid.
-
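- A minimal usage sketch (file name, region and callback are
- illustrative only)::
-
-     with pysam.AlignmentFile("ex1.bam", "rb") as samfile:
-         n = samfile.count(
-             "chr1", 100, 200,
-             read_callback=lambda read: not read.is_duplicate)
-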
- '''
- cdef AlignedSegment read
- cdef long counter = 0
-
- if not self.is_open():
- raise ValueError( "I/O operation on closed file" )
-
- cdef int filter_method = 0
- if read_callback == "all":
- filter_method = 1
- elif read_callback == "nofilter":
- filter_method = 2
-
- for read in self.fetch(reference=reference,
- start=start,
- end=end,
- region=region,
- until_eof=until_eof):
- # apply filter
- if filter_method == 1:
- # filter = "all"
- if (read.flag & (0x4 | 0x100 | 0x200 | 0x400)):
- continue
- elif filter_method == 2:
- # filter = "nofilter"
- pass
- else:
- if not read_callback(read):
- continue
- counter += 1
-
- return counter
-
- @cython.boundscheck(False) # we do manual bounds checking
- def count_coverage(self,
- reference=None,
- start=None,
- end=None,
- region=None,
- quality_threshold=15,
- read_callback='all'):
- """count the coverage of genomic positions by reads in :term:`region`.
-
- The region is specified by :term:`reference`, `start` and
- `end`. Alternatively, a :term:`samtools` :term:`region` string
- can be supplied. The coverage is computed per-base [ACGT].
-
- Parameters
- ----------
-
- reference : string
- reference_name of the genomic region (chromosome)
-
- start : int
- start of the genomic region
-
- end : int
- end of the genomic region
-
- region : string
- a region string.
-
- quality_threshold : int
- quality_threshold is the minimum quality score (in phred) a
- base has to reach to be counted.
-
- read_callback: string or function
-
- select a call-back to ignore reads when counting. It can
- be either a string with the following values:
-
- ``all``
- skip reads in which any of the following
- flags are set: BAM_FUNMAP, BAM_FSECONDARY, BAM_FQCFAIL,
- BAM_FDUP
-
- ``nofilter``
- uses every single read
-
- Alternatively, `read_callback` can be a function
- ``check_read(read)`` that should return True only for
- those reads that shall be included in the counting.
-
- Raises
- ------
-
- ValueError
- if the genomic coordinates are out of range or invalid.
-
- Returns
- -------
-
- tuple : four array.arrays of the same length, in order A, C, G, T
-
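- A minimal usage sketch (file name and region are illustrative
- only)::
-
-     with pysam.AlignmentFile("ex1.bam", "rb") as samfile:
-         count_a, count_c, count_g, count_t = samfile.count_coverage(
-             "chr1", 100, 200)
-         depth = [a + c + g + t for a, c, g, t in
-                  zip(count_a, count_c, count_g, count_t)]
-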
- """
-
- cdef int _start = start
- cdef int _stop = end
- cdef int length = _stop - _start
- cdef c_array.array int_array_template = array.array('L', [])
- cdef c_array.array count_a
- cdef c_array.array count_c
- cdef c_array.array count_g
- cdef c_array.array count_t
- count_a = c_array.clone(int_array_template, length, zero=True)
- count_c = c_array.clone(int_array_template, length, zero=True)
- count_g = c_array.clone(int_array_template, length, zero=True)
- count_t = c_array.clone(int_array_template, length, zero=True)
-
- cdef AlignedSegment read
- cdef cython.str seq
- cdef c_array.array quality
- cdef int qpos
- cdef int refpos
- cdef int c = 0
- cdef int filter_method = 0
- if read_callback == "all":
- filter_method = 1
- elif read_callback == "nofilter":
- filter_method = 2
-
- cdef int _threshold = quality_threshold
- for read in self.fetch(reference=reference,
- start=start,
- end=end,
- region=region):
- # apply filter
- if filter_method == 1:
- # filter = "all"
- if (read.flag & (0x4 | 0x100 | 0x200 | 0x400)):
- continue
- elif filter_method == 2:
- # filter = "nofilter"
- pass
- else:
- if not read_callback(read):
- continue
-
- # count
- seq = read.seq
- quality = read.query_qualities
- for qpos, refpos in read.get_aligned_pairs(True):
- if qpos is not None and refpos is not None and \
- _start <= refpos < _stop:
- if quality[qpos] >= _threshold:
- if seq[qpos] == 'A':
- count_a.data.as_ulongs[refpos - _start] += 1
- if seq[qpos] == 'C':
- count_c.data.as_ulongs[refpos - _start] += 1
- if seq[qpos] == 'G':
- count_g.data.as_ulongs[refpos - _start] += 1
- if seq[qpos] == 'T':
- count_t.data.as_ulongs[refpos - _start] += 1
-
- return count_a, count_c, count_g, count_t
-
- def close(self):
- '''
- closes the :class:`pysam.AlignmentFile`.'''
- if self.htsfile != NULL:
- hts_close(self.htsfile)
- hts_idx_destroy(self.index)
- self.htsfile = NULL
-
- def __dealloc__(self):
- # remember: dealloc cannot call other methods
- # note: no doc string
- # note: __del__ is not called.
-
- # FIXME[kbj]: isn't self.close a method? I've been duplicating
- # close within __dealloc__ (see BCFFile.__dealloc__). Not a pretty
- # solution and perhaps unnecessary given that calling self.close has
- # been working for years.
- # AH: I have removed the call to close. Even though it is working,
- # it seems to be dangerous according to the documentation as the
- # object may be partially deconstructed already.
- if self.htsfile != NULL:
- hts_close(self.htsfile)
- hts_idx_destroy(self.index)
- self.htsfile = NULL
-
- bam_destroy1(self.b)
- if self.header != NULL:
- bam_hdr_destroy(self.header)
-
- cpdef int write(self, AlignedSegment read) except -1:
- '''
- write a single :class:`pysam.AlignedSegment` to disk.
-
- Raises
- ------
- ValueError
- if the writing failed
-
- Returns
- -------
-
- int : the number of bytes written. If the file is closed,
- this will be 0.
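-
- A minimal usage sketch for copying reads between files (file
- names are illustrative only)::
-
-     with pysam.AlignmentFile("in.bam", "rb") as inf, \
-          pysam.AlignmentFile("out.bam", "wb", template=inf) as outf:
-         for read in inf:
-             outf.write(read)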
- '''
- if not self.is_open():
- return 0
-
- cdef int ret
-
- with nogil:
- ret = sam_write1(self.htsfile,
- self.header,
- read._delegate)
-
- # kbj: Still need to raise an exception with except -1. Otherwise
- # when ret == -1 we get a "SystemError: error return without
- # exception set".
- if ret < 0:
- raise ValueError('sam write failed')
-
- return ret
-
- # context manager interface
- def __enter__(self):
- return self
-
- def __exit__(self, exc_type, exc_value, traceback):
- self.close()
- return False
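-
- # The context manager protocol above allows usage such as
- # (file name illustrative only):
- #
- #     with AlignmentFile("ex1.bam", "rb") as samfile:
- #         for read in samfile:
- #             ...
- #
- # so that close() is called even when an exception is raised.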
-
- ###############################################################
- ###############################################################
- ###############################################################
- ## properties
- ###############################################################
- property closed:
- """bool indicating the current state of the file object.
- This is a read-only attribute; the close() method changes the value.
- """
- def __get__(self):
- return not self.is_open()
-
- property filename:
- """filename associated with this object. This is a read-only attribute."""
- def __get__(self):
- return self._filename
-
- property nreferences:
- """"int with the number of :term:`reference` sequences in the file.
- This is a read-only attribute."""
- def __get__(self):
- if not self.is_open():
- raise ValueError("I/O operation on closed file")
- return self.header.n_targets
-
- property references:
- """tuple with the names of :term:`reference` sequences. This is a
- read-only attribute"""
- def __get__(self):
- if not self.is_open(): raise ValueError( "I/O operation on closed file" )
- t = []
- for x from 0 <= x < self.header.n_targets:
- t.append(charptr_to_str(self.header.target_name[x]))
- return tuple(t)
-
- property lengths:
- """tuple of the lengths of the :term:`reference` sequences. This is a
- read-only attribute. The lengths are in the same order as
- :attr:`pysam.AlignmentFile.references`
-
- """
- def __get__(self):
- if not self.is_open():
- raise ValueError("I/O operation on closed file")
- t = []
- for x from 0 <= x < self.header.n_targets:
- t.append(self.header.target_len[x])
- return tuple(t)
-
- property mapped:
- """int with total number of mapped alignments according to the
- statistics recorded in the index. This is a read-only
- attribute.
- """
- def __get__(self):
- self.check_index()
- cdef int tid
- cdef uint64_t total = 0
- cdef uint64_t mapped, unmapped
- for tid from 0 <= tid < self.header.n_targets:
- with nogil:
- hts_idx_get_stat(self.index, tid, &mapped, &unmapped)
- total += mapped
- return total
-
- property unmapped:
- """int with total number of unmapped reads according to the statistics
- recorded in the index. This number of reads includes the number of reads
- without coordinates. This is a read-only attribute.
- """
- def __get__(self):
- self.check_index()
- cdef int tid
- cdef uint64_t total = hts_idx_get_n_no_coor(self.index)
- cdef uint64_t mapped, unmapped
- for tid from 0 <= tid < self.header.n_targets:
- with nogil:
- hts_idx_get_stat(self.index, tid, &mapped, &unmapped)
- total += unmapped
- return total
-
- property nocoordinate:
- """int with total number of reads without coordinates according to the
- statistics recorded in the index. This is a read-only attribute.
- """
- def __get__(self):
- self.check_index()
- cdef uint64_t n
- with nogil:
- n = hts_idx_get_n_no_coor(self.index)
- return n
-
- property format:
- '''string describing the file format'''
- def __get__(self):
- if not self.is_open():
- raise ValueError( "I/O operation on closed file" )
- return hts_format_description(&self.htsfile.format)
-
- property text:
- '''string with the full contents of the :term:`sam file` header as a
- string.
-
- This is a read-only attribute.
-
- See :attr:`pysam.AlignmentFile.header` to get a parsed
- representation of the header.
- '''
- def __get__(self):
- if not self.is_open():
- raise ValueError( "I/O operation on closed file" )
- return from_string_and_size(self.header.text, self.header.l_text)
-
- property header:
- """two-level dictionay with header information from the file.
-
- This is a read-only attribute.
-
- The first level contains the record (``HD``, ``SQ``, etc) and
- the second level contains the fields (``VN``, ``LN``, etc).
-
- The parser is validating and will raise an AssertionError if
- it encounters any record or field tags that are not part of
- the SAM specification. Use the
- :attr:`pysam.AlignmentFile.text` attribute to get the unparsed
- header.
-
- The parsing follows the SAM format specification with the
- exception of the ``CL`` field. This option will consume the
- rest of a header line irrespective of any additional fields.
- This behaviour has been added to accommodate command line
- options that contain characters that are not valid field
- separators.
-
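- A minimal access sketch, assuming ``samfile`` is an open
- :class:`~pysam.AlignmentFile` (tag values are illustrative
- only)::
-
-     header = samfile.header
-     version = header["HD"]["VN"]
-     first_contig = header["SQ"][0]["SN"]
-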
- """
- def __get__(self):
- if not self.is_open():
- raise ValueError( "I/O operation on closed file" )
-
- result = {}
-
- if self.header.text != NULL:
- # convert to python string (note: call self.text to
- # create 0-terminated string)
- t = self.text
- for line in t.split("\n"):
- if not line.strip(): continue
- assert line.startswith("@"), \
- "header line without '@': '%s'" % line
- fields = line[1:].split("\t")
- record = fields[0]
- assert record in VALID_HEADER_TYPES, \
- "header line with invalid type '%s': '%s'" % (record, line)
-
- # treat comments
- if record == "CO":
- if record not in result:
- result[record] = []
- result[record].append("\t".join( fields[1:]))
- continue
- # the following is clumsy as generators do not work?
- x = {}
-
- for idx, field in enumerate(fields[1:]):
- if ":" not in field:
- raise ValueError("malformatted header: no ':' in field" )
- key, value = field.split(":", 1)
- if key in ("CL",):
- # special treatment for command line
- # statements (CL). These might contain
- # characters that are non-conformant with
- # the valid field separators in the SAM
- # header. Thus, in contravention to the
- # SAM API, consume the rest of the line.
- key, value = "\t".join(fields[idx+1:]).split(":", 1)
- x[key] = KNOWN_HEADER_FIELDS[record][key](value)
- break
-
- # interpret type of known header record tags, default to str
- x[key] = KNOWN_HEADER_FIELDS[record].get(key, str)(value)
-
- if VALID_HEADER_TYPES[record] == dict:
- if record in result:
- raise ValueError(
- "multiple '%s' lines are not permitted" % record)
-
- result[record] = x
- elif VALID_HEADER_TYPES[record] == list:
- if record not in result: result[record] = []
- result[record].append(x)
-
- # if there are no SQ lines in the header, add the
- # reference names from the information in the bam
- # file.
- #
- # Background: c-samtools keeps the textual part of the
- # header separate from the list of reference names and
- # lengths. Thus, if a header contains only SQ lines,
- # the SQ information is not part of the textual header
- # and thus are missing from the output. See issue 84.
- if "SQ" not in result:
- sq = []
- for ref, length in zip(self.references, self.lengths):
- sq.append({'LN': length, 'SN': ref })
- result["SQ"] = sq
-
- return result
-
- ###############################################################
- ## file-object like iterator access
- ## note: concurrent access will cause errors (see IteratorRow
- ## and multiple_iterators)
- ## Possible solutions: deprecate or open new file handle
- def __iter__(self):
- if not self.is_open():
- raise ValueError("I/O operation on closed file")
-
- if not self.is_bam and self.header.n_targets == 0:
- raise NotImplementedError(
- "can not iterate over samfile without header")
- return self
-
- cdef bam1_t * getCurrent( self ):
- return self.b
-
- cdef int cnext(self):
- '''
- cversion of iterator. Used by :class:`pysam.AlignmentFile.IteratorColumn`.
- '''
- cdef int ret
- with nogil:
- ret = sam_read1(self.htsfile,
- self.header,
- self.b)
- return ret
-
- def __next__(self):
- cdef int ret = self.cnext()
- if (ret >= 0):
- return makeAlignedSegment(self.b, self)
- elif ret == -2:
- raise IOError('truncated file')
- else:
- raise StopIteration
-
- # Compatibility functions for pysam < 0.8.3
- def gettid(self, reference):
- """deprecated, use get_tid() instead"""
- return self.get_tid(reference)
-
- def getrname(self, tid):
- """deprecated, use get_reference_name() instead"""
- return self.get_reference_name(tid)
-
-
-cdef class IteratorRow:
- '''abstract base class for iterators over mapped reads.
-
- Various iterators implement different behaviours for wrapping around
- contig boundaries. Examples include:
-
- :class:`pysam.IteratorRowRegion`
- iterate within a single contig and a defined region.
-
- :class:`pysam.IteratorRowAll`
- iterate until EOF. This iterator will also include unmapped reads.
-
- :class:`pysam.IteratorRowAllRefs`
- iterate over all reads in all reference sequences.
-
- The method :meth:`AlignmentFile.fetch` returns an IteratorRow.
-
- .. note::
-
- It is usually not necessary to create an object of this class
- explicitly. It is returned as a result of call to a
- :meth:`AlignmentFile.fetch`.
-
- '''
-
- def __init__(self, AlignmentFile samfile, int multiple_iterators=False):
- cdef char *cfilename
- cdef char *creference_filename
-
- if not samfile.is_open():
- raise ValueError("I/O operation on closed file")
-
- # makes sure that samfile stays alive as long as the
- # iterator is alive
- self.samfile = samfile
-
- # reopen the file - note that this makes the iterator
- # slow and causes pileup to slow down significantly.
- if multiple_iterators:
- cfilename = samfile._filename
- with nogil:
- self.htsfile = hts_open(cfilename, 'r')
- assert self.htsfile != NULL
- # read header - required for accurate positioning
- # could a tell/seek work?
- with nogil:
- self.header = sam_hdr_read(self.htsfile)
- assert self.header != NULL
- self.owns_samfile = True
- # options specific to CRAM files
- if samfile.is_cram and samfile._reference_filename:
- creference_filename = samfile._reference_filename
- hts_set_opt(self.htsfile,
- CRAM_OPT_REFERENCE,
- creference_filename)
-
- else:
- self.htsfile = self.samfile.htsfile
- self.owns_samfile = False
- self.header = self.samfile.header
-
- self.retval = 0
-
- self.b = bam_init1()
-
- def __dealloc__(self):
- bam_destroy1(self.b)
- if self.owns_samfile:
- hts_close(self.htsfile)
- bam_hdr_destroy(self.header)
-
-
-cdef class IteratorRowRegion(IteratorRow):
- """*(AlignmentFile samfile, int tid, int beg, int end,
- int multiple_iterators=False)*
-
- iterate over mapped reads in a region.
-
- .. note::
-
- It is usually not necessary to create an object of this class
- explicitly. It is returned as a result of call to a
- :meth:`AlignmentFile.fetch`.
-
- """
-
- def __init__(self, AlignmentFile samfile,
- int tid, int beg, int end,
- int multiple_iterators=False):
-
- IteratorRow.__init__(self, samfile,
- multiple_iterators=multiple_iterators)
-
- if not samfile.has_index():
- raise ValueError("no index available for iteration")
-
- with nogil:
- self.iter = sam_itr_queryi(
- self.samfile.index,
- tid,
- beg,
- end)
-
- def __iter__(self):
- return self
-
- cdef bam1_t * getCurrent(self):
- return self.b
-
- cdef int cnext(self):
- '''cversion of iterator. Used by IteratorColumn'''
- with nogil:
- self.retval = hts_itr_next(hts_get_bgzfp(self.htsfile),
- self.iter,
- self.b,
- self.htsfile)
-
- def __next__(self):
- self.cnext()
- if self.retval >= 0:
- return makeAlignedSegment(self.b, self.samfile)
- elif self.retval == -2:
- # Note: it is currently not the case that hts_iter_next
- # returns -2 for a truncated file.
- # See https://github.com/pysam-developers/pysam/pull/50#issuecomment-64928625
- raise IOError('truncated file')
- else:
- raise StopIteration
-
- def __dealloc__(self):
- hts_itr_destroy(self.iter)
-
-
-cdef class IteratorRowHead(IteratorRow):
- """*(AlignmentFile samfile, n, int multiple_iterators=False)*
-
- iterate over first n reads in `samfile`
-
- .. note::
- It is usually not necessary to create an object of this class
- explicitly. It is returned as a result of call to a
- :meth:`AlignmentFile.head`.
-
- """
-
- def __init__(self, AlignmentFile samfile, int n,
- int multiple_iterators=False):
-
- IteratorRow.__init__(self, samfile,
- multiple_iterators=multiple_iterators)
-
- self.max_rows = n
- self.current_row = 0
-
- def __iter__(self):
- return self
-
- cdef bam1_t * getCurrent( self ):
- return self.b
-
- cdef int cnext(self):
- '''cversion of iterator. Used by IteratorColumn'''
- cdef int ret
- with nogil:
- ret = sam_read1(self.htsfile,
- self.samfile.header,
- self.b)
- return ret
-
- def __next__(self):
- if self.current_row >= self.max_rows:
- raise StopIteration
-
- cdef int ret = self.cnext()
- if ret >= 0:
- self.current_row += 1
- return makeAlignedSegment(self.b, self.samfile)
- elif ret == -2:
- raise IOError('truncated file')
- else:
- raise StopIteration
-
-
-cdef class IteratorRowAll(IteratorRow):
- """*(AlignmentFile samfile, int multiple_iterators=False)*
-
- iterate over all reads in `samfile`
-
- .. note::
-
- It is usually not necessary to create an object of this class
- explicitly. It is returned as a result of call to a
- :meth:`AlignmentFile.fetch`.
-
- """
-
- def __init__(self, AlignmentFile samfile,
- int multiple_iterators=False):
-
- IteratorRow.__init__(self, samfile,
- multiple_iterators=multiple_iterators)
-
- def __iter__(self):
- return self
-
- cdef bam1_t * getCurrent( self ):
- return self.b
-
- cdef int cnext(self):
- '''cversion of iterator. Used by IteratorColumn'''
- cdef int ret
- with nogil:
- ret = sam_read1(self.htsfile,
- self.samfile.header,
- self.b)
- return ret
-
- def __next__(self):
- cdef int ret = self.cnext()
- if ret >= 0:
- return makeAlignedSegment(self.b, self.samfile)
- elif ret == -2:
- raise IOError('truncated file')
- else:
- raise StopIteration
-
-
-cdef class IteratorRowAllRefs(IteratorRow):
- """iterates over all mapped reads by chaining iterators over each
- reference
-
- .. note::
- It is usually not necessary to create an object of this class
- explicitly. It is returned as a result of call to a
- :meth:`AlignmentFile.fetch`.
-
- """
-
- def __init__(self, AlignmentFile samfile,
- multiple_iterators=False):
-
- IteratorRow.__init__(self, samfile,
- multiple_iterators=multiple_iterators)
-
- if not samfile.has_index():
- raise ValueError("no index available for fetch")
-
- self.tid = -1
-
- def nextiter(self):
- # get a new iterator for a chromosome. The file
- # will not be re-opened.
- self.rowiter = IteratorRowRegion(self.samfile,
- self.tid,
- 0,
- 1<<29)
- # set htsfile and header of the rowiter
- # to the values in this iterator to reflect multiple_iterators
- self.rowiter.htsfile = self.htsfile
- self.rowiter.header = self.header
-
- # make sure the iterator understand that IteratorRowAllRefs
- # has ownership
- self.rowiter.owns_samfile = False
-
- def __iter__(self):
- return self
-
- def __next__(self):
- # Create an initial iterator
- if self.tid == -1:
- if not self.samfile.nreferences:
- raise StopIteration
- self.tid = 0
- self.nextiter()
-
- while 1:
- self.rowiter.cnext()
-
- # If current iterator is not exhausted, return aligned read
- if self.rowiter.retval > 0:
- return makeAlignedSegment(self.rowiter.b, self.samfile)
-
- self.tid += 1
-
- # Otherwise, proceed to next reference or stop
- if self.tid < self.samfile.nreferences:
- self.nextiter()
- else:
- raise StopIteration
-
-
-cdef class IteratorRowSelection(IteratorRow):
- """*(AlignmentFile samfile)*
-
- iterate over reads in `samfile` at a given list of file positions.
-
- .. note::
- It is usually not necessary to create an object of this class
- explicitly. It is returned as a result of call to a :meth:`AlignmentFile.fetch`.
- """
-
- def __init__(self, AlignmentFile samfile, positions, int multiple_iterators=True):
-
- IteratorRow.__init__(self, samfile, multiple_iterators=multiple_iterators)
-
- self.positions = positions
- self.current_pos = 0
-
- def __iter__(self):
- return self
-
- cdef bam1_t * getCurrent(self):
- return self.b
-
- cdef int cnext(self):
- '''cversion of iterator'''
- # end iteration if out of positions
- if self.current_pos >= len(self.positions): return -1
-
- cdef uint64_t pos = self.positions[self.current_pos]
- with nogil:
- bgzf_seek(hts_get_bgzfp(self.htsfile),
- pos,
- 0)
- self.current_pos += 1
-
- cdef int ret
- with nogil:
- ret = sam_read1(self.htsfile,
- self.samfile.header,
- self.b)
- return ret
-
- def __next__(self):
- cdef int ret = self.cnext()
- if (ret >= 0):
- return makeAlignedSegment(self.b, self.samfile)
- elif (ret == -2):
- raise IOError('truncated file')
- else:
- raise StopIteration
-
-
-cdef int __advance_nofilter(void *data, bam1_t *b):
- '''advance without any read filtering.
- '''
- cdef __iterdata * d
- d = <__iterdata*>data
- cdef int ret
- with nogil:
- ret = sam_itr_next(d.htsfile, d.iter, b)
- return ret
-
-
-cdef int __advance_all(void *data, bam1_t *b):
- '''only use reads for pileup passing basic
- filters:
-
- BAM_FUNMAP, BAM_FSECONDARY, BAM_FQCFAIL, BAM_FDUP
- '''
-
- cdef __iterdata * d
- cdef int mask = BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP
- d = <__iterdata*>data
- cdef int ret
- with nogil:
- ret = sam_itr_next(d.htsfile, d.iter, b)
- while ret >= 0 and b.core.flag & mask:
- with nogil:
- ret = sam_itr_next(d.htsfile, d.iter, b)
- return ret
-
-
-cdef int __advance_snpcalls(void * data, bam1_t * b):
- '''advance using same filter and read processing as in
- the samtools pileup.
- '''
-
- # Note that this method requires access to some
- # functions in the samtools code base and is thus
- # not htslib only.
- # The functions accessed in samtools are:
- # 1. bam_prob_realn
- # 2. bam_cap_mapQ
- cdef __iterdata * d
- d = <__iterdata*>data
-
- cdef int ret
- cdef int skip = 0
- cdef int q
- cdef int is_cns = 1
- cdef int is_nobaq = 0
- cdef int capQ_thres = 0
-
- with nogil:
- ret = sam_itr_next(d.htsfile, d.iter, b)
-
- # reload sequence
- if d.fastafile != NULL and b.core.tid != d.tid:
- if d.seq != NULL:
- free(d.seq)
- d.tid = b.core.tid
- with nogil:
- d.seq = faidx_fetch_seq(
- d.fastafile,
- d.header.target_name[d.tid],
- 0, MAX_POS,
- &d.seq_len)
-
- if d.seq == NULL:
- raise ValueError(
- "reference sequence for '%s' (tid=%i) not found" % \
- (d.header.target_name[d.tid],
- d.tid))
-
- while ret >= 0:
- skip = 0
-
- # realign read - changes base qualities
- if d.seq != NULL and is_cns and not is_nobaq:
- bam_prob_realn(b, d.seq)
-
- if d.seq != NULL and capQ_thres > 10:
- q = bam_cap_mapQ(b, d.seq, capQ_thres)
- if q < 0:
- skip = 1
- elif b.core.qual > q:
- b.core.qual = q
- if b.core.flag & BAM_FUNMAP:
- skip = 1
- elif b.core.flag & 1 and not b.core.flag & 2:
- skip = 1
-
- if not skip:
- break
- # additional filters
-
- with nogil:
- ret = sam_itr_next(d.htsfile, d.iter, b)
-
- return ret
-
-cdef class IteratorColumn:
- '''abstract base class for iterators over columns.
-
- IteratorColumn objects wrap the pileup functionality of samtools.
-
- For reasons of efficiency, the iterator points to the current
- pileup buffer. The pileup buffer is updated at every iteration.
- This might cause some unexpected behaviour. For example,
- consider the conversion to a list::
-
- f = AlignmentFile("file.bam", "rb")
- result = list( f.pileup() )
-
- Here, ``result`` will contain ``n`` objects of type
- :class:`~pysam.PileupColumn` for ``n`` columns, but each object in
- ``result`` will contain the same information.
-
- The desired behaviour can be achieved by list comprehension::
-
- result = [ x.pileups() for x in f.pileup() ]
-
- ``result`` will be a list of ``n`` lists of objects of type
- :class:`~pysam.PileupRead`.
-
- If the iterator is associated with a :class:`~pysam.Fastafile` using the
- :meth:`addReference` method, then the iterator will export the
- current sequence via the methods :meth:`getSequence` and
- :meth:`seq_len`.
-
- Optional kwargs to the iterator:
-
- stepper
- The stepper controls how the iterator advances.
-
- Valid values are None, "all" (default), "nofilter" or "samtools".
-
- See AlignmentFile.pileup for description.
-
- fastafile
- A :class:`~pysam.FastaFile` object
-
- max_depth
- maximum read depth. The default is 8000.
-
- '''
-
- def __cinit__( self, AlignmentFile samfile, **kwargs ):
- self.samfile = samfile
- self.fastafile = kwargs.get("fastafile", None)
- self.stepper = kwargs.get("stepper", None)
- self.max_depth = kwargs.get("max_depth", 8000)
- self.iterdata.seq = NULL
- self.tid = 0
- self.pos = 0
- self.n_plp = 0
- self.plp = NULL
- self.pileup_iter = <bam_plp_t>NULL
-
- def __iter__(self):
- return self
-
- cdef int cnext(self):
- '''perform next iteration.
- '''
- # do not release gil here because of call-backs
- self.plp = bam_plp_auto(self.pileup_iter,
- &self.tid,
- &self.pos,
- &self.n_plp)
-
- cdef char * getSequence(self):
- '''return current reference sequence underlying the iterator.
- '''
- return self.iterdata.seq
-
- property seq_len:
- '''current sequence length.'''
- def __get__(self):
- return self.iterdata.seq_len
-
- def addReference(self, Fastafile fastafile):
- '''
- add reference sequences in `fastafile` to iterator.'''
- self.fastafile = fastafile
- if self.iterdata.seq != NULL:
- free(self.iterdata.seq)
- self.iterdata.tid = -1
- self.iterdata.fastafile = self.fastafile.fastafile
-
- def hasReference(self):
- '''
- return true if iterator is associated with a reference'''
- return self.fastafile
-
- cdef setMask(self, mask):
- '''set masking flag in iterator.
-
- reads with bits set in `mask` will be skipped.
- '''
- raise NotImplementedError()
- # self.mask = mask
- # bam_plp_set_mask( self.pileup_iter, self.mask )
-
- cdef setupIteratorData( self,
- int tid,
- int start,
- int end,
- int multiple_iterators=0 ):
- '''setup the iterator structure'''
-
- self.iter = IteratorRowRegion(self.samfile, tid, start, end, multiple_iterators)
- self.iterdata.htsfile = self.samfile.htsfile
- self.iterdata.iter = self.iter.iter
- self.iterdata.seq = NULL
- self.iterdata.tid = -1
- self.iterdata.header = self.samfile.header
-
- if self.fastafile is not None:
- self.iterdata.fastafile = self.fastafile.fastafile
- else:
- self.iterdata.fastafile = NULL
-
- # Free any previously allocated memory before reassigning
- # pileup_iter
- self._free_pileup_iter()
-
- if self.stepper is None or self.stepper == "all":
- with nogil:
- self.pileup_iter = bam_plp_init(
- <bam_plp_auto_f>&__advance_all,
- &self.iterdata)
- elif self.stepper == "nofilter":
- with nogil:
- self.pileup_iter = bam_plp_init(
- <bam_plp_auto_f>&__advance_nofilter,
- &self.iterdata)
- elif self.stepper == "samtools":
- with nogil:
- self.pileup_iter = bam_plp_init(
- <bam_plp_auto_f>&__advance_snpcalls,
- &self.iterdata)
- else:
- raise ValueError(
- "unknown stepper option `%s` in IteratorColumn" % self.stepper)
-
- if self.max_depth:
- with nogil:
- bam_plp_set_maxcnt(self.pileup_iter, self.max_depth)
-
- # bam_plp_set_mask( self.pileup_iter, self.mask )
-
- cdef reset( self, tid, start, end ):
- '''reset iterator position.
-
- This permits using the iterator multiple times without
- having to incur the full set-up costs.
- '''
- self.iter = IteratorRowRegion( self.samfile, tid, start, end, multiple_iterators = 0 )
- self.iterdata.iter = self.iter.iter
-
- # invalidate sequence if different tid
- if self.tid != tid:
- if self.iterdata.seq != NULL:
- free(self.iterdata.seq)
- self.iterdata.seq = NULL
- self.iterdata.tid = -1
-
- # self.pileup_iter = bam_plp_init( &__advancepileup, &self.iterdata )
- with nogil:
- bam_plp_reset(self.pileup_iter)
-
- cdef _free_pileup_iter(self):
- '''free the memory alloc'd by bam_plp_init.
-
- This is needed before setupIteratorData allocates
- another pileup_iter, or else memory will be lost.
- '''
- if self.pileup_iter != <bam_plp_t>NULL:
- with nogil:
- bam_plp_reset(self.pileup_iter)
- bam_plp_destroy(self.pileup_iter)
- self.pileup_iter = <bam_plp_t>NULL
-
- def __dealloc__(self):
- # reset in order to avoid memory leak messages for iterators
- # that have not been fully consumed
- self._free_pileup_iter()
- self.plp = <bam_pileup1_t*>NULL
-
- if self.iterdata.seq != NULL:
- free(self.iterdata.seq)
- self.iterdata.seq = NULL
-
-
-cdef class IteratorColumnRegion(IteratorColumn):
- '''iterates over a region only.
- '''
- def __cinit__(self, AlignmentFile samfile,
- int tid = 0,
- int start = 0,
- int end = MAX_POS,
- int truncate = False,
- **kwargs ):
-
- # initialize iterator
- self.setupIteratorData(tid, start, end, 1)
- self.start = start
- self.end = end
- self.truncate = truncate
-
- def __next__(self):
-
- while 1:
- self.cnext()
- if self.n_plp < 0:
- raise ValueError("error during iteration" )
-
- if self.plp == NULL:
- raise StopIteration
-
- if self.truncate:
- if self.start > self.pos: continue
- if self.pos >= self.end: raise StopIteration
-
- return makePileupColumn(&self.plp,
- self.tid,
- self.pos,
- self.n_plp,
- self.samfile)
-
-
-cdef class IteratorColumnAllRefs(IteratorColumn):
- """iterates over all columns by chaining iterators over each reference
- """
-
- def __cinit__(self,
- AlignmentFile samfile,
- **kwargs):
-
- # no iteration over empty files
- if not samfile.nreferences:
- raise StopIteration
-
- # initialize iterator
- self.setupIteratorData(self.tid, 0, MAX_POS, 1)
-
- def __next__(self):
-
- while 1:
- self.cnext()
-
- if self.n_plp < 0:
- raise ValueError("error during iteration" )
-
- # return result, if within same reference
- if self.plp != NULL:
- return makePileupColumn(&self.plp,
- self.tid,
- self.pos,
- self.n_plp,
- self.samfile)
-
- # otherwise, proceed to next reference or stop
- self.tid += 1
- if self.tid < self.samfile.nreferences:
- self.setupIteratorData(self.tid, 0, MAX_POS, 0)
- else:
- raise StopIteration
-
-
-cdef class SNPCall:
- '''the results of a SNP call.'''
- cdef int _tid
- cdef int _pos
- cdef char _reference_base
- cdef char _genotype
- cdef int _consensus_quality
- cdef int _snp_quality
- cdef int _rms_mapping_quality
- cdef int _coverage
-
- property tid:
- '''the chromosome ID as is defined in the header'''
- def __get__(self):
- return self._tid
-
- property pos:
- '''nucleotide position of SNP.'''
- def __get__(self): return self._pos
-
- property reference_base:
- '''reference base at pos. ``N`` if no reference sequence supplied.'''
- def __get__(self): return from_string_and_size( &self._reference_base, 1 )
-
- property genotype:
- '''the genotype called.'''
- def __get__(self): return from_string_and_size( &self._genotype, 1 )
-
- property consensus_quality:
- '''the genotype quality (Phred-scaled).'''
- def __get__(self): return self._consensus_quality
-
- property snp_quality:
- '''the snp quality (Phred scaled) - probability of consensus being
- identical to reference sequence.'''
- def __get__(self): return self._snp_quality
-
- property mapping_quality:
- '''the root mean square (rms) of the mapping quality of all reads
- involved in the call.'''
- def __get__(self): return self._rms_mapping_quality
-
- property coverage:
- '''coverage or read depth - the number of reads involved in the call.'''
- def __get__(self): return self._coverage
-
- def __str__(self):
-
- return "\t".join( map(str, (
- self.tid,
- self.pos,
- self.reference_base,
- self.genotype,
- self.consensus_quality,
- self.snp_quality,
- self.mapping_quality,
- self.coverage ) ) )
-
-
-cdef class IndexedReads:
- """*(AlignmentFile samfile, multiple_iterators=True)
-
- Index a Sam/BAM-file by query name while keeping the
- original sort order intact.
-
- The index is kept in memory and can be substantial.
-
- By default, the file is re-opened to avoid conflicts if multiple
- operators work on the same file. Set `multiple_iterators` = False
- to not re-open `samfile`.
-
- Parameters
- ----------
-
- samfile : AlignmentFile
- File to be indexed.
-
- multiple_iterators : bool
- Flag indicating whether the file should be reopened. Reopening prevents
- existing iterators being affected by the indexing.
-
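- A minimal usage sketch, assuming ``samfile`` is an open
- :class:`~pysam.AlignmentFile` (the read name is illustrative
- only)::
-
-     index = pysam.IndexedReads(samfile)
-     index.build()
-     for read in index.find("read_12345"):
-         print(read)
-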
- """
-
- def __init__(self, AlignmentFile samfile, int multiple_iterators=True):
- cdef char *cfilename
-
- # makes sure that samfile stays alive as long as this
- # object is alive.
- self.samfile = samfile
-
- assert samfile.is_bam, "can only IndexReads on bam files"
-
- # re-open the file when multiple_iterators is set - note that this
- # makes the iterator slow and causes pileup to slow down significantly.
- if multiple_iterators:
- cfilename = samfile._filename
- with nogil:
- self.htsfile = hts_open(cfilename, 'r')
- assert self.htsfile != NULL
- # read header - required for accurate positioning
- with nogil:
- self.header = sam_hdr_read(self.htsfile)
- self.owns_samfile = True
- else:
- self.htsfile = self.samfile.htsfile
- self.header = self.samfile.header
- self.owns_samfile = False
-
- def build(self):
- '''build the index.'''
-
- self.index = collections.defaultdict(list)
-
- # this method will start indexing from the current file
- # position
- cdef int ret = 1
- cdef bam1_t * b = <bam1_t*>calloc(1, sizeof( bam1_t))
-
- cdef uint64_t pos
-
- while ret > 0:
- with nogil:
- pos = bgzf_tell(hts_get_bgzfp(self.htsfile))
- ret = sam_read1(self.htsfile,
- self.samfile.header,
- b)
- if ret > 0:
- qname = charptr_to_str(pysam_bam_get_qname(b))
- self.index[qname].append(pos)
-
- bam_destroy1(b)
-
- def find(self, query_name):
- '''find `query_name` in index.
-
- Returns
- -------
-
- IteratorRowSelection
- Returns an iterator over all reads with query_name.
-
- Raises
- ------
-
- KeyError
- if the `query_name` is not in the index.
-
- '''
- if query_name in self.index:
- return IteratorRowSelection(
- self.samfile,
- self.index[query_name],
- multiple_iterators = False)
- else:
- raise KeyError("read %s not found" % query_name)
-
- def __dealloc__(self):
- if self.owns_samfile:
- hts_close(self.htsfile)
- bam_hdr_destroy(self.header)
-
-__all__ = [
- "AlignmentFile",
- "IteratorRow",
- "IteratorColumn",
- "IndexedReads"]
+++ /dev/null
-###############################################################################
-###############################################################################
-## Cython wrapper for htslib VCF/BCF reader/writer
-###############################################################################
-#
-# NOTICE: This code is incomplete and preliminary. It is nearly complete as
-# an immutable interface, but has no capability (yet) to mutate the
-# resulting data (beyond dropping all samples). Documentation still
-# needs to be written and a unit test suite is in the works. The
-# code is also specific to Python 2 and will require a bit of work
-# to properly adapt to Python 3.
-#
-###############################################################################
-#
-# The MIT License
-#
-# Copyright (c) 2015 Kevin Jacobs (jacobs@bioinformed.com)
-#
-# Permission is hereby granted, free of charge, to any person obtaining a
-# copy of this software and associated documentation files (the "Software"),
-# to deal in the Software without restriction, including without limitation
-# the rights to use, copy, modify, merge, publish, distribute, sublicense,
-# and/or sell copies of the Software, and to permit persons to whom the
-# Software is furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-# DEALINGS IN THE SOFTWARE.
-#
-###############################################################################
-
-from libc.stdint cimport int8_t, int16_t, int32_t, int64_t
-from libc.stdint cimport uint8_t, uint16_t, uint32_t, uint64_t
-from libc.stdlib cimport malloc, calloc, realloc, free
-from libc.string cimport memcpy, memcmp, strncpy, strlen, strdup
-
-from pysam.chtslib cimport *
-
-
-cdef class VariantHeader(object):
- cdef bcf_hdr_t *ptr
-
- cdef _subset_samples(self, include_samples)
-
-
-cdef class VariantHeaderRecord(object):
- cdef VariantHeader header
- cdef bcf_hrec_t *ptr
-
-
-cdef class VariantHeaderRecords(object):
- cdef VariantHeader header
-
-
-cdef class VariantHeaderContigs(object):
- cdef VariantHeader header
-
-
-cdef class VariantHeaderSamples(object):
- cdef VariantHeader header
-
-
-cdef class VariantContig(object):
- cdef VariantHeader header
- cdef int id
-
-
-cdef class VariantMetadata(object):
- cdef VariantHeader header
- cdef int type
- cdef int id
-
-
-cdef class VariantHeaderMetadata(object):
- cdef VariantHeader header
- cdef int32_t type
-
-
-cdef class VariantRecord(object):
- cdef VariantHeader header
- cdef bcf1_t *ptr
-
-
-cdef class VariantRecordFilter(object):
- cdef VariantRecord record
-
-
-cdef class VariantRecordFormat(object):
- cdef VariantRecord record
-
-
-cdef class VariantRecordInfo(object):
- cdef VariantRecord record
-
-
-cdef class VariantRecordSamples(object):
- cdef VariantRecord record
-
-
-cdef class VariantRecordSample(object):
- cdef VariantRecord record
- cdef readonly int32_t index
-
-
-cdef class BaseIndex(object):
- cdef tuple refs
- cdef dict refmap
-
-
-cdef class BCFIndex(BaseIndex):
- cdef VariantHeader header
- cdef hts_idx_t *ptr
-
-
-cdef class TabixIndex(BaseIndex):
- cdef tbx_t *ptr
-
-
-cdef class BaseIterator(object):
- cdef VariantFile bcf
- cdef hts_itr_t *iter
-
-
-cdef class BCFIterator(BaseIterator):
- cdef BCFIndex index
-
-
-cdef class TabixIterator(BaseIterator):
- cdef TabixIndex index
- cdef kstring_t line_buffer
-
-
-cdef class VariantFile(object):
- cdef htsFile *htsfile # pointer to htsFile structure
- cdef int64_t start_offset # BGZF offset of first record
-
- cdef readonly object filename # filename as supplied by user
- cdef readonly object mode # file opening mode
- cdef readonly object index_filename # filename of index, if supplied by user
-
- cdef readonly VariantHeader header
- cdef readonly BaseIndex index
-
- cdef readonly bint drop_samples # true if sample information is to be ignored
-
- # FIXME: Temporary, use htsFormat when it is available
- cdef readonly bint is_bcf # true if file is a bcf file
- cdef readonly bint is_stream # true if not a seekable file but a stream
- cdef readonly bint is_remote # true if file is not on the local filesystem
- cdef readonly bint is_reading # true if file has begun reading records
-
- cpdef int write(self, VariantRecord record) except -1
+++ /dev/null
-# cython: embedsignature=True
-# cython: profile=True
-###############################################################################
-###############################################################################
-## Cython wrapper for htslib VCF/BCF reader/writer
-###############################################################################
-#
-# NOTICE: This code is incomplete and preliminary. It offers a nearly
-# complete Pythonic interface to VCF/BCF metadata and data with
-# reading and writing capability. It has limited capability to
-# mutate the resulting data. Documentation and a unit test suite
-# are in the works. The code is best tested under Python 2, but
-# should also work with Python 3. Please report any remaining
-# str/bytes issues on the github site when using Python 3 and I'll
-# fix them promptly.
-#
-# Here is a minimal example of how to use the API:
-#
-# $ cat bcfview.py
-# import sys
-# from pysam import VariantFile
-#
-# bcf_in = VariantFile(sys.argv[1]) # auto-detect input format
-# bcf_out = VariantFile('-', 'w', header=bcf_in.header)
-#
-# for rec in bcf_in:
-# bcf_out.write(rec)
-#
-# Performance is fairly close to that of bcftools view. Here is an example
-# using some 1k Genomes data:
-#
-# $ time python bcfview.py ALL.chr22.phase3_shapeit2_mvncall_integrated_v5.20130502.genotypes.bcf |wc -l
-# 1103799
-#
-# real 0m56.114s
-# user 1m4.489s
-# sys 0m3.102s
-#
-# $ time bcftools view ALL.chr22.phase3_shapeit2_mvncall_integrated_v5.20130502.genotypes.bcf |wc -l
-# 1103800 # bcftools adds an extra header
-#
-# real 0m55.126s
-# user 1m3.502s
-# sys 0m3.459s
-#
-# Here is a quick tour through the API::
-#
-# VariantFile(filename, mode=None, header=None, drop_samples=False)
-#
-# Attributes / Properties
-#
-# htsfile: htsFile* [private]
-# start_offset: BGZF offset of first record [private]
-# filename: filename [read only]
-# mode: mode [read only]
-# header: VariantHeader object [read only]
-# index: TabixIndex, BCFIndex or None [read only]
-# drop_samples: sample information is to be ignored [read only]
-#
-# is_stream: file is stdin/stdout [read only]
-# is_remote: file is not on the local filesystem [read only]
-# is_reading: file has begun reading records [read only]
-# category: file format general category [read only]
-# format: file format [read only]
-# version: tuple of (major, minor) format version [read only]
-# compression: file compression
-# description: vaguely human readable description of [read only]
-# file format.
-#
-# Methods:
-# copy()
-# close()
-# open(filename, mode=None, header=None, drop_samples=False)
-# reset()
-# seek(offset)
-# tell()
-# fetch(contig=None, start=None, stop=None, region=None, reopen=False)
-# subset_samples(include_samples)
-#
-# VariantHeader()
-#
-# version: VCF version
-# samples: sequence-like access to samples
-# records: sequence-like access to partially parsed headers
-# contigs: mapping-like object for contig name -> VariantContig
-#
-# filters: mapping-like object for filter name -> VariantMetadata
-# info: mapping-like object for info name -> VariantMetadata
-# formats: mapping-like object for formats name -> VariantMetadata
-#
-# VariantRecord(...)
-#
-# header: VariantHeader object
-# rid: reference id (i.e. tid)
-# chrom: chromosome/contig string
-# contig: synonym for chrom
-# pos: 1-based start position (inclusive)
-# start: 0-based start position (inclusive)
-# stop: 0-based stop position (exclusive)
-# rlen: reference length (stop - start)
-# id: record identifier
-# ref: reference allele
-# alleles: alleles (ref followed by alts)
-# alts: alt alleles
-# qual: quality (float)
-# filter: mapping-like object for filter name -> type info
-# info: mapping-like object for info name -> value
-# format: mapping-like object for format name -> type info
-# samples: mapping-like object of sample genotypes & attrs
-#
-# VariantRecordSample(...)
-#
-# name: sample name
-# index: sample index
-# allele_indices: tuple of allele indices (ref=0, alt=1..len(alts), missing=-1)
-# alleles: tuple of alleles (missing=None)
-#
-# VariantRecordSample is also a mapping object from formats to values
-#
-# VariantContig(...)
-#
-# id: reference id (i.e. tid)
-# name: chromosome/contig string
-# length: contig length if provided, else None
-# header: defining VariantHeaderRecord
-#
-# VariantMetadata(...) # for FILTER, INFO and FORMAT metadata
-#
-# id: internal id
-# name: metadata name
-# type: value data type
-# number: number of values
-# header: defining VariantHeaderRecord
-#
-# VariantHeaderRecord(...) # replace with single tuple of key/value pairs?
-#
-# type: record type
-# key: first record key
-# value: first record value
-# attrs: remaining key/value pairs
-#
-###############################################################################
-#
-# TODO list for next major sprint:
-#
-# * more genotype methods
-# * unit test suite (perhaps py.test based)
-# * documentation
-# * htslib 1.2 format info
-#
-# For later sprints:
-#
-# * ability to create indices
-# * mutable header and record data
-# * pickle support
-# * Python 3 support
-# * left/right locus normalization
-# * parallel iteration (like synced_bcf_reader)
-# * fix reopen to re-use fd
-#
-###############################################################################
-#
-# The MIT License
-#
-# Copyright (c) 2015 Kevin Jacobs (jacobs@bioinformed.com)
-#
-# Permission is hereby granted, free of charge, to any person obtaining a
-# copy of this software and associated documentation files (the "Software"),
-# to deal in the Software without restriction, including without limitation
-# the rights to use, copy, modify, merge, publish, distribute, sublicense,
-# and/or sell copies of the Software, and to permit persons to whom the
-# Software is furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-# DEALINGS IN THE SOFTWARE.
-#
-###############################################################################
-
-from __future__ import division, print_function
-
-import os
-import sys
-
-from libc.string cimport strcmp, strpbrk
-from libc.stdint cimport INT8_MAX, INT16_MAX, INT32_MAX
-
-cimport cython
-
-from cpython.object cimport PyObject
-from cpython.ref cimport Py_INCREF
-from cpython.dict cimport PyDict_GetItemString, PyDict_SetItemString
-from cpython.tuple cimport PyTuple_New, PyTuple_SET_ITEM
-from cpython.bytes cimport PyBytes_FromStringAndSize
-from cpython.unicode cimport PyUnicode_DecodeASCII
-from cpython.version cimport PY_MAJOR_VERSION
-
-from pysam.chtslib cimport hisremote
-
-
-from warnings import warn
-
-
-__all__ = ['VariantFile',
- 'VariantHeader',
- 'VariantHeaderRecord',
- 'VariantRecord']
-
-########################################################################
-########################################################################
-## Constants
-########################################################################
-
-cdef int MAX_POS = 2 << 29
-cdef tuple VALUE_TYPES = ('Flag', 'Integer', 'Float', 'String')
-cdef tuple METADATA_TYPES = ('FILTER', 'INFO', 'FORMAT', 'CONTIG', 'STRUCTURED', 'GENERIC')
-cdef tuple METADATA_LENGTHS = ('FIXED', 'VARIABLE', 'A', 'G', 'R')
-
-cdef tuple FORMAT_CATEGORIES = ('UNKNOWN', 'ALIGNMENTS', 'VARIANTS', 'INDEX', 'REGIONS')
-cdef tuple FORMATS = ('UNKNOWN', 'BINARY_FORMAT', 'TEXT_FORMAT', 'SAM', 'BAM', 'BAI', 'CRAM', 'CRAI',
- 'VCF', 'BCF', 'CSI', 'GZI', 'TBI', 'BED')
-cdef tuple COMPRESSION = ('NONE', 'GZIP', 'BGZF', 'CUSTOM')
-
-########################################################################
-########################################################################
-## Python 3 compatibility functions
-########################################################################
-
-from pysam.cutils cimport force_bytes, force_str, charptr_to_str, charptr_to_str_w_len
-from pysam.cutils cimport encode_filename, from_string_and_size
-
-
-########################################################################
-########################################################################
-## VCF/BCF string intern system
-########################################################################
-
-cdef dict bcf_str_cache = {}
-
-cdef inline bcf_str_cache_get_charptr(const char* s):
- if s == NULL:
- return None
-
- cdef PyObject *pystr = PyDict_GetItemString(bcf_str_cache, s)
- if pystr:
- return <object>pystr
-
- if PY_MAJOR_VERSION < 3:
- val = s
- else:
- val = PyUnicode_DecodeASCII(s, strlen(s), NULL)
-
- PyDict_SetItemString(bcf_str_cache, s, val)
-
- return val
-
-
-########################################################################
-########################################################################
-## Low level type conversion helpers
-########################################################################
-
-
-cdef inline int is_gt_fmt(bcf_hdr_t *hdr, int fmt_id):
- return strcmp(bcf_hdr_int2id(hdr, BCF_DT_ID, fmt_id), "GT") == 0
-
-
-cdef tuple char_array_to_tuple(const char **a, ssize_t n, int free_after=0):
- if not a:
- return None
- try:
- return tuple(charptr_to_str(a[i]) for i in range(n))
- finally:
- if free_after and a:
- free(a)
-
-
-cdef bcf_array_to_object(void *data, int type, ssize_t n, ssize_t count, int scalar):
- cdef char *datac
- cdef int8_t *data8
- cdef int16_t *data16
- cdef int32_t *data32
- cdef float *dataf
- cdef int i
-
- if not data or n <= 0:
- return None
-
- if type == BCF_BT_CHAR:
- datac = <char *>data
- while n and datac[n-1] == bcf_str_vector_end:
- n -= 1
- value = charptr_to_str_w_len(datac, n) if datac[0] != bcf_str_missing else None
- # FIXME: Need to know length? Report errors? Pad with missing values? Not clear what to do.
-
- value = tuple(v or None for v in value.split(',')) if value else ()
- # FIXME: Need to know length? Report errors? Pad with missing values? Not clear what to do.
- else:
- value = []
- if type == BCF_BT_INT8:
- data8 = <int8_t *>data
- for i in range(n):
- if data8[i] == bcf_int8_vector_end:
- break
- value.append(data8[i] if data8[i] != bcf_int8_missing else None)
- elif type == BCF_BT_INT16:
- data16 = <int16_t *>data
- for i in range(n):
- if data16[i] == bcf_int16_vector_end:
- break
- value.append(data16[i] if data16[i] != bcf_int16_missing else None)
- elif type == BCF_BT_INT32:
- data32 = <int32_t *>data
- for i in range(n):
- if data32[i] == bcf_int32_vector_end:
- break
- value.append(data32[i] if data32[i] != bcf_int32_missing else None)
- elif type == BCF_BT_FLOAT:
- dataf = <float *>data
- for i in range(n):
- if bcf_float_is_vector_end(dataf[i]):
- break
- value.append(dataf[i] if not bcf_float_is_missing(dataf[i]) else None)
- else:
- raise TypeError('unsupported info type code')
-
- # FIXME: Need to know length? Report errors? Pad with missing values? Not clear what to do.
- if not value:
- if scalar:
- value = None
- elif count <= 0:
- value = ()
- else:
- value = (None,)*count
- elif scalar and len(value) == 1:
- value = value[0]
- else:
- value = tuple(value)
-
- return value
-
-
-cdef bcf_object_to_array(values, void *data, int bt_type, ssize_t n, int vlen):
- cdef char *datac
- cdef int8_t *datai8
- cdef int16_t *datai16
- cdef int32_t *datai32
- cdef float *dataf
- cdef ssize_t i, value_count = len(values)
-
- assert(value_count <= n)
-
- if bt_type == BCF_BT_CHAR:
- if not isinstance(values, (str, bytes)):
- values = b','.join(force_bytes(v) if v is not None else b'' for v in values)
- value_count = len(values)
- assert(value_count <= n)
- datac = <char *>data
- memcpy(datac, <char *>values, value_count)
- for i in range(value_count, n):
- datac[i] = 0
- elif bt_type == BCF_BT_INT8:
- datai8 = <int8_t *>data
- for i in range(value_count):
- val = values[i]
- datai8[i] = val if val is not None else bcf_int8_missing
- for i in range(value_count, n):
- datai8[i] = bcf_int8_vector_end
- elif bt_type == BCF_BT_INT16:
- datai16 = <int16_t *>data
- for i in range(value_count):
- val = values[i]
- datai16[i] = val if val is not None else bcf_int16_missing
- for i in range(value_count, n):
- datai16[i] = bcf_int16_vector_end
- elif bt_type == BCF_BT_INT32:
- datai32 = <int32_t *>data
- for i in range(value_count):
- val = values[i]
- datai32[i] = val if val is not None else bcf_int32_missing
- for i in range(value_count, n):
- datai32[i] = bcf_int32_vector_end
- elif bt_type == BCF_BT_FLOAT:
- dataf = <float *>data
- for i in range(value_count):
- val = values[i]
- if val is None:
- bcf_float_set(dataf + i, bcf_float_missing)
- else:
- dataf[i] = val
- for i in range(value_count, n):
- bcf_float_set(dataf + i, bcf_float_vector_end)
- else:
- raise TypeError('unsupported type')
-
-
-cdef bcf_empty_array(int type, ssize_t n, int vlen):
- cdef char *datac
- cdef int32_t *data32
- cdef float *dataf
- cdef int i
-
- if n <= 0:
- raise ValueError('Cannot create empty array')
-
- if type == BCF_HT_STR:
- value = PyBytes_FromStringAndSize(NULL, sizeof(char)*n)
- datac = <char *>value
- for i in range(n):
- datac[i] = bcf_str_missing if not vlen else bcf_str_vector_end
- elif type == BCF_HT_INT:
- value = PyBytes_FromStringAndSize(NULL, sizeof(int32_t)*n)
- data32 = <int32_t *><char *>value
- for i in range(n):
- data32[i] = bcf_int32_missing if not vlen else bcf_int32_vector_end
- elif type == BCF_HT_REAL:
- value = PyBytes_FromStringAndSize(NULL, sizeof(float)*n)
- dataf = <float *><char *>value
- for i in range(n):
- bcf_float_set(dataf + i, bcf_float_missing if not vlen else bcf_float_vector_end)
- else:
- raise TypeError('unsupported header type code')
-
- return value
-
-
-cdef bcf_copy_expand_array(void *src_data, int src_type, ssize_t src_values,
- void *dst_data, int dst_type, ssize_t dst_values,
- int vlen):
- cdef char *src_datac
- cdef char *dst_datac
- cdef int8_t *src_datai8
- cdef int16_t *src_datai16
- cdef int32_t *src_datai32
- cdef int32_t *dst_datai
- cdef float *src_dataf
- cdef float *dst_dataf
- cdef ssize_t src_size, dst_size, i, j
- cdef int val
-
- if src_values > dst_values:
- raise ValueError('Cannot copy arrays with src_values={} > dst_values={}'.format(src_values, dst_values))
-
- if src_type == dst_type == BCF_BT_CHAR:
- src_datac = <char *>src_data
- dst_datac = <char *>dst_data
- memcpy(dst_datac, src_datac, src_values)
- for i in range(src_values, dst_values):
- dst_datac[i] = 0
- elif src_type == BCF_BT_INT8 and dst_type == BCF_BT_INT32:
- src_datai8 = <int8_t *>src_data
- dst_datai = <int32_t *>dst_data
- for i in range(src_values):
- val = src_datai8[i]
- if val == bcf_int8_missing:
- val = bcf_int32_missing
- elif val == bcf_int8_vector_end:
- val = bcf_int32_vector_end
- dst_datai[i] = val
- for i in range(src_values, dst_values):
- dst_datai[i] = bcf_int32_missing if not vlen else bcf_int32_vector_end
- elif src_type == BCF_BT_INT16 and dst_type == BCF_BT_INT32:
- src_datai16 = <int16_t *>src_data
- dst_datai = <int32_t *>dst_data
- for i in range(src_values):
- val = src_datai16[i]
- if val == bcf_int16_missing:
- val = bcf_int32_missing
- elif val == bcf_int16_vector_end:
- val = bcf_int32_vector_end
- dst_datai[i] = val
- for i in range(src_values, dst_values):
- dst_datai[i] = bcf_int32_missing if not vlen else bcf_int32_vector_end
- elif src_type == BCF_BT_INT32 and dst_type == BCF_BT_INT32:
- src_datai32 = <int32_t *>src_data
- dst_datai = <int32_t *>dst_data
- for i in range(src_values):
- dst_datai[i] = src_datai32[i]
- for i in range(src_values, dst_values):
- dst_datai[i] = bcf_int32_missing if not vlen else bcf_int32_vector_end
- elif src_type == BCF_BT_FLOAT and dst_type == BCF_BT_FLOAT:
- src_dataf = <float *>src_data
- dst_dataf = <float *>dst_data
- for i in range(src_values):
- dst_dataf[i] = src_dataf[i]
- for i in range(src_values, dst_values):
- bcf_float_set(dst_dataf + i, bcf_float_missing if not vlen else bcf_float_vector_end)
- else:
- raise TypeError('unsupported types')
-
-
-cdef bcf_get_value_count(VariantRecord record, int hl_type, int id, ssize_t *count, int *scalar):
- cdef bcf_hdr_t *hdr = record.header.ptr
- cdef bcf1_t *r = record.ptr
- cdef int length = bcf_hdr_id2length(hdr, hl_type, id)
- cdef int number = bcf_hdr_id2number(hdr, hl_type, id)
-
- scalar[0] = 0
-
- if hl_type == BCF_HL_FMT and is_gt_fmt(hdr, id):
- count[0] = number
- elif length == BCF_VL_FIXED:
- if number == 1:
- scalar[0] = 1
- count[0] = number
- elif length == BCF_VL_R:
- count[0] = r.n_allele
- elif length == BCF_VL_A:
- count[0] = r.n_allele - 1
- elif length == BCF_VL_G:
- count[0] = r.n_allele * (r.n_allele + 1) // 2
- elif length == BCF_VL_VAR:
- count[0] = -1
- else:
- raise ValueError('Unknown format length')
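-
- # Worked example of the Number-code mapping above (illustrative only):
- # for a record with n_allele == 3 (one REF plus two ALT alleles),
- #
- # Number=R -> count 3 (one value per allele)
- # Number=A -> count 2 (one value per ALT allele)
- # Number=G -> count 3 * (3 + 1) // 2 == 6 (one per diploid genotype)
- # Number=. -> count -1, i.e. unknown until the field is read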
-
-
-cdef object bcf_info_get_value(VariantRecord record, const bcf_info_t *z):
- cdef bcf_hdr_t *hdr = record.header.ptr
-
- cdef char *s
- cdef ssize_t count
- cdef int scalar
-
- bcf_get_value_count(record, BCF_HL_INFO, z.key, &count, &scalar)
-
- if z.len == 0:
- if bcf_hdr_id2type(hdr, BCF_HL_INFO, z.key) == BCF_HT_FLAG:
- value = True
- elif scalar:
- value = None
- else:
- value = ()
- elif z.len == 1:
- if z.type == BCF_BT_INT8:
- if z.v1.i == bcf_int8_missing:
- value = None
- elif z.v1.i == bcf_int8_vector_end:
- value = ()
- else:
- value = z.v1.i
- elif z.type == BCF_BT_INT16:
- if z.v1.i == bcf_int16_missing:
- value = None
- elif z.v1.i == bcf_int16_vector_end:
- value = ()
- else:
- value = z.v1.i
- elif z.type == BCF_BT_INT32:
- if z.v1.i == bcf_int32_missing:
- value = None
- elif z.v1.i == bcf_int32_vector_end:
- value = ()
- else:
- value = z.v1.i
- elif z.type == BCF_BT_FLOAT:
- if bcf_float_is_missing(z.v1.f):
- value = None
- elif bcf_float_is_vector_end(z.v1.f):
- value = ()
- else:
- value = z.v1.f
- elif z.type == BCF_BT_CHAR:
- value = force_str(chr(z.v1.i))
- else:
- raise TypeError('unsupported info type code')
-
- if not scalar and value != ():
- value = (value,)
- else:
- value = bcf_array_to_object(z.vptr, z.type, z.len, count, scalar)
-
- return value
-
-
-cdef object bcf_check_values(VariantRecord record, value, int hl_type, int ht_type,
- int id, int bt_type, ssize_t bt_len, ssize_t *value_count,
- int *scalar, int *realloc):
-
- bcf_get_value_count(record, hl_type, id, value_count, scalar)
-
- # Normalize the input to a tuple of values
- values = (value,) if not isinstance(value, tuple) else value
-
- # Validate values now that we know the type and size
- if ht_type == BCF_HT_FLAG:
- value_count[0] = 1
-
- if value_count[0] != -1 and value_count[0] != len(values):
- if scalar[0]:
- raise TypeError('value expected to be scalar')
- else:
- raise TypeError('values expected to be {:d}-tuple'.format(value_count[0]))
-
- if ht_type == BCF_HT_REAL:
- for v in values:
- if not(v is None or isinstance(v, (float, int))):
- raise TypeError('invalid value for Float format')
- elif ht_type == BCF_HT_INT:
- for v in values:
- if not(v is None or (isinstance(v, (float, int)) and int(v) == v)):
- raise TypeError('invalid value for Integer format')
- for v in values:
- if not(v is None or bcf_int32_missing < v <= INT32_MAX):
- raise ValueError('Integer value too small/large to store in VCF/BCF')
- elif ht_type == BCF_HT_STR:
- values = b','.join(force_bytes(v) if v is not None else b'' for v in values)
- elif ht_type == BCF_HT_FLAG:
- if values[0] not in (True, False, None, 1, 0):
- raise ValueError('Flag values must be: True, False, None, 1, 0')
- else:
- raise TypeError('unsupported type')
-
- realloc[0] = 0
- if len(values) <= 1 and hl_type == BCF_HL_INFO:
- realloc[0] = 0
- elif len(values) > bt_len:
- realloc[0] = 1
- elif bt_type == BCF_BT_INT8:
- for v in values:
- if v is not None and not(bcf_int8_missing < v <= INT8_MAX):
- realloc[0] = 1
- break
- elif bt_type == BCF_BT_INT16:
- for v in values:
- if v is not None and not(bcf_int16_missing < v <= INT16_MAX):
- realloc[0] = 1
- break
-
- return values
-
-
-cdef bcf_encode_alleles(VariantRecord record, values):
- cdef bcf1_t *r = record.ptr
- cdef int32_t nalleles = r.n_allele
- cdef list gt_values = []
- cdef char *s
- cdef int i
-
- if not values:
- return ()
-
- if not isinstance(values, (list, tuple)):
- values = (values,)
-
- for value in values:
- if value is None:
- gt_values.append(None)
- elif isinstance(value, (str, bytes)):
- bvalue = force_bytes(value)
- s = bvalue
- for i in range(r.n_allele):
- if strcmp(r.d.allele[i], s) == 0:
- gt_values.append(bcf_gt_unphased(i))
- break
- else: # for-else: no matching allele was found
- raise ValueError('Unknown allele')
- else:
- i = value
- if not (0 <= i < nalleles):
- raise ValueError('Invalid allele index')
- gt_values.append(bcf_gt_unphased(i))
-
- return gt_values
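-
- # Illustrative sketch (not part of the API): with record alleles ('G', 'T'),
- # both allele strings and bare indices map to htslib's packed genotype
- # encoding, where bcf_gt_unphased(i) == (i + 1) << 1:
- #
- # bcf_encode_alleles(rec, ('G', 'T')) -> [2, 4] (allele indices 0 and 1)
- # bcf_encode_alleles(rec, (0, None)) -> [2, None] (missing kept as None)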
-
-
-cdef bcf_info_set_value(VariantRecord record, key, value):
- cdef bcf_hdr_t *hdr = record.header.ptr
- cdef bcf1_t *r = record.ptr
- cdef vdict_t *d
- cdef khiter_t k
- cdef int info_id, info_type, scalar, dst_type, realloc, vlen = 0
- cdef ssize_t i, value_count, alloc_len, alloc_size, dst_size
-
- if bcf_unpack(r, BCF_UN_INFO) < 0:
- raise ValueError('Error unpacking VariantRecord')
-
- bkey = force_bytes(key)
- cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey)
-
- if info:
- info_id = info.key
- else:
- d = <vdict_t *>hdr.dict[BCF_DT_ID]
- k = kh_get_vdict(d, bkey)
-
- if k == kh_end(d) or kh_val_vdict(d, k).info[BCF_HL_INFO] & 0xF == 0xF:
- raise KeyError('unknown INFO')
-
- info_id = kh_val_vdict(d, k).id
-
- info_type = bcf_hdr_id2type(hdr, BCF_HL_INFO, info_id)
- values = bcf_check_values(record, value, BCF_HL_INFO, info_type, info_id,
- info.type if info else -1, info.len if info else -1,
- &value_count, &scalar, &realloc)
-
- if info_type == BCF_HT_FLAG:
- if bcf_update_info(hdr, r, bkey, NULL, bool(values[0]), info_type) < 0:
- raise ValueError('Unable to update INFO values')
- return
-
- vlen = value_count < 0
- value_count = len(values)
-
- # If we can, write updated values to existing allocated storage
- if info and not realloc:
- r.d.shared_dirty |= BCF1_DIRTY_INF
-
- if value_count == 0:
- info.len = 0
- # FIXME: Check if need to free vptr if info.len > 0?
- elif value_count == 1:
- # FIXME: Check if need to free vptr if info.len > 0?
- if info.type == BCF_BT_INT8 or info.type == BCF_BT_INT16 or info.type == BCF_BT_INT32:
- bcf_object_to_array(values, &info.v1.i, BCF_BT_INT32, 1, vlen)
- elif info.type == BCF_BT_FLOAT:
- bcf_object_to_array(values, &info.v1.f, BCF_BT_FLOAT, 1, vlen)
- else:
- raise TypeError('unsupported info type code')
- info.len = 1
- else:
- bcf_object_to_array(values, info.vptr, info.type, info.len, vlen)
- return
-
- alloc_len = max(1, value_count)
- if info and info.len > alloc_len:
- alloc_len = info.len
-
- new_values = bcf_empty_array(info_type, alloc_len, vlen)
- cdef char *valp = <char *>new_values
-
- if info_type == BCF_HT_INT:
- dst_type = BCF_BT_INT32
- elif info_type == BCF_HT_REAL:
- dst_type = BCF_BT_FLOAT
- elif info_type == BCF_HT_STR:
- dst_type = BCF_BT_CHAR
- else:
- raise ValueError('Unsupported INFO type')
-
- bcf_object_to_array(values, valp, dst_type, alloc_len, vlen)
-
- if bcf_update_info(hdr, r, bkey, valp, <int>alloc_len, info_type) < 0:
- raise ValueError('Unable to update INFO values')
-
-
-cdef bcf_info_del_value(VariantRecord record, key):
- cdef bcf_hdr_t *hdr = record.header.ptr
- cdef bcf1_t *r = record.ptr
- cdef ssize_t value_count
- cdef int scalar
-
- if bcf_unpack(r, BCF_UN_INFO) < 0:
- raise ValueError('Error unpacking VariantRecord')
-
- bkey = force_bytes(key)
- cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey)
-
- if not info:
- raise KeyError(key)
-
- bcf_get_value_count(record, BCF_HL_INFO, info.key, &value_count, &scalar)
-
- if value_count <= 0:
- null_value = ()
- elif scalar:
- null_value = None
- else:
- null_value = (None,)*value_count
-
- bcf_info_set_value(record, bkey, null_value)
-
-
-cdef bcf_format_get_value(VariantRecordSample sample, key):
- cdef bcf_hdr_t *hdr = sample.record.header.ptr
- cdef bcf1_t *r = sample.record.ptr
- cdef ssize_t count
- cdef int scalar
-
- if bcf_unpack(r, BCF_UN_ALL) < 0:
- raise ValueError('Error unpacking VariantRecord')
-
- bkey = force_bytes(key)
- cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey)
-
- if not fmt or not fmt.p:
- raise KeyError('invalid FORMAT')
-
- if is_gt_fmt(hdr, fmt.id):
- return bcf_format_get_allele_indices(sample)
-
- bcf_get_value_count(sample.record, BCF_HL_FMT, fmt.id, &count, &scalar)
-
- if fmt.p and fmt.n and fmt.size:
- return bcf_array_to_object(fmt.p + sample.index * fmt.size, fmt.type, fmt.n, count, scalar)
- elif scalar:
- return None
- elif count <= 0:
- return ()
- else:
- return (None,)*count
-
-
-cdef bcf_format_set_value(VariantRecordSample sample, key, value):
- cdef bcf_hdr_t *hdr = sample.record.header.ptr
- cdef bcf1_t *r = sample.record.ptr
- cdef int fmt_id
- cdef vdict_t *d
- cdef khiter_t k
- cdef int fmt_type, scalar, realloc, dst_type, vlen = 0
- cdef ssize_t i, n, value_count, alloc_size, alloc_len, dst_size
-
- if bcf_unpack(r, BCF_UN_ALL) < 0:
- raise ValueError('Error unpacking VariantRecord')
-
- bkey = force_bytes(key)
- cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey)
-
- if fmt:
- fmt_id = fmt.id
- else:
- d = <vdict_t *>hdr.dict[BCF_DT_ID]
- k = kh_get_vdict(d, bkey)
-
- if k == kh_end(d) or kh_val_vdict(d, k).info[BCF_HL_FMT] & 0xF == 0xF:
- raise KeyError('unknown format')
-
- fmt_id = kh_val_vdict(d, k).id
-
- fmt_type = bcf_hdr_id2type(hdr, BCF_HL_FMT, fmt_id)
-
- if fmt_type == BCF_HT_FLAG:
- raise ValueError('Flag types are not allowed on FORMATs')
-
- if is_gt_fmt(hdr, fmt_id):
- value = bcf_encode_alleles(sample.record, value)
-
- values = bcf_check_values(sample.record, value, BCF_HL_FMT, fmt_type, fmt_id,
- fmt.type if fmt else -1, fmt.n if fmt else -1,
- &value_count, &scalar, &realloc)
-
- vlen = value_count < 0
- value_count = len(values)
-
- # If we can, write updated values to existing allocated storage
- if fmt and not realloc:
- r.d.indiv_dirty = 1
- bcf_object_to_array(values, fmt.p + sample.index * fmt.size, fmt.type, fmt.n, vlen)
- return
-
- alloc_len = max(1, value_count)
- if fmt and fmt.n > alloc_len:
- alloc_len = fmt.n
-
- n = bcf_hdr_nsamples(hdr)
- new_values = bcf_empty_array(fmt_type, n*alloc_len, vlen)
- cdef char *valp = <char *>new_values
-
- if fmt_type == BCF_HT_INT:
- dst_type = BCF_BT_INT32
- dst_size = sizeof(int32_t) * alloc_len
- elif fmt_type == BCF_HT_REAL:
- dst_type = BCF_BT_FLOAT
- dst_size = sizeof(float) * alloc_len
- elif fmt_type == BCF_HT_STR:
- dst_type = BCF_BT_CHAR
- dst_size = sizeof(char) * alloc_len
- else:
- raise ValueError('Unsupported FORMAT type')
-
- if fmt and n > 1:
- for i in range(n):
- bcf_copy_expand_array(fmt.p + i*fmt.size, fmt.type, fmt.n,
- valp + i*dst_size, dst_type, alloc_len,
- vlen)
-
- bcf_object_to_array(values, valp + sample.index*dst_size, dst_type, alloc_len, vlen)
-
- if bcf_update_format(hdr, r, bkey, valp, <int>(n*alloc_len), fmt_type) < 0:
- raise ValueError('Unable to update format values')
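-
- # These helpers back the mapping interface of VariantRecordSample; a
- # typical use (a sketch: `rec` is an assumed VariantRecord whose header
- # declares GT and DP FORMAT fields, 'NA00001' an assumed sample name):
- #
- # rec.samples['NA00001']['GT'] = (0, 1) # routed through bcf_format_set_value
- # rec.samples['NA00001']['DP'] = 42 # scalar Integer FORMAT value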
-
-
-cdef bcf_format_del_value(VariantRecordSample sample, key):
- cdef bcf_hdr_t *hdr = sample.record.header.ptr
- cdef bcf1_t *r = sample.record.ptr
- cdef ssize_t value_count
- cdef int scalar
-
- if bcf_unpack(r, BCF_UN_ALL) < 0:
- raise ValueError('Error unpacking VariantRecord')
-
- bkey = force_bytes(key)
- cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey)
-
- if not fmt or not fmt.p:
- raise KeyError(key)
-
- bcf_get_value_count(sample.record, BCF_HL_FMT, fmt.id, &value_count, &scalar)
-
- if value_count <= 0:
- null_value = ()
- elif scalar:
- null_value = None
- else:
- null_value = (None,)*value_count
-
- bcf_format_set_value(sample, bkey, null_value)
-
-
-cdef bcf_format_get_allele_indices(VariantRecordSample sample):
- cdef bcf_hdr_t *hdr = sample.record.header.ptr
- cdef bcf1_t *r = sample.record.ptr
- cdef int32_t n = bcf_hdr_nsamples(hdr)
-
- if bcf_unpack(r, BCF_UN_ALL) < 0:
- raise ValueError('Error unpacking VariantRecord')
-
- if sample.index < 0 or sample.index >= n or not r.n_fmt:
- return ()
-
- cdef bcf_fmt_t *fmt0 = r.d.fmt
- cdef int gt0 = is_gt_fmt(hdr, fmt0.id)
-
- if not gt0 or not fmt0.n:
- return ()
-
- cdef int8_t *data8
- cdef int16_t *data16
- cdef int32_t *data32
- cdef int32_t a, nalleles = r.n_allele
- cdef list alleles = []
-
- if fmt0.type == BCF_BT_INT8:
- data8 = <int8_t *>(fmt0.p + sample.index * fmt0.size)
- for i in range(fmt0.n):
- if data8[i] == bcf_int8_vector_end:
- break
- elif data8[i] == bcf_int8_missing:
- a = -1
- else:
- a = bcf_gt_allele(data8[i])
- alleles.append(a if 0 <= a < nalleles else None)
- elif fmt0.type == BCF_BT_INT16:
- data16 = <int16_t *>(fmt0.p + sample.index * fmt0.size)
- for i in range(fmt0.n):
- if data16[i] == bcf_int16_vector_end:
- break
- elif data16[i] == bcf_int16_missing:
- a = -1
- else:
- a = bcf_gt_allele(data16[i])
- alleles.append(a if 0 <= a < nalleles else None)
- elif fmt0.type == BCF_BT_INT32:
- data32 = <int32_t *>(fmt0.p + sample.index * fmt0.size)
- for i in range(fmt0.n):
- if data32[i] == bcf_int32_vector_end:
- break
- elif data32[i] == bcf_int32_missing:
- a = -1
- else:
- a = bcf_gt_allele(data32[i])
- alleles.append(a if 0 <= a < nalleles else None)
-
- return tuple(alleles)
-
-
-cdef bcf_format_get_alleles(VariantRecordSample sample):
- cdef bcf_hdr_t *hdr = sample.record.header.ptr
- cdef bcf1_t *r = sample.record.ptr
- cdef int32_t nsamples = bcf_hdr_nsamples(hdr)
-
- if bcf_unpack(r, BCF_UN_ALL) < 0:
- raise ValueError('Error unpacking VariantRecord')
-
- cdef int32_t nalleles = r.n_allele
-
- if sample.index < 0 or sample.index >= nsamples or not r.n_fmt:
- return ()
-
- cdef bcf_fmt_t *fmt0 = r.d.fmt
- cdef int gt0 = is_gt_fmt(hdr, fmt0.id)
-
- if not gt0 or not fmt0.n:
- return ()
-
- cdef int32_t a
- cdef int8_t *data8
- cdef int16_t *data16
- cdef int32_t *data32
- alleles = []
- if fmt0.type == BCF_BT_INT8:
- data8 = <int8_t *>(fmt0.p + sample.index * fmt0.size)
- for i in range(fmt0.n):
- if data8[i] == bcf_int8_vector_end:
- break
- a = bcf_gt_allele(data8[i])
- alleles.append(charptr_to_str(r.d.allele[a]) if 0 <= a < nalleles else None)
- elif fmt0.type == BCF_BT_INT16:
- data16 = <int16_t *>(fmt0.p + sample.index * fmt0.size)
- for i in range(fmt0.n):
- if data16[i] == bcf_int16_vector_end:
- break
- a = bcf_gt_allele(data16[i])
- alleles.append(charptr_to_str(r.d.allele[a]) if 0 <= a < nalleles else None)
- elif fmt0.type == BCF_BT_INT32:
- data32 = <int32_t *>(fmt0.p + sample.index * fmt0.size)
- for i in range(fmt0.n):
- if data32[i] == bcf_int32_vector_end:
- break
- a = bcf_gt_allele(data32[i])
- alleles.append(charptr_to_str(r.d.allele[a]) if 0 <= a < nalleles else None)
- return tuple(alleles)
-
-
-cdef bint bcf_sample_get_phased(VariantRecordSample sample):
- cdef bcf_hdr_t *hdr = sample.record.header.ptr
- cdef bcf1_t *r = sample.record.ptr
- cdef int32_t n = bcf_hdr_nsamples(hdr)
-
- if bcf_unpack(r, BCF_UN_ALL) < 0:
- raise ValueError('Error unpacking VariantRecord')
-
- if sample.index < 0 or sample.index >= n or not r.n_fmt:
- return False
-
- cdef bcf_fmt_t *fmt0 = r.d.fmt
- cdef int gt0 = is_gt_fmt(hdr, fmt0.id)
-
- if not gt0 or not fmt0.n:
- return False
-
- cdef int8_t *data8
- cdef int16_t *data16
- cdef int32_t *data32
-
- cdef bint phased = False
-
- if fmt0.type == BCF_BT_INT8:
- data8 = <int8_t *>(fmt0.p + sample.index * fmt0.size)
- for i in range(fmt0.n):
- if data8[i] == bcf_int8_vector_end:
- break
- elif data8[i] == bcf_int8_missing:
- continue
- elif i and not bcf_gt_is_phased(data8[i]):
- return False
- else:
- phased = True
- elif fmt0.type == BCF_BT_INT16:
- data16 = <int16_t *>(fmt0.p + sample.index * fmt0.size)
- for i in range(fmt0.n):
- if data16[i] == bcf_int16_vector_end:
- break
- elif data16[i] == bcf_int16_missing:
- continue
- elif i and not bcf_gt_is_phased(data16[i]):
- return False
- else:
- phased = True
- elif fmt0.type == BCF_BT_INT32:
- data32 = <int32_t *>(fmt0.p + sample.index * fmt0.size)
- for i in range(fmt0.n):
- if data32[i] == bcf_int32_vector_end:
- break
- elif data32[i] == bcf_int32_missing:
- continue
- elif i and not bcf_gt_is_phased(data32[i]):
- return False
- else:
- phased = True
-
- return phased
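-
- # Note on the logic above: a sample reports as phased only when every
- # genotype value after the first carries the phase bit; missing calls are
- # skipped. Illustrative use (a sketch, assuming `rec` has a GT field):
- #
- # rec.samples[0].phased # True for '0|1', False for '0/1'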
-
-
-cdef bcf_sample_set_phased(VariantRecordSample sample, bint phased):
- cdef bcf_hdr_t *hdr = sample.record.header.ptr
- cdef bcf1_t *r = sample.record.ptr
- cdef int32_t n = bcf_hdr_nsamples(hdr)
-
- if bcf_unpack(r, BCF_UN_ALL) < 0:
- raise ValueError('Error unpacking VariantRecord')
-
- if sample.index < 0 or sample.index >= n or not r.n_fmt:
- return
-
- cdef bcf_fmt_t *fmt0 = r.d.fmt
- cdef int gt0 = is_gt_fmt(hdr, fmt0.id)
-
- if not gt0 or not fmt0.n:
- raise ValueError('Cannot set phased before genotype is set')
-
- cdef int8_t *data8
- cdef int16_t *data16
- cdef int32_t *data32
-
- if fmt0.type == BCF_BT_INT8:
- data8 = <int8_t *>(fmt0.p + sample.index * fmt0.size)
- for i in range(fmt0.n):
- if data8[i] == bcf_int8_vector_end:
- break
- elif data8[i] == bcf_int8_missing:
- continue
- elif i:
- data8[i] = (data8[i] & 0xFE) | phased
- elif fmt0.type == BCF_BT_INT16:
- data16 = <int16_t *>(fmt0.p + sample.index * fmt0.size)
- for i in range(fmt0.n):
- if data16[i] == bcf_int16_vector_end:
- break
- elif data16[i] == bcf_int16_missing:
- continue
- elif i:
- data16[i] = (data16[i] & 0xFFFE) | phased
- elif fmt0.type == BCF_BT_INT32:
- data32 = <int32_t *>(fmt0.p + sample.index * fmt0.size)
- for i in range(fmt0.n):
- if data32[i] == bcf_int32_vector_end:
- break
- elif data32[i] == bcf_int32_missing:
- continue
- elif i:
- data32[i] = (data32[i] & 0xFFFFFFFE) | phased
-
-
-########################################################################
-########################################################################
-## Variant Header objects
-########################################################################
-
-#FIXME: implement a full mapping interface
-#FIXME: passing bcf_hrec_t* may not be the safest approach once mutating
-# operations are allowed.
-cdef class VariantHeaderRecord(object):
- """header record from a :class:`VariantHeader` object"""
-
- property type:
- """header type: FILTER, INFO, FORMAT, CONTIG, STRUCTURED, or GENERIC"""
- def __get__(self):
- cdef bcf_hrec_t *r = self.ptr
- return METADATA_TYPES[r.type]
-
- property key:
- """header key (the part before '=', in FILTER/INFO/FORMAT/contig/fileformat etc.)"""
- def __get__(self):
- cdef bcf_hrec_t *r = self.ptr
- return bcf_str_cache_get_charptr(r.key) if r.key else None
-
- property value:
- """header value. Set only for generic lines, None for FILTER/INFO, etc."""
- def __get__(self):
- cdef bcf_hrec_t *r = self.ptr
- return charptr_to_str(r.value) if r.value else None
-
- property attrs:
- """sequence of additional header attributes"""
- def __get__(self):
- cdef bcf_hrec_t *r = self.ptr
- cdef int i
- return tuple((bcf_str_cache_get_charptr(r.keys[i]) if r.keys[i] else None,
- charptr_to_str(r.vals[i]) if r.vals[i] else None)
- for i in range(r.nkeys))
-
- def __len__(self):
- cdef bcf_hrec_t *r = self.ptr
- return r.nkeys
-
- def __bool__(self):
- cdef bcf_hrec_t *r = self.ptr
- return r.nkeys != 0
-
- def __getitem__(self, key):
- """get attribute value"""
- cdef bcf_hrec_t *r = self.ptr
- cdef int i
- bkey = force_bytes(key)
- for i in range(r.nkeys):
- if r.keys[i] and r.keys[i] == bkey:
- return charptr_to_str(r.vals[i]) if r.vals[i] else None
- raise KeyError('cannot find metadata key')
-
- def __iter__(self):
- cdef bcf_hrec_t *r = self.ptr
- cdef int i
- for i in range(r.nkeys):
- if r.keys[i]:
- yield bcf_str_cache_get_charptr(r.keys[i])
-
- def get(self, key, default=None):
- """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
- try:
- return self[key]
- except KeyError:
- return default
-
- def __contains__(self, key):
- try:
- self[key]
- except KeyError:
- return False
- else:
- return True
-
- def iterkeys(self):
- """D.iterkeys() -> an iterator over the keys of D"""
- return iter(self)
-
- def itervalues(self):
- """D.itervalues() -> an iterator over the values of D"""
- cdef bcf_hrec_t *r = self.ptr
- cdef int i
- for i in range(r.nkeys):
- if r.keys[i]:
- yield charptr_to_str(r.vals[i]) if r.vals[i] else None
-
- def iteritems(self):
- """D.iteritems() -> an iterator over the (key, value) items of D"""
- cdef bcf_hrec_t *r = self.ptr
- cdef int i
- for i in range(r.nkeys):
- if r.keys[i]:
- yield (bcf_str_cache_get_charptr(r.keys[i]), charptr_to_str(r.vals[i]) if r.vals[i] else None)
-
- def keys(self):
- """D.keys() -> list of D's keys"""
- return list(self)
-
- def items(self):
- """D.items() -> list of D's (key, value) pairs, as 2-tuples"""
- return list(self.iteritems())
-
- def values(self):
- """D.values() -> list of D's values"""
- return list(self.itervalues())
-
- # Mappings are not hashable by default, but subclasses can change this
- __hash__ = None
-
- #TODO: implement __richcmp__
-
- def __str__(self):
- cdef bcf_hrec_t *r = self.ptr
- if r.type == BCF_HL_GEN:
- return '##{}={}'.format(self.key, self.value)
- else:
- attrs = ','.join('{}={}'.format(k, v) for k,v in self.attrs if k != 'IDX')
- return '##{}=<{}>'.format(self.key or self.type, attrs)
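-
- # Example of the two formatting branches above (illustrative output):
- #
- # generic line: ##fileformat=VCFv4.2
- # structured line: ##INFO=<ID=DP,Number=1,Type=Integer,...>
- #
- # The internal IDX bookkeeping attribute is omitted from the output.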
-
-
-cdef VariantHeaderRecord makeVariantHeaderRecord(VariantHeader header, bcf_hrec_t *hdr):
- if not header:
- raise ValueError('invalid VariantHeader')
-
- if not hdr:
- return None
-
- cdef VariantHeaderRecord record = VariantHeaderRecord.__new__(VariantHeaderRecord)
- record.header = header
- record.ptr = hdr
-
- return record
-
-
-cdef class VariantHeaderRecords(object):
- """sequence of :class:`VariantHeaderRecord` object from a :class:`VariantHeader` object"""
-
- def __len__(self):
- return self.header.ptr.nhrec
-
- def __bool__(self):
- return self.header.ptr.nhrec != 0
-
- def __getitem__(self, index):
- cdef int32_t i = index
- if i < 0 or i >= self.header.ptr.nhrec:
- raise IndexError('invalid header record index')
- return makeVariantHeaderRecord(self.header, self.header.ptr.hrec[i])
-
- def __iter__(self):
- cdef int32_t i
- for i in range(self.header.ptr.nhrec):
- yield makeVariantHeaderRecord(self.header, self.header.ptr.hrec[i])
-
- __hash__ = None
-
-
-cdef VariantHeaderRecords makeVariantHeaderRecords(VariantHeader header):
- if not header:
- raise ValueError('invalid VariantHeader')
-
- cdef VariantHeaderRecords records = VariantHeaderRecords.__new__(VariantHeaderRecords)
- records.header = header
- return records
-
-
-cdef class VariantMetadata(object):
- """filter, info or format metadata record from a :class:`VariantHeader`
- object"""
-
- property name:
- """metadata name"""
- def __get__(self):
- cdef bcf_hdr_t *hdr = self.header.ptr
- return bcf_str_cache_get_charptr(hdr.id[BCF_DT_ID][self.id].key)
-
- # Q: Should this be exposed?
- property id:
- """metadata internal header id number"""
- def __get__(self):
- return self.id
-
- property number:
- """metadata number (i.e. cardinality)"""
- def __get__(self):
- cdef bcf_hdr_t *hdr = self.header.ptr
- if not bcf_hdr_idinfo_exists(hdr, self.type, self.id) or self.type == BCF_HL_FLT:
- return None
- cdef int l = bcf_hdr_id2length(hdr, self.type, self.id)
- if l == BCF_VL_FIXED:
- return bcf_hdr_id2number(hdr, self.type, self.id)
- elif l == BCF_VL_VAR:
- return '.'
- else:
- return METADATA_LENGTHS[l]
-
- property type:
- """metadata value type"""
- def __get__(self):
- cdef bcf_hdr_t *hdr = self.header.ptr
- if not bcf_hdr_idinfo_exists(hdr, self.type, self.id) or \
- self.type == BCF_HL_FLT:
- return None
- return VALUE_TYPES[bcf_hdr_id2type(hdr, self.type, self.id)]
-
- property description:
- """metadata description (or None if not set)"""
- def __get__(self):
- descr = self.record.get('Description')
- if descr:
- descr = descr.strip('"')
- return force_str(descr)
-
- property record:
- """:class:`VariantHeaderRecord` associated with this
- :class:`VariantMetadata` object"""
- def __get__(self):
- cdef bcf_hdr_t *hdr = self.header.ptr
- if not bcf_hdr_idinfo_exists(hdr, self.type, self.id):
- return None
- cdef bcf_hrec_t *hrec = hdr.id[BCF_DT_ID][self.id].val.hrec[self.type]
- if not hrec:
- return None
- return makeVariantHeaderRecord(self.header, hrec)
-
-
-cdef VariantMetadata makeVariantMetadata(VariantHeader header, int type, int id):
- if not header:
- raise ValueError('invalid VariantHeader')
-
- if type != BCF_HL_FLT and type != BCF_HL_INFO and type != BCF_HL_FMT:
- raise ValueError('invalid metadata type')
-
- if id < 0 or id >= header.ptr.n[BCF_DT_ID]:
- raise ValueError('invalid metadata id')
-
- cdef VariantMetadata meta = VariantMetadata.__new__(VariantMetadata)
- meta.header = header
- meta.type = type
- meta.id = id
-
- return meta
-
-
-cdef class VariantHeaderMetadata(object):
- """mapping from filter, info or format name to :class:`VariantMetadata` object"""
-
- def add(self, id, number, type, description, **kwargs):
- """Add a new filter, info or format record"""
- if id in self:
- raise ValueError('Header already exists for id={}'.format(id))
-
- if self.type == BCF_HL_FLT:
- if number is not None:
- raise ValueError('Number must be None when adding a filter')
- if type is not None:
- raise ValueError('Type must be None when adding a filter')
-
- items = [('ID', id), ('Description', description)]
- else:
- if type not in VALUE_TYPES:
- raise ValueError('unknown type specified: {}'.format(type))
- if number is None:
- number = '.'
-
- items = [('ID', id),
- ('Number', number),
- ('Type', type),
- ('Description', description)]
-
- items += kwargs.items()
- self.header.add_meta(METADATA_TYPES[self.type], items=items)
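-
- # Typical use of add() (a sketch: `header` is an assumed VariantHeader
- # instance and the IDs/descriptions are illustrative):
- #
- # header.filters.add('q10', None, None, 'Quality below 10')
- # header.info.add('DP', 1, 'Integer', 'Total read depth')
- # header.formats.add('GT', 1, 'String', 'Genotype')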
-
- def __len__(self):
- cdef bcf_hdr_t *hdr = self.header.ptr
- cdef bcf_idpair_t *idpair
- cdef int32_t i, n = 0
-
- for i in range(hdr.n[BCF_DT_ID]):
- idpair = hdr.id[BCF_DT_ID] + i
- if idpair.key and idpair.val and idpair.val.info[self.type] & 0xF != 0xF:
- n += 1
- return n
-
- def __bool__(self):
- cdef bcf_hdr_t *hdr = self.header.ptr
- cdef bcf_idpair_t *idpair
- cdef int32_t i
-
- for i in range(hdr.n[BCF_DT_ID]):
- idpair = hdr.id[BCF_DT_ID] + i
- if idpair.key and idpair.val and idpair.val.info[self.type] & 0xF != 0xF:
- return True
- return False
-
- def __getitem__(self, key):
- cdef bcf_hdr_t *hdr = self.header.ptr
- cdef vdict_t *d = <vdict_t *>hdr.dict[BCF_DT_ID]
-
- bkey = force_bytes(key)
- cdef khiter_t k = kh_get_vdict(d, bkey)
-
- if k == kh_end(d) or kh_val_vdict(d, k).info[self.type] & 0xF == 0xF:
- raise KeyError('invalid key: {}'.format(key))
-
- return makeVariantMetadata(self.header, self.type, kh_val_vdict(d, k).id)
-
- def __iter__(self):
- cdef bcf_hdr_t *hdr = self.header.ptr
- cdef bcf_idpair_t *idpair
- cdef int32_t i
-
- for i in range(hdr.n[BCF_DT_ID]):
- idpair = hdr.id[BCF_DT_ID] + i
- if idpair.key and idpair.val and idpair.val.info[self.type] & 0xF != 0xF:
- yield bcf_str_cache_get_charptr(idpair.key)
-
- def get(self, key, default=None):
- """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
- try:
- return self[key]
- except KeyError:
- return default
-
- def __contains__(self, key):
- try:
- self[key]
- except KeyError:
- return False
- else:
- return True
-
- def iterkeys(self):
- """D.iterkeys() -> an iterator over the keys of D"""
- return iter(self)
-
- def itervalues(self):
- """D.itervalues() -> an iterator over the values of D"""
- for key in self:
- yield self[key]
-
- def iteritems(self):
- """D.iteritems() -> an iterator over the (key, value) items of D"""
- for key in self:
- yield (key, self[key])
-
- def keys(self):
- """D.keys() -> list of D's keys"""
- return list(self)
-
- def items(self):
- """D.items() -> list of D's (key, value) pairs, as 2-tuples"""
- return list(self.iteritems())
-
- def values(self):
- """D.values() -> list of D's values"""
- return list(self.itervalues())
-
- # Mappings are not hashable by default, but subclasses can change this
- __hash__ = None
-
- #TODO: implement __richcmp__
-
-
-cdef VariantHeaderMetadata makeVariantHeaderMetadata(VariantHeader header, int32_t type):
- if not header:
- raise ValueError('invalid VariantHeader')
-
- cdef VariantHeaderMetadata meta = VariantHeaderMetadata.__new__(VariantHeaderMetadata)
- meta.header = header
- meta.type = type
-
- return meta
-
-
-cdef class VariantContig(object):
- """contig metadata from a :class:`VariantHeader`"""
-
- property name:
- """contig name"""
- def __get__(self):
- cdef bcf_hdr_t *hdr = self.header.ptr
- return bcf_str_cache_get_charptr(hdr.id[BCF_DT_CTG][self.id].key)
-
- property id:
- """contig internal id number"""
- def __get__(self):
- return self.id
-
- property length:
- """contig length or None if not available"""
- def __get__(self):
- cdef bcf_hdr_t *hdr = self.header.ptr
- cdef uint32_t length = hdr.id[BCF_DT_CTG][self.id].val.info[0]
- return length if length else None
-
- property header:
- """:class:`VariantHeaderRecord` associated with this :class:`VariantContig` object"""
- def __get__(self):
- cdef bcf_hdr_t *hdr = self.header.ptr
- cdef bcf_hrec_t *hrec = hdr.id[BCF_DT_CTG][self.id].val.hrec[0]
- return makeVariantHeaderRecord(self.header, hrec)
-
-
-cdef VariantContig makeVariantContig(VariantHeader header, int id):
- if not header:
- raise ValueError('invalid VariantHeader')
-
- if id < 0 or id >= header.ptr.n[BCF_DT_CTG]:
- raise ValueError('invalid contig id')
-
- cdef VariantContig contig = VariantContig.__new__(VariantContig)
- contig.header = header
- contig.id = id
-
- return contig
-
-
-cdef class VariantHeaderContigs(object):
- """mapping from contig name or index to :class:`VariantContig` object."""
-
- def __len__(self):
- cdef bcf_hdr_t *hdr = self.header.ptr
- assert kh_size(<vdict_t *>hdr.dict[BCF_DT_CTG]) == hdr.n[BCF_DT_CTG]
- return hdr.n[BCF_DT_CTG]
-
- def __bool__(self):
- cdef bcf_hdr_t *hdr = self.header.ptr
- assert kh_size(<vdict_t *>hdr.dict[BCF_DT_CTG]) == hdr.n[BCF_DT_CTG]
- return hdr.n[BCF_DT_CTG] != 0
-
- def __getitem__(self, key):
- cdef bcf_hdr_t *hdr = self.header.ptr
- cdef int index
-
- if isinstance(key, int):
- index = key
- if index < 0 or index >= hdr.n[BCF_DT_CTG]:
- raise IndexError('invalid contig index')
- return makeVariantContig(self.header, index)
-
- cdef vdict_t *d = <vdict_t *>hdr.dict[BCF_DT_CTG]
- bkey = force_bytes(key)
- cdef khiter_t k = kh_get_vdict(d, bkey)
-
- if k == kh_end(d):
- raise KeyError('invalid contig')
-
- cdef int id = kh_val_vdict(d, k).id
-
- return makeVariantContig(self.header, id)
-
- def __iter__(self):
- cdef bcf_hdr_t *hdr = self.header.ptr
- cdef vdict_t *d = <vdict_t *>hdr.dict[BCF_DT_CTG]
- cdef uint32_t n = kh_size(d)
-
- assert n == hdr.n[BCF_DT_CTG]
-
- for i in range(n):
- yield bcf_str_cache_get_charptr(bcf_hdr_id2name(hdr, i))
-
- def get(self, key, default=None):
- """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
- try:
- return self[key]
- except KeyError:
- return default
-
- def __contains__(self, key):
- try:
- self[key]
- except KeyError:
- return False
- else:
- return True
-
- def iterkeys(self):
- """D.iterkeys() -> an iterator over the keys of D"""
- return iter(self)
-
- def itervalues(self):
- """D.itervalues() -> an iterator over the values of D"""
- for key in self:
- yield self[key]
-
- def iteritems(self):
- """D.iteritems() -> an iterator over the (key, value) items of D"""
- for key in self:
- yield (key, self[key])
-
- def keys(self):
- """D.keys() -> list of D's keys"""
- return list(self)
-
- def items(self):
- """D.items() -> list of D's (key, value) pairs, as 2-tuples"""
- return list(self.iteritems())
-
- def values(self):
- """D.values() -> list of D's values"""
- return list(self.itervalues())
-
- # Mappings are not hashable by default, but subclasses can change this
- __hash__ = None
-
- #TODO: implement __richcmp__
-
- def add(self, id, **kwargs):
- """Add a new contig record"""
- if id in self:
- raise ValueError('Header already exists for contig {}'.format(id))
-
- items = [('ID', id)] + list(kwargs.items()) # kwargs.items() is a view in Python 3
- self.header.add_meta('contig', items=items)
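-
- # Typical use (a sketch; the contig name and length are illustrative):
- #
- # header.contigs.add('chr1', length=248956422)
- #
- # Additional keyword arguments become extra attributes on the contig line.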
-
-
-cdef VariantHeaderContigs makeVariantHeaderContigs(VariantHeader header):
- if not header:
- raise ValueError('invalid VariantHeader')
-
- cdef VariantHeaderContigs contigs = VariantHeaderContigs.__new__(VariantHeaderContigs)
- contigs.header = header
-
- return contigs
-
-
-cdef class VariantHeaderSamples(object):
- """sequence of sample names from a :class:`VariantHeader` object"""
-
- def __len__(self):
- return bcf_hdr_nsamples(self.header.ptr)
-
- def __bool__(self):
- return bcf_hdr_nsamples(self.header.ptr) != 0
-
- def __getitem__(self, index):
- cdef bcf_hdr_t *hdr = self.header.ptr
- cdef int32_t n = bcf_hdr_nsamples(hdr)
- cdef int32_t i = index
-
- if i < 0 or i >= n:
- raise IndexError('invalid sample index')
-
- return charptr_to_str(hdr.samples[i])
-
- def __iter__(self):
- cdef bcf_hdr_t *hdr = self.header.ptr
- cdef int32_t i, n = bcf_hdr_nsamples(hdr)
-
- for i in range(n):
- yield charptr_to_str(hdr.samples[i])
-
- def __contains__(self, key):
- cdef bcf_hdr_t *hdr = self.header.ptr
- cdef vdict_t *d = <vdict_t *>hdr.dict[BCF_DT_SAMPLE]
- bkey = force_bytes(key)
- cdef khiter_t k = kh_get_vdict(d, bkey)
-
- return k != kh_end(d)
-
- # Mappings are not hashable by default, but subclasses can change this
- __hash__ = None
-
- #TODO: implement __richcmp__
-
- def add(self, name):
- """Add a new sample"""
- self.header.add_sample(name)
-
-
-cdef VariantHeaderSamples makeVariantHeaderSamples(VariantHeader header):
- if not header:
- raise ValueError('invalid VariantHeader')
-
- cdef VariantHeaderSamples samples = VariantHeaderSamples.__new__(VariantHeaderSamples)
- samples.header = header
-
- return samples
-
-
-cdef class VariantHeader(object):
- """header information for a :class:`VariantFile` object"""
-
- #FIXME: Add structured proxy
- #FIXME: Add generic proxy
- #FIXME: Add mutable methods
-
- # See makeVariantHeader for C constructor
- def __cinit__(self):
- self.ptr = NULL
-
- # Python constructor
- def __init__(self):
- self.ptr = bcf_hdr_init(b'w')
- if not self.ptr:
- raise ValueError('cannot create VariantHeader')
-
- def __dealloc__(self):
- if self.ptr:
- bcf_hdr_destroy(self.ptr)
- self.ptr = NULL
-
- def __bool__(self):
- # self.ptr == NULL should be impossible
- return self.ptr != NULL
-
- def copy(self):
- return makeVariantHeader(bcf_hdr_dup(self.ptr))
-
- property version:
- """VCF version"""
- def __get__(self):
- return force_str(bcf_hdr_get_version(self.ptr))
-
- property samples:
- """samples (:class:`VariantHeaderSamples`)"""
- def __get__(self):
- return makeVariantHeaderSamples(self)
-
- property records:
- """header records (:class:`VariantHeaderRecords`)"""
- def __get__(self):
- return makeVariantHeaderRecords(self)
-
- property contigs:
- """contig information (:class:`VariantHeaderContigs`)"""
- def __get__(self):
- return makeVariantHeaderContigs(self)
-
- property filters:
- """filter metadata (:class:`VariantHeaderMetadata`)"""
- def __get__(self):
- return makeVariantHeaderMetadata(self, BCF_HL_FLT)
-
- property info:
- """info metadata (:class:`VariantHeaderMetadata`)"""
- def __get__(self):
- return makeVariantHeaderMetadata(self, BCF_HL_INFO)
-
- property formats:
- """format metadata (:class:`VariantHeaderMetadata`)"""
- def __get__(self):
- return makeVariantHeaderMetadata(self, BCF_HL_FMT)
-
- property alts:
- """alt metadata (:class:`dict` ID->record).
-
- The data returned just a snapshot of alt records, is created
- every time the property is requested, and modifications will
- not be reflected in the header metadata and vice versa.
-
- i.e. it is just a dict that reflects the state of alt records
- at the time it is created.
-
- """
- def __get__(self):
- return {record['ID']:record for record in self.records
- if record.key.upper() == 'ALT' }
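-
- # Snapshot semantics (a sketch; the ALT ID is illustrative):
- #
- # alts = header.alts # dict built afresh on each access
- # alts.get('DEL') # the ##ALT=<ID=DEL,...> record, if present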
-
-
- # only safe to do when opening an htsfile
- cdef _subset_samples(self, include_samples):
- keep_samples = set(self.samples)
- include_samples = set(include_samples)
- missing_samples = include_samples - keep_samples
- keep_samples &= include_samples
-
- if missing_samples:
- # FIXME: add specialized exception with payload
- raise ValueError(
- 'missing {:d} requested samples'.format(
- len(missing_samples)))
-
- keep_samples = force_bytes(','.join(keep_samples))
- cdef char *keep = <char *>keep_samples if keep_samples else NULL
- cdef int ret = bcf_hdr_set_samples(self.ptr, keep, 0)
-
- if ret != 0:
- raise ValueError(
- 'bcf_hdr_set_samples failed: ret = {}'.format(ret))
-
- def __str__(self):
- cdef int hlen
- cdef char *hstr = bcf_hdr_fmt_text(self.ptr, 0, &hlen)
-
- try:
- return charptr_to_str_w_len(hstr, hlen)
- finally:
- free(hstr)
-
- def add_record(self, VariantHeaderRecord record):
- """Add an existing :class:`VariantHeaderRecord` to this header"""
- cdef bcf_hrec_t *r = record.ptr
-
- if r.type == BCF_HL_GEN:
- self.add_meta(r.key, r.value)
- else:
- items = [(k,v) for k,v in record.attrs if k != 'IDX']
- self.add_meta(r.key, items=items)
-
- def add_line(self, line):
- """Add a metadata line to this header"""
- bline = force_bytes(line)
- if bcf_hdr_append(self.ptr, bline) < 0:
- raise ValueError('invalid header line')
-
- if self.ptr.dirty:
- bcf_hdr_sync(self.ptr)
-
- def add_meta(self, key, value=None, items=None):
- """Add metadata to this header"""
- if not ((value is not None) ^ (items is not None)):
- raise ValueError('either value or items must be specified')
-
- cdef bcf_hrec_t *hrec = <bcf_hrec_t*>calloc(1, sizeof(bcf_hrec_t))
- cdef int quoted
-
- try:
- key = force_bytes(key)
- hrec.key = strdup(key)
-
- if value is not None:
- hrec.value = strdup(force_bytes(value))
- else:
- for key, value in items:
- key = force_bytes(key)
- bcf_hrec_add_key(hrec, key, <int>len(key))
-
- value = force_bytes(str(value))
- quoted = strpbrk(value, ' ;,"\t<>') != NULL
- bcf_hrec_set_val(hrec, hrec.nkeys-1, value, <int>len(value), quoted)
- except:
- bcf_hrec_destroy(hrec)
- raise
-
- bcf_hdr_add_hrec(self.ptr, hrec)
-
- if self.ptr.dirty:
- bcf_hdr_sync(self.ptr)
-
- def add_sample(self, name):
- """Add a new sample to this header"""
- bname = force_bytes(name)
- if bcf_hdr_add_sample(self.ptr, bname) < 0:
- raise ValueError('Duplicated sample name: {}'.format(name))
- if self.ptr.dirty:
- bcf_hdr_sync(self.ptr)
-
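- # Putting the header API together (a sketch; sample name and metadata
- # values are illustrative):
- #
- # header = VariantHeader()
- # header.add_sample('NA00001')
- # header.add_line('##FILTER=<ID=q10,Description="Quality below 10">')
- # header.add_meta('INFO', items=[('ID', 'DP'), ('Number', 1),
- # ('Type', 'Integer'), ('Description', 'Total depth')])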
-
-cdef VariantHeader makeVariantHeader(bcf_hdr_t *hdr):
- if not hdr:
- raise ValueError('cannot create VariantHeader')
-
- cdef VariantHeader header = VariantHeader.__new__(VariantHeader)
- header.ptr = hdr
-
- return header
-
-
-########################################################################
-########################################################################
-## Variant Record objects
-########################################################################
-
-cdef class VariantRecordFilter(object):
- """Filters set on a :class:`VariantRecord` object, presented as a mapping from
- filter index or name to :class:`VariantMetadata` object"""
-
- def __len__(self):
- return self.record.ptr.d.n_flt
-
- def __bool__(self):
- return self.record.ptr.d.n_flt != 0
-
- def __getitem__(self, key):
- cdef bcf_hdr_t *hdr = self.record.header.ptr
- cdef bcf1_t *r = self.record.ptr
- cdef int index, id
- cdef int n = r.d.n_flt
-
- if isinstance(key, int):
- index = key
-
- if index < 0 or index >= n:
- raise IndexError('invalid filter index')
-
- id = r.d.flt[index]
- else:
- if key == '.':
- key = 'PASS'
-
- bkey = force_bytes(key)
- id = bcf_hdr_id2int(hdr, BCF_DT_ID, bkey)
-
- if not bcf_hdr_idinfo_exists(hdr, BCF_HL_FLT, id) \
- or not bcf_has_filter(hdr, self.record.ptr, bkey):
- raise KeyError('Invalid filter')
-
- return makeVariantMetadata(self.record.header, BCF_HL_FLT, id)
-
- def __delitem__(self, key):
- cdef bcf_hdr_t *hdr = self.record.header.ptr
- cdef bcf1_t *r = self.record.ptr
- cdef int index, id
- cdef int n = r.d.n_flt
-
- if isinstance(key, int):
- index = key
-
- if index < 0 or index >= n:
- raise IndexError('invalid filter index')
-
- id = r.d.flt[index]
- else:
- if key == '.':
- key = 'PASS'
-
- bkey = force_bytes(key)
- id = bcf_hdr_id2int(hdr, BCF_DT_ID, bkey)
-
- if not bcf_hdr_idinfo_exists(hdr, BCF_HL_FLT, id) \
- or not bcf_has_filter(hdr, self.record.ptr, bkey):
- raise KeyError('Invalid filter')
-
- bcf_remove_filter(hdr, r, id, 0)
-
- def clear(self):
- """Clear all filters"""
- cdef bcf1_t *r = self.record.ptr
- r.d.shared_dirty |= BCF1_DIRTY_FLT
- r.d.n_flt = 0
-
- def __iter__(self):
- cdef bcf_hdr_t *hdr = self.record.header.ptr
- cdef bcf1_t *r = self.record.ptr
- cdef int i
-
- for i in range(r.d.n_flt):
- yield bcf_str_cache_get_charptr(bcf_hdr_int2id(hdr, BCF_DT_ID, r.d.flt[i]))
-
- def get(self, key, default=None):
- """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
- try:
- return self[key]
- except KeyError:
- return default
-
- def __contains__(self, key):
- cdef bcf_hdr_t *hdr = self.record.header.ptr
- cdef bcf1_t *r = self.record.ptr
- bkey = force_bytes(key)
- return bcf_has_filter(hdr, r, bkey) == 1
-
- def iterkeys(self):
- """D.iterkeys() -> an iterator over the keys of D"""
- return iter(self)
-
- def itervalues(self):
- """D.itervalues() -> an iterator over the values of D"""
- for key in self:
- yield self[key]
-
- def iteritems(self):
- """D.iteritems() -> an iterator over the (key, value) items of D"""
- for key in self:
- yield (key, self[key])
-
- def keys(self):
- """D.keys() -> list of D's keys"""
- return list(self)
-
- def items(self):
- """D.items() -> list of D's (key, value) pairs, as 2-tuples"""
- return list(self.iteritems())
-
- def values(self):
- """D.values() -> list of D's values"""
- return list(self.itervalues())
-
- # Mappings are not hashable by default, but subclasses can change this
- __hash__ = None
-
- #TODO: implement __richcmp__
-
-
-cdef VariantRecordFilter makeVariantRecordFilter(VariantRecord record):
- if not record:
- raise ValueError('invalid VariantRecord')
-
- cdef VariantRecordFilter filter = VariantRecordFilter.__new__(VariantRecordFilter)
- filter.record = record
-
- return filter
-
-
-cdef class VariantRecordFormat(object):
- """Format data present for each sample in a :class:`VariantRecord` object,
- presented as a mapping from format name to :class:`VariantMetadata` object."""
-
- def __len__(self):
- cdef bcf_hdr_t *hdr = self.record.header.ptr
- cdef bcf1_t *r = self.record.ptr
- cdef int i, n = 0
-
- for i in range(r.n_fmt):
- if r.d.fmt[i].p:
- n += 1
- return n
-
- def __bool__(self):
- cdef bcf_hdr_t *hdr = self.record.header.ptr
- cdef bcf1_t *r = self.record.ptr
- cdef int i
-
- for i in range(r.n_fmt):
- if r.d.fmt[i].p:
- return True
- return False
-
- def __getitem__(self, key):
- cdef bcf_hdr_t *hdr = self.record.header.ptr
- cdef bcf1_t *r = self.record.ptr
-
- bkey = force_bytes(key)
- cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey)
-
- if not fmt or not fmt.p:
- raise KeyError('unknown format')
-
- return makeVariantMetadata(self.record.header, BCF_HL_FMT, fmt.id)
-
- def __delitem__(self, key):
- cdef bcf_hdr_t *hdr = self.record.header.ptr
- cdef bcf1_t *r = self.record.ptr
-
- bkey = force_bytes(key)
- cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey)
-
- if not fmt or not fmt.p:
- raise KeyError('unknown format')
-
- if bcf_update_format(hdr, r, bkey, fmt.p, 0, fmt.type) < 0:
- raise ValueError('Unable to delete FORMAT')
-
- def clear(self):
- """Clear all formats for all samples within the associated
- :class:`VariantRecord` instance"""
- cdef bcf_hdr_t *hdr = self.record.header.ptr
- cdef bcf1_t *r = self.record.ptr
- cdef bcf_fmt_t *fmt
- cdef const char *key
- cdef int i
-
- for i in reversed(range(r.n_fmt)):
- fmt = &r.d.fmt[i]
- if fmt.p:
- key = bcf_hdr_int2id(hdr, BCF_DT_ID, fmt.id)
- if bcf_update_format(hdr, r, key, fmt.p, 0, fmt.type) < 0:
- raise ValueError('Unable to delete FORMAT')
-
- def __iter__(self):
- cdef bcf_hdr_t *hdr = self.record.header.ptr
- cdef bcf1_t *r = self.record.ptr
- cdef bcf_fmt_t *fmt
- cdef int i
-
- for i in range(r.n_fmt):
- fmt = &r.d.fmt[i]
- if fmt.p:
- yield bcf_str_cache_get_charptr(bcf_hdr_int2id(hdr, BCF_DT_ID, fmt.id))
-
- def get(self, key, default=None):
- """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
- try:
- return self[key]
- except KeyError:
- return default
-
- def __contains__(self, key):
- cdef bcf_hdr_t *hdr = self.record.header.ptr
- cdef bcf1_t *r = self.record.ptr
- bkey = force_bytes(key)
- cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey)
- return fmt != NULL and fmt.p != NULL
-
- def iterkeys(self):
- """D.iterkeys() -> an iterator over the keys of D"""
- return iter(self)
-
- def itervalues(self):
- """D.itervalues() -> an iterator over the values of D"""
- for key in self:
- yield self[key]
-
- def iteritems(self):
- """D.iteritems() -> an iterator over the (key, value) items of D"""
- for key in self:
- yield (key, self[key])
-
- def keys(self):
- """D.keys() -> list of D's keys"""
- return list(self)
-
- def items(self):
- """D.items() -> list of D's (key, value) pairs, as 2-tuples"""
- return list(self.iteritems())
-
- def values(self):
- """D.values() -> list of D's values"""
- return list(self.itervalues())
-
- # Mappings are not hashable by default, but subclasses can change this
- __hash__ = None
-
- #TODO: implement __richcmp__
-
-
-cdef VariantRecordFormat makeVariantRecordFormat(VariantRecord record):
- if not record:
- raise ValueError('invalid VariantRecord')
-
- cdef VariantRecordFormat format = VariantRecordFormat.__new__(
- VariantRecordFormat)
- format.record = record
-
- return format
-
-
-#TODO: Add a getmeta method to return the corresponding VariantMetadata?
-cdef class VariantRecordInfo(object):
- """Info data stored in a :class:`VariantRecord` object, presented as a
- mapping from info metadata name to value."""
-
- def __len__(self):
- return self.record.ptr.n_info
-
- def __bool__(self):
- return self.record.ptr.n_info != 0
-
- def __getitem__(self, key):
- cdef bcf_hdr_t *hdr = self.record.header.ptr
- cdef bcf1_t *r = self.record.ptr
- cdef vdict_t *d
- cdef khiter_t k
- cdef int info_id
-
- if bcf_unpack(r, BCF_UN_INFO) < 0:
- raise ValueError('Error unpacking VariantRecord')
-
- bkey = force_bytes(key)
- cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey)
-
- if not info:
- d = <vdict_t *>hdr.dict[BCF_DT_ID]
- k = kh_get_vdict(d, bkey)
-
- if k == kh_end(d) or kh_val_vdict(d, k).info[BCF_HL_INFO] & 0xF == 0xF:
- raise KeyError('Unknown INFO field: {}'.format(key))
-
- info_id = kh_val_vdict(d, k).id
- else:
- info_id = info.key
-
- if bcf_hdr_id2type(hdr, BCF_HL_INFO, info_id) == BCF_HT_FLAG:
- return info != NULL and info.vptr != NULL
-
- if not info or not info.vptr:
- raise KeyError('Invalid INFO field: {}'.format(key))
-
- return bcf_info_get_value(self.record, info)
-
- def __setitem__(self, key, value):
- bcf_info_set_value(self.record, key, value)
-
- def __delitem__(self, key):
- cdef bcf_hdr_t *hdr = self.record.header.ptr
- cdef bcf1_t *r = self.record.ptr
-
- if bcf_unpack(r, BCF_UN_INFO) < 0:
- raise ValueError('Error unpacking VariantRecord')
-
- bkey = force_bytes(key)
- cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey)
-
- if not info or not info.vptr:
- raise KeyError('Unknown INFO field: {}'.format(key))
-
- if bcf_update_info(hdr, r, bkey, NULL, 0, info.type) < 0:
- raise ValueError('Unable to delete INFO')
-
- def clear(self):
- """Clear all info data"""
- cdef bcf_hdr_t *hdr = self.record.header.ptr
- cdef bcf1_t *r = self.record.ptr
- cdef bcf_info_t *info
- cdef const char *key
- cdef int i
-
- if bcf_unpack(r, BCF_UN_INFO) < 0:
- raise ValueError('Error unpacking VariantRecord')
-
- for i in range(r.n_info):
- info = &r.d.info[i]
- if info and info.vptr:
- key = bcf_hdr_int2id(hdr, BCF_DT_ID, info.key)
- if bcf_update_info(hdr, r, key, NULL, 0, info.type) < 0:
- raise ValueError('Unable to delete INFO')
-
- def __iter__(self):
- cdef bcf_hdr_t *hdr = self.record.header.ptr
- cdef bcf1_t *r = self.record.ptr
- cdef bcf_info_t *info
- cdef int i
-
- for i in range(r.n_info):
- info = &r.d.info[i]
- if info and info.vptr:
- yield bcf_str_cache_get_charptr(bcf_hdr_int2id(hdr, BCF_DT_ID, info.key))
-
- def get(self, key, default=None):
- """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
- try:
- return self[key]
- except KeyError:
- return default
-
- def __contains__(self, key):
- cdef bcf_hdr_t *hdr = self.record.header.ptr
- cdef bcf1_t *r = self.record.ptr
-
- if bcf_unpack(r, BCF_UN_INFO) < 0:
- raise ValueError('Error unpacking VariantRecord')
-
- bkey = force_bytes(key)
- cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey)
-
- return info != NULL
-
- def iterkeys(self):
- """D.iterkeys() -> an iterator over the keys of D"""
- return iter(self)
-
- def itervalues(self):
- """D.itervalues() -> an iterator over the values of D"""
- cdef bcf1_t *r = self.record.ptr
- cdef bcf_info_t *info
- cdef int i
-
- for i in range(r.n_info):
- info = &r.d.info[i]
- if info and info.vptr:
- yield bcf_info_get_value(self.record, info)
-
- def iteritems(self):
- """D.iteritems() -> an iterator over the (key, value) items of D"""
- cdef bcf_hdr_t *hdr = self.record.header.ptr
- cdef bcf1_t *r = self.record.ptr
- cdef bcf_info_t *info
- cdef int i
-
- for i in range(r.n_info):
- info = &r.d.info[i]
- if info and info.vptr:
- key = bcf_hdr_int2id(hdr, BCF_DT_ID, info.key)
- value = bcf_info_get_value(self.record, info)
- yield bcf_str_cache_get_charptr(key), value
-
- def keys(self):
- """D.keys() -> list of D's keys"""
- return list(self)
-
- def items(self):
- """D.items() -> list of D's (key, value) pairs, as 2-tuples"""
- return list(self.iteritems())
-
- def values(self):
- """D.values() -> list of D's values"""
- return list(self.itervalues())
-
- # Mappings are not hashable by default, but subclasses can change this
- __hash__ = None
-
- #TODO: implement __richcmp__
-
-
-cdef VariantRecordInfo makeVariantRecordInfo(VariantRecord record):
- if not record:
- raise ValueError('invalid VariantRecord')
-
- cdef VariantRecordInfo info = VariantRecordInfo.__new__(VariantRecordInfo)
- info.record = record
-
- return info
-
-
-cdef class VariantRecordSamples(object):
- """mapping from sample index or name to :class:`VariantRecordSample` object."""
-
- def __len__(self):
- return bcf_hdr_nsamples(self.record.header.ptr)
-
- def __bool__(self):
- return bcf_hdr_nsamples(self.record.header.ptr) != 0
-
- def __getitem__(self, key):
- cdef bcf_hdr_t *hdr = self.record.header.ptr
- cdef bcf1_t *r = self.record.ptr
- cdef int n = bcf_hdr_nsamples(hdr)
- cdef int sample_index
- cdef vdict_t *d
- cdef khiter_t k
-
- if isinstance(key, int):
- sample_index = key
- else:
- bkey = force_bytes(key)
- sample_index = bcf_hdr_id2int(hdr, BCF_DT_SAMPLE, bkey)
- if sample_index < 0:
- raise KeyError('invalid sample name')
-
- if sample_index < 0 or sample_index >= n:
- raise IndexError('invalid sample index')
-
- return makeVariantRecordSample(self.record, sample_index)
-
- def __iter__(self):
- cdef bcf_hdr_t *hdr = self.record.header.ptr
- cdef bcf1_t *r = self.record.ptr
- cdef int32_t i, n = bcf_hdr_nsamples(hdr)
-
- for i in range(n):
- yield charptr_to_str(hdr.samples[i])
-
- def get(self, key, default=None):
- """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
- try:
- return self[key]
- except KeyError:
- return default
-
- def __contains__(self, key):
- cdef bcf_hdr_t *hdr = self.record.header.ptr
- cdef bcf1_t *r = self.record.ptr
- cdef int n = bcf_hdr_nsamples(hdr)
- cdef int sample_index
- cdef vdict_t *d
- cdef khiter_t k
-
- if isinstance(key, int):
- sample_index = key
- else:
- bkey = force_bytes(key)
- sample_index = bcf_hdr_id2int(hdr, BCF_DT_SAMPLE, bkey)
- if sample_index < 0:
- return False # unknown sample names are simply not contained
-
- return 0 <= sample_index < n
-
- def iterkeys(self):
- """D.iterkeys() -> an iterator over the keys of D"""
- return iter(self)
-
- def itervalues(self):
- """D.itervalues() -> an iterator over the values of D"""
- cdef bcf_hdr_t *hdr = self.record.header.ptr
- cdef bcf1_t *r = self.record.ptr
- cdef int32_t i, n = bcf_hdr_nsamples(hdr)
-
- for i in range(n):
- yield makeVariantRecordSample(self.record, i)
-
- def iteritems(self):
- """D.iteritems() -> an iterator over the (key, value) items of D"""
- cdef bcf_hdr_t *hdr = self.record.header.ptr
- cdef bcf1_t *r = self.record.ptr
- cdef int32_t i, n = bcf_hdr_nsamples(hdr)
-
- for i in range(n):
- yield (charptr_to_str(hdr.samples[i]), makeVariantRecordSample(self.record, i))
-
- def keys(self):
- """D.keys() -> list of D's keys"""
- return list(self)
-
- def items(self):
- """D.items() -> list of D's (key, value) pairs, as 2-tuples"""
- return list(self.iteritems())
-
- def values(self):
- """D.values() -> list of D's values"""
- return list(self.itervalues())
-
- # Mappings are not hashable by default, but subclasses can change this
- __hash__ = None
-
- #TODO: implement __richcmp__
-
-
-cdef VariantRecordSamples makeVariantRecordSamples(VariantRecord record):
- if not record:
- raise ValueError('invalid VariantRecord')
-
- cdef VariantRecordSamples samples = VariantRecordSamples.__new__(
- VariantRecordSamples)
- samples.record = record
-
- return samples
-
-
-cdef class VariantRecord(object):
- """Variant record"""
-
- def __dealloc__(self):
- if self.ptr:
- bcf_destroy1(self.ptr)
- self.ptr = NULL
-
- property rid:
- """internal reference id number"""
- def __get__(self):
- return self.ptr.rid
- def __set__(self, rid):
- cdef bcf_hdr_t *hdr = self.header.ptr
- cdef int r = rid
- if r < 0 or r >= hdr.n[BCF_DT_CTG] or not hdr.id[BCF_DT_CTG][r].val:
- raise ValueError('invalid reference id')
- self.ptr.rid = r
-
- property chrom:
- """chromosome/contig name"""
- def __get__(self):
- return bcf_str_cache_get_charptr(bcf_hdr_id2name(self.header.ptr, self.ptr.rid))
- def __set__(self, chrom):
- cdef vdict_t *d = <vdict_t*>self.header.ptr.dict[BCF_DT_CTG]
- bchrom = force_bytes(chrom)
- cdef khint_t k = kh_get_vdict(d, bchrom)
- if k == kh_end(d):
- raise ValueError('Invalid chromosome/contig')
- self.ptr.rid = kh_val_vdict(d, k).id
-
- property contig:
- """chromosome/contig name"""
- def __get__(self):
- return bcf_str_cache_get_charptr(bcf_hdr_id2name(self.header.ptr, self.ptr.rid))
- def __set__(self, chrom):
- cdef vdict_t *d = <vdict_t*>self.header.ptr.dict[BCF_DT_CTG]
- bchrom = force_bytes(chrom)
- cdef khint_t k = kh_get_vdict(d, bchrom)
- if k == kh_end(d):
- raise ValueError('Invalid chromosome/contig')
- self.ptr.rid = kh_val_vdict(d, k).id
-
- property pos:
- """record start position on chrom/contig (1-based inclusive)"""
- def __get__(self):
- return self.ptr.pos + 1
- def __set__(self, pos):
- if pos < 1:
- raise ValueError('Position must be positive')
- # FIXME: check start <= stop?
- # KBJ: Can't or else certain mutating operations will become
- # difficult or impossible. e.g. having to delete
- # info['END'] before being able to reset pos is going to
- # create subtle bugs. Better to check this when writing
- # records.
- self.ptr.pos = pos - 1
-
- property start:
- """record start position on chrom/contig (0-based inclusive)"""
- def __get__(self):
- return self.ptr.pos
- def __set__(self, start):
- if start < 0:
- raise ValueError('Start coordinate must be non-negative')
- # FIXME: check start <= stop?
- # KBJ: See above.
- self.ptr.pos = start
-
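- # Coordinate conventions (worked example): pos is 1-based inclusive and
- # start is its 0-based twin, so rec.pos == rec.start + 1 always holds;
- # setting rec.start = 99 makes rec.pos report 100. The stop/rlen
- # properties below keep stop == start + rlen (0-based exclusive).
-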
- property stop:
- """record stop position on chrom/contig (0-based exclusive)"""
- def __get__(self):
- return self.ptr.pos + self.ptr.rlen
- def __set__(self, stop):
- if stop < self.ptr.pos:
- raise ValueError('Stop coordinate must be greater than or equal to start')
- self.ptr.rlen = stop - self.ptr.pos
-
- property rlen:
- """record length on chrom/contig (typically rec.stop - rec.start unless END info is supplied)"""
- def __get__(self):
- return self.ptr.rlen
- def __set__(self, rlen):
- if rlen < 0:
- raise ValueError('Reference length must be non-negative')
- self.ptr.rlen = rlen
-
- property qual:
- """phred scaled quality score or None if not available"""
- def __get__(self):
- return self.ptr.qual if not bcf_float_is_missing(self.ptr.qual) else None
- def __set__(self, qual):
- if qual is not None:
- self.ptr.qual = qual
- else:
- bcf_float_set(&self.ptr.qual, bcf_float_missing)
-
-# property n_allele:
-# def __get__(self):
-# return self.ptr.n_allele
-
-# property n_sample:
-# def __get__(self):
-# return self.ptr.n_sample
-
- property id:
- """record identifier or None if not available"""
- def __get__(self):
- cdef bcf1_t *r = self.ptr
- if bcf_unpack(r, BCF_UN_STR) < 0:
- raise ValueError('Error unpacking VariantRecord')
- return bcf_str_cache_get_charptr(r.d.id) if r.d.id != b'.' else None
- def __set__(self, id):
- cdef bcf1_t *r = self.ptr
- if bcf_unpack(r, BCF_UN_STR) < 0:
- raise ValueError('Error unpacking VariantRecord')
- cdef char *idstr = NULL
- if id is not None:
- bid = force_bytes(id)
- idstr = bid
- if bcf_update_id(self.header.ptr, self.ptr, idstr) < 0:
- raise ValueError('Error updating id')
-
- property ref:
- """reference allele"""
- def __get__(self):
- cdef bcf1_t *r = self.ptr
- if bcf_unpack(r, BCF_UN_STR) < 0:
- raise ValueError('Error unpacking VariantRecord')
- return charptr_to_str(r.d.allele[0]) if r.d.allele else None
- def __set__(self, ref):
- cdef bcf1_t *r = self.ptr
- if bcf_unpack(r, BCF_UN_STR) < 0:
- raise ValueError('Error unpacking VariantRecord')
- #FIXME: Set alleles directly -- this is stupid
- if not ref:
- raise ValueError('ref allele cannot be null')
- ref = force_bytes(ref)
- if r.d.allele and r.n_allele:
- alleles = [r.d.allele[i] for i in range(r.n_allele)]
- alleles[0] = ref
- else:
- alleles = [ref]
- self.alleles = alleles
-
- property alleles:
- """tuple of reference allele followed by alt alleles"""
- def __get__(self):
- cdef bcf1_t *r = self.ptr
- if bcf_unpack(r, BCF_UN_STR) < 0:
- raise ValueError('Error unpacking VariantRecord')
- if not r.d.allele:
- return None
- cdef tuple res = PyTuple_New(r.n_allele)
- for i in range(r.n_allele):
- a = charptr_to_str(r.d.allele[i])
- PyTuple_SET_ITEM(res, i, a)
- Py_INCREF(a)
- return res
- def __set__(self, values):
- cdef bcf1_t *r = self.ptr
- if bcf_unpack(r, BCF_UN_STR) < 0:
- raise ValueError('Error unpacking VariantRecord')
- values = [force_bytes(v) for v in values]
- if b'' in values:
- raise ValueError('cannot set null allele')
- values = b','.join(values)
- if bcf_update_alleles_str(self.header.ptr, r, values) < 0:
- raise ValueError('Error updating alleles')
-
- property alts:
- """tuple of alt alleles"""
- def __get__(self):
- cdef bcf1_t *r = self.ptr
- if bcf_unpack(r, BCF_UN_STR) < 0:
- raise ValueError('Error unpacking VariantRecord')
- if r.n_allele < 2 or not r.d.allele:
- return None
- cdef tuple res = PyTuple_New(r.n_allele - 1)
- for i in range(1, r.n_allele):
- a = charptr_to_str(r.d.allele[i])
- PyTuple_SET_ITEM(res, i - 1, a)
- Py_INCREF(a)
- return res
- def __set__(self, values):
- #FIXME: Set alleles directly -- this is stupid
- cdef bcf1_t *r = self.ptr
- if bcf_unpack(r, BCF_UN_STR) < 0:
- raise ValueError('Error unpacking VariantRecord')
- values = [force_bytes(v) for v in values]
- if b'' in values:
- raise ValueError('cannot set null alt allele')
- ref = [r.d.allele[0] if r.d.allele and r.n_allele else b'.']
- self.alleles = ref + values
-
- property filter:
- """filter information (see :class:`VariantRecordFilter`)"""
- def __get__(self):
- if bcf_unpack(self.ptr, BCF_UN_FLT) < 0:
- raise ValueError('Error unpacking VariantRecord')
- return makeVariantRecordFilter(self)
-
- property info:
- """info data (see :class:`VariantRecordInfo`)"""
- def __get__(self):
- if bcf_unpack(self.ptr, BCF_UN_INFO) < 0:
- raise ValueError('Error unpacking VariantRecord')
- return makeVariantRecordInfo(self)
-
- property format:
- """sample format metadata (see :class:`VariantRecordFormat`)"""
- def __get__(self):
- if bcf_unpack(self.ptr, BCF_UN_FMT) < 0:
- raise ValueError('Error unpacking VariantRecord')
- return makeVariantRecordFormat(self)
-
- property samples:
- """sample data (see :class:`VariantRecordSamples`)"""
- def __get__(self):
- if bcf_unpack(self.ptr, BCF_UN_ALL) < 0:
- raise ValueError('Error unpacking VariantRecord')
- return makeVariantRecordSamples(self)
-
- def __str__(self):
- cdef kstring_t line
- cdef char c
-
- line.l = line.m = 0
- line.s = NULL
-
- if vcf_format(self.header.ptr, self.ptr, &line) < 0:
- if line.m:
- free(line.s)
- raise ValueError('vcf_format failed')
-
- # Strip CR/LF?
- #while line.l:
- # c = line.s[line.l - 1]
- # if c != b'\n' and c != b'\r':
- # break
- # line.l -= 1
-
- ret = charptr_to_str_w_len(line.s, line.l)
-
- if line.m:
- free(line.s)
-
- return ret
-
-
-cdef VariantRecord makeVariantRecord(VariantHeader header, bcf1_t *r):
- if not header:
- raise ValueError('invalid VariantHeader')
-
- if not r:
- raise ValueError('cannot create VariantRecord')
-
- cdef VariantRecord record = VariantRecord.__new__(VariantRecord)
- record.header = header
- record.ptr = r
-
- return record
-
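-# A minimal, illustrative sketch (hypothetical filename) of the coordinate
-# conventions documented above: ``pos`` is 1-based while ``start``/``stop``
-# are 0-based half-open, and ``rlen`` matches ``stop - start`` unless an
-# INFO/END field overrides it:
-def _example_coordinates(path='example.vcf.gz'):
-    vcf = VariantFile(path)
-    rec = next(vcf)
-    assert rec.pos == rec.start + 1
-    assert rec.stop == rec.start + rec.rlen
-    return rec.chrom, rec.pos, rec.ref, rec.alts
-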
-
-########################################################################
-########################################################################
-## Variant Sample objects
-########################################################################
-
-
-cdef class VariantRecordSample(object):
- """Data for a single sample from a :class:`VariantRecord` object.
- Provides data accessors for genotypes and a mapping interface
- from format name to values.
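-
-    Example (an illustrative sketch; the sample name and values are
-    hypothetical)::
-
-        sample = record.samples['NA00001']
-        sample['GT'] = (0, 1)        # genotype as allele indices
-        sample.phased = True
-        print(sample.allele_indices, sample.alleles)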
- """
-
- property name:
- """sample name"""
- def __get__(self):
- cdef bcf_hdr_t *hdr = self.record.header.ptr
- cdef bcf1_t *r = self.record.ptr
- cdef int32_t n = bcf_hdr_nsamples(hdr)
-
- if self.index < 0 or self.index >= n:
- raise ValueError('invalid sample index')
-
- return charptr_to_str(hdr.samples[self.index])
-
- property allele_indices:
- """allele indices for called genotype, if present. Otherwise None"""
- def __get__(self):
- return bcf_format_get_allele_indices(self)
- def __set__(self, values):
- self['GT'] = values
- def __del__(self):
- self['GT'] = ()
-
- property alleles:
- """alleles for called genotype, if present. Otherwise None"""
- def __get__(self):
- return bcf_format_get_alleles(self)
- def __set__(self, values):
- self['GT'] = values
- def __del__(self):
- self['GT'] = ()
-
- property phased:
- """False if genotype is missing or any allele is unphased. Otherwise True."""
- def __get__(self):
- return bcf_sample_get_phased(self)
- def __set__(self, value):
- bcf_sample_set_phased(self, value)
-
- def __len__(self):
- cdef bcf_hdr_t *hdr = self.record.header.ptr
- cdef bcf1_t *r = self.record.ptr
- cdef int i, n = 0
-
- if bcf_unpack(r, BCF_UN_FMT) < 0:
- raise ValueError('Error unpacking VariantRecord')
-
- for i in range(r.n_fmt):
- if r.d.fmt[i].p:
- n += 1
- return n
-
- def __bool__(self):
- cdef bcf_hdr_t *hdr = self.record.header.ptr
- cdef bcf1_t *r = self.record.ptr
- cdef int i
-
- if bcf_unpack(r, BCF_UN_FMT) < 0:
- raise ValueError('Error unpacking VariantRecord')
-
- for i in range(r.n_fmt):
- if r.d.fmt[i].p:
- return True
- return False
-
- def __getitem__(self, key):
- return bcf_format_get_value(self, key)
-
- def __setitem__(self, key, value):
- bcf_format_set_value(self, key, value)
-
- def __delitem__(self, key):
- bcf_format_del_value(self, key)
-
- def clear(self):
- """Clear all format data (including genotype) for this sample"""
- cdef bcf_hdr_t *hdr = self.record.header.ptr
- cdef bcf1_t *r = self.record.ptr
- cdef bcf_fmt_t *fmt
- cdef int i
-
- for i in range(r.n_fmt):
- fmt = &r.d.fmt[i]
- if fmt.p:
- bcf_format_del_value(self, bcf_hdr_int2id(hdr, BCF_DT_ID, fmt.id))
-
- def __iter__(self):
- cdef bcf_hdr_t *hdr = self.record.header.ptr
- cdef bcf1_t *r = self.record.ptr
- cdef bcf_fmt_t *fmt
- cdef int i
-
- for i in range(r.n_fmt):
- fmt = &r.d.fmt[i]
- if r.d.fmt[i].p:
- yield bcf_str_cache_get_charptr(bcf_hdr_int2id(hdr, BCF_DT_ID, fmt.id))
-
- def get(self, key, default=None):
- """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
- try:
- return self[key]
- except KeyError:
- return default
-
- def __contains__(self, key):
- cdef bcf_hdr_t *hdr = self.record.header.ptr
- cdef bcf1_t *r = self.record.ptr
- bkey = force_bytes(key)
- cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey)
- return fmt != NULL and fmt.p != NULL
-
- def iterkeys(self):
- """D.iterkeys() -> an iterator over the keys of D"""
- return iter(self)
-
- def itervalues(self):
- """D.itervalues() -> an iterator over the values of D"""
- for key in self:
- yield self[key]
-
- def iteritems(self):
- """D.iteritems() -> an iterator over the (key, value) items of D"""
- for key in self:
- yield (key, self[key])
-
- def keys(self):
- """D.keys() -> list of D's keys"""
- return list(self)
-
- def items(self):
- """D.items() -> list of D's (key, value) pairs, as 2-tuples"""
- return list(self.iteritems())
-
- def values(self):
- """D.values() -> list of D's values"""
- return list(self.itervalues())
-
- # Mappings are not hashable by default, but subclasses can change this
- __hash__ = None
-
- #TODO: implement __richcmp__
-
-
-cdef VariantRecordSample makeVariantRecordSample(VariantRecord record, int32_t sample_index):
- if not record or sample_index < 0:
- raise ValueError('cannot create VariantRecordSample')
-
- cdef VariantRecordSample sample = VariantRecordSample.__new__(VariantRecordSample)
- sample.record = record
- sample.index = sample_index
-
- return sample
-
-
-########################################################################
-########################################################################
-## Index objects
-########################################################################
-
-
-cdef class BaseIndex(object):
- def __init__(self):
- self.refs = ()
-        self.refmap = {}
-
- def __len__(self):
- return len(self.refs)
-
- def __bool__(self):
- return len(self.refs) != 0
-
- def __getitem__(self, key):
- if isinstance(key, int):
- return self.refs[key]
- else:
- return self.refmap[key]
-
- def __iter__(self):
- return iter(self.refs)
-
- def get(self, key, default=None):
- """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
- try:
- return self[key]
- except KeyError:
- return default
-
- def __contains__(self, key):
- try:
- self[key]
- except KeyError:
- return False
- else:
- return True
-
- def iterkeys(self):
- """D.iterkeys() -> an iterator over the keys of D"""
- return iter(self)
-
- def itervalues(self):
- """D.itervalues() -> an iterator over the values of D"""
- for key in self:
- yield self[key]
-
- def iteritems(self):
- """D.iteritems() -> an iterator over the (key, value) items of D"""
- for key in self:
- yield (key, self[key])
-
- def keys(self):
- """D.keys() -> list of D's keys"""
- return list(self)
-
- def items(self):
- """D.items() -> list of D's (key, value) pairs, as 2-tuples"""
- return list(self.iteritems())
-
- def values(self):
- """D.values() -> list of D's values"""
- return list(self.itervalues())
-
- # Mappings are not hashable by default, but subclasses can change this
- __hash__ = None
-
- #TODO: implement __richcmp__
-
-
-cdef class BCFIndex(object):
- """CSI index data structure for BCF files"""
- def __init__(self):
- self.refs = ()
- self.refmap = {}
-
- if not self.ptr:
- raise ValueError('Invalid index object')
-
- cdef int n
- cdef const char **refs = bcf_index_seqnames(self.ptr, self.header.ptr, &n)
-
- if not refs:
- raise ValueError('Cannot retrieve reference sequence names')
-
- self.refs = char_array_to_tuple(refs, n, free_after=1)
- self.refmap = { r:i for i,r in enumerate(self.refs) }
-
- def __dealloc__(self):
- if self.ptr:
- hts_idx_destroy(self.ptr)
- self.ptr = NULL
-
- def fetch(self, bcf, contig, start, stop, region, reopen):
- return BCFIterator(bcf, contig, start, stop, region, reopen)
-
-
-cdef BCFIndex makeBCFIndex(VariantHeader header, hts_idx_t *idx):
- if not idx:
- return None
-
- if not header:
- raise ValueError('invalid VariantHeader')
-
- cdef BCFIndex index = BCFIndex.__new__(BCFIndex)
- index.header = header
- index.ptr = idx
- index.__init__()
-
- return index
-
-
-cdef class TabixIndex(BaseIndex):
- """Tabix index data structure for VCF files"""
- def __init__(self):
- self.refs = ()
- self.refmap = {}
-
- if not self.ptr:
- raise ValueError('Invalid index object')
-
- cdef int n
- cdef const char **refs = tbx_seqnames(self.ptr, &n)
-
- if not refs:
- raise ValueError('Cannot retrieve reference sequence names')
-
- self.refs = char_array_to_tuple(refs, n, free_after=1)
- self.refmap = { r:i for i,r in enumerate(self.refs) }
-
- def __dealloc__(self):
- if self.ptr:
- tbx_destroy(self.ptr)
- self.ptr = NULL
-
- def fetch(self, bcf, contig, start, stop, region, reopen):
- return TabixIterator(bcf, contig, start, stop, region, reopen)
-
-
-cdef TabixIndex makeTabixIndex(tbx_t *idx):
- if not idx:
- return None
-
- cdef TabixIndex index = TabixIndex.__new__(TabixIndex)
- index.ptr = idx
- index.__init__()
-
- return index
-
-
-########################################################################
-########################################################################
-## Iterators
-########################################################################
-
-
-cdef class BaseIterator(object):
- pass
-
-
-# Internal function to clean up after iteration stop or failure.
-# This would be a nested function if it weren't a cdef function.
-cdef void _stop_BCFIterator(BCFIterator self, bcf1_t *record):
- bcf_destroy1(record)
-
- # destroy iter so future calls to __next__ raise StopIteration
- bcf_itr_destroy(self.iter)
- self.iter = NULL
-
-
-cdef class BCFIterator(BaseIterator):
- def __init__(self, VariantFile bcf, contig=None, start=None, stop=None, region=None, reopen=True):
-
- if not isinstance(bcf.index, BCFIndex):
- raise ValueError('bcf index required')
-
- cdef BCFIndex index = bcf.index
- cdef int rid, cstart, cstop
- cdef char *cregion
-
- if not index:
- raise ValueError('bcf index required')
-
- if reopen:
- bcf = bcf.copy()
-
- if region is not None:
- if contig is not None or start is not None or stop is not None:
-                raise ValueError('cannot specify both region and contig/start/stop')
-
- bregion = force_bytes(region)
- cregion = bregion
- with nogil:
- self.iter = bcf_itr_querys(index.ptr, bcf.header.ptr, cregion)
- else:
- if contig is None:
-                raise ValueError('a contig or region must be specified')
-
- try:
- rid = index.refmap[contig]
- except KeyError:
-                raise ValueError('Unknown contig specified')
-
- if start is None:
- start = 0
- if stop is None:
- stop = MAX_POS
-
- cstart, cstop = start, stop
-
- with nogil:
- self.iter = bcf_itr_queryi(index.ptr, rid, cstart, cstop)
-
- # Do not fail on self.iter == NULL, since it signifies a null query.
-
- self.bcf = bcf
- self.index = index
-
- def __dealloc__(self):
- if self.iter:
- bcf_itr_destroy(self.iter)
- self.iter = NULL
-
- def __iter__(self):
- return self
-
- def __next__(self):
- if not self.iter:
- raise StopIteration
-
- cdef bcf1_t *record = bcf_init1()
-
- record.pos = -1
- if self.bcf.drop_samples:
- record.max_unpack = BCF_UN_SHR
-
- cdef int ret
-
- with nogil:
- ret = bcf_itr_next(self.bcf.htsfile, self.iter, record)
-
- if ret < 0:
- _stop_BCFIterator(self, record)
- if ret == -1:
- raise StopIteration
- else:
- raise ValueError('error reading BCF file')
-
- ret = bcf_subset_format(self.bcf.header.ptr, record)
-
- if ret < 0:
- _stop_BCFIterator(self, record)
- raise ValueError('error in bcf_subset_format')
-
- return makeVariantRecord(self.bcf.header, record)
-
-
-cdef class TabixIterator(BaseIterator):
- def __cinit__(self, *args, **kwargs):
- self.line_buffer.l = 0
- self.line_buffer.m = 0
- self.line_buffer.s = NULL
-
- def __init__(self, VariantFile bcf, contig=None, start=None, stop=None, region=None, reopen=True):
- if not isinstance(bcf.index, TabixIndex):
- raise ValueError('tabix index required')
-
- cdef TabixIndex index = bcf.index
-
- if not index:
- raise ValueError('bcf index required')
-
- if reopen:
- bcf = bcf.copy()
-
- if region is not None:
- if contig is not None or start is not None or stop is not None:
-                raise ValueError('cannot specify both region and contig/start/stop')
-
-            bregion = force_bytes(region)
-            self.iter = tbx_itr_querys(index.ptr, bregion)
- else:
- if contig is None:
-                raise ValueError('a contig or region must be specified')
-
- rid = index.refmap.get(contig, -1)
-
- if start is None:
- start = 0
- if stop is None:
- stop = MAX_POS
-
- self.iter = tbx_itr_queryi(index.ptr, rid, start, stop)
-
- # Do not fail on self.iter == NULL, since it signifies a null query.
-
- self.bcf = bcf
- self.index = index
-
- def __dealloc__(self):
- if self.iter:
- tbx_itr_destroy(self.iter)
- self.iter = NULL
-
- if self.line_buffer.m:
- free(self.line_buffer.s)
-
- self.line_buffer.l = 0
- self.line_buffer.m = 0
- self.line_buffer.s = NULL
-
- def __iter__(self):
- return self
-
- def __next__(self):
- if not self.iter:
- raise StopIteration
-
- cdef int ret
-
- with nogil:
- ret = tbx_itr_next(self.bcf.htsfile, self.index.ptr, self.iter, &self.line_buffer)
-
- if ret < 0:
- tbx_itr_destroy(self.iter)
- self.iter = NULL
- if ret == -1:
- raise StopIteration
- else:
- raise ValueError('error reading indexed VCF file')
-
- cdef bcf1_t *record = bcf_init1()
-
- record.pos = -1
- if self.bcf.drop_samples:
- record.max_unpack = BCF_UN_SHR
-
- ret = vcf_parse1(&self.line_buffer, self.bcf.header.ptr, record)
-
- # FIXME: stop iteration on parse failure?
- if ret < 0:
- bcf_destroy1(record)
- raise ValueError('error in vcf_parse')
-
- return makeVariantRecord(self.bcf.header, record)
-
-
-########################################################################
-########################################################################
-## Variant File
-########################################################################
-
-
-cdef class VariantFile(object):
- """*(filename, mode=None, index_filename=None, header=None, drop_samples=False)*
-
- A :term:`VCF`/:term:`BCF` formatted file. The file is automatically
- opened.
-
- *mode* should be ``r`` for reading or ``w`` for writing. The default is
- text mode (:term:`VCF`). For binary (:term:`BCF`) I/O you should append
- ``b`` for compressed or ``u`` for uncompressed :term:`BCF` output.
-
- If ``b`` is present, it must immediately follow ``r`` or ``w``. Valid
- modes are ``r``, ``w``, ``wh``, ``rb``, ``wb``, ``wbu`` and ``wb0``.
- For instance, to open a :term:`BCF` formatted file for reading, type::
-
- f = pysam.VariantFile('ex1.bcf','rb')
-
-    If mode is not specified, we will try to auto-detect it in the order
-    'rb', 'r'; thus both of the following should work::
-
- f1 = pysam.VariantFile('ex1.bcf')
- f2 = pysam.VariantFile('ex1.vcf')
-
-    If an index for a variant file exists (.csi or .tbi), it will be opened
-    automatically. Without an index, random access to records via
-    :meth:`fetch` is disabled.
-
- For writing, a :class:`VariantHeader` object must be provided, typically
- obtained from another :term:`VCF` file/:term:`BCF` file.
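-
-    A minimal round-trip sketch (illustrative; the filenames are
-    hypothetical)::
-
-        vcf_in = pysam.VariantFile('in.vcf.gz')
-        vcf_out = pysam.VariantFile('out.bcf', 'wb', header=vcf_in.header)
-        for record in vcf_in:
-            vcf_out.write(record)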
- """
- def __cinit__(self, *args, **kwargs):
- self.htsfile = NULL
-
- def __init__(self, *args, **kwargs):
- self.header = None
- self.index = None
- self.filename = None
- self.mode = None
- self.index_filename = None
- self.is_stream = False
- self.is_remote = False
- self.is_reading = False
- self.drop_samples = False
- self.start_offset = -1
-
- self.open(*args, **kwargs)
-
- def __dealloc__(self):
- if self.htsfile:
- hts_close(self.htsfile)
- self.htsfile = NULL
-
- def __enter__(self):
- return self
-
- def __exit__(self, exc_type, exc_value, traceback):
- self.close()
- return False
-
- property category:
- """General file format category. One of UNKNOWN, ALIGNMENTS,
- VARIANTS, INDEX, REGIONS"""
- def __get__(self):
- if not self.htsfile:
- raise ValueError('metadata not available on closed file')
- return FORMAT_CATEGORIES[self.htsfile.format.category]
-
- property format:
- """File format.
-
- One of UNKNOWN, BINARY_FORMAT, TEXT_FORMAT, SAM, BAM,
- BAI, CRAM, CRAI, VCF, BCF, CSI, GZI, TBI, BED.
- """
- def __get__(self):
- if not self.htsfile:
- raise ValueError('metadata not available on closed file')
- return FORMATS[self.htsfile.format.format]
-
- property version:
- """Tuple of file format version numbers (major, minor)"""
- def __get__(self):
- if not self.htsfile:
- raise ValueError('metadata not available on closed file')
- return (self.htsfile.format.version.major,
- self.htsfile.format.version.minor)
-
- property compression:
- """File compression.
-
- One of NONE, GZIP, BGZF, CUSTOM."""
- def __get__(self):
- if not self.htsfile:
- raise ValueError('metadata not available on closed file')
- return COMPRESSION[self.htsfile.format.compression]
-
- property description:
- """Vaguely human readable description of the file format"""
- def __get__(self):
- if not self.htsfile:
- raise ValueError('metadata not available on closed file')
- cdef char *desc = hts_format_description(&self.htsfile.format)
- try:
- return charptr_to_str(desc)
- finally:
- free(desc)
-
- def close(self):
- """closes the :class:`pysam.VariantFile`."""
- if self.htsfile:
- hts_close(self.htsfile)
- self.htsfile = NULL
- self.header = self.index = None
-
- property is_open:
- def __get__(self):
- """return True if VariantFile is open and in a valid state."""
- return self.htsfile != NULL
-
- def __iter__(self):
- if not self.is_open:
- raise ValueError('I/O operation on closed file')
-
- if not self.mode.startswith(b'r'):
-            raise ValueError(
-                'cannot iterate over a VariantFile opened for writing')
-
- self.is_reading = 1
- return self
-
- def __next__(self):
- cdef int ret
- cdef bcf1_t *record = bcf_init1()
-
- record.pos = -1
- if self.drop_samples:
- record.max_unpack = BCF_UN_SHR
-
- with nogil:
- ret = bcf_read1(self.htsfile, self.header.ptr, record)
-
- if ret < 0:
- bcf_destroy1(record)
- if ret == -1:
- raise StopIteration
- elif ret == -2:
- raise IOError('truncated file')
- else:
- raise ValueError('Variant read failed')
-
- return makeVariantRecord(self.header, record)
-
- def copy(self):
- if not self.is_open:
- raise ValueError
-
- cdef VariantFile vars = VariantFile.__new__(VariantFile)
- cdef bcf_hdr_t *hdr
- cdef char *cfilename
- cdef char *cmode
-
- # FIXME: re-open using fd or else header and index could be invalid
- cfilename, cmode = self.filename, self.mode
- with nogil:
- vars.htsfile = hts_open(cfilename, cmode)
-
- if not vars.htsfile:
- raise ValueError('Cannot re-open htsfile')
-
- # minimize overhead by re-using header and index. This approach is
- # currently risky, but see above for how this can be mitigated.
- vars.header = self.header
- vars.index = self.index
-
- vars.filename = self.filename
- vars.mode = self.mode
- vars.index_filename = self.index_filename
- vars.drop_samples = self.drop_samples
- vars.is_stream = self.is_stream
- vars.is_remote = self.is_remote
- vars.is_reading = self.is_reading
- vars.start_offset = self.start_offset
-
- if self.htsfile.is_bin:
- vars.seek(self.tell())
- else:
-            # advance the new handle past its copy of the header; the
-            # parsed duplicate is wrapped and immediately discarded so
-            # the underlying struct is freed (self.header is re-used)
-            with nogil:
-                hdr = bcf_hdr_read(vars.htsfile)
-            makeVariantHeader(hdr)
-
- return vars
-
- def open(self, filename, mode='rb',
- index_filename=None,
- VariantHeader header=None,
- drop_samples=False):
- """open a vcf/bcf file.
-
- If open is called on an existing VariantFile, the current file will be
- closed and a new file will be opened.
- """
- cdef bcf_hdr_t *hdr
- cdef BGZF *bgzfp
- cdef hts_idx_t *idx
- cdef tbx_t *tidx
- cdef char *cfilename
- cdef char *cindex_filename = NULL
- cdef char *cmode
-
- # close a previously opened file
- if self.is_open:
- self.close()
-
- if mode not in ('r','w','rb','wb', 'wh', 'wbu', 'rU', 'wb0'):
- raise ValueError('invalid file opening mode `{}`'.format(mode))
-
-        # for htslib, 'wbu' does not seem to work
- if mode == 'wbu':
- mode = 'wb0'
-
- self.mode = mode = force_bytes(mode)
- self.filename = filename = encode_filename(filename)
- if index_filename is not None:
- self.index_filename = index_filename = encode_filename(index_filename)
- else:
- self.index_filename = None
- self.drop_samples = bool(drop_samples)
- self.header = None
-
- self.is_remote = hisremote(filename)
- self.is_stream = filename == b'-'
-
- if mode.startswith(b'w'):
- # open file for writing
- if index_filename is not None:
- raise ValueError('Cannot specify an index filename when writing a VCF/BCF file')
-
- # header structure (used for writing)
- if header:
- self.header = header.copy()
- else:
- raise ValueError('a VariantHeader must be specified')
-
-            # open file; for VCF/BCF output the header is written
-            # explicitly via bcf_hdr_write() below
- cfilename, cmode = filename, mode
- with nogil:
- self.htsfile = hts_open(cfilename, cmode)
-
- if not self.htsfile:
- raise ValueError("could not open file `{}` (mode='{}')".format((filename, mode)))
-
- with nogil:
- bcf_hdr_write(self.htsfile, self.header.ptr)
-
- elif mode.startswith(b'r'):
- # open file for reading
- if filename != b'-' and not self.is_remote and not os.path.exists(filename):
- raise IOError('file `{}` not found'.format(filename))
-
- cfilename, cmode = filename, mode
- with nogil:
- self.htsfile = hts_open(cfilename, cmode)
-
- if not self.htsfile:
- raise ValueError("could not open file `{}` (mode='{}') - is it VCF/BCF format?".format(filename, mode))
-
- if self.htsfile.format.format not in (bcf, vcf):
- raise ValueError("invalid file `{}` (mode='{}') - is it VCF/BCF format?".format(filename, mode))
-
- if self.htsfile.format.compression == bgzf:
- bgzfp = hts_get_bgzfp(self.htsfile)
- if bgzfp and bgzf_check_EOF(bgzfp) == 0:
-                    warn('[{}] Warning: no BGZF EOF marker; file may be truncated'.format(filename))
-
- with nogil:
- hdr = bcf_hdr_read(self.htsfile)
-
- try:
- self.header = makeVariantHeader(hdr)
- except ValueError:
- raise ValueError("file `{}` does not have valid header (mode='{}') - is it VCF/BCF format?".format(filename, mode))
-
- # check for index and open if present
- if self.htsfile.format.format == bcf:
- if index_filename is not None:
- cindex_filename = index_filename
- with nogil:
- idx = bcf_index_load2(cfilename, cindex_filename)
- self.index = makeBCFIndex(self.header, idx)
-
- elif self.htsfile.format.compression == bgzf:
- if index_filename is not None:
- cindex_filename = index_filename
- with nogil:
- tidx = tbx_index_load2(cfilename, cindex_filename)
- self.index = makeTabixIndex(tidx)
-
- if not self.is_stream:
- self.start_offset = self.tell()
- else:
- raise ValueError("unknown mode {}".format(mode))
-
- def reset(self):
- """reset file position to beginning of file just after the header."""
-        return self.seek(self.start_offset)
-
- def seek(self, uint64_t offset):
- """move file pointer to position *offset*, see
- :meth:`pysam.VariantFile.tell`."""
- if not self.is_open:
- raise ValueError('I/O operation on closed file')
- if self.is_stream:
- raise OSError('seek not available in streams')
-
- cdef int64_t ret
- if self.htsfile.format.compression != no_compression:
- with nogil:
- ret = bgzf_seek(hts_get_bgzfp(self.htsfile), offset, SEEK_SET)
- else:
- with nogil:
- ret = hts_useek(self.htsfile, <int>offset, SEEK_SET)
- return ret
-
- def tell(self):
- """return current file position, see :meth:`pysam.VariantFile.seek`."""
- if not self.is_open:
- raise ValueError('I/O operation on closed file')
- if self.is_stream:
- raise OSError('tell not available in streams')
-
- cdef int64_t ret
- if self.htsfile.format.compression != no_compression:
- with nogil:
- ret = bgzf_tell(hts_get_bgzfp(self.htsfile))
- else:
- with nogil:
- ret = hts_utell(self.htsfile)
- return ret
-
- def fetch(self, contig=None, start=None, stop=None, region=None, reopen=False):
- """fetch records in a :term:`region` using 0-based indexing. The
-        region is specified by :term:`contig`, *start* and *stop*.
- Alternatively, a samtools :term:`region` string can be supplied.
-
-        Without *contig* or *region*, all records will be fetched. The
- records will be returned ordered by contig, which will not necessarily
- be the order within the file.
-
-        Set *reopen* to True if you will be using multiple iterators on the
-        same file at the same time. The iterator returned will then receive
-        its own copy of a filehandle to the file, effectively re-opening
-        the file. Re-opening a file incurs some overhead, so use with care.
-
- If only *contig* is set, all records on *contig* will be fetched.
- If both *region* and *contig* are given, an exception is raised.
-
- Note that a bgzipped :term:`VCF`.gz file without a tabix/CSI index
- (.tbi/.csi) or a :term:`BCF` file without a CSI index can only be
- read sequentially.
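-
-        Example (an illustrative sketch; the filename and contig are
-        hypothetical)::
-
-            vcf = pysam.VariantFile('example.vcf.gz')
-            for rec in vcf.fetch('chr1', 100000, 200000):
-                print(rec.contig, rec.pos, rec.alleles)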
- """
- if not self.is_open:
- raise ValueError('I/O operation on closed file')
-
-        if not self.mode.startswith(b'r'):
-            raise ValueError('cannot fetch from a VariantFile opened '
-                             'for writing')
-
- if contig is None and region is None:
- self.is_reading = 1
- bcf = self.copy() if reopen else self
- bcf.seek(self.start_offset)
- return iter(bcf)
-
- if not self.index:
- raise ValueError('fetch requires an index')
-
- self.is_reading = 1
- return self.index.fetch(self, contig, start, stop, region, reopen)
-
- cpdef int write(self, VariantRecord record) except -1:
- """
- write a single :class:`pysam.VariantRecord` to disk.
-
- returns the number of bytes written.
- """
-        if not self.is_open:
-            raise ValueError('I/O operation on closed file')
-
-        if not self.mode.startswith(b'w'):
-            raise ValueError('cannot write to a VariantFile opened for reading')
-
- #if record.header is not self.header:
- # raise ValueError('Writing records from a different VariantFile is not yet supported')
-
- cdef int ret
-
- with nogil:
- ret = bcf_write1(self.htsfile, self.header.ptr, record.ptr)
-
- if ret < 0:
- raise ValueError('write failed')
-
- return ret
-
- def subset_samples(self, include_samples):
- """
- Read only a subset of samples to reduce processing time and memory.
- Must be called prior to retrieving records.
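-
-        Example (illustrative; the filename and sample names are
-        hypothetical)::
-
-            vcf = pysam.VariantFile('example.vcf.gz')
-            vcf.subset_samples(['NA00001', 'NA00002'])
-            for rec in vcf:
-                print(list(rec.samples))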
- """
- if not self.is_open:
- raise ValueError('I/O operation on closed file')
-
-        if not self.mode.startswith(b'r'):
-            raise ValueError('cannot subset samples from a VariantFile '
-                             'opened for writing')
-
- if self.is_reading:
- raise ValueError('cannot subset samples after fetching records')
-
- self.header._subset_samples(include_samples)
-
- # potentially unnecessary optimization that also sets max_unpack
- if not include_samples:
- self.drop_samples = True
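-
-
-# A minimal, illustrative sketch (hypothetical filename) of the file
-# metadata properties exposed by VariantFile above:
-def _example_metadata(path='example.bcf'):
-    vf = VariantFile(path)
-    return vf.category, vf.format, vf.compression, vf.version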
+++ /dev/null
-from libc.stdint cimport int8_t, int16_t, int32_t, int64_t
-from libc.stdint cimport uint8_t, uint16_t, uint32_t, uint64_t
-from libc.stdlib cimport malloc, calloc, realloc, free
-from libc.string cimport memcpy, memcmp, strncpy, strlen, strdup
-from libc.stdio cimport FILE, printf
-cimport cython
-
-from cpython cimport array
-from pysam.chtslib cimport faidx_t, kstring_t, BGZF
-
-# These functions are put here and not in chtslib.pxd in order
-# to avoid warnings for unused functions.
-cdef extern from "pysam_stream.h" nogil:
-
- ctypedef struct kstream_t:
- pass
-
- ctypedef struct kseq_t:
- kstring_t name
- kstring_t comment
- kstring_t seq
- kstring_t qual
-
- kseq_t *kseq_init(BGZF *)
- int kseq_read(kseq_t *)
- void kseq_destroy(kseq_t *)
- kstream_t *ks_init(BGZF *)
- void ks_destroy(kstream_t *)
-
-    # Retrieve characters from stream until delimiter
-    # is reached, placing the result in str.
- int ks_getuntil(kstream_t *,
- int delimiter,
- kstring_t * str,
- int * dret)
-
-cdef class FastaFile:
- cdef bint is_remote
- cdef object _filename, _references, _lengths, reference2length
- cdef faidx_t* fastafile
- cdef char* _fetch(self, char* reference,
- int start, int end, int* length)
-
-
-cdef class FastqProxy:
- cdef kseq_t * _delegate
- cdef cython.str tostring(self)
- cpdef array.array get_quality_array(self, int offset=*)
-
-
-cdef class PersistentFastqProxy:
- """
- Python container for pysam.cfaidx.FastqProxy with persistence.
- """
- cdef public str comment, quality, sequence, name
- cdef cython.str tostring(self)
- cpdef array.array get_quality_array(self, int offset=*)
-
-
-cdef class FastxFile:
- cdef object _filename
- cdef BGZF * fastqfile
- cdef kseq_t * entry
- cdef bint persist
- cdef bint is_remote
-
- cdef kseq_t * getCurrent(self)
- cdef int cnext(self)
-
-
-# Compatibility Layer for pysam 0.8.1
-cdef class FastqFile(FastxFile):
- pass
-
-
-# Compatibility Layer for pysam < 0.8
-cdef class Fastafile(FastaFile):
- pass
-
+++ /dev/null
-# cython: embedsignature=True
-# cython: profile=True
-###############################################################################
-###############################################################################
-# Cython wrapper for SAM/BAM/CRAM files based on htslib
-###############################################################################
-# The principal classes defined in this module are:
-#
-# class FastaFile   random read access to faidx indexed files
-# class FastxFile   streamed read access to fasta/fastq files
-#
-# Additionally this module defines several additional classes that are part
-# of the internal API. These are:
-#
-# class FastqProxy
-# class PersistentFastqProxy
-#
-# For backwards compatibility, the following classes are also defined:
-#
-# class Fastafile equivalent to FastaFile
-# class FastqFile equivalent to FastxFile
-#
-###############################################################################
-#
-# The MIT License
-#
-# Copyright (c) 2015 Andreas Heger
-#
-# Permission is hereby granted, free of charge, to any person obtaining a
-# copy of this software and associated documentation files (the "Software"),
-# to deal in the Software without restriction, including without limitation
-# the rights to use, copy, modify, merge, publish, distribute, sublicense,
-# and/or sell copies of the Software, and to permit persons to whom the
-# Software is furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-# DEALINGS IN THE SOFTWARE.
-#
-###############################################################################
-import sys
-import os
-import re
-from cpython cimport array
-
-from cpython cimport PyErr_SetString, \
- PyBytes_Check, \
- PyUnicode_Check, \
- PyBytes_FromStringAndSize
-
-from cpython.version cimport PY_MAJOR_VERSION
-
-from pysam.chtslib cimport \
- faidx_nseq, fai_load, fai_destroy, fai_fetch, \
- faidx_seq_len, \
- faidx_fetch_seq, hisremote, \
- bgzf_open, bgzf_close
-
-from pysam.cutils cimport force_bytes, force_str, charptr_to_str
-from pysam.cutils cimport encode_filename, from_string_and_size
-from pysam.cutils cimport qualitystring_to_array, parse_region
-
-cdef class FastqProxy
-cdef makeFastqProxy(kseq_t * src):
-    '''wrap src, a kseq_t record, in a FastqProxy.'''
- cdef FastqProxy dest = FastqProxy.__new__(FastqProxy)
- dest._delegate = src
- return dest
-
-## TODO:
-## add automatic indexing.
-## add function to get sequence names.
-cdef class FastaFile:
- """Random access to fasta formatted files that
- have been indexed by :term:`faidx`.
-
-    The file is automatically opened. The index file for
-    ``<filename>`` is expected to be called ``<filename>.fai``.
-
- Parameters
- ----------
-
- filename : string
- Filename of fasta file to be opened.
-
- filepath_index : string
- Optional, filename of the index. By default this is
- the filename + ".fai".
-
- Raises
- ------
-
- ValueError
- if index file is missing
-
- IOError
- if file could not be opened
-
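-    Examples
-    --------
-
-    An illustrative sketch (assumes ``ex1.fa`` and its ``ex1.fa.fai``
-    index exist):
-
-    >>> with pysam.FastaFile("ex1.fa") as fa:
-    ...     seq = fa.fetch("chr1", 10, 20)
-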
- """
-
- def __cinit__(self, *args, **kwargs):
- self.fastafile = NULL
- self._filename = None
- self._references = None
- self._lengths = None
- self.reference2length = None
- self._open(*args, **kwargs)
-
- def is_open(self):
-        '''return True if the fasta file has been opened.'''
- return self.fastafile != NULL
-
- def __len__(self):
- if self.fastafile == NULL:
- raise ValueError("calling len() on closed file")
-
- return faidx_nseq(self.fastafile)
-
- def _open(self, filename, filepath_index=None):
- '''open an indexed fasta file.
-
- This method expects an indexed fasta file.
- '''
-
- # close a previously opened file
- if self.fastafile != NULL:
- self.close()
-
- self._filename = encode_filename(filename)
- cdef char *cfilename = self._filename
- self.is_remote = hisremote(cfilename)
-
- if filepath_index is not None:
- raise NotImplementedError(
- "setting an explicit path for the index "
- "is not implemented")
-
- # open file for reading
- if (self._filename != b"-"
- and not self.is_remote
- and not os.path.exists(filename)):
- raise IOError("file `%s` not found" % filename)
-
- with nogil:
- self.fastafile = fai_load(cfilename)
-
- if self.fastafile == NULL:
- raise IOError("could not open file `%s`" % filename)
-
- if self.is_remote:
- filepath_index = os.path.basename(
- re.sub("[^:]+:[/]*", "", filename)) + ".fai"
- elif filepath_index is None:
- filepath_index = filename + ".fai"
-
- if not os.path.exists(filepath_index):
- raise ValueError("could not locate index file {}".format(
- filepath_index))
-
- with open(filepath_index) as inf:
- data = [x.split("\t") for x in inf]
- self._references = tuple(x[0] for x in data)
- self._lengths = tuple(int(x[1]) for x in data)
- self.reference2length = dict(zip(self._references, self._lengths))
-
- def close(self):
- """close the file."""
- if self.fastafile != NULL:
- fai_destroy(self.fastafile)
- self.fastafile = NULL
-
- def __dealloc__(self):
- if self.fastafile != NULL:
- fai_destroy(self.fastafile)
- self.fastafile = NULL
-
- # context manager interface
- def __enter__(self):
- return self
-
- def __exit__(self, exc_type, exc_value, traceback):
- self.close()
- return False
-
- property closed:
- """"bool indicating the current state of the file object.
- This is a read-only attribute; the close() method changes the value.
- """
- def __get__(self):
- return not self.is_open()
-
- property filename:
- """filename associated with this object. This is a read-only attribute."""
- def __get__(self):
- return self._filename
-
- property references:
- '''tuple with the names of :term:`reference` sequences.'''
- def __get__(self):
- return self._references
-
- property nreferences:
- """"int with the number of :term:`reference` sequences in the file.
- This is a read-only attribute."""
- def __get__(self):
- return len(self._references) if self.references else None
-
- property lengths:
- """tuple with the lengths of :term:`reference` sequences."""
- def __get__(self):
- return self._lengths
-
- def fetch(self,
- reference=None,
- start=None,
- end=None,
- region=None):
- """fetch sequences in a :term:`region`.
-
- A region can
- either be specified by :term:`reference`, `start` and
- `end`. `start` and `end` denote 0-based, half-open
- intervals.
-
- Alternatively, a samtools :term:`region` string can be
- supplied.
-
- If any of the coordinates are missing they will be replaced by the
- minimum (`start`) or maximum (`end`) coordinate.
-
-        Note that region strings are 1-based, while `start` and `end` denote
-        an interval in Python coordinates (0-based, half-open).
-
- Returns
- -------
-
- string : a string with the sequence specified by the region.
-
- Raises
- ------
-
- IndexError
- if the coordinates are out of range
-
- ValueError
- if the region is invalid
-
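-        Examples
-        --------
-
-        Both calls below request the same bases, once with 0-based
-        half-open coordinates and once with a 1-based region string (the
-        reference name is illustrative):
-
-        >>> fa.fetch("chr1", 9, 20)
-        >>> fa.fetch(region="chr1:10-20")
-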
- """
-
- if not self.is_open():
- raise ValueError("I/O operation on closed file" )
-
- cdef int length
- cdef char *seq
- cdef char *ref
- cdef int rstart, rend
-
- reference, rstart, rend = parse_region(reference, start, end, region)
-
- if reference is None:
- raise ValueError("no sequence/region supplied.")
-
- if rstart == rend:
- return ""
-
- ref = reference
- with nogil:
- length = faidx_seq_len(self.fastafile, ref)
- if length == -1:
- raise KeyError("sequence '%s' not present" % reference)
- if rstart >= length:
- return ""
-
- # fai_fetch adds a '\0' at the end
- with nogil:
- seq = faidx_fetch_seq(self.fastafile,
- ref,
- rstart,
- rend-1,
- &length)
-
- if seq == NULL:
- raise ValueError(
- "failure when retrieving sequence on '%s'" % reference)
-
- try:
- return charptr_to_str(seq)
- finally:
- free(seq)
-
- cdef char * _fetch(self, char * reference, int start, int end, int * length):
- '''fetch sequence for reference, start and end'''
-
- with nogil:
- return faidx_fetch_seq(self.fastafile,
- reference,
- start,
- end-1,
- length)
-
- def get_reference_length(self, reference):
- '''return the length of reference.'''
- return self.reference2length[reference]
-
- def __getitem__(self, reference):
- return self.fetch(reference)
-
- def __contains__(self, reference):
-        '''return True if reference is in the fasta file.'''
- return reference in self.reference2length
-
-
-cdef class FastqProxy:
- """A single entry in a fastq file."""
- def __init__(self): pass
-
- property name:
- """The name of each entry in the fastq file."""
- def __get__(self):
- return charptr_to_str(self._delegate.name.s)
-
- property sequence:
- """The sequence of each entry in the fastq file."""
- def __get__(self):
- return charptr_to_str(self._delegate.seq.s)
-
- property comment:
- def __get__(self):
- if self._delegate.comment.l:
- return charptr_to_str(self._delegate.comment.s)
- else:
- return None
-
- property quality:
- """The quality score of each entry in the fastq file, represented as a string."""
- def __get__(self):
- if self._delegate.qual.l:
- return charptr_to_str(self._delegate.qual.s)
- else:
- return None
-
- cdef cython.str tostring(self):
- if self.comment is None:
- comment = ""
- else:
- comment = " %s" % self.comment
-
- if self.quality is None:
- return ">%s%s\n%s" % (self.name, comment, self.sequence)
- else:
- return "@%s%s\n%s\n+\n%s" % (self.name, comment,
- self.sequence, self.quality)
-
- def __str__(self):
- return self.tostring()
-
- cpdef array.array get_quality_array(self, int offset=33):
- '''return quality values as integer array after subtracting offset.'''
- if self.quality is None:
- return None
- return qualitystring_to_array(force_bytes(self.quality),
- offset=offset)
-
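-# A minimal, illustrative sketch (hypothetical filename): iterate over
-# records and convert each quality string to phred scores using the
-# default offset of 33, as described above:
-def _example_quality_array(path="ex1.fastq"):
-    with FastxFile(path) as fh:
-        for entry in fh:
-            return entry.name, entry.get_quality_array(offset=33)
-
-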
-cdef class PersistentFastqProxy:
- """
- Python container for pysam.cfaidx.FastqProxy with persistence.
- Needed to compare multiple fastq records from the same file.
- """
- def __init__(self, FastqProxy FastqRead):
- self.comment = FastqRead.comment
- self.quality = FastqRead.quality
- self.sequence = FastqRead.sequence
- self.name = FastqRead.name
-
- cdef cython.str tostring(self):
- if self.comment is None:
- comment = ""
- else:
- comment = " %s" % self.comment
-
- if self.quality is None:
- return ">%s%s\n%s" % (self.name, comment, self.sequence)
- else:
- return "@%s%s\n%s\n+\n%s" % (self.name, comment,
- self.sequence, self.quality)
-
- def __str__(self):
- return self.tostring()
-
- cpdef array.array get_quality_array(self, int offset=33):
- '''return quality values as array after subtracting offset.'''
- if self.quality is None:
- return None
- return qualitystring_to_array(force_bytes(self.quality),
- offset=offset)
-
-
-cdef class FastxFile:
- """Stream access to :term:`fasta` or :term:`fastq` formatted files.
-
- The file is automatically opened.
-
-    Entries in the file can be either fasta or fastq formatted, or even a
-    mixture of the two.
-
-    This file object permits iterating over all entries in the
-    file. Random access is not implemented. The iteration returns
-    objects of type :class:`FastqProxy`.
-
- Parameters
- ----------
-
- filename : string
- Filename of fasta/fastq file to be opened.
-
- persist : bool
-
- If True (default) make a copy of the entry in the file during
- iteration. If set to False, no copy will be made. This will
- permit faster iteration, but an entry will not persist when
- the iteration continues.
-
- Notes
- -----
- Prior to version 0.8.2, this was called FastqFile.
-
- Raises
- ------
-
- IOError
- if file could not be opened
-
-
- Examples
- --------
- >>> with pysam.FastxFile(filename) as fh:
- ... for entry in fh:
- ... print(entry.name)
- ... print(entry.sequence)
- ... print(entry.comment)
- ... print(entry.quality)
-
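-    With ``persist=False`` every yielded proxy aliases a single internal
-    buffer, so each entry must be consumed before advancing the iterator
-    (an illustrative caveat; ``process`` is a hypothetical callback):
-
-    >>> with pysam.FastxFile(filename, persist=False) as fh:
-    ...     for entry in fh:
-    ...         process(str(entry))
-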
- """
- def __cinit__(self, *args, **kwargs):
-        self.fastqfile = NULL
- self._filename = None
- self.entry = NULL
- self._open(*args, **kwargs)
-
- def is_open(self):
-        '''return True if the fastx file has been opened.'''
- return self.entry != NULL
-
- def _open(self, filename, persist=True):
- '''open a fastq/fasta file in *filename*
-
-        Parameters
- ----------
-
- persist : bool
-
- if True return a copy of the underlying data (default
- True). The copy will persist even if the iteration
- on the file continues.
-
- '''
- if self.fastqfile != NULL:
- self.close()
-
- self._filename = encode_filename(filename)
- cdef char *cfilename = self._filename
- self.is_remote = hisremote(cfilename)
-
- # open file for reading
- if (self._filename != b"-"
- and not self.is_remote
- and not os.path.exists(filename)):
- raise IOError("file `%s` not found" % filename)
-
- self.persist = persist
-
- with nogil:
- self.fastqfile = bgzf_open(cfilename, "r")
- self.entry = kseq_init(self.fastqfile)
- self._filename = filename
-
- def close(self):
- '''close the file.'''
- if self.fastqfile != NULL:
- bgzf_close(self.fastqfile)
- self.fastqfile = NULL
- if self.entry != NULL:
- kseq_destroy(self.entry)
- self.entry = NULL
-
- def __dealloc__(self):
- if self.fastqfile != NULL:
- bgzf_close(self.fastqfile)
- if self.entry:
- kseq_destroy(self.entry)
-
- # context manager interface
- def __enter__(self):
- return self
-
- def __exit__(self, exc_type, exc_value, traceback):
- self.close()
- return False
-
- property closed:
- """"bool indicating the current state of the file object.
- This is a read-only attribute; the close() method changes the value.
- """
- def __get__(self):
- return not self.is_open()
-
- property filename:
- """string with the filename associated with this object."""
- def __get__(self):
- return self._filename
-
- def __iter__(self):
- if not self.is_open():
- raise ValueError("I/O operation on closed file")
- return self
-
- cdef kseq_t * getCurrent(self):
- return self.entry
-
- cdef int cnext(self):
- '''C version of iterator
- '''
- with nogil:
- return kseq_read(self.entry)
-
- def __next__(self):
- """
- python version of next().
- """
- cdef int l
- with nogil:
- l = kseq_read(self.entry)
- if (l >= 0):
- if self.persist:
- return PersistentFastqProxy(makeFastqProxy(self.entry))
- return makeFastqProxy(self.entry)
- else:
- raise StopIteration
-
-# Compatibility Layer for pysam 0.8.1
-cdef class FastqFile(FastxFile):
- """FastqFile is deprecated: use FastxFile instead"""
- pass
-
-# Compatibility Layer for pysam < 0.8
-cdef class Fastafile(FastaFile):
- """Fastafile is deprecated: use FastaFile instead"""
- pass
-
-__all__ = ["FastaFile",
- "FastqFile",
- "FastxFile",
- "Fastafile"]
+++ /dev/null
-from libc.stdint cimport int8_t, int16_t, int32_t, int64_t
-from libc.stdint cimport uint8_t, uint16_t, uint32_t, uint64_t
-from libc.stdlib cimport malloc, calloc, realloc, free
-from libc.string cimport memcpy, memcmp, strncpy, strlen, strdup
-from libc.stdio cimport FILE, printf
-from posix.types cimport off_t
-
-cdef extern from "Python.h":
- FILE* PyFile_AsFile(object)
-
-
-cdef extern from "htslib/kstring.h" nogil:
- ctypedef struct kstring_t:
- size_t l, m
- char *s
-
-
-cdef extern from "htslib_util.h" nogil:
- int hts_set_verbosity(int verbosity)
- int hts_get_verbosity()
-
- ctypedef uint32_t khint32_t
- ctypedef uint32_t khint_t
- ctypedef khint_t khiter_t
-
- # Used to manage BCF Header info
- ctypedef struct vdict_t:
- khint_t n_buckets, size, n_occupied, upper_bound
- khint32_t *flags
- const char *keys
- bcf_idinfo_t *vals
-
- # Used to manage indexed contigs in Tabix
- ctypedef struct s2i_t:
- khint_t n_buckets, size, n_occupied, upper_bound
- khint32_t *flags
- const char *keys
- int64_t *vals
-
- # Generic khash methods
- khint_t kh_size(void *d)
- khint_t kh_begin(void *d)
- khint_t kh_end(void *d)
- int kh_exist(void *d, khiter_t i)
-
- # Specialized khash methods for vdict
- khint_t kh_get_vdict(vdict_t *d, const char *key)
- const char *kh_key_vdict "kh_key" (vdict_t *d, khint_t i)
- bcf_idinfo_t kh_val_vdict "kh_val" (vdict_t *d, khint_t i)
-
-
-cdef extern from "htslib/hfile.h" nogil:
- ctypedef struct hFILE
-
- # @abstract Open the named file or URL as a stream
- # @return An hFILE pointer, or NULL (with errno set) if an error occurred.
- hFILE *hopen(const char *filename, const char *mode)
-
- # @abstract Associate a stream with an existing open file descriptor
- # @return An hFILE pointer, or NULL (with errno set) if an error occurred.
- # @notes For socket descriptors (on Windows), mode should contain 's'.
- hFILE *hdopen(int fd, const char *mode)
-
- # @abstract Report whether the file name or URL denotes remote storage
- # @return 0 if local, 1 if remote.
- # @notes "Remote" means involving e.g. explicit network access, with the
- # implication that callers may wish to cache such files' contents locally.
- int hisremote(const char *filename)
-
- # @abstract Flush (for output streams) and close the stream
- # @return 0 if successful, or EOF (with errno set) if an error occurred.
- int hclose(hFILE *fp)
-
- # @abstract Close the stream, without flushing or propagating errors
- # @notes For use while cleaning up after an error only. Preserves errno.
- void hclose_abruptly(hFILE *fp)
-
- # @abstract Return the stream's error indicator
- # @return Non-zero (in fact, an errno value) if an error has occurred.
- # @notes This would be called herror() and return true/false to parallel
-    # ferror(3), but a networking-related herror(3) function already exists.
- int herrno(hFILE *fp)
-
- # @abstract Clear the stream's error indicator
- void hclearerr(hFILE *fp)
-
- # @abstract Reposition the read/write stream offset
- # @return The resulting offset within the stream (as per lseek(2)),
- # or negative if an error occurred.
- off_t hseek(hFILE *fp, off_t offset, int whence)
-
- # @abstract Report the current stream offset
- # @return The offset within the stream, starting from zero.
- off_t htell(hFILE *fp)
-
- # @abstract Read one character from the stream
- # @return The character read, or EOF on end-of-file or error
- int hgetc(hFILE *fp)
-
- # @abstract Peek at characters to be read without removing them from buffers
- # @param fp The file stream
- # @param buffer The buffer to which the peeked bytes will be written
- # @param nbytes The number of bytes to peek at; limited by the size of the
- # internal buffer, which could be as small as 4K.
- # @return The number of bytes peeked, which may be less than nbytes if EOF
- # is encountered; or negative, if there was an I/O error.
- # @notes The characters peeked at remain in the stream's internal buffer,
- # and will be returned by later hread() etc calls.
- ssize_t hpeek(hFILE *fp, void *buffer, size_t nbytes)
-
- # @abstract Read a block of characters from the file
- # @return The number of bytes read, or negative if an error occurred.
- # @notes The full nbytes requested will be returned, except as limited
- # by EOF or I/O errors.
- ssize_t hread(hFILE *fp, void *buffer, size_t nbytes)
-
- # @abstract Write a character to the stream
- # @return The character written, or EOF if an error occurred.
- int hputc(int c, hFILE *fp)
-
- # @abstract Write a string to the stream
- # @return 0 if successful, or EOF if an error occurred.
- int hputs(const char *text, hFILE *fp)
-
- # @abstract Write a block of characters to the file
- # @return Either nbytes, or negative if an error occurred.
- # @notes In the absence of I/O errors, the full nbytes will be written.
- ssize_t hwrite(hFILE *fp, const void *buffer, size_t nbytes)
-
- # @abstract For writing streams, flush buffered output to the underlying stream
- # @return 0 if successful, or EOF if an error occurred.
- int hflush(hFILE *fp)
-
-
-cdef extern from "htslib/bgzf.h" nogil:
- ctypedef struct bgzf_mtaux_t
- ctypedef struct bgzidx_t
- ctypedef struct z_stream
-
- ctypedef struct BGZF:
- unsigned errcode
- unsigned is_write
- int is_be
- int compress_level
- int is_compressed
- int is_gzip
- int cache_size
- int64_t block_address
- int64_t uncompressed_address
- void *uncompressed_block
- void *compressed_block
- void *cache
- hFILE *fp
- bgzf_mtaux_t *mt
- bgzidx_t *idx
- int idx_build_otf
- z_stream *gz_stream
-
- #*****************
- # Basic routines *
-    #*****************
-
- # Open an existing file descriptor for reading or writing.
- #
- # @param fd file descriptor
- # @param mode mode matching /[rwag][u0-9]+/: 'r' for reading, 'w' for
- # writing, 'a' for appending, 'g' for gzip rather than BGZF
- # compression (with 'w' only), and digit specifies the zlib
- # compression level.
- # Note that there is a distinction between 'u' and '0': the
- # first yields plain uncompressed output whereas the latter
- # outputs uncompressed data wrapped in the zlib format.
- # @return BGZF file handler; 0 on error
-
- BGZF* bgzf_dopen(int fd, const char *mode)
- BGZF* bgzf_fdopen(int fd, const char *mode) # for backward compatibility
-
- # Open the specified file for reading or writing.
- BGZF* bgzf_open(const char* path, const char *mode)
-
- # Open an existing hFILE stream for reading or writing.
- BGZF* bgzf_hopen(hFILE *fp, const char *mode)
-
- # Close the BGZF and free all associated resources.
- #
- # @param fp BGZF file handler
- # @return 0 on success and -1 on error
- int bgzf_close(BGZF *fp)
-
- # Read up to _length_ bytes from the file storing into _data_.
- #
- # @param fp BGZF file handler
- # @param data data array to read into
- # @param length size of data to read
- # @return number of bytes actually read; 0 on end-of-file and -1 on error
- ssize_t bgzf_read(BGZF *fp, void *data, size_t length)
-
- # Write _length_ bytes from _data_ to the file. If no I/O errors occur,
- # the complete _length_ bytes will be written (or queued for writing).
- #
- # @param fp BGZF file handler
- # @param data data array to write
- # @param length size of data to write
- # @return number of bytes written (i.e., _length_); negative on error
- ssize_t bgzf_write(BGZF *fp, const void *data, size_t length)
-
- # Read up to _length_ bytes directly from the underlying stream without
- # decompressing. Bypasses BGZF blocking, so must be used with care in
- # specialised circumstances only.
- #
- # @param fp BGZF file handler
- # @param data data array to read into
- # @param length number of raw bytes to read
- # @return number of bytes actually read; 0 on end-of-file and -1 on error
- ssize_t bgzf_raw_read(BGZF *fp, void *data, size_t length)
-
- # Write _length_ bytes directly to the underlying stream without
- # compressing. Bypasses BGZF blocking, so must be used with care
- # in specialised circumstances only.
- #
- # @param fp BGZF file handler
- # @param data data array to write
- # @param length number of raw bytes to write
- # @return number of bytes actually written; -1 on error
- ssize_t bgzf_raw_write(BGZF *fp, const void *data, size_t length)
-
- # Write the data in the buffer to the file.
- int bgzf_flush(BGZF *fp)
-
- int SEEK_SET
-
- # Return a virtual file pointer to the current location in the file.
-    # No interpretation of the value should be made, other than a subsequent
- # call to bgzf_seek can be used to position the file at the same point.
- # Return value is non-negative on success.
- int64_t bgzf_tell(BGZF *fp)
-
- # Set the file to read from the location specified by _pos_.
- #
- # @param fp BGZF file handler
- # @param pos virtual file offset returned by bgzf_tell()
- # @param whence must be SEEK_SET
- # @return 0 on success and -1 on error
- int64_t bgzf_seek(BGZF *fp, int64_t pos, int whence)
-
- # Check if the BGZF end-of-file (EOF) marker is present
- #
- # @param fp BGZF file handler opened for reading
- # @return 1 if the EOF marker is present and correct
- # 2 if it can't be checked, e.g., because fp isn't seekable
- # 0 if the EOF marker is absent
- # -1 (with errno set) on error
- int bgzf_check_EOF(BGZF *fp)
-
- # Check if a file is in the BGZF format
- #
- # @param fn file name
- # @return 1 if _fn_ is BGZF; 0 if not or on I/O error
- int bgzf_is_bgzf(const char *fn)
-
- #*********************
- # Advanced routines *
- #*********************
-
- # Set the cache size. Only effective when compiled with -DBGZF_CACHE.
- #
- # @param fp BGZF file handler
- # @param size size of cache in bytes; 0 to disable caching (default)
- void bgzf_set_cache_size(BGZF *fp, int size)
-
- # Flush the file if the remaining buffer size is smaller than _size_
- # @return 0 if flushing succeeded or was not needed; negative on error
- int bgzf_flush_try(BGZF *fp, ssize_t size)
-
- # Read one byte from a BGZF file. It is faster than bgzf_read()
- # @param fp BGZF file handler
- # @return byte read; -1 on end-of-file or error
- int bgzf_getc(BGZF *fp)
-
- # Read one line from a BGZF file. It is faster than bgzf_getc()
- #
- # @param fp BGZF file handler
-    # @param delim delimiter
- # @param str string to write to; must be initialized
- # @return length of the string; 0 on end-of-file; negative on error
- int bgzf_getline(BGZF *fp, int delim, kstring_t *str)
-
- # Read the next BGZF block.
- int bgzf_read_block(BGZF *fp)
-
- # Enable multi-threading (only effective on writing and when the
- # library was compiled with -DBGZF_MT)
- #
- # @param fp BGZF file handler; must be opened for writing
- # @param n_threads #threads used for writing
- # @param n_sub_blks #blocks processed by each thread; a value 64-256 is recommended
- int bgzf_mt(BGZF *fp, int n_threads, int n_sub_blks)
-
-
- # Compress a single BGZF block.
- #
- # @param dst output buffer (must have size >= BGZF_MAX_BLOCK_SIZE)
- # @param dlen size of output buffer; updated on return to the number
- # of bytes actually written to dst
- # @param src buffer to be compressed
- # @param slen size of data to compress (must be <= BGZF_BLOCK_SIZE)
- # @param level compression level
- # @return 0 on success and negative on error
- #
- int bgzf_compress(void *dst, size_t *dlen, const void *src, size_t slen, int level)
-
-    #*******************
-    # bgzidx routines *
-    #*******************
-
-    # Position BGZF at the uncompressed offset
-    #
- # @param fp BGZF file handler; must be opened for reading
- # @param uoffset file offset in the uncompressed data
-    # @param where must be SEEK_SET (the only value supported at present)
- #
- # Returns 0 on success and -1 on error.
- int bgzf_useek(BGZF *fp, long uoffset, int where)
-
- # Position in uncompressed BGZF
- #
- # @param fp BGZF file handler; must be opened for reading
- #
- # Returns the current offset on success and -1 on error.
- long bgzf_utell(BGZF *fp)
-
- # Tell BGZF to build index while compressing.
- #
- # @param fp BGZF file handler; can be opened for reading or writing.
- #
- # Returns 0 on success and -1 on error.
- int bgzf_index_build_init(BGZF *fp)
-
- # Load BGZF index
- #
- # @param fp BGZF file handler
- # @param bname base name
- # @param suffix suffix to add to bname (can be NULL)
- #
- # Returns 0 on success and -1 on error.
- int bgzf_index_load(BGZF *fp, const char *bname, const char *suffix)
-
- # Save BGZF index
- #
- # @param fp BGZF file handler
- # @param bname base name
- # @param suffix suffix to add to bname (can be NULL)
- #
- # Returns 0 on success and -1 on error.
- int bgzf_index_dump(BGZF *fp, const char *bname, const char *suffix)
-
-
-cdef extern from "htslib/hts.h" nogil:
- uint32_t kroundup32(uint32_t x)
-
- ctypedef struct cram_fd
-
- union FilePointerUnion:
- BGZF *bgzf
- cram_fd *cram
- hFILE *hfile
- void *voidp
-
- enum htsFormatCategory:
- unknown_category
- sequence_data # Sequence data -- SAM, BAM, CRAM, etc
- variant_data # Variant calling data -- VCF, BCF, etc
- index_file # Index file associated with some data file
- region_list # Coordinate intervals or regions -- BED, etc
- category_maximum
-
- enum htsExactFormat:
- unknown_format
- binary_format
- text_format
- sam, bam, bai, cram, crai, vcf, bcf, csi, gzi, tbi, bed
- format_maximum
-
- enum htsCompression:
- no_compression, gzip, bgzf, custom
- compression_maximum
-
- enum hts_fmt_option:
- CRAM_OPT_DECODE_MD,
- CRAM_OPT_PREFIX,
- CRAM_OPT_VERBOSITY,
- CRAM_OPT_SEQS_PER_SLICE,
- CRAM_OPT_SLICES_PER_CONTAINER,
- CRAM_OPT_RANGE,
- CRAM_OPT_VERSION,
- CRAM_OPT_EMBED_REF,
- CRAM_OPT_IGNORE_MD5,
- CRAM_OPT_REFERENCE,
- CRAM_OPT_MULTI_SEQ_PER_SLICE,
- CRAM_OPT_NO_REF,
- CRAM_OPT_USE_BZIP2,
- CRAM_OPT_SHARED_REF,
- CRAM_OPT_NTHREADS,
- CRAM_OPT_THREAD_POOL,
- CRAM_OPT_USE_LZMA,
- CRAM_OPT_USE_RANS,
- CRAM_OPT_REQUIRED_FIELDS,
- HTS_OPT_COMPRESSION_LEVEL,
- HTS_OPT_NTHREADS,
-
- ctypedef struct htsVersion:
- short major, minor
-
- ctypedef struct htsFormat:
- htsFormatCategory category
- htsExactFormat format
- htsVersion version
- htsCompression compression
- short compression_level
- void *specific
-
- ctypedef struct htsFile:
- uint8_t is_bin
- uint8_t is_write
- uint8_t is_be
- uint8_t is_cram
- int64_t lineno
- kstring_t line
- char *fn
- char *fn_aux
- FilePointerUnion fp
- htsFormat format
-
- int hts_verbose
-
- # @abstract Table for converting a nucleotide character to 4-bit encoding.
- # The input character may be either an IUPAC ambiguity code, '=' for 0, or
- # '0'/'1'/'2'/'3' for a result of 1/2/4/8. The result is encoded as 1/2/4/8
- # for A/C/G/T or combinations of these bits for ambiguous bases.
- const unsigned char *seq_nt16_table
-
- # @abstract Table for converting a 4-bit encoded nucleotide to an IUPAC
- # ambiguity code letter (or '=' when given 0).
- const char *seq_nt16_str
-
- # @abstract Table for converting a 4-bit encoded nucleotide to about 2 bits.
- # Returns 0/1/2/3 for 1/2/4/8 (i.e., A/C/G/T), or 4 otherwise (0 or ambiguous).
- const int *seq_nt16_int
-
- # @abstract Get the htslib version number
- # @return For released versions, a string like "N.N[.N]"; or git describe
- # output if using a library built within a Git repository.
- const char *hts_version()
-
- # @abstract Determine format by peeking at the start of a file
- # @param fp File opened for reading, positioned at the beginning
- # @param fmt Format structure that will be filled out on return
- # @return 0 for success, or negative if an error occurred.
- int hts_detect_format(hFILE *fp, htsFormat *fmt)
-
- # @abstract Get a human-readable description of the file format
- # @return Description string, to be freed by the caller after use.
- char *hts_format_description(const htsFormat *format)
-
- # @abstract Open a SAM/BAM/CRAM/VCF/BCF/etc file
- # @param fn The file name or "-" for stdin/stdout
- # @param mode Mode matching / [rwa][bceguxz0-9]* /
- # @discussion
- # With 'r' opens for reading; any further format mode letters are ignored
- # as the format is detected by checking the first few bytes or BGZF blocks
- # of the file. With 'w' or 'a' opens for writing or appending, with format
- # specifier letters:
- # b binary format (BAM, BCF, etc) rather than text (SAM, VCF, etc)
- # c CRAM format
- # g gzip compressed
- # u uncompressed
- # z bgzf compressed
- # [0-9] zlib compression level
- # and with non-format option letters (for any of 'r'/'w'/'a'):
- # e close the file on exec(2) (opens with O_CLOEXEC, where supported)
- # x create the file exclusively (opens with O_EXCL, where supported)
- # Note that there is a distinction between 'u' and '0': the first yields
- # plain uncompressed output whereas the latter outputs uncompressed data
- # wrapped in the zlib format.
- # @example
- # [rw]b .. compressed BCF, BAM, FAI
- # [rw]bu .. uncompressed BCF
- # [rw]z .. compressed VCF
- # [rw] .. uncompressed VCF
- htsFile *hts_open(const char *fn, const char *mode)
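-
-    # Example (minimal sketch): the mode string selects the on-disk encoding
-    # when writing, e.g. for variant data:
-    #
-    #     htsFile *v  = hts_open("out.vcf",    "w");   // plain VCF text
-    #     htsFile *b  = hts_open("out.bcf",    "wb");  // compressed binary BCF
-    #     htsFile *vz = hts_open("out.vcf.gz", "wz");  // BGZF-compressed VCF
-    #     hts_close(v); hts_close(b); hts_close(vz);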
-
- # @abstract Open a SAM/BAM/CRAM/VCF/BCF/etc file
- # @param fn The file name or "-" for stdin/stdout
- # @param mode Open mode, as per hts_open()
- # @param fmt Optional format specific parameters
- # @discussion
- # See hts_open() for description of fn and mode.
- # // TODO Update documentation for s/opts/fmt/
- # Opts contains a format string (sam, bam, cram, vcf, bcf) which will,
- # if defined, override mode. Opts also contains a linked list of hts_opt
- # structures to apply to the open file handle. These can contain things
- # like pointers to the reference or information on compression levels,
- # block sizes, etc.
- htsFile *hts_open_format(const char *fn, const char *mode, const htsFormat *fmt)
-
- # @abstract Open an existing stream as a SAM/BAM/CRAM/VCF/BCF/etc file
- # @param fp The already-open file handle
- # @param fn The file name or "-" for stdin/stdout
- # @param mode Open mode, as per hts_open()
- htsFile *hts_hopen(hFILE *fp, const char *fn, const char *mode)
-
- # @abstract Close a file handle, flushing buffered data for output streams
- # @param fp The file handle to be closed
- # @return 0 for success, or negative if an error occurred.
- int hts_close(htsFile *fp)
-
- # @abstract Returns the file's format information
- # @param fp The file handle
- # @return Read-only pointer to the file's htsFormat.
- const htsFormat *hts_get_format(htsFile *fp)
-
-    # @abstract Returns a string containing the file format extension.
-    # @param format Format structure containing the file type.
-    # @return A string ("sam", "bam", etc) or "?" for unknown formats.
- const char *hts_format_file_extension(const htsFormat *format)
-
- # @abstract Sets a specified CRAM option on the open file handle.
-    # @param fp The open file handle.
- # @param opt The CRAM_OPT_* option.
- # @param ... Optional arguments, dependent on the option used.
- # @return 0 for success, or negative if an error occurred.
- int hts_set_opt(htsFile *fp, hts_fmt_option opt, ...)
-
- int hts_getline(htsFile *fp, int delimiter, kstring_t *str)
- char **hts_readlines(const char *fn, int *_n)
-
- # @abstract Parse comma-separated list or read list from a file
-    # @param fn File name or comma-separated list
-    # @param is_file Whether _fn_ names a file (1) or is a comma-separated list (0)
- # @param _n Size of the output array (number of items read)
- # @return NULL on failure or pointer to newly allocated array of
- # strings
- char **hts_readlist(const char *fn, int is_file, int *_n)
-
- # @abstract Create extra threads to aid compress/decompression for this file
- # @param fp The file handle
- # @param n The number of worker threads to create
- # @return 0 for success, or negative if an error occurred.
- # @notes THIS THREADING API IS LIKELY TO CHANGE IN FUTURE.
- int hts_set_threads(htsFile *fp, int n)
-
- # @abstract Set .fai filename for a file opened for reading
- # @return 0 for success, negative on failure
- # @discussion
- # Called before *_hdr_read(), this provides the name of a .fai file
- # used to provide a reference list if the htsFile contains no @SQ headers.
- int hts_set_fai_filename(htsFile *fp, const char *fn_aux)
-
- int8_t HTS_IDX_NOCOOR
- int8_t HTS_IDX_START
- int8_t HTS_IDX_REST
- int8_t HTS_IDX_NONE
-
- int8_t HTS_FMT_CSI
- int8_t HTS_FMT_BAI
- int8_t HTS_FMT_TBI
- int8_t HTS_FMT_CRAI
-
- BGZF *hts_get_bgzfp(htsFile *fp)
- int hts_useek(htsFile *fp, long uoffset, int where)
- long hts_utell(htsFile *fp)
-
- ctypedef struct hts_idx_t
-
- ctypedef struct hts_pair64_t:
- uint64_t u, v
-
- ctypedef int hts_readrec_func(BGZF *fp, void *data, void *r, int *tid, int *beg, int *end)
-
- ctypedef struct hts_bins_t:
- int n, m
- int *a
-
- ctypedef struct hts_itr_t:
- uint32_t read_rest
- uint32_t finished
-        int tid, beg, end, n_off, i
- int curr_tid, curr_beg, curr_end
- uint64_t curr_off
- hts_pair64_t *off
- hts_readrec_func *readfunc
- hts_bins_t bins
-
- hts_idx_t *hts_idx_init(int n, int fmt, uint64_t offset0, int min_shift, int n_lvls)
- void hts_idx_destroy(hts_idx_t *idx)
- int hts_idx_push(hts_idx_t *idx, int tid, int beg, int end, uint64_t offset, int is_mapped)
- void hts_idx_finish(hts_idx_t *idx, uint64_t final_offset)
-
- #### Save an index to a file
- # @param idx Index to be written
- # @param fn Input BAM/BCF/etc filename, to which .bai/.csi/etc will be added
- # @param fmt One of the HTS_FMT_* index formats
- # @return 0 if successful, or negative if an error occurred.
- int hts_idx_save(const hts_idx_t *idx, const char *fn, int fmt)
-
- #### Save an index to a specific file
- # @param idx Index to be written
- # @param fn Input BAM/BCF/etc filename
- # @param fnidx Output filename, or NULL to add .bai/.csi/etc to @a fn
- # @param fmt One of the HTS_FMT_* index formats
- # @return 0 if successful, or negative if an error occurred.
- int hts_idx_save_as(const hts_idx_t *idx, const char *fn, const char *fnidx, int fmt)
-
- #### Load an index file
- # @param fn BAM/BCF/etc filename, to which .bai/.csi/etc will be added or
- # the extension substituted, to search for an existing index file
- # @param fmt One of the HTS_FMT_* index formats
- # @return The index, or NULL if an error occurred.
- hts_idx_t *hts_idx_load(const char *fn, int fmt)
-
- #### Load a specific index file
- # @param fn Input BAM/BCF/etc filename
- # @param fnidx The input index filename
- # @return The index, or NULL if an error occurred.
- hts_idx_t *hts_idx_load2(const char *fn, const char *fnidx)
-
- uint8_t *hts_idx_get_meta(hts_idx_t *idx, int *l_meta)
- void hts_idx_set_meta(hts_idx_t *idx, int l_meta, uint8_t *meta, int is_copy)
-
- int hts_idx_get_stat(const hts_idx_t* idx, int tid,
- uint64_t* mapped, uint64_t* unmapped)
-
- uint64_t hts_idx_get_n_no_coor(const hts_idx_t* idx)
-
- int HTS_PARSE_THOUSANDS_SEP # Ignore ',' separators within numbers
-
- # Parse a numeric string
- # The number may be expressed in scientific notation, and optionally may
- # contain commas in the integer part (before any decimal point or E notation).
- # @param str String to be parsed
- # @param strend If non-NULL, set on return to point to the first character
- # in @a str after those forming the parsed number
- # @param flags Or'ed-together combination of HTS_PARSE_* flags
- # @return Converted value of the parsed number.
- #
- # When @a strend is NULL, a warning will be printed (if hts_verbose is 2
- # or more) if there are any trailing characters after the number.
- long long hts_parse_decimal(const char *str, char **strend, int flags)
-
- # Parse a "CHR:START-END"-style region string
- # @param str String to be parsed
- # @param beg Set on return to the 0-based start of the region
- # @param end Set on return to the 1-based end of the region
- # @return Pointer to the colon or '\0' after the reference sequence name,
- # or NULL if @a str could not be parsed.
- const char *hts_parse_reg(const char *str, int *beg, int *end)
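-
-    # Example (minimal sketch): for "chr2:100-200" the parsed start is 0-based
-    # and the end 1-based, and the return value points at the colon:
-    #
-    #     int beg, end;
-    #     const char *rest = hts_parse_reg("chr2:100-200", &beg, &end);
-    #     // beg == 99, end == 200, rest points at ":100-200"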
-
- hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, int beg, int end, hts_readrec_func *readrec)
- void hts_itr_destroy(hts_itr_t *iter)
-
- ctypedef int (*hts_name2id_f)(void*, const char*)
- ctypedef const char *(*hts_id2name_f)(void*, int)
- ctypedef hts_itr_t *hts_itr_query_func(
- const hts_idx_t *idx,
- int tid,
- int beg,
- int end,
- hts_readrec_func *readrec)
-
- hts_itr_t *hts_itr_querys(
- const hts_idx_t *idx,
- const char *reg,
- hts_name2id_f getid,
- void *hdr,
- hts_itr_query_func *itr_query,
- hts_readrec_func *readrec)
-
- int hts_itr_next(BGZF *fp, hts_itr_t *iter, void *r, void *data)
- const char **hts_idx_seqnames(const hts_idx_t *idx, int *n, hts_id2name_f getid, void *hdr) # free only the array, not the values
-
- # hts_file_type() - Convenience function to determine file type
- # @fname: the file name
- #
- # Returns one of the FT_* defines.
- #
- # DEPRECATED: This function has been replaced by hts_detect_format().
- # It and these FT_* macros will be removed in a future HTSlib release.
- int FT_UNKN
- int FT_GZ
- int FT_VCF
- int FT_VCF_GZ
- int FT_BCF
- int FT_BCF_GZ
- int FT_STDIN
-
- int hts_file_type(const char *fname)
-
- inline int hts_reg2bin(int64_t beg, int64_t end, int min_shift, int n_lvls)
- inline int hts_bin_bot(int bin, int n_lvls)
-
- # * Endianness *
- inline int ed_is_big()
- inline uint16_t ed_swap_2(uint16_t v)
- inline void *ed_swap_2p(void *x)
- inline uint32_t ed_swap_4(uint32_t v)
- inline void *ed_swap_4p(void *x)
- inline uint64_t ed_swap_8(uint64_t v)
- inline void *ed_swap_8p(void *x)
-
-
-cdef extern from "htslib/sam.h" nogil:
- #**********************
- #*** SAM/BAM header ***
- #**********************
-
- # @abstract Structure for the alignment header.
- # @field n_targets number of reference sequences
- # @field l_text length of the plain text in the header
- # @field target_len lengths of the reference sequences
- # @field target_name names of the reference sequences
- # @field text plain text
- # @field sdict header dictionary
-
- ctypedef struct bam_hdr_t:
- int32_t n_targets, ignore_sam_err
- uint32_t l_text
- uint32_t *target_len
- uint8_t *cigar_tab
- char **target_name
- char *text
- void *sdict
-
- #****************************
- #*** CIGAR related macros ***
- #****************************
-
- int BAM_CMATCH
- int BAM_CINS
- int BAM_CDEL
- int BAM_CREF_SKIP
- int BAM_CSOFT_CLIP
- int BAM_CHARD_CLIP
- int BAM_CPAD
- int BAM_CEQUAL
- int BAM_CDIFF
- int BAM_CBACK
-
- char *BAM_CIGAR_STR
- int BAM_CIGAR_SHIFT
- uint32_t BAM_CIGAR_MASK
- uint32_t BAM_CIGAR_TYPE
-
- char bam_cigar_op(uint32_t c)
- uint32_t bam_cigar_oplen(uint32_t c)
- char bam_cigar_opchr(uint32_t)
- uint32_t bam_cigar_gen(char, uint32_t)
- int bam_cigar_type(char o)
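-
-    # Example (minimal sketch, b assumed to be a bam1_t*): printing an
-    # alignment's CIGAR string with the accessor macros above and
-    # bam_get_cigar() (declared further below):
-    #
-    #     uint32_t *cigar = bam_get_cigar(b);
-    #     for (int i = 0; i < b->core.n_cigar; i++)
-    #         printf("%u%c", bam_cigar_oplen(cigar[i]), bam_cigar_opchr(cigar[i]));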
-
- # @abstract the read is paired in sequencing, no matter whether it is mapped in a pair
- int BAM_FPAIRED
- # @abstract the read is mapped in a proper pair
- int BAM_FPROPER_PAIR
-    # @abstract the read itself is unmapped; conflicts with BAM_FPROPER_PAIR
- int BAM_FUNMAP
- # @abstract the mate is unmapped
- int BAM_FMUNMAP
- # @abstract the read is mapped to the reverse strand
- int BAM_FREVERSE
- # @abstract the mate is mapped to the reverse strand
- int BAM_FMREVERSE
- # @abstract this is read1
- int BAM_FREAD1
- # @abstract this is read2
- int BAM_FREAD2
- # @abstract not primary alignment
- int BAM_FSECONDARY
- # @abstract QC failure
- int BAM_FQCFAIL
- # @abstract optical or PCR duplicate
- int BAM_FDUP
- # @abstract supplementary alignment
- int BAM_FSUPPLEMENTARY
-
- #*************************
- #*** Alignment records ***
- #*************************
-
- # @abstract Structure for core alignment information.
- # @field tid chromosome ID, defined by bam_hdr_t
- # @field pos 0-based leftmost coordinate
- # @field bin bin calculated by bam_reg2bin()
- # @field qual mapping quality
- # @field l_qname length of the query name
- # @field flag bitwise flag
- # @field n_cigar number of CIGAR operations
- # @field l_qseq length of the query sequence (read)
- # @field mtid chromosome ID of next read in template, defined by bam_hdr_t
- # @field mpos 0-based leftmost coordinate of next read in template
-
- ctypedef struct bam1_core_t:
- int32_t tid
- int32_t pos
- uint16_t bin
- uint8_t qual
- uint8_t l_qname
- uint16_t flag
- uint16_t n_cigar
- int32_t l_qseq
- int32_t mtid
- int32_t mpos
- int32_t isize
-
- # @abstract Structure for one alignment.
- # @field core core information about the alignment
- # @field l_data current length of bam1_t::data
- # @field m_data maximum length of bam1_t::data
- # @field data all variable-length data, concatenated; structure: qname-cigar-seq-qual-aux
- #
- # @discussion Notes:
- #
-    # 1. qname is null terminated and core.l_qname includes the trailing '\0'.
- # 2. l_qseq is calculated from the total length of an alignment block
- # on reading or from CIGAR.
- # 3. cigar data is encoded 4 bytes per CIGAR operation.
- # 4. seq is nybble-encoded according to seq_nt16_table.
- ctypedef struct bam1_t:
- bam1_core_t core
- int l_data, m_data
- uint8_t *data
- uint64_t id
-
- # @abstract Get whether the query is on the reverse strand
- # @param b pointer to an alignment
- # @return boolean true if query is on the reverse strand
- int bam_is_rev(bam1_t *b)
-
- # @abstract Get whether the query's mate is on the reverse strand
- # @param b pointer to an alignment
- # @return boolean true if query's mate on the reverse strand
- int bam_is_mrev(bam1_t *b)
-
- # @abstract Get the name of the query
- # @param b pointer to an alignment
- # @return pointer to the name string, null terminated
- char *bam_get_qname(bam1_t *b)
-
- # @abstract Get the CIGAR array
- # @param b pointer to an alignment
- # @return pointer to the CIGAR array
- #
-    # @discussion In the CIGAR array, each element is a 32-bit integer. The
-    # lower 4 bits give the CIGAR operation and the higher 28 bits give the
-    # length of the operation.
- uint32_t *bam_get_cigar(bam1_t *b)
-
- # @abstract Get query sequence
- # @param b pointer to an alignment
- # @return pointer to sequence
- #
- # @discussion Each base is encoded in 4 bits: 1 for A, 2 for C, 4 for G,
- # 8 for T and 15 for N. Two bases are packed in one byte with the base
- # at the higher 4 bits having smaller coordinate on the read. It is
- # recommended to use bam_seqi() macro to get the base.
- char *bam_get_seq(bam1_t *b)
-
- # @abstract Get query quality
- # @param b pointer to an alignment
- # @return pointer to quality string
- uint8_t *bam_get_qual(bam1_t *b)
-
- # @abstract Get auxiliary data
- # @param b pointer to an alignment
- # @return pointer to the concatenated auxiliary data
- uint8_t *bam_get_aux(bam1_t *b)
-
- # @abstract Get length of auxiliary data
- # @param b pointer to an alignment
- # @return length of the concatenated auxiliary data
- int bam_get_l_aux(bam1_t *b)
-
- # @abstract Get a base on read
-    # @param s Query sequence returned by bam_get_seq()
- # @param i The i-th position, 0-based
- # @return 4-bit integer representing the base.
- char bam_seqi(char *s, int i)
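-
-    # Example (minimal sketch, b assumed to be a bam1_t*): recovering the read
-    # sequence as ASCII via bam_seqi() and the seq_nt16_str table from hts.h:
-    #
-    #     char *s = bam_get_seq(b);
-    #     for (int i = 0; i < b->core.l_qseq; i++)
-    #         putchar(seq_nt16_str[bam_seqi(s, i)]);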
-
- #**************************
- #*** Exported functions ***
- #**************************
-
- #***************
- #*** BAM I/O ***
- #***************
-
- bam_hdr_t *bam_hdr_init()
- bam_hdr_t *bam_hdr_read(BGZF *fp)
- int bam_hdr_write(BGZF *fp, const bam_hdr_t *h)
- void bam_hdr_destroy(bam_hdr_t *h)
- int bam_name2id(bam_hdr_t *h, const char *ref)
- bam_hdr_t* bam_hdr_dup(const bam_hdr_t *h0)
-
- bam1_t *bam_init1()
- void bam_destroy1(bam1_t *b)
- int bam_read1(BGZF *fp, bam1_t *b)
- int bam_write1(BGZF *fp, const bam1_t *b)
- bam1_t *bam_copy1(bam1_t *bdst, const bam1_t *bsrc)
- bam1_t *bam_dup1(const bam1_t *bsrc)
-
- int bam_cigar2qlen(int n_cigar, const uint32_t *cigar)
- int bam_cigar2rlen(int n_cigar, const uint32_t *cigar)
-
- # @abstract Calculate the rightmost base position of an alignment on the
- # reference genome.
-
- # @param b pointer to an alignment
- # @return the coordinate of the first base after the alignment, 0-based
-
- # @discussion For a mapped read, this is just b->core.pos + bam_cigar2rlen.
- # For an unmapped read (either according to its flags or if it has no cigar
- # string), we return b->core.pos + 1 by convention.
- int32_t bam_endpos(const bam1_t *b)
-
- int bam_str2flag(const char *str) # returns negative value on error
- char *bam_flag2str(int flag) # The string must be freed by the user
-
- #*************************
- #*** BAM/CRAM indexing ***
- #*************************
-
- # These BAM iterator functions work only on BAM files. To work with either
- # BAM or CRAM files use the sam_index_load() & sam_itr_*() functions.
- void bam_itr_destroy(hts_itr_t *iter)
- hts_itr_t *bam_itr_queryi(const hts_idx_t *idx, int tid, int beg, int end)
- hts_itr_t *bam_itr_querys(const hts_idx_t *idx, bam_hdr_t *hdr, const char *region)
- int bam_itr_next(htsFile *htsfp, hts_itr_t *itr, void *r)
-
- # Load/build .csi or .bai BAM index file. Does not work with CRAM.
- # It is recommended to use the sam_index_* functions below instead.
- hts_idx_t *bam_index_load(const char *fn)
- int bam_index_build(const char *fn, int min_shift)
-
- # Load a BAM (.csi or .bai) or CRAM (.crai) index file
- # @param fp File handle of the data file whose index is being opened
- # @param fn BAM/CRAM/etc filename to search alongside for the index file
- # @return The index, or NULL if an error occurred.
- hts_idx_t *sam_index_load(htsFile *fp, const char *fn)
-
- # Load a specific BAM (.csi or .bai) or CRAM (.crai) index file
- # @param fp File handle of the data file whose index is being opened
- # @param fn BAM/CRAM/etc data file filename
- # @param fnidx Index filename, or NULL to search alongside @a fn
- # @return The index, or NULL if an error occurred.
- hts_idx_t *sam_index_load2(htsFile *fp, const char *fn, const char *fnidx)
-
- # Generate and save an index file
- # @param fn Input BAM/etc filename, to which .csi/etc will be added
- # @param min_shift Positive to generate CSI, or 0 to generate BAI
- # @return 0 if successful, or negative if an error occurred (usually -1; or
- # -2: opening fn failed; -3: format not indexable)
- int sam_index_build(const char *fn, int min_shift)
-
- # Generate and save an index to a specific file
- # @param fn Input BAM/CRAM/etc filename
- # @param fnidx Output filename, or NULL to add .bai/.csi/etc to @a fn
- # @param min_shift Positive to generate CSI, or 0 to generate BAI
- # @return 0 if successful, or negative if an error occurred.
- int sam_index_build2(const char *fn, const char *fnidx, int min_shift)
-
- void sam_itr_destroy(hts_itr_t *iter)
- hts_itr_t *sam_itr_queryi(const hts_idx_t *idx, int tid, int beg, int end)
- hts_itr_t *sam_itr_querys(const hts_idx_t *idx, bam_hdr_t *hdr, const char *region)
- int sam_itr_next(htsFile *htsfp, hts_itr_t *itr, void *r)
-
- #***************
- #*** SAM I/O ***
- #***************
-
- htsFile *sam_open(const char *fn, const char *mode)
- htsFile *sam_open_format(const char *fn, const char *mode, const htsFormat *fmt)
- int sam_close(htsFile *fp)
-
- int sam_open_mode(char *mode, const char *fn, const char *format)
-
- # A version of sam_open_mode that can handle ,key=value options.
- # The format string is allocated and returned, to be freed by the caller.
-    # Prefix should be "r" or "w".
- char *sam_open_mode_opts(const char *fn, const char *mode, const char *format)
-
- bam_hdr_t *sam_hdr_parse(int l_text, const char *text)
- bam_hdr_t *sam_hdr_read(htsFile *fp)
- int sam_hdr_write(htsFile *fp, const bam_hdr_t *h)
-
- int sam_parse1(kstring_t *s, bam_hdr_t *h, bam1_t *b)
- int sam_format1(const bam_hdr_t *h, const bam1_t *b, kstring_t *str)
- int sam_read1(htsFile *fp, bam_hdr_t *h, bam1_t *b)
- int sam_write1(htsFile *fp, const bam_hdr_t *h, const bam1_t *b)
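-
-    # Example (minimal sketch, error handling omitted): copying records from
-    # one BAM file to another with the SAM I/O functions above:
-    #
-    #     htsFile *in = sam_open("in.bam", "rb"), *out = sam_open("out.bam", "wb");
-    #     bam_hdr_t *hdr = sam_hdr_read(in);
-    #     sam_hdr_write(out, hdr);
-    #     bam1_t *b = bam_init1();
-    #     while (sam_read1(in, hdr, b) >= 0) sam_write1(out, hdr, b);
-    #     bam_destroy1(b); bam_hdr_destroy(hdr);
-    #     sam_close(in); sam_close(out);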
-
- #*************************************
- #*** Manipulating auxiliary fields ***
- #*************************************
-
- uint8_t *bam_aux_get(const bam1_t *b, const char *tag)
- int32_t bam_aux2i(const uint8_t *s)
- double bam_aux2f(const uint8_t *s)
- char bam_aux2A(const uint8_t *s)
- char *bam_aux2Z(const uint8_t *s)
-
- void bam_aux_append(bam1_t *b, const char *tag, char type, int len, uint8_t *data)
- int bam_aux_del(bam1_t *b, uint8_t *s)
-
- #**************************
- #*** Pileup and Mpileup ***
- #**************************
-
- # @abstract Structure for one alignment covering the pileup position.
- # @field b pointer to the alignment
- # @field qpos position of the read base at the pileup site, 0-based
- # @field indel indel length; 0 for no indel, positive for ins and negative for del
- # @field level the level of the read in the "viewer" mode
- # @field is_del 1 iff the base on the padded read is a deletion
- # @field is_head ???
- # @field is_tail ???
- # @field is_refskip ???
- # @field aux ???
- #
- # @discussion See also bam_plbuf_push() and bam_lplbuf_push(). The
- # difference between the two functions is that the former does not
-    # set bam_pileup1_t::level, while the latter does. Level helps the
- # implementation of alignment viewers, but calculating this has some
- # overhead.
- #
- # is_del, is_head, etc are a bit field, declaring as below should
- # work as expected, see
- # https://groups.google.com/forum/#!msg/cython-users/24tD1kwRY7A/pmoPuSmanM0J
-
- ctypedef struct bam_pileup1_t:
- bam1_t *b
- int32_t qpos
- int indel, level
- uint32_t is_del
- uint32_t is_head
- uint32_t is_tail
- uint32_t is_refskip
- uint32_t aux
-
- ctypedef int (*bam_plp_auto_f)(void *data, bam1_t *b)
- ctypedef int (*bam_test_f)()
-
- ctypedef struct __bam_plp_t
- ctypedef __bam_plp_t *bam_plp_t
-
- ctypedef struct __bam_mplp_t
- ctypedef __bam_mplp_t *bam_mplp_t
-
-    # bam_plp_init() - sets up a pileup iterator
- # @func: see mplp_func in bam_plcmd.c in samtools for an example. Expected return
- # status: 0 on success, -1 on end, < -1 on non-recoverable errors
- # @data: user data to pass to @func
- bam_plp_t bam_plp_init(bam_plp_auto_f func, void *data)
- void bam_plp_destroy(bam_plp_t iter)
- int bam_plp_push(bam_plp_t iter, const bam1_t *b)
- const bam_pileup1_t *bam_plp_next(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp)
- const bam_pileup1_t *bam_plp_auto(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp)
- void bam_plp_set_maxcnt(bam_plp_t iter, int maxcnt)
- void bam_plp_reset(bam_plp_t iter)
-
- bam_mplp_t bam_mplp_init(int n, bam_plp_auto_f func, void **data)
-
- # bam_mplp_init_overlaps() - if called, mpileup will detect overlapping
- # read pairs and for each base pair set the base quality of the
- # lower-quality base to zero, thus effectively discarding it from
- # calling. If the two bases are identical, the quality of the other base
- # is increased to the sum of their qualities (capped at 200), otherwise
- # it is multiplied by 0.8.
- void bam_mplp_init_overlaps(bam_mplp_t iter)
- void bam_mplp_destroy(bam_mplp_t iter)
- void bam_mplp_set_maxcnt(bam_mplp_t iter, int maxcnt)
- int bam_mplp_auto(bam_mplp_t iter, int *_tid, int *_pos, int *n_plp, const bam_pileup1_t **plp)
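-
-    # Example (minimal sketch): a single-file pileup loop; read_bam is a
-    # hypothetical user callback matching bam_plp_auto_f that fills b with the
-    # next alignment and returns sam_read1()'s result:
-    #
-    #     bam_plp_t it = bam_plp_init(read_bam, data);
-    #     int tid, pos, n;
-    #     const bam_pileup1_t *plp;
-    #     while ((plp = bam_plp_auto(it, &tid, &pos, &n)) != NULL)
-    #         printf("%d\t%d\t%d\n", tid, pos, n);   // depth n at tid:pos
-    #     bam_plp_destroy(it);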
-
- # Added by AH
- # ctypedef bam_pileup1_t * const_bam_pileup1_t_ptr "const bam_pileup1_t *"
-
-
-cdef extern from "htslib/faidx.h" nogil:
-
- ctypedef struct faidx_t:
- pass
-
- int fai_build(char *fn)
-
- void fai_destroy(faidx_t *fai)
-
- faidx_t *fai_load(char *fn)
-
- char *fai_fetch(faidx_t *fai,
- char *reg,
- int *len)
-
- int faidx_nseq(faidx_t *fai)
-
- int faidx_has_seq(faidx_t *fai, const char *seq)
-
- char *faidx_fetch_seq(faidx_t *fai,
- char *c_name,
- int p_beg_i,
- int p_end_i,
- int *len)
-
- int faidx_seq_len(faidx_t *fai, const char *seq)
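-
-    # Example (minimal sketch, error handling omitted): fetching a region from
-    # an indexed FASTA; fai_fetch() returns a malloc'ed string that the caller
-    # must free:
-    #
-    #     faidx_t *fai = fai_load("ref.fa");
-    #     int len;
-    #     char *seq = fai_fetch(fai, "chr1:100-200", &len);
-    #     if (seq) { fputs(seq, stdout); free(seq); }
-    #     fai_destroy(fai);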
-
-
-# tabix support
-cdef extern from "htslib/tbx.h" nogil:
-
- # tbx.h definitions
- int8_t TBX_MAX_SHIFT
- int8_t TBX_GENERIC
- int8_t TBX_SAM
- int8_t TBX_VCF
- int8_t TBX_UCSC
-
- ctypedef struct tbx_conf_t:
- int32_t preset
- int32_t sc, bc, ec # seq col., beg col. and end col.
- int32_t meta_char, line_skip
-
- ctypedef struct tbx_t:
- tbx_conf_t conf
- hts_idx_t *idx
- void * dict
-
- tbx_conf_t tbx_conf_gff
- tbx_conf_t tbx_conf_bed
- tbx_conf_t tbx_conf_psltbl
- tbx_conf_t tbx_conf_sam
- tbx_conf_t tbx_conf_vcf
-
- void tbx_itr_destroy(hts_itr_t * iter)
-    hts_itr_t * tbx_itr_queryi(tbx_t * t, int tid, int beg, int end)
- hts_itr_t * tbx_itr_querys(tbx_t * t, char * s)
- int tbx_itr_next(htsFile * fp, tbx_t * t, hts_itr_t * iter, void * data)
-
- int tbx_name2id(tbx_t *tbx, char *ss)
-
- int tbx_index_build(char *fn, int min_shift, tbx_conf_t *conf)
- int tbx_index_build2(const char *fn, const char *fnidx, int min_shift, const tbx_conf_t *conf)
-
- tbx_t * tbx_index_load(char *fn)
- tbx_t *tbx_index_load2(const char *fn, const char *fnidx)
-
- # free the array but not the values
- char **tbx_seqnames(tbx_t *tbx, int *n)
-
- void tbx_destroy(tbx_t *tbx)
-
-
-# VCF/BCF API
-cdef extern from "htslib/vcf.h" nogil:
-
- # Header struct
-
- uint8_t BCF_HL_FLT # header line
- uint8_t BCF_HL_INFO
- uint8_t BCF_HL_FMT
- uint8_t BCF_HL_CTG
- uint8_t BCF_HL_STR # structured header line TAG=<A=..,B=..>
- uint8_t BCF_HL_GEN # generic header line
-
- uint8_t BCF_HT_FLAG # header type
- uint8_t BCF_HT_INT
- uint8_t BCF_HT_REAL
- uint8_t BCF_HT_STR
-
- uint8_t BCF_VL_FIXED # variable length
- uint8_t BCF_VL_VAR
- uint8_t BCF_VL_A
- uint8_t BCF_VL_G
- uint8_t BCF_VL_R
-
- # === Dictionary ===
- #
-    # The header keeps three dictionaries. The first keeps IDs in the
- # "FILTER/INFO/FORMAT" lines, the second keeps the sequence names and lengths
- # in the "contig" lines and the last keeps the sample names. bcf_hdr_t::dict[]
- # is the actual hash table, which is opaque to the end users. In the hash
- # table, the key is the ID or sample name as a C string and the value is a
- # bcf_idinfo_t struct. bcf_hdr_t::id[] points to key-value pairs in the hash
- # table in the order that they appear in the VCF header. bcf_hdr_t::n[] is the
- # size of the hash table or, equivalently, the length of the id[] arrays.
-
- uint8_t BCF_DT_ID # dictionary type
- uint8_t BCF_DT_CTG
- uint8_t BCF_DT_SAMPLE
-
- # Complete textual representation of a header line
- ctypedef struct bcf_hrec_t:
- int type # One of the BCF_HL_* type
- char *key # The part before '=', i.e. FILTER/INFO/FORMAT/contig/fileformat etc.
- char *value # Set only for generic lines, NULL for FILTER/INFO, etc.
- int nkeys # Number of structured fields
- char **keys # The key=value pairs
- char **vals
-
- ctypedef struct bcf_idinfo_t:
- uint32_t info[3] # stores Number:20, var:4, Type:4, ColType:4 in info[0..2]
- bcf_hrec_t *hrec[3] # for BCF_HL_FLT,INFO,FMT and contig length in info[0] for BCF_HL_CTG
- int id
-
- ctypedef struct bcf_idpair_t:
- const char *key
- const bcf_idinfo_t *val
-
- ctypedef struct bcf_hdr_t:
- int32_t n[3] # n:the size of the dictionary block in use, (allocated size, m, is below to preserve ABI)
- bcf_idpair_t *id[3]
- void *dict[3] # ID dictionary, contig dict and sample dict
- char **samples
- bcf_hrec_t **hrec
- int nhrec, dirty
- int ntransl
- int *transl[2] # for bcf_translate()
- int nsamples_ori # for bcf_hdr_set_samples()
- uint8_t *keep_samples
- kstring_t mem
- int32_t m[3] # m: allocated size of the dictionary block in use (see n above)
-
- uint8_t bcf_type_shift[]
-
- # * VCF record *
-
- uint8_t BCF_BT_NULL
- uint8_t BCF_BT_INT8
- uint8_t BCF_BT_INT16
- uint8_t BCF_BT_INT32
- uint8_t BCF_BT_FLOAT
- uint8_t BCF_BT_CHAR
-
- uint8_t VCF_REF
- uint8_t VCF_SNP
- uint8_t VCF_MNP
- uint8_t VCF_INDEL
- uint8_t VCF_OTHER
-
- ctypedef struct variant_t:
- int type, n # variant type and the number of bases affected, negative for deletions
-
- ctypedef struct bcf_fmt_t:
- int id # id: numeric tag id, the corresponding string is bcf_hdr_t::id[BCF_DT_ID][$id].key
- int n, size, type # n: number of values per-sample; size: number of bytes per-sample; type: one of BCF_BT_* types
- uint8_t *p # same as vptr and vptr_* in bcf_info_t below
- uint32_t p_len
- uint32_t p_off
- uint8_t p_free
-
- union bcf_info_union_t:
- int32_t i # integer value
- float f # float value
-
- ctypedef struct bcf_info_t:
- int key # key: numeric tag id, the corresponding string is bcf_hdr_t::id[BCF_DT_ID][$key].key
- int type, len # type: one of BCF_BT_* types; len: vector length, 1 for scalars
-
- # v1 union only set if $len==1; for easier access
- bcf_info_union_t v1
- uint8_t *vptr # pointer to data array in bcf1_t->shared.s, excluding the size+type and tag id bytes
- uint32_t vptr_len # length of the vptr block or, when set, of the vptr_mod block, excluding offset
- uint32_t vptr_off # vptr offset, i.e., the size of the INFO key plus size+type bytes
- uint8_t vptr_free # indicates that vptr-vptr_off must be freed; set only when modified and the new
- # data block is bigger than the original
-
- uint8_t BCF1_DIRTY_ID
- uint8_t BCF1_DIRTY_ALS
- uint8_t BCF1_DIRTY_FLT
- uint8_t BCF1_DIRTY_INF
-
- ctypedef struct bcf_dec_t:
- int m_fmt, m_info, m_id, m_als, m_allele, m_flt # allocated size (high-water mark); do not change
- int n_flt # Number of FILTER fields
- int *flt # FILTER keys in the dictionary
- char *id # ID
-        char *als # REF+ALT block (\0-separated)
- char **allele # allele[0] is the REF (allele[] pointers to the als block); all null terminated
- bcf_info_t *info # INFO
- bcf_fmt_t *fmt # FORMAT and individual sample
- variant_t *var # $var and $var_type set only when set_variant_types called
- int n_var, var_type
- int shared_dirty # if set, shared.s must be recreated on BCF output
- int indiv_dirty # if set, indiv.s must be recreated on BCF output
-
- uint8_t BCF_ERR_CTG_UNDEF
- uint8_t BCF_ERR_TAG_UNDEF
- uint8_t BCF_ERR_NCOLS
- uint8_t BCF_ERR_LIMITS
-
- # The bcf1_t structure corresponds to one VCF/BCF line. Reading from VCF file
-    # is slower because the string must first be parsed and packed into a BCF
-    # line (done in vcf_parse), then unpacked into the internal bcf1_t structure. If it
- # is known in advance that some of the fields will not be required (notably
- # the sample columns), parsing of these can be skipped by setting max_unpack
- # appropriately.
- # Similarly, it is fast to output a BCF line because the columns (kept in
- # shared.s, indiv.s, etc.) are written directly by bcf_write, whereas a VCF
- # line must be formatted in vcf_format.
-
- ctypedef struct bcf1_t:
- int32_t rid # CHROM
- int32_t pos # POS
- int32_t rlen # length of REF
- float qual # QUAL
- uint32_t n_info, n_allele
- uint32_t n_fmt, n_sample
- kstring_t shared, indiv
- bcf_dec_t d # lazy evaluation: $d is not generated by bcf_read(), but by explicitly calling bcf_unpack()
- int max_unpack # Set to BCF_UN_STR, BCF_UN_FLT, or BCF_UN_INFO to boost performance of vcf_parse when some of the fields won't be needed
- int unpacked # remember what has been unpacked to allow calling bcf_unpack() repeatedly without redoing the work
- int unpack_size[3] # the original block size of ID, REF+ALT and FILTER
- int errcode # one of BCF_ERR_* codes
-
- ####### API #######
-
- # BCF and VCF I/O
- #
- # A note about naming conventions: htslib internally represents VCF
- # records as bcf1_t data structures, therefore most functions are
- # prefixed with bcf_. There are a few exceptions where the functions must
- # be aware of both BCF and VCF worlds, such as bcf_parse vs vcf_parse. In
- # these cases, functions prefixed with bcf_ are more general and work
- # with both BCF and VCF.
-
- # bcf_hdr_init() - create an empty BCF header.
- # @param mode "r" or "w"
- #
- # When opened for writing, the mandatory fileFormat and
- # FILTER=PASS lines are added automatically.
- bcf_hdr_t *bcf_hdr_init(const char *mode)
-
- # Destroy a BCF header struct
- void bcf_hdr_destroy(bcf_hdr_t *h)
-
- # Initialize a bcf1_t object; equivalent to calloc(1, sizeof(bcf1_t))
- bcf1_t *bcf_init()
-
- # Deallocate a bcf1_t object
- void bcf_destroy(bcf1_t *v)
-
- # Same as bcf_destroy() but frees only the memory allocated by bcf1_t,
- # not the bcf1_t object itself.
- void bcf_empty(bcf1_t *v)
-
- # Make the bcf1_t object ready for next read. Intended mostly for
- # internal use, the user should rarely need to call this function
- # directly.
- void bcf_clear(bcf1_t *v)
-
- # Reads VCF or BCF header
- bcf_hdr_t *bcf_hdr_read(htsFile *fp)
-
- # bcf_hdr_set_samples() - for more efficient VCF parsing when only one/few samples are needed
- # @samples: samples to include or exclude from file or as a comma-separated string.
- # LIST|FILE .. select samples in list/file
- # ^LIST|FILE .. exclude samples from list/file
- # - .. include all samples
- # NULL .. exclude all samples
- # @is_file: @samples is a file (1) or a comma-separated list (0)
- #
- # The bottleneck of VCF reading is parsing of genotype fields. If the
-    # reader knows in advance that only a subset of samples is needed (possibly
- # no samples at all), the performance of bcf_read() can be significantly
- # improved by calling bcf_hdr_set_samples after bcf_hdr_read().
- # The function bcf_read() will subset the VCF/BCF records automatically
- # with the notable exception when reading records via bcf_itr_next().
- # In this case, bcf_subset_format() must be called explicitly, because
- # bcf_readrec() does not see the header.
- #
- # Returns 0 on success, -1 on error or a positive integer if the list
- # contains samples not present in the VCF header. In such a case, the
- # return value is the index of the offending sample.
- #
- int bcf_hdr_set_samples(bcf_hdr_t *hdr, const char *samples, int is_file)
- int bcf_subset_format(const bcf_hdr_t *hdr, bcf1_t *rec)
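-
-    # Example (minimal sketch, fp assumed to be an open htsFile*; the sample
-    # names are illustrative): restrict parsing to two samples before reading:
-    #
-    #     bcf_hdr_t *hdr = bcf_hdr_read(fp);
-    #     bcf_hdr_set_samples(hdr, "sampleA,sampleB", 0);  // 0: list, not a file
-    #     bcf1_t *rec = bcf_init();
-    #     while (bcf_read(fp, hdr, rec) == 0) { /* rec now carries 2 samples */ }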
-
- # Writes VCF or BCF header
- int bcf_hdr_write(htsFile *fp, bcf_hdr_t *h)
-
- # Parse VCF line contained in kstring and populate the bcf1_t struct
- int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v)
-
- # The opposite of vcf_parse. It should rarely be called directly, see vcf_write
- int vcf_format(const bcf_hdr_t *h, const bcf1_t *v, kstring_t *s)
-
- # bcf_read() - read next VCF or BCF record
- #
- # Returns -1 on critical errors, 0 otherwise. On errors which are not
- # critical for reading, such as missing header definitions, v->errcode is
-    # set to one of the BCF_ERR_* codes and must be checked before calling
- # vcf_write().
- int bcf_read(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
-
- # bcf_unpack() - unpack/decode a BCF record (fills the bcf1_t::d field)
- #
- # Note that bcf_unpack() must be called even when reading VCF. It is safe
- # to call the function repeatedly, it will not unpack the same field
- # twice.
- uint8_t BCF_UN_STR # up to ALT inclusive
- uint8_t BCF_UN_FLT # up to FILTER
- uint8_t BCF_UN_INFO # up to INFO
- uint8_t BCF_UN_SHR # all shared information
- uint8_t BCF_UN_FMT # unpack format and each sample
-    uint8_t BCF_UN_IND # a synonym of BCF_UN_FMT
- uint8_t BCF_UN_ALL # everything
-
- int bcf_unpack(bcf1_t *b, int which)
-
- # bcf_dup() - create a copy of BCF record.
- #
- # Note that bcf_unpack() must be called on the returned copy as if it was
- # obtained from bcf_read(). Also note that bcf_dup() calls bcf_sync1(src)
- # internally to reflect any changes made by bcf_update_* functions.
- bcf1_t *bcf_dup(bcf1_t *src)
- bcf1_t *bcf_copy(bcf1_t *dst, bcf1_t *src)
-
- # bcf_write() - write one VCF or BCF record. The type is determined at the open() call.
- int bcf_write(htsFile *fp, bcf_hdr_t *h, bcf1_t *v)
-
- # The following functions work only with VCFs and should rarely be called
- # directly. Usually one wants to use their bcf_* alternatives, which work
- # transparently with both VCFs and BCFs.
- bcf_hdr_t *vcf_hdr_read(htsFile *fp)
- int vcf_hdr_write(htsFile *fp, const bcf_hdr_t *h)
- int vcf_read(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
- int vcf_write(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
-
- #************************************************************************
- # Header querying and manipulation routines
- #************************************************************************
-
- # Create a new header using the supplied template
- bcf_hdr_t *bcf_hdr_dup(const bcf_hdr_t *hdr)
-
- # Copy header lines from src to dst if not already present in dst. See also bcf_translate().
- # Returns 0 on success or sets a bit on error:
- # 1 .. conflicting definitions of tag length
- # # todo
- int bcf_hdr_combine(bcf_hdr_t *dst, const bcf_hdr_t *src)
-
- # bcf_hdr_merge() - copy header lines from src to dst, see also bcf_translate()
- # @param dst: the destination header to be merged into, NULL on the first pass
- # @param src: the source header
- #
- # Notes:
- # - use as:
- # bcf_hdr_t *dst = NULL;
- # for (i=0; i<nsrc; i++) dst = bcf_hdr_merge(dst,src[i]);
- #
- # - bcf_hdr_merge() replaces bcf_hdr_combine() which had a problem when
- # combining multiple BCF headers. The current bcf_hdr_combine()
- # does not have this problem, but became slow when used for many files.
- bcf_hdr_t *bcf_hdr_merge(bcf_hdr_t *dst, const bcf_hdr_t *src)
-
- # bcf_hdr_add_sample() - add a new sample.
- # @param sample: sample name to be added
- int bcf_hdr_add_sample(bcf_hdr_t *hdr, const char *sample)
-
- # Read VCF header from a file and update the header
- int bcf_hdr_set(bcf_hdr_t *hdr, const char *fname)
-
- # Returns formatted header (newly allocated string) and its length,
- # excluding the terminating \0. If is_bcf parameter is unset, IDX
- # fields are discarded.
- char *bcf_hdr_fmt_text(const bcf_hdr_t *hdr, int is_bcf, int *len)
-
- # Append new VCF header line, returns 0 on success
- int bcf_hdr_append(bcf_hdr_t *h, const char *line)
- int bcf_hdr_printf(bcf_hdr_t *h, const char *format, ...)
-
- # VCF version, e.g. VCFv4.2
- const char *bcf_hdr_get_version(const bcf_hdr_t *hdr)
- void bcf_hdr_set_version(bcf_hdr_t *hdr, const char *version)
-
- # bcf_hdr_remove() - remove VCF header tag
- # @param type: one of BCF_HL_*
- # @param key: tag name or NULL to remove all tags of the given type
- void bcf_hdr_remove(bcf_hdr_t *h, int type, const char *key)
-
- # bcf_hdr_subset() - creates a new copy of the header removing unwanted samples
- # @param n: number of samples to keep
- # @param samples: names of the samples to keep
- # @param imap: mapping from index in @samples to the sample index in the original file
- #
- # Sample names not present in h0 are ignored. The number of unmatched samples can be checked
- # by comparing n and bcf_hdr_nsamples(out_hdr).
- # This function can be used to reorder samples.
- # See also bcf_subset() which subsets individual records.
- #
- bcf_hdr_t *bcf_hdr_subset(const bcf_hdr_t *h0, int n, char *const* samples, int *imap)
-
- # Creates a list of sequence names. It is up to the caller to free the list (but not the sequence names)
- const char **bcf_hdr_seqnames(const bcf_hdr_t *h, int *nseqs)
-
- # Get number of samples
- int32_t bcf_hdr_nsamples(const bcf_hdr_t *h)
-
- # The following functions are for internal use and should rarely be called directly
- int bcf_hdr_parse(bcf_hdr_t *hdr, char *htxt)
- int bcf_hdr_sync(bcf_hdr_t *h)
- bcf_hrec_t *bcf_hdr_parse_line(const bcf_hdr_t *h, const char *line, int *len)
- void bcf_hrec_format(const bcf_hrec_t *hrec, kstring_t *str)
- int bcf_hdr_add_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec)
-
- # bcf_hdr_get_hrec() - get header line info
- # @param type: one of the BCF_HL_* types: FLT,INFO,FMT,CTG,STR,GEN
- # @param key: the header key for generic lines (e.g. "fileformat"), any field
- # for structured lines, typically "ID".
-    # @param value: the value which pairs with key. Can be NULL for BCF_HL_GEN
- # @param str_class: the class of BCF_HL_STR line (e.g. "ALT" or "SAMPLE"), otherwise NULL
- #
- bcf_hrec_t *bcf_hdr_get_hrec(const bcf_hdr_t *hdr, int type, const char *key, const char *value, const char *str_class)
- bcf_hrec_t *bcf_hrec_dup(bcf_hrec_t *hrec)
- void bcf_hrec_add_key(bcf_hrec_t *hrec, const char *str, int len)
- void bcf_hrec_set_val(bcf_hrec_t *hrec, int i, const char *str, int len, int is_quoted)
- int bcf_hrec_find_key(bcf_hrec_t *hrec, const char *key)
- void hrec_add_idx(bcf_hrec_t *hrec, int idx)
- void bcf_hrec_destroy(bcf_hrec_t *hrec)
-
- #************************************************************************
- # Individual record querying and manipulation routines
- #************************************************************************
-
- # See the description of bcf_hdr_subset()
- int bcf_subset(const bcf_hdr_t *h, bcf1_t *v, int n, int *imap)
-
-    # bcf_translate() - translate tag IDs to be consistent with a different header. This function
-    # is useful when lines from multiple VCFs need to be combined.
- # @dst_hdr: the destination header, to be used in bcf_write(), see also bcf_hdr_combine()
- # @src_hdr: the source header, used in bcf_read()
- # @src_line: line obtained by bcf_read()
- int bcf_translate(const bcf_hdr_t *dst_hdr, bcf_hdr_t *src_hdr, bcf1_t *src_line)
-
- # bcf_get_variant_type[s]() - returns one of VCF_REF, VCF_SNP, etc
- int bcf_get_variant_types(bcf1_t *rec)
- int bcf_get_variant_type(bcf1_t *rec, int ith_allele)
- int bcf_is_snp(bcf1_t *v)
-
- # bcf_update_filter() - sets the FILTER column
- # @flt_ids: The filter IDs to set, numeric IDs returned by bcf_hdr_id2int(hdr, BCF_DT_ID, "PASS")
- # @n: Number of filters. If n==0, all filters are removed
- int bcf_update_filter(const bcf_hdr_t *hdr, bcf1_t *line, int *flt_ids, int n)
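-
-    # Example (minimal sketch, hdr and line assumed in scope): set FILTER to PASS:
-    #
-    #     int pass_id = bcf_hdr_id2int(hdr, BCF_DT_ID, "PASS");
-    #     bcf_update_filter(hdr, line, &pass_id, 1);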
-
- # bcf_add_filter() - adds to the FILTER column
- # @flt_id: The filter IDs to add, numeric IDs returned by bcf_hdr_id2int(hdr, BCF_DT_ID, "PASS")
- #
- # If flt_id is PASS, all existing filters are removed first. If other than PASS, existing PASS is removed.
- int bcf_add_filter(const bcf_hdr_t *hdr, bcf1_t *line, int flt_id)
-
- # bcf_remove_filter() - removes from the FILTER column
- # @flt_id: filter ID to remove, numeric ID returned by bcf_hdr_id2int(hdr, BCF_DT_ID, "PASS")
- # @pass: when set to 1 and no filters are present, set to PASS
- int bcf_remove_filter(const bcf_hdr_t *hdr, bcf1_t *line, int flt_id, int set_pass)
-
- # Returns 1 if present, 0 if absent, or -1 if filter does not exist. "PASS" and "." can be used interchangeably.
- int bcf_has_filter(const bcf_hdr_t *hdr, bcf1_t *line, char *filter)
-
- # bcf_update_alleles() and bcf_update_alleles_str() - update REF and ALT column
- # @alleles: Array of alleles
- # @nals: Number of alleles
- # @alleles_string: Comma-separated alleles, starting with the REF allele
- int bcf_update_alleles(const bcf_hdr_t *hdr, bcf1_t *line, const char **alleles, int nals)
- int bcf_update_alleles_str(const bcf_hdr_t *hdr, bcf1_t *line, const char *alleles_string)
-
- # bcf_update_id() - sets new ID string
- # bcf_add_id() - adds to the ID string checking for duplicates
- int bcf_update_id(const bcf_hdr_t *hdr, bcf1_t *line, const char *id)
- int bcf_add_id(const bcf_hdr_t *hdr, bcf1_t *line, const char *id)
-
- # bcf_update_info_*() - functions for updating INFO fields
- # @hdr: the BCF header
- # @line: VCF line to be edited
- # @key: the INFO tag to be updated
- # @values: pointer to the array of values. Pass NULL to remove the tag.
- # @n: number of values in the array. When set to 0, the INFO tag is removed
- #
- # The @string in bcf_update_info_flag() is optional, @n indicates whether
- # the flag is set or removed.
- #
- # Returns 0 on success or negative value on error.
- #
- int bcf_update_info_int32(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const int32_t *values, int n)
- int bcf_update_info_float(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const float *values, int n)
- int bcf_update_info_flag(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const char *values, int n)
- int bcf_update_info_string(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const char *values, int n)
- int bcf_update_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const void *values, int n, int type)
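-
-    # Example (minimal sketch, hdr and line assumed in scope; assumes an
-    # INFO/DP line is defined in the header):
-    #
-    #     int32_t dp = 42;
-    #     bcf_update_info_int32(hdr, line, "DP", &dp, 1);   // set DP=42
-    #     bcf_update_info_int32(hdr, line, "DP", NULL, 0);  // remove DP again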
-
- # bcf_update_format_*() - functions for updating FORMAT fields
- # @values: pointer to the array of values, the same number of elements
- # is expected for each sample. Missing values must be padded
- # with bcf_*_missing or bcf_*_vector_end values.
- # @n: number of values in the array. If n==0, existing tag is removed.
- #
- # The function bcf_update_format_string() is a higher-level (slower) variant of
- # bcf_update_format_char(). The former accepts array of \0-terminated strings
- # whereas the latter requires that the strings are collapsed into a single array
- # of fixed-length strings. In case of strings with variable length, shorter strings
- # can be \0-padded. Note that the collapsed strings passed to bcf_update_format_char()
- # are not \0-terminated.
- #
- # Returns 0 on success or negative value on error.
- #
- int bcf_update_format_int32(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const int32_t *values, int n)
- int bcf_update_format_float(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const float *values, int n)
- int bcf_update_format_char(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const char *values, int n)
- int bcf_update_genotypes(const bcf_hdr_t *hdr, bcf1_t *line, const int32_t *values, int n)
- int bcf_update_format_string(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const char **values, int n)
- int bcf_update_format(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const void *values, int n, int type)
-
- # Macros for setting genotypes correctly, for use with bcf_update_genotypes only; idx corresponds
- # to VCF's GT (1-based index to ALT or 0 for the reference allele) and val is the opposite, obtained
- # from bcf_get_genotypes() below.
- uint32_t bcf_gt_phased(uint32_t idx)
- uint32_t bcf_gt_unphased(uint32_t idx)
- uint32_t bcf_gt_missing
- uint32_t bcf_gt_is_missing(uint32_t val)
- uint32_t bcf_gt_is_phased(uint32_t idx)
- uint32_t bcf_gt_allele(uint32_t val)
-
-    # Conversion between allele indexes and the Number=G genotype index (assuming diploid, all 0-based)
- uint32_t bcf_alleles2gt(uint32_t a, uint32_t b)
- void bcf_gt2alleles(int igt, int *a, int *b)
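-
-    # Example (minimal sketch, hdr and line assumed in scope): writing a
-    # phased diploid genotype 0|1 and decoding it again with
-    # bcf_get_genotypes() (declared below):
-    #
-    #     int32_t gt[2] = { bcf_gt_phased(0), bcf_gt_phased(1) };
-    #     bcf_update_genotypes(hdr, line, gt, 2);
-    #     int *arr = NULL, narr = 0;
-    #     bcf_get_genotypes(hdr, line, &arr, &narr);
-    #     int a0 = bcf_gt_allele(arr[0]), a1 = bcf_gt_allele(arr[1]);  // 0 and 1
-    #     free(arr);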
-
- # bcf_get_fmt() - returns pointer to FORMAT's field data
- # @header: for access to BCF_DT_ID dictionary
- # @line: VCF line obtained from vcf_parse1
- # @fmt: one of GT,PL,...
- #
- # Returns bcf_fmt_t* if the call succeeded, or returns NULL when the field
- # is not available.
- #
- bcf_fmt_t *bcf_get_fmt(const bcf_hdr_t *hdr, bcf1_t *line, const char *key)
- bcf_info_t *bcf_get_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key)
-
- # bcf_get_*_id() - returns pointer to FORMAT/INFO field data given the header index instead of the string ID
- # @line: VCF line obtained from vcf_parse1
- # @id: The header index for the tag, obtained from bcf_hdr_id2int()
- #
- # Returns bcf_fmt_t* / bcf_info_t*. These functions do not check if the index is valid
- # as their goal is to avoid the header lookup.
- #
- bcf_fmt_t *bcf_get_fmt_id(bcf1_t *line, const int id)
- bcf_info_t *bcf_get_info_id(bcf1_t *line, const int id)
-
- # bcf_get_info_*() - get INFO values, integers or floats
- # @hdr: BCF header
- # @line: BCF record
- # @tag: INFO tag to retrieve
- # @dst: *dst is pointer to a memory location, can point to NULL
- # @ndst: pointer to the size of allocated memory
- #
- # Returns negative value on error or the number of written values on
- # success. bcf_get_info_string() returns on success the number of
- # characters written excluding the null-terminating byte. bcf_get_info_flag()
- # returns 1 when flag is set or 0 if not.
- #
- # List of return codes:
- # -1 .. no such INFO tag defined in the header
- # -2 .. clash between types defined in the header and encountered in the VCF record
- # -3 .. tag is not present in the VCF record
- #
- int bcf_get_info_int32(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, int32_t **dst, int *ndst)
- int bcf_get_info_float(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, float **dst, int *ndst)
- int bcf_get_info_string(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, char **dst, int *ndst)
- int bcf_get_info_flag(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, int **dst, int *ndst)
- int bcf_get_info_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, void **dst, int *ndst, int type)
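-
-    # Example (minimal sketch, hdr and line assumed in scope): fetching
-    # INFO/DP; the destination buffer is (re)allocated by htslib and must be
-    # freed by the caller:
-    #
-    #     int32_t *dp = NULL; int ndp = 0;
-    #     if ( bcf_get_info_int32(hdr, line, "DP", &dp, &ndp) > 0 )
-    #         printf("DP=%d\n", dp[0]);
-    #     free(dp);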
-
- # bcf_get_format_*() - same as bcf_get_info*() above
- #
- # The function bcf_get_format_string() is a higher-level (slower) variant of bcf_get_format_char().
-    # See the description of bcf_update_format_string() and bcf_update_format_char() above.
-    # Unlike other bcf_get_format_*() functions, bcf_get_format_string() allocates two arrays:
- # a single block of \0-terminated strings collapsed into a single array and an array of pointers
- # to these strings. Both arrays must be cleaned by the user.
- #
- # Returns negative value on error or the number of written values on success.
- #
- # Example:
-    # int ndst = 0; char **dst = NULL;
-    # if ( bcf_get_format_string(hdr, line, "XX", &dst, &ndst) > 0 )
-    #     for (i=0; i<bcf_hdr_nsamples(hdr); i++) printf("%s\n", dst[i]);
-    # free(dst[0]); free(dst);
-    #
-    # Example:
-    # int ngt, *gt_arr = NULL, ngt_arr = 0;
-    # ngt = bcf_get_genotypes(hdr, line, &gt_arr, &ngt_arr);
- #
- int bcf_get_format_int32(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, int32_t **dst, int *ndst)
- int bcf_get_format_float(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, float **dst, int *ndst)
- int bcf_get_format_char(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, char **dst, int *ndst)
- int bcf_get_genotypes(const bcf_hdr_t *hdr, bcf1_t *line, int **dst, int *ndst)
- int bcf_get_format_string(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, char ***dst, int *ndst)
- int bcf_get_format_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, void **dst, int *ndst, int type)
-
- #************************************************************************
- # Helper functions
- #************************************************************************
-
- #
- # bcf_hdr_id2int() - Translates string into numeric ID
- # bcf_hdr_int2id() - Translates numeric ID into string
- # @type: one of BCF_DT_ID, BCF_DT_CTG, BCF_DT_SAMPLE
- # @id: tag name, such as: PL, DP, GT, etc.
- #
- # Returns -1 if string is not in dictionary, otherwise numeric ID which identifies
- # fields in BCF records.
- #
- int bcf_hdr_id2int(const bcf_hdr_t *hdr, int type, const char *id)
- const char *bcf_hdr_int2id(const bcf_hdr_t *hdr, int type, int int_id)
-
- # bcf_hdr_name2id() - Translates sequence names (chromosomes) into numeric ID
- # bcf_hdr_id2name() - Translates numeric ID to sequence name
- #
- int bcf_hdr_name2id(const bcf_hdr_t *hdr, const char *id)
- const char *bcf_hdr_id2name(const bcf_hdr_t *hdr, int rid)
- const char *bcf_seqname(const bcf_hdr_t *hdr, bcf1_t *rec)
-
- #
- # bcf_hdr_id2*() - Macros for accessing bcf_idinfo_t
- # @type: one of BCF_HL_FLT, BCF_HL_INFO, BCF_HL_FMT
- # @int_id: return value of bcf_hdr_id2int, must be >=0
- #
- # The returned values are:
- # bcf_hdr_id2length .. whether the number of values is fixed or variable, one of BCF_VL_*
- # bcf_hdr_id2number .. the number of values, 0xfffff for variable length fields
- # bcf_hdr_id2type .. the field type, one of BCF_HT_*
- # bcf_hdr_id2coltype .. the column type, one of BCF_HL_*
- #
- # Notes: Prior to using the macros, the presence of the info should be
- # tested with bcf_hdr_idinfo_exists().
- #
- int bcf_hdr_id2length(const bcf_hdr_t *hdr, int type, int int_id)
- int bcf_hdr_id2number(const bcf_hdr_t *hdr, int type, int int_id)
- int bcf_hdr_id2type(const bcf_hdr_t *hdr, int type, int int_id)
- int bcf_hdr_id2coltype(const bcf_hdr_t *hdr, int type, int int_id)
- int bcf_hdr_idinfo_exists(const bcf_hdr_t *hdr, int type, int int_id)
- bcf_hrec_t *bcf_hdr_id2hrec(const bcf_hdr_t *hdr, int type, int col_type, int int_id)
-
- void bcf_fmt_array(kstring_t *s, int n, int type, void *data)
- uint8_t *bcf_fmt_sized_array(kstring_t *s, uint8_t *ptr)
-
- void bcf_enc_vchar(kstring_t *s, int l, const char *a)
- void bcf_enc_vint(kstring_t *s, int n, int32_t *a, int wsize)
- void bcf_enc_vfloat(kstring_t *s, int n, float *a)
-
- #************************************************************************
- # BCF index
- #
- # Note that these functions work with BCFs only. See synced_bcf_reader.h
- # which provides (amongst other things) an API to work transparently with
- # both indexed BCFs and VCFs.
- #************************************************************************
-
- hts_idx_t *bcf_index_load2(const char *fn, const char *fnidx)
- int bcf_index_build(const char *fn, int min_shift)
- int bcf_index_build2(const char *fn, const char *fnidx, int min_shift)
-
-    #************************************************************************
-    # Typed value I/O
-    #************************************************************************
-
- # Note that in contrast with BCFv2.1 specification, HTSlib implementation
- # allows missing values in vectors. For integer types, the values 0x80,
- # 0x8000, 0x80000000 are interpreted as missing values and 0x81, 0x8001,
- # 0x80000001 as end-of-vector indicators. Similarly for floats, the value of
- # 0x7F800001 is interpreted as a missing value and 0x7F800002 as an
- # end-of-vector indicator.
- # Note that the end-of-vector byte is not part of the vector.
-
-    # This trial BCF version (v2.2) is compatible with the VCF specification and
-    # makes it possible to handle vectors of different ploidy correctly in the
-    # presence of missing values.
-
- int32_t bcf_int8_vector_end
- int32_t bcf_int16_vector_end
- int32_t bcf_int32_vector_end
- int32_t bcf_str_vector_end
- int32_t bcf_int8_missing
- int32_t bcf_int16_missing
- int32_t bcf_int32_missing
- int32_t bcf_str_missing
-
- uint32_t bcf_float_vector_end
- uint32_t bcf_float_missing
-
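A pure-Python sketch of the sentinel convention described above, using the int8 case (0x80 is -128 and 0x81 is -127 when read as signed bytes):

# int8 sentinels from the note above, viewed as signed values
INT8_MISSING = -128      # 0x80: missing value
INT8_VECTOR_END = -127   # 0x81: end-of-vector indicator

def decode_vector(values):
    """Strip end-of-vector padding, map missing values to None."""
    out = []
    for v in values:
        if v == INT8_VECTOR_END:
            break                      # padding: not part of the vector
        out.append(None if v == INT8_MISSING else v)
    return out

# a haploid genotype padded to diploid width, and a vector with a
# missing entry:
assert decode_vector([2, INT8_VECTOR_END]) == [2]
assert decode_vector([INT8_MISSING, 3]) == [None, 3]
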
- void bcf_float_set(float *ptr, uint32_t value)
- void bcf_float_set_vector_end(float *x)
- void bcf_float_set_missing(float *x)
-
- int bcf_float_is_missing(float f)
- int bcf_float_is_vector_end(float f)
- void bcf_format_gt(bcf_fmt_t *fmt, int isample, kstring_t *str)
- void bcf_enc_size(kstring_t *s, int size, int type)
- int bcf_enc_inttype(long x)
- void bcf_enc_int1(kstring_t *s, int32_t x)
- int32_t bcf_dec_int1(const uint8_t *p, int type, uint8_t **q)
- int32_t bcf_dec_typed_int1(const uint8_t *p, uint8_t **q)
- int32_t bcf_dec_size(const uint8_t *p, uint8_t **q, int *type)
-
- # These trivial wrappers are defined only for consistency with other parts of htslib
- bcf1_t *bcf_init1()
- int bcf_read1(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
- int vcf_read1(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
- int bcf_write1(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
- int vcf_write1(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
- void bcf_destroy1(bcf1_t *v)
- void bcf_empty1(bcf1_t *v)
- int vcf_parse1(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v)
- void bcf_clear1(bcf1_t *v)
- int vcf_format1(const bcf_hdr_t *h, const bcf1_t *v, kstring_t *s)
-
- # Other nice wrappers
- void bcf_itr_destroy(hts_itr_t *iter)
- hts_itr_t *bcf_itr_queryi(const hts_idx_t *idx, int tid, int beg, int end)
- hts_itr_t *bcf_itr_querys(const hts_idx_t *idx, const bcf_hdr_t *hdr, char *s)
- int bcf_itr_next(htsFile *fp, hts_itr_t *iter, void *r)
- hts_idx_t *bcf_index_load(const char *fn)
- const char **bcf_index_seqnames(const hts_idx_t *idx, const bcf_hdr_t *hdr, int *nptr)
-
-
-# VCF/BCF utility functions
-cdef extern from "htslib/vcfutils.h" nogil:
- struct kbitset_t
-
- # bcf_trim_alleles() - remove ALT alleles unused in genotype fields
- # @header: for access to BCF_DT_ID dictionary
-    # @line:    VCF line obtained from vcf_parse1
- #
- # Returns the number of removed alleles on success or negative
- # on error:
- # -1 .. some allele index is out of bounds
- int bcf_trim_alleles(const bcf_hdr_t *header, bcf1_t *line)
-
- # bcf_remove_alleles() - remove ALT alleles according to bitmask @mask
- # @header: for access to BCF_DT_ID dictionary
- # @line: VCF line obtained from vcf_parse1
- # @mask: alleles to remove
- #
- # If you have more than 31 alleles, then the integer bit mask will
- # overflow, so use bcf_remove_allele_set instead
- void bcf_remove_alleles(const bcf_hdr_t *header, bcf1_t *line, int mask)
-
- # bcf_remove_allele_set() - remove ALT alleles according to bitset @rm_set
- # @header: for access to BCF_DT_ID dictionary
- # @line: VCF line obtained from vcf_parse1
- # @rm_set: pointer to kbitset_t object with bits set for allele
- # indexes to remove
- #
- # Number=A,R,G INFO and FORMAT fields will be updated accordingly.
- void bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, kbitset_t *rm_set)
-
- # bcf_calc_ac() - calculate the number of REF and ALT alleles
- # @header: for access to BCF_DT_ID dictionary
- # @line: VCF line obtained from vcf_parse1
- # @ac: array of length line->n_allele
-    # @which:  determines whether INFO/AN,AC and the indv fields should be used
- #
- # Returns 1 if the call succeeded, or 0 if the value could not
- # be determined.
- #
-    # The value of @which determines if existing INFO/AC,AN can be
-    # used (BCF_UN_INFO) and if the indv fields can be split
-    # (BCF_UN_FMT).
- int bcf_calc_ac(const bcf_hdr_t *header, bcf1_t *line, int *ac, int which)
-
- # bcf_gt_type() - determines type of the genotype
- # @fmt_ptr: the GT format field as set for example by set_fmt_ptr
- # @isample: sample index (starting from 0)
- # @ial: index of the 1st non-reference allele (starting from 1)
- # @jal: index of the 2nd non-reference allele (starting from 1)
- #
- # Returns the type of the genotype (one of GT_HOM_RR, GT_HET_RA,
- # GT_HOM_AA, GT_HET_AA, GT_HAPL_R, GT_HAPL_A or GT_UNKN). If $ial
- # is not NULL and the genotype has one or more non-reference
- # alleles, $ial will be set. In case of GT_HET_AA, $ial is the
- # position of the allele which appeared first in ALT. If $jal is
- # not null and the genotype is GT_HET_AA, $jal will be set and is
- # the position of the second allele in ALT.
- uint8_t GT_HOM_RR # note: the actual value of GT_* matters, used in dosage r2 calculation
- uint8_t GT_HOM_AA
- uint8_t GT_HET_RA
- uint8_t GT_HET_AA
- uint8_t GT_HAPL_R
- uint8_t GT_HAPL_A
- uint8_t GT_UNKN
- int bcf_gt_type(bcf_fmt_t *fmt_ptr, int isample, int *ial, int *jal)
-
- int bcf_acgt2int(char c)
- char bcf_int2acgt(int i)
-
- # bcf_ij2G() - common task: allele indexes to Number=G index (diploid)
- # @i,j: allele indexes, 0-based, i<=j
- # Returns index to the Number=G diploid array
- uint32_t bcf_ij2G(uint32_t i, uint32_t j)
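A worked sketch of the diploid Number=G indexing computed by bcf_ij2G(); in htslib this is j*(j+1)/2 + i for allele indexes i <= j:

def ij2G(i, j):
    """Map a diploid genotype (i, j), i <= j, to its Number=G index."""
    assert i <= j
    return j * (j + 1) // 2 + i

# biallelic site: genotypes 0/0, 0/1, 1/1 occupy G indexes 0, 1, 2
assert [ij2G(0, 0), ij2G(0, 1), ij2G(1, 1)] == [0, 1, 2]
# a triallelic site adds 0/2 -> 3, 1/2 -> 4, 2/2 -> 5
assert [ij2G(0, 2), ij2G(1, 2), ij2G(2, 2)] == [3, 4, 5]
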
+++ /dev/null
-# cython: embedsignature=True
-# cython: profile=True
-# adds doc-strings for sphinx
-from pysam.chtslib cimport *
-
-cpdef set_verbosity(int verbosity):
- u"""Set htslib's hts_verbose global variable to the specified value.
- """
- return hts_set_verbosity(verbosity)
-
-cpdef get_verbosity():
- u"""Return the value of htslib's hts_verbose global variable.
- """
- return hts_get_verbosity()
-
-__all__ = [
- "get_verbosity",
- "set_verbosity"]
-
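A usage sketch for the two wrappers above, assuming they are re-exported at the package level as the __all__ list suggests (htslib's default hts_verbose level is 3):

import pysam

old = pysam.get_verbosity()   # htslib's default level is 3
pysam.set_verbosity(0)        # silence htslib messages
# ... perform work that would otherwise emit htslib warnings ...
pysam.set_verbosity(old)      # restore the previous level
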
+++ /dev/null
-from pysam.calignmentfile cimport AlignedSegment, AlignmentFile
-
-#################################################
-# Compatibility Layer for pysam < 0.8
-
-# import all declarations from htslib
-from pysam.chtslib cimport *
-
-cdef class AlignedRead(AlignedSegment):
- pass
-
-cdef class Samfile(AlignmentFile):
- pass
-
-# import the conversion functions
-cdef extern from "htslib_util.h":
-
- # add *nbytes* into the variable length data of *src* at *pos*
- bam1_t * pysam_bam_update(bam1_t * b,
- size_t nbytes_old,
- size_t nbytes_new,
- uint8_t * pos)
-
- # now: static
- int aux_type2size(int)
-
- char * pysam_bam_get_qname(bam1_t * b)
- uint32_t * pysam_bam_get_cigar(bam1_t * b)
- uint8_t * pysam_bam_get_seq(bam1_t * b)
- uint8_t * pysam_bam_get_qual(bam1_t * b)
- uint8_t * pysam_bam_get_aux(bam1_t * b)
- int pysam_bam_get_l_aux(bam1_t * b)
- char pysam_bam_seqi(uint8_t * s, int i)
-
- uint16_t pysam_get_bin(bam1_t * b)
- uint8_t pysam_get_qual(bam1_t * b)
- uint8_t pysam_get_l_qname(bam1_t * b)
- uint16_t pysam_get_flag(bam1_t * b)
- uint16_t pysam_get_n_cigar(bam1_t * b)
- void pysam_set_bin(bam1_t * b, uint16_t v)
- void pysam_set_qual(bam1_t * b, uint8_t v)
- void pysam_set_l_qname(bam1_t * b, uint8_t v)
- void pysam_set_flag(bam1_t * b, uint16_t v)
- void pysam_set_n_cigar(bam1_t * b, uint16_t v)
- void pysam_update_flag(bam1_t * b, uint16_t v, uint16_t flag)
+++ /dev/null
-# cython: embedsignature=True
-# cython: profile=True
-# adds doc-strings for sphinx
-import tempfile
-import os
-import sys
-import types
-import itertools
-import struct
-import ctypes
-import collections
-import re
-import platform
-import warnings
-from cpython cimport PyErr_SetString, \
- PyBytes_Check, \
- PyUnicode_Check, \
- PyBytes_FromStringAndSize
-
-from cpython.version cimport PY_MAJOR_VERSION
-
-from pysam.calignmentfile cimport AlignmentFile, AlignedSegment
-
-
-cdef class Samfile(AlignmentFile):
- '''Deprecated alternative for :class:`~pysam.AlignmentFile`
-
- Added for backwards compatibility with pysam <= 0.8.0
- '''
- pass
-
-
-cdef class AlignedRead(AlignedSegment):
- '''Deprecated alternative for :class:`~pysam.AlignedSegment`
-
- Added for backwards compatibility with pysam <= 0.8.0
- '''
- pass
-
-
-__all__ = ['Samfile', 'AlignedRead']
-
-
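What the compatibility layer provides, sketched; ex1.bam is a hypothetical input:

import pysam

samfile = pysam.Samfile("ex1.bam", "rb")          # pysam <= 0.8.0 name
assert isinstance(samfile, pysam.AlignmentFile)   # current name
for read in samfile:                              # identical behaviour
    print(read.query_name)
    break
samfile.close()
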
+++ /dev/null
-from libc.stdint cimport int8_t, int16_t, int32_t, int64_t
-from libc.stdint cimport uint8_t, uint16_t, uint32_t, uint64_t
-from libc.stdlib cimport malloc, calloc, realloc, free
-from libc.string cimport memcpy, memcmp, strncpy, strlen, strdup
-from libc.stdio cimport FILE, printf
-
-# Note: this replaces python "open"!
-cdef extern from "fcntl.h":
- int open(char *pathname, int flags)
-
-cdef extern from "unistd.h" nogil:
- ctypedef int ssize_t
- ssize_t read(int fd, void *buf, size_t count)
- int close(int fd)
-
-from pysam.chtslib cimport hts_idx_t, hts_itr_t, htsFile, \
- tbx_t, kstring_t, BGZF
-
-# These functions are put here and not in chtslib.pxd in order
-# to avoid warnings for unused functions.
-cdef extern from "pysam_stream.h" nogil:
-
- ctypedef struct kstream_t:
- pass
-
- ctypedef struct kseq_t:
- kstring_t name
- kstring_t comment
- kstring_t seq
- kstring_t qual
-
- kseq_t *kseq_init(BGZF *)
- int kseq_read(kseq_t *)
- void kseq_destroy(kseq_t *)
- kstream_t *ks_init(BGZF *)
- void ks_destroy(kstream_t *)
-
- # Retrieve characters from stream until delimiter
- # is reached placing results in str.
- int ks_getuntil(kstream_t *,
- int delimiter,
- kstring_t * str,
- int * dret)
-
-
-cdef class tabix_file_iterator:
- cdef BGZF * fh
- cdef kstream_t * kstream
- cdef kstring_t buffer
- cdef size_t size
- cdef Parser parser
- cdef int fd
- cdef int duplicated_fd
- cdef infile
-
- cdef __cnext__(self)
-
-cdef class TabixFile:
-
- # pointer to tabixfile
- cdef htsFile * tabixfile
- # pointer to index structure
- cdef tbx_t * index
-
- # flag indicating whether file is remote
- cdef int is_remote
-
- cdef object _filename
- cdef object _filename_index
-
- cdef Parser parser
-
- cdef encoding
-
-cdef class Parser:
- cdef encoding
-
- cdef parse(self, char * buffer, int len)
-
-cdef class asTuple(Parser):
- cdef parse(self, char * buffer, int len)
-
-cdef class asGTF(Parser):
- pass
-
-cdef class asBed(Parser):
- pass
-
-cdef class asVCF(Parser):
- pass
-
-cdef class TabixIterator:
- cdef hts_itr_t * iterator
- cdef TabixFile tabixfile
- cdef kstring_t buffer
- cdef encoding
- cdef int __cnext__(self)
-
-cdef class TabixIteratorParsed(TabixIterator):
- cdef Parser parser
-
-cdef class GZIterator:
- cdef object _filename
- cdef BGZF * gzipfile
- cdef kstream_t * kstream
- cdef kstring_t buffer
- cdef int __cnext__(self)
- cdef encoding
-
-cdef class GZIteratorHead(GZIterator):
- pass
-
-cdef class GZIteratorParsed(GZIterator):
- cdef Parser parser
-
-# Compatibility Layer for pysam < 0.8
-cdef class Tabixfile(TabixFile):
- pass
-
+++ /dev/null
-# cython: embedsignature=True
-# cython: profile=True
-###############################################################################
-###############################################################################
-# Cython wrapper for access to tabix indexed files in bgzf format
-###############################################################################
-# The principal classes and functions defined in this module are:
-#
-# class TabixFile class wrapping tabix indexed files in bgzf format
-#
-# class asTuple Parser class for tuples
 
-# class asGTF      Parser class for GTF formatted rows
-# class asBed Parser class for Bed formatted rows
-# class asVCF Parser class for VCF formatted rows
-#
-# class tabix_generic_iterator Streamed iterator of bgzf formatted files
-#
-# Additionally this module defines several additional classes that are part
-# of the internal API. These are:
-#
-# class Parser base class for parsers of tab-separated rows
-# class tabix_file_iterator
-# class TabixIterator iterator class over rows in bgzf file
-# class EmptyIterator
-#
-# For backwards compatibility, the following classes are also defined:
-#
-# class Tabixfile equivalent to TabixFile
-#
-###############################################################################
-#
-# The MIT License
-#
-# Copyright (c) 2015 Andreas Heger
-#
-# Permission is hereby granted, free of charge, to any person obtaining a
-# copy of this software and associated documentation files (the "Software"),
-# to deal in the Software without restriction, including without limitation
-# the rights to use, copy, modify, merge, publish, distribute, sublicense,
-# and/or sell copies of the Software, and to permit persons to whom the
-# Software is furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-# DEALINGS IN THE SOFTWARE.
-#
-###############################################################################
-import os
-import sys
-
-from libc.stdio cimport printf, fprintf, stderr
-from libc.string cimport strerror
-from libc.errno cimport errno
-from posix.unistd cimport dup
-
-from cpython cimport PyErr_SetString, PyBytes_Check, \
- PyUnicode_Check, PyBytes_FromStringAndSize, \
- PyObject_AsFileDescriptor
-
-from cpython.version cimport PY_MAJOR_VERSION
-
-cimport pysam.ctabixproxies as ctabixproxies
-
-from pysam.chtslib cimport htsFile, hts_open, hts_close, HTS_IDX_START,\
- BGZF, bgzf_open, bgzf_dopen, bgzf_close, bgzf_write, \
- tbx_index_build, tbx_index_load, tbx_itr_queryi, tbx_itr_querys, \
- tbx_conf_t, tbx_seqnames, tbx_itr_next, tbx_itr_destroy, \
- tbx_destroy, hisremote
-
-from pysam.cutils cimport force_bytes, force_str, charptr_to_str
-from pysam.cutils cimport encode_filename, from_string_and_size
-
-cdef class Parser:
-
- def __init__(self, encoding="ascii"):
- self.encoding = encoding
-
- def set_encoding(self, encoding):
- self.encoding = encoding
-
- def get_encoding(self):
- return self.encoding
-
- cdef parse(self, char * buffer, int length):
- raise NotImplementedError(
- 'parse method of %s not implemented' % str(self))
-
- def __call__(self, char * buffer, int length):
- return self.parse(buffer, length)
-
-
-cdef class asTuple(Parser):
- '''converts a :term:`tabix row` into a python tuple.
-
- A field in a row is accessed by numeric index.
- '''
- cdef parse(self, char * buffer, int len):
- cdef ctabixproxies.TupleProxy r
- r = ctabixproxies.TupleProxy(self.encoding)
- # need to copy - there were some
- # persistence issues with "present"
- r.copy(buffer, len)
- return r
-
-
-cdef class asGTF(Parser):
- '''converts a :term:`tabix row` into a GTF record with the following
- fields:
-
- +----------+----------+-------------------------------+
- |*Column* |*Name* |*Content* |
- +----------+----------+-------------------------------+
- |1 |contig |the chromosome name |
- +----------+----------+-------------------------------+
- |2 |feature |The feature type |
- +----------+----------+-------------------------------+
- |3 |source |The feature source |
- +----------+----------+-------------------------------+
- |4 |start |genomic start coordinate |
- | | |(0-based) |
- +----------+----------+-------------------------------+
- |5 |end |genomic end coordinate |
- | | |(0-based) |
- +----------+----------+-------------------------------+
- |6 |score |feature score |
- +----------+----------+-------------------------------+
- |7 |strand |strand |
- +----------+----------+-------------------------------+
- |8 |frame |frame |
- +----------+----------+-------------------------------+
- |9 |attributes|the attribute field |
- +----------+----------+-------------------------------+
-
- GTF formatted entries also define the following fields that
- are derived from the attributes field:
-
- +--------------------+------------------------------+
- |*Name* |*Content* |
- +--------------------+------------------------------+
- |gene_id |the gene identifier |
- +--------------------+------------------------------+
- |transcript_id |the transcript identifier |
- +--------------------+------------------------------+
-
- '''
- cdef parse(self, char * buffer, int len):
- cdef ctabixproxies.GTFProxy r
- r = ctabixproxies.GTFProxy(self.encoding)
- r.copy(buffer, len)
- return r
-
-
-cdef class asBed(Parser):
- '''converts a :term:`tabix row` into a bed record
- with the following fields:
-
- +-----------+-----------+------------------------------------------+
- |*Column* |*Field* |*Contents* |
- | | | |
- +-----------+-----------+------------------------------------------+
- |1 |contig |contig |
- | | | |
- +-----------+-----------+------------------------------------------+
- |2 |start |genomic start coordinate (zero-based) |
- +-----------+-----------+------------------------------------------+
- |3 |end |genomic end coordinate plus one |
- | | |(zero-based) |
- +-----------+-----------+------------------------------------------+
- |4 |name |name of feature. |
- +-----------+-----------+------------------------------------------+
- |5 |score |score of feature |
- +-----------+-----------+------------------------------------------+
- |6 |strand |strand of feature |
- +-----------+-----------+------------------------------------------+
- |7 |thickStart |thickStart |
- +-----------+-----------+------------------------------------------+
- |8 |thickEnd |thickEnd |
- +-----------+-----------+------------------------------------------+
- |9 |itemRGB |itemRGB |
- +-----------+-----------+------------------------------------------+
-    |10         |blockCount |number of blocks                          |
- +-----------+-----------+------------------------------------------+
- |11 |blockSizes |',' separated string of block sizes |
- +-----------+-----------+------------------------------------------+
- |12 |blockStarts|',' separated string of block genomic |
- | | |start positions |
- +-----------+-----------+------------------------------------------+
-
- Only the first three fields are required. Additional
- fields are optional, but if one is defined, all the preceding
- need to be defined as well.
-
- '''
- cdef parse(self, char * buffer, int len):
- cdef ctabixproxies.BedProxy r
- r = ctabixproxies.BedProxy(self.encoding)
- r.copy(buffer, len)
- return r
-
-
-cdef class asVCF(Parser):
- '''converts a :term:`tabix row` into a VCF record with
- the following fields:
-
- +----------+---------+------------------------------------+
- |*Column* |*Field* |*Contents* |
- | | | |
- +----------+---------+------------------------------------+
- |1 |contig |chromosome |
- +----------+---------+------------------------------------+
- |2 |pos |chromosomal position, zero-based |
- +----------+---------+------------------------------------+
- |3 |id |id |
- +----------+---------+------------------------------------+
- |4 |ref |reference allele |
- +----------+---------+------------------------------------+
- |5 |alt |alternate alleles |
- +----------+---------+------------------------------------+
- |6 |qual |quality |
- +----------+---------+------------------------------------+
- |7 |filter |filter |
- +----------+---------+------------------------------------+
- |8 |info |info |
- +----------+---------+------------------------------------+
- |9 |format |format specifier. |
- +----------+---------+------------------------------------+
-
- Access to genotypes is via index::
-
- contig = vcf.contig
- first_sample_genotype = vcf[0]
- second_sample_genotype = vcf[1]
-
- '''
- cdef parse(self, char * buffer, int len):
- cdef ctabixproxies.VCFProxy r
- r = ctabixproxies.VCFProxy(self.encoding)
- r.copy(buffer, len)
- return r
-
-
-cdef class TabixFile:
- """Random access to bgzf formatted files that
- have been indexed by :term:`tabix`.
-
- The file is automatically opened. The index file of file
- ``<filename>`` is expected to be called ``<filename>.tbi``
- by default (see parameter `index`).
-
- Parameters
- ----------
-
- filename : string
- Filename of bgzf file to be opened.
-
- index : string
- The filename of the index. If not set, the default is to
-        assume that the index is called ``<filename>.tbi``.
-
- mode : char
- The file opening mode. Currently, only ``r`` is permitted.
-
- parser : :class:`pysam.Parser`
-
- sets the default parser for this tabix file. If `parser`
- is None, the results are returned as an unparsed string.
- Otherwise, `parser` is assumed to be a functor that will return
- parsed data (see for example :class:`~pysam.asTuple` and
- :class:`~pysam.asGTF`).
-
- encoding : string
-
- The encoding passed to the parser
-
- Raises
- ------
-
- ValueError
- if index file is missing.
-
- IOError
- if file could not be opened
- """
- def __cinit__(self,
- filename,
- mode = 'r',
- parser=None,
- index=None,
- encoding="ascii",
- *args,
- **kwargs ):
-
- self.tabixfile = NULL
- self.parser = parser
- self._open(filename, mode, index, *args, **kwargs)
- self.encoding = encoding
-
- def _open( self,
- filename,
- mode='r',
- index=None,
- ):
- '''open a :term:`tabix file` for reading.
- '''
-
- assert mode in ("r",), "invalid file opening mode `%s`" % mode
-
- if self.tabixfile != NULL:
- self.close()
- self.tabixfile = NULL
-
- filename_index = index or (filename + ".tbi")
- # encode all the strings to pass to tabix
- self._filename = encode_filename(filename)
- self._filename_index = encode_filename(filename_index)
-
- self.is_remote = hisremote(self._filename)
-
- if not self.is_remote:
- if not os.path.exists(filename):
- raise IOError("file `%s` not found" % filename)
-
- if not os.path.exists(filename_index):
- raise IOError("index `%s` not found" % filename_index)
-
- # open file
- cdef char *cfilename = self._filename
- with nogil:
- self.tabixfile = hts_open(cfilename, 'r')
-
- if self.tabixfile == NULL:
- raise IOError("could not open file `%s`" % filename)
-
- cfilename = self._filename_index
- with nogil:
- self.index = tbx_index_load(cfilename)
-
- if self.index == NULL:
- raise IOError("could not open index for `%s`" % filename)
-
- def _dup(self):
- '''return a copy of this tabix file.
-
-        The file is re-opened.
- '''
- return TabixFile(self._filename,
- mode="r",
- parser=self.parser,
- index=self._filename_index,
- encoding=self.encoding)
-
- def is_open(self):
-        '''return True if the file is open.'''
- return self.tabixfile != NULL
-
-
- def fetch(self,
- reference=None,
- start=None,
- end=None,
- region=None,
- parser=None,
- multiple_iterators=False):
- '''fetch one or more rows in a :term:`region` using 0-based
- indexing. The region is specified by :term:`reference`,
- *start* and *end*. Alternatively, a samtools :term:`region`
- string can be supplied.
-
- Without *reference* or *region* all entries will be fetched.
-
-        If only *reference* is set, all entries on *reference*
-        will be fetched.
-
- If *parser* is None, the default parser will be used for
- parsing.
-
- Set *multiple_iterators* to true if you will be using multiple
- iterators on the same file at the same time. The iterator
- returned will receive its own copy of a filehandle to the file
- effectively re-opening the file. Re-opening a file creates
- some overhead, so beware.
-
- '''
- if not self.is_open():
- raise ValueError("I/O operation on closed file")
-
- # convert coordinates to region string, which is one-based
- if reference:
- if end is not None:
- if end < 0:
- raise ValueError("end out of range (%i)" % end)
- if start is None:
- start = 0
-
- if start < 0:
-                    raise ValueError("start out of range (%i)" % start)
- elif start > end:
- raise ValueError(
-                        'start (%i) > end (%i)' % (start, end))
- elif start == end:
- return EmptyIterator()
- else:
- region = '%s:%i-%i' % (reference, start + 1, end)
- elif start is not None:
- if start < 0:
-                raise ValueError("start out of range (%i)" % start)
- region = '%s:%i' % (reference, start + 1)
- else:
- region = reference
-
- # get iterator
- cdef hts_itr_t * itr
- cdef char *cstr
- cdef TabixFile fileobj
-
- # reopen the same file if necessary
- if multiple_iterators:
- fileobj = self._dup()
- else:
- fileobj = self
-
- if region is None:
- # without region or reference - iterate from start
- with nogil:
- itr = tbx_itr_queryi(fileobj.index,
- HTS_IDX_START,
- 0,
- 0)
- else:
- s = force_bytes(region, encoding=fileobj.encoding)
- cstr = s
- with nogil:
- itr = tbx_itr_querys(fileobj.index, cstr)
-
- if itr == NULL:
- if region is None:
- if len(self.contigs) > 0:
-                    # when accessing a tabix file created prior to tabix 1.0
- # the full-file iterator is empty.
- raise ValueError(
- "could not create iterator, possible "
- "tabix version mismatch")
- else:
- # possible reason is that the file is empty -
- # return an empty iterator
- return EmptyIterator()
- else:
- raise ValueError(
- "could not create iterator for region '%s'" %
- region)
-
- # use default parser if no parser is specified
- if parser is None:
- parser = fileobj.parser
-
- cdef TabixIterator a
- if parser is None:
- a = TabixIterator(encoding=fileobj.encoding)
- else:
- parser.set_encoding(fileobj.encoding)
- a = TabixIteratorParsed(parser)
-
- a.tabixfile = fileobj
- a.iterator = itr
-
- return a
-
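A usage sketch for fetch(), matching the docstring above; example.bed.gz and its .tbi index are hypothetical inputs:

import pysam

tbx = pysam.TabixFile("example.bed.gz", parser=pysam.asBed())
# 0-based half-open coordinates, as described in the docstring
for row in tbx.fetch("chr1", 1000, 2000):
    print(row.contig, row.start, row.end)
tbx.close()
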
- # context manager interface
- def __enter__(self):
- return self
-
- def __exit__(self, exc_type, exc_value, traceback):
- self.close()
- return False
-
- ###############################################################
- ###############################################################
- ###############################################################
- ## properties
- ###############################################################
- property closed:
-        """bool indicating the current state of the file object.
- This is a read-only attribute; the close() method changes the value.
- """
- def __get__(self):
- return not self.is_open()
-
- property filename:
- '''filename associated with this object.'''
- def __get__(self):
- if not self.is_open():
- raise ValueError("I/O operation on closed file")
- return self._filename
-
- property header:
- '''the file header.
-
- The file header consists of the lines at the beginning of a
- file that are prefixed by the comment character ``#``.
-
- .. note::
- The header is returned as an iterator presenting lines
- without the newline character.
-
- .. note::
- The header is only available for local files. For remote
- files an Attribute Error is raised.
-
- '''
-
- def __get__(self):
- if self.is_remote:
- raise AttributeError(
- "the header is not available for remote files")
- return GZIteratorHead(self.filename)
-
- property contigs:
- '''list of chromosome names'''
- def __get__(self):
- cdef char ** sequences
- cdef int nsequences
-
- with nogil:
- sequences = tbx_seqnames(self.index, &nsequences)
- cdef int x
- result = []
- for x from 0 <= x < nsequences:
- result.append(force_str(sequences[x]))
-
- # htslib instructions:
- # only free container, not the sequences themselves
- free(sequences)
-
- return result
-
- def close(self):
- '''
- closes the :class:`pysam.TabixFile`.'''
- if self.tabixfile != NULL:
- hts_close(self.tabixfile)
- self.tabixfile = NULL
- if self.index != NULL:
- tbx_destroy(self.index)
- self.index = NULL
-
- def __dealloc__( self ):
- # remember: dealloc cannot call other python methods
- # note: no doc string
- # note: __del__ is not called.
- if self.tabixfile != NULL:
- hts_close(self.tabixfile)
- self.tabixfile = NULL
- if self.index != NULL:
- tbx_destroy(self.index)
-
-
-cdef class TabixIterator:
- """iterates over rows in *tabixfile* in region
- given by *tid*, *start* and *end*.
- """
-
- def __init__(self, encoding="ascii"):
- self.encoding = encoding
-
- def __iter__(self):
- self.buffer.s = NULL
- self.buffer.l = 0
- self.buffer.m = 0
-
- return self
-
- cdef int __cnext__(self):
- '''iterate to next element.
-
- Return -5 if file has been closed when this function
- was called.
- '''
- if self.tabixfile.tabixfile == NULL:
- return -5
-
- cdef int retval
-
- while 1:
- with nogil:
- retval = tbx_itr_next(
- self.tabixfile.tabixfile,
- self.tabixfile.index,
- self.iterator,
- &self.buffer)
-
- if retval < 0:
- break
-
- if self.buffer.s[0] != '#':
- break
-
- return retval
-
- def __next__(self):
- """python version of next().
-
- pyrex uses this non-standard name instead of next()
- """
-
- cdef int retval = self.__cnext__()
- if retval == -5:
- raise IOError("iteration on closed file")
- elif retval < 0:
- raise StopIteration
-
- return charptr_to_str(self.buffer.s, self.encoding)
-
- def next(self):
- return self.__next__()
-
- def __dealloc__(self):
- if <void*>self.iterator != NULL:
- tbx_itr_destroy(self.iterator)
- if self.buffer.s != NULL:
- free(self.buffer.s)
-
-
-class EmptyIterator:
- '''empty iterator'''
-
- def __iter__(self):
- return self
-
- def next(self):
- raise StopIteration()
-
- def __next__(self):
- raise StopIteration()
-
-
-cdef class TabixIteratorParsed(TabixIterator):
-    """iterates over rows in a region.
-
- The *parser* determines the encoding.
-
- Returns parsed data.
- """
-
- def __init__(self,
- Parser parser):
-
- TabixIterator.__init__(self)
- self.parser = parser
-
- def __next__(self):
- """python version of next().
-
- pyrex uses this non-standard name instead of next()
- """
-
- cdef int retval = self.__cnext__()
- if retval == -5:
- raise IOError("iteration on closed file")
- elif retval < 0:
- raise StopIteration
-
- return self.parser.parse(self.buffer.s,
- self.buffer.l)
-
-
-cdef class GZIterator:
- def __init__(self, filename, int buffer_size=65536, encoding="ascii"):
- '''iterate line-by-line through gzip (or bgzip)
- compressed file.
- '''
- if not os.path.exists(filename):
- raise IOError("No such file or directory: %s" % filename)
-
- filename = encode_filename(filename)
- cdef char *cfilename = filename
- with nogil:
- self.gzipfile = bgzf_open(cfilename, "r")
- self._filename = filename
- self.kstream = ks_init(self.gzipfile)
- self.encoding = encoding
-
- self.buffer.l = 0
- self.buffer.m = 0
- self.buffer.s = <char*>malloc(buffer_size)
-
- def __dealloc__(self):
- '''close file.'''
- if self.gzipfile != NULL:
- bgzf_close(self.gzipfile)
- self.gzipfile = NULL
- if self.buffer.s != NULL:
- free(self.buffer.s)
- if self.kstream != NULL:
- ks_destroy(self.kstream)
-
- def __iter__(self):
- return self
-
- cdef int __cnext__(self):
- cdef int dret = 0
- cdef int retval = 0
- while 1:
- with nogil:
- retval = ks_getuntil(self.kstream, '\n', &self.buffer, &dret)
-
- if retval < 0:
- break
-
- return dret
- return -1
-
- def __next__(self):
- """python version of next().
- """
- cdef int retval = self.__cnext__()
- if retval < 0:
- raise StopIteration
- return force_str(self.buffer.s, self.encoding)
-
-
-cdef class GZIteratorHead(GZIterator):
- '''iterate line-by-line through gzip (or bgzip)
- compressed file returning comments at top of file.
- '''
-
- def __next__(self):
- """python version of next().
- """
- cdef int retval = self.__cnext__()
- if retval < 0:
- raise StopIteration
- if self.buffer.s[0] == '#':
- return self.buffer.s
- else:
- raise StopIteration
-
-
-cdef class GZIteratorParsed(GZIterator):
- '''iterate line-by-line through gzip (or bgzip)
-    compressed file returning parsed data.
- '''
-
- def __init__(self, parser):
- self.parser = parser
-
- def __next__(self):
- """python version of next().
- """
- cdef int retval = self.__cnext__()
- if retval < 0:
- raise StopIteration
-
- return self.parser.parse(self.buffer.s,
- self.buffer.l)
-
-
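A usage sketch for the GZIterator family above; example.bed.gz is a hypothetical input, and GZIteratorHead is the iterator behind the TabixFile.header property:

import pysam

# full file, line by line (the newline is already stripped by ks_getuntil)
for line in pysam.GZIterator("example.bed.gz"):
    print(line)
    break

# header/comment lines only: iteration stops at the first non-'#' line
for comment in pysam.GZIteratorHead("example.bed.gz"):
    print(comment)
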
-def tabix_compress(filename_in,
- filename_out,
- force=False):
- '''compress *filename_in* writing the output to *filename_out*.
-
- Raise an IOError if *filename_out* already exists, unless *force*
- is set.
- '''
-
- if not force and os.path.exists(filename_out):
- raise IOError(
- "Filename '%s' already exists, use *force* to "
- "overwrite" % filename_out)
-
- cdef int WINDOW_SIZE
- cdef int c, r
- cdef void * buffer
- cdef BGZF * fp
- cdef int fd_src
- cdef bint is_empty = True
- cdef int O_RDONLY
- O_RDONLY = os.O_RDONLY
-
- WINDOW_SIZE = 64 * 1024
-
- fn = encode_filename(filename_out)
- cdef char *cfn = fn
- with nogil:
- fp = bgzf_open(cfn, "w")
- if fp == NULL:
- raise IOError("could not open '%s' for writing" % filename_out)
-
- fn = encode_filename(filename_in)
- fd_src = open(fn, O_RDONLY)
-    if fd_src < 0:
- raise IOError("could not open '%s' for reading" % filename_in)
-
- buffer = malloc(WINDOW_SIZE)
- c = 1
-
- while c > 0:
- with nogil:
- c = read(fd_src, buffer, WINDOW_SIZE)
- if c > 0:
- is_empty = False
- r = bgzf_write(fp, buffer, c)
- if r < 0:
- free(buffer)
- raise OSError("writing failed")
-
- free(buffer)
- r = bgzf_close(fp)
- if r < 0:
- raise OSError("error %i when writing to file %s" % (r, filename_out))
-
- r = close(fd_src)
- # an empty file will return with -1, thus ignore this.
- if r < 0:
- if not (r == -1 and is_empty):
- raise OSError("error %i when closing file %s" % (r, filename_in))
-
-
-def tabix_index( filename,
- force = False,
- seq_col = None,
- start_col = None,
- end_col = None,
- preset = None,
- meta_char = "#",
- zerobased = False,
- int min_shift = -1,
- ):
- '''index tab-separated *filename* using tabix.
-
- An existing index will not be overwritten unless
- *force* is set.
-
- The index will be built from coordinates
- in columns *seq_col*, *start_col* and *end_col*.
-
- The contents of *filename* have to be sorted by
- contig and position - the method does not check
- if the file is sorted.
-
- Column indices are 0-based. Coordinates in the file
- are assumed to be 1-based.
-
- If *preset* is provided, the column coordinates
- are taken from a preset. Valid values for preset
-    are "gff", "bed", "sam", "vcf", "psltbl", "pileup".
-
- Lines beginning with *meta_char* and the first
- *line_skip* lines will be skipped.
-
- If *filename* does not end in ".gz", it will be automatically
- compressed. The original file will be removed and only the
- compressed file will be retained.
-
- If *filename* ends in *gz*, the file is assumed to be already
- compressed with bgzf.
-
-    *min_shift* sets the minimal interval size to 1<<INT; 0 for the
- old tabix index. The default of -1 is changed inside htslib to
- the old tabix default of 0.
-
- returns the filename of the compressed data
-
- '''
-
- if not os.path.exists(filename):
- raise IOError("No such file '%s'" % filename)
-
- if preset is None and \
- (seq_col is None or start_col is None or end_col is None):
- raise ValueError(
- "neither preset nor seq_col,start_col and end_col given")
-
- if not filename.endswith(".gz"):
- tabix_compress(filename, filename + ".gz", force=force)
- os.unlink( filename )
- filename += ".gz"
-
- if not force and os.path.exists(filename + ".tbi"):
- raise IOError(
-            "Filename '%s.tbi' already exists, use *force* to overwrite" % filename)
-
- # columns (1-based):
- # preset-code, contig, start, end, metachar for
- # comments, lines to ignore at beginning
- # 0 is a missing column
- preset2conf = {
- 'gff' : (0, 1, 4, 5, ord('#'), 0),
- 'bed' : (0x10000, 1, 2, 3, ord('#'), 0),
- 'psltbl' : (0x10000, 15, 17, 18, ord('#'), 0),
- 'sam' : (1, 3, 4, 0, ord('@'), 0),
- 'vcf' : (2, 1, 2, 0, ord('#'), 0),
- 'pileup': (3, 1, 2, 0, ord('#'), 0),
- }
-
- if preset:
- try:
- conf_data = preset2conf[preset]
- except KeyError:
- raise KeyError(
- "unknown preset '%s', valid presets are '%s'" %
- (preset, ",".join(preset2conf.keys())))
- else:
- if end_col == None:
- end_col = -1
- preset = 0
-
- # note that tabix internally works with 0-based coordinates
- # and open/closed intervals. When using a preset, conversion
- # is automatically taken care of. Otherwise, the coordinates
- # are assumed to be 1-based closed intervals and -1 is
- # subtracted from the start coordinate. To avoid doing this,
- # set the TI_FLAG_UCSC=0x10000 flag:
- if zerobased:
- preset = preset | 0x10000
-
- conf_data = (preset, seq_col+1, start_col+1, end_col+1, ord(meta_char), 0)
-
- cdef tbx_conf_t conf
- conf.preset, conf.sc, conf.bc, conf.ec, conf.meta_char, conf.line_skip = conf_data
-
-
- fn = encode_filename(filename)
- cdef char *cfn = fn
- with nogil:
- tbx_index_build(cfn, min_shift, &conf)
-
- return filename
-
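A usage sketch combining tabix_compress() and tabix_index() as documented above; example.bed is a hypothetical, coordinate-sorted input:

import pysam

# compresses to example.bed.gz (via tabix_compress), removes the
# original, writes example.bed.gz.tbi and returns the new filename
fn = pysam.tabix_index("example.bed", preset="bed", force=True)
assert fn == "example.bed.gz"
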
-# #########################################################
-# cdef class tabix_file_iterator_old:
-# '''iterate over ``infile``.
-
-# This iterator is not safe. If the :meth:`__next__()` method is called
-# after ``infile`` is closed, the result is undefined (see ``fclose()``).
-
-# The iterator might either raise a StopIteration or segfault.
-# '''
-
-
-# def __cinit__(self,
-# infile,
-# Parser parser,
-# int buffer_size = 65536 ):
-
-# cdef int fd = PyObject_AsFileDescriptor( infile )
-# if fd == -1: raise ValueError( "I/O operation on closed file." )
-# self.infile = fdopen( fd, 'r')
-
-# if self.infile == NULL: raise ValueError( "I/O operation on closed file." )
-
-# self.buffer = <char*>malloc( buffer_size )
-# self.size = buffer_size
-# self.parser = parser
-
-# def __iter__(self):
-# return self
-
-# cdef __cnext__(self):
-
-# cdef char * b
-# cdef size_t nbytes
-# b = self.buffer
-
-# while not feof( self.infile ):
-# nbytes = getline( &b, &self.size, self.infile)
-
-# # stop at first error or eof
-# if (nbytes == -1): break
-# # skip comments
-# if (b[0] == '#'): continue
-
-# # skip empty lines
-# if b[0] == '\0' or b[0] == '\n' or b[0] == '\r': continue
-
-# # make sure that entry is complete
-# if b[nbytes-1] != '\n' and b[nbytes-1] != '\r':
-# result = b
-# raise ValueError( "incomplete line at %s" % result )
-
-# # make sure that this goes fully through C
-# # otherwise buffer is copied to/from a
-# # Python object causing segfaults as
-# # the wrong memory is freed
-# return self.parser.parse( b, nbytes )
-
-# raise StopIteration
-
-# def __dealloc__(self):
-# free(self.buffer)
-
-# def __next__(self):
-# return self.__cnext__()
-
-#########################################################
-#########################################################
-#########################################################
-## Iterators for parsing through unindexed files.
-#########################################################
-# cdef buildGzipError(void *gzfp):
-# cdef int errnum = 0
-# cdef char *s = gzerror(gzfp, &errnum)
-# return "error (%d): %s (%d: %s)" % (errno, strerror(errno), errnum, s)
-
-
-cdef class tabix_file_iterator:
- '''iterate over a compressed or uncompressed ``infile``.
- '''
-
- def __cinit__(self,
- infile,
- Parser parser,
- int buffer_size=65536):
-
- if infile.closed:
- raise ValueError("I/O operation on closed file.")
-
- self.infile = infile
-
- cdef int fd = PyObject_AsFileDescriptor(infile)
- if fd == -1:
- raise ValueError("I/O operation on closed file.")
-
- self.duplicated_fd = dup(fd)
-
- # From the manual:
- # gzopen can be used to read a file which is not in gzip format;
- # in this case gzread will directly read from the file without decompression.
- # When reading, this will be detected automatically by looking
- # for the magic two-byte gzip header.
- self.fh = bgzf_dopen(self.duplicated_fd, 'r')
-
- if self.fh == NULL:
- raise IOError('%s' % strerror(errno))
-
- self.kstream = ks_init(self.fh)
-
- self.buffer.s = <char*>malloc(buffer_size)
- #if self.buffer == NULL:
- # raise MemoryError( "tabix_file_iterator: could not allocate %i bytes" % buffer_size)
- #self.size = buffer_size
- self.parser = parser
-
- def __iter__(self):
- return self
-
- cdef __cnext__(self):
-
- cdef char * b
- cdef int dret = 0
- cdef int retval = 0
- while 1:
- with nogil:
- retval = ks_getuntil(self.kstream, '\n', &self.buffer, &dret)
-
- if retval < 0:
- break
- #raise IOError('gzip error: %s' % buildGzipError( self.fh ))
-
- b = self.buffer.s
-
- # skip comments
- if (b[0] == '#'):
- continue
-
- # skip empty lines
- if b[0] == '\0' or b[0] == '\n' or b[0] == '\r':
- continue
-
- # gzgets terminates at \n, no need to test
-
- # parser creates a copy
- return self.parser.parse(b, self.buffer.l)
-
- raise StopIteration
-
- def __dealloc__(self):
- free(self.buffer.s)
- ks_destroy(self.kstream)
- bgzf_close(self.fh)
-
- def __next__(self):
- return self.__cnext__()
-
- def next(self):
- return self.__cnext__()
-
-
-class tabix_generic_iterator:
- '''iterate over ``infile``.
-
- Permits the use of file-like objects for example from the gzip module.
- '''
- def __init__(self, infile, parser):
-
- self.infile = infile
- if self.infile.closed:
- raise ValueError("I/O operation on closed file.")
- self.parser = parser
-
- def __iter__(self):
- return self
-
- # cython version - required for python 3
- def __next__(self):
-
- cdef char * b
- cdef char * cpy
- cdef size_t nbytes
-
- encoding = self.parser.get_encoding()
-
- # note that GzipFile.close() does not close the file
- # reading is still possible.
- if self.infile.closed:
- raise ValueError("I/O operation on closed file.")
-
- while 1:
-
- line = self.infile.readline()
- if not line:
- break
-
- s = force_bytes(line, encoding)
- b = s
- nbytes = len(line)
- assert b[nbytes] == '\0'
-
- # skip comments
- if b[0] == '#':
- continue
-
- # skip empty lines
- if b[0] == '\0' or b[0] == '\n' or b[0] == '\r':
- continue
-
- # make sure that entry is complete
- if b[nbytes-1] != '\n' and b[nbytes-1] != '\r':
- raise ValueError("incomplete line at %s" % line)
-
- bytes_cpy = <bytes> b
- cpy = <char *> bytes_cpy
-
- return self.parser(cpy, nbytes)
-
- raise StopIteration
-
- # python version - required for python 2.7
- def next(self):
- return self.__next__()
-
-def tabix_iterator(infile, parser):
- """return an iterator over all entries in a file.
-
- Results are returned parsed as specified by the *parser*. If
- *parser* is None, the results are returned as an unparsed string.
- Otherwise, *parser* is assumed to be a functor that will return
- parsed data (see for example :class:`~pysam.asTuple` and
- :class:`~pysam.asGTF`).
-
- """
- if PY_MAJOR_VERSION >= 3:
- return tabix_generic_iterator(infile, parser)
- else:
- return tabix_file_iterator(infile, parser)
-
- # file objects can use C stdio
- # used to be: isinstance( infile, file):
- # if PY_MAJOR_VERSION >= 3:
- # if isinstance( infile, io.IOBase ):
- # return tabix_copy_iterator( infile, parser )
- # else:
- # return tabix_generic_iterator( infile, parser )
- # else:
-# if isinstance( infile, file ):
-# return tabix_copy_iterator( infile, parser )
-# else:
-# return tabix_generic_iterator( infile, parser )
-
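A usage sketch for tabix_iterator(), which accepts file-like objects such as those from the gzip module; example.bed.gz is a hypothetical input:

import gzip
import pysam

with gzip.open("example.bed.gz", "rt") as infile:
    for row in pysam.tabix_iterator(infile, pysam.asTuple()):
        print(row[0], row[1], row[2])
        break
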
-cdef class Tabixfile(TabixFile):
- """Tabixfile is deprecated: use TabixFile instead"""
- pass
-
-
-__all__ = [
- "tabix_index",
- "tabix_compress",
- "TabixFile",
- "Tabixfile",
- "asTuple",
- "asGTF",
- "asVCF",
- "asBed",
- "GZIterator",
- "GZIteratorHead",
- "tabix_iterator",
- "tabix_generic_iterator",
- "tabix_file_iterator",
-]
+++ /dev/null
-#cdef extern from "Python.h":
-# ctypedef struct FILE
-
-from libc.stdint cimport uint8_t, int32_t, uint32_t, int64_t, uint64_t
-
-cdef class TupleProxy:
-
- cdef:
- char * data
- char ** fields
- int nfields
- int index
- int nbytes
- int offset
- bint is_modified
-
- cdef encoding
-
- cpdef int getMaxFields(self)
- cpdef int getMinFields(self)
-# cdef char * _getindex(self, int idx)
-
- cdef take(self, char * buffer, size_t nbytes)
- cdef present(self, char * buffer, size_t nbytes)
- cdef copy(self, char * buffer, size_t nbytes, bint reset=*)
- cdef update(self, char * buffer, size_t nbytes)
-
-cdef class GTFProxy(TupleProxy) :
-
- cdef:
- char * _attributes
- cdef bint hasOwnAttributes
-
- cpdef int getMaxFields(self)
- cpdef int getMinFields(self)
- cdef char * getAttributes(self)
-
-cdef class NamedTupleProxy(TupleProxy):
- pass
-
-cdef class BedProxy(NamedTupleProxy):
-
- cdef:
- char * contig
- uint32_t start
- uint32_t end
- int bedfields
-
- cpdef int getMaxFields(self)
- cpdef int getMinFields(self)
- cdef update(self, char * buffer, size_t nbytes)
-
-cdef class VCFProxy(NamedTupleProxy) :
-
- cdef:
- char * contig
- uint32_t pos
-
- cdef update(self, char * buffer, size_t nbytes)
+++ /dev/null
-from cpython cimport PyBytes_FromStringAndSize
-
-from libc.stdio cimport printf, feof, fgets
-from libc.string cimport strcpy, strlen, memcmp, memcpy, memchr, strstr, strchr
-from libc.stdlib cimport free, malloc, calloc, realloc
-from libc.stdlib cimport atoi, atol, atof
-
-from pysam.cutils cimport force_bytes, force_str, charptr_to_str
-from pysam.cutils cimport encode_filename, from_string_and_size
-
-import collections
-
-cdef char *StrOrEmpty(char * buffer):
- if buffer == NULL:
- return ""
- else: return buffer
-
-cdef int isNew(char * p, char * buffer, size_t nbytes):
- """return True if `p` is located within `buffer` of size
- `nbytes`
- """
- if p == NULL:
- return 0
- return not (buffer <= p < buffer + nbytes)
-
-
-cdef class TupleProxy:
- '''Proxy class for access to parsed row as a tuple.
-
- This class represents a table row for fast read-access.
-
- Access to individual fields is via the [] operator.
-
- Only read-only access is implemented.
-
- '''
-
- def __cinit__(self, encoding="ascii"):
- self.data = NULL
- self.fields = NULL
- self.index = 0
- self.nbytes = 0
- self.is_modified = 0
- self.nfields = 0
- # start counting at field offset
- self.offset = 0
- self.encoding = encoding
-
- def __dealloc__(self):
- cdef int x
- if self.is_modified:
- for x from 0 <= x < self.nfields:
- if isNew(self.fields[x], self.data, self.nbytes):
- free(self.fields[x])
- self.fields[x] = NULL
-
- if self.data != NULL:
- free(self.data)
- if self.fields != NULL:
- free(self.fields)
-
- def __copy__(self):
- if self.is_modified:
- raise NotImplementedError(
- "copying modified tuples is not implemented")
- cdef TupleProxy n = type(self)()
- n.copy(self.data, self.nbytes, reset=True)
- return n
-
- def compare(self, TupleProxy other):
- '''return -1,0,1, if contents in this are binary
- <,=,> to *other*
-
- '''
- if self.is_modified or other.is_modified:
- raise NotImplementedError(
- 'comparison of modified TupleProxies is not implemented')
- if self.data == other.data:
- return 0
-
- if self.nbytes < other.nbytes:
- return -1
- elif self.nbytes > other.nbytes:
- return 1
- return memcmp(self.data, other.data, self.nbytes)
-
- def __richcmp__(self, TupleProxy other, int op):
- if op == 2: # == operator
- return self.compare(other) == 0
- elif op == 3: # != operator
- return self.compare(other) != 0
- else:
- err_msg = "op {0} isn't implemented yet".format(op)
- raise NotImplementedError(err_msg)
-
- cdef take(self, char * buffer, size_t nbytes):
- '''start presenting buffer.
-
- Take ownership of the pointer.
- '''
- self.data = buffer
- self.nbytes = nbytes
- self.update(buffer, nbytes)
-
- cdef present(self, char * buffer, size_t nbytes):
- '''start presenting buffer.
-
- Do not take ownership of the pointer.
- '''
- self.update(buffer, nbytes)
-
- cdef copy(self, char * buffer, size_t nbytes, bint reset=False):
- '''start presenting buffer of size *nbytes*.
-
- Buffer is a '\0'-terminated string without the '\n'.
-
- Take a copy of buffer.
- '''
- # +1 for '\0'
- cdef int s = sizeof(char) * (nbytes + 1)
- self.data = <char*>malloc(s)
- if self.data == NULL:
- raise ValueError("out of memory in TupleProxy.copy()")
- memcpy(<char*>self.data, buffer, s)
-
- if reset:
- for x from 0 <= x < nbytes:
- if self.data[x] == '\0':
- self.data[x] = '\t'
-
- self.update(self.data, nbytes)
-
- cpdef int getMinFields(self):
- '''return minimum number of fields.'''
- # 1 is not a valid tabix entry, but TupleProxy
- # could be more generic.
- return 1
-
- cpdef int getMaxFields(self):
- '''return maximum number of fields. Return
- 0 for unknown length.'''
- return 0
-
- cdef update(self, char * buffer, size_t nbytes):
- '''update internal data.
-
- *buffer* is a \0 terminated string.
-
- *nbytes* is the number of bytes in buffer (excluding
- the \0)
-
- Update starts work in buffer, thus can be used
- to collect any number of fields until nbytes
- is exhausted.
-
- If max_fields is set, the number of fields is initialized to
- max_fields.
-
- '''
- cdef char * pos
- cdef char * old_pos
- cdef int field
- cdef int max_fields, min_fields, x
-
- assert strlen(buffer) == nbytes, \
- "length of buffer (%i) != number of bytes (%i)" % (
- strlen(buffer), nbytes)
-
- if buffer[nbytes] != 0:
- raise ValueError("incomplete line at %s" % buffer)
-
- #################################
- # remove line breaks and feeds and update number of bytes
- x = nbytes - 1
- while x > 0 and (buffer[x] == '\n' or buffer[x] == '\r'):
- buffer[x] = '\0'
- x -= 1
- self.nbytes = x + 1
-
- #################################
- # clear data
- if self.fields != NULL:
- free(self.fields)
-
- for field from 0 <= field < self.nfields:
- if isNew(self.fields[field], self.data, self.nbytes):
- free(self.fields[field])
-
- self.is_modified = self.nfields = 0
-
- #################################
- # allocate new
- max_fields = self.getMaxFields()
- # pre-count fields - better would be
- # to guess or dynamically grow
- if max_fields == 0:
- for x from 0 <= x < nbytes:
- if buffer[x] == '\t':
- max_fields += 1
- max_fields += 1
-
- self.fields = <char **>calloc(max_fields, sizeof(char *))
- if self.fields == NULL:
- raise ValueError("out of memory in TupleProxy.update()")
-
- #################################
- # start filling
- field = 0
- self.fields[field] = pos = buffer
- field += 1
- old_pos = pos
- while 1:
-
- pos = <char*>memchr(pos, '\t', nbytes)
- if pos == NULL:
- break
- if field >= max_fields:
- raise ValueError(
- "parsing error: more than %i fields in line: %s" %
- (max_fields, buffer))
-
- pos[0] = '\0'
- pos += 1
- self.fields[field] = pos
- field += 1
- nbytes -= pos - old_pos
- if nbytes < 0:
- break
- old_pos = pos
- self.nfields = field
- if self.nfields < self.getMinFields():
- raise ValueError(
-                "parsing error: fewer than %i fields in line: %s" %
- (self.getMinFields(), buffer))
-
- def _getindex(self, int index):
- '''return item at idx index'''
- cdef int i = index
- if i < 0:
- i += self.nfields
- if i < 0:
- raise IndexError("list index out of range")
- # apply offset - separating a fixed number
- # of fields from a variable number such as in VCF
- i += self.offset
- if i >= self.nfields:
- raise IndexError(
- "list index out of range %i >= %i" %
- (i, self.nfields))
- return force_str(self.fields[i], self.encoding)
-
- def __getitem__(self, key):
- if type(key) == int:
- return self._getindex(key)
- # slice object
- start, end, step = key.indices(self.nfields)
- result = []
- for index in range(start, end, step):
- result.append(self._getindex(index))
- return result
-
- def _setindex(self, index, value):
- '''set item at idx index.'''
- cdef int idx = index
- if idx < 0:
- raise IndexError("list index out of range")
- if idx >= self.nfields:
- raise IndexError("list index out of range")
-
- if isNew(self.fields[idx], self.data, self.nbytes):
- free(self.fields[idx] )
-
- self.is_modified = 1
-
- if value is None:
- self.fields[idx] = NULL
- return
-
- # conversion with error checking
- value = force_bytes(value)
- cdef char * tmp = <char*>value
- self.fields[idx] = <char*>malloc((strlen( tmp ) + 1) * sizeof(char))
- if self.fields[idx] == NULL:
- raise ValueError("out of memory" )
- strcpy(self.fields[idx], tmp)
-
- def __setitem__(self, index, value):
- '''set item at *index* to *value*'''
- cdef int i = index
- if i < 0:
- i += self.nfields
- i += self.offset
-
- self._setindex(i, value)
-
- def __len__(self):
- return self.nfields
-
- def __iter__(self):
- self.index = 0
- return self
-
- def __next__(self):
- """python version of next().
- """
- if self.index >= self.nfields:
- raise StopIteration
- cdef char * retval = self.fields[self.index]
- self.index += 1
- if retval == NULL:
- return None
- else:
- return force_str(retval, self.encoding)
-
- def __str__(self):
- '''return original data'''
- # copy and replace \0 bytes with \t characters
- cdef char * cpy
- if self.is_modified:
- # todo: treat NULL values
- result = []
- for x in xrange(0, self.nfields):
- result.append(StrOrEmpty(self.fields[x]).decode(self.encoding))
- return "\t".join(result)
- else:
- cpy = <char*>calloc(sizeof(char), self.nbytes+1)
- if cpy == NULL:
- raise ValueError("out of memory")
- memcpy(cpy, self.data, self.nbytes+1)
- for x from 0 <= x < self.nbytes:
- if cpy[x] == '\0':
- cpy[x] = '\t'
- result = cpy[:self.nbytes]
- free(cpy)
- r = result.decode(self.encoding)
- return r
-
-def toDot(v):
- '''convert value to '.' if None'''
- if v is None:
- return "."
- else:
- return str(v)
-
-def quote(v):
- '''return a quoted attribute.'''
- if isinstance(v, str):
- return '"%s"' % v
- else:
- return str(v)
-
-
-cdef class GTFProxy(TupleProxy):
- '''Proxy class for access to GTF fields.
-
- This class represents a GTF entry for fast read-access.
- Write-access has been added as well, though some care must
- be taken. If any of the string fields (contig, source, ...)
- are set, the new value is tied to the lifetime of the
- argument that was supplied.
-
- The only exception is the attributes field when set from
- a dictionary - this field will manage its own memory.
- '''
-
- def __cinit__(self):
- # automatically calls TupleProxy.__cinit__
- self.hasOwnAttributes = False
- self._attributes = NULL
-
- def __dealloc__(self):
- # automatically calls TupleProxy.__dealloc__
- if self.hasOwnAttributes:
- free(self._attributes)
-
- cpdef int getMinFields(self):
- '''return minimum number of fields.'''
- return 9
-
- cpdef int getMaxFields(self):
- '''return max number of fields.'''
- return 9
-
- property contig:
- '''contig of feature.'''
- def __get__(self):
- return self._getindex(0)
- def __set__(self, value):
- self._setindex(0, value)
-
- property source:
- '''feature source.'''
- def __get__(self):
- return self._getindex(1)
- def __set__(self, value):
- if value is None:
- value = "."
- self._setindex(1, value)
-
- property feature:
- '''feature name.'''
- def __get__(self):
- return self._getindex(2)
- def __set__(self, value):
- if value is None:
- value = "."
- self._setindex(2, value)
-
- property start:
-        '''feature start (0-based, half-open coordinates).'''
- def __get__(self ):
- return int( self._getindex(3)) - 1
- def __set__(self, value ):
- self._setindex(3, str(value+1))
-
- property end:
-        '''feature end (0-based, half-open coordinates).'''
- def __get__(self):
- return int(self._getindex(4))
- def __set__(self, value):
- self._setindex(4, str(value))
-
- property score:
- '''feature score.'''
- def __get__(self):
- v = self._getindex(5)
- if v == "" or v[0] == '.':
- return None
- else:
- return float(v)
-
- def __set__(self, value):
- if value is None:
- value = "."
- self._setindex(5, str(value))
-
- property strand:
- '''feature strand.'''
- def __get__(self):
- return self._getindex(6)
- def __set__(self, value ):
- if value is None:
- value = "."
- self._setindex(6, value)
-
- property frame:
- '''feature frame.'''
- def __get__(self):
- v = self._getindex(7)
- if v == "" or v[0] == '.':
- return v
- else:
- return int(v)
-
- def __set__(self, value):
- if value is None:
- value = "."
- self._setindex(7, str(value))
-
- property attributes:
- '''feature attributes (as a string).'''
- def __get__(self):
- if self.hasOwnAttributes:
- return force_str(self._attributes)
- else:
- return force_str(self._getindex(8))
- def __set__( self, value):
- if self.hasOwnAttributes:
- free(self._attributes)
- self._attributes = NULL
- self.hasOwnAttributes = False
- self._setindex(8, value)
-
- cdef char * getAttributes(self):
- '''return pointer to attributes.'''
- cdef char * attributes
- if self.hasOwnAttributes:
- attributes = self._attributes
- else:
- attributes = self.fields[8]
- if attributes == NULL:
-            raise KeyError("no attributes defined for this GTF entry")
- return attributes
-
- def asDict(self):
- """parse attributes - return as dict
- """
-
-        # fetch the attribute string
- attributes = self.attributes
-
- # separate into fields
- # Fields might contain a ";", for example in ENSEMBL GTF file
- # for mouse, v78:
- # ...; transcript_name "TXNRD2;-001"; ....
- # The current heuristic is to split on a semicolon followed by a
- # space, see also http://mblab.wustl.edu/GTF22.html
-
- # Remove white space to prevent a last empty field.
- fields = [x.strip() for x in attributes.strip().split("; ")]
-
- result = collections.OrderedDict()
-
- for f in fields:
-
- # strip semicolon (GTF files without a space after the last semicolon)
- if f.endswith(";"):
- f = f[:-1]
-
- # split at most once in order to avoid separating
- # multi-word values
- d = [x.strip() for x in f.split(" ", 1)]
-
-            # f.split(" ", 1) yields at most two elements
-            n, v = d[0], d[1]
-
- if v[0] == '"' and v[-1] == '"':
- v = v[1:-1]
-            else:
-                ## try to convert to a numeric value; try int first
-                ## so that floats such as "2.5" are not truncated
-                try:
-                    v = int(v)
-                except (ValueError, TypeError):
-                    try:
-                        v = float(v)
-                    except (ValueError, TypeError):
-                        pass
-
- result[n] = v
-
- return result
-
- def fromDict(self, d):
- '''set attributes from a dictionary.'''
- cdef char * p
- cdef int l
-
- # clean up if this field is set twice
- if self.hasOwnAttributes:
- free(self._attributes)
-
- aa = []
- for k,v in d.items():
- if isinstance(v, str):
- aa.append( '%s "%s"' % (k,v) )
- else:
- aa.append( '%s %s' % (k,str(v)) )
-
- a = force_bytes("; ".join(aa) + ";")
- p = a
- l = len(a)
- self._attributes = <char *>calloc(l + 1, sizeof(char))
- if self._attributes == NULL:
- raise ValueError("out of memory")
- memcpy(self._attributes, p, l)
-
- self.hasOwnAttributes = True
- self.is_modified = True
-
- def __str__(self):
- cdef char * cpy
- cdef int x
-
- if self.is_modified:
- return "\t".join(
- (self.contig,
- self.source,
- self.feature,
- str(self.start+1),
- str(self.end),
- toDot(self.score),
- toDot(self.strand),
- toDot(self.frame),
- self.attributes))
- else:
- return TupleProxy.__str__(self)
-
- def invert(self, int lcontig):
- '''invert coordinates to negative strand coordinates
-
- This method will only act if the feature is on the
- negative strand.'''
-
- if self.strand[0] == '-':
- start = min(self.start, self.end)
- end = max(self.start, self.end)
- self.start, self.end = lcontig - end, lcontig - start
-
- def keys(self):
- '''return a list of attributes defined in this entry.'''
- r = self.attributes
- return [x.strip().split(" ")[0]
- # separator is ';' followed by space
- for x in r.split("; ") if x.strip() != '']
-
- def __getitem__(self, key):
- return self.__getattr__(key)
-
- def __getattr__(self, item):
- """Generic lookup of attribute from GFF/GTF attributes
- Only called if there *isn't* an attribute with this name
- """
- cdef char * start
- cdef char * query
- cdef char * cpy
- cdef char * end
- cdef int l
-
- #
- # important to use the getAttributes function.
- # Using the self.attributes property to access
- # the attributes caused a hard-to-trace bug
- # in which fields in the attribute string were
- # set to 0.
-        # Running it through valgrind reported reads from memory
-        # that had already been released. It is not clear why this
-        # happened; it might be a cython bug (version 0.16). The
-        # valgrind warnings disappeared after accessing the C data
-        # structures directly, and so did the bug.
- cdef char * attributes = self.getAttributes()
- if attributes == NULL:
- raise KeyError("key %s not found, no attributes" % item)
-
- # add space in order to make sure
- # to not pick up a field that is a prefix of another field
- r = force_bytes(item + " ")
- query = r
- start = strstr(attributes, query)
-
- if start == NULL:
- raise AttributeError("'GTFProxy' has no attribute '%s'" % item)
-
- start += strlen(query)
- # skip gaps before
- while start[0] == ' ':
- start += 1
-
- if start[0] == '"':
- start += 1
- end = start
- while end[0] != '\0' and end[0] != '"':
- end += 1
- l = end - start
- result = force_str(PyBytes_FromStringAndSize(start, l),
- self.encoding)
- return result
- else:
- return force_str(start, self.encoding)
-
- def setAttribute(self, name, value):
- '''convenience method to set an attribute.'''
- r = self.asDict()
- r[name] = value
- self.fromDict(r)
-
- def __cmp__(self, other):
- return (self.contig, self.strand, self.start) < \
- (other.contig, other.strand, other.start)
-
- # python 3 compatibility
- def __richcmp__(GTFProxy self, GTFProxy other, int op):
- if op == 0:
- return (self.contig, self.strand, self.start) < \
- (other.contig, other.strand, other.start)
- elif op == 1:
- return (self.contig, self.strand, self.start) <= \
- (other.contig, other.strand, other.start)
- elif op == 2:
- return self.compare(other) == 0
- elif op == 3:
- return self.compare(other) != 0
- else:
- err_msg = "op {0} isn't implemented yet".format(op)
- raise NotImplementedError(err_msg)
-
-
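A minimal usage sketch for the proxy above. The file name is hypothetical
(any bgzip-compressed, tabix-indexed GTF works), and the gene_id lookup
assumes the entries actually carry that attribute:

import pysam

tbx = pysam.TabixFile("annotation.gtf.gz")
for gtf in tbx.fetch("chr1", 10000, 20000, parser=pysam.asGTF()):
    # read access: named columns, plus attribute lookup via __getattr__
    print(gtf.contig, gtf.feature, gtf.start, gtf.end, gtf.gene_id)
    # write access: setAttribute round-trips through asDict/fromDict, so
    # the attribute string manages its own memory afterwards
    gtf.setAttribute("my_tag", "my_value")
    print(str(gtf))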
-cdef class NamedTupleProxy(TupleProxy):
-
- map_key2field = {}
-
- def __setattr__(self, key, value):
- '''set attribute.'''
- cdef int idx
- idx, f = self.map_key2field[key]
-        if idx >= self.nfields:
- raise KeyError("field %s not set" % key)
- TupleProxy.__setitem__(self, idx, str(value))
-
- def __getattr__(self, key):
- cdef int idx
- idx, f = self.map_key2field[key]
-        if idx >= self.nfields:
- raise KeyError("field %s not set" % key)
- if f == str:
- return force_str(self.fields[idx],
- self.encoding)
- return f(self.fields[idx])
-
-
-cdef class BedProxy(NamedTupleProxy):
- '''Proxy class for access to Bed fields.
-
- This class represents a BED entry for fast read-access.
- '''
- map_key2field = {
- 'contig' : (0, str),
- 'start' : (1, int),
- 'end' : (2, int),
- 'name' : (3, str),
- 'score' : (4, float),
- 'strand' : (5, str),
- 'thickStart' : (6, int),
- 'thickEnd' : (7, int),
- 'itemRGB' : (8, str),
- 'blockCount': (9, int),
- 'blockSizes': (10, str),
- 'blockStarts': (11, str), }
-
- cpdef int getMinFields(self):
- '''return minimum number of fields.'''
- return 3
-
- cpdef int getMaxFields(self):
- '''return max number of fields.'''
- return 12
-
- cdef update(self, char * buffer, size_t nbytes):
- '''update internal data.
-
- nbytes does not include the terminal '\0'.
- '''
- TupleProxy.update(self, buffer, nbytes)
-
- if self.nfields < 3:
- raise ValueError(
- "bed format requires at least three columns")
-
- # determines bed format
- self.bedfields = self.nfields
-
- # do automatic conversion
- self.contig = self.fields[0]
- self.start = atoi(self.fields[1])
- self.end = atoi(self.fields[2])
-
- # __setattr__ in base class seems to take precedence
- # hence implement setters in __setattr__
- #property start:
- # def __get__( self ): return self.start
- #property end:
- # def __get__( self ): return self.end
-
- def __str__(self):
-
- cdef int save_fields = self.nfields
- # ensure fields to use correct format
- self.nfields = self.bedfields
- retval = TupleProxy.__str__(self)
- self.nfields = save_fields
- return retval
-
- def __setattr__(self, key, value ):
- '''set attribute.'''
- if key == "start":
- self.start = value
- elif key == "end":
- self.end = value
-
- cdef int idx
- idx, f = self.map_key2field[key]
- TupleProxy._setindex(self, idx, str(value) )
-
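The same fetch pattern works for BED data (a sketch; "regions.bed.gz" is a
hypothetical tabix-indexed file, and the name column assumes records with
at least four fields):

import pysam

tbx = pysam.TabixFile("regions.bed.gz")
for bed in tbx.fetch("chr1", parser=pysam.asBed()):
    # attribute access is resolved through map_key2field, with the
    # declared type conversion applied to each field
    print(bed.contig, bed.start, bed.end, bed.name)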
-cdef class VCFProxy(NamedTupleProxy):
- '''Proxy class for access to VCF fields.
-
- The genotypes are accessed via a numeric index.
- Sample headers are not available.
- '''
- map_key2field = {
- 'contig' : (0, str),
- 'pos' : (1, int),
- 'id' : (2, str),
- 'ref' : (3, str),
- 'alt' : (4, str),
- 'qual' : (5, str),
- 'filter' : (6, str),
- 'info' : (7, str),
- 'format' : (8, str) }
-
- def __cinit__(self):
- # automatically calls TupleProxy.__cinit__
- # start indexed access at genotypes
- self.offset = 9
-
- cdef update(self, char * buffer, size_t nbytes):
- '''update internal data.
-
- nbytes does not include the terminal '\0'.
- '''
- TupleProxy.update(self, buffer, nbytes)
-
- self.contig = self.fields[0]
- # vcf counts from 1 - correct here
- self.pos = atoi(self.fields[1]) - 1
-
- def __len__(self):
- '''return number of genotype fields.'''
- return max(0, self.nfields - 9)
-
- property pos:
-        '''position of the variant (converted to 0-based coordinates).'''
- def __get__(self):
- return self.pos
-
- def __setattr__(self, key, value):
- '''set attribute.'''
-        if key == "pos":
-            self.pos = value
-            # the C attribute holds the 0-based position; the text field
-            # written below keeps the 1-based VCF convention
-            value += 1
-
- cdef int idx
- idx, f = self.map_key2field[key]
- TupleProxy._setindex(self, idx, str(value))
-
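And likewise for VCF rows (a sketch; "calls.vcf.gz" is a hypothetical
bgzip-compressed, tabix-indexed file):

import pysam

tbx = pysam.TabixFile("calls.vcf.gz")
for rec in tbx.fetch("chr1", parser=pysam.asVCF()):
    # named columns; pos has already been converted to 0-based in update()
    print(rec.contig, rec.pos, rec.ref, rec.alt)
    # genotype columns are addressed by sample index (offset == 9)
    for i in range(len(rec)):
        print(rec[i])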
+++ /dev/null
-#########################################################################
-# Utility functions used across pysam
-#########################################################################
-cimport cython
-from cpython cimport array as c_array
-
-cpdef parse_region(reference=*, start=*, end=*, region=*)
-
-#########################################################################
-# Utility functions for quality string conversions
-
-cpdef c_array.array qualitystring_to_array(input_str, int offset=*)
-cpdef array_to_qualitystring(c_array.array arr, int offset=*)
-cpdef qualities_to_qualitystring(qualities, int offset=*)
-
-########################################################################
-########################################################################
-########################################################################
-## Python 3 compatibility functions
-########################################################################
-cdef charptr_to_str(const char *s, encoding=*)
-cdef bytes charptr_to_bytes(const char *s, encoding=*)
-cdef charptr_to_str_w_len(const char* s, size_t n, encoding=*)
-cdef force_str(object s, encoding=*)
-cdef bytes force_bytes(object s, encoding=*)
-cdef bytes encode_filename(object filename)
-cdef from_string_and_size(const char *s, size_t length)
-
-cdef extern from "pysam_util.h":
-
- int samtools_main(int argc, char *argv[])
- int bcftools_main(int argc, char *argv[])
- void pysam_set_stderr(int fd)
- void pysam_unset_stderr()
- void pysam_set_stdout(int fd)
- void pysam_set_stdout_fn(const char *)
- void pysam_unset_stdout()
- void set_optind(int)
+++ /dev/null
-import types
-import sys
-import string
-import re
-import tempfile
-import os
-import io
-from contextlib import contextmanager
-
-from cpython.version cimport PY_MAJOR_VERSION
-from cpython cimport PyBytes_Check, PyUnicode_Check
-from cpython cimport array as c_array
-from libc.stdlib cimport calloc, free
-from libc.string cimport strncpy
-from libc.stdio cimport fprintf, stderr, fflush
-from libc.stdio cimport stdout as c_stdout
-from posix.fcntl cimport open as c_open, O_WRONLY
-
-#####################################################################
-# hard-coded constants
-cdef int MAX_POS = 2 << 29
-
-#################################################################
-# Utility functions for quality string conversions
-cpdef c_array.array qualitystring_to_array(input_str, int offset=33):
- """convert a qualitystring to an array of quality values."""
- if input_str is None:
- return None
- qs = force_bytes(input_str)
- cdef char i
- return c_array.array('B', [i - offset for i in qs])
-
-
-cpdef array_to_qualitystring(c_array.array qualities, int offset=33):
- """convert an array of quality values to a string."""
- if qualities is None:
- return None
- cdef int x
-
- cdef c_array.array result
- result = c_array.clone(qualities, len(qualities), zero=False)
-
- for x from 0 <= x < len(qualities):
- result[x] = qualities[x] + offset
- return force_str(result.tostring())
-
-
-cpdef qualities_to_qualitystring(qualities, int offset=33):
- """convert a list or array of quality scores to the string
- representation used in the SAM format.
-
-    Parameters
-    ----------
-    qualities : list, tuple or array
-        quality scores to convert
-    offset : int
-        offset to be added to the quality scores to arrive at
-        the characters of the quality string (default=33).
-
- Returns
- -------
- string
- a quality string
-
- """
- cdef char x
- if qualities is None:
- return None
- elif isinstance(qualities, c_array.array):
- return array_to_qualitystring(qualities, offset=offset)
- else:
- # tuples and lists
- return force_str("".join([chr(x + offset) for x in qualities]))
-
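A round-trip sketch of the three helpers above, assuming they are re-exported
at the package level as the __all__ list at the end of this module suggests
('I' is ASCII 73, i.e. Phred quality 40 at the default offset of 33):

import pysam

arr = pysam.qualitystring_to_array("III")   # array('B', [40, 40, 40])
qstr = pysam.array_to_qualitystring(arr)    # "III"
assert pysam.qualities_to_qualitystring([40, 40, 40]) == qstr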
-
-########################################################################
-########################################################################
-########################################################################
-## Python 3 compatibility functions
-########################################################################
-cdef bint IS_PYTHON3 = PY_MAJOR_VERSION >= 3
-
-cdef from_string_and_size(const char* s, size_t length):
- if IS_PYTHON3:
- return s[:length].decode("ascii")
- else:
- return s[:length]
-
-# filename encoding (copied from lxml.etree.pyx)
-cdef str _FILENAME_ENCODING
-_FILENAME_ENCODING = sys.getfilesystemencoding()
-if _FILENAME_ENCODING is None:
- _FILENAME_ENCODING = sys.getdefaultencoding()
-if _FILENAME_ENCODING is None:
- _FILENAME_ENCODING = 'ascii'
-
-#cdef char* _C_FILENAME_ENCODING
-#_C_FILENAME_ENCODING = <char*>_FILENAME_ENCODING
-
-cdef bytes encode_filename(object filename):
- """Make sure a filename is 8-bit encoded (or None)."""
- if filename is None:
- return None
- elif PyBytes_Check(filename):
- return filename
- elif PyUnicode_Check(filename):
- return filename.encode(_FILENAME_ENCODING)
- else:
- raise TypeError(u"Argument must be string or unicode.")
-
-cdef bytes force_bytes(object s, encoding="ascii"):
-    u"""convert string or unicode object to bytes, using the given
-    encoding (default: ascii).
-    """
- if not IS_PYTHON3:
- return s
- elif s is None:
- return None
- elif PyBytes_Check(s):
- return s
- elif PyUnicode_Check(s):
- return s.encode(encoding)
- else:
- raise TypeError(u"Argument must be string, bytes or unicode.")
-
-cdef charptr_to_str(const char* s, encoding="ascii"):
- if s == NULL:
- return None
- if PY_MAJOR_VERSION < 3:
- return s
- else:
- return s.decode(encoding)
-
-cdef charptr_to_str_w_len(const char* s, size_t n, encoding="ascii"):
- if s == NULL:
- return None
- if PY_MAJOR_VERSION < 3:
- return s[:n]
- else:
- return s[:n].decode(encoding)
-
-cdef bytes charptr_to_bytes(const char* s, encoding="ascii"):
- if s == NULL:
- return None
- else:
- return s
-
-cdef force_str(object s, encoding="ascii"):
- """Return s converted to str type of current Python
- (bytes in Py2, unicode in Py3)"""
- if s is None:
- return None
- if PY_MAJOR_VERSION < 3:
- return s
- elif PyBytes_Check(s):
- return s.decode(encoding)
- else:
- # assume unicode
- return s
-
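A compact summary of the conversion behaviour on Python 3 (these are cdef
helpers internal to pysam and not importable from Python code, so the
sketch is comments only):

# force_bytes("chr1")      -> b"chr1"   encode with the given encoding
# force_str(b"chr1")       -> "chr1"    decode; str passes through unchanged
# charptr_to_str(NULL)     -> None      NULL pointers map to None
# encode_filename("a.bam") -> b"a.bam"  uses the filesystem encoding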
-cpdef parse_region(reference=None,
- start=None,
- end=None,
- region=None):
- """parse alternative ways to specify a genomic region. A region can
- either be specified by :term:`reference`, `start` and
- `end`. `start` and `end` denote 0-based, half-open
- intervals.
-
- Alternatively, a samtools :term:`region` string can be
- supplied.
-
- If any of the coordinates are missing they will be replaced by the
- minimum (`start`) or maximum (`end`) coordinate.
-
- Note that region strings are 1-based, while `start` and `end` denote
- an interval in python coordinates.
-
- Returns
- -------
-
- tuple : a tuple of `reference`, `start` and `end`.
-
- Raises
- ------
-
- ValueError
- for invalid or out of bounds regions.
-
- """
- cdef int rtid
- cdef long long rstart
- cdef long long rend
-
- rtid = -1
- rstart = 0
- rend = MAX_POS
-    if start is not None:
- try:
- rstart = start
- except OverflowError:
- raise ValueError('start out of range (%i)' % start)
-
-    if end is not None:
- try:
- rend = end
- except OverflowError:
- raise ValueError('end out of range (%i)' % end)
-
- if region:
- region = force_str(region)
- parts = re.split("[:-]", region)
- reference = parts[0]
- if len(parts) >= 2:
- rstart = int(parts[1]) - 1
- if len(parts) >= 3:
- rend = int(parts[2])
-
- if not reference:
- return None, 0, 0
-
- if not 0 <= rstart < MAX_POS:
- raise ValueError('start out of range (%i)' % rstart)
- if not 0 <= rend <= MAX_POS:
- raise ValueError('end out of range (%i)' % rend)
- if rstart > rend:
- raise ValueError(
- 'invalid region: start (%i) > end (%i)' % (rstart, rend))
-
- return force_bytes(reference), rstart, rend
-
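A worked example of the coordinate conventions (a sketch; parse_region is
part of pysam's internal cutils module, not the public API):

# internal module path at the time of this code
from pysam.cutils import parse_region

# a 1-based region string and explicit 0-based, half-open coordinates
# describe the same interval:
parse_region(region="chr1:100-200")                # -> (b"chr1", 99, 200)
parse_region(reference="chr1", start=99, end=200)  # -> (b"chr1", 99, 200)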
-
-def _pysam_dispatch(collection,
- method,
- args=None,
- catch_stdout=True,
- save_stdout=None):
- '''call ``method`` in samtools/bcftools providing arguments in args.
-
- Catching of stdout can be turned off by setting *catch_stdout* to
- False.
-
- '''
-
-    if args is None:
-        args = []
-    else:
-        args = list(args)
-
-    # fail early if the file to be indexed does not exist
-    if method == "index" and args and not os.path.exists(args[0]):
-        raise IOError("No such file or directory: '%s'" % args[0])
-
- # redirect stderr to file
- stderr_h, stderr_f = tempfile.mkstemp()
- pysam_set_stderr(stderr_h)
-
- # redirect stdout to file
- if save_stdout:
- stdout_f = save_stdout
- stdout_h = c_open(force_bytes(stdout_f),
- O_WRONLY)
- if stdout_h == -1:
- raise OSError("error while opening {} for writing".format(stdout_f))
-
- pysam_set_stdout_fn(force_bytes(stdout_f))
- pysam_set_stdout(stdout_h)
- elif catch_stdout:
- stdout_h, stdout_f = tempfile.mkstemp()
-
- MAP_STDOUT_OPTIONS = {
- "samtools": {
- "view": "-o {}",
- "mpileup": "-o {}",
- "depad": "-o {}",
- "calmd": "", # uses pysam_stdout_fn
- },
- "bcftools": {}
- }
-
- stdout_option = None
- if collection == "bcftools":
- # in bcftools, most methods accept -o, the exceptions
- # are below:
- if method not in ("index", "roh", "stats"):
- stdout_option = "-o {}"
- elif method in MAP_STDOUT_OPTIONS[collection]:
- stdout_option = MAP_STDOUT_OPTIONS[collection][method]
-
- if stdout_option is not None:
- os.close(stdout_h)
- pysam_set_stdout_fn(force_bytes(stdout_f))
- args.extend(stdout_option.format(stdout_f).split(" "))
- else:
- pysam_set_stdout(stdout_h)
- else:
- pysam_set_stdout_fn("-")
-
- # setup the function call to samtools/bcftools main
- cdef char ** cargs
- cdef int i, n, retval, l
- n = len(args)
- method = force_bytes(method)
- collection = force_bytes(collection)
- args = [force_bytes(a) for a in args]
-
-    # allocate two extra slots for the collection and method names
- cdef int extra_args = 0
- if method == b"index":
- extra_args = 1
- # add extra arguments for commands accepting optional arguments
- # such as 'samtools index x.bam [out.index]'
- cargs = <char**>calloc(n + 2 + extra_args, sizeof(char *))
- cargs[0] = collection
- cargs[1] = method
-
- # create copies of strings - getopt for long options permutes
- # arguments
- for i from 0 <= i < n:
- l = len(args[i])
- cargs[i + 2] = <char *>calloc(l + 1, sizeof(char))
- strncpy(cargs[i + 2], args[i], l)
-
-    # reset getopt. On OS X, resetting getopt state works differently
-    # for getopt and getopt_long
- if method in [b'index', b'cat', b'quickcheck',
- b'faidx', b'kprobaln']:
- set_optind(1)
- else:
- set_optind(0)
-
- # call samtools/bcftools
- if collection == b"samtools":
- retval = samtools_main(n + 2, cargs)
- elif collection == b"bcftools":
-        retval = bcftools_main(n + 2, cargs)
-    else:
-        raise ValueError("unknown collection '%s'" % collection)
-
- for i from 0 <= i < n:
- free(cargs[i + 2])
- free(cargs)
-
- # get error messages
- def _collect(fn):
-        out = ""
- try:
- with open(fn, "r") as inf:
- out = inf.read()
- except UnicodeDecodeError:
- with open(fn, "rb") as inf:
- # read binary output
- out = inf.read()
- finally:
- os.remove(fn)
- return out
-
- pysam_unset_stderr()
- out_stderr = _collect(stderr_f)
-
- if save_stdout:
- pysam_unset_stdout()
- out_stdout = None
- elif catch_stdout:
- pysam_unset_stdout()
- out_stdout = _collect(stdout_f)
- else:
- out_stdout = None
-
- return retval, out_stderr, out_stdout
-
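In the public API this dispatcher backs the generated command wrappers; a
sketch of the correspondence (the wrapper call is public pysam API, the
direct _pysam_dispatch call is internal, and the file names are hypothetical):

import pysam

# roughly equivalent to
# _pysam_dispatch("samtools", "sort", ["-o", "sorted.bam", "input.bam"])
pysam.samtools.sort("-o", "sorted.bam", "input.bam")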
-
-__all__ = ["qualitystring_to_array",
- "array_to_qualitystring",
- "qualities_to_qualitystring"]
+++ /dev/null
-# cython: embedsignature=True
-#
-# Code to read, write and edit VCF files
-#
-# VCF lines are encoded as a dictionary with these keys (note: all lowercase):
-# 'chrom': string
-# 'pos': integer
-# 'id': string
-# 'ref': string
-# 'alt': list of strings
-# 'qual': integer
-# 'filter': None (missing value), or list of keys (strings); empty list parsed as ["PASS"]
-# 'info': dictionary of values (see below)
-# 'format': list of keys (strings)
-# sample keys: dictionary of values (see below)
-#
-# The sample keys are accessible through vcf.getsamples()
-#
-# A dictionary of values contains value keys (defined in ##INFO or
-# ##FORMAT lines) which map to a list, containing integers, floats,
-# strings, or characters. Missing values are replaced by a particular
-# value, often -1 or .
-#
-# Genotypes are not stored as a string, but as a list of 1 or 3
-# elements (for haploid and diploid samples), the first (and last) the
-# integer representing an allele, and the second the separation
-# character. Note that there is just one genotype per sample, but for
-# consistency the single element is stored in a list.
-#
-# Header lines other than ##INFO, ##FORMAT and ##FILTER are stored as
-# (key, value) pairs and are accessible through getheader()
-#
-# The VCF class can be instantiated with a 'regions' variable
-# consisting of tuples (chrom,start,end) encoding 0-based half-open
-# segments. Only variants with a position inside the segment will be
-# parsed. A regions parser is available under parse_regions.
-#
-# When instantiated, a reference can be passed to the VCF class. This
-# may be any class that supports a fetch(chrom, start, end) method.
-#
-# NOTE: the position that is returned to Python is 0-based, NOT
-# 1-based as in the VCF file.
-# NOTE: There is also preliminary VCF functionality in the VariantFile class.
-#
-# TODO:
-# only v4.0 writing is complete; alleles are not converted to v3.3 format
-#
-
-from collections import namedtuple, defaultdict
-from operator import itemgetter
-import sys, re, copy, bisect
-
-from libc.stdlib cimport atoi
-from libc.stdint cimport int8_t, int16_t, int32_t, int64_t
-from libc.stdint cimport uint8_t, uint16_t, uint32_t, uint64_t
-
-cimport pysam.ctabix as ctabix
-cimport pysam.ctabixproxies as ctabixproxies
-
-from pysam.cutils cimport force_str
-
-import pysam
-
-gtsRegEx = re.compile("[|/\\\\]")
-alleleRegEx = re.compile('^[ACGTN]+$')
-
-# Utility function. Uses 0-based coordinates
-def get_sequence(chrom, start, end, fa):
- # obtain sequence from .fa file, without truncation
- if end<=start: return ""
- if not fa: return "N"*(end-start)
- if start<0: return "N"*(-start) + get_sequence(chrom, 0, end, fa).upper()
- sequence = fa.fetch(chrom, start, end).upper()
- if len(sequence) < end-start: sequence += "N"*(end-start-len(sequence))
- return sequence
-
-# Utility function. Parses a region string
-def parse_regions( string ):
- result = []
- for r in string.split(','):
- elts = r.split(':')
- chrom, start, end = elts[0], 0, 3000000000
- if len(elts)==1: pass
- elif len(elts)==2:
- if len(elts[1])>0:
- ielts = elts[1].split('-')
-                if len(ielts) != 2:
-                    raise ValueError("Don't understand region string '%s'" % r)
-                try:
-                    start, end = int(ielts[0])-1, int(ielts[1])
-                except ValueError:
-                    raise ValueError("Don't understand region string '%s'" % r)
- else:
- raise ValueError("Don't understand region string '%s'" % r)
- result.append( (chrom,start,end) )
- return result
-
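A worked example of the syntax accepted above (comma-separated regions with
1-based inclusive ranges, converted to 0-based half-open tuples):

# parse_regions("chr1:1001-2000,chr2")
#   -> [("chr1", 1000, 2000), ("chr2", 0, 3000000000)]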
-
-FORMAT = namedtuple('FORMAT','id numbertype number type description missingvalue')
-
-###########################################################################################################
-#
-# New class
-#
-###########################################################################################################
-
-cdef class VCFRecord( ctabixproxies.TupleProxy):
- '''vcf record.
-
-    initialized from parsed data and VCF metadata.
- '''
-
- cdef vcf
- cdef char * contig
- cdef uint32_t pos
-
- def __init__(self, vcf):
- self.vcf = vcf
- self.encoding = vcf.encoding
-
- # if len(data) != len(self.vcf._samples):
- # self.vcf.error(str(data),
- # self.BAD_NUMBER_OF_COLUMNS,
- # "expected %s for %s samples (%s), got %s" % \
- # (len(self.vcf._samples),
- # len(self.vcf._samples),
- # self.vcf._samples,
- # len(data)))
-
- def __cinit__(self, vcf):
- # start indexed access at genotypes
- self.offset = 9
-
- self.vcf = vcf
- self.encoding = vcf.encoding
-
- def error(self, line, error, opt=None):
- '''raise error.'''
- # pass to vcf file for error handling
- return self.vcf.error(line, error, opt)
-
- cdef update(self, char * buffer, size_t nbytes):
- '''update internal data.
-
- nbytes does not include the terminal '\0'.
- '''
- ctabixproxies.TupleProxy.update(self, buffer, nbytes)
-
- self.contig = self.fields[0]
- # vcf counts from 1 - correct here
- self.pos = atoi(self.fields[1]) - 1
-
- def __len__(self):
- return max(0, self.nfields - 9)
-
- property contig:
- def __get__(self): return self.contig
-
- property pos:
- def __get__(self): return self.pos
-
-    property id:
-        def __get__(self):
-            return force_str(self.fields[2], self.encoding)
-
-    property ref:
-        def __get__(self):
-            return force_str(self.fields[3], self.encoding)
-
- property alt:
- def __get__(self):
- # convert v3.3 to v4.0 alleles below
-            # fields are C strings; decode so that comparison and
-            # split work the same on Python 2 and 3
-            alt = force_str(self.fields[4], self.encoding)
-            if alt == ".": alt = []
-            else: alt = alt.upper().split(',')
- return alt
-
- property qual:
- def __get__(self):
-            qual = force_str(self.fields[5], self.encoding)
-            if qual == ".": qual = -1
-            else:
-                try: qual = float(qual)
-                except ValueError: self.vcf.error(str(self),self.QUAL_NOT_NUMERICAL)
- return qual
-
- property filter:
- def __get__(self):
-            f = force_str(self.fields[6], self.encoding)
-            # postpone checking that filters exist. Encode missing filter or no filtering as empty list
-            if f == "." or f == "PASS" or f == "0": return []
-            else: return f.split(';')
-
- property info:
- def __get__(self):
-            col = force_str(self.fields[7], self.encoding)
-            # dictionary of keys, and list of values
-            info = {}
-            if col != ".":
-                for blurp in col.split(';'):
-                    elts = blurp.split('=')
-                    if len(elts) == 1: v = None
-                    elif len(elts) == 2: v = elts[1]
-                    else: self.vcf.error(str(self),self.ERROR_INFO_STRING)
-                    info[elts[0]] = self.vcf.parse_formatdata(elts[0], v, self.vcf._info, str(self))
- return info
-
- property format:
- def __get__(self):
-            return force_str(self.fields[8], self.encoding).split(':')
-
- property samples:
- def __get__(self):
- return self.vcf._samples
-
- def __getitem__(self, key):
-
- # parse sample columns
-        values = force_str(self.fields[self.vcf._sample2column[key]], self.encoding).split(':')
- alt = self.alt
- format = self.format
-
- if len(values) > len(format):
-            self.vcf.error(str(self),self.BAD_NUMBER_OF_VALUES,"(found %s values in element %s; expected %s)" %\
-                           (len(values),key,len(format)))
-
- result = {}
- for idx in range(len(format)):
- expected = self.vcf.get_expected(format[idx], self.vcf._format, alt)
- if idx < len(values): value = values[idx]
- else:
- if expected == -1: value = "."
- else: value = ",".join(["."]*expected)
-
- result[format[idx]] = self.vcf.parse_formatdata(format[idx], value, self.vcf._format, str(self.data))
- if expected != -1 and len(result[format[idx]]) != expected:
- self.vcf.error(str(self.data),self.BAD_NUMBER_OF_PARAMETERS,
- "id=%s, expected %s parameters, got %s" % (format[idx],expected,result[format[idx]]))
- if len(result[format[idx]] ) < expected: result[format[idx]] += [result[format[idx]][-1]]*(expected-len(result[format[idx]]))
- result[format[idx]] = result[format[idx]][:expected]
-
- return result
-
-
-cdef class asVCFRecord(ctabix.Parser):
- '''converts a :term:`tabix row` into a VCF record.'''
- cdef vcffile
- def __init__(self, vcffile):
- self.vcffile = vcffile
-
- cdef parse(self, char * buffer, int len):
- cdef VCFRecord r
- r = VCFRecord(self.vcffile)
- r.copy(buffer, len)
- return r
-
-class VCF(object):
-
- # types
- NT_UNKNOWN = 0
- NT_NUMBER = 1
- NT_ALLELES = 2
- NT_NR_ALLELES = 3
- NT_GENOTYPES = 4
- NT_PHASED_GENOTYPES = 5
-
- _errors = { 0:"UNKNOWN_FORMAT_STRING:Unknown file format identifier",
- 1:"BADLY_FORMATTED_FORMAT_STRING:Formatting error in the format string",
- 2:"BADLY_FORMATTED_HEADING:Did not find 9 required headings (CHROM, POS, ..., FORMAT) %s",
- 3:"BAD_NUMBER_OF_COLUMNS:Wrong number of columns found (%s)",
- 4:"POS_NOT_NUMERICAL:Position column is not numerical",
- 5:"UNKNOWN_CHAR_IN_REF:Unknown character in reference field",
- 6:"V33_BAD_REF:Reference should be single-character in v3.3 VCF",
- 7:"V33_BAD_ALLELE:Cannot interpret allele for v3.3 VCF",
- 8:"POS_NOT_POSITIVE:Position field must be >0",
- 9:"QUAL_NOT_NUMERICAL:Quality field must be numerical, or '.'",
- 10:"ERROR_INFO_STRING:Error while parsing info field",
- 11:"ERROR_UNKNOWN_KEY:Unknown key (%s) found in formatted field (info; format; or filter)",
- 12:"ERROR_FORMAT_NOT_NUMERICAL:Expected integer or float in formatted field; got %s",
-                13:"ERROR_FORMAT_NOT_CHAR:Expected character in formatted field; got string",
- 14:"FILTER_NOT_DEFINED:Identifier (%s) in filter found which was not defined in header",
- 15:"FORMAT_NOT_DEFINED:Identifier (%s) in format found which was not defined in header",
-                16:"BAD_NUMBER_OF_VALUES:Found too many values in sample column (%s)",
- 17:"BAD_NUMBER_OF_PARAMETERS:Found unexpected number of parameters (%s)",
- 18:"BAD_GENOTYPE:Cannot parse genotype (%s)",
- 19:"V40_BAD_ALLELE:Bad allele found for v4.0 VCF (%s)",
- 20:"MISSING_REF:Reference allele missing",
- 21:"V33_UNMATCHED_DELETION:Deleted sequence does not match reference (%s)",
-                22:"V40_MISSING_ANGLE_BRACKETS:Format definition is not delimited by angle brackets",
- 23:"FORMAT_MISSING_QUOTES:Description field in format definition is not surrounded by quotes",
- 24:"V40_FORMAT_MUST_HAVE_NAMED_FIELDS:Fields in v4.0 VCF format definition must have named fields",
- 25:"HEADING_NOT_SEPARATED_BY_TABS:Heading line appears separated by spaces, not tabs",
- 26:"WRONG_REF:Wrong reference %s",
- 27:"ERROR_TRAILING_DATA:Numerical field ('%s') has semicolon-separated trailing data",
- 28:"BAD_CHR_TAG:Error calculating chr tag for %s",
- 29:"ZERO_LENGTH_ALLELE:Found zero-length allele",
- 30:"MISSING_INDEL_ALLELE_REF_BASE:Indel alleles must begin with single reference base",
-                31:"ZERO_FOR_NON_FLAG_FIELD:Number set to 0, but type is not 'Flag'",
- 32:"ERROR_FORMAT_NOT_INTEGER:Expected integer in formatted field; got %s",
- 33:"ERROR_FLAG_HAS_VALUE:Flag fields should not have a value",
- }
-
- # tag-value pairs; tags are not unique; does not include fileformat, INFO, FILTER or FORMAT fields
- _header = []
-
- # version number; 33=v3.3; 40=v4.0
- _version = 40
-
- # info, filter and format data
- _info = {}
- _filter = {}
- _format = {}
-
- # header; and required columns
- _required = ["CHROM","POS","ID","REF","ALT","QUAL","FILTER","INFO","FORMAT"]
- _samples = []
-
- # control behaviour
-    _ignored_errors = set([11,31]) # ERROR_UNKNOWN_KEY, ZERO_FOR_NON_FLAG_FIELD
- _warn_errors = set([])
- _leftalign = False
-
- # reference sequence
- _reference = None
-
- # regions to include; None includes everything
- _regions = None
-
-    # stateful stuff
- _lineno = -1
- _line = None
- _lines = None
-
- def __init__(self, _copy=None, reference=None, regions=None,
- lines=None, leftalign=False):
- # make error identifiers accessible by name
- for id in self._errors.keys():
- self.__dict__[self._errors[id].split(':')[0]] = id
- if _copy != None:
- self._leftalign = _copy._leftalign
- self._header = _copy._header[:]
- self._version = _copy._version
- self._info = copy.deepcopy(_copy._info)
- self._filter = copy.deepcopy(_copy._filter)
- self._format = copy.deepcopy(_copy._format)
- self._samples = _copy._samples[:]
- self._sample2column = copy.deepcopy(_copy._sample2column)
- self._ignored_errors = copy.deepcopy(_copy._ignored_errors)
- self._warn_errors = copy.deepcopy(_copy._warn_errors)
- self._reference = _copy._reference
- self._regions = _copy._regions
- if reference: self._reference = reference
- if regions: self._regions = regions
- if leftalign: self._leftalign = leftalign
- self._lines = lines
- self.encoding = "ascii"
- self.tabixfile = None
-
- def error(self,line,error,opt=None):
- if error in self._ignored_errors: return
- errorlabel, errorstring = self._errors[error].split(':')
- if opt: errorstring = errorstring % opt
- errwarn = ["Error","Warning"][error in self._warn_errors]
- errorstring += " in line %s: '%s'\n%s %s: %s\n" % (self._lineno,line,errwarn,errorlabel,errorstring)
- if error in self._warn_errors: return
- raise ValueError(errorstring)
-
- def parse_format(self,line,format,filter=False):
- if self._version == 40:
- if not format.startswith('<'):
- self.error(line,self.V40_MISSING_ANGLE_BRACKETS)
- format = "<"+format
- if not format.endswith('>'):
- self.error(line,self.V40_MISSING_ANGLE_BRACKETS)
- format += ">"
- format = format[1:-1]
- data = {'id':None,'number':None,'type':None,'descr':None}
- idx = 0
- while len(format.strip())>0:
- elts = format.strip().split(',')
- first, rest = elts[0], ','.join(elts[1:])
- if first.find('=') == -1 or (first.find('"')>=0 and first.find('=') > first.find('"')):
- if self._version == 40: self.error(line,self.V40_FORMAT_MUST_HAVE_NAMED_FIELDS)
- if idx == 4: self.error(line,self.BADLY_FORMATTED_FORMAT_STRING)
- first = ["ID=","Number=","Type=","Description="][idx] + first
- if first.startswith('ID='): data['id'] = first.split('=')[1]
- elif first.startswith('Number='): data['number'] = first.split('=')[1]
- elif first.startswith('Type='): data['type'] = first.split('=')[1]
- elif first.startswith('Description='):
- elts = format.split('"')
- if len(elts)<3:
- self.error(line,self.FORMAT_MISSING_QUOTES)
- elts = first.split('=') + [rest]
- data['descr'] = elts[1]
- rest = '"'.join(elts[2:])
- if rest.startswith(','): rest = rest[1:]
- else:
- self.error(line,self.BADLY_FORMATTED_FORMAT_STRING)
- format = rest
- idx += 1
- if filter and idx==1: idx=3 # skip number and type fields for FILTER format strings
- if not data['id']: self.error(line,self.BADLY_FORMATTED_FORMAT_STRING)
- if 'descr' not in data:
- # missing description
- self.error(line,self.BADLY_FORMATTED_FORMAT_STRING)
- data['descr'] = ""
- if not data['type'] and not data['number']:
- # fine, ##filter format
- return FORMAT(data['id'],self.NT_NUMBER,0,"Flag",data['descr'],'.')
-        if data['type'] not in ["Integer","Float","Character","String","Flag"]:
- self.error(line,self.BADLY_FORMATTED_FORMAT_STRING)
- # I would like a missing-value field, but it isn't there
- if data['type'] in ['Integer','Float']: data['missing'] = None # Do NOT use arbitrary int/float as missing value
- else: data['missing'] = '.'
- if not data['number']: self.error(line,self.BADLY_FORMATTED_FORMAT_STRING)
- try:
- n = int(data['number'])
- t = self.NT_NUMBER
- except ValueError:
- n = -1
- if data['number'] == '.': t = self.NT_UNKNOWN
- elif data['number'] == '#alleles': t = self.NT_ALLELES
- elif data['number'] == '#nonref_alleles': t = self.NT_NR_ALLELES
- elif data['number'] == '#genotypes': t = self.NT_GENOTYPES
- elif data['number'] == '#phased_genotypes': t = self.NT_PHASED_GENOTYPES
- # abbreviations added in VCF version v4.1
- elif data['number'] == 'A': t = self.NT_ALLELES
- elif data['number'] == 'G': t = self.NT_GENOTYPES
- else:
- self.error(line,self.BADLY_FORMATTED_FORMAT_STRING)
- # if number is 0 - type must be Flag
- if n == 0 and data['type'] != 'Flag':
- self.error( line, self.ZERO_FOR_NON_FLAG_FIELD)
-            # force type 'Flag' when the number is 0 and the error was ignored
- data['type'] = 'Flag'
-
- return FORMAT(data['id'],t,n,data['type'],data['descr'],data['missing'])
-
- def format_format( self, fmt, filter=False ):
- values = [('ID',fmt.id)]
- if fmt.number != None and not filter:
- if fmt.numbertype == self.NT_UNKNOWN: nmb = "."
- elif fmt.numbertype == self.NT_NUMBER: nmb = str(fmt.number)
- elif fmt.numbertype == self.NT_ALLELES: nmb = "#alleles"
- elif fmt.numbertype == self.NT_NR_ALLELES: nmb = "#nonref_alleles"
- elif fmt.numbertype == self.NT_GENOTYPES: nmb = "#genotypes"
- elif fmt.numbertype == self.NT_PHASED_GENOTYPES: nmb = "#phased_genotypes"
- else:
- raise ValueError("Unknown number type encountered: %s" % fmt.numbertype)
- values.append( ('Number',nmb) )
- values.append( ('Type', fmt.type) )
- values.append( ('Description', '"' + fmt.description + '"') )
- if self._version == 33:
- format = ",".join([v for k,v in values])
- else:
- format = "<" + (",".join( ["%s=%s" % (k,v) for (k,v) in values] )) + ">"
- return format
-
- def get_expected(self, format, formatdict, alt):
- fmt = formatdict[format]
- if fmt.numbertype == self.NT_UNKNOWN: return -1
- if fmt.numbertype == self.NT_NUMBER: return fmt.number
- if fmt.numbertype == self.NT_ALLELES: return len(alt)+1
- if fmt.numbertype == self.NT_NR_ALLELES: return len(alt)
- if fmt.numbertype == self.NT_GENOTYPES: return ((len(alt)+1)*(len(alt)+2)) // 2
- if fmt.numbertype == self.NT_PHASED_GENOTYPES: return (len(alt)+1)*(len(alt)+1)
- return 0
-
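Worked numbers for a record with two ALT alleles (len(alt) == 2):

# NT_ALLELES          -> 3   REF plus two ALTs
# NT_NR_ALLELES       -> 2   ALTs only
# NT_GENOTYPES        -> 6   (3*4)//2 unordered diploid genotypes
# NT_PHASED_GENOTYPES -> 9   3*3 ordered diploid genotypes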
-
- def _add_definition(self, formatdict, key, data, line ):
- if key in formatdict: return
- self.error(line,self.ERROR_UNKNOWN_KEY,key)
- if data == None:
- formatdict[key] = FORMAT(key,self.NT_NUMBER,0,"Flag","(Undefined tag)",".")
- return
- if data == []: data = [""] # unsure what type -- say string
- if type(data[0]) == type(0.0):
- formatdict[key] = FORMAT(key,self.NT_UNKNOWN,-1,"Float","(Undefined tag)",None)
- return
- if type(data[0]) == type(0):
- formatdict[key] = FORMAT(key,self.NT_UNKNOWN,-1,"Integer","(Undefined tag)",None)
- return
- formatdict[key] = FORMAT(key,self.NT_UNKNOWN,-1,"String","(Undefined tag)",".")
-
-
- # todo: trim trailing missing values
- def format_formatdata( self, data, format, key=True, value=True, separator=":" ):
- output, sdata = [], []
- if type(data) == type([]): # for FORMAT field, make data with dummy values
- d = {}
- for k in data: d[k] = []
- data = d
- # convert missing values; and silently add definitions if required
- for k in data:
- self._add_definition( format, k, data[k], "(output)" )
- for idx,v in enumerate(data[k]):
- if v == format[k].missingvalue: data[k][idx] = "."
- # make sure GT comes first; and ensure fixed ordering; also convert GT data back to string
- for k in data:
- if k != 'GT': sdata.append( (k,data[k]) )
- sdata.sort()
- if 'GT' in data:
- sdata = [('GT',map(self.convertGTback,data['GT']))] + sdata
- for k,v in sdata:
- if v == []: v = None
- if key and value:
- if v != None: output.append( k+"="+','.join(map(str,v)) )
- else: output.append( k )
- elif key: output.append(k)
- elif value:
- if v != None: output.append( ','.join(map(str,v)) )
- else: output.append( "." ) # should not happen
- # snip off trailing missing data
- while len(output) > 1:
- last = output[-1].replace(',','').replace('.','')
- if len(last)>0: break
- output = output[:-1]
- return separator.join(output)
-
-
- def enter_default_format(self):
- for f in [FORMAT('GT',self.NT_NUMBER,1,'String','Genotype','.'),
- FORMAT('DP',self.NT_NUMBER,1,'Integer','Read depth at this position for this sample',-1),
- FORMAT('FT',self.NT_NUMBER,1,'String','Sample Genotype Filter','.'),
- FORMAT('GL',self.NT_UNKNOWN,-1,'Float','Genotype likelihoods','.'),
- FORMAT('GLE',self.NT_UNKNOWN,-1,'Float','Genotype likelihoods','.'),
- FORMAT('GQ',self.NT_NUMBER,1,'Integer','Genotype Quality',-1),
- FORMAT('PL',self.NT_GENOTYPES,-1,'Integer','Phred-scaled genotype likelihoods', '.'),
- FORMAT('GP',self.NT_GENOTYPES,-1,'Float','Genotype posterior probabilities','.'),
- FORMAT('GQ',self.NT_GENOTYPES,-1,'Integer','Conditional genotype quality','.'),
- FORMAT('HQ',self.NT_UNKNOWN,-1,'Integer','Haplotype Quality',-1), # unknown number, since may be haploid
- FORMAT('PS',self.NT_UNKNOWN,-1,'Integer','Phase set','.'),
- FORMAT('PQ',self.NT_NUMBER,1,'Integer','Phasing quality',-1),
-                  FORMAT('EC',self.NT_ALLELES,1,'Integer','Expected alternate allele counts',-1),
- FORMAT('MQ',self.NT_NUMBER,1,'Integer','RMS mapping quality',-1),
- ]:
- if f.id not in self._format:
- self._format[f.id] = f
-
- def parse_header(self, line):
-
- assert line.startswith('##')
- elts = line[2:].split('=')
- key = elts[0].strip()
- value = '='.join(elts[1:]).strip()
- if key == "fileformat":
- if value == "VCFv3.3":
- self._version = 33
- elif value == "VCFv4.0":
- self._version = 40
- elif value == "VCFv4.1":
- # AH - for testing
- self._version = 40
- elif value == "VCFv4.2":
- # AH - for testing
- self._version = 40
- else:
- self.error(line,self.UNKNOWN_FORMAT_STRING)
- elif key == "INFO":
- f = self.parse_format(line, value)
- self._info[ f.id ] = f
- elif key == "FILTER":
- f = self.parse_format(line, value, filter=True)
- self._filter[ f.id ] = f
- elif key == "FORMAT":
- f = self.parse_format(line, value)
- self._format[ f.id ] = f
- else:
- # keep other keys in the header field
- self._header.append( (key,value) )
-
-
- def write_header( self, stream ):
- stream.write("##fileformat=VCFv%s.%s\n" % (self._version // 10, self._version % 10))
- for key,value in self._header: stream.write("##%s=%s\n" % (key,value))
- for var,label in [(self._info,"INFO"),(self._filter,"FILTER"),(self._format,"FORMAT")]:
-            for f in var.values(): stream.write("##%s=%s\n" % (label,self.format_format(f,filter=(label=="FILTER"))))
-
-
- def parse_heading( self, line ):
- assert line.startswith('#')
- assert not line.startswith('##')
- headings = line[1:].split('\t')
- # test for 8, as FORMAT field might be missing
- if len(headings)==1 and len(line[1:].split()) >= 8:
- self.error(line,self.HEADING_NOT_SEPARATED_BY_TABS)
- headings = line[1:].split()
-
- for i,s in enumerate(self._required):
-
- if len(headings)<=i or headings[i] != s:
-
- if len(headings) <= i:
-                    err = "(entry %s not found)" % (i+1)
- else:
- err = "(found %s, expected %s)" % (headings[i],s)
-
- #self.error(line,self.BADLY_FORMATTED_HEADING,err)
- # allow FORMAT column to be absent
- if len(headings) == 8:
- headings.append("FORMAT")
- else:
- self.error(line,self.BADLY_FORMATTED_HEADING,err)
-
- self._samples = headings[9:]
- self._sample2column = dict( [(y,x+9) for x,y in enumerate( self._samples ) ] )
-
- def write_heading( self, stream ):
- stream.write("#" + "\t".join(self._required + self._samples) + "\n")
-
- def convertGT(self, GTstring):
- if GTstring == ".": return ["."]
- try:
- gts = gtsRegEx.split(GTstring)
- if len(gts) == 1: return [int(gts[0])]
- if len(gts) != 2: raise ValueError()
- if gts[0] == "." and gts[1] == ".": return [gts[0],GTstring[len(gts[0]):-len(gts[1])],gts[1]]
- return [int(gts[0]),GTstring[len(gts[0]):-len(gts[1])],int(gts[1])]
- except ValueError:
- self.error(self._line,self.BAD_GENOTYPE,GTstring)
- return [".","|","."]
-
- def convertGTback(self, GTdata):
- return ''.join(map(str,GTdata))
-
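A round-trip sketch of the genotype encoding described in the module header
(allele, separator, allele):

# convertGT("0/1") -> [0, "/", 1]
# convertGT("1|2") -> [1, "|", 2]
# convertGT(".")   -> ["."]         haploid/missing stays a single element
# convertGTback([0, "/", 1]) -> "0/1"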
- def parse_formatdata( self, key, value, formatdict, line ):
- # To do: check that the right number of values is present
- f = formatdict.get(key,None)
- if f == None:
- self._add_definition(formatdict, key, value, line )
- f = formatdict[key]
- if f.type == "Flag":
- if value is not None: self.error(line,self.ERROR_FLAG_HAS_VALUE)
- return []
- values = value.split(',')
- # deal with trailing data in some early VCF files
- if f.type in ["Float","Integer"] and len(values)>0 and values[-1].find(';') > -1:
- self.error(line,self.ERROR_TRAILING_DATA,values[-1])
- values[-1] = values[-1].split(';')[0]
- if f.type == "Integer":
- for idx,v in enumerate(values):
- try:
- if v == ".": values[idx] = f.missingvalue
- else: values[idx] = int(v)
-                except ValueError:
- self.error(line,self.ERROR_FORMAT_NOT_INTEGER,"%s=%s" % (key, str(values)))
- return [0] * len(values)
- return values
- elif f.type == "String":
- self._line = line
- if f.id == "GT": values = list(map( self.convertGT, values ))
- return values
- elif f.type == "Character":
- for v in values:
- if len(v) != 1: self.error(line,self.ERROR_FORMAT_NOT_CHAR)
- return values
- elif f.type == "Float":
- for idx,v in enumerate(values):
- if v == ".": values[idx] = f.missingvalue
- try: return list(map(float,values))
-            except ValueError:
- self.error(line,self.ERROR_FORMAT_NOT_NUMERICAL,"%s=%s" % (key, str(values)))
- return [0.0] * len(values)
- else:
- # can't happen
- self.error(line,self.ERROR_INFO_STRING)
-
- def inregion(self, chrom, pos):
- if not self._regions: return True
- for r in self._regions:
- if r[0] == chrom and r[1] <= pos < r[2]: return True
- return False
-
- def parse_data( self, line, lineparse=False ):
- cols = line.split('\t')
- if len(cols) != len(self._samples)+9:
- # gracefully deal with absent FORMAT column
- # and those missing samples
- if len(cols) == 8:
- cols.append("")
- else:
- self.error(line,
- self.BAD_NUMBER_OF_COLUMNS,
- "expected %s for %s samples (%s), got %s" % (len(self._samples)+9, len(self._samples), self._samples, len(cols)))
-
- chrom = cols[0]
-
- # get 0-based position
- try: pos = int(cols[1])-1
-        except ValueError: self.error(line,self.POS_NOT_NUMERICAL)
- if pos < 0: self.error(line,self.POS_NOT_POSITIVE)
-
- # implement filtering
- if not self.inregion(chrom,pos): return None
-
- # end of first-pass parse for sortedVCF
- if lineparse: return chrom, pos, line
-
- id = cols[2]
-
- ref = cols[3].upper()
- if ref == ".":
- self.error(line,self.MISSING_REF)
- if self._version == 33: ref = get_sequence(chrom,pos,pos+1,self._reference)
- else: ref = ""
- else:
- for c in ref:
- if c not in "ACGTN": self.error(line,self.UNKNOWN_CHAR_IN_REF)
- if "N" in ref: ref = get_sequence(chrom,pos,pos+len(ref),self._reference)
-
- # make sure reference is sane
- if self._reference:
- left = max(0,pos-100)
- faref_leftflank = get_sequence(chrom,left,pos+len(ref),self._reference)
- faref = faref_leftflank[pos-left:]
- if faref != ref: self.error(line,self.WRONG_REF,"(reference is %s, VCF says %s)" % (faref,ref))
- ref = faref
-
- # convert v3.3 to v4.0 alleles below
- if cols[4] == ".": alt = []
- else: alt = cols[4].upper().split(',')
-
- if cols[5] == ".": qual = -1
- else:
- try: qual = float(cols[5])
-            except ValueError: self.error(line,self.QUAL_NOT_NUMERICAL)
-
- # postpone checking that filters exist. Encode missing filter or no filtering as empty list
- if cols[6] == "." or cols[6] == "PASS" or cols[6] == "0": filter = []
- else: filter = cols[6].split(';')
-
- # dictionary of keys, and list of values
- info = {}
- if cols[7] != ".":
- for blurp in cols[7].split(';'):
- elts = blurp.split('=')
- if len(elts) == 1: v = None
- elif len(elts) == 2: v = elts[1]
- else: self.error(line,self.ERROR_INFO_STRING)
- info[elts[0]] = self.parse_formatdata(elts[0],
- v,
- self._info,
- line)
-
- # Gracefully deal with absent FORMAT column
- if cols[8] == "": format = []
- else: format = cols[8].split(':')
-
- # check: all filters are defined
- for f in filter:
- if f not in self._filter: self.error(line,self.FILTER_NOT_DEFINED, f)
-
- # check: format fields are defined
- if self._format:
- for f in format:
- if f not in self._format: self.error(line,self.FORMAT_NOT_DEFINED, f)
-
- # convert v3.3 alleles
- if self._version == 33:
- if len(ref) != 1: self.error(line,self.V33_BAD_REF)
- newalts = []
- have_deletions = False
- for a in alt:
- if len(a) == 1: a = a + ref[1:] # SNP; add trailing reference
- elif a.startswith('I'): a = ref[0] + a[1:] + ref[1:] # insertion just beyond pos; add first and trailing reference
- elif a.startswith('D'): # allow D<seq> and D<num>
- have_deletions = True
- try:
- l = int(a[1:]) # throws ValueError if sequence
- if len(ref) < l: # add to reference if necessary
- addns = get_sequence(chrom,pos+len(ref),pos+l,self._reference)
- ref += addns
- for i,na in enumerate(newalts): newalts[i] = na+addns
- a = ref[l:] # new deletion, deleting pos...pos+l
- except ValueError:
- s = a[1:]
- if len(ref) < len(s): # add Ns to reference if necessary
- addns = get_sequence(chrom,pos+len(ref),pos+len(s),self._reference)
- if not s.endswith(addns) and addns != 'N'*len(addns):
- self.error(line,self.V33_UNMATCHED_DELETION,
- "(deletion is %s, reference is %s)" % (a,get_sequence(chrom,pos,pos+len(s),self._reference)))
- ref += addns
- for i,na in enumerate(newalts): newalts[i] = na+addns
- a = ref[len(s):] # new deletion, deleting from pos
- else:
- self.error(line,self.V33_BAD_ALLELE)
- newalts.append(a)
- alt = newalts
- # deletion alleles exist, add dummy 1st reference allele, and account for leading base
- if have_deletions:
- if pos == 0:
-                # Petr Danacek's rule: we can't have a leading nucleotide at (1-based) position 1
- addn = get_sequence(chrom,pos+len(ref),pos+len(ref)+1,self._reference)
- ref += addn
- alt = [allele+addn for allele in alt]
- else:
- addn = get_sequence(chrom,pos-1,pos,self._reference)
- ref = addn + ref
- alt = [addn + allele for allele in alt]
- pos -= 1
- else:
- # format v4.0 -- just check for nucleotides
- for allele in alt:
- if not alleleRegEx.match(allele):
- self.error(line,self.V40_BAD_ALLELE,allele)
-
- # check for leading nucleotide in indel calls
- for allele in alt:
- if len(allele) != len(ref):
- if len(allele) == 0: self.error(line,self.ZERO_LENGTH_ALLELE)
- if ref[0].upper() != allele[0].upper() and "N" not in (ref[0]+allele[0]).upper():
- self.error(line,self.MISSING_INDEL_ALLELE_REF_BASE)
-
- # trim trailing bases in alleles
- # AH: not certain why trimming this needs to be added
- # disabled now for unit testing
- # if alt:
- # for i in range(1,min(len(ref),min(map(len,alt)))):
- # if len(set(allele[-1].upper() for allele in alt)) > 1 or ref[-1].upper() != alt[0][-1].upper():
- # break
- # ref, alt = ref[:-1], [allele[:-1] for allele in alt]
-
- # left-align alleles, if a reference is available
- if self._leftalign and self._reference:
- while left < pos:
- movable = True
- for allele in alt:
- if len(allele) > len(ref):
- longest, shortest = allele, ref
- else:
- longest, shortest = ref, allele
- if len(longest) == len(shortest) or longest[:len(shortest)].upper() != shortest.upper():
- movable = False
- if longest[-1].upper() != longest[len(shortest)-1].upper():
- movable = False
- if not movable:
- break
- ref = ref[:-1]
- alt = [allele[:-1] for allele in alt]
- if min([len(allele) for allele in alt]) == 0 or len(ref) == 0:
- ref = faref_leftflank[pos-left-1] + ref
- alt = [faref_leftflank[pos-left-1] + allele for allele in alt]
- pos -= 1
-
- # parse sample columns
- samples = []
- for sample in cols[9:]:
-            # avoid shadowing the builtin 'dict'
-            sampledata = {}
-            values = sample.split(':')
-            if len(values) > len(format):
-                self.error(line,self.BAD_NUMBER_OF_VALUES,"(found %s values in element %s; expected %s)" % (len(values),sample,len(format)))
-            for idx in range(len(format)):
-                expected = self.get_expected(format[idx], self._format, alt)
-                if idx < len(values): value = values[idx]
-                else:
-                    if expected == -1: value = "."
-                    else: value = ",".join(["."]*expected)
-
-                sampledata[format[idx]] = self.parse_formatdata(format[idx],
-                                                                value,
-                                                                self._format,
-                                                                line)
-                if expected != -1 and len(sampledata[format[idx]]) != expected:
-                    self.error(line,self.BAD_NUMBER_OF_PARAMETERS,
-                               "id=%s, expected %s parameters, got %s" % (format[idx],expected,sampledata[format[idx]]))
-                if len(sampledata[format[idx]]) < expected: sampledata[format[idx]] += [sampledata[format[idx]][-1]]*(expected-len(sampledata[format[idx]]))
-                sampledata[format[idx]] = sampledata[format[idx]][:expected]
-            samples.append(sampledata)
-
- # done
- d = {'chrom':chrom,
- 'pos':pos, # return 0-based position
- 'id':id,
- 'ref':ref,
- 'alt':alt,
- 'qual':qual,
- 'filter':filter,
- 'info':info,
- 'format':format}
- for key,value in zip(self._samples,samples):
- d[key] = value
-
- return d
-
-
- def write_data(self, stream, data):
- required = ['chrom','pos','id','ref','alt','qual','filter','info','format'] + self._samples
- for k in required:
- if k not in data: raise ValueError("Required key %s not found in data" % str(k))
- if data['alt'] == []: alt = "."
- else: alt = ",".join(data['alt'])
- if data['filter'] == None: filter = "."
- elif data['filter'] == []:
- if self._version == 33: filter = "0"
- else: filter = "PASS"
- else: filter = ';'.join(data['filter'])
- if data['qual'] == -1: qual = "."
- else: qual = str(data['qual'])
-
- output = [data['chrom'],
- str(data['pos']+1), # change to 1-based position
- data['id'],
- data['ref'],
- alt,
- qual,
- filter,
- self.format_formatdata(
- data['info'], self._info, separator=";"),
- self.format_formatdata(
- data['format'], self._format, value=False)]
-
- for s in self._samples:
- output.append(self.format_formatdata(
- data[s], self._format, key=False))
-
- stream.write( "\t".join(output) + "\n" )
-
- def _parse_header(self, stream):
-        self._lineno = 0
-        line = ""
-        for line in stream:
- line = force_str(line, self.encoding)
- self._lineno += 1
- if line.startswith('##'):
- self.parse_header(line.strip())
- elif line.startswith('#'):
- self.parse_heading(line.strip())
- self.enter_default_format()
- else:
- break
- return line
-
- def _parse(self, line, stream):
- # deal with files with header only
- if line.startswith("##"): return
- if len(line.strip()) > 0:
- d = self.parse_data( line.strip() )
- if d: yield d
- for line in stream:
- self._lineno += 1
-            # PEP 479: 'return' ends the generator; raising StopIteration
-            # inside a generator is an error on Python 3.7+
-            if self._lines and self._lineno > self._lines: return
- d = self.parse_data( line.strip() )
- if d: yield d
-
- ######################################################################################################
- #
- # API follows
- #
- ######################################################################################################
-
- def getsamples(self):
- """ List of samples in VCF file """
- return self._samples
-
- def setsamples(self,samples):
- """ List of samples in VCF file """
- self._samples = samples
-
- def getheader(self):
- """ List of header key-value pairs (strings) """
- return self._header
-
- def setheader(self,header):
- """ List of header key-value pairs (strings) """
- self._header = header
-
- def getinfo(self):
- """ Dictionary of ##INFO tags, as VCF.FORMAT values """
- return self._info
-
- def setinfo(self,info):
- """ Dictionary of ##INFO tags, as VCF.FORMAT values """
- self._info = info
-
- def getformat(self):
- """ Dictionary of ##FORMAT tags, as VCF.FORMAT values """
- return self._format
-
- def setformat(self,format):
- """ Dictionary of ##FORMAT tags, as VCF.FORMAT values """
- self._format = format
-
- def getfilter(self):
- """ Dictionary of ##FILTER tags, as VCF.FORMAT values """
- return self._filter
-
- def setfilter(self,filter):
- """ Dictionary of ##FILTER tags, as VCF.FORMAT values """
- self._filter = filter
-
- def setversion(self, version):
- if version != 33 and version != 40: raise ValueError("Can only handle v3.3 and v4.0 VCF files")
- self._version = version
-
- def setregions(self, regions):
- self._regions = regions
-
- def setreference(self, ref):
-        """ Provide a reference sequence; a Python class supporting a fetch(chromosome, start, end) method, e.g. pysam.FastaFile """
- self._reference = ref
-
- def ignoreerror(self, errorstring):
- try: self._ignored_errors.add(self.__dict__[errorstring])
- except KeyError: raise ValueError("Invalid error string: %s" % errorstring)
-
- def warnerror(self, errorstring):
- try: self._warn_errors.add(self.__dict__[errorstring])
- except KeyError: raise ValueError("Invalid error string: %s" % errorstring)
-
- def parse(self, stream):
-        """ Parse a stream of VCF-formatted lines. Initializes the class instance and returns a generator """
- last_line = self._parse_header(stream)
- # now return a generator that does the actual work. In this way the pre-processing is done
- # before the first piece of data is yielded
- return self._parse(last_line, stream)
-
- def write(self, stream, datagenerator):
- """ Writes a VCF file to a stream, using a data generator (or list) """
- self.write_header(stream)
- self.write_heading(stream)
- for data in datagenerator: self.write_data(stream,data)
-
- def writeheader(self, stream):
- """ Writes a VCF header """
- self.write_header(stream)
- self.write_heading(stream)
-
- def compare_calls(self, pos1, ref1, alt1, pos2, ref2, alt2):
- """ Utility function: compares two calls for equality """
- # a variant should always be assigned to a unique position, one base before
- # the leftmost position of the alignment gap. If this rule is implemented
- # correctly, the two positions must be equal for the calls to be identical.
- if pos1 != pos2: return False
- # from both calls, trim rightmost bases when identical. Do this safely, i.e.
- # only when the reference bases are not Ns
- while len(ref1)>0 and len(alt1)>0 and ref1[-1] == alt1[-1]:
- ref1 = ref1[:-1]
- alt1 = alt1[:-1]
- while len(ref2)>0 and len(alt2)>0 and ref2[-1] == alt2[-1]:
- ref2 = ref2[:-1]
- alt2 = alt2[:-1]
- # now, the alternative alleles must be identical
- return alt1 == alt2
-
-###########################################################################################################
-###########################################################################################################
-## API functions added by Andreas
-###########################################################################################################
-
- def connect(self, filename, encoding="ascii"):
- '''connect to tabix file.'''
- self.encoding=encoding
- self.tabixfile = pysam.Tabixfile(filename, encoding=encoding)
- self._parse_header(self.tabixfile.header)
-
- def __del__(self):
- self.close()
- self.tabixfile = None
-
- def close(self):
- if self.tabixfile:
- self.tabixfile.close()
- self.tabixfile = None
-
- def fetch(self,
- reference=None,
- start=None,
- end=None,
- region=None ):
- """ Parse a stream of VCF-formatted lines.
- Initializes class instance and return generator """
- return self.tabixfile.fetch(
- reference,
- start,
- end,
- region,
- parser = asVCFRecord(self))
-
- def validate(self, record):
- '''validate vcf record.
-
- returns a validated record.
- '''
-
- raise NotImplementedError("needs to be checked")
-
- chrom, pos = record.chrom, record.pos
-
- # check reference
- ref = record.ref
- if ref == ".":
- self.error(str(record),self.MISSING_REF)
- if self._version == 33: ref = get_sequence(chrom,pos,pos+1,self._reference)
- else: ref = ""
- else:
- for c in ref:
- if c not in "ACGTN": self.error(str(record),self.UNKNOWN_CHAR_IN_REF)
- if "N" in ref: ref = get_sequence(chrom,
- pos,
- pos+len(ref),
- self._reference)
-
- # make sure reference is sane
- if self._reference:
- left = max(0,self.pos-100)
- faref_leftflank = get_sequence(chrom,left,self.pos+len(ref),self._reference)
- faref = faref_leftflank[pos-left:]
- if faref != ref: self.error(str(record),self.WRONG_REF,"(reference is %s, VCF says %s)" % (faref,ref))
- ref = faref
-
- # check: format fields are defined
- for f in record.format:
- if f not in self._format: self.error(str(record),self.FORMAT_NOT_DEFINED, f)
-
- # check: all filters are defined
- for f in record.filter:
- if f not in self._filter: self.error(str(record),self.FILTER_NOT_DEFINED, f)
-
- # convert v3.3 alleles
- if self._version == 33:
- if len(ref) != 1: self.error(str(record),self.V33_BAD_REF)
- newalts = []
- have_deletions = False
- for a in alt:
- if len(a) == 1: a = a + ref[1:] # SNP; add trailing reference
- elif a.startswith('I'): a = ref[0] + a[1:] + ref[1:] # insertion just beyond pos; add first and trailing reference
- elif a.startswith('D'): # allow D<seq> and D<num>
- have_deletions = True
- try:
- l = int(a[1:]) # throws ValueError if sequence
- if len(ref) < l: # add to reference if necessary
- addns = get_sequence(chrom,pos+len(ref),pos+l,self._reference)
- ref += addns
- for i,na in enumerate(newalts): newalts[i] = na+addns
- a = ref[l:] # new deletion, deleting pos...pos+l
- except ValueError:
- s = a[1:]
- if len(ref) < len(s): # add Ns to reference if necessary
- addns = get_sequence(chrom,pos+len(ref),pos+len(s),self._reference)
- if not s.endswith(addns) and addns != 'N'*len(addns):
- self.error(str(record),self.V33_UNMATCHED_DELETION,
- "(deletion is %s, reference is %s)" % (a,get_sequence(chrom,pos,pos+len(s),self._reference)))
- ref += addns
- for i,na in enumerate(newalts): newalts[i] = na+addns
- a = ref[len(s):] # new deletion, deleting from pos
- else:
- self.error(str(record),self.V33_BAD_ALLELE)
- newalts.append(a)
- alt = newalts
- # deletion alleles exist, add dummy 1st reference allele, and account for leading base
- if have_deletions:
- if pos == 0:
- # Petr Danacek's: we can't have a leading nucleotide at (1-based) position 1
- addn = get_sequence(chrom,pos+len(ref),pos+len(ref)+1,self._reference)
- ref += addn
- alt = [allele+addn for allele in alt]
- else:
- addn = get_sequence(chrom,pos-1,pos,self._reference)
- ref = addn + ref
- alt = [addn + allele for allele in alt]
- pos -= 1
- else:
- # format v4.0 -- just check for nucleotides
- for allele in alt:
- if not alleleRegEx.match(allele):
- self.error(str(record),self.V40_BAD_ALLELE,allele)
-
-
- # check for leading nucleotide in indel calls
- for allele in alt:
- if len(allele) != len(ref):
- if len(allele) == 0: self.error(str(record),self.ZERO_LENGTH_ALLELE)
- if ref[0].upper() != allele[0].upper() and "N" not in (ref[0]+allele[0]).upper():
- self.error(str(record),self.MISSING_INDEL_ALLELE_REF_BASE)
-
- # trim trailing bases in alleles
- # AH: not certain why trimming this needs to be added
- # disabled now for unit testing
- # for i in range(1,min(len(ref),min(map(len,alt)))):
- # if len(set(allele[-1].upper() for allele in alt)) > 1 or ref[-1].upper() != alt[0][-1].upper():
- # break
- # ref, alt = ref[:-1], [allele[:-1] for allele in alt]
-
- # left-align alleles, if a reference is available
- if self._leftalign and self._reference:
- while left < pos:
- movable = True
- for allele in alt:
- if len(allele) > len(ref):
- longest, shortest = allele, ref
- else:
- longest, shortest = ref, allele
- if len(longest) == len(shortest) or longest[:len(shortest)].upper() != shortest.upper():
- movable = False
- if longest[-1].upper() != longest[len(shortest)-1].upper():
- movable = False
- if not movable:
- break
- ref = ref[:-1]
- alt = [allele[:-1] for allele in alt]
- if min([len(allele) for allele in alt]) == 0 or len(ref) == 0:
- ref = faref_leftflank[pos-left-1] + ref
- alt = [faref_leftflank[pos-left-1] + allele for allele in alt]
- pos -= 1
-
-__all__ = [
- "VCF", "VCFRecord", ]
--- /dev/null
+from pysam.libchtslib cimport *
+
+cdef extern from "htslib_util.h":
+
+    # resize the variable length data of *b* at *pos* from
+    # *nbytes_old* to *nbytes_new* bytes
+ bam1_t * pysam_bam_update(bam1_t * b,
+ size_t nbytes_old,
+ size_t nbytes_new,
+ uint8_t * pos)
+
+    # now a static function within htslib, hence re-exported here
+ int aux_type2size(int)
+
+ char * pysam_bam_get_qname(bam1_t * b)
+ uint32_t * pysam_bam_get_cigar(bam1_t * b)
+ uint8_t * pysam_bam_get_seq(bam1_t * b)
+ uint8_t * pysam_bam_get_qual(bam1_t * b)
+ uint8_t * pysam_bam_get_aux(bam1_t * b)
+ int pysam_bam_get_l_aux(bam1_t * b)
+ char pysam_bam_seqi(uint8_t * s, int i)
+
+ uint16_t pysam_get_bin(bam1_t * b)
+ uint8_t pysam_get_qual(bam1_t * b)
+ uint8_t pysam_get_l_qname(bam1_t * b)
+ uint16_t pysam_get_flag(bam1_t * b)
+ uint16_t pysam_get_n_cigar(bam1_t * b)
+ void pysam_set_bin(bam1_t * b, uint16_t v)
+ void pysam_set_qual(bam1_t * b, uint8_t v)
+ void pysam_set_l_qname(bam1_t * b, uint8_t v)
+ void pysam_set_flag(bam1_t * b, uint16_t v)
+ void pysam_set_n_cigar(bam1_t * b, uint16_t v)
+ void pysam_update_flag(bam1_t * b, uint16_t v, uint16_t flag)
+
+
+from pysam.libcalignmentfile cimport AlignmentFile
+ctypedef AlignmentFile AlignmentFile_t
+
+
+# Note: need to declare all C fields and methods here
+cdef class AlignedSegment:
+
+ # object that this AlignedSegment represents
+ cdef bam1_t * _delegate
+
+ # the file from which this AlignedSegment originates (can be None)
+ cdef AlignmentFile _alignment_file
+
+ # caching of array properties for quick access
+ cdef object cache_query_qualities
+ cdef object cache_query_alignment_qualities
+ cdef object cache_query_sequence
+ cdef object cache_query_alignment_sequence
+
+ # add an alignment tag with value to the AlignedSegment
+ # an existing tag of the same name will be replaced.
+ cpdef set_tag(self, tag, value, value_type=?, replace=?)
+
+    # retrieve the value of an alignment tag, optionally together
+    # with its type code
+ cpdef get_tag(self, tag, with_value_type=?)
+
+ # return true if tag exists
+ cpdef has_tag(self, tag)
+
+ # returns a valid sam alignment string
+    cpdef tostring(self, AlignmentFile_t htsfile)
+
+
+cdef class PileupColumn:
+ cdef bam_pileup1_t ** plp
+ cdef int tid
+ cdef int pos
+ cdef int n_pu
+ cdef AlignmentFile _alignment_file
+
+
+cdef class PileupRead:
+ cdef AlignedSegment _alignment
+ cdef int32_t _qpos
+ cdef int _indel
+ cdef int _level
+ cdef uint32_t _is_del
+ cdef uint32_t _is_head
+ cdef uint32_t _is_tail
+ cdef uint32_t _is_refskip
+
+# factory methods
+cdef makeAlignedSegment(bam1_t * src, AlignmentFile alignment_file)
+cdef makePileupColumn(bam_pileup1_t ** plp, int tid, int pos, int n_pu, AlignmentFile alignment_file)
+cdef inline makePileupRead(bam_pileup1_t * src, AlignmentFile alignment_file)
+cdef inline uint32_t get_alignment_length(bam1_t * src)
--- /dev/null
+# cython: embedsignature=True
+# cython: profile=True
+###############################################################################
+###############################################################################
+# Cython wrapper for SAM/BAM/CRAM files based on htslib
+###############################################################################
+# The principal classes defined in this module are:
+#
+# class AlignedSegment an aligned segment (read)
+#
+# class PileupColumn a collection of segments (PileupRead) aligned to
+# a particular genomic position.
+#
+# class PileupRead an AlignedSegment aligned to a particular genomic
+# position. Contains additional attributes with respect
+# to this.
+#
+# Additionally this module defines numerous additional classes that are part
+# of the internal API. These are:
+#
+# Various iterator classes to iterate over alignments in sequential (IteratorRow)
+# or in a stacked fashion (IteratorColumn):
+#
+# class IteratorRow
+# class IteratorRowRegion
+# class IteratorRowHead
+# class IteratorRowAll
+# class IteratorRowAllRefs
+# class IteratorRowSelection
+#
+###############################################################################
+#
+# The MIT License
+#
+# Copyright (c) 2015 Andreas Heger
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+#
+###############################################################################
+import re
+import array
+import ctypes
+import struct
+
+cimport cython
+from cpython cimport array as c_array
+from cpython.version cimport PY_MAJOR_VERSION
+from cpython cimport PyErr_SetString, PyBytes_FromStringAndSize
+from libc.string cimport strchr
+
+from pysam.libcutils cimport force_bytes, force_str, \
+ charptr_to_str, charptr_to_bytes
+from pysam.libcutils cimport qualities_to_qualitystring, qualitystring_to_array, \
+ array_to_qualitystring
+
+# Constants for binary tag conversion
+cdef char * htslib_types = 'cCsSiIf'
+cdef char * parray_types = 'bBhHiIf'
+
+# translation tables
+
+# cigar code to character and vice versa
+cdef char* CODE2CIGAR = "MIDNSHP=XB"
+cdef int NCIGAR_CODES = 10
+
+if PY_MAJOR_VERSION >= 3:
+ CIGAR2CODE = dict([y, x] for x, y in enumerate(CODE2CIGAR))
+else:
+ CIGAR2CODE = dict([ord(y), x] for x, y in enumerate(CODE2CIGAR))
+
+CIGAR_REGEX = re.compile(r"(\d+)([MIDNSHP=XB])")
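+
+# Worked example (added commentary, not part of the original source):
+# CIGAR_REGEX.findall("10M2I5M") yields [('10', 'M'), ('2', 'I'), ('5', 'M')],
+# and CIGAR2CODE maps each operation character back to its numerical
+# BAM code, e.g. CIGAR2CODE[ord('M')] == 0 and CIGAR2CODE[ord('I')] == 1.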
+
+#####################################################################
+# C multiplication with wrapping around
+cdef inline uint32_t c_mul(uint32_t a, uint32_t b):
+ return (a * b) & 0xffffffff
+
+
+#####################################################################
+# typecode guessing
+cdef inline char map_typecode_htslib_to_python(uint8_t s):
+ """map an htslib typecode to the corresponding python typecode
+ to be used in the struct or array modules."""
+
+ # map type from htslib to python array
+ cdef char * f = strchr(htslib_types, s)
+
+ if f == NULL:
+ return 0
+ return parray_types[f - htslib_types]
+
+cdef inline uint8_t map_typecode_python_to_htslib(char s):
+ """determine value type from type code of array"""
+ cdef char * f = strchr(parray_types, s)
+ if f == NULL:
+ return 0
+ return htslib_types[f - parray_types]
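+
+# Worked example (added commentary): the two tables above are parallel,
+# so map_typecode_htslib_to_python(ord('c')) == ord('b') (htslib int8
+# values go into a python array of typecode 'b'), and
+# map_typecode_python_to_htslib(ord('h')) == ord('s') maps back.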
+
+# optional tag data manipulation
+cdef convert_binary_tag(uint8_t * tag):
+ """return bytesize, number of values and array of values
+ in aux_data memory location pointed to by tag."""
+ cdef uint8_t auxtype
+ cdef uint8_t byte_size
+ cdef int32_t nvalues
+ # get byte size
+ auxtype = tag[0]
+ byte_size = aux_type2size(auxtype)
+ tag += 1
+ # get number of values in array
+ nvalues = (<int32_t*>tag)[0]
+ tag += 4
+
+ # define python array
+ cdef c_array.array c_values = array.array(
+ chr(map_typecode_htslib_to_python(auxtype)))
+ c_array.resize(c_values, nvalues)
+
+ # copy data
+ memcpy(c_values.data.as_voidptr, <uint8_t*>tag, nvalues * byte_size)
+
+ # no need to check for endian-ness as bam1_core_t fields
+ # and aux_data are in host endian-ness. See sam.c and calls
+ # to swap_data
+ return byte_size, nvalues, c_values
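+
+# Worked example (added commentary): for an aux array of htslib type 'S'
+# (uint16) holding the values [10, 20], the memory after the tag name is
+# the type byte b'S', an int32 count of 2, then the packed values, and
+# convert_binary_tag would return (2, 2, array('H', [10, 20])).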
+
+
+cdef inline uint8_t get_value_code(value, value_type=None):
+ '''guess type code for a *value*. If *value_type* is None,
+ the type code will be inferred based on the Python type of
+ *value*'''
+ cdef uint8_t typecode
+ cdef char * _char_type
+
+ if value_type is None:
+ if isinstance(value, int):
+ typecode = 'i'
+ elif isinstance(value, float):
+ typecode = 'd'
+ elif isinstance(value, str):
+ typecode = 'Z'
+ elif isinstance(value, bytes):
+ typecode = 'Z'
+ elif isinstance(value, array.array) or \
+ isinstance(value, list) or \
+ isinstance(value, tuple):
+ typecode = 'B'
+ else:
+ return 0
+ else:
+ if value_type not in 'Zidf':
+ return 0
+ value_type = force_bytes(value_type)
+ _char_type = value_type
+ typecode = (<uint8_t*>_char_type)[0]
+
+ return typecode
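+
+# Worked example (added commentary): with value_type=None, an int value
+# maps to 'i', a float to 'd', str/bytes to 'Z' and array/list/tuple
+# values to 'B'; get_value_code(3, 'x') returns 0 because 'x' is not one
+# of the permitted codes 'Zidf'.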
+
+
+cdef inline bytes getTypecode(value, maximum_value=None):
+ '''returns the value typecode of a value.
+
+    If *maximum_value* is specified, the appropriate type is
+    returned for a range where *value* is the minimum.
+ '''
+
+ if maximum_value is None:
+ maximum_value = value
+
+ cdef bytes valuetype
+
+ t = type(value)
+
+ if t is float:
+ valuetype = b'f'
+ elif t is int:
+ # signed ints
+ if value < 0:
+ if value >= -128 and maximum_value < 128:
+ valuetype = b'c'
+ elif value >= -32768 and maximum_value < 32768:
+ valuetype = b's'
+ elif value < -2147483648 or maximum_value >= 2147483648:
+ raise ValueError(
+ "at least one signed integer out of range of "
+ "BAM/SAM specification")
+ else:
+ valuetype = b'i'
+ # unsigned ints
+ else:
+ if maximum_value < 256:
+ valuetype = b'C'
+ elif maximum_value < 65536:
+ valuetype = b'S'
+ elif maximum_value >= 4294967296:
+ raise ValueError(
+ "at least one integer out of range of BAM/SAM specification")
+ else:
+ valuetype = b'I'
+ else:
+ # Note: hex strings (H) are not supported yet
+ if t is not bytes:
+ value = value.encode('ascii')
+ if len(value) == 1:
+ valuetype = b'A'
+ else:
+ valuetype = b'Z'
+
+ return valuetype
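+
+# Worked example (added commentary): getTypecode(-1) == b'c' (int8),
+# getTypecode(200) == b'C' (uint8), getTypecode(40000) == b'S' (uint16),
+# getTypecode(2.5) == b'f', getTypecode("A") == b'A' and
+# getTypecode("AB") == b'Z'.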
+
+
+cdef inline packTags(tags):
+ """pack a list of tags. Each tag is a tuple of (tag, tuple).
+
+ Values are packed into the most space efficient data structure
+ possible unless the tag contains a third field with the typecode.
+
+ Returns a format string and the associated list of arguments
+ to be used in a call to struct.pack_into.
+ """
+ fmts, args = ["<"], []
+
+ cdef char array_typecode
+
+ datatype2format = {
+ b'c': ('b', 1),
+ b'C': ('B', 1),
+ b's': ('h', 2),
+ b'S': ('H', 2),
+ b'i': ('i', 4),
+ b'I': ('I', 4),
+ b'f': ('f', 4),
+ b'A': ('c', 1)}
+
+ for tag in tags:
+
+ if len(tag) == 2:
+ pytag, value = tag
+ valuetype = None
+ elif len(tag) == 3:
+ pytag, value, valuetype = tag
+ else:
+            raise ValueError("malformed tag: %s" % str(tag))
+
+ pytag = force_bytes(pytag)
+ valuetype = force_bytes(valuetype)
+ t = type(value)
+
+ if t is tuple or t is list:
+ # binary tags from tuples or lists
+ if valuetype is None:
+                # automatically determine value type from the minimum
+                # and maximum values. If there is a mix of types, the
+                # result is undefined.
+ valuetype = getTypecode(min(value), max(value))
+
+ if valuetype not in datatype2format:
+ raise ValueError("invalid value type '%s'" % valuetype)
+
+ datafmt = "2sccI%i%s" % (len(value), datatype2format[valuetype][0])
+ args.extend([pytag[:2],
+ b"B",
+ valuetype,
+ len(value)] + list(value))
+
+ elif isinstance(value, array.array):
+ # binary tags from arrays
+ if valuetype is None:
+ array_typecode = map_typecode_python_to_htslib(ord(value.typecode))
+
+ if array_typecode == 0:
+ raise ValueError("unsupported type code '{}'"
+ .format(value.typecode))
+
+ valuetype = force_bytes(chr(array_typecode))
+
+ if valuetype not in datatype2format:
+ raise ValueError("invalid value type '%s' (%s)" %
+ (valuetype, type(valuetype)))
+
+ # use array.tostring() to retrieve byte representation and
+ # save as bytes
+ datafmt = "2sccI%is" % (len(value) * datatype2format[valuetype][1])
+ args.extend([pytag[:2],
+ b"B",
+ valuetype,
+ len(value),
+ force_bytes(value.tostring())])
+
+ else:
+ if valuetype is None:
+ valuetype = getTypecode(value)
+
+ if valuetype in b"AZ":
+ value = force_bytes(value)
+
+ if valuetype == b"Z":
+ datafmt = "2sc%is" % (len(value)+1)
+ else:
+ datafmt = "2sc%s" % datatype2format[valuetype][0]
+
+ args.extend([pytag[:2],
+ valuetype,
+ value])
+
+ fmts.append(datafmt)
+
+ return "".join(fmts), args
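+
+# Worked example (added commentary): packTags([("NM", 2)]) returns
+# ("<2scB", [b"NM", b"C", 2]); struct.pack("<2scB", b"NM", b"C", 2)
+# then yields b"NMC\x02", i.e. the BAM aux encoding of NM:C:2.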
+
+
+cdef inline int32_t calculateQueryLength(bam1_t * src):
+ """return query length computed from CIGAR alignment.
+
+ Return 0 if there is no CIGAR alignment.
+ """
+
+ cdef uint32_t * cigar_p = pysam_bam_get_cigar(src)
+
+ if cigar_p == NULL:
+ return 0
+
+ cdef uint32_t k, qpos
+ cdef int op
+ qpos = 0
+
+ for k from 0 <= k < pysam_get_n_cigar(src):
+ op = cigar_p[k] & BAM_CIGAR_MASK
+
+ if op == BAM_CMATCH or \
+ op == BAM_CINS or \
+ op == BAM_CSOFT_CLIP or \
+ op == BAM_CHARD_CLIP or \
+ op == BAM_CEQUAL or \
+ op == BAM_CDIFF:
+ qpos += cigar_p[k] >> BAM_CIGAR_SHIFT
+
+ return qpos
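+
+# Worked example (added commentary): for a CIGAR of "3S5M2I4M" the
+# query length is 3 + 5 + 2 + 4 == 14; deletions (D) and reference
+# skips (N) do not consume query bases and are not counted.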
+
+
+cdef inline int32_t getQueryStart(bam1_t *src) except -1:
+ cdef uint32_t * cigar_p
+ cdef uint32_t k, op
+ cdef uint32_t start_offset = 0
+
+ if pysam_get_n_cigar(src):
+        cigar_p = pysam_bam_get_cigar(src)
+ for k from 0 <= k < pysam_get_n_cigar(src):
+ op = cigar_p[k] & BAM_CIGAR_MASK
+ if op == BAM_CHARD_CLIP:
+ if start_offset != 0 and start_offset != src.core.l_qseq:
+ PyErr_SetString(ValueError, 'Invalid clipping in CIGAR string')
+ return -1
+ elif op == BAM_CSOFT_CLIP:
+ start_offset += cigar_p[k] >> BAM_CIGAR_SHIFT
+ else:
+ break
+
+ return start_offset
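+
+# Worked example (added commentary): for a CIGAR of "3S5M" the aligned
+# portion of the query starts at offset 3; leading hard clips do not
+# consume query bases, so "2H3S5M" also yields 3.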
+
+
+cdef inline int32_t getQueryEnd(bam1_t *src) except -1:
+ cdef uint32_t * cigar_p
+ cdef uint32_t k, op
+ cdef uint32_t end_offset = src.core.l_qseq
+
+ # if there is no sequence, compute length from cigar string
+ if end_offset == 0:
+ end_offset = calculateQueryLength(src)
+
+ # walk backwards in cigar string
+ if pysam_get_n_cigar(src) > 1:
+        cigar_p = pysam_bam_get_cigar(src)
+ for k from pysam_get_n_cigar(src) > k >= 1:
+ op = cigar_p[k] & BAM_CIGAR_MASK
+ if op == BAM_CHARD_CLIP:
+ if end_offset != 0 and end_offset != src.core.l_qseq:
+ PyErr_SetString(ValueError,
+ 'Invalid clipping in CIGAR string')
+ return -1
+ elif op == BAM_CSOFT_CLIP:
+ end_offset -= cigar_p[k] >> BAM_CIGAR_SHIFT
+ else:
+ break
+
+ return end_offset
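+
+# Worked example (added commentary): for a read of length 8 with a
+# CIGAR of "5M3S", the aligned portion of the query ends at offset
+# 8 - 3 == 5 (exclusive).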
+
+
+cdef inline bytes getSequenceInRange(bam1_t *src,
+ uint32_t start,
+ uint32_t end):
+    """return python bytes object of the sequence in a bam1_t object.
+ """
+
+ cdef uint8_t * p
+ cdef uint32_t k
+ cdef char * s
+
+ if not src.core.l_qseq:
+ return None
+
+ seq = PyBytes_FromStringAndSize(NULL, end - start)
+ s = <char*>seq
+ p = pysam_bam_get_seq(src)
+
+ for k from start <= k < end:
+ # equivalent to seq_nt16_str[bam1_seqi(s, i)] (see bam.c)
+ # note: do not use string literal as it will be a python string
+ s[k-start] = seq_nt16_str[p[k/2] >> 4 * (1 - k%2) & 0xf]
+
+ return charptr_to_bytes(seq)
+
+
+cdef inline object getQualitiesInRange(bam1_t *src,
+ uint32_t start,
+ uint32_t end):
+ """return python array of quality values from a bam1_t object"""
+
+ cdef uint8_t * p
+ cdef uint32_t k
+
+ p = pysam_bam_get_qual(src)
+ if p[0] == 0xff:
+ return None
+
+ # 'B': unsigned char
+ cdef c_array.array result = array.array('B', [0])
+ c_array.resize(result, end - start)
+
+ # copy data
+ memcpy(result.data.as_voidptr, <void*>&p[start], end - start)
+
+ return result
+
+
+#####################################################################
+## private factory methods
+cdef class AlignedSegment
+cdef makeAlignedSegment(bam1_t * src, AlignmentFile alignment_file):
+ '''return an AlignedSegment object constructed from `src`'''
+ # note that the following does not call __init__
+ cdef AlignedSegment dest = AlignedSegment.__new__(AlignedSegment)
+ dest._delegate = bam_dup1(src)
+ dest._alignment_file = alignment_file
+ return dest
+
+
+cdef class PileupColumn
+cdef makePileupColumn(bam_pileup1_t ** plp, int tid, int pos,
+ int n_pu, AlignmentFile alignment_file):
+ '''return a PileupColumn object constructed from pileup in `plp` and
+ setting additional attributes.
+
+ '''
+ # note that the following does not call __init__
+ cdef PileupColumn dest = PileupColumn.__new__(PileupColumn)
+ dest._alignment_file = alignment_file
+ dest.plp = plp
+ dest.tid = tid
+ dest.pos = pos
+ dest.n_pu = n_pu
+ return dest
+
+cdef class PileupRead
+cdef inline makePileupRead(bam_pileup1_t * src, AlignmentFile alignment_file):
+    '''return a PileupRead object constructed from a bam_pileup1_t * object.'''
+ cdef PileupRead dest = PileupRead.__new__(PileupRead)
+ dest._alignment = makeAlignedSegment(src.b, alignment_file)
+ dest._qpos = src.qpos
+ dest._indel = src.indel
+ dest._level = src.level
+ dest._is_del = src.is_del
+ dest._is_head = src.is_head
+ dest._is_tail = src.is_tail
+ dest._is_refskip = src.is_refskip
+ return dest
+
+
+cdef inline uint32_t get_alignment_length(bam1_t * src):
+ cdef int k = 0
+ cdef uint32_t l = 0
+ if src == NULL:
+ return 0
+ cdef uint32_t * cigar_p = bam_get_cigar(src)
+ if cigar_p == NULL:
+ return 0
+ cdef int op
+ cdef int n = pysam_get_n_cigar(src)
+ for k from 0 <= k < n:
+ op = cigar_p[k] & BAM_CIGAR_MASK
+ if op == BAM_CSOFT_CLIP or op == BAM_CHARD_CLIP:
+ continue
+ l += cigar_p[k] >> BAM_CIGAR_SHIFT
+ return l
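+
+# Worked example (added commentary): for a CIGAR of "3S10M2D5M" the
+# alignment length is 10 + 2 + 5 == 17; only soft- and hard-clipped
+# bases are excluded, so insertions, deletions and reference skips all
+# contribute to the length of the expanded sequence built below.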
+
+
+# TODO: avoid string copying for getSequenceInRange, reconstituteSequenceFromMD, ...
+cdef inline bytes build_alignment_sequence(bam1_t * src):
+ """return expanded sequence from MD tag.
+
+ The sequence includes substitutions and both insertions in the
+ reference as well as deletions to the reference sequence. Combine
+ with the cigar string to reconstitute the query or the reference
+ sequence.
+
+ Positions corresponding to `N` (skipped region from the reference)
+ in the CIGAR string will not appear in the returned sequence. The
+ MD should correspondingly not contain these. Thus proper tags are::
+
+ Deletion from the reference: cigar=5M1D5M MD=5^C5
+ Skipped region from reference: cigar=5M1N5M MD=10
+
+ Returns
+ -------
+
+    None, if the alignment has no CIGAR. If no MD tag is present, the
+    sequence is returned as built from the CIGAR alone, without
+    reference bases filled in.
+
+ """
+ if src == NULL:
+ return None
+
+ cdef uint32_t start = getQueryStart(src)
+ cdef uint32_t end = getQueryEnd(src)
+ # get read sequence, taking into account soft-clipping
+ r = getSequenceInRange(src, start, end)
+ cdef char * read_sequence = r
+ cdef uint32_t * cigar_p = pysam_bam_get_cigar(src)
+ if cigar_p == NULL:
+ return None
+
+ cdef uint32_t r_idx = 0
+ cdef int op
+ cdef uint32_t k, i, l, x
+ cdef int nmatches = 0
+ cdef int s_idx = 0
+
+ cdef uint32_t max_len = get_alignment_length(src)
+ if max_len == 0:
+ raise ValueError("could not determine alignment length")
+
+ cdef char * s = <char*>calloc(max_len + 1, sizeof(char))
+ if s == NULL:
+ raise ValueError(
+            "could not allocate sequence of length %i" % max_len)
+
+ for k from 0 <= k < pysam_get_n_cigar(src):
+ op = cigar_p[k] & BAM_CIGAR_MASK
+ l = cigar_p[k] >> BAM_CIGAR_SHIFT
+ if op == BAM_CMATCH or op == BAM_CEQUAL or op == BAM_CDIFF:
+ for i from 0 <= i < l:
+ s[s_idx] = read_sequence[r_idx]
+ r_idx += 1
+ s_idx += 1
+ elif op == BAM_CDEL:
+ for i from 0 <= i < l:
+ s[s_idx] = '-'
+ s_idx += 1
+ elif op == BAM_CREF_SKIP:
+ pass
+ elif op == BAM_CINS:
+ for i from 0 <= i < l:
+ # encode insertions into reference as lowercase
+ s[s_idx] = read_sequence[r_idx] + 32
+ r_idx += 1
+ s_idx += 1
+ elif op == BAM_CSOFT_CLIP:
+ pass
+ elif op == BAM_CHARD_CLIP:
+ pass # advances neither
+ elif op == BAM_CPAD:
+ raise NotImplementedError(
+ "Padding (BAM_CPAD, 6) is currently not supported. "
+ "Please implement. Sorry about that.")
+
+ cdef uint8_t * md_tag_ptr = bam_aux_get(src, "MD")
+ if md_tag_ptr == NULL:
+ seq = PyBytes_FromStringAndSize(s, s_idx)
+ free(s)
+ return seq
+
+ cdef char * md_tag = <char*>bam_aux2Z(md_tag_ptr)
+ cdef int md_idx = 0
+ s_idx = 0
+
+ while md_tag[md_idx] != 0:
+ # c is numerical
+ if md_tag[md_idx] >= 48 and md_tag[md_idx] <= 57:
+ nmatches *= 10
+ nmatches += md_tag[md_idx] - 48
+ md_idx += 1
+ continue
+ else:
+ # save matches up to this point, skipping insertions
+ for x from 0 <= x < nmatches:
+ while s[s_idx] >= 'a':
+ s_idx += 1
+ s_idx += 1
+ while s[s_idx] >= 'a':
+ s_idx += 1
+
+ r_idx += nmatches
+ nmatches = 0
+ if md_tag[md_idx] == '^':
+ md_idx += 1
+ while md_tag[md_idx] >= 65 and md_tag[md_idx] <= 90:
+ assert s[s_idx] == '-'
+ s[s_idx] = md_tag[md_idx]
+ s_idx += 1
+ md_idx += 1
+ else:
+ # save mismatch and change to lower case
+ s[s_idx] = md_tag[md_idx] + 32
+ s_idx += 1
+ r_idx += 1
+ md_idx += 1
+
+ # save matches up to this point, skipping insertions
+ for x from 0 <= x < nmatches:
+ while s[s_idx] >= 'a':
+ s_idx += 1
+ s_idx += 1
+ while s[s_idx] >= 'a':
+ s_idx += 1
+
+ seq = PyBytes_FromStringAndSize(s, s_idx)
+ free(s)
+
+ return seq
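+
+# Worked example (added commentary): for a read with query sequence
+# "ACGT", CIGAR "4M" and MD tag "2A1", build_alignment_sequence returns
+# b"ACaT": matching bases are kept from the read, while the mismatched
+# reference base from the MD tag is inserted in lower case.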
+
+
+cdef class AlignedSegment:
+ '''Class representing an aligned segment.
+
+ This class stores a handle to the samtools C-structure representing
+ an aligned read. Member read access is forwarded to the C-structure
+ and converted into python objects. This implementation should be fast,
+ as only the data needed is converted.
+
+ For write access, the C-structure is updated in-place. This is
+ not the most efficient way to build BAM entries, as the variable
+ length data is concatenated and thus needs to be resized if
+    a field is updated. Furthermore, the BAM entry might temporarily be
+    in an inconsistent state while several fields are updated in turn.
+
+ One issue to look out for is that the sequence should always
+ be set *before* the quality scores. Setting the sequence will
+ also erase any quality scores that were set previously.
+ '''
+
+ # Now only called when instances are created from Python
+ def __init__(self):
+ # see bam_init1
+ self._delegate = <bam1_t*>calloc(1, sizeof(bam1_t))
+ # allocate some memory. If size is 0, calloc does not return a
+ # pointer that can be passed to free() so allocate 40 bytes
+ # for a new read
+ self._delegate.m_data = 40
+ self._delegate.data = <uint8_t *>calloc(
+ self._delegate.m_data, 1)
+ self._delegate.l_data = 0
+ # set some data to make read approximately legit.
+ # Note, SAM writing fails with q_name of length 0
+ self._delegate.core.l_qname = 0
+ self._delegate.core.tid = -1
+ self._delegate.core.pos = -1
+ self._delegate.core.mtid = -1
+ self._delegate.core.mpos = -1
+
+ # caching for selected fields
+ self.cache_query_qualities = None
+ self.cache_query_alignment_qualities = None
+ self.cache_query_sequence = None
+ self.cache_query_alignment_sequence = None
+
+ def __dealloc__(self):
+ bam_destroy1(self._delegate)
+
+ def __str__(self):
+ """return string representation of alignment.
+
+ The representation is an approximate :term:`SAM` format, because
+        an aligned read might not be associated with an :term:`AlignmentFile`.
+ As a result :term:`tid` is shown instead of the reference name.
+ Similarly, the tags field is returned in its parsed state.
+
+ To get a valid SAM record, use :meth:`tostring`.
+ """
+ # sam-parsing is done in sam.c/bam_format1_core which
+ # requires a valid header.
+ return "\t".join(map(str, (self.query_name,
+ self.flag,
+ self.reference_id,
+ self.reference_start,
+ self.mapping_quality,
+ self.cigarstring,
+ self.next_reference_id,
+ self.next_reference_start,
+ self.query_alignment_length,
+ self.query_sequence,
+ self.query_qualities,
+ self.tags)))
+
+ def __copy__(self):
+ return makeAlignedSegment(self._delegate, self._alignment_file)
+
+ def __deepcopy__(self, memo):
+ return makeAlignedSegment(self._delegate, self._alignment_file)
+
+ def compare(self, AlignedSegment other):
+        '''return -1, 0 or 1, if the contents of this record are
+        binary <, = or > to *other*
+
+ '''
+
+ cdef int retval, x
+ cdef bam1_t *t
+ cdef bam1_t *o
+
+ t = self._delegate
+ o = other._delegate
+
+ # uncomment for debugging purposes
+ # cdef unsigned char * oo, * tt
+ # tt = <unsigned char*>(&t.core)
+ # oo = <unsigned char*>(&o.core)
+ # for x from 0 <= x < sizeof( bam1_core_t): print x, tt[x], oo[x]
+ # tt = <unsigned char*>(t.data)
+ # oo = <unsigned char*>(o.data)
+ # for x from 0 <= x < max(t.l_data, o.l_data): print x, tt[x], oo[x], chr(tt[x]), chr(oo[x])
+
+ # Fast-path test for object identity
+ if t == o:
+ return 0
+
+ retval = memcmp(&t.core, &o.core, sizeof(bam1_core_t))
+
+ if retval:
+ return retval
+ # cmp(t.l_data, o.l_data)
+ retval = (t.l_data > o.l_data) - (t.l_data < o.l_data)
+ if retval:
+ return retval
+ return memcmp(t.data, o.data, t.l_data)
+
+ def __richcmp__(self, AlignedSegment other, int op):
+ if op == 2: # == operator
+ return self.compare(other) == 0
+ elif op == 3: # != operator
+ return self.compare(other) != 0
+ else:
+ return NotImplemented
+
+ def __hash__(self):
+ cdef bam1_t * src = self._delegate
+ cdef int x
+
+ # see http://effbot.org/zone/python-hash.htm
+ cdef uint8_t * c = <uint8_t *>&src.core
+ cdef uint32_t hash_value = c[0]
+ for x from 1 <= x < sizeof(bam1_core_t):
+ hash_value = c_mul(hash_value, 1000003) ^ c[x]
+ c = <uint8_t *>src.data
+ for x from 0 <= x < src.l_data:
+ hash_value = c_mul(hash_value, 1000003) ^ c[x]
+
+ return hash_value
+
+ cpdef tostring(self, AlignmentFile_t htsfile):
+ """returns a string representation of the aligned segment.
+
+ The output format is valid SAM format.
+
+ Parameters
+ ----------
+
+ htsfile -- AlignmentFile object to map numerical
+ identifiers to chromosome names.
+ """
+ cdef int n_targets = htsfile.header.n_targets
+
+ if self._delegate.core.tid >= n_targets \
+ or self._delegate.core.mtid >= n_targets:
+ raise ValueError('htsfile does not match aligned segment')
+
+ cdef kstring_t line
+ line.l = line.m = 0
+ line.s = NULL
+
+ if sam_format1(htsfile.header, self._delegate, &line) < 0:
+ if line.m:
+ free(line.s)
+ raise ValueError('sam_format failed')
+
+ ret = force_str(line.s[:line.l])
+
+ if line.m:
+ free(line.s)
+
+ return ret
+
+ ########################################################
+ ## Basic attributes in order of appearance in SAM format
+ property query_name:
+ """the query template name (None if not present)"""
+ def __get__(self):
+ cdef bam1_t * src
+ src = self._delegate
+ if pysam_get_l_qname(src) == 0:
+ return None
+ return charptr_to_str(<char *>pysam_bam_get_qname(src))
+
+ def __set__(self, qname):
+ if qname is None or len(qname) == 0:
+ return
+
+ if len(qname) >= 255:
+            raise ValueError("query name length out of range {} > 254".format(
+ len(qname)))
+
+ qname = force_bytes(qname)
+ cdef bam1_t * src
+ cdef int l
+ cdef char * p
+
+ src = self._delegate
+ p = pysam_bam_get_qname(src)
+
+ # the qname is \0 terminated
+ l = len(qname) + 1
+ pysam_bam_update(src,
+ pysam_get_l_qname(src),
+ l,
+ <uint8_t*>p)
+
+ pysam_set_l_qname(src, l)
+
+ # re-acquire pointer to location in memory
+ # as it might have moved
+ p = pysam_bam_get_qname(src)
+
+ strncpy(p, qname, l)
+
+ property flag:
+        """the bitwise flag of the read"""
+ def __get__(self):
+ return pysam_get_flag(self._delegate)
+ def __set__(self, flag):
+ pysam_set_flag(self._delegate, flag)
+
+ property reference_name:
+ """:term:`reference` name (None if no AlignmentFile is associated)"""
+ def __get__(self):
+ if self._alignment_file is not None:
+ return self._alignment_file.getrname(self._delegate.core.tid)
+ return None
+
+ property reference_id:
+ """:term:`reference` ID
+
+ .. note::
+
+ This field contains the index of the reference sequence in
+ the sequence dictionary. To obtain the name of the
+ reference sequence, use
+ :meth:`pysam.AlignmentFile.getrname()`
+
+ """
+ def __get__(self): return self._delegate.core.tid
+ def __set__(self, tid): self._delegate.core.tid = tid
+
+ property reference_start:
+ """0-based leftmost coordinate"""
+ def __get__(self): return self._delegate.core.pos
+ def __set__(self, pos):
+ ## setting the position requires updating the "bin" attribute
+ cdef bam1_t * src
+ src = self._delegate
+ src.core.pos = pos
+ if pysam_get_n_cigar(src):
+ pysam_set_bin(src,
+ hts_reg2bin(
+ src.core.pos,
+ bam_endpos(src),
+ 14,
+ 5))
+ else:
+ pysam_set_bin(src,
+ hts_reg2bin(
+ src.core.pos,
+ src.core.pos + 1,
+ 14,
+ 5))
+
+ property mapping_quality:
+ """mapping quality"""
+ def __get__(self):
+ return pysam_get_qual(self._delegate)
+ def __set__(self, qual):
+ pysam_set_qual(self._delegate, qual)
+
+ property cigarstring:
+ '''the :term:`cigar` alignment as a string.
+
+ The cigar string is a string of alternating integers
+ and characters denoting the length and the type of
+ an operation.
+
+ .. note::
+ The order length,operation is specified in the
+ SAM format. It is different from the order of
+ the :attr:`cigar` property.
+
+ Returns None if not present.
+
+ To unset the cigarstring, assign None or the
+ empty string.
+ '''
+ def __get__(self):
+ c = self.cigartuples
+ if c is None:
+ return None
+            else:
+                # cigartuples stores (op, length); SAM writes length before op
+                return "".join(["%i%c" % (y, CODE2CIGAR[x]) for x, y in c])
+
+ def __set__(self, cigar):
+ if cigar is None or len(cigar) == 0:
+ self.cigartuples = []
+ else:
+ parts = CIGAR_REGEX.findall(cigar)
+                # convert from SAM length-op order to (op, length) tuples
+                self.cigartuples = [(CIGAR2CODE[ord(y)], int(x)) for x, y in parts]
+
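+    # Worked example (added commentary): assigning "5M1I4M" to
+    # cigarstring sets cigartuples to [(0, 5), (1, 1), (0, 4)];
+    # reading cigarstring back round-trips to "5M1I4M".
+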
+ # TODO
+ # property cigar:
+ # """the cigar alignment"""
+
+ property next_reference_id:
+ """the :term:`reference` id of the mate/next read."""
+ def __get__(self): return self._delegate.core.mtid
+ def __set__(self, mtid):
+ self._delegate.core.mtid = mtid
+
+ property next_reference_name:
+ """:term:`reference` name of the mate/next read (None if no
+ AlignmentFile is associated)"""
+ def __get__(self):
+ if self._alignment_file is not None:
+ return self._alignment_file.getrname(self._delegate.core.mtid)
+ return None
+
+ property next_reference_start:
+ """the position of the mate/next read."""
+ def __get__(self):
+ return self._delegate.core.mpos
+ def __set__(self, mpos):
+ self._delegate.core.mpos = mpos
+
+ property query_length:
+ """the length of the query/read.
+
+ This value corresponds to the length of the sequence supplied
+ in the BAM/SAM file. The length of a query is 0 if there is no
+ sequence in the BAM/SAM file. In those cases, the read length
+ can be inferred from the CIGAR alignment, see
+ :meth:`pysam.AlignedSegment.infer_query_length`.
+
+ The length includes soft-clipped bases and is equal to
+ ``len(query_sequence)``.
+
+        This property is read-only; it is updated implicitly when
+        a new query sequence is assigned.
+
+ Returns 0 if not available.
+
+ """
+ def __get__(self):
+ return self._delegate.core.l_qseq
+
+ property template_length:
+ """the observed query template length"""
+ def __get__(self):
+ return self._delegate.core.isize
+ def __set__(self, isize):
+ self._delegate.core.isize = isize
+
+ property query_sequence:
+ """read sequence bases, including :term:`soft clipped` bases
+ (None if not present).
+
+ Note that assigning to seq will invalidate any quality scores.
+ Thus, to in-place edit the sequence and quality scores, copies of
+ the quality scores need to be taken. Consider trimming for example::
+
+ q = read.query_qualities
+           read.query_sequence = read.query_sequence[5:10]
+ read.query_qualities = q[5:10]
+
+ The sequence is returned as it is stored in the BAM file. Some mappers
+ might have stored a reverse complement of the original read
+ sequence.
+ """
+ def __get__(self):
+ if self.cache_query_sequence:
+ return self.cache_query_sequence
+
+ cdef bam1_t * src
+ cdef char * s
+ src = self._delegate
+
+ if src.core.l_qseq == 0:
+ return None
+
+ self.cache_query_sequence = force_str(getSequenceInRange(
+ src, 0, src.core.l_qseq))
+ return self.cache_query_sequence
+
+ def __set__(self, seq):
+ # samtools manages sequence and quality length memory together
+        # if no quality information is present, the first byte is set to 0xff.
+ cdef bam1_t * src
+ cdef uint8_t * p
+ cdef char * s
+ cdef int l, k
+ cdef Py_ssize_t nbytes_new, nbytes_old
+
+        if seq is None:
+ l = 0
+ else:
+ l = len(seq)
+ seq = force_bytes(seq)
+
+ src = self._delegate
+
+ # as the sequence is stored in half-bytes, the total length (sequence
+ # plus quality scores) is (l+1)/2 + l
+ nbytes_new = (l + 1) / 2 + l
+ nbytes_old = (src.core.l_qseq + 1) / 2 + src.core.l_qseq
+
+ # acquire pointer to location in memory
+ p = pysam_bam_get_seq(src)
+ src.core.l_qseq = l
+
+ # change length of data field
+ pysam_bam_update(src,
+ nbytes_old,
+ nbytes_new,
+ p)
+
+ if l > 0:
+ # re-acquire pointer to location in memory
+ # as it might have moved
+ p = pysam_bam_get_seq(src)
+ for k from 0 <= k < nbytes_new:
+ p[k] = 0
+ # convert to C string
+ s = seq
+ for k from 0 <= k < l:
+ p[k/2] |= seq_nt16_table[<unsigned char>s[k]] << 4 * (1 - k % 2)
+
+ # erase qualities
+ p = pysam_bam_get_qual(src)
+ p[0] = 0xff
+
+ self.cache_query_sequence = force_str(seq)
+
+ # clear cached values for quality values
+ self.cache_query_qualities = None
+ self.cache_query_alignment_qualities = None
+
+ property query_qualities:
+ """read sequence base qualities, including :term:`soft
+ clipped` bases (None if not present).
+
+ Quality scores are returned as a python array of unsigned
+ chars. Note that this is not the ASCII-encoded value typically
+ seen in FASTQ or SAM formatted files. Thus, no offset of 33
+ needs to be subtracted.
+
+ Note that to set quality scores the sequence has to be set
+ beforehand as this will determine the expected length of the
+ quality score array.
+
+        Assigning to this property raises a ValueError if the lengths
+        of the quality scores and the sequence do not match.
+
+ """
+ def __get__(self):
+
+ if self.cache_query_qualities:
+ return self.cache_query_qualities
+
+ cdef bam1_t * src
+ cdef char * q
+
+ src = self._delegate
+
+ if src.core.l_qseq == 0:
+ return None
+
+ self.cache_query_qualities = getQualitiesInRange(src, 0, src.core.l_qseq)
+ return self.cache_query_qualities
+
+ def __set__(self, qual):
+
+        # note that memory is already allocated via setting the sequence,
+        # hence only the length match of sequence and quality is checked.
+ cdef bam1_t * src
+ cdef uint8_t * p
+ cdef int l
+
+ src = self._delegate
+ p = pysam_bam_get_qual(src)
+ if qual is None or len(qual) == 0:
+ # if absent and there is a sequence: set to 0xff
+ if src.core.l_qseq != 0:
+ p[0] = 0xff
+ return
+
+ # check for length match
+ l = len(qual)
+ if src.core.l_qseq != l:
+ raise ValueError(
+ "quality and sequence mismatch: %i != %i" %
+ (l, src.core.l_qseq))
+
+ # create a python array object filling it
+ # with the quality scores
+
+ # NB: should avoid this copying if qual is
+ # already of the correct type.
+ cdef c_array.array result = c_array.array('B', qual)
+
+ # copy data
+ memcpy(p, result.data.as_voidptr, l)
+
+ # save in cache
+ self.cache_query_qualities = qual
+
+ property bin:
+        """the bin field of the alignment record"""
+ def __get__(self):
+ return pysam_get_bin(self._delegate)
+ def __set__(self, bin):
+ pysam_set_bin(self._delegate, bin)
+
+
+ ##########################################################
+ # Derived simple attributes. These are simple attributes of
+ # AlignedSegment getting and setting values.
+ ##########################################################
+ # 1. Flags
+ ##########################################################
+ property is_paired:
+ """true if read is paired in sequencing"""
+ def __get__(self):
+ return (self.flag & BAM_FPAIRED) != 0
+ def __set__(self,val):
+ pysam_update_flag(self._delegate, val, BAM_FPAIRED)
+
+ property is_proper_pair:
+ """true if read is mapped in a proper pair"""
+ def __get__(self):
+ return (self.flag & BAM_FPROPER_PAIR) != 0
+ def __set__(self,val):
+ pysam_update_flag(self._delegate, val, BAM_FPROPER_PAIR)
+ property is_unmapped:
+ """true if read itself is unmapped"""
+ def __get__(self):
+ return (self.flag & BAM_FUNMAP) != 0
+ def __set__(self, val):
+ pysam_update_flag(self._delegate, val, BAM_FUNMAP)
+ property mate_is_unmapped:
+ """true if the mate is unmapped"""
+ def __get__(self):
+ return (self.flag & BAM_FMUNMAP) != 0
+ def __set__(self,val):
+ pysam_update_flag(self._delegate, val, BAM_FMUNMAP)
+ property is_reverse:
+ """true if read is mapped to reverse strand"""
+ def __get__(self):
+ return (self.flag & BAM_FREVERSE) != 0
+ def __set__(self,val):
+ pysam_update_flag(self._delegate, val, BAM_FREVERSE)
+ property mate_is_reverse:
+        """true if the mate is mapped to the reverse strand"""
+ def __get__(self):
+ return (self.flag & BAM_FMREVERSE) != 0
+ def __set__(self,val):
+ pysam_update_flag(self._delegate, val, BAM_FMREVERSE)
+ property is_read1:
+ """true if this is read1"""
+ def __get__(self):
+ return (self.flag & BAM_FREAD1) != 0
+ def __set__(self,val):
+ pysam_update_flag(self._delegate, val, BAM_FREAD1)
+ property is_read2:
+ """true if this is read2"""
+ def __get__(self):
+ return (self.flag & BAM_FREAD2) != 0
+ def __set__(self, val):
+ pysam_update_flag(self._delegate, val, BAM_FREAD2)
+ property is_secondary:
+ """true if not primary alignment"""
+ def __get__(self):
+ return (self.flag & BAM_FSECONDARY) != 0
+ def __set__(self, val):
+ pysam_update_flag(self._delegate, val, BAM_FSECONDARY)
+ property is_qcfail:
+ """true if QC failure"""
+ def __get__(self):
+ return (self.flag & BAM_FQCFAIL) != 0
+ def __set__(self, val):
+ pysam_update_flag(self._delegate, val, BAM_FQCFAIL)
+ property is_duplicate:
+ """true if optical or PCR duplicate"""
+ def __get__(self):
+ return (self.flag & BAM_FDUP) != 0
+ def __set__(self, val):
+ pysam_update_flag(self._delegate, val, BAM_FDUP)
+ property is_supplementary:
+ """true if this is a supplementary alignment"""
+ def __get__(self):
+ return (self.flag & BAM_FSUPPLEMENTARY) != 0
+ def __set__(self, val):
+ pysam_update_flag(self._delegate, val, BAM_FSUPPLEMENTARY)
+
+ # 2. Coordinates and lengths
+ property reference_end:
+ '''aligned reference position of the read on the reference genome.
+
+ reference_end points to one past the last aligned residue.
+ Returns None if not available (read is unmapped or no cigar
+ alignment present).
+
+ '''
+ def __get__(self):
+ cdef bam1_t * src
+ src = self._delegate
+ if (self.flag & BAM_FUNMAP) or pysam_get_n_cigar(src) == 0:
+ return None
+ return bam_endpos(src)
+
+ property reference_length:
+ '''aligned length of the read on the reference genome.
+
+ This is equal to `aend - pos`. Returns None if not available.'''
+ def __get__(self):
+ cdef bam1_t * src
+ src = self._delegate
+ if (self.flag & BAM_FUNMAP) or pysam_get_n_cigar(src) == 0:
+ return None
+ return bam_endpos(src) - \
+ self._delegate.core.pos
+
+ property query_alignment_sequence:
+ """aligned portion of the read.
+
+ This is a substring of :attr:`seq` that excludes flanking
+ bases that were :term:`soft clipped` (None if not present). It
+ is equal to ``seq[qstart:qend]``.
+
+ SAM/BAM files may include extra flanking bases that are not
+ part of the alignment. These bases may be the result of the
+ Smith-Waterman or other algorithms, which may not require
+ alignments that begin at the first residue or end at the last.
+ In addition, extra sequencing adapters, multiplex identifiers,
+ and low-quality bases that were not considered for alignment
+ may have been retained.
+
+ """
+
+ def __get__(self):
+ if self.cache_query_alignment_sequence:
+ return self.cache_query_alignment_sequence
+
+ cdef bam1_t * src
+ cdef uint32_t start, end
+
+ src = self._delegate
+
+ if src.core.l_qseq == 0:
+ return None
+
+ start = getQueryStart(src)
+ end = getQueryEnd(src)
+
+ self.cache_query_alignment_sequence = force_str(
+ getSequenceInRange(src, start, end))
+ return self.cache_query_alignment_sequence
+
+ property query_alignment_qualities:
+ """aligned query sequence quality values (None if not present). These
+ are the quality values that correspond to :attr:`query`, that
+ is, they exclude qualities of :term:`soft clipped` bases. This
+ is equal to ``qual[qstart:qend]``.
+
+ Quality scores are returned as a python array of unsigned
+ chars. Note that this is not the ASCII-encoded value typically
+ seen in FASTQ or SAM formatted files. Thus, no offset of 33
+ needs to be subtracted.
+
+ This property is read-only.
+
+ """
+ def __get__(self):
+
+ if self.cache_query_alignment_qualities:
+ return self.cache_query_alignment_qualities
+
+ cdef bam1_t * src
+ cdef uint32_t start, end
+
+ src = self._delegate
+
+ if src.core.l_qseq == 0:
+ return None
+
+ start = getQueryStart(src)
+ end = getQueryEnd(src)
+ self.cache_query_alignment_qualities = \
+ getQualitiesInRange(src, start, end)
+ return self.cache_query_alignment_qualities
+
+ property query_alignment_start:
+ """start index of the aligned query portion of the sequence (0-based,
+ inclusive).
+
+        This is the index of the first base in :attr:`seq` that is not
+ soft-clipped.
+
+ """
+ def __get__(self):
+ return getQueryStart(self._delegate)
+
+ property query_alignment_end:
+ """end index of the aligned query portion of the sequence (0-based,
+ exclusive)"""
+ def __get__(self):
+ return getQueryEnd(self._delegate)
+
+ property query_alignment_length:
+ """length of the aligned query sequence.
+
+ This is equal to :attr:`qend` - :attr:`qstart`"""
+ def __get__(self):
+ cdef bam1_t * src
+ src = self._delegate
+ return getQueryEnd(src) - getQueryStart(src)
+
+ #####################################################
+ # Computed properties
+
+ def get_reference_positions(self, full_length=False):
+ """a list of reference positions that this read aligns to.
+
+ By default, this method only returns positions in the
+ reference that are within the alignment. If *full_length* is
+ set, None values will be included for any soft-clipped or
+ unaligned positions within the read. The returned list will
+ thus be of the same length as the read.
+
+ """
+ cdef uint32_t k, i, pos
+ cdef int op
+ cdef uint32_t * cigar_p
+ cdef bam1_t * src
+ cdef bint _full = full_length
+
+ src = self._delegate
+ if pysam_get_n_cigar(src) == 0:
+ return []
+
+ result = []
+ pos = src.core.pos
+ cigar_p = pysam_bam_get_cigar(src)
+
+ for k from 0 <= k < pysam_get_n_cigar(src):
+ op = cigar_p[k] & BAM_CIGAR_MASK
+ l = cigar_p[k] >> BAM_CIGAR_SHIFT
+
+ if op == BAM_CSOFT_CLIP or op == BAM_CINS:
+ if _full:
+ for i from 0 <= i < l:
+ result.append(None)
+ elif op == BAM_CMATCH:
+ for i from pos <= i < pos + l:
+ result.append(i)
+ pos += l
+ elif op == BAM_CDEL or op == BAM_CREF_SKIP:
+ pos += l
+
+ return result
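+
+    # Worked example (added commentary): for a read at reference_start
+    # 100 with CIGAR "2M1D3M", get_reference_positions() returns
+    # [100, 101, 103, 104, 105]; the deleted reference base at 102 is
+    # skipped.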
+
+ def infer_query_length(self, always=True):
+ """inferred read length from CIGAR string.
+
+        If *always* is set to True, the read length will always be
+        inferred from the CIGAR string. If set to False, the length
+        of the read sequence will be returned if it is available.
+
+        Returns 0 if no CIGAR string is present.
+ """
+
+ cdef uint32_t * cigar_p
+ cdef bam1_t * src
+
+ src = self._delegate
+
+ if not always and src.core.l_qseq:
+ return src.core.l_qseq
+
+ return calculateQueryLength(src)
+
+ def get_reference_sequence(self):
+ """return the reference sequence.
+
+ This method requires the MD tag to be set.
+ """
+ cdef uint32_t k, i
+ cdef int op
+ cdef bam1_t * src = self._delegate
+ ref_seq = force_str(build_alignment_sequence(src))
+ if ref_seq is None:
+ raise ValueError("MD tag not present")
+
+ cdef uint32_t * cigar_p = pysam_bam_get_cigar(src)
+ cdef uint32_t r_idx = 0
+ result = []
+ for k from 0 <= k < pysam_get_n_cigar(src):
+ op = cigar_p[k] & BAM_CIGAR_MASK
+ l = cigar_p[k] >> BAM_CIGAR_SHIFT
+ if op == BAM_CMATCH or op == BAM_CEQUAL or op == BAM_CDIFF:
+ for i from 0 <= i < l:
+ result.append(ref_seq[r_idx])
+ r_idx += 1
+ elif op == BAM_CDEL:
+ for i from 0 <= i < l:
+ result.append(ref_seq[r_idx])
+ r_idx += 1
+ elif op == BAM_CREF_SKIP:
+ pass
+ elif op == BAM_CINS:
+ r_idx += l
+ elif op == BAM_CSOFT_CLIP:
+ pass
+ elif op == BAM_CHARD_CLIP:
+ pass # advances neither
+ elif op == BAM_CPAD:
+ raise NotImplementedError(
+ "Padding (BAM_CPAD, 6) is currently not supported. "
+ "Please implement. Sorry about that.")
+
+ return "".join(result)
+
+ def get_aligned_pairs(self, matches_only=False, with_seq=False):
+ """a list of aligned read (query) and reference positions.
+
+        For insertions, deletions and skipped reference regions,
+        either the query or the reference position may be None.
+
+ Padding is currently not supported and leads to an exception.
+
+ Parameters
+ ----------
+
+ matches_only : bool
+ If True, only matched bases are returned - no None on either
+ side.
+ with_seq : bool
+ If True, return a third element in the tuple containing the
+ reference sequence. Substitutions are lower-case. This option
+ requires an MD tag to be present.
+
+ Returns
+ -------
+
+ aligned_pairs : list of tuples
+
+ """
+ cdef uint32_t k, i, pos, qpos, r_idx, l
+ cdef int op
+ cdef uint32_t * cigar_p
+ cdef bam1_t * src = self._delegate
+ cdef bint _matches_only = bool(matches_only)
+ cdef bint _with_seq = bool(with_seq)
+
+ # TODO: this method performs no checking and assumes that
+ # read sequence, cigar and MD tag are consistent.
+
+ if _with_seq:
+ ref_seq = force_str(self.get_reference_sequence())
+ if ref_seq is None:
+ raise ValueError("MD tag not present")
+
+ r_idx = 0
+
+ if pysam_get_n_cigar(src) == 0:
+ return []
+
+ result = []
+ pos = src.core.pos
+ qpos = 0
+ cigar_p = pysam_bam_get_cigar(src)
+ for k from 0 <= k < pysam_get_n_cigar(src):
+ op = cigar_p[k] & BAM_CIGAR_MASK
+ l = cigar_p[k] >> BAM_CIGAR_SHIFT
+
+ if op == BAM_CMATCH or op == BAM_CEQUAL or op == BAM_CDIFF:
+ if _with_seq:
+ for i from pos <= i < pos + l:
+ result.append((qpos, i, ref_seq[r_idx]))
+ r_idx += 1
+ qpos += 1
+ else:
+ for i from pos <= i < pos + l:
+ result.append((qpos, i))
+ qpos += 1
+ pos += l
+
+ elif op == BAM_CINS or op == BAM_CSOFT_CLIP:
+ if not _matches_only:
+ if _with_seq:
+ for i from pos <= i < pos + l:
+ result.append((qpos, None, None))
+ qpos += 1
+ else:
+ for i from pos <= i < pos + l:
+ result.append((qpos, None))
+ qpos += 1
+ else:
+ qpos += l
+
+ elif op == BAM_CDEL:
+ if not _matches_only:
+ if _with_seq:
+ for i from pos <= i < pos + l:
+ result.append((None, i, ref_seq[r_idx]))
+ r_idx += 1
+ else:
+ for i from pos <= i < pos + l:
+ result.append((None, i))
+ pos += l
+
+ elif op == BAM_CHARD_CLIP:
+ pass # advances neither
+
+ elif op == BAM_CREF_SKIP:
+ if not _matches_only:
+ if _with_seq:
+ for i from pos <= i < pos + l:
+ result.append((None, i, None))
+ else:
+ for i from pos <= i < pos + l:
+ result.append((None, i))
+
+ pos += l
+
+ elif op == BAM_CPAD:
+ raise NotImplementedError(
+ "Padding (BAM_CPAD, 6) is currently not supported. "
+ "Please implement. Sorry about that.")
+
+ return result
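+
+    # Worked example (added commentary): for a read at reference_start
+    # 100 with CIGAR "2M1I2M", get_aligned_pairs() returns
+    # [(0, 100), (1, 101), (2, None), (3, 102), (4, 103)];
+    # with matches_only=True the (2, None) entry is dropped.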
+
+ def get_blocks(self):
+ """ a list of start and end positions of
+ aligned gapless blocks.
+
+ The start and end positions are in genomic
+ coordinates.
+
+ Blocks are not normalized, i.e. two blocks
+ might be directly adjacent. This happens if
+ the two blocks are separated by an insertion
+ in the read.
+ """
+
+ cdef uint32_t k, pos, l
+ cdef int op
+ cdef uint32_t * cigar_p
+ cdef bam1_t * src
+
+ src = self._delegate
+ if pysam_get_n_cigar(src) == 0:
+ return []
+
+ result = []
+ pos = src.core.pos
+ cigar_p = pysam_bam_get_cigar(src)
+ l = 0
+
+ for k from 0 <= k < pysam_get_n_cigar(src):
+ op = cigar_p[k] & BAM_CIGAR_MASK
+ l = cigar_p[k] >> BAM_CIGAR_SHIFT
+ if op == BAM_CMATCH:
+ result.append((pos, pos + l))
+ pos += l
+ elif op == BAM_CDEL or op == BAM_CREF_SKIP:
+ pos += l
+
+ return result
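+
+    # Worked example (added commentary): for a read at reference_start
+    # 100 with CIGAR "3M1I4M", get_blocks() returns
+    # [(100, 103), (103, 107)] - two directly adjacent blocks, because
+    # the insertion does not consume reference bases.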
+
+ def get_overlap(self, uint32_t start, uint32_t end):
+ """return number of aligned bases of read overlapping the interval
+ *start* and *end* on the reference sequence.
+
+ Return None if cigar alignment is not available.
+ """
+ cdef uint32_t k, i, pos, overlap
+ cdef int op, o
+ cdef uint32_t * cigar_p
+ cdef bam1_t * src
+
+ overlap = 0
+
+ src = self._delegate
+ if pysam_get_n_cigar(src) == 0:
+ return None
+ pos = src.core.pos
+ o = 0
+
+ cigar_p = pysam_bam_get_cigar(src)
+ for k from 0 <= k < pysam_get_n_cigar(src):
+ op = cigar_p[k] & BAM_CIGAR_MASK
+ l = cigar_p[k] >> BAM_CIGAR_SHIFT
+
+ if op == BAM_CMATCH:
+                o = min(pos + l, end) - max(pos, start)
+ if o > 0: overlap += o
+
+ if op == BAM_CMATCH or op == BAM_CDEL or op == BAM_CREF_SKIP:
+ pos += l
+
+ return overlap
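+
+    # Worked example (added commentary): for a read at reference_start
+    # 100 with CIGAR "10M", get_overlap(105, 120) returns
+    # min(110, 120) - max(100, 105) == 5 aligned bases.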
+
+ def get_cigar_stats(self):
+ """summary of operations in cigar string.
+
+        The output order in the array is "MIDNSHP=XB" followed by a
+ field for the NM tag. If the NM tag is not present, this
+ field will always be 0.
+
+ +-----+--------------+-----+
+ |M |BAM_CMATCH |0 |
+ +-----+--------------+-----+
+ |I |BAM_CINS |1 |
+ +-----+--------------+-----+
+ |D |BAM_CDEL |2 |
+ +-----+--------------+-----+
+ |N |BAM_CREF_SKIP |3 |
+ +-----+--------------+-----+
+ |S |BAM_CSOFT_CLIP|4 |
+ +-----+--------------+-----+
+ |H |BAM_CHARD_CLIP|5 |
+ +-----+--------------+-----+
+ |P |BAM_CPAD |6 |
+ +-----+--------------+-----+
+ |= |BAM_CEQUAL |7 |
+ +-----+--------------+-----+
+ |X |BAM_CDIFF |8 |
+ +-----+--------------+-----+
+        |B    |BAM_CBACK     |9    |
+        +-----+--------------+-----+
+        |NM   |NM tag        |10   |
+ +-----+--------------+-----+
+
+        If no cigar string is present, the returned counts will all
+        be zero.
+
+ Returns
+ -------
+
+ arrays : two arrays. The first contains the nucleotide counts within
+ each cigar operation, the second contains the number of blocks for
+ each cigar operation.
+
+ """
+
+ cdef int nfields = NCIGAR_CODES + 1
+
+ cdef c_array.array base_counts = array.array(
+ "I",
+ [0] * nfields)
+ cdef uint32_t [:] base_view = base_counts
+ cdef c_array.array block_counts = array.array(
+ "I",
+ [0] * nfields)
+ cdef uint32_t [:] block_view = block_counts
+
+ cdef bam1_t * src = self._delegate
+ cdef int op
+ cdef uint32_t l
+ cdef int32_t k
+ cdef uint32_t * cigar_p = pysam_bam_get_cigar(src)
+
+ if cigar_p == NULL:
+ return None
+
+ for k from 0 <= k < pysam_get_n_cigar(src):
+ op = cigar_p[k] & BAM_CIGAR_MASK
+ l = cigar_p[k] >> BAM_CIGAR_SHIFT
+ base_view[op] += l
+ block_view[op] += 1
+
+ cdef uint8_t * v = bam_aux_get(src, 'NM')
+ if v != NULL:
+ base_view[nfields - 1] = <int32_t>bam_aux2i(v)
+
+ return base_counts, block_counts
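+
+    # Worked example (added commentary): for a read with CIGAR "10M2S"
+    # and an NM tag of 1, get_cigar_stats() returns
+    # (array('I', [10, 0, 0, 0, 2, 0, 0, 0, 0, 0, 1]),
+    #  array('I', [1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0])).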
+
+ #####################################################
+ ## Unsorted as yet
+ # TODO: capture in CIGAR object
+ property cigartuples:
+ """the :term:`cigar` alignment. The alignment
+ is returned as a list of tuples of (operation, length).
+
+ If the alignment is not present, None is returned.
+
+ The operations are:
+
+ +-----+--------------+-----+
+ |M |BAM_CMATCH |0 |
+ +-----+--------------+-----+
+ |I |BAM_CINS |1 |
+ +-----+--------------+-----+
+ |D |BAM_CDEL |2 |
+ +-----+--------------+-----+
+ |N |BAM_CREF_SKIP |3 |
+ +-----+--------------+-----+
+ |S |BAM_CSOFT_CLIP|4 |
+ +-----+--------------+-----+
+ |H |BAM_CHARD_CLIP|5 |
+ +-----+--------------+-----+
+ |P |BAM_CPAD |6 |
+ +-----+--------------+-----+
+ |= |BAM_CEQUAL |7 |
+ +-----+--------------+-----+
+ |X |BAM_CDIFF |8 |
+        +-----+--------------+-----+
+        |B    |BAM_CBACK     |9    |
+        +-----+--------------+-----+
+
+ .. note::
+ The output is a list of (operation, length) tuples, such as
+ ``[(0, 30)]``.
+ This is different from the SAM specification and
+ the :attr:`cigarstring` property, which uses a
+ (length, operation) order, for example: ``30M``.
+
+ To unset the cigar property, assign an empty list
+ or None.
+ """
+ def __get__(self):
+ cdef uint32_t * cigar_p
+ cdef bam1_t * src
+ cdef uint32_t op, l
+ cdef int k
+
+ src = self._delegate
+ if pysam_get_n_cigar(src) == 0:
+ return None
+
+ cigar = []
+
+ cigar_p = pysam_bam_get_cigar(src);
+ for k from 0 <= k < pysam_get_n_cigar(src):
+ op = cigar_p[k] & BAM_CIGAR_MASK
+ l = cigar_p[k] >> BAM_CIGAR_SHIFT
+ cigar.append((op, l))
+ return cigar
+
+ def __set__(self, values):
+ cdef uint32_t * p
+ cdef bam1_t * src
+ cdef op, l
+ cdef int k, ncigar
+
+ k = 0
+
+ src = self._delegate
+
+ # get location of cigar string
+ p = pysam_bam_get_cigar(src)
+
+ # empty values for cigar string
+ if values is None:
+ values = []
+
+ ncigar = len(values)
+ # create space for cigar data within src.data
+ pysam_bam_update(src,
+ pysam_get_n_cigar(src) * 4,
+ ncigar * 4,
+ <uint8_t*>p)
+
+ # length is number of cigar operations, not bytes
+ pysam_set_n_cigar(src, ncigar)
+
+ # re-acquire pointer to location in memory
+ # as it might have moved
+ p = pysam_bam_get_cigar(src)
+
+ # insert cigar operations
+ for op, l in values:
+ p[k] = l << BAM_CIGAR_SHIFT | op
+ k += 1
+
+ ## setting the cigar string requires updating the bin
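+ # (14 and 5 below are the min_shift and n_lvls parameters of the
+ # BAI binning scheme)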
+ pysam_set_bin(src,
+ hts_reg2bin(
+ src.core.pos,
+ bam_endpos(src),
+ 14,
+ 5))
+
+
+ cpdef set_tag(self,
+ tag,
+ value,
+ value_type=None,
+ replace=True):
+ """sets a particular field *tag* to *value* in the optional alignment
+ section.
+
+ *value_type* describes the type of *value* that is to be entered
+ into the alignment record. It can be set explicitly to one
+ of the valid one-letter type codes. If unset, an appropriate
+ type will be chosen automatically.
+
+ An existing value of the same *tag* will be overwritten unless
+ replace is set to False. This is usually not recommended as a
+ tag may only appear once in the optional alignment section.
+
+ If *value* is None, the tag will be deleted.
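+
+ For example (illustrative calls)::
+
+     read.set_tag("NM", 2)            # integer tag, typecode "i"
+     read.set_tag("RG", "GJP00TM04")  # string tag, typecode "Z"
+     read.set_tag("NM", None)         # delete the tag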
+ """
+
+ cdef int value_size
+ cdef uint8_t * value_ptr
+ cdef uint8_t *existing_ptr
+ cdef uint8_t typecode
+ cdef float float_value
+ cdef double double_value
+ cdef int32_t int_value
+ cdef bam1_t * src = self._delegate
+ cdef char * _value_type
+ cdef c_array.array array_value
+ cdef object buffer
+
+ if len(tag) != 2:
+ raise ValueError('Invalid tag: %s' % tag)
+
+ tag = force_bytes(tag)
+ if replace:
+ existing_ptr = bam_aux_get(src, tag)
+ if existing_ptr:
+ bam_aux_del(src, existing_ptr)
+
+ # setting value to None deletes a tag
+ if value is None:
+ return
+
+ typecode = get_value_code(value, value_type)
+ if typecode == 0:
+ raise ValueError("can't guess type or invalid type code specified")
+
+ # Not Endian-safe, but then again neither is samtools!
+ if typecode == 'Z':
+ value = force_bytes(value)
+ value_ptr = <uint8_t*><char*>value
+ value_size = len(value)+1
+ elif typecode == 'i':
+ int_value = value
+ value_ptr = <uint8_t*>&int_value
+ value_size = sizeof(int32_t)
+ elif typecode == 'd':
+ double_value = value
+ value_ptr = <uint8_t*>&double_value
+ value_size = sizeof(double)
+ elif typecode == 'f':
+ float_value = value
+ value_ptr = <uint8_t*>&float_value
+ value_size = sizeof(float)
+ elif typecode == 'B':
+ # the following goes through python, needs to be cleaned up
+ # pack array using struct
+ if value_type is None:
+ fmt, args = packTags([(tag, value)])
+ else:
+ fmt, args = packTags([(tag, value, value_type)])
+
+ # remove tag and type code as set by bam_aux_append
+ # first four chars of format (<2sc)
+ fmt = '<' + fmt[4:]
+ # first two values to pack
+ args = args[2:]
+ value_size = struct.calcsize(fmt)
+ # buffer will be freed when object goes out of scope
+ buffer = ctypes.create_string_buffer(value_size)
+ struct.pack_into(fmt, buffer, 0, *args)
+ # bam_aux_append copies data from value_ptr
+ bam_aux_append(src,
+ tag,
+ typecode,
+ value_size,
+ <uint8_t*>buffer.raw)
+ return
+ else:
+ raise ValueError('unsupported value_type in set_tag')
+
+ bam_aux_append(src,
+ tag,
+ typecode,
+ value_size,
+ value_ptr)
+
+ cpdef has_tag(self, tag):
+ """returns true if the optional alignment section
+ contains a given *tag*."""
+ cdef uint8_t * v
+ cdef int nvalues
+ btag = force_bytes(tag)
+ v = bam_aux_get(self._delegate, btag)
+ return v != NULL
+
+ cpdef get_tag(self, tag, with_value_type=False):
+ """
+ retrieves data from the optional alignment section
+ given a two-letter *tag* denoting the field.
+
+ The returned value is cast into an appropriate python type.
+
+ This method is the fastest way to access the optional
+ alignment section if only a few tags need to be retrieved.
+
+ Parameters
+ ----------
+
+ tag :
+ data tag.
+
+ with_value_type : Optional[bool]
+ if set to True, the return value is a tuple of (tag value, type code).
+ (default False)
+
+ Returns
+ -------
+
+ A python object with the value of the `tag`. The type of the
+ object depends on the data type in the data record.
+
+ Raises
+ ------
+
+ KeyError
+ If `tag` is not present, a KeyError is raised.
+
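+ For example (illustrative)::
+
+     nm = read.get_tag("NM")
+     nm, typecode = read.get_tag("NM", with_value_type=True)
+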
+ """
+ cdef uint8_t * v
+ cdef int nvalues
+ btag = force_bytes(tag)
+ v = bam_aux_get(self._delegate, btag)
+ if v == NULL:
+ raise KeyError("tag '%s' not present" % tag)
+ if chr(v[0]) == "B":
+ auxtype = chr(v[0]) + chr(v[1])
+ else:
+ auxtype = chr(v[0])
+
+ if auxtype == 'c' or auxtype == 'C' or auxtype == 's' or auxtype == 'S':
+ value = <int>bam_aux2i(v)
+ elif auxtype == 'i' or auxtype == 'I':
+ value = <int32_t>bam_aux2i(v)
+ elif auxtype == 'f' or auxtype == 'F':
+ value = <float>bam_aux2f(v)
+ elif auxtype == 'd' or auxtype == 'D':
+ value = <double>bam_aux2f(v)
+ elif auxtype == 'A':
+ # there might be a more efficient way
+ # to convert a char into a string
+ value = '%c' % <char>bam_aux2A(v)
+ elif auxtype == 'Z':
+ value = charptr_to_str(<char*>bam_aux2Z(v))
+ elif auxtype[0] == 'B':
+ bytesize, nvalues, values = convert_binary_tag(v + 1)
+ value = values
+ else:
+ raise ValueError("unknown auxiliary type '%s'" % auxtype)
+
+ if with_value_type:
+ return (value, auxtype)
+ else:
+ return value
+
+ def get_tags(self, with_value_type=False):
+ """the fields in the optional aligment section.
+
+ Returns a list of all fields in the optional
+ alignment section. Values are converted to appropriate python
+ values. For example:
+
+ [("NM", 2), ("RG", "GJP00TM04")]
+
+ If *with_value_type* is set, the value type as encoded in
+ the AlignedSegment record will be returned as well:
+
+ [(NM, 2, "i"), (RG, "GJP00TM04", "Z")]
+
+ This method will convert all values in the optional alignment
+ section. When getting only one or few tags, please see
+ :meth:`get_tag` for a quicker way to achieve this.
+
+ """
+
+ cdef char * ctag
+ cdef bam1_t * src
+ cdef uint8_t * s
+ cdef char auxtag[3]
+ cdef char auxtype
+ cdef uint8_t byte_size
+ cdef int32_t nvalues
+
+ src = self._delegate
+ if src.l_data == 0:
+ return []
+ s = pysam_bam_get_aux(src)
+ result = []
+ auxtag[2] = 0
+ while s < (src.data + src.l_data):
+ # get tag
+ auxtag[0] = s[0]
+ auxtag[1] = s[1]
+ s += 2
+ auxtype = s[0]
+ if auxtype in ('c', 'C'):
+ value = <int>bam_aux2i(s)
+ s += 1
+ elif auxtype in ('s', 'S'):
+ value = <int>bam_aux2i(s)
+ s += 2
+ elif auxtype in ('i', 'I'):
+ value = <int32_t>bam_aux2i(s)
+ s += 4
+ elif auxtype == 'f':
+ value = <float>bam_aux2f(s)
+ s += 4
+ elif auxtype == 'd':
+ value = <double>bam_aux2f(s)
+ s += 8
+ elif auxtype == 'A':
+ value = "%c" % <char>bam_aux2A(s)
+ s += 1
+ elif auxtype in ('Z', 'H'):
+ value = charptr_to_str(<char*>bam_aux2Z(s))
+ # +1 for NULL terminated string
+ s += len(value) + 1
+ elif auxtype == 'B':
+ s += 1
+ byte_size, nvalues, value = convert_binary_tag(s)
+ # 5 for 1 char and 1 int
+ s += 5 + (nvalues * byte_size) - 1
+ else:
+ raise KeyError("unknown type '%s'" % auxtype)
+
+ s += 1
+
+ if with_value_type:
+ result.append((charptr_to_str(auxtag), value, chr(auxtype)))
+ else:
+ result.append((charptr_to_str(auxtag), value))
+
+ return result
+
+ def set_tags(self, tags):
+ """sets the fields in the optional alignmest section with
+ a list of (tag, value) tuples.
+
+ The :term:`value type` of the values is determined from the
+ python type. Optionally, a type may be given explicitly as
+ a third value in the tuple. For example:
+
+ x.set_tags([("NM", 2, "i"), ("RG", "GJP00TM04", "Z")])
+
+ This method will not enforce the rule that the same tag may appear
+ only once in the optional alignment section.
+ """
+
+ cdef bam1_t * src
+ cdef uint8_t * s
+ cdef char * temp
+ cdef int new_size = 0
+ cdef int old_size
+ src = self._delegate
+
+ # convert and pack the data
+ if tags is not None and len(tags) > 0:
+ fmt, args = packTags(tags)
+ new_size = struct.calcsize(fmt)
+ buffer = ctypes.create_string_buffer(new_size)
+ struct.pack_into(fmt,
+ buffer,
+ 0,
+ *args)
+
+ # delete the old data and allocate new space.
+ # If total_size == 0, the aux field will be
+ # empty
+ old_size = pysam_bam_get_l_aux(src)
+ pysam_bam_update(src,
+ old_size,
+ new_size,
+ pysam_bam_get_aux(src))
+
+ # copy data only if there is any
+ if new_size > 0:
+
+ # get location of new data
+ s = pysam_bam_get_aux(src)
+
+ # check if there is a direct path from buffer.raw to temp
+ p = buffer.raw
+ # create handle to make sure buffer stays alive long
+ # enough for memcpy, see issue 129
+ temp = p
+ memcpy(s, temp, new_size)
+
+
+ ########################################################
+ # Compatibility Accessors
+ # Functions, properties for compatibility with pysam < 0.8
+ #
+ # Several options were considered:
+ # 1. change the factory functions according to API
+ #    * requires code changes throughout, incl. passing
+ #      handles to factory functions
+ # 2. subclass functions and add attributes at runtime
+ #    e.g.: AlignedSegment.qname = AlignedSegment.query_name
+ #    * will slow down the default interface
+ # 3. explicit declaration of getters/setters (the approach taken here)
+ ########################################################
+ property qname:
+ """deprecated, use query_name instead"""
+ def __get__(self): return self.query_name
+ def __set__(self, v): self.query_name = v
+ property tid:
+ """deprecated, use reference_id instead"""
+ def __get__(self): return self.reference_id
+ def __set__(self, v): self.reference_id = v
+ property pos:
+ """deprecated, use reference_start instead"""
+ def __get__(self): return self.reference_start
+ def __set__(self, v): self.reference_start = v
+ property mapq:
+ """deprecated, use mapping_quality instead"""
+ def __get__(self): return self.mapping_quality
+ def __set__(self, v): self.mapping_quality = v
+ property rnext:
+ """deprecated, use next_reference_id instead"""
+ def __get__(self): return self.next_reference_id
+ def __set__(self, v): self.next_reference_id = v
+ property pnext:
+ """deprecated, use next_reference_start instead"""
+ def __get__(self):
+ return self.next_reference_start
+ def __set__(self, v):
+ self.next_reference_start = v
+ property cigar:
+ """deprecated, use cigartuples instead"""
+ def __get__(self):
+ r = self.cigartuples
+ if r is None:
+ r = []
+ return r
+ def __set__(self, v): self.cigartuples = v
+ property tlen:
+ """deprecated, use template_length instead"""
+ def __get__(self):
+ return self.template_length
+ def __set__(self, v):
+ self.template_length = v
+ property seq:
+ """deprecated, use query_sequence instead"""
+ def __get__(self):
+ return self.query_sequence
+ def __set__(self, v):
+ self.query_sequence = v
+ property qual:
+ """deprecated, query_qualities instead"""
+ def __get__(self):
+ return array_to_qualitystring(self.query_qualities)
+ def __set__(self, v):
+ self.query_qualities = qualitystring_to_array(v)
+ property alen:
+ """deprecated, reference_length instead"""
+ def __get__(self):
+ return self.reference_length
+ def __set__(self, v):
+ self.reference_length = v
+ property aend:
+ """deprecated, reference_end instead"""
+ def __get__(self):
+ return self.reference_end
+ def __set__(self, v):
+ self.reference_end = v
+ property rlen:
+ """deprecated, query_length instead"""
+ def __get__(self):
+ return self.query_length
+ def __set__(self, v):
+ self.query_length = v
+ property query:
+ """deprecated, query_alignment_sequence instead"""
+ def __get__(self):
+ return self.query_alignment_sequence
+ def __set__(self, v):
+ self.query_alignment_sequence = v
+ property qqual:
+ """deprecated, query_alignment_qualities instead"""
+ def __get__(self):
+ return array_to_qualitystring(self.query_alignment_qualities)
+ def __set__(self, v):
+ self.query_alignment_qualities = qualitystring_to_array(v)
+ property qstart:
+ """deprecated, use query_alignment_start instead"""
+ def __get__(self):
+ return self.query_alignment_start
+ def __set__(self, v):
+ self.query_alignment_start = v
+ property qend:
+ """deprecated, use query_alignment_end instead"""
+ def __get__(self):
+ return self.query_alignment_end
+ def __set__(self, v):
+ self.query_alignment_end = v
+ property qlen:
+ """deprecated, use query_alignment_length instead"""
+ def __get__(self):
+ return self.query_alignment_length
+ def __set__(self, v):
+ self.query_alignment_length = v
+ property mrnm:
+ """deprecated, use next_reference_id instead"""
+ def __get__(self):
+ return self.next_reference_id
+ def __set__(self, v):
+ self.next_reference_id = v
+ property mpos:
+ """deprecated, use next_reference_start instead"""
+ def __get__(self):
+ return self.next_reference_start
+ def __set__(self, v):
+ self.next_reference_start = v
+ property rname:
+ """deprecated, use reference_id instead"""
+ def __get__(self):
+ return self.reference_id
+ def __set__(self, v):
+ self.reference_id = v
+ property isize:
+ """deprecated, use template_length instead"""
+ def __get__(self):
+ return self.template_length
+ def __set__(self, v):
+ self.template_length = v
+ property blocks:
+ """deprecated, use get_blocks() instead"""
+ def __get__(self):
+ return self.get_blocks()
+ property aligned_pairs:
+ """deprecated, use get_aligned_pairs() instead"""
+ def __get__(self):
+ return self.get_aligned_pairs()
+ property inferred_length:
+ """deprecated, use infer_query_length() instead"""
+ def __get__(self):
+ return self.infer_query_length()
+ property positions:
+ """deprecated, use get_reference_positions() instead"""
+ def __get__(self):
+ return self.get_reference_positions()
+ property tags:
+ """deprecated, use get_tags() instead"""
+ def __get__(self):
+ return self.get_tags()
+ def __set__(self, tags):
+ self.set_tags(tags)
+ def overlap(self):
+ """deprecated, use get_overlap() instead"""
+ return self.get_overlap()
+ def opt(self, tag):
+ """deprecated, use get_tag() instead"""
+ return self.get_tag(tag)
+ def setTag(self, tag, value, value_type=None, replace=True):
+ """deprecated, use set_tag() instead"""
+ return self.set_tag(tag, value, value_type, replace)
+
+
+cdef class PileupColumn:
+ '''A pileup of reads at a particular reference sequence position
+ (:term:`column`). A pileup column contains all the reads that map
+ to a certain target base.
+
+ This class is a proxy for results returned by the samtools pileup
+ engine. If the underlying engine iterator advances, the results
+ of this column will change.
+
+ '''
+ def __init__(self):
+ raise TypeError("this class cannot be instantiated from Python")
+
+ def __str__(self):
+ return "\t".join(map(str,
+ (self.reference_id,
+ self.reference_pos,
+ self.nsegments))) +\
+ "\n" +\
+ "\n".join(map(str, self.pileups))
+
+ property reference_id:
+ '''the reference sequence number as defined in the header'''
+ def __get__(self):
+ return self.tid
+
+ property reference_name:
+ """:term:`reference` name (None if no AlignmentFile is associated)"""
+ def __get__(self):
+ if self._alignment_file is not None:
+ return self._alignment_file.getrname(self.tid)
+ return None
+
+ property nsegments:
+ '''number of reads mapping to this column.'''
+ def __get__(self):
+ return self.n_pu
+ def __set__(self, n):
+ self.n_pu = n
+
+ property reference_pos:
+ '''the position in the reference sequence (0-based).'''
+ def __get__(self):
+ return self.pos
+
+ property pileups:
+ '''list of reads (:class:`pysam.PileupRead`) aligned to this column'''
+ def __get__(self):
+ cdef int x
+ pileups = []
+
+ if self.plp == NULL or self.plp[0] == NULL:
+ raise ValueError("PileupColumn accessed after iterator finished")
+
+ # warning: there could be problems if self.n_pu and self.plp
+ # are out of sync.
+ for x from 0 <= x < self.n_pu:
+ pileups.append(makePileupRead(&(self.plp[0][x]),
+ self._alignment_file))
+ return pileups
+
+ ########################################################
+ # Compatibility Accessors
+ # Functions, properties for compatibility with pysam < 0.8
+ ########################################################
+ property pos:
+ def __get__(self):
+ return self.reference_pos
+ def __set__(self, v):
+ self.reference_pos = v
+
+ property tid:
+ def __get__(self):
+ return self.reference_id
+ def __set__(self, v):
+ self.reference_id = v
+
+ property n:
+ def __get__(self):
+ return self.nsegments
+ def __set__(self, v):
+ self.nsegments = v
+
+
+cdef class PileupRead:
+ '''Representation of a read aligned to a particular position in the
+ reference sequence.
+
+ '''
+
+ def __init__(self):
+ raise TypeError(
+ "this class cannot be instantiated from Python")
+
+ def __str__(self):
+ return "\t".join(
+ map(str,
+ (self.alignment, self.query_position,
+ self.indel, self.level,
+ self.is_del, self.is_head,
+ self.is_tail, self.is_refskip)))
+
+ property alignment:
+ """a :class:`pysam.AlignedSegment` object of the aligned read"""
+ def __get__(self):
+ return self._alignment
+
+ property query_position:
+ """position of the read base at the pileup site, 0-based.
+ None if is_del or is_refskip is set.
+
+ """
+ def __get__(self):
+ if self.is_del or self.is_refskip:
+ return None
+ else:
+ return self._qpos
+
+ property query_position_or_next:
+ """position of the read base at the pileup site, 0-based.
+
+ If the current position is a deletion, returns the next
+ aligned base.
+
+ """
+ def __get__(self):
+ return self._qpos
+
+ property indel:
+ """indel length for the position following the current pileup site.
+
+ This quantity peeks ahead to the next cigar operation in this
+ alignment. If the next operation is an insertion, indel will
+ be positive. If the next operation is a deletion, it will be
+ negative. 0 if the next operation is not an indel.
+
+ """
+ def __get__(self):
+ return self._indel
+
+ property level:
+ """the level of the read in the "viewer" mode. Note that this value
+ is currently not computed."""
+ def __get__(self):
+ return self._level
+
+ property is_del:
+ """1 iff the base on the padded read is a deletion"""
+ def __get__(self):
+ return self._is_del
+
+ property is_head:
+ """1 iff the base on the padded read is the left-most base."""
+ def __get__(self):
+ return self._is_head
+
+ property is_tail:
+ """1 iff the base on the padded read is the right-most base."""
+ def __get__(self):
+ return self._is_tail
+
+ property is_refskip:
+ """1 iff the base on the padded read is part of CIGAR N op."""
+ def __get__(self):
+ return self._is_refskip
+
+__all__ = [
+ "AlignedSegment",
+ "PileupColumn",
+ "PileupRead"]
--- /dev/null
+from libc.stdint cimport int8_t, int16_t, int32_t, int64_t
+from libc.stdint cimport uint8_t, uint16_t, uint32_t, uint64_t
+from libc.stdlib cimport malloc, calloc, realloc, free
+from libc.string cimport memcpy, memcmp, strncpy, strlen, strdup
+from libc.stdio cimport FILE, printf
+
+from pysam.libcfaidx cimport faidx_t, Fastafile
+from pysam.libcalignedsegment cimport AlignedSegment
+from pysam.libchtslib cimport *
+
+from cpython cimport array
+cimport cython
+
+cdef extern from *:
+ ctypedef char* const_char_ptr "const char*"
+
+cdef extern from "htslib_util.h":
+
+ char * pysam_bam_get_qname(bam1_t * b)
+
+cdef extern from "samfile_util.h":
+
+ int bam_cap_mapQ(bam1_t *b, char *ref, int thres)
+ int bam_prob_realn(bam1_t *b, const char *ref)
+
+####################################################################
+# Utility types
+
+ctypedef struct __iterdata:
+ htsFile * htsfile
+ bam_hdr_t * header
+ hts_itr_t * iter
+ faidx_t * fastafile
+ int tid
+ char * seq
+ int seq_len
+
+
+cdef class AlignmentFile(HTSFile):
+ cdef readonly object reference_filename
+
+ # pointer to index
+ cdef hts_idx_t *index
+ # header structure
+ cdef bam_hdr_t * header
+
+ # current read within iteration
+ cdef bam1_t * b
+
+ cdef bam1_t * getCurrent(self)
+ cdef int cnext(self)
+
+ # write an aligned read
+ cpdef int write(self, AlignedSegment read) except -1
+
+
+cdef class PileupColumn:
+ cdef bam_pileup1_t ** plp
+ cdef int tid
+ cdef int pos
+ cdef int n_pu
+
+
+cdef class PileupRead:
+ cdef AlignedSegment _alignment
+ cdef int32_t _qpos
+ cdef int _indel
+ cdef int _level
+ cdef uint32_t _is_del
+ cdef uint32_t _is_head
+ cdef uint32_t _is_tail
+ cdef uint32_t _is_refskip
+
+
+cdef class IteratorRow:
+ cdef int retval
+ cdef bam1_t * b
+ cdef AlignmentFile samfile
+ cdef htsFile * htsfile
+ cdef bam_hdr_t * header
+ cdef int owns_samfile
+
+
+cdef class IteratorRowRegion(IteratorRow):
+ cdef hts_itr_t * iter
+ cdef bam1_t * getCurrent(self)
+ cdef int cnext(self)
+
+cdef class IteratorRowHead(IteratorRow):
+ cdef int max_rows
+ cdef int current_row
+ cdef bam1_t * getCurrent(self)
+ cdef int cnext(self)
+
+cdef class IteratorRowAll(IteratorRow):
+ cdef bam1_t * getCurrent(self)
+ cdef int cnext(self)
+
+
+cdef class IteratorRowAllRefs(IteratorRow):
+ cdef int tid
+ cdef IteratorRowRegion rowiter
+
+
+cdef class IteratorRowSelection(IteratorRow):
+ cdef int current_pos
+ cdef positions
+ cdef bam1_t * getCurrent(self)
+ cdef int cnext(self)
+
+
+cdef class IteratorColumn:
+
+ # result of the last plbuf_push
+ cdef IteratorRowRegion iter
+ cdef int tid
+ cdef int pos
+ cdef int n_plp
+ cdef int mask
+ cdef bam_pileup1_t * plp
+ cdef bam_plp_t pileup_iter
+ cdef __iterdata iterdata
+ cdef AlignmentFile samfile
+ cdef Fastafile fastafile
+ cdef stepper
+ cdef int max_depth
+
+ cdef int cnext(self)
+ cdef char * getSequence(self)
+ cdef setMask(self, mask)
+ cdef setupIteratorData(self,
+ int tid,
+ int start,
+ int end,
+ int multiple_iterators=?)
+
+ cdef reset(self, tid, start, end)
+ cdef _free_pileup_iter(self)
+
+
+cdef class IteratorColumnRegion(IteratorColumn):
+ cdef int start
+ cdef int end
+ cdef int truncate
+
+
+cdef class IteratorColumnAllRefs(IteratorColumn):
+ pass
+
+
+cdef class IndexedReads:
+ cdef AlignmentFile samfile
+ cdef htsFile * htsfile
+ cdef index
+ cdef int owns_samfile
+ cdef bam_hdr_t * header
--- /dev/null
+# cython: embedsignature=True
+# cython: profile=True
+########################################################
+########################################################
+# Cython wrapper for SAM/BAM/CRAM files based on htslib
+########################################################
+# The principal classes defined in this module are:
+#
+# class AlignmentFile read/write access to SAM/BAM/CRAM formatted files
+#
+# class IndexedReads index a SAM/BAM/CRAM file by query name while keeping
+# the original sort order intact
+#
+# Additionally, this module defines numerous classes that
+# are part of the internal API. These are:
+#
+# Various iterator classes to iterate over alignments in sequential
+# (IteratorRow) or in a stacked fashion (IteratorColumn):
+#
+# class IteratorRow
+# class IteratorRowRegion
+# class IteratorRowHead
+# class IteratorRowAll
+# class IteratorRowAllRefs
+# class IteratorRowSelection
+# class IteratorColumn
+# class IteratorColumnRegion
+# class IteratorColumnAllRefs
+#
+########################################################
+#
+# The MIT License
+#
+# Copyright (c) 2015 Andreas Heger
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+#
+########################################################
+import os
+import collections
+import re
+import warnings
+import array
+
+from libc.errno cimport errno, EPIPE
+from libc.string cimport strcmp, strpbrk, strerror
+from cpython cimport array as c_array
+from cpython.version cimport PY_MAJOR_VERSION
+
+from pysam.libcutils cimport force_bytes, force_str, charptr_to_str
+from pysam.libcutils cimport encode_filename, from_string_and_size
+from pysam.libcalignedsegment cimport makeAlignedSegment, makePileupColumn
+from pysam.libchtslib cimport HTSFile, hisremote
+
+if PY_MAJOR_VERSION >= 3:
+ from io import StringIO
+else:
+ from StringIO import StringIO
+
+cimport cython
+
+########################################################
+## Constants and global variables
+
+# defines imported from samtools
+DEF SEEK_SET = 0
+DEF SEEK_CUR = 1
+DEF SEEK_END = 2
+
+# maximum genomic coordinate
+cdef int MAX_POS = 2 << 29
+
+# valid types for SAM headers
+VALID_HEADER_TYPES = {"HD" : dict,
+ "SQ" : list,
+ "RG" : list,
+ "PG" : list,
+ "CO" : list}
+
+# order of records within SAM headers
+VALID_HEADERS = ("HD", "SQ", "RG", "PG", "CO")
+
+# default type conversions within SAM header records
+KNOWN_HEADER_FIELDS = {"HD" : {"VN" : str, "SO" : str, "GO" : str},
+ "SQ" : {"SN" : str, "LN" : int, "AS" : str,
+ "M5" : str, "SP" : str, "UR" : str,
+ "AH" : str,},
+ "RG" : {"ID" : str, "CN" : str, "DS" : str,
+ "DT" : str, "FO" : str, "KS" : str,
+ "LB" : str, "PG" : str, "PI" : str,
+ "PL" : str, "PM" : str, "PU" : str,
+ "SM" : str,},
+ "PG" : {"ID" : str, "PN" : str, "CL" : str,
+ "PP" : str, "DS" : str, "VN" : str,},}
+
+# output order of fields within records. Ensure that CL is at
+# the end, as parsing a CL field will ignore any subsequent fields.
+VALID_HEADER_ORDER = {"HD" : ("VN", "SO", "GO"),
+ "SQ" : ("SN", "LN", "AS", "M5",
+ "UR", "SP", "AH"),
+ "RG" : ("ID", "CN", "SM", "LB",
+ "PU", "PI", "DT", "DS",
+ "PL", "FO", "KS", "PG",
+ "PM"),
+ "PG" : ("PN", "ID", "VN", "PP",
+ "DS", "CL"),}
+
+
+def build_header_line(fields, record):
+ '''build a header line from `fields` dictionary for `record`'''
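+ # For example (illustrative):
+ #   build_header_line({"VN": "1.0", "SO": "coordinate"}, "HD")
+ #   returns "@HD\tVN:1.0\tSO:coordinate"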
+
+ # TODO: add checking for field and sort order
+ line = ["@%s" % record]
+ # comment
+ if record == "CO":
+ line.append(fields)
+ # user tags
+ elif record.islower():
+ for key in sorted(fields):
+ line.append("%s:%s" % (key, str(fields[key])))
+ # defined tags
+ else:
+ # write fields of the specification
+ for key in VALID_HEADER_ORDER[record]:
+ if key in fields:
+ line.append("%s:%s" % (key, str(fields[key])))
+ # write user fields
+ for key in fields:
+ if not key.isupper():
+ line.append("%s:%s" % (key, str(fields[key])))
+
+ return "\t".join(line)
+
+cdef bam_hdr_t * build_header(new_header):
+ '''return a new header built from a dictionary in `new_header`.
+
+ This method inserts the text field, target_name and target_len.
+ '''
+
+ lines = []
+
+ # check if hash exists
+
+ # create new header and copy old data
+ cdef bam_hdr_t * dest
+
+ dest = bam_hdr_init()
+
+ # first: defined tags
+ for record in VALID_HEADERS:
+ if record in new_header:
+ ttype = VALID_HEADER_TYPES[record]
+ data = new_header[record]
+ if type(data) != type(ttype()):
+ raise ValueError(
+ "invalid type for record %s: %s, expected %s" %
+ (record, type(data), type(ttype())))
+ if type(data) is dict:
+ lines.append(build_header_line(data, record))
+ else:
+ for fields in new_header[record]:
+ lines.append(build_header_line(fields, record))
+
+ # then: user tags (lower case), sorted alphabetically
+ for record, data in sorted(new_header.items()):
+ if record in VALID_HEADERS: continue
+ if type(data) is dict:
+ lines.append(build_header_line(data, record))
+ else:
+ for fields in new_header[record]:
+ lines.append(build_header_line(fields, record))
+
+ text = "\n".join(lines) + "\n"
+ if dest.text != NULL: free( dest.text )
+ dest.text = <char*>calloc(len(text), sizeof(char))
+ dest.l_text = len(text)
+ cdef bytes btext = text.encode('ascii')
+ strncpy(dest.text, btext, dest.l_text)
+
+ cdef bytes bseqname
+ # collect targets
+ if "SQ" in new_header:
+ seqs = []
+ for fields in new_header["SQ"]:
+ try:
+ seqs.append( (fields["SN"], fields["LN"] ) )
+ except KeyError:
+ raise KeyError( "incomplete sequence information in '%s'" % str(fields))
+
+ dest.n_targets = len(seqs)
+ dest.target_name = <char**>calloc(dest.n_targets, sizeof(char*))
+ dest.target_len = <uint32_t*>calloc(dest.n_targets, sizeof(uint32_t))
+
+ for x from 0 <= x < dest.n_targets:
+ seqname, seqlen = seqs[x]
+ dest.target_name[x] = <char*>calloc(
+ len(seqname) + 1, sizeof(char))
+ bseqname = seqname.encode('ascii')
+ strncpy(dest.target_name[x], bseqname,
+ len(seqname) + 1)
+ dest.target_len[x] = seqlen
+
+ return dest
+
+
+cdef class AlignmentFile(HTSFile):
+ """AlignmentFile(filepath_or_object, mode=None, template=None,
+ reference_names=None, reference_lengths=None, text=NULL,
+ header=None, add_sq_text=False, check_header=True, check_sq=True,
+ reference_filename=None, filename=None, duplicate_filehandle=True)
+
+ A :term:`SAM`/:term:`BAM` formatted file.
+
+ If `filepath_or_object` is a string, the file is automatically
+ opened. If `filepath_or_object` is a python File object, the
+ already opened file will be used.
+
+ If the file is opened for reading and an index for a BAM file exists
+ (.bai), it will be opened automatically. Without an index, random
+ access via :meth:`~pysam.AlignmentFile.fetch` and
+ :meth:`~pysam.AlignmentFile.pileup` is disabled.
+
+ For writing, the header of a :term:`SAM` file/:term:`BAM` file can
+ be constituted from several sources (see also the samtools format
+ specification):
+
+ 1. If `template` is given, the header is copied from another
+ `AlignmentFile` (`template` must be a
+ :class:`~pysam.AlignmentFile`).
+
+ 2. If `header` is given, the header is built from a
+ multi-level dictionary.
+
+ 3. If `text` is given, new header text is copied from raw
+ text.
+
+ 4. The names (`reference_names`) and lengths
+ (`reference_lengths`) are supplied directly as lists.
+
+ When reading or writing a CRAM file, the filename of a FASTA-formatted
+ reference can be specified with `reference_filename`.
+
+ By default, if a file is opened in mode 'r', it is checked
+ for a valid header (`check_header` = True) and a definition of
+ chromosome names (`check_sq` = True).
+
+ Parameters
+ ----------
+ mode : string
+ `mode` should be ``r`` for reading or ``w`` for writing. The
+ default is text mode (:term:`SAM`). For binary (:term:`BAM`)
+ I/O you should append ``b`` for compressed or ``u`` for
+ uncompressed :term:`BAM` output. Use ``h`` to output header
+ information in text (:term:`TAM`) mode. Use ``c`` for
+ :term:`CRAM` formatted files.
+
+ If ``b`` is present, it must immediately follow ``r`` or
+ ``w``. Valid modes are ``r``, ``w``, ``wh``, ``rb``, ``wb``,
+ ``wbu``, ``wb0``, ``rc`` and ``wc``. For instance, to open a
+ :term:`BAM` formatted file for reading, type::
+
+ f = pysam.AlignmentFile('ex1.bam','rb')
+
+ If mode is not specified, the method will try to auto-detect
+ in the order 'rb', 'r', thus both the following should work::
+
+ f1 = pysam.AlignmentFile('ex1.bam')
+ f2 = pysam.AlignmentFile('ex1.sam')
+
+ template : AlignmentFile
+ when writing, copy header from `template`.
+
+ header : dict
+ when writing, build header from a multi-level dictionary. The
+ first level are the four types ('HD', 'SQ', ...). The second
+ level are a list of lines, with each line being a list of
+ tag-value pairs. The header is constructed first from all the
+ defined fields, followed by user tags in alphabetical order.
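+
+ For example, a minimal header might be::
+
+     header = {'HD': {'VN': '1.0'},
+               'SQ': [{'SN': 'chr1', 'LN': 1575}]}
+     outf = pysam.AlignmentFile("out.bam", "wb", header=header)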
+
+ text : string
+ when writing, use the string provided as the header
+
+ reference_names : list
+ see reference_lengths
+
+ reference_lengths : list
+ when writing, build header from list of chromosome names and
+ lengths. By default, 'SQ' and 'LN' tags will be added to the
+ header text. This option can be changed by unsetting the flag
+ `add_sq_text`.
+
+ add_sq_text : bool
+ if set to False, do not add 'SQ' and 'LN' tags to the header. This
+ permits constructing :term:`SAM` formatted files without a header.
+
+ check_header : bool
+ when reading, check if header is present (default=True)
+
+ check_sq : bool
+ when reading, check if SQ entries are present in header
+ (default=True)
+
+ reference_filename : string
+ Path to a FASTA-formatted reference file. Valid only for CRAM files.
+ When reading a CRAM file, this overrides both ``$REF_PATH`` and the URL
+ specified in the header (``UR`` tag), which are normally used to find
+ the reference.
+
+ filename : string
+ Alternative to filepath_or_object. Filename of the file
+ to be opened.
+
+ duplicate_filehandle: bool
+ By default, file handles passed either directly or through
+ File-like objects will be duplicated before passing them to
+ htslib. The duplication prevents issues where the same stream
+ will be closed by htslib and through destruction of the
+ high-level python object. Set to False to turn off
+ duplication.
+
+ """
+
+ def __cinit__(self, *args, **kwargs):
+ self.htsfile = NULL
+ self.filename = None
+ self.mode = None
+ self.is_stream = False
+ self.is_remote = False
+ self.index = NULL
+
+ if "filename" in kwargs:
+ args = [kwargs["filename"]]
+ del kwargs["filename"]
+
+ self._open(*args, **kwargs)
+
+ # allocate memory for iterator
+ self.b = <bam1_t*>calloc(1, sizeof(bam1_t))
+
+ def has_index(self):
+ """return true if htsfile has an existing (and opened) index.
+ """
+ return self.index != NULL
+
+ def check_index(self):
+ """return True if index is present.
+
+ Raises
+ ------
+
+ AttributeError
+ if htsfile is :term:`SAM` formatted and thus has no index.
+
+ ValueError
+ if htsfile is closed or index could not be opened.
+ """
+
+ if not self.is_open:
+ raise ValueError("I/O operation on closed file")
+ if not self.is_bam and not self.is_cram:
+ raise AttributeError(
+ "AlignmentFile.check_index only available for BAM and CRAM files")
+ if self.index == NULL:
+ raise ValueError(
+ "mapping information not recorded in index "
+ "or index not available")
+ return True
+
+ def _open(self,
+ filepath_or_object,
+ mode=None,
+ AlignmentFile template=None,
+ reference_names=None,
+ reference_lengths=None,
+ reference_filename=None,
+ text=None,
+ header=None,
+ port=None,
+ add_sq_text=True,
+ check_header=True,
+ check_sq=True,
+ filepath_index=None,
+ referencenames=None,
+ referencelengths=None,
+ duplicate_filehandle=True):
+ '''open a sam, bam or cram formatted file.
+
+ If _open is called on an existing file, the current file
+ will be closed and a new file will be opened.
+ '''
+ cdef char *cfilename = NULL
+ cdef char *creference_filename = NULL
+ cdef char *cindexname = NULL
+ cdef char *cmode = NULL
+
+ # for backwards compatibility:
+ if referencenames is not None:
+ reference_names = referencenames
+ if referencelengths is not None:
+ reference_lengths = referencelengths
+
+ # close a previously opened file
+ if self.is_open:
+ self.close()
+
+ # autodetection for read
+ if mode is None:
+ mode = "r"
+
+ assert mode in ("r", "w", "rb", "wb", "wh",
+ "wbu", "rU", "wb0",
+ "rc", "wc"), \
+ "invalid file opening mode `%s`" % mode
+
+ self.duplicate_filehandle = duplicate_filehandle
+
+ # StringIO not supported
+ if isinstance(filepath_or_object, StringIO):
+ raise NotImplementedError(
+ "access from StringIO objects not supported")
+ # reading from a file descriptor
+ elif isinstance(filepath_or_object, int):
+ self.filename = filepath_or_object
+ filename = None
+ self.is_remote = False
+ self.is_stream = True
+ # reading from a File object or other object with fileno
+ elif hasattr(filepath_or_object, "fileno"):
+ if filepath_or_object.closed:
+ raise ValueError('I/O operation on closed file')
+ self.filename = filepath_or_object
+ # .name can be TextIOWrapper
+ try:
+ filename = encode_filename(str(filepath_or_object.name))
+ cfilename = filename
+ except AttributeError:
+ filename = None
+ self.is_remote = False
+ self.is_stream = True
+ # what remains is a filename
+ else:
+ self.filename = filename = encode_filename(filepath_or_object)
+ cfilename = filename
+ self.is_remote = hisremote(cfilename)
+ self.is_stream = self.filename == b'-'
+
+ # for htslib, wbu seems to not work
+ if mode == "wbu":
+ mode = "wb0"
+
+ self.mode = force_bytes(mode)
+ self.reference_filename = reference_filename = encode_filename(
+ reference_filename)
+
+ cdef char * ctext
+ cdef hFILE * fp
+ ctext = NULL
+
+ if mode[0] == 'w':
+ # open file for writing
+
+ # header structure (used for writing)
+ if template:
+ self.header = bam_hdr_dup(template.header)
+ elif header:
+ self.header = build_header(header)
+ else:
+ # build header from a target names and lengths
+ assert reference_names and reference_lengths, \
+ ("either supply options `template`, `header` "
+ "or both `reference_names` and `reference_lengths` "
+ "for writing")
+ assert len(reference_names) == len(reference_lengths), \
+ "unequal names and lengths of reference sequences"
+
+ # allocate and fill header
+ reference_names = [force_bytes(ref) for ref in reference_names]
+ self.header = bam_hdr_init()
+ self.header.n_targets = len(reference_names)
+ n = 0
+ for x in reference_names:
+ n += len(x) + 1
+ self.header.target_name = <char**>calloc(n, sizeof(char*))
+ self.header.target_len = <uint32_t*>calloc(n, sizeof(uint32_t))
+ for x from 0 <= x < self.header.n_targets:
+ self.header.target_len[x] = reference_lengths[x]
+ name = reference_names[x]
+ self.header.target_name[x] = <char*>calloc(
+ len(name) + 1, sizeof(char))
+ strncpy(self.header.target_name[x], name, len(name))
+
+ # Optionally, if there is no text, add a SAM
+ # compatible header to output file.
+ if text is None and add_sq_text:
+ text = []
+ for x from 0 <= x < self.header.n_targets:
+ text.append("@SQ\tSN:%s\tLN:%s\n" % \
+ (force_str(reference_names[x]),
+ reference_lengths[x]))
+ text = ''.join(text)
+
+ if text is not None:
+ # copy without \0
+ text = force_bytes(text)
+ ctext = text
+ self.header.l_text = strlen(ctext)
+ self.header.text = <char*>calloc(
+ strlen(ctext), sizeof(char))
+ memcpy(self.header.text, ctext, strlen(ctext))
+
+ self.htsfile = self._open_htsfile()
+
+ # set filename with reference sequences. If no filename
+ # is given, the CRAM reference arrays will be built from
+ # the @SQ records in the header
+ if "c" in mode and reference_filename:
+ # note that fn_aux takes ownership, so create a copy
+ self.htsfile.fn_aux = strdup(self.reference_filename)
+
+ # write header to htsfile
+ if "b" in mode or "c" in mode or "h" in mode:
+ with nogil:
+ sam_hdr_write(self.htsfile, self.header)
+
+ elif mode[0] == "r":
+ # open file for reading
+ if not self._exists():
+ raise IOError("file `%s` not found" % self.filename)
+
+ self.htsfile = self._open_htsfile()
+
+ if self.htsfile == NULL:
+ raise ValueError(
+ "could not open file (mode='%s') - "
+ "is it SAM/BAM format?" % mode)
+
+ if self.htsfile.format.category != sequence_data:
+ raise ValueError("file does not contain alignment data")
+
+ # bam files require a valid header
+ if self.is_bam or self.is_cram:
+ with nogil:
+ self.header = sam_hdr_read(self.htsfile)
+ if self.header == NULL:
+ raise ValueError(
+ "file does not have valid header (mode='%s') "
+ "- is it BAM format?" % mode )
+ else:
+ # in sam files it is optional (htsfile full of
+ # unmapped reads)
+ if check_header:
+ with nogil:
+ self.header = sam_hdr_read(self.htsfile)
+ if self.header == NULL:
+ raise ValueError(
+ "file does not have valid header (mode='%s') "
+ "- is it SAM format?" % mode )
+ # self.header.ignore_sam_err = True
+
+ # set filename with reference sequences
+ if self.is_cram and reference_filename:
+ creference_filename = self.reference_filename
+ hts_set_opt(self.htsfile,
+ CRAM_OPT_REFERENCE,
+ creference_filename)
+
+ if check_sq and self.header.n_targets == 0:
+ raise ValueError(
+ ("file has no sequences defined (mode='%s') - "
+ "is it SAM/BAM format? Consider opening with "
+ "check_sq=False") % mode)
+
+ assert self.htsfile != NULL
+
+ # check for index and open if present
+ cdef int format_index = -1
+ if self.is_bam:
+ format_index = HTS_FMT_BAI
+ elif self.is_cram:
+ format_index = HTS_FMT_CRAI
+
+ if mode[0] == "r" and (self.is_bam or self.is_cram):
+ # open index for remote files
+ if self.is_remote and not filepath_index:
+ with nogil:
+ self.index = hts_idx_load(cfilename, format_index)
+ if self.index == NULL:
+ warnings.warn(
+ "unable to open remote index for '%s'" % cfilename)
+ else:
+ has_index = True
+ if filepath_index:
+ if not os.path.exists(filepath_index):
+ warnings.warn(
+ "unable to open index at %s" % cfilename)
+ self.index = NULL
+ has_index = False
+ elif filename is not None:
+ if self.is_bam \
+ and not os.path.exists(filename + b".bai") \
+ and not os.path.exists(filename[:-4] + b".bai") \
+ and not os.path.exists(filename + b".csi") \
+ and not os.path.exists(filename[:-4] + b".csi"):
+ self.index = NULL
+ has_index = False
+ elif self.is_cram \
+ and not os.path.exists(filename + b".crai") \
+ and not os.path.exists(filename[:-5] + b".crai"):
+ self.index = NULL
+ has_index = False
+ else:
+ self.index = NULL
+ has_index = False
+
+ if has_index:
+ # returns NULL if there is no index or index could
+ # not be opened
+ if filepath_index:
+ cindexname = filepath_index = encode_filename(filepath_index)
+ with nogil:
+ self.index = sam_index_load2(self.htsfile,
+ cfilename,
+ cindexname)
+ else:
+ with nogil:
+ self.index = sam_index_load(self.htsfile,
+ cfilename)
+ if self.index == NULL:
+ raise IOError(
+ "error while opening index for '%s'" %
+ filename)
+
+ # save start of data section
+ if not self.is_stream:
+ self.start_offset = self.tell()
+
+ def get_tid(self, reference):
+ """
+ return the numerical :term:`tid` corresponding to
+ :term:`reference`
+
+ returns -1 if reference is not known.
+ """
+ if not self.is_open:
+ raise ValueError("I/O operation on closed file")
+ reference = force_bytes(reference)
+ return bam_name2id(self.header, reference)
+
+ def get_reference_name(self, tid):
+ """
+ return :term:`reference` name corresponding to numerical :term:`tid`
+ """
+ if not self.is_open:
+ raise ValueError("I/O operation on closed file")
+ if not 0 <= tid < self.header.n_targets:
+ raise ValueError("reference_id %i out of range 0<=tid<%i" %
+ (tid, self.header.n_targets))
+ return charptr_to_str(self.header.target_name[tid])
+
+ def parse_region(self,
+ reference=None,
+ start=None,
+ end=None,
+ region=None,
+ tid=None):
+ """parse alternative ways to specify a genomic region. A region can
+ either be specified by :term:`reference`, `start` and
+ `end`. `start` and `end` denote 0-based, half-open
+ intervals.
+
+ Alternatively, a samtools :term:`region` string can be
+ supplied.
+
+ If any of the coordinates are missing they will be replaced by the
+ minimum (`start`) or maximum (`end`) coordinate.
+
+ Note that region strings are 1-based, while `start` and `end` denote
+ an interval in python coordinates.
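+
+ For example, ``parse_region(region="chr1:101-200")`` and
+ ``parse_region("chr1", 100, 200)`` describe the same interval.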
+
+ Returns
+ -------
+
+ tuple : a tuple of `flag`, :term:`tid`, `start` and `end`. The
+ flag indicates whether coordinates were supplied; if it is 0, no
+ coordinates were given and the genomic region covers the complete
+ genomic space.
+
+ Raises
+ ------
+
+ ValueError
+ for invalid or out of bounds regions.
+
+ """
+ cdef int rtid
+ cdef long long rstart
+ cdef long long rend
+
+ rtid = -1
+ rstart = 0
+ rend = MAX_POS
+ if start is not None:
+ try:
+ rstart = start
+ except OverflowError:
+ raise ValueError('start out of range (%i)' % start)
+
+ if end is not None:
+ try:
+ rend = end
+ except OverflowError:
+ raise ValueError('end out of range (%i)' % end)
+
+ if region:
+ region = force_str(region)
+ parts = re.split("[:-]", region)
+ reference = parts[0]
+ if len(parts) >= 2:
+ rstart = int(parts[1]) - 1
+ if len(parts) >= 3:
+ rend = int(parts[2])
+
+ if not reference:
+ return 0, 0, 0, 0
+
+ if tid is not None:
+ rtid = tid
+ else:
+ rtid = self.get_tid(reference)
+
+ if rtid < 0:
+ raise ValueError(
+ "invalid reference `%s`" % reference)
+ if rstart > rend:
+ raise ValueError(
+ 'invalid coordinates: start (%i) > end (%i)' % (rstart, rend))
+ if not 0 <= rstart < MAX_POS:
+ raise ValueError('start out of range (%i)' % rstart)
+ if not 0 <= rend <= MAX_POS:
+ raise ValueError('end out of range (%i)' % rend)
+
+ return 1, rtid, rstart, rend
+
+ def fetch(self,
+ reference=None,
+ start=None,
+ end=None,
+ region=None,
+ tid=None,
+ until_eof=False,
+ multiple_iterators=False):
+ """fetch reads aligned in a :term:`region`.
+
+ See :meth:`AlignmentFile.parse_region` for more information
+ on genomic regions.
+
+ Without a `reference` or `region` all mapped reads in the file
+ will be fetched. The reads will be returned ordered by reference
+ sequence, which will not necessarily be the order within the
+ file. This mode of iteration still requires an index. If there is
+ no index, use `until_eof=True`.
+
+ If only `reference` is set, all reads aligned to `reference`
+ will be fetched.
+
+ A :term:`SAM` file does not allow random access. If `region`
+ or `reference` are given, an exception is raised.
+
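+ For example (``samfile`` being an open, indexed
+ :class:`~pysam.AlignmentFile`)::
+
+     for read in samfile.fetch("chr1", 100, 120):
+         print(read.query_name)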
+
+ Parameters
+ ----------
+
+ until_eof : bool
+
+ If `until_eof` is True, all reads from the current file
+ position will be returned in order as they are within the
+ file. Using this option will also fetch unmapped reads.
+
+ multiple_iterators : bool
+
+ If `multiple_iterators` is True, multiple
+ iterators on the same file can be used at the same time. The
+ iterator returned will receive its own copy of a filehandle to
+ the file effectively re-opening the file. Re-opening a file
+ creates some overhead, so beware.
+
+ Returns
+ -------
+
+ An iterator over a collection of reads.
+
+ Raises
+ ------
+
+ ValueError
+ if the genomic coordinates are out of range or invalid or the
+ file does not permit random access to genomic coordinates.
+
+ """
+ cdef int rtid, rstart, rend, has_coord
+
+ if not self.is_open:
+ raise ValueError( "I/O operation on closed file" )
+
+ has_coord, rtid, rstart, rend = self.parse_region(
+ reference,
+ start,
+ end,
+ region,
+ tid)
+
+ # Turn off re-opening if htsfile is a stream
+ if self.is_stream:
+ multiple_iterators = False
+
+ if self.is_bam or self.is_cram:
+ if not until_eof and not self.is_remote:
+ if not self.has_index():
+ raise ValueError(
+ "fetch called on bamfile without index")
+
+ if has_coord:
+ return IteratorRowRegion(
+ self, rtid, rstart, rend,
+ multiple_iterators=multiple_iterators)
+ else:
+ if until_eof:
+ return IteratorRowAll(
+ self,
+ multiple_iterators=multiple_iterators)
+ else:
+ # AH: check - reason why no multiple_iterators for
+ # AllRefs?
+ return IteratorRowAllRefs(
+ self,
+ multiple_iterators=multiple_iterators)
+ else:
+ if has_coord:
+ raise ValueError(
+ "fetching by region is not available for sam files")
+
+ if self.header == NULL:
+ raise ValueError(
+ "fetch called for htsfile without header")
+
+ # check if targets are defined
+ # give a warning, as sam_read1 segfaults otherwise
+ if self.header.n_targets == 0:
+ warnings.warn("fetch called for htsfile without header")
+
+ return IteratorRowAll(self,
+ multiple_iterators=multiple_iterators)
+
+ def head(self, n, multiple_iterators=True):
+ '''return an iterator over the first n alignments.
+
+ This iterator is useful for inspecting the bam-file.
+
+ Parameters
+ ----------
+
+ multiple_iterators : bool
+
+ is set to True by default in order to
+ avoid changing the current file position.
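+
+ For example (illustrative)::
+
+     for read in samfile.head(10):
+         print(read.query_name)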
+
+ Returns
+ -------
+
+ an iterator over a collection of reads
+
+ '''
+ return IteratorRowHead(self, n,
+ multiple_iterators=multiple_iterators)
+
+ def mate(self, AlignedSegment read):
+ '''return the mate of :class:`~pysam.AlignedSegment` `read`.
+
+ .. note::
+
+ Calling this method will change the file position.
+ This might interfere with any iterators that have
+ not re-opened the file.
+
+ .. note::
+
+ This method is too slow for high-throughput processing.
+ If a read needs to be processed with its mate, work
+ from a read name sorted file or, better, cache reads.
+
+ Returns
+ -------
+
+ :class:`~pysam.AlignedSegment` : the mate
+
+ Raises
+ ------
+
+ ValueError
+ if the read is unpaired or the mate is unmapped
+
+ '''
+ cdef uint32_t flag = read._delegate.core.flag
+
+ if flag & BAM_FPAIRED == 0:
+ raise ValueError("read %s: is unpaired" %
+ (read.query_name))
+ if flag & BAM_FMUNMAP != 0:
+ raise ValueError("mate %s: is unmapped" %
+ (read.query_name))
+
+ # xor flags to get the other mate
+ cdef int x = BAM_FREAD1 + BAM_FREAD2
+ flag = (flag ^ x) & x
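+ # e.g. a read with BAM_FREAD1 set yields flag == BAM_FREAD2,
+ # i.e. the flag expected on the mate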
+
+ # Make sure to use a separate file to jump around
+ # to mate as otherwise the original file position
+ # will be lost
+ # The following code is not using the C API and
+ # could thus be made much quicker, for example
+ # by using tell and seek.
+ for mate in self.fetch(
+ read._delegate.core.mpos,
+ read._delegate.core.mpos + 1,
+ tid=read._delegate.core.mtid,
+ multiple_iterators=True):
+ if mate.flag & flag != 0 and \
+ mate.query_name == read.query_name:
+ break
+ else:
+ raise ValueError("mate not found")
+
+ return mate
+
+ def pileup(self,
+ reference=None,
+ start=None,
+ end=None,
+ region=None,
+ **kwargs):
+ """perform a :term:`pileup` within a :term:`region`. The region is
+ specified by :term:`reference`, 'start' and 'end' (using
+ 0-based indexing). Alternatively, a samtools 'region' string
+ can be supplied.
+
+ Without 'reference' or 'region' all reads will be used for the
+ pileup. The reads will be returned ordered by
+ :term:`reference` sequence, which will not necessarily be the
+ order within the file.
+
+ Note that :term:`SAM` formatted files do not allow random
+ access. In these files, if a 'region' or 'reference' are
+ given an exception is raised.
+
+ .. note::
+
+ *all* reads which overlap the region are returned. The
+ first base returned will be the first base of the first
+ read, *not* necessarily the first base of the region used
+ in the query.
+
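+ For example (``samfile`` being an open, indexed
+ :class:`~pysam.AlignmentFile`)::
+
+     for column in samfile.pileup("chr1", 100, 120):
+         print(column.reference_pos, column.nsegments)
+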
+ Parameters
+ ----------
+
+ stepper : string
+ The stepper controls how the iterator advances.
+ Possible options for the stepper are
+
+ ``all``
+ skip reads in which any of the following flags are set:
+ BAM_FUNMAP, BAM_FSECONDARY, BAM_FQCFAIL, BAM_FDUP
+
+ ``nofilter``
+ uses every single read
+
+ ``samtools``
+ same filter and read processing as in :term:`csamtools`
+ pileup. This requires a 'fastafile' to be given.
+
+
+ fastafile : :class:`~pysam.FastaFile` object.
+
+ This is required for some of the steppers.
+
+ max_depth : int
+ Maximum read depth permitted. The default limit is '8000'.
+
+ truncate : bool
+
+ By default, the samtools pileup engine outputs all reads
+ overlapping a region. If truncate is True and a region is
+ given, only columns in the exact region specified are
+ returned.
+
+ Returns
+ -------
+
+ an iterator over genomic positions.
+
+ """
+ cdef int rtid, rstart, rend, has_coord
+
+ if not self.is_open:
+ raise ValueError("I/O operation on closed file")
+
+ has_coord, rtid, rstart, rend = self.parse_region(
+ reference, start, end, region)
+
+ if self.is_bam or self.is_cram:
+ if not self.has_index():
+ raise ValueError("no index available for pileup")
+
+ if has_coord:
+ return IteratorColumnRegion(self,
+ tid=rtid,
+ start=rstart,
+ end=rend,
+ **kwargs )
+ else:
+ return IteratorColumnAllRefs(self, **kwargs )
+
+ else:
+ raise NotImplementedError(
+ "pileup of samfiles not implemented yet")
+
+ def count(self,
+ reference=None,
+ start=None,
+ end=None,
+ region=None,
+ until_eof=False,
+ read_callback="nofilter"):
+ '''count the number of reads in :term:`region`
+
+ The region is specified by :term:`reference`, `start` and
+ `end`. Alternatively, a :term:`samtools` :term:`region` string
+ can be supplied.
+
+ A :term:`SAM` file does not allow random access and if
+ `region` or `reference` are given, an exception is raised.
+
+ Parameters
+ ----------
+
+ reference : string
+ reference_name of the genomic region (chromosome)
+
+ start : int
+ start of the genomic region
+
+ end : int
+ end of the genomic region
+
+ region : string
+ a region string in samtools format.
+
+ until_eof : bool
+ count until the end of the file, possibly including
+ unmapped reads as well.
+
+ read_callback: string or function
+
+ select a call-back to ignore reads when counting. It can
+ be either a string with the following values:
+
+ ``all``
+ skip reads in which any of the following
+ flags are set: BAM_FUNMAP, BAM_FSECONDARY, BAM_FQCFAIL,
+ BAM_FDUP
+
+ ``nofilter``
+ uses every single read
+
+ Alternatively, `read_callback` can be a function
+ ``check_read(read)`` that should return True only for
+ those reads that shall be included in the counting.
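+
+ For example (illustrative)::
+
+     nreads = samfile.count("chr1", 100, 120)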
+
+ Raises
+ ------
+
+ ValueError
+ if the genomic coordinates are out of range or invalid.
+
+ '''
+ cdef AlignedSegment read
+ cdef long counter = 0
+
+ if not self.is_open:
+ raise ValueError("I/O operation on closed file")
+
+ cdef int filter_method = 0
+ if read_callback == "all":
+ filter_method = 1
+ elif read_callback == "nofilter":
+ filter_method = 2
+
+ for read in self.fetch(reference=reference,
+ start=start,
+ end=end,
+ region=region,
+ until_eof=until_eof):
+ # apply filter
+ if filter_method == 1:
+ # filter = "all"
+ if (read.flag & (0x4 | 0x100 | 0x200 | 0x400)):
+ continue
+ elif filter_method == 2:
+ # filter = "nofilter"
+ pass
+ else:
+ if not read_callback(read):
+ continue
+ counter += 1
+
+ return counter
+
+ @cython.boundscheck(False) # we do manual bounds checking
+ def count_coverage(self,
+ reference=None,
+ start=None,
+ end=None,
+ region=None,
+ quality_threshold=15,
+ read_callback='all'):
+ """count the coverage of genomic positions by reads in :term:`region`.
+
+ The region is specified by :term:`reference`, `start` and
+ `end`. Alternatively, a :term:`samtools` :term:`region` string
+ can be supplied. The coverage is computed per-base [ACGT].
+
+ Parameters
+ ----------
+
+ reference : string
+ reference_name of the genomic region (chromosome)
+
+ start : int
+ start of the genomic region
+
+ end : int
+ end of the genomic region
+
+ region : string
+ a region string.
+
+ quality_threshold : int
+ quality_threshold is the minimum quality score (in phred) a
+ base has to reach to be counted.
+
+ read_callback: string or function
+
+ select a call-back to ignore reads when counting. It can
+ be either a string with the following values:
+
+ ``all``
+ skip reads in which any of the following
+ flags are set: BAM_FUNMAP, BAM_FSECONDARY, BAM_FQCFAIL,
+ BAM_FDUP
+
+ ``nofilter``
+ uses every single read
+
+ Alternatively, `read_callback` can be a function
+ ``check_read(read)`` that should return True only for
+ those reads that shall be included in the counting.
+
+ Raises
+ ------
+
+ ValueError
+ if the genomic coordinates are out of range or invalid.
+
+ Returns
+ -------
+
+ four array.arrays of the same length in order A C G T : tuple
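+
+ For example (illustrative)::
+
+     count_a, count_c, count_g, count_t = samfile.count_coverage(
+         "chr1", 100, 120)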
+
+ """
+
+ cdef int _start = start
+ cdef int _stop = end
+ cdef int length = _stop - _start
+ cdef c_array.array int_array_template = array.array('L', [])
+ cdef c_array.array count_a
+ cdef c_array.array count_c
+ cdef c_array.array count_g
+ cdef c_array.array count_t
+ count_a = c_array.clone(int_array_template, length, zero=True)
+ count_c = c_array.clone(int_array_template, length, zero=True)
+ count_g = c_array.clone(int_array_template, length, zero=True)
+ count_t = c_array.clone(int_array_template, length, zero=True)
+
+ cdef AlignedSegment read
+ cdef cython.str seq
+ cdef c_array.array quality
+ cdef int qpos
+ cdef int refpos
+ cdef int c = 0
+ cdef int filter_method = 0
+ if read_callback == "all":
+ filter_method = 1
+ elif read_callback == "nofilter":
+ filter_method = 2
+
+ cdef int _threshold = quality_threshold
+ for read in self.fetch(reference=reference,
+ start=start,
+ end=end,
+ region=region):
+ # apply filter
+ if filter_method == 1:
+ # filter = "all"
+ if (read.flag & (0x4 | 0x100 | 0x200 | 0x400)):
+ continue
+ elif filter_method == 2:
+ # filter = "nofilter"
+ pass
+ else:
+ if not read_callback(read):
+ continue
+
+ # count
+ seq = read.seq
+ quality = read.query_qualities
+ for qpos, refpos in read.get_aligned_pairs(True):
+ if qpos is not None and refpos is not None and \
+ _start <= refpos < _stop:
+                    if quality[qpos] >= _threshold:
+                        if seq[qpos] == 'A':
+                            count_a.data.as_ulongs[refpos - _start] += 1
+                        elif seq[qpos] == 'C':
+                            count_c.data.as_ulongs[refpos - _start] += 1
+                        elif seq[qpos] == 'G':
+                            count_g.data.as_ulongs[refpos - _start] += 1
+                        elif seq[qpos] == 'T':
+                            count_t.data.as_ulongs[refpos - _start] += 1
+
+ return count_a, count_c, count_g, count_t
+
+ def find_introns(self, read_iterator):
+ """Return a dictionary {(start, stop): count}
+ Listing the intronic sites in the reads (identified by 'N' in the cigar strings),
+ and their support ( = number of reads ).
+
+ read_iterator can be the result of a .fetch(...) call.
+ Or it can be a generator filtering such reads. Example
+ samfile.find_introns((read for read in samfile.fetch(...) if read.is_reverse)
+ """
+ import collections
+ res = collections.Counter()
+ for r in read_iterator:
+ if 'N' in r.cigarstring:
+ last_read_pos = False
+ for read_loc, genome_loc in r.get_aligned_pairs():
+ if read_loc is None and last_read_pos:
+ start = genome_loc
+ elif read_loc and last_read_pos is None:
+                        stop = genome_loc  # we are right-exclusive, so this is correct
+ res[(start, stop)] += 1
+ del start
+ del stop
+ last_read_pos = read_loc
+ return res
+
+ def close(self):
+ '''
+ closes the :class:`pysam.AlignmentFile`.'''
+
+ if self.htsfile == NULL:
+ return
+
+ cdef int ret = hts_close(self.htsfile)
+ hts_idx_destroy(self.index)
+ self.htsfile = NULL
+
+ if ret < 0:
+ global errno
+ if errno == EPIPE:
+ errno = 0
+ else:
+ raise OSError(errno, force_str(strerror(errno)))
+
+ def __dealloc__(self):
+ # remember: dealloc cannot call other methods
+ # note: no doc string
+ # note: __del__ is not called.
+
+ # FIXME[kbj]: isn't self.close a method? I've been duplicating
+ # close within __dealloc__ (see BCFFile.__dealloc__). Not a pretty
+ # solution and perhaps unnecessary given that calling self.close has
+ # been working for years.
+        # AH: I have removed the call to close. Even though it is working,
+        # it seems to be dangerous according to the documentation, as the
+        # object may already be partially deconstructed.
+ cdef int ret = 0
+
+ if self.htsfile != NULL:
+ ret = hts_close(self.htsfile)
+ hts_idx_destroy(self.index);
+ self.htsfile = NULL
+
+ bam_destroy1(self.b)
+ if self.header != NULL:
+ bam_hdr_destroy(self.header)
+
+
+ if ret < 0:
+ global errno
+ if errno == EPIPE:
+ errno = 0
+ else:
+ raise OSError(errno, force_str(strerror(errno)))
+
+ cpdef int write(self, AlignedSegment read) except -1:
+ '''
+ write a single :class:`pysam.AlignedSegment` to disk.
+
+ Raises
+ ------
+        IOError
+ if the writing failed
+
+ Returns
+ -------
+
+ int : the number of bytes written. If the file is closed,
+ this will be 0.
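+
+        Example (a minimal sketch; assumes hypothetical files ``in.bam``
+        and ``out.bam``, copying all reads from one to the other)::
+
+            with pysam.AlignmentFile("in.bam", "rb") as inf, \
+                 pysam.AlignmentFile("out.bam", "wb", template=inf) as outf:
+                for read in inf:
+                    outf.write(read)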
+ '''
+ if not self.is_open:
+ return 0
+
+ cdef int ret
+
+ with nogil:
+ ret = sam_write1(self.htsfile,
+ self.header,
+ read._delegate)
+
+ # kbj: Still need to raise an exception with except -1. Otherwise
+ # when ret == -1 we get a "SystemError: error return without
+ # exception set".
+ if ret < 0:
+ raise IOError(
+ "sam_write1 failed with error code {}".format(ret))
+
+ return ret
+
+ # context manager interface
+ def __enter__(self):
+ return self
+
+ def __exit__(self, exc_type, exc_value, traceback):
+ self.close()
+ return False
+
+ ###############################################################
+ ###############################################################
+ ###############################################################
+ ## properties
+ ###############################################################
+ property nreferences:
+ """"int with the number of :term:`reference` sequences in the file.
+ This is a read-only attribute."""
+ def __get__(self):
+ if not self.is_open:
+ raise ValueError("I/O operation on closed file")
+ return self.header.n_targets
+
+ property references:
+ """tuple with the names of :term:`reference` sequences. This is a
+        read-only attribute."""
+ def __get__(self):
+ if not self.is_open: raise ValueError( "I/O operation on closed file" )
+ t = []
+ for x from 0 <= x < self.header.n_targets:
+ t.append(charptr_to_str(self.header.target_name[x]))
+ return tuple(t)
+
+ property lengths:
+ """tuple of the lengths of the :term:`reference` sequences. This is a
+ read-only attribute. The lengths are in the same order as
+ :attr:`pysam.AlignmentFile.references`
+
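+        Example (a minimal sketch)::
+
+            for name, length in zip(samfile.references, samfile.lengths):
+                print(name, length)
+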
+ """
+ def __get__(self):
+ if not self.is_open:
+ raise ValueError("I/O operation on closed file")
+ t = []
+ for x from 0 <= x < self.header.n_targets:
+ t.append(self.header.target_len[x])
+ return tuple(t)
+
+ property mapped:
+ """int with total number of mapped alignments according to the
+ statistics recorded in the index. This is a read-only
+ attribute.
+ """
+ def __get__(self):
+ self.check_index()
+ cdef int tid
+ cdef uint64_t total = 0
+ cdef uint64_t mapped, unmapped
+ for tid from 0 <= tid < self.header.n_targets:
+ with nogil:
+ hts_idx_get_stat(self.index, tid, &mapped, &unmapped)
+ total += mapped
+ return total
+
+ property unmapped:
+ """int with total number of unmapped reads according to the statistics
+        recorded in the index. This count includes reads without
+        coordinates. This is a read-only attribute.
+ """
+ def __get__(self):
+ self.check_index()
+ cdef int tid
+ cdef uint64_t total = hts_idx_get_n_no_coor(self.index)
+ cdef uint64_t mapped, unmapped
+ for tid from 0 <= tid < self.header.n_targets:
+ with nogil:
+ hts_idx_get_stat(self.index, tid, &mapped, &unmapped)
+ total += unmapped
+ return total
+
+ property nocoordinate:
+ """int with total number of reads without coordinates according to the
+ statistics recorded in the index. This is a read-only attribute.
+ """
+ def __get__(self):
+ self.check_index()
+ cdef uint64_t n
+ with nogil:
+ n = hts_idx_get_n_no_coor(self.index)
+ return n
+
+ property text:
+        '''full contents of the :term:`sam file` header as a string.
+
+ This is a read-only attribute.
+
+ See :attr:`pysam.AlignmentFile.header` to get a parsed
+ representation of the header.
+ '''
+ def __get__(self):
+ if not self.is_open:
+ raise ValueError( "I/O operation on closed file" )
+ return from_string_and_size(self.header.text, self.header.l_text)
+
+ property header:
+ """two-level dictionay with header information from the file.
+
+ This is a read-only attribute.
+
+ The first level contains the record (``HD``, ``SQ``, etc) and
+ the second level contains the fields (``VN``, ``LN``, etc).
+
+        The parser is validating and will raise an AssertionError if
+        it encounters any record or field tags that are not part of
+ the SAM specification. Use the
+ :attr:`pysam.AlignmentFile.text` attribute to get the unparsed
+ header.
+
+ The parsing follows the SAM format specification with the
+        exception of the ``CL`` field, which consumes the rest of the
+        header line irrespective of any additional fields.
+ This behaviour has been added to accommodate command line
+ options that contain characters that are not valid field
+ separators.
+
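+        Example (a minimal sketch; assumes a hypothetical ``ex1.bam``)::
+
+            with pysam.AlignmentFile("ex1.bam", "rb") as samfile:
+                hd = samfile.header
+                # hd might look like:
+                # {'HD': {'VN': '1.0'}, 'SQ': [{'SN': 'chr1', 'LN': 1575}]}
+                for sq in hd.get("SQ", []):
+                    print(sq["SN"], sq["LN"])
+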
+ """
+ def __get__(self):
+ if not self.is_open:
+ raise ValueError( "I/O operation on closed file" )
+
+ result = {}
+
+ if self.header.text != NULL:
+ # convert to python string (note: call self.text to
+ # create 0-terminated string)
+ t = self.text
+ for line in t.split("\n"):
+ if not line.strip(): continue
+ assert line.startswith("@"), \
+ "header line without '@': '%s'" % line
+ fields = line[1:].split("\t")
+ record = fields[0]
+ assert record in VALID_HEADER_TYPES, \
+ "header line with invalid type '%s': '%s'" % (record, line)
+
+ # treat comments
+ if record == "CO":
+ if record not in result:
+ result[record] = []
+ result[record].append("\t".join( fields[1:]))
+ continue
+ # the following is clumsy as generators do not work?
+ x = {}
+
+ for idx, field in enumerate(fields[1:]):
+ if ":" not in field:
+                            raise ValueError("malformed header: no ':' in field")
+ key, value = field.split(":", 1)
+ if key in ("CL",):
+ # special treatment for command line
+ # statements (CL). These might contain
+ # characters that are non-conformant with
+ # the valid field separators in the SAM
+ # header. Thus, in contravention to the
+ # SAM API, consume the rest of the line.
+ key, value = "\t".join(fields[idx+1:]).split(":", 1)
+ x[key] = KNOWN_HEADER_FIELDS[record][key](value)
+ break
+
+ # interpret type of known header record tags, default to str
+ x[key] = KNOWN_HEADER_FIELDS[record].get(key, str)(value)
+
+ if VALID_HEADER_TYPES[record] == dict:
+ if record in result:
+ raise ValueError(
+ "multiple '%s' lines are not permitted" % record)
+
+ result[record] = x
+ elif VALID_HEADER_TYPES[record] == list:
+ if record not in result: result[record] = []
+ result[record].append(x)
+
+ # if there are no SQ lines in the header, add the
+ # reference names from the information in the bam
+ # file.
+ #
+ # Background: c-samtools keeps the textual part of the
+ # header separate from the list of reference names and
+            # lengths. Thus, if a header contains only SQ lines,
+            # the SQ information is not part of the textual header
+            # and is thus missing from the output. See issue 84.
+ if "SQ" not in result:
+ sq = []
+ for ref, length in zip(self.references, self.lengths):
+ sq.append({'LN': length, 'SN': ref })
+ result["SQ"] = sq
+
+ return result
+
+ ###############################################################
+ ## file-object like iterator access
+ ## note: concurrent access will cause errors (see IteratorRow
+ ## and multiple_iterators)
+ ## Possible solutions: deprecate or open new file handle
+ def __iter__(self):
+ if not self.is_open:
+ raise ValueError("I/O operation on closed file")
+
+ if not self.is_bam and self.header.n_targets == 0:
+ raise NotImplementedError(
+ "can not iterate over samfile without header")
+ return self
+
+ cdef bam1_t * getCurrent( self ):
+ return self.b
+
+ cdef int cnext(self):
+ '''
+ cversion of iterator. Used by :class:`pysam.AlignmentFile.IteratorColumn`.
+ '''
+ cdef int ret
+ with nogil:
+ ret = sam_read1(self.htsfile,
+ self.header,
+ self.b)
+ return ret
+
+ def __next__(self):
+ cdef int ret = self.cnext()
+ if (ret >= 0):
+ return makeAlignedSegment(self.b, self)
+ elif ret == -2:
+ raise IOError('truncated file')
+ else:
+ raise StopIteration
+
+ # Compatibility functions for pysam < 0.8.3
+ def gettid(self, reference):
+ """deprecated, use get_tid() instead"""
+ return self.get_tid(reference)
+
+ def getrname(self, tid):
+ """deprecated, use get_reference_name() instead"""
+ return self.get_reference_name(tid)
+
+
+cdef class IteratorRow:
+ '''abstract base class for iterators over mapped reads.
+
+ Various iterators implement different behaviours for wrapping around
+ contig boundaries. Examples include:
+
+ :class:`pysam.IteratorRowRegion`
+ iterate within a single contig and a defined region.
+
+ :class:`pysam.IteratorRowAll`
+ iterate until EOF. This iterator will also include unmapped reads.
+
+ :class:`pysam.IteratorRowAllRefs`
+ iterate over all reads in all reference sequences.
+
+ The method :meth:`AlignmentFile.fetch` returns an IteratorRow.
+
+ .. note::
+
+ It is usually not necessary to create an object of this class
+        explicitly. It is returned as the result of a call to
+        :meth:`AlignmentFile.fetch`.
+
+ '''
+
+ def __init__(self, AlignmentFile samfile, int multiple_iterators=False):
+ cdef char *cfilename
+ cdef char *creference_filename
+
+ if not samfile.is_open:
+ raise ValueError("I/O operation on closed file")
+
+ # makes sure that samfile stays alive as long as the
+ # iterator is alive
+ self.samfile = samfile
+
+ # reopen the file - note that this makes the iterator
+ # slow and causes pileup to slow down significantly.
+ if multiple_iterators:
+ cfilename = samfile.filename
+ with nogil:
+ self.htsfile = hts_open(cfilename, 'r')
+ assert self.htsfile != NULL
+ # read header - required for accurate positioning
+ # could a tell/seek work?
+ with nogil:
+ self.header = sam_hdr_read(self.htsfile)
+ assert self.header != NULL
+ self.owns_samfile = True
+ # options specific to CRAM files
+ if samfile.is_cram and samfile.reference_filename:
+ creference_filename = samfile.reference_filename
+ hts_set_opt(self.htsfile,
+ CRAM_OPT_REFERENCE,
+ creference_filename)
+
+ else:
+ self.htsfile = self.samfile.htsfile
+ self.owns_samfile = False
+ self.header = self.samfile.header
+
+ self.retval = 0
+
+ self.b = bam_init1()
+
+ def __dealloc__(self):
+ bam_destroy1(self.b)
+ if self.owns_samfile:
+ hts_close(self.htsfile)
+ bam_hdr_destroy(self.header)
+
+
+cdef class IteratorRowRegion(IteratorRow):
+ """*(AlignmentFile samfile, int tid, int beg, int end,
+ int multiple_iterators=False)*
+
+ iterate over mapped reads in a region.
+
+ .. note::
+
+ It is usually not necessary to create an object of this class
+        explicitly. It is returned as the result of a call to
+        :meth:`AlignmentFile.fetch`.
+
+ """
+
+ def __init__(self, AlignmentFile samfile,
+ int tid, int beg, int end,
+ int multiple_iterators=False):
+
+ IteratorRow.__init__(self, samfile,
+ multiple_iterators=multiple_iterators)
+
+ if not samfile.has_index():
+ raise ValueError("no index available for iteration")
+
+ with nogil:
+ self.iter = sam_itr_queryi(
+ self.samfile.index,
+ tid,
+ beg,
+ end)
+
+ def __iter__(self):
+ return self
+
+ cdef bam1_t * getCurrent(self):
+ return self.b
+
+ cdef int cnext(self):
+ '''cversion of iterator. Used by IteratorColumn'''
+ with nogil:
+ self.retval = hts_itr_next(hts_get_bgzfp(self.htsfile),
+ self.iter,
+ self.b,
+ self.htsfile)
+
+ def __next__(self):
+ self.cnext()
+ if self.retval >= 0:
+ return makeAlignedSegment(self.b, self.samfile)
+ elif self.retval == -2:
+ # Note: it is currently not the case that hts_iter_next
+ # returns -2 for a truncated file.
+ # See https://github.com/pysam-developers/pysam/pull/50#issuecomment-64928625
+ raise IOError('truncated file')
+ else:
+ raise StopIteration
+
+ def __dealloc__(self):
+ hts_itr_destroy(self.iter)
+
+
+cdef class IteratorRowHead(IteratorRow):
+ """*(AlignmentFile samfile, n, int multiple_iterators=False)*
+
+ iterate over first n reads in `samfile`
+
+ .. note::
+ It is usually not necessary to create an object of this class
+        explicitly. It is returned as the result of a call to
+        :meth:`AlignmentFile.head`.
+
+ """
+
+ def __init__(self, AlignmentFile samfile, int n,
+ int multiple_iterators=False):
+
+ IteratorRow.__init__(self, samfile,
+ multiple_iterators=multiple_iterators)
+
+ self.max_rows = n
+ self.current_row = 0
+
+ def __iter__(self):
+ return self
+
+ cdef bam1_t * getCurrent( self ):
+ return self.b
+
+ cdef int cnext(self):
+ '''cversion of iterator. Used by IteratorColumn'''
+ cdef int ret
+ with nogil:
+ ret = sam_read1(self.htsfile,
+ self.samfile.header,
+ self.b)
+ return ret
+
+ def __next__(self):
+ if self.current_row >= self.max_rows:
+ raise StopIteration
+
+ cdef int ret = self.cnext()
+ if ret >= 0:
+ self.current_row += 1
+ return makeAlignedSegment(self.b, self.samfile)
+ elif ret == -2:
+ raise IOError('truncated file')
+ else:
+ raise StopIteration
+
+
+cdef class IteratorRowAll(IteratorRow):
+ """*(AlignmentFile samfile, int multiple_iterators=False)*
+
+ iterate over all reads in `samfile`
+
+ .. note::
+
+ It is usually not necessary to create an object of this class
+        explicitly. It is returned as the result of a call to
+        :meth:`AlignmentFile.fetch`.
+
+ """
+
+ def __init__(self, AlignmentFile samfile,
+ int multiple_iterators=False):
+
+ IteratorRow.__init__(self, samfile,
+ multiple_iterators=multiple_iterators)
+
+ def __iter__(self):
+ return self
+
+ cdef bam1_t * getCurrent( self ):
+ return self.b
+
+ cdef int cnext(self):
+ '''cversion of iterator. Used by IteratorColumn'''
+ cdef int ret
+ with nogil:
+ ret = sam_read1(self.htsfile,
+ self.samfile.header,
+ self.b)
+ return ret
+
+ def __next__(self):
+ cdef int ret = self.cnext()
+ if ret >= 0:
+ return makeAlignedSegment(self.b, self.samfile)
+ elif ret == -2:
+ raise IOError('truncated file')
+ else:
+ raise StopIteration
+
+
+cdef class IteratorRowAllRefs(IteratorRow):
+ """iterates over all mapped reads by chaining iterators over each
+ reference
+
+ .. note::
+ It is usually not necessary to create an object of this class
+        explicitly. It is returned as the result of a call to
+        :meth:`AlignmentFile.fetch`.
+
+ """
+
+ def __init__(self, AlignmentFile samfile,
+ multiple_iterators=False):
+
+ IteratorRow.__init__(self, samfile,
+ multiple_iterators=multiple_iterators)
+
+ if not samfile.has_index():
+ raise ValueError("no index available for fetch")
+
+ self.tid = -1
+
+ def nextiter(self):
+ # get a new iterator for a chromosome. The file
+ # will not be re-opened.
+ self.rowiter = IteratorRowRegion(self.samfile,
+ self.tid,
+ 0,
+ 1<<29)
+ # set htsfile and header of the rowiter
+ # to the values in this iterator to reflect multiple_iterators
+ self.rowiter.htsfile = self.htsfile
+ self.rowiter.header = self.header
+
+        # make sure the iterator understands that IteratorRowAllRefs
+        # has ownership
+ self.rowiter.owns_samfile = False
+
+ def __iter__(self):
+ return self
+
+ def __next__(self):
+ # Create an initial iterator
+ if self.tid == -1:
+ if not self.samfile.nreferences:
+ raise StopIteration
+ self.tid = 0
+ self.nextiter()
+
+ while 1:
+ self.rowiter.cnext()
+
+ # If current iterator is not exhausted, return aligned read
+ if self.rowiter.retval > 0:
+ return makeAlignedSegment(self.rowiter.b, self.samfile)
+
+ self.tid += 1
+
+ # Otherwise, proceed to next reference or stop
+ if self.tid < self.samfile.nreferences:
+ self.nextiter()
+ else:
+ raise StopIteration
+
+
+cdef class IteratorRowSelection(IteratorRow):
+ """*(AlignmentFile samfile)*
+
+ iterate over reads in `samfile` at a given list of file positions.
+
+ .. note::
+ It is usually not necessary to create an object of this class
+        explicitly. It is returned as the result of a call to :meth:`AlignmentFile.fetch`.
+ """
+
+ def __init__(self, AlignmentFile samfile, positions, int multiple_iterators=True):
+
+ IteratorRow.__init__(self, samfile, multiple_iterators=multiple_iterators)
+
+ self.positions = positions
+ self.current_pos = 0
+
+ def __iter__(self):
+ return self
+
+ cdef bam1_t * getCurrent(self):
+ return self.b
+
+ cdef int cnext(self):
+ '''cversion of iterator'''
+ # end iteration if out of positions
+ if self.current_pos >= len(self.positions): return -1
+
+ cdef uint64_t pos = self.positions[self.current_pos]
+ with nogil:
+ bgzf_seek(hts_get_bgzfp(self.htsfile),
+ pos,
+ 0)
+ self.current_pos += 1
+
+ cdef int ret
+ with nogil:
+ ret = sam_read1(self.htsfile,
+ self.samfile.header,
+ self.b)
+ return ret
+
+ def __next__(self):
+ cdef int ret = self.cnext()
+ if (ret >= 0):
+ return makeAlignedSegment(self.b, self.samfile)
+ elif (ret == -2):
+ raise IOError('truncated file')
+ else:
+ raise StopIteration
+
+
+cdef int __advance_nofilter(void *data, bam1_t *b):
+ '''advance without any read filtering.
+ '''
+ cdef __iterdata * d
+ d = <__iterdata*>data
+ cdef int ret
+ with nogil:
+ ret = sam_itr_next(d.htsfile, d.iter, b)
+ return ret
+
+
+cdef int __advance_all(void *data, bam1_t *b):
+ '''only use reads for pileup passing basic
+ filters:
+
+ BAM_FUNMAP, BAM_FSECONDARY, BAM_FQCFAIL, BAM_FDUP
+ '''
+
+ cdef __iterdata * d
+    cdef int mask = BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP
+ d = <__iterdata*>data
+ cdef int ret
+ with nogil:
+ ret = sam_itr_next(d.htsfile, d.iter, b)
+ while ret >= 0 and b.core.flag & mask:
+ with nogil:
+ ret = sam_itr_next(d.htsfile, d.iter, b)
+ return ret
+
+
+cdef int __advance_snpcalls(void * data, bam1_t * b):
+ '''advance using same filter and read processing as in
+ the samtools pileup.
+ '''
+
+    # Note that this method requires access to some
+    # functions in the samtools code base and is thus
+    # not htslib-only.
+ # The functions accessed in samtools are:
+ # 1. bam_prob_realn
+ # 2. bam_cap_mapQ
+ cdef __iterdata * d
+ d = <__iterdata*>data
+
+ cdef int ret
+ cdef int skip = 0
+ cdef int q
+ cdef int is_cns = 1
+ cdef int is_nobaq = 0
+ cdef int capQ_thres = 0
+
+ with nogil:
+ ret = sam_itr_next(d.htsfile, d.iter, b)
+
+ # reload sequence
+ if d.fastafile != NULL and b.core.tid != d.tid:
+ if d.seq != NULL:
+ free(d.seq)
+ d.tid = b.core.tid
+ with nogil:
+ d.seq = faidx_fetch_seq(
+ d.fastafile,
+ d.header.target_name[d.tid],
+ 0, MAX_POS,
+ &d.seq_len)
+
+ if d.seq == NULL:
+ raise ValueError(
+ "reference sequence for '%s' (tid=%i) not found" % \
+ (d.header.target_name[d.tid],
+ d.tid))
+
+ while ret >= 0:
+ skip = 0
+
+ # realign read - changes base qualities
+ if d.seq != NULL and is_cns and not is_nobaq:
+ bam_prob_realn(b, d.seq)
+
+ if d.seq != NULL and capQ_thres > 10:
+ q = bam_cap_mapQ(b, d.seq, capQ_thres)
+ if q < 0:
+ skip = 1
+ elif b.core.qual > q:
+ b.core.qual = q
+ if b.core.flag & BAM_FUNMAP:
+ skip = 1
+ elif b.core.flag & 1 and not b.core.flag & 2:
+ skip = 1
+
+ if not skip:
+ break
+ # additional filters
+
+ with nogil:
+ ret = sam_itr_next(d.htsfile, d.iter, b)
+
+ return ret
+
+cdef class IteratorColumn:
+ '''abstract base class for iterators over columns.
+
+ IteratorColumn objects wrap the pileup functionality of samtools.
+
+ For reasons of efficiency, the iterator points to the current
+ pileup buffer. The pileup buffer is updated at every iteration.
+    This might cause some unexpected behaviour. For example,
+ consider the conversion to a list::
+
+ f = AlignmentFile("file.bam", "rb")
+ result = list( f.pileup() )
+
+ Here, ``result`` will contain ``n`` objects of type
+ :class:`~pysam.PileupColumn` for ``n`` columns, but each object in
+ ``result`` will contain the same information.
+
+ The desired behaviour can be achieved by list comprehension::
+
+        result = [x.pileups for x in f.pileup()]
+
+ ``result`` will be a list of ``n`` lists of objects of type
+ :class:`~pysam.PileupRead`.
+
+ If the iterator is associated with a :class:`~pysam.Fastafile` using the
+ :meth:`addReference` method, then the iterator will export the
+ current sequence via the methods :meth:`getSequence` and
+ :meth:`seq_len`.
+
+ Optional kwargs to the iterator:
+
+ stepper
+ The stepper controls how the iterator advances.
+
+ Valid values are None, "all" (default), "nofilter" or "samtools".
+
+ See AlignmentFile.pileup for description.
+
+ fastafile
+ A :class:`~pysam.FastaFile` object
+
+ max_depth
+ maximum read depth. The default is 8000.
+
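+    Example (a minimal sketch; `stepper` is passed through by
+    :meth:`AlignmentFile.pileup`, and each column is assumed to expose
+    ``pos`` and ``n``)::
+
+        for column in samfile.pileup("chr1", 100, 120, stepper="nofilter"):
+            print(column.pos, column.n)
+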
+ '''
+
+ def __cinit__( self, AlignmentFile samfile, **kwargs ):
+ self.samfile = samfile
+ self.fastafile = kwargs.get("fastafile", None)
+ self.stepper = kwargs.get("stepper", None)
+ self.max_depth = kwargs.get("max_depth", 8000)
+ self.iterdata.seq = NULL
+ self.tid = 0
+ self.pos = 0
+ self.n_plp = 0
+ self.plp = NULL
+ self.pileup_iter = <bam_plp_t>NULL
+
+ def __iter__(self):
+ return self
+
+ cdef int cnext(self):
+ '''perform next iteration.
+ '''
+ # do not release gil here because of call-backs
+ self.plp = bam_plp_auto(self.pileup_iter,
+ &self.tid,
+ &self.pos,
+ &self.n_plp)
+
+ cdef char * getSequence(self):
+ '''return current reference sequence underlying the iterator.
+ '''
+ return self.iterdata.seq
+
+ property seq_len:
+ '''current sequence length.'''
+ def __get__(self):
+ return self.iterdata.seq_len
+
+ def addReference(self, Fastafile fastafile):
+ '''
+ add reference sequences in `fastafile` to iterator.'''
+ self.fastafile = fastafile
+ if self.iterdata.seq != NULL:
+ free(self.iterdata.seq)
+ self.iterdata.tid = -1
+ self.iterdata.fastafile = self.fastafile.fastafile
+
+ def hasReference(self):
+ '''
+ return true if iterator is associated with a reference'''
+ return self.fastafile
+
+ cdef setMask(self, mask):
+ '''set masking flag in iterator.
+
+ reads with bits set in `mask` will be skipped.
+ '''
+ raise NotImplementedError()
+ # self.mask = mask
+ # bam_plp_set_mask( self.pileup_iter, self.mask )
+
+ cdef setupIteratorData( self,
+ int tid,
+ int start,
+ int end,
+ int multiple_iterators=0 ):
+ '''setup the iterator structure'''
+
+ self.iter = IteratorRowRegion(self.samfile, tid, start, end, multiple_iterators)
+ self.iterdata.htsfile = self.samfile.htsfile
+ self.iterdata.iter = self.iter.iter
+ self.iterdata.seq = NULL
+ self.iterdata.tid = -1
+ self.iterdata.header = self.samfile.header
+
+ if self.fastafile is not None:
+ self.iterdata.fastafile = self.fastafile.fastafile
+ else:
+ self.iterdata.fastafile = NULL
+
+ # Free any previously allocated memory before reassigning
+ # pileup_iter
+ self._free_pileup_iter()
+
+ if self.stepper is None or self.stepper == "all":
+ with nogil:
+ self.pileup_iter = bam_plp_init(
+ <bam_plp_auto_f>&__advance_all,
+ &self.iterdata)
+ elif self.stepper == "nofilter":
+ with nogil:
+ self.pileup_iter = bam_plp_init(
+ <bam_plp_auto_f>&__advance_nofilter,
+ &self.iterdata)
+ elif self.stepper == "samtools":
+ with nogil:
+ self.pileup_iter = bam_plp_init(
+ <bam_plp_auto_f>&__advance_snpcalls,
+ &self.iterdata)
+ else:
+ raise ValueError(
+ "unknown stepper option `%s` in IteratorColumn" % self.stepper)
+
+ if self.max_depth:
+ with nogil:
+ bam_plp_set_maxcnt(self.pileup_iter, self.max_depth)
+
+ # bam_plp_set_mask( self.pileup_iter, self.mask )
+
+ cdef reset( self, tid, start, end ):
+ '''reset iterator position.
+
+ This permits using the iterator multiple times without
+ having to incur the full set-up costs.
+ '''
+ self.iter = IteratorRowRegion( self.samfile, tid, start, end, multiple_iterators = 0 )
+ self.iterdata.iter = self.iter.iter
+
+ # invalidate sequence if different tid
+ if self.tid != tid:
+ if self.iterdata.seq != NULL:
+ free(self.iterdata.seq)
+ self.iterdata.seq = NULL
+ self.iterdata.tid = -1
+
+ # self.pileup_iter = bam_plp_init( &__advancepileup, &self.iterdata )
+ with nogil:
+ bam_plp_reset(self.pileup_iter)
+
+ cdef _free_pileup_iter(self):
+ '''free the memory alloc'd by bam_plp_init.
+
+ This is needed before setupIteratorData allocates
+ another pileup_iter, or else memory will be lost.
+ '''
+ if self.pileup_iter != <bam_plp_t>NULL:
+ with nogil:
+ bam_plp_reset(self.pileup_iter)
+ bam_plp_destroy(self.pileup_iter)
+ self.pileup_iter = <bam_plp_t>NULL
+
+ def __dealloc__(self):
+ # reset in order to avoid memory leak messages for iterators
+ # that have not been fully consumed
+ self._free_pileup_iter()
+ self.plp = <bam_pileup1_t*>NULL
+
+ if self.iterdata.seq != NULL:
+ free(self.iterdata.seq)
+ self.iterdata.seq = NULL
+
+
+cdef class IteratorColumnRegion(IteratorColumn):
+ '''iterates over a region only.
+ '''
+ def __cinit__(self, AlignmentFile samfile,
+ int tid = 0,
+ int start = 0,
+ int end = MAX_POS,
+ int truncate = False,
+ **kwargs ):
+
+ # initialize iterator
+ self.setupIteratorData(tid, start, end, 1)
+ self.start = start
+ self.end = end
+ self.truncate = truncate
+
+ def __next__(self):
+
+ while 1:
+ self.cnext()
+ if self.n_plp < 0:
+ raise ValueError("error during iteration" )
+
+ if self.plp == NULL:
+ raise StopIteration
+
+ if self.truncate:
+ if self.start > self.pos: continue
+ if self.pos >= self.end: raise StopIteration
+
+ return makePileupColumn(&self.plp,
+ self.tid,
+ self.pos,
+ self.n_plp,
+ self.samfile)
+
+
+cdef class IteratorColumnAllRefs(IteratorColumn):
+ """iterates over all columns by chaining iterators over each reference
+ """
+
+ def __cinit__(self,
+ AlignmentFile samfile,
+ **kwargs):
+
+ # no iteration over empty files
+ if not samfile.nreferences:
+ raise StopIteration
+
+ # initialize iterator
+ self.setupIteratorData(self.tid, 0, MAX_POS, 1)
+
+ def __next__(self):
+
+ while 1:
+ self.cnext()
+
+ if self.n_plp < 0:
+ raise ValueError("error during iteration" )
+
+ # return result, if within same reference
+ if self.plp != NULL:
+ return makePileupColumn(&self.plp,
+ self.tid,
+ self.pos,
+ self.n_plp,
+ self.samfile)
+
+ # otherwise, proceed to next reference or stop
+ self.tid += 1
+ if self.tid < self.samfile.nreferences:
+ self.setupIteratorData(self.tid, 0, MAX_POS, 0)
+ else:
+ raise StopIteration
+
+
+cdef class SNPCall:
+ '''the results of a SNP call.'''
+ cdef int _tid
+ cdef int _pos
+ cdef char _reference_base
+ cdef char _genotype
+ cdef int _consensus_quality
+ cdef int _snp_quality
+ cdef int _rms_mapping_quality
+ cdef int _coverage
+
+ property tid:
+        '''the chromosome ID as defined in the header'''
+ def __get__(self):
+ return self._tid
+
+ property pos:
+ '''nucleotide position of SNP.'''
+ def __get__(self): return self._pos
+
+ property reference_base:
+ '''reference base at pos. ``N`` if no reference sequence supplied.'''
+ def __get__(self): return from_string_and_size( &self._reference_base, 1 )
+
+ property genotype:
+ '''the genotype called.'''
+ def __get__(self): return from_string_and_size( &self._genotype, 1 )
+
+ property consensus_quality:
+ '''the genotype quality (Phred-scaled).'''
+ def __get__(self): return self._consensus_quality
+
+ property snp_quality:
+ '''the snp quality (Phred scaled) - probability of consensus being
+ identical to reference sequence.'''
+ def __get__(self): return self._snp_quality
+
+ property mapping_quality:
+ '''the root mean square (rms) of the mapping quality of all reads
+ involved in the call.'''
+ def __get__(self): return self._rms_mapping_quality
+
+ property coverage:
+ '''coverage or read depth - the number of reads involved in the call.'''
+ def __get__(self): return self._coverage
+
+ def __str__(self):
+
+ return "\t".join( map(str, (
+ self.tid,
+ self.pos,
+ self.reference_base,
+ self.genotype,
+ self.consensus_quality,
+ self.snp_quality,
+ self.mapping_quality,
+ self.coverage ) ) )
+
+
+cdef class IndexedReads:
+ """*(AlignmentFile samfile, multiple_iterators=True)
+
+ Index a Sam/BAM-file by query name while keeping the
+ original sort order intact.
+
+ The index is kept in memory and can be substantial.
+
+    By default, the file is re-opened to avoid conflicts if multiple
+    operators work on the same file. Set `multiple_iterators` = False
+    to avoid re-opening `samfile`.
+
+ Parameters
+ ----------
+
+ samfile : AlignmentFile
+ File to be indexed.
+
+ multiple_iterators : bool
+        Flag indicating whether the file should be reopened. Reopening
+        prevents existing iterators from being affected by the indexing.
+
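+    Example (a minimal sketch; assumes a read named ``read_001``
+    exists in `samfile`)::
+
+        index = pysam.IndexedReads(samfile)
+        index.build()
+        for read in index.find("read_001"):
+            print(read.query_name)
+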
+ """
+
+ def __init__(self, AlignmentFile samfile, int multiple_iterators=True):
+ cdef char *cfilename
+
+ # makes sure that samfile stays alive as long as this
+ # object is alive.
+ self.samfile = samfile
+
+        assert samfile.is_bam, "can only build an IndexedReads index on BAM files"
+
+        # re-open the file if requested - note that this makes the
+        # iterator slow and causes pileup to slow down significantly.
+ if multiple_iterators:
+ cfilename = samfile.filename
+ with nogil:
+ self.htsfile = hts_open(cfilename, 'r')
+ assert self.htsfile != NULL
+ # read header - required for accurate positioning
+ with nogil:
+ self.header = sam_hdr_read(self.htsfile)
+ self.owns_samfile = True
+ else:
+ self.htsfile = self.samfile.htsfile
+ self.header = self.samfile.header
+ self.owns_samfile = False
+
+ def build(self):
+ '''build the index.'''
+
+ self.index = collections.defaultdict(list)
+
+        # this method will start indexing from the current file position
+ cdef int ret = 1
+ cdef bam1_t * b = <bam1_t*>calloc(1, sizeof( bam1_t))
+
+ cdef uint64_t pos
+
+ while ret > 0:
+ with nogil:
+ pos = bgzf_tell(hts_get_bgzfp(self.htsfile))
+ ret = sam_read1(self.htsfile,
+ self.samfile.header,
+ b)
+ if ret > 0:
+ qname = charptr_to_str(pysam_bam_get_qname(b))
+ self.index[qname].append(pos)
+
+ bam_destroy1(b)
+
+ def find(self, query_name):
+ '''find `query_name` in index.
+
+ Returns
+ -------
+
+ IteratorRowSelection
+ Returns an iterator over all reads with query_name.
+
+ Raises
+ ------
+
+ KeyError
+ if the `query_name` is not in the index.
+
+ '''
+ if query_name in self.index:
+ return IteratorRowSelection(
+ self.samfile,
+ self.index[query_name],
+ multiple_iterators = False)
+ else:
+ raise KeyError("read %s not found" % query_name)
+
+ def __dealloc__(self):
+ if self.owns_samfile:
+ hts_close(self.htsfile)
+ bam_hdr_destroy(self.header)
+
+__all__ = [
+ "AlignmentFile",
+ "IteratorRow",
+ "IteratorColumn",
+ "IndexedReads"]
--- /dev/null
+###############################################################################
+###############################################################################
+## Cython wrapper for htslib VCF/BCF reader/writer
+###############################################################################
+#
+# The MIT License
+#
+# Copyright (c) 2015, 2016 Kevin Jacobs (jacobs@bioinformed.com)
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+#
+###############################################################################
+
+from libc.stdint cimport int8_t, int16_t, int32_t, int64_t
+from libc.stdint cimport uint8_t, uint16_t, uint32_t, uint64_t
+from libc.stdlib cimport malloc, calloc, realloc, free
+from libc.string cimport memcpy, memcmp, memmove, strncpy, strlen, strdup
+
+from pysam.libchtslib cimport *
+
+
+cdef class VariantHeader(object):
+ cdef bcf_hdr_t *ptr
+
+ cpdef VariantRecord new_record(self)
+ cdef _subset_samples(self, include_samples)
+
+
+cdef class VariantHeaderRecord(object):
+ cdef VariantHeader header
+ cdef bcf_hrec_t *ptr
+
+
+cdef class VariantHeaderRecords(object):
+ cdef VariantHeader header
+
+
+cdef class VariantHeaderContigs(object):
+ cdef VariantHeader header
+
+
+cdef class VariantHeaderSamples(object):
+ cdef VariantHeader header
+
+
+cdef class VariantContig(object):
+ cdef VariantHeader header
+ cdef int id
+
+
+cdef class VariantMetadata(object):
+ cdef VariantHeader header
+ cdef int type
+ cdef int id
+
+
+cdef class VariantHeaderMetadata(object):
+ cdef VariantHeader header
+ cdef int32_t type
+
+
+cdef class VariantRecord(object):
+ cdef VariantHeader header
+ cdef bcf1_t *ptr
+
+
+cdef class VariantRecordFilter(object):
+ cdef VariantRecord record
+
+
+cdef class VariantRecordFormat(object):
+ cdef VariantRecord record
+
+
+cdef class VariantRecordInfo(object):
+ cdef VariantRecord record
+
+
+cdef class VariantRecordSamples(object):
+ cdef VariantRecord record
+
+
+cdef class VariantRecordSample(object):
+ cdef VariantRecord record
+ cdef readonly int32_t index
+
+
+cdef class BaseIndex(object):
+ cdef tuple refs
+ cdef dict refmap
+
+
+cdef class BCFIndex(BaseIndex):
+ cdef VariantHeader header
+ cdef hts_idx_t *ptr
+
+
+cdef class TabixIndex(BaseIndex):
+ cdef tbx_t *ptr
+
+
+cdef class BaseIterator(object):
+ cdef VariantFile bcf
+ cdef hts_itr_t *iter
+
+
+cdef class BCFIterator(BaseIterator):
+ cdef BCFIndex index
+
+
+cdef class TabixIterator(BaseIterator):
+ cdef TabixIndex index
+ cdef kstring_t line_buffer
+
+
+cdef class VariantFile(HTSFile):
+ cdef readonly VariantHeader header
+ cdef readonly BaseIndex index
+
+ cdef readonly bint drop_samples # true if sample information is to be ignored
+
+ # FIXME: Temporary, use htsFormat when it is available
+ cdef readonly bint is_reading # true if file has begun reading records
+ cdef readonly bint header_written # true if header has already been written
+
+ cpdef VariantRecord new_record(self)
+
+ cpdef int write(self, VariantRecord record) except -1
--- /dev/null
+# cython: embedsignature=True
+# cython: profile=True
+###############################################################################
+###############################################################################
+## Cython wrapper for htslib VCF/BCF reader/writer
+###############################################################################
+#
+# NOTICE: This code is incomplete and preliminary. It offers a nearly
+# complete Pythonic interface to VCF/BCF metadata and data with
+# reading and writing capability. Documentation and a unit test suite
+# are in the works. The code is best tested under Python 2, but
+# should also work with Python 3. Please report any remaining
+# str/bytes issues on the github site when using Python 3 and I'll
+# fix them promptly.
+#
+# Here is a minimal example of how to use the API:
+#
+# $ cat bcfview.py
+# import sys
+# from pysam import VariantFile
+#
+# bcf_in = VariantFile(sys.argv[1]) # auto-detect input format
+# bcf_out = VariantFile('-', 'w', header=bcf_in.header)
+#
+# for rec in bcf_in:
+# bcf_out.write(rec)
+#
+# Performance is fairly close to that of bcftools view. Here is an example
+# using some 1k Genomes data:
+#
+# $ time python bcfview.py ALL.chr22.phase3_shapeit2_mvncall_integrated_v5.20130502.genotypes.bcf |wc -l
+# 1103799
+#
+# real 0m56.114s
+# user 1m4.489s
+# sys 0m3.102s
+#
+# $ time bcftools view ALL.chr22.phase3_shapeit2_mvncall_integrated_v5.20130502.genotypes.bcf |wc -l
+# 1103800 # bcftools adds an extra header
+#
+# real 0m55.126s
+# user 1m3.502s
+# sys 0m3.459s
+#
+###############################################################################
+#
+# TODO list:
+#
+# * more genotype methods
+# * unit test suite (perhaps py.test based)
+# * documentation
+# * pickle support
+# * left/right locus normalization
+# * fix reopen to re-use fd
+#
+###############################################################################
+#
+# The MIT License
+#
+# Copyright (c) 2015,2016 Kevin Jacobs (jacobs@bioinformed.com)
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+#
+###############################################################################
+
+from __future__ import division, print_function
+
+import os
+import sys
+
+from libc.errno cimport errno, EPIPE
+from libc.string cimport strcmp, strpbrk, strerror
+from libc.stdint cimport INT8_MAX, INT16_MAX, INT32_MAX
+
+cimport cython
+
+from cpython.object cimport PyObject
+from cpython.ref cimport Py_INCREF
+from cpython.dict cimport PyDict_GetItemString, PyDict_SetItemString
+from cpython.tuple cimport PyTuple_New, PyTuple_SET_ITEM
+from cpython.bytes cimport PyBytes_FromStringAndSize
+from cpython.unicode cimport PyUnicode_DecodeASCII
+from cpython.version cimport PY_MAJOR_VERSION
+
+from pysam.libchtslib cimport HTSFile, hisremote
+
+
+from warnings import warn
+
+
+__all__ = ['VariantFile',
+ 'VariantHeader',
+ 'VariantHeaderRecord',
+ 'VariantRecord']
+
+########################################################################
+########################################################################
+## Constants
+########################################################################
+
+cdef int MAX_POS = 2 << 29
+cdef tuple VALUE_TYPES = ('Flag', 'Integer', 'Float', 'String')
+cdef tuple METADATA_TYPES = ('FILTER', 'INFO', 'FORMAT', 'CONTIG', 'STRUCTURED', 'GENERIC')
+cdef tuple METADATA_LENGTHS = ('FIXED', 'VARIABLE', 'A', 'G', 'R')
+
+
+########################################################################
+########################################################################
+## Python 3 compatibility functions
+########################################################################
+
+from pysam.libcutils cimport force_bytes, force_str, charptr_to_str, charptr_to_str_w_len
+from pysam.libcutils cimport encode_filename, from_string_and_size
+
+
+########################################################################
+########################################################################
+## VCF/BCF string intern system
+########################################################################
+
+cdef dict bcf_str_cache = {}
+
+cdef inline bcf_str_cache_get_charptr(const char* s):
+ if s == NULL:
+ return None
+
+ cdef PyObject *pystr = PyDict_GetItemString(bcf_str_cache, s)
+ if pystr:
+ return <object>pystr
+
+ if PY_MAJOR_VERSION < 3:
+ val = s
+ else:
+ val = PyUnicode_DecodeASCII(s, strlen(s), NULL)
+
+ PyDict_SetItemString(bcf_str_cache, s, val)
+
+ return val
+
+
+########################################################################
+########################################################################
+## Low level type conversion helpers
+########################################################################
+
+
+cdef inline bint check_header_id(bcf_hdr_t *hdr, int hl_type, int id):
+ return id >= 0 and id < hdr.n[BCF_DT_ID] and bcf_hdr_idinfo_exists(hdr, hl_type, id)
+
+
+cdef inline int is_gt_fmt(bcf_hdr_t *hdr, int fmt_id):
+ return strcmp(bcf_hdr_int2id(hdr, BCF_DT_ID, fmt_id), "GT") == 0
+
+
+cdef tuple char_array_to_tuple(const char **a, ssize_t n, int free_after=0):
+ if not a:
+ return None
+ try:
+ return tuple(charptr_to_str(a[i]) for i in range(n))
+ finally:
+ if free_after and a:
+ free(a)
+
+
+cdef bcf_array_to_object(void *data, int type, ssize_t n, ssize_t count, int scalar):
+ cdef char *datac
+ cdef int8_t *data8
+ cdef int16_t *data16
+ cdef int32_t *data32
+ cdef float *dataf
+ cdef int i
+
+ if not data or n <= 0:
+ return None
+
+ if type == BCF_BT_CHAR:
+ datac = <char *>data
+ while n and datac[n-1] == bcf_str_vector_end:
+ n -= 1
+ value = charptr_to_str_w_len(datac, n) if datac[0] != bcf_str_missing else None
+ # FIXME: Need to know length? Report errors? Pad with missing values? Not clear what to do.
+
+ value = tuple(v or None for v in value.split(',')) if value else ()
+ else:
+ value = []
+ if type == BCF_BT_INT8:
+ data8 = <int8_t *>data
+ for i in range(n):
+ if data8[i] == bcf_int8_vector_end:
+ break
+ value.append(data8[i] if data8[i] != bcf_int8_missing else None)
+ elif type == BCF_BT_INT16:
+ data16 = <int16_t *>data
+ for i in range(n):
+ if data16[i] == bcf_int16_vector_end:
+ break
+ value.append(data16[i] if data16[i] != bcf_int16_missing else None)
+ elif type == BCF_BT_INT32:
+ data32 = <int32_t *>data
+ for i in range(n):
+ if data32[i] == bcf_int32_vector_end:
+ break
+ value.append(data32[i] if data32[i] != bcf_int32_missing else None)
+ elif type == BCF_BT_FLOAT:
+ dataf = <float *>data
+ for i in range(n):
+ if bcf_float_is_vector_end(dataf[i]):
+ break
+ value.append(dataf[i] if not bcf_float_is_missing(dataf[i]) else None)
+ else:
+ raise TypeError('unsupported info type code')
+
+ # FIXME: Need to know length? Report errors? Pad with missing values? Not clear what to do.
+ if not value:
+ if scalar:
+ value = None
+ elif count <= 0:
+ value = ()
+ else:
+ value = (None,)*count
+ elif scalar and len(value) == 1:
+ value = value[0]
+ else:
+ value = tuple(value)
+
+ return value
+
+
+cdef bcf_object_to_array(values, void *data, int bt_type, ssize_t n, int vlen):
+ cdef char *datac
+    cdef int8_t *datai8
+    cdef int16_t *datai16
+    cdef int32_t *datai32
+ cdef float *dataf
+ cdef ssize_t i, value_count = len(values)
+
+ assert(value_count <= n)
+
+ if bt_type == BCF_BT_CHAR:
+ if not isinstance(values, (str, bytes)):
+ values = b','.join(force_bytes(v) if v is not None else b'' for v in values)
+ value_count = len(values)
+ assert(value_count <= n)
+ datac = <char *>data
+ memcpy(datac, <char *>values, value_count)
+ for i in range(value_count, n):
+ datac[i] = 0
+ elif bt_type == BCF_BT_INT8:
+ datai8 = <int8_t *>data
+ for i in range(value_count):
+ val = values[i]
+ datai8[i] = val if val is not None else bcf_int8_missing
+ for i in range(value_count, n):
+ datai8[i] = bcf_int8_vector_end
+ elif bt_type == BCF_BT_INT16:
+ datai16 = <int16_t *>data
+ for i in range(value_count):
+ val = values[i]
+ datai16[i] = val if val is not None else bcf_int16_missing
+ for i in range(value_count, n):
+ datai16[i] = bcf_int16_vector_end
+ elif bt_type == BCF_BT_INT32:
+ datai32 = <int32_t *>data
+ for i in range(value_count):
+ val = values[i]
+ datai32[i] = val if val is not None else bcf_int32_missing
+ for i in range(value_count, n):
+ datai32[i] = bcf_int32_vector_end
+ elif bt_type == BCF_BT_FLOAT:
+ dataf = <float *>data
+ for i in range(value_count):
+ val = values[i]
+ if val is None:
+ bcf_float_set(dataf + i, bcf_float_missing)
+ else:
+ dataf[i] = val
+ for i in range(value_count, n):
+ bcf_float_set(dataf + i, bcf_float_vector_end)
+ else:
+ raise TypeError('unsupported type')
+
+
+cdef bcf_empty_array(int type, ssize_t n, int vlen):
+ cdef char *datac
+ cdef int32_t *data32
+ cdef float *dataf
+ cdef int i
+
+ if n <= 0:
+ raise ValueError('Cannot create empty array')
+
+ if type == BCF_HT_STR:
+ value = PyBytes_FromStringAndSize(NULL, sizeof(char)*n)
+ datac = <char *>value
+ for i in range(n):
+ datac[i] = bcf_str_missing if not vlen else bcf_str_vector_end
+ elif type == BCF_HT_INT:
+ value = PyBytes_FromStringAndSize(NULL, sizeof(int32_t)*n)
+ data32 = <int32_t *><char *>value
+ for i in range(n):
+ data32[i] = bcf_int32_missing if not vlen else bcf_int32_vector_end
+ elif type == BCF_HT_REAL:
+ value = PyBytes_FromStringAndSize(NULL, sizeof(float)*n)
+ dataf = <float *><char *>value
+ for i in range(n):
+ bcf_float_set(dataf + i, bcf_float_missing if not vlen else bcf_float_vector_end)
+ else:
+ raise TypeError('unsupported header type code')
+
+ return value
+
+
+cdef bcf_copy_expand_array(void *src_data, int src_type, ssize_t src_values,
+ void *dst_data, int dst_type, ssize_t dst_values,
+ int vlen):
+ cdef char *src_datac
+ cdef char *dst_datac
+ cdef int8_t *src_datai8
+ cdef int16_t *src_datai16
+ cdef int32_t *src_datai32
+ cdef int32_t *dst_datai
+ cdef float *src_dataf
+ cdef float *dst_dataf
+ cdef ssize_t src_size, dst_size, i, j
+ cdef int val
+
+ if src_values > dst_values:
+ raise ValueError('Cannot copy arrays with src_values={} > dst_values={}'.format(src_values, dst_values))
+
+ if src_type == dst_type == BCF_BT_CHAR:
+ src_datac = <char *>src_data
+ dst_datac = <char *>dst_data
+        memcpy(dst_datac, src_datac, src_values)
+ for i in range(src_values, dst_values):
+ dst_datac[i] = 0
+ elif src_type == BCF_BT_INT8 and dst_type == BCF_BT_INT32:
+ src_datai8 = <int8_t *>src_data
+ dst_datai = <int32_t *>dst_data
+ for i in range(src_values):
+ val = src_datai8[i]
+ if val == bcf_int8_missing:
+ val = bcf_int32_missing
+ elif val == bcf_int8_vector_end:
+ val = bcf_int32_vector_end
+ dst_datai[i] = val
+ for i in range(src_values, dst_values):
+ dst_datai[i] = bcf_int32_missing if not vlen else bcf_int32_vector_end
+ elif src_type == BCF_BT_INT16 and dst_type == BCF_BT_INT32:
+ src_datai16 = <int16_t *>src_data
+ dst_datai = <int32_t *>dst_data
+ for i in range(src_values):
+ val = src_datai16[i]
+ if val == bcf_int16_missing:
+ val = bcf_int32_missing
+ elif val == bcf_int16_vector_end:
+ val = bcf_int32_vector_end
+ dst_datai[i] = val
+ for i in range(src_values, dst_values):
+ dst_datai[i] = bcf_int32_missing if not vlen else bcf_int32_vector_end
+ elif src_type == BCF_BT_INT32 and dst_type == BCF_BT_INT32:
+ src_datai32 = <int32_t *>src_data
+ dst_datai = <int32_t *>dst_data
+ for i in range(src_values):
+ dst_datai[i] = src_datai32[i]
+ for i in range(src_values, dst_values):
+ dst_datai[i] = bcf_int32_missing if not vlen else bcf_int32_vector_end
+ elif src_type == BCF_BT_FLOAT and dst_type == BCF_BT_FLOAT:
+ src_dataf = <float *>src_data
+ dst_dataf = <float *>dst_data
+ for i in range(src_values):
+ dst_dataf[i] = src_dataf[i]
+ for i in range(src_values, dst_values):
+ bcf_float_set(dst_dataf + i, bcf_float_missing if not vlen else bcf_float_vector_end)
+ else:
+ raise TypeError('unsupported types')
+
+
+cdef bcf_get_value_count(VariantRecord record, int hl_type, int id, ssize_t *count, int *scalar):
+ if record is None:
+ raise ValueError('record must not be None')
+
+ cdef bcf_hdr_t *hdr = record.header.ptr
+ cdef bcf1_t *r = record.ptr
+
+ if not check_header_id(hdr, hl_type, id):
+ raise ValueError('Invalid header')
+
+ cdef int length = bcf_hdr_id2length(hdr, hl_type, id)
+ cdef int number = bcf_hdr_id2number(hdr, hl_type, id)
+
+ scalar[0] = 0
+
+ if hl_type == BCF_HL_FMT and is_gt_fmt(hdr, id):
+ count[0] = number
+ elif length == BCF_VL_FIXED:
+ if number == 1:
+ scalar[0] = 1
+ count[0] = number
+ elif length == BCF_VL_R:
+ count[0] = r.n_allele
+ elif length == BCF_VL_A:
+ count[0] = r.n_allele - 1
+ elif length == BCF_VL_G:
+ count[0] = r.n_allele * (r.n_allele + 1) // 2
+ elif length == BCF_VL_VAR:
+ count[0] = -1
+ else:
+ raise ValueError('Unknown format length')
+
+
+cdef object bcf_info_get_value(VariantRecord record, const bcf_info_t *z):
+ if record is None:
+ raise ValueError('record must not be None')
+
+ cdef bcf_hdr_t *hdr = record.header.ptr
+
+ cdef char *s
+ cdef ssize_t count
+ cdef int scalar
+
+ bcf_get_value_count(record, BCF_HL_INFO, z.key, &count, &scalar)
+
+ if z.len == 0:
+ if bcf_hdr_id2type(hdr, BCF_HL_INFO, z.key) == BCF_HT_FLAG:
+ value = True
+ elif scalar:
+ value = None
+ else:
+ value = ()
+ elif z.len == 1:
+ if z.type == BCF_BT_INT8:
+ value = z.v1.i if z.v1.i != bcf_int8_missing else None
+ elif z.type == BCF_BT_INT16:
+ value = z.v1.i if z.v1.i != bcf_int16_missing else None
+ elif z.type == BCF_BT_INT32:
+ value = z.v1.i if z.v1.i != bcf_int32_missing else None
+ elif z.type == BCF_BT_FLOAT:
+ value = z.v1.f if not bcf_float_is_missing(z.v1.f) else None
+ elif z.type == BCF_BT_CHAR:
+ value = force_str(chr(z.v1.i))
+ else:
+ raise TypeError('unsupported info type code')
+
+ if not scalar and value != ():
+ value = (value,)
+ else:
+ value = bcf_array_to_object(z.vptr, z.type, z.len, count, scalar)
+
+ return value
+
+
+cdef object bcf_check_values(VariantRecord record, value, int hl_type, int ht_type,
+ int id, int bt_type, ssize_t bt_len,
+ ssize_t *value_count, int *scalar, int *realloc):
+
+ if record is None:
+ raise ValueError('record must not be None')
+
+ bcf_get_value_count(record, hl_type, id, value_count, scalar)
+
+ values = (value,) if not isinstance(value, (list, tuple)) else value
+
+ # Validate values now that we know the type and size
+ if ht_type == BCF_HT_FLAG:
+ value_count[0] = 1
+ elif hl_type == BCF_HL_FMT and is_gt_fmt(record.header.ptr, id):
+ # KBJ: htslib lies about the cardinality of GT fields-- they're really VLEN (-1)
+ value_count[0] = -1
+
+ if value_count[0] != -1 and value_count[0] != len(values):
+ if scalar[0]:
+            raise TypeError('value expected to be scalar')
+ else:
+ raise TypeError('values expected to be {:d}-tuple'.format(value_count[0]))
+
+ if ht_type == BCF_HT_REAL:
+ for v in values:
+ if not(v is None or isinstance(v, (float, int))):
+ raise TypeError('invalid value for Float format')
+ elif ht_type == BCF_HT_INT:
+ for v in values:
+ if not(v is None or (isinstance(v, (float, int)) and int(v) == v)):
+ raise TypeError('invalid value for Integer format')
+ for v in values:
+ if not(v is None or bcf_int32_missing < v <= INT32_MAX):
+ raise ValueError('Integer value too small/large to store in VCF/BCF')
+ elif ht_type == BCF_HT_STR:
+ values = b','.join(force_bytes(v) if v is not None else b'' for v in values)
+ elif ht_type == BCF_HT_FLAG:
+ if values[0] not in (True, False, None, 1, 0):
+ raise ValueError('Flag values must be: True, False, None, 1, 0')
+ else:
+ raise TypeError('unsupported type')
+
+ realloc[0] = 0
+ if len(values) <= 1 and hl_type == BCF_HL_INFO:
+ realloc[0] = 0
+ elif len(values) > bt_len:
+ realloc[0] = 1
+ elif bt_type == BCF_BT_INT8:
+ for v in values:
+ if v is not None and not(bcf_int8_missing < v <= INT8_MAX):
+ realloc[0] = 1
+ break
+ elif bt_type == BCF_BT_INT16:
+ for v in values:
+ if v is not None and not(bcf_int16_missing < v <= INT16_MAX):
+ realloc[0] = 1
+ break
+
+ return values
+
+
+cdef bcf_encode_alleles(VariantRecord record, values):
+ if record is None:
+ raise ValueError('record must not be None')
+
+ cdef bcf1_t *r = record.ptr
+ cdef int32_t nalleles = r.n_allele
+ cdef list gt_values = []
+ cdef char *s
+ cdef int i
+
+ if values is None:
+ return ()
+
+ if not isinstance(values, (list, tuple)):
+ values = (values,)
+
+ for value in values:
+ if value is None:
+ gt_values.append(bcf_gt_missing)
+ elif isinstance(value, (str, bytes)):
+ bvalue = force_bytes(value)
+ s = bvalue
+ for i in range(r.n_allele):
+                if strcmp(r.d.allele[i], s) == 0:
+ gt_values.append(bcf_gt_unphased(i))
+ break
+ else:
+ raise ValueError('Unknown allele')
+ else:
+ i = value
+ if not (0 <= i < nalleles):
+ raise ValueError('Invalid allele index')
+ gt_values.append(bcf_gt_unphased(i))
+
+ return gt_values
+
+
+cdef bcf_info_set_value(VariantRecord record, key, value):
+ if record is None:
+ raise ValueError('record must not be None')
+
+ cdef bcf_hdr_t *hdr = record.header.ptr
+ cdef bcf1_t *r = record.ptr
+ cdef vdict_t *d
+ cdef khiter_t k
+ cdef int info_id, info_type, scalar, dst_type, realloc, vlen = 0
+ cdef ssize_t i, value_count, alloc_len, alloc_size, dst_size
+
+ if bcf_unpack(r, BCF_UN_INFO) < 0:
+ raise ValueError('Error unpacking VariantRecord')
+
+ bkey = force_bytes(key)
+ cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey)
+
+ if info:
+ info_id = info.key
+ else:
+ d = <vdict_t *>hdr.dict[BCF_DT_ID]
+ k = kh_get_vdict(d, bkey)
+
+ if k == kh_end(d) or kh_val_vdict(d, k).info[BCF_HL_INFO] & 0xF == 0xF:
+ raise KeyError('unknown INFO')
+
+ info_id = kh_val_vdict(d, k).id
+
+ if not check_header_id(hdr, BCF_HL_INFO, info_id):
+ raise ValueError('Invalid header')
+
+ info_type = bcf_hdr_id2type(hdr, BCF_HL_INFO, info_id)
+ values = bcf_check_values(record, value, BCF_HL_INFO, info_type, info_id,
+ info.type if info else -1,
+ info.len if info else -1,
+ &value_count, &scalar, &realloc)
+
+ if info_type == BCF_HT_FLAG:
+ if bcf_update_info(hdr, r, bkey, NULL, bool(values[0]), info_type) < 0:
+ raise ValueError('Unable to update INFO values')
+ return
+
+ vlen = value_count < 0
+ value_count = len(values)
+
+ # If we can, write updated values to existing allocated storage
+ if info and not realloc:
+ r.d.shared_dirty |= BCF1_DIRTY_INF
+
+ if value_count == 0:
+ info.len = 0
+ # FIXME: Check if need to free vptr if info.len > 0?
+ elif value_count == 1:
+ # FIXME: Check if need to free vptr if info.len > 0?
+ if info.type == BCF_BT_INT8 or info.type == BCF_BT_INT16 or info.type == BCF_BT_INT32:
+ bcf_object_to_array(values, &info.v1.i, BCF_BT_INT32, 1, vlen)
+ elif info.type == BCF_BT_FLOAT:
+ bcf_object_to_array(values, &info.v1.f, BCF_BT_FLOAT, 1, vlen)
+ else:
+ raise TypeError('unsupported info type code')
+ info.len = 1
+ else:
+ bcf_object_to_array(values, info.vptr, info.type, info.len, vlen)
+ return
+
+ alloc_len = max(1, value_count)
+ if info and info.len > alloc_len:
+ alloc_len = info.len
+
+ new_values = bcf_empty_array(info_type, alloc_len, vlen)
+ cdef char *valp = <char *>new_values
+
+ if info_type == BCF_HT_INT:
+ dst_type = BCF_BT_INT32
+ elif info_type == BCF_HT_REAL:
+ dst_type = BCF_BT_FLOAT
+ elif info_type == BCF_HT_STR:
+ dst_type = BCF_BT_CHAR
+ else:
+ raise ValueError('Unsupported INFO type')
+
+ bcf_object_to_array(values, valp, dst_type, alloc_len, vlen)
+
+ if bcf_update_info(hdr, r, bkey, valp, <int>alloc_len, info_type) < 0:
+ raise ValueError('Unable to update INFO values')
+
+
+cdef bcf_info_del_value(VariantRecord record, key):
+ if record is None:
+ raise ValueError('record must not be None')
+
+ cdef bcf_hdr_t *hdr = record.header.ptr
+ cdef bcf1_t *r = record.ptr
+ cdef ssize_t value_count
+ cdef int scalar
+
+ if bcf_unpack(r, BCF_UN_INFO) < 0:
+ raise ValueError('Error unpacking VariantRecord')
+
+ bkey = force_bytes(key)
+ cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey)
+
+ if not info:
+ raise KeyError(key)
+
+ bcf_get_value_count(record, BCF_HL_INFO, info.key, &value_count, &scalar)
+
+ if value_count <= 0:
+ null_value = ()
+ elif scalar:
+ null_value = None
+ else:
+ null_value = (None,)*value_count
+
+ bcf_info_set_value(record, bkey, null_value)
+
+
+cdef bcf_format_get_value(VariantRecordSample sample, key):
+ if sample is None:
+ raise ValueError('sample must not be None')
+
+ cdef bcf_hdr_t *hdr = sample.record.header.ptr
+ cdef bcf1_t *r = sample.record.ptr
+ cdef ssize_t count
+ cdef int scalar
+
+ if bcf_unpack(r, BCF_UN_ALL) < 0:
+ raise ValueError('Error unpacking VariantRecord')
+
+ bkey = force_bytes(key)
+ cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey)
+
+ if not fmt or not fmt.p:
+ raise KeyError('invalid FORMAT')
+
+ if is_gt_fmt(hdr, fmt.id):
+ return bcf_format_get_allele_indices(sample)
+
+ bcf_get_value_count(sample.record, BCF_HL_FMT, fmt.id, &count, &scalar)
+
+ if fmt.p and fmt.n and fmt.size:
+ return bcf_array_to_object(fmt.p + sample.index * fmt.size, fmt.type, fmt.n, count, scalar)
+ elif scalar:
+ return None
+ elif count <= 0:
+ return ()
+ else:
+ return (None,)*count
+
+
+cdef bcf_format_set_value(VariantRecordSample sample, key, value):
+ if sample is None:
+ raise ValueError('sample must not be None')
+
+ cdef bcf_hdr_t *hdr = sample.record.header.ptr
+ cdef bcf1_t *r = sample.record.ptr
+ cdef int fmt_id
+ cdef vdict_t *d
+ cdef khiter_t k
+ cdef int fmt_type, scalar, realloc, dst_type, vlen = 0
+ cdef ssize_t i, n, value_count, alloc_size, alloc_len, dst_size
+
+ if bcf_unpack(r, BCF_UN_ALL) < 0:
+ raise ValueError('Error unpacking VariantRecord')
+
+ bkey = force_bytes(key)
+ cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey)
+
+ if fmt:
+ fmt_id = fmt.id
+ else:
+ d = <vdict_t *>hdr.dict[BCF_DT_ID]
+ k = kh_get_vdict(d, bkey)
+
+ if k == kh_end(d) or kh_val_vdict(d, k).info[BCF_HL_FMT] & 0xF == 0xF:
+ raise KeyError('unknown format')
+
+ fmt_id = kh_val_vdict(d, k).id
+
+ if not check_header_id(hdr, BCF_HL_FMT, fmt_id):
+ raise ValueError('Invalid header')
+
+ fmt_type = bcf_hdr_id2type(hdr, BCF_HL_FMT, fmt_id)
+
+ if fmt_type == BCF_HT_FLAG:
+ raise ValueError('Flag types are not allowed on FORMATs')
+
+ if is_gt_fmt(hdr, fmt_id):
+ value = bcf_encode_alleles(sample.record, value)
+ # KBJ: GT field is considered to be a string by the VCF header but BCF represents it as INT.
+ fmt_type = BCF_HT_INT
+
+ values = bcf_check_values(sample.record, value, BCF_HL_FMT, fmt_type, fmt_id,
+ fmt.type if fmt else -1,
+ fmt.n if fmt else -1,
+ &value_count, &scalar, &realloc)
+
+ vlen = value_count < 0
+ value_count = len(values)
+
+ # If we can, write updated values to existing allocated storage
+ if fmt and not realloc:
+ r.d.indiv_dirty = 1
+ bcf_object_to_array(values, fmt.p + sample.index * fmt.size, fmt.type, fmt.n, vlen)
+ return
+
+ alloc_len = max(1, value_count)
+ if fmt and fmt.n > alloc_len:
+ alloc_len = fmt.n
+
+ n = bcf_hdr_nsamples(hdr)
+ new_values = bcf_empty_array(fmt_type, n*alloc_len, vlen)
+ cdef char *valp = <char *>new_values
+
+ if fmt_type == BCF_HT_INT:
+ dst_type = BCF_BT_INT32
+ dst_size = sizeof(int32_t) * alloc_len
+ elif fmt_type == BCF_HT_REAL:
+ dst_type = BCF_BT_FLOAT
+ dst_size = sizeof(float) * alloc_len
+ elif fmt_type == BCF_HT_STR:
+ dst_type = BCF_BT_CHAR
+ dst_size = sizeof(char) * alloc_len
+ else:
+ raise ValueError('Unsupported FORMAT type')
+
+ if fmt and n > 1:
+ for i in range(n):
+ bcf_copy_expand_array(fmt.p + i*fmt.size, fmt.type, fmt.n,
+ valp + i*dst_size, dst_type, alloc_len,
+ vlen)
+
+ bcf_object_to_array(values, valp + sample.index*dst_size, dst_type, alloc_len, vlen)
+
+ if bcf_update_format(hdr, r, bkey, valp, <int>(n*alloc_len), fmt_type) < 0:
+ raise ValueError('Unable to update format values')
+
+
+cdef bcf_format_del_value(VariantRecordSample sample, key):
+ if sample is None:
+ raise ValueError('sample must not be None')
+
+ cdef bcf_hdr_t *hdr = sample.record.header.ptr
+ cdef bcf1_t *r = sample.record.ptr
+ cdef ssize_t value_count
+ cdef int scalar
+
+ if bcf_unpack(r, BCF_UN_ALL) < 0:
+ raise ValueError('Error unpacking VariantRecord')
+
+ bkey = force_bytes(key)
+ cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey)
+
+ if not fmt or not fmt.p:
+ raise KeyError(key)
+
+ bcf_get_value_count(sample.record, BCF_HL_FMT, fmt.id, &value_count, &scalar)
+
+ if value_count <= 0:
+ null_value = ()
+ elif scalar:
+ null_value = None
+ else:
+ null_value = (None,)*value_count
+
+ bcf_format_set_value(sample, bkey, null_value)
+
+
+cdef bcf_format_get_allele_indices(VariantRecordSample sample):
+ if sample is None:
+ raise ValueError('sample must not be None')
+
+ cdef bcf_hdr_t *hdr = sample.record.header.ptr
+ cdef bcf1_t *r = sample.record.ptr
+ cdef int32_t n = bcf_hdr_nsamples(hdr)
+
+ if bcf_unpack(r, BCF_UN_ALL) < 0:
+ raise ValueError('Error unpacking VariantRecord')
+
+ if sample.index < 0 or sample.index >= n or not r.n_fmt:
+ return ()
+
+ cdef bcf_fmt_t *fmt0 = r.d.fmt
+ cdef int gt0 = is_gt_fmt(hdr, fmt0.id)
+
+ if not gt0 or not fmt0.n:
+ return ()
+
+ cdef int8_t *data8
+ cdef int16_t *data16
+ cdef int32_t *data32
+ cdef int32_t a, nalleles = r.n_allele
+ cdef list alleles = []
+
+ if fmt0.type == BCF_BT_INT8:
+ data8 = <int8_t *>(fmt0.p + sample.index * fmt0.size)
+ for i in range(fmt0.n):
+ if data8[i] == bcf_int8_vector_end:
+ break
+ elif data8[i] == bcf_gt_missing:
+ a = -1
+ else:
+ a = bcf_gt_allele(data8[i])
+ alleles.append(a if 0 <= a < nalleles else None)
+ elif fmt0.type == BCF_BT_INT16:
+ data16 = <int16_t *>(fmt0.p + sample.index * fmt0.size)
+ for i in range(fmt0.n):
+ if data16[i] == bcf_int16_vector_end:
+ break
+ elif data16[i] == bcf_gt_missing:
+ a = -1
+ else:
+ a = bcf_gt_allele(data16[i])
+ alleles.append(a if 0 <= a < nalleles else None)
+ elif fmt0.type == BCF_BT_INT32:
+ data32 = <int32_t *>(fmt0.p + sample.index * fmt0.size)
+ for i in range(fmt0.n):
+ if data32[i] == bcf_int32_vector_end:
+ break
+ elif data32[i] == bcf_gt_missing:
+ a = -1
+ else:
+ a = bcf_gt_allele(data32[i])
+ alleles.append(a if 0 <= a < nalleles else None)
+
+ return tuple(alleles)
+
+
+cdef bcf_format_get_alleles(VariantRecordSample sample):
+ if sample is None:
+ raise ValueError('sample must not be None')
+
+ cdef bcf_hdr_t *hdr = sample.record.header.ptr
+ cdef bcf1_t *r = sample.record.ptr
+ cdef int32_t nsamples = bcf_hdr_nsamples(hdr)
+
+ if bcf_unpack(r, BCF_UN_ALL) < 0:
+ raise ValueError('Error unpacking VariantRecord')
+
+ cdef int32_t nalleles = r.n_allele
+
+ if sample.index < 0 or sample.index >= nsamples or not r.n_fmt:
+ return ()
+
+ cdef bcf_fmt_t *fmt0 = r.d.fmt
+ cdef int gt0 = is_gt_fmt(hdr, fmt0.id)
+
+ if not gt0 or not fmt0.n:
+ return ()
+
+ cdef int32_t a
+ cdef int8_t *data8
+ cdef int16_t *data16
+ cdef int32_t *data32
+ alleles = []
+ if fmt0.type == BCF_BT_INT8:
+ data8 = <int8_t *>(fmt0.p + sample.index * fmt0.size)
+ for i in range(fmt0.n):
+ if data8[i] == bcf_int8_vector_end:
+ break
+ a = bcf_gt_allele(data8[i])
+ alleles.append(charptr_to_str(r.d.allele[a]) if 0 <= a < nalleles else None)
+ elif fmt0.type == BCF_BT_INT16:
+ data16 = <int16_t *>(fmt0.p + sample.index * fmt0.size)
+ for i in range(fmt0.n):
+ if data16[i] == bcf_int16_vector_end:
+ break
+ a = bcf_gt_allele(data16[i])
+ alleles.append(charptr_to_str(r.d.allele[a]) if 0 <= a < nalleles else None)
+ elif fmt0.type == BCF_BT_INT32:
+ data32 = <int32_t *>(fmt0.p + sample.index * fmt0.size)
+ for i in range(fmt0.n):
+ if data32[i] == bcf_int32_vector_end:
+ break
+ a = bcf_gt_allele(data32[i])
+ alleles.append(charptr_to_str(r.d.allele[a]) if 0 <= a < nalleles else None)
+ return tuple(alleles)
+
+
+cdef bint bcf_sample_get_phased(VariantRecordSample sample):
+ if sample is None:
+ raise ValueError('sample must not be None')
+
+ cdef bcf_hdr_t *hdr = sample.record.header.ptr
+ cdef bcf1_t *r = sample.record.ptr
+ cdef int32_t n = bcf_hdr_nsamples(hdr)
+
+ if bcf_unpack(r, BCF_UN_ALL) < 0:
+ raise ValueError('Error unpacking VariantRecord')
+
+ if sample.index < 0 or sample.index >= n or not r.n_fmt:
+ return False
+
+ cdef bcf_fmt_t *fmt0 = r.d.fmt
+ cdef int gt0 = is_gt_fmt(hdr, fmt0.id)
+
+ if not gt0 or not fmt0.n:
+ return False
+
+ cdef int8_t *data8
+ cdef int16_t *data16
+ cdef int32_t *data32
+
+ cdef bint phased = False
+
+ if fmt0.type == BCF_BT_INT8:
+ data8 = <int8_t *>(fmt0.p + sample.index * fmt0.size)
+ for i in range(fmt0.n):
+ if data8[i] == bcf_int8_vector_end:
+ break
+ elif data8[i] == bcf_int8_missing:
+ continue
+ elif i and not bcf_gt_is_phased(data8[i]):
+ return False
+ else:
+ phased = True
+ elif fmt0.type == BCF_BT_INT16:
+ data16 = <int16_t *>(fmt0.p + sample.index * fmt0.size)
+ for i in range(fmt0.n):
+ if data16[i] == bcf_int16_vector_end:
+ break
+ elif data16[i] == bcf_int16_missing:
+ continue
+ elif i and not bcf_gt_is_phased(data16[i]):
+ return False
+ else:
+ phased = True
+ elif fmt0.type == BCF_BT_INT32:
+ data32 = <int32_t *>(fmt0.p + sample.index * fmt0.size)
+ for i in range(fmt0.n):
+ if data32[i] == bcf_int32_vector_end:
+ break
+ elif data32[i] == bcf_int32_missing:
+ continue
+ elif i and not bcf_gt_is_phased(data32[i]):
+ return False
+ else:
+ phased = True
+
+ return phased
+
+
+cdef bcf_sample_set_phased(VariantRecordSample sample, bint phased):
+ if sample is None:
+ raise ValueError('sample must not be None')
+
+ cdef bcf_hdr_t *hdr = sample.record.header.ptr
+ cdef bcf1_t *r = sample.record.ptr
+ cdef int32_t n = bcf_hdr_nsamples(hdr)
+
+ if bcf_unpack(r, BCF_UN_ALL) < 0:
+ raise ValueError('Error unpacking VariantRecord')
+
+ if sample.index < 0 or sample.index >= n or not r.n_fmt:
+ return
+
+ cdef bcf_fmt_t *fmt0 = r.d.fmt
+ cdef int gt0 = is_gt_fmt(hdr, fmt0.id)
+
+ if not gt0 or not fmt0.n:
+ raise ValueError('Cannot set phased before genotype is set')
+
+ cdef int8_t *data8
+ cdef int16_t *data16
+ cdef int32_t *data32
+
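+ # Note: genotype values are packed as ((allele_index + 1) << 1) | phased,
+ # so masking off bit 0 below and OR-ing `phased` back in updates the
+ # phase flag without disturbing the stored allele index.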
+ if fmt0.type == BCF_BT_INT8:
+ data8 = <int8_t *>(fmt0.p + sample.index * fmt0.size)
+ for i in range(fmt0.n):
+ if data8[i] == bcf_int8_vector_end:
+ break
+ elif data8[i] == bcf_int8_missing:
+ continue
+ elif i:
+ data8[i] = (data8[i] & 0xFE) | phased
+ elif fmt0.type == BCF_BT_INT16:
+ data16 = <int16_t *>(fmt0.p + sample.index * fmt0.size)
+ for i in range(fmt0.n):
+ if data16[i] == bcf_int16_vector_end:
+ break
+ elif data16[i] == bcf_int16_missing:
+ continue
+ elif i:
+ data16[i] = (data16[i] & 0xFFFE) | phased
+ elif fmt0.type == BCF_BT_INT32:
+ data32 = <int32_t *>(fmt0.p + sample.index * fmt0.size)
+ for i in range(fmt0.n):
+ if data32[i] == bcf_int32_vector_end:
+ break
+ elif data32[i] == bcf_int32_missing:
+ continue
+ elif i:
+ data32[i] = (data32[i] & 0xFFFFFFFE) | phased
+
+
+########################################################################
+########################################################################
+## Variant Header objects
+########################################################################
+
+
+cdef bcf_header_remove_hrec(VariantHeader header, int i):
+ if header is None:
+ raise ValueError('header must not be None')
+
+ cdef bcf_hdr_t *hdr = header.ptr
+
+ if i < 0 or i >= hdr.nhrec:
+ raise ValueError('Invalid header record index')
+
+ cdef bcf_hrec_t *hrec = hdr.hrec[i]
+ hdr.nhrec -= 1
+
+ if i < hdr.nhrec:
+ memmove(&hdr.hrec[i], &hdr.hrec[i+1], (hdr.nhrec-i)*sizeof(bcf_hrec_t*))
+
+ bcf_hrec_destroy(hrec)
+ hdr.hrec[hdr.nhrec] = NULL
+ hdr.dirty = 1
+
+
+#FIXME: implement a full mapping interface
+#FIXME: passing bcf_hrec_t* is not safe, since we cannot control the
+# object lifetime.
+cdef class VariantHeaderRecord(object):
+ """header record from a :class:`VariantHeader` object"""
+ def __init__(self, *args, **kwargs):
+ raise TypeError('this class cannot be instantiated from Python')
+
+ @property
+ def type(self):
+ """header type: FILTER, INFO, FORMAT, CONTIG, STRUCTURED, or GENERIC"""
+ cdef bcf_hrec_t *r = self.ptr
+ if not r:
+ return None
+ return METADATA_TYPES[r.type]
+
+ @property
+ def key(self):
+ """header key (the part before '=', in FILTER/INFO/FORMAT/contig/fileformat etc.)"""
+ cdef bcf_hrec_t *r = self.ptr
+ return bcf_str_cache_get_charptr(r.key) if r and r.key else None
+
+ @property
+ def value(self):
+ """header value. Set only for generic lines, None for FILTER/INFO, etc."""
+ cdef bcf_hrec_t *r = self.ptr
+ return charptr_to_str(r.value) if r and r.value else None
+
+ @property
+ def attrs(self):
+ """sequence of additional header attributes"""
+ cdef bcf_hrec_t *r = self.ptr
+ if not r:
+ return ()
+ cdef int i
+ return tuple((bcf_str_cache_get_charptr(r.keys[i]) if r.keys[i] else None,
+ charptr_to_str(r.vals[i]) if r.vals[i] else None)
+ for i in range(r.nkeys))
+
+ def __len__(self):
+ cdef bcf_hrec_t *r = self.ptr
+ return r.nkeys if r else 0
+
+ def __bool__(self):
+ cdef bcf_hrec_t *r = self.ptr
+ return r != NULL and r.nkeys != 0
+
+ def __getitem__(self, key):
+ """get attribute value"""
+ cdef bcf_hrec_t *r = self.ptr
+ cdef int i
+ if r:
+ bkey = force_bytes(key)
+ for i in range(r.nkeys):
+ if r.keys[i] and r.keys[i] == bkey:
+ return charptr_to_str(r.vals[i]) if r.vals[i] else None
+ raise KeyError('cannot find metadata key')
+
+ def __iter__(self):
+ cdef bcf_hrec_t *r = self.ptr
+ if not r:
+ return
+ cdef int i
+ for i in range(r.nkeys):
+ if r.keys[i]:
+ yield bcf_str_cache_get_charptr(r.keys[i])
+
+ def get(self, key, default=None):
+ """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
+ try:
+ return self[key]
+ except KeyError:
+ return default
+
+ def __contains__(self, key):
+ try:
+ self[key]
+ except KeyError:
+ return False
+ else:
+ return True
+
+ def iterkeys(self):
+ """D.iterkeys() -> an iterator over the keys of D"""
+ return iter(self)
+
+ def itervalues(self):
+ """D.itervalues() -> an iterator over the values of D"""
+ cdef bcf_hrec_t *r = self.ptr
+ if not r:
+ return
+ cdef int i
+ for i in range(r.nkeys):
+ if r.keys[i]:
+ yield charptr_to_str(r.vals[i]) if r.vals[i] else None
+
+ def iteritems(self):
+ """D.iteritems() -> an iterator over the (key, value) items of D"""
+ cdef bcf_hrec_t *r = self.ptr
+ if not r:
+ return
+ cdef int i
+ for i in range(r.nkeys):
+ if r.keys[i]:
+ yield (bcf_str_cache_get_charptr(r.keys[i]), charptr_to_str(r.vals[i]) if r.vals[i] else None)
+
+ def keys(self):
+ """D.keys() -> list of D's keys"""
+ return list(self)
+
+ def items(self):
+ """D.items() -> list of D's (key, value) pairs, as 2-tuples"""
+ return list(self.iteritems())
+
+ def values(self):
+ """D.values() -> list of D's values"""
+ return list(self.itervalues())
+
+ # Mappings are not hashable by default, but subclasses can change this
+ __hash__ = None
+
+ #TODO: implement __richcmp__
+
+ def __str__(self):
+ cdef bcf_hrec_t *r = self.ptr
+
+ if not r:
+ raise ValueError('cannot convert deleted record to str')
+
+ cdef kstring_t hrec_str
+ hrec_str.l = hrec_str.m = 0
+ hrec_str.s = NULL
+
+ bcf_hrec_format(r, &hrec_str)
+
+ ret = charptr_to_str_w_len(hrec_str.s, hrec_str.l)
+
+ if hrec_str.m:
+ free(hrec_str.s)
+
+ return ret
+
+ # FIXME: Not safe -- causes trivial segfaults at the moment
+ def remove(self):
+ cdef bcf_hdr_t *hdr = self.header.ptr
+ cdef bcf_hrec_t *r = self.ptr
+ if not r:
+ return
+ assert(r.key)
+ cdef char *key = r.key if r.type == BCF_HL_GEN else r.value
+ print('Removing header type={} key={} value={} hdr={}'.format(METADATA_TYPES[r.type], r.key, r.value, key))
+ bcf_hdr_remove(hdr, r.type, key)
+ self.ptr = NULL
+
+
+cdef VariantHeaderRecord makeVariantHeaderRecord(VariantHeader header, bcf_hrec_t *hdr):
+ if not header:
+ raise ValueError('invalid VariantHeader')
+
+ if not hdr:
+ return None
+
+ cdef VariantHeaderRecord record = VariantHeaderRecord.__new__(VariantHeaderRecord)
+ record.header = header
+ record.ptr = hdr
+
+ return record
+
+
+cdef class VariantHeaderRecords(object):
+ """sequence of :class:`VariantHeaderRecord` object from a :class:`VariantHeader` object"""
+ def __init__(self, *args, **kwargs):
+ raise TypeError('this class cannot be instantiated from Python')
+
+ def __len__(self):
+ return self.header.ptr.nhrec
+
+ def __bool__(self):
+ return self.header.ptr.nhrec != 0
+
+ def __getitem__(self, index):
+ cdef int32_t i = index
+ if i < 0 or i >= self.header.ptr.nhrec:
+ raise IndexError('invalid header record index')
+ return makeVariantHeaderRecord(self.header, self.header.ptr.hrec[i])
+
+ def __iter__(self):
+ cdef int32_t i
+ for i in range(self.header.ptr.nhrec):
+ yield makeVariantHeaderRecord(self.header, self.header.ptr.hrec[i])
+
+ __hash__ = None
+
+
+cdef VariantHeaderRecords makeVariantHeaderRecords(VariantHeader header):
+ if not header:
+ raise ValueError('invalid VariantHeader')
+
+ cdef VariantHeaderRecords records = VariantHeaderRecords.__new__(VariantHeaderRecords)
+ records.header = header
+ return records
+
+
+cdef class VariantMetadata(object):
+ """filter, info or format metadata record from a :class:`VariantHeader` object"""
+ def __init__(self, *args, **kwargs):
+ raise TypeError('this class cannot be instantiated from Python')
+
+ @property
+ def name(self):
+ """metadata name"""
+ cdef bcf_hdr_t *hdr = self.header.ptr
+ return bcf_str_cache_get_charptr(hdr.id[BCF_DT_ID][self.id].key)
+
+ # Q: Should this be exposed?
+ @property
+ def id(self):
+ """metadata internal header id number"""
+ return self.id
+
+ @property
+ def number(self):
+ """metadata number (i.e. cardinality)"""
+ cdef bcf_hdr_t *hdr = self.header.ptr
+
+ if not check_header_id(hdr, self.type, self.id):
+ raise ValueError('Invalid header id')
+
+ if self.type == BCF_HL_FLT:
+ return None
+
+ cdef int l = bcf_hdr_id2length(hdr, self.type, self.id)
+ if l == BCF_VL_FIXED:
+ return bcf_hdr_id2number(hdr, self.type, self.id)
+ elif l == BCF_VL_VAR:
+ return '.'
+ else:
+ return METADATA_LENGTHS[l]
+
+ @property
+ def type(self):
+ """metadata value type"""
+ cdef bcf_hdr_t *hdr = self.header.ptr
+ if not check_header_id(hdr, self.type, self.id):
+ raise ValueError('Invalid header id')
+
+ if self.type == BCF_HL_FLT:
+ return None
+ return VALUE_TYPES[bcf_hdr_id2type(hdr, self.type, self.id)]
+
+ @property
+ def description(self):
+ """metadata description (or None if not set)"""
+ descr = self.record.get('Description')
+ if descr:
+ descr = descr.strip('"')
+ return force_str(descr)
+
+ @property
+ def record(self):
+ """:class:`VariantHeaderRecord` associated with this :class:`VariantMetadata` object"""
+ cdef bcf_hdr_t *hdr = self.header.ptr
+ if not check_header_id(hdr, self.type, self.id):
+ raise ValueError('Invalid header id')
+ cdef bcf_hrec_t *hrec = hdr.id[BCF_DT_ID][self.id].val.hrec[self.type]
+ if not hrec:
+ return None
+ return makeVariantHeaderRecord(self.header, hrec)
+
+ def remove_header(self):
+ cdef bcf_hdr_t *hdr = self.header.ptr
+ cdef const char *bkey = hdr.id[BCF_DT_ID][self.id].key
+ bcf_hdr_remove(hdr, self.type, bkey)
+
+
+cdef VariantMetadata makeVariantMetadata(VariantHeader header, int type, int id):
+ if not header:
+ raise ValueError('invalid VariantHeader')
+
+ if type != BCF_HL_FLT and type != BCF_HL_INFO and type != BCF_HL_FMT:
+ raise ValueError('invalid metadata type')
+
+ if id < 0 or id >= header.ptr.n[BCF_DT_ID]:
+ raise ValueError('invalid metadata id')
+
+ cdef VariantMetadata meta = VariantMetadata.__new__(VariantMetadata)
+ meta.header = header
+ meta.type = type
+ meta.id = id
+
+ return meta
+
+
+cdef class VariantHeaderMetadata(object):
+ """mapping from filter, info or format name to :class:`VariantMetadata` object"""
+ def __init__(self, *args, **kwargs):
+ raise TypeError('this class cannot be instantiated from Python')
+
+ def add(self, id, number, type, description, **kwargs):
+ """Add a new filter, info or format record"""
+ if id in self:
+ raise ValueError('Header already exists for id={}'.format(id))
+
+ if self.type == BCF_HL_FLT:
+ if number is not None:
+ raise ValueError('Number must be None when adding a filter')
+ if type is not None:
+ raise ValueError('Type must be None when adding a filter')
+
+ items = [('ID', id), ('Description', description)]
+ else:
+ if type not in VALUE_TYPES:
+ raise ValueError('unknown type specified: {}'.format(type))
+ if number is None:
+ number = '.'
+
+ items = [('ID', id),
+ ('Number', number),
+ ('Type', type),
+ ('Description', description)]
+
+ items += kwargs.items()
+ self.header.add_meta(METADATA_TYPES[self.type], items=items)
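+
+ # Illustrative usage, assuming a mutable `header` object:
+ # header.info.add('DP', 1, 'Integer', 'Raw read depth')
+ # header.filters.add('q10', None, None, 'Quality below 10')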
+
+ def __len__(self):
+ cdef bcf_hdr_t *hdr = self.header.ptr
+ cdef bcf_idpair_t *idpair
+ cdef int32_t i, n = 0
+
+ for i in range(hdr.n[BCF_DT_ID]):
+ idpair = hdr.id[BCF_DT_ID] + i
+ if idpair.key and idpair.val and idpair.val.info[self.type] & 0xF != 0xF:
+ n += 1
+ return n
+
+ def __bool__(self):
+ cdef bcf_hdr_t *hdr = self.header.ptr
+ cdef bcf_idpair_t *idpair
+ cdef int32_t i
+
+ for i in range(hdr.n[BCF_DT_ID]):
+ idpair = hdr.id[BCF_DT_ID] + i
+ if idpair.key and idpair.val and idpair.val.info[self.type] & 0xF != 0xF:
+ return True
+ return False
+
+ def __getitem__(self, key):
+ cdef bcf_hdr_t *hdr = self.header.ptr
+ cdef vdict_t *d = <vdict_t *>hdr.dict[BCF_DT_ID]
+
+ bkey = force_bytes(key)
+ cdef khiter_t k = kh_get_vdict(d, bkey)
+
+ if k == kh_end(d) or kh_val_vdict(d, k).info[self.type] & 0xF == 0xF:
+ raise KeyError('invalid key')
+
+ return makeVariantMetadata(self.header, self.type, kh_val_vdict(d, k).id)
+
+ def remove_header(self, key):
+ cdef bcf_hdr_t *hdr = self.header.ptr
+ cdef vdict_t *d = <vdict_t *>hdr.dict[BCF_DT_ID]
+
+ bkey = force_bytes(key)
+ cdef khiter_t k = kh_get_vdict(d, bkey)
+
+ if k == kh_end(d) or kh_val_vdict(d, k).info[self.type] & 0xF == 0xF:
+ raise KeyError('invalid key')
+
+ bcf_hdr_remove(hdr, self.type, bkey)
+ #bcf_hdr_sync(hdr)
+
+ def clear_header(self):
+ cdef bcf_hdr_t *hdr = self.header.ptr
+ bcf_hdr_remove(hdr, self.type, NULL)
+ #bcf_hdr_sync(hdr)
+
+ def __iter__(self):
+ cdef bcf_hdr_t *hdr = self.header.ptr
+ cdef bcf_idpair_t *idpair
+ cdef int32_t i
+
+ for i in range(hdr.n[BCF_DT_ID]):
+ idpair = hdr.id[BCF_DT_ID] + i
+ if idpair.key and idpair.val and idpair.val.info[self.type] & 0xF != 0xF:
+ yield bcf_str_cache_get_charptr(idpair.key)
+
+ def get(self, key, default=None):
+ """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
+ try:
+ return self[key]
+ except KeyError:
+ return default
+
+ def __contains__(self, key):
+ try:
+ self[key]
+ except KeyError:
+ return False
+ else:
+ return True
+
+ def iterkeys(self):
+ """D.iterkeys() -> an iterator over the keys of D"""
+ return iter(self)
+
+ def itervalues(self):
+ """D.itervalues() -> an iterator over the values of D"""
+ for key in self:
+ yield self[key]
+
+ def iteritems(self):
+ """D.iteritems() -> an iterator over the (key, value) items of D"""
+ for key in self:
+ yield (key, self[key])
+
+ def keys(self):
+ """D.keys() -> list of D's keys"""
+ return list(self)
+
+ def items(self):
+ """D.items() -> list of D's (key, value) pairs, as 2-tuples"""
+ return list(self.iteritems())
+
+ def values(self):
+ """D.values() -> list of D's values"""
+ return list(self.itervalues())
+
+ # Mappings are not hashable by default, but subclasses can change this
+ __hash__ = None
+
+ #TODO: implement __richcmp__
+
+
+cdef VariantHeaderMetadata makeVariantHeaderMetadata(VariantHeader header, int32_t type):
+ if not header:
+ raise ValueError('invalid VariantHeader')
+
+ cdef VariantHeaderMetadata meta = VariantHeaderMetadata.__new__(VariantHeaderMetadata)
+ meta.header = header
+ meta.type = type
+
+ return meta
+
+
+cdef class VariantContig(object):
+ """contig metadata from a :class:`VariantHeader`"""
+ def __init__(self, *args, **kwargs):
+ raise TypeError('this class cannot be instantiated from Python')
+
+ @property
+ def name(self):
+ """contig name"""
+ cdef bcf_hdr_t *hdr = self.header.ptr
+ return bcf_str_cache_get_charptr(hdr.id[BCF_DT_CTG][self.id].key)
+
+ @property
+ def id(self):
+ """contig internal id number"""
+ return self.id
+
+ @property
+ def length(self):
+ """contig length or None if not available"""
+ cdef bcf_hdr_t *hdr = self.header.ptr
+ cdef uint32_t length = hdr.id[BCF_DT_CTG][self.id].val.info[0]
+ return length if length else None
+
+ @property
+ def header(self):
+ """:class:`VariantHeaderRecord` associated with this :class:`VariantContig` object"""
+ cdef bcf_hdr_t *hdr = self.header.ptr
+ cdef bcf_hrec_t *hrec = hdr.id[BCF_DT_CTG][self.id].val.hrec[0]
+ return makeVariantHeaderRecord(self.header, hrec)
+
+ def remove_header(self):
+ cdef bcf_hdr_t *hdr = self.header.ptr
+ cdef const char *bkey = hdr.id[BCF_DT_CTG][self.id].key
+ bcf_hdr_remove(hdr, BCF_HL_CTG, bkey)
+
+
+cdef VariantContig makeVariantContig(VariantHeader header, int id):
+ if not header:
+ raise ValueError('invalid VariantHeader')
+
+ if id < 0 or id >= header.ptr.n[BCF_DT_CTG]:
+ raise ValueError('invalid contig id')
+
+ cdef VariantContig contig = VariantContig.__new__(VariantContig)
+ contig.header = header
+ contig.id = id
+
+ return contig
+
+
+cdef class VariantHeaderContigs(object):
+ """mapping from contig name or index to :class:`VariantContig` object."""
+ def __init__(self, *args, **kwargs):
+ raise TypeError('this class cannot be instantiated from Python')
+
+ def __len__(self):
+ cdef bcf_hdr_t *hdr = self.header.ptr
+ assert kh_size(<vdict_t *>hdr.dict[BCF_DT_CTG]) == hdr.n[BCF_DT_CTG]
+ return hdr.n[BCF_DT_CTG]
+
+ def __bool__(self):
+ cdef bcf_hdr_t *hdr = self.header.ptr
+ assert kh_size(<vdict_t *>hdr.dict[BCF_DT_CTG]) == hdr.n[BCF_DT_CTG]
+ return hdr.n[BCF_DT_CTG] != 0
+
+ def __getitem__(self, key):
+ cdef bcf_hdr_t *hdr = self.header.ptr
+ cdef int index
+
+ if isinstance(key, int):
+ index = key
+ if index < 0 or index >= hdr.n[BCF_DT_CTG]:
+ raise IndexError('invalid contig index')
+ return makeVariantContig(self.header, index)
+
+ cdef vdict_t *d = <vdict_t *>hdr.dict[BCF_DT_CTG]
+ bkey = force_bytes(key)
+ cdef khiter_t k = kh_get_vdict(d, bkey)
+
+ if k == kh_end(d):
+ raise KeyError('invalid contig')
+
+ cdef int id = kh_val_vdict(d, k).id
+
+ return makeVariantContig(self.header, id)
+
+ def remove_header(self, key):
+ cdef bcf_hdr_t *hdr = self.header.ptr
+ cdef int index
+ cdef const char *bkey
+ cdef vdict_t *d
+ cdef khiter_t k
+
+ if isinstance(key, int):
+ index = key
+ if index < 0 or index >= hdr.n[BCF_DT_CTG]:
+ raise IndexError('invalid contig index')
+ bkey = hdr.id[BCF_DT_CTG][index].key
+ else:
+ d = <vdict_t *>hdr.dict[BCF_DT_CTG]
+ key = force_bytes(key)
+ if kh_get_vdict(d, key) == kh_end(d):
+ raise KeyError('invalid contig')
+ bkey = key
+
+ bcf_hdr_remove(hdr, BCF_HL_CTG, bkey)
+
+ def clear_header(self):
+ cdef bcf_hdr_t *hdr = self.header.ptr
+ bcf_hdr_remove(hdr, BCF_HL_CTG, NULL)
+ #bcf_hdr_sync(hdr)
+
+ def __iter__(self):
+ cdef bcf_hdr_t *hdr = self.header.ptr
+ cdef vdict_t *d = <vdict_t *>hdr.dict[BCF_DT_CTG]
+ cdef uint32_t n = kh_size(d)
+
+ assert n == hdr.n[BCF_DT_CTG]
+
+ for i in range(n):
+ yield bcf_str_cache_get_charptr(bcf_hdr_id2name(hdr, i))
+
+ def get(self, key, default=None):
+ """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
+ try:
+ return self[key]
+ except KeyError:
+ return default
+
+ def __contains__(self, key):
+ try:
+ self[key]
+ except KeyError:
+ return False
+ else:
+ return True
+
+ def iterkeys(self):
+ """D.iterkeys() -> an iterator over the keys of D"""
+ return iter(self)
+
+ def itervalues(self):
+ """D.itervalues() -> an iterator over the values of D"""
+ for key in self:
+ yield self[key]
+
+ def iteritems(self):
+ """D.iteritems() -> an iterator over the (key, value) items of D"""
+ for key in self:
+ yield (key, self[key])
+
+ def keys(self):
+ """D.keys() -> list of D's keys"""
+ return list(self)
+
+ def items(self):
+ """D.items() -> list of D's (key, value) pairs, as 2-tuples"""
+ return list(self.iteritems())
+
+ def values(self):
+ """D.values() -> list of D's values"""
+ return list(self.itervalues())
+
+ # Mappings are not hashable by default, but subclasses can change this
+ __hash__ = None
+
+ #TODO: implement __richcmp__
+
+ def add(self, id, **kwargs):
+ """Add a new contig record"""
+ if id in self:
+ raise ValueError('Header already exists for contig {}'.format(id))
+
+ items = [('ID', id)] + list(kwargs.items())
+ self.header.add_meta('contig', items=items)
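+
+ # Illustrative usage: header.contigs.add('chr1', length=248956422)
+ # produces a ##contig=<ID=chr1,length=248956422> header line.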
+
+
+cdef VariantHeaderContigs makeVariantHeaderContigs(VariantHeader header):
+ if not header:
+ raise ValueError('invalid VariantHeader')
+
+ cdef VariantHeaderContigs contigs = VariantHeaderContigs.__new__(VariantHeaderContigs)
+ contigs.header = header
+
+ return contigs
+
+
+cdef class VariantHeaderSamples(object):
+ """sequence of sample names from a :class:`VariantHeader` object"""
+ def __init__(self, *args, **kwargs):
+ raise TypeError('this class cannot be instantiated from Python')
+
+ def __len__(self):
+ return bcf_hdr_nsamples(self.header.ptr)
+
+ def __bool__(self):
+ return bcf_hdr_nsamples(self.header.ptr) != 0
+
+ def __getitem__(self, index):
+ cdef bcf_hdr_t *hdr = self.header.ptr
+ cdef int32_t n = bcf_hdr_nsamples(hdr)
+ cdef int32_t i = index
+
+ if i < 0 or i >= n:
+ raise IndexError('invalid sample index')
+
+ return charptr_to_str(hdr.samples[i])
+
+ def __iter__(self):
+ cdef bcf_hdr_t *hdr = self.header.ptr
+ cdef int32_t i, n = bcf_hdr_nsamples(hdr)
+
+ for i in range(n):
+ yield charptr_to_str(hdr.samples[i])
+
+ def __contains__(self, key):
+ cdef bcf_hdr_t *hdr = self.header.ptr
+ cdef vdict_t *d = <vdict_t *>hdr.dict[BCF_DT_SAMPLE]
+ bkey = force_bytes(key)
+ cdef khiter_t k = kh_get_vdict(d, bkey)
+
+ return k != kh_end(d)
+
+ # Mappings are not hashable by default, but subclasses can change this
+ __hash__ = None
+
+ #TODO: implement __richcmp__
+
+ def add(self, name):
+ """Add a new sample"""
+ self.header.add_sample(name)
+
+
+cdef VariantHeaderSamples makeVariantHeaderSamples(VariantHeader header):
+ if not header:
+ raise ValueError('invalid VariantHeader')
+
+ cdef VariantHeaderSamples samples = VariantHeaderSamples.__new__(VariantHeaderSamples)
+ samples.header = header
+
+ return samples
+
+
+cdef class VariantHeader(object):
+ """header information for a :class:`VariantFile` object"""
+ #FIXME: Add structured proxy
+ #FIXME: Add generic proxy
+ #FIXME: Add mutable methods
+
+ # See makeVariantHeader for C constructor
+ def __cinit__(self):
+ self.ptr = NULL
+
+ # Python constructor
+ def __init__(self):
+ self.ptr = bcf_hdr_init(b'w')
+ if not self.ptr:
+ raise ValueError('cannot create VariantHeader')
+
+ def __dealloc__(self):
+ if self.ptr:
+ bcf_hdr_destroy(self.ptr)
+ self.ptr = NULL
+
+ def __bool__(self):
+ # self.ptr == NULL should be impossible
+ return self.ptr != NULL
+
+ def copy(self):
+ return makeVariantHeader(bcf_hdr_dup(self.ptr))
+
+ def merge(self, VariantHeader header):
+ if header is None:
+ raise ValueError('header must not be None')
+ bcf_hdr_merge(self.ptr, header.ptr)
+
+ @property
+ def version(self):
+ """VCF version"""
+ return force_str(bcf_hdr_get_version(self.ptr))
+
+ @property
+ def samples(self):
+ """samples (:class:`VariantHeaderSamples`)"""
+ return makeVariantHeaderSamples(self)
+
+ @property
+ def records(self):
+ """header records (:class:`VariantHeaderRecords`)"""
+ return makeVariantHeaderRecords(self)
+
+ @property
+ def contigs(self):
+ """contig information (:class:`VariantHeaderContigs`)"""
+ return makeVariantHeaderContigs(self)
+
+ @property
+ def filters(self):
+ """filter metadata (:class:`VariantHeaderMetadata`)"""
+ return makeVariantHeaderMetadata(self, BCF_HL_FLT)
+
+ @property
+ def info(self):
+ """info metadata (:class:`VariantHeaderMetadata`)"""
+ return makeVariantHeaderMetadata(self, BCF_HL_INFO)
+
+ @property
+ def formats(self):
+ """format metadata (:class:`VariantHeaderMetadata`)"""
+ return makeVariantHeaderMetadata(self, BCF_HL_FMT)
+
+ @property
+ def alts(self):
+ """alt metadata (:class:`dict` ID->record).
+
+ The data returned is just a snapshot of the alt records: the dict
+ is rebuilt every time the property is accessed, and modifications
+ to it will not be reflected in the header metadata, and vice versa.
+
+ i.e. it is just a dict that reflects the state of the alt records
+ at the time it is created.
+ """
+ return {record['ID']:record for record in self.records
+ if record.key.upper() == 'ALT' }
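+
+ # Note, for illustration: given a header containing
+ # ##ALT=<ID=DEL,Description="Deletion">, header.alts['DEL'] returns the
+ # corresponding VariantHeaderRecord; mutating the returned dict has no
+ # effect on the header itself.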
+
+ # only safe to do when opening an htsfile
+ cdef _subset_samples(self, include_samples):
+ keep_samples = set(self.samples)
+ include_samples = set(include_samples)
+ missing_samples = include_samples - keep_samples
+ keep_samples &= include_samples
+
+ if missing_samples:
+ # FIXME: add specialized exception with payload
+ raise ValueError(
+ 'missing {:d} requested samples'.format(
+ len(missing_samples)))
+
+ keep_samples = force_bytes(','.join(keep_samples))
+ cdef char *keep = <char *>keep_samples if keep_samples else NULL
+ cdef ret = bcf_hdr_set_samples(self.ptr, keep, 0)
+
+ if ret != 0:
+ raise ValueError(
+ 'bcf_hdr_set_samples failed: ret = {}'.format(ret))
+
+ def __str__(self):
+ cdef int hlen
+ cdef char *hstr = bcf_hdr_fmt_text(self.ptr, 0, &hlen)
+
+ try:
+ return charptr_to_str_w_len(hstr, hlen)
+ finally:
+ free(hstr)
+
+ cpdef VariantRecord new_record(self):
+ """Create a new empty VariantRecord"""
+ r = makeVariantRecord(self, bcf_init())
+ r.ptr.n_sample = bcf_hdr_nsamples(self.ptr)
+ return r
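+
+ # Illustrative usage, assuming 'chr1' is a contig declared in this
+ # header:
+ # rec = header.new_record()
+ # rec.chrom = 'chr1'
+ # rec.pos = 100
+ # rec.alleles = ('A', 'T')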
+
+ def add_record(self, VariantHeaderRecord record):
+ """Add an existing :class:`VariantHeaderRecord` to this header"""
+ if record is None:
+ raise ValueError('record must not be None')
+
+ cdef bcf_hrec_t *hrec = bcf_hrec_dup(record.ptr)
+
+ bcf_hdr_add_hrec(self.ptr, hrec)
+
+ if self.ptr.dirty:
+ bcf_hdr_sync(self.ptr)
+
+ def add_line(self, line):
+ """Add a metadata line to this header"""
+ bline = force_bytes(line)
+ if bcf_hdr_append(self.ptr, bline) < 0:
+ raise ValueError('invalid header line')
+
+ if self.ptr.dirty:
+ bcf_hdr_sync(self.ptr)
+
+ def add_meta(self, key, value=None, items=None):
+ """Add metadata to this header"""
+ if not ((value is not None) ^ (items is not None)):
+ raise ValueError('either value or items must be specified, but not both')
+
+ cdef bcf_hrec_t *hrec = <bcf_hrec_t*>calloc(1, sizeof(bcf_hrec_t))
+ cdef int quoted
+
+ try:
+ key = force_bytes(key)
+ hrec.key = strdup(key)
+
+ if value is not None:
+ hrec.value = strdup(force_bytes(value))
+ else:
+ for key, value in items:
+ key = force_bytes(key)
+ bcf_hrec_add_key(hrec, key, <int>len(key))
+
+ value = force_bytes(str(value))
+ quoted = strpbrk(value, ' ;,"\t<>') != NULL
+ bcf_hrec_set_val(hrec, hrec.nkeys-1, value, <int>len(value), quoted)
+ except:
+ bcf_hrec_destroy(hrec)
+ raise
+
+ bcf_hdr_add_hrec(self.ptr, hrec)
+
+ if self.ptr.dirty:
+ bcf_hdr_sync(self.ptr)
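+
+ # Illustrative usage -- a structured line:
+ # header.add_meta('INFO', items=[('ID', 'AF'), ('Number', 'A'),
+ # ('Type', 'Float'), ('Description', 'Allele frequency')])
+ # or a generic key=value line:
+ # header.add_meta('source', value='pysam')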
+
+ def add_sample(self, name):
+ """Add a new sample to this header"""
+ bname = force_bytes(name)
+ if bcf_hdr_add_sample(self.ptr, bname) < 0:
+ raise ValueError('Duplicated sample name: {}'.format(name))
+ if self.ptr.dirty:
+ bcf_hdr_sync(self.ptr)
+
+
+cdef VariantHeader makeVariantHeader(bcf_hdr_t *hdr):
+ if not hdr:
+ raise ValueError('cannot create VariantHeader')
+
+ cdef VariantHeader header = VariantHeader.__new__(VariantHeader)
+ header.ptr = hdr
+
+ return header
+
+
+########################################################################
+########################################################################
+## Variant Record objects
+########################################################################
+
+cdef class VariantRecordFilter(object):
+ """Filters set on a :class:`VariantRecord` object, presented as a mapping from
+ filter index or name to :class:`VariantMetadata` object"""
+ def __init__(self, *args, **kwargs):
+ raise TypeError('this class cannot be instantiated from Python')
+
+ def __len__(self):
+ return self.record.ptr.d.n_flt
+
+ def __bool__(self):
+ return self.record.ptr.d.n_flt != 0
+
+ def __getitem__(self, key):
+ cdef bcf_hdr_t *hdr = self.record.header.ptr
+ cdef bcf1_t *r = self.record.ptr
+ cdef int index, id
+ cdef int n = r.d.n_flt
+
+ if isinstance(key, int):
+ index = key
+
+ if index < 0 or index >= n:
+ raise IndexError('invalid filter index')
+
+ id = r.d.flt[index]
+ else:
+ if key == '.':
+ key = 'PASS'
+
+ bkey = force_bytes(key)
+ id = bcf_hdr_id2int(hdr, BCF_DT_ID, bkey)
+
+ if not check_header_id(hdr, BCF_HL_FLT, id) or not bcf_has_filter(hdr, r, bkey):
+ raise KeyError('Invalid filter')
+
+ return makeVariantMetadata(self.record.header, BCF_HL_FLT, id)
+
+ def add(self, key):
+ """Add a new filter"""
+ cdef bcf_hdr_t *hdr = self.record.header.ptr
+ cdef bcf1_t *r = self.record.ptr
+ cdef int id
+
+ if key == '.':
+ key = 'PASS'
+
+ bkey = force_bytes(key)
+ id = bcf_hdr_id2int(hdr, BCF_DT_ID, bkey)
+
+ if not check_header_id(hdr, BCF_HL_FLT, id):
+ raise KeyError('Invalid filter')
+
+ bcf_add_filter(hdr, r, id)
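+
+ # Illustrative usage: rec.filter.add('q10') sets the q10 filter on
+ # the record; the name must already be declared as a FILTER in the
+ # header.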
+
+ def __delitem__(self, key):
+ cdef bcf_hdr_t *hdr = self.record.header.ptr
+ cdef bcf1_t *r = self.record.ptr
+ cdef int index, id
+ cdef int n = r.d.n_flt
+
+ if isinstance(key, int):
+ index = key
+
+ if index < 0 or index >= n:
+ raise IndexError('invalid filter index')
+
+ id = r.d.flt[index]
+ else:
+ if key == '.':
+ key = 'PASS'
+
+ bkey = force_bytes(key)
+ id = bcf_hdr_id2int(hdr, BCF_DT_ID, bkey)
+
+ if not check_header_id(hdr, BCF_HL_FLT, id) or not bcf_has_filter(hdr, r, bkey):
+ raise KeyError('Invalid filter')
+
+ bcf_remove_filter(hdr, r, id, 0)
+
+ def clear(self):
+ """Clear all filters"""
+ cdef bcf1_t *r = self.record.ptr
+ r.d.shared_dirty |= BCF1_DIRTY_FLT
+ r.d.n_flt = 0
+
+ def __iter__(self):
+ cdef bcf_hdr_t *hdr = self.record.header.ptr
+ cdef bcf1_t *r = self.record.ptr
+ cdef int i
+
+ for i in range(r.d.n_flt):
+ yield bcf_str_cache_get_charptr(bcf_hdr_int2id(hdr, BCF_DT_ID, r.d.flt[i]))
+
+ def get(self, key, default=None):
+ """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
+ try:
+ return self[key]
+ except KeyError:
+ return default
+
+ def __contains__(self, key):
+ cdef bcf_hdr_t *hdr = self.record.header.ptr
+ cdef bcf1_t *r = self.record.ptr
+ bkey = force_bytes(key)
+ return bcf_has_filter(hdr, r, bkey) == 1
+
+ def iterkeys(self):
+ """D.iterkeys() -> an iterator over the keys of D"""
+ return iter(self)
+
+ def itervalues(self):
+ """D.itervalues() -> an iterator over the values of D"""
+ for key in self:
+ yield self[key]
+
+ def iteritems(self):
+ """D.iteritems() -> an iterator over the (key, value) items of D"""
+ for key in self:
+ yield (key, self[key])
+
+ def keys(self):
+ """D.keys() -> list of D's keys"""
+ return list(self)
+
+ def items(self):
+ """D.items() -> list of D's (key, value) pairs, as 2-tuples"""
+ return list(self.iteritems())
+
+ def values(self):
+ """D.values() -> list of D's values"""
+ return list(self.itervalues())
+
+ # Mappings are not hashable by default, but subclasses can change this
+ __hash__ = None
+
+ #TODO: implement __richcmp__
+
+
+cdef VariantRecordFilter makeVariantRecordFilter(VariantRecord record):
+ if not record:
+ raise ValueError('invalid VariantRecord')
+
+ cdef VariantRecordFilter filter = VariantRecordFilter.__new__(VariantRecordFilter)
+ filter.record = record
+
+ return filter
+
+
+cdef class VariantRecordFormat(object):
+ """Format data present for each sample in a :class:`VariantRecord` object,
+ presented as a mapping from format name to :class:`VariantMetadata` object."""
+ def __init__(self, *args, **kwargs):
+ raise TypeError('this class cannot be instantiated from Python')
+
+ def __len__(self):
+ cdef bcf_hdr_t *hdr = self.record.header.ptr
+ cdef bcf1_t *r = self.record.ptr
+ cdef int i, n = 0
+
+ for i in range(r.n_fmt):
+ if r.d.fmt[i].p:
+ n += 1
+ return n
+
+ def __bool__(self):
+ cdef bcf_hdr_t *hdr = self.record.header.ptr
+ cdef bcf1_t *r = self.record.ptr
+ cdef int i
+
+ for i in range(r.n_fmt):
+ if r.d.fmt[i].p:
+ return True
+ return False
+
+ def __getitem__(self, key):
+ cdef bcf_hdr_t *hdr = self.record.header.ptr
+ cdef bcf1_t *r = self.record.ptr
+
+ bkey = force_bytes(key)
+ cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey)
+
+ if not fmt or not fmt.p:
+ raise KeyError('unknown format')
+
+ return makeVariantMetadata(self.record.header, BCF_HL_FMT, fmt.id)
+
+ def __delitem__(self, key):
+ cdef bcf_hdr_t *hdr = self.record.header.ptr
+ cdef bcf1_t *r = self.record.ptr
+
+ bkey = force_bytes(key)
+ cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey)
+
+ if not fmt or not fmt.p:
+ raise KeyError('unknown format')
+
+ if bcf_update_format(hdr, r, bkey, fmt.p, 0, fmt.type) < 0:
+ raise ValueError('Unable to delete FORMAT')
+
+ def clear(self):
+ """Clear all formats for all samples within the associated
+ :class:`VariantRecord` instance"""
+ cdef bcf_hdr_t *hdr = self.record.header.ptr
+ cdef bcf1_t *r = self.record.ptr
+ cdef bcf_fmt_t *fmt
+ cdef const char *key
+ cdef int i
+
+ for i in reversed(range(r.n_fmt)):
+ fmt = &r.d.fmt[i]
+ if fmt.p:
+ key = bcf_hdr_int2id(hdr, BCF_DT_ID, fmt.id)
+ if bcf_update_format(hdr, r, key, fmt.p, 0, fmt.type) < 0:
+ raise ValueError('Unable to delete FORMAT')
+
+ def __iter__(self):
+ cdef bcf_hdr_t *hdr = self.record.header.ptr
+ cdef bcf1_t *r = self.record.ptr
+ cdef bcf_fmt_t *fmt
+ cdef int i
+
+ for i in range(r.n_fmt):
+ fmt = &r.d.fmt[i]
+ if fmt.p:
+ yield bcf_str_cache_get_charptr(bcf_hdr_int2id(hdr, BCF_DT_ID, fmt.id))
+
+ def get(self, key, default=None):
+ """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
+ try:
+ return self[key]
+ except KeyError:
+ return default
+
+ def __contains__(self, key):
+ cdef bcf_hdr_t *hdr = self.record.header.ptr
+ cdef bcf1_t *r = self.record.ptr
+ bkey = force_bytes(key)
+ cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey)
+ return fmt != NULL and fmt.p != NULL
+
+ def iterkeys(self):
+ """D.iterkeys() -> an iterator over the keys of D"""
+ return iter(self)
+
+ def itervalues(self):
+ """D.itervalues() -> an iterator over the values of D"""
+ for key in self:
+ yield self[key]
+
+ def iteritems(self):
+ """D.iteritems() -> an iterator over the (key, value) items of D"""
+ for key in self:
+ yield (key, self[key])
+
+ def keys(self):
+ """D.keys() -> list of D's keys"""
+ return list(self)
+
+ def items(self):
+ """D.items() -> list of D's (key, value) pairs, as 2-tuples"""
+ return list(self.iteritems())
+
+ def values(self):
+ """D.values() -> list of D's values"""
+ return list(self.itervalues())
+
+ # Mappings are not hashable by default, but subclasses can change this
+ __hash__ = None
+
+ #TODO: implement __richcmp__
+
+
+cdef VariantRecordFormat makeVariantRecordFormat(VariantRecord record):
+ if not record:
+ raise ValueError('invalid VariantRecord')
+
+ cdef VariantRecordFormat format = VariantRecordFormat.__new__(VariantRecordFormat)
+ format.record = record
+
+ return format
+
+
+#TODO: Add a getmeta method to return the corresponding VariantMetadata?
+cdef class VariantRecordInfo(object):
+ """Info data stored in a :class:`VariantRecord` object, presented as a
+ mapping from info metadata name to value."""
+
+ def __init__(self, *args, **kwargs):
+ raise TypeError('this class cannot be instantiated from Python')
+
+ def __len__(self):
+ return self.record.ptr.n_info
+
+ def __bool__(self):
+ return self.record.ptr.n_info != 0
+
+ def __getitem__(self, key):
+ cdef bcf_hdr_t *hdr = self.record.header.ptr
+ cdef bcf1_t *r = self.record.ptr
+ cdef vdict_t *d
+ cdef khiter_t k
+ cdef int info_id
+
+ if bcf_unpack(r, BCF_UN_INFO) < 0:
+ raise ValueError('Error unpacking VariantRecord')
+
+ bkey = force_bytes(key)
+ cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey)
+
+ if not info:
+ d = <vdict_t *>hdr.dict[BCF_DT_ID]
+ k = kh_get_vdict(d, bkey)
+
+ if k == kh_end(d) or kh_val_vdict(d, k).info[BCF_HL_INFO] & 0xF == 0xF:
+ raise KeyError('Unknown INFO field: {}'.format(key))
+
+ info_id = kh_val_vdict(d, k).id
+ else:
+ info_id = info.key
+
+ if not check_header_id(hdr, BCF_HL_INFO, info_id):
+ raise ValueError('Invalid header')
+
+ if bcf_hdr_id2type(hdr, BCF_HL_INFO, info_id) == BCF_HT_FLAG:
+ return info != NULL and info.vptr != NULL
+
+ if not info or not info.vptr:
+ raise KeyError('Invalid INFO field: {}'.format(key))
+
+ return bcf_info_get_value(self.record, info)
+
+ def __setitem__(self, key, value):
+ bcf_info_set_value(self.record, key, value)
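+ # Illustrative usage (the key must be declared as an INFO field in
+ # the header): rec.info['DP'] = 20, or rec.info['AF'] = (0.5,) for
+ # a Number=A field.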
+
+ def __delitem__(self, key):
+ cdef bcf_hdr_t *hdr = self.record.header.ptr
+ cdef bcf1_t *r = self.record.ptr
+
+ if bcf_unpack(r, BCF_UN_INFO) < 0:
+ raise ValueError('Error unpacking VariantRecord')
+
+ bkey = force_bytes(key)
+ cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey)
+
+ if not info or not info.vptr:
+ raise KeyError('Unknown INFO field: {}'.format(key))
+
+ if bcf_update_info(hdr, r, bkey, NULL, 0, info.type) < 0:
+ raise ValueError('Unable to delete INFO')
+
+ def clear(self):
+ """Clear all info data"""
+ cdef bcf_hdr_t *hdr = self.record.header.ptr
+ cdef bcf1_t *r = self.record.ptr
+ cdef bcf_info_t *info
+ cdef const char *key
+ cdef int i
+
+ if bcf_unpack(r, BCF_UN_INFO) < 0:
+ raise ValueError('Error unpacking VariantRecord')
+
+ for i in range(r.n_info):
+ info = &r.d.info[i]
+ if info and info.vptr:
+ key = bcf_hdr_int2id(hdr, BCF_DT_ID, info.key)
+ if bcf_update_info(hdr, r, key, NULL, 0, info.type) < 0:
+ raise ValueError('Unable to delete INFO')
+
+ def __iter__(self):
+ cdef bcf_hdr_t *hdr = self.record.header.ptr
+ cdef bcf1_t *r = self.record.ptr
+ cdef bcf_info_t *info
+ cdef int i
+
+ for i in range(r.n_info):
+ info = &r.d.info[i]
+ if info and info.vptr:
+ yield bcf_str_cache_get_charptr(bcf_hdr_int2id(hdr, BCF_DT_ID, info.key))
+
+ def get(self, key, default=None):
+ """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
+ try:
+ return self[key]
+ except KeyError:
+ return default
+
+ def __contains__(self, key):
+ cdef bcf_hdr_t *hdr = self.record.header.ptr
+ cdef bcf1_t *r = self.record.ptr
+
+ if bcf_unpack(r, BCF_UN_INFO) < 0:
+ raise ValueError('Error unpacking VariantRecord')
+
+ bkey = force_bytes(key)
+ cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey)
+
+ return info != NULL and info.vptr != NULL
+
+ def iterkeys(self):
+ """D.iterkeys() -> an iterator over the keys of D"""
+ return iter(self)
+
+ def itervalues(self):
+ """D.itervalues() -> an iterator over the values of D"""
+ cdef bcf1_t *r = self.record.ptr
+ cdef bcf_info_t *info
+ cdef int i
+
+ for i in range(r.n_info):
+ info = &r.d.info[i]
+ if info and info.vptr:
+ yield bcf_info_get_value(self.record, info)
+
+ def iteritems(self):
+ """D.iteritems() -> an iterator over the (key, value) items of D"""
+ cdef bcf_hdr_t *hdr = self.record.header.ptr
+ cdef bcf1_t *r = self.record.ptr
+ cdef bcf_info_t *info
+ cdef int i
+
+ for i in range(r.n_info):
+ info = &r.d.info[i]
+ if info and info.vptr:
+ key = bcf_hdr_int2id(hdr, BCF_DT_ID, info.key)
+ value = bcf_info_get_value(self.record, info)
+ yield bcf_str_cache_get_charptr(key), value
+
+ def keys(self):
+ """D.keys() -> list of D's keys"""
+ return list(self)
+
+ def items(self):
+ """D.items() -> list of D's (key, value) pairs, as 2-tuples"""
+ return list(self.iteritems())
+
+ def values(self):
+ """D.values() -> list of D's values"""
+ return list(self.itervalues())
+
+ # Mappings are not hashable by default, but subclasses can change this
+ __hash__ = None
+
+ #TODO: implement __richcmp__
+
+
+cdef VariantRecordInfo makeVariantRecordInfo(VariantRecord record):
+ if not record:
+ raise ValueError('invalid VariantRecord')
+
+ cdef VariantRecordInfo info = VariantRecordInfo.__new__(VariantRecordInfo)
+ info.record = record
+
+ return info
+
+
+cdef class VariantRecordSamples(object):
+ """mapping from sample index or name to :class:`VariantRecordSample` object."""
+ def __init__(self, *args, **kwargs):
+ raise TypeError('this class cannot be instantiated from Python')
+
+ def __len__(self):
+ return bcf_hdr_nsamples(self.record.header.ptr)
+
+ def __bool__(self):
+ return bcf_hdr_nsamples(self.record.header.ptr) != 0
+
+ def __getitem__(self, key):
+ cdef bcf_hdr_t *hdr = self.record.header.ptr
+ cdef bcf1_t *r = self.record.ptr
+ cdef int n = bcf_hdr_nsamples(hdr)
+ cdef int sample_index
+ cdef vdict_t *d
+ cdef khiter_t k
+
+ if isinstance(key, int):
+ sample_index = key
+ else:
+ bkey = force_bytes(key)
+ sample_index = bcf_hdr_id2int(hdr, BCF_DT_SAMPLE, bkey)
+ if sample_index < 0:
+ raise KeyError('invalid sample name')
+
+ if sample_index < 0 or sample_index >= n:
+ raise IndexError('invalid sample index')
+
+ return makeVariantRecordSample(self.record, sample_index)
+
+ def __iter__(self):
+ cdef bcf_hdr_t *hdr = self.record.header.ptr
+ cdef bcf1_t *r = self.record.ptr
+ cdef int32_t i, n = bcf_hdr_nsamples(hdr)
+
+ for i in range(n):
+ yield charptr_to_str(hdr.samples[i])
+
+ def get(self, key, default=None):
+ """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
+ try:
+ return self[key]
+ except KeyError:
+ return default
+
+ def __contains__(self, key):
+ cdef bcf_hdr_t *hdr = self.record.header.ptr
+ cdef bcf1_t *r = self.record.ptr
+ cdef int n = bcf_hdr_nsamples(hdr)
+ cdef int sample_index
+ cdef vdict_t *d
+ cdef khiter_t k
+
+ if isinstance(key, int):
+ sample_index = key
+ else:
+ bkey = force_bytes(key)
+ sample_index = bcf_hdr_id2int(hdr, BCF_DT_SAMPLE, bkey)
+ if sample_index < 0:
+ raise KeyError('invalid sample name')
+
+ return 0 <= sample_index < n
+
+ def iterkeys(self):
+ """D.iterkeys() -> an iterator over the keys of D"""
+ return iter(self)
+
+ def itervalues(self):
+ """D.itervalues() -> an iterator over the values of D"""
+ cdef bcf_hdr_t *hdr = self.record.header.ptr
+ cdef bcf1_t *r = self.record.ptr
+ cdef int32_t i, n = bcf_hdr_nsamples(hdr)
+
+ for i in range(n):
+ yield makeVariantRecordSample(self.record, i)
+
+ def iteritems(self):
+ """D.iteritems() -> an iterator over the (key, value) items of D"""
+ cdef bcf_hdr_t *hdr = self.record.header.ptr
+ cdef bcf1_t *r = self.record.ptr
+ cdef int32_t i, n = bcf_hdr_nsamples(hdr)
+
+ for i in range(n):
+ yield (charptr_to_str(hdr.samples[i]), makeVariantRecordSample(self.record, i))
+
+ def keys(self):
+ """D.keys() -> list of D's keys"""
+ return list(self)
+
+ def items(self):
+ """D.items() -> list of D's (key, value) pairs, as 2-tuples"""
+ return list(self.iteritems())
+
+ def values(self):
+ """D.values() -> list of D's values"""
+ return list(self.itervalues())
+
+ # Mappings are not hashable by default, but subclasses can change this
+ __hash__ = None
+
+ #TODO: implement __richcmp__
+
+
+cdef VariantRecordSamples makeVariantRecordSamples(VariantRecord record):
+ if not record:
+ raise ValueError('invalid VariantRecord')
+
+ cdef VariantRecordSamples samples = VariantRecordSamples.__new__(
+ VariantRecordSamples)
+ samples.record = record
+
+ return samples
+
+
+cdef class VariantRecord(object):
+ """Variant record"""
+ def __init__(self, *args, **kwargs):
+ raise TypeError('this class cannot be instantiated from Python')
+
+ def __dealloc__(self):
+ if self.ptr:
+ bcf_destroy1(self.ptr)
+ self.ptr = NULL
+
+ def copy(self):
+ """return a copy of this VariantRecord object"""
+ return makeVariantRecord(self.header, bcf_dup(self.ptr))
+
+ def translate(self, VariantHeader dst_header):
+ if dst_header is None:
+ raise ValueError('dst_header must not be None')
+
+ cdef bcf_hdr_t *src_hdr = self.header.ptr
+ cdef bcf_hdr_t *dst_hdr = dst_header.ptr
+
+ if src_hdr != dst_hdr:
+ if self.ptr.n_sample != bcf_hdr_nsamples(dst_hdr):
+ msg = 'Cannot translate record. Number of samples does not match header ({} vs {})'
+ raise ValueError(msg.format(self.ptr.n_sample, bcf_hdr_nsamples(dst_hdr)))
+
+ bcf_translate(dst_hdr, src_hdr, self.ptr)
+
+ @property
+ def rid(self):
+ """internal reference id number"""
+ return self.ptr.rid
+
+ @rid.setter
+ def rid(self, value):
+ cdef bcf_hdr_t *hdr = self.header.ptr
+ cdef int r = value
+ if r < 0 or r >= hdr.n[BCF_DT_CTG] or not hdr.id[BCF_DT_CTG][r].val:
+ raise ValueError('invalid reference id')
+ self.ptr.rid = r
+
+ @property
+ def chrom(self):
+ """chromosome/contig name"""
+ cdef bcf_hdr_t *hdr = self.header.ptr
+ cdef int rid = self.ptr.rid
+ if rid < 0 or rid >= hdr.n[BCF_DT_CTG]:
+ raise ValueError('Invalid header')
+ return bcf_str_cache_get_charptr(bcf_hdr_id2name(hdr, rid))
+
+ @chrom.setter
+ def chrom(self, value):
+ cdef vdict_t *d = <vdict_t*>self.header.ptr.dict[BCF_DT_CTG]
+ bchrom = force_bytes(value)
+ cdef khint_t k = kh_get_vdict(d, bchrom)
+ if k == kh_end(d):
+ raise ValueError('Invalid chromosome/contig')
+ self.ptr.rid = kh_val_vdict(d, k).id
+
+ @property
+ def contig(self):
+ """chromosome/contig name"""
+ cdef bcf_hdr_t *hdr = self.header.ptr
+ cdef int rid = self.ptr.rid
+ if rid < 0 or rid >= hdr.n[BCF_DT_CTG]:
+ raise ValueError('Invalid header')
+ return bcf_str_cache_get_charptr(bcf_hdr_id2name(hdr, rid))
+
+ @contig.setter
+ def contig(self, value):
+ cdef vdict_t *d = <vdict_t*>self.header.ptr.dict[BCF_DT_CTG]
+ bchrom = force_bytes(value)
+ cdef khint_t k = kh_get_vdict(d, bchrom)
+ if k == kh_end(d):
+ raise ValueError('Invalid chromosome/contig')
+ self.ptr.rid = kh_val_vdict(d, k).id
+
+ @property
+ def pos(self):
+ """record start position on chrom/contig (1-based inclusive)"""
+ return self.ptr.pos + 1
+
+ @pos.setter
+ def pos(self, value):
+ cdef int p = value
+ if p < 1:
+ raise ValueError('Position must be positive')
+ self.ptr.pos = p - 1
+
+ @property
+ def start(self):
+ """record start position on chrom/contig (0-based inclusive)"""
+ return self.ptr.pos
+
+ @start.setter
+ def start(self, value):
+ cdef int s = value
+ if s < 0:
+ raise ValueError('Start coordinate must be non-negative')
+ self.ptr.pos = s
+
+ @property
+ def stop(self):
+ """record stop position on chrom/contig (0-based exclusive)"""
+ return self.ptr.pos + self.ptr.rlen
+
+ @stop.setter
+ def stop(self, value):
+ cdef int s = value
+ if s < self.ptr.pos:
+ raise ValueError('Stop coordinate must be greater than or equal to start')
+ self.ptr.rlen = s - self.ptr.pos
+ if self.ptr.rlen != len(self.ref) or 'END' in self.info:
+ self.info['END'] = s
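+
+ # Coordinate conventions, for illustration: a SNP at the first base
+ # of a contig has pos == 1 (1-based, as printed in VCF text) but
+ # start == 0 and stop == 1 (0-based half-open, as used by BED/pysam).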
+
+ @property
+ def rlen(self):
+ """record length on chrom/contig (typically rec.stop - rec.start unless END info is supplied)"""
+ return self.ptr.rlen
+
+ @rlen.setter
+ def rlen(self, value):
+ cdef int r = value
+ if r < 0:
+ raise ValueError('Reference length must be non-negative')
+ self.ptr.rlen = r
+ if r != len(self.ref) or 'END' in self.info:
+ self.info['END'] = self.ptr.pos + r
+
+ @property
+ def qual(self):
+ """phred scaled quality score or None if not available"""
+ return self.ptr.qual if not bcf_float_is_missing(self.ptr.qual) else None
+
+ @qual.setter
+ def qual(self, value):
+ if value is not None:
+ self.ptr.qual = value
+ else:
+ bcf_float_set(&self.ptr.qual, bcf_float_missing)
+
+
+# @property
+# def n_allele(self):
+# return self.ptr.n_allele
+
+# @property
+# def n_sample(self):
+# return self.ptr.n_sample
+
+ @property
+ def id(self):
+ """record identifier or None if not available"""
+ cdef bcf1_t *r = self.ptr
+ if bcf_unpack(r, BCF_UN_STR) < 0:
+ raise ValueError('Error unpacking VariantRecord')
+ return bcf_str_cache_get_charptr(r.d.id) if r.d.id != b'.' else None
+
+ @id.setter
+ def id(self, value):
+ cdef bcf1_t *r = self.ptr
+ if bcf_unpack(r, BCF_UN_STR) < 0:
+ raise ValueError('Error unpacking VariantRecord')
+ cdef char *idstr = NULL
+ if value is not None:
+ bid = force_bytes(value)
+ idstr = bid
+ if bcf_update_id(self.header.ptr, self.ptr, idstr) < 0:
+ raise ValueError('Error updating id')
+
+ @property
+ def ref(self):
+ """reference allele"""
+ cdef bcf1_t *r = self.ptr
+ if bcf_unpack(r, BCF_UN_STR) < 0:
+ raise ValueError('Error unpacking VariantRecord')
+ return charptr_to_str(r.d.allele[0]) if r.d.allele else None
+
+ @ref.setter
+ def ref(self, value):
+ cdef bcf1_t *r = self.ptr
+ if bcf_unpack(r, BCF_UN_STR) < 0:
+ raise ValueError('Error unpacking VariantRecord')
+ #FIXME: Set alleles directly -- this is stupid
+ if not value:
+ raise ValueError('ref allele must not be null')
+ value = force_bytes(value)
+ if r.d.allele and r.n_allele:
+ alleles = [r.d.allele[i] for i in range(r.n_allele)]
+ alleles[0] = value
+ else:
+ alleles = [value]
+ self.alleles = alleles
+
+ @property
+ def alleles(self):
+ """tuple of reference allele followed by alt alleles"""
+ cdef bcf1_t *r = self.ptr
+ if bcf_unpack(r, BCF_UN_STR) < 0:
+ raise ValueError('Error unpacking VariantRecord')
+ if not r.d.allele:
+ return None
+ cdef tuple res = PyTuple_New(r.n_allele)
+ for i in range(r.n_allele):
+ a = charptr_to_str(r.d.allele[i])
+ PyTuple_SET_ITEM(res, i, a)
+ Py_INCREF(a)
+ return res
+
+ @alleles.setter
+ def alleles(self, value):
+ cdef bcf1_t *r = self.ptr
+ if bcf_unpack(r, BCF_UN_STR) < 0:
+ raise ValueError('Error unpacking VariantRecord')
+ value = [force_bytes(v) for v in value]
+ if b'' in value:
+ raise ValueError('cannot set null allele')
+ value = b','.join(value)
+ if bcf_update_alleles_str(self.header.ptr, r, value) < 0:
+ raise ValueError('Error updating alleles')
+
+ @property
+ def alts(self):
+ """tuple of alt alleles"""
+ cdef bcf1_t *r = self.ptr
+ if bcf_unpack(r, BCF_UN_STR) < 0:
+ raise ValueError('Error unpacking VariantRecord')
+ if r.n_allele < 2 or not r.d.allele:
+ return None
+ cdef tuple res = PyTuple_New(r.n_allele - 1)
+ for i in range(1, r.n_allele):
+ a = charptr_to_str(r.d.allele[i])
+ PyTuple_SET_ITEM(res, i - 1, a)
+ Py_INCREF(a)
+ return res
+
+ @alts.setter
+ def alts(self, value):
+ #FIXME: Set alleles directly -- this is stupid
+ cdef bcf1_t *r = self.ptr
+ if bcf_unpack(r, BCF_UN_STR) < 0:
+ raise ValueError('Error unpacking VariantRecord')
+ value = [force_bytes(v) for v in value]
+ if b'' in value:
+ raise ValueError('cannot set null alt allele')
+ ref = [r.d.allele[0] if r.d.allele and r.n_allele else b'.']
+ self.alleles = ref + value
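+
+    # Illustrative sketch (assumed values): for a record with REF=A and a
+    # single alternate allele G,
+    #
+    #     rec.ref     == 'A'
+    #     rec.alts    == ('G',)
+    #     rec.alleles == ('A', 'G')
+    #
+    # and assigning rec.alleles = ('A', 'G', 'T') adds a second alt allele.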
+
+ @property
+ def filter(self):
+ """filter information (see :class:`VariantRecordFilter`)"""
+ if bcf_unpack(self.ptr, BCF_UN_FLT) < 0:
+ raise ValueError('Error unpacking VariantRecord')
+ return makeVariantRecordFilter(self)
+
+ @property
+ def info(self):
+ """info data (see :class:`VariantRecordInfo`)"""
+ if bcf_unpack(self.ptr, BCF_UN_INFO) < 0:
+ raise ValueError('Error unpacking VariantRecord')
+ return makeVariantRecordInfo(self)
+
+ @property
+ def format(self):
+ """sample format metadata (see :class:`VariantRecordFormat`)"""
+ if bcf_unpack(self.ptr, BCF_UN_FMT) < 0:
+ raise ValueError('Error unpacking VariantRecord')
+ return makeVariantRecordFormat(self)
+
+ @property
+ def samples(self):
+ """sample data (see :class:`VariantRecordSamples`)"""
+ if bcf_unpack(self.ptr, BCF_UN_ALL) < 0:
+ raise ValueError('Error unpacking VariantRecord')
+ return makeVariantRecordSamples(self)
+
+ def __str__(self):
+ cdef kstring_t line
+ cdef char c
+
+ line.l = line.m = 0
+ line.s = NULL
+
+ if vcf_format(self.header.ptr, self.ptr, &line) < 0:
+ if line.m:
+ free(line.s)
+ raise ValueError('vcf_format failed')
+
+ # Strip CR/LF?
+ #while line.l:
+ # c = line.s[line.l - 1]
+ # if c != b'\n' and c != b'\r':
+ # break
+ # line.l -= 1
+
+ ret = charptr_to_str_w_len(line.s, line.l)
+
+ if line.m:
+ free(line.s)
+
+ return ret
+
+
+cdef VariantRecord makeVariantRecord(VariantHeader header, bcf1_t *r):
+ if not header:
+ raise ValueError('invalid VariantHeader')
+
+ if not r:
+ raise ValueError('cannot create VariantRecord')
+
+ if r.errcode:
+ msg = []
+ #if r.errcode & BCF_ERR_CTG_UNDEF:
+ # msg.append('undefined contig')
+ #if r.errcode & BCF_ERR_TAG_UNDEF:
+ # msg.append('undefined tag')
+ if r.errcode & BCF_ERR_NCOLS:
+ msg.append('invalid number of columns')
+ if r.errcode & BCF_ERR_LIMITS:
+ msg.append('limits violated')
+ if r.errcode & BCF_ERR_CHAR:
+ msg.append('invalid character found')
+ if r.errcode & BCF_ERR_CTG_INVALID:
+ msg.append('invalid contig')
+ if r.errcode & BCF_ERR_TAG_INVALID:
+ msg.append('invalid tag')
+
+ if msg:
+ msg = ', '.join(msg)
+ raise ValueError('Error(s) reading record: {}'.format(msg))
+
+ cdef VariantRecord record = VariantRecord.__new__(VariantRecord)
+ record.header = header
+ record.ptr = r
+
+ return record
+
+
+########################################################################
+########################################################################
+## VariantRecordSample object
+########################################################################
+
+
+cdef class VariantRecordSample(object):
+ """Data for a single sample from a :class:`VariantRecord` object.
+ Provides data accessors for genotypes and a mapping interface
+ from format name to values.
+ """
+ def __init__(self, *args, **kwargs):
+ raise TypeError('this class cannot be instantiated from Python')
+
+ @property
+ def name(self):
+ """sample name"""
+ cdef bcf_hdr_t *hdr = self.record.header.ptr
+ cdef bcf1_t *r = self.record.ptr
+ cdef int32_t n = bcf_hdr_nsamples(hdr)
+
+ if self.index < 0 or self.index >= n:
+ raise ValueError('invalid sample index')
+
+ return charptr_to_str(hdr.samples[self.index])
+
+ @property
+ def allele_indices(self):
+ """allele indices for called genotype, if present. Otherwise None"""
+ return bcf_format_get_allele_indices(self)
+
+ @allele_indices.setter
+ def allele_indices(self, value):
+ self['GT'] = value
+
+ @allele_indices.deleter
+ def allele_indices(self):
+ self['GT'] = ()
+
+ @property
+ def alleles(self):
+ """alleles for called genotype, if present. Otherwise None"""
+ return bcf_format_get_alleles(self)
+
+ @alleles.setter
+ def alleles(self, value):
+ self['GT'] = value
+
+ @alleles.deleter
+ def alleles(self):
+ self['GT'] = ()
+
+ @property
+ def phased(self):
+ """False if genotype is missing or any allele is unphased. Otherwise True."""
+ return bcf_sample_get_phased(self)
+
+ @phased.setter
+ def phased(self, value):
+ bcf_sample_set_phased(self, value)
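+
+    # Usage sketch (hedged; the record `rec` and sample name 'NA001' are
+    # assumptions):
+    #
+    #     sample = rec.samples['NA001']
+    #     sample.allele_indices = (0, 1)   # writes GT as 0/1
+    #     sample.phased = True             # GT is now rendered as 0|1
+    #     sample.alleles                   # e.g. ('A', 'G'), via rec.alleles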
+
+ def __len__(self):
+ cdef bcf_hdr_t *hdr = self.record.header.ptr
+ cdef bcf1_t *r = self.record.ptr
+ cdef int i, n = 0
+
+ if bcf_unpack(r, BCF_UN_FMT) < 0:
+ raise ValueError('Error unpacking VariantRecord')
+
+ for i in range(r.n_fmt):
+ if r.d.fmt[i].p:
+ n += 1
+ return n
+
+ def __bool__(self):
+ cdef bcf_hdr_t *hdr = self.record.header.ptr
+ cdef bcf1_t *r = self.record.ptr
+ cdef int i
+
+ if bcf_unpack(r, BCF_UN_FMT) < 0:
+ raise ValueError('Error unpacking VariantRecord')
+
+ for i in range(r.n_fmt):
+ if r.d.fmt[i].p:
+ return True
+ return False
+
+ def __getitem__(self, key):
+ return bcf_format_get_value(self, key)
+
+ def __setitem__(self, key, value):
+ bcf_format_set_value(self, key, value)
+
+ def __delitem__(self, key):
+ bcf_format_del_value(self, key)
+
+ def clear(self):
+ """Clear all format data (including genotype) for this sample"""
+ cdef bcf_hdr_t *hdr = self.record.header.ptr
+ cdef bcf1_t *r = self.record.ptr
+ cdef bcf_fmt_t *fmt
+ cdef int i
+
+ for i in range(r.n_fmt):
+ fmt = &r.d.fmt[i]
+ if fmt.p:
+ bcf_format_del_value(self, bcf_hdr_int2id(hdr, BCF_DT_ID, fmt.id))
+
+ def __iter__(self):
+ cdef bcf_hdr_t *hdr = self.record.header.ptr
+ cdef bcf1_t *r = self.record.ptr
+ cdef bcf_fmt_t *fmt
+ cdef int i
+
+ for i in range(r.n_fmt):
+ fmt = &r.d.fmt[i]
+ if r.d.fmt[i].p:
+ yield bcf_str_cache_get_charptr(bcf_hdr_int2id(hdr, BCF_DT_ID, fmt.id))
+
+ def get(self, key, default=None):
+ """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
+ try:
+ return self[key]
+ except KeyError:
+ return default
+
+ def __contains__(self, key):
+ cdef bcf_hdr_t *hdr = self.record.header.ptr
+ cdef bcf1_t *r = self.record.ptr
+ bkey = force_bytes(key)
+ cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey)
+ return fmt != NULL and fmt.p != NULL
+
+ def iterkeys(self):
+ """D.iterkeys() -> an iterator over the keys of D"""
+ return iter(self)
+
+ def itervalues(self):
+ """D.itervalues() -> an iterator over the values of D"""
+ for key in self:
+ yield self[key]
+
+ def iteritems(self):
+ """D.iteritems() -> an iterator over the (key, value) items of D"""
+ for key in self:
+ yield (key, self[key])
+
+ def keys(self):
+ """D.keys() -> list of D's keys"""
+ return list(self)
+
+ def items(self):
+ """D.items() -> list of D's (key, value) pairs, as 2-tuples"""
+ return list(self.iteritems())
+
+ def values(self):
+ """D.values() -> list of D's values"""
+ return list(self.itervalues())
+
+ # Mappings are not hashable by default, but subclasses can change this
+ __hash__ = None
+
+ #TODO: implement __richcmp__
+
+
+cdef VariantRecordSample makeVariantRecordSample(VariantRecord record, int32_t sample_index):
+ if not record or sample_index < 0:
+ raise ValueError('cannot create VariantRecordSample')
+
+ cdef VariantRecordSample sample = VariantRecordSample.__new__(VariantRecordSample)
+ sample.record = record
+ sample.index = sample_index
+
+ return sample
+
+
+########################################################################
+########################################################################
+## Index objects
+########################################################################
+
+
+cdef class BaseIndex(object):
+ def __init__(self):
+ self.refs = ()
+        self.refmap = {}
+
+ def __len__(self):
+ return len(self.refs)
+
+ def __bool__(self):
+ return len(self.refs) != 0
+
+ def __getitem__(self, key):
+ if isinstance(key, int):
+ return self.refs[key]
+ else:
+ return self.refmap[key]
+
+ def __iter__(self):
+ return iter(self.refs)
+
+ def get(self, key, default=None):
+ """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
+ try:
+ return self[key]
+ except KeyError:
+ return default
+
+ def __contains__(self, key):
+ try:
+ self[key]
+ except KeyError:
+ return False
+ else:
+ return True
+
+ def iterkeys(self):
+ """D.iterkeys() -> an iterator over the keys of D"""
+ return iter(self)
+
+ def itervalues(self):
+ """D.itervalues() -> an iterator over the values of D"""
+ for key in self:
+ yield self[key]
+
+ def iteritems(self):
+ """D.iteritems() -> an iterator over the (key, value) items of D"""
+ for key in self:
+ yield (key, self[key])
+
+ def keys(self):
+ """D.keys() -> list of D's keys"""
+ return list(self)
+
+ def items(self):
+ """D.items() -> list of D's (key, value) pairs, as 2-tuples"""
+ return list(self.iteritems())
+
+ def values(self):
+ """D.values() -> list of D's values"""
+ return list(self.itervalues())
+
+ # Mappings are not hashable by default, but subclasses can change this
+ __hash__ = None
+
+ #TODO: implement __richcmp__
+
+
+cdef class BCFIndex(BaseIndex):
+ """CSI index data structure for BCF files"""
+ def __init__(self):
+ self.refs = ()
+ self.refmap = {}
+
+ if not self.ptr:
+ raise ValueError('Invalid index object')
+
+ cdef int n
+ cdef const char **refs = bcf_index_seqnames(self.ptr, self.header.ptr, &n)
+
+ self.refs = char_array_to_tuple(refs, n, free_after=1) if refs else ()
+ self.refmap = { r:i for i,r in enumerate(self.refs) }
+
+ def __dealloc__(self):
+ if self.ptr:
+ hts_idx_destroy(self.ptr)
+ self.ptr = NULL
+
+ def fetch(self, bcf, contig, start, stop, region, reopen):
+ return BCFIterator(bcf, contig, start, stop, region, reopen)
+
+
+cdef BCFIndex makeBCFIndex(VariantHeader header, hts_idx_t *idx):
+ if not idx:
+ return None
+
+ if not header:
+ raise ValueError('invalid VariantHeader')
+
+ cdef BCFIndex index = BCFIndex.__new__(BCFIndex)
+ index.header = header
+ index.ptr = idx
+ index.__init__()
+
+ return index
+
+
+cdef class TabixIndex(BaseIndex):
+ """Tabix index data structure for VCF files"""
+ def __init__(self):
+ self.refs = ()
+ self.refmap = {}
+
+ if not self.ptr:
+ raise ValueError('Invalid index object')
+
+ cdef int n
+ cdef const char **refs = tbx_seqnames(self.ptr, &n)
+
+ self.refs = char_array_to_tuple(refs, n, free_after=1) if refs else ()
+ self.refmap = { r:i for i,r in enumerate(self.refs) }
+
+ def __dealloc__(self):
+ if self.ptr:
+ tbx_destroy(self.ptr)
+ self.ptr = NULL
+
+ def fetch(self, bcf, contig, start, stop, region, reopen):
+ return TabixIterator(bcf, contig, start, stop, region, reopen)
+
+
+cdef TabixIndex makeTabixIndex(tbx_t *idx):
+ if not idx:
+ return None
+
+ cdef TabixIndex index = TabixIndex.__new__(TabixIndex)
+ index.ptr = idx
+ index.__init__()
+
+ return index
+
+
+########################################################################
+########################################################################
+## Iterators
+########################################################################
+
+
+cdef class BaseIterator(object):
+ pass
+
+
+# Internal function to clean up after iteration stop or failure.
+# This would be a nested function if it weren't a cdef function.
+cdef void _stop_BCFIterator(BCFIterator self, bcf1_t *record):
+ bcf_destroy1(record)
+
+ # destroy iter so future calls to __next__ raise StopIteration
+ bcf_itr_destroy(self.iter)
+ self.iter = NULL
+
+
+cdef class BCFIterator(BaseIterator):
+ def __init__(self, VariantFile bcf, contig=None, start=None, stop=None, region=None, reopen=True):
+ if bcf is None:
+ raise ValueError('bcf must not be None')
+
+ if not isinstance(bcf.index, BCFIndex):
+ raise ValueError('bcf index required')
+
+ cdef BCFIndex index = bcf.index
+ cdef int rid, cstart, cstop
+ cdef char *cregion
+
+ if not index:
+ raise ValueError('bcf index required')
+
+ if reopen:
+ bcf = bcf.copy()
+
+ if region is not None:
+ if contig is not None or start is not None or stop is not None:
+                raise ValueError('cannot specify both region and contig/start/stop')
+
+ bregion = force_bytes(region)
+ cregion = bregion
+ with nogil:
+ self.iter = bcf_itr_querys(index.ptr, bcf.header.ptr, cregion)
+ else:
+ if contig is None:
+                raise ValueError('a contig or region must be specified')
+
+ try:
+ rid = index.refmap[contig]
+ except KeyError:
+ raise ValueError('Unknown contig specified')
+
+ if start is None:
+ start = 0
+ if stop is None:
+ stop = MAX_POS
+
+ cstart, cstop = start, stop
+
+ with nogil:
+ self.iter = bcf_itr_queryi(index.ptr, rid, cstart, cstop)
+
+ # Do not fail on self.iter == NULL, since it signifies a null query.
+
+ self.bcf = bcf
+ self.index = index
+
+ def __dealloc__(self):
+ if self.iter:
+ bcf_itr_destroy(self.iter)
+ self.iter = NULL
+
+ def __iter__(self):
+ return self
+
+ def __next__(self):
+ if not self.iter:
+ raise StopIteration
+
+ cdef bcf1_t *record = bcf_init1()
+
+ record.pos = -1
+ if self.bcf.drop_samples:
+ record.max_unpack = BCF_UN_SHR
+
+ cdef int ret
+
+ with nogil:
+ ret = bcf_itr_next(self.bcf.htsfile, self.iter, record)
+
+ if ret < 0:
+ _stop_BCFIterator(self, record)
+ if ret == -1:
+ raise StopIteration
+ else:
+ raise ValueError('error reading BCF file')
+
+ ret = bcf_subset_format(self.bcf.header.ptr, record)
+
+ if ret < 0:
+ _stop_BCFIterator(self, record)
+ raise ValueError('error in bcf_subset_format')
+
+ return makeVariantRecord(self.bcf.header, record)
+
+
+cdef class TabixIterator(BaseIterator):
+ def __cinit__(self, *args, **kwargs):
+ self.line_buffer.l = 0
+ self.line_buffer.m = 0
+ self.line_buffer.s = NULL
+
+ def __init__(self, VariantFile bcf, contig=None, start=None, stop=None, region=None, reopen=True):
+ if bcf is None:
+ raise ValueError('bcf must not be None')
+
+ if not isinstance(bcf.index, TabixIndex):
+ raise ValueError('tabix index required')
+
+ cdef TabixIndex index = bcf.index
+
+ if not index:
+ raise ValueError('bcf index required')
+
+ if reopen:
+ bcf = bcf.copy()
+
+ if region is not None:
+ if contig is not None or start is not None or stop is not None:
+                raise ValueError('cannot specify both region and contig/start/stop')
+
+            self.iter = tbx_itr_querys(index.ptr, force_bytes(region))
+ else:
+ if contig is None:
+                raise ValueError('a contig or region must be specified')
+
+ rid = index.refmap.get(contig, -1)
+
+ if start is None:
+ start = 0
+ if stop is None:
+ stop = MAX_POS
+
+ self.iter = tbx_itr_queryi(index.ptr, rid, start, stop)
+
+ # Do not fail on self.iter == NULL, since it signifies a null query.
+
+ self.bcf = bcf
+ self.index = index
+
+ def __dealloc__(self):
+ if self.iter:
+ tbx_itr_destroy(self.iter)
+ self.iter = NULL
+
+ if self.line_buffer.m:
+ free(self.line_buffer.s)
+
+ self.line_buffer.l = 0
+ self.line_buffer.m = 0
+ self.line_buffer.s = NULL
+
+ def __iter__(self):
+ return self
+
+ def __next__(self):
+ if not self.iter:
+ raise StopIteration
+
+ cdef int ret
+
+ with nogil:
+ ret = tbx_itr_next(self.bcf.htsfile, self.index.ptr, self.iter, &self.line_buffer)
+
+ if ret < 0:
+ tbx_itr_destroy(self.iter)
+ self.iter = NULL
+ if ret == -1:
+ raise StopIteration
+ else:
+ raise ValueError('error reading indexed VCF file')
+
+ cdef bcf1_t *record = bcf_init1()
+
+ record.pos = -1
+ if self.bcf.drop_samples:
+ record.max_unpack = BCF_UN_SHR
+
+ ret = vcf_parse1(&self.line_buffer, self.bcf.header.ptr, record)
+
+ # FIXME: stop iteration on parse failure?
+ if ret < 0:
+ bcf_destroy1(record)
+ raise ValueError('error in vcf_parse')
+
+ return makeVariantRecord(self.bcf.header, record)
+
+
+########################################################################
+########################################################################
+## Variant File
+########################################################################
+
+
+cdef class VariantFile(HTSFile):
+ """*(filename, mode=None, index_filename=None, header=None, drop_samples=False,
+ duplicate_filehandle=True)*
+
+ A :term:`VCF`/:term:`BCF` formatted file. The file is automatically
+ opened.
+
+ If an index for a variant file exists (.csi or .tbi), it will be
+ opened automatically. Without an index random access to records
+ via :meth:`fetch` is disabled.
+
+ For writing, a :class:`VariantHeader` object must be provided,
+ typically obtained from another :term:`VCF` file/:term:`BCF`
+ file.
+
+ Parameters
+ ----------
+ mode : string
+ *mode* should be ``r`` for reading or ``w`` for writing. The default is
+ text mode (:term:`VCF`). For binary (:term:`BCF`) I/O you should append
+ ``b`` for compressed or ``u`` for uncompressed :term:`BCF` output.
+
+ If ``b`` is present, it must immediately follow ``r`` or ``w``. Valid
+ modes are ``r``, ``w``, ``wh``, ``rb``, ``wb``, ``wbu`` and ``wb0``.
+ For instance, to open a :term:`BCF` formatted file for reading, type::
+
+ f = pysam.VariantFile('ex1.bcf','r')
+
+ If mode is not specified, we will try to auto-detect the file type. All
+ of the following should work::
+
+ f1 = pysam.VariantFile('ex1.bcf')
+ f2 = pysam.VariantFile('ex1.vcf')
+ f3 = pysam.VariantFile('ex1.vcf.gz')
+
+ index_filename : string
+ Explicit path to an index file.
+
+ header : VariantHeader
+ :class:`VariantHeader` object required for writing.
+
+ drop_samples: bool
+ Ignore sample information when reading.
+
+ duplicate_filehandle: bool
+ By default, file handles passed either directly or through
+ File-like objects will be duplicated before passing them to
+ htslib. The duplication prevents issues where the same stream
+ will be closed by htslib and through destruction of the
+ high-level python object. Set to False to turn off
+ duplication.
+
+ """
+ def __cinit__(self, *args, **kwargs):
+ self.htsfile = NULL
+
+ def __init__(self, *args, **kwargs):
+ self.header = None
+ self.index = None
+ self.filename = None
+ self.mode = None
+ self.index_filename = None
+ self.is_stream = False
+ self.is_remote = False
+ self.is_reading = False
+ self.drop_samples = False
+ self.header_written = False
+ self.start_offset = -1
+
+ self.open(*args, **kwargs)
+
+ def close(self):
+ """closes the :class:`pysam.VariantFile`."""
+        cdef int ret = 0
+        if self.htsfile:
+            # Write header if no records were written
+            if self.htsfile.is_write and not self.header_written:
+                self.header_written = True
+                with nogil:
+                    bcf_hdr_write(self.htsfile, self.header.ptr)
+
+            ret = hts_close(self.htsfile)
+            self.htsfile = NULL
+
+        # Release the header and index only after the file has been closed
+        self.header = self.index = None
+
+ if ret < 0:
+ global errno
+ if errno == EPIPE:
+ errno = 0
+ else:
+ raise OSError(errno, force_str(strerror(errno)))
+
+ def __iter__(self):
+ if not self.is_open:
+ raise ValueError('I/O operation on closed file')
+
+ if self.htsfile.is_write:
+            raise ValueError('cannot iterate over VariantFile opened for writing')
+
+ self.is_reading = 1
+ return self
+
+ def __next__(self):
+ cdef int ret
+ cdef bcf1_t *record = bcf_init1()
+
+ record.pos = -1
+ if self.drop_samples:
+ record.max_unpack = BCF_UN_SHR
+
+ with nogil:
+ ret = bcf_read1(self.htsfile, self.header.ptr, record)
+
+ if ret < 0:
+ bcf_destroy1(record)
+ if ret == -1:
+ raise StopIteration
+ elif ret == -2:
+ raise IOError('truncated file')
+ else:
+ raise ValueError('Variant read failed')
+
+ return makeVariantRecord(self.header, record)
+
+ def copy(self):
+ if not self.is_open:
+ raise ValueError
+
+ cdef VariantFile vars = VariantFile.__new__(VariantFile)
+ cdef bcf_hdr_t *hdr
+
+ # FIXME: re-open using fd or else header and index could be invalid
+ vars.htsfile = self._open_htsfile()
+
+ if not vars.htsfile:
+ raise ValueError('Cannot re-open htsfile')
+
+ # minimize overhead by re-using header and index. This approach is
+ # currently risky, but see above for how this can be mitigated.
+ vars.header = self.header
+ vars.index = self.index
+
+ vars.filename = self.filename
+ vars.mode = self.mode
+ vars.index_filename = self.index_filename
+ vars.drop_samples = self.drop_samples
+ vars.is_stream = self.is_stream
+ vars.is_remote = self.is_remote
+ vars.is_reading = self.is_reading
+ vars.start_offset = self.start_offset
+ vars.header_written = self.header_written
+
+ if self.htsfile.is_bin:
+ vars.seek(self.tell())
+        else:
+            # Non-binary (VCF) streams: re-read the header so the new handle
+            # is positioned at the first record; makeVariantHeader takes
+            # ownership of hdr and frees it when garbage-collected.
+            with nogil:
+                hdr = bcf_hdr_read(vars.htsfile)
+            makeVariantHeader(hdr)
+
+ return vars
+
+ def open(self, filename, mode='r',
+ index_filename=None,
+ VariantHeader header=None,
+ drop_samples=False,
+ duplicate_filehandle=True):
+ """open a vcf/bcf file.
+
+ If open is called on an existing VariantFile, the current file will be
+ closed and a new file will be opened.
+ """
+ cdef bcf_hdr_t *hdr
+ cdef BGZF *bgzfp
+ cdef hts_idx_t *idx
+ cdef tbx_t *tidx
+ cdef char *cfilename
+ cdef char *cindex_filename = NULL
+ cdef char *cmode
+
+ # close a previously opened file
+ if self.is_open:
+ self.close()
+
+ if not mode or mode[0] not in 'rwa':
+ raise ValueError('mode must begin with r, w or a')
+
+ self.duplicate_filehandle = duplicate_filehandle
+
+ format_modes = [m for m in mode[1:] if m in 'bcguz']
+ if len(format_modes) > 1:
+ raise ValueError('mode contains conflicting format specifiers: {}'.format(''.join(format_modes)))
+
+ invalid_modes = [m for m in mode[1:] if m not in 'bcguz0123456789ex']
+ if invalid_modes:
+ raise ValueError('invalid mode options: {}'.format(''.join(invalid_modes)))
+
+ # Autodetect mode from filename
+ if mode == 'w' and isinstance(filename, str):
+ if filename.endswith('.gz'):
+ mode = 'wz'
+ elif filename.endswith('.bcf'):
+ mode = 'wb'
+
+ # for htslib, wbu seems to not work
+ if mode == 'wbu':
+ mode = 'wb0'
+
+ self.mode = mode = force_bytes(mode)
+ try:
+ filename = encode_filename(filename)
+ self.is_remote = hisremote(filename)
+ self.is_stream = filename == b'-'
+        except TypeError:
+            # filename is not a str/bytes path (e.g. a file-like object);
+            # treat it as an already-open stream
+            self.is_remote = False
+            self.is_stream = True
+
+ self.filename = filename
+
+ if index_filename is not None:
+ self.index_filename = index_filename = encode_filename(index_filename)
+ else:
+ self.index_filename = None
+
+ self.drop_samples = bool(drop_samples)
+ self.header = None
+
+ self.header_written = False
+
+ if mode.startswith(b'w'):
+ # open file for writing
+ if index_filename is not None:
+ raise ValueError('Cannot specify an index filename when writing a VCF/BCF file')
+
+ # header structure (used for writing)
+ if header:
+ self.header = header.copy()
+ else:
+ self.header = VariantHeader()
+ #raise ValueError('a VariantHeader must be specified')
+
+ # Header is not written until the first write or on close
+ self.htsfile = self._open_htsfile()
+
+ if not self.htsfile:
+ raise ValueError("could not open file `{}` (mode='{}')".format(filename, mode))
+
+ elif mode.startswith(b'r'):
+ # open file for reading
+
+ if not self._exists():
+ raise IOError('file `{}` not found'.format(filename))
+
+ self.htsfile = self._open_htsfile()
+
+ if not self.htsfile:
+ raise ValueError("could not open file `{}` (mode='{}') - is it VCF/BCF format?".format(filename, mode))
+
+ if self.htsfile.format.format not in (bcf, vcf):
+ raise ValueError("invalid file `{}` (mode='{}') - is it VCF/BCF format?".format(filename, mode))
+
+ if self.htsfile.format.compression == bgzf:
+ bgzfp = hts_get_bgzfp(self.htsfile)
+ if bgzfp and bgzf_check_EOF(bgzfp) == 0:
+                    warn('[{}] Warning: no BGZF EOF marker; file may be truncated'.format(filename))
+
+ with nogil:
+ hdr = bcf_hdr_read(self.htsfile)
+
+ try:
+ self.header = makeVariantHeader(hdr)
+ except ValueError:
+ raise ValueError("file `{}` does not have valid header (mode='{}') - is it VCF/BCF format?".format(filename, mode))
+
+ if isinstance(self.filename, bytes):
+ cfilename = self.filename
+ else:
+ cfilename = NULL
+
+ # check for index and open if present
+ if self.htsfile.format.format == bcf and cfilename:
+ if index_filename is not None:
+ cindex_filename = index_filename
+ with nogil:
+ idx = bcf_index_load2(cfilename, cindex_filename)
+ self.index = makeBCFIndex(self.header, idx)
+
+ elif self.htsfile.format.compression == bgzf and cfilename:
+ if index_filename is not None:
+ cindex_filename = index_filename
+ with nogil:
+ tidx = tbx_index_load2(cfilename, cindex_filename)
+ self.index = makeTabixIndex(tidx)
+
+ if not self.is_stream:
+ self.start_offset = self.tell()
+ else:
+ raise ValueError("unknown mode {}".format(mode))
+
+ def reset(self):
+ """reset file position to beginning of file just after the header."""
+ return self.seek(self.start_offset)
+
+
+ def fetch(self, contig=None, start=None, stop=None, region=None, reopen=False):
+ """fetch records in a :term:`region` using 0-based indexing. The
+ region is specified by :term:`contig`, *start* and *end*.
+ Alternatively, a samtools :term:`region` string can be supplied.
+
+ Without *contig* or *region* all mapped records will be fetched. The
+ records will be returned ordered by contig, which will not necessarily
+ be the order within the file.
+
+ Set *reopen* to true if you will be using multiple iterators on the
+ same file at the same time. The iterator returned will receive its
+ own copy of a filehandle to the file effectively re-opening the
+        file. Re-opening a file incurs some overhead, so use with care.
+
+ If only *contig* is set, all records on *contig* will be fetched.
+ If both *region* and *contig* are given, an exception is raised.
+
+ Note that a bgzipped :term:`VCF`.gz file without a tabix/CSI index
+ (.tbi/.csi) or a :term:`BCF` file without a CSI index can only be
+ read sequentially.
+ """
+ if not self.is_open:
+ raise ValueError('I/O operation on closed file')
+
+ if self.htsfile.is_write:
+            raise ValueError('cannot fetch from VariantFile opened for writing')
+
+ if contig is None and region is None:
+ self.is_reading = 1
+ bcf = self.copy() if reopen else self
+ bcf.seek(self.start_offset)
+ return iter(bcf)
+
+ if not self.index:
+ raise ValueError('fetch requires an index')
+
+ self.is_reading = 1
+ return self.index.fetch(self, contig, start, stop, region, reopen)
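+
+    # Usage sketch (hedged; the path and contig name are assumptions):
+    #
+    #     vf = VariantFile('ex1.vcf.gz')
+    #     for rec in vf.fetch('chr1', 10000, 20000):   # 0-based half-open
+    #         print(rec.pos, rec.alleles)
+    #
+    # Pass reopen=True when running several iterators on the same file at
+    # once, so that each receives its own filehandle.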
+
+ cpdef VariantRecord new_record(self):
+ """Create a new empty VariantRecord"""
+ return self.header.new_record()
+
+ cpdef int write(self, VariantRecord record) except -1:
+ """
+ write a single :class:`pysam.VariantRecord` to disk.
+
+ returns the number of bytes written.
+ """
+ if record is None:
+ raise ValueError('record must not be None')
+
+ if not self.is_open:
+            raise ValueError('I/O operation on closed file')
+
+ if not self.htsfile.is_write:
+            raise ValueError('cannot write to a VariantFile opened for reading')
+
+ if not self.header_written:
+ self.header_written = True
+ with nogil:
+ bcf_hdr_write(self.htsfile, self.header.ptr)
+
+ #if record.header is not self.header:
+ # record.translate(self.header)
+ # raise ValueError('Writing records from a different VariantFile is not yet supported')
+
+ if record.ptr.n_sample != bcf_hdr_nsamples(self.header.ptr):
+ msg = 'Invalid VariantRecord. Number of samples does not match header ({} vs {})'
+ raise ValueError(msg.format(record.ptr.n_sample, bcf_hdr_nsamples(self.header.ptr)))
+
+ cdef int ret
+
+ with nogil:
+ ret = bcf_write1(self.htsfile, self.header.ptr, record.ptr)
+
+ if ret < 0:
+ raise IOError(errno, strerror(errno))
+
+ return ret
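+
+    # Usage sketch (hedged; both paths are assumptions): copy records from
+    # one file to another, reusing the input header for the output.
+    #
+    #     vf_in  = VariantFile('in.bcf')
+    #     vf_out = VariantFile('out.vcf', 'w', header=vf_in.header)
+    #     for rec in vf_in:
+    #         vf_out.write(rec)
+    #     vf_out.close()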
+
+ def subset_samples(self, include_samples):
+ """
+ Read only a subset of samples to reduce processing time and memory.
+ Must be called prior to retrieving records.
+ """
+ if not self.is_open:
+ raise ValueError('I/O operation on closed file')
+
+ if self.htsfile.is_write:
+            raise ValueError('cannot subset samples from VariantFile opened for writing')
+
+ if self.is_reading:
+ raise ValueError('cannot subset samples after fetching records')
+
+ self.header._subset_samples(include_samples)
+
+ # potentially unnecessary optimization that also sets max_unpack
+ if not include_samples:
+ self.drop_samples = True
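+
+    # Usage sketch (hedged; the path and sample names are assumptions):
+    # restrict parsing to two samples before iterating.
+    #
+    #     vf = VariantFile('in.bcf')
+    #     vf.subset_samples(['NA001', 'NA002'])
+    #     for rec in vf:
+    #         ...   # rec.samples now exposes only the requested samples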
--- /dev/null
+"""Functions that read and write block gzipped files.
+
+The user of the file doesn't have to worry about the compression
+and random access is allowed if an index file is present."""
+
+# based on Python 3.5's gzip module
+
+import io
+
+from libc.stdint cimport int8_t, int16_t, int32_t, int64_t
+from libc.stdint cimport uint8_t, uint16_t, uint32_t, uint64_t
+from libc.stdlib cimport malloc, calloc, realloc, free
+
+from cpython.object cimport PyObject
+from cpython.bytes cimport PyBytes_FromStringAndSize, _PyBytes_Resize
+
+from pysam.libcutils cimport force_bytes, force_str, charptr_to_str, charptr_to_str_w_len
+from pysam.libchtslib cimport *
+
+
+__all__ = ["BGZFile"]
+
+
+BUFFER_SIZE = io.DEFAULT_BUFFER_SIZE
+
+
+cdef class BGZFile(object):
+ """The BGZFile class simulates most of the methods of a file object with
+ the exception of the truncate() method.
+
+ This class only supports opening files in binary mode. If you need to open a
+ compressed file in text mode, use the gzip.open() function.
+ """
+ cdef BGZF* bgzf
+ cdef bytes name, index
+
+ def __init__(self, filename, mode=None, index=None):
+ """Constructor for the BGZFile class.
+
+ The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', 'wb', 'x', or
+ 'xb' depending on whether the file will be read or written. The default
+ is the mode of fileobj if discernible; otherwise, the default is 'rb'.
+ A mode of 'r' is equivalent to one of 'rb', and similarly for 'w' and
+ 'wb', 'a' and 'ab', and 'x' and 'xb'.
+ """
+ if mode and ('t' in mode or 'U' in mode):
+ raise ValueError("Invalid mode: {!r}".format(mode))
+ if not mode:
+ mode = 'rb'
+ if mode and 'b' not in mode:
+ mode += 'b'
+ self.name = force_bytes(filename)
+ self.index = force_bytes(index) if index is not None else None
+        self.bgzf = bgzf_open(self.name, force_bytes(mode))
+        if not self.bgzf:
+            raise IOError('Could not open file `{}`'.format(filename))
+
+        if self.bgzf.is_write and index is not None and bgzf_index_build_init(self.bgzf) < 0:
+            raise IOError('Error building bgzf index')
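+
+    # Usage sketch (hedged; the path is an assumption): write and re-read a
+    # block-gzipped file.
+    #
+    #     with BGZFile('example.txt.gz', 'wb') as f:
+    #         f.write(b'hello world\n')
+    #     with BGZFile('example.txt.gz') as f:
+    #         data = f.read()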
+
+ def __dealloc__(self):
+ self.close()
+
+    def write(self, data):
+ if not self.bgzf:
+ raise ValueError("write() on closed BGZFile object")
+
+ if not self.bgzf.is_write:
+ import errno
+ raise OSError(errno.EBADF, "write() on read-only BGZFile object")
+
+ if isinstance(data, bytes):
+ length = len(data)
+ else:
+ # accept any data that supports the buffer protocol
+ data = memoryview(data)
+ length = data.nbytes
+
+ if length > 0 and bgzf_write(self.bgzf, <char *>data, length) < 0:
+ raise IOError('BGZFile write failed')
+
+ return length
+
+ def read(self, size=-1):
+ cdef ssize_t read_size
+
+ if not self.bgzf:
+ raise ValueError("read() on closed BGZFile object")
+
+ if self.bgzf.is_write:
+ import errno
+ raise OSError(errno.EBADF, "read() on write-only BGZFile object")
+
+ if size < 0:
+ chunks = []
+ while 1:
+                chunk = PyBytes_FromStringAndSize(NULL, BUFFER_SIZE)
+                read_size = bgzf_read(self.bgzf, <char *>chunk, BUFFER_SIZE)
+ if read_size < 0:
+ raise IOError('Error reading from BGZFile')
+ elif not read_size:
+ break
+ elif read_size < BUFFER_SIZE:
+ chunk = chunk[:read_size]
+ chunks.append(chunk)
+ return b''.join(chunks)
+
+ elif size > 0:
+ chunk = PyBytes_FromStringAndSize(NULL, size)
+ read_size = bgzf_read(self.bgzf, <char *>chunk, size)
+ if read_size < 0:
+ raise IOError('Error reading from BGZFile')
+            elif read_size < size:
+                # keep only the bytes actually read
+                chunk = chunk[:read_size]
+ return chunk
+ else:
+ return b''
+
+ @property
+ def closed(self):
+ return self.bgzf == NULL
+
+ def close(self):
+ if not self.bgzf:
+ return
+
+ if self.bgzf.is_write and bgzf_flush(self.bgzf) < 0:
+ raise IOError('Error flushing BGZFile object')
+
+ if self.index and bgzf_index_dump(self.bgzf, self.index, NULL) < 0:
+ raise IOError('Cannot write index')
+
+ cdef ret = bgzf_close(self.bgzf)
+ self.bgzf = NULL
+
+ if ret < 0:
+ raise IOError('Error closing BGZFile object')
+
+ def __enter__(self):
+ return self
+
+ def __exit__(self, type, value, tb):
+ self.close()
+
+ def flush(self):
+ if not self.bgzf:
+ return
+
+ if self.bgzf.is_write and bgzf_flush(self.bgzf) < 0:
+ raise IOError('Error flushing BGZFile object')
+
+ def fileno(self):
+ """Invoke the underlying file object's fileno() method.
+
+ This will raise AttributeError if the underlying file object
+ doesn't support fileno().
+ """
+ raise AttributeError('fileno')
+
+ def rewind(self):
+ '''Return the uncompressed stream file position indicator to the
+ beginning of the file'''
+ if not self.bgzf:
+ raise ValueError("rewind() on closed BGZFile object")
+        if self.bgzf.is_write:
+            raise OSError("Can't rewind in write mode")
+ if bgzf_seek(self.bgzf, 0, SEEK_SET) < 0:
+ raise IOError('Error seeking BGZFFile object')
+
+ def readable(self):
+ if not self.bgzf:
+ raise ValueError("readable() on closed BGZFile object")
+ return self.bgzf != NULL and not self.bgzf.is_write
+
+ def writable(self):
+ return self.bgzf != NULL and self.bgzf.is_write
+
+ def seekable(self):
+ return True
+
+ def seek(self, offset, whence=io.SEEK_SET):
+ if not self.bgzf:
+ raise ValueError("seek() on closed BGZFile object")
+        if whence != io.SEEK_SET:
+            raise ValueError('Only seeking from the start of the file (SEEK_SET) is supported')
+
+ cdef int64_t off = bgzf_seek(self.bgzf, offset, SEEK_SET)
+ if off < 0:
+ raise IOError('Error seeking BGZFFile object')
+
+ return off
+
+ def readline(self, size=-1):
+ if not self.bgzf:
+ raise ValueError("readline() on closed BGZFile object")
+
+ cdef kstring_t line
+ cdef char c
+
+ line.l = line.m = 0
+ line.s = NULL
+ if bgzf_getline(self.bgzf, '\n', &line) < 0:
+ raise IOError('Error reading line in BGZFFile object')
+
+ ret = charptr_to_str_w_len(line.s, line.l)
+
+ if line.m:
+ free(line.s)
+
+ return ret
--- /dev/null
+from libc.stdint cimport int8_t, int16_t, int32_t, int64_t
+from libc.stdint cimport uint8_t, uint16_t, uint32_t, uint64_t
+from libc.stdlib cimport malloc, calloc, realloc, free
+from libc.string cimport memcpy, memcmp, strncpy, strlen, strdup
+from libc.stdio cimport FILE, printf
+cimport cython
+
+from cpython cimport array
+from pysam.libchtslib cimport faidx_t, kstring_t, BGZF
+
+# These functions are put here and not in chtslib.pxd in order
+# to avoid warnings for unused functions.
+cdef extern from "pysam_stream.h" nogil:
+
+ ctypedef struct kstream_t:
+ pass
+
+ ctypedef struct kseq_t:
+ kstring_t name
+ kstring_t comment
+ kstring_t seq
+ kstring_t qual
+
+ kseq_t *kseq_init(BGZF *)
+ int kseq_read(kseq_t *)
+ void kseq_destroy(kseq_t *)
+ kstream_t *ks_init(BGZF *)
+ void ks_destroy(kstream_t *)
+
+ # Retrieve characters from stream until delimiter
+ # is reached placing results in str.
+ int ks_getuntil(kstream_t *,
+ int delimiter,
+ kstring_t * str,
+ int * dret)
+
+cdef class FastaFile:
+ cdef bint is_remote
+ cdef object _filename, _references, _lengths, reference2length
+ cdef faidx_t* fastafile
+ cdef char* _fetch(self, char* reference,
+ int start, int end, int* length)
+
+
+cdef class FastqProxy:
+ cdef kseq_t * _delegate
+ cdef cython.str tostring(self)
+ cpdef array.array get_quality_array(self, int offset=*)
+
+
+cdef class PersistentFastqProxy:
+ """
+ Python container for pysam.libcfaidx.FastqProxy with persistence.
+ """
+ cdef public str comment, quality, sequence, name
+ cdef cython.str tostring(self)
+ cpdef array.array get_quality_array(self, int offset=*)
+
+
+cdef class FastxFile:
+ cdef object _filename
+ cdef BGZF * fastqfile
+ cdef kseq_t * entry
+ cdef bint persist
+ cdef bint is_remote
+
+ cdef kseq_t * getCurrent(self)
+ cdef int cnext(self)
+
+
+# Compatibility Layer for pysam 0.8.1
+cdef class FastqFile(FastxFile):
+ pass
+
+
+# Compatibility Layer for pysam < 0.8
+cdef class Fastafile(FastaFile):
+ pass
+
--- /dev/null
+# cython: embedsignature=True
+# cython: profile=True
+###############################################################################
+###############################################################################
+# Cython wrapper for SAM/BAM/CRAM files based on htslib
+###############################################################################
+# The principal classes defined in this module are:
+#
+# class FastaFile random read read/write access to faidx indexd files
+# class FastxFile streamed read/write access to fasta/fastq files
+#
+# Additionally this module defines several additional classes that are part
+# of the internal API. These are:
+#
+# class FastqProxy
+# class PersistentFastqProxy
+#
+# For backwards compatibility, the following classes are also defined:
+#
+# class Fastafile equivalent to FastaFile
+# class FastqFile equivalent to FastxFile
+#
+###############################################################################
+#
+# The MIT License
+#
+# Copyright (c) 2015 Andreas Heger
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+#
+###############################################################################
+import sys
+import os
+import re
+from cpython cimport array
+
+from cpython cimport PyErr_SetString, \
+ PyBytes_Check, \
+ PyUnicode_Check, \
+ PyBytes_FromStringAndSize
+
+from cpython.version cimport PY_MAJOR_VERSION
+
+from pysam.libchtslib cimport \
+ faidx_nseq, fai_load, fai_destroy, fai_fetch, \
+ faidx_seq_len, \
+ faidx_fetch_seq, hisremote, \
+ bgzf_open, bgzf_close
+
+from pysam.libcutils cimport force_bytes, force_str, charptr_to_str
+from pysam.libcutils cimport encode_filename, from_string_and_size
+from pysam.libcutils cimport qualitystring_to_array, parse_region
+
+cdef class FastqProxy
+cdef makeFastqProxy(kseq_t * src):
+ '''enter src into AlignedRead.'''
+ cdef FastqProxy dest = FastqProxy.__new__(FastqProxy)
+ dest._delegate = src
+ return dest
+
+## TODO:
+## add automatic indexing.
+## add function to get sequence names.
+cdef class FastaFile:
+ """Random access to fasta formatted files that
+ have been indexed by :term:`faidx`.
+
+ The file is automatically opened. The index file of file
+ ``<filename>`` is expected to be called ``<filename>.fai``.
+
+ Parameters
+ ----------
+
+ filename : string
+ Filename of fasta file to be opened.
+
+ filepath_index : string
+ Optional, filename of the index. By default this is
+ the filename + ".fai".
+
+ Raises
+ ------
+
+ ValueError
+ if index file is missing
+
+ IOError
+ if file could not be opened
+
+ """
+
+ def __cinit__(self, *args, **kwargs):
+ self.fastafile = NULL
+ self._filename = None
+ self._references = None
+ self._lengths = None
+ self.reference2length = None
+ self._open(*args, **kwargs)
+
+ def is_open(self):
+        '''return true if the fasta file has been opened.'''
+ return self.fastafile != NULL
+
+ def __len__(self):
+ if self.fastafile == NULL:
+ raise ValueError("calling len() on closed file")
+
+ return faidx_nseq(self.fastafile)
+
+ def _open(self, filename, filepath_index=None):
+ '''open an indexed fasta file.
+
+ This method expects an indexed fasta file.
+ '''
+
+ # close a previously opened file
+ if self.fastafile != NULL:
+ self.close()
+
+ self._filename = encode_filename(filename)
+ cdef char *cfilename = self._filename
+ self.is_remote = hisremote(cfilename)
+
+ if filepath_index is not None:
+ raise NotImplementedError(
+ "setting an explicit path for the index "
+ "is not implemented")
+
+ # open file for reading
+ if (self._filename != b"-"
+ and not self.is_remote
+ and not os.path.exists(filename)):
+ raise IOError("file `%s` not found" % filename)
+
+ with nogil:
+ self.fastafile = fai_load(cfilename)
+
+ if self.fastafile == NULL:
+ raise IOError("could not open file `%s`" % filename)
+
+ if self.is_remote:
+ filepath_index = os.path.basename(
+ re.sub("[^:]+:[/]*", "", filename)) + ".fai"
+ elif filepath_index is None:
+ filepath_index = filename + ".fai"
+
+ if not os.path.exists(filepath_index):
+ raise ValueError("could not locate index file {}".format(
+ filepath_index))
+
+ with open(filepath_index) as inf:
+ data = [x.split("\t") for x in inf]
+ self._references = tuple(x[0] for x in data)
+ self._lengths = tuple(int(x[1]) for x in data)
+ self.reference2length = dict(zip(self._references, self._lengths))
+
+ def close(self):
+ """close the file."""
+ if self.fastafile != NULL:
+ fai_destroy(self.fastafile)
+ self.fastafile = NULL
+
+ def __dealloc__(self):
+ if self.fastafile != NULL:
+ fai_destroy(self.fastafile)
+ self.fastafile = NULL
+
+ # context manager interface
+ def __enter__(self):
+ return self
+
+ def __exit__(self, exc_type, exc_value, traceback):
+ self.close()
+ return False
+
+ property closed:
+ """"bool indicating the current state of the file object.
+ This is a read-only attribute; the close() method changes the value.
+ """
+ def __get__(self):
+ return not self.is_open()
+
+ property filename:
+ """filename associated with this object. This is a read-only attribute."""
+ def __get__(self):
+ return self._filename
+
+ property references:
+ '''tuple with the names of :term:`reference` sequences.'''
+ def __get__(self):
+ return self._references
+
+ property nreferences:
+ """"int with the number of :term:`reference` sequences in the file.
+ This is a read-only attribute."""
+ def __get__(self):
+ return len(self._references) if self.references else None
+
+ property lengths:
+ """tuple with the lengths of :term:`reference` sequences."""
+ def __get__(self):
+ return self._lengths
+
+ def fetch(self,
+ reference=None,
+ start=None,
+ end=None,
+ region=None):
+ """fetch sequences in a :term:`region`.
+
+ A region can
+ either be specified by :term:`reference`, `start` and
+ `end`. `start` and `end` denote 0-based, half-open
+ intervals.
+
+ Alternatively, a samtools :term:`region` string can be
+ supplied.
+
+ If any of the coordinates are missing they will be replaced by the
+ minimum (`start`) or maximum (`end`) coordinate.
+
+ Note that region strings are 1-based, while `start` and `end` denote
+ an interval in python coordinates.
+ The region is specified by :term:`reference`, `start` and `end`.
+
+ Returns
+ -------
+
+ string : a string with the sequence specified by the region.
+
+ Raises
+ ------
+
+ IndexError
+ if the coordinates are out of range
+
+ ValueError
+ if the region is invalid
+
+ """
+
+ if not self.is_open():
+ raise ValueError("I/O operation on closed file" )
+
+ cdef int length
+ cdef char *seq
+ cdef char *ref
+ cdef int rstart, rend
+
+ reference, rstart, rend = parse_region(reference, start, end, region)
+
+ if reference is None:
+ raise ValueError("no sequence/region supplied.")
+
+ if rstart == rend:
+ return ""
+
+ ref = reference
+ with nogil:
+ length = faidx_seq_len(self.fastafile, ref)
+ if length == -1:
+ raise KeyError("sequence '%s' not present" % reference)
+ if rstart >= length:
+ return ""
+
+ # fai_fetch adds a '\0' at the end
+ with nogil:
+ seq = faidx_fetch_seq(self.fastafile,
+ ref,
+ rstart,
+ rend-1,
+ &length)
+
+ if seq == NULL:
+ raise ValueError(
+ "failure when retrieving sequence on '%s'" % reference)
+
+ try:
+ return charptr_to_str(seq)
+ finally:
+ free(seq)
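+
+    # Usage sketch (hedged; the path and contig name are assumptions):
+    #
+    #     fa = FastaFile('ex1.fa')
+    #     fa.fetch('chr1', 0, 100)         # 0-based, half-open interval
+    #     fa.fetch(region='chr1:1-100')    # same bases via a 1-based region string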
+
+ cdef char * _fetch(self, char * reference, int start, int end, int * length):
+ '''fetch sequence for reference, start and end'''
+
+ with nogil:
+ return faidx_fetch_seq(self.fastafile,
+ reference,
+ start,
+ end-1,
+ length)
+
+ def get_reference_length(self, reference):
+ '''return the length of reference.'''
+ return self.reference2length[reference]
+
+ def __getitem__(self, reference):
+ return self.fetch(reference)
+
+ def __contains__(self, reference):
+ '''return true if reference in fasta file.'''
+ return reference in self.reference2length
+
+
+cdef class FastqProxy:
+ """A single entry in a fastq file."""
+ def __init__(self): pass
+
+ property name:
+ """The name of each entry in the fastq file."""
+ def __get__(self):
+ return charptr_to_str(self._delegate.name.s)
+
+ property sequence:
+ """The sequence of each entry in the fastq file."""
+ def __get__(self):
+ return charptr_to_str(self._delegate.seq.s)
+
+ property comment:
+ def __get__(self):
+ if self._delegate.comment.l:
+ return charptr_to_str(self._delegate.comment.s)
+ else:
+ return None
+
+ property quality:
+ """The quality score of each entry in the fastq file, represented as a string."""
+ def __get__(self):
+ if self._delegate.qual.l:
+ return charptr_to_str(self._delegate.qual.s)
+ else:
+ return None
+
+ cdef cython.str tostring(self):
+ if self.comment is None:
+ comment = ""
+ else:
+ comment = " %s" % self.comment
+
+ if self.quality is None:
+ return ">%s%s\n%s" % (self.name, comment, self.sequence)
+ else:
+ return "@%s%s\n%s\n+\n%s" % (self.name, comment,
+ self.sequence, self.quality)
+
+ def __str__(self):
+ return self.tostring()
+
+ cpdef array.array get_quality_array(self, int offset=33):
+ '''return quality values as integer array after subtracting offset.'''
+ if self.quality is None:
+ return None
+ return qualitystring_to_array(force_bytes(self.quality),
+ offset=offset)
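+
+    # Hedged example: with the default Sanger offset of 33, a quality string
+    # of '!!II' decodes to the integer values [0, 0, 40, 40].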
+
+cdef class PersistentFastqProxy:
+ """
+ Python container for pysam.libcfaidx.FastqProxy with persistence.
+ Needed to compare multiple fastq records from the same file.
+ """
+ def __init__(self, FastqProxy FastqRead):
+ self.comment = FastqRead.comment
+ self.quality = FastqRead.quality
+ self.sequence = FastqRead.sequence
+ self.name = FastqRead.name
+
+ cdef cython.str tostring(self):
+ if self.comment is None:
+ comment = ""
+ else:
+ comment = " %s" % self.comment
+
+ if self.quality is None:
+ return ">%s%s\n%s" % (self.name, comment, self.sequence)
+ else:
+ return "@%s%s\n%s\n+\n%s" % (self.name, comment,
+ self.sequence, self.quality)
+
+ def __str__(self):
+ return self.tostring()
+
+ cpdef array.array get_quality_array(self, int offset=33):
+ '''return quality values as array after subtracting offset.'''
+ if self.quality is None:
+ return None
+ return qualitystring_to_array(force_bytes(self.quality),
+ offset=offset)
+
+
+cdef class FastxFile:
+ """Stream access to :term:`fasta` or :term:`fastq` formatted files.
+
+ The file is automatically opened.
+
+ Entries in the file can be both fastq or fasta formatted or even a
+ mixture of the two.
+
+ This file object permits iterating over all entries in the
+ file. Random access is not implemented. The iteration returns
+ objects of type :class:`FastqProxy`
+
+ Parameters
+ ----------
+
+ filename : string
+ Filename of fasta/fastq file to be opened.
+
+ persist : bool
+
+ If True (default) make a copy of the entry in the file during
+ iteration. If set to False, no copy will be made. This will
+ permit faster iteration, but an entry will not persist when
+ the iteration continues.
+
+ Notes
+ -----
+ Prior to version 0.8.2, this was called FastqFile.
+
+ Raises
+ ------
+
+ IOError
+ if file could not be opened
+
+
+ Examples
+ --------
+ >>> with pysam.FastxFile(filename) as fh:
+ ... for entry in fh:
+ ... print(entry.name)
+ ... print(entry.sequence)
+ ... print(entry.comment)
+ ... print(entry.quality)
+
+ """
+ def __cinit__(self, *args, **kwargs):
+ # self.fastqfile = <gzFile*>NULL
+ self._filename = None
+ self.entry = NULL
+ self._open(*args, **kwargs)
+
+ def is_open(self):
+        '''return true if the fasta/fastq file has been opened.'''
+ return self.entry != NULL
+
+ def _open(self, filename, persist=True):
+ '''open a fastq/fasta file in *filename*
+
+        Parameters
+ ----------
+
+ persist : bool
+
+ if True return a copy of the underlying data (default
+ True). The copy will persist even if the iteration
+ on the file continues.
+
+ '''
+ if self.fastqfile != NULL:
+ self.close()
+
+ self._filename = encode_filename(filename)
+ cdef char *cfilename = self._filename
+ self.is_remote = hisremote(cfilename)
+
+ # open file for reading
+ if (self._filename != b"-"
+ and not self.is_remote
+ and not os.path.exists(filename)):
+ raise IOError("file `%s` not found" % filename)
+
+ self.persist = persist
+
+ with nogil:
+ self.fastqfile = bgzf_open(cfilename, "r")
+ self.entry = kseq_init(self.fastqfile)
+ self._filename = filename
+
+ def close(self):
+ '''close the file.'''
+ if self.fastqfile != NULL:
+ bgzf_close(self.fastqfile)
+ self.fastqfile = NULL
+ if self.entry != NULL:
+ kseq_destroy(self.entry)
+ self.entry = NULL
+
+ def __dealloc__(self):
+ if self.fastqfile != NULL:
+ bgzf_close(self.fastqfile)
+ if self.entry:
+ kseq_destroy(self.entry)
+
+ # context manager interface
+ def __enter__(self):
+ return self
+
+ def __exit__(self, exc_type, exc_value, traceback):
+ self.close()
+ return False
+
+ property closed:
+ """"bool indicating the current state of the file object.
+ This is a read-only attribute; the close() method changes the value.
+ """
+ def __get__(self):
+ return not self.is_open()
+
+ property filename:
+ """string with the filename associated with this object."""
+ def __get__(self):
+ return self._filename
+
+ def __iter__(self):
+ if not self.is_open():
+ raise ValueError("I/O operation on closed file")
+ return self
+
+ cdef kseq_t * getCurrent(self):
+ return self.entry
+
+ cdef int cnext(self):
+ '''C version of iterator
+ '''
+ with nogil:
+ return kseq_read(self.entry)
+
+ def __next__(self):
+ """
+ python version of next().
+ """
+ cdef int l
+ with nogil:
+ l = kseq_read(self.entry)
+ if (l >= 0):
+ if self.persist:
+ return PersistentFastqProxy(makeFastqProxy(self.entry))
+ return makeFastqProxy(self.entry)
+ else:
+ raise StopIteration
+
+# Compatibility Layer for pysam 0.8.1
+cdef class FastqFile(FastxFile):
+ """FastqFile is deprecated: use FastxFile instead"""
+ pass
+
+# Compatibility Layer for pysam < 0.8
+cdef class Fastafile(FastaFile):
+ """Fastafile is deprecated: use FastaFile instead"""
+ pass
+
+__all__ = ["FastaFile",
+ "FastqFile",
+ "FastxFile",
+ "Fastafile",
+ "FastqProxy"]
--- /dev/null
+from libc.stdint cimport int8_t, int16_t, int32_t, int64_t
+from libc.stdint cimport uint8_t, uint16_t, uint32_t, uint64_t
+from libc.stdlib cimport malloc, calloc, realloc, free
+from libc.string cimport memcpy, memcmp, strncpy, strlen, strdup
+from libc.stdio cimport FILE, printf
+from posix.types cimport off_t
+
+cdef extern from "Python.h":
+ FILE* PyFile_AsFile(object)
+
+
+cdef extern from "htslib/kstring.h" nogil:
+ ctypedef struct kstring_t:
+ size_t l, m
+ char *s
+
+
+cdef extern from "htslib_util.h" nogil:
+ int hts_set_verbosity(int verbosity)
+ int hts_get_verbosity()
+
+ ctypedef uint32_t khint32_t
+ ctypedef uint32_t khint_t
+ ctypedef khint_t khiter_t
+
+ # Used to manage BCF Header info
+ ctypedef struct vdict_t:
+ khint_t n_buckets, size, n_occupied, upper_bound
+ khint32_t *flags
+ const char *keys
+ bcf_idinfo_t *vals
+
+ # Used to manage indexed contigs in Tabix
+ ctypedef struct s2i_t:
+ khint_t n_buckets, size, n_occupied, upper_bound
+ khint32_t *flags
+ const char *keys
+ int64_t *vals
+
+ # Generic khash methods
+ khint_t kh_size(void *d)
+ khint_t kh_begin(void *d)
+ khint_t kh_end(void *d)
+ int kh_exist(void *d, khiter_t i)
+
+ # Specialized khash methods for vdict
+ khint_t kh_get_vdict(vdict_t *d, const char *key)
+ const char *kh_key_vdict "kh_key" (vdict_t *d, khint_t i)
+ bcf_idinfo_t kh_val_vdict "kh_val" (vdict_t *d, khint_t i)
+
+
+cdef extern from "htslib/hfile.h" nogil:
+ ctypedef struct hFILE
+
+ # @abstract Open the named file or URL as a stream
+ # @return An hFILE pointer, or NULL (with errno set) if an error occurred.
+ hFILE *hopen(const char *filename, const char *mode)
+
+ # @abstract Associate a stream with an existing open file descriptor
+ # @return An hFILE pointer, or NULL (with errno set) if an error occurred.
+ # @notes For socket descriptors (on Windows), mode should contain 's'.
+ hFILE *hdopen(int fd, const char *mode)
+
+ # @abstract Report whether the file name or URL denotes remote storage
+ # @return 0 if local, 1 if remote.
+ # @notes "Remote" means involving e.g. explicit network access, with the
+ # implication that callers may wish to cache such files' contents locally.
+ int hisremote(const char *filename)
+
+ # @abstract Flush (for output streams) and close the stream
+ # @return 0 if successful, or EOF (with errno set) if an error occurred.
+ int hclose(hFILE *fp)
+
+ # @abstract Close the stream, without flushing or propagating errors
+ # @notes For use while cleaning up after an error only. Preserves errno.
+ void hclose_abruptly(hFILE *fp)
+
+ # @abstract Return the stream's error indicator
+ # @return Non-zero (in fact, an errno value) if an error has occurred.
+ # @notes This would be called herror() and return true/false to parallel
+ # ferror(3), but a networking-related herror(3) function already exists. */
+ int herrno(hFILE *fp)
+
+ # @abstract Clear the stream's error indicator
+ void hclearerr(hFILE *fp)
+
+ # @abstract Reposition the read/write stream offset
+ # @return The resulting offset within the stream (as per lseek(2)),
+ # or negative if an error occurred.
+ off_t hseek(hFILE *fp, off_t offset, int whence)
+
+ # @abstract Report the current stream offset
+ # @return The offset within the stream, starting from zero.
+ off_t htell(hFILE *fp)
+
+ # @abstract Read one character from the stream
+ # @return The character read, or EOF on end-of-file or error
+ int hgetc(hFILE *fp)
+
+ # @abstract Peek at characters to be read without removing them from buffers
+ # @param fp The file stream
+ # @param buffer The buffer to which the peeked bytes will be written
+ # @param nbytes The number of bytes to peek at; limited by the size of the
+ # internal buffer, which could be as small as 4K.
+ # @return The number of bytes peeked, which may be less than nbytes if EOF
+ # is encountered; or negative, if there was an I/O error.
+ # @notes The characters peeked at remain in the stream's internal buffer,
+ # and will be returned by later hread() etc calls.
+ ssize_t hpeek(hFILE *fp, void *buffer, size_t nbytes)
+
+ # @abstract Read a block of characters from the file
+ # @return The number of bytes read, or negative if an error occurred.
+ # @notes The full nbytes requested will be returned, except as limited
+ # by EOF or I/O errors.
+ ssize_t hread(hFILE *fp, void *buffer, size_t nbytes)
+
+ # @abstract Write a character to the stream
+ # @return The character written, or EOF if an error occurred.
+ int hputc(int c, hFILE *fp)
+
+ # @abstract Write a string to the stream
+ # @return 0 if successful, or EOF if an error occurred.
+ int hputs(const char *text, hFILE *fp)
+
+ # @abstract Write a block of characters to the file
+ # @return Either nbytes, or negative if an error occurred.
+ # @notes In the absence of I/O errors, the full nbytes will be written.
+ ssize_t hwrite(hFILE *fp, const void *buffer, size_t nbytes)
+
+ # @abstract For writing streams, flush buffered output to the underlying stream
+ # @return 0 if successful, or EOF if an error occurred.
+ int hflush(hFILE *fp)
+
+
+cdef extern from "htslib/bgzf.h" nogil:
+ ctypedef struct bgzf_mtaux_t
+ ctypedef struct bgzidx_t
+ ctypedef struct z_stream
+
+ ctypedef struct BGZF:
+ unsigned errcode
+ unsigned is_write
+ int is_be
+ int compress_level
+ int is_compressed
+ int is_gzip
+ int cache_size
+ int64_t block_address
+ int64_t uncompressed_address
+ void *uncompressed_block
+ void *compressed_block
+ void *cache
+ hFILE *fp
+ bgzf_mtaux_t *mt
+ bgzidx_t *idx
+ int idx_build_otf
+ z_stream *gz_stream
+
+    #*****************
+    # Basic routines *
+    #*****************
+
+ # Open an existing file descriptor for reading or writing.
+ #
+ # @param fd file descriptor
+ # @param mode mode matching /[rwag][u0-9]+/: 'r' for reading, 'w' for
+ # writing, 'a' for appending, 'g' for gzip rather than BGZF
+ # compression (with 'w' only), and digit specifies the zlib
+ # compression level.
+ # Note that there is a distinction between 'u' and '0': the
+ # first yields plain uncompressed output whereas the latter
+ # outputs uncompressed data wrapped in the zlib format.
+ # @return BGZF file handler; 0 on error
+
+ BGZF* bgzf_dopen(int fd, const char *mode)
+ BGZF* bgzf_fdopen(int fd, const char *mode) # for backward compatibility
+
+ # Open the specified file for reading or writing.
+ BGZF* bgzf_open(const char* path, const char *mode)
+
+ # Open an existing hFILE stream for reading or writing.
+ BGZF* bgzf_hopen(hFILE *fp, const char *mode)
+
+ # Close the BGZF and free all associated resources.
+ #
+ # @param fp BGZF file handler
+ # @return 0 on success and -1 on error
+ int bgzf_close(BGZF *fp)
+
+    # Read up to _length_ bytes from the file, storing them into _data_.
+ #
+ # @param fp BGZF file handler
+ # @param data data array to read into
+ # @param length size of data to read
+ # @return number of bytes actually read; 0 on end-of-file and -1 on error
+ ssize_t bgzf_read(BGZF *fp, void *data, size_t length)
+
+ # Write _length_ bytes from _data_ to the file. If no I/O errors occur,
+ # the complete _length_ bytes will be written (or queued for writing).
+ #
+ # @param fp BGZF file handler
+ # @param data data array to write
+ # @param length size of data to write
+ # @return number of bytes written (i.e., _length_); negative on error
+ ssize_t bgzf_write(BGZF *fp, const void *data, size_t length)
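+
+    # Usage sketch (illustrative only; "out.gz" is a placeholder filename):
+    #   BGZF *fp = bgzf_open("out.gz", "w");
+    #   if (bgzf_write(fp, "hello\n", 6) < 0) { /* report write error */ }
+    #   if (bgzf_close(fp) < 0) { /* report close error */ }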
+
+ # Read up to _length_ bytes directly from the underlying stream without
+ # decompressing. Bypasses BGZF blocking, so must be used with care in
+ # specialised circumstances only.
+ #
+ # @param fp BGZF file handler
+ # @param data data array to read into
+ # @param length number of raw bytes to read
+ # @return number of bytes actually read; 0 on end-of-file and -1 on error
+ ssize_t bgzf_raw_read(BGZF *fp, void *data, size_t length)
+
+ # Write _length_ bytes directly to the underlying stream without
+ # compressing. Bypasses BGZF blocking, so must be used with care
+ # in specialised circumstances only.
+ #
+ # @param fp BGZF file handler
+ # @param data data array to write
+ # @param length number of raw bytes to write
+ # @return number of bytes actually written; -1 on error
+ ssize_t bgzf_raw_write(BGZF *fp, const void *data, size_t length)
+
+ # Write the data in the buffer to the file.
+ int bgzf_flush(BGZF *fp)
+
+ int SEEK_SET
+
+ # Return a virtual file pointer to the current location in the file.
+    # No interpretation of the value should be made, other than that a
+    # subsequent call to bgzf_seek() can be used to reposition the file to
+    # the same point.
+ # Return value is non-negative on success.
+ int64_t bgzf_tell(BGZF *fp)
+
+ # Set the file to read from the location specified by _pos_.
+ #
+ # @param fp BGZF file handler
+ # @param pos virtual file offset returned by bgzf_tell()
+ # @param whence must be SEEK_SET
+ # @return 0 on success and -1 on error
+ int64_t bgzf_seek(BGZF *fp, int64_t pos, int whence)
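+
+    # Usage sketch (illustrative only; fp is an open BGZF handle): remember
+    # a record's position with bgzf_tell() and jump back to it later; the
+    # virtual offset is opaque.
+    #   int64_t vpos = bgzf_tell(fp);
+    #   /* ... read further records ... */
+    #   if (bgzf_seek(fp, vpos, SEEK_SET) < 0) { /* report seek error */ }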
+
+ # Check if the BGZF end-of-file (EOF) marker is present
+ #
+ # @param fp BGZF file handler opened for reading
+ # @return 1 if the EOF marker is present and correct
+ # 2 if it can't be checked, e.g., because fp isn't seekable
+ # 0 if the EOF marker is absent
+ # -1 (with errno set) on error
+ int bgzf_check_EOF(BGZF *fp)
+
+ # Check if a file is in the BGZF format
+ #
+ # @param fn file name
+ # @return 1 if _fn_ is BGZF; 0 if not or on I/O error
+ int bgzf_is_bgzf(const char *fn)
+
+ #*********************
+ # Advanced routines *
+ #*********************
+
+ # Set the cache size. Only effective when compiled with -DBGZF_CACHE.
+ #
+ # @param fp BGZF file handler
+ # @param size size of cache in bytes; 0 to disable caching (default)
+ void bgzf_set_cache_size(BGZF *fp, int size)
+
+ # Flush the file if the remaining buffer size is smaller than _size_
+ # @return 0 if flushing succeeded or was not needed; negative on error
+ int bgzf_flush_try(BGZF *fp, ssize_t size)
+
+ # Read one byte from a BGZF file. It is faster than bgzf_read()
+ # @param fp BGZF file handler
+ # @return byte read; -1 on end-of-file or error
+ int bgzf_getc(BGZF *fp)
+
+ # Read one line from a BGZF file. It is faster than bgzf_getc()
+ #
+ # @param fp BGZF file handler
+    # @param delim delimiter
+ # @param str string to write to; must be initialized
+ # @return length of the string; 0 on end-of-file; negative on error
+ int bgzf_getline(BGZF *fp, int delim, kstring_t *str)
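+
+    # Usage sketch (illustrative only; per the return convention above, loop
+    # while the returned length is positive):
+    #   kstring_t str = {0, 0, NULL};
+    #   while (bgzf_getline(fp, '\n', &str) > 0) { /* process str.s */ }
+    #   free(str.s);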
+
+ # Read the next BGZF block.
+ int bgzf_read_block(BGZF *fp)
+
+ # Enable multi-threading (only effective on writing and when the
+ # library was compiled with -DBGZF_MT)
+ #
+ # @param fp BGZF file handler; must be opened for writing
+ # @param n_threads #threads used for writing
+    # @param n_sub_blks #blocks processed by each thread; a value of 64-256 is recommended
+ int bgzf_mt(BGZF *fp, int n_threads, int n_sub_blks)
+
+
+ # Compress a single BGZF block.
+ #
+ # @param dst output buffer (must have size >= BGZF_MAX_BLOCK_SIZE)
+ # @param dlen size of output buffer; updated on return to the number
+ # of bytes actually written to dst
+ # @param src buffer to be compressed
+ # @param slen size of data to compress (must be <= BGZF_BLOCK_SIZE)
+ # @param level compression level
+ # @return 0 on success and negative on error
+ #
+ int bgzf_compress(void *dst, size_t *dlen, const void *src, size_t slen, int level)
+
+    #*******************
+    # bgzidx routines *
+    #*******************
+
+    # Position BGZF at the uncompressed offset
+ #
+ # @param fp BGZF file handler; must be opened for reading
+ # @param uoffset file offset in the uncompressed data
+ # @param where SEEK_SET supported atm
+ #
+ # Returns 0 on success and -1 on error.
+ int bgzf_useek(BGZF *fp, long uoffset, int where)
+
+ # Position in uncompressed BGZF
+ #
+ # @param fp BGZF file handler; must be opened for reading
+ #
+ # Returns the current offset on success and -1 on error.
+ long bgzf_utell(BGZF *fp)
+
+ # Tell BGZF to build index while compressing.
+ #
+ # @param fp BGZF file handler; can be opened for reading or writing.
+ #
+ # Returns 0 on success and -1 on error.
+ int bgzf_index_build_init(BGZF *fp)
+
+ # Load BGZF index
+ #
+ # @param fp BGZF file handler
+ # @param bname base name
+ # @param suffix suffix to add to bname (can be NULL)
+ #
+ # Returns 0 on success and -1 on error.
+ int bgzf_index_load(BGZF *fp, const char *bname, const char *suffix)
+
+ # Save BGZF index
+ #
+ # @param fp BGZF file handler
+ # @param bname base name
+ # @param suffix suffix to add to bname (can be NULL)
+ #
+ # Returns 0 on success and -1 on error.
+ int bgzf_index_dump(BGZF *fp, const char *bname, const char *suffix)
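+
+    # Usage sketch (illustrative only; "out.gz" is a placeholder): build a
+    # .gzi index while compressing and dump it next to the output file.
+    #   BGZF *fp = bgzf_open("out.gz", "w");
+    #   bgzf_index_build_init(fp);
+    #   /* ... bgzf_write() the data ... */
+    #   bgzf_index_dump(fp, "out.gz", ".gzi");
+    #   bgzf_close(fp);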
+
+
+cdef extern from "htslib/hts.h" nogil:
+ uint32_t kroundup32(uint32_t x)
+
+ ctypedef struct cram_fd
+
+ union FilePointerUnion:
+ BGZF *bgzf
+ cram_fd *cram
+ hFILE *hfile
+ void *voidp
+
+ enum htsFormatCategory:
+ unknown_category
+ sequence_data # Sequence data -- SAM, BAM, CRAM, etc
+ variant_data # Variant calling data -- VCF, BCF, etc
+ index_file # Index file associated with some data file
+ region_list # Coordinate intervals or regions -- BED, etc
+ category_maximum
+
+ enum htsExactFormat:
+ unknown_format
+ binary_format
+ text_format
+ sam, bam, bai, cram, crai, vcf, bcf, csi, gzi, tbi, bed
+ format_maximum
+
+ enum htsCompression:
+ no_compression, gzip, bgzf, custom
+ compression_maximum
+
+ enum hts_fmt_option:
+ CRAM_OPT_DECODE_MD,
+ CRAM_OPT_PREFIX,
+ CRAM_OPT_VERBOSITY,
+ CRAM_OPT_SEQS_PER_SLICE,
+ CRAM_OPT_SLICES_PER_CONTAINER,
+ CRAM_OPT_RANGE,
+ CRAM_OPT_VERSION,
+ CRAM_OPT_EMBED_REF,
+ CRAM_OPT_IGNORE_MD5,
+ CRAM_OPT_REFERENCE,
+ CRAM_OPT_MULTI_SEQ_PER_SLICE,
+ CRAM_OPT_NO_REF,
+ CRAM_OPT_USE_BZIP2,
+ CRAM_OPT_SHARED_REF,
+ CRAM_OPT_NTHREADS,
+ CRAM_OPT_THREAD_POOL,
+ CRAM_OPT_USE_LZMA,
+ CRAM_OPT_USE_RANS,
+ CRAM_OPT_REQUIRED_FIELDS,
+ HTS_OPT_COMPRESSION_LEVEL,
+ HTS_OPT_NTHREADS,
+
+ ctypedef struct htsVersion:
+ short major, minor
+
+ ctypedef struct htsFormat:
+ htsFormatCategory category
+ htsExactFormat format
+ htsVersion version
+ htsCompression compression
+ short compression_level
+ void *specific
+
+ ctypedef struct htsFile:
+ uint8_t is_bin
+ uint8_t is_write
+ uint8_t is_be
+ uint8_t is_cram
+ int64_t lineno
+ kstring_t line
+ char *fn
+ char *fn_aux
+ FilePointerUnion fp
+ htsFormat format
+
+ int hts_verbose
+
+ # @abstract Table for converting a nucleotide character to 4-bit encoding.
+ # The input character may be either an IUPAC ambiguity code, '=' for 0, or
+ # '0'/'1'/'2'/'3' for a result of 1/2/4/8. The result is encoded as 1/2/4/8
+ # for A/C/G/T or combinations of these bits for ambiguous bases.
+ const unsigned char *seq_nt16_table
+
+ # @abstract Table for converting a 4-bit encoded nucleotide to an IUPAC
+ # ambiguity code letter (or '=' when given 0).
+ const char *seq_nt16_str
+
+ # @abstract Table for converting a 4-bit encoded nucleotide to about 2 bits.
+ # Returns 0/1/2/3 for 1/2/4/8 (i.e., A/C/G/T), or 4 otherwise (0 or ambiguous).
+ const int *seq_nt16_int
+
+ # @abstract Get the htslib version number
+ # @return For released versions, a string like "N.N[.N]"; or git describe
+ # output if using a library built within a Git repository.
+ const char *hts_version()
+
+ # @abstract Determine format by peeking at the start of a file
+ # @param fp File opened for reading, positioned at the beginning
+ # @param fmt Format structure that will be filled out on return
+ # @return 0 for success, or negative if an error occurred.
+ int hts_detect_format(hFILE *fp, htsFormat *fmt)
+
+ # @abstract Get a human-readable description of the file format
+ # @return Description string, to be freed by the caller after use.
+ char *hts_format_description(const htsFormat *format)
+
+ # @abstract Open a SAM/BAM/CRAM/VCF/BCF/etc file
+ # @param fn The file name or "-" for stdin/stdout
+ # @param mode Mode matching / [rwa][bceguxz0-9]* /
+ # @discussion
+ # With 'r' opens for reading; any further format mode letters are ignored
+ # as the format is detected by checking the first few bytes or BGZF blocks
+ # of the file. With 'w' or 'a' opens for writing or appending, with format
+ # specifier letters:
+ # b binary format (BAM, BCF, etc) rather than text (SAM, VCF, etc)
+ # c CRAM format
+ # g gzip compressed
+ # u uncompressed
+ # z bgzf compressed
+ # [0-9] zlib compression level
+ # and with non-format option letters (for any of 'r'/'w'/'a'):
+ # e close the file on exec(2) (opens with O_CLOEXEC, where supported)
+ # x create the file exclusively (opens with O_EXCL, where supported)
+ # Note that there is a distinction between 'u' and '0': the first yields
+ # plain uncompressed output whereas the latter outputs uncompressed data
+ # wrapped in the zlib format.
+ # @example
+ # [rw]b .. compressed BCF, BAM, FAI
+ # [rw]bu .. uncompressed BCF
+ # [rw]z .. compressed VCF
+ # [rw] .. uncompressed VCF
+ htsFile *hts_open(const char *fn, const char *mode)
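+
+    # Usage sketch (illustrative only; filenames are placeholders):
+    #   htsFile *in  = hts_open("in.bam", "r");      /* format auto-detected */
+    #   htsFile *out = hts_open("out.vcf.gz", "wz"); /* bgzf-compressed VCF  */
+    #   hts_close(out); hts_close(in);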
+
+ # @abstract Open a SAM/BAM/CRAM/VCF/BCF/etc file
+ # @param fn The file name or "-" for stdin/stdout
+ # @param mode Open mode, as per hts_open()
+ # @param fmt Optional format specific parameters
+ # @discussion
+ # See hts_open() for description of fn and mode.
+    # TODO Update documentation for s/opts/fmt/
+ # Opts contains a format string (sam, bam, cram, vcf, bcf) which will,
+ # if defined, override mode. Opts also contains a linked list of hts_opt
+ # structures to apply to the open file handle. These can contain things
+ # like pointers to the reference or information on compression levels,
+ # block sizes, etc.
+ htsFile *hts_open_format(const char *fn, const char *mode, const htsFormat *fmt)
+
+ # @abstract Open an existing stream as a SAM/BAM/CRAM/VCF/BCF/etc file
+ # @param fp The already-open file handle
+ # @param fn The file name or "-" for stdin/stdout
+ # @param mode Open mode, as per hts_open()
+ htsFile *hts_hopen(hFILE *fp, const char *fn, const char *mode)
+
+ # @abstract Close a file handle, flushing buffered data for output streams
+ # @param fp The file handle to be closed
+ # @return 0 for success, or negative if an error occurred.
+ int hts_close(htsFile *fp)
+
+ # @abstract Returns the file's format information
+ # @param fp The file handle
+ # @return Read-only pointer to the file's htsFormat.
+ const htsFormat *hts_get_format(htsFile *fp)
+
+    # @abstract Returns a string containing the file format extension.
+    # @param format Format structure containing the file type.
+    # @return A string ("sam", "bam", etc) or "?" for unknown formats.
+ const char *hts_format_file_extension(const htsFormat *format)
+
+ # @abstract Sets a specified CRAM option on the open file handle.
+    # @param fp The open file handle.
+ # @param opt The CRAM_OPT_* option.
+ # @param ... Optional arguments, dependent on the option used.
+ # @return 0 for success, or negative if an error occurred.
+ int hts_set_opt(htsFile *fp, hts_fmt_option opt, ...)
+
+ int hts_getline(htsFile *fp, int delimiter, kstring_t *str)
+ char **hts_readlines(const char *fn, int *_n)
+
+ # @abstract Parse comma-separated list or read list from a file
+    # @param fn       File name or comma-separated list
+    # @param is_file  Whether fn names a file (1) or is itself the list (0)
+    # @param _n       Size of the output array (number of items read)
+    # @return NULL on failure, or a pointer to a newly allocated array of
+    #         strings
+ char **hts_readlist(const char *fn, int is_file, int *_n)
+
+ # @abstract Create extra threads to aid compress/decompression for this file
+ # @param fp The file handle
+ # @param n The number of worker threads to create
+ # @return 0 for success, or negative if an error occurred.
+ # @notes THIS THREADING API IS LIKELY TO CHANGE IN FUTURE.
+ int hts_set_threads(htsFile *fp, int n)
+
+ # @abstract Set .fai filename for a file opened for reading
+ # @return 0 for success, negative on failure
+ # @discussion
+ # Called before *_hdr_read(), this provides the name of a .fai file
+ # used to provide a reference list if the htsFile contains no @SQ headers.
+ int hts_set_fai_filename(htsFile *fp, const char *fn_aux)
+
+ int8_t HTS_IDX_NOCOOR
+ int8_t HTS_IDX_START
+ int8_t HTS_IDX_REST
+ int8_t HTS_IDX_NONE
+
+ int8_t HTS_FMT_CSI
+ int8_t HTS_FMT_BAI
+ int8_t HTS_FMT_TBI
+ int8_t HTS_FMT_CRAI
+
+ BGZF *hts_get_bgzfp(htsFile *fp)
+ int hts_useek(htsFile *fp, long uoffset, int where)
+ long hts_utell(htsFile *fp)
+
+ ctypedef struct hts_idx_t
+
+ ctypedef struct hts_pair64_t:
+ uint64_t u, v
+
+ ctypedef int hts_readrec_func(BGZF *fp, void *data, void *r, int *tid, int *beg, int *end)
+
+ ctypedef struct hts_bins_t:
+ int n, m
+ int *a
+
+ ctypedef struct hts_itr_t:
+ uint32_t read_rest
+ uint32_t finished
+        int tid, beg, end, n_off, i
+ int curr_tid, curr_beg, curr_end
+ uint64_t curr_off
+ hts_pair64_t *off
+ hts_readrec_func *readfunc
+ hts_bins_t bins
+
+ hts_idx_t *hts_idx_init(int n, int fmt, uint64_t offset0, int min_shift, int n_lvls)
+ void hts_idx_destroy(hts_idx_t *idx)
+ int hts_idx_push(hts_idx_t *idx, int tid, int beg, int end, uint64_t offset, int is_mapped)
+ void hts_idx_finish(hts_idx_t *idx, uint64_t final_offset)
+
+ #### Save an index to a file
+ # @param idx Index to be written
+ # @param fn Input BAM/BCF/etc filename, to which .bai/.csi/etc will be added
+ # @param fmt One of the HTS_FMT_* index formats
+ # @return 0 if successful, or negative if an error occurred.
+ int hts_idx_save(const hts_idx_t *idx, const char *fn, int fmt)
+
+ #### Save an index to a specific file
+ # @param idx Index to be written
+ # @param fn Input BAM/BCF/etc filename
+ # @param fnidx Output filename, or NULL to add .bai/.csi/etc to @a fn
+ # @param fmt One of the HTS_FMT_* index formats
+ # @return 0 if successful, or negative if an error occurred.
+ int hts_idx_save_as(const hts_idx_t *idx, const char *fn, const char *fnidx, int fmt)
+
+ #### Load an index file
+ # @param fn BAM/BCF/etc filename, to which .bai/.csi/etc will be added or
+ # the extension substituted, to search for an existing index file
+ # @param fmt One of the HTS_FMT_* index formats
+ # @return The index, or NULL if an error occurred.
+ hts_idx_t *hts_idx_load(const char *fn, int fmt)
+
+ #### Load a specific index file
+ # @param fn Input BAM/BCF/etc filename
+ # @param fnidx The input index filename
+ # @return The index, or NULL if an error occurred.
+ hts_idx_t *hts_idx_load2(const char *fn, const char *fnidx)
+
+ uint8_t *hts_idx_get_meta(hts_idx_t *idx, int *l_meta)
+ void hts_idx_set_meta(hts_idx_t *idx, int l_meta, uint8_t *meta, int is_copy)
+
+ int hts_idx_get_stat(const hts_idx_t* idx, int tid,
+ uint64_t* mapped, uint64_t* unmapped)
+
+ uint64_t hts_idx_get_n_no_coor(const hts_idx_t* idx)
+
+ int HTS_PARSE_THOUSANDS_SEP # Ignore ',' separators within numbers
+
+ # Parse a numeric string
+ # The number may be expressed in scientific notation, and optionally may
+ # contain commas in the integer part (before any decimal point or E notation).
+ # @param str String to be parsed
+ # @param strend If non-NULL, set on return to point to the first character
+ # in @a str after those forming the parsed number
+ # @param flags Or'ed-together combination of HTS_PARSE_* flags
+ # @return Converted value of the parsed number.
+ #
+ # When @a strend is NULL, a warning will be printed (if hts_verbose is 2
+ # or more) if there are any trailing characters after the number.
+ long long hts_parse_decimal(const char *str, char **strend, int flags)
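+
+    # Usage sketch (illustrative only): both calls below yield 1000000.
+    #   long long a = hts_parse_decimal("1e6", NULL, 0);
+    #   long long b = hts_parse_decimal("1,000,000", NULL, HTS_PARSE_THOUSANDS_SEP);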
+
+ # Parse a "CHR:START-END"-style region string
+ # @param str String to be parsed
+ # @param beg Set on return to the 0-based start of the region
+ # @param end Set on return to the 1-based end of the region
+ # @return Pointer to the colon or '\0' after the reference sequence name,
+ # or NULL if @a str could not be parsed.
+ const char *hts_parse_reg(const char *str, int *beg, int *end)
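+
+    # Usage sketch (illustrative only): "chr2:100-200" yields beg=99
+    # (0-based start) and end=200 (1-based end).
+    #   int beg, end;
+    #   if (hts_parse_reg("chr2:100-200", &beg, &end) == NULL)
+    #       { /* unparsable region string */ }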
+
+ hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, int beg, int end, hts_readrec_func *readrec)
+ void hts_itr_destroy(hts_itr_t *iter)
+
+ ctypedef int (*hts_name2id_f)(void*, const char*)
+ ctypedef const char *(*hts_id2name_f)(void*, int)
+ ctypedef hts_itr_t *hts_itr_query_func(
+ const hts_idx_t *idx,
+ int tid,
+ int beg,
+ int end,
+ hts_readrec_func *readrec)
+
+ hts_itr_t *hts_itr_querys(
+ const hts_idx_t *idx,
+ const char *reg,
+ hts_name2id_f getid,
+ void *hdr,
+ hts_itr_query_func *itr_query,
+ hts_readrec_func *readrec)
+
+ int hts_itr_next(BGZF *fp, hts_itr_t *iter, void *r, void *data)
+ const char **hts_idx_seqnames(const hts_idx_t *idx, int *n, hts_id2name_f getid, void *hdr) # free only the array, not the values
+
+ # hts_file_type() - Convenience function to determine file type
+ # @fname: the file name
+ #
+ # Returns one of the FT_* defines.
+ #
+ # DEPRECATED: This function has been replaced by hts_detect_format().
+ # It and these FT_* macros will be removed in a future HTSlib release.
+ int FT_UNKN
+ int FT_GZ
+ int FT_VCF
+ int FT_VCF_GZ
+ int FT_BCF
+ int FT_BCF_GZ
+ int FT_STDIN
+
+ int hts_file_type(const char *fname)
+
+ inline int hts_reg2bin(int64_t beg, int64_t end, int min_shift, int n_lvls)
+ inline int hts_bin_bot(int bin, int n_lvls)
+
+ # * Endianness *
+ inline int ed_is_big()
+ inline uint16_t ed_swap_2(uint16_t v)
+ inline void *ed_swap_2p(void *x)
+ inline uint32_t ed_swap_4(uint32_t v)
+ inline void *ed_swap_4p(void *x)
+ inline uint64_t ed_swap_8(uint64_t v)
+ inline void *ed_swap_8p(void *x)
+
+
+cdef extern from "htslib/sam.h" nogil:
+ #**********************
+ #*** SAM/BAM header ***
+ #**********************
+
+ # @abstract Structure for the alignment header.
+ # @field n_targets number of reference sequences
+ # @field l_text length of the plain text in the header
+ # @field target_len lengths of the reference sequences
+ # @field target_name names of the reference sequences
+ # @field text plain text
+ # @field sdict header dictionary
+
+ ctypedef struct bam_hdr_t:
+ int32_t n_targets, ignore_sam_err
+ uint32_t l_text
+ uint32_t *target_len
+ uint8_t *cigar_tab
+ char **target_name
+ char *text
+ void *sdict
+
+ #****************************
+ #*** CIGAR related macros ***
+ #****************************
+
+ int BAM_CMATCH
+ int BAM_CINS
+ int BAM_CDEL
+ int BAM_CREF_SKIP
+ int BAM_CSOFT_CLIP
+ int BAM_CHARD_CLIP
+ int BAM_CPAD
+ int BAM_CEQUAL
+ int BAM_CDIFF
+ int BAM_CBACK
+
+ char *BAM_CIGAR_STR
+ int BAM_CIGAR_SHIFT
+ uint32_t BAM_CIGAR_MASK
+ uint32_t BAM_CIGAR_TYPE
+
+ char bam_cigar_op(uint32_t c)
+ uint32_t bam_cigar_oplen(uint32_t c)
+ char bam_cigar_opchr(uint32_t)
+ uint32_t bam_cigar_gen(char, uint32_t)
+ int bam_cigar_type(char o)
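+
+    # Usage sketch (illustrative only; b is a bam1_t* obtained elsewhere and
+    # bam_get_cigar() is declared further below):
+    #   uint32_t *cigar = bam_get_cigar(b);
+    #   for (int i = 0; i < b->core.n_cigar; i++)
+    #       printf("%u%c", bam_cigar_oplen(cigar[i]), bam_cigar_opchr(cigar[i]));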
+
+ # @abstract the read is paired in sequencing, no matter whether it is mapped in a pair
+ int BAM_FPAIRED
+ # @abstract the read is mapped in a proper pair
+ int BAM_FPROPER_PAIR
+    # @abstract the read itself is unmapped; conflicts with BAM_FPROPER_PAIR
+ int BAM_FUNMAP
+ # @abstract the mate is unmapped
+ int BAM_FMUNMAP
+ # @abstract the read is mapped to the reverse strand
+ int BAM_FREVERSE
+ # @abstract the mate is mapped to the reverse strand
+ int BAM_FMREVERSE
+ # @abstract this is read1
+ int BAM_FREAD1
+ # @abstract this is read2
+ int BAM_FREAD2
+ # @abstract not primary alignment
+ int BAM_FSECONDARY
+ # @abstract QC failure
+ int BAM_FQCFAIL
+ # @abstract optical or PCR duplicate
+ int BAM_FDUP
+ # @abstract supplementary alignment
+ int BAM_FSUPPLEMENTARY
+
+ #*************************
+ #*** Alignment records ***
+ #*************************
+
+ # @abstract Structure for core alignment information.
+ # @field tid chromosome ID, defined by bam_hdr_t
+ # @field pos 0-based leftmost coordinate
+ # @field bin bin calculated by bam_reg2bin()
+ # @field qual mapping quality
+ # @field l_qname length of the query name
+ # @field flag bitwise flag
+ # @field n_cigar number of CIGAR operations
+ # @field l_qseq length of the query sequence (read)
+ # @field mtid chromosome ID of next read in template, defined by bam_hdr_t
+ # @field mpos 0-based leftmost coordinate of next read in template
+
+ ctypedef struct bam1_core_t:
+ int32_t tid
+ int32_t pos
+ uint16_t bin
+ uint8_t qual
+ uint8_t l_qname
+ uint16_t flag
+ uint16_t n_cigar
+ int32_t l_qseq
+ int32_t mtid
+ int32_t mpos
+ int32_t isize
+
+ # @abstract Structure for one alignment.
+ # @field core core information about the alignment
+ # @field l_data current length of bam1_t::data
+ # @field m_data maximum length of bam1_t::data
+ # @field data all variable-length data, concatenated; structure: qname-cigar-seq-qual-aux
+ #
+ # @discussion Notes:
+ #
+    # 1. qname is NUL-terminated and core.l_qname includes the trailing '\0'.
+ # 2. l_qseq is calculated from the total length of an alignment block
+ # on reading or from CIGAR.
+ # 3. cigar data is encoded 4 bytes per CIGAR operation.
+ # 4. seq is nybble-encoded according to seq_nt16_table.
+ ctypedef struct bam1_t:
+ bam1_core_t core
+ int l_data, m_data
+ uint8_t *data
+ uint64_t id
+
+ # @abstract Get whether the query is on the reverse strand
+ # @param b pointer to an alignment
+ # @return boolean true if query is on the reverse strand
+ int bam_is_rev(bam1_t *b)
+
+ # @abstract Get whether the query's mate is on the reverse strand
+ # @param b pointer to an alignment
+ # @return boolean true if query's mate on the reverse strand
+ int bam_is_mrev(bam1_t *b)
+
+ # @abstract Get the name of the query
+ # @param b pointer to an alignment
+ # @return pointer to the name string, null terminated
+ char *bam_get_qname(bam1_t *b)
+
+ # @abstract Get the CIGAR array
+ # @param b pointer to an alignment
+ # @return pointer to the CIGAR array
+ #
+    # @discussion In the CIGAR array, each element is a 32-bit integer. The
+    # lower 4 bits encode the CIGAR operation and the higher 28 bits hold the
+    # length of that operation.
+ uint32_t *bam_get_cigar(bam1_t *b)
+
+ # @abstract Get query sequence
+ # @param b pointer to an alignment
+ # @return pointer to sequence
+ #
+ # @discussion Each base is encoded in 4 bits: 1 for A, 2 for C, 4 for G,
+    # @discussion Each base is encoded in 4 bits: 1 for A, 2 for C, 4 for G,
+    # 8 for T and 15 for N. Two bases are packed into one byte, with the base
+    # in the higher 4 bits having the smaller coordinate on the read. It is
+    # recommended to use the bam_seqi() macro to extract each base.
+ char *bam_get_seq(bam1_t *b)
+
+ # @abstract Get query quality
+ # @param b pointer to an alignment
+ # @return pointer to quality string
+ uint8_t *bam_get_qual(bam1_t *b)
+
+ # @abstract Get auxiliary data
+ # @param b pointer to an alignment
+ # @return pointer to the concatenated auxiliary data
+ uint8_t *bam_get_aux(bam1_t *b)
+
+ # @abstract Get length of auxiliary data
+ # @param b pointer to an alignment
+ # @return length of the concatenated auxiliary data
+ int bam_get_l_aux(bam1_t *b)
+
+ # @abstract Get a base on read
+ # @param s Query sequence returned by bam1_seq()
+ # @param i The i-th position, 0-based
+ # @return 4-bit integer representing the base.
+ char bam_seqi(char *s, int i)
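+
+    # Usage sketch (illustrative only): print the read sequence as ASCII by
+    # combining bam_seqi() with the seq_nt16_str table from hts.h.
+    #   char *seq = bam_get_seq(b);
+    #   for (int i = 0; i < b->core.l_qseq; i++)
+    #       putchar(seq_nt16_str[bam_seqi(seq, i)]);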
+
+ #**************************
+ #*** Exported functions ***
+ #**************************
+
+ #***************
+ #*** BAM I/O ***
+ #***************
+
+ bam_hdr_t *bam_hdr_init()
+ bam_hdr_t *bam_hdr_read(BGZF *fp)
+ int bam_hdr_write(BGZF *fp, const bam_hdr_t *h)
+ void bam_hdr_destroy(bam_hdr_t *h)
+ int bam_name2id(bam_hdr_t *h, const char *ref)
+ bam_hdr_t* bam_hdr_dup(const bam_hdr_t *h0)
+
+ bam1_t *bam_init1()
+ void bam_destroy1(bam1_t *b)
+ int bam_read1(BGZF *fp, bam1_t *b)
+ int bam_write1(BGZF *fp, const bam1_t *b)
+ bam1_t *bam_copy1(bam1_t *bdst, const bam1_t *bsrc)
+ bam1_t *bam_dup1(const bam1_t *bsrc)
+
+ int bam_cigar2qlen(int n_cigar, const uint32_t *cigar)
+ int bam_cigar2rlen(int n_cigar, const uint32_t *cigar)
+
+ # @abstract Calculate the rightmost base position of an alignment on the
+ # reference genome.
+
+ # @param b pointer to an alignment
+ # @return the coordinate of the first base after the alignment, 0-based
+
+ # @discussion For a mapped read, this is just b->core.pos + bam_cigar2rlen.
+ # For an unmapped read (either according to its flags or if it has no cigar
+ # string), we return b->core.pos + 1 by convention.
+ int32_t bam_endpos(const bam1_t *b)
+
+ int bam_str2flag(const char *str) # returns negative value on error
+ char *bam_flag2str(int flag) # The string must be freed by the user
+
+ #*************************
+ #*** BAM/CRAM indexing ***
+ #*************************
+
+ # These BAM iterator functions work only on BAM files. To work with either
+ # BAM or CRAM files use the sam_index_load() & sam_itr_*() functions.
+ void bam_itr_destroy(hts_itr_t *iter)
+ hts_itr_t *bam_itr_queryi(const hts_idx_t *idx, int tid, int beg, int end)
+ hts_itr_t *bam_itr_querys(const hts_idx_t *idx, bam_hdr_t *hdr, const char *region)
+ int bam_itr_next(htsFile *htsfp, hts_itr_t *itr, void *r)
+
+ # Load/build .csi or .bai BAM index file. Does not work with CRAM.
+ # It is recommended to use the sam_index_* functions below instead.
+ hts_idx_t *bam_index_load(const char *fn)
+ int bam_index_build(const char *fn, int min_shift)
+
+ # Load a BAM (.csi or .bai) or CRAM (.crai) index file
+ # @param fp File handle of the data file whose index is being opened
+ # @param fn BAM/CRAM/etc filename to search alongside for the index file
+ # @return The index, or NULL if an error occurred.
+ hts_idx_t *sam_index_load(htsFile *fp, const char *fn)
+
+ # Load a specific BAM (.csi or .bai) or CRAM (.crai) index file
+ # @param fp File handle of the data file whose index is being opened
+ # @param fn BAM/CRAM/etc data file filename
+ # @param fnidx Index filename, or NULL to search alongside @a fn
+ # @return The index, or NULL if an error occurred.
+ hts_idx_t *sam_index_load2(htsFile *fp, const char *fn, const char *fnidx)
+
+ # Generate and save an index file
+ # @param fn Input BAM/etc filename, to which .csi/etc will be added
+ # @param min_shift Positive to generate CSI, or 0 to generate BAI
+ # @return 0 if successful, or negative if an error occurred (usually -1; or
+ # -2: opening fn failed; -3: format not indexable)
+ int sam_index_build(const char *fn, int min_shift)
+
+ # Generate and save an index to a specific file
+ # @param fn Input BAM/CRAM/etc filename
+ # @param fnidx Output filename, or NULL to add .bai/.csi/etc to @a fn
+ # @param min_shift Positive to generate CSI, or 0 to generate BAI
+ # @return 0 if successful, or negative if an error occurred.
+ int sam_index_build2(const char *fn, const char *fnidx, int min_shift)
+
+ void sam_itr_destroy(hts_itr_t *iter)
+ hts_itr_t *sam_itr_queryi(const hts_idx_t *idx, int tid, int beg, int end)
+ hts_itr_t *sam_itr_querys(const hts_idx_t *idx, bam_hdr_t *hdr, const char *region)
+ int sam_itr_next(htsFile *htsfp, hts_itr_t *itr, void *r)
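+
+    # Usage sketch (illustrative only; "in.bam" and the region are
+    # placeholders; the SAM I/O functions are declared in the next section):
+    #   htsFile *fp = sam_open("in.bam", "r");
+    #   bam_hdr_t *hdr = sam_hdr_read(fp);
+    #   hts_idx_t *idx = sam_index_load(fp, "in.bam");
+    #   hts_itr_t *itr = sam_itr_querys(idx, hdr, "chr1:1000-2000");
+    #   bam1_t *b = bam_init1();
+    #   while (sam_itr_next(fp, itr, b) >= 0) { /* process b */ }
+    #   bam_destroy1(b); sam_itr_destroy(itr);
+    #   hts_idx_destroy(idx); bam_hdr_destroy(hdr); sam_close(fp);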
+
+ #***************
+ #*** SAM I/O ***
+ #***************
+
+ htsFile *sam_open(const char *fn, const char *mode)
+ htsFile *sam_open_format(const char *fn, const char *mode, const htsFormat *fmt)
+ int sam_close(htsFile *fp)
+
+ int sam_open_mode(char *mode, const char *fn, const char *format)
+
+ # A version of sam_open_mode that can handle ,key=value options.
+ # The format string is allocated and returned, to be freed by the caller.
+    # Prefix should be "r" or "w".
+ char *sam_open_mode_opts(const char *fn, const char *mode, const char *format)
+
+ bam_hdr_t *sam_hdr_parse(int l_text, const char *text)
+ bam_hdr_t *sam_hdr_read(htsFile *fp)
+ int sam_hdr_write(htsFile *fp, const bam_hdr_t *h)
+
+ int sam_parse1(kstring_t *s, bam_hdr_t *h, bam1_t *b)
+ int sam_format1(const bam_hdr_t *h, const bam1_t *b, kstring_t *str)
+ int sam_read1(htsFile *fp, bam_hdr_t *h, bam1_t *b)
+ int sam_write1(htsFile *fp, const bam_hdr_t *h, const bam1_t *b)
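+
+    # Usage sketch (illustrative only): copy records from a BAM file to SAM
+    # on stdout ("in.bam" is a placeholder):
+    #   htsFile *in = sam_open("in.bam", "r"), *out = sam_open("-", "w");
+    #   bam_hdr_t *hdr = sam_hdr_read(in);
+    #   sam_hdr_write(out, hdr);
+    #   bam1_t *b = bam_init1();
+    #   while (sam_read1(in, hdr, b) >= 0)
+    #       if (sam_write1(out, hdr, b) < 0) break;
+    #   bam_destroy1(b); bam_hdr_destroy(hdr);
+    #   sam_close(out); sam_close(in);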
+
+ #*************************************
+ #*** Manipulating auxiliary fields ***
+ #*************************************
+
+ uint8_t *bam_aux_get(const bam1_t *b, const char *tag)
+ int32_t bam_aux2i(const uint8_t *s)
+ double bam_aux2f(const uint8_t *s)
+ char bam_aux2A(const uint8_t *s)
+ char *bam_aux2Z(const uint8_t *s)
+
+ void bam_aux_append(bam1_t *b, const char *tag, char type, int len, uint8_t *data)
+ int bam_aux_del(bam1_t *b, uint8_t *s)
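+
+    # Usage sketch (illustrative only; "NM" is a placeholder tag):
+    #   uint8_t *nm = bam_aux_get(b, "NM");
+    #   if (nm != NULL) { int32_t edit_distance = bam_aux2i(nm); }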
+
+ #**************************
+ #*** Pileup and Mpileup ***
+ #**************************
+
+ # @abstract Structure for one alignment covering the pileup position.
+ # @field b pointer to the alignment
+ # @field qpos position of the read base at the pileup site, 0-based
+ # @field indel indel length; 0 for no indel, positive for ins and negative for del
+ # @field level the level of the read in the "viewer" mode
+ # @field is_del 1 iff the base on the padded read is a deletion
+ # @field is_head ???
+ # @field is_tail ???
+ # @field is_refskip ???
+ # @field aux ???
+ #
+ # @discussion See also bam_plbuf_push() and bam_lplbuf_push(). The
+ # difference between the two functions is that the former does not
+    # set bam_pileup1_t::level, while the latter does. Level helps the
+ # implementation of alignment viewers, but calculating this has some
+ # overhead.
+ #
+    # is_del, is_head, etc. are bit fields; declaring them as below should
+    # work as expected, see
+ # https://groups.google.com/forum/#!msg/cython-users/24tD1kwRY7A/pmoPuSmanM0J
+
+ ctypedef struct bam_pileup1_t:
+ bam1_t *b
+ int32_t qpos
+ int indel, level
+ uint32_t is_del
+ uint32_t is_head
+ uint32_t is_tail
+ uint32_t is_refskip
+ uint32_t aux
+
+ ctypedef int (*bam_plp_auto_f)(void *data, bam1_t *b)
+ ctypedef int (*bam_test_f)()
+
+ ctypedef struct __bam_plp_t
+ ctypedef __bam_plp_t *bam_plp_t
+
+ ctypedef struct __bam_mplp_t
+ ctypedef __bam_mplp_t *bam_mplp_t
+
+    # bam_plp_init() - sets up a pileup iterator over multiple alignments
+ # @func: see mplp_func in bam_plcmd.c in samtools for an example. Expected return
+ # status: 0 on success, -1 on end, < -1 on non-recoverable errors
+ # @data: user data to pass to @func
+ bam_plp_t bam_plp_init(bam_plp_auto_f func, void *data)
+ void bam_plp_destroy(bam_plp_t iter)
+ int bam_plp_push(bam_plp_t iter, const bam1_t *b)
+ const bam_pileup1_t *bam_plp_next(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp)
+ const bam_pileup1_t *bam_plp_auto(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp)
+ void bam_plp_set_maxcnt(bam_plp_t iter, int maxcnt)
+ void bam_plp_reset(bam_plp_t iter)
+
+ bam_mplp_t bam_mplp_init(int n, bam_plp_auto_f func, void **data)
+
+ # bam_mplp_init_overlaps() - if called, mpileup will detect overlapping
+ # read pairs and for each base pair set the base quality of the
+ # lower-quality base to zero, thus effectively discarding it from
+ # calling. If the two bases are identical, the quality of the other base
+ # is increased to the sum of their qualities (capped at 200), otherwise
+ # it is multiplied by 0.8.
+ void bam_mplp_init_overlaps(bam_mplp_t iter)
+ void bam_mplp_destroy(bam_mplp_t iter)
+ void bam_mplp_set_maxcnt(bam_mplp_t iter, int maxcnt)
+ int bam_mplp_auto(bam_mplp_t iter, int *_tid, int *_pos, int *n_plp, const bam_pileup1_t **plp)
+
+ # Added by AH
+ # ctypedef bam_pileup1_t * const_bam_pileup1_t_ptr "const bam_pileup1_t *"
+
+
+cdef extern from "htslib/faidx.h" nogil:
+
+ ctypedef struct faidx_t:
+ pass
+
+ int fai_build(char *fn)
+
+ void fai_destroy(faidx_t *fai)
+
+ faidx_t *fai_load(char *fn)
+
+ char *fai_fetch(faidx_t *fai,
+ char *reg,
+ int *len)
+
+ int faidx_nseq(faidx_t *fai)
+
+ int faidx_has_seq(faidx_t *fai, const char *seq)
+
+ char *faidx_fetch_seq(faidx_t *fai,
+ char *c_name,
+ int p_beg_i,
+ int p_end_i,
+ int *len)
+
+ int faidx_seq_len(faidx_t *fai, const char *seq)
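+
+    # Usage sketch (illustrative only; "ref.fa" and "chr1" are placeholders;
+    # fai_load() opens an existing .fai index, building one if absent):
+    #   faidx_t *fai = fai_load("ref.fa");
+    #   int len;
+    #   char *seq = faidx_fetch_seq(fai, "chr1", 0, 99, &len); /* first 100 bp */
+    #   free(seq); fai_destroy(fai);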
+
+
+# tabix support
+cdef extern from "htslib/tbx.h" nogil:
+
+ # tbx.h definitions
+ int8_t TBX_MAX_SHIFT
+ int8_t TBX_GENERIC
+ int8_t TBX_SAM
+ int8_t TBX_VCF
+ int8_t TBX_UCSC
+
+ ctypedef struct tbx_conf_t:
+ int32_t preset
+ int32_t sc, bc, ec # seq col., beg col. and end col.
+ int32_t meta_char, line_skip
+
+ ctypedef struct tbx_t:
+ tbx_conf_t conf
+ hts_idx_t *idx
+ void * dict
+
+ tbx_conf_t tbx_conf_gff
+ tbx_conf_t tbx_conf_bed
+ tbx_conf_t tbx_conf_psltbl
+ tbx_conf_t tbx_conf_sam
+ tbx_conf_t tbx_conf_vcf
+
+ void tbx_itr_destroy(hts_itr_t * iter)
+    hts_itr_t * tbx_itr_queryi(tbx_t * t, int tid, int beg, int end)
+ hts_itr_t * tbx_itr_querys(tbx_t * t, char * s)
+ int tbx_itr_next(htsFile * fp, tbx_t * t, hts_itr_t * iter, void * data)
+
+ int tbx_name2id(tbx_t *tbx, char *ss)
+
+ int tbx_index_build(char *fn, int min_shift, tbx_conf_t *conf)
+ int tbx_index_build2(const char *fn, const char *fnidx, int min_shift, const tbx_conf_t *conf)
+
+ tbx_t * tbx_index_load(char *fn)
+ tbx_t *tbx_index_load2(const char *fn, const char *fnidx)
+
+ # free the array but not the values
+ char **tbx_seqnames(tbx_t *tbx, int *n)
+
+ void tbx_destroy(tbx_t *tbx)
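+
+    # Usage sketch (illustrative only; filenames and the region are
+    # placeholders):
+    #   htsFile *fp = hts_open("data.bed.gz", "r");
+    #   tbx_t *tbx = tbx_index_load("data.bed.gz");
+    #   hts_itr_t *itr = tbx_itr_querys(tbx, "chr1:1-100000");
+    #   kstring_t str = {0, 0, NULL};
+    #   while (tbx_itr_next(fp, tbx, itr, &str) >= 0) { /* process str.s */ }
+    #   free(str.s); tbx_itr_destroy(itr); tbx_destroy(tbx); hts_close(fp);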
+
+
+# VCF/BCF API
+cdef extern from "htslib/vcf.h" nogil:
+
+ # Header struct
+
+ uint8_t BCF_HL_FLT # header line
+ uint8_t BCF_HL_INFO
+ uint8_t BCF_HL_FMT
+ uint8_t BCF_HL_CTG
+ uint8_t BCF_HL_STR # structured header line TAG=<A=..,B=..>
+ uint8_t BCF_HL_GEN # generic header line
+
+ uint8_t BCF_HT_FLAG # header type
+ uint8_t BCF_HT_INT
+ uint8_t BCF_HT_REAL
+ uint8_t BCF_HT_STR
+
+ uint8_t BCF_VL_FIXED # variable length
+ uint8_t BCF_VL_VAR
+ uint8_t BCF_VL_A
+ uint8_t BCF_VL_G
+ uint8_t BCF_VL_R
+
+ # === Dictionary ===
+ #
+    # The header keeps three dictionaries. The first keeps IDs in the
+ # "FILTER/INFO/FORMAT" lines, the second keeps the sequence names and lengths
+ # in the "contig" lines and the last keeps the sample names. bcf_hdr_t::dict[]
+ # is the actual hash table, which is opaque to the end users. In the hash
+ # table, the key is the ID or sample name as a C string and the value is a
+ # bcf_idinfo_t struct. bcf_hdr_t::id[] points to key-value pairs in the hash
+ # table in the order that they appear in the VCF header. bcf_hdr_t::n[] is the
+ # size of the hash table or, equivalently, the length of the id[] arrays.
+
+ uint8_t BCF_DT_ID # dictionary type
+ uint8_t BCF_DT_CTG
+ uint8_t BCF_DT_SAMPLE
+
+ # Complete textual representation of a header line
+ ctypedef struct bcf_hrec_t:
+        int type            # One of the BCF_HL_* types
+ char *key # The part before '=', i.e. FILTER/INFO/FORMAT/contig/fileformat etc.
+ char *value # Set only for generic lines, NULL for FILTER/INFO, etc.
+ int nkeys # Number of structured fields
+ char **keys # The key=value pairs
+ char **vals
+
+ ctypedef struct bcf_idinfo_t:
+ uint32_t info[3] # stores Number:20, var:4, Type:4, ColType:4 in info[0..2]
+ bcf_hrec_t *hrec[3] # for BCF_HL_FLT,INFO,FMT and contig length in info[0] for BCF_HL_CTG
+ int id
+
+ ctypedef struct bcf_idpair_t:
+ const char *key
+ const bcf_idinfo_t *val
+
+ ctypedef struct bcf_hdr_t:
+        int32_t n[3]        # n: the size of the dictionary block in use (allocated size, m, is below to preserve ABI)
+ bcf_idpair_t *id[3]
+ void *dict[3] # ID dictionary, contig dict and sample dict
+ char **samples
+ bcf_hrec_t **hrec
+ int nhrec, dirty
+ int ntransl
+ int *transl[2] # for bcf_translate()
+ int nsamples_ori # for bcf_hdr_set_samples()
+ uint8_t *keep_samples
+ kstring_t mem
+ int32_t m[3] # m: allocated size of the dictionary block in use (see n above)
+
+ uint8_t bcf_type_shift[]
+
+ # * VCF record *
+
+ uint8_t BCF_BT_NULL
+ uint8_t BCF_BT_INT8
+ uint8_t BCF_BT_INT16
+ uint8_t BCF_BT_INT32
+ uint8_t BCF_BT_FLOAT
+ uint8_t BCF_BT_CHAR
+
+ uint8_t VCF_REF
+ uint8_t VCF_SNP
+ uint8_t VCF_MNP
+ uint8_t VCF_INDEL
+ uint8_t VCF_OTHER
+
+ ctypedef struct variant_t:
+ int type, n # variant type and the number of bases affected, negative for deletions
+
+ ctypedef struct bcf_fmt_t:
+ int id # id: numeric tag id, the corresponding string is bcf_hdr_t::id[BCF_DT_ID][$id].key
+ int n, size, type # n: number of values per-sample; size: number of bytes per-sample; type: one of BCF_BT_* types
+ uint8_t *p # same as vptr and vptr_* in bcf_info_t below
+ uint32_t p_len
+ uint32_t p_off
+ uint8_t p_free
+
+ union bcf_info_union_t:
+ int32_t i # integer value
+ float f # float value
+
+ ctypedef struct bcf_info_t:
+ int key # key: numeric tag id, the corresponding string is bcf_hdr_t::id[BCF_DT_ID][$key].key
+ int type, len # type: one of BCF_BT_* types; len: vector length, 1 for scalars
+
+ # v1 union only set if $len==1; for easier access
+ bcf_info_union_t v1
+ uint8_t *vptr # pointer to data array in bcf1_t->shared.s, excluding the size+type and tag id bytes
+ uint32_t vptr_len # length of the vptr block or, when set, of the vptr_mod block, excluding offset
+ uint32_t vptr_off # vptr offset, i.e., the size of the INFO key plus size+type bytes
+ uint8_t vptr_free # indicates that vptr-vptr_off must be freed; set only when modified and the new
+ # data block is bigger than the original
+
+ uint8_t BCF1_DIRTY_ID
+ uint8_t BCF1_DIRTY_ALS
+ uint8_t BCF1_DIRTY_FLT
+ uint8_t BCF1_DIRTY_INF
+
+ ctypedef struct bcf_dec_t:
+ int m_fmt, m_info, m_id, m_als, m_allele, m_flt # allocated size (high-water mark); do not change
+ int n_flt # Number of FILTER fields
+ int *flt # FILTER keys in the dictionary
+ char *id # ID
+        char *als           # REF+ALT block (\0-separated)
+ char **allele # allele[0] is the REF (allele[] pointers to the als block); all null terminated
+ bcf_info_t *info # INFO
+ bcf_fmt_t *fmt # FORMAT and individual sample
+ variant_t *var # $var and $var_type set only when set_variant_types called
+ int n_var, var_type
+ int shared_dirty # if set, shared.s must be recreated on BCF output
+ int indiv_dirty # if set, indiv.s must be recreated on BCF output
+
+ uint8_t BCF_ERR_CTG_UNDEF
+ uint8_t BCF_ERR_TAG_UNDEF
+ uint8_t BCF_ERR_NCOLS
+ uint8_t BCF_ERR_LIMITS
+ uint8_t BCF_ERR_CHAR
+ uint8_t BCF_ERR_CTG_INVALID
+ uint8_t BCF_ERR_TAG_INVALID
+
+    # The bcf1_t structure corresponds to one VCF/BCF line. Reading from a VCF
+    # file is slower because the string must first be parsed and packed into a
+    # BCF line (done in vcf_parse), then unpacked into the internal bcf1_t
+    # structure. If it
+ # is known in advance that some of the fields will not be required (notably
+ # the sample columns), parsing of these can be skipped by setting max_unpack
+ # appropriately.
+ # Similarly, it is fast to output a BCF line because the columns (kept in
+ # shared.s, indiv.s, etc.) are written directly by bcf_write, whereas a VCF
+ # line must be formatted in vcf_format.
+
+ ctypedef struct bcf1_t:
+ int32_t rid # CHROM
+ int32_t pos # POS
+ int32_t rlen # length of REF
+ float qual # QUAL
+ uint32_t n_info, n_allele
+ uint32_t n_fmt, n_sample
+ kstring_t shared, indiv
+ bcf_dec_t d # lazy evaluation: $d is not generated by bcf_read(), but by explicitly calling bcf_unpack()
+ int max_unpack # Set to BCF_UN_STR, BCF_UN_FLT, or BCF_UN_INFO to boost performance of vcf_parse when some of the fields won't be needed
+ int unpacked # remember what has been unpacked to allow calling bcf_unpack() repeatedly without redoing the work
+ int unpack_size[3] # the original block size of ID, REF+ALT and FILTER
+ int errcode # one of BCF_ERR_* codes
+
+ ####### API #######
+
+ # BCF and VCF I/O
+ #
+ # A note about naming conventions: htslib internally represents VCF
+ # records as bcf1_t data structures, therefore most functions are
+ # prefixed with bcf_. There are a few exceptions where the functions must
+ # be aware of both BCF and VCF worlds, such as bcf_parse vs vcf_parse. In
+ # these cases, functions prefixed with bcf_ are more general and work
+ # with both BCF and VCF.
+
+ # bcf_hdr_init() - create an empty BCF header.
+ # @param mode "r" or "w"
+ #
+ # When opened for writing, the mandatory fileFormat and
+ # FILTER=PASS lines are added automatically.
+ bcf_hdr_t *bcf_hdr_init(const char *mode)
+
+ # Destroy a BCF header struct
+ void bcf_hdr_destroy(bcf_hdr_t *h)
+
+ # Initialize a bcf1_t object; equivalent to calloc(1, sizeof(bcf1_t))
+ bcf1_t *bcf_init()
+
+ # Deallocate a bcf1_t object
+ void bcf_destroy(bcf1_t *v)
+
+ # Same as bcf_destroy() but frees only the memory allocated by bcf1_t,
+ # not the bcf1_t object itself.
+ void bcf_empty(bcf1_t *v)
+
+ # Make the bcf1_t object ready for next read. Intended mostly for
+ # internal use, the user should rarely need to call this function
+ # directly.
+ void bcf_clear(bcf1_t *v)
+
+ # Reads VCF or BCF header
+ bcf_hdr_t *bcf_hdr_read(htsFile *fp)
+
+ # bcf_hdr_set_samples() - for more efficient VCF parsing when only one/few samples are needed
+ # @samples: samples to include or exclude from file or as a comma-separated string.
+ # LIST|FILE .. select samples in list/file
+ # ^LIST|FILE .. exclude samples from list/file
+ # - .. include all samples
+ # NULL .. exclude all samples
+ # @is_file: @samples is a file (1) or a comma-separated list (0)
+ #
+ # The bottleneck of VCF reading is parsing of genotype fields. If the
+    # reader knows in advance that only a subset of samples is needed (possibly
+ # no samples at all), the performance of bcf_read() can be significantly
+ # improved by calling bcf_hdr_set_samples after bcf_hdr_read().
+ # The function bcf_read() will subset the VCF/BCF records automatically
+ # with the notable exception when reading records via bcf_itr_next().
+ # In this case, bcf_subset_format() must be called explicitly, because
+ # bcf_readrec() does not see the header.
+ #
+ # Returns 0 on success, -1 on error or a positive integer if the list
+ # contains samples not present in the VCF header. In such a case, the
+ # return value is the index of the offending sample.
+ #
+ int bcf_hdr_set_samples(bcf_hdr_t *hdr, const char *samples, int is_file)
+ int bcf_subset_format(const bcf_hdr_t *hdr, bcf1_t *rec)
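+
+    # Usage sketch (illustrative only; fp is an open htsFile* and the sample
+    # names are placeholders): restrict parsing to two samples.
+    #   bcf_hdr_t *hdr = bcf_hdr_read(fp);
+    #   if (bcf_hdr_set_samples(hdr, "NA00001,NA00002", 0) != 0)
+    #       { /* error, or the list names a sample missing from the header */ }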
+
+ # Writes VCF or BCF header
+ int bcf_hdr_write(htsFile *fp, bcf_hdr_t *h)
+
+ # Parse VCF line contained in kstring and populate the bcf1_t struct
+ int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v)
+
+ # The opposite of vcf_parse. It should rarely be called directly, see vcf_write
+ int vcf_format(const bcf_hdr_t *h, const bcf1_t *v, kstring_t *s)
+
+ # bcf_read() - read next VCF or BCF record
+ #
+ # Returns -1 on critical errors, 0 otherwise. On errors which are not
+ # critical for reading, such as missing header definitions, v->errcode is
+ # set to one of BCF_ERR* code and must be checked before calling
+ # vcf_write().
+ int bcf_read(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
+
+ # bcf_unpack() - unpack/decode a BCF record (fills the bcf1_t::d field)
+ #
+ # Note that bcf_unpack() must be called even when reading VCF. It is safe
+ # to call the function repeatedly, it will not unpack the same field
+ # twice.
+ uint8_t BCF_UN_STR # up to ALT inclusive
+ uint8_t BCF_UN_FLT # up to FILTER
+ uint8_t BCF_UN_INFO # up to INFO
+ uint8_t BCF_UN_SHR # all shared information
+ uint8_t BCF_UN_FMT # unpack format and each sample
+    uint8_t BCF_UN_IND  # a synonym of BCF_UN_FMT
+ uint8_t BCF_UN_ALL # everything
+
+ int bcf_unpack(bcf1_t *b, int which)
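+
+    # Usage sketch (illustrative only; fp and hdr are assumed to be open):
+    # read records, unpacking only the shared columns before inspecting
+    # the alleles.
+    #   bcf1_t *rec = bcf_init();
+    #   while (bcf_read(fp, hdr, rec) == 0) {
+    #       bcf_unpack(rec, BCF_UN_STR);
+    #       /* rec->d.allele[0] is REF, rec->d.allele[1..] are the ALTs */
+    #   }
+    #   bcf_destroy(rec);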
+
+ # bcf_dup() - create a copy of BCF record.
+ #
+ # Note that bcf_unpack() must be called on the returned copy as if it was
+ # obtained from bcf_read(). Also note that bcf_dup() calls bcf_sync1(src)
+ # internally to reflect any changes made by bcf_update_* functions.
+ bcf1_t *bcf_dup(bcf1_t *src)
+ bcf1_t *bcf_copy(bcf1_t *dst, bcf1_t *src)
+
+ # bcf_write() - write one VCF or BCF record. The type is determined at the open() call.
+ int bcf_write(htsFile *fp, bcf_hdr_t *h, bcf1_t *v)
+
+ # The following functions work only with VCFs and should rarely be called
+ # directly. Usually one wants to use their bcf_* alternatives, which work
+ # transparently with both VCFs and BCFs.
+ bcf_hdr_t *vcf_hdr_read(htsFile *fp)
+ int vcf_hdr_write(htsFile *fp, const bcf_hdr_t *h)
+ int vcf_read(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
+ int vcf_write(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
+
+ #************************************************************************
+ # Header querying and manipulation routines
+ #************************************************************************
+
+ # Create a new header using the supplied template
+ bcf_hdr_t *bcf_hdr_dup(const bcf_hdr_t *hdr)
+
+ # Copy header lines from src to dst if not already present in dst. See also bcf_translate().
+ # Returns 0 on success or sets a bit on error:
+ # 1 .. conflicting definitions of tag length
+    #     (other bits: todo)
+ int bcf_hdr_combine(bcf_hdr_t *dst, const bcf_hdr_t *src)
+
+ # bcf_hdr_merge() - copy header lines from src to dst, see also bcf_translate()
+ # @param dst: the destination header to be merged into, NULL on the first pass
+ # @param src: the source header
+ #
+ # Notes:
+ # - use as:
+ # bcf_hdr_t *dst = NULL;
+ # for (i=0; i<nsrc; i++) dst = bcf_hdr_merge(dst,src[i]);
+ #
+ # - bcf_hdr_merge() replaces bcf_hdr_combine() which had a problem when
+ # combining multiple BCF headers. The current bcf_hdr_combine()
+ # does not have this problem, but became slow when used for many files.
+ bcf_hdr_t *bcf_hdr_merge(bcf_hdr_t *dst, const bcf_hdr_t *src)
+
+ # bcf_hdr_add_sample() - add a new sample.
+ # @param sample: sample name to be added
+ int bcf_hdr_add_sample(bcf_hdr_t *hdr, const char *sample)
+
+ # Read VCF header from a file and update the header
+ int bcf_hdr_set(bcf_hdr_t *hdr, const char *fname)
+
+ # Returns formatted header (newly allocated string) and its length,
+ # excluding the terminating \0. If is_bcf parameter is unset, IDX
+ # fields are discarded.
+ char *bcf_hdr_fmt_text(const bcf_hdr_t *hdr, int is_bcf, int *len)
+
+ # Append new VCF header line, returns 0 on success
+ int bcf_hdr_append(bcf_hdr_t *h, const char *line)
+ int bcf_hdr_printf(bcf_hdr_t *h, const char *format, ...)
+
+ # VCF version, e.g. VCFv4.2
+ const char *bcf_hdr_get_version(const bcf_hdr_t *hdr)
+ void bcf_hdr_set_version(bcf_hdr_t *hdr, const char *version)
+
+ # bcf_hdr_remove() - remove VCF header tag
+ # @param type: one of BCF_HL_*
+ # @param key: tag name or NULL to remove all tags of the given type
+ void bcf_hdr_remove(bcf_hdr_t *h, int type, const char *key)
+
+ # bcf_hdr_subset() - creates a new copy of the header removing unwanted samples
+ # @param n: number of samples to keep
+ # @param samples: names of the samples to keep
+ # @param imap: mapping from index in @samples to the sample index in the original file
+ #
+ # Sample names not present in h0 are ignored. The number of unmatched samples can be checked
+ # by comparing n and bcf_hdr_nsamples(out_hdr).
+ # This function can be used to reorder samples.
+ # See also bcf_subset() which subsets individual records.
+ #
+ bcf_hdr_t *bcf_hdr_subset(const bcf_hdr_t *h0, int n, char *const* samples, int *imap)
+
+ # Creates a list of sequence names. It is up to the caller to free the list (but not the sequence names)
+ const char **bcf_hdr_seqnames(const bcf_hdr_t *h, int *nseqs)
+
+ # Get number of samples
+ int32_t bcf_hdr_nsamples(const bcf_hdr_t *h)
+
+ # The following functions are for internal use and should rarely be called directly
+ int bcf_hdr_parse(bcf_hdr_t *hdr, char *htxt)
+ int bcf_hdr_sync(bcf_hdr_t *h)
+ bcf_hrec_t *bcf_hdr_parse_line(const bcf_hdr_t *h, const char *line, int *len)
+ void bcf_hrec_format(const bcf_hrec_t *hrec, kstring_t *str)
+ int bcf_hdr_add_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec)
+
+ # bcf_hdr_get_hrec() - get header line info
+ # @param type: one of the BCF_HL_* types: FLT,INFO,FMT,CTG,STR,GEN
+ # @param key: the header key for generic lines (e.g. "fileformat"), any field
+ # for structured lines, typically "ID".
+    # @param value: the value which pairs with key. Can be NULL for BCF_HL_GEN
+ # @param str_class: the class of BCF_HL_STR line (e.g. "ALT" or "SAMPLE"), otherwise NULL
+ #
+ bcf_hrec_t *bcf_hdr_get_hrec(const bcf_hdr_t *hdr, int type, const char *key, const char *value, const char *str_class)
+ bcf_hrec_t *bcf_hrec_dup(bcf_hrec_t *hrec)
+ void bcf_hrec_add_key(bcf_hrec_t *hrec, const char *str, int len)
+ void bcf_hrec_set_val(bcf_hrec_t *hrec, int i, const char *str, int len, int is_quoted)
+ int bcf_hrec_find_key(bcf_hrec_t *hrec, const char *key)
+ void hrec_add_idx(bcf_hrec_t *hrec, int idx)
+ void bcf_hrec_destroy(bcf_hrec_t *hrec)
+
+ #************************************************************************
+ # Individual record querying and manipulation routines
+ #************************************************************************
+
+ # See the description of bcf_hdr_subset()
+ int bcf_subset(const bcf_hdr_t *h, bcf1_t *v, int n, int *imap)
+
+    # bcf_translate() - translate tag IDs to be consistent with a different header. This function
+    # is useful when lines from multiple VCFs need to be combined.
+ # @dst_hdr: the destination header, to be used in bcf_write(), see also bcf_hdr_combine()
+ # @src_hdr: the source header, used in bcf_read()
+ # @src_line: line obtained by bcf_read()
+ int bcf_translate(const bcf_hdr_t *dst_hdr, bcf_hdr_t *src_hdr, bcf1_t *src_line)
+
+ # bcf_get_variant_type[s]() - returns one of VCF_REF, VCF_SNP, etc
+ int bcf_get_variant_types(bcf1_t *rec)
+ int bcf_get_variant_type(bcf1_t *rec, int ith_allele)
+ int bcf_is_snp(bcf1_t *v)
+
+ # bcf_update_filter() - sets the FILTER column
+ # @flt_ids: The filter IDs to set, numeric IDs returned by bcf_hdr_id2int(hdr, BCF_DT_ID, "PASS")
+ # @n: Number of filters. If n==0, all filters are removed
+ int bcf_update_filter(const bcf_hdr_t *hdr, bcf1_t *line, int *flt_ids, int n)
+
+ # bcf_add_filter() - adds to the FILTER column
+ # @flt_id: The filter IDs to add, numeric IDs returned by bcf_hdr_id2int(hdr, BCF_DT_ID, "PASS")
+ #
+ # If flt_id is PASS, all existing filters are removed first. If other than PASS, existing PASS is removed.
+ int bcf_add_filter(const bcf_hdr_t *hdr, bcf1_t *line, int flt_id)
+
+ # bcf_remove_filter() - removes from the FILTER column
+ # @flt_id: filter ID to remove, numeric ID returned by bcf_hdr_id2int(hdr, BCF_DT_ID, "PASS")
+ # @pass: when set to 1 and no filters are present, set to PASS
+ int bcf_remove_filter(const bcf_hdr_t *hdr, bcf1_t *line, int flt_id, int set_pass)
+
+ # Returns 1 if present, 0 if absent, or -1 if filter does not exist. "PASS" and "." can be used interchangeably.
+ int bcf_has_filter(const bcf_hdr_t *hdr, bcf1_t *line, char *filter)
+
+ # bcf_update_alleles() and bcf_update_alleles_str() - update REF and ALT column
+ # @alleles: Array of alleles
+ # @nals: Number of alleles
+ # @alleles_string: Comma-separated alleles, starting with the REF allele
+ int bcf_update_alleles(const bcf_hdr_t *hdr, bcf1_t *line, const char **alleles, int nals)
+ int bcf_update_alleles_str(const bcf_hdr_t *hdr, bcf1_t *line, const char *alleles_string)
+
+ # bcf_update_id() - sets new ID string
+ # bcf_add_id() - adds to the ID string checking for duplicates
+ int bcf_update_id(const bcf_hdr_t *hdr, bcf1_t *line, const char *id)
+ int bcf_add_id(const bcf_hdr_t *hdr, bcf1_t *line, const char *id)
+
+ # bcf_update_info_*() - functions for updating INFO fields
+ # @hdr: the BCF header
+ # @line: VCF line to be edited
+ # @key: the INFO tag to be updated
+ # @values: pointer to the array of values. Pass NULL to remove the tag.
+ # @n: number of values in the array. When set to 0, the INFO tag is removed
+ #
+ # The @string in bcf_update_info_flag() is optional, @n indicates whether
+ # the flag is set or removed.
+ #
+ # Returns 0 on success or negative value on error.
+ #
+ int bcf_update_info_int32(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const int32_t *values, int n)
+ int bcf_update_info_float(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const float *values, int n)
+ int bcf_update_info_flag(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const char *values, int n)
+ int bcf_update_info_string(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const char *values, int n)
+ int bcf_update_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const void *values, int n, int type)
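+
+    # Usage sketch (illustrative only; "DP" is a placeholder tag that must be
+    # defined in the header):
+    #   int32_t dp = 42;
+    #   bcf_update_info_int32(hdr, line, "DP", &dp, 1);  /* set INFO/DP=42 */
+    #   bcf_update_info_int32(hdr, line, "DP", NULL, 0); /* remove INFO/DP */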
+
+ # bcf_update_format_*() - functions for updating FORMAT fields
+ # @values: pointer to the array of values, the same number of elements
+ # is expected for each sample. Missing values must be padded
+ # with bcf_*_missing or bcf_*_vector_end values.
+ # @n: number of values in the array. If n==0, existing tag is removed.
+ #
+ # The function bcf_update_format_string() is a higher-level (slower) variant of
+    # bcf_update_format_char(). The former accepts an array of \0-terminated strings
+ # whereas the latter requires that the strings are collapsed into a single array
+ # of fixed-length strings. In case of strings with variable length, shorter strings
+ # can be \0-padded. Note that the collapsed strings passed to bcf_update_format_char()
+ # are not \0-terminated.
+ #
+ # Returns 0 on success or negative value on error.
+ #
+ int bcf_update_format_int32(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const int32_t *values, int n)
+ int bcf_update_format_float(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const float *values, int n)
+ int bcf_update_format_char(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const char *values, int n)
+ int bcf_update_genotypes(const bcf_hdr_t *hdr, bcf1_t *line, const int32_t *values, int n)
+ int bcf_update_format_string(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const char **values, int n)
+ int bcf_update_format(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const void *values, int n, int type)
+
+ # Macros for setting genotypes correctly, for use with bcf_update_genotypes only; idx corresponds
+ # to VCF's GT (1-based index to ALT or 0 for the reference allele) and val is the opposite, obtained
+ # from bcf_get_genotypes() below.
+ uint32_t bcf_gt_phased(uint32_t idx)
+ uint32_t bcf_gt_unphased(uint32_t idx)
+ uint32_t bcf_gt_missing
+ uint32_t bcf_gt_is_missing(uint32_t val)
+ uint32_t bcf_gt_is_phased(uint32_t idx)
+ uint32_t bcf_gt_allele(uint32_t val)
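+
+    # Usage sketch (illustrative only): encode a phased diploid genotype 0|1
+    # for a single sample and store it with bcf_update_genotypes() above.
+    #   int32_t gts[2] = { bcf_gt_phased(0), bcf_gt_phased(1) };
+    #   bcf_update_genotypes(hdr, line, gts, 2);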
+
+ # Conversion between alleles indexes to Number=G genotype index (assuming diploid, all 0-based)
+ uint32_t bcf_alleles2gt(uint32_t a, uint32_t b)
+ void bcf_gt2alleles(int igt, int *a, int *b)
+
+ # bcf_get_fmt() - returns pointer to FORMAT's field data
+ # @header: for access to BCF_DT_ID dictionary
+ # @line: VCF line obtained from vcf_parse1
+ # @fmt: one of GT,PL,...
+ #
+ # Returns bcf_fmt_t* if the call succeeded, or returns NULL when the field
+ # is not available.
+ #
+ bcf_fmt_t *bcf_get_fmt(const bcf_hdr_t *hdr, bcf1_t *line, const char *key)
+ bcf_info_t *bcf_get_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key)
+
+ # bcf_get_*_id() - returns pointer to FORMAT/INFO field data given the header index instead of the string ID
+ # @line: VCF line obtained from vcf_parse1
+ # @id: The header index for the tag, obtained from bcf_hdr_id2int()
+ #
+ # Returns bcf_fmt_t* / bcf_info_t*. These functions do not check if the index is valid
+ # as their goal is to avoid the header lookup.
+ #
+ bcf_fmt_t *bcf_get_fmt_id(bcf1_t *line, const int id)
+ bcf_info_t *bcf_get_info_id(bcf1_t *line, const int id)
+
+ # bcf_get_info_*() - get INFO values, integers or floats
+ # @hdr: BCF header
+ # @line: BCF record
+ # @tag: INFO tag to retrieve
+    # @dst: *dst is a pointer to a memory location; it can point to NULL
+ # @ndst: pointer to the size of allocated memory
+ #
+ # Returns negative value on error or the number of written values on
+ # success. bcf_get_info_string() returns on success the number of
+ # characters written excluding the null-terminating byte. bcf_get_info_flag()
+ # returns 1 when flag is set or 0 if not.
+ #
+ # List of return codes:
+ # -1 .. no such INFO tag defined in the header
+ # -2 .. clash between types defined in the header and encountered in the VCF record
+ # -3 .. tag is not present in the VCF record
+ #
+ int bcf_get_info_int32(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, int32_t **dst, int *ndst)
+ int bcf_get_info_float(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, float **dst, int *ndst)
+ int bcf_get_info_string(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, char **dst, int *ndst)
+ int bcf_get_info_flag(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, int **dst, int *ndst)
+ int bcf_get_info_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, void **dst, int *ndst, int type)
+
+    # bcf_get_format_*() - same as bcf_get_info_*() above
+    #
+    # The function bcf_get_format_string() is a higher-level (slower) variant of bcf_get_format_char().
+    # See the description of bcf_update_format_string() and bcf_update_format_char() above.
+    # Unlike other bcf_get_format_*() functions, bcf_get_format_string() allocates two arrays:
+ # a single block of \0-terminated strings collapsed into a single array and an array of pointers
+ # to these strings. Both arrays must be cleaned by the user.
+ #
+ # Returns negative value on error or the number of written values on success.
+ #
+ # Example:
+ # int ndst = 0; char **dst = NULL
+ # if ( bcf_get_format_string(hdr, line, "XX", &dst, &ndst) > 0 )
+ # for (i=0; i<bcf_hdr_nsamples(hdr); i++) printf("%s\n", dst[i])
+ # free(dst[0]); free(dst)
+ #
+ # Example:
+    #     int ngt, *gt_arr = NULL, ngt_arr = 0
+    #     ngt = bcf_get_genotypes(hdr, line, &gt_arr, &ngt_arr)
+ #
+ int bcf_get_format_int32(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, int32_t **dst, int *ndst)
+ int bcf_get_format_float(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, float **dst, int *ndst)
+ int bcf_get_format_char(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, char **dst, int *ndst)
+ int bcf_get_genotypes(const bcf_hdr_t *hdr, bcf1_t *line, int **dst, int *ndst)
+ int bcf_get_format_string(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, char ***dst, int *ndst)
+ int bcf_get_format_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, void **dst, int *ndst, int type)
+
+ #************************************************************************
+ # Helper functions
+ #************************************************************************
+
+ #
+ # bcf_hdr_id2int() - Translates string into numeric ID
+ # bcf_hdr_int2id() - Translates numeric ID into string
+ # @type: one of BCF_DT_ID, BCF_DT_CTG, BCF_DT_SAMPLE
+ # @id: tag name, such as: PL, DP, GT, etc.
+ #
+    # Returns -1 if the string is not in the dictionary, otherwise the numeric ID which identifies
+ # fields in BCF records.
+ #
+ int bcf_hdr_id2int(const bcf_hdr_t *hdr, int type, const char *id)
+ const char *bcf_hdr_int2id(const bcf_hdr_t *hdr, int type, int int_id)
+
+ # bcf_hdr_name2id() - Translates sequence names (chromosomes) into numeric ID
+ # bcf_hdr_id2name() - Translates numeric ID to sequence name
+ #
+ int bcf_hdr_name2id(const bcf_hdr_t *hdr, const char *id)
+ const char *bcf_hdr_id2name(const bcf_hdr_t *hdr, int rid)
+ const char *bcf_seqname(const bcf_hdr_t *hdr, bcf1_t *rec)
+
+ #
+ # bcf_hdr_id2*() - Macros for accessing bcf_idinfo_t
+ # @type: one of BCF_HL_FLT, BCF_HL_INFO, BCF_HL_FMT
+ # @int_id: return value of bcf_hdr_id2int, must be >=0
+ #
+ # The returned values are:
+ # bcf_hdr_id2length .. whether the number of values is fixed or variable, one of BCF_VL_*
+ # bcf_hdr_id2number .. the number of values, 0xfffff for variable length fields
+ # bcf_hdr_id2type .. the field type, one of BCF_HT_*
+ # bcf_hdr_id2coltype .. the column type, one of BCF_HL_*
+ #
+ # Notes: Prior to using the macros, the presence of the info should be
+ # tested with bcf_hdr_idinfo_exists().
+ #
+ int bcf_hdr_id2length(const bcf_hdr_t *hdr, int type, int int_id)
+ int bcf_hdr_id2number(const bcf_hdr_t *hdr, int type, int int_id)
+ int bcf_hdr_id2type(const bcf_hdr_t *hdr, int type, int int_id)
+ int bcf_hdr_id2coltype(const bcf_hdr_t *hdr, int type, int int_id)
+ int bcf_hdr_idinfo_exists(const bcf_hdr_t *hdr, int type, int int_id)
+ bcf_hrec_t *bcf_hdr_id2hrec(const bcf_hdr_t *hdr, int type, int col_type, int int_id)
+
+ void bcf_fmt_array(kstring_t *s, int n, int type, void *data)
+ uint8_t *bcf_fmt_sized_array(kstring_t *s, uint8_t *ptr)
+
+ void bcf_enc_vchar(kstring_t *s, int l, const char *a)
+ void bcf_enc_vint(kstring_t *s, int n, int32_t *a, int wsize)
+ void bcf_enc_vfloat(kstring_t *s, int n, float *a)
+
+ #************************************************************************
+ # BCF index
+ #
+ # Note that these functions work with BCFs only. See synced_bcf_reader.h
+ # which provides (amongst other things) an API to work transparently with
+ # both indexed BCFs and VCFs.
+ #************************************************************************
+
+ hts_idx_t *bcf_index_load2(const char *fn, const char *fnidx)
+ int bcf_index_build(const char *fn, int min_shift)
+ int bcf_index_build2(const char *fn, const char *fnidx, int min_shift)
+
+    #************************************************************************
+    # Typed value I/O
+    #************************************************************************
+
+ # Note that in contrast with BCFv2.1 specification, HTSlib implementation
+ # allows missing values in vectors. For integer types, the values 0x80,
+ # 0x8000, 0x80000000 are interpreted as missing values and 0x81, 0x8001,
+ # 0x80000001 as end-of-vector indicators. Similarly for floats, the value of
+ # 0x7F800001 is interpreted as a missing value and 0x7F800002 as an
+ # end-of-vector indicator.
+ # Note that the end-of-vector byte is not part of the vector.
+
+    # This trial BCF version (v2.2) is compatible with the VCF specification
+    # and makes it possible to handle vectors of different ploidy correctly
+    # in the presence of missing values.
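+
+    # A short illustration (plain Python) of why the float sentinels above
+    # cannot collide with real data: 0x7F800001 and 0x7F800002 are NaN bit
+    # patterns, so no ordinary float value ever equals them bit-for-bit:
+    #
+    #     import math, struct
+    #     missing = struct.unpack("<f", struct.pack("<I", 0x7F800001))[0]
+    #     assert math.isnan(missing)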
+
+ int32_t bcf_int8_vector_end
+ int32_t bcf_int16_vector_end
+ int32_t bcf_int32_vector_end
+ int32_t bcf_str_vector_end
+ int32_t bcf_int8_missing
+ int32_t bcf_int16_missing
+ int32_t bcf_int32_missing
+ int32_t bcf_str_missing
+
+ uint32_t bcf_float_vector_end
+ uint32_t bcf_float_missing
+
+ void bcf_float_set(float *ptr, uint32_t value)
+ void bcf_float_set_vector_end(float *x)
+ void bcf_float_set_missing(float *x)
+
+ int bcf_float_is_missing(float f)
+ int bcf_float_is_vector_end(float f)
+ void bcf_format_gt(bcf_fmt_t *fmt, int isample, kstring_t *str)
+ void bcf_enc_size(kstring_t *s, int size, int type)
+ int bcf_enc_inttype(long x)
+ void bcf_enc_int1(kstring_t *s, int32_t x)
+ int32_t bcf_dec_int1(const uint8_t *p, int type, uint8_t **q)
+ int32_t bcf_dec_typed_int1(const uint8_t *p, uint8_t **q)
+ int32_t bcf_dec_size(const uint8_t *p, uint8_t **q, int *type)
+
+ # These trivial wrappers are defined only for consistency with other parts of htslib
+ bcf1_t *bcf_init1()
+ int bcf_read1(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
+ int vcf_read1(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
+ int bcf_write1(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
+ int vcf_write1(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
+ void bcf_destroy1(bcf1_t *v)
+ void bcf_empty1(bcf1_t *v)
+ int vcf_parse1(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v)
+ void bcf_clear1(bcf1_t *v)
+ int vcf_format1(const bcf_hdr_t *h, const bcf1_t *v, kstring_t *s)
+
+ # Other nice wrappers
+ void bcf_itr_destroy(hts_itr_t *iter)
+ hts_itr_t *bcf_itr_queryi(const hts_idx_t *idx, int tid, int beg, int end)
+ hts_itr_t *bcf_itr_querys(const hts_idx_t *idx, const bcf_hdr_t *hdr, char *s)
+ int bcf_itr_next(htsFile *fp, hts_itr_t *iter, void *r)
+ hts_idx_t *bcf_index_load(const char *fn)
+ const char **bcf_index_seqnames(const hts_idx_t *idx, const bcf_hdr_t *hdr, int *nptr)
+
+
+# VCF/BCF utility functions
+cdef extern from "htslib/vcfutils.h" nogil:
+ struct kbitset_t
+
+ # bcf_trim_alleles() - remove ALT alleles unused in genotype fields
+ # @header: for access to BCF_DT_ID dictionary
+    # @line: VCF line obtained from vcf_parse1
+ #
+ # Returns the number of removed alleles on success or negative
+ # on error:
+ # -1 .. some allele index is out of bounds
+ int bcf_trim_alleles(const bcf_hdr_t *header, bcf1_t *line)
+
+ # bcf_remove_alleles() - remove ALT alleles according to bitmask @mask
+ # @header: for access to BCF_DT_ID dictionary
+ # @line: VCF line obtained from vcf_parse1
+ # @mask: alleles to remove
+ #
+ # If you have more than 31 alleles, then the integer bit mask will
+ # overflow, so use bcf_remove_allele_set instead
+ void bcf_remove_alleles(const bcf_hdr_t *header, bcf1_t *line, int mask)
+
+ # bcf_remove_allele_set() - remove ALT alleles according to bitset @rm_set
+ # @header: for access to BCF_DT_ID dictionary
+ # @line: VCF line obtained from vcf_parse1
+ # @rm_set: pointer to kbitset_t object with bits set for allele
+ # indexes to remove
+ #
+ # Number=A,R,G INFO and FORMAT fields will be updated accordingly.
+ void bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, kbitset_t *rm_set)
+
+ # bcf_calc_ac() - calculate the number of REF and ALT alleles
+ # @header: for access to BCF_DT_ID dictionary
+ # @line: VCF line obtained from vcf_parse1
+ # @ac: array of length line->n_allele
+    # @which: determines whether INFO/AN,AC and the indv fields should be used
+ #
+ # Returns 1 if the call succeeded, or 0 if the value could not
+ # be determined.
+ #
+ # The value of @which determines if existing INFO/AC,AN can be
+    # used (BCF_UN_INFO) and if indv fields can be split
+ # (BCF_UN_FMT).
+ int bcf_calc_ac(const bcf_hdr_t *header, bcf1_t *line, int *ac, int which)
+
+ # bcf_gt_type() - determines type of the genotype
+ # @fmt_ptr: the GT format field as set for example by set_fmt_ptr
+ # @isample: sample index (starting from 0)
+ # @ial: index of the 1st non-reference allele (starting from 1)
+ # @jal: index of the 2nd non-reference allele (starting from 1)
+ #
+ # Returns the type of the genotype (one of GT_HOM_RR, GT_HET_RA,
+ # GT_HOM_AA, GT_HET_AA, GT_HAPL_R, GT_HAPL_A or GT_UNKN). If $ial
+ # is not NULL and the genotype has one or more non-reference
+ # alleles, $ial will be set. In case of GT_HET_AA, $ial is the
+ # position of the allele which appeared first in ALT. If $jal is
+ # not null and the genotype is GT_HET_AA, $jal will be set and is
+ # the position of the second allele in ALT.
+ uint8_t GT_HOM_RR # note: the actual value of GT_* matters, used in dosage r2 calculation
+ uint8_t GT_HOM_AA
+ uint8_t GT_HET_RA
+ uint8_t GT_HET_AA
+ uint8_t GT_HAPL_R
+ uint8_t GT_HAPL_A
+ uint8_t GT_UNKN
+ int bcf_gt_type(bcf_fmt_t *fmt_ptr, int isample, int *ial, int *jal)
+
+ int bcf_acgt2int(char c)
+ char bcf_int2acgt(int i)
+
+ # bcf_ij2G() - common task: allele indexes to Number=G index (diploid)
+ # @i,j: allele indexes, 0-based, i<=j
+ # Returns index to the Number=G diploid array
+ uint32_t bcf_ij2G(uint32_t i, uint32_t j)
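+
+    # A minimal sketch of the mapping (assuming the standard triangular
+    # ordering of diploid Number=G fields, where genotype i/j with i <= j
+    # maps to j*(j+1)/2 + i):
+    #
+    #     def ij2G(i, j):                  # i <= j, both 0-based
+    #         return j * (j + 1) // 2 + i
+    #
+    #     [ij2G(0, 0), ij2G(0, 1), ij2G(1, 1)] == [0, 1, 2]   # 0/0, 0/1, 1/1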
+
+
+cdef class HTSFile(object):
+ cdef htsFile *htsfile # pointer to htsFile structure
+ cdef int64_t start_offset # BGZF offset of first record
+
+ cdef readonly object filename # filename as supplied by user
+ cdef readonly object mode # file opening mode
+ cdef readonly object index_filename # filename of index, if supplied by user
+
+ cdef readonly bint is_stream # Is htsfile a non-seekable stream
+ cdef readonly bint is_remote # Is htsfile a remote stream
+ cdef readonly bint duplicate_filehandle # Duplicate filehandle when opening via fh
+
+ cdef htsFile *_open_htsfile(self) except? NULL
--- /dev/null
+# cython: embedsignature=True
+# cython: profile=True
+# adds doc-strings for sphinx
+import os
+
+from posix.unistd cimport dup
+
+from pysam.libchtslib cimport *
+
+from pysam.libcutils cimport force_bytes, force_str, charptr_to_str, charptr_to_str_w_len
+from pysam.libcutils cimport encode_filename, from_string_and_size
+
+
+__all__ = ["get_verbosity", "set_verbosity"]
+
+
+########################################################################
+########################################################################
+## Constants
+########################################################################
+
+cdef int MAX_POS = 2 << 29
+cdef tuple FORMAT_CATEGORIES = ('UNKNOWN', 'ALIGNMENTS', 'VARIANTS', 'INDEX', 'REGIONS')
+cdef tuple FORMATS = ('UNKNOWN', 'BINARY_FORMAT', 'TEXT_FORMAT', 'SAM', 'BAM', 'BAI', 'CRAM', 'CRAI',
+ 'VCF', 'BCF', 'CSI', 'GZI', 'TBI', 'BED')
+cdef tuple COMPRESSION = ('NONE', 'GZIP', 'BGZF', 'CUSTOM')
+
+
+cpdef set_verbosity(int verbosity):
+ """Set htslib's hts_verbose global variable to the specified value."""
+ return hts_set_verbosity(verbosity)
+
+cpdef get_verbosity():
+ """Return the value of htslib's hts_verbose global variable."""
+ return hts_get_verbosity()
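+
+# A short usage sketch (assuming, as the __all__ declaration above suggests,
+# that these functions are re-exported at the pysam package level):
+#
+#     import pysam
+#     saved = pysam.get_verbosity()
+#     pysam.set_verbosity(0)         # silence htslib messages
+#     # ... open files, do work ...
+#     pysam.set_verbosity(saved)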
+
+
+class CallableValue(object):
+ def __init__(self, value):
+ self.value = value
+ def __call__(self):
+ return self.value
+ def __bool__(self):
+ return self.value
+ def __nonzero__(self):
+ return self.value
+ def __eq__(self, other):
+ return self.value == other
+ def __ne__(self, other):
+ return self.value != other
+
+
+CTrue = CallableValue(True)
+CFalse = CallableValue(False)
+
+
+cdef class HTSFile(object):
+ """
+ Base class for HTS file types
+ """
+ def __cinit__(self, *args, **kwargs):
+ self.htsfile = NULL
+ self.duplicate_filehandle = True
+
+ def __dealloc__(self):
+ if self.htsfile:
+ hts_close(self.htsfile)
+ self.htsfile = NULL
+
+ def __enter__(self):
+ return self
+
+ def __exit__(self, exc_type, exc_value, traceback):
+ self.close()
+ return False
+
+ @property
+ def category(self):
+ """General file format category. One of UNKNOWN, ALIGNMENTS,
+ VARIANTS, INDEX, REGIONS"""
+ if not self.htsfile:
+ raise ValueError('metadata not available on closed file')
+ return FORMAT_CATEGORIES[self.htsfile.format.category]
+
+ @property
+ def format(self):
+ """File format.
+
+ One of UNKNOWN, BINARY_FORMAT, TEXT_FORMAT, SAM, BAM,
+ BAI, CRAM, CRAI, VCF, BCF, CSI, GZI, TBI, BED.
+ """
+ if not self.htsfile:
+ raise ValueError('metadata not available on closed file')
+ return FORMATS[self.htsfile.format.format]
+
+ @property
+ def version(self):
+ """Tuple of file format version numbers (major, minor)"""
+ if not self.htsfile:
+ raise ValueError('metadata not available on closed file')
+ return self.htsfile.format.version.major, self.htsfile.format.version.minor
+
+ @property
+ def compression(self):
+ """File compression.
+
+ One of NONE, GZIP, BGZF, CUSTOM."""
+ if not self.htsfile:
+ raise ValueError('metadata not available on closed file')
+ return COMPRESSION[self.htsfile.format.compression]
+
+ @property
+ def description(self):
+ """Vaguely human readable description of the file format"""
+ if not self.htsfile:
+ raise ValueError('metadata not available on closed file')
+ cdef char *desc = hts_format_description(&self.htsfile.format)
+ try:
+ return charptr_to_str(desc)
+ finally:
+ free(desc)
+
+ @property
+ def is_open(self):
+ """return True if HTSFile is open and in a valid state."""
+ return CTrue if self.htsfile != NULL else CFalse
+
+ @property
+ def is_closed(self):
+ """return True if HTSFile is closed."""
+ return self.htsfile == NULL
+
+ @property
+ def closed(self):
+ """return True if HTSFile is closed."""
+ return self.htsfile == NULL
+
+ @property
+ def is_write(self):
+ """return True if HTSFile is open for writing"""
+ return self.htsfile != NULL and self.htsfile.is_write != 0
+
+ @property
+ def is_read(self):
+ """return True if HTSFile is open for reading"""
+ return self.htsfile != NULL and self.htsfile.is_write == 0
+
+ @property
+ def is_sam(self):
+ """return True if HTSFile is reading or writing a SAM alignment file"""
+ return self.htsfile != NULL and self.htsfile.format.format == sam
+
+ @property
+ def is_bam(self):
+ """return True if HTSFile is reading or writing a BAM alignment file"""
+ return self.htsfile != NULL and self.htsfile.format.format == bam
+
+ @property
+ def is_cram(self):
+ """return True if HTSFile is reading or writing a BAM alignment file"""
+ return self.htsfile != NULL and self.htsfile.format.format == cram
+
+ @property
+ def is_vcf(self):
+ """return True if HTSFile is reading or writing a VCF variant file"""
+ return self.htsfile != NULL and self.htsfile.format.format == vcf
+
+ @property
+ def is_bcf(self):
+ """return True if HTSFile is reading or writing a BCF variant file"""
+ return self.htsfile != NULL and self.htsfile.format.format == bcf
+
+ def reset(self):
+ """reset file position to beginning of file just after the header.
+
+ Returns
+ -------
+
+ The file position after moving the file pointer.
+
+ """
+ return self.seek(self.start_offset)
+
+ def seek(self, uint64_t offset):
+ """move file pointer to position *offset*, see :meth:`pysam.HTSFile.tell`."""
+ if not self.is_open:
+ raise ValueError('I/O operation on closed file')
+ if self.is_stream:
+ raise OSError('seek not available in streams')
+
+ cdef int64_t ret
+ if self.htsfile.format.compression != no_compression:
+ with nogil:
+ ret = bgzf_seek(hts_get_bgzfp(self.htsfile), offset, SEEK_SET)
+ else:
+ with nogil:
+ ret = hts_useek(self.htsfile, <int>offset, SEEK_SET)
+ return ret
+
+ def tell(self):
+ """return current file position, see :meth:`pysam.HTSFile.seek`."""
+ if not self.is_open:
+ raise ValueError('I/O operation on closed file')
+ if self.is_stream:
+ raise OSError('tell not available in streams')
+
+ cdef int64_t ret
+ if self.htsfile.format.compression != no_compression:
+ with nogil:
+ ret = bgzf_tell(hts_get_bgzfp(self.htsfile))
+ else:
+ with nogil:
+ ret = hts_utell(self.htsfile)
+ return ret
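+
+    # A minimal usage sketch for tell()/seek() (illustrative; on
+    # BGZF-compressed files the offset is a BGZF virtual offset, so it is
+    # only meaningful when passed back to seek() on the same file):
+    #
+    #     pos = f.tell()       # f: an open, seekable HTSFile subclass
+    #     # ... read some records ...
+    #     f.seek(pos)          # rewind to the remembered position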
+
+ cdef htsFile *_open_htsfile(self) except? NULL:
+ cdef char *cfilename
+ cdef char *cmode = self.mode
+ cdef int fd, dup_fd
+
+ if isinstance(self.filename, bytes):
+ cfilename = self.filename
+ with nogil:
+ return hts_open(cfilename, cmode)
+ else:
+ if isinstance(self.filename, int):
+ fd = self.filename
+ else:
+ fd = self.filename.fileno()
+
+ if self.duplicate_filehandle:
+ dup_fd = dup(fd)
+ else:
+ dup_fd = fd
+
+ # Replicate mode normalization done in hts_open_format
+ smode = self.mode.replace(b'b', b'').replace(b'c', b'')
+ if b'b' in self.mode:
+ smode += b'b'
+ elif b'c' in self.mode:
+ smode += b'c'
+ cmode = smode
+
+ hfile = hdopen(dup_fd, cmode)
+ if hfile == NULL:
+ raise IOError('Cannot create hfile')
+
+ try:
+ # filename.name can be an int
+ filename = str(self.filename.name)
+ except AttributeError:
+ filename = '<fd:{}>'.format(fd)
+
+ filename = encode_filename(filename)
+ cfilename = filename
+ with nogil:
+ return hts_hopen(hfile, cfilename, cmode)
+
+ def _exists(self):
+ """return False iff file is local, a file and exists.
+ """
+ return (not isinstance(self.filename, (str, bytes)) or
+ self.filename == b'-' or
+ self.is_remote or
+ os.path.exists(self.filename))
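+
+
+# A short illustration of the metadata properties (a sketch: HTSFile is a
+# base class, so these are normally used through a subclass such as
+# AlignmentFile; "ex1.bam" is a placeholder filename):
+#
+#     import pysam
+#     with pysam.AlignmentFile("ex1.bam", "rb") as f:
+#         assert f.is_bam and f.compression == "BGZF"
+#         print(f.category, f.format, f.version)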
--- /dev/null
+from pysam.libcalignmentfile cimport AlignedSegment, AlignmentFile
+
+#################################################
+# Compatibility Layer for pysam < 0.8
+
+# import all declarations from htslib
+from pysam.libchtslib cimport *
+
+cdef class AlignedRead(AlignedSegment):
+ pass
+
+cdef class Samfile(AlignmentFile):
+ pass
+
+# import the conversion functions
+cdef extern from "htslib_util.h":
+
+ # add *nbytes* into the variable length data of *src* at *pos*
+ bam1_t * pysam_bam_update(bam1_t * b,
+ size_t nbytes_old,
+ size_t nbytes_new,
+ uint8_t * pos)
+
+ # now: static
+ int aux_type2size(int)
+
+ char * pysam_bam_get_qname(bam1_t * b)
+ uint32_t * pysam_bam_get_cigar(bam1_t * b)
+ uint8_t * pysam_bam_get_seq(bam1_t * b)
+ uint8_t * pysam_bam_get_qual(bam1_t * b)
+ uint8_t * pysam_bam_get_aux(bam1_t * b)
+ int pysam_bam_get_l_aux(bam1_t * b)
+ char pysam_bam_seqi(uint8_t * s, int i)
+
+ uint16_t pysam_get_bin(bam1_t * b)
+ uint8_t pysam_get_qual(bam1_t * b)
+ uint8_t pysam_get_l_qname(bam1_t * b)
+ uint16_t pysam_get_flag(bam1_t * b)
+ uint16_t pysam_get_n_cigar(bam1_t * b)
+ void pysam_set_bin(bam1_t * b, uint16_t v)
+ void pysam_set_qual(bam1_t * b, uint8_t v)
+ void pysam_set_l_qname(bam1_t * b, uint8_t v)
+ void pysam_set_flag(bam1_t * b, uint16_t v)
+ void pysam_set_n_cigar(bam1_t * b, uint16_t v)
+ void pysam_update_flag(bam1_t * b, uint16_t v, uint16_t flag)
--- /dev/null
+# cython: embedsignature=True
+# cython: profile=True
+# adds doc-strings for sphinx
+import tempfile
+import os
+import sys
+import types
+import itertools
+import struct
+import ctypes
+import collections
+import re
+import platform
+import warnings
+from cpython cimport PyErr_SetString, \
+ PyBytes_Check, \
+ PyUnicode_Check, \
+ PyBytes_FromStringAndSize
+
+from cpython.version cimport PY_MAJOR_VERSION
+
+from pysam.libcalignmentfile cimport AlignmentFile, AlignedSegment
+
+
+cdef class Samfile(AlignmentFile):
+ '''Deprecated alternative for :class:`~pysam.AlignmentFile`
+
+ Added for backwards compatibility with pysam <= 0.8.0
+ '''
+ pass
+
+
+cdef class AlignedRead(AlignedSegment):
+ '''Deprecated alternative for :class:`~pysam.AlignedSegment`
+
+ Added for backwards compatibility with pysam <= 0.8.0
+ '''
+ pass
+
+
+__all__ = ['Samfile', 'AlignedRead']
+
+
--- /dev/null
+from libc.stdint cimport int8_t, int16_t, int32_t, int64_t
+from libc.stdint cimport uint8_t, uint16_t, uint32_t, uint64_t
+from libc.stdlib cimport malloc, calloc, realloc, free
+from libc.string cimport memcpy, memcmp, strncpy, strlen, strdup
+from libc.stdio cimport FILE, printf
+
+# Note: this replaces python "open"!
+cdef extern from "fcntl.h":
+ int open(char *pathname, int flags)
+
+cdef extern from "unistd.h" nogil:
+ ctypedef int ssize_t
+ ssize_t read(int fd, void *buf, size_t count)
+ int close(int fd)
+
+from pysam.libchtslib cimport hts_idx_t, hts_itr_t, htsFile, \
+ tbx_t, kstring_t, BGZF, HTSFile
+
+
+# These functions are put here and not in chtslib.pxd in order
+# to avoid warnings for unused functions.
+cdef extern from "pysam_stream.h" nogil:
+
+ ctypedef struct kstream_t:
+ pass
+
+ ctypedef struct kseq_t:
+ kstring_t name
+ kstring_t comment
+ kstring_t seq
+ kstring_t qual
+
+ kseq_t *kseq_init(BGZF *)
+ int kseq_read(kseq_t *)
+ void kseq_destroy(kseq_t *)
+ kstream_t *ks_init(BGZF *)
+ void ks_destroy(kstream_t *)
+
+ # Retrieve characters from stream until delimiter
+ # is reached placing results in str.
+ int ks_getuntil(kstream_t *,
+ int delimiter,
+ kstring_t * str,
+ int * dret)
+
+
+cdef class tabix_file_iterator:
+ cdef BGZF * fh
+ cdef kstream_t * kstream
+ cdef kstring_t buffer
+ cdef size_t size
+ cdef Parser parser
+ cdef int fd
+ cdef int duplicated_fd
+ cdef infile
+
+ cdef __cnext__(self)
+
+
+cdef class TabixFile(HTSFile):
+ # pointer to index structure
+ cdef tbx_t * index
+
+ cdef readonly object filename_index
+
+ cdef Parser parser
+
+ cdef encoding
+
+
+cdef class Parser:
+ cdef encoding
+ cdef parse(self, char * buffer, int len)
+
+
+cdef class asTuple(Parser):
+ cdef parse(self, char * buffer, int len)
+
+
+cdef class asGTF(Parser):
+ pass
+
+
+cdef class asBed(Parser):
+ pass
+
+
+cdef class asVCF(Parser):
+ pass
+
+
+cdef class TabixIterator:
+ cdef hts_itr_t * iterator
+ cdef TabixFile tabixfile
+ cdef kstring_t buffer
+ cdef encoding
+ cdef int __cnext__(self)
+
+
+cdef class TabixIteratorParsed(TabixIterator):
+ cdef Parser parser
+
+
+cdef class GZIterator:
+ cdef object _filename
+ cdef BGZF * gzipfile
+ cdef kstream_t * kstream
+ cdef kstring_t buffer
+ cdef int __cnext__(self)
+ cdef encoding
+
+
+cdef class GZIteratorHead(GZIterator):
+ pass
+
+
+cdef class GZIteratorParsed(GZIterator):
+ cdef Parser parser
+
+
+# Compatibility Layer for pysam < 0.8
+cdef class Tabixfile(TabixFile):
+ pass
--- /dev/null
+# cython: embedsignature=True
+# cython: profile=True
+###############################################################################
+###############################################################################
+# Cython wrapper for access to tabix indexed files in bgzf format
+###############################################################################
+# The principal classes and functions defined in this module are:
+#
+# class TabixFile class wrapping tabix indexed files in bgzf format
+#
+# class asTuple Parser class for tuples
+# class asGTF Parser class for GTF formatted rows
+# class asBed Parser class for Bed formatted rows
+# class asVCF Parser class for VCF formatted rows
+#
+# class tabix_generic_iterator Streamed iterator of bgzf formatted files
+#
+# Additionally this module defines several additional classes that are part
+# of the internal API. These are:
+#
+# class Parser base class for parsers of tab-separated rows
+# class tabix_file_iterator
+# class TabixIterator iterator class over rows in bgzf file
+# class EmptyIterator
+#
+# For backwards compatibility, the following classes are also defined:
+#
+# class Tabixfile equivalent to TabixFile
+#
+###############################################################################
+#
+# The MIT License
+#
+# Copyright (c) 2015 Andreas Heger
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+#
+###############################################################################
+import os
+import sys
+
+from libc.stdio cimport printf, fprintf, stderr
+from libc.string cimport strerror
+from libc.errno cimport errno
+from posix.unistd cimport dup
+
+from cpython cimport PyErr_SetString, PyBytes_Check, \
+ PyUnicode_Check, PyBytes_FromStringAndSize, \
+ PyObject_AsFileDescriptor
+
+from cpython.version cimport PY_MAJOR_VERSION
+
+cimport pysam.libctabixproxies as ctabixproxies
+
+from pysam.libchtslib cimport htsFile, hts_open, hts_close, HTS_IDX_START,\
+ BGZF, bgzf_open, bgzf_dopen, bgzf_close, bgzf_write, \
+ tbx_index_build, tbx_index_load, tbx_itr_queryi, tbx_itr_querys, \
+ tbx_conf_t, tbx_seqnames, tbx_itr_next, tbx_itr_destroy, \
+ tbx_destroy, hisremote, region_list
+
+from pysam.libcutils cimport force_bytes, force_str, charptr_to_str
+from pysam.libcutils cimport encode_filename, from_string_and_size
+
+cdef class Parser:
+
+ def __init__(self, encoding="ascii"):
+ self.encoding = encoding
+
+ def set_encoding(self, encoding):
+ self.encoding = encoding
+
+ def get_encoding(self):
+ return self.encoding
+
+ cdef parse(self, char * buffer, int length):
+ raise NotImplementedError(
+ 'parse method of %s not implemented' % str(self))
+
+ def __call__(self, char * buffer, int length):
+ return self.parse(buffer, length)
+
+
+cdef class asTuple(Parser):
+ '''converts a :term:`tabix row` into a python tuple.
+
+ A field in a row is accessed by numeric index.
+ '''
+ cdef parse(self, char * buffer, int len):
+ cdef ctabixproxies.TupleProxy r
+ r = ctabixproxies.TupleProxy(self.encoding)
+ # need to copy - there were some
+ # persistence issues with "present"
+ r.copy(buffer, len)
+ return r
+
+
+cdef class asGTF(Parser):
+ '''converts a :term:`tabix row` into a GTF record with the following
+ fields:
+
+ +----------+----------+-------------------------------+
+ |*Column* |*Name* |*Content* |
+ +----------+----------+-------------------------------+
+ |1 |contig |the chromosome name |
+ +----------+----------+-------------------------------+
+ |2 |feature |The feature type |
+ +----------+----------+-------------------------------+
+ |3 |source |The feature source |
+ +----------+----------+-------------------------------+
+ |4 |start |genomic start coordinate |
+ | | |(0-based) |
+ +----------+----------+-------------------------------+
+ |5 |end |genomic end coordinate |
+ | | |(0-based) |
+ +----------+----------+-------------------------------+
+ |6 |score |feature score |
+ +----------+----------+-------------------------------+
+ |7 |strand |strand |
+ +----------+----------+-------------------------------+
+ |8 |frame |frame |
+ +----------+----------+-------------------------------+
+ |9 |attributes|the attribute field |
+ +----------+----------+-------------------------------+
+
+ GTF formatted entries also define the following fields that
+ are derived from the attributes field:
+
+ +--------------------+------------------------------+
+ |*Name* |*Content* |
+ +--------------------+------------------------------+
+ |gene_id |the gene identifier |
+ +--------------------+------------------------------+
+ |transcript_id |the transcript identifier |
+ +--------------------+------------------------------+
+
+ '''
+ cdef parse(self, char * buffer, int len):
+ cdef ctabixproxies.GTFProxy r
+ r = ctabixproxies.GTFProxy(self.encoding)
+ r.copy(buffer, len)
+ return r
+
+
+cdef class asBed(Parser):
+ '''converts a :term:`tabix row` into a bed record
+ with the following fields:
+
+ +-----------+-----------+------------------------------------------+
+ |*Column* |*Field* |*Contents* |
+ | | | |
+ +-----------+-----------+------------------------------------------+
+ |1 |contig |contig |
+ | | | |
+ +-----------+-----------+------------------------------------------+
+ |2 |start |genomic start coordinate (zero-based) |
+ +-----------+-----------+------------------------------------------+
+ |3 |end |genomic end coordinate plus one |
+ | | |(zero-based) |
+ +-----------+-----------+------------------------------------------+
+ |4 |name |name of feature. |
+ +-----------+-----------+------------------------------------------+
+ |5 |score |score of feature |
+ +-----------+-----------+------------------------------------------+
+ |6 |strand |strand of feature |
+ +-----------+-----------+------------------------------------------+
+ |7 |thickStart |thickStart |
+ +-----------+-----------+------------------------------------------+
+ |8 |thickEnd |thickEnd |
+ +-----------+-----------+------------------------------------------+
+ |9 |itemRGB |itemRGB |
+ +-----------+-----------+------------------------------------------+
+    |10        |blockCount |number of blocks                          |
+ +-----------+-----------+------------------------------------------+
+ |11 |blockSizes |',' separated string of block sizes |
+ +-----------+-----------+------------------------------------------+
+ |12 |blockStarts|',' separated string of block genomic |
+ | | |start positions |
+ +-----------+-----------+------------------------------------------+
+
+ Only the first three fields are required. Additional
+ fields are optional, but if one is defined, all the preceding
+    ones need to be defined as well.
+
+ '''
+ cdef parse(self, char * buffer, int len):
+ cdef ctabixproxies.BedProxy r
+ r = ctabixproxies.BedProxy(self.encoding)
+ r.copy(buffer, len)
+ return r
+
+
+cdef class asVCF(Parser):
+ '''converts a :term:`tabix row` into a VCF record with
+ the following fields:
+
+ +----------+---------+------------------------------------+
+ |*Column* |*Field* |*Contents* |
+ | | | |
+ +----------+---------+------------------------------------+
+ |1 |contig |chromosome |
+ +----------+---------+------------------------------------+
+ |2 |pos |chromosomal position, zero-based |
+ +----------+---------+------------------------------------+
+ |3 |id |id |
+ +----------+---------+------------------------------------+
+ |4 |ref |reference allele |
+ +----------+---------+------------------------------------+
+ |5 |alt |alternate alleles |
+ +----------+---------+------------------------------------+
+ |6 |qual |quality |
+ +----------+---------+------------------------------------+
+ |7 |filter |filter |
+ +----------+---------+------------------------------------+
+ |8 |info |info |
+ +----------+---------+------------------------------------+
+ |9 |format |format specifier. |
+ +----------+---------+------------------------------------+
+
+ Access to genotypes is via index::
+
+ contig = vcf.contig
+ first_sample_genotype = vcf[0]
+ second_sample_genotype = vcf[1]
+
+ '''
+ cdef parse(self, char * buffer, int len):
+ cdef ctabixproxies.VCFProxy r
+ r = ctabixproxies.VCFProxy(self.encoding)
+ r.copy(buffer, len)
+ return r
+
+
+cdef class TabixFile:
+ """Random access to bgzf formatted files that
+ have been indexed by :term:`tabix`.
+
+ The file is automatically opened. The index file of file
+ ``<filename>`` is expected to be called ``<filename>.tbi``
+ by default (see parameter `index`).
+
+ Parameters
+ ----------
+
+ filename : string
+ Filename of bgzf file to be opened.
+
+ index : string
+ The filename of the index. If not set, the default is to
+        assume that the index is called ``filename.tbi``.
+
+ mode : char
+ The file opening mode. Currently, only ``r`` is permitted.
+
+ parser : :class:`pysam.Parser`
+
+ sets the default parser for this tabix file. If `parser`
+ is None, the results are returned as an unparsed string.
+ Otherwise, `parser` is assumed to be a functor that will return
+ parsed data (see for example :class:`~pysam.asTuple` and
+ :class:`~pysam.asGTF`).
+
+ encoding : string
+
+ The encoding passed to the parser
+
+ Raises
+ ------
+
+ ValueError
+ if index file is missing.
+
+ IOError
+ if file could not be opened
+ """
+ def __cinit__(self,
+ filename,
+ mode='r',
+ parser=None,
+ index=None,
+ encoding="ascii",
+ *args,
+ **kwargs ):
+
+ self.htsfile = NULL
+ self.is_remote = False
+ self.is_stream = False
+ self.parser = parser
+ self._open(filename, mode, index, *args, **kwargs)
+ self.encoding = encoding
+
+ def _open( self,
+ filename,
+ mode='r',
+ index=None,
+ ):
+ '''open a :term:`tabix file` for reading.'''
+
+ if mode != 'r':
+ raise ValueError("invalid file opening mode `%s`" % mode)
+
+ if self.htsfile != NULL:
+ self.close()
+ self.htsfile = NULL
+
+ filename_index = index or (filename + ".tbi")
+ # encode all the strings to pass to tabix
+ self.filename = encode_filename(filename)
+ self.filename_index = encode_filename(filename_index)
+
+ self.is_stream = self.filename == b'-'
+ self.is_remote = hisremote(self.filename)
+
+ if not self.is_remote:
+ if not os.path.exists(filename):
+ raise IOError("file `%s` not found" % filename)
+
+ if not os.path.exists(filename_index):
+ raise IOError("index `%s` not found" % filename_index)
+
+ # open file
+ cdef char *cfilename = self.filename
+ with nogil:
+ self.htsfile = hts_open(cfilename, 'r')
+
+ if self.htsfile == NULL:
+ raise IOError("could not open file `%s`" % filename)
+
+ #if self.htsfile.format.category != region_list:
+ # raise ValueError("file does not contain region data")
+
+ cfilename = self.filename_index
+ with nogil:
+ self.index = tbx_index_load(cfilename)
+
+ if self.index == NULL:
+ raise IOError("could not open index for `%s`" % filename)
+
+ if not self.is_stream:
+ self.start_offset = self.tell()
+
+ def _dup(self):
+ '''return a copy of this tabix file.
+
+ The file is being re-opened.
+ '''
+ return TabixFile(self.filename,
+ mode="r",
+ parser=self.parser,
+ index=self.filename_index,
+ encoding=self.encoding)
+
+ def fetch(self,
+ reference=None,
+ start=None,
+ end=None,
+ region=None,
+ parser=None,
+ multiple_iterators=False):
+ '''fetch one or more rows in a :term:`region` using 0-based
+ indexing. The region is specified by :term:`reference`,
+ *start* and *end*. Alternatively, a samtools :term:`region`
+ string can be supplied.
+
+ Without *reference* or *region* all entries will be fetched.
+
+ If only *reference* is set, all reads matching on *reference*
+ will be fetched.
+
+ If *parser* is None, the default parser will be used for
+ parsing.
+
+ Set *multiple_iterators* to true if you will be using multiple
+ iterators on the same file at the same time. The iterator
+        returned will receive its own copy of a filehandle to the file,
+        effectively re-opening the file. Re-opening a file creates
+ some overhead, so beware.
+
+ '''
+ if not self.is_open():
+ raise ValueError("I/O operation on closed file")
+
+ # convert coordinates to region string, which is one-based
+ if reference:
+ if end is not None:
+ if end < 0:
+ raise ValueError("end out of range (%i)" % end)
+ if start is None:
+ start = 0
+
+ if start < 0:
+                    raise ValueError("start out of range (%i)" % start)
+ elif start > end:
+ raise ValueError(
+                        'start (%i) > end (%i)' % (start, end))
+ elif start == end:
+ return EmptyIterator()
+ else:
+ region = '%s:%i-%i' % (reference, start + 1, end)
+ elif start is not None:
+ if start < 0:
+                raise ValueError("start out of range (%i)" % start)
+ region = '%s:%i' % (reference, start + 1)
+ else:
+ region = reference
+
+ # get iterator
+ cdef hts_itr_t * itr
+ cdef char *cstr
+ cdef TabixFile fileobj
+
+ # reopen the same file if necessary
+ if multiple_iterators:
+ fileobj = self._dup()
+ else:
+ fileobj = self
+
+ if region is None:
+ # without region or reference - iterate from start
+ with nogil:
+ itr = tbx_itr_queryi(fileobj.index,
+ HTS_IDX_START,
+ 0,
+ 0)
+ else:
+ s = force_bytes(region, encoding=fileobj.encoding)
+ cstr = s
+ with nogil:
+ itr = tbx_itr_querys(fileobj.index, cstr)
+
+ if itr == NULL:
+ if region is None:
+ if len(self.contigs) > 0:
+                    # when accessing a tabix file created prior to tabix 1.0
+ # the full-file iterator is empty.
+ raise ValueError(
+ "could not create iterator, possible "
+ "tabix version mismatch")
+ else:
+ # possible reason is that the file is empty -
+ # return an empty iterator
+ return EmptyIterator()
+ else:
+ raise ValueError(
+ "could not create iterator for region '%s'" %
+ region)
+
+ # use default parser if no parser is specified
+ if parser is None:
+ parser = fileobj.parser
+
+ cdef TabixIterator a
+ if parser is None:
+ a = TabixIterator(encoding=fileobj.encoding)
+ else:
+ parser.set_encoding(fileobj.encoding)
+ a = TabixIteratorParsed(parser)
+
+ a.tabixfile = fileobj
+ a.iterator = itr
+
+ return a
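+
+    # A short usage sketch for fetch() (filenames are placeholders; the data
+    # file must be bgzip-compressed and tabix-indexed, e.g. by tabix_index):
+    #
+    #     tbx = TabixFile("example.bed.gz", parser=asBed())
+    #     for row in tbx.fetch("chr1", 999, 2000):
+    #         print(row.contig, row.start, row.end)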
+
+ ###############################################################
+ ###############################################################
+ ###############################################################
+ ## properties
+ ###############################################################
+ property header:
+ '''the file header.
+
+ The file header consists of the lines at the beginning of a
+ file that are prefixed by the comment character ``#``.
+
+ .. note::
+ The header is returned as an iterator presenting lines
+ without the newline character.
+
+ .. note::
+ The header is only available for local files. For remote
+ files an Attribute Error is raised.
+
+ '''
+
+ def __get__(self):
+ if self.is_remote:
+ raise AttributeError(
+ "the header is not available for remote files")
+ return GZIteratorHead(self.filename)
+
+ property contigs:
+ '''list of chromosome names'''
+ def __get__(self):
+ cdef char ** sequences
+ cdef int nsequences
+
+ with nogil:
+ sequences = tbx_seqnames(self.index, &nsequences)
+ cdef int x
+ result = []
+ for x from 0 <= x < nsequences:
+ result.append(force_str(sequences[x]))
+
+ # htslib instructions:
+ # only free container, not the sequences themselves
+ free(sequences)
+
+ return result
+
+ def close(self):
+ '''
+ closes the :class:`pysam.TabixFile`.'''
+ if self.htsfile != NULL:
+ hts_close(self.htsfile)
+ self.htsfile = NULL
+ if self.index != NULL:
+ tbx_destroy(self.index)
+ self.index = NULL
+
+ def __dealloc__( self ):
+ # remember: dealloc cannot call other python methods
+ # note: no doc string
+ # note: __del__ is not called.
+ if self.htsfile != NULL:
+ hts_close(self.htsfile)
+ self.htsfile = NULL
+ if self.index != NULL:
+ tbx_destroy(self.index)
+
+
+cdef class TabixIterator:
+ """iterates over rows in *tabixfile* in region
+ given by *tid*, *start* and *end*.
+ """
+
+ def __init__(self, encoding="ascii"):
+ self.encoding = encoding
+
+ def __iter__(self):
+ self.buffer.s = NULL
+ self.buffer.l = 0
+ self.buffer.m = 0
+
+ return self
+
+ cdef int __cnext__(self):
+ '''iterate to next element.
+
+ Return -5 if file has been closed when this function
+ was called.
+ '''
+ if self.tabixfile.htsfile == NULL:
+ return -5
+
+ cdef int retval
+
+ while 1:
+ with nogil:
+ retval = tbx_itr_next(
+ self.tabixfile.htsfile,
+ self.tabixfile.index,
+ self.iterator,
+ &self.buffer)
+
+ if retval < 0:
+ break
+
+ if self.buffer.s[0] != '#':
+ break
+
+ return retval
+
+ def __next__(self):
+ """python version of next().
+
+ pyrex uses this non-standard name instead of next()
+ """
+
+ cdef int retval = self.__cnext__()
+ if retval == -5:
+ raise IOError("iteration on closed file")
+ elif retval < 0:
+ raise StopIteration
+
+ return charptr_to_str(self.buffer.s, self.encoding)
+
+ def next(self):
+ return self.__next__()
+
+ def __dealloc__(self):
+ if <void*>self.iterator != NULL:
+ tbx_itr_destroy(self.iterator)
+ if self.buffer.s != NULL:
+ free(self.buffer.s)
+
+
+class EmptyIterator:
+ '''empty iterator'''
+
+ def __iter__(self):
+ return self
+
+ def next(self):
+ raise StopIteration()
+
+ def __next__(self):
+ raise StopIteration()
+
+
+cdef class TabixIteratorParsed(TabixIterator):
+ """iterates over mapped reads in a region.
+
+ The *parser* determines the encoding.
+
+ Returns parsed data.
+ """
+
+ def __init__(self,
+ Parser parser):
+
+ TabixIterator.__init__(self)
+ self.parser = parser
+
+ def __next__(self):
+ """python version of next().
+
+ pyrex uses this non-standard name instead of next()
+ """
+
+ cdef int retval = self.__cnext__()
+ if retval == -5:
+ raise IOError("iteration on closed file")
+ elif retval < 0:
+ raise StopIteration
+
+ return self.parser.parse(self.buffer.s,
+ self.buffer.l)
+
+
+cdef class GZIterator:
+ def __init__(self, filename, int buffer_size=65536, encoding="ascii"):
+ '''iterate line-by-line through gzip (or bgzip)
+ compressed file.
+ '''
+ if not os.path.exists(filename):
+ raise IOError("No such file or directory: %s" % filename)
+
+ filename = encode_filename(filename)
+ cdef char *cfilename = filename
+ with nogil:
+ self.gzipfile = bgzf_open(cfilename, "r")
+ self._filename = filename
+ self.kstream = ks_init(self.gzipfile)
+ self.encoding = encoding
+
+ self.buffer.l = 0
+ self.buffer.m = 0
+ self.buffer.s = <char*>malloc(buffer_size)
+
+ def __dealloc__(self):
+ '''close file.'''
+ if self.gzipfile != NULL:
+ bgzf_close(self.gzipfile)
+ self.gzipfile = NULL
+ if self.buffer.s != NULL:
+ free(self.buffer.s)
+ if self.kstream != NULL:
+ ks_destroy(self.kstream)
+
+ def __iter__(self):
+ return self
+
+ cdef int __cnext__(self):
+ cdef int dret = 0
+ cdef int retval = 0
+ while 1:
+ with nogil:
+ retval = ks_getuntil(self.kstream, '\n', &self.buffer, &dret)
+
+ if retval < 0:
+ break
+
+ return dret
+ return -1
+
+ def __next__(self):
+ """python version of next().
+ """
+ cdef int retval = self.__cnext__()
+ if retval < 0:
+ raise StopIteration
+ return force_str(self.buffer.s, self.encoding)
+
+
+cdef class GZIteratorHead(GZIterator):
+ '''iterate line-by-line through gzip (or bgzip)
+ compressed file returning comments at top of file.
+ '''
+
+ def __next__(self):
+ """python version of next().
+ """
+ cdef int retval = self.__cnext__()
+ if retval < 0:
+ raise StopIteration
+ if self.buffer.s[0] == '#':
+ return self.buffer.s
+ else:
+ raise StopIteration
+
+
+cdef class GZIteratorParsed(GZIterator):
+ '''iterate line-by-line through gzip (or bgzip)
+    compressed file returning parsed rows.
+ '''
+
+ def __init__(self, parser):
+ self.parser = parser
+
+ def __next__(self):
+ """python version of next().
+ """
+ cdef int retval = self.__cnext__()
+ if retval < 0:
+ raise StopIteration
+
+ return self.parser.parse(self.buffer.s,
+ self.buffer.l)
+
+
+def tabix_compress(filename_in,
+ filename_out,
+ force=False):
+ '''compress *filename_in* writing the output to *filename_out*.
+
+ Raise an IOError if *filename_out* already exists, unless *force*
+ is set.
+ '''
+
+ if not force and os.path.exists(filename_out):
+ raise IOError(
+ "Filename '%s' already exists, use *force* to "
+ "overwrite" % filename_out)
+
+ cdef int WINDOW_SIZE
+ cdef int c, r
+ cdef void * buffer
+ cdef BGZF * fp
+ cdef int fd_src
+ cdef bint is_empty = True
+ cdef int O_RDONLY
+ O_RDONLY = os.O_RDONLY
+
+ WINDOW_SIZE = 64 * 1024
+
+ fn = encode_filename(filename_out)
+ cdef char *cfn = fn
+ with nogil:
+ fp = bgzf_open(cfn, "w")
+ if fp == NULL:
+ raise IOError("could not open '%s' for writing" % filename_out)
+
+ fn = encode_filename(filename_in)
+ fd_src = open(fn, O_RDONLY)
+    if fd_src < 0:
+ raise IOError("could not open '%s' for reading" % filename_in)
+
+ buffer = malloc(WINDOW_SIZE)
+ c = 1
+
+ while c > 0:
+ with nogil:
+ c = read(fd_src, buffer, WINDOW_SIZE)
+ if c > 0:
+ is_empty = False
+ r = bgzf_write(fp, buffer, c)
+ if r < 0:
+ free(buffer)
+ raise OSError("writing failed")
+
+ free(buffer)
+ r = bgzf_close(fp)
+ if r < 0:
+ raise OSError("error %i when writing to file %s" % (r, filename_out))
+
+ r = close(fd_src)
+ # an empty file will return with -1, thus ignore this.
+ if r < 0:
+ if not (r == -1 and is_empty):
+ raise OSError("error %i when closing file %s" % (r, filename_in))
+
+
+def tabix_index( filename,
+ force = False,
+ seq_col = None,
+ start_col = None,
+ end_col = None,
+ preset = None,
+ meta_char = "#",
+ zerobased = False,
+ int min_shift = -1,
+ ):
+ '''index tab-separated *filename* using tabix.
+
+ An existing index will not be overwritten unless
+ *force* is set.
+
+ The index will be built from coordinates
+ in columns *seq_col*, *start_col* and *end_col*.
+
+ The contents of *filename* have to be sorted by
+ contig and position - the method does not check
+ if the file is sorted.
+
+ Column indices are 0-based. Coordinates in the file
+ are assumed to be 1-based.
+
+ If *preset* is provided, the column coordinates
+ are taken from a preset. Valid values for preset
+ are "gff", "bed", "sam", "vcf", psltbl", "pileup".
+
+ Lines beginning with *meta_char* and the first
+ *line_skip* lines will be skipped.
+
+ If *filename* does not end in ".gz", it will be automatically
+ compressed. The original file will be removed and only the
+ compressed file will be retained.
+
+ If *filename* ends in *gz*, the file is assumed to be already
+ compressed with bgzf.
+
+    *min_shift* sets the minimal interval size to 1<<INT; 0 for the
+ old tabix index. The default of -1 is changed inside htslib to
+ the old tabix default of 0.
+
+ returns the filename of the compressed data
+
+ '''
+
+ if not os.path.exists(filename):
+ raise IOError("No such file '%s'" % filename)
+
+ if preset is None and \
+ (seq_col is None or start_col is None or end_col is None):
+ raise ValueError(
+ "neither preset nor seq_col,start_col and end_col given")
+
+ if not filename.endswith(".gz"):
+ tabix_compress(filename, filename + ".gz", force=force)
+ os.unlink( filename )
+ filename += ".gz"
+
+ if not force and os.path.exists(filename + ".tbi"):
+ raise IOError(
+ "Filename '%s.tbi' already exists, use *force* to overwrite")
+
+ # columns (1-based):
+ # preset-code, contig, start, end, metachar for
+ # comments, lines to ignore at beginning
+ # 0 is a missing column
+ preset2conf = {
+ 'gff' : (0, 1, 4, 5, ord('#'), 0),
+ 'bed' : (0x10000, 1, 2, 3, ord('#'), 0),
+ 'psltbl' : (0x10000, 15, 17, 18, ord('#'), 0),
+ 'sam' : (1, 3, 4, 0, ord('@'), 0),
+ 'vcf' : (2, 1, 2, 0, ord('#'), 0),
+ 'pileup': (3, 1, 2, 0, ord('#'), 0),
+ }
+
+ if preset:
+ try:
+ conf_data = preset2conf[preset]
+ except KeyError:
+ raise KeyError(
+ "unknown preset '%s', valid presets are '%s'" %
+ (preset, ",".join(preset2conf.keys())))
+ else:
+ if end_col == None:
+ end_col = -1
+ preset = 0
+
+ # note that tabix internally works with 0-based coordinates
+ # and open/closed intervals. When using a preset, conversion
+ # is automatically taken care of. Otherwise, the coordinates
+ # are assumed to be 1-based closed intervals and -1 is
+ # subtracted from the start coordinate. To avoid doing this,
+ # set the TI_FLAG_UCSC=0x10000 flag:
+ if zerobased:
+ preset = preset | 0x10000
+
+ conf_data = (preset, seq_col+1, start_col+1, end_col+1, ord(meta_char), 0)
+
+ cdef tbx_conf_t conf
+ conf.preset, conf.sc, conf.bc, conf.ec, conf.meta_char, conf.line_skip = conf_data
+
+
+ fn = encode_filename(filename)
+ cdef char *cfn = fn
+ with nogil:
+ tbx_index_build(cfn, min_shift, &conf)
+
+ return filename
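+
+# A minimal usage sketch (filename is a placeholder): the file is
+# bgzf-compressed if needed, a ".tbi" index is written next to it, and the
+# name of the compressed file is returned:
+#
+#     fn = tabix_index("example.vcf", preset="vcf", force=True)
+#     # fn == "example.vcf.gz"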
+
+# #########################################################
+# cdef class tabix_file_iterator_old:
+# '''iterate over ``infile``.
+
+# This iterator is not safe. If the :meth:`__next__()` method is called
+# after ``infile`` is closed, the result is undefined (see ``fclose()``).
+
+# The iterator might either raise a StopIteration or segfault.
+# '''
+
+
+# def __cinit__(self,
+# infile,
+# Parser parser,
+# int buffer_size = 65536 ):
+
+# cdef int fd = PyObject_AsFileDescriptor( infile )
+# if fd == -1: raise ValueError( "I/O operation on closed file." )
+# self.infile = fdopen( fd, 'r')
+
+# if self.infile == NULL: raise ValueError( "I/O operation on closed file." )
+
+# self.buffer = <char*>malloc( buffer_size )
+# self.size = buffer_size
+# self.parser = parser
+
+# def __iter__(self):
+# return self
+
+# cdef __cnext__(self):
+
+# cdef char * b
+# cdef size_t nbytes
+# b = self.buffer
+
+# while not feof( self.infile ):
+# nbytes = getline( &b, &self.size, self.infile)
+
+# # stop at first error or eof
+# if (nbytes == -1): break
+# # skip comments
+# if (b[0] == '#'): continue
+
+# # skip empty lines
+# if b[0] == '\0' or b[0] == '\n' or b[0] == '\r': continue
+
+# # make sure that entry is complete
+# if b[nbytes-1] != '\n' and b[nbytes-1] != '\r':
+# result = b
+# raise ValueError( "incomplete line at %s" % result )
+
+# # make sure that this goes fully through C
+# # otherwise buffer is copied to/from a
+# # Python object causing segfaults as
+# # the wrong memory is freed
+# return self.parser.parse( b, nbytes )
+
+# raise StopIteration
+
+# def __dealloc__(self):
+# free(self.buffer)
+
+# def __next__(self):
+# return self.__cnext__()
+
+#########################################################
+#########################################################
+#########################################################
+## Iterators for parsing through unindexed files.
+#########################################################
+# cdef buildGzipError(void *gzfp):
+# cdef int errnum = 0
+# cdef char *s = gzerror(gzfp, &errnum)
+# return "error (%d): %s (%d: %s)" % (errno, strerror(errno), errnum, s)
+
+
+cdef class tabix_file_iterator:
+ '''iterate over a compressed or uncompressed ``infile``.
+ '''
+
+ def __cinit__(self,
+ infile,
+ Parser parser,
+ int buffer_size=65536):
+
+ if infile.closed:
+ raise ValueError("I/O operation on closed file.")
+
+ self.infile = infile
+
+ cdef int fd = PyObject_AsFileDescriptor(infile)
+ if fd == -1:
+ raise ValueError("I/O operation on closed file.")
+
+ self.duplicated_fd = dup(fd)
+
+ # From the manual:
+ # gzopen can be used to read a file which is not in gzip format;
+ # in this case gzread will directly read from the file without decompression.
+ # When reading, this will be detected automatically by looking
+ # for the magic two-byte gzip header.
+ self.fh = bgzf_dopen(self.duplicated_fd, 'r')
+
+ if self.fh == NULL:
+ raise IOError('%s' % strerror(errno))
+
+ self.kstream = ks_init(self.fh)
+
+ self.buffer.s = <char*>malloc(buffer_size)
+ #if self.buffer == NULL:
+ # raise MemoryError( "tabix_file_iterator: could not allocate %i bytes" % buffer_size)
+ #self.size = buffer_size
+ self.parser = parser
+
+ def __iter__(self):
+ return self
+
+ cdef __cnext__(self):
+
+ cdef char * b
+ cdef int dret = 0
+ cdef int retval = 0
+ while 1:
+ with nogil:
+ retval = ks_getuntil(self.kstream, '\n', &self.buffer, &dret)
+
+ if retval < 0:
+ break
+ #raise IOError('gzip error: %s' % buildGzipError( self.fh ))
+
+ b = self.buffer.s
+
+ # skip comments
+ if (b[0] == '#'):
+ continue
+
+ # skip empty lines
+ if b[0] == '\0' or b[0] == '\n' or b[0] == '\r':
+ continue
+
+ # gzgets terminates at \n, no need to test
+
+ # parser creates a copy
+ return self.parser.parse(b, self.buffer.l)
+
+ raise StopIteration
+
+ def __dealloc__(self):
+ free(self.buffer.s)
+ ks_destroy(self.kstream)
+ bgzf_close(self.fh)
+
+ def __next__(self):
+ return self.__cnext__()
+
+ def next(self):
+ return self.__cnext__()
+
+
+class tabix_generic_iterator:
+ '''iterate over ``infile``.
+
+    Permits the use of file-like objects, for example from the gzip module.
+ '''
+ def __init__(self, infile, parser):
+
+ self.infile = infile
+ if self.infile.closed:
+ raise ValueError("I/O operation on closed file.")
+ self.parser = parser
+
+ def __iter__(self):
+ return self
+
+ # cython version - required for python 3
+ def __next__(self):
+
+ cdef char * b
+ cdef char * cpy
+ cdef size_t nbytes
+
+ encoding = self.parser.get_encoding()
+
+ # note that GzipFile.close() does not close the file
+ # reading is still possible.
+ if self.infile.closed:
+ raise ValueError("I/O operation on closed file.")
+
+ while 1:
+
+ line = self.infile.readline()
+ if not line:
+ break
+
+ s = force_bytes(line, encoding)
+ b = s
+ nbytes = len(line)
+ assert b[nbytes] == '\0'
+
+ # skip comments
+ if b[0] == '#':
+ continue
+
+ # skip empty lines
+ if b[0] == '\0' or b[0] == '\n' or b[0] == '\r':
+ continue
+
+ # make sure that entry is complete
+ if b[nbytes-1] != '\n' and b[nbytes-1] != '\r':
+ raise ValueError("incomplete line at %s" % line)
+
+ bytes_cpy = <bytes> b
+ cpy = <char *> bytes_cpy
+
+ return self.parser(cpy, nbytes)
+
+ raise StopIteration
+
+ # python version - required for python 2.7
+ def next(self):
+ return self.__next__()
+
+def tabix_iterator(infile, parser):
+ """return an iterator over all entries in a file.
+
+ Results are returned parsed as specified by the *parser*. If
+ *parser* is None, the results are returned as an unparsed string.
+ Otherwise, *parser* is assumed to be a functor that will return
+ parsed data (see for example :class:`~pysam.asTuple` and
+ :class:`~pysam.asGTF`).
+
+ """
+ if PY_MAJOR_VERSION >= 3:
+ return tabix_generic_iterator(infile, parser)
+ else:
+ return tabix_file_iterator(infile, parser)
+
+ # file objects can use C stdio
+ # used to be: isinstance( infile, file):
+ # if PY_MAJOR_VERSION >= 3:
+ # if isinstance( infile, io.IOBase ):
+ # return tabix_copy_iterator( infile, parser )
+ # else:
+ # return tabix_generic_iterator( infile, parser )
+ # else:
+# if isinstance( infile, file ):
+# return tabix_copy_iterator( infile, parser )
+# else:
+# return tabix_generic_iterator( infile, parser )
+
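+# A short usage sketch (hedged: "example.gtf.gz" is a placeholder; any
+# file-like object that yields lines should work, for example one from the
+# gzip module):
+#
+#     import gzip
+#     with gzip.open("example.gtf.gz") as f:
+#         for row in tabix_iterator(f, asGTF()):
+#             print(row.contig, row.feature)
+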
+cdef class Tabixfile(TabixFile):
+ """Tabixfile is deprecated: use TabixFile instead"""
+ pass
+
+
+__all__ = [
+ "tabix_index",
+ "tabix_compress",
+ "TabixFile",
+ "Tabixfile",
+ "asTuple",
+ "asGTF",
+ "asVCF",
+ "asBed",
+ "GZIterator",
+ "GZIteratorHead",
+ "tabix_iterator",
+ "tabix_generic_iterator",
+ "tabix_file_iterator",
+]
--- /dev/null
+#cdef extern from "Python.h":
+# ctypedef struct FILE
+
+from libc.stdint cimport uint8_t, int32_t, uint32_t, int64_t, uint64_t
+
+cdef class TupleProxy:
+
+ cdef:
+ char * data
+ char ** fields
+ int nfields
+ int index
+ int nbytes
+ int offset
+ bint is_modified
+
+ cdef encoding
+
+ cpdef int getMaxFields(self)
+ cpdef int getMinFields(self)
+# cdef char * _getindex(self, int idx)
+
+ cdef take(self, char * buffer, size_t nbytes)
+ cdef present(self, char * buffer, size_t nbytes)
+ cdef copy(self, char * buffer, size_t nbytes, bint reset=*)
+ cdef update(self, char * buffer, size_t nbytes)
+
+cdef class GTFProxy(TupleProxy) :
+
+ cdef:
+ char * _attributes
+ cdef bint hasOwnAttributes
+
+ cpdef int getMaxFields(self)
+ cpdef int getMinFields(self)
+ cdef char * getAttributes(self)
+
+cdef class NamedTupleProxy(TupleProxy):
+ pass
+
+cdef class BedProxy(NamedTupleProxy):
+
+ cdef:
+ char * contig
+ uint32_t start
+ uint32_t end
+ int bedfields
+
+ cpdef int getMaxFields(self)
+ cpdef int getMinFields(self)
+ cdef update(self, char * buffer, size_t nbytes)
+
+cdef class VCFProxy(NamedTupleProxy):
+
+ cdef:
+ char * contig
+ uint32_t pos
+
+ cdef update(self, char * buffer, size_t nbytes)
--- /dev/null
+from cpython cimport PyBytes_FromStringAndSize
+
+from libc.stdio cimport printf, feof, fgets
+from libc.string cimport strcpy, strlen, memcmp, memcpy, memchr, strstr, strchr
+from libc.stdlib cimport free, malloc, calloc, realloc
+from libc.stdlib cimport atoi, atol, atof
+
+from pysam.libcutils cimport force_bytes, force_str, charptr_to_str
+from pysam.libcutils cimport encode_filename, from_string_and_size
+
+import collections
+
+cdef char *StrOrEmpty(char * buffer):
+ if buffer == NULL:
+ return ""
+    else:
+        return buffer
+
+cdef int isNew(char * p, char * buffer, size_t nbytes):
+ """return True if `p` is located within `buffer` of size
+ `nbytes`
+ """
+ if p == NULL:
+ return 0
+ return not (buffer <= p < buffer + nbytes)
+
+
+cdef class TupleProxy:
+ '''Proxy class for access to parsed row as a tuple.
+
+ This class represents a table row for fast read-access.
+
+ Access to individual fields is via the [] operator.
+
+    Fields can also be modified in place; modified fields are
+    kept in separately allocated memory.
+
+ '''
+
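+    # Typical use (sketch; the file name is illustrative): rows
+    # fetched from a TabixFile with parser=asTuple() are TupleProxy
+    # instances -
+    #
+    #     tbx = pysam.TabixFile("example.bed.gz")
+    #     for row in tbx.fetch("chr1", 1000, 2000, parser=pysam.asTuple()):
+    #         print(row[0], int(row[1]), len(row))
+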
+ def __cinit__(self, encoding="ascii"):
+ self.data = NULL
+ self.fields = NULL
+ self.index = 0
+ self.nbytes = 0
+ self.is_modified = 0
+ self.nfields = 0
+ # start counting at field offset
+ self.offset = 0
+ self.encoding = encoding
+
+ def __dealloc__(self):
+ cdef int x
+ if self.is_modified:
+ for x from 0 <= x < self.nfields:
+ if isNew(self.fields[x], self.data, self.nbytes):
+ free(self.fields[x])
+ self.fields[x] = NULL
+
+ if self.data != NULL:
+ free(self.data)
+ if self.fields != NULL:
+ free(self.fields)
+
+ def __copy__(self):
+ if self.is_modified:
+ raise NotImplementedError(
+ "copying modified tuples is not implemented")
+ cdef TupleProxy n = type(self)()
+ n.copy(self.data, self.nbytes, reset=True)
+ return n
+
+ def compare(self, TupleProxy other):
+ '''return -1,0,1, if contents in this are binary
+ <,=,> to *other*
+
+ '''
+ if self.is_modified or other.is_modified:
+ raise NotImplementedError(
+ 'comparison of modified TupleProxies is not implemented')
+ if self.data == other.data:
+ return 0
+
+ if self.nbytes < other.nbytes:
+ return -1
+ elif self.nbytes > other.nbytes:
+ return 1
+ return memcmp(self.data, other.data, self.nbytes)
+
+ def __richcmp__(self, TupleProxy other, int op):
+ if op == 2: # == operator
+ return self.compare(other) == 0
+ elif op == 3: # != operator
+ return self.compare(other) != 0
+ else:
+ err_msg = "op {0} isn't implemented yet".format(op)
+ raise NotImplementedError(err_msg)
+
+ cdef take(self, char * buffer, size_t nbytes):
+ '''start presenting buffer.
+
+ Take ownership of the pointer.
+ '''
+ self.data = buffer
+ self.nbytes = nbytes
+ self.update(buffer, nbytes)
+
+ cdef present(self, char * buffer, size_t nbytes):
+ '''start presenting buffer.
+
+ Do not take ownership of the pointer.
+ '''
+ self.update(buffer, nbytes)
+
+ cdef copy(self, char * buffer, size_t nbytes, bint reset=False):
+ '''start presenting buffer of size *nbytes*.
+
+ Buffer is a '\0'-terminated string without the '\n'.
+
+ Take a copy of buffer.
+ '''
+ # +1 for '\0'
+ cdef int s = sizeof(char) * (nbytes + 1)
+ self.data = <char*>malloc(s)
+ if self.data == NULL:
+ raise ValueError("out of memory in TupleProxy.copy()")
+ memcpy(<char*>self.data, buffer, s)
+
+ if reset:
+ for x from 0 <= x < nbytes:
+ if self.data[x] == '\0':
+ self.data[x] = '\t'
+
+ self.update(self.data, nbytes)
+
+ cpdef int getMinFields(self):
+ '''return minimum number of fields.'''
+        # a single field is not a valid tabix entry, but TupleProxy
+        # is kept more generic
+ return 1
+
+ cpdef int getMaxFields(self):
+ '''return maximum number of fields. Return
+ 0 for unknown length.'''
+ return 0
+
+ cdef update(self, char * buffer, size_t nbytes):
+ '''update internal data.
+
+ *buffer* is a \0 terminated string.
+
+ *nbytes* is the number of bytes in buffer (excluding
+ the \0)
+
+ Update starts work in buffer, thus can be used
+ to collect any number of fields until nbytes
+ is exhausted.
+
+ If max_fields is set, the number of fields is initialized to
+ max_fields.
+
+ '''
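+        # Worked example: for the buffer "1\t2\t3" (nbytes == 5) the
+        # tabs are replaced by '\0' in place and fields[0..2] point at
+        # "1", "2" and "3", giving nfields == 3.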
+ cdef char * pos
+ cdef char * old_pos
+ cdef int field
+ cdef int max_fields, min_fields, x
+
+ assert strlen(buffer) == nbytes, \
+ "length of buffer (%i) != number of bytes (%i)" % (
+ strlen(buffer), nbytes)
+
+ if buffer[nbytes] != 0:
+ raise ValueError("incomplete line at %s" % buffer)
+
+ #################################
+ # remove line breaks and feeds and update number of bytes
+ x = nbytes - 1
+ while x > 0 and (buffer[x] == '\n' or buffer[x] == '\r'):
+ buffer[x] = '\0'
+ x -= 1
+ self.nbytes = x + 1
+
+ #################################
+ # clear data
+        if self.fields != NULL:
+            # free fields that point outside self.data first, then
+            # release the field index itself
+            for field from 0 <= field < self.nfields:
+                if isNew(self.fields[field], self.data, self.nbytes):
+                    free(self.fields[field])
+            free(self.fields)
+
+ self.is_modified = self.nfields = 0
+
+ #################################
+ # allocate new
+ max_fields = self.getMaxFields()
+ # pre-count fields - better would be
+ # to guess or dynamically grow
+ if max_fields == 0:
+ for x from 0 <= x < nbytes:
+ if buffer[x] == '\t':
+ max_fields += 1
+ max_fields += 1
+
+ self.fields = <char **>calloc(max_fields, sizeof(char *))
+ if self.fields == NULL:
+ raise ValueError("out of memory in TupleProxy.update()")
+
+ #################################
+ # start filling
+ field = 0
+ self.fields[field] = pos = buffer
+ field += 1
+ old_pos = pos
+ while 1:
+
+ pos = <char*>memchr(pos, '\t', nbytes)
+ if pos == NULL:
+ break
+ if field >= max_fields:
+ raise ValueError(
+ "parsing error: more than %i fields in line: %s" %
+ (max_fields, buffer))
+
+ pos[0] = '\0'
+ pos += 1
+ self.fields[field] = pos
+ field += 1
+ nbytes -= pos - old_pos
+ if nbytes < 0:
+ break
+ old_pos = pos
+ self.nfields = field
+ if self.nfields < self.getMinFields():
+ raise ValueError(
+ "parsing error: fewer that %i fields in line: %s" %
+ (self.getMinFields(), buffer))
+
+ def _getindex(self, int index):
+ '''return item at idx index'''
+ cdef int i = index
+ if i < 0:
+ i += self.nfields
+ if i < 0:
+ raise IndexError("list index out of range")
+ # apply offset - separating a fixed number
+ # of fields from a variable number such as in VCF
+ i += self.offset
+ if i >= self.nfields:
+ raise IndexError(
+ "list index out of range %i >= %i" %
+ (i, self.nfields))
+ return force_str(self.fields[i], self.encoding)
+
+ def __getitem__(self, key):
+ if type(key) == int:
+ return self._getindex(key)
+ # slice object
+ start, end, step = key.indices(self.nfields)
+ result = []
+ for index in range(start, end, step):
+ result.append(self._getindex(index))
+ return result
+
+ def _setindex(self, index, value):
+ '''set item at idx index.'''
+ cdef int idx = index
+ if idx < 0:
+ raise IndexError("list index out of range")
+ if idx >= self.nfields:
+ raise IndexError("list index out of range")
+
+ if isNew(self.fields[idx], self.data, self.nbytes):
+            free(self.fields[idx])
+
+ self.is_modified = 1
+
+ if value is None:
+ self.fields[idx] = NULL
+ return
+
+ # conversion with error checking
+ value = force_bytes(value)
+ cdef char * tmp = <char*>value
+        self.fields[idx] = <char*>malloc((strlen(tmp) + 1) * sizeof(char))
+ if self.fields[idx] == NULL:
+ raise ValueError("out of memory" )
+ strcpy(self.fields[idx], tmp)
+
+ def __setitem__(self, index, value):
+ '''set item at *index* to *value*'''
+ cdef int i = index
+ if i < 0:
+ i += self.nfields
+ i += self.offset
+
+ self._setindex(i, value)
+
+ def __len__(self):
+ return self.nfields
+
+ def __iter__(self):
+ self.index = 0
+ return self
+
+ def __next__(self):
+ """python version of next().
+ """
+ if self.index >= self.nfields:
+ raise StopIteration
+ cdef char * retval = self.fields[self.index]
+ self.index += 1
+ if retval == NULL:
+ return None
+ else:
+ return force_str(retval, self.encoding)
+
+ def __str__(self):
+ '''return original data'''
+ # copy and replace \0 bytes with \t characters
+ cdef char * cpy
+ if self.is_modified:
+ # todo: treat NULL values
+ result = []
+            for x in range(self.nfields):
+ result.append(StrOrEmpty(self.fields[x]).decode(self.encoding))
+ return "\t".join(result)
+ else:
+ cpy = <char*>calloc(sizeof(char), self.nbytes+1)
+ if cpy == NULL:
+ raise ValueError("out of memory")
+ memcpy(cpy, self.data, self.nbytes+1)
+ for x from 0 <= x < self.nbytes:
+ if cpy[x] == '\0':
+ cpy[x] = '\t'
+ result = cpy[:self.nbytes]
+ free(cpy)
+ r = result.decode(self.encoding)
+ return r
+
+def toDot(v):
+ '''convert value to '.' if None'''
+ if v is None:
+ return "."
+ else:
+ return str(v)
+
+def quote(v):
+ '''return a quoted attribute.'''
+ if isinstance(v, str):
+ return '"%s"' % v
+ else:
+ return str(v)
+
+
+cdef class GTFProxy(TupleProxy):
+ '''Proxy class for access to GTF fields.
+
+ This class represents a GTF entry for fast read-access.
+ Write-access has been added as well, though some care must
+ be taken. If any of the string fields (contig, source, ...)
+ are set, the new value is tied to the lifetime of the
+ argument that was supplied.
+
+ The only exception is the attributes field when set from
+ a dictionary - this field will manage its own memory.
+ '''
+
+ def __cinit__(self):
+ # automatically calls TupleProxy.__cinit__
+ self.hasOwnAttributes = False
+ self._attributes = NULL
+
+ def __dealloc__(self):
+ # automatically calls TupleProxy.__dealloc__
+ if self.hasOwnAttributes:
+ free(self._attributes)
+
+ cpdef int getMinFields(self):
+ '''return minimum number of fields.'''
+ return 9
+
+ cpdef int getMaxFields(self):
+ '''return max number of fields.'''
+ return 9
+
+ property contig:
+ '''contig of feature.'''
+ def __get__(self):
+ return self._getindex(0)
+ def __set__(self, value):
+ self._setindex(0, value)
+
+ property source:
+ '''feature source.'''
+ def __get__(self):
+ return self._getindex(1)
+ def __set__(self, value):
+ if value is None:
+ value = "."
+ self._setindex(1, value)
+
+ property feature:
+ '''feature name.'''
+ def __get__(self):
+ return self._getindex(2)
+ def __set__(self, value):
+ if value is None:
+ value = "."
+ self._setindex(2, value)
+
+ property start:
+        '''feature start (0-based, half-open coordinates).'''
+        def __get__(self):
+            return int(self._getindex(3)) - 1
+        def __set__(self, value):
+            self._setindex(3, str(value + 1))
+
+ property end:
+        '''feature end (0-based, half-open coordinates).'''
+ def __get__(self):
+ return int(self._getindex(4))
+ def __set__(self, value):
+ self._setindex(4, str(value))
+
+ property score:
+ '''feature score.'''
+ def __get__(self):
+ v = self._getindex(5)
+ if v == "" or v[0] == '.':
+ return None
+ else:
+ return float(v)
+
+ def __set__(self, value):
+ if value is None:
+ value = "."
+ self._setindex(5, str(value))
+
+ property strand:
+ '''feature strand.'''
+ def __get__(self):
+ return self._getindex(6)
+ def __set__(self, value ):
+ if value is None:
+ value = "."
+ self._setindex(6, value)
+
+ property frame:
+ '''feature frame.'''
+ def __get__(self):
+ v = self._getindex(7)
+ if v == "" or v[0] == '.':
+ return v
+ else:
+ return int(v)
+
+ def __set__(self, value):
+ if value is None:
+ value = "."
+ self._setindex(7, str(value))
+
+ property attributes:
+ '''feature attributes (as a string).'''
+ def __get__(self):
+ if self.hasOwnAttributes:
+ return force_str(self._attributes)
+ else:
+ return force_str(self._getindex(8))
+ def __set__( self, value):
+ if self.hasOwnAttributes:
+ free(self._attributes)
+ self._attributes = NULL
+ self.hasOwnAttributes = False
+ self._setindex(8, value)
+
+ cdef char * getAttributes(self):
+ '''return pointer to attributes.'''
+ cdef char * attributes
+ if self.hasOwnAttributes:
+ attributes = self._attributes
+ else:
+ attributes = self.fields[8]
+ if attributes == NULL:
+ raise KeyError("no attributes defined GTF entry")
+ return attributes
+
+ def asDict(self):
+ """parse attributes - return as dict
+ """
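+        # Example (sketch): the attribute string
+        #   'gene_id "ENSG1"; exon_number 1;'
+        # is returned as
+        #   OrderedDict([('gene_id', 'ENSG1'), ('exon_number', 1)])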
+
+        # get the attribute string
+ attributes = self.attributes
+
+ # separate into fields
+ # Fields might contain a ";", for example in ENSEMBL GTF file
+ # for mouse, v78:
+ # ...; transcript_name "TXNRD2;-001"; ....
+ # The current heuristic is to split on a semicolon followed by a
+ # space, see also http://mblab.wustl.edu/GTF22.html
+
+ # Remove white space to prevent a last empty field.
+ fields = [x.strip() for x in attributes.strip().split("; ")]
+
+ result = collections.OrderedDict()
+
+ for f in fields:
+
+ # strip semicolon (GTF files without a space after the last semicolon)
+ if f.endswith(";"):
+ f = f[:-1]
+
+ # split at most once in order to avoid separating
+ # multi-word values
+ d = [x.strip() for x in f.split(" ", 1)]
+
+            # f.split(" ", 1) yields at most two elements; treat an
+            # attribute without a value as a flag (assumption: stored
+            # as None)
+            if len(d) == 1:
+                result[d[0]] = None
+                continue
+            n, v = d[0], d[1]
+
+ if v[0] == '"' and v[-1] == '"':
+ v = v[1:-1]
+            else:
+                # try to convert to a number: int first, falling back
+                # to float so that fractional values are preserved
+                try:
+                    v = int(v)
+                except (ValueError, TypeError):
+                    try:
+                        v = float(v)
+                    except (ValueError, TypeError):
+                        pass
+
+ result[n] = v
+
+ return result
+
+ def fromDict(self, d):
+ '''set attributes from a dictionary.'''
+ cdef char * p
+ cdef int l
+
+ # clean up if this field is set twice
+ if self.hasOwnAttributes:
+ free(self._attributes)
+
+ aa = []
+ for k,v in d.items():
+ if isinstance(v, str):
+ aa.append( '%s "%s"' % (k,v) )
+ else:
+ aa.append( '%s %s' % (k,str(v)) )
+
+ a = force_bytes("; ".join(aa) + ";")
+ p = a
+ l = len(a)
+ self._attributes = <char *>calloc(l + 1, sizeof(char))
+ if self._attributes == NULL:
+ raise ValueError("out of memory")
+ memcpy(self._attributes, p, l)
+
+ self.hasOwnAttributes = True
+ self.is_modified = True
+
+ def __str__(self):
+ cdef char * cpy
+ cdef int x
+
+ if self.is_modified:
+ return "\t".join(
+ (self.contig,
+ self.source,
+ self.feature,
+ str(self.start+1),
+ str(self.end),
+ toDot(self.score),
+ toDot(self.strand),
+ toDot(self.frame),
+ self.attributes))
+ else:
+ return TupleProxy.__str__(self)
+
+ def invert(self, int lcontig):
+ '''invert coordinates to negative strand coordinates
+
+ This method will only act if the feature is on the
+ negative strand.'''
+
+ if self.strand[0] == '-':
+ start = min(self.start, self.end)
+ end = max(self.start, self.end)
+ self.start, self.end = lcontig - end, lcontig - start
+
+ def keys(self):
+ '''return a list of attributes defined in this entry.'''
+ r = self.attributes
+ return [x.strip().split(" ")[0]
+ # separator is ';' followed by space
+ for x in r.split("; ") if x.strip() != '']
+
+ def __getitem__(self, key):
+ return self.__getattr__(key)
+
+ def __getattr__(self, item):
+ """Generic lookup of attribute from GFF/GTF attributes
+ Only called if there *isn't* an attribute with this name
+ """
+ cdef char * start
+ cdef char * query
+ cdef char * cpy
+ cdef char * end
+ cdef int l
+
+ #
+ # important to use the getAttributes function.
+ # Using the self.attributes property to access
+ # the attributes caused a hard-to-trace bug
+ # in which fields in the attribute string were
+ # set to 0.
+ # Running through valgrind complained that
+ # memory was accessed in the memory field
+ # that has been released. It is not clear
+ # why this happened and might be a cython bug
+ # (Version 0.16). The valgrind warnings
+        # disappeared after accessing the C data structures
+ # directly and so did the bug.
+ cdef char * attributes = self.getAttributes()
+ if attributes == NULL:
+ raise KeyError("key %s not found, no attributes" % item)
+
+ # add space in order to make sure
+ # to not pick up a field that is a prefix of another field
+ r = force_bytes(item + " ")
+ query = r
+ start = strstr(attributes, query)
+
+ if start == NULL:
+ raise AttributeError("'GTFProxy' has no attribute '%s'" % item)
+
+ start += strlen(query)
+ # skip gaps before
+ while start[0] == ' ':
+ start += 1
+
+ if start[0] == '"':
+ start += 1
+ end = start
+ while end[0] != '\0' and end[0] != '"':
+ end += 1
+ l = end - start
+ result = force_str(PyBytes_FromStringAndSize(start, l),
+ self.encoding)
+ return result
+ else:
+ return force_str(start, self.encoding)
+
+ def setAttribute(self, name, value):
+ '''convenience method to set an attribute.'''
+ r = self.asDict()
+ r[name] = value
+ self.fromDict(r)
+
+ def __cmp__(self, other):
+ return (self.contig, self.strand, self.start) < \
+ (other.contig, other.strand, other.start)
+
+ # python 3 compatibility
+ def __richcmp__(GTFProxy self, GTFProxy other, int op):
+ if op == 0:
+ return (self.contig, self.strand, self.start) < \
+ (other.contig, other.strand, other.start)
+ elif op == 1:
+ return (self.contig, self.strand, self.start) <= \
+ (other.contig, other.strand, other.start)
+ elif op == 2:
+ return self.compare(other) == 0
+ elif op == 3:
+ return self.compare(other) != 0
+ else:
+ err_msg = "op {0} isn't implemented yet".format(op)
+ raise NotImplementedError(err_msg)
+
+
+cdef class NamedTupleProxy(TupleProxy):
+
+ map_key2field = {}
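+    # Subclasses map attribute names to (column index, type caster)
+    # pairs, e.g. {'contig': (0, str), 'start': (1, int)}; __getattr__
+    # and __setattr__ below translate attribute access into field
+    # access.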
+
+ def __setattr__(self, key, value):
+ '''set attribute.'''
+ cdef int idx
+ idx, f = self.map_key2field[key]
+        if self.nfields <= idx:
+ raise KeyError("field %s not set" % key)
+ TupleProxy.__setitem__(self, idx, str(value))
+
+ def __getattr__(self, key):
+ cdef int idx
+ idx, f = self.map_key2field[key]
+        if self.nfields <= idx:
+ raise KeyError("field %s not set" % key)
+ if f == str:
+ return force_str(self.fields[idx],
+ self.encoding)
+ return f(self.fields[idx])
+
+
+cdef class BedProxy(NamedTupleProxy):
+ '''Proxy class for access to Bed fields.
+
+ This class represents a BED entry for fast read-access.
+ '''
+ map_key2field = {
+ 'contig' : (0, str),
+ 'start' : (1, int),
+ 'end' : (2, int),
+ 'name' : (3, str),
+ 'score' : (4, float),
+ 'strand' : (5, str),
+ 'thickStart' : (6, int),
+ 'thickEnd' : (7, int),
+ 'itemRGB' : (8, str),
+ 'blockCount': (9, int),
+ 'blockSizes': (10, str),
+ 'blockStarts': (11, str), }
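+    # Example (sketch): for the row "chr1\t99\t200\tx" the proxy
+    # yields proxy.contig == "chr1", proxy.start == 99,
+    # proxy.end == 200 and proxy.name == "x".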
+
+ cpdef int getMinFields(self):
+ '''return minimum number of fields.'''
+ return 3
+
+ cpdef int getMaxFields(self):
+ '''return max number of fields.'''
+ return 12
+
+ cdef update(self, char * buffer, size_t nbytes):
+ '''update internal data.
+
+ nbytes does not include the terminal '\0'.
+ '''
+ TupleProxy.update(self, buffer, nbytes)
+
+ if self.nfields < 3:
+ raise ValueError(
+ "bed format requires at least three columns")
+
+ # determines bed format
+ self.bedfields = self.nfields
+
+ # do automatic conversion
+ self.contig = self.fields[0]
+ self.start = atoi(self.fields[1])
+ self.end = atoi(self.fields[2])
+
+ # __setattr__ in base class seems to take precedence
+ # hence implement setters in __setattr__
+ #property start:
+ # def __get__( self ): return self.start
+ #property end:
+ # def __get__( self ): return self.end
+
+ def __str__(self):
+
+ cdef int save_fields = self.nfields
+ # ensure fields to use correct format
+ self.nfields = self.bedfields
+ retval = TupleProxy.__str__(self)
+ self.nfields = save_fields
+ return retval
+
+ def __setattr__(self, key, value ):
+ '''set attribute.'''
+ if key == "start":
+ self.start = value
+ elif key == "end":
+ self.end = value
+
+ cdef int idx
+ idx, f = self.map_key2field[key]
+ TupleProxy._setindex(self, idx, str(value) )
+
+cdef class VCFProxy(NamedTupleProxy):
+ '''Proxy class for access to VCF fields.
+
+ The genotypes are accessed via a numeric index.
+ Sample headers are not available.
+ '''
+ map_key2field = {
+ 'contig' : (0, str),
+ 'pos' : (1, int),
+ 'id' : (2, str),
+ 'ref' : (3, str),
+ 'alt' : (4, str),
+ 'qual' : (5, str),
+ 'filter' : (6, str),
+ 'info' : (7, str),
+ 'format' : (8, str) }
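+    # offset = 9 (set below) means proxy[0] is the first genotype
+    # column; the fixed VCF columns are reached by name instead,
+    # e.g. proxy.pos or proxy.ref.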
+
+ def __cinit__(self):
+ # automatically calls TupleProxy.__cinit__
+ # start indexed access at genotypes
+ self.offset = 9
+
+ cdef update(self, char * buffer, size_t nbytes):
+ '''update internal data.
+
+ nbytes does not include the terminal '\0'.
+ '''
+ TupleProxy.update(self, buffer, nbytes)
+
+ self.contig = self.fields[0]
+ # vcf counts from 1 - correct here
+ self.pos = atoi(self.fields[1]) - 1
+
+ def __len__(self):
+ '''return number of genotype fields.'''
+ return max(0, self.nfields - 9)
+
+ property pos:
+        '''feature position (0-based).'''
+ def __get__(self):
+ return self.pos
+
+ def __setattr__(self, key, value):
+ '''set attribute.'''
+ if key == "pos":
+ self.pos = value
+ value += 1
+
+ cdef int idx
+ idx, f = self.map_key2field[key]
+ TupleProxy._setindex(self, idx, str(value))
+
--- /dev/null
+#########################################################################
+# Utility functions used across pysam
+#########################################################################
+cimport cython
+from cpython cimport array as c_array
+
+cpdef parse_region(reference=*, start=*, end=*, region=*)
+
+#########################################################################
+# Utility functions for quality string conversions
+
+cpdef c_array.array qualitystring_to_array(input_str, int offset=*)
+cpdef array_to_qualitystring(c_array.array arr, int offset=*)
+cpdef qualities_to_qualitystring(qualities, int offset=*)
+
+########################################################################
+########################################################################
+########################################################################
+## Python 3 compatibility functions
+########################################################################
+cdef charptr_to_str(const char *s, encoding=*)
+cdef bytes charptr_to_bytes(const char *s, encoding=*)
+cdef charptr_to_str_w_len(const char* s, size_t n, encoding=*)
+cdef force_str(object s, encoding=*)
+cdef bytes force_bytes(object s, encoding=*)
+cdef bytes encode_filename(object filename)
+cdef from_string_and_size(const char *s, size_t length)
+
+cdef extern from "pysam_util.h":
+
+ int samtools_main(int argc, char *argv[])
+ int bcftools_main(int argc, char *argv[])
+ void pysam_set_stderr(int fd)
+ void pysam_unset_stderr()
+ void pysam_set_stdout(int fd)
+ void pysam_set_stdout_fn(const char *)
+ void pysam_unset_stdout()
+ void set_optind(int)
--- /dev/null
+import types
+import sys
+import string
+import re
+import tempfile
+import os
+import io
+from contextlib import contextmanager
+
+from cpython.version cimport PY_MAJOR_VERSION, PY_MINOR_VERSION
+from cpython cimport PyBytes_Check, PyUnicode_Check
+from cpython cimport array as c_array
+from libc.stdlib cimport calloc, free
+from libc.string cimport strncpy
+from libc.stdio cimport fprintf, stderr, fflush
+from libc.stdio cimport stdout as c_stdout
+from posix.fcntl cimport open as c_open, O_WRONLY
+
+#####################################################################
+# hard-coded constants
+cdef int MAX_POS = 2 << 29
+
+#################################################################
+# Utility functions for quality string conversions
+cpdef c_array.array qualitystring_to_array(input_str, int offset=33):
+ """convert a qualitystring to an array of quality values."""
+ if input_str is None:
+ return None
+ qs = force_bytes(input_str)
+ cdef char i
+ return c_array.array('B', [i - offset for i in qs])
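+# Example: qualitystring_to_array("III") == array('B', [40, 40, 40]),
+# since ord('I') - 33 == 40.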
+
+
+cpdef array_to_qualitystring(c_array.array qualities, int offset=33):
+ """convert an array of quality values to a string."""
+ if qualities is None:
+ return None
+ cdef int x
+
+ cdef c_array.array result
+ result = c_array.clone(qualities, len(qualities), zero=False)
+
+ for x from 0 <= x < len(qualities):
+ result[x] = qualities[x] + offset
+ return force_str(result.tostring())
+
+
+cpdef qualities_to_qualitystring(qualities, int offset=33):
+ """convert a list or array of quality scores to the string
+ representation used in the SAM format.
+
+ Parameters
+ ----------
+ offset : int
+ offset to be added to the quality scores to arrive at
+ the characters of the quality string (default=33).
+
+ Returns
+ -------
+ string
+ a quality string
+
+ """
+ cdef char x
+ if qualities is None:
+ return None
+ elif isinstance(qualities, c_array.array):
+ return array_to_qualitystring(qualities, offset=offset)
+ else:
+ # tuples and lists
+ return force_str("".join([chr(x + offset) for x in qualities]))
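+# Example round-trip (sketch): qualities_to_qualitystring([40, 30, 20])
+# == "I?5", since chr(40 + 33) == 'I', chr(30 + 33) == '?' and
+# chr(20 + 33) == '5'.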
+
+
+########################################################################
+########################################################################
+########################################################################
+## Python 3 compatibility functions
+########################################################################
+
+cdef bint IS_PYTHON3 = PY_MAJOR_VERSION >= 3
+
+cdef from_string_and_size(const char* s, size_t length):
+ if IS_PYTHON3:
+ return s[:length].decode("ascii")
+ else:
+ return s[:length]
+
+
+# filename encoding (adapted from lxml.etree.pyx)
+cdef str FILENAME_ENCODING = sys.getfilesystemencoding() or sys.getdefaultencoding() or 'ascii'
+
+
+cdef bytes encode_filename(object filename):
+ """Make sure a filename is 8-bit encoded (or None)."""
+ if filename is None:
+ return None
+ elif PY_MAJOR_VERSION >= 3 and PY_MINOR_VERSION >= 2:
+ # Added to support path-like objects
+ return os.fsencode(filename)
+ elif PyBytes_Check(filename):
+ return filename
+ elif PyUnicode_Check(filename):
+ return filename.encode(FILENAME_ENCODING)
+ else:
+ raise TypeError("Argument must be string or unicode.")
+
+
+cdef bytes force_bytes(object s, encoding="ascii"):
+ """convert string or unicode object to bytes, assuming
+ ascii encoding.
+ """
+ if s is None:
+ return None
+ elif PyBytes_Check(s):
+ return s
+ elif PyUnicode_Check(s):
+ return s.encode(encoding)
+ else:
+ raise TypeError("Argument must be string, bytes or unicode.")
+
+
+cdef charptr_to_str(const char* s, encoding="ascii"):
+ if s == NULL:
+ return None
+ if PY_MAJOR_VERSION < 3:
+ return s
+ else:
+ return s.decode(encoding)
+
+
+cdef charptr_to_str_w_len(const char* s, size_t n, encoding="ascii"):
+ if s == NULL:
+ return None
+ if PY_MAJOR_VERSION < 3:
+ return s[:n]
+ else:
+ return s[:n].decode(encoding)
+
+
+cdef bytes charptr_to_bytes(const char* s, encoding="ascii"):
+ if s == NULL:
+ return None
+ else:
+ return s
+
+
+cdef force_str(object s, encoding="ascii"):
+ """Return s converted to str type of current Python
+ (bytes in Py2, unicode in Py3)"""
+ if s is None:
+ return None
+ if PY_MAJOR_VERSION < 3:
+ return s
+ elif PyBytes_Check(s):
+ return s.decode(encoding)
+ else:
+ # assume unicode
+ return s
+
+
+cpdef parse_region(reference=None,
+ start=None,
+ end=None,
+ region=None):
+ """parse alternative ways to specify a genomic region. A region can
+ either be specified by :term:`reference`, `start` and
+ `end`. `start` and `end` denote 0-based, half-open
+ intervals.
+
+ Alternatively, a samtools :term:`region` string can be
+ supplied.
+
+ If any of the coordinates are missing they will be replaced by the
+ minimum (`start`) or maximum (`end`) coordinate.
+
+ Note that region strings are 1-based, while `start` and `end` denote
+ an interval in python coordinates.
+
+ Returns
+ -------
+
+ tuple : a tuple of `reference`, `start` and `end`.
+
+ Raises
+ ------
+
+ ValueError
+ for invalid or out of bounds regions.
+
+ """
+ cdef int rtid
+ cdef long long rstart
+ cdef long long rend
+
+ rtid = -1
+ rstart = 0
+ rend = MAX_POS
+ if start != None:
+ try:
+ rstart = start
+ except OverflowError:
+ raise ValueError('start out of range (%i)' % start)
+
+ if end != None:
+ try:
+ rend = end
+ except OverflowError:
+ raise ValueError('end out of range (%i)' % end)
+
+ if region:
+ region = force_str(region)
+ parts = re.split("[:-]", region)
+ reference = parts[0]
+ if len(parts) >= 2:
+ rstart = int(parts[1]) - 1
+ if len(parts) >= 3:
+ rend = int(parts[2])
+
+ if not reference:
+ return None, 0, 0
+
+ if not 0 <= rstart < MAX_POS:
+ raise ValueError('start out of range (%i)' % rstart)
+ if not 0 <= rend <= MAX_POS:
+ raise ValueError('end out of range (%i)' % rend)
+ if rstart > rend:
+ raise ValueError(
+ 'invalid region: start (%i) > end (%i)' % (rstart, rend))
+
+ return force_bytes(reference), rstart, rend
+
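+# Examples (sketch) - both calls describe the same interval:
+#
+#     parse_region("chr1", 99, 200)        # -> (b"chr1", 99, 200)
+#     parse_region(region="chr1:100-200")  # -> (b"chr1", 99, 200)
+#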
+
+def _pysam_dispatch(collection,
+ method,
+ args=None,
+ catch_stdout=True,
+ save_stdout=None):
+ '''call ``method`` in samtools/bcftools providing arguments in args.
+
+ Catching of stdout can be turned off by setting *catch_stdout* to
+ False.
+
+ '''
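+    # Sketch of a typical internal call (file name illustrative):
+    #
+    #     retval, err, out = _pysam_dispatch("samtools", "flagstat",
+    #                                        ["ex1.bam"])
+    #
+    # stdout/stderr of the run are captured via temporary files and
+    # returned as strings.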
+
+ if method == "index":
+ if not os.path.exists(args[0]):
+ raise IOError("No such file or directory: '%s'" % args[0])
+
+ if args is None:
+ args = []
+ else:
+ args = list(args)
+
+ # redirect stderr to file
+ stderr_h, stderr_f = tempfile.mkstemp()
+ pysam_set_stderr(stderr_h)
+
+ # redirect stdout to file
+ if save_stdout:
+ stdout_f = save_stdout
+ stdout_h = c_open(force_bytes(stdout_f),
+ O_WRONLY)
+ if stdout_h == -1:
+ raise OSError("error while opening {} for writing".format(stdout_f))
+
+ pysam_set_stdout_fn(force_bytes(stdout_f))
+ pysam_set_stdout(stdout_h)
+ elif catch_stdout:
+ stdout_h, stdout_f = tempfile.mkstemp()
+
+ MAP_STDOUT_OPTIONS = {
+ "samtools": {
+ "view": "-o {}",
+ "mpileup": "-o {}",
+ "depad": "-o {}",
+ "calmd": "", # uses pysam_stdout_fn
+ },
+ "bcftools": {}
+ }
+
+ stdout_option = None
+ if collection == "bcftools":
+ # in bcftools, most methods accept -o, the exceptions
+ # are below:
+ if method not in ("index", "roh", "stats"):
+ stdout_option = "-o {}"
+ elif method in MAP_STDOUT_OPTIONS[collection]:
+ # special case - samtools view -c outputs on stdout
+ if not(method == "view" and "-c" in args):
+ stdout_option = MAP_STDOUT_OPTIONS[collection][method]
+
+ if stdout_option is not None:
+ os.close(stdout_h)
+ pysam_set_stdout_fn(force_bytes(stdout_f))
+ args.extend(stdout_option.format(stdout_f).split(" "))
+ else:
+ pysam_set_stdout(stdout_h)
+ else:
+ pysam_set_stdout_fn("-")
+
+ # setup the function call to samtools/bcftools main
+ cdef char ** cargs
+ cdef int i, n, retval, l
+ n = len(args)
+ method = force_bytes(method)
+ collection = force_bytes(collection)
+ args = [force_bytes(a) for a in args]
+
+ # allocate two more for first (dummy) argument (contains command)
+ cdef int extra_args = 0
+ if method == b"index":
+ extra_args = 1
+ # add extra arguments for commands accepting optional arguments
+ # such as 'samtools index x.bam [out.index]'
+ cargs = <char**>calloc(n + 2 + extra_args, sizeof(char *))
+ cargs[0] = collection
+ cargs[1] = method
+
+ # create copies of strings - getopt for long options permutes
+ # arguments
+ for i from 0 <= i < n:
+ l = len(args[i])
+ cargs[i + 2] = <char *>calloc(l + 1, sizeof(char))
+ strncpy(cargs[i + 2], args[i], l)
+
+ # reset getopt. On OsX there getopt reset is different
+ # between getopt and getopt_long
+ if method in [b'index', b'cat', b'quickcheck',
+ b'faidx', b'kprobaln']:
+ set_optind(1)
+ else:
+ set_optind(0)
+
+ # call samtools/bcftools
+ if collection == b"samtools":
+ retval = samtools_main(n + 2, cargs)
+ elif collection == b"bcftools":
+ retval = bcftools_main(n + 2, cargs)
+
+ for i from 0 <= i < n:
+ free(cargs[i + 2])
+ free(cargs)
+
+ # get error messages
+ def _collect(fn):
+ out = []
+ try:
+ with open(fn, "r") as inf:
+ out = inf.read()
+ except UnicodeDecodeError:
+ with open(fn, "rb") as inf:
+ # read binary output
+ out = inf.read()
+ finally:
+ os.remove(fn)
+ return out
+
+ pysam_unset_stderr()
+ out_stderr = _collect(stderr_f)
+
+ if save_stdout:
+ pysam_unset_stdout()
+ out_stdout = None
+ elif catch_stdout:
+ pysam_unset_stdout()
+ out_stdout = _collect(stdout_f)
+ else:
+ out_stdout = None
+
+ return retval, out_stderr, out_stdout
+
+
+__all__ = ["qualitystring_to_array",
+ "array_to_qualitystring",
+ "qualities_to_qualitystring"]
--- /dev/null
+# cython: embedsignature=True
+#
+# Code to read, write and edit VCF files
+#
+# VCF lines are encoded as a dictionary with these keys (note: all lowercase):
+# 'chrom': string
+# 'pos': integer
+# 'id': string
+# 'ref': string
+# 'alt': list of strings
+# 'qual': integer
+# 'filter': None (missing value), or list of keys (strings); empty list parsed as ["PASS"]
+# 'info': dictionary of values (see below)
+# 'format': list of keys (strings)
+# sample keys: dictionary of values (see below)
+#
+# The sample keys are accessible through vcf.getsamples()
+#
+# A dictionary of values contains value keys (defined in ##INFO or
+# ##FORMAT lines) which map to a list, containing integers, floats,
+# strings, or characters. Missing values are replaced by a particular
+# value, often -1 or .
+#
+# Genotypes are not stored as a string, but as a list of 1 or 3
+# elements (for haploid and diploid samples), the first (and last) the
+# integer representing an allele, and the second the separation
+# character. Note that there is just one genotype per sample, but for
+# consistency the single element is stored in a list.
+#
+# Header lines other than ##INFO, ##FORMAT and ##FILTER are stored as
+# (key, value) pairs and are accessible through getheader()
+#
+# The VCF class can be instantiated with a 'regions' variable
+# consisting of tuples (chrom,start,end) encoding 0-based half-open
+# segments. Only variants with a position inside the segment will be
+# parsed. A regions parser is available under parse_regions.
+#
+# When instantiated, a reference can be passed to the VCF class. This
+# may be any class that supports a fetch(chrom, start, end) method.
+#
+# NOTE: the position that is returned to Python is 0-based, NOT
+# 1-based as in the VCF file.
+# NOTE: There is also preliminary VCF functionality in the VariantFile class.
+#
+# TODO:
+# only v4.0 writing is complete; alleles are not converted to v3.3 format
+#
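+# Usage sketch (assumes a tabix-indexed "example.vcf.gz" and the
+# connect()/fetch() methods of the VCF class below):
+#
+#   vcf = pysam.VCF()
+#   vcf.connect("example.vcf.gz")
+#   for record in vcf.fetch("chr1", 1000, 2000):
+#       print(record.contig, record.pos, record.ref, record.alt)
+#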
+
+from collections import namedtuple, defaultdict
+from operator import itemgetter
+import sys, re, copy, bisect
+
+from libc.stdlib cimport atoi
+from libc.stdint cimport int8_t, int16_t, int32_t, int64_t
+from libc.stdint cimport uint8_t, uint16_t, uint32_t, uint64_t
+
+cimport pysam.libctabix as libctabix
+cimport pysam.libctabixproxies as libctabixproxies
+
+from pysam.libcutils cimport force_str
+
+import pysam
+
+gtsRegEx = re.compile("[|/\\\\]")
+alleleRegEx = re.compile('^[ACGTN]+$')
+
+# Utility function. Uses 0-based coordinates
+def get_sequence(chrom, start, end, fa):
+ # obtain sequence from .fa file, without truncation
+ if end<=start: return ""
+ if not fa: return "N"*(end-start)
+ if start<0: return "N"*(-start) + get_sequence(chrom, 0, end, fa).upper()
+ sequence = fa.fetch(chrom, start, end).upper()
+ if len(sequence) < end-start: sequence += "N"*(end-start-len(sequence))
+ return sequence
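+# Example: with no fasta file, get_sequence("chr1", -2, 3, None)
+# returns "NNNNN" - out-of-range positions are padded with N.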
+
+# Utility function. Parses a region string
+def parse_regions( string ):
+ result = []
+ for r in string.split(','):
+ elts = r.split(':')
+ chrom, start, end = elts[0], 0, 3000000000
+ if len(elts)==1: pass
+ elif len(elts)==2:
+ if len(elts[1])>0:
+ ielts = elts[1].split('-')
+            if len(ielts) != 2: raise ValueError("Don't understand region string '%s'" % r)
+ try: start, end = int(ielts[0])-1, int(ielts[1])
+ except: raise ValueError("Don't understand region string '%s'" % r)
+ else:
+ raise ValueError("Don't understand region string '%s'" % r)
+ result.append( (chrom,start,end) )
+ return result
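+# Example (sketch):
+#     parse_regions("chr1:100-200,chr2")
+#     -> [("chr1", 99, 200), ("chr2", 0, 3000000000)]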
+
+
+FORMAT = namedtuple('FORMAT','id numbertype number type description missingvalue')
+
+###########################################################################################################
+#
+# New class
+#
+###########################################################################################################
+
+cdef class VCFRecord(libctabixproxies.TupleProxy):
+ '''vcf record.
+
+ initialized from data and vcf meta
+ '''
+
+ cdef vcf
+ cdef char * contig
+ cdef uint32_t pos
+
+ def __init__(self, vcf):
+ self.vcf = vcf
+ self.encoding = vcf.encoding
+
+ # if len(data) != len(self.vcf._samples):
+ # self.vcf.error(str(data),
+ # self.BAD_NUMBER_OF_COLUMNS,
+ # "expected %s for %s samples (%s), got %s" % \
+ # (len(self.vcf._samples),
+ # len(self.vcf._samples),
+ # self.vcf._samples,
+ # len(data)))
+
+ def __cinit__(self, vcf):
+ # start indexed access at genotypes
+ self.offset = 9
+
+ self.vcf = vcf
+ self.encoding = vcf.encoding
+
+ def error(self, line, error, opt=None):
+ '''raise error.'''
+ # pass to vcf file for error handling
+ return self.vcf.error(line, error, opt)
+
+ cdef update(self, char * buffer, size_t nbytes):
+ '''update internal data.
+
+ nbytes does not include the terminal '\0'.
+ '''
+ libctabixproxies.TupleProxy.update(self, buffer, nbytes)
+
+ self.contig = self.fields[0]
+ # vcf counts from 1 - correct here
+ self.pos = atoi(self.fields[1]) - 1
+
+ def __len__(self):
+ return max(0, self.nfields - 9)
+
+ property contig:
+ def __get__(self): return self.contig
+
+ property pos:
+ def __get__(self): return self.pos
+
+ property id:
+ def __get__(self): return self.fields[2]
+
+ property ref:
+ def __get__(self):
+ return self.fields[3]
+
+ property alt:
+ def __get__(self):
+ # convert v3.3 to v4.0 alleles below
+            alt = force_str(self.fields[4])
+ if alt == ".": alt = []
+ else: alt = alt.upper().split(',')
+ return alt
+
+ property qual:
+ def __get__(self):
+ qual = self.fields[5]
+ if qual == b".": qual = -1
+ else:
+ try: qual = float(qual)
+                except ValueError: self.vcf.error(str(self), self.vcf.QUAL_NOT_NUMERICAL)
+ return qual
+
+ property filter:
+ def __get__(self):
+            f = force_str(self.fields[6])
+            # postpone checking that filters exist. Encode missing filter or no filtering as empty list
+            if f == "." or f == "PASS" or f == "0": return []
+            else: return f.split(';')
+
+ property info:
+ def __get__(self):
+            col = force_str(self.fields[7])
+            # dictionary of keys, and list of values
+            info = {}
+            if col != ".":
+                for blurp in col.split(';'):
+                    elts = blurp.split('=')
+                    if len(elts) == 1: v = None
+                    elif len(elts) == 2: v = elts[1]
+                    else: self.vcf.error(str(self), self.vcf.ERROR_INFO_STRING)
+                    info[elts[0]] = self.vcf.parse_formatdata(elts[0], v, self.vcf._info, str(self))
+ return info
+
+ property format:
+ def __get__(self):
+            return force_str(self.fields[8]).split(':')
+
+ property samples:
+ def __get__(self):
+ return self.vcf._samples
+
+ def __getitem__(self, key):
+
+ # parse sample columns
+        values = force_str(self.fields[self.vcf._sample2column[key]]).split(':')
+ alt = self.alt
+ format = self.format
+
+ if len(values) > len(format):
+            self.vcf.error(str(self), self.vcf.BAD_NUMBER_OF_VALUES,
+                           "(found %s values in element %s; expected %s)" %
+                           (len(values),key,len(format)))
+
+ result = {}
+ for idx in range(len(format)):
+ expected = self.vcf.get_expected(format[idx], self.vcf._format, alt)
+ if idx < len(values): value = values[idx]
+ else:
+ if expected == -1: value = "."
+ else: value = ",".join(["."]*expected)
+
+            result[format[idx]] = self.vcf.parse_formatdata(format[idx], value, self.vcf._format, str(self))
+            if expected != -1 and len(result[format[idx]]) != expected:
+                self.vcf.error(str(self), self.vcf.BAD_NUMBER_OF_PARAMETERS,
+                               "id=%s, expected %s parameters, got %s" % (format[idx],expected,result[format[idx]]))
+            if len(result[format[idx]]) < expected: result[format[idx]] += [result[format[idx]][-1]]*(expected-len(result[format[idx]]))
+ result[format[idx]] = result[format[idx]][:expected]
+
+ return result
+
+
+cdef class asVCFRecord(libctabix.Parser):
+ '''converts a :term:`tabix row` into a VCF record.'''
+ cdef vcffile
+ def __init__(self, vcffile):
+ self.vcffile = vcffile
+
+ cdef parse(self, char * buffer, int len):
+ cdef VCFRecord r
+ r = VCFRecord(self.vcffile)
+ r.copy(buffer, len)
+ return r
+
+class VCF(object):
+
+ # types
+ NT_UNKNOWN = 0
+ NT_NUMBER = 1
+ NT_ALLELES = 2
+ NT_NR_ALLELES = 3
+ NT_GENOTYPES = 4
+ NT_PHASED_GENOTYPES = 5
+
+ _errors = { 0:"UNKNOWN_FORMAT_STRING:Unknown file format identifier",
+ 1:"BADLY_FORMATTED_FORMAT_STRING:Formatting error in the format string",
+ 2:"BADLY_FORMATTED_HEADING:Did not find 9 required headings (CHROM, POS, ..., FORMAT) %s",
+ 3:"BAD_NUMBER_OF_COLUMNS:Wrong number of columns found (%s)",
+ 4:"POS_NOT_NUMERICAL:Position column is not numerical",
+ 5:"UNKNOWN_CHAR_IN_REF:Unknown character in reference field",
+ 6:"V33_BAD_REF:Reference should be single-character in v3.3 VCF",
+ 7:"V33_BAD_ALLELE:Cannot interpret allele for v3.3 VCF",
+ 8:"POS_NOT_POSITIVE:Position field must be >0",
+ 9:"QUAL_NOT_NUMERICAL:Quality field must be numerical, or '.'",
+ 10:"ERROR_INFO_STRING:Error while parsing info field",
+ 11:"ERROR_UNKNOWN_KEY:Unknown key (%s) found in formatted field (info; format; or filter)",
+ 12:"ERROR_FORMAT_NOT_NUMERICAL:Expected integer or float in formatted field; got %s",
+ 13:"ERROR_FORMAT_NOT_CHAR:Eexpected character in formatted field; got string",
+ 14:"FILTER_NOT_DEFINED:Identifier (%s) in filter found which was not defined in header",
+ 15:"FORMAT_NOT_DEFINED:Identifier (%s) in format found which was not defined in header",
+ 16:"BAD_NUMBER_OF_VALUES:Found too many of values in sample column (%s)",
+ 17:"BAD_NUMBER_OF_PARAMETERS:Found unexpected number of parameters (%s)",
+ 18:"BAD_GENOTYPE:Cannot parse genotype (%s)",
+ 19:"V40_BAD_ALLELE:Bad allele found for v4.0 VCF (%s)",
+ 20:"MISSING_REF:Reference allele missing",
+ 21:"V33_UNMATCHED_DELETION:Deleted sequence does not match reference (%s)",
+ 22:"V40_MISSING_ANGLE_BRACKETS:Format definition is not deliminted by angular brackets",
+ 23:"FORMAT_MISSING_QUOTES:Description field in format definition is not surrounded by quotes",
+ 24:"V40_FORMAT_MUST_HAVE_NAMED_FIELDS:Fields in v4.0 VCF format definition must have named fields",
+ 25:"HEADING_NOT_SEPARATED_BY_TABS:Heading line appears separated by spaces, not tabs",
+ 26:"WRONG_REF:Wrong reference %s",
+ 27:"ERROR_TRAILING_DATA:Numerical field ('%s') has semicolon-separated trailing data",
+ 28:"BAD_CHR_TAG:Error calculating chr tag for %s",
+ 29:"ZERO_LENGTH_ALLELE:Found zero-length allele",
+ 30:"MISSING_INDEL_ALLELE_REF_BASE:Indel alleles must begin with single reference base",
+ 31:"ZERO_FOR_NON_FLAG_FIELD: number set to 0, but type is not 'FLAG'",
+ 32:"ERROR_FORMAT_NOT_INTEGER:Expected integer in formatted field; got %s",
+ 33:"ERROR_FLAG_HAS_VALUE:Flag fields should not have a value",
+ }
+
+ # tag-value pairs; tags are not unique; does not include fileformat, INFO, FILTER or FORMAT fields
+ _header = []
+
+ # version number; 33=v3.3; 40=v4.0
+ _version = 40
+
+ # info, filter and format data
+ _info = {}
+ _filter = {}
+ _format = {}
+
+ # header; and required columns
+ _required = ["CHROM","POS","ID","REF","ALT","QUAL","FILTER","INFO","FORMAT"]
+ _samples = []
+
+ # control behaviour
+    _ignored_errors = set([11,31]) # ERROR_UNKNOWN_KEY, ZERO_FOR_NON_FLAG_FIELD
+ _warn_errors = set([])
+ _leftalign = False
+
+ # reference sequence
+ _reference = None
+
+ # regions to include; None includes everything
+ _regions = None
+
+    # stateful stuff
+ _lineno = -1
+ _line = None
+ _lines = None
+
+ def __init__(self, _copy=None, reference=None, regions=None,
+ lines=None, leftalign=False):
+ # make error identifiers accessible by name
+ for id in self._errors.keys():
+ self.__dict__[self._errors[id].split(':')[0]] = id
+ if _copy != None:
+ self._leftalign = _copy._leftalign
+ self._header = _copy._header[:]
+ self._version = _copy._version
+ self._info = copy.deepcopy(_copy._info)
+ self._filter = copy.deepcopy(_copy._filter)
+ self._format = copy.deepcopy(_copy._format)
+ self._samples = _copy._samples[:]
+ self._sample2column = copy.deepcopy(_copy._sample2column)
+ self._ignored_errors = copy.deepcopy(_copy._ignored_errors)
+ self._warn_errors = copy.deepcopy(_copy._warn_errors)
+ self._reference = _copy._reference
+ self._regions = _copy._regions
+ if reference: self._reference = reference
+ if regions: self._regions = regions
+ if leftalign: self._leftalign = leftalign
+ self._lines = lines
+ self.encoding = "ascii"
+ self.tabixfile = None
+
+ def error(self,line,error,opt=None):
+ if error in self._ignored_errors: return
+ errorlabel, errorstring = self._errors[error].split(':')
+ if opt: errorstring = errorstring % opt
+ errwarn = ["Error","Warning"][error in self._warn_errors]
+ errorstring += " in line %s: '%s'\n%s %s: %s\n" % (self._lineno,line,errwarn,errorlabel,errorstring)
+ if error in self._warn_errors: return
+ raise ValueError(errorstring)
+
+ def parse_format(self,line,format,filter=False):
+ if self._version == 40:
+ if not format.startswith('<'):
+ self.error(line,self.V40_MISSING_ANGLE_BRACKETS)
+ format = "<"+format
+ if not format.endswith('>'):
+ self.error(line,self.V40_MISSING_ANGLE_BRACKETS)
+ format += ">"
+ format = format[1:-1]
+ data = {'id':None,'number':None,'type':None,'descr':None}
+ idx = 0
+ while len(format.strip())>0:
+ elts = format.strip().split(',')
+ first, rest = elts[0], ','.join(elts[1:])
+ if first.find('=') == -1 or (first.find('"')>=0 and first.find('=') > first.find('"')):
+ if self._version == 40: self.error(line,self.V40_FORMAT_MUST_HAVE_NAMED_FIELDS)
+ if idx == 4: self.error(line,self.BADLY_FORMATTED_FORMAT_STRING)
+ first = ["ID=","Number=","Type=","Description="][idx] + first
+ if first.startswith('ID='): data['id'] = first.split('=')[1]
+ elif first.startswith('Number='): data['number'] = first.split('=')[1]
+ elif first.startswith('Type='): data['type'] = first.split('=')[1]
+ elif first.startswith('Description='):
+ elts = format.split('"')
+ if len(elts)<3:
+ self.error(line,self.FORMAT_MISSING_QUOTES)
+ elts = first.split('=') + [rest]
+ data['descr'] = elts[1]
+ rest = '"'.join(elts[2:])
+ if rest.startswith(','): rest = rest[1:]
+ else:
+ self.error(line,self.BADLY_FORMATTED_FORMAT_STRING)
+ format = rest
+ idx += 1
+ if filter and idx==1: idx=3 # skip number and type fields for FILTER format strings
+ if not data['id']: self.error(line,self.BADLY_FORMATTED_FORMAT_STRING)
+ if 'descr' not in data:
+ # missing description
+ self.error(line,self.BADLY_FORMATTED_FORMAT_STRING)
+ data['descr'] = ""
+ if not data['type'] and not data['number']:
+ # fine, ##filter format
+ return FORMAT(data['id'],self.NT_NUMBER,0,"Flag",data['descr'],'.')
+ if not data['type'] in ["Integer","Float","Character","String","Flag"]:
+ self.error(line,self.BADLY_FORMATTED_FORMAT_STRING)
+ # I would like a missing-value field, but it isn't there
+ if data['type'] in ['Integer','Float']: data['missing'] = None # Do NOT use arbitrary int/float as missing value
+ else: data['missing'] = '.'
+ if not data['number']: self.error(line,self.BADLY_FORMATTED_FORMAT_STRING)
+ try:
+ n = int(data['number'])
+ t = self.NT_NUMBER
+ except ValueError:
+ n = -1
+ if data['number'] == '.': t = self.NT_UNKNOWN
+ elif data['number'] == '#alleles': t = self.NT_ALLELES
+ elif data['number'] == '#nonref_alleles': t = self.NT_NR_ALLELES
+ elif data['number'] == '#genotypes': t = self.NT_GENOTYPES
+            elif data['number'] == '#phased_genotypes': t = self.NT_PHASED_GENOTYPES
+ # abbreviations added in VCF version v4.1
+ elif data['number'] == 'A': t = self.NT_ALLELES
+ elif data['number'] == 'G': t = self.NT_GENOTYPES
+ else:
+ self.error(line,self.BADLY_FORMATTED_FORMAT_STRING)
+ # if number is 0 - type must be Flag
+ if n == 0 and data['type'] != 'Flag':
+ self.error( line, self.ZERO_FOR_NON_FLAG_FIELD)
+ # force type 'Flag' if no number
+ data['type'] = 'Flag'
+
+ return FORMAT(data['id'],t,n,data['type'],data['descr'],data['missing'])
+
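+    # Example (sketch): parse_format() on a v4.0 INFO definition such
+    # as '<ID=DP,Number=1,Type=Integer,Description="Total Depth">'
+    # returns FORMAT(id='DP', numbertype=NT_NUMBER, number=1,
+    # type='Integer', description='Total Depth', missingvalue=None).
+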
+ def format_format( self, fmt, filter=False ):
+ values = [('ID',fmt.id)]
+ if fmt.number != None and not filter:
+ if fmt.numbertype == self.NT_UNKNOWN: nmb = "."
+ elif fmt.numbertype == self.NT_NUMBER: nmb = str(fmt.number)
+ elif fmt.numbertype == self.NT_ALLELES: nmb = "#alleles"
+ elif fmt.numbertype == self.NT_NR_ALLELES: nmb = "#nonref_alleles"
+ elif fmt.numbertype == self.NT_GENOTYPES: nmb = "#genotypes"
+ elif fmt.numbertype == self.NT_PHASED_GENOTYPES: nmb = "#phased_genotypes"
+ else:
+ raise ValueError("Unknown number type encountered: %s" % fmt.numbertype)
+ values.append( ('Number',nmb) )
+ values.append( ('Type', fmt.type) )
+ values.append( ('Description', '"' + fmt.description + '"') )
+ if self._version == 33:
+ format = ",".join([v for k,v in values])
+ else:
+ format = "<" + (",".join( ["%s=%s" % (k,v) for (k,v) in values] )) + ">"
+ return format
+
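+    # Example: with alt == ["A", "T"], a Number=G (NT_GENOTYPES) tag
+    # expects ((2 + 1) * (2 + 2)) // 2 == 6 values from get_expected.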
+ def get_expected(self, format, formatdict, alt):
+ fmt = formatdict[format]
+ if fmt.numbertype == self.NT_UNKNOWN: return -1
+ if fmt.numbertype == self.NT_NUMBER: return fmt.number
+ if fmt.numbertype == self.NT_ALLELES: return len(alt)+1
+ if fmt.numbertype == self.NT_NR_ALLELES: return len(alt)
+ if fmt.numbertype == self.NT_GENOTYPES: return ((len(alt)+1)*(len(alt)+2)) // 2
+ if fmt.numbertype == self.NT_PHASED_GENOTYPES: return (len(alt)+1)*(len(alt)+1)
+ return 0
+
+
+ def _add_definition(self, formatdict, key, data, line ):
+ if key in formatdict: return
+ self.error(line,self.ERROR_UNKNOWN_KEY,key)
+ if data == None:
+ formatdict[key] = FORMAT(key,self.NT_NUMBER,0,"Flag","(Undefined tag)",".")
+ return
+ if data == []: data = [""] # unsure what type -- say string
+ if type(data[0]) == type(0.0):
+ formatdict[key] = FORMAT(key,self.NT_UNKNOWN,-1,"Float","(Undefined tag)",None)
+ return
+ if type(data[0]) == type(0):
+ formatdict[key] = FORMAT(key,self.NT_UNKNOWN,-1,"Integer","(Undefined tag)",None)
+ return
+ formatdict[key] = FORMAT(key,self.NT_UNKNOWN,-1,"String","(Undefined tag)",".")
+
+
+ # todo: trim trailing missing values
+ def format_formatdata( self, data, format, key=True, value=True, separator=":" ):
+ output, sdata = [], []
+ if type(data) == type([]): # for FORMAT field, make data with dummy values
+ d = {}
+ for k in data: d[k] = []
+ data = d
+ # convert missing values; and silently add definitions if required
+ for k in data:
+ self._add_definition( format, k, data[k], "(output)" )
+ for idx,v in enumerate(data[k]):
+ if v == format[k].missingvalue: data[k][idx] = "."
+ # make sure GT comes first; and ensure fixed ordering; also convert GT data back to string
+ for k in data:
+ if k != 'GT': sdata.append( (k,data[k]) )
+ sdata.sort()
+ if 'GT' in data:
+ sdata = [('GT',map(self.convertGTback,data['GT']))] + sdata
+ for k,v in sdata:
+ if v == []: v = None
+ if key and value:
+ if v != None: output.append( k+"="+','.join(map(str,v)) )
+ else: output.append( k )
+ elif key: output.append(k)
+ elif value:
+ if v != None: output.append( ','.join(map(str,v)) )
+ else: output.append( "." ) # should not happen
+ # snip off trailing missing data
+ while len(output) > 1:
+ last = output[-1].replace(',','').replace('.','')
+ if len(last)>0: break
+ output = output[:-1]
+ return separator.join(output)
+
+
+ def enter_default_format(self):
+ for f in [FORMAT('GT',self.NT_NUMBER,1,'String','Genotype','.'),
+ FORMAT('DP',self.NT_NUMBER,1,'Integer','Read depth at this position for this sample',-1),
+ FORMAT('FT',self.NT_NUMBER,1,'String','Sample Genotype Filter','.'),
+ FORMAT('GL',self.NT_UNKNOWN,-1,'Float','Genotype likelihoods','.'),
+ FORMAT('GLE',self.NT_UNKNOWN,-1,'Float','Genotype likelihoods','.'),
+ FORMAT('GQ',self.NT_NUMBER,1,'Integer','Genotype Quality',-1),
+ FORMAT('PL',self.NT_GENOTYPES,-1,'Integer','Phred-scaled genotype likelihoods', '.'),
+ FORMAT('GP',self.NT_GENOTYPES,-1,'Float','Genotype posterior probabilities','.'),
+ FORMAT('GQ',self.NT_GENOTYPES,-1,'Integer','Conditional genotype quality','.'),
+ FORMAT('HQ',self.NT_UNKNOWN,-1,'Integer','Haplotype Quality',-1), # unknown number, since may be haploid
+ FORMAT('PS',self.NT_UNKNOWN,-1,'Integer','Phase set','.'),
+ FORMAT('PQ',self.NT_NUMBER,1,'Integer','Phasing quality',-1),
+                  FORMAT('EC',self.NT_ALLELES,1,'Integer','Expected alternate allele counts',-1),
+ FORMAT('MQ',self.NT_NUMBER,1,'Integer','RMS mapping quality',-1),
+ ]:
+ if f.id not in self._format:
+ self._format[f.id] = f
+
+ def parse_header(self, line):
+
+ assert line.startswith('##')
+ elts = line[2:].split('=')
+ key = elts[0].strip()
+ value = '='.join(elts[1:]).strip()
+ if key == "fileformat":
+ if value == "VCFv3.3":
+ self._version = 33
+ elif value == "VCFv4.0":
+ self._version = 40
+ elif value == "VCFv4.1":
+ # AH - for testing
+ self._version = 40
+ elif value == "VCFv4.2":
+ # AH - for testing
+ self._version = 40
+ else:
+ self.error(line,self.UNKNOWN_FORMAT_STRING)
+ elif key == "INFO":
+ f = self.parse_format(line, value)
+ self._info[ f.id ] = f
+ elif key == "FILTER":
+ f = self.parse_format(line, value, filter=True)
+ self._filter[ f.id ] = f
+ elif key == "FORMAT":
+ f = self.parse_format(line, value)
+ self._format[ f.id ] = f
+ else:
+ # keep other keys in the header field
+ self._header.append( (key,value) )
+
+
+ def write_header( self, stream ):
+ stream.write("##fileformat=VCFv%s.%s\n" % (self._version // 10, self._version % 10))
+ for key,value in self._header: stream.write("##%s=%s\n" % (key,value))
+ for var,label in [(self._info,"INFO"),(self._filter,"FILTER"),(self._format,"FORMAT")]:
+            for f in var.values(): stream.write("##%s=%s\n" % (label,self.format_format(f,filter=(label=="FILTER"))))
+
+
+ def parse_heading( self, line ):
+ assert line.startswith('#')
+ assert not line.startswith('##')
+ headings = line[1:].split('\t')
+ # test for 8, as FORMAT field might be missing
+ if len(headings)==1 and len(line[1:].split()) >= 8:
+ self.error(line,self.HEADING_NOT_SEPARATED_BY_TABS)
+ headings = line[1:].split()
+
+ for i,s in enumerate(self._required):
+
+ if len(headings)<=i or headings[i] != s:
+
+ if len(headings) <= i:
+ err = "(%sth entry not found)" % (i+1)
+ else:
+ err = "(found %s, expected %s)" % (headings[i],s)
+
+ #self.error(line,self.BADLY_FORMATTED_HEADING,err)
+ # allow FORMAT column to be absent
+ if len(headings) == 8:
+ headings.append("FORMAT")
+ else:
+ self.error(line,self.BADLY_FORMATTED_HEADING,err)
+
+ self._samples = headings[9:]
+ self._sample2column = dict( [(y,x+9) for x,y in enumerate( self._samples ) ] )
+
+ def write_heading( self, stream ):
+ stream.write("#" + "\t".join(self._required + self._samples) + "\n")
+
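+    # Examples (sketch): convertGT("0/1") -> [0, "/", 1];
+    # convertGT(".") -> ["."]; a haploid "2" -> [2].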
+ def convertGT(self, GTstring):
+ if GTstring == ".": return ["."]
+ try:
+ gts = gtsRegEx.split(GTstring)
+ if len(gts) == 1: return [int(gts[0])]
+ if len(gts) != 2: raise ValueError()
+ if gts[0] == "." and gts[1] == ".": return [gts[0],GTstring[len(gts[0]):-len(gts[1])],gts[1]]
+ return [int(gts[0]),GTstring[len(gts[0]):-len(gts[1])],int(gts[1])]
+ except ValueError:
+ self.error(self._line,self.BAD_GENOTYPE,GTstring)
+ return [".","|","."]
+
+ def convertGTback(self, GTdata):
+ return ''.join(map(str,GTdata))
+
+ def parse_formatdata( self, key, value, formatdict, line ):
+ # To do: check that the right number of values is present
+ f = formatdict.get(key,None)
+ if f == None:
+ self._add_definition(formatdict, key, value, line )
+ f = formatdict[key]
+ if f.type == "Flag":
+ if value is not None: self.error(line,self.ERROR_FLAG_HAS_VALUE)
+ return []
+ values = value.split(',')
+ # deal with trailing data in some early VCF files
+ if f.type in ["Float","Integer"] and len(values)>0 and values[-1].find(';') > -1:
+ self.error(line,self.ERROR_TRAILING_DATA,values[-1])
+ values[-1] = values[-1].split(';')[0]
+ if f.type == "Integer":
+ for idx,v in enumerate(values):
+ try:
+ if v == ".": values[idx] = f.missingvalue
+ else: values[idx] = int(v)
+ except:
+ self.error(line,self.ERROR_FORMAT_NOT_INTEGER,"%s=%s" % (key, str(values)))
+ return [0] * len(values)
+ return values
+ elif f.type == "String":
+ self._line = line
+ if f.id == "GT": values = list(map( self.convertGT, values ))
+ return values
+ elif f.type == "Character":
+ for v in values:
+ if len(v) != 1: self.error(line,self.ERROR_FORMAT_NOT_CHAR)
+ return values
+ elif f.type == "Float":
+ for idx,v in enumerate(values):
+ if v == ".": values[idx] = f.missingvalue
+ try: return list(map(float,values))
+ except:
+ self.error(line,self.ERROR_FORMAT_NOT_NUMERICAL,"%s=%s" % (key, str(values)))
+ return [0.0] * len(values)
+ else:
+ # can't happen
+ self.error(line,self.ERROR_INFO_STRING)
+
+ def inregion(self, chrom, pos):
+ if not self._regions: return True
+ for r in self._regions:
+ if r[0] == chrom and r[1] <= pos < r[2]: return True
+ return False
+
+ def parse_data( self, line, lineparse=False ):
+ cols = line.split('\t')
+ if len(cols) != len(self._samples)+9:
+ # gracefully deal with absent FORMAT column
+ # and those missing samples
+ if len(cols) == 8:
+ cols.append("")
+ else:
+ self.error(line,
+ self.BAD_NUMBER_OF_COLUMNS,
+ "expected %s for %s samples (%s), got %s" % (len(self._samples)+9, len(self._samples), self._samples, len(cols)))
+
+ chrom = cols[0]
+
+ # get 0-based position
+ try: pos = int(cols[1])-1
+ except: self.error(line,self.POS_NOT_NUMERICAL)
+ if pos < 0: self.error(line,self.POS_NOT_POSITIVE)
+
+ # implement filtering
+ if not self.inregion(chrom,pos): return None
+
+ # end of first-pass parse for sortedVCF
+ if lineparse: return chrom, pos, line
+
+ id = cols[2]
+
+ ref = cols[3].upper()
+ if ref == ".":
+ self.error(line,self.MISSING_REF)
+ if self._version == 33: ref = get_sequence(chrom,pos,pos+1,self._reference)
+ else: ref = ""
+ else:
+ for c in ref:
+ if c not in "ACGTN": self.error(line,self.UNKNOWN_CHAR_IN_REF)
+ if "N" in ref: ref = get_sequence(chrom,pos,pos+len(ref),self._reference)
+
+ # make sure reference is sane
+ if self._reference:
+ left = max(0,pos-100)
+ faref_leftflank = get_sequence(chrom,left,pos+len(ref),self._reference)
+ faref = faref_leftflank[pos-left:]
+ if faref != ref: self.error(line,self.WRONG_REF,"(reference is %s, VCF says %s)" % (faref,ref))
+ ref = faref
+
+ # convert v3.3 to v4.0 alleles below
+ if cols[4] == ".": alt = []
+ else: alt = cols[4].upper().split(',')
+
+ if cols[5] == ".": qual = -1
+ else:
+ try: qual = float(cols[5])
+            except ValueError: self.error(line,self.QUAL_NOT_NUMERICAL)
+
+ # postpone checking that filters exist. Encode missing filter or no filtering as empty list
+ if cols[6] == "." or cols[6] == "PASS" or cols[6] == "0": filter = []
+ else: filter = cols[6].split(';')
+
+ # dictionary of keys, and list of values
+ info = {}
+ if cols[7] != ".":
+ for blurp in cols[7].split(';'):
+ elts = blurp.split('=')
+ if len(elts) == 1: v = None
+ elif len(elts) == 2: v = elts[1]
+ else: self.error(line,self.ERROR_INFO_STRING)
+ info[elts[0]] = self.parse_formatdata(elts[0],
+ v,
+ self._info,
+ line)
+
+ # Gracefully deal with absent FORMAT column
+ if cols[8] == "": format = []
+ else: format = cols[8].split(':')
+
+ # check: all filters are defined
+ for f in filter:
+ if f not in self._filter: self.error(line,self.FILTER_NOT_DEFINED, f)
+
+ # check: format fields are defined
+ if self._format:
+ for f in format:
+ if f not in self._format: self.error(line,self.FORMAT_NOT_DEFINED, f)
+
+ # convert v3.3 alleles
+ if self._version == 33:
+ if len(ref) != 1: self.error(line,self.V33_BAD_REF)
+ newalts = []
+ have_deletions = False
+ for a in alt:
+ if len(a) == 1: a = a + ref[1:] # SNP; add trailing reference
+ elif a.startswith('I'): a = ref[0] + a[1:] + ref[1:] # insertion just beyond pos; add first and trailing reference
+ elif a.startswith('D'): # allow D<seq> and D<num>
+ have_deletions = True
+ try:
+ l = int(a[1:]) # throws ValueError if sequence
+ if len(ref) < l: # add to reference if necessary
+ addns = get_sequence(chrom,pos+len(ref),pos+l,self._reference)
+ ref += addns
+ for i,na in enumerate(newalts): newalts[i] = na+addns
+ a = ref[l:] # new deletion, deleting pos...pos+l
+ except ValueError:
+ s = a[1:]
+ if len(ref) < len(s): # add Ns to reference if necessary
+ addns = get_sequence(chrom,pos+len(ref),pos+len(s),self._reference)
+ if not s.endswith(addns) and addns != 'N'*len(addns):
+ self.error(line,self.V33_UNMATCHED_DELETION,
+ "(deletion is %s, reference is %s)" % (a,get_sequence(chrom,pos,pos+len(s),self._reference)))
+ ref += addns
+ for i,na in enumerate(newalts): newalts[i] = na+addns
+ a = ref[len(s):] # new deletion, deleting from pos
+ else:
+ self.error(line,self.V33_BAD_ALLELE)
+ newalts.append(a)
+ alt = newalts
+ # deletion alleles exist, add dummy 1st reference allele, and account for leading base
+ if have_deletions:
+ if pos == 0:
+                # Petr Danecek's rule: we can't have a leading nucleotide at (1-based) position 1
+ addn = get_sequence(chrom,pos+len(ref),pos+len(ref)+1,self._reference)
+ ref += addn
+ alt = [allele+addn for allele in alt]
+ else:
+ addn = get_sequence(chrom,pos-1,pos,self._reference)
+ ref = addn + ref
+ alt = [addn + allele for allele in alt]
+ pos -= 1
+ else:
+ # format v4.0 -- just check for nucleotides
+ for allele in alt:
+ if not alleleRegEx.match(allele):
+ self.error(line,self.V40_BAD_ALLELE,allele)
+
+ # check for leading nucleotide in indel calls
+ for allele in alt:
+ if len(allele) != len(ref):
+ if len(allele) == 0: self.error(line,self.ZERO_LENGTH_ALLELE)
+ if ref[0].upper() != allele[0].upper() and "N" not in (ref[0]+allele[0]).upper():
+ self.error(line,self.MISSING_INDEL_ALLELE_REF_BASE)
+
+        # trim trailing bases in alleles
+        # AH: not certain why this trimming needs to be done;
+        # disabled for now to keep the unit tests passing
+ # if alt:
+ # for i in range(1,min(len(ref),min(map(len,alt)))):
+ # if len(set(allele[-1].upper() for allele in alt)) > 1 or ref[-1].upper() != alt[0][-1].upper():
+ # break
+ # ref, alt = ref[:-1], [allele[:-1] for allele in alt]
+
+ # left-align alleles, if a reference is available
+ if self._leftalign and self._reference:
+ while left < pos:
+ movable = True
+ for allele in alt:
+ if len(allele) > len(ref):
+ longest, shortest = allele, ref
+ else:
+ longest, shortest = ref, allele
+ if len(longest) == len(shortest) or longest[:len(shortest)].upper() != shortest.upper():
+ movable = False
+ if longest[-1].upper() != longest[len(shortest)-1].upper():
+ movable = False
+ if not movable:
+ break
+ ref = ref[:-1]
+ alt = [allele[:-1] for allele in alt]
+ if min([len(allele) for allele in alt]) == 0 or len(ref) == 0:
+ ref = faref_leftflank[pos-left-1] + ref
+ alt = [faref_leftflank[pos-left-1] + allele for allele in alt]
+ pos -= 1
+
+ # parse sample columns
+ samples = []
+ for sample in cols[9:]:
+ dict = {}
+ values = sample.split(':')
+ if len(values) > len(format):
+ self.error(line,self.BAD_NUMBER_OF_VALUES,"(found %s values in element %s; expected %s)" % (len(values),sample,len(format)))
+ for idx in range(len(format)):
+ expected = self.get_expected(format[idx], self._format, alt)
+ if idx < len(values): value = values[idx]
+ else:
+ if expected == -1: value = "."
+ else: value = ",".join(["."]*expected)
+
+ dict[format[idx]] = self.parse_formatdata(format[idx],
+ value,
+ self._format,
+ line)
+ if expected != -1 and len(dict[format[idx]]) != expected:
+ self.error(line,self.BAD_NUMBER_OF_PARAMETERS,
+ "id=%s, expected %s parameters, got %s" % (format[idx],expected,dict[format[idx]]))
+ if len(dict[format[idx]] ) < expected: dict[format[idx]] += [dict[format[idx]][-1]]*(expected-len(dict[format[idx]]))
+ dict[format[idx]] = dict[format[idx]][:expected]
+ samples.append( dict )
+
+ # done
+ d = {'chrom':chrom,
+ 'pos':pos, # return 0-based position
+ 'id':id,
+ 'ref':ref,
+ 'alt':alt,
+ 'qual':qual,
+ 'filter':filter,
+ 'info':info,
+ 'format':format}
+ for key,value in zip(self._samples,samples):
+ d[key] = value
+
+ return d
+
+
+ def write_data(self, stream, data):
+ required = ['chrom','pos','id','ref','alt','qual','filter','info','format'] + self._samples
+ for k in required:
+ if k not in data: raise ValueError("Required key %s not found in data" % str(k))
+ if data['alt'] == []: alt = "."
+ else: alt = ",".join(data['alt'])
+ if data['filter'] == None: filter = "."
+ elif data['filter'] == []:
+ if self._version == 33: filter = "0"
+ else: filter = "PASS"
+ else: filter = ';'.join(data['filter'])
+ if data['qual'] == -1: qual = "."
+ else: qual = str(data['qual'])
+
+ output = [data['chrom'],
+ str(data['pos']+1), # change to 1-based position
+ data['id'],
+ data['ref'],
+ alt,
+ qual,
+ filter,
+ self.format_formatdata(
+ data['info'], self._info, separator=";"),
+ self.format_formatdata(
+ data['format'], self._format, value=False)]
+
+ for s in self._samples:
+ output.append(self.format_formatdata(
+ data[s], self._format, key=False))
+
+ stream.write( "\t".join(output) + "\n" )
+
+ def _parse_header(self, stream):
+ self._lineno = 0
+ for line in stream:
+ line = force_str(line, self.encoding)
+ self._lineno += 1
+ if line.startswith('##'):
+ self.parse_header(line.strip())
+ elif line.startswith('#'):
+ self.parse_heading(line.strip())
+ self.enter_default_format()
+ else:
+ break
+ return line
+
+ def _parse(self, line, stream):
+ # deal with files with header only
+ if line.startswith("##"): return
+ if len(line.strip()) > 0:
+ d = self.parse_data( line.strip() )
+ if d: yield d
+ for line in stream:
+ self._lineno += 1
+            if self._lines and self._lineno > self._lines: return  # PEP 479: end the generator instead of raising StopIteration
+ d = self.parse_data( line.strip() )
+ if d: yield d
+
+ ######################################################################################################
+ #
+ # API follows
+ #
+ ######################################################################################################
+
+ def getsamples(self):
+ """ List of samples in VCF file """
+ return self._samples
+
+ def setsamples(self,samples):
+ """ List of samples in VCF file """
+ self._samples = samples
+
+ def getheader(self):
+ """ List of header key-value pairs (strings) """
+ return self._header
+
+ def setheader(self,header):
+ """ List of header key-value pairs (strings) """
+ self._header = header
+
+ def getinfo(self):
+ """ Dictionary of ##INFO tags, as VCF.FORMAT values """
+ return self._info
+
+ def setinfo(self,info):
+ """ Dictionary of ##INFO tags, as VCF.FORMAT values """
+ self._info = info
+
+ def getformat(self):
+ """ Dictionary of ##FORMAT tags, as VCF.FORMAT values """
+ return self._format
+
+ def setformat(self,format):
+ """ Dictionary of ##FORMAT tags, as VCF.FORMAT values """
+ self._format = format
+
+ def getfilter(self):
+ """ Dictionary of ##FILTER tags, as VCF.FORMAT values """
+ return self._filter
+
+ def setfilter(self,filter):
+ """ Dictionary of ##FILTER tags, as VCF.FORMAT values """
+ self._filter = filter
+
+ def setversion(self, version):
+ if version != 33 and version != 40: raise ValueError("Can only handle v3.3 and v4.0 VCF files")
+ self._version = version
+
+ def setregions(self, regions):
+ self._regions = regions
+
+ def setreference(self, ref):
+ """ Provide a reference sequence; a Python class supporting a fetch(chromosome, start, end) method, e.g. PySam.FastaFile """
+ self._reference = ref
+
+ def ignoreerror(self, errorstring):
+ try: self._ignored_errors.add(self.__dict__[errorstring])
+ except KeyError: raise ValueError("Invalid error string: %s" % errorstring)
+
+ def warnerror(self, errorstring):
+ try: self._warn_errors.add(self.__dict__[errorstring])
+ except KeyError: raise ValueError("Invalid error string: %s" % errorstring)
+
+ def parse(self, stream):
+ """ Parse a stream of VCF-formatted lines. Initializes class instance and return generator """
+ last_line = self._parse_header(stream)
+ # now return a generator that does the actual work. In this way the pre-processing is done
+ # before the first piece of data is yielded
+ return self._parse(last_line, stream)
+
+ def write(self, stream, datagenerator):
+ """ Writes a VCF file to a stream, using a data generator (or list) """
+ self.write_header(stream)
+ self.write_heading(stream)
+ for data in datagenerator: self.write_data(stream,data)
+
+ def writeheader(self, stream):
+ """ Writes a VCF header """
+ self.write_header(stream)
+ self.write_heading(stream)
+
+ def compare_calls(self, pos1, ref1, alt1, pos2, ref2, alt2):
+ """ Utility function: compares two calls for equality """
+ # a variant should always be assigned to a unique position, one base before
+ # the leftmost position of the alignment gap. If this rule is implemented
+ # correctly, the two positions must be equal for the calls to be identical.
+ if pos1 != pos2: return False
+ # from both calls, trim rightmost bases when identical. Do this safely, i.e.
+ # only when the reference bases are not Ns
+ while len(ref1)>0 and len(alt1)>0 and ref1[-1] == alt1[-1]:
+ ref1 = ref1[:-1]
+ alt1 = alt1[:-1]
+ while len(ref2)>0 and len(alt2)>0 and ref2[-1] == alt2[-1]:
+ ref2 = ref2[:-1]
+ alt2 = alt2[:-1]
+ # now, the alternative alleles must be identical
+ return alt1 == alt2
+
+###########################################################################################################
+###########################################################################################################
+## API functions added by Andreas
+###########################################################################################################
+
+ def connect(self, filename, encoding="ascii"):
+ '''connect to tabix file.'''
+ self.encoding=encoding
+ self.tabixfile = pysam.Tabixfile(filename, encoding=encoding)
+ self._parse_header(self.tabixfile.header)
+
+ def __del__(self):
+ self.close()
+ self.tabixfile = None
+
+ def close(self):
+ if self.tabixfile:
+ self.tabixfile.close()
+ self.tabixfile = None
+
+ def fetch(self,
+ reference=None,
+ start=None,
+ end=None,
+ region=None ):
+ """ Parse a stream of VCF-formatted lines.
+ Initializes class instance and return generator """
+ return self.tabixfile.fetch(
+ reference,
+ start,
+ end,
+ region,
+ parser = asVCFRecord(self))
+
+ def validate(self, record):
+ '''validate vcf record.
+
+ returns a validated record.
+ '''
+
+ raise NotImplementedError("needs to be checked")
+
+        # also bind alt: the allele checks below use it (assumes the
+        # VCFRecord proxy exposes ref/alt as properties)
+        chrom, pos, alt = record.chrom, record.pos, record.alt
+
+ # check reference
+ ref = record.ref
+ if ref == ".":
+ self.error(str(record),self.MISSING_REF)
+ if self._version == 33: ref = get_sequence(chrom,pos,pos+1,self._reference)
+ else: ref = ""
+ else:
+ for c in ref:
+ if c not in "ACGTN": self.error(str(record),self.UNKNOWN_CHAR_IN_REF)
+ if "N" in ref: ref = get_sequence(chrom,
+ pos,
+ pos+len(ref),
+ self._reference)
+
+ # make sure reference is sane
+ if self._reference:
+            left = max(0,pos-100)
+            faref_leftflank = get_sequence(chrom,left,pos+len(ref),self._reference)
+ faref = faref_leftflank[pos-left:]
+ if faref != ref: self.error(str(record),self.WRONG_REF,"(reference is %s, VCF says %s)" % (faref,ref))
+ ref = faref
+
+ # check: format fields are defined
+ for f in record.format:
+ if f not in self._format: self.error(str(record),self.FORMAT_NOT_DEFINED, f)
+
+ # check: all filters are defined
+ for f in record.filter:
+ if f not in self._filter: self.error(str(record),self.FILTER_NOT_DEFINED, f)
+
+ # convert v3.3 alleles
+ if self._version == 33:
+ if len(ref) != 1: self.error(str(record),self.V33_BAD_REF)
+ newalts = []
+ have_deletions = False
+ for a in alt:
+ if len(a) == 1: a = a + ref[1:] # SNP; add trailing reference
+ elif a.startswith('I'): a = ref[0] + a[1:] + ref[1:] # insertion just beyond pos; add first and trailing reference
+ elif a.startswith('D'): # allow D<seq> and D<num>
+ have_deletions = True
+ try:
+ l = int(a[1:]) # throws ValueError if sequence
+ if len(ref) < l: # add to reference if necessary
+ addns = get_sequence(chrom,pos+len(ref),pos+l,self._reference)
+ ref += addns
+ for i,na in enumerate(newalts): newalts[i] = na+addns
+ a = ref[l:] # new deletion, deleting pos...pos+l
+ except ValueError:
+ s = a[1:]
+ if len(ref) < len(s): # add Ns to reference if necessary
+ addns = get_sequence(chrom,pos+len(ref),pos+len(s),self._reference)
+ if not s.endswith(addns) and addns != 'N'*len(addns):
+ self.error(str(record),self.V33_UNMATCHED_DELETION,
+ "(deletion is %s, reference is %s)" % (a,get_sequence(chrom,pos,pos+len(s),self._reference)))
+ ref += addns
+ for i,na in enumerate(newalts): newalts[i] = na+addns
+ a = ref[len(s):] # new deletion, deleting from pos
+ else:
+ self.error(str(record),self.V33_BAD_ALLELE)
+ newalts.append(a)
+ alt = newalts
+ # deletion alleles exist, add dummy 1st reference allele, and account for leading base
+ if have_deletions:
+ if pos == 0:
+                # Petr Danecek's rule: we can't have a leading nucleotide at (1-based) position 1
+ addn = get_sequence(chrom,pos+len(ref),pos+len(ref)+1,self._reference)
+ ref += addn
+ alt = [allele+addn for allele in alt]
+ else:
+ addn = get_sequence(chrom,pos-1,pos,self._reference)
+ ref = addn + ref
+ alt = [addn + allele for allele in alt]
+ pos -= 1
+ else:
+ # format v4.0 -- just check for nucleotides
+ for allele in alt:
+ if not alleleRegEx.match(allele):
+ self.error(str(record),self.V40_BAD_ALLELE,allele)
+
+
+ # check for leading nucleotide in indel calls
+ for allele in alt:
+ if len(allele) != len(ref):
+ if len(allele) == 0: self.error(str(record),self.ZERO_LENGTH_ALLELE)
+ if ref[0].upper() != allele[0].upper() and "N" not in (ref[0]+allele[0]).upper():
+ self.error(str(record),self.MISSING_INDEL_ALLELE_REF_BASE)
+
+        # trim trailing bases in alleles
+        # AH: not certain why this trimming needs to be done;
+        # disabled for now to keep the unit tests passing
+ # for i in range(1,min(len(ref),min(map(len,alt)))):
+ # if len(set(allele[-1].upper() for allele in alt)) > 1 or ref[-1].upper() != alt[0][-1].upper():
+ # break
+ # ref, alt = ref[:-1], [allele[:-1] for allele in alt]
+
+ # left-align alleles, if a reference is available
+ if self._leftalign and self._reference:
+ while left < pos:
+ movable = True
+ for allele in alt:
+ if len(allele) > len(ref):
+ longest, shortest = allele, ref
+ else:
+ longest, shortest = ref, allele
+ if len(longest) == len(shortest) or longest[:len(shortest)].upper() != shortest.upper():
+ movable = False
+ if longest[-1].upper() != longest[len(shortest)-1].upper():
+ movable = False
+ if not movable:
+ break
+ ref = ref[:-1]
+ alt = [allele[:-1] for allele in alt]
+ if min([len(allele) for allele in alt]) == 0 or len(ref) == 0:
+ ref = faref_leftflank[pos-left-1] + ref
+ alt = [faref_leftflank[pos-left-1] + allele for allele in alt]
+ pos -= 1
+
+__all__ = [
+ "VCF", "VCFRecord", ]
-from pysam.cutils import _pysam_dispatch
+from pysam.libcutils import _pysam_dispatch
class SamtoolsError(Exception):
# pysam versioning information
-__version__ = "0.9.1.4"
+__version__ = "0.10.0"
__samtools_version__ = "1.3.1"
-__htslib_version__ = "1.3.1"
+__bcftools_version__ = "1.3.1"
+
+__htslib_version__ = "1.3.2"
-cython>=0.22
+cython>=0.24.1
#Install miniconda python
if [ $TRAVIS_OS_NAME == "osx" ]; then
- curl -O https://repo.continuum.io/miniconda/Miniconda3-latest-MacOSX-x86_64.sh
- bash Miniconda3-latest-MacOSX-x86_64.sh -b
+ wget https://repo.continuum.io/miniconda/Miniconda3-latest-MacOSX-x86_64.sh -O Miniconda3.sh
else
- curl -O https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
- bash Miniconda3-latest-Linux-x86_64.sh -b
+ wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O Miniconda3.sh --no-check-certificate # Default OS versions are old and have SSL / CERT issues
fi
+bash Miniconda3.sh -b
+
# Create a new conda environment with the target python version
~/miniconda3/bin/conda install conda-build -y
-~/miniconda3/bin/conda create -q -y --name testenv python=$CONDA_PY cython numpy nose
+~/miniconda3/bin/conda create -q -y --name testenv python=$CONDA_PY cython numpy nose psutil pip
+
+# activate testenv environment
+source ~/miniconda3/bin/activate testenv
-# Add new conda environment to PATH
-export PATH=~/miniconda3/envs/testenv/bin/:$PATH
+conda config --add channels conda-forge
+conda config --add channels defaults
+conda config --add channels r
+conda config --add channels bioconda
-# Hack to force linking to anaconda libraries rather than system libraries
-#export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:~/miniconda3/envs/testenv/lib/
-#export DYLD_LIBRARY_PATH=$DYLD_LIBRARY_PATH:~/miniconda3/envs/testenv/lib/
+conda install -y samtools bcftools htslib
# Need to make C compiler and linker use the anaconda includes and libraries:
export PREFIX=~/miniconda3/
export CFLAGS="-I${PREFIX}/include -L${PREFIX}/lib"
export HTSLIB_CONFIGURE_OPTIONS="--disable-libcurl"
-# create a new folder to store external tools
-mkdir -p $WORKDIR/external-tools
-
-# install htslib
-cd $WORKDIR/external-tools
-curl -L https://github.com/samtools/htslib/releases/download/1.3.1/htslib-1.3.1.tar.bz2 > htslib-1.3.1.tar.bz2
-tar xjvf htslib-1.3.1.tar.bz2
-cd htslib-1.3.1
-make
-PATH=$PATH:$WORKDIR/external-tools/htslib-1.3.1
-LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$WORKDIR/external-tools/htslib-1.3.1
-
-# install samtools, compile against htslib
-cd $WORKDIR/external-tools
-curl -L http://downloads.sourceforge.net/project/samtools/samtools/1.3.1/samtools-1.3.1.tar.bz2 > samtools-1.3.1.tar.bz2
-tar xjvf samtools-1.3.1.tar.bz2
-cd samtools-1.3.1
-./configure --with-htslib=../htslib-1.3.1
-make
-PATH=$PATH:$WORKDIR/external-tools/samtools-1.3.1
-
-echo "installed samtools"
samtools --version
-
-if [ $? != 0 ]; then
- exit 1
-fi
-
-# install bcftools
-cd $WORKDIR/external-tools
-curl -L https://github.com/samtools/bcftools/releases/download/1.3.1/bcftools-1.3.1.tar.bz2 > bcftools-1.3.1.tar.bz2
-tar xjf bcftools-1.3.1.tar.bz2
-cd bcftools-1.3.1
-./configure --with-htslib=../htslib-1.3.1
-make
-PATH=$PATH:$WORKDIR/external-tools/bcftools-1.3.1
-
-echo "installed bcftools"
+tabix --version  # htslib installs no "htslib" binary; tabix reports the htslib version
bcftools --version
-if [ $? != 0 ]; then
- exit 1
-fi
-
-popd
-
# Try building conda recipe first
~/miniconda3/bin/conda-build ci/conda-recipe/ --python=$CONDA_PY
exit 1
fi
-# build source tar-ball
+# build source tar-ball. Make sure to build so that .pyx files
+# are cythonized.
cd ..
-python setup.py sdist
+python setup.py build sdist
if [ $? != 0 ]; then
exit 1
# test pip installation from tar-ball with cython
echo "pip installing with cython"
-pip install --verbose --no-deps --no-use-wheel dist/pysam-*.tar.gz
+pip install --verbose --no-deps --no-binary=:all: dist/pysam-*.tar.gz
if [ $? != 0 ]; then
exit 1
# attempt pip installation without cython
echo "pip installing without cython"
-~/miniconda3/bin/conda remove cython
+~/miniconda3/bin/conda remove -y cython
~/miniconda3/bin/conda list
-echo "pthyon is" `which python`
-pip install --verbose --no-deps --no-use-wheel --force-reinstall --upgrade dist/pysam-*.tar.gz
+echo "python is" `which python`
+pip install --verbose --no-deps --no-binary=:all: --force-reinstall --upgrade dist/pysam-*.tar.gz
if [ $? != 0 ]; then
exit 1
# command line options
echo "pip installing without cython and no configure options"
export HTSLIB_CONFIGURE_OPTIONS=""
-pip install --verbose --no-deps --no-use-wheel --force-reinstall --upgrade dist/pysam-*.tar.gz
+pip install --verbose --no-deps --no-binary=:all: --force-reinstall --upgrade dist/pysam-*.tar.gz
if [ $? != 0 ]; then
exit 1
}
view_end:
- if (is_count && ret == 0)
+ if (is_count && ret == 0)
fprintf(pysam_stdout, "%" PRId64 "\n", count);
-
+
// close files, free and return
if (in) check_sam_close("view", in, fn_in, "standard input", &ret);
if (out) check_sam_close("view", out, fn_out, "standard output", &ret);
def run_make_print_config():
- stdout = subprocess.check_output(["make", "print-config"])
+ stdout = subprocess.check_output(["make", "-s", "print-config"])
if IS_PYTHON3:
stdout = stdout.decode("ascii")
- result = dict([[x.strip() for x in line.split("=")]
- for line in stdout.splitlines()])
- return result
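+    # htslib's print-config target emits lines such as "CC = gcc";
+    # collect them into a dict, skipping anything that is not a
+    # simple KEY = VALUE pair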
+ make_print_config = {}
+ for line in stdout.splitlines():
+ if "=" in line:
+ row = line.split("=")
+ if len(row) == 2:
+ make_print_config.update(
+ {row[0].strip(): row[1].strip()})
+ return make_print_config
def configure_library(library_dir, env_options=None, options=[]):
import cython
HAVE_CYTHON = True
print ("# pysam: cython is available - using cythonize if necessary")
- source_pattern = "pysam/c%s.pyx"
- if HTSLIB_MODE != "external":
- HTSLIB_MODE = "shared"
+ source_pattern = "pysam/libc%s.pyx"
except ImportError:
HAVE_CYTHON = False
print ("# pysam: no cython available - using pre-compiled C")
# no Cython available - use existing C code
- source_pattern = "pysam/c%s.c"
- if HTSLIB_MODE != "external":
- HTSLIB_MODE = "shared"
+ source_pattern = "pysam/libc%s.c"
# collect pysam version
sys.path.insert(0, "pysam")
chtslib_sources = []
htslib_library_dirs = [HTSLIB_LIBRARY_DIR]
htslib_include_dirs = [HTSLIB_INCLUDE_DIR]
- internal_htslib_libraries = []
external_htslib_libraries = ['z', 'hts']
elif HTSLIB_MODE == 'separate':
shared_htslib_sources = htslib_sources
htslib_library_dirs = []
htslib_include_dirs = ['htslib']
- internal_htslib_libraries = []
elif HTSLIB_MODE == 'shared':
# link each pysam component against the same
htslib_library_dirs = [
'pysam',
".",
- os.path.join("build",
- distutils_dir_name("lib"),
- "pysam")]
+ os.path.join("build", distutils_dir_name("lib"), "pysam")]
htslib_include_dirs = ['htslib']
- if IS_PYTHON3:
- if sys.version_info.minor >= 5:
- internal_htslib_libraries = ["chtslib.{}".format(
- sysconfig.get_config_var('SOABI'))]
- else:
- if sys.platform == "darwin":
- # On OSX, python 3.3 and 3.4 Libs have no platform tags.
- internal_htslib_libraries = ["chtslib"]
- else:
- internal_htslib_libraries = ["chtslib.{}{}".format(
- sys.implementation.cache_tag,
- sys.abiflags)]
- else:
- internal_htslib_libraries = ["chtslib"]
-
else:
raise ValueError("unknown HTSLIB value '%s'" % HTSLIB_MODE)
+internal_htslib_libraries = [os.path.splitext("chtslib{}".format(sysconfig.get_config_var('SO')))[0]]
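+# sysconfig's 'SO' is the extension-module suffix (".so" on Python 2,
+# something like ".cpython-35m-x86_64-linux-gnu.so" on Python 3), so
+# stripping the final extension yields the name of the built chtslib
+# module on every interpreter, replacing the per-version special
+# cases removed above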
+
# build config.py
with open(os.path.join("pysam", "config.py"), "w") as outf:
outf.write('HTSLIB = "{}"\n'.format(HTSLIB_SOURCE))
# Selected ones have been copied into samfile_utils.c
# Needs to be devolved somehow.
csamfile = Extension(
- "pysam.csamfile",
+ "pysam.libcsamfile",
[source_pattern % "samfile",
"pysam/htslib_util.c",
"pysam/samfile_util.c",
# Selected ones have been copied into samfile_utils.c
# Needs to be devolved somehow.
calignmentfile = Extension(
- "pysam.calignmentfile",
+ "pysam.libcalignmentfile",
[source_pattern % "alignmentfile",
"pysam/htslib_util.c",
"pysam/samfile_util.c",
# Selected ones have been copied into samfile_utils.c
# Needs to be devolved somehow.
calignedsegment = Extension(
- "pysam.calignedsegment",
+ "pysam.libcalignedsegment",
[source_pattern % "alignedsegment",
"pysam/htslib_util.c",
"pysam/samfile_util.c",
)
ctabix = Extension(
- "pysam.ctabix",
+ "pysam.libctabix",
[source_pattern % "tabix",
"pysam/tabix_util.c"] +
htslib_sources +
)
cutils = Extension(
- "pysam.cutils",
+ "pysam.libcutils",
[source_pattern % "utils", "pysam/pysam_util.c"] +
glob.glob(os.path.join("samtools", "*.pysam.c")) +
# glob.glob(os.path.join("samtools", "*", "*.pysam.c")) +
)
cfaidx = Extension(
- "pysam.cfaidx",
+ "pysam.libcfaidx",
[source_pattern % "faidx"] +
htslib_sources +
os_c_files,
)
ctabixproxies = Extension(
- "pysam.ctabixproxies",
+ "pysam.libctabixproxies",
[source_pattern % "tabixproxies"] +
os_c_files,
library_dirs=htslib_library_dirs,
)
cvcf = Extension(
- "pysam.cvcf",
+ "pysam.libcvcf",
[source_pattern % "vcf"] +
os_c_files,
library_dirs=htslib_library_dirs,
)
cbcf = Extension(
- "pysam.cbcf",
+ "pysam.libcbcf",
[source_pattern % "bcf"] +
htslib_sources +
os_c_files,
define_macros=define_macros
)
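+# new extension, presumably wrapping htslib's BGZF blocked-compression
+# layer; built with the same sources and flags as the other lib* modules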
+cbgzf = Extension(
+ "pysam.libcbgzf",
+ [source_pattern % "bgzf"] +
+ htslib_sources +
+ os_c_files,
+ library_dirs=htslib_library_dirs,
+ include_dirs=["htslib", "."] + include_os + htslib_include_dirs,
+ libraries=external_htslib_libraries + internal_htslib_libraries,
+ language="c",
+ extra_compile_args=extra_compile_args,
+ define_macros=define_macros
+)
+
metadata = {
'name': "pysam",
'version': version,
ctabixproxies,
cvcf,
cbcf,
+ cbgzf,
cfaidx,
cutils],
'cmdclass': cmdclass,
self.assertEqual(a.query_sequence, None)
self.assertEqual(pysam.qualities_to_qualitystring(a.query_qualities), None)
self.assertEqual(a.flag, 0)
- self.assertEqual(a.reference_id, 0)
+ self.assertEqual(a.reference_id, -1)
self.assertEqual(a.mapping_quality, 0)
self.assertEqual(a.cigartuples, None)
self.assertEqual(a.tags, [])
- self.assertEqual(a.next_reference_id, 0)
- self.assertEqual(a.next_reference_start, 0)
+ self.assertEqual(a.next_reference_id, -1)
+ self.assertEqual(a.next_reference_start, -1)
self.assertEqual(a.template_length, 0)
def testStrOfEmptyRead(self):
a = pysam.AlignedSegment()
s = str(a)
self.assertEqual(
- "None\t0\t0\t0\t0\tNone\t0\t0\t0\tNone\tNone\t[]",
+ "None\t0\t-1\t-1\t0\tNone\t-1\t-1\t0\tNone\tNone\t[]",
s)
def testSettingTagInEmptyRead(self):
self.assertEqual(a.get_blocks(),
[(20, 30), (31, 40), (40, 60)])
+ def test_infer_query_length(self):
+ '''Test infer_query_length on M|=|X|I|D|H|S cigar ops'''
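+        # M, =, X, I and S consume query bases; D consumes only the
+        # reference, and (per the assertions below) hard clips (H) are
+        # counted by infer_query_length in this release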
+ a = self.buildRead()
+ a.cigarstring = '15M'
+ self.assertEqual(a.infer_query_length(), 15)
+ a.cigarstring = '15='
+ self.assertEqual(a.infer_query_length(), 15)
+ a.cigarstring = '15X'
+ self.assertEqual(a.infer_query_length(), 15)
+ a.cigarstring = '5M5I5M'
+ self.assertEqual(a.infer_query_length(), 15)
+ a.cigarstring = '5M5D5M'
+ self.assertEqual(a.infer_query_length(), 10)
+ a.cigarstring = '5H10M'
+ self.assertEqual(a.infer_query_length(), 15)
+ a.cigarstring = '5S10M'
+ self.assertEqual(a.infer_query_length(), 15)
+
def test_get_aligned_pairs_soft_clipping(self):
a = self.buildRead()
a.cigartuples = ((4, 2), (0, 35), (4, 3))
a.cigarstring = "1S20M1S"
self.assertEqual(a.query_alignment_length, 20)
+ def test_query_length_is_limited(self):
+
+ a = self.buildRead()
+ a.query_name = "A" * 1
+ a.query_name = "A" * 254
+ self.assertRaises(
+ ValueError,
+ setattr,
+ a,
+ "query_name",
+ "A" * 255)
+
class TestCigarStats(ReadTest):
for s, p in zip(reference, pysamf):
self.assertEqual(s, p.tostring(pysamf))
+
if __name__ == "__main__":
unittest.main()
get_temp_filename
-DATADIR = "pysam_data"
+DATADIR = os.path.abspath(os.path.join(os.path.dirname(__file__),
+ "pysam_data"))
##################################################
class BasicTestBAMFromFile(BasicTestBAMFromFetch):
def setUp(self):
- f = open(os.path.join(DATADIR, "ex3.bam"))
- self.samfile = pysam.AlignmentFile(
- f, "rb")
+ with open(os.path.join(DATADIR, "ex3.bam")) as f:
+ self.samfile = pysam.AlignmentFile(
+ f, "rb")
+ self.reads = [r for r in self.samfile]
+
+
+class BasicTestBAMFromFileNo(BasicTestBAMFromFetch):
+
+ def setUp(self):
+ with open(os.path.join(DATADIR, "ex3.bam")) as f:
+ self.samfile = pysam.AlignmentFile(
+ f.fileno(), "rb")
self.reads = [r for r in self.samfile]
class BasicTestSAMFromFile(BasicTestBAMFromFetch):
def setUp(self):
- f = open(os.path.join(DATADIR, "ex3.sam"))
- self.samfile = pysam.AlignmentFile(
- f, "r")
+ with open(os.path.join(DATADIR, "ex3.sam")) as f:
+ self.samfile = pysam.AlignmentFile(
+ f, "r")
+ self.reads = [r for r in self.samfile]
+
+
+class BasicTestSAMFromFileNo(BasicTestBAMFromFetch):
+
+ def setUp(self):
+ with open(os.path.join(DATADIR, "ex3.sam")) as f:
+ self.samfile = pysam.AlignmentFile(
+ f.fileno(), "r")
self.reads = [r for r in self.samfile]
class BasicTestCRAMFromFile(BasicTestCRAMFromFetch):
def setUp(self):
- f = open(os.path.join(DATADIR, "ex3.cram"))
- self.samfile = pysam.AlignmentFile(f, "rc")
+ with open(os.path.join(DATADIR, "ex3.cram")) as f:
+ self.samfile = pysam.AlignmentFile(f, "rc")
+ self.reads = [r for r in self.samfile]
+
+
+class BasicTestCRAMFromFileNo(BasicTestCRAMFromFetch):
+
+ def setUp(self):
+ with open(os.path.join(DATADIR, "ex3.cram")) as f:
+ self.samfile = pysam.AlignmentFile(
+ f.fileno(), "rc")
self.reads = [r for r in self.samfile]
samfile = pysam.AlignmentFile(f, "rb")
f.close()
self.assertTrue(f.closed)
- # access to Samfile should still work
+ # access to Samfile still works
self.checkEcho("ex1.bam",
"ex1.bam",
"tmp_ex1.bam",
mode="rb")
self.assertEqual(len(list(samfile.fetch())), 3270)
+ def testBAMWithCSIIndex(self):
+ '''see issue 116'''
+ input_filename = os.path.join(DATADIR, "ex1_csi.bam")
+ samfile = pysam.AlignmentFile(input_filename,
+ "rb",
+ check_sq=False)
+ samfile.fetch('chr2')
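+        # ex1_csi.bam ships a .csi index ("samtools index -c", see the
+        # pysam_data Makefile), so the fetch must succeed without a .bai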
+
+
class TestAutoDetect(unittest.TestCase):
"""testing header manipulation"""
- header = {'SQ': [{'LN': 1575, 'SN': 'chr1'},
- {'LN': 1584, 'SN': 'chr2'}],
+ header = {'SQ': [{'LN': 1575, 'SN': 'chr1', 'AH': 'chr1:5000000-5010000'},
+ {'LN': 1584, 'SN': 'chr2', 'AH': '*'}],
'RG': [{'LB': 'SC_1', 'ID': 'L1', 'SM': 'NA12891',
'PU': 'SC_1_10', "CN": "name:with:colon"},
{'LB': 'SC_2', 'ID': 'L2', 'SM': 'NA12891',
last[r.alignment.query_name] = r.query_position
+class TestFindIntrons(unittest.TestCase):
+ samfilename = "pysam_data/ex_spliced.bam"
+
+ def setUp(self):
+ self.samfile = pysam.AlignmentFile(self.samfilename)
+
+ def tearDown(self):
+ self.samfile.close()
+
+ def test_total(self):
+ all_read_counts = self.samfile.count()
+ splice_sites = self.samfile.find_introns(self.samfile.fetch())
+        self.assertEqual(sum(splice_sites.values()), all_read_counts - 1)  # there is a single unspliced read in there
+
+ def test_first(self):
+ reads = list(self.samfile.fetch())[:10]
+ splice_sites = self.samfile.find_introns(reads)
+ starts = [14792+38 - 1]
+ stops = [14792+38 + 140 - 1]
+ self.assertEqual(len(splice_sites), 1)
+ self.assertTrue((starts[0], stops[0]) in splice_sites)
+        self.assertEqual(splice_sites[(starts[0], stops[0])], 9)  # the first read is unspliced, so only 9 of these 10 support the junction
+
+ def test_all(self):
+ reads = list(self.samfile.fetch())
+ splice_sites = self.samfile.find_introns(reads)
+ should = collections.Counter({
+ (14829, 14969): 33,
+ (15038, 15795): 24,
+ (15947, 16606): 3,
+ (16765, 16857): 9,
+ (16765, 16875): 1,
+ (17055, 17232): 19,
+ (17055, 17605): 3,
+ (17055, 17914): 1,
+ (17368, 17605): 7,
+ })
+ self.assertEqual(should, splice_sites)
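+        # find_introns returns a collections.Counter keyed by 0-based
+        # (intron_start, intron_stop) tuples derived from 'N' cigar
+        # operations, mapping each junction to its read support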
+
+
class TestLogging(unittest.TestCase):
'''test around bug issue 42,
inf.mapped)
-
class TestSamtoolsProxy(unittest.TestCase):
'''tests for sanity checking access to samtools functions.'''
self.assertEqual(pysam.get_verbosity(), 3)
+class TestSanityCheckingBAM(unittest.TestCase):
+
+ mode = "wb"
+
+ def check_write(self, read):
+
+ fn = "tmp_test_sanity_check.bam"
+ names = ["chr1"]
+ lengths = [10000]
+ with pysam.AlignmentFile(
+ fn,
+ self.mode,
+ reference_names=names,
+ reference_lengths=lengths) as outf:
+ outf.write(read)
+
+ if os.path.exists(fn):
+ os.unlink(fn)
+
+    def test_empty_read_can_be_written(self):
+ read = pysam.AlignedSegment()
+ self.check_write(read)
+
+# SAM writing fails, as query length is 0
+# class TestSanityCheckingSAM(TestSanityCheckingBAM):
+# mode = "w"
+
+
if __name__ == "__main__":
# build data files
print ("building data files")
class TestHeaderSam(unittest.TestCase):
- header = {'SQ': [{'LN': 1575, 'SN': 'chr1'},
- {'LN': 1584, 'SN': 'chr2'}],
+ header = {'SQ': [{'LN': 1575, 'SN': 'chr1', 'AH': 'chr1:5000000-5010000'},
+ {'LN': 1584, 'SN': 'chr2', 'AH': '*'}],
'RG': [{'LB': 'SC_1', 'ID': 'L1', 'SM': 'NA12891', 'PU': 'SC_1_10', "CN": "name:with:colon"},
{'LB': 'SC_2', 'ID': 'L2', 'SM': 'NA12891', 'PU': 'SC_2_12', "CN": "name:with:colon"}],
'PG': [{'ID': 'P1', 'VN': '1.0'}, {'ID': 'P2', 'VN': '1.1'}],
self.assertEqual(a.seq, None)
self.assertEqual(a.qual, None)
self.assertEqual(a.flag, 0)
- self.assertEqual(a.rname, 0)
+ self.assertEqual(a.rname, -1)
self.assertEqual(a.mapq, 0)
self.assertEqual(a.cigar, [])
self.assertEqual(a.tags, [])
- self.assertEqual(a.mrnm, 0)
- self.assertEqual(a.mpos, 0)
+ self.assertEqual(a.mrnm, -1)
+ self.assertEqual(a.mpos, -1)
self.assertEqual(a.isize, 0)
def testStrOfEmptyRead(self):
a = pysam.AlignedRead()
s = str(a)
self.assertEqual(
- "None\t0\t0\t0\t0\tNone\t0\t0\t0\tNone\tNone\t[]",
+ "None\t0\t-1\t-1\t0\tNone\t-1\t-1\t0\tNone\tNone\t[]",
s)
def buildRead(self):
--- /dev/null
+import os
+import subprocess
+import threading
+import errno
+import unittest
+
+from pysam import AlignmentFile
+
+DATADIR = os.path.abspath(os.path.join(
+ os.path.dirname(__file__),
+ "pysam_data"))
+
+
+def alignmentfile_writer_thread(infile, outfile):
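+    # pump records from infile into outfile on a daemon thread, so the
+    # main thread can consume the downstream end of the pipe without
+    # deadlocking on a full pipe buffer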
+ def _writer_thread(infile, outfile):
+ """read from infile and write to outfile"""
+ try:
+ i = 0
+ for record in infile:
+ outfile.write(record)
+ i += 1
+        except IOError as e:
+            # the reader may exit early and close the pipe; ignore
+            # EPIPE, re-raise anything else
+            if e.errno != errno.EPIPE:
+                raise
+ finally:
+ outfile.close()
+
+ writer = threading.Thread(target=_writer_thread, args=(infile, outfile))
+ writer.daemon = True
+ writer.start()
+ return writer
+
+
+class StreamTest(unittest.TestCase):
+
+ def stream_process(self, proc, in_stream, out_stream, writer):
+
+ with AlignmentFile(proc.stdout) as infile:
+ read = 0
+ for record in infile:
+ read += 1
+ return 0, read
+
+ def test_text_processing(self):
+
+ proc = subprocess.Popen('head -n200',
+ stdin=subprocess.PIPE,
+ stdout=subprocess.PIPE,
+ shell=True)
+
+ in_stream = AlignmentFile('pysam_data/ex1.bam')
+ out_stream = AlignmentFile(proc.stdin, 'wh', header=in_stream.header)
+ writer = alignmentfile_writer_thread(in_stream,
+ out_stream)
+
+ written, read = self.stream_process(proc,
+ in_stream,
+ out_stream,
+ writer)
+ self.assertEqual(read, 198)
+
+ def test_samtools_processing(self):
+
+ proc = subprocess.Popen('samtools view -b -f 4',
+ stdin=subprocess.PIPE,
+ stdout=subprocess.PIPE,
+ shell=True)
+
+ in_stream = AlignmentFile('pysam_data/ex1.bam')
+ out_stream = AlignmentFile(proc.stdin, 'wb', header=in_stream.header)
+ writer = alignmentfile_writer_thread(in_stream,
+ out_stream)
+
+ written, read = self.stream_process(proc,
+ in_stream,
+ out_stream,
+ writer)
+ self.assertEqual(read, 35)
+
+
+if __name__ == "__main__":
+ unittest.main()
import os
+import sys
import unittest
import pysam
import gzip
import subprocess
+
+try:
+ from pathlib import Path
+except ImportError:
+ Path = None
+
from TestUtils import get_temp_filename, check_lines_equal
DATADIR="cbcf_data"
os.unlink("tmp_testEmptyFile.vcf")
+
+ if Path and sys.version_info >= (3,6):
+ def testEmptyFileVCFFromPath(self):
+ with open("tmp_testEmptyFile.vcf", "w"):
+ pass
+
+ self.assertRaises(ValueError, pysam.VariantFile,
+ Path("tmp_testEmptyFile.vcf"))
+
+ os.unlink("tmp_testEmptyFile.vcf")
+
def testEmptyFileVCFGZWithIndex(self):
with open("tmp_testEmptyFile.vcf", "w"):
pass
# remove last header line starting with #CHROM
ref.pop()
ref = sorted(ref)
- comp = sorted([str(x) for x in v.header.records])
+ comp = sorted(str(x) for x in v.header.records)
self.assertEqual(len(ref), len(comp))
for x, y in zip(ref, comp):
- self.assertEqual(x[:-1], str(y))
+ self.assertEqual(x, y)
# These tests need to be separate and start from newly opened files. This
chrom = [rec.chrom for rec in v]
self.assertEqual(chrom, ['M', '17', '20', '20', '20'])
+ if Path and sys.version_info >= (3,6):
+ def testChromFromPath(self):
+ fn = os.path.join(DATADIR, self.filename)
+ v = pysam.VariantFile(Path(fn))
+ chrom = [rec.chrom for rec in v]
+ self.assertEqual(chrom, ['M', '17', '20', '20', '20'])
+
def testPos(self):
fn = os.path.join(DATADIR, self.filename)
v = pysam.VariantFile(fn)
"""construct VariantFile from scratch."""
filename = "example_vcf42_withcontigs.vcf"
+ compression = 'NONE'
+ description = 'VCF version 4.2 variant calling text'
- def complete_check(self, fn_in, fn_out):
+ def testBase(self):
+ with pysam.VariantFile(os.path.join(DATADIR, self.filename)) as inf:
+ self.assertEqual(inf.category, 'VARIANTS')
+ self.assertEqual(inf.format, 'VCF')
+ self.assertEqual(inf.version, (4, 2))
+ self.assertEqual(inf.compression, self.compression)
+ self.assertEqual(inf.description, self.description)
+ self.assertTrue(inf.is_open)
+ self.assertEqual(inf.is_read, True)
+ self.assertEqual(inf.is_write, False)
+ def complete_check(self, fn_in, fn_out):
+ self.maxDiff = None
check_lines_equal(
self, fn_in, fn_out, sort=True,
filter_f=lambda x: x.startswith("##contig"))
for record in vcf_in.header.records:
header.add_record(record)
- fn = str("tmp_VariantFileTest_testConstructionWithRecords") + ".vcf"
- vcf_out = pysam.VariantFile(fn, "w", header=header)
+ for sample in vcf_in.header.samples:
+ header.add_sample(sample)
+
+ vcf_out = pysam.VariantFile(fn_out, "w", header=header)
for record in vcf_in:
- # currently segfaults here:
- # vcf_out.write(record)
- pass
- return
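+        # translate() re-maps the record onto the newly constructed
+        # header so that it can be written through vcf_out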
+ record.translate(header)
+ vcf_out.write(record)
+ vcf_in.close()
vcf_out.close()
self.complete_check(fn_in, fn_out)
for record in vcf_in:
vcf_out.write(record)
+ vcf_in.close()
vcf_out.close()
self.complete_check(fn_in, fn_out)
self.complete_check(fn_in, fn_out)
-# Currently segfaults for VCFs without contigs
-# class TestConstructionVCFWithoutContigs(TestConstructionVCFWithContigs):
+
+#class TestConstructionVCFWithoutContigs(TestConstructionVCFWithContigs):
# """construct VariantFile from scratch."""
# filename = "example_vcf40.vcf"
"""construct VariantFile from scratch."""
filename = "example_vcf42_withcontigs.vcf.gz"
+ compression = 'BGZF'
+ description = 'VCF version 4.2 BGZF-compressed variant calling data'
class TestConstructionVCFGZWithoutContigs(TestConstructionVCFWithContigs):
"""construct VariantFile from scratch."""
filename = "example_vcf42.vcf.gz"
+ compression = 'BGZF'
+ description = 'VCF version 4.2 BGZF-compressed variant calling data'
class TestSettingRecordValues(unittest.TestCase):
filename = "example_vcf40.vcf"
+ def testBase(self):
+ with pysam.VariantFile(os.path.join(DATADIR, self.filename)) as inf:
+ self.assertEqual(inf.category, 'VARIANTS')
+ self.assertEqual(inf.format, 'VCF')
+ self.assertEqual(inf.version, (4, 0))
+ self.assertEqual(inf.compression, 'NONE')
+ self.assertEqual(inf.description, 'VCF version 4.0 variant calling text')
+ self.assertTrue(inf.is_open)
+ self.assertEqual(inf.is_read, True)
+ self.assertEqual(inf.is_write, False)
+
def testSetQual(self):
with pysam.VariantFile(os.path.join(DATADIR, self.filename)) as inf:
record = next(inf)
sample = record.samples["NA00001"]
print (sample["GT"])
self.assertEqual(sample["GT"], (0, 0))
-# Fails with TypeError
-# sample["GT"] = sample["GT"]
+ sample["GT"] = sample["GT"]
class TestSubsetting(unittest.TestCase):
-from pysam.calignmentfile cimport AlignmentFile, AlignedSegment
-from pysam.ctabix cimport Tabixfile
+from pysam.libcalignmentfile cimport AlignmentFile, AlignedSegment
+from pysam.libctabix cimport Tabixfile
cdef AlignmentFile samfile
cdef Tabixfile tabixfile
-from pysam.calignmentfile cimport AlignmentFile, AlignedSegment
-from pysam.calignmentfile cimport pysam_get_flag
-from pysam.calignmentfile cimport BAM_FPROPER_PAIR, BAM_FPAIRED
+from pysam.libcalignmentfile cimport AlignmentFile, AlignedSegment
+from pysam.libcalignmentfile cimport BAM_FPROPER_PAIR, BAM_FPAIRED
+from pysam.libcalignedsegment cimport pysam_get_flag
def count(AlignmentFile samfile):
cdef int is_proper = 0
+++ /dev/null
-import pysam
-
-import pyximport
-pyximport.install()
-import _cython_flagstat
-
-is_paired, is_proper = _cython_flagstat.count(
- pysam.AlignmentFile("ex1.bam", "rb"))
-
-print ("there are alignments of %i paired reads" % is_paired)
-print ("there are %i proper paired alignments" % is_proper)
empty.bam empty.bam.bai \
explicit_index.bam explicit_index.cram \
faidx_empty_seq.fq.gz \
- ex1.fa.gz ex1.fa.gz.fai
+ ex1.fa.gz ex1.fa.gz.fai \
+ ex1_csi.bam
# ex2.sam - as ex1.sam, but with header
ex2.sam.gz: ex1.bam ex1.bam.bai
ex1.fa.fai:ex1.fa
samtools faidx ex1.fa
+
ex1.bam:ex1.sam.gz ex1.fa.fai
samtools import ex1.fa.fai ex1.sam.gz ex1.bam
ex2_truncated.bam: ex2.bam
head -c 124000 ex2.bam > ex2_truncated.bam
+ex1_csi.bam: ex1.bam
+ cp ex1.bam ex1_csi.bam
+ samtools index -c ex1_csi.bam
+
empty.bam: ex2.sam
grep "^@" $< | samtools view -Sb - > $@
@HD VN:1.0
-@SQ SN:chr1 LN:1575
-@SQ SN:chr2 LN:1584
+@SQ SN:chr1 LN:1575 AH:chr1:5000000-5010000
+@SQ SN:chr2 LN:1584 AH:*
@RG ID:L1 PU:SC_1_10 LB:SC_1 SM:NA12891 CN:name:with:colon
@RG ID:L2 PU:SC_2_12 LB:SC_2 SM:NA12891 CN:name:with:colon
@PG ID:P1 VN:1.0
--- /dev/null
+@HD VN:1.4 SO:coordinate
+@SQ SN:1 LN:248956422
+@SQ SN:2 LN:242193529
+@SQ SN:3 LN:198295559
+@SQ SN:4 LN:190214555
+@SQ SN:5 LN:181538259
+@SQ SN:6 LN:170805979
+@SQ SN:7 LN:159345973
+@SQ SN:8 LN:145138636
+@SQ SN:9 LN:138394717
+@SQ SN:10 LN:133797422
+@SQ SN:11 LN:135086622
+@SQ SN:12 LN:133275309
+@SQ SN:13 LN:114364328
+@SQ SN:14 LN:107043718
+@SQ SN:15 LN:101991189
+@SQ SN:16 LN:90338345
+@SQ SN:17 LN:83257441
+@SQ SN:18 LN:80373285
+@SQ SN:19 LN:58617616
+@SQ SN:20 LN:64444167
+@SQ SN:21 LN:46709983
+@SQ SN:22 LN:50818468
+@SQ SN:X LN:156040895
+@SQ SN:Y LN:57227415
+@SQ SN:MT LN:16569
+@SQ SN:GL000008.2 LN:209709
+@SQ SN:GL000009.2 LN:201709
+@SQ SN:GL000194.1 LN:191469
+@SQ SN:GL000195.1 LN:182896
+@SQ SN:GL000205.2 LN:185591
+@SQ SN:GL000208.1 LN:92689
+@SQ SN:GL000213.1 LN:164239
+@SQ SN:GL000214.1 LN:137718
+@SQ SN:GL000216.2 LN:176608
+@SQ SN:GL000218.1 LN:161147
+@SQ SN:GL000219.1 LN:179198
+@SQ SN:GL000220.1 LN:161802
+@SQ SN:GL000221.1 LN:155397
+@SQ SN:GL000224.1 LN:179693
+@SQ SN:GL000225.1 LN:211173
+@SQ SN:GL000226.1 LN:15008
+@SQ SN:KI270302.1 LN:2274
+@SQ SN:KI270303.1 LN:1942
+@SQ SN:KI270304.1 LN:2165
+@SQ SN:KI270305.1 LN:1472
+@SQ SN:KI270310.1 LN:1201
+@SQ SN:KI270311.1 LN:12399
+@SQ SN:KI270312.1 LN:998
+@SQ SN:KI270315.1 LN:2276
+@SQ SN:KI270316.1 LN:1444
+@SQ SN:KI270317.1 LN:37690
+@SQ SN:KI270320.1 LN:4416
+@SQ SN:KI270322.1 LN:21476
+@SQ SN:KI270329.1 LN:1040
+@SQ SN:KI270330.1 LN:1652
+@SQ SN:KI270333.1 LN:2699
+@SQ SN:KI270334.1 LN:1368
+@SQ SN:KI270335.1 LN:1048
+@SQ SN:KI270336.1 LN:1026
+@SQ SN:KI270337.1 LN:1121
+@SQ SN:KI270338.1 LN:1428
+@SQ SN:KI270340.1 LN:1428
+@SQ SN:KI270362.1 LN:3530
+@SQ SN:KI270363.1 LN:1803
+@SQ SN:KI270364.1 LN:2855
+@SQ SN:KI270366.1 LN:8320
+@SQ SN:KI270371.1 LN:2805
+@SQ SN:KI270372.1 LN:1650
+@SQ SN:KI270373.1 LN:1451
+@SQ SN:KI270374.1 LN:2656
+@SQ SN:KI270375.1 LN:2378
+@SQ SN:KI270376.1 LN:1136
+@SQ SN:KI270378.1 LN:1048
+@SQ SN:KI270379.1 LN:1045
+@SQ SN:KI270381.1 LN:1930
+@SQ SN:KI270382.1 LN:4215
+@SQ SN:KI270383.1 LN:1750
+@SQ SN:KI270384.1 LN:1658
+@SQ SN:KI270385.1 LN:990
+@SQ SN:KI270386.1 LN:1788
+@SQ SN:KI270387.1 LN:1537
+@SQ SN:KI270388.1 LN:1216
+@SQ SN:KI270389.1 LN:1298
+@SQ SN:KI270390.1 LN:2387
+@SQ SN:KI270391.1 LN:1484
+@SQ SN:KI270392.1 LN:971
+@SQ SN:KI270393.1 LN:1308
+@SQ SN:KI270394.1 LN:970
+@SQ SN:KI270395.1 LN:1143
+@SQ SN:KI270396.1 LN:1880
+@SQ SN:KI270411.1 LN:2646
+@SQ SN:KI270412.1 LN:1179
+@SQ SN:KI270414.1 LN:2489
+@SQ SN:KI270417.1 LN:2043
+@SQ SN:KI270418.1 LN:2145
+@SQ SN:KI270419.1 LN:1029
+@SQ SN:KI270420.1 LN:2321
+@SQ SN:KI270422.1 LN:1445
+@SQ SN:KI270423.1 LN:981
+@SQ SN:KI270424.1 LN:2140
+@SQ SN:KI270425.1 LN:1884
+@SQ SN:KI270429.1 LN:1361
+@SQ SN:KI270435.1 LN:92983
+@SQ SN:KI270438.1 LN:112505
+@SQ SN:KI270442.1 LN:392061
+@SQ SN:KI270448.1 LN:7992
+@SQ SN:KI270465.1 LN:1774
+@SQ SN:KI270466.1 LN:1233
+@SQ SN:KI270467.1 LN:3920
+@SQ SN:KI270468.1 LN:4055
+@SQ SN:KI270507.1 LN:5353
+@SQ SN:KI270508.1 LN:1951
+@SQ SN:KI270509.1 LN:2318
+@SQ SN:KI270510.1 LN:2415
+@SQ SN:KI270511.1 LN:8127
+@SQ SN:KI270512.1 LN:22689
+@SQ SN:KI270515.1 LN:6361
+@SQ SN:KI270516.1 LN:1300
+@SQ SN:KI270517.1 LN:3253
+@SQ SN:KI270518.1 LN:2186
+@SQ SN:KI270519.1 LN:138126
+@SQ SN:KI270521.1 LN:7642
+@SQ SN:KI270522.1 LN:5674
+@SQ SN:KI270528.1 LN:2983
+@SQ SN:KI270529.1 LN:1899
+@SQ SN:KI270530.1 LN:2168
+@SQ SN:KI270538.1 LN:91309
+@SQ SN:KI270539.1 LN:993
+@SQ SN:KI270544.1 LN:1202
+@SQ SN:KI270548.1 LN:1599
+@SQ SN:KI270579.1 LN:31033
+@SQ SN:KI270580.1 LN:1553
+@SQ SN:KI270581.1 LN:7046
+@SQ SN:KI270582.1 LN:6504
+@SQ SN:KI270583.1 LN:1400
+@SQ SN:KI270584.1 LN:4513
+@SQ SN:KI270587.1 LN:2969
+@SQ SN:KI270588.1 LN:6158
+@SQ SN:KI270589.1 LN:44474
+@SQ SN:KI270590.1 LN:4685
+@SQ SN:KI270591.1 LN:5796
+@SQ SN:KI270593.1 LN:3041
+@SQ SN:KI270706.1 LN:175055
+@SQ SN:KI270707.1 LN:32032
+@SQ SN:KI270708.1 LN:127682
+@SQ SN:KI270709.1 LN:66860
+@SQ SN:KI270710.1 LN:40176
+@SQ SN:KI270711.1 LN:42210
+@SQ SN:KI270712.1 LN:176043
+@SQ SN:KI270713.1 LN:40745
+@SQ SN:KI270714.1 LN:41717
+@SQ SN:KI270715.1 LN:161471
+@SQ SN:KI270716.1 LN:153799
+@SQ SN:KI270717.1 LN:40062
+@SQ SN:KI270718.1 LN:38054
+@SQ SN:KI270719.1 LN:176845
+@SQ SN:KI270720.1 LN:39050
+@SQ SN:KI270721.1 LN:100316
+@SQ SN:KI270722.1 LN:194050
+@SQ SN:KI270723.1 LN:38115
+@SQ SN:KI270724.1 LN:39555
+@SQ SN:KI270725.1 LN:172810
+@SQ SN:KI270726.1 LN:43739
+@SQ SN:KI270727.1 LN:448248
+@SQ SN:KI270728.1 LN:1872759
+@SQ SN:KI270729.1 LN:280839
+@SQ SN:KI270730.1 LN:112551
+@SQ SN:KI270731.1 LN:150754
+@SQ SN:KI270732.1 LN:41543
+@SQ SN:KI270733.1 LN:179772
+@SQ SN:KI270734.1 LN:165050
+@SQ SN:KI270735.1 LN:42811
+@SQ SN:KI270736.1 LN:181920
+@SQ SN:KI270737.1 LN:103838
+@SQ SN:KI270738.1 LN:99375
+@SQ SN:KI270739.1 LN:73985
+@SQ SN:KI270740.1 LN:37240
+@SQ SN:KI270741.1 LN:157432
+@SQ SN:KI270742.1 LN:186739
+@SQ SN:KI270743.1 LN:210658
+@SQ SN:KI270744.1 LN:168472
+@SQ SN:KI270745.1 LN:41891
+@SQ SN:KI270746.1 LN:66486
+@SQ SN:KI270747.1 LN:198735
+@SQ SN:KI270748.1 LN:93321
+@SQ SN:KI270749.1 LN:158759
+@SQ SN:KI270750.1 LN:148850
+@SQ SN:KI270751.1 LN:150742
+@SQ SN:KI270752.1 LN:27745
+@SQ SN:KI270753.1 LN:62944
+@SQ SN:KI270754.1 LN:40191
+@SQ SN:KI270755.1 LN:36723
+@SQ SN:KI270756.1 LN:79590
+@SQ SN:KI270757.1 LN:71251
+@PG ID:STAR PN:STAR VN:STAR_2.4.1a
+HWI-C00113:131:HMHYWADXX:1:2202:17748:47494 272 1 14792 0 51M * 0 0 GGGCCTCTCACCAGCCCCAGGTCCTTTCCCAGAGATGCCCTTGCGCCTCAT CCCFFFFFHHHHHFHIIJJIJAFHJJJJJGJIIHGIJGGIJJIIJIIJJJG NH:i:6 HI:i:3 AS:i:47 nM:i:1
+HWI-C00113:131:HMHYWADXX:1:2202:17748:47494 272 1 14792 0 38M140N13M * 0 0 GGGCCTCTCACCAGCCCCAGGTCCTTTCCCAGAGATGCCCTTGCGCCTCAT CCCFFFFFHHHHHFHIIJJIJAFHJJJJJGJIIHGIJGGIJJIIJIIJJJG NH:i:6 HI:i:3 AS:i:47 nM:i:1
+HWI-C00113:131:HMHYWADXX:2:1214:7658:35836 272 1 14792 0 38M140N13M * 0 0 GGGCCCCTCACCAGCCCCAGGTCTTTTCCCAGAGATGCCCTTGCGCCTCAT CCCFFFFFHHHHHJJJJJJJJCGHIJJIJJJJJJIJJGIJJIJIJIJJJJI NH:i:6 HI:i:3 AS:i:47 nM:i:1
+HWI-C00113:131:HMHYWADXX:1:2114:4116:44566 272 1 14794 0 36M140N15M * 0 0 GCCCCTCACCAGCCCCAGGTCTTTTCCCAGAGATGCCCTTGCGCCTCATGA <@@DDDDDDFHCFHEFGBE+2AFH@GIEGF=GGHII9F<GHHIIA@6=48; NH:i:6 HI:i:3 AS:i:47 nM:i:1
+HWI-C00113:131:HMHYWADXX:1:1114:13704:81420 272 1 14795 0 35M140N16M * 0 0 CCCCTCACCAGCCCCAGGTCCTTTCCCAGAGATGCCCTTGCGCCTCATGAC @@@DDDDFHBFHHGIGIE3CFHIIIIII<E@FHIGIIC?BFDHDHGIIIII NH:i:6 HI:i:3 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:2115:10483:10806 272 1 14795 0 35M140N16M * 0 0 CCCCTCACCAGCCCCAGGTCCTTTCCCAGAGATGCCCTTGCGCCTCATGAC CCCFFFFFHHHHHJJJIIHHJIJJJJJJHHHIHGIIJJJHJIJJIJJJJJJ NH:i:6 HI:i:3 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:2214:18560:59872 272 1 14795 0 35M140N16M * 0 0 CCCCTCACCAGCCCCAGGTCCTTTCCCAGAGATGCCCTTGCGCCTCATGAC =;?BA@DAFFFFF?EAF;A?9CH?CB9E9?D9FGEGGCGGEHIDBE@FFH; NH:i:6 HI:i:3 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:1115:2028:49488 272 1 14795 0 35M140N16M * 0 0 CCCCTCACCAGCCCCAGGTCCTTTCCCAGAGATGCCCTTGCGCCTCATGAC ???+4@B?FHHFHFGIBEF9BDHCB??CEHGG*1C<FEHAF?(?(@@=B8@ NH:i:6 HI:i:3 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:1115:2949:63319 272 1 14795 0 1S35M140N15M * 0 0 ACCCCTCACCAGCCCCAGGTCTTTTCCCAGAGATGCCCTTGCGCCTCATGA @@CDFDFFHHHDHGIIIEEFGIJJJGIJIIGCGIJJJJJJIGIJIJJJHGA NH:i:6 HI:i:3 AS:i:46 nM:i:1
+HWI-C00113:131:HMHYWADXX:2:1209:18680:84812 272 1 14795 0 35M140N16M * 0 0 CCCCTCACCAGCCCCAGGTCCTTTCCCAGAGATGCCCTTGCGCCTCATGAC CC@FFFFFHHFHHJJJHHEHIIJJJJJIIEGFIJJJIIIIJJIJJJJJIII NH:i:6 HI:i:3 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:2205:10731:40239 272 1 14795 0 35M140N16M * 0 0 CCCCTCACCAGCCCCAGGTCCTTTCCCAGAGATGCCCTTGCGCCTCATGAC @@<=A@DDFB:?FG<F;:3CCBEEBFGIIA?GIAD>B?BFF<BDF<8B8FF NH:i:6 HI:i:3 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:2207:18860:77945 272 1 14795 0 35M140N16M * 0 0 CCCCTCACCAGCCCCAGGTCCTTTCCCAGAGATGCCCTTGCGCCTCATGAC C@CFFFFFHHHHHJJJIJGHIIJJJJJJJJJJJJIJJJJJIJJIJJJJIJJ NH:i:6 HI:i:3 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:1102:4544:68832 272 1 14796 0 34M140N17M * 0 0 CCCTCACCAGCCCCAGGTCCTTTCCCAGAGATGCCCTTGCGCCTCATGACC ???DDDDDBDDD:CE:C2<CFBDDECCEDC>DEE??BD?D@DADD<CC=8B NH:i:6 HI:i:3 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:1203:5829:91963 272 1 14796 0 34M140N17M * 0 0 CCCTCACCAGCCCCAGGTCCTTTCCCAGAGATGCCCTTGCGCCTCATGACC CCCFFFFFHHHHHJJIJIHJIIJGIIJIJJJJJIJJIJGGIJJJGIJJHJI NH:i:6 HI:i:3 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:2110:3018:17806 272 1 14796 0 34M140N17M * 0 0 CCCTCACCAGCCCCAGGTCCTTTCCCAGAGATGCCCTTGCGCCTCATGACC CCCFFFFFHHHFHIJJIFGIJIHIJIIDHIIJGGIIJIJJJJJJIJJIIIJ NH:i:6 HI:i:3 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:1111:17873:10434 272 1 14797 0 33M140N18M * 0 0 CCTCACCAGCCCCAGGTCCTTTCCCAGAGATGCCCTTGCGCCTCATGACCA ???DDDADDDDD8:2<2<FFFEIICEI;E@>DDBDDD@<?0@@D9=<.BBB NH:i:6 HI:i:3 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:1109:10709:93463 272 1 14801 0 29M140N22M * 0 0 ACCAGCCCCAGGTCTTTTCCCAGAGATGCCCTTGCGCCTCATGACCAGCTT CC@DFFFFHHGH<CGDFHIIGIDAEGDGBBGFH@GEH:FHGGHIEFFDHII NH:i:7 HI:i:4 AS:i:47 nM:i:1
+HWI-C00113:131:HMHYWADXX:1:2214:12148:62454 272 1 14801 0 29M140N22M * 0 0 ACCAGCCCCAGGTCTTTTCCCAGAGATGCCCTTGCGCCTCATGACCAGCTT ?@?DDDDD:FF=CFGGIGGBFGA;3<1:EEG>FGHFHFHIHGI@?DGC@CF NH:i:7 HI:i:4 AS:i:47 nM:i:1
+HWI-C00113:131:HMHYWADXX:1:1105:1515:82248 272 1 14802 0 28M140N23M * 0 0 CCAGCCCCAGGTCCTTTCCCAGAGATGCCCTTGCGCCTCATGACCAGCTTG ??:ADBDDDDD:A<+C?AFDB@E?F4<*?:?1:??):??0009??9?(8BC NH:i:7 HI:i:4 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:1110:16355:5537 272 1 14802 0 28M140N23M * 0 0 CCAGCCCCAGGTCCTTTCCCAGAGATGCCCTTGCGCCTCATGACCAGCTTG @CCFFFFFHH?ADHGIJIJJJJJIIEHIJJJJJIJIGIIJJIJJIIJIJJJ NH:i:7 HI:i:4 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:1102:17802:20689 272 1 14805 0 25M140N26M * 0 0 GCTCCAGGTCCTTTCCCAGAGATGCCCTTGCGCCTCATGACCAGCTTGTTG CCCFFFFFHHHHHJJJJJJIJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJI NH:i:7 HI:i:4 AS:i:47 nM:i:1
+HWI-C00113:131:HMHYWADXX:2:1104:7670:95815 272 1 14805 0 25M140N26M * 0 0 GCCCCAGGTCCTTTCCCAGAGATGCCCTTGCGCCTCATGACCAGCTTGTTG @@@DBDDDHHBFDBFGEBBGHG@HIBHIDHBGGGEFBDDDFDGBBBGCHHI NH:i:7 HI:i:4 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:1110:11368:101298 272 1 14805 0 25M140N26M * 0 0 GCCCCAGGTCCTTTCCCAGAGATGCCCTTGCGCCTCATGACCAGCTTGTTG BCCFFFFFCFHHHJJJJJIJJJJJJJJJJJJJGJJJJJJJJJJJJJJJIJJ NH:i:7 HI:i:4 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:1115:2363:85646 272 1 14805 0 25M140N26M * 0 0 GCTCCAGGTCCTTTCCCAGAGATGCCCTTGCGCCTCATGACCAGCTTGTTG @C@FFFB?CFDFHDHGIIIIEGIIIIEDGIIIIIIIIIGGIIGIIGCGHIH NH:i:7 HI:i:4 AS:i:47 nM:i:1
+HWI-C00113:131:HMHYWADXX:2:2213:6044:80821 272 1 14805 0 25M140N26M * 0 0 GCTCCGGGTCCTTTCCCAGAGATGCCCTTGCGCCTCATGACCAGCTTGTTG @@@FFFFFFFBFDGIIJIJGGFHIIIJIIJGIIEHI<FEGIIFEGIHIHGE NH:i:7 HI:i:4 AS:i:45 nM:i:2
+HWI-C00113:131:HMHYWADXX:1:1105:6336:76198 272 1 14807 0 23M140N28M * 0 0 CCCAGGTCCTTTCCCAGAGATGCCCTTGCGCCTCATGACCAGCTTGTTGAA CCCFFFFFHHGHHGGHIIJEHIJIEGIJIJGIIIJICBFFIGAHFHHHJBH NH:i:7 HI:i:4 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:1108:3508:3794 272 1 14807 0 1S23M140N27M * 0 0 GCCCAGGTCCTTTCCCAGAGATGCCCTTGCGCCTCATGACCAGCTTGTTGA CCCFFFFDHHHHHJJJJJJJJJJJJJJJJJJJJJJJJJIJJJJJJJJJIJJ NH:i:7 HI:i:4 AS:i:48 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:2209:11671:4960 272 1 14811 0 19M140N32M * 0 0 GGTCCTTTCCCAGAGATGCCCTTGCGCCTCATGACCAGCTTGTTGAAGAGA BCBFFFFFHHHHHJJGIIGIJJJJJJJJJJJJIIJIJJJJJJHIJIIJIJI NH:i:7 HI:i:4 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:2105:5117:87572 272 1 14812 0 18M140N33M * 0 0 GTCCTTTCCCAGAGATGCCCTTGCGCCTCATGACCAGCTTGTTGAAGAGAT C@CFFFFFFFFHGFEFHIJJJJGIBHIJJJGGCHIEEGIJJFDGGGIGIGI NH:i:7 HI:i:4 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:2202:9099:82513 272 1 14812 0 18M140N33M * 0 0 GTCCTTTCCCAGAGATGCCCTTGCGCCTCATGACCAGCTTGTTGAAGAGAT 8?;DDDDDDFB?A3:<EE<<CGA+<F:F1?D*:*1:))???99??<FB??B NH:i:7 HI:i:4 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:2214:5571:19703 272 1 14812 0 18M140N33M * 0 0 GTCCTTTCCCAGAGATGCCCTTGCGCCTCATGACCAGCTTGTTGAAGAGAT =:?7ADFDHHHDHHGIEF?CFFCBFEG@G>CHIGEGFG?FGHGA>9B8BF@ NH:i:7 HI:i:4 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:1215:4185:31561 272 1 14815 0 15M140N36M * 0 0 CTTTCCCAGAGATGCCCTTGCGCCTCATGACCAGCTTGTTGAAGAGATCCG CCCFFFFFHHHHGJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJIJJJH NH:i:7 HI:i:4 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:2108:1506:70629 272 1 14816 0 14M140N37M * 0 0 TTTCCCAGAGATGCCCTTGCGCCTCATGACCAGCTTGTTGAAGAGATCCGA ?@@;BDDD=DFFDDDFGGFCA?)<CEG@C??C?FDFFB<FGIFDFFDFC;; NH:i:7 HI:i:4 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:1113:4051:71948 272 1 14818 0 12M140N39M * 0 0 TCCCAGAGATGCCCTTGCGCCTCATGACCAGCTTGTTGAAGAGATCCGACA @@@FFDFFF:CDCFGGDHHGEFHIJIJIIGIGHIJDBBDHGI@9BFGIEHI NH:i:7 HI:i:4 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:1101:6114:49389 272 1 15001 0 38M757N13M * 0 0 ATCCGACATCAAGTGCCCACCTTGGCTCGTGGCTCTCACTTGCTCCTGCTC CCCFFFFFHHHHHHIJJJJJJJJIJJJJJJIJJJJJJJJJJJJJGJJJJJJ NH:i:8 HI:i:2 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:1103:12497:23506 272 1 15001 0 38M757N13M * 0 0 ATCCGACATCAAGTGCCCACCTTGGCTCGTGGCTCTCACTTGCTCCTGCTC CCCFFFFFHHHHGJIJJJJJJJJJJJJJJJJJJJJJIJJIJJJJJIJIJJI NH:i:8 HI:i:2 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:2214:19931:4971 272 1 15002 0 37M757N14M * 0 0 TCCTACATCAAGTGCCCACCTTGGCTCGTGGCTCTCACTTGCTCCTGCTCC ?;=+4ADDBBBDAFGGI>1@F?F+AAEEEB<GAG;?DGFE>FFIIF@DE4= NH:i:8 HI:i:2 AS:i:47 nM:i:1
+HWI-C00113:131:HMHYWADXX:1:1108:6828:32713 272 1 15003 0 36M757N15M * 0 0 CCGGCATCAAGTCCCCACCTTGGCTCGTGGCTCTCACTTGCTCCTGCTCCT ?@@ADDDDD?CDD:CFB@:@G@ABGFGFGFBFEAFEEEFCFCF@F=)8=@> NH:i:8 HI:i:2 AS:i:45 nM:i:2
+HWI-C00113:131:HMHYWADXX:1:1111:7491:39504 272 1 15003 0 36M757N15M * 0 0 CCGACATCAAGTGCCCACCTTGGCTCGTGGCTCTCACTTGCTCCTGCTCCT CCCFFDFFFHHHFIGEGHIGIGGDGIJFHEHGGIJJJIJIJJJJJIIIIGI NH:i:8 HI:i:2 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:1212:16079:85811 272 1 15003 0 36M757N15M * 0 0 CCGGCATCAAGTCCCCACCTTGGCTCGTGGCTCTCACTTGCTCCTGCTCCT @CCFFFFFHGHHHJJJJJJJIIJJJJJIJIJJHIIJJJJIJJJJJJJIJJJ NH:i:8 HI:i:2 AS:i:45 nM:i:2
+HWI-C00113:131:HMHYWADXX:1:2101:7167:50357 272 1 15003 0 36M757N15M * 0 0 CCGACATCAAGTGCCCACCTTGGCTCGTGGCTCTCACTTGCTCCTGCTCCT @@@DD?DDFHHHD@?<AGHBEHFGAGIFHEH3??BFGBD@GGCHGGGCHI; NH:i:8 HI:i:2 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:2201:9548:48040 272 1 15003 0 36M757N15M * 0 0 CCGACATCAAGTGCCCACCTTGGCTCGTGGCTCTCACTTGCTCCTGCTCCT CCCFFFFFHHHHHJJJJJJJJJJJJIJHIIIGGIIJJJJHIJJJJIJIJJJ NH:i:8 HI:i:2 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:2201:14017:74222 272 1 15003 0 36M757N15M * 0 0 CCGACATCAAGTGCCCACCTTGGCTCGTGGCTCTCACTTGCTCCTGCTCCT =@?=DDDDDBDCFHB@CG?EF<BC>CG?FHGIIIIG@??BGHIE;8@B<FB NH:i:8 HI:i:2 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:2204:7589:97905 272 1 15003 0 36M757N15M * 0 0 CCGACATCAAGTGCCCACCTTGGCTCGTGGCTCTCACTTGCTCCTGCTCCT CCCFFFFFHHHHDHIGGGJIJJIJJJJIJJJJIGIJIJJIJJJJJIJJIJJ NH:i:8 HI:i:2 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:2212:18929:92726 272 1 15003 0 36M757N15M * 0 0 CCGACATCAAGTGCCCACCTTGGCTCGTGGCTCTCACTTGCTCCTGCTCCT @@@DDDDDFFFF:AE<AFGFEHFFAF8:1:@8:DBBD9BB?/BDF<CDB<F NH:i:8 HI:i:2 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:2215:2615:12154 272 1 15003 0 36M757N15M * 0 0 CCGACATCAAGTGCCCACCTTGGCTCGTGGCTCTCACTTGCTCCTGCTCCT CCCFFFFFGHHHHJJJJJJJJIJJJJJJJJJJJJJJJJJIJJJJJJJJJJI NH:i:8 HI:i:2 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:1106:7741:42827 272 1 15003 0 36M757N15M * 0 0 CCGACATCAAGTGCCCACCTTGGCTCGTGGCTCTCACTTGCTCCTGCTCCT CCCFFFFFHHHHHJJJJJJJJJIJJJIIJJJJIJJJJJJJJJJJIJGHIHH NH:i:8 HI:i:2 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:1201:8380:74978 272 1 15003 0 36M757N15M * 0 0 CCGACATCAAGTGCCCACCTTGGCTCGTGGCTCTCACTTGCTCCTGCTCCT CCCFFFFFHFHHHIJJJJJJJJJJJJJFHIIGJJJJJJJIGGIIJEEDHHI NH:i:8 HI:i:2 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:1205:11268:38021 272 1 15003 0 36M757N15M * 0 0 CCGACATCAAGTCCCCACCTTGGCTCGTGGCTCTCACTTGCTCCTGCTCCT 1:?7DDD?ACDFBGHEAE@FGB@;@@A@C@0?F9FBFCF@48*9==3=CCF NH:i:8 HI:i:2 AS:i:47 nM:i:1
+HWI-C00113:131:HMHYWADXX:2:1208:17413:76793 272 1 15003 0 36M757N15M * 0 0 CCGACATCAAGTGCCCACCTTGGCTCGTGGCTCTCACTTGCTCCTGCTCCT C@@FFFFFGGDDDGHIGGIH<FHGEHB8CEIIJIIFG?FFHFHIJII>FEG NH:i:8 HI:i:2 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:1211:4828:84953 272 1 15003 0 36M757N15M * 0 0 CCGACATCAAGTGCCCACCTTGGCTCGTGGCTCTCACTTGCTCCTGCTCCT @BB?DFFFHHHHHIJIJJJJJJJJJJJHIJJIIJJJJJJIIIJJJJJJIJI NH:i:8 HI:i:2 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:2107:20905:80208 272 1 15003 0 36M757N15M * 0 0 CCGACATCAAGTGCCCACCTTGGCTCGTGGCTCTCACTTGCTCCTGCTCCT @CCFFFFFHHHFBHIIEIIDIHGGGGG@GGHCFGHIIJIGGGGIJIGIGGH NH:i:8 HI:i:2 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:2112:6263:84991 272 1 15003 0 36M757N15M * 0 0 CCGACATCAAGTCCCCACCTTGGCTCGTGGCTCTCACTTGCTCCTGCTCCT @@@?DDDDFBH?FHIGIGIIGG;GHBGCD?DCGIIGHEGBBFHGGIHBFIG NH:i:8 HI:i:2 AS:i:47 nM:i:1
+HWI-C00113:131:HMHYWADXX:2:2202:10314:26844 272 1 15003 0 36M757N15M * 0 0 CCGACATCAAGTGCCCACCTTGGCTCGTGGCTCTCACTTGCTCCTGCTCCT CCCFFFFFHHHHHJJJJJJJJJJJJJJIJJJJIIJJJJJJJJJJJJJJJJJ NH:i:8 HI:i:2 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:2213:21028:90280 272 1 15003 0 36M757N15M * 0 0 CCGACATCAAGTCCCCACCTTGGCTCGTGGCTCTCACTTGCTCCTGCTCCT @@@BDDDAD?FDF9GIBB@@FFG3CFF:DD)?BD*9D@@F4BDEEEFFF8= NH:i:8 HI:i:2 AS:i:47 nM:i:1
+HWI-C00113:131:HMHYWADXX:1:1216:14847:22529 272 1 15004 0 35M757N16M * 0 0 CGACATCAAGTGCCCACCTTGGCTCGTGGCTCTCACTTGCTCCTGCTCCTT @@@FFFFDHHBDHIIIJJJJIIIIIIJJIJJGIJIFIJJIDHHGBEHIJJJ NH:i:8 HI:i:2 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:2111:14281:81135 272 1 15007 0 32M757N19M * 0 0 CATCAAGTGCCCACCTTGGCTCGTGGCTCTCACTTGCTCCTGCTCCTTCTG @@@DDDBD42=:ACFFIE?FFGAFF@FFFDGEAG>D@DBB9BC3D@EDFFA NH:i:8 HI:i:2 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:2203:4824:93169 272 1 15008 0 31M757N20M * 0 0 ATCAAGTGCCCACCTTGGCTCGTGGCTCTCACTTGCTCCTGCTCCTTCTGC CCCFFFFFHHHHHJJJJIJJJJHIJIJJJJJJJJGIJJJJI?DFGFHIHJF NH:i:8 HI:i:2 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:1112:17298:87937 272 1 15925 1 23M659N28M * 0 0 CACTTCCCTGGGAGCTCCCTGGACTGAAGGAGACGCGCTGCTGCTGCTGTC ?@;;BA;3ABC?C?6EGDGIIBA+AAC<?D9CBGG@@FFFFAFCIIECC7= NH:i:4 HI:i:2 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:2210:17342:39133 272 1 15925 1 23M659N28M * 0 0 CACTTCCCTGGGAGCTCCCTGGACTGAAGGAGACGCGCTGCTGCTGCTGTC @?@DDDDDFAB::<EBFGIFG@FF9AECEFIFAGCD:F@F8=@E;7)77@@ NH:i:4 HI:i:2 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:2112:17740:2548 272 1 15931 1 17M659N34M * 0 0 CCTGGGAGCTCCCTGGACTGAAGGAGACGCGCTGCTGCTGCTGTCGTCCTG @@@FFD:ACFDCFCGGGDF?HHIBDEHFGHDHFIIGBDGEEHIFHGIIGHH NH:i:4 HI:i:2 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:2112:15228:46115 272 1 16727 0 39M92N12M * 0 0 GGGGCGGTGGGGGTGGTGTTAGTACCCCATCTTGTAGGTCTTGAGAGGCTC @CCDDFF:?FFHH-@B:AABCB@DDEEDCDCCDCCCCD>ACD>>:9:??2< NH:i:6 HI:i:2 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:2109:14386:93817 272 1 16728 0 38M92N13M * 0 0 GGGCGGTGGGGGTGGTGTTAGTACCCCATCTTGTAGGTCTTGAGAGGCTCG @CCFFFDDHHHHDHIFHIJJJGHHIIJHHHHHHFFFFEFEEEECDDDDDDB NH:i:6 HI:i:2 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:2203:14322:7218 272 1 16741 0 25M110N26M * 0 0 GGTGTTAGTACCCCATCTTGTAGGTCTCAGTGTGGAAGGTGGGCAGTTCTG ?@?DDD?BFFHHFB7EFGGGEFHIHA<CFHIGEHI<FEHH<=DEGG?DGEH NH:i:7 HI:i:6 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:1212:14242:21074 272 1 16751 0 15M92N36M * 0 0 CCCCATCTTGTAGGTCTTGAGAGGCTCGGCTACCTCAGTGTGGAAGGTGGG @@CDFFFFGGBFFECGGGGIIHCHCEG@FAEGII9?*?BB9BFGC@H)=FG NH:i:6 HI:i:2 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:1106:12278:45196 272 1 16752 0 14M92N37M * 0 0 CCCATCTTGTAGGTCTTGAGAGGCTCGGCTACCTCAGTGTGGAAGGTGGGC CCCFFFFFHHHHHHIHIJIJIJJJJJIJFEHHIGGEGHGFHGBGGH8BGH; NH:i:6 HI:i:2 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:2212:12811:3161 272 1 16752 0 14M92N37M * 0 0 CCCATCTTGTAGGTCTTGAGAGGCTCGGCTACCTCAGTGTGGAAGGTGGGC CCCFFFFFHFFHGEGHIJJJJJIIGHFHAFEGGDDCE:DBDGDHFH?CGH@ NH:i:6 HI:i:2 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:1102:2563:17519 272 1 16753 0 13M92N38M * 0 0 CCATCTTGTAGGTCTTGAGAGGCTCGGCTACCTCAGTGTGGAAGGCGGGCA ?<?D>DB8D>822<CC<<<+A;CE?1):C?))1:*0B<9*8*0*((7@4'3 NH:i:6 HI:i:2 AS:i:47 nM:i:1
+HWI-C00113:131:HMHYWADXX:1:2201:2398:40333 16 1 16753 0 13M92N38M * 0 0 CCATCTTGTAGGTCTTGAGAGGCTCGGCTACCTCAGTGTGGAAGGTGGGCA CCCFFFFFHHHDHEHIHIJGF1FFHEBH@FHICHDD<B?DDA@?FD?FHFH NH:i:6 HI:i:1 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:2209:7506:25914 16 1 16753 0 13M92N38M * 0 0 CCATCTTGTAGGTCTTGAGAGGCTCGGCTACCTCAGTGTGGAAGGTGGGCA @C@FDFFFFHHHHIIJIIIJJJJJJBHG=?DFBC<?:?9?FGHCG8BHHD7 NH:i:6 HI:i:1 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:1207:6786:56046 16 1 16753 0 13M92N38M * 0 0 CCATCTTGTAGGTCTTGAGAGGCTCGGCTACCTCAGTGTGGAAGGTGGGCA @B@DFFFFHHHHHGIGIIIAGIJEGAHHEGHHBF>BDG?FHBGEH?FHGG3 NH:i:6 HI:i:1 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:2116:7403:96086 272 1 17020 0 36M177N15M * 0 0 GCCCAGGTCTGGCACATAGAAGTAGTTCTCTGGGACCTGCTGTTCCAGCTG :?=DDD=AAAC:+<CA2C+::AFAC,9<9CA+::CEDDDD>BDIIIIIIA? NH:i:7 HI:i:5 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:1209:11002:81132 272 1 17020 0 36M177N15M * 0 0 GCCCGGGTCTGGCACATAGAAGTAGTTCTCTGGGACCTGCTGTTCCAGCTG @@@DD@A<@DDDF;BCGF<4CHCEG?EG@FGF9)?BB:?B?DBF>D?**9B NH:i:7 HI:i:4 AS:i:47 nM:i:1
+HWI-C00113:131:HMHYWADXX:2:1115:8064:78307 272 1 17021 0 35M177N16M * 0 0 CCCTGGTCTGGCACATAGAAGTAGTTCTCTGGGACCTGCTGTTCCAGCTGC 11844BBDD=FDFEFFFDFI?HEHAFBEHEEEFC?E:FDGDD<FE4:9??9 NH:i:7 HI:i:5 AS:i:47 nM:i:1
+HWI-C00113:131:HMHYWADXX:2:1211:18547:26385 272 1 17027 0 29M177N22M * 0 0 TCTGGCACATAGAAGTAGTTCTCTGGGACCTGCTGTTCCAGCTGCTCTCTC BCCFFFFFHHHHHJJIIJHJJJJJJJIJJJIJJJJJJJJJIJJJJJJJJJJ NH:i:7 HI:i:6 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:2109:12204:47428 272 1 17028 0 28M177N23M * 0 0 CTGGCACATAGAAGTAGTTCTCTGGGACCTGCTGTTCCAGCTGCTCTCTCT @@@DDBD:=ACDBFEGECFGIHD>DH9CBEHHHEEFB?F>GD@3?FB?BB@ NH:i:7 HI:i:6 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:1101:15891:42282 272 1 17028 0 28M177N23M * 0 0 CTGGCACATAGAAGTAGTTCTCTGGGACCTGCTGTTCCAGCTGCTCTCTCT CCCFFFFFHHHHHJHHIIJJJJJJJJJJIIJJJJIJJJIJJJJJJJJJJJJ NH:i:7 HI:i:6 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:1107:10929:6659 272 1 17030 0 26M177N25M * 0 0 GGCACATAGAAGTAGTTCTCTGGGACCTGCTGTTCCAGCTGCTCTCTCTTG CCCFFFFFHHHHDHIHHJJGJJJJJJJIJJIJGIJJJIJJJIJJJJJIJJG NH:i:7 HI:i:6 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:1114:7098:71178 272 1 17030 0 26M177N25M * 0 0 GGCACATAGAAGTAGTTCTCTGGGACCTGCTGTTCCAGCTGCTCTCTCTTG =?@BDEEFHBDHFBGIEGIHEHIGDHGEIIJIIIEHIHIIGHDGHIGIIH@ NH:i:7 HI:i:6 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:1209:3383:100724 272 1 17030 0 26M177N25M * 0 0 GGCACATAGAAGTAGTTCTCTGGGACCTGCTGTTCCAGCTGCTCTCTCTTG ?@@ADDDDDHDH?EEFH<CAHHGCHIF?GG>EHIGIIGHGHIFII>BFIH? NH:i:7 HI:i:6 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:2111:3771:31345 272 1 17030 0 26M177N25M * 0 0 GGCACATAGAAGTAGTTCTCTGGGACCTGCTGTTCCAGCTGCTCTCTCTTG @@@DFFFFGHDHHHJGIHJJJJGIJJIJIJIIJJIIJJIGHIJJJIJJIJ< NH:i:7 HI:i:6 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:2205:14794:36455 272 1 17030 0 26M177N25M * 0 0 GGCACATAGAAGTAGTTCTCTGGGACCTGCTGTTCCAGCTGCTCTCTCTTG CCCFFFFFHHHHHIJJJJJJJJJJJJJJJJJJIJJJJIJJIJJJJJJJJJJ NH:i:7 HI:i:6 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:1107:19701:64552 272 1 17030 0 26M177N25M * 0 0 GGCACATAGAAGTAGTTCTCTGGGACCTGCTGTTCCAGCTGCTCTCTCTTG CCCFFDFFHHHHDGIIJIJJJIIJDGHGJJJJJJIJJJJJJJGIJJJJJJF NH:i:7 HI:i:6 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:1210:18711:88303 272 1 17030 0 26M177N25M * 0 0 GGCACATAGAAGTAGTTCTCTGGGACCTGCTGTTCCAGCTGCTCTCTCTTG CCCFFFFFHHHHHJJJJJJJJIJFIJJJEIIHIIJJIIJJGJJJIJJJJJE NH:i:7 HI:i:6 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:2212:19113:15559 272 1 17030 0 26M177N25M * 0 0 GGCACATAGAAGTAGTTCTCTGGGACCTGCTGTTCCAGCTGCTCTCTCTTG @@@7B>DDC=<AF@<CFB?<,?FFDBF3AD+?9*?EGCF>@BFBGBAF<FG NH:i:7 HI:i:6 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:2212:14258:59619 272 1 17030 0 26M177N25M * 0 0 GGCACATAGAAGTAGTTCTCTGGGACCTGCTGTTCCAGCTGCTCTCTCTTG =?@DDD:=DADFAEFGEF<,AADE<F<?AAFCGG@?FD>CGBF<D<9B<D< NH:i:7 HI:i:6 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:2103:9695:24819 272 1 17031 0 25M177N26M * 0 0 GCACATAGAAGTAGTTCTCTGGGACCTGCTGTTCCAGCTGCTCTCTCTTGC CCCFFFFFHHHHHJHIJJJJJJJJJJJJJJJIJJJJJJJJJJJJJJJJJGG NH:i:7 HI:i:6 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:1204:13994:2816 272 1 17031 0 25M177N26M * 0 0 GCACATAGAAGTAGTTCTCTGGGACCTGCTGTTCCAGCTGCTCTCTCTTGC ?@@DDDDDHHHHFHEFAABA@?FGBEFHIIIHH>DB@DHIHIDD>@@GHID NH:i:7 HI:i:6 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:1212:15591:47491 272 1 17031 0 25M177N26M * 0 0 GCACATAGAAGTAGTTCTCTGGGACCTGCTGTTCCAGCTGCTCTCTCTTGC @@C+ADDDDHFFDEGEGIIIDFHIFHIIIIIGEHIIBH>FGGGHGHFGGII NH:i:7 HI:i:6 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:2215:10125:81395 272 1 17031 0 25M859N26M * 0 0 GCACATAGAAGTAGTTCTCTGGGACCTGCAGGGCCCGCTCGTCCAGGGGGC CCCFFFFFGHHHHJJJJJJJJJHJJJJJJIJIIJJJHIJJJJJJJJJIJHE NH:i:6 HI:i:1 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:2102:9065:90529 16 1 17033 0 2S23M550N26M * 0 0 GTACATAGAAGTAGTTCTCTGGGACAGGTTCTCGGTGGTGTTGAAGAGCAG C@CFFFFFHHHHHJJJJJJJJJJJJJJJJJJJJJJFHIFHIJIJJJJJJJJ NH:i:5 HI:i:2 AS:i:47 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:2204:7767:77376 16 1 17033 0 2S23M550N26M * 0 0 GTACATAGAAGTAGTTCTCTGGGACAGGTTCTCGGTGGTGTTGAAGAGCAG @@@FDFDDBFHADEHEIGIGIJIGHIHG?EDGHGGCFH:B?BD@FGFHGIH NH:i:5 HI:i:2 AS:i:47 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:1212:6793:42000 16 1 17033 0 2S23M550N26M * 0 0 GTACATAGAAGTAGTTCTCTGGGACAGGTTCTCGGTGGTGTTGAAGAGCAG @@?DADBD8CFADGFHIIIIE3A<EC:EHGGGIIB8?80?DDH>9?<FGCD NH:i:5 HI:i:2 AS:i:47 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:1211:14829:37922 272 1 17036 0 20M177N31M * 0 0 TAGAAGTAGTTCTCTGGGACCTGCTGTTCCAGCTGCTCTCTCTTGCTGATG @<@BDBFDFFHAFHFF@GIIIHECFHFGFHICFHFIIIIGIIEGFF<FHII NH:i:7 HI:i:6 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:2216:19937:47046 16 1 17334 1 35M237N16M * 0 0 CAGCCAGGGGGTCCAGGAAGACATACTTCTTCTACAGGTTCTCGGTGGTGT CC@F?DA?FDHHHIIGI@DHHGGFHHHIIAG@F@GFHHGGHEHG7-;FEHE NH:i:4 HI:i:1 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:1111:7653:49738 16 1 17336 1 33M237N18M * 0 0 GCCAGGGGGTCCAGGAAGACATACTTCTTCTACAGGTTCTCGGTGGTGTTG ?8=4BDDDFDFFAGF@@GD?9?FBFDDDBDEFFIII?BDEFFI75F5;65C NH:i:4 HI:i:1 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:2205:11163:47820 16 1 17336 1 33M237N18M * 0 0 GCCAGGGGGTCCAGGAAGACATACTTCTTCTACAGGTTCTCGGTGGTGTTG C@CFFFFFHHHHHJJJJIIJJJIJIJJJJJJJJJJJJJJJJJJGHIAGHII NH:i:4 HI:i:1 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:2208:13311:23997 16 1 17340 1 29M237N22M * 0 0 GGGGGTCCAGGAAGACATACTTCTTCTACAGGTTCTCGGTGGTGTTGAAGA ?@8AD?@D?F>DH?FHGHH@EHGIEHGGIIIGGHIGHGFDEHGH=FHGIIH NH:i:3 HI:i:1 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:2207:3786:78354 16 1 17340 1 29M237N22M * 0 0 GGGGGTCCAGGAAGACATACTTCTTCTACAGGTTCTCGGTGGTGTTGAAGA CCCFFFFFHHHHHJJJJJIIJJJJJJJJJJJJHHIJIHHBFIHIIJJJJJI NH:i:3 HI:i:1 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:1115:8438:81914 16 1 17341 1 28M237N23M * 0 0 GGGGTCCAGGAAGACATACTTCTTCTACAGGTTCTCGGTGGTGTTGAAGAG @@CFFFFDHH?HDGGHIIGIGHIGHGIDIIIFGIIGHHDG:?DFHEHIIII NH:i:3 HI:i:1 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:1114:13486:49038 16 1 17341 1 28M237N23M * 0 0 GGGGTCCAGGAAGACATACTTCTTCTACAGGTTCTCGGTGGTGTTGAAGAG ?@@:D@DDFHAFFHGGFHFHH@CCHIIIII@:CFGFGGC?D)?8DHHGCGI NH:i:3 HI:i:1 AS:i:49 nM:i:0
+++ /dev/null
-import pysam
-
-is_paired = 0
-is_proper = 0
-
-for read in pysam.AlignmentFile("ex1.bam", "rb"):
- is_paired += read.is_paired
- is_proper += read.is_proper_pair
-
-print ("there are alignments of %i paired reads" % is_paired)
-print ("there are %i proper paired alignments" % is_proper)
# an output file.
statements = [
"view ex1.bam > %(out)s_ex1.view",
+ "view -c ex1.bam > %(out)s_ex1.count",
# ("view -bT ex1.fa -o %(out)s_ex1.view2 ex1.sam",
"sort ex1.bam -o %(out)s_ex1.sort.bam",
"mpileup ex1.bam > %(out)s_ex1.pileup",