- osx
language: c
-sudo: required
+sudo: false
env:
matrix:
- CONDA_PY=2.7
- - CONDA_PY=3.3
- CONDA_PY=3.4
- CONDA_PY=3.5
+ - CONDA_PY=3.6
addons:
apt:
while (*p && *p!=',') p++;
if ( *p==',' ) p++;
}
- if ( args->nwrite>1 && !args->prefix ) error("Expected -p when mutliple output files given: --write %s\n", args->write_files);
+ if ( args->nwrite>1 && !args->prefix ) error("Expected -p when multiple output files given: --write %s\n", args->write_files);
if ( args->isec_op==OP_COMPLEMENT && args->nwrite )
{
if ( args->nwrite>1 ) error("Multiple files to -w make no sense with -C\n");
--- /dev/null
+"""compute number of reads/alignments from BAM file
+===================================================
+
+This is a benchmarking utility script with limited functionality.
+
+Compute simple flag stats on a BAM-file using
+the pysam cython interface.
+
+"""
+
+import sys
+import pysam
+import pyximport
+pyximport.install()
+import _cython_flagstat
+
+assert len(sys.argv) == 2, "USAGE: {} filename.bam".format(sys.argv[0])
+
+is_paired, is_proper = _cython_flagstat.count(
+ pysam.AlignmentFile(sys.argv[1], "rb"))
+
+print ("there are alignments of %i paired reads" % is_paired)
+print ("there are %i proper paired alignments" % is_proper)
--- /dev/null
+"""compute number of reads/alignments from BAM file
+===================================================
+
+This is a benchmarking utility script with limited functionality.
+
+Compute simple flag stats on a BAM-file using
+the pysam python interface.
+"""
+
+import sys
+import pysam
+
+assert len(sys.argv) == 2, "USAGE: {} filename.bam".format(sys.argv[0])
+
+is_paired = 0
+is_proper = 0
+
+for read in pysam.AlignmentFile(sys.argv[1], "rb"):
+ is_paired += read.is_paired
+ is_proper += read.is_proper_pair
+
+print ("there are alignments of %i paired reads" % is_paired)
+print ("there are %i proper paired alignments" % is_proper)
--- /dev/null
+#!/bin/bash
+#
+# Build manylinux1 wheels for pysam. Based on the example at
+# <https://github.com/pypa/python-manylinux-demo>
+#
+# It is best to run this in a fresh clone of the repository!
+#
+# Run this within the repository root:
+# docker run --rm -v $(pwd):/io quay.io/pypa/manylinux1_x86_64 /io/buildwheels.sh
+#
+# The wheels will be put into the wheelhouse/ subdirectory.
+#
+# For interactive tests:
+# docker run -it -v $(pwd):/io quay.io/pypa/manylinux1_x86_64 /bin/bash
+
+set -xeuo pipefail
+
+# For convenience, if this script is called from outside of a docker container,
+# it starts a container and runs itself inside of it.
+if ! grep -q docker /proc/1/cgroup; then
+ # We are not inside a container
+ exec docker run --rm -v $(pwd):/io quay.io/pypa/manylinux1_x86_64 /io/$0
+fi
+
+yum install -y zlib-devel
+
+# Python 2.6 is not supported
+rm -r /opt/python/cp26*
+
+# Python 3.3 builds fail with:
+# /opt/rh/devtoolset-2/root/usr/libexec/gcc/x86_64-CentOS-linux/4.8.2/ld: cannot find -lchtslib
+rm -r /opt/python/cp33*
+
+# Without libcurl support, htslib can open files from HTTP and FTP URLs.
+# With libcurl support, it also supports HTTPS and S3 URLs, but libcurl needs a
+# current version of OpenSSL, and we do not want to be responsible for
+# updating the wheels as soon as there are any security issues. So disable
+# libcurl for now.
+# See also <https://github.com/pypa/manylinux/issues/74>.
+#
+export HTSLIB_CONFIGURE_OPTIONS="--disable-libcurl"
+
+PYBINS="/opt/python/*/bin"
+for PYBIN in ${PYBINS}; do
+ ${PYBIN}/pip install -r /io/requirements.txt
+ ${PYBIN}/pip wheel /io/ -w wheelhouse/
+done
+
+# Bundle external shared libraries into the wheels
+#
+# The '-L .' option is a workaround. By default, auditwheel puts all external
+# libraries (.so files) into a .libs directory and sets the RUNPATH to $ORIGIN/.libs.
+# When HTSLIB_MODE is 'shared' (now the default), all .so libraries that are
+# part of pysam require RUNPATH to be set to $ORIGIN (without the .libs), but
+# auditwheel seems to overwrite $ORIGIN with $ORIGIN/.libs. This workaround
+# makes auditwheel set the RUNPATH to "$ORIGIN/." so that it works as desired.
+#
+for whl in wheelhouse/*.whl; do
+ auditwheel repair -L . $whl -w /io/wheelhouse/
+done
+
+# Created files are owned by root, so fix permissions.
+chown -R --reference=/io/setup.py /io/wheelhouse/
+
+# TODO Install packages and test them
+#for PYBIN in ${PYBINS}; do
+# ${PYBIN}/pip install pysam --no-index -f /io/wheelhouse
+# (cd $HOME; ${PYBIN}/nosetests ...)
+#done
config_vars = get_config_vars()
config_vars['LDSHARED'] = config_vars['LDSHARED'].replace('-bundle', '')
config_vars['SHLIB_EXT'] = '.so'
- config_vars['SO'] = '.so'
def is_pip_install():
ext.library_dirs.append(os.path.join(self.build_lib, "pysam"))
if sys.platform == 'darwin':
-
relative_module_path = ext.name.replace(".", os.sep) + get_config_vars()["SO"]
if "develop" in sys.argv or "test" in sys.argv:
:members:
-.. autoclass:: pysam.cfaidx.FastqProxy
+.. autoclass:: pysam.FastqProxy
:members:
Release notes
=============
+Release 0.10.0
+==============
+
+This release implements further functionality in the VariantFile API
+and includes several bugfixes:
+
+* treat the -c option of samtools view as a special case: counts go to
+  stdout even if -o is given, fixes #315
+* permit reading BAM files with CSI index, closes #370
+* raise Error if query name exceeds maximum length, fixes #373
+* new method to compute hash value for AlignedSegment
+* AlignmentFile, VariantFile and TabixFile all inherit from HTSFile
+* Avoid segfault by detecting out of range reference_id and
+ next_reference in AlignedSegment.tostring
+* Issue #355: Implement streams using file descriptors for VariantFile
+* upgrade to htslib 1.3.2
+* fix compilation with musl libc
+* Issue #316, #360: Rename all Cython modules to have lib as a prefix
+* Issue #332, hardclipped bases in cigar included by
+ pysam.AlignedSegment.infer_query_length()
+* Added support for Python 3.6 filename encoding protocol
+* Issue #371, fix incorrect parsing of scalar INFO and FORMAT fields in VariantRecord
+* Issue #331, fix failure in VariantFile.reset() method
+* Issue #314, add VariantHeader.new_record(), VariantFile.new_record() and
+ VariantRecord.copy() methods to create new VariantRecord objects
+* Added VariantRecordFilter.add() method to allow setting new VariantRecord filters
+* Preliminary (potentially unsafe) support for removing and altering header metadata
+* Many minor fixes and improvements to VariantFile and related objects
+
Release 0.9.1
=============
for row in tbx.fetch("chr1", 1000, 2000):
print ("chromosome is", row[0])
-By providing a parser argument to :class:`~pysam.AlignmentFile.fetch`
+By providing a parser to :class:`~pysam.AlignmentFile.fetch`
or :class:`~pysam.TabixFile`, the data will be presented in parsed
-form:
+form::
for row in tbx.fetch("chr1", 1000, 2000, parser=pysam.asTuple()):
print ("chromosome is", row.contig)
+ print ("first field (chrom)=", row[0])
+
+Pre-built parsers are available for :term:`bed`
+(:class:`~pysam.asBed`) formatted files and :term:`gtf`
+(:class:`~pysam.asGTF`) formatted files. Thus, additional fields
+become available through named access, for example::
+
+ for row in tbx.fetch("chr1", 1000, 2000, parser=pysam.asBed()):
+ print ("name is", row.name)
+
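+A similar sketch for a tabix-indexed :term:`gtf` file (assuming
+``tbx`` now points at a GTF file; ``feature`` is one of the named
+fields provided by :class:`~pysam.asGTF`)::
+
+    for row in tbx.fetch("chr1", 1000, 2000, parser=pysam.asGTF()):
+        # named access to the GTF feature column
+        print ("feature is", row.feature)
+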
.. Currently inactivated as pileup deprecated
.. Using the samtools SNP caller
import sysconfig
from pysam.libchtslib import *
-from pysam.cutils import *
-import pysam.cutils as cutils
-import pysam.cfaidx as cfaidx
-from pysam.cfaidx import *
-import pysam.ctabix as ctabix
-from pysam.ctabix import *
-import pysam.csamfile as csamfile
-from pysam.csamfile import *
-import pysam.calignmentfile as calignmentfile
-from pysam.calignmentfile import *
-import pysam.calignedsegment as calignedsegment
-from pysam.calignedsegment import *
-import pysam.cvcf as cvcf
-from pysam.cvcf import *
-import pysam.cbcf as cbcf
-from pysam.cbcf import *
+from pysam.libcutils import *
+import pysam.libcutils as libcutils
+import pysam.libcfaidx as libcfaidx
+from pysam.libcfaidx import *
+import pysam.libctabix as libctabix
+from pysam.libctabix import *
+import pysam.libcsamfile as libcsamfile
+from pysam.libcsamfile import *
+import pysam.libcalignmentfile as libcalignmentfile
+from pysam.libcalignmentfile import *
+import pysam.libcalignedsegment as libcalignedsegment
+from pysam.libcalignedsegment import *
+import pysam.libcvcf as libcvcf
+from pysam.libcvcf import *
+import pysam.libcbcf as libcbcf
+from pysam.libcbcf import *
+import pysam.libcbgzf as libcbgzf
+from pysam.libcbgzf import *
from pysam.utils import SamtoolsError
import pysam.Pileup as Pileup
from pysam.samtools import *
# export all the symbols from separate modules
__all__ = \
libchtslib.__all__ +\
- cutils.__all__ +\
- ctabix.__all__ +\
- cvcf.__all__ +\
- cbcf.__all__ +\
- cfaidx.__all__ +\
- calignmentfile.__all__ +\
- calignedsegment.__all__ +\
- csamfile.__all__ +\
+ libcutils.__all__ +\
+ libctabix.__all__ +\
+ libcvcf.__all__ +\
+ libcbcf.__all__ +\
+ libcbgzf.__all__ +\
+ libcfaidx.__all__ +\
+ libcalignmentfile.__all__ +\
+ libcalignedsegment.__all__ +\
+ libcsamfile.__all__ +\
["SamtoolsError"] +\
["Pileup"]
def get_libraries():
'''return a list of libraries to link against.'''
- # Note that this list does not include csamtools.so as there are
+ # Note that this list does not include libcsamtools.so as there are
# numerous name conflicts with libchtslib.so.
dirname = os.path.abspath(os.path.join(os.path.dirname(__file__)))
- pysam_libs = ['ctabixproxies',
- 'cfaidx',
- 'csamfile',
- 'cvcf',
- 'cbcf',
- 'ctabix']
+ pysam_libs = ['libctabixproxies',
+ 'libcfaidx',
+ 'libcsamfile',
+ 'libcvcf',
+ 'libcbcf',
+ 'libctabix']
if pysam.config.HTSLIB == "builtin":
pysam_libs.append('libchtslib')
- if sys.version_info.major >= 3:
- if sys.version_info.minor >= 5:
- return [os.path.join(dirname, x + ".{}.so".format(
- sysconfig.get_config_var('SOABI'))) for x in pysam_libs]
- else:
- return [os.path.join(dirname, x + ".{}{}.so".format(
- sys.implementation.cache_tag,
- sys.abiflags)) for x in pysam_libs]
- else:
- return [os.path.join(dirname, x + ".so") for x in pysam_libs]
+ so = sysconfig.get_config_var('SO')
+ return [os.path.join(dirname, x + so) for x in pysam_libs]
+++ /dev/null
-from pysam.chtslib cimport *
-
-cdef extern from "htslib_util.h":
-
- # add *nbytes* into the variable length data of *src* at *pos*
- bam1_t * pysam_bam_update(bam1_t * b,
- size_t nbytes_old,
- size_t nbytes_new,
- uint8_t * pos)
-
- # now: static
- int aux_type2size(int)
-
- char * pysam_bam_get_qname(bam1_t * b)
- uint32_t * pysam_bam_get_cigar(bam1_t * b)
- uint8_t * pysam_bam_get_seq(bam1_t * b)
- uint8_t * pysam_bam_get_qual(bam1_t * b)
- uint8_t * pysam_bam_get_aux(bam1_t * b)
- int pysam_bam_get_l_aux(bam1_t * b)
- char pysam_bam_seqi(uint8_t * s, int i)
-
- uint16_t pysam_get_bin(bam1_t * b)
- uint8_t pysam_get_qual(bam1_t * b)
- uint8_t pysam_get_l_qname(bam1_t * b)
- uint16_t pysam_get_flag(bam1_t * b)
- uint16_t pysam_get_n_cigar(bam1_t * b)
- void pysam_set_bin(bam1_t * b, uint16_t v)
- void pysam_set_qual(bam1_t * b, uint8_t v)
- void pysam_set_l_qname(bam1_t * b, uint8_t v)
- void pysam_set_flag(bam1_t * b, uint16_t v)
- void pysam_set_n_cigar(bam1_t * b, uint16_t v)
- void pysam_update_flag(bam1_t * b, uint16_t v, uint16_t flag)
-
-
-from pysam.calignmentfile cimport AlignmentFile
-ctypedef AlignmentFile AlignmentFile_t
-
-
-# Note: need to declare all C fields and methods here
-cdef class AlignedSegment:
-
- # object that this AlignedSegment represents
- cdef bam1_t * _delegate
-
- # the file from which this AlignedSegment originates (can be None)
- cdef AlignmentFile _alignment_file
-
- # caching of array properties for quick access
- cdef object cache_query_qualities
- cdef object cache_query_alignment_qualities
- cdef object cache_query_sequence
- cdef object cache_query_alignment_sequence
-
- # add an alignment tag with value to the AlignedSegment
- # an existing tag of the same name will be replaced.
- cpdef set_tag(self, tag, value, value_type=?, replace=?)
-
- # add an alignment tag with value to the AlignedSegment
- # an existing tag of the same name will be replaced.
- cpdef get_tag(self, tag, with_value_type=?)
-
- # return true if tag exists
- cpdef has_tag(self, tag)
-
- # returns a valid sam alignment string
- cpdef tostring(self, AlignmentFile_t handle)
-
-
-cdef class PileupColumn:
- cdef bam_pileup1_t ** plp
- cdef int tid
- cdef int pos
- cdef int n_pu
- cdef AlignmentFile _alignment_file
-
-
-cdef class PileupRead:
- cdef AlignedSegment _alignment
- cdef int32_t _qpos
- cdef int _indel
- cdef int _level
- cdef uint32_t _is_del
- cdef uint32_t _is_head
- cdef uint32_t _is_tail
- cdef uint32_t _is_refskip
-
-# factory methods
-cdef makeAlignedSegment(bam1_t * src, AlignmentFile alignment_file)
-cdef makePileupColumn(bam_pileup1_t ** plp, int tid, int pos, int n_pu, AlignmentFile alignment_file)
-cdef inline makePileupRead(bam_pileup1_t * src, AlignmentFile alignment_file)
-cdef inline uint32_t get_alignment_length(bam1_t * src)
+++ /dev/null
-# cython: embedsignature=True
-# cython: profile=True
-###############################################################################
-###############################################################################
-# Cython wrapper for SAM/BAM/CRAM files based on htslib
-###############################################################################
-# The principal classes defined in this module are:
-#
-# class AlignedSegment an aligned segment (read)
-#
-# class PileupColumn a collection of segments (PileupRead) aligned to
-# a particular genomic position.
-#
-# class PileupRead an AlignedSegment aligned to a particular genomic
-# position. Contains additional attributes with respect
-# to this.
-#
-# Additionally this module defines numerous additional classes that are part
-# of the internal API. These are:
-#
-# Various iterator classes to iterate over alignments in sequential (IteratorRow)
-# or in a stacked fashion (IteratorColumn):
-#
-# class IteratorRow
-# class IteratorRowRegion
-# class IteratorRowHead
-# class IteratorRowAll
-# class IteratorRowAllRefs
-# class IteratorRowSelection
-#
-###############################################################################
-#
-# The MIT License
-#
-# Copyright (c) 2015 Andreas Heger
-#
-# Permission is hereby granted, free of charge, to any person obtaining a
-# copy of this software and associated documentation files (the "Software"),
-# to deal in the Software without restriction, including without limitation
-# the rights to use, copy, modify, merge, publish, distribute, sublicense,
-# and/or sell copies of the Software, and to permit persons to whom the
-# Software is furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-# DEALINGS IN THE SOFTWARE.
-#
-###############################################################################
-import re
-import array
-import ctypes
-import struct
-
-cimport cython
-from cpython cimport array as c_array
-from cpython.version cimport PY_MAJOR_VERSION
-from cpython cimport PyErr_SetString, PyBytes_FromStringAndSize
-from libc.string cimport strchr
-from cpython cimport array as c_array
-
-from pysam.cutils cimport force_bytes, force_str, \
- charptr_to_str, charptr_to_bytes
-from pysam.cutils cimport qualities_to_qualitystring, qualitystring_to_array, \
- array_to_qualitystring
-
-# Constants for binary tag conversion
-cdef char * htslib_types = 'cCsSiIf'
-cdef char * parray_types = 'bBhHiIf'
-
-# translation tables
-
-# cigar code to character and vice versa
-cdef char* CODE2CIGAR= "MIDNSHP=XB"
-cdef int NCIGAR_CODES = 10
-
-if PY_MAJOR_VERSION >= 3:
- CIGAR2CODE = dict([y, x] for x, y in enumerate(CODE2CIGAR))
-else:
- CIGAR2CODE = dict([ord(y), x] for x, y in enumerate(CODE2CIGAR))
-
-CIGAR_REGEX = re.compile("(\d+)([MIDNSHP=XB])")
-
-#####################################################################
-# typecode guessing
-cdef inline char map_typecode_htslib_to_python(uint8_t s):
- """map an htslib typecode to the corresponding python typecode
- to be used in the struct or array modules."""
-
- # map type from htslib to python array
- cdef char * f = strchr(htslib_types, s)
-
- if f == NULL:
- return 0
- return parray_types[f - htslib_types]
-
-cdef inline uint8_t map_typecode_python_to_htslib(char s):
- """determine value type from type code of array"""
- cdef char * f = strchr(parray_types, s)
- if f == NULL:
- return 0
- return htslib_types[f - parray_types]
-
-# optional tag data manipulation
-cdef convert_binary_tag(uint8_t * tag):
- """return bytesize, number of values and array of values
- in aux_data memory location pointed to by tag."""
- cdef uint8_t auxtype
- cdef uint8_t byte_size
- cdef int32_t nvalues
- # get byte size
- auxtype = tag[0]
- byte_size = aux_type2size(auxtype)
- tag += 1
- # get number of values in array
- nvalues = (<int32_t*>tag)[0]
- tag += 4
-
- # define python array
- cdef c_array.array c_values = array.array(
- chr(map_typecode_htslib_to_python(auxtype)))
- c_array.resize(c_values, nvalues)
-
- # copy data
- memcpy(c_values.data.as_voidptr, <uint8_t*>tag, nvalues * byte_size)
-
- # no need to check for endian-ness as bam1_core_t fields
- # and aux_data are in host endian-ness. See sam.c and calls
- # to swap_data
- return byte_size, nvalues, c_values
-
-
-cdef inline uint8_t get_value_code(value, value_type=None):
- '''guess type code for a *value*. If *value_type* is None,
- the type code will be inferred based on the Python type of
- *value*'''
- cdef uint8_t typecode
- cdef char * _char_type
-
- if value_type is None:
- if isinstance(value, int):
- typecode = 'i'
- elif isinstance(value, float):
- typecode = 'd'
- elif isinstance(value, str):
- typecode = 'Z'
- elif isinstance(value, bytes):
- typecode = 'Z'
- elif isinstance(value, array.array) or \
- isinstance(value, list) or \
- isinstance(value, tuple):
- typecode = 'B'
- else:
- return 0
- else:
- if value_type not in 'Zidf':
- return 0
- value_type = force_bytes(value_type)
- _char_type = value_type
- typecode = (<uint8_t*>_char_type)[0]
-
- return typecode
-
-
-cdef inline bytes getTypecode(value, maximum_value=None):
- '''returns the value typecode of a value.
-
-    If maximum_value is specified, the appropriate type is
- returned for a range where value is the minimum.
- '''
-
- if maximum_value is None:
- maximum_value = value
-
- cdef bytes valuetype
-
- t = type(value)
-
- if t is float:
- valuetype = b'f'
- elif t is int:
- # signed ints
- if value < 0:
- if value >= -128 and maximum_value < 128:
- valuetype = b'c'
- elif value >= -32768 and maximum_value < 32768:
- valuetype = b's'
- elif value < -2147483648 or maximum_value >= 2147483648:
- raise ValueError(
- "at least one signed integer out of range of "
- "BAM/SAM specification")
- else:
- valuetype = b'i'
- # unsigned ints
- else:
- if maximum_value < 256:
- valuetype = b'C'
- elif maximum_value < 65536:
- valuetype = b'S'
- elif maximum_value >= 4294967296:
- raise ValueError(
- "at least one integer out of range of BAM/SAM specification")
- else:
- valuetype = b'I'
- else:
- # Note: hex strings (H) are not supported yet
- if t is not bytes:
- value = value.encode('ascii')
- if len(value) == 1:
- valuetype = b'A'
- else:
- valuetype = b'Z'
-
- return valuetype
-
-
-cdef inline packTags(tags):
- """pack a list of tags. Each tag is a tuple of (tag, tuple).
-
- Values are packed into the most space efficient data structure
- possible unless the tag contains a third field with the typecode.
-
- Returns a format string and the associated list of arguments
- to be used in a call to struct.pack_into.
- """
- fmts, args = ["<"], []
-
- cdef char array_typecode
-
- datatype2format = {
- b'c': ('b', 1),
- b'C': ('B', 1),
- b's': ('h', 2),
- b'S': ('H', 2),
- b'i': ('i', 4),
- b'I': ('I', 4),
- b'f': ('f', 4),
- b'A': ('c', 1)}
-
- for tag in tags:
-
- if len(tag) == 2:
- pytag, value = tag
- valuetype = None
- elif len(tag) == 3:
- pytag, value, valuetype = tag
- else:
- raise ValueError("malformatted tag: %s" % str(tag))
-
- pytag = force_bytes(pytag)
- valuetype = force_bytes(valuetype)
- t = type(value)
-
- if t is tuple or t is list:
- # binary tags from tuples or lists
- if valuetype is None:
- # automatically determine value type - first value
- # determines type. If there is a mix of types, the
- # result is undefined.
- valuetype = getTypecode(min(value), max(value))
-
- if valuetype not in datatype2format:
- raise ValueError("invalid value type '%s'" % valuetype)
-
- datafmt = "2sccI%i%s" % (len(value), datatype2format[valuetype][0])
- args.extend([pytag[:2],
- b"B",
- valuetype,
- len(value)] + list(value))
-
- elif isinstance(value, array.array):
- # binary tags from arrays
- if valuetype is None:
- array_typecode = map_typecode_python_to_htslib(ord(value.typecode))
-
- if array_typecode == 0:
- raise ValueError("unsupported type code '{}'"
- .format(value.typecode))
-
- valuetype = force_bytes(chr(array_typecode))
-
- if valuetype not in datatype2format:
- raise ValueError("invalid value type '%s' (%s)" %
- (valuetype, type(valuetype)))
-
- # use array.tostring() to retrieve byte representation and
- # save as bytes
- datafmt = "2sccI%is" % (len(value) * datatype2format[valuetype][1])
- args.extend([pytag[:2],
- b"B",
- valuetype,
- len(value),
- force_bytes(value.tostring())])
-
- else:
- if valuetype is None:
- valuetype = getTypecode(value)
-
- if valuetype in b"AZ":
- value = force_bytes(value)
-
- if valuetype == b"Z":
- datafmt = "2sc%is" % (len(value)+1)
- else:
- datafmt = "2sc%s" % datatype2format[valuetype][0]
-
- args.extend([pytag[:2],
- valuetype,
- value])
-
- fmts.append(datafmt)
-
- return "".join(fmts), args
-
-
-cdef inline int32_t calculateQueryLength(bam1_t * src):
- """return query length computed from CIGAR alignment.
-
- Return 0 if there is no CIGAR alignment.
- """
-
- cdef uint32_t * cigar_p = pysam_bam_get_cigar(src)
-
- if cigar_p == NULL:
- return 0
-
- cdef uint32_t k, qpos
- cdef int op
- qpos = 0
-
- for k from 0 <= k < pysam_get_n_cigar(src):
- op = cigar_p[k] & BAM_CIGAR_MASK
-
- if op == BAM_CMATCH or op == BAM_CINS or \
- op == BAM_CSOFT_CLIP or \
- op == BAM_CEQUAL or op == BAM_CDIFF:
- qpos += cigar_p[k] >> BAM_CIGAR_SHIFT
-
- return qpos
-
-
-cdef inline int32_t getQueryStart(bam1_t *src) except -1:
- cdef uint32_t * cigar_p
- cdef uint32_t k, op
- cdef uint32_t start_offset = 0
-
- if pysam_get_n_cigar(src):
- cigar_p = pysam_bam_get_cigar(src);
- for k from 0 <= k < pysam_get_n_cigar(src):
- op = cigar_p[k] & BAM_CIGAR_MASK
- if op == BAM_CHARD_CLIP:
- if start_offset != 0 and start_offset != src.core.l_qseq:
- PyErr_SetString(ValueError, 'Invalid clipping in CIGAR string')
- return -1
- elif op == BAM_CSOFT_CLIP:
- start_offset += cigar_p[k] >> BAM_CIGAR_SHIFT
- else:
- break
-
- return start_offset
-
-
-cdef inline int32_t getQueryEnd(bam1_t *src) except -1:
- cdef uint32_t * cigar_p
- cdef uint32_t k, op
- cdef uint32_t end_offset = src.core.l_qseq
-
- # if there is no sequence, compute length from cigar string
- if end_offset == 0:
- end_offset = calculateQueryLength(src)
-
- # walk backwards in cigar string
- if pysam_get_n_cigar(src) > 1:
- cigar_p = pysam_bam_get_cigar(src);
- for k from pysam_get_n_cigar(src) > k >= 1:
- op = cigar_p[k] & BAM_CIGAR_MASK
- if op == BAM_CHARD_CLIP:
- if end_offset != 0 and end_offset != src.core.l_qseq:
- PyErr_SetString(ValueError,
- 'Invalid clipping in CIGAR string')
- return -1
- elif op == BAM_CSOFT_CLIP:
- end_offset -= cigar_p[k] >> BAM_CIGAR_SHIFT
- else:
- break
-
- return end_offset
-
-
-cdef inline bytes getSequenceInRange(bam1_t *src,
- uint32_t start,
- uint32_t end):
- """return python string of the sequence in a bam1_t object.
- """
-
- cdef uint8_t * p
- cdef uint32_t k
- cdef char * s
-
- if not src.core.l_qseq:
- return None
-
- seq = PyBytes_FromStringAndSize(NULL, end - start)
- s = <char*>seq
- p = pysam_bam_get_seq(src)
-
- for k from start <= k < end:
- # equivalent to seq_nt16_str[bam1_seqi(s, i)] (see bam.c)
- # note: do not use string literal as it will be a python string
- s[k-start] = seq_nt16_str[p[k/2] >> 4 * (1 - k%2) & 0xf]
-
- return charptr_to_bytes(seq)
-
-
-cdef inline object getQualitiesInRange(bam1_t *src,
- uint32_t start,
- uint32_t end):
- """return python array of quality values from a bam1_t object"""
-
- cdef uint8_t * p
- cdef uint32_t k
-
- p = pysam_bam_get_qual(src)
- if p[0] == 0xff:
- return None
-
- # 'B': unsigned char
- cdef c_array.array result = array.array('B', [0])
- c_array.resize(result, end - start)
-
- # copy data
- memcpy(result.data.as_voidptr, <void*>&p[start], end - start)
-
- return result
-
-
-#####################################################################
-## private factory methods
-cdef class AlignedSegment
-cdef makeAlignedSegment(bam1_t * src, AlignmentFile alignment_file):
- '''return an AlignedSegment object constructed from `src`'''
- # note that the following does not call __init__
- cdef AlignedSegment dest = AlignedSegment.__new__(AlignedSegment)
- dest._delegate = bam_dup1(src)
- dest._alignment_file = alignment_file
- return dest
-
-
-cdef class PileupColumn
-cdef makePileupColumn(bam_pileup1_t ** plp, int tid, int pos,
- int n_pu, AlignmentFile alignment_file):
- '''return a PileupColumn object constructed from pileup in `plp` and
- setting additional attributes.
-
- '''
- # note that the following does not call __init__
- cdef PileupColumn dest = PileupColumn.__new__(PileupColumn)
- dest._alignment_file = alignment_file
- dest.plp = plp
- dest.tid = tid
- dest.pos = pos
- dest.n_pu = n_pu
- return dest
-
-cdef class PileupRead
-cdef inline makePileupRead(bam_pileup1_t * src, AlignmentFile alignment_file):
-    '''return a PileupRead object constructed from a bam_pileup1_t * object.'''
- cdef PileupRead dest = PileupRead.__new__(PileupRead)
- dest._alignment = makeAlignedSegment(src.b, alignment_file)
- dest._qpos = src.qpos
- dest._indel = src.indel
- dest._level = src.level
- dest._is_del = src.is_del
- dest._is_head = src.is_head
- dest._is_tail = src.is_tail
- dest._is_refskip = src.is_refskip
- return dest
-
-
-cdef inline uint32_t get_alignment_length(bam1_t * src):
- cdef int k = 0
- cdef uint32_t l = 0
- if src == NULL:
- return 0
- cdef uint32_t * cigar_p = bam_get_cigar(src)
- if cigar_p == NULL:
- return 0
- cdef int op
- cdef int n = pysam_get_n_cigar(src)
- for k from 0 <= k < n:
- op = cigar_p[k] & BAM_CIGAR_MASK
- if op == BAM_CSOFT_CLIP or op == BAM_CHARD_CLIP:
- continue
- l += cigar_p[k] >> BAM_CIGAR_SHIFT
- return l
-
-
-# TODO: avoid string copying for getSequenceInRange, reconstituteSequenceFromMD, ...
-cdef inline bytes build_alignment_sequence(bam1_t * src):
- """return expanded sequence from MD tag.
-
- The sequence includes substitutions and both insertions in the
- reference as well as deletions to the reference sequence. Combine
- with the cigar string to reconstitute the query or the reference
- sequence.
-
- Positions corresponding to `N` (skipped region from the reference)
- in the CIGAR string will not appear in the returned sequence. The
- MD should correspondingly not contain these. Thus proper tags are::
-
- Deletion from the reference: cigar=5M1D5M MD=5^C5
- Skipped region from reference: cigar=5M1N5M MD=10
-
- Returns
- -------
-
- None, if no MD tag is present.
-
- """
- if src == NULL:
- return None
-
- cdef uint32_t start = getQueryStart(src)
- cdef uint32_t end = getQueryEnd(src)
- # get read sequence, taking into account soft-clipping
- r = getSequenceInRange(src, start, end)
- cdef char * read_sequence = r
- cdef uint32_t * cigar_p = pysam_bam_get_cigar(src)
- if cigar_p == NULL:
- return None
-
- cdef uint32_t r_idx = 0
- cdef int op
- cdef uint32_t k, i, l, x
- cdef int nmatches = 0
- cdef int s_idx = 0
-
- cdef uint32_t max_len = get_alignment_length(src)
- if max_len == 0:
- raise ValueError("could not determine alignment length")
-
- cdef char * s = <char*>calloc(max_len + 1, sizeof(char))
- if s == NULL:
-        raise ValueError(
-            "could not allocate sequence of length %i" % max_len)
-
- for k from 0 <= k < pysam_get_n_cigar(src):
- op = cigar_p[k] & BAM_CIGAR_MASK
- l = cigar_p[k] >> BAM_CIGAR_SHIFT
- if op == BAM_CMATCH or op == BAM_CEQUAL or op == BAM_CDIFF:
- for i from 0 <= i < l:
- s[s_idx] = read_sequence[r_idx]
- r_idx += 1
- s_idx += 1
- elif op == BAM_CDEL:
- for i from 0 <= i < l:
- s[s_idx] = '-'
- s_idx += 1
- elif op == BAM_CREF_SKIP:
- pass
- elif op == BAM_CINS:
- for i from 0 <= i < l:
- # encode insertions into reference as lowercase
- s[s_idx] = read_sequence[r_idx] + 32
- r_idx += 1
- s_idx += 1
- elif op == BAM_CSOFT_CLIP:
- pass
- elif op == BAM_CHARD_CLIP:
- pass # advances neither
- elif op == BAM_CPAD:
- raise NotImplementedError(
- "Padding (BAM_CPAD, 6) is currently not supported. "
- "Please implement. Sorry about that.")
-
- cdef uint8_t * md_tag_ptr = bam_aux_get(src, "MD")
- if md_tag_ptr == NULL:
- seq = PyBytes_FromStringAndSize(s, s_idx)
- free(s)
- return seq
-
- cdef char * md_tag = <char*>bam_aux2Z(md_tag_ptr)
- cdef int md_idx = 0
- s_idx = 0
-
- while md_tag[md_idx] != 0:
- # c is numerical
- if md_tag[md_idx] >= 48 and md_tag[md_idx] <= 57:
- nmatches *= 10
- nmatches += md_tag[md_idx] - 48
- md_idx += 1
- continue
- else:
- # save matches up to this point, skipping insertions
- for x from 0 <= x < nmatches:
- while s[s_idx] >= 'a':
- s_idx += 1
- s_idx += 1
- while s[s_idx] >= 'a':
- s_idx += 1
-
- r_idx += nmatches
- nmatches = 0
- if md_tag[md_idx] == '^':
- md_idx += 1
- while md_tag[md_idx] >= 65 and md_tag[md_idx] <= 90:
- assert s[s_idx] == '-'
- s[s_idx] = md_tag[md_idx]
- s_idx += 1
- md_idx += 1
- else:
- # save mismatch and change to lower case
- s[s_idx] = md_tag[md_idx] + 32
- s_idx += 1
- r_idx += 1
- md_idx += 1
-
- # save matches up to this point, skipping insertions
- for x from 0 <= x < nmatches:
- while s[s_idx] >= 'a':
- s_idx += 1
- s_idx += 1
- while s[s_idx] >= 'a':
- s_idx += 1
-
- seq = PyBytes_FromStringAndSize(s, s_idx)
- free(s)
-
- return seq
-
-
-cdef class AlignedSegment:
- '''Class representing an aligned segment.
-
- This class stores a handle to the samtools C-structure representing
- an aligned read. Member read access is forwarded to the C-structure
- and converted into python objects. This implementation should be fast,
- as only the data needed is converted.
-
- For write access, the C-structure is updated in-place. This is
- not the most efficient way to build BAM entries, as the variable
- length data is concatenated and thus needs to be resized if
- a field is updated. Furthermore, the BAM entry might be
- in an inconsistent state.
-
- One issue to look out for is that the sequence should always
- be set *before* the quality scores. Setting the sequence will
- also erase any quality scores that were set previously.
- '''
-
- # Now only called when instances are created from Python
- def __init__(self):
- # see bam_init1
- self._delegate = <bam1_t*>calloc(1, sizeof(bam1_t))
- # allocate some memory. If size is 0, calloc does not return a
- # pointer that can be passed to free() so allocate 40 bytes
- # for a new read
- self._delegate.m_data = 40
- self._delegate.data = <uint8_t *>calloc(
- self._delegate.m_data, 1)
- self._delegate.l_data = 0
-
- # caching for selected fields
- self.cache_query_qualities = None
- self.cache_query_alignment_qualities = None
- self.cache_query_sequence = None
- self.cache_query_alignment_sequence = None
-
- def __dealloc__(self):
- bam_destroy1(self._delegate)
-
- def __str__(self):
- """return string representation of alignment.
-
- The representation is an approximate :term:`SAM` format, because
- an aligned read might not be associated with a :term:`AlignmentFile`.
- As a result :term:`tid` is shown instead of the reference name.
- Similarly, the tags field is returned in its parsed state.
-
- To get a valid SAM record, use :meth:`tostring`.
- """
- # sam-parsing is done in sam.c/bam_format1_core which
- # requires a valid header.
- return "\t".join(map(str, (self.query_name,
- self.flag,
- self.reference_id,
- self.reference_start,
- self.mapping_quality,
- self.cigarstring,
- self.next_reference_id,
- self.next_reference_start,
- self.query_alignment_length,
- self.query_sequence,
- self.query_qualities,
- self.tags)))
-
- def __copy__(self):
- return makeAlignedSegment(self._delegate, self._alignment_file)
-
- def __deepcopy__(self, memo):
- return makeAlignedSegment(self._delegate, self._alignment_file)
-
- def compare(self, AlignedSegment other):
-        '''return -1, 0 or 1, if the binary contents of this read
-        are <, = or > those of *other*
-
- '''
-
- cdef int retval, x
- cdef bam1_t *t
- cdef bam1_t *o
-
- t = self._delegate
- o = other._delegate
-
- # uncomment for debugging purposes
- # cdef unsigned char * oo, * tt
- # tt = <unsigned char*>(&t.core)
- # oo = <unsigned char*>(&o.core)
- # for x from 0 <= x < sizeof( bam1_core_t): print x, tt[x], oo[x]
- # tt = <unsigned char*>(t.data)
- # oo = <unsigned char*>(o.data)
- # for x from 0 <= x < max(t.l_data, o.l_data): print x, tt[x], oo[x], chr(tt[x]), chr(oo[x])
-
- # Fast-path test for object identity
- if t == o:
- return 0
-
- retval = memcmp(&t.core, &o.core, sizeof(bam1_core_t))
-
- if retval:
- return retval
- # cmp(t.l_data, o.l_data)
- retval = (t.l_data > o.l_data) - (t.l_data < o.l_data)
- if retval:
- return retval
- return memcmp(t.data, o.data, t.l_data)
-
- def __richcmp__(self, AlignedSegment other, int op):
- if op == 2: # == operator
- return self.compare(other) == 0
- elif op == 3: # != operator
- return self.compare(other) != 0
- else:
- return NotImplemented
-
- def __hash__(self):
- cdef bam1_t * src
- src = self._delegate
- # shift and xor values in the core structure
- # make sure tid and mtid are shifted by different amounts
- # should variable length data be included?
- cdef uint32_t hash_value = src.core.tid << 24 ^ \
- src.core.pos << 16 ^ \
- src.core.qual << 8 ^ \
- src.core.flag ^ \
- src.core.isize << 24 ^ \
- src.core.mtid << 16 ^ \
- src.core.mpos << 8
-
- return hash_value
-
- cpdef tostring(self, AlignmentFile_t htsfile):
- """returns a string representation of the aligned segment.
-
- The output format is valid SAM format.
-
- Parameters
- ----------
-
-        htsfile -- AlignmentFile object to map numerical
-            identifiers to chromosome names.
- """
-
- cdef kstring_t line
- line.l = line.m = 0
- line.s = NULL
-
- if sam_format1(htsfile.header, self._delegate, &line) < 0:
- if line.m:
- free(line.s)
- raise ValueError('sam_format failed')
-
- ret = force_str(line.s[:line.l])
-
- if line.m:
- free(line.s)
-
- return ret
-
- ########################################################
- ## Basic attributes in order of appearance in SAM format
- property query_name:
- """the query template name (None if not present)"""
- def __get__(self):
- cdef bam1_t * src
- src = self._delegate
- if pysam_get_l_qname(src) == 0:
- return None
- return charptr_to_str(<char *>pysam_bam_get_qname(src))
-
- def __set__(self, qname):
- if qname is None or len(qname) == 0:
- return
- qname = force_bytes(qname)
- cdef bam1_t * src
- cdef int l
- cdef char * p
-
- src = self._delegate
- p = pysam_bam_get_qname(src)
-
- # the qname is \0 terminated
- l = len(qname) + 1
- pysam_bam_update(src,
- pysam_get_l_qname(src),
- l,
- <uint8_t*>p)
-
-
- pysam_set_l_qname(src, l)
-
- # re-acquire pointer to location in memory
- # as it might have moved
- p = pysam_bam_get_qname(src)
-
- strncpy(p, qname, l)
-
- property flag:
- """properties flag"""
- def __get__(self):
- return pysam_get_flag(self._delegate)
- def __set__(self, flag):
- pysam_set_flag(self._delegate, flag)
-
- property reference_name:
- """:term:`reference` name (None if no AlignmentFile is associated)"""
- def __get__(self):
- if self._alignment_file is not None:
- return self._alignment_file.getrname(self._delegate.core.tid)
- return None
-
- property reference_id:
- """:term:`reference` ID
-
- .. note::
-
- This field contains the index of the reference sequence in
- the sequence dictionary. To obtain the name of the
- reference sequence, use
- :meth:`pysam.AlignmentFile.getrname()`
-
- """
- def __get__(self): return self._delegate.core.tid
- def __set__(self, tid): self._delegate.core.tid = tid
-
- property reference_start:
- """0-based leftmost coordinate"""
- def __get__(self): return self._delegate.core.pos
- def __set__(self, pos):
- ## setting the position requires updating the "bin" attribute
- cdef bam1_t * src
- src = self._delegate
- src.core.pos = pos
- if pysam_get_n_cigar(src):
- pysam_set_bin(src,
- hts_reg2bin(
- src.core.pos,
- bam_endpos(src),
- 14,
- 5))
- else:
- pysam_set_bin(src,
- hts_reg2bin(
- src.core.pos,
- src.core.pos + 1,
- 14,
- 5))
-
- property mapping_quality:
- """mapping quality"""
- def __get__(self):
- return pysam_get_qual(self._delegate)
- def __set__(self, qual):
- pysam_set_qual(self._delegate, qual)
-
- property cigarstring:
- '''the :term:`cigar` alignment as a string.
-
- The cigar string is a string of alternating integers
- and characters denoting the length and the type of
- an operation.
-
- .. note::
- The order length,operation is specified in the
- SAM format. It is different from the order of
- the :attr:`cigar` property.
-
- Returns None if not present.
-
- To unset the cigarstring, assign None or the
- empty string.
- '''
- def __get__(self):
- c = self.cigartuples
- if c is None:
- return None
- # reverse order
- else:
- return "".join([ "%i%c" % (y,CODE2CIGAR[x]) for x,y in c])
-
- def __set__(self, cigar):
- if cigar is None or len(cigar) == 0:
- self.cigartuples = []
- else:
- parts = CIGAR_REGEX.findall(cigar)
- # reverse order
- self.cigartuples = [(CIGAR2CODE[ord(y)], int(x)) for x,y in parts]
-
- # TODO
- # property cigar:
- # """the cigar alignment"""
-
- property next_reference_id:
- """the :term:`reference` id of the mate/next read."""
- def __get__(self): return self._delegate.core.mtid
- def __set__(self, mtid):
- self._delegate.core.mtid = mtid
-
- property next_reference_name:
- """:term:`reference` name of the mate/next read (None if no
- AlignmentFile is associated)"""
- def __get__(self):
- if self._alignment_file is not None:
- return self._alignment_file.getrname(self._delegate.core.mtid)
- return None
-
- property next_reference_start:
- """the position of the mate/next read."""
- def __get__(self):
- return self._delegate.core.mpos
- def __set__(self, mpos):
- self._delegate.core.mpos = mpos
-
- property query_length:
- """the length of the query/read.
-
- This value corresponds to the length of the sequence supplied
- in the BAM/SAM file. The length of a query is 0 if there is no
- sequence in the BAM/SAM file. In those cases, the read length
- can be inferred from the CIGAR alignment, see
-        :meth:`pysam.AlignedSegment.infer_query_length`.
-
- The length includes soft-clipped bases and is equal to
- ``len(query_sequence)``.
-
-        This property is read-only; assigning a new sequence via
-        :attr:`query_sequence` updates it.
-
- Returns 0 if not available.
-
- """
- def __get__(self):
- return self._delegate.core.l_qseq
-
- property template_length:
- """the observed query template length"""
- def __get__(self):
- return self._delegate.core.isize
- def __set__(self, isize):
- self._delegate.core.isize = isize
-
- property query_sequence:
- """read sequence bases, including :term:`soft clipped` bases
- (None if not present).
-
- Note that assigning to seq will invalidate any quality scores.
- Thus, to in-place edit the sequence and quality scores, copies of
- the quality scores need to be taken. Consider trimming for example::
-
- q = read.query_qualities
-            read.query_sequence = read.query_sequence[5:10]
- read.query_qualities = q[5:10]
-
- The sequence is returned as it is stored in the BAM file. Some mappers
- might have stored a reverse complement of the original read
- sequence.
- """
- def __get__(self):
- if self.cache_query_sequence:
- return self.cache_query_sequence
-
- cdef bam1_t * src
- cdef char * s
- src = self._delegate
-
- if src.core.l_qseq == 0:
- return None
-
- self.cache_query_sequence = force_str(getSequenceInRange(
- src, 0, src.core.l_qseq))
- return self.cache_query_sequence
-
- def __set__(self, seq):
- # samtools manages sequence and quality length memory together
- # if no quality information is present, the first byte says 0xff.
- cdef bam1_t * src
- cdef uint8_t * p
- cdef char * s
- cdef int l, k
- cdef Py_ssize_t nbytes_new, nbytes_old
-
- if seq == None:
- l = 0
- else:
- l = len(seq)
- seq = force_bytes(seq)
-
- src = self._delegate
-
- # as the sequence is stored in half-bytes, the total length (sequence
- # plus quality scores) is (l+1)/2 + l
- nbytes_new = (l + 1) / 2 + l
- nbytes_old = (src.core.l_qseq + 1) / 2 + src.core.l_qseq
-
- # acquire pointer to location in memory
- p = pysam_bam_get_seq(src)
- src.core.l_qseq = l
-
- # change length of data field
- pysam_bam_update(src,
- nbytes_old,
- nbytes_new,
- p)
-
- if l > 0:
- # re-acquire pointer to location in memory
- # as it might have moved
- p = pysam_bam_get_seq(src)
- for k from 0 <= k < nbytes_new:
- p[k] = 0
- # convert to C string
- s = seq
- for k from 0 <= k < l:
- p[k/2] |= seq_nt16_table[<unsigned char>s[k]] << 4 * (1 - k % 2)
-
- # erase qualities
- p = pysam_bam_get_qual(src)
- p[0] = 0xff
-
- self.cache_query_sequence = force_str(seq)
-
- # clear cached values for quality values
- self.cache_query_qualities = None
- self.cache_query_alignment_qualities = None
-
- property query_qualities:
- """read sequence base qualities, including :term:`soft
- clipped` bases (None if not present).
-
- Quality scores are returned as a python array of unsigned
- chars. Note that this is not the ASCII-encoded value typically
- seen in FASTQ or SAM formatted files. Thus, no offset of 33
- needs to be subtracted.
-
- Note that to set quality scores the sequence has to be set
- beforehand as this will determine the expected length of the
- quality score array.
-
- This method raises a ValueError if the length of the
- quality scores and the sequence are not the same.
-
- """
- def __get__(self):
-
- if self.cache_query_qualities:
- return self.cache_query_qualities
-
- cdef bam1_t * src
- cdef char * q
-
- src = self._delegate
-
- if src.core.l_qseq == 0:
- return None
-
- self.cache_query_qualities = getQualitiesInRange(src, 0, src.core.l_qseq)
- return self.cache_query_qualities
-
- def __set__(self, qual):
-
- # note that memory is already allocated via setting the sequence
-        # hence only the length match between sequence and quality is checked.
- cdef bam1_t * src
- cdef uint8_t * p
- cdef int l
-
- src = self._delegate
- p = pysam_bam_get_qual(src)
- if qual is None or len(qual) == 0:
- # if absent and there is a sequence: set to 0xff
- if src.core.l_qseq != 0:
- p[0] = 0xff
- return
-
- # check for length match
- l = len(qual)
- if src.core.l_qseq != l:
- raise ValueError(
- "quality and sequence mismatch: %i != %i" %
- (l, src.core.l_qseq))
-
- # create a python array object filling it
- # with the quality scores
-
- # NB: should avoid this copying if qual is
- # already of the correct type.
- cdef c_array.array result = c_array.array('B', qual)
-
- # copy data
- memcpy(p, result.data.as_voidptr, l)
-
- # save in cache
- self.cache_query_qualities = qual
-
- property bin:
- """properties bin"""
- def __get__(self):
- return pysam_get_bin(self._delegate)
- def __set__(self, bin):
- pysam_set_bin(self._delegate, bin)
-
-
- ##########################################################
- # Derived simple attributes. These are simple attributes of
- # AlignedSegment getting and setting values.
- ##########################################################
- # 1. Flags
- ##########################################################
- property is_paired:
- """true if read is paired in sequencing"""
- def __get__(self):
- return (self.flag & BAM_FPAIRED) != 0
- def __set__(self,val):
- pysam_update_flag(self._delegate, val, BAM_FPAIRED)
-
- property is_proper_pair:
- """true if read is mapped in a proper pair"""
- def __get__(self):
- return (self.flag & BAM_FPROPER_PAIR) != 0
- def __set__(self,val):
- pysam_update_flag(self._delegate, val, BAM_FPROPER_PAIR)
- property is_unmapped:
- """true if read itself is unmapped"""
- def __get__(self):
- return (self.flag & BAM_FUNMAP) != 0
- def __set__(self, val):
- pysam_update_flag(self._delegate, val, BAM_FUNMAP)
- property mate_is_unmapped:
- """true if the mate is unmapped"""
- def __get__(self):
- return (self.flag & BAM_FMUNMAP) != 0
- def __set__(self,val):
- pysam_update_flag(self._delegate, val, BAM_FMUNMAP)
- property is_reverse:
- """true if read is mapped to reverse strand"""
- def __get__(self):
- return (self.flag & BAM_FREVERSE) != 0
- def __set__(self,val):
- pysam_update_flag(self._delegate, val, BAM_FREVERSE)
- property mate_is_reverse:
- """true is read is mapped to reverse strand"""
- def __get__(self):
- return (self.flag & BAM_FMREVERSE) != 0
- def __set__(self,val):
- pysam_update_flag(self._delegate, val, BAM_FMREVERSE)
- property is_read1:
- """true if this is read1"""
- def __get__(self):
- return (self.flag & BAM_FREAD1) != 0
- def __set__(self,val):
- pysam_update_flag(self._delegate, val, BAM_FREAD1)
- property is_read2:
- """true if this is read2"""
- def __get__(self):
- return (self.flag & BAM_FREAD2) != 0
- def __set__(self, val):
- pysam_update_flag(self._delegate, val, BAM_FREAD2)
- property is_secondary:
- """true if not primary alignment"""
- def __get__(self):
- return (self.flag & BAM_FSECONDARY) != 0
- def __set__(self, val):
- pysam_update_flag(self._delegate, val, BAM_FSECONDARY)
- property is_qcfail:
- """true if QC failure"""
- def __get__(self):
- return (self.flag & BAM_FQCFAIL) != 0
- def __set__(self, val):
- pysam_update_flag(self._delegate, val, BAM_FQCFAIL)
- property is_duplicate:
- """true if optical or PCR duplicate"""
- def __get__(self):
- return (self.flag & BAM_FDUP) != 0
- def __set__(self, val):
- pysam_update_flag(self._delegate, val, BAM_FDUP)
- property is_supplementary:
- """true if this is a supplementary alignment"""
- def __get__(self):
- return (self.flag & BAM_FSUPPLEMENTARY) != 0
- def __set__(self, val):
- pysam_update_flag(self._delegate, val, BAM_FSUPPLEMENTARY)
-
- # 2. Coordinates and lengths
- property reference_end:
- '''aligned reference position of the read on the reference genome.
-
- reference_end points to one past the last aligned residue.
- Returns None if not available (read is unmapped or no cigar
- alignment present).
-
- '''
- def __get__(self):
- cdef bam1_t * src
- src = self._delegate
- if (self.flag & BAM_FUNMAP) or pysam_get_n_cigar(src) == 0:
- return None
- return bam_endpos(src)
-
- property reference_length:
- '''aligned length of the read on the reference genome.
-
- This is equal to `aend - pos`. Returns None if not available.'''
- def __get__(self):
- cdef bam1_t * src
- src = self._delegate
- if (self.flag & BAM_FUNMAP) or pysam_get_n_cigar(src) == 0:
- return None
- return bam_endpos(src) - \
- self._delegate.core.pos
-
- property query_alignment_sequence:
- """aligned portion of the read.
-
- This is a substring of :attr:`seq` that excludes flanking
- bases that were :term:`soft clipped` (None if not present). It
- is equal to ``seq[qstart:qend]``.
-
- SAM/BAM files may include extra flanking bases that are not
- part of the alignment. These bases may be the result of the
- Smith-Waterman or other algorithms, which may not require
- alignments that begin at the first residue or end at the last.
- In addition, extra sequencing adapters, multiplex identifiers,
- and low-quality bases that were not considered for alignment
- may have been retained.
-
- """
-
- def __get__(self):
- if self.cache_query_alignment_sequence:
- return self.cache_query_alignment_sequence
-
- cdef bam1_t * src
- cdef uint32_t start, end
-
- src = self._delegate
-
- if src.core.l_qseq == 0:
- return None
-
- start = getQueryStart(src)
- end = getQueryEnd(src)
-
- self.cache_query_alignment_sequence = force_str(
- getSequenceInRange(src, start, end))
- return self.cache_query_alignment_sequence
-
- property query_alignment_qualities:
- """aligned query sequence quality values (None if not present). These
- are the quality values that correspond to :attr:`query`, that
- is, they exclude qualities of :term:`soft clipped` bases. This
- is equal to ``qual[qstart:qend]``.
-
- Quality scores are returned as a python array of unsigned
- chars. Note that this is not the ASCII-encoded value typically
- seen in FASTQ or SAM formatted files. Thus, no offset of 33
- needs to be subtracted.
-
- This property is read-only.
-
- """
- def __get__(self):
-
- if self.cache_query_alignment_qualities:
- return self.cache_query_alignment_qualities
-
- cdef bam1_t * src
- cdef uint32_t start, end
-
- src = self._delegate
-
- if src.core.l_qseq == 0:
- return None
-
- start = getQueryStart(src)
- end = getQueryEnd(src)
- self.cache_query_alignment_qualities = \
- getQualitiesInRange(src, start, end)
- return self.cache_query_alignment_qualities
-
- property query_alignment_start:
- """start index of the aligned query portion of the sequence (0-based,
- inclusive).
-
-        This is the index of the first base in :attr:`seq` that is not
- soft-clipped.
-
- """
- def __get__(self):
- return getQueryStart(self._delegate)
-
- property query_alignment_end:
- """end index of the aligned query portion of the sequence (0-based,
- exclusive)"""
- def __get__(self):
- return getQueryEnd(self._delegate)
-
- property query_alignment_length:
- """length of the aligned query sequence.
-
- This is equal to :attr:`qend` - :attr:`qstart`"""
- def __get__(self):
- cdef bam1_t * src
- src = self._delegate
- return getQueryEnd(src) - getQueryStart(src)
-
- #####################################################
- # Computed properties
-
- def get_reference_positions(self, full_length=False):
- """a list of reference positions that this read aligns to.
-
- By default, this method only returns positions in the
- reference that are within the alignment. If *full_length* is
- set, None values will be included for any soft-clipped or
- unaligned positions within the read. The returned list will
- thus be of the same length as the read.
-
- """
- cdef uint32_t k, i, pos
- cdef int op
- cdef uint32_t * cigar_p
- cdef bam1_t * src
- cdef bint _full = full_length
-
- src = self._delegate
- if pysam_get_n_cigar(src) == 0:
- return []
-
- result = []
- pos = src.core.pos
- cigar_p = pysam_bam_get_cigar(src)
-
- for k from 0 <= k < pysam_get_n_cigar(src):
- op = cigar_p[k] & BAM_CIGAR_MASK
- l = cigar_p[k] >> BAM_CIGAR_SHIFT
-
- if op == BAM_CSOFT_CLIP or op == BAM_CINS:
- if _full:
- for i from 0 <= i < l:
- result.append(None)
- elif op == BAM_CMATCH:
- for i from pos <= i < pos + l:
- result.append(i)
- pos += l
- elif op == BAM_CDEL or op == BAM_CREF_SKIP:
- pos += l
-
- return result
-
- def infer_query_length(self, always=True):
- """inferred read length from CIGAR string.
-
-        If *always* is set to True, the read length will always be
-        inferred from the CIGAR string. If set to False, the length
- of the read sequence will be returned if it is
- available.
-
- Returns None if CIGAR string is not present.
- """
-
- cdef uint32_t * cigar_p
- cdef bam1_t * src
-
- src = self._delegate
-
- if not always and src.core.l_qseq:
- return src.core.l_qseq
-
- return calculateQueryLength(src)
-
- def get_reference_sequence(self):
- """return the reference sequence.
-
- This method requires the MD tag to be set.
- """
- cdef uint32_t k, i
- cdef int op
- cdef bam1_t * src = self._delegate
- ref_seq = force_str(build_alignment_sequence(src))
- if ref_seq is None:
- raise ValueError("MD tag not present")
-
- cdef uint32_t * cigar_p = pysam_bam_get_cigar(src)
- cdef uint32_t r_idx = 0
- result = []
- for k from 0 <= k < pysam_get_n_cigar(src):
- op = cigar_p[k] & BAM_CIGAR_MASK
- l = cigar_p[k] >> BAM_CIGAR_SHIFT
- if op == BAM_CMATCH or op == BAM_CEQUAL or op == BAM_CDIFF:
- for i from 0 <= i < l:
- result.append(ref_seq[r_idx])
- r_idx += 1
- elif op == BAM_CDEL:
- for i from 0 <= i < l:
- result.append(ref_seq[r_idx])
- r_idx += 1
- elif op == BAM_CREF_SKIP:
- pass
- elif op == BAM_CINS:
- r_idx += l
- elif op == BAM_CSOFT_CLIP:
- pass
- elif op == BAM_CHARD_CLIP:
- pass # advances neither
- elif op == BAM_CPAD:
- raise NotImplementedError(
- "Padding (BAM_CPAD, 6) is currently not supported. "
- "Please implement. Sorry about that.")
-
- return "".join(result)
-
- def get_aligned_pairs(self, matches_only=False, with_seq=False):
- """a list of aligned read (query) and reference positions.
-
-        For insertions, deletions and skipped regions, either the query
-        or the reference position may be None.
-
- Padding is currently not supported and leads to an exception.
-
- Parameters
- ----------
-
- matches_only : bool
- If True, only matched bases are returned - no None on either
- side.
- with_seq : bool
- If True, return a third element in the tuple containing the
- reference sequence. Substitutions are lower-case. This option
- requires an MD tag to be present.
-
- Returns
- -------
-
- aligned_pairs : list of tuples
-
- """
- cdef uint32_t k, i, pos, qpos, r_idx, l
- cdef int op
- cdef uint32_t * cigar_p
- cdef bam1_t * src = self._delegate
- cdef bint _matches_only = bool(matches_only)
- cdef bint _with_seq = bool(with_seq)
-
- # TODO: this method performs no checking and assumes that
- # read sequence, cigar and MD tag are consistent.
-
- if _with_seq:
- ref_seq = force_str(self.get_reference_sequence())
- if ref_seq is None:
- raise ValueError("MD tag not present")
-
- r_idx = 0
-
- if pysam_get_n_cigar(src) == 0:
- return []
-
- result = []
- pos = src.core.pos
- qpos = 0
- cigar_p = pysam_bam_get_cigar(src)
- for k from 0 <= k < pysam_get_n_cigar(src):
- op = cigar_p[k] & BAM_CIGAR_MASK
- l = cigar_p[k] >> BAM_CIGAR_SHIFT
-
- if op == BAM_CMATCH or op == BAM_CEQUAL or op == BAM_CDIFF:
- if _with_seq:
- for i from pos <= i < pos + l:
- result.append((qpos, i, ref_seq[r_idx]))
- r_idx += 1
- qpos += 1
- else:
- for i from pos <= i < pos + l:
- result.append((qpos, i))
- qpos += 1
- pos += l
-
- elif op == BAM_CINS or op == BAM_CSOFT_CLIP:
- if not _matches_only:
- if _with_seq:
- for i from pos <= i < pos + l:
- result.append((qpos, None, None))
- qpos += 1
- else:
- for i from pos <= i < pos + l:
- result.append((qpos, None))
- qpos += 1
- else:
- qpos += l
-
- elif op == BAM_CDEL:
- if not _matches_only:
- if _with_seq:
- for i from pos <= i < pos + l:
- result.append((None, i, ref_seq[r_idx]))
- r_idx += 1
- else:
- for i from pos <= i < pos + l:
- result.append((None, i))
- pos += l
-
- elif op == BAM_CHARD_CLIP:
- pass # advances neither
-
- elif op == BAM_CREF_SKIP:
- if not _matches_only:
- if _with_seq:
- for i from pos <= i < pos + l:
- result.append((None, i, None))
- else:
- for i from pos <= i < pos + l:
- result.append((None, i))
-
- pos += l
-
- elif op == BAM_CPAD:
- raise NotImplementedError(
- "Padding (BAM_CPAD, 6) is currently not supported. "
- "Please implement. Sorry about that.")
-
- return result
-
- def get_blocks(self):
- """ a list of start and end positions of
- aligned gapless blocks.
-
- The start and end positions are in genomic
- coordinates.
-
- Blocks are not normalized, i.e. two blocks
- might be directly adjacent. This happens if
- the two blocks are separated by an insertion
- in the read.
- """
-
- cdef uint32_t k, pos, l
- cdef int op
- cdef uint32_t * cigar_p
- cdef bam1_t * src
-
- src = self._delegate
- if pysam_get_n_cigar(src) == 0:
- return []
-
- result = []
- pos = src.core.pos
- cigar_p = pysam_bam_get_cigar(src)
- l = 0
-
- for k from 0 <= k < pysam_get_n_cigar(src):
- op = cigar_p[k] & BAM_CIGAR_MASK
- l = cigar_p[k] >> BAM_CIGAR_SHIFT
- if op == BAM_CMATCH:
- result.append((pos, pos + l))
- pos += l
- elif op == BAM_CDEL or op == BAM_CREF_SKIP:
- pos += l
-
- return result
-
- def get_overlap(self, uint32_t start, uint32_t end):
- """return number of aligned bases of read overlapping the interval
- *start* and *end* on the reference sequence.
-
- Return None if cigar alignment is not available.
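-
- A sketch of typical use (the interval is illustrative)::
-
- n = read.get_overlap(1000, 2000)
- if n: # None without a cigar, 0 if no overlap
-     print("%i aligned bases in [1000, 2000)" % n)
-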
- """
- cdef uint32_t k, i, pos, overlap
- cdef int op, o
- cdef uint32_t * cigar_p
- cdef bam1_t * src
-
- overlap = 0
-
- src = self._delegate
- if pysam_get_n_cigar(src) == 0:
- return None
- pos = src.core.pos
- o = 0
-
- cigar_p = pysam_bam_get_cigar(src)
- for k from 0 <= k < pysam_get_n_cigar(src):
- op = cigar_p[k] & BAM_CIGAR_MASK
- l = cigar_p[k] >> BAM_CIGAR_SHIFT
-
- if op == BAM_CMATCH:
- o = min( pos + l, end) - max( pos, start )
- if o > 0: overlap += o
-
- if op == BAM_CMATCH or op == BAM_CDEL or op == BAM_CREF_SKIP:
- pos += l
-
- return overlap
-
- def get_cigar_stats(self):
- """summary of operations in cigar string.
-
- The output order in the array is "MIDNSHP=X" followed by a
- field for the NM tag. If the NM tag is not present, this
- field will always be 0.
-
- +-----+--------------+-----+
- |M |BAM_CMATCH |0 |
- +-----+--------------+-----+
- |I |BAM_CINS |1 |
- +-----+--------------+-----+
- |D |BAM_CDEL |2 |
- +-----+--------------+-----+
- |N |BAM_CREF_SKIP |3 |
- +-----+--------------+-----+
- |S |BAM_CSOFT_CLIP|4 |
- +-----+--------------+-----+
- |H |BAM_CHARD_CLIP|5 |
- +-----+--------------+-----+
- |P |BAM_CPAD |6 |
- +-----+--------------+-----+
- |= |BAM_CEQUAL |7 |
- +-----+--------------+-----+
- |X |BAM_CDIFF |8 |
- +-----+--------------+-----+
- |NM |NM tag |9 |
- +-----+--------------+-----+
-
- If no cigar string is present, empty arrays will be returned.
-
- Returns
- -------
-
- arrays : two arrays. The first contains the nucleotide counts within
- each cigar operation, the second contains the number of blocks for
- each cigar operation.
-
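- For example, a sketch that pulls out a few common counts::
-
- base_counts, block_counts = read.get_cigar_stats()
- matched = base_counts[0] # M = BAM_CMATCH = 0
- deleted = base_counts[2] # D = BAM_CDEL = 2
- edit_distance = base_counts[9] # NM tag, 0 if absent
-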
- """
-
- cdef int nfields = NCIGAR_CODES + 1
-
- cdef c_array.array base_counts = array.array(
- "I",
- [0] * nfields)
- cdef uint32_t [:] base_view = base_counts
- cdef c_array.array block_counts = array.array(
- "I",
- [0] * nfields)
- cdef uint32_t [:] block_view = block_counts
-
- cdef bam1_t * src = self._delegate
- cdef int op
- cdef uint32_t l
- cdef int32_t k
- cdef uint32_t * cigar_p = pysam_bam_get_cigar(src)
-
- if cigar_p == NULL:
- return None
-
- for k from 0 <= k < pysam_get_n_cigar(src):
- op = cigar_p[k] & BAM_CIGAR_MASK
- l = cigar_p[k] >> BAM_CIGAR_SHIFT
- base_view[op] += l
- block_view[op] += 1
-
- cdef uint8_t * v = bam_aux_get(src, 'NM')
- if v != NULL:
- base_view[nfields - 1] = <int32_t>bam_aux2i(v)
-
- return base_counts, block_counts
-
- #####################################################
- ## Unsorted as yet
- # TODO: capture in CIGAR object
- property cigartuples:
- """the :term:`cigar` alignment. The alignment
- is returned as a list of tuples of (operation, length).
-
- If the alignment is not present, None is returned.
-
- The operations are:
-
- +-----+--------------+-----+
- |M |BAM_CMATCH |0 |
- +-----+--------------+-----+
- |I |BAM_CINS |1 |
- +-----+--------------+-----+
- |D |BAM_CDEL |2 |
- +-----+--------------+-----+
- |N |BAM_CREF_SKIP |3 |
- +-----+--------------+-----+
- |S |BAM_CSOFT_CLIP|4 |
- +-----+--------------+-----+
- |H |BAM_CHARD_CLIP|5 |
- +-----+--------------+-----+
- |P |BAM_CPAD |6 |
- +-----+--------------+-----+
- |= |BAM_CEQUAL |7 |
- +-----+--------------+-----+
- |X |BAM_CDIFF |8 |
- +-----+--------------+-----+
-
- .. note::
- The output is a list of (operation, length) tuples, such as
- ``[(0, 30)]``.
- This is different from the SAM specification and
- the :attr:`cigarstring` property, which uses a
- (length, operation) order, for example: ``30M``.
-
- To unset the cigar property, assign an empty list
- or None.
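-
- For example, a sketch for a 30M2I68M alignment::
-
- read.cigartuples # [(0, 30), (1, 2), (0, 68)]
- read.cigartuples = [(0, 100)] # replace by 100M
-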
- """
- def __get__(self):
- cdef uint32_t * cigar_p
- cdef bam1_t * src
- cdef uint32_t op, l
- cdef int k
-
- src = self._delegate
- if pysam_get_n_cigar(src) == 0:
- return None
-
- cigar = []
-
- cigar_p = pysam_bam_get_cigar(src);
- for k from 0 <= k < pysam_get_n_cigar(src):
- op = cigar_p[k] & BAM_CIGAR_MASK
- l = cigar_p[k] >> BAM_CIGAR_SHIFT
- cigar.append((op, l))
- return cigar
-
- def __set__(self, values):
- cdef uint32_t * p
- cdef bam1_t * src
- cdef op, l
- cdef int k, ncigar
-
- k = 0
-
- src = self._delegate
-
- # get location of cigar string
- p = pysam_bam_get_cigar(src)
-
- # empty values for cigar string
- if values is None:
- values = []
-
- ncigar = len(values)
- # create space for cigar data within src.data
- pysam_bam_update(src,
- pysam_get_n_cigar(src) * 4,
- ncigar * 4,
- <uint8_t*>p)
-
- # length is number of cigar operations, not bytes
- pysam_set_n_cigar(src, ncigar)
-
- # re-acquire pointer to location in memory
- # as it might have moved
- p = pysam_bam_get_cigar(src)
-
- # insert cigar operations
- for op, l in values:
- p[k] = l << BAM_CIGAR_SHIFT | op
- k += 1
-
- ## setting the cigar string requires updating the bin
- pysam_set_bin(src,
- hts_reg2bin(
- src.core.pos,
- bam_endpos(src),
- 14,
- 5))
-
-
- cpdef set_tag(self,
- tag,
- value,
- value_type=None,
- replace=True):
- """sets a particular field *tag* to *value* in the optional alignment
- section.
-
- *value_type* describes the type of *value* that is to be entered
- into the alignment record. It can be set explicitly to one
- of the valid one-letter type codes. If unset, an appropriate
- type will be chosen automatically.
-
- An existing value of the same *tag* will be overwritten unless
- replace is set to False. This is usually not recommended as a
- tag may only appear once in the optional alignment section.
-
- If *value* is None, the tag will be deleted.
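-
- For example, a sketch (the tag name "XS" is illustrative)::
-
- read.set_tag("XS", 42, value_type="i") # add an integer tag
- read.set_tag("XS", None) # delete it again
-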
- """
-
- cdef int value_size
- cdef uint8_t * value_ptr
- cdef uint8_t *existing_ptr
- cdef uint8_t typecode
- cdef float float_value
- cdef double double_value
- cdef int32_t int_value
- cdef bam1_t * src = self._delegate
- cdef char * _value_type
- cdef c_array.array array_value
- cdef object buffer
-
- if len(tag) != 2:
- raise ValueError('Invalid tag: %s' % tag)
-
- tag = force_bytes(tag)
- if replace:
- existing_ptr = bam_aux_get(src, tag)
- if existing_ptr:
- bam_aux_del(src, existing_ptr)
-
- # setting value to None deletes a tag
- if value is None:
- return
-
- typecode = get_value_code(value, value_type)
- if typecode == 0:
- raise ValueError("can't guess type or invalid type code specified")
-
- # Not Endian-safe, but then again neither is samtools!
- if typecode == 'Z':
- value = force_bytes(value)
- value_ptr = <uint8_t*><char*>value
- value_size = len(value)+1
- elif typecode == 'i':
- int_value = value
- value_ptr = <uint8_t*>&int_value
- value_size = sizeof(int32_t)
- elif typecode == 'd':
- double_value = value
- value_ptr = <uint8_t*>&double_value
- value_size = sizeof(double)
- elif typecode == 'f':
- float_value = value
- value_ptr = <uint8_t*>&float_value
- value_size = sizeof(float)
- elif typecode == 'B':
- # the following goes through python, needs to be cleaned up
- # pack array using struct
- if value_type is None:
- fmt, args = packTags([(tag, value)])
- else:
- fmt, args = packTags([(tag, value, value_type)])
-
- # remove tag and type code as set by bam_aux_append
- # first four chars of format (<2sc)
- fmt = '<' + fmt[4:]
- # first two values to pack
- args = args[2:]
- value_size = struct.calcsize(fmt)
- # buffer will be freed when object goes out of scope
- buffer = ctypes.create_string_buffer(value_size)
- struct.pack_into(fmt, buffer, 0, *args)
- # bam_aux_append copies data from value_ptr
- bam_aux_append(src,
- tag,
- typecode,
- value_size,
- <uint8_t*>buffer.raw)
- return
- else:
- raise ValueError('unsupported value_type in set_tag')
-
- bam_aux_append(src,
- tag,
- typecode,
- value_size,
- value_ptr)
-
- cpdef has_tag(self, tag):
- """returns true if the optional alignment section
- contains a given *tag*."""
- cdef uint8_t * v
- cdef int nvalues
- btag = force_bytes(tag)
- v = bam_aux_get(self._delegate, btag)
- return v != NULL
-
- cpdef get_tag(self, tag, with_value_type=False):
- """
- retrieves data from the optional alignment section
- given a two-letter *tag* denoting the field.
-
- The returned value is cast into an appropriate python type.
-
- This method is the fastest way to access the optional
- alignment section if only few tags need to be retrieved.
-
- Parameters
- ----------
-
- tag :
- data tag.
-
- with_value_type : Optional[bool]
- if set to True, the return value is a tuple of (tag value, type code).
- (default False)
-
- Returns
- -------
-
- A python object with the value of the `tag`. The type of the
- object depends on the data type in the data record.
-
- Raises
- ------
-
- KeyError
- If `tag` is not present, a KeyError is raised.
-
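- A sketch of typical use, guarding against a missing tag::
-
- try:
-     nm = read.get_tag("NM")
- except KeyError:
-     nm = None
-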
- """
- cdef uint8_t * v
- cdef int nvalues
- btag = force_bytes(tag)
- v = bam_aux_get(self._delegate, btag)
- if v == NULL:
- raise KeyError("tag '%s' not present" % tag)
- if chr(v[0]) == "B":
- auxtype = chr(v[0]) + chr(v[1])
- else:
- auxtype = chr(v[0])
-
- if auxtype == 'c' or auxtype == 'C' or auxtype == 's' or auxtype == 'S':
- value = <int>bam_aux2i(v)
- elif auxtype == 'i' or auxtype == 'I':
- value = <int32_t>bam_aux2i(v)
- elif auxtype == 'f' or auxtype == 'F':
- value = <float>bam_aux2f(v)
- elif auxtype == 'd' or auxtype == 'D':
- value = <double>bam_aux2f(v)
- elif auxtype == 'A':
- # there might be a more efficient way
- # to convert a char into a string
- value = '%c' % <char>bam_aux2A(v)
- elif auxtype == 'Z':
- value = charptr_to_str(<char*>bam_aux2Z(v))
- elif auxtype[0] == 'B':
- bytesize, nvalues, values = convert_binary_tag(v + 1)
- value = values
- else:
- raise ValueError("unknown auxiliary type '%s'" % auxtype)
-
- if with_value_type:
- return (value, auxtype)
- else:
- return value
-
- def get_tags(self, with_value_type=False):
- """the fields in the optional alignment section.
-
- Returns a list of all fields in the optional
- alignment section. Values are converted to appropriate python
- values. For example:
-
- [(NM, 2), (RG, "GJP00TM04")]
-
- If *with_value_type* is set, the value type as encoded in
- the AlignedSegment record will be returned as well:
-
- [(NM, 2, "i"), (RG, "GJP00TM04", "Z")]
-
- This method will convert all values in the optional alignment
- section. When getting only one or few tags, please see
- :meth:`get_tag` for a quicker way to achieve this.
-
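- For example, a minimal sketch::
-
- for tag, value in read.get_tags():
-     print(tag, value)
-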
- """
-
- cdef char * ctag
- cdef bam1_t * src
- cdef uint8_t * s
- cdef char auxtag[3]
- cdef char auxtype
- cdef uint8_t byte_size
- cdef int32_t nvalues
-
- src = self._delegate
- if src.l_data == 0:
- return []
- s = pysam_bam_get_aux(src)
- result = []
- auxtag[2] = 0
- while s < (src.data + src.l_data):
- # get tag
- auxtag[0] = s[0]
- auxtag[1] = s[1]
- s += 2
- auxtype = s[0]
- if auxtype in ('c', 'C'):
- value = <int>bam_aux2i(s)
- s += 1
- elif auxtype in ('s', 'S'):
- value = <int>bam_aux2i(s)
- s += 2
- elif auxtype in ('i', 'I'):
- value = <int32_t>bam_aux2i(s)
- s += 4
- elif auxtype == 'f':
- value = <float>bam_aux2f(s)
- s += 4
- elif auxtype == 'd':
- value = <double>bam_aux2f(s)
- s += 8
- elif auxtype == 'A':
- value = "%c" % <char>bam_aux2A(s)
- s += 1
- elif auxtype in ('Z', 'H'):
- value = charptr_to_str(<char*>bam_aux2Z(s))
- # +1 for NULL terminated string
- s += len(value) + 1
- elif auxtype == 'B':
- s += 1
- byte_size, nvalues, value = convert_binary_tag(s)
- # 5 for 1 char and 1 int
- s += 5 + (nvalues * byte_size) - 1
- else:
- raise KeyError("unknown type '%s'" % auxtype)
-
- s += 1
-
- if with_value_type:
- result.append((charptr_to_str(auxtag), value, chr(auxtype)))
- else:
- result.append((charptr_to_str(auxtag), value))
-
- return result
-
- def set_tags(self, tags):
- """sets the fields in the optional alignment section with
- a list of (tag, value) tuples.
-
- The :term:`value type` of the values is determined from the
- python type. Optionally, a type may be given explicitly as
- a third value in the tuple. For example:
-
- x.set_tags([(NM, 2, "i"), (RG, "GJP00TM04", "Z")])
-
- This method will not enforce the rule that the same tag may appear
- only once in the optional alignment section.
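-
- For example, a sketch that copies all tags from one read to
- another, preserving the original type codes::
-
- other.set_tags(read.get_tags(with_value_type=True))
-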
- """
-
- cdef bam1_t * src
- cdef uint8_t * s
- cdef char * temp
- cdef int new_size = 0
- cdef int old_size
- src = self._delegate
-
- # convert and pack the data
- if tags is not None and len(tags) > 0:
- fmt, args = packTags(tags)
- new_size = struct.calcsize(fmt)
- buffer = ctypes.create_string_buffer(new_size)
- struct.pack_into(fmt,
- buffer,
- 0,
- *args)
-
- # delete the old data and allocate new space.
- # If total_size == 0, the aux field will be
- # empty
- old_size = pysam_bam_get_l_aux(src)
- pysam_bam_update(src,
- old_size,
- new_size,
- pysam_bam_get_aux(src))
-
- # copy data only if there is any
- if new_size > 0:
-
- # get location of new data
- s = pysam_bam_get_aux(src)
-
- # check if there is direct path from buffer.raw to tmp
- p = buffer.raw
- # create handle to make sure buffer stays alive long
- # enough for memcpy, see issue 129
- temp = p
- memcpy(s, temp, new_size)
-
-
- ########################################################
- # Compatibility Accessors
- # Functions, properties for compatibility with pysam < 0.8
- #
- # Several options
- # change the factory functions according to API
- # * requires code changes throughout, incl passing
- # handles to factory functions
- # subclass functions and add attributes at runtime
- # e.g.: AlignedSegment.qname = AlignedSegment.query_name
- # * will slow down the default interface
- # explicit declaration of getters/setters
- ########################################################
- property qname:
- """deprecated, use query_name instead"""
- def __get__(self): return self.query_name
- def __set__(self, v): self.query_name = v
- property tid:
- """deprecated, use reference_id instead"""
- def __get__(self): return self.reference_id
- def __set__(self, v): self.reference_id = v
- property pos:
- """deprecated, use reference_start instead"""
- def __get__(self): return self.reference_start
- def __set__(self, v): self.reference_start = v
- property mapq:
- """deprecated, use mapping_quality instead"""
- def __get__(self): return self.mapping_quality
- def __set__(self, v): self.mapping_quality = v
- property rnext:
- """deprecated, use next_reference_id instead"""
- def __get__(self): return self.next_reference_id
- def __set__(self, v): self.next_reference_id = v
- property pnext:
- """deprecated, use next_reference_start instead"""
- def __get__(self):
- return self.next_reference_start
- def __set__(self, v):
- self.next_reference_start = v
- property cigar:
- """deprecated, use cigartuples instead"""
- def __get__(self):
- r = self.cigartuples
- if r is None:
- r = []
- return r
- def __set__(self, v): self.cigartuples = v
- property tlen:
- """deprecated, use template_length instead"""
- def __get__(self):
- return self.template_length
- def __set__(self, v):
- self.template_length = v
- property seq:
- """deprecated, use query_sequence instead"""
- def __get__(self):
- return self.query_sequence
- def __set__(self, v):
- self.query_sequence = v
- property qual:
- """deprecated, use query_qualities instead"""
- def __get__(self):
- return array_to_qualitystring(self.query_qualities)
- def __set__(self, v):
- self.query_qualities = qualitystring_to_array(v)
- property alen:
- """deprecated, use reference_length instead"""
- def __get__(self):
- return self.reference_length
- def __set__(self, v):
- self.reference_length = v
- property aend:
- """deprecated, use reference_end instead"""
- def __get__(self):
- return self.reference_end
- def __set__(self, v):
- self.reference_end = v
- property rlen:
- """deprecated, use query_length instead"""
- def __get__(self):
- return self.query_length
- def __set__(self, v):
- self.query_length = v
- property query:
- """deprecated, use query_alignment_sequence instead"""
- def __get__(self):
- return self.query_alignment_sequence
- def __set__(self, v):
- self.query_alignment_sequence = v
- property qqual:
- """deprecated, use query_alignment_qualities instead"""
- def __get__(self):
- return array_to_qualitystring(self.query_alignment_qualities)
- def __set__(self, v):
- self.query_alignment_qualities = qualitystring_to_array(v)
- property qstart:
- """deprecated, use query_alignment_start instead"""
- def __get__(self):
- return self.query_alignment_start
- def __set__(self, v):
- self.query_alignment_start = v
- property qend:
- """deprecated, use query_alignment_end instead"""
- def __get__(self):
- return self.query_alignment_end
- def __set__(self, v):
- self.query_alignment_end = v
- property qlen:
- """deprecated, use query_alignment_length instead"""
- def __get__(self):
- return self.query_alignment_length
- def __set__(self, v):
- self.query_alignment_length = v
- property mrnm:
- """deprecated, use next_reference_id instead"""
- def __get__(self):
- return self.next_reference_id
- def __set__(self, v):
- self.next_reference_id = v
- property mpos:
- """deprecated, use next_reference_start instead"""
- def __get__(self):
- return self.next_reference_start
- def __set__(self, v):
- self.next_reference_start = v
- property rname:
- """deprecated, use reference_id instead"""
- def __get__(self):
- return self.reference_id
- def __set__(self, v):
- self.reference_id = v
- property isize:
- """deprecated, use template_length instead"""
- def __get__(self):
- return self.template_length
- def __set__(self, v):
- self.template_length = v
- property blocks:
- """deprecated, use get_blocks() instead"""
- def __get__(self):
- return self.get_blocks()
- property aligned_pairs:
- """deprecated, use get_aligned_pairs() instead"""
- def __get__(self):
- return self.get_aligned_pairs()
- property inferred_length:
- """deprecated, use infer_query_length() instead"""
- def __get__(self):
- return self.infer_query_length()
- property positions:
- """deprecated, use get_reference_positions() instead"""
- def __get__(self):
- return self.get_reference_positions()
- property tags:
- """deprecated, use get_tags() instead"""
- def __get__(self):
- return self.get_tags()
- def __set__(self, tags):
- self.set_tags(tags)
- def overlap(self):
- """deprecated, use get_overlap() instead"""
- return self.get_overlap()
- def opt(self, tag):
- """deprecated, use get_tag() instead"""
- return self.get_tag(tag)
- def setTag(self, tag, value, value_type=None, replace=True):
- """deprecated, use set_tag() instead"""
- return self.set_tag(tag, value, value_type, replace)
-
-
-cdef class PileupColumn:
- '''A pileup of reads at a particular reference sequence position
- (:term:`column`). A pileup column contains all the reads that map
- to a certain target base.
-
- This class is a proxy for results returned by the samtools pileup
- engine. If the underlying engine iterator advances, the results
- of this column will change.
-
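- A sketch of typical access via :meth:`AlignmentFile.pileup`
- (file and reference names are illustrative)::
-
- bam = pysam.AlignmentFile("ex1.bam", "rb")
- for column in bam.pileup("chr1", 100, 200):
-     print(column.reference_pos, column.nsegments)
-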
- '''
- def __init__(self):
- raise TypeError("this class cannot be instantiated from Python")
-
- def __str__(self):
- return "\t".join(map(str,
- (self.reference_id,
- self.reference_pos,
- self.nsegments))) +\
- "\n" +\
- "\n".join(map(str, self.pileups))
-
- property reference_id:
- '''the reference sequence number as defined in the header'''
- def __get__(self):
- return self.tid
-
- property reference_name:
- """:term:`reference` name (None if no AlignmentFile is associated)"""
- def __get__(self):
- if self._alignment_file is not None:
- return self._alignment_file.getrname(self.tid)
- return None
-
- property nsegments:
- '''number of reads mapping to this column.'''
- def __get__(self):
- return self.n_pu
- def __set__(self, n):
- self.n_pu = n
-
- property reference_pos:
- '''the position in the reference sequence (0-based).'''
- def __get__(self):
- return self.pos
-
- property pileups:
- '''list of reads (:class:`pysam.PileupRead`) aligned to this column'''
- def __get__(self):
- cdef int x
- pileups = []
-
- if self.plp == NULL or self.plp[0] == NULL:
- raise ValueError("PileupColumn accessed after iterator finished")
-
- # warning: there could be problems if self.n and self.buf are
- # out of sync.
- for x from 0 <= x < self.n_pu:
- pileups.append(makePileupRead(&(self.plp[0][x]),
- self._alignment_file))
- return pileups
-
- ########################################################
- # Compatibility Accessors
- # Functions, properties for compatibility with pysam < 0.8
- ########################################################
- property pos:
- def __get__(self):
- return self.reference_pos
- def __set__(self, v):
- self.reference_pos = v
-
- property tid:
- def __get__(self):
- return self.reference_id
- def __set__(self, v):
- self.reference_id = v
-
- property n:
- def __get__(self):
- return self.nsegments
- def __set__(self, v):
- self.nsegments = v
-
-
-cdef class PileupRead:
- '''Representation of a read aligned to a particular position in the
- reference sequence.
-
- '''
-
- def __init__(self):
- raise TypeError(
- "this class cannot be instantiated from Python")
-
- def __str__(self):
- return "\t".join(
- map(str,
- (self.alignment, self.query_position,
- self.indel, self.level,
- self.is_del, self.is_head,
- self.is_tail, self.is_refskip)))
-
- property alignment:
- """a :class:`pysam.AlignedSegment` object of the aligned read"""
- def __get__(self):
- return self._alignment
-
- property query_position:
- """position of the read base at the pileup site, 0-based.
- None if is_del or is_refskip is set.
-
- """
- def __get__(self):
- if self.is_del or self.is_refskip:
- return None
- else:
- return self._qpos
-
- property query_position_or_next:
- """position of the read base at the pileup site, 0-based.
-
- If the current position is a deletion, returns the next
- aligned base.
-
- """
- def __get__(self):
- return self._qpos
-
- property indel:
- """indel length for the position following the current pileup site.
-
- This quantity peeks ahead to the next cigar operation in this
- alignment. If the next operation is an insertion, indel will
- be positive. If the next operation is a deletion, it will be
- negative. It is 0 if the next operation is not an indel.
-
- """
- def __get__(self):
- return self._indel
-
- property level:
- """the level of the read in the "viewer" mode"""
- def __get__(self):
- return self._level
-
- property is_del:
- """1 iff the base on the padded read is a deletion"""
- def __get__(self):
- return self._is_del
-
- property is_head:
- """1 iff the base on the padded read is the left-most base."""
- def __get__(self):
- return self._is_head
-
- property is_tail:
- """1 iff the base on the padded read is the right-most base."""
- def __get__(self):
- return self._is_tail
-
- property is_refskip:
- """1 iff the base on the padded read is part of a CIGAR N
- (reference skip) operation."""
- def __get__(self):
- return self._is_refskip
-
-__all__ = [
- "AlignedSegment",
- "PileupColumn",
- "PileupRead"]
+++ /dev/null
-from libc.stdint cimport int8_t, int16_t, int32_t, int64_t
-from libc.stdint cimport uint8_t, uint16_t, uint32_t, uint64_t
-from libc.stdlib cimport malloc, calloc, realloc, free
-from libc.string cimport memcpy, memcmp, strncpy, strlen, strdup
-from libc.stdio cimport FILE, printf
-
-from pysam.cfaidx cimport faidx_t, Fastafile
-from pysam.calignedsegment cimport AlignedSegment
-from pysam.chtslib cimport *
-
-from cpython cimport array
-cimport cython
-
-cdef extern from *:
- ctypedef char* const_char_ptr "const char*"
-
-cdef extern from "htslib_util.h":
-
- char * pysam_bam_get_qname(bam1_t * b)
-
-cdef extern from "samfile_util.h":
-
- int bam_cap_mapQ(bam1_t *b, char *ref, int thres)
- int bam_prob_realn(bam1_t *b, const char *ref)
-
-####################################################################
-# Utility types
-
-ctypedef struct __iterdata:
- htsFile * htsfile
- bam_hdr_t * header
- hts_itr_t * iter
- faidx_t * fastafile
- int tid
- char * seq
- int seq_len
-
-
-cdef class AlignmentFile:
-
- cdef object _filename
- cdef object _reference_filename
-
- # pointer to htsFile structure
- cdef htsFile * htsfile
-
- # pointer to index
- cdef hts_idx_t *index
- # header structure
- cdef bam_hdr_t * header
- # true if file is bam format
- cdef readonly bint is_bam
- # true if file is cram format
- cdef readonly bint is_cram
- # true if not a file but a stream
- cdef readonly bint is_stream
- # true if file is not on the local filesystem
- cdef readonly bint is_remote
- # current read within iteration
- cdef bam1_t * b
- # file opening mode
- cdef char * mode
-
- # beginning of read section
- cdef int64_t start_offset
-
- cdef bam1_t * getCurrent(self)
- cdef int cnext(self)
-
- # write an aligned read
- cpdef int write(self, AlignedSegment read) except -1
-
-cdef class PileupColumn:
- cdef bam_pileup1_t ** plp
- cdef int tid
- cdef int pos
- cdef int n_pu
-
-cdef class PileupRead:
- cdef AlignedSegment _alignment
- cdef int32_t _qpos
- cdef int _indel
- cdef int _level
- cdef uint32_t _is_del
- cdef uint32_t _is_head
- cdef uint32_t _is_tail
- cdef uint32_t _is_refskip
-
-cdef class IteratorRow:
- cdef int retval
- cdef bam1_t * b
- cdef AlignmentFile samfile
- cdef htsFile * htsfile
- cdef bam_hdr_t * header
- cdef int owns_samfile
-
-cdef class IteratorRowRegion(IteratorRow):
- cdef hts_itr_t * iter
- cdef bam1_t * getCurrent(self)
- cdef int cnext(self)
-
-cdef class IteratorRowHead(IteratorRow):
- cdef int max_rows
- cdef int current_row
- cdef bam1_t * getCurrent(self)
- cdef int cnext(self)
-
-cdef class IteratorRowAll(IteratorRow):
- cdef bam1_t * getCurrent(self)
- cdef int cnext(self)
-
-cdef class IteratorRowAllRefs(IteratorRow):
- cdef int tid
- cdef IteratorRowRegion rowiter
-
-cdef class IteratorRowSelection(IteratorRow):
- cdef int current_pos
- cdef positions
- cdef bam1_t * getCurrent(self)
- cdef int cnext(self)
-
-cdef class IteratorColumn:
-
- # result of the last plbuf_push
- cdef IteratorRowRegion iter
- cdef int tid
- cdef int pos
- cdef int n_plp
- cdef int mask
- cdef bam_pileup1_t * plp
- cdef bam_plp_t pileup_iter
- cdef __iterdata iterdata
- cdef AlignmentFile samfile
- cdef Fastafile fastafile
- cdef stepper
- cdef int max_depth
-
- cdef int cnext(self)
- cdef char * getSequence(self)
- cdef setMask(self, mask)
- cdef setupIteratorData(self,
- int tid,
- int start,
- int end,
- int multiple_iterators=?)
-
- cdef reset(self, tid, start, end)
- cdef _free_pileup_iter(self)
-
-cdef class IteratorColumnRegion(IteratorColumn):
- cdef int start
- cdef int end
- cdef int truncate
-
-cdef class IteratorColumnAllRefs(IteratorColumn):
- pass
-
-cdef class IndexedReads:
- cdef AlignmentFile samfile
- cdef htsFile * htsfile
- cdef index
- cdef int owns_samfile
- cdef bam_hdr_t * header
-
+++ /dev/null
-# cython: embedsignature=True
-# cython: profile=True
-########################################################
-########################################################
-# Cython wrapper for SAM/BAM/CRAM files based on htslib
-########################################################
-# The principal classes defined in this module are:
-#
-# class AlignmentFile read/write access to SAM/BAM/CRAM formatted files
-#
-# class IndexedReads index a SAM/BAM/CRAM file by query name while keeping
-# the original sort order intact
-#
-# Additionally, this module defines numerous classes that
-# are part of the internal API. These are:
-#
-# Various iterator classes to iterate over alignments in sequential
-# (IteratorRow) or in a stacked fashion (IteratorColumn):
-#
-# class IteratorRow
-# class IteratorRowRegion
-# class IteratorRowHead
-# class IteratorRowAll
-# class IteratorRowAllRefs
-# class IteratorRowSelection
-# class IteratorColumn
-# class IteratorColumnRegion
-# class IteratorColumnAllRefs
-#
-########################################################
-#
-# The MIT License
-#
-# Copyright (c) 2015 Andreas Heger
-#
-# Permission is hereby granted, free of charge, to any person obtaining a
-# copy of this software and associated documentation files (the "Software"),
-# to deal in the Software without restriction, including without limitation
-# the rights to use, copy, modify, merge, publish, distribute, sublicense,
-# and/or sell copies of the Software, and to permit persons to whom the
-# Software is furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-# DEALINGS IN THE SOFTWARE.
-#
-########################################################
-import os
-import collections
-import re
-import warnings
-import array
-
-from cpython cimport array as c_array
-from cpython.version cimport PY_MAJOR_VERSION
-
-from pysam.cutils cimport force_bytes, force_str, charptr_to_str
-from pysam.cutils cimport encode_filename, from_string_and_size
-from pysam.calignedsegment cimport makeAlignedSegment, makePileupColumn
-from pysam.chtslib cimport hisremote
-
-if PY_MAJOR_VERSION >= 3:
- from io import StringIO
-else:
- from StringIO import StringIO
-
-cimport cython
-
-########################################################
-## Constants and global variables
-
-# defines imported from samtools
-DEF SEEK_SET = 0
-DEF SEEK_CUR = 1
-DEF SEEK_END = 2
-
-# maximum genomic coordinate
-cdef int MAX_POS = 2 << 29
-
-# valid types for SAM headers
-VALID_HEADER_TYPES = {"HD" : dict,
- "SQ" : list,
- "RG" : list,
- "PG" : list,
- "CO" : list}
-
-# order of records within SAM headers
-VALID_HEADERS = ("HD", "SQ", "RG", "PG", "CO")
-
-# default type conversions within SAM header records
-KNOWN_HEADER_FIELDS = {"HD" : {"VN" : str, "SO" : str, "GO" : str},
- "SQ" : {"SN" : str, "LN" : int, "AS" : str,
- "M5" : str, "SP" : str, "UR" : str,},
- "RG" : {"ID" : str, "CN" : str, "DS" : str,
- "DT" : str, "FO" : str, "KS" : str,
- "LB" : str, "PG" : str, "PI" : str,
- "PL" : str, "PM" : str, "PU" : str,
- "SM" : str,},
- "PG" : {"ID" : str, "PN" : str, "CL" : str,
- "PP" : str, "DS" : str, "VN" : str,},}
-
-# output order of fields within records. Ensure that CL is at
-# the end as parsing a CL will ignore any subsequent records.
-VALID_HEADER_ORDER = {"HD" : ("VN", "SO", "GO"),
- "SQ" : ("SN", "LN", "AS", "M5",
- "UR", "SP"),
- "RG" : ("ID", "CN", "SM", "LB",
- "PU", "PI", "DT", "DS",
- "PL", "FO", "KS", "PG",
- "PM"),
- "PG" : ("PN", "ID", "VN", "PP",
- "DS", "CL"),}
-
-
-def build_header_line(fields, record):
- '''build a header line from `fields` dictionary for `record`'''
-
- # TODO: add checking for field and sort order
- line = ["@%s" % record]
- # comment
- if record == "CO":
- line.append(fields)
- # user tags
- elif record.islower():
- for key in sorted(fields):
- line.append("%s:%s" % (key, str(fields[key])))
- # defined tags
- else:
- # write fields of the specification
- for key in VALID_HEADER_ORDER[record]:
- if key in fields:
- line.append("%s:%s" % (key, str(fields[key])))
- # write user fields
- for key in fields:
- if not key.isupper():
- line.append("%s:%s" % (key, str(fields[key])))
-
- return "\t".join(line)
-
-cdef bam_hdr_t * build_header(new_header):
- '''return a new header built from a dictionary in `new_header`.
-
- This method inserts the text field, target_name and target_len.
- '''
-
- lines = []
-
- # check if hash exists
-
- # create new header and copy old data
- cdef bam_hdr_t * dest
-
- dest = bam_hdr_init()
-
- # first: defined tags
- for record in VALID_HEADERS:
- if record in new_header:
- ttype = VALID_HEADER_TYPES[record]
- data = new_header[record]
- if type(data) != type(ttype()):
- raise ValueError(
- "invalid type for record %s: %s, expected %s" %
- (record, type(data), type(ttype())))
- if type(data) is dict:
- lines.append(build_header_line(data, record))
- else:
- for fields in new_header[record]:
- lines.append(build_header_line(fields, record))
-
- # then: user tags (lower case), sorted alphabetically
- for record, data in sorted(new_header.items()):
- if record in VALID_HEADERS: continue
- if type(data) is dict:
- lines.append(build_header_line(data, record))
- else:
- for fields in new_header[record]:
- lines.append(build_header_line(fields, record))
-
- text = "\n".join(lines) + "\n"
- if dest.text != NULL: free( dest.text )
- dest.text = <char*>calloc(len(text), sizeof(char))
- dest.l_text = len(text)
- cdef bytes btext = text.encode('ascii')
- strncpy(dest.text, btext, dest.l_text)
-
- cdef bytes bseqname
- # collect targets
- if "SQ" in new_header:
- seqs = []
- for fields in new_header["SQ"]:
- try:
- seqs.append( (fields["SN"], fields["LN"] ) )
- except KeyError:
- raise KeyError( "incomplete sequence information in '%s'" % str(fields))
-
- dest.n_targets = len(seqs)
- dest.target_name = <char**>calloc(dest.n_targets, sizeof(char*))
- dest.target_len = <uint32_t*>calloc(dest.n_targets, sizeof(uint32_t))
-
- for x from 0 <= x < dest.n_targets:
- seqname, seqlen = seqs[x]
- dest.target_name[x] = <char*>calloc(
- len(seqname) + 1, sizeof(char))
- bseqname = seqname.encode('ascii')
- strncpy(dest.target_name[x], bseqname,
- len(seqname) + 1)
- dest.target_len[x] = seqlen
-
- return dest
-
-
-cdef class AlignmentFile:
- """AlignmentFile(filepath_or_object, mode=None, template=None,
- reference_names=None, reference_lengths=None, text=NULL,
- header=None, add_sq_text=False, check_header=True, check_sq=True,
- reference_filename=None, filename=None)
-
- A :term:`SAM`/:term:`BAM` formatted file.
-
- If `filepath_or_object` is a string, the file is automatically
- opened. If `filepath_or_object` is a python File object, the
- already opened file will be used.
-
- If the file is opened for reading and an index exists (.bai),
- it will be opened automatically. Without an index, random
- access via :meth:`~pysam.AlignmentFile.fetch` and
- :meth:`~pysam.AlignmentFile.pileup` is disabled.
-
- For writing, the header of a :term:`SAM` file/:term:`BAM` file can
- be constituted from several sources (see also the samtools format
- specification):
-
- 1. If `template` is given, the header is copied from another
- `AlignmentFile` (`template` must be a
- :class:`~pysam.AlignmentFile`).
-
- 2. If `header` is given, the header is built from a
- multi-level dictionary.
-
- 3. If `text` is given, new header text is copied from raw
- text.
-
- 4. The names (`reference_names`) and lengths
- (`reference_lengths`) are supplied directly as lists.
-
- When reading or writing a CRAM file, the filename of a FASTA-formatted
- reference can be specified with `reference_filename`.
-
- By default, if a file is opened in mode 'r', it is checked
- for a valid header (`check_header` = True) and a definition of
- chromosome names (`check_sq` = True).
-
- Parameters
- ----------
- mode : string
- `mode` should be ``r`` for reading or ``w`` for writing. The
- default is text mode (:term:`SAM`). For binary (:term:`BAM`)
- I/O you should append ``b`` for compressed or ``u`` for
- uncompressed :term:`BAM` output. Use ``h`` to output header
- information in text (:term:`TAM`) mode. Use ``c`` for
- :term:`CRAM` formatted files.
-
- If ``b`` is present, it must immediately follow ``r`` or
- ``w``. Valid modes are ``r``, ``w``, ``wh``, ``rb``, ``wb``,
- ``wbu``, ``wb0``, ``rc`` and ``wc``. For instance, to open a
- :term:`BAM` formatted file for reading, type::
-
- f = pysam.AlignmentFile('ex1.bam','rb')
-
- If mode is not specified, the method will try to auto-detect
- in the order 'rb', 'r'; thus both of the following should work::
-
- f1 = pysam.AlignmentFile('ex1.bam')
- f2 = pysam.AlignmentFile('ex1.sam')
-
- template : AlignmentFile
- when writing, copy the header from `template`.
-
- header : dict
- when writing, build header from a multi-level dictionary. The
- first level are the four types ('HD', 'SQ', ...). The second
- level are a list of lines, with each line being a list of
- tag-value pairs. The header is constructed first from all the
- defined fields, followed by user tags in alphabetical order.
-
- text : string
- when writing, use the string provided as the header
-
- reference_names : list
- see reference_lengths
-
- reference_lengths : list
- when writing, build header from list of chromosome names and
- lengths. By default, 'SQ' and 'LN' tags will be added to the
- header text. This option can be changed by unsetting the flag
- `add_sq_text`.
-
- add_sq_text : bool
- do not add 'SQ' and 'LN' tags to header. This option permits
- construction of :term:`SAM` formatted files without a header.
-
- check_header : bool
- when reading, check if header is present (default=True)
-
- check_sq : bool
- when reading, check if SQ entries are present in header
- (default=True)
-
- reference_filename : string
- Path to a FASTA-formatted reference file. Valid only for CRAM files.
- When reading a CRAM file, this overrides both ``$REF_PATH`` and the URL
- specified in the header (``UR`` tag), which are normally used to find
- the reference.
-
- filename : string
- Alternative to filepath_or_object. Filename of the file
- to be opened.
-
- """
-
- def __cinit__(self, *args, **kwargs):
-
- self.htsfile = NULL
- self._filename = None
- self.is_bam = False
- self.is_stream = False
- self.is_cram = False
- self.is_remote = False
-
- if "filename" in kwargs:
- args = [kwargs["filename"]]
- del kwargs["filename"]
-
- self._open(*args, **kwargs)
-
- # allocate memory for iterator
- self.b = <bam1_t*>calloc(1, sizeof(bam1_t))
-
- def is_open(self):
- '''return true if htsfile has been opened.'''
- return self.htsfile != NULL
-
- def has_index(self):
- """return true if htsfile has an existing (and opened) index.
- """
- return self.index != NULL
-
- def check_index(self):
- """return True if index is present.
-
- Raises
- ------
-
- AttributeError
- if htsfile is :term:`SAM` formatted and thus has no index.
-
- ValueError
- if htsfile is closed or index could not be opened.
- """
-
- if not self.is_open():
- raise ValueError("I/O operation on closed file")
- if not self.is_bam and not self.is_cram:
- raise AttributeError(
- "check_index only available for bam and cram files")
- if self.index == NULL:
- raise ValueError(
- "mapping information not recorded in index "
- "or index not available")
- return True
-
- def _open(self,
- filepath_or_object,
- mode=None,
- AlignmentFile template=None,
- reference_names=None,
- reference_lengths=None,
- reference_filename=None,
- text=None,
- header=None,
- port=None,
- add_sq_text=True,
- check_header=True,
- check_sq=True,
- filepath_index=None,
- referencenames=None,
- referencelengths=None):
- '''open a sam, bam or cram formatted file.
-
- If _open is called on an existing file, the current file
- will be closed and a new file will be opened.
- '''
- cdef char *cfilename
- cdef char *creference_filename
- cdef char *cindexname
- cdef char *cmode
-
- # for backwards compatibility:
- if referencenames is not None:
- reference_names = referencenames
- if referencelengths is not None:
- reference_lengths = referencelengths
-
- # autodetection for read
- if mode is None:
- mode = "r"
-
- assert mode in ("r", "w", "rb", "wb", "wh",
- "wbu", "rU", "wb0",
- "rc", "wc"), \
- "invalid file opening mode `%s`" % mode
-
- # close a previously opened file
- if self.htsfile != NULL:
- self.close()
-
- # StringIO not supported
- if isinstance(filepath_or_object, StringIO):
- filename = "stringio"
- raise NotImplementedError(
- "access from StringIO objects not supported")
- if filepath_or_object.closed:
- raise ValueError('I/O operation on closed StringIO object')
- # check if we are working with a File object
- elif hasattr(filepath_or_object, "fileno"):
- filename = filepath_or_object.name
- if filepath_or_object.closed:
- raise ValueError('I/O operation on closed file')
- else:
- filename = filepath_or_object
-
- # for htslib, wbu seems to not work
- if mode == "wbu":
- mode = "wb0"
-
- cdef bytes bmode = mode.encode('ascii')
- self._filename = filename = encode_filename(filename)
- self._reference_filename = reference_filename = encode_filename(
- reference_filename)
-
- # FIXME: Use htsFormat when it is available
- self.is_stream = filename == b"-"
- self.is_remote = hisremote(filename)
-
- cdef char * ctext
- cdef hFILE * fp
- ctext = NULL
-
- if mode[0] == 'w':
- # open file for writing
-
- # header structure (used for writing)
- if template:
- self.header = bam_hdr_dup(template.header)
- elif header:
- self.header = build_header(header)
- else:
- # build header from target names and lengths
- assert reference_names and reference_lengths, \
- ("either supply options `template`, `header` "
- "or both `reference_names` and `reference_lengths` "
- "for writing")
- assert len(reference_names) == len(reference_lengths), \
- "unequal names and lengths of reference sequences"
-
- # allocate and fill header
- reference_names = [force_bytes(ref) for ref in reference_names]
- self.header = bam_hdr_init()
- self.header.n_targets = len(reference_names)
- n = 0
- for x in reference_names:
- n += len(x) + 1
- self.header.target_name = <char**>calloc(
- n, sizeof(char*))
- self.header.target_len = <uint32_t*>calloc(
- n, sizeof(uint32_t))
- for x from 0 <= x < self.header.n_targets:
- self.header.target_len[x] = reference_lengths[x]
- name = reference_names[x]
- self.header.target_name[x] = <char*>calloc(
- len(name) + 1, sizeof(char))
- strncpy(self.header.target_name[x], name, len(name))
-
- # Optionally, if there is no text, add a SAM
- # compatible header to output file.
- if text is None and add_sq_text:
- text = []
- for x from 0 <= x < self.header.n_targets:
- text.append("@SQ\tSN:%s\tLN:%s\n" % \
- (force_str(reference_names[x]),
- reference_lengths[x]))
- text = ''.join(text)
-
- if text is not None:
- # copy without \0
- text = force_bytes(text)
- ctext = text
- self.header.l_text = strlen(ctext)
- self.header.text = <char*>calloc(
- strlen(ctext), sizeof(char))
- memcpy(self.header.text, ctext, strlen(ctext))
-
- # open file (hts_open is a synonym for sam_open)
- cfilename, cmode = filename, bmode
- if hasattr(filepath_or_object, "fileno"):
- fp = hdopen(filepath_or_object.fileno(), cmode)
- with nogil:
- self.htsfile = hts_hopen(fp, cfilename, cmode)
- else:
- with nogil:
- self.htsfile = hts_open(cfilename, cmode)
-
- # htsfile.format does not get set until writing, so use
- # the format specifier explicitly given by the user.
- self.is_bam = "b" in mode
- self.is_cram = "c" in mode
-
- # set filename with reference sequences. If no filename
- # is given, the CRAM reference arrays will be built from
- # the @SQ lines in the header
- if self.is_cram and reference_filename:
- # note that fn_aux takes ownership, so create a copy
- self.htsfile.fn_aux = strdup(self._reference_filename)
-
- # write header to htsfile
- if self.is_bam or self.is_cram or "h" in mode:
- with nogil:
- sam_hdr_write(self.htsfile, self.header)
-
- elif mode[0] == "r":
- # open file for reading
- if (filename != b"-"
- and not self.is_remote
- and not os.path.exists(filename)):
- raise IOError("file `%s` not found" % filename)
-
- # open file (hts_open is a synonym for sam_open)
- cfilename, cmode = filename, bmode
- if hasattr(filepath_or_object, "fileno"):
- fp = hdopen(filepath_or_object.fileno(), cmode)
- with nogil:
- self.htsfile = hts_hopen(fp, cfilename, cmode)
- else:
- with nogil:
- self.htsfile = hts_open(cfilename, cmode)
-
- if self.htsfile == NULL:
- raise ValueError(
- "could not open file (mode='%s') - "
- "is it SAM/BAM format?" % mode)
-
- self.is_bam = self.htsfile.format.format == bam
- self.is_cram = self.htsfile.format.format == cram
-
- # bam files require a valid header
- if self.is_bam or self.is_cram:
- with nogil:
- self.header = sam_hdr_read(self.htsfile)
- if self.header == NULL:
- raise ValueError(
- "file does not have valid header (mode='%s') "
- "- is it BAM format?" % mode )
- else:
- # in sam files it is optional (htsfile full of
- # unmapped reads)
- if check_header:
- with nogil:
- self.header = sam_hdr_read(self.htsfile)
- if self.header == NULL:
- raise ValueError(
- "file does not have valid header (mode='%s') "
- "- is it SAM format?" % mode )
- # self.header.ignore_sam_err = True
-
- # set filename with reference sequences
- if self.is_cram and reference_filename:
- creference_filename = self._reference_filename
- hts_set_opt(self.htsfile,
- CRAM_OPT_REFERENCE,
- creference_filename)
-
- if check_sq and self.header.n_targets == 0:
- raise ValueError(
- ("file has no sequences defined (mode='%s') - "
- "is it SAM/BAM format? Consider opening with "
- "check_sq=False") % mode)
-
- if self.htsfile == NULL:
- raise IOError("could not open file `%s`" % filename )
-
- # check for index and open if present
- cdef int format_index = -1
- if self.is_bam:
- format_index = HTS_FMT_BAI
- elif self.is_cram:
- format_index = HTS_FMT_CRAI
-
- if mode[0] == "r" and (self.is_bam or self.is_cram):
-
- # open index for remote files
- if self.is_remote and not filepath_index:
- cfilename = filename
-
- with nogil:
- self.index = hts_idx_load(cfilename, format_index)
- if self.index == NULL:
- warnings.warn(
- "unable to open remote index for '%s'" % cfilename)
- else:
- has_index = True
- cfilename = filename
- if filepath_index:
- if not os.path.exists(filepath_index):
- warnings.warn(
- "unable to open index at %s" % filepath_index)
- self.index = NULL
- has_index = False
- else:
- if self.is_bam \
- and not os.path.exists(filename + b".bai") \
- and not os.path.exists(filename[:-4] + b".bai"):
- self.index = NULL
- has_index = False
- elif self.is_cram \
- and not os.path.exists(filename + b".crai") \
- and not os.path.exists(filename[:-5] + b".crai"):
- self.index = NULL
- has_index = False
-
- if has_index:
- # returns NULL if there is no index or index could
- # not be opened
- if filepath_index:
- cindexname = filepath_index = encode_filename(filepath_index)
- with nogil:
- self.index = sam_index_load2(self.htsfile,
- cfilename,
- cindexname)
-
- else:
- with nogil:
- self.index = sam_index_load(self.htsfile,
- cfilename)
- if self.index == NULL:
- raise IOError(
- "error while opening index for '%s'" %
- filename)
-
- # save start of data section
- if not self.is_stream:
- self.start_offset = self.tell()
-
- def get_tid(self, reference):
- """
- return the numerical :term:`tid` corresponding to
- :term:`reference`
-
- returns -1 if reference is not known.
- """
- if not self.is_open():
- raise ValueError("I/O operation on closed file")
- reference = force_bytes(reference)
- return bam_name2id(self.header, reference)
-
- def get_reference_name(self, tid):
- """
- return :term:`reference` name corresponding to numerical :term:`tid`
- """
- if not self.is_open():
- raise ValueError("I/O operation on closed file")
- if not 0 <= tid < self.header.n_targets:
- raise ValueError("reference_id %i out of range 0<=tid<%i" %
- (tid, self.header.n_targets))
- return charptr_to_str(self.header.target_name[tid])
-
- def reset(self):
- """reset file position to beginning of file just after
- the header.
-
- Returns
- -------
-
- The file position after moving the file pointer.
-
- """
- return self.seek(self.start_offset, 0)
-
- def seek(self, uint64_t offset, int where=0):
- """move file pointer to position `offset`, see
- :meth:`pysam.AlignmentFile.tell`.
-
- Parameters
- ----------
-
- offset : int
-
- position of the read/write pointer within the file.
-
- where : int
-
- optional and defaults to 0, which means absolute file
- positioning; other values are 1, which means seek relative to
- the current position, and 2, which means seek relative to the
- file's end.
-
- Returns
- -------
-
- the file position after moving the file pointer
-
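- A sketch pairing :meth:`tell` and :meth:`seek` to revisit a
- read (BAM files only)::
-
- pos = bam.tell()
- read = next(bam)
- bam.seek(pos) # rewind to just before the read
-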
- """
-
- if not self.is_open():
- raise ValueError("I/O operation on closed file")
- if not self.is_bam:
- raise NotImplementedError(
- "seek only available in bam files")
- if self.is_stream:
- raise OSError("seek not available in streams")
-
- cdef uint64_t pos
- with nogil:
- pos = bgzf_seek(hts_get_bgzfp(self.htsfile), offset, where)
- return pos
-
- def tell(self):
- """
- return current file position.
- """
- if not self.is_open():
- raise ValueError("I/O operation on closed file")
- if not (self.is_bam or self.is_cram):
- raise NotImplementedError(
- "tell only available in bam and cram files")
-
- cdef uint64_t pos
- with nogil:
- pos = bgzf_tell(hts_get_bgzfp(self.htsfile))
- return pos
-
- def parse_region(self,
- reference=None,
- start=None,
- end=None,
- region=None,
- tid=None):
- """parse alternative ways to specify a genomic region. A region can
- either be specified by :term:`reference`, `start` and
- `end`. `start` and `end` denote 0-based, half-open
- intervals.
-
- Alternatively, a samtools :term:`region` string can be
- supplied.
-
- If any of the coordinates are missing they will be replaced by the
- minimum (`start`) or maximum (`end`) coordinate.
-
- Note that region strings are 1-based, while `start` and `end` denote
- an interval in python coordinates.
-
- Returns
- -------
-
- tuple : a tuple of `flag`, :term:`tid`, `start` and `end`. The
- flag indicates whether no coordinates were supplied and the
- genomic region is the complete genomic space.
-
- Raises
- ------
-
- ValueError
- for invalid or out of bounds regions.
-
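- For example, the following two calls are equivalent (the
- reference name is illustrative)::
-
- bam.parse_region("chr1", 99, 200)
- bam.parse_region(region="chr1:100-200")
-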
- """
- cdef int rtid
- cdef long long rstart
- cdef long long rend
-
- rtid = -1
- rstart = 0
- rend = MAX_POS
- if start is not None:
- try:
- rstart = start
- except OverflowError:
- raise ValueError('start out of range (%i)' % start)
-
- if end is not None:
- try:
- rend = end
- except OverflowError:
- raise ValueError('end out of range (%i)' % end)
-
- if region:
- region = force_str(region)
- parts = re.split("[:-]", region)
- reference = parts[0]
- if len(parts) >= 2:
- rstart = int(parts[1]) - 1
- if len(parts) >= 3:
- rend = int(parts[2])
-
- if not reference:
- return 0, 0, 0, 0
-
- if tid is not None:
- rtid = tid
- else:
- rtid = self.gettid(reference)
-
- if rtid < 0:
- raise ValueError(
- "invalid reference `%s`" % reference)
- if rstart > rend:
- raise ValueError(
- 'invalid coordinates: start (%i) > end (%i)' % (rstart, rend))
- if not 0 <= rstart < MAX_POS:
- raise ValueError('start out of range (%i)' % rstart)
- if not 0 <= rend <= MAX_POS:
- raise ValueError('end out of range (%i)' % rend)
-
- return 1, rtid, rstart, rend
-
- def fetch(self,
- reference=None,
- start=None,
- end=None,
- region=None,
- tid=None,
- until_eof=False,
- multiple_iterators=False):
- """fetch reads aligned in a :term:`region`.
-
- See :meth:`AlignmentFile.parse_region` for more information
- on genomic regions.
-
- Without a `reference` or `region` all mapped reads in the file
- will be fetched. The reads will be returned ordered by reference
- sequence, which will not necessarily be the order within the
- file. This mode of iteration still requires an index. If there is
- no index, use `until_eof=True`.
-
- If only `reference` is set, all reads aligned to `reference`
- will be fetched.
-
- A :term:`SAM` file does not allow random access. If `region`
- or `reference` are given, an exception is raised.
-
- Parameters
- ----------
-
- until_eof : bool
-
- If `until_eof` is True, all reads from the current file
- position will be returned in order as they are within the
- file. Using this option will also fetch unmapped reads.
-
- multiple_iterators : bool
-
- If `multiple_iterators` is True, multiple
- iterators on the same file can be used at the same time. The
- iterator returned will receive its own copy of a filehandle to
- the file, effectively re-opening the file. Re-opening a file
- creates some overhead, so beware.
-
- Returns
- -------
-
- An iterator over a collection of reads.
-
- Raises
- ------
-
- ValueError
- if the genomic coordinates are out of range or invalid or the
- file does not permit random access to genomic coordinates.
-
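- For example, a sketch (file and reference names are
- illustrative)::
-
- bam = pysam.AlignmentFile("ex1.bam", "rb")
- for read in bam.fetch("chr1", 100, 200):
-     print(read.query_name)
-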
- """
- cdef int rtid, rstart, rend, has_coord
-
- if not self.is_open():
- raise ValueError( "I/O operation on closed file" )
-
- has_coord, rtid, rstart, rend = self.parse_region(
- reference,
- start,
- end,
- region,
- tid)
-
- # Turn off re-opening if htsfile is a stream
- if self.is_stream:
- multiple_iterators = False
-
- if self.is_bam or self.is_cram:
- if not until_eof and not self.is_remote:
- if not self.has_index():
- raise ValueError(
- "fetch called on bamfile without index")
-
- if has_coord:
- return IteratorRowRegion(
- self, rtid, rstart, rend,
- multiple_iterators=multiple_iterators)
- else:
- if until_eof:
- return IteratorRowAll(
- self,
- multiple_iterators=multiple_iterators)
- else:
- # AH: check - reason why no multiple_iterators for
- # AllRefs?
- return IteratorRowAllRefs(
- self,
- multiple_iterators=multiple_iterators)
- else:
- if has_coord:
- raise ValueError(
- "fetching by region is not available for sam files")
-
- if self.header == NULL:
- raise ValueError(
- "fetch called for htsfile without header")
-
- # check if targets are defined
- # give warning, sam_read1 segfaults
- if self.header.n_targets == 0:
- warnings.warn("fetch called for htsfile without header")
-
- return IteratorRowAll(self,
- multiple_iterators=multiple_iterators)
-
- def head(self, n, multiple_iterators=True):
- '''return an iterator over the first n alignments.
-
- This iterator is useful for inspecting the bam-file.
-
- Parameters
- ----------
-
- multiple_iterators : bool
-
- is set to True by default in order to
- avoid changing the current file position.
-
- Returns
- -------
-
- an iterator over a collection of reads
-
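- For example, a minimal sketch::
-
- for read in bam.head(10):
-     print(read.query_name)
-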
- '''
- return IteratorRowHead(self, n,
- multiple_iterators=multiple_iterators)
-
- def mate(self, AlignedSegment read):
- '''return the mate of :class:`~pysam.AlignedSegment` `read`.
-
- .. note::
-
- Calling this method will change the file position.
- This might interfere with any iterators that have
- not re-opened the file.
-
- .. note::
-
- This method is too slow for high-throughput processing.
- If a read needs to be processed with its mate, work
- from a read name sorted file or, better, cache reads.
-
- Returns
- -------
-
- :class:`~pysam.AlignedSegment` : the mate
-
- Raises
- ------
-
- ValueError
- if the read is unpaired or the mate is unmapped
-
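- A sketch that guards against a missing mate::
-
- try:
-     mate = bam.mate(read)
- except ValueError:
-     mate = None
-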
- '''
- cdef uint32_t flag = read._delegate.core.flag
-
- if flag & BAM_FPAIRED == 0:
- raise ValueError("read %s: is unpaired" %
- (read.query_name))
- if flag & BAM_FMUNMAP != 0:
- raise ValueError("mate %s: is unmapped" %
- (read.query_name))
-
- # xor flags to get the other mate
- cdef int x = BAM_FREAD1 + BAM_FREAD2
- flag = (flag ^ x) & x
-
- # Make sure to use a separate file to jump around
- # to mate as otherwise the original file position
- # will be lost
- # The following code is not using the C API and
- # could thus be made much quicker, for example
- # by using tell and seek.
- for mate in self.fetch(
- read._delegate.core.mpos,
- read._delegate.core.mpos + 1,
- tid=read._delegate.core.mtid,
- multiple_iterators=True):
- if mate.flag & flag != 0 and \
- mate.query_name == read.query_name:
- break
- else:
- raise ValueError("mate not found")
-
- return mate
-
- def pileup(self,
- reference=None,
- start=None,
- end=None,
- region=None,
- **kwargs):
- """perform a :term:`pileup` within a :term:`region`. The region is
- specified by :term:`reference`, 'start' and 'end' (using
- 0-based indexing). Alternatively, a samtools 'region' string
- can be supplied.
-
- Without 'reference' or 'region' all reads will be used for the
- pileup. The reads will be returned ordered by
- :term:`reference` sequence, which will not necessarily be the
- order within the file.
-
- Note that :term:`SAM` formatted files do not allow random
- access. In these files, if a 'region' or 'reference' are
- given an exception is raised.
-
- .. note::
-
- 'all' reads which overlap the region are returned. The
- first base returned will be the first base of the first
- read, 'not' necessarily the first base of the region used
- in the query.
-
- Parameters
- ----------
-
- stepper : string
- The stepper controls how the iterator advances.
- Possible options for the stepper are
-
- ``all``
- skip reads in which any of the following flags are set:
- BAM_FUNMAP, BAM_FSECONDARY, BAM_FQCFAIL, BAM_FDUP
-
- ``nofilter``
- uses every single read
-
- ``samtools``
- same filter and read processing as in :term:`csamtools`
- pileup. This requires a 'fastafile' to be given.
-
-
- fastafile : :class:`~pysam.FastaFile` object.
-
- This is required for some of the steppers.
-
- max_depth : int
- Maximum read depth permitted. The default limit is '8000'.
-
- truncate : bool
-
- By default, the samtools pileup engine outputs all reads
- overlapping a region. If truncate is True and a region is
- given, only columns in the exact region specified are
- returned.
-
- Returns
- -------
-
- an iterator over genomic positions.
-
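- A minimal usage sketch (file name and region are illustrative
- only)::
-
-     with pysam.AlignmentFile("ex1.bam", "rb") as samfile:
-         for column in samfile.pileup("chr1", 100, 120):
-             print(column.pos, column.n)
-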
- """
- cdef int rtid, rstart, rend, has_coord
-
- if not self.is_open():
- raise ValueError("I/O operation on closed file")
-
- has_coord, rtid, rstart, rend = self.parse_region(
- reference, start, end, region)
-
- if self.is_bam or self.is_cram:
- if not self.has_index():
- raise ValueError("no index available for pileup")
-
- if has_coord:
- return IteratorColumnRegion(self,
- tid=rtid,
- start=rstart,
- end=rend,
- **kwargs )
- else:
- return IteratorColumnAllRefs(self, **kwargs )
-
- else:
- raise NotImplementedError(
- "pileup of samfiles not implemented yet")
-
- def count(self,
- reference=None,
- start=None,
- end=None,
- region=None,
- until_eof=False,
- read_callback="nofilter"):
- '''count the number of reads in :term:`region`
-
- The region is specified by :term:`reference`, `start` and
- `end`. Alternatively, a :term:`samtools` :term:`region` string
- can be supplied.
-
- A :term:`SAM` file does not allow random access and if
- `region` or `reference` are given, an exception is raised.
-
- Parameters
- ----------
-
- reference : string
- reference_name of the genomic region (chromosome)
-
- start : int
- start of the genomic region
-
- end : int
- end of the genomic region
-
- region : string
- a region string in samtools format.
-
- until_eof : bool
- count until the end of the file, possibly including
- unmapped reads as well.
-
- read_callback: string or function
-
- select a call-back to ignore reads when counting. It can
- be either a string with the following values:
-
- ``all``
- skip reads in which any of the following
- flags are set: BAM_FUNMAP, BAM_FSECONDARY, BAM_FQCFAIL,
- BAM_FDUP
-
- ``nofilter``
- uses every single read
-
- Alternatively, `read_callback` can be a function
- ``check_read(read)`` that should return True only for
- those reads that shall be included in the counting.
-
- Raises
- ------
-
- ValueError
- if the genomic coordinates are out of range or invalid.
-
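- A minimal usage sketch (file name, region and callback are
- illustrative only)::
-
-     with pysam.AlignmentFile("ex1.bam", "rb") as samfile:
-         n = samfile.count(
-             "chr1", 100, 200,
-             read_callback=lambda read: not read.is_duplicate)
-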
- '''
- cdef AlignedSegment read
- cdef long counter = 0
-
- if not self.is_open():
- raise ValueError( "I/O operation on closed file" )
-
- cdef int filter_method = 0
- if read_callback == "all":
- filter_method = 1
- elif read_callback == "nofilter":
- filter_method = 2
-
- for read in self.fetch(reference=reference,
- start=start,
- end=end,
- region=region,
- until_eof=until_eof):
- # apply filter
- if filter_method == 1:
- # filter = "all"
- if (read.flag & (0x4 | 0x100 | 0x200 | 0x400)):
- continue
- elif filter_method == 2:
- # filter = "nofilter"
- pass
- else:
- if not read_callback(read):
- continue
- counter += 1
-
- return counter
-
- @cython.boundscheck(False) # we do manual bounds checking
- def count_coverage(self,
- reference=None,
- start=None,
- end=None,
- region=None,
- quality_threshold=15,
- read_callback='all'):
- """count the coverage of genomic positions by reads in :term:`region`.
-
- The region is specified by :term:`reference`, `start` and
- `end`. Alternatively, a :term:`samtools` :term:`region` string
- can be supplied. The coverage is computed per-base [ACGT].
-
- Parameters
- ----------
-
- reference : string
- reference_name of the genomic region (chromosome)
-
- start : int
- start of the genomic region
-
- end : int
- end of the genomic region
-
- region : string
- a region string.
-
- quality_threshold : int
- quality_threshold is the minimum quality score (in phred) a
- base has to reach to be counted.
-
- read_callback: string or function
-
- select a call-back to ignore reads when counting. It can
- be either a string with the following values:
-
- ``all``
- skip reads in which any of the following
- flags are set: BAM_FUNMAP, BAM_FSECONDARY, BAM_FQCFAIL,
- BAM_FDUP
-
- ``nofilter``
- uses every single read
-
- Alternatively, `read_callback` can be a function
- ``check_read(read)`` that should return True only for
- those reads that shall be included in the counting.
-
- Raises
- ------
-
- ValueError
- if the genomic coordinates are out of range or invalid.
-
- Returns
- -------
-
- tuple : four array.arrays of the same length, in order A, C, G, T
-
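- A minimal usage sketch (file name and region are illustrative
- only)::
-
-     with pysam.AlignmentFile("ex1.bam", "rb") as samfile:
-         count_a, count_c, count_g, count_t = samfile.count_coverage(
-             "chr1", 100, 200)
-         depth = [a + c + g + t for a, c, g, t in
-                  zip(count_a, count_c, count_g, count_t)]
-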
- """
-
- cdef int _start = start
- cdef int _stop = end
- cdef int length = _stop - _start
- cdef c_array.array int_array_template = array.array('L', [])
- cdef c_array.array count_a
- cdef c_array.array count_c
- cdef c_array.array count_g
- cdef c_array.array count_t
- count_a = c_array.clone(int_array_template, length, zero=True)
- count_c = c_array.clone(int_array_template, length, zero=True)
- count_g = c_array.clone(int_array_template, length, zero=True)
- count_t = c_array.clone(int_array_template, length, zero=True)
-
- cdef AlignedSegment read
- cdef cython.str seq
- cdef c_array.array quality
- cdef int qpos
- cdef int refpos
- cdef int c = 0
- cdef int filter_method = 0
- if read_callback == "all":
- filter_method = 1
- elif read_callback == "nofilter":
- filter_method = 2
-
- cdef int _threshold = quality_threshold
- for read in self.fetch(reference=reference,
- start=start,
- end=end,
- region=region):
- # apply filter
- if filter_method == 1:
- # filter = "all"
- if (read.flag & (0x4 | 0x100 | 0x200 | 0x400)):
- continue
- elif filter_method == 2:
- # filter = "nofilter"
- pass
- else:
- if not read_callback(read):
- continue
-
- # count
- seq = read.seq
- quality = read.query_qualities
- for qpos, refpos in read.get_aligned_pairs(True):
- if qpos is not None and refpos is not None and \
- _start <= refpos < _stop:
- if quality[qpos] >= _threshold:
- if seq[qpos] == 'A':
- count_a.data.as_ulongs[refpos - _start] += 1
- if seq[qpos] == 'C':
- count_c.data.as_ulongs[refpos - _start] += 1
- if seq[qpos] == 'G':
- count_g.data.as_ulongs[refpos - _start] += 1
- if seq[qpos] == 'T':
- count_t.data.as_ulongs[refpos - _start] += 1
-
- return count_a, count_c, count_g, count_t
-
- def close(self):
- '''
- closes the :class:`pysam.AlignmentFile`.'''
- if self.htsfile != NULL:
- hts_close(self.htsfile)
- hts_idx_destroy(self.index)
- self.htsfile = NULL
-
- def __dealloc__(self):
- # remember: dealloc cannot call other methods
- # note: no doc string
- # note: __del__ is not called.
-
- # FIXME[kbj]: isn't self.close a method? I've been duplicating
- # close within __dealloc__ (see BCFFile.__dealloc__). Not a pretty
- # solution and perhaps unnecessary given that calling self.close has
- # been working for years.
- # AH: I have removed the call to close. Even though it is working,
- # it seems to be dangerous according to the documentation as the
- # object may be partially deconstructed already.
- if self.htsfile != NULL:
- hts_close(self.htsfile)
- hts_idx_destroy(self.index)
- self.htsfile = NULL
-
- bam_destroy1(self.b)
- if self.header != NULL:
- bam_hdr_destroy(self.header)
-
- cpdef int write(self, AlignedSegment read) except -1:
- '''
- write a single :class:`pysam.AlignedSegment` to disk.
-
- Raises
- ------
- ValueError
- if the writing failed
-
- Returns
- -------
-
- int : the number of bytes written. If the file is closed,
- this will be 0.
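-
- A minimal usage sketch for copying reads between files (file
- names are illustrative only)::
-
-     with pysam.AlignmentFile("in.bam", "rb") as inf, \
-          pysam.AlignmentFile("out.bam", "wb", template=inf) as outf:
-         for read in inf:
-             outf.write(read)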
- '''
- if not self.is_open():
- return 0
-
- cdef int ret
-
- with nogil:
- ret = sam_write1(self.htsfile,
- self.header,
- read._delegate)
-
- # kbj: Still need to raise an exception with except -1. Otherwise
- # when ret == -1 we get a "SystemError: error return without
- # exception set".
- if ret < 0:
- raise ValueError('sam write failed')
-
- return ret
-
- # context manager interface
- def __enter__(self):
- return self
-
- def __exit__(self, exc_type, exc_value, traceback):
- self.close()
- return False
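-
- # The context manager protocol above allows usage such as
- # (file name illustrative only):
- #
- #     with AlignmentFile("ex1.bam", "rb") as samfile:
- #         for read in samfile:
- #             ...
- #
- # so that close() is called even when an exception is raised.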
-
- ###############################################################
- ###############################################################
- ###############################################################
- ## properties
- ###############################################################
- property closed:
- """bool indicating the current state of the file object.
- This is a read-only attribute; the close() method changes the value.
- """
- def __get__(self):
- return not self.is_open()
-
- property filename:
- """filename associated with this object. This is a read-only attribute."""
- def __get__(self):
- return self._filename
-
- property nreferences:
- """"int with the number of :term:`reference` sequences in the file.
- This is a read-only attribute."""
- def __get__(self):
- if not self.is_open():
- raise ValueError("I/O operation on closed file")
- return self.header.n_targets
-
- property references:
- """tuple with the names of :term:`reference` sequences. This is a
- read-only attribute"""
- def __get__(self):
- if not self.is_open(): raise ValueError( "I/O operation on closed file" )
- t = []
- for x from 0 <= x < self.header.n_targets:
- t.append(charptr_to_str(self.header.target_name[x]))
- return tuple(t)
-
- property lengths:
- """tuple of the lengths of the :term:`reference` sequences. This is a
- read-only attribute. The lengths are in the same order as
- :attr:`pysam.AlignmentFile.references`
-
- """
- def __get__(self):
- if not self.is_open():
- raise ValueError("I/O operation on closed file")
- t = []
- for x from 0 <= x < self.header.n_targets:
- t.append(self.header.target_len[x])
- return tuple(t)
-
- property mapped:
- """int with total number of mapped alignments according to the
- statistics recorded in the index. This is a read-only
- attribute.
- """
- def __get__(self):
- self.check_index()
- cdef int tid
- cdef uint64_t total = 0
- cdef uint64_t mapped, unmapped
- for tid from 0 <= tid < self.header.n_targets:
- with nogil:
- hts_idx_get_stat(self.index, tid, &mapped, &unmapped)
- total += mapped
- return total
-
- property unmapped:
- """int with total number of unmapped reads according to the statistics
- recorded in the index. This number of reads includes the number of reads
- without coordinates. This is a read-only attribute.
- """
- def __get__(self):
- self.check_index()
- cdef int tid
- cdef uint64_t total = hts_idx_get_n_no_coor(self.index)
- cdef uint64_t mapped, unmapped
- for tid from 0 <= tid < self.header.n_targets:
- with nogil:
- hts_idx_get_stat(self.index, tid, &mapped, &unmapped)
- total += unmapped
- return total
-
- property nocoordinate:
- """int with total number of reads without coordinates according to the
- statistics recorded in the index. This is a read-only attribute.
- """
- def __get__(self):
- self.check_index()
- cdef uint64_t n
- with nogil:
- n = hts_idx_get_n_no_coor(self.index)
- return n
-
- property format:
- '''string describing the file format'''
- def __get__(self):
- if not self.is_open():
- raise ValueError( "I/O operation on closed file" )
- return hts_format_description(&self.htsfile.format)
-
- property text:
- '''string with the full contents of the :term:`sam file` header as a
- string.
-
- This is a read-only attribute.
-
- See :attr:`pysam.AlignmentFile.header` to get a parsed
- representation of the header.
- '''
- def __get__(self):
- if not self.is_open():
- raise ValueError( "I/O operation on closed file" )
- return from_string_and_size(self.header.text, self.header.l_text)
-
- property header:
- """two-level dictionay with header information from the file.
-
- This is a read-only attribute.
-
- The first level contains the record (``HD``, ``SQ``, etc) and
- the second level contains the fields (``VN``, ``LN``, etc).
-
- The parser is validating and will raise an AssertionError if
- it encounters any record or field tags that are not part of
- the SAM specification. Use the
- :attr:`pysam.AlignmentFile.text` attribute to get the unparsed
- header.
-
- The parsing follows the SAM format specification with the
- exception of the ``CL`` field. This option will consume the
- rest of a header line irrespective of any additional fields.
- This behaviour has been added to accommodate command line
- options that contain characters that are not valid field
- separators.
-
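- A minimal access sketch, assuming ``samfile`` is an open
- :class:`~pysam.AlignmentFile` (tag values are illustrative
- only)::
-
-     header = samfile.header
-     version = header["HD"]["VN"]
-     first_contig = header["SQ"][0]["SN"]
-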
- """
- def __get__(self):
- if not self.is_open():
- raise ValueError( "I/O operation on closed file" )
-
- result = {}
-
- if self.header.text != NULL:
- # convert to python string (note: call self.text to
- # create 0-terminated string)
- t = self.text
- for line in t.split("\n"):
- if not line.strip(): continue
- assert line.startswith("@"), \
- "header line without '@': '%s'" % line
- fields = line[1:].split("\t")
- record = fields[0]
- assert record in VALID_HEADER_TYPES, \
- "header line with invalid type '%s': '%s'" % (record, line)
-
- # treat comments
- if record == "CO":
- if record not in result:
- result[record] = []
- result[record].append("\t".join( fields[1:]))
- continue
- # the following is clumsy as generators do not work?
- x = {}
-
- for idx, field in enumerate(fields[1:]):
- if ":" not in field:
- raise ValueError("malformatted header: no ':' in field" )
- key, value = field.split(":", 1)
- if key in ("CL",):
- # special treatment for command line
- # statements (CL). These might contain
- # characters that are non-conformant with
- # the valid field separators in the SAM
- # header. Thus, in contravention to the
- # SAM API, consume the rest of the line.
- key, value = "\t".join(fields[idx+1:]).split(":", 1)
- x[key] = KNOWN_HEADER_FIELDS[record][key](value)
- break
-
- # interpret type of known header record tags, default to str
- x[key] = KNOWN_HEADER_FIELDS[record].get(key, str)(value)
-
- if VALID_HEADER_TYPES[record] == dict:
- if record in result:
- raise ValueError(
- "multiple '%s' lines are not permitted" % record)
-
- result[record] = x
- elif VALID_HEADER_TYPES[record] == list:
- if record not in result: result[record] = []
- result[record].append(x)
-
- # if there are no SQ lines in the header, add the
- # reference names from the information in the bam
- # file.
- #
- # Background: c-samtools keeps the textual part of the
- # header separate from the list of reference names and
- # lengths. Thus, if a header contains only SQ lines,
- # the SQ information is not part of the textual header
- # and thus are missing from the output. See issue 84.
- if "SQ" not in result:
- sq = []
- for ref, length in zip(self.references, self.lengths):
- sq.append({'LN': length, 'SN': ref })
- result["SQ"] = sq
-
- return result
-
- ###############################################################
- ## file-object like iterator access
- ## note: concurrent access will cause errors (see IteratorRow
- ## and multiple_iterators)
- ## Possible solutions: deprecate or open new file handle
- def __iter__(self):
- if not self.is_open():
- raise ValueError("I/O operation on closed file")
-
- if not self.is_bam and self.header.n_targets == 0:
- raise NotImplementedError(
- "can not iterate over samfile without header")
- return self
-
- cdef bam1_t * getCurrent( self ):
- return self.b
-
- cdef int cnext(self):
- '''
- cversion of iterator. Used by :class:`pysam.AlignmentFile.IteratorColumn`.
- '''
- cdef int ret
- with nogil:
- ret = sam_read1(self.htsfile,
- self.header,
- self.b)
- return ret
-
- def __next__(self):
- cdef int ret = self.cnext()
- if (ret >= 0):
- return makeAlignedSegment(self.b, self)
- elif ret == -2:
- raise IOError('truncated file')
- else:
- raise StopIteration
-
- # Compatibility functions for pysam < 0.8.3
- def gettid(self, reference):
- """deprecated, use get_tid() instead"""
- return self.get_tid(reference)
-
- def getrname(self, tid):
- """deprecated, use get_reference_name() instead"""
- return self.get_reference_name(tid)
-
-
-cdef class IteratorRow:
- '''abstract base class for iterators over mapped reads.
-
- Various iterators implement different behaviours for wrapping around
- contig boundaries. Examples include:
-
- :class:`pysam.IteratorRowRegion`
- iterate within a single contig and a defined region.
-
- :class:`pysam.IteratorRowAll`
- iterate until EOF. This iterator will also include unmapped reads.
-
- :class:`pysam.IteratorRowAllRefs`
- iterate over all reads in all reference sequences.
-
- The method :meth:`AlignmentFile.fetch` returns an IteratorRow.
-
- .. note::
-
- It is usually not necessary to create an object of this class
- explicitly. It is returned as a result of call to a
- :meth:`AlignmentFile.fetch`.
-
- '''
-
- def __init__(self, AlignmentFile samfile, int multiple_iterators=False):
- cdef char *cfilename
- cdef char *creference_filename
-
- if not samfile.is_open():
- raise ValueError("I/O operation on closed file")
-
- # makes sure that samfile stays alive as long as the
- # iterator is alive
- self.samfile = samfile
-
- # reopen the file - note that this makes the iterator
- # slow and causes pileup to slow down significantly.
- if multiple_iterators:
- cfilename = samfile._filename
- with nogil:
- self.htsfile = hts_open(cfilename, 'r')
- assert self.htsfile != NULL
- # read header - required for accurate positioning
- # could a tell/seek work?
- with nogil:
- self.header = sam_hdr_read(self.htsfile)
- assert self.header != NULL
- self.owns_samfile = True
- # options specific to CRAM files
- if samfile.is_cram and samfile._reference_filename:
- creference_filename = samfile._reference_filename
- hts_set_opt(self.htsfile,
- CRAM_OPT_REFERENCE,
- creference_filename)
-
- else:
- self.htsfile = self.samfile.htsfile
- self.owns_samfile = False
- self.header = self.samfile.header
-
- self.retval = 0
-
- self.b = bam_init1()
-
- def __dealloc__(self):
- bam_destroy1(self.b)
- if self.owns_samfile:
- hts_close(self.htsfile)
- bam_hdr_destroy(self.header)
-
-
-cdef class IteratorRowRegion(IteratorRow):
- """*(AlignmentFile samfile, int tid, int beg, int end,
- int multiple_iterators=False)*
-
- iterate over mapped reads in a region.
-
- .. note::
-
- It is usually not necessary to create an object of this class
- explicitly. It is returned as a result of call to a
- :meth:`AlignmentFile.fetch`.
-
- """
-
- def __init__(self, AlignmentFile samfile,
- int tid, int beg, int end,
- int multiple_iterators=False):
-
- IteratorRow.__init__(self, samfile,
- multiple_iterators=multiple_iterators)
-
- if not samfile.has_index():
- raise ValueError("no index available for iteration")
-
- with nogil:
- self.iter = sam_itr_queryi(
- self.samfile.index,
- tid,
- beg,
- end)
-
- def __iter__(self):
- return self
-
- cdef bam1_t * getCurrent(self):
- return self.b
-
- cdef int cnext(self):
- '''cversion of iterator. Used by IteratorColumn'''
- with nogil:
- self.retval = hts_itr_next(hts_get_bgzfp(self.htsfile),
- self.iter,
- self.b,
- self.htsfile)
-
- def __next__(self):
- self.cnext()
- if self.retval >= 0:
- return makeAlignedSegment(self.b, self.samfile)
- elif self.retval == -2:
- # Note: it is currently not the case that hts_iter_next
- # returns -2 for a truncated file.
- # See https://github.com/pysam-developers/pysam/pull/50#issuecomment-64928625
- raise IOError('truncated file')
- else:
- raise StopIteration
-
- def __dealloc__(self):
- hts_itr_destroy(self.iter)
-
-
-cdef class IteratorRowHead(IteratorRow):
- """*(AlignmentFile samfile, n, int multiple_iterators=False)*
-
- iterate over first n reads in `samfile`
-
- .. note::
- It is usually not necessary to create an object of this class
- explicitly. It is returned as a result of call to a
- :meth:`AlignmentFile.head`.
-
- """
-
- def __init__(self, AlignmentFile samfile, int n,
- int multiple_iterators=False):
-
- IteratorRow.__init__(self, samfile,
- multiple_iterators=multiple_iterators)
-
- self.max_rows = n
- self.current_row = 0
-
- def __iter__(self):
- return self
-
- cdef bam1_t * getCurrent( self ):
- return self.b
-
- cdef int cnext(self):
- '''cversion of iterator. Used by IteratorColumn'''
- cdef int ret
- with nogil:
- ret = sam_read1(self.htsfile,
- self.samfile.header,
- self.b)
- return ret
-
- def __next__(self):
- if self.current_row >= self.max_rows:
- raise StopIteration
-
- cdef int ret = self.cnext()
- if ret >= 0:
- self.current_row += 1
- return makeAlignedSegment(self.b, self.samfile)
- elif ret == -2:
- raise IOError('truncated file')
- else:
- raise StopIteration
-
-
-cdef class IteratorRowAll(IteratorRow):
- """*(AlignmentFile samfile, int multiple_iterators=False)*
-
- iterate over all reads in `samfile`
-
- .. note::
-
- It is usually not necessary to create an object of this class
- explicitly. It is returned as a result of call to a
- :meth:`AlignmentFile.fetch`.
-
- """
-
- def __init__(self, AlignmentFile samfile,
- int multiple_iterators=False):
-
- IteratorRow.__init__(self, samfile,
- multiple_iterators=multiple_iterators)
-
- def __iter__(self):
- return self
-
- cdef bam1_t * getCurrent( self ):
- return self.b
-
- cdef int cnext(self):
- '''cversion of iterator. Used by IteratorColumn'''
- cdef int ret
- with nogil:
- ret = sam_read1(self.htsfile,
- self.samfile.header,
- self.b)
- return ret
-
- def __next__(self):
- cdef int ret = self.cnext()
- if ret >= 0:
- return makeAlignedSegment(self.b, self.samfile)
- elif ret == -2:
- raise IOError('truncated file')
- else:
- raise StopIteration
-
-
-cdef class IteratorRowAllRefs(IteratorRow):
- """iterates over all mapped reads by chaining iterators over each
- reference
-
- .. note::
- It is usually not necessary to create an object of this class
- explicitly. It is returned as a result of call to a
- :meth:`AlignmentFile.fetch`.
-
- """
-
- def __init__(self, AlignmentFile samfile,
- multiple_iterators=False):
-
- IteratorRow.__init__(self, samfile,
- multiple_iterators=multiple_iterators)
-
- if not samfile.has_index():
- raise ValueError("no index available for fetch")
-
- self.tid = -1
-
- def nextiter(self):
- # get a new iterator for a chromosome. The file
- # will not be re-opened.
- self.rowiter = IteratorRowRegion(self.samfile,
- self.tid,
- 0,
- 1<<29)
- # set htsfile and header of the rowiter
- # to the values in this iterator to reflect multiple_iterators
- self.rowiter.htsfile = self.htsfile
- self.rowiter.header = self.header
-
- # make sure the iterator understand that IteratorRowAllRefs
- # has ownership
- self.rowiter.owns_samfile = False
-
- def __iter__(self):
- return self
-
- def __next__(self):
- # Create an initial iterator
- if self.tid == -1:
- if not self.samfile.nreferences:
- raise StopIteration
- self.tid = 0
- self.nextiter()
-
- while 1:
- self.rowiter.cnext()
-
- # If current iterator is not exhausted, return aligned read
- if self.rowiter.retval > 0:
- return makeAlignedSegment(self.rowiter.b, self.samfile)
-
- self.tid += 1
-
- # Otherwise, proceed to next reference or stop
- if self.tid < self.samfile.nreferences:
- self.nextiter()
- else:
- raise StopIteration
-
-
-cdef class IteratorRowSelection(IteratorRow):
- """*(AlignmentFile samfile)*
-
- iterate over reads in `samfile` at a given list of file positions.
-
- .. note::
- It is usually not necessary to create an object of this class
- explicitly. It is returned as a result of call to a :meth:`AlignmentFile.fetch`.
- """
-
- def __init__(self, AlignmentFile samfile, positions, int multiple_iterators=True):
-
- IteratorRow.__init__(self, samfile, multiple_iterators=multiple_iterators)
-
- self.positions = positions
- self.current_pos = 0
-
- def __iter__(self):
- return self
-
- cdef bam1_t * getCurrent(self):
- return self.b
-
- cdef int cnext(self):
- '''cversion of iterator'''
- # end iteration if out of positions
- if self.current_pos >= len(self.positions): return -1
-
- cdef uint64_t pos = self.positions[self.current_pos]
- with nogil:
- bgzf_seek(hts_get_bgzfp(self.htsfile),
- pos,
- 0)
- self.current_pos += 1
-
- cdef int ret
- with nogil:
- ret = sam_read1(self.htsfile,
- self.samfile.header,
- self.b)
- return ret
-
- def __next__(self):
- cdef int ret = self.cnext()
- if (ret >= 0):
- return makeAlignedSegment(self.b, self.samfile)
- elif (ret == -2):
- raise IOError('truncated file')
- else:
- raise StopIteration
-
-
-cdef int __advance_nofilter(void *data, bam1_t *b):
- '''advance without any read filtering.
- '''
- cdef __iterdata * d
- d = <__iterdata*>data
- cdef int ret
- with nogil:
- ret = sam_itr_next(d.htsfile, d.iter, b)
- return ret
-
-
-cdef int __advance_all(void *data, bam1_t *b):
- '''only use reads for pileup passing basic
- filters:
-
- BAM_FUNMAP, BAM_FSECONDARY, BAM_FQCFAIL, BAM_FDUP
- '''
-
- cdef __iterdata * d
- cdef int mask = BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP
- d = <__iterdata*>data
- cdef int ret
- with nogil:
- ret = sam_itr_next(d.htsfile, d.iter, b)
- while ret >= 0 and b.core.flag & mask:
- with nogil:
- ret = sam_itr_next(d.htsfile, d.iter, b)
- return ret
-
-
-cdef int __advance_snpcalls(void * data, bam1_t * b):
- '''advance using same filter and read processing as in
- the samtools pileup.
- '''
-
- # Note that this method requires access to some
- # functions in the samtools code base and is thus
- # not htslib only.
- # The functions accessed in samtools are:
- # 1. bam_prob_realn
- # 2. bam_cap_mapQ
- cdef __iterdata * d
- d = <__iterdata*>data
-
- cdef int ret
- cdef int skip = 0
- cdef int q
- cdef int is_cns = 1
- cdef int is_nobaq = 0
- cdef int capQ_thres = 0
-
- with nogil:
- ret = sam_itr_next(d.htsfile, d.iter, b)
-
- # reload sequence
- if d.fastafile != NULL and b.core.tid != d.tid:
- if d.seq != NULL:
- free(d.seq)
- d.tid = b.core.tid
- with nogil:
- d.seq = faidx_fetch_seq(
- d.fastafile,
- d.header.target_name[d.tid],
- 0, MAX_POS,
- &d.seq_len)
-
- if d.seq == NULL:
- raise ValueError(
- "reference sequence for '%s' (tid=%i) not found" % \
- (d.header.target_name[d.tid],
- d.tid))
-
- while ret >= 0:
- skip = 0
-
- # realign read - changes base qualities
- if d.seq != NULL and is_cns and not is_nobaq:
- bam_prob_realn(b, d.seq)
-
- if d.seq != NULL and capQ_thres > 10:
- q = bam_cap_mapQ(b, d.seq, capQ_thres)
- if q < 0:
- skip = 1
- elif b.core.qual > q:
- b.core.qual = q
- if b.core.flag & BAM_FUNMAP:
- skip = 1
- elif b.core.flag & 1 and not b.core.flag & 2:
- skip = 1
-
- if not skip:
- break
- # additional filters
-
- with nogil:
- ret = sam_itr_next(d.htsfile, d.iter, b)
-
- return ret
-
-cdef class IteratorColumn:
- '''abstract base class for iterators over columns.
-
- IteratorColumn objects wrap the pileup functionality of samtools.
-
- For reasons of efficiency, the iterator points to the current
- pileup buffer. The pileup buffer is updated at every iteration.
- This might cause some unexpected behaviour. For example,
- consider the conversion to a list::
-
- f = AlignmentFile("file.bam", "rb")
- result = list( f.pileup() )
-
- Here, ``result`` will contain ``n`` objects of type
- :class:`~pysam.PileupColumn` for ``n`` columns, but each object in
- ``result`` will contain the same information.
-
- The desired behaviour can be achieved by list comprehension::
-
- result = [ x.pileups() for x in f.pileup() ]
-
- ``result`` will be a list of ``n`` lists of objects of type
- :class:`~pysam.PileupRead`.
-
- If the iterator is associated with a :class:`~pysam.Fastafile` using the
- :meth:`addReference` method, then the iterator will export the
- current sequence via the methods :meth:`getSequence` and
- :meth:`seq_len`.
-
- Optional kwargs to the iterator:
-
- stepper
- The stepper controls how the iterator advances.
-
- Valid values are None, "all" (default), "nofilter" or "samtools".
-
- See AlignmentFile.pileup for description.
-
- fastafile
- A :class:`~pysam.FastaFile` object
-
- max_depth
- maximum read depth. The default is 8000.
-
- '''
-
- def __cinit__( self, AlignmentFile samfile, **kwargs ):
- self.samfile = samfile
- self.fastafile = kwargs.get("fastafile", None)
- self.stepper = kwargs.get("stepper", None)
- self.max_depth = kwargs.get("max_depth", 8000)
- self.iterdata.seq = NULL
- self.tid = 0
- self.pos = 0
- self.n_plp = 0
- self.plp = NULL
- self.pileup_iter = <bam_plp_t>NULL
-
- def __iter__(self):
- return self
-
- cdef int cnext(self):
- '''perform next iteration.
- '''
- # do not release gil here because of call-backs
- self.plp = bam_plp_auto(self.pileup_iter,
- &self.tid,
- &self.pos,
- &self.n_plp)
-
- cdef char * getSequence(self):
- '''return current reference sequence underlying the iterator.
- '''
- return self.iterdata.seq
-
- property seq_len:
- '''current sequence length.'''
- def __get__(self):
- return self.iterdata.seq_len
-
- def addReference(self, Fastafile fastafile):
- '''
- add reference sequences in `fastafile` to iterator.'''
- self.fastafile = fastafile
- if self.iterdata.seq != NULL:
- free(self.iterdata.seq)
- self.iterdata.tid = -1
- self.iterdata.fastafile = self.fastafile.fastafile
-
- def hasReference(self):
- '''
- return true if iterator is associated with a reference'''
- return self.fastafile
-
- cdef setMask(self, mask):
- '''set masking flag in iterator.
-
- reads with bits set in `mask` will be skipped.
- '''
- raise NotImplementedError()
- # self.mask = mask
- # bam_plp_set_mask( self.pileup_iter, self.mask )
-
- cdef setupIteratorData( self,
- int tid,
- int start,
- int end,
- int multiple_iterators=0 ):
- '''setup the iterator structure'''
-
- self.iter = IteratorRowRegion(self.samfile, tid, start, end, multiple_iterators)
- self.iterdata.htsfile = self.samfile.htsfile
- self.iterdata.iter = self.iter.iter
- self.iterdata.seq = NULL
- self.iterdata.tid = -1
- self.iterdata.header = self.samfile.header
-
- if self.fastafile is not None:
- self.iterdata.fastafile = self.fastafile.fastafile
- else:
- self.iterdata.fastafile = NULL
-
- # Free any previously allocated memory before reassigning
- # pileup_iter
- self._free_pileup_iter()
-
- if self.stepper is None or self.stepper == "all":
- with nogil:
- self.pileup_iter = bam_plp_init(
- <bam_plp_auto_f>&__advance_all,
- &self.iterdata)
- elif self.stepper == "nofilter":
- with nogil:
- self.pileup_iter = bam_plp_init(
- <bam_plp_auto_f>&__advance_nofilter,
- &self.iterdata)
- elif self.stepper == "samtools":
- with nogil:
- self.pileup_iter = bam_plp_init(
- <bam_plp_auto_f>&__advance_snpcalls,
- &self.iterdata)
- else:
- raise ValueError(
- "unknown stepper option `%s` in IteratorColumn" % self.stepper)
-
- if self.max_depth:
- with nogil:
- bam_plp_set_maxcnt(self.pileup_iter, self.max_depth)
-
- # bam_plp_set_mask( self.pileup_iter, self.mask )
-
- cdef reset( self, tid, start, end ):
- '''reset iterator position.
-
- This permits using the iterator multiple times without
- having to incur the full set-up costs.
- '''
- self.iter = IteratorRowRegion( self.samfile, tid, start, end, multiple_iterators = 0 )
- self.iterdata.iter = self.iter.iter
-
- # invalidate sequence if different tid
- if self.tid != tid:
- if self.iterdata.seq != NULL:
- free(self.iterdata.seq)
- self.iterdata.seq = NULL
- self.iterdata.tid = -1
-
- # self.pileup_iter = bam_plp_init( &__advancepileup, &self.iterdata )
- with nogil:
- bam_plp_reset(self.pileup_iter)
-
- cdef _free_pileup_iter(self):
- '''free the memory alloc'd by bam_plp_init.
-
- This is needed before setupIteratorData allocates
- another pileup_iter, or else memory will be lost.
- '''
- if self.pileup_iter != <bam_plp_t>NULL:
- with nogil:
- bam_plp_reset(self.pileup_iter)
- bam_plp_destroy(self.pileup_iter)
- self.pileup_iter = <bam_plp_t>NULL
-
- def __dealloc__(self):
- # reset in order to avoid memory leak messages for iterators
- # that have not been fully consumed
- self._free_pileup_iter()
- self.plp = <bam_pileup1_t*>NULL
-
- if self.iterdata.seq != NULL:
- free(self.iterdata.seq)
- self.iterdata.seq = NULL
-
-
-cdef class IteratorColumnRegion(IteratorColumn):
- '''iterates over a region only.
- '''
- def __cinit__(self, AlignmentFile samfile,
- int tid = 0,
- int start = 0,
- int end = MAX_POS,
- int truncate = False,
- **kwargs ):
-
- # initialize iterator
- self.setupIteratorData(tid, start, end, 1)
- self.start = start
- self.end = end
- self.truncate = truncate
-
- def __next__(self):
-
- while 1:
- self.cnext()
- if self.n_plp < 0:
- raise ValueError("error during iteration" )
-
- if self.plp == NULL:
- raise StopIteration
-
- if self.truncate:
- if self.start > self.pos: continue
- if self.pos >= self.end: raise StopIteration
-
- return makePileupColumn(&self.plp,
- self.tid,
- self.pos,
- self.n_plp,
- self.samfile)
-
-
-cdef class IteratorColumnAllRefs(IteratorColumn):
- """iterates over all columns by chaining iterators over each reference
- """
-
- def __cinit__(self,
- AlignmentFile samfile,
- **kwargs):
-
- # no iteration over empty files
- if not samfile.nreferences:
- raise StopIteration
-
- # initialize iterator
- self.setupIteratorData(self.tid, 0, MAX_POS, 1)
-
- def __next__(self):
-
- while 1:
- self.cnext()
-
- if self.n_plp < 0:
- raise ValueError("error during iteration" )
-
- # return result, if within same reference
- if self.plp != NULL:
- return makePileupColumn(&self.plp,
- self.tid,
- self.pos,
- self.n_plp,
- self.samfile)
-
- # otherwise, proceed to next reference or stop
- self.tid += 1
- if self.tid < self.samfile.nreferences:
- self.setupIteratorData(self.tid, 0, MAX_POS, 0)
- else:
- raise StopIteration
-
-
-cdef class SNPCall:
- '''the results of a SNP call.'''
- cdef int _tid
- cdef int _pos
- cdef char _reference_base
- cdef char _genotype
- cdef int _consensus_quality
- cdef int _snp_quality
- cdef int _rms_mapping_quality
- cdef int _coverage
-
- property tid:
- '''the chromosome ID as is defined in the header'''
- def __get__(self):
- return self._tid
-
- property pos:
- '''nucleotide position of SNP.'''
- def __get__(self): return self._pos
-
- property reference_base:
- '''reference base at pos. ``N`` if no reference sequence supplied.'''
- def __get__(self): return from_string_and_size( &self._reference_base, 1 )
-
- property genotype:
- '''the genotype called.'''
- def __get__(self): return from_string_and_size( &self._genotype, 1 )
-
- property consensus_quality:
- '''the genotype quality (Phred-scaled).'''
- def __get__(self): return self._consensus_quality
-
- property snp_quality:
- '''the snp quality (Phred scaled) - probability of consensus being
- identical to reference sequence.'''
- def __get__(self): return self._snp_quality
-
- property mapping_quality:
- '''the root mean square (rms) of the mapping quality of all reads
- involved in the call.'''
- def __get__(self): return self._rms_mapping_quality
-
- property coverage:
- '''coverage or read depth - the number of reads involved in the call.'''
- def __get__(self): return self._coverage
-
- def __str__(self):
-
- return "\t".join( map(str, (
- self.tid,
- self.pos,
- self.reference_base,
- self.genotype,
- self.consensus_quality,
- self.snp_quality,
- self.mapping_quality,
- self.coverage ) ) )
-
-
-cdef class IndexedReads:
- """*(AlignmentFile samfile, multiple_iterators=True)
-
- Index a Sam/BAM-file by query name while keeping the
- original sort order intact.
-
- The index is kept in memory and can be substantial.
-
- By default, the file is re-opened to avoid conflicts if multiple
- operators work on the same file. Set `multiple_iterators` = False
- to not re-open `samfile`.
-
- Parameters
- ----------
-
- samfile : AlignmentFile
- File to be indexed.
-
- multiple_iterators : bool
- Flag indicating whether the file should be reopened. Reopening prevents
- existing iterators being affected by the indexing.
-
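- A minimal usage sketch, assuming ``samfile`` is an open
- :class:`~pysam.AlignmentFile` (the read name is illustrative
- only)::
-
-     index = pysam.IndexedReads(samfile)
-     index.build()
-     for read in index.find("read_12345"):
-         print(read)
-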
- """
-
- def __init__(self, AlignmentFile samfile, int multiple_iterators=True):
- cdef char *cfilename
-
- # makes sure that samfile stays alive as long as this
- # object is alive.
- self.samfile = samfile
-
- assert samfile.is_bam, "can only IndexReads on bam files"
-
- # re-open the file when multiple_iterators is set - note that this
- # makes the iterator slow and causes pileup to slow down significantly.
- if multiple_iterators:
- cfilename = samfile._filename
- with nogil:
- self.htsfile = hts_open(cfilename, 'r')
- assert self.htsfile != NULL
- # read header - required for accurate positioning
- with nogil:
- self.header = sam_hdr_read(self.htsfile)
- self.owns_samfile = True
- else:
- self.htsfile = self.samfile.htsfile
- self.header = self.samfile.header
- self.owns_samfile = False
-
- def build(self):
- '''build the index.'''
-
- self.index = collections.defaultdict(list)
-
- # this method will start indexing from the current file
- # position
- cdef int ret = 1
- cdef bam1_t * b = <bam1_t*>calloc(1, sizeof( bam1_t))
-
- cdef uint64_t pos
-
- while ret > 0:
- with nogil:
- pos = bgzf_tell(hts_get_bgzfp(self.htsfile))
- ret = sam_read1(self.htsfile,
- self.samfile.header,
- b)
- if ret > 0:
- qname = charptr_to_str(pysam_bam_get_qname(b))
- self.index[qname].append(pos)
-
- bam_destroy1(b)
-
- def find(self, query_name):
- '''find `query_name` in index.
-
- Returns
- -------
-
- IteratorRowSelection
- Returns an iterator over all reads with query_name.
-
- Raises
- ------
-
- KeyError
- if the `query_name` is not in the index.
-
- '''
- if query_name in self.index:
- return IteratorRowSelection(
- self.samfile,
- self.index[query_name],
- multiple_iterators = False)
- else:
- raise KeyError("read %s not found" % query_name)
-
- def __dealloc__(self):
- if self.owns_samfile:
- hts_close(self.htsfile)
- bam_hdr_destroy(self.header)
-
-__all__ = [
- "AlignmentFile",
- "IteratorRow",
- "IteratorColumn",
- "IndexedReads"]
+++ /dev/null
-###############################################################################
-###############################################################################
-## Cython wrapper for htslib VCF/BCF reader/writer
-###############################################################################
-#
-# NOTICE: This code is incomplete and preliminary. It is nearly complete as
-# an immutable interface, but has no capability (yet) to mutate the
-# resulting data (beyond dropping all samples). Documentation still
-# needs to be written and a unit test suite is in the works. The
-# code is also specific to Python 2 and will require a bit of work
-# to properly adapt to Python 3.
-#
-###############################################################################
-#
-# The MIT License
-#
-# Copyright (c) 2015 Kevin Jacobs (jacobs@bioinformed.com)
-#
-# Permission is hereby granted, free of charge, to any person obtaining a
-# copy of this software and associated documentation files (the "Software"),
-# to deal in the Software without restriction, including without limitation
-# the rights to use, copy, modify, merge, publish, distribute, sublicense,
-# and/or sell copies of the Software, and to permit persons to whom the
-# Software is furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-# DEALINGS IN THE SOFTWARE.
-#
-###############################################################################
-
-from libc.stdint cimport int8_t, int16_t, int32_t, int64_t
-from libc.stdint cimport uint8_t, uint16_t, uint32_t, uint64_t
-from libc.stdlib cimport malloc, calloc, realloc, free
-from libc.string cimport memcpy, memcmp, strncpy, strlen, strdup
-
-from pysam.chtslib cimport *
-
-
-cdef class VariantHeader(object):
- cdef bcf_hdr_t *ptr
-
- cdef _subset_samples(self, include_samples)
-
-
-cdef class VariantHeaderRecord(object):
- cdef VariantHeader header
- cdef bcf_hrec_t *ptr
-
-
-cdef class VariantHeaderRecords(object):
- cdef VariantHeader header
-
-
-cdef class VariantHeaderContigs(object):
- cdef VariantHeader header
-
-
-cdef class VariantHeaderSamples(object):
- cdef VariantHeader header
-
-
-cdef class VariantContig(object):
- cdef VariantHeader header
- cdef int id
-
-
-cdef class VariantMetadata(object):
- cdef VariantHeader header
- cdef int type
- cdef int id
-
-
-cdef class VariantHeaderMetadata(object):
- cdef VariantHeader header
- cdef int32_t type
-
-
-cdef class VariantRecord(object):
- cdef VariantHeader header
- cdef bcf1_t *ptr
-
-
-cdef class VariantRecordFilter(object):
- cdef VariantRecord record
-
-
-cdef class VariantRecordFormat(object):
- cdef VariantRecord record
-
-
-cdef class VariantRecordInfo(object):
- cdef VariantRecord record
-
-
-cdef class VariantRecordSamples(object):
- cdef VariantRecord record
-
-
-cdef class VariantRecordSample(object):
- cdef VariantRecord record
- cdef readonly int32_t index
-
-
-cdef class BaseIndex(object):
- cdef tuple refs
- cdef dict refmap
-
-
-cdef class BCFIndex(BaseIndex):
- cdef VariantHeader header
- cdef hts_idx_t *ptr
-
-
-cdef class TabixIndex(BaseIndex):
- cdef tbx_t *ptr
-
-
-cdef class BaseIterator(object):
- cdef VariantFile bcf
- cdef hts_itr_t *iter
-
-
-cdef class BCFIterator(BaseIterator):
- cdef BCFIndex index
-
-
-cdef class TabixIterator(BaseIterator):
- cdef TabixIndex index
- cdef kstring_t line_buffer
-
-
-cdef class VariantFile(object):
- cdef htsFile *htsfile # pointer to htsFile structure
- cdef int64_t start_offset # BGZF offset of first record
-
- cdef readonly object filename # filename as supplied by user
- cdef readonly object mode # file opening mode
- cdef readonly object index_filename # filename of index, if supplied by user
-
- cdef readonly VariantHeader header
- cdef readonly BaseIndex index
-
- cdef readonly bint drop_samples # true if sample information is to be ignored
-
- # FIXME: Temporary, use htsFormat when it is available
- cdef readonly bint is_bcf # true if file is a bcf file
- cdef readonly bint is_stream # true if not a seekable file but a stream
- cdef readonly bint is_remote # true if file is not on the local filesystem
- cdef readonly bint is_reading # true if file has begun reading records
-
- cpdef int write(self, VariantRecord record) except -1
+++ /dev/null
-# cython: embedsignature=True
-# cython: profile=True
-###############################################################################
-###############################################################################
-## Cython wrapper for htslib VCF/BCF reader/writer
-###############################################################################
-#
-# NOTICE: This code is incomplete and preliminary. It offers a nearly
-# complete Pythonic interface to VCF/BCF metadata and data with
-# reading and writing capability. It has limited capability to
-# mutate the resulting data. Documentation and a unit test suite
-# are in the works. The code is best tested under Python 2, but
-# should also work with Python 3. Please report any remaining
-# str/bytes issues on the github site when using Python 3 and I'll
-# fix them promptly.
-#
-# Here is a minimal example of how to use the API:
-#
-# $ cat bcfview.py
-# import sys
-# from pysam import VariantFile
-#
-# bcf_in = VariantFile(sys.argv[1]) # auto-detect input format
-# bcf_out = VariantFile('-', 'w', header=bcf_in.header)
-#
-# for rec in bcf_in:
-# bcf_out.write(rec)
-#
-# Performance is fairly close to that of bcftools view. Here is an example
-# using some 1k Genomes data:
-#
-# $ time python bcfview.py ALL.chr22.phase3_shapeit2_mvncall_integrated_v5.20130502.genotypes.bcf |wc -l
-# 1103799
-#
-# real 0m56.114s
-# user 1m4.489s
-# sys 0m3.102s
-#
-# $ time bcftools view ALL.chr22.phase3_shapeit2_mvncall_integrated_v5.20130502.genotypes.bcf |wc -l
-# 1103800 # bcftools adds an extra header
-#
-# real 0m55.126s
-# user 1m3.502s
-# sys 0m3.459s
-#
-# Here is a quick tour through the API::
-#
-# VariantFile(filename, mode=None, header=None, drop_samples=False)
-#
-# Attributes / Properties
-#
-# htsfile: htsFile* [private]
-# start_offset: BGZF offset of first record [private]
-# filename: filename [read only]
-# mode: mode [read only]
-# header: VariantHeader object [read only]
-# index: TabixIndex, BCFIndex or None [read only]
-# drop_samples: sample information is to be ignored [read only]
-#
-# is_stream: file is stdin/stdout [read only]
-# is_remote: file is not on the local filesystem [read only]
-# is_reading: file has begun reading records [read only]
-# category: file format general category [read only]
-# format: file format [read only]
-# version: tuple of (major, minor) format version [read only]
-# compression: file compression
-# description: vaguely human readable description of [read only]
-# file format.
-#
-# Methods:
-# copy()
-# close()
-# open(filename, mode=None, header=None, drop_samples=False)
-# reset()
-# seek(offset)
-# tell()
-# fetch(contig=None, start=None, stop=None, region=None, reopen=False)
-# subset_samples(include_samples)
-#
-# VariantHeader()
-#
-# version: VCF version
-# samples: sequence-like access to samples
-# records: sequence-like access to partially parsed headers
-# contigs: mapping-like object for contig name -> VariantContig
-#
-# filters: mapping-like object for filter name -> VariantMetadata
-# info: mapping-like object for info name -> VariantMetadata
-# formats: mapping-like object for formats name -> VariantMetadata
-#
-# VariantRecord(...)
-#
-# header: VariantHeader object
-# rid: reference id (i.e. tid)
-# chrom: chromosome/contig string
-# contig: synonym for chrom
-# pos: 1-based start position (inclusive)
-# start: 0-based start position (inclusive)
-# stop: 0-based stop position (exclusive)
-# rlen: reference length (stop - start)
-# id: record identifier
-# ref: reference allele
-# alleles: alleles (ref followed by alts)
-# alts: alt alleles
-# qual: quality (float)
-# filter: mapping-like object for filter name -> type info
-# info: mapping-like object for info name -> value
-# format: mapping-like object for format name -> type info
-# samples: mapping-like object of sample genotypes & attrs
-#
-# VariantRecordSample(...)
-#
-# name: sample name
-# index: sample index
-# allele_indices: tuple of allele indices (ref=0, alt=1..len(alts), missing=-1)
-# alleles: tuple of alleles (missing=None)
-#
-# VariantRecordSample is also a mapping object from formats to values
-#
-# VariantContig(...)
-#
-# id: reference id (i.e. tid)
-# name: chromosome/contig string
-# length: contig length if provided, else None
-# header: defining VariantHeaderRecord
-#
-# VariantMetadata(...) # for FILTER, INFO and FORMAT metadata
-#
-# id: internal id
-# name: metadata name
-# type: value data type
-# number: number of values
-# header: defining VariantHeaderRecord
-#
-# VariantHeaderRecord(...) # replace with single tuple of key/value pairs?
-#
-# type: record type
-# key: first record key
-# value: first record value
-# attrs: remaining key/value pairs
-#
-###############################################################################
-#
-# TODO list for next major sprint:
-#
-# * more genotype methods
-# * unit test suite (perhaps py.test based)
-# * documentation
-# * htslib 1.2 format info
-#
-# For later sprints:
-#
-# * ability to create indices
-# * mutable header and record data
-# * pickle support
-# * Python 3 support
-# * left/right locus normalization
-# * parallel iteration (like synced_bcf_reader)
-# * fix reopen to re-use fd
-#
-###############################################################################
-#
-# The MIT License
-#
-# Copyright (c) 2015 Kevin Jacobs (jacobs@bioinformed.com)
-#
-# Permission is hereby granted, free of charge, to any person obtaining a
-# copy of this software and associated documentation files (the "Software"),
-# to deal in the Software without restriction, including without limitation
-# the rights to use, copy, modify, merge, publish, distribute, sublicense,
-# and/or sell copies of the Software, and to permit persons to whom the
-# Software is furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-# DEALINGS IN THE SOFTWARE.
-#
-###############################################################################
-
-from __future__ import division, print_function
-
-import os
-import sys
-
-from libc.string cimport strcmp, strpbrk
-from libc.stdint cimport INT8_MAX, INT16_MAX, INT32_MAX
-
-cimport cython
-
-from cpython.object cimport PyObject
-from cpython.ref cimport Py_INCREF
-from cpython.dict cimport PyDict_GetItemString, PyDict_SetItemString
-from cpython.tuple cimport PyTuple_New, PyTuple_SET_ITEM
-from cpython.bytes cimport PyBytes_FromStringAndSize
-from cpython.unicode cimport PyUnicode_DecodeASCII
-from cpython.version cimport PY_MAJOR_VERSION
-
-from pysam.chtslib cimport hisremote
-
-
-from warnings import warn
-
-
-__all__ = ['VariantFile',
- 'VariantHeader',
- 'VariantHeaderRecord',
- 'VariantRecord']
-
-########################################################################
-########################################################################
-## Constants
-########################################################################
-
-cdef int MAX_POS = 2 << 29
-cdef tuple VALUE_TYPES = ('Flag', 'Integer', 'Float', 'String')
-cdef tuple METADATA_TYPES = ('FILTER', 'INFO', 'FORMAT', 'CONTIG', 'STRUCTURED', 'GENERIC')
-cdef tuple METADATA_LENGTHS = ('FIXED', 'VARIABLE', 'A', 'G', 'R')
-
-cdef tuple FORMAT_CATEGORIES = ('UNKNOWN', 'ALIGNMENTS', 'VARIANTS', 'INDEX', 'REGIONS')
-cdef tuple FORMATS = ('UNKNOWN', 'BINARY_FORMAT', 'TEXT_FORMAT', 'SAM', 'BAM', 'BAI', 'CRAM', 'CRAI',
- 'VCF', 'BCF', 'CSI', 'GZI', 'TBI', 'BED')
-cdef tuple COMPRESSION = ('NONE', 'GZIP', 'BGZF', 'CUSTOM')
-
-########################################################################
-########################################################################
-## Python 3 compatibility functions
-########################################################################
-
-from pysam.cutils cimport force_bytes, force_str, charptr_to_str, charptr_to_str_w_len
-from pysam.cutils cimport encode_filename, from_string_and_size
-
-
-########################################################################
-########################################################################
-## VCF/BCF string intern system
-########################################################################
-
-cdef dict bcf_str_cache = {}
-
-cdef inline bcf_str_cache_get_charptr(const char* s):
- if s == NULL:
- return None
-
- cdef PyObject *pystr = PyDict_GetItemString(bcf_str_cache, s)
- if pystr:
- return <object>pystr
-
- if PY_MAJOR_VERSION < 3:
- val = s
- else:
- val = PyUnicode_DecodeASCII(s, strlen(s), NULL)
-
- PyDict_SetItemString(bcf_str_cache, s, val)
-
- return val
-
-
-########################################################################
-########################################################################
-## Low level type conversion helpers
-########################################################################
-
-
-cdef inline int is_gt_fmt(bcf_hdr_t *hdr, int fmt_id):
- return strcmp(bcf_hdr_int2id(hdr, BCF_DT_ID, fmt_id), "GT") == 0
-
-
-cdef tuple char_array_to_tuple(const char **a, ssize_t n, int free_after=0):
- if not a:
- return None
- try:
- return tuple(charptr_to_str(a[i]) for i in range(n))
- finally:
- if free_after and a:
- free(a)
-
-
-cdef bcf_array_to_object(void *data, int type, ssize_t n, ssize_t count, int scalar):
- cdef char *datac
- cdef int8_t *data8
- cdef int16_t *data16
- cdef int32_t *data32
- cdef float *dataf
- cdef int i
-
- if not data or n <= 0:
- return None
-
- if type == BCF_BT_CHAR:
- datac = <char *>data
- while n and datac[n-1] == bcf_str_vector_end:
- n -= 1
- value = charptr_to_str_w_len(datac, n) if datac[0] != bcf_str_missing else None
- # FIXME: Need to know length? Report errors? Pad with missing values? Not clear what to do.
-
- value = tuple(v or None for v in value.split(',')) if value else ()
- # FIXME: Need to know length? Report errors? Pad with missing values? Not clear what to do.
- else:
- value = []
- if type == BCF_BT_INT8:
- data8 = <int8_t *>data
- for i in range(n):
- if data8[i] == bcf_int8_vector_end:
- break
- value.append(data8[i] if data8[i] != bcf_int8_missing else None)
- elif type == BCF_BT_INT16:
- data16 = <int16_t *>data
- for i in range(n):
- if data16[i] == bcf_int16_vector_end:
- break
- value.append(data16[i] if data16[i] != bcf_int16_missing else None)
- elif type == BCF_BT_INT32:
- data32 = <int32_t *>data
- for i in range(n):
- if data32[i] == bcf_int32_vector_end:
- break
- value.append(data32[i] if data32[i] != bcf_int32_missing else None)
- elif type == BCF_BT_FLOAT:
- dataf = <float *>data
- for i in range(n):
- if bcf_float_is_vector_end(dataf[i]):
- break
- value.append(dataf[i] if not bcf_float_is_missing(dataf[i]) else None)
- else:
- raise TypeError('unsupported info type code')
-
- # FIXME: Need to know length? Report errors? Pad with missing values? Not clear what to do.
- if not value:
- if scalar:
- value = None
- elif count <= 0:
- value = ()
- else:
- value = (None,)*count
- elif scalar and len(value) == 1:
- value = value[0]
- else:
- value = tuple(value)
-
- return value
-
-
-cdef bcf_object_to_array(values, void *data, int bt_type, ssize_t n, int vlen):
- cdef char *datac
- cdef int8_t *datai8
- cdef int16_t *datai16
- cdef int32_t *datai32
- cdef float *dataf
- cdef ssize_t i, value_count = len(values)
-
- assert(value_count <= n)
-
- if bt_type == BCF_BT_CHAR:
- if not isinstance(values, (str, bytes)):
- values = b','.join(force_bytes(v) if v is not None else b'' for v in values)
- value_count = len(values)
- assert(value_count <= n)
- datac = <char *>data
- memcpy(datac, <char *>values, value_count)
- for i in range(value_count, n):
- datac[i] = 0
- elif bt_type == BCF_BT_INT8:
- datai8 = <int8_t *>data
- for i in range(value_count):
- val = values[i]
- datai8[i] = val if val is not None else bcf_int8_missing
- for i in range(value_count, n):
- datai8[i] = bcf_int8_vector_end
- elif bt_type == BCF_BT_INT16:
- datai16 = <int16_t *>data
- for i in range(value_count):
- val = values[i]
- datai16[i] = val if val is not None else bcf_int16_missing
- for i in range(value_count, n):
- datai16[i] = bcf_int16_vector_end
- elif bt_type == BCF_BT_INT32:
- datai32 = <int32_t *>data
- for i in range(value_count):
- val = values[i]
- datai32[i] = val if val is not None else bcf_int32_missing
- for i in range(value_count, n):
- datai32[i] = bcf_int32_vector_end
- elif bt_type == BCF_BT_FLOAT:
- dataf = <float *>data
- for i in range(value_count):
- val = values[i]
- if val is None:
- bcf_float_set(dataf + i, bcf_float_missing)
- else:
- dataf[i] = val
- for i in range(value_count, n):
- bcf_float_set(dataf + i, bcf_float_vector_end)
- else:
- raise TypeError('unsupported type')
-
-
-cdef bcf_empty_array(int type, ssize_t n, int vlen):
- cdef char *datac
- cdef int32_t *data32
- cdef float *dataf
- cdef int i
-
- if n <= 0:
- raise ValueError('Cannot create empty array')
-
- if type == BCF_HT_STR:
- value = PyBytes_FromStringAndSize(NULL, sizeof(char)*n)
- datac = <char *>value
- for i in range(n):
- datac[i] = bcf_str_missing if not vlen else bcf_str_vector_end
- elif type == BCF_HT_INT:
- value = PyBytes_FromStringAndSize(NULL, sizeof(int32_t)*n)
- data32 = <int32_t *><char *>value
- for i in range(n):
- data32[i] = bcf_int32_missing if not vlen else bcf_int32_vector_end
- elif type == BCF_HT_REAL:
- value = PyBytes_FromStringAndSize(NULL, sizeof(float)*n)
- dataf = <float *><char *>value
- for i in range(n):
- bcf_float_set(dataf + i, bcf_float_missing if not vlen else bcf_float_vector_end)
- else:
- raise TypeError('unsupported header type code')
-
- return value
-
-
-cdef bcf_copy_expand_array(void *src_data, int src_type, ssize_t src_values,
- void *dst_data, int dst_type, ssize_t dst_values,
- int vlen):
- cdef char *src_datac
- cdef char *dst_datac
- cdef int8_t *src_datai8
- cdef int16_t *src_datai16
- cdef int32_t *src_datai32
- cdef int32_t *dst_datai
- cdef float *src_dataf
- cdef float *dst_dataf
- cdef ssize_t src_size, dst_size, i, j
- cdef int val
-
- if src_values > dst_values:
- raise ValueError('Cannot copy arrays with src_values={} > dst_values={}'.format(src_values, dst_values))
-
- if src_type == dst_type == BCF_BT_CHAR:
- src_datac = <char *>src_data
- dst_datac = <char *>dst_data
- memcpy(dst_datac, src_datac, src_values)
- for i in range(src_values, dst_values):
- dst_datac[i] = 0
- elif src_type == BCF_BT_INT8 and dst_type == BCF_BT_INT32:
- src_datai8 = <int8_t *>src_data
- dst_datai = <int32_t *>dst_data
- for i in range(src_values):
- val = src_datai8[i]
- if val == bcf_int8_missing:
- val = bcf_int32_missing
- elif val == bcf_int8_vector_end:
- val = bcf_int32_vector_end
- dst_datai[i] = val
- for i in range(src_values, dst_values):
- dst_datai[i] = bcf_int32_missing if not vlen else bcf_int32_vector_end
- elif src_type == BCF_BT_INT16 and dst_type == BCF_BT_INT32:
- src_datai16 = <int16_t *>src_data
- dst_datai = <int32_t *>dst_data
- for i in range(src_values):
- val = src_datai16[i]
- if val == bcf_int16_missing:
- val = bcf_int32_missing
- elif val == bcf_int16_vector_end:
- val = bcf_int32_vector_end
- dst_datai[i] = val
- for i in range(src_values, dst_values):
- dst_datai[i] = bcf_int32_missing if not vlen else bcf_int32_vector_end
- elif src_type == BCF_BT_INT32 and dst_type == BCF_BT_INT32:
- src_datai32 = <int32_t *>src_data
- dst_datai = <int32_t *>dst_data
- for i in range(src_values):
- dst_datai[i] = src_datai32[i]
- for i in range(src_values, dst_values):
- dst_datai[i] = bcf_int32_missing if not vlen else bcf_int32_vector_end
- elif src_type == BCF_BT_FLOAT and dst_type == BCF_BT_FLOAT:
- src_dataf = <float *>src_data
- dst_dataf = <float *>dst_data
- for i in range(src_values):
- dst_dataf[i] = src_dataf[i]
- for i in range(src_values, dst_values):
- bcf_float_set(dst_dataf + i, bcf_float_missing if not vlen else bcf_float_vector_end)
- else:
- raise TypeError('unsupported types')
-
-
-cdef bcf_get_value_count(VariantRecord record, int hl_type, int id, ssize_t *count, int *scalar):
- cdef bcf_hdr_t *hdr = record.header.ptr
- cdef bcf1_t *r = record.ptr
- cdef int length = bcf_hdr_id2length(hdr, hl_type, id)
- cdef int number = bcf_hdr_id2number(hdr, hl_type, id)
-
- scalar[0] = 0
-
- if hl_type == BCF_HL_FMT and is_gt_fmt(hdr, id):
- count[0] = number
- elif length == BCF_VL_FIXED:
- if number == 1:
- scalar[0] = 1
- count[0] = number
- elif length == BCF_VL_R:
- count[0] = r.n_allele
- elif length == BCF_VL_A:
- count[0] = r.n_allele - 1
- elif length == BCF_VL_G:
- count[0] = r.n_allele * (r.n_allele + 1) // 2
- elif length == BCF_VL_VAR:
- count[0] = -1
- else:
- raise ValueError('Unknown format length')
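-
- # Worked example of the Number-code mapping above (illustrative only):
- # for a record with n_allele == 3 (one REF plus two ALT alleles),
- #
- # Number=R -> count 3 (one value per allele)
- # Number=A -> count 2 (one value per ALT allele)
- # Number=G -> count 3 * (3 + 1) // 2 == 6 (one per diploid genotype)
- # Number=. -> count -1, i.e. unknown until the field is read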
-
-
-cdef object bcf_info_get_value(VariantRecord record, const bcf_info_t *z):
- cdef bcf_hdr_t *hdr = record.header.ptr
-
- cdef char *s
- cdef ssize_t count
- cdef int scalar
-
- bcf_get_value_count(record, BCF_HL_INFO, z.key, &count, &scalar)
-
- if z.len == 0:
- if bcf_hdr_id2type(hdr, BCF_HL_INFO, z.key) == BCF_HT_FLAG:
- value = True
- elif scalar:
- value = None
- else:
- value = ()
- elif z.len == 1:
- if z.type == BCF_BT_INT8:
- if z.v1.i == bcf_int8_missing:
- value = None
- elif z.v1.i == bcf_int8_vector_end:
- value = ()
- else:
- value = z.v1.i
- elif z.type == BCF_BT_INT16:
- if z.v1.i == bcf_int16_missing:
- value = None
- elif z.v1.i == bcf_int16_vector_end:
- value = ()
- else:
- value = z.v1.i
- elif z.type == BCF_BT_INT32:
- if z.v1.i == bcf_int32_missing:
- value = None
- elif z.v1.i == bcf_int32_vector_end:
- value = ()
- else:
- value = z.v1.i
- elif z.type == BCF_BT_FLOAT:
- if bcf_float_is_missing(z.v1.f):
- value = None
- elif bcf_float_is_vector_end(z.v1.f):
- value = ()
- else:
- value = z.v1.f
- elif z.type == BCF_BT_CHAR:
- value = force_str(chr(z.v1.i))
- else:
- raise TypeError('unsupported info type code')
-
- if not scalar and value != ():
- value = (value,)
- else:
- value = bcf_array_to_object(z.vptr, z.type, z.len, count, scalar)
-
- return value
-
-
-cdef object bcf_check_values(VariantRecord record, value, int hl_type, int ht_type,
- int id, int bt_type, ssize_t bt_len, ssize_t *value_count,
- int *scalar, int *realloc):
-
- bcf_get_value_count(record, hl_type, id, value_count, scalar)
-
- # Normalize the input to a tuple of values
- values = (value,) if not isinstance(value, tuple) else value
-
- # Validate values now that we know the type and size
- if ht_type == BCF_HT_FLAG:
- value_count[0] = 1
-
- if value_count[0] != -1 and value_count[0] != len(values):
- if scalar[0]:
- raise TypeError('value expected to be scalar')
- else:
- raise TypeError('values expected to be {:d}-tuple'.format(value_count[0]))
-
- if ht_type == BCF_HT_REAL:
- for v in values:
- if not(v is None or isinstance(v, (float, int))):
- raise TypeError('invalid value for Float format')
- elif ht_type == BCF_HT_INT:
- for v in values:
- if not(v is None or (isinstance(v, (float, int)) and int(v) == v)):
- raise TypeError('invalid value for Integer format')
- for v in values:
- if not(v is None or bcf_int32_missing < v <= INT32_MAX):
- raise ValueError('Integer value too small/large to store in VCF/BCF')
- elif ht_type == BCF_HT_STR:
- values = b','.join(force_bytes(v) if v is not None else b'' for v in values)
- elif ht_type == BCF_HT_FLAG:
- if values[0] not in (True, False, None, 1, 0):
- raise ValueError('Flag values must be: True, False, None, 1, 0')
- else:
- raise TypeError('unsupported type')
-
- realloc[0] = 0
- if len(values) <= 1 and hl_type == BCF_HL_INFO:
- realloc[0] = 0
- elif len(values) > bt_len:
- realloc[0] = 1
- elif bt_type == BCF_BT_INT8:
- for v in values:
- if v is not None and not(bcf_int8_missing < v <= INT8_MAX):
- realloc[0] = 1
- break
- elif bt_type == BCF_BT_INT16:
- for v in values:
- if v is not None and not(bcf_int16_missing < v <= INT16_MAX):
- realloc[0] = 1
- break
-
- return values
-
-
-cdef bcf_encode_alleles(VariantRecord record, values):
- cdef bcf1_t *r = record.ptr
- cdef int32_t nalleles = r.n_allele
- cdef list gt_values = []
- cdef char *s
- cdef int i
-
- if not values:
- return ()
-
- if not isinstance(values, (list, tuple)):
- values = (values,)
-
- for value in values:
- if value is None:
- gt_values.append(None)
- elif isinstance(value, (str, bytes)):
- bvalue = force_bytes(value)
- s = bvalue
- for i in range(r.n_allele):
- if strcmp(r.d.allele[i], s) == 0:
- gt_values.append(bcf_gt_unphased(i))
- break
- else: # for-else: no matching allele was found
- raise ValueError('Unknown allele')
- else:
- i = value
- if not (0 <= i < nalleles):
- raise ValueError('Invalid allele index')
- gt_values.append(bcf_gt_unphased(i))
-
- return gt_values
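-
- # Illustrative sketch (not part of the API): with record alleles ('G', 'T'),
- # both allele strings and bare indices map to htslib's packed genotype
- # encoding, where bcf_gt_unphased(i) == (i + 1) << 1:
- #
- # bcf_encode_alleles(rec, ('G', 'T')) -> [2, 4] (allele indices 0 and 1)
- # bcf_encode_alleles(rec, (0, None)) -> [2, None] (missing kept as None)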
-
-
-cdef bcf_info_set_value(VariantRecord record, key, value):
- cdef bcf_hdr_t *hdr = record.header.ptr
- cdef bcf1_t *r = record.ptr
- cdef vdict_t *d
- cdef khiter_t k
- cdef int info_id, info_type, scalar, dst_type, realloc, vlen = 0
- cdef ssize_t i, value_count, alloc_len, alloc_size, dst_size
-
- if bcf_unpack(r, BCF_UN_INFO) < 0:
- raise ValueError('Error unpacking VariantRecord')
-
- bkey = force_bytes(key)
- cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey)
-
- if info:
- info_id = info.key
- else:
- d = <vdict_t *>hdr.dict[BCF_DT_ID]
- k = kh_get_vdict(d, bkey)
-
- if k == kh_end(d) or kh_val_vdict(d, k).info[BCF_HL_INFO] & 0xF == 0xF:
- raise KeyError('unknown INFO')
-
- info_id = kh_val_vdict(d, k).id
-
- info_type = bcf_hdr_id2type(hdr, BCF_HL_INFO, info_id)
- values = bcf_check_values(record, value, BCF_HL_INFO, info_type, info_id,
- info.type if info else -1, info.len if info else -1,
- &value_count, &scalar, &realloc)
-
- if info_type == BCF_HT_FLAG:
- if bcf_update_info(hdr, r, bkey, NULL, bool(values[0]), info_type) < 0:
- raise ValueError('Unable to update INFO values')
- return
-
- vlen = value_count < 0
- value_count = len(values)
-
- # If we can, write updated values to existing allocated storage
- if info and not realloc:
- r.d.shared_dirty |= BCF1_DIRTY_INF
-
- if value_count == 0:
- info.len = 0
- # FIXME: Check if need to free vptr if info.len > 0?
- elif value_count == 1:
- # FIXME: Check if need to free vptr if info.len > 0?
- if info.type == BCF_BT_INT8 or info.type == BCF_BT_INT16 or info.type == BCF_BT_INT32:
- bcf_object_to_array(values, &info.v1.i, BCF_BT_INT32, 1, vlen)
- elif info.type == BCF_BT_FLOAT:
- bcf_object_to_array(values, &info.v1.f, BCF_BT_FLOAT, 1, vlen)
- else:
- raise TypeError('unsupported info type code')
- info.len = 1
- else:
- bcf_object_to_array(values, info.vptr, info.type, info.len, vlen)
- return
-
- alloc_len = max(1, value_count)
- if info and info.len > alloc_len:
- alloc_len = info.len
-
- new_values = bcf_empty_array(info_type, alloc_len, vlen)
- cdef char *valp = <char *>new_values
-
- if info_type == BCF_HT_INT:
- dst_type = BCF_BT_INT32
- elif info_type == BCF_HT_REAL:
- dst_type = BCF_BT_FLOAT
- elif info_type == BCF_HT_STR:
- dst_type = BCF_BT_CHAR
- else:
- raise ValueError('Unsupported INFO type')
-
- bcf_object_to_array(values, valp, dst_type, alloc_len, vlen)
-
- if bcf_update_info(hdr, r, bkey, valp, <int>alloc_len, info_type) < 0:
- raise ValueError('Unable to update INFO values')
-
-
-cdef bcf_info_del_value(VariantRecord record, key):
- cdef bcf_hdr_t *hdr = record.header.ptr
- cdef bcf1_t *r = record.ptr
- cdef ssize_t value_count
- cdef int scalar
-
- if bcf_unpack(r, BCF_UN_INFO) < 0:
- raise ValueError('Error unpacking VariantRecord')
-
- bkey = force_bytes(key)
- cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey)
-
- if not info:
- raise KeyError(key)
-
- bcf_get_value_count(record, BCF_HL_INFO, info.key, &value_count, &scalar)
-
- if value_count <= 0:
- null_value = ()
- elif scalar:
- null_value = None
- else:
- null_value = (None,)*value_count
-
- bcf_info_set_value(record, bkey, null_value)
-
-
-cdef bcf_format_get_value(VariantRecordSample sample, key):
- cdef bcf_hdr_t *hdr = sample.record.header.ptr
- cdef bcf1_t *r = sample.record.ptr
- cdef ssize_t count
- cdef int scalar
-
- if bcf_unpack(r, BCF_UN_ALL) < 0:
- raise ValueError('Error unpacking VariantRecord')
-
- bkey = force_bytes(key)
- cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey)
-
- if not fmt or not fmt.p:
- raise KeyError('invalid FORMAT')
-
- if is_gt_fmt(hdr, fmt.id):
- return bcf_format_get_allele_indices(sample)
-
- bcf_get_value_count(sample.record, BCF_HL_FMT, fmt.id, &count, &scalar)
-
- if fmt.p and fmt.n and fmt.size:
- return bcf_array_to_object(fmt.p + sample.index * fmt.size, fmt.type, fmt.n, count, scalar)
- elif scalar:
- return None
- elif count <= 0:
- return ()
- else:
- return (None,)*count
-
-
-cdef bcf_format_set_value(VariantRecordSample sample, key, value):
- cdef bcf_hdr_t *hdr = sample.record.header.ptr
- cdef bcf1_t *r = sample.record.ptr
- cdef int fmt_id
- cdef vdict_t *d
- cdef khiter_t k
- cdef int fmt_type, scalar, realloc, dst_type, vlen = 0
- cdef ssize_t i, n, value_count, alloc_size, alloc_len, dst_size
-
- if bcf_unpack(r, BCF_UN_ALL) < 0:
- raise ValueError('Error unpacking VariantRecord')
-
- bkey = force_bytes(key)
- cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey)
-
- if fmt:
- fmt_id = fmt.id
- else:
- d = <vdict_t *>hdr.dict[BCF_DT_ID]
- k = kh_get_vdict(d, bkey)
-
- if k == kh_end(d) or kh_val_vdict(d, k).info[BCF_HL_FMT] & 0xF == 0xF:
- raise KeyError('unknown format')
-
- fmt_id = kh_val_vdict(d, k).id
-
- fmt_type = bcf_hdr_id2type(hdr, BCF_HL_FMT, fmt_id)
-
- if fmt_type == BCF_HT_FLAG:
- raise ValueError('Flag types are not allowed on FORMATs')
-
- if is_gt_fmt(hdr, fmt_id):
- value = bcf_encode_alleles(sample.record, value)
-
- values = bcf_check_values(sample.record, value, BCF_HL_FMT, fmt_type, fmt_id,
- fmt.type if fmt else -1, fmt.n if fmt else -1,
- &value_count, &scalar, &realloc)
-
- vlen = value_count < 0
- value_count = len(values)
-
- # If we can, write updated values to existing allocated storage
- if fmt and not realloc:
- r.d.indiv_dirty = 1
- bcf_object_to_array(values, fmt.p + sample.index * fmt.size, fmt.type, fmt.n, vlen)
- return
-
- alloc_len = max(1, value_count)
- if fmt and fmt.n > alloc_len:
- alloc_len = fmt.n
-
- n = bcf_hdr_nsamples(hdr)
- new_values = bcf_empty_array(fmt_type, n*alloc_len, vlen)
- cdef char *valp = <char *>new_values
-
- if fmt_type == BCF_HT_INT:
- dst_type = BCF_BT_INT32
- dst_size = sizeof(int32_t) * alloc_len
- elif fmt_type == BCF_HT_REAL:
- dst_type = BCF_BT_FLOAT
- dst_size = sizeof(float) * alloc_len
- elif fmt_type == BCF_HT_STR:
- dst_type = BCF_BT_CHAR
- dst_size = sizeof(char) * alloc_len
- else:
- raise ValueError('Unsupported FORMAT type')
-
- if fmt and n > 1:
- for i in range(n):
- bcf_copy_expand_array(fmt.p + i*fmt.size, fmt.type, fmt.n,
- valp + i*dst_size, dst_type, alloc_len,
- vlen)
-
- bcf_object_to_array(values, valp + sample.index*dst_size, dst_type, alloc_len, vlen)
-
- if bcf_update_format(hdr, r, bkey, valp, <int>(n*alloc_len), fmt_type) < 0:
- raise ValueError('Unable to update format values')
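-
- # These helpers back the mapping interface of VariantRecordSample; a
- # typical use (a sketch: `rec` is an assumed VariantRecord whose header
- # declares GT and DP FORMAT fields, 'NA00001' an assumed sample name):
- #
- # rec.samples['NA00001']['GT'] = (0, 1) # routed through bcf_format_set_value
- # rec.samples['NA00001']['DP'] = 42 # scalar Integer FORMAT value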
-
-
-cdef bcf_format_del_value(VariantRecordSample sample, key):
- cdef bcf_hdr_t *hdr = sample.record.header.ptr
- cdef bcf1_t *r = sample.record.ptr
- cdef ssize_t value_count
- cdef int scalar
-
- if bcf_unpack(r, BCF_UN_ALL) < 0:
- raise ValueError('Error unpacking VariantRecord')
-
- bkey = force_bytes(key)
- cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey)
-
- if not fmt or not fmt.p:
- raise KeyError(key)
-
- bcf_get_value_count(sample.record, BCF_HL_FMT, fmt.id, &value_count, &scalar)
-
- if value_count <= 0:
- null_value = ()
- elif scalar:
- null_value = None
- else:
- null_value = (None,)*value_count
-
- bcf_format_set_value(sample, bkey, null_value)
-
-
-cdef bcf_format_get_allele_indices(VariantRecordSample sample):
- cdef bcf_hdr_t *hdr = sample.record.header.ptr
- cdef bcf1_t *r = sample.record.ptr
- cdef int32_t n = bcf_hdr_nsamples(hdr)
-
- if bcf_unpack(r, BCF_UN_ALL) < 0:
- raise ValueError('Error unpacking VariantRecord')
-
- if sample.index < 0 or sample.index >= n or not r.n_fmt:
- return ()
-
- cdef bcf_fmt_t *fmt0 = r.d.fmt
- cdef int gt0 = is_gt_fmt(hdr, fmt0.id)
-
- if not gt0 or not fmt0.n:
- return ()
-
- cdef int8_t *data8
- cdef int16_t *data16
- cdef int32_t *data32
- cdef int32_t a, nalleles = r.n_allele
- cdef list alleles = []
-
- if fmt0.type == BCF_BT_INT8:
- data8 = <int8_t *>(fmt0.p + sample.index * fmt0.size)
- for i in range(fmt0.n):
- if data8[i] == bcf_int8_vector_end:
- break
- elif data8[i] == bcf_int8_missing:
- a = -1
- else:
- a = bcf_gt_allele(data8[i])
- alleles.append(a if 0 <= a < nalleles else None)
- elif fmt0.type == BCF_BT_INT16:
- data16 = <int16_t *>(fmt0.p + sample.index * fmt0.size)
- for i in range(fmt0.n):
- if data16[i] == bcf_int16_vector_end:
- break
- elif data16[i] == bcf_int16_missing:
- a = -1
- else:
- a = bcf_gt_allele(data16[i])
- alleles.append(a if 0 <= a < nalleles else None)
- elif fmt0.type == BCF_BT_INT32:
- data32 = <int32_t *>(fmt0.p + sample.index * fmt0.size)
- for i in range(fmt0.n):
- if data32[i] == bcf_int32_vector_end:
- break
- elif data32[i] == bcf_int32_missing:
- a = -1
- else:
- a = bcf_gt_allele(data32[i])
- alleles.append(a if 0 <= a < nalleles else None)
-
- return tuple(alleles)
-
-
-cdef bcf_format_get_alleles(VariantRecordSample sample):
- cdef bcf_hdr_t *hdr = sample.record.header.ptr
- cdef bcf1_t *r = sample.record.ptr
- cdef int32_t nsamples = bcf_hdr_nsamples(hdr)
-
- if bcf_unpack(r, BCF_UN_ALL) < 0:
- raise ValueError('Error unpacking VariantRecord')
-
- cdef int32_t nalleles = r.n_allele
-
- if sample.index < 0 or sample.index >= nsamples or not r.n_fmt:
- return ()
-
- cdef bcf_fmt_t *fmt0 = r.d.fmt
- cdef int gt0 = is_gt_fmt(hdr, fmt0.id)
-
- if not gt0 or not fmt0.n:
- return ()
-
- cdef int32_t a
- cdef int8_t *data8
- cdef int16_t *data16
- cdef int32_t *data32
- alleles = []
- if fmt0.type == BCF_BT_INT8:
- data8 = <int8_t *>(fmt0.p + sample.index * fmt0.size)
- for i in range(fmt0.n):
- if data8[i] == bcf_int8_vector_end:
- break
- a = bcf_gt_allele(data8[i])
- alleles.append(charptr_to_str(r.d.allele[a]) if 0 <= a < nalleles else None)
- elif fmt0.type == BCF_BT_INT16:
- data16 = <int16_t *>(fmt0.p + sample.index * fmt0.size)
- for i in range(fmt0.n):
- if data16[i] == bcf_int16_vector_end:
- break
- a = bcf_gt_allele(data16[i])
- alleles.append(charptr_to_str(r.d.allele[a]) if 0 <= a < nalleles else None)
- elif fmt0.type == BCF_BT_INT32:
- data32 = <int32_t *>(fmt0.p + sample.index * fmt0.size)
- for i in range(fmt0.n):
- if data32[i] == bcf_int32_vector_end:
- break
- a = bcf_gt_allele(data32[i])
- alleles.append(charptr_to_str(r.d.allele[a]) if 0 <= a < nalleles else None)
- return tuple(alleles)
-
-
-cdef bint bcf_sample_get_phased(VariantRecordSample sample):
- cdef bcf_hdr_t *hdr = sample.record.header.ptr
- cdef bcf1_t *r = sample.record.ptr
- cdef int32_t n = bcf_hdr_nsamples(hdr)
-
- if bcf_unpack(r, BCF_UN_ALL) < 0:
- raise ValueError('Error unpacking VariantRecord')
-
- if sample.index < 0 or sample.index >= n or not r.n_fmt:
- return False
-
- cdef bcf_fmt_t *fmt0 = r.d.fmt
- cdef int gt0 = is_gt_fmt(hdr, fmt0.id)
-
- if not gt0 or not fmt0.n:
- return False
-
- cdef int8_t *data8
- cdef int16_t *data16
- cdef int32_t *data32
-
- cdef bint phased = False
-
- if fmt0.type == BCF_BT_INT8:
- data8 = <int8_t *>(fmt0.p + sample.index * fmt0.size)
- for i in range(fmt0.n):
- if data8[i] == bcf_int8_vector_end:
- break
- elif data8[i] == bcf_int8_missing:
- continue
- elif i and not bcf_gt_is_phased(data8[i]):
- return False
- else:
- phased = True
- elif fmt0.type == BCF_BT_INT16:
- data16 = <int16_t *>(fmt0.p + sample.index * fmt0.size)
- for i in range(fmt0.n):
- if data16[i] == bcf_int16_vector_end:
- break
- elif data16[i] == bcf_int16_missing:
- continue
- elif i and not bcf_gt_is_phased(data16[i]):
- return False
- else:
- phased = True
- elif fmt0.type == BCF_BT_INT32:
- data32 = <int32_t *>(fmt0.p + sample.index * fmt0.size)
- for i in range(fmt0.n):
- if data32[i] == bcf_int32_vector_end:
- break
- elif data32[i] == bcf_int32_missing:
- continue
- elif i and not bcf_gt_is_phased(data32[i]):
- return False
- else:
- phased = True
-
- return phased
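-
- # Note on the logic above: a sample reports as phased only when every
- # genotype value after the first carries the phase bit; missing calls are
- # skipped. Illustrative use (a sketch, assuming `rec` has a GT field):
- #
- # rec.samples[0].phased # True for '0|1', False for '0/1'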
-
-
-cdef bcf_sample_set_phased(VariantRecordSample sample, bint phased):
- cdef bcf_hdr_t *hdr = sample.record.header.ptr
- cdef bcf1_t *r = sample.record.ptr
- cdef int32_t n = bcf_hdr_nsamples(hdr)
-
- if bcf_unpack(r, BCF_UN_ALL) < 0:
- raise ValueError('Error unpacking VariantRecord')
-
- if sample.index < 0 or sample.index >= n or not r.n_fmt:
- return
-
- cdef bcf_fmt_t *fmt0 = r.d.fmt
- cdef int gt0 = is_gt_fmt(hdr, fmt0.id)
-
- if not gt0 or not fmt0.n:
- raise ValueError('Cannot set phased before genotype is set')
-
- cdef int8_t *data8
- cdef int16_t *data16
- cdef int32_t *data32
-
- if fmt0.type == BCF_BT_INT8:
- data8 = <int8_t *>(fmt0.p + sample.index * fmt0.size)
- for i in range(fmt0.n):
- if data8[i] == bcf_int8_vector_end:
- break
- elif data8[i] == bcf_int8_missing:
- continue
- elif i:
- data8[i] = (data8[i] & 0xFE) | phased
- elif fmt0.type == BCF_BT_INT16:
- data16 = <int16_t *>(fmt0.p + sample.index * fmt0.size)
- for i in range(fmt0.n):
- if data16[i] == bcf_int16_vector_end:
- break
- elif data16[i] == bcf_int16_missing:
- continue
- elif i:
- data16[i] = (data16[i] & 0xFFFE) | phased
- elif fmt0.type == BCF_BT_INT32:
- data32 = <int32_t *>(fmt0.p + sample.index * fmt0.size)
- for i in range(fmt0.n):
- if data32[i] == bcf_int32_vector_end:
- break
- elif data32[i] == bcf_int32_missing:
- continue
- elif i:
- data32[i] = (data32[i] & 0xFFFFFFFE) | phased
-
-
-########################################################################
-########################################################################
-## Variant Header objects
-########################################################################
-
-#FIXME: implement a full mapping interface
-#FIXME: passing bcf_hrec_t* may not be the safest approach once mutating
-# operations are allowed.
-cdef class VariantHeaderRecord(object):
- """header record from a :class:`VariantHeader` object"""
-
- property type:
- """header type: FILTER, INFO, FORMAT, CONTIG, STRUCTURED, or GENERIC"""
- def __get__(self):
- cdef bcf_hrec_t *r = self.ptr
- return METADATA_TYPES[r.type]
-
- property key:
- """header key (the part before '=', in FILTER/INFO/FORMAT/contig/fileformat etc.)"""
- def __get__(self):
- cdef bcf_hrec_t *r = self.ptr
- return bcf_str_cache_get_charptr(r.key) if r.key else None
-
- property value:
- """header value. Set only for generic lines, None for FILTER/INFO, etc."""
- def __get__(self):
- cdef bcf_hrec_t *r = self.ptr
- return charptr_to_str(r.value) if r.value else None
-
- property attrs:
- """sequence of additional header attributes"""
- def __get__(self):
- cdef bcf_hrec_t *r = self.ptr
- cdef int i
- return tuple((bcf_str_cache_get_charptr(r.keys[i]) if r.keys[i] else None,
- charptr_to_str(r.vals[i]) if r.vals[i] else None)
- for i in range(r.nkeys))
-
- def __len__(self):
- cdef bcf_hrec_t *r = self.ptr
- return r.nkeys
-
- def __bool__(self):
- cdef bcf_hrec_t *r = self.ptr
- return r.nkeys != 0
-
- def __getitem__(self, key):
- """get attribute value"""
- cdef bcf_hrec_t *r = self.ptr
- cdef int i
- bkey = force_bytes(key)
- for i in range(r.nkeys):
- if r.keys[i] and r.keys[i] == bkey:
- return charptr_to_str(r.vals[i]) if r.vals[i] else None
- raise KeyError('cannot find metadata key')
-
- def __iter__(self):
- cdef bcf_hrec_t *r = self.ptr
- cdef int i
- for i in range(r.nkeys):
- if r.keys[i]:
- yield bcf_str_cache_get_charptr(r.keys[i])
-
- def get(self, key, default=None):
- """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
- try:
- return self[key]
- except KeyError:
- return default
-
- def __contains__(self, key):
- try:
- self[key]
- except KeyError:
- return False
- else:
- return True
-
- def iterkeys(self):
- """D.iterkeys() -> an iterator over the keys of D"""
- return iter(self)
-
- def itervalues(self):
- """D.itervalues() -> an iterator over the values of D"""
- cdef bcf_hrec_t *r = self.ptr
- cdef int i
- for i in range(r.nkeys):
- if r.keys[i]:
- yield charptr_to_str(r.vals[i]) if r.vals[i] else None
-
- def iteritems(self):
- """D.iteritems() -> an iterator over the (key, value) items of D"""
- cdef bcf_hrec_t *r = self.ptr
- cdef int i
- for i in range(r.nkeys):
- if r.keys[i]:
- yield (bcf_str_cache_get_charptr(r.keys[i]), charptr_to_str(r.vals[i]) if r.vals[i] else None)
-
- def keys(self):
- """D.keys() -> list of D's keys"""
- return list(self)
-
- def items(self):
- """D.items() -> list of D's (key, value) pairs, as 2-tuples"""
- return list(self.iteritems())
-
- def values(self):
- """D.values() -> list of D's values"""
- return list(self.itervalues())
-
- # Mappings are not hashable by default, but subclasses can change this
- __hash__ = None
-
- #TODO: implement __richcmp__
-
- def __str__(self):
- cdef bcf_hrec_t *r = self.ptr
- if r.type == BCF_HL_GEN:
- return '##{}={}'.format(self.key, self.value)
- else:
- attrs = ','.join('{}={}'.format(k, v) for k,v in self.attrs if k != 'IDX')
- return '##{}=<{}>'.format(self.key or self.type, attrs)
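-
- # Example of the two formatting branches above (illustrative output):
- #
- # generic line: ##fileformat=VCFv4.2
- # structured line: ##INFO=<ID=DP,Number=1,Type=Integer,...>
- #
- # The internal IDX bookkeeping attribute is omitted from the output.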
-
-
-cdef VariantHeaderRecord makeVariantHeaderRecord(VariantHeader header, bcf_hrec_t *hdr):
- if not header:
- raise ValueError('invalid VariantHeader')
-
- if not hdr:
- return None
-
- cdef VariantHeaderRecord record = VariantHeaderRecord.__new__(VariantHeaderRecord)
- record.header = header
- record.ptr = hdr
-
- return record
-
-
-cdef class VariantHeaderRecords(object):
- """sequence of :class:`VariantHeaderRecord` object from a :class:`VariantHeader` object"""
-
- def __len__(self):
- return self.header.ptr.nhrec
-
- def __bool__(self):
- return self.header.ptr.nhrec != 0
-
- def __getitem__(self, index):
- cdef int32_t i = index
- if i < 0 or i >= self.header.ptr.nhrec:
- raise IndexError('invalid header record index')
- return makeVariantHeaderRecord(self.header, self.header.ptr.hrec[i])
-
- def __iter__(self):
- cdef int32_t i
- for i in range(self.header.ptr.nhrec):
- yield makeVariantHeaderRecord(self.header, self.header.ptr.hrec[i])
-
- __hash__ = None
-
-
-cdef VariantHeaderRecords makeVariantHeaderRecords(VariantHeader header):
- if not header:
- raise ValueError('invalid VariantHeader')
-
- cdef VariantHeaderRecords records = VariantHeaderRecords.__new__(VariantHeaderRecords)
- records.header = header
- return records
-
-
-cdef class VariantMetadata(object):
- """filter, info or format metadata record from a :class:`VariantHeader`
- object"""
-
- property name:
- """metadata name"""
- def __get__(self):
- cdef bcf_hdr_t *hdr = self.header.ptr
- return bcf_str_cache_get_charptr(hdr.id[BCF_DT_ID][self.id].key)
-
- # Q: Should this be exposed?
- property id:
- """metadata internal header id number"""
- def __get__(self):
- return self.id
-
- property number:
- """metadata number (i.e. cardinality)"""
- def __get__(self):
- cdef bcf_hdr_t *hdr = self.header.ptr
- if not bcf_hdr_idinfo_exists(hdr, self.type, self.id) or self.type == BCF_HL_FLT:
- return None
- cdef int l = bcf_hdr_id2length(hdr, self.type, self.id)
- if l == BCF_VL_FIXED:
- return bcf_hdr_id2number(hdr, self.type, self.id)
- elif l == BCF_VL_VAR:
- return '.'
- else:
- return METADATA_LENGTHS[l]
-
- property type:
- """metadata value type"""
- def __get__(self):
- cdef bcf_hdr_t *hdr = self.header.ptr
- if not bcf_hdr_idinfo_exists(hdr, self.type, self.id) or \
- self.type == BCF_HL_FLT:
- return None
- return VALUE_TYPES[bcf_hdr_id2type(hdr, self.type, self.id)]
-
- property description:
- """metadata description (or None if not set)"""
- def __get__(self):
- descr = self.record.get('Description')
- if descr:
- descr = descr.strip('"')
- return force_str(descr)
-
- property record:
- """:class:`VariantHeaderRecord` associated with this
- :class:`VariantMetadata` object"""
- def __get__(self):
- cdef bcf_hdr_t *hdr = self.header.ptr
- if not bcf_hdr_idinfo_exists(hdr, self.type, self.id):
- return None
- cdef bcf_hrec_t *hrec = hdr.id[BCF_DT_ID][self.id].val.hrec[self.type]
- if not hrec:
- return None
- return makeVariantHeaderRecord(self.header, hrec)
-
-
-cdef VariantMetadata makeVariantMetadata(VariantHeader header, int type, int id):
- if not header:
- raise ValueError('invalid VariantHeader')
-
- if type != BCF_HL_FLT and type != BCF_HL_INFO and type != BCF_HL_FMT:
- raise ValueError('invalid metadata type')
-
- if id < 0 or id >= header.ptr.n[BCF_DT_ID]:
- raise ValueError('invalid metadata id')
-
- cdef VariantMetadata meta = VariantMetadata.__new__(VariantMetadata)
- meta.header = header
- meta.type = type
- meta.id = id
-
- return meta
-
-
-cdef class VariantHeaderMetadata(object):
- """mapping from filter, info or format name to :class:`VariantMetadata` object"""
-
- def add(self, id, number, type, description, **kwargs):
- """Add a new filter, info or format record"""
- if id in self:
- raise ValueError('Header already exists for id={}'.format(id))
-
- if self.type == BCF_HL_FLT:
- if number is not None:
- raise ValueError('Number must be None when adding a filter')
- if type is not None:
- raise ValueError('Type must be None when adding a filter')
-
- items = [('ID', id), ('Description', description)]
- else:
- if type not in VALUE_TYPES:
- raise ValueError('unknown type specified: {}'.format(type))
- if number is None:
- number = '.'
-
- items = [('ID', id),
- ('Number', number),
- ('Type', type),
- ('Description', description)]
-
- items += kwargs.items()
- self.header.add_meta(METADATA_TYPES[self.type], items=items)
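-
- # Typical use of add() (a sketch: `header` is an assumed VariantHeader
- # instance and the IDs/descriptions are illustrative):
- #
- # header.filters.add('q10', None, None, 'Quality below 10')
- # header.info.add('DP', 1, 'Integer', 'Total read depth')
- # header.formats.add('GT', 1, 'String', 'Genotype')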
-
- def __len__(self):
- cdef bcf_hdr_t *hdr = self.header.ptr
- cdef bcf_idpair_t *idpair
- cdef int32_t i, n = 0
-
- for i in range(hdr.n[BCF_DT_ID]):
- idpair = hdr.id[BCF_DT_ID] + i
- if idpair.key and idpair.val and idpair.val.info[self.type] & 0xF != 0xF:
- n += 1
- return n
-
- def __bool__(self):
- cdef bcf_hdr_t *hdr = self.header.ptr
- cdef bcf_idpair_t *idpair
- cdef int32_t i
-
- for i in range(hdr.n[BCF_DT_ID]):
- idpair = hdr.id[BCF_DT_ID] + i
- if idpair.key and idpair.val and idpair.val.info[self.type] & 0xF != 0xF:
- return True
- return False
-
- def __getitem__(self, key):
- cdef bcf_hdr_t *hdr = self.header.ptr
- cdef vdict_t *d = <vdict_t *>hdr.dict[BCF_DT_ID]
-
- bkey = force_bytes(key)
- cdef khiter_t k = kh_get_vdict(d, bkey)
-
- if k == kh_end(d) or kh_val_vdict(d, k).info[self.type] & 0xF == 0xF:
- raise KeyError('invalid key: {}'.format(key))
-
- return makeVariantMetadata(self.header, self.type, kh_val_vdict(d, k).id)
-
- def __iter__(self):
- cdef bcf_hdr_t *hdr = self.header.ptr
- cdef bcf_idpair_t *idpair
- cdef int32_t i
-
- for i in range(hdr.n[BCF_DT_ID]):
- idpair = hdr.id[BCF_DT_ID] + i
- if idpair.key and idpair.val and idpair.val.info[self.type] & 0xF != 0xF:
- yield bcf_str_cache_get_charptr(idpair.key)
-
- def get(self, key, default=None):
- """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
- try:
- return self[key]
- except KeyError:
- return default
-
- def __contains__(self, key):
- try:
- self[key]
- except KeyError:
- return False
- else:
- return True
-
- def iterkeys(self):
- """D.iterkeys() -> an iterator over the keys of D"""
- return iter(self)
-
- def itervalues(self):
- """D.itervalues() -> an iterator over the values of D"""
- for key in self:
- yield self[key]
-
- def iteritems(self):
- """D.iteritems() -> an iterator over the (key, value) items of D"""
- for key in self:
- yield (key, self[key])
-
- def keys(self):
- """D.keys() -> list of D's keys"""
- return list(self)
-
- def items(self):
- """D.items() -> list of D's (key, value) pairs, as 2-tuples"""
- return list(self.iteritems())
-
- def values(self):
- """D.values() -> list of D's values"""
- return list(self.itervalues())
-
- # Mappings are not hashable by default, but subclasses can change this
- __hash__ = None
-
- #TODO: implement __richcmp__
-
-
-cdef VariantHeaderMetadata makeVariantHeaderMetadata(VariantHeader header, int32_t type):
- if not header:
- raise ValueError('invalid VariantHeader')
-
- cdef VariantHeaderMetadata meta = VariantHeaderMetadata.__new__(VariantHeaderMetadata)
- meta.header = header
- meta.type = type
-
- return meta
-
-
-cdef class VariantContig(object):
- """contig metadata from a :class:`VariantHeader`"""
-
- property name:
- """contig name"""
- def __get__(self):
- cdef bcf_hdr_t *hdr = self.header.ptr
- return bcf_str_cache_get_charptr(hdr.id[BCF_DT_CTG][self.id].key)
-
- property id:
- """contig internal id number"""
- def __get__(self):
- return self.id
-
- property length:
- """contig length or None if not available"""
- def __get__(self):
- cdef bcf_hdr_t *hdr = self.header.ptr
- cdef uint32_t length = hdr.id[BCF_DT_CTG][self.id].val.info[0]
- return length if length else None
-
- property header:
- """:class:`VariantHeaderRecord` associated with this :class:`VariantContig` object"""
- def __get__(self):
- cdef bcf_hdr_t *hdr = self.header.ptr
- cdef bcf_hrec_t *hrec = hdr.id[BCF_DT_CTG][self.id].val.hrec[0]
- return makeVariantHeaderRecord(self.header, hrec)
-
-
-cdef VariantContig makeVariantContig(VariantHeader header, int id):
- if not header:
- raise ValueError('invalid VariantHeader')
-
- if id < 0 or id >= header.ptr.n[BCF_DT_CTG]:
- raise ValueError('invalid contig id')
-
- cdef VariantContig contig = VariantContig.__new__(VariantContig)
- contig.header = header
- contig.id = id
-
- return contig
-
-
-cdef class VariantHeaderContigs(object):
- """mapping from contig name or index to :class:`VariantContig` object."""
-
- def __len__(self):
- cdef bcf_hdr_t *hdr = self.header.ptr
- assert kh_size(<vdict_t *>hdr.dict[BCF_DT_CTG]) == hdr.n[BCF_DT_CTG]
- return hdr.n[BCF_DT_CTG]
-
- def __bool__(self):
- cdef bcf_hdr_t *hdr = self.header.ptr
- assert kh_size(<vdict_t *>hdr.dict[BCF_DT_CTG]) == hdr.n[BCF_DT_CTG]
- return hdr.n[BCF_DT_CTG] != 0
-
- def __getitem__(self, key):
- cdef bcf_hdr_t *hdr = self.header.ptr
- cdef int index
-
- if isinstance(key, int):
- index = key
- if index < 0 or index >= hdr.n[BCF_DT_CTG]:
- raise IndexError('invalid contig index')
- return makeVariantContig(self.header, index)
-
- cdef vdict_t *d = <vdict_t *>hdr.dict[BCF_DT_CTG]
- bkey = force_bytes(key)
- cdef khiter_t k = kh_get_vdict(d, bkey)
-
- if k == kh_end(d):
- raise KeyError('invalid contig')
-
- cdef int id = kh_val_vdict(d, k).id
-
- return makeVariantContig(self.header, id)
-
- def __iter__(self):
- cdef bcf_hdr_t *hdr = self.header.ptr
- cdef vdict_t *d = <vdict_t *>hdr.dict[BCF_DT_CTG]
- cdef uint32_t n = kh_size(d)
-
- assert n == hdr.n[BCF_DT_CTG]
-
- for i in range(n):
- yield bcf_str_cache_get_charptr(bcf_hdr_id2name(hdr, i))
-
- def get(self, key, default=None):
- """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
- try:
- return self[key]
- except KeyError:
- return default
-
- def __contains__(self, key):
- try:
- self[key]
- except KeyError:
- return False
- else:
- return True
-
- def iterkeys(self):
- """D.iterkeys() -> an iterator over the keys of D"""
- return iter(self)
-
- def itervalues(self):
- """D.itervalues() -> an iterator over the values of D"""
- for key in self:
- yield self[key]
-
- def iteritems(self):
- """D.iteritems() -> an iterator over the (key, value) items of D"""
- for key in self:
- yield (key, self[key])
-
- def keys(self):
- """D.keys() -> list of D's keys"""
- return list(self)
-
- def items(self):
- """D.items() -> list of D's (key, value) pairs, as 2-tuples"""
- return list(self.iteritems())
-
- def values(self):
- """D.values() -> list of D's values"""
- return list(self.itervalues())
-
- # Mappings are not hashable by default, but subclasses can change this
- __hash__ = None
-
- #TODO: implement __richcmp__
-
- def add(self, id, **kwargs):
- """Add a new contig record"""
- if id in self:
- raise ValueError('Header already exists for contig {}'.format(id))
-
- items = [('ID', id)] + list(kwargs.items()) # kwargs.items() is a view in Python 3
- self.header.add_meta('contig', items=items)
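-
- # Typical use (a sketch; the contig name and length are illustrative):
- #
- # header.contigs.add('chr1', length=248956422)
- #
- # Additional keyword arguments become extra attributes on the contig line.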
-
-
-cdef VariantHeaderContigs makeVariantHeaderContigs(VariantHeader header):
- if not header:
- raise ValueError('invalid VariantHeader')
-
- cdef VariantHeaderContigs contigs = VariantHeaderContigs.__new__(VariantHeaderContigs)
- contigs.header = header
-
- return contigs
-
-
-cdef class VariantHeaderSamples(object):
- """sequence of sample names from a :class:`VariantHeader` object"""
-
- def __len__(self):
- return bcf_hdr_nsamples(self.header.ptr)
-
- def __bool__(self):
- return bcf_hdr_nsamples(self.header.ptr) != 0
-
- def __getitem__(self, index):
- cdef bcf_hdr_t *hdr = self.header.ptr
- cdef int32_t n = bcf_hdr_nsamples(hdr)
- cdef int32_t i = index
-
- if i < 0 or i >= n:
- raise IndexError('invalid sample index')
-
- return charptr_to_str(hdr.samples[i])
-
- def __iter__(self):
- cdef bcf_hdr_t *hdr = self.header.ptr
- cdef int32_t i, n = bcf_hdr_nsamples(hdr)
-
- for i in range(n):
- yield charptr_to_str(hdr.samples[i])
-
- def __contains__(self, key):
- cdef bcf_hdr_t *hdr = self.header.ptr
- cdef vdict_t *d = <vdict_t *>hdr.dict[BCF_DT_SAMPLE]
- bkey = force_bytes(key)
- cdef khiter_t k = kh_get_vdict(d, bkey)
-
- return k != kh_end(d)
-
- # Mappings are not hashable by default, but subclasses can change this
- __hash__ = None
-
- #TODO: implement __richcmp__
-
- def add(self, name):
- """Add a new sample"""
- self.header.add_sample(name)
-
-
-cdef VariantHeaderSamples makeVariantHeaderSamples(VariantHeader header):
- if not header:
- raise ValueError('invalid VariantHeader')
-
- cdef VariantHeaderSamples samples = VariantHeaderSamples.__new__(VariantHeaderSamples)
- samples.header = header
-
- return samples
-
-
-cdef class VariantHeader(object):
- """header information for a :class:`VariantFile` object"""
-
- #FIXME: Add structured proxy
- #FIXME: Add generic proxy
- #FIXME: Add mutable methods
-
- # See makeVariantHeader for C constructor
- def __cinit__(self):
- self.ptr = NULL
-
- # Python constructor
- def __init__(self):
- self.ptr = bcf_hdr_init(b'w')
- if not self.ptr:
- raise ValueError('cannot create VariantHeader')
-
- def __dealloc__(self):
- if self.ptr:
- bcf_hdr_destroy(self.ptr)
- self.ptr = NULL
-
- def __bool__(self):
- # self.ptr == NULL should be impossible
- return self.ptr != NULL
-
- def copy(self):
- return makeVariantHeader(bcf_hdr_dup(self.ptr))
-
- property version:
- """VCF version"""
- def __get__(self):
- return force_str(bcf_hdr_get_version(self.ptr))
-
- property samples:
- """samples (:class:`VariantHeaderSamples`)"""
- def __get__(self):
- return makeVariantHeaderSamples(self)
-
- property records:
- """header records (:class:`VariantHeaderRecords`)"""
- def __get__(self):
- return makeVariantHeaderRecords(self)
-
- property contigs:
- """contig information (:class:`VariantHeaderContigs`)"""
- def __get__(self):
- return makeVariantHeaderContigs(self)
-
- property filters:
- """filter metadata (:class:`VariantHeaderMetadata`)"""
- def __get__(self):
- return makeVariantHeaderMetadata(self, BCF_HL_FLT)
-
- property info:
- """info metadata (:class:`VariantHeaderMetadata`)"""
- def __get__(self):
- return makeVariantHeaderMetadata(self, BCF_HL_INFO)
-
- property formats:
- """format metadata (:class:`VariantHeaderMetadata`)"""
- def __get__(self):
- return makeVariantHeaderMetadata(self, BCF_HL_FMT)
-
- property alts:
- """alt metadata (:class:`dict` ID->record).
-
- The data returned just a snapshot of alt records, is created
- every time the property is requested, and modifications will
- not be reflected in the header metadata and vice versa.
-
- i.e. it is just a dict that reflects the state of alt records
- at the time it is created.
-
- """
- def __get__(self):
- return {record['ID']:record for record in self.records
- if record.key.upper() == 'ALT' }
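-
- # Snapshot semantics (a sketch; the ALT ID is illustrative):
- #
- # alts = header.alts # dict built afresh on each access
- # alts.get('DEL') # the ##ALT=<ID=DEL,...> record, if present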
-
-
- # only safe to do when opening an htsfile
- cdef _subset_samples(self, include_samples):
- keep_samples = set(self.samples)
- include_samples = set(include_samples)
- missing_samples = include_samples - keep_samples
- keep_samples &= include_samples
-
- if missing_samples:
- # FIXME: add specialized exception with payload
- raise ValueError(
- 'missing {:d} requested samples'.format(
- len(missing_samples)))
-
- keep_samples = force_bytes(','.join(keep_samples))
- cdef char *keep = <char *>keep_samples if keep_samples else NULL
- cdef int ret = bcf_hdr_set_samples(self.ptr, keep, 0)
-
- if ret != 0:
- raise ValueError(
- 'bcf_hdr_set_samples failed: ret = {}'.format(ret))
-
- def __str__(self):
- cdef int hlen
- cdef char *hstr = bcf_hdr_fmt_text(self.ptr, 0, &hlen)
-
- try:
- return charptr_to_str_w_len(hstr, hlen)
- finally:
- free(hstr)
-
- def add_record(self, VariantHeaderRecord record):
- """Add an existing :class:`VariantHeaderRecord` to this header"""
- cdef bcf_hrec_t *r = record.ptr
-
- if r.type == BCF_HL_GEN:
- self.add_meta(r.key, r.value)
- else:
- items = [(k,v) for k,v in record.attrs if k != 'IDX']
- self.add_meta(r.key, items=items)
-
- def add_line(self, line):
- """Add a metadata line to this header"""
- bline = force_bytes(line)
- if bcf_hdr_append(self.ptr, bline) < 0:
- raise ValueError('invalid header line')
-
- if self.ptr.dirty:
- bcf_hdr_sync(self.ptr)
-
- def add_meta(self, key, value=None, items=None):
- """Add metadata to this header"""
- if not ((value is not None) ^ (items is not None)):
- raise ValueError('either value or items must be specified')
-
- cdef bcf_hrec_t *hrec = <bcf_hrec_t*>calloc(1, sizeof(bcf_hrec_t))
- cdef int quoted
-
- try:
- key = force_bytes(key)
- hrec.key = strdup(key)
-
- if value is not None:
- hrec.value = strdup(force_bytes(value))
- else:
- for key, value in items:
- key = force_bytes(key)
- bcf_hrec_add_key(hrec, key, <int>len(key))
-
- value = force_bytes(str(value))
- quoted = strpbrk(value, ' ;,"\t<>') != NULL
- bcf_hrec_set_val(hrec, hrec.nkeys-1, value, <int>len(value), quoted)
- except:
- bcf_hrec_destroy(hrec)
- raise
-
- bcf_hdr_add_hrec(self.ptr, hrec)
-
- if self.ptr.dirty:
- bcf_hdr_sync(self.ptr)
-
- def add_sample(self, name):
- """Add a new sample to this header"""
- bname = force_bytes(name)
- if bcf_hdr_add_sample(self.ptr, bname) < 0:
- raise ValueError('Duplicated sample name: {}'.format(name))
- if self.ptr.dirty:
- bcf_hdr_sync(self.ptr)
-
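- # Putting the header API together (a sketch; sample name and metadata
- # values are illustrative):
- #
- # header = VariantHeader()
- # header.add_sample('NA00001')
- # header.add_line('##FILTER=<ID=q10,Description="Quality below 10">')
- # header.add_meta('INFO', items=[('ID', 'DP'), ('Number', 1),
- # ('Type', 'Integer'), ('Description', 'Total depth')])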
-
-cdef VariantHeader makeVariantHeader(bcf_hdr_t *hdr):
- if not hdr:
- raise ValueError('cannot create VariantHeader')
-
- cdef VariantHeader header = VariantHeader.__new__(VariantHeader)
- header.ptr = hdr
-
- return header
-
-
-########################################################################
-########################################################################
-## Variant Record objects
-########################################################################
-
-cdef class VariantRecordFilter(object):
- """Filters set on a :class:`VariantRecord` object, presented as a mapping from
- filter index or name to :class:`VariantMetadata` object"""
-
- def __len__(self):
- return self.record.ptr.d.n_flt
-
- def __bool__(self):
- return self.record.ptr.d.n_flt != 0
-
- def __getitem__(self, key):
- cdef bcf_hdr_t *hdr = self.record.header.ptr
- cdef bcf1_t *r = self.record.ptr
- cdef int index, id
- cdef int n = r.d.n_flt
-
- if isinstance(key, int):
- index = key
-
- if index < 0 or index >= n:
- raise IndexError('invalid filter index')
-
- id = r.d.flt[index]
- else:
- if key == '.':
- key = 'PASS'
-
- bkey = force_bytes(key)
- id = bcf_hdr_id2int(hdr, BCF_DT_ID, bkey)
-
- if not bcf_hdr_idinfo_exists(hdr, BCF_HL_FLT, id) \
- or not bcf_has_filter(hdr, self.record.ptr, bkey):
- raise KeyError('Invalid filter')
-
- return makeVariantMetadata(self.record.header, BCF_HL_FLT, id)
-
- def __delitem__(self, key):
- cdef bcf_hdr_t *hdr = self.record.header.ptr
- cdef bcf1_t *r = self.record.ptr
- cdef int index, id
- cdef int n = r.d.n_flt
-
- if isinstance(key, int):
- index = key
-
- if index < 0 or index >= n:
- raise IndexError('invalid filter index')
-
- id = r.d.flt[index]
- else:
- if key == '.':
- key = 'PASS'
-
- bkey = force_bytes(key)
- id = bcf_hdr_id2int(hdr, BCF_DT_ID, bkey)
-
- if not bcf_hdr_idinfo_exists(hdr, BCF_HL_FLT, id) \
- or not bcf_has_filter(hdr, self.record.ptr, bkey):
- raise KeyError('Invalid filter')
-
- bcf_remove_filter(hdr, r, id, 0)
-
- def clear(self):
- """Clear all filters"""
- cdef bcf1_t *r = self.record.ptr
- r.d.shared_dirty |= BCF1_DIRTY_FLT
- r.d.n_flt = 0
-
- def __iter__(self):
- cdef bcf_hdr_t *hdr = self.record.header.ptr
- cdef bcf1_t *r = self.record.ptr
- cdef int i
-
- for i in range(r.d.n_flt):
- yield bcf_str_cache_get_charptr(bcf_hdr_int2id(hdr, BCF_DT_ID, r.d.flt[i]))
-
- def get(self, key, default=None):
- """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
- try:
- return self[key]
- except KeyError:
- return default
-
- def __contains__(self, key):
- cdef bcf_hdr_t *hdr = self.record.header.ptr
- cdef bcf1_t *r = self.record.ptr
- bkey = force_bytes(key)
- return bcf_has_filter(hdr, r, bkey) == 1
-
- def iterkeys(self):
- """D.iterkeys() -> an iterator over the keys of D"""
- return iter(self)
-
- def itervalues(self):
- """D.itervalues() -> an iterator over the values of D"""
- for key in self:
- yield self[key]
-
- def iteritems(self):
- """D.iteritems() -> an iterator over the (key, value) items of D"""
- for key in self:
- yield (key, self[key])
-
- def keys(self):
- """D.keys() -> list of D's keys"""
- return list(self)
-
- def items(self):
- """D.items() -> list of D's (key, value) pairs, as 2-tuples"""
- return list(self.iteritems())
-
- def values(self):
- """D.values() -> list of D's values"""
- return list(self.itervalues())
-
- # Mappings are not hashable by default, but subclasses can change this
- __hash__ = None
-
- #TODO: implement __richcmp__
-
-
-cdef VariantRecordFilter makeVariantRecordFilter(VariantRecord record):
- if not record:
- raise ValueError('invalid VariantRecord')
-
- cdef VariantRecordFilter filter = VariantRecordFilter.__new__(VariantRecordFilter)
- filter.record = record
-
- return filter
-
-
-cdef class VariantRecordFormat(object):
- """Format data present for each sample in a :class:`VariantRecord` object,
- presented as a mapping from format name to :class:`VariantMetadata` object."""
-
- def __len__(self):
- cdef bcf_hdr_t *hdr = self.record.header.ptr
- cdef bcf1_t *r = self.record.ptr
- cdef int i, n = 0
-
- for i in range(r.n_fmt):
- if r.d.fmt[i].p:
- n += 1
- return n
-
- def __bool__(self):
- cdef bcf_hdr_t *hdr = self.record.header.ptr
- cdef bcf1_t *r = self.record.ptr
- cdef int i
-
- for i in range(r.n_fmt):
- if r.d.fmt[i].p:
- return True
- return False
-
- def __getitem__(self, key):
- cdef bcf_hdr_t *hdr = self.record.header.ptr
- cdef bcf1_t *r = self.record.ptr
-
- bkey = force_bytes(key)
- cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey)
-
- if not fmt or not fmt.p:
- raise KeyError('unknown format')
-
- return makeVariantMetadata(self.record.header, BCF_HL_FMT, fmt.id)
-
- def __delitem__(self, key):
- cdef bcf_hdr_t *hdr = self.record.header.ptr
- cdef bcf1_t *r = self.record.ptr
-
- bkey = force_bytes(key)
- cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey)
-
- if not fmt or not fmt.p:
- raise KeyError('unknown format')
-
- if bcf_update_format(hdr, r, bkey, fmt.p, 0, fmt.type) < 0:
- raise ValueError('Unable to delete FORMAT')
-
- def clear(self):
- """Clear all formats for all samples within the associated
- :class:`VariantRecord` instance"""
- cdef bcf_hdr_t *hdr = self.record.header.ptr
- cdef bcf1_t *r = self.record.ptr
- cdef bcf_fmt_t *fmt
- cdef const char *key
- cdef int i
-
- for i in reversed(range(r.n_fmt)):
- fmt = &r.d.fmt[i]
- if fmt.p:
- key = bcf_hdr_int2id(hdr, BCF_DT_ID, fmt.id)
- if bcf_update_format(hdr, r, key, fmt.p, 0, fmt.type) < 0:
- raise ValueError('Unable to delete FORMAT')
-
- def __iter__(self):
- cdef bcf_hdr_t *hdr = self.record.header.ptr
- cdef bcf1_t *r = self.record.ptr
- cdef bcf_fmt_t *fmt
- cdef int i
-
- for i in range(r.n_fmt):
- fmt = &r.d.fmt[i]
- if fmt.p:
- yield bcf_str_cache_get_charptr(bcf_hdr_int2id(hdr, BCF_DT_ID, fmt.id))
-
- def get(self, key, default=None):
- """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
- try:
- return self[key]
- except KeyError:
- return default
-
- def __contains__(self, key):
- cdef bcf_hdr_t *hdr = self.record.header.ptr
- cdef bcf1_t *r = self.record.ptr
- bkey = force_bytes(key)
- cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey)
- return fmt != NULL and fmt.p != NULL
-
- def iterkeys(self):
- """D.iterkeys() -> an iterator over the keys of D"""
- return iter(self)
-
- def itervalues(self):
- """D.itervalues() -> an iterator over the values of D"""
- for key in self:
- yield self[key]
-
- def iteritems(self):
- """D.iteritems() -> an iterator over the (key, value) items of D"""
- for key in self:
- yield (key, self[key])
-
- def keys(self):
- """D.keys() -> list of D's keys"""
- return list(self)
-
- def items(self):
- """D.items() -> list of D's (key, value) pairs, as 2-tuples"""
- return list(self.iteritems())
-
- def values(self):
- """D.values() -> list of D's values"""
- return list(self.itervalues())
-
- # Mappings are not hashable by default, but subclasses can change this
- __hash__ = None
-
- #TODO: implement __richcmp__
-
-
-cdef VariantRecordFormat makeVariantRecordFormat(VariantRecord record):
- if not record:
- raise ValueError('invalid VariantRecord')
-
- cdef VariantRecordFormat format = VariantRecordFormat.__new__(
- VariantRecordFormat)
- format.record = record
-
- return format
-
-
-#TODO: Add a getmeta method to return the corresponding VariantMetadata?
-cdef class VariantRecordInfo(object):
- """Info data stored in a :class:`VariantRecord` object, presented as a
- mapping from info metadata name to value."""
-
- def __len__(self):
- return self.record.ptr.n_info
-
- def __bool__(self):
- return self.record.ptr.n_info != 0
-
- def __getitem__(self, key):
- cdef bcf_hdr_t *hdr = self.record.header.ptr
- cdef bcf1_t *r = self.record.ptr
- cdef vdict_t *d
- cdef khiter_t k
- cdef int info_id
-
- if bcf_unpack(r, BCF_UN_INFO) < 0:
- raise ValueError('Error unpacking VariantRecord')
-
- bkey = force_bytes(key)
- cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey)
-
- if not info:
- d = <vdict_t *>hdr.dict[BCF_DT_ID]
- k = kh_get_vdict(d, bkey)
-
- if k == kh_end(d) or kh_val_vdict(d, k).info[BCF_HL_INFO] & 0xF == 0xF:
- raise KeyError('Unknown INFO field: {}'.format(key))
-
- info_id = kh_val_vdict(d, k).id
- else:
- info_id = info.key
-
- if bcf_hdr_id2type(hdr, BCF_HL_INFO, info_id) == BCF_HT_FLAG:
- return info != NULL and info.vptr != NULL
-
- if not info or not info.vptr:
- raise KeyError('Invalid INFO field: {}'.format(key))
-
- return bcf_info_get_value(self.record, info)
-
- def __setitem__(self, key, value):
- bcf_info_set_value(self.record, key, value)
-
- def __delitem__(self, key):
- cdef bcf_hdr_t *hdr = self.record.header.ptr
- cdef bcf1_t *r = self.record.ptr
-
- if bcf_unpack(r, BCF_UN_INFO) < 0:
- raise ValueError('Error unpacking VariantRecord')
-
- bkey = force_bytes(key)
- cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey)
-
- if not info or not info.vptr:
- raise KeyError('Unknown INFO field: {}'.format(key))
-
- if bcf_update_info(hdr, r, bkey, NULL, 0, info.type) < 0:
- raise ValueError('Unable to delete INFO')
-
- def clear(self):
- """Clear all info data"""
- cdef bcf_hdr_t *hdr = self.record.header.ptr
- cdef bcf1_t *r = self.record.ptr
- cdef bcf_info_t *info
- cdef const char *key
- cdef int i
-
- if bcf_unpack(r, BCF_UN_INFO) < 0:
- raise ValueError('Error unpacking VariantRecord')
-
- for i in range(r.n_info):
- info = &r.d.info[i]
- if info and info.vptr:
- key = bcf_hdr_int2id(hdr, BCF_DT_ID, info.key)
- if bcf_update_info(hdr, r, key, NULL, 0, info.type) < 0:
- raise ValueError('Unable to delete INFO')
-
- def __iter__(self):
- cdef bcf_hdr_t *hdr = self.record.header.ptr
- cdef bcf1_t *r = self.record.ptr
- cdef bcf_info_t *info
- cdef int i
-
- for i in range(r.n_info):
- info = &r.d.info[i]
- if info and info.vptr:
- yield bcf_str_cache_get_charptr(bcf_hdr_int2id(hdr, BCF_DT_ID, info.key))
-
- def get(self, key, default=None):
- """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
- try:
- return self[key]
- except KeyError:
- return default
-
- def __contains__(self, key):
- cdef bcf_hdr_t *hdr = self.record.header.ptr
- cdef bcf1_t *r = self.record.ptr
-
- if bcf_unpack(r, BCF_UN_INFO) < 0:
- raise ValueError('Error unpacking VariantRecord')
-
- bkey = force_bytes(key)
- cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey)
-
- return info != NULL
-
- def iterkeys(self):
- """D.iterkeys() -> an iterator over the keys of D"""
- return iter(self)
-
- def itervalues(self):
- """D.itervalues() -> an iterator over the values of D"""
- cdef bcf1_t *r = self.record.ptr
- cdef bcf_info_t *info
- cdef int i
-
- for i in range(r.n_info):
- info = &r.d.info[i]
- if info and info.vptr:
- yield bcf_info_get_value(self.record, info)
-
- def iteritems(self):
- """D.iteritems() -> an iterator over the (key, value) items of D"""
- cdef bcf_hdr_t *hdr = self.record.header.ptr
- cdef bcf1_t *r = self.record.ptr
- cdef bcf_info_t *info
- cdef int i
-
- for i in range(r.n_info):
- info = &r.d.info[i]
- if info and info.vptr:
- key = bcf_hdr_int2id(hdr, BCF_DT_ID, info.key)
- value = bcf_info_get_value(self.record, info)
- yield bcf_str_cache_get_charptr(key), value
-
- def keys(self):
- """D.keys() -> list of D's keys"""
- return list(self)
-
- def items(self):
- """D.items() -> list of D's (key, value) pairs, as 2-tuples"""
- return list(self.iteritems())
-
- def values(self):
- """D.values() -> list of D's values"""
- return list(self.itervalues())
-
- # Mappings are not hashable by default, but subclasses can change this
- __hash__ = None
-
- #TODO: implement __richcmp__
-
-
-cdef VariantRecordInfo makeVariantRecordInfo(VariantRecord record):
- if not record:
- raise ValueError('invalid VariantRecord')
-
- cdef VariantRecordInfo info = VariantRecordInfo.__new__(VariantRecordInfo)
- info.record = record
-
- return info
-
-
-cdef class VariantRecordSamples(object):
- """mapping from sample index or name to :class:`VariantRecordSample` object."""
-
- def __len__(self):
- return bcf_hdr_nsamples(self.record.header.ptr)
-
- def __bool__(self):
- return bcf_hdr_nsamples(self.record.header.ptr) != 0
-
- def __getitem__(self, key):
- cdef bcf_hdr_t *hdr = self.record.header.ptr
- cdef bcf1_t *r = self.record.ptr
- cdef int n = bcf_hdr_nsamples(hdr)
- cdef int sample_index
- cdef vdict_t *d
- cdef khiter_t k
-
- if isinstance(key, int):
- sample_index = key
- else:
- bkey = force_bytes(key)
- sample_index = bcf_hdr_id2int(hdr, BCF_DT_SAMPLE, bkey)
- if sample_index < 0:
- raise KeyError('invalid sample name')
-
- if sample_index < 0 or sample_index >= n:
- raise IndexError('invalid sample index')
-
- return makeVariantRecordSample(self.record, sample_index)
-
- def __iter__(self):
- cdef bcf_hdr_t *hdr = self.record.header.ptr
- cdef bcf1_t *r = self.record.ptr
- cdef int32_t i, n = bcf_hdr_nsamples(hdr)
-
- for i in range(n):
- yield charptr_to_str(hdr.samples[i])
-
- def get(self, key, default=None):
- """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
- try:
- return self[key]
- except KeyError:
- return default
-
- def __contains__(self, key):
- cdef bcf_hdr_t *hdr = self.record.header.ptr
- cdef bcf1_t *r = self.record.ptr
- cdef int n = bcf_hdr_nsamples(hdr)
- cdef int sample_index
- cdef vdict_t *d
- cdef khiter_t k
-
- if isinstance(key, int):
- sample_index = key
- else:
- bkey = force_bytes(key)
- sample_index = bcf_hdr_id2int(hdr, BCF_DT_SAMPLE, bkey)
- if sample_index < 0:
- return False # unknown sample names are simply not contained
-
- return 0 <= sample_index < n
-
- def iterkeys(self):
- """D.iterkeys() -> an iterator over the keys of D"""
- return iter(self)
-
- def itervalues(self):
- """D.itervalues() -> an iterator over the values of D"""
- cdef bcf_hdr_t *hdr = self.record.header.ptr
- cdef bcf1_t *r = self.record.ptr
- cdef int32_t i, n = bcf_hdr_nsamples(hdr)
-
- for i in range(n):
- yield makeVariantRecordSample(self.record, i)
-
- def iteritems(self):
- """D.iteritems() -> an iterator over the (key, value) items of D"""
- cdef bcf_hdr_t *hdr = self.record.header.ptr
- cdef bcf1_t *r = self.record.ptr
- cdef int32_t i, n = bcf_hdr_nsamples(hdr)
-
- for i in range(n):
- yield (charptr_to_str(hdr.samples[i]), makeVariantRecordSample(self.record, i))
-
- def keys(self):
- """D.keys() -> list of D's keys"""
- return list(self)
-
- def items(self):
- """D.items() -> list of D's (key, value) pairs, as 2-tuples"""
- return list(self.iteritems())
-
- def values(self):
- """D.values() -> list of D's values"""
- return list(self.itervalues())
-
- # Mappings are not hashable by default, but subclasses can change this
- __hash__ = None
-
- #TODO: implement __richcmp__
-
-
-cdef VariantRecordSamples makeVariantRecordSamples(VariantRecord record):
- if not record:
- raise ValueError('invalid VariantRecord')
-
- cdef VariantRecordSamples samples = VariantRecordSamples.__new__(
- VariantRecordSamples)
- samples.record = record
-
- return samples
-
-
-cdef class VariantRecord(object):
- """Variant record"""
-
- def __dealloc__(self):
- if self.ptr:
- bcf_destroy1(self.ptr)
- self.ptr = NULL
-
- property rid:
- """internal reference id number"""
- def __get__(self):
- return self.ptr.rid
- def __set__(self, rid):
- cdef bcf_hdr_t *hdr = self.header.ptr
- cdef int r = rid
- if r < 0 or r >= hdr.n[BCF_DT_CTG] or not hdr.id[BCF_DT_CTG][r].val:
- raise ValueError('invalid reference id')
- self.ptr.rid = r
-
- property chrom:
- """chromosome/contig name"""
- def __get__(self):
- return bcf_str_cache_get_charptr(bcf_hdr_id2name(self.header.ptr, self.ptr.rid))
- def __set__(self, chrom):
- cdef vdict_t *d = <vdict_t*>self.header.ptr.dict[BCF_DT_CTG]
- bchrom = force_bytes(chrom)
- cdef khint_t k = kh_get_vdict(d, bchrom)
- if k == kh_end(d):
- raise ValueError('Invalid chromosome/contig')
- self.ptr.rid = kh_val_vdict(d, k).id
-
- property contig:
- """chromosome/contig name"""
- def __get__(self):
- return bcf_str_cache_get_charptr(bcf_hdr_id2name(self.header.ptr, self.ptr.rid))
- def __set__(self, chrom):
- cdef vdict_t *d = <vdict_t*>self.header.ptr.dict[BCF_DT_CTG]
- bchrom = force_bytes(chrom)
- cdef khint_t k = kh_get_vdict(d, bchrom)
- if k == kh_end(d):
- raise ValueError('Invalid chromosome/contig')
- self.ptr.rid = kh_val_vdict(d, k).id
-
- property pos:
- """record start position on chrom/contig (1-based inclusive)"""
- def __get__(self):
- return self.ptr.pos + 1
- def __set__(self, pos):
- if pos < 1:
- raise ValueError('Position must be positive')
- # FIXME: check start <= stop?
- # KBJ: Can't or else certain mutating operations will become
- # difficult or impossible. e.g. having to delete
- # info['END'] before being able to reset pos is going to
- # create subtle bugs. Better to check this when writing
- # records.
- self.ptr.pos = pos - 1
-
- property start:
- """record start position on chrom/contig (0-based inclusive)"""
- def __get__(self):
- return self.ptr.pos
- def __set__(self, start):
- if start < 0:
- raise ValueError('Start coordinate must be non-negative')
- # FIXME: check start <= stop?
- # KBJ: See above.
- self.ptr.pos = start
-
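- # Coordinate conventions (worked example): pos is 1-based inclusive and
- # start is its 0-based twin, so rec.pos == rec.start + 1 always holds;
- # setting rec.start = 99 makes rec.pos report 100. The stop/rlen
- # properties below keep stop == start + rlen (0-based exclusive).
-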
- property stop:
- """record stop position on chrom/contig (0-based exclusive)"""
- def __get__(self):
- return self.ptr.pos + self.ptr.rlen
- def __set__(self, stop):
- if stop < self.ptr.pos:
- raise ValueError('Stop coordinate must be greater than or equal to start')
- self.ptr.rlen = stop - self.ptr.pos
-
- property rlen:
- """record length on chrom/contig (typically rec.stop - rec.start unless END info is supplied)"""
- def __get__(self):
- return self.ptr.rlen
- def __set__(self, rlen):
- if rlen < 0:
- raise ValueError('Reference length must be non-negative')
- self.ptr.rlen = rlen
-
- property qual:
- """phred scaled quality score or None if not available"""
- def __get__(self):
- return self.ptr.qual if not bcf_float_is_missing(self.ptr.qual) else None
- def __set__(self, qual):
- if qual is not None:
- self.ptr.qual = qual
- else:
- bcf_float_set(&self.ptr.qual, bcf_float_missing)
-
-# property n_allele:
-# def __get__(self):
-# return self.ptr.n_allele
-
-# property n_sample:
-# def __get__(self):
-# return self.ptr.n_sample
-
- property id:
- """record identifier or None if not available"""
- def __get__(self):
- cdef bcf1_t *r = self.ptr
- if bcf_unpack(r, BCF_UN_STR) < 0:
- raise ValueError('Error unpacking VariantRecord')
- return bcf_str_cache_get_charptr(r.d.id) if r.d.id != b'.' else None
- def __set__(self, id):
- cdef bcf1_t *r = self.ptr
- if bcf_unpack(r, BCF_UN_STR) < 0:
- raise ValueError('Error unpacking VariantRecord')
- cdef char *idstr = NULL
- if id is not None:
- bid = force_bytes(id)
- idstr = bid
- if bcf_update_id(self.header.ptr, self.ptr, idstr) < 0:
- raise ValueError('Error updating id')
-
- property ref:
- """reference allele"""
- def __get__(self):
- cdef bcf1_t *r = self.ptr
- if bcf_unpack(r, BCF_UN_STR) < 0:
- raise ValueError('Error unpacking VariantRecord')
- return charptr_to_str(r.d.allele[0]) if r.d.allele else None
- def __set__(self, ref):
- cdef bcf1_t *r = self.ptr
- if bcf_unpack(r, BCF_UN_STR) < 0:
- raise ValueError('Error unpacking VariantRecord')
- #FIXME: Set alleles directly -- this is stupid
- if not ref:
- raise ValueError('ref allele cannot be null')
- ref = force_bytes(ref)
- if r.d.allele and r.n_allele:
- alleles = [r.d.allele[i] for i in range(r.n_allele)]
- alleles[0] = ref
- else:
- alleles = [ref]
- self.alleles = alleles
-
- property alleles:
- """tuple of reference allele followed by alt alleles"""
- def __get__(self):
- cdef bcf1_t *r = self.ptr
- if bcf_unpack(r, BCF_UN_STR) < 0:
- raise ValueError('Error unpacking VariantRecord')
- if not r.d.allele:
- return None
- cdef tuple res = PyTuple_New(r.n_allele)
- for i in range(r.n_allele):
- a = charptr_to_str(r.d.allele[i])
- PyTuple_SET_ITEM(res, i, a)
- Py_INCREF(a)
- return res
- def __set__(self, values):
- cdef bcf1_t *r = self.ptr
- if bcf_unpack(r, BCF_UN_STR) < 0:
- raise ValueError('Error unpacking VariantRecord')
- values = [force_bytes(v) for v in values]
- if b'' in values:
- raise ValueError('cannot set null allele')
- values = b','.join(values)
- if bcf_update_alleles_str(self.header.ptr, r, values) < 0:
- raise ValueError('Error updating alleles')
-
- property alts:
- """tuple of alt alleles"""
- def __get__(self):
- cdef bcf1_t *r = self.ptr
- if bcf_unpack(r, BCF_UN_STR) < 0:
- raise ValueError('Error unpacking VariantRecord')
- if r.n_allele < 2 or not r.d.allele:
- return None
- cdef tuple res = PyTuple_New(r.n_allele - 1)
- for i in range(1, r.n_allele):
- a = charptr_to_str(r.d.allele[i])
- PyTuple_SET_ITEM(res, i - 1, a)
- Py_INCREF(a)
- return res
- def __set__(self, values):
- #FIXME: Set alleles directly -- this is stupid
- cdef bcf1_t *r = self.ptr
- if bcf_unpack(r, BCF_UN_STR) < 0:
- raise ValueError('Error unpacking VariantRecord')
- values = [force_bytes(v) for v in values]
- if b'' in values:
- raise ValueError('cannot set null alt allele')
- ref = [r.d.allele[0] if r.d.allele and r.n_allele else b'.']
- self.alleles = ref + values
-
- property filter:
- """filter information (see :class:`VariantRecordFilter`)"""
- def __get__(self):
- if bcf_unpack(self.ptr, BCF_UN_FLT) < 0:
- raise ValueError('Error unpacking VariantRecord')
- return makeVariantRecordFilter(self)
-
- property info:
- """info data (see :class:`VariantRecordInfo`)"""
- def __get__(self):
- if bcf_unpack(self.ptr, BCF_UN_INFO) < 0:
- raise ValueError('Error unpacking VariantRecord')
- return makeVariantRecordInfo(self)
-
- property format:
- """sample format metadata (see :class:`VariantRecordFormat`)"""
- def __get__(self):
- if bcf_unpack(self.ptr, BCF_UN_FMT) < 0:
- raise ValueError('Error unpacking VariantRecord')
- return makeVariantRecordFormat(self)
-
- property samples:
- """sample data (see :class:`VariantRecordSamples`)"""
- def __get__(self):
- if bcf_unpack(self.ptr, BCF_UN_ALL) < 0:
- raise ValueError('Error unpacking VariantRecord')
- return makeVariantRecordSamples(self)
-
- def __str__(self):
- cdef kstring_t line
- cdef char c
-
- line.l = line.m = 0
- line.s = NULL
-
- if vcf_format(self.header.ptr, self.ptr, &line) < 0:
- if line.m:
- free(line.s)
- raise ValueError('vcf_format failed')
-
- # Strip CR/LF?
- #while line.l:
- # c = line.s[line.l - 1]
- # if c != b'\n' and c != b'\r':
- # break
- # line.l -= 1
-
- ret = charptr_to_str_w_len(line.s, line.l)
-
- if line.m:
- free(line.s)
-
- return ret
-
-
-cdef VariantRecord makeVariantRecord(VariantHeader header, bcf1_t *r):
- if not header:
- raise ValueError('invalid VariantHeader')
-
- if not r:
- raise ValueError('cannot create VariantRecord')
-
- cdef VariantRecord record = VariantRecord.__new__(VariantRecord)
- record.header = header
- record.ptr = r
-
- return record
-
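-# A minimal, illustrative sketch (hypothetical filename) of the coordinate
-# conventions documented above: ``pos`` is 1-based while ``start``/``stop``
-# are 0-based half-open, and ``rlen`` matches ``stop - start`` unless an
-# INFO/END field overrides it:
-def _example_coordinates(path='example.vcf.gz'):
-    vcf = VariantFile(path)
-    rec = next(vcf)
-    assert rec.pos == rec.start + 1
-    assert rec.stop == rec.start + rec.rlen
-    return rec.chrom, rec.pos, rec.ref, rec.alts
-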
-
-########################################################################
-########################################################################
-## Variant Sample objects
-########################################################################
-
-
-cdef class VariantRecordSample(object):
- """Data for a single sample from a :class:`VariantRecord` object.
- Provides data accessors for genotypes and a mapping interface
- from format name to values.
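-
-    Example (an illustrative sketch; the sample name and values are
-    hypothetical)::
-
-        sample = record.samples['NA00001']
-        sample['GT'] = (0, 1)        # genotype as allele indices
-        sample.phased = True
-        print(sample.allele_indices, sample.alleles)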
- """
-
- property name:
- """sample name"""
- def __get__(self):
- cdef bcf_hdr_t *hdr = self.record.header.ptr
- cdef bcf1_t *r = self.record.ptr
- cdef int32_t n = bcf_hdr_nsamples(hdr)
-
- if self.index < 0 or self.index >= n:
- raise ValueError('invalid sample index')
-
- return charptr_to_str(hdr.samples[self.index])
-
- property allele_indices:
- """allele indices for called genotype, if present. Otherwise None"""
- def __get__(self):
- return bcf_format_get_allele_indices(self)
- def __set__(self, values):
- self['GT'] = values
- def __del__(self):
- self['GT'] = ()
-
- property alleles:
- """alleles for called genotype, if present. Otherwise None"""
- def __get__(self):
- return bcf_format_get_alleles(self)
- def __set__(self, values):
- self['GT'] = values
- def __del__(self):
- self['GT'] = ()
-
- property phased:
- """False if genotype is missing or any allele is unphased. Otherwise True."""
- def __get__(self):
- return bcf_sample_get_phased(self)
- def __set__(self, value):
- bcf_sample_set_phased(self, value)
-
- def __len__(self):
- cdef bcf_hdr_t *hdr = self.record.header.ptr
- cdef bcf1_t *r = self.record.ptr
- cdef int i, n = 0
-
- if bcf_unpack(r, BCF_UN_FMT) < 0:
- raise ValueError('Error unpacking VariantRecord')
-
- for i in range(r.n_fmt):
- if r.d.fmt[i].p:
- n += 1
- return n
-
- def __bool__(self):
- cdef bcf_hdr_t *hdr = self.record.header.ptr
- cdef bcf1_t *r = self.record.ptr
- cdef int i
-
- if bcf_unpack(r, BCF_UN_FMT) < 0:
- raise ValueError('Error unpacking VariantRecord')
-
- for i in range(r.n_fmt):
- if r.d.fmt[i].p:
- return True
- return False
-
- def __getitem__(self, key):
- return bcf_format_get_value(self, key)
-
- def __setitem__(self, key, value):
- bcf_format_set_value(self, key, value)
-
- def __delitem__(self, key):
- bcf_format_del_value(self, key)
-
- def clear(self):
- """Clear all format data (including genotype) for this sample"""
- cdef bcf_hdr_t *hdr = self.record.header.ptr
- cdef bcf1_t *r = self.record.ptr
- cdef bcf_fmt_t *fmt
- cdef int i
-
- for i in range(r.n_fmt):
- fmt = &r.d.fmt[i]
- if fmt.p:
- bcf_format_del_value(self, bcf_hdr_int2id(hdr, BCF_DT_ID, fmt.id))
-
- def __iter__(self):
- cdef bcf_hdr_t *hdr = self.record.header.ptr
- cdef bcf1_t *r = self.record.ptr
- cdef bcf_fmt_t *fmt
- cdef int i
-
- for i in range(r.n_fmt):
- fmt = &r.d.fmt[i]
- if r.d.fmt[i].p:
- yield bcf_str_cache_get_charptr(bcf_hdr_int2id(hdr, BCF_DT_ID, fmt.id))
-
- def get(self, key, default=None):
- """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
- try:
- return self[key]
- except KeyError:
- return default
-
- def __contains__(self, key):
- cdef bcf_hdr_t *hdr = self.record.header.ptr
- cdef bcf1_t *r = self.record.ptr
- bkey = force_bytes(key)
- cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey)
- return fmt != NULL and fmt.p != NULL
-
- def iterkeys(self):
- """D.iterkeys() -> an iterator over the keys of D"""
- return iter(self)
-
- def itervalues(self):
- """D.itervalues() -> an iterator over the values of D"""
- for key in self:
- yield self[key]
-
- def iteritems(self):
- """D.iteritems() -> an iterator over the (key, value) items of D"""
- for key in self:
- yield (key, self[key])
-
- def keys(self):
- """D.keys() -> list of D's keys"""
- return list(self)
-
- def items(self):
- """D.items() -> list of D's (key, value) pairs, as 2-tuples"""
- return list(self.iteritems())
-
- def values(self):
- """D.values() -> list of D's values"""
- return list(self.itervalues())
-
- # Mappings are not hashable by default, but subclasses can change this
- __hash__ = None
-
- #TODO: implement __richcmp__
-
-
-cdef VariantRecordSample makeVariantRecordSample(VariantRecord record, int32_t sample_index):
- if not record or sample_index < 0:
- raise ValueError('cannot create VariantRecordSample')
-
- cdef VariantRecordSample sample = VariantRecordSample.__new__(VariantRecordSample)
- sample.record = record
- sample.index = sample_index
-
- return sample
-
-
-########################################################################
-########################################################################
-## Index objects
-########################################################################
-
-
-cdef class BaseIndex(object):
- def __init__(self):
- self.refs = ()
-        self.refmap = {}
-
- def __len__(self):
- return len(self.refs)
-
- def __bool__(self):
- return len(self.refs) != 0
-
- def __getitem__(self, key):
- if isinstance(key, int):
- return self.refs[key]
- else:
- return self.refmap[key]
-
- def __iter__(self):
- return iter(self.refs)
-
- def get(self, key, default=None):
- """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
- try:
- return self[key]
- except KeyError:
- return default
-
- def __contains__(self, key):
- try:
- self[key]
- except KeyError:
- return False
- else:
- return True
-
- def iterkeys(self):
- """D.iterkeys() -> an iterator over the keys of D"""
- return iter(self)
-
- def itervalues(self):
- """D.itervalues() -> an iterator over the values of D"""
- for key in self:
- yield self[key]
-
- def iteritems(self):
- """D.iteritems() -> an iterator over the (key, value) items of D"""
- for key in self:
- yield (key, self[key])
-
- def keys(self):
- """D.keys() -> list of D's keys"""
- return list(self)
-
- def items(self):
- """D.items() -> list of D's (key, value) pairs, as 2-tuples"""
- return list(self.iteritems())
-
- def values(self):
- """D.values() -> list of D's values"""
- return list(self.itervalues())
-
- # Mappings are not hashable by default, but subclasses can change this
- __hash__ = None
-
- #TODO: implement __richcmp__
-
-
-cdef class BCFIndex(object):
- """CSI index data structure for BCF files"""
- def __init__(self):
- self.refs = ()
- self.refmap = {}
-
- if not self.ptr:
- raise ValueError('Invalid index object')
-
- cdef int n
- cdef const char **refs = bcf_index_seqnames(self.ptr, self.header.ptr, &n)
-
- if not refs:
- raise ValueError('Cannot retrieve reference sequence names')
-
- self.refs = char_array_to_tuple(refs, n, free_after=1)
- self.refmap = { r:i for i,r in enumerate(self.refs) }
-
- def __dealloc__(self):
- if self.ptr:
- hts_idx_destroy(self.ptr)
- self.ptr = NULL
-
- def fetch(self, bcf, contig, start, stop, region, reopen):
- return BCFIterator(bcf, contig, start, stop, region, reopen)
-
-
-cdef BCFIndex makeBCFIndex(VariantHeader header, hts_idx_t *idx):
- if not idx:
- return None
-
- if not header:
- raise ValueError('invalid VariantHeader')
-
- cdef BCFIndex index = BCFIndex.__new__(BCFIndex)
- index.header = header
- index.ptr = idx
- index.__init__()
-
- return index
-
-
-cdef class TabixIndex(BaseIndex):
- """Tabix index data structure for VCF files"""
- def __init__(self):
- self.refs = ()
- self.refmap = {}
-
- if not self.ptr:
- raise ValueError('Invalid index object')
-
- cdef int n
- cdef const char **refs = tbx_seqnames(self.ptr, &n)
-
- if not refs:
- raise ValueError('Cannot retrieve reference sequence names')
-
- self.refs = char_array_to_tuple(refs, n, free_after=1)
- self.refmap = { r:i for i,r in enumerate(self.refs) }
-
- def __dealloc__(self):
- if self.ptr:
- tbx_destroy(self.ptr)
- self.ptr = NULL
-
- def fetch(self, bcf, contig, start, stop, region, reopen):
- return TabixIterator(bcf, contig, start, stop, region, reopen)
-
-
-cdef TabixIndex makeTabixIndex(tbx_t *idx):
- if not idx:
- return None
-
- cdef TabixIndex index = TabixIndex.__new__(TabixIndex)
- index.ptr = idx
- index.__init__()
-
- return index
-
-
-########################################################################
-########################################################################
-## Iterators
-########################################################################
-
-
-cdef class BaseIterator(object):
- pass
-
-
-# Internal function to clean up after iteration stop or failure.
-# This would be a nested function if it weren't a cdef function.
-cdef void _stop_BCFIterator(BCFIterator self, bcf1_t *record):
- bcf_destroy1(record)
-
- # destroy iter so future calls to __next__ raise StopIteration
- bcf_itr_destroy(self.iter)
- self.iter = NULL
-
-
-cdef class BCFIterator(BaseIterator):
- def __init__(self, VariantFile bcf, contig=None, start=None, stop=None, region=None, reopen=True):
-
- if not isinstance(bcf.index, BCFIndex):
- raise ValueError('bcf index required')
-
- cdef BCFIndex index = bcf.index
- cdef int rid, cstart, cstop
- cdef char *cregion
-
- if not index:
- raise ValueError('bcf index required')
-
- if reopen:
- bcf = bcf.copy()
-
- if region is not None:
- if contig is not None or start is not None or stop is not None:
-                raise ValueError('cannot specify both region and contig/start/stop')
-
- bregion = force_bytes(region)
- cregion = bregion
- with nogil:
- self.iter = bcf_itr_querys(index.ptr, bcf.header.ptr, cregion)
- else:
- if contig is None:
-                raise ValueError('a contig or region must be specified')
-
- try:
- rid = index.refmap[contig]
- except KeyError:
-                raise ValueError('Unknown contig specified')
-
- if start is None:
- start = 0
- if stop is None:
- stop = MAX_POS
-
- cstart, cstop = start, stop
-
- with nogil:
- self.iter = bcf_itr_queryi(index.ptr, rid, cstart, cstop)
-
- # Do not fail on self.iter == NULL, since it signifies a null query.
-
- self.bcf = bcf
- self.index = index
-
- def __dealloc__(self):
- if self.iter:
- bcf_itr_destroy(self.iter)
- self.iter = NULL
-
- def __iter__(self):
- return self
-
- def __next__(self):
- if not self.iter:
- raise StopIteration
-
- cdef bcf1_t *record = bcf_init1()
-
- record.pos = -1
- if self.bcf.drop_samples:
- record.max_unpack = BCF_UN_SHR
-
- cdef int ret
-
- with nogil:
- ret = bcf_itr_next(self.bcf.htsfile, self.iter, record)
-
- if ret < 0:
- _stop_BCFIterator(self, record)
- if ret == -1:
- raise StopIteration
- else:
- raise ValueError('error reading BCF file')
-
- ret = bcf_subset_format(self.bcf.header.ptr, record)
-
- if ret < 0:
- _stop_BCFIterator(self, record)
- raise ValueError('error in bcf_subset_format')
-
- return makeVariantRecord(self.bcf.header, record)
-
-
-cdef class TabixIterator(BaseIterator):
- def __cinit__(self, *args, **kwargs):
- self.line_buffer.l = 0
- self.line_buffer.m = 0
- self.line_buffer.s = NULL
-
- def __init__(self, VariantFile bcf, contig=None, start=None, stop=None, region=None, reopen=True):
- if not isinstance(bcf.index, TabixIndex):
- raise ValueError('tabix index required')
-
- cdef TabixIndex index = bcf.index
-
- if not index:
- raise ValueError('bcf index required')
-
- if reopen:
- bcf = bcf.copy()
-
- if region is not None:
- if contig is not None or start is not None or stop is not None:
-                raise ValueError('cannot specify both region and contig/start/stop')
-
-            bregion = force_bytes(region)
-            self.iter = tbx_itr_querys(index.ptr, bregion)
- else:
- if contig is None:
-                raise ValueError('a contig or region must be specified')
-
- rid = index.refmap.get(contig, -1)
-
- if start is None:
- start = 0
- if stop is None:
- stop = MAX_POS
-
- self.iter = tbx_itr_queryi(index.ptr, rid, start, stop)
-
- # Do not fail on self.iter == NULL, since it signifies a null query.
-
- self.bcf = bcf
- self.index = index
-
- def __dealloc__(self):
- if self.iter:
- tbx_itr_destroy(self.iter)
- self.iter = NULL
-
- if self.line_buffer.m:
- free(self.line_buffer.s)
-
- self.line_buffer.l = 0
- self.line_buffer.m = 0
- self.line_buffer.s = NULL
-
- def __iter__(self):
- return self
-
- def __next__(self):
- if not self.iter:
- raise StopIteration
-
- cdef int ret
-
- with nogil:
- ret = tbx_itr_next(self.bcf.htsfile, self.index.ptr, self.iter, &self.line_buffer)
-
- if ret < 0:
- tbx_itr_destroy(self.iter)
- self.iter = NULL
- if ret == -1:
- raise StopIteration
- else:
- raise ValueError('error reading indexed VCF file')
-
- cdef bcf1_t *record = bcf_init1()
-
- record.pos = -1
- if self.bcf.drop_samples:
- record.max_unpack = BCF_UN_SHR
-
- ret = vcf_parse1(&self.line_buffer, self.bcf.header.ptr, record)
-
- # FIXME: stop iteration on parse failure?
- if ret < 0:
- bcf_destroy1(record)
- raise ValueError('error in vcf_parse')
-
- return makeVariantRecord(self.bcf.header, record)
-
-
-########################################################################
-########################################################################
-## Variant File
-########################################################################
-
-
-cdef class VariantFile(object):
- """*(filename, mode=None, index_filename=None, header=None, drop_samples=False)*
-
- A :term:`VCF`/:term:`BCF` formatted file. The file is automatically
- opened.
-
- *mode* should be ``r`` for reading or ``w`` for writing. The default is
- text mode (:term:`VCF`). For binary (:term:`BCF`) I/O you should append
- ``b`` for compressed or ``u`` for uncompressed :term:`BCF` output.
-
- If ``b`` is present, it must immediately follow ``r`` or ``w``. Valid
- modes are ``r``, ``w``, ``wh``, ``rb``, ``wb``, ``wbu`` and ``wb0``.
- For instance, to open a :term:`BCF` formatted file for reading, type::
-
- f = pysam.VariantFile('ex1.bcf','rb')
-
-    If mode is not specified, we will try to auto-detect it in the order
-    'rb', 'r'; thus both of the following should work::
-
- f1 = pysam.VariantFile('ex1.bcf')
- f2 = pysam.VariantFile('ex1.vcf')
-
-    If an index for a variant file exists (.csi or .tbi), it will be opened
-    automatically. Without an index, random access to records via
-    :meth:`fetch` is disabled.
-
- For writing, a :class:`VariantHeader` object must be provided, typically
- obtained from another :term:`VCF` file/:term:`BCF` file.
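-
-    A minimal round-trip sketch (illustrative; the filenames are
-    hypothetical)::
-
-        vcf_in = pysam.VariantFile('in.vcf.gz')
-        vcf_out = pysam.VariantFile('out.bcf', 'wb', header=vcf_in.header)
-        for record in vcf_in:
-            vcf_out.write(record)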
- """
- def __cinit__(self, *args, **kwargs):
- self.htsfile = NULL
-
- def __init__(self, *args, **kwargs):
- self.header = None
- self.index = None
- self.filename = None
- self.mode = None
- self.index_filename = None
- self.is_stream = False
- self.is_remote = False
- self.is_reading = False
- self.drop_samples = False
- self.start_offset = -1
-
- self.open(*args, **kwargs)
-
- def __dealloc__(self):
- if self.htsfile:
- hts_close(self.htsfile)
- self.htsfile = NULL
-
- def __enter__(self):
- return self
-
- def __exit__(self, exc_type, exc_value, traceback):
- self.close()
- return False
-
- property category:
- """General file format category. One of UNKNOWN, ALIGNMENTS,
- VARIANTS, INDEX, REGIONS"""
- def __get__(self):
- if not self.htsfile:
- raise ValueError('metadata not available on closed file')
- return FORMAT_CATEGORIES[self.htsfile.format.category]
-
- property format:
- """File format.
-
- One of UNKNOWN, BINARY_FORMAT, TEXT_FORMAT, SAM, BAM,
- BAI, CRAM, CRAI, VCF, BCF, CSI, GZI, TBI, BED.
- """
- def __get__(self):
- if not self.htsfile:
- raise ValueError('metadata not available on closed file')
- return FORMATS[self.htsfile.format.format]
-
- property version:
- """Tuple of file format version numbers (major, minor)"""
- def __get__(self):
- if not self.htsfile:
- raise ValueError('metadata not available on closed file')
- return (self.htsfile.format.version.major,
- self.htsfile.format.version.minor)
-
- property compression:
- """File compression.
-
- One of NONE, GZIP, BGZF, CUSTOM."""
- def __get__(self):
- if not self.htsfile:
- raise ValueError('metadata not available on closed file')
- return COMPRESSION[self.htsfile.format.compression]
-
- property description:
- """Vaguely human readable description of the file format"""
- def __get__(self):
- if not self.htsfile:
- raise ValueError('metadata not available on closed file')
- cdef char *desc = hts_format_description(&self.htsfile.format)
- try:
- return charptr_to_str(desc)
- finally:
- free(desc)
-
- def close(self):
- """closes the :class:`pysam.VariantFile`."""
- if self.htsfile:
- hts_close(self.htsfile)
- self.htsfile = NULL
- self.header = self.index = None
-
- property is_open:
- def __get__(self):
- """return True if VariantFile is open and in a valid state."""
- return self.htsfile != NULL
-
- def __iter__(self):
- if not self.is_open:
- raise ValueError('I/O operation on closed file')
-
- if not self.mode.startswith(b'r'):
-            raise ValueError(
-                'cannot iterate over a VariantFile opened for writing')
-
- self.is_reading = 1
- return self
-
- def __next__(self):
- cdef int ret
- cdef bcf1_t *record = bcf_init1()
-
- record.pos = -1
- if self.drop_samples:
- record.max_unpack = BCF_UN_SHR
-
- with nogil:
- ret = bcf_read1(self.htsfile, self.header.ptr, record)
-
- if ret < 0:
- bcf_destroy1(record)
- if ret == -1:
- raise StopIteration
- elif ret == -2:
- raise IOError('truncated file')
- else:
- raise ValueError('Variant read failed')
-
- return makeVariantRecord(self.header, record)
-
- def copy(self):
- if not self.is_open:
- raise ValueError
-
- cdef VariantFile vars = VariantFile.__new__(VariantFile)
- cdef bcf_hdr_t *hdr
- cdef char *cfilename
- cdef char *cmode
-
- # FIXME: re-open using fd or else header and index could be invalid
- cfilename, cmode = self.filename, self.mode
- with nogil:
- vars.htsfile = hts_open(cfilename, cmode)
-
- if not vars.htsfile:
- raise ValueError('Cannot re-open htsfile')
-
- # minimize overhead by re-using header and index. This approach is
- # currently risky, but see above for how this can be mitigated.
- vars.header = self.header
- vars.index = self.index
-
- vars.filename = self.filename
- vars.mode = self.mode
- vars.index_filename = self.index_filename
- vars.drop_samples = self.drop_samples
- vars.is_stream = self.is_stream
- vars.is_remote = self.is_remote
- vars.is_reading = self.is_reading
- vars.start_offset = self.start_offset
-
- if self.htsfile.is_bin:
- vars.seek(self.tell())
- else:
-            # advance the new handle past its copy of the header; the
-            # parsed duplicate is wrapped and immediately discarded so
-            # the underlying struct is freed (self.header is re-used)
-            with nogil:
-                hdr = bcf_hdr_read(vars.htsfile)
-            makeVariantHeader(hdr)
-
- return vars
-
- def open(self, filename, mode='rb',
- index_filename=None,
- VariantHeader header=None,
- drop_samples=False):
- """open a vcf/bcf file.
-
- If open is called on an existing VariantFile, the current file will be
- closed and a new file will be opened.
- """
- cdef bcf_hdr_t *hdr
- cdef BGZF *bgzfp
- cdef hts_idx_t *idx
- cdef tbx_t *tidx
- cdef char *cfilename
- cdef char *cindex_filename = NULL
- cdef char *cmode
-
- # close a previously opened file
- if self.is_open:
- self.close()
-
- if mode not in ('r','w','rb','wb', 'wh', 'wbu', 'rU', 'wb0'):
- raise ValueError('invalid file opening mode `{}`'.format(mode))
-
-        # for htslib, 'wbu' does not seem to work
- if mode == 'wbu':
- mode = 'wb0'
-
- self.mode = mode = force_bytes(mode)
- self.filename = filename = encode_filename(filename)
- if index_filename is not None:
- self.index_filename = index_filename = encode_filename(index_filename)
- else:
- self.index_filename = None
- self.drop_samples = bool(drop_samples)
- self.header = None
-
- self.is_remote = hisremote(filename)
- self.is_stream = filename == b'-'
-
- if mode.startswith(b'w'):
- # open file for writing
- if index_filename is not None:
- raise ValueError('Cannot specify an index filename when writing a VCF/BCF file')
-
- # header structure (used for writing)
- if header:
- self.header = header.copy()
- else:
- raise ValueError('a VariantHeader must be specified')
-
-            # open file; for VCF/BCF output the header is written
-            # explicitly via bcf_hdr_write() below
- cfilename, cmode = filename, mode
- with nogil:
- self.htsfile = hts_open(cfilename, cmode)
-
- if not self.htsfile:
- raise ValueError("could not open file `{}` (mode='{}')".format((filename, mode)))
-
- with nogil:
- bcf_hdr_write(self.htsfile, self.header.ptr)
-
- elif mode.startswith(b'r'):
- # open file for reading
- if filename != b'-' and not self.is_remote and not os.path.exists(filename):
- raise IOError('file `{}` not found'.format(filename))
-
- cfilename, cmode = filename, mode
- with nogil:
- self.htsfile = hts_open(cfilename, cmode)
-
- if not self.htsfile:
- raise ValueError("could not open file `{}` (mode='{}') - is it VCF/BCF format?".format(filename, mode))
-
- if self.htsfile.format.format not in (bcf, vcf):
- raise ValueError("invalid file `{}` (mode='{}') - is it VCF/BCF format?".format(filename, mode))
-
- if self.htsfile.format.compression == bgzf:
- bgzfp = hts_get_bgzfp(self.htsfile)
- if bgzfp and bgzf_check_EOF(bgzfp) == 0:
-                    warn('[{}] Warning: no BGZF EOF marker; file may be truncated'.format(filename))
-
- with nogil:
- hdr = bcf_hdr_read(self.htsfile)
-
- try:
- self.header = makeVariantHeader(hdr)
- except ValueError:
- raise ValueError("file `{}` does not have valid header (mode='{}') - is it VCF/BCF format?".format(filename, mode))
-
- # check for index and open if present
- if self.htsfile.format.format == bcf:
- if index_filename is not None:
- cindex_filename = index_filename
- with nogil:
- idx = bcf_index_load2(cfilename, cindex_filename)
- self.index = makeBCFIndex(self.header, idx)
-
- elif self.htsfile.format.compression == bgzf:
- if index_filename is not None:
- cindex_filename = index_filename
- with nogil:
- tidx = tbx_index_load2(cfilename, cindex_filename)
- self.index = makeTabixIndex(tidx)
-
- if not self.is_stream:
- self.start_offset = self.tell()
- else:
- raise ValueError("unknown mode {}".format(mode))
-
- def reset(self):
- """reset file position to beginning of file just after the header."""
-        return self.seek(self.start_offset)
-
- def seek(self, uint64_t offset):
- """move file pointer to position *offset*, see
- :meth:`pysam.VariantFile.tell`."""
- if not self.is_open:
- raise ValueError('I/O operation on closed file')
- if self.is_stream:
- raise OSError('seek not available in streams')
-
- cdef int64_t ret
- if self.htsfile.format.compression != no_compression:
- with nogil:
- ret = bgzf_seek(hts_get_bgzfp(self.htsfile), offset, SEEK_SET)
- else:
- with nogil:
- ret = hts_useek(self.htsfile, <int>offset, SEEK_SET)
- return ret
-
- def tell(self):
- """return current file position, see :meth:`pysam.VariantFile.seek`."""
- if not self.is_open:
- raise ValueError('I/O operation on closed file')
- if self.is_stream:
- raise OSError('tell not available in streams')
-
- cdef int64_t ret
- if self.htsfile.format.compression != no_compression:
- with nogil:
- ret = bgzf_tell(hts_get_bgzfp(self.htsfile))
- else:
- with nogil:
- ret = hts_utell(self.htsfile)
- return ret
-
- def fetch(self, contig=None, start=None, stop=None, region=None, reopen=False):
- """fetch records in a :term:`region` using 0-based indexing. The
-        region is specified by :term:`contig`, *start* and *stop*.
- Alternatively, a samtools :term:`region` string can be supplied.
-
-        Without *contig* or *region*, all records will be fetched. The
- records will be returned ordered by contig, which will not necessarily
- be the order within the file.
-
-        Set *reopen* to True if you will be using multiple iterators on the
-        same file at the same time. The iterator returned will then receive
-        its own copy of a filehandle to the file, effectively re-opening
-        the file. Re-opening a file incurs some overhead, so use with care.
-
- If only *contig* is set, all records on *contig* will be fetched.
- If both *region* and *contig* are given, an exception is raised.
-
- Note that a bgzipped :term:`VCF`.gz file without a tabix/CSI index
- (.tbi/.csi) or a :term:`BCF` file without a CSI index can only be
- read sequentially.
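-
-        Example (an illustrative sketch; the filename and contig are
-        hypothetical)::
-
-            vcf = pysam.VariantFile('example.vcf.gz')
-            for rec in vcf.fetch('chr1', 100000, 200000):
-                print(rec.contig, rec.pos, rec.alleles)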
- """
- if not self.is_open:
- raise ValueError('I/O operation on closed file')
-
-        if not self.mode.startswith(b'r'):
-            raise ValueError('cannot fetch from a VariantFile opened '
-                             'for writing')
-
- if contig is None and region is None:
- self.is_reading = 1
- bcf = self.copy() if reopen else self
- bcf.seek(self.start_offset)
- return iter(bcf)
-
- if not self.index:
- raise ValueError('fetch requires an index')
-
- self.is_reading = 1
- return self.index.fetch(self, contig, start, stop, region, reopen)
-
- cpdef int write(self, VariantRecord record) except -1:
- """
- write a single :class:`pysam.VariantRecord` to disk.
-
- returns the number of bytes written.
- """
-        if not self.is_open:
-            raise ValueError('I/O operation on closed file')
-
-        if not self.mode.startswith(b'w'):
-            raise ValueError('cannot write to a VariantFile opened for reading')
-
- #if record.header is not self.header:
- # raise ValueError('Writing records from a different VariantFile is not yet supported')
-
- cdef int ret
-
- with nogil:
- ret = bcf_write1(self.htsfile, self.header.ptr, record.ptr)
-
- if ret < 0:
- raise ValueError('write failed')
-
- return ret
-
- def subset_samples(self, include_samples):
- """
- Read only a subset of samples to reduce processing time and memory.
- Must be called prior to retrieving records.
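-
-        Example (illustrative; the filename and sample names are
-        hypothetical)::
-
-            vcf = pysam.VariantFile('example.vcf.gz')
-            vcf.subset_samples(['NA00001', 'NA00002'])
-            for rec in vcf:
-                print(list(rec.samples))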
- """
- if not self.is_open:
- raise ValueError('I/O operation on closed file')
-
-        if not self.mode.startswith(b'r'):
-            raise ValueError('cannot subset samples from a VariantFile '
-                             'opened for writing')
-
- if self.is_reading:
- raise ValueError('cannot subset samples after fetching records')
-
- self.header._subset_samples(include_samples)
-
- # potentially unnecessary optimization that also sets max_unpack
- if not include_samples:
- self.drop_samples = True
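-
-
-# A minimal, illustrative sketch (hypothetical filename) of the file
-# metadata properties exposed by VariantFile above:
-def _example_metadata(path='example.bcf'):
-    vf = VariantFile(path)
-    return vf.category, vf.format, vf.compression, vf.version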
+++ /dev/null
-from libc.stdint cimport int8_t, int16_t, int32_t, int64_t
-from libc.stdint cimport uint8_t, uint16_t, uint32_t, uint64_t
-from libc.stdlib cimport malloc, calloc, realloc, free
-from libc.string cimport memcpy, memcmp, strncpy, strlen, strdup
-from libc.stdio cimport FILE, printf
-cimport cython
-
-from cpython cimport array
-from pysam.chtslib cimport faidx_t, kstring_t, BGZF
-
-# These functions are put here and not in chtslib.pxd in order
-# to avoid warnings for unused functions.
-cdef extern from "pysam_stream.h" nogil:
-
- ctypedef struct kstream_t:
- pass
-
- ctypedef struct kseq_t:
- kstring_t name
- kstring_t comment
- kstring_t seq
- kstring_t qual
-
- kseq_t *kseq_init(BGZF *)
- int kseq_read(kseq_t *)
- void kseq_destroy(kseq_t *)
- kstream_t *ks_init(BGZF *)
- void ks_destroy(kstream_t *)
-
-    # Retrieve characters from stream until delimiter
-    # is reached, placing the result in str.
- int ks_getuntil(kstream_t *,
- int delimiter,
- kstring_t * str,
- int * dret)
-
-cdef class FastaFile:
- cdef bint is_remote
- cdef object _filename, _references, _lengths, reference2length
- cdef faidx_t* fastafile
- cdef char* _fetch(self, char* reference,
- int start, int end, int* length)
-
-
-cdef class FastqProxy:
- cdef kseq_t * _delegate
- cdef cython.str tostring(self)
- cpdef array.array get_quality_array(self, int offset=*)
-
-
-cdef class PersistentFastqProxy:
- """
- Python container for pysam.cfaidx.FastqProxy with persistence.
- """
- cdef public str comment, quality, sequence, name
- cdef cython.str tostring(self)
- cpdef array.array get_quality_array(self, int offset=*)
-
-
-cdef class FastxFile:
- cdef object _filename
- cdef BGZF * fastqfile
- cdef kseq_t * entry
- cdef bint persist
- cdef bint is_remote
-
- cdef kseq_t * getCurrent(self)
- cdef int cnext(self)
-
-
-# Compatibility Layer for pysam 0.8.1
-cdef class FastqFile(FastxFile):
- pass
-
-
-# Compatibility Layer for pysam < 0.8
-cdef class Fastafile(FastaFile):
- pass
-
+++ /dev/null
-# cython: embedsignature=True
-# cython: profile=True
-###############################################################################
-###############################################################################
-# Cython wrapper for SAM/BAM/CRAM files based on htslib
-###############################################################################
-# The principal classes defined in this module are:
-#
-# class FastaFile   random read access to faidx indexed files
-# class FastxFile   streamed read access to fasta/fastq files
-#
-# Additionally this module defines several additional classes that are part
-# of the internal API. These are:
-#
-# class FastqProxy
-# class PersistentFastqProxy
-#
-# For backwards compatibility, the following classes are also defined:
-#
-# class Fastafile equivalent to FastaFile
-# class FastqFile equivalent to FastxFile
-#
-###############################################################################
-#
-# The MIT License
-#
-# Copyright (c) 2015 Andreas Heger
-#
-# Permission is hereby granted, free of charge, to any person obtaining a
-# copy of this software and associated documentation files (the "Software"),
-# to deal in the Software without restriction, including without limitation
-# the rights to use, copy, modify, merge, publish, distribute, sublicense,
-# and/or sell copies of the Software, and to permit persons to whom the
-# Software is furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-# DEALINGS IN THE SOFTWARE.
-#
-###############################################################################
-import sys
-import os
-import re
-from cpython cimport array
-
-from cpython cimport PyErr_SetString, \
- PyBytes_Check, \
- PyUnicode_Check, \
- PyBytes_FromStringAndSize
-
-from cpython.version cimport PY_MAJOR_VERSION
-
-from pysam.chtslib cimport \
- faidx_nseq, fai_load, fai_destroy, fai_fetch, \
- faidx_seq_len, \
- faidx_fetch_seq, hisremote, \
- bgzf_open, bgzf_close
-
-from pysam.cutils cimport force_bytes, force_str, charptr_to_str
-from pysam.cutils cimport encode_filename, from_string_and_size
-from pysam.cutils cimport qualitystring_to_array, parse_region
-
-cdef class FastqProxy
-cdef makeFastqProxy(kseq_t * src):
-    '''wrap src, a kseq_t record, in a FastqProxy.'''
- cdef FastqProxy dest = FastqProxy.__new__(FastqProxy)
- dest._delegate = src
- return dest
-
-## TODO:
-## add automatic indexing.
-## add function to get sequence names.
-cdef class FastaFile:
- """Random access to fasta formatted files that
- have been indexed by :term:`faidx`.
-
-    The file is automatically opened. The index file for
-    ``<filename>`` is expected to be called ``<filename>.fai``.
-
- Parameters
- ----------
-
- filename : string
- Filename of fasta file to be opened.
-
- filepath_index : string
- Optional, filename of the index. By default this is
- the filename + ".fai".
-
- Raises
- ------
-
- ValueError
- if index file is missing
-
- IOError
- if file could not be opened
-
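-    Examples
-    --------
-
-    An illustrative sketch (assumes ``ex1.fa`` and its ``ex1.fa.fai``
-    index exist):
-
-    >>> with pysam.FastaFile("ex1.fa") as fa:
-    ...     seq = fa.fetch("chr1", 10, 20)
-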
- """
-
- def __cinit__(self, *args, **kwargs):
- self.fastafile = NULL
- self._filename = None
- self._references = None
- self._lengths = None
- self.reference2length = None
- self._open(*args, **kwargs)
-
- def is_open(self):
-        '''return True if the fasta file has been opened.'''
- return self.fastafile != NULL
-
- def __len__(self):
- if self.fastafile == NULL:
- raise ValueError("calling len() on closed file")
-
- return faidx_nseq(self.fastafile)
-
- def _open(self, filename, filepath_index=None):
- '''open an indexed fasta file.
-
- This method expects an indexed fasta file.
- '''
-
- # close a previously opened file
- if self.fastafile != NULL:
- self.close()
-
- self._filename = encode_filename(filename)
- cdef char *cfilename = self._filename
- self.is_remote = hisremote(cfilename)
-
- if filepath_index is not None:
- raise NotImplementedError(
- "setting an explicit path for the index "
- "is not implemented")
-
- # open file for reading
- if (self._filename != b"-"
- and not self.is_remote
- and not os.path.exists(filename)):
- raise IOError("file `%s` not found" % filename)
-
- with nogil:
- self.fastafile = fai_load(cfilename)
-
- if self.fastafile == NULL:
- raise IOError("could not open file `%s`" % filename)
-
- if self.is_remote:
- filepath_index = os.path.basename(
- re.sub("[^:]+:[/]*", "", filename)) + ".fai"
- elif filepath_index is None:
- filepath_index = filename + ".fai"
-
- if not os.path.exists(filepath_index):
- raise ValueError("could not locate index file {}".format(
- filepath_index))
-
- with open(filepath_index) as inf:
- data = [x.split("\t") for x in inf]
- self._references = tuple(x[0] for x in data)
- self._lengths = tuple(int(x[1]) for x in data)
- self.reference2length = dict(zip(self._references, self._lengths))
-
- def close(self):
- """close the file."""
- if self.fastafile != NULL:
- fai_destroy(self.fastafile)
- self.fastafile = NULL
-
- def __dealloc__(self):
- if self.fastafile != NULL:
- fai_destroy(self.fastafile)
- self.fastafile = NULL
-
- # context manager interface
- def __enter__(self):
- return self
-
- def __exit__(self, exc_type, exc_value, traceback):
- self.close()
- return False
-
- property closed:
- """"bool indicating the current state of the file object.
- This is a read-only attribute; the close() method changes the value.
- """
- def __get__(self):
- return not self.is_open()
-
- property filename:
- """filename associated with this object. This is a read-only attribute."""
- def __get__(self):
- return self._filename
-
- property references:
- '''tuple with the names of :term:`reference` sequences.'''
- def __get__(self):
- return self._references
-
- property nreferences:
- """"int with the number of :term:`reference` sequences in the file.
- This is a read-only attribute."""
- def __get__(self):
- return len(self._references) if self.references else None
-
- property lengths:
- """tuple with the lengths of :term:`reference` sequences."""
- def __get__(self):
- return self._lengths
-
- def fetch(self,
- reference=None,
- start=None,
- end=None,
- region=None):
- """fetch sequences in a :term:`region`.
-
- A region can
- either be specified by :term:`reference`, `start` and
- `end`. `start` and `end` denote 0-based, half-open
- intervals.
-
- Alternatively, a samtools :term:`region` string can be
- supplied.
-
- If any of the coordinates are missing they will be replaced by the
- minimum (`start`) or maximum (`end`) coordinate.
-
-        Note that region strings are 1-based, while `start` and `end` denote
-        an interval in Python coordinates (0-based, half-open).
-
- Returns
- -------
-
- string : a string with the sequence specified by the region.
-
- Raises
- ------
-
- IndexError
- if the coordinates are out of range
-
- ValueError
- if the region is invalid
-
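-        Examples
-        --------
-
-        Both calls below request the same bases, once with 0-based
-        half-open coordinates and once with a 1-based region string (the
-        reference name is illustrative):
-
-        >>> fa.fetch("chr1", 9, 20)
-        >>> fa.fetch(region="chr1:10-20")
-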
- """
-
- if not self.is_open():
- raise ValueError("I/O operation on closed file" )
-
- cdef int length
- cdef char *seq
- cdef char *ref
- cdef int rstart, rend
-
- reference, rstart, rend = parse_region(reference, start, end, region)
-
- if reference is None:
- raise ValueError("no sequence/region supplied.")
-
- if rstart == rend:
- return ""
-
- ref = reference
- with nogil:
- length = faidx_seq_len(self.fastafile, ref)
- if length == -1:
- raise KeyError("sequence '%s' not present" % reference)
- if rstart >= length:
- return ""
-
- # fai_fetch adds a '\0' at the end
- with nogil:
- seq = faidx_fetch_seq(self.fastafile,
- ref,
- rstart,
- rend-1,
- &length)
-
- if seq == NULL:
- raise ValueError(
- "failure when retrieving sequence on '%s'" % reference)
-
- try:
- return charptr_to_str(seq)
- finally:
- free(seq)
-
- cdef char * _fetch(self, char * reference, int start, int end, int * length):
- '''fetch sequence for reference, start and end'''
-
- with nogil:
- return faidx_fetch_seq(self.fastafile,
- reference,
- start,
- end-1,
- length)
-
- def get_reference_length(self, reference):
- '''return the length of reference.'''
- return self.reference2length[reference]
-
- def __getitem__(self, reference):
- return self.fetch(reference)
-
- def __contains__(self, reference):
-        '''return True if reference is in the fasta file.'''
- return reference in self.reference2length
-
-
-cdef class FastqProxy:
- """A single entry in a fastq file."""
- def __init__(self): pass
-
- property name:
- """The name of each entry in the fastq file."""
- def __get__(self):
- return charptr_to_str(self._delegate.name.s)
-
- property sequence:
- """The sequence of each entry in the fastq file."""
- def __get__(self):
- return charptr_to_str(self._delegate.seq.s)
-
- property comment:
- def __get__(self):
- if self._delegate.comment.l:
- return charptr_to_str(self._delegate.comment.s)
- else:
- return None
-
- property quality:
- """The quality score of each entry in the fastq file, represented as a string."""
- def __get__(self):
- if self._delegate.qual.l:
- return charptr_to_str(self._delegate.qual.s)
- else:
- return None
-
- cdef cython.str tostring(self):
- if self.comment is None:
- comment = ""
- else:
- comment = " %s" % self.comment
-
- if self.quality is None:
- return ">%s%s\n%s" % (self.name, comment, self.sequence)
- else:
- return "@%s%s\n%s\n+\n%s" % (self.name, comment,
- self.sequence, self.quality)
-
- def __str__(self):
- return self.tostring()
-
- cpdef array.array get_quality_array(self, int offset=33):
- '''return quality values as integer array after subtracting offset.'''
- if self.quality is None:
- return None
- return qualitystring_to_array(force_bytes(self.quality),
- offset=offset)
-
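-# A minimal, illustrative sketch (hypothetical filename): iterate over
-# records and convert each quality string to phred scores using the
-# default offset of 33, as described above:
-def _example_quality_array(path="ex1.fastq"):
-    with FastxFile(path) as fh:
-        for entry in fh:
-            return entry.name, entry.get_quality_array(offset=33)
-
-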
-cdef class PersistentFastqProxy:
- """
- Python container for pysam.cfaidx.FastqProxy with persistence.
- Needed to compare multiple fastq records from the same file.
- """
- def __init__(self, FastqProxy FastqRead):
- self.comment = FastqRead.comment
- self.quality = FastqRead.quality
- self.sequence = FastqRead.sequence
- self.name = FastqRead.name
-
- cdef cython.str tostring(self):
- if self.comment is None:
- comment = ""
- else:
- comment = " %s" % self.comment
-
- if self.quality is None:
- return ">%s%s\n%s" % (self.name, comment, self.sequence)
- else:
- return "@%s%s\n%s\n+\n%s" % (self.name, comment,
- self.sequence, self.quality)
-
- def __str__(self):
- return self.tostring()
-
- cpdef array.array get_quality_array(self, int offset=33):
- '''return quality values as array after subtracting offset.'''
- if self.quality is None:
- return None
- return qualitystring_to_array(force_bytes(self.quality),
- offset=offset)
-
-
-cdef class FastxFile:
- """Stream access to :term:`fasta` or :term:`fastq` formatted files.
-
- The file is automatically opened.
-
-    Entries in the file can be either fasta or fastq formatted, or even a
-    mixture of the two.
-
-    This file object permits iterating over all entries in the
-    file. Random access is not implemented. The iteration returns
-    objects of type :class:`FastqProxy`.
-
- Parameters
- ----------
-
- filename : string
- Filename of fasta/fastq file to be opened.
-
- persist : bool
-
- If True (default) make a copy of the entry in the file during
- iteration. If set to False, no copy will be made. This will
- permit faster iteration, but an entry will not persist when
- the iteration continues.
-
- Notes
- -----
- Prior to version 0.8.2, this was called FastqFile.
-
- Raises
- ------
-
- IOError
- if file could not be opened
-
-
- Examples
- --------
- >>> with pysam.FastxFile(filename) as fh:
- ... for entry in fh:
- ... print(entry.name)
- ... print(entry.sequence)
- ... print(entry.comment)
- ... print(entry.quality)
-
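-    With ``persist=False`` every yielded proxy aliases a single internal
-    buffer, so each entry must be consumed before advancing the iterator
-    (an illustrative caveat; ``process`` is a hypothetical callback):
-
-    >>> with pysam.FastxFile(filename, persist=False) as fh:
-    ...     for entry in fh:
-    ...         process(str(entry))
-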
- """
- def __cinit__(self, *args, **kwargs):
-        self.fastqfile = NULL
- self._filename = None
- self.entry = NULL
- self._open(*args, **kwargs)
-
- def is_open(self):
-        '''return True if the fastx file has been opened.'''
- return self.entry != NULL
-
- def _open(self, filename, persist=True):
- '''open a fastq/fasta file in *filename*
-
-        Parameters
- ----------
-
- persist : bool
-
- if True return a copy of the underlying data (default
- True). The copy will persist even if the iteration
- on the file continues.
-
- '''
- if self.fastqfile != NULL:
- self.close()
-
- self._filename = encode_filename(filename)
- cdef char *cfilename = self._filename
- self.is_remote = hisremote(cfilename)
-
- # open file for reading
- if (self._filename != b"-"
- and not self.is_remote
- and not os.path.exists(filename)):
- raise IOError("file `%s` not found" % filename)
-
- self.persist = persist
-
- with nogil:
- self.fastqfile = bgzf_open(cfilename, "r")
- self.entry = kseq_init(self.fastqfile)
- self._filename = filename
-
- def close(self):
- '''close the file.'''
- if self.fastqfile != NULL:
- bgzf_close(self.fastqfile)
- self.fastqfile = NULL
- if self.entry != NULL:
- kseq_destroy(self.entry)
- self.entry = NULL
-
- def __dealloc__(self):
- if self.fastqfile != NULL:
- bgzf_close(self.fastqfile)
- if self.entry:
- kseq_destroy(self.entry)
-
- # context manager interface
- def __enter__(self):
- return self
-
- def __exit__(self, exc_type, exc_value, traceback):
- self.close()
- return False
-
- property closed:
- """"bool indicating the current state of the file object.
- This is a read-only attribute; the close() method changes the value.
- """
- def __get__(self):
- return not self.is_open()
-
- property filename:
- """string with the filename associated with this object."""
- def __get__(self):
- return self._filename
-
- def __iter__(self):
- if not self.is_open():
- raise ValueError("I/O operation on closed file")
- return self
-
- cdef kseq_t * getCurrent(self):
- return self.entry
-
- cdef int cnext(self):
- '''C version of iterator
- '''
- with nogil:
- return kseq_read(self.entry)
-
- def __next__(self):
- """
- python version of next().
- """
- cdef int l
- with nogil:
- l = kseq_read(self.entry)
- if (l >= 0):
- if self.persist:
- return PersistentFastqProxy(makeFastqProxy(self.entry))
- return makeFastqProxy(self.entry)
- else:
- raise StopIteration
-
-# Compatibility Layer for pysam 0.8.1
-cdef class FastqFile(FastxFile):
- """FastqFile is deprecated: use FastxFile instead"""
- pass
-
-# Compatibility Layer for pysam < 0.8
-cdef class Fastafile(FastaFile):
- """Fastafile is deprecated: use FastaFile instead"""
- pass
-
-__all__ = ["FastaFile",
- "FastqFile",
- "FastxFile",
- "Fastafile"]
+++ /dev/null
-from libc.stdint cimport int8_t, int16_t, int32_t, int64_t
-from libc.stdint cimport uint8_t, uint16_t, uint32_t, uint64_t
-from libc.stdlib cimport malloc, calloc, realloc, free
-from libc.string cimport memcpy, memcmp, strncpy, strlen, strdup
-from libc.stdio cimport FILE, printf
-from posix.types cimport off_t
-
-cdef extern from "Python.h":
- FILE* PyFile_AsFile(object)
-
-
-cdef extern from "htslib/kstring.h" nogil:
- ctypedef struct kstring_t:
- size_t l, m
- char *s
-
-
-cdef extern from "htslib_util.h" nogil:
- int hts_set_verbosity(int verbosity)
- int hts_get_verbosity()
-
- ctypedef uint32_t khint32_t
- ctypedef uint32_t khint_t
- ctypedef khint_t khiter_t
-
- # Used to manage BCF Header info
- ctypedef struct vdict_t:
- khint_t n_buckets, size, n_occupied, upper_bound
- khint32_t *flags
- const char *keys
- bcf_idinfo_t *vals
-
- # Used to manage indexed contigs in Tabix
- ctypedef struct s2i_t:
- khint_t n_buckets, size, n_occupied, upper_bound
- khint32_t *flags
- const char *keys
- int64_t *vals
-
- # Generic khash methods
- khint_t kh_size(void *d)
- khint_t kh_begin(void *d)
- khint_t kh_end(void *d)
- int kh_exist(void *d, khiter_t i)
-
- # Specialized khash methods for vdict
- khint_t kh_get_vdict(vdict_t *d, const char *key)
- const char *kh_key_vdict "kh_key" (vdict_t *d, khint_t i)
- bcf_idinfo_t kh_val_vdict "kh_val" (vdict_t *d, khint_t i)
-
-
-cdef extern from "htslib/hfile.h" nogil:
- ctypedef struct hFILE
-
- # @abstract Open the named file or URL as a stream
- # @return An hFILE pointer, or NULL (with errno set) if an error occurred.
- hFILE *hopen(const char *filename, const char *mode)
-
- # @abstract Associate a stream with an existing open file descriptor
- # @return An hFILE pointer, or NULL (with errno set) if an error occurred.
- # @notes For socket descriptors (on Windows), mode should contain 's'.
- hFILE *hdopen(int fd, const char *mode)
-
- # @abstract Report whether the file name or URL denotes remote storage
- # @return 0 if local, 1 if remote.
- # @notes "Remote" means involving e.g. explicit network access, with the
- # implication that callers may wish to cache such files' contents locally.
- int hisremote(const char *filename)
-
- # @abstract Flush (for output streams) and close the stream
- # @return 0 if successful, or EOF (with errno set) if an error occurred.
- int hclose(hFILE *fp)
-
- # @abstract Close the stream, without flushing or propagating errors
- # @notes For use while cleaning up after an error only. Preserves errno.
- void hclose_abruptly(hFILE *fp)
-
- # @abstract Return the stream's error indicator
- # @return Non-zero (in fact, an errno value) if an error has occurred.
- # @notes This would be called herror() and return true/false to parallel
-    # ferror(3), but a networking-related herror(3) function already exists.
- int herrno(hFILE *fp)
-
- # @abstract Clear the stream's error indicator
- void hclearerr(hFILE *fp)
-
- # @abstract Reposition the read/write stream offset
- # @return The resulting offset within the stream (as per lseek(2)),
- # or negative if an error occurred.
- off_t hseek(hFILE *fp, off_t offset, int whence)
-
- # @abstract Report the current stream offset
- # @return The offset within the stream, starting from zero.
- off_t htell(hFILE *fp)
-
- # @abstract Read one character from the stream
- # @return The character read, or EOF on end-of-file or error
- int hgetc(hFILE *fp)
-
- # @abstract Peek at characters to be read without removing them from buffers
- # @param fp The file stream
- # @param buffer The buffer to which the peeked bytes will be written
- # @param nbytes The number of bytes to peek at; limited by the size of the
- # internal buffer, which could be as small as 4K.
- # @return The number of bytes peeked, which may be less than nbytes if EOF
- # is encountered; or negative, if there was an I/O error.
- # @notes The characters peeked at remain in the stream's internal buffer,
- # and will be returned by later hread() etc calls.
- ssize_t hpeek(hFILE *fp, void *buffer, size_t nbytes)
-
- # @abstract Read a block of characters from the file
- # @return The number of bytes read, or negative if an error occurred.
- # @notes The full nbytes requested will be returned, except as limited
- # by EOF or I/O errors.
- ssize_t hread(hFILE *fp, void *buffer, size_t nbytes)
-
- # @abstract Write a character to the stream
- # @return The character written, or EOF if an error occurred.
- int hputc(int c, hFILE *fp)
-
- # @abstract Write a string to the stream
- # @return 0 if successful, or EOF if an error occurred.
- int hputs(const char *text, hFILE *fp)
-
- # @abstract Write a block of characters to the file
- # @return Either nbytes, or negative if an error occurred.
- # @notes In the absence of I/O errors, the full nbytes will be written.
- ssize_t hwrite(hFILE *fp, const void *buffer, size_t nbytes)
-
- # @abstract For writing streams, flush buffered output to the underlying stream
- # @return 0 if successful, or EOF if an error occurred.
- int hflush(hFILE *fp)
-
-
-cdef extern from "htslib/bgzf.h" nogil:
- ctypedef struct bgzf_mtaux_t
- ctypedef struct bgzidx_t
- ctypedef struct z_stream
-
- ctypedef struct BGZF:
- unsigned errcode
- unsigned is_write
- int is_be
- int compress_level
- int is_compressed
- int is_gzip
- int cache_size
- int64_t block_address
- int64_t uncompressed_address
- void *uncompressed_block
- void *compressed_block
- void *cache
- hFILE *fp
- bgzf_mtaux_t *mt
- bgzidx_t *idx
- int idx_build_otf
- z_stream *gz_stream
-
- #*****************
- # Basic routines *
-    #*****************
-
- # Open an existing file descriptor for reading or writing.
- #
- # @param fd file descriptor
- # @param mode mode matching /[rwag][u0-9]+/: 'r' for reading, 'w' for
- # writing, 'a' for appending, 'g' for gzip rather than BGZF
- # compression (with 'w' only), and digit specifies the zlib
- # compression level.
- # Note that there is a distinction between 'u' and '0': the
- # first yields plain uncompressed output whereas the latter
- # outputs uncompressed data wrapped in the zlib format.
- # @return BGZF file handler; 0 on error
-
- BGZF* bgzf_dopen(int fd, const char *mode)
- BGZF* bgzf_fdopen(int fd, const char *mode) # for backward compatibility
-
- # Open the specified file for reading or writing.
- BGZF* bgzf_open(const char* path, const char *mode)
-
- # Open an existing hFILE stream for reading or writing.
- BGZF* bgzf_hopen(hFILE *fp, const char *mode)
-
- # Close the BGZF and free all associated resources.
- #
- # @param fp BGZF file handler
- # @return 0 on success and -1 on error
- int bgzf_close(BGZF *fp)
-
- # Read up to _length_ bytes from the file storing into _data_.
- #
- # @param fp BGZF file handler
- # @param data data array to read into
- # @param length size of data to read
- # @return number of bytes actually read; 0 on end-of-file and -1 on error
- ssize_t bgzf_read(BGZF *fp, void *data, size_t length)
-
- # Write _length_ bytes from _data_ to the file. If no I/O errors occur,
- # the complete _length_ bytes will be written (or queued for writing).
- #
- # @param fp BGZF file handler
- # @param data data array to write
- # @param length size of data to write
- # @return number of bytes written (i.e., _length_); negative on error
- ssize_t bgzf_write(BGZF *fp, const void *data, size_t length)
-
- # Read up to _length_ bytes directly from the underlying stream without
- # decompressing. Bypasses BGZF blocking, so must be used with care in
- # specialised circumstances only.
- #
- # @param fp BGZF file handler
- # @param data data array to read into
- # @param length number of raw bytes to read
- # @return number of bytes actually read; 0 on end-of-file and -1 on error
- ssize_t bgzf_raw_read(BGZF *fp, void *data, size_t length)
-
- # Write _length_ bytes directly to the underlying stream without
- # compressing. Bypasses BGZF blocking, so must be used with care
- # in specialised circumstances only.
- #
- # @param fp BGZF file handler
- # @param data data array to write
- # @param length number of raw bytes to write
- # @return number of bytes actually written; -1 on error
- ssize_t bgzf_raw_write(BGZF *fp, const void *data, size_t length)
-
- # Write the data in the buffer to the file.
- int bgzf_flush(BGZF *fp)
-
- int SEEK_SET
-
- # Return a virtual file pointer to the current location in the file.
-    # No interpretation of the value should be made, other than a subsequent
- # call to bgzf_seek can be used to position the file at the same point.
- # Return value is non-negative on success.
- int64_t bgzf_tell(BGZF *fp)
-
- # Set the file to read from the location specified by _pos_.
- #
- # @param fp BGZF file handler
- # @param pos virtual file offset returned by bgzf_tell()
- # @param whence must be SEEK_SET
- # @return 0 on success and -1 on error
- int64_t bgzf_seek(BGZF *fp, int64_t pos, int whence)
-
- # Check if the BGZF end-of-file (EOF) marker is present
- #
- # @param fp BGZF file handler opened for reading
- # @return 1 if the EOF marker is present and correct
- # 2 if it can't be checked, e.g., because fp isn't seekable
- # 0 if the EOF marker is absent
- # -1 (with errno set) on error
- int bgzf_check_EOF(BGZF *fp)
-
- # Check if a file is in the BGZF format
- #
- # @param fn file name
- # @return 1 if _fn_ is BGZF; 0 if not or on I/O error
- int bgzf_is_bgzf(const char *fn)
-
- #*********************
- # Advanced routines *
- #*********************
-
- # Set the cache size. Only effective when compiled with -DBGZF_CACHE.
- #
- # @param fp BGZF file handler
- # @param size size of cache in bytes; 0 to disable caching (default)
- void bgzf_set_cache_size(BGZF *fp, int size)
-
- # Flush the file if the remaining buffer size is smaller than _size_
- # @return 0 if flushing succeeded or was not needed; negative on error
- int bgzf_flush_try(BGZF *fp, ssize_t size)
-
- # Read one byte from a BGZF file. It is faster than bgzf_read()
- # @param fp BGZF file handler
- # @return byte read; -1 on end-of-file or error
- int bgzf_getc(BGZF *fp)
-
- # Read one line from a BGZF file. It is faster than bgzf_getc()
- #
- # @param fp BGZF file handler
-    # @param delim delimiter
- # @param str string to write to; must be initialized
- # @return length of the string; 0 on end-of-file; negative on error
- int bgzf_getline(BGZF *fp, int delim, kstring_t *str)
-
- # Read the next BGZF block.
- int bgzf_read_block(BGZF *fp)
-
- # Enable multi-threading (only effective on writing and when the
- # library was compiled with -DBGZF_MT)
- #
- # @param fp BGZF file handler; must be opened for writing
- # @param n_threads #threads used for writing
- # @param n_sub_blks #blocks processed by each thread; a value 64-256 is recommended
- int bgzf_mt(BGZF *fp, int n_threads, int n_sub_blks)
-
-
- # Compress a single BGZF block.
- #
- # @param dst output buffer (must have size >= BGZF_MAX_BLOCK_SIZE)
- # @param dlen size of output buffer; updated on return to the number
- # of bytes actually written to dst
- # @param src buffer to be compressed
- # @param slen size of data to compress (must be <= BGZF_BLOCK_SIZE)
- # @param level compression level
- # @return 0 on success and negative on error
- #
- int bgzf_compress(void *dst, size_t *dlen, const void *src, size_t slen, int level)
-
-    #*******************
-    # bgzidx routines *
-    #*******************
-
-    # Position BGZF at the uncompressed offset
-    #
- # @param fp BGZF file handler; must be opened for reading
- # @param uoffset file offset in the uncompressed data
-    # @param where must be SEEK_SET (the only value supported at present)
- #
- # Returns 0 on success and -1 on error.
- int bgzf_useek(BGZF *fp, long uoffset, int where)
-
- # Position in uncompressed BGZF
- #
- # @param fp BGZF file handler; must be opened for reading
- #
- # Returns the current offset on success and -1 on error.
- long bgzf_utell(BGZF *fp)
-
- # Tell BGZF to build index while compressing.
- #
- # @param fp BGZF file handler; can be opened for reading or writing.
- #
- # Returns 0 on success and -1 on error.
- int bgzf_index_build_init(BGZF *fp)
-
- # Load BGZF index
- #
- # @param fp BGZF file handler
- # @param bname base name
- # @param suffix suffix to add to bname (can be NULL)
- #
- # Returns 0 on success and -1 on error.
- int bgzf_index_load(BGZF *fp, const char *bname, const char *suffix)
-
- # Save BGZF index
- #
- # @param fp BGZF file handler
- # @param bname base name
- # @param suffix suffix to add to bname (can be NULL)
- #
- # Returns 0 on success and -1 on error.
- int bgzf_index_dump(BGZF *fp, const char *bname, const char *suffix)
-
-
-cdef extern from "htslib/hts.h" nogil:
- uint32_t kroundup32(uint32_t x)
-
- ctypedef struct cram_fd
-
- union FilePointerUnion:
- BGZF *bgzf
- cram_fd *cram
- hFILE *hfile
- void *voidp
-
- enum htsFormatCategory:
- unknown_category
- sequence_data # Sequence data -- SAM, BAM, CRAM, etc
- variant_data # Variant calling data -- VCF, BCF, etc
- index_file # Index file associated with some data file
- region_list # Coordinate intervals or regions -- BED, etc
- category_maximum
-
- enum htsExactFormat:
- unknown_format
- binary_format
- text_format
- sam, bam, bai, cram, crai, vcf, bcf, csi, gzi, tbi, bed
- format_maximum
-
- enum htsCompression:
- no_compression, gzip, bgzf, custom
- compression_maximum
-
- enum hts_fmt_option:
- CRAM_OPT_DECODE_MD,
- CRAM_OPT_PREFIX,
- CRAM_OPT_VERBOSITY,
- CRAM_OPT_SEQS_PER_SLICE,
- CRAM_OPT_SLICES_PER_CONTAINER,
- CRAM_OPT_RANGE,
- CRAM_OPT_VERSION,
- CRAM_OPT_EMBED_REF,
- CRAM_OPT_IGNORE_MD5,
- CRAM_OPT_REFERENCE,
- CRAM_OPT_MULTI_SEQ_PER_SLICE,
- CRAM_OPT_NO_REF,
- CRAM_OPT_USE_BZIP2,
- CRAM_OPT_SHARED_REF,
- CRAM_OPT_NTHREADS,
- CRAM_OPT_THREAD_POOL,
- CRAM_OPT_USE_LZMA,
- CRAM_OPT_USE_RANS,
- CRAM_OPT_REQUIRED_FIELDS,
- HTS_OPT_COMPRESSION_LEVEL,
- HTS_OPT_NTHREADS,
-
- ctypedef struct htsVersion:
- short major, minor
-
- ctypedef struct htsFormat:
- htsFormatCategory category
- htsExactFormat format
- htsVersion version
- htsCompression compression
- short compression_level
- void *specific
-
- ctypedef struct htsFile:
- uint8_t is_bin
- uint8_t is_write
- uint8_t is_be
- uint8_t is_cram
- int64_t lineno
- kstring_t line
- char *fn
- char *fn_aux
- FilePointerUnion fp
- htsFormat format
-
- int hts_verbose
-
- # @abstract Table for converting a nucleotide character to 4-bit encoding.
- # The input character may be either an IUPAC ambiguity code, '=' for 0, or
- # '0'/'1'/'2'/'3' for a result of 1/2/4/8. The result is encoded as 1/2/4/8
- # for A/C/G/T or combinations of these bits for ambiguous bases.
- const unsigned char *seq_nt16_table
-
- # @abstract Table for converting a 4-bit encoded nucleotide to an IUPAC
- # ambiguity code letter (or '=' when given 0).
- const char *seq_nt16_str
-
- # @abstract Table for converting a 4-bit encoded nucleotide to about 2 bits.
- # Returns 0/1/2/3 for 1/2/4/8 (i.e., A/C/G/T), or 4 otherwise (0 or ambiguous).
- const int *seq_nt16_int
-
- # @abstract Get the htslib version number
- # @return For released versions, a string like "N.N[.N]"; or git describe
- # output if using a library built within a Git repository.
- const char *hts_version()
-
- # @abstract Determine format by peeking at the start of a file
- # @param fp File opened for reading, positioned at the beginning
- # @param fmt Format structure that will be filled out on return
- # @return 0 for success, or negative if an error occurred.
- int hts_detect_format(hFILE *fp, htsFormat *fmt)
-
- # @abstract Get a human-readable description of the file format
- # @return Description string, to be freed by the caller after use.
- char *hts_format_description(const htsFormat *format)
-
- # @abstract Open a SAM/BAM/CRAM/VCF/BCF/etc file
- # @param fn The file name or "-" for stdin/stdout
- # @param mode Mode matching / [rwa][bceguxz0-9]* /
- # @discussion
- # With 'r' opens for reading; any further format mode letters are ignored
- # as the format is detected by checking the first few bytes or BGZF blocks
- # of the file. With 'w' or 'a' opens for writing or appending, with format
- # specifier letters:
- # b binary format (BAM, BCF, etc) rather than text (SAM, VCF, etc)
- # c CRAM format
- # g gzip compressed
- # u uncompressed
- # z bgzf compressed
- # [0-9] zlib compression level
- # and with non-format option letters (for any of 'r'/'w'/'a'):
- # e close the file on exec(2) (opens with O_CLOEXEC, where supported)
- # x create the file exclusively (opens with O_EXCL, where supported)
- # Note that there is a distinction between 'u' and '0': the first yields
- # plain uncompressed output whereas the latter outputs uncompressed data
- # wrapped in the zlib format.
- # @example
- # [rw]b .. compressed BCF, BAM, FAI
- # [rw]bu .. uncompressed BCF
- # [rw]z .. compressed VCF
- # [rw] .. uncompressed VCF
- htsFile *hts_open(const char *fn, const char *mode)
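-
-    # Example (minimal sketch): the mode string selects the on-disk encoding
-    # when writing, e.g. for variant data:
-    #
-    #     htsFile *v  = hts_open("out.vcf",    "w");   // plain VCF text
-    #     htsFile *b  = hts_open("out.bcf",    "wb");  // compressed binary BCF
-    #     htsFile *vz = hts_open("out.vcf.gz", "wz");  // BGZF-compressed VCF
-    #     hts_close(v); hts_close(b); hts_close(vz);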
-
- # @abstract Open a SAM/BAM/CRAM/VCF/BCF/etc file
- # @param fn The file name or "-" for stdin/stdout
- # @param mode Open mode, as per hts_open()
- # @param fmt Optional format specific parameters
- # @discussion
- # See hts_open() for description of fn and mode.
- # // TODO Update documentation for s/opts/fmt/
- # Opts contains a format string (sam, bam, cram, vcf, bcf) which will,
- # if defined, override mode. Opts also contains a linked list of hts_opt
- # structures to apply to the open file handle. These can contain things
- # like pointers to the reference or information on compression levels,
- # block sizes, etc.
- htsFile *hts_open_format(const char *fn, const char *mode, const htsFormat *fmt)
-
- # @abstract Open an existing stream as a SAM/BAM/CRAM/VCF/BCF/etc file
- # @param fp The already-open file handle
- # @param fn The file name or "-" for stdin/stdout
- # @param mode Open mode, as per hts_open()
- htsFile *hts_hopen(hFILE *fp, const char *fn, const char *mode)
-
- # @abstract Close a file handle, flushing buffered data for output streams
- # @param fp The file handle to be closed
- # @return 0 for success, or negative if an error occurred.
- int hts_close(htsFile *fp)
-
- # @abstract Returns the file's format information
- # @param fp The file handle
- # @return Read-only pointer to the file's htsFormat.
- const htsFormat *hts_get_format(htsFile *fp)
-
-    # @abstract Returns a string containing the file format extension.
-    # @param format Format structure containing the file type.
-    # @return A string ("sam", "bam", etc) or "?" for unknown formats.
- const char *hts_format_file_extension(const htsFormat *format)
-
- # @abstract Sets a specified CRAM option on the open file handle.
-    # @param fp The open file handle.
- # @param opt The CRAM_OPT_* option.
- # @param ... Optional arguments, dependent on the option used.
- # @return 0 for success, or negative if an error occurred.
- int hts_set_opt(htsFile *fp, hts_fmt_option opt, ...)
-
- int hts_getline(htsFile *fp, int delimiter, kstring_t *str)
- char **hts_readlines(const char *fn, int *_n)
-
- # @abstract Parse comma-separated list or read list from a file
-    # @param fn File name or comma-separated list
-    # @param is_file Whether _fn_ names a file (1) or is a comma-separated list (0)
- # @param _n Size of the output array (number of items read)
- # @return NULL on failure or pointer to newly allocated array of
- # strings
- char **hts_readlist(const char *fn, int is_file, int *_n)
-
- # @abstract Create extra threads to aid compress/decompression for this file
- # @param fp The file handle
- # @param n The number of worker threads to create
- # @return 0 for success, or negative if an error occurred.
- # @notes THIS THREADING API IS LIKELY TO CHANGE IN FUTURE.
- int hts_set_threads(htsFile *fp, int n)
-
- # @abstract Set .fai filename for a file opened for reading
- # @return 0 for success, negative on failure
- # @discussion
- # Called before *_hdr_read(), this provides the name of a .fai file
- # used to provide a reference list if the htsFile contains no @SQ headers.
- int hts_set_fai_filename(htsFile *fp, const char *fn_aux)
-
- int8_t HTS_IDX_NOCOOR
- int8_t HTS_IDX_START
- int8_t HTS_IDX_REST
- int8_t HTS_IDX_NONE
-
- int8_t HTS_FMT_CSI
- int8_t HTS_FMT_BAI
- int8_t HTS_FMT_TBI
- int8_t HTS_FMT_CRAI
-
- BGZF *hts_get_bgzfp(htsFile *fp)
- int hts_useek(htsFile *fp, long uoffset, int where)
- long hts_utell(htsFile *fp)
-
- ctypedef struct hts_idx_t
-
- ctypedef struct hts_pair64_t:
- uint64_t u, v
-
- ctypedef int hts_readrec_func(BGZF *fp, void *data, void *r, int *tid, int *beg, int *end)
-
- ctypedef struct hts_bins_t:
- int n, m
- int *a
-
- ctypedef struct hts_itr_t:
- uint32_t read_rest
- uint32_t finished
-        int tid, beg, end, n_off, i
- int curr_tid, curr_beg, curr_end
- uint64_t curr_off
- hts_pair64_t *off
- hts_readrec_func *readfunc
- hts_bins_t bins
-
- hts_idx_t *hts_idx_init(int n, int fmt, uint64_t offset0, int min_shift, int n_lvls)
- void hts_idx_destroy(hts_idx_t *idx)
- int hts_idx_push(hts_idx_t *idx, int tid, int beg, int end, uint64_t offset, int is_mapped)
- void hts_idx_finish(hts_idx_t *idx, uint64_t final_offset)
-
- #### Save an index to a file
- # @param idx Index to be written
- # @param fn Input BAM/BCF/etc filename, to which .bai/.csi/etc will be added
- # @param fmt One of the HTS_FMT_* index formats
- # @return 0 if successful, or negative if an error occurred.
- int hts_idx_save(const hts_idx_t *idx, const char *fn, int fmt)
-
- #### Save an index to a specific file
- # @param idx Index to be written
- # @param fn Input BAM/BCF/etc filename
- # @param fnidx Output filename, or NULL to add .bai/.csi/etc to @a fn
- # @param fmt One of the HTS_FMT_* index formats
- # @return 0 if successful, or negative if an error occurred.
- int hts_idx_save_as(const hts_idx_t *idx, const char *fn, const char *fnidx, int fmt)
-
- #### Load an index file
- # @param fn BAM/BCF/etc filename, to which .bai/.csi/etc will be added or
- # the extension substituted, to search for an existing index file
- # @param fmt One of the HTS_FMT_* index formats
- # @return The index, or NULL if an error occurred.
- hts_idx_t *hts_idx_load(const char *fn, int fmt)
-
- #### Load a specific index file
- # @param fn Input BAM/BCF/etc filename
- # @param fnidx The input index filename
- # @return The index, or NULL if an error occurred.
- hts_idx_t *hts_idx_load2(const char *fn, const char *fnidx)
-
- uint8_t *hts_idx_get_meta(hts_idx_t *idx, int *l_meta)
- void hts_idx_set_meta(hts_idx_t *idx, int l_meta, uint8_t *meta, int is_copy)
-
- int hts_idx_get_stat(const hts_idx_t* idx, int tid,
- uint64_t* mapped, uint64_t* unmapped)
-
- uint64_t hts_idx_get_n_no_coor(const hts_idx_t* idx)
-
- int HTS_PARSE_THOUSANDS_SEP # Ignore ',' separators within numbers
-
- # Parse a numeric string
- # The number may be expressed in scientific notation, and optionally may
- # contain commas in the integer part (before any decimal point or E notation).
- # @param str String to be parsed
- # @param strend If non-NULL, set on return to point to the first character
- # in @a str after those forming the parsed number
- # @param flags Or'ed-together combination of HTS_PARSE_* flags
- # @return Converted value of the parsed number.
- #
- # When @a strend is NULL, a warning will be printed (if hts_verbose is 2
- # or more) if there are any trailing characters after the number.
- long long hts_parse_decimal(const char *str, char **strend, int flags)
-
- # Parse a "CHR:START-END"-style region string
- # @param str String to be parsed
- # @param beg Set on return to the 0-based start of the region
- # @param end Set on return to the 1-based end of the region
- # @return Pointer to the colon or '\0' after the reference sequence name,
- # or NULL if @a str could not be parsed.
- const char *hts_parse_reg(const char *str, int *beg, int *end)
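-
-    # Example (minimal sketch): for "chr2:100-200" the parsed start is 0-based
-    # and the end 1-based, and the return value points at the colon:
-    #
-    #     int beg, end;
-    #     const char *rest = hts_parse_reg("chr2:100-200", &beg, &end);
-    #     // beg == 99, end == 200, rest points at ":100-200"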
-
- hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, int beg, int end, hts_readrec_func *readrec)
- void hts_itr_destroy(hts_itr_t *iter)
-
- ctypedef int (*hts_name2id_f)(void*, const char*)
- ctypedef const char *(*hts_id2name_f)(void*, int)
- ctypedef hts_itr_t *hts_itr_query_func(
- const hts_idx_t *idx,
- int tid,
- int beg,
- int end,
- hts_readrec_func *readrec)
-
- hts_itr_t *hts_itr_querys(
- const hts_idx_t *idx,
- const char *reg,
- hts_name2id_f getid,
- void *hdr,
- hts_itr_query_func *itr_query,
- hts_readrec_func *readrec)
-
- int hts_itr_next(BGZF *fp, hts_itr_t *iter, void *r, void *data)
- const char **hts_idx_seqnames(const hts_idx_t *idx, int *n, hts_id2name_f getid, void *hdr) # free only the array, not the values
-
- # hts_file_type() - Convenience function to determine file type
- # @fname: the file name
- #
- # Returns one of the FT_* defines.
- #
- # DEPRECATED: This function has been replaced by hts_detect_format().
- # It and these FT_* macros will be removed in a future HTSlib release.
- int FT_UNKN
- int FT_GZ
- int FT_VCF
- int FT_VCF_GZ
- int FT_BCF
- int FT_BCF_GZ
- int FT_STDIN
-
- int hts_file_type(const char *fname)
-
- inline int hts_reg2bin(int64_t beg, int64_t end, int min_shift, int n_lvls)
- inline int hts_bin_bot(int bin, int n_lvls)
-
- # * Endianness *
- inline int ed_is_big()
- inline uint16_t ed_swap_2(uint16_t v)
- inline void *ed_swap_2p(void *x)
- inline uint32_t ed_swap_4(uint32_t v)
- inline void *ed_swap_4p(void *x)
- inline uint64_t ed_swap_8(uint64_t v)
- inline void *ed_swap_8p(void *x)
-
-
-cdef extern from "htslib/sam.h" nogil:
- #**********************
- #*** SAM/BAM header ***
- #**********************
-
- # @abstract Structure for the alignment header.
- # @field n_targets number of reference sequences
- # @field l_text length of the plain text in the header
- # @field target_len lengths of the reference sequences
- # @field target_name names of the reference sequences
- # @field text plain text
- # @field sdict header dictionary
-
- ctypedef struct bam_hdr_t:
- int32_t n_targets, ignore_sam_err
- uint32_t l_text
- uint32_t *target_len
- uint8_t *cigar_tab
- char **target_name
- char *text
- void *sdict
-
- #****************************
- #*** CIGAR related macros ***
- #****************************
-
- int BAM_CMATCH
- int BAM_CINS
- int BAM_CDEL
- int BAM_CREF_SKIP
- int BAM_CSOFT_CLIP
- int BAM_CHARD_CLIP
- int BAM_CPAD
- int BAM_CEQUAL
- int BAM_CDIFF
- int BAM_CBACK
-
- char *BAM_CIGAR_STR
- int BAM_CIGAR_SHIFT
- uint32_t BAM_CIGAR_MASK
- uint32_t BAM_CIGAR_TYPE
-
- char bam_cigar_op(uint32_t c)
- uint32_t bam_cigar_oplen(uint32_t c)
- char bam_cigar_opchr(uint32_t)
- uint32_t bam_cigar_gen(char, uint32_t)
- int bam_cigar_type(char o)
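-
-    # Example (minimal sketch, b assumed to be a bam1_t*): printing an
-    # alignment's CIGAR string with the accessor macros above and
-    # bam_get_cigar() (declared further below):
-    #
-    #     uint32_t *cigar = bam_get_cigar(b);
-    #     for (int i = 0; i < b->core.n_cigar; i++)
-    #         printf("%u%c", bam_cigar_oplen(cigar[i]), bam_cigar_opchr(cigar[i]));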
-
- # @abstract the read is paired in sequencing, no matter whether it is mapped in a pair
- int BAM_FPAIRED
- # @abstract the read is mapped in a proper pair
- int BAM_FPROPER_PAIR
-    # @abstract the read itself is unmapped; conflicts with BAM_FPROPER_PAIR
- int BAM_FUNMAP
- # @abstract the mate is unmapped
- int BAM_FMUNMAP
- # @abstract the read is mapped to the reverse strand
- int BAM_FREVERSE
- # @abstract the mate is mapped to the reverse strand
- int BAM_FMREVERSE
- # @abstract this is read1
- int BAM_FREAD1
- # @abstract this is read2
- int BAM_FREAD2
- # @abstract not primary alignment
- int BAM_FSECONDARY
- # @abstract QC failure
- int BAM_FQCFAIL
- # @abstract optical or PCR duplicate
- int BAM_FDUP
- # @abstract supplementary alignment
- int BAM_FSUPPLEMENTARY
-
- #*************************
- #*** Alignment records ***
- #*************************
-
- # @abstract Structure for core alignment information.
- # @field tid chromosome ID, defined by bam_hdr_t
- # @field pos 0-based leftmost coordinate
- # @field bin bin calculated by bam_reg2bin()
- # @field qual mapping quality
- # @field l_qname length of the query name
- # @field flag bitwise flag
- # @field n_cigar number of CIGAR operations
- # @field l_qseq length of the query sequence (read)
- # @field mtid chromosome ID of next read in template, defined by bam_hdr_t
- # @field mpos 0-based leftmost coordinate of next read in template
-
- ctypedef struct bam1_core_t:
- int32_t tid
- int32_t pos
- uint16_t bin
- uint8_t qual
- uint8_t l_qname
- uint16_t flag
- uint16_t n_cigar
- int32_t l_qseq
- int32_t mtid
- int32_t mpos
- int32_t isize
-
- # @abstract Structure for one alignment.
- # @field core core information about the alignment
- # @field l_data current length of bam1_t::data
- # @field m_data maximum length of bam1_t::data
- # @field data all variable-length data, concatenated; structure: qname-cigar-seq-qual-aux
- #
- # @discussion Notes:
- #
-    # 1. qname is null terminated and core.l_qname includes the trailing '\0'.
- # 2. l_qseq is calculated from the total length of an alignment block
- # on reading or from CIGAR.
- # 3. cigar data is encoded 4 bytes per CIGAR operation.
- # 4. seq is nybble-encoded according to seq_nt16_table.
- ctypedef struct bam1_t:
- bam1_core_t core
- int l_data, m_data
- uint8_t *data
- uint64_t id
-
- # @abstract Get whether the query is on the reverse strand
- # @param b pointer to an alignment
- # @return boolean true if query is on the reverse strand
- int bam_is_rev(bam1_t *b)
-
- # @abstract Get whether the query's mate is on the reverse strand
- # @param b pointer to an alignment
- # @return boolean true if query's mate on the reverse strand
- int bam_is_mrev(bam1_t *b)
-
- # @abstract Get the name of the query
- # @param b pointer to an alignment
- # @return pointer to the name string, null terminated
- char *bam_get_qname(bam1_t *b)
-
- # @abstract Get the CIGAR array
- # @param b pointer to an alignment
- # @return pointer to the CIGAR array
- #
-    # @discussion In the CIGAR array, each element is a 32-bit integer. The
-    # lower 4 bits give the CIGAR operation and the higher 28 bits give the
-    # length of the operation.
- uint32_t *bam_get_cigar(bam1_t *b)
-
- # @abstract Get query sequence
- # @param b pointer to an alignment
- # @return pointer to sequence
- #
- # @discussion Each base is encoded in 4 bits: 1 for A, 2 for C, 4 for G,
- # 8 for T and 15 for N. Two bases are packed in one byte with the base
- # at the higher 4 bits having smaller coordinate on the read. It is
- # recommended to use bam_seqi() macro to get the base.
- char *bam_get_seq(bam1_t *b)
-
- # @abstract Get query quality
- # @param b pointer to an alignment
- # @return pointer to quality string
- uint8_t *bam_get_qual(bam1_t *b)
-
- # @abstract Get auxiliary data
- # @param b pointer to an alignment
- # @return pointer to the concatenated auxiliary data
- uint8_t *bam_get_aux(bam1_t *b)
-
- # @abstract Get length of auxiliary data
- # @param b pointer to an alignment
- # @return length of the concatenated auxiliary data
- int bam_get_l_aux(bam1_t *b)
-
- # @abstract Get a base on read
-    # @param s Query sequence returned by bam_get_seq()
- # @param i The i-th position, 0-based
- # @return 4-bit integer representing the base.
- char bam_seqi(char *s, int i)
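-
-    # Example (minimal sketch, b assumed to be a bam1_t*): recovering the read
-    # sequence as ASCII via bam_seqi() and the seq_nt16_str table from hts.h:
-    #
-    #     char *s = bam_get_seq(b);
-    #     for (int i = 0; i < b->core.l_qseq; i++)
-    #         putchar(seq_nt16_str[bam_seqi(s, i)]);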
-
- #**************************
- #*** Exported functions ***
- #**************************
-
- #***************
- #*** BAM I/O ***
- #***************
-
- bam_hdr_t *bam_hdr_init()
- bam_hdr_t *bam_hdr_read(BGZF *fp)
- int bam_hdr_write(BGZF *fp, const bam_hdr_t *h)
- void bam_hdr_destroy(bam_hdr_t *h)
- int bam_name2id(bam_hdr_t *h, const char *ref)
- bam_hdr_t* bam_hdr_dup(const bam_hdr_t *h0)
-
- bam1_t *bam_init1()
- void bam_destroy1(bam1_t *b)
- int bam_read1(BGZF *fp, bam1_t *b)
- int bam_write1(BGZF *fp, const bam1_t *b)
- bam1_t *bam_copy1(bam1_t *bdst, const bam1_t *bsrc)
- bam1_t *bam_dup1(const bam1_t *bsrc)
-
- int bam_cigar2qlen(int n_cigar, const uint32_t *cigar)
- int bam_cigar2rlen(int n_cigar, const uint32_t *cigar)
-
- # @abstract Calculate the rightmost base position of an alignment on the
- # reference genome.
-
- # @param b pointer to an alignment
- # @return the coordinate of the first base after the alignment, 0-based
-
- # @discussion For a mapped read, this is just b->core.pos + bam_cigar2rlen.
- # For an unmapped read (either according to its flags or if it has no cigar
- # string), we return b->core.pos + 1 by convention.
- int32_t bam_endpos(const bam1_t *b)
-
- int bam_str2flag(const char *str) # returns negative value on error
- char *bam_flag2str(int flag) # The string must be freed by the user
-
- #*************************
- #*** BAM/CRAM indexing ***
- #*************************
-
- # These BAM iterator functions work only on BAM files. To work with either
- # BAM or CRAM files use the sam_index_load() & sam_itr_*() functions.
- void bam_itr_destroy(hts_itr_t *iter)
- hts_itr_t *bam_itr_queryi(const hts_idx_t *idx, int tid, int beg, int end)
- hts_itr_t *bam_itr_querys(const hts_idx_t *idx, bam_hdr_t *hdr, const char *region)
- int bam_itr_next(htsFile *htsfp, hts_itr_t *itr, void *r)
-
- # Load/build .csi or .bai BAM index file. Does not work with CRAM.
- # It is recommended to use the sam_index_* functions below instead.
- hts_idx_t *bam_index_load(const char *fn)
- int bam_index_build(const char *fn, int min_shift)
-
- # Load a BAM (.csi or .bai) or CRAM (.crai) index file
- # @param fp File handle of the data file whose index is being opened
- # @param fn BAM/CRAM/etc filename to search alongside for the index file
- # @return The index, or NULL if an error occurred.
- hts_idx_t *sam_index_load(htsFile *fp, const char *fn)
-
- # Load a specific BAM (.csi or .bai) or CRAM (.crai) index file
- # @param fp File handle of the data file whose index is being opened
- # @param fn BAM/CRAM/etc data file filename
- # @param fnidx Index filename, or NULL to search alongside @a fn
- # @return The index, or NULL if an error occurred.
- hts_idx_t *sam_index_load2(htsFile *fp, const char *fn, const char *fnidx)
-
- # Generate and save an index file
- # @param fn Input BAM/etc filename, to which .csi/etc will be added
- # @param min_shift Positive to generate CSI, or 0 to generate BAI
- # @return 0 if successful, or negative if an error occurred (usually -1; or
- # -2: opening fn failed; -3: format not indexable)
- int sam_index_build(const char *fn, int min_shift)
-
- # Generate and save an index to a specific file
- # @param fn Input BAM/CRAM/etc filename
- # @param fnidx Output filename, or NULL to add .bai/.csi/etc to @a fn
- # @param min_shift Positive to generate CSI, or 0 to generate BAI
- # @return 0 if successful, or negative if an error occurred.
- int sam_index_build2(const char *fn, const char *fnidx, int min_shift)
-
- void sam_itr_destroy(hts_itr_t *iter)
- hts_itr_t *sam_itr_queryi(const hts_idx_t *idx, int tid, int beg, int end)
- hts_itr_t *sam_itr_querys(const hts_idx_t *idx, bam_hdr_t *hdr, const char *region)
- int sam_itr_next(htsFile *htsfp, hts_itr_t *itr, void *r)
-
- #***************
- #*** SAM I/O ***
- #***************
-
- htsFile *sam_open(const char *fn, const char *mode)
- htsFile *sam_open_format(const char *fn, const char *mode, const htsFormat *fmt)
- int sam_close(htsFile *fp)
-
- int sam_open_mode(char *mode, const char *fn, const char *format)
-
- # A version of sam_open_mode that can handle ,key=value options.
- # The format string is allocated and returned, to be freed by the caller.
-    # Prefix should be "r" or "w".
- char *sam_open_mode_opts(const char *fn, const char *mode, const char *format)
-
- bam_hdr_t *sam_hdr_parse(int l_text, const char *text)
- bam_hdr_t *sam_hdr_read(htsFile *fp)
- int sam_hdr_write(htsFile *fp, const bam_hdr_t *h)
-
- int sam_parse1(kstring_t *s, bam_hdr_t *h, bam1_t *b)
- int sam_format1(const bam_hdr_t *h, const bam1_t *b, kstring_t *str)
- int sam_read1(htsFile *fp, bam_hdr_t *h, bam1_t *b)
- int sam_write1(htsFile *fp, const bam_hdr_t *h, const bam1_t *b)
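-
-    # Example (minimal sketch, error handling omitted): copying records from
-    # one BAM file to another with the SAM I/O functions above:
-    #
-    #     htsFile *in = sam_open("in.bam", "rb"), *out = sam_open("out.bam", "wb");
-    #     bam_hdr_t *hdr = sam_hdr_read(in);
-    #     sam_hdr_write(out, hdr);
-    #     bam1_t *b = bam_init1();
-    #     while (sam_read1(in, hdr, b) >= 0) sam_write1(out, hdr, b);
-    #     bam_destroy1(b); bam_hdr_destroy(hdr);
-    #     sam_close(in); sam_close(out);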
-
- #*************************************
- #*** Manipulating auxiliary fields ***
- #*************************************
-
- uint8_t *bam_aux_get(const bam1_t *b, const char *tag)
- int32_t bam_aux2i(const uint8_t *s)
- double bam_aux2f(const uint8_t *s)
- char bam_aux2A(const uint8_t *s)
- char *bam_aux2Z(const uint8_t *s)
-
- void bam_aux_append(bam1_t *b, const char *tag, char type, int len, uint8_t *data)
- int bam_aux_del(bam1_t *b, uint8_t *s)
-
- #**************************
- #*** Pileup and Mpileup ***
- #**************************
-
- # @abstract Structure for one alignment covering the pileup position.
- # @field b pointer to the alignment
- # @field qpos position of the read base at the pileup site, 0-based
- # @field indel indel length; 0 for no indel, positive for ins and negative for del
- # @field level the level of the read in the "viewer" mode
- # @field is_del 1 iff the base on the padded read is a deletion
- # @field is_head ???
- # @field is_tail ???
- # @field is_refskip ???
- # @field aux ???
- #
- # @discussion See also bam_plbuf_push() and bam_lplbuf_push(). The
- # difference between the two functions is that the former does not
-    # set bam_pileup1_t::level, while the latter does. Level helps the
- # implementation of alignment viewers, but calculating this has some
- # overhead.
- #
- # is_del, is_head, etc are a bit field, declaring as below should
- # work as expected, see
- # https://groups.google.com/forum/#!msg/cython-users/24tD1kwRY7A/pmoPuSmanM0J
-
- ctypedef struct bam_pileup1_t:
- bam1_t *b
- int32_t qpos
- int indel, level
- uint32_t is_del
- uint32_t is_head
- uint32_t is_tail
- uint32_t is_refskip
- uint32_t aux
-
- ctypedef int (*bam_plp_auto_f)(void *data, bam1_t *b)
- ctypedef int (*bam_test_f)()
-
- ctypedef struct __bam_plp_t
- ctypedef __bam_plp_t *bam_plp_t
-
- ctypedef struct __bam_mplp_t
- ctypedef __bam_mplp_t *bam_mplp_t
-
-    # bam_plp_init() - sets up a pileup iterator
- # @func: see mplp_func in bam_plcmd.c in samtools for an example. Expected return
- # status: 0 on success, -1 on end, < -1 on non-recoverable errors
- # @data: user data to pass to @func
- bam_plp_t bam_plp_init(bam_plp_auto_f func, void *data)
- void bam_plp_destroy(bam_plp_t iter)
- int bam_plp_push(bam_plp_t iter, const bam1_t *b)
- const bam_pileup1_t *bam_plp_next(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp)
- const bam_pileup1_t *bam_plp_auto(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp)
- void bam_plp_set_maxcnt(bam_plp_t iter, int maxcnt)
- void bam_plp_reset(bam_plp_t iter)
-
- bam_mplp_t bam_mplp_init(int n, bam_plp_auto_f func, void **data)
-
- # bam_mplp_init_overlaps() - if called, mpileup will detect overlapping
- # read pairs and for each base pair set the base quality of the
- # lower-quality base to zero, thus effectively discarding it from
- # calling. If the two bases are identical, the quality of the other base
- # is increased to the sum of their qualities (capped at 200), otherwise
- # it is multiplied by 0.8.
- void bam_mplp_init_overlaps(bam_mplp_t iter)
- void bam_mplp_destroy(bam_mplp_t iter)
- void bam_mplp_set_maxcnt(bam_mplp_t iter, int maxcnt)
- int bam_mplp_auto(bam_mplp_t iter, int *_tid, int *_pos, int *n_plp, const bam_pileup1_t **plp)
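-
-    # Example (minimal sketch): a single-file pileup loop; read_bam is a
-    # hypothetical user callback matching bam_plp_auto_f that fills b with the
-    # next alignment and returns sam_read1()'s result:
-    #
-    #     bam_plp_t it = bam_plp_init(read_bam, data);
-    #     int tid, pos, n;
-    #     const bam_pileup1_t *plp;
-    #     while ((plp = bam_plp_auto(it, &tid, &pos, &n)) != NULL)
-    #         printf("%d\t%d\t%d\n", tid, pos, n);   // depth n at tid:pos
-    #     bam_plp_destroy(it);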
-
- # Added by AH
- # ctypedef bam_pileup1_t * const_bam_pileup1_t_ptr "const bam_pileup1_t *"
-
-
-cdef extern from "htslib/faidx.h" nogil:
-
- ctypedef struct faidx_t:
- pass
-
- int fai_build(char *fn)
-
- void fai_destroy(faidx_t *fai)
-
- faidx_t *fai_load(char *fn)
-
- char *fai_fetch(faidx_t *fai,
- char *reg,
- int *len)
-
- int faidx_nseq(faidx_t *fai)
-
- int faidx_has_seq(faidx_t *fai, const char *seq)
-
- char *faidx_fetch_seq(faidx_t *fai,
- char *c_name,
- int p_beg_i,
- int p_end_i,
- int *len)
-
- int faidx_seq_len(faidx_t *fai, const char *seq)
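-
-    # Example (minimal sketch, error handling omitted): fetching a region from
-    # an indexed FASTA; fai_fetch() returns a malloc'ed string that the caller
-    # must free:
-    #
-    #     faidx_t *fai = fai_load("ref.fa");
-    #     int len;
-    #     char *seq = fai_fetch(fai, "chr1:100-200", &len);
-    #     if (seq) { fputs(seq, stdout); free(seq); }
-    #     fai_destroy(fai);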
-
-
-# tabix support
-cdef extern from "htslib/tbx.h" nogil:
-
- # tbx.h definitions
- int8_t TBX_MAX_SHIFT
- int8_t TBX_GENERIC
- int8_t TBX_SAM
- int8_t TBX_VCF
- int8_t TBX_UCSC
-
- ctypedef struct tbx_conf_t:
- int32_t preset
- int32_t sc, bc, ec # seq col., beg col. and end col.
- int32_t meta_char, line_skip
-
- ctypedef struct tbx_t:
- tbx_conf_t conf
- hts_idx_t *idx
- void * dict
-
- tbx_conf_t tbx_conf_gff
- tbx_conf_t tbx_conf_bed
- tbx_conf_t tbx_conf_psltbl
- tbx_conf_t tbx_conf_sam
- tbx_conf_t tbx_conf_vcf
-
- void tbx_itr_destroy(hts_itr_t * iter)
-    hts_itr_t * tbx_itr_queryi(tbx_t * t, int tid, int beg, int end)
- hts_itr_t * tbx_itr_querys(tbx_t * t, char * s)
- int tbx_itr_next(htsFile * fp, tbx_t * t, hts_itr_t * iter, void * data)
-
- int tbx_name2id(tbx_t *tbx, char *ss)
-
- int tbx_index_build(char *fn, int min_shift, tbx_conf_t *conf)
- int tbx_index_build2(const char *fn, const char *fnidx, int min_shift, const tbx_conf_t *conf)
-
- tbx_t * tbx_index_load(char *fn)
- tbx_t *tbx_index_load2(const char *fn, const char *fnidx)
-
- # free the array but not the values
- char **tbx_seqnames(tbx_t *tbx, int *n)
-
- void tbx_destroy(tbx_t *tbx)
-
-
-# VCF/BCF API
-cdef extern from "htslib/vcf.h" nogil:
-
- # Header struct
-
- uint8_t BCF_HL_FLT # header line
- uint8_t BCF_HL_INFO
- uint8_t BCF_HL_FMT
- uint8_t BCF_HL_CTG
- uint8_t BCF_HL_STR # structured header line TAG=<A=..,B=..>
- uint8_t BCF_HL_GEN # generic header line
-
- uint8_t BCF_HT_FLAG # header type
- uint8_t BCF_HT_INT
- uint8_t BCF_HT_REAL
- uint8_t BCF_HT_STR
-
- uint8_t BCF_VL_FIXED # variable length
- uint8_t BCF_VL_VAR
- uint8_t BCF_VL_A
- uint8_t BCF_VL_G
- uint8_t BCF_VL_R
-
- # === Dictionary ===
- #
-    # The header keeps three dictionaries. The first keeps IDs in the
- # "FILTER/INFO/FORMAT" lines, the second keeps the sequence names and lengths
- # in the "contig" lines and the last keeps the sample names. bcf_hdr_t::dict[]
- # is the actual hash table, which is opaque to the end users. In the hash
- # table, the key is the ID or sample name as a C string and the value is a
- # bcf_idinfo_t struct. bcf_hdr_t::id[] points to key-value pairs in the hash
- # table in the order that they appear in the VCF header. bcf_hdr_t::n[] is the
- # size of the hash table or, equivalently, the length of the id[] arrays.
-
- uint8_t BCF_DT_ID # dictionary type
- uint8_t BCF_DT_CTG
- uint8_t BCF_DT_SAMPLE
-
- # Complete textual representation of a header line
- ctypedef struct bcf_hrec_t:
- int type # One of the BCF_HL_* type
- char *key # The part before '=', i.e. FILTER/INFO/FORMAT/contig/fileformat etc.
- char *value # Set only for generic lines, NULL for FILTER/INFO, etc.
- int nkeys # Number of structured fields
- char **keys # The key=value pairs
- char **vals
-
- ctypedef struct bcf_idinfo_t:
- uint32_t info[3] # stores Number:20, var:4, Type:4, ColType:4 in info[0..2]
- bcf_hrec_t *hrec[3] # for BCF_HL_FLT,INFO,FMT and contig length in info[0] for BCF_HL_CTG
- int id
-
- ctypedef struct bcf_idpair_t:
- const char *key
- const bcf_idinfo_t *val
-
- ctypedef struct bcf_hdr_t:
- int32_t n[3] # n:the size of the dictionary block in use, (allocated size, m, is below to preserve ABI)
- bcf_idpair_t *id[3]
- void *dict[3] # ID dictionary, contig dict and sample dict
- char **samples
- bcf_hrec_t **hrec
- int nhrec, dirty
- int ntransl
- int *transl[2] # for bcf_translate()
- int nsamples_ori # for bcf_hdr_set_samples()
- uint8_t *keep_samples
- kstring_t mem
- int32_t m[3] # m: allocated size of the dictionary block in use (see n above)
-
- uint8_t bcf_type_shift[]
-
- # * VCF record *
-
- uint8_t BCF_BT_NULL
- uint8_t BCF_BT_INT8
- uint8_t BCF_BT_INT16
- uint8_t BCF_BT_INT32
- uint8_t BCF_BT_FLOAT
- uint8_t BCF_BT_CHAR
-
- uint8_t VCF_REF
- uint8_t VCF_SNP
- uint8_t VCF_MNP
- uint8_t VCF_INDEL
- uint8_t VCF_OTHER
-
- ctypedef struct variant_t:
- int type, n # variant type and the number of bases affected, negative for deletions
-
- ctypedef struct bcf_fmt_t:
- int id # id: numeric tag id, the corresponding string is bcf_hdr_t::id[BCF_DT_ID][$id].key
- int n, size, type # n: number of values per-sample; size: number of bytes per-sample; type: one of BCF_BT_* types
- uint8_t *p # same as vptr and vptr_* in bcf_info_t below
- uint32_t p_len
- uint32_t p_off
- uint8_t p_free
-
- union bcf_info_union_t:
- int32_t i # integer value
- float f # float value
-
- ctypedef struct bcf_info_t:
- int key # key: numeric tag id, the corresponding string is bcf_hdr_t::id[BCF_DT_ID][$key].key
- int type, len # type: one of BCF_BT_* types; len: vector length, 1 for scalars
-
- # v1 union only set if $len==1; for easier access
- bcf_info_union_t v1
- uint8_t *vptr # pointer to data array in bcf1_t->shared.s, excluding the size+type and tag id bytes
- uint32_t vptr_len # length of the vptr block or, when set, of the vptr_mod block, excluding offset
- uint32_t vptr_off # vptr offset, i.e., the size of the INFO key plus size+type bytes
- uint8_t vptr_free # indicates that vptr-vptr_off must be freed; set only when modified and the new
- # data block is bigger than the original
-
- uint8_t BCF1_DIRTY_ID
- uint8_t BCF1_DIRTY_ALS
- uint8_t BCF1_DIRTY_FLT
- uint8_t BCF1_DIRTY_INF
-
- ctypedef struct bcf_dec_t:
- int m_fmt, m_info, m_id, m_als, m_allele, m_flt # allocated size (high-water mark); do not change
- int n_flt # Number of FILTER fields
- int *flt # FILTER keys in the dictionary
- char *id # ID
-        char *als # REF+ALT block (\0-separated)
- char **allele # allele[0] is the REF (allele[] pointers to the als block); all null terminated
- bcf_info_t *info # INFO
- bcf_fmt_t *fmt # FORMAT and individual sample
- variant_t *var # $var and $var_type set only when set_variant_types called
- int n_var, var_type
- int shared_dirty # if set, shared.s must be recreated on BCF output
- int indiv_dirty # if set, indiv.s must be recreated on BCF output
-
- uint8_t BCF_ERR_CTG_UNDEF
- uint8_t BCF_ERR_TAG_UNDEF
- uint8_t BCF_ERR_NCOLS
- uint8_t BCF_ERR_LIMITS
-
- # The bcf1_t structure corresponds to one VCF/BCF line. Reading from VCF file
-    # is slower because the string must first be parsed and packed into a BCF
-    # line (done in vcf_parse), then unpacked into the internal bcf1_t structure. If it
- # is known in advance that some of the fields will not be required (notably
- # the sample columns), parsing of these can be skipped by setting max_unpack
- # appropriately.
- # Similarly, it is fast to output a BCF line because the columns (kept in
- # shared.s, indiv.s, etc.) are written directly by bcf_write, whereas a VCF
- # line must be formatted in vcf_format.
-
- ctypedef struct bcf1_t:
- int32_t rid # CHROM
- int32_t pos # POS
- int32_t rlen # length of REF
- float qual # QUAL
- uint32_t n_info, n_allele
- uint32_t n_fmt, n_sample
- kstring_t shared, indiv
- bcf_dec_t d # lazy evaluation: $d is not generated by bcf_read(), but by explicitly calling bcf_unpack()
- int max_unpack # Set to BCF_UN_STR, BCF_UN_FLT, or BCF_UN_INFO to boost performance of vcf_parse when some of the fields won't be needed
- int unpacked # remember what has been unpacked to allow calling bcf_unpack() repeatedly without redoing the work
- int unpack_size[3] # the original block size of ID, REF+ALT and FILTER
- int errcode # one of BCF_ERR_* codes
-
- ####### API #######
-
- # BCF and VCF I/O
- #
- # A note about naming conventions: htslib internally represents VCF
- # records as bcf1_t data structures, therefore most functions are
- # prefixed with bcf_. There are a few exceptions where the functions must
- # be aware of both BCF and VCF worlds, such as bcf_parse vs vcf_parse. In
- # these cases, functions prefixed with bcf_ are more general and work
- # with both BCF and VCF.
-
- # bcf_hdr_init() - create an empty BCF header.
- # @param mode "r" or "w"
- #
- # When opened for writing, the mandatory fileFormat and
- # FILTER=PASS lines are added automatically.
- bcf_hdr_t *bcf_hdr_init(const char *mode)
-
- # Destroy a BCF header struct
- void bcf_hdr_destroy(bcf_hdr_t *h)
-
- # Initialize a bcf1_t object; equivalent to calloc(1, sizeof(bcf1_t))
- bcf1_t *bcf_init()
-
- # Deallocate a bcf1_t object
- void bcf_destroy(bcf1_t *v)
-
- # Same as bcf_destroy() but frees only the memory allocated by bcf1_t,
- # not the bcf1_t object itself.
- void bcf_empty(bcf1_t *v)
-
- # Make the bcf1_t object ready for next read. Intended mostly for
- # internal use, the user should rarely need to call this function
- # directly.
- void bcf_clear(bcf1_t *v)
-
- # Reads VCF or BCF header
- bcf_hdr_t *bcf_hdr_read(htsFile *fp)
-
- # bcf_hdr_set_samples() - for more efficient VCF parsing when only one/few samples are needed
- # @samples: samples to include or exclude from file or as a comma-separated string.
- # LIST|FILE .. select samples in list/file
- # ^LIST|FILE .. exclude samples from list/file
- # - .. include all samples
- # NULL .. exclude all samples
- # @is_file: @samples is a file (1) or a comma-separated list (0)
- #
- # The bottleneck of VCF reading is parsing of genotype fields. If the
-    # reader knows in advance that only a subset of samples is needed (possibly
- # no samples at all), the performance of bcf_read() can be significantly
- # improved by calling bcf_hdr_set_samples after bcf_hdr_read().
- # The function bcf_read() will subset the VCF/BCF records automatically
- # with the notable exception when reading records via bcf_itr_next().
- # In this case, bcf_subset_format() must be called explicitly, because
- # bcf_readrec() does not see the header.
- #
- # Returns 0 on success, -1 on error or a positive integer if the list
- # contains samples not present in the VCF header. In such a case, the
- # return value is the index of the offending sample.
- #
- int bcf_hdr_set_samples(bcf_hdr_t *hdr, const char *samples, int is_file)
- int bcf_subset_format(const bcf_hdr_t *hdr, bcf1_t *rec)
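-
-    # Example (minimal sketch, fp assumed to be an open htsFile*; the sample
-    # names are illustrative): restrict parsing to two samples before reading:
-    #
-    #     bcf_hdr_t *hdr = bcf_hdr_read(fp);
-    #     bcf_hdr_set_samples(hdr, "sampleA,sampleB", 0);  // 0: list, not a file
-    #     bcf1_t *rec = bcf_init();
-    #     while (bcf_read(fp, hdr, rec) == 0) { /* rec now carries 2 samples */ }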
-
- # Writes VCF or BCF header
- int bcf_hdr_write(htsFile *fp, bcf_hdr_t *h)
-
- # Parse VCF line contained in kstring and populate the bcf1_t struct
- int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v)
-
- # The opposite of vcf_parse. It should rarely be called directly, see vcf_write
- int vcf_format(const bcf_hdr_t *h, const bcf1_t *v, kstring_t *s)
-
- # bcf_read() - read next VCF or BCF record
- #
- # Returns -1 on critical errors, 0 otherwise. On errors which are not
- # critical for reading, such as missing header definitions, v->errcode is
-    # set to one of the BCF_ERR_* codes and must be checked before calling
- # vcf_write().
- int bcf_read(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
-
- # bcf_unpack() - unpack/decode a BCF record (fills the bcf1_t::d field)
- #
- # Note that bcf_unpack() must be called even when reading VCF. It is safe
- # to call the function repeatedly, it will not unpack the same field
- # twice.
- uint8_t BCF_UN_STR # up to ALT inclusive
- uint8_t BCF_UN_FLT # up to FILTER
- uint8_t BCF_UN_INFO # up to INFO
- uint8_t BCF_UN_SHR # all shared information
- uint8_t BCF_UN_FMT # unpack format and each sample
-    uint8_t BCF_UN_IND # a synonym of BCF_UN_FMT
- uint8_t BCF_UN_ALL # everything
-
- int bcf_unpack(bcf1_t *b, int which)
-
- # bcf_dup() - create a copy of BCF record.
- #
- # Note that bcf_unpack() must be called on the returned copy as if it was
- # obtained from bcf_read(). Also note that bcf_dup() calls bcf_sync1(src)
- # internally to reflect any changes made by bcf_update_* functions.
- bcf1_t *bcf_dup(bcf1_t *src)
- bcf1_t *bcf_copy(bcf1_t *dst, bcf1_t *src)
-
- # bcf_write() - write one VCF or BCF record. The type is determined at the open() call.
- int bcf_write(htsFile *fp, bcf_hdr_t *h, bcf1_t *v)
-
- # The following functions work only with VCFs and should rarely be called
- # directly. Usually one wants to use their bcf_* alternatives, which work
- # transparently with both VCFs and BCFs.
- bcf_hdr_t *vcf_hdr_read(htsFile *fp)
- int vcf_hdr_write(htsFile *fp, const bcf_hdr_t *h)
- int vcf_read(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
- int vcf_write(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
-
- #************************************************************************
- # Header querying and manipulation routines
- #************************************************************************
-
- # Create a new header using the supplied template
- bcf_hdr_t *bcf_hdr_dup(const bcf_hdr_t *hdr)
-
- # Copy header lines from src to dst if not already present in dst. See also bcf_translate().
- # Returns 0 on success or sets a bit on error:
- # 1 .. conflicting definitions of tag length
- # # todo
- int bcf_hdr_combine(bcf_hdr_t *dst, const bcf_hdr_t *src)
-
- # bcf_hdr_merge() - copy header lines from src to dst, see also bcf_translate()
- # @param dst: the destination header to be merged into, NULL on the first pass
- # @param src: the source header
- #
- # Notes:
- # - use as:
- # bcf_hdr_t *dst = NULL;
- # for (i=0; i<nsrc; i++) dst = bcf_hdr_merge(dst,src[i]);
- #
- # - bcf_hdr_merge() replaces bcf_hdr_combine() which had a problem when
- # combining multiple BCF headers. The current bcf_hdr_combine()
- # does not have this problem, but became slow when used for many files.
- bcf_hdr_t *bcf_hdr_merge(bcf_hdr_t *dst, const bcf_hdr_t *src)
-
- # bcf_hdr_add_sample() - add a new sample.
- # @param sample: sample name to be added
- int bcf_hdr_add_sample(bcf_hdr_t *hdr, const char *sample)
-
- # Read VCF header from a file and update the header
- int bcf_hdr_set(bcf_hdr_t *hdr, const char *fname)
-
- # Returns formatted header (newly allocated string) and its length,
- # excluding the terminating \0. If is_bcf parameter is unset, IDX
- # fields are discarded.
- char *bcf_hdr_fmt_text(const bcf_hdr_t *hdr, int is_bcf, int *len)
-
- # Append new VCF header line, returns 0 on success
- int bcf_hdr_append(bcf_hdr_t *h, const char *line)
- int bcf_hdr_printf(bcf_hdr_t *h, const char *format, ...)
-
- # VCF version, e.g. VCFv4.2
- const char *bcf_hdr_get_version(const bcf_hdr_t *hdr)
- void bcf_hdr_set_version(bcf_hdr_t *hdr, const char *version)
-
- # bcf_hdr_remove() - remove VCF header tag
- # @param type: one of BCF_HL_*
- # @param key: tag name or NULL to remove all tags of the given type
- void bcf_hdr_remove(bcf_hdr_t *h, int type, const char *key)
-
- # bcf_hdr_subset() - creates a new copy of the header removing unwanted samples
- # @param n: number of samples to keep
- # @param samples: names of the samples to keep
- # @param imap: mapping from index in @samples to the sample index in the original file
- #
- # Sample names not present in h0 are ignored. The number of unmatched samples can be checked
- # by comparing n and bcf_hdr_nsamples(out_hdr).
- # This function can be used to reorder samples.
- # See also bcf_subset() which subsets individual records.
- #
- bcf_hdr_t *bcf_hdr_subset(const bcf_hdr_t *h0, int n, char *const* samples, int *imap)
-
- # Creates a list of sequence names. It is up to the caller to free the list (but not the sequence names)
- const char **bcf_hdr_seqnames(const bcf_hdr_t *h, int *nseqs)
-
- # Get number of samples
- int32_t bcf_hdr_nsamples(const bcf_hdr_t *h)
-
- # The following functions are for internal use and should rarely be called directly
- int bcf_hdr_parse(bcf_hdr_t *hdr, char *htxt)
- int bcf_hdr_sync(bcf_hdr_t *h)
- bcf_hrec_t *bcf_hdr_parse_line(const bcf_hdr_t *h, const char *line, int *len)
- void bcf_hrec_format(const bcf_hrec_t *hrec, kstring_t *str)
- int bcf_hdr_add_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec)
-
- # bcf_hdr_get_hrec() - get header line info
- # @param type: one of the BCF_HL_* types: FLT,INFO,FMT,CTG,STR,GEN
- # @param key: the header key for generic lines (e.g. "fileformat"), any field
- # for structured lines, typically "ID".
-    # @param value: the value which pairs with key. Can be NULL for BCF_HL_GEN
- # @param str_class: the class of BCF_HL_STR line (e.g. "ALT" or "SAMPLE"), otherwise NULL
- #
- bcf_hrec_t *bcf_hdr_get_hrec(const bcf_hdr_t *hdr, int type, const char *key, const char *value, const char *str_class)
- bcf_hrec_t *bcf_hrec_dup(bcf_hrec_t *hrec)
- void bcf_hrec_add_key(bcf_hrec_t *hrec, const char *str, int len)
- void bcf_hrec_set_val(bcf_hrec_t *hrec, int i, const char *str, int len, int is_quoted)
- int bcf_hrec_find_key(bcf_hrec_t *hrec, const char *key)
- void hrec_add_idx(bcf_hrec_t *hrec, int idx)
- void bcf_hrec_destroy(bcf_hrec_t *hrec)
-
- #************************************************************************
- # Individual record querying and manipulation routines
- #************************************************************************
-
- # See the description of bcf_hdr_subset()
- int bcf_subset(const bcf_hdr_t *h, bcf1_t *v, int n, int *imap)
-
-    # bcf_translate() - translate tag IDs to be consistent with a different header. This function
-    # is useful when lines from multiple VCFs need to be combined.
- # @dst_hdr: the destination header, to be used in bcf_write(), see also bcf_hdr_combine()
- # @src_hdr: the source header, used in bcf_read()
- # @src_line: line obtained by bcf_read()
- int bcf_translate(const bcf_hdr_t *dst_hdr, bcf_hdr_t *src_hdr, bcf1_t *src_line)
-
- # bcf_get_variant_type[s]() - returns one of VCF_REF, VCF_SNP, etc
- int bcf_get_variant_types(bcf1_t *rec)
- int bcf_get_variant_type(bcf1_t *rec, int ith_allele)
- int bcf_is_snp(bcf1_t *v)
-
- # bcf_update_filter() - sets the FILTER column
- # @flt_ids: The filter IDs to set, numeric IDs returned by bcf_hdr_id2int(hdr, BCF_DT_ID, "PASS")
- # @n: Number of filters. If n==0, all filters are removed
- int bcf_update_filter(const bcf_hdr_t *hdr, bcf1_t *line, int *flt_ids, int n)
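-
-    # Example (minimal sketch, hdr and line assumed in scope): set FILTER to PASS:
-    #
-    #     int pass_id = bcf_hdr_id2int(hdr, BCF_DT_ID, "PASS");
-    #     bcf_update_filter(hdr, line, &pass_id, 1);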
-
- # bcf_add_filter() - adds to the FILTER column
- # @flt_id: The filter IDs to add, numeric IDs returned by bcf_hdr_id2int(hdr, BCF_DT_ID, "PASS")
- #
- # If flt_id is PASS, all existing filters are removed first. If other than PASS, existing PASS is removed.
- int bcf_add_filter(const bcf_hdr_t *hdr, bcf1_t *line, int flt_id)
-
- # bcf_remove_filter() - removes from the FILTER column
- # @flt_id: filter ID to remove, numeric ID returned by bcf_hdr_id2int(hdr, BCF_DT_ID, "PASS")
- # @pass: when set to 1 and no filters are present, set to PASS
- int bcf_remove_filter(const bcf_hdr_t *hdr, bcf1_t *line, int flt_id, int set_pass)
-
- # Returns 1 if present, 0 if absent, or -1 if filter does not exist. "PASS" and "." can be used interchangeably.
- int bcf_has_filter(const bcf_hdr_t *hdr, bcf1_t *line, char *filter)
-
- # bcf_update_alleles() and bcf_update_alleles_str() - update REF and ALT column
- # @alleles: Array of alleles
- # @nals: Number of alleles
- # @alleles_string: Comma-separated alleles, starting with the REF allele
- int bcf_update_alleles(const bcf_hdr_t *hdr, bcf1_t *line, const char **alleles, int nals)
- int bcf_update_alleles_str(const bcf_hdr_t *hdr, bcf1_t *line, const char *alleles_string)
-
- # bcf_update_id() - sets new ID string
- # bcf_add_id() - adds to the ID string checking for duplicates
- int bcf_update_id(const bcf_hdr_t *hdr, bcf1_t *line, const char *id)
- int bcf_add_id(const bcf_hdr_t *hdr, bcf1_t *line, const char *id)
-
- # bcf_update_info_*() - functions for updating INFO fields
- # @hdr: the BCF header
- # @line: VCF line to be edited
- # @key: the INFO tag to be updated
- # @values: pointer to the array of values. Pass NULL to remove the tag.
- # @n: number of values in the array. When set to 0, the INFO tag is removed
- #
- # The @string in bcf_update_info_flag() is optional, @n indicates whether
- # the flag is set or removed.
- #
- # Returns 0 on success or negative value on error.
- #
- int bcf_update_info_int32(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const int32_t *values, int n)
- int bcf_update_info_float(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const float *values, int n)
- int bcf_update_info_flag(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const char *values, int n)
- int bcf_update_info_string(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const char *values, int n)
- int bcf_update_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const void *values, int n, int type)
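-
-    # Example (minimal sketch, hdr and line assumed in scope; assumes an
-    # INFO/DP line is defined in the header):
-    #
-    #     int32_t dp = 42;
-    #     bcf_update_info_int32(hdr, line, "DP", &dp, 1);   // set DP=42
-    #     bcf_update_info_int32(hdr, line, "DP", NULL, 0);  // remove DP again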
-
- # bcf_update_format_*() - functions for updating FORMAT fields
- # @values: pointer to the array of values, the same number of elements
- # is expected for each sample. Missing values must be padded
- # with bcf_*_missing or bcf_*_vector_end values.
- # @n: number of values in the array. If n==0, existing tag is removed.
- #
- # The function bcf_update_format_string() is a higher-level (slower) variant of
- # bcf_update_format_char(). The former accepts array of \0-terminated strings
- # whereas the latter requires that the strings are collapsed into a single array
- # of fixed-length strings. In case of strings with variable length, shorter strings
- # can be \0-padded. Note that the collapsed strings passed to bcf_update_format_char()
- # are not \0-terminated.
- #
- # Returns 0 on success or negative value on error.
- #
- int bcf_update_format_int32(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const int32_t *values, int n)
- int bcf_update_format_float(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const float *values, int n)
- int bcf_update_format_char(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const char *values, int n)
- int bcf_update_genotypes(const bcf_hdr_t *hdr, bcf1_t *line, const int32_t *values, int n)
- int bcf_update_format_string(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const char **values, int n)
- int bcf_update_format(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const void *values, int n, int type)
-
- # Macros for setting genotypes correctly, for use with bcf_update_genotypes only; idx corresponds
- # to VCF's GT (1-based index to ALT or 0 for the reference allele) and val is the opposite, obtained
- # from bcf_get_genotypes() below.
- uint32_t bcf_gt_phased(uint32_t idx)
- uint32_t bcf_gt_unphased(uint32_t idx)
- uint32_t bcf_gt_missing
- uint32_t bcf_gt_is_missing(uint32_t val)
- uint32_t bcf_gt_is_phased(uint32_t idx)
- uint32_t bcf_gt_allele(uint32_t val)
-
-    # Conversion between allele indexes and the Number=G genotype index (assuming diploid, all 0-based)
- uint32_t bcf_alleles2gt(uint32_t a, uint32_t b)
- void bcf_gt2alleles(int igt, int *a, int *b)
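-
-    # Example (minimal sketch, hdr and line assumed in scope): writing a
-    # phased diploid genotype 0|1 and decoding it again with
-    # bcf_get_genotypes() (declared below):
-    #
-    #     int32_t gt[2] = { bcf_gt_phased(0), bcf_gt_phased(1) };
-    #     bcf_update_genotypes(hdr, line, gt, 2);
-    #     int *arr = NULL, narr = 0;
-    #     bcf_get_genotypes(hdr, line, &arr, &narr);
-    #     int a0 = bcf_gt_allele(arr[0]), a1 = bcf_gt_allele(arr[1]);  // 0 and 1
-    #     free(arr);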
-
- # bcf_get_fmt() - returns pointer to FORMAT's field data
- # @header: for access to BCF_DT_ID dictionary
- # @line: VCF line obtained from vcf_parse1
- # @fmt: one of GT,PL,...
- #
- # Returns bcf_fmt_t* if the call succeeded, or returns NULL when the field
- # is not available.
- #
- bcf_fmt_t *bcf_get_fmt(const bcf_hdr_t *hdr, bcf1_t *line, const char *key)
- bcf_info_t *bcf_get_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key)
-
- # bcf_get_*_id() - returns pointer to FORMAT/INFO field data given the header index instead of the string ID
- # @line: VCF line obtained from vcf_parse1
- # @id: The header index for the tag, obtained from bcf_hdr_id2int()
- #
- # Returns bcf_fmt_t* / bcf_info_t*. These functions do not check if the index is valid
- # as their goal is to avoid the header lookup.
- #
- bcf_fmt_t *bcf_get_fmt_id(bcf1_t *line, const int id)
- bcf_info_t *bcf_get_info_id(bcf1_t *line, const int id)
-
- # bcf_get_info_*() - get INFO values, integers or floats
- # @hdr: BCF header
- # @line: BCF record
- # @tag: INFO tag to retrieve
- # @dst: *dst is pointer to a memory location, can point to NULL
- # @ndst: pointer to the size of allocated memory
- #
- # Returns negative value on error or the number of written values on
- # success. bcf_get_info_string() returns on success the number of
- # characters written excluding the null-terminating byte. bcf_get_info_flag()
- # returns 1 when flag is set or 0 if not.
- #
- # List of return codes:
- # -1 .. no such INFO tag defined in the header
- # -2 .. clash between types defined in the header and encountered in the VCF record
- # -3 .. tag is not present in the VCF record
- #
- int bcf_get_info_int32(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, int32_t **dst, int *ndst)
- int bcf_get_info_float(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, float **dst, int *ndst)
- int bcf_get_info_string(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, char **dst, int *ndst)
- int bcf_get_info_flag(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, int **dst, int *ndst)
- int bcf_get_info_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, void **dst, int *ndst, int type)
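-
-    # Example (minimal sketch, hdr and line assumed in scope): fetching
-    # INFO/DP; the destination buffer is (re)allocated by htslib and must be
-    # freed by the caller:
-    #
-    #     int32_t *dp = NULL; int ndp = 0;
-    #     if ( bcf_get_info_int32(hdr, line, "DP", &dp, &ndp) > 0 )
-    #         printf("DP=%d\n", dp[0]);
-    #     free(dp);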
-
- # bcf_get_format_*() - same as bcf_get_info*() above
- #
- # The function bcf_get_format_string() is a higher-level (slower) variant of bcf_get_format_char().
-    # See the description of bcf_update_format_string() and bcf_update_format_char() above.
-    # Unlike other bcf_get_format_*() functions, bcf_get_format_string() allocates two arrays:
- # a single block of \0-terminated strings collapsed into a single array and an array of pointers
- # to these strings. Both arrays must be cleaned by the user.
- #
- # Returns negative value on error or the number of written values on success.
- #
- # Example:
-    # int ndst = 0; char **dst = NULL;
-    # if ( bcf_get_format_string(hdr, line, "XX", &dst, &ndst) > 0 )
-    #     for (i=0; i<bcf_hdr_nsamples(hdr); i++) printf("%s\n", dst[i]);
-    # free(dst[0]); free(dst);
-    #
-    # Example:
-    # int ngt, *gt_arr = NULL, ngt_arr = 0;
-    # ngt = bcf_get_genotypes(hdr, line, &gt_arr, &ngt_arr);
- #
- int bcf_get_format_int32(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, int32_t **dst, int *ndst)
- int bcf_get_format_float(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, float **dst, int *ndst)
- int bcf_get_format_char(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, char **dst, int *ndst)
- int bcf_get_genotypes(const bcf_hdr_t *hdr, bcf1_t *line, int **dst, int *ndst)
- int bcf_get_format_string(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, char ***dst, int *ndst)
- int bcf_get_format_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, void **dst, int *ndst, int type)
-
- #************************************************************************
- # Helper functions
- #************************************************************************
-
- #
- # bcf_hdr_id2int() - Translates string into numeric ID
- # bcf_hdr_int2id() - Translates numeric ID into string
- # @type: one of BCF_DT_ID, BCF_DT_CTG, BCF_DT_SAMPLE
- # @id: tag name, such as: PL, DP, GT, etc.
- #
- # Returns -1 if string is not in dictionary, otherwise numeric ID which identifies
- # fields in BCF records.
- #
- int bcf_hdr_id2int(const bcf_hdr_t *hdr, int type, const char *id)
- const char *bcf_hdr_int2id(const bcf_hdr_t *hdr, int type, int int_id)
-
- # bcf_hdr_name2id() - Translates sequence names (chromosomes) into numeric ID
- # bcf_hdr_id2name() - Translates numeric ID to sequence name
- #
- int bcf_hdr_name2id(const bcf_hdr_t *hdr, const char *id)
- const char *bcf_hdr_id2name(const bcf_hdr_t *hdr, int rid)
- const char *bcf_seqname(const bcf_hdr_t *hdr, bcf1_t *rec)
-
- #
- # bcf_hdr_id2*() - Macros for accessing bcf_idinfo_t
- # @type: one of BCF_HL_FLT, BCF_HL_INFO, BCF_HL_FMT
- # @int_id: return value of bcf_hdr_id2int, must be >=0
- #
- # The returned values are:
- # bcf_hdr_id2length .. whether the number of values is fixed or variable, one of BCF_VL_*
- # bcf_hdr_id2number .. the number of values, 0xfffff for variable length fields
- # bcf_hdr_id2type .. the field type, one of BCF_HT_*
- # bcf_hdr_id2coltype .. the column type, one of BCF_HL_*
- #
- # Notes: Prior to using the macros, the presence of the info should be
- # tested with bcf_hdr_idinfo_exists().
- #
- int bcf_hdr_id2length(const bcf_hdr_t *hdr, int type, int int_id)
- int bcf_hdr_id2number(const bcf_hdr_t *hdr, int type, int int_id)
- int bcf_hdr_id2type(const bcf_hdr_t *hdr, int type, int int_id)
- int bcf_hdr_id2coltype(const bcf_hdr_t *hdr, int type, int int_id)
- int bcf_hdr_idinfo_exists(const bcf_hdr_t *hdr, int type, int int_id)
- bcf_hrec_t *bcf_hdr_id2hrec(const bcf_hdr_t *hdr, int type, int col_type, int int_id)
-
- void bcf_fmt_array(kstring_t *s, int n, int type, void *data)
- uint8_t *bcf_fmt_sized_array(kstring_t *s, uint8_t *ptr)
-
- void bcf_enc_vchar(kstring_t *s, int l, const char *a)
- void bcf_enc_vint(kstring_t *s, int n, int32_t *a, int wsize)
- void bcf_enc_vfloat(kstring_t *s, int n, float *a)
-
- #************************************************************************
- # BCF index
- #
- # Note that these functions work with BCFs only. See synced_bcf_reader.h
- # which provides (amongst other things) an API to work transparently with
- # both indexed BCFs and VCFs.
- #************************************************************************
-
- hts_idx_t *bcf_index_load2(const char *fn, const char *fnidx)
- int bcf_index_build(const char *fn, int min_shift)
- int bcf_index_build2(const char *fn, const char *fnidx, int min_shift)
-
-    #************************************************************************
-    # Typed value I/O
-    #************************************************************************
-
- # Note that in contrast with BCFv2.1 specification, HTSlib implementation
- # allows missing values in vectors. For integer types, the values 0x80,
- # 0x8000, 0x80000000 are interpreted as missing values and 0x81, 0x8001,
- # 0x80000001 as end-of-vector indicators. Similarly for floats, the value of
- # 0x7F800001 is interpreted as a missing value and 0x7F800002 as an
- # end-of-vector indicator.
- # Note that the end-of-vector byte is not part of the vector.
-
-    # This trial BCF version (v2.2) is compatible with the VCF specification and
-    # makes it possible to handle vectors of different ploidy correctly in the
-    # presence of missing values.
-
- int32_t bcf_int8_vector_end
- int32_t bcf_int16_vector_end
- int32_t bcf_int32_vector_end
- int32_t bcf_str_vector_end
- int32_t bcf_int8_missing
- int32_t bcf_int16_missing
- int32_t bcf_int32_missing
- int32_t bcf_str_missing
-
- uint32_t bcf_float_vector_end
- uint32_t bcf_float_missing
-
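A pure-Python sketch of the sentinel convention described above, using the int8 case (0x80 is -128 and 0x81 is -127 when read as signed bytes):

# int8 sentinels from the note above, viewed as signed values
INT8_MISSING = -128      # 0x80: missing value
INT8_VECTOR_END = -127   # 0x81: end-of-vector indicator

def decode_vector(values):
    """Strip end-of-vector padding, map missing values to None."""
    out = []
    for v in values:
        if v == INT8_VECTOR_END:
            break                      # padding: not part of the vector
        out.append(None if v == INT8_MISSING else v)
    return out

# a haploid genotype padded to diploid width, and a vector with a
# missing entry:
assert decode_vector([2, INT8_VECTOR_END]) == [2]
assert decode_vector([INT8_MISSING, 3]) == [None, 3]
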
- void bcf_float_set(float *ptr, uint32_t value)
- void bcf_float_set_vector_end(float *x)
- void bcf_float_set_missing(float *x)
-
- int bcf_float_is_missing(float f)
- int bcf_float_is_vector_end(float f)
- void bcf_format_gt(bcf_fmt_t *fmt, int isample, kstring_t *str)
- void bcf_enc_size(kstring_t *s, int size, int type)
- int bcf_enc_inttype(long x)
- void bcf_enc_int1(kstring_t *s, int32_t x)
- int32_t bcf_dec_int1(const uint8_t *p, int type, uint8_t **q)
- int32_t bcf_dec_typed_int1(const uint8_t *p, uint8_t **q)
- int32_t bcf_dec_size(const uint8_t *p, uint8_t **q, int *type)
-
- # These trivial wrappers are defined only for consistency with other parts of htslib
- bcf1_t *bcf_init1()
- int bcf_read1(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
- int vcf_read1(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
- int bcf_write1(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
- int vcf_write1(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
- void bcf_destroy1(bcf1_t *v)
- void bcf_empty1(bcf1_t *v)
- int vcf_parse1(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v)
- void bcf_clear1(bcf1_t *v)
- int vcf_format1(const bcf_hdr_t *h, const bcf1_t *v, kstring_t *s)
-
- # Other nice wrappers
- void bcf_itr_destroy(hts_itr_t *iter)
- hts_itr_t *bcf_itr_queryi(const hts_idx_t *idx, int tid, int beg, int end)
- hts_itr_t *bcf_itr_querys(const hts_idx_t *idx, const bcf_hdr_t *hdr, char *s)
- int bcf_itr_next(htsFile *fp, hts_itr_t *iter, void *r)
- hts_idx_t *bcf_index_load(const char *fn)
- const char **bcf_index_seqnames(const hts_idx_t *idx, const bcf_hdr_t *hdr, int *nptr)
-
-
-# VCF/BCF utility functions
-cdef extern from "htslib/vcfutils.h" nogil:
- struct kbitset_t
-
- # bcf_trim_alleles() - remove ALT alleles unused in genotype fields
- # @header: for access to BCF_DT_ID dictionary
-    # @line:    VCF line obtained from vcf_parse1
- #
- # Returns the number of removed alleles on success or negative
- # on error:
- # -1 .. some allele index is out of bounds
- int bcf_trim_alleles(const bcf_hdr_t *header, bcf1_t *line)
-
- # bcf_remove_alleles() - remove ALT alleles according to bitmask @mask
- # @header: for access to BCF_DT_ID dictionary
- # @line: VCF line obtained from vcf_parse1
- # @mask: alleles to remove
- #
- # If you have more than 31 alleles, then the integer bit mask will
- # overflow, so use bcf_remove_allele_set instead
- void bcf_remove_alleles(const bcf_hdr_t *header, bcf1_t *line, int mask)
-
- # bcf_remove_allele_set() - remove ALT alleles according to bitset @rm_set
- # @header: for access to BCF_DT_ID dictionary
- # @line: VCF line obtained from vcf_parse1
- # @rm_set: pointer to kbitset_t object with bits set for allele
- # indexes to remove
- #
- # Number=A,R,G INFO and FORMAT fields will be updated accordingly.
- void bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, kbitset_t *rm_set)
-
- # bcf_calc_ac() - calculate the number of REF and ALT alleles
- # @header: for access to BCF_DT_ID dictionary
- # @line: VCF line obtained from vcf_parse1
- # @ac: array of length line->n_allele
-    # @which:  determines whether INFO/AN,AC and the indv fields should be used
- #
- # Returns 1 if the call succeeded, or 0 if the value could not
- # be determined.
- #
-    # The value of @which determines if existing INFO/AC,AN can be
-    # used (BCF_UN_INFO) and if the indv fields can be split
-    # (BCF_UN_FMT).
- int bcf_calc_ac(const bcf_hdr_t *header, bcf1_t *line, int *ac, int which)
-
- # bcf_gt_type() - determines type of the genotype
- # @fmt_ptr: the GT format field as set for example by set_fmt_ptr
- # @isample: sample index (starting from 0)
- # @ial: index of the 1st non-reference allele (starting from 1)
- # @jal: index of the 2nd non-reference allele (starting from 1)
- #
- # Returns the type of the genotype (one of GT_HOM_RR, GT_HET_RA,
- # GT_HOM_AA, GT_HET_AA, GT_HAPL_R, GT_HAPL_A or GT_UNKN). If $ial
- # is not NULL and the genotype has one or more non-reference
- # alleles, $ial will be set. In case of GT_HET_AA, $ial is the
- # position of the allele which appeared first in ALT. If $jal is
- # not null and the genotype is GT_HET_AA, $jal will be set and is
- # the position of the second allele in ALT.
- uint8_t GT_HOM_RR # note: the actual value of GT_* matters, used in dosage r2 calculation
- uint8_t GT_HOM_AA
- uint8_t GT_HET_RA
- uint8_t GT_HET_AA
- uint8_t GT_HAPL_R
- uint8_t GT_HAPL_A
- uint8_t GT_UNKN
- int bcf_gt_type(bcf_fmt_t *fmt_ptr, int isample, int *ial, int *jal)
-
- int bcf_acgt2int(char c)
- char bcf_int2acgt(int i)
-
- # bcf_ij2G() - common task: allele indexes to Number=G index (diploid)
- # @i,j: allele indexes, 0-based, i<=j
- # Returns index to the Number=G diploid array
- uint32_t bcf_ij2G(uint32_t i, uint32_t j)
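A worked sketch of the diploid Number=G indexing computed by bcf_ij2G(); in htslib this is j*(j+1)/2 + i for allele indexes i <= j:

def ij2G(i, j):
    """Map a diploid genotype (i, j), i <= j, to its Number=G index."""
    assert i <= j
    return j * (j + 1) // 2 + i

# biallelic site: genotypes 0/0, 0/1, 1/1 occupy G indexes 0, 1, 2
assert [ij2G(0, 0), ij2G(0, 1), ij2G(1, 1)] == [0, 1, 2]
# a triallelic site adds 0/2 -> 3, 1/2 -> 4, 2/2 -> 5
assert [ij2G(0, 2), ij2G(1, 2), ij2G(2, 2)] == [3, 4, 5]
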
+++ /dev/null
-# cython: embedsignature=True
-# cython: profile=True
-# adds doc-strings for sphinx
-from pysam.chtslib cimport *
-
-cpdef set_verbosity(int verbosity):
- u"""Set htslib's hts_verbose global variable to the specified value.
- """
- return hts_set_verbosity(verbosity)
-
-cpdef get_verbosity():
- u"""Return the value of htslib's hts_verbose global variable.
- """
- return hts_get_verbosity()
-
-__all__ = [
- "get_verbosity",
- "set_verbosity"]
-
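A usage sketch for the two wrappers above, assuming they are re-exported at the package level as the __all__ list suggests (htslib's default hts_verbose level is 3):

import pysam

old = pysam.get_verbosity()   # htslib's default level is 3
pysam.set_verbosity(0)        # silence htslib messages
# ... perform work that would otherwise emit htslib warnings ...
pysam.set_verbosity(old)      # restore the previous level
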
+++ /dev/null
-from pysam.calignmentfile cimport AlignedSegment, AlignmentFile
-
-#################################################
-# Compatibility Layer for pysam < 0.8
-
-# import all declarations from htslib
-from pysam.chtslib cimport *
-
-cdef class AlignedRead(AlignedSegment):
- pass
-
-cdef class Samfile(AlignmentFile):
- pass
-
-# import the conversion functions
-cdef extern from "htslib_util.h":
-
- # add *nbytes* into the variable length data of *src* at *pos*
- bam1_t * pysam_bam_update(bam1_t * b,
- size_t nbytes_old,
- size_t nbytes_new,
- uint8_t * pos)
-
- # now: static
- int aux_type2size(int)
-
- char * pysam_bam_get_qname(bam1_t * b)
- uint32_t * pysam_bam_get_cigar(bam1_t * b)
- uint8_t * pysam_bam_get_seq(bam1_t * b)
- uint8_t * pysam_bam_get_qual(bam1_t * b)
- uint8_t * pysam_bam_get_aux(bam1_t * b)
- int pysam_bam_get_l_aux(bam1_t * b)
- char pysam_bam_seqi(uint8_t * s, int i)
-
- uint16_t pysam_get_bin(bam1_t * b)
- uint8_t pysam_get_qual(bam1_t * b)
- uint8_t pysam_get_l_qname(bam1_t * b)
- uint16_t pysam_get_flag(bam1_t * b)
- uint16_t pysam_get_n_cigar(bam1_t * b)
- void pysam_set_bin(bam1_t * b, uint16_t v)
- void pysam_set_qual(bam1_t * b, uint8_t v)
- void pysam_set_l_qname(bam1_t * b, uint8_t v)
- void pysam_set_flag(bam1_t * b, uint16_t v)
- void pysam_set_n_cigar(bam1_t * b, uint16_t v)
- void pysam_update_flag(bam1_t * b, uint16_t v, uint16_t flag)
+++ /dev/null
-# cython: embedsignature=True
-# cython: profile=True
-# adds doc-strings for sphinx
-import tempfile
-import os
-import sys
-import types
-import itertools
-import struct
-import ctypes
-import collections
-import re
-import platform
-import warnings
-from cpython cimport PyErr_SetString, \
- PyBytes_Check, \
- PyUnicode_Check, \
- PyBytes_FromStringAndSize
-
-from cpython.version cimport PY_MAJOR_VERSION
-
-from pysam.calignmentfile cimport AlignmentFile, AlignedSegment
-
-
-cdef class Samfile(AlignmentFile):
- '''Deprecated alternative for :class:`~pysam.AlignmentFile`
-
- Added for backwards compatibility with pysam <= 0.8.0
- '''
- pass
-
-
-cdef class AlignedRead(AlignedSegment):
- '''Deprecated alternative for :class:`~pysam.AlignedSegment`
-
- Added for backwards compatibility with pysam <= 0.8.0
- '''
- pass
-
-
-__all__ = ['Samfile', 'AlignedRead']
-
-
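What the compatibility layer provides, sketched; ex1.bam is a hypothetical input:

import pysam

samfile = pysam.Samfile("ex1.bam", "rb")          # pysam <= 0.8.0 name
assert isinstance(samfile, pysam.AlignmentFile)   # current name
for read in samfile:                              # identical behaviour
    print(read.query_name)
    break
samfile.close()
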
+++ /dev/null
-from libc.stdint cimport int8_t, int16_t, int32_t, int64_t
-from libc.stdint cimport uint8_t, uint16_t, uint32_t, uint64_t
-from libc.stdlib cimport malloc, calloc, realloc, free
-from libc.string cimport memcpy, memcmp, strncpy, strlen, strdup
-from libc.stdio cimport FILE, printf
-
-# Note: this replaces python "open"!
-cdef extern from "fcntl.h":
- int open(char *pathname, int flags)
-
-cdef extern from "unistd.h" nogil:
- ctypedef int ssize_t
- ssize_t read(int fd, void *buf, size_t count)
- int close(int fd)
-
-from pysam.chtslib cimport hts_idx_t, hts_itr_t, htsFile, \
- tbx_t, kstring_t, BGZF
-
-# These functions are put here and not in chtslib.pxd in order
-# to avoid warnings for unused functions.
-cdef extern from "pysam_stream.h" nogil:
-
- ctypedef struct kstream_t:
- pass
-
- ctypedef struct kseq_t:
- kstring_t name
- kstring_t comment
- kstring_t seq
- kstring_t qual
-
- kseq_t *kseq_init(BGZF *)
- int kseq_read(kseq_t *)
- void kseq_destroy(kseq_t *)
- kstream_t *ks_init(BGZF *)
- void ks_destroy(kstream_t *)
-
- # Retrieve characters from stream until delimiter
- # is reached placing results in str.
- int ks_getuntil(kstream_t *,
- int delimiter,
- kstring_t * str,
- int * dret)
-
-
-cdef class tabix_file_iterator:
- cdef BGZF * fh
- cdef kstream_t * kstream
- cdef kstring_t buffer
- cdef size_t size
- cdef Parser parser
- cdef int fd
- cdef int duplicated_fd
- cdef infile
-
- cdef __cnext__(self)
-
-cdef class TabixFile:
-
- # pointer to tabixfile
- cdef htsFile * tabixfile
- # pointer to index structure
- cdef tbx_t * index
-
- # flag indicating whether file is remote
- cdef int is_remote
-
- cdef object _filename
- cdef object _filename_index
-
- cdef Parser parser
-
- cdef encoding
-
-cdef class Parser:
- cdef encoding
-
- cdef parse(self, char * buffer, int len)
-
-cdef class asTuple(Parser):
- cdef parse(self, char * buffer, int len)
-
-cdef class asGTF(Parser):
- pass
-
-cdef class asBed(Parser):
- pass
-
-cdef class asVCF(Parser):
- pass
-
-cdef class TabixIterator:
- cdef hts_itr_t * iterator
- cdef TabixFile tabixfile
- cdef kstring_t buffer
- cdef encoding
- cdef int __cnext__(self)
-
-cdef class TabixIteratorParsed(TabixIterator):
- cdef Parser parser
-
-cdef class GZIterator:
- cdef object _filename
- cdef BGZF * gzipfile
- cdef kstream_t * kstream
- cdef kstring_t buffer
- cdef int __cnext__(self)
- cdef encoding
-
-cdef class GZIteratorHead(GZIterator):
- pass
-
-cdef class GZIteratorParsed(GZIterator):
- cdef Parser parser
-
-# Compatibility Layer for pysam < 0.8
-cdef class Tabixfile(TabixFile):
- pass
-
+++ /dev/null
-# cython: embedsignature=True
-# cython: profile=True
-###############################################################################
-###############################################################################
-# Cython wrapper for access to tabix indexed files in bgzf format
-###############################################################################
-# The principal classes and functions defined in this module are:
-#
-# class TabixFile class wrapping tabix indexed files in bgzf format
-#
-# class asTuple Parser class for tuples
 
-# class asGTF      Parser class for GTF formatted rows
-# class asBed Parser class for Bed formatted rows
-# class asVCF Parser class for VCF formatted rows
-#
-# class tabix_generic_iterator Streamed iterator of bgzf formatted files
-#
-# Additionally this module defines several additional classes that are part
-# of the internal API. These are:
-#
-# class Parser base class for parsers of tab-separated rows
-# class tabix_file_iterator
-# class TabixIterator iterator class over rows in bgzf file
-# class EmptyIterator
-#
-# For backwards compatibility, the following classes are also defined:
-#
-# class Tabixfile equivalent to TabixFile
-#
-###############################################################################
-#
-# The MIT License
-#
-# Copyright (c) 2015 Andreas Heger
-#
-# Permission is hereby granted, free of charge, to any person obtaining a
-# copy of this software and associated documentation files (the "Software"),
-# to deal in the Software without restriction, including without limitation
-# the rights to use, copy, modify, merge, publish, distribute, sublicense,
-# and/or sell copies of the Software, and to permit persons to whom the
-# Software is furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-# DEALINGS IN THE SOFTWARE.
-#
-###############################################################################
-import os
-import sys
-
-from libc.stdio cimport printf, fprintf, stderr
-from libc.string cimport strerror
-from libc.errno cimport errno
-from posix.unistd cimport dup
-
-from cpython cimport PyErr_SetString, PyBytes_Check, \
- PyUnicode_Check, PyBytes_FromStringAndSize, \
- PyObject_AsFileDescriptor
-
-from cpython.version cimport PY_MAJOR_VERSION
-
-cimport pysam.ctabixproxies as ctabixproxies
-
-from pysam.chtslib cimport htsFile, hts_open, hts_close, HTS_IDX_START,\
- BGZF, bgzf_open, bgzf_dopen, bgzf_close, bgzf_write, \
- tbx_index_build, tbx_index_load, tbx_itr_queryi, tbx_itr_querys, \
- tbx_conf_t, tbx_seqnames, tbx_itr_next, tbx_itr_destroy, \
- tbx_destroy, hisremote
-
-from pysam.cutils cimport force_bytes, force_str, charptr_to_str
-from pysam.cutils cimport encode_filename, from_string_and_size
-
-cdef class Parser:
-
- def __init__(self, encoding="ascii"):
- self.encoding = encoding
-
- def set_encoding(self, encoding):
- self.encoding = encoding
-
- def get_encoding(self):
- return self.encoding
-
- cdef parse(self, char * buffer, int length):
- raise NotImplementedError(
- 'parse method of %s not implemented' % str(self))
-
- def __call__(self, char * buffer, int length):
- return self.parse(buffer, length)
-
-
-cdef class asTuple(Parser):
- '''converts a :term:`tabix row` into a python tuple.
-
- A field in a row is accessed by numeric index.
- '''
- cdef parse(self, char * buffer, int len):
- cdef ctabixproxies.TupleProxy r
- r = ctabixproxies.TupleProxy(self.encoding)
- # need to copy - there were some
- # persistence issues with "present"
- r.copy(buffer, len)
- return r
-
-
-cdef class asGTF(Parser):
- '''converts a :term:`tabix row` into a GTF record with the following
- fields:
-
- +----------+----------+-------------------------------+
- |*Column* |*Name* |*Content* |
- +----------+----------+-------------------------------+
- |1 |contig |the chromosome name |
- +----------+----------+-------------------------------+
- |2 |feature |The feature type |
- +----------+----------+-------------------------------+
- |3 |source |The feature source |
- +----------+----------+-------------------------------+
- |4 |start |genomic start coordinate |
- | | |(0-based) |
- +----------+----------+-------------------------------+
- |5 |end |genomic end coordinate |
- | | |(0-based) |
- +----------+----------+-------------------------------+
- |6 |score |feature score |
- +----------+----------+-------------------------------+
- |7 |strand |strand |
- +----------+----------+-------------------------------+
- |8 |frame |frame |
- +----------+----------+-------------------------------+
- |9 |attributes|the attribute field |
- +----------+----------+-------------------------------+
-
- GTF formatted entries also define the following fields that
- are derived from the attributes field:
-
- +--------------------+------------------------------+
- |*Name* |*Content* |
- +--------------------+------------------------------+
- |gene_id |the gene identifier |
- +--------------------+------------------------------+
- |transcript_id |the transcript identifier |
- +--------------------+------------------------------+
-
- '''
- cdef parse(self, char * buffer, int len):
- cdef ctabixproxies.GTFProxy r
- r = ctabixproxies.GTFProxy(self.encoding)
- r.copy(buffer, len)
- return r
-
-
-cdef class asBed(Parser):
- '''converts a :term:`tabix row` into a bed record
- with the following fields:
-
- +-----------+-----------+------------------------------------------+
- |*Column* |*Field* |*Contents* |
- | | | |
- +-----------+-----------+------------------------------------------+
- |1 |contig |contig |
- | | | |
- +-----------+-----------+------------------------------------------+
- |2 |start |genomic start coordinate (zero-based) |
- +-----------+-----------+------------------------------------------+
- |3 |end |genomic end coordinate plus one |
- | | |(zero-based) |
- +-----------+-----------+------------------------------------------+
- |4 |name |name of feature. |
- +-----------+-----------+------------------------------------------+
- |5 |score |score of feature |
- +-----------+-----------+------------------------------------------+
- |6 |strand |strand of feature |
- +-----------+-----------+------------------------------------------+
- |7 |thickStart |thickStart |
- +-----------+-----------+------------------------------------------+
- |8 |thickEnd |thickEnd |
- +-----------+-----------+------------------------------------------+
- |9 |itemRGB |itemRGB |
- +-----------+-----------+------------------------------------------+
-    |10         |blockCount |number of blocks                          |
- +-----------+-----------+------------------------------------------+
- |11 |blockSizes |',' separated string of block sizes |
- +-----------+-----------+------------------------------------------+
- |12 |blockStarts|',' separated string of block genomic |
- | | |start positions |
- +-----------+-----------+------------------------------------------+
-
- Only the first three fields are required. Additional
- fields are optional, but if one is defined, all the preceding
- need to be defined as well.
-
- '''
- cdef parse(self, char * buffer, int len):
- cdef ctabixproxies.BedProxy r
- r = ctabixproxies.BedProxy(self.encoding)
- r.copy(buffer, len)
- return r
-
-
-cdef class asVCF(Parser):
- '''converts a :term:`tabix row` into a VCF record with
- the following fields:
-
- +----------+---------+------------------------------------+
- |*Column* |*Field* |*Contents* |
- | | | |
- +----------+---------+------------------------------------+
- |1 |contig |chromosome |
- +----------+---------+------------------------------------+
- |2 |pos |chromosomal position, zero-based |
- +----------+---------+------------------------------------+
- |3 |id |id |
- +----------+---------+------------------------------------+
- |4 |ref |reference allele |
- +----------+---------+------------------------------------+
- |5 |alt |alternate alleles |
- +----------+---------+------------------------------------+
- |6 |qual |quality |
- +----------+---------+------------------------------------+
- |7 |filter |filter |
- +----------+---------+------------------------------------+
- |8 |info |info |
- +----------+---------+------------------------------------+
- |9 |format |format specifier. |
- +----------+---------+------------------------------------+
-
- Access to genotypes is via index::
-
- contig = vcf.contig
- first_sample_genotype = vcf[0]
- second_sample_genotype = vcf[1]
-
- '''
- cdef parse(self, char * buffer, int len):
- cdef ctabixproxies.VCFProxy r
- r = ctabixproxies.VCFProxy(self.encoding)
- r.copy(buffer, len)
- return r
-
-
-cdef class TabixFile:
- """Random access to bgzf formatted files that
- have been indexed by :term:`tabix`.
-
- The file is automatically opened. The index file of file
- ``<filename>`` is expected to be called ``<filename>.tbi``
- by default (see parameter `index`).
-
- Parameters
- ----------
-
- filename : string
- Filename of bgzf file to be opened.
-
- index : string
- The filename of the index. If not set, the default is to
-        assume that the index is called ``<filename>.tbi``.
-
- mode : char
- The file opening mode. Currently, only ``r`` is permitted.
-
- parser : :class:`pysam.Parser`
-
- sets the default parser for this tabix file. If `parser`
- is None, the results are returned as an unparsed string.
- Otherwise, `parser` is assumed to be a functor that will return
- parsed data (see for example :class:`~pysam.asTuple` and
- :class:`~pysam.asGTF`).
-
- encoding : string
-
- The encoding passed to the parser
-
- Raises
- ------
-
- ValueError
- if index file is missing.
-
- IOError
- if file could not be opened
- """
- def __cinit__(self,
- filename,
- mode = 'r',
- parser=None,
- index=None,
- encoding="ascii",
- *args,
- **kwargs ):
-
- self.tabixfile = NULL
- self.parser = parser
- self._open(filename, mode, index, *args, **kwargs)
- self.encoding = encoding
-
- def _open( self,
- filename,
- mode='r',
- index=None,
- ):
- '''open a :term:`tabix file` for reading.
- '''
-
- assert mode in ("r",), "invalid file opening mode `%s`" % mode
-
- if self.tabixfile != NULL:
- self.close()
- self.tabixfile = NULL
-
- filename_index = index or (filename + ".tbi")
- # encode all the strings to pass to tabix
- self._filename = encode_filename(filename)
- self._filename_index = encode_filename(filename_index)
-
- self.is_remote = hisremote(self._filename)
-
- if not self.is_remote:
- if not os.path.exists(filename):
- raise IOError("file `%s` not found" % filename)
-
- if not os.path.exists(filename_index):
- raise IOError("index `%s` not found" % filename_index)
-
- # open file
- cdef char *cfilename = self._filename
- with nogil:
- self.tabixfile = hts_open(cfilename, 'r')
-
- if self.tabixfile == NULL:
- raise IOError("could not open file `%s`" % filename)
-
- cfilename = self._filename_index
- with nogil:
- self.index = tbx_index_load(cfilename)
-
- if self.index == NULL:
- raise IOError("could not open index for `%s`" % filename)
-
- def _dup(self):
- '''return a copy of this tabix file.
-
-        The file is re-opened.
- '''
- return TabixFile(self._filename,
- mode="r",
- parser=self.parser,
- index=self._filename_index,
- encoding=self.encoding)
-
- def is_open(self):
-        '''return True if the file is open.'''
- return self.tabixfile != NULL
-
-
- def fetch(self,
- reference=None,
- start=None,
- end=None,
- region=None,
- parser=None,
- multiple_iterators=False):
- '''fetch one or more rows in a :term:`region` using 0-based
- indexing. The region is specified by :term:`reference`,
- *start* and *end*. Alternatively, a samtools :term:`region`
- string can be supplied.
-
- Without *reference* or *region* all entries will be fetched.
-
-        If only *reference* is set, all entries on *reference*
-        will be fetched.
-
- If *parser* is None, the default parser will be used for
- parsing.
-
- Set *multiple_iterators* to true if you will be using multiple
- iterators on the same file at the same time. The iterator
- returned will receive its own copy of a filehandle to the file
- effectively re-opening the file. Re-opening a file creates
- some overhead, so beware.
-
- '''
- if not self.is_open():
- raise ValueError("I/O operation on closed file")
-
- # convert coordinates to region string, which is one-based
- if reference:
- if end is not None:
- if end < 0:
- raise ValueError("end out of range (%i)" % end)
- if start is None:
- start = 0
-
- if start < 0:
-                    raise ValueError("start out of range (%i)" % start)
- elif start > end:
- raise ValueError(
-                        'start (%i) > end (%i)' % (start, end))
- elif start == end:
- return EmptyIterator()
- else:
- region = '%s:%i-%i' % (reference, start + 1, end)
- elif start is not None:
- if start < 0:
-                raise ValueError("start out of range (%i)" % start)
- region = '%s:%i' % (reference, start + 1)
- else:
- region = reference
-
- # get iterator
- cdef hts_itr_t * itr
- cdef char *cstr
- cdef TabixFile fileobj
-
- # reopen the same file if necessary
- if multiple_iterators:
- fileobj = self._dup()
- else:
- fileobj = self
-
- if region is None:
- # without region or reference - iterate from start
- with nogil:
- itr = tbx_itr_queryi(fileobj.index,
- HTS_IDX_START,
- 0,
- 0)
- else:
- s = force_bytes(region, encoding=fileobj.encoding)
- cstr = s
- with nogil:
- itr = tbx_itr_querys(fileobj.index, cstr)
-
- if itr == NULL:
- if region is None:
- if len(self.contigs) > 0:
-                    # when accessing a tabix file created prior to tabix 1.0
- # the full-file iterator is empty.
- raise ValueError(
- "could not create iterator, possible "
- "tabix version mismatch")
- else:
- # possible reason is that the file is empty -
- # return an empty iterator
- return EmptyIterator()
- else:
- raise ValueError(
- "could not create iterator for region '%s'" %
- region)
-
- # use default parser if no parser is specified
- if parser is None:
- parser = fileobj.parser
-
- cdef TabixIterator a
- if parser is None:
- a = TabixIterator(encoding=fileobj.encoding)
- else:
- parser.set_encoding(fileobj.encoding)
- a = TabixIteratorParsed(parser)
-
- a.tabixfile = fileobj
- a.iterator = itr
-
- return a
-
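A usage sketch for fetch(), matching the docstring above; example.bed.gz and its .tbi index are hypothetical inputs:

import pysam

tbx = pysam.TabixFile("example.bed.gz", parser=pysam.asBed())
# 0-based half-open coordinates, as described in the docstring
for row in tbx.fetch("chr1", 1000, 2000):
    print(row.contig, row.start, row.end)
tbx.close()
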
- # context manager interface
- def __enter__(self):
- return self
-
- def __exit__(self, exc_type, exc_value, traceback):
- self.close()
- return False
-
- ###############################################################
- ###############################################################
- ###############################################################
- ## properties
- ###############################################################
- property closed:
-        """bool indicating the current state of the file object.
- This is a read-only attribute; the close() method changes the value.
- """
- def __get__(self):
- return not self.is_open()
-
- property filename:
- '''filename associated with this object.'''
- def __get__(self):
- if not self.is_open():
- raise ValueError("I/O operation on closed file")
- return self._filename
-
- property header:
- '''the file header.
-
- The file header consists of the lines at the beginning of a
- file that are prefixed by the comment character ``#``.
-
- .. note::
- The header is returned as an iterator presenting lines
- without the newline character.
-
- .. note::
- The header is only available for local files. For remote
- files an Attribute Error is raised.
-
- '''
-
- def __get__(self):
- if self.is_remote:
- raise AttributeError(
- "the header is not available for remote files")
- return GZIteratorHead(self.filename)
-
- property contigs:
- '''list of chromosome names'''
- def __get__(self):
- cdef char ** sequences
- cdef int nsequences
-
- with nogil:
- sequences = tbx_seqnames(self.index, &nsequences)
- cdef int x
- result = []
- for x from 0 <= x < nsequences:
- result.append(force_str(sequences[x]))
-
- # htslib instructions:
- # only free container, not the sequences themselves
- free(sequences)
-
- return result
-
- def close(self):
- '''
- closes the :class:`pysam.TabixFile`.'''
- if self.tabixfile != NULL:
- hts_close(self.tabixfile)
- self.tabixfile = NULL
- if self.index != NULL:
- tbx_destroy(self.index)
- self.index = NULL
-
- def __dealloc__( self ):
- # remember: dealloc cannot call other python methods
- # note: no doc string
- # note: __del__ is not called.
- if self.tabixfile != NULL:
- hts_close(self.tabixfile)
- self.tabixfile = NULL
- if self.index != NULL:
- tbx_destroy(self.index)
-
-
-cdef class TabixIterator:
- """iterates over rows in *tabixfile* in region
- given by *tid*, *start* and *end*.
- """
-
- def __init__(self, encoding="ascii"):
- self.encoding = encoding
-
- def __iter__(self):
- self.buffer.s = NULL
- self.buffer.l = 0
- self.buffer.m = 0
-
- return self
-
- cdef int __cnext__(self):
- '''iterate to next element.
-
- Return -5 if file has been closed when this function
- was called.
- '''
- if self.tabixfile.tabixfile == NULL:
- return -5
-
- cdef int retval
-
- while 1:
- with nogil:
- retval = tbx_itr_next(
- self.tabixfile.tabixfile,
- self.tabixfile.index,
- self.iterator,
- &self.buffer)
-
- if retval < 0:
- break
-
- if self.buffer.s[0] != '#':
- break
-
- return retval
-
- def __next__(self):
- """python version of next().
-
- pyrex uses this non-standard name instead of next()
- """
-
- cdef int retval = self.__cnext__()
- if retval == -5:
- raise IOError("iteration on closed file")
- elif retval < 0:
- raise StopIteration
-
- return charptr_to_str(self.buffer.s, self.encoding)
-
- def next(self):
- return self.__next__()
-
- def __dealloc__(self):
- if <void*>self.iterator != NULL:
- tbx_itr_destroy(self.iterator)
- if self.buffer.s != NULL:
- free(self.buffer.s)
-
-
-class EmptyIterator:
- '''empty iterator'''
-
- def __iter__(self):
- return self
-
- def next(self):
- raise StopIteration()
-
- def __next__(self):
- raise StopIteration()
-
-
-cdef class TabixIteratorParsed(TabixIterator):
-    """iterates over rows in a region.
-
- The *parser* determines the encoding.
-
- Returns parsed data.
- """
-
- def __init__(self,
- Parser parser):
-
- TabixIterator.__init__(self)
- self.parser = parser
-
- def __next__(self):
- """python version of next().
-
- pyrex uses this non-standard name instead of next()
- """
-
- cdef int retval = self.__cnext__()
- if retval == -5:
- raise IOError("iteration on closed file")
- elif retval < 0:
- raise StopIteration
-
- return self.parser.parse(self.buffer.s,
- self.buffer.l)
-
-
-cdef class GZIterator:
- def __init__(self, filename, int buffer_size=65536, encoding="ascii"):
- '''iterate line-by-line through gzip (or bgzip)
- compressed file.
- '''
- if not os.path.exists(filename):
- raise IOError("No such file or directory: %s" % filename)
-
- filename = encode_filename(filename)
- cdef char *cfilename = filename
- with nogil:
- self.gzipfile = bgzf_open(cfilename, "r")
- self._filename = filename
- self.kstream = ks_init(self.gzipfile)
- self.encoding = encoding
-
- self.buffer.l = 0
- self.buffer.m = 0
- self.buffer.s = <char*>malloc(buffer_size)
-
- def __dealloc__(self):
- '''close file.'''
- if self.gzipfile != NULL:
- bgzf_close(self.gzipfile)
- self.gzipfile = NULL
- if self.buffer.s != NULL:
- free(self.buffer.s)
- if self.kstream != NULL:
- ks_destroy(self.kstream)
-
- def __iter__(self):
- return self
-
- cdef int __cnext__(self):
- cdef int dret = 0
- cdef int retval = 0
- while 1:
- with nogil:
- retval = ks_getuntil(self.kstream, '\n', &self.buffer, &dret)
-
- if retval < 0:
- break
-
- return dret
- return -1
-
- def __next__(self):
- """python version of next().
- """
- cdef int retval = self.__cnext__()
- if retval < 0:
- raise StopIteration
- return force_str(self.buffer.s, self.encoding)
-
-
-cdef class GZIteratorHead(GZIterator):
- '''iterate line-by-line through gzip (or bgzip)
- compressed file returning comments at top of file.
- '''
-
- def __next__(self):
- """python version of next().
- """
- cdef int retval = self.__cnext__()
- if retval < 0:
- raise StopIteration
- if self.buffer.s[0] == '#':
- return self.buffer.s
- else:
- raise StopIteration
-
-
-cdef class GZIteratorParsed(GZIterator):
- '''iterate line-by-line through gzip (or bgzip)
-    compressed file returning parsed data.
- '''
-
- def __init__(self, parser):
- self.parser = parser
-
- def __next__(self):
- """python version of next().
- """
- cdef int retval = self.__cnext__()
- if retval < 0:
- raise StopIteration
-
- return self.parser.parse(self.buffer.s,
- self.buffer.l)
-
-
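A usage sketch for the GZIterator family above; example.bed.gz is a hypothetical input, and GZIteratorHead is the iterator behind the TabixFile.header property:

import pysam

# full file, line by line (the newline is already stripped by ks_getuntil)
for line in pysam.GZIterator("example.bed.gz"):
    print(line)
    break

# header/comment lines only: iteration stops at the first non-'#' line
for comment in pysam.GZIteratorHead("example.bed.gz"):
    print(comment)
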
-def tabix_compress(filename_in,
- filename_out,
- force=False):
- '''compress *filename_in* writing the output to *filename_out*.
-
- Raise an IOError if *filename_out* already exists, unless *force*
- is set.
- '''
-
- if not force and os.path.exists(filename_out):
- raise IOError(
- "Filename '%s' already exists, use *force* to "
- "overwrite" % filename_out)
-
- cdef int WINDOW_SIZE
- cdef int c, r
- cdef void * buffer
- cdef BGZF * fp
- cdef int fd_src
- cdef bint is_empty = True
- cdef int O_RDONLY
- O_RDONLY = os.O_RDONLY
-
- WINDOW_SIZE = 64 * 1024
-
- fn = encode_filename(filename_out)
- cdef char *cfn = fn
- with nogil:
- fp = bgzf_open(cfn, "w")
- if fp == NULL:
- raise IOError("could not open '%s' for writing" % filename_out)
-
- fn = encode_filename(filename_in)
- fd_src = open(fn, O_RDONLY)
-    if fd_src < 0:
- raise IOError("could not open '%s' for reading" % filename_in)
-
- buffer = malloc(WINDOW_SIZE)
- c = 1
-
- while c > 0:
- with nogil:
- c = read(fd_src, buffer, WINDOW_SIZE)
- if c > 0:
- is_empty = False
- r = bgzf_write(fp, buffer, c)
- if r < 0:
- free(buffer)
- raise OSError("writing failed")
-
- free(buffer)
- r = bgzf_close(fp)
- if r < 0:
- raise OSError("error %i when writing to file %s" % (r, filename_out))
-
- r = close(fd_src)
- # an empty file will return with -1, thus ignore this.
- if r < 0:
- if not (r == -1 and is_empty):
- raise OSError("error %i when closing file %s" % (r, filename_in))
-
-
-def tabix_index( filename,
- force = False,
- seq_col = None,
- start_col = None,
- end_col = None,
- preset = None,
- meta_char = "#",
- zerobased = False,
- int min_shift = -1,
- ):
- '''index tab-separated *filename* using tabix.
-
- An existing index will not be overwritten unless
- *force* is set.
-
- The index will be built from coordinates
- in columns *seq_col*, *start_col* and *end_col*.
-
- The contents of *filename* have to be sorted by
- contig and position - the method does not check
- if the file is sorted.
-
- Column indices are 0-based. Coordinates in the file
- are assumed to be 1-based.
-
- If *preset* is provided, the column coordinates
- are taken from a preset. Valid values for preset
-    are "gff", "bed", "sam", "vcf", "psltbl", "pileup".
-
- Lines beginning with *meta_char* and the first
- *line_skip* lines will be skipped.
-
- If *filename* does not end in ".gz", it will be automatically
- compressed. The original file will be removed and only the
- compressed file will be retained.
-
- If *filename* ends in *gz*, the file is assumed to be already
- compressed with bgzf.
-
-    *min_shift* sets the minimal interval size to 1<<INT; 0 for the
- old tabix index. The default of -1 is changed inside htslib to
- the old tabix default of 0.
-
- returns the filename of the compressed data
-
- '''
-
- if not os.path.exists(filename):
- raise IOError("No such file '%s'" % filename)
-
- if preset is None and \
- (seq_col is None or start_col is None or end_col is None):
- raise ValueError(
- "neither preset nor seq_col,start_col and end_col given")
-
- if not filename.endswith(".gz"):
- tabix_compress(filename, filename + ".gz", force=force)
- os.unlink( filename )
- filename += ".gz"
-
- if not force and os.path.exists(filename + ".tbi"):
- raise IOError(
-            "Filename '%s.tbi' already exists, use *force* to overwrite" % filename)
-
- # columns (1-based):
- # preset-code, contig, start, end, metachar for
- # comments, lines to ignore at beginning
- # 0 is a missing column
- preset2conf = {
- 'gff' : (0, 1, 4, 5, ord('#'), 0),
- 'bed' : (0x10000, 1, 2, 3, ord('#'), 0),
- 'psltbl' : (0x10000, 15, 17, 18, ord('#'), 0),
- 'sam' : (1, 3, 4, 0, ord('@'), 0),
- 'vcf' : (2, 1, 2, 0, ord('#'), 0),
- 'pileup': (3, 1, 2, 0, ord('#'), 0),
- }
-
- if preset:
- try:
- conf_data = preset2conf[preset]
- except KeyError:
- raise KeyError(
- "unknown preset '%s', valid presets are '%s'" %
- (preset, ",".join(preset2conf.keys())))
- else:
- if end_col == None:
- end_col = -1
- preset = 0
-
- # note that tabix internally works with 0-based coordinates
- # and open/closed intervals. When using a preset, conversion
- # is automatically taken care of. Otherwise, the coordinates
- # are assumed to be 1-based closed intervals and -1 is
- # subtracted from the start coordinate. To avoid doing this,
- # set the TI_FLAG_UCSC=0x10000 flag:
- if zerobased:
- preset = preset | 0x10000
-
- conf_data = (preset, seq_col+1, start_col+1, end_col+1, ord(meta_char), 0)
-
- cdef tbx_conf_t conf
- conf.preset, conf.sc, conf.bc, conf.ec, conf.meta_char, conf.line_skip = conf_data
-
-
- fn = encode_filename(filename)
- cdef char *cfn = fn
- with nogil:
- tbx_index_build(cfn, min_shift, &conf)
-
- return filename
-
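A usage sketch combining tabix_compress() and tabix_index() as documented above; example.bed is a hypothetical, coordinate-sorted input:

import pysam

# compresses to example.bed.gz (via tabix_compress), removes the
# original, writes example.bed.gz.tbi and returns the new filename
fn = pysam.tabix_index("example.bed", preset="bed", force=True)
assert fn == "example.bed.gz"
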
-# #########################################################
-# cdef class tabix_file_iterator_old:
-# '''iterate over ``infile``.
-
-# This iterator is not safe. If the :meth:`__next__()` method is called
-# after ``infile`` is closed, the result is undefined (see ``fclose()``).
-
-# The iterator might either raise a StopIteration or segfault.
-# '''
-
-
-# def __cinit__(self,
-# infile,
-# Parser parser,
-# int buffer_size = 65536 ):
-
-# cdef int fd = PyObject_AsFileDescriptor( infile )
-# if fd == -1: raise ValueError( "I/O operation on closed file." )
-# self.infile = fdopen( fd, 'r')
-
-# if self.infile == NULL: raise ValueError( "I/O operation on closed file." )
-
-# self.buffer = <char*>malloc( buffer_size )
-# self.size = buffer_size
-# self.parser = parser
-
-# def __iter__(self):
-# return self
-
-# cdef __cnext__(self):
-
-# cdef char * b
-# cdef size_t nbytes
-# b = self.buffer
-
-# while not feof( self.infile ):
-# nbytes = getline( &b, &self.size, self.infile)
-
-# # stop at first error or eof
-# if (nbytes == -1): break
-# # skip comments
-# if (b[0] == '#'): continue
-
-# # skip empty lines
-# if b[0] == '\0' or b[0] == '\n' or b[0] == '\r': continue
-
-# # make sure that entry is complete
-# if b[nbytes-1] != '\n' and b[nbytes-1] != '\r':
-# result = b
-# raise ValueError( "incomplete line at %s" % result )
-
-# # make sure that this goes fully through C
-# # otherwise buffer is copied to/from a
-# # Python object causing segfaults as
-# # the wrong memory is freed
-# return self.parser.parse( b, nbytes )
-
-# raise StopIteration
-
-# def __dealloc__(self):
-# free(self.buffer)
-
-# def __next__(self):
-# return self.__cnext__()
-
-#########################################################
-#########################################################
-#########################################################
-## Iterators for parsing through unindexed files.
-#########################################################
-# cdef buildGzipError(void *gzfp):
-# cdef int errnum = 0
-# cdef char *s = gzerror(gzfp, &errnum)
-# return "error (%d): %s (%d: %s)" % (errno, strerror(errno), errnum, s)
-
-
-cdef class tabix_file_iterator:
- '''iterate over a compressed or uncompressed ``infile``.
- '''
-
- def __cinit__(self,
- infile,
- Parser parser,
- int buffer_size=65536):
-
- if infile.closed:
- raise ValueError("I/O operation on closed file.")
-
- self.infile = infile
-
- cdef int fd = PyObject_AsFileDescriptor(infile)
- if fd == -1:
- raise ValueError("I/O operation on closed file.")
-
- self.duplicated_fd = dup(fd)
-
- # From the manual:
- # gzopen can be used to read a file which is not in gzip format;
- # in this case gzread will directly read from the file without decompression.
- # When reading, this will be detected automatically by looking
- # for the magic two-byte gzip header.
- self.fh = bgzf_dopen(self.duplicated_fd, 'r')
-
- if self.fh == NULL:
- raise IOError('%s' % strerror(errno))
-
- self.kstream = ks_init(self.fh)
-
- self.buffer.s = <char*>malloc(buffer_size)
- #if self.buffer == NULL:
- # raise MemoryError( "tabix_file_iterator: could not allocate %i bytes" % buffer_size)
- #self.size = buffer_size
- self.parser = parser
-
- def __iter__(self):
- return self
-
- cdef __cnext__(self):
-
- cdef char * b
- cdef int dret = 0
- cdef int retval = 0
- while 1:
- with nogil:
- retval = ks_getuntil(self.kstream, '\n', &self.buffer, &dret)
-
- if retval < 0:
- break
- #raise IOError('gzip error: %s' % buildGzipError( self.fh ))
-
- b = self.buffer.s
-
- # skip comments
- if (b[0] == '#'):
- continue
-
- # skip empty lines
- if b[0] == '\0' or b[0] == '\n' or b[0] == '\r':
- continue
-
- # gzgets terminates at \n, no need to test
-
- # parser creates a copy
- return self.parser.parse(b, self.buffer.l)
-
- raise StopIteration
-
- def __dealloc__(self):
- free(self.buffer.s)
- ks_destroy(self.kstream)
- bgzf_close(self.fh)
-
- def __next__(self):
- return self.__cnext__()
-
- def next(self):
- return self.__cnext__()
-
-
-class tabix_generic_iterator:
- '''iterate over ``infile``.
-
- Permits the use of file-like objects for example from the gzip module.
- '''
- def __init__(self, infile, parser):
-
- self.infile = infile
- if self.infile.closed:
- raise ValueError("I/O operation on closed file.")
- self.parser = parser
-
- def __iter__(self):
- return self
-
- # cython version - required for python 3
- def __next__(self):
-
- cdef char * b
- cdef char * cpy
- cdef size_t nbytes
-
- encoding = self.parser.get_encoding()
-
- # note that GzipFile.close() does not close the file
- # reading is still possible.
- if self.infile.closed:
- raise ValueError("I/O operation on closed file.")
-
- while 1:
-
- line = self.infile.readline()
- if not line:
- break
-
- s = force_bytes(line, encoding)
- b = s
- nbytes = len(line)
- assert b[nbytes] == '\0'
-
- # skip comments
- if b[0] == '#':
- continue
-
- # skip empty lines
- if b[0] == '\0' or b[0] == '\n' or b[0] == '\r':
- continue
-
- # make sure that entry is complete
- if b[nbytes-1] != '\n' and b[nbytes-1] != '\r':
- raise ValueError("incomplete line at %s" % line)
-
- bytes_cpy = <bytes> b
- cpy = <char *> bytes_cpy
-
- return self.parser(cpy, nbytes)
-
- raise StopIteration
-
- # python version - required for python 2.7
- def next(self):
- return self.__next__()
-
-def tabix_iterator(infile, parser):
- """return an iterator over all entries in a file.
-
- Results are returned parsed as specified by the *parser*. If
- *parser* is None, the results are returned as an unparsed string.
- Otherwise, *parser* is assumed to be a functor that will return
- parsed data (see for example :class:`~pysam.asTuple` and
- :class:`~pysam.asGTF`).
-
- """
- if PY_MAJOR_VERSION >= 3:
- return tabix_generic_iterator(infile, parser)
- else:
- return tabix_file_iterator(infile, parser)
-
- # file objects can use C stdio
- # used to be: isinstance( infile, file):
- # if PY_MAJOR_VERSION >= 3:
- # if isinstance( infile, io.IOBase ):
- # return tabix_copy_iterator( infile, parser )
- # else:
- # return tabix_generic_iterator( infile, parser )
- # else:
-# if isinstance( infile, file ):
-# return tabix_copy_iterator( infile, parser )
-# else:
-# return tabix_generic_iterator( infile, parser )
-
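A usage sketch for tabix_iterator(), which accepts file-like objects such as those from the gzip module; example.bed.gz is a hypothetical input:

import gzip
import pysam

with gzip.open("example.bed.gz", "rt") as infile:
    for row in pysam.tabix_iterator(infile, pysam.asTuple()):
        print(row[0], row[1], row[2])
        break
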
-cdef class Tabixfile(TabixFile):
- """Tabixfile is deprecated: use TabixFile instead"""
- pass
-
-
-__all__ = [
- "tabix_index",
- "tabix_compress",
- "TabixFile",
- "Tabixfile",
- "asTuple",
- "asGTF",
- "asVCF",
- "asBed",
- "GZIterator",
- "GZIteratorHead",
- "tabix_iterator",
- "tabix_generic_iterator",
- "tabix_file_iterator",
-]
+++ /dev/null
-#cdef extern from "Python.h":
-# ctypedef struct FILE
-
-from libc.stdint cimport uint8_t, int32_t, uint32_t, int64_t, uint64_t
-
-cdef class TupleProxy:
-
- cdef:
- char * data
- char ** fields
- int nfields
- int index
- int nbytes
- int offset
- bint is_modified
-
- cdef encoding
-
- cpdef int getMaxFields(self)
- cpdef int getMinFields(self)
-# cdef char * _getindex(self, int idx)
-
- cdef take(self, char * buffer, size_t nbytes)
- cdef present(self, char * buffer, size_t nbytes)
- cdef copy(self, char * buffer, size_t nbytes, bint reset=*)
- cdef update(self, char * buffer, size_t nbytes)
-
-cdef class GTFProxy(TupleProxy) :
-
- cdef:
- char * _attributes
- cdef bint hasOwnAttributes
-
- cpdef int getMaxFields(self)
- cpdef int getMinFields(self)
- cdef char * getAttributes(self)
-
-cdef class NamedTupleProxy(TupleProxy):
- pass
-
-cdef class BedProxy(NamedTupleProxy):
-
- cdef:
- char * contig
- uint32_t start
- uint32_t end
- int bedfields
-
- cpdef int getMaxFields(self)
- cpdef int getMinFields(self)
- cdef update(self, char * buffer, size_t nbytes)
-
-cdef class VCFProxy(NamedTupleProxy) :
-
- cdef:
- char * contig
- uint32_t pos
-
- cdef update(self, char * buffer, size_t nbytes)
+++ /dev/null
-from cpython cimport PyBytes_FromStringAndSize
-
-from libc.stdio cimport printf, feof, fgets
-from libc.string cimport strcpy, strlen, memcmp, memcpy, memchr, strstr, strchr
-from libc.stdlib cimport free, malloc, calloc, realloc
-from libc.stdlib cimport atoi, atol, atof
-
-from pysam.cutils cimport force_bytes, force_str, charptr_to_str
-from pysam.cutils cimport encode_filename, from_string_and_size
-
-import collections
-
-cdef char *StrOrEmpty(char * buffer):
- if buffer == NULL:
- return ""
- else: return buffer
-
-cdef int isNew(char * p, char * buffer, size_t nbytes):
- """return True if `p` is located within `buffer` of size
- `nbytes`
- """
- if p == NULL:
- return 0
- return not (buffer <= p < buffer + nbytes)
-
-
-cdef class TupleProxy:
- '''Proxy class for access to parsed row as a tuple.
-
- This class represents a table row for fast read-access.
-
- Access to individual fields is via the [] operator.
-
- Only read-only access is implemented.
-
- '''
-
- def __cinit__(self, encoding="ascii"):
- self.data = NULL
- self.fields = NULL
- self.index = 0
- self.nbytes = 0
- self.is_modified = 0
- self.nfields = 0
- # start counting at field offset
- self.offset = 0
- self.encoding = encoding
-
- def __dealloc__(self):
- cdef int x
- if self.is_modified:
- for x from 0 <= x < self.nfields:
- if isNew(self.fields[x], self.data, self.nbytes):
- free(self.fields[x])
- self.fields[x] = NULL
-
- if self.data != NULL:
- free(self.data)
- if self.fields != NULL:
- free(self.fields)
-
- def __copy__(self):
- if self.is_modified:
- raise NotImplementedError(
- "copying modified tuples is not implemented")
- cdef TupleProxy n = type(self)()
- n.copy(self.data, self.nbytes, reset=True)
- return n
-
- def compare(self, TupleProxy other):
- '''return -1,0,1, if contents in this are binary
- <,=,> to *other*
-
- '''
- if self.is_modified or other.is_modified:
- raise NotImplementedError(
- 'comparison of modified TupleProxies is not implemented')
- if self.data == other.data:
- return 0
-
- if self.nbytes < other.nbytes:
- return -1
- elif self.nbytes > other.nbytes:
- return 1
- return memcmp(self.data, other.data, self.nbytes)
-
- def __richcmp__(self, TupleProxy other, int op):
- if op == 2: # == operator
- return self.compare(other) == 0
- elif op == 3: # != operator
- return self.compare(other) != 0
- else:
- err_msg = "op {0} isn't implemented yet".format(op)
- raise NotImplementedError(err_msg)
-
- cdef take(self, char * buffer, size_t nbytes):
- '''start presenting buffer.
-
- Take ownership of the pointer.
- '''
- self.data = buffer
- self.nbytes = nbytes
- self.update(buffer, nbytes)
-
- cdef present(self, char * buffer, size_t nbytes):
- '''start presenting buffer.
-
- Do not take ownership of the pointer.
- '''
- self.update(buffer, nbytes)
-
- cdef copy(self, char * buffer, size_t nbytes, bint reset=False):
- '''start presenting buffer of size *nbytes*.
-
- Buffer is a '\0'-terminated string without the '\n'.
-
- Take a copy of buffer.
- '''
- # +1 for '\0'
- cdef int s = sizeof(char) * (nbytes + 1)
- self.data = <char*>malloc(s)
- if self.data == NULL:
- raise ValueError("out of memory in TupleProxy.copy()")
- memcpy(<char*>self.data, buffer, s)
-
- if reset:
- for x from 0 <= x < nbytes:
- if self.data[x] == '\0':
- self.data[x] = '\t'
-
- self.update(self.data, nbytes)
-
- cpdef int getMinFields(self):
- '''return minimum number of fields.'''
- # 1 is not a valid tabix entry, but TupleProxy
- # could be more generic.
- return 1
-
- cpdef int getMaxFields(self):
- '''return maximum number of fields. Return
- 0 for unknown length.'''
- return 0
-
- cdef update(self, char * buffer, size_t nbytes):
- '''update internal data.
-
- *buffer* is a \0 terminated string.
-
- *nbytes* is the number of bytes in buffer (excluding
- the \0)
-
- Update starts work in buffer, thus can be used
- to collect any number of fields until nbytes
- is exhausted.
-
- If max_fields is set, the number of fields is initialized to
- max_fields.
-
- '''
- cdef char * pos
- cdef char * old_pos
- cdef int field
- cdef int max_fields, min_fields, x
-
- assert strlen(buffer) == nbytes, \
- "length of buffer (%i) != number of bytes (%i)" % (
- strlen(buffer), nbytes)
-
- if buffer[nbytes] != 0:
- raise ValueError("incomplete line at %s" % buffer)
-
- #################################
- # remove line breaks and feeds and update number of bytes
- x = nbytes - 1
- while x > 0 and (buffer[x] == '\n' or buffer[x] == '\r'):
- buffer[x] = '\0'
- x -= 1
- self.nbytes = x + 1
-
- #################################
- # clear data
- if self.fields != NULL:
- free(self.fields)
-
- for field from 0 <= field < self.nfields:
- if isNew(self.fields[field], self.data, self.nbytes):
- free(self.fields[field])
-
- self.is_modified = self.nfields = 0
-
- #################################
- # allocate new
- max_fields = self.getMaxFields()
- # pre-count fields - better would be
- # to guess or dynamically grow
- if max_fields == 0:
- for x from 0 <= x < nbytes:
- if buffer[x] == '\t':
- max_fields += 1
- max_fields += 1
-
- self.fields = <char **>calloc(max_fields, sizeof(char *))
- if self.fields == NULL:
- raise ValueError("out of memory in TupleProxy.update()")
-
- #################################
- # start filling
- field = 0
- self.fields[field] = pos = buffer
- field += 1
- old_pos = pos
- while 1:
-
- pos = <char*>memchr(pos, '\t', nbytes)
- if pos == NULL:
- break
- if field >= max_fields:
- raise ValueError(
- "parsing error: more than %i fields in line: %s" %
- (max_fields, buffer))
-
- pos[0] = '\0'
- pos += 1
- self.fields[field] = pos
- field += 1
- nbytes -= pos - old_pos
- if nbytes < 0:
- break
- old_pos = pos
- self.nfields = field
- if self.nfields < self.getMinFields():
- raise ValueError(
-                "parsing error: fewer than %i fields in line: %s" %
- (self.getMinFields(), buffer))
-
- def _getindex(self, int index):
- '''return item at idx index'''
- cdef int i = index
- if i < 0:
- i += self.nfields
- if i < 0:
- raise IndexError("list index out of range")
- # apply offset - separating a fixed number
- # of fields from a variable number such as in VCF
- i += self.offset
- if i >= self.nfields:
- raise IndexError(
- "list index out of range %i >= %i" %
- (i, self.nfields))
- return force_str(self.fields[i], self.encoding)
-
- def __getitem__(self, key):
- if type(key) == int:
- return self._getindex(key)
- # slice object
- start, end, step = key.indices(self.nfields)
- result = []
- for index in range(start, end, step):
- result.append(self._getindex(index))
- return result
-
- def _setindex(self, index, value):
- '''set item at idx index.'''
- cdef int idx = index
- if idx < 0:
- raise IndexError("list index out of range")
- if idx >= self.nfields:
- raise IndexError("list index out of range")
-
- if isNew(self.fields[idx], self.data, self.nbytes):
- free(self.fields[idx] )
-
- self.is_modified = 1
-
- if value is None:
- self.fields[idx] = NULL
- return
-
- # conversion with error checking
- value = force_bytes(value)
- cdef char * tmp = <char*>value
- self.fields[idx] = <char*>malloc((strlen( tmp ) + 1) * sizeof(char))
- if self.fields[idx] == NULL:
- raise ValueError("out of memory" )
- strcpy(self.fields[idx], tmp)
-
- def __setitem__(self, index, value):
- '''set item at *index* to *value*'''
- cdef int i = index
- if i < 0:
- i += self.nfields
- i += self.offset
-
- self._setindex(i, value)
-
- def __len__(self):
- return self.nfields
-
- def __iter__(self):
- self.index = 0
- return self
-
- def __next__(self):
- """python version of next().
- """
- if self.index >= self.nfields:
- raise StopIteration
- cdef char * retval = self.fields[self.index]
- self.index += 1
- if retval == NULL:
- return None
- else:
- return force_str(retval, self.encoding)
-
- def __str__(self):
- '''return original data'''
- # copy and replace \0 bytes with \t characters
- cdef char * cpy
- if self.is_modified:
- # todo: treat NULL values
- result = []
- for x in xrange(0, self.nfields):
- result.append(StrOrEmpty(self.fields[x]).decode(self.encoding))
- return "\t".join(result)
- else:
- cpy = <char*>calloc(sizeof(char), self.nbytes+1)
- if cpy == NULL:
- raise ValueError("out of memory")
- memcpy(cpy, self.data, self.nbytes+1)
- for x from 0 <= x < self.nbytes:
- if cpy[x] == '\0':
- cpy[x] = '\t'
- result = cpy[:self.nbytes]
- free(cpy)
- r = result.decode(self.encoding)
- return r
-
-def toDot(v):
- '''convert value to '.' if None'''
- if v is None:
- return "."
- else:
- return str(v)
-
-def quote(v):
- '''return a quoted attribute.'''
- if isinstance(v, str):
- return '"%s"' % v
- else:
- return str(v)
-
-
-cdef class GTFProxy(TupleProxy):
- '''Proxy class for access to GTF fields.
-
- This class represents a GTF entry for fast read-access.
- Write-access has been added as well, though some care must
- be taken. If any of the string fields (contig, source, ...)
- are set, the new value is tied to the lifetime of the
- argument that was supplied.
-
- The only exception is the attributes field when set from
- a dictionary - this field will manage its own memory.
- '''
-
- def __cinit__(self):
- # automatically calls TupleProxy.__cinit__
- self.hasOwnAttributes = False
- self._attributes = NULL
-
- def __dealloc__(self):
- # automatically calls TupleProxy.__dealloc__
- if self.hasOwnAttributes:
- free(self._attributes)
-
- cpdef int getMinFields(self):
- '''return minimum number of fields.'''
- return 9
-
- cpdef int getMaxFields(self):
- '''return max number of fields.'''
- return 9
-
- property contig:
- '''contig of feature.'''
- def __get__(self):
- return self._getindex(0)
- def __set__(self, value):
- self._setindex(0, value)
-
- property source:
- '''feature source.'''
- def __get__(self):
- return self._getindex(1)
- def __set__(self, value):
- if value is None:
- value = "."
- self._setindex(1, value)
-
- property feature:
- '''feature name.'''
- def __get__(self):
- return self._getindex(2)
- def __set__(self, value):
- if value is None:
- value = "."
- self._setindex(2, value)
-
- property start:
-        '''feature start (0-based, half-open coordinates).'''
- def __get__(self ):
- return int( self._getindex(3)) - 1
- def __set__(self, value ):
- self._setindex(3, str(value+1))
-
- property end:
-        '''feature end (0-based, half-open coordinates).'''
- def __get__(self):
- return int(self._getindex(4))
- def __set__(self, value):
- self._setindex(4, str(value))
-
- property score:
- '''feature score.'''
- def __get__(self):
- v = self._getindex(5)
- if v == "" or v[0] == '.':
- return None
- else:
- return float(v)
-
- def __set__(self, value):
- if value is None:
- value = "."
- self._setindex(5, str(value))
-
- property strand:
- '''feature strand.'''
- def __get__(self):
- return self._getindex(6)
- def __set__(self, value ):
- if value is None:
- value = "."
- self._setindex(6, value)
-
- property frame:
- '''feature frame.'''
- def __get__(self):
- v = self._getindex(7)
- if v == "" or v[0] == '.':
- return v
- else:
- return int(v)
-
- def __set__(self, value):
- if value is None:
- value = "."
- self._setindex(7, str(value))
-
- property attributes:
- '''feature attributes (as a string).'''
- def __get__(self):
- if self.hasOwnAttributes:
- return force_str(self._attributes)
- else:
- return force_str(self._getindex(8))
- def __set__( self, value):
- if self.hasOwnAttributes:
- free(self._attributes)
- self._attributes = NULL
- self.hasOwnAttributes = False
- self._setindex(8, value)
-
- cdef char * getAttributes(self):
- '''return pointer to attributes.'''
- cdef char * attributes
- if self.hasOwnAttributes:
- attributes = self._attributes
- else:
- attributes = self.fields[8]
- if attributes == NULL:
-            raise KeyError("no attributes defined for this GTF entry")
- return attributes
-
- def asDict(self):
- """parse attributes - return as dict
- """
-
-        # fetch the attribute string
- attributes = self.attributes
-
- # separate into fields
- # Fields might contain a ";", for example in ENSEMBL GTF file
- # for mouse, v78:
- # ...; transcript_name "TXNRD2;-001"; ....
- # The current heuristic is to split on a semicolon followed by a
- # space, see also http://mblab.wustl.edu/GTF22.html
-
- # Remove white space to prevent a last empty field.
- fields = [x.strip() for x in attributes.strip().split("; ")]
-
- result = collections.OrderedDict()
-
- for f in fields:
-
- # strip semicolon (GTF files without a space after the last semicolon)
- if f.endswith(";"):
- f = f[:-1]
-
- # split at most once in order to avoid separating
- # multi-word values
- d = [x.strip() for x in f.split(" ", 1)]
-
-            # f.split(" ", 1) yields at most two elements
-            n, v = d[0], d[1]
-
- if v[0] == '"' and v[-1] == '"':
- v = v[1:-1]
-            else:
-                ## try to convert to a numeric value; try int first
-                ## so that floats such as "2.5" are not truncated
-                try:
-                    v = int(v)
-                except (ValueError, TypeError):
-                    try:
-                        v = float(v)
-                    except (ValueError, TypeError):
-                        pass
-
- result[n] = v
-
- return result
-
- def fromDict(self, d):
- '''set attributes from a dictionary.'''
- cdef char * p
- cdef int l
-
- # clean up if this field is set twice
- if self.hasOwnAttributes:
- free(self._attributes)
-
- aa = []
- for k,v in d.items():
- if isinstance(v, str):
- aa.append( '%s "%s"' % (k,v) )
- else:
- aa.append( '%s %s' % (k,str(v)) )
-
- a = force_bytes("; ".join(aa) + ";")
- p = a
- l = len(a)
- self._attributes = <char *>calloc(l + 1, sizeof(char))
- if self._attributes == NULL:
- raise ValueError("out of memory")
- memcpy(self._attributes, p, l)
-
- self.hasOwnAttributes = True
- self.is_modified = True
-
- def __str__(self):
- cdef char * cpy
- cdef int x
-
- if self.is_modified:
- return "\t".join(
- (self.contig,
- self.source,
- self.feature,
- str(self.start+1),
- str(self.end),
- toDot(self.score),
- toDot(self.strand),
- toDot(self.frame),
- self.attributes))
- else:
- return TupleProxy.__str__(self)
-
- def invert(self, int lcontig):
- '''invert coordinates to negative strand coordinates
-
- This method will only act if the feature is on the
- negative strand.'''
-
- if self.strand[0] == '-':
- start = min(self.start, self.end)
- end = max(self.start, self.end)
- self.start, self.end = lcontig - end, lcontig - start
-
- def keys(self):
- '''return a list of attributes defined in this entry.'''
- r = self.attributes
- return [x.strip().split(" ")[0]
- # separator is ';' followed by space
- for x in r.split("; ") if x.strip() != '']
-
- def __getitem__(self, key):
- return self.__getattr__(key)
-
- def __getattr__(self, item):
- """Generic lookup of attribute from GFF/GTF attributes
- Only called if there *isn't* an attribute with this name
- """
- cdef char * start
- cdef char * query
- cdef char * cpy
- cdef char * end
- cdef int l
-
- #
- # important to use the getAttributes function.
- # Using the self.attributes property to access
- # the attributes caused a hard-to-trace bug
- # in which fields in the attribute string were
- # set to 0.
-        # Running it through valgrind reported reads from memory
-        # that had already been released. It is not clear why this
-        # happened; it might be a cython bug (version 0.16). The
-        # valgrind warnings disappeared after accessing the C data
-        # structures directly, and so did the bug.
- cdef char * attributes = self.getAttributes()
- if attributes == NULL:
- raise KeyError("key %s not found, no attributes" % item)
-
- # add space in order to make sure
- # to not pick up a field that is a prefix of another field
- r = force_bytes(item + " ")
- query = r
- start = strstr(attributes, query)
-
- if start == NULL:
- raise AttributeError("'GTFProxy' has no attribute '%s'" % item)
-
- start += strlen(query)
- # skip gaps before
- while start[0] == ' ':
- start += 1
-
- if start[0] == '"':
- start += 1
- end = start
- while end[0] != '\0' and end[0] != '"':
- end += 1
- l = end - start
- result = force_str(PyBytes_FromStringAndSize(start, l),
- self.encoding)
- return result
- else:
- return force_str(start, self.encoding)
-
- def setAttribute(self, name, value):
- '''convenience method to set an attribute.'''
- r = self.asDict()
- r[name] = value
- self.fromDict(r)
-
- def __cmp__(self, other):
- return (self.contig, self.strand, self.start) < \
- (other.contig, other.strand, other.start)
-
- # python 3 compatibility
- def __richcmp__(GTFProxy self, GTFProxy other, int op):
- if op == 0:
- return (self.contig, self.strand, self.start) < \
- (other.contig, other.strand, other.start)
- elif op == 1:
- return (self.contig, self.strand, self.start) <= \
- (other.contig, other.strand, other.start)
- elif op == 2:
- return self.compare(other) == 0
- elif op == 3:
- return self.compare(other) != 0
- else:
- err_msg = "op {0} isn't implemented yet".format(op)
- raise NotImplementedError(err_msg)
-
-
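A minimal usage sketch for the proxy above. The file name is hypothetical
(any bgzip-compressed, tabix-indexed GTF works), and the gene_id lookup
assumes the entries actually carry that attribute:

import pysam

tbx = pysam.TabixFile("annotation.gtf.gz")
for gtf in tbx.fetch("chr1", 10000, 20000, parser=pysam.asGTF()):
    # read access: named columns, plus attribute lookup via __getattr__
    print(gtf.contig, gtf.feature, gtf.start, gtf.end, gtf.gene_id)
    # write access: setAttribute round-trips through asDict/fromDict, so
    # the attribute string manages its own memory afterwards
    gtf.setAttribute("my_tag", "my_value")
    print(str(gtf))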
-cdef class NamedTupleProxy(TupleProxy):
-
- map_key2field = {}
-
- def __setattr__(self, key, value):
- '''set attribute.'''
- cdef int idx
- idx, f = self.map_key2field[key]
-        if idx >= self.nfields:
- raise KeyError("field %s not set" % key)
- TupleProxy.__setitem__(self, idx, str(value))
-
- def __getattr__(self, key):
- cdef int idx
- idx, f = self.map_key2field[key]
-        if idx >= self.nfields:
- raise KeyError("field %s not set" % key)
- if f == str:
- return force_str(self.fields[idx],
- self.encoding)
- return f(self.fields[idx])
-
-
-cdef class BedProxy(NamedTupleProxy):
- '''Proxy class for access to Bed fields.
-
- This class represents a BED entry for fast read-access.
- '''
- map_key2field = {
- 'contig' : (0, str),
- 'start' : (1, int),
- 'end' : (2, int),
- 'name' : (3, str),
- 'score' : (4, float),
- 'strand' : (5, str),
- 'thickStart' : (6, int),
- 'thickEnd' : (7, int),
- 'itemRGB' : (8, str),
- 'blockCount': (9, int),
- 'blockSizes': (10, str),
- 'blockStarts': (11, str), }
-
- cpdef int getMinFields(self):
- '''return minimum number of fields.'''
- return 3
-
- cpdef int getMaxFields(self):
- '''return max number of fields.'''
- return 12
-
- cdef update(self, char * buffer, size_t nbytes):
- '''update internal data.
-
- nbytes does not include the terminal '\0'.
- '''
- TupleProxy.update(self, buffer, nbytes)
-
- if self.nfields < 3:
- raise ValueError(
- "bed format requires at least three columns")
-
- # determines bed format
- self.bedfields = self.nfields
-
- # do automatic conversion
- self.contig = self.fields[0]
- self.start = atoi(self.fields[1])
- self.end = atoi(self.fields[2])
-
- # __setattr__ in base class seems to take precedence
- # hence implement setters in __setattr__
- #property start:
- # def __get__( self ): return self.start
- #property end:
- # def __get__( self ): return self.end
-
- def __str__(self):
-
- cdef int save_fields = self.nfields
- # ensure fields to use correct format
- self.nfields = self.bedfields
- retval = TupleProxy.__str__(self)
- self.nfields = save_fields
- return retval
-
- def __setattr__(self, key, value ):
- '''set attribute.'''
- if key == "start":
- self.start = value
- elif key == "end":
- self.end = value
-
- cdef int idx
- idx, f = self.map_key2field[key]
- TupleProxy._setindex(self, idx, str(value) )
-
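The same fetch pattern works for BED data (a sketch; "regions.bed.gz" is a
hypothetical tabix-indexed file, and the name column assumes records with
at least four fields):

import pysam

tbx = pysam.TabixFile("regions.bed.gz")
for bed in tbx.fetch("chr1", parser=pysam.asBed()):
    # attribute access is resolved through map_key2field, with the
    # declared type conversion applied to each field
    print(bed.contig, bed.start, bed.end, bed.name)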
-cdef class VCFProxy(NamedTupleProxy):
- '''Proxy class for access to VCF fields.
-
- The genotypes are accessed via a numeric index.
- Sample headers are not available.
- '''
- map_key2field = {
- 'contig' : (0, str),
- 'pos' : (1, int),
- 'id' : (2, str),
- 'ref' : (3, str),
- 'alt' : (4, str),
- 'qual' : (5, str),
- 'filter' : (6, str),
- 'info' : (7, str),
- 'format' : (8, str) }
-
- def __cinit__(self):
- # automatically calls TupleProxy.__cinit__
- # start indexed access at genotypes
- self.offset = 9
-
- cdef update(self, char * buffer, size_t nbytes):
- '''update internal data.
-
- nbytes does not include the terminal '\0'.
- '''
- TupleProxy.update(self, buffer, nbytes)
-
- self.contig = self.fields[0]
- # vcf counts from 1 - correct here
- self.pos = atoi(self.fields[1]) - 1
-
- def __len__(self):
- '''return number of genotype fields.'''
- return max(0, self.nfields - 9)
-
- property pos:
-        '''position of the variant (converted to 0-based coordinates).'''
- def __get__(self):
- return self.pos
-
- def __setattr__(self, key, value):
- '''set attribute.'''
-        if key == "pos":
-            self.pos = value
-            # the C attribute holds the 0-based position; the text field
-            # written below keeps the 1-based VCF convention
-            value += 1
-
- cdef int idx
- idx, f = self.map_key2field[key]
- TupleProxy._setindex(self, idx, str(value))
-
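And likewise for VCF rows (a sketch; "calls.vcf.gz" is a hypothetical
bgzip-compressed, tabix-indexed file):

import pysam

tbx = pysam.TabixFile("calls.vcf.gz")
for rec in tbx.fetch("chr1", parser=pysam.asVCF()):
    # named columns; pos has already been converted to 0-based in update()
    print(rec.contig, rec.pos, rec.ref, rec.alt)
    # genotype columns are addressed by sample index (offset == 9)
    for i in range(len(rec)):
        print(rec[i])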
+++ /dev/null
-#########################################################################
-# Utility functions used across pysam
-#########################################################################
-cimport cython
-from cpython cimport array as c_array
-
-cpdef parse_region(reference=*, start=*, end=*, region=*)
-
-#########################################################################
-# Utility functions for quality string conversions
-
-cpdef c_array.array qualitystring_to_array(input_str, int offset=*)
-cpdef array_to_qualitystring(c_array.array arr, int offset=*)
-cpdef qualities_to_qualitystring(qualities, int offset=*)
-
-########################################################################
-########################################################################
-########################################################################
-## Python 3 compatibility functions
-########################################################################
-cdef charptr_to_str(const char *s, encoding=*)
-cdef bytes charptr_to_bytes(const char *s, encoding=*)
-cdef charptr_to_str_w_len(const char* s, size_t n, encoding=*)
-cdef force_str(object s, encoding=*)
-cdef bytes force_bytes(object s, encoding=*)
-cdef bytes encode_filename(object filename)
-cdef from_string_and_size(const char *s, size_t length)
-
-cdef extern from "pysam_util.h":
-
- int samtools_main(int argc, char *argv[])
- int bcftools_main(int argc, char *argv[])
- void pysam_set_stderr(int fd)
- void pysam_unset_stderr()
- void pysam_set_stdout(int fd)
- void pysam_set_stdout_fn(const char *)
- void pysam_unset_stdout()
- void set_optind(int)
+++ /dev/null
-import types
-import sys
-import string
-import re
-import tempfile
-import os
-import io
-from contextlib import contextmanager
-
-from cpython.version cimport PY_MAJOR_VERSION
-from cpython cimport PyBytes_Check, PyUnicode_Check
-from cpython cimport array as c_array
-from libc.stdlib cimport calloc, free
-from libc.string cimport strncpy
-from libc.stdio cimport fprintf, stderr, fflush
-from libc.stdio cimport stdout as c_stdout
-from posix.fcntl cimport open as c_open, O_WRONLY
-
-#####################################################################
-# hard-coded constants
-cdef int MAX_POS = 2 << 29
-
-#################################################################
-# Utility functions for quality string conversions
-cpdef c_array.array qualitystring_to_array(input_str, int offset=33):
- """convert a qualitystring to an array of quality values."""
- if input_str is None:
- return None
- qs = force_bytes(input_str)
- cdef char i
- return c_array.array('B', [i - offset for i in qs])
-
-
-cpdef array_to_qualitystring(c_array.array qualities, int offset=33):
- """convert an array of quality values to a string."""
- if qualities is None:
- return None
- cdef int x
-
- cdef c_array.array result
- result = c_array.clone(qualities, len(qualities), zero=False)
-
- for x from 0 <= x < len(qualities):
- result[x] = qualities[x] + offset
- return force_str(result.tostring())
-
-
-cpdef qualities_to_qualitystring(qualities, int offset=33):
- """convert a list or array of quality scores to the string
- representation used in the SAM format.
-
-    Parameters
-    ----------
-    qualities : list, tuple or array
-        quality scores to convert
-    offset : int
-        offset to be added to the quality scores to arrive at
-        the characters of the quality string (default=33).
-
- Returns
- -------
- string
- a quality string
-
- """
- cdef char x
- if qualities is None:
- return None
- elif isinstance(qualities, c_array.array):
- return array_to_qualitystring(qualities, offset=offset)
- else:
- # tuples and lists
- return force_str("".join([chr(x + offset) for x in qualities]))
-
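A round-trip sketch of the three helpers above, assuming they are re-exported
at the package level as the __all__ list at the end of this module suggests
('I' is ASCII 73, i.e. Phred quality 40 at the default offset of 33):

import pysam

arr = pysam.qualitystring_to_array("III")   # array('B', [40, 40, 40])
qstr = pysam.array_to_qualitystring(arr)    # "III"
assert pysam.qualities_to_qualitystring([40, 40, 40]) == qstr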
-
-########################################################################
-########################################################################
-########################################################################
-## Python 3 compatibility functions
-########################################################################
-cdef bint IS_PYTHON3 = PY_MAJOR_VERSION >= 3
-
-cdef from_string_and_size(const char* s, size_t length):
- if IS_PYTHON3:
- return s[:length].decode("ascii")
- else:
- return s[:length]
-
-# filename encoding (copied from lxml.etree.pyx)
-cdef str _FILENAME_ENCODING
-_FILENAME_ENCODING = sys.getfilesystemencoding()
-if _FILENAME_ENCODING is None:
- _FILENAME_ENCODING = sys.getdefaultencoding()
-if _FILENAME_ENCODING is None:
- _FILENAME_ENCODING = 'ascii'
-
-#cdef char* _C_FILENAME_ENCODING
-#_C_FILENAME_ENCODING = <char*>_FILENAME_ENCODING
-
-cdef bytes encode_filename(object filename):
- """Make sure a filename is 8-bit encoded (or None)."""
- if filename is None:
- return None
- elif PyBytes_Check(filename):
- return filename
- elif PyUnicode_Check(filename):
- return filename.encode(_FILENAME_ENCODING)
- else:
- raise TypeError(u"Argument must be string or unicode.")
-
-cdef bytes force_bytes(object s, encoding="ascii"):
-    u"""convert string or unicode object to bytes, using the given
-    encoding (default: ascii).
-    """
- if not IS_PYTHON3:
- return s
- elif s is None:
- return None
- elif PyBytes_Check(s):
- return s
- elif PyUnicode_Check(s):
- return s.encode(encoding)
- else:
- raise TypeError(u"Argument must be string, bytes or unicode.")
-
-cdef charptr_to_str(const char* s, encoding="ascii"):
- if s == NULL:
- return None
- if PY_MAJOR_VERSION < 3:
- return s
- else:
- return s.decode(encoding)
-
-cdef charptr_to_str_w_len(const char* s, size_t n, encoding="ascii"):
- if s == NULL:
- return None
- if PY_MAJOR_VERSION < 3:
- return s[:n]
- else:
- return s[:n].decode(encoding)
-
-cdef bytes charptr_to_bytes(const char* s, encoding="ascii"):
- if s == NULL:
- return None
- else:
- return s
-
-cdef force_str(object s, encoding="ascii"):
- """Return s converted to str type of current Python
- (bytes in Py2, unicode in Py3)"""
- if s is None:
- return None
- if PY_MAJOR_VERSION < 3:
- return s
- elif PyBytes_Check(s):
- return s.decode(encoding)
- else:
- # assume unicode
- return s
-
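A compact summary of the conversion behaviour on Python 3 (these are cdef
helpers internal to pysam and not importable from Python code, so the
sketch is comments only):

# force_bytes("chr1")      -> b"chr1"   encode with the given encoding
# force_str(b"chr1")       -> "chr1"    decode; str passes through unchanged
# charptr_to_str(NULL)     -> None      NULL pointers map to None
# encode_filename("a.bam") -> b"a.bam"  uses the filesystem encoding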
-cpdef parse_region(reference=None,
- start=None,
- end=None,
- region=None):
- """parse alternative ways to specify a genomic region. A region can
- either be specified by :term:`reference`, `start` and
- `end`. `start` and `end` denote 0-based, half-open
- intervals.
-
- Alternatively, a samtools :term:`region` string can be
- supplied.
-
- If any of the coordinates are missing they will be replaced by the
- minimum (`start`) or maximum (`end`) coordinate.
-
- Note that region strings are 1-based, while `start` and `end` denote
- an interval in python coordinates.
-
- Returns
- -------
-
- tuple : a tuple of `reference`, `start` and `end`.
-
- Raises
- ------
-
- ValueError
- for invalid or out of bounds regions.
-
- """
- cdef int rtid
- cdef long long rstart
- cdef long long rend
-
- rtid = -1
- rstart = 0
- rend = MAX_POS
-    if start is not None:
- try:
- rstart = start
- except OverflowError:
- raise ValueError('start out of range (%i)' % start)
-
-    if end is not None:
- try:
- rend = end
- except OverflowError:
- raise ValueError('end out of range (%i)' % end)
-
- if region:
- region = force_str(region)
- parts = re.split("[:-]", region)
- reference = parts[0]
- if len(parts) >= 2:
- rstart = int(parts[1]) - 1
- if len(parts) >= 3:
- rend = int(parts[2])
-
- if not reference:
- return None, 0, 0
-
- if not 0 <= rstart < MAX_POS:
- raise ValueError('start out of range (%i)' % rstart)
- if not 0 <= rend <= MAX_POS:
- raise ValueError('end out of range (%i)' % rend)
- if rstart > rend:
- raise ValueError(
- 'invalid region: start (%i) > end (%i)' % (rstart, rend))
-
- return force_bytes(reference), rstart, rend
-
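A worked example of the coordinate conventions (a sketch; parse_region is
part of pysam's internal cutils module, not the public API):

# internal module path at the time of this code
from pysam.cutils import parse_region

# a 1-based region string and explicit 0-based, half-open coordinates
# describe the same interval:
parse_region(region="chr1:100-200")                # -> (b"chr1", 99, 200)
parse_region(reference="chr1", start=99, end=200)  # -> (b"chr1", 99, 200)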
-
-def _pysam_dispatch(collection,
- method,
- args=None,
- catch_stdout=True,
- save_stdout=None):
- '''call ``method`` in samtools/bcftools providing arguments in args.
-
- Catching of stdout can be turned off by setting *catch_stdout* to
- False.
-
- '''
-
-    if args is None:
-        args = []
-    else:
-        args = list(args)
-
-    # fail early if the file to be indexed does not exist
-    if method == "index" and args and not os.path.exists(args[0]):
-        raise IOError("No such file or directory: '%s'" % args[0])
-
- # redirect stderr to file
- stderr_h, stderr_f = tempfile.mkstemp()
- pysam_set_stderr(stderr_h)
-
- # redirect stdout to file
- if save_stdout:
- stdout_f = save_stdout
- stdout_h = c_open(force_bytes(stdout_f),
- O_WRONLY)
- if stdout_h == -1:
- raise OSError("error while opening {} for writing".format(stdout_f))
-
- pysam_set_stdout_fn(force_bytes(stdout_f))
- pysam_set_stdout(stdout_h)
- elif catch_stdout:
- stdout_h, stdout_f = tempfile.mkstemp()
-
- MAP_STDOUT_OPTIONS = {
- "samtools": {
- "view": "-o {}",
- "mpileup": "-o {}",
- "depad": "-o {}",
- "calmd": "", # uses pysam_stdout_fn
- },
- "bcftools": {}
- }
-
- stdout_option = None
- if collection == "bcftools":
- # in bcftools, most methods accept -o, the exceptions
- # are below:
- if method not in ("index", "roh", "stats"):
- stdout_option = "-o {}"
- elif method in MAP_STDOUT_OPTIONS[collection]:
- stdout_option = MAP_STDOUT_OPTIONS[collection][method]
-
- if stdout_option is not None:
- os.close(stdout_h)
- pysam_set_stdout_fn(force_bytes(stdout_f))
- args.extend(stdout_option.format(stdout_f).split(" "))
- else:
- pysam_set_stdout(stdout_h)
- else:
- pysam_set_stdout_fn("-")
-
- # setup the function call to samtools/bcftools main
- cdef char ** cargs
- cdef int i, n, retval, l
- n = len(args)
- method = force_bytes(method)
- collection = force_bytes(collection)
- args = [force_bytes(a) for a in args]
-
-    # allocate two extra slots for the collection and method names
- cdef int extra_args = 0
- if method == b"index":
- extra_args = 1
- # add extra arguments for commands accepting optional arguments
- # such as 'samtools index x.bam [out.index]'
- cargs = <char**>calloc(n + 2 + extra_args, sizeof(char *))
- cargs[0] = collection
- cargs[1] = method
-
- # create copies of strings - getopt for long options permutes
- # arguments
- for i from 0 <= i < n:
- l = len(args[i])
- cargs[i + 2] = <char *>calloc(l + 1, sizeof(char))
- strncpy(cargs[i + 2], args[i], l)
-
-    # reset getopt. On OS X, resetting getopt state works differently
-    # for getopt and getopt_long
- if method in [b'index', b'cat', b'quickcheck',
- b'faidx', b'kprobaln']:
- set_optind(1)
- else:
- set_optind(0)
-
- # call samtools/bcftools
- if collection == b"samtools":
- retval = samtools_main(n + 2, cargs)
- elif collection == b"bcftools":
-        retval = bcftools_main(n + 2, cargs)
-    else:
-        raise ValueError("unknown collection '%s'" % collection)
-
- for i from 0 <= i < n:
- free(cargs[i + 2])
- free(cargs)
-
- # get error messages
- def _collect(fn):
-        out = ""
- try:
- with open(fn, "r") as inf:
- out = inf.read()
- except UnicodeDecodeError:
- with open(fn, "rb") as inf:
- # read binary output
- out = inf.read()
- finally:
- os.remove(fn)
- return out
-
- pysam_unset_stderr()
- out_stderr = _collect(stderr_f)
-
- if save_stdout:
- pysam_unset_stdout()
- out_stdout = None
- elif catch_stdout:
- pysam_unset_stdout()
- out_stdout = _collect(stdout_f)
- else:
- out_stdout = None
-
- return retval, out_stderr, out_stdout
-
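In the public API this dispatcher backs the generated command wrappers; a
sketch of the correspondence (the wrapper call is public pysam API, the
direct _pysam_dispatch call is internal, and the file names are hypothetical):

import pysam

# roughly equivalent to
# _pysam_dispatch("samtools", "sort", ["-o", "sorted.bam", "input.bam"])
pysam.samtools.sort("-o", "sorted.bam", "input.bam")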
-
-__all__ = ["qualitystring_to_array",
- "array_to_qualitystring",
- "qualities_to_qualitystring"]
+++ /dev/null
-# cython: embedsignature=True
-#
-# Code to read, write and edit VCF files
-#
-# VCF lines are encoded as a dictionary with these keys (note: all lowercase):
-# 'chrom': string
-# 'pos': integer
-# 'id': string
-# 'ref': string
-# 'alt': list of strings
-# 'qual': integer
-# 'filter': None (missing value), or list of keys (strings); empty list parsed as ["PASS"]
-# 'info': dictionary of values (see below)
-# 'format': list of keys (strings)
-# sample keys: dictionary of values (see below)
-#
-# The sample keys are accessible through vcf.getsamples()
-#
-# A dictionary of values contains value keys (defined in ##INFO or
-# ##FORMAT lines) which map to a list, containing integers, floats,
-# strings, or characters. Missing values are replaced by a particular
-# value, often -1 or .
-#
-# Genotypes are not stored as a string, but as a list of 1 or 3
-# elements (for haploid and diploid samples), the first (and last) the
-# integer representing an allele, and the second the separation
-# character. Note that there is just one genotype per sample, but for
-# consistency the single element is stored in a list.
-#
-# Header lines other than ##INFO, ##FORMAT and ##FILTER are stored as
-# (key, value) pairs and are accessible through getheader()
-#
-# The VCF class can be instantiated with a 'regions' variable
-# consisting of tuples (chrom,start,end) encoding 0-based half-open
-# segments. Only variants with a position inside the segment will be
-# parsed. A regions parser is available under parse_regions.
-#
-# When instantiated, a reference can be passed to the VCF class. This
-# may be any class that supports a fetch(chrom, start, end) method.
-#
-# NOTE: the position that is returned to Python is 0-based, NOT
-# 1-based as in the VCF file.
-# NOTE: There is also preliminary VCF functionality in the VariantFile class.
-#
-# TODO:
-# only v4.0 writing is complete; alleles are not converted to v3.3 format
-#
-
-from collections import namedtuple, defaultdict
-from operator import itemgetter
-import sys, re, copy, bisect
-
-from libc.stdlib cimport atoi
-from libc.stdint cimport int8_t, int16_t, int32_t, int64_t
-from libc.stdint cimport uint8_t, uint16_t, uint32_t, uint64_t
-
-cimport pysam.ctabix as ctabix
-cimport pysam.ctabixproxies as ctabixproxies
-
-from pysam.cutils cimport force_str
-
-import pysam
-
-gtsRegEx = re.compile("[|/\\\\]")
-alleleRegEx = re.compile('^[ACGTN]+$')
-
-# Utility function. Uses 0-based coordinates
-def get_sequence(chrom, start, end, fa):
- # obtain sequence from .fa file, without truncation
- if end<=start: return ""
- if not fa: return "N"*(end-start)
- if start<0: return "N"*(-start) + get_sequence(chrom, 0, end, fa).upper()
- sequence = fa.fetch(chrom, start, end).upper()
- if len(sequence) < end-start: sequence += "N"*(end-start-len(sequence))
- return sequence
-
-# Utility function. Parses a region string
-def parse_regions( string ):
- result = []
- for r in string.split(','):
- elts = r.split(':')
- chrom, start, end = elts[0], 0, 3000000000
- if len(elts)==1: pass
- elif len(elts)==2:
- if len(elts[1])>0:
- ielts = elts[1].split('-')
-                if len(ielts) != 2:
-                    raise ValueError("Don't understand region string '%s'" % r)
-                try:
-                    start, end = int(ielts[0])-1, int(ielts[1])
-                except ValueError:
-                    raise ValueError("Don't understand region string '%s'" % r)
- else:
- raise ValueError("Don't understand region string '%s'" % r)
- result.append( (chrom,start,end) )
- return result
-
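A worked example of the syntax accepted above (comma-separated regions with
1-based inclusive ranges, converted to 0-based half-open tuples):

# parse_regions("chr1:1001-2000,chr2")
#   -> [("chr1", 1000, 2000), ("chr2", 0, 3000000000)]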
-
-FORMAT = namedtuple('FORMAT','id numbertype number type description missingvalue')
-
-###########################################################################################################
-#
-# New class
-#
-###########################################################################################################
-
-cdef class VCFRecord( ctabixproxies.TupleProxy):
- '''vcf record.
-
-    initialized from parsed data and VCF metadata.
- '''
-
- cdef vcf
- cdef char * contig
- cdef uint32_t pos
-
- def __init__(self, vcf):
- self.vcf = vcf
- self.encoding = vcf.encoding
-
- # if len(data) != len(self.vcf._samples):
- # self.vcf.error(str(data),
- # self.BAD_NUMBER_OF_COLUMNS,
- # "expected %s for %s samples (%s), got %s" % \
- # (len(self.vcf._samples),
- # len(self.vcf._samples),
- # self.vcf._samples,
- # len(data)))
-
- def __cinit__(self, vcf):
- # start indexed access at genotypes
- self.offset = 9
-
- self.vcf = vcf
- self.encoding = vcf.encoding
-
- def error(self, line, error, opt=None):
- '''raise error.'''
- # pass to vcf file for error handling
- return self.vcf.error(line, error, opt)
-
- cdef update(self, char * buffer, size_t nbytes):
- '''update internal data.
-
- nbytes does not include the terminal '\0'.
- '''
- ctabixproxies.TupleProxy.update(self, buffer, nbytes)
-
- self.contig = self.fields[0]
- # vcf counts from 1 - correct here
- self.pos = atoi(self.fields[1]) - 1
-
- def __len__(self):
- return max(0, self.nfields - 9)
-
- property contig:
- def __get__(self): return self.contig
-
- property pos:
- def __get__(self): return self.pos
-
-    property id:
-        def __get__(self):
-            return force_str(self.fields[2], self.encoding)
-
-    property ref:
-        def __get__(self):
-            return force_str(self.fields[3], self.encoding)
-
- property alt:
- def __get__(self):
- # convert v3.3 to v4.0 alleles below
-            # fields are C strings; decode so that comparison and
-            # split work the same on Python 2 and 3
-            alt = force_str(self.fields[4], self.encoding)
-            if alt == ".": alt = []
-            else: alt = alt.upper().split(',')
- return alt
-
- property qual:
- def __get__(self):
-            qual = force_str(self.fields[5], self.encoding)
-            if qual == ".": qual = -1
-            else:
-                try: qual = float(qual)
-                except ValueError: self.vcf.error(str(self),self.QUAL_NOT_NUMERICAL)
- return qual
-
- property filter:
- def __get__(self):
-            f = force_str(self.fields[6], self.encoding)
-            # postpone checking that filters exist. Encode missing filter or no filtering as empty list
-            if f == "." or f == "PASS" or f == "0": return []
-            else: return f.split(';')
-
- property info:
- def __get__(self):
-            col = force_str(self.fields[7], self.encoding)
-            # dictionary of keys, and list of values
-            info = {}
-            if col != ".":
-                for blurp in col.split(';'):
-                    elts = blurp.split('=')
-                    if len(elts) == 1: v = None
-                    elif len(elts) == 2: v = elts[1]
-                    else: self.vcf.error(str(self),self.ERROR_INFO_STRING)
-                    info[elts[0]] = self.vcf.parse_formatdata(elts[0], v, self.vcf._info, str(self))
- return info
-
- property format:
- def __get__(self):
-            return force_str(self.fields[8], self.encoding).split(':')
-
- property samples:
- def __get__(self):
- return self.vcf._samples
-
- def __getitem__(self, key):
-
- # parse sample columns
-        values = force_str(self.fields[self.vcf._sample2column[key]], self.encoding).split(':')
- alt = self.alt
- format = self.format
-
- if len(values) > len(format):
-            self.vcf.error(str(self),self.BAD_NUMBER_OF_VALUES,"(found %s values in element %s; expected %s)" %\
-                           (len(values),key,len(format)))
-
- result = {}
- for idx in range(len(format)):
- expected = self.vcf.get_expected(format[idx], self.vcf._format, alt)
- if idx < len(values): value = values[idx]
- else:
- if expected == -1: value = "."
- else: value = ",".join(["."]*expected)
-
- result[format[idx]] = self.vcf.parse_formatdata(format[idx], value, self.vcf._format, str(self.data))
- if expected != -1 and len(result[format[idx]]) != expected:
- self.vcf.error(str(self.data),self.BAD_NUMBER_OF_PARAMETERS,
- "id=%s, expected %s parameters, got %s" % (format[idx],expected,result[format[idx]]))
- if len(result[format[idx]] ) < expected: result[format[idx]] += [result[format[idx]][-1]]*(expected-len(result[format[idx]]))
- result[format[idx]] = result[format[idx]][:expected]
-
- return result
-
-
-cdef class asVCFRecord(ctabix.Parser):
- '''converts a :term:`tabix row` into a VCF record.'''
- cdef vcffile
- def __init__(self, vcffile):
- self.vcffile = vcffile
-
- cdef parse(self, char * buffer, int len):
- cdef VCFRecord r
- r = VCFRecord(self.vcffile)
- r.copy(buffer, len)
- return r
-
-class VCF(object):
-
- # types
- NT_UNKNOWN = 0
- NT_NUMBER = 1
- NT_ALLELES = 2
- NT_NR_ALLELES = 3
- NT_GENOTYPES = 4
- NT_PHASED_GENOTYPES = 5
-
- _errors = { 0:"UNKNOWN_FORMAT_STRING:Unknown file format identifier",
- 1:"BADLY_FORMATTED_FORMAT_STRING:Formatting error in the format string",
- 2:"BADLY_FORMATTED_HEADING:Did not find 9 required headings (CHROM, POS, ..., FORMAT) %s",
- 3:"BAD_NUMBER_OF_COLUMNS:Wrong number of columns found (%s)",
- 4:"POS_NOT_NUMERICAL:Position column is not numerical",
- 5:"UNKNOWN_CHAR_IN_REF:Unknown character in reference field",
- 6:"V33_BAD_REF:Reference should be single-character in v3.3 VCF",
- 7:"V33_BAD_ALLELE:Cannot interpret allele for v3.3 VCF",
- 8:"POS_NOT_POSITIVE:Position field must be >0",
- 9:"QUAL_NOT_NUMERICAL:Quality field must be numerical, or '.'",
- 10:"ERROR_INFO_STRING:Error while parsing info field",
- 11:"ERROR_UNKNOWN_KEY:Unknown key (%s) found in formatted field (info; format; or filter)",
- 12:"ERROR_FORMAT_NOT_NUMERICAL:Expected integer or float in formatted field; got %s",
-                13:"ERROR_FORMAT_NOT_CHAR:Expected character in formatted field; got string",
- 14:"FILTER_NOT_DEFINED:Identifier (%s) in filter found which was not defined in header",
- 15:"FORMAT_NOT_DEFINED:Identifier (%s) in format found which was not defined in header",
-                16:"BAD_NUMBER_OF_VALUES:Found too many values in sample column (%s)",
- 17:"BAD_NUMBER_OF_PARAMETERS:Found unexpected number of parameters (%s)",
- 18:"BAD_GENOTYPE:Cannot parse genotype (%s)",
- 19:"V40_BAD_ALLELE:Bad allele found for v4.0 VCF (%s)",
- 20:"MISSING_REF:Reference allele missing",
- 21:"V33_UNMATCHED_DELETION:Deleted sequence does not match reference (%s)",
-                22:"V40_MISSING_ANGLE_BRACKETS:Format definition is not delimited by angle brackets",
- 23:"FORMAT_MISSING_QUOTES:Description field in format definition is not surrounded by quotes",
- 24:"V40_FORMAT_MUST_HAVE_NAMED_FIELDS:Fields in v4.0 VCF format definition must have named fields",
- 25:"HEADING_NOT_SEPARATED_BY_TABS:Heading line appears separated by spaces, not tabs",
- 26:"WRONG_REF:Wrong reference %s",
- 27:"ERROR_TRAILING_DATA:Numerical field ('%s') has semicolon-separated trailing data",
- 28:"BAD_CHR_TAG:Error calculating chr tag for %s",
- 29:"ZERO_LENGTH_ALLELE:Found zero-length allele",
- 30:"MISSING_INDEL_ALLELE_REF_BASE:Indel alleles must begin with single reference base",
-                31:"ZERO_FOR_NON_FLAG_FIELD:Number set to 0, but type is not 'Flag'",
- 32:"ERROR_FORMAT_NOT_INTEGER:Expected integer in formatted field; got %s",
- 33:"ERROR_FLAG_HAS_VALUE:Flag fields should not have a value",
- }
-
- # tag-value pairs; tags are not unique; does not include fileformat, INFO, FILTER or FORMAT fields
- _header = []
-
- # version number; 33=v3.3; 40=v4.0
- _version = 40
-
- # info, filter and format data
- _info = {}
- _filter = {}
- _format = {}
-
- # header; and required columns
- _required = ["CHROM","POS","ID","REF","ALT","QUAL","FILTER","INFO","FORMAT"]
- _samples = []
-
- # control behaviour
-    _ignored_errors = set([11,31]) # ERROR_UNKNOWN_KEY, ZERO_FOR_NON_FLAG_FIELD
- _warn_errors = set([])
- _leftalign = False
-
- # reference sequence
- _reference = None
-
- # regions to include; None includes everything
- _regions = None
-
-    # stateful stuff
- _lineno = -1
- _line = None
- _lines = None
-
- def __init__(self, _copy=None, reference=None, regions=None,
- lines=None, leftalign=False):
- # make error identifiers accessible by name
- for id in self._errors.keys():
- self.__dict__[self._errors[id].split(':')[0]] = id
- if _copy != None:
- self._leftalign = _copy._leftalign
- self._header = _copy._header[:]
- self._version = _copy._version
- self._info = copy.deepcopy(_copy._info)
- self._filter = copy.deepcopy(_copy._filter)
- self._format = copy.deepcopy(_copy._format)
- self._samples = _copy._samples[:]
- self._sample2column = copy.deepcopy(_copy._sample2column)
- self._ignored_errors = copy.deepcopy(_copy._ignored_errors)
- self._warn_errors = copy.deepcopy(_copy._warn_errors)
- self._reference = _copy._reference
- self._regions = _copy._regions
- if reference: self._reference = reference
- if regions: self._regions = regions
- if leftalign: self._leftalign = leftalign
- self._lines = lines
- self.encoding = "ascii"
- self.tabixfile = None
-
- def error(self,line,error,opt=None):
- if error in self._ignored_errors: return
- errorlabel, errorstring = self._errors[error].split(':')
- if opt: errorstring = errorstring % opt
- errwarn = ["Error","Warning"][error in self._warn_errors]
- errorstring += " in line %s: '%s'\n%s %s: %s\n" % (self._lineno,line,errwarn,errorlabel,errorstring)
- if error in self._warn_errors: return
- raise ValueError(errorstring)
-
- def parse_format(self,line,format,filter=False):
- if self._version == 40:
- if not format.startswith('<'):
- self.error(line,self.V40_MISSING_ANGLE_BRACKETS)
- format = "<"+format
- if not format.endswith('>'):
- self.error(line,self.V40_MISSING_ANGLE_BRACKETS)
- format += ">"
- format = format[1:-1]
- data = {'id':None,'number':None,'type':None,'descr':None}
- idx = 0
- while len(format.strip())>0:
- elts = format.strip().split(',')
- first, rest = elts[0], ','.join(elts[1:])
- if first.find('=') == -1 or (first.find('"')>=0 and first.find('=') > first.find('"')):
- if self._version == 40: self.error(line,self.V40_FORMAT_MUST_HAVE_NAMED_FIELDS)
- if idx == 4: self.error(line,self.BADLY_FORMATTED_FORMAT_STRING)
- first = ["ID=","Number=","Type=","Description="][idx] + first
- if first.startswith('ID='): data['id'] = first.split('=')[1]
- elif first.startswith('Number='): data['number'] = first.split('=')[1]
- elif first.startswith('Type='): data['type'] = first.split('=')[1]
- elif first.startswith('Description='):
- elts = format.split('"')
- if len(elts)<3:
- self.error(line,self.FORMAT_MISSING_QUOTES)
- elts = first.split('=') + [rest]
- data['descr'] = elts[1]
- rest = '"'.join(elts[2:])
- if rest.startswith(','): rest = rest[1:]
- else:
- self.error(line,self.BADLY_FORMATTED_FORMAT_STRING)
- format = rest
- idx += 1
- if filter and idx==1: idx=3 # skip number and type fields for FILTER format strings
- if not data['id']: self.error(line,self.BADLY_FORMATTED_FORMAT_STRING)
- if 'descr' not in data:
- # missing description
- self.error(line,self.BADLY_FORMATTED_FORMAT_STRING)
- data['descr'] = ""
- if not data['type'] and not data['number']:
- # fine, ##filter format
- return FORMAT(data['id'],self.NT_NUMBER,0,"Flag",data['descr'],'.')
-        if data['type'] not in ["Integer","Float","Character","String","Flag"]:
- self.error(line,self.BADLY_FORMATTED_FORMAT_STRING)
- # I would like a missing-value field, but it isn't there
- if data['type'] in ['Integer','Float']: data['missing'] = None # Do NOT use arbitrary int/float as missing value
- else: data['missing'] = '.'
- if not data['number']: self.error(line,self.BADLY_FORMATTED_FORMAT_STRING)
- try:
- n = int(data['number'])
- t = self.NT_NUMBER
- except ValueError:
- n = -1
- if data['number'] == '.': t = self.NT_UNKNOWN
- elif data['number'] == '#alleles': t = self.NT_ALLELES
- elif data['number'] == '#nonref_alleles': t = self.NT_NR_ALLELES
- elif data['number'] == '#genotypes': t = self.NT_GENOTYPES
- elif data['number'] == '#phased_genotypes': t = self.NT_PHASED_GENOTYPES
- # abbreviations added in VCF version v4.1
- elif data['number'] == 'A': t = self.NT_ALLELES
- elif data['number'] == 'G': t = self.NT_GENOTYPES
- else:
- self.error(line,self.BADLY_FORMATTED_FORMAT_STRING)
- # if number is 0 - type must be Flag
- if n == 0 and data['type'] != 'Flag':
- self.error( line, self.ZERO_FOR_NON_FLAG_FIELD)
-            # force type 'Flag' when the number is 0 and the error was ignored
- data['type'] = 'Flag'
-
- return FORMAT(data['id'],t,n,data['type'],data['descr'],data['missing'])
-
- def format_format( self, fmt, filter=False ):
- values = [('ID',fmt.id)]
- if fmt.number != None and not filter:
- if fmt.numbertype == self.NT_UNKNOWN: nmb = "."
- elif fmt.numbertype == self.NT_NUMBER: nmb = str(fmt.number)
- elif fmt.numbertype == self.NT_ALLELES: nmb = "#alleles"
- elif fmt.numbertype == self.NT_NR_ALLELES: nmb = "#nonref_alleles"
- elif fmt.numbertype == self.NT_GENOTYPES: nmb = "#genotypes"
- elif fmt.numbertype == self.NT_PHASED_GENOTYPES: nmb = "#phased_genotypes"
- else:
- raise ValueError("Unknown number type encountered: %s" % fmt.numbertype)
- values.append( ('Number',nmb) )
- values.append( ('Type', fmt.type) )
- values.append( ('Description', '"' + fmt.description + '"') )
- if self._version == 33:
- format = ",".join([v for k,v in values])
- else:
- format = "<" + (",".join( ["%s=%s" % (k,v) for (k,v) in values] )) + ">"
- return format
-
- def get_expected(self, format, formatdict, alt):
- fmt = formatdict[format]
- if fmt.numbertype == self.NT_UNKNOWN: return -1
- if fmt.numbertype == self.NT_NUMBER: return fmt.number
- if fmt.numbertype == self.NT_ALLELES: return len(alt)+1
- if fmt.numbertype == self.NT_NR_ALLELES: return len(alt)
- if fmt.numbertype == self.NT_GENOTYPES: return ((len(alt)+1)*(len(alt)+2)) // 2
- if fmt.numbertype == self.NT_PHASED_GENOTYPES: return (len(alt)+1)*(len(alt)+1)
- return 0
-
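Worked numbers for a record with two ALT alleles (len(alt) == 2):

# NT_ALLELES          -> 3   REF plus two ALTs
# NT_NR_ALLELES       -> 2   ALTs only
# NT_GENOTYPES        -> 6   (3*4)//2 unordered diploid genotypes
# NT_PHASED_GENOTYPES -> 9   3*3 ordered diploid genotypes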
-
- def _add_definition(self, formatdict, key, data, line ):
- if key in formatdict: return
- self.error(line,self.ERROR_UNKNOWN_KEY,key)
- if data == None:
- formatdict[key] = FORMAT(key,self.NT_NUMBER,0,"Flag","(Undefined tag)",".")
- return
- if data == []: data = [""] # unsure what type -- say string
- if type(data[0]) == type(0.0):
- formatdict[key] = FORMAT(key,self.NT_UNKNOWN,-1,"Float","(Undefined tag)",None)
- return
- if type(data[0]) == type(0):
- formatdict[key] = FORMAT(key,self.NT_UNKNOWN,-1,"Integer","(Undefined tag)",None)
- return
- formatdict[key] = FORMAT(key,self.NT_UNKNOWN,-1,"String","(Undefined tag)",".")
-
-
- # todo: trim trailing missing values
- def format_formatdata( self, data, format, key=True, value=True, separator=":" ):
- output, sdata = [], []
- if type(data) == type([]): # for FORMAT field, make data with dummy values
- d = {}
- for k in data: d[k] = []
- data = d
- # convert missing values; and silently add definitions if required
- for k in data:
- self._add_definition( format, k, data[k], "(output)" )
- for idx,v in enumerate(data[k]):
- if v == format[k].missingvalue: data[k][idx] = "."
- # make sure GT comes first; and ensure fixed ordering; also convert GT data back to string
- for k in data:
- if k != 'GT': sdata.append( (k,data[k]) )
- sdata.sort()
- if 'GT' in data:
- sdata = [('GT',map(self.convertGTback,data['GT']))] + sdata
- for k,v in sdata:
- if v == []: v = None
- if key and value:
- if v != None: output.append( k+"="+','.join(map(str,v)) )
- else: output.append( k )
- elif key: output.append(k)
- elif value:
- if v != None: output.append( ','.join(map(str,v)) )
- else: output.append( "." ) # should not happen
- # snip off trailing missing data
- while len(output) > 1:
- last = output[-1].replace(',','').replace('.','')
- if len(last)>0: break
- output = output[:-1]
- return separator.join(output)
-
-
- def enter_default_format(self):
- for f in [FORMAT('GT',self.NT_NUMBER,1,'String','Genotype','.'),
- FORMAT('DP',self.NT_NUMBER,1,'Integer','Read depth at this position for this sample',-1),
- FORMAT('FT',self.NT_NUMBER,1,'String','Sample Genotype Filter','.'),
- FORMAT('GL',self.NT_UNKNOWN,-1,'Float','Genotype likelihoods','.'),
- FORMAT('GLE',self.NT_UNKNOWN,-1,'Float','Genotype likelihoods','.'),
- FORMAT('GQ',self.NT_NUMBER,1,'Integer','Genotype Quality',-1),
- FORMAT('PL',self.NT_GENOTYPES,-1,'Integer','Phred-scaled genotype likelihoods', '.'),
- FORMAT('GP',self.NT_GENOTYPES,-1,'Float','Genotype posterior probabilities','.'),
- FORMAT('GQ',self.NT_GENOTYPES,-1,'Integer','Conditional genotype quality','.'),
- FORMAT('HQ',self.NT_UNKNOWN,-1,'Integer','Haplotype Quality',-1), # unknown number, since may be haploid
- FORMAT('PS',self.NT_UNKNOWN,-1,'Integer','Phase set','.'),
- FORMAT('PQ',self.NT_NUMBER,1,'Integer','Phasing quality',-1),
-                  FORMAT('EC',self.NT_ALLELES,1,'Integer','Expected alternate allele counts',-1),
- FORMAT('MQ',self.NT_NUMBER,1,'Integer','RMS mapping quality',-1),
- ]:
- if f.id not in self._format:
- self._format[f.id] = f
-
- def parse_header(self, line):
-
- assert line.startswith('##')
- elts = line[2:].split('=')
- key = elts[0].strip()
- value = '='.join(elts[1:]).strip()
- if key == "fileformat":
- if value == "VCFv3.3":
- self._version = 33
- elif value == "VCFv4.0":
- self._version = 40
- elif value == "VCFv4.1":
- # AH - for testing
- self._version = 40
- elif value == "VCFv4.2":
- # AH - for testing
- self._version = 40
- else:
- self.error(line,self.UNKNOWN_FORMAT_STRING)
- elif key == "INFO":
- f = self.parse_format(line, value)
- self._info[ f.id ] = f
- elif key == "FILTER":
- f = self.parse_format(line, value, filter=True)
- self._filter[ f.id ] = f
- elif key == "FORMAT":
- f = self.parse_format(line, value)
- self._format[ f.id ] = f
- else:
- # keep other keys in the header field
- self._header.append( (key,value) )
-
-
- def write_header( self, stream ):
- stream.write("##fileformat=VCFv%s.%s\n" % (self._version // 10, self._version % 10))
- for key,value in self._header: stream.write("##%s=%s\n" % (key,value))
- for var,label in [(self._info,"INFO"),(self._filter,"FILTER"),(self._format,"FORMAT")]:
-            for f in var.values(): stream.write("##%s=%s\n" % (label,self.format_format(f,filter=(label=="FILTER"))))
-
-
- def parse_heading( self, line ):
- assert line.startswith('#')
- assert not line.startswith('##')
- headings = line[1:].split('\t')
- # test for 8, as FORMAT field might be missing
- if len(headings)==1 and len(line[1:].split()) >= 8:
- self.error(line,self.HEADING_NOT_SEPARATED_BY_TABS)
- headings = line[1:].split()
-
- for i,s in enumerate(self._required):
-
- if len(headings)<=i or headings[i] != s:
-
- if len(headings) <= i:
-                    err = "(entry %s not found)" % (i+1)
- else:
- err = "(found %s, expected %s)" % (headings[i],s)
-
- #self.error(line,self.BADLY_FORMATTED_HEADING,err)
- # allow FORMAT column to be absent
- if len(headings) == 8:
- headings.append("FORMAT")
- else:
- self.error(line,self.BADLY_FORMATTED_HEADING,err)
-
- self._samples = headings[9:]
- self._sample2column = dict( [(y,x+9) for x,y in enumerate( self._samples ) ] )
-
- def write_heading( self, stream ):
- stream.write("#" + "\t".join(self._required + self._samples) + "\n")
-
- def convertGT(self, GTstring):
- if GTstring == ".": return ["."]
- try:
- gts = gtsRegEx.split(GTstring)
- if len(gts) == 1: return [int(gts[0])]
- if len(gts) != 2: raise ValueError()
- if gts[0] == "." and gts[1] == ".": return [gts[0],GTstring[len(gts[0]):-len(gts[1])],gts[1]]
- return [int(gts[0]),GTstring[len(gts[0]):-len(gts[1])],int(gts[1])]
- except ValueError:
- self.error(self._line,self.BAD_GENOTYPE,GTstring)
- return [".","|","."]
-
- def convertGTback(self, GTdata):
- return ''.join(map(str,GTdata))
-
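A round-trip sketch of the genotype encoding described in the module header
(allele, separator, allele):

# convertGT("0/1") -> [0, "/", 1]
# convertGT("1|2") -> [1, "|", 2]
# convertGT(".")   -> ["."]         haploid/missing stays a single element
# convertGTback([0, "/", 1]) -> "0/1"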
- def parse_formatdata( self, key, value, formatdict, line ):
- # To do: check that the right number of values is present
- f = formatdict.get(key,None)
- if f == None:
- self._add_definition(formatdict, key, value, line )
- f = formatdict[key]
- if f.type == "Flag":
- if value is not None: self.error(line,self.ERROR_FLAG_HAS_VALUE)
- return []
- values = value.split(',')
- # deal with trailing data in some early VCF files
- if f.type in ["Float","Integer"] and len(values)>0 and values[-1].find(';') > -1:
- self.error(line,self.ERROR_TRAILING_DATA,values[-1])
- values[-1] = values[-1].split(';')[0]
- if f.type == "Integer":
- for idx,v in enumerate(values):
- try:
- if v == ".": values[idx] = f.missingvalue
- else: values[idx] = int(v)
-                except ValueError:
- self.error(line,self.ERROR_FORMAT_NOT_INTEGER,"%s=%s" % (key, str(values)))
- return [0] * len(values)
- return values
- elif f.type == "String":
- self._line = line
- if f.id == "GT": values = list(map( self.convertGT, values ))
- return values
- elif f.type == "Character":
- for v in values:
- if len(v) != 1: self.error(line,self.ERROR_FORMAT_NOT_CHAR)
- return values
- elif f.type == "Float":
- for idx,v in enumerate(values):
- if v == ".": values[idx] = f.missingvalue
- try: return list(map(float,values))
-            except ValueError:
- self.error(line,self.ERROR_FORMAT_NOT_NUMERICAL,"%s=%s" % (key, str(values)))
- return [0.0] * len(values)
- else:
- # can't happen
- self.error(line,self.ERROR_INFO_STRING)
-
- def inregion(self, chrom, pos):
- if not self._regions: return True
- for r in self._regions:
- if r[0] == chrom and r[1] <= pos < r[2]: return True
- return False
-
- def parse_data( self, line, lineparse=False ):
- cols = line.split('\t')
- if len(cols) != len(self._samples)+9:
- # gracefully deal with absent FORMAT column
- # and those missing samples
- if len(cols) == 8:
- cols.append("")
- else:
- self.error(line,
- self.BAD_NUMBER_OF_COLUMNS,
- "expected %s for %s samples (%s), got %s" % (len(self._samples)+9, len(self._samples), self._samples, len(cols)))
-
- chrom = cols[0]
-
- # get 0-based position
- try: pos = int(cols[1])-1
-        except ValueError: self.error(line,self.POS_NOT_NUMERICAL)
- if pos < 0: self.error(line,self.POS_NOT_POSITIVE)
-
- # implement filtering
- if not self.inregion(chrom,pos): return None
-
- # end of first-pass parse for sortedVCF
- if lineparse: return chrom, pos, line
-
- id = cols[2]
-
- ref = cols[3].upper()
- if ref == ".":
- self.error(line,self.MISSING_REF)
- if self._version == 33: ref = get_sequence(chrom,pos,pos+1,self._reference)
- else: ref = ""
- else:
- for c in ref:
- if c not in "ACGTN": self.error(line,self.UNKNOWN_CHAR_IN_REF)
- if "N" in ref: ref = get_sequence(chrom,pos,pos+len(ref),self._reference)
-
- # make sure reference is sane
- if self._reference:
- left = max(0,pos-100)
- faref_leftflank = get_sequence(chrom,left,pos+len(ref),self._reference)
- faref = faref_leftflank[pos-left:]
- if faref != ref: self.error(line,self.WRONG_REF,"(reference is %s, VCF says %s)" % (faref,ref))
- ref = faref
-
- # convert v3.3 to v4.0 alleles below
- if cols[4] == ".": alt = []
- else: alt = cols[4].upper().split(',')
-
- if cols[5] == ".": qual = -1
- else:
- try: qual = float(cols[5])
-            except ValueError: self.error(line,self.QUAL_NOT_NUMERICAL)
-
- # postpone checking that filters exist. Encode missing filter or no filtering as empty list
- if cols[6] == "." or cols[6] == "PASS" or cols[6] == "0": filter = []
- else: filter = cols[6].split(';')
-
- # dictionary of keys, and list of values
- info = {}
- if cols[7] != ".":
- for blurp in cols[7].split(';'):
- elts = blurp.split('=')
- if len(elts) == 1: v = None
- elif len(elts) == 2: v = elts[1]
- else: self.error(line,self.ERROR_INFO_STRING)
- info[elts[0]] = self.parse_formatdata(elts[0],
- v,
- self._info,
- line)
-
- # Gracefully deal with absent FORMAT column
- if cols[8] == "": format = []
- else: format = cols[8].split(':')
-
- # check: all filters are defined
- for f in filter:
- if f not in self._filter: self.error(line,self.FILTER_NOT_DEFINED, f)
-
- # check: format fields are defined
- if self._format:
- for f in format:
- if f not in self._format: self.error(line,self.FORMAT_NOT_DEFINED, f)
-
- # convert v3.3 alleles
- if self._version == 33:
- if len(ref) != 1: self.error(line,self.V33_BAD_REF)
- newalts = []
- have_deletions = False
- for a in alt:
- if len(a) == 1: a = a + ref[1:] # SNP; add trailing reference
- elif a.startswith('I'): a = ref[0] + a[1:] + ref[1:] # insertion just beyond pos; add first and trailing reference
- elif a.startswith('D'): # allow D<seq> and D<num>
- have_deletions = True
- try:
- l = int(a[1:]) # throws ValueError if sequence
- if len(ref) < l: # add to reference if necessary
- addns = get_sequence(chrom,pos+len(ref),pos+l,self._reference)
- ref += addns
- for i,na in enumerate(newalts): newalts[i] = na+addns
- a = ref[l:] # new deletion, deleting pos...pos+l
- except ValueError:
- s = a[1:]
- if len(ref) < len(s): # add Ns to reference if necessary
- addns = get_sequence(chrom,pos+len(ref),pos+len(s),self._reference)
- if not s.endswith(addns) and addns != 'N'*len(addns):
- self.error(line,self.V33_UNMATCHED_DELETION,
- "(deletion is %s, reference is %s)" % (a,get_sequence(chrom,pos,pos+len(s),self._reference)))
- ref += addns
- for i,na in enumerate(newalts): newalts[i] = na+addns
- a = ref[len(s):] # new deletion, deleting from pos
- else:
- self.error(line,self.V33_BAD_ALLELE)
- newalts.append(a)
- alt = newalts
- # deletion alleles exist, add dummy 1st reference allele, and account for leading base
- if have_deletions:
- if pos == 0:
-                # Petr Danacek's rule: we can't have a leading nucleotide at (1-based) position 1
- addn = get_sequence(chrom,pos+len(ref),pos+len(ref)+1,self._reference)
- ref += addn
- alt = [allele+addn for allele in alt]
- else:
- addn = get_sequence(chrom,pos-1,pos,self._reference)
- ref = addn + ref
- alt = [addn + allele for allele in alt]
- pos -= 1
- else:
- # format v4.0 -- just check for nucleotides
- for allele in alt:
- if not alleleRegEx.match(allele):
- self.error(line,self.V40_BAD_ALLELE,allele)
-
- # check for leading nucleotide in indel calls
- for allele in alt:
- if len(allele) != len(ref):
- if len(allele) == 0: self.error(line,self.ZERO_LENGTH_ALLELE)
- if ref[0].upper() != allele[0].upper() and "N" not in (ref[0]+allele[0]).upper():
- self.error(line,self.MISSING_INDEL_ALLELE_REF_BASE)
-
- # trim trailing bases in alleles
- # AH: not certain why trimming this needs to be added
- # disabled now for unit testing
- # if alt:
- # for i in range(1,min(len(ref),min(map(len,alt)))):
- # if len(set(allele[-1].upper() for allele in alt)) > 1 or ref[-1].upper() != alt[0][-1].upper():
- # break
- # ref, alt = ref[:-1], [allele[:-1] for allele in alt]
-
- # left-align alleles, if a reference is available
- if self._leftalign and self._reference:
- while left < pos:
- movable = True
- for allele in alt:
- if len(allele) > len(ref):
- longest, shortest = allele, ref
- else:
- longest, shortest = ref, allele
- if len(longest) == len(shortest) or longest[:len(shortest)].upper() != shortest.upper():
- movable = False
- if longest[-1].upper() != longest[len(shortest)-1].upper():
- movable = False
- if not movable:
- break
- ref = ref[:-1]
- alt = [allele[:-1] for allele in alt]
- if min([len(allele) for allele in alt]) == 0 or len(ref) == 0:
- ref = faref_leftflank[pos-left-1] + ref
- alt = [faref_leftflank[pos-left-1] + allele for allele in alt]
- pos -= 1
-
- # parse sample columns
- samples = []
- for sample in cols[9:]:
-            # avoid shadowing the builtin 'dict'
-            sampledata = {}
-            values = sample.split(':')
-            if len(values) > len(format):
-                self.error(line,self.BAD_NUMBER_OF_VALUES,"(found %s values in element %s; expected %s)" % (len(values),sample,len(format)))
-            for idx in range(len(format)):
-                expected = self.get_expected(format[idx], self._format, alt)
-                if idx < len(values): value = values[idx]
-                else:
-                    if expected == -1: value = "."
-                    else: value = ",".join(["."]*expected)
-
-                sampledata[format[idx]] = self.parse_formatdata(format[idx],
-                                                                value,
-                                                                self._format,
-                                                                line)
-                if expected != -1 and len(sampledata[format[idx]]) != expected:
-                    self.error(line,self.BAD_NUMBER_OF_PARAMETERS,
-                               "id=%s, expected %s parameters, got %s" % (format[idx],expected,sampledata[format[idx]]))
-                if len(sampledata[format[idx]]) < expected: sampledata[format[idx]] += [sampledata[format[idx]][-1]]*(expected-len(sampledata[format[idx]]))
-                sampledata[format[idx]] = sampledata[format[idx]][:expected]
-            samples.append(sampledata)
-
- # done
- d = {'chrom':chrom,
- 'pos':pos, # return 0-based position
- 'id':id,
- 'ref':ref,
- 'alt':alt,
- 'qual':qual,
- 'filter':filter,
- 'info':info,
- 'format':format}
- for key,value in zip(self._samples,samples):
- d[key] = value
-
- return d
-
-
- def write_data(self, stream, data):
- required = ['chrom','pos','id','ref','alt','qual','filter','info','format'] + self._samples
- for k in required:
- if k not in data: raise ValueError("Required key %s not found in data" % str(k))
- if data['alt'] == []: alt = "."
- else: alt = ",".join(data['alt'])
- if data['filter'] == None: filter = "."
- elif data['filter'] == []:
- if self._version == 33: filter = "0"
- else: filter = "PASS"
- else: filter = ';'.join(data['filter'])
- if data['qual'] == -1: qual = "."
- else: qual = str(data['qual'])
-
- output = [data['chrom'],
- str(data['pos']+1), # change to 1-based position
- data['id'],
- data['ref'],
- alt,
- qual,
- filter,
- self.format_formatdata(
- data['info'], self._info, separator=";"),
- self.format_formatdata(
- data['format'], self._format, value=False)]
-
- for s in self._samples:
- output.append(self.format_formatdata(
- data[s], self._format, key=False))
-
- stream.write( "\t".join(output) + "\n" )
-
- def _parse_header(self, stream):
-        self._lineno = 0
-        line = ""
-        for line in stream:
- line = force_str(line, self.encoding)
- self._lineno += 1
- if line.startswith('##'):
- self.parse_header(line.strip())
- elif line.startswith('#'):
- self.parse_heading(line.strip())
- self.enter_default_format()
- else:
- break
- return line
-
- def _parse(self, line, stream):
- # deal with files with header only
- if line.startswith("##"): return
- if len(line.strip()) > 0:
- d = self.parse_data( line.strip() )
- if d: yield d
- for line in stream:
- self._lineno += 1
-            # PEP 479: 'return' ends the generator; raising StopIteration
-            # inside a generator is an error on Python 3.7+
-            if self._lines and self._lineno > self._lines: return
- d = self.parse_data( line.strip() )
- if d: yield d
-
- ######################################################################################################
- #
- # API follows
- #
- ######################################################################################################
-
- def getsamples(self):
- """ List of samples in VCF file """
- return self._samples
-
- def setsamples(self,samples):
- """ List of samples in VCF file """
- self._samples = samples
-
- def getheader(self):
- """ List of header key-value pairs (strings) """
- return self._header
-
- def setheader(self,header):
- """ List of header key-value pairs (strings) """
- self._header = header
-
- def getinfo(self):
- """ Dictionary of ##INFO tags, as VCF.FORMAT values """
- return self._info
-
- def setinfo(self,info):
- """ Dictionary of ##INFO tags, as VCF.FORMAT values """
- self._info = info
-
- def getformat(self):
- """ Dictionary of ##FORMAT tags, as VCF.FORMAT values """
- return self._format
-
- def setformat(self,format):
- """ Dictionary of ##FORMAT tags, as VCF.FORMAT values """
- self._format = format
-
- def getfilter(self):
- """ Dictionary of ##FILTER tags, as VCF.FORMAT values """
- return self._filter
-
- def setfilter(self,filter):
- """ Dictionary of ##FILTER tags, as VCF.FORMAT values """
- self._filter = filter
-
- def setversion(self, version):
- if version != 33 and version != 40: raise ValueError("Can only handle v3.3 and v4.0 VCF files")
- self._version = version
-
- def setregions(self, regions):
- self._regions = regions
-
- def setreference(self, ref):
-        """ Provide a reference sequence; a Python class supporting a fetch(chromosome, start, end) method, e.g. pysam.FastaFile """
- self._reference = ref
-
- def ignoreerror(self, errorstring):
- try: self._ignored_errors.add(self.__dict__[errorstring])
- except KeyError: raise ValueError("Invalid error string: %s" % errorstring)
-
- def warnerror(self, errorstring):
- try: self._warn_errors.add(self.__dict__[errorstring])
- except KeyError: raise ValueError("Invalid error string: %s" % errorstring)
-
- def parse(self, stream):
-        """ Parse a stream of VCF-formatted lines. Initializes the class instance and returns a generator """
- last_line = self._parse_header(stream)
- # now return a generator that does the actual work. In this way the pre-processing is done
- # before the first piece of data is yielded
- return self._parse(last_line, stream)
-
- def write(self, stream, datagenerator):
- """ Writes a VCF file to a stream, using a data generator (or list) """
- self.write_header(stream)
- self.write_heading(stream)
- for data in datagenerator: self.write_data(stream,data)
-
- def writeheader(self, stream):
- """ Writes a VCF header """
- self.write_header(stream)
- self.write_heading(stream)
-
- def compare_calls(self, pos1, ref1, alt1, pos2, ref2, alt2):
- """ Utility function: compares two calls for equality """
- # a variant should always be assigned to a unique position, one base before
- # the leftmost position of the alignment gap. If this rule is implemented
- # correctly, the two positions must be equal for the calls to be identical.
- if pos1 != pos2: return False
- # from both calls, trim rightmost bases when identical. Do this safely, i.e.
- # only when the reference bases are not Ns
- while len(ref1)>0 and len(alt1)>0 and ref1[-1] == alt1[-1]:
- ref1 = ref1[:-1]
- alt1 = alt1[:-1]
- while len(ref2)>0 and len(alt2)>0 and ref2[-1] == alt2[-1]:
- ref2 = ref2[:-1]
- alt2 = alt2[:-1]
- # now, the alternative alleles must be identical
- return alt1 == alt2
-
-###########################################################################################################
-###########################################################################################################
-## API functions added by Andreas
-###########################################################################################################
-
- def connect(self, filename, encoding="ascii"):
- '''connect to tabix file.'''
- self.encoding=encoding
- self.tabixfile = pysam.Tabixfile(filename, encoding=encoding)
- self._parse_header(self.tabixfile.header)
-
- def __del__(self):
- self.close()
- self.tabixfile = None
-
- def close(self):
- if self.tabixfile:
- self.tabixfile.close()
- self.tabixfile = None
-
- def fetch(self,
- reference=None,
- start=None,
- end=None,
- region=None ):
- """ Parse a stream of VCF-formatted lines.
- Initializes class instance and return generator """
- return self.tabixfile.fetch(
- reference,
- start,
- end,
- region,
- parser = asVCFRecord(self))
-
- def validate(self, record):
- '''validate vcf record.
-
- returns a validated record.
- '''
-
- raise NotImplementedError("needs to be checked")
-
- chrom, pos = record.chrom, record.pos
-
- # check reference
- ref = record.ref
- if ref == ".":
- self.error(str(record),self.MISSING_REF)
- if self._version == 33: ref = get_sequence(chrom,pos,pos+1,self._reference)
- else: ref = ""
- else:
- for c in ref:
- if c not in "ACGTN": self.error(str(record),self.UNKNOWN_CHAR_IN_REF)
- if "N" in ref: ref = get_sequence(chrom,
- pos,
- pos+len(ref),
- self._reference)
-
- # make sure reference is sane
- if self._reference:
- left = max(0,self.pos-100)
- faref_leftflank = get_sequence(chrom,left,self.pos+len(ref),self._reference)
- faref = faref_leftflank[pos-left:]
- if faref != ref: self.error(str(record),self.WRONG_REF,"(reference is %s, VCF says %s)" % (faref,ref))
- ref = faref
-
- # check: format fields are defined
- for f in record.format:
- if f not in self._format: self.error(str(record),self.FORMAT_NOT_DEFINED, f)
-
- # check: all filters are defined
- for f in record.filter:
- if f not in self._filter: self.error(str(record),self.FILTER_NOT_DEFINED, f)
-
- # convert v3.3 alleles
- if self._version == 33:
- if len(ref) != 1: self.error(str(record),self.V33_BAD_REF)
- newalts = []
- have_deletions = False
- for a in alt:
- if len(a) == 1: a = a + ref[1:] # SNP; add trailing reference
- elif a.startswith('I'): a = ref[0] + a[1:] + ref[1:] # insertion just beyond pos; add first and trailing reference
- elif a.startswith('D'): # allow D<seq> and D<num>
- have_deletions = True
- try:
- l = int(a[1:]) # throws ValueError if sequence
- if len(ref) < l: # add to reference if necessary
- addns = get_sequence(chrom,pos+len(ref),pos+l,self._reference)
- ref += addns
- for i,na in enumerate(newalts): newalts[i] = na+addns
- a = ref[l:] # new deletion, deleting pos...pos+l
- except ValueError:
- s = a[1:]
- if len(ref) < len(s): # add Ns to reference if necessary
- addns = get_sequence(chrom,pos+len(ref),pos+len(s),self._reference)
- if not s.endswith(addns) and addns != 'N'*len(addns):
- self.error(str(record),self.V33_UNMATCHED_DELETION,
- "(deletion is %s, reference is %s)" % (a,get_sequence(chrom,pos,pos+len(s),self._reference)))
- ref += addns
- for i,na in enumerate(newalts): newalts[i] = na+addns
- a = ref[len(s):] # new deletion, deleting from pos
- else:
- self.error(str(record),self.V33_BAD_ALLELE)
- newalts.append(a)
- alt = newalts
- # deletion alleles exist, add dummy 1st reference allele, and account for leading base
- if have_deletions:
- if pos == 0:
- # Petr Danacek's: we can't have a leading nucleotide at (1-based) position 1
- addn = get_sequence(chrom,pos+len(ref),pos+len(ref)+1,self._reference)
- ref += addn
- alt = [allele+addn for allele in alt]
- else:
- addn = get_sequence(chrom,pos-1,pos,self._reference)
- ref = addn + ref
- alt = [addn + allele for allele in alt]
- pos -= 1
- else:
- # format v4.0 -- just check for nucleotides
- for allele in alt:
- if not alleleRegEx.match(allele):
- self.error(str(record),self.V40_BAD_ALLELE,allele)
-
-
- # check for leading nucleotide in indel calls
- for allele in alt:
- if len(allele) != len(ref):
- if len(allele) == 0: self.error(str(record),self.ZERO_LENGTH_ALLELE)
- if ref[0].upper() != allele[0].upper() and "N" not in (ref[0]+allele[0]).upper():
- self.error(str(record),self.MISSING_INDEL_ALLELE_REF_BASE)
-
- # trim trailing bases in alleles
- # AH: not certain why trimming this needs to be added
- # disabled now for unit testing
- # for i in range(1,min(len(ref),min(map(len,alt)))):
- # if len(set(allele[-1].upper() for allele in alt)) > 1 or ref[-1].upper() != alt[0][-1].upper():
- # break
- # ref, alt = ref[:-1], [allele[:-1] for allele in alt]
-
- # left-align alleles, if a reference is available
- if self._leftalign and self._reference:
- while left < pos:
- movable = True
- for allele in alt:
- if len(allele) > len(ref):
- longest, shortest = allele, ref
- else:
- longest, shortest = ref, allele
- if len(longest) == len(shortest) or longest[:len(shortest)].upper() != shortest.upper():
- movable = False
- if longest[-1].upper() != longest[len(shortest)-1].upper():
- movable = False
- if not movable:
- break
- ref = ref[:-1]
- alt = [allele[:-1] for allele in alt]
- if min([len(allele) for allele in alt]) == 0 or len(ref) == 0:
- ref = faref_leftflank[pos-left-1] + ref
- alt = [faref_leftflank[pos-left-1] + allele for allele in alt]
- pos -= 1
-
-__all__ = [
- "VCF", "VCFRecord", ]
--- /dev/null
+from pysam.libchtslib cimport *
+
+cdef extern from "htslib_util.h":
+
+    # resize the variable length data of *b* at *pos* from
+    # *nbytes_old* to *nbytes_new* bytes
+ bam1_t * pysam_bam_update(bam1_t * b,
+ size_t nbytes_old,
+ size_t nbytes_new,
+ uint8_t * pos)
+
+    # now a static function within htslib, hence re-exported here
+ int aux_type2size(int)
+
+ char * pysam_bam_get_qname(bam1_t * b)
+ uint32_t * pysam_bam_get_cigar(bam1_t * b)
+ uint8_t * pysam_bam_get_seq(bam1_t * b)
+ uint8_t * pysam_bam_get_qual(bam1_t * b)
+ uint8_t * pysam_bam_get_aux(bam1_t * b)
+ int pysam_bam_get_l_aux(bam1_t * b)
+ char pysam_bam_seqi(uint8_t * s, int i)
+
+ uint16_t pysam_get_bin(bam1_t * b)
+ uint8_t pysam_get_qual(bam1_t * b)
+ uint8_t pysam_get_l_qname(bam1_t * b)
+ uint16_t pysam_get_flag(bam1_t * b)
+ uint16_t pysam_get_n_cigar(bam1_t * b)
+ void pysam_set_bin(bam1_t * b, uint16_t v)
+ void pysam_set_qual(bam1_t * b, uint8_t v)
+ void pysam_set_l_qname(bam1_t * b, uint8_t v)
+ void pysam_set_flag(bam1_t * b, uint16_t v)
+ void pysam_set_n_cigar(bam1_t * b, uint16_t v)
+ void pysam_update_flag(bam1_t * b, uint16_t v, uint16_t flag)
+
+
+from pysam.libcalignmentfile cimport AlignmentFile
+ctypedef AlignmentFile AlignmentFile_t
+
+
+# Note: need to declare all C fields and methods here
+cdef class AlignedSegment:
+
+ # object that this AlignedSegment represents
+ cdef bam1_t * _delegate
+
+ # the file from which this AlignedSegment originates (can be None)
+ cdef AlignmentFile _alignment_file
+
+ # caching of array properties for quick access
+ cdef object cache_query_qualities
+ cdef object cache_query_alignment_qualities
+ cdef object cache_query_sequence
+ cdef object cache_query_alignment_sequence
+
+ # add an alignment tag with value to the AlignedSegment
+ # an existing tag of the same name will be replaced.
+ cpdef set_tag(self, tag, value, value_type=?, replace=?)
+
+    # retrieve the value of an alignment tag, optionally together
+    # with its type code
+ cpdef get_tag(self, tag, with_value_type=?)
+
+ # return true if tag exists
+ cpdef has_tag(self, tag)
+
+ # returns a valid sam alignment string
+    cpdef tostring(self, AlignmentFile_t htsfile)
+
+
+cdef class PileupColumn:
+ cdef bam_pileup1_t ** plp
+ cdef int tid
+ cdef int pos
+ cdef int n_pu
+ cdef AlignmentFile _alignment_file
+
+
+cdef class PileupRead:
+ cdef AlignedSegment _alignment
+ cdef int32_t _qpos
+ cdef int _indel
+ cdef int _level
+ cdef uint32_t _is_del
+ cdef uint32_t _is_head
+ cdef uint32_t _is_tail
+ cdef uint32_t _is_refskip
+
+# factory methods
+cdef makeAlignedSegment(bam1_t * src, AlignmentFile alignment_file)
+cdef makePileupColumn(bam_pileup1_t ** plp, int tid, int pos, int n_pu, AlignmentFile alignment_file)
+cdef inline makePileupRead(bam_pileup1_t * src, AlignmentFile alignment_file)
+cdef inline uint32_t get_alignment_length(bam1_t * src)
--- /dev/null
+# cython: embedsignature=True
+# cython: profile=True
+###############################################################################
+###############################################################################
+# Cython wrapper for SAM/BAM/CRAM files based on htslib
+###############################################################################
+# The principal classes defined in this module are:
+#
+# class AlignedSegment an aligned segment (read)
+#
+# class PileupColumn a collection of segments (PileupRead) aligned to
+# a particular genomic position.
+#
+# class PileupRead an AlignedSegment aligned to a particular genomic
+# position. Contains additional attributes with respect
+# to this.
+#
+# Additionally this module defines numerous additional classes that are part
+# of the internal API. These are:
+#
+# Various iterator classes to iterate over alignments in sequential (IteratorRow)
+# or in a stacked fashion (IteratorColumn):
+#
+# class IteratorRow
+# class IteratorRowRegion
+# class IteratorRowHead
+# class IteratorRowAll
+# class IteratorRowAllRefs
+# class IteratorRowSelection
+#
+###############################################################################
+#
+# The MIT License
+#
+# Copyright (c) 2015 Andreas Heger
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+#
+###############################################################################
+import re
+import array
+import ctypes
+import struct
+
+cimport cython
+from cpython cimport array as c_array
+from cpython.version cimport PY_MAJOR_VERSION
+from cpython cimport PyErr_SetString, PyBytes_FromStringAndSize
+from libc.string cimport strchr
+
+from pysam.libcutils cimport force_bytes, force_str, \
+ charptr_to_str, charptr_to_bytes
+from pysam.libcutils cimport qualities_to_qualitystring, qualitystring_to_array, \
+ array_to_qualitystring
+
+# Constants for binary tag conversion
+cdef char * htslib_types = 'cCsSiIf'
+cdef char * parray_types = 'bBhHiIf'
+
+# translation tables
+
+# cigar code to character and vice versa
+cdef char* CODE2CIGAR = "MIDNSHP=XB"
+cdef int NCIGAR_CODES = 10
+
+if PY_MAJOR_VERSION >= 3:
+ CIGAR2CODE = dict([y, x] for x, y in enumerate(CODE2CIGAR))
+else:
+ CIGAR2CODE = dict([ord(y), x] for x, y in enumerate(CODE2CIGAR))
+
+CIGAR_REGEX = re.compile(r"(\d+)([MIDNSHP=XB])")
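+
+# Worked example (added commentary, not part of the original source):
+# CIGAR_REGEX.findall("10M2I5M") yields [('10', 'M'), ('2', 'I'), ('5', 'M')],
+# and CIGAR2CODE maps each operation character back to its numerical
+# BAM code, e.g. CIGAR2CODE[ord('M')] == 0 and CIGAR2CODE[ord('I')] == 1.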
+
+#####################################################################
+# C multiplication with wrapping around
+cdef inline uint32_t c_mul(uint32_t a, uint32_t b):
+ return (a * b) & 0xffffffff
+
+
+#####################################################################
+# typecode guessing
+cdef inline char map_typecode_htslib_to_python(uint8_t s):
+ """map an htslib typecode to the corresponding python typecode
+ to be used in the struct or array modules."""
+
+ # map type from htslib to python array
+ cdef char * f = strchr(htslib_types, s)
+
+ if f == NULL:
+ return 0
+ return parray_types[f - htslib_types]
+
+cdef inline uint8_t map_typecode_python_to_htslib(char s):
+ """determine value type from type code of array"""
+ cdef char * f = strchr(parray_types, s)
+ if f == NULL:
+ return 0
+ return htslib_types[f - parray_types]
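+
+# Worked example (added commentary): the two tables above are parallel,
+# so map_typecode_htslib_to_python(ord('c')) == ord('b') (htslib int8
+# values go into a python array of typecode 'b'), and
+# map_typecode_python_to_htslib(ord('h')) == ord('s') maps back.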
+
+# optional tag data manipulation
+cdef convert_binary_tag(uint8_t * tag):
+ """return bytesize, number of values and array of values
+ in aux_data memory location pointed to by tag."""
+ cdef uint8_t auxtype
+ cdef uint8_t byte_size
+ cdef int32_t nvalues
+ # get byte size
+ auxtype = tag[0]
+ byte_size = aux_type2size(auxtype)
+ tag += 1
+ # get number of values in array
+ nvalues = (<int32_t*>tag)[0]
+ tag += 4
+
+ # define python array
+ cdef c_array.array c_values = array.array(
+ chr(map_typecode_htslib_to_python(auxtype)))
+ c_array.resize(c_values, nvalues)
+
+ # copy data
+ memcpy(c_values.data.as_voidptr, <uint8_t*>tag, nvalues * byte_size)
+
+ # no need to check for endian-ness as bam1_core_t fields
+ # and aux_data are in host endian-ness. See sam.c and calls
+ # to swap_data
+ return byte_size, nvalues, c_values
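+
+# Worked example (added commentary): for an aux array of htslib type 'S'
+# (uint16) holding the values [10, 20], the memory after the tag name is
+# the type byte b'S', an int32 count of 2, then the packed values, and
+# convert_binary_tag would return (2, 2, array('H', [10, 20])).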
+
+
+cdef inline uint8_t get_value_code(value, value_type=None):
+ '''guess type code for a *value*. If *value_type* is None,
+ the type code will be inferred based on the Python type of
+ *value*'''
+ cdef uint8_t typecode
+ cdef char * _char_type
+
+ if value_type is None:
+ if isinstance(value, int):
+ typecode = 'i'
+ elif isinstance(value, float):
+ typecode = 'd'
+ elif isinstance(value, str):
+ typecode = 'Z'
+ elif isinstance(value, bytes):
+ typecode = 'Z'
+ elif isinstance(value, array.array) or \
+ isinstance(value, list) or \
+ isinstance(value, tuple):
+ typecode = 'B'
+ else:
+ return 0
+ else:
+ if value_type not in 'Zidf':
+ return 0
+ value_type = force_bytes(value_type)
+ _char_type = value_type
+ typecode = (<uint8_t*>_char_type)[0]
+
+ return typecode
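+
+# Worked example (added commentary): with value_type=None, an int value
+# maps to 'i', a float to 'd', str/bytes to 'Z' and array/list/tuple
+# values to 'B'; get_value_code(3, 'x') returns 0 because 'x' is not one
+# of the permitted codes 'Zidf'.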
+
+
+cdef inline bytes getTypecode(value, maximum_value=None):
+ '''returns the value typecode of a value.
+
+    If *maximum_value* is specified, the appropriate type is
+    returned for a range where *value* is the minimum.
+ '''
+
+ if maximum_value is None:
+ maximum_value = value
+
+ cdef bytes valuetype
+
+ t = type(value)
+
+ if t is float:
+ valuetype = b'f'
+ elif t is int:
+ # signed ints
+ if value < 0:
+ if value >= -128 and maximum_value < 128:
+ valuetype = b'c'
+ elif value >= -32768 and maximum_value < 32768:
+ valuetype = b's'
+ elif value < -2147483648 or maximum_value >= 2147483648:
+ raise ValueError(
+ "at least one signed integer out of range of "
+ "BAM/SAM specification")
+ else:
+ valuetype = b'i'
+ # unsigned ints
+ else:
+ if maximum_value < 256:
+ valuetype = b'C'
+ elif maximum_value < 65536:
+ valuetype = b'S'
+ elif maximum_value >= 4294967296:
+ raise ValueError(
+ "at least one integer out of range of BAM/SAM specification")
+ else:
+ valuetype = b'I'
+ else:
+ # Note: hex strings (H) are not supported yet
+ if t is not bytes:
+ value = value.encode('ascii')
+ if len(value) == 1:
+ valuetype = b'A'
+ else:
+ valuetype = b'Z'
+
+ return valuetype
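+
+# Worked example (added commentary): getTypecode(-1) == b'c' (int8),
+# getTypecode(200) == b'C' (uint8), getTypecode(40000) == b'S' (uint16),
+# getTypecode(2.5) == b'f', getTypecode("A") == b'A' and
+# getTypecode("AB") == b'Z'.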
+
+
+cdef inline packTags(tags):
+ """pack a list of tags. Each tag is a tuple of (tag, tuple).
+
+ Values are packed into the most space efficient data structure
+ possible unless the tag contains a third field with the typecode.
+
+ Returns a format string and the associated list of arguments
+ to be used in a call to struct.pack_into.
+ """
+ fmts, args = ["<"], []
+
+ cdef char array_typecode
+
+ datatype2format = {
+ b'c': ('b', 1),
+ b'C': ('B', 1),
+ b's': ('h', 2),
+ b'S': ('H', 2),
+ b'i': ('i', 4),
+ b'I': ('I', 4),
+ b'f': ('f', 4),
+ b'A': ('c', 1)}
+
+ for tag in tags:
+
+ if len(tag) == 2:
+ pytag, value = tag
+ valuetype = None
+ elif len(tag) == 3:
+ pytag, value, valuetype = tag
+ else:
+            raise ValueError("malformed tag: %s" % str(tag))
+
+ pytag = force_bytes(pytag)
+ valuetype = force_bytes(valuetype)
+ t = type(value)
+
+ if t is tuple or t is list:
+ # binary tags from tuples or lists
+ if valuetype is None:
+                # automatically determine value type from the minimum
+                # and maximum values. If there is a mix of types, the
+                # result is undefined.
+ valuetype = getTypecode(min(value), max(value))
+
+ if valuetype not in datatype2format:
+ raise ValueError("invalid value type '%s'" % valuetype)
+
+ datafmt = "2sccI%i%s" % (len(value), datatype2format[valuetype][0])
+ args.extend([pytag[:2],
+ b"B",
+ valuetype,
+ len(value)] + list(value))
+
+ elif isinstance(value, array.array):
+ # binary tags from arrays
+ if valuetype is None:
+ array_typecode = map_typecode_python_to_htslib(ord(value.typecode))
+
+ if array_typecode == 0:
+ raise ValueError("unsupported type code '{}'"
+ .format(value.typecode))
+
+ valuetype = force_bytes(chr(array_typecode))
+
+ if valuetype not in datatype2format:
+ raise ValueError("invalid value type '%s' (%s)" %
+ (valuetype, type(valuetype)))
+
+ # use array.tostring() to retrieve byte representation and
+ # save as bytes
+ datafmt = "2sccI%is" % (len(value) * datatype2format[valuetype][1])
+ args.extend([pytag[:2],
+ b"B",
+ valuetype,
+ len(value),
+ force_bytes(value.tostring())])
+
+ else:
+ if valuetype is None:
+ valuetype = getTypecode(value)
+
+ if valuetype in b"AZ":
+ value = force_bytes(value)
+
+ if valuetype == b"Z":
+ datafmt = "2sc%is" % (len(value)+1)
+ else:
+ datafmt = "2sc%s" % datatype2format[valuetype][0]
+
+ args.extend([pytag[:2],
+ valuetype,
+ value])
+
+ fmts.append(datafmt)
+
+ return "".join(fmts), args
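+
+# Worked example (added commentary): packTags([("NM", 2)]) returns
+# ("<2scB", [b"NM", b"C", 2]); struct.pack("<2scB", b"NM", b"C", 2)
+# then yields b"NMC\x02", i.e. the BAM aux encoding of NM:C:2.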
+
+
+cdef inline int32_t calculateQueryLength(bam1_t * src):
+ """return query length computed from CIGAR alignment.
+
+ Return 0 if there is no CIGAR alignment.
+ """
+
+ cdef uint32_t * cigar_p = pysam_bam_get_cigar(src)
+
+ if cigar_p == NULL:
+ return 0
+
+ cdef uint32_t k, qpos
+ cdef int op
+ qpos = 0
+
+ for k from 0 <= k < pysam_get_n_cigar(src):
+ op = cigar_p[k] & BAM_CIGAR_MASK
+
+ if op == BAM_CMATCH or \
+ op == BAM_CINS or \
+ op == BAM_CSOFT_CLIP or \
+ op == BAM_CHARD_CLIP or \
+ op == BAM_CEQUAL or \
+ op == BAM_CDIFF:
+ qpos += cigar_p[k] >> BAM_CIGAR_SHIFT
+
+ return qpos
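+
+# Worked example (added commentary): for a CIGAR of "3S5M2I4M" the
+# query length is 3 + 5 + 2 + 4 == 14; deletions (D) and reference
+# skips (N) do not consume query bases and are not counted.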
+
+
+cdef inline int32_t getQueryStart(bam1_t *src) except -1:
+ cdef uint32_t * cigar_p
+ cdef uint32_t k, op
+ cdef uint32_t start_offset = 0
+
+ if pysam_get_n_cigar(src):
+        cigar_p = pysam_bam_get_cigar(src)
+ for k from 0 <= k < pysam_get_n_cigar(src):
+ op = cigar_p[k] & BAM_CIGAR_MASK
+ if op == BAM_CHARD_CLIP:
+ if start_offset != 0 and start_offset != src.core.l_qseq:
+ PyErr_SetString(ValueError, 'Invalid clipping in CIGAR string')
+ return -1
+ elif op == BAM_CSOFT_CLIP:
+ start_offset += cigar_p[k] >> BAM_CIGAR_SHIFT
+ else:
+ break
+
+ return start_offset
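+
+# Worked example (added commentary): for a CIGAR of "3S5M" the aligned
+# portion of the query starts at offset 3; leading hard clips do not
+# consume query bases, so "2H3S5M" also yields 3.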
+
+
+cdef inline int32_t getQueryEnd(bam1_t *src) except -1:
+ cdef uint32_t * cigar_p
+ cdef uint32_t k, op
+ cdef uint32_t end_offset = src.core.l_qseq
+
+ # if there is no sequence, compute length from cigar string
+ if end_offset == 0:
+ end_offset = calculateQueryLength(src)
+
+ # walk backwards in cigar string
+ if pysam_get_n_cigar(src) > 1:
+        cigar_p = pysam_bam_get_cigar(src)
+ for k from pysam_get_n_cigar(src) > k >= 1:
+ op = cigar_p[k] & BAM_CIGAR_MASK
+ if op == BAM_CHARD_CLIP:
+ if end_offset != 0 and end_offset != src.core.l_qseq:
+ PyErr_SetString(ValueError,
+ 'Invalid clipping in CIGAR string')
+ return -1
+ elif op == BAM_CSOFT_CLIP:
+ end_offset -= cigar_p[k] >> BAM_CIGAR_SHIFT
+ else:
+ break
+
+ return end_offset
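+
+# Worked example (added commentary): for a read of length 8 with a
+# CIGAR of "5M3S", the aligned portion of the query ends at offset
+# 8 - 3 == 5 (exclusive).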
+
+
+cdef inline bytes getSequenceInRange(bam1_t *src,
+ uint32_t start,
+ uint32_t end):
+    """return python bytes object of the sequence in a bam1_t object.
+ """
+
+ cdef uint8_t * p
+ cdef uint32_t k
+ cdef char * s
+
+ if not src.core.l_qseq:
+ return None
+
+ seq = PyBytes_FromStringAndSize(NULL, end - start)
+ s = <char*>seq
+ p = pysam_bam_get_seq(src)
+
+ for k from start <= k < end:
+ # equivalent to seq_nt16_str[bam1_seqi(s, i)] (see bam.c)
+ # note: do not use string literal as it will be a python string
+ s[k-start] = seq_nt16_str[p[k/2] >> 4 * (1 - k%2) & 0xf]
+
+ return charptr_to_bytes(seq)
+
+
+cdef inline object getQualitiesInRange(bam1_t *src,
+ uint32_t start,
+ uint32_t end):
+ """return python array of quality values from a bam1_t object"""
+
+ cdef uint8_t * p
+ cdef uint32_t k
+
+ p = pysam_bam_get_qual(src)
+ if p[0] == 0xff:
+ return None
+
+ # 'B': unsigned char
+ cdef c_array.array result = array.array('B', [0])
+ c_array.resize(result, end - start)
+
+ # copy data
+ memcpy(result.data.as_voidptr, <void*>&p[start], end - start)
+
+ return result
+
+
+#####################################################################
+## private factory methods
+cdef class AlignedSegment
+cdef makeAlignedSegment(bam1_t * src, AlignmentFile alignment_file):
+ '''return an AlignedSegment object constructed from `src`'''
+ # note that the following does not call __init__
+ cdef AlignedSegment dest = AlignedSegment.__new__(AlignedSegment)
+ dest._delegate = bam_dup1(src)
+ dest._alignment_file = alignment_file
+ return dest
+
+
+cdef class PileupColumn
+cdef makePileupColumn(bam_pileup1_t ** plp, int tid, int pos,
+ int n_pu, AlignmentFile alignment_file):
+ '''return a PileupColumn object constructed from pileup in `plp` and
+ setting additional attributes.
+
+ '''
+ # note that the following does not call __init__
+ cdef PileupColumn dest = PileupColumn.__new__(PileupColumn)
+ dest._alignment_file = alignment_file
+ dest.plp = plp
+ dest.tid = tid
+ dest.pos = pos
+ dest.n_pu = n_pu
+ return dest
+
+cdef class PileupRead
+cdef inline makePileupRead(bam_pileup1_t * src, AlignmentFile alignment_file):
+    '''return a PileupRead object constructed from a bam_pileup1_t * object.'''
+ cdef PileupRead dest = PileupRead.__new__(PileupRead)
+ dest._alignment = makeAlignedSegment(src.b, alignment_file)
+ dest._qpos = src.qpos
+ dest._indel = src.indel
+ dest._level = src.level
+ dest._is_del = src.is_del
+ dest._is_head = src.is_head
+ dest._is_tail = src.is_tail
+ dest._is_refskip = src.is_refskip
+ return dest
+
+
+cdef inline uint32_t get_alignment_length(bam1_t * src):
+ cdef int k = 0
+ cdef uint32_t l = 0
+ if src == NULL:
+ return 0
+ cdef uint32_t * cigar_p = bam_get_cigar(src)
+ if cigar_p == NULL:
+ return 0
+ cdef int op
+ cdef int n = pysam_get_n_cigar(src)
+ for k from 0 <= k < n:
+ op = cigar_p[k] & BAM_CIGAR_MASK
+ if op == BAM_CSOFT_CLIP or op == BAM_CHARD_CLIP:
+ continue
+ l += cigar_p[k] >> BAM_CIGAR_SHIFT
+ return l
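+
+# Worked example (added commentary): for a CIGAR of "3S10M2D5M" the
+# alignment length is 10 + 2 + 5 == 17; only soft- and hard-clipped
+# bases are excluded, so insertions, deletions and reference skips all
+# contribute to the length of the expanded sequence built below.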
+
+
+# TODO: avoid string copying for getSequenceInRange, reconstituteSequenceFromMD, ...
+cdef inline bytes build_alignment_sequence(bam1_t * src):
+ """return expanded sequence from MD tag.
+
+ The sequence includes substitutions and both insertions in the
+ reference as well as deletions to the reference sequence. Combine
+ with the cigar string to reconstitute the query or the reference
+ sequence.
+
+ Positions corresponding to `N` (skipped region from the reference)
+ in the CIGAR string will not appear in the returned sequence. The
+ MD should correspondingly not contain these. Thus proper tags are::
+
+ Deletion from the reference: cigar=5M1D5M MD=5^C5
+ Skipped region from reference: cigar=5M1N5M MD=10
+
+ Returns
+ -------
+
+    None, if the alignment has no CIGAR. If no MD tag is present, the
+    sequence is returned as built from the CIGAR alone, without
+    reference bases filled in.
+
+ """
+ if src == NULL:
+ return None
+
+ cdef uint32_t start = getQueryStart(src)
+ cdef uint32_t end = getQueryEnd(src)
+ # get read sequence, taking into account soft-clipping
+ r = getSequenceInRange(src, start, end)
+ cdef char * read_sequence = r
+ cdef uint32_t * cigar_p = pysam_bam_get_cigar(src)
+ if cigar_p == NULL:
+ return None
+
+ cdef uint32_t r_idx = 0
+ cdef int op
+ cdef uint32_t k, i, l, x
+ cdef int nmatches = 0
+ cdef int s_idx = 0
+
+ cdef uint32_t max_len = get_alignment_length(src)
+ if max_len == 0:
+ raise ValueError("could not determine alignment length")
+
+ cdef char * s = <char*>calloc(max_len + 1, sizeof(char))
+ if s == NULL:
+ raise ValueError(
+            "could not allocate sequence of length %i" % max_len)
+
+ for k from 0 <= k < pysam_get_n_cigar(src):
+ op = cigar_p[k] & BAM_CIGAR_MASK
+ l = cigar_p[k] >> BAM_CIGAR_SHIFT
+ if op == BAM_CMATCH or op == BAM_CEQUAL or op == BAM_CDIFF:
+ for i from 0 <= i < l:
+ s[s_idx] = read_sequence[r_idx]
+ r_idx += 1
+ s_idx += 1
+ elif op == BAM_CDEL:
+ for i from 0 <= i < l:
+ s[s_idx] = '-'
+ s_idx += 1
+ elif op == BAM_CREF_SKIP:
+ pass
+ elif op == BAM_CINS:
+ for i from 0 <= i < l:
+ # encode insertions into reference as lowercase
+ s[s_idx] = read_sequence[r_idx] + 32
+ r_idx += 1
+ s_idx += 1
+ elif op == BAM_CSOFT_CLIP:
+ pass
+ elif op == BAM_CHARD_CLIP:
+ pass # advances neither
+ elif op == BAM_CPAD:
+ raise NotImplementedError(
+ "Padding (BAM_CPAD, 6) is currently not supported. "
+ "Please implement. Sorry about that.")
+
+ cdef uint8_t * md_tag_ptr = bam_aux_get(src, "MD")
+ if md_tag_ptr == NULL:
+ seq = PyBytes_FromStringAndSize(s, s_idx)
+ free(s)
+ return seq
+
+ cdef char * md_tag = <char*>bam_aux2Z(md_tag_ptr)
+ cdef int md_idx = 0
+ s_idx = 0
+
+ while md_tag[md_idx] != 0:
+ # c is numerical
+ if md_tag[md_idx] >= 48 and md_tag[md_idx] <= 57:
+ nmatches *= 10
+ nmatches += md_tag[md_idx] - 48
+ md_idx += 1
+ continue
+ else:
+ # save matches up to this point, skipping insertions
+ for x from 0 <= x < nmatches:
+ while s[s_idx] >= 'a':
+ s_idx += 1
+ s_idx += 1
+ while s[s_idx] >= 'a':
+ s_idx += 1
+
+ r_idx += nmatches
+ nmatches = 0
+ if md_tag[md_idx] == '^':
+ md_idx += 1
+ while md_tag[md_idx] >= 65 and md_tag[md_idx] <= 90:
+ assert s[s_idx] == '-'
+ s[s_idx] = md_tag[md_idx]
+ s_idx += 1
+ md_idx += 1
+ else:
+ # save mismatch and change to lower case
+ s[s_idx] = md_tag[md_idx] + 32
+ s_idx += 1
+ r_idx += 1
+ md_idx += 1
+
+ # save matches up to this point, skipping insertions
+ for x from 0 <= x < nmatches:
+ while s[s_idx] >= 'a':
+ s_idx += 1
+ s_idx += 1
+ while s[s_idx] >= 'a':
+ s_idx += 1
+
+ seq = PyBytes_FromStringAndSize(s, s_idx)
+ free(s)
+
+ return seq
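+
+# Worked example (added commentary): for a read with query sequence
+# "ACGT", CIGAR "4M" and MD tag "2A1", build_alignment_sequence returns
+# b"ACaT": matching bases are kept from the read, while the mismatched
+# reference base from the MD tag is inserted in lower case.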
+
+
+cdef class AlignedSegment:
+ '''Class representing an aligned segment.
+
+ This class stores a handle to the samtools C-structure representing
+ an aligned read. Member read access is forwarded to the C-structure
+ and converted into python objects. This implementation should be fast,
+ as only the data needed is converted.
+
+ For write access, the C-structure is updated in-place. This is
+ not the most efficient way to build BAM entries, as the variable
+ length data is concatenated and thus needs to be resized if
+    a field is updated. Furthermore, the BAM entry might temporarily be
+    in an inconsistent state while several fields are updated in turn.
+
+ One issue to look out for is that the sequence should always
+ be set *before* the quality scores. Setting the sequence will
+ also erase any quality scores that were set previously.
+ '''
+
+ # Now only called when instances are created from Python
+ def __init__(self):
+ # see bam_init1
+ self._delegate = <bam1_t*>calloc(1, sizeof(bam1_t))
+ # allocate some memory. If size is 0, calloc does not return a
+ # pointer that can be passed to free() so allocate 40 bytes
+ # for a new read
+ self._delegate.m_data = 40
+ self._delegate.data = <uint8_t *>calloc(
+ self._delegate.m_data, 1)
+ self._delegate.l_data = 0
+ # set some data to make read approximately legit.
+ # Note, SAM writing fails with q_name of length 0
+ self._delegate.core.l_qname = 0
+ self._delegate.core.tid = -1
+ self._delegate.core.pos = -1
+ self._delegate.core.mtid = -1
+ self._delegate.core.mpos = -1
+
+ # caching for selected fields
+ self.cache_query_qualities = None
+ self.cache_query_alignment_qualities = None
+ self.cache_query_sequence = None
+ self.cache_query_alignment_sequence = None
+
+ def __dealloc__(self):
+ bam_destroy1(self._delegate)
+
+ def __str__(self):
+ """return string representation of alignment.
+
+ The representation is an approximate :term:`SAM` format, because
+        an aligned read might not be associated with an :term:`AlignmentFile`.
+ As a result :term:`tid` is shown instead of the reference name.
+ Similarly, the tags field is returned in its parsed state.
+
+ To get a valid SAM record, use :meth:`tostring`.
+ """
+ # sam-parsing is done in sam.c/bam_format1_core which
+ # requires a valid header.
+ return "\t".join(map(str, (self.query_name,
+ self.flag,
+ self.reference_id,
+ self.reference_start,
+ self.mapping_quality,
+ self.cigarstring,
+ self.next_reference_id,
+ self.next_reference_start,
+ self.query_alignment_length,
+ self.query_sequence,
+ self.query_qualities,
+ self.tags)))
+
+ def __copy__(self):
+ return makeAlignedSegment(self._delegate, self._alignment_file)
+
+ def __deepcopy__(self, memo):
+ return makeAlignedSegment(self._delegate, self._alignment_file)
+
+ def compare(self, AlignedSegment other):
+        '''return -1, 0 or 1, if the contents of this record are
+        binary <, = or > to *other*
+
+ '''
+
+ cdef int retval, x
+ cdef bam1_t *t
+ cdef bam1_t *o
+
+ t = self._delegate
+ o = other._delegate
+
+ # uncomment for debugging purposes
+ # cdef unsigned char * oo, * tt
+ # tt = <unsigned char*>(&t.core)
+ # oo = <unsigned char*>(&o.core)
+ # for x from 0 <= x < sizeof( bam1_core_t): print x, tt[x], oo[x]
+ # tt = <unsigned char*>(t.data)
+ # oo = <unsigned char*>(o.data)
+ # for x from 0 <= x < max(t.l_data, o.l_data): print x, tt[x], oo[x], chr(tt[x]), chr(oo[x])
+
+ # Fast-path test for object identity
+ if t == o:
+ return 0
+
+ retval = memcmp(&t.core, &o.core, sizeof(bam1_core_t))
+
+ if retval:
+ return retval
+ # cmp(t.l_data, o.l_data)
+ retval = (t.l_data > o.l_data) - (t.l_data < o.l_data)
+ if retval:
+ return retval
+ return memcmp(t.data, o.data, t.l_data)
+
+ def __richcmp__(self, AlignedSegment other, int op):
+ if op == 2: # == operator
+ return self.compare(other) == 0
+ elif op == 3: # != operator
+ return self.compare(other) != 0
+ else:
+ return NotImplemented
+
+ def __hash__(self):
+ cdef bam1_t * src = self._delegate
+ cdef int x
+
+ # see http://effbot.org/zone/python-hash.htm
+ cdef uint8_t * c = <uint8_t *>&src.core
+ cdef uint32_t hash_value = c[0]
+ for x from 1 <= x < sizeof(bam1_core_t):
+ hash_value = c_mul(hash_value, 1000003) ^ c[x]
+ c = <uint8_t *>src.data
+ for x from 0 <= x < src.l_data:
+ hash_value = c_mul(hash_value, 1000003) ^ c[x]
+
+ return hash_value
+
+ cpdef tostring(self, AlignmentFile_t htsfile):
+ """returns a string representation of the aligned segment.
+
+ The output format is valid SAM format.
+
+ Parameters
+ ----------
+
+ htsfile -- AlignmentFile object to map numerical
+ identifiers to chromosome names.
+ """
+ cdef int n_targets = htsfile.header.n_targets
+
+ if self._delegate.core.tid >= n_targets \
+ or self._delegate.core.mtid >= n_targets:
+ raise ValueError('htsfile does not match aligned segment')
+
+ cdef kstring_t line
+ line.l = line.m = 0
+ line.s = NULL
+
+ if sam_format1(htsfile.header, self._delegate, &line) < 0:
+ if line.m:
+ free(line.s)
+ raise ValueError('sam_format failed')
+
+ ret = force_str(line.s[:line.l])
+
+ if line.m:
+ free(line.s)
+
+ return ret
+
+ ########################################################
+ ## Basic attributes in order of appearance in SAM format
+ property query_name:
+ """the query template name (None if not present)"""
+ def __get__(self):
+ cdef bam1_t * src
+ src = self._delegate
+ if pysam_get_l_qname(src) == 0:
+ return None
+ return charptr_to_str(<char *>pysam_bam_get_qname(src))
+
+ def __set__(self, qname):
+ if qname is None or len(qname) == 0:
+ return
+
+ if len(qname) >= 255:
+            raise ValueError("query name length out of range {} > 254".format(
+ len(qname)))
+
+ qname = force_bytes(qname)
+ cdef bam1_t * src
+ cdef int l
+ cdef char * p
+
+ src = self._delegate
+ p = pysam_bam_get_qname(src)
+
+ # the qname is \0 terminated
+ l = len(qname) + 1
+ pysam_bam_update(src,
+ pysam_get_l_qname(src),
+ l,
+ <uint8_t*>p)
+
+ pysam_set_l_qname(src, l)
+
+ # re-acquire pointer to location in memory
+ # as it might have moved
+ p = pysam_bam_get_qname(src)
+
+ strncpy(p, qname, l)
+
+ property flag:
+        """the bitwise flag of the read"""
+ def __get__(self):
+ return pysam_get_flag(self._delegate)
+ def __set__(self, flag):
+ pysam_set_flag(self._delegate, flag)
+
+ property reference_name:
+ """:term:`reference` name (None if no AlignmentFile is associated)"""
+ def __get__(self):
+ if self._alignment_file is not None:
+ return self._alignment_file.getrname(self._delegate.core.tid)
+ return None
+
+ property reference_id:
+ """:term:`reference` ID
+
+ .. note::
+
+ This field contains the index of the reference sequence in
+ the sequence dictionary. To obtain the name of the
+ reference sequence, use
+ :meth:`pysam.AlignmentFile.getrname()`
+
+ """
+ def __get__(self): return self._delegate.core.tid
+ def __set__(self, tid): self._delegate.core.tid = tid
+
+ property reference_start:
+ """0-based leftmost coordinate"""
+ def __get__(self): return self._delegate.core.pos
+ def __set__(self, pos):
+ ## setting the position requires updating the "bin" attribute
+ cdef bam1_t * src
+ src = self._delegate
+ src.core.pos = pos
+ if pysam_get_n_cigar(src):
+ pysam_set_bin(src,
+ hts_reg2bin(
+ src.core.pos,
+ bam_endpos(src),
+ 14,
+ 5))
+ else:
+ pysam_set_bin(src,
+ hts_reg2bin(
+ src.core.pos,
+ src.core.pos + 1,
+ 14,
+ 5))
+
+ property mapping_quality:
+ """mapping quality"""
+ def __get__(self):
+ return pysam_get_qual(self._delegate)
+ def __set__(self, qual):
+ pysam_set_qual(self._delegate, qual)
+
+ property cigarstring:
+ '''the :term:`cigar` alignment as a string.
+
+ The cigar string is a string of alternating integers
+ and characters denoting the length and the type of
+ an operation.
+
+ .. note::
+ The order length,operation is specified in the
+ SAM format. It is different from the order of
+ the :attr:`cigar` property.
+
+ Returns None if not present.
+
+ To unset the cigarstring, assign None or the
+ empty string.
+ '''
+ def __get__(self):
+ c = self.cigartuples
+ if c is None:
+ return None
+            else:
+                # cigartuples stores (op, length); SAM writes length before op
+                return "".join(["%i%c" % (y, CODE2CIGAR[x]) for x, y in c])
+
+ def __set__(self, cigar):
+ if cigar is None or len(cigar) == 0:
+ self.cigartuples = []
+ else:
+ parts = CIGAR_REGEX.findall(cigar)
+                # convert from SAM length-op order to (op, length) tuples
+                self.cigartuples = [(CIGAR2CODE[ord(y)], int(x)) for x, y in parts]
+
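+    # Worked example (added commentary): assigning "5M1I4M" to
+    # cigarstring sets cigartuples to [(0, 5), (1, 1), (0, 4)];
+    # reading cigarstring back round-trips to "5M1I4M".
+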
+ # TODO
+ # property cigar:
+ # """the cigar alignment"""
+
+ property next_reference_id:
+ """the :term:`reference` id of the mate/next read."""
+ def __get__(self): return self._delegate.core.mtid
+ def __set__(self, mtid):
+ self._delegate.core.mtid = mtid
+
+ property next_reference_name:
+ """:term:`reference` name of the mate/next read (None if no
+ AlignmentFile is associated)"""
+ def __get__(self):
+ if self._alignment_file is not None:
+ return self._alignment_file.getrname(self._delegate.core.mtid)
+ return None
+
+ property next_reference_start:
+ """the position of the mate/next read."""
+ def __get__(self):
+ return self._delegate.core.mpos
+ def __set__(self, mpos):
+ self._delegate.core.mpos = mpos
+
+ property query_length:
+ """the length of the query/read.
+
+ This value corresponds to the length of the sequence supplied
+ in the BAM/SAM file. The length of a query is 0 if there is no
+ sequence in the BAM/SAM file. In those cases, the read length
+ can be inferred from the CIGAR alignment, see
+ :meth:`pysam.AlignedSegment.infer_query_length`.
+
+ The length includes soft-clipped bases and is equal to
+ ``len(query_sequence)``.
+
+        This property is read-only; it is updated implicitly when
+        a new query sequence is assigned.
+
+ Returns 0 if not available.
+
+ """
+ def __get__(self):
+ return self._delegate.core.l_qseq
+
+ property template_length:
+ """the observed query template length"""
+ def __get__(self):
+ return self._delegate.core.isize
+ def __set__(self, isize):
+ self._delegate.core.isize = isize
+
+ property query_sequence:
+ """read sequence bases, including :term:`soft clipped` bases
+ (None if not present).
+
+ Note that assigning to seq will invalidate any quality scores.
+ Thus, to in-place edit the sequence and quality scores, copies of
+ the quality scores need to be taken. Consider trimming for example::
+
+ q = read.query_qualities
+           read.query_sequence = read.query_sequence[5:10]
+ read.query_qualities = q[5:10]
+
+ The sequence is returned as it is stored in the BAM file. Some mappers
+ might have stored a reverse complement of the original read
+ sequence.
+ """
+ def __get__(self):
+ if self.cache_query_sequence:
+ return self.cache_query_sequence
+
+ cdef bam1_t * src
+ cdef char * s
+ src = self._delegate
+
+ if src.core.l_qseq == 0:
+ return None
+
+ self.cache_query_sequence = force_str(getSequenceInRange(
+ src, 0, src.core.l_qseq))
+ return self.cache_query_sequence
+
+ def __set__(self, seq):
+ # samtools manages sequence and quality length memory together
+        # if no quality information is present, the first byte is set to 0xff.
+ cdef bam1_t * src
+ cdef uint8_t * p
+ cdef char * s
+ cdef int l, k
+ cdef Py_ssize_t nbytes_new, nbytes_old
+
+        if seq is None:
+ l = 0
+ else:
+ l = len(seq)
+ seq = force_bytes(seq)
+
+ src = self._delegate
+
+ # as the sequence is stored in half-bytes, the total length (sequence
+ # plus quality scores) is (l+1)/2 + l
+ nbytes_new = (l + 1) / 2 + l
+ nbytes_old = (src.core.l_qseq + 1) / 2 + src.core.l_qseq
+
+ # acquire pointer to location in memory
+ p = pysam_bam_get_seq(src)
+ src.core.l_qseq = l
+
+ # change length of data field
+ pysam_bam_update(src,
+ nbytes_old,
+ nbytes_new,
+ p)
+
+ if l > 0:
+ # re-acquire pointer to location in memory
+ # as it might have moved
+ p = pysam_bam_get_seq(src)
+ for k from 0 <= k < nbytes_new:
+ p[k] = 0
+ # convert to C string
+ s = seq
+ for k from 0 <= k < l:
+ p[k/2] |= seq_nt16_table[<unsigned char>s[k]] << 4 * (1 - k % 2)
+
+ # erase qualities
+ p = pysam_bam_get_qual(src)
+ p[0] = 0xff
+
+ self.cache_query_sequence = force_str(seq)
+
+ # clear cached values for quality values
+ self.cache_query_qualities = None
+ self.cache_query_alignment_qualities = None
+
+ property query_qualities:
+ """read sequence base qualities, including :term:`soft
+ clipped` bases (None if not present).
+
+ Quality scores are returned as a python array of unsigned
+ chars. Note that this is not the ASCII-encoded value typically
+ seen in FASTQ or SAM formatted files. Thus, no offset of 33
+ needs to be subtracted.
+
+ Note that to set quality scores the sequence has to be set
+ beforehand as this will determine the expected length of the
+ quality score array.
+
+        Assigning to this property raises a ValueError if the lengths
+        of the quality scores and the sequence do not match.
+
+ """
+ def __get__(self):
+
+ if self.cache_query_qualities:
+ return self.cache_query_qualities
+
+ cdef bam1_t * src
+ cdef char * q
+
+ src = self._delegate
+
+ if src.core.l_qseq == 0:
+ return None
+
+ self.cache_query_qualities = getQualitiesInRange(src, 0, src.core.l_qseq)
+ return self.cache_query_qualities
+
+ def __set__(self, qual):
+
+        # note that memory is already allocated via setting the sequence,
+        # hence only the length match of sequence and quality is checked.
+ cdef bam1_t * src
+ cdef uint8_t * p
+ cdef int l
+
+ src = self._delegate
+ p = pysam_bam_get_qual(src)
+ if qual is None or len(qual) == 0:
+ # if absent and there is a sequence: set to 0xff
+ if src.core.l_qseq != 0:
+ p[0] = 0xff
+ return
+
+ # check for length match
+ l = len(qual)
+ if src.core.l_qseq != l:
+ raise ValueError(
+ "quality and sequence mismatch: %i != %i" %
+ (l, src.core.l_qseq))
+
+ # create a python array object filling it
+ # with the quality scores
+
+ # NB: should avoid this copying if qual is
+ # already of the correct type.
+ cdef c_array.array result = c_array.array('B', qual)
+
+ # copy data
+ memcpy(p, result.data.as_voidptr, l)
+
+ # save in cache
+ self.cache_query_qualities = qual
+
+ property bin:
+        """the bin field of the alignment record"""
+ def __get__(self):
+ return pysam_get_bin(self._delegate)
+ def __set__(self, bin):
+ pysam_set_bin(self._delegate, bin)
+
+
+ ##########################################################
+ # Derived simple attributes. These are simple attributes of
+ # AlignedSegment getting and setting values.
+ ##########################################################
+ # 1. Flags
+ ##########################################################
+ property is_paired:
+ """true if read is paired in sequencing"""
+ def __get__(self):
+ return (self.flag & BAM_FPAIRED) != 0
+ def __set__(self,val):
+ pysam_update_flag(self._delegate, val, BAM_FPAIRED)
+
+ property is_proper_pair:
+ """true if read is mapped in a proper pair"""
+ def __get__(self):
+ return (self.flag & BAM_FPROPER_PAIR) != 0
+ def __set__(self,val):
+ pysam_update_flag(self._delegate, val, BAM_FPROPER_PAIR)
+ property is_unmapped:
+ """true if read itself is unmapped"""
+ def __get__(self):
+ return (self.flag & BAM_FUNMAP) != 0
+ def __set__(self, val):
+ pysam_update_flag(self._delegate, val, BAM_FUNMAP)
+ property mate_is_unmapped:
+ """true if the mate is unmapped"""
+ def __get__(self):
+ return (self.flag & BAM_FMUNMAP) != 0
+ def __set__(self,val):
+ pysam_update_flag(self._delegate, val, BAM_FMUNMAP)
+ property is_reverse:
+ """true if read is mapped to reverse strand"""
+ def __get__(self):
+ return (self.flag & BAM_FREVERSE) != 0
+ def __set__(self,val):
+ pysam_update_flag(self._delegate, val, BAM_FREVERSE)
+ property mate_is_reverse:
+        """true if the mate is mapped to the reverse strand"""
+ def __get__(self):
+ return (self.flag & BAM_FMREVERSE) != 0
+ def __set__(self,val):
+ pysam_update_flag(self._delegate, val, BAM_FMREVERSE)
+ property is_read1:
+ """true if this is read1"""
+ def __get__(self):
+ return (self.flag & BAM_FREAD1) != 0
+ def __set__(self,val):
+ pysam_update_flag(self._delegate, val, BAM_FREAD1)
+ property is_read2:
+ """true if this is read2"""
+ def __get__(self):
+ return (self.flag & BAM_FREAD2) != 0
+ def __set__(self, val):
+ pysam_update_flag(self._delegate, val, BAM_FREAD2)
+ property is_secondary:
+ """true if not primary alignment"""
+ def __get__(self):
+ return (self.flag & BAM_FSECONDARY) != 0
+ def __set__(self, val):
+ pysam_update_flag(self._delegate, val, BAM_FSECONDARY)
+ property is_qcfail:
+ """true if QC failure"""
+ def __get__(self):
+ return (self.flag & BAM_FQCFAIL) != 0
+ def __set__(self, val):
+ pysam_update_flag(self._delegate, val, BAM_FQCFAIL)
+ property is_duplicate:
+ """true if optical or PCR duplicate"""
+ def __get__(self):
+ return (self.flag & BAM_FDUP) != 0
+ def __set__(self, val):
+ pysam_update_flag(self._delegate, val, BAM_FDUP)
+ property is_supplementary:
+ """true if this is a supplementary alignment"""
+ def __get__(self):
+ return (self.flag & BAM_FSUPPLEMENTARY) != 0
+ def __set__(self, val):
+ pysam_update_flag(self._delegate, val, BAM_FSUPPLEMENTARY)
+
+ # 2. Coordinates and lengths
+ property reference_end:
+ '''aligned reference position of the read on the reference genome.
+
+ reference_end points to one past the last aligned residue.
+ Returns None if not available (read is unmapped or no cigar
+ alignment present).
+
+ '''
+ def __get__(self):
+ cdef bam1_t * src
+ src = self._delegate
+ if (self.flag & BAM_FUNMAP) or pysam_get_n_cigar(src) == 0:
+ return None
+ return bam_endpos(src)
+
+ property reference_length:
+ '''aligned length of the read on the reference genome.
+
+ This is equal to `aend - pos`. Returns None if not available.'''
+ def __get__(self):
+ cdef bam1_t * src
+ src = self._delegate
+ if (self.flag & BAM_FUNMAP) or pysam_get_n_cigar(src) == 0:
+ return None
+ return bam_endpos(src) - \
+ self._delegate.core.pos
+
+ property query_alignment_sequence:
+ """aligned portion of the read.
+
+ This is a substring of :attr:`seq` that excludes flanking
+ bases that were :term:`soft clipped` (None if not present). It
+ is equal to ``seq[qstart:qend]``.
+
+ SAM/BAM files may include extra flanking bases that are not
+ part of the alignment. These bases may be the result of the
+ Smith-Waterman or other algorithms, which may not require
+ alignments that begin at the first residue or end at the last.
+ In addition, extra sequencing adapters, multiplex identifiers,
+ and low-quality bases that were not considered for alignment
+ may have been retained.
+
+ """
+
+ def __get__(self):
+ if self.cache_query_alignment_sequence:
+ return self.cache_query_alignment_sequence
+
+ cdef bam1_t * src
+ cdef uint32_t start, end
+
+ src = self._delegate
+
+ if src.core.l_qseq == 0:
+ return None
+
+ start = getQueryStart(src)
+ end = getQueryEnd(src)
+
+ self.cache_query_alignment_sequence = force_str(
+ getSequenceInRange(src, start, end))
+ return self.cache_query_alignment_sequence
+
+ property query_alignment_qualities:
+ """aligned query sequence quality values (None if not present). These
+ are the quality values that correspond to :attr:`query`, that
+ is, they exclude qualities of :term:`soft clipped` bases. This
+ is equal to ``qual[qstart:qend]``.
+
+ Quality scores are returned as a python array of unsigned
+ chars. Note that this is not the ASCII-encoded value typically
+ seen in FASTQ or SAM formatted files. Thus, no offset of 33
+ needs to be subtracted.
+
+ This property is read-only.
+
+ """
+ def __get__(self):
+
+ if self.cache_query_alignment_qualities:
+ return self.cache_query_alignment_qualities
+
+ cdef bam1_t * src
+ cdef uint32_t start, end
+
+ src = self._delegate
+
+ if src.core.l_qseq == 0:
+ return None
+
+ start = getQueryStart(src)
+ end = getQueryEnd(src)
+ self.cache_query_alignment_qualities = \
+ getQualitiesInRange(src, start, end)
+ return self.cache_query_alignment_qualities
+
+ property query_alignment_start:
+ """start index of the aligned query portion of the sequence (0-based,
+ inclusive).
+
+        This is the index of the first base in :attr:`seq` that is not
+ soft-clipped.
+
+ """
+ def __get__(self):
+ return getQueryStart(self._delegate)
+
+ property query_alignment_end:
+ """end index of the aligned query portion of the sequence (0-based,
+ exclusive)"""
+ def __get__(self):
+ return getQueryEnd(self._delegate)
+
+ property query_alignment_length:
+ """length of the aligned query sequence.
+
+ This is equal to :attr:`qend` - :attr:`qstart`"""
+ def __get__(self):
+ cdef bam1_t * src
+ src = self._delegate
+ return getQueryEnd(src) - getQueryStart(src)
+
+ #####################################################
+ # Computed properties
+
+ def get_reference_positions(self, full_length=False):
+ """a list of reference positions that this read aligns to.
+
+ By default, this method only returns positions in the
+ reference that are within the alignment. If *full_length* is
+ set, None values will be included for any soft-clipped or
+ unaligned positions within the read. The returned list will
+ thus be of the same length as the read.
+
+ """
+ cdef uint32_t k, i, pos
+ cdef int op
+ cdef uint32_t * cigar_p
+ cdef bam1_t * src
+ cdef bint _full = full_length
+
+ src = self._delegate
+ if pysam_get_n_cigar(src) == 0:
+ return []
+
+ result = []
+ pos = src.core.pos
+ cigar_p = pysam_bam_get_cigar(src)
+
+ for k from 0 <= k < pysam_get_n_cigar(src):
+ op = cigar_p[k] & BAM_CIGAR_MASK
+ l = cigar_p[k] >> BAM_CIGAR_SHIFT
+
+ if op == BAM_CSOFT_CLIP or op == BAM_CINS:
+ if _full:
+ for i from 0 <= i < l:
+ result.append(None)
+ elif op == BAM_CMATCH:
+ for i from pos <= i < pos + l:
+ result.append(i)
+ pos += l
+ elif op == BAM_CDEL or op == BAM_CREF_SKIP:
+ pos += l
+
+ return result
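+
+    # Worked example (added commentary): for a read at reference_start
+    # 100 with CIGAR "2M1D3M", get_reference_positions() returns
+    # [100, 101, 103, 104, 105]; the deleted reference base at 102 is
+    # skipped.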
+
+ def infer_query_length(self, always=True):
+ """inferred read length from CIGAR string.
+
+        If *always* is set to True, the read length will always be
+        inferred from the CIGAR string. If set to False, the length
+        of the read sequence will be returned if it is available.
+
+        Returns 0 if no CIGAR string is present.
+ """
+
+ cdef uint32_t * cigar_p
+ cdef bam1_t * src
+
+ src = self._delegate
+
+ if not always and src.core.l_qseq:
+ return src.core.l_qseq
+
+ return calculateQueryLength(src)
+
+ def get_reference_sequence(self):
+ """return the reference sequence.
+
+ This method requires the MD tag to be set.
+ """
+ cdef uint32_t k, i
+ cdef int op
+ cdef bam1_t * src = self._delegate
+ ref_seq = force_str(build_alignment_sequence(src))
+ if ref_seq is None:
+ raise ValueError("MD tag not present")
+
+ cdef uint32_t * cigar_p = pysam_bam_get_cigar(src)
+ cdef uint32_t r_idx = 0
+ result = []
+ for k from 0 <= k < pysam_get_n_cigar(src):
+ op = cigar_p[k] & BAM_CIGAR_MASK
+ l = cigar_p[k] >> BAM_CIGAR_SHIFT
+ if op == BAM_CMATCH or op == BAM_CEQUAL or op == BAM_CDIFF:
+ for i from 0 <= i < l:
+ result.append(ref_seq[r_idx])
+ r_idx += 1
+ elif op == BAM_CDEL:
+ for i from 0 <= i < l:
+ result.append(ref_seq[r_idx])
+ r_idx += 1
+ elif op == BAM_CREF_SKIP:
+ pass
+ elif op == BAM_CINS:
+ r_idx += l
+ elif op == BAM_CSOFT_CLIP:
+ pass
+ elif op == BAM_CHARD_CLIP:
+ pass # advances neither
+ elif op == BAM_CPAD:
+ raise NotImplementedError(
+ "Padding (BAM_CPAD, 6) is currently not supported. "
+ "Please implement. Sorry about that.")
+
+ return "".join(result)
+
+ def get_aligned_pairs(self, matches_only=False, with_seq=False):
+ """a list of aligned read (query) and reference positions.
+
+        For insertions, deletions and skipped reference regions,
+        either the query or the reference position may be None.
+
+ Padding is currently not supported and leads to an exception.
+
+ Parameters
+ ----------
+
+ matches_only : bool
+ If True, only matched bases are returned - no None on either
+ side.
+ with_seq : bool
+ If True, return a third element in the tuple containing the
+ reference sequence. Substitutions are lower-case. This option
+ requires an MD tag to be present.
+
+ Returns
+ -------
+
+ aligned_pairs : list of tuples
+
+ """
+ cdef uint32_t k, i, pos, qpos, r_idx, l
+ cdef int op
+ cdef uint32_t * cigar_p
+ cdef bam1_t * src = self._delegate
+ cdef bint _matches_only = bool(matches_only)
+ cdef bint _with_seq = bool(with_seq)
+
+ # TODO: this method performs no checking and assumes that
+ # read sequence, cigar and MD tag are consistent.
+
+ if _with_seq:
+ ref_seq = force_str(self.get_reference_sequence())
+ if ref_seq is None:
+ raise ValueError("MD tag not present")
+
+ r_idx = 0
+
+ if pysam_get_n_cigar(src) == 0:
+ return []
+
+ result = []
+ pos = src.core.pos
+ qpos = 0
+ cigar_p = pysam_bam_get_cigar(src)
+ for k from 0 <= k < pysam_get_n_cigar(src):
+ op = cigar_p[k] & BAM_CIGAR_MASK
+ l = cigar_p[k] >> BAM_CIGAR_SHIFT
+
+ if op == BAM_CMATCH or op == BAM_CEQUAL or op == BAM_CDIFF:
+ if _with_seq:
+ for i from pos <= i < pos + l:
+ result.append((qpos, i, ref_seq[r_idx]))
+ r_idx += 1
+ qpos += 1
+ else:
+ for i from pos <= i < pos + l:
+ result.append((qpos, i))
+ qpos += 1
+ pos += l
+
+ elif op == BAM_CINS or op == BAM_CSOFT_CLIP:
+ if not _matches_only:
+ if _with_seq:
+ for i from pos <= i < pos + l:
+ result.append((qpos, None, None))
+ qpos += 1
+ else:
+ for i from pos <= i < pos + l:
+ result.append((qpos, None))
+ qpos += 1
+ else:
+ qpos += l
+
+ elif op == BAM_CDEL:
+ if not _matches_only:
+ if _with_seq:
+ for i from pos <= i < pos + l:
+ result.append((None, i, ref_seq[r_idx]))
+ r_idx += 1
+ else:
+ for i from pos <= i < pos + l:
+ result.append((None, i))
+ pos += l
+
+ elif op == BAM_CHARD_CLIP:
+ pass # advances neither
+
+ elif op == BAM_CREF_SKIP:
+ if not _matches_only:
+ if _with_seq:
+ for i from pos <= i < pos + l:
+ result.append((None, i, None))
+ else:
+ for i from pos <= i < pos + l:
+ result.append((None, i))
+
+ pos += l
+
+ elif op == BAM_CPAD:
+ raise NotImplementedError(
+ "Padding (BAM_CPAD, 6) is currently not supported. "
+ "Please implement. Sorry about that.")
+
+ return result
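+
+    # Worked example (added commentary): for a read at reference_start
+    # 100 with CIGAR "2M1I2M", get_aligned_pairs() returns
+    # [(0, 100), (1, 101), (2, None), (3, 102), (4, 103)];
+    # with matches_only=True the (2, None) entry is dropped.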
+
+ def get_blocks(self):
+ """ a list of start and end positions of
+ aligned gapless blocks.
+
+ The start and end positions are in genomic
+ coordinates.
+
+ Blocks are not normalized, i.e. two blocks
+ might be directly adjacent. This happens if
+ the two blocks are separated by an insertion
+ in the read.
+ """
+
+ cdef uint32_t k, pos, l
+ cdef int op
+ cdef uint32_t * cigar_p
+ cdef bam1_t * src
+
+ src = self._delegate
+ if pysam_get_n_cigar(src) == 0:
+ return []
+
+ result = []
+ pos = src.core.pos
+ cigar_p = pysam_bam_get_cigar(src)
+ l = 0
+
+ for k from 0 <= k < pysam_get_n_cigar(src):
+ op = cigar_p[k] & BAM_CIGAR_MASK
+ l = cigar_p[k] >> BAM_CIGAR_SHIFT
+ if op == BAM_CMATCH:
+ result.append((pos, pos + l))
+ pos += l
+ elif op == BAM_CDEL or op == BAM_CREF_SKIP:
+ pos += l
+
+ return result
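+
+    # Worked example (added commentary): for a read at reference_start
+    # 100 with CIGAR "3M1I4M", get_blocks() returns
+    # [(100, 103), (103, 107)] - two directly adjacent blocks, because
+    # the insertion does not consume reference bases.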
+
+ def get_overlap(self, uint32_t start, uint32_t end):
+ """return number of aligned bases of read overlapping the interval
+ *start* and *end* on the reference sequence.
+
+ Return None if cigar alignment is not available.
+ """
+ cdef uint32_t k, i, pos, overlap
+ cdef int op, o
+ cdef uint32_t * cigar_p
+ cdef bam1_t * src
+
+ overlap = 0
+
+ src = self._delegate
+ if pysam_get_n_cigar(src) == 0:
+ return None
+ pos = src.core.pos
+ o = 0
+
+ cigar_p = pysam_bam_get_cigar(src)
+ for k from 0 <= k < pysam_get_n_cigar(src):
+ op = cigar_p[k] & BAM_CIGAR_MASK
+ l = cigar_p[k] >> BAM_CIGAR_SHIFT
+
+ if op == BAM_CMATCH:
+                o = min(pos + l, end) - max(pos, start)
+ if o > 0: overlap += o
+
+ if op == BAM_CMATCH or op == BAM_CDEL or op == BAM_CREF_SKIP:
+ pos += l
+
+ return overlap
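+
+    # Worked example (added commentary): for a read at reference_start
+    # 100 with CIGAR "10M", get_overlap(105, 120) returns
+    # min(110, 120) - max(100, 105) == 5 aligned bases.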
+
+ def get_cigar_stats(self):
+ """summary of operations in cigar string.
+
+        The output order in the array is "MIDNSHP=XB" followed by a
+ field for the NM tag. If the NM tag is not present, this
+ field will always be 0.
+
+ +-----+--------------+-----+
+ |M |BAM_CMATCH |0 |
+ +-----+--------------+-----+
+ |I |BAM_CINS |1 |
+ +-----+--------------+-----+
+ |D |BAM_CDEL |2 |
+ +-----+--------------+-----+
+ |N |BAM_CREF_SKIP |3 |
+ +-----+--------------+-----+
+ |S |BAM_CSOFT_CLIP|4 |
+ +-----+--------------+-----+
+ |H |BAM_CHARD_CLIP|5 |
+ +-----+--------------+-----+
+ |P |BAM_CPAD |6 |
+ +-----+--------------+-----+
+ |= |BAM_CEQUAL |7 |
+ +-----+--------------+-----+
+ |X |BAM_CDIFF |8 |
+ +-----+--------------+-----+
+        |B    |BAM_CBACK     |9    |
+        +-----+--------------+-----+
+        |NM   |NM tag        |10   |
+ +-----+--------------+-----+
+
+        If no cigar string is present, the returned counts will all
+        be zero.
+
+ Returns
+ -------
+
+ arrays : two arrays. The first contains the nucleotide counts within
+ each cigar operation, the second contains the number of blocks for
+ each cigar operation.
+
+ """
+
+ cdef int nfields = NCIGAR_CODES + 1
+
+ cdef c_array.array base_counts = array.array(
+ "I",
+ [0] * nfields)
+ cdef uint32_t [:] base_view = base_counts
+ cdef c_array.array block_counts = array.array(
+ "I",
+ [0] * nfields)
+ cdef uint32_t [:] block_view = block_counts
+
+ cdef bam1_t * src = self._delegate
+ cdef int op
+ cdef uint32_t l
+ cdef int32_t k
+ cdef uint32_t * cigar_p = pysam_bam_get_cigar(src)
+
+ if cigar_p == NULL:
+ return None
+
+ for k from 0 <= k < pysam_get_n_cigar(src):
+ op = cigar_p[k] & BAM_CIGAR_MASK
+ l = cigar_p[k] >> BAM_CIGAR_SHIFT
+ base_view[op] += l
+ block_view[op] += 1
+
+ cdef uint8_t * v = bam_aux_get(src, 'NM')
+ if v != NULL:
+ base_view[nfields - 1] = <int32_t>bam_aux2i(v)
+
+ return base_counts, block_counts
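+
+    # Worked example (added commentary): for a read with CIGAR "10M2S"
+    # and an NM tag of 1, get_cigar_stats() returns
+    # (array('I', [10, 0, 0, 0, 2, 0, 0, 0, 0, 0, 1]),
+    #  array('I', [1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0])).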
+
+ #####################################################
+ ## Unsorted as yet
+ # TODO: capture in CIGAR object
+ property cigartuples:
+ """the :term:`cigar` alignment. The alignment
+ is returned as a list of tuples of (operation, length).
+
+ If the alignment is not present, None is returned.
+
+ The operations are:
+
+ +-----+--------------+-----+
+ |M |BAM_CMATCH |0 |
+ +-----+--------------+-----+
+ |I |BAM_CINS |1 |
+ +-----+--------------+-----+
+ |D |BAM_CDEL |2 |
+ +-----+--------------+-----+
+ |N |BAM_CREF_SKIP |3 |
+ +-----+--------------+-----+
+ |S |BAM_CSOFT_CLIP|4 |
+ +-----+--------------+-----+
+ |H |BAM_CHARD_CLIP|5 |
+ +-----+--------------+-----+
+ |P |BAM_CPAD |6 |
+ +-----+--------------+-----+
+ |= |BAM_CEQUAL |7 |
+ +-----+--------------+-----+
+ |X |BAM_CDIFF |8 |
+        +-----+--------------+-----+
+        |B    |BAM_CBACK     |9    |
+        +-----+--------------+-----+
+
+ .. note::
+ The output is a list of (operation, length) tuples, such as
+ ``[(0, 30)]``.
+ This is different from the SAM specification and
+ the :attr:`cigarstring` property, which uses a
+ (length, operation) order, for example: ``30M``.
+
+ To unset the cigar property, assign an empty list
+ or None.
+ """
+ def __get__(self):
+ cdef uint32_t * cigar_p
+ cdef bam1_t * src
+ cdef uint32_t op, l
+ cdef int k
+
+ src = self._delegate
+ if pysam_get_n_cigar(src) == 0:
+ return None
+
+ cigar = []
+
+ cigar_p = pysam_bam_get_cigar(src);
+ for k from 0 <= k < pysam_get_n_cigar(src):
+ op = cigar_p[k] & BAM_CIGAR_MASK
+ l = cigar_p[k] >> BAM_CIGAR_SHIFT
+ cigar.append((op, l))
+ return cigar
+
+ def __set__(self, values):
+ cdef uint32_t * p
+ cdef bam1_t * src
+ cdef op, l
+ cdef int k, ncigar
+
+ k = 0
+
+ src = self._delegate
+
+ # get location of cigar string
+ p = pysam_bam_get_cigar(src)
+
+ # empty values for cigar string
+ if values is None:
+ values = []
+
+ ncigar = len(values)
+ # create space for cigar data within src.data
+ pysam_bam_update(src,
+ pysam_get_n_cigar(src) * 4,
+ ncigar * 4,
+ <uint8_t*>p)
+
+ # length is number of cigar operations, not bytes
+ pysam_set_n_cigar(src, ncigar)
+
+ # re-acquire pointer to location in memory
+ # as it might have moved
+ p = pysam_bam_get_cigar(src)
+
+ # insert cigar operations
+ for op, l in values:
+ p[k] = l << BAM_CIGAR_SHIFT | op
+ k += 1
+
+ ## setting the cigar string requires updating the bin
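+ # (14 and 5 below are the min_shift and n_lvls parameters of the
+ # BAI binning scheme)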
+ pysam_set_bin(src,
+ hts_reg2bin(
+ src.core.pos,
+ bam_endpos(src),
+ 14,
+ 5))
+
+
+ cpdef set_tag(self,
+ tag,
+ value,
+ value_type=None,
+ replace=True):
+ """sets a particular field *tag* to *value* in the optional alignment
+ section.
+
+ *value_type* describes the type of *value* that is to be entered
+ into the alignment record. It can be set explicitly to one
+ of the valid one-letter type codes. If unset, an appropriate
+ type will be chosen automatically.
+
+ An existing value of the same *tag* will be overwritten unless
+ replace is set to False. This is usually not recommended as a
+ tag may only appear once in the optional alignment section.
+
+ If *value* is None, the tag will be deleted.
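+
+ For example (illustrative calls)::
+
+     read.set_tag("NM", 2)            # integer tag, typecode "i"
+     read.set_tag("RG", "GJP00TM04")  # string tag, typecode "Z"
+     read.set_tag("NM", None)         # delete the tag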
+ """
+
+ cdef int value_size
+ cdef uint8_t * value_ptr
+ cdef uint8_t *existing_ptr
+ cdef uint8_t typecode
+ cdef float float_value
+ cdef double double_value
+ cdef int32_t int_value
+ cdef bam1_t * src = self._delegate
+ cdef char * _value_type
+ cdef c_array.array array_value
+ cdef object buffer
+
+ if len(tag) != 2:
+ raise ValueError('Invalid tag: %s' % tag)
+
+ tag = force_bytes(tag)
+ if replace:
+ existing_ptr = bam_aux_get(src, tag)
+ if existing_ptr:
+ bam_aux_del(src, existing_ptr)
+
+ # setting value to None deletes a tag
+ if value is None:
+ return
+
+ typecode = get_value_code(value, value_type)
+ if typecode == 0:
+ raise ValueError("can't guess type or invalid type code specified")
+
+ # Not Endian-safe, but then again neither is samtools!
+ if typecode == 'Z':
+ value = force_bytes(value)
+ value_ptr = <uint8_t*><char*>value
+ value_size = len(value)+1
+ elif typecode == 'i':
+ int_value = value
+ value_ptr = <uint8_t*>&int_value
+ value_size = sizeof(int32_t)
+ elif typecode == 'd':
+ double_value = value
+ value_ptr = <uint8_t*>&double_value
+ value_size = sizeof(double)
+ elif typecode == 'f':
+ float_value = value
+ value_ptr = <uint8_t*>&float_value
+ value_size = sizeof(float)
+ elif typecode == 'B':
+ # the following goes through python, needs to be cleaned up
+ # pack array using struct
+ if value_type is None:
+ fmt, args = packTags([(tag, value)])
+ else:
+ fmt, args = packTags([(tag, value, value_type)])
+
+ # remove tag and type code as set by bam_aux_append
+ # first four chars of format (<2sc)
+ fmt = '<' + fmt[4:]
+ # first two values to pack
+ args = args[2:]
+ value_size = struct.calcsize(fmt)
+ # buffer will be freed when object goes out of scope
+ buffer = ctypes.create_string_buffer(value_size)
+ struct.pack_into(fmt, buffer, 0, *args)
+ # bam_aux_append copies data from value_ptr
+ bam_aux_append(src,
+ tag,
+ typecode,
+ value_size,
+ <uint8_t*>buffer.raw)
+ return
+ else:
+ raise ValueError('unsupported value_type in set_tag')
+
+ bam_aux_append(src,
+ tag,
+ typecode,
+ value_size,
+ value_ptr)
+
+ cpdef has_tag(self, tag):
+ """returns true if the optional alignment section
+ contains a given *tag*."""
+ cdef uint8_t * v
+ cdef int nvalues
+ btag = force_bytes(tag)
+ v = bam_aux_get(self._delegate, btag)
+ return v != NULL
+
+ cpdef get_tag(self, tag, with_value_type=False):
+ """
+ retrieves data from the optional alignment section
+ given a two-letter *tag* denoting the field.
+
+ The returned value is cast into an appropriate python type.
+
+ This method is the fastest way to access the optional
+ alignment section if only a few tags need to be retrieved.
+
+ Parameters
+ ----------
+
+ tag :
+ data tag.
+
+ with_value_type : Optional[bool]
+ if set to True, the return value is a tuple of (tag value, type code).
+ (default False)
+
+ Returns
+ -------
+
+ A python object with the value of the `tag`. The type of the
+ object depends on the data type in the data record.
+
+ Raises
+ ------
+
+ KeyError
+ If `tag` is not present, a KeyError is raised.
+
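+ For example (illustrative)::
+
+     nm = read.get_tag("NM")
+     nm, typecode = read.get_tag("NM", with_value_type=True)
+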
+ """
+ cdef uint8_t * v
+ cdef int nvalues
+ btag = force_bytes(tag)
+ v = bam_aux_get(self._delegate, btag)
+ if v == NULL:
+ raise KeyError("tag '%s' not present" % tag)
+ if chr(v[0]) == "B":
+ auxtype = chr(v[0]) + chr(v[1])
+ else:
+ auxtype = chr(v[0])
+
+ if auxtype == 'c' or auxtype == 'C' or auxtype == 's' or auxtype == 'S':
+ value = <int>bam_aux2i(v)
+ elif auxtype == 'i' or auxtype == 'I':
+ value = <int32_t>bam_aux2i(v)
+ elif auxtype == 'f' or auxtype == 'F':
+ value = <float>bam_aux2f(v)
+ elif auxtype == 'd' or auxtype == 'D':
+ value = <double>bam_aux2f(v)
+ elif auxtype == 'A':
+ # there might be a more efficient way
+ # to convert a char into a string
+ value = '%c' % <char>bam_aux2A(v)
+ elif auxtype == 'Z':
+ value = charptr_to_str(<char*>bam_aux2Z(v))
+ elif auxtype[0] == 'B':
+ bytesize, nvalues, values = convert_binary_tag(v + 1)
+ value = values
+ else:
+ raise ValueError("unknown auxiliary type '%s'" % auxtype)
+
+ if with_value_type:
+ return (value, auxtype)
+ else:
+ return value
+
+ def get_tags(self, with_value_type=False):
+ """the fields in the optional aligment section.
+
+ Returns a list of all fields in the optional
+ alignment section. Values are converted to appropriate python
+ values. For example:
+
+ [("NM", 2), ("RG", "GJP00TM04")]
+
+ If *with_value_type* is set, the value type as encoded in
+ the AlignedSegment record will be returned as well:
+
+ [(NM, 2, "i"), (RG, "GJP00TM04", "Z")]
+
+ This method will convert all values in the optional alignment
+ section. When getting only one or few tags, please see
+ :meth:`get_tag` for a quicker way to achieve this.
+
+ """
+
+ cdef char * ctag
+ cdef bam1_t * src
+ cdef uint8_t * s
+ cdef char auxtag[3]
+ cdef char auxtype
+ cdef uint8_t byte_size
+ cdef int32_t nvalues
+
+ src = self._delegate
+ if src.l_data == 0:
+ return []
+ s = pysam_bam_get_aux(src)
+ result = []
+ auxtag[2] = 0
+ while s < (src.data + src.l_data):
+ # get tag
+ auxtag[0] = s[0]
+ auxtag[1] = s[1]
+ s += 2
+ auxtype = s[0]
+ if auxtype in ('c', 'C'):
+ value = <int>bam_aux2i(s)
+ s += 1
+ elif auxtype in ('s', 'S'):
+ value = <int>bam_aux2i(s)
+ s += 2
+ elif auxtype in ('i', 'I'):
+ value = <int32_t>bam_aux2i(s)
+ s += 4
+ elif auxtype == 'f':
+ value = <float>bam_aux2f(s)
+ s += 4
+ elif auxtype == 'd':
+ value = <double>bam_aux2f(s)
+ s += 8
+ elif auxtype == 'A':
+ value = "%c" % <char>bam_aux2A(s)
+ s += 1
+ elif auxtype in ('Z', 'H'):
+ value = charptr_to_str(<char*>bam_aux2Z(s))
+ # +1 for NULL terminated string
+ s += len(value) + 1
+ elif auxtype == 'B':
+ s += 1
+ byte_size, nvalues, value = convert_binary_tag(s)
+ # 5 for 1 char and 1 int
+ s += 5 + (nvalues * byte_size) - 1
+ else:
+ raise KeyError("unknown type '%s'" % auxtype)
+
+ s += 1
+
+ if with_value_type:
+ result.append((charptr_to_str(auxtag), value, chr(auxtype)))
+ else:
+ result.append((charptr_to_str(auxtag), value))
+
+ return result
+
+ def set_tags(self, tags):
+ """sets the fields in the optional alignmest section with
+ a list of (tag, value) tuples.
+
+ The :term:`value type` of the values is determined from the
+ python type. Optionally, a type may be given explicitly as
+ a third value in the tuple. For example:
+
+ x.set_tags([("NM", 2, "i"), ("RG", "GJP00TM04", "Z")])
+
+ This method will not enforce the rule that the same tag may appear
+ only once in the optional alignment section.
+ """
+
+ cdef bam1_t * src
+ cdef uint8_t * s
+ cdef char * temp
+ cdef int new_size = 0
+ cdef int old_size
+ src = self._delegate
+
+ # convert and pack the data
+ if tags is not None and len(tags) > 0:
+ fmt, args = packTags(tags)
+ new_size = struct.calcsize(fmt)
+ buffer = ctypes.create_string_buffer(new_size)
+ struct.pack_into(fmt,
+ buffer,
+ 0,
+ *args)
+
+ # delete the old data and allocate new space.
+ # If total_size == 0, the aux field will be
+ # empty
+ old_size = pysam_bam_get_l_aux(src)
+ pysam_bam_update(src,
+ old_size,
+ new_size,
+ pysam_bam_get_aux(src))
+
+ # copy data only if there is any
+ if new_size > 0:
+
+ # get location of new data
+ s = pysam_bam_get_aux(src)
+
+ # check if there is a direct path from buffer.raw to temp
+ p = buffer.raw
+ # create handle to make sure buffer stays alive long
+ # enough for memcpy, see issue 129
+ temp = p
+ memcpy(s, temp, new_size)
+
+
+ ########################################################
+ # Compatibility Accessors
+ # Functions, properties for compatibility with pysam < 0.8
+ #
+ # Several options were considered:
+ # 1. change the factory functions according to API
+ #    * requires code changes throughout, incl. passing
+ #      handles to factory functions
+ # 2. subclass functions and add attributes at runtime
+ #    e.g.: AlignedSegment.qname = AlignedSegment.query_name
+ #    * will slow down the default interface
+ # 3. explicit declaration of getters/setters (the approach taken here)
+ ########################################################
+ property qname:
+ """deprecated, use query_name instead"""
+ def __get__(self): return self.query_name
+ def __set__(self, v): self.query_name = v
+ property tid:
+ """deprecated, use reference_id instead"""
+ def __get__(self): return self.reference_id
+ def __set__(self, v): self.reference_id = v
+ property pos:
+ """deprecated, use reference_start instead"""
+ def __get__(self): return self.reference_start
+ def __set__(self, v): self.reference_start = v
+ property mapq:
+ """deprecated, use mapping_quality instead"""
+ def __get__(self): return self.mapping_quality
+ def __set__(self, v): self.mapping_quality = v
+ property rnext:
+ """deprecated, use next_reference_id instead"""
+ def __get__(self): return self.next_reference_id
+ def __set__(self, v): self.next_reference_id = v
+ property pnext:
+ """deprecated, use next_reference_start instead"""
+ def __get__(self):
+ return self.next_reference_start
+ def __set__(self, v):
+ self.next_reference_start = v
+ property cigar:
+ """deprecated, use cigartuples instead"""
+ def __get__(self):
+ r = self.cigartuples
+ if r is None:
+ r = []
+ return r
+ def __set__(self, v): self.cigartuples = v
+ property tlen:
+ """deprecated, use template_length instead"""
+ def __get__(self):
+ return self.template_length
+ def __set__(self, v):
+ self.template_length = v
+ property seq:
+ """deprecated, use query_sequence instead"""
+ def __get__(self):
+ return self.query_sequence
+ def __set__(self, v):
+ self.query_sequence = v
+ property qual:
+ """deprecated, query_qualities instead"""
+ def __get__(self):
+ return array_to_qualitystring(self.query_qualities)
+ def __set__(self, v):
+ self.query_qualities = qualitystring_to_array(v)
+ property alen:
+ """deprecated, reference_length instead"""
+ def __get__(self):
+ return self.reference_length
+ def __set__(self, v):
+ self.reference_length = v
+ property aend:
+ """deprecated, reference_end instead"""
+ def __get__(self):
+ return self.reference_end
+ def __set__(self, v):
+ self.reference_end = v
+ property rlen:
+ """deprecated, query_length instead"""
+ def __get__(self):
+ return self.query_length
+ def __set__(self, v):
+ self.query_length = v
+ property query:
+ """deprecated, query_alignment_sequence instead"""
+ def __get__(self):
+ return self.query_alignment_sequence
+ def __set__(self, v):
+ self.query_alignment_sequence = v
+ property qqual:
+ """deprecated, query_alignment_qualities instead"""
+ def __get__(self):
+ return array_to_qualitystring(self.query_alignment_qualities)
+ def __set__(self, v):
+ self.query_alignment_qualities = qualitystring_to_array(v)
+ property qstart:
+ """deprecated, use query_alignment_start instead"""
+ def __get__(self):
+ return self.query_alignment_start
+ def __set__(self, v):
+ self.query_alignment_start = v
+ property qend:
+ """deprecated, use query_alignment_end instead"""
+ def __get__(self):
+ return self.query_alignment_end
+ def __set__(self, v):
+ self.query_alignment_end = v
+ property qlen:
+ """deprecated, use query_alignment_length instead"""
+ def __get__(self):
+ return self.query_alignment_length
+ def __set__(self, v):
+ self.query_alignment_length = v
+ property mrnm:
+ """deprecated, use next_reference_id instead"""
+ def __get__(self):
+ return self.next_reference_id
+ def __set__(self, v):
+ self.next_reference_id = v
+ property mpos:
+ """deprecated, use next_reference_start instead"""
+ def __get__(self):
+ return self.next_reference_start
+ def __set__(self, v):
+ self.next_reference_start = v
+ property rname:
+ """deprecated, use reference_id instead"""
+ def __get__(self):
+ return self.reference_id
+ def __set__(self, v):
+ self.reference_id = v
+ property isize:
+ """deprecated, use template_length instead"""
+ def __get__(self):
+ return self.template_length
+ def __set__(self, v):
+ self.template_length = v
+ property blocks:
+ """deprecated, use get_blocks() instead"""
+ def __get__(self):
+ return self.get_blocks()
+ property aligned_pairs:
+ """deprecated, use get_aligned_pairs() instead"""
+ def __get__(self):
+ return self.get_aligned_pairs()
+ property inferred_length:
+ """deprecated, use infer_query_length() instead"""
+ def __get__(self):
+ return self.infer_query_length()
+ property positions:
+ """deprecated, use get_reference_positions() instead"""
+ def __get__(self):
+ return self.get_reference_positions()
+ property tags:
+ """deprecated, use get_tags() instead"""
+ def __get__(self):
+ return self.get_tags()
+ def __set__(self, tags):
+ self.set_tags(tags)
+ def overlap(self):
+ """deprecated, use get_overlap() instead"""
+ return self.get_overlap()
+ def opt(self, tag):
+ """deprecated, use get_tag() instead"""
+ return self.get_tag(tag)
+ def setTag(self, tag, value, value_type=None, replace=True):
+ """deprecated, use set_tag() instead"""
+ return self.set_tag(tag, value, value_type, replace)
+
+
+cdef class PileupColumn:
+ '''A pileup of reads at a particular reference sequence position
+ (:term:`column`). A pileup column contains all the reads that map
+ to a certain target base.
+
+ This class is a proxy for results returned by the samtools pileup
+ engine. If the underlying engine iterator advances, the results
+ of this column will change.
+
+ '''
+ def __init__(self):
+ raise TypeError("this class cannot be instantiated from Python")
+
+ def __str__(self):
+ return "\t".join(map(str,
+ (self.reference_id,
+ self.reference_pos,
+ self.nsegments))) +\
+ "\n" +\
+ "\n".join(map(str, self.pileups))
+
+ property reference_id:
+ '''the reference sequence number as defined in the header'''
+ def __get__(self):
+ return self.tid
+
+ property reference_name:
+ """:term:`reference` name (None if no AlignmentFile is associated)"""
+ def __get__(self):
+ if self._alignment_file is not None:
+ return self._alignment_file.getrname(self.tid)
+ return None
+
+ property nsegments:
+ '''number of reads mapping to this column.'''
+ def __get__(self):
+ return self.n_pu
+ def __set__(self, n):
+ self.n_pu = n
+
+ property reference_pos:
+ '''the position in the reference sequence (0-based).'''
+ def __get__(self):
+ return self.pos
+
+ property pileups:
+ '''list of reads (:class:`pysam.PileupRead`) aligned to this column'''
+ def __get__(self):
+ cdef int x
+ pileups = []
+
+ if self.plp == NULL or self.plp[0] == NULL:
+ raise ValueError("PileupColumn accessed after iterator finished")
+
+ # warning: there could be problems if self.n_pu and self.plp
+ # are out of sync.
+ for x from 0 <= x < self.n_pu:
+ pileups.append(makePileupRead(&(self.plp[0][x]),
+ self._alignment_file))
+ return pileups
+
+ ########################################################
+ # Compatibility Accessors
+ # Functions, properties for compatibility with pysam < 0.8
+ ########################################################
+ property pos:
+ def __get__(self):
+ return self.reference_pos
+ def __set__(self, v):
+ self.reference_pos = v
+
+ property tid:
+ def __get__(self):
+ return self.reference_id
+ def __set__(self, v):
+ self.reference_id = v
+
+ property n:
+ def __get__(self):
+ return self.nsegments
+ def __set__(self, v):
+ self.nsegments = v
+
+
+cdef class PileupRead:
+ '''Representation of a read aligned to a particular position in the
+ reference sequence.
+
+ '''
+
+ def __init__(self):
+ raise TypeError(
+ "this class cannot be instantiated from Python")
+
+ def __str__(self):
+ return "\t".join(
+ map(str,
+ (self.alignment, self.query_position,
+ self.indel, self.level,
+ self.is_del, self.is_head,
+ self.is_tail, self.is_refskip)))
+
+ property alignment:
+ """a :class:`pysam.AlignedSegment` object of the aligned read"""
+ def __get__(self):
+ return self._alignment
+
+ property query_position:
+ """position of the read base at the pileup site, 0-based.
+ None if is_del or is_refskip is set.
+
+ """
+ def __get__(self):
+ if self.is_del or self.is_refskip:
+ return None
+ else:
+ return self._qpos
+
+ property query_position_or_next:
+ """position of the read base at the pileup site, 0-based.
+
+ If the current position is a deletion, returns the next
+ aligned base.
+
+ """
+ def __get__(self):
+ return self._qpos
+
+ property indel:
+ """indel length for the position following the current pileup site.
+
+ This quantity peeks ahead to the next cigar operation in this
+ alignment. If the next operation is an insertion, indel will
+ be positive. If the next operation is a deletion, it will be
+ negative. 0 if the next operation is not an indel.
+
+ """
+ def __get__(self):
+ return self._indel
+
+ property level:
+ """the level of the read in the "viewer" mode. Note that this value
+ is currently not computed."""
+ def __get__(self):
+ return self._level
+
+ property is_del:
+ """1 iff the base on the padded read is a deletion"""
+ def __get__(self):
+ return self._is_del
+
+ property is_head:
+ """1 iff the base on the padded read is the left-most base."""
+ def __get__(self):
+ return self._is_head
+
+ property is_tail:
+ """1 iff the base on the padded read is the right-most base."""
+ def __get__(self):
+ return self._is_tail
+
+ property is_refskip:
+ """1 iff the base on the padded read is part of CIGAR N op."""
+ def __get__(self):
+ return self._is_refskip
+
+__all__ = [
+ "AlignedSegment",
+ "PileupColumn",
+ "PileupRead"]
--- /dev/null
+from libc.stdint cimport int8_t, int16_t, int32_t, int64_t
+from libc.stdint cimport uint8_t, uint16_t, uint32_t, uint64_t
+from libc.stdlib cimport malloc, calloc, realloc, free
+from libc.string cimport memcpy, memcmp, strncpy, strlen, strdup
+from libc.stdio cimport FILE, printf
+
+from pysam.libcfaidx cimport faidx_t, Fastafile
+from pysam.libcalignedsegment cimport AlignedSegment
+from pysam.libchtslib cimport *
+
+from cpython cimport array
+cimport cython
+
+cdef extern from *:
+ ctypedef char* const_char_ptr "const char*"
+
+cdef extern from "htslib_util.h":
+
+ char * pysam_bam_get_qname(bam1_t * b)
+
+cdef extern from "samfile_util.h":
+
+ int bam_cap_mapQ(bam1_t *b, char *ref, int thres)
+ int bam_prob_realn(bam1_t *b, const char *ref)
+
+####################################################################
+# Utility types
+
+ctypedef struct __iterdata:
+ htsFile * htsfile
+ bam_hdr_t * header
+ hts_itr_t * iter
+ faidx_t * fastafile
+ int tid
+ char * seq
+ int seq_len
+
+
+cdef class AlignmentFile(HTSFile):
+ cdef readonly object reference_filename
+
+ # pointer to index
+ cdef hts_idx_t *index
+ # header structure
+ cdef bam_hdr_t * header
+
+ # current read within iteration
+ cdef bam1_t * b
+
+ cdef bam1_t * getCurrent(self)
+ cdef int cnext(self)
+
+ # write an aligned read
+ cpdef int write(self, AlignedSegment read) except -1
+
+
+cdef class PileupColumn:
+ cdef bam_pileup1_t ** plp
+ cdef int tid
+ cdef int pos
+ cdef int n_pu
+
+
+cdef class PileupRead:
+ cdef AlignedSegment _alignment
+ cdef int32_t _qpos
+ cdef int _indel
+ cdef int _level
+ cdef uint32_t _is_del
+ cdef uint32_t _is_head
+ cdef uint32_t _is_tail
+ cdef uint32_t _is_refskip
+
+
+cdef class IteratorRow:
+ cdef int retval
+ cdef bam1_t * b
+ cdef AlignmentFile samfile
+ cdef htsFile * htsfile
+ cdef bam_hdr_t * header
+ cdef int owns_samfile
+
+
+cdef class IteratorRowRegion(IteratorRow):
+ cdef hts_itr_t * iter
+ cdef bam1_t * getCurrent(self)
+ cdef int cnext(self)
+
+cdef class IteratorRowHead(IteratorRow):
+ cdef int max_rows
+ cdef int current_row
+ cdef bam1_t * getCurrent(self)
+ cdef int cnext(self)
+
+cdef class IteratorRowAll(IteratorRow):
+ cdef bam1_t * getCurrent(self)
+ cdef int cnext(self)
+
+
+cdef class IteratorRowAllRefs(IteratorRow):
+ cdef int tid
+ cdef IteratorRowRegion rowiter
+
+
+cdef class IteratorRowSelection(IteratorRow):
+ cdef int current_pos
+ cdef positions
+ cdef bam1_t * getCurrent(self)
+ cdef int cnext(self)
+
+
+cdef class IteratorColumn:
+
+ # result of the last plbuf_push
+ cdef IteratorRowRegion iter
+ cdef int tid
+ cdef int pos
+ cdef int n_plp
+ cdef int mask
+ cdef bam_pileup1_t * plp
+ cdef bam_plp_t pileup_iter
+ cdef __iterdata iterdata
+ cdef AlignmentFile samfile
+ cdef Fastafile fastafile
+ cdef stepper
+ cdef int max_depth
+
+ cdef int cnext(self)
+ cdef char * getSequence(self)
+ cdef setMask(self, mask)
+ cdef setupIteratorData(self,
+ int tid,
+ int start,
+ int end,
+ int multiple_iterators=?)
+
+ cdef reset(self, tid, start, end)
+ cdef _free_pileup_iter(self)
+
+
+cdef class IteratorColumnRegion(IteratorColumn):
+ cdef int start
+ cdef int end
+ cdef int truncate
+
+
+cdef class IteratorColumnAllRefs(IteratorColumn):
+ pass
+
+
+cdef class IndexedReads:
+ cdef AlignmentFile samfile
+ cdef htsFile * htsfile
+ cdef index
+ cdef int owns_samfile
+ cdef bam_hdr_t * header
--- /dev/null
+# cython: embedsignature=True
+# cython: profile=True
+########################################################
+########################################################
+# Cython wrapper for SAM/BAM/CRAM files based on htslib
+########################################################
+# The principal classes defined in this module are:
+#
+# class AlignmentFile read/write access to SAM/BAM/CRAM formatted files
+#
+# class IndexedReads index a SAM/BAM/CRAM file by query name while keeping
+# the original sort order intact
+#
+# Additionally, this module defines numerous classes that
+# are part of the internal API. These are:
+#
+# Various iterator classes to iterate over alignments in sequential
+# (IteratorRow) or in a stacked fashion (IteratorColumn):
+#
+# class IteratorRow
+# class IteratorRowRegion
+# class IteratorRowHead
+# class IteratorRowAll
+# class IteratorRowAllRefs
+# class IteratorRowSelection
+# class IteratorColumn
+# class IteratorColumnRegion
+# class IteratorColumnAllRefs
+#
+########################################################
+#
+# The MIT License
+#
+# Copyright (c) 2015 Andreas Heger
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+#
+########################################################
+import os
+import collections
+import re
+import warnings
+import array
+
+from libc.errno cimport errno, EPIPE
+from libc.string cimport strcmp, strpbrk, strerror
+from cpython cimport array as c_array
+from cpython.version cimport PY_MAJOR_VERSION
+
+from pysam.libcutils cimport force_bytes, force_str, charptr_to_str
+from pysam.libcutils cimport encode_filename, from_string_and_size
+from pysam.libcalignedsegment cimport makeAlignedSegment, makePileupColumn
+from pysam.libchtslib cimport HTSFile, hisremote
+
+if PY_MAJOR_VERSION >= 3:
+ from io import StringIO
+else:
+ from StringIO import StringIO
+
+cimport cython
+
+########################################################
+## Constants and global variables
+
+# defines imported from samtools
+DEF SEEK_SET = 0
+DEF SEEK_CUR = 1
+DEF SEEK_END = 2
+
+# maximum genomic coordinate
+cdef int MAX_POS = 2 << 29
+
+# valid types for SAM headers
+VALID_HEADER_TYPES = {"HD" : dict,
+ "SQ" : list,
+ "RG" : list,
+ "PG" : list,
+ "CO" : list}
+
+# order of records within SAM headers
+VALID_HEADERS = ("HD", "SQ", "RG", "PG", "CO")
+
+# default type conversions within SAM header records
+KNOWN_HEADER_FIELDS = {"HD" : {"VN" : str, "SO" : str, "GO" : str},
+ "SQ" : {"SN" : str, "LN" : int, "AS" : str,
+ "M5" : str, "SP" : str, "UR" : str,
+ "AH" : str,},
+ "RG" : {"ID" : str, "CN" : str, "DS" : str,
+ "DT" : str, "FO" : str, "KS" : str,
+ "LB" : str, "PG" : str, "PI" : str,
+ "PL" : str, "PM" : str, "PU" : str,
+ "SM" : str,},
+ "PG" : {"ID" : str, "PN" : str, "CL" : str,
+ "PP" : str, "DS" : str, "VN" : str,},}
+
+# output order of fields within records. Ensure that CL is at
+# the end, as parsing a CL field will ignore any subsequent fields.
+VALID_HEADER_ORDER = {"HD" : ("VN", "SO", "GO"),
+ "SQ" : ("SN", "LN", "AS", "M5",
+ "UR", "SP", "AH"),
+ "RG" : ("ID", "CN", "SM", "LB",
+ "PU", "PI", "DT", "DS",
+ "PL", "FO", "KS", "PG",
+ "PM"),
+ "PG" : ("PN", "ID", "VN", "PP",
+ "DS", "CL"),}
+
+
+def build_header_line(fields, record):
+ '''build a header line from `fields` dictionary for `record`'''
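+ # For example (illustrative):
+ #   build_header_line({"VN": "1.0", "SO": "coordinate"}, "HD")
+ #   returns "@HD\tVN:1.0\tSO:coordinate"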
+
+ # TODO: add checking for field and sort order
+ line = ["@%s" % record]
+ # comment
+ if record == "CO":
+ line.append(fields)
+ # user tags
+ elif record.islower():
+ for key in sorted(fields):
+ line.append("%s:%s" % (key, str(fields[key])))
+ # defined tags
+ else:
+ # write fields of the specification
+ for key in VALID_HEADER_ORDER[record]:
+ if key in fields:
+ line.append("%s:%s" % (key, str(fields[key])))
+ # write user fields
+ for key in fields:
+ if not key.isupper():
+ line.append("%s:%s" % (key, str(fields[key])))
+
+ return "\t".join(line)
+
+cdef bam_hdr_t * build_header(new_header):
+ '''return a new header built from a dictionary in `new_header`.
+
+ This method inserts the text field, target_name and target_len.
+ '''
+
+ lines = []
+
+ # check if hash exists
+
+ # create new header and copy old data
+ cdef bam_hdr_t * dest
+
+ dest = bam_hdr_init()
+
+ # first: defined tags
+ for record in VALID_HEADERS:
+ if record in new_header:
+ ttype = VALID_HEADER_TYPES[record]
+ data = new_header[record]
+ if type(data) != type(ttype()):
+ raise ValueError(
+ "invalid type for record %s: %s, expected %s" %
+ (record, type(data), type(ttype())))
+ if type(data) is dict:
+ lines.append(build_header_line(data, record))
+ else:
+ for fields in new_header[record]:
+ lines.append(build_header_line(fields, record))
+
+ # then: user tags (lower case), sorted alphabetically
+ for record, data in sorted(new_header.items()):
+ if record in VALID_HEADERS: continue
+ if type(data) is dict:
+ lines.append(build_header_line(data, record))
+ else:
+ for fields in new_header[record]:
+ lines.append(build_header_line(fields, record))
+
+ text = "\n".join(lines) + "\n"
+ if dest.text != NULL: free( dest.text )
+ dest.text = <char*>calloc(len(text), sizeof(char))
+ dest.l_text = len(text)
+ cdef bytes btext = text.encode('ascii')
+ strncpy(dest.text, btext, dest.l_text)
+
+ cdef bytes bseqname
+ # collect targets
+ if "SQ" in new_header:
+ seqs = []
+ for fields in new_header["SQ"]:
+ try:
+ seqs.append( (fields["SN"], fields["LN"] ) )
+ except KeyError:
+ raise KeyError( "incomplete sequence information in '%s'" % str(fields))
+
+ dest.n_targets = len(seqs)
+ dest.target_name = <char**>calloc(dest.n_targets, sizeof(char*))
+ dest.target_len = <uint32_t*>calloc(dest.n_targets, sizeof(uint32_t))
+
+ for x from 0 <= x < dest.n_targets:
+ seqname, seqlen = seqs[x]
+ dest.target_name[x] = <char*>calloc(
+ len(seqname) + 1, sizeof(char))
+ bseqname = seqname.encode('ascii')
+ strncpy(dest.target_name[x], bseqname,
+ len(seqname) + 1)
+ dest.target_len[x] = seqlen
+
+ return dest
+
+
+cdef class AlignmentFile(HTSFile):
+ """AlignmentFile(filepath_or_object, mode=None, template=None,
+ reference_names=None, reference_lengths=None, text=NULL,
+ header=None, add_sq_text=False, check_header=True, check_sq=True,
+ reference_filename=None, filename=None, duplicate_filehandle=True)
+
+ A :term:`SAM`/:term:`BAM` formatted file.
+
+ If `filepath_or_object` is a string, the file is automatically
+ opened. If `filepath_or_object` is a python File object, the
+ already opened file will be used.
+
+ If the file is opened for reading and an index for a BAM file exists
+ (.bai), it will be opened automatically. Without an index, random
+ access via :meth:`~pysam.AlignmentFile.fetch` and
+ :meth:`~pysam.AlignmentFile.pileup` is disabled.
+
+ For writing, the header of a :term:`SAM` file/:term:`BAM` file can
+ be constituted from several sources (see also the samtools format
+ specification):
+
+ 1. If `template` is given, the header is copied from another
+ `AlignmentFile` (`template` must be a
+ :class:`~pysam.AlignmentFile`).
+
+ 2. If `header` is given, the header is built from a
+ multi-level dictionary.
+
+ 3. If `text` is given, new header text is copied from raw
+ text.
+
+ 4. The names (`reference_names`) and lengths
+ (`reference_lengths`) are supplied directly as lists.
+
+ When reading or writing a CRAM file, the filename of a FASTA-formatted
+ reference can be specified with `reference_filename`.
+
+ By default, if a file is opened in mode 'r', it is checked
+ for a valid header (`check_header` = True) and a definition of
+ chromosome names (`check_sq` = True).
+
+ Parameters
+ ----------
+ mode : string
+ `mode` should be ``r`` for reading or ``w`` for writing. The
+ default is text mode (:term:`SAM`). For binary (:term:`BAM`)
+ I/O you should append ``b`` for compressed or ``u`` for
+ uncompressed :term:`BAM` output. Use ``h`` to output header
+ information in text (:term:`TAM`) mode. Use ``c`` for
+ :term:`CRAM` formatted files.
+
+ If ``b`` is present, it must immediately follow ``r`` or
+ ``w``. Valid modes are ``r``, ``w``, ``wh``, ``rb``, ``wb``,
+ ``wbu``, ``wb0``, ``rc`` and ``wc``. For instance, to open a
+ :term:`BAM` formatted file for reading, type::
+
+ f = pysam.AlignmentFile('ex1.bam','rb')
+
+ If mode is not specified, the method will try to auto-detect
+ in the order 'rb', 'r', thus both the following should work::
+
+ f1 = pysam.AlignmentFile('ex1.bam')
+ f2 = pysam.AlignmentFile('ex1.sam')
+
+ template : AlignmentFile
+ when writing, copy header from `template`.
+
+ header : dict
+ when writing, build header from a multi-level dictionary. The
+ first level are the four types ('HD', 'SQ', ...). The second
+ level are a list of lines, with each line being a list of
+ tag-value pairs. The header is constructed first from all the
+ defined fields, followed by user tags in alphabetical order.
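+
+ For example, a minimal header might be::
+
+     header = {'HD': {'VN': '1.0'},
+               'SQ': [{'SN': 'chr1', 'LN': 1575}]}
+     outf = pysam.AlignmentFile("out.bam", "wb", header=header)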
+
+ text : string
+ when writing, use the string provided as the header
+
+ reference_names : list
+ see reference_lengths
+
+ reference_lengths : list
+ when writing, build header from list of chromosome names and
+ lengths. By default, 'SQ' and 'LN' tags will be added to the
+ header text. This option can be changed by unsetting the flag
+ `add_sq_text`.
+
+ add_sq_text : bool
+ if set to False, do not add 'SQ' and 'LN' tags to the header. This
+ permits constructing :term:`SAM` formatted files without a header.
+
+ check_header : bool
+ when reading, check if header is present (default=True)
+
+ check_sq : bool
+ when reading, check if SQ entries are present in header
+ (default=True)
+
+ reference_filename : string
+ Path to a FASTA-formatted reference file. Valid only for CRAM files.
+ When reading a CRAM file, this overrides both ``$REF_PATH`` and the URL
+ specified in the header (``UR`` tag), which are normally used to find
+ the reference.
+
+ filename : string
+ Alternative to filepath_or_object. Filename of the file
+ to be opened.
+
+ duplicate_filehandle: bool
+ By default, file handles passed either directly or through
+ File-like objects will be duplicated before passing them to
+ htslib. The duplication prevents issues where the same stream
+ will be closed by htslib and through destruction of the
+ high-level python object. Set to False to turn off
+ duplication.
+
+ """
+
+ def __cinit__(self, *args, **kwargs):
+ self.htsfile = NULL
+ self.filename = None
+ self.mode = None
+ self.is_stream = False
+ self.is_remote = False
+ self.index = NULL
+
+ if "filename" in kwargs:
+ args = [kwargs["filename"]]
+ del kwargs["filename"]
+
+ self._open(*args, **kwargs)
+
+ # allocate memory for iterator
+ self.b = <bam1_t*>calloc(1, sizeof(bam1_t))
+
+ def has_index(self):
+ """return true if htsfile has an existing (and opened) index.
+ """
+ return self.index != NULL
+
+ def check_index(self):
+ """return True if index is present.
+
+ Raises
+ ------
+
+ AttributeError
+ if htsfile is :term:`SAM` formatted and thus has no index.
+
+ ValueError
+ if htsfile is closed or index could not be opened.
+ """
+
+ if not self.is_open:
+ raise ValueError("I/O operation on closed file")
+ if not self.is_bam and not self.is_cram:
+ raise AttributeError(
+ "AlignmentFile.check_index only available for BAM and CRAM files")
+ if self.index == NULL:
+ raise ValueError(
+ "mapping information not recorded in index "
+ "or index not available")
+ return True
+
+ def _open(self,
+ filepath_or_object,
+ mode=None,
+ AlignmentFile template=None,
+ reference_names=None,
+ reference_lengths=None,
+ reference_filename=None,
+ text=None,
+ header=None,
+ port=None,
+ add_sq_text=True,
+ check_header=True,
+ check_sq=True,
+ filepath_index=None,
+ referencenames=None,
+ referencelengths=None,
+ duplicate_filehandle=True):
+ '''open a sam, bam or cram formatted file.
+
+ If _open is called on an existing file, the current file
+ will be closed and a new file will be opened.
+ '''
+ cdef char *cfilename = NULL
+ cdef char *creference_filename = NULL
+ cdef char *cindexname = NULL
+ cdef char *cmode = NULL
+
+ # for backwards compatibility:
+ if referencenames is not None:
+ reference_names = referencenames
+ if referencelengths is not None:
+ reference_lengths = referencelengths
+
+ # close a previously opened file
+ if self.is_open:
+ self.close()
+
+ # autodetection for read
+ if mode is None:
+ mode = "r"
+
+ assert mode in ("r", "w", "rb", "wb", "wh",
+ "wbu", "rU", "wb0",
+ "rc", "wc"), \
+ "invalid file opening mode `%s`" % mode
+
+ self.duplicate_filehandle = duplicate_filehandle
+
+ # StringIO not supported
+ if isinstance(filepath_or_object, StringIO):
+ raise NotImplementedError(
+ "access from StringIO objects not supported")
+ # reading from a file descriptor
+ elif isinstance(filepath_or_object, int):
+ self.filename = filepath_or_object
+ filename = None
+ self.is_remote = False
+ self.is_stream = True
+ # reading from a File object or other object with fileno
+ elif hasattr(filepath_or_object, "fileno"):
+ if filepath_or_object.closed:
+ raise ValueError('I/O operation on closed file')
+ self.filename = filepath_or_object
+ # .name can be TextIOWrapper
+ try:
+ filename = encode_filename(str(filepath_or_object.name))
+ cfilename = filename
+ except AttributeError:
+ filename = None
+ self.is_remote = False
+ self.is_stream = True
+ # what remains is a filename
+ else:
+ self.filename = filename = encode_filename(filepath_or_object)
+ cfilename = filename
+ self.is_remote = hisremote(cfilename)
+ self.is_stream = self.filename == b'-'
+
+ # for htslib, wbu seems to not work
+ if mode == "wbu":
+ mode = "wb0"
+
+ self.mode = force_bytes(mode)
+ self.reference_filename = reference_filename = encode_filename(
+ reference_filename)
+
+ cdef char * ctext
+ cdef hFILE * fp
+ ctext = NULL
+
+ if mode[0] == 'w':
+ # open file for writing
+
+ # header structure (used for writing)
+ if template:
+ self.header = bam_hdr_dup(template.header)
+ elif header:
+ self.header = build_header(header)
+ else:
+ # build header from a target names and lengths
+ assert reference_names and reference_lengths, \
+ ("either supply options `template`, `header` "
+ "or both `reference_names` and `reference_lengths` "
+ "for writing")
+ assert len(reference_names) == len(reference_lengths), \
+ "unequal names and lengths of reference sequences"
+
+ # allocate and fill header
+ reference_names = [force_bytes(ref) for ref in reference_names]
+ self.header = bam_hdr_init()
+ self.header.n_targets = len(reference_names)
+ n = 0
+ for x in reference_names:
+ n += len(x) + 1
+ self.header.target_name = <char**>calloc(n, sizeof(char*))
+ self.header.target_len = <uint32_t*>calloc(n, sizeof(uint32_t))
+ for x from 0 <= x < self.header.n_targets:
+ self.header.target_len[x] = reference_lengths[x]
+ name = reference_names[x]
+ self.header.target_name[x] = <char*>calloc(
+ len(name) + 1, sizeof(char))
+ strncpy(self.header.target_name[x], name, len(name))
+
+ # Optionally, if there is no text, add a SAM
+ # compatible header to output file.
+ if text is None and add_sq_text:
+ text = []
+ for x from 0 <= x < self.header.n_targets:
+ text.append("@SQ\tSN:%s\tLN:%s\n" % \
+ (force_str(reference_names[x]),
+ reference_lengths[x]))
+ text = ''.join(text)
+
+ if text is not None:
+ # copy without \0
+ text = force_bytes(text)
+ ctext = text
+ self.header.l_text = strlen(ctext)
+ self.header.text = <char*>calloc(
+ strlen(ctext), sizeof(char))
+ memcpy(self.header.text, ctext, strlen(ctext))
+
+ self.htsfile = self._open_htsfile()
+
+ # set filename with reference sequences. If no filename
+ # is given, the CRAM reference arrays will be built from
+ # the @SQ records in the header
+ if "c" in mode and reference_filename:
+ # note that fn_aux takes ownership, so create a copy
+ self.htsfile.fn_aux = strdup(self.reference_filename)
+
+ # write header to htsfile
+ if "b" in mode or "c" in mode or "h" in mode:
+ with nogil:
+ sam_hdr_write(self.htsfile, self.header)
+
+ elif mode[0] == "r":
+ # open file for reading
+ if not self._exists():
+ raise IOError("file `%s` not found" % self.filename)
+
+ self.htsfile = self._open_htsfile()
+
+ if self.htsfile == NULL:
+ raise ValueError(
+ "could not open file (mode='%s') - "
+ "is it SAM/BAM format?" % mode)
+
+ if self.htsfile.format.category != sequence_data:
+ raise ValueError("file does not contain alignment data")
+
+ # bam files require a valid header
+ if self.is_bam or self.is_cram:
+ with nogil:
+ self.header = sam_hdr_read(self.htsfile)
+ if self.header == NULL:
+ raise ValueError(
+ "file does not have valid header (mode='%s') "
+ "- is it BAM format?" % mode )
+ else:
+ # in sam files it is optional (htsfile full of
+ # unmapped reads)
+ if check_header:
+ with nogil:
+ self.header = sam_hdr_read(self.htsfile)
+ if self.header == NULL:
+ raise ValueError(
+ "file does not have valid header (mode='%s') "
+ "- is it SAM format?" % mode )
+ # self.header.ignore_sam_err = True
+
+ # set filename with reference sequences
+ if self.is_cram and reference_filename:
+ creference_filename = self.reference_filename
+ hts_set_opt(self.htsfile,
+ CRAM_OPT_REFERENCE,
+ creference_filename)
+
+ if check_sq and self.header.n_targets == 0:
+ raise ValueError(
+ ("file has no sequences defined (mode='%s') - "
+ "is it SAM/BAM format? Consider opening with "
+ "check_sq=False") % mode)
+
+ assert self.htsfile != NULL
+
+ # check for index and open if present
+ cdef int format_index = -1
+ if self.is_bam:
+ format_index = HTS_FMT_BAI
+ elif self.is_cram:
+ format_index = HTS_FMT_CRAI
+
+ if mode[0] == "r" and (self.is_bam or self.is_cram):
+ # open index for remote files
+ if self.is_remote and not filepath_index:
+ with nogil:
+ self.index = hts_idx_load(cfilename, format_index)
+ if self.index == NULL:
+ warnings.warn(
+ "unable to open remote index for '%s'" % cfilename)
+ else:
+ has_index = True
+ if filepath_index:
+ if not os.path.exists(filepath_index):
+ warnings.warn(
+ "unable to open index at %s" % cfilename)
+ self.index = NULL
+ has_index = False
+ elif filename is not None:
+ if self.is_bam \
+ and not os.path.exists(filename + b".bai") \
+ and not os.path.exists(filename[:-4] + b".bai") \
+ and not os.path.exists(filename + b".csi") \
+ and not os.path.exists(filename[:-4] + b".csi"):
+ self.index = NULL
+ has_index = False
+ elif self.is_cram \
+ and not os.path.exists(filename + b".crai") \
+ and not os.path.exists(filename[:-5] + b".crai"):
+ self.index = NULL
+ has_index = False
+ else:
+ self.index = NULL
+ has_index = False
+
+ if has_index:
+ # returns NULL if there is no index or index could
+ # not be opened
+ if filepath_index:
+ cindexname = filepath_index = encode_filename(filepath_index)
+ with nogil:
+ self.index = sam_index_load2(self.htsfile,
+ cfilename,
+ cindexname)
+ else:
+ with nogil:
+ self.index = sam_index_load(self.htsfile,
+ cfilename)
+ if self.index == NULL:
+ raise IOError(
+ "error while opening index for '%s'" %
+ filename)
+
+ # save start of data section
+ if not self.is_stream:
+ self.start_offset = self.tell()
+
+ def get_tid(self, reference):
+ """
+ return the numerical :term:`tid` corresponding to
+ :term:`reference`
+
+ returns -1 if reference is not known.
+ """
+ if not self.is_open:
+ raise ValueError("I/O operation on closed file")
+ reference = force_bytes(reference)
+ return bam_name2id(self.header, reference)
+
+ def get_reference_name(self, tid):
+ """
+ return :term:`reference` name corresponding to numerical :term:`tid`
+ """
+ if not self.is_open:
+ raise ValueError("I/O operation on closed file")
+ if not 0 <= tid < self.header.n_targets:
+ raise ValueError("reference_id %i out of range 0<=tid<%i" %
+ (tid, self.header.n_targets))
+ return charptr_to_str(self.header.target_name[tid])
+
+ def parse_region(self,
+ reference=None,
+ start=None,
+ end=None,
+ region=None,
+ tid=None):
+ """parse alternative ways to specify a genomic region. A region can
+ either be specified by :term:`reference`, `start` and
+ `end`. `start` and `end` denote 0-based, half-open
+ intervals.
+
+ Alternatively, a samtools :term:`region` string can be
+ supplied.
+
+ If any of the coordinates are missing they will be replaced by the
+ minimum (`start`) or maximum (`end`) coordinate.
+
+ Note that region strings are 1-based, while `start` and `end` denote
+ an interval in python coordinates.
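+
+ For example, ``parse_region(region="chr1:101-200")`` and
+ ``parse_region("chr1", 100, 200)`` describe the same interval.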
+
+ Returns
+ -------
+
+ tuple : a tuple of `flag`, :term:`tid`, `start` and `end`. The
+ flag indicates whether coordinates were supplied; if it is 0, no
+ coordinates were given and the genomic region covers the complete
+ genomic space.
+
+ Raises
+ ------
+
+ ValueError
+ for invalid or out of bounds regions.
+
+ """
+ cdef int rtid
+ cdef long long rstart
+ cdef long long rend
+
+ rtid = -1
+ rstart = 0
+ rend = MAX_POS
+ if start is not None:
+ try:
+ rstart = start
+ except OverflowError:
+ raise ValueError('start out of range (%i)' % start)
+
+ if end is not None:
+ try:
+ rend = end
+ except OverflowError:
+ raise ValueError('end out of range (%i)' % end)
+
+ if region:
+ region = force_str(region)
+ parts = re.split("[:-]", region)
+ reference = parts[0]
+ if len(parts) >= 2:
+ rstart = int(parts[1]) - 1
+ if len(parts) >= 3:
+ rend = int(parts[2])
+
+ if not reference:
+ return 0, 0, 0, 0
+
+ if tid is not None:
+ rtid = tid
+ else:
+ rtid = self.get_tid(reference)
+
+ if rtid < 0:
+ raise ValueError(
+ "invalid reference `%s`" % reference)
+ if rstart > rend:
+ raise ValueError(
+ 'invalid coordinates: start (%i) > end (%i)' % (rstart, rend))
+ if not 0 <= rstart < MAX_POS:
+ raise ValueError('start out of range (%i)' % rstart)
+ if not 0 <= rend <= MAX_POS:
+ raise ValueError('end out of range (%i)' % rend)
+
+ return 1, rtid, rstart, rend
+
+ def fetch(self,
+ reference=None,
+ start=None,
+ end=None,
+ region=None,
+ tid=None,
+ until_eof=False,
+ multiple_iterators=False):
+ """fetch reads aligned in a :term:`region`.
+
+ See :meth:`AlignmentFile.parse_region` for more information
+ on genomic regions.
+
+ Without a `reference` or `region` all mapped reads in the file
+ will be fetched. The reads will be returned ordered by reference
+ sequence, which will not necessarily be the order within the
+ file. This mode of iteration still requires an index. If there is
+ no index, use `until_eof=True`.
+
+ If only `reference` is set, all reads aligned to `reference`
+ will be fetched.
+
+ A :term:`SAM` file does not allow random access. If `region`
+ or `reference` are given, an exception is raised.
+
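+ For example (``samfile`` being an open, indexed
+ :class:`~pysam.AlignmentFile`)::
+
+     for read in samfile.fetch("chr1", 100, 120):
+         print(read.query_name)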
+
+ Parameters
+ ----------
+
+ until_eof : bool
+
+ If `until_eof` is True, all reads from the current file
+ position will be returned in order as they are within the
+ file. Using this option will also fetch unmapped reads.
+
+ multiple_iterators : bool
+
+ If `multiple_iterators` is True, multiple
+ iterators on the same file can be used at the same time. The
+ iterator returned will receive its own copy of a filehandle to
+ the file effectively re-opening the file. Re-opening a file
+ creates some overhead, so beware.
+
+ Returns
+ -------
+
+ An iterator over a collection of reads.
+
+ Raises
+ ------
+
+ ValueError
+ if the genomic coordinates are out of range or invalid or the
+ file does not permit random access to genomic coordinates.
+
+ """
+ cdef int rtid, rstart, rend, has_coord
+
+ if not self.is_open:
+ raise ValueError( "I/O operation on closed file" )
+
+ has_coord, rtid, rstart, rend = self.parse_region(
+ reference,
+ start,
+ end,
+ region,
+ tid)
+
+ # Turn off re-opening if htsfile is a stream
+ if self.is_stream:
+ multiple_iterators = False
+
+ if self.is_bam or self.is_cram:
+ if not until_eof and not self.is_remote:
+ if not self.has_index():
+ raise ValueError(
+ "fetch called on bamfile without index")
+
+ if has_coord:
+ return IteratorRowRegion(
+ self, rtid, rstart, rend,
+ multiple_iterators=multiple_iterators)
+ else:
+ if until_eof:
+ return IteratorRowAll(
+ self,
+ multiple_iterators=multiple_iterators)
+ else:
+ # AH: check - reason why no multiple_iterators for
+ # AllRefs?
+ return IteratorRowAllRefs(
+ self,
+ multiple_iterators=multiple_iterators)
+ else:
+ if has_coord:
+ raise ValueError(
+ "fetching by region is not available for sam files")
+
+ if self.header == NULL:
+ raise ValueError(
+ "fetch called for htsfile without header")
+
+ # check if targets are defined
+ # give a warning, as sam_read1 segfaults otherwise
+ if self.header.n_targets == 0:
+ warnings.warn("fetch called for htsfile without header")
+
+ return IteratorRowAll(self,
+ multiple_iterators=multiple_iterators)
+
+ def head(self, n, multiple_iterators=True):
+ '''return an iterator over the first n alignments.
+
+ This iterator is useful for inspecting the bam-file.
+
+ Parameters
+ ----------
+
+ multiple_iterators : bool
+
+ is set to True by default in order to
+ avoid changing the current file position.
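+
+ For example (illustrative)::
+
+     for read in samfile.head(10):
+         print(read.query_name)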
+
+ Returns
+ -------
+
+ an iterator over a collection of reads
+
+ '''
+ return IteratorRowHead(self, n,
+ multiple_iterators=multiple_iterators)
+
+ def mate(self, AlignedSegment read):
+ '''return the mate of :class:`~pysam.AlignedSegment` `read`.
+
+ .. note::
+
+ Calling this method will change the file position.
+ This might interfere with any iterators that have
+ not re-opened the file.
+
+ .. note::
+
+ This method is too slow for high-throughput processing.
+ If a read needs to be processed with its mate, work
+ from a read name sorted file or, better, cache reads.
+
+ Returns
+ -------
+
+ :class:`~pysam.AlignedSegment` : the mate
+
+ Raises
+ ------
+
+ ValueError
+ if the read is unpaired or the mate is unmapped
+
+ '''
+ cdef uint32_t flag = read._delegate.core.flag
+
+ if flag & BAM_FPAIRED == 0:
+ raise ValueError("read %s: is unpaired" %
+ (read.query_name))
+ if flag & BAM_FMUNMAP != 0:
+ raise ValueError("mate %s: is unmapped" %
+ (read.query_name))
+
+ # xor flags to get the other mate
+ cdef int x = BAM_FREAD1 + BAM_FREAD2
+ flag = (flag ^ x) & x
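+ # e.g. a read with BAM_FREAD1 set yields flag == BAM_FREAD2,
+ # i.e. the flag expected on the mate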
+
+ # Make sure to use a separate file to jump around
+ # to mate as otherwise the original file position
+ # will be lost
+ # The following code is not using the C API and
+ # could thus be made much quicker, for example
+ # by using tell and seek.
+ for mate in self.fetch(
+ read._delegate.core.mpos,
+ read._delegate.core.mpos + 1,
+ tid=read._delegate.core.mtid,
+ multiple_iterators=True):
+ if mate.flag & flag != 0 and \
+ mate.query_name == read.query_name:
+ break
+ else:
+ raise ValueError("mate not found")
+
+ return mate
+
+ def pileup(self,
+ reference=None,
+ start=None,
+ end=None,
+ region=None,
+ **kwargs):
+ """perform a :term:`pileup` within a :term:`region`. The region is
+ specified by :term:`reference`, 'start' and 'end' (using
+ 0-based indexing). Alternatively, a samtools 'region' string
+ can be supplied.
+
+ Without 'reference' or 'region' all reads will be used for the
+ pileup. The reads will be returned ordered by
+ :term:`reference` sequence, which will not necessarily be the
+ order within the file.
+
+ Note that :term:`SAM` formatted files do not allow random
+ access. In these files, if a 'region' or 'reference' are
+ given an exception is raised.
+
+ .. note::
+
+ *all* reads which overlap the region are returned. The
+ first base returned will be the first base of the first
+ read, *not* necessarily the first base of the region used
+ in the query.
+
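+ For example (``samfile`` being an open, indexed
+ :class:`~pysam.AlignmentFile`)::
+
+     for column in samfile.pileup("chr1", 100, 120):
+         print(column.reference_pos, column.nsegments)
+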
+ Parameters
+ ----------
+
+ stepper : string
+ The stepper controls how the iterator advances.
+ Possible options for the stepper are
+
+ ``all``
+ skip reads in which any of the following flags are set:
+ BAM_FUNMAP, BAM_FSECONDARY, BAM_FQCFAIL, BAM_FDUP
+
+ ``nofilter``
+ uses every single read
+
+ ``samtools``
+ same filter and read processing as in :term:`csamtools`
+ pileup. This requires a 'fastafile' to be given.
+
+
+ fastafile : :class:`~pysam.FastaFile` object.
+
+ This is required for some of the steppers.
+
+ max_depth : int
+ Maximum read depth permitted. The default limit is '8000'.
+
+ truncate : bool
+
+ By default, the samtools pileup engine outputs all reads
+ overlapping a region. If truncate is True and a region is
+ given, only columns in the exact region specified are
+ returned.
+
+ Returns
+ -------
+
+ an iterator over genomic positions.
+
+ """
+ cdef int rtid, rstart, rend, has_coord
+
+ if not self.is_open:
+ raise ValueError("I/O operation on closed file")
+
+ has_coord, rtid, rstart, rend = self.parse_region(
+ reference, start, end, region)
+
+ if self.is_bam or self.is_cram:
+ if not self.has_index():
+ raise ValueError("no index available for pileup")
+
+ if has_coord:
+ return IteratorColumnRegion(self,
+ tid=rtid,
+ start=rstart,
+ end=rend,
+ **kwargs )
+ else:
+ return IteratorColumnAllRefs(self, **kwargs )
+
+ else:
+ raise NotImplementedError(
+ "pileup of samfiles not implemented yet")
+
+ def count(self,
+ reference=None,
+ start=None,
+ end=None,
+ region=None,
+ until_eof=False,
+ read_callback="nofilter"):
+ '''count the number of reads in :term:`region`
+
+ The region is specified by :term:`reference`, `start` and
+ `end`. Alternatively, a :term:`samtools` :term:`region` string
+ can be supplied.
+
+ A :term:`SAM` file does not allow random access and if
+ `region` or `reference` are given, an exception is raised.
+
+ Parameters
+ ----------
+
+ reference : string
+ reference_name of the genomic region (chromosome)
+
+ start : int
+ start of the genomic region
+
+ end : int
+ end of the genomic region
+
+ region : string
+ a region string in samtools format.
+
+ until_eof : bool
+ count until the end of the file, possibly including
+ unmapped reads as well.
+
+ read_callback: string or function
+
+ select a call-back to ignore reads when counting. It can
+ be either a string with the following values:
+
+ ``all``
+ skip reads in which any of the following
+ flags are set: BAM_FUNMAP, BAM_FSECONDARY, BAM_FQCFAIL,
+ BAM_FDUP
+
+ ``nofilter``
+ uses every single read
+
+ Alternatively, `read_callback` can be a function
+ ``check_read(read)`` that should return True only for
+ those reads that shall be included in the counting.
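+
+ For example (illustrative)::
+
+     nreads = samfile.count("chr1", 100, 120)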
+
+ Raises
+ ------
+
+ ValueError
+ if the genomic coordinates are out of range or invalid.
+
+ '''
+ cdef AlignedSegment read
+ cdef long counter = 0
+
+ if not self.is_open:
+ raise ValueError("I/O operation on closed file")
+
+ cdef int filter_method = 0
+ if read_callback == "all":
+ filter_method = 1
+ elif read_callback == "nofilter":
+ filter_method = 2
+
+ for read in self.fetch(reference=reference,
+ start=start,
+ end=end,
+ region=region,
+ until_eof=until_eof):
+ # apply filter
+ if filter_method == 1:
+ # filter = "all"
+ if (read.flag & (0x4 | 0x100 | 0x200 | 0x400)):
+ continue
+ elif filter_method == 2:
+ # filter = "nofilter"
+ pass
+ else:
+ if not read_callback(read):
+ continue
+ counter += 1
+
+ return counter
+
+ @cython.boundscheck(False) # we do manual bounds checking
+ def count_coverage(self,
+ reference=None,
+ start=None,
+ end=None,
+ region=None,
+ quality_threshold=15,
+ read_callback='all'):
+ """count the coverage of genomic positions by reads in :term:`region`.
+
+ The region is specified by :term:`reference`, `start` and
+ `end`. Alternatively, a :term:`samtools` :term:`region` string
+ can be supplied. The coverage is computed per-base [ACGT].
+
+ Parameters
+ ----------
+
+ reference : string
+ reference_name of the genomic region (chromosome)
+
+ start : int
+ start of the genomic region
+
+ end : int
+ end of the genomic region
+
+ region : string
+ a region string.
+
+ quality_threshold : int
+ quality_threshold is the minimum quality score (in phred) a
+ base has to reach to be counted.
+
+ read_callback: string or function
+
+ select a call-back to ignore reads when counting. It can
+ be either a string with the following values:
+
+ ``all``
+ skip reads in which any of the following
+ flags are set: BAM_FUNMAP, BAM_FSECONDARY, BAM_FQCFAIL,
+ BAM_FDUP
+
+ ``nofilter``
+ uses every single read
+
+ Alternatively, `read_callback` can be a function
+ ``check_read(read)`` that should return True only for
+ those reads that shall be included in the counting.
+
+ Raises
+ ------
+
+ ValueError
+ if the genomic coordinates are out of range or invalid.
+
+ Returns
+ -------
+
+ four array.arrays of the same length in order A C G T : tuple
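+
+ For example (illustrative)::
+
+     count_a, count_c, count_g, count_t = samfile.count_coverage(
+         "chr1", 100, 120)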
+
+ """
+
+ cdef int _start = start
+ cdef int _stop = end
+ cdef int length = _stop - _start
+ cdef c_array.array int_array_template = array.array('L', [])
+ cdef c_array.array count_a
+ cdef c_array.array count_c
+ cdef c_array.array count_g
+ cdef c_array.array count_t
+ count_a = c_array.clone(int_array_template, length, zero=True)
+ count_c = c_array.clone(int_array_template, length, zero=True)
+ count_g = c_array.clone(int_array_template, length, zero=True)
+ count_t = c_array.clone(int_array_template, length, zero=True)
+
+ cdef AlignedSegment read
+ cdef cython.str seq
+ cdef c_array.array quality
+ cdef int qpos
+ cdef int refpos
+ cdef int c = 0
+ cdef int filter_method = 0
+ if read_callback == "all":
+ filter_method = 1
+ elif read_callback == "nofilter":
+ filter_method = 2
+
+ cdef int _threshold = quality_threshold
+ for read in self.fetch(reference=reference,
+ start=start,
+ end=end,
+ region=region):
+ # apply filter
+ if filter_method == 1:
+ # filter = "all"
+ if (read.flag & (0x4 | 0x100 | 0x200 | 0x400)):
+ continue
+ elif filter_method == 2:
+ # filter = "nofilter"
+ pass
+ else:
+ if not read_callback(read):
+ continue
+
+ # count
+ seq = read.seq
+ quality = read.query_qualities
+ for qpos, refpos in read.get_aligned_pairs(True):
+ if qpos is not None and refpos is not None and \
+ _start <= refpos < _stop:
+                    if quality[qpos] >= _threshold:
+                        if seq[qpos] == 'A':
+                            count_a.data.as_ulongs[refpos - _start] += 1
+                        elif seq[qpos] == 'C':
+                            count_c.data.as_ulongs[refpos - _start] += 1
+                        elif seq[qpos] == 'G':
+                            count_g.data.as_ulongs[refpos - _start] += 1
+                        elif seq[qpos] == 'T':
+                            count_t.data.as_ulongs[refpos - _start] += 1
+
+ return count_a, count_c, count_g, count_t
+
+ def find_introns(self, read_iterator):
+ """Return a dictionary {(start, stop): count}
+ Listing the intronic sites in the reads (identified by 'N' in the cigar strings),
+ and their support ( = number of reads ).
+
+ read_iterator can be the result of a .fetch(...) call.
+ Or it can be a generator filtering such reads. Example
+ samfile.find_introns((read for read in samfile.fetch(...) if read.is_reverse)
+ """
+ import collections
+ res = collections.Counter()
+ for r in read_iterator:
+ if 'N' in r.cigarstring:
+ last_read_pos = False
+ for read_loc, genome_loc in r.get_aligned_pairs():
+ if read_loc is None and last_read_pos:
+ start = genome_loc
+ elif read_loc and last_read_pos is None:
+                        stop = genome_loc  # we are right-exclusive, so this is correct
+ res[(start, stop)] += 1
+ del start
+ del stop
+ last_read_pos = read_loc
+ return res
+
+ def close(self):
+ '''
+ closes the :class:`pysam.AlignmentFile`.'''
+
+ if self.htsfile == NULL:
+ return
+
+ cdef int ret = hts_close(self.htsfile)
+ hts_idx_destroy(self.index)
+ self.htsfile = NULL
+
+ if ret < 0:
+ global errno
+ if errno == EPIPE:
+ errno = 0
+ else:
+ raise OSError(errno, force_str(strerror(errno)))
+
+ def __dealloc__(self):
+ # remember: dealloc cannot call other methods
+ # note: no doc string
+ # note: __del__ is not called.
+
+ # FIXME[kbj]: isn't self.close a method? I've been duplicating
+ # close within __dealloc__ (see BCFFile.__dealloc__). Not a pretty
+ # solution and perhaps unnecessary given that calling self.close has
+ # been working for years.
+        # AH: I have removed the call to close. Even though it is working,
+        # it seems to be dangerous according to the documentation, as the
+        # object may already be partially deconstructed.
+ cdef int ret = 0
+
+ if self.htsfile != NULL:
+ ret = hts_close(self.htsfile)
+ hts_idx_destroy(self.index);
+ self.htsfile = NULL
+
+ bam_destroy1(self.b)
+ if self.header != NULL:
+ bam_hdr_destroy(self.header)
+
+
+ if ret < 0:
+ global errno
+ if errno == EPIPE:
+ errno = 0
+ else:
+ raise OSError(errno, force_str(strerror(errno)))
+
+ cpdef int write(self, AlignedSegment read) except -1:
+ '''
+ write a single :class:`pysam.AlignedSegment` to disk.
+
+ Raises
+ ------
+        IOError
+ if the writing failed
+
+ Returns
+ -------
+
+ int : the number of bytes written. If the file is closed,
+ this will be 0.
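+
+        Example (a minimal sketch; assumes hypothetical files ``in.bam``
+        and ``out.bam``, copying all reads from one to the other)::
+
+            with pysam.AlignmentFile("in.bam", "rb") as inf, \
+                 pysam.AlignmentFile("out.bam", "wb", template=inf) as outf:
+                for read in inf:
+                    outf.write(read)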
+ '''
+ if not self.is_open:
+ return 0
+
+ cdef int ret
+
+ with nogil:
+ ret = sam_write1(self.htsfile,
+ self.header,
+ read._delegate)
+
+ # kbj: Still need to raise an exception with except -1. Otherwise
+ # when ret == -1 we get a "SystemError: error return without
+ # exception set".
+ if ret < 0:
+ raise IOError(
+ "sam_write1 failed with error code {}".format(ret))
+
+ return ret
+
+ # context manager interface
+ def __enter__(self):
+ return self
+
+ def __exit__(self, exc_type, exc_value, traceback):
+ self.close()
+ return False
+
+ ###############################################################
+ ###############################################################
+ ###############################################################
+ ## properties
+ ###############################################################
+ property nreferences:
+ """"int with the number of :term:`reference` sequences in the file.
+ This is a read-only attribute."""
+ def __get__(self):
+ if not self.is_open:
+ raise ValueError("I/O operation on closed file")
+ return self.header.n_targets
+
+ property references:
+ """tuple with the names of :term:`reference` sequences. This is a
+        read-only attribute."""
+ def __get__(self):
+ if not self.is_open: raise ValueError( "I/O operation on closed file" )
+ t = []
+ for x from 0 <= x < self.header.n_targets:
+ t.append(charptr_to_str(self.header.target_name[x]))
+ return tuple(t)
+
+ property lengths:
+ """tuple of the lengths of the :term:`reference` sequences. This is a
+ read-only attribute. The lengths are in the same order as
+ :attr:`pysam.AlignmentFile.references`
+
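+        Example (a minimal sketch)::
+
+            for name, length in zip(samfile.references, samfile.lengths):
+                print(name, length)
+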
+ """
+ def __get__(self):
+ if not self.is_open:
+ raise ValueError("I/O operation on closed file")
+ t = []
+ for x from 0 <= x < self.header.n_targets:
+ t.append(self.header.target_len[x])
+ return tuple(t)
+
+ property mapped:
+ """int with total number of mapped alignments according to the
+ statistics recorded in the index. This is a read-only
+ attribute.
+ """
+ def __get__(self):
+ self.check_index()
+ cdef int tid
+ cdef uint64_t total = 0
+ cdef uint64_t mapped, unmapped
+ for tid from 0 <= tid < self.header.n_targets:
+ with nogil:
+ hts_idx_get_stat(self.index, tid, &mapped, &unmapped)
+ total += mapped
+ return total
+
+ property unmapped:
+ """int with total number of unmapped reads according to the statistics
+        recorded in the index. This count includes reads without
+        coordinates. This is a read-only attribute.
+ """
+ def __get__(self):
+ self.check_index()
+ cdef int tid
+ cdef uint64_t total = hts_idx_get_n_no_coor(self.index)
+ cdef uint64_t mapped, unmapped
+ for tid from 0 <= tid < self.header.n_targets:
+ with nogil:
+ hts_idx_get_stat(self.index, tid, &mapped, &unmapped)
+ total += unmapped
+ return total
+
+ property nocoordinate:
+ """int with total number of reads without coordinates according to the
+ statistics recorded in the index. This is a read-only attribute.
+ """
+ def __get__(self):
+ self.check_index()
+ cdef uint64_t n
+ with nogil:
+ n = hts_idx_get_n_no_coor(self.index)
+ return n
+
+ property text:
+        '''full contents of the :term:`sam file` header as a string.
+
+ This is a read-only attribute.
+
+ See :attr:`pysam.AlignmentFile.header` to get a parsed
+ representation of the header.
+ '''
+ def __get__(self):
+ if not self.is_open:
+ raise ValueError( "I/O operation on closed file" )
+ return from_string_and_size(self.header.text, self.header.l_text)
+
+ property header:
+ """two-level dictionay with header information from the file.
+
+ This is a read-only attribute.
+
+ The first level contains the record (``HD``, ``SQ``, etc) and
+ the second level contains the fields (``VN``, ``LN``, etc).
+
+        The parser is validating and will raise an AssertionError if
+        it encounters any record or field tags that are not part of
+ the SAM specification. Use the
+ :attr:`pysam.AlignmentFile.text` attribute to get the unparsed
+ header.
+
+ The parsing follows the SAM format specification with the
+        exception of the ``CL`` field, which consumes the rest of the
+        header line irrespective of any additional fields.
+ This behaviour has been added to accommodate command line
+ options that contain characters that are not valid field
+ separators.
+
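+        Example (a minimal sketch; assumes a hypothetical ``ex1.bam``)::
+
+            with pysam.AlignmentFile("ex1.bam", "rb") as samfile:
+                hd = samfile.header
+                # hd might look like:
+                # {'HD': {'VN': '1.0'}, 'SQ': [{'SN': 'chr1', 'LN': 1575}]}
+                for sq in hd.get("SQ", []):
+                    print(sq["SN"], sq["LN"])
+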
+ """
+ def __get__(self):
+ if not self.is_open:
+ raise ValueError( "I/O operation on closed file" )
+
+ result = {}
+
+ if self.header.text != NULL:
+ # convert to python string (note: call self.text to
+ # create 0-terminated string)
+ t = self.text
+ for line in t.split("\n"):
+ if not line.strip(): continue
+ assert line.startswith("@"), \
+ "header line without '@': '%s'" % line
+ fields = line[1:].split("\t")
+ record = fields[0]
+ assert record in VALID_HEADER_TYPES, \
+ "header line with invalid type '%s': '%s'" % (record, line)
+
+ # treat comments
+ if record == "CO":
+ if record not in result:
+ result[record] = []
+ result[record].append("\t".join( fields[1:]))
+ continue
+ # the following is clumsy as generators do not work?
+ x = {}
+
+ for idx, field in enumerate(fields[1:]):
+ if ":" not in field:
+                            raise ValueError("malformed header: no ':' in field")
+ key, value = field.split(":", 1)
+ if key in ("CL",):
+ # special treatment for command line
+ # statements (CL). These might contain
+ # characters that are non-conformant with
+ # the valid field separators in the SAM
+ # header. Thus, in contravention to the
+ # SAM API, consume the rest of the line.
+ key, value = "\t".join(fields[idx+1:]).split(":", 1)
+ x[key] = KNOWN_HEADER_FIELDS[record][key](value)
+ break
+
+ # interpret type of known header record tags, default to str
+ x[key] = KNOWN_HEADER_FIELDS[record].get(key, str)(value)
+
+ if VALID_HEADER_TYPES[record] == dict:
+ if record in result:
+ raise ValueError(
+ "multiple '%s' lines are not permitted" % record)
+
+ result[record] = x
+ elif VALID_HEADER_TYPES[record] == list:
+ if record not in result: result[record] = []
+ result[record].append(x)
+
+ # if there are no SQ lines in the header, add the
+ # reference names from the information in the bam
+ # file.
+ #
+ # Background: c-samtools keeps the textual part of the
+ # header separate from the list of reference names and
+            # lengths. Thus, if a header contains only SQ lines,
+            # the SQ information is not part of the textual header
+            # and is thus missing from the output. See issue 84.
+ if "SQ" not in result:
+ sq = []
+ for ref, length in zip(self.references, self.lengths):
+ sq.append({'LN': length, 'SN': ref })
+ result["SQ"] = sq
+
+ return result
+
+ ###############################################################
+ ## file-object like iterator access
+ ## note: concurrent access will cause errors (see IteratorRow
+ ## and multiple_iterators)
+ ## Possible solutions: deprecate or open new file handle
+ def __iter__(self):
+ if not self.is_open:
+ raise ValueError("I/O operation on closed file")
+
+ if not self.is_bam and self.header.n_targets == 0:
+ raise NotImplementedError(
+ "can not iterate over samfile without header")
+ return self
+
+ cdef bam1_t * getCurrent( self ):
+ return self.b
+
+ cdef int cnext(self):
+ '''
+ cversion of iterator. Used by :class:`pysam.AlignmentFile.IteratorColumn`.
+ '''
+ cdef int ret
+ with nogil:
+ ret = sam_read1(self.htsfile,
+ self.header,
+ self.b)
+ return ret
+
+ def __next__(self):
+ cdef int ret = self.cnext()
+ if (ret >= 0):
+ return makeAlignedSegment(self.b, self)
+ elif ret == -2:
+ raise IOError('truncated file')
+ else:
+ raise StopIteration
+
+ # Compatibility functions for pysam < 0.8.3
+ def gettid(self, reference):
+ """deprecated, use get_tid() instead"""
+ return self.get_tid(reference)
+
+ def getrname(self, tid):
+ """deprecated, use get_reference_name() instead"""
+ return self.get_reference_name(tid)
+
+
+cdef class IteratorRow:
+ '''abstract base class for iterators over mapped reads.
+
+ Various iterators implement different behaviours for wrapping around
+ contig boundaries. Examples include:
+
+ :class:`pysam.IteratorRowRegion`
+ iterate within a single contig and a defined region.
+
+ :class:`pysam.IteratorRowAll`
+ iterate until EOF. This iterator will also include unmapped reads.
+
+ :class:`pysam.IteratorRowAllRefs`
+ iterate over all reads in all reference sequences.
+
+ The method :meth:`AlignmentFile.fetch` returns an IteratorRow.
+
+ .. note::
+
+ It is usually not necessary to create an object of this class
+        explicitly. It is returned as the result of a call to
+        :meth:`AlignmentFile.fetch`.
+
+ '''
+
+ def __init__(self, AlignmentFile samfile, int multiple_iterators=False):
+ cdef char *cfilename
+ cdef char *creference_filename
+
+ if not samfile.is_open:
+ raise ValueError("I/O operation on closed file")
+
+ # makes sure that samfile stays alive as long as the
+ # iterator is alive
+ self.samfile = samfile
+
+ # reopen the file - note that this makes the iterator
+ # slow and causes pileup to slow down significantly.
+ if multiple_iterators:
+ cfilename = samfile.filename
+ with nogil:
+ self.htsfile = hts_open(cfilename, 'r')
+ assert self.htsfile != NULL
+ # read header - required for accurate positioning
+ # could a tell/seek work?
+ with nogil:
+ self.header = sam_hdr_read(self.htsfile)
+ assert self.header != NULL
+ self.owns_samfile = True
+ # options specific to CRAM files
+ if samfile.is_cram and samfile.reference_filename:
+ creference_filename = samfile.reference_filename
+ hts_set_opt(self.htsfile,
+ CRAM_OPT_REFERENCE,
+ creference_filename)
+
+ else:
+ self.htsfile = self.samfile.htsfile
+ self.owns_samfile = False
+ self.header = self.samfile.header
+
+ self.retval = 0
+
+ self.b = bam_init1()
+
+ def __dealloc__(self):
+ bam_destroy1(self.b)
+ if self.owns_samfile:
+ hts_close(self.htsfile)
+ bam_hdr_destroy(self.header)
+
+
+cdef class IteratorRowRegion(IteratorRow):
+ """*(AlignmentFile samfile, int tid, int beg, int end,
+ int multiple_iterators=False)*
+
+ iterate over mapped reads in a region.
+
+ .. note::
+
+ It is usually not necessary to create an object of this class
+        explicitly. It is returned as the result of a call to
+        :meth:`AlignmentFile.fetch`.
+
+ """
+
+ def __init__(self, AlignmentFile samfile,
+ int tid, int beg, int end,
+ int multiple_iterators=False):
+
+ IteratorRow.__init__(self, samfile,
+ multiple_iterators=multiple_iterators)
+
+ if not samfile.has_index():
+ raise ValueError("no index available for iteration")
+
+ with nogil:
+ self.iter = sam_itr_queryi(
+ self.samfile.index,
+ tid,
+ beg,
+ end)
+
+ def __iter__(self):
+ return self
+
+ cdef bam1_t * getCurrent(self):
+ return self.b
+
+ cdef int cnext(self):
+ '''cversion of iterator. Used by IteratorColumn'''
+ with nogil:
+ self.retval = hts_itr_next(hts_get_bgzfp(self.htsfile),
+ self.iter,
+ self.b,
+ self.htsfile)
+
+ def __next__(self):
+ self.cnext()
+ if self.retval >= 0:
+ return makeAlignedSegment(self.b, self.samfile)
+ elif self.retval == -2:
+ # Note: it is currently not the case that hts_iter_next
+ # returns -2 for a truncated file.
+ # See https://github.com/pysam-developers/pysam/pull/50#issuecomment-64928625
+ raise IOError('truncated file')
+ else:
+ raise StopIteration
+
+ def __dealloc__(self):
+ hts_itr_destroy(self.iter)
+
+
+cdef class IteratorRowHead(IteratorRow):
+ """*(AlignmentFile samfile, n, int multiple_iterators=False)*
+
+ iterate over first n reads in `samfile`
+
+ .. note::
+ It is usually not necessary to create an object of this class
+        explicitly. It is returned as the result of a call to
+        :meth:`AlignmentFile.head`.
+
+ """
+
+ def __init__(self, AlignmentFile samfile, int n,
+ int multiple_iterators=False):
+
+ IteratorRow.__init__(self, samfile,
+ multiple_iterators=multiple_iterators)
+
+ self.max_rows = n
+ self.current_row = 0
+
+ def __iter__(self):
+ return self
+
+ cdef bam1_t * getCurrent( self ):
+ return self.b
+
+ cdef int cnext(self):
+ '''cversion of iterator. Used by IteratorColumn'''
+ cdef int ret
+ with nogil:
+ ret = sam_read1(self.htsfile,
+ self.samfile.header,
+ self.b)
+ return ret
+
+ def __next__(self):
+ if self.current_row >= self.max_rows:
+ raise StopIteration
+
+ cdef int ret = self.cnext()
+ if ret >= 0:
+ self.current_row += 1
+ return makeAlignedSegment(self.b, self.samfile)
+ elif ret == -2:
+ raise IOError('truncated file')
+ else:
+ raise StopIteration
+
+
+cdef class IteratorRowAll(IteratorRow):
+ """*(AlignmentFile samfile, int multiple_iterators=False)*
+
+ iterate over all reads in `samfile`
+
+ .. note::
+
+ It is usually not necessary to create an object of this class
+        explicitly. It is returned as the result of a call to
+        :meth:`AlignmentFile.fetch`.
+
+ """
+
+ def __init__(self, AlignmentFile samfile,
+ int multiple_iterators=False):
+
+ IteratorRow.__init__(self, samfile,
+ multiple_iterators=multiple_iterators)
+
+ def __iter__(self):
+ return self
+
+ cdef bam1_t * getCurrent( self ):
+ return self.b
+
+ cdef int cnext(self):
+ '''cversion of iterator. Used by IteratorColumn'''
+ cdef int ret
+ with nogil:
+ ret = sam_read1(self.htsfile,
+ self.samfile.header,
+ self.b)
+ return ret
+
+ def __next__(self):
+ cdef int ret = self.cnext()
+ if ret >= 0:
+ return makeAlignedSegment(self.b, self.samfile)
+ elif ret == -2:
+ raise IOError('truncated file')
+ else:
+ raise StopIteration
+
+
+cdef class IteratorRowAllRefs(IteratorRow):
+ """iterates over all mapped reads by chaining iterators over each
+ reference
+
+ .. note::
+ It is usually not necessary to create an object of this class
+        explicitly. It is returned as the result of a call to
+        :meth:`AlignmentFile.fetch`.
+
+ """
+
+ def __init__(self, AlignmentFile samfile,
+ multiple_iterators=False):
+
+ IteratorRow.__init__(self, samfile,
+ multiple_iterators=multiple_iterators)
+
+ if not samfile.has_index():
+ raise ValueError("no index available for fetch")
+
+ self.tid = -1
+
+ def nextiter(self):
+ # get a new iterator for a chromosome. The file
+ # will not be re-opened.
+ self.rowiter = IteratorRowRegion(self.samfile,
+ self.tid,
+ 0,
+ 1<<29)
+ # set htsfile and header of the rowiter
+ # to the values in this iterator to reflect multiple_iterators
+ self.rowiter.htsfile = self.htsfile
+ self.rowiter.header = self.header
+
+        # make sure the iterator understands that IteratorRowAllRefs
+        # has ownership
+ self.rowiter.owns_samfile = False
+
+ def __iter__(self):
+ return self
+
+ def __next__(self):
+ # Create an initial iterator
+ if self.tid == -1:
+ if not self.samfile.nreferences:
+ raise StopIteration
+ self.tid = 0
+ self.nextiter()
+
+ while 1:
+ self.rowiter.cnext()
+
+ # If current iterator is not exhausted, return aligned read
+ if self.rowiter.retval > 0:
+ return makeAlignedSegment(self.rowiter.b, self.samfile)
+
+ self.tid += 1
+
+ # Otherwise, proceed to next reference or stop
+ if self.tid < self.samfile.nreferences:
+ self.nextiter()
+ else:
+ raise StopIteration
+
+
+cdef class IteratorRowSelection(IteratorRow):
+ """*(AlignmentFile samfile)*
+
+ iterate over reads in `samfile` at a given list of file positions.
+
+ .. note::
+ It is usually not necessary to create an object of this class
+        explicitly. It is returned as the result of a call to :meth:`AlignmentFile.fetch`.
+ """
+
+ def __init__(self, AlignmentFile samfile, positions, int multiple_iterators=True):
+
+ IteratorRow.__init__(self, samfile, multiple_iterators=multiple_iterators)
+
+ self.positions = positions
+ self.current_pos = 0
+
+ def __iter__(self):
+ return self
+
+ cdef bam1_t * getCurrent(self):
+ return self.b
+
+ cdef int cnext(self):
+ '''cversion of iterator'''
+ # end iteration if out of positions
+ if self.current_pos >= len(self.positions): return -1
+
+ cdef uint64_t pos = self.positions[self.current_pos]
+ with nogil:
+ bgzf_seek(hts_get_bgzfp(self.htsfile),
+ pos,
+ 0)
+ self.current_pos += 1
+
+ cdef int ret
+ with nogil:
+ ret = sam_read1(self.htsfile,
+ self.samfile.header,
+ self.b)
+ return ret
+
+ def __next__(self):
+ cdef int ret = self.cnext()
+ if (ret >= 0):
+ return makeAlignedSegment(self.b, self.samfile)
+ elif (ret == -2):
+ raise IOError('truncated file')
+ else:
+ raise StopIteration
+
+
+cdef int __advance_nofilter(void *data, bam1_t *b):
+ '''advance without any read filtering.
+ '''
+ cdef __iterdata * d
+ d = <__iterdata*>data
+ cdef int ret
+ with nogil:
+ ret = sam_itr_next(d.htsfile, d.iter, b)
+ return ret
+
+
+cdef int __advance_all(void *data, bam1_t *b):
+ '''only use reads for pileup passing basic
+ filters:
+
+ BAM_FUNMAP, BAM_FSECONDARY, BAM_FQCFAIL, BAM_FDUP
+ '''
+
+ cdef __iterdata * d
+    cdef int mask = BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP
+ d = <__iterdata*>data
+ cdef int ret
+ with nogil:
+ ret = sam_itr_next(d.htsfile, d.iter, b)
+ while ret >= 0 and b.core.flag & mask:
+ with nogil:
+ ret = sam_itr_next(d.htsfile, d.iter, b)
+ return ret
+
+
+cdef int __advance_snpcalls(void * data, bam1_t * b):
+ '''advance using same filter and read processing as in
+ the samtools pileup.
+ '''
+
+    # Note that this method requires access to some
+    # functions in the samtools code base and is thus
+    # not htslib-only.
+ # The functions accessed in samtools are:
+ # 1. bam_prob_realn
+ # 2. bam_cap_mapQ
+ cdef __iterdata * d
+ d = <__iterdata*>data
+
+ cdef int ret
+ cdef int skip = 0
+ cdef int q
+ cdef int is_cns = 1
+ cdef int is_nobaq = 0
+ cdef int capQ_thres = 0
+
+ with nogil:
+ ret = sam_itr_next(d.htsfile, d.iter, b)
+
+ # reload sequence
+ if d.fastafile != NULL and b.core.tid != d.tid:
+ if d.seq != NULL:
+ free(d.seq)
+ d.tid = b.core.tid
+ with nogil:
+ d.seq = faidx_fetch_seq(
+ d.fastafile,
+ d.header.target_name[d.tid],
+ 0, MAX_POS,
+ &d.seq_len)
+
+ if d.seq == NULL:
+ raise ValueError(
+ "reference sequence for '%s' (tid=%i) not found" % \
+ (d.header.target_name[d.tid],
+ d.tid))
+
+ while ret >= 0:
+ skip = 0
+
+ # realign read - changes base qualities
+ if d.seq != NULL and is_cns and not is_nobaq:
+ bam_prob_realn(b, d.seq)
+
+ if d.seq != NULL and capQ_thres > 10:
+ q = bam_cap_mapQ(b, d.seq, capQ_thres)
+ if q < 0:
+ skip = 1
+ elif b.core.qual > q:
+ b.core.qual = q
+ if b.core.flag & BAM_FUNMAP:
+ skip = 1
+ elif b.core.flag & 1 and not b.core.flag & 2:
+ skip = 1
+
+ if not skip:
+ break
+ # additional filters
+
+ with nogil:
+ ret = sam_itr_next(d.htsfile, d.iter, b)
+
+ return ret
+
+cdef class IteratorColumn:
+ '''abstract base class for iterators over columns.
+
+ IteratorColumn objects wrap the pileup functionality of samtools.
+
+ For reasons of efficiency, the iterator points to the current
+ pileup buffer. The pileup buffer is updated at every iteration.
+    This might cause some unexpected behaviour. For example,
+ consider the conversion to a list::
+
+ f = AlignmentFile("file.bam", "rb")
+ result = list( f.pileup() )
+
+ Here, ``result`` will contain ``n`` objects of type
+ :class:`~pysam.PileupColumn` for ``n`` columns, but each object in
+ ``result`` will contain the same information.
+
+ The desired behaviour can be achieved by list comprehension::
+
+        result = [x.pileups for x in f.pileup()]
+
+ ``result`` will be a list of ``n`` lists of objects of type
+ :class:`~pysam.PileupRead`.
+
+ If the iterator is associated with a :class:`~pysam.Fastafile` using the
+ :meth:`addReference` method, then the iterator will export the
+ current sequence via the methods :meth:`getSequence` and
+ :meth:`seq_len`.
+
+ Optional kwargs to the iterator:
+
+ stepper
+ The stepper controls how the iterator advances.
+
+ Valid values are None, "all" (default), "nofilter" or "samtools".
+
+ See AlignmentFile.pileup for description.
+
+ fastafile
+ A :class:`~pysam.FastaFile` object
+
+ max_depth
+ maximum read depth. The default is 8000.
+
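+    Example (a minimal sketch; `stepper` is passed through by
+    :meth:`AlignmentFile.pileup`, and each column is assumed to expose
+    ``pos`` and ``n``)::
+
+        for column in samfile.pileup("chr1", 100, 120, stepper="nofilter"):
+            print(column.pos, column.n)
+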
+ '''
+
+ def __cinit__( self, AlignmentFile samfile, **kwargs ):
+ self.samfile = samfile
+ self.fastafile = kwargs.get("fastafile", None)
+ self.stepper = kwargs.get("stepper", None)
+ self.max_depth = kwargs.get("max_depth", 8000)
+ self.iterdata.seq = NULL
+ self.tid = 0
+ self.pos = 0
+ self.n_plp = 0
+ self.plp = NULL
+ self.pileup_iter = <bam_plp_t>NULL
+
+ def __iter__(self):
+ return self
+
+ cdef int cnext(self):
+ '''perform next iteration.
+ '''
+ # do not release gil here because of call-backs
+ self.plp = bam_plp_auto(self.pileup_iter,
+ &self.tid,
+ &self.pos,
+ &self.n_plp)
+
+ cdef char * getSequence(self):
+ '''return current reference sequence underlying the iterator.
+ '''
+ return self.iterdata.seq
+
+ property seq_len:
+ '''current sequence length.'''
+ def __get__(self):
+ return self.iterdata.seq_len
+
+ def addReference(self, Fastafile fastafile):
+ '''
+ add reference sequences in `fastafile` to iterator.'''
+ self.fastafile = fastafile
+ if self.iterdata.seq != NULL:
+ free(self.iterdata.seq)
+ self.iterdata.tid = -1
+ self.iterdata.fastafile = self.fastafile.fastafile
+
+ def hasReference(self):
+ '''
+ return true if iterator is associated with a reference'''
+ return self.fastafile
+
+ cdef setMask(self, mask):
+ '''set masking flag in iterator.
+
+ reads with bits set in `mask` will be skipped.
+ '''
+ raise NotImplementedError()
+ # self.mask = mask
+ # bam_plp_set_mask( self.pileup_iter, self.mask )
+
+ cdef setupIteratorData( self,
+ int tid,
+ int start,
+ int end,
+ int multiple_iterators=0 ):
+ '''setup the iterator structure'''
+
+ self.iter = IteratorRowRegion(self.samfile, tid, start, end, multiple_iterators)
+ self.iterdata.htsfile = self.samfile.htsfile
+ self.iterdata.iter = self.iter.iter
+ self.iterdata.seq = NULL
+ self.iterdata.tid = -1
+ self.iterdata.header = self.samfile.header
+
+ if self.fastafile is not None:
+ self.iterdata.fastafile = self.fastafile.fastafile
+ else:
+ self.iterdata.fastafile = NULL
+
+ # Free any previously allocated memory before reassigning
+ # pileup_iter
+ self._free_pileup_iter()
+
+ if self.stepper is None or self.stepper == "all":
+ with nogil:
+ self.pileup_iter = bam_plp_init(
+ <bam_plp_auto_f>&__advance_all,
+ &self.iterdata)
+ elif self.stepper == "nofilter":
+ with nogil:
+ self.pileup_iter = bam_plp_init(
+ <bam_plp_auto_f>&__advance_nofilter,
+ &self.iterdata)
+ elif self.stepper == "samtools":
+ with nogil:
+ self.pileup_iter = bam_plp_init(
+ <bam_plp_auto_f>&__advance_snpcalls,
+ &self.iterdata)
+ else:
+ raise ValueError(
+ "unknown stepper option `%s` in IteratorColumn" % self.stepper)
+
+ if self.max_depth:
+ with nogil:
+ bam_plp_set_maxcnt(self.pileup_iter, self.max_depth)
+
+ # bam_plp_set_mask( self.pileup_iter, self.mask )
+
+ cdef reset( self, tid, start, end ):
+ '''reset iterator position.
+
+ This permits using the iterator multiple times without
+ having to incur the full set-up costs.
+ '''
+ self.iter = IteratorRowRegion( self.samfile, tid, start, end, multiple_iterators = 0 )
+ self.iterdata.iter = self.iter.iter
+
+ # invalidate sequence if different tid
+ if self.tid != tid:
+ if self.iterdata.seq != NULL:
+ free(self.iterdata.seq)
+ self.iterdata.seq = NULL
+ self.iterdata.tid = -1
+
+ # self.pileup_iter = bam_plp_init( &__advancepileup, &self.iterdata )
+ with nogil:
+ bam_plp_reset(self.pileup_iter)
+
+ cdef _free_pileup_iter(self):
+ '''free the memory alloc'd by bam_plp_init.
+
+ This is needed before setupIteratorData allocates
+ another pileup_iter, or else memory will be lost.
+ '''
+ if self.pileup_iter != <bam_plp_t>NULL:
+ with nogil:
+ bam_plp_reset(self.pileup_iter)
+ bam_plp_destroy(self.pileup_iter)
+ self.pileup_iter = <bam_plp_t>NULL
+
+ def __dealloc__(self):
+ # reset in order to avoid memory leak messages for iterators
+ # that have not been fully consumed
+ self._free_pileup_iter()
+ self.plp = <bam_pileup1_t*>NULL
+
+ if self.iterdata.seq != NULL:
+ free(self.iterdata.seq)
+ self.iterdata.seq = NULL
+
+
+cdef class IteratorColumnRegion(IteratorColumn):
+ '''iterates over a region only.
+ '''
+ def __cinit__(self, AlignmentFile samfile,
+ int tid = 0,
+ int start = 0,
+ int end = MAX_POS,
+ int truncate = False,
+ **kwargs ):
+
+ # initialize iterator
+ self.setupIteratorData(tid, start, end, 1)
+ self.start = start
+ self.end = end
+ self.truncate = truncate
+
+ def __next__(self):
+
+ while 1:
+ self.cnext()
+ if self.n_plp < 0:
+ raise ValueError("error during iteration" )
+
+ if self.plp == NULL:
+ raise StopIteration
+
+ if self.truncate:
+ if self.start > self.pos: continue
+ if self.pos >= self.end: raise StopIteration
+
+ return makePileupColumn(&self.plp,
+ self.tid,
+ self.pos,
+ self.n_plp,
+ self.samfile)
+
+
+cdef class IteratorColumnAllRefs(IteratorColumn):
+ """iterates over all columns by chaining iterators over each reference
+ """
+
+ def __cinit__(self,
+ AlignmentFile samfile,
+ **kwargs):
+
+ # no iteration over empty files
+ if not samfile.nreferences:
+ raise StopIteration
+
+ # initialize iterator
+ self.setupIteratorData(self.tid, 0, MAX_POS, 1)
+
+ def __next__(self):
+
+ while 1:
+ self.cnext()
+
+ if self.n_plp < 0:
+ raise ValueError("error during iteration" )
+
+ # return result, if within same reference
+ if self.plp != NULL:
+ return makePileupColumn(&self.plp,
+ self.tid,
+ self.pos,
+ self.n_plp,
+ self.samfile)
+
+ # otherwise, proceed to next reference or stop
+ self.tid += 1
+ if self.tid < self.samfile.nreferences:
+ self.setupIteratorData(self.tid, 0, MAX_POS, 0)
+ else:
+ raise StopIteration
+
+
+cdef class SNPCall:
+ '''the results of a SNP call.'''
+ cdef int _tid
+ cdef int _pos
+ cdef char _reference_base
+ cdef char _genotype
+ cdef int _consensus_quality
+ cdef int _snp_quality
+ cdef int _rms_mapping_quality
+ cdef int _coverage
+
+ property tid:
+        '''the chromosome ID as defined in the header'''
+ def __get__(self):
+ return self._tid
+
+ property pos:
+ '''nucleotide position of SNP.'''
+ def __get__(self): return self._pos
+
+ property reference_base:
+ '''reference base at pos. ``N`` if no reference sequence supplied.'''
+ def __get__(self): return from_string_and_size( &self._reference_base, 1 )
+
+ property genotype:
+ '''the genotype called.'''
+ def __get__(self): return from_string_and_size( &self._genotype, 1 )
+
+ property consensus_quality:
+ '''the genotype quality (Phred-scaled).'''
+ def __get__(self): return self._consensus_quality
+
+ property snp_quality:
+ '''the snp quality (Phred scaled) - probability of consensus being
+ identical to reference sequence.'''
+ def __get__(self): return self._snp_quality
+
+ property mapping_quality:
+ '''the root mean square (rms) of the mapping quality of all reads
+ involved in the call.'''
+ def __get__(self): return self._rms_mapping_quality
+
+ property coverage:
+ '''coverage or read depth - the number of reads involved in the call.'''
+ def __get__(self): return self._coverage
+
+ def __str__(self):
+
+ return "\t".join( map(str, (
+ self.tid,
+ self.pos,
+ self.reference_base,
+ self.genotype,
+ self.consensus_quality,
+ self.snp_quality,
+ self.mapping_quality,
+ self.coverage ) ) )
+
+
+cdef class IndexedReads:
+ """*(AlignmentFile samfile, multiple_iterators=True)
+
+ Index a Sam/BAM-file by query name while keeping the
+ original sort order intact.
+
+ The index is kept in memory and can be substantial.
+
+    By default, the file is re-opened to avoid conflicts if multiple
+    operators work on the same file. Set `multiple_iterators` = False
+    to avoid re-opening `samfile`.
+
+ Parameters
+ ----------
+
+ samfile : AlignmentFile
+ File to be indexed.
+
+ multiple_iterators : bool
+        Flag indicating whether the file should be reopened. Reopening
+        prevents existing iterators from being affected by the indexing.
+
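+    Example (a minimal sketch; assumes a read named ``read_001``
+    exists in `samfile`)::
+
+        index = pysam.IndexedReads(samfile)
+        index.build()
+        for read in index.find("read_001"):
+            print(read.query_name)
+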
+ """
+
+ def __init__(self, AlignmentFile samfile, int multiple_iterators=True):
+ cdef char *cfilename
+
+ # makes sure that samfile stays alive as long as this
+ # object is alive.
+ self.samfile = samfile
+
+        assert samfile.is_bam, "can only build an IndexedReads index on BAM files"
+
+        # re-open the file if requested - note that this makes the
+        # iterator slow and causes pileup to slow down significantly.
+ if multiple_iterators:
+ cfilename = samfile.filename
+ with nogil:
+ self.htsfile = hts_open(cfilename, 'r')
+ assert self.htsfile != NULL
+ # read header - required for accurate positioning
+ with nogil:
+ self.header = sam_hdr_read(self.htsfile)
+ self.owns_samfile = True
+ else:
+ self.htsfile = self.samfile.htsfile
+ self.header = self.samfile.header
+ self.owns_samfile = False
+
+ def build(self):
+ '''build the index.'''
+
+ self.index = collections.defaultdict(list)
+
+        # this method will start indexing from the current file position
+ cdef int ret = 1
+ cdef bam1_t * b = <bam1_t*>calloc(1, sizeof( bam1_t))
+
+ cdef uint64_t pos
+
+ while ret > 0:
+ with nogil:
+ pos = bgzf_tell(hts_get_bgzfp(self.htsfile))
+ ret = sam_read1(self.htsfile,
+ self.samfile.header,
+ b)
+ if ret > 0:
+ qname = charptr_to_str(pysam_bam_get_qname(b))
+ self.index[qname].append(pos)
+
+ bam_destroy1(b)
+
+ def find(self, query_name):
+ '''find `query_name` in index.
+
+ Returns
+ -------
+
+ IteratorRowSelection
+ Returns an iterator over all reads with query_name.
+
+ Raises
+ ------
+
+ KeyError
+ if the `query_name` is not in the index.
+
+ '''
+ if query_name in self.index:
+ return IteratorRowSelection(
+ self.samfile,
+ self.index[query_name],
+ multiple_iterators = False)
+ else:
+ raise KeyError("read %s not found" % query_name)
+
+ def __dealloc__(self):
+ if self.owns_samfile:
+ hts_close(self.htsfile)
+ bam_hdr_destroy(self.header)
+
+__all__ = [
+ "AlignmentFile",
+ "IteratorRow",
+ "IteratorColumn",
+ "IndexedReads"]
--- /dev/null
+###############################################################################
+###############################################################################
+## Cython wrapper for htslib VCF/BCF reader/writer
+###############################################################################
+#
+# The MIT License
+#
+# Copyright (c) 2015, 2016 Kevin Jacobs (jacobs@bioinformed.com)
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+#
+###############################################################################
+
+from libc.stdint cimport int8_t, int16_t, int32_t, int64_t
+from libc.stdint cimport uint8_t, uint16_t, uint32_t, uint64_t
+from libc.stdlib cimport malloc, calloc, realloc, free
+from libc.string cimport memcpy, memcmp, memmove, strncpy, strlen, strdup
+
+from pysam.libchtslib cimport *
+
+
+cdef class VariantHeader(object):
+ cdef bcf_hdr_t *ptr
+
+ cpdef VariantRecord new_record(self)
+ cdef _subset_samples(self, include_samples)
+
+
+cdef class VariantHeaderRecord(object):
+ cdef VariantHeader header
+ cdef bcf_hrec_t *ptr
+
+
+cdef class VariantHeaderRecords(object):
+ cdef VariantHeader header
+
+
+cdef class VariantHeaderContigs(object):
+ cdef VariantHeader header
+
+
+cdef class VariantHeaderSamples(object):
+ cdef VariantHeader header
+
+
+cdef class VariantContig(object):
+ cdef VariantHeader header
+ cdef int id
+
+
+cdef class VariantMetadata(object):
+ cdef VariantHeader header
+ cdef int type
+ cdef int id
+
+
+cdef class VariantHeaderMetadata(object):
+ cdef VariantHeader header
+ cdef int32_t type
+
+
+cdef class VariantRecord(object):
+ cdef VariantHeader header
+ cdef bcf1_t *ptr
+
+
+cdef class VariantRecordFilter(object):
+ cdef VariantRecord record
+
+
+cdef class VariantRecordFormat(object):
+ cdef VariantRecord record
+
+
+cdef class VariantRecordInfo(object):
+ cdef VariantRecord record
+
+
+cdef class VariantRecordSamples(object):
+ cdef VariantRecord record
+
+
+cdef class VariantRecordSample(object):
+ cdef VariantRecord record
+ cdef readonly int32_t index
+
+
+cdef class BaseIndex(object):
+ cdef tuple refs
+ cdef dict refmap
+
+
+cdef class BCFIndex(BaseIndex):
+ cdef VariantHeader header
+ cdef hts_idx_t *ptr
+
+
+cdef class TabixIndex(BaseIndex):
+ cdef tbx_t *ptr
+
+
+cdef class BaseIterator(object):
+ cdef VariantFile bcf
+ cdef hts_itr_t *iter
+
+
+cdef class BCFIterator(BaseIterator):
+ cdef BCFIndex index
+
+
+cdef class TabixIterator(BaseIterator):
+ cdef TabixIndex index
+ cdef kstring_t line_buffer
+
+
+cdef class VariantFile(HTSFile):
+ cdef readonly VariantHeader header
+ cdef readonly BaseIndex index
+
+ cdef readonly bint drop_samples # true if sample information is to be ignored
+
+ # FIXME: Temporary, use htsFormat when it is available
+ cdef readonly bint is_reading # true if file has begun reading records
+ cdef readonly bint header_written # true if header has already been written
+
+ cpdef VariantRecord new_record(self)
+
+ cpdef int write(self, VariantRecord record) except -1
--- /dev/null
+# cython: embedsignature=True
+# cython: profile=True
+###############################################################################
+###############################################################################
+## Cython wrapper for htslib VCF/BCF reader/writer
+###############################################################################
+#
+# NOTICE: This code is incomplete and preliminary. It offers a nearly
+# complete Pythonic interface to VCF/BCF metadata and data with
+# reading and writing capability. Documentation and a unit test suite
+# are in the works. The code is best tested under Python 2, but
+# should also work with Python 3. Please report any remaining
+# str/bytes issues on the github site when using Python 3 and I'll
+# fix them promptly.
+#
+# Here is a minimal example of how to use the API:
+#
+# $ cat bcfview.py
+# import sys
+# from pysam import VariantFile
+#
+# bcf_in = VariantFile(sys.argv[1]) # auto-detect input format
+# bcf_out = VariantFile('-', 'w', header=bcf_in.header)
+#
+# for rec in bcf_in:
+# bcf_out.write(rec)
+#
+# Performance is fairly close to that of bcftools view. Here is an example
+# using some 1k Genomes data:
+#
+# $ time python bcfview.py ALL.chr22.phase3_shapeit2_mvncall_integrated_v5.20130502.genotypes.bcf |wc -l
+# 1103799
+#
+# real 0m56.114s
+# user 1m4.489s
+# sys 0m3.102s
+#
+# $ time bcftools view ALL.chr22.phase3_shapeit2_mvncall_integrated_v5.20130502.genotypes.bcf |wc -l
+# 1103800 # bcftools adds an extra header
+#
+# real 0m55.126s
+# user 1m3.502s
+# sys 0m3.459s
+#
+###############################################################################
+#
+# TODO list:
+#
+# * more genotype methods
+# * unit test suite (perhaps py.test based)
+# * documentation
+# * pickle support
+# * left/right locus normalization
+# * fix reopen to re-use fd
+#
+###############################################################################
+#
+# The MIT License
+#
+# Copyright (c) 2015,2016 Kevin Jacobs (jacobs@bioinformed.com)
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+#
+###############################################################################
+
+from __future__ import division, print_function
+
+import os
+import sys
+
+from libc.errno cimport errno, EPIPE
+from libc.string cimport strcmp, strpbrk, strerror
+from libc.stdint cimport INT8_MAX, INT16_MAX, INT32_MAX
+
+cimport cython
+
+from cpython.object cimport PyObject
+from cpython.ref cimport Py_INCREF
+from cpython.dict cimport PyDict_GetItemString, PyDict_SetItemString
+from cpython.tuple cimport PyTuple_New, PyTuple_SET_ITEM
+from cpython.bytes cimport PyBytes_FromStringAndSize
+from cpython.unicode cimport PyUnicode_DecodeASCII
+from cpython.version cimport PY_MAJOR_VERSION
+
+from pysam.libchtslib cimport HTSFile, hisremote
+
+
+from warnings import warn
+
+
+__all__ = ['VariantFile',
+ 'VariantHeader',
+ 'VariantHeaderRecord',
+ 'VariantRecord']
+
+########################################################################
+########################################################################
+## Constants
+########################################################################
+
+cdef int MAX_POS = 2 << 29
+cdef tuple VALUE_TYPES = ('Flag', 'Integer', 'Float', 'String')
+cdef tuple METADATA_TYPES = ('FILTER', 'INFO', 'FORMAT', 'CONTIG', 'STRUCTURED', 'GENERIC')
+cdef tuple METADATA_LENGTHS = ('FIXED', 'VARIABLE', 'A', 'G', 'R')
+
+
+########################################################################
+########################################################################
+## Python 3 compatibility functions
+########################################################################
+
+from pysam.libcutils cimport force_bytes, force_str, charptr_to_str, charptr_to_str_w_len
+from pysam.libcutils cimport encode_filename, from_string_and_size
+
+
+########################################################################
+########################################################################
+## VCF/BCF string intern system
+########################################################################
+
+cdef dict bcf_str_cache = {}
+
+cdef inline bcf_str_cache_get_charptr(const char* s):
+ if s == NULL:
+ return None
+
+ cdef PyObject *pystr = PyDict_GetItemString(bcf_str_cache, s)
+ if pystr:
+ return <object>pystr
+
+ if PY_MAJOR_VERSION < 3:
+ val = s
+ else:
+ val = PyUnicode_DecodeASCII(s, strlen(s), NULL)
+
+ PyDict_SetItemString(bcf_str_cache, s, val)
+
+ return val
+
+
+########################################################################
+########################################################################
+## Low level type conversion helpers
+########################################################################
+
+
+cdef inline bint check_header_id(bcf_hdr_t *hdr, int hl_type, int id):
+ return id >= 0 and id < hdr.n[BCF_DT_ID] and bcf_hdr_idinfo_exists(hdr, hl_type, id)
+
+
+cdef inline int is_gt_fmt(bcf_hdr_t *hdr, int fmt_id):
+ return strcmp(bcf_hdr_int2id(hdr, BCF_DT_ID, fmt_id), "GT") == 0
+
+
+cdef tuple char_array_to_tuple(const char **a, ssize_t n, int free_after=0):
+ if not a:
+ return None
+ try:
+ return tuple(charptr_to_str(a[i]) for i in range(n))
+ finally:
+ if free_after and a:
+ free(a)
+
+
+cdef bcf_array_to_object(void *data, int type, ssize_t n, ssize_t count, int scalar):
+ cdef char *datac
+ cdef int8_t *data8
+ cdef int16_t *data16
+ cdef int32_t *data32
+ cdef float *dataf
+ cdef int i
+
+ if not data or n <= 0:
+ return None
+
+ if type == BCF_BT_CHAR:
+ datac = <char *>data
+ while n and datac[n-1] == bcf_str_vector_end:
+ n -= 1
+ value = charptr_to_str_w_len(datac, n) if datac[0] != bcf_str_missing else None
+ # FIXME: Need to know length? Report errors? Pad with missing values? Not clear what to do.
+
+ value = tuple(v or None for v in value.split(',')) if value else ()
+ else:
+ value = []
+ if type == BCF_BT_INT8:
+ data8 = <int8_t *>data
+ for i in range(n):
+ if data8[i] == bcf_int8_vector_end:
+ break
+ value.append(data8[i] if data8[i] != bcf_int8_missing else None)
+ elif type == BCF_BT_INT16:
+ data16 = <int16_t *>data
+ for i in range(n):
+ if data16[i] == bcf_int16_vector_end:
+ break
+ value.append(data16[i] if data16[i] != bcf_int16_missing else None)
+ elif type == BCF_BT_INT32:
+ data32 = <int32_t *>data
+ for i in range(n):
+ if data32[i] == bcf_int32_vector_end:
+ break
+ value.append(data32[i] if data32[i] != bcf_int32_missing else None)
+ elif type == BCF_BT_FLOAT:
+ dataf = <float *>data
+ for i in range(n):
+ if bcf_float_is_vector_end(dataf[i]):
+ break
+ value.append(dataf[i] if not bcf_float_is_missing(dataf[i]) else None)
+ else:
+ raise TypeError('unsupported info type code')
+
+ # FIXME: Need to know length? Report errors? Pad with missing values? Not clear what to do.
+ if not value:
+ if scalar:
+ value = None
+ elif count <= 0:
+ value = ()
+ else:
+ value = (None,)*count
+ elif scalar and len(value) == 1:
+ value = value[0]
+ else:
+ value = tuple(value)
+
+ return value
+
+
+cdef bcf_object_to_array(values, void *data, int bt_type, ssize_t n, int vlen):
+ cdef char *datac
+    cdef int8_t *datai8
+    cdef int16_t *datai16
+    cdef int32_t *datai32
+ cdef float *dataf
+ cdef ssize_t i, value_count = len(values)
+
+ assert(value_count <= n)
+
+ if bt_type == BCF_BT_CHAR:
+ if not isinstance(values, (str, bytes)):
+ values = b','.join(force_bytes(v) if v is not None else b'' for v in values)
+ value_count = len(values)
+ assert(value_count <= n)
+ datac = <char *>data
+ memcpy(datac, <char *>values, value_count)
+ for i in range(value_count, n):
+ datac[i] = 0
+ elif bt_type == BCF_BT_INT8:
+ datai8 = <int8_t *>data
+ for i in range(value_count):
+ val = values[i]
+ datai8[i] = val if val is not None else bcf_int8_missing
+ for i in range(value_count, n):
+ datai8[i] = bcf_int8_vector_end
+ elif bt_type == BCF_BT_INT16:
+ datai16 = <int16_t *>data
+ for i in range(value_count):
+ val = values[i]
+ datai16[i] = val if val is not None else bcf_int16_missing
+ for i in range(value_count, n):
+ datai16[i] = bcf_int16_vector_end
+ elif bt_type == BCF_BT_INT32:
+ datai32 = <int32_t *>data
+ for i in range(value_count):
+ val = values[i]
+ datai32[i] = val if val is not None else bcf_int32_missing
+ for i in range(value_count, n):
+ datai32[i] = bcf_int32_vector_end
+ elif bt_type == BCF_BT_FLOAT:
+ dataf = <float *>data
+ for i in range(value_count):
+ val = values[i]
+ if val is None:
+ bcf_float_set(dataf + i, bcf_float_missing)
+ else:
+ dataf[i] = val
+ for i in range(value_count, n):
+ bcf_float_set(dataf + i, bcf_float_vector_end)
+ else:
+ raise TypeError('unsupported type')
+
+
+cdef bcf_empty_array(int type, ssize_t n, int vlen):
+ cdef char *datac
+ cdef int32_t *data32
+ cdef float *dataf
+ cdef int i
+
+ if n <= 0:
+ raise ValueError('Cannot create empty array')
+
+ if type == BCF_HT_STR:
+ value = PyBytes_FromStringAndSize(NULL, sizeof(char)*n)
+ datac = <char *>value
+ for i in range(n):
+ datac[i] = bcf_str_missing if not vlen else bcf_str_vector_end
+ elif type == BCF_HT_INT:
+ value = PyBytes_FromStringAndSize(NULL, sizeof(int32_t)*n)
+ data32 = <int32_t *><char *>value
+ for i in range(n):
+ data32[i] = bcf_int32_missing if not vlen else bcf_int32_vector_end
+ elif type == BCF_HT_REAL:
+ value = PyBytes_FromStringAndSize(NULL, sizeof(float)*n)
+ dataf = <float *><char *>value
+ for i in range(n):
+ bcf_float_set(dataf + i, bcf_float_missing if not vlen else bcf_float_vector_end)
+ else:
+ raise TypeError('unsupported header type code')
+
+ return value
+
+
+cdef bcf_copy_expand_array(void *src_data, int src_type, ssize_t src_values,
+ void *dst_data, int dst_type, ssize_t dst_values,
+ int vlen):
+ cdef char *src_datac
+ cdef char *dst_datac
+ cdef int8_t *src_datai8
+ cdef int16_t *src_datai16
+ cdef int32_t *src_datai32
+ cdef int32_t *dst_datai
+ cdef float *src_dataf
+ cdef float *dst_dataf
+ cdef ssize_t src_size, dst_size, i, j
+ cdef int val
+
+ if src_values > dst_values:
+ raise ValueError('Cannot copy arrays with src_values={} > dst_values={}'.format(src_values, dst_values))
+
+ if src_type == dst_type == BCF_BT_CHAR:
+ src_datac = <char *>src_data
+ dst_datac = <char *>dst_data
+        memcpy(dst_datac, src_datac, src_values)
+ for i in range(src_values, dst_values):
+ dst_datac[i] = 0
+ elif src_type == BCF_BT_INT8 and dst_type == BCF_BT_INT32:
+ src_datai8 = <int8_t *>src_data
+ dst_datai = <int32_t *>dst_data
+ for i in range(src_values):
+ val = src_datai8[i]
+ if val == bcf_int8_missing:
+ val = bcf_int32_missing
+ elif val == bcf_int8_vector_end:
+ val = bcf_int32_vector_end
+ dst_datai[i] = val
+ for i in range(src_values, dst_values):
+ dst_datai[i] = bcf_int32_missing if not vlen else bcf_int32_vector_end
+ elif src_type == BCF_BT_INT16 and dst_type == BCF_BT_INT32:
+ src_datai16 = <int16_t *>src_data
+ dst_datai = <int32_t *>dst_data
+ for i in range(src_values):
+ val = src_datai16[i]
+ if val == bcf_int16_missing:
+ val = bcf_int32_missing
+ elif val == bcf_int16_vector_end:
+ val = bcf_int32_vector_end
+ dst_datai[i] = val
+ for i in range(src_values, dst_values):
+ dst_datai[i] = bcf_int32_missing if not vlen else bcf_int32_vector_end
+ elif src_type == BCF_BT_INT32 and dst_type == BCF_BT_INT32:
+ src_datai32 = <int32_t *>src_data
+ dst_datai = <int32_t *>dst_data
+ for i in range(src_values):
+ dst_datai[i] = src_datai32[i]
+ for i in range(src_values, dst_values):
+ dst_datai[i] = bcf_int32_missing if not vlen else bcf_int32_vector_end
+ elif src_type == BCF_BT_FLOAT and dst_type == BCF_BT_FLOAT:
+ src_dataf = <float *>src_data
+ dst_dataf = <float *>dst_data
+ for i in range(src_values):
+ dst_dataf[i] = src_dataf[i]
+ for i in range(src_values, dst_values):
+ bcf_float_set(dst_dataf + i, bcf_float_missing if not vlen else bcf_float_vector_end)
+ else:
+ raise TypeError('unsupported types')
+
+
+cdef bcf_get_value_count(VariantRecord record, int hl_type, int id, ssize_t *count, int *scalar):
+ if record is None:
+ raise ValueError('record must not be None')
+
+ cdef bcf_hdr_t *hdr = record.header.ptr
+ cdef bcf1_t *r = record.ptr
+
+ if not check_header_id(hdr, hl_type, id):
+ raise ValueError('Invalid header')
+
+ cdef int length = bcf_hdr_id2length(hdr, hl_type, id)
+ cdef int number = bcf_hdr_id2number(hdr, hl_type, id)
+
+ scalar[0] = 0
+
+ if hl_type == BCF_HL_FMT and is_gt_fmt(hdr, id):
+ count[0] = number
+ elif length == BCF_VL_FIXED:
+ if number == 1:
+ scalar[0] = 1
+ count[0] = number
+ elif length == BCF_VL_R:
+ count[0] = r.n_allele
+ elif length == BCF_VL_A:
+ count[0] = r.n_allele - 1
+ elif length == BCF_VL_G:
+ count[0] = r.n_allele * (r.n_allele + 1) // 2
+ elif length == BCF_VL_VAR:
+ count[0] = -1
+ else:
+ raise ValueError('Unknown format length')
+
+
+cdef object bcf_info_get_value(VariantRecord record, const bcf_info_t *z):
+ if record is None:
+ raise ValueError('record must not be None')
+
+ cdef bcf_hdr_t *hdr = record.header.ptr
+
+ cdef char *s
+ cdef ssize_t count
+ cdef int scalar
+
+ bcf_get_value_count(record, BCF_HL_INFO, z.key, &count, &scalar)
+
+ if z.len == 0:
+ if bcf_hdr_id2type(hdr, BCF_HL_INFO, z.key) == BCF_HT_FLAG:
+ value = True
+ elif scalar:
+ value = None
+ else:
+ value = ()
+ elif z.len == 1:
+ if z.type == BCF_BT_INT8:
+ value = z.v1.i if z.v1.i != bcf_int8_missing else None
+ elif z.type == BCF_BT_INT16:
+ value = z.v1.i if z.v1.i != bcf_int16_missing else None
+ elif z.type == BCF_BT_INT32:
+ value = z.v1.i if z.v1.i != bcf_int32_missing else None
+ elif z.type == BCF_BT_FLOAT:
+ value = z.v1.f if not bcf_float_is_missing(z.v1.f) else None
+ elif z.type == BCF_BT_CHAR:
+ value = force_str(chr(z.v1.i))
+ else:
+ raise TypeError('unsupported info type code')
+
+ if not scalar and value != ():
+ value = (value,)
+ else:
+ value = bcf_array_to_object(z.vptr, z.type, z.len, count, scalar)
+
+ return value
+
+
+cdef object bcf_check_values(VariantRecord record, value, int hl_type, int ht_type,
+ int id, int bt_type, ssize_t bt_len,
+ ssize_t *value_count, int *scalar, int *realloc):
+
+ if record is None:
+ raise ValueError('record must not be None')
+
+ bcf_get_value_count(record, hl_type, id, value_count, scalar)
+
+ values = (value,) if not isinstance(value, (list, tuple)) else value
+
+ # Validate values now that we know the type and size
+ if ht_type == BCF_HT_FLAG:
+ value_count[0] = 1
+ elif hl_type == BCF_HL_FMT and is_gt_fmt(record.header.ptr, id):
+ # KBJ: htslib lies about the cardinality of GT fields-- they're really VLEN (-1)
+ value_count[0] = -1
+
+ if value_count[0] != -1 and value_count[0] != len(values):
+ if scalar[0]:
+            raise TypeError('value expected to be scalar')
+ else:
+ raise TypeError('values expected to be {:d}-tuple'.format(value_count[0]))
+
+ if ht_type == BCF_HT_REAL:
+ for v in values:
+ if not(v is None or isinstance(v, (float, int))):
+ raise TypeError('invalid value for Float format')
+ elif ht_type == BCF_HT_INT:
+ for v in values:
+ if not(v is None or (isinstance(v, (float, int)) and int(v) == v)):
+ raise TypeError('invalid value for Integer format')
+ for v in values:
+ if not(v is None or bcf_int32_missing < v <= INT32_MAX):
+ raise ValueError('Integer value too small/large to store in VCF/BCF')
+ elif ht_type == BCF_HT_STR:
+ values = b','.join(force_bytes(v) if v is not None else b'' for v in values)
+ elif ht_type == BCF_HT_FLAG:
+ if values[0] not in (True, False, None, 1, 0):
+ raise ValueError('Flag values must be: True, False, None, 1, 0')
+ else:
+ raise TypeError('unsupported type')
+
+ realloc[0] = 0
+ if len(values) <= 1 and hl_type == BCF_HL_INFO:
+ realloc[0] = 0
+ elif len(values) > bt_len:
+ realloc[0] = 1
+ elif bt_type == BCF_BT_INT8:
+ for v in values:
+ if v is not None and not(bcf_int8_missing < v <= INT8_MAX):
+ realloc[0] = 1
+ break
+ elif bt_type == BCF_BT_INT16:
+ for v in values:
+ if v is not None and not(bcf_int16_missing < v <= INT16_MAX):
+ realloc[0] = 1
+ break
+
+ return values
+
+
+cdef bcf_encode_alleles(VariantRecord record, values):
+ if record is None:
+ raise ValueError('record must not be None')
+
+ cdef bcf1_t *r = record.ptr
+ cdef int32_t nalleles = r.n_allele
+ cdef list gt_values = []
+ cdef char *s
+ cdef int i
+
+ if values is None:
+ return ()
+
+ if not isinstance(values, (list, tuple)):
+ values = (values,)
+
+ for value in values:
+ if value is None:
+ gt_values.append(bcf_gt_missing)
+ elif isinstance(value, (str, bytes)):
+ bvalue = force_bytes(value)
+ s = bvalue
+ for i in range(r.n_allele):
+                if strcmp(r.d.allele[i], s) == 0:
+ gt_values.append(bcf_gt_unphased(i))
+ break
+ else:
+ raise ValueError('Unknown allele')
+ else:
+ i = value
+ if not (0 <= i < nalleles):
+ raise ValueError('Invalid allele index')
+ gt_values.append(bcf_gt_unphased(i))
+
+ return gt_values
+
+
+cdef bcf_info_set_value(VariantRecord record, key, value):
+ if record is None:
+ raise ValueError('record must not be None')
+
+ cdef bcf_hdr_t *hdr = record.header.ptr
+ cdef bcf1_t *r = record.ptr
+ cdef vdict_t *d
+ cdef khiter_t k
+ cdef int info_id, info_type, scalar, dst_type, realloc, vlen = 0
+ cdef ssize_t i, value_count, alloc_len, alloc_size, dst_size
+
+ if bcf_unpack(r, BCF_UN_INFO) < 0:
+ raise ValueError('Error unpacking VariantRecord')
+
+ bkey = force_bytes(key)
+ cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey)
+
+ if info:
+ info_id = info.key
+ else:
+ d = <vdict_t *>hdr.dict[BCF_DT_ID]
+ k = kh_get_vdict(d, bkey)
+
+ if k == kh_end(d) or kh_val_vdict(d, k).info[BCF_HL_INFO] & 0xF == 0xF:
+ raise KeyError('unknown INFO')
+
+ info_id = kh_val_vdict(d, k).id
+
+ if not check_header_id(hdr, BCF_HL_INFO, info_id):
+ raise ValueError('Invalid header')
+
+ info_type = bcf_hdr_id2type(hdr, BCF_HL_INFO, info_id)
+ values = bcf_check_values(record, value, BCF_HL_INFO, info_type, info_id,
+ info.type if info else -1,
+ info.len if info else -1,
+ &value_count, &scalar, &realloc)
+
+ if info_type == BCF_HT_FLAG:
+ if bcf_update_info(hdr, r, bkey, NULL, bool(values[0]), info_type) < 0:
+ raise ValueError('Unable to update INFO values')
+ return
+
+ vlen = value_count < 0
+ value_count = len(values)
+
+ # If we can, write updated values to existing allocated storage
+ if info and not realloc:
+ r.d.shared_dirty |= BCF1_DIRTY_INF
+
+ if value_count == 0:
+ info.len = 0
+ # FIXME: Check if need to free vptr if info.len > 0?
+ elif value_count == 1:
+ # FIXME: Check if need to free vptr if info.len > 0?
+ if info.type == BCF_BT_INT8 or info.type == BCF_BT_INT16 or info.type == BCF_BT_INT32:
+ bcf_object_to_array(values, &info.v1.i, BCF_BT_INT32, 1, vlen)
+ elif info.type == BCF_BT_FLOAT:
+ bcf_object_to_array(values, &info.v1.f, BCF_BT_FLOAT, 1, vlen)
+ else:
+ raise TypeError('unsupported info type code')
+ info.len = 1
+ else:
+ bcf_object_to_array(values, info.vptr, info.type, info.len, vlen)
+ return
+
+ alloc_len = max(1, value_count)
+ if info and info.len > alloc_len:
+ alloc_len = info.len
+
+ new_values = bcf_empty_array(info_type, alloc_len, vlen)
+ cdef char *valp = <char *>new_values
+
+ if info_type == BCF_HT_INT:
+ dst_type = BCF_BT_INT32
+ elif info_type == BCF_HT_REAL:
+ dst_type = BCF_BT_FLOAT
+ elif info_type == BCF_HT_STR:
+ dst_type = BCF_BT_CHAR
+ else:
+ raise ValueError('Unsupported INFO type')
+
+ bcf_object_to_array(values, valp, dst_type, alloc_len, vlen)
+
+ if bcf_update_info(hdr, r, bkey, valp, <int>alloc_len, info_type) < 0:
+ raise ValueError('Unable to update INFO values')
+
+
+cdef bcf_info_del_value(VariantRecord record, key):
+ if record is None:
+ raise ValueError('record must not be None')
+
+ cdef bcf_hdr_t *hdr = record.header.ptr
+ cdef bcf1_t *r = record.ptr
+ cdef ssize_t value_count
+ cdef int scalar
+
+ if bcf_unpack(r, BCF_UN_INFO) < 0:
+ raise ValueError('Error unpacking VariantRecord')
+
+ bkey = force_bytes(key)
+ cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey)
+
+ if not info:
+ raise KeyError(key)
+
+ bcf_get_value_count(record, BCF_HL_INFO, info.key, &value_count, &scalar)
+
+ if value_count <= 0:
+ null_value = ()
+ elif scalar:
+ null_value = None
+ else:
+ null_value = (None,)*value_count
+
+ bcf_info_set_value(record, bkey, null_value)
+
+
+cdef bcf_format_get_value(VariantRecordSample sample, key):
+ if sample is None:
+ raise ValueError('sample must not be None')
+
+ cdef bcf_hdr_t *hdr = sample.record.header.ptr
+ cdef bcf1_t *r = sample.record.ptr
+ cdef ssize_t count
+ cdef int scalar
+
+ if bcf_unpack(r, BCF_UN_ALL) < 0:
+ raise ValueError('Error unpacking VariantRecord')
+
+ bkey = force_bytes(key)
+ cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey)
+
+ if not fmt or not fmt.p:
+ raise KeyError('invalid FORMAT')
+
+ if is_gt_fmt(hdr, fmt.id):
+ return bcf_format_get_allele_indices(sample)
+
+ bcf_get_value_count(sample.record, BCF_HL_FMT, fmt.id, &count, &scalar)
+
+ if fmt.p and fmt.n and fmt.size:
+ return bcf_array_to_object(fmt.p + sample.index * fmt.size, fmt.type, fmt.n, count, scalar)
+ elif scalar:
+ return None
+ elif count <= 0:
+ return ()
+ else:
+ return (None,)*count
+
+
+cdef bcf_format_set_value(VariantRecordSample sample, key, value):
+ if sample is None:
+ raise ValueError('sample must not be None')
+
+ cdef bcf_hdr_t *hdr = sample.record.header.ptr
+ cdef bcf1_t *r = sample.record.ptr
+ cdef int fmt_id
+ cdef vdict_t *d
+ cdef khiter_t k
+ cdef int fmt_type, scalar, realloc, dst_type, vlen = 0
+ cdef ssize_t i, n, value_count, alloc_size, alloc_len, dst_size
+
+ if bcf_unpack(r, BCF_UN_ALL) < 0:
+ raise ValueError('Error unpacking VariantRecord')
+
+ bkey = force_bytes(key)
+ cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey)
+
+ if fmt:
+ fmt_id = fmt.id
+ else:
+ d = <vdict_t *>hdr.dict[BCF_DT_ID]
+ k = kh_get_vdict(d, bkey)
+
+ if k == kh_end(d) or kh_val_vdict(d, k).info[BCF_HL_FMT] & 0xF == 0xF:
+ raise KeyError('unknown format')
+
+ fmt_id = kh_val_vdict(d, k).id
+
+ if not check_header_id(hdr, BCF_HL_FMT, fmt_id):
+ raise ValueError('Invalid header')
+
+ fmt_type = bcf_hdr_id2type(hdr, BCF_HL_FMT, fmt_id)
+
+ if fmt_type == BCF_HT_FLAG:
+ raise ValueError('Flag types are not allowed on FORMATs')
+
+ if is_gt_fmt(hdr, fmt_id):
+ value = bcf_encode_alleles(sample.record, value)
+ # KBJ: GT field is considered to be a string by the VCF header but BCF represents it as INT.
+ fmt_type = BCF_HT_INT
+
+ values = bcf_check_values(sample.record, value, BCF_HL_FMT, fmt_type, fmt_id,
+ fmt.type if fmt else -1,
+ fmt.n if fmt else -1,
+ &value_count, &scalar, &realloc)
+
+ vlen = value_count < 0
+ value_count = len(values)
+
+ # If we can, write updated values to existing allocated storage
+ if fmt and not realloc:
+ r.d.indiv_dirty = 1
+ bcf_object_to_array(values, fmt.p + sample.index * fmt.size, fmt.type, fmt.n, vlen)
+ return
+
+ alloc_len = max(1, value_count)
+ if fmt and fmt.n > alloc_len:
+ alloc_len = fmt.n
+
+ n = bcf_hdr_nsamples(hdr)
+ new_values = bcf_empty_array(fmt_type, n*alloc_len, vlen)
+ cdef char *valp = <char *>new_values
+
+ if fmt_type == BCF_HT_INT:
+ dst_type = BCF_BT_INT32
+ dst_size = sizeof(int32_t) * alloc_len
+ elif fmt_type == BCF_HT_REAL:
+ dst_type = BCF_BT_FLOAT
+ dst_size = sizeof(float) * alloc_len
+ elif fmt_type == BCF_HT_STR:
+ dst_type = BCF_BT_CHAR
+ dst_size = sizeof(char) * alloc_len
+ else:
+ raise ValueError('Unsupported FORMAT type')
+
+ if fmt and n > 1:
+ for i in range(n):
+ bcf_copy_expand_array(fmt.p + i*fmt.size, fmt.type, fmt.n,
+ valp + i*dst_size, dst_type, alloc_len,
+ vlen)
+
+ bcf_object_to_array(values, valp + sample.index*dst_size, dst_type, alloc_len, vlen)
+
+ if bcf_update_format(hdr, r, bkey, valp, <int>(n*alloc_len), fmt_type) < 0:
+ raise ValueError('Unable to update format values')
+
+
+cdef bcf_format_del_value(VariantRecordSample sample, key):
+ if sample is None:
+ raise ValueError('sample must not be None')
+
+ cdef bcf_hdr_t *hdr = sample.record.header.ptr
+ cdef bcf1_t *r = sample.record.ptr
+ cdef ssize_t value_count
+ cdef int scalar
+
+ if bcf_unpack(r, BCF_UN_ALL) < 0:
+ raise ValueError('Error unpacking VariantRecord')
+
+ bkey = force_bytes(key)
+ cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey)
+
+ if not fmt or not fmt.p:
+ raise KeyError(key)
+
+ bcf_get_value_count(sample.record, BCF_HL_FMT, fmt.id, &value_count, &scalar)
+
+ if value_count <= 0:
+ null_value = ()
+ elif scalar:
+ null_value = None
+ else:
+ null_value = (None,)*value_count
+
+ bcf_format_set_value(sample, bkey, null_value)
+
+
+cdef bcf_format_get_allele_indices(VariantRecordSample sample):
+ if sample is None:
+ raise ValueError('sample must not be None')
+
+ cdef bcf_hdr_t *hdr = sample.record.header.ptr
+ cdef bcf1_t *r = sample.record.ptr
+ cdef int32_t n = bcf_hdr_nsamples(hdr)
+
+ if bcf_unpack(r, BCF_UN_ALL) < 0:
+ raise ValueError('Error unpacking VariantRecord')
+
+ if sample.index < 0 or sample.index >= n or not r.n_fmt:
+ return ()
+
+ cdef bcf_fmt_t *fmt0 = r.d.fmt
+ cdef int gt0 = is_gt_fmt(hdr, fmt0.id)
+
+ if not gt0 or not fmt0.n:
+ return ()
+
+ cdef int8_t *data8
+ cdef int16_t *data16
+ cdef int32_t *data32
+ cdef int32_t a, nalleles = r.n_allele
+ cdef list alleles = []
+
+ if fmt0.type == BCF_BT_INT8:
+ data8 = <int8_t *>(fmt0.p + sample.index * fmt0.size)
+ for i in range(fmt0.n):
+ if data8[i] == bcf_int8_vector_end:
+ break
+ elif data8[i] == bcf_gt_missing:
+ a = -1
+ else:
+ a = bcf_gt_allele(data8[i])
+ alleles.append(a if 0 <= a < nalleles else None)
+ elif fmt0.type == BCF_BT_INT16:
+ data16 = <int16_t *>(fmt0.p + sample.index * fmt0.size)
+ for i in range(fmt0.n):
+ if data16[i] == bcf_int16_vector_end:
+ break
+ elif data16[i] == bcf_gt_missing:
+ a = -1
+ else:
+ a = bcf_gt_allele(data16[i])
+ alleles.append(a if 0 <= a < nalleles else None)
+ elif fmt0.type == BCF_BT_INT32:
+ data32 = <int32_t *>(fmt0.p + sample.index * fmt0.size)
+ for i in range(fmt0.n):
+ if data32[i] == bcf_int32_vector_end:
+ break
+ elif data32[i] == bcf_gt_missing:
+ a = -1
+ else:
+ a = bcf_gt_allele(data32[i])
+ alleles.append(a if 0 <= a < nalleles else None)
+
+ return tuple(alleles)
+
+
+cdef bcf_format_get_alleles(VariantRecordSample sample):
+ if sample is None:
+ raise ValueError('sample must not be None')
+
+ cdef bcf_hdr_t *hdr = sample.record.header.ptr
+ cdef bcf1_t *r = sample.record.ptr
+ cdef int32_t nsamples = bcf_hdr_nsamples(hdr)
+
+ if bcf_unpack(r, BCF_UN_ALL) < 0:
+ raise ValueError('Error unpacking VariantRecord')
+
+ cdef int32_t nalleles = r.n_allele
+
+ if sample.index < 0 or sample.index >= nsamples or not r.n_fmt:
+ return ()
+
+ cdef bcf_fmt_t *fmt0 = r.d.fmt
+ cdef int gt0 = is_gt_fmt(hdr, fmt0.id)
+
+ if not gt0 or not fmt0.n:
+ return ()
+
+ cdef int32_t a
+ cdef int8_t *data8
+ cdef int16_t *data16
+ cdef int32_t *data32
+ alleles = []
+ if fmt0.type == BCF_BT_INT8:
+ data8 = <int8_t *>(fmt0.p + sample.index * fmt0.size)
+ for i in range(fmt0.n):
+ if data8[i] == bcf_int8_vector_end:
+ break
+ a = bcf_gt_allele(data8[i])
+ alleles.append(charptr_to_str(r.d.allele[a]) if 0 <= a < nalleles else None)
+ elif fmt0.type == BCF_BT_INT16:
+ data16 = <int16_t *>(fmt0.p + sample.index * fmt0.size)
+ for i in range(fmt0.n):
+ if data16[i] == bcf_int16_vector_end:
+ break
+ a = bcf_gt_allele(data16[i])
+ alleles.append(charptr_to_str(r.d.allele[a]) if 0 <= a < nalleles else None)
+ elif fmt0.type == BCF_BT_INT32:
+ data32 = <int32_t *>(fmt0.p + sample.index * fmt0.size)
+ for i in range(fmt0.n):
+ if data32[i] == bcf_int32_vector_end:
+ break
+ a = bcf_gt_allele(data32[i])
+ alleles.append(charptr_to_str(r.d.allele[a]) if 0 <= a < nalleles else None)
+ return tuple(alleles)
+
+
+cdef bint bcf_sample_get_phased(VariantRecordSample sample):
+ if sample is None:
+ raise ValueError('sample must not be None')
+
+ cdef bcf_hdr_t *hdr = sample.record.header.ptr
+ cdef bcf1_t *r = sample.record.ptr
+ cdef int32_t n = bcf_hdr_nsamples(hdr)
+
+ if bcf_unpack(r, BCF_UN_ALL) < 0:
+ raise ValueError('Error unpacking VariantRecord')
+
+ if sample.index < 0 or sample.index >= n or not r.n_fmt:
+ return False
+
+ cdef bcf_fmt_t *fmt0 = r.d.fmt
+ cdef int gt0 = is_gt_fmt(hdr, fmt0.id)
+
+ if not gt0 or not fmt0.n:
+ return False
+
+ cdef int8_t *data8
+ cdef int16_t *data16
+ cdef int32_t *data32
+
+ cdef bint phased = False
+
+ if fmt0.type == BCF_BT_INT8:
+ data8 = <int8_t *>(fmt0.p + sample.index * fmt0.size)
+ for i in range(fmt0.n):
+ if data8[i] == bcf_int8_vector_end:
+ break
+ elif data8[i] == bcf_int8_missing:
+ continue
+ elif i and not bcf_gt_is_phased(data8[i]):
+ return False
+ else:
+ phased = True
+ elif fmt0.type == BCF_BT_INT16:
+ data16 = <int16_t *>(fmt0.p + sample.index * fmt0.size)
+ for i in range(fmt0.n):
+ if data16[i] == bcf_int16_vector_end:
+ break
+ elif data16[i] == bcf_int16_missing:
+ continue
+ elif i and not bcf_gt_is_phased(data16[i]):
+ return False
+ else:
+ phased = True
+ elif fmt0.type == BCF_BT_INT32:
+ data32 = <int32_t *>(fmt0.p + sample.index * fmt0.size)
+ for i in range(fmt0.n):
+ if data32[i] == bcf_int32_vector_end:
+ break
+ elif data32[i] == bcf_int32_missing:
+ continue
+ elif i and not bcf_gt_is_phased(data32[i]):
+ return False
+ else:
+ phased = True
+
+ return phased
+
+
+cdef bcf_sample_set_phased(VariantRecordSample sample, bint phased):
+ if sample is None:
+ raise ValueError('sample must not be None')
+
+ cdef bcf_hdr_t *hdr = sample.record.header.ptr
+ cdef bcf1_t *r = sample.record.ptr
+ cdef int32_t n = bcf_hdr_nsamples(hdr)
+
+ if bcf_unpack(r, BCF_UN_ALL) < 0:
+ raise ValueError('Error unpacking VariantRecord')
+
+ if sample.index < 0 or sample.index >= n or not r.n_fmt:
+ return
+
+ cdef bcf_fmt_t *fmt0 = r.d.fmt
+ cdef int gt0 = is_gt_fmt(hdr, fmt0.id)
+
+ if not gt0 or not fmt0.n:
+ raise ValueError('Cannot set phased before genotype is set')
+
+ cdef int8_t *data8
+ cdef int16_t *data16
+ cdef int32_t *data32
+
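+ # Note: genotype values are packed as ((allele_index + 1) << 1) | phased,
+ # so masking off bit 0 below and OR-ing `phased` back in updates the
+ # phase flag without disturbing the stored allele index.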
+ if fmt0.type == BCF_BT_INT8:
+ data8 = <int8_t *>(fmt0.p + sample.index * fmt0.size)
+ for i in range(fmt0.n):
+ if data8[i] == bcf_int8_vector_end:
+ break
+ elif data8[i] == bcf_int8_missing:
+ continue
+ elif i:
+ data8[i] = (data8[i] & 0xFE) | phased
+ elif fmt0.type == BCF_BT_INT16:
+ data16 = <int16_t *>(fmt0.p + sample.index * fmt0.size)
+ for i in range(fmt0.n):
+ if data16[i] == bcf_int16_vector_end:
+ break
+ elif data16[i] == bcf_int16_missing:
+ continue
+ elif i:
+ data16[i] = (data16[i] & 0xFFFE) | phased
+ elif fmt0.type == BCF_BT_INT32:
+ data32 = <int32_t *>(fmt0.p + sample.index * fmt0.size)
+ for i in range(fmt0.n):
+ if data32[i] == bcf_int32_vector_end:
+ break
+ elif data32[i] == bcf_int32_missing:
+ continue
+ elif i:
+ data32[i] = (data32[i] & 0xFFFFFFFE) | phased
+
+
+########################################################################
+########################################################################
+## Variant Header objects
+########################################################################
+
+
+cdef bcf_header_remove_hrec(VariantHeader header, int i):
+ if header is None:
+ raise ValueError('header must not be None')
+
+ cdef bcf_hdr_t *hdr = header.ptr
+
+ if i < 0 or i >= hdr.nhrec:
+ raise ValueError('Invalid header record index')
+
+ cdef bcf_hrec_t *hrec = hdr.hrec[i]
+ hdr.nhrec -= 1
+
+ if i < hdr.nhrec:
+ memmove(&hdr.hrec[i], &hdr.hrec[i+1], (hdr.nhrec-i)*sizeof(bcf_hrec_t*))
+
+ bcf_hrec_destroy(hrec)
+ hdr.hrec[hdr.nhrec] = NULL
+ hdr.dirty = 1
+
+
+#FIXME: implement a full mapping interface
+#FIXME: passing bcf_hrec_t* is not safe, since we cannot control the
+# object lifetime.
+cdef class VariantHeaderRecord(object):
+ """header record from a :class:`VariantHeader` object"""
+ def __init__(self, *args, **kwargs):
+ raise TypeError('this class cannot be instantiated from Python')
+
+ @property
+ def type(self):
+ """header type: FILTER, INFO, FORMAT, CONTIG, STRUCTURED, or GENERIC"""
+ cdef bcf_hrec_t *r = self.ptr
+ if not r:
+ return None
+ return METADATA_TYPES[r.type]
+
+ @property
+ def key(self):
+ """header key (the part before '=', in FILTER/INFO/FORMAT/contig/fileformat etc.)"""
+ cdef bcf_hrec_t *r = self.ptr
+ return bcf_str_cache_get_charptr(r.key) if r and r.key else None
+
+ @property
+ def value(self):
+ """header value. Set only for generic lines, None for FILTER/INFO, etc."""
+ cdef bcf_hrec_t *r = self.ptr
+ return charptr_to_str(r.value) if r and r.value else None
+
+ @property
+ def attrs(self):
+ """sequence of additional header attributes"""
+ cdef bcf_hrec_t *r = self.ptr
+ if not r:
+ return ()
+ cdef int i
+ return tuple((bcf_str_cache_get_charptr(r.keys[i]) if r.keys[i] else None,
+ charptr_to_str(r.vals[i]) if r.vals[i] else None)
+ for i in range(r.nkeys))
+
+ def __len__(self):
+ cdef bcf_hrec_t *r = self.ptr
+ return r.nkeys if r else 0
+
+ def __bool__(self):
+ cdef bcf_hrec_t *r = self.ptr
+ return r != NULL and r.nkeys != 0
+
+ def __getitem__(self, key):
+ """get attribute value"""
+ cdef bcf_hrec_t *r = self.ptr
+ cdef int i
+ if r:
+ bkey = force_bytes(key)
+ for i in range(r.nkeys):
+ if r.keys[i] and r.keys[i] == bkey:
+ return charptr_to_str(r.vals[i]) if r.vals[i] else None
+ raise KeyError('cannot find metadata key')
+
+ def __iter__(self):
+ cdef bcf_hrec_t *r = self.ptr
+ if not r:
+ return
+ cdef int i
+ for i in range(r.nkeys):
+ if r.keys[i]:
+ yield bcf_str_cache_get_charptr(r.keys[i])
+
+ def get(self, key, default=None):
+ """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
+ try:
+ return self[key]
+ except KeyError:
+ return default
+
+ def __contains__(self, key):
+ try:
+ self[key]
+ except KeyError:
+ return False
+ else:
+ return True
+
+ def iterkeys(self):
+ """D.iterkeys() -> an iterator over the keys of D"""
+ return iter(self)
+
+ def itervalues(self):
+ """D.itervalues() -> an iterator over the values of D"""
+ cdef bcf_hrec_t *r = self.ptr
+ if not r:
+ return
+ cdef int i
+ for i in range(r.nkeys):
+ if r.keys[i]:
+ yield charptr_to_str(r.vals[i]) if r.vals[i] else None
+
+ def iteritems(self):
+ """D.iteritems() -> an iterator over the (key, value) items of D"""
+ cdef bcf_hrec_t *r = self.ptr
+ if not r:
+ return
+ cdef int i
+ for i in range(r.nkeys):
+ if r.keys[i]:
+ yield (bcf_str_cache_get_charptr(r.keys[i]), charptr_to_str(r.vals[i]) if r.vals[i] else None)
+
+ def keys(self):
+ """D.keys() -> list of D's keys"""
+ return list(self)
+
+ def items(self):
+ """D.items() -> list of D's (key, value) pairs, as 2-tuples"""
+ return list(self.iteritems())
+
+ def values(self):
+ """D.values() -> list of D's values"""
+ return list(self.itervalues())
+
+ # Mappings are not hashable by default, but subclasses can change this
+ __hash__ = None
+
+ #TODO: implement __richcmp__
+
+ def __str__(self):
+ cdef bcf_hrec_t *r = self.ptr
+
+ if not r:
+ raise ValueError('cannot convert deleted record to str')
+
+ cdef kstring_t hrec_str
+ hrec_str.l = hrec_str.m = 0
+ hrec_str.s = NULL
+
+ bcf_hrec_format(r, &hrec_str)
+
+ ret = charptr_to_str_w_len(hrec_str.s, hrec_str.l)
+
+ if hrec_str.m:
+ free(hrec_str.s)
+
+ return ret
+
+ # FIXME: Not safe -- causes trivial segfaults at the moment
+ def remove(self):
+ cdef bcf_hdr_t *hdr = self.header.ptr
+ cdef bcf_hrec_t *r = self.ptr
+ if not r:
+ return
+ assert(r.key)
+ cdef char *key = r.key if r.type == BCF_HL_GEN else r.value
+ print('Removing header type={} key={} value={} hdr={}'.format(METADATA_TYPES[r.type], r.key, r.value, key))
+ bcf_hdr_remove(hdr, r.type, key)
+ self.ptr = NULL
+
+
+cdef VariantHeaderRecord makeVariantHeaderRecord(VariantHeader header, bcf_hrec_t *hdr):
+ if not header:
+ raise ValueError('invalid VariantHeader')
+
+ if not hdr:
+ return None
+
+ cdef VariantHeaderRecord record = VariantHeaderRecord.__new__(VariantHeaderRecord)
+ record.header = header
+ record.ptr = hdr
+
+ return record
+
+
+cdef class VariantHeaderRecords(object):
+ """sequence of :class:`VariantHeaderRecord` object from a :class:`VariantHeader` object"""
+ def __init__(self, *args, **kwargs):
+ raise TypeError('this class cannot be instantiated from Python')
+
+ def __len__(self):
+ return self.header.ptr.nhrec
+
+ def __bool__(self):
+ return self.header.ptr.nhrec != 0
+
+ def __getitem__(self, index):
+ cdef int32_t i = index
+ if i < 0 or i >= self.header.ptr.nhrec:
+ raise IndexError('invalid header record index')
+ return makeVariantHeaderRecord(self.header, self.header.ptr.hrec[i])
+
+ def __iter__(self):
+ cdef int32_t i
+ for i in range(self.header.ptr.nhrec):
+ yield makeVariantHeaderRecord(self.header, self.header.ptr.hrec[i])
+
+ __hash__ = None
+
+
+cdef VariantHeaderRecords makeVariantHeaderRecords(VariantHeader header):
+ if not header:
+ raise ValueError('invalid VariantHeader')
+
+ cdef VariantHeaderRecords records = VariantHeaderRecords.__new__(VariantHeaderRecords)
+ records.header = header
+ return records
+
+
+cdef class VariantMetadata(object):
+ """filter, info or format metadata record from a :class:`VariantHeader` object"""
+ def __init__(self, *args, **kwargs):
+ raise TypeError('this class cannot be instantiated from Python')
+
+ @property
+ def name(self):
+ """metadata name"""
+ cdef bcf_hdr_t *hdr = self.header.ptr
+ return bcf_str_cache_get_charptr(hdr.id[BCF_DT_ID][self.id].key)
+
+ # Q: Should this be exposed?
+ @property
+ def id(self):
+ """metadata internal header id number"""
+ return self.id
+
+ @property
+ def number(self):
+ """metadata number (i.e. cardinality)"""
+ cdef bcf_hdr_t *hdr = self.header.ptr
+
+ if not check_header_id(hdr, self.type, self.id):
+ raise ValueError('Invalid header id')
+
+ if self.type == BCF_HL_FLT:
+ return None
+
+ cdef int l = bcf_hdr_id2length(hdr, self.type, self.id)
+ if l == BCF_VL_FIXED:
+ return bcf_hdr_id2number(hdr, self.type, self.id)
+ elif l == BCF_VL_VAR:
+ return '.'
+ else:
+ return METADATA_LENGTHS[l]
+
+ @property
+ def type(self):
+ """metadata value type"""
+ cdef bcf_hdr_t *hdr = self.header.ptr
+ if not check_header_id(hdr, self.type, self.id):
+ raise ValueError('Invalid header id')
+
+ if self.type == BCF_HL_FLT:
+ return None
+ return VALUE_TYPES[bcf_hdr_id2type(hdr, self.type, self.id)]
+
+ @property
+ def description(self):
+ """metadata description (or None if not set)"""
+ descr = self.record.get('Description')
+ if descr:
+ descr = descr.strip('"')
+ return force_str(descr)
+
+ @property
+ def record(self):
+ """:class:`VariantHeaderRecord` associated with this :class:`VariantMetadata` object"""
+ cdef bcf_hdr_t *hdr = self.header.ptr
+ if not check_header_id(hdr, self.type, self.id):
+ raise ValueError('Invalid header id')
+ cdef bcf_hrec_t *hrec = hdr.id[BCF_DT_ID][self.id].val.hrec[self.type]
+ if not hrec:
+ return None
+ return makeVariantHeaderRecord(self.header, hrec)
+
+ def remove_header(self):
+ cdef bcf_hdr_t *hdr = self.header.ptr
+ cdef const char *bkey = hdr.id[BCF_DT_ID][self.id].key
+ bcf_hdr_remove(hdr, self.type, bkey)
+
+
+cdef VariantMetadata makeVariantMetadata(VariantHeader header, int type, int id):
+ if not header:
+ raise ValueError('invalid VariantHeader')
+
+ if type != BCF_HL_FLT and type != BCF_HL_INFO and type != BCF_HL_FMT:
+ raise ValueError('invalid metadata type')
+
+ if id < 0 or id >= header.ptr.n[BCF_DT_ID]:
+ raise ValueError('invalid metadata id')
+
+ cdef VariantMetadata meta = VariantMetadata.__new__(VariantMetadata)
+ meta.header = header
+ meta.type = type
+ meta.id = id
+
+ return meta
+
+
+cdef class VariantHeaderMetadata(object):
+ """mapping from filter, info or format name to :class:`VariantMetadata` object"""
+ def __init__(self, *args, **kwargs):
+ raise TypeError('this class cannot be instantiated from Python')
+
+ def add(self, id, number, type, description, **kwargs):
+ """Add a new filter, info or format record"""
+ if id in self:
+ raise ValueError('Header already exists for id={}'.format(id))
+
+ if self.type == BCF_HL_FLT:
+ if number is not None:
+ raise ValueError('Number must be None when adding a filter')
+ if type is not None:
+ raise ValueError('Type must be None when adding a filter')
+
+ items = [('ID', id), ('Description', description)]
+ else:
+ if type not in VALUE_TYPES:
+ raise ValueError('unknown type specified: {}'.format(type))
+ if number is None:
+ number = '.'
+
+ items = [('ID', id),
+ ('Number', number),
+ ('Type', type),
+ ('Description', description)]
+
+ items += kwargs.items()
+ self.header.add_meta(METADATA_TYPES[self.type], items=items)
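+
+ # Illustrative usage, assuming a mutable `header` object:
+ # header.info.add('DP', 1, 'Integer', 'Raw read depth')
+ # header.filters.add('q10', None, None, 'Quality below 10')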
+
+ def __len__(self):
+ cdef bcf_hdr_t *hdr = self.header.ptr
+ cdef bcf_idpair_t *idpair
+ cdef int32_t i, n = 0
+
+ for i in range(hdr.n[BCF_DT_ID]):
+ idpair = hdr.id[BCF_DT_ID] + i
+ if idpair.key and idpair.val and idpair.val.info[self.type] & 0xF != 0xF:
+ n += 1
+ return n
+
+ def __bool__(self):
+ cdef bcf_hdr_t *hdr = self.header.ptr
+ cdef bcf_idpair_t *idpair
+ cdef int32_t i
+
+ for i in range(hdr.n[BCF_DT_ID]):
+ idpair = hdr.id[BCF_DT_ID] + i
+ if idpair.key and idpair.val and idpair.val.info[self.type] & 0xF != 0xF:
+ return True
+ return False
+
+ def __getitem__(self, key):
+ cdef bcf_hdr_t *hdr = self.header.ptr
+ cdef vdict_t *d = <vdict_t *>hdr.dict[BCF_DT_ID]
+
+ bkey = force_bytes(key)
+ cdef khiter_t k = kh_get_vdict(d, bkey)
+
+ if k == kh_end(d) or kh_val_vdict(d, k).info[self.type] & 0xF == 0xF:
+ raise KeyError('invalid key')
+
+ return makeVariantMetadata(self.header, self.type, kh_val_vdict(d, k).id)
+
+ def remove_header(self, key):
+ cdef bcf_hdr_t *hdr = self.header.ptr
+ cdef vdict_t *d = <vdict_t *>hdr.dict[BCF_DT_ID]
+
+ bkey = force_bytes(key)
+ cdef khiter_t k = kh_get_vdict(d, bkey)
+
+ if k == kh_end(d) or kh_val_vdict(d, k).info[self.type] & 0xF == 0xF:
+ raise KeyError('invalid key')
+
+ bcf_hdr_remove(hdr, self.type, bkey)
+ #bcf_hdr_sync(hdr)
+
+ def clear_header(self):
+ cdef bcf_hdr_t *hdr = self.header.ptr
+ bcf_hdr_remove(hdr, self.type, NULL)
+ #bcf_hdr_sync(hdr)
+
+ def __iter__(self):
+ cdef bcf_hdr_t *hdr = self.header.ptr
+ cdef bcf_idpair_t *idpair
+ cdef int32_t i
+
+ for i in range(hdr.n[BCF_DT_ID]):
+ idpair = hdr.id[BCF_DT_ID] + i
+ if idpair.key and idpair.val and idpair.val.info[self.type] & 0xF != 0xF:
+ yield bcf_str_cache_get_charptr(idpair.key)
+
+ def get(self, key, default=None):
+ """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
+ try:
+ return self[key]
+ except KeyError:
+ return default
+
+ def __contains__(self, key):
+ try:
+ self[key]
+ except KeyError:
+ return False
+ else:
+ return True
+
+ def iterkeys(self):
+ """D.iterkeys() -> an iterator over the keys of D"""
+ return iter(self)
+
+ def itervalues(self):
+ """D.itervalues() -> an iterator over the values of D"""
+ for key in self:
+ yield self[key]
+
+ def iteritems(self):
+ """D.iteritems() -> an iterator over the (key, value) items of D"""
+ for key in self:
+ yield (key, self[key])
+
+ def keys(self):
+ """D.keys() -> list of D's keys"""
+ return list(self)
+
+ def items(self):
+ """D.items() -> list of D's (key, value) pairs, as 2-tuples"""
+ return list(self.iteritems())
+
+ def values(self):
+ """D.values() -> list of D's values"""
+ return list(self.itervalues())
+
+ # Mappings are not hashable by default, but subclasses can change this
+ __hash__ = None
+
+ #TODO: implement __richcmp__
+
+
+cdef VariantHeaderMetadata makeVariantHeaderMetadata(VariantHeader header, int32_t type):
+ if not header:
+ raise ValueError('invalid VariantHeader')
+
+ cdef VariantHeaderMetadata meta = VariantHeaderMetadata.__new__(VariantHeaderMetadata)
+ meta.header = header
+ meta.type = type
+
+ return meta
+
+
+cdef class VariantContig(object):
+ """contig metadata from a :class:`VariantHeader`"""
+ def __init__(self, *args, **kwargs):
+ raise TypeError('this class cannot be instantiated from Python')
+
+ @property
+ def name(self):
+ """contig name"""
+ cdef bcf_hdr_t *hdr = self.header.ptr
+ return bcf_str_cache_get_charptr(hdr.id[BCF_DT_CTG][self.id].key)
+
+ @property
+ def id(self):
+ """contig internal id number"""
+ return self.id
+
+ @property
+ def length(self):
+ """contig length or None if not available"""
+ cdef bcf_hdr_t *hdr = self.header.ptr
+ cdef uint32_t length = hdr.id[BCF_DT_CTG][self.id].val.info[0]
+ return length if length else None
+
+ @property
+ def header(self):
+ """:class:`VariantHeaderRecord` associated with this :class:`VariantContig` object"""
+ cdef bcf_hdr_t *hdr = self.header.ptr
+ cdef bcf_hrec_t *hrec = hdr.id[BCF_DT_CTG][self.id].val.hrec[0]
+ return makeVariantHeaderRecord(self.header, hrec)
+
+ def remove_header(self):
+ cdef bcf_hdr_t *hdr = self.header.ptr
+ cdef const char *bkey = hdr.id[BCF_DT_CTG][self.id].key
+ bcf_hdr_remove(hdr, BCF_HL_CTG, bkey)
+
+
+cdef VariantContig makeVariantContig(VariantHeader header, int id):
+ if not header:
+ raise ValueError('invalid VariantHeader')
+
+ if id < 0 or id >= header.ptr.n[BCF_DT_CTG]:
+ raise ValueError('invalid contig id')
+
+ cdef VariantContig contig = VariantContig.__new__(VariantContig)
+ contig.header = header
+ contig.id = id
+
+ return contig
+
+
+cdef class VariantHeaderContigs(object):
+ """mapping from contig name or index to :class:`VariantContig` object."""
+ def __init__(self, *args, **kwargs):
+ raise TypeError('this class cannot be instantiated from Python')
+
+ def __len__(self):
+ cdef bcf_hdr_t *hdr = self.header.ptr
+ assert kh_size(<vdict_t *>hdr.dict[BCF_DT_CTG]) == hdr.n[BCF_DT_CTG]
+ return hdr.n[BCF_DT_CTG]
+
+ def __bool__(self):
+ cdef bcf_hdr_t *hdr = self.header.ptr
+ assert kh_size(<vdict_t *>hdr.dict[BCF_DT_CTG]) == hdr.n[BCF_DT_CTG]
+ return hdr.n[BCF_DT_CTG] != 0
+
+ def __getitem__(self, key):
+ cdef bcf_hdr_t *hdr = self.header.ptr
+ cdef int index
+
+ if isinstance(key, int):
+ index = key
+ if index < 0 or index >= hdr.n[BCF_DT_CTG]:
+ raise IndexError('invalid contig index')
+ return makeVariantContig(self.header, index)
+
+ cdef vdict_t *d = <vdict_t *>hdr.dict[BCF_DT_CTG]
+ bkey = force_bytes(key)
+ cdef khiter_t k = kh_get_vdict(d, bkey)
+
+ if k == kh_end(d):
+ raise KeyError('invalid contig')
+
+ cdef int id = kh_val_vdict(d, k).id
+
+ return makeVariantContig(self.header, id)
+
+ def remove_header(self, key):
+ cdef bcf_hdr_t *hdr = self.header.ptr
+ cdef int index
+ cdef const char *bkey
+ cdef vdict_t *d
+ cdef khiter_t k
+
+ if isinstance(key, int):
+ index = key
+ if index < 0 or index >= hdr.n[BCF_DT_CTG]:
+ raise IndexError('invalid contig index')
+ bkey = hdr.id[BCF_DT_CTG][index].key
+ else:
+ d = <vdict_t *>hdr.dict[BCF_DT_CTG]
+ key = force_bytes(key)
+ if kh_get_vdict(d, key) == kh_end(d):
+ raise KeyError('invalid contig')
+ bkey = key
+
+ bcf_hdr_remove(hdr, BCF_HL_CTG, bkey)
+
+ def clear_header(self):
+ cdef bcf_hdr_t *hdr = self.header.ptr
+ bcf_hdr_remove(hdr, BCF_HL_CTG, NULL)
+ #bcf_hdr_sync(hdr)
+
+ def __iter__(self):
+ cdef bcf_hdr_t *hdr = self.header.ptr
+ cdef vdict_t *d = <vdict_t *>hdr.dict[BCF_DT_CTG]
+ cdef uint32_t n = kh_size(d)
+
+ assert n == hdr.n[BCF_DT_CTG]
+
+ for i in range(n):
+ yield bcf_str_cache_get_charptr(bcf_hdr_id2name(hdr, i))
+
+ def get(self, key, default=None):
+ """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
+ try:
+ return self[key]
+ except KeyError:
+ return default
+
+ def __contains__(self, key):
+ try:
+ self[key]
+ except KeyError:
+ return False
+ else:
+ return True
+
+ def iterkeys(self):
+ """D.iterkeys() -> an iterator over the keys of D"""
+ return iter(self)
+
+ def itervalues(self):
+ """D.itervalues() -> an iterator over the values of D"""
+ for key in self:
+ yield self[key]
+
+ def iteritems(self):
+ """D.iteritems() -> an iterator over the (key, value) items of D"""
+ for key in self:
+ yield (key, self[key])
+
+ def keys(self):
+ """D.keys() -> list of D's keys"""
+ return list(self)
+
+ def items(self):
+ """D.items() -> list of D's (key, value) pairs, as 2-tuples"""
+ return list(self.iteritems())
+
+ def values(self):
+ """D.values() -> list of D's values"""
+ return list(self.itervalues())
+
+ # Mappings are not hashable by default, but subclasses can change this
+ __hash__ = None
+
+ #TODO: implement __richcmp__
+
+ def add(self, id, **kwargs):
+ """Add a new contig record"""
+ if id in self:
+ raise ValueError('Header already exists for contig {}'.format(id))
+
+ items = [('ID', id)] + list(kwargs.items())
+ self.header.add_meta('contig', items=items)
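+
+ # Illustrative usage: header.contigs.add('chr1', length=248956422)
+ # produces a ##contig=<ID=chr1,length=248956422> header line.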
+
+
+cdef VariantHeaderContigs makeVariantHeaderContigs(VariantHeader header):
+ if not header:
+ raise ValueError('invalid VariantHeader')
+
+ cdef VariantHeaderContigs contigs = VariantHeaderContigs.__new__(VariantHeaderContigs)
+ contigs.header = header
+
+ return contigs
+
+
+cdef class VariantHeaderSamples(object):
+ """sequence of sample names from a :class:`VariantHeader` object"""
+ def __init__(self, *args, **kwargs):
+ raise TypeError('this class cannot be instantiated from Python')
+
+ def __len__(self):
+ return bcf_hdr_nsamples(self.header.ptr)
+
+ def __bool__(self):
+ return bcf_hdr_nsamples(self.header.ptr) != 0
+
+ def __getitem__(self, index):
+ cdef bcf_hdr_t *hdr = self.header.ptr
+ cdef int32_t n = bcf_hdr_nsamples(hdr)
+ cdef int32_t i = index
+
+ if i < 0 or i >= n:
+ raise IndexError('invalid sample index')
+
+ return charptr_to_str(hdr.samples[i])
+
+ def __iter__(self):
+ cdef bcf_hdr_t *hdr = self.header.ptr
+ cdef int32_t i, n = bcf_hdr_nsamples(hdr)
+
+ for i in range(n):
+ yield charptr_to_str(hdr.samples[i])
+
+ def __contains__(self, key):
+ cdef bcf_hdr_t *hdr = self.header.ptr
+ cdef vdict_t *d = <vdict_t *>hdr.dict[BCF_DT_SAMPLE]
+ bkey = force_bytes(key)
+ cdef khiter_t k = kh_get_vdict(d, bkey)
+
+ return k != kh_end(d)
+
+ # Mappings are not hashable by default, but subclasses can change this
+ __hash__ = None
+
+ #TODO: implement __richcmp__
+
+ def add(self, name):
+ """Add a new sample"""
+ self.header.add_sample(name)
+
+
+cdef VariantHeaderSamples makeVariantHeaderSamples(VariantHeader header):
+ if not header:
+ raise ValueError('invalid VariantHeader')
+
+ cdef VariantHeaderSamples samples = VariantHeaderSamples.__new__(VariantHeaderSamples)
+ samples.header = header
+
+ return samples
+
+
+cdef class VariantHeader(object):
+ """header information for a :class:`VariantFile` object"""
+ #FIXME: Add structured proxy
+ #FIXME: Add generic proxy
+ #FIXME: Add mutable methods
+
+ # See makeVariantHeader for C constructor
+ def __cinit__(self):
+ self.ptr = NULL
+
+ # Python constructor
+ def __init__(self):
+ self.ptr = bcf_hdr_init(b'w')
+ if not self.ptr:
+ raise ValueError('cannot create VariantHeader')
+
+ def __dealloc__(self):
+ if self.ptr:
+ bcf_hdr_destroy(self.ptr)
+ self.ptr = NULL
+
+ def __bool__(self):
+ # self.ptr == NULL should be impossible
+ return self.ptr != NULL
+
+ def copy(self):
+ return makeVariantHeader(bcf_hdr_dup(self.ptr))
+
+ def merge(self, VariantHeader header):
+ if header is None:
+ raise ValueError('header must not be None')
+ bcf_hdr_merge(self.ptr, header.ptr)
+
+ @property
+ def version(self):
+ """VCF version"""
+ return force_str(bcf_hdr_get_version(self.ptr))
+
+ @property
+ def samples(self):
+ """samples (:class:`VariantHeaderSamples`)"""
+ return makeVariantHeaderSamples(self)
+
+ @property
+ def records(self):
+ """header records (:class:`VariantHeaderRecords`)"""
+ return makeVariantHeaderRecords(self)
+
+ @property
+ def contigs(self):
+ """contig information (:class:`VariantHeaderContigs`)"""
+ return makeVariantHeaderContigs(self)
+
+ @property
+ def filters(self):
+ """filter metadata (:class:`VariantHeaderMetadata`)"""
+ return makeVariantHeaderMetadata(self, BCF_HL_FLT)
+
+ @property
+ def info(self):
+ """info metadata (:class:`VariantHeaderMetadata`)"""
+ return makeVariantHeaderMetadata(self, BCF_HL_INFO)
+
+ @property
+ def formats(self):
+ """format metadata (:class:`VariantHeaderMetadata`)"""
+ return makeVariantHeaderMetadata(self, BCF_HL_FMT)
+
+ @property
+ def alts(self):
+ """alt metadata (:class:`dict` ID->record).
+
+ The data returned is just a snapshot of the alt records: the dict
+ is rebuilt every time the property is accessed, and modifications
+ to it will not be reflected in the header metadata, and vice versa.
+
+ i.e. it is just a dict that reflects the state of the alt records
+ at the time it is created.
+ """
+ return {record['ID']:record for record in self.records
+ if record.key.upper() == 'ALT' }
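+
+ # Note, for illustration: given a header containing
+ # ##ALT=<ID=DEL,Description="Deletion">, header.alts['DEL'] returns the
+ # corresponding VariantHeaderRecord; mutating the returned dict has no
+ # effect on the header itself.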
+
+ # only safe to do when opening an htsfile
+ cdef _subset_samples(self, include_samples):
+ keep_samples = set(self.samples)
+ include_samples = set(include_samples)
+ missing_samples = include_samples - keep_samples
+ keep_samples &= include_samples
+
+ if missing_samples:
+ # FIXME: add specialized exception with payload
+ raise ValueError(
+ 'missing {:d} requested samples'.format(
+ len(missing_samples)))
+
+ keep_samples = force_bytes(','.join(keep_samples))
+ cdef char *keep = <char *>keep_samples if keep_samples else NULL
+ cdef ret = bcf_hdr_set_samples(self.ptr, keep, 0)
+
+ if ret != 0:
+ raise ValueError(
+ 'bcf_hdr_set_samples failed: ret = {}'.format(ret))
+
+ def __str__(self):
+ cdef int hlen
+ cdef char *hstr = bcf_hdr_fmt_text(self.ptr, 0, &hlen)
+
+ try:
+ return charptr_to_str_w_len(hstr, hlen)
+ finally:
+ free(hstr)
+
+ cpdef VariantRecord new_record(self):
+ """Create a new empty VariantRecord"""
+ r = makeVariantRecord(self, bcf_init())
+ r.ptr.n_sample = bcf_hdr_nsamples(self.ptr)
+ return r
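+
+ # Illustrative usage, assuming 'chr1' is a contig declared in this
+ # header:
+ # rec = header.new_record()
+ # rec.chrom = 'chr1'
+ # rec.pos = 100
+ # rec.alleles = ('A', 'T')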
+
+ def add_record(self, VariantHeaderRecord record):
+ """Add an existing :class:`VariantHeaderRecord` to this header"""
+ if record is None:
+ raise ValueError('record must not be None')
+
+ cdef bcf_hrec_t *hrec = bcf_hrec_dup(record.ptr)
+
+ bcf_hdr_add_hrec(self.ptr, hrec)
+
+ if self.ptr.dirty:
+ bcf_hdr_sync(self.ptr)
+
+ def add_line(self, line):
+ """Add a metadata line to this header"""
+ bline = force_bytes(line)
+ if bcf_hdr_append(self.ptr, bline) < 0:
+ raise ValueError('invalid header line')
+
+ if self.ptr.dirty:
+ bcf_hdr_sync(self.ptr)
+
+ def add_meta(self, key, value=None, items=None):
+ """Add metadata to this header"""
+ if not ((value is not None) ^ (items is not None)):
+ raise ValueError('either value or items must be specified, but not both')
+
+ cdef bcf_hrec_t *hrec = <bcf_hrec_t*>calloc(1, sizeof(bcf_hrec_t))
+ cdef int quoted
+
+ try:
+ key = force_bytes(key)
+ hrec.key = strdup(key)
+
+ if value is not None:
+ hrec.value = strdup(force_bytes(value))
+ else:
+ for key, value in items:
+ key = force_bytes(key)
+ bcf_hrec_add_key(hrec, key, <int>len(key))
+
+ value = force_bytes(str(value))
+ quoted = strpbrk(value, ' ;,"\t<>') != NULL
+ bcf_hrec_set_val(hrec, hrec.nkeys-1, value, <int>len(value), quoted)
+ except:
+ bcf_hrec_destroy(hrec)
+ raise
+
+ bcf_hdr_add_hrec(self.ptr, hrec)
+
+ if self.ptr.dirty:
+ bcf_hdr_sync(self.ptr)
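+
+ # Illustrative usage -- a structured line:
+ # header.add_meta('INFO', items=[('ID', 'AF'), ('Number', 'A'),
+ # ('Type', 'Float'), ('Description', 'Allele frequency')])
+ # or a generic key=value line:
+ # header.add_meta('source', value='pysam')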
+
+ def add_sample(self, name):
+ """Add a new sample to this header"""
+ bname = force_bytes(name)
+ if bcf_hdr_add_sample(self.ptr, bname) < 0:
+ raise ValueError('Duplicated sample name: {}'.format(name))
+ if self.ptr.dirty:
+ bcf_hdr_sync(self.ptr)
+
+
+cdef VariantHeader makeVariantHeader(bcf_hdr_t *hdr):
+ if not hdr:
+ raise ValueError('cannot create VariantHeader')
+
+ cdef VariantHeader header = VariantHeader.__new__(VariantHeader)
+ header.ptr = hdr
+
+ return header
+
+
+########################################################################
+########################################################################
+## Variant Record objects
+########################################################################
+
+cdef class VariantRecordFilter(object):
+ """Filters set on a :class:`VariantRecord` object, presented as a mapping from
+ filter index or name to :class:`VariantMetadata` object"""
+ def __init__(self, *args, **kwargs):
+ raise TypeError('this class cannot be instantiated from Python')
+
+ def __len__(self):
+ return self.record.ptr.d.n_flt
+
+ def __bool__(self):
+ return self.record.ptr.d.n_flt != 0
+
+ def __getitem__(self, key):
+ cdef bcf_hdr_t *hdr = self.record.header.ptr
+ cdef bcf1_t *r = self.record.ptr
+ cdef int index, id
+ cdef int n = r.d.n_flt
+
+ if isinstance(key, int):
+ index = key
+
+ if index < 0 or index >= n:
+ raise IndexError('invalid filter index')
+
+ id = r.d.flt[index]
+ else:
+ if key == '.':
+ key = 'PASS'
+
+ bkey = force_bytes(key)
+ id = bcf_hdr_id2int(hdr, BCF_DT_ID, bkey)
+
+ if not check_header_id(hdr, BCF_HL_FLT, id) or not bcf_has_filter(hdr, r, bkey):
+ raise KeyError('Invalid filter')
+
+ return makeVariantMetadata(self.record.header, BCF_HL_FLT, id)
+
+ def add(self, key):
+ """Add a new filter"""
+ cdef bcf_hdr_t *hdr = self.record.header.ptr
+ cdef bcf1_t *r = self.record.ptr
+ cdef int id
+
+ if key == '.':
+ key = 'PASS'
+
+ bkey = force_bytes(key)
+ id = bcf_hdr_id2int(hdr, BCF_DT_ID, bkey)
+
+ if not check_header_id(hdr, BCF_HL_FLT, id):
+ raise KeyError('Invalid filter')
+
+ bcf_add_filter(hdr, r, id)
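+
+ # Illustrative usage: rec.filter.add('q10') sets the q10 filter on
+ # the record; the name must already be declared as a FILTER in the
+ # header.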
+
+ def __delitem__(self, key):
+ cdef bcf_hdr_t *hdr = self.record.header.ptr
+ cdef bcf1_t *r = self.record.ptr
+ cdef int index, id
+ cdef int n = r.d.n_flt
+
+ if isinstance(key, int):
+ index = key
+
+ if index < 0 or index >= n:
+ raise IndexError('invalid filter index')
+
+ id = r.d.flt[index]
+ else:
+ if key == '.':
+ key = 'PASS'
+
+ bkey = force_bytes(key)
+ id = bcf_hdr_id2int(hdr, BCF_DT_ID, bkey)
+
+ if not check_header_id(hdr, BCF_HL_FLT, id) or not bcf_has_filter(hdr, r, bkey):
+ raise KeyError('Invalid filter')
+
+ bcf_remove_filter(hdr, r, id, 0)
+
+ def clear(self):
+ """Clear all filters"""
+ cdef bcf1_t *r = self.record.ptr
+ r.d.shared_dirty |= BCF1_DIRTY_FLT
+ r.d.n_flt = 0
+
+ def __iter__(self):
+ cdef bcf_hdr_t *hdr = self.record.header.ptr
+ cdef bcf1_t *r = self.record.ptr
+ cdef int i
+
+ for i in range(r.d.n_flt):
+ yield bcf_str_cache_get_charptr(bcf_hdr_int2id(hdr, BCF_DT_ID, r.d.flt[i]))
+
+ def get(self, key, default=None):
+ """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
+ try:
+ return self[key]
+ except KeyError:
+ return default
+
+ def __contains__(self, key):
+ cdef bcf_hdr_t *hdr = self.record.header.ptr
+ cdef bcf1_t *r = self.record.ptr
+ bkey = force_bytes(key)
+ return bcf_has_filter(hdr, r, bkey) == 1
+
+ def iterkeys(self):
+ """D.iterkeys() -> an iterator over the keys of D"""
+ return iter(self)
+
+ def itervalues(self):
+ """D.itervalues() -> an iterator over the values of D"""
+ for key in self:
+ yield self[key]
+
+ def iteritems(self):
+ """D.iteritems() -> an iterator over the (key, value) items of D"""
+ for key in self:
+ yield (key, self[key])
+
+ def keys(self):
+ """D.keys() -> list of D's keys"""
+ return list(self)
+
+ def items(self):
+ """D.items() -> list of D's (key, value) pairs, as 2-tuples"""
+ return list(self.iteritems())
+
+ def values(self):
+ """D.values() -> list of D's values"""
+ return list(self.itervalues())
+
+ # Mappings are not hashable by default, but subclasses can change this
+ __hash__ = None
+
+ #TODO: implement __richcmp__
+
+
+cdef VariantRecordFilter makeVariantRecordFilter(VariantRecord record):
+ if not record:
+ raise ValueError('invalid VariantRecord')
+
+ cdef VariantRecordFilter filter = VariantRecordFilter.__new__(VariantRecordFilter)
+ filter.record = record
+
+ return filter
+
+
+cdef class VariantRecordFormat(object):
+ """Format data present for each sample in a :class:`VariantRecord` object,
+ presented as a mapping from format name to :class:`VariantMetadata` object."""
+ def __init__(self, *args, **kwargs):
+ raise TypeError('this class cannot be instantiated from Python')
+
+ def __len__(self):
+ cdef bcf_hdr_t *hdr = self.record.header.ptr
+ cdef bcf1_t *r = self.record.ptr
+ cdef int i, n = 0
+
+ for i in range(r.n_fmt):
+ if r.d.fmt[i].p:
+ n += 1
+ return n
+
+ def __bool__(self):
+ cdef bcf_hdr_t *hdr = self.record.header.ptr
+ cdef bcf1_t *r = self.record.ptr
+ cdef int i
+
+ for i in range(r.n_fmt):
+ if r.d.fmt[i].p:
+ return True
+ return False
+
+ def __getitem__(self, key):
+ cdef bcf_hdr_t *hdr = self.record.header.ptr
+ cdef bcf1_t *r = self.record.ptr
+
+ bkey = force_bytes(key)
+ cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey)
+
+ if not fmt or not fmt.p:
+ raise KeyError('unknown format')
+
+ return makeVariantMetadata(self.record.header, BCF_HL_FMT, fmt.id)
+
+ def __delitem__(self, key):
+ cdef bcf_hdr_t *hdr = self.record.header.ptr
+ cdef bcf1_t *r = self.record.ptr
+
+ bkey = force_bytes(key)
+ cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey)
+
+ if not fmt or not fmt.p:
+ raise KeyError('unknown format')
+
+ if bcf_update_format(hdr, r, bkey, fmt.p, 0, fmt.type) < 0:
+ raise ValueError('Unable to delete FORMAT')
+
+ def clear(self):
+ """Clear all formats for all samples within the associated
+ :class:`VariantRecord` instance"""
+ cdef bcf_hdr_t *hdr = self.record.header.ptr
+ cdef bcf1_t *r = self.record.ptr
+ cdef bcf_fmt_t *fmt
+ cdef const char *key
+ cdef int i
+
+ for i in reversed(range(r.n_fmt)):
+ fmt = &r.d.fmt[i]
+ if fmt.p:
+ key = bcf_hdr_int2id(hdr, BCF_DT_ID, fmt.id)
+ if bcf_update_format(hdr, r, key, fmt.p, 0, fmt.type) < 0:
+ raise ValueError('Unable to delete FORMAT')
+
+ def __iter__(self):
+ cdef bcf_hdr_t *hdr = self.record.header.ptr
+ cdef bcf1_t *r = self.record.ptr
+ cdef bcf_fmt_t *fmt
+ cdef int i
+
+ for i in range(r.n_fmt):
+ fmt = &r.d.fmt[i]
+ if fmt.p:
+ yield bcf_str_cache_get_charptr(bcf_hdr_int2id(hdr, BCF_DT_ID, fmt.id))
+
+ def get(self, key, default=None):
+ """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
+ try:
+ return self[key]
+ except KeyError:
+ return default
+
+ def __contains__(self, key):
+ cdef bcf_hdr_t *hdr = self.record.header.ptr
+ cdef bcf1_t *r = self.record.ptr
+ bkey = force_bytes(key)
+ cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey)
+ return fmt != NULL and fmt.p != NULL
+
+ def iterkeys(self):
+ """D.iterkeys() -> an iterator over the keys of D"""
+ return iter(self)
+
+ def itervalues(self):
+ """D.itervalues() -> an iterator over the values of D"""
+ for key in self:
+ yield self[key]
+
+ def iteritems(self):
+ """D.iteritems() -> an iterator over the (key, value) items of D"""
+ for key in self:
+ yield (key, self[key])
+
+ def keys(self):
+ """D.keys() -> list of D's keys"""
+ return list(self)
+
+ def items(self):
+ """D.items() -> list of D's (key, value) pairs, as 2-tuples"""
+ return list(self.iteritems())
+
+ def values(self):
+ """D.values() -> list of D's values"""
+ return list(self.itervalues())
+
+ # Mappings are not hashable by default, but subclasses can change this
+ __hash__ = None
+
+ #TODO: implement __richcmp__
+
+
+cdef VariantRecordFormat makeVariantRecordFormat(VariantRecord record):
+ if not record:
+ raise ValueError('invalid VariantRecord')
+
+ cdef VariantRecordFormat format = VariantRecordFormat.__new__(VariantRecordFormat)
+ format.record = record
+
+ return format
+
+
+#TODO: Add a getmeta method to return the corresponding VariantMetadata?
+cdef class VariantRecordInfo(object):
+ """Info data stored in a :class:`VariantRecord` object, presented as a
+ mapping from info metadata name to value."""
+
+ def __init__(self, *args, **kwargs):
+ raise TypeError('this class cannot be instantiated from Python')
+
+ def __len__(self):
+ return self.record.ptr.n_info
+
+ def __bool__(self):
+ return self.record.ptr.n_info != 0
+
+ def __getitem__(self, key):
+ cdef bcf_hdr_t *hdr = self.record.header.ptr
+ cdef bcf1_t *r = self.record.ptr
+ cdef vdict_t *d
+ cdef khiter_t k
+ cdef int info_id
+
+ if bcf_unpack(r, BCF_UN_INFO) < 0:
+ raise ValueError('Error unpacking VariantRecord')
+
+ bkey = force_bytes(key)
+ cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey)
+
+ if not info:
+ d = <vdict_t *>hdr.dict[BCF_DT_ID]
+ k = kh_get_vdict(d, bkey)
+
+ if k == kh_end(d) or kh_val_vdict(d, k).info[BCF_HL_INFO] & 0xF == 0xF:
+ raise KeyError('Unknown INFO field: {}'.format(key))
+
+ info_id = kh_val_vdict(d, k).id
+ else:
+ info_id = info.key
+
+ if not check_header_id(hdr, BCF_HL_INFO, info_id):
+ raise ValueError('Invalid header')
+
+ if bcf_hdr_id2type(hdr, BCF_HL_INFO, info_id) == BCF_HT_FLAG:
+ return info != NULL and info.vptr != NULL
+
+ if not info or not info.vptr:
+ raise KeyError('Invalid INFO field: {}'.format(key))
+
+ return bcf_info_get_value(self.record, info)
+
+ def __setitem__(self, key, value):
+ bcf_info_set_value(self.record, key, value)
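+ # Illustrative usage (the key must be declared as an INFO field in
+ # the header): rec.info['DP'] = 20, or rec.info['AF'] = (0.5,) for
+ # a Number=A field.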
+
+ def __delitem__(self, key):
+ cdef bcf_hdr_t *hdr = self.record.header.ptr
+ cdef bcf1_t *r = self.record.ptr
+
+ if bcf_unpack(r, BCF_UN_INFO) < 0:
+ raise ValueError('Error unpacking VariantRecord')
+
+ bkey = force_bytes(key)
+ cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey)
+
+ if not info or not info.vptr:
+ raise KeyError('Unknown INFO field: {}'.format(key))
+
+ if bcf_update_info(hdr, r, bkey, NULL, 0, info.type) < 0:
+ raise ValueError('Unable to delete INFO')
+
+ def clear(self):
+ """Clear all info data"""
+ cdef bcf_hdr_t *hdr = self.record.header.ptr
+ cdef bcf1_t *r = self.record.ptr
+ cdef bcf_info_t *info
+ cdef const char *key
+ cdef int i
+
+ if bcf_unpack(r, BCF_UN_INFO) < 0:
+ raise ValueError('Error unpacking VariantRecord')
+
+ for i in range(r.n_info):
+ info = &r.d.info[i]
+ if info and info.vptr:
+ key = bcf_hdr_int2id(hdr, BCF_DT_ID, info.key)
+ if bcf_update_info(hdr, r, key, NULL, 0, info.type) < 0:
+ raise ValueError('Unable to delete INFO')
+
+ def __iter__(self):
+ cdef bcf_hdr_t *hdr = self.record.header.ptr
+ cdef bcf1_t *r = self.record.ptr
+ cdef bcf_info_t *info
+ cdef int i
+
+ for i in range(r.n_info):
+ info = &r.d.info[i]
+ if info and info.vptr:
+ yield bcf_str_cache_get_charptr(bcf_hdr_int2id(hdr, BCF_DT_ID, info.key))
+
+ def get(self, key, default=None):
+ """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
+ try:
+ return self[key]
+ except KeyError:
+ return default
+
+ def __contains__(self, key):
+ cdef bcf_hdr_t *hdr = self.record.header.ptr
+ cdef bcf1_t *r = self.record.ptr
+
+ if bcf_unpack(r, BCF_UN_INFO) < 0:
+ raise ValueError('Error unpacking VariantRecord')
+
+ bkey = force_bytes(key)
+ cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey)
+
+ return info != NULL and info.vptr != NULL
+
+ def iterkeys(self):
+ """D.iterkeys() -> an iterator over the keys of D"""
+ return iter(self)
+
+ def itervalues(self):
+ """D.itervalues() -> an iterator over the values of D"""
+ cdef bcf1_t *r = self.record.ptr
+ cdef bcf_info_t *info
+ cdef int i
+
+ for i in range(r.n_info):
+ info = &r.d.info[i]
+ if info and info.vptr:
+ yield bcf_info_get_value(self.record, info)
+
+ def iteritems(self):
+ """D.iteritems() -> an iterator over the (key, value) items of D"""
+ cdef bcf_hdr_t *hdr = self.record.header.ptr
+ cdef bcf1_t *r = self.record.ptr
+ cdef bcf_info_t *info
+ cdef int i
+
+ for i in range(r.n_info):
+ info = &r.d.info[i]
+ if info and info.vptr:
+ key = bcf_hdr_int2id(hdr, BCF_DT_ID, info.key)
+ value = bcf_info_get_value(self.record, info)
+ yield bcf_str_cache_get_charptr(key), value
+
+ def keys(self):
+ """D.keys() -> list of D's keys"""
+ return list(self)
+
+ def items(self):
+ """D.items() -> list of D's (key, value) pairs, as 2-tuples"""
+ return list(self.iteritems())
+
+ def values(self):
+ """D.values() -> list of D's values"""
+ return list(self.itervalues())
+
+ # Mappings are not hashable by default, but subclasses can change this
+ __hash__ = None
+
+ #TODO: implement __richcmp__
+
+
+cdef VariantRecordInfo makeVariantRecordInfo(VariantRecord record):
+ if not record:
+ raise ValueError('invalid VariantRecord')
+
+ cdef VariantRecordInfo info = VariantRecordInfo.__new__(VariantRecordInfo)
+ info.record = record
+
+ return info
+
+
+cdef class VariantRecordSamples(object):
+ """mapping from sample index or name to :class:`VariantRecordSample` object."""
+ def __init__(self, *args, **kwargs):
+ raise TypeError('this class cannot be instantiated from Python')
+
+ def __len__(self):
+ return bcf_hdr_nsamples(self.record.header.ptr)
+
+ def __bool__(self):
+ return bcf_hdr_nsamples(self.record.header.ptr) != 0
+
+ def __getitem__(self, key):
+ cdef bcf_hdr_t *hdr = self.record.header.ptr
+ cdef bcf1_t *r = self.record.ptr
+ cdef int n = bcf_hdr_nsamples(hdr)
+ cdef int sample_index
+ cdef vdict_t *d
+ cdef khiter_t k
+
+ if isinstance(key, int):
+ sample_index = key
+ else:
+ bkey = force_bytes(key)
+ sample_index = bcf_hdr_id2int(hdr, BCF_DT_SAMPLE, bkey)
+ if sample_index < 0:
+ raise KeyError('invalid sample name')
+
+ if sample_index < 0 or sample_index >= n:
+ raise IndexError('invalid sample index')
+
+ return makeVariantRecordSample(self.record, sample_index)
+
+ def __iter__(self):
+ cdef bcf_hdr_t *hdr = self.record.header.ptr
+ cdef bcf1_t *r = self.record.ptr
+ cdef int32_t i, n = bcf_hdr_nsamples(hdr)
+
+ for i in range(n):
+ yield charptr_to_str(hdr.samples[i])
+
+ def get(self, key, default=None):
+ """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
+ try:
+ return self[key]
+ except KeyError:
+ return default
+
+ def __contains__(self, key):
+ cdef bcf_hdr_t *hdr = self.record.header.ptr
+ cdef bcf1_t *r = self.record.ptr
+ cdef int n = bcf_hdr_nsamples(hdr)
+ cdef int sample_index
+ cdef vdict_t *d
+ cdef khiter_t k
+
+ if isinstance(key, int):
+ sample_index = key
+ else:
+ bkey = force_bytes(key)
+ sample_index = bcf_hdr_id2int(hdr, BCF_DT_SAMPLE, bkey)
+ if sample_index < 0:
+ raise KeyError('invalid sample name')
+
+ return 0 <= sample_index < n
+
+ def iterkeys(self):
+ """D.iterkeys() -> an iterator over the keys of D"""
+ return iter(self)
+
+ def itervalues(self):
+ """D.itervalues() -> an iterator over the values of D"""
+ cdef bcf_hdr_t *hdr = self.record.header.ptr
+ cdef bcf1_t *r = self.record.ptr
+ cdef int32_t i, n = bcf_hdr_nsamples(hdr)
+
+ for i in range(n):
+ yield makeVariantRecordSample(self.record, i)
+
+ def iteritems(self):
+ """D.iteritems() -> an iterator over the (key, value) items of D"""
+ cdef bcf_hdr_t *hdr = self.record.header.ptr
+ cdef bcf1_t *r = self.record.ptr
+ cdef int32_t i, n = bcf_hdr_nsamples(hdr)
+
+ for i in range(n):
+ yield (charptr_to_str(hdr.samples[i]), makeVariantRecordSample(self.record, i))
+
+ def keys(self):
+ """D.keys() -> list of D's keys"""
+ return list(self)
+
+ def items(self):
+ """D.items() -> list of D's (key, value) pairs, as 2-tuples"""
+ return list(self.iteritems())
+
+ def values(self):
+ """D.values() -> list of D's values"""
+ return list(self.itervalues())
+
+ # Mappings are not hashable by default, but subclasses can change this
+ __hash__ = None
+
+ #TODO: implement __richcmp__
+
+
+cdef VariantRecordSamples makeVariantRecordSamples(VariantRecord record):
+ if not record:
+ raise ValueError('invalid VariantRecord')
+
+ cdef VariantRecordSamples samples = VariantRecordSamples.__new__(
+ VariantRecordSamples)
+ samples.record = record
+
+ return samples
+
+
+cdef class VariantRecord(object):
+ """Variant record"""
+ def __init__(self, *args, **kwargs):
+ raise TypeError('this class cannot be instantiated from Python')
+
+ def __dealloc__(self):
+ if self.ptr:
+ bcf_destroy1(self.ptr)
+ self.ptr = NULL
+
+ def copy(self):
+ """return a copy of this VariantRecord object"""
+ return makeVariantRecord(self.header, bcf_dup(self.ptr))
+
+ def translate(self, VariantHeader dst_header):
+ if dst_header is None:
+ raise ValueError('dst_header must not be None')
+
+ cdef bcf_hdr_t *src_hdr = self.header.ptr
+ cdef bcf_hdr_t *dst_hdr = dst_header.ptr
+
+ if src_hdr != dst_hdr:
+ if self.ptr.n_sample != bcf_hdr_nsamples(dst_hdr):
+ msg = 'Cannot translate record. Number of samples does not match header ({} vs {})'
+ raise ValueError(msg.format(self.ptr.n_sample, bcf_hdr_nsamples(dst_hdr)))
+
+ bcf_translate(dst_hdr, src_hdr, self.ptr)
+
+ @property
+ def rid(self):
+ """internal reference id number"""
+ return self.ptr.rid
+
+ @rid.setter
+ def rid(self, value):
+ cdef bcf_hdr_t *hdr = self.header.ptr
+ cdef int r = value
+ if r < 0 or r >= hdr.n[BCF_DT_CTG] or not hdr.id[BCF_DT_CTG][r].val:
+ raise ValueError('invalid reference id')
+ self.ptr.rid = r
+
+ @property
+ def chrom(self):
+ """chromosome/contig name"""
+ cdef bcf_hdr_t *hdr = self.header.ptr
+ cdef int rid = self.ptr.rid
+ if rid < 0 or rid >= hdr.n[BCF_DT_CTG]:
+ raise ValueError('Invalid header')
+ return bcf_str_cache_get_charptr(bcf_hdr_id2name(hdr, rid))
+
+ @chrom.setter
+ def chrom(self, value):
+ cdef vdict_t *d = <vdict_t*>self.header.ptr.dict[BCF_DT_CTG]
+ bchrom = force_bytes(value)
+ cdef khint_t k = kh_get_vdict(d, bchrom)
+ if k == kh_end(d):
+ raise ValueError('Invalid chromosome/contig')
+ self.ptr.rid = kh_val_vdict(d, k).id
+
+ @property
+ def contig(self):
+ """chromosome/contig name"""
+ cdef bcf_hdr_t *hdr = self.header.ptr
+ cdef int rid = self.ptr.rid
+ if rid < 0 or rid >= hdr.n[BCF_DT_CTG]:
+ raise ValueError('Invalid header')
+ return bcf_str_cache_get_charptr(bcf_hdr_id2name(hdr, rid))
+
+ @contig.setter
+ def contig(self, value):
+ cdef vdict_t *d = <vdict_t*>self.header.ptr.dict[BCF_DT_CTG]
+ bchrom = force_bytes(value)
+ cdef khint_t k = kh_get_vdict(d, bchrom)
+ if k == kh_end(d):
+ raise ValueError('Invalid chromosome/contig')
+ self.ptr.rid = kh_val_vdict(d, k).id
+
+ @property
+ def pos(self):
+ """record start position on chrom/contig (1-based inclusive)"""
+ return self.ptr.pos + 1
+
+ @pos.setter
+ def pos(self, value):
+ cdef int p = value
+ if p < 1:
+ raise ValueError('Position must be positive')
+ self.ptr.pos = p - 1
+
+ @property
+ def start(self):
+ """record start position on chrom/contig (0-based inclusive)"""
+ return self.ptr.pos
+
+ @start.setter
+ def start(self, value):
+ cdef int s = value
+ if s < 0:
+ raise ValueError('Start coordinate must be non-negative')
+ self.ptr.pos = s
+
+ @property
+ def stop(self):
+ """record stop position on chrom/contig (0-based exclusive)"""
+ return self.ptr.pos + self.ptr.rlen
+
+ @stop.setter
+ def stop(self, value):
+ cdef int s = value
+ if s < self.ptr.pos:
+ raise ValueError('Stop coordinate must be greater than or equal to start')
+ self.ptr.rlen = s - self.ptr.pos
+ if self.ptr.rlen != len(self.ref) or 'END' in self.info:
+ self.info['END'] = s
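+
+ # Coordinate conventions, for illustration: a SNP at the first base
+ # of a contig has pos == 1 (1-based, as printed in VCF text) but
+ # start == 0 and stop == 1 (0-based half-open, as used by BED/pysam).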
+
+ @property
+ def rlen(self):
+ """record length on chrom/contig (typically rec.stop - rec.start unless END info is supplied)"""
+ return self.ptr.rlen
+
+ @rlen.setter
+ def rlen(self, value):
+ cdef int r = value
+ if r < 0:
+ raise ValueError('Reference length must be non-negative')
+ self.ptr.rlen = r
+ if r != len(self.ref) or 'END' in self.info:
+ self.info['END'] = self.ptr.pos + r
+
+ @property
+ def qual(self):
+ """phred scaled quality score or None if not available"""
+ return self.ptr.qual if not bcf_float_is_missing(self.ptr.qual) else None
+
+ @qual.setter
+ def qual(self, value):
+ if value is not None:
+ self.ptr.qual = value
+ else:
+ bcf_float_set(&self.ptr.qual, bcf_float_missing)
+
+
+# @property
+# def n_allele(self):
+# return self.ptr.n_allele
+
+# @property
+# def n_sample(self):
+# return self.ptr.n_sample
+
+ @property
+ def id(self):
+ """record identifier or None if not available"""
+ cdef bcf1_t *r = self.ptr
+ if bcf_unpack(r, BCF_UN_STR) < 0:
+ raise ValueError('Error unpacking VariantRecord')
+ return bcf_str_cache_get_charptr(r.d.id) if r.d.id != b'.' else None
+
+ @id.setter
+ def id(self, value):
+ cdef bcf1_t *r = self.ptr
+ if bcf_unpack(r, BCF_UN_STR) < 0:
+ raise ValueError('Error unpacking VariantRecord')
+ cdef char *idstr = NULL
+ if value is not None:
+ bid = force_bytes(value)
+ idstr = bid
+ if bcf_update_id(self.header.ptr, self.ptr, idstr) < 0:
+ raise ValueError('Error updating id')
+
+ @property
+ def ref(self):
+ """reference allele"""
+ cdef bcf1_t *r = self.ptr
+ if bcf_unpack(r, BCF_UN_STR) < 0:
+ raise ValueError('Error unpacking VariantRecord')
+ return charptr_to_str(r.d.allele[0]) if r.d.allele else None
+
+ @ref.setter
+ def ref(self, value):
+ cdef bcf1_t *r = self.ptr
+ if bcf_unpack(r, BCF_UN_STR) < 0:
+ raise ValueError('Error unpacking VariantRecord')
+ #FIXME: Set alleles directly -- this is stupid
+ if not value:
+ raise ValueError('ref allele must not be null')
+ value = force_bytes(value)
+ if r.d.allele and r.n_allele:
+ alleles = [r.d.allele[i] for i in range(r.n_allele)]
+ alleles[0] = value
+ else:
+ alleles = [value]
+ self.alleles = alleles
+
+ @property
+ def alleles(self):
+ """tuple of reference allele followed by alt alleles"""
+ cdef bcf1_t *r = self.ptr
+ if bcf_unpack(r, BCF_UN_STR) < 0:
+ raise ValueError('Error unpacking VariantRecord')
+ if not r.d.allele:
+ return None
+ cdef tuple res = PyTuple_New(r.n_allele)
+ for i in range(r.n_allele):
+ a = charptr_to_str(r.d.allele[i])
+ PyTuple_SET_ITEM(res, i, a)
+ Py_INCREF(a)
+ return res
+
+ @alleles.setter
+ def alleles(self, value):
+ cdef bcf1_t *r = self.ptr
+ if bcf_unpack(r, BCF_UN_STR) < 0:
+ raise ValueError('Error unpacking VariantRecord')
+ value = [force_bytes(v) for v in value]
+ if b'' in value:
+ raise ValueError('cannot set null allele')
+ value = b','.join(value)
+ if bcf_update_alleles_str(self.header.ptr, r, value) < 0:
+ raise ValueError('Error updating alleles')
+
+ @property
+ def alts(self):
+ """tuple of alt alleles"""
+ cdef bcf1_t *r = self.ptr
+ if bcf_unpack(r, BCF_UN_STR) < 0:
+ raise ValueError('Error unpacking VariantRecord')
+ if r.n_allele < 2 or not r.d.allele:
+ return None
+ cdef tuple res = PyTuple_New(r.n_allele - 1)
+ for i in range(1, r.n_allele):
+ a = charptr_to_str(r.d.allele[i])
+ PyTuple_SET_ITEM(res, i - 1, a)
+ Py_INCREF(a)
+ return res
+
+ @alts.setter
+ def alts(self, value):
+ #FIXME: Set alleles directly -- this is stupid
+ cdef bcf1_t *r = self.ptr
+ if bcf_unpack(r, BCF_UN_STR) < 0:
+ raise ValueError('Error unpacking VariantRecord')
+ value = [force_bytes(v) for v in value]
+ if b'' in value:
+ raise ValueError('cannot set null alt allele')
+ ref = [r.d.allele[0] if r.d.allele and r.n_allele else b'.']
+ self.alleles = ref + value
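+
+    # Illustrative sketch (assumed values): for a record with REF=A and a
+    # single alternate allele G,
+    #
+    #     rec.ref     == 'A'
+    #     rec.alts    == ('G',)
+    #     rec.alleles == ('A', 'G')
+    #
+    # and assigning rec.alleles = ('A', 'G', 'T') adds a second alt allele.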
+
+ @property
+ def filter(self):
+ """filter information (see :class:`VariantRecordFilter`)"""
+ if bcf_unpack(self.ptr, BCF_UN_FLT) < 0:
+ raise ValueError('Error unpacking VariantRecord')
+ return makeVariantRecordFilter(self)
+
+ @property
+ def info(self):
+ """info data (see :class:`VariantRecordInfo`)"""
+ if bcf_unpack(self.ptr, BCF_UN_INFO) < 0:
+ raise ValueError('Error unpacking VariantRecord')
+ return makeVariantRecordInfo(self)
+
+ @property
+ def format(self):
+ """sample format metadata (see :class:`VariantRecordFormat`)"""
+ if bcf_unpack(self.ptr, BCF_UN_FMT) < 0:
+ raise ValueError('Error unpacking VariantRecord')
+ return makeVariantRecordFormat(self)
+
+ @property
+ def samples(self):
+ """sample data (see :class:`VariantRecordSamples`)"""
+ if bcf_unpack(self.ptr, BCF_UN_ALL) < 0:
+ raise ValueError('Error unpacking VariantRecord')
+ return makeVariantRecordSamples(self)
+
+ def __str__(self):
+ cdef kstring_t line
+ cdef char c
+
+ line.l = line.m = 0
+ line.s = NULL
+
+ if vcf_format(self.header.ptr, self.ptr, &line) < 0:
+ if line.m:
+ free(line.s)
+ raise ValueError('vcf_format failed')
+
+ # Strip CR/LF?
+ #while line.l:
+ # c = line.s[line.l - 1]
+ # if c != b'\n' and c != b'\r':
+ # break
+ # line.l -= 1
+
+ ret = charptr_to_str_w_len(line.s, line.l)
+
+ if line.m:
+ free(line.s)
+
+ return ret
+
+
+cdef VariantRecord makeVariantRecord(VariantHeader header, bcf1_t *r):
+ if not header:
+ raise ValueError('invalid VariantHeader')
+
+ if not r:
+ raise ValueError('cannot create VariantRecord')
+
+ if r.errcode:
+ msg = []
+ #if r.errcode & BCF_ERR_CTG_UNDEF:
+ # msg.append('undefined contig')
+ #if r.errcode & BCF_ERR_TAG_UNDEF:
+ # msg.append('undefined tag')
+ if r.errcode & BCF_ERR_NCOLS:
+ msg.append('invalid number of columns')
+ if r.errcode & BCF_ERR_LIMITS:
+ msg.append('limits violated')
+ if r.errcode & BCF_ERR_CHAR:
+ msg.append('invalid character found')
+ if r.errcode & BCF_ERR_CTG_INVALID:
+ msg.append('invalid contig')
+ if r.errcode & BCF_ERR_TAG_INVALID:
+ msg.append('invalid tag')
+
+ if msg:
+ msg = ', '.join(msg)
+ raise ValueError('Error(s) reading record: {}'.format(msg))
+
+ cdef VariantRecord record = VariantRecord.__new__(VariantRecord)
+ record.header = header
+ record.ptr = r
+
+ return record
+
+
+########################################################################
+########################################################################
+## VariantRecordSample object
+########################################################################
+
+
+cdef class VariantRecordSample(object):
+ """Data for a single sample from a :class:`VariantRecord` object.
+ Provides data accessors for genotypes and a mapping interface
+ from format name to values.
+ """
+ def __init__(self, *args, **kwargs):
+ raise TypeError('this class cannot be instantiated from Python')
+
+ @property
+ def name(self):
+ """sample name"""
+ cdef bcf_hdr_t *hdr = self.record.header.ptr
+ cdef bcf1_t *r = self.record.ptr
+ cdef int32_t n = bcf_hdr_nsamples(hdr)
+
+ if self.index < 0 or self.index >= n:
+ raise ValueError('invalid sample index')
+
+ return charptr_to_str(hdr.samples[self.index])
+
+ @property
+ def allele_indices(self):
+ """allele indices for called genotype, if present. Otherwise None"""
+ return bcf_format_get_allele_indices(self)
+
+ @allele_indices.setter
+ def allele_indices(self, value):
+ self['GT'] = value
+
+ @allele_indices.deleter
+ def allele_indices(self):
+ self['GT'] = ()
+
+ @property
+ def alleles(self):
+ """alleles for called genotype, if present. Otherwise None"""
+ return bcf_format_get_alleles(self)
+
+ @alleles.setter
+ def alleles(self, value):
+ self['GT'] = value
+
+ @alleles.deleter
+ def alleles(self):
+ self['GT'] = ()
+
+ @property
+ def phased(self):
+ """False if genotype is missing or any allele is unphased. Otherwise True."""
+ return bcf_sample_get_phased(self)
+
+ @phased.setter
+ def phased(self, value):
+ bcf_sample_set_phased(self, value)
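+
+    # Usage sketch (hedged; the record `rec` and sample name 'NA001' are
+    # assumptions):
+    #
+    #     sample = rec.samples['NA001']
+    #     sample.allele_indices = (0, 1)   # writes GT as 0/1
+    #     sample.phased = True             # GT is now rendered as 0|1
+    #     sample.alleles                   # e.g. ('A', 'G'), via rec.alleles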
+
+ def __len__(self):
+ cdef bcf_hdr_t *hdr = self.record.header.ptr
+ cdef bcf1_t *r = self.record.ptr
+ cdef int i, n = 0
+
+ if bcf_unpack(r, BCF_UN_FMT) < 0:
+ raise ValueError('Error unpacking VariantRecord')
+
+ for i in range(r.n_fmt):
+ if r.d.fmt[i].p:
+ n += 1
+ return n
+
+ def __bool__(self):
+ cdef bcf_hdr_t *hdr = self.record.header.ptr
+ cdef bcf1_t *r = self.record.ptr
+ cdef int i
+
+ if bcf_unpack(r, BCF_UN_FMT) < 0:
+ raise ValueError('Error unpacking VariantRecord')
+
+ for i in range(r.n_fmt):
+ if r.d.fmt[i].p:
+ return True
+ return False
+
+ def __getitem__(self, key):
+ return bcf_format_get_value(self, key)
+
+ def __setitem__(self, key, value):
+ bcf_format_set_value(self, key, value)
+
+ def __delitem__(self, key):
+ bcf_format_del_value(self, key)
+
+ def clear(self):
+ """Clear all format data (including genotype) for this sample"""
+ cdef bcf_hdr_t *hdr = self.record.header.ptr
+ cdef bcf1_t *r = self.record.ptr
+ cdef bcf_fmt_t *fmt
+ cdef int i
+
+ for i in range(r.n_fmt):
+ fmt = &r.d.fmt[i]
+ if fmt.p:
+ bcf_format_del_value(self, bcf_hdr_int2id(hdr, BCF_DT_ID, fmt.id))
+
+ def __iter__(self):
+ cdef bcf_hdr_t *hdr = self.record.header.ptr
+ cdef bcf1_t *r = self.record.ptr
+ cdef bcf_fmt_t *fmt
+ cdef int i
+
+ for i in range(r.n_fmt):
+ fmt = &r.d.fmt[i]
+ if r.d.fmt[i].p:
+ yield bcf_str_cache_get_charptr(bcf_hdr_int2id(hdr, BCF_DT_ID, fmt.id))
+
+ def get(self, key, default=None):
+ """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
+ try:
+ return self[key]
+ except KeyError:
+ return default
+
+ def __contains__(self, key):
+ cdef bcf_hdr_t *hdr = self.record.header.ptr
+ cdef bcf1_t *r = self.record.ptr
+ bkey = force_bytes(key)
+ cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey)
+ return fmt != NULL and fmt.p != NULL
+
+ def iterkeys(self):
+ """D.iterkeys() -> an iterator over the keys of D"""
+ return iter(self)
+
+ def itervalues(self):
+ """D.itervalues() -> an iterator over the values of D"""
+ for key in self:
+ yield self[key]
+
+ def iteritems(self):
+ """D.iteritems() -> an iterator over the (key, value) items of D"""
+ for key in self:
+ yield (key, self[key])
+
+ def keys(self):
+ """D.keys() -> list of D's keys"""
+ return list(self)
+
+ def items(self):
+ """D.items() -> list of D's (key, value) pairs, as 2-tuples"""
+ return list(self.iteritems())
+
+ def values(self):
+ """D.values() -> list of D's values"""
+ return list(self.itervalues())
+
+ # Mappings are not hashable by default, but subclasses can change this
+ __hash__ = None
+
+ #TODO: implement __richcmp__
+
+
+cdef VariantRecordSample makeVariantRecordSample(VariantRecord record, int32_t sample_index):
+ if not record or sample_index < 0:
+ raise ValueError('cannot create VariantRecordSample')
+
+ cdef VariantRecordSample sample = VariantRecordSample.__new__(VariantRecordSample)
+ sample.record = record
+ sample.index = sample_index
+
+ return sample
+
+
+########################################################################
+########################################################################
+## Index objects
+########################################################################
+
+
+cdef class BaseIndex(object):
+ def __init__(self):
+ self.refs = ()
+        self.refmap = {}
+
+ def __len__(self):
+ return len(self.refs)
+
+ def __bool__(self):
+ return len(self.refs) != 0
+
+ def __getitem__(self, key):
+ if isinstance(key, int):
+ return self.refs[key]
+ else:
+ return self.refmap[key]
+
+ def __iter__(self):
+ return iter(self.refs)
+
+ def get(self, key, default=None):
+ """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
+ try:
+ return self[key]
+ except KeyError:
+ return default
+
+ def __contains__(self, key):
+ try:
+ self[key]
+ except KeyError:
+ return False
+ else:
+ return True
+
+ def iterkeys(self):
+ """D.iterkeys() -> an iterator over the keys of D"""
+ return iter(self)
+
+ def itervalues(self):
+ """D.itervalues() -> an iterator over the values of D"""
+ for key in self:
+ yield self[key]
+
+ def iteritems(self):
+ """D.iteritems() -> an iterator over the (key, value) items of D"""
+ for key in self:
+ yield (key, self[key])
+
+ def keys(self):
+ """D.keys() -> list of D's keys"""
+ return list(self)
+
+ def items(self):
+ """D.items() -> list of D's (key, value) pairs, as 2-tuples"""
+ return list(self.iteritems())
+
+ def values(self):
+ """D.values() -> list of D's values"""
+ return list(self.itervalues())
+
+ # Mappings are not hashable by default, but subclasses can change this
+ __hash__ = None
+
+ #TODO: implement __richcmp__
+
+
+cdef class BCFIndex(BaseIndex):
+ """CSI index data structure for BCF files"""
+ def __init__(self):
+ self.refs = ()
+ self.refmap = {}
+
+ if not self.ptr:
+ raise ValueError('Invalid index object')
+
+ cdef int n
+ cdef const char **refs = bcf_index_seqnames(self.ptr, self.header.ptr, &n)
+
+ self.refs = char_array_to_tuple(refs, n, free_after=1) if refs else ()
+ self.refmap = { r:i for i,r in enumerate(self.refs) }
+
+ def __dealloc__(self):
+ if self.ptr:
+ hts_idx_destroy(self.ptr)
+ self.ptr = NULL
+
+ def fetch(self, bcf, contig, start, stop, region, reopen):
+ return BCFIterator(bcf, contig, start, stop, region, reopen)
+
+
+cdef BCFIndex makeBCFIndex(VariantHeader header, hts_idx_t *idx):
+ if not idx:
+ return None
+
+ if not header:
+ raise ValueError('invalid VariantHeader')
+
+ cdef BCFIndex index = BCFIndex.__new__(BCFIndex)
+ index.header = header
+ index.ptr = idx
+ index.__init__()
+
+ return index
+
+
+cdef class TabixIndex(BaseIndex):
+ """Tabix index data structure for VCF files"""
+ def __init__(self):
+ self.refs = ()
+ self.refmap = {}
+
+ if not self.ptr:
+ raise ValueError('Invalid index object')
+
+ cdef int n
+ cdef const char **refs = tbx_seqnames(self.ptr, &n)
+
+ self.refs = char_array_to_tuple(refs, n, free_after=1) if refs else ()
+ self.refmap = { r:i for i,r in enumerate(self.refs) }
+
+ def __dealloc__(self):
+ if self.ptr:
+ tbx_destroy(self.ptr)
+ self.ptr = NULL
+
+ def fetch(self, bcf, contig, start, stop, region, reopen):
+ return TabixIterator(bcf, contig, start, stop, region, reopen)
+
+
+cdef TabixIndex makeTabixIndex(tbx_t *idx):
+ if not idx:
+ return None
+
+ cdef TabixIndex index = TabixIndex.__new__(TabixIndex)
+ index.ptr = idx
+ index.__init__()
+
+ return index
+
+
+########################################################################
+########################################################################
+## Iterators
+########################################################################
+
+
+cdef class BaseIterator(object):
+ pass
+
+
+# Internal function to clean up after iteration stop or failure.
+# This would be a nested function if it weren't a cdef function.
+cdef void _stop_BCFIterator(BCFIterator self, bcf1_t *record):
+ bcf_destroy1(record)
+
+ # destroy iter so future calls to __next__ raise StopIteration
+ bcf_itr_destroy(self.iter)
+ self.iter = NULL
+
+
+cdef class BCFIterator(BaseIterator):
+ def __init__(self, VariantFile bcf, contig=None, start=None, stop=None, region=None, reopen=True):
+ if bcf is None:
+ raise ValueError('bcf must not be None')
+
+ if not isinstance(bcf.index, BCFIndex):
+ raise ValueError('bcf index required')
+
+ cdef BCFIndex index = bcf.index
+ cdef int rid, cstart, cstop
+ cdef char *cregion
+
+ if not index:
+ raise ValueError('bcf index required')
+
+ if reopen:
+ bcf = bcf.copy()
+
+ if region is not None:
+ if contig is not None or start is not None or stop is not None:
+                raise ValueError('cannot specify both region and contig/start/stop')
+
+ bregion = force_bytes(region)
+ cregion = bregion
+ with nogil:
+ self.iter = bcf_itr_querys(index.ptr, bcf.header.ptr, cregion)
+ else:
+ if contig is None:
+                raise ValueError('a contig or region must be specified')
+
+ try:
+ rid = index.refmap[contig]
+ except KeyError:
+ raise ValueError('Unknown contig specified')
+
+ if start is None:
+ start = 0
+ if stop is None:
+ stop = MAX_POS
+
+ cstart, cstop = start, stop
+
+ with nogil:
+ self.iter = bcf_itr_queryi(index.ptr, rid, cstart, cstop)
+
+ # Do not fail on self.iter == NULL, since it signifies a null query.
+
+ self.bcf = bcf
+ self.index = index
+
+ def __dealloc__(self):
+ if self.iter:
+ bcf_itr_destroy(self.iter)
+ self.iter = NULL
+
+ def __iter__(self):
+ return self
+
+ def __next__(self):
+ if not self.iter:
+ raise StopIteration
+
+ cdef bcf1_t *record = bcf_init1()
+
+ record.pos = -1
+ if self.bcf.drop_samples:
+ record.max_unpack = BCF_UN_SHR
+
+ cdef int ret
+
+ with nogil:
+ ret = bcf_itr_next(self.bcf.htsfile, self.iter, record)
+
+ if ret < 0:
+ _stop_BCFIterator(self, record)
+ if ret == -1:
+ raise StopIteration
+ else:
+ raise ValueError('error reading BCF file')
+
+ ret = bcf_subset_format(self.bcf.header.ptr, record)
+
+ if ret < 0:
+ _stop_BCFIterator(self, record)
+ raise ValueError('error in bcf_subset_format')
+
+ return makeVariantRecord(self.bcf.header, record)
+
+
+cdef class TabixIterator(BaseIterator):
+ def __cinit__(self, *args, **kwargs):
+ self.line_buffer.l = 0
+ self.line_buffer.m = 0
+ self.line_buffer.s = NULL
+
+ def __init__(self, VariantFile bcf, contig=None, start=None, stop=None, region=None, reopen=True):
+ if bcf is None:
+ raise ValueError('bcf must not be None')
+
+ if not isinstance(bcf.index, TabixIndex):
+ raise ValueError('tabix index required')
+
+ cdef TabixIndex index = bcf.index
+
+ if not index:
+ raise ValueError('bcf index required')
+
+ if reopen:
+ bcf = bcf.copy()
+
+ if region is not None:
+ if contig is not None or start is not None or stop is not None:
+                raise ValueError('cannot specify both region and contig/start/stop')
+
+            self.iter = tbx_itr_querys(index.ptr, force_bytes(region))
+ else:
+ if contig is None:
+                raise ValueError('a contig or region must be specified')
+
+ rid = index.refmap.get(contig, -1)
+
+ if start is None:
+ start = 0
+ if stop is None:
+ stop = MAX_POS
+
+ self.iter = tbx_itr_queryi(index.ptr, rid, start, stop)
+
+ # Do not fail on self.iter == NULL, since it signifies a null query.
+
+ self.bcf = bcf
+ self.index = index
+
+ def __dealloc__(self):
+ if self.iter:
+ tbx_itr_destroy(self.iter)
+ self.iter = NULL
+
+ if self.line_buffer.m:
+ free(self.line_buffer.s)
+
+ self.line_buffer.l = 0
+ self.line_buffer.m = 0
+ self.line_buffer.s = NULL
+
+ def __iter__(self):
+ return self
+
+ def __next__(self):
+ if not self.iter:
+ raise StopIteration
+
+ cdef int ret
+
+ with nogil:
+ ret = tbx_itr_next(self.bcf.htsfile, self.index.ptr, self.iter, &self.line_buffer)
+
+ if ret < 0:
+ tbx_itr_destroy(self.iter)
+ self.iter = NULL
+ if ret == -1:
+ raise StopIteration
+ else:
+ raise ValueError('error reading indexed VCF file')
+
+ cdef bcf1_t *record = bcf_init1()
+
+ record.pos = -1
+ if self.bcf.drop_samples:
+ record.max_unpack = BCF_UN_SHR
+
+ ret = vcf_parse1(&self.line_buffer, self.bcf.header.ptr, record)
+
+ # FIXME: stop iteration on parse failure?
+ if ret < 0:
+ bcf_destroy1(record)
+ raise ValueError('error in vcf_parse')
+
+ return makeVariantRecord(self.bcf.header, record)
+
+
+########################################################################
+########################################################################
+## Variant File
+########################################################################
+
+
+cdef class VariantFile(HTSFile):
+ """*(filename, mode=None, index_filename=None, header=None, drop_samples=False,
+ duplicate_filehandle=True)*
+
+ A :term:`VCF`/:term:`BCF` formatted file. The file is automatically
+ opened.
+
+ If an index for a variant file exists (.csi or .tbi), it will be
+ opened automatically. Without an index random access to records
+ via :meth:`fetch` is disabled.
+
+ For writing, a :class:`VariantHeader` object must be provided,
+ typically obtained from another :term:`VCF` file/:term:`BCF`
+ file.
+
+ Parameters
+ ----------
+ mode : string
+ *mode* should be ``r`` for reading or ``w`` for writing. The default is
+ text mode (:term:`VCF`). For binary (:term:`BCF`) I/O you should append
+ ``b`` for compressed or ``u`` for uncompressed :term:`BCF` output.
+
+ If ``b`` is present, it must immediately follow ``r`` or ``w``. Valid
+ modes are ``r``, ``w``, ``wh``, ``rb``, ``wb``, ``wbu`` and ``wb0``.
+ For instance, to open a :term:`BCF` formatted file for reading, type::
+
+ f = pysam.VariantFile('ex1.bcf','r')
+
+ If mode is not specified, we will try to auto-detect the file type. All
+ of the following should work::
+
+ f1 = pysam.VariantFile('ex1.bcf')
+ f2 = pysam.VariantFile('ex1.vcf')
+ f3 = pysam.VariantFile('ex1.vcf.gz')
+
+ index_filename : string
+ Explicit path to an index file.
+
+ header : VariantHeader
+ :class:`VariantHeader` object required for writing.
+
+ drop_samples: bool
+ Ignore sample information when reading.
+
+ duplicate_filehandle: bool
+ By default, file handles passed either directly or through
+ File-like objects will be duplicated before passing them to
+ htslib. The duplication prevents issues where the same stream
+ will be closed by htslib and through destruction of the
+ high-level python object. Set to False to turn off
+ duplication.
+
+ """
+ def __cinit__(self, *args, **kwargs):
+ self.htsfile = NULL
+
+ def __init__(self, *args, **kwargs):
+ self.header = None
+ self.index = None
+ self.filename = None
+ self.mode = None
+ self.index_filename = None
+ self.is_stream = False
+ self.is_remote = False
+ self.is_reading = False
+ self.drop_samples = False
+ self.header_written = False
+ self.start_offset = -1
+
+ self.open(*args, **kwargs)
+
+ def close(self):
+ """closes the :class:`pysam.VariantFile`."""
+        cdef int ret = 0
+        if self.htsfile:
+            # Write header if no records were written
+            if self.htsfile.is_write and not self.header_written:
+                self.header_written = True
+                with nogil:
+                    bcf_hdr_write(self.htsfile, self.header.ptr)
+
+            ret = hts_close(self.htsfile)
+            self.htsfile = NULL
+
+        # Release the header and index only after the file has been closed
+        self.header = self.index = None
+
+ if ret < 0:
+ global errno
+ if errno == EPIPE:
+ errno = 0
+ else:
+ raise OSError(errno, force_str(strerror(errno)))
+
+ def __iter__(self):
+ if not self.is_open:
+ raise ValueError('I/O operation on closed file')
+
+ if self.htsfile.is_write:
+            raise ValueError('cannot iterate over VariantFile opened for writing')
+
+ self.is_reading = 1
+ return self
+
+ def __next__(self):
+ cdef int ret
+ cdef bcf1_t *record = bcf_init1()
+
+ record.pos = -1
+ if self.drop_samples:
+ record.max_unpack = BCF_UN_SHR
+
+ with nogil:
+ ret = bcf_read1(self.htsfile, self.header.ptr, record)
+
+ if ret < 0:
+ bcf_destroy1(record)
+ if ret == -1:
+ raise StopIteration
+ elif ret == -2:
+ raise IOError('truncated file')
+ else:
+ raise ValueError('Variant read failed')
+
+ return makeVariantRecord(self.header, record)
+
+ def copy(self):
+ if not self.is_open:
+ raise ValueError
+
+ cdef VariantFile vars = VariantFile.__new__(VariantFile)
+ cdef bcf_hdr_t *hdr
+
+ # FIXME: re-open using fd or else header and index could be invalid
+ vars.htsfile = self._open_htsfile()
+
+ if not vars.htsfile:
+ raise ValueError('Cannot re-open htsfile')
+
+ # minimize overhead by re-using header and index. This approach is
+ # currently risky, but see above for how this can be mitigated.
+ vars.header = self.header
+ vars.index = self.index
+
+ vars.filename = self.filename
+ vars.mode = self.mode
+ vars.index_filename = self.index_filename
+ vars.drop_samples = self.drop_samples
+ vars.is_stream = self.is_stream
+ vars.is_remote = self.is_remote
+ vars.is_reading = self.is_reading
+ vars.start_offset = self.start_offset
+ vars.header_written = self.header_written
+
+ if self.htsfile.is_bin:
+ vars.seek(self.tell())
+        else:
+            # Non-binary (VCF) streams: re-read the header so the new handle
+            # is positioned at the first record; makeVariantHeader takes
+            # ownership of hdr and frees it when garbage-collected.
+            with nogil:
+                hdr = bcf_hdr_read(vars.htsfile)
+            makeVariantHeader(hdr)
+
+ return vars
+
+ def open(self, filename, mode='r',
+ index_filename=None,
+ VariantHeader header=None,
+ drop_samples=False,
+ duplicate_filehandle=True):
+ """open a vcf/bcf file.
+
+ If open is called on an existing VariantFile, the current file will be
+ closed and a new file will be opened.
+ """
+ cdef bcf_hdr_t *hdr
+ cdef BGZF *bgzfp
+ cdef hts_idx_t *idx
+ cdef tbx_t *tidx
+ cdef char *cfilename
+ cdef char *cindex_filename = NULL
+ cdef char *cmode
+
+ # close a previously opened file
+ if self.is_open:
+ self.close()
+
+ if not mode or mode[0] not in 'rwa':
+ raise ValueError('mode must begin with r, w or a')
+
+ self.duplicate_filehandle = duplicate_filehandle
+
+ format_modes = [m for m in mode[1:] if m in 'bcguz']
+ if len(format_modes) > 1:
+ raise ValueError('mode contains conflicting format specifiers: {}'.format(''.join(format_modes)))
+
+ invalid_modes = [m for m in mode[1:] if m not in 'bcguz0123456789ex']
+ if invalid_modes:
+ raise ValueError('invalid mode options: {}'.format(''.join(invalid_modes)))
+
+ # Autodetect mode from filename
+ if mode == 'w' and isinstance(filename, str):
+ if filename.endswith('.gz'):
+ mode = 'wz'
+ elif filename.endswith('.bcf'):
+ mode = 'wb'
+
+ # for htslib, wbu seems to not work
+ if mode == 'wbu':
+ mode = 'wb0'
+
+ self.mode = mode = force_bytes(mode)
+ try:
+ filename = encode_filename(filename)
+ self.is_remote = hisremote(filename)
+ self.is_stream = filename == b'-'
+        except TypeError:
+            # filename is not a str/bytes path (e.g. a file-like object);
+            # treat it as an already-open stream
+            self.is_remote = False
+            self.is_stream = True
+
+ self.filename = filename
+
+ if index_filename is not None:
+ self.index_filename = index_filename = encode_filename(index_filename)
+ else:
+ self.index_filename = None
+
+ self.drop_samples = bool(drop_samples)
+ self.header = None
+
+ self.header_written = False
+
+ if mode.startswith(b'w'):
+ # open file for writing
+ if index_filename is not None:
+ raise ValueError('Cannot specify an index filename when writing a VCF/BCF file')
+
+ # header structure (used for writing)
+ if header:
+ self.header = header.copy()
+ else:
+ self.header = VariantHeader()
+ #raise ValueError('a VariantHeader must be specified')
+
+ # Header is not written until the first write or on close
+ self.htsfile = self._open_htsfile()
+
+ if not self.htsfile:
+ raise ValueError("could not open file `{}` (mode='{}')".format(filename, mode))
+
+ elif mode.startswith(b'r'):
+ # open file for reading
+
+ if not self._exists():
+ raise IOError('file `{}` not found'.format(filename))
+
+ self.htsfile = self._open_htsfile()
+
+ if not self.htsfile:
+ raise ValueError("could not open file `{}` (mode='{}') - is it VCF/BCF format?".format(filename, mode))
+
+ if self.htsfile.format.format not in (bcf, vcf):
+ raise ValueError("invalid file `{}` (mode='{}') - is it VCF/BCF format?".format(filename, mode))
+
+ if self.htsfile.format.compression == bgzf:
+ bgzfp = hts_get_bgzfp(self.htsfile)
+ if bgzfp and bgzf_check_EOF(bgzfp) == 0:
+                    warn('[{}] Warning: no BGZF EOF marker; file may be truncated'.format(filename))
+
+ with nogil:
+ hdr = bcf_hdr_read(self.htsfile)
+
+ try:
+ self.header = makeVariantHeader(hdr)
+ except ValueError:
+ raise ValueError("file `{}` does not have valid header (mode='{}') - is it VCF/BCF format?".format(filename, mode))
+
+ if isinstance(self.filename, bytes):
+ cfilename = self.filename
+ else:
+ cfilename = NULL
+
+ # check for index and open if present
+ if self.htsfile.format.format == bcf and cfilename:
+ if index_filename is not None:
+ cindex_filename = index_filename
+ with nogil:
+ idx = bcf_index_load2(cfilename, cindex_filename)
+ self.index = makeBCFIndex(self.header, idx)
+
+ elif self.htsfile.format.compression == bgzf and cfilename:
+ if index_filename is not None:
+ cindex_filename = index_filename
+ with nogil:
+ tidx = tbx_index_load2(cfilename, cindex_filename)
+ self.index = makeTabixIndex(tidx)
+
+ if not self.is_stream:
+ self.start_offset = self.tell()
+ else:
+ raise ValueError("unknown mode {}".format(mode))
+
+ def reset(self):
+ """reset file position to beginning of file just after the header."""
+ return self.seek(self.start_offset)
+
+
+ def fetch(self, contig=None, start=None, stop=None, region=None, reopen=False):
+ """fetch records in a :term:`region` using 0-based indexing. The
+ region is specified by :term:`contig`, *start* and *end*.
+ Alternatively, a samtools :term:`region` string can be supplied.
+
+ Without *contig* or *region* all mapped records will be fetched. The
+ records will be returned ordered by contig, which will not necessarily
+ be the order within the file.
+
+ Set *reopen* to true if you will be using multiple iterators on the
+ same file at the same time. The iterator returned will receive its
+ own copy of a filehandle to the file effectively re-opening the
+        file. Re-opening a file incurs some overhead, so use with care.
+
+ If only *contig* is set, all records on *contig* will be fetched.
+ If both *region* and *contig* are given, an exception is raised.
+
+ Note that a bgzipped :term:`VCF`.gz file without a tabix/CSI index
+ (.tbi/.csi) or a :term:`BCF` file without a CSI index can only be
+ read sequentially.
+ """
+ if not self.is_open:
+ raise ValueError('I/O operation on closed file')
+
+ if self.htsfile.is_write:
+            raise ValueError('cannot fetch from VariantFile opened for writing')
+
+ if contig is None and region is None:
+ self.is_reading = 1
+ bcf = self.copy() if reopen else self
+ bcf.seek(self.start_offset)
+ return iter(bcf)
+
+ if not self.index:
+ raise ValueError('fetch requires an index')
+
+ self.is_reading = 1
+ return self.index.fetch(self, contig, start, stop, region, reopen)
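+
+    # Usage sketch (hedged; the path and contig name are assumptions):
+    #
+    #     vf = VariantFile('ex1.vcf.gz')
+    #     for rec in vf.fetch('chr1', 10000, 20000):   # 0-based half-open
+    #         print(rec.pos, rec.alleles)
+    #
+    # Pass reopen=True when running several iterators on the same file at
+    # once, so that each receives its own filehandle.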
+
+ cpdef VariantRecord new_record(self):
+ """Create a new empty VariantRecord"""
+ return self.header.new_record()
+
+ cpdef int write(self, VariantRecord record) except -1:
+ """
+ write a single :class:`pysam.VariantRecord` to disk.
+
+ returns the number of bytes written.
+ """
+ if record is None:
+ raise ValueError('record must not be None')
+
+ if not self.is_open:
+            raise ValueError('I/O operation on closed file')
+
+ if not self.htsfile.is_write:
+            raise ValueError('cannot write to a VariantFile opened for reading')
+
+ if not self.header_written:
+ self.header_written = True
+ with nogil:
+ bcf_hdr_write(self.htsfile, self.header.ptr)
+
+ #if record.header is not self.header:
+ # record.translate(self.header)
+ # raise ValueError('Writing records from a different VariantFile is not yet supported')
+
+ if record.ptr.n_sample != bcf_hdr_nsamples(self.header.ptr):
+ msg = 'Invalid VariantRecord. Number of samples does not match header ({} vs {})'
+ raise ValueError(msg.format(record.ptr.n_sample, bcf_hdr_nsamples(self.header.ptr)))
+
+ cdef int ret
+
+ with nogil:
+ ret = bcf_write1(self.htsfile, self.header.ptr, record.ptr)
+
+ if ret < 0:
+ raise IOError(errno, strerror(errno))
+
+ return ret
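+
+    # Usage sketch (hedged; both paths are assumptions): copy records from
+    # one file to another, reusing the input header for the output.
+    #
+    #     vf_in  = VariantFile('in.bcf')
+    #     vf_out = VariantFile('out.vcf', 'w', header=vf_in.header)
+    #     for rec in vf_in:
+    #         vf_out.write(rec)
+    #     vf_out.close()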
+
+ def subset_samples(self, include_samples):
+ """
+ Read only a subset of samples to reduce processing time and memory.
+ Must be called prior to retrieving records.
+ """
+ if not self.is_open:
+ raise ValueError('I/O operation on closed file')
+
+ if self.htsfile.is_write:
+            raise ValueError('cannot subset samples from VariantFile opened for writing')
+
+ if self.is_reading:
+ raise ValueError('cannot subset samples after fetching records')
+
+ self.header._subset_samples(include_samples)
+
+ # potentially unnecessary optimization that also sets max_unpack
+ if not include_samples:
+ self.drop_samples = True
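+
+    # Usage sketch (hedged; the path and sample names are assumptions):
+    # restrict parsing to two samples before iterating.
+    #
+    #     vf = VariantFile('in.bcf')
+    #     vf.subset_samples(['NA001', 'NA002'])
+    #     for rec in vf:
+    #         ...   # rec.samples now exposes only the requested samples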
--- /dev/null
+"""Functions that read and write block gzipped files.
+
+The user of the file doesn't have to worry about the compression
+and random access is allowed if an index file is present."""
+
+# based on Python 3.5's gzip module
+
+import io
+
+from libc.stdint cimport int8_t, int16_t, int32_t, int64_t
+from libc.stdint cimport uint8_t, uint16_t, uint32_t, uint64_t
+from libc.stdlib cimport malloc, calloc, realloc, free
+
+from cpython.object cimport PyObject
+from cpython.bytes cimport PyBytes_FromStringAndSize, _PyBytes_Resize
+
+from pysam.libcutils cimport force_bytes, force_str, charptr_to_str, charptr_to_str_w_len
+from pysam.libchtslib cimport *
+
+
+__all__ = ["BGZFile"]
+
+
+BUFFER_SIZE = io.DEFAULT_BUFFER_SIZE
+
+
+cdef class BGZFile(object):
+ """The BGZFile class simulates most of the methods of a file object with
+ the exception of the truncate() method.
+
+ This class only supports opening files in binary mode. If you need to open a
+ compressed file in text mode, use the gzip.open() function.
+ """
+ cdef BGZF* bgzf
+ cdef bytes name, index
+
+ def __init__(self, filename, mode=None, index=None):
+ """Constructor for the BGZFile class.
+
+ The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', 'wb', 'x', or
+ 'xb' depending on whether the file will be read or written. The default
+ is the mode of fileobj if discernible; otherwise, the default is 'rb'.
+ A mode of 'r' is equivalent to one of 'rb', and similarly for 'w' and
+ 'wb', 'a' and 'ab', and 'x' and 'xb'.
+ """
+ if mode and ('t' in mode or 'U' in mode):
+ raise ValueError("Invalid mode: {!r}".format(mode))
+ if not mode:
+ mode = 'rb'
+ if mode and 'b' not in mode:
+ mode += 'b'
+ self.name = force_bytes(filename)
+ self.index = force_bytes(index) if index is not None else None
+        self.bgzf = bgzf_open(self.name, force_bytes(mode))
+        if not self.bgzf:
+            raise IOError('Could not open file `{}`'.format(filename))
+
+        if self.bgzf.is_write and index is not None and bgzf_index_build_init(self.bgzf) < 0:
+            raise IOError('Error building bgzf index')
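+
+    # Usage sketch (hedged; the path is an assumption): write and re-read a
+    # block-gzipped file.
+    #
+    #     with BGZFile('example.txt.gz', 'wb') as f:
+    #         f.write(b'hello world\n')
+    #     with BGZFile('example.txt.gz') as f:
+    #         data = f.read()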
+
+ def __dealloc__(self):
+ self.close()
+
+    def write(self, data):
+ if not self.bgzf:
+ raise ValueError("write() on closed BGZFile object")
+
+ if not self.bgzf.is_write:
+ import errno
+ raise OSError(errno.EBADF, "write() on read-only BGZFile object")
+
+ if isinstance(data, bytes):
+ length = len(data)
+ else:
+ # accept any data that supports the buffer protocol
+ data = memoryview(data)
+ length = data.nbytes
+
+ if length > 0 and bgzf_write(self.bgzf, <char *>data, length) < 0:
+ raise IOError('BGZFile write failed')
+
+ return length
+
+ def read(self, size=-1):
+ cdef ssize_t read_size
+
+ if not self.bgzf:
+ raise ValueError("read() on closed BGZFile object")
+
+ if self.bgzf.is_write:
+ import errno
+ raise OSError(errno.EBADF, "read() on write-only BGZFile object")
+
+ if size < 0:
+ chunks = []
+ while 1:
+                chunk = PyBytes_FromStringAndSize(NULL, BUFFER_SIZE)
+                read_size = bgzf_read(self.bgzf, <char *>chunk, BUFFER_SIZE)
+ if read_size < 0:
+ raise IOError('Error reading from BGZFile')
+ elif not read_size:
+ break
+ elif read_size < BUFFER_SIZE:
+ chunk = chunk[:read_size]
+ chunks.append(chunk)
+ return b''.join(chunks)
+
+ elif size > 0:
+ chunk = PyBytes_FromStringAndSize(NULL, size)
+ read_size = bgzf_read(self.bgzf, <char *>chunk, size)
+ if read_size < 0:
+ raise IOError('Error reading from BGZFile')
+            elif read_size < size:
+                # keep only the bytes actually read
+                chunk = chunk[:read_size]
+ return chunk
+ else:
+ return b''
+
+ @property
+ def closed(self):
+ return self.bgzf == NULL
+
+ def close(self):
+ if not self.bgzf:
+ return
+
+ if self.bgzf.is_write and bgzf_flush(self.bgzf) < 0:
+ raise IOError('Error flushing BGZFile object')
+
+ if self.index and bgzf_index_dump(self.bgzf, self.index, NULL) < 0:
+ raise IOError('Cannot write index')
+
+ cdef ret = bgzf_close(self.bgzf)
+ self.bgzf = NULL
+
+ if ret < 0:
+ raise IOError('Error closing BGZFile object')
+
+ def __enter__(self):
+ return self
+
+ def __exit__(self, type, value, tb):
+ self.close()
+
+ def flush(self):
+ if not self.bgzf:
+ return
+
+ if self.bgzf.is_write and bgzf_flush(self.bgzf) < 0:
+ raise IOError('Error flushing BGZFile object')
+
+ def fileno(self):
+ """Invoke the underlying file object's fileno() method.
+
+ This will raise AttributeError if the underlying file object
+ doesn't support fileno().
+ """
+ raise AttributeError('fileno')
+
+ def rewind(self):
+ '''Return the uncompressed stream file position indicator to the
+ beginning of the file'''
+ if not self.bgzf:
+ raise ValueError("rewind() on closed BGZFile object")
+        if self.bgzf.is_write:
+            raise OSError("Can't rewind in write mode")
+ if bgzf_seek(self.bgzf, 0, SEEK_SET) < 0:
+ raise IOError('Error seeking BGZFFile object')
+
+ def readable(self):
+ if not self.bgzf:
+ raise ValueError("readable() on closed BGZFile object")
+ return self.bgzf != NULL and not self.bgzf.is_write
+
+ def writable(self):
+ return self.bgzf != NULL and self.bgzf.is_write
+
+ def seekable(self):
+ return True
+
+ def seek(self, offset, whence=io.SEEK_SET):
+ if not self.bgzf:
+ raise ValueError("seek() on closed BGZFile object")
+        if whence != io.SEEK_SET:
+            raise ValueError('Only seeking from the start of the file (SEEK_SET) is supported')
+
+ cdef int64_t off = bgzf_seek(self.bgzf, offset, SEEK_SET)
+ if off < 0:
+ raise IOError('Error seeking BGZFFile object')
+
+ return off
+
+ def readline(self, size=-1):
+ if not self.bgzf:
+ raise ValueError("readline() on closed BGZFile object")
+
+ cdef kstring_t line
+ cdef char c
+
+ line.l = line.m = 0
+ line.s = NULL
+ if bgzf_getline(self.bgzf, '\n', &line) < 0:
+ raise IOError('Error reading line in BGZFFile object')
+
+ ret = charptr_to_str_w_len(line.s, line.l)
+
+ if line.m:
+ free(line.s)
+
+ return ret
--- /dev/null
+from libc.stdint cimport int8_t, int16_t, int32_t, int64_t
+from libc.stdint cimport uint8_t, uint16_t, uint32_t, uint64_t
+from libc.stdlib cimport malloc, calloc, realloc, free
+from libc.string cimport memcpy, memcmp, strncpy, strlen, strdup
+from libc.stdio cimport FILE, printf
+cimport cython
+
+from cpython cimport array
+from pysam.libchtslib cimport faidx_t, kstring_t, BGZF
+
+# These functions are put here and not in chtslib.pxd in order
+# to avoid warnings for unused functions.
+cdef extern from "pysam_stream.h" nogil:
+
+ ctypedef struct kstream_t:
+ pass
+
+ ctypedef struct kseq_t:
+ kstring_t name
+ kstring_t comment
+ kstring_t seq
+ kstring_t qual
+
+ kseq_t *kseq_init(BGZF *)
+ int kseq_read(kseq_t *)
+ void kseq_destroy(kseq_t *)
+ kstream_t *ks_init(BGZF *)
+ void ks_destroy(kstream_t *)
+
+ # Retrieve characters from stream until delimiter
+ # is reached placing results in str.
+ int ks_getuntil(kstream_t *,
+ int delimiter,
+ kstring_t * str,
+ int * dret)
+
+cdef class FastaFile:
+ cdef bint is_remote
+ cdef object _filename, _references, _lengths, reference2length
+ cdef faidx_t* fastafile
+ cdef char* _fetch(self, char* reference,
+ int start, int end, int* length)
+
+
+cdef class FastqProxy:
+ cdef kseq_t * _delegate
+ cdef cython.str tostring(self)
+ cpdef array.array get_quality_array(self, int offset=*)
+
+
+cdef class PersistentFastqProxy:
+ """
+ Python container for pysam.libcfaidx.FastqProxy with persistence.
+ """
+ cdef public str comment, quality, sequence, name
+ cdef cython.str tostring(self)
+ cpdef array.array get_quality_array(self, int offset=*)
+
+
+cdef class FastxFile:
+ cdef object _filename
+ cdef BGZF * fastqfile
+ cdef kseq_t * entry
+ cdef bint persist
+ cdef bint is_remote
+
+ cdef kseq_t * getCurrent(self)
+ cdef int cnext(self)
+
+
+# Compatibility Layer for pysam 0.8.1
+cdef class FastqFile(FastxFile):
+ pass
+
+
+# Compatibility Layer for pysam < 0.8
+cdef class Fastafile(FastaFile):
+ pass
+
--- /dev/null
+# cython: embedsignature=True
+# cython: profile=True
+###############################################################################
+###############################################################################
+# Cython wrapper for SAM/BAM/CRAM files based on htslib
+###############################################################################
+# The principal classes defined in this module are:
+#
+# class FastaFile random read read/write access to faidx indexd files
+# class FastxFile streamed read/write access to fasta/fastq files
+#
+# Additionally this module defines several additional classes that are part
+# of the internal API. These are:
+#
+# class FastqProxy
+# class PersistentFastqProxy
+#
+# For backwards compatibility, the following classes are also defined:
+#
+# class Fastafile equivalent to FastaFile
+# class FastqFile equivalent to FastxFile
+#
+###############################################################################
+#
+# The MIT License
+#
+# Copyright (c) 2015 Andreas Heger
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+#
+###############################################################################
+import sys
+import os
+import re
+from cpython cimport array
+
+from cpython cimport PyErr_SetString, \
+ PyBytes_Check, \
+ PyUnicode_Check, \
+ PyBytes_FromStringAndSize
+
+from cpython.version cimport PY_MAJOR_VERSION
+
+from pysam.libchtslib cimport \
+ faidx_nseq, fai_load, fai_destroy, fai_fetch, \
+ faidx_seq_len, \
+ faidx_fetch_seq, hisremote, \
+ bgzf_open, bgzf_close
+
+from pysam.libcutils cimport force_bytes, force_str, charptr_to_str
+from pysam.libcutils cimport encode_filename, from_string_and_size
+from pysam.libcutils cimport qualitystring_to_array, parse_region
+
+cdef class FastqProxy
+cdef makeFastqProxy(kseq_t * src):
+ '''enter src into AlignedRead.'''
+ cdef FastqProxy dest = FastqProxy.__new__(FastqProxy)
+ dest._delegate = src
+ return dest
+
+## TODO:
+## add automatic indexing.
+## add function to get sequence names.
+cdef class FastaFile:
+ """Random access to fasta formatted files that
+ have been indexed by :term:`faidx`.
+
+ The file is automatically opened. The index file of file
+ ``<filename>`` is expected to be called ``<filename>.fai``.
+
+ Parameters
+ ----------
+
+ filename : string
+ Filename of fasta file to be opened.
+
+ filepath_index : string
+ Optional, filename of the index. By default this is
+ the filename + ".fai".
+
+ Raises
+ ------
+
+ ValueError
+ if index file is missing
+
+ IOError
+ if file could not be opened
+
+ """
+
+ def __cinit__(self, *args, **kwargs):
+ self.fastafile = NULL
+ self._filename = None
+ self._references = None
+ self._lengths = None
+ self.reference2length = None
+ self._open(*args, **kwargs)
+
+ def is_open(self):
+        '''return true if the fasta file has been opened.'''
+ return self.fastafile != NULL
+
+ def __len__(self):
+ if self.fastafile == NULL:
+ raise ValueError("calling len() on closed file")
+
+ return faidx_nseq(self.fastafile)
+
+ def _open(self, filename, filepath_index=None):
+ '''open an indexed fasta file.
+
+ This method expects an indexed fasta file.
+ '''
+
+ # close a previously opened file
+ if self.fastafile != NULL:
+ self.close()
+
+ self._filename = encode_filename(filename)
+ cdef char *cfilename = self._filename
+ self.is_remote = hisremote(cfilename)
+
+ if filepath_index is not None:
+ raise NotImplementedError(
+ "setting an explicit path for the index "
+ "is not implemented")
+
+ # open file for reading
+ if (self._filename != b"-"
+ and not self.is_remote
+ and not os.path.exists(filename)):
+ raise IOError("file `%s` not found" % filename)
+
+ with nogil:
+ self.fastafile = fai_load(cfilename)
+
+ if self.fastafile == NULL:
+ raise IOError("could not open file `%s`" % filename)
+
+ if self.is_remote:
+ filepath_index = os.path.basename(
+ re.sub("[^:]+:[/]*", "", filename)) + ".fai"
+ elif filepath_index is None:
+ filepath_index = filename + ".fai"
+
+ if not os.path.exists(filepath_index):
+ raise ValueError("could not locate index file {}".format(
+ filepath_index))
+
+ with open(filepath_index) as inf:
+ data = [x.split("\t") for x in inf]
+ self._references = tuple(x[0] for x in data)
+ self._lengths = tuple(int(x[1]) for x in data)
+ self.reference2length = dict(zip(self._references, self._lengths))
+
+ def close(self):
+ """close the file."""
+ if self.fastafile != NULL:
+ fai_destroy(self.fastafile)
+ self.fastafile = NULL
+
+ def __dealloc__(self):
+ if self.fastafile != NULL:
+ fai_destroy(self.fastafile)
+ self.fastafile = NULL
+
+ # context manager interface
+ def __enter__(self):
+ return self
+
+ def __exit__(self, exc_type, exc_value, traceback):
+ self.close()
+ return False
+
+ property closed:
+ """"bool indicating the current state of the file object.
+ This is a read-only attribute; the close() method changes the value.
+ """
+ def __get__(self):
+ return not self.is_open()
+
+ property filename:
+ """filename associated with this object. This is a read-only attribute."""
+ def __get__(self):
+ return self._filename
+
+ property references:
+ '''tuple with the names of :term:`reference` sequences.'''
+ def __get__(self):
+ return self._references
+
+ property nreferences:
+ """"int with the number of :term:`reference` sequences in the file.
+ This is a read-only attribute."""
+ def __get__(self):
+ return len(self._references) if self.references else None
+
+ property lengths:
+ """tuple with the lengths of :term:`reference` sequences."""
+ def __get__(self):
+ return self._lengths
+
+ def fetch(self,
+ reference=None,
+ start=None,
+ end=None,
+ region=None):
+ """fetch sequences in a :term:`region`.
+
+ A region can
+ either be specified by :term:`reference`, `start` and
+ `end`. `start` and `end` denote 0-based, half-open
+ intervals.
+
+ Alternatively, a samtools :term:`region` string can be
+ supplied.
+
+ If any of the coordinates are missing they will be replaced by the
+ minimum (`start`) or maximum (`end`) coordinate.
+
+ Note that region strings are 1-based, while `start` and `end` denote
+ an interval in python coordinates.
+ The region is specified by :term:`reference`, `start` and `end`.
+
+ Returns
+ -------
+
+ string : a string with the sequence specified by the region.
+
+ Raises
+ ------
+
+ IndexError
+ if the coordinates are out of range
+
+ ValueError
+ if the region is invalid
+
+ """
+
+ if not self.is_open():
+ raise ValueError("I/O operation on closed file" )
+
+ cdef int length
+ cdef char *seq
+ cdef char *ref
+ cdef int rstart, rend
+
+ reference, rstart, rend = parse_region(reference, start, end, region)
+
+ if reference is None:
+ raise ValueError("no sequence/region supplied.")
+
+ if rstart == rend:
+ return ""
+
+ ref = reference
+ with nogil:
+ length = faidx_seq_len(self.fastafile, ref)
+ if length == -1:
+ raise KeyError("sequence '%s' not present" % reference)
+ if rstart >= length:
+ return ""
+
+ # fai_fetch adds a '\0' at the end
+ with nogil:
+ seq = faidx_fetch_seq(self.fastafile,
+ ref,
+ rstart,
+ rend-1,
+ &length)
+
+ if seq == NULL:
+ raise ValueError(
+ "failure when retrieving sequence on '%s'" % reference)
+
+ try:
+ return charptr_to_str(seq)
+ finally:
+ free(seq)
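+
+    # Usage sketch (hedged; the path and contig name are assumptions):
+    #
+    #     fa = FastaFile('ex1.fa')
+    #     fa.fetch('chr1', 0, 100)         # 0-based, half-open interval
+    #     fa.fetch(region='chr1:1-100')    # same bases via a 1-based region string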
+
+ cdef char * _fetch(self, char * reference, int start, int end, int * length):
+ '''fetch sequence for reference, start and end'''
+
+ with nogil:
+ return faidx_fetch_seq(self.fastafile,
+ reference,
+ start,
+ end-1,
+ length)
+
+ def get_reference_length(self, reference):
+ '''return the length of reference.'''
+ return self.reference2length[reference]
+
+ def __getitem__(self, reference):
+ return self.fetch(reference)
+
+ def __contains__(self, reference):
+ '''return true if reference in fasta file.'''
+ return reference in self.reference2length
+
+
+cdef class FastqProxy:
+ """A single entry in a fastq file."""
+ def __init__(self): pass
+
+ property name:
+ """The name of each entry in the fastq file."""
+ def __get__(self):
+ return charptr_to_str(self._delegate.name.s)
+
+ property sequence:
+ """The sequence of each entry in the fastq file."""
+ def __get__(self):
+ return charptr_to_str(self._delegate.seq.s)
+
+ property comment:
+ def __get__(self):
+ if self._delegate.comment.l:
+ return charptr_to_str(self._delegate.comment.s)
+ else:
+ return None
+
+ property quality:
+ """The quality score of each entry in the fastq file, represented as a string."""
+ def __get__(self):
+ if self._delegate.qual.l:
+ return charptr_to_str(self._delegate.qual.s)
+ else:
+ return None
+
+ cdef cython.str tostring(self):
+ if self.comment is None:
+ comment = ""
+ else:
+ comment = " %s" % self.comment
+
+ if self.quality is None:
+ return ">%s%s\n%s" % (self.name, comment, self.sequence)
+ else:
+ return "@%s%s\n%s\n+\n%s" % (self.name, comment,
+ self.sequence, self.quality)
+
+ def __str__(self):
+ return self.tostring()
+
+ cpdef array.array get_quality_array(self, int offset=33):
+ '''return quality values as integer array after subtracting offset.'''
+ if self.quality is None:
+ return None
+ return qualitystring_to_array(force_bytes(self.quality),
+ offset=offset)
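+
+    # Hedged example: with the default Sanger offset of 33, a quality string
+    # of '!!II' decodes to the integer values [0, 0, 40, 40].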
+
+cdef class PersistentFastqProxy:
+ """
+ Python container for pysam.libcfaidx.FastqProxy with persistence.
+ Needed to compare multiple fastq records from the same file.
+ """
+ def __init__(self, FastqProxy FastqRead):
+ self.comment = FastqRead.comment
+ self.quality = FastqRead.quality
+ self.sequence = FastqRead.sequence
+ self.name = FastqRead.name
+
+ cdef cython.str tostring(self):
+ if self.comment is None:
+ comment = ""
+ else:
+ comment = " %s" % self.comment
+
+ if self.quality is None:
+ return ">%s%s\n%s" % (self.name, comment, self.sequence)
+ else:
+ return "@%s%s\n%s\n+\n%s" % (self.name, comment,
+ self.sequence, self.quality)
+
+ def __str__(self):
+ return self.tostring()
+
+ cpdef array.array get_quality_array(self, int offset=33):
+ '''return quality values as array after subtracting offset.'''
+ if self.quality is None:
+ return None
+ return qualitystring_to_array(force_bytes(self.quality),
+ offset=offset)
+
+
+cdef class FastxFile:
+ """Stream access to :term:`fasta` or :term:`fastq` formatted files.
+
+ The file is automatically opened.
+
+ Entries in the file can be both fastq or fasta formatted or even a
+ mixture of the two.
+
+ This file object permits iterating over all entries in the
+ file. Random access is not implemented. The iteration returns
+ objects of type :class:`FastqProxy`
+
+ Parameters
+ ----------
+
+ filename : string
+ Filename of fasta/fastq file to be opened.
+
+ persist : bool
+
+ If True (default) make a copy of the entry in the file during
+ iteration. If set to False, no copy will be made. This will
+ permit faster iteration, but an entry will not persist when
+ the iteration continues.
+
+ Notes
+ -----
+ Prior to version 0.8.2, this was called FastqFile.
+
+ Raises
+ ------
+
+ IOError
+ if file could not be opened
+
+
+ Examples
+ --------
+ >>> with pysam.FastxFile(filename) as fh:
+ ... for entry in fh:
+ ... print(entry.name)
+ ... print(entry.sequence)
+ ... print(entry.comment)
+ ... print(entry.quality)
+
+ """
+ def __cinit__(self, *args, **kwargs):
+ # self.fastqfile = <gzFile*>NULL
+ self._filename = None
+ self.entry = NULL
+ self._open(*args, **kwargs)
+
+ def is_open(self):
+        '''return true if the fasta/fastq file has been opened.'''
+ return self.entry != NULL
+
+ def _open(self, filename, persist=True):
+ '''open a fastq/fasta file in *filename*
+
+        Parameters
+ ----------
+
+ persist : bool
+
+ if True return a copy of the underlying data (default
+ True). The copy will persist even if the iteration
+ on the file continues.
+
+ '''
+ if self.fastqfile != NULL:
+ self.close()
+
+ self._filename = encode_filename(filename)
+ cdef char *cfilename = self._filename
+ self.is_remote = hisremote(cfilename)
+
+ # open file for reading
+ if (self._filename != b"-"
+ and not self.is_remote
+ and not os.path.exists(filename)):
+ raise IOError("file `%s` not found" % filename)
+
+ self.persist = persist
+
+ with nogil:
+ self.fastqfile = bgzf_open(cfilename, "r")
+ self.entry = kseq_init(self.fastqfile)
+ self._filename = filename
+
+ def close(self):
+ '''close the file.'''
+ if self.fastqfile != NULL:
+ bgzf_close(self.fastqfile)
+ self.fastqfile = NULL
+ if self.entry != NULL:
+ kseq_destroy(self.entry)
+ self.entry = NULL
+
+ def __dealloc__(self):
+ if self.fastqfile != NULL:
+ bgzf_close(self.fastqfile)
+ if self.entry:
+ kseq_destroy(self.entry)
+
+ # context manager interface
+ def __enter__(self):
+ return self
+
+ def __exit__(self, exc_type, exc_value, traceback):
+ self.close()
+ return False
+
+ property closed:
+ """"bool indicating the current state of the file object.
+ This is a read-only attribute; the close() method changes the value.
+ """
+ def __get__(self):
+ return not self.is_open()
+
+ property filename:
+ """string with the filename associated with this object."""
+ def __get__(self):
+ return self._filename
+
+ def __iter__(self):
+ if not self.is_open():
+ raise ValueError("I/O operation on closed file")
+ return self
+
+ cdef kseq_t * getCurrent(self):
+ return self.entry
+
+ cdef int cnext(self):
+ '''C version of iterator
+ '''
+ with nogil:
+ return kseq_read(self.entry)
+
+ def __next__(self):
+ """
+ python version of next().
+ """
+ cdef int l
+ with nogil:
+ l = kseq_read(self.entry)
+ if (l >= 0):
+ if self.persist:
+ return PersistentFastqProxy(makeFastqProxy(self.entry))
+ return makeFastqProxy(self.entry)
+ else:
+ raise StopIteration
+
+# Compatibility Layer for pysam 0.8.1
+cdef class FastqFile(FastxFile):
+ """FastqFile is deprecated: use FastxFile instead"""
+ pass
+
+# Compatibility Layer for pysam < 0.8
+cdef class Fastafile(FastaFile):
+ """Fastafile is deprecated: use FastaFile instead"""
+ pass
+
+__all__ = ["FastaFile",
+ "FastqFile",
+ "FastxFile",
+ "Fastafile",
+ "FastqProxy"]
--- /dev/null
+from libc.stdint cimport int8_t, int16_t, int32_t, int64_t
+from libc.stdint cimport uint8_t, uint16_t, uint32_t, uint64_t
+from libc.stdlib cimport malloc, calloc, realloc, free
+from libc.string cimport memcpy, memcmp, strncpy, strlen, strdup
+from libc.stdio cimport FILE, printf
+from posix.types cimport off_t
+
+cdef extern from "Python.h":
+ FILE* PyFile_AsFile(object)
+
+
+cdef extern from "htslib/kstring.h" nogil:
+ ctypedef struct kstring_t:
+ size_t l, m
+ char *s
+
+
+cdef extern from "htslib_util.h" nogil:
+ int hts_set_verbosity(int verbosity)
+ int hts_get_verbosity()
+
+ ctypedef uint32_t khint32_t
+ ctypedef uint32_t khint_t
+ ctypedef khint_t khiter_t
+
+ # Used to manage BCF Header info
+ ctypedef struct vdict_t:
+ khint_t n_buckets, size, n_occupied, upper_bound
+ khint32_t *flags
+ const char *keys
+ bcf_idinfo_t *vals
+
+ # Used to manage indexed contigs in Tabix
+ ctypedef struct s2i_t:
+ khint_t n_buckets, size, n_occupied, upper_bound
+ khint32_t *flags
+ const char *keys
+ int64_t *vals
+
+ # Generic khash methods
+ khint_t kh_size(void *d)
+ khint_t kh_begin(void *d)
+ khint_t kh_end(void *d)
+ int kh_exist(void *d, khiter_t i)
+
+ # Specialized khash methods for vdict
+ khint_t kh_get_vdict(vdict_t *d, const char *key)
+ const char *kh_key_vdict "kh_key" (vdict_t *d, khint_t i)
+ bcf_idinfo_t kh_val_vdict "kh_val" (vdict_t *d, khint_t i)
+
+
+cdef extern from "htslib/hfile.h" nogil:
+ ctypedef struct hFILE
+
+ # @abstract Open the named file or URL as a stream
+ # @return An hFILE pointer, or NULL (with errno set) if an error occurred.
+ hFILE *hopen(const char *filename, const char *mode)
+
+ # @abstract Associate a stream with an existing open file descriptor
+ # @return An hFILE pointer, or NULL (with errno set) if an error occurred.
+ # @notes For socket descriptors (on Windows), mode should contain 's'.
+ hFILE *hdopen(int fd, const char *mode)
+
+ # @abstract Report whether the file name or URL denotes remote storage
+ # @return 0 if local, 1 if remote.
+ # @notes "Remote" means involving e.g. explicit network access, with the
+ # implication that callers may wish to cache such files' contents locally.
+ int hisremote(const char *filename)
+
+ # @abstract Flush (for output streams) and close the stream
+ # @return 0 if successful, or EOF (with errno set) if an error occurred.
+ int hclose(hFILE *fp)
+
+ # @abstract Close the stream, without flushing or propagating errors
+ # @notes For use while cleaning up after an error only. Preserves errno.
+ void hclose_abruptly(hFILE *fp)
+
+ # @abstract Return the stream's error indicator
+ # @return Non-zero (in fact, an errno value) if an error has occurred.
+ # @notes This would be called herror() and return true/false to parallel
+ # ferror(3), but a networking-related herror(3) function already exists. */
+ int herrno(hFILE *fp)
+
+ # @abstract Clear the stream's error indicator
+ void hclearerr(hFILE *fp)
+
+ # @abstract Reposition the read/write stream offset
+ # @return The resulting offset within the stream (as per lseek(2)),
+ # or negative if an error occurred.
+ off_t hseek(hFILE *fp, off_t offset, int whence)
+
+ # @abstract Report the current stream offset
+ # @return The offset within the stream, starting from zero.
+ off_t htell(hFILE *fp)
+
+ # @abstract Read one character from the stream
+ # @return The character read, or EOF on end-of-file or error
+ int hgetc(hFILE *fp)
+
+ # @abstract Peek at characters to be read without removing them from buffers
+ # @param fp The file stream
+ # @param buffer The buffer to which the peeked bytes will be written
+ # @param nbytes The number of bytes to peek at; limited by the size of the
+ # internal buffer, which could be as small as 4K.
+ # @return The number of bytes peeked, which may be less than nbytes if EOF
+ # is encountered; or negative, if there was an I/O error.
+ # @notes The characters peeked at remain in the stream's internal buffer,
+ # and will be returned by later hread() etc calls.
+ ssize_t hpeek(hFILE *fp, void *buffer, size_t nbytes)
+
+ # @abstract Read a block of characters from the file
+ # @return The number of bytes read, or negative if an error occurred.
+ # @notes The full nbytes requested will be returned, except as limited
+ # by EOF or I/O errors.
+ ssize_t hread(hFILE *fp, void *buffer, size_t nbytes)
+
+ # @abstract Write a character to the stream
+ # @return The character written, or EOF if an error occurred.
+ int hputc(int c, hFILE *fp)
+
+ # @abstract Write a string to the stream
+ # @return 0 if successful, or EOF if an error occurred.
+ int hputs(const char *text, hFILE *fp)
+
+ # @abstract Write a block of characters to the file
+ # @return Either nbytes, or negative if an error occurred.
+ # @notes In the absence of I/O errors, the full nbytes will be written.
+ ssize_t hwrite(hFILE *fp, const void *buffer, size_t nbytes)
+
+ # @abstract For writing streams, flush buffered output to the underlying stream
+ # @return 0 if successful, or EOF if an error occurred.
+ int hflush(hFILE *fp)
+
+
+cdef extern from "htslib/bgzf.h" nogil:
+ ctypedef struct bgzf_mtaux_t
+ ctypedef struct bgzidx_t
+ ctypedef struct z_stream
+
+ ctypedef struct BGZF:
+ unsigned errcode
+ unsigned is_write
+ int is_be
+ int compress_level
+ int is_compressed
+ int is_gzip
+ int cache_size
+ int64_t block_address
+ int64_t uncompressed_address
+ void *uncompressed_block
+ void *compressed_block
+ void *cache
+ hFILE *fp
+ bgzf_mtaux_t *mt
+ bgzidx_t *idx
+ int idx_build_otf
+ z_stream *gz_stream
+
+    #*****************
+    # Basic routines *
+    #*****************
+
+ # Open an existing file descriptor for reading or writing.
+ #
+ # @param fd file descriptor
+ # @param mode mode matching /[rwag][u0-9]+/: 'r' for reading, 'w' for
+ # writing, 'a' for appending, 'g' for gzip rather than BGZF
+ # compression (with 'w' only), and digit specifies the zlib
+ # compression level.
+ # Note that there is a distinction between 'u' and '0': the
+ # first yields plain uncompressed output whereas the latter
+ # outputs uncompressed data wrapped in the zlib format.
+ # @return BGZF file handler; 0 on error
+
+ BGZF* bgzf_dopen(int fd, const char *mode)
+ BGZF* bgzf_fdopen(int fd, const char *mode) # for backward compatibility
+
+ # Open the specified file for reading or writing.
+ BGZF* bgzf_open(const char* path, const char *mode)
+
+ # Open an existing hFILE stream for reading or writing.
+ BGZF* bgzf_hopen(hFILE *fp, const char *mode)
+
+ # Close the BGZF and free all associated resources.
+ #
+ # @param fp BGZF file handler
+ # @return 0 on success and -1 on error
+ int bgzf_close(BGZF *fp)
+
+    # Read up to _length_ bytes from the file, storing them into _data_.
+ #
+ # @param fp BGZF file handler
+ # @param data data array to read into
+ # @param length size of data to read
+ # @return number of bytes actually read; 0 on end-of-file and -1 on error
+ ssize_t bgzf_read(BGZF *fp, void *data, size_t length)
+
+ # Write _length_ bytes from _data_ to the file. If no I/O errors occur,
+ # the complete _length_ bytes will be written (or queued for writing).
+ #
+ # @param fp BGZF file handler
+ # @param data data array to write
+ # @param length size of data to write
+ # @return number of bytes written (i.e., _length_); negative on error
+ ssize_t bgzf_write(BGZF *fp, const void *data, size_t length)
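+
+    # Usage sketch (illustrative only; "out.gz" is a placeholder filename):
+    #   BGZF *fp = bgzf_open("out.gz", "w");
+    #   if (bgzf_write(fp, "hello\n", 6) < 0) { /* report write error */ }
+    #   if (bgzf_close(fp) < 0) { /* report close error */ }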
+
+ # Read up to _length_ bytes directly from the underlying stream without
+ # decompressing. Bypasses BGZF blocking, so must be used with care in
+ # specialised circumstances only.
+ #
+ # @param fp BGZF file handler
+ # @param data data array to read into
+ # @param length number of raw bytes to read
+ # @return number of bytes actually read; 0 on end-of-file and -1 on error
+ ssize_t bgzf_raw_read(BGZF *fp, void *data, size_t length)
+
+ # Write _length_ bytes directly to the underlying stream without
+ # compressing. Bypasses BGZF blocking, so must be used with care
+ # in specialised circumstances only.
+ #
+ # @param fp BGZF file handler
+ # @param data data array to write
+ # @param length number of raw bytes to write
+ # @return number of bytes actually written; -1 on error
+ ssize_t bgzf_raw_write(BGZF *fp, const void *data, size_t length)
+
+ # Write the data in the buffer to the file.
+ int bgzf_flush(BGZF *fp)
+
+ int SEEK_SET
+
+ # Return a virtual file pointer to the current location in the file.
+    # No interpretation of the value should be made, other than that a
+    # subsequent call to bgzf_seek() can be used to reposition the file to
+    # the same point.
+ # Return value is non-negative on success.
+ int64_t bgzf_tell(BGZF *fp)
+
+ # Set the file to read from the location specified by _pos_.
+ #
+ # @param fp BGZF file handler
+ # @param pos virtual file offset returned by bgzf_tell()
+ # @param whence must be SEEK_SET
+ # @return 0 on success and -1 on error
+ int64_t bgzf_seek(BGZF *fp, int64_t pos, int whence)
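+
+    # Usage sketch (illustrative only; fp is an open BGZF handle): remember
+    # a record's position with bgzf_tell() and jump back to it later; the
+    # virtual offset is opaque.
+    #   int64_t vpos = bgzf_tell(fp);
+    #   /* ... read further records ... */
+    #   if (bgzf_seek(fp, vpos, SEEK_SET) < 0) { /* report seek error */ }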
+
+ # Check if the BGZF end-of-file (EOF) marker is present
+ #
+ # @param fp BGZF file handler opened for reading
+ # @return 1 if the EOF marker is present and correct
+ # 2 if it can't be checked, e.g., because fp isn't seekable
+ # 0 if the EOF marker is absent
+ # -1 (with errno set) on error
+ int bgzf_check_EOF(BGZF *fp)
+
+ # Check if a file is in the BGZF format
+ #
+ # @param fn file name
+ # @return 1 if _fn_ is BGZF; 0 if not or on I/O error
+ int bgzf_is_bgzf(const char *fn)
+
+ #*********************
+ # Advanced routines *
+ #*********************
+
+ # Set the cache size. Only effective when compiled with -DBGZF_CACHE.
+ #
+ # @param fp BGZF file handler
+ # @param size size of cache in bytes; 0 to disable caching (default)
+ void bgzf_set_cache_size(BGZF *fp, int size)
+
+ # Flush the file if the remaining buffer size is smaller than _size_
+ # @return 0 if flushing succeeded or was not needed; negative on error
+ int bgzf_flush_try(BGZF *fp, ssize_t size)
+
+ # Read one byte from a BGZF file. It is faster than bgzf_read()
+ # @param fp BGZF file handler
+ # @return byte read; -1 on end-of-file or error
+ int bgzf_getc(BGZF *fp)
+
+ # Read one line from a BGZF file. It is faster than bgzf_getc()
+ #
+ # @param fp BGZF file handler
+    # @param delim delimiter
+ # @param str string to write to; must be initialized
+ # @return length of the string; 0 on end-of-file; negative on error
+ int bgzf_getline(BGZF *fp, int delim, kstring_t *str)
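+
+    # Usage sketch (illustrative only; per the return convention above, loop
+    # while the returned length is positive):
+    #   kstring_t str = {0, 0, NULL};
+    #   while (bgzf_getline(fp, '\n', &str) > 0) { /* process str.s */ }
+    #   free(str.s);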
+
+ # Read the next BGZF block.
+ int bgzf_read_block(BGZF *fp)
+
+ # Enable multi-threading (only effective on writing and when the
+ # library was compiled with -DBGZF_MT)
+ #
+ # @param fp BGZF file handler; must be opened for writing
+ # @param n_threads #threads used for writing
+    # @param n_sub_blks #blocks processed by each thread; a value of 64-256 is recommended
+ int bgzf_mt(BGZF *fp, int n_threads, int n_sub_blks)
+
+
+ # Compress a single BGZF block.
+ #
+ # @param dst output buffer (must have size >= BGZF_MAX_BLOCK_SIZE)
+ # @param dlen size of output buffer; updated on return to the number
+ # of bytes actually written to dst
+ # @param src buffer to be compressed
+ # @param slen size of data to compress (must be <= BGZF_BLOCK_SIZE)
+ # @param level compression level
+ # @return 0 on success and negative on error
+ #
+ int bgzf_compress(void *dst, size_t *dlen, const void *src, size_t slen, int level)
+
+    #*******************
+    # bgzidx routines *
+    #*******************
+
+    # Position BGZF at the uncompressed offset
+ #
+ # @param fp BGZF file handler; must be opened for reading
+ # @param uoffset file offset in the uncompressed data
+ # @param where SEEK_SET supported atm
+ #
+ # Returns 0 on success and -1 on error.
+ int bgzf_useek(BGZF *fp, long uoffset, int where)
+
+ # Position in uncompressed BGZF
+ #
+ # @param fp BGZF file handler; must be opened for reading
+ #
+ # Returns the current offset on success and -1 on error.
+ long bgzf_utell(BGZF *fp)
+
+ # Tell BGZF to build index while compressing.
+ #
+ # @param fp BGZF file handler; can be opened for reading or writing.
+ #
+ # Returns 0 on success and -1 on error.
+ int bgzf_index_build_init(BGZF *fp)
+
+ # Load BGZF index
+ #
+ # @param fp BGZF file handler
+ # @param bname base name
+ # @param suffix suffix to add to bname (can be NULL)
+ #
+ # Returns 0 on success and -1 on error.
+ int bgzf_index_load(BGZF *fp, const char *bname, const char *suffix)
+
+ # Save BGZF index
+ #
+ # @param fp BGZF file handler
+ # @param bname base name
+ # @param suffix suffix to add to bname (can be NULL)
+ #
+ # Returns 0 on success and -1 on error.
+ int bgzf_index_dump(BGZF *fp, const char *bname, const char *suffix)
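+
+    # Usage sketch (illustrative only; "out.gz" is a placeholder): build a
+    # .gzi index while compressing and dump it next to the output file.
+    #   BGZF *fp = bgzf_open("out.gz", "w");
+    #   bgzf_index_build_init(fp);
+    #   /* ... bgzf_write() the data ... */
+    #   bgzf_index_dump(fp, "out.gz", ".gzi");
+    #   bgzf_close(fp);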
+
+
+cdef extern from "htslib/hts.h" nogil:
+ uint32_t kroundup32(uint32_t x)
+
+ ctypedef struct cram_fd
+
+ union FilePointerUnion:
+ BGZF *bgzf
+ cram_fd *cram
+ hFILE *hfile
+ void *voidp
+
+ enum htsFormatCategory:
+ unknown_category
+ sequence_data # Sequence data -- SAM, BAM, CRAM, etc
+ variant_data # Variant calling data -- VCF, BCF, etc
+ index_file # Index file associated with some data file
+ region_list # Coordinate intervals or regions -- BED, etc
+ category_maximum
+
+ enum htsExactFormat:
+ unknown_format
+ binary_format
+ text_format
+ sam, bam, bai, cram, crai, vcf, bcf, csi, gzi, tbi, bed
+ format_maximum
+
+ enum htsCompression:
+ no_compression, gzip, bgzf, custom
+ compression_maximum
+
+ enum hts_fmt_option:
+ CRAM_OPT_DECODE_MD,
+ CRAM_OPT_PREFIX,
+ CRAM_OPT_VERBOSITY,
+ CRAM_OPT_SEQS_PER_SLICE,
+ CRAM_OPT_SLICES_PER_CONTAINER,
+ CRAM_OPT_RANGE,
+ CRAM_OPT_VERSION,
+ CRAM_OPT_EMBED_REF,
+ CRAM_OPT_IGNORE_MD5,
+ CRAM_OPT_REFERENCE,
+ CRAM_OPT_MULTI_SEQ_PER_SLICE,
+ CRAM_OPT_NO_REF,
+ CRAM_OPT_USE_BZIP2,
+ CRAM_OPT_SHARED_REF,
+ CRAM_OPT_NTHREADS,
+ CRAM_OPT_THREAD_POOL,
+ CRAM_OPT_USE_LZMA,
+ CRAM_OPT_USE_RANS,
+ CRAM_OPT_REQUIRED_FIELDS,
+ HTS_OPT_COMPRESSION_LEVEL,
+ HTS_OPT_NTHREADS,
+
+ ctypedef struct htsVersion:
+ short major, minor
+
+ ctypedef struct htsFormat:
+ htsFormatCategory category
+ htsExactFormat format
+ htsVersion version
+ htsCompression compression
+ short compression_level
+ void *specific
+
+ ctypedef struct htsFile:
+ uint8_t is_bin
+ uint8_t is_write
+ uint8_t is_be
+ uint8_t is_cram
+ int64_t lineno
+ kstring_t line
+ char *fn
+ char *fn_aux
+ FilePointerUnion fp
+ htsFormat format
+
+ int hts_verbose
+
+ # @abstract Table for converting a nucleotide character to 4-bit encoding.
+ # The input character may be either an IUPAC ambiguity code, '=' for 0, or
+ # '0'/'1'/'2'/'3' for a result of 1/2/4/8. The result is encoded as 1/2/4/8
+ # for A/C/G/T or combinations of these bits for ambiguous bases.
+ const unsigned char *seq_nt16_table
+
+ # @abstract Table for converting a 4-bit encoded nucleotide to an IUPAC
+ # ambiguity code letter (or '=' when given 0).
+ const char *seq_nt16_str
+
+ # @abstract Table for converting a 4-bit encoded nucleotide to about 2 bits.
+ # Returns 0/1/2/3 for 1/2/4/8 (i.e., A/C/G/T), or 4 otherwise (0 or ambiguous).
+ const int *seq_nt16_int
+
+ # @abstract Get the htslib version number
+ # @return For released versions, a string like "N.N[.N]"; or git describe
+ # output if using a library built within a Git repository.
+ const char *hts_version()
+
+ # @abstract Determine format by peeking at the start of a file
+ # @param fp File opened for reading, positioned at the beginning
+ # @param fmt Format structure that will be filled out on return
+ # @return 0 for success, or negative if an error occurred.
+ int hts_detect_format(hFILE *fp, htsFormat *fmt)
+
+ # @abstract Get a human-readable description of the file format
+ # @return Description string, to be freed by the caller after use.
+ char *hts_format_description(const htsFormat *format)
+
+ # @abstract Open a SAM/BAM/CRAM/VCF/BCF/etc file
+ # @param fn The file name or "-" for stdin/stdout
+ # @param mode Mode matching / [rwa][bceguxz0-9]* /
+ # @discussion
+ # With 'r' opens for reading; any further format mode letters are ignored
+ # as the format is detected by checking the first few bytes or BGZF blocks
+ # of the file. With 'w' or 'a' opens for writing or appending, with format
+ # specifier letters:
+ # b binary format (BAM, BCF, etc) rather than text (SAM, VCF, etc)
+ # c CRAM format
+ # g gzip compressed
+ # u uncompressed
+ # z bgzf compressed
+ # [0-9] zlib compression level
+ # and with non-format option letters (for any of 'r'/'w'/'a'):
+ # e close the file on exec(2) (opens with O_CLOEXEC, where supported)
+ # x create the file exclusively (opens with O_EXCL, where supported)
+ # Note that there is a distinction between 'u' and '0': the first yields
+ # plain uncompressed output whereas the latter outputs uncompressed data
+ # wrapped in the zlib format.
+ # @example
+ # [rw]b .. compressed BCF, BAM, FAI
+ # [rw]bu .. uncompressed BCF
+ # [rw]z .. compressed VCF
+ # [rw] .. uncompressed VCF
+ htsFile *hts_open(const char *fn, const char *mode)
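+
+    # Usage sketch (illustrative only; filenames are placeholders):
+    #   htsFile *in  = hts_open("in.bam", "r");      /* format auto-detected */
+    #   htsFile *out = hts_open("out.vcf.gz", "wz"); /* bgzf-compressed VCF  */
+    #   hts_close(out); hts_close(in);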
+
+ # @abstract Open a SAM/BAM/CRAM/VCF/BCF/etc file
+ # @param fn The file name or "-" for stdin/stdout
+ # @param mode Open mode, as per hts_open()
+ # @param fmt Optional format specific parameters
+ # @discussion
+ # See hts_open() for description of fn and mode.
+    # TODO Update documentation for s/opts/fmt/
+ # Opts contains a format string (sam, bam, cram, vcf, bcf) which will,
+ # if defined, override mode. Opts also contains a linked list of hts_opt
+ # structures to apply to the open file handle. These can contain things
+ # like pointers to the reference or information on compression levels,
+ # block sizes, etc.
+ htsFile *hts_open_format(const char *fn, const char *mode, const htsFormat *fmt)
+
+ # @abstract Open an existing stream as a SAM/BAM/CRAM/VCF/BCF/etc file
+ # @param fp The already-open file handle
+ # @param fn The file name or "-" for stdin/stdout
+ # @param mode Open mode, as per hts_open()
+ htsFile *hts_hopen(hFILE *fp, const char *fn, const char *mode)
+
+ # @abstract Close a file handle, flushing buffered data for output streams
+ # @param fp The file handle to be closed
+ # @return 0 for success, or negative if an error occurred.
+ int hts_close(htsFile *fp)
+
+ # @abstract Returns the file's format information
+ # @param fp The file handle
+ # @return Read-only pointer to the file's htsFormat.
+ const htsFormat *hts_get_format(htsFile *fp)
+
+    # @abstract Returns a string containing the file format extension.
+    # @param format Format structure containing the file type.
+    # @return A string ("sam", "bam", etc) or "?" for unknown formats.
+ const char *hts_format_file_extension(const htsFormat *format)
+
+ # @abstract Sets a specified CRAM option on the open file handle.
+    # @param fp The open file handle.
+ # @param opt The CRAM_OPT_* option.
+ # @param ... Optional arguments, dependent on the option used.
+ # @return 0 for success, or negative if an error occurred.
+ int hts_set_opt(htsFile *fp, hts_fmt_option opt, ...)
+
+ int hts_getline(htsFile *fp, int delimiter, kstring_t *str)
+ char **hts_readlines(const char *fn, int *_n)
+
+ # @abstract Parse comma-separated list or read list from a file
+    # @param fn       File name or comma-separated list
+    # @param is_file  Whether fn names a file (1) or is itself the list (0)
+    # @param _n       Size of the output array (number of items read)
+    # @return NULL on failure, or a pointer to a newly allocated array of
+    #         strings
+ char **hts_readlist(const char *fn, int is_file, int *_n)
+
+ # @abstract Create extra threads to aid compress/decompression for this file
+ # @param fp The file handle
+ # @param n The number of worker threads to create
+ # @return 0 for success, or negative if an error occurred.
+ # @notes THIS THREADING API IS LIKELY TO CHANGE IN FUTURE.
+ int hts_set_threads(htsFile *fp, int n)
+
+ # @abstract Set .fai filename for a file opened for reading
+ # @return 0 for success, negative on failure
+ # @discussion
+ # Called before *_hdr_read(), this provides the name of a .fai file
+ # used to provide a reference list if the htsFile contains no @SQ headers.
+ int hts_set_fai_filename(htsFile *fp, const char *fn_aux)
+
+ int8_t HTS_IDX_NOCOOR
+ int8_t HTS_IDX_START
+ int8_t HTS_IDX_REST
+ int8_t HTS_IDX_NONE
+
+ int8_t HTS_FMT_CSI
+ int8_t HTS_FMT_BAI
+ int8_t HTS_FMT_TBI
+ int8_t HTS_FMT_CRAI
+
+ BGZF *hts_get_bgzfp(htsFile *fp)
+ int hts_useek(htsFile *fp, long uoffset, int where)
+ long hts_utell(htsFile *fp)
+
+ ctypedef struct hts_idx_t
+
+ ctypedef struct hts_pair64_t:
+ uint64_t u, v
+
+ ctypedef int hts_readrec_func(BGZF *fp, void *data, void *r, int *tid, int *beg, int *end)
+
+ ctypedef struct hts_bins_t:
+ int n, m
+ int *a
+
+ ctypedef struct hts_itr_t:
+ uint32_t read_rest
+ uint32_t finished
+        int tid, beg, end, n_off, i
+ int curr_tid, curr_beg, curr_end
+ uint64_t curr_off
+ hts_pair64_t *off
+ hts_readrec_func *readfunc
+ hts_bins_t bins
+
+ hts_idx_t *hts_idx_init(int n, int fmt, uint64_t offset0, int min_shift, int n_lvls)
+ void hts_idx_destroy(hts_idx_t *idx)
+ int hts_idx_push(hts_idx_t *idx, int tid, int beg, int end, uint64_t offset, int is_mapped)
+ void hts_idx_finish(hts_idx_t *idx, uint64_t final_offset)
+
+ #### Save an index to a file
+ # @param idx Index to be written
+ # @param fn Input BAM/BCF/etc filename, to which .bai/.csi/etc will be added
+ # @param fmt One of the HTS_FMT_* index formats
+ # @return 0 if successful, or negative if an error occurred.
+ int hts_idx_save(const hts_idx_t *idx, const char *fn, int fmt)
+
+ #### Save an index to a specific file
+ # @param idx Index to be written
+ # @param fn Input BAM/BCF/etc filename
+ # @param fnidx Output filename, or NULL to add .bai/.csi/etc to @a fn
+ # @param fmt One of the HTS_FMT_* index formats
+ # @return 0 if successful, or negative if an error occurred.
+ int hts_idx_save_as(const hts_idx_t *idx, const char *fn, const char *fnidx, int fmt)
+
+ #### Load an index file
+ # @param fn BAM/BCF/etc filename, to which .bai/.csi/etc will be added or
+ # the extension substituted, to search for an existing index file
+ # @param fmt One of the HTS_FMT_* index formats
+ # @return The index, or NULL if an error occurred.
+ hts_idx_t *hts_idx_load(const char *fn, int fmt)
+
+ #### Load a specific index file
+ # @param fn Input BAM/BCF/etc filename
+ # @param fnidx The input index filename
+ # @return The index, or NULL if an error occurred.
+ hts_idx_t *hts_idx_load2(const char *fn, const char *fnidx)
+
+ uint8_t *hts_idx_get_meta(hts_idx_t *idx, int *l_meta)
+ void hts_idx_set_meta(hts_idx_t *idx, int l_meta, uint8_t *meta, int is_copy)
+
+ int hts_idx_get_stat(const hts_idx_t* idx, int tid,
+ uint64_t* mapped, uint64_t* unmapped)
+
+ uint64_t hts_idx_get_n_no_coor(const hts_idx_t* idx)
+
+ int HTS_PARSE_THOUSANDS_SEP # Ignore ',' separators within numbers
+
+ # Parse a numeric string
+ # The number may be expressed in scientific notation, and optionally may
+ # contain commas in the integer part (before any decimal point or E notation).
+ # @param str String to be parsed
+ # @param strend If non-NULL, set on return to point to the first character
+ # in @a str after those forming the parsed number
+ # @param flags Or'ed-together combination of HTS_PARSE_* flags
+ # @return Converted value of the parsed number.
+ #
+ # When @a strend is NULL, a warning will be printed (if hts_verbose is 2
+ # or more) if there are any trailing characters after the number.
+ long long hts_parse_decimal(const char *str, char **strend, int flags)
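+
+    # Usage sketch (illustrative only): both calls below yield 1000000.
+    #   long long a = hts_parse_decimal("1e6", NULL, 0);
+    #   long long b = hts_parse_decimal("1,000,000", NULL, HTS_PARSE_THOUSANDS_SEP);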
+
+ # Parse a "CHR:START-END"-style region string
+ # @param str String to be parsed
+ # @param beg Set on return to the 0-based start of the region
+ # @param end Set on return to the 1-based end of the region
+ # @return Pointer to the colon or '\0' after the reference sequence name,
+ # or NULL if @a str could not be parsed.
+ const char *hts_parse_reg(const char *str, int *beg, int *end)
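+
+    # Usage sketch (illustrative only): "chr2:100-200" yields beg=99
+    # (0-based start) and end=200 (1-based end).
+    #   int beg, end;
+    #   if (hts_parse_reg("chr2:100-200", &beg, &end) == NULL)
+    #       { /* unparsable region string */ }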
+
+ hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, int beg, int end, hts_readrec_func *readrec)
+ void hts_itr_destroy(hts_itr_t *iter)
+
+ ctypedef int (*hts_name2id_f)(void*, const char*)
+ ctypedef const char *(*hts_id2name_f)(void*, int)
+ ctypedef hts_itr_t *hts_itr_query_func(
+ const hts_idx_t *idx,
+ int tid,
+ int beg,
+ int end,
+ hts_readrec_func *readrec)
+
+ hts_itr_t *hts_itr_querys(
+ const hts_idx_t *idx,
+ const char *reg,
+ hts_name2id_f getid,
+ void *hdr,
+ hts_itr_query_func *itr_query,
+ hts_readrec_func *readrec)
+
+ int hts_itr_next(BGZF *fp, hts_itr_t *iter, void *r, void *data)
+ const char **hts_idx_seqnames(const hts_idx_t *idx, int *n, hts_id2name_f getid, void *hdr) # free only the array, not the values
+
+ # hts_file_type() - Convenience function to determine file type
+ # @fname: the file name
+ #
+ # Returns one of the FT_* defines.
+ #
+ # DEPRECATED: This function has been replaced by hts_detect_format().
+ # It and these FT_* macros will be removed in a future HTSlib release.
+ int FT_UNKN
+ int FT_GZ
+ int FT_VCF
+ int FT_VCF_GZ
+ int FT_BCF
+ int FT_BCF_GZ
+ int FT_STDIN
+
+ int hts_file_type(const char *fname)
+
+ inline int hts_reg2bin(int64_t beg, int64_t end, int min_shift, int n_lvls)
+ inline int hts_bin_bot(int bin, int n_lvls)
+
+ # * Endianness *
+ inline int ed_is_big()
+ inline uint16_t ed_swap_2(uint16_t v)
+ inline void *ed_swap_2p(void *x)
+ inline uint32_t ed_swap_4(uint32_t v)
+ inline void *ed_swap_4p(void *x)
+ inline uint64_t ed_swap_8(uint64_t v)
+ inline void *ed_swap_8p(void *x)
+
+
+cdef extern from "htslib/sam.h" nogil:
+ #**********************
+ #*** SAM/BAM header ***
+ #**********************
+
+ # @abstract Structure for the alignment header.
+ # @field n_targets number of reference sequences
+ # @field l_text length of the plain text in the header
+ # @field target_len lengths of the reference sequences
+ # @field target_name names of the reference sequences
+ # @field text plain text
+ # @field sdict header dictionary
+
+ ctypedef struct bam_hdr_t:
+ int32_t n_targets, ignore_sam_err
+ uint32_t l_text
+ uint32_t *target_len
+ uint8_t *cigar_tab
+ char **target_name
+ char *text
+ void *sdict
+
+ #****************************
+ #*** CIGAR related macros ***
+ #****************************
+
+ int BAM_CMATCH
+ int BAM_CINS
+ int BAM_CDEL
+ int BAM_CREF_SKIP
+ int BAM_CSOFT_CLIP
+ int BAM_CHARD_CLIP
+ int BAM_CPAD
+ int BAM_CEQUAL
+ int BAM_CDIFF
+ int BAM_CBACK
+
+ char *BAM_CIGAR_STR
+ int BAM_CIGAR_SHIFT
+ uint32_t BAM_CIGAR_MASK
+ uint32_t BAM_CIGAR_TYPE
+
+ char bam_cigar_op(uint32_t c)
+ uint32_t bam_cigar_oplen(uint32_t c)
+ char bam_cigar_opchr(uint32_t)
+ uint32_t bam_cigar_gen(char, uint32_t)
+ int bam_cigar_type(char o)
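+
+    # Usage sketch (illustrative only; b is a bam1_t* obtained elsewhere and
+    # bam_get_cigar() is declared further below):
+    #   uint32_t *cigar = bam_get_cigar(b);
+    #   for (int i = 0; i < b->core.n_cigar; i++)
+    #       printf("%u%c", bam_cigar_oplen(cigar[i]), bam_cigar_opchr(cigar[i]));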
+
+ # @abstract the read is paired in sequencing, no matter whether it is mapped in a pair
+ int BAM_FPAIRED
+ # @abstract the read is mapped in a proper pair
+ int BAM_FPROPER_PAIR
+    # @abstract the read itself is unmapped; conflicts with BAM_FPROPER_PAIR
+ int BAM_FUNMAP
+ # @abstract the mate is unmapped
+ int BAM_FMUNMAP
+ # @abstract the read is mapped to the reverse strand
+ int BAM_FREVERSE
+ # @abstract the mate is mapped to the reverse strand
+ int BAM_FMREVERSE
+ # @abstract this is read1
+ int BAM_FREAD1
+ # @abstract this is read2
+ int BAM_FREAD2
+ # @abstract not primary alignment
+ int BAM_FSECONDARY
+ # @abstract QC failure
+ int BAM_FQCFAIL
+ # @abstract optical or PCR duplicate
+ int BAM_FDUP
+ # @abstract supplementary alignment
+ int BAM_FSUPPLEMENTARY
+
+ #*************************
+ #*** Alignment records ***
+ #*************************
+
+ # @abstract Structure for core alignment information.
+ # @field tid chromosome ID, defined by bam_hdr_t
+ # @field pos 0-based leftmost coordinate
+ # @field bin bin calculated by bam_reg2bin()
+ # @field qual mapping quality
+ # @field l_qname length of the query name
+ # @field flag bitwise flag
+ # @field n_cigar number of CIGAR operations
+ # @field l_qseq length of the query sequence (read)
+ # @field mtid chromosome ID of next read in template, defined by bam_hdr_t
+ # @field mpos 0-based leftmost coordinate of next read in template
+
+ ctypedef struct bam1_core_t:
+ int32_t tid
+ int32_t pos
+ uint16_t bin
+ uint8_t qual
+ uint8_t l_qname
+ uint16_t flag
+ uint16_t n_cigar
+ int32_t l_qseq
+ int32_t mtid
+ int32_t mpos
+ int32_t isize
+
+ # @abstract Structure for one alignment.
+ # @field core core information about the alignment
+ # @field l_data current length of bam1_t::data
+ # @field m_data maximum length of bam1_t::data
+ # @field data all variable-length data, concatenated; structure: qname-cigar-seq-qual-aux
+ #
+ # @discussion Notes:
+ #
+    # 1. qname is NUL-terminated and core.l_qname includes the trailing '\0'.
+ # 2. l_qseq is calculated from the total length of an alignment block
+ # on reading or from CIGAR.
+ # 3. cigar data is encoded 4 bytes per CIGAR operation.
+ # 4. seq is nybble-encoded according to seq_nt16_table.
+ ctypedef struct bam1_t:
+ bam1_core_t core
+ int l_data, m_data
+ uint8_t *data
+ uint64_t id
+
+ # @abstract Get whether the query is on the reverse strand
+ # @param b pointer to an alignment
+ # @return boolean true if query is on the reverse strand
+ int bam_is_rev(bam1_t *b)
+
+ # @abstract Get whether the query's mate is on the reverse strand
+ # @param b pointer to an alignment
+ # @return boolean true if query's mate on the reverse strand
+ int bam_is_mrev(bam1_t *b)
+
+ # @abstract Get the name of the query
+ # @param b pointer to an alignment
+ # @return pointer to the name string, null terminated
+ char *bam_get_qname(bam1_t *b)
+
+ # @abstract Get the CIGAR array
+ # @param b pointer to an alignment
+ # @return pointer to the CIGAR array
+ #
+    # @discussion In the CIGAR array, each element is a 32-bit integer. The
+    # lower 4 bits encode the CIGAR operation and the higher 28 bits hold the
+    # length of that operation.
+ uint32_t *bam_get_cigar(bam1_t *b)
+
+ # @abstract Get query sequence
+ # @param b pointer to an alignment
+ # @return pointer to sequence
+ #
+ # @discussion Each base is encoded in 4 bits: 1 for A, 2 for C, 4 for G,
+    # @discussion Each base is encoded in 4 bits: 1 for A, 2 for C, 4 for G,
+    # 8 for T and 15 for N. Two bases are packed into one byte, with the base
+    # in the higher 4 bits having the smaller coordinate on the read. It is
+    # recommended to use the bam_seqi() macro to extract each base.
+ char *bam_get_seq(bam1_t *b)
+
+ # @abstract Get query quality
+ # @param b pointer to an alignment
+ # @return pointer to quality string
+ uint8_t *bam_get_qual(bam1_t *b)
+
+ # @abstract Get auxiliary data
+ # @param b pointer to an alignment
+ # @return pointer to the concatenated auxiliary data
+ uint8_t *bam_get_aux(bam1_t *b)
+
+ # @abstract Get length of auxiliary data
+ # @param b pointer to an alignment
+ # @return length of the concatenated auxiliary data
+ int bam_get_l_aux(bam1_t *b)
+
+ # @abstract Get a base on read
+ # @param s Query sequence returned by bam1_seq()
+ # @param i The i-th position, 0-based
+ # @return 4-bit integer representing the base.
+ char bam_seqi(char *s, int i)
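+
+    # Usage sketch (illustrative only): print the read sequence as ASCII by
+    # combining bam_seqi() with the seq_nt16_str table from hts.h.
+    #   char *seq = bam_get_seq(b);
+    #   for (int i = 0; i < b->core.l_qseq; i++)
+    #       putchar(seq_nt16_str[bam_seqi(seq, i)]);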
+
+ #**************************
+ #*** Exported functions ***
+ #**************************
+
+ #***************
+ #*** BAM I/O ***
+ #***************
+
+ bam_hdr_t *bam_hdr_init()
+ bam_hdr_t *bam_hdr_read(BGZF *fp)
+ int bam_hdr_write(BGZF *fp, const bam_hdr_t *h)
+ void bam_hdr_destroy(bam_hdr_t *h)
+ int bam_name2id(bam_hdr_t *h, const char *ref)
+ bam_hdr_t* bam_hdr_dup(const bam_hdr_t *h0)
+
+ bam1_t *bam_init1()
+ void bam_destroy1(bam1_t *b)
+ int bam_read1(BGZF *fp, bam1_t *b)
+ int bam_write1(BGZF *fp, const bam1_t *b)
+ bam1_t *bam_copy1(bam1_t *bdst, const bam1_t *bsrc)
+ bam1_t *bam_dup1(const bam1_t *bsrc)
+
+ int bam_cigar2qlen(int n_cigar, const uint32_t *cigar)
+ int bam_cigar2rlen(int n_cigar, const uint32_t *cigar)
+
+ # @abstract Calculate the rightmost base position of an alignment on the
+ # reference genome.
+
+ # @param b pointer to an alignment
+ # @return the coordinate of the first base after the alignment, 0-based
+
+ # @discussion For a mapped read, this is just b->core.pos + bam_cigar2rlen.
+ # For an unmapped read (either according to its flags or if it has no cigar
+ # string), we return b->core.pos + 1 by convention.
+ int32_t bam_endpos(const bam1_t *b)
+
+ int bam_str2flag(const char *str) # returns negative value on error
+ char *bam_flag2str(int flag) # The string must be freed by the user
+
+ #*************************
+ #*** BAM/CRAM indexing ***
+ #*************************
+
+ # These BAM iterator functions work only on BAM files. To work with either
+ # BAM or CRAM files use the sam_index_load() & sam_itr_*() functions.
+ void bam_itr_destroy(hts_itr_t *iter)
+ hts_itr_t *bam_itr_queryi(const hts_idx_t *idx, int tid, int beg, int end)
+ hts_itr_t *bam_itr_querys(const hts_idx_t *idx, bam_hdr_t *hdr, const char *region)
+ int bam_itr_next(htsFile *htsfp, hts_itr_t *itr, void *r)
+
+ # Load/build .csi or .bai BAM index file. Does not work with CRAM.
+ # It is recommended to use the sam_index_* functions below instead.
+ hts_idx_t *bam_index_load(const char *fn)
+ int bam_index_build(const char *fn, int min_shift)
+
+ # Load a BAM (.csi or .bai) or CRAM (.crai) index file
+ # @param fp File handle of the data file whose index is being opened
+ # @param fn BAM/CRAM/etc filename to search alongside for the index file
+ # @return The index, or NULL if an error occurred.
+ hts_idx_t *sam_index_load(htsFile *fp, const char *fn)
+
+ # Load a specific BAM (.csi or .bai) or CRAM (.crai) index file
+ # @param fp File handle of the data file whose index is being opened
+ # @param fn BAM/CRAM/etc data file filename
+ # @param fnidx Index filename, or NULL to search alongside @a fn
+ # @return The index, or NULL if an error occurred.
+ hts_idx_t *sam_index_load2(htsFile *fp, const char *fn, const char *fnidx)
+
+ # Generate and save an index file
+ # @param fn Input BAM/etc filename, to which .csi/etc will be added
+ # @param min_shift Positive to generate CSI, or 0 to generate BAI
+ # @return 0 if successful, or negative if an error occurred (usually -1; or
+ # -2: opening fn failed; -3: format not indexable)
+ int sam_index_build(const char *fn, int min_shift)
+
+ # Generate and save an index to a specific file
+ # @param fn Input BAM/CRAM/etc filename
+ # @param fnidx Output filename, or NULL to add .bai/.csi/etc to @a fn
+ # @param min_shift Positive to generate CSI, or 0 to generate BAI
+ # @return 0 if successful, or negative if an error occurred.
+ int sam_index_build2(const char *fn, const char *fnidx, int min_shift)
+
+ void sam_itr_destroy(hts_itr_t *iter)
+ hts_itr_t *sam_itr_queryi(const hts_idx_t *idx, int tid, int beg, int end)
+ hts_itr_t *sam_itr_querys(const hts_idx_t *idx, bam_hdr_t *hdr, const char *region)
+ int sam_itr_next(htsFile *htsfp, hts_itr_t *itr, void *r)
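+
+    # Usage sketch (illustrative only; "in.bam" and the region are
+    # placeholders; the SAM I/O functions are declared in the next section):
+    #   htsFile *fp = sam_open("in.bam", "r");
+    #   bam_hdr_t *hdr = sam_hdr_read(fp);
+    #   hts_idx_t *idx = sam_index_load(fp, "in.bam");
+    #   hts_itr_t *itr = sam_itr_querys(idx, hdr, "chr1:1000-2000");
+    #   bam1_t *b = bam_init1();
+    #   while (sam_itr_next(fp, itr, b) >= 0) { /* process b */ }
+    #   bam_destroy1(b); sam_itr_destroy(itr);
+    #   hts_idx_destroy(idx); bam_hdr_destroy(hdr); sam_close(fp);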
+
+ #***************
+ #*** SAM I/O ***
+ #***************
+
+ htsFile *sam_open(const char *fn, const char *mode)
+ htsFile *sam_open_format(const char *fn, const char *mode, const htsFormat *fmt)
+ int sam_close(htsFile *fp)
+
+ int sam_open_mode(char *mode, const char *fn, const char *format)
+
+ # A version of sam_open_mode that can handle ,key=value options.
+ # The format string is allocated and returned, to be freed by the caller.
+    # Prefix should be "r" or "w".
+ char *sam_open_mode_opts(const char *fn, const char *mode, const char *format)
+
+ bam_hdr_t *sam_hdr_parse(int l_text, const char *text)
+ bam_hdr_t *sam_hdr_read(htsFile *fp)
+ int sam_hdr_write(htsFile *fp, const bam_hdr_t *h)
+
+ int sam_parse1(kstring_t *s, bam_hdr_t *h, bam1_t *b)
+ int sam_format1(const bam_hdr_t *h, const bam1_t *b, kstring_t *str)
+ int sam_read1(htsFile *fp, bam_hdr_t *h, bam1_t *b)
+ int sam_write1(htsFile *fp, const bam_hdr_t *h, const bam1_t *b)
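+
+    # Usage sketch (illustrative only): copy records from a BAM file to SAM
+    # on stdout ("in.bam" is a placeholder):
+    #   htsFile *in = sam_open("in.bam", "r"), *out = sam_open("-", "w");
+    #   bam_hdr_t *hdr = sam_hdr_read(in);
+    #   sam_hdr_write(out, hdr);
+    #   bam1_t *b = bam_init1();
+    #   while (sam_read1(in, hdr, b) >= 0)
+    #       if (sam_write1(out, hdr, b) < 0) break;
+    #   bam_destroy1(b); bam_hdr_destroy(hdr);
+    #   sam_close(out); sam_close(in);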
+
+ #*************************************
+ #*** Manipulating auxiliary fields ***
+ #*************************************
+
+ uint8_t *bam_aux_get(const bam1_t *b, const char *tag)
+ int32_t bam_aux2i(const uint8_t *s)
+ double bam_aux2f(const uint8_t *s)
+ char bam_aux2A(const uint8_t *s)
+ char *bam_aux2Z(const uint8_t *s)
+
+ void bam_aux_append(bam1_t *b, const char *tag, char type, int len, uint8_t *data)
+ int bam_aux_del(bam1_t *b, uint8_t *s)
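+
+    # Usage sketch (illustrative only; "NM" is a placeholder tag):
+    #   uint8_t *nm = bam_aux_get(b, "NM");
+    #   if (nm != NULL) { int32_t edit_distance = bam_aux2i(nm); }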
+
+ #**************************
+ #*** Pileup and Mpileup ***
+ #**************************
+
+ # @abstract Structure for one alignment covering the pileup position.
+ # @field b pointer to the alignment
+ # @field qpos position of the read base at the pileup site, 0-based
+ # @field indel indel length; 0 for no indel, positive for ins and negative for del
+ # @field level the level of the read in the "viewer" mode
+ # @field is_del 1 iff the base on the padded read is a deletion
+ # @field is_head ???
+ # @field is_tail ???
+ # @field is_refskip ???
+ # @field aux ???
+ #
+ # @discussion See also bam_plbuf_push() and bam_lplbuf_push(). The
+ # difference between the two functions is that the former does not
+    # set bam_pileup1_t::level, while the latter does. Level helps the
+ # implementation of alignment viewers, but calculating this has some
+ # overhead.
+ #
+    # is_del, is_head, etc. are bit fields; declaring them as below should
+    # work as expected, see
+ # https://groups.google.com/forum/#!msg/cython-users/24tD1kwRY7A/pmoPuSmanM0J
+
+ ctypedef struct bam_pileup1_t:
+ bam1_t *b
+ int32_t qpos
+ int indel, level
+ uint32_t is_del
+ uint32_t is_head
+ uint32_t is_tail
+ uint32_t is_refskip
+ uint32_t aux
+
+ ctypedef int (*bam_plp_auto_f)(void *data, bam1_t *b)
+ ctypedef int (*bam_test_f)()
+
+ ctypedef struct __bam_plp_t
+ ctypedef __bam_plp_t *bam_plp_t
+
+ ctypedef struct __bam_mplp_t
+ ctypedef __bam_mplp_t *bam_mplp_t
+
+    # bam_plp_init() - sets up a pileup iterator over multiple alignments
+ # @func: see mplp_func in bam_plcmd.c in samtools for an example. Expected return
+ # status: 0 on success, -1 on end, < -1 on non-recoverable errors
+ # @data: user data to pass to @func
+ bam_plp_t bam_plp_init(bam_plp_auto_f func, void *data)
+ void bam_plp_destroy(bam_plp_t iter)
+ int bam_plp_push(bam_plp_t iter, const bam1_t *b)
+ const bam_pileup1_t *bam_plp_next(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp)
+ const bam_pileup1_t *bam_plp_auto(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp)
+ void bam_plp_set_maxcnt(bam_plp_t iter, int maxcnt)
+ void bam_plp_reset(bam_plp_t iter)
+
+ bam_mplp_t bam_mplp_init(int n, bam_plp_auto_f func, void **data)
+
+ # bam_mplp_init_overlaps() - if called, mpileup will detect overlapping
+ # read pairs and for each base pair set the base quality of the
+ # lower-quality base to zero, thus effectively discarding it from
+ # calling. If the two bases are identical, the quality of the other base
+ # is increased to the sum of their qualities (capped at 200), otherwise
+ # it is multiplied by 0.8.
+ void bam_mplp_init_overlaps(bam_mplp_t iter)
+ void bam_mplp_destroy(bam_mplp_t iter)
+ void bam_mplp_set_maxcnt(bam_mplp_t iter, int maxcnt)
+ int bam_mplp_auto(bam_mplp_t iter, int *_tid, int *_pos, int *n_plp, const bam_pileup1_t **plp)
+
+ # Added by AH
+ # ctypedef bam_pileup1_t * const_bam_pileup1_t_ptr "const bam_pileup1_t *"
+
+
+cdef extern from "htslib/faidx.h" nogil:
+
+ ctypedef struct faidx_t:
+ pass
+
+ int fai_build(char *fn)
+
+ void fai_destroy(faidx_t *fai)
+
+ faidx_t *fai_load(char *fn)
+
+ char *fai_fetch(faidx_t *fai,
+ char *reg,
+ int *len)
+
+ int faidx_nseq(faidx_t *fai)
+
+ int faidx_has_seq(faidx_t *fai, const char *seq)
+
+ char *faidx_fetch_seq(faidx_t *fai,
+ char *c_name,
+ int p_beg_i,
+ int p_end_i,
+ int *len)
+
+ int faidx_seq_len(faidx_t *fai, const char *seq)
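+
+    # Usage sketch (illustrative only; "ref.fa" and "chr1" are placeholders;
+    # fai_load() opens an existing .fai index, building one if absent):
+    #   faidx_t *fai = fai_load("ref.fa");
+    #   int len;
+    #   char *seq = faidx_fetch_seq(fai, "chr1", 0, 99, &len); /* first 100 bp */
+    #   free(seq); fai_destroy(fai);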
+
+
+# tabix support
+cdef extern from "htslib/tbx.h" nogil:
+
+ # tbx.h definitions
+ int8_t TBX_MAX_SHIFT
+ int8_t TBX_GENERIC
+ int8_t TBX_SAM
+ int8_t TBX_VCF
+ int8_t TBX_UCSC
+
+ ctypedef struct tbx_conf_t:
+ int32_t preset
+ int32_t sc, bc, ec # seq col., beg col. and end col.
+ int32_t meta_char, line_skip
+
+ ctypedef struct tbx_t:
+ tbx_conf_t conf
+ hts_idx_t *idx
+ void * dict
+
+ tbx_conf_t tbx_conf_gff
+ tbx_conf_t tbx_conf_bed
+ tbx_conf_t tbx_conf_psltbl
+ tbx_conf_t tbx_conf_sam
+ tbx_conf_t tbx_conf_vcf
+
+ void tbx_itr_destroy(hts_itr_t * iter)
+    hts_itr_t * tbx_itr_queryi(tbx_t * t, int tid, int beg, int end)
+ hts_itr_t * tbx_itr_querys(tbx_t * t, char * s)
+ int tbx_itr_next(htsFile * fp, tbx_t * t, hts_itr_t * iter, void * data)
+
+ int tbx_name2id(tbx_t *tbx, char *ss)
+
+ int tbx_index_build(char *fn, int min_shift, tbx_conf_t *conf)
+ int tbx_index_build2(const char *fn, const char *fnidx, int min_shift, const tbx_conf_t *conf)
+
+ tbx_t * tbx_index_load(char *fn)
+ tbx_t *tbx_index_load2(const char *fn, const char *fnidx)
+
+ # free the array but not the values
+ char **tbx_seqnames(tbx_t *tbx, int *n)
+
+ void tbx_destroy(tbx_t *tbx)
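+
+    # Usage sketch (illustrative only; filenames and the region are
+    # placeholders):
+    #   htsFile *fp = hts_open("data.bed.gz", "r");
+    #   tbx_t *tbx = tbx_index_load("data.bed.gz");
+    #   hts_itr_t *itr = tbx_itr_querys(tbx, "chr1:1-100000");
+    #   kstring_t str = {0, 0, NULL};
+    #   while (tbx_itr_next(fp, tbx, itr, &str) >= 0) { /* process str.s */ }
+    #   free(str.s); tbx_itr_destroy(itr); tbx_destroy(tbx); hts_close(fp);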
+
+
+# VCF/BCF API
+cdef extern from "htslib/vcf.h" nogil:
+
+ # Header struct
+
+ uint8_t BCF_HL_FLT # header line
+ uint8_t BCF_HL_INFO
+ uint8_t BCF_HL_FMT
+ uint8_t BCF_HL_CTG
+ uint8_t BCF_HL_STR # structured header line TAG=<A=..,B=..>
+ uint8_t BCF_HL_GEN # generic header line
+
+ uint8_t BCF_HT_FLAG # header type
+ uint8_t BCF_HT_INT
+ uint8_t BCF_HT_REAL
+ uint8_t BCF_HT_STR
+
+ uint8_t BCF_VL_FIXED # variable length
+ uint8_t BCF_VL_VAR
+ uint8_t BCF_VL_A
+ uint8_t BCF_VL_G
+ uint8_t BCF_VL_R
+
+ # === Dictionary ===
+ #
+    # The header keeps three dictionaries. The first keeps IDs in the
+ # "FILTER/INFO/FORMAT" lines, the second keeps the sequence names and lengths
+ # in the "contig" lines and the last keeps the sample names. bcf_hdr_t::dict[]
+ # is the actual hash table, which is opaque to the end users. In the hash
+ # table, the key is the ID or sample name as a C string and the value is a
+ # bcf_idinfo_t struct. bcf_hdr_t::id[] points to key-value pairs in the hash
+ # table in the order that they appear in the VCF header. bcf_hdr_t::n[] is the
+ # size of the hash table or, equivalently, the length of the id[] arrays.
+
+ uint8_t BCF_DT_ID # dictionary type
+ uint8_t BCF_DT_CTG
+ uint8_t BCF_DT_SAMPLE
+
+ # Complete textual representation of a header line
+ ctypedef struct bcf_hrec_t:
+        int type            # One of the BCF_HL_* types
+ char *key # The part before '=', i.e. FILTER/INFO/FORMAT/contig/fileformat etc.
+ char *value # Set only for generic lines, NULL for FILTER/INFO, etc.
+ int nkeys # Number of structured fields
+ char **keys # The key=value pairs
+ char **vals
+
+ ctypedef struct bcf_idinfo_t:
+ uint32_t info[3] # stores Number:20, var:4, Type:4, ColType:4 in info[0..2]
+ bcf_hrec_t *hrec[3] # for BCF_HL_FLT,INFO,FMT and contig length in info[0] for BCF_HL_CTG
+ int id
+
+ ctypedef struct bcf_idpair_t:
+ const char *key
+ const bcf_idinfo_t *val
+
+ ctypedef struct bcf_hdr_t:
+        int32_t n[3]        # n: the size of the dictionary block in use (allocated size, m, is below to preserve ABI)
+ bcf_idpair_t *id[3]
+ void *dict[3] # ID dictionary, contig dict and sample dict
+ char **samples
+ bcf_hrec_t **hrec
+ int nhrec, dirty
+ int ntransl
+ int *transl[2] # for bcf_translate()
+ int nsamples_ori # for bcf_hdr_set_samples()
+ uint8_t *keep_samples
+ kstring_t mem
+ int32_t m[3] # m: allocated size of the dictionary block in use (see n above)
+
+ uint8_t bcf_type_shift[]
+
+ # * VCF record *
+
+ uint8_t BCF_BT_NULL
+ uint8_t BCF_BT_INT8
+ uint8_t BCF_BT_INT16
+ uint8_t BCF_BT_INT32
+ uint8_t BCF_BT_FLOAT
+ uint8_t BCF_BT_CHAR
+
+ uint8_t VCF_REF
+ uint8_t VCF_SNP
+ uint8_t VCF_MNP
+ uint8_t VCF_INDEL
+ uint8_t VCF_OTHER
+
+ ctypedef struct variant_t:
+ int type, n # variant type and the number of bases affected, negative for deletions
+
+ ctypedef struct bcf_fmt_t:
+ int id # id: numeric tag id, the corresponding string is bcf_hdr_t::id[BCF_DT_ID][$id].key
+ int n, size, type # n: number of values per-sample; size: number of bytes per-sample; type: one of BCF_BT_* types
+ uint8_t *p # same as vptr and vptr_* in bcf_info_t below
+ uint32_t p_len
+ uint32_t p_off
+ uint8_t p_free
+
+ union bcf_info_union_t:
+ int32_t i # integer value
+ float f # float value
+
+ ctypedef struct bcf_info_t:
+ int key # key: numeric tag id, the corresponding string is bcf_hdr_t::id[BCF_DT_ID][$key].key
+ int type, len # type: one of BCF_BT_* types; len: vector length, 1 for scalars
+
+ # v1 union only set if $len==1; for easier access
+ bcf_info_union_t v1
+ uint8_t *vptr # pointer to data array in bcf1_t->shared.s, excluding the size+type and tag id bytes
+ uint32_t vptr_len # length of the vptr block or, when set, of the vptr_mod block, excluding offset
+ uint32_t vptr_off # vptr offset, i.e., the size of the INFO key plus size+type bytes
+ uint8_t vptr_free # indicates that vptr-vptr_off must be freed; set only when modified and the new
+ # data block is bigger than the original
+
+ uint8_t BCF1_DIRTY_ID
+ uint8_t BCF1_DIRTY_ALS
+ uint8_t BCF1_DIRTY_FLT
+ uint8_t BCF1_DIRTY_INF
+
+ ctypedef struct bcf_dec_t:
+ int m_fmt, m_info, m_id, m_als, m_allele, m_flt # allocated size (high-water mark); do not change
+ int n_flt # Number of FILTER fields
+ int *flt # FILTER keys in the dictionary
+ char *id # ID
+        char *als           # REF+ALT block (\0-separated)
+ char **allele # allele[0] is the REF (allele[] pointers to the als block); all null terminated
+ bcf_info_t *info # INFO
+ bcf_fmt_t *fmt # FORMAT and individual sample
+ variant_t *var # $var and $var_type set only when set_variant_types called
+ int n_var, var_type
+ int shared_dirty # if set, shared.s must be recreated on BCF output
+ int indiv_dirty # if set, indiv.s must be recreated on BCF output
+
+ uint8_t BCF_ERR_CTG_UNDEF
+ uint8_t BCF_ERR_TAG_UNDEF
+ uint8_t BCF_ERR_NCOLS
+ uint8_t BCF_ERR_LIMITS
+ uint8_t BCF_ERR_CHAR
+ uint8_t BCF_ERR_CTG_INVALID
+ uint8_t BCF_ERR_TAG_INVALID
+
+    # The bcf1_t structure corresponds to one VCF/BCF line. Reading from a VCF
+    # file is slower because the string must first be parsed and packed into a
+    # BCF line (done in vcf_parse), then unpacked into the internal bcf1_t
+    # structure. If it
+ # is known in advance that some of the fields will not be required (notably
+ # the sample columns), parsing of these can be skipped by setting max_unpack
+ # appropriately.
+ # Similarly, it is fast to output a BCF line because the columns (kept in
+ # shared.s, indiv.s, etc.) are written directly by bcf_write, whereas a VCF
+ # line must be formatted in vcf_format.
+
+ ctypedef struct bcf1_t:
+ int32_t rid # CHROM
+ int32_t pos # POS
+ int32_t rlen # length of REF
+ float qual # QUAL
+ uint32_t n_info, n_allele
+ uint32_t n_fmt, n_sample
+ kstring_t shared, indiv
+ bcf_dec_t d # lazy evaluation: $d is not generated by bcf_read(), but by explicitly calling bcf_unpack()
+ int max_unpack # Set to BCF_UN_STR, BCF_UN_FLT, or BCF_UN_INFO to boost performance of vcf_parse when some of the fields won't be needed
+ int unpacked # remember what has been unpacked to allow calling bcf_unpack() repeatedly without redoing the work
+ int unpack_size[3] # the original block size of ID, REF+ALT and FILTER
+ int errcode # one of BCF_ERR_* codes
+
+ ####### API #######
+
+ # BCF and VCF I/O
+ #
+ # A note about naming conventions: htslib internally represents VCF
+ # records as bcf1_t data structures, therefore most functions are
+ # prefixed with bcf_. There are a few exceptions where the functions must
+ # be aware of both BCF and VCF worlds, such as bcf_parse vs vcf_parse. In
+ # these cases, functions prefixed with bcf_ are more general and work
+ # with both BCF and VCF.
+
+ # bcf_hdr_init() - create an empty BCF header.
+ # @param mode "r" or "w"
+ #
+ # When opened for writing, the mandatory fileFormat and
+ # FILTER=PASS lines are added automatically.
+ bcf_hdr_t *bcf_hdr_init(const char *mode)
+
+ # Destroy a BCF header struct
+ void bcf_hdr_destroy(bcf_hdr_t *h)
+
+ # Initialize a bcf1_t object; equivalent to calloc(1, sizeof(bcf1_t))
+ bcf1_t *bcf_init()
+
+ # Deallocate a bcf1_t object
+ void bcf_destroy(bcf1_t *v)
+
+ # Same as bcf_destroy() but frees only the memory allocated by bcf1_t,
+ # not the bcf1_t object itself.
+ void bcf_empty(bcf1_t *v)
+
+ # Make the bcf1_t object ready for next read. Intended mostly for
+ # internal use, the user should rarely need to call this function
+ # directly.
+ void bcf_clear(bcf1_t *v)
+
+ # Reads VCF or BCF header
+ bcf_hdr_t *bcf_hdr_read(htsFile *fp)
+
+ # bcf_hdr_set_samples() - for more efficient VCF parsing when only one/few samples are needed
+ # @samples: samples to include or exclude from file or as a comma-separated string.
+ # LIST|FILE .. select samples in list/file
+ # ^LIST|FILE .. exclude samples from list/file
+ # - .. include all samples
+ # NULL .. exclude all samples
+ # @is_file: @samples is a file (1) or a comma-separated list (0)
+ #
+ # The bottleneck of VCF reading is parsing of genotype fields. If the
+    # reader knows in advance that only a subset of samples is needed (possibly
+ # no samples at all), the performance of bcf_read() can be significantly
+ # improved by calling bcf_hdr_set_samples after bcf_hdr_read().
+ # The function bcf_read() will subset the VCF/BCF records automatically
+ # with the notable exception when reading records via bcf_itr_next().
+ # In this case, bcf_subset_format() must be called explicitly, because
+ # bcf_readrec() does not see the header.
+ #
+ # Returns 0 on success, -1 on error or a positive integer if the list
+ # contains samples not present in the VCF header. In such a case, the
+ # return value is the index of the offending sample.
+ #
+ int bcf_hdr_set_samples(bcf_hdr_t *hdr, const char *samples, int is_file)
+ int bcf_subset_format(const bcf_hdr_t *hdr, bcf1_t *rec)
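+
+    # Usage sketch (illustrative only; fp is an open htsFile* and the sample
+    # names are placeholders): restrict parsing to two samples.
+    #   bcf_hdr_t *hdr = bcf_hdr_read(fp);
+    #   if (bcf_hdr_set_samples(hdr, "NA00001,NA00002", 0) != 0)
+    #       { /* error, or the list names a sample missing from the header */ }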
+
+ # Writes VCF or BCF header
+ int bcf_hdr_write(htsFile *fp, bcf_hdr_t *h)
+
+ # Parse VCF line contained in kstring and populate the bcf1_t struct
+ int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v)
+
+ # The opposite of vcf_parse. It should rarely be called directly, see vcf_write
+ int vcf_format(const bcf_hdr_t *h, const bcf1_t *v, kstring_t *s)
+
+ # bcf_read() - read next VCF or BCF record
+ #
+ # Returns -1 on critical errors, 0 otherwise. On errors which are not
+ # critical for reading, such as missing header definitions, v->errcode is
+ # set to one of BCF_ERR* code and must be checked before calling
+ # vcf_write().
+ int bcf_read(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
+
+ # bcf_unpack() - unpack/decode a BCF record (fills the bcf1_t::d field)
+ #
+ # Note that bcf_unpack() must be called even when reading VCF. It is safe
+ # to call the function repeatedly, it will not unpack the same field
+ # twice.
+ uint8_t BCF_UN_STR # up to ALT inclusive
+ uint8_t BCF_UN_FLT # up to FILTER
+ uint8_t BCF_UN_INFO # up to INFO
+ uint8_t BCF_UN_SHR # all shared information
+ uint8_t BCF_UN_FMT # unpack format and each sample
+    uint8_t BCF_UN_IND  # a synonym of BCF_UN_FMT
+ uint8_t BCF_UN_ALL # everything
+
+ int bcf_unpack(bcf1_t *b, int which)
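+
+    # Usage sketch (illustrative only; fp and hdr are assumed to be open):
+    # read records, unpacking only the shared columns before inspecting
+    # the alleles.
+    #   bcf1_t *rec = bcf_init();
+    #   while (bcf_read(fp, hdr, rec) == 0) {
+    #       bcf_unpack(rec, BCF_UN_STR);
+    #       /* rec->d.allele[0] is REF, rec->d.allele[1..] are the ALTs */
+    #   }
+    #   bcf_destroy(rec);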
+
+ # bcf_dup() - create a copy of BCF record.
+ #
+ # Note that bcf_unpack() must be called on the returned copy as if it was
+ # obtained from bcf_read(). Also note that bcf_dup() calls bcf_sync1(src)
+ # internally to reflect any changes made by bcf_update_* functions.
+ bcf1_t *bcf_dup(bcf1_t *src)
+ bcf1_t *bcf_copy(bcf1_t *dst, bcf1_t *src)
+
+ # bcf_write() - write one VCF or BCF record. The type is determined at the open() call.
+ int bcf_write(htsFile *fp, bcf_hdr_t *h, bcf1_t *v)
+
+ # The following functions work only with VCFs and should rarely be called
+ # directly. Usually one wants to use their bcf_* alternatives, which work
+ # transparently with both VCFs and BCFs.
+ bcf_hdr_t *vcf_hdr_read(htsFile *fp)
+ int vcf_hdr_write(htsFile *fp, const bcf_hdr_t *h)
+ int vcf_read(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
+ int vcf_write(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
+
+ #************************************************************************
+ # Header querying and manipulation routines
+ #************************************************************************
+
+ # Create a new header using the supplied template
+ bcf_hdr_t *bcf_hdr_dup(const bcf_hdr_t *hdr)
+
+ # Copy header lines from src to dst if not already present in dst. See also bcf_translate().
+ # Returns 0 on success or sets a bit on error:
+ # 1 .. conflicting definitions of tag length
+    #     (other bits: todo)
+ int bcf_hdr_combine(bcf_hdr_t *dst, const bcf_hdr_t *src)
+
+ # bcf_hdr_merge() - copy header lines from src to dst, see also bcf_translate()
+ # @param dst: the destination header to be merged into, NULL on the first pass
+ # @param src: the source header
+ #
+ # Notes:
+ # - use as:
+ # bcf_hdr_t *dst = NULL;
+ # for (i=0; i<nsrc; i++) dst = bcf_hdr_merge(dst,src[i]);
+ #
+ # - bcf_hdr_merge() replaces bcf_hdr_combine() which had a problem when
+ # combining multiple BCF headers. The current bcf_hdr_combine()
+ # does not have this problem, but became slow when used for many files.
+ bcf_hdr_t *bcf_hdr_merge(bcf_hdr_t *dst, const bcf_hdr_t *src)
+
+ # bcf_hdr_add_sample() - add a new sample.
+ # @param sample: sample name to be added
+ int bcf_hdr_add_sample(bcf_hdr_t *hdr, const char *sample)
+
+ # Read VCF header from a file and update the header
+ int bcf_hdr_set(bcf_hdr_t *hdr, const char *fname)
+
+ # Returns formatted header (newly allocated string) and its length,
+ # excluding the terminating \0. If is_bcf parameter is unset, IDX
+ # fields are discarded.
+ char *bcf_hdr_fmt_text(const bcf_hdr_t *hdr, int is_bcf, int *len)
+
+ # Append new VCF header line, returns 0 on success
+ int bcf_hdr_append(bcf_hdr_t *h, const char *line)
+ int bcf_hdr_printf(bcf_hdr_t *h, const char *format, ...)
+
+ # VCF version, e.g. VCFv4.2
+ const char *bcf_hdr_get_version(const bcf_hdr_t *hdr)
+ void bcf_hdr_set_version(bcf_hdr_t *hdr, const char *version)
+
+ # bcf_hdr_remove() - remove VCF header tag
+ # @param type: one of BCF_HL_*
+ # @param key: tag name or NULL to remove all tags of the given type
+ void bcf_hdr_remove(bcf_hdr_t *h, int type, const char *key)
+
+ # bcf_hdr_subset() - creates a new copy of the header removing unwanted samples
+ # @param n: number of samples to keep
+ # @param samples: names of the samples to keep
+ # @param imap: mapping from index in @samples to the sample index in the original file
+ #
+ # Sample names not present in h0 are ignored. The number of unmatched samples can be checked
+ # by comparing n and bcf_hdr_nsamples(out_hdr).
+ # This function can be used to reorder samples.
+ # See also bcf_subset() which subsets individual records.
+ #
+ bcf_hdr_t *bcf_hdr_subset(const bcf_hdr_t *h0, int n, char *const* samples, int *imap)
+
+ # Creates a list of sequence names. It is up to the caller to free the list (but not the sequence names)
+ const char **bcf_hdr_seqnames(const bcf_hdr_t *h, int *nseqs)
+
+ # Get number of samples
+ int32_t bcf_hdr_nsamples(const bcf_hdr_t *h)
+
+ # The following functions are for internal use and should rarely be called directly
+ int bcf_hdr_parse(bcf_hdr_t *hdr, char *htxt)
+ int bcf_hdr_sync(bcf_hdr_t *h)
+ bcf_hrec_t *bcf_hdr_parse_line(const bcf_hdr_t *h, const char *line, int *len)
+ void bcf_hrec_format(const bcf_hrec_t *hrec, kstring_t *str)
+ int bcf_hdr_add_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec)
+
+ # bcf_hdr_get_hrec() - get header line info
+ # @param type: one of the BCF_HL_* types: FLT,INFO,FMT,CTG,STR,GEN
+ # @param key: the header key for generic lines (e.g. "fileformat"), any field
+ # for structured lines, typically "ID".
+    # @param value: the value which pairs with key. Can be NULL for BCF_HL_GEN
+ # @param str_class: the class of BCF_HL_STR line (e.g. "ALT" or "SAMPLE"), otherwise NULL
+ #
+ bcf_hrec_t *bcf_hdr_get_hrec(const bcf_hdr_t *hdr, int type, const char *key, const char *value, const char *str_class)
+ bcf_hrec_t *bcf_hrec_dup(bcf_hrec_t *hrec)
+ void bcf_hrec_add_key(bcf_hrec_t *hrec, const char *str, int len)
+ void bcf_hrec_set_val(bcf_hrec_t *hrec, int i, const char *str, int len, int is_quoted)
+ int bcf_hrec_find_key(bcf_hrec_t *hrec, const char *key)
+ void hrec_add_idx(bcf_hrec_t *hrec, int idx)
+ void bcf_hrec_destroy(bcf_hrec_t *hrec)
+
+ #************************************************************************
+ # Individual record querying and manipulation routines
+ #************************************************************************
+
+ # See the description of bcf_hdr_subset()
+ int bcf_subset(const bcf_hdr_t *h, bcf1_t *v, int n, int *imap)
+
+    # bcf_translate() - translate tag IDs to be consistent with a different header. This function
+    # is useful when lines from multiple VCFs need to be combined.
+ # @dst_hdr: the destination header, to be used in bcf_write(), see also bcf_hdr_combine()
+ # @src_hdr: the source header, used in bcf_read()
+ # @src_line: line obtained by bcf_read()
+ int bcf_translate(const bcf_hdr_t *dst_hdr, bcf_hdr_t *src_hdr, bcf1_t *src_line)
+
+ # bcf_get_variant_type[s]() - returns one of VCF_REF, VCF_SNP, etc
+ int bcf_get_variant_types(bcf1_t *rec)
+ int bcf_get_variant_type(bcf1_t *rec, int ith_allele)
+ int bcf_is_snp(bcf1_t *v)
+
+ # bcf_update_filter() - sets the FILTER column
+ # @flt_ids: The filter IDs to set, numeric IDs returned by bcf_hdr_id2int(hdr, BCF_DT_ID, "PASS")
+ # @n: Number of filters. If n==0, all filters are removed
+ int bcf_update_filter(const bcf_hdr_t *hdr, bcf1_t *line, int *flt_ids, int n)
+
+ # bcf_add_filter() - adds to the FILTER column
+ # @flt_id: The filter IDs to add, numeric IDs returned by bcf_hdr_id2int(hdr, BCF_DT_ID, "PASS")
+ #
+ # If flt_id is PASS, all existing filters are removed first. If other than PASS, existing PASS is removed.
+ int bcf_add_filter(const bcf_hdr_t *hdr, bcf1_t *line, int flt_id)
+
+ # bcf_remove_filter() - removes from the FILTER column
+ # @flt_id: filter ID to remove, numeric ID returned by bcf_hdr_id2int(hdr, BCF_DT_ID, "PASS")
+ # @pass: when set to 1 and no filters are present, set to PASS
+ int bcf_remove_filter(const bcf_hdr_t *hdr, bcf1_t *line, int flt_id, int set_pass)
+
+ # Returns 1 if present, 0 if absent, or -1 if filter does not exist. "PASS" and "." can be used interchangeably.
+ int bcf_has_filter(const bcf_hdr_t *hdr, bcf1_t *line, char *filter)
+
+ # bcf_update_alleles() and bcf_update_alleles_str() - update REF and ALT column
+ # @alleles: Array of alleles
+ # @nals: Number of alleles
+ # @alleles_string: Comma-separated alleles, starting with the REF allele
+ int bcf_update_alleles(const bcf_hdr_t *hdr, bcf1_t *line, const char **alleles, int nals)
+ int bcf_update_alleles_str(const bcf_hdr_t *hdr, bcf1_t *line, const char *alleles_string)
+
+ # bcf_update_id() - sets new ID string
+ # bcf_add_id() - adds to the ID string checking for duplicates
+ int bcf_update_id(const bcf_hdr_t *hdr, bcf1_t *line, const char *id)
+ int bcf_add_id(const bcf_hdr_t *hdr, bcf1_t *line, const char *id)
+
+ # bcf_update_info_*() - functions for updating INFO fields
+ # @hdr: the BCF header
+ # @line: VCF line to be edited
+ # @key: the INFO tag to be updated
+ # @values: pointer to the array of values. Pass NULL to remove the tag.
+ # @n: number of values in the array. When set to 0, the INFO tag is removed
+ #
+ # The @string in bcf_update_info_flag() is optional, @n indicates whether
+ # the flag is set or removed.
+ #
+ # Returns 0 on success or negative value on error.
+ #
+ int bcf_update_info_int32(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const int32_t *values, int n)
+ int bcf_update_info_float(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const float *values, int n)
+ int bcf_update_info_flag(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const char *values, int n)
+ int bcf_update_info_string(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const char *values, int n)
+ int bcf_update_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const void *values, int n, int type)
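+
+    # Usage sketch (illustrative only; "DP" is a placeholder tag that must be
+    # defined in the header):
+    #   int32_t dp = 42;
+    #   bcf_update_info_int32(hdr, line, "DP", &dp, 1);  /* set INFO/DP=42 */
+    #   bcf_update_info_int32(hdr, line, "DP", NULL, 0); /* remove INFO/DP */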
+
+ # bcf_update_format_*() - functions for updating FORMAT fields
+ # @values: pointer to the array of values, the same number of elements
+ # is expected for each sample. Missing values must be padded
+ # with bcf_*_missing or bcf_*_vector_end values.
+ # @n: number of values in the array. If n==0, existing tag is removed.
+ #
+ # The function bcf_update_format_string() is a higher-level (slower) variant of
+    # bcf_update_format_char(). The former accepts an array of \0-terminated strings
+ # whereas the latter requires that the strings are collapsed into a single array
+ # of fixed-length strings. In case of strings with variable length, shorter strings
+ # can be \0-padded. Note that the collapsed strings passed to bcf_update_format_char()
+ # are not \0-terminated.
+ #
+ # Returns 0 on success or negative value on error.
+ #
+ int bcf_update_format_int32(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const int32_t *values, int n)
+ int bcf_update_format_float(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const float *values, int n)
+ int bcf_update_format_char(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const char *values, int n)
+ int bcf_update_genotypes(const bcf_hdr_t *hdr, bcf1_t *line, const int32_t *values, int n)
+ int bcf_update_format_string(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const char **values, int n)
+ int bcf_update_format(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const void *values, int n, int type)
+
+ # Macros for setting genotypes correctly, for use with bcf_update_genotypes only; idx corresponds
+ # to VCF's GT (1-based index to ALT or 0 for the reference allele) and val is the opposite, obtained
+ # from bcf_get_genotypes() below.
+ uint32_t bcf_gt_phased(uint32_t idx)
+ uint32_t bcf_gt_unphased(uint32_t idx)
+ uint32_t bcf_gt_missing
+ uint32_t bcf_gt_is_missing(uint32_t val)
+ uint32_t bcf_gt_is_phased(uint32_t idx)
+ uint32_t bcf_gt_allele(uint32_t val)
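+
+    # Usage sketch (illustrative only): encode a phased diploid genotype 0|1
+    # for a single sample and store it with bcf_update_genotypes() above.
+    #   int32_t gts[2] = { bcf_gt_phased(0), bcf_gt_phased(1) };
+    #   bcf_update_genotypes(hdr, line, gts, 2);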
+
+ # Conversion between alleles indexes to Number=G genotype index (assuming diploid, all 0-based)
+ uint32_t bcf_alleles2gt(uint32_t a, uint32_t b)
+ void bcf_gt2alleles(int igt, int *a, int *b)
+
+ # bcf_get_fmt() - returns pointer to FORMAT's field data
+ # @header: for access to BCF_DT_ID dictionary
+ # @line: VCF line obtained from vcf_parse1
+ # @fmt: one of GT,PL,...
+ #
+ # Returns bcf_fmt_t* if the call succeeded, or returns NULL when the field
+ # is not available.
+ #
+ bcf_fmt_t *bcf_get_fmt(const bcf_hdr_t *hdr, bcf1_t *line, const char *key)
+ bcf_info_t *bcf_get_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key)
+
+ # bcf_get_*_id() - returns pointer to FORMAT/INFO field data given the header index instead of the string ID
+ # @line: VCF line obtained from vcf_parse1
+ # @id: The header index for the tag, obtained from bcf_hdr_id2int()
+ #
+ # Returns bcf_fmt_t* / bcf_info_t*. These functions do not check if the index is valid
+ # as their goal is to avoid the header lookup.
+ #
+ bcf_fmt_t *bcf_get_fmt_id(bcf1_t *line, const int id)
+ bcf_info_t *bcf_get_info_id(bcf1_t *line, const int id)
+
+ # bcf_get_info_*() - get INFO values, integers or floats
+ # @hdr: BCF header
+ # @line: BCF record
+ # @tag: INFO tag to retrieve
+    # @dst: *dst is a pointer to a memory location; it can point to NULL
+ # @ndst: pointer to the size of allocated memory
+ #
+ # Returns negative value on error or the number of written values on
+ # success. bcf_get_info_string() returns on success the number of
+ # characters written excluding the null-terminating byte. bcf_get_info_flag()
+ # returns 1 when flag is set or 0 if not.
+ #
+ # List of return codes:
+ # -1 .. no such INFO tag defined in the header
+ # -2 .. clash between types defined in the header and encountered in the VCF record
+ # -3 .. tag is not present in the VCF record
+ #
+ int bcf_get_info_int32(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, int32_t **dst, int *ndst)
+ int bcf_get_info_float(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, float **dst, int *ndst)
+ int bcf_get_info_string(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, char **dst, int *ndst)
+ int bcf_get_info_flag(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, int **dst, int *ndst)
+ int bcf_get_info_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, void **dst, int *ndst, int type)
+
+    # bcf_get_format_*() - same as bcf_get_info_*() above
+    #
+    # The function bcf_get_format_string() is a higher-level (slower) variant of bcf_get_format_char().
+    # See the description of bcf_update_format_string() and bcf_update_format_char() above.
+    # Unlike other bcf_get_format_*() functions, bcf_get_format_string() allocates two arrays:
+ # a single block of \0-terminated strings collapsed into a single array and an array of pointers
+ # to these strings. Both arrays must be cleaned by the user.
+ #
+ # Returns negative value on error or the number of written values on success.
+ #
+ # Example:
+ # int ndst = 0; char **dst = NULL
+ # if ( bcf_get_format_string(hdr, line, "XX", &dst, &ndst) > 0 )
+ # for (i=0; i<bcf_hdr_nsamples(hdr); i++) printf("%s\n", dst[i])
+ # free(dst[0]); free(dst)
+ #
+ # Example:
+    #     int ngt, *gt_arr = NULL, ngt_arr = 0
+    #     ngt = bcf_get_genotypes(hdr, line, &gt_arr, &ngt_arr)
+ #
+ int bcf_get_format_int32(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, int32_t **dst, int *ndst)
+ int bcf_get_format_float(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, float **dst, int *ndst)
+ int bcf_get_format_char(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, char **dst, int *ndst)
+ int bcf_get_genotypes(const bcf_hdr_t *hdr, bcf1_t *line, int **dst, int *ndst)
+ int bcf_get_format_string(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, char ***dst, int *ndst)
+ int bcf_get_format_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, void **dst, int *ndst, int type)
+
+ #************************************************************************
+ # Helper functions
+ #************************************************************************
+
+ #
+ # bcf_hdr_id2int() - Translates string into numeric ID
+ # bcf_hdr_int2id() - Translates numeric ID into string
+ # @type: one of BCF_DT_ID, BCF_DT_CTG, BCF_DT_SAMPLE
+ # @id: tag name, such as: PL, DP, GT, etc.
+ #
+    # Returns -1 if the string is not in the dictionary, otherwise the numeric ID which identifies
+ # fields in BCF records.
+ #
+ int bcf_hdr_id2int(const bcf_hdr_t *hdr, int type, const char *id)
+ const char *bcf_hdr_int2id(const bcf_hdr_t *hdr, int type, int int_id)
+
+ # bcf_hdr_name2id() - Translates sequence names (chromosomes) into numeric ID
+ # bcf_hdr_id2name() - Translates numeric ID to sequence name
+ #
+ int bcf_hdr_name2id(const bcf_hdr_t *hdr, const char *id)
+ const char *bcf_hdr_id2name(const bcf_hdr_t *hdr, int rid)
+ const char *bcf_seqname(const bcf_hdr_t *hdr, bcf1_t *rec)
+
+ #
+ # bcf_hdr_id2*() - Macros for accessing bcf_idinfo_t
+ # @type: one of BCF_HL_FLT, BCF_HL_INFO, BCF_HL_FMT
+ # @int_id: return value of bcf_hdr_id2int, must be >=0
+ #
+ # The returned values are:
+ # bcf_hdr_id2length .. whether the number of values is fixed or variable, one of BCF_VL_*
+ # bcf_hdr_id2number .. the number of values, 0xfffff for variable length fields
+ # bcf_hdr_id2type .. the field type, one of BCF_HT_*
+ # bcf_hdr_id2coltype .. the column type, one of BCF_HL_*
+ #
+ # Notes: Prior to using the macros, the presence of the info should be
+ # tested with bcf_hdr_idinfo_exists().
+ #
+ int bcf_hdr_id2length(const bcf_hdr_t *hdr, int type, int int_id)
+ int bcf_hdr_id2number(const bcf_hdr_t *hdr, int type, int int_id)
+ int bcf_hdr_id2type(const bcf_hdr_t *hdr, int type, int int_id)
+ int bcf_hdr_id2coltype(const bcf_hdr_t *hdr, int type, int int_id)
+ int bcf_hdr_idinfo_exists(const bcf_hdr_t *hdr, int type, int int_id)
+ bcf_hrec_t *bcf_hdr_id2hrec(const bcf_hdr_t *hdr, int type, int col_type, int int_id)
+
+ void bcf_fmt_array(kstring_t *s, int n, int type, void *data)
+ uint8_t *bcf_fmt_sized_array(kstring_t *s, uint8_t *ptr)
+
+ void bcf_enc_vchar(kstring_t *s, int l, const char *a)
+ void bcf_enc_vint(kstring_t *s, int n, int32_t *a, int wsize)
+ void bcf_enc_vfloat(kstring_t *s, int n, float *a)
+
+ #************************************************************************
+ # BCF index
+ #
+ # Note that these functions work with BCFs only. See synced_bcf_reader.h
+ # which provides (amongst other things) an API to work transparently with
+ # both indexed BCFs and VCFs.
+ #************************************************************************
+
+ hts_idx_t *bcf_index_load2(const char *fn, const char *fnidx)
+ int bcf_index_build(const char *fn, int min_shift)
+ int bcf_index_build2(const char *fn, const char *fnidx, int min_shift)
+
+    #************************************************************************
+    # Typed value I/O
+    #************************************************************************
+
+ # Note that in contrast with BCFv2.1 specification, HTSlib implementation
+ # allows missing values in vectors. For integer types, the values 0x80,
+ # 0x8000, 0x80000000 are interpreted as missing values and 0x81, 0x8001,
+ # 0x80000001 as end-of-vector indicators. Similarly for floats, the value of
+ # 0x7F800001 is interpreted as a missing value and 0x7F800002 as an
+ # end-of-vector indicator.
+ # Note that the end-of-vector byte is not part of the vector.
+
+    # This trial BCF version (v2.2) is compatible with the VCF specification
+    # and makes it possible to handle vectors of different ploidy correctly
+    # in the presence of missing values.
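+
+    # A short illustration (plain Python) of why the float sentinels above
+    # cannot collide with real data: 0x7F800001 and 0x7F800002 are NaN bit
+    # patterns, so no ordinary float value ever equals them bit-for-bit:
+    #
+    #     import math, struct
+    #     missing = struct.unpack("<f", struct.pack("<I", 0x7F800001))[0]
+    #     assert math.isnan(missing)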
+
+ int32_t bcf_int8_vector_end
+ int32_t bcf_int16_vector_end
+ int32_t bcf_int32_vector_end
+ int32_t bcf_str_vector_end
+ int32_t bcf_int8_missing
+ int32_t bcf_int16_missing
+ int32_t bcf_int32_missing
+ int32_t bcf_str_missing
+
+ uint32_t bcf_float_vector_end
+ uint32_t bcf_float_missing
+
+ void bcf_float_set(float *ptr, uint32_t value)
+ void bcf_float_set_vector_end(float *x)
+ void bcf_float_set_missing(float *x)
+
+ int bcf_float_is_missing(float f)
+ int bcf_float_is_vector_end(float f)
+ void bcf_format_gt(bcf_fmt_t *fmt, int isample, kstring_t *str)
+ void bcf_enc_size(kstring_t *s, int size, int type)
+ int bcf_enc_inttype(long x)
+ void bcf_enc_int1(kstring_t *s, int32_t x)
+ int32_t bcf_dec_int1(const uint8_t *p, int type, uint8_t **q)
+ int32_t bcf_dec_typed_int1(const uint8_t *p, uint8_t **q)
+ int32_t bcf_dec_size(const uint8_t *p, uint8_t **q, int *type)
+
+ # These trivial wrappers are defined only for consistency with other parts of htslib
+ bcf1_t *bcf_init1()
+ int bcf_read1(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
+ int vcf_read1(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
+ int bcf_write1(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
+ int vcf_write1(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
+ void bcf_destroy1(bcf1_t *v)
+ void bcf_empty1(bcf1_t *v)
+ int vcf_parse1(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v)
+ void bcf_clear1(bcf1_t *v)
+ int vcf_format1(const bcf_hdr_t *h, const bcf1_t *v, kstring_t *s)
+
+ # Other nice wrappers
+ void bcf_itr_destroy(hts_itr_t *iter)
+ hts_itr_t *bcf_itr_queryi(const hts_idx_t *idx, int tid, int beg, int end)
+ hts_itr_t *bcf_itr_querys(const hts_idx_t *idx, const bcf_hdr_t *hdr, char *s)
+ int bcf_itr_next(htsFile *fp, hts_itr_t *iter, void *r)
+ hts_idx_t *bcf_index_load(const char *fn)
+ const char **bcf_index_seqnames(const hts_idx_t *idx, const bcf_hdr_t *hdr, int *nptr)
+
+
+# VCF/BCF utility functions
+cdef extern from "htslib/vcfutils.h" nogil:
+ struct kbitset_t
+
+ # bcf_trim_alleles() - remove ALT alleles unused in genotype fields
+ # @header: for access to BCF_DT_ID dictionary
+    # @line: VCF line obtained from vcf_parse1
+ #
+ # Returns the number of removed alleles on success or negative
+ # on error:
+ # -1 .. some allele index is out of bounds
+ int bcf_trim_alleles(const bcf_hdr_t *header, bcf1_t *line)
+
+ # bcf_remove_alleles() - remove ALT alleles according to bitmask @mask
+ # @header: for access to BCF_DT_ID dictionary
+ # @line: VCF line obtained from vcf_parse1
+ # @mask: alleles to remove
+ #
+ # If you have more than 31 alleles, then the integer bit mask will
+ # overflow, so use bcf_remove_allele_set instead
+ void bcf_remove_alleles(const bcf_hdr_t *header, bcf1_t *line, int mask)
+
+ # bcf_remove_allele_set() - remove ALT alleles according to bitset @rm_set
+ # @header: for access to BCF_DT_ID dictionary
+ # @line: VCF line obtained from vcf_parse1
+ # @rm_set: pointer to kbitset_t object with bits set for allele
+ # indexes to remove
+ #
+ # Number=A,R,G INFO and FORMAT fields will be updated accordingly.
+ void bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, kbitset_t *rm_set)
+
+ # bcf_calc_ac() - calculate the number of REF and ALT alleles
+ # @header: for access to BCF_DT_ID dictionary
+ # @line: VCF line obtained from vcf_parse1
+ # @ac: array of length line->n_allele
+    # @which: determines whether INFO/AN,AC and the indv fields should be used
+ #
+ # Returns 1 if the call succeeded, or 0 if the value could not
+ # be determined.
+ #
+ # The value of @which determines if existing INFO/AC,AN can be
+    # used (BCF_UN_INFO) and if indv fields can be split
+ # (BCF_UN_FMT).
+ int bcf_calc_ac(const bcf_hdr_t *header, bcf1_t *line, int *ac, int which)
+
+ # bcf_gt_type() - determines type of the genotype
+ # @fmt_ptr: the GT format field as set for example by set_fmt_ptr
+ # @isample: sample index (starting from 0)
+ # @ial: index of the 1st non-reference allele (starting from 1)
+ # @jal: index of the 2nd non-reference allele (starting from 1)
+ #
+ # Returns the type of the genotype (one of GT_HOM_RR, GT_HET_RA,
+ # GT_HOM_AA, GT_HET_AA, GT_HAPL_R, GT_HAPL_A or GT_UNKN). If $ial
+ # is not NULL and the genotype has one or more non-reference
+ # alleles, $ial will be set. In case of GT_HET_AA, $ial is the
+ # position of the allele which appeared first in ALT. If $jal is
+ # not null and the genotype is GT_HET_AA, $jal will be set and is
+ # the position of the second allele in ALT.
+ uint8_t GT_HOM_RR # note: the actual value of GT_* matters, used in dosage r2 calculation
+ uint8_t GT_HOM_AA
+ uint8_t GT_HET_RA
+ uint8_t GT_HET_AA
+ uint8_t GT_HAPL_R
+ uint8_t GT_HAPL_A
+ uint8_t GT_UNKN
+ int bcf_gt_type(bcf_fmt_t *fmt_ptr, int isample, int *ial, int *jal)
+
+ int bcf_acgt2int(char c)
+ char bcf_int2acgt(int i)
+
+ # bcf_ij2G() - common task: allele indexes to Number=G index (diploid)
+ # @i,j: allele indexes, 0-based, i<=j
+ # Returns index to the Number=G diploid array
+ uint32_t bcf_ij2G(uint32_t i, uint32_t j)
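+
+    # A minimal sketch of the mapping (assuming the standard triangular
+    # ordering of diploid Number=G fields, where genotype i/j with i <= j
+    # maps to j*(j+1)/2 + i):
+    #
+    #     def ij2G(i, j):                  # i <= j, both 0-based
+    #         return j * (j + 1) // 2 + i
+    #
+    #     [ij2G(0, 0), ij2G(0, 1), ij2G(1, 1)] == [0, 1, 2]   # 0/0, 0/1, 1/1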
+
+
+cdef class HTSFile(object):
+ cdef htsFile *htsfile # pointer to htsFile structure
+ cdef int64_t start_offset # BGZF offset of first record
+
+ cdef readonly object filename # filename as supplied by user
+ cdef readonly object mode # file opening mode
+ cdef readonly object index_filename # filename of index, if supplied by user
+
+ cdef readonly bint is_stream # Is htsfile a non-seekable stream
+ cdef readonly bint is_remote # Is htsfile a remote stream
+ cdef readonly bint duplicate_filehandle # Duplicate filehandle when opening via fh
+
+ cdef htsFile *_open_htsfile(self) except? NULL
--- /dev/null
+# cython: embedsignature=True
+# cython: profile=True
+# adds doc-strings for sphinx
+import os
+
+from posix.unistd cimport dup
+
+from pysam.libchtslib cimport *
+
+from pysam.libcutils cimport force_bytes, force_str, charptr_to_str, charptr_to_str_w_len
+from pysam.libcutils cimport encode_filename, from_string_and_size
+
+
+__all__ = ["get_verbosity", "set_verbosity"]
+
+
+########################################################################
+########################################################################
+## Constants
+########################################################################
+
+cdef int MAX_POS = 2 << 29
+cdef tuple FORMAT_CATEGORIES = ('UNKNOWN', 'ALIGNMENTS', 'VARIANTS', 'INDEX', 'REGIONS')
+cdef tuple FORMATS = ('UNKNOWN', 'BINARY_FORMAT', 'TEXT_FORMAT', 'SAM', 'BAM', 'BAI', 'CRAM', 'CRAI',
+ 'VCF', 'BCF', 'CSI', 'GZI', 'TBI', 'BED')
+cdef tuple COMPRESSION = ('NONE', 'GZIP', 'BGZF', 'CUSTOM')
+
+
+cpdef set_verbosity(int verbosity):
+ """Set htslib's hts_verbose global variable to the specified value."""
+ return hts_set_verbosity(verbosity)
+
+cpdef get_verbosity():
+ """Return the value of htslib's hts_verbose global variable."""
+ return hts_get_verbosity()
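+
+# A short usage sketch (assuming, as the __all__ declaration above suggests,
+# that these functions are re-exported at the pysam package level):
+#
+#     import pysam
+#     saved = pysam.get_verbosity()
+#     pysam.set_verbosity(0)         # silence htslib messages
+#     # ... open files, do work ...
+#     pysam.set_verbosity(saved)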
+
+
+class CallableValue(object):
+ def __init__(self, value):
+ self.value = value
+ def __call__(self):
+ return self.value
+ def __bool__(self):
+ return self.value
+ def __nonzero__(self):
+ return self.value
+ def __eq__(self, other):
+ return self.value == other
+ def __ne__(self, other):
+ return self.value != other
+
+
+CTrue = CallableValue(True)
+CFalse = CallableValue(False)
+
+
+cdef class HTSFile(object):
+ """
+ Base class for HTS file types
+ """
+ def __cinit__(self, *args, **kwargs):
+ self.htsfile = NULL
+ self.duplicate_filehandle = True
+
+ def __dealloc__(self):
+ if self.htsfile:
+ hts_close(self.htsfile)
+ self.htsfile = NULL
+
+ def __enter__(self):
+ return self
+
+ def __exit__(self, exc_type, exc_value, traceback):
+ self.close()
+ return False
+
+ @property
+ def category(self):
+ """General file format category. One of UNKNOWN, ALIGNMENTS,
+ VARIANTS, INDEX, REGIONS"""
+ if not self.htsfile:
+ raise ValueError('metadata not available on closed file')
+ return FORMAT_CATEGORIES[self.htsfile.format.category]
+
+ @property
+ def format(self):
+ """File format.
+
+ One of UNKNOWN, BINARY_FORMAT, TEXT_FORMAT, SAM, BAM,
+ BAI, CRAM, CRAI, VCF, BCF, CSI, GZI, TBI, BED.
+ """
+ if not self.htsfile:
+ raise ValueError('metadata not available on closed file')
+ return FORMATS[self.htsfile.format.format]
+
+ @property
+ def version(self):
+ """Tuple of file format version numbers (major, minor)"""
+ if not self.htsfile:
+ raise ValueError('metadata not available on closed file')
+ return self.htsfile.format.version.major, self.htsfile.format.version.minor
+
+ @property
+ def compression(self):
+ """File compression.
+
+ One of NONE, GZIP, BGZF, CUSTOM."""
+ if not self.htsfile:
+ raise ValueError('metadata not available on closed file')
+ return COMPRESSION[self.htsfile.format.compression]
+
+ @property
+ def description(self):
+ """Vaguely human readable description of the file format"""
+ if not self.htsfile:
+ raise ValueError('metadata not available on closed file')
+ cdef char *desc = hts_format_description(&self.htsfile.format)
+ try:
+ return charptr_to_str(desc)
+ finally:
+ free(desc)
+
+ @property
+ def is_open(self):
+ """return True if HTSFile is open and in a valid state."""
+ return CTrue if self.htsfile != NULL else CFalse
+
+ @property
+ def is_closed(self):
+ """return True if HTSFile is closed."""
+ return self.htsfile == NULL
+
+ @property
+ def closed(self):
+ """return True if HTSFile is closed."""
+ return self.htsfile == NULL
+
+ @property
+ def is_write(self):
+ """return True if HTSFile is open for writing"""
+ return self.htsfile != NULL and self.htsfile.is_write != 0
+
+ @property
+ def is_read(self):
+ """return True if HTSFile is open for reading"""
+ return self.htsfile != NULL and self.htsfile.is_write == 0
+
+ @property
+ def is_sam(self):
+ """return True if HTSFile is reading or writing a SAM alignment file"""
+ return self.htsfile != NULL and self.htsfile.format.format == sam
+
+ @property
+ def is_bam(self):
+ """return True if HTSFile is reading or writing a BAM alignment file"""
+ return self.htsfile != NULL and self.htsfile.format.format == bam
+
+ @property
+ def is_cram(self):
+ """return True if HTSFile is reading or writing a BAM alignment file"""
+ return self.htsfile != NULL and self.htsfile.format.format == cram
+
+ @property
+ def is_vcf(self):
+ """return True if HTSFile is reading or writing a VCF variant file"""
+ return self.htsfile != NULL and self.htsfile.format.format == vcf
+
+ @property
+ def is_bcf(self):
+ """return True if HTSFile is reading or writing a BCF variant file"""
+ return self.htsfile != NULL and self.htsfile.format.format == bcf
+
+ def reset(self):
+ """reset file position to beginning of file just after the header.
+
+ Returns
+ -------
+
+ The file position after moving the file pointer.
+
+ """
+ return self.seek(self.start_offset)
+
+ def seek(self, uint64_t offset):
+ """move file pointer to position *offset*, see :meth:`pysam.HTSFile.tell`."""
+ if not self.is_open:
+ raise ValueError('I/O operation on closed file')
+ if self.is_stream:
+ raise OSError('seek not available in streams')
+
+ cdef int64_t ret
+ if self.htsfile.format.compression != no_compression:
+ with nogil:
+ ret = bgzf_seek(hts_get_bgzfp(self.htsfile), offset, SEEK_SET)
+ else:
+ with nogil:
+ ret = hts_useek(self.htsfile, <int>offset, SEEK_SET)
+ return ret
+
+ def tell(self):
+ """return current file position, see :meth:`pysam.HTSFile.seek`."""
+ if not self.is_open:
+ raise ValueError('I/O operation on closed file')
+ if self.is_stream:
+ raise OSError('tell not available in streams')
+
+ cdef int64_t ret
+ if self.htsfile.format.compression != no_compression:
+ with nogil:
+ ret = bgzf_tell(hts_get_bgzfp(self.htsfile))
+ else:
+ with nogil:
+ ret = hts_utell(self.htsfile)
+ return ret
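+
+    # A minimal usage sketch for tell()/seek() (illustrative; on
+    # BGZF-compressed files the offset is a BGZF virtual offset, so it is
+    # only meaningful when passed back to seek() on the same file):
+    #
+    #     pos = f.tell()       # f: an open, seekable HTSFile subclass
+    #     # ... read some records ...
+    #     f.seek(pos)          # rewind to the remembered position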
+
+ cdef htsFile *_open_htsfile(self) except? NULL:
+ cdef char *cfilename
+ cdef char *cmode = self.mode
+ cdef int fd, dup_fd
+
+ if isinstance(self.filename, bytes):
+ cfilename = self.filename
+ with nogil:
+ return hts_open(cfilename, cmode)
+ else:
+ if isinstance(self.filename, int):
+ fd = self.filename
+ else:
+ fd = self.filename.fileno()
+
+ if self.duplicate_filehandle:
+ dup_fd = dup(fd)
+ else:
+ dup_fd = fd
+
+ # Replicate mode normalization done in hts_open_format
+ smode = self.mode.replace(b'b', b'').replace(b'c', b'')
+ if b'b' in self.mode:
+ smode += b'b'
+ elif b'c' in self.mode:
+ smode += b'c'
+ cmode = smode
+
+ hfile = hdopen(dup_fd, cmode)
+ if hfile == NULL:
+ raise IOError('Cannot create hfile')
+
+ try:
+ # filename.name can be an int
+ filename = str(self.filename.name)
+ except AttributeError:
+ filename = '<fd:{}>'.format(fd)
+
+ filename = encode_filename(filename)
+ cfilename = filename
+ with nogil:
+ return hts_hopen(hfile, cfilename, cmode)
+
+ def _exists(self):
+ """return False iff file is local, a file and exists.
+ """
+ return (not isinstance(self.filename, (str, bytes)) or
+ self.filename == b'-' or
+ self.is_remote or
+ os.path.exists(self.filename))
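+
+
+# A short illustration of the metadata properties (a sketch: HTSFile is a
+# base class, so these are normally used through a subclass such as
+# AlignmentFile; "ex1.bam" is a placeholder filename):
+#
+#     import pysam
+#     with pysam.AlignmentFile("ex1.bam", "rb") as f:
+#         assert f.is_bam and f.compression == "BGZF"
+#         print(f.category, f.format, f.version)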
--- /dev/null
+from pysam.libcalignmentfile cimport AlignedSegment, AlignmentFile
+
+#################################################
+# Compatibility Layer for pysam < 0.8
+
+# import all declarations from htslib
+from pysam.libchtslib cimport *
+
+cdef class AlignedRead(AlignedSegment):
+ pass
+
+cdef class Samfile(AlignmentFile):
+ pass
+
+# import the conversion functions
+cdef extern from "htslib_util.h":
+
+ # add *nbytes* into the variable length data of *src* at *pos*
+ bam1_t * pysam_bam_update(bam1_t * b,
+ size_t nbytes_old,
+ size_t nbytes_new,
+ uint8_t * pos)
+
+ # now: static
+ int aux_type2size(int)
+
+ char * pysam_bam_get_qname(bam1_t * b)
+ uint32_t * pysam_bam_get_cigar(bam1_t * b)
+ uint8_t * pysam_bam_get_seq(bam1_t * b)
+ uint8_t * pysam_bam_get_qual(bam1_t * b)
+ uint8_t * pysam_bam_get_aux(bam1_t * b)
+ int pysam_bam_get_l_aux(bam1_t * b)
+ char pysam_bam_seqi(uint8_t * s, int i)
+
+ uint16_t pysam_get_bin(bam1_t * b)
+ uint8_t pysam_get_qual(bam1_t * b)
+ uint8_t pysam_get_l_qname(bam1_t * b)
+ uint16_t pysam_get_flag(bam1_t * b)
+ uint16_t pysam_get_n_cigar(bam1_t * b)
+ void pysam_set_bin(bam1_t * b, uint16_t v)
+ void pysam_set_qual(bam1_t * b, uint8_t v)
+ void pysam_set_l_qname(bam1_t * b, uint8_t v)
+ void pysam_set_flag(bam1_t * b, uint16_t v)
+ void pysam_set_n_cigar(bam1_t * b, uint16_t v)
+ void pysam_update_flag(bam1_t * b, uint16_t v, uint16_t flag)
--- /dev/null
+# cython: embedsignature=True
+# cython: profile=True
+# adds doc-strings for sphinx
+import tempfile
+import os
+import sys
+import types
+import itertools
+import struct
+import ctypes
+import collections
+import re
+import platform
+import warnings
+from cpython cimport PyErr_SetString, \
+ PyBytes_Check, \
+ PyUnicode_Check, \
+ PyBytes_FromStringAndSize
+
+from cpython.version cimport PY_MAJOR_VERSION
+
+from pysam.libcalignmentfile cimport AlignmentFile, AlignedSegment
+
+
+cdef class Samfile(AlignmentFile):
+ '''Deprecated alternative for :class:`~pysam.AlignmentFile`
+
+ Added for backwards compatibility with pysam <= 0.8.0
+ '''
+ pass
+
+
+cdef class AlignedRead(AlignedSegment):
+ '''Deprecated alternative for :class:`~pysam.AlignedSegment`
+
+ Added for backwards compatibility with pysam <= 0.8.0
+ '''
+ pass
+
+
+__all__ = ['Samfile', 'AlignedRead']
+
+
--- /dev/null
+from libc.stdint cimport int8_t, int16_t, int32_t, int64_t
+from libc.stdint cimport uint8_t, uint16_t, uint32_t, uint64_t
+from libc.stdlib cimport malloc, calloc, realloc, free
+from libc.string cimport memcpy, memcmp, strncpy, strlen, strdup
+from libc.stdio cimport FILE, printf
+
+# Note: this replaces python "open"!
+cdef extern from "fcntl.h":
+ int open(char *pathname, int flags)
+
+cdef extern from "unistd.h" nogil:
+ ctypedef int ssize_t
+ ssize_t read(int fd, void *buf, size_t count)
+ int close(int fd)
+
+from pysam.libchtslib cimport hts_idx_t, hts_itr_t, htsFile, \
+ tbx_t, kstring_t, BGZF, HTSFile
+
+
+# These functions are put here and not in chtslib.pxd in order
+# to avoid warnings for unused functions.
+cdef extern from "pysam_stream.h" nogil:
+
+ ctypedef struct kstream_t:
+ pass
+
+ ctypedef struct kseq_t:
+ kstring_t name
+ kstring_t comment
+ kstring_t seq
+ kstring_t qual
+
+ kseq_t *kseq_init(BGZF *)
+ int kseq_read(kseq_t *)
+ void kseq_destroy(kseq_t *)
+ kstream_t *ks_init(BGZF *)
+ void ks_destroy(kstream_t *)
+
+ # Retrieve characters from stream until delimiter
+ # is reached placing results in str.
+ int ks_getuntil(kstream_t *,
+ int delimiter,
+ kstring_t * str,
+ int * dret)
+
+
+cdef class tabix_file_iterator:
+ cdef BGZF * fh
+ cdef kstream_t * kstream
+ cdef kstring_t buffer
+ cdef size_t size
+ cdef Parser parser
+ cdef int fd
+ cdef int duplicated_fd
+ cdef infile
+
+ cdef __cnext__(self)
+
+
+cdef class TabixFile(HTSFile):
+ # pointer to index structure
+ cdef tbx_t * index
+
+ cdef readonly object filename_index
+
+ cdef Parser parser
+
+ cdef encoding
+
+
+cdef class Parser:
+ cdef encoding
+ cdef parse(self, char * buffer, int len)
+
+
+cdef class asTuple(Parser):
+ cdef parse(self, char * buffer, int len)
+
+
+cdef class asGTF(Parser):
+ pass
+
+
+cdef class asBed(Parser):
+ pass
+
+
+cdef class asVCF(Parser):
+ pass
+
+
+cdef class TabixIterator:
+ cdef hts_itr_t * iterator
+ cdef TabixFile tabixfile
+ cdef kstring_t buffer
+ cdef encoding
+ cdef int __cnext__(self)
+
+
+cdef class TabixIteratorParsed(TabixIterator):
+ cdef Parser parser
+
+
+cdef class GZIterator:
+ cdef object _filename
+ cdef BGZF * gzipfile
+ cdef kstream_t * kstream
+ cdef kstring_t buffer
+ cdef int __cnext__(self)
+ cdef encoding
+
+
+cdef class GZIteratorHead(GZIterator):
+ pass
+
+
+cdef class GZIteratorParsed(GZIterator):
+ cdef Parser parser
+
+
+# Compatibility Layer for pysam < 0.8
+cdef class Tabixfile(TabixFile):
+ pass
--- /dev/null
+# cython: embedsignature=True
+# cython: profile=True
+###############################################################################
+###############################################################################
+# Cython wrapper for access to tabix indexed files in bgzf format
+###############################################################################
+# The principal classes and functions defined in this module are:
+#
+# class TabixFile class wrapping tabix indexed files in bgzf format
+#
+# class asTuple Parser class for tuples
+# class asGTF Parser class for GTF formatted rows
+# class asBed Parser class for Bed formatted rows
+# class asVCF Parser class for VCF formatted rows
+#
+# class tabix_generic_iterator Streamed iterator of bgzf formatted files
+#
+# Additionally this module defines several additional classes that are part
+# of the internal API. These are:
+#
+# class Parser base class for parsers of tab-separated rows
+# class tabix_file_iterator
+# class TabixIterator iterator class over rows in bgzf file
+# class EmptyIterator
+#
+# For backwards compatibility, the following classes are also defined:
+#
+# class Tabixfile equivalent to TabixFile
+#
+###############################################################################
+#
+# The MIT License
+#
+# Copyright (c) 2015 Andreas Heger
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+#
+###############################################################################
+import os
+import sys
+
+from libc.stdio cimport printf, fprintf, stderr
+from libc.string cimport strerror
+from libc.errno cimport errno
+from posix.unistd cimport dup
+
+from cpython cimport PyErr_SetString, PyBytes_Check, \
+ PyUnicode_Check, PyBytes_FromStringAndSize, \
+ PyObject_AsFileDescriptor
+
+from cpython.version cimport PY_MAJOR_VERSION
+
+cimport pysam.libctabixproxies as ctabixproxies
+
+from pysam.libchtslib cimport htsFile, hts_open, hts_close, HTS_IDX_START,\
+ BGZF, bgzf_open, bgzf_dopen, bgzf_close, bgzf_write, \
+ tbx_index_build, tbx_index_load, tbx_itr_queryi, tbx_itr_querys, \
+ tbx_conf_t, tbx_seqnames, tbx_itr_next, tbx_itr_destroy, \
+ tbx_destroy, hisremote, region_list
+
+from pysam.libcutils cimport force_bytes, force_str, charptr_to_str
+from pysam.libcutils cimport encode_filename, from_string_and_size
+
+cdef class Parser:
+
+ def __init__(self, encoding="ascii"):
+ self.encoding = encoding
+
+ def set_encoding(self, encoding):
+ self.encoding = encoding
+
+ def get_encoding(self):
+ return self.encoding
+
+ cdef parse(self, char * buffer, int length):
+ raise NotImplementedError(
+ 'parse method of %s not implemented' % str(self))
+
+ def __call__(self, char * buffer, int length):
+ return self.parse(buffer, length)
+
+
+cdef class asTuple(Parser):
+ '''converts a :term:`tabix row` into a python tuple.
+
+ A field in a row is accessed by numeric index.
+ '''
+ cdef parse(self, char * buffer, int len):
+ cdef ctabixproxies.TupleProxy r
+ r = ctabixproxies.TupleProxy(self.encoding)
+ # need to copy - there were some
+ # persistence issues with "present"
+ r.copy(buffer, len)
+ return r
+
+
+cdef class asGTF(Parser):
+ '''converts a :term:`tabix row` into a GTF record with the following
+ fields:
+
+ +----------+----------+-------------------------------+
+ |*Column* |*Name* |*Content* |
+ +----------+----------+-------------------------------+
+ |1 |contig |the chromosome name |
+ +----------+----------+-------------------------------+
+ |2 |feature |The feature type |
+ +----------+----------+-------------------------------+
+ |3 |source |The feature source |
+ +----------+----------+-------------------------------+
+ |4 |start |genomic start coordinate |
+ | | |(0-based) |
+ +----------+----------+-------------------------------+
+ |5 |end |genomic end coordinate |
+ | | |(0-based) |
+ +----------+----------+-------------------------------+
+ |6 |score |feature score |
+ +----------+----------+-------------------------------+
+ |7 |strand |strand |
+ +----------+----------+-------------------------------+
+ |8 |frame |frame |
+ +----------+----------+-------------------------------+
+ |9 |attributes|the attribute field |
+ +----------+----------+-------------------------------+
+
+ GTF formatted entries also define the following fields that
+ are derived from the attributes field:
+
+ +--------------------+------------------------------+
+ |*Name* |*Content* |
+ +--------------------+------------------------------+
+ |gene_id |the gene identifier |
+ +--------------------+------------------------------+
+ |transcript_id |the transcript identifier |
+ +--------------------+------------------------------+
+
+ '''
+ cdef parse(self, char * buffer, int len):
+ cdef ctabixproxies.GTFProxy r
+ r = ctabixproxies.GTFProxy(self.encoding)
+ r.copy(buffer, len)
+ return r
+
+
+cdef class asBed(Parser):
+ '''converts a :term:`tabix row` into a bed record
+ with the following fields:
+
+ +-----------+-----------+------------------------------------------+
+ |*Column* |*Field* |*Contents* |
+ | | | |
+ +-----------+-----------+------------------------------------------+
+ |1 |contig |contig |
+ | | | |
+ +-----------+-----------+------------------------------------------+
+ |2 |start |genomic start coordinate (zero-based) |
+ +-----------+-----------+------------------------------------------+
+ |3 |end |genomic end coordinate plus one |
+ | | |(zero-based) |
+ +-----------+-----------+------------------------------------------+
+ |4 |name |name of feature. |
+ +-----------+-----------+------------------------------------------+
+ |5 |score |score of feature |
+ +-----------+-----------+------------------------------------------+
+ |6 |strand |strand of feature |
+ +-----------+-----------+------------------------------------------+
+ |7 |thickStart |thickStart |
+ +-----------+-----------+------------------------------------------+
+ |8 |thickEnd |thickEnd |
+ +-----------+-----------+------------------------------------------+
+ |9 |itemRGB |itemRGB |
+ +-----------+-----------+------------------------------------------+
+    |10        |blockCount |number of blocks                          |
+ +-----------+-----------+------------------------------------------+
+ |11 |blockSizes |',' separated string of block sizes |
+ +-----------+-----------+------------------------------------------+
+ |12 |blockStarts|',' separated string of block genomic |
+ | | |start positions |
+ +-----------+-----------+------------------------------------------+
+
+ Only the first three fields are required. Additional
+ fields are optional, but if one is defined, all the preceding
+    ones need to be defined as well.
+
+ '''
+ cdef parse(self, char * buffer, int len):
+ cdef ctabixproxies.BedProxy r
+ r = ctabixproxies.BedProxy(self.encoding)
+ r.copy(buffer, len)
+ return r
+
+
+cdef class asVCF(Parser):
+ '''converts a :term:`tabix row` into a VCF record with
+ the following fields:
+
+ +----------+---------+------------------------------------+
+ |*Column* |*Field* |*Contents* |
+ | | | |
+ +----------+---------+------------------------------------+
+ |1 |contig |chromosome |
+ +----------+---------+------------------------------------+
+ |2 |pos |chromosomal position, zero-based |
+ +----------+---------+------------------------------------+
+ |3 |id |id |
+ +----------+---------+------------------------------------+
+ |4 |ref |reference allele |
+ +----------+---------+------------------------------------+
+ |5 |alt |alternate alleles |
+ +----------+---------+------------------------------------+
+ |6 |qual |quality |
+ +----------+---------+------------------------------------+
+ |7 |filter |filter |
+ +----------+---------+------------------------------------+
+ |8 |info |info |
+ +----------+---------+------------------------------------+
+ |9 |format |format specifier. |
+ +----------+---------+------------------------------------+
+
+ Access to genotypes is via index::
+
+ contig = vcf.contig
+ first_sample_genotype = vcf[0]
+ second_sample_genotype = vcf[1]
+
+ '''
+ cdef parse(self, char * buffer, int len):
+ cdef ctabixproxies.VCFProxy r
+ r = ctabixproxies.VCFProxy(self.encoding)
+ r.copy(buffer, len)
+ return r
+
+
+cdef class TabixFile:
+ """Random access to bgzf formatted files that
+ have been indexed by :term:`tabix`.
+
+ The file is automatically opened. The index file of file
+ ``<filename>`` is expected to be called ``<filename>.tbi``
+ by default (see parameter `index`).
+
+ Parameters
+ ----------
+
+ filename : string
+ Filename of bgzf file to be opened.
+
+ index : string
+ The filename of the index. If not set, the default is to
+        assume that the index is called ``filename.tbi``.
+
+ mode : char
+ The file opening mode. Currently, only ``r`` is permitted.
+
+ parser : :class:`pysam.Parser`
+
+ sets the default parser for this tabix file. If `parser`
+ is None, the results are returned as an unparsed string.
+ Otherwise, `parser` is assumed to be a functor that will return
+ parsed data (see for example :class:`~pysam.asTuple` and
+ :class:`~pysam.asGTF`).
+
+ encoding : string
+
+ The encoding passed to the parser
+
+ Raises
+ ------
+
+ ValueError
+ if index file is missing.
+
+ IOError
+ if file could not be opened
+ """
+ def __cinit__(self,
+ filename,
+ mode='r',
+ parser=None,
+ index=None,
+ encoding="ascii",
+ *args,
+ **kwargs ):
+
+ self.htsfile = NULL
+ self.is_remote = False
+ self.is_stream = False
+ self.parser = parser
+ self._open(filename, mode, index, *args, **kwargs)
+ self.encoding = encoding
+
+ def _open( self,
+ filename,
+ mode='r',
+ index=None,
+ ):
+ '''open a :term:`tabix file` for reading.'''
+
+ if mode != 'r':
+ raise ValueError("invalid file opening mode `%s`" % mode)
+
+ if self.htsfile != NULL:
+ self.close()
+ self.htsfile = NULL
+
+ filename_index = index or (filename + ".tbi")
+ # encode all the strings to pass to tabix
+ self.filename = encode_filename(filename)
+ self.filename_index = encode_filename(filename_index)
+
+ self.is_stream = self.filename == b'-'
+ self.is_remote = hisremote(self.filename)
+
+ if not self.is_remote:
+ if not os.path.exists(filename):
+ raise IOError("file `%s` not found" % filename)
+
+ if not os.path.exists(filename_index):
+ raise IOError("index `%s` not found" % filename_index)
+
+ # open file
+ cdef char *cfilename = self.filename
+ with nogil:
+ self.htsfile = hts_open(cfilename, 'r')
+
+ if self.htsfile == NULL:
+ raise IOError("could not open file `%s`" % filename)
+
+ #if self.htsfile.format.category != region_list:
+ # raise ValueError("file does not contain region data")
+
+ cfilename = self.filename_index
+ with nogil:
+ self.index = tbx_index_load(cfilename)
+
+ if self.index == NULL:
+ raise IOError("could not open index for `%s`" % filename)
+
+ if not self.is_stream:
+ self.start_offset = self.tell()
+
+ def _dup(self):
+ '''return a copy of this tabix file.
+
+ The file is being re-opened.
+ '''
+ return TabixFile(self.filename,
+ mode="r",
+ parser=self.parser,
+ index=self.filename_index,
+ encoding=self.encoding)
+
+ def fetch(self,
+ reference=None,
+ start=None,
+ end=None,
+ region=None,
+ parser=None,
+ multiple_iterators=False):
+ '''fetch one or more rows in a :term:`region` using 0-based
+ indexing. The region is specified by :term:`reference`,
+ *start* and *end*. Alternatively, a samtools :term:`region`
+ string can be supplied.
+
+ Without *reference* or *region* all entries will be fetched.
+
+ If only *reference* is set, all reads matching on *reference*
+ will be fetched.
+
+ If *parser* is None, the default parser will be used for
+ parsing.
+
+ Set *multiple_iterators* to true if you will be using multiple
+ iterators on the same file at the same time. The iterator
+        returned will receive its own copy of a filehandle to the file,
+        effectively re-opening the file. Re-opening a file creates
+ some overhead, so beware.
+
+ '''
+ if not self.is_open():
+ raise ValueError("I/O operation on closed file")
+
+ # convert coordinates to region string, which is one-based
+ if reference:
+ if end is not None:
+ if end < 0:
+ raise ValueError("end out of range (%i)" % end)
+ if start is None:
+ start = 0
+
+ if start < 0:
+                    raise ValueError("start out of range (%i)" % start)
+ elif start > end:
+ raise ValueError(
+                        'start (%i) > end (%i)' % (start, end))
+ elif start == end:
+ return EmptyIterator()
+ else:
+ region = '%s:%i-%i' % (reference, start + 1, end)
+ elif start is not None:
+ if start < 0:
+                raise ValueError("start out of range (%i)" % start)
+ region = '%s:%i' % (reference, start + 1)
+ else:
+ region = reference
+
+ # get iterator
+ cdef hts_itr_t * itr
+ cdef char *cstr
+ cdef TabixFile fileobj
+
+ # reopen the same file if necessary
+ if multiple_iterators:
+ fileobj = self._dup()
+ else:
+ fileobj = self
+
+ if region is None:
+ # without region or reference - iterate from start
+ with nogil:
+ itr = tbx_itr_queryi(fileobj.index,
+ HTS_IDX_START,
+ 0,
+ 0)
+ else:
+ s = force_bytes(region, encoding=fileobj.encoding)
+ cstr = s
+ with nogil:
+ itr = tbx_itr_querys(fileobj.index, cstr)
+
+ if itr == NULL:
+ if region is None:
+ if len(self.contigs) > 0:
+                    # when accessing a tabix file created prior to tabix 1.0
+ # the full-file iterator is empty.
+ raise ValueError(
+ "could not create iterator, possible "
+ "tabix version mismatch")
+ else:
+ # possible reason is that the file is empty -
+ # return an empty iterator
+ return EmptyIterator()
+ else:
+ raise ValueError(
+ "could not create iterator for region '%s'" %
+ region)
+
+ # use default parser if no parser is specified
+ if parser is None:
+ parser = fileobj.parser
+
+ cdef TabixIterator a
+ if parser is None:
+ a = TabixIterator(encoding=fileobj.encoding)
+ else:
+ parser.set_encoding(fileobj.encoding)
+ a = TabixIteratorParsed(parser)
+
+ a.tabixfile = fileobj
+ a.iterator = itr
+
+ return a
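+
+    # A short usage sketch for fetch() (filenames are placeholders; the data
+    # file must be bgzip-compressed and tabix-indexed, e.g. by tabix_index):
+    #
+    #     tbx = TabixFile("example.bed.gz", parser=asBed())
+    #     for row in tbx.fetch("chr1", 999, 2000):
+    #         print(row.contig, row.start, row.end)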
+
+ ###############################################################
+ ###############################################################
+ ###############################################################
+ ## properties
+ ###############################################################
+ property header:
+ '''the file header.
+
+ The file header consists of the lines at the beginning of a
+ file that are prefixed by the comment character ``#``.
+
+ .. note::
+ The header is returned as an iterator presenting lines
+ without the newline character.
+
+ .. note::
+ The header is only available for local files. For remote
+ files an Attribute Error is raised.
+
+ '''
+
+ def __get__(self):
+ if self.is_remote:
+ raise AttributeError(
+ "the header is not available for remote files")
+ return GZIteratorHead(self.filename)
+
+ property contigs:
+ '''list of chromosome names'''
+ def __get__(self):
+ cdef char ** sequences
+ cdef int nsequences
+
+ with nogil:
+ sequences = tbx_seqnames(self.index, &nsequences)
+ cdef int x
+ result = []
+ for x from 0 <= x < nsequences:
+ result.append(force_str(sequences[x]))
+
+ # htslib instructions:
+ # only free container, not the sequences themselves
+ free(sequences)
+
+ return result
+
+ def close(self):
+ '''
+ closes the :class:`pysam.TabixFile`.'''
+ if self.htsfile != NULL:
+ hts_close(self.htsfile)
+ self.htsfile = NULL
+ if self.index != NULL:
+ tbx_destroy(self.index)
+ self.index = NULL
+
+ def __dealloc__( self ):
+ # remember: dealloc cannot call other python methods
+ # note: no doc string
+ # note: __del__ is not called.
+ if self.htsfile != NULL:
+ hts_close(self.htsfile)
+ self.htsfile = NULL
+ if self.index != NULL:
+ tbx_destroy(self.index)
+
+
+cdef class TabixIterator:
+ """iterates over rows in *tabixfile* in region
+ given by *tid*, *start* and *end*.
+ """
+
+ def __init__(self, encoding="ascii"):
+ self.encoding = encoding
+
+ def __iter__(self):
+ self.buffer.s = NULL
+ self.buffer.l = 0
+ self.buffer.m = 0
+
+ return self
+
+ cdef int __cnext__(self):
+ '''iterate to next element.
+
+ Return -5 if file has been closed when this function
+ was called.
+ '''
+ if self.tabixfile.htsfile == NULL:
+ return -5
+
+ cdef int retval
+
+ while 1:
+ with nogil:
+ retval = tbx_itr_next(
+ self.tabixfile.htsfile,
+ self.tabixfile.index,
+ self.iterator,
+ &self.buffer)
+
+ if retval < 0:
+ break
+
+ if self.buffer.s[0] != '#':
+ break
+
+ return retval
+
+ def __next__(self):
+ """python version of next().
+
+ pyrex uses this non-standard name instead of next()
+ """
+
+ cdef int retval = self.__cnext__()
+ if retval == -5:
+ raise IOError("iteration on closed file")
+ elif retval < 0:
+ raise StopIteration
+
+ return charptr_to_str(self.buffer.s, self.encoding)
+
+ def next(self):
+ return self.__next__()
+
+ def __dealloc__(self):
+ if <void*>self.iterator != NULL:
+ tbx_itr_destroy(self.iterator)
+ if self.buffer.s != NULL:
+ free(self.buffer.s)
+
+
+class EmptyIterator:
+ '''empty iterator'''
+
+ def __iter__(self):
+ return self
+
+ def next(self):
+ raise StopIteration()
+
+ def __next__(self):
+ raise StopIteration()
+
+
+cdef class TabixIteratorParsed(TabixIterator):
+ """iterates over mapped reads in a region.
+
+ The *parser* determines the encoding.
+
+ Returns parsed data.
+ """
+
+ def __init__(self,
+ Parser parser):
+
+ TabixIterator.__init__(self)
+ self.parser = parser
+
+ def __next__(self):
+ """python version of next().
+
+ pyrex uses this non-standard name instead of next()
+ """
+
+ cdef int retval = self.__cnext__()
+ if retval == -5:
+ raise IOError("iteration on closed file")
+ elif retval < 0:
+ raise StopIteration
+
+ return self.parser.parse(self.buffer.s,
+ self.buffer.l)
+
+
+cdef class GZIterator:
+ def __init__(self, filename, int buffer_size=65536, encoding="ascii"):
+ '''iterate line-by-line through gzip (or bgzip)
+ compressed file.
+ '''
+ if not os.path.exists(filename):
+ raise IOError("No such file or directory: %s" % filename)
+
+ filename = encode_filename(filename)
+ cdef char *cfilename = filename
+ with nogil:
+ self.gzipfile = bgzf_open(cfilename, "r")
+ self._filename = filename
+ self.kstream = ks_init(self.gzipfile)
+ self.encoding = encoding
+
+ self.buffer.l = 0
+ self.buffer.m = 0
+ self.buffer.s = <char*>malloc(buffer_size)
+
+ def __dealloc__(self):
+ '''close file.'''
+ if self.gzipfile != NULL:
+ bgzf_close(self.gzipfile)
+ self.gzipfile = NULL
+ if self.buffer.s != NULL:
+ free(self.buffer.s)
+ if self.kstream != NULL:
+ ks_destroy(self.kstream)
+
+ def __iter__(self):
+ return self
+
+ cdef int __cnext__(self):
+ cdef int dret = 0
+ cdef int retval = 0
+ while 1:
+ with nogil:
+ retval = ks_getuntil(self.kstream, '\n', &self.buffer, &dret)
+
+ if retval < 0:
+ break
+
+ return dret
+ return -1
+
+ def __next__(self):
+ """python version of next().
+ """
+ cdef int retval = self.__cnext__()
+ if retval < 0:
+ raise StopIteration
+ return force_str(self.buffer.s, self.encoding)
+
+
+cdef class GZIteratorHead(GZIterator):
+ '''iterate line-by-line through gzip (or bgzip)
+ compressed file returning comments at top of file.
+ '''
+
+ def __next__(self):
+ """python version of next().
+ """
+ cdef int retval = self.__cnext__()
+ if retval < 0:
+ raise StopIteration
+ if self.buffer.s[0] == '#':
+ return self.buffer.s
+ else:
+ raise StopIteration
+
+
+cdef class GZIteratorParsed(GZIterator):
+ '''iterate line-by-line through gzip (or bgzip)
+    compressed file returning parsed rows.
+ '''
+
+ def __init__(self, parser):
+ self.parser = parser
+
+ def __next__(self):
+ """python version of next().
+ """
+ cdef int retval = self.__cnext__()
+ if retval < 0:
+ raise StopIteration
+
+ return self.parser.parse(self.buffer.s,
+ self.buffer.l)
+
+
+def tabix_compress(filename_in,
+ filename_out,
+ force=False):
+ '''compress *filename_in* writing the output to *filename_out*.
+
+ Raise an IOError if *filename_out* already exists, unless *force*
+ is set.
+ '''
+
+ if not force and os.path.exists(filename_out):
+ raise IOError(
+ "Filename '%s' already exists, use *force* to "
+ "overwrite" % filename_out)
+
+ cdef int WINDOW_SIZE
+ cdef int c, r
+ cdef void * buffer
+ cdef BGZF * fp
+ cdef int fd_src
+ cdef bint is_empty = True
+ cdef int O_RDONLY
+ O_RDONLY = os.O_RDONLY
+
+ WINDOW_SIZE = 64 * 1024
+
+ fn = encode_filename(filename_out)
+ cdef char *cfn = fn
+ with nogil:
+ fp = bgzf_open(cfn, "w")
+ if fp == NULL:
+ raise IOError("could not open '%s' for writing" % filename_out)
+
+ fn = encode_filename(filename_in)
+ fd_src = open(fn, O_RDONLY)
+    if fd_src < 0:
+ raise IOError("could not open '%s' for reading" % filename_in)
+
+ buffer = malloc(WINDOW_SIZE)
+ c = 1
+
+ while c > 0:
+ with nogil:
+ c = read(fd_src, buffer, WINDOW_SIZE)
+ if c > 0:
+ is_empty = False
+ r = bgzf_write(fp, buffer, c)
+ if r < 0:
+ free(buffer)
+ raise OSError("writing failed")
+
+ free(buffer)
+ r = bgzf_close(fp)
+ if r < 0:
+ raise OSError("error %i when writing to file %s" % (r, filename_out))
+
+ r = close(fd_src)
+ # an empty file will return with -1, thus ignore this.
+ if r < 0:
+ if not (r == -1 and is_empty):
+ raise OSError("error %i when closing file %s" % (r, filename_in))
+
+
+def tabix_index( filename,
+ force = False,
+ seq_col = None,
+ start_col = None,
+ end_col = None,
+ preset = None,
+ meta_char = "#",
+ zerobased = False,
+ int min_shift = -1,
+ ):
+ '''index tab-separated *filename* using tabix.
+
+ An existing index will not be overwritten unless
+ *force* is set.
+
+ The index will be built from coordinates
+ in columns *seq_col*, *start_col* and *end_col*.
+
+ The contents of *filename* have to be sorted by
+ contig and position - the method does not check
+ if the file is sorted.
+
+ Column indices are 0-based. Coordinates in the file
+ are assumed to be 1-based.
+
+ If *preset* is provided, the column coordinates
+ are taken from a preset. Valid values for preset
+ are "gff", "bed", "sam", "vcf", psltbl", "pileup".
+
+ Lines beginning with *meta_char* and the first
+ *line_skip* lines will be skipped.
+
+ If *filename* does not end in ".gz", it will be automatically
+ compressed. The original file will be removed and only the
+ compressed file will be retained.
+
+ If *filename* ends in *gz*, the file is assumed to be already
+ compressed with bgzf.
+
+    *min_shift* sets the minimal interval size to 1<<INT; 0 for the
+ old tabix index. The default of -1 is changed inside htslib to
+ the old tabix default of 0.
+
+ returns the filename of the compressed data
+
+ '''
+
+ if not os.path.exists(filename):
+ raise IOError("No such file '%s'" % filename)
+
+ if preset is None and \
+ (seq_col is None or start_col is None or end_col is None):
+ raise ValueError(
+ "neither preset nor seq_col,start_col and end_col given")
+
+ if not filename.endswith(".gz"):
+ tabix_compress(filename, filename + ".gz", force=force)
+ os.unlink( filename )
+ filename += ".gz"
+
+ if not force and os.path.exists(filename + ".tbi"):
+ raise IOError(
+ "Filename '%s.tbi' already exists, use *force* to overwrite")
+
+ # columns (1-based):
+ # preset-code, contig, start, end, metachar for
+ # comments, lines to ignore at beginning
+ # 0 is a missing column
+ preset2conf = {
+ 'gff' : (0, 1, 4, 5, ord('#'), 0),
+ 'bed' : (0x10000, 1, 2, 3, ord('#'), 0),
+ 'psltbl' : (0x10000, 15, 17, 18, ord('#'), 0),
+ 'sam' : (1, 3, 4, 0, ord('@'), 0),
+ 'vcf' : (2, 1, 2, 0, ord('#'), 0),
+ 'pileup': (3, 1, 2, 0, ord('#'), 0),
+ }
+
+ if preset:
+ try:
+ conf_data = preset2conf[preset]
+ except KeyError:
+ raise KeyError(
+ "unknown preset '%s', valid presets are '%s'" %
+ (preset, ",".join(preset2conf.keys())))
+ else:
+ if end_col == None:
+ end_col = -1
+ preset = 0
+
+ # note that tabix internally works with 0-based coordinates
+ # and open/closed intervals. When using a preset, conversion
+ # is automatically taken care of. Otherwise, the coordinates
+ # are assumed to be 1-based closed intervals and -1 is
+ # subtracted from the start coordinate. To avoid doing this,
+ # set the TI_FLAG_UCSC=0x10000 flag:
+ if zerobased:
+ preset = preset | 0x10000
+
+ conf_data = (preset, seq_col+1, start_col+1, end_col+1, ord(meta_char), 0)
+
+ cdef tbx_conf_t conf
+ conf.preset, conf.sc, conf.bc, conf.ec, conf.meta_char, conf.line_skip = conf_data
+
+
+ fn = encode_filename(filename)
+ cdef char *cfn = fn
+ with nogil:
+ tbx_index_build(cfn, min_shift, &conf)
+
+ return filename
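+
+# A minimal usage sketch (filename is a placeholder): the file is
+# bgzf-compressed if needed, a ".tbi" index is written next to it, and the
+# name of the compressed file is returned:
+#
+#     fn = tabix_index("example.vcf", preset="vcf", force=True)
+#     # fn == "example.vcf.gz"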
+
+# #########################################################
+# cdef class tabix_file_iterator_old:
+# '''iterate over ``infile``.
+
+# This iterator is not safe. If the :meth:`__next__()` method is called
+# after ``infile`` is closed, the result is undefined (see ``fclose()``).
+
+# The iterator might either raise a StopIteration or segfault.
+# '''
+
+
+# def __cinit__(self,
+# infile,
+# Parser parser,
+# int buffer_size = 65536 ):
+
+# cdef int fd = PyObject_AsFileDescriptor( infile )
+# if fd == -1: raise ValueError( "I/O operation on closed file." )
+# self.infile = fdopen( fd, 'r')
+
+# if self.infile == NULL: raise ValueError( "I/O operation on closed file." )
+
+# self.buffer = <char*>malloc( buffer_size )
+# self.size = buffer_size
+# self.parser = parser
+
+# def __iter__(self):
+# return self
+
+# cdef __cnext__(self):
+
+# cdef char * b
+# cdef size_t nbytes
+# b = self.buffer
+
+# while not feof( self.infile ):
+# nbytes = getline( &b, &self.size, self.infile)
+
+# # stop at first error or eof
+# if (nbytes == -1): break
+# # skip comments
+# if (b[0] == '#'): continue
+
+# # skip empty lines
+# if b[0] == '\0' or b[0] == '\n' or b[0] == '\r': continue
+
+# # make sure that entry is complete
+# if b[nbytes-1] != '\n' and b[nbytes-1] != '\r':
+# result = b
+# raise ValueError( "incomplete line at %s" % result )
+
+# # make sure that this goes fully through C
+# # otherwise buffer is copied to/from a
+# # Python object causing segfaults as
+# # the wrong memory is freed
+# return self.parser.parse( b, nbytes )
+
+# raise StopIteration
+
+# def __dealloc__(self):
+# free(self.buffer)
+
+# def __next__(self):
+# return self.__cnext__()
+
+#########################################################
+#########################################################
+#########################################################
+## Iterators for parsing through unindexed files.
+#########################################################
+# cdef buildGzipError(void *gzfp):
+# cdef int errnum = 0
+# cdef char *s = gzerror(gzfp, &errnum)
+# return "error (%d): %s (%d: %s)" % (errno, strerror(errno), errnum, s)
+
+
+cdef class tabix_file_iterator:
+ '''iterate over a compressed or uncompressed ``infile``.
+ '''
+
+ def __cinit__(self,
+ infile,
+ Parser parser,
+ int buffer_size=65536):
+
+ if infile.closed:
+ raise ValueError("I/O operation on closed file.")
+
+ self.infile = infile
+
+ cdef int fd = PyObject_AsFileDescriptor(infile)
+ if fd == -1:
+ raise ValueError("I/O operation on closed file.")
+
+ self.duplicated_fd = dup(fd)
+
+ # From the manual:
+ # gzopen can be used to read a file which is not in gzip format;
+ # in this case gzread will directly read from the file without decompression.
+ # When reading, this will be detected automatically by looking
+ # for the magic two-byte gzip header.
+ self.fh = bgzf_dopen(self.duplicated_fd, 'r')
+
+ if self.fh == NULL:
+ raise IOError('%s' % strerror(errno))
+
+ self.kstream = ks_init(self.fh)
+
+ self.buffer.s = <char*>malloc(buffer_size)
+ #if self.buffer == NULL:
+ # raise MemoryError( "tabix_file_iterator: could not allocate %i bytes" % buffer_size)
+ #self.size = buffer_size
+ self.parser = parser
+
+ def __iter__(self):
+ return self
+
+ cdef __cnext__(self):
+
+ cdef char * b
+ cdef int dret = 0
+ cdef int retval = 0
+ while 1:
+ with nogil:
+ retval = ks_getuntil(self.kstream, '\n', &self.buffer, &dret)
+
+ if retval < 0:
+ break
+ #raise IOError('gzip error: %s' % buildGzipError( self.fh ))
+
+ b = self.buffer.s
+
+ # skip comments
+ if (b[0] == '#'):
+ continue
+
+ # skip empty lines
+ if b[0] == '\0' or b[0] == '\n' or b[0] == '\r':
+ continue
+
+ # gzgets terminates at \n, no need to test
+
+ # parser creates a copy
+ return self.parser.parse(b, self.buffer.l)
+
+ raise StopIteration
+
+ def __dealloc__(self):
+ free(self.buffer.s)
+ ks_destroy(self.kstream)
+ bgzf_close(self.fh)
+
+ def __next__(self):
+ return self.__cnext__()
+
+ def next(self):
+ return self.__cnext__()
+
+
+class tabix_generic_iterator:
+ '''iterate over ``infile``.
+
+    Permits the use of file-like objects, for example from the gzip module.
+ '''
+ def __init__(self, infile, parser):
+
+ self.infile = infile
+ if self.infile.closed:
+ raise ValueError("I/O operation on closed file.")
+ self.parser = parser
+
+ def __iter__(self):
+ return self
+
+ # cython version - required for python 3
+ def __next__(self):
+
+ cdef char * b
+ cdef char * cpy
+ cdef size_t nbytes
+
+ encoding = self.parser.get_encoding()
+
+ # note that GzipFile.close() does not close the file
+ # reading is still possible.
+ if self.infile.closed:
+ raise ValueError("I/O operation on closed file.")
+
+ while 1:
+
+ line = self.infile.readline()
+ if not line:
+ break
+
+ s = force_bytes(line, encoding)
+ b = s
+ nbytes = len(line)
+ assert b[nbytes] == '\0'
+
+ # skip comments
+ if b[0] == '#':
+ continue
+
+ # skip empty lines
+ if b[0] == '\0' or b[0] == '\n' or b[0] == '\r':
+ continue
+
+ # make sure that entry is complete
+ if b[nbytes-1] != '\n' and b[nbytes-1] != '\r':
+ raise ValueError("incomplete line at %s" % line)
+
+ bytes_cpy = <bytes> b
+ cpy = <char *> bytes_cpy
+
+ return self.parser(cpy, nbytes)
+
+ raise StopIteration
+
+ # python version - required for python 2.7
+ def next(self):
+ return self.__next__()
+
+def tabix_iterator(infile, parser):
+ """return an iterator over all entries in a file.
+
+ Results are returned parsed as specified by the *parser*. If
+ *parser* is None, the results are returned as an unparsed string.
+ Otherwise, *parser* is assumed to be a functor that will return
+ parsed data (see for example :class:`~pysam.asTuple` and
+ :class:`~pysam.asGTF`).
+
+ """
+ if PY_MAJOR_VERSION >= 3:
+ return tabix_generic_iterator(infile, parser)
+ else:
+ return tabix_file_iterator(infile, parser)
+
+ # file objects can use C stdio
+ # used to be: isinstance( infile, file):
+ # if PY_MAJOR_VERSION >= 3:
+ # if isinstance( infile, io.IOBase ):
+ # return tabix_copy_iterator( infile, parser )
+ # else:
+ # return tabix_generic_iterator( infile, parser )
+ # else:
+# if isinstance( infile, file ):
+# return tabix_copy_iterator( infile, parser )
+# else:
+# return tabix_generic_iterator( infile, parser )
+
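+# A short usage sketch (hedged: "example.gtf.gz" is a placeholder; any
+# file-like object that yields lines should work, for example one from the
+# gzip module):
+#
+#     import gzip
+#     with gzip.open("example.gtf.gz") as f:
+#         for row in tabix_iterator(f, asGTF()):
+#             print(row.contig, row.feature)
+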
+cdef class Tabixfile(TabixFile):
+ """Tabixfile is deprecated: use TabixFile instead"""
+ pass
+
+
+__all__ = [
+ "tabix_index",
+ "tabix_compress",
+ "TabixFile",
+ "Tabixfile",
+ "asTuple",
+ "asGTF",
+ "asVCF",
+ "asBed",
+ "GZIterator",
+ "GZIteratorHead",
+ "tabix_iterator",
+ "tabix_generic_iterator",
+ "tabix_file_iterator",
+]
--- /dev/null
+#cdef extern from "Python.h":
+# ctypedef struct FILE
+
+from libc.stdint cimport uint8_t, int32_t, uint32_t, int64_t, uint64_t
+
+cdef class TupleProxy:
+
+ cdef:
+ char * data
+ char ** fields
+ int nfields
+ int index
+ int nbytes
+ int offset
+ bint is_modified
+
+ cdef encoding
+
+ cpdef int getMaxFields(self)
+ cpdef int getMinFields(self)
+# cdef char * _getindex(self, int idx)
+
+ cdef take(self, char * buffer, size_t nbytes)
+ cdef present(self, char * buffer, size_t nbytes)
+ cdef copy(self, char * buffer, size_t nbytes, bint reset=*)
+ cdef update(self, char * buffer, size_t nbytes)
+
+cdef class GTFProxy(TupleProxy) :
+
+ cdef:
+ char * _attributes
+ cdef bint hasOwnAttributes
+
+ cpdef int getMaxFields(self)
+ cpdef int getMinFields(self)
+ cdef char * getAttributes(self)
+
+cdef class NamedTupleProxy(TupleProxy):
+ pass
+
+cdef class BedProxy(NamedTupleProxy):
+
+ cdef:
+ char * contig
+ uint32_t start
+ uint32_t end
+ int bedfields
+
+ cpdef int getMaxFields(self)
+ cpdef int getMinFields(self)
+ cdef update(self, char * buffer, size_t nbytes)
+
+cdef class VCFProxy(NamedTupleProxy):
+
+ cdef:
+ char * contig
+ uint32_t pos
+
+ cdef update(self, char * buffer, size_t nbytes)
--- /dev/null
+from cpython cimport PyBytes_FromStringAndSize
+
+from libc.stdio cimport printf, feof, fgets
+from libc.string cimport strcpy, strlen, memcmp, memcpy, memchr, strstr, strchr
+from libc.stdlib cimport free, malloc, calloc, realloc
+from libc.stdlib cimport atoi, atol, atof
+
+from pysam.libcutils cimport force_bytes, force_str, charptr_to_str
+from pysam.libcutils cimport encode_filename, from_string_and_size
+
+import collections
+
+cdef char *StrOrEmpty(char * buffer):
+ if buffer == NULL:
+ return ""
+    else:
+        return buffer
+
+cdef int isNew(char * p, char * buffer, size_t nbytes):
+ """return True if `p` is located within `buffer` of size
+ `nbytes`
+ """
+ if p == NULL:
+ return 0
+ return not (buffer <= p < buffer + nbytes)
+
+
+cdef class TupleProxy:
+ '''Proxy class for access to parsed row as a tuple.
+
+ This class represents a table row for fast read-access.
+
+ Access to individual fields is via the [] operator.
+
+    Fields can also be modified in place; modified fields are
+    kept in separately allocated memory.
+
+ '''
+
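+    # Typical use (sketch; the file name is illustrative): rows
+    # fetched from a TabixFile with parser=asTuple() are TupleProxy
+    # instances -
+    #
+    #     tbx = pysam.TabixFile("example.bed.gz")
+    #     for row in tbx.fetch("chr1", 1000, 2000, parser=pysam.asTuple()):
+    #         print(row[0], int(row[1]), len(row))
+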
+ def __cinit__(self, encoding="ascii"):
+ self.data = NULL
+ self.fields = NULL
+ self.index = 0
+ self.nbytes = 0
+ self.is_modified = 0
+ self.nfields = 0
+ # start counting at field offset
+ self.offset = 0
+ self.encoding = encoding
+
+ def __dealloc__(self):
+ cdef int x
+ if self.is_modified:
+ for x from 0 <= x < self.nfields:
+ if isNew(self.fields[x], self.data, self.nbytes):
+ free(self.fields[x])
+ self.fields[x] = NULL
+
+ if self.data != NULL:
+ free(self.data)
+ if self.fields != NULL:
+ free(self.fields)
+
+ def __copy__(self):
+ if self.is_modified:
+ raise NotImplementedError(
+ "copying modified tuples is not implemented")
+ cdef TupleProxy n = type(self)()
+ n.copy(self.data, self.nbytes, reset=True)
+ return n
+
+ def compare(self, TupleProxy other):
+ '''return -1,0,1, if contents in this are binary
+ <,=,> to *other*
+
+ '''
+ if self.is_modified or other.is_modified:
+ raise NotImplementedError(
+ 'comparison of modified TupleProxies is not implemented')
+ if self.data == other.data:
+ return 0
+
+ if self.nbytes < other.nbytes:
+ return -1
+ elif self.nbytes > other.nbytes:
+ return 1
+ return memcmp(self.data, other.data, self.nbytes)
+
+ def __richcmp__(self, TupleProxy other, int op):
+ if op == 2: # == operator
+ return self.compare(other) == 0
+ elif op == 3: # != operator
+ return self.compare(other) != 0
+ else:
+ err_msg = "op {0} isn't implemented yet".format(op)
+ raise NotImplementedError(err_msg)
+
+ cdef take(self, char * buffer, size_t nbytes):
+ '''start presenting buffer.
+
+ Take ownership of the pointer.
+ '''
+ self.data = buffer
+ self.nbytes = nbytes
+ self.update(buffer, nbytes)
+
+ cdef present(self, char * buffer, size_t nbytes):
+ '''start presenting buffer.
+
+ Do not take ownership of the pointer.
+ '''
+ self.update(buffer, nbytes)
+
+ cdef copy(self, char * buffer, size_t nbytes, bint reset=False):
+ '''start presenting buffer of size *nbytes*.
+
+ Buffer is a '\0'-terminated string without the '\n'.
+
+ Take a copy of buffer.
+ '''
+ # +1 for '\0'
+ cdef int s = sizeof(char) * (nbytes + 1)
+ self.data = <char*>malloc(s)
+ if self.data == NULL:
+ raise ValueError("out of memory in TupleProxy.copy()")
+ memcpy(<char*>self.data, buffer, s)
+
+ if reset:
+ for x from 0 <= x < nbytes:
+ if self.data[x] == '\0':
+ self.data[x] = '\t'
+
+ self.update(self.data, nbytes)
+
+ cpdef int getMinFields(self):
+ '''return minimum number of fields.'''
+        # a single field is not a valid tabix entry, but TupleProxy
+        # is kept more generic
+ return 1
+
+ cpdef int getMaxFields(self):
+ '''return maximum number of fields. Return
+ 0 for unknown length.'''
+ return 0
+
+ cdef update(self, char * buffer, size_t nbytes):
+ '''update internal data.
+
+ *buffer* is a \0 terminated string.
+
+ *nbytes* is the number of bytes in buffer (excluding
+ the \0)
+
+ Update starts work in buffer, thus can be used
+ to collect any number of fields until nbytes
+ is exhausted.
+
+ If max_fields is set, the number of fields is initialized to
+ max_fields.
+
+ '''
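+        # Worked example: for the buffer "1\t2\t3" (nbytes == 5) the
+        # tabs are replaced by '\0' in place and fields[0..2] point at
+        # "1", "2" and "3", giving nfields == 3.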
+ cdef char * pos
+ cdef char * old_pos
+ cdef int field
+ cdef int max_fields, min_fields, x
+
+ assert strlen(buffer) == nbytes, \
+ "length of buffer (%i) != number of bytes (%i)" % (
+ strlen(buffer), nbytes)
+
+ if buffer[nbytes] != 0:
+ raise ValueError("incomplete line at %s" % buffer)
+
+ #################################
+ # remove line breaks and feeds and update number of bytes
+ x = nbytes - 1
+ while x > 0 and (buffer[x] == '\n' or buffer[x] == '\r'):
+ buffer[x] = '\0'
+ x -= 1
+ self.nbytes = x + 1
+
+ #################################
+ # clear data
+        if self.fields != NULL:
+            # free fields that point outside self.data first, then
+            # release the field index itself
+            for field from 0 <= field < self.nfields:
+                if isNew(self.fields[field], self.data, self.nbytes):
+                    free(self.fields[field])
+            free(self.fields)
+
+ self.is_modified = self.nfields = 0
+
+ #################################
+ # allocate new
+ max_fields = self.getMaxFields()
+ # pre-count fields - better would be
+ # to guess or dynamically grow
+ if max_fields == 0:
+ for x from 0 <= x < nbytes:
+ if buffer[x] == '\t':
+ max_fields += 1
+ max_fields += 1
+
+ self.fields = <char **>calloc(max_fields, sizeof(char *))
+ if self.fields == NULL:
+ raise ValueError("out of memory in TupleProxy.update()")
+
+ #################################
+ # start filling
+ field = 0
+ self.fields[field] = pos = buffer
+ field += 1
+ old_pos = pos
+ while 1:
+
+ pos = <char*>memchr(pos, '\t', nbytes)
+ if pos == NULL:
+ break
+ if field >= max_fields:
+ raise ValueError(
+ "parsing error: more than %i fields in line: %s" %
+ (max_fields, buffer))
+
+ pos[0] = '\0'
+ pos += 1
+ self.fields[field] = pos
+ field += 1
+ nbytes -= pos - old_pos
+ if nbytes < 0:
+ break
+ old_pos = pos
+ self.nfields = field
+ if self.nfields < self.getMinFields():
+ raise ValueError(
+ "parsing error: fewer that %i fields in line: %s" %
+ (self.getMinFields(), buffer))
+
+ def _getindex(self, int index):
+ '''return item at idx index'''
+ cdef int i = index
+ if i < 0:
+ i += self.nfields
+ if i < 0:
+ raise IndexError("list index out of range")
+ # apply offset - separating a fixed number
+ # of fields from a variable number such as in VCF
+ i += self.offset
+ if i >= self.nfields:
+ raise IndexError(
+ "list index out of range %i >= %i" %
+ (i, self.nfields))
+ return force_str(self.fields[i], self.encoding)
+
+ def __getitem__(self, key):
+ if type(key) == int:
+ return self._getindex(key)
+ # slice object
+ start, end, step = key.indices(self.nfields)
+ result = []
+ for index in range(start, end, step):
+ result.append(self._getindex(index))
+ return result
+
+ def _setindex(self, index, value):
+ '''set item at idx index.'''
+ cdef int idx = index
+ if idx < 0:
+ raise IndexError("list index out of range")
+ if idx >= self.nfields:
+ raise IndexError("list index out of range")
+
+ if isNew(self.fields[idx], self.data, self.nbytes):
+            free(self.fields[idx])
+
+ self.is_modified = 1
+
+ if value is None:
+ self.fields[idx] = NULL
+ return
+
+ # conversion with error checking
+ value = force_bytes(value)
+ cdef char * tmp = <char*>value
+        self.fields[idx] = <char*>malloc((strlen(tmp) + 1) * sizeof(char))
+ if self.fields[idx] == NULL:
+ raise ValueError("out of memory" )
+ strcpy(self.fields[idx], tmp)
+
+ def __setitem__(self, index, value):
+ '''set item at *index* to *value*'''
+ cdef int i = index
+ if i < 0:
+ i += self.nfields
+ i += self.offset
+
+ self._setindex(i, value)
+
+ def __len__(self):
+ return self.nfields
+
+ def __iter__(self):
+ self.index = 0
+ return self
+
+ def __next__(self):
+ """python version of next().
+ """
+ if self.index >= self.nfields:
+ raise StopIteration
+ cdef char * retval = self.fields[self.index]
+ self.index += 1
+ if retval == NULL:
+ return None
+ else:
+ return force_str(retval, self.encoding)
+
+ def __str__(self):
+ '''return original data'''
+ # copy and replace \0 bytes with \t characters
+ cdef char * cpy
+ if self.is_modified:
+ # todo: treat NULL values
+ result = []
+            for x in range(self.nfields):
+ result.append(StrOrEmpty(self.fields[x]).decode(self.encoding))
+ return "\t".join(result)
+ else:
+ cpy = <char*>calloc(sizeof(char), self.nbytes+1)
+ if cpy == NULL:
+ raise ValueError("out of memory")
+ memcpy(cpy, self.data, self.nbytes+1)
+ for x from 0 <= x < self.nbytes:
+ if cpy[x] == '\0':
+ cpy[x] = '\t'
+ result = cpy[:self.nbytes]
+ free(cpy)
+ r = result.decode(self.encoding)
+ return r
+
+def toDot(v):
+ '''convert value to '.' if None'''
+ if v is None:
+ return "."
+ else:
+ return str(v)
+
+def quote(v):
+ '''return a quoted attribute.'''
+ if isinstance(v, str):
+ return '"%s"' % v
+ else:
+ return str(v)
+
+
+cdef class GTFProxy(TupleProxy):
+ '''Proxy class for access to GTF fields.
+
+ This class represents a GTF entry for fast read-access.
+ Write-access has been added as well, though some care must
+ be taken. If any of the string fields (contig, source, ...)
+ are set, the new value is tied to the lifetime of the
+ argument that was supplied.
+
+ The only exception is the attributes field when set from
+ a dictionary - this field will manage its own memory.
+ '''
+
+ def __cinit__(self):
+ # automatically calls TupleProxy.__cinit__
+ self.hasOwnAttributes = False
+ self._attributes = NULL
+
+ def __dealloc__(self):
+ # automatically calls TupleProxy.__dealloc__
+ if self.hasOwnAttributes:
+ free(self._attributes)
+
+ cpdef int getMinFields(self):
+ '''return minimum number of fields.'''
+ return 9
+
+ cpdef int getMaxFields(self):
+ '''return max number of fields.'''
+ return 9
+
+ property contig:
+ '''contig of feature.'''
+ def __get__(self):
+ return self._getindex(0)
+ def __set__(self, value):
+ self._setindex(0, value)
+
+ property source:
+ '''feature source.'''
+ def __get__(self):
+ return self._getindex(1)
+ def __set__(self, value):
+ if value is None:
+ value = "."
+ self._setindex(1, value)
+
+ property feature:
+ '''feature name.'''
+ def __get__(self):
+ return self._getindex(2)
+ def __set__(self, value):
+ if value is None:
+ value = "."
+ self._setindex(2, value)
+
+ property start:
+        '''feature start (0-based, half-open coordinates).'''
+        def __get__(self):
+            return int(self._getindex(3)) - 1
+        def __set__(self, value):
+            self._setindex(3, str(value + 1))
+
+ property end:
+        '''feature end (0-based, half-open coordinates).'''
+ def __get__(self):
+ return int(self._getindex(4))
+ def __set__(self, value):
+ self._setindex(4, str(value))
+
+ property score:
+ '''feature score.'''
+ def __get__(self):
+ v = self._getindex(5)
+ if v == "" or v[0] == '.':
+ return None
+ else:
+ return float(v)
+
+ def __set__(self, value):
+ if value is None:
+ value = "."
+ self._setindex(5, str(value))
+
+ property strand:
+ '''feature strand.'''
+ def __get__(self):
+ return self._getindex(6)
+ def __set__(self, value ):
+ if value is None:
+ value = "."
+ self._setindex(6, value)
+
+ property frame:
+ '''feature frame.'''
+ def __get__(self):
+ v = self._getindex(7)
+ if v == "" or v[0] == '.':
+ return v
+ else:
+ return int(v)
+
+ def __set__(self, value):
+ if value is None:
+ value = "."
+ self._setindex(7, str(value))
+
+ property attributes:
+ '''feature attributes (as a string).'''
+ def __get__(self):
+ if self.hasOwnAttributes:
+ return force_str(self._attributes)
+ else:
+ return force_str(self._getindex(8))
+ def __set__( self, value):
+ if self.hasOwnAttributes:
+ free(self._attributes)
+ self._attributes = NULL
+ self.hasOwnAttributes = False
+ self._setindex(8, value)
+
+ cdef char * getAttributes(self):
+ '''return pointer to attributes.'''
+ cdef char * attributes
+ if self.hasOwnAttributes:
+ attributes = self._attributes
+ else:
+ attributes = self.fields[8]
+ if attributes == NULL:
+ raise KeyError("no attributes defined GTF entry")
+ return attributes
+
+ def asDict(self):
+ """parse attributes - return as dict
+ """
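+        # Example (sketch): the attribute string
+        #   'gene_id "ENSG1"; exon_number 1;'
+        # is returned as
+        #   OrderedDict([('gene_id', 'ENSG1'), ('exon_number', 1)])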
+
+        # get the attribute string
+ attributes = self.attributes
+
+ # separate into fields
+ # Fields might contain a ";", for example in ENSEMBL GTF file
+ # for mouse, v78:
+ # ...; transcript_name "TXNRD2;-001"; ....
+ # The current heuristic is to split on a semicolon followed by a
+ # space, see also http://mblab.wustl.edu/GTF22.html
+
+ # Remove white space to prevent a last empty field.
+ fields = [x.strip() for x in attributes.strip().split("; ")]
+
+ result = collections.OrderedDict()
+
+ for f in fields:
+
+ # strip semicolon (GTF files without a space after the last semicolon)
+ if f.endswith(";"):
+ f = f[:-1]
+
+ # split at most once in order to avoid separating
+ # multi-word values
+ d = [x.strip() for x in f.split(" ", 1)]
+
+            # f.split(" ", 1) yields at most two elements; treat an
+            # attribute without a value as a flag (assumption: stored
+            # as None)
+            if len(d) == 1:
+                result[d[0]] = None
+                continue
+            n, v = d[0], d[1]
+
+ if v[0] == '"' and v[-1] == '"':
+ v = v[1:-1]
+            else:
+                # try to convert to a number: int first, falling back
+                # to float so that fractional values are preserved
+                try:
+                    v = int(v)
+                except (ValueError, TypeError):
+                    try:
+                        v = float(v)
+                    except (ValueError, TypeError):
+                        pass
+
+ result[n] = v
+
+ return result
+
+ def fromDict(self, d):
+ '''set attributes from a dictionary.'''
+ cdef char * p
+ cdef int l
+
+ # clean up if this field is set twice
+ if self.hasOwnAttributes:
+ free(self._attributes)
+
+ aa = []
+ for k,v in d.items():
+ if isinstance(v, str):
+ aa.append( '%s "%s"' % (k,v) )
+ else:
+ aa.append( '%s %s' % (k,str(v)) )
+
+ a = force_bytes("; ".join(aa) + ";")
+ p = a
+ l = len(a)
+ self._attributes = <char *>calloc(l + 1, sizeof(char))
+ if self._attributes == NULL:
+ raise ValueError("out of memory")
+ memcpy(self._attributes, p, l)
+
+ self.hasOwnAttributes = True
+ self.is_modified = True
+
+ def __str__(self):
+ cdef char * cpy
+ cdef int x
+
+ if self.is_modified:
+ return "\t".join(
+ (self.contig,
+ self.source,
+ self.feature,
+ str(self.start+1),
+ str(self.end),
+ toDot(self.score),
+ toDot(self.strand),
+ toDot(self.frame),
+ self.attributes))
+ else:
+ return TupleProxy.__str__(self)
+
+ def invert(self, int lcontig):
+ '''invert coordinates to negative strand coordinates
+
+ This method will only act if the feature is on the
+ negative strand.'''
+
+ if self.strand[0] == '-':
+ start = min(self.start, self.end)
+ end = max(self.start, self.end)
+ self.start, self.end = lcontig - end, lcontig - start
+
+ def keys(self):
+ '''return a list of attributes defined in this entry.'''
+ r = self.attributes
+ return [x.strip().split(" ")[0]
+ # separator is ';' followed by space
+ for x in r.split("; ") if x.strip() != '']
+
+ def __getitem__(self, key):
+ return self.__getattr__(key)
+
+ def __getattr__(self, item):
+ """Generic lookup of attribute from GFF/GTF attributes
+ Only called if there *isn't* an attribute with this name
+ """
+ cdef char * start
+ cdef char * query
+ cdef char * cpy
+ cdef char * end
+ cdef int l
+
+ #
+ # important to use the getAttributes function.
+ # Using the self.attributes property to access
+ # the attributes caused a hard-to-trace bug
+ # in which fields in the attribute string were
+ # set to 0.
+ # Running through valgrind complained that
+ # memory was accessed in the memory field
+ # that has been released. It is not clear
+ # why this happened and might be a cython bug
+ # (Version 0.16). The valgrind warnings
+        # disappeared after accessing the C data structures
+ # directly and so did the bug.
+ cdef char * attributes = self.getAttributes()
+ if attributes == NULL:
+ raise KeyError("key %s not found, no attributes" % item)
+
+ # add space in order to make sure
+ # to not pick up a field that is a prefix of another field
+ r = force_bytes(item + " ")
+ query = r
+ start = strstr(attributes, query)
+
+ if start == NULL:
+ raise AttributeError("'GTFProxy' has no attribute '%s'" % item)
+
+ start += strlen(query)
+ # skip gaps before
+ while start[0] == ' ':
+ start += 1
+
+ if start[0] == '"':
+ start += 1
+ end = start
+ while end[0] != '\0' and end[0] != '"':
+ end += 1
+ l = end - start
+ result = force_str(PyBytes_FromStringAndSize(start, l),
+ self.encoding)
+ return result
+ else:
+ return force_str(start, self.encoding)
+
+ def setAttribute(self, name, value):
+ '''convenience method to set an attribute.'''
+ r = self.asDict()
+ r[name] = value
+ self.fromDict(r)
+
+ def __cmp__(self, other):
+ return (self.contig, self.strand, self.start) < \
+ (other.contig, other.strand, other.start)
+
+ # python 3 compatibility
+ def __richcmp__(GTFProxy self, GTFProxy other, int op):
+ if op == 0:
+ return (self.contig, self.strand, self.start) < \
+ (other.contig, other.strand, other.start)
+ elif op == 1:
+ return (self.contig, self.strand, self.start) <= \
+ (other.contig, other.strand, other.start)
+ elif op == 2:
+ return self.compare(other) == 0
+ elif op == 3:
+ return self.compare(other) != 0
+ else:
+ err_msg = "op {0} isn't implemented yet".format(op)
+ raise NotImplementedError(err_msg)
+
+
+cdef class NamedTupleProxy(TupleProxy):
+
+ map_key2field = {}
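+    # Subclasses map attribute names to (column index, type caster)
+    # pairs, e.g. {'contig': (0, str), 'start': (1, int)}; __getattr__
+    # and __setattr__ below translate attribute access into field
+    # access.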
+
+ def __setattr__(self, key, value):
+ '''set attribute.'''
+ cdef int idx
+ idx, f = self.map_key2field[key]
+        if self.nfields <= idx:
+ raise KeyError("field %s not set" % key)
+ TupleProxy.__setitem__(self, idx, str(value))
+
+ def __getattr__(self, key):
+ cdef int idx
+ idx, f = self.map_key2field[key]
+        if self.nfields <= idx:
+ raise KeyError("field %s not set" % key)
+ if f == str:
+ return force_str(self.fields[idx],
+ self.encoding)
+ return f(self.fields[idx])
+
+
+cdef class BedProxy(NamedTupleProxy):
+ '''Proxy class for access to Bed fields.
+
+ This class represents a BED entry for fast read-access.
+ '''
+ map_key2field = {
+ 'contig' : (0, str),
+ 'start' : (1, int),
+ 'end' : (2, int),
+ 'name' : (3, str),
+ 'score' : (4, float),
+ 'strand' : (5, str),
+ 'thickStart' : (6, int),
+ 'thickEnd' : (7, int),
+ 'itemRGB' : (8, str),
+ 'blockCount': (9, int),
+ 'blockSizes': (10, str),
+ 'blockStarts': (11, str), }
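+    # Example (sketch): for the row "chr1\t99\t200\tx" the proxy
+    # yields proxy.contig == "chr1", proxy.start == 99,
+    # proxy.end == 200 and proxy.name == "x".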
+
+ cpdef int getMinFields(self):
+ '''return minimum number of fields.'''
+ return 3
+
+ cpdef int getMaxFields(self):
+ '''return max number of fields.'''
+ return 12
+
+ cdef update(self, char * buffer, size_t nbytes):
+ '''update internal data.
+
+ nbytes does not include the terminal '\0'.
+ '''
+ TupleProxy.update(self, buffer, nbytes)
+
+ if self.nfields < 3:
+ raise ValueError(
+ "bed format requires at least three columns")
+
+ # determines bed format
+ self.bedfields = self.nfields
+
+ # do automatic conversion
+ self.contig = self.fields[0]
+ self.start = atoi(self.fields[1])
+ self.end = atoi(self.fields[2])
+
+ # __setattr__ in base class seems to take precedence
+ # hence implement setters in __setattr__
+ #property start:
+ # def __get__( self ): return self.start
+ #property end:
+ # def __get__( self ): return self.end
+
+ def __str__(self):
+
+ cdef int save_fields = self.nfields
+ # ensure fields to use correct format
+ self.nfields = self.bedfields
+ retval = TupleProxy.__str__(self)
+ self.nfields = save_fields
+ return retval
+
+ def __setattr__(self, key, value ):
+ '''set attribute.'''
+ if key == "start":
+ self.start = value
+ elif key == "end":
+ self.end = value
+
+ cdef int idx
+ idx, f = self.map_key2field[key]
+ TupleProxy._setindex(self, idx, str(value) )
+
+cdef class VCFProxy(NamedTupleProxy):
+ '''Proxy class for access to VCF fields.
+
+ The genotypes are accessed via a numeric index.
+ Sample headers are not available.
+ '''
+ map_key2field = {
+ 'contig' : (0, str),
+ 'pos' : (1, int),
+ 'id' : (2, str),
+ 'ref' : (3, str),
+ 'alt' : (4, str),
+ 'qual' : (5, str),
+ 'filter' : (6, str),
+ 'info' : (7, str),
+ 'format' : (8, str) }
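+    # offset = 9 (set below) means proxy[0] is the first genotype
+    # column; the fixed VCF columns are reached by name instead,
+    # e.g. proxy.pos or proxy.ref.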
+
+ def __cinit__(self):
+ # automatically calls TupleProxy.__cinit__
+ # start indexed access at genotypes
+ self.offset = 9
+
+ cdef update(self, char * buffer, size_t nbytes):
+ '''update internal data.
+
+ nbytes does not include the terminal '\0'.
+ '''
+ TupleProxy.update(self, buffer, nbytes)
+
+ self.contig = self.fields[0]
+ # vcf counts from 1 - correct here
+ self.pos = atoi(self.fields[1]) - 1
+
+ def __len__(self):
+ '''return number of genotype fields.'''
+ return max(0, self.nfields - 9)
+
+ property pos:
+        '''feature position (0-based).'''
+ def __get__(self):
+ return self.pos
+
+ def __setattr__(self, key, value):
+ '''set attribute.'''
+ if key == "pos":
+ self.pos = value
+ value += 1
+
+ cdef int idx
+ idx, f = self.map_key2field[key]
+ TupleProxy._setindex(self, idx, str(value))
+
--- /dev/null
+#########################################################################
+# Utility functions used across pysam
+#########################################################################
+cimport cython
+from cpython cimport array as c_array
+
+cpdef parse_region(reference=*, start=*, end=*, region=*)
+
+#########################################################################
+# Utility functions for quality string conversions
+
+cpdef c_array.array qualitystring_to_array(input_str, int offset=*)
+cpdef array_to_qualitystring(c_array.array arr, int offset=*)
+cpdef qualities_to_qualitystring(qualities, int offset=*)
+
+########################################################################
+########################################################################
+########################################################################
+## Python 3 compatibility functions
+########################################################################
+cdef charptr_to_str(const char *s, encoding=*)
+cdef bytes charptr_to_bytes(const char *s, encoding=*)
+cdef charptr_to_str_w_len(const char* s, size_t n, encoding=*)
+cdef force_str(object s, encoding=*)
+cdef bytes force_bytes(object s, encoding=*)
+cdef bytes encode_filename(object filename)
+cdef from_string_and_size(const char *s, size_t length)
+
+cdef extern from "pysam_util.h":
+
+ int samtools_main(int argc, char *argv[])
+ int bcftools_main(int argc, char *argv[])
+ void pysam_set_stderr(int fd)
+ void pysam_unset_stderr()
+ void pysam_set_stdout(int fd)
+ void pysam_set_stdout_fn(const char *)
+ void pysam_unset_stdout()
+ void set_optind(int)
--- /dev/null
+import types
+import sys
+import string
+import re
+import tempfile
+import os
+import io
+from contextlib import contextmanager
+
+from cpython.version cimport PY_MAJOR_VERSION, PY_MINOR_VERSION
+from cpython cimport PyBytes_Check, PyUnicode_Check
+from cpython cimport array as c_array
+from libc.stdlib cimport calloc, free
+from libc.string cimport strncpy
+from libc.stdio cimport fprintf, stderr, fflush
+from libc.stdio cimport stdout as c_stdout
+from posix.fcntl cimport open as c_open, O_WRONLY
+
+#####################################################################
+# hard-coded constants
+cdef int MAX_POS = 2 << 29
+
+#################################################################
+# Utility functions for quality string conversions
+cpdef c_array.array qualitystring_to_array(input_str, int offset=33):
+ """convert a qualitystring to an array of quality values."""
+ if input_str is None:
+ return None
+ qs = force_bytes(input_str)
+ cdef char i
+ return c_array.array('B', [i - offset for i in qs])
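+# Example: qualitystring_to_array("III") == array('B', [40, 40, 40]),
+# since ord('I') - 33 == 40.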
+
+
+cpdef array_to_qualitystring(c_array.array qualities, int offset=33):
+ """convert an array of quality values to a string."""
+ if qualities is None:
+ return None
+ cdef int x
+
+ cdef c_array.array result
+ result = c_array.clone(qualities, len(qualities), zero=False)
+
+ for x from 0 <= x < len(qualities):
+ result[x] = qualities[x] + offset
+ return force_str(result.tostring())
+
+
+cpdef qualities_to_qualitystring(qualities, int offset=33):
+ """convert a list or array of quality scores to the string
+ representation used in the SAM format.
+
+ Parameters
+ ----------
+ offset : int
+ offset to be added to the quality scores to arrive at
+ the characters of the quality string (default=33).
+
+ Returns
+ -------
+ string
+ a quality string
+
+ """
+ cdef char x
+ if qualities is None:
+ return None
+ elif isinstance(qualities, c_array.array):
+ return array_to_qualitystring(qualities, offset=offset)
+ else:
+ # tuples and lists
+ return force_str("".join([chr(x + offset) for x in qualities]))
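+# Example round-trip (sketch): qualities_to_qualitystring([40, 30, 20])
+# == "I?5", since chr(40 + 33) == 'I', chr(30 + 33) == '?' and
+# chr(20 + 33) == '5'.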
+
+
+########################################################################
+########################################################################
+########################################################################
+## Python 3 compatibility functions
+########################################################################
+
+cdef bint IS_PYTHON3 = PY_MAJOR_VERSION >= 3
+
+cdef from_string_and_size(const char* s, size_t length):
+ if IS_PYTHON3:
+ return s[:length].decode("ascii")
+ else:
+ return s[:length]
+
+
+# filename encoding (adapted from lxml.etree.pyx)
+cdef str FILENAME_ENCODING = sys.getfilesystemencoding() or sys.getdefaultencoding() or 'ascii'
+
+
+cdef bytes encode_filename(object filename):
+ """Make sure a filename is 8-bit encoded (or None)."""
+ if filename is None:
+ return None
+ elif PY_MAJOR_VERSION >= 3 and PY_MINOR_VERSION >= 2:
+ # Added to support path-like objects
+ return os.fsencode(filename)
+ elif PyBytes_Check(filename):
+ return filename
+ elif PyUnicode_Check(filename):
+ return filename.encode(FILENAME_ENCODING)
+ else:
+ raise TypeError("Argument must be string or unicode.")
+
+
+cdef bytes force_bytes(object s, encoding="ascii"):
+ """convert string or unicode object to bytes, assuming
+ ascii encoding.
+ """
+ if s is None:
+ return None
+ elif PyBytes_Check(s):
+ return s
+ elif PyUnicode_Check(s):
+ return s.encode(encoding)
+ else:
+ raise TypeError("Argument must be string, bytes or unicode.")
+
+
+cdef charptr_to_str(const char* s, encoding="ascii"):
+ if s == NULL:
+ return None
+ if PY_MAJOR_VERSION < 3:
+ return s
+ else:
+ return s.decode(encoding)
+
+
+cdef charptr_to_str_w_len(const char* s, size_t n, encoding="ascii"):
+ if s == NULL:
+ return None
+ if PY_MAJOR_VERSION < 3:
+ return s[:n]
+ else:
+ return s[:n].decode(encoding)
+
+
+cdef bytes charptr_to_bytes(const char* s, encoding="ascii"):
+ if s == NULL:
+ return None
+ else:
+ return s
+
+
+cdef force_str(object s, encoding="ascii"):
+ """Return s converted to str type of current Python
+ (bytes in Py2, unicode in Py3)"""
+ if s is None:
+ return None
+ if PY_MAJOR_VERSION < 3:
+ return s
+ elif PyBytes_Check(s):
+ return s.decode(encoding)
+ else:
+ # assume unicode
+ return s
+
+
+cpdef parse_region(reference=None,
+ start=None,
+ end=None,
+ region=None):
+ """parse alternative ways to specify a genomic region. A region can
+ either be specified by :term:`reference`, `start` and
+ `end`. `start` and `end` denote 0-based, half-open
+ intervals.
+
+ Alternatively, a samtools :term:`region` string can be
+ supplied.
+
+ If any of the coordinates are missing they will be replaced by the
+ minimum (`start`) or maximum (`end`) coordinate.
+
+ Note that region strings are 1-based, while `start` and `end` denote
+ an interval in python coordinates.
+
+ Returns
+ -------
+
+ tuple : a tuple of `reference`, `start` and `end`.
+
+ Raises
+ ------
+
+ ValueError
+ for invalid or out of bounds regions.
+
+ """
+ cdef int rtid
+ cdef long long rstart
+ cdef long long rend
+
+ rtid = -1
+ rstart = 0
+ rend = MAX_POS
+ if start != None:
+ try:
+ rstart = start
+ except OverflowError:
+ raise ValueError('start out of range (%i)' % start)
+
+ if end != None:
+ try:
+ rend = end
+ except OverflowError:
+ raise ValueError('end out of range (%i)' % end)
+
+ if region:
+ region = force_str(region)
+ parts = re.split("[:-]", region)
+ reference = parts[0]
+ if len(parts) >= 2:
+ rstart = int(parts[1]) - 1
+ if len(parts) >= 3:
+ rend = int(parts[2])
+
+ if not reference:
+ return None, 0, 0
+
+ if not 0 <= rstart < MAX_POS:
+ raise ValueError('start out of range (%i)' % rstart)
+ if not 0 <= rend <= MAX_POS:
+ raise ValueError('end out of range (%i)' % rend)
+ if rstart > rend:
+ raise ValueError(
+ 'invalid region: start (%i) > end (%i)' % (rstart, rend))
+
+ return force_bytes(reference), rstart, rend
+
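+# Examples (sketch) - both calls describe the same interval:
+#
+#     parse_region("chr1", 99, 200)        # -> (b"chr1", 99, 200)
+#     parse_region(region="chr1:100-200")  # -> (b"chr1", 99, 200)
+#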
+
+def _pysam_dispatch(collection,
+ method,
+ args=None,
+ catch_stdout=True,
+ save_stdout=None):
+ '''call ``method`` in samtools/bcftools providing arguments in args.
+
+ Catching of stdout can be turned off by setting *catch_stdout* to
+ False.
+
+ '''
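+    # Sketch of a typical internal call (file name illustrative):
+    #
+    #     retval, err, out = _pysam_dispatch("samtools", "flagstat",
+    #                                        ["ex1.bam"])
+    #
+    # stdout/stderr of the run are captured via temporary files and
+    # returned as strings.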
+
+ if method == "index":
+ if not os.path.exists(args[0]):
+ raise IOError("No such file or directory: '%s'" % args[0])
+
+ if args is None:
+ args = []
+ else:
+ args = list(args)
+
+ # redirect stderr to file
+ stderr_h, stderr_f = tempfile.mkstemp()
+ pysam_set_stderr(stderr_h)
+
+ # redirect stdout to file
+ if save_stdout:
+ stdout_f = save_stdout
+ stdout_h = c_open(force_bytes(stdout_f),
+ O_WRONLY)
+ if stdout_h == -1:
+ raise OSError("error while opening {} for writing".format(stdout_f))
+
+ pysam_set_stdout_fn(force_bytes(stdout_f))
+ pysam_set_stdout(stdout_h)
+ elif catch_stdout:
+ stdout_h, stdout_f = tempfile.mkstemp()
+
+ MAP_STDOUT_OPTIONS = {
+ "samtools": {
+ "view": "-o {}",
+ "mpileup": "-o {}",
+ "depad": "-o {}",
+ "calmd": "", # uses pysam_stdout_fn
+ },
+ "bcftools": {}
+ }
+
+ stdout_option = None
+ if collection == "bcftools":
+ # in bcftools, most methods accept -o, the exceptions
+ # are below:
+ if method not in ("index", "roh", "stats"):
+ stdout_option = "-o {}"
+ elif method in MAP_STDOUT_OPTIONS[collection]:
+ # special case - samtools view -c outputs on stdout
+ if not(method == "view" and "-c" in args):
+ stdout_option = MAP_STDOUT_OPTIONS[collection][method]
+
+ if stdout_option is not None:
+ os.close(stdout_h)
+ pysam_set_stdout_fn(force_bytes(stdout_f))
+ args.extend(stdout_option.format(stdout_f).split(" "))
+ else:
+ pysam_set_stdout(stdout_h)
+ else:
+ pysam_set_stdout_fn("-")
+
+ # setup the function call to samtools/bcftools main
+ cdef char ** cargs
+ cdef int i, n, retval, l
+ n = len(args)
+ method = force_bytes(method)
+ collection = force_bytes(collection)
+ args = [force_bytes(a) for a in args]
+
+ # allocate two more for first (dummy) argument (contains command)
+ cdef int extra_args = 0
+ if method == b"index":
+ extra_args = 1
+ # add extra arguments for commands accepting optional arguments
+ # such as 'samtools index x.bam [out.index]'
+ cargs = <char**>calloc(n + 2 + extra_args, sizeof(char *))
+ cargs[0] = collection
+ cargs[1] = method
+
+ # create copies of strings - getopt for long options permutes
+ # arguments
+ for i from 0 <= i < n:
+ l = len(args[i])
+ cargs[i + 2] = <char *>calloc(l + 1, sizeof(char))
+ strncpy(cargs[i + 2], args[i], l)
+
+ # reset getopt. On OsX there getopt reset is different
+ # between getopt and getopt_long
+ if method in [b'index', b'cat', b'quickcheck',
+ b'faidx', b'kprobaln']:
+ set_optind(1)
+ else:
+ set_optind(0)
+
+ # call samtools/bcftools
+ if collection == b"samtools":
+ retval = samtools_main(n + 2, cargs)
+ elif collection == b"bcftools":
+ retval = bcftools_main(n + 2, cargs)
+
+ for i from 0 <= i < n:
+ free(cargs[i + 2])
+ free(cargs)
+
+ # get error messages
+ def _collect(fn):
+ out = []
+ try:
+ with open(fn, "r") as inf:
+ out = inf.read()
+ except UnicodeDecodeError:
+ with open(fn, "rb") as inf:
+ # read binary output
+ out = inf.read()
+ finally:
+ os.remove(fn)
+ return out
+
+ pysam_unset_stderr()
+ out_stderr = _collect(stderr_f)
+
+ if save_stdout:
+ pysam_unset_stdout()
+ out_stdout = None
+ elif catch_stdout:
+ pysam_unset_stdout()
+ out_stdout = _collect(stdout_f)
+ else:
+ out_stdout = None
+
+ return retval, out_stderr, out_stdout
+
+
+__all__ = ["qualitystring_to_array",
+ "array_to_qualitystring",
+ "qualities_to_qualitystring"]
--- /dev/null
+# cython: embedsignature=True
+#
+# Code to read, write and edit VCF files
+#
+# VCF lines are encoded as a dictionary with these keys (note: all lowercase):
+# 'chrom': string
+# 'pos': integer
+# 'id': string
+# 'ref': string
+# 'alt': list of strings
+# 'qual': integer
+# 'filter': None (missing value), or list of keys (strings); empty list parsed as ["PASS"]
+# 'info': dictionary of values (see below)
+# 'format': list of keys (strings)
+# sample keys: dictionary of values (see below)
+#
+# The sample keys are accessible through vcf.getsamples()
+#
+# A dictionary of values contains value keys (defined in ##INFO or
+# ##FORMAT lines) which map to a list, containing integers, floats,
+# strings, or characters. Missing values are replaced by a particular
+# value, often -1 or .
+#
+# Genotypes are not stored as a string, but as a list of 1 or 3
+# elements (for haploid and diploid samples), the first (and last) the
+# integer representing an allele, and the second the separation
+# character. Note that there is just one genotype per sample, but for
+# consistency the single element is stored in a list.
+#
+# Header lines other than ##INFO, ##FORMAT and ##FILTER are stored as
+# (key, value) pairs and are accessible through getheader()
+#
+# The VCF class can be instantiated with a 'regions' variable
+# consisting of tuples (chrom,start,end) encoding 0-based half-open
+# segments. Only variants with a position inside the segment will be
+# parsed. A regions parser is available under parse_regions.
+#
+# When instantiated, a reference can be passed to the VCF class. This
+# may be any class that supports a fetch(chrom, start, end) method.
+#
+# NOTE: the position that is returned to Python is 0-based, NOT
+# 1-based as in the VCF file.
+# NOTE: There is also preliminary VCF functionality in the VariantFile class.
+#
+# TODO:
+# only v4.0 writing is complete; alleles are not converted to v3.3 format
+#
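+# Usage sketch (assumes a tabix-indexed "example.vcf.gz" and the
+# connect()/fetch() methods of the VCF class below):
+#
+#   vcf = pysam.VCF()
+#   vcf.connect("example.vcf.gz")
+#   for record in vcf.fetch("chr1", 1000, 2000):
+#       print(record.contig, record.pos, record.ref, record.alt)
+#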
+
+from collections import namedtuple, defaultdict
+from operator import itemgetter
+import sys, re, copy, bisect
+
+from libc.stdlib cimport atoi
+from libc.stdint cimport int8_t, int16_t, int32_t, int64_t
+from libc.stdint cimport uint8_t, uint16_t, uint32_t, uint64_t
+
+cimport pysam.libctabix as libctabix
+cimport pysam.libctabixproxies as libctabixproxies
+
+from pysam.libcutils cimport force_str
+
+import pysam
+
+gtsRegEx = re.compile("[|/\\\\]")
+alleleRegEx = re.compile('^[ACGTN]+$')
+
+# Utility function. Uses 0-based coordinates
+def get_sequence(chrom, start, end, fa):
+ # obtain sequence from .fa file, without truncation
+ if end<=start: return ""
+ if not fa: return "N"*(end-start)
+ if start<0: return "N"*(-start) + get_sequence(chrom, 0, end, fa).upper()
+ sequence = fa.fetch(chrom, start, end).upper()
+ if len(sequence) < end-start: sequence += "N"*(end-start-len(sequence))
+ return sequence
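+# Example: with no fasta file, get_sequence("chr1", -2, 3, None)
+# returns "NNNNN" - out-of-range positions are padded with N.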
+
+# Utility function. Parses a region string
+def parse_regions( string ):
+ result = []
+ for r in string.split(','):
+ elts = r.split(':')
+ chrom, start, end = elts[0], 0, 3000000000
+ if len(elts)==1: pass
+ elif len(elts)==2:
+ if len(elts[1])>0:
+ ielts = elts[1].split('-')
+            if len(ielts) != 2: raise ValueError("Don't understand region string '%s'" % r)
+ try: start, end = int(ielts[0])-1, int(ielts[1])
+ except: raise ValueError("Don't understand region string '%s'" % r)
+ else:
+ raise ValueError("Don't understand region string '%s'" % r)
+ result.append( (chrom,start,end) )
+ return result
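+# Example (sketch):
+#     parse_regions("chr1:100-200,chr2")
+#     -> [("chr1", 99, 200), ("chr2", 0, 3000000000)]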
+
+
+FORMAT = namedtuple('FORMAT','id numbertype number type description missingvalue')
+
+###########################################################################################################
+#
+# New class
+#
+###########################################################################################################
+
+cdef class VCFRecord(libctabixproxies.TupleProxy):
+ '''vcf record.
+
+ initialized from data and vcf meta
+ '''
+
+ cdef vcf
+ cdef char * contig
+ cdef uint32_t pos
+
+ def __init__(self, vcf):
+ self.vcf = vcf
+ self.encoding = vcf.encoding
+
+ # if len(data) != len(self.vcf._samples):
+ # self.vcf.error(str(data),
+ # self.BAD_NUMBER_OF_COLUMNS,
+ # "expected %s for %s samples (%s), got %s" % \
+ # (len(self.vcf._samples),
+ # len(self.vcf._samples),
+ # self.vcf._samples,
+ # len(data)))
+
+ def __cinit__(self, vcf):
+ # start indexed access at genotypes
+ self.offset = 9
+
+ self.vcf = vcf
+ self.encoding = vcf.encoding
+
+ def error(self, line, error, opt=None):
+ '''raise error.'''
+ # pass to vcf file for error handling
+ return self.vcf.error(line, error, opt)
+
+ cdef update(self, char * buffer, size_t nbytes):
+ '''update internal data.
+
+ nbytes does not include the terminal '\0'.
+ '''
+ libctabixproxies.TupleProxy.update(self, buffer, nbytes)
+
+ self.contig = self.fields[0]
+ # vcf counts from 1 - correct here
+ self.pos = atoi(self.fields[1]) - 1
+
+ def __len__(self):
+ return max(0, self.nfields - 9)
+
+ property contig:
+ def __get__(self): return self.contig
+
+ property pos:
+ def __get__(self): return self.pos
+
+ property id:
+ def __get__(self): return self.fields[2]
+
+ property ref:
+ def __get__(self):
+ return self.fields[3]
+
+ property alt:
+ def __get__(self):
+ # convert v3.3 to v4.0 alleles below
+            alt = force_str(self.fields[4])
+ if alt == ".": alt = []
+ else: alt = alt.upper().split(',')
+ return alt
+
+ property qual:
+ def __get__(self):
+ qual = self.fields[5]
+ if qual == b".": qual = -1
+ else:
+ try: qual = float(qual)
+                except ValueError: self.vcf.error(str(self), self.vcf.QUAL_NOT_NUMERICAL)
+ return qual
+
+ property filter:
+ def __get__(self):
+            f = force_str(self.fields[6])
+            # postpone checking that filters exist. Encode missing filter or no filtering as empty list
+            if f == "." or f == "PASS" or f == "0": return []
+            else: return f.split(';')
+
+ property info:
+ def __get__(self):
+            col = force_str(self.fields[7])
+            # dictionary of keys, and list of values
+            info = {}
+            if col != ".":
+                for blurp in col.split(';'):
+                    elts = blurp.split('=')
+                    if len(elts) == 1: v = None
+                    elif len(elts) == 2: v = elts[1]
+                    else: self.vcf.error(str(self), self.vcf.ERROR_INFO_STRING)
+                    info[elts[0]] = self.vcf.parse_formatdata(elts[0], v, self.vcf._info, str(self))
+ return info
+
+ property format:
+ def __get__(self):
+            return force_str(self.fields[8]).split(':')
+
+ property samples:
+ def __get__(self):
+ return self.vcf._samples
+
+ def __getitem__(self, key):
+
+ # parse sample columns
+        values = force_str(self.fields[self.vcf._sample2column[key]]).split(':')
+ alt = self.alt
+ format = self.format
+
+ if len(values) > len(format):
+            self.vcf.error(str(self), self.vcf.BAD_NUMBER_OF_VALUES,
+                           "(found %s values in element %s; expected %s)" %
+                           (len(values),key,len(format)))
+
+ result = {}
+ for idx in range(len(format)):
+ expected = self.vcf.get_expected(format[idx], self.vcf._format, alt)
+ if idx < len(values): value = values[idx]
+ else:
+ if expected == -1: value = "."
+ else: value = ",".join(["."]*expected)
+
+            result[format[idx]] = self.vcf.parse_formatdata(format[idx], value, self.vcf._format, str(self))
+            if expected != -1 and len(result[format[idx]]) != expected:
+                self.vcf.error(str(self), self.vcf.BAD_NUMBER_OF_PARAMETERS,
+                               "id=%s, expected %s parameters, got %s" % (format[idx],expected,result[format[idx]]))
+            if len(result[format[idx]]) < expected: result[format[idx]] += [result[format[idx]][-1]]*(expected-len(result[format[idx]]))
+ result[format[idx]] = result[format[idx]][:expected]
+
+ return result
+
+
+cdef class asVCFRecord(libctabix.Parser):
+ '''converts a :term:`tabix row` into a VCF record.'''
+ cdef vcffile
+ def __init__(self, vcffile):
+ self.vcffile = vcffile
+
+ cdef parse(self, char * buffer, int len):
+ cdef VCFRecord r
+ r = VCFRecord(self.vcffile)
+ r.copy(buffer, len)
+ return r
+
+class VCF(object):
+
+ # types
+ NT_UNKNOWN = 0
+ NT_NUMBER = 1
+ NT_ALLELES = 2
+ NT_NR_ALLELES = 3
+ NT_GENOTYPES = 4
+ NT_PHASED_GENOTYPES = 5
+
+ _errors = { 0:"UNKNOWN_FORMAT_STRING:Unknown file format identifier",
+ 1:"BADLY_FORMATTED_FORMAT_STRING:Formatting error in the format string",
+ 2:"BADLY_FORMATTED_HEADING:Did not find 9 required headings (CHROM, POS, ..., FORMAT) %s",
+ 3:"BAD_NUMBER_OF_COLUMNS:Wrong number of columns found (%s)",
+ 4:"POS_NOT_NUMERICAL:Position column is not numerical",
+ 5:"UNKNOWN_CHAR_IN_REF:Unknown character in reference field",
+ 6:"V33_BAD_REF:Reference should be single-character in v3.3 VCF",
+ 7:"V33_BAD_ALLELE:Cannot interpret allele for v3.3 VCF",
+ 8:"POS_NOT_POSITIVE:Position field must be >0",
+ 9:"QUAL_NOT_NUMERICAL:Quality field must be numerical, or '.'",
+ 10:"ERROR_INFO_STRING:Error while parsing info field",
+ 11:"ERROR_UNKNOWN_KEY:Unknown key (%s) found in formatted field (info; format; or filter)",
+ 12:"ERROR_FORMAT_NOT_NUMERICAL:Expected integer or float in formatted field; got %s",
+ 13:"ERROR_FORMAT_NOT_CHAR:Eexpected character in formatted field; got string",
+ 14:"FILTER_NOT_DEFINED:Identifier (%s) in filter found which was not defined in header",
+ 15:"FORMAT_NOT_DEFINED:Identifier (%s) in format found which was not defined in header",
+ 16:"BAD_NUMBER_OF_VALUES:Found too many of values in sample column (%s)",
+ 17:"BAD_NUMBER_OF_PARAMETERS:Found unexpected number of parameters (%s)",
+ 18:"BAD_GENOTYPE:Cannot parse genotype (%s)",
+ 19:"V40_BAD_ALLELE:Bad allele found for v4.0 VCF (%s)",
+ 20:"MISSING_REF:Reference allele missing",
+ 21:"V33_UNMATCHED_DELETION:Deleted sequence does not match reference (%s)",
+ 22:"V40_MISSING_ANGLE_BRACKETS:Format definition is not deliminted by angular brackets",
+ 23:"FORMAT_MISSING_QUOTES:Description field in format definition is not surrounded by quotes",
+ 24:"V40_FORMAT_MUST_HAVE_NAMED_FIELDS:Fields in v4.0 VCF format definition must have named fields",
+ 25:"HEADING_NOT_SEPARATED_BY_TABS:Heading line appears separated by spaces, not tabs",
+ 26:"WRONG_REF:Wrong reference %s",
+ 27:"ERROR_TRAILING_DATA:Numerical field ('%s') has semicolon-separated trailing data",
+ 28:"BAD_CHR_TAG:Error calculating chr tag for %s",
+ 29:"ZERO_LENGTH_ALLELE:Found zero-length allele",
+ 30:"MISSING_INDEL_ALLELE_REF_BASE:Indel alleles must begin with single reference base",
+ 31:"ZERO_FOR_NON_FLAG_FIELD: number set to 0, but type is not 'FLAG'",
+ 32:"ERROR_FORMAT_NOT_INTEGER:Expected integer in formatted field; got %s",
+ 33:"ERROR_FLAG_HAS_VALUE:Flag fields should not have a value",
+ }
+
+ # tag-value pairs; tags are not unique; does not include fileformat, INFO, FILTER or FORMAT fields
+ _header = []
+
+ # version number; 33=v3.3; 40=v4.0
+ _version = 40
+
+ # info, filter and format data
+ _info = {}
+ _filter = {}
+ _format = {}
+
+ # header; and required columns
+ _required = ["CHROM","POS","ID","REF","ALT","QUAL","FILTER","INFO","FORMAT"]
+ _samples = []
+
+ # control behaviour
+    _ignored_errors = set([11,31]) # ERROR_UNKNOWN_KEY, ZERO_FOR_NON_FLAG_FIELD
+ _warn_errors = set([])
+ _leftalign = False
+
+ # reference sequence
+ _reference = None
+
+ # regions to include; None includes everything
+ _regions = None
+
+    # stateful stuff
+ _lineno = -1
+ _line = None
+ _lines = None
+
+ def __init__(self, _copy=None, reference=None, regions=None,
+ lines=None, leftalign=False):
+ # make error identifiers accessible by name
+ for id in self._errors.keys():
+ self.__dict__[self._errors[id].split(':')[0]] = id
+ if _copy != None:
+ self._leftalign = _copy._leftalign
+ self._header = _copy._header[:]
+ self._version = _copy._version
+ self._info = copy.deepcopy(_copy._info)
+ self._filter = copy.deepcopy(_copy._filter)
+ self._format = copy.deepcopy(_copy._format)
+ self._samples = _copy._samples[:]
+ self._sample2column = copy.deepcopy(_copy._sample2column)
+ self._ignored_errors = copy.deepcopy(_copy._ignored_errors)
+ self._warn_errors = copy.deepcopy(_copy._warn_errors)
+ self._reference = _copy._reference
+ self._regions = _copy._regions
+ if reference: self._reference = reference
+ if regions: self._regions = regions
+ if leftalign: self._leftalign = leftalign
+ self._lines = lines
+ self.encoding = "ascii"
+ self.tabixfile = None
+
+ def error(self,line,error,opt=None):
+ if error in self._ignored_errors: return
+ errorlabel, errorstring = self._errors[error].split(':')
+ if opt: errorstring = errorstring % opt
+ errwarn = ["Error","Warning"][error in self._warn_errors]
+ errorstring += " in line %s: '%s'\n%s %s: %s\n" % (self._lineno,line,errwarn,errorlabel,errorstring)
+ if error in self._warn_errors: return
+ raise ValueError(errorstring)
+
+ def parse_format(self,line,format,filter=False):
+ if self._version == 40:
+ if not format.startswith('<'):
+ self.error(line,self.V40_MISSING_ANGLE_BRACKETS)
+ format = "<"+format
+ if not format.endswith('>'):
+ self.error(line,self.V40_MISSING_ANGLE_BRACKETS)
+ format += ">"
+ format = format[1:-1]
+ data = {'id':None,'number':None,'type':None,'descr':None}
+ idx = 0
+ while len(format.strip())>0:
+ elts = format.strip().split(',')
+ first, rest = elts[0], ','.join(elts[1:])
+ if first.find('=') == -1 or (first.find('"')>=0 and first.find('=') > first.find('"')):
+ if self._version == 40: self.error(line,self.V40_FORMAT_MUST_HAVE_NAMED_FIELDS)
+ if idx == 4: self.error(line,self.BADLY_FORMATTED_FORMAT_STRING)
+ first = ["ID=","Number=","Type=","Description="][idx] + first
+ if first.startswith('ID='): data['id'] = first.split('=')[1]
+ elif first.startswith('Number='): data['number'] = first.split('=')[1]
+ elif first.startswith('Type='): data['type'] = first.split('=')[1]
+ elif first.startswith('Description='):
+ elts = format.split('"')
+ if len(elts)<3:
+ self.error(line,self.FORMAT_MISSING_QUOTES)
+ elts = first.split('=') + [rest]
+ data['descr'] = elts[1]
+ rest = '"'.join(elts[2:])
+ if rest.startswith(','): rest = rest[1:]
+ else:
+ self.error(line,self.BADLY_FORMATTED_FORMAT_STRING)
+ format = rest
+ idx += 1
+ if filter and idx==1: idx=3 # skip number and type fields for FILTER format strings
+ if not data['id']: self.error(line,self.BADLY_FORMATTED_FORMAT_STRING)
+ if 'descr' not in data:
+ # missing description
+ self.error(line,self.BADLY_FORMATTED_FORMAT_STRING)
+ data['descr'] = ""
+ if not data['type'] and not data['number']:
+ # fine, ##filter format
+ return FORMAT(data['id'],self.NT_NUMBER,0,"Flag",data['descr'],'.')
+ if not data['type'] in ["Integer","Float","Character","String","Flag"]:
+ self.error(line,self.BADLY_FORMATTED_FORMAT_STRING)
+ # I would like a missing-value field, but it isn't there
+ if data['type'] in ['Integer','Float']: data['missing'] = None # Do NOT use arbitrary int/float as missing value
+ else: data['missing'] = '.'
+ if not data['number']: self.error(line,self.BADLY_FORMATTED_FORMAT_STRING)
+ try:
+ n = int(data['number'])
+ t = self.NT_NUMBER
+ except ValueError:
+ n = -1
+ if data['number'] == '.': t = self.NT_UNKNOWN
+ elif data['number'] == '#alleles': t = self.NT_ALLELES
+ elif data['number'] == '#nonref_alleles': t = self.NT_NR_ALLELES
+ elif data['number'] == '#genotypes': t = self.NT_GENOTYPES
+            elif data['number'] == '#phased_genotypes': t = self.NT_PHASED_GENOTYPES
+ # abbreviations added in VCF version v4.1
+ elif data['number'] == 'A': t = self.NT_ALLELES
+ elif data['number'] == 'G': t = self.NT_GENOTYPES
+ else:
+ self.error(line,self.BADLY_FORMATTED_FORMAT_STRING)
+ # if number is 0 - type must be Flag
+ if n == 0 and data['type'] != 'Flag':
+ self.error( line, self.ZERO_FOR_NON_FLAG_FIELD)
+ # force type 'Flag' if no number
+ data['type'] = 'Flag'
+
+ return FORMAT(data['id'],t,n,data['type'],data['descr'],data['missing'])
+
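+    # Example (sketch): parse_format() on a v4.0 INFO definition such
+    # as '<ID=DP,Number=1,Type=Integer,Description="Total Depth">'
+    # returns FORMAT(id='DP', numbertype=NT_NUMBER, number=1,
+    # type='Integer', description='Total Depth', missingvalue=None).
+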
+ def format_format( self, fmt, filter=False ):
+ values = [('ID',fmt.id)]
+ if fmt.number != None and not filter:
+ if fmt.numbertype == self.NT_UNKNOWN: nmb = "."
+ elif fmt.numbertype == self.NT_NUMBER: nmb = str(fmt.number)
+ elif fmt.numbertype == self.NT_ALLELES: nmb = "#alleles"
+ elif fmt.numbertype == self.NT_NR_ALLELES: nmb = "#nonref_alleles"
+ elif fmt.numbertype == self.NT_GENOTYPES: nmb = "#genotypes"
+ elif fmt.numbertype == self.NT_PHASED_GENOTYPES: nmb = "#phased_genotypes"
+ else:
+ raise ValueError("Unknown number type encountered: %s" % fmt.numbertype)
+ values.append( ('Number',nmb) )
+ values.append( ('Type', fmt.type) )
+ values.append( ('Description', '"' + fmt.description + '"') )
+ if self._version == 33:
+ format = ",".join([v for k,v in values])
+ else:
+ format = "<" + (",".join( ["%s=%s" % (k,v) for (k,v) in values] )) + ">"
+ return format
+
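+    # Example: with alt == ["A", "T"], a Number=G (NT_GENOTYPES) tag
+    # expects ((2 + 1) * (2 + 2)) // 2 == 6 values from get_expected.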
+ def get_expected(self, format, formatdict, alt):
+ fmt = formatdict[format]
+ if fmt.numbertype == self.NT_UNKNOWN: return -1
+ if fmt.numbertype == self.NT_NUMBER: return fmt.number
+ if fmt.numbertype == self.NT_ALLELES: return len(alt)+1
+ if fmt.numbertype == self.NT_NR_ALLELES: return len(alt)
+ if fmt.numbertype == self.NT_GENOTYPES: return ((len(alt)+1)*(len(alt)+2)) // 2
+ if fmt.numbertype == self.NT_PHASED_GENOTYPES: return (len(alt)+1)*(len(alt)+1)
+ return 0
+
+
+ def _add_definition(self, formatdict, key, data, line ):
+ if key in formatdict: return
+ self.error(line,self.ERROR_UNKNOWN_KEY,key)
+ if data == None:
+ formatdict[key] = FORMAT(key,self.NT_NUMBER,0,"Flag","(Undefined tag)",".")
+ return
+ if data == []: data = [""] # unsure what type -- say string
+ if type(data[0]) == type(0.0):
+ formatdict[key] = FORMAT(key,self.NT_UNKNOWN,-1,"Float","(Undefined tag)",None)
+ return
+ if type(data[0]) == type(0):
+ formatdict[key] = FORMAT(key,self.NT_UNKNOWN,-1,"Integer","(Undefined tag)",None)
+ return
+ formatdict[key] = FORMAT(key,self.NT_UNKNOWN,-1,"String","(Undefined tag)",".")
+
+
+ # todo: trim trailing missing values
+ def format_formatdata( self, data, format, key=True, value=True, separator=":" ):
+ output, sdata = [], []
+ if type(data) == type([]): # for FORMAT field, make data with dummy values
+ d = {}
+ for k in data: d[k] = []
+ data = d
+ # convert missing values; and silently add definitions if required
+ for k in data:
+ self._add_definition( format, k, data[k], "(output)" )
+ for idx,v in enumerate(data[k]):
+ if v == format[k].missingvalue: data[k][idx] = "."
+ # make sure GT comes first; and ensure fixed ordering; also convert GT data back to string
+ for k in data:
+ if k != 'GT': sdata.append( (k,data[k]) )
+ sdata.sort()
+ if 'GT' in data:
+ sdata = [('GT',map(self.convertGTback,data['GT']))] + sdata
+ for k,v in sdata:
+ if v == []: v = None
+ if key and value:
+ if v != None: output.append( k+"="+','.join(map(str,v)) )
+ else: output.append( k )
+ elif key: output.append(k)
+ elif value:
+ if v != None: output.append( ','.join(map(str,v)) )
+ else: output.append( "." ) # should not happen
+ # snip off trailing missing data
+ while len(output) > 1:
+ last = output[-1].replace(',','').replace('.','')
+ if len(last)>0: break
+ output = output[:-1]
+ return separator.join(output)
+
+
+ def enter_default_format(self):
+ for f in [FORMAT('GT',self.NT_NUMBER,1,'String','Genotype','.'),
+ FORMAT('DP',self.NT_NUMBER,1,'Integer','Read depth at this position for this sample',-1),
+ FORMAT('FT',self.NT_NUMBER,1,'String','Sample Genotype Filter','.'),
+ FORMAT('GL',self.NT_UNKNOWN,-1,'Float','Genotype likelihoods','.'),
+ FORMAT('GLE',self.NT_UNKNOWN,-1,'Float','Genotype likelihoods','.'),
+ FORMAT('GQ',self.NT_NUMBER,1,'Integer','Genotype Quality',-1),
+ FORMAT('PL',self.NT_GENOTYPES,-1,'Integer','Phred-scaled genotype likelihoods', '.'),
+ FORMAT('GP',self.NT_GENOTYPES,-1,'Float','Genotype posterior probabilities','.'),
+ FORMAT('GQ',self.NT_GENOTYPES,-1,'Integer','Conditional genotype quality','.'),
+ FORMAT('HQ',self.NT_UNKNOWN,-1,'Integer','Haplotype Quality',-1), # unknown number, since may be haploid
+ FORMAT('PS',self.NT_UNKNOWN,-1,'Integer','Phase set','.'),
+ FORMAT('PQ',self.NT_NUMBER,1,'Integer','Phasing quality',-1),
+                  FORMAT('EC',self.NT_ALLELES,1,'Integer','Expected alternate allele counts',-1),
+ FORMAT('MQ',self.NT_NUMBER,1,'Integer','RMS mapping quality',-1),
+ ]:
+ if f.id not in self._format:
+ self._format[f.id] = f
+
+ def parse_header(self, line):
+
+ assert line.startswith('##')
+ elts = line[2:].split('=')
+ key = elts[0].strip()
+ value = '='.join(elts[1:]).strip()
+ if key == "fileformat":
+ if value == "VCFv3.3":
+ self._version = 33
+ elif value == "VCFv4.0":
+ self._version = 40
+ elif value == "VCFv4.1":
+ # AH - for testing
+ self._version = 40
+ elif value == "VCFv4.2":
+ # AH - for testing
+ self._version = 40
+ else:
+ self.error(line,self.UNKNOWN_FORMAT_STRING)
+ elif key == "INFO":
+ f = self.parse_format(line, value)
+ self._info[ f.id ] = f
+ elif key == "FILTER":
+ f = self.parse_format(line, value, filter=True)
+ self._filter[ f.id ] = f
+ elif key == "FORMAT":
+ f = self.parse_format(line, value)
+ self._format[ f.id ] = f
+ else:
+ # keep other keys in the header field
+ self._header.append( (key,value) )
+
+
+ def write_header( self, stream ):
+ stream.write("##fileformat=VCFv%s.%s\n" % (self._version // 10, self._version % 10))
+ for key,value in self._header: stream.write("##%s=%s\n" % (key,value))
+ for var,label in [(self._info,"INFO"),(self._filter,"FILTER"),(self._format,"FORMAT")]:
+            for f in var.values(): stream.write("##%s=%s\n" % (label,self.format_format(f,filter=(label=="FILTER"))))
+
+
+ def parse_heading( self, line ):
+ assert line.startswith('#')
+ assert not line.startswith('##')
+ headings = line[1:].split('\t')
+ # test for 8, as FORMAT field might be missing
+ if len(headings)==1 and len(line[1:].split()) >= 8:
+ self.error(line,self.HEADING_NOT_SEPARATED_BY_TABS)
+ headings = line[1:].split()
+
+ for i,s in enumerate(self._required):
+
+ if len(headings)<=i or headings[i] != s:
+
+ if len(headings) <= i:
+ err = "(%sth entry not found)" % (i+1)
+ else:
+ err = "(found %s, expected %s)" % (headings[i],s)
+
+ #self.error(line,self.BADLY_FORMATTED_HEADING,err)
+ # allow FORMAT column to be absent
+ if len(headings) == 8:
+ headings.append("FORMAT")
+ else:
+ self.error(line,self.BADLY_FORMATTED_HEADING,err)
+
+ self._samples = headings[9:]
+ self._sample2column = dict( [(y,x+9) for x,y in enumerate( self._samples ) ] )
+
+ def write_heading( self, stream ):
+ stream.write("#" + "\t".join(self._required + self._samples) + "\n")
+
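+    # Examples (sketch): convertGT("0/1") -> [0, "/", 1];
+    # convertGT(".") -> ["."]; a haploid "2" -> [2].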
+ def convertGT(self, GTstring):
+ if GTstring == ".": return ["."]
+ try:
+ gts = gtsRegEx.split(GTstring)
+ if len(gts) == 1: return [int(gts[0])]
+ if len(gts) != 2: raise ValueError()
+ if gts[0] == "." and gts[1] == ".": return [gts[0],GTstring[len(gts[0]):-len(gts[1])],gts[1]]
+ return [int(gts[0]),GTstring[len(gts[0]):-len(gts[1])],int(gts[1])]
+ except ValueError:
+ self.error(self._line,self.BAD_GENOTYPE,GTstring)
+ return [".","|","."]
+
+ def convertGTback(self, GTdata):
+ return ''.join(map(str,GTdata))
+
+ def parse_formatdata( self, key, value, formatdict, line ):
+ # To do: check that the right number of values is present
+ f = formatdict.get(key,None)
+ if f == None:
+ self._add_definition(formatdict, key, value, line )
+ f = formatdict[key]
+ if f.type == "Flag":
+ if value is not None: self.error(line,self.ERROR_FLAG_HAS_VALUE)
+ return []
+ values = value.split(',')
+ # deal with trailing data in some early VCF files
+ if f.type in ["Float","Integer"] and len(values)>0 and values[-1].find(';') > -1:
+ self.error(line,self.ERROR_TRAILING_DATA,values[-1])
+ values[-1] = values[-1].split(';')[0]
+ if f.type == "Integer":
+ for idx,v in enumerate(values):
+ try:
+ if v == ".": values[idx] = f.missingvalue
+ else: values[idx] = int(v)
+ except:
+ self.error(line,self.ERROR_FORMAT_NOT_INTEGER,"%s=%s" % (key, str(values)))
+ return [0] * len(values)
+ return values
+ elif f.type == "String":
+ self._line = line
+ if f.id == "GT": values = list(map( self.convertGT, values ))
+ return values
+ elif f.type == "Character":
+ for v in values:
+ if len(v) != 1: self.error(line,self.ERROR_FORMAT_NOT_CHAR)
+ return values
+ elif f.type == "Float":
+ for idx,v in enumerate(values):
+ if v == ".": values[idx] = f.missingvalue
+ try: return list(map(float,values))
+ except:
+ self.error(line,self.ERROR_FORMAT_NOT_NUMERICAL,"%s=%s" % (key, str(values)))
+ return [0.0] * len(values)
+ else:
+ # can't happen
+ self.error(line,self.ERROR_INFO_STRING)
+
+ def inregion(self, chrom, pos):
+ if not self._regions: return True
+ for r in self._regions:
+ if r[0] == chrom and r[1] <= pos < r[2]: return True
+ return False
+
+ def parse_data( self, line, lineparse=False ):
+ cols = line.split('\t')
+ if len(cols) != len(self._samples)+9:
+ # gracefully deal with absent FORMAT column
+ # and those missing samples
+ if len(cols) == 8:
+ cols.append("")
+ else:
+ self.error(line,
+ self.BAD_NUMBER_OF_COLUMNS,
+ "expected %s for %s samples (%s), got %s" % (len(self._samples)+9, len(self._samples), self._samples, len(cols)))
+
+ chrom = cols[0]
+
+ # get 0-based position
+ try: pos = int(cols[1])-1
+ except: self.error(line,self.POS_NOT_NUMERICAL)
+ if pos < 0: self.error(line,self.POS_NOT_POSITIVE)
+
+ # implement filtering
+ if not self.inregion(chrom,pos): return None
+
+ # end of first-pass parse for sortedVCF
+ if lineparse: return chrom, pos, line
+
+ id = cols[2]
+
+ ref = cols[3].upper()
+ if ref == ".":
+ self.error(line,self.MISSING_REF)
+ if self._version == 33: ref = get_sequence(chrom,pos,pos+1,self._reference)
+ else: ref = ""
+ else:
+ for c in ref:
+ if c not in "ACGTN": self.error(line,self.UNKNOWN_CHAR_IN_REF)
+ if "N" in ref: ref = get_sequence(chrom,pos,pos+len(ref),self._reference)
+
+ # make sure reference is sane
+ if self._reference:
+ left = max(0,pos-100)
+ faref_leftflank = get_sequence(chrom,left,pos+len(ref),self._reference)
+ faref = faref_leftflank[pos-left:]
+ if faref != ref: self.error(line,self.WRONG_REF,"(reference is %s, VCF says %s)" % (faref,ref))
+ ref = faref
+
+ # convert v3.3 to v4.0 alleles below
+ if cols[4] == ".": alt = []
+ else: alt = cols[4].upper().split(',')
+
+ if cols[5] == ".": qual = -1
+ else:
+ try: qual = float(cols[5])
+            except ValueError: self.error(line,self.QUAL_NOT_NUMERICAL)
+
+ # postpone checking that filters exist. Encode missing filter or no filtering as empty list
+ if cols[6] == "." or cols[6] == "PASS" or cols[6] == "0": filter = []
+ else: filter = cols[6].split(';')
+
+ # dictionary of keys, and list of values
+ info = {}
+ if cols[7] != ".":
+ for blurp in cols[7].split(';'):
+ elts = blurp.split('=')
+ if len(elts) == 1: v = None
+ elif len(elts) == 2: v = elts[1]
+ else: self.error(line,self.ERROR_INFO_STRING)
+ info[elts[0]] = self.parse_formatdata(elts[0],
+ v,
+ self._info,
+ line)
+
+ # Gracefully deal with absent FORMAT column
+ if cols[8] == "": format = []
+ else: format = cols[8].split(':')
+
+ # check: all filters are defined
+ for f in filter:
+ if f not in self._filter: self.error(line,self.FILTER_NOT_DEFINED, f)
+
+ # check: format fields are defined
+ if self._format:
+ for f in format:
+ if f not in self._format: self.error(line,self.FORMAT_NOT_DEFINED, f)
+
+ # convert v3.3 alleles
+ if self._version == 33:
+ if len(ref) != 1: self.error(line,self.V33_BAD_REF)
+ newalts = []
+ have_deletions = False
+ for a in alt:
+ if len(a) == 1: a = a + ref[1:] # SNP; add trailing reference
+ elif a.startswith('I'): a = ref[0] + a[1:] + ref[1:] # insertion just beyond pos; add first and trailing reference
+ elif a.startswith('D'): # allow D<seq> and D<num>
+ have_deletions = True
+ try:
+ l = int(a[1:]) # throws ValueError if sequence
+ if len(ref) < l: # add to reference if necessary
+ addns = get_sequence(chrom,pos+len(ref),pos+l,self._reference)
+ ref += addns
+ for i,na in enumerate(newalts): newalts[i] = na+addns
+ a = ref[l:] # new deletion, deleting pos...pos+l
+ except ValueError:
+ s = a[1:]
+ if len(ref) < len(s): # add Ns to reference if necessary
+ addns = get_sequence(chrom,pos+len(ref),pos+len(s),self._reference)
+ if not s.endswith(addns) and addns != 'N'*len(addns):
+ self.error(line,self.V33_UNMATCHED_DELETION,
+ "(deletion is %s, reference is %s)" % (a,get_sequence(chrom,pos,pos+len(s),self._reference)))
+ ref += addns
+ for i,na in enumerate(newalts): newalts[i] = na+addns
+ a = ref[len(s):] # new deletion, deleting from pos
+ else:
+ self.error(line,self.V33_BAD_ALLELE)
+ newalts.append(a)
+ alt = newalts
+ # deletion alleles exist, add dummy 1st reference allele, and account for leading base
+ if have_deletions:
+ if pos == 0:
+                # Petr Danecek's rule: we can't have a leading nucleotide at (1-based) position 1
+ addn = get_sequence(chrom,pos+len(ref),pos+len(ref)+1,self._reference)
+ ref += addn
+ alt = [allele+addn for allele in alt]
+ else:
+ addn = get_sequence(chrom,pos-1,pos,self._reference)
+ ref = addn + ref
+ alt = [addn + allele for allele in alt]
+ pos -= 1
+ else:
+ # format v4.0 -- just check for nucleotides
+ for allele in alt:
+ if not alleleRegEx.match(allele):
+ self.error(line,self.V40_BAD_ALLELE,allele)
+
+ # check for leading nucleotide in indel calls
+ for allele in alt:
+ if len(allele) != len(ref):
+ if len(allele) == 0: self.error(line,self.ZERO_LENGTH_ALLELE)
+ if ref[0].upper() != allele[0].upper() and "N" not in (ref[0]+allele[0]).upper():
+ self.error(line,self.MISSING_INDEL_ALLELE_REF_BASE)
+
+        # trim trailing bases in alleles
+        # AH: not certain why this trimming needs to be done;
+        # disabled for now to keep the unit tests passing
+ # if alt:
+ # for i in range(1,min(len(ref),min(map(len,alt)))):
+ # if len(set(allele[-1].upper() for allele in alt)) > 1 or ref[-1].upper() != alt[0][-1].upper():
+ # break
+ # ref, alt = ref[:-1], [allele[:-1] for allele in alt]
+
+ # left-align alleles, if a reference is available
+ if self._leftalign and self._reference:
+ while left < pos:
+ movable = True
+ for allele in alt:
+ if len(allele) > len(ref):
+ longest, shortest = allele, ref
+ else:
+ longest, shortest = ref, allele
+ if len(longest) == len(shortest) or longest[:len(shortest)].upper() != shortest.upper():
+ movable = False
+ if longest[-1].upper() != longest[len(shortest)-1].upper():
+ movable = False
+ if not movable:
+ break
+ ref = ref[:-1]
+ alt = [allele[:-1] for allele in alt]
+ if min([len(allele) for allele in alt]) == 0 or len(ref) == 0:
+ ref = faref_leftflank[pos-left-1] + ref
+ alt = [faref_leftflank[pos-left-1] + allele for allele in alt]
+ pos -= 1
+
+ # parse sample columns
+ samples = []
+ for sample in cols[9:]:
+ dict = {}
+ values = sample.split(':')
+ if len(values) > len(format):
+ self.error(line,self.BAD_NUMBER_OF_VALUES,"(found %s values in element %s; expected %s)" % (len(values),sample,len(format)))
+ for idx in range(len(format)):
+ expected = self.get_expected(format[idx], self._format, alt)
+ if idx < len(values): value = values[idx]
+ else:
+ if expected == -1: value = "."
+ else: value = ",".join(["."]*expected)
+
+ dict[format[idx]] = self.parse_formatdata(format[idx],
+ value,
+ self._format,
+ line)
+ if expected != -1 and len(dict[format[idx]]) != expected:
+ self.error(line,self.BAD_NUMBER_OF_PARAMETERS,
+ "id=%s, expected %s parameters, got %s" % (format[idx],expected,dict[format[idx]]))
+ if len(dict[format[idx]] ) < expected: dict[format[idx]] += [dict[format[idx]][-1]]*(expected-len(dict[format[idx]]))
+ dict[format[idx]] = dict[format[idx]][:expected]
+ samples.append( dict )
+
+ # done
+ d = {'chrom':chrom,
+ 'pos':pos, # return 0-based position
+ 'id':id,
+ 'ref':ref,
+ 'alt':alt,
+ 'qual':qual,
+ 'filter':filter,
+ 'info':info,
+ 'format':format}
+ for key,value in zip(self._samples,samples):
+ d[key] = value
+
+ return d
+
+
+ def write_data(self, stream, data):
+ required = ['chrom','pos','id','ref','alt','qual','filter','info','format'] + self._samples
+ for k in required:
+ if k not in data: raise ValueError("Required key %s not found in data" % str(k))
+ if data['alt'] == []: alt = "."
+ else: alt = ",".join(data['alt'])
+ if data['filter'] == None: filter = "."
+ elif data['filter'] == []:
+ if self._version == 33: filter = "0"
+ else: filter = "PASS"
+ else: filter = ';'.join(data['filter'])
+ if data['qual'] == -1: qual = "."
+ else: qual = str(data['qual'])
+
+ output = [data['chrom'],
+ str(data['pos']+1), # change to 1-based position
+ data['id'],
+ data['ref'],
+ alt,
+ qual,
+ filter,
+ self.format_formatdata(
+ data['info'], self._info, separator=";"),
+ self.format_formatdata(
+ data['format'], self._format, value=False)]
+
+ for s in self._samples:
+ output.append(self.format_formatdata(
+ data[s], self._format, key=False))
+
+ stream.write( "\t".join(output) + "\n" )
+
+ def _parse_header(self, stream):
+ self._lineno = 0
+ for line in stream:
+ line = force_str(line, self.encoding)
+ self._lineno += 1
+ if line.startswith('##'):
+ self.parse_header(line.strip())
+ elif line.startswith('#'):
+ self.parse_heading(line.strip())
+ self.enter_default_format()
+ else:
+ break
+ return line
+
+ def _parse(self, line, stream):
+ # deal with files with header only
+ if line.startswith("##"): return
+ if len(line.strip()) > 0:
+ d = self.parse_data( line.strip() )
+ if d: yield d
+ for line in stream:
+ self._lineno += 1
+            if self._lines and self._lineno > self._lines: return  # PEP 479: end the generator instead of raising StopIteration
+ d = self.parse_data( line.strip() )
+ if d: yield d
+
+ ######################################################################################################
+ #
+ # API follows
+ #
+ ######################################################################################################
+
+ def getsamples(self):
+ """ List of samples in VCF file """
+ return self._samples
+
+ def setsamples(self,samples):
+ """ List of samples in VCF file """
+ self._samples = samples
+
+ def getheader(self):
+ """ List of header key-value pairs (strings) """
+ return self._header
+
+ def setheader(self,header):
+ """ List of header key-value pairs (strings) """
+ self._header = header
+
+ def getinfo(self):
+ """ Dictionary of ##INFO tags, as VCF.FORMAT values """
+ return self._info
+
+ def setinfo(self,info):
+ """ Dictionary of ##INFO tags, as VCF.FORMAT values """
+ self._info = info
+
+ def getformat(self):
+ """ Dictionary of ##FORMAT tags, as VCF.FORMAT values """
+ return self._format
+
+ def setformat(self,format):
+ """ Dictionary of ##FORMAT tags, as VCF.FORMAT values """
+ self._format = format
+
+ def getfilter(self):
+ """ Dictionary of ##FILTER tags, as VCF.FORMAT values """
+ return self._filter
+
+ def setfilter(self,filter):
+ """ Dictionary of ##FILTER tags, as VCF.FORMAT values """
+ self._filter = filter
+
+ def setversion(self, version):
+ if version != 33 and version != 40: raise ValueError("Can only handle v3.3 and v4.0 VCF files")
+ self._version = version
+
+ def setregions(self, regions):
+ self._regions = regions
+
+ def setreference(self, ref):
+ """ Provide a reference sequence; a Python class supporting a fetch(chromosome, start, end) method, e.g. PySam.FastaFile """
+ self._reference = ref
+
+ def ignoreerror(self, errorstring):
+ try: self._ignored_errors.add(self.__dict__[errorstring])
+ except KeyError: raise ValueError("Invalid error string: %s" % errorstring)
+
+ def warnerror(self, errorstring):
+ try: self._warn_errors.add(self.__dict__[errorstring])
+ except KeyError: raise ValueError("Invalid error string: %s" % errorstring)
+
+ def parse(self, stream):
+ """ Parse a stream of VCF-formatted lines. Initializes class instance and return generator """
+ last_line = self._parse_header(stream)
+ # now return a generator that does the actual work. In this way the pre-processing is done
+ # before the first piece of data is yielded
+ return self._parse(last_line, stream)
+
+ def write(self, stream, datagenerator):
+ """ Writes a VCF file to a stream, using a data generator (or list) """
+ self.write_header(stream)
+ self.write_heading(stream)
+ for data in datagenerator: self.write_data(stream,data)
+
+ def writeheader(self, stream):
+ """ Writes a VCF header """
+ self.write_header(stream)
+ self.write_heading(stream)
+
+ def compare_calls(self, pos1, ref1, alt1, pos2, ref2, alt2):
+ """ Utility function: compares two calls for equality """
+ # a variant should always be assigned to a unique position, one base before
+ # the leftmost position of the alignment gap. If this rule is implemented
+ # correctly, the two positions must be equal for the calls to be identical.
+ if pos1 != pos2: return False
+ # from both calls, trim rightmost bases when identical. Do this safely, i.e.
+ # only when the reference bases are not Ns
+ while len(ref1)>0 and len(alt1)>0 and ref1[-1] == alt1[-1]:
+ ref1 = ref1[:-1]
+ alt1 = alt1[:-1]
+ while len(ref2)>0 and len(alt2)>0 and ref2[-1] == alt2[-1]:
+ ref2 = ref2[:-1]
+ alt2 = alt2[:-1]
+ # now, the alternative alleles must be identical
+ return alt1 == alt2
+
+###########################################################################################################
+###########################################################################################################
+## API functions added by Andreas
+###########################################################################################################
+
+ def connect(self, filename, encoding="ascii"):
+ '''connect to tabix file.'''
+ self.encoding=encoding
+ self.tabixfile = pysam.Tabixfile(filename, encoding=encoding)
+ self._parse_header(self.tabixfile.header)
+
+ def __del__(self):
+ self.close()
+ self.tabixfile = None
+
+ def close(self):
+ if self.tabixfile:
+ self.tabixfile.close()
+ self.tabixfile = None
+
+ def fetch(self,
+ reference=None,
+ start=None,
+ end=None,
+ region=None ):
+ """ Parse a stream of VCF-formatted lines.
+ Initializes class instance and return generator """
+ return self.tabixfile.fetch(
+ reference,
+ start,
+ end,
+ region,
+ parser = asVCFRecord(self))
+
+ def validate(self, record):
+ '''validate vcf record.
+
+ returns a validated record.
+ '''
+
+ raise NotImplementedError("needs to be checked")
+
+        # also bind alt: the allele checks below use it (assumes the
+        # VCFRecord proxy exposes ref/alt as properties)
+        chrom, pos, alt = record.chrom, record.pos, record.alt
+
+ # check reference
+ ref = record.ref
+ if ref == ".":
+ self.error(str(record),self.MISSING_REF)
+ if self._version == 33: ref = get_sequence(chrom,pos,pos+1,self._reference)
+ else: ref = ""
+ else:
+ for c in ref:
+ if c not in "ACGTN": self.error(str(record),self.UNKNOWN_CHAR_IN_REF)
+ if "N" in ref: ref = get_sequence(chrom,
+ pos,
+ pos+len(ref),
+ self._reference)
+
+ # make sure reference is sane
+ if self._reference:
+            left = max(0,pos-100)
+            faref_leftflank = get_sequence(chrom,left,pos+len(ref),self._reference)
+ faref = faref_leftflank[pos-left:]
+ if faref != ref: self.error(str(record),self.WRONG_REF,"(reference is %s, VCF says %s)" % (faref,ref))
+ ref = faref
+
+ # check: format fields are defined
+ for f in record.format:
+ if f not in self._format: self.error(str(record),self.FORMAT_NOT_DEFINED, f)
+
+ # check: all filters are defined
+ for f in record.filter:
+ if f not in self._filter: self.error(str(record),self.FILTER_NOT_DEFINED, f)
+
+ # convert v3.3 alleles
+ if self._version == 33:
+ if len(ref) != 1: self.error(str(record),self.V33_BAD_REF)
+ newalts = []
+ have_deletions = False
+ for a in alt:
+ if len(a) == 1: a = a + ref[1:] # SNP; add trailing reference
+ elif a.startswith('I'): a = ref[0] + a[1:] + ref[1:] # insertion just beyond pos; add first and trailing reference
+ elif a.startswith('D'): # allow D<seq> and D<num>
+ have_deletions = True
+ try:
+ l = int(a[1:]) # throws ValueError if sequence
+ if len(ref) < l: # add to reference if necessary
+ addns = get_sequence(chrom,pos+len(ref),pos+l,self._reference)
+ ref += addns
+ for i,na in enumerate(newalts): newalts[i] = na+addns
+ a = ref[l:] # new deletion, deleting pos...pos+l
+ except ValueError:
+ s = a[1:]
+ if len(ref) < len(s): # add Ns to reference if necessary
+ addns = get_sequence(chrom,pos+len(ref),pos+len(s),self._reference)
+ if not s.endswith(addns) and addns != 'N'*len(addns):
+ self.error(str(record),self.V33_UNMATCHED_DELETION,
+ "(deletion is %s, reference is %s)" % (a,get_sequence(chrom,pos,pos+len(s),self._reference)))
+ ref += addns
+ for i,na in enumerate(newalts): newalts[i] = na+addns
+ a = ref[len(s):] # new deletion, deleting from pos
+ else:
+ self.error(str(record),self.V33_BAD_ALLELE)
+ newalts.append(a)
+ alt = newalts
+ # deletion alleles exist, add dummy 1st reference allele, and account for leading base
+ if have_deletions:
+ if pos == 0:
+                # Petr Danecek's rule: we can't have a leading nucleotide at (1-based) position 1
+ addn = get_sequence(chrom,pos+len(ref),pos+len(ref)+1,self._reference)
+ ref += addn
+ alt = [allele+addn for allele in alt]
+ else:
+ addn = get_sequence(chrom,pos-1,pos,self._reference)
+ ref = addn + ref
+ alt = [addn + allele for allele in alt]
+ pos -= 1
+ else:
+ # format v4.0 -- just check for nucleotides
+ for allele in alt:
+ if not alleleRegEx.match(allele):
+ self.error(str(record),self.V40_BAD_ALLELE,allele)
+
+
+ # check for leading nucleotide in indel calls
+ for allele in alt:
+ if len(allele) != len(ref):
+ if len(allele) == 0: self.error(str(record),self.ZERO_LENGTH_ALLELE)
+ if ref[0].upper() != allele[0].upper() and "N" not in (ref[0]+allele[0]).upper():
+ self.error(str(record),self.MISSING_INDEL_ALLELE_REF_BASE)
+
+        # trim trailing bases in alleles
+        # AH: not certain why this trimming needs to be done;
+        # disabled for now to keep the unit tests passing
+ # for i in range(1,min(len(ref),min(map(len,alt)))):
+ # if len(set(allele[-1].upper() for allele in alt)) > 1 or ref[-1].upper() != alt[0][-1].upper():
+ # break
+ # ref, alt = ref[:-1], [allele[:-1] for allele in alt]
+
+ # left-align alleles, if a reference is available
+ if self._leftalign and self._reference:
+ while left < pos:
+ movable = True
+ for allele in alt:
+ if len(allele) > len(ref):
+ longest, shortest = allele, ref
+ else:
+ longest, shortest = ref, allele
+ if len(longest) == len(shortest) or longest[:len(shortest)].upper() != shortest.upper():
+ movable = False
+ if longest[-1].upper() != longest[len(shortest)-1].upper():
+ movable = False
+ if not movable:
+ break
+ ref = ref[:-1]
+ alt = [allele[:-1] for allele in alt]
+ if min([len(allele) for allele in alt]) == 0 or len(ref) == 0:
+ ref = faref_leftflank[pos-left-1] + ref
+ alt = [faref_leftflank[pos-left-1] + allele for allele in alt]
+ pos -= 1
+
+__all__ = [
+ "VCF", "VCFRecord", ]
-from pysam.cutils import _pysam_dispatch
+from pysam.libcutils import _pysam_dispatch
class SamtoolsError(Exception):
# pysam versioning information
-__version__ = "0.9.1.4"
+__version__ = "0.10.0"
__samtools_version__ = "1.3.1"
-__htslib_version__ = "1.3.1"
+__bcftools_version__ = "1.3.1"
+
+__htslib_version__ = "1.3.2"
-cython>=0.22
+cython>=0.24.1
#Install miniconda python
if [ $TRAVIS_OS_NAME == "osx" ]; then
- curl -O https://repo.continuum.io/miniconda/Miniconda3-latest-MacOSX-x86_64.sh
- bash Miniconda3-latest-MacOSX-x86_64.sh -b
+ wget https://repo.continuum.io/miniconda/Miniconda3-latest-MacOSX-x86_64.sh -O Miniconda3.sh
else
- curl -O https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
- bash Miniconda3-latest-Linux-x86_64.sh -b
+ wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O Miniconda3.sh --no-check-certificate # Default OS versions are old and have SSL / CERT issues
fi
+bash Miniconda3.sh -b
+
# Create a new conda environment with the target python version
~/miniconda3/bin/conda install conda-build -y
-~/miniconda3/bin/conda create -q -y --name testenv python=$CONDA_PY cython numpy nose
+~/miniconda3/bin/conda create -q -y --name testenv python=$CONDA_PY cython numpy nose psutil pip
+
+# activate testenv environment
+source ~/miniconda3/bin/activate testenv
-# Add new conda environment to PATH
-export PATH=~/miniconda3/envs/testenv/bin/:$PATH
+conda config --add channels conda-forge
+conda config --add channels defaults
+conda config --add channels r
+conda config --add channels bioconda
-# Hack to force linking to anaconda libraries rather than system libraries
-#export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:~/miniconda3/envs/testenv/lib/
-#export DYLD_LIBRARY_PATH=$DYLD_LIBRARY_PATH:~/miniconda3/envs/testenv/lib/
+conda install -y samtools bcftools htslib
# Need to make C compiler and linker use the anaconda includes and libraries:
export PREFIX=~/miniconda3/
export CFLAGS="-I${PREFIX}/include -L${PREFIX}/lib"
export HTSLIB_CONFIGURE_OPTIONS="--disable-libcurl"
-# create a new folder to store external tools
-mkdir -p $WORKDIR/external-tools
-
-# install htslib
-cd $WORKDIR/external-tools
-curl -L https://github.com/samtools/htslib/releases/download/1.3.1/htslib-1.3.1.tar.bz2 > htslib-1.3.1.tar.bz2
-tar xjvf htslib-1.3.1.tar.bz2
-cd htslib-1.3.1
-make
-PATH=$PATH:$WORKDIR/external-tools/htslib-1.3.1
-LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$WORKDIR/external-tools/htslib-1.3.1
-
-# install samtools, compile against htslib
-cd $WORKDIR/external-tools
-curl -L http://downloads.sourceforge.net/project/samtools/samtools/1.3.1/samtools-1.3.1.tar.bz2 > samtools-1.3.1.tar.bz2
-tar xjvf samtools-1.3.1.tar.bz2
-cd samtools-1.3.1
-./configure --with-htslib=../htslib-1.3.1
-make
-PATH=$PATH:$WORKDIR/external-tools/samtools-1.3.1
-
-echo "installed samtools"
samtools --version
-
-if [ $? != 0 ]; then
- exit 1
-fi
-
-# install bcftools
-cd $WORKDIR/external-tools
-curl -L https://github.com/samtools/bcftools/releases/download/1.3.1/bcftools-1.3.1.tar.bz2 > bcftools-1.3.1.tar.bz2
-tar xjf bcftools-1.3.1.tar.bz2
-cd bcftools-1.3.1
-./configure --with-htslib=../htslib-1.3.1
-make
-PATH=$PATH:$WORKDIR/external-tools/bcftools-1.3.1
-
-echo "installed bcftools"
+tabix --version  # htslib installs no "htslib" binary; tabix reports the htslib version
bcftools --version
-if [ $? != 0 ]; then
- exit 1
-fi
-
-popd
-
# Try building conda recipe first
~/miniconda3/bin/conda-build ci/conda-recipe/ --python=$CONDA_PY
exit 1
fi
-# build source tar-ball
+# build source tar-ball. Make sure to build so that .pyx files
+# are cythonized.
cd ..
-python setup.py sdist
+python setup.py build sdist
if [ $? != 0 ]; then
exit 1
# test pip installation from tar-ball with cython
echo "pip installing with cython"
-pip install --verbose --no-deps --no-use-wheel dist/pysam-*.tar.gz
+pip install --verbose --no-deps --no-binary=:all: dist/pysam-*.tar.gz
if [ $? != 0 ]; then
exit 1
# attempt pip installation without cython
echo "pip installing without cython"
-~/miniconda3/bin/conda remove cython
+~/miniconda3/bin/conda remove -y cython
~/miniconda3/bin/conda list
-echo "pthyon is" `which python`
-pip install --verbose --no-deps --no-use-wheel --force-reinstall --upgrade dist/pysam-*.tar.gz
+echo "python is" `which python`
+pip install --verbose --no-deps --no-binary=:all: --force-reinstall --upgrade dist/pysam-*.tar.gz
if [ $? != 0 ]; then
exit 1
# command line options
echo "pip installing without cython and no configure options"
export HTSLIB_CONFIGURE_OPTIONS=""
-pip install --verbose --no-deps --no-use-wheel --force-reinstall --upgrade dist/pysam-*.tar.gz
+pip install --verbose --no-deps --no-binary=:all: --force-reinstall --upgrade dist/pysam-*.tar.gz
if [ $? != 0 ]; then
exit 1
}
view_end:
- if (is_count && ret == 0)
+ if (is_count && ret == 0)
fprintf(pysam_stdout, "%" PRId64 "\n", count);
-
+
// close files, free and return
if (in) check_sam_close("view", in, fn_in, "standard input", &ret);
if (out) check_sam_close("view", out, fn_out, "standard output", &ret);
def run_make_print_config():
- stdout = subprocess.check_output(["make", "print-config"])
+ stdout = subprocess.check_output(["make", "-s", "print-config"])
if IS_PYTHON3:
stdout = stdout.decode("ascii")
- result = dict([[x.strip() for x in line.split("=")]
- for line in stdout.splitlines()])
- return result
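+    # htslib's print-config target emits lines such as "CC = gcc";
+    # collect them into a dict, skipping anything that is not a
+    # simple KEY = VALUE pair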
+ make_print_config = {}
+ for line in stdout.splitlines():
+ if "=" in line:
+ row = line.split("=")
+ if len(row) == 2:
+ make_print_config.update(
+ {row[0].strip(): row[1].strip()})
+ return make_print_config
def configure_library(library_dir, env_options=None, options=[]):
import cython
HAVE_CYTHON = True
print ("# pysam: cython is available - using cythonize if necessary")
- source_pattern = "pysam/c%s.pyx"
- if HTSLIB_MODE != "external":
- HTSLIB_MODE = "shared"
+ source_pattern = "pysam/libc%s.pyx"
except ImportError:
HAVE_CYTHON = False
print ("# pysam: no cython available - using pre-compiled C")
# no Cython available - use existing C code
- source_pattern = "pysam/c%s.c"
- if HTSLIB_MODE != "external":
- HTSLIB_MODE = "shared"
+ source_pattern = "pysam/libc%s.c"
# collect pysam version
sys.path.insert(0, "pysam")
chtslib_sources = []
htslib_library_dirs = [HTSLIB_LIBRARY_DIR]
htslib_include_dirs = [HTSLIB_INCLUDE_DIR]
- internal_htslib_libraries = []
external_htslib_libraries = ['z', 'hts']
elif HTSLIB_MODE == 'separate':
shared_htslib_sources = htslib_sources
htslib_library_dirs = []
htslib_include_dirs = ['htslib']
- internal_htslib_libraries = []
elif HTSLIB_MODE == 'shared':
# link each pysam component against the same
htslib_library_dirs = [
'pysam',
".",
- os.path.join("build",
- distutils_dir_name("lib"),
- "pysam")]
+ os.path.join("build", distutils_dir_name("lib"), "pysam")]
htslib_include_dirs = ['htslib']
- if IS_PYTHON3:
- if sys.version_info.minor >= 5:
- internal_htslib_libraries = ["chtslib.{}".format(
- sysconfig.get_config_var('SOABI'))]
- else:
- if sys.platform == "darwin":
- # On OSX, python 3.3 and 3.4 Libs have no platform tags.
- internal_htslib_libraries = ["chtslib"]
- else:
- internal_htslib_libraries = ["chtslib.{}{}".format(
- sys.implementation.cache_tag,
- sys.abiflags)]
- else:
- internal_htslib_libraries = ["chtslib"]
-
else:
raise ValueError("unknown HTSLIB value '%s'" % HTSLIB_MODE)
+internal_htslib_libraries = [os.path.splitext("chtslib{}".format(sysconfig.get_config_var('SO')))[0]]
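+# sysconfig's 'SO' is the extension-module suffix (".so" on Python 2,
+# something like ".cpython-35m-x86_64-linux-gnu.so" on Python 3), so
+# stripping the final extension yields the name of the built chtslib
+# module on every interpreter, replacing the per-version special
+# cases removed above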
+
# build config.py
with open(os.path.join("pysam", "config.py"), "w") as outf:
outf.write('HTSLIB = "{}"\n'.format(HTSLIB_SOURCE))
# Selected ones have been copied into samfile_utils.c
# Needs to be devolved somehow.
csamfile = Extension(
- "pysam.csamfile",
+ "pysam.libcsamfile",
[source_pattern % "samfile",
"pysam/htslib_util.c",
"pysam/samfile_util.c",
# Selected ones have been copied into samfile_utils.c
# Needs to be devolved somehow.
calignmentfile = Extension(
- "pysam.calignmentfile",
+ "pysam.libcalignmentfile",
[source_pattern % "alignmentfile",
"pysam/htslib_util.c",
"pysam/samfile_util.c",
# Selected ones have been copied into samfile_utils.c
# Needs to be devolved somehow.
calignedsegment = Extension(
- "pysam.calignedsegment",
+ "pysam.libcalignedsegment",
[source_pattern % "alignedsegment",
"pysam/htslib_util.c",
"pysam/samfile_util.c",
)
ctabix = Extension(
- "pysam.ctabix",
+ "pysam.libctabix",
[source_pattern % "tabix",
"pysam/tabix_util.c"] +
htslib_sources +
)
cutils = Extension(
- "pysam.cutils",
+ "pysam.libcutils",
[source_pattern % "utils", "pysam/pysam_util.c"] +
glob.glob(os.path.join("samtools", "*.pysam.c")) +
# glob.glob(os.path.join("samtools", "*", "*.pysam.c")) +
)
cfaidx = Extension(
- "pysam.cfaidx",
+ "pysam.libcfaidx",
[source_pattern % "faidx"] +
htslib_sources +
os_c_files,
)
ctabixproxies = Extension(
- "pysam.ctabixproxies",
+ "pysam.libctabixproxies",
[source_pattern % "tabixproxies"] +
os_c_files,
library_dirs=htslib_library_dirs,
)
cvcf = Extension(
- "pysam.cvcf",
+ "pysam.libcvcf",
[source_pattern % "vcf"] +
os_c_files,
library_dirs=htslib_library_dirs,
)
cbcf = Extension(
- "pysam.cbcf",
+ "pysam.libcbcf",
[source_pattern % "bcf"] +
htslib_sources +
os_c_files,
define_macros=define_macros
)
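+# new extension, presumably wrapping htslib's BGZF blocked-compression
+# layer; built with the same sources and flags as the other lib* modules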
+cbgzf = Extension(
+ "pysam.libcbgzf",
+ [source_pattern % "bgzf"] +
+ htslib_sources +
+ os_c_files,
+ library_dirs=htslib_library_dirs,
+ include_dirs=["htslib", "."] + include_os + htslib_include_dirs,
+ libraries=external_htslib_libraries + internal_htslib_libraries,
+ language="c",
+ extra_compile_args=extra_compile_args,
+ define_macros=define_macros
+)
+
metadata = {
'name': "pysam",
'version': version,
ctabixproxies,
cvcf,
cbcf,
+ cbgzf,
cfaidx,
cutils],
'cmdclass': cmdclass,
self.assertEqual(a.query_sequence, None)
self.assertEqual(pysam.qualities_to_qualitystring(a.query_qualities), None)
self.assertEqual(a.flag, 0)
- self.assertEqual(a.reference_id, 0)
+ self.assertEqual(a.reference_id, -1)
self.assertEqual(a.mapping_quality, 0)
self.assertEqual(a.cigartuples, None)
self.assertEqual(a.tags, [])
- self.assertEqual(a.next_reference_id, 0)
- self.assertEqual(a.next_reference_start, 0)
+ self.assertEqual(a.next_reference_id, -1)
+ self.assertEqual(a.next_reference_start, -1)
self.assertEqual(a.template_length, 0)
def testStrOfEmptyRead(self):
a = pysam.AlignedSegment()
s = str(a)
self.assertEqual(
- "None\t0\t0\t0\t0\tNone\t0\t0\t0\tNone\tNone\t[]",
+ "None\t0\t-1\t-1\t0\tNone\t-1\t-1\t0\tNone\tNone\t[]",
s)
def testSettingTagInEmptyRead(self):
self.assertEqual(a.get_blocks(),
[(20, 30), (31, 40), (40, 60)])
+ def test_infer_query_length(self):
+ '''Test infer_query_length on M|=|X|I|D|H|S cigar ops'''
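+        # M, =, X, I and S consume query bases; D consumes only the
+        # reference, and (per the assertions below) hard clips (H) are
+        # counted by infer_query_length in this release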
+ a = self.buildRead()
+ a.cigarstring = '15M'
+ self.assertEqual(a.infer_query_length(), 15)
+ a.cigarstring = '15='
+ self.assertEqual(a.infer_query_length(), 15)
+ a.cigarstring = '15X'
+ self.assertEqual(a.infer_query_length(), 15)
+ a.cigarstring = '5M5I5M'
+ self.assertEqual(a.infer_query_length(), 15)
+ a.cigarstring = '5M5D5M'
+ self.assertEqual(a.infer_query_length(), 10)
+ a.cigarstring = '5H10M'
+ self.assertEqual(a.infer_query_length(), 15)
+ a.cigarstring = '5S10M'
+ self.assertEqual(a.infer_query_length(), 15)
+
def test_get_aligned_pairs_soft_clipping(self):
a = self.buildRead()
a.cigartuples = ((4, 2), (0, 35), (4, 3))
a.cigarstring = "1S20M1S"
self.assertEqual(a.query_alignment_length, 20)
+ def test_query_length_is_limited(self):
+
+ a = self.buildRead()
+ a.query_name = "A" * 1
+ a.query_name = "A" * 254
+ self.assertRaises(
+ ValueError,
+ setattr,
+ a,
+ "query_name",
+ "A" * 255)
+
class TestCigarStats(ReadTest):
for s, p in zip(reference, pysamf):
self.assertEqual(s, p.tostring(pysamf))
+
if __name__ == "__main__":
unittest.main()
get_temp_filename
-DATADIR = "pysam_data"
+DATADIR = os.path.abspath(os.path.join(os.path.dirname(__file__),
+ "pysam_data"))
##################################################
class BasicTestBAMFromFile(BasicTestBAMFromFetch):
def setUp(self):
- f = open(os.path.join(DATADIR, "ex3.bam"))
- self.samfile = pysam.AlignmentFile(
- f, "rb")
+ with open(os.path.join(DATADIR, "ex3.bam")) as f:
+ self.samfile = pysam.AlignmentFile(
+ f, "rb")
+ self.reads = [r for r in self.samfile]
+
+
+class BasicTestBAMFromFileNo(BasicTestBAMFromFetch):
+
+ def setUp(self):
+ with open(os.path.join(DATADIR, "ex3.bam")) as f:
+ self.samfile = pysam.AlignmentFile(
+ f.fileno(), "rb")
self.reads = [r for r in self.samfile]
class BasicTestSAMFromFile(BasicTestBAMFromFetch):
def setUp(self):
- f = open(os.path.join(DATADIR, "ex3.sam"))
- self.samfile = pysam.AlignmentFile(
- f, "r")
+ with open(os.path.join(DATADIR, "ex3.sam")) as f:
+ self.samfile = pysam.AlignmentFile(
+ f, "r")
+ self.reads = [r for r in self.samfile]
+
+
+class BasicTestSAMFromFileNo(BasicTestBAMFromFetch):
+
+ def setUp(self):
+ with open(os.path.join(DATADIR, "ex3.sam")) as f:
+ self.samfile = pysam.AlignmentFile(
+ f.fileno(), "r")
self.reads = [r for r in self.samfile]
class BasicTestCRAMFromFile(BasicTestCRAMFromFetch):
def setUp(self):
- f = open(os.path.join(DATADIR, "ex3.cram"))
- self.samfile = pysam.AlignmentFile(f, "rc")
+ with open(os.path.join(DATADIR, "ex3.cram")) as f:
+ self.samfile = pysam.AlignmentFile(f, "rc")
+ self.reads = [r for r in self.samfile]
+
+
+class BasicTestCRAMFromFileNo(BasicTestCRAMFromFetch):
+
+ def setUp(self):
+ with open(os.path.join(DATADIR, "ex3.cram")) as f:
+ self.samfile = pysam.AlignmentFile(
+ f.fileno(), "rc")
self.reads = [r for r in self.samfile]
samfile = pysam.AlignmentFile(f, "rb")
f.close()
self.assertTrue(f.closed)
- # access to Samfile should still work
+ # access to Samfile still works
self.checkEcho("ex1.bam",
"ex1.bam",
"tmp_ex1.bam",
mode="rb")
self.assertEqual(len(list(samfile.fetch())), 3270)
+ def testBAMWithCSIIndex(self):
+ '''see issue 116'''
+ input_filename = os.path.join(DATADIR, "ex1_csi.bam")
+ samfile = pysam.AlignmentFile(input_filename,
+ "rb",
+ check_sq=False)
+ samfile.fetch('chr2')
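+        # ex1_csi.bam ships a .csi index ("samtools index -c", see the
+        # pysam_data Makefile), so the fetch must succeed without a .bai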
+
+
class TestAutoDetect(unittest.TestCase):
"""testing header manipulation"""
- header = {'SQ': [{'LN': 1575, 'SN': 'chr1'},
- {'LN': 1584, 'SN': 'chr2'}],
+ header = {'SQ': [{'LN': 1575, 'SN': 'chr1', 'AH': 'chr1:5000000-5010000'},
+ {'LN': 1584, 'SN': 'chr2', 'AH': '*'}],
'RG': [{'LB': 'SC_1', 'ID': 'L1', 'SM': 'NA12891',
'PU': 'SC_1_10', "CN": "name:with:colon"},
{'LB': 'SC_2', 'ID': 'L2', 'SM': 'NA12891',
last[r.alignment.query_name] = r.query_position
+class TestFindIntrons(unittest.TestCase):
+ samfilename = "pysam_data/ex_spliced.bam"
+
+ def setUp(self):
+ self.samfile = pysam.AlignmentFile(self.samfilename)
+
+ def tearDown(self):
+ self.samfile.close()
+
+ def test_total(self):
+ all_read_counts = self.samfile.count()
+ splice_sites = self.samfile.find_introns(self.samfile.fetch())
+        self.assertEqual(sum(splice_sites.values()), all_read_counts - 1)  # there is a single unspliced read in there
+
+ def test_first(self):
+ reads = list(self.samfile.fetch())[:10]
+ splice_sites = self.samfile.find_introns(reads)
+ starts = [14792+38 - 1]
+ stops = [14792+38 + 140 - 1]
+ self.assertEqual(len(splice_sites), 1)
+ self.assertTrue((starts[0], stops[0]) in splice_sites)
+        self.assertEqual(splice_sites[(starts[0], stops[0])], 9)  # the first read is unspliced, so only 9 of these 10 support the junction
+
+ def test_all(self):
+ reads = list(self.samfile.fetch())
+ splice_sites = self.samfile.find_introns(reads)
+ should = collections.Counter({
+ (14829, 14969): 33,
+ (15038, 15795): 24,
+ (15947, 16606): 3,
+ (16765, 16857): 9,
+ (16765, 16875): 1,
+ (17055, 17232): 19,
+ (17055, 17605): 3,
+ (17055, 17914): 1,
+ (17368, 17605): 7,
+ })
+ self.assertEqual(should, splice_sites)
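+        # find_introns returns a collections.Counter keyed by 0-based
+        # (intron_start, intron_stop) tuples derived from 'N' cigar
+        # operations, mapping each junction to its read support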
+
+
class TestLogging(unittest.TestCase):
'''test around bug issue 42,
inf.mapped)
-
class TestSamtoolsProxy(unittest.TestCase):
'''tests for sanity checking access to samtools functions.'''
self.assertEqual(pysam.get_verbosity(), 3)
+class TestSanityCheckingBAM(unittest.TestCase):
+
+ mode = "wb"
+
+ def check_write(self, read):
+
+ fn = "tmp_test_sanity_check.bam"
+ names = ["chr1"]
+ lengths = [10000]
+ with pysam.AlignmentFile(
+ fn,
+ self.mode,
+ reference_names=names,
+ reference_lengths=lengths) as outf:
+ outf.write(read)
+
+ if os.path.exists(fn):
+ os.unlink(fn)
+
+    def test_empty_read_can_be_written(self):
+ read = pysam.AlignedSegment()
+ self.check_write(read)
+
+# SAM writing fails, as query length is 0
+# class TestSanityCheckingSAM(TestSanityCheckingBAM):
+# mode = "w"
+
+
if __name__ == "__main__":
# build data files
print ("building data files")
class TestHeaderSam(unittest.TestCase):
- header = {'SQ': [{'LN': 1575, 'SN': 'chr1'},
- {'LN': 1584, 'SN': 'chr2'}],
+ header = {'SQ': [{'LN': 1575, 'SN': 'chr1', 'AH': 'chr1:5000000-5010000'},
+ {'LN': 1584, 'SN': 'chr2', 'AH': '*'}],
'RG': [{'LB': 'SC_1', 'ID': 'L1', 'SM': 'NA12891', 'PU': 'SC_1_10', "CN": "name:with:colon"},
{'LB': 'SC_2', 'ID': 'L2', 'SM': 'NA12891', 'PU': 'SC_2_12', "CN": "name:with:colon"}],
'PG': [{'ID': 'P1', 'VN': '1.0'}, {'ID': 'P2', 'VN': '1.1'}],
self.assertEqual(a.seq, None)
self.assertEqual(a.qual, None)
self.assertEqual(a.flag, 0)
- self.assertEqual(a.rname, 0)
+ self.assertEqual(a.rname, -1)
self.assertEqual(a.mapq, 0)
self.assertEqual(a.cigar, [])
self.assertEqual(a.tags, [])
- self.assertEqual(a.mrnm, 0)
- self.assertEqual(a.mpos, 0)
+ self.assertEqual(a.mrnm, -1)
+ self.assertEqual(a.mpos, -1)
self.assertEqual(a.isize, 0)
def testStrOfEmptyRead(self):
a = pysam.AlignedRead()
s = str(a)
self.assertEqual(
- "None\t0\t0\t0\t0\tNone\t0\t0\t0\tNone\tNone\t[]",
+ "None\t0\t-1\t-1\t0\tNone\t-1\t-1\t0\tNone\tNone\t[]",
s)
def buildRead(self):
--- /dev/null
+import os
+import subprocess
+import threading
+import errno
+import unittest
+
+from pysam import AlignmentFile
+
+DATADIR = os.path.abspath(os.path.join(
+ os.path.dirname(__file__),
+ "pysam_data"))
+
+
+def alignmentfile_writer_thread(infile, outfile):
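+    # pump records from infile into outfile on a daemon thread, so the
+    # main thread can consume the downstream end of the pipe without
+    # deadlocking on a full pipe buffer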
+ def _writer_thread(infile, outfile):
+ """read from infile and write to outfile"""
+ try:
+ i = 0
+ for record in infile:
+ outfile.write(record)
+ i += 1
+        except IOError as e:
+            # the reader may exit early and close the pipe; ignore
+            # EPIPE, re-raise anything else
+            if e.errno != errno.EPIPE:
+                raise
+ finally:
+ outfile.close()
+
+ writer = threading.Thread(target=_writer_thread, args=(infile, outfile))
+ writer.daemon = True
+ writer.start()
+ return writer
+
+
+class StreamTest(unittest.TestCase):
+
+ def stream_process(self, proc, in_stream, out_stream, writer):
+
+ with AlignmentFile(proc.stdout) as infile:
+ read = 0
+ for record in infile:
+ read += 1
+ return 0, read
+
+ def test_text_processing(self):
+
+ proc = subprocess.Popen('head -n200',
+ stdin=subprocess.PIPE,
+ stdout=subprocess.PIPE,
+ shell=True)
+
+ in_stream = AlignmentFile('pysam_data/ex1.bam')
+ out_stream = AlignmentFile(proc.stdin, 'wh', header=in_stream.header)
+ writer = alignmentfile_writer_thread(in_stream,
+ out_stream)
+
+ written, read = self.stream_process(proc,
+ in_stream,
+ out_stream,
+ writer)
+ self.assertEqual(read, 198)
+
+ def test_samtools_processing(self):
+
+ proc = subprocess.Popen('samtools view -b -f 4',
+ stdin=subprocess.PIPE,
+ stdout=subprocess.PIPE,
+ shell=True)
+
+ in_stream = AlignmentFile('pysam_data/ex1.bam')
+ out_stream = AlignmentFile(proc.stdin, 'wb', header=in_stream.header)
+ writer = alignmentfile_writer_thread(in_stream,
+ out_stream)
+
+ written, read = self.stream_process(proc,
+ in_stream,
+ out_stream,
+ writer)
+ self.assertEqual(read, 35)
+
+
+if __name__ == "__main__":
+ unittest.main()
import os
+import sys
import unittest
import pysam
import gzip
import subprocess
+
+try:
+ from pathlib import Path
+except ImportError:
+ Path = None
+
from TestUtils import get_temp_filename, check_lines_equal
DATADIR="cbcf_data"
os.unlink("tmp_testEmptyFile.vcf")
+
+ if Path and sys.version_info >= (3,6):
+ def testEmptyFileVCFFromPath(self):
+ with open("tmp_testEmptyFile.vcf", "w"):
+ pass
+
+ self.assertRaises(ValueError, pysam.VariantFile,
+ Path("tmp_testEmptyFile.vcf"))
+
+ os.unlink("tmp_testEmptyFile.vcf")
+
def testEmptyFileVCFGZWithIndex(self):
with open("tmp_testEmptyFile.vcf", "w"):
pass
# remove last header line starting with #CHROM
ref.pop()
ref = sorted(ref)
- comp = sorted([str(x) for x in v.header.records])
+ comp = sorted(str(x) for x in v.header.records)
self.assertEqual(len(ref), len(comp))
for x, y in zip(ref, comp):
- self.assertEqual(x[:-1], str(y))
+ self.assertEqual(x, y)
# These tests need to be separate and start from newly opened files. This
chrom = [rec.chrom for rec in v]
self.assertEqual(chrom, ['M', '17', '20', '20', '20'])
+ if Path and sys.version_info >= (3,6):
+ def testChromFromPath(self):
+ fn = os.path.join(DATADIR, self.filename)
+ v = pysam.VariantFile(Path(fn))
+ chrom = [rec.chrom for rec in v]
+ self.assertEqual(chrom, ['M', '17', '20', '20', '20'])
+
def testPos(self):
fn = os.path.join(DATADIR, self.filename)
v = pysam.VariantFile(fn)
"""construct VariantFile from scratch."""
filename = "example_vcf42_withcontigs.vcf"
+ compression = 'NONE'
+ description = 'VCF version 4.2 variant calling text'
- def complete_check(self, fn_in, fn_out):
+ def testBase(self):
+ with pysam.VariantFile(os.path.join(DATADIR, self.filename)) as inf:
+ self.assertEqual(inf.category, 'VARIANTS')
+ self.assertEqual(inf.format, 'VCF')
+ self.assertEqual(inf.version, (4, 2))
+ self.assertEqual(inf.compression, self.compression)
+ self.assertEqual(inf.description, self.description)
+ self.assertTrue(inf.is_open)
+ self.assertEqual(inf.is_read, True)
+ self.assertEqual(inf.is_write, False)
+ def complete_check(self, fn_in, fn_out):
+ self.maxDiff = None
check_lines_equal(
self, fn_in, fn_out, sort=True,
filter_f=lambda x: x.startswith("##contig"))
for record in vcf_in.header.records:
header.add_record(record)
- fn = str("tmp_VariantFileTest_testConstructionWithRecords") + ".vcf"
- vcf_out = pysam.VariantFile(fn, "w", header=header)
+ for sample in vcf_in.header.samples:
+ header.add_sample(sample)
+
+ vcf_out = pysam.VariantFile(fn_out, "w", header=header)
for record in vcf_in:
- # currently segfaults here:
- # vcf_out.write(record)
- pass
- return
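+        # translate() re-maps the record onto the newly constructed
+        # header so that it can be written through vcf_out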
+ record.translate(header)
+ vcf_out.write(record)
+ vcf_in.close()
vcf_out.close()
self.complete_check(fn_in, fn_out)
for record in vcf_in:
vcf_out.write(record)
+ vcf_in.close()
vcf_out.close()
self.complete_check(fn_in, fn_out)
self.complete_check(fn_in, fn_out)
-# Currently segfaults for VCFs without contigs
-# class TestConstructionVCFWithoutContigs(TestConstructionVCFWithContigs):
+
+#class TestConstructionVCFWithoutContigs(TestConstructionVCFWithContigs):
# """construct VariantFile from scratch."""
# filename = "example_vcf40.vcf"
"""construct VariantFile from scratch."""
filename = "example_vcf42_withcontigs.vcf.gz"
+ compression = 'BGZF'
+ description = 'VCF version 4.2 BGZF-compressed variant calling data'
class TestConstructionVCFGZWithoutContigs(TestConstructionVCFWithContigs):
"""construct VariantFile from scratch."""
filename = "example_vcf42.vcf.gz"
+ compression = 'BGZF'
+ description = 'VCF version 4.2 BGZF-compressed variant calling data'
class TestSettingRecordValues(unittest.TestCase):
filename = "example_vcf40.vcf"
+ def testBase(self):
+ with pysam.VariantFile(os.path.join(DATADIR, self.filename)) as inf:
+ self.assertEqual(inf.category, 'VARIANTS')
+ self.assertEqual(inf.format, 'VCF')
+ self.assertEqual(inf.version, (4, 0))
+ self.assertEqual(inf.compression, 'NONE')
+ self.assertEqual(inf.description, 'VCF version 4.0 variant calling text')
+ self.assertTrue(inf.is_open)
+ self.assertEqual(inf.is_read, True)
+ self.assertEqual(inf.is_write, False)
+
def testSetQual(self):
with pysam.VariantFile(os.path.join(DATADIR, self.filename)) as inf:
record = next(inf)
sample = record.samples["NA00001"]
print (sample["GT"])
self.assertEqual(sample["GT"], (0, 0))
-# Fails with TypeError
-# sample["GT"] = sample["GT"]
+ sample["GT"] = sample["GT"]
class TestSubsetting(unittest.TestCase):
-from pysam.calignmentfile cimport AlignmentFile, AlignedSegment
-from pysam.ctabix cimport Tabixfile
+from pysam.libcalignmentfile cimport AlignmentFile, AlignedSegment
+from pysam.libctabix cimport Tabixfile
cdef AlignmentFile samfile
cdef Tabixfile tabixfile
-from pysam.calignmentfile cimport AlignmentFile, AlignedSegment
-from pysam.calignmentfile cimport pysam_get_flag
-from pysam.calignmentfile cimport BAM_FPROPER_PAIR, BAM_FPAIRED
+from pysam.libcalignmentfile cimport AlignmentFile, AlignedSegment
+from pysam.libcalignmentfile cimport BAM_FPROPER_PAIR, BAM_FPAIRED
+from pysam.libcalignedsegment cimport pysam_get_flag
def count(AlignmentFile samfile):
cdef int is_proper = 0
+++ /dev/null
-import pysam
-
-import pyximport
-pyximport.install()
-import _cython_flagstat
-
-is_paired, is_proper = _cython_flagstat.count(
- pysam.AlignmentFile("ex1.bam", "rb"))
-
-print ("there are alignments of %i paired reads" % is_paired)
-print ("there are %i proper paired alignments" % is_proper)
empty.bam empty.bam.bai \
explicit_index.bam explicit_index.cram \
faidx_empty_seq.fq.gz \
- ex1.fa.gz ex1.fa.gz.fai
+ ex1.fa.gz ex1.fa.gz.fai \
+ ex1_csi.bam
# ex2.sam - as ex1.sam, but with header
ex2.sam.gz: ex1.bam ex1.bam.bai
ex1.fa.fai:ex1.fa
samtools faidx ex1.fa
+
ex1.bam:ex1.sam.gz ex1.fa.fai
samtools import ex1.fa.fai ex1.sam.gz ex1.bam
ex2_truncated.bam: ex2.bam
head -c 124000 ex2.bam > ex2_truncated.bam
+ex1_csi.bam: ex1.bam
+ cp ex1.bam ex1_csi.bam
+ samtools index -c ex1_csi.bam
+
empty.bam: ex2.sam
grep "^@" $< | samtools view -Sb - > $@
@HD VN:1.0
-@SQ SN:chr1 LN:1575
-@SQ SN:chr2 LN:1584
+@SQ SN:chr1 LN:1575 AH:chr1:5000000-5010000
+@SQ SN:chr2 LN:1584 AH:*
@RG ID:L1 PU:SC_1_10 LB:SC_1 SM:NA12891 CN:name:with:colon
@RG ID:L2 PU:SC_2_12 LB:SC_2 SM:NA12891 CN:name:with:colon
@PG ID:P1 VN:1.0
--- /dev/null
+@HD VN:1.4 SO:coordinate
+@SQ SN:1 LN:248956422
+@SQ SN:2 LN:242193529
+@SQ SN:3 LN:198295559
+@SQ SN:4 LN:190214555
+@SQ SN:5 LN:181538259
+@SQ SN:6 LN:170805979
+@SQ SN:7 LN:159345973
+@SQ SN:8 LN:145138636
+@SQ SN:9 LN:138394717
+@SQ SN:10 LN:133797422
+@SQ SN:11 LN:135086622
+@SQ SN:12 LN:133275309
+@SQ SN:13 LN:114364328
+@SQ SN:14 LN:107043718
+@SQ SN:15 LN:101991189
+@SQ SN:16 LN:90338345
+@SQ SN:17 LN:83257441
+@SQ SN:18 LN:80373285
+@SQ SN:19 LN:58617616
+@SQ SN:20 LN:64444167
+@SQ SN:21 LN:46709983
+@SQ SN:22 LN:50818468
+@SQ SN:X LN:156040895
+@SQ SN:Y LN:57227415
+@SQ SN:MT LN:16569
+@SQ SN:GL000008.2 LN:209709
+@SQ SN:GL000009.2 LN:201709
+@SQ SN:GL000194.1 LN:191469
+@SQ SN:GL000195.1 LN:182896
+@SQ SN:GL000205.2 LN:185591
+@SQ SN:GL000208.1 LN:92689
+@SQ SN:GL000213.1 LN:164239
+@SQ SN:GL000214.1 LN:137718
+@SQ SN:GL000216.2 LN:176608
+@SQ SN:GL000218.1 LN:161147
+@SQ SN:GL000219.1 LN:179198
+@SQ SN:GL000220.1 LN:161802
+@SQ SN:GL000221.1 LN:155397
+@SQ SN:GL000224.1 LN:179693
+@SQ SN:GL000225.1 LN:211173
+@SQ SN:GL000226.1 LN:15008
+@SQ SN:KI270302.1 LN:2274
+@SQ SN:KI270303.1 LN:1942
+@SQ SN:KI270304.1 LN:2165
+@SQ SN:KI270305.1 LN:1472
+@SQ SN:KI270310.1 LN:1201
+@SQ SN:KI270311.1 LN:12399
+@SQ SN:KI270312.1 LN:998
+@SQ SN:KI270315.1 LN:2276
+@SQ SN:KI270316.1 LN:1444
+@SQ SN:KI270317.1 LN:37690
+@SQ SN:KI270320.1 LN:4416
+@SQ SN:KI270322.1 LN:21476
+@SQ SN:KI270329.1 LN:1040
+@SQ SN:KI270330.1 LN:1652
+@SQ SN:KI270333.1 LN:2699
+@SQ SN:KI270334.1 LN:1368
+@SQ SN:KI270335.1 LN:1048
+@SQ SN:KI270336.1 LN:1026
+@SQ SN:KI270337.1 LN:1121
+@SQ SN:KI270338.1 LN:1428
+@SQ SN:KI270340.1 LN:1428
+@SQ SN:KI270362.1 LN:3530
+@SQ SN:KI270363.1 LN:1803
+@SQ SN:KI270364.1 LN:2855
+@SQ SN:KI270366.1 LN:8320
+@SQ SN:KI270371.1 LN:2805
+@SQ SN:KI270372.1 LN:1650
+@SQ SN:KI270373.1 LN:1451
+@SQ SN:KI270374.1 LN:2656
+@SQ SN:KI270375.1 LN:2378
+@SQ SN:KI270376.1 LN:1136
+@SQ SN:KI270378.1 LN:1048
+@SQ SN:KI270379.1 LN:1045
+@SQ SN:KI270381.1 LN:1930
+@SQ SN:KI270382.1 LN:4215
+@SQ SN:KI270383.1 LN:1750
+@SQ SN:KI270384.1 LN:1658
+@SQ SN:KI270385.1 LN:990
+@SQ SN:KI270386.1 LN:1788
+@SQ SN:KI270387.1 LN:1537
+@SQ SN:KI270388.1 LN:1216
+@SQ SN:KI270389.1 LN:1298
+@SQ SN:KI270390.1 LN:2387
+@SQ SN:KI270391.1 LN:1484
+@SQ SN:KI270392.1 LN:971
+@SQ SN:KI270393.1 LN:1308
+@SQ SN:KI270394.1 LN:970
+@SQ SN:KI270395.1 LN:1143
+@SQ SN:KI270396.1 LN:1880
+@SQ SN:KI270411.1 LN:2646
+@SQ SN:KI270412.1 LN:1179
+@SQ SN:KI270414.1 LN:2489
+@SQ SN:KI270417.1 LN:2043
+@SQ SN:KI270418.1 LN:2145
+@SQ SN:KI270419.1 LN:1029
+@SQ SN:KI270420.1 LN:2321
+@SQ SN:KI270422.1 LN:1445
+@SQ SN:KI270423.1 LN:981
+@SQ SN:KI270424.1 LN:2140
+@SQ SN:KI270425.1 LN:1884
+@SQ SN:KI270429.1 LN:1361
+@SQ SN:KI270435.1 LN:92983
+@SQ SN:KI270438.1 LN:112505
+@SQ SN:KI270442.1 LN:392061
+@SQ SN:KI270448.1 LN:7992
+@SQ SN:KI270465.1 LN:1774
+@SQ SN:KI270466.1 LN:1233
+@SQ SN:KI270467.1 LN:3920
+@SQ SN:KI270468.1 LN:4055
+@SQ SN:KI270507.1 LN:5353
+@SQ SN:KI270508.1 LN:1951
+@SQ SN:KI270509.1 LN:2318
+@SQ SN:KI270510.1 LN:2415
+@SQ SN:KI270511.1 LN:8127
+@SQ SN:KI270512.1 LN:22689
+@SQ SN:KI270515.1 LN:6361
+@SQ SN:KI270516.1 LN:1300
+@SQ SN:KI270517.1 LN:3253
+@SQ SN:KI270518.1 LN:2186
+@SQ SN:KI270519.1 LN:138126
+@SQ SN:KI270521.1 LN:7642
+@SQ SN:KI270522.1 LN:5674
+@SQ SN:KI270528.1 LN:2983
+@SQ SN:KI270529.1 LN:1899
+@SQ SN:KI270530.1 LN:2168
+@SQ SN:KI270538.1 LN:91309
+@SQ SN:KI270539.1 LN:993
+@SQ SN:KI270544.1 LN:1202
+@SQ SN:KI270548.1 LN:1599
+@SQ SN:KI270579.1 LN:31033
+@SQ SN:KI270580.1 LN:1553
+@SQ SN:KI270581.1 LN:7046
+@SQ SN:KI270582.1 LN:6504
+@SQ SN:KI270583.1 LN:1400
+@SQ SN:KI270584.1 LN:4513
+@SQ SN:KI270587.1 LN:2969
+@SQ SN:KI270588.1 LN:6158
+@SQ SN:KI270589.1 LN:44474
+@SQ SN:KI270590.1 LN:4685
+@SQ SN:KI270591.1 LN:5796
+@SQ SN:KI270593.1 LN:3041
+@SQ SN:KI270706.1 LN:175055
+@SQ SN:KI270707.1 LN:32032
+@SQ SN:KI270708.1 LN:127682
+@SQ SN:KI270709.1 LN:66860
+@SQ SN:KI270710.1 LN:40176
+@SQ SN:KI270711.1 LN:42210
+@SQ SN:KI270712.1 LN:176043
+@SQ SN:KI270713.1 LN:40745
+@SQ SN:KI270714.1 LN:41717
+@SQ SN:KI270715.1 LN:161471
+@SQ SN:KI270716.1 LN:153799
+@SQ SN:KI270717.1 LN:40062
+@SQ SN:KI270718.1 LN:38054
+@SQ SN:KI270719.1 LN:176845
+@SQ SN:KI270720.1 LN:39050
+@SQ SN:KI270721.1 LN:100316
+@SQ SN:KI270722.1 LN:194050
+@SQ SN:KI270723.1 LN:38115
+@SQ SN:KI270724.1 LN:39555
+@SQ SN:KI270725.1 LN:172810
+@SQ SN:KI270726.1 LN:43739
+@SQ SN:KI270727.1 LN:448248
+@SQ SN:KI270728.1 LN:1872759
+@SQ SN:KI270729.1 LN:280839
+@SQ SN:KI270730.1 LN:112551
+@SQ SN:KI270731.1 LN:150754
+@SQ SN:KI270732.1 LN:41543
+@SQ SN:KI270733.1 LN:179772
+@SQ SN:KI270734.1 LN:165050
+@SQ SN:KI270735.1 LN:42811
+@SQ SN:KI270736.1 LN:181920
+@SQ SN:KI270737.1 LN:103838
+@SQ SN:KI270738.1 LN:99375
+@SQ SN:KI270739.1 LN:73985
+@SQ SN:KI270740.1 LN:37240
+@SQ SN:KI270741.1 LN:157432
+@SQ SN:KI270742.1 LN:186739
+@SQ SN:KI270743.1 LN:210658
+@SQ SN:KI270744.1 LN:168472
+@SQ SN:KI270745.1 LN:41891
+@SQ SN:KI270746.1 LN:66486
+@SQ SN:KI270747.1 LN:198735
+@SQ SN:KI270748.1 LN:93321
+@SQ SN:KI270749.1 LN:158759
+@SQ SN:KI270750.1 LN:148850
+@SQ SN:KI270751.1 LN:150742
+@SQ SN:KI270752.1 LN:27745
+@SQ SN:KI270753.1 LN:62944
+@SQ SN:KI270754.1 LN:40191
+@SQ SN:KI270755.1 LN:36723
+@SQ SN:KI270756.1 LN:79590
+@SQ SN:KI270757.1 LN:71251
+@PG ID:STAR PN:STAR VN:STAR_2.4.1a
+HWI-C00113:131:HMHYWADXX:1:2202:17748:47494 272 1 14792 0 51M * 0 0 GGGCCTCTCACCAGCCCCAGGTCCTTTCCCAGAGATGCCCTTGCGCCTCAT CCCFFFFFHHHHHFHIIJJIJAFHJJJJJGJIIHGIJGGIJJIIJIIJJJG NH:i:6 HI:i:3 AS:i:47 nM:i:1
+HWI-C00113:131:HMHYWADXX:1:2202:17748:47494 272 1 14792 0 38M140N13M * 0 0 GGGCCTCTCACCAGCCCCAGGTCCTTTCCCAGAGATGCCCTTGCGCCTCAT CCCFFFFFHHHHHFHIIJJIJAFHJJJJJGJIIHGIJGGIJJIIJIIJJJG NH:i:6 HI:i:3 AS:i:47 nM:i:1
+HWI-C00113:131:HMHYWADXX:2:1214:7658:35836 272 1 14792 0 38M140N13M * 0 0 GGGCCCCTCACCAGCCCCAGGTCTTTTCCCAGAGATGCCCTTGCGCCTCAT CCCFFFFFHHHHHJJJJJJJJCGHIJJIJJJJJJIJJGIJJIJIJIJJJJI NH:i:6 HI:i:3 AS:i:47 nM:i:1
+HWI-C00113:131:HMHYWADXX:1:2114:4116:44566 272 1 14794 0 36M140N15M * 0 0 GCCCCTCACCAGCCCCAGGTCTTTTCCCAGAGATGCCCTTGCGCCTCATGA <@@DDDDDDFHCFHEFGBE+2AFH@GIEGF=GGHII9F<GHHIIA@6=48; NH:i:6 HI:i:3 AS:i:47 nM:i:1
+HWI-C00113:131:HMHYWADXX:1:1114:13704:81420 272 1 14795 0 35M140N16M * 0 0 CCCCTCACCAGCCCCAGGTCCTTTCCCAGAGATGCCCTTGCGCCTCATGAC @@@DDDDFHBFHHGIGIE3CFHIIIIII<E@FHIGIIC?BFDHDHGIIIII NH:i:6 HI:i:3 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:2115:10483:10806 272 1 14795 0 35M140N16M * 0 0 CCCCTCACCAGCCCCAGGTCCTTTCCCAGAGATGCCCTTGCGCCTCATGAC CCCFFFFFHHHHHJJJIIHHJIJJJJJJHHHIHGIIJJJHJIJJIJJJJJJ NH:i:6 HI:i:3 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:2214:18560:59872 272 1 14795 0 35M140N16M * 0 0 CCCCTCACCAGCCCCAGGTCCTTTCCCAGAGATGCCCTTGCGCCTCATGAC =;?BA@DAFFFFF?EAF;A?9CH?CB9E9?D9FGEGGCGGEHIDBE@FFH; NH:i:6 HI:i:3 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:1115:2028:49488 272 1 14795 0 35M140N16M * 0 0 CCCCTCACCAGCCCCAGGTCCTTTCCCAGAGATGCCCTTGCGCCTCATGAC ???+4@B?FHHFHFGIBEF9BDHCB??CEHGG*1C<FEHAF?(?(@@=B8@ NH:i:6 HI:i:3 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:1115:2949:63319 272 1 14795 0 1S35M140N15M * 0 0 ACCCCTCACCAGCCCCAGGTCTTTTCCCAGAGATGCCCTTGCGCCTCATGA @@CDFDFFHHHDHGIIIEEFGIJJJGIJIIGCGIJJJJJJIGIJIJJJHGA NH:i:6 HI:i:3 AS:i:46 nM:i:1
+HWI-C00113:131:HMHYWADXX:2:1209:18680:84812 272 1 14795 0 35M140N16M * 0 0 CCCCTCACCAGCCCCAGGTCCTTTCCCAGAGATGCCCTTGCGCCTCATGAC CC@FFFFFHHFHHJJJHHEHIIJJJJJIIEGFIJJJIIIIJJIJJJJJIII NH:i:6 HI:i:3 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:2205:10731:40239 272 1 14795 0 35M140N16M * 0 0 CCCCTCACCAGCCCCAGGTCCTTTCCCAGAGATGCCCTTGCGCCTCATGAC @@<=A@DDFB:?FG<F;:3CCBEEBFGIIA?GIAD>B?BFF<BDF<8B8FF NH:i:6 HI:i:3 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:2207:18860:77945 272 1 14795 0 35M140N16M * 0 0 CCCCTCACCAGCCCCAGGTCCTTTCCCAGAGATGCCCTTGCGCCTCATGAC C@CFFFFFHHHHHJJJIJGHIIJJJJJJJJJJJJIJJJJJIJJIJJJJIJJ NH:i:6 HI:i:3 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:1102:4544:68832 272 1 14796 0 34M140N17M * 0 0 CCCTCACCAGCCCCAGGTCCTTTCCCAGAGATGCCCTTGCGCCTCATGACC ???DDDDDBDDD:CE:C2<CFBDDECCEDC>DEE??BD?D@DADD<CC=8B NH:i:6 HI:i:3 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:1203:5829:91963 272 1 14796 0 34M140N17M * 0 0 CCCTCACCAGCCCCAGGTCCTTTCCCAGAGATGCCCTTGCGCCTCATGACC CCCFFFFFHHHHHJJIJIHJIIJGIIJIJJJJJIJJIJGGIJJJGIJJHJI NH:i:6 HI:i:3 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:2110:3018:17806 272 1 14796 0 34M140N17M * 0 0 CCCTCACCAGCCCCAGGTCCTTTCCCAGAGATGCCCTTGCGCCTCATGACC CCCFFFFFHHHFHIJJIFGIJIHIJIIDHIIJGGIIJIJJJJJJIJJIIIJ NH:i:6 HI:i:3 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:1111:17873:10434 272 1 14797 0 33M140N18M * 0 0 CCTCACCAGCCCCAGGTCCTTTCCCAGAGATGCCCTTGCGCCTCATGACCA ???DDDADDDDD8:2<2<FFFEIICEI;E@>DDBDDD@<?0@@D9=<.BBB NH:i:6 HI:i:3 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:1109:10709:93463 272 1 14801 0 29M140N22M * 0 0 ACCAGCCCCAGGTCTTTTCCCAGAGATGCCCTTGCGCCTCATGACCAGCTT CC@DFFFFHHGH<CGDFHIIGIDAEGDGBBGFH@GEH:FHGGHIEFFDHII NH:i:7 HI:i:4 AS:i:47 nM:i:1
+HWI-C00113:131:HMHYWADXX:1:2214:12148:62454 272 1 14801 0 29M140N22M * 0 0 ACCAGCCCCAGGTCTTTTCCCAGAGATGCCCTTGCGCCTCATGACCAGCTT ?@?DDDDD:FF=CFGGIGGBFGA;3<1:EEG>FGHFHFHIHGI@?DGC@CF NH:i:7 HI:i:4 AS:i:47 nM:i:1
+HWI-C00113:131:HMHYWADXX:1:1105:1515:82248 272 1 14802 0 28M140N23M * 0 0 CCAGCCCCAGGTCCTTTCCCAGAGATGCCCTTGCGCCTCATGACCAGCTTG ??:ADBDDDDD:A<+C?AFDB@E?F4<*?:?1:??):??0009??9?(8BC NH:i:7 HI:i:4 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:1110:16355:5537 272 1 14802 0 28M140N23M * 0 0 CCAGCCCCAGGTCCTTTCCCAGAGATGCCCTTGCGCCTCATGACCAGCTTG @CCFFFFFHH?ADHGIJIJJJJJIIEHIJJJJJIJIGIIJJIJJIIJIJJJ NH:i:7 HI:i:4 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:1102:17802:20689 272 1 14805 0 25M140N26M * 0 0 GCTCCAGGTCCTTTCCCAGAGATGCCCTTGCGCCTCATGACCAGCTTGTTG CCCFFFFFHHHHHJJJJJJIJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJI NH:i:7 HI:i:4 AS:i:47 nM:i:1
+HWI-C00113:131:HMHYWADXX:2:1104:7670:95815 272 1 14805 0 25M140N26M * 0 0 GCCCCAGGTCCTTTCCCAGAGATGCCCTTGCGCCTCATGACCAGCTTGTTG @@@DBDDDHHBFDBFGEBBGHG@HIBHIDHBGGGEFBDDDFDGBBBGCHHI NH:i:7 HI:i:4 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:1110:11368:101298 272 1 14805 0 25M140N26M * 0 0 GCCCCAGGTCCTTTCCCAGAGATGCCCTTGCGCCTCATGACCAGCTTGTTG BCCFFFFFCFHHHJJJJJIJJJJJJJJJJJJJGJJJJJJJJJJJJJJJIJJ NH:i:7 HI:i:4 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:1115:2363:85646 272 1 14805 0 25M140N26M * 0 0 GCTCCAGGTCCTTTCCCAGAGATGCCCTTGCGCCTCATGACCAGCTTGTTG @C@FFFB?CFDFHDHGIIIIEGIIIIEDGIIIIIIIIIGGIIGIIGCGHIH NH:i:7 HI:i:4 AS:i:47 nM:i:1
+HWI-C00113:131:HMHYWADXX:2:2213:6044:80821 272 1 14805 0 25M140N26M * 0 0 GCTCCGGGTCCTTTCCCAGAGATGCCCTTGCGCCTCATGACCAGCTTGTTG @@@FFFFFFFBFDGIIJIJGGFHIIIJIIJGIIEHI<FEGIIFEGIHIHGE NH:i:7 HI:i:4 AS:i:45 nM:i:2
+HWI-C00113:131:HMHYWADXX:1:1105:6336:76198 272 1 14807 0 23M140N28M * 0 0 CCCAGGTCCTTTCCCAGAGATGCCCTTGCGCCTCATGACCAGCTTGTTGAA CCCFFFFFHHGHHGGHIIJEHIJIEGIJIJGIIIJICBFFIGAHFHHHJBH NH:i:7 HI:i:4 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:1108:3508:3794 272 1 14807 0 1S23M140N27M * 0 0 GCCCAGGTCCTTTCCCAGAGATGCCCTTGCGCCTCATGACCAGCTTGTTGA CCCFFFFDHHHHHJJJJJJJJJJJJJJJJJJJJJJJJJIJJJJJJJJJIJJ NH:i:7 HI:i:4 AS:i:48 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:2209:11671:4960 272 1 14811 0 19M140N32M * 0 0 GGTCCTTTCCCAGAGATGCCCTTGCGCCTCATGACCAGCTTGTTGAAGAGA BCBFFFFFHHHHHJJGIIGIJJJJJJJJJJJJIIJIJJJJJJHIJIIJIJI NH:i:7 HI:i:4 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:2105:5117:87572 272 1 14812 0 18M140N33M * 0 0 GTCCTTTCCCAGAGATGCCCTTGCGCCTCATGACCAGCTTGTTGAAGAGAT C@CFFFFFFFFHGFEFHIJJJJGIBHIJJJGGCHIEEGIJJFDGGGIGIGI NH:i:7 HI:i:4 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:2202:9099:82513 272 1 14812 0 18M140N33M * 0 0 GTCCTTTCCCAGAGATGCCCTTGCGCCTCATGACCAGCTTGTTGAAGAGAT 8?;DDDDDDFB?A3:<EE<<CGA+<F:F1?D*:*1:))???99??<FB??B NH:i:7 HI:i:4 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:2214:5571:19703 272 1 14812 0 18M140N33M * 0 0 GTCCTTTCCCAGAGATGCCCTTGCGCCTCATGACCAGCTTGTTGAAGAGAT =:?7ADFDHHHDHHGIEF?CFFCBFEG@G>CHIGEGFG?FGHGA>9B8BF@ NH:i:7 HI:i:4 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:1215:4185:31561 272 1 14815 0 15M140N36M * 0 0 CTTTCCCAGAGATGCCCTTGCGCCTCATGACCAGCTTGTTGAAGAGATCCG CCCFFFFFHHHHGJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJIJJJH NH:i:7 HI:i:4 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:2108:1506:70629 272 1 14816 0 14M140N37M * 0 0 TTTCCCAGAGATGCCCTTGCGCCTCATGACCAGCTTGTTGAAGAGATCCGA ?@@;BDDD=DFFDDDFGGFCA?)<CEG@C??C?FDFFB<FGIFDFFDFC;; NH:i:7 HI:i:4 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:1113:4051:71948 272 1 14818 0 12M140N39M * 0 0 TCCCAGAGATGCCCTTGCGCCTCATGACCAGCTTGTTGAAGAGATCCGACA @@@FFDFFF:CDCFGGDHHGEFHIJIJIIGIGHIJDBBDHGI@9BFGIEHI NH:i:7 HI:i:4 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:1101:6114:49389 272 1 15001 0 38M757N13M * 0 0 ATCCGACATCAAGTGCCCACCTTGGCTCGTGGCTCTCACTTGCTCCTGCTC CCCFFFFFHHHHHHIJJJJJJJJIJJJJJJIJJJJJJJJJJJJJGJJJJJJ NH:i:8 HI:i:2 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:1103:12497:23506 272 1 15001 0 38M757N13M * 0 0 ATCCGACATCAAGTGCCCACCTTGGCTCGTGGCTCTCACTTGCTCCTGCTC CCCFFFFFHHHHGJIJJJJJJJJJJJJJJJJJJJJJIJJIJJJJJIJIJJI NH:i:8 HI:i:2 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:2214:19931:4971 272 1 15002 0 37M757N14M * 0 0 TCCTACATCAAGTGCCCACCTTGGCTCGTGGCTCTCACTTGCTCCTGCTCC ?;=+4ADDBBBDAFGGI>1@F?F+AAEEEB<GAG;?DGFE>FFIIF@DE4= NH:i:8 HI:i:2 AS:i:47 nM:i:1
+HWI-C00113:131:HMHYWADXX:1:1108:6828:32713 272 1 15003 0 36M757N15M * 0 0 CCGGCATCAAGTCCCCACCTTGGCTCGTGGCTCTCACTTGCTCCTGCTCCT ?@@ADDDDD?CDD:CFB@:@G@ABGFGFGFBFEAFEEEFCFCF@F=)8=@> NH:i:8 HI:i:2 AS:i:45 nM:i:2
+HWI-C00113:131:HMHYWADXX:1:1111:7491:39504 272 1 15003 0 36M757N15M * 0 0 CCGACATCAAGTGCCCACCTTGGCTCGTGGCTCTCACTTGCTCCTGCTCCT CCCFFDFFFHHHFIGEGHIGIGGDGIJFHEHGGIJJJIJIJJJJJIIIIGI NH:i:8 HI:i:2 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:1212:16079:85811 272 1 15003 0 36M757N15M * 0 0 CCGGCATCAAGTCCCCACCTTGGCTCGTGGCTCTCACTTGCTCCTGCTCCT @CCFFFFFHGHHHJJJJJJJIIJJJJJIJIJJHIIJJJJIJJJJJJJIJJJ NH:i:8 HI:i:2 AS:i:45 nM:i:2
+HWI-C00113:131:HMHYWADXX:1:2101:7167:50357 272 1 15003 0 36M757N15M * 0 0 CCGACATCAAGTGCCCACCTTGGCTCGTGGCTCTCACTTGCTCCTGCTCCT @@@DD?DDFHHHD@?<AGHBEHFGAGIFHEH3??BFGBD@GGCHGGGCHI; NH:i:8 HI:i:2 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:2201:9548:48040 272 1 15003 0 36M757N15M * 0 0 CCGACATCAAGTGCCCACCTTGGCTCGTGGCTCTCACTTGCTCCTGCTCCT CCCFFFFFHHHHHJJJJJJJJJJJJIJHIIIGGIIJJJJHIJJJJIJIJJJ NH:i:8 HI:i:2 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:2201:14017:74222 272 1 15003 0 36M757N15M * 0 0 CCGACATCAAGTGCCCACCTTGGCTCGTGGCTCTCACTTGCTCCTGCTCCT =@?=DDDDDBDCFHB@CG?EF<BC>CG?FHGIIIIG@??BGHIE;8@B<FB NH:i:8 HI:i:2 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:2204:7589:97905 272 1 15003 0 36M757N15M * 0 0 CCGACATCAAGTGCCCACCTTGGCTCGTGGCTCTCACTTGCTCCTGCTCCT CCCFFFFFHHHHDHIGGGJIJJIJJJJIJJJJIGIJIJJIJJJJJIJJIJJ NH:i:8 HI:i:2 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:2212:18929:92726 272 1 15003 0 36M757N15M * 0 0 CCGACATCAAGTGCCCACCTTGGCTCGTGGCTCTCACTTGCTCCTGCTCCT @@@DDDDDFFFF:AE<AFGFEHFFAF8:1:@8:DBBD9BB?/BDF<CDB<F NH:i:8 HI:i:2 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:2215:2615:12154 272 1 15003 0 36M757N15M * 0 0 CCGACATCAAGTGCCCACCTTGGCTCGTGGCTCTCACTTGCTCCTGCTCCT CCCFFFFFGHHHHJJJJJJJJIJJJJJJJJJJJJJJJJJIJJJJJJJJJJI NH:i:8 HI:i:2 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:1106:7741:42827 272 1 15003 0 36M757N15M * 0 0 CCGACATCAAGTGCCCACCTTGGCTCGTGGCTCTCACTTGCTCCTGCTCCT CCCFFFFFHHHHHJJJJJJJJJIJJJIIJJJJIJJJJJJJJJJJIJGHIHH NH:i:8 HI:i:2 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:1201:8380:74978 272 1 15003 0 36M757N15M * 0 0 CCGACATCAAGTGCCCACCTTGGCTCGTGGCTCTCACTTGCTCCTGCTCCT CCCFFFFFHFHHHIJJJJJJJJJJJJJFHIIGJJJJJJJIGGIIJEEDHHI NH:i:8 HI:i:2 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:1205:11268:38021 272 1 15003 0 36M757N15M * 0 0 CCGACATCAAGTCCCCACCTTGGCTCGTGGCTCTCACTTGCTCCTGCTCCT 1:?7DDD?ACDFBGHEAE@FGB@;@@A@C@0?F9FBFCF@48*9==3=CCF NH:i:8 HI:i:2 AS:i:47 nM:i:1
+HWI-C00113:131:HMHYWADXX:2:1208:17413:76793 272 1 15003 0 36M757N15M * 0 0 CCGACATCAAGTGCCCACCTTGGCTCGTGGCTCTCACTTGCTCCTGCTCCT C@@FFFFFGGDDDGHIGGIH<FHGEHB8CEIIJIIFG?FFHFHIJII>FEG NH:i:8 HI:i:2 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:1211:4828:84953 272 1 15003 0 36M757N15M * 0 0 CCGACATCAAGTGCCCACCTTGGCTCGTGGCTCTCACTTGCTCCTGCTCCT @BB?DFFFHHHHHIJIJJJJJJJJJJJHIJJIIJJJJJJIIIJJJJJJIJI NH:i:8 HI:i:2 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:2107:20905:80208 272 1 15003 0 36M757N15M * 0 0 CCGACATCAAGTGCCCACCTTGGCTCGTGGCTCTCACTTGCTCCTGCTCCT @CCFFFFFHHHFBHIIEIIDIHGGGGG@GGHCFGHIIJIGGGGIJIGIGGH NH:i:8 HI:i:2 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:2112:6263:84991 272 1 15003 0 36M757N15M * 0 0 CCGACATCAAGTCCCCACCTTGGCTCGTGGCTCTCACTTGCTCCTGCTCCT @@@?DDDDFBH?FHIGIGIIGG;GHBGCD?DCGIIGHEGBBFHGGIHBFIG NH:i:8 HI:i:2 AS:i:47 nM:i:1
+HWI-C00113:131:HMHYWADXX:2:2202:10314:26844 272 1 15003 0 36M757N15M * 0 0 CCGACATCAAGTGCCCACCTTGGCTCGTGGCTCTCACTTGCTCCTGCTCCT CCCFFFFFHHHHHJJJJJJJJJJJJJJIJJJJIIJJJJJJJJJJJJJJJJJ NH:i:8 HI:i:2 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:2213:21028:90280 272 1 15003 0 36M757N15M * 0 0 CCGACATCAAGTCCCCACCTTGGCTCGTGGCTCTCACTTGCTCCTGCTCCT @@@BDDDAD?FDF9GIBB@@FFG3CFF:DD)?BD*9D@@F4BDEEEFFF8= NH:i:8 HI:i:2 AS:i:47 nM:i:1
+HWI-C00113:131:HMHYWADXX:1:1216:14847:22529 272 1 15004 0 35M757N16M * 0 0 CGACATCAAGTGCCCACCTTGGCTCGTGGCTCTCACTTGCTCCTGCTCCTT @@@FFFFDHHBDHIIIJJJJIIIIIIJJIJJGIJIFIJJIDHHGBEHIJJJ NH:i:8 HI:i:2 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:2111:14281:81135 272 1 15007 0 32M757N19M * 0 0 CATCAAGTGCCCACCTTGGCTCGTGGCTCTCACTTGCTCCTGCTCCTTCTG @@@DDDBD42=:ACFFIE?FFGAFF@FFFDGEAG>D@DBB9BC3D@EDFFA NH:i:8 HI:i:2 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:2203:4824:93169 272 1 15008 0 31M757N20M * 0 0 ATCAAGTGCCCACCTTGGCTCGTGGCTCTCACTTGCTCCTGCTCCTTCTGC CCCFFFFFHHHHHJJJJIJJJJHIJIJJJJJJJJGIJJJJI?DFGFHIHJF NH:i:8 HI:i:2 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:1112:17298:87937 272 1 15925 1 23M659N28M * 0 0 CACTTCCCTGGGAGCTCCCTGGACTGAAGGAGACGCGCTGCTGCTGCTGTC ?@;;BA;3ABC?C?6EGDGIIBA+AAC<?D9CBGG@@FFFFAFCIIECC7= NH:i:4 HI:i:2 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:2210:17342:39133 272 1 15925 1 23M659N28M * 0 0 CACTTCCCTGGGAGCTCCCTGGACTGAAGGAGACGCGCTGCTGCTGCTGTC @?@DDDDDFAB::<EBFGIFG@FF9AECEFIFAGCD:F@F8=@E;7)77@@ NH:i:4 HI:i:2 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:2112:17740:2548 272 1 15931 1 17M659N34M * 0 0 CCTGGGAGCTCCCTGGACTGAAGGAGACGCGCTGCTGCTGCTGTCGTCCTG @@@FFD:ACFDCFCGGGDF?HHIBDEHFGHDHFIIGBDGEEHIFHGIIGHH NH:i:4 HI:i:2 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:2112:15228:46115 272 1 16727 0 39M92N12M * 0 0 GGGGCGGTGGGGGTGGTGTTAGTACCCCATCTTGTAGGTCTTGAGAGGCTC @CCDDFF:?FFHH-@B:AABCB@DDEEDCDCCDCCCCD>ACD>>:9:??2< NH:i:6 HI:i:2 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:2109:14386:93817 272 1 16728 0 38M92N13M * 0 0 GGGCGGTGGGGGTGGTGTTAGTACCCCATCTTGTAGGTCTTGAGAGGCTCG @CCFFFDDHHHHDHIFHIJJJGHHIIJHHHHHHFFFFEFEEEECDDDDDDB NH:i:6 HI:i:2 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:2203:14322:7218 272 1 16741 0 25M110N26M * 0 0 GGTGTTAGTACCCCATCTTGTAGGTCTCAGTGTGGAAGGTGGGCAGTTCTG ?@?DDD?BFFHHFB7EFGGGEFHIHA<CFHIGEHI<FEHH<=DEGG?DGEH NH:i:7 HI:i:6 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:1212:14242:21074 272 1 16751 0 15M92N36M * 0 0 CCCCATCTTGTAGGTCTTGAGAGGCTCGGCTACCTCAGTGTGGAAGGTGGG @@CDFFFFGGBFFECGGGGIIHCHCEG@FAEGII9?*?BB9BFGC@H)=FG NH:i:6 HI:i:2 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:1106:12278:45196 272 1 16752 0 14M92N37M * 0 0 CCCATCTTGTAGGTCTTGAGAGGCTCGGCTACCTCAGTGTGGAAGGTGGGC CCCFFFFFHHHHHHIHIJIJIJJJJJIJFEHHIGGEGHGFHGBGGH8BGH; NH:i:6 HI:i:2 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:2212:12811:3161 272 1 16752 0 14M92N37M * 0 0 CCCATCTTGTAGGTCTTGAGAGGCTCGGCTACCTCAGTGTGGAAGGTGGGC CCCFFFFFHFFHGEGHIJJJJJIIGHFHAFEGGDDCE:DBDGDHFH?CGH@ NH:i:6 HI:i:2 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:1102:2563:17519 272 1 16753 0 13M92N38M * 0 0 CCATCTTGTAGGTCTTGAGAGGCTCGGCTACCTCAGTGTGGAAGGCGGGCA ?<?D>DB8D>822<CC<<<+A;CE?1):C?))1:*0B<9*8*0*((7@4'3 NH:i:6 HI:i:2 AS:i:47 nM:i:1
+HWI-C00113:131:HMHYWADXX:1:2201:2398:40333 16 1 16753 0 13M92N38M * 0 0 CCATCTTGTAGGTCTTGAGAGGCTCGGCTACCTCAGTGTGGAAGGTGGGCA CCCFFFFFHHHDHEHIHIJGF1FFHEBH@FHICHDD<B?DDA@?FD?FHFH NH:i:6 HI:i:1 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:2209:7506:25914 16 1 16753 0 13M92N38M * 0 0 CCATCTTGTAGGTCTTGAGAGGCTCGGCTACCTCAGTGTGGAAGGTGGGCA @C@FDFFFFHHHHIIJIIIJJJJJJBHG=?DFBC<?:?9?FGHCG8BHHD7 NH:i:6 HI:i:1 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:1207:6786:56046 16 1 16753 0 13M92N38M * 0 0 CCATCTTGTAGGTCTTGAGAGGCTCGGCTACCTCAGTGTGGAAGGTGGGCA @B@DFFFFHHHHHGIGIIIAGIJEGAHHEGHHBF>BDG?FHBGEH?FHGG3 NH:i:6 HI:i:1 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:2116:7403:96086 272 1 17020 0 36M177N15M * 0 0 GCCCAGGTCTGGCACATAGAAGTAGTTCTCTGGGACCTGCTGTTCCAGCTG :?=DDD=AAAC:+<CA2C+::AFAC,9<9CA+::CEDDDD>BDIIIIIIA? NH:i:7 HI:i:5 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:1209:11002:81132 272 1 17020 0 36M177N15M * 0 0 GCCCGGGTCTGGCACATAGAAGTAGTTCTCTGGGACCTGCTGTTCCAGCTG @@@DD@A<@DDDF;BCGF<4CHCEG?EG@FGF9)?BB:?B?DBF>D?**9B NH:i:7 HI:i:4 AS:i:47 nM:i:1
+HWI-C00113:131:HMHYWADXX:2:1115:8064:78307 272 1 17021 0 35M177N16M * 0 0 CCCTGGTCTGGCACATAGAAGTAGTTCTCTGGGACCTGCTGTTCCAGCTGC 11844BBDD=FDFEFFFDFI?HEHAFBEHEEEFC?E:FDGDD<FE4:9??9 NH:i:7 HI:i:5 AS:i:47 nM:i:1
+HWI-C00113:131:HMHYWADXX:2:1211:18547:26385 272 1 17027 0 29M177N22M * 0 0 TCTGGCACATAGAAGTAGTTCTCTGGGACCTGCTGTTCCAGCTGCTCTCTC BCCFFFFFHHHHHJJIIJHJJJJJJJIJJJIJJJJJJJJJIJJJJJJJJJJ NH:i:7 HI:i:6 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:2109:12204:47428 272 1 17028 0 28M177N23M * 0 0 CTGGCACATAGAAGTAGTTCTCTGGGACCTGCTGTTCCAGCTGCTCTCTCT @@@DDBD:=ACDBFEGECFGIHD>DH9CBEHHHEEFB?F>GD@3?FB?BB@ NH:i:7 HI:i:6 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:1101:15891:42282 272 1 17028 0 28M177N23M * 0 0 CTGGCACATAGAAGTAGTTCTCTGGGACCTGCTGTTCCAGCTGCTCTCTCT CCCFFFFFHHHHHJHHIIJJJJJJJJJJIIJJJJIJJJIJJJJJJJJJJJJ NH:i:7 HI:i:6 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:1107:10929:6659 272 1 17030 0 26M177N25M * 0 0 GGCACATAGAAGTAGTTCTCTGGGACCTGCTGTTCCAGCTGCTCTCTCTTG CCCFFFFFHHHHDHIHHJJGJJJJJJJIJJIJGIJJJIJJJIJJJJJIJJG NH:i:7 HI:i:6 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:1114:7098:71178 272 1 17030 0 26M177N25M * 0 0 GGCACATAGAAGTAGTTCTCTGGGACCTGCTGTTCCAGCTGCTCTCTCTTG =?@BDEEFHBDHFBGIEGIHEHIGDHGEIIJIIIEHIHIIGHDGHIGIIH@ NH:i:7 HI:i:6 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:1209:3383:100724 272 1 17030 0 26M177N25M * 0 0 GGCACATAGAAGTAGTTCTCTGGGACCTGCTGTTCCAGCTGCTCTCTCTTG ?@@ADDDDDHDH?EEFH<CAHHGCHIF?GG>EHIGIIGHGHIFII>BFIH? NH:i:7 HI:i:6 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:2111:3771:31345 272 1 17030 0 26M177N25M * 0 0 GGCACATAGAAGTAGTTCTCTGGGACCTGCTGTTCCAGCTGCTCTCTCTTG @@@DFFFFGHDHHHJGIHJJJJGIJJIJIJIIJJIIJJIGHIJJJIJJIJ< NH:i:7 HI:i:6 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:2205:14794:36455 272 1 17030 0 26M177N25M * 0 0 GGCACATAGAAGTAGTTCTCTGGGACCTGCTGTTCCAGCTGCTCTCTCTTG CCCFFFFFHHHHHIJJJJJJJJJJJJJJJJJJIJJJJIJJIJJJJJJJJJJ NH:i:7 HI:i:6 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:1107:19701:64552 272 1 17030 0 26M177N25M * 0 0 GGCACATAGAAGTAGTTCTCTGGGACCTGCTGTTCCAGCTGCTCTCTCTTG CCCFFDFFHHHHDGIIJIJJJIIJDGHGJJJJJJIJJJJJJJGIJJJJJJF NH:i:7 HI:i:6 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:1210:18711:88303 272 1 17030 0 26M177N25M * 0 0 GGCACATAGAAGTAGTTCTCTGGGACCTGCTGTTCCAGCTGCTCTCTCTTG CCCFFFFFHHHHHJJJJJJJJIJFIJJJEIIHIIJJIIJJGJJJIJJJJJE NH:i:7 HI:i:6 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:2212:19113:15559 272 1 17030 0 26M177N25M * 0 0 GGCACATAGAAGTAGTTCTCTGGGACCTGCTGTTCCAGCTGCTCTCTCTTG @@@7B>DDC=<AF@<CFB?<,?FFDBF3AD+?9*?EGCF>@BFBGBAF<FG NH:i:7 HI:i:6 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:2212:14258:59619 272 1 17030 0 26M177N25M * 0 0 GGCACATAGAAGTAGTTCTCTGGGACCTGCTGTTCCAGCTGCTCTCTCTTG =?@DDD:=DADFAEFGEF<,AADE<F<?AAFCGG@?FD>CGBF<D<9B<D< NH:i:7 HI:i:6 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:2103:9695:24819 272 1 17031 0 25M177N26M * 0 0 GCACATAGAAGTAGTTCTCTGGGACCTGCTGTTCCAGCTGCTCTCTCTTGC CCCFFFFFHHHHHJHIJJJJJJJJJJJJJJJIJJJJJJJJJJJJJJJJJGG NH:i:7 HI:i:6 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:1204:13994:2816 272 1 17031 0 25M177N26M * 0 0 GCACATAGAAGTAGTTCTCTGGGACCTGCTGTTCCAGCTGCTCTCTCTTGC ?@@DDDDDHHHHFHEFAABA@?FGBEFHIIIHH>DB@DHIHIDD>@@GHID NH:i:7 HI:i:6 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:1212:15591:47491 272 1 17031 0 25M177N26M * 0 0 GCACATAGAAGTAGTTCTCTGGGACCTGCTGTTCCAGCTGCTCTCTCTTGC @@C+ADDDDHFFDEGEGIIIDFHIFHIIIIIGEHIIBH>FGGGHGHFGGII NH:i:7 HI:i:6 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:2215:10125:81395 272 1 17031 0 25M859N26M * 0 0 GCACATAGAAGTAGTTCTCTGGGACCTGCAGGGCCCGCTCGTCCAGGGGGC CCCFFFFFGHHHHJJJJJJJJJHJJJJJJIJIIJJJHIJJJJJJJJJIJHE NH:i:6 HI:i:1 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:2102:9065:90529 16 1 17033 0 2S23M550N26M * 0 0 GTACATAGAAGTAGTTCTCTGGGACAGGTTCTCGGTGGTGTTGAAGAGCAG C@CFFFFFHHHHHJJJJJJJJJJJJJJJJJJJJJJFHIFHIJIJJJJJJJJ NH:i:5 HI:i:2 AS:i:47 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:2204:7767:77376 16 1 17033 0 2S23M550N26M * 0 0 GTACATAGAAGTAGTTCTCTGGGACAGGTTCTCGGTGGTGTTGAAGAGCAG @@@FDFDDBFHADEHEIGIGIJIGHIHG?EDGHGGCFH:B?BD@FGFHGIH NH:i:5 HI:i:2 AS:i:47 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:1212:6793:42000 16 1 17033 0 2S23M550N26M * 0 0 GTACATAGAAGTAGTTCTCTGGGACAGGTTCTCGGTGGTGTTGAAGAGCAG @@?DADBD8CFADGFHIIIIE3A<EC:EHGGGIIB8?80?DDH>9?<FGCD NH:i:5 HI:i:2 AS:i:47 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:1211:14829:37922 272 1 17036 0 20M177N31M * 0 0 TAGAAGTAGTTCTCTGGGACCTGCTGTTCCAGCTGCTCTCTCTTGCTGATG @<@BDBFDFFHAFHFF@GIIIHECFHFGFHICFHFIIIIGIIEGFF<FHII NH:i:7 HI:i:6 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:2216:19937:47046 16 1 17334 1 35M237N16M * 0 0 CAGCCAGGGGGTCCAGGAAGACATACTTCTTCTACAGGTTCTCGGTGGTGT CC@F?DA?FDHHHIIGI@DHHGGFHHHIIAG@F@GFHHGGHEHG7-;FEHE NH:i:4 HI:i:1 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:1111:7653:49738 16 1 17336 1 33M237N18M * 0 0 GCCAGGGGGTCCAGGAAGACATACTTCTTCTACAGGTTCTCGGTGGTGTTG ?8=4BDDDFDFFAGF@@GD?9?FBFDDDBDEFFIII?BDEFFI75F5;65C NH:i:4 HI:i:1 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:2205:11163:47820 16 1 17336 1 33M237N18M * 0 0 GCCAGGGGGTCCAGGAAGACATACTTCTTCTACAGGTTCTCGGTGGTGTTG C@CFFFFFHHHHHJJJJIIJJJIJIJJJJJJJJJJJJJJJJJJGHIAGHII NH:i:4 HI:i:1 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:2208:13311:23997 16 1 17340 1 29M237N22M * 0 0 GGGGGTCCAGGAAGACATACTTCTTCTACAGGTTCTCGGTGGTGTTGAAGA ?@8AD?@D?F>DH?FHGHH@EHGIEHGGIIIGGHIGHGFDEHGH=FHGIIH NH:i:3 HI:i:1 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:2207:3786:78354 16 1 17340 1 29M237N22M * 0 0 GGGGGTCCAGGAAGACATACTTCTTCTACAGGTTCTCGGTGGTGTTGAAGA CCCFFFFFHHHHHJJJJJIIJJJJJJJJJJJJHHIJIHHBFIHIIJJJJJI NH:i:3 HI:i:1 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:1:1115:8438:81914 16 1 17341 1 28M237N23M * 0 0 GGGGTCCAGGAAGACATACTTCTTCTACAGGTTCTCGGTGGTGTTGAAGAG @@CFFFFDHH?HDGGHIIGIGHIGHGIDIIIFGIIGHHDG:?DFHEHIIII NH:i:3 HI:i:1 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:2:1114:13486:49038 16 1 17341 1 28M237N23M * 0 0 GGGGTCCAGGAAGACATACTTCTTCTACAGGTTCTCGGTGGTGTTGAAGAG ?@@:D@DDFHAFFHGGFHFHH@CCHIIIII@:CFGFGGC?D)?8DHHGCGI NH:i:3 HI:i:1 AS:i:49 nM:i:0
+++ /dev/null
-import pysam
-
-is_paired = 0
-is_proper = 0
-
-for read in pysam.AlignmentFile("ex1.bam", "rb"):
- is_paired += read.is_paired
- is_proper += read.is_proper_pair
-
-print ("there are alignments of %i paired reads" % is_paired)
-print ("there are %i proper paired alignments" % is_proper)
# an output file.
statements = [
"view ex1.bam > %(out)s_ex1.view",
+ "view -c ex1.bam > %(out)s_ex1.count",
# ("view -bT ex1.fa -o %(out)s_ex1.view2 ex1.sam",
"sort ex1.bam -o %(out)s_ex1.sort.bam",
"mpileup ex1.bam > %(out)s_ex1.pileup",