From: Afif Elghraoui
Date: Tue, 24 Jan 2017 03:09:05 +0000 (-0800)
Subject: Imported Upstream version 0.10.0+ds
X-Git-Tag: archive/raspbian/0.22.0+ds-1+rpi1~1^2^2~12^2~17
X-Git-Url: https://dgit.raspbian.org/?a=commitdiff_plain;h=dadf905780b2c683a23346e1eece6aa56717ca88;p=python-pysam.git

Imported Upstream version 0.10.0+ds
---

diff --git a/.travis.yml b/.travis.yml
index 1482ed7..bfc5d1c 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -3,14 +3,14 @@ os:
   - osx
 
 language: c
-sudo: required
+sudo: false
 
 env:
   matrix:
     - CONDA_PY=2.7
-    - CONDA_PY=3.3
    - CONDA_PY=3.4
     - CONDA_PY=3.5
+    - CONDA_PY=3.6
 
 addons:
   apt:
diff --git a/bcftools/vcfisec.c b/bcftools/vcfisec.c
index 9afe620..9eb3a7c 100644
--- a/bcftools/vcfisec.c
+++ b/bcftools/vcfisec.c
@@ -317,7 +317,7 @@ static void init_data(args_t *args)
             while (*p && *p!=',') p++;
             if ( *p==',' ) p++;
         }
-        if ( args->nwrite>1 && !args->prefix ) error("Expected -p when mutliple output files given: --write %s\n", args->write_files);
+        if ( args->nwrite>1 && !args->prefix ) error("Expected -p when multiple output files given: --write %s\n", args->write_files);
         if ( args->isec_op==OP_COMPLEMENT && args->nwrite )
         {
             if ( args->nwrite>1 ) error("Multiple files to -w make no sense with -C\n");
diff --git a/bcftools/vcfisec.c.pysam.c b/bcftools/vcfisec.c.pysam.c
index 758d475..e3890d5 100644
--- a/bcftools/vcfisec.c.pysam.c
+++ b/bcftools/vcfisec.c.pysam.c
@@ -319,7 +319,7 @@ static void init_data(args_t *args)
             while (*p && *p!=',') p++;
             if ( *p==',' ) p++;
         }
-        if ( args->nwrite>1 && !args->prefix ) error("Expected -p when mutliple output files given: --write %s\n", args->write_files);
+        if ( args->nwrite>1 && !args->prefix ) error("Expected -p when multiple output files given: --write %s\n", args->write_files);
         if ( args->isec_op==OP_COMPLEMENT && args->nwrite )
         {
             if ( args->nwrite>1 ) error("Multiple files to -w make no sense with -C\n");
diff --git a/benchmark/cython_flagstat.py b/benchmark/cython_flagstat.py
new file mode 100644
index 0000000..6a9b7df
--- /dev/null
+++ b/benchmark/cython_flagstat.py
@@ -0,0 +1,23 @@
+"""compute number of reads/alignments from BAM file
+===================================================
+
+This is a benchmarking utility script with limited functionality.
+
+Compute simple flag stats on a BAM file using
+the pysam cython interface.
+
+"""
+
+import sys
+import pysam
+import pyximport
+pyximport.install()
+import _cython_flagstat
+
+assert len(sys.argv) == 2, "USAGE: {} filename.bam".format(sys.argv[0])
+
+is_paired, is_proper = _cython_flagstat.count(
+    pysam.AlignmentFile(sys.argv[1], "rb"))
+
+print ("there are %i alignments of paired reads" % is_paired)
+print ("there are %i properly paired alignments" % is_proper)
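The _cython_flagstat extension module imported above is compiled on the fly
by pyximport and is not part of this changeset; a pure-Python stand-in with
the same count() interface could look like this (a hypothetical sketch, not
the actual Cython source)::

    # _cython_flagstat.py -- hypothetical stand-in for the Cython module
    def count(samfile):
        # tally paired and properly paired alignments in a single pass
        is_paired = is_proper = 0
        for read in samfile:
            is_paired += read.is_paired
            is_proper += read.is_proper_pair
        return is_paired, is_proper

The Cython build of this loop is what the benchmark compares against the
plain-Python version in python_flagstat.py below.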
+""" + +import sys +import pysam + +assert len(sys.argv) == 2, "USAGE: {} filename.bam".format(sys.argv[0]) + +is_paired = 0 +is_proper = 0 + +for read in pysam.AlignmentFile(sys.argv[1], "rb"): + is_paired += read.is_paired + is_proper += read.is_proper_pair + +print ("there are alignments of %i paired reads" % is_paired) +print ("there are %i proper paired alignments" % is_proper) diff --git a/buildwheels.sh b/buildwheels.sh new file mode 100755 index 0000000..a5987f1 --- /dev/null +++ b/buildwheels.sh @@ -0,0 +1,69 @@ +#!/bin/bash +# +# Build manylinux1 wheels for pysam. Based on the example at +# +# +# It is best to run this in a fresh clone of the repository! +# +# Run this within the repository root: +# docker run --rm -v $(pwd):/io quay.io/pypa/manylinux1_x86_64 /io/buildwheels.sh +# +# The wheels will be put into the wheelhouse/ subdirectory. +# +# For interactive tests: +# docker run -it -v $(pwd):/io quay.io/pypa/manylinux1_x86_64 /bin/bash + +set -xeuo pipefail + +# For convenience, if this script is called from outside of a docker container, +# it starts a container and runs itself inside of it. +if ! grep -q docker /proc/1/cgroup; then + # We are not inside a container + exec docker run --rm -v $(pwd):/io quay.io/pypa/manylinux1_x86_64 /io/$0 +fi + +yum install -y zlib-devel + +# Python 2.6 is not supported +rm -r /opt/python/cp26* + +# Python 3.3 builds fail with: +# /opt/rh/devtoolset-2/root/usr/libexec/gcc/x86_64-CentOS-linux/4.8.2/ld: cannot find -lchtslib +rm -r /opt/python/cp33* + +# Without libcurl support, htslib can open files from HTTP and FTP URLs. +# With libcurl support, it also supports HTTPS and S3 URLs, but libcurl needs a +# current version of OpenSSL, and we do not want to be responsible for +# updating the wheels as soon as there are any security issues. So disable +# libcurl for now. +# See also . +# +export HTSLIB_CONFIGURE_OPTIONS="--disable-libcurl" + +PYBINS="/opt/python/*/bin" +for PYBIN in ${PYBINS}; do + ${PYBIN}/pip install -r /io/requirements.txt + ${PYBIN}/pip wheel /io/ -w wheelhouse/ +done + +# Bundle external shared libraries into the wheels +# +# The '-L .' option is a workaround. By default, auditwheel puts all external +# libraries (.so files) into a .libs directory and sets the RUNPATH to $ORIGIN/.libs. +# When HTSLIB_MODE is 'shared' (now the default), then all so libraries part of +# pysam require that RUNPATH is set to $ORIGIN (without the .libs). It seems +# auditwheel overwrites $ORIGIN with $ORIGIN/.libs. This workaround makes +# auditwheel set the RUNPATH to "$ORIGIN/." and it will work as desired. +# +for whl in wheelhouse/*.whl; do + auditwheel repair -L . $whl -w /io/wheelhouse/ +done + +# Created files are owned by root, so fix permissions. +chown -R --reference=/io/setup.py /io/wheelhouse/ + +# TODO Install packages and test them +#for PYBIN in ${PYBINS}; do +# ${PYBIN}/pip install pysam --no-index -f /io/wheelhouse +# (cd $HOME; ${PYBIN}/nosetests ...) 
diff --git a/buildwheels.sh b/buildwheels.sh
new file mode 100755
index 0000000..a5987f1
--- /dev/null
+++ b/buildwheels.sh
@@ -0,0 +1,69 @@
+#!/bin/bash
+#
+# Build manylinux1 wheels for pysam. Based on the example at
+#
+#
+# It is best to run this in a fresh clone of the repository!
+#
+# Run this within the repository root:
+#   docker run --rm -v $(pwd):/io quay.io/pypa/manylinux1_x86_64 /io/buildwheels.sh
+#
+# The wheels will be put into the wheelhouse/ subdirectory.
+#
+# For interactive tests:
+#   docker run -it -v $(pwd):/io quay.io/pypa/manylinux1_x86_64 /bin/bash
+
+set -xeuo pipefail
+
+# For convenience, if this script is called from outside of a docker container,
+# it starts a container and runs itself inside of it.
+if ! grep -q docker /proc/1/cgroup; then
+  # We are not inside a container
+  exec docker run --rm -v $(pwd):/io quay.io/pypa/manylinux1_x86_64 /io/$0
+fi
+
+yum install -y zlib-devel
+
+# Python 2.6 is not supported
+rm -r /opt/python/cp26*
+
+# Python 3.3 builds fail with:
+# /opt/rh/devtoolset-2/root/usr/libexec/gcc/x86_64-CentOS-linux/4.8.2/ld: cannot find -lchtslib
+rm -r /opt/python/cp33*
+
+# Without libcurl support, htslib can open files from HTTP and FTP URLs.
+# With libcurl support, it also supports HTTPS and S3 URLs, but libcurl needs a
+# current version of OpenSSL, and we do not want to be responsible for
+# updating the wheels as soon as there are any security issues. So disable
+# libcurl for now.
+# See also .
+#
+export HTSLIB_CONFIGURE_OPTIONS="--disable-libcurl"
+
+PYBINS="/opt/python/*/bin"
+for PYBIN in ${PYBINS}; do
+    ${PYBIN}/pip install -r /io/requirements.txt
+    ${PYBIN}/pip wheel /io/ -w wheelhouse/
+done
+
+# Bundle external shared libraries into the wheels
+#
+# The '-L .' option is a workaround. By default, auditwheel puts all external
+# libraries (.so files) into a .libs directory and sets the RUNPATH to $ORIGIN/.libs.
+# When HTSLIB_MODE is 'shared' (now the default), all .so libraries that are part
+# of pysam require that RUNPATH be set to $ORIGIN (without the .libs). It seems
+# auditwheel overwrites $ORIGIN with $ORIGIN/.libs. This workaround makes
+# auditwheel set the RUNPATH to "$ORIGIN/." and it then works as desired.
+#
+for whl in wheelhouse/*.whl; do
+    auditwheel repair -L . $whl -w /io/wheelhouse/
+done
+
+# Created files are owned by root, so fix permissions.
+chown -R --reference=/io/setup.py /io/wheelhouse/
+
+# TODO: Install packages and test them
+#for PYBIN in ${PYBINS}; do
+#    ${PYBIN}/pip install pysam --no-index -f /io/wheelhouse
+#    (cd $HOME; ${PYBIN}/nosetests ...)
+#done
diff --git a/cy_build.py b/cy_build.py
index 880b5cc..29af588 100644
--- a/cy_build.py
+++ b/cy_build.py
@@ -16,7 +16,6 @@ if sys.platform == 'darwin':
     config_vars = get_config_vars()
     config_vars['LDSHARED'] = config_vars['LDSHARED'].replace('-bundle', '')
     config_vars['SHLIB_EXT'] = '.so'
-    config_vars['SO'] = '.so'
 
 
 def is_pip_install():
@@ -61,7 +60,6 @@ class cy_build_ext(build_ext):
             ext.library_dirs.append(os.path.join(self.build_lib, "pysam"))
 
         if sys.platform == 'darwin':
-            relative_module_path = ext.name.replace(".", os.sep) + get_config_vars()["SO"]
 
             if "develop" in sys.argv or "test" in sys.argv:
diff --git a/doc/api.rst b/doc/api.rst
index 671fe4e..686c60d 100644
--- a/doc/api.rst
+++ b/doc/api.rst
@@ -201,7 +201,7 @@ Fastq files
    :members:
 
 
-.. autoclass:: pysam.cfaidx.FastqProxy
+.. autoclass:: pysam.FastqProxy
    :members:
 
 
diff --git a/doc/release.rst b/doc/release.rst
index f49b8f0..1d378f3 100644
--- a/doc/release.rst
+++ b/doc/release.rst
@@ -2,6 +2,35 @@ Release notes
 =============
 
+Release 0.10.0
+==============
+
+This release implements further functionality in the VariantFile API
+and includes several bugfixes:
+
+* treat special case: samtools view with the -c option outputs the count
+  to stdout even if -o is given, fixes #315
+* permit reading BAM files with a CSI index, closes #370
+* raise an error if the query name exceeds the maximum length, fixes #373
+* new method to compute a hash value for AlignedSegment
+* AlignmentFile, VariantFile and TabixFile all inherit from HTSFile
+* avoid a segfault by detecting out-of-range reference_id and
+  next_reference in AlignedSegment.tostring
+* Issue #355: implement streams using file descriptors for VariantFile
+* upgrade to htslib 1.3.2
+* fix compilation with musl libc
+* Issues #316, #360: rename all Cython modules to have lib as a prefix
+* Issue #332: hard-clipped bases in the CIGAR are now included by
+  pysam.AlignedSegment.infer_query_length()
+* added support for the Python 3.6 filename encoding protocol
+* Issue #371: fix incorrect parsing of scalar INFO and FORMAT fields
+  in VariantRecord
+* Issue #331: fix a failure in the VariantFile.reset() method
+* Issue #314: add VariantHeader.new_record(), VariantFile.new_record() and
+  VariantRecord.copy() methods to create new VariantRecord objects
+* added a VariantRecordFilter.add() method to allow setting new
+  VariantRecord filters
+* preliminary (potentially unsafe) support for removing and altering
+  header metadata
+* many minor fixes and improvements to VariantFile and related objects
+
 Release 0.9.1
 =============
 
diff --git a/doc/usage.rst b/doc/usage.rst
index 90e7688..936f3bd 100644
--- a/doc/usage.rst
+++ b/doc/usage.rst
@@ -197,12 +197,22 @@ be retrieved by numeric index:
    for row in tbx.fetch("chr1", 1000, 2000):
        print ("chromosome is", row[0])
 
-By providing a parser argument to :class:`~pysam.AlignmentFile.fetch`
+By providing a parser to :class:`~pysam.AlignmentFile.fetch`
 or :class:`~pysam.TabixFile`, the data will be presented in parsed
-form:
+form::
 
    for row in tbx.fetch("chr1", 1000, 2000, parser=pysam.asTuple()):
        print ("chromosome is", row.contig)
+       print ("first field (chrom)=", row[0])
+
+Pre-built parsers are available for :term:`bed`
+(:class:`~pysam.asBed`) formatted files and :term:`gtf`
+(:class:`~pysam.asGTF`) formatted files. Thus, additional fields
+become available through named access, for example::
+
+   for row in tbx.fetch("chr1", 1000, 2000, parser=pysam.asBed()):
+       print ("name is", row.name)
+
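The :class:`~pysam.asGTF` parser is used the same way; a short sketch
("example.gtf.gz" is a placeholder for a tabix-indexed GTF file, and the
named fields follow the GTF column layout)::

    import pysam

    tbx = pysam.TabixFile("example.gtf.gz")
    for row in tbx.fetch("chr1", 1000, 2000, parser=pysam.asGTF()):
        print ("feature is", row.feature)
        print ("interval is", row.start, "-", row.end)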
 .. Currently inactivated as pileup deprecated
 .. Using the samtools SNP caller
diff --git a/pysam/__init__.py b/pysam/__init__.py
index d1b5d41..ed17e04 100644
--- a/pysam/__init__.py
+++ b/pysam/__init__.py
@@ -3,22 +3,24 @@ import sys
 import sysconfig
 
 from pysam.libchtslib import *
-from pysam.cutils import *
-import pysam.cutils as cutils
-import pysam.cfaidx as cfaidx
-from pysam.cfaidx import *
-import pysam.ctabix as ctabix
-from pysam.ctabix import *
-import pysam.csamfile as csamfile
-from pysam.csamfile import *
-import pysam.calignmentfile as calignmentfile
-from pysam.calignmentfile import *
-import pysam.calignedsegment as calignedsegment
-from pysam.calignedsegment import *
-import pysam.cvcf as cvcf
-from pysam.cvcf import *
-import pysam.cbcf as cbcf
-from pysam.cbcf import *
+from pysam.libcutils import *
+import pysam.libcutils as libcutils
+import pysam.libcfaidx as libcfaidx
+from pysam.libcfaidx import *
+import pysam.libctabix as libctabix
+from pysam.libctabix import *
+import pysam.libcsamfile as libcsamfile
+from pysam.libcsamfile import *
+import pysam.libcalignmentfile as libcalignmentfile
+from pysam.libcalignmentfile import *
+import pysam.libcalignedsegment as libcalignedsegment
+from pysam.libcalignedsegment import *
+import pysam.libcvcf as libcvcf
+from pysam.libcvcf import *
+import pysam.libcbcf as libcbcf
+from pysam.libcbcf import *
+import pysam.libcbgzf as libcbgzf
+from pysam.libcbgzf import *
 from pysam.utils import SamtoolsError
 import pysam.Pileup as Pileup
 from pysam.samtools import *
@@ -28,14 +30,15 @@ import pysam.config
 # export all the symbols from separate modules
 __all__ = \
     libchtslib.__all__ +\
-    cutils.__all__ +\
-    ctabix.__all__ +\
-    cvcf.__all__ +\
-    cbcf.__all__ +\
-    cfaidx.__all__ +\
-    calignmentfile.__all__ +\
-    calignedsegment.__all__ +\
-    csamfile.__all__ +\
+    libcutils.__all__ +\
+    libctabix.__all__ +\
+    libcvcf.__all__ +\
+    libcbcf.__all__ +\
+    libcbgzf.__all__ +\
+    libcfaidx.__all__ +\
+    libcalignmentfile.__all__ +\
+    libcalignedsegment.__all__ +\
+    libcsamfile.__all__ +\
     ["SamtoolsError"] +\
     ["Pileup"]
 
@@ -75,25 +78,17 @@ def get_defines():
 
 def get_libraries():
     '''return a list of libraries to link against.'''
-    # Note that this list does not include csamtools.so as there are
+    # Note that this list does not include libcsamtools.so as there are
     # numerous name conflicts with libchtslib.so.
     dirname = os.path.abspath(os.path.join(os.path.dirname(__file__)))
-    pysam_libs = ['ctabixproxies',
-                  'cfaidx',
-                  'csamfile',
-                  'cvcf',
-                  'cbcf',
-                  'ctabix']
+    pysam_libs = ['libctabixproxies',
+                  'libcfaidx',
+                  'libcsamfile',
+                  'libcvcf',
+                  'libcbcf',
+                  'libctabix']
     if pysam.config.HTSLIB == "builtin":
         pysam_libs.append('libchtslib')
-    if sys.version_info.major >= 3:
-        if sys.version_info.minor >= 5:
-            return [os.path.join(dirname, x + ".{}.so".format(
-                sysconfig.get_config_var('SOABI'))) for x in pysam_libs]
-        else:
-            return [os.path.join(dirname, x + ".{}{}.so".format(
-                sys.implementation.cache_tag,
-                sys.abiflags)) for x in pysam_libs]
-    else:
-        return [os.path.join(dirname, x + ".so") for x in pysam_libs]
+    so = sysconfig.get_config_var('SO')
+    return [os.path.join(dirname, x + so) for x in pysam_libs]
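With this change get_libraries() derives the shared-object suffix from
sysconfig's "SO" variable on all Python versions instead of branching on the
interpreter version. Together with get_include() and get_defines() it lets
third-party Cython extensions build against pysam; one plausible setup.py
wiring (the "mymod" names are hypothetical, and passing the .so paths via
extra_link_args is an assumption, not a documented recipe)::

    # hypothetical setup.py fragment for an extension that cimports pysam
    from setuptools import Extension, setup
    from Cython.Build import cythonize
    import pysam

    extensions = [Extension(
        "mymod", ["mymod.pyx"],
        include_dirs=pysam.get_include(),       # pysam and htslib headers
        define_macros=pysam.get_defines(),
        extra_link_args=pysam.get_libraries(),  # pysam's shared objects
    )]

    setup(name="mymod", ext_modules=cythonize(extensions))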
diff --git a/pysam/calignedsegment.pxd b/pysam/calignedsegment.pxd
deleted file mode 100644
index 0880bef..0000000
--- a/pysam/calignedsegment.pxd
+++ /dev/null
@@ -1,91 +0,0 @@
-from pysam.chtslib cimport *
-
-cdef extern from "htslib_util.h":
-
-    # add *nbytes* into the variable length data of *src* at *pos*
-    bam1_t * pysam_bam_update(bam1_t * b,
-                              size_t nbytes_old,
-                              size_t nbytes_new,
-                              uint8_t * pos)
-
-    # now: static
-    int aux_type2size(int)
-
-    char * pysam_bam_get_qname(bam1_t * b)
-    uint32_t * pysam_bam_get_cigar(bam1_t * b)
-    uint8_t * pysam_bam_get_seq(bam1_t * b)
-    uint8_t * pysam_bam_get_qual(bam1_t * b)
-    uint8_t * pysam_bam_get_aux(bam1_t * b)
-    int pysam_bam_get_l_aux(bam1_t * b)
-    char pysam_bam_seqi(uint8_t * s, int i)
-
-    uint16_t pysam_get_bin(bam1_t * b)
-    uint8_t pysam_get_qual(bam1_t * b)
-    uint8_t pysam_get_l_qname(bam1_t * b)
-    uint16_t pysam_get_flag(bam1_t * b)
-    uint16_t pysam_get_n_cigar(bam1_t * b)
-    void pysam_set_bin(bam1_t * b, uint16_t v)
-    void pysam_set_qual(bam1_t * b, uint8_t v)
-    void pysam_set_l_qname(bam1_t * b, uint8_t v)
-    void pysam_set_flag(bam1_t * b, uint16_t v)
-    void pysam_set_n_cigar(bam1_t * b, uint16_t v)
-    void pysam_update_flag(bam1_t * b, uint16_t v, uint16_t flag)
-
-
-from pysam.calignmentfile cimport AlignmentFile
-ctypedef AlignmentFile AlignmentFile_t
-
-
-# Note: need to declare all C fields and methods here
-cdef class AlignedSegment:
-
-    # object that this AlignedSegment represents
-    cdef bam1_t * _delegate
-
-    # the file from which this AlignedSegment originates (can be None)
-    cdef AlignmentFile _alignment_file
-
-    # caching of array properties for quick access
-    cdef object cache_query_qualities
-    cdef object cache_query_alignment_qualities
-    cdef object cache_query_sequence
-    cdef object cache_query_alignment_sequence
-
-    # add an alignment tag with value to the AlignedSegment
-    # an existing tag of the same name will be replaced.
-    cpdef set_tag(self, tag, value, value_type=?, replace=?)
-
-    # add an alignment tag with value to the AlignedSegment
-    # an existing tag of the same name will be replaced.
-    cpdef get_tag(self, tag, with_value_type=?)
- - # return true if tag exists - cpdef has_tag(self, tag) - - # returns a valid sam alignment string - cpdef tostring(self, AlignmentFile_t handle) - - -cdef class PileupColumn: - cdef bam_pileup1_t ** plp - cdef int tid - cdef int pos - cdef int n_pu - cdef AlignmentFile _alignment_file - - -cdef class PileupRead: - cdef AlignedSegment _alignment - cdef int32_t _qpos - cdef int _indel - cdef int _level - cdef uint32_t _is_del - cdef uint32_t _is_head - cdef uint32_t _is_tail - cdef uint32_t _is_refskip - -# factor methods -cdef makeAlignedSegment(bam1_t * src, AlignmentFile alignment_file) -cdef makePileupColumn(bam_pileup1_t ** plp, int tid, int pos, int n_pu, AlignmentFile alignment_file) -cdef inline makePileupRead(bam_pileup1_t * src, AlignmentFile alignment_file) -cdef inline uint32_t get_alignment_length(bam1_t * src) diff --git a/pysam/calignedsegment.pyx b/pysam/calignedsegment.pyx deleted file mode 100644 index f4e0750..0000000 --- a/pysam/calignedsegment.pyx +++ /dev/null @@ -1,2457 +0,0 @@ -# cython: embedsignature=True -# cython: profile=True -############################################################################### -############################################################################### -# Cython wrapper for SAM/BAM/CRAM files based on htslib -############################################################################### -# The principal classes defined in this module are: -# -# class AlignedSegment an aligned segment (read) -# -# class PileupColumn a collection of segments (PileupRead) aligned to -# a particular genomic position. -# -# class PileupRead an AlignedSegment aligned to a particular genomic -# position. Contains additional attributes with respect -# to this. -# -# Additionally this module defines numerous additional classes that are part -# of the internal API. These are: -# -# Various iterator classes to iterate over alignments in sequential (IteratorRow) -# or in a stacked fashion (IteratorColumn): -# -# class IteratorRow -# class IteratorRowRegion -# class IteratorRowHead -# class IteratorRowAll -# class IteratorRowAllRefs -# class IteratorRowSelection -# -############################################################################### -# -# The MIT License -# -# Copyright (c) 2015 Andreas Heger -# -# Permission is hereby granted, free of charge, to any person obtaining a -# copy of this software and associated documentation files (the "Software"), -# to deal in the Software without restriction, including without limitation -# the rights to use, copy, modify, merge, publish, distribute, sublicense, -# and/or sell copies of the Software, and to permit persons to whom the -# Software is furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -# DEALINGS IN THE SOFTWARE. 
-# -############################################################################### -import re -import array -import ctypes -import struct - -cimport cython -from cpython cimport array as c_array -from cpython.version cimport PY_MAJOR_VERSION -from cpython cimport PyErr_SetString, PyBytes_FromStringAndSize -from libc.string cimport strchr -from cpython cimport array as c_array - -from pysam.cutils cimport force_bytes, force_str, \ - charptr_to_str, charptr_to_bytes -from pysam.cutils cimport qualities_to_qualitystring, qualitystring_to_array, \ - array_to_qualitystring - -# Constants for binary tag conversion -cdef char * htslib_types = 'cCsSiIf' -cdef char * parray_types = 'bBhHiIf' - -# translation tables - -# cigar code to character and vice versa -cdef char* CODE2CIGAR= "MIDNSHP=XB" -cdef int NCIGAR_CODES = 10 - -if PY_MAJOR_VERSION >= 3: - CIGAR2CODE = dict([y, x] for x, y in enumerate(CODE2CIGAR)) -else: - CIGAR2CODE = dict([ord(y), x] for x, y in enumerate(CODE2CIGAR)) - -CIGAR_REGEX = re.compile("(\d+)([MIDNSHP=XB])") - -##################################################################### -# typecode guessing -cdef inline char map_typecode_htslib_to_python(uint8_t s): - """map an htslib typecode to the corresponding python typecode - to be used in the struct or array modules.""" - - # map type from htslib to python array - cdef char * f = strchr(htslib_types, s) - - if f == NULL: - return 0 - return parray_types[f - htslib_types] - -cdef inline uint8_t map_typecode_python_to_htslib(char s): - """determine value type from type code of array""" - cdef char * f = strchr(parray_types, s) - if f == NULL: - return 0 - return htslib_types[f - parray_types] - -# optional tag data manipulation -cdef convert_binary_tag(uint8_t * tag): - """return bytesize, number of values and array of values - in aux_data memory location pointed to by tag.""" - cdef uint8_t auxtype - cdef uint8_t byte_size - cdef int32_t nvalues - # get byte size - auxtype = tag[0] - byte_size = aux_type2size(auxtype) - tag += 1 - # get number of values in array - nvalues = (tag)[0] - tag += 4 - - # define python array - cdef c_array.array c_values = array.array( - chr(map_typecode_htslib_to_python(auxtype))) - c_array.resize(c_values, nvalues) - - # copy data - memcpy(c_values.data.as_voidptr, tag, nvalues * byte_size) - - # no need to check for endian-ness as bam1_core_t fields - # and aux_data are in host endian-ness. See sam.c and calls - # to swap_data - return byte_size, nvalues, c_values - - -cdef inline uint8_t get_value_code(value, value_type=None): - '''guess type code for a *value*. If *value_type* is None, - the type code will be inferred based on the Python type of - *value*''' - cdef uint8_t typecode - cdef char * _char_type - - if value_type is None: - if isinstance(value, int): - typecode = 'i' - elif isinstance(value, float): - typecode = 'd' - elif isinstance(value, str): - typecode = 'Z' - elif isinstance(value, bytes): - typecode = 'Z' - elif isinstance(value, array.array) or \ - isinstance(value, list) or \ - isinstance(value, tuple): - typecode = 'B' - else: - return 0 - else: - if value_type not in 'Zidf': - return 0 - value_type = force_bytes(value_type) - _char_type = value_type - typecode = (_char_type)[0] - - return typecode - - -cdef inline bytes getTypecode(value, maximum_value=None): - '''returns the value typecode of a value. - - If max is specified, the approprite type is - returned for a range where value is the minimum. 
- ''' - - if maximum_value is None: - maximum_value = value - - cdef bytes valuetype - - t = type(value) - - if t is float: - valuetype = b'f' - elif t is int: - # signed ints - if value < 0: - if value >= -128 and maximum_value < 128: - valuetype = b'c' - elif value >= -32768 and maximum_value < 32768: - valuetype = b's' - elif value < -2147483648 or maximum_value >= 2147483648: - raise ValueError( - "at least one signed integer out of range of " - "BAM/SAM specification") - else: - valuetype = b'i' - # unsigned ints - else: - if maximum_value < 256: - valuetype = b'C' - elif maximum_value < 65536: - valuetype = b'S' - elif maximum_value >= 4294967296: - raise ValueError( - "at least one integer out of range of BAM/SAM specification") - else: - valuetype = b'I' - else: - # Note: hex strings (H) are not supported yet - if t is not bytes: - value = value.encode('ascii') - if len(value) == 1: - valuetype = b'A' - else: - valuetype = b'Z' - - return valuetype - - -cdef inline packTags(tags): - """pack a list of tags. Each tag is a tuple of (tag, tuple). - - Values are packed into the most space efficient data structure - possible unless the tag contains a third field with the typecode. - - Returns a format string and the associated list of arguments - to be used in a call to struct.pack_into. - """ - fmts, args = ["<"], [] - - cdef char array_typecode - - datatype2format = { - b'c': ('b', 1), - b'C': ('B', 1), - b's': ('h', 2), - b'S': ('H', 2), - b'i': ('i', 4), - b'I': ('I', 4), - b'f': ('f', 4), - b'A': ('c', 1)} - - for tag in tags: - - if len(tag) == 2: - pytag, value = tag - valuetype = None - elif len(tag) == 3: - pytag, value, valuetype = tag - else: - raise ValueError("malformatted tag: %s" % str(tag)) - - pytag = force_bytes(pytag) - valuetype = force_bytes(valuetype) - t = type(value) - - if t is tuple or t is list: - # binary tags from tuples or lists - if valuetype is None: - # automatically determine value type - first value - # determines type. If there is a mix of types, the - # result is undefined. - valuetype = getTypecode(min(value), max(value)) - - if valuetype not in datatype2format: - raise ValueError("invalid value type '%s'" % valuetype) - - datafmt = "2sccI%i%s" % (len(value), datatype2format[valuetype][0]) - args.extend([pytag[:2], - b"B", - valuetype, - len(value)] + list(value)) - - elif isinstance(value, array.array): - # binary tags from arrays - if valuetype is None: - array_typecode = map_typecode_python_to_htslib(ord(value.typecode)) - - if array_typecode == 0: - raise ValueError("unsupported type code '{}'" - .format(value.typecode)) - - valuetype = force_bytes(chr(array_typecode)) - - if valuetype not in datatype2format: - raise ValueError("invalid value type '%s' (%s)" % - (valuetype, type(valuetype))) - - # use array.tostring() to retrieve byte representation and - # save as bytes - datafmt = "2sccI%is" % (len(value) * datatype2format[valuetype][1]) - args.extend([pytag[:2], - b"B", - valuetype, - len(value), - force_bytes(value.tostring())]) - - else: - if valuetype is None: - valuetype = getTypecode(value) - - if valuetype in b"AZ": - value = force_bytes(value) - - if valuetype == b"Z": - datafmt = "2sc%is" % (len(value)+1) - else: - datafmt = "2sc%s" % datatype2format[valuetype][0] - - args.extend([pytag[:2], - valuetype, - value]) - - fmts.append(datafmt) - - return "".join(fmts), args - - -cdef inline int32_t calculateQueryLength(bam1_t * src): - """return query length computed from CIGAR alignment. - - Return 0 if there is no CIGAR alignment. 
- """ - - cdef uint32_t * cigar_p = pysam_bam_get_cigar(src) - - if cigar_p == NULL: - return 0 - - cdef uint32_t k, qpos - cdef int op - qpos = 0 - - for k from 0 <= k < pysam_get_n_cigar(src): - op = cigar_p[k] & BAM_CIGAR_MASK - - if op == BAM_CMATCH or op == BAM_CINS or \ - op == BAM_CSOFT_CLIP or \ - op == BAM_CEQUAL or op == BAM_CDIFF: - qpos += cigar_p[k] >> BAM_CIGAR_SHIFT - - return qpos - - -cdef inline int32_t getQueryStart(bam1_t *src) except -1: - cdef uint32_t * cigar_p - cdef uint32_t k, op - cdef uint32_t start_offset = 0 - - if pysam_get_n_cigar(src): - cigar_p = pysam_bam_get_cigar(src); - for k from 0 <= k < pysam_get_n_cigar(src): - op = cigar_p[k] & BAM_CIGAR_MASK - if op == BAM_CHARD_CLIP: - if start_offset != 0 and start_offset != src.core.l_qseq: - PyErr_SetString(ValueError, 'Invalid clipping in CIGAR string') - return -1 - elif op == BAM_CSOFT_CLIP: - start_offset += cigar_p[k] >> BAM_CIGAR_SHIFT - else: - break - - return start_offset - - -cdef inline int32_t getQueryEnd(bam1_t *src) except -1: - cdef uint32_t * cigar_p - cdef uint32_t k, op - cdef uint32_t end_offset = src.core.l_qseq - - # if there is no sequence, compute length from cigar string - if end_offset == 0: - end_offset = calculateQueryLength(src) - - # walk backwards in cigar string - if pysam_get_n_cigar(src) > 1: - cigar_p = pysam_bam_get_cigar(src); - for k from pysam_get_n_cigar(src) > k >= 1: - op = cigar_p[k] & BAM_CIGAR_MASK - if op == BAM_CHARD_CLIP: - if end_offset != 0 and end_offset != src.core.l_qseq: - PyErr_SetString(ValueError, - 'Invalid clipping in CIGAR string') - return -1 - elif op == BAM_CSOFT_CLIP: - end_offset -= cigar_p[k] >> BAM_CIGAR_SHIFT - else: - break - - return end_offset - - -cdef inline bytes getSequenceInRange(bam1_t *src, - uint32_t start, - uint32_t end): - """return python string of the sequence in a bam1_t object. - """ - - cdef uint8_t * p - cdef uint32_t k - cdef char * s - - if not src.core.l_qseq: - return None - - seq = PyBytes_FromStringAndSize(NULL, end - start) - s = seq - p = pysam_bam_get_seq(src) - - for k from start <= k < end: - # equivalent to seq_nt16_str[bam1_seqi(s, i)] (see bam.c) - # note: do not use string literal as it will be a python string - s[k-start] = seq_nt16_str[p[k/2] >> 4 * (1 - k%2) & 0xf] - - return charptr_to_bytes(seq) - - -cdef inline object getQualitiesInRange(bam1_t *src, - uint32_t start, - uint32_t end): - """return python array of quality values from a bam1_t object""" - - cdef uint8_t * p - cdef uint32_t k - - p = pysam_bam_get_qual(src) - if p[0] == 0xff: - return None - - # 'B': unsigned char - cdef c_array.array result = array.array('B', [0]) - c_array.resize(result, end - start) - - # copy data - memcpy(result.data.as_voidptr, &p[start], end - start) - - return result - - -##################################################################### -## private factory methods -cdef class AlignedSegment -cdef makeAlignedSegment(bam1_t * src, AlignmentFile alignment_file): - '''return an AlignedSegment object constructed from `src`''' - # note that the following does not call __init__ - cdef AlignedSegment dest = AlignedSegment.__new__(AlignedSegment) - dest._delegate = bam_dup1(src) - dest._alignment_file = alignment_file - return dest - - -cdef class PileupColumn -cdef makePileupColumn(bam_pileup1_t ** plp, int tid, int pos, - int n_pu, AlignmentFile alignment_file): - '''return a PileupColumn object constructed from pileup in `plp` and - setting additional attributes. 
- - ''' - # note that the following does not call __init__ - cdef PileupColumn dest = PileupColumn.__new__(PileupColumn) - dest._alignment_file = alignment_file - dest.plp = plp - dest.tid = tid - dest.pos = pos - dest.n_pu = n_pu - return dest - -cdef class PileupRead -cdef inline makePileupRead(bam_pileup1_t * src, AlignmentFile alignment_file): - '''return a PileupRead object construted from a bam_pileup1_t * object.''' - cdef PileupRead dest = PileupRead.__new__(PileupRead) - dest._alignment = makeAlignedSegment(src.b, alignment_file) - dest._qpos = src.qpos - dest._indel = src.indel - dest._level = src.level - dest._is_del = src.is_del - dest._is_head = src.is_head - dest._is_tail = src.is_tail - dest._is_refskip = src.is_refskip - return dest - - -cdef inline uint32_t get_alignment_length(bam1_t * src): - cdef int k = 0 - cdef uint32_t l = 0 - if src == NULL: - return 0 - cdef uint32_t * cigar_p = bam_get_cigar(src) - if cigar_p == NULL: - return 0 - cdef int op - cdef int n = pysam_get_n_cigar(src) - for k from 0 <= k < n: - op = cigar_p[k] & BAM_CIGAR_MASK - if op == BAM_CSOFT_CLIP or op == BAM_CHARD_CLIP: - continue - l += cigar_p[k] >> BAM_CIGAR_SHIFT - return l - - -# TODO: avoid string copying for getSequenceInRange, reconstituneSequenceFromMD, ... -cdef inline bytes build_alignment_sequence(bam1_t * src): - """return expanded sequence from MD tag. - - The sequence includes substitutions and both insertions in the - reference as well as deletions to the reference sequence. Combine - with the cigar string to reconstitute the query or the reference - sequence. - - Positions corresponding to `N` (skipped region from the reference) - in the CIGAR string will not appear in the returned sequence. The - MD should correspondingly not contain these. Thus proper tags are:: - - Deletion from the reference: cigar=5M1D5M MD=5^C5 - Skipped region from reference: cigar=5M1N5M MD=10 - - Returns - ------- - - None, if no MD tag is present. - - """ - if src == NULL: - return None - - cdef uint32_t start = getQueryStart(src) - cdef uint32_t end = getQueryEnd(src) - # get read sequence, taking into account soft-clipping - r = getSequenceInRange(src, start, end) - cdef char * read_sequence = r - cdef uint32_t * cigar_p = pysam_bam_get_cigar(src) - if cigar_p == NULL: - return None - - cdef uint32_t r_idx = 0 - cdef int op - cdef uint32_t k, i, l, x - cdef int nmatches = 0 - cdef int s_idx = 0 - - cdef uint32_t max_len = get_alignment_length(src) - if max_len == 0: - raise ValueError("could not determine alignment length") - - cdef char * s = calloc(max_len + 1, sizeof(char)) - if s == NULL: - raise ValueError( - "could not allocated sequence of length %i" % max_len) - - for k from 0 <= k < pysam_get_n_cigar(src): - op = cigar_p[k] & BAM_CIGAR_MASK - l = cigar_p[k] >> BAM_CIGAR_SHIFT - if op == BAM_CMATCH or op == BAM_CEQUAL or op == BAM_CDIFF: - for i from 0 <= i < l: - s[s_idx] = read_sequence[r_idx] - r_idx += 1 - s_idx += 1 - elif op == BAM_CDEL: - for i from 0 <= i < l: - s[s_idx] = '-' - s_idx += 1 - elif op == BAM_CREF_SKIP: - pass - elif op == BAM_CINS: - for i from 0 <= i < l: - # encode insertions into reference as lowercase - s[s_idx] = read_sequence[r_idx] + 32 - r_idx += 1 - s_idx += 1 - elif op == BAM_CSOFT_CLIP: - pass - elif op == BAM_CHARD_CLIP: - pass # advances neither - elif op == BAM_CPAD: - raise NotImplementedError( - "Padding (BAM_CPAD, 6) is currently not supported. " - "Please implement. 
Sorry about that.") - - cdef uint8_t * md_tag_ptr = bam_aux_get(src, "MD") - if md_tag_ptr == NULL: - seq = PyBytes_FromStringAndSize(s, s_idx) - free(s) - return seq - - cdef char * md_tag = bam_aux2Z(md_tag_ptr) - cdef int md_idx = 0 - s_idx = 0 - - while md_tag[md_idx] != 0: - # c is numerical - if md_tag[md_idx] >= 48 and md_tag[md_idx] <= 57: - nmatches *= 10 - nmatches += md_tag[md_idx] - 48 - md_idx += 1 - continue - else: - # save matches up to this point, skipping insertions - for x from 0 <= x < nmatches: - while s[s_idx] >= 'a': - s_idx += 1 - s_idx += 1 - while s[s_idx] >= 'a': - s_idx += 1 - - r_idx += nmatches - nmatches = 0 - if md_tag[md_idx] == '^': - md_idx += 1 - while md_tag[md_idx] >= 65 and md_tag[md_idx] <= 90: - assert s[s_idx] == '-' - s[s_idx] = md_tag[md_idx] - s_idx += 1 - md_idx += 1 - else: - # save mismatch and change to lower case - s[s_idx] = md_tag[md_idx] + 32 - s_idx += 1 - r_idx += 1 - md_idx += 1 - - # save matches up to this point, skipping insertions - for x from 0 <= x < nmatches: - while s[s_idx] >= 'a': - s_idx += 1 - s_idx += 1 - while s[s_idx] >= 'a': - s_idx += 1 - - seq = PyBytes_FromStringAndSize(s, s_idx) - free(s) - - return seq - - -cdef class AlignedSegment: - '''Class representing an aligned segment. - - This class stores a handle to the samtools C-structure representing - an aligned read. Member read access is forwarded to the C-structure - and converted into python objects. This implementation should be fast, - as only the data needed is converted. - - For write access, the C-structure is updated in-place. This is - not the most efficient way to build BAM entries, as the variable - length data is concatenated and thus needs to be resized if - a field is updated. Furthermore, the BAM entry might be - in an inconsistent state. - - One issue to look out for is that the sequence should always - be set *before* the quality scores. Setting the sequence will - also erase any quality scores that were set previously. - ''' - - # Now only called when instances are created from Python - def __init__(self): - # see bam_init1 - self._delegate = calloc(1, sizeof(bam1_t)) - # allocate some memory. If size is 0, calloc does not return a - # pointer that can be passed to free() so allocate 40 bytes - # for a new read - self._delegate.m_data = 40 - self._delegate.data = calloc( - self._delegate.m_data, 1) - self._delegate.l_data = 0 - - # caching for selected fields - self.cache_query_qualities = None - self.cache_query_alignment_qualities = None - self.cache_query_sequence = None - self.cache_query_alignment_sequence = None - - def __dealloc__(self): - bam_destroy1(self._delegate) - - def __str__(self): - """return string representation of alignment. - - The representation is an approximate :term:`SAM` format, because - an aligned read might not be associated with a :term:`AlignmentFile`. - As a result :term:`tid` is shown instead of the reference name. - Similarly, the tags field is returned in its parsed state. - - To get a valid SAM record, use :meth:`tostring`. - """ - # sam-parsing is done in sam.c/bam_format1_core which - # requires a valid header. 
- return "\t".join(map(str, (self.query_name, - self.flag, - self.reference_id, - self.reference_start, - self.mapping_quality, - self.cigarstring, - self.next_reference_id, - self.next_reference_start, - self.query_alignment_length, - self.query_sequence, - self.query_qualities, - self.tags))) - - def __copy__(self): - return makeAlignedSegment(self._delegate, self._alignment_file) - - def __deepcopy__(self, memo): - return makeAlignedSegment(self._delegate, self._alignment_file) - - def compare(self, AlignedSegment other): - '''return -1,0,1, if contents in this are binary - <,=,> to *other* - - ''' - - cdef int retval, x - cdef bam1_t *t - cdef bam1_t *o - - t = self._delegate - o = other._delegate - - # uncomment for debugging purposes - # cdef unsigned char * oo, * tt - # tt = (&t.core) - # oo = (&o.core) - # for x from 0 <= x < sizeof( bam1_core_t): print x, tt[x], oo[x] - # tt = (t.data) - # oo = (o.data) - # for x from 0 <= x < max(t.l_data, o.l_data): print x, tt[x], oo[x], chr(tt[x]), chr(oo[x]) - - # Fast-path test for object identity - if t == o: - return 0 - - retval = memcmp(&t.core, &o.core, sizeof(bam1_core_t)) - - if retval: - return retval - # cmp(t.l_data, o.l_data) - retval = (t.l_data > o.l_data) - (t.l_data < o.l_data) - if retval: - return retval - return memcmp(t.data, o.data, t.l_data) - - def __richcmp__(self, AlignedSegment other, int op): - if op == 2: # == operator - return self.compare(other) == 0 - elif op == 3: # != operator - return self.compare(other) != 0 - else: - return NotImplemented - - def __hash__(self): - cdef bam1_t * src - src = self._delegate - # shift and xor values in the core structure - # make sure tid and mtid are shifted by different amounts - # should variable length data be included? - cdef uint32_t hash_value = src.core.tid << 24 ^ \ - src.core.pos << 16 ^ \ - src.core.qual << 8 ^ \ - src.core.flag ^ \ - src.core.isize << 24 ^ \ - src.core.mtid << 16 ^ \ - src.core.mpos << 8 - - return hash_value - - cpdef tostring(self, AlignmentFile_t htsfile): - """returns a string representation of the aligned segment. - - The output format is valid SAM format. - - Parameters - ---------- - - htsfile -- AlignmentFile object to map numerical - identifers to chromosome names. 
- """ - - cdef kstring_t line - line.l = line.m = 0 - line.s = NULL - - if sam_format1(htsfile.header, self._delegate, &line) < 0: - if line.m: - free(line.s) - raise ValueError('sam_format failed') - - ret = force_str(line.s[:line.l]) - - if line.m: - free(line.s) - - return ret - - ######################################################## - ## Basic attributes in order of appearance in SAM format - property query_name: - """the query template name (None if not present)""" - def __get__(self): - cdef bam1_t * src - src = self._delegate - if pysam_get_l_qname(src) == 0: - return None - return charptr_to_str(pysam_bam_get_qname(src)) - - def __set__(self, qname): - if qname is None or len(qname) == 0: - return - qname = force_bytes(qname) - cdef bam1_t * src - cdef int l - cdef char * p - - src = self._delegate - p = pysam_bam_get_qname(src) - - # the qname is \0 terminated - l = len(qname) + 1 - pysam_bam_update(src, - pysam_get_l_qname(src), - l, - p) - - - pysam_set_l_qname(src, l) - - # re-acquire pointer to location in memory - # as it might have moved - p = pysam_bam_get_qname(src) - - strncpy(p, qname, l) - - property flag: - """properties flag""" - def __get__(self): - return pysam_get_flag(self._delegate) - def __set__(self, flag): - pysam_set_flag(self._delegate, flag) - - property reference_name: - """:term:`reference` name (None if no AlignmentFile is associated)""" - def __get__(self): - if self._alignment_file is not None: - return self._alignment_file.getrname(self._delegate.core.tid) - return None - - property reference_id: - """:term:`reference` ID - - .. note:: - - This field contains the index of the reference sequence in - the sequence dictionary. To obtain the name of the - reference sequence, use - :meth:`pysam.AlignmentFile.getrname()` - - """ - def __get__(self): return self._delegate.core.tid - def __set__(self, tid): self._delegate.core.tid = tid - - property reference_start: - """0-based leftmost coordinate""" - def __get__(self): return self._delegate.core.pos - def __set__(self, pos): - ## setting the position requires updating the "bin" attribute - cdef bam1_t * src - src = self._delegate - src.core.pos = pos - if pysam_get_n_cigar(src): - pysam_set_bin(src, - hts_reg2bin( - src.core.pos, - bam_endpos(src), - 14, - 5)) - else: - pysam_set_bin(src, - hts_reg2bin( - src.core.pos, - src.core.pos + 1, - 14, - 5)) - - property mapping_quality: - """mapping quality""" - def __get__(self): - return pysam_get_qual(self._delegate) - def __set__(self, qual): - pysam_set_qual(self._delegate, qual) - - property cigarstring: - '''the :term:`cigar` alignment as a string. - - The cigar string is a string of alternating integers - and characters denoting the length and the type of - an operation. - - .. note:: - The order length,operation is specified in the - SAM format. It is different from the order of - the :attr:`cigar` property. - - Returns None if not present. - - To unset the cigarstring, assign None or the - empty string. 
- ''' - def __get__(self): - c = self.cigartuples - if c is None: - return None - # reverse order - else: - return "".join([ "%i%c" % (y,CODE2CIGAR[x]) for x,y in c]) - - def __set__(self, cigar): - if cigar is None or len(cigar) == 0: - self.cigartuples = [] - else: - parts = CIGAR_REGEX.findall(cigar) - # reverse order - self.cigartuples = [(CIGAR2CODE[ord(y)], int(x)) for x,y in parts] - - # TODO - # property cigar: - # """the cigar alignment""" - - property next_reference_id: - """the :term:`reference` id of the mate/next read.""" - def __get__(self): return self._delegate.core.mtid - def __set__(self, mtid): - self._delegate.core.mtid = mtid - - property next_reference_name: - """:term:`reference` name of the mate/next read (None if no - AlignmentFile is associated)""" - def __get__(self): - if self._alignment_file is not None: - return self._alignment_file.getrname(self._delegate.core.mtid) - return None - - property next_reference_start: - """the position of the mate/next read.""" - def __get__(self): - return self._delegate.core.mpos - def __set__(self, mpos): - self._delegate.core.mpos = mpos - - property query_length: - """the length of the query/read. - - This value corresponds to the length of the sequence supplied - in the BAM/SAM file. The length of a query is 0 if there is no - sequence in the BAM/SAM file. In those cases, the read length - can be inferred from the CIGAR alignment, see - :meth:`pysam.AlignmentFile.infer_query_length.`. - - The length includes soft-clipped bases and is equal to - ``len(query_sequence)``. - - This property is read-only but can be set by providing a - sequence. - - Returns 0 if not available. - - """ - def __get__(self): - return self._delegate.core.l_qseq - - property template_length: - """the observed query template length""" - def __get__(self): - return self._delegate.core.isize - def __set__(self, isize): - self._delegate.core.isize = isize - - property query_sequence: - """read sequence bases, including :term:`soft clipped` bases - (None if not present). - - Note that assigning to seq will invalidate any quality scores. - Thus, to in-place edit the sequence and quality scores, copies of - the quality scores need to be taken. Consider trimming for example:: - - q = read.query_qualities - read.query_squence = read.query_sequence[5:10] - read.query_qualities = q[5:10] - - The sequence is returned as it is stored in the BAM file. Some mappers - might have stored a reverse complement of the original read - sequence. - """ - def __get__(self): - if self.cache_query_sequence: - return self.cache_query_sequence - - cdef bam1_t * src - cdef char * s - src = self._delegate - - if src.core.l_qseq == 0: - return None - - self.cache_query_sequence = force_str(getSequenceInRange( - src, 0, src.core.l_qseq)) - return self.cache_query_sequence - - def __set__(self, seq): - # samtools manages sequence and quality length memory together - # if no quality information is present, the first byte says 0xff. 
- cdef bam1_t * src - cdef uint8_t * p - cdef char * s - cdef int l, k - cdef Py_ssize_t nbytes_new, nbytes_old - - if seq == None: - l = 0 - else: - l = len(seq) - seq = force_bytes(seq) - - src = self._delegate - - # as the sequence is stored in half-bytes, the total length (sequence - # plus quality scores) is (l+1)/2 + l - nbytes_new = (l + 1) / 2 + l - nbytes_old = (src.core.l_qseq + 1) / 2 + src.core.l_qseq - - # acquire pointer to location in memory - p = pysam_bam_get_seq(src) - src.core.l_qseq = l - - # change length of data field - pysam_bam_update(src, - nbytes_old, - nbytes_new, - p) - - if l > 0: - # re-acquire pointer to location in memory - # as it might have moved - p = pysam_bam_get_seq(src) - for k from 0 <= k < nbytes_new: - p[k] = 0 - # convert to C string - s = seq - for k from 0 <= k < l: - p[k/2] |= seq_nt16_table[s[k]] << 4 * (1 - k % 2) - - # erase qualities - p = pysam_bam_get_qual(src) - p[0] = 0xff - - self.cache_query_sequence = force_str(seq) - - # clear cached values for quality values - self.cache_query_qualities = None - self.cache_query_alignment_qualities = None - - property query_qualities: - """read sequence base qualities, including :term:`soft - clipped` bases (None if not present). - - Quality scores are returned as a python array of unsigned - chars. Note that this is not the ASCII-encoded value typically - seen in FASTQ or SAM formatted files. Thus, no offset of 33 - needs to be subtracted. - - Note that to set quality scores the sequence has to be set - beforehand as this will determine the expected length of the - quality score array. - - This method raises a ValueError if the length of the - quality scores and the sequence are not the same. - - """ - def __get__(self): - - if self.cache_query_qualities: - return self.cache_query_qualities - - cdef bam1_t * src - cdef char * q - - src = self._delegate - - if src.core.l_qseq == 0: - return None - - self.cache_query_qualities = getQualitiesInRange(src, 0, src.core.l_qseq) - return self.cache_query_qualities - - def __set__(self, qual): - - # note that memory is already allocated via setting the sequence - # hence length match of sequence and quality needs is checked. - cdef bam1_t * src - cdef uint8_t * p - cdef int l - - src = self._delegate - p = pysam_bam_get_qual(src) - if qual is None or len(qual) == 0: - # if absent and there is a sequence: set to 0xff - if src.core.l_qseq != 0: - p[0] = 0xff - return - - # check for length match - l = len(qual) - if src.core.l_qseq != l: - raise ValueError( - "quality and sequence mismatch: %i != %i" % - (l, src.core.l_qseq)) - - # create a python array object filling it - # with the quality scores - - # NB: should avoid this copying if qual is - # already of the correct type. - cdef c_array.array result = c_array.array('B', qual) - - # copy data - memcpy(p, result.data.as_voidptr, l) - - # save in cache - self.cache_query_qualities = qual - - property bin: - """properties bin""" - def __get__(self): - return pysam_get_bin(self._delegate) - def __set__(self, bin): - pysam_set_bin(self._delegate, bin) - - - ########################################################## - # Derived simple attributes. These are simple attributes of - # AlignedSegment getting and setting values. - ########################################################## - # 1. 
Flags - ########################################################## - property is_paired: - """true if read is paired in sequencing""" - def __get__(self): - return (self.flag & BAM_FPAIRED) != 0 - def __set__(self,val): - pysam_update_flag(self._delegate, val, BAM_FPAIRED) - - property is_proper_pair: - """true if read is mapped in a proper pair""" - def __get__(self): - return (self.flag & BAM_FPROPER_PAIR) != 0 - def __set__(self,val): - pysam_update_flag(self._delegate, val, BAM_FPROPER_PAIR) - property is_unmapped: - """true if read itself is unmapped""" - def __get__(self): - return (self.flag & BAM_FUNMAP) != 0 - def __set__(self, val): - pysam_update_flag(self._delegate, val, BAM_FUNMAP) - property mate_is_unmapped: - """true if the mate is unmapped""" - def __get__(self): - return (self.flag & BAM_FMUNMAP) != 0 - def __set__(self,val): - pysam_update_flag(self._delegate, val, BAM_FMUNMAP) - property is_reverse: - """true if read is mapped to reverse strand""" - def __get__(self): - return (self.flag & BAM_FREVERSE) != 0 - def __set__(self,val): - pysam_update_flag(self._delegate, val, BAM_FREVERSE) - property mate_is_reverse: - """true is read is mapped to reverse strand""" - def __get__(self): - return (self.flag & BAM_FMREVERSE) != 0 - def __set__(self,val): - pysam_update_flag(self._delegate, val, BAM_FMREVERSE) - property is_read1: - """true if this is read1""" - def __get__(self): - return (self.flag & BAM_FREAD1) != 0 - def __set__(self,val): - pysam_update_flag(self._delegate, val, BAM_FREAD1) - property is_read2: - """true if this is read2""" - def __get__(self): - return (self.flag & BAM_FREAD2) != 0 - def __set__(self, val): - pysam_update_flag(self._delegate, val, BAM_FREAD2) - property is_secondary: - """true if not primary alignment""" - def __get__(self): - return (self.flag & BAM_FSECONDARY) != 0 - def __set__(self, val): - pysam_update_flag(self._delegate, val, BAM_FSECONDARY) - property is_qcfail: - """true if QC failure""" - def __get__(self): - return (self.flag & BAM_FQCFAIL) != 0 - def __set__(self, val): - pysam_update_flag(self._delegate, val, BAM_FQCFAIL) - property is_duplicate: - """true if optical or PCR duplicate""" - def __get__(self): - return (self.flag & BAM_FDUP) != 0 - def __set__(self, val): - pysam_update_flag(self._delegate, val, BAM_FDUP) - property is_supplementary: - """true if this is a supplementary alignment""" - def __get__(self): - return (self.flag & BAM_FSUPPLEMENTARY) != 0 - def __set__(self, val): - pysam_update_flag(self._delegate, val, BAM_FSUPPLEMENTARY) - - # 2. Coordinates and lengths - property reference_end: - '''aligned reference position of the read on the reference genome. - - reference_end points to one past the last aligned residue. - Returns None if not available (read is unmapped or no cigar - alignment present). - - ''' - def __get__(self): - cdef bam1_t * src - src = self._delegate - if (self.flag & BAM_FUNMAP) or pysam_get_n_cigar(src) == 0: - return None - return bam_endpos(src) - - property reference_length: - '''aligned length of the read on the reference genome. - - This is equal to `aend - pos`. Returns None if not available.''' - def __get__(self): - cdef bam1_t * src - src = self._delegate - if (self.flag & BAM_FUNMAP) or pysam_get_n_cigar(src) == 0: - return None - return bam_endpos(src) - \ - self._delegate.core.pos - - property query_alignment_sequence: - """aligned portion of the read. 
- - This is a substring of :attr:`seq` that excludes flanking - bases that were :term:`soft clipped` (None if not present). It - is equal to ``seq[qstart:qend]``. - - SAM/BAM files may include extra flanking bases that are not - part of the alignment. These bases may be the result of the - Smith-Waterman or other algorithms, which may not require - alignments that begin at the first residue or end at the last. - In addition, extra sequencing adapters, multiplex identifiers, - and low-quality bases that were not considered for alignment - may have been retained. - - """ - - def __get__(self): - if self.cache_query_alignment_sequence: - return self.cache_query_alignment_sequence - - cdef bam1_t * src - cdef uint32_t start, end - - src = self._delegate - - if src.core.l_qseq == 0: - return None - - start = getQueryStart(src) - end = getQueryEnd(src) - - self.cache_query_alignment_sequence = force_str( - getSequenceInRange(src, start, end)) - return self.cache_query_alignment_sequence - - property query_alignment_qualities: - """aligned query sequence quality values (None if not present). These - are the quality values that correspond to :attr:`query`, that - is, they exclude qualities of :term:`soft clipped` bases. This - is equal to ``qual[qstart:qend]``. - - Quality scores are returned as a python array of unsigned - chars. Note that this is not the ASCII-encoded value typically - seen in FASTQ or SAM formatted files. Thus, no offset of 33 - needs to be subtracted. - - This property is read-only. - - """ - def __get__(self): - - if self.cache_query_alignment_qualities: - return self.cache_query_alignment_qualities - - cdef bam1_t * src - cdef uint32_t start, end - - src = self._delegate - - if src.core.l_qseq == 0: - return None - - start = getQueryStart(src) - end = getQueryEnd(src) - self.cache_query_alignment_qualities = \ - getQualitiesInRange(src, start, end) - return self.cache_query_alignment_qualities - - property query_alignment_start: - """start index of the aligned query portion of the sequence (0-based, - inclusive). - - This the index of the first base in :attr:`seq` that is not - soft-clipped. - - """ - def __get__(self): - return getQueryStart(self._delegate) - - property query_alignment_end: - """end index of the aligned query portion of the sequence (0-based, - exclusive)""" - def __get__(self): - return getQueryEnd(self._delegate) - - property query_alignment_length: - """length of the aligned query sequence. - - This is equal to :attr:`qend` - :attr:`qstart`""" - def __get__(self): - cdef bam1_t * src - src = self._delegate - return getQueryEnd(src) - getQueryStart(src) - - ##################################################### - # Computed properties - - def get_reference_positions(self, full_length=False): - """a list of reference positions that this read aligns to. - - By default, this method only returns positions in the - reference that are within the alignment. If *full_length* is - set, None values will be included for any soft-clipped or - unaligned positions within the read. The returned list will - thus be of the same length as the read. 
- - """ - cdef uint32_t k, i, pos - cdef int op - cdef uint32_t * cigar_p - cdef bam1_t * src - cdef bint _full = full_length - - src = self._delegate - if pysam_get_n_cigar(src) == 0: - return [] - - result = [] - pos = src.core.pos - cigar_p = pysam_bam_get_cigar(src) - - for k from 0 <= k < pysam_get_n_cigar(src): - op = cigar_p[k] & BAM_CIGAR_MASK - l = cigar_p[k] >> BAM_CIGAR_SHIFT - - if op == BAM_CSOFT_CLIP or op == BAM_CINS: - if _full: - for i from 0 <= i < l: - result.append(None) - elif op == BAM_CMATCH: - for i from pos <= i < pos + l: - result.append(i) - pos += l - elif op == BAM_CDEL or op == BAM_CREF_SKIP: - pos += l - - return result - - def infer_query_length(self, always=True): - """inferred read length from CIGAR string. - - If *always* is set to True, the read length - will be always inferred. If set to False, the length - of the read sequence will be returned if it is - available. - - Returns None if CIGAR string is not present. - """ - - cdef uint32_t * cigar_p - cdef bam1_t * src - - src = self._delegate - - if not always and src.core.l_qseq: - return src.core.l_qseq - - return calculateQueryLength(src) - - def get_reference_sequence(self): - """return the reference sequence. - - This method requires the MD tag to be set. - """ - cdef uint32_t k, i - cdef int op - cdef bam1_t * src = self._delegate - ref_seq = force_str(build_alignment_sequence(src)) - if ref_seq is None: - raise ValueError("MD tag not present") - - cdef uint32_t * cigar_p = pysam_bam_get_cigar(src) - cdef uint32_t r_idx = 0 - result = [] - for k from 0 <= k < pysam_get_n_cigar(src): - op = cigar_p[k] & BAM_CIGAR_MASK - l = cigar_p[k] >> BAM_CIGAR_SHIFT - if op == BAM_CMATCH or op == BAM_CEQUAL or op == BAM_CDIFF: - for i from 0 <= i < l: - result.append(ref_seq[r_idx]) - r_idx += 1 - elif op == BAM_CDEL: - for i from 0 <= i < l: - result.append(ref_seq[r_idx]) - r_idx += 1 - elif op == BAM_CREF_SKIP: - pass - elif op == BAM_CINS: - r_idx += l - elif op == BAM_CSOFT_CLIP: - pass - elif op == BAM_CHARD_CLIP: - pass # advances neither - elif op == BAM_CPAD: - raise NotImplementedError( - "Padding (BAM_CPAD, 6) is currently not supported. " - "Please implement. Sorry about that.") - - return "".join(result) - - def get_aligned_pairs(self, matches_only=False, with_seq=False): - """a list of aligned read (query) and reference positions. - - For inserts, deletions, skipping either query or reference - position may be None. - - Padding is currently not supported and leads to an exception. - - Parameters - ---------- - - matches_only : bool - If True, only matched bases are returned - no None on either - side. - with_seq : bool - If True, return a third element in the tuple containing the - reference sequence. Substitutions are lower-case. This option - requires an MD tag to be present. - - Returns - ------- - - aligned_pairs : list of tuples - - """ - cdef uint32_t k, i, pos, qpos, r_idx, l - cdef int op - cdef uint32_t * cigar_p - cdef bam1_t * src = self._delegate - cdef bint _matches_only = bool(matches_only) - cdef bint _with_seq = bool(with_seq) - - # TODO: this method performs no checking and assumes that - # read sequence, cigar and MD tag are consistent. 
- - if _with_seq: - ref_seq = force_str(self.get_reference_sequence()) - if ref_seq is None: - raise ValueError("MD tag not present") - - r_idx = 0 - - if pysam_get_n_cigar(src) == 0: - return [] - - result = [] - pos = src.core.pos - qpos = 0 - cigar_p = pysam_bam_get_cigar(src) - for k from 0 <= k < pysam_get_n_cigar(src): - op = cigar_p[k] & BAM_CIGAR_MASK - l = cigar_p[k] >> BAM_CIGAR_SHIFT - - if op == BAM_CMATCH or op == BAM_CEQUAL or op == BAM_CDIFF: - if _with_seq: - for i from pos <= i < pos + l: - result.append((qpos, i, ref_seq[r_idx])) - r_idx += 1 - qpos += 1 - else: - for i from pos <= i < pos + l: - result.append((qpos, i)) - qpos += 1 - pos += l - - elif op == BAM_CINS or op == BAM_CSOFT_CLIP: - if not _matches_only: - if _with_seq: - for i from pos <= i < pos + l: - result.append((qpos, None, None)) - qpos += 1 - else: - for i from pos <= i < pos + l: - result.append((qpos, None)) - qpos += 1 - else: - qpos += l - - elif op == BAM_CDEL: - if not _matches_only: - if _with_seq: - for i from pos <= i < pos + l: - result.append((None, i, ref_seq[r_idx])) - r_idx += 1 - else: - for i from pos <= i < pos + l: - result.append((None, i)) - pos += l - - elif op == BAM_CHARD_CLIP: - pass # advances neither - - elif op == BAM_CREF_SKIP: - if not _matches_only: - if _with_seq: - for i from pos <= i < pos + l: - result.append((None, i, None)) - else: - for i from pos <= i < pos + l: - result.append((None, i)) - - pos += l - - elif op == BAM_CPAD: - raise NotImplementedError( - "Padding (BAM_CPAD, 6) is currently not supported. " - "Please implement. Sorry about that.") - - return result - - def get_blocks(self): - """ a list of start and end positions of - aligned gapless blocks. - - The start and end positions are in genomic - coordinates. - - Blocks are not normalized, i.e. two blocks - might be directly adjacent. This happens if - the two blocks are separated by an insertion - in the read. - """ - - cdef uint32_t k, pos, l - cdef int op - cdef uint32_t * cigar_p - cdef bam1_t * src - - src = self._delegate - if pysam_get_n_cigar(src) == 0: - return [] - - result = [] - pos = src.core.pos - cigar_p = pysam_bam_get_cigar(src) - l = 0 - - for k from 0 <= k < pysam_get_n_cigar(src): - op = cigar_p[k] & BAM_CIGAR_MASK - l = cigar_p[k] >> BAM_CIGAR_SHIFT - if op == BAM_CMATCH: - result.append((pos, pos + l)) - pos += l - elif op == BAM_CDEL or op == BAM_CREF_SKIP: - pos += l - - return result - - def get_overlap(self, uint32_t start, uint32_t end): - """return number of aligned bases of read overlapping the interval - *start* and *end* on the reference sequence. - - Return None if cigar alignment is not available. - """ - cdef uint32_t k, i, pos, overlap - cdef int op, o - cdef uint32_t * cigar_p - cdef bam1_t * src - - overlap = 0 - - src = self._delegate - if pysam_get_n_cigar(src) == 0: - return None - pos = src.core.pos - o = 0 - - cigar_p = pysam_bam_get_cigar(src) - for k from 0 <= k < pysam_get_n_cigar(src): - op = cigar_p[k] & BAM_CIGAR_MASK - l = cigar_p[k] >> BAM_CIGAR_SHIFT - - if op == BAM_CMATCH: - o = min( pos + l, end) - max( pos, start ) - if o > 0: overlap += o - - if op == BAM_CMATCH or op == BAM_CDEL or op == BAM_CREF_SKIP: - pos += l - - return overlap - - def get_cigar_stats(self): - """summary of operations in cigar string. - - The output order in the array is "MIDNSHP=X" followed by a - field for the NM tag. If the NM tag is not present, this - field will always be 0. 
- - +-----+--------------+-----+ - |M |BAM_CMATCH |0 | - +-----+--------------+-----+ - |I |BAM_CINS |1 | - +-----+--------------+-----+ - |D |BAM_CDEL |2 | - +-----+--------------+-----+ - |N |BAM_CREF_SKIP |3 | - +-----+--------------+-----+ - |S |BAM_CSOFT_CLIP|4 | - +-----+--------------+-----+ - |H |BAM_CHARD_CLIP|5 | - +-----+--------------+-----+ - |P |BAM_CPAD |6 | - +-----+--------------+-----+ - |= |BAM_CEQUAL |7 | - +-----+--------------+-----+ - |X |BAM_CDIFF |8 | - +-----+--------------+-----+ - |NM |NM tag |9 | - +-----+--------------+-----+ - - If no cigar string is present, empty arrays will be returned. - - Parameters - ---------- - - Returns - ------- - - arrays : two arrays. The first contains the nucleotide counts within - each cigar operation, the second contains the number of blocks for - each cigar operation. - - """ - - cdef int nfields = NCIGAR_CODES + 1 - - cdef c_array.array base_counts = array.array( - "I", - [0] * nfields) - cdef uint32_t [:] base_view = base_counts - cdef c_array.array block_counts = array.array( - "I", - [0] * nfields) - cdef uint32_t [:] block_view = block_counts - - cdef bam1_t * src = self._delegate - cdef int op - cdef uint32_t l - cdef int32_t k - cdef uint32_t * cigar_p = pysam_bam_get_cigar(src) - - if cigar_p == NULL: - return None - - for k from 0 <= k < pysam_get_n_cigar(src): - op = cigar_p[k] & BAM_CIGAR_MASK - l = cigar_p[k] >> BAM_CIGAR_SHIFT - base_view[op] += l - block_view[op] += 1 - - cdef uint8_t * v = bam_aux_get(src, 'NM') - if v != NULL: - base_view[nfields - 1] = bam_aux2i(v) - - return base_counts, block_counts - - ##################################################### - ## Unsorted as yet - # TODO: capture in CIGAR object - property cigartuples: - """the :term:`cigar` alignment. The alignment - is returned as a list of tuples of (operation, length). - - If the alignment is not present, None is returned. - - The operations are: - - +-----+--------------+-----+ - |M |BAM_CMATCH |0 | - +-----+--------------+-----+ - |I |BAM_CINS |1 | - +-----+--------------+-----+ - |D |BAM_CDEL |2 | - +-----+--------------+-----+ - |N |BAM_CREF_SKIP |3 | - +-----+--------------+-----+ - |S |BAM_CSOFT_CLIP|4 | - +-----+--------------+-----+ - |H |BAM_CHARD_CLIP|5 | - +-----+--------------+-----+ - |P |BAM_CPAD |6 | - +-----+--------------+-----+ - |= |BAM_CEQUAL |7 | - +-----+--------------+-----+ - |X |BAM_CDIFF |8 | - +-----+--------------+-----+ - - .. note:: - The output is a list of (operation, length) tuples, such as - ``[(0, 30)]``. - This is different from the SAM specification and - the :attr:`cigarstring` property, which uses a - (length, operation) order, for example: ``30M``. - - To unset the cigar property, assign an empty list - or None. 
- """
- def __get__(self):
- cdef uint32_t * cigar_p
- cdef bam1_t * src
- cdef uint32_t op, l
- cdef int k
-
- src = self._delegate
- if pysam_get_n_cigar(src) == 0:
- return None
-
- cigar = []
-
- cigar_p = pysam_bam_get_cigar(src);
- for k from 0 <= k < pysam_get_n_cigar(src):
- op = cigar_p[k] & BAM_CIGAR_MASK
- l = cigar_p[k] >> BAM_CIGAR_SHIFT
- cigar.append((op, l))
- return cigar
-
- def __set__(self, values):
- cdef uint32_t * p
- cdef bam1_t * src
- cdef op, l
- cdef int k, ncigar
-
- k = 0
-
- src = self._delegate
-
- # get location of cigar string
- p = pysam_bam_get_cigar(src)
-
- # empty values for cigar string
- if values is None:
- values = []
-
- ncigar = len(values)
- # create space for cigar data within src.data
- pysam_bam_update(src,
- pysam_get_n_cigar(src) * 4,
- ncigar * 4,
- p)
-
- # length is number of cigar operations, not bytes
- pysam_set_n_cigar(src, ncigar)
-
- # re-acquire pointer to location in memory
- # as it might have moved
- p = pysam_bam_get_cigar(src)
-
- # insert cigar operations
- for op, l in values:
- p[k] = l << BAM_CIGAR_SHIFT | op
- k += 1
-
- ## setting the cigar string requires updating the bin
- pysam_set_bin(src,
- hts_reg2bin(
- src.core.pos,
- bam_endpos(src),
- 14,
- 5))
-
-
- cpdef set_tag(self,
- tag,
- value,
- value_type=None,
- replace=True):
- """sets a particular field *tag* to *value* in the optional alignment
- section.
-
- *value_type* describes the type of *value* that is to be entered
- into the alignment record. It can be set explicitly to one
- of the valid one-letter type codes. If unset, an appropriate
- type will be chosen automatically.
-
- An existing value of the same *tag* will be overwritten unless
- replace is set to False. This is usually not recommended as a
- tag may only appear once in the optional alignment section.
-
- If *value* is None, the tag will be deleted.
- """
-
- cdef int value_size
- cdef uint8_t * value_ptr
- cdef uint8_t *existing_ptr
- cdef uint8_t typecode
- cdef float float_value
- cdef double double_value
- cdef int32_t int_value
- cdef bam1_t * src = self._delegate
- cdef char * _value_type
- cdef c_array.array array_value
- cdef object buffer
-
- if len(tag) != 2:
- raise ValueError('Invalid tag: %s' % tag)
-
- tag = force_bytes(tag)
- if replace:
- existing_ptr = bam_aux_get(src, tag)
- if existing_ptr:
- bam_aux_del(src, existing_ptr)
-
- # setting value to None deletes a tag
- if value is None:
- return
-
- typecode = get_value_code(value, value_type)
- if typecode == 0:
- raise ValueError("can't guess type or invalid type code specified")
-
- # Not Endian-safe, but then again neither is samtools!
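# For illustration, a sketch of how the typecode dispatch below is driven
# from python (tag names XS/XF/XN/XA are arbitrary examples, not fixed API):
#
#   read.set_tag("XS", 42)                        # int   -> typecode 'i'
#   read.set_tag("XF", 0.5)                       # float -> typecode 'f'
#   read.set_tag("XN", "note", value_type="Z")    # explicit string tag
#   read.set_tag("XA", array.array("I", [1, 2]))  # array -> typecode 'B'
#   read.set_tag("XS", None)                      # deletes the tag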
- if typecode == 'Z':
- value = force_bytes(value)
- value_ptr = value
- value_size = len(value)+1
- elif typecode == 'i':
- int_value = value
- value_ptr = &int_value
- value_size = sizeof(int32_t)
- elif typecode == 'd':
- double_value = value
- value_ptr = &double_value
- value_size = sizeof(double)
- elif typecode == 'f':
- float_value = value
- value_ptr = &float_value
- value_size = sizeof(float)
- elif typecode == 'B':
- # the following goes through python, needs to be cleaned up
- # pack array using struct
- if value_type is None:
- fmt, args = packTags([(tag, value)])
- else:
- fmt, args = packTags([(tag, value, value_type)])
-
- # remove tag and type code as set by bam_aux_append
- # first four chars of format (<2sc)
- fmt = '<' + fmt[4:]
- # first two values to pack
- args = args[2:]
- value_size = struct.calcsize(fmt)
- # buffer will be freed when object goes out of scope
- buffer = ctypes.create_string_buffer(value_size)
- struct.pack_into(fmt, buffer, 0, *args)
- # bam_aux_append copies data from value_ptr
- bam_aux_append(src,
- tag,
- typecode,
- value_size,
- buffer.raw)
- return
- else:
- raise ValueError('unsupported value_type in set_tag')
-
- bam_aux_append(src,
- tag,
- typecode,
- value_size,
- value_ptr)
-
- cpdef has_tag(self, tag):
- """returns true if the optional alignment section
- contains a given *tag*."""
- cdef uint8_t * v
- cdef int nvalues
- btag = force_bytes(tag)
- v = bam_aux_get(self._delegate, btag)
- return v != NULL
-
- cpdef get_tag(self, tag, with_value_type=False):
- """
- retrieves data from the optional alignment section
- given a two-letter *tag* denoting the field.
-
- The returned value is cast into an appropriate python type.
-
- This method is the fastest way to access the optional
- alignment section if only a few tags need to be retrieved.
-
- Parameters
- ----------
-
- tag :
- data tag.
-
- with_value_type : Optional[bool]
- if set to True, the return value is a tuple of (tag value, type code).
- (default False)
-
- Returns
- -------
-
- A python object with the value of the `tag`. The type of the
- object depends on the data type in the data record.
-
- Raises
- ------
-
- KeyError
- If `tag` is not present, a KeyError is raised.
-
- """
- cdef uint8_t * v
- cdef int nvalues
- btag = force_bytes(tag)
- v = bam_aux_get(self._delegate, btag)
- if v == NULL:
- raise KeyError("tag '%s' not present" % tag)
- if chr(v[0]) == "B":
- auxtype = chr(v[0]) + chr(v[1])
- else:
- auxtype = chr(v[0])
-
- if auxtype == 'c' or auxtype == 'C' or auxtype == 's' or auxtype == 'S':
- value = bam_aux2i(v)
- elif auxtype == 'i' or auxtype == 'I':
- value = bam_aux2i(v)
- elif auxtype == 'f' or auxtype == 'F':
- value = bam_aux2f(v)
- elif auxtype == 'd' or auxtype == 'D':
- value = bam_aux2f(v)
- elif auxtype == 'A':
- # there might be a more efficient way
- # to convert a char into a string
- value = '%c' % bam_aux2A(v)
- elif auxtype == 'Z':
- value = charptr_to_str(bam_aux2Z(v))
- elif auxtype[0] == 'B':
- bytesize, nvalues, values = convert_binary_tag(v + 1)
- value = values
- else:
- raise ValueError("unknown auxiliary type '%s'" % auxtype)
-
- if with_value_type:
- return (value, auxtype)
- else:
- return value
-
- def get_tags(self, with_value_type=False):
- """the fields in the optional alignment section.
-
- Returns a list of all fields in the optional
- alignment section. Values are converted to appropriate python
- values.
For example:
-
- [("NM", 2), ("RG", "GJP00TM04")]
-
- If *with_value_type* is set, the value type as encoded in
- the AlignedSegment record will be returned as well:
-
- [("NM", 2, "i"), ("RG", "GJP00TM04", "Z")]
-
- This method will convert all values in the optional alignment
- section. When getting only one or a few tags, please see
- :meth:`get_tag` for a quicker way to achieve this.
-
- """
-
- cdef char * ctag
- cdef bam1_t * src
- cdef uint8_t * s
- cdef char auxtag[3]
- cdef char auxtype
- cdef uint8_t byte_size
- cdef int32_t nvalues
-
- src = self._delegate
- if src.l_data == 0:
- return []
- s = pysam_bam_get_aux(src)
- result = []
- auxtag[2] = 0
- while s < (src.data + src.l_data):
- # get tag
- auxtag[0] = s[0]
- auxtag[1] = s[1]
- s += 2
- auxtype = s[0]
- if auxtype in ('c', 'C'):
- value = bam_aux2i(s)
- s += 1
- elif auxtype in ('s', 'S'):
- value = bam_aux2i(s)
- s += 2
- elif auxtype in ('i', 'I'):
- value = bam_aux2i(s)
- s += 4
- elif auxtype == 'f':
- value = bam_aux2f(s)
- s += 4
- elif auxtype == 'd':
- value = bam_aux2f(s)
- s += 8
- elif auxtype == 'A':
- value = "%c" % bam_aux2A(s)
- s += 1
- elif auxtype in ('Z', 'H'):
- value = charptr_to_str(bam_aux2Z(s))
- # +1 for NULL terminated string
- s += len(value) + 1
- elif auxtype == 'B':
- s += 1
- byte_size, nvalues, value = convert_binary_tag(s)
- # 5 for 1 char and 1 int
- s += 5 + (nvalues * byte_size) - 1
- else:
- raise KeyError("unknown type '%s'" % auxtype)
-
- s += 1
-
- if with_value_type:
- result.append((charptr_to_str(auxtag), value, chr(auxtype)))
- else:
- result.append((charptr_to_str(auxtag), value))
-
- return result
-
- def set_tags(self, tags):
- """sets the fields in the optional alignment section with
- a list of (tag, value) tuples.
-
- The :term:`value type` of the values is determined from the
- python type. Optionally, a type may be given explicitly as
- a third value in the tuple. For example:
-
- x.set_tags([("NM", 2, "i"), ("RG", "GJP00TM04", "Z")])
-
- This method will not enforce the rule that the same tag may appear
- only once in the optional alignment section.
- """
-
- cdef bam1_t * src
- cdef uint8_t * s
- cdef char * temp
- cdef int new_size = 0
- cdef int old_size
- src = self._delegate
-
- # convert and pack the data
- if tags is not None and len(tags) > 0:
- fmt, args = packTags(tags)
- new_size = struct.calcsize(fmt)
- buffer = ctypes.create_string_buffer(new_size)
- struct.pack_into(fmt,
- buffer,
- 0,
- *args)
-
- # delete the old data and allocate new space.
# If total_size == 0, the aux field will be
- # empty
- old_size = pysam_bam_get_l_aux(src)
- pysam_bam_update(src,
- old_size,
- new_size,
- pysam_bam_get_aux(src))
-
- # copy data only if there is any
- if new_size > 0:
-
- # get location of new data
- s = pysam_bam_get_aux(src)
-
- # check if there is direct path from buffer.raw to tmp
- p = buffer.raw
- # create handle to make sure buffer stays alive long
- # enough for memcpy, see issue 129
- temp = p
- memcpy(s, temp, new_size)
-
-
- ########################################################
- # Compatibility Accessors
- # Functions, properties for compatibility with pysam < 0.8
- #
- # Several options
- # change the factory functions according to API
- # * requires code changes throughout, incl passing
- # handles to factory functions
- # subclass functions and add attributes at runtime
- # e.g.: AlignedSegments.qname = AlignedSegments.query_name
- # * will slow down the default interface
- # explicit declaration of getters/setters
- ########################################################
- property qname:
- """deprecated, use query_name instead"""
- def __get__(self): return self.query_name
- def __set__(self, v): self.query_name = v
- property tid:
- """deprecated, use reference_id instead"""
- def __get__(self): return self.reference_id
- def __set__(self, v): self.reference_id = v
- property pos:
- """deprecated, use reference_start instead"""
- def __get__(self): return self.reference_start
- def __set__(self, v): self.reference_start = v
- property mapq:
- """deprecated, use mapping_quality instead"""
- def __get__(self): return self.mapping_quality
- def __set__(self, v): self.mapping_quality = v
- property rnext:
- """deprecated, use next_reference_id instead"""
- def __get__(self): return self.next_reference_id
- def __set__(self, v): self.next_reference_id = v
- property pnext:
- """deprecated, use next_reference_start instead"""
- def __get__(self):
- return self.next_reference_start
- def __set__(self, v):
- self.next_reference_start = v
- property cigar:
- """deprecated, use cigartuples instead"""
- def __get__(self):
- r = self.cigartuples
- if r is None:
- r = []
- return r
- def __set__(self, v): self.cigartuples = v
- property tlen:
- """deprecated, use template_length instead"""
- def __get__(self):
- return self.template_length
- def __set__(self, v):
- self.template_length = v
- property seq:
- """deprecated, use query_sequence instead"""
- def __get__(self):
- return self.query_sequence
- def __set__(self, v):
- self.query_sequence = v
- property qual:
- """deprecated, use query_qualities instead"""
- def __get__(self):
- return array_to_qualitystring(self.query_qualities)
- def __set__(self, v):
- self.query_qualities = qualitystring_to_array(v)
- property alen:
- """deprecated, use reference_length instead"""
- def __get__(self):
- return self.reference_length
- def __set__(self, v):
- self.reference_length = v
- property aend:
- """deprecated, use reference_end instead"""
- def __get__(self):
- return self.reference_end
- def __set__(self, v):
- self.reference_end = v
- property rlen:
- """deprecated, use query_length instead"""
- def __get__(self):
- return self.query_length
- def __set__(self, v):
- self.query_length = v
- property query:
- """deprecated, use query_alignment_sequence instead"""
- def __get__(self):
- return self.query_alignment_sequence
- def __set__(self, v):
- self.query_alignment_sequence = v
- property qqual:
- """deprecated, use query_alignment_qualities instead"""
- def __get__(self):
- return
array_to_qualitystring(self.query_alignment_qualities)
- def __set__(self, v):
- self.query_alignment_qualities = qualitystring_to_array(v)
- property qstart:
- """deprecated, use query_alignment_start instead"""
- def __get__(self):
- return self.query_alignment_start
- def __set__(self, v):
- self.query_alignment_start = v
- property qend:
- """deprecated, use query_alignment_end instead"""
- def __get__(self):
- return self.query_alignment_end
- def __set__(self, v):
- self.query_alignment_end = v
- property qlen:
- """deprecated, use query_alignment_length instead"""
- def __get__(self):
- return self.query_alignment_length
- def __set__(self, v):
- self.query_alignment_length = v
- property mrnm:
- """deprecated, use next_reference_id instead"""
- def __get__(self):
- return self.next_reference_id
- def __set__(self, v):
- self.next_reference_id = v
- property mpos:
- """deprecated, use next_reference_start instead"""
- def __get__(self):
- return self.next_reference_start
- def __set__(self, v):
- self.next_reference_start = v
- property rname:
- """deprecated, use reference_id instead"""
- def __get__(self):
- return self.reference_id
- def __set__(self, v):
- self.reference_id = v
- property isize:
- """deprecated, use template_length instead"""
- def __get__(self):
- return self.template_length
- def __set__(self, v):
- self.template_length = v
- property blocks:
- """deprecated, use get_blocks() instead"""
- def __get__(self):
- return self.get_blocks()
- property aligned_pairs:
- """deprecated, use get_aligned_pairs() instead"""
- def __get__(self):
- return self.get_aligned_pairs()
- property inferred_length:
- """deprecated, use infer_query_length() instead"""
- def __get__(self):
- return self.infer_query_length()
- property positions:
- """deprecated, use get_reference_positions() instead"""
- def __get__(self):
- return self.get_reference_positions()
- property tags:
- """deprecated, use get_tags() instead"""
- def __get__(self):
- return self.get_tags()
- def __set__(self, tags):
- self.set_tags(tags)
- def overlap(self, start, end):
- """deprecated, use get_overlap() instead"""
- return self.get_overlap(start, end)
- def opt(self, tag):
- """deprecated, use get_tag() instead"""
- return self.get_tag(tag)
- def setTag(self, tag, value, value_type=None, replace=True):
- """deprecated, use set_tag() instead"""
- return self.set_tag(tag, value, value_type, replace)
-
-
-cdef class PileupColumn:
- '''A pileup of reads at a particular reference sequence position
- (:term:`column`). A pileup column contains all the reads that map
- to a certain target base.
-
- This class is a proxy for results returned by the samtools pileup
- engine. If the underlying engine iterator advances, the results
- of this column will change.
-
- '''
- def __init__(self):
- raise TypeError("this class cannot be instantiated from Python")
-
- def __str__(self):
- return "\t".join(map(str,
- (self.reference_id,
- self.reference_pos,
- self.nsegments))) +\
- "\n" +\
- "\n".join(map(str, self.pileups))
-
- property reference_id:
- '''the reference sequence number as defined in the header'''
- def __get__(self):
- return self.tid
-
- property reference_name:
- """:term:`reference` name (None if no AlignmentFile is associated)"""
- def __get__(self):
- if self._alignment_file is not None:
- return self._alignment_file.getrname(self.tid)
- return None
-
- property nsegments:
- '''number of reads mapping to this column.'''
- def __get__(self):
- return self.n_pu
- def __set__(self, n):
- self.n_pu = n
-
- property reference_pos:
- '''the position in the reference sequence (0-based).'''
- def __get__(self):
- return self.pos
-
- property pileups:
- '''list of reads (:class:`pysam.PileupRead`) aligned to this column'''
- def __get__(self):
- cdef int x
- pileups = []
-
- if self.plp == NULL or self.plp[0] == NULL:
- raise ValueError("PileupColumn accessed after iterator finished")
-
- # warning: there could be problems if self.n and self.buf are
- # out of sync.
- for x from 0 <= x < self.n_pu:
- pileups.append(makePileupRead(&(self.plp[0][x]),
- self._alignment_file))
- return pileups
-
- ########################################################
- # Compatibility Accessors
- # Functions, properties for compatibility with pysam < 0.8
- ########################################################
- property pos:
- def __get__(self):
- return self.reference_pos
- def __set__(self, v):
- self.reference_pos = v
-
- property tid:
- def __get__(self):
- return self.reference_id
- def __set__(self, v):
- self.reference_id = v
-
- property n:
- def __get__(self):
- return self.nsegments
- def __set__(self, v):
- self.nsegments = v
-
-
-cdef class PileupRead:
- '''Representation of a read aligned to a particular position in the
- reference sequence.
-
- '''
-
- def __init__(self):
- raise TypeError(
- "this class cannot be instantiated from Python")
-
- def __str__(self):
- return "\t".join(
- map(str,
- (self.alignment, self.query_position,
- self.indel, self.level,
- self.is_del, self.is_head,
- self.is_tail, self.is_refskip)))
-
- property alignment:
- """a :class:`pysam.AlignedSegment` object of the aligned read"""
- def __get__(self):
- return self._alignment
-
- property query_position:
- """position of the read base at the pileup site, 0-based.
- None if is_del or is_refskip is set.
-
- """
- def __get__(self):
- if self.is_del or self.is_refskip:
- return None
- else:
- return self._qpos
-
- property query_position_or_next:
- """position of the read base at the pileup site, 0-based.
-
- If the current position is a deletion, returns the next
- aligned base.
-
- """
- def __get__(self):
- return self._qpos
-
- property indel:
- """indel length for the position following the current pileup site.
-
- This quantity peeks ahead to the next cigar operation in this
- alignment. If the next operation is an insertion, indel will
- be positive. If the next operation is a deletion, it will be
- negative. 0 if the next operation is not an indel.
- - """ - def __get__(self): - return self._indel - - property level: - """the level of the read in the "viewer" mode""" - def __get__(self): - return self._level - - property is_del: - """1 iff the base on the padded read is a deletion""" - def __get__(self): - return self._is_del - - property is_head: - """1 iff the base on the padded read is the left-most base.""" - def __get__(self): - return self._is_head - - property is_tail: - """1 iff the base on the padded read is the right-most base.""" - def __get__(self): - return self._is_tail - - property is_refskip: - def __get__(self): - return self._is_refskip - -__all__ = [ - "AlignedSegment", - "PileupColumn", - "PileupRead"] diff --git a/pysam/calignmentfile.pxd b/pysam/calignmentfile.pxd deleted file mode 100644 index 3384e7e..0000000 --- a/pysam/calignmentfile.pxd +++ /dev/null @@ -1,164 +0,0 @@ -from libc.stdint cimport int8_t, int16_t, int32_t, int64_t -from libc.stdint cimport uint8_t, uint16_t, uint32_t, uint64_t -from libc.stdlib cimport malloc, calloc, realloc, free -from libc.string cimport memcpy, memcmp, strncpy, strlen, strdup -from libc.stdio cimport FILE, printf - -from pysam.cfaidx cimport faidx_t, Fastafile -from pysam.calignedsegment cimport AlignedSegment -from pysam.chtslib cimport * - -from cpython cimport array -cimport cython - -cdef extern from *: - ctypedef char* const_char_ptr "const char*" - -cdef extern from "htslib_util.h": - - char * pysam_bam_get_qname(bam1_t * b) - -cdef extern from "samfile_util.h": - - int bam_cap_mapQ(bam1_t *b, char *ref, int thres) - int bam_prob_realn(bam1_t *b, const char *ref) - -#################################################################### -# Utility types - -ctypedef struct __iterdata: - htsFile * htsfile - bam_hdr_t * header - hts_itr_t * iter - faidx_t * fastafile - int tid - char * seq - int seq_len - - -cdef class AlignmentFile: - - cdef object _filename - cdef object _reference_filename - - # pointer to htsFile structure - cdef htsFile * htsfile - - # pointer to index - cdef hts_idx_t *index - # header structure - cdef bam_hdr_t * header - # true if file is bam format - cdef readonly bint is_bam - # true if file is bam format - cdef readonly bint is_cram - # true if not a file but a stream - cdef readonly bint is_stream - # true if file is not on the local filesystem - cdef readonly bint is_remote - # current read within iteration - cdef bam1_t * b - # file opening mode - cdef char * mode - - # beginning of read section - cdef int64_t start_offset - - cdef bam1_t * getCurrent(self) - cdef int cnext(self) - - # write an aligned read - cpdef int write(self, AlignedSegment read) except -1 - -cdef class PileupColumn: - cdef bam_pileup1_t ** plp - cdef int tid - cdef int pos - cdef int n_pu - -cdef class PileupRead: - cdef AlignedSegment _alignment - cdef int32_t _qpos - cdef int _indel - cdef int _level - cdef uint32_t _is_del - cdef uint32_t _is_head - cdef uint32_t _is_tail - cdef uint32_t _is_refskip - -cdef class IteratorRow: - cdef int retval - cdef bam1_t * b - cdef AlignmentFile samfile - cdef htsFile * htsfile - cdef bam_hdr_t * header - cdef int owns_samfile - -cdef class IteratorRowRegion(IteratorRow): - cdef hts_itr_t * iter - cdef bam1_t * getCurrent(self) - cdef int cnext(self) - -cdef class IteratorRowHead(IteratorRow): - cdef int max_rows - cdef int current_row - cdef bam1_t * getCurrent(self) - cdef int cnext(self) - -cdef class IteratorRowAll(IteratorRow): - cdef bam1_t * getCurrent(self) - cdef int cnext(self) - -cdef class 
IteratorRowAllRefs(IteratorRow): - cdef int tid - cdef IteratorRowRegion rowiter - -cdef class IteratorRowSelection(IteratorRow): - cdef int current_pos - cdef positions - cdef bam1_t * getCurrent(self) - cdef int cnext(self) - -cdef class IteratorColumn: - - # result of the last plbuf_push - cdef IteratorRowRegion iter - cdef int tid - cdef int pos - cdef int n_plp - cdef int mask - cdef bam_pileup1_t * plp - cdef bam_plp_t pileup_iter - cdef __iterdata iterdata - cdef AlignmentFile samfile - cdef Fastafile fastafile - cdef stepper - cdef int max_depth - - cdef int cnext(self) - cdef char * getSequence(self) - cdef setMask(self, mask) - cdef setupIteratorData(self, - int tid, - int start, - int end, - int multiple_iterators=?) - - cdef reset(self, tid, start, end) - cdef _free_pileup_iter(self) - -cdef class IteratorColumnRegion(IteratorColumn): - cdef int start - cdef int end - cdef int truncate - -cdef class IteratorColumnAllRefs(IteratorColumn): - pass - -cdef class IndexedReads: - cdef AlignmentFile samfile - cdef htsFile * htsfile - cdef index - cdef int owns_samfile - cdef bam_hdr_t * header - diff --git a/pysam/calignmentfile.pyx b/pysam/calignmentfile.pyx deleted file mode 100644 index ed5e584..0000000 --- a/pysam/calignmentfile.pyx +++ /dev/null @@ -1,2535 +0,0 @@ -# cython: embedsignature=True -# cython: profile=True -######################################################## -######################################################## -# Cython wrapper for SAM/BAM/CRAM files based on htslib -######################################################## -# The principal classes defined in this module are: -# -# class AlignmentFile read/write access to SAM/BAM/CRAM formatted files -# -# class IndexedReads index a SAM/BAM/CRAM file by query name while keeping -# the original sort order intact -# -# Additionally this module defines numerous additional classes that -# are part of the internal API. These are: -# -# Various iterator classes to iterate over alignments in sequential -# (IteratorRow) or in a stacked fashion (IteratorColumn): -# -# class IteratorRow -# class IteratorRowRegion -# class IteratorRowHead -# class IteratorRowAll -# class IteratorRowAllRefs -# class IteratorRowSelection -# class IteratorColumn -# class IteratorColumnRegion -# class IteratorColumnAllRefs -# -######################################################## -# -# The MIT License -# -# Copyright (c) 2015 Andreas Heger -# -# Permission is hereby granted, free of charge, to any person obtaining a -# copy of this software and associated documentation files (the "Software"), -# to deal in the Software without restriction, including without limitation -# the rights to use, copy, modify, merge, publish, distribute, sublicense, -# and/or sell copies of the Software, and to permit persons to whom the -# Software is furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -# DEALINGS IN THE SOFTWARE. 
-#
-########################################################
-import os
-import collections
-import re
-import warnings
-import array
-
-from cpython cimport array as c_array
-from cpython.version cimport PY_MAJOR_VERSION
-
-from pysam.cutils cimport force_bytes, force_str, charptr_to_str
-from pysam.cutils cimport encode_filename, from_string_and_size
-from pysam.calignedsegment cimport makeAlignedSegment, makePileupColumn
-from pysam.chtslib cimport hisremote
-
-if PY_MAJOR_VERSION >= 3:
- from io import StringIO
-else:
- from StringIO import StringIO
-
-cimport cython
-
-########################################################
-## Constants and global variables
-
-# defines imported from samtools
-DEF SEEK_SET = 0
-DEF SEEK_CUR = 1
-DEF SEEK_END = 2
-
-# maximum genomic coordinate
-cdef int MAX_POS = 2 << 29
-
-# valid types for SAM headers
-VALID_HEADER_TYPES = {"HD" : dict,
- "SQ" : list,
- "RG" : list,
- "PG" : list,
- "CO" : list}
-
-# order of records within SAM headers
-VALID_HEADERS = ("HD", "SQ", "RG", "PG", "CO")
-
-# default type conversions within SAM header records
-KNOWN_HEADER_FIELDS = {"HD" : {"VN" : str, "SO" : str, "GO" : str},
- "SQ" : {"SN" : str, "LN" : int, "AS" : str,
- "M5" : str, "SP" : str, "UR" : str,},
- "RG" : {"ID" : str, "CN" : str, "DS" : str,
- "DT" : str, "FO" : str, "KS" : str,
- "LB" : str, "PG" : str, "PI" : str,
- "PL" : str, "PM" : str, "PU" : str,
- "SM" : str,},
- "PG" : {"ID" : str, "PN" : str, "CL" : str,
- "PP" : str, "DS" : str, "VN" : str,},}
-
-# output order of fields within records. Ensure that CL is at
-# the end as parsing a CL will ignore any subsequent records.
-VALID_HEADER_ORDER = {"HD" : ("VN", "SO", "GO"),
- "SQ" : ("SN", "LN", "AS", "M5",
- "UR", "SP"),
- "RG" : ("ID", "CN", "SM", "LB",
- "PU", "PI", "DT", "DS",
- "PL", "FO", "KS", "PG",
- "PM"),
- "PG" : ("PN", "ID", "VN", "PP",
- "DS", "CL"),}
-
-
-def build_header_line(fields, record):
- '''build a header line from `fields` dictionary for `record`'''
-
- # TODO: add checking for field and sort order
- line = ["@%s" % record]
- # comment
- if record == "CO":
- line.append(fields)
- # user tags
- elif record.islower():
- for key in sorted(fields):
- line.append("%s:%s" % (key, str(fields[key])))
- # defined tags
- else:
- # write fields of the specification
- for key in VALID_HEADER_ORDER[record]:
- if key in fields:
- line.append("%s:%s" % (key, str(fields[key])))
- # write user fields
- for key in fields:
- if not key.isupper():
- line.append("%s:%s" % (key, str(fields[key])))
-
- return "\t".join(line)
-
-cdef bam_hdr_t * build_header(new_header):
- '''return a new header built from a dictionary in `new_header`.
-
- This method inserts the text field, target_name and target_len.
- '''
-
- lines = []
-
- # check if hash exists
-
- # create new header and copy old data
- cdef bam_hdr_t * dest
-
- dest = bam_hdr_init()
-
- # first: defined tags
- for record in VALID_HEADERS:
- if record in new_header:
- ttype = VALID_HEADER_TYPES[record]
- data = new_header[record]
- if type(data) != type(ttype()):
- raise ValueError(
- "invalid type for record %s: %s, expected %s" %
- (record, type(data), type(ttype())))
- if type(data) is dict:
- lines.append(build_header_line(data, record))
- else:
- for fields in new_header[record]:
- lines.append(build_header_line(fields, record))
-
- # then: user tags (lower case), sorted alphabetically
- for record, data in sorted(new_header.items()):
- if record in VALID_HEADERS: continue
- if type(data) is dict:
- lines.append(build_header_line(data, record))
- else:
- for fields in new_header[record]:
- lines.append(build_header_line(fields, record))
-
- text = "\n".join(lines) + "\n"
- if dest.text != NULL: free( dest.text )
- dest.text = calloc(len(text), sizeof(char))
- dest.l_text = len(text)
- cdef bytes btext = text.encode('ascii')
- strncpy(dest.text, btext, dest.l_text)
-
- cdef bytes bseqname
- # collect targets
- if "SQ" in new_header:
- seqs = []
- for fields in new_header["SQ"]:
- try:
- seqs.append( (fields["SN"], fields["LN"] ) )
- except KeyError:
- raise KeyError( "incomplete sequence information in '%s'" % str(fields))
-
- dest.n_targets = len(seqs)
- dest.target_name = calloc(dest.n_targets, sizeof(char*))
- dest.target_len = calloc(dest.n_targets, sizeof(uint32_t))
-
- for x from 0 <= x < dest.n_targets:
- seqname, seqlen = seqs[x]
- dest.target_name[x] = calloc(
- len(seqname) + 1, sizeof(char))
- bseqname = seqname.encode('ascii')
- strncpy(dest.target_name[x], bseqname,
- len(seqname) + 1)
- dest.target_len[x] = seqlen
-
- return dest
-
-
-cdef class AlignmentFile:
- """AlignmentFile(filepath_or_object, mode=None, template=None,
- reference_names=None, reference_lengths=None, text=NULL,
- header=None, add_sq_text=False, check_header=True, check_sq=True,
- reference_filename=None, filename=None)
-
- A :term:`SAM`/:term:`BAM` formatted file.
-
- If `filepath_or_object` is a string, the file is automatically
- opened. If `filepath_or_object` is a python File object, the
- already opened file will be used.
-
- If the file is opened for reading and an index for the BAM file
- exists (.bai), it will be opened automatically. Without an index,
- random access via :meth:`~pysam.AlignmentFile.fetch` and
- :meth:`~pysam.AlignmentFile.pileup` is disabled.
-
- For writing, the header of a :term:`SAM` file/:term:`BAM` file can
- be constituted from several sources (see also the samtools format
- specification):
-
- 1. If `template` is given, the header is copied from another
- `AlignmentFile` (`template` must be a
- :class:`~pysam.AlignmentFile`).
-
- 2. If `header` is given, the header is built from a
- multi-level dictionary.
-
- 3. If `text` is given, new header text is copied from raw
- text.
-
- 4. The names (`reference_names`) and lengths
- (`reference_lengths`) are supplied directly as lists.
-
- When reading or writing a CRAM file, the filename of a FASTA-formatted
- reference can be specified with `reference_filename`.
-
- By default, if a file is opened in mode 'r', it is checked
- for a valid header (`check_header` = True) and a definition of
- chromosome names (`check_sq` = True).
-
- Parameters
- ----------
- mode : string
- `mode` should be ``r`` for reading or ``w`` for writing. The
- default is text mode (:term:`SAM`).
For binary (:term:`BAM`)
- I/O you should append ``b`` for compressed or ``u`` for
- uncompressed :term:`BAM` output. Use ``h`` to output header
- information in text (:term:`TAM`) mode. Use ``c`` for
- :term:`CRAM` formatted files.
-
- If ``b`` is present, it must immediately follow ``r`` or
- ``w``. Valid modes are ``r``, ``w``, ``wh``, ``rb``, ``wb``,
- ``wbu``, ``wb0``, ``rc`` and ``wc``. For instance, to open a
- :term:`BAM` formatted file for reading, type::
-
- f = pysam.AlignmentFile('ex1.bam','rb')
-
- If mode is not specified, the method will try to auto-detect
- in the order 'rb', 'r', thus both the following should work::
-
- f1 = pysam.AlignmentFile('ex1.bam')
- f2 = pysam.AlignmentFile('ex1.sam')
-
- template : AlignmentFile
- when writing, copy header from `template`.
-
- header : dict
- when writing, build header from a multi-level dictionary. The
- first level are the four types ('HD', 'SQ', ...). The second
- level are a list of lines, with each line being a list of
- tag-value pairs. The header is constructed first from all the
- defined fields, followed by user tags in alphabetical order.
-
- text : string
- when writing, use the string provided as the header.
-
- reference_names : list
- see reference_lengths
-
- reference_lengths : list
- when writing, build header from list of chromosome names and
- lengths. By default, 'SQ' and 'LN' tags will be added to the
- header text. This option can be changed by unsetting the flag
- `add_sq_text`.
-
- add_sq_text : bool
- do not add 'SQ' and 'LN' tags to header. This option permits
- the construction of :term:`SAM` formatted files without a header.
-
- check_header : bool
- when reading, check if header is present (default=True)
-
- check_sq : bool
- when reading, check if SQ entries are present in header
- (default=True)
-
- reference_filename : string
- Path to a FASTA-formatted reference file. Valid only for CRAM files.
- When reading a CRAM file, this overrides both ``$REF_PATH`` and the URL
- specified in the header (``UR`` tag), which are normally used to find
- the reference.
-
- filename : string
- Alternative to filepath_or_object. Filename of the file
- to be opened.
-
- """
-
- def __cinit__(self, *args, **kwargs):
-
- self.htsfile = NULL
- self._filename = None
- self.is_bam = False
- self.is_stream = False
- self.is_cram = False
- self.is_remote = False
-
- if "filename" in kwargs:
- args = [kwargs["filename"]]
- del kwargs["filename"]
-
- self._open(*args, **kwargs)
-
- # allocate memory for iterator
- self.b = calloc(1, sizeof(bam1_t))
-
- def is_open(self):
- '''return true if htsfile has been opened.'''
- return self.htsfile != NULL
-
- def has_index(self):
- """return true if htsfile has an existing (and opened) index.
- """
- return self.index != NULL
-
- def check_index(self):
- """return True if index is present.
-
- Raises
- ------
-
- AttributeError
- if htsfile is :term:`SAM` formatted and thus has no index.
-
- ValueError
- if htsfile is closed or index could not be opened.
- """ - - if not self.is_open(): - raise ValueError("I/O operation on closed file") - if not self.is_bam and not self.is_cram: - raise AttributeError( - "AlignmentFile.mapped only available in bam files") - if self.index == NULL: - raise ValueError( - "mapping information not recorded in index " - "or index not available") - return True - - def _open(self, - filepath_or_object, - mode=None, - AlignmentFile template=None, - reference_names=None, - reference_lengths=None, - reference_filename=None, - text=None, - header=None, - port=None, - add_sq_text=True, - check_header=True, - check_sq=True, - filepath_index=None, - referencenames=None, - referencelengths=None): - '''open a sam, bam or cram formatted file. - - If _open is called on an existing file, the current file - will be closed and a new file will be opened. - ''' - cdef char *cfilename - cdef char *creference_filename - cdef char *cindexname - cdef char *cmode - - # for backwards compatibility: - if referencenames is not None: - reference_names = referencenames - if referencelengths is not None: - reference_lengths = referencelengths - - # autodetection for read - if mode is None: - mode = "r" - - assert mode in ("r", "w", "rb", "wb", "wh", - "wbu", "rU", "wb0", - "rc", "wc"), \ - "invalid file opening mode `%s`" % mode - - # close a previously opened file - if self.htsfile != NULL: - self.close() - - # StringIO not supported - if isinstance(filepath_or_object, StringIO): - filename = "stringio" - raise NotImplementedError( - "access from StringIO objects not supported") - if filepath_or_object.closed: - raise ValueError('I/O operation on closed StringIO object') - # check if we are working with a File object - elif hasattr(filepath_or_object, "fileno"): - filename = filepath_or_object.name - if filepath_or_object.closed: - raise ValueError('I/O operation on closed file') - else: - filename = filepath_or_object - - # for htslib, wbu seems to not work - if mode == "wbu": - mode = "wb0" - - cdef bytes bmode = mode.encode('ascii') - self._filename = filename = encode_filename(filename) - self._reference_filename = reference_filename = encode_filename( - reference_filename) - - # FIXME: Use htsFormat when it is available - self.is_stream = filename == b"-" - self.is_remote = hisremote(filename) - - cdef char * ctext - cdef hFILE * fp - ctext = NULL - - if mode[0] == 'w': - # open file for writing - - # header structure (used for writing) - if template: - self.header = bam_hdr_dup(template.header) - elif header: - self.header = build_header(header) - else: - # build header from a target names and lengths - assert reference_names and reference_lengths, \ - ("either supply options `template`, `header` " - "or both `reference_names` and `reference_lengths` " - "for writing") - assert len(reference_names) == len(reference_lengths), \ - "unequal names and lengths of reference sequences" - - # allocate and fill header - reference_names = [force_bytes(ref) for ref in reference_names] - self.header = bam_hdr_init() - self.header.n_targets = len(reference_names) - n = 0 - for x in reference_names: - n += len(x) + 1 - self.header.target_name = calloc( - n, sizeof(char*)) - self.header.target_len = calloc( - n, sizeof(uint32_t)) - for x from 0 <= x < self.header.n_targets: - self.header.target_len[x] = reference_lengths[x] - name = reference_names[x] - self.header.target_name[x] = calloc( - len(name) + 1, sizeof(char)) - strncpy(self.header.target_name[x], name, len(name)) - - # Optionally, if there is no text, add a SAM - # compatible header to 
output file.
- if text is None and add_sq_text:
- text = []
- for x from 0 <= x < self.header.n_targets:
- text.append("@SQ\tSN:%s\tLN:%s\n" % \
- (force_str(reference_names[x]),
- reference_lengths[x]))
- text = ''.join(text)
-
- if text is not None:
- # copy without \0
- text = force_bytes(text)
- ctext = text
- self.header.l_text = strlen(ctext)
- self.header.text = calloc(
- strlen(ctext), sizeof(char))
- memcpy(self.header.text, ctext, strlen(ctext))
-
- # open file (hts_open is a synonym for sam_open)
- cfilename, cmode = filename, bmode
- if hasattr(filepath_or_object, "fileno"):
- fp = hdopen(filepath_or_object.fileno(), cmode)
- with nogil:
- self.htsfile = hts_hopen(fp, cfilename, cmode)
- else:
- with nogil:
- self.htsfile = hts_open(cfilename, cmode)
-
- # htsfile.format does not get set until writing, so use
- # the format specifier explicitly given by the user.
- self.is_bam = "b" in mode
- self.is_cram = "c" in mode
-
- # set filename with reference sequences. If no filename
- # is given, the CRAM reference arrays will be built from
- # the @SQ header in the header
- if self.is_cram and reference_filename:
- # note that fn_aux takes ownership, so create a copy
- self.htsfile.fn_aux = strdup(self._reference_filename)
-
- # write header to htsfile
- if self.is_bam or self.is_cram or "h" in mode:
- with nogil:
- sam_hdr_write(self.htsfile, self.header)
-
- elif mode[0] == "r":
- # open file for reading
- if (filename != b"-"
- and not self.is_remote
- and not os.path.exists(filename)):
- raise IOError("file `%s` not found" % filename)
-
- # open file (hts_open is a synonym for sam_open)
- cfilename, cmode = filename, bmode
- if hasattr(filepath_or_object, "fileno"):
- fp = hdopen(filepath_or_object.fileno(), cmode)
- with nogil:
- self.htsfile = hts_hopen(fp, cfilename, cmode)
- else:
- with nogil:
- self.htsfile = hts_open(cfilename, cmode)
-
- if self.htsfile == NULL:
- raise ValueError(
- "could not open file (mode='%s') - "
- "is it SAM/BAM format?" % mode)
-
- self.is_bam = self.htsfile.format.format == bam
- self.is_cram = self.htsfile.format.format == cram
-
- # bam files require a valid header
- if self.is_bam or self.is_cram:
- with nogil:
- self.header = sam_hdr_read(self.htsfile)
- if self.header == NULL:
- raise ValueError(
- "file does not have valid header (mode='%s') "
- "- is it BAM format?" % mode )
- else:
- # in sam files it is optional (htsfile full of
- # unmapped reads)
- if check_header:
- with nogil:
- self.header = sam_hdr_read(self.htsfile)
- if self.header == NULL:
- raise ValueError(
- "file does not have valid header (mode='%s') "
- "- is it SAM format?" % mode )
- # self.header.ignore_sam_err = True
-
- # set filename with reference sequences
- if self.is_cram and reference_filename:
- creference_filename = self._reference_filename
- hts_set_opt(self.htsfile,
- CRAM_OPT_REFERENCE,
- creference_filename)
-
- if check_sq and self.header.n_targets == 0:
- raise ValueError(
- ("file has no sequences defined (mode='%s') - "
- "is it SAM/BAM format?
Consider opening with "
- "check_sq=False") % mode)
-
- if self.htsfile == NULL:
- raise IOError("could not open file `%s`" % filename )
-
- # check for index and open if present
- cdef int format_index = -1
- if self.is_bam:
- format_index = HTS_FMT_BAI
- elif self.is_cram:
- format_index = HTS_FMT_CRAI
-
- if mode[0] == "r" and (self.is_bam or self.is_cram):
-
- # open index for remote files
- if self.is_remote and not filepath_index:
- cfilename = filename
-
- with nogil:
- self.index = hts_idx_load(cfilename, format_index)
- if self.index == NULL:
- warnings.warn(
- "unable to open remote index for '%s'" % cfilename)
- else:
- has_index = True
- cfilename = filename
- if filepath_index:
- if not os.path.exists(filepath_index):
- warnings.warn(
- "unable to open index at %s" % filepath_index)
- self.index = NULL
- has_index = False
- else:
- if self.is_bam \
- and not os.path.exists(filename + b".bai") \
- and not os.path.exists(filename[:-4] + b".bai"):
- self.index = NULL
- has_index = False
- elif self.is_cram \
- and not os.path.exists(filename + b".crai") \
- and not os.path.exists(filename[:-5] + b".crai"):
- self.index = NULL
- has_index = False
-
- if has_index:
- # returns NULL if there is no index or index could
- # not be opened
- if filepath_index:
- cindexname = filepath_index = encode_filename(filepath_index)
- with nogil:
- self.index = sam_index_load2(self.htsfile,
- cfilename,
- cindexname)
-
- else:
- with nogil:
- self.index = sam_index_load(self.htsfile,
- cfilename)
- if self.index == NULL:
- raise IOError(
- "error while opening index for '%s'" %
- filename)
-
- # save start of data section
- if not self.is_stream:
- self.start_offset = self.tell()
-
- def get_tid(self, reference):
- """
- return the numerical :term:`tid` corresponding to
- :term:`reference`
-
- returns -1 if reference is not known.
- """
- if not self.is_open():
- raise ValueError("I/O operation on closed file")
- reference = force_bytes(reference)
- return bam_name2id(self.header, reference)
-
- def get_reference_name(self, tid):
- """
- return :term:`reference` name corresponding to numerical :term:`tid`
- """
- if not self.is_open():
- raise ValueError("I/O operation on closed file")
- if not 0 <= tid < self.header.n_targets:
- raise ValueError("reference_id %i out of range 0<=tid<%i" %
- (tid, self.header.n_targets))
- return charptr_to_str(self.header.target_name[tid])
-
- def reset(self):
- """reset file position to beginning of file just after
- the header.
-
- Returns
- -------
-
- The file position after moving the file pointer.
-
- """
- return self.seek(self.start_offset, 0)
-
- def seek(self, uint64_t offset, int where=0):
- """move file pointer to position `offset`, see
- :meth:`pysam.AlignmentFile.tell`.
-
- Parameters
- ----------
-
- offset : int
-
- position of the read/write pointer within the file.
-
- where : int
-
- optional and defaults to 0 which means absolute file
- positioning, other values are 1 which means seek relative to
- the current position and 2 means seek relative to the file's
- end.
-
- Returns
- -------
-
- the file position after moving the file pointer
-
- """
-
- if not self.is_open():
- raise ValueError("I/O operation on closed file")
- if not self.is_bam:
- raise NotImplementedError(
- "seek only available in bam files")
- if self.is_stream:
- raise OSError("seek not available in streams")
-
- cdef uint64_t pos
- with nogil:
- pos = bgzf_seek(hts_get_bgzfp(self.htsfile), offset, where)
- return pos
-
- def tell(self):
- """
- return current file position.
-
- """
- if not self.is_open():
- raise ValueError("I/O operation on closed file")
- if not (self.is_bam or self.is_cram):
- raise NotImplementedError(
- "tell only available in bam and cram files")
-
- cdef uint64_t pos
- with nogil:
- pos = bgzf_tell(hts_get_bgzfp(self.htsfile))
- return pos
-
- def parse_region(self,
- reference=None,
- start=None,
- end=None,
- region=None,
- tid=None):
- """parse alternative ways to specify a genomic region. A region can
- either be specified by :term:`reference`, `start` and
- `end`. `start` and `end` denote 0-based, half-open
- intervals.
-
- Alternatively, a samtools :term:`region` string can be
- supplied.
-
- If any of the coordinates are missing they will be replaced by the
- minimum (`start`) or maximum (`end`) coordinate.
-
- Note that region strings are 1-based, while `start` and `end` denote
- an interval in python coordinates.
-
- Returns
- -------
-
- tuple : a tuple of `flag`, :term:`tid`, `start` and `end`. The
- flag indicates whether no coordinates were supplied and the
- genomic region is the complete genomic space.
-
- Raises
- ------
-
- ValueError
- for invalid or out of bounds regions.
-
- """
- cdef int rtid
- cdef long long rstart
- cdef long long rend
-
- rtid = -1
- rstart = 0
- rend = MAX_POS
- if start != None:
- try:
- rstart = start
- except OverflowError:
- raise ValueError('start out of range (%i)' % start)
-
- if end != None:
- try:
- rend = end
- except OverflowError:
- raise ValueError('end out of range (%i)' % end)
-
- if region:
- region = force_str(region)
- parts = re.split("[:-]", region)
- reference = parts[0]
- if len(parts) >= 2:
- rstart = int(parts[1]) - 1
- if len(parts) >= 3:
- rend = int(parts[2])
-
- if not reference:
- return 0, 0, 0, 0
-
- if tid is not None:
- rtid = tid
- else:
- rtid = self.gettid(reference)
-
- if rtid < 0:
- raise ValueError(
- "invalid reference `%s`" % reference)
- if rstart > rend:
- raise ValueError(
- 'invalid coordinates: start (%i) > end (%i)' % (rstart, rend))
- if not 0 <= rstart < MAX_POS:
- raise ValueError('start out of range (%i)' % rstart)
- if not 0 <= rend <= MAX_POS:
- raise ValueError('end out of range (%i)' % rend)
-
- return 1, rtid, rstart, rend
-
- def fetch(self,
- reference=None,
- start=None,
- end=None,
- region=None,
- tid=None,
- until_eof=False,
- multiple_iterators=False):
- """fetch reads aligned in a :term:`region`.
-
- See :meth:`AlignmentFile.parse_region` for more information
- on genomic regions.
-
- Without a `reference` or `region` all mapped reads in the file
- will be fetched. The reads will be returned ordered by reference
- sequence, which will not necessarily be the order within the
- file. This mode of iteration still requires an index. If there is
- no index, use `until_eof=True`.
-
- If only `reference` is set, all reads aligned to `reference`
- will be fetched.
-
- A :term:`SAM` file does not allow random access. If `region`
- or `reference` are given, an exception is raised.
-
- Parameters
- ----------
-
- until_eof : bool
-
- If `until_eof` is True, all reads from the current file
- position will be returned in order as they are within the
- file. Using this option will also fetch unmapped reads.
-
- multiple_iterators : bool
-
- If `multiple_iterators` is True, multiple
- iterators on the same file can be used at the same time.
The
- iterator returned will receive its own copy of a filehandle to
- the file effectively re-opening the file. Re-opening a file
- creates some overhead, so beware.
-
- Returns
- -------
-
- An iterator over a collection of reads.
-
- Raises
- ------
-
- ValueError
- if the genomic coordinates are out of range or invalid or the
- file does not permit random access to genomic coordinates.
-
- """
- cdef int rtid, rstart, rend, has_coord
-
- if not self.is_open():
- raise ValueError( "I/O operation on closed file" )
-
- has_coord, rtid, rstart, rend = self.parse_region(
- reference,
- start,
- end,
- region,
- tid)
-
- # Turn off re-opening if htsfile is a stream
- if self.is_stream:
- multiple_iterators = False
-
- if self.is_bam or self.is_cram:
- if not until_eof and not self.is_remote:
- if not self.has_index():
- raise ValueError(
- "fetch called on bamfile without index")
-
- if has_coord:
- return IteratorRowRegion(
- self, rtid, rstart, rend,
- multiple_iterators=multiple_iterators)
- else:
- if until_eof:
- return IteratorRowAll(
- self,
- multiple_iterators=multiple_iterators)
- else:
- # AH: check - reason why no multiple_iterators for
- # AllRefs?
- return IteratorRowAllRefs(
- self,
- multiple_iterators=multiple_iterators)
- else:
- if has_coord:
- raise ValueError(
- "fetching by region is not available for sam files")
-
- if self.header == NULL:
- raise ValueError(
- "fetch called for htsfile without header")
-
- # check if targets are defined
- # give warning, sam_read1 segfaults
- if self.header.n_targets == 0:
- warnings.warn("fetch called for htsfile without header")
-
- return IteratorRowAll(self,
- multiple_iterators=multiple_iterators)
-
- def head(self, n, multiple_iterators=True):
- '''return an iterator over the first n alignments.
-
- This iterator is useful for inspecting the bam-file.
-
- Parameters
- ----------
-
- multiple_iterators : bool
-
- is set to True by default in order to
- avoid changing the current file position.
-
- Returns
- -------
-
- an iterator over a collection of reads
-
- '''
- return IteratorRowHead(self, n,
- multiple_iterators=multiple_iterators)
-
- def mate(self, AlignedSegment read):
- '''return the mate of :class:`~pysam.AlignedSegment` `read`.
-
- .. note::
-
- Calling this method will change the file position.
- This might interfere with any iterators that have
- not re-opened the file.
-
- .. note::
-
- This method is too slow for high-throughput processing.
- If a read needs to be processed with its mate, work
- from a read name sorted file or, better, cache reads.
-
- Returns
- -------
-
- :class:`~pysam.AlignedSegment` : the mate
-
- Raises
- ------
-
- ValueError
- if the read is unpaired or the mate is unmapped
-
- '''
- cdef uint32_t flag = read._delegate.core.flag
-
- if flag & BAM_FPAIRED == 0:
- raise ValueError("read %s: is unpaired" %
- (read.query_name))
- if flag & BAM_FMUNMAP != 0:
- raise ValueError("mate %s: is unmapped" %
- (read.query_name))
-
- # xor flags to get the other mate
- cdef int x = BAM_FREAD1 + BAM_FREAD2
- flag = (flag ^ x) & x
-
- # Make sure to use a separate file to jump around
- # to mate as otherwise the original file position
- # will be lost
- # The following code is not using the C API and
- # could thus be made much quicker, for example
- # by using tell and seek.
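# For illustration, a typical (if slow) use of mate() on a hypothetical
# indexed BAM "ex1.bam"; the fetch below then locates the mate:
#
#   bf = pysam.AlignmentFile("ex1.bam", "rb")
#   for read in bf.fetch("chr1", 100, 200):
#       if read.is_paired and not read.mate_is_unmapped:
#           mate = bf.mate(read)  # re-opens and seeks, hence the overhead
#   bf.close()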
-
- for mate in self.fetch(
- read._delegate.core.mpos,
- read._delegate.core.mpos + 1,
- tid=read._delegate.core.mtid,
- multiple_iterators=True):
- if mate.flag & flag != 0 and \
- mate.query_name == read.query_name:
- break
- else:
- raise ValueError("mate not found")
-
- return mate
-
- def pileup(self,
- reference=None,
- start=None,
- end=None,
- region=None,
- **kwargs):
- """perform a :term:`pileup` within a :term:`region`. The region is
- specified by :term:`reference`, 'start' and 'end' (using
- 0-based indexing). Alternatively, a samtools 'region' string
- can be supplied.
-
- Without 'reference' or 'region' all reads will be used for the
- pileup. The reads will be returned ordered by
- :term:`reference` sequence, which will not necessarily be the
- order within the file.
-
- Note that :term:`SAM` formatted files do not allow random
- access. In these files, if a 'region' or 'reference' are
- given, an exception is raised.
-
- .. note::
-
- 'all' reads which overlap the region are returned. The
- first base returned will be the first base of the first
- read 'not' necessarily the first base of the region used
- in the query.
-
- Parameters
- ----------
-
- stepper : string
- The stepper controls how the iterator advances.
- Possible options for the stepper are:
-
- ``all``
- skip reads in which any of the following flags are set:
- BAM_FUNMAP, BAM_FSECONDARY, BAM_FQCFAIL, BAM_FDUP
-
- ``nofilter``
- uses every single read
-
- ``samtools``
- same filter and read processing as in :term:`csamtools`
- pileup. This requires a 'fastafile' to be given.
-
-
- fastafile : :class:`~pysam.FastaFile` object.
-
- This is required for some of the steppers.
-
- max_depth : int
- Maximum read depth permitted. The default limit is '8000'.
-
- truncate : bool
-
- By default, the samtools pileup engine outputs all reads
- overlapping a region. If truncate is True and a region is
- given, only columns in the exact region specified are
- returned.
-
- Returns
- -------
-
- an iterator over genomic positions.
-
- """
- cdef int rtid, rstart, rend, has_coord
-
- if not self.is_open():
- raise ValueError("I/O operation on closed file")
-
- has_coord, rtid, rstart, rend = self.parse_region(
- reference, start, end, region)
-
- if self.is_bam or self.is_cram:
- if not self.has_index():
- raise ValueError("no index available for pileup")
-
- if has_coord:
- return IteratorColumnRegion(self,
- tid=rtid,
- start=rstart,
- end=rend,
- **kwargs )
- else:
- return IteratorColumnAllRefs(self, **kwargs )
-
- else:
- raise NotImplementedError(
- "pileup of samfiles not implemented yet")
-
- def count(self,
- reference=None,
- start=None,
- end=None,
- region=None,
- until_eof=False,
- read_callback="nofilter"):
- '''count the number of reads in :term:`region`
-
- The region is specified by :term:`reference`, `start` and
- `end`. Alternatively, a :term:`samtools` :term:`region` string
- can be supplied.
-
- A :term:`SAM` file does not allow random access and if
- `region` or `reference` are given, an exception is raised.
-
- Parameters
- ----------
-
- reference : string
- reference_name of the genomic region (chromosome)
-
- start : int
- start of the genomic region
-
- end : int
- end of the genomic region
-
- region : string
- a region string in samtools format.
-
- until_eof : bool
- count until the end of the file, possibly including
- unmapped reads as well.
-
- read_callback: string or function
-
- select a call-back to ignore reads when counting.
It can - be either a string with the following values: - - ``all`` - skip reads in which any of the following - flags are set: BAM_FUNMAP, BAM_FSECONDARY, BAM_FQCFAIL, - BAM_FDUP - - ``nofilter`` - uses every single read - - Alternatively, `read_callback` can be a function - ``check_read(read)`` that should return True only for - those reads that shall be included in the counting. - - Raises - ------ - - ValueError - if the genomic coordinates are out of range or invalid. - - ''' - cdef AlignedSegment read - cdef long counter = 0 - - if not self.is_open(): - raise ValueError( "I/O operation on closed file" ) - - cdef int filter_method = 0 - if read_callback == "all": - filter_method = 1 - elif read_callback == "nofilter": - filter_method = 2 - - for read in self.fetch(reference=reference, - start=start, - end=end, - region=region, - until_eof=until_eof): - # apply filter - if filter_method == 1: - # filter = "all" - if (read.flag & (0x4 | 0x100 | 0x200 | 0x400)): - continue - elif filter_method == 2: - # filter = "nofilter" - pass - else: - if not read_callback(read): - continue - counter += 1 - - return counter - - @cython.boundscheck(False) # we do manual bounds checking - def count_coverage(self, - reference=None, - start=None, - end=None, - region=None, - quality_threshold=15, - read_callback='all'): - """count the coverage of genomic positions by reads in :term:`region`. - - The region is specified by :term:`reference`, `start` and - `end`. Alternatively, a :term:`samtools` :term:`region` string - can be supplied. The coverage is computed per-base [ACGT]. - - Parameters - ---------- - - reference : string - reference_name of the genomic region (chromosome) - - start : int - start of the genomic region - - end : int - end of the genomic region - - region : int - a region string. - - quality_threshold : int - quality_threshold is the minimum quality score (in phred) a - base has to reach to be counted. - - read_callback: string or function - - select a call-back to ignore reads when counting. It can - be either a string with the following values: - - ``all`` - skip reads in which any of the following - flags are set: BAM_FUNMAP, BAM_FSECONDARY, BAM_FQCFAIL, - BAM_FDUP - - ``nofilter`` - uses every single read - - Alternatively, `read_callback` can be a function - ``check_read(read)`` that should return True only for - those reads that shall be included in the counting. - - Raises - ------ - - ValueError - if the genomic coordinates are out of range or invalid. 
- - Returns - ------- - - four array.arrays of the same length in order A C G T : tuple - - """ - - cdef int _start = start - cdef int _stop = end - cdef int length = _stop - _start - cdef c_array.array int_array_template = array.array('L', []) - cdef c_array.array count_a - cdef c_array.array count_c - cdef c_array.array count_g - cdef c_array.array count_t - count_a = c_array.clone(int_array_template, length, zero=True) - count_c = c_array.clone(int_array_template, length, zero=True) - count_g = c_array.clone(int_array_template, length, zero=True) - count_t = c_array.clone(int_array_template, length, zero=True) - - cdef AlignedSegment read - cdef cython.str seq - cdef c_array.array quality - cdef int qpos - cdef int refpos - cdef int c = 0 - cdef int filter_method = 0 - if read_callback == "all": - filter_method = 1 - elif read_callback == "nofilter": - filter_method = 2 - - cdef int _threshold = quality_threshold - for read in self.fetch(reference=reference, - start=start, - end=end, - region=region): - # apply filter - if filter_method == 1: - # filter = "all" - if (read.flag & (0x4 | 0x100 | 0x200 | 0x400)): - continue - elif filter_method == 2: - # filter = "nofilter" - pass - else: - if not read_callback(read): - continue - - # count - seq = read.seq - quality = read.query_qualities - for qpos, refpos in read.get_aligned_pairs(True): - if qpos is not None and refpos is not None and \ - _start <= refpos < _stop: - if quality[qpos] >= quality_threshold: - if seq[qpos] == 'A': - count_a.data.as_ulongs[refpos - _start] += 1 - if seq[qpos] == 'C': - count_c.data.as_ulongs[refpos - _start] += 1 - if seq[qpos] == 'G': - count_g.data.as_ulongs[refpos - _start] += 1 - if seq[qpos] == 'T': - count_t.data.as_ulongs[refpos - _start] += 1 - - return count_a, count_c, count_g, count_t - - def close(self): - ''' - closes the :class:`pysam.AlignmentFile`.''' - if self.htsfile != NULL: - hts_close(self.htsfile) - hts_idx_destroy(self.index); - self.htsfile = NULL - - def __dealloc__(self): - # remember: dealloc cannot call other methods - # note: no doc string - # note: __del__ is not called. - - # FIXME[kbj]: isn't self.close a method? I've been duplicating - # close within __dealloc__ (see BCFFile.__dealloc__). Not a pretty - # solution and perhaps unnecessary given that calling self.close has - # been working for years. - # AH: I have removed the call to close. Even though it is working, - # it seems to be dangerous according to the documentation as the - # object be partially deconstructed already. - if self.htsfile != NULL: - hts_close(self.htsfile) - hts_idx_destroy(self.index); - self.htsfile = NULL - - bam_destroy1(self.b) - if self.header != NULL: - bam_hdr_destroy(self.header) - - cpdef int write(self, AlignedSegment read) except -1: - ''' - write a single :class:`pysam.AlignedSegment` to disk. - - Raises - ------ - ValueError - if the writing failed - - Returns - ------- - - int : the number of bytes written. If the file is closed, - this will be 0. - ''' - if not self.is_open(): - return 0 - - cdef int ret - - with nogil: - ret = sam_write1(self.htsfile, - self.header, - read._delegate) - - # kbj: Still need to raise an exception with except -1. Otherwise - # when ret == -1 we get a "SystemError: error return without - # exception set". 
- if ret < 0: - raise ValueError('sam write failed') - - return ret - - # context manager interface - def __enter__(self): - return self - - def __exit__(self, exc_type, exc_value, traceback): - self.close() - return False - - ############################################################### - ############################################################### - ############################################################### - ## properties - ############################################################### - property closed: - """bool indicating the current state of the file object. - This is a read-only attribute; the close() method changes the value. - """ - def __get__(self): - return not self.is_open() - - property filename: - """filename associated with this object. This is a read-only attribute.""" - def __get__(self): - return self._filename - - property nreferences: - """"int with the number of :term:`reference` sequences in the file. - This is a read-only attribute.""" - def __get__(self): - if not self.is_open(): - raise ValueError("I/O operation on closed file") - return self.header.n_targets - - property references: - """tuple with the names of :term:`reference` sequences. This is a - read-only attribute""" - def __get__(self): - if not self.is_open(): raise ValueError( "I/O operation on closed file" ) - t = [] - for x from 0 <= x < self.header.n_targets: - t.append(charptr_to_str(self.header.target_name[x])) - return tuple(t) - - property lengths: - """tuple of the lengths of the :term:`reference` sequences. This is a - read-only attribute. The lengths are in the same order as - :attr:`pysam.AlignmentFile.references` - - """ - def __get__(self): - if not self.is_open(): - raise ValueError("I/O operation on closed file") - t = [] - for x from 0 <= x < self.header.n_targets: - t.append(self.header.target_len[x]) - return tuple(t) - - property mapped: - """int with total number of mapped alignments according to the - statistics recorded in the index. This is a read-only - attribute. - """ - def __get__(self): - self.check_index() - cdef int tid - cdef uint64_t total = 0 - cdef uint64_t mapped, unmapped - for tid from 0 <= tid < self.header.n_targets: - with nogil: - hts_idx_get_stat(self.index, tid, &mapped, &unmapped) - total += mapped - return total - - property unmapped: - """int with total number of unmapped reads according to the statistics - recorded in the index. This number of reads includes the number of reads - without coordinates. This is a read-only attribute. - """ - def __get__(self): - self.check_index() - cdef int tid - cdef uint64_t total = hts_idx_get_n_no_coor(self.index) - cdef uint64_t mapped, unmapped - for tid from 0 <= tid < self.header.n_targets: - with nogil: - hts_idx_get_stat(self.index, tid, &mapped, &unmapped) - total += unmapped - return total - - property nocoordinate: - """int with total number of reads without coordinates according to the - statistics recorded in the index. This is a read-only attribute. - """ - def __get__(self): - self.check_index() - cdef uint64_t n - with nogil: - n = hts_idx_get_n_no_coor(self.index) - return n - - property format: - '''string describing the file format''' - def __get__(self): - if not self.is_open(): - raise ValueError( "I/O operation on closed file" ) - return hts_format_description(&self.htsfile.format) - - property text: - '''string with the full contents of the :term:`sam file` header as a - string. - - This is a read-only attribute. 
- - See :attr:`pysam.AlignmentFile.header` to get a parsed - representation of the header. - ''' - def __get__(self): - if not self.is_open(): - raise ValueError( "I/O operation on closed file" ) - return from_string_and_size(self.header.text, self.header.l_text) - - property header: - """two-level dictionay with header information from the file. - - This is a read-only attribute. - - The first level contains the record (``HD``, ``SQ``, etc) and - the second level contains the fields (``VN``, ``LN``, etc). - - The parser is validating and will raise an AssertionError if - if encounters any record or field tags that are not part of - the SAM specification. Use the - :attr:`pysam.AlignmentFile.text` attribute to get the unparsed - header. - - The parsing follows the SAM format specification with the - exception of the ``CL`` field. This option will consume the - rest of a header line irrespective of any additional fields. - This behaviour has been added to accommodate command line - options that contain characters that are not valid field - separators. - - """ - def __get__(self): - if not self.is_open(): - raise ValueError( "I/O operation on closed file" ) - - result = {} - - if self.header.text != NULL: - # convert to python string (note: call self.text to - # create 0-terminated string) - t = self.text - for line in t.split("\n"): - if not line.strip(): continue - assert line.startswith("@"), \ - "header line without '@': '%s'" % line - fields = line[1:].split("\t") - record = fields[0] - assert record in VALID_HEADER_TYPES, \ - "header line with invalid type '%s': '%s'" % (record, line) - - # treat comments - if record == "CO": - if record not in result: - result[record] = [] - result[record].append("\t".join( fields[1:])) - continue - # the following is clumsy as generators do not work? - x = {} - - for idx, field in enumerate(fields[1:]): - if ":" not in field: - raise ValueError("malformatted header: no ':' in field" ) - key, value = field.split(":", 1) - if key in ("CL",): - # special treatment for command line - # statements (CL). These might contain - # characters that are non-conformant with - # the valid field separators in the SAM - # header. Thus, in contravention to the - # SAM API, consume the rest of the line. - key, value = "\t".join(fields[idx+1:]).split(":", 1) - x[key] = KNOWN_HEADER_FIELDS[record][key](value) - break - - # interpret type of known header record tags, default to str - x[key] = KNOWN_HEADER_FIELDS[record].get(key, str)(value) - - if VALID_HEADER_TYPES[record] == dict: - if record in result: - raise ValueError( - "multiple '%s' lines are not permitted" % record) - - result[record] = x - elif VALID_HEADER_TYPES[record] == list: - if record not in result: result[record] = [] - result[record].append(x) - - # if there are no SQ lines in the header, add the - # reference names from the information in the bam - # file. - # - # Background: c-samtools keeps the textual part of the - # header separate from the list of reference names and - # lengths. Thus, if a header contains only SQ lines, - # the SQ information is not part of the textual header - # and thus are missing from the output. See issue 84. 
- if "SQ" not in result: - sq = [] - for ref, length in zip(self.references, self.lengths): - sq.append({'LN': length, 'SN': ref }) - result["SQ"] = sq - - return result - - ############################################################### - ## file-object like iterator access - ## note: concurrent access will cause errors (see IteratorRow - ## and multiple_iterators) - ## Possible solutions: deprecate or open new file handle - def __iter__(self): - if not self.is_open(): - raise ValueError("I/O operation on closed file") - - if not self.is_bam and self.header.n_targets == 0: - raise NotImplementedError( - "can not iterate over samfile without header") - return self - - cdef bam1_t * getCurrent( self ): - return self.b - - cdef int cnext(self): - ''' - cversion of iterator. Used by :class:`pysam.AlignmentFile.IteratorColumn`. - ''' - cdef int ret - with nogil: - ret = sam_read1(self.htsfile, - self.header, - self.b) - return ret - - def __next__(self): - cdef int ret = self.cnext() - if (ret >= 0): - return makeAlignedSegment(self.b, self) - elif ret == -2: - raise IOError('truncated file') - else: - raise StopIteration - - # Compatibility functions for pysam < 0.8.3 - def gettid(self, reference): - """deprecated, use get_tid() instead""" - return self.get_tid(reference) - - def getrname(self, tid): - """deprecated, use get_reference_name() instead""" - return self.get_reference_name(tid) - - -cdef class IteratorRow: - '''abstract base class for iterators over mapped reads. - - Various iterators implement different behaviours for wrapping around - contig boundaries. Examples include: - - :class:`pysam.IteratorRowRegion` - iterate within a single contig and a defined region. - - :class:`pysam.IteratorRowAll` - iterate until EOF. This iterator will also include unmapped reads. - - :class:`pysam.IteratorRowAllRefs` - iterate over all reads in all reference sequences. - - The method :meth:`AlignmentFile.fetch` returns an IteratorRow. - - .. note:: - - It is usually not necessary to create an object of this class - explicitly. It is returned as a result of call to a - :meth:`AlignmentFile.fetch`. - - ''' - - def __init__(self, AlignmentFile samfile, int multiple_iterators=False): - cdef char *cfilename - cdef char *creference_filename - - if not samfile.is_open(): - raise ValueError("I/O operation on closed file") - - # makes sure that samfile stays alive as long as the - # iterator is alive - self.samfile = samfile - - # reopen the file - note that this makes the iterator - # slow and causes pileup to slow down significantly. - if multiple_iterators: - cfilename = samfile._filename - with nogil: - self.htsfile = hts_open(cfilename, 'r') - assert self.htsfile != NULL - # read header - required for accurate positioning - # could a tell/seek work? 
- with nogil: - self.header = sam_hdr_read(self.htsfile) - assert self.header != NULL - self.owns_samfile = True - # options specific to CRAM files - if samfile.is_cram and samfile._reference_filename: - creference_filename = samfile._reference_filename - hts_set_opt(self.htsfile, - CRAM_OPT_REFERENCE, - creference_filename) - - else: - self.htsfile = self.samfile.htsfile - self.owns_samfile = False - self.header = self.samfile.header - - self.retval = 0 - - self.b = bam_init1() - - def __dealloc__(self): - bam_destroy1(self.b) - if self.owns_samfile: - hts_close(self.htsfile) - bam_hdr_destroy(self.header) - - -cdef class IteratorRowRegion(IteratorRow): - """*(AlignmentFile samfile, int tid, int beg, int end, - int multiple_iterators=False)* - - iterate over mapped reads in a region. - - .. note:: - - It is usually not necessary to create an object of this class - explicitly. It is returned as a result of call to a - :meth:`AlignmentFile.fetch`. - - """ - - def __init__(self, AlignmentFile samfile, - int tid, int beg, int end, - int multiple_iterators=False): - - IteratorRow.__init__(self, samfile, - multiple_iterators=multiple_iterators) - - if not samfile.has_index(): - raise ValueError("no index available for iteration") - - with nogil: - self.iter = sam_itr_queryi( - self.samfile.index, - tid, - beg, - end) - - def __iter__(self): - return self - - cdef bam1_t * getCurrent(self): - return self.b - - cdef int cnext(self): - '''cversion of iterator. Used by IteratorColumn''' - with nogil: - self.retval = hts_itr_next(hts_get_bgzfp(self.htsfile), - self.iter, - self.b, - self.htsfile) - - def __next__(self): - self.cnext() - if self.retval >= 0: - return makeAlignedSegment(self.b, self.samfile) - elif self.retval == -2: - # Note: it is currently not the case that hts_iter_next - # returns -2 for a truncated file. - # See https://github.com/pysam-developers/pysam/pull/50#issuecomment-64928625 - raise IOError('truncated file') - else: - raise StopIteration - - def __dealloc__(self): - hts_itr_destroy(self.iter) - - -cdef class IteratorRowHead(IteratorRow): - """*(AlignmentFile samfile, n, int multiple_iterators=False)* - - iterate over first n reads in `samfile` - - .. note:: - It is usually not necessary to create an object of this class - explicitly. It is returned as a result of call to a - :meth:`AlignmentFile.head`. - - """ - - def __init__(self, AlignmentFile samfile, int n, - int multiple_iterators=False): - - IteratorRow.__init__(self, samfile, - multiple_iterators=multiple_iterators) - - self.max_rows = n - self.current_row = 0 - - def __iter__(self): - return self - - cdef bam1_t * getCurrent( self ): - return self.b - - cdef int cnext(self): - '''cversion of iterator. Used by IteratorColumn''' - cdef int ret - with nogil: - ret = sam_read1(self.htsfile, - self.samfile.header, - self.b) - return ret - - def __next__(self): - if self.current_row >= self.max_rows: - raise StopIteration - - cdef int ret = self.cnext() - if ret >= 0: - self.current_row += 1 - return makeAlignedSegment(self.b, self.samfile) - elif ret == -2: - raise IOError('truncated file') - else: - raise StopIteration - - -cdef class IteratorRowAll(IteratorRow): - """*(AlignmentFile samfile, int multiple_iterators=False)* - - iterate over all reads in `samfile` - - .. note:: - - It is usually not necessary to create an object of this class - explicitly. It is returned as a result of call to a - :meth:`AlignmentFile.fetch`. 
- - """ - - def __init__(self, AlignmentFile samfile, - int multiple_iterators=False): - - IteratorRow.__init__(self, samfile, - multiple_iterators=multiple_iterators) - - def __iter__(self): - return self - - cdef bam1_t * getCurrent( self ): - return self.b - - cdef int cnext(self): - '''cversion of iterator. Used by IteratorColumn''' - cdef int ret - with nogil: - ret = sam_read1(self.htsfile, - self.samfile.header, - self.b) - return ret - - def __next__(self): - cdef int ret = self.cnext() - if ret >= 0: - return makeAlignedSegment(self.b, self.samfile) - elif ret == -2: - raise IOError('truncated file') - else: - raise StopIteration - - -cdef class IteratorRowAllRefs(IteratorRow): - """iterates over all mapped reads by chaining iterators over each - reference - - .. note:: - It is usually not necessary to create an object of this class - explicitly. It is returned as a result of call to a - :meth:`AlignmentFile.fetch`. - - """ - - def __init__(self, AlignmentFile samfile, - multiple_iterators=False): - - IteratorRow.__init__(self, samfile, - multiple_iterators=multiple_iterators) - - if not samfile.has_index(): - raise ValueError("no index available for fetch") - - self.tid = -1 - - def nextiter(self): - # get a new iterator for a chromosome. The file - # will not be re-opened. - self.rowiter = IteratorRowRegion(self.samfile, - self.tid, - 0, - 1<<29) - # set htsfile and header of the rowiter - # to the values in this iterator to reflect multiple_iterators - self.rowiter.htsfile = self.htsfile - self.rowiter.header = self.header - - # make sure the iterator understand that IteratorRowAllRefs - # has ownership - self.rowiter.owns_samfile = False - - def __iter__(self): - return self - - def __next__(self): - # Create an initial iterator - if self.tid == -1: - if not self.samfile.nreferences: - raise StopIteration - self.tid = 0 - self.nextiter() - - while 1: - self.rowiter.cnext() - - # If current iterator is not exhausted, return aligned read - if self.rowiter.retval > 0: - return makeAlignedSegment(self.rowiter.b, self.samfile) - - self.tid += 1 - - # Otherwise, proceed to next reference or stop - if self.tid < self.samfile.nreferences: - self.nextiter() - else: - raise StopIteration - - -cdef class IteratorRowSelection(IteratorRow): - """*(AlignmentFile samfile)* - - iterate over reads in `samfile` at a given list of file positions. - - .. note:: - It is usually not necessary to create an object of this class - explicitly. It is returned as a result of call to a :meth:`AlignmentFile.fetch`. 
- """ - - def __init__(self, AlignmentFile samfile, positions, int multiple_iterators=True): - - IteratorRow.__init__(self, samfile, multiple_iterators=multiple_iterators) - - self.positions = positions - self.current_pos = 0 - - def __iter__(self): - return self - - cdef bam1_t * getCurrent(self): - return self.b - - cdef int cnext(self): - '''cversion of iterator''' - # end iteration if out of positions - if self.current_pos >= len(self.positions): return -1 - - cdef uint64_t pos = self.positions[self.current_pos] - with nogil: - bgzf_seek(hts_get_bgzfp(self.htsfile), - pos, - 0) - self.current_pos += 1 - - cdef int ret - with nogil: - ret = sam_read1(self.htsfile, - self.samfile.header, - self.b) - return ret - - def __next__(self): - cdef int ret = self.cnext() - if (ret >= 0): - return makeAlignedSegment(self.b, self.samfile) - elif (ret == -2): - raise IOError('truncated file') - else: - raise StopIteration - - -cdef int __advance_nofilter(void *data, bam1_t *b): - '''advance without any read filtering. - ''' - cdef __iterdata * d - d = <__iterdata*>data - cdef int ret - with nogil: - ret = sam_itr_next(d.htsfile, d.iter, b) - return ret - - -cdef int __advance_all(void *data, bam1_t *b): - '''only use reads for pileup passing basic - filters: - - BAM_FUNMAP, BAM_FSECONDARY, BAM_FQCFAIL, BAM_FDUP - ''' - - cdef __iterdata * d - cdef mask = BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP - d = <__iterdata*>data - cdef int ret - with nogil: - ret = sam_itr_next(d.htsfile, d.iter, b) - while ret >= 0 and b.core.flag & mask: - with nogil: - ret = sam_itr_next(d.htsfile, d.iter, b) - return ret - - -cdef int __advance_snpcalls(void * data, bam1_t * b): - '''advance using same filter and read processing as in - the samtools pileup. - ''' - - # Note that this method requries acces to some - # functions in the samtools code base and is thus - # not htslib only. - # The functions accessed in samtools are: - # 1. bam_prob_realn - # 2. bam_cap_mapQ - cdef __iterdata * d - d = <__iterdata*>data - - cdef int ret - cdef int skip = 0 - cdef int q - cdef int is_cns = 1 - cdef int is_nobaq = 0 - cdef int capQ_thres = 0 - - with nogil: - ret = sam_itr_next(d.htsfile, d.iter, b) - - # reload sequence - if d.fastafile != NULL and b.core.tid != d.tid: - if d.seq != NULL: - free(d.seq) - d.tid = b.core.tid - with nogil: - d.seq = faidx_fetch_seq( - d.fastafile, - d.header.target_name[d.tid], - 0, MAX_POS, - &d.seq_len) - - if d.seq == NULL: - raise ValueError( - "reference sequence for '%s' (tid=%i) not found" % \ - (d.header.target_name[d.tid], - d.tid)) - - while ret >= 0: - skip = 0 - - # realign read - changes base qualities - if d.seq != NULL and is_cns and not is_nobaq: - bam_prob_realn(b, d.seq) - - if d.seq != NULL and capQ_thres > 10: - q = bam_cap_mapQ(b, d.seq, capQ_thres) - if q < 0: - skip = 1 - elif b.core.qual > q: - b.core.qual = q - if b.core.flag & BAM_FUNMAP: - skip = 1 - elif b.core.flag & 1 and not b.core.flag & 2: - skip = 1 - - if not skip: - break - # additional filters - - with nogil: - ret = sam_itr_next(d.htsfile, d.iter, b) - - return ret - -cdef class IteratorColumn: - '''abstract base class for iterators over columns. - - IteratorColumn objects wrap the pileup functionality of samtools. - - For reasons of efficiency, the iterator points to the current - pileup buffer. The pileup buffer is updated at every iteration. - This might cause some unexpected behavious. 
For example, - consider the conversion to a list:: - - f = AlignmentFile("file.bam", "rb") - result = list( f.pileup() ) - - Here, ``result`` will contain ``n`` objects of type - :class:`~pysam.PileupColumn` for ``n`` columns, but each object in - ``result`` will contain the same information. - - The desired behaviour can be achieved by list comprehension:: - - result = [ x.pileups() for x in f.pileup() ] - - ``result`` will be a list of ``n`` lists of objects of type - :class:`~pysam.PileupRead`. - - If the iterator is associated with a :class:`~pysam.Fastafile` using the - :meth:`addReference` method, then the iterator will export the - current sequence via the methods :meth:`getSequence` and - :meth:`seq_len`. - - Optional kwargs to the iterator: - - stepper - The stepper controls how the iterator advances. - - Valid values are None, "all" (default), "nofilter" or "samtools". - - See AlignmentFile.pileup for description. - - fastafile - A :class:`~pysam.FastaFile` object - - max_depth - maximum read depth. The default is 8000. - - ''' - - def __cinit__( self, AlignmentFile samfile, **kwargs ): - self.samfile = samfile - self.fastafile = kwargs.get("fastafile", None) - self.stepper = kwargs.get("stepper", None) - self.max_depth = kwargs.get("max_depth", 8000) - self.iterdata.seq = NULL - self.tid = 0 - self.pos = 0 - self.n_plp = 0 - self.plp = NULL - self.pileup_iter = NULL - - def __iter__(self): - return self - - cdef int cnext(self): - '''perform next iteration. - ''' - # do not release gil here because of call-backs - self.plp = bam_plp_auto(self.pileup_iter, - &self.tid, - &self.pos, - &self.n_plp) - - cdef char * getSequence(self): - '''return current reference sequence underlying the iterator. - ''' - return self.iterdata.seq - - property seq_len: - '''current sequence length.''' - def __get__(self): - return self.iterdata.seq_len - - def addReference(self, Fastafile fastafile): - ''' - add reference sequences in `fastafile` to iterator.''' - self.fastafile = fastafile - if self.iterdata.seq != NULL: - free(self.iterdata.seq) - self.iterdata.tid = -1 - self.iterdata.fastafile = self.fastafile.fastafile - - def hasReference(self): - ''' - return true if iterator is associated with a reference''' - return self.fastafile - - cdef setMask(self, mask): - '''set masking flag in iterator. - - reads with bits set in `mask` will be skipped. 
- ''' - raise NotImplementedError() - # self.mask = mask - # bam_plp_set_mask( self.pileup_iter, self.mask ) - - cdef setupIteratorData( self, - int tid, - int start, - int end, - int multiple_iterators=0 ): - '''setup the iterator structure''' - - self.iter = IteratorRowRegion(self.samfile, tid, start, end, multiple_iterators) - self.iterdata.htsfile = self.samfile.htsfile - self.iterdata.iter = self.iter.iter - self.iterdata.seq = NULL - self.iterdata.tid = -1 - self.iterdata.header = self.samfile.header - - if self.fastafile is not None: - self.iterdata.fastafile = self.fastafile.fastafile - else: - self.iterdata.fastafile = NULL - - # Free any previously allocated memory before reassigning - # pileup_iter - self._free_pileup_iter() - - if self.stepper is None or self.stepper == "all": - with nogil: - self.pileup_iter = bam_plp_init( - &__advance_all, - &self.iterdata) - elif self.stepper == "nofilter": - with nogil: - self.pileup_iter = bam_plp_init( - &__advance_nofilter, - &self.iterdata) - elif self.stepper == "samtools": - with nogil: - self.pileup_iter = bam_plp_init( - &__advance_snpcalls, - &self.iterdata) - else: - raise ValueError( - "unknown stepper option `%s` in IteratorColumn" % self.stepper) - - if self.max_depth: - with nogil: - bam_plp_set_maxcnt(self.pileup_iter, self.max_depth) - - # bam_plp_set_mask( self.pileup_iter, self.mask ) - - cdef reset( self, tid, start, end ): - '''reset iterator position. - - This permits using the iterator multiple times without - having to incur the full set-up costs. - ''' - self.iter = IteratorRowRegion( self.samfile, tid, start, end, multiple_iterators = 0 ) - self.iterdata.iter = self.iter.iter - - # invalidate sequence if different tid - if self.tid != tid: - if self.iterdata.seq != NULL: - free(self.iterdata.seq) - self.iterdata.seq = NULL - self.iterdata.tid = -1 - - # self.pileup_iter = bam_plp_init( &__advancepileup, &self.iterdata ) - with nogil: - bam_plp_reset(self.pileup_iter) - - cdef _free_pileup_iter(self): - '''free the memory alloc'd by bam_plp_init. - - This is needed before setupIteratorData allocates - another pileup_iter, or else memory will be lost. - ''' - if self.pileup_iter != NULL: - with nogil: - bam_plp_reset(self.pileup_iter) - bam_plp_destroy(self.pileup_iter) - self.pileup_iter = NULL - - def __dealloc__(self): - # reset in order to avoid memory leak messages for iterators - # that have not been fully consumed - self._free_pileup_iter() - self.plp = NULL - - if self.iterdata.seq != NULL: - free(self.iterdata.seq) - self.iterdata.seq = NULL - - -cdef class IteratorColumnRegion(IteratorColumn): - '''iterates over a region only. 
- ''' - def __cinit__(self, AlignmentFile samfile, - int tid = 0, - int start = 0, - int end = MAX_POS, - int truncate = False, - **kwargs ): - - # initialize iterator - self.setupIteratorData(tid, start, end, 1) - self.start = start - self.end = end - self.truncate = truncate - - def __next__(self): - - while 1: - self.cnext() - if self.n_plp < 0: - raise ValueError("error during iteration" ) - - if self.plp == NULL: - raise StopIteration - - if self.truncate: - if self.start > self.pos: continue - if self.pos >= self.end: raise StopIteration - - return makePileupColumn(&self.plp, - self.tid, - self.pos, - self.n_plp, - self.samfile) - - -cdef class IteratorColumnAllRefs(IteratorColumn): - """iterates over all columns by chaining iterators over each reference - """ - - def __cinit__(self, - AlignmentFile samfile, - **kwargs): - - # no iteration over empty files - if not samfile.nreferences: - raise StopIteration - - # initialize iterator - self.setupIteratorData(self.tid, 0, MAX_POS, 1) - - def __next__(self): - - while 1: - self.cnext() - - if self.n_plp < 0: - raise ValueError("error during iteration" ) - - # return result, if within same reference - if self.plp != NULL: - return makePileupColumn(&self.plp, - self.tid, - self.pos, - self.n_plp, - self.samfile) - - # otherwise, proceed to next reference or stop - self.tid += 1 - if self.tid < self.samfile.nreferences: - self.setupIteratorData(self.tid, 0, MAX_POS, 0) - else: - raise StopIteration - - -cdef class SNPCall: - '''the results of a SNP call.''' - cdef int _tid - cdef int _pos - cdef char _reference_base - cdef char _genotype - cdef int _consensus_quality - cdef int _snp_quality - cdef int _rms_mapping_quality - cdef int _coverage - - property tid: - '''the chromosome ID as is defined in the header''' - def __get__(self): - return self._tid - - property pos: - '''nucleotide position of SNP.''' - def __get__(self): return self._pos - - property reference_base: - '''reference base at pos. ``N`` if no reference sequence supplied.''' - def __get__(self): return from_string_and_size( &self._reference_base, 1 ) - - property genotype: - '''the genotype called.''' - def __get__(self): return from_string_and_size( &self._genotype, 1 ) - - property consensus_quality: - '''the genotype quality (Phred-scaled).''' - def __get__(self): return self._consensus_quality - - property snp_quality: - '''the snp quality (Phred scaled) - probability of consensus being - identical to reference sequence.''' - def __get__(self): return self._snp_quality - - property mapping_quality: - '''the root mean square (rms) of the mapping quality of all reads - involved in the call.''' - def __get__(self): return self._rms_mapping_quality - - property coverage: - '''coverage or read depth - the number of reads involved in the call.''' - def __get__(self): return self._coverage - - def __str__(self): - - return "\t".join( map(str, ( - self.tid, - self.pos, - self.reference_base, - self.genotype, - self.consensus_quality, - self.snp_quality, - self.mapping_quality, - self.coverage ) ) ) - - -cdef class IndexedReads: - """*(AlignmentFile samfile, multiple_iterators=True) - - Index a Sam/BAM-file by query name while keeping the - original sort order intact. - - The index is kept in memory and can be substantial. - - By default, the file is re-openend to avoid conflicts if multiple - operators work on the same file. Set `multiple_iterators` = False - to not re-open `samfile`. - - Parameters - ---------- - - samfile : AlignmentFile - File to be indexed. 
- - multiple_iterators : bool - Flag indicating whether the file should be reopened. Reopening prevents - existing iterators being affected by the indexing. - - """ - - def __init__(self, AlignmentFile samfile, int multiple_iterators=True): - cdef char *cfilename - - # makes sure that samfile stays alive as long as this - # object is alive. - self.samfile = samfile - - assert samfile.is_bam, "can only IndexReads on bam files" - - # multiple_iterators the file - note that this makes the iterator - # slow and causes pileup to slow down significantly. - if multiple_iterators: - cfilename = samfile._filename - with nogil: - self.htsfile = hts_open(cfilename, 'r') - assert self.htsfile != NULL - # read header - required for accurate positioning - with nogil: - self.header = sam_hdr_read(self.htsfile) - self.owns_samfile = True - else: - self.htsfile = self.samfile.htsfile - self.header = self.samfile.header - self.owns_samfile = False - - def build(self): - '''build the index.''' - - self.index = collections.defaultdict(list) - - # this method will start indexing from the current file - # position if you decide - cdef int ret = 1 - cdef bam1_t * b = calloc(1, sizeof( bam1_t)) - - cdef uint64_t pos - - while ret > 0: - with nogil: - pos = bgzf_tell(hts_get_bgzfp(self.htsfile)) - ret = sam_read1(self.htsfile, - self.samfile.header, - b) - if ret > 0: - qname = charptr_to_str(pysam_bam_get_qname(b)) - self.index[qname].append(pos) - - bam_destroy1(b) - - def find(self, query_name): - '''find `query_name` in index. - - Returns - ------- - - IteratorRowSelection - Returns an iterator over all reads with query_name. - - Raises - ------ - - KeyError - if the `query_name` is not in the index. - - ''' - if query_name in self.index: - return IteratorRowSelection( - self.samfile, - self.index[query_name], - multiple_iterators = False) - else: - raise KeyError("read %s not found" % query_name) - - def __dealloc__(self): - if self.owns_samfile: - hts_close(self.htsfile) - bam_hdr_destroy(self.header) - -__all__ = [ - "AlignmentFile", - "IteratorRow", - "IteratorColumn", - "IndexedReads"] diff --git a/pysam/cbcf.pxd b/pysam/cbcf.pxd deleted file mode 100644 index b6e210a..0000000 --- a/pysam/cbcf.pxd +++ /dev/null @@ -1,159 +0,0 @@ -############################################################################### -############################################################################### -## Cython wrapper for htslib VCF/BCF reader/writer -############################################################################### -# -# NOTICE: This code is incomplete and preliminary. It is nearly complete as -# an immutable interface, but has no capability (yet) to mutate the -# resulting data (beyond dropping all samples). Documentation still -# needs to be written and a unit test suite is in the works. The -# code is also specific to Python 2 and will require a bit of work -# to properly adapt to Python 3. 
-# -############################################################################### -# -# The MIT License -# -# Copyright (c) 2015 Kevin Jacobs (jacobs@bioinformed.com) -# -# Permission is hereby granted, free of charge, to any person obtaining a -# copy of this software and associated documentation files (the "Software"), -# to deal in the Software without restriction, including without limitation -# the rights to use, copy, modify, merge, publish, distribute, sublicense, -# and/or sell copies of the Software, and to permit persons to whom the -# Software is furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -# DEALINGS IN THE SOFTWARE. -# -############################################################################### - -from libc.stdint cimport int8_t, int16_t, int32_t, int64_t -from libc.stdint cimport uint8_t, uint16_t, uint32_t, uint64_t -from libc.stdlib cimport malloc, calloc, realloc, free -from libc.string cimport memcpy, memcmp, strncpy, strlen, strdup - -from pysam.chtslib cimport * - - -cdef class VariantHeader(object): - cdef bcf_hdr_t *ptr - - cdef _subset_samples(self, include_samples) - - -cdef class VariantHeaderRecord(object): - cdef VariantHeader header - cdef bcf_hrec_t *ptr - - -cdef class VariantHeaderRecords(object): - cdef VariantHeader header - - -cdef class VariantHeaderContigs(object): - cdef VariantHeader header - - -cdef class VariantHeaderSamples(object): - cdef VariantHeader header - - -cdef class VariantContig(object): - cdef VariantHeader header - cdef int id - - -cdef class VariantMetadata(object): - cdef VariantHeader header - cdef int type - cdef int id - - -cdef class VariantHeaderMetadata(object): - cdef VariantHeader header - cdef int32_t type - - -cdef class VariantRecord(object): - cdef VariantHeader header - cdef bcf1_t *ptr - - -cdef class VariantRecordFilter(object): - cdef VariantRecord record - - -cdef class VariantRecordFormat(object): - cdef VariantRecord record - - -cdef class VariantRecordInfo(object): - cdef VariantRecord record - - -cdef class VariantRecordSamples(object): - cdef VariantRecord record - - -cdef class VariantRecordSample(object): - cdef VariantRecord record - cdef readonly int32_t index - - -cdef class BaseIndex(object): - cdef tuple refs - cdef dict refmap - - -cdef class BCFIndex(BaseIndex): - cdef VariantHeader header - cdef hts_idx_t *ptr - - -cdef class TabixIndex(BaseIndex): - cdef tbx_t *ptr - - -cdef class BaseIterator(object): - cdef VariantFile bcf - cdef hts_itr_t *iter - - -cdef class BCFIterator(BaseIterator): - cdef BCFIndex index - - -cdef class TabixIterator(BaseIterator): - cdef TabixIndex index - cdef kstring_t line_buffer - - -cdef class VariantFile(object): - cdef htsFile *htsfile # pointer to htsFile structure - cdef int64_t start_offset # BGZF offset of first record - - cdef readonly object filename # filename as supplied by user - cdef readonly object mode # file opening mode - cdef readonly object index_filename 
# filename of index, if supplied by user - - cdef readonly VariantHeader header - cdef readonly BaseIndex index - - cdef readonly bint drop_samples # true if sample information is to be ignored - - # FIXME: Temporary, use htsFormat when it is available - cdef readonly bint is_bcf # true if file is a bcf file - cdef readonly bint is_stream # true if not a seekable file but a stream - cdef readonly bint is_remote # true if file is not on the local filesystem - cdef readonly bint is_reading # true if file has begun reading records - - cpdef int write(self, VariantRecord record) except -1 diff --git a/pysam/cbcf.pyx b/pysam/cbcf.pyx deleted file mode 100644 index 41fd44f..0000000 --- a/pysam/cbcf.pyx +++ /dev/null @@ -1,3652 +0,0 @@ -# cython: embedsignature=True -# cython: profile=True -############################################################################### -############################################################################### -## Cython wrapper for htslib VCF/BCF reader/writer -############################################################################### -# -# NOTICE: This code is incomplete and preliminary. It offers a nearly -# complete Pythonic interface to VCF/BCF metadata and data with -# reading and writing capability. It has limited capability to to -# mutate the resulting data. Documentation and a unit test suite -# are in the works. The code is best tested under Python 2, but -# should also work with Python 3. Please report any remaining -# str/bytes issues on the github site when using Python 3 and I'll -# fix them promptly. -# -# Here is a minimal example of how to use the API: -# -# $ cat bcfview.py -# import sys -# from pysam import VariantFile -# -# bcf_in = VariantFile(sys.argv[1]) # auto-detect input format -# bcf_out = VariantFile('-', 'w', header=bcf_in.header) -# -# for rec in bcf_in: -# bcf_out.write(rec) -# -# Performance is fairly close to that of bcftools view. Here is an example -# using some 1k Genomes data: -# -# $ time python bcfview.py ALL.chr22.phase3_shapeit2_mvncall_integrated_v5.20130502.genotypes.bcf |wc -l -# 1103799 -# -# real 0m56.114s -# user 1m4.489s -# sys 0m3.102s -# -# $ time bcftools view ALL.chr22.phase3_shapeit2_mvncall_integrated_v5.20130502.genotypes.bcf |wc -l -# 1103800 # bcftools adds an extra header -# -# real 0m55.126s -# user 1m3.502s -# sys 0m3.459s -# -# Here is a quick tour through the API:: -# -# VariantFile(filename, mode=None, header=None, drop_samples=False) -# -# Attributes / Properties -# -# htsfile: htsFile* [private] -# start_offset: BGZF offset of first record [private] -# filename: filename [read only] -# mode: mode [read only] -# header: VariantHeader object [read only] -# index: TabixIndex, BCFIndex or None [read only] -# drop_samples: sample information is to be ignored [read only] -# -# is_stream: file is stdin/stdout [read only] -# is_remote: file is not on the local filesystem [read only] -# is_reading: file has begun reading records [read only] -# category: file format general category [read only] -# format: file format [read only] -# version: tuple of (major, minor) format version [read only] -# compression: file compression -# description: vaguely human readable description of [read only] -# file format. 
-# -# Methods: -# copy() -# close() -# open(filename, mode=None, header=None, drop_samples=False) -# reset() -# seek(offset) -# tell() -# fetch(contig=None, start=None, stop=None, region=None, reopen=False) -# subset_samples(include_samples) -# -# VariantHeader() -# -# version: VCF version -# samples: sequence-like access to samples -# records: sequence-like access to partially parsed headers -# contigs: mapping-like object for contig name -> VariantContig -# -# filters: mapping-like object for filter name -> VariantMetadata -# info: mapping-like object for info name -> VariantMetadata -# formats: mapping-like object for formats name -> VariantMetadata -# -# VariantRecord(...) -# -# header: VariantHeader object -# rid: reference id (i.e. tid) -# chrom: chromosome/contig string -# contig: synonym for chrom -# pos: 1-based start position (inclusive) -# start: 0-based start position (inclusive) -# stop: 0-based stop position (exclusive) -# rlen: reference length (stop - start) -# id: record identifier -# ref: reference allele -# alleles: alleles (ref followed by alts) -# alts: alt alleles -# qual: quality (float) -# filter: mapping-like object for filter name -> type info -# info: mapping-like object for info name -> value -# format: mapping-like object for format name -> type info -# samples: mapping-like object of sample genotypes & attrs -# -# VariantRecordSample(...) -# -# name: sample name -# index: sample index -# allele_indices: tuple of allele indices (ref=0, alt=1..len(alts), missing=-1) -# alleles: tuple of alleles (missing=None) -# -# VariantRecordSample is also a mapping object from formats to values -# -# VariantContig(...) -# -# id: reference id (i.e. tid) -# name: chromosome/contig string -# length: contig length if provided, else None -# header: defining VariantHeaderRecord -# -# VariantMetadata(...) # for FILTER, INFO and FORMAT metadata -# -# id: internal id -# name: metadata name -# type: value data type -# number: number of values -# header: defining VariantHeaderRecord -# -# VariantHeaderRecord(...) # replace with single tuple of key/value pairs? -# -# type: record type -# key: first record key -# value: first record value -# attrs: remaining key/value pairs -# -############################################################################### -# -# TODO list for next major sprint: -# -# * more genotype methods -# * unit test suite (perhaps py.test based) -# * documentation -# * htslib 1.2 format info -# -# For later sprints: -# -# * ability to create indices -# * mutable header and record data -# * pickle support -# * Python 3 support -# * left/right locus normalization -# * parallel iteration (like synced_bcf_reader) -# * fix reopen to re-use fd -# -############################################################################### -# -# The MIT License -# -# Copyright (c) 2015 Kevin Jacobs (jacobs@bioinformed.com) -# -# Permission is hereby granted, free of charge, to any person obtaining a -# copy of this software and associated documentation files (the "Software"), -# to deal in the Software without restriction, including without limitation -# the rights to use, copy, modify, merge, publish, distribute, sublicense, -# and/or sell copies of the Software, and to permit persons to whom the -# Software is furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. 
-# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -# DEALINGS IN THE SOFTWARE. -# -############################################################################### - -from __future__ import division, print_function - -import os -import sys - -from libc.string cimport strcmp, strpbrk -from libc.stdint cimport INT8_MAX, INT16_MAX, INT32_MAX - -cimport cython - -from cpython.object cimport PyObject -from cpython.ref cimport Py_INCREF -from cpython.dict cimport PyDict_GetItemString, PyDict_SetItemString -from cpython.tuple cimport PyTuple_New, PyTuple_SET_ITEM -from cpython.bytes cimport PyBytes_FromStringAndSize -from cpython.unicode cimport PyUnicode_DecodeASCII -from cpython.version cimport PY_MAJOR_VERSION - -from pysam.chtslib cimport hisremote - - -from warnings import warn - - -__all__ = ['VariantFile', - 'VariantHeader', - 'VariantHeaderRecord', - 'VariantRecord'] - -######################################################################## -######################################################################## -## Constants -######################################################################## - -cdef int MAX_POS = 2 << 29 -cdef tuple VALUE_TYPES = ('Flag', 'Integer', 'Float', 'String') -cdef tuple METADATA_TYPES = ('FILTER', 'INFO', 'FORMAT', 'CONTIG', 'STRUCTURED', 'GENERIC') -cdef tuple METADATA_LENGTHS = ('FIXED', 'VARIABLE', 'A', 'G', 'R') - -cdef tuple FORMAT_CATEGORIES = ('UNKNOWN', 'ALIGNMENTS', 'VARIANTS', 'INDEX', 'REGIONS') -cdef tuple FORMATS = ('UNKNOWN', 'BINARY_FORMAT', 'TEXT_FORMAT', 'SAM', 'BAM', 'BAI', 'CRAM', 'CRAI', - 'VCF', 'BCF', 'CSI', 'GZI', 'TBI', 'BED') -cdef tuple COMPRESSION = ('NONE', 'GZIP', 'BGZF', 'CUSTOM') - -######################################################################## -######################################################################## -## Python 3 compatibility functions -######################################################################## - -from pysam.cutils cimport force_bytes, force_str, charptr_to_str, charptr_to_str_w_len -from pysam.cutils cimport encode_filename, from_string_and_size - - -######################################################################## -######################################################################## -## VCF/BCF string intern system -######################################################################## - -cdef dict bcf_str_cache = {} - -cdef inline bcf_str_cache_get_charptr(const char* s): - if s == NULL: - return None - - cdef PyObject *pystr = PyDict_GetItemString(bcf_str_cache, s) - if pystr: - return pystr - - if PY_MAJOR_VERSION < 3: - val = s - else: - val = PyUnicode_DecodeASCII(s, strlen(s), NULL) - - PyDict_SetItemString(bcf_str_cache, s, val) - - return val - - -######################################################################## -######################################################################## -## Low level type conversion helpers -######################################################################## - - -cdef inline int is_gt_fmt(bcf_hdr_t *hdr, int fmt_id): - return strcmp(bcf_hdr_int2id(hdr, BCF_DT_ID, fmt_id), "GT") == 0 - - -cdef tuple 
char_array_to_tuple(const char **a, ssize_t n, int free_after=0): - if not a: - return None - try: - return tuple(charptr_to_str(a[i]) for i in range(n)) - finally: - if free_after and a: - free(a) - - -cdef bcf_array_to_object(void *data, int type, ssize_t n, ssize_t count, int scalar): - cdef char *datac - cdef int8_t *data8 - cdef int16_t *data16 - cdef int32_t *data32 - cdef float *dataf - cdef int i - - if not data or n <= 0: - return None - - if type == BCF_BT_CHAR: - datac = data - while n and datac[n-1] == bcf_str_vector_end: - n -= 1 - value = charptr_to_str_w_len(datac, n) if datac[0] != bcf_str_missing else None - # FIXME: Need to know length? Report errors? Pad with missing values? Not clear what to do. - - value = tuple(v or None for v in value.split(',')) if value else () - # FIXME: Need to know length? Report errors? Pad with missing values? Not clear what to do. - else: - value = [] - if type == BCF_BT_INT8: - data8 = data - for i in range(n): - if data8[i] == bcf_int8_vector_end: - break - value.append(data8[i] if data8[i] != bcf_int8_missing else None) - elif type == BCF_BT_INT16: - data16 = data - for i in range(n): - if data16[i] == bcf_int16_vector_end: - break - value.append(data16[i] if data16[i] != bcf_int16_missing else None) - elif type == BCF_BT_INT32: - data32 = data - for i in range(n): - if data32[i] == bcf_int32_vector_end: - break - value.append(data32[i] if data32[i] != bcf_int32_missing else None) - elif type == BCF_BT_FLOAT: - dataf = data - for i in range(n): - if bcf_float_is_vector_end(dataf[i]): - break - value.append(dataf[i] if not bcf_float_is_missing(dataf[i]) else None) - else: - raise TypeError('unsupported info type code') - - # FIXME: Need to know length? Report errors? Pad with missing values? Not clear what to do. 
- if not value: - if scalar: - value = None - elif count <= 0: - value = () - else: - value = (None,)*count - elif scalar and len(value) == 1: - value = value[0] - else: - value = tuple(value) - - return value - - -cdef bcf_object_to_array(values, void *data, int bt_type, ssize_t n, int vlen): - cdef char *datac - cdef int8_t *data8 - cdef int16_t *data16 - cdef int32_t *data32 - cdef float *dataf - cdef ssize_t i, value_count = len(values) - - assert(value_count <= n) - - if bt_type == BCF_BT_CHAR: - if not isinstance(values, (str, bytes)): - values = b','.join(force_bytes(v) if v is not None else b'' for v in values) - value_count = len(values) - assert(value_count <= n) - datac = data - memcpy(datac, values, value_count) - for i in range(value_count, n): - datac[i] = 0 - elif bt_type == BCF_BT_INT8: - datai8 = data - for i in range(value_count): - val = values[i] - datai8[i] = val if val is not None else bcf_int8_missing - for i in range(value_count, n): - datai8[i] = bcf_int8_vector_end - elif bt_type == BCF_BT_INT16: - datai16 = data - for i in range(value_count): - val = values[i] - datai16[i] = val if val is not None else bcf_int16_missing - for i in range(value_count, n): - datai16[i] = bcf_int16_vector_end - elif bt_type == BCF_BT_INT32: - datai32 = data - for i in range(value_count): - val = values[i] - datai32[i] = val if val is not None else bcf_int32_missing - for i in range(value_count, n): - datai32[i] = bcf_int32_vector_end - elif bt_type == BCF_BT_FLOAT: - dataf = data - for i in range(value_count): - val = values[i] - if val is None: - bcf_float_set(dataf + i, bcf_float_missing) - else: - dataf[i] = val - for i in range(value_count, n): - bcf_float_set(dataf + i, bcf_float_vector_end) - else: - raise TypeError('unsupported type') - - -cdef bcf_empty_array(int type, ssize_t n, int vlen): - cdef char *datac - cdef int32_t *data32 - cdef float *dataf - cdef int i - - if n <= 0: - raise ValueError('Cannot create empty array') - - if type == BCF_HT_STR: - value = PyBytes_FromStringAndSize(NULL, sizeof(char)*n) - datac = value - for i in range(n): - datac[i] = bcf_str_missing if not vlen else bcf_str_vector_end - elif type == BCF_HT_INT: - value = PyBytes_FromStringAndSize(NULL, sizeof(int32_t)*n) - data32 = value - for i in range(n): - data32[i] = bcf_int32_missing if not vlen else bcf_int32_vector_end - elif type == BCF_HT_REAL: - value = PyBytes_FromStringAndSize(NULL, sizeof(float)*n) - dataf = value - for i in range(n): - bcf_float_set(dataf + i, bcf_float_missing if not vlen else bcf_float_vector_end) - else: - raise TypeError('unsupported header type code') - - return value - - -cdef bcf_copy_expand_array(void *src_data, int src_type, ssize_t src_values, - void *dst_data, int dst_type, ssize_t dst_values, - int vlen): - cdef char *src_datac - cdef char *dst_datac - cdef int8_t *src_datai8 - cdef int16_t *src_datai16 - cdef int32_t *src_datai32 - cdef int32_t *dst_datai - cdef float *src_dataf - cdef float *dst_dataf - cdef ssize_t src_size, dst_size, i, j - cdef int val - - if src_values > dst_values: - raise ValueError('Cannot copy arrays with src_values={} > dst_values={}'.format(src_values, dst_values)) - - if src_type == dst_type == BCF_BT_CHAR: - src_datac = src_data - dst_datac = dst_data - memcpy(src_datac, dst_datac, src_values) - for i in range(src_values, dst_values): - dst_datac[i] = 0 - elif src_type == BCF_BT_INT8 and dst_type == BCF_BT_INT32: - src_datai8 = src_data - dst_datai = dst_data - for i in range(src_values): - val = src_datai8[i] - if val == 
bcf_int8_missing: - val = bcf_int32_missing - elif val == bcf_int8_vector_end: - val = bcf_int32_vector_end - dst_datai[i] = val - for i in range(src_values, dst_values): - dst_datai[i] = bcf_int32_missing if not vlen else bcf_int32_vector_end - elif src_type == BCF_BT_INT16 and dst_type == BCF_BT_INT32: - src_datai16 = src_data - dst_datai = dst_data - for i in range(src_values): - val = src_datai16[i] - if val == bcf_int16_missing: - val = bcf_int32_missing - elif val == bcf_int16_vector_end: - val = bcf_int32_vector_end - dst_datai[i] = val - for i in range(src_values, dst_values): - dst_datai[i] = bcf_int32_missing if not vlen else bcf_int32_vector_end - elif src_type == BCF_BT_INT32 and dst_type == BCF_BT_INT32: - src_datai32 = src_data - dst_datai = dst_data - for i in range(src_values): - dst_datai[i] = src_datai32[i] - for i in range(src_values, dst_values): - dst_datai[i] = bcf_int32_missing if not vlen else bcf_int32_vector_end - elif src_type == BCF_BT_FLOAT and dst_type == BCF_BT_FLOAT: - src_dataf = src_data - dst_dataf = dst_data - for i in range(src_values): - dst_dataf[i] = src_dataf[i] - for i in range(src_values, dst_values): - bcf_float_set(dst_dataf + i, bcf_float_missing if not vlen else bcf_float_vector_end) - else: - raise TypeError('unsupported types') - - -cdef bcf_get_value_count(VariantRecord record, int hl_type, int id, ssize_t *count, int *scalar): - cdef bcf_hdr_t *hdr = record.header.ptr - cdef bcf1_t *r = record.ptr - cdef int length = bcf_hdr_id2length(hdr, hl_type, id) - cdef int number = bcf_hdr_id2number(hdr, hl_type, id) - - scalar[0] = 0 - - if hl_type == BCF_HL_FMT and is_gt_fmt(hdr, id): - count[0] = number - elif length == BCF_VL_FIXED: - if number == 1: - scalar[0] = 1 - count[0] = number - elif length == BCF_VL_R: - count[0] = r.n_allele - elif length == BCF_VL_A: - count[0] = r.n_allele - 1 - elif length == BCF_VL_G: - count[0] = r.n_allele * (r.n_allele + 1) // 2 - elif length == BCF_VL_VAR: - count[0] = -1 - else: - raise ValueError('Unknown format length') - - -cdef object bcf_info_get_value(VariantRecord record, const bcf_info_t *z): - cdef bcf_hdr_t *hdr = record.header.ptr - - cdef char *s - cdef ssize_t count - cdef int scalar - - bcf_get_value_count(record, BCF_HL_INFO, z.key, &count, &scalar) - - if z.len == 0: - if bcf_hdr_id2type(hdr, BCF_HL_INFO, z.key) == BCF_HT_FLAG: - value = True - elif scalar: - value = None - else: - value = () - elif z.len == 1: - if z.type == BCF_BT_INT8: - if z.v1.i == bcf_int8_missing: - value = None - elif z.v1.i == bcf_int8_vector_end: - value = () - else: - value = z.v1.i - elif z.type == BCF_BT_INT16: - if z.v1.i == bcf_int16_missing: - value = None - elif z.v1.i == bcf_int16_vector_end: - value = () - else: - value = z.v1.i - elif z.type == BCF_BT_INT32: - if z.v1.i == bcf_int32_missing: - value = None - elif z.v1.i == bcf_int32_vector_end: - value = () - else: - value = z.v1.i - elif z.type == BCF_BT_FLOAT: - if bcf_float_is_missing(z.v1.f): - value = None - elif bcf_float_is_vector_end(z.v1.f): - value = () - else: - value = z.v1.f - elif z.type == BCF_BT_CHAR: - value = force_str(chr(z.v1.i)) - else: - raise TypeError('unsupported info type code') - - if not scalar and value != (): - value = (value,) - else: - value = bcf_array_to_object(z.vptr, z.type, z.len, count, scalar) - - return value - - -cdef object bcf_check_values(VariantRecord record, value, int hl_type, int ht_type, - int id, int bt_type, ssize_t bt_len, ssize_t *value_count, - int *scalar, int *realloc): - - bcf_get_value_count(record, 
hl_type, id, value_count, scalar) - - # Validate values now that we know the type and size - values = (value,) if not isinstance(value, tuple) else value - - # Validate values now that we know the type and size - if ht_type == BCF_HT_FLAG: - value_count[0] = 1 - - if value_count[0] != -1 and value_count[0] != len(values): - if scalar[0]: - raise TypeError('value expected to be scalar'.format(value_count[0])) - else: - raise TypeError('values expected to be {:d}-tuple'.format(value_count[0])) - - if ht_type == BCF_HT_REAL: - for v in values: - if not(v is None or isinstance(v, (float, int))): - raise TypeError('invalid value for Float format') - elif ht_type == BCF_HT_INT: - for v in values: - if not(v is None or (isinstance(v, (float, int)) and int(v) == v)): - raise TypeError('invalid value for Integer format') - for v in values: - if not(v is None or bcf_int32_missing < v <= INT32_MAX): - raise ValueError('Integer value too small/large to store in VCF/BCF') - elif ht_type == BCF_HT_STR: - values = b','.join(force_bytes(v) if v is not None else b'' for v in values) - elif ht_type == BCF_HT_FLAG: - if values[0] not in (True, False, None, 1, 0): - raise ValueError('Flag values must be: True, False, None, 1, 0') - else: - raise TypeError('unsupported type') - - realloc[0] = 0 - if len(values) <= 1 and hl_type == BCF_HL_INFO: - realloc[0] = 0 - elif len(values) > bt_len: - realloc[0] = 1 - elif bt_type == BCF_BT_INT8: - for v in values: - if v is not None and not(bcf_int8_missing < v <= INT8_MAX): - realloc[0] = 1 - break - elif bt_type == BCF_BT_INT16: - for v in values: - if v is not None and not(bcf_int16_missing < v <= INT16_MAX): - realloc[0] = 1 - break - - return values - - -cdef bcf_encode_alleles(VariantRecord record, values): - cdef bcf1_t *r = record.ptr - cdef int32_t nalleles = r.n_allele - cdef list gt_values = [] - cdef char *s - cdef int i - - if not values: - return () - - if not isinstance(values, (list, tuple)): - values = (values,) - - for value in values: - if value is None: - gt_values.append(None) - elif isinstance(value, (str, bytes)): - bvalue = force_bytes(value) - s = bvalue - for i in range(r.n_allele): - if strcmp(r.d.allele[i], s) != 0: - gt_values.append(bcf_gt_unphased(i)) - break - else: - raise ValueError('Unknown allele') - else: - i = value - if not (0 <= i < nalleles): - raise ValueError('Invalid allele index') - gt_values.append(bcf_gt_unphased(i)) - - return gt_values - - -cdef bcf_info_set_value(VariantRecord record, key, value): - cdef bcf_hdr_t *hdr = record.header.ptr - cdef bcf1_t *r = record.ptr - cdef vdict_t *d - cdef khiter_t k - cdef int info_id, info_type, scalar, dst_type, realloc, vlen = 0 - cdef ssize_t i, value_count, alloc_len, alloc_size, dst_size - - if bcf_unpack(r, BCF_UN_INFO) < 0: - raise ValueError('Error unpacking VariantRecord') - - bkey = force_bytes(key) - cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey) - - if info: - info_id = info.key - else: - d = hdr.dict[BCF_DT_ID] - k = kh_get_vdict(d, bkey) - - if k == kh_end(d) or kh_val_vdict(d, k).info[BCF_HL_INFO] & 0xF == 0xF: - raise KeyError('unknown INFO') - - info_id = kh_val_vdict(d, k).id - - info_type = bcf_hdr_id2type(hdr, BCF_HL_INFO, info_id) - values = bcf_check_values(record, value, BCF_HL_INFO, info_type, info_id, - info.type if info else -1, info.len if info else -1, - &value_count, &scalar, &realloc) - - if info_type == BCF_HT_FLAG: - if bcf_update_info(hdr, r, bkey, NULL, bool(values[0]), info_type) < 0: - raise ValueError('Unable to update INFO values') - return 
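On the write side, bcf_check_values enforces the declared Number and Type before anything touches htslib storage. A sketch of how those checks surface, under the same hypothetical header as above (single-ALT record assumed):

import pysam

vf = pysam.VariantFile("example.vcf")
rec = next(vf)

rec.info["DP"] = 42        # scalar for Number=1
rec.info["AF"] = (0.5,)    # tuple for Number=A

try:
    rec.info["DP"] = "high"        # wrong type for an Integer field
except TypeError as exc:
    print("rejected:", exc)

try:
    rec.info["AF"] = (0.5, 0.5)    # wrong cardinality for one ALT
except TypeError as exc:
    print("rejected:", exc)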
- - vlen = value_count < 0 - value_count = len(values) - - # If we can, write updated values to existing allocated storage - if info and not realloc: - r.d.shared_dirty |= BCF1_DIRTY_INF - - if value_count == 0: - info.len = 0 - # FIXME: Check if need to free vptr if info.len > 0? - elif value_count == 1: - # FIXME: Check if need to free vptr if info.len > 0? - if info.type == BCF_BT_INT8 or info.type == BCF_BT_INT16 or info.type == BCF_BT_INT32: - bcf_object_to_array(values, &info.v1.i, BCF_BT_INT32, 1, vlen) - elif info.type == BCF_BT_FLOAT: - bcf_object_to_array(values, &info.v1.f, BCF_BT_FLOAT, 1, vlen) - else: - raise TypeError('unsupported info type code') - info.len = 1 - else: - bcf_object_to_array(values, info.vptr, info.type, info.len, vlen) - return - - alloc_len = max(1, value_count) - if info and info.len > alloc_len: - alloc_len = info.len - - new_values = bcf_empty_array(info_type, alloc_len, vlen) - cdef char *valp = new_values - - if info_type == BCF_HT_INT: - dst_type = BCF_BT_INT32 - elif info_type == BCF_HT_REAL: - dst_type = BCF_BT_FLOAT - elif info_type == BCF_HT_STR: - dst_type = BCF_BT_CHAR - else: - raise ValueError('Unsupported INFO type') - - bcf_object_to_array(values, valp, dst_type, alloc_len, vlen) - - if bcf_update_info(hdr, r, bkey, valp, alloc_len, info_type) < 0: - raise ValueError('Unable to update INFO values') - - -cdef bcf_info_del_value(VariantRecord record, key): - cdef bcf_hdr_t *hdr = record.header.ptr - cdef bcf1_t *r = record.ptr - cdef ssize_t value_count - cdef int scalar - - if bcf_unpack(r, BCF_UN_INFO) < 0: - raise ValueError('Error unpacking VariantRecord') - - bkey = force_bytes(key) - cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey) - - if not info: - raise KeyError(key) - - bcf_get_value_count(record, BCF_HL_INFO, info.key, &value_count, &scalar) - - if value_count <= 0: - null_value = () - elif scalar: - null_value = None - else: - null_value = (None,)*value_count - - bcf_info_set_value(record, bkey, null_value) - - -cdef bcf_format_get_value(VariantRecordSample sample, key): - cdef bcf_hdr_t *hdr = sample.record.header.ptr - cdef bcf1_t *r = sample.record.ptr - cdef ssize_t count - cdef int scalar - - if bcf_unpack(r, BCF_UN_ALL) < 0: - raise ValueError('Error unpacking VariantRecord') - - bkey = force_bytes(key) - cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey) - - if not fmt or not fmt.p: - raise KeyError('invalid FORMAT') - - if is_gt_fmt(hdr, fmt.id): - return bcf_format_get_allele_indices(sample) - - bcf_get_value_count(sample.record, BCF_HL_FMT, fmt.id, &count, &scalar) - - if fmt.p and fmt.n and fmt.size: - return bcf_array_to_object(fmt.p + sample.index * fmt.size, fmt.type, fmt.n, count, scalar) - elif scalar: - return None - elif count <= 0: - return () - else: - return (None,)*count - - -cdef bcf_format_set_value(VariantRecordSample sample, key, value): - cdef bcf_hdr_t *hdr = sample.record.header.ptr - cdef bcf1_t *r = sample.record.ptr - cdef int fmt_id - cdef vdict_t *d - cdef khiter_t k - cdef int fmt_type, scalar, realloc, dst_type, vlen = 0 - cdef ssize_t i, n, value_count, alloc_size, alloc_len, dst_size - - if bcf_unpack(r, BCF_UN_ALL) < 0: - raise ValueError('Error unpacking VariantRecord') - - bkey = force_bytes(key) - cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey) - - if fmt: - fmt_id = fmt.id - else: - d = hdr.dict[BCF_DT_ID] - k = kh_get_vdict(d, bkey) - - if k == kh_end(d) or kh_val_vdict(d, k).info[BCF_HL_FMT] & 0xF == 0xF: - raise KeyError('unknown format') - - fmt_id = kh_val_vdict(d, k).id - - fmt_type 
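bcf_format_get_value and bcf_format_set_value back the per-sample mapping interface. A sketch, assuming a hypothetical sample name "NA12878" and a DP FORMAT field (Number=1, Integer):

import pysam

vf = pysam.VariantFile("example.vcf")
rec = next(vf)

sample = rec.samples["NA12878"]
print(sample["DP"])    # bcf_format_get_value
sample["DP"] = 30      # rewritten in place when the new value fits
del sample["DP"]       # stored as a missing value, not removed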
= bcf_hdr_id2type(hdr, BCF_HL_FMT, fmt_id) - - if fmt_type == BCF_HT_FLAG: - raise ValueError('Flag types are not allowed on FORMATs') - - if is_gt_fmt(hdr, fmt_id): - value = bcf_encode_alleles(sample.record, value) - - values = bcf_check_values(sample.record, value, BCF_HL_FMT, fmt_type, fmt_id, - fmt.type if fmt else -1, fmt.n if fmt else -1, - &value_count, &scalar, &realloc) - - vlen = value_count < 0 - value_count = len(values) - - # If we can, write updated values to existing allocated storage - if fmt and not realloc: - r.d.indiv_dirty = 1 - bcf_object_to_array(values, fmt.p + sample.index * fmt.size, fmt.type, fmt.n, vlen) - return - - alloc_len = max(1, value_count) - if fmt and fmt.n > alloc_len: - alloc_len = fmt.n - - n = bcf_hdr_nsamples(hdr) - new_values = bcf_empty_array(fmt_type, n*alloc_len, vlen) - cdef char *valp = new_values - - if fmt_type == BCF_HT_INT: - dst_type = BCF_BT_INT32 - dst_size = sizeof(int32_t) * alloc_len - elif fmt_type == BCF_HT_REAL: - dst_type = BCF_BT_FLOAT - dst_size = sizeof(float) * alloc_len - elif fmt_type == BCF_HT_STR: - dst_type = BCF_BT_CHAR - dst_size = sizeof(char) * alloc_len - else: - raise ValueError('Unsupported FORMAT type') - - if fmt and n > 1: - for i in range(n): - bcf_copy_expand_array(fmt.p + i*fmt.size, fmt.type, fmt.n, - valp + i*dst_size, dst_type, alloc_len, - vlen) - - bcf_object_to_array(values, valp + sample.index*dst_size, dst_type, alloc_len, vlen) - - if bcf_update_format(hdr, r, bkey, valp, (n*alloc_len), fmt_type) < 0: - raise ValueError('Unable to update format values') - - -cdef bcf_format_del_value(VariantRecordSample sample, key): - cdef bcf_hdr_t *hdr = sample.record.header.ptr - cdef bcf1_t *r = sample.record.ptr - cdef ssize_t value_count - cdef int scalar - - if bcf_unpack(r, BCF_UN_ALL) < 0: - raise ValueError('Error unpacking VariantRecord') - - bkey = force_bytes(key) - cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey) - - if not fmt or not fmt.p: - raise KeyError(key) - - bcf_get_value_count(sample.record, BCF_HL_FMT, fmt.id, &value_count, &scalar) - - if value_count <= 0: - null_value = () - elif scalar: - null_value = None - else: - null_value = (None,)*value_count - - bcf_format_set_value(sample, bkey, null_value) - - -cdef bcf_format_get_allele_indices(VariantRecordSample sample): - cdef bcf_hdr_t *hdr = sample.record.header.ptr - cdef bcf1_t *r = sample.record.ptr - cdef int32_t n = bcf_hdr_nsamples(hdr) - - if bcf_unpack(r, BCF_UN_ALL) < 0: - raise ValueError('Error unpacking VariantRecord') - - if sample.index < 0 or sample.index >= n or not r.n_fmt: - return () - - cdef bcf_fmt_t *fmt0 = r.d.fmt - cdef int gt0 = is_gt_fmt(hdr, fmt0.id) - - if not gt0 or not fmt0.n: - return () - - cdef int8_t *data8 - cdef int16_t *data16 - cdef int32_t *data32 - cdef int32_t a, nalleles = r.n_allele - cdef list alleles = [] - - if fmt0.type == BCF_BT_INT8: - data8 = (fmt0.p + sample.index * fmt0.size) - for i in range(fmt0.n): - if data8[i] == bcf_int8_vector_end: - break - elif data8[i] == bcf_int8_missing: - a = -1 - else: - a = bcf_gt_allele(data8[i]) - alleles.append(a if 0 <= a < nalleles else None) - elif fmt0.type == BCF_BT_INT16: - data16 = (fmt0.p + sample.index * fmt0.size) - for i in range(fmt0.n): - if data16[i] == bcf_int16_vector_end: - break - elif data16[i] == bcf_int16_missing: - a = -1 - else: - a = bcf_gt_allele(data16[i]) - alleles.append(a if 0 <= a < nalleles else None) - elif fmt0.type == BCF_BT_INT32: - data32 = (fmt0.p + sample.index * fmt0.size) - for i in range(fmt0.n): - if data32[i] 
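GT is special-cased: stored values are decoded with bcf_gt_allele rather than returned raw. A read-side sketch (sample index hypothetical):

import pysam

vf = pysam.VariantFile("example.vcf")
rec = next(vf)

sample = rec.samples[0]
print(sample.allele_indices)   # e.g. (0, 1); None marks a missing allele
print(sample.alleles)          # same indices resolved against rec.alleles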
== bcf_int32_vector_end: - break - elif data32[i] == bcf_int32_missing: - a = -1 - else: - a = bcf_gt_allele(data32[i]) - alleles.append(a if 0 <= a < nalleles else None) - - return tuple(alleles) - - -cdef bcf_format_get_alleles(VariantRecordSample sample): - cdef bcf_hdr_t *hdr = sample.record.header.ptr - cdef bcf1_t *r = sample.record.ptr - cdef int32_t nsamples = bcf_hdr_nsamples(hdr) - - if bcf_unpack(r, BCF_UN_ALL) < 0: - raise ValueError('Error unpacking VariantRecord') - - cdef int32_t nalleles = r.n_allele - - if sample.index < 0 or sample.index >= nsamples or not r.n_fmt: - return () - - cdef bcf_fmt_t *fmt0 = r.d.fmt - cdef int gt0 = is_gt_fmt(hdr, fmt0.id) - - if not gt0 or not fmt0.n: - return () - - cdef int32_t a - cdef int8_t *data8 - cdef int16_t *data16 - cdef int32_t *data32 - alleles = [] - if fmt0.type == BCF_BT_INT8: - data8 = (fmt0.p + sample.index * fmt0.size) - for i in range(fmt0.n): - if data8[i] == bcf_int8_vector_end: - break - a = bcf_gt_allele(data8[i]) - alleles.append(charptr_to_str(r.d.allele[a]) if 0 <= a < nalleles else None) - elif fmt0.type == BCF_BT_INT16: - data16 = (fmt0.p + sample.index * fmt0.size) - for i in range(fmt0.n): - if data16[i] == bcf_int16_vector_end: - break - a = bcf_gt_allele(data16[i]) - alleles.append(charptr_to_str(r.d.allele[a]) if 0 <= a < nalleles else None) - elif fmt0.type == BCF_BT_INT32: - data32 = (fmt0.p + sample.index * fmt0.size) - for i in range(fmt0.n): - if data32[i] == bcf_int32_vector_end: - break - a = bcf_gt_allele(data32[i]) - alleles.append(charptr_to_str(r.d.allele[a]) if 0 <= a < nalleles else None) - return tuple(alleles) - - -cdef bint bcf_sample_get_phased(VariantRecordSample sample): - cdef bcf_hdr_t *hdr = sample.record.header.ptr - cdef bcf1_t *r = sample.record.ptr - cdef int32_t n = bcf_hdr_nsamples(hdr) - - if bcf_unpack(r, BCF_UN_ALL) < 0: - raise ValueError('Error unpacking VariantRecord') - - if sample.index < 0 or sample.index >= n or not r.n_fmt: - return False - - cdef bcf_fmt_t *fmt0 = r.d.fmt - cdef int gt0 = is_gt_fmt(hdr, fmt0.id) - - if not gt0 or not fmt0.n: - return False - - cdef int8_t *data8 - cdef int16_t *data16 - cdef int32_t *data32 - - cdef bint phased = False - - if fmt0.type == BCF_BT_INT8: - data8 = (fmt0.p + sample.index * fmt0.size) - for i in range(fmt0.n): - if data8[i] == bcf_int8_vector_end: - break - elif data8[i] == bcf_int8_missing: - continue - elif i and not bcf_gt_is_phased(data8[i]): - return False - else: - phased = True - elif fmt0.type == BCF_BT_INT16: - data16 = (fmt0.p + sample.index * fmt0.size) - for i in range(fmt0.n): - if data16[i] == bcf_int16_vector_end: - break - elif data16[i] == bcf_int16_missing: - continue - elif i and not bcf_gt_is_phased(data16[i]): - return False - else: - phased = True - elif fmt0.type == BCF_BT_INT32: - data32 = (fmt0.p + sample.index * fmt0.size) - for i in range(fmt0.n): - if data32[i] == bcf_int32_vector_end: - break - elif data32[i] == bcf_int32_missing: - continue - elif i and not bcf_gt_is_phased(data32[i]): - return False - else: - phased = True - - return phased - - -cdef bcf_sample_set_phased(VariantRecordSample sample, bint phased): - cdef bcf_hdr_t *hdr = sample.record.header.ptr - cdef bcf1_t *r = sample.record.ptr - cdef int32_t n = bcf_hdr_nsamples(hdr) - - if bcf_unpack(r, BCF_UN_ALL) < 0: - raise ValueError('Error unpacking VariantRecord') - - if sample.index < 0 or sample.index >= n or not r.n_fmt: - return - - cdef bcf_fmt_t *fmt0 = r.d.fmt - cdef int gt0 = is_gt_fmt(hdr, fmt0.id) - - if not gt0 or not 
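Phase bits are inspected allele by allele; a sample reports phased only when every allele after the first carries the phase flag. A sketch:

import pysam

vf = pysam.VariantFile("example.vcf")
sample = next(vf).samples[0]

print(sample.phased)   # False when GT is missing or any allele is unphased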
fmt0.n: - raise ValueError('Cannot set phased before genotype is set') - - cdef int8_t *data8 - cdef int16_t *data16 - cdef int32_t *data32 - - if fmt0.type == BCF_BT_INT8: - data8 = (fmt0.p + sample.index * fmt0.size) - for i in range(fmt0.n): - if data8[i] == bcf_int8_vector_end: - break - elif data8[i] == bcf_int8_missing: - continue - elif i: - data8[i] = (data8[i] & 0xFE) | phased - elif fmt0.type == BCF_BT_INT16: - data16 = (fmt0.p + sample.index * fmt0.size) - for i in range(fmt0.n): - if data16[i] == bcf_int16_vector_end: - break - elif data16[i] == bcf_int16_missing: - continue - elif i: - data16[i] = (data16[i] & 0xFFFE) | phased - elif fmt0.type == BCF_BT_INT32: - data32 = (fmt0.p + sample.index * fmt0.size) - for i in range(fmt0.n): - if data32[i] == bcf_int32_vector_end: - break - elif data32[i] == bcf_int32_missing: - continue - elif i: - data32[i] = (data32[i] & 0xFFFFFFFE) | phased - - -######################################################################## -######################################################################## -## Variant Header objects -######################################################################## - -#FIXME: implement a full mapping interface -#FIXME: passing bcf_hrec_t* may not be the safest approach once mutating -# operations are allowed. -cdef class VariantHeaderRecord(object): - """header record from a :class:`VariantHeader` object""" - - property type: - """header type: FILTER, INFO, FORMAT, CONTIG, STRUCTURED, or GENERIC""" - def __get__(self): - cdef bcf_hrec_t *r = self.ptr - return METADATA_TYPES[r.type] - - property key: - """header key (the part before '=', in FILTER/INFO/FORMAT/contig/fileformat etc.)""" - def __get__(self): - cdef bcf_hrec_t *r = self.ptr - return bcf_str_cache_get_charptr(r.key) if r.key else None - - property value: - """header value. Set only for generic lines, None for FILTER/INFO, etc.""" - def __get__(self): - cdef bcf_hrec_t *r = self.ptr - return charptr_to_str(r.value) if r.value else None - - property attrs: - """sequence of additional header attributes""" - def __get__(self): - cdef bcf_hrec_t *r = self.ptr - cdef int i - return tuple((bcf_str_cache_get_charptr(r.keys[i]) if r.keys[i] else None, - charptr_to_str(r.vals[i]) if r.vals[i] else None) - for i in range(r.nkeys)) - - def __len__(self): - cdef bcf_hrec_t *r = self.ptr - return r.nkeys - - def __bool__(self): - cdef bcf_hrec_t *r = self.ptr - return r.nkeys != 0 - - def __getitem__(self, key): - """get attribute value""" - cdef bcf_hrec_t *r = self.ptr - cdef int i - bkey = force_bytes(key) - for i in range(r.nkeys): - if r.keys[i] and r.keys[i] == bkey: - return charptr_to_str(r.vals[i]) if r.vals[i] else None - raise KeyError('cannot find metadata key') - - def __iter__(self): - cdef bcf_hrec_t *r = self.ptr - cdef int i - for i in range(r.nkeys): - if r.keys[i]: - yield bcf_str_cache_get_charptr(r.keys[i]) - - def get(self, key, default=None): - """D.get(k[,d]) -> D[k] if k in D, else d. 
d defaults to None.""" - try: - return self[key] - except KeyError: - return default - - def __contains__(self, key): - try: - self[key] - except KeyError: - return False - else: - return True - - def iterkeys(self): - """D.iterkeys() -> an iterator over the keys of D""" - return iter(self) - - def itervalues(self): - """D.itervalues() -> an iterator over the values of D""" - cdef bcf_hrec_t *r = self.ptr - cdef int i - for i in range(r.nkeys): - if r.keys[i]: - yield charptr_to_str(r.vals[i]) if r.vals[i] else None - - def iteritems(self): - """D.iteritems() -> an iterator over the (key, value) items of D""" - cdef bcf_hrec_t *r = self.ptr - cdef int i - for i in range(r.nkeys): - if r.keys[i]: - yield (bcf_str_cache_get_charptr(r.keys[i]), charptr_to_str(r.vals[i]) if r.vals[i] else None) - - def keys(self): - """D.keys() -> list of D's keys""" - return list(self) - - def items(self): - """D.items() -> list of D's (key, value) pairs, as 2-tuples""" - return list(self.iteritems()) - - def values(self): - """D.values() -> list of D's values""" - return list(self.itervalues()) - - # Mappings are not hashable by default, but subclasses can change this - __hash__ = None - - #TODO: implement __richcmp__ - - def __str__(self): - cdef bcf_hrec_t *r = self.ptr - if r.type == BCF_HL_GEN: - return '##{}={}'.format(self.key, self.value) - else: - attrs = ','.join('{}={}'.format(k, v) for k,v in self.attrs if k != 'IDX') - return '##{}=<{}>'.format(self.key or self.type, attrs) - - -cdef VariantHeaderRecord makeVariantHeaderRecord(VariantHeader header, bcf_hrec_t *hdr): - if not header: - raise ValueError('invalid VariantHeader') - - if not hdr: - return None - - cdef VariantHeaderRecord record = VariantHeaderRecord.__new__(VariantHeaderRecord) - record.header = header - record.ptr = hdr - - return record - - -cdef class VariantHeaderRecords(object): - """sequence of :class:`VariantHeaderRecord` object from a :class:`VariantHeader` object""" - - def __len__(self): - return self.header.ptr.nhrec - - def __bool__(self): - return self.header.ptr.nhrec != 0 - - def __getitem__(self, index): - cdef int32_t i = index - if i < 0 or i >= self.header.ptr.nhrec: - raise IndexError('invalid header record index') - return makeVariantHeaderRecord(self.header, self.header.ptr.hrec[i]) - - def __iter__(self): - cdef int32_t i - for i in range(self.header.ptr.nhrec): - yield makeVariantHeaderRecord(self.header, self.header.ptr.hrec[i]) - - __hash__ = None - - -cdef VariantHeaderRecords makeVariantHeaderRecords(VariantHeader header): - if not header: - raise ValueError('invalid VariantHeader') - - cdef VariantHeaderRecords records = VariantHeaderRecords.__new__(VariantHeaderRecords) - records.header = header - return records - - -cdef class VariantMetadata(object): - """filter, info or format metadata record from a :class:`VariantHeader` - object""" - - property name: - """metadata name""" - def __get__(self): - cdef bcf_hdr_t *hdr = self.header.ptr - return bcf_str_cache_get_charptr(hdr.id[BCF_DT_ID][self.id].key) - - # Q: Should this be exposed? - property id: - """metadata internal header id number""" - def __get__(self): - return self.id - - property number: - """metadata number (i.e. 
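Each VariantHeaderRecord exposes the key=value attributes inside the <...> of its header line through this mapping interface. A sketch listing every INFO declaration of a hypothetical input:

import pysam

vf = pysam.VariantFile("example.vcf")
for hrec in vf.header.records:
    if hrec.type == "INFO":
        print(hrec["ID"], hrec.get("Description"))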
cardinality)""" - def __get__(self): - cdef bcf_hdr_t *hdr = self.header.ptr - if not bcf_hdr_idinfo_exists(hdr, self.type, self.id) or self.type == BCF_HL_FLT: - return None - cdef int l = bcf_hdr_id2length(hdr, self.type, self.id) - if l == BCF_VL_FIXED: - return bcf_hdr_id2number(hdr, self.type, self.id) - elif l == BCF_VL_VAR: - return '.' - else: - return METADATA_LENGTHS[l] - - property type: - """metadata value type""" - def __get__(self): - cdef bcf_hdr_t *hdr = self.header.ptr - if not bcf_hdr_idinfo_exists(hdr, self.type, self.id) or \ - self.type == BCF_HL_FLT: - return None - return VALUE_TYPES[bcf_hdr_id2type(hdr, self.type, self.id)] - - property description: - """metadata description (or None if not set)""" - def __get__(self): - descr = self.record.get('Description') - if descr: - descr = descr.strip('"') - return force_str(descr) - - property record: - """:class:`VariantHeaderRecord` associated with this - :class:`VariantMetadata` object""" - def __get__(self): - cdef bcf_hdr_t *hdr = self.header.ptr - if not bcf_hdr_idinfo_exists(hdr, self.type, self.id): - return None - cdef bcf_hrec_t *hrec = hdr.id[BCF_DT_ID][self.id].val.hrec[self.type] - if not hrec: - return None - return makeVariantHeaderRecord(self.header, hrec) - - -cdef VariantMetadata makeVariantMetadata(VariantHeader header, int type, int id): - if not header: - raise ValueError('invalid VariantHeader') - - if type != BCF_HL_FLT and type != BCF_HL_INFO and type != BCF_HL_FMT: - raise ValueError('invalid metadata type') - - if id < 0 or id >= header.ptr.n[BCF_DT_ID]: - raise ValueError('invalid metadata id') - - cdef VariantMetadata meta = VariantMetadata.__new__(VariantMetadata) - meta.header = header - meta.type = type - meta.id = id - - return meta - - -cdef class VariantHeaderMetadata(object): - """mapping from filter, info or format name to :class:`VariantMetadata` object""" - - def add(self, id, number, type, description, **kwargs): - """Add a new filter, info or format record""" - if id in self: - raise ValueError('Header already exists for id={}'.format(id)) - - if self.type == BCF_HL_FLT: - if number is not None: - raise ValueError('Number must be None when adding a filter') - if type is not None: - raise ValueError('Type must be None when adding a filter') - - items = [('ID', id), ('Description', description)] - else: - if type not in VALUE_TYPES: - raise ValueError('unknown type specified: {}'.format(type)) - if number is None: - number = '.' 
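The same information is reachable per field through VariantMetadata, e.g. for a hypothetical DP INFO field:

import pysam

vf = pysam.VariantFile("example.vcf")
meta = vf.header.info["DP"]
print(meta.name, meta.number, meta.type)   # e.g. DP 1 Integer
print(meta.description)                    # Description= text, unquoted
print(str(meta.record))                    # the full ##INFO=<...> line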
- - items = [('ID', id), - ('Number', number), - ('Type', type), - ('Description', description)] - - items += kwargs.items() - self.header.add_meta(METADATA_TYPES[self.type], items=items) - - def __len__(self): - cdef bcf_hdr_t *hdr = self.header.ptr - cdef bcf_idpair_t *idpair - cdef int32_t i, n = 0 - - for i in range(hdr.n[BCF_DT_ID]): - idpair = hdr.id[BCF_DT_ID] + i - if idpair.key and idpair.val and idpair.val.info[self.type] & 0xF != 0xF: - n += 1 - return n - - def __bool__(self): - cdef bcf_hdr_t *hdr = self.header.ptr - cdef bcf_idpair_t *idpair - cdef int32_t i - - for i in range(hdr.n[BCF_DT_ID]): - idpair = hdr.id[BCF_DT_ID] + i - if idpair.key and idpair.val and idpair.val.info[self.type] & 0xF != 0xF: - return True - return False - - def __getitem__(self, key): - cdef bcf_hdr_t *hdr = self.header.ptr - cdef vdict_t *d = hdr.dict[BCF_DT_ID] - - bkey = force_bytes(key) - cdef khiter_t k = kh_get_vdict(d, bkey) - - if k == kh_end(d) or kh_val_vdict(d, k).info[self.type] & 0xF == 0xF: - raise KeyError('invalid filter') - - return makeVariantMetadata(self.header, self.type, kh_val_vdict(d, k).id) - - def __iter__(self): - cdef bcf_hdr_t *hdr = self.header.ptr - cdef bcf_idpair_t *idpair - cdef int32_t i - - for i in range(hdr.n[BCF_DT_ID]): - idpair = hdr.id[BCF_DT_ID] + i - if idpair.key and idpair.val and idpair.val.info[self.type] & 0xF != 0xF: - yield bcf_str_cache_get_charptr(idpair.key) - - def get(self, key, default=None): - """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None.""" - try: - return self[key] - except KeyError: - return default - - def __contains__(self, key): - try: - self[key] - except KeyError: - return False - else: - return True - - def iterkeys(self): - """D.iterkeys() -> an iterator over the keys of D""" - return iter(self) - - def itervalues(self): - """D.itervalues() -> an iterator over the values of D""" - for key in self: - yield self[key] - - def iteritems(self): - """D.iteritems() -> an iterator over the (key, value) items of D""" - for key in self: - yield (key, self[key]) - - def keys(self): - """D.keys() -> list of D's keys""" - return list(self) - - def items(self): - """D.items() -> list of D's (key, value) pairs, as 2-tuples""" - return list(self.iteritems()) - - def values(self): - """D.values() -> list of D's values""" - return list(self.itervalues()) - - # Mappings are not hashable by default, but subclasses can change this - __hash__ = None - - #TODO: implement __richcmp__ - - -cdef VariantHeaderMetadata makeVariantHeaderMetadata(VariantHeader header, int32_t type): - if not header: - raise ValueError('invalid VariantHeader') - - cdef VariantHeaderMetadata meta = VariantHeaderMetadata.__new__(VariantHeaderMetadata) - meta.header = header - meta.type = type - - return meta - - -cdef class VariantContig(object): - """contig metadata from a :class:`VariantHeader`""" - - property name: - """contig name""" - def __get__(self): - cdef bcf_hdr_t *hdr = self.header.ptr - return bcf_str_cache_get_charptr(hdr.id[BCF_DT_CTG][self.id].key) - - property id: - """contig internal id number""" - def __get__(self): - return self.id - - property length: - """contig length or None if not available""" - def __get__(self): - cdef bcf_hdr_t *hdr = self.header.ptr - cdef uint32_t length = hdr.id[BCF_DT_CTG][self.id].val.info[0] - return length if length else None - - property header: - """:class:`VariantHeaderRecord` associated with this :class:`VariantContig` object""" - def __get__(self): - cdef bcf_hdr_t *hdr = self.header.ptr - cdef bcf_hrec_t 
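VariantHeaderMetadata.add is the high-level way to declare new FILTER/INFO/FORMAT lines; note that FILTER entries must pass None for both number and type. A sketch on a fresh header (field names hypothetical):

import pysam

hdr = pysam.VariantHeader()
hdr.info.add("XD", number=1, type="Integer", description="Example depth")
hdr.formats.add("XG", number=1, type="String", description="Example tag")
hdr.filters.add("q10", None, None, "Quality below 10")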
*hrec = hdr.id[BCF_DT_CTG][self.id].val.hrec[0] - return makeVariantHeaderRecord(self.header, hrec) - - -cdef VariantContig makeVariantContig(VariantHeader header, int id): - if not header: - raise ValueError('invalid VariantHeader') - - if id < 0 or id >= header.ptr.n[BCF_DT_CTG]: - raise ValueError('invalid contig id') - - cdef VariantContig contig = VariantContig.__new__(VariantContig) - contig.header = header - contig.id = id - - return contig - - -cdef class VariantHeaderContigs(object): - """mapping from contig name or index to :class:`VariantContig` object.""" - - def __len__(self): - cdef bcf_hdr_t *hdr = self.header.ptr - assert kh_size(hdr.dict[BCF_DT_CTG]) == hdr.n[BCF_DT_CTG] - return hdr.n[BCF_DT_CTG] - - def __bool__(self): - cdef bcf_hdr_t *hdr = self.header.ptr - assert kh_size(hdr.dict[BCF_DT_CTG]) == hdr.n[BCF_DT_CTG] - return hdr.n[BCF_DT_CTG] != 0 - - def __getitem__(self, key): - cdef bcf_hdr_t *hdr = self.header.ptr - cdef int index - - if isinstance(key, int): - index = key - if index < 0 or index >= hdr.n[BCF_DT_CTG]: - raise IndexError('invalid contig index') - return makeVariantContig(self.header, index) - - cdef vdict_t *d = hdr.dict[BCF_DT_CTG] - bkey = force_bytes(key) - cdef khiter_t k = kh_get_vdict(d, bkey) - - if k == kh_end(d): - raise KeyError('invalid contig') - - cdef int id = kh_val_vdict(d, k).id - - return makeVariantContig(self.header, id) - - def __iter__(self): - cdef bcf_hdr_t *hdr = self.header.ptr - cdef vdict_t *d = hdr.dict[BCF_DT_CTG] - cdef uint32_t n = kh_size(d) - - assert n == hdr.n[BCF_DT_CTG] - - for i in range(n): - yield bcf_str_cache_get_charptr(bcf_hdr_id2name(hdr, i)) - - def get(self, key, default=None): - """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None.""" - try: - return self[key] - except KeyError: - return default - - def __contains__(self, key): - try: - self[key] - except KeyError: - return False - else: - return True - - def iterkeys(self): - """D.iterkeys() -> an iterator over the keys of D""" - return iter(self) - - def itervalues(self): - """D.itervalues() -> an iterator over the values of D""" - for key in self: - yield self[key] - - def iteritems(self): - """D.iteritems() -> an iterator over the (key, value) items of D""" - for key in self: - yield (key, self[key]) - - def keys(self): - """D.keys() -> list of D's keys""" - return list(self) - - def items(self): - """D.items() -> list of D's (key, value) pairs, as 2-tuples""" - return list(self.iteritems()) - - def values(self): - """D.values() -> list of D's values""" - return list(self.itervalues()) - - # Mappings are not hashable by default, but subclasses can change this - __hash__ = None - - #TODO: implement __richcmp__ - - def add(self, id, **kwargs): - """Add a new contig record""" - if id in self: - raise ValueError('Header already exists for contig {}'.format(id)) - - items = [('ID', id)] + kwargs.items() - self.header.add_meta('contig', items=items) - - -cdef VariantHeaderContigs makeVariantHeaderContigs(VariantHeader header): - if not header: - raise ValueError('invalid VariantHeader') - - cdef VariantHeaderContigs contigs = VariantHeaderContigs.__new__(VariantHeaderContigs) - contigs.header = header - - return contigs - - -cdef class VariantHeaderSamples(object): - """sequence of sample names from a :class:`VariantHeader` object""" - - def __len__(self): - return bcf_hdr_nsamples(self.header.ptr) - - def __bool__(self): - return bcf_hdr_nsamples(self.header.ptr) != 0 - - def __getitem__(self, index): - cdef bcf_hdr_t *hdr = self.header.ptr - 
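Contigs form a mapping keyed by name or by numeric index. A sketch, assuming the input header declares chr1:

import pysam

vf = pysam.VariantFile("example.vcf")
ctg = vf.header.contigs["chr1"]
print(ctg.name, ctg.id, ctg.length)   # length is None when not declared
print(list(vf.header.contigs))        # all contig names, in header order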
cdef int32_t n = bcf_hdr_nsamples(hdr) - cdef int32_t i = index - - if i < 0 or i >= n: - raise IndexError('invalid sample index') - - return charptr_to_str(hdr.samples[i]) - - def __iter__(self): - cdef bcf_hdr_t *hdr = self.header.ptr - cdef int32_t i, n = bcf_hdr_nsamples(hdr) - - for i in range(n): - yield charptr_to_str(hdr.samples[i]) - - def __contains__(self, key): - cdef bcf_hdr_t *hdr = self.header.ptr - cdef vdict_t *d = hdr.dict[BCF_DT_SAMPLE] - bkey = force_bytes(key) - cdef khiter_t k = kh_get_vdict(d, bkey) - - return k != kh_end(d) - - # Mappings are not hashable by default, but subclasses can change this - __hash__ = None - - #TODO: implement __richcmp__ - - def add(self, name): - """Add a new sample""" - self.header.add_sample(name) - - -cdef VariantHeaderSamples makeVariantHeaderSamples(VariantHeader header): - if not header: - raise ValueError('invalid VariantHeader') - - cdef VariantHeaderSamples samples = VariantHeaderSamples.__new__(VariantHeaderSamples) - samples.header = header - - return samples - - -cdef class VariantHeader(object): - """header information for a :class:`VariantFile` object""" - - #FIXME: Add structured proxy - #FIXME: Add generic proxy - #FIXME: Add mutable methods - - # See makeVariantHeader for C constructor - def __cinit__(self): - self.ptr = NULL - - # Python constructor - def __init__(self): - self.ptr = bcf_hdr_init(b'w') - if not self.ptr: - raise ValueError('cannot create VariantHeader') - - def __dealloc__(self): - if self.ptr: - bcf_hdr_destroy(self.ptr) - self.ptr = NULL - - def __bool__(self): - # self.ptr == NULL should be impossible - return self.ptr != NULL - - def copy(self): - return makeVariantHeader(bcf_hdr_dup(self.ptr)) - - property version: - """VCF version""" - def __get__(self): - return force_str(bcf_hdr_get_version(self.ptr)) - - property samples: - """samples (:class:`VariantHeaderSamples`)""" - def __get__(self): - return makeVariantHeaderSamples(self) - - property records: - """header records (:class:`VariantHeaderRecords`)""" - def __get__(self): - return makeVariantHeaderRecords(self) - - property contigs: - """contig information (:class:`VariantHeaderContigs`)""" - def __get__(self): - return makeVariantHeaderContigs(self) - - property filters: - """filter metadata (:class:`VariantHeaderMetadata`)""" - def __get__(self): - return makeVariantHeaderMetadata(self, BCF_HL_FLT) - - property info: - """info metadata (:class:`VariantHeaderMetadata`)""" - def __get__(self): - return makeVariantHeaderMetadata(self, BCF_HL_INFO) - - property formats: - """format metadata (:class:`VariantHeaderMetadata`)""" - def __get__(self): - return makeVariantHeaderMetadata(self, BCF_HL_FMT) - - property alts: - """alt metadata (:class:`dict` ID->record). - - The data returned just a snapshot of alt records, is created - every time the property is requested, and modifications will - not be reflected in the header metadata and vice versa. - - i.e. it is just a dict that reflects the state of alt records - at the time it is created. 
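Putting the header pieces together: copy() duplicates via bcf_hdr_dup, and samples are appended through add_sample. A sketch with a hypothetical new sample name:

import pysam

vf = pysam.VariantFile("example.vcf")
hdr = vf.header.copy()
print(hdr.version)          # e.g. VCFv4.2
hdr.add_sample("EXTRA")     # raises ValueError on a duplicate name
print(list(hdr.samples))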
- - """ - def __get__(self): - return {record['ID']:record for record in self.records - if record.key.upper() == 'ALT' } - - - # only safe to do when opening an htsfile - cdef _subset_samples(self, include_samples): - keep_samples = set(self.samples) - include_samples = set(include_samples) - missing_samples = include_samples - keep_samples - keep_samples &= include_samples - - if missing_samples: - # FIXME: add specialized exception with payload - raise ValueError( - 'missing {:d} requested samples'.format( - len(missing_samples))) - - keep_samples = force_bytes(','.join(keep_samples)) - cdef char *keep = keep_samples if keep_samples else NULL - cdef ret = bcf_hdr_set_samples(self.ptr, keep, 0) - - if ret != 0: - raise ValueError( - 'bcf_hdr_set_samples failed: ret = {}'.format(ret)) - - def __str__(self): - cdef int hlen - cdef char *hstr = bcf_hdr_fmt_text(self.ptr, 0, &hlen) - - try: - return charptr_to_str_w_len(hstr, hlen) - finally: - free(hstr) - - def add_record(self, VariantHeaderRecord record): - """Add an existing :class:`VariantHeaderRecord` to this header""" - cdef bcf_hrec_t *r = record.ptr - - if r.type == BCF_HL_GEN: - self.add_meta(r.key, r.value) - else: - items = [(k,v) for k,v in record.attrs if k != 'IDX'] - self.add_meta(r.key, items=items) - - def add_line(self, line): - """Add a metadata line to this header""" - bline = force_bytes(line) - if bcf_hdr_append(self.ptr, bline) < 0: - raise ValueError('invalid header line') - - if self.ptr.dirty: - bcf_hdr_sync(self.ptr) - - def add_meta(self, key, value=None, items=None): - """Add metadata to this header""" - if not ((value is not None) ^ (items is not None)): - raise ValueError('either value or items must be specified') - - cdef bcf_hrec_t *hrec = calloc(1, sizeof(bcf_hrec_t)) - cdef int quoted - - try: - key = force_bytes(key) - hrec.key = strdup(key) - - if value is not None: - hrec.value = strdup(force_bytes(value)) - else: - for key, value in items: - key = force_bytes(key) - bcf_hrec_add_key(hrec, key, len(key)) - - value = force_bytes(str(value)) - quoted = strpbrk(value, ' ;,"\t<>') != NULL - bcf_hrec_set_val(hrec, hrec.nkeys-1, value, len(value), quoted) - except: - bcf_hrec_destroy(hrec) - raise - - bcf_hdr_add_hrec(self.ptr, hrec) - - if self.ptr.dirty: - bcf_hdr_sync(self.ptr) - - def add_sample(self, name): - """Add a new sample to this header""" - bname = force_bytes(name) - if bcf_hdr_add_sample(self.ptr, bname) < 0: - raise ValueError('Duplicated sample name: {}'.format(name)) - if self.ptr.dirty: - bcf_hdr_sync(self.ptr) - - -cdef VariantHeader makeVariantHeader(bcf_hdr_t *hdr): - if not hdr: - raise ValueError('cannot create VariantHeader') - - cdef VariantHeader header = VariantHeader.__new__(VariantHeader) - header.ptr = hdr - - return header - - -######################################################################## -######################################################################## -## Variant Record objects -######################################################################## - -cdef class VariantRecordFilter(object): - """Filters set on a :class:`VariantRecord` object, presented as a mapping from - filter index or name to :class:`VariantMetadata` object""" - - def __len__(self): - return self.record.ptr.d.n_flt - - def __bool__(self): - return self.record.ptr.d.n_flt != 0 - - def __getitem__(self, key): - cdef bcf_hdr_t *hdr = self.record.header.ptr - cdef bcf1_t *r = self.record.ptr - cdef int index, id - cdef int n = r.d.n_flt - - if isinstance(key, int): - index = key - - if 
index < 0 or index >= n: - raise IndexError('invalid filter index') - - id = r.d.flt[index] - else: - if key == '.': - key = 'PASS' - - bkey = force_bytes(key) - id = bcf_hdr_id2int(hdr, BCF_DT_ID, bkey) - - if not bcf_hdr_idinfo_exists(hdr, BCF_HL_FLT, id) \ - or not bcf_has_filter(hdr, self.record.ptr, bkey): - raise KeyError('Invalid filter') - - return makeVariantMetadata(self.record.header, BCF_HL_FLT, id) - - def __delitem__(self, key): - cdef bcf_hdr_t *hdr = self.record.header.ptr - cdef bcf1_t *r = self.record.ptr - cdef int index, id - cdef int n = r.d.n_flt - - if isinstance(key, int): - index = key - - if index < 0 or index >= n: - raise IndexError('invalid filter index') - - id = r.d.flt[index] - else: - if key == '.': - key = 'PASS' - - bkey = force_bytes(key) - id = bcf_hdr_id2int(hdr, BCF_DT_ID, bkey) - - if not bcf_hdr_idinfo_exists(hdr, BCF_HL_FLT, id) \ - or not bcf_has_filter(hdr, self.record.ptr, bkey): - raise KeyError('Invalid filter') - - bcf_remove_filter(hdr, r, id, 0) - - def clear(self): - """Clear all filters""" - cdef bcf1_t *r = self.record.ptr - r.d.shared_dirty |= BCF1_DIRTY_FLT - r.d.n_flt = 0 - - def __iter__(self): - cdef bcf_hdr_t *hdr = self.record.header.ptr - cdef bcf1_t *r = self.record.ptr - cdef int i - - for i in range(r.d.n_flt): - yield bcf_str_cache_get_charptr(bcf_hdr_int2id(hdr, BCF_DT_ID, r.d.flt[i])) - - def get(self, key, default=None): - """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None.""" - try: - return self[key] - except KeyError: - return default - - def __contains__(self, key): - cdef bcf_hdr_t *hdr = self.record.header.ptr - cdef bcf1_t *r = self.record.ptr - bkey = force_bytes(key) - return bcf_has_filter(hdr, r, bkey) == 1 - - def iterkeys(self): - """D.iterkeys() -> an iterator over the keys of D""" - return iter(self) - - def itervalues(self): - """D.itervalues() -> an iterator over the values of D""" - for key in self: - yield self[key] - - def iteritems(self): - """D.iteritems() -> an iterator over the (key, value) items of D""" - for key in self: - yield (key, self[key]) - - def keys(self): - """D.keys() -> list of D's keys""" - return list(self) - - def items(self): - """D.items() -> list of D's (key, value) pairs, as 2-tuples""" - return list(self.iteritems()) - - def values(self): - """D.values() -> list of D's values""" - return list(self.itervalues()) - - # Mappings are not hashable by default, but subclasses can change this - __hash__ = None - - #TODO: implement __richcmp__ - - -cdef VariantRecordFilter makeVariantRecordFilter(VariantRecord record): - if not record: - raise ValueError('invalid VariantRecord') - - cdef VariantRecordFilter filter = VariantRecordFilter.__new__(VariantRecordFilter) - filter.record = record - - return filter - - -cdef class VariantRecordFormat(object): - """Format data present for each sample in a :class:`VariantRecord` object, - presented as mapping from format name to :class:`VariantMetadata` object.""" - - def __len__(self): - cdef bcf_hdr_t *hdr = self.record.header.ptr - cdef bcf1_t *r = self.record.ptr - cdef int i, n = 0 - - for i in range(r.n_fmt): - if r.d.fmt[i].p: - n += 1 - return n - - def __bool__(self): - cdef bcf_hdr_t *hdr = self.record.header.ptr - cdef bcf1_t *r = self.record.ptr - cdef int i - - for i in range(r.n_fmt): - if r.d.fmt[i].p: - return True - return False - - def __getitem__(self, key): - cdef bcf_hdr_t *hdr = self.record.header.ptr - cdef bcf1_t *r = self.record.ptr - - bkey = force_bytes(key) - cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey) - 
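Record-level filters behave as a mapping from filter name (or index) to its header metadata; '.' is accepted as an alias for PASS. A sketch:

import pysam

vf = pysam.VariantFile("example.vcf")
rec = next(vf)

print(list(rec.filter))        # e.g. ['PASS']
print("PASS" in rec.filter)
rec.filter.clear()             # marks the record dirty and drops all filters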
- if not fmt or not fmt.p: - raise KeyError('unknown format') - - return makeVariantMetadata(self.record.header, BCF_HL_FMT, fmt.id) - - def __delitem__(self, key): - cdef bcf_hdr_t *hdr = self.record.header.ptr - cdef bcf1_t *r = self.record.ptr - - bkey = force_bytes(key) - cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey) - - if not fmt or not fmt.p: - raise KeyError('unknown format') - - if bcf_update_format(hdr, r, bkey, fmt.p, 0, fmt.type) < 0: - raise ValueError('Unable to delete FORMAT') - - def clear(self): - """Clear all formats for all samples within the associated - :class:`VariantRecord` instance""" - cdef bcf_hdr_t *hdr = self.record.header.ptr - cdef bcf1_t *r = self.record.ptr - cdef bcf_fmt_t *fmt - cdef const char *key - cdef int i - - for i in reversed(range(r.n_fmt)): - fmt = &r.d.fmt[i] - if fmt.p: - key = bcf_hdr_int2id(hdr, BCF_DT_ID, fmt.id) - if bcf_update_format(hdr, r, key, fmt.p, 0, fmt.type) < 0: - raise ValueError('Unable to delete FORMAT') - - def __iter__(self): - cdef bcf_hdr_t *hdr = self.record.header.ptr - cdef bcf1_t *r = self.record.ptr - cdef bcf_fmt_t *fmt - cdef int i - - for i in range(r.n_fmt): - fmt = &r.d.fmt[i] - if fmt.p: - yield bcf_str_cache_get_charptr(bcf_hdr_int2id(hdr, BCF_DT_ID, fmt.id)) - - def get(self, key, default=None): - """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None.""" - try: - return self[key] - except KeyError: - return default - - def __contains__(self, key): - cdef bcf_hdr_t *hdr = self.record.header.ptr - cdef bcf1_t *r = self.record.ptr - bkey = force_bytes(key) - cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey) - return fmt != NULL and fmt.p != NULL - - def iterkeys(self): - """D.iterkeys() -> an iterator over the keys of D""" - return iter(self) - - def itervalues(self): - """D.itervalues() -> an iterator over the values of D""" - for key in self: - yield self[key] - - def iteritems(self): - """D.iteritems() -> an iterator over the (key, value) items of D""" - for key in self: - yield (key, self[key]) - - def keys(self): - """D.keys() -> list of D's keys""" - return list(self) - - def items(self): - """D.items() -> list of D's (key, value) pairs, as 2-tuples""" - return list(self.iteritems()) - - def values(self): - """D.values() -> list of D's values""" - return list(self.itervalues()) - - # Mappings are not hashable by default, but subclasses can change this - __hash__ = None - - #TODO: implement __richcmp__ - - -cdef VariantRecordFormat makeVariantRecordFormat(VariantRecord record): - if not record: - raise ValueError('invalid VariantRecord') - - cdef VariantRecordFormat format = VariantRecordFormat.__new__( - VariantRecordFormat) - format.record = record - - return format - - -#TODO: Add a getmeta method to return the corresponding VariantMetadata? 
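record.format lists which FORMAT fields are present on the record; deleting one removes it for every sample. A sketch with a hypothetical DP field:

import pysam

vf = pysam.VariantFile("example.vcf")
rec = next(vf)

print(list(rec.format))   # e.g. ['GT', 'DP']
del rec.format["DP"]      # bcf_update_format with zero values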
-cdef class VariantRecordInfo(object): - """Info data stored in a :class:`VariantRecord` object, presented as a - mapping from info metadata name to value.""" - - def __len__(self): - return self.record.ptr.n_info - - def __bool__(self): - return self.record.ptr.n_info != 0 - - def __getitem__(self, key): - cdef bcf_hdr_t *hdr = self.record.header.ptr - cdef bcf1_t *r = self.record.ptr - cdef vdict_t *d - cdef khiter_t k - cdef info_id - - if bcf_unpack(r, BCF_UN_INFO) < 0: - raise ValueError('Error unpacking VariantRecord') - - bkey = force_bytes(key) - cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey) - - if not info: - d = hdr.dict[BCF_DT_ID] - k = kh_get_vdict(d, bkey) - - if k == kh_end(d) or kh_val_vdict(d, k).info[BCF_HL_INFO] & 0xF == 0xF: - raise KeyError('Unknown INFO field: {}'.format(key)) - - info_id = kh_val_vdict(d, k).id - else: - info_id = info.key - - if bcf_hdr_id2type(hdr, BCF_HL_INFO, info_id) == BCF_HT_FLAG: - return info != NULL and info.vptr != NULL - - if not info or not info.vptr: - raise KeyError('Invalid INFO field: {}'.format(key)) - - return bcf_info_get_value(self.record, info) - - def __setitem__(self, key, value): - bcf_info_set_value(self.record, key, value) - - def __delitem__(self, key): - cdef bcf_hdr_t *hdr = self.record.header.ptr - cdef bcf1_t *r = self.record.ptr - - if bcf_unpack(r, BCF_UN_INFO) < 0: - raise ValueError('Error unpacking VariantRecord') - - bkey = force_bytes(key) - cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey) - - if not info or not info.vptr: - raise KeyError('Unknown INFO field: {}'.format(key)) - - if bcf_update_info(hdr, r, bkey, NULL, 0, info.type) < 0: - raise ValueError('Unable to delete INFO') - - def clear(self): - """Clear all info data""" - cdef bcf_hdr_t *hdr = self.record.header.ptr - cdef bcf1_t *r = self.record.ptr - cdef bcf_info_t *info - cdef const char *key - cdef int i - - if bcf_unpack(r, BCF_UN_INFO) < 0: - raise ValueError('Error unpacking VariantRecord') - - for i in range(r.n_info): - info = &r.d.info[i] - if info and info.vptr: - key = bcf_hdr_int2id(hdr, BCF_DT_ID, info.key) - if bcf_update_info(hdr, r, key, NULL, 0, info.type) < 0: - raise ValueError('Unable to delete INFO') - - def __iter__(self): - cdef bcf_hdr_t *hdr = self.record.header.ptr - cdef bcf1_t *r = self.record.ptr - cdef bcf_info_t *info - cdef int i - - for i in range(r.n_info): - info = &r.d.info[i] - if info and info.vptr: - yield bcf_str_cache_get_charptr(bcf_hdr_int2id(hdr, BCF_DT_ID, info.key)) - - def get(self, key, default=None): - """D.get(k[,d]) -> D[k] if k in D, else d. 
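record.info is the dict-like view over these helpers; get() gives KeyError-free access. A sketch (DP hypothetical):

import pysam

vf = pysam.VariantFile("example.vcf")
rec = next(vf)

print(rec.info.get("DP"))          # None when DP is absent
for key, value in rec.info.items():
    print(key, value)
del rec.info["DP"]                 # raises KeyError if DP is not set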
d defaults to None.""" - try: - return self[key] - except KeyError: - return default - - def __contains__(self, key): - cdef bcf_hdr_t *hdr = self.record.header.ptr - cdef bcf1_t *r = self.record.ptr - - if bcf_unpack(r, BCF_UN_INFO) < 0: - raise ValueError('Error unpacking VariantRecord') - - bkey = force_bytes(key) - cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey) - - return info != NULL - - def iterkeys(self): - """D.iterkeys() -> an iterator over the keys of D""" - return iter(self) - - def itervalues(self): - """D.itervalues() -> an iterator over the values of D""" - cdef bcf1_t *r = self.record.ptr - cdef bcf_info_t *info - cdef int i - - for i in range(r.n_info): - info = &r.d.info[i] - if info and info.vptr: - yield bcf_info_get_value(self.record, info) - - def iteritems(self): - """D.iteritems() -> an iterator over the (key, value) items of D""" - cdef bcf_hdr_t *hdr = self.record.header.ptr - cdef bcf1_t *r = self.record.ptr - cdef bcf_info_t *info - cdef int i - - for i in range(r.n_info): - info = &r.d.info[i] - if info and info.vptr: - key = bcf_hdr_int2id(hdr, BCF_DT_ID, info.key) - value = bcf_info_get_value(self.record, info) - yield bcf_str_cache_get_charptr(key), value - - def keys(self): - """D.keys() -> list of D's keys""" - return list(self) - - def items(self): - """D.items() -> list of D's (key, value) pairs, as 2-tuples""" - return list(self.iteritems()) - - def values(self): - """D.values() -> list of D's values""" - return list(self.itervalues()) - - # Mappings are not hashable by default, but subclasses can change this - __hash__ = None - - #TODO: implement __richcmp__ - - -cdef VariantRecordInfo makeVariantRecordInfo(VariantRecord record): - if not record: - raise ValueError('invalid VariantRecord') - - cdef VariantRecordInfo info = VariantRecordInfo.__new__(VariantRecordInfo) - info.record = record - - return info - - -cdef class VariantRecordSamples(object): - """mapping from sample index or name to :class:`VariantRecordSample` object.""" - - def __len__(self): - return bcf_hdr_nsamples(self.record.header.ptr) - - def __bool__(self): - return bcf_hdr_nsamples(self.record.header.ptr) != 0 - - def __getitem__(self, key): - cdef bcf_hdr_t *hdr = self.record.header.ptr - cdef bcf1_t *r = self.record.ptr - cdef int n = bcf_hdr_nsamples(hdr) - cdef int sample_index - cdef vdict_t *d - cdef khiter_t k - - if isinstance(key, int): - sample_index = key - else: - bkey = force_bytes(key) - sample_index = bcf_hdr_id2int(hdr, BCF_DT_SAMPLE, bkey) - if sample_index < 0: - raise KeyError('invalid sample name') - - if sample_index < 0 or sample_index >= n: - raise IndexError('invalid sample index') - - return makeVariantRecordSample(self.record, sample_index) - - def __iter__(self): - cdef bcf_hdr_t *hdr = self.record.header.ptr - cdef bcf1_t *r = self.record.ptr - cdef int32_t i, n = bcf_hdr_nsamples(hdr) - - for i in range(n): - yield charptr_to_str(hdr.samples[i]) - - def get(self, key, default=None): - """D.get(k[,d]) -> D[k] if k in D, else d. 
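record.samples resolves either a 0-based index or a sample name to the same VariantRecordSample. A sketch:

import pysam

vf = pysam.VariantFile("example.vcf")
rec = next(vf)

first = rec.samples[0]
same = rec.samples[first.name]     # name lookup via BCF_DT_SAMPLE
print(first.name, first.allele_indices)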
d defaults to None.""" - try: - return self[key] - except KeyError: - return default - - def __contains__(self, key): - cdef bcf_hdr_t *hdr = self.record.header.ptr - cdef bcf1_t *r = self.record.ptr - cdef int n = bcf_hdr_nsamples(hdr) - cdef int sample_index - cdef vdict_t *d - cdef khiter_t k - - if isinstance(key, int): - sample_index = key - else: - bkey = force_bytes(key) - sample_index = bcf_hdr_id2int(hdr, BCF_DT_SAMPLE, bkey) - if sample_index < 0: - raise KeyError('invalid sample name') - - return 0 <= sample_index < n - - def iterkeys(self): - """D.iterkeys() -> an iterator over the keys of D""" - return iter(self) - - def itervalues(self): - """D.itervalues() -> an iterator over the values of D""" - cdef bcf_hdr_t *hdr = self.record.header.ptr - cdef bcf1_t *r = self.record.ptr - cdef int32_t i, n = bcf_hdr_nsamples(hdr) - - for i in range(n): - yield makeVariantRecordSample(self.record, i) - - def iteritems(self): - """D.iteritems() -> an iterator over the (key, value) items of D""" - cdef bcf_hdr_t *hdr = self.record.header.ptr - cdef bcf1_t *r = self.record.ptr - cdef int32_t i, n = bcf_hdr_nsamples(hdr) - - for i in range(n): - yield (charptr_to_str(hdr.samples[i]), makeVariantRecordSample(self.record, i)) - - def keys(self): - """D.keys() -> list of D's keys""" - return list(self) - - def items(self): - """D.items() -> list of D's (key, value) pairs, as 2-tuples""" - return list(self.iteritems()) - - def values(self): - """D.values() -> list of D's values""" - return list(self.itervalues()) - - # Mappings are not hashable by default, but subclasses can change this - __hash__ = None - - #TODO: implement __richcmp__ - - -cdef VariantRecordSamples makeVariantRecordSamples(VariantRecord record): - if not record: - raise ValueError('invalid VariantRecord') - - cdef VariantRecordSamples samples = VariantRecordSamples.__new__( - VariantRecordSamples) - samples.record = record - - return samples - - -cdef class VariantRecord(object): - """Variant record""" - - def __dealloc__(self): - if self.ptr: - bcf_destroy1(self.ptr) - self.ptr = NULL - - property rid: - """internal reference id number""" - def __get__(self): - return self.ptr.rid - def __set__(self, rid): - cdef bcf_hdr_t *hdr = self.header.ptr - cdef int r = rid - if rid < 0 or r >= hdr.n[BCF_DT_CTG] or not hdr.id[BCF_DT_CTG][r].val: - raise ValueError('invalid reference id') - self.ptr.rid = r - - property chrom: - """chromosome/contig name""" - def __get__(self): - return bcf_str_cache_get_charptr(bcf_hdr_id2name(self.header.ptr, self.ptr.rid)) - def __set__(self, chrom): - cdef vdict_t *d = self.header.ptr.dict[BCF_DT_CTG] - bchrom = force_bytes(chrom) - cdef khint_t k = kh_get_vdict(d, bchrom) - if k == kh_end(d): - raise ValueError('Invalid chromosome/contig') - self.ptr.rid = kh_val_vdict(d, k).id - - property contig: - """chromosome/contig name""" - def __get__(self): - return bcf_str_cache_get_charptr(bcf_hdr_id2name(self.header.ptr, self.ptr.rid)) - def __set__(self, chrom): - cdef vdict_t *d = self.header.ptr.dict[BCF_DT_CTG] - bchrom = force_bytes(chrom) - cdef khint_t k = kh_get_vdict(d, bchrom) - if k == kh_end(d): - raise ValueError('Invalid chromosome/contig') - self.ptr.rid = kh_val_vdict(d, k).id - - property pos: - """record start position on chrom/contig (1-based inclusive)""" - def __get__(self): - return self.ptr.pos + 1 - def __set__(self, pos): - if pos < 1: - raise ValueError('Position must be positive') - # FIXME: check start <= stop? 
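The coordinate properties enforce pysam's convention: pos is the 1-based VCF position, start/stop are 0-based half-open, and rlen ties them together; qual maps htslib's float "missing" sentinel to None in both directions. A sketch:

import pysam

vf = pysam.VariantFile("example.vcf")
rec = next(vf)

assert rec.pos == rec.start + 1
assert rec.stop - rec.start == rec.rlen
rec.start = 99        # stored 0-based; rec.pos now reads 100
rec.qual = None       # writes bcf_float_missing for the QUAL column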
- # KBJ: Can't or else certain mutating operations will become - # difficult or impossible. e.g. having to delete - # info['END'] before being able to reset pos is going to - # create subtle bugs. Better to check this when writing - # records. - self.ptr.pos = pos - 1 - - property start: - """record start position on chrom/contig (0-based inclusive)""" - def __get__(self): - return self.ptr.pos - def __set__(self, start): - if start < 0: - raise ValueError('Start coordinate must be non-negative') - # FIXME: check start <= stop? - # KBJ: See above. - self.ptr.pos = start - - property stop: - """record stop position on chrom/contig (0-based exclusive)""" - def __get__(self): - return self.ptr.pos + self.ptr.rlen - def __set__(self, stop): - if stop < self.ptr.pos: - raise ValueError('Stop coordinate must be greater than or equal to start') - self.ptr.rlen = stop - self.ptr.pos - - property rlen: - """record length on chrom/contig (typically rec.stop - rec.start unless END info is supplied)""" - def __get__(self): - return self.ptr.rlen - def __set__(self, rlen): - if rlen < 0: - raise ValueError('Reference length must be non-negative') - self.ptr.rlen = rlen - - property qual: - """phred scaled quality score or None if not available""" - def __get__(self): - return self.ptr.qual if not bcf_float_is_missing(self.ptr.qual) else None - def __set__(self, qual): - if qual is not None: - self.ptr.qual = qual - else: - bcf_float_set(&self.ptr.qual, bcf_float_missing) - -# property n_allele: -# def __get__(self): -# return self.ptr.n_allele - -# property n_sample: -# def __get__(self): -# return self.ptr.n_sample - - property id: - """record identifier or None if not available""" - def __get__(self): - cdef bcf1_t *r = self.ptr - if bcf_unpack(r, BCF_UN_STR) < 0: - raise ValueError('Error unpacking VariantRecord') - return bcf_str_cache_get_charptr(r.d.id) if r.d.id != b'.' 
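ref and alts are views over one underlying allele array; assigning alleles (REF first) is the primitive that both setters reduce to. A sketch:

import pysam

vf = pysam.VariantFile("example.vcf")
rec = next(vf)

rec.alleles = ("A", "G", "T")     # REF followed by the ALTs
rec.ref = "C"                     # rewrites alleles[0] only
print(rec.ref, rec.alts)          # C ('G', 'T')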
else None - def __set__(self, id): - cdef bcf1_t *r = self.ptr - if bcf_unpack(r, BCF_UN_STR) < 0: - raise ValueError('Error unpacking VariantRecord') - cdef char *idstr = NULL - if id is not None: - bid = force_bytes(id) - idstr = bid - if bcf_update_id(self.header.ptr, self.ptr, idstr) < 0: - raise ValueError('Error updating id') - - property ref: - """reference allele""" - def __get__(self): - cdef bcf1_t *r = self.ptr - if bcf_unpack(r, BCF_UN_STR) < 0: - raise ValueError('Error unpacking VariantRecord') - return charptr_to_str(r.d.allele[0]) if r.d.allele else None - def __set__(self, ref): - cdef bcf1_t *r = self.ptr - if bcf_unpack(r, BCF_UN_STR) < 0: - raise ValueError('Error unpacking VariantRecord') - #FIXME: Set alleles directly -- this is stupid - if not ref: - raise ValueError('ref allele cannot be null') - ref = force_bytes(ref) - if r.d.allele and r.n_allele: - alleles = [r.d.allele[i] for i in range(r.n_allele)] - alleles[0] = ref - else: - alleles = [ref] - self.alleles = alleles - - property alleles: - """tuple of reference allele followed by alt alleles""" - def __get__(self): - cdef bcf1_t *r = self.ptr - if bcf_unpack(r, BCF_UN_STR) < 0: - raise ValueError('Error unpacking VariantRecord') - if not r.d.allele: - return None - cdef tuple res = PyTuple_New(r.n_allele) - for i in range(r.n_allele): - a = charptr_to_str(r.d.allele[i]) - PyTuple_SET_ITEM(res, i, a) - Py_INCREF(a) - return res - def __set__(self, values): - cdef bcf1_t *r = self.ptr - if bcf_unpack(r, BCF_UN_STR) < 0: - raise ValueError('Error unpacking VariantRecord') - values = [force_bytes(v) for v in values] - if b'' in values: - raise ValueError('cannot set null allele') - values = b','.join(values) - if bcf_update_alleles_str(self.header.ptr, r, values) < 0: - raise ValueError('Error updating alleles') - - property alts: - """tuple of alt alleles""" - def __get__(self): - cdef bcf1_t *r = self.ptr - if bcf_unpack(r, BCF_UN_STR) < 0: - raise ValueError('Error unpacking VariantRecord') - if r.n_allele < 2 or not r.d.allele: - return None - cdef tuple res = PyTuple_New(r.n_allele - 1) - for i in range(1, r.n_allele): - a = charptr_to_str(r.d.allele[i]) - PyTuple_SET_ITEM(res, i - 1, a) - Py_INCREF(a) - return res - def __set__(self, values): - #FIXME: Set alleles directly -- this is stupid - cdef bcf1_t *r = self.ptr - if bcf_unpack(r, BCF_UN_STR) < 0: - raise ValueError('Error unpacking VariantRecord') - values = [force_bytes(v) for v in values] - if b'' in values: - raise ValueError('cannot set null alt allele') - ref = [r.d.allele[0] if r.d.allele and r.n_allele else b'.'] - self.alleles = ref + values - - property filter: - """filter information (see :class:`VariantRecordFilter`)""" - def __get__(self): - if bcf_unpack(self.ptr, BCF_UN_FLT) < 0: - raise ValueError('Error unpacking VariantRecord') - return makeVariantRecordFilter(self) - - property info: - """info data (see :class:`VariantRecordInfo`)""" - def __get__(self): - if bcf_unpack(self.ptr, BCF_UN_INFO) < 0: - raise ValueError('Error unpacking VariantRecord') - return makeVariantRecordInfo(self) - - property format: - """sample format metadata (see :class:`VariantRecordFormat`)""" - def __get__(self): - if bcf_unpack(self.ptr, BCF_UN_FMT) < 0: - raise ValueError('Error unpacking VariantRecord') - return makeVariantRecordFormat(self) - - property samples: - """sample data (see :class:`VariantRecordSamples`)""" - def __get__(self): - if bcf_unpack(self.ptr, BCF_UN_ALL) < 0: - raise ValueError('Error unpacking VariantRecord') - return 
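The filter/info/format/samples properties unpack the record lazily before handing back their proxy objects, so each of the following is cheap to touch. A sketch:

import pysam

vf = pysam.VariantFile("example.vcf")
rec = next(vf)

print(list(rec.filter), dict(rec.info))
print(list(rec.format), [s.name for s in rec.samples.values()])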
makeVariantRecordSamples(self) - - def __str__(self): - cdef kstring_t line - cdef char c - - line.l = line.m = 0 - line.s = NULL - - if vcf_format(self.header.ptr, self.ptr, &line) < 0: - if line.m: - free(line.s) - raise ValueError('vcf_format failed') - - # Strip CR/LF? - #while line.l: - # c = line.s[line.l - 1] - # if c != b'\n' and c != b'\r': - # break - # line.l -= 1 - - ret = charptr_to_str_w_len(line.s, line.l) - - if line.m: - free(line.s) - - return ret - - -cdef VariantRecord makeVariantRecord(VariantHeader header, bcf1_t *r): - if not header: - raise ValueError('invalid VariantHeader') - - if not r: - raise ValueError('cannot create VariantRecord') - - cdef VariantRecord record = VariantRecord.__new__(VariantRecord) - record.header = header - record.ptr = r - - return record - - -######################################################################## -######################################################################## -## Variant Sampletype object -######################################################################## - - -cdef class VariantRecordSample(object): - """Data for a single sample from a :class:`VariantRecord` object. - Provides data accessors for genotypes and a mapping interface - from format name to values. - """ - - property name: - """sample name""" - def __get__(self): - cdef bcf_hdr_t *hdr = self.record.header.ptr - cdef bcf1_t *r = self.record.ptr - cdef int32_t n = bcf_hdr_nsamples(hdr) - - if self.index < 0 or self.index >= n: - raise ValueError('invalid sample index') - - return charptr_to_str(hdr.samples[self.index]) - - property allele_indices: - """allele indices for called genotype, if present. Otherwise None""" - def __get__(self): - return bcf_format_get_allele_indices(self) - def __set__(self, values): - self['GT'] = values - def __del__(self): - self['GT'] = () - - property alleles: - """alleles for called genotype, if present. Otherwise None""" - def __get__(self): - return bcf_format_get_alleles(self) - def __set__(self, values): - self['GT'] = values - def __del__(self): - self['GT'] = () - - property phased: - """False if genotype is missing or any allele is unphased. 
-        Otherwise True."""
-        def __get__(self):
-            return bcf_sample_get_phased(self)
-        def __set__(self, value):
-            bcf_sample_set_phased(self, value)
-
-    def __len__(self):
-        cdef bcf_hdr_t *hdr = self.record.header.ptr
-        cdef bcf1_t *r = self.record.ptr
-        cdef int i, n = 0
-
-        if bcf_unpack(r, BCF_UN_FMT) < 0:
-            raise ValueError('Error unpacking VariantRecord')
-
-        for i in range(r.n_fmt):
-            if r.d.fmt[i].p:
-                n += 1
-        return n
-
-    def __bool__(self):
-        cdef bcf_hdr_t *hdr = self.record.header.ptr
-        cdef bcf1_t *r = self.record.ptr
-        cdef int i
-
-        if bcf_unpack(r, BCF_UN_FMT) < 0:
-            raise ValueError('Error unpacking VariantRecord')
-
-        for i in range(r.n_fmt):
-            if r.d.fmt[i].p:
-                return True
-        return False
-
-    def __getitem__(self, key):
-        return bcf_format_get_value(self, key)
-
-    def __setitem__(self, key, value):
-        bcf_format_set_value(self, key, value)
-
-    def __delitem__(self, key):
-        bcf_format_del_value(self, key)
-
-    def clear(self):
-        """Clear all format data (including genotype) for this sample"""
-        cdef bcf_hdr_t *hdr = self.record.header.ptr
-        cdef bcf1_t *r = self.record.ptr
-        cdef bcf_fmt_t *fmt
-        cdef int i
-
-        for i in range(r.n_fmt):
-            fmt = &r.d.fmt[i]
-            if fmt.p:
-                bcf_format_del_value(self, bcf_hdr_int2id(hdr, BCF_DT_ID, fmt.id))
-
-    def __iter__(self):
-        cdef bcf_hdr_t *hdr = self.record.header.ptr
-        cdef bcf1_t *r = self.record.ptr
-        cdef bcf_fmt_t *fmt
-        cdef int i
-
-        for i in range(r.n_fmt):
-            fmt = &r.d.fmt[i]
-            if r.d.fmt[i].p:
-                yield bcf_str_cache_get_charptr(bcf_hdr_int2id(hdr, BCF_DT_ID, fmt.id))
-
-    def get(self, key, default=None):
-        """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
-        try:
-            return self[key]
-        except KeyError:
-            return default
-
-    def __contains__(self, key):
-        cdef bcf_hdr_t *hdr = self.record.header.ptr
-        cdef bcf1_t *r = self.record.ptr
-        bkey = force_bytes(key)
-        cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey)
-        return fmt != NULL and fmt.p != NULL
-
-    def iterkeys(self):
-        """D.iterkeys() -> an iterator over the keys of D"""
-        return iter(self)
-
-    def itervalues(self):
-        """D.itervalues() -> an iterator over the values of D"""
-        for key in self:
-            yield self[key]
-
-    def iteritems(self):
-        """D.iteritems() -> an iterator over the (key, value) items of D"""
-        for key in self:
-            yield (key, self[key])
-
-    def keys(self):
-        """D.keys() -> list of D's keys"""
-        return list(self)
-
-    def items(self):
-        """D.items() -> list of D's (key, value) pairs, as 2-tuples"""
-        return list(self.iteritems())
-
-    def values(self):
-        """D.values() -> list of D's values"""
-        return list(self.itervalues())
-
-    # Mappings are not hashable by default, but subclasses can change this
-    __hash__ = None
-
-    #TODO: implement __richcmp__
-
-
-cdef VariantRecordSample makeVariantRecordSample(VariantRecord record, int32_t sample_index):
-    if not record or sample_index < 0:
-        raise ValueError('cannot create VariantRecordSample')
-
-    cdef VariantRecordSample sample = VariantRecordSample.__new__(VariantRecordSample)
-    sample.record = record
-    sample.index = sample_index
-
-    return sample
-
-
-########################################################################
-########################################################################
-## Index objects
-########################################################################
-
-
-cdef class BaseIndex(object):
-    def __init__(self):
-        self.refs = ()
-        self.refmap = {}
-
-    def __len__(self):
-        return len(self.refs)
-
-    def __bool__(self):
-        return len(self.refs) != 0
-
-    def __getitem__(self, key):
-        if isinstance(key, int):
-            return self.refs[key]
-        else:
-            return self.refmap[key]
-
-    def __iter__(self):
-        return iter(self.refs)
-
-    def get(self, key, default=None):
-        """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
-        try:
-            return self[key]
-        except KeyError:
-            return default
-
-    def __contains__(self, key):
-        try:
-            self[key]
-        except KeyError:
-            return False
-        else:
-            return True
-
-    def iterkeys(self):
-        """D.iterkeys() -> an iterator over the keys of D"""
-        return iter(self)
-
-    def itervalues(self):
-        """D.itervalues() -> an iterator over the values of D"""
-        for key in self:
-            yield self[key]
-
-    def iteritems(self):
-        """D.iteritems() -> an iterator over the (key, value) items of D"""
-        for key in self:
-            yield (key, self[key])
-
-    def keys(self):
-        """D.keys() -> list of D's keys"""
-        return list(self)
-
-    def items(self):
-        """D.items() -> list of D's (key, value) pairs, as 2-tuples"""
-        return list(self.iteritems())
-
-    def values(self):
-        """D.values() -> list of D's values"""
-        return list(self.itervalues())
-
-    # Mappings are not hashable by default, but subclasses can change this
-    __hash__ = None
-
-    #TODO: implement __richcmp__
-
-
-cdef class BCFIndex(object):
-    """CSI index data structure for BCF files"""
-    def __init__(self):
-        self.refs = ()
-        self.refmap = {}
-
-        if not self.ptr:
-            raise ValueError('Invalid index object')
-
-        cdef int n
-        cdef const char **refs = bcf_index_seqnames(self.ptr, self.header.ptr, &n)
-
-        if not refs:
-            raise ValueError('Cannot retrieve reference sequence names')
-
-        self.refs = char_array_to_tuple(refs, n, free_after=1)
-        self.refmap = { r:i for i,r in enumerate(self.refs) }
-
-    def __dealloc__(self):
-        if self.ptr:
-            hts_idx_destroy(self.ptr)
-            self.ptr = NULL
-
-    def fetch(self, bcf, contig, start, stop, region, reopen):
-        return BCFIterator(bcf, contig, start, stop, region, reopen)
-
-
-cdef BCFIndex makeBCFIndex(VariantHeader header, hts_idx_t *idx):
-    if not idx:
-        return None
-
-    if not header:
-        raise ValueError('invalid VariantHeader')
-
-    cdef BCFIndex index = BCFIndex.__new__(BCFIndex)
-    index.header = header
-    index.ptr = idx
-    index.__init__()
-
-    return index
-
-
-cdef class TabixIndex(BaseIndex):
-    """Tabix index data structure for VCF files"""
-    def __init__(self):
-        self.refs = ()
-        self.refmap = {}
-
-        if not self.ptr:
-            raise ValueError('Invalid index object')
-
-        cdef int n
-        cdef const char **refs = tbx_seqnames(self.ptr, &n)
-
-        if not refs:
-            raise ValueError('Cannot retrieve reference sequence names')
-
-        self.refs = char_array_to_tuple(refs, n, free_after=1)
-        self.refmap = { r:i for i,r in enumerate(self.refs) }
-
-    def __dealloc__(self):
-        if self.ptr:
-            tbx_destroy(self.ptr)
-            self.ptr = NULL
-
-    def fetch(self, bcf, contig, start, stop, region, reopen):
-        return TabixIterator(bcf, contig, start, stop, region, reopen)
-
-
-cdef TabixIndex makeTabixIndex(tbx_t *idx):
-    if not idx:
-        return None
-
-    cdef TabixIndex index = TabixIndex.__new__(TabixIndex)
-    index.ptr = idx
-    index.__init__()
-
-    return index
-
-
-########################################################################
-########################################################################
-## Iterators
-########################################################################
-
-
-cdef class BaseIterator(object):
-    pass
-
-
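The index classes above are what back random access on a VariantFile: fetch() looks up the contig in the index's refmap and dispatches to a BCFIterator or TabixIterator. A minimal usage sketch of that path (the file name, contig and coordinates are illustrative, and an accompanying .csi/.tbi index is assumed to exist)::

    import pysam

    vf = pysam.VariantFile("ex1.bcf")           # index is opened automatically
    for rec in vf.fetch("chr1", 10000, 20000):  # served by BCFIndex/BCFIterator
        print(rec.chrom, rec.pos, rec.alleles)
    vf.close()
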
-# Internal function to clean up after iteration stop or failure.
-# This would be a nested function if it weren't a cdef function.
-cdef void _stop_BCFIterator(BCFIterator self, bcf1_t *record):
-    bcf_destroy1(record)
-
-    # destroy iter so future calls to __next__ raise StopIteration
-    bcf_itr_destroy(self.iter)
-    self.iter = NULL
-
-
-cdef class BCFIterator(BaseIterator):
-    def __init__(self, VariantFile bcf, contig=None, start=None, stop=None, region=None, reopen=True):
-
-        if not isinstance(bcf.index, BCFIndex):
-            raise ValueError('bcf index required')
-
-        cdef BCFIndex index = bcf.index
-        cdef int rid, cstart, cstop
-        cdef char *cregion
-
-        if not index:
-            raise ValueError('bcf index required')
-
-        if reopen:
-            bcf = bcf.copy()
-
-        if region is not None:
-            if contig is not None or start is not None or stop is not None:
-                raise ValueError # FIXME
-
-            bregion = force_bytes(region)
-            cregion = bregion
-            with nogil:
-                self.iter = bcf_itr_querys(index.ptr, bcf.header.ptr, cregion)
-        else:
-            if contig is None:
-                raise ValueError # FIXME
-
-            try:
-                rid = index.refmap[contig]
-            except KeyError:
-                raise ValueError('Unknown contig specified')
-
-            if start is None:
-                start = 0
-            if stop is None:
-                stop = MAX_POS
-
-            cstart, cstop = start, stop
-
-            with nogil:
-                self.iter = bcf_itr_queryi(index.ptr, rid, cstart, cstop)
-
-        # Do not fail on self.iter == NULL, since it signifies a null query.
-
-        self.bcf = bcf
-        self.index = index
-
-    def __dealloc__(self):
-        if self.iter:
-            bcf_itr_destroy(self.iter)
-            self.iter = NULL
-
-    def __iter__(self):
-        return self
-
-    def __next__(self):
-        if not self.iter:
-            raise StopIteration
-
-        cdef bcf1_t *record = bcf_init1()
-
-        record.pos = -1
-        if self.bcf.drop_samples:
-            record.max_unpack = BCF_UN_SHR
-
-        cdef int ret
-
-        with nogil:
-            ret = bcf_itr_next(self.bcf.htsfile, self.iter, record)
-
-        if ret < 0:
-            _stop_BCFIterator(self, record)
-            if ret == -1:
-                raise StopIteration
-            else:
-                raise ValueError('error reading BCF file')
-
-        ret = bcf_subset_format(self.bcf.header.ptr, record)
-
-        if ret < 0:
-            _stop_BCFIterator(self, record)
-            raise ValueError('error in bcf_subset_format')
-
-        return makeVariantRecord(self.bcf.header, record)
-
-
-cdef class TabixIterator(BaseIterator):
-    def __cinit__(self, *args, **kwargs):
-        self.line_buffer.l = 0
-        self.line_buffer.m = 0
-        self.line_buffer.s = NULL
-
-    def __init__(self, VariantFile bcf, contig=None, start=None, stop=None, region=None, reopen=True):
-        if not isinstance(bcf.index, TabixIndex):
-            raise ValueError('tabix index required')
-
-        cdef TabixIndex index = bcf.index
-
-        if not index:
-            raise ValueError('bcf index required')
-
-        if reopen:
-            bcf = bcf.copy()
-
-        if region is not None:
-            if contig is not None or start is not None or stop is not None:
-                raise ValueError # FIXME
-
-            self.iter = tbx_itr_querys(index.ptr, region)
-        else:
-            if contig is None:
-                raise ValueError # FIXME
-
-            rid = index.refmap.get(contig, -1)
-
-            if start is None:
-                start = 0
-            if stop is None:
-                stop = MAX_POS
-
-            self.iter = tbx_itr_queryi(index.ptr, rid, start, stop)
-
-        # Do not fail on self.iter == NULL, since it signifies a null query.
- - self.bcf = bcf - self.index = index - - def __dealloc__(self): - if self.iter: - tbx_itr_destroy(self.iter) - self.iter = NULL - - if self.line_buffer.m: - free(self.line_buffer.s) - - self.line_buffer.l = 0 - self.line_buffer.m = 0 - self.line_buffer.s = NULL - - def __iter__(self): - return self - - def __next__(self): - if not self.iter: - raise StopIteration - - cdef int ret - - with nogil: - ret = tbx_itr_next(self.bcf.htsfile, self.index.ptr, self.iter, &self.line_buffer) - - if ret < 0: - tbx_itr_destroy(self.iter) - self.iter = NULL - if ret == -1: - raise StopIteration - else: - raise ValueError('error reading indexed VCF file') - - cdef bcf1_t *record = bcf_init1() - - record.pos = -1 - if self.bcf.drop_samples: - record.max_unpack = BCF_UN_SHR - - ret = vcf_parse1(&self.line_buffer, self.bcf.header.ptr, record) - - # FIXME: stop iteration on parse failure? - if ret < 0: - bcf_destroy1(record) - raise ValueError('error in vcf_parse') - - return makeVariantRecord(self.bcf.header, record) - - -######################################################################## -######################################################################## -## Variant File -######################################################################## - - -cdef class VariantFile(object): - """*(filename, mode=None, index_filename=None, header=None, drop_samples=False)* - - A :term:`VCF`/:term:`BCF` formatted file. The file is automatically - opened. - - *mode* should be ``r`` for reading or ``w`` for writing. The default is - text mode (:term:`VCF`). For binary (:term:`BCF`) I/O you should append - ``b`` for compressed or ``u`` for uncompressed :term:`BCF` output. - - If ``b`` is present, it must immediately follow ``r`` or ``w``. Valid - modes are ``r``, ``w``, ``wh``, ``rb``, ``wb``, ``wbu`` and ``wb0``. - For instance, to open a :term:`BCF` formatted file for reading, type:: - - f = pysam.VariantFile('ex1.bcf','rb') - - If mode is not specified, we will try to auto-detect in the order 'rb', - 'r', thus both the following should work:: - - f1 = pysam.VariantFile('ex1.bcf') - f2 = pysam.VariantFile('ex1.vcf') - - If an index for a variant file exists (.csi or .tbi), it will be opened - automatically. Without an index random access to records via - :meth:`fetch` is disabled. - - For writing, a :class:`VariantHeader` object must be provided, typically - obtained from another :term:`VCF` file/:term:`BCF` file. - """ - def __cinit__(self, *args, **kwargs): - self.htsfile = NULL - - def __init__(self, *args, **kwargs): - self.header = None - self.index = None - self.filename = None - self.mode = None - self.index_filename = None - self.is_stream = False - self.is_remote = False - self.is_reading = False - self.drop_samples = False - self.start_offset = -1 - - self.open(*args, **kwargs) - - def __dealloc__(self): - if self.htsfile: - hts_close(self.htsfile) - self.htsfile = NULL - - def __enter__(self): - return self - - def __exit__(self, exc_type, exc_value, traceback): - self.close() - return False - - property category: - """General file format category. One of UNKNOWN, ALIGNMENTS, - VARIANTS, INDEX, REGIONS""" - def __get__(self): - if not self.htsfile: - raise ValueError('metadata not available on closed file') - return FORMAT_CATEGORIES[self.htsfile.format.category] - - property format: - """File format. - - One of UNKNOWN, BINARY_FORMAT, TEXT_FORMAT, SAM, BAM, - BAI, CRAM, CRAI, VCF, BCF, CSI, GZI, TBI, BED. 
- """ - def __get__(self): - if not self.htsfile: - raise ValueError('metadata not available on closed file') - return FORMATS[self.htsfile.format.format] - - property version: - """Tuple of file format version numbers (major, minor)""" - def __get__(self): - if not self.htsfile: - raise ValueError('metadata not available on closed file') - return (self.htsfile.format.version.major, - self.htsfile.format.version.minor) - - property compression: - """File compression. - - One of NONE, GZIP, BGZF, CUSTOM.""" - def __get__(self): - if not self.htsfile: - raise ValueError('metadata not available on closed file') - return COMPRESSION[self.htsfile.format.compression] - - property description: - """Vaguely human readable description of the file format""" - def __get__(self): - if not self.htsfile: - raise ValueError('metadata not available on closed file') - cdef char *desc = hts_format_description(&self.htsfile.format) - try: - return charptr_to_str(desc) - finally: - free(desc) - - def close(self): - """closes the :class:`pysam.VariantFile`.""" - if self.htsfile: - hts_close(self.htsfile) - self.htsfile = NULL - self.header = self.index = None - - property is_open: - def __get__(self): - """return True if VariantFile is open and in a valid state.""" - return self.htsfile != NULL - - def __iter__(self): - if not self.is_open: - raise ValueError('I/O operation on closed file') - - if not self.mode.startswith(b'r'): - raise ValueError( - 'cannot iterate over Variantfile opened for writing') - - self.is_reading = 1 - return self - - def __next__(self): - cdef int ret - cdef bcf1_t *record = bcf_init1() - - record.pos = -1 - if self.drop_samples: - record.max_unpack = BCF_UN_SHR - - with nogil: - ret = bcf_read1(self.htsfile, self.header.ptr, record) - - if ret < 0: - bcf_destroy1(record) - if ret == -1: - raise StopIteration - elif ret == -2: - raise IOError('truncated file') - else: - raise ValueError('Variant read failed') - - return makeVariantRecord(self.header, record) - - def copy(self): - if not self.is_open: - raise ValueError - - cdef VariantFile vars = VariantFile.__new__(VariantFile) - cdef bcf_hdr_t *hdr - cdef char *cfilename - cdef char *cmode - - # FIXME: re-open using fd or else header and index could be invalid - cfilename, cmode = self.filename, self.mode - with nogil: - vars.htsfile = hts_open(cfilename, cmode) - - if not vars.htsfile: - raise ValueError('Cannot re-open htsfile') - - # minimize overhead by re-using header and index. This approach is - # currently risky, but see above for how this can be mitigated. - vars.header = self.header - vars.index = self.index - - vars.filename = self.filename - vars.mode = self.mode - vars.index_filename = self.index_filename - vars.drop_samples = self.drop_samples - vars.is_stream = self.is_stream - vars.is_remote = self.is_remote - vars.is_reading = self.is_reading - vars.start_offset = self.start_offset - - if self.htsfile.is_bin: - vars.seek(self.tell()) - else: - with nogil: - hdr = bcf_hdr_read(vars.htsfile) - makeVariantHeader(hdr) - - return vars - - def open(self, filename, mode='rb', - index_filename=None, - VariantHeader header=None, - drop_samples=False): - """open a vcf/bcf file. - - If open is called on an existing VariantFile, the current file will be - closed and a new file will be opened. 
- """ - cdef bcf_hdr_t *hdr - cdef BGZF *bgzfp - cdef hts_idx_t *idx - cdef tbx_t *tidx - cdef char *cfilename - cdef char *cindex_filename = NULL - cdef char *cmode - - # close a previously opened file - if self.is_open: - self.close() - - if mode not in ('r','w','rb','wb', 'wh', 'wbu', 'rU', 'wb0'): - raise ValueError('invalid file opening mode `{}`'.format(mode)) - - # for htslib, wbu seems to not work - if mode == 'wbu': - mode = 'wb0' - - self.mode = mode = force_bytes(mode) - self.filename = filename = encode_filename(filename) - if index_filename is not None: - self.index_filename = index_filename = encode_filename(index_filename) - else: - self.index_filename = None - self.drop_samples = bool(drop_samples) - self.header = None - - self.is_remote = hisremote(filename) - self.is_stream = filename == b'-' - - if mode.startswith(b'w'): - # open file for writing - if index_filename is not None: - raise ValueError('Cannot specify an index filename when writing a VCF/BCF file') - - # header structure (used for writing) - if header: - self.header = header.copy() - else: - raise ValueError('a VariantHeader must be specified') - - # open file. Header gets written to file at the same time - # for bam files and sam files (in the latter case, the - # mode needs to be wh) - cfilename, cmode = filename, mode - with nogil: - self.htsfile = hts_open(cfilename, cmode) - - if not self.htsfile: - raise ValueError("could not open file `{}` (mode='{}')".format((filename, mode))) - - with nogil: - bcf_hdr_write(self.htsfile, self.header.ptr) - - elif mode.startswith(b'r'): - # open file for reading - if filename != b'-' and not self.is_remote and not os.path.exists(filename): - raise IOError('file `{}` not found'.format(filename)) - - cfilename, cmode = filename, mode - with nogil: - self.htsfile = hts_open(cfilename, cmode) - - if not self.htsfile: - raise ValueError("could not open file `{}` (mode='{}') - is it VCF/BCF format?".format(filename, mode)) - - if self.htsfile.format.format not in (bcf, vcf): - raise ValueError("invalid file `{}` (mode='{}') - is it VCF/BCF format?".format(filename, mode)) - - if self.htsfile.format.compression == bgzf: - bgzfp = hts_get_bgzfp(self.htsfile) - if bgzfp and bgzf_check_EOF(bgzfp) == 0: - warn('[%s] Warning: no BGZF EOF marker; file may be truncated'.format(filename)) - - with nogil: - hdr = bcf_hdr_read(self.htsfile) - - try: - self.header = makeVariantHeader(hdr) - except ValueError: - raise ValueError("file `{}` does not have valid header (mode='{}') - is it VCF/BCF format?".format(filename, mode)) - - # check for index and open if present - if self.htsfile.format.format == bcf: - if index_filename is not None: - cindex_filename = index_filename - with nogil: - idx = bcf_index_load2(cfilename, cindex_filename) - self.index = makeBCFIndex(self.header, idx) - - elif self.htsfile.format.compression == bgzf: - if index_filename is not None: - cindex_filename = index_filename - with nogil: - tidx = tbx_index_load2(cfilename, cindex_filename) - self.index = makeTabixIndex(tidx) - - if not self.is_stream: - self.start_offset = self.tell() - else: - raise ValueError("unknown mode {}".format(mode)) - - def reset(self): - """reset file position to beginning of file just after the header.""" - return self.seek(self.start_offset, 0) - - def seek(self, uint64_t offset): - """move file pointer to position *offset*, see - :meth:`pysam.VariantFile.tell`.""" - if not self.is_open: - raise ValueError('I/O operation on closed file') - if self.is_stream: - raise OSError('seek 
not available in streams') - - cdef int64_t ret - if self.htsfile.format.compression != no_compression: - with nogil: - ret = bgzf_seek(hts_get_bgzfp(self.htsfile), offset, SEEK_SET) - else: - with nogil: - ret = hts_useek(self.htsfile, offset, SEEK_SET) - return ret - - def tell(self): - """return current file position, see :meth:`pysam.VariantFile.seek`.""" - if not self.is_open: - raise ValueError('I/O operation on closed file') - if self.is_stream: - raise OSError('tell not available in streams') - - cdef int64_t ret - if self.htsfile.format.compression != no_compression: - with nogil: - ret = bgzf_tell(hts_get_bgzfp(self.htsfile)) - else: - with nogil: - ret = hts_utell(self.htsfile) - return ret - - def fetch(self, contig=None, start=None, stop=None, region=None, reopen=False): - """fetch records in a :term:`region` using 0-based indexing. The - region is specified by :term:`contig`, *start* and *end*. - Alternatively, a samtools :term:`region` string can be supplied. - - Without *contig* or *region* all mapped records will be fetched. The - records will be returned ordered by contig, which will not necessarily - be the order within the file. - - Set *reopen* to true if you will be using multiple iterators on the - same file at the same time. The iterator returned will receive its - own copy of a filehandle to the file effectively re-opening the - file. Re-opening a file incurrs some overhead, so use with care. - - If only *contig* is set, all records on *contig* will be fetched. - If both *region* and *contig* are given, an exception is raised. - - Note that a bgzipped :term:`VCF`.gz file without a tabix/CSI index - (.tbi/.csi) or a :term:`BCF` file without a CSI index can only be - read sequentially. - """ - if not self.is_open: - raise ValueError('I/O operation on closed file') - - if not self.mode.startswith(b'r'): - raise ValueError('cannot fetch from Variantfile opened ' - 'for writing') - - if contig is None and region is None: - self.is_reading = 1 - bcf = self.copy() if reopen else self - bcf.seek(self.start_offset) - return iter(bcf) - - if not self.index: - raise ValueError('fetch requires an index') - - self.is_reading = 1 - return self.index.fetch(self, contig, start, stop, region, reopen) - - cpdef int write(self, VariantRecord record) except -1: - """ - write a single :class:`pysam.VariantRecord` to disk. - - returns the number of bytes written. - """ - if not self.is_open: - return ValueError('I/O operation on closed file') - - if not self.mode.startswith(b'w'): - raise ValueError('cannot write to a Variantfile opened for reading') - - #if record.header is not self.header: - # raise ValueError('Writing records from a different VariantFile is not yet supported') - - cdef int ret - - with nogil: - ret = bcf_write1(self.htsfile, self.header.ptr, record.ptr) - - if ret < 0: - raise ValueError('write failed') - - return ret - - def subset_samples(self, include_samples): - """ - Read only a subset of samples to reduce processing time and memory. - Must be called prior to retrieving records. 
- """ - if not self.is_open: - raise ValueError('I/O operation on closed file') - - if not self.mode.startswith(b'r'): - raise ValueError('cannot subset samples from Variantfile ' - 'opened for writing') - - if self.is_reading: - raise ValueError('cannot subset samples after fetching records') - - self.header._subset_samples(include_samples) - - # potentially unnecessary optimization that also sets max_unpack - if not include_samples: - self.drop_samples = True diff --git a/pysam/cfaidx.pxd b/pysam/cfaidx.pxd deleted file mode 100644 index 7749274..0000000 --- a/pysam/cfaidx.pxd +++ /dev/null @@ -1,79 +0,0 @@ -from libc.stdint cimport int8_t, int16_t, int32_t, int64_t -from libc.stdint cimport uint8_t, uint16_t, uint32_t, uint64_t -from libc.stdlib cimport malloc, calloc, realloc, free -from libc.string cimport memcpy, memcmp, strncpy, strlen, strdup -from libc.stdio cimport FILE, printf -cimport cython - -from cpython cimport array -from pysam.chtslib cimport faidx_t, kstring_t, BGZF - -# These functions are put here and not in chtslib.pxd in order -# to avoid warnings for unused functions. -cdef extern from "pysam_stream.h" nogil: - - ctypedef struct kstream_t: - pass - - ctypedef struct kseq_t: - kstring_t name - kstring_t comment - kstring_t seq - kstring_t qual - - kseq_t *kseq_init(BGZF *) - int kseq_read(kseq_t *) - void kseq_destroy(kseq_t *) - kstream_t *ks_init(BGZF *) - void ks_destroy(kstream_t *) - - # Retrieve characters from stream until delimiter - # is reached placing results in str. - int ks_getuntil(kstream_t *, - int delimiter, - kstring_t * str, - int * dret) - -cdef class FastaFile: - cdef bint is_remote - cdef object _filename, _references, _lengths, reference2length - cdef faidx_t* fastafile - cdef char* _fetch(self, char* reference, - int start, int end, int* length) - - -cdef class FastqProxy: - cdef kseq_t * _delegate - cdef cython.str tostring(self) - cpdef array.array get_quality_array(self, int offset=*) - - -cdef class PersistentFastqProxy: - """ - Python container for pysam.cfaidx.FastqProxy with persistence. - """ - cdef public str comment, quality, sequence, name - cdef cython.str tostring(self) - cpdef array.array get_quality_array(self, int offset=*) - - -cdef class FastxFile: - cdef object _filename - cdef BGZF * fastqfile - cdef kseq_t * entry - cdef bint persist - cdef bint is_remote - - cdef kseq_t * getCurrent(self) - cdef int cnext(self) - - -# Compatibility Layer for pysam 0.8.1 -cdef class FastqFile(FastxFile): - pass - - -# Compatibility Layer for pysam < 0.8 -cdef class Fastafile(FastaFile): - pass - diff --git a/pysam/cfaidx.pyx b/pysam/cfaidx.pyx deleted file mode 100644 index 78f9aac..0000000 --- a/pysam/cfaidx.pyx +++ /dev/null @@ -1,571 +0,0 @@ -# cython: embedsignature=True -# cython: profile=True -############################################################################### -############################################################################### -# Cython wrapper for SAM/BAM/CRAM files based on htslib -############################################################################### -# The principal classes defined in this module are: -# -# class FastaFile random read read/write access to faidx indexd files -# class FastxFile streamed read/write access to fasta/fastq files -# -# Additionally this module defines several additional classes that are part -# of the internal API. 
These are:
-#
-# class FastqProxy
-# class PersistentFastqProxy
-#
-# For backwards compatibility, the following classes are also defined:
-#
-# class Fastafile equivalent to FastaFile
-# class FastqFile equivalent to FastxFile
-#
-###############################################################################
-#
-# The MIT License
-#
-# Copyright (c) 2015 Andreas Heger
-#
-# Permission is hereby granted, free of charge, to any person obtaining a
-# copy of this software and associated documentation files (the "Software"),
-# to deal in the Software without restriction, including without limitation
-# the rights to use, copy, modify, merge, publish, distribute, sublicense,
-# and/or sell copies of the Software, and to permit persons to whom the
-# Software is furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-# DEALINGS IN THE SOFTWARE.
-#
-###############################################################################
-import sys
-import os
-import re
-from cpython cimport array
-
-from cpython cimport PyErr_SetString, \
-    PyBytes_Check, \
-    PyUnicode_Check, \
-    PyBytes_FromStringAndSize
-
-from cpython.version cimport PY_MAJOR_VERSION
-
-from pysam.chtslib cimport \
-    faidx_nseq, fai_load, fai_destroy, fai_fetch, \
-    faidx_seq_len, \
-    faidx_fetch_seq, hisremote, \
-    bgzf_open, bgzf_close
-
-from pysam.cutils cimport force_bytes, force_str, charptr_to_str
-from pysam.cutils cimport encode_filename, from_string_and_size
-from pysam.cutils cimport qualitystring_to_array, parse_region
-
-cdef class FastqProxy
-cdef makeFastqProxy(kseq_t * src):
-    '''enter src into FastqProxy.'''
-    cdef FastqProxy dest = FastqProxy.__new__(FastqProxy)
-    dest._delegate = src
-    return dest
-
-## TODO:
-## add automatic indexing.
-## add function to get sequence names.
-cdef class FastaFile:
-    """Random access to fasta formatted files that
-    have been indexed by :term:`faidx`.
-
-    The file is automatically opened. The index file of file
-    ``<filename>`` is expected to be called ``<filename>.fai``.
-
-    Parameters
-    ----------
-
-    filename : string
-        Filename of fasta file to be opened.
-
-    filepath_index : string
-        Optional, filename of the index. By default this is
-        the filename + ".fai".
-
-    Raises
-    ------
-
-    ValueError
-        if index file is missing
-
-    IOError
-        if file could not be opened
-
-    """
-
-    def __cinit__(self, *args, **kwargs):
-        self.fastafile = NULL
-        self._filename = None
-        self._references = None
-        self._lengths = None
-        self.reference2length = None
-        self._open(*args, **kwargs)
-
-    def is_open(self):
-        '''return true if samfile has been opened.'''
-        return self.fastafile != NULL
-
-    def __len__(self):
-        if self.fastafile == NULL:
-            raise ValueError("calling len() on closed file")
-
-        return faidx_nseq(self.fastafile)
-
-    def _open(self, filename, filepath_index=None):
-        '''open an indexed fasta file.
-
-        This method expects an indexed fasta file.
- ''' - - # close a previously opened file - if self.fastafile != NULL: - self.close() - - self._filename = encode_filename(filename) - cdef char *cfilename = self._filename - self.is_remote = hisremote(cfilename) - - if filepath_index is not None: - raise NotImplementedError( - "setting an explicit path for the index " - "is not implemented") - - # open file for reading - if (self._filename != b"-" - and not self.is_remote - and not os.path.exists(filename)): - raise IOError("file `%s` not found" % filename) - - with nogil: - self.fastafile = fai_load(cfilename) - - if self.fastafile == NULL: - raise IOError("could not open file `%s`" % filename) - - if self.is_remote: - filepath_index = os.path.basename( - re.sub("[^:]+:[/]*", "", filename)) + ".fai" - elif filepath_index is None: - filepath_index = filename + ".fai" - - if not os.path.exists(filepath_index): - raise ValueError("could not locate index file {}".format( - filepath_index)) - - with open(filepath_index) as inf: - data = [x.split("\t") for x in inf] - self._references = tuple(x[0] for x in data) - self._lengths = tuple(int(x[1]) for x in data) - self.reference2length = dict(zip(self._references, self._lengths)) - - def close(self): - """close the file.""" - if self.fastafile != NULL: - fai_destroy(self.fastafile) - self.fastafile = NULL - - def __dealloc__(self): - if self.fastafile != NULL: - fai_destroy(self.fastafile) - self.fastafile = NULL - - # context manager interface - def __enter__(self): - return self - - def __exit__(self, exc_type, exc_value, traceback): - self.close() - return False - - property closed: - """"bool indicating the current state of the file object. - This is a read-only attribute; the close() method changes the value. - """ - def __get__(self): - return not self.is_open() - - property filename: - """filename associated with this object. This is a read-only attribute.""" - def __get__(self): - return self._filename - - property references: - '''tuple with the names of :term:`reference` sequences.''' - def __get__(self): - return self._references - - property nreferences: - """"int with the number of :term:`reference` sequences in the file. - This is a read-only attribute.""" - def __get__(self): - return len(self._references) if self.references else None - - property lengths: - """tuple with the lengths of :term:`reference` sequences.""" - def __get__(self): - return self._lengths - - def fetch(self, - reference=None, - start=None, - end=None, - region=None): - """fetch sequences in a :term:`region`. - - A region can - either be specified by :term:`reference`, `start` and - `end`. `start` and `end` denote 0-based, half-open - intervals. - - Alternatively, a samtools :term:`region` string can be - supplied. - - If any of the coordinates are missing they will be replaced by the - minimum (`start`) or maximum (`end`) coordinate. - - Note that region strings are 1-based, while `start` and `end` denote - an interval in python coordinates. - The region is specified by :term:`reference`, `start` and `end`. - - Returns - ------- - - string : a string with the sequence specified by the region. 
- - Raises - ------ - - IndexError - if the coordinates are out of range - - ValueError - if the region is invalid - - """ - - if not self.is_open(): - raise ValueError("I/O operation on closed file" ) - - cdef int length - cdef char *seq - cdef char *ref - cdef int rstart, rend - - reference, rstart, rend = parse_region(reference, start, end, region) - - if reference is None: - raise ValueError("no sequence/region supplied.") - - if rstart == rend: - return "" - - ref = reference - with nogil: - length = faidx_seq_len(self.fastafile, ref) - if length == -1: - raise KeyError("sequence '%s' not present" % reference) - if rstart >= length: - return "" - - # fai_fetch adds a '\0' at the end - with nogil: - seq = faidx_fetch_seq(self.fastafile, - ref, - rstart, - rend-1, - &length) - - if seq == NULL: - raise ValueError( - "failure when retrieving sequence on '%s'" % reference) - - try: - return charptr_to_str(seq) - finally: - free(seq) - - cdef char * _fetch(self, char * reference, int start, int end, int * length): - '''fetch sequence for reference, start and end''' - - with nogil: - return faidx_fetch_seq(self.fastafile, - reference, - start, - end-1, - length) - - def get_reference_length(self, reference): - '''return the length of reference.''' - return self.reference2length[reference] - - def __getitem__(self, reference): - return self.fetch(reference) - - def __contains__(self, reference): - '''return true if reference in fasta file.''' - return reference in self.reference2length - - -cdef class FastqProxy: - """A single entry in a fastq file.""" - def __init__(self): pass - - property name: - """The name of each entry in the fastq file.""" - def __get__(self): - return charptr_to_str(self._delegate.name.s) - - property sequence: - """The sequence of each entry in the fastq file.""" - def __get__(self): - return charptr_to_str(self._delegate.seq.s) - - property comment: - def __get__(self): - if self._delegate.comment.l: - return charptr_to_str(self._delegate.comment.s) - else: - return None - - property quality: - """The quality score of each entry in the fastq file, represented as a string.""" - def __get__(self): - if self._delegate.qual.l: - return charptr_to_str(self._delegate.qual.s) - else: - return None - - cdef cython.str tostring(self): - if self.comment is None: - comment = "" - else: - comment = " %s" % self.comment - - if self.quality is None: - return ">%s%s\n%s" % (self.name, comment, self.sequence) - else: - return "@%s%s\n%s\n+\n%s" % (self.name, comment, - self.sequence, self.quality) - - def __str__(self): - return self.tostring() - - cpdef array.array get_quality_array(self, int offset=33): - '''return quality values as integer array after subtracting offset.''' - if self.quality is None: - return None - return qualitystring_to_array(force_bytes(self.quality), - offset=offset) - -cdef class PersistentFastqProxy: - """ - Python container for pysam.cfaidx.FastqProxy with persistence. - Needed to compare multiple fastq records from the same file. 
- """ - def __init__(self, FastqProxy FastqRead): - self.comment = FastqRead.comment - self.quality = FastqRead.quality - self.sequence = FastqRead.sequence - self.name = FastqRead.name - - cdef cython.str tostring(self): - if self.comment is None: - comment = "" - else: - comment = " %s" % self.comment - - if self.quality is None: - return ">%s%s\n%s" % (self.name, comment, self.sequence) - else: - return "@%s%s\n%s\n+\n%s" % (self.name, comment, - self.sequence, self.quality) - - def __str__(self): - return self.tostring() - - cpdef array.array get_quality_array(self, int offset=33): - '''return quality values as array after subtracting offset.''' - if self.quality is None: - return None - return qualitystring_to_array(force_bytes(self.quality), - offset=offset) - - -cdef class FastxFile: - """Stream access to :term:`fasta` or :term:`fastq` formatted files. - - The file is automatically opened. - - Entries in the file can be both fastq or fasta formatted or even a - mixture of the two. - - This file object permits iterating over all entries in the - file. Random access is not implemented. The iteration returns - objects of type :class:`FastqProxy` - - Parameters - ---------- - - filename : string - Filename of fasta/fastq file to be opened. - - persist : bool - - If True (default) make a copy of the entry in the file during - iteration. If set to False, no copy will be made. This will - permit faster iteration, but an entry will not persist when - the iteration continues. - - Notes - ----- - Prior to version 0.8.2, this was called FastqFile. - - Raises - ------ - - IOError - if file could not be opened - - - Examples - -------- - >>> with pysam.FastxFile(filename) as fh: - ... for entry in fh: - ... print(entry.name) - ... print(entry.sequence) - ... print(entry.comment) - ... print(entry.quality) - - """ - def __cinit__(self, *args, **kwargs): - # self.fastqfile = NULL - self._filename = None - self.entry = NULL - self._open(*args, **kwargs) - - def is_open(self): - '''return true if samfile has been opened.''' - return self.entry != NULL - - def _open(self, filename, persist=True): - '''open a fastq/fasta file in *filename* - - Paramentes - ---------- - - persist : bool - - if True return a copy of the underlying data (default - True). The copy will persist even if the iteration - on the file continues. - - ''' - if self.fastqfile != NULL: - self.close() - - self._filename = encode_filename(filename) - cdef char *cfilename = self._filename - self.is_remote = hisremote(cfilename) - - # open file for reading - if (self._filename != b"-" - and not self.is_remote - and not os.path.exists(filename)): - raise IOError("file `%s` not found" % filename) - - self.persist = persist - - with nogil: - self.fastqfile = bgzf_open(cfilename, "r") - self.entry = kseq_init(self.fastqfile) - self._filename = filename - - def close(self): - '''close the file.''' - if self.fastqfile != NULL: - bgzf_close(self.fastqfile) - self.fastqfile = NULL - if self.entry != NULL: - kseq_destroy(self.entry) - self.entry = NULL - - def __dealloc__(self): - if self.fastqfile != NULL: - bgzf_close(self.fastqfile) - if self.entry: - kseq_destroy(self.entry) - - # context manager interface - def __enter__(self): - return self - - def __exit__(self, exc_type, exc_value, traceback): - self.close() - return False - - property closed: - """"bool indicating the current state of the file object. - This is a read-only attribute; the close() method changes the value. 
- """ - def __get__(self): - return not self.is_open() - - property filename: - """string with the filename associated with this object.""" - def __get__(self): - return self._filename - - def __iter__(self): - if not self.is_open(): - raise ValueError("I/O operation on closed file") - return self - - cdef kseq_t * getCurrent(self): - return self.entry - - cdef int cnext(self): - '''C version of iterator - ''' - with nogil: - return kseq_read(self.entry) - - def __next__(self): - """ - python version of next(). - """ - cdef int l - with nogil: - l = kseq_read(self.entry) - if (l >= 0): - if self.persist: - return PersistentFastqProxy(makeFastqProxy(self.entry)) - return makeFastqProxy(self.entry) - else: - raise StopIteration - -# Compatibility Layer for pysam 0.8.1 -cdef class FastqFile(FastxFile): - """FastqFile is deprecated: use FastxFile instead""" - pass - -# Compatibility Layer for pysam < 0.8 -cdef class Fastafile(FastaFile): - """Fastafile is deprecated: use FastaFile instead""" - pass - -__all__ = ["FastaFile", - "FastqFile", - "FastxFile", - "Fastafile"] diff --git a/pysam/chtslib.pxd b/pysam/chtslib.pxd deleted file mode 100644 index 33c1559..0000000 --- a/pysam/chtslib.pxd +++ /dev/null @@ -1,1898 +0,0 @@ -from libc.stdint cimport int8_t, int16_t, int32_t, int64_t -from libc.stdint cimport uint8_t, uint16_t, uint32_t, uint64_t -from libc.stdlib cimport malloc, calloc, realloc, free -from libc.string cimport memcpy, memcmp, strncpy, strlen, strdup -from libc.stdio cimport FILE, printf -from posix.types cimport off_t - -cdef extern from "Python.h": - FILE* PyFile_AsFile(object) - - -cdef extern from "htslib/kstring.h" nogil: - ctypedef struct kstring_t: - size_t l, m - char *s - - -cdef extern from "htslib_util.h" nogil: - int hts_set_verbosity(int verbosity) - int hts_get_verbosity() - - ctypedef uint32_t khint32_t - ctypedef uint32_t khint_t - ctypedef khint_t khiter_t - - # Used to manage BCF Header info - ctypedef struct vdict_t: - khint_t n_buckets, size, n_occupied, upper_bound - khint32_t *flags - const char *keys - bcf_idinfo_t *vals - - # Used to manage indexed contigs in Tabix - ctypedef struct s2i_t: - khint_t n_buckets, size, n_occupied, upper_bound - khint32_t *flags - const char *keys - int64_t *vals - - # Generic khash methods - khint_t kh_size(void *d) - khint_t kh_begin(void *d) - khint_t kh_end(void *d) - int kh_exist(void *d, khiter_t i) - - # Specialized khash methods for vdict - khint_t kh_get_vdict(vdict_t *d, const char *key) - const char *kh_key_vdict "kh_key" (vdict_t *d, khint_t i) - bcf_idinfo_t kh_val_vdict "kh_val" (vdict_t *d, khint_t i) - - -cdef extern from "htslib/hfile.h" nogil: - ctypedef struct hFILE - - # @abstract Open the named file or URL as a stream - # @return An hFILE pointer, or NULL (with errno set) if an error occurred. - hFILE *hopen(const char *filename, const char *mode) - - # @abstract Associate a stream with an existing open file descriptor - # @return An hFILE pointer, or NULL (with errno set) if an error occurred. - # @notes For socket descriptors (on Windows), mode should contain 's'. - hFILE *hdopen(int fd, const char *mode) - - # @abstract Report whether the file name or URL denotes remote storage - # @return 0 if local, 1 if remote. - # @notes "Remote" means involving e.g. explicit network access, with the - # implication that callers may wish to cache such files' contents locally. 
- int hisremote(const char *filename) - - # @abstract Flush (for output streams) and close the stream - # @return 0 if successful, or EOF (with errno set) if an error occurred. - int hclose(hFILE *fp) - - # @abstract Close the stream, without flushing or propagating errors - # @notes For use while cleaning up after an error only. Preserves errno. - void hclose_abruptly(hFILE *fp) - - # @abstract Return the stream's error indicator - # @return Non-zero (in fact, an errno value) if an error has occurred. - # @notes This would be called herror() and return true/false to parallel - # ferror(3), but a networking-related herror(3) function already exists. */ - int herrno(hFILE *fp) - - # @abstract Clear the stream's error indicator - void hclearerr(hFILE *fp) - - # @abstract Reposition the read/write stream offset - # @return The resulting offset within the stream (as per lseek(2)), - # or negative if an error occurred. - off_t hseek(hFILE *fp, off_t offset, int whence) - - # @abstract Report the current stream offset - # @return The offset within the stream, starting from zero. - off_t htell(hFILE *fp) - - # @abstract Read one character from the stream - # @return The character read, or EOF on end-of-file or error - int hgetc(hFILE *fp) - - # @abstract Peek at characters to be read without removing them from buffers - # @param fp The file stream - # @param buffer The buffer to which the peeked bytes will be written - # @param nbytes The number of bytes to peek at; limited by the size of the - # internal buffer, which could be as small as 4K. - # @return The number of bytes peeked, which may be less than nbytes if EOF - # is encountered; or negative, if there was an I/O error. - # @notes The characters peeked at remain in the stream's internal buffer, - # and will be returned by later hread() etc calls. - ssize_t hpeek(hFILE *fp, void *buffer, size_t nbytes) - - # @abstract Read a block of characters from the file - # @return The number of bytes read, or negative if an error occurred. - # @notes The full nbytes requested will be returned, except as limited - # by EOF or I/O errors. - ssize_t hread(hFILE *fp, void *buffer, size_t nbytes) - - # @abstract Write a character to the stream - # @return The character written, or EOF if an error occurred. - int hputc(int c, hFILE *fp) - - # @abstract Write a string to the stream - # @return 0 if successful, or EOF if an error occurred. - int hputs(const char *text, hFILE *fp) - - # @abstract Write a block of characters to the file - # @return Either nbytes, or negative if an error occurred. - # @notes In the absence of I/O errors, the full nbytes will be written. - ssize_t hwrite(hFILE *fp, const void *buffer, size_t nbytes) - - # @abstract For writing streams, flush buffered output to the underlying stream - # @return 0 if successful, or EOF if an error occurred. - int hflush(hFILE *fp) - - -cdef extern from "htslib/bgzf.h" nogil: - ctypedef struct bgzf_mtaux_t - ctypedef struct bgzidx_t - ctypedef struct z_stream - - ctypedef struct BGZF: - unsigned errcode - unsigned is_write - int is_be - int compress_level - int is_compressed - int is_gzip - int cache_size - int64_t block_address - int64_t uncompressed_address - void *uncompressed_block - void *compressed_block - void *cache - hFILE *fp - bgzf_mtaux_t *mt - bgzidx_t *idx - int idx_build_otf - z_stream *gz_stream - - #***************** - # Basic routines * - # *****************/ - - # Open an existing file descriptor for reading or writing. 
- # - # @param fd file descriptor - # @param mode mode matching /[rwag][u0-9]+/: 'r' for reading, 'w' for - # writing, 'a' for appending, 'g' for gzip rather than BGZF - # compression (with 'w' only), and digit specifies the zlib - # compression level. - # Note that there is a distinction between 'u' and '0': the - # first yields plain uncompressed output whereas the latter - # outputs uncompressed data wrapped in the zlib format. - # @return BGZF file handler; 0 on error - - BGZF* bgzf_dopen(int fd, const char *mode) - BGZF* bgzf_fdopen(int fd, const char *mode) # for backward compatibility - - # Open the specified file for reading or writing. - BGZF* bgzf_open(const char* path, const char *mode) - - # Open an existing hFILE stream for reading or writing. - BGZF* bgzf_hopen(hFILE *fp, const char *mode) - - # Close the BGZF and free all associated resources. - # - # @param fp BGZF file handler - # @return 0 on success and -1 on error - int bgzf_close(BGZF *fp) - - # Read up to _length_ bytes from the file storing into _data_. - # - # @param fp BGZF file handler - # @param data data array to read into - # @param length size of data to read - # @return number of bytes actually read; 0 on end-of-file and -1 on error - ssize_t bgzf_read(BGZF *fp, void *data, size_t length) - - # Write _length_ bytes from _data_ to the file. If no I/O errors occur, - # the complete _length_ bytes will be written (or queued for writing). - # - # @param fp BGZF file handler - # @param data data array to write - # @param length size of data to write - # @return number of bytes written (i.e., _length_); negative on error - ssize_t bgzf_write(BGZF *fp, const void *data, size_t length) - - # Read up to _length_ bytes directly from the underlying stream without - # decompressing. Bypasses BGZF blocking, so must be used with care in - # specialised circumstances only. - # - # @param fp BGZF file handler - # @param data data array to read into - # @param length number of raw bytes to read - # @return number of bytes actually read; 0 on end-of-file and -1 on error - ssize_t bgzf_raw_read(BGZF *fp, void *data, size_t length) - - # Write _length_ bytes directly to the underlying stream without - # compressing. Bypasses BGZF blocking, so must be used with care - # in specialised circumstances only. - # - # @param fp BGZF file handler - # @param data data array to write - # @param length number of raw bytes to write - # @return number of bytes actually written; -1 on error - ssize_t bgzf_raw_write(BGZF *fp, const void *data, size_t length) - - # Write the data in the buffer to the file. - int bgzf_flush(BGZF *fp) - - int SEEK_SET - - # Return a virtual file pointer to the current location in the file. - # No interpetation of the value should be made, other than a subsequent - # call to bgzf_seek can be used to position the file at the same point. - # Return value is non-negative on success. - int64_t bgzf_tell(BGZF *fp) - - # Set the file to read from the location specified by _pos_. 
- # - # @param fp BGZF file handler - # @param pos virtual file offset returned by bgzf_tell() - # @param whence must be SEEK_SET - # @return 0 on success and -1 on error - # / - int64_t bgzf_seek(BGZF *fp, int64_t pos, int whence) - - # Check if the BGZF end-of-file (EOF) marker is present - # - # @param fp BGZF file handler opened for reading - # @return 1 if the EOF marker is present and correct - # 2 if it can't be checked, e.g., because fp isn't seekable - # 0 if the EOF marker is absent - # -1 (with errno set) on error - int bgzf_check_EOF(BGZF *fp) - - # Check if a file is in the BGZF format - # - # @param fn file name - # @return 1 if _fn_ is BGZF; 0 if not or on I/O error - int bgzf_is_bgzf(const char *fn) - - #********************* - # Advanced routines * - #********************* - - # Set the cache size. Only effective when compiled with -DBGZF_CACHE. - # - # @param fp BGZF file handler - # @param size size of cache in bytes; 0 to disable caching (default) - void bgzf_set_cache_size(BGZF *fp, int size) - - # Flush the file if the remaining buffer size is smaller than _size_ - # @return 0 if flushing succeeded or was not needed; negative on error - int bgzf_flush_try(BGZF *fp, ssize_t size) - - # Read one byte from a BGZF file. It is faster than bgzf_read() - # @param fp BGZF file handler - # @return byte read; -1 on end-of-file or error - int bgzf_getc(BGZF *fp) - - # Read one line from a BGZF file. It is faster than bgzf_getc() - # - # @param fp BGZF file handler - # @param delim delimitor - # @param str string to write to; must be initialized - # @return length of the string; 0 on end-of-file; negative on error - int bgzf_getline(BGZF *fp, int delim, kstring_t *str) - - # Read the next BGZF block. - int bgzf_read_block(BGZF *fp) - - # Enable multi-threading (only effective on writing and when the - # library was compiled with -DBGZF_MT) - # - # @param fp BGZF file handler; must be opened for writing - # @param n_threads #threads used for writing - # @param n_sub_blks #blocks processed by each thread; a value 64-256 is recommended - int bgzf_mt(BGZF *fp, int n_threads, int n_sub_blks) - - - # Compress a single BGZF block. - # - # @param dst output buffer (must have size >= BGZF_MAX_BLOCK_SIZE) - # @param dlen size of output buffer; updated on return to the number - # of bytes actually written to dst - # @param src buffer to be compressed - # @param slen size of data to compress (must be <= BGZF_BLOCK_SIZE) - # @param level compression level - # @return 0 on success and negative on error - # - int bgzf_compress(void *dst, size_t *dlen, const void *src, size_t slen, int level) - - #******************* - # bgzidx routines * - # BGZF at the uncompressed offset - # - # @param fp BGZF file handler; must be opened for reading - # @param uoffset file offset in the uncompressed data - # @param where SEEK_SET supported atm - # - # Returns 0 on success and -1 on error. - int bgzf_useek(BGZF *fp, long uoffset, int where) - - # Position in uncompressed BGZF - # - # @param fp BGZF file handler; must be opened for reading - # - # Returns the current offset on success and -1 on error. - long bgzf_utell(BGZF *fp) - - # Tell BGZF to build index while compressing. - # - # @param fp BGZF file handler; can be opened for reading or writing. - # - # Returns 0 on success and -1 on error. 
- int bgzf_index_build_init(BGZF *fp) - - # Load BGZF index - # - # @param fp BGZF file handler - # @param bname base name - # @param suffix suffix to add to bname (can be NULL) - # - # Returns 0 on success and -1 on error. - int bgzf_index_load(BGZF *fp, const char *bname, const char *suffix) - - # Save BGZF index - # - # @param fp BGZF file handler - # @param bname base name - # @param suffix suffix to add to bname (can be NULL) - # - # Returns 0 on success and -1 on error. - int bgzf_index_dump(BGZF *fp, const char *bname, const char *suffix) - - -cdef extern from "htslib/hts.h" nogil: - uint32_t kroundup32(uint32_t x) - - ctypedef struct cram_fd - - union FilePointerUnion: - BGZF *bgzf - cram_fd *cram - hFILE *hfile - void *voidp - - enum htsFormatCategory: - unknown_category - sequence_data # Sequence data -- SAM, BAM, CRAM, etc - variant_data # Variant calling data -- VCF, BCF, etc - index_file # Index file associated with some data file - region_list # Coordinate intervals or regions -- BED, etc - category_maximum - - enum htsExactFormat: - unknown_format - binary_format - text_format - sam, bam, bai, cram, crai, vcf, bcf, csi, gzi, tbi, bed - format_maximum - - enum htsCompression: - no_compression, gzip, bgzf, custom - compression_maximum - - enum hts_fmt_option: - CRAM_OPT_DECODE_MD, - CRAM_OPT_PREFIX, - CRAM_OPT_VERBOSITY, - CRAM_OPT_SEQS_PER_SLICE, - CRAM_OPT_SLICES_PER_CONTAINER, - CRAM_OPT_RANGE, - CRAM_OPT_VERSION, - CRAM_OPT_EMBED_REF, - CRAM_OPT_IGNORE_MD5, - CRAM_OPT_REFERENCE, - CRAM_OPT_MULTI_SEQ_PER_SLICE, - CRAM_OPT_NO_REF, - CRAM_OPT_USE_BZIP2, - CRAM_OPT_SHARED_REF, - CRAM_OPT_NTHREADS, - CRAM_OPT_THREAD_POOL, - CRAM_OPT_USE_LZMA, - CRAM_OPT_USE_RANS, - CRAM_OPT_REQUIRED_FIELDS, - HTS_OPT_COMPRESSION_LEVEL, - HTS_OPT_NTHREADS, - - ctypedef struct htsVersion: - short major, minor - - ctypedef struct htsFormat: - htsFormatCategory category - htsExactFormat format - htsVersion version - htsCompression compression - short compression_level - void *specific - - ctypedef struct htsFile: - uint8_t is_bin - uint8_t is_write - uint8_t is_be - uint8_t is_cram - int64_t lineno - kstring_t line - char *fn - char *fn_aux - FilePointerUnion fp - htsFormat format - - int hts_verbose - - # @abstract Table for converting a nucleotide character to 4-bit encoding. - # The input character may be either an IUPAC ambiguity code, '=' for 0, or - # '0'/'1'/'2'/'3' for a result of 1/2/4/8. The result is encoded as 1/2/4/8 - # for A/C/G/T or combinations of these bits for ambiguous bases. - const unsigned char *seq_nt16_table - - # @abstract Table for converting a 4-bit encoded nucleotide to an IUPAC - # ambiguity code letter (or '=' when given 0). - const char *seq_nt16_str - - # @abstract Table for converting a 4-bit encoded nucleotide to about 2 bits. - # Returns 0/1/2/3 for 1/2/4/8 (i.e., A/C/G/T), or 4 otherwise (0 or ambiguous). - const int *seq_nt16_int - - # @abstract Get the htslib version number - # @return For released versions, a string like "N.N[.N]"; or git describe - # output if using a library built within a Git repository. - const char *hts_version() - - # @abstract Determine format by peeking at the start of a file - # @param fp File opened for reading, positioned at the beginning - # @param fmt Format structure that will be filled out on return - # @return 0 for success, or negative if an error occurred. 
-    int hts_detect_format(hFILE *fp, htsFormat *fmt)
-
-    # @abstract Get a human-readable description of the file format
-    # @return   Description string, to be freed by the caller after use.
-    char *hts_format_description(const htsFormat *format)
-
-    # @abstract   Open a SAM/BAM/CRAM/VCF/BCF/etc file
-    # @param fn   The file name or "-" for stdin/stdout
-    # @param mode Mode matching /[rwa][bceguxz0-9]*/
-    # @discussion
-    #     With 'r' opens for reading; any further format mode letters are ignored
-    #     as the format is detected by checking the first few bytes or BGZF blocks
-    #     of the file. With 'w' or 'a' opens for writing or appending, with format
-    #     specifier letters:
-    #       b      binary format (BAM, BCF, etc) rather than text (SAM, VCF, etc)
-    #       c      CRAM format
-    #       g      gzip compressed
-    #       u      uncompressed
-    #       z      bgzf compressed
-    #       [0-9]  zlib compression level
-    #     and with non-format option letters (for any of 'r'/'w'/'a'):
-    #       e      close the file on exec(2) (opens with O_CLOEXEC, where supported)
-    #       x      create the file exclusively (opens with O_EXCL, where supported)
-    #     Note that there is a distinction between 'u' and '0': the first yields
-    #     plain uncompressed output whereas the latter outputs uncompressed data
-    #     wrapped in the zlib format.
-    # @example
-    #     [rw]b  .. compressed BCF, BAM, FAI
-    #     [rw]bu .. uncompressed BCF
-    #     [rw]z  .. compressed VCF
-    #     [rw]   .. uncompressed VCF
-    htsFile *hts_open(const char *fn, const char *mode)
-
-    # @abstract   Open a SAM/BAM/CRAM/VCF/BCF/etc file
-    # @param fn   The file name or "-" for stdin/stdout
-    # @param mode Open mode, as per hts_open()
-    # @param fmt  Optional format specific parameters
-    # @discussion
-    #     See hts_open() for description of fn and mode.
-    #     // TODO Update documentation for s/opts/fmt/
-    #     Opts contains a format string (sam, bam, cram, vcf, bcf) which will,
-    #     if defined, override mode. Opts also contains a linked list of hts_opt
-    #     structures to apply to the open file handle. These can contain things
-    #     like pointers to the reference or information on compression levels,
-    #     block sizes, etc.
-    htsFile *hts_open_format(const char *fn, const char *mode, const htsFormat *fmt)
-
-    # @abstract   Open an existing stream as a SAM/BAM/CRAM/VCF/BCF/etc file
-    # @param fp   The already-open file handle
-    # @param fn   The file name or "-" for stdin/stdout
-    # @param mode Open mode, as per hts_open()
-    htsFile *hts_hopen(hFILE *fp, const char *fn, const char *mode)
-
-    # @abstract Close a file handle, flushing buffered data for output streams
-    # @param fp  The file handle to be closed
-    # @return    0 for success, or negative if an error occurred.
-    int hts_close(htsFile *fp)
-
-    # @abstract Returns the file's format information
-    # @param fp The file handle
-    # @return   Read-only pointer to the file's htsFormat.
-    const htsFormat *hts_get_format(htsFile *fp)
-
-    # @abstract     Returns a string containing the file format extension.
-    # @param format Format structure containing the file type.
-    # @return       A string ("sam", "bam", etc) or "?" for unknown formats.
-    const char *hts_format_file_extension(const htsFormat *format)
-
-    # @abstract Sets a specified CRAM option on the open file handle.
-    # @param fp  The open file handle.
-    # @param opt The CRAM_OPT_* option.
-    # @param ... Optional arguments, dependent on the option used.
-    # @return    0 for success, or negative if an error occurred.
-    int hts_set_opt(htsFile *fp, hts_fmt_option opt, ...)
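[Example] The mode letters documented for hts_open() above surface almost
verbatim in pysam's file constructors. A minimal sketch of the
correspondence ("ex.bam" and "out.sam" are placeholder filenames):

    import pysam

    # "rb" = read BAM; for reading, the format letter is optional because
    # hts_open() detects the format from the first few bytes anyway.
    with pysam.AlignmentFile("ex.bam", "rb") as inf:
        # "wh" = SAM with header; "wb" would be BAM, "wc" CRAM,
        # "wbu" uncompressed BAM (the 'u' letter above).
        with pysam.AlignmentFile("out.sam", "wh", template=inf) as outf:
            for read in inf:
                outf.write(read)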
-
-    int hts_getline(htsFile *fp, int delimiter, kstring_t *str)
-    char **hts_readlines(const char *fn, int *_n)
-
-    # @abstract      Parse comma-separated list or read list from a file
-    # @param fn      File name or comma-separated list
-    # @param is_file If non-zero, fn is a file to read the list from;
-    #                otherwise fn is itself the comma-separated list
-    # @param _n      Size of the output array (number of items read)
-    # @return        NULL on failure or pointer to newly allocated array of
-    #                strings
-    char **hts_readlist(const char *fn, int is_file, int *_n)
-
-    # @abstract Create extra threads to aid compress/decompression for this file
-    # @param fp The file handle
-    # @param n  The number of worker threads to create
-    # @return   0 for success, or negative if an error occurred.
-    # @notes    THIS THREADING API IS LIKELY TO CHANGE IN FUTURE.
-    int hts_set_threads(htsFile *fp, int n)
-
-    # @abstract Set .fai filename for a file opened for reading
-    # @return   0 for success, negative on failure
-    # @discussion
-    #     Called before *_hdr_read(), this provides the name of a .fai file
-    #     used to provide a reference list if the htsFile contains no @SQ headers.
-    int hts_set_fai_filename(htsFile *fp, const char *fn_aux)
-
-    int8_t HTS_IDX_NOCOOR
-    int8_t HTS_IDX_START
-    int8_t HTS_IDX_REST
-    int8_t HTS_IDX_NONE
-
-    int8_t HTS_FMT_CSI
-    int8_t HTS_FMT_BAI
-    int8_t HTS_FMT_TBI
-    int8_t HTS_FMT_CRAI
-
-    BGZF *hts_get_bgzfp(htsFile *fp)
-    int hts_useek(htsFile *fp, long uoffset, int where)
-    long hts_utell(htsFile *fp)
-
-    ctypedef struct hts_idx_t
-
-    ctypedef struct hts_pair64_t:
-        uint64_t u, v
-
-    ctypedef int hts_readrec_func(BGZF *fp, void *data, void *r, int *tid, int *beg, int *end)
-
-    ctypedef struct hts_bins_t:
-        int n, m
-        int *a
-
-    ctypedef struct hts_itr_t:
-        uint32_t read_rest
-        uint32_t finished
-        int tid, beg, end, n_off, i
-        int curr_tid, curr_beg, curr_end
-        uint64_t curr_off
-        hts_pair64_t *off
-        hts_readrec_func *readfunc
-        hts_bins_t bins
-
-    hts_idx_t *hts_idx_init(int n, int fmt, uint64_t offset0, int min_shift, int n_lvls)
-    void hts_idx_destroy(hts_idx_t *idx)
-    int hts_idx_push(hts_idx_t *idx, int tid, int beg, int end, uint64_t offset, int is_mapped)
-    void hts_idx_finish(hts_idx_t *idx, uint64_t final_offset)
-
-    #### Save an index to a file
-    # @param idx Index to be written
-    # @param fn  Input BAM/BCF/etc filename, to which .bai/.csi/etc will be added
-    # @param fmt One of the HTS_FMT_* index formats
-    # @return    0 if successful, or negative if an error occurred.
-    int hts_idx_save(const hts_idx_t *idx, const char *fn, int fmt)
-
-    #### Save an index to a specific file
-    # @param idx   Index to be written
-    # @param fn    Input BAM/BCF/etc filename
-    # @param fnidx Output filename, or NULL to add .bai/.csi/etc to @a fn
-    # @param fmt   One of the HTS_FMT_* index formats
-    # @return      0 if successful, or negative if an error occurred.
-    int hts_idx_save_as(const hts_idx_t *idx, const char *fn, const char *fnidx, int fmt)
-
-    #### Load an index file
-    # @param fn  BAM/BCF/etc filename, to which .bai/.csi/etc will be added or
-    #            the extension substituted, to search for an existing index file
-    # @param fmt One of the HTS_FMT_* index formats
-    # @return    The index, or NULL if an error occurred.
-    hts_idx_t *hts_idx_load(const char *fn, int fmt)
-
-    #### Load a specific index file
-    # @param fn    Input BAM/BCF/etc filename
-    # @param fnidx The input index filename
-    # @return      The index, or NULL if an error occurred.
- hts_idx_t *hts_idx_load2(const char *fn, const char *fnidx) - - uint8_t *hts_idx_get_meta(hts_idx_t *idx, int *l_meta) - void hts_idx_set_meta(hts_idx_t *idx, int l_meta, uint8_t *meta, int is_copy) - - int hts_idx_get_stat(const hts_idx_t* idx, int tid, - uint64_t* mapped, uint64_t* unmapped) - - uint64_t hts_idx_get_n_no_coor(const hts_idx_t* idx) - - int HTS_PARSE_THOUSANDS_SEP # Ignore ',' separators within numbers - - # Parse a numeric string - # The number may be expressed in scientific notation, and optionally may - # contain commas in the integer part (before any decimal point or E notation). - # @param str String to be parsed - # @param strend If non-NULL, set on return to point to the first character - # in @a str after those forming the parsed number - # @param flags Or'ed-together combination of HTS_PARSE_* flags - # @return Converted value of the parsed number. - # - # When @a strend is NULL, a warning will be printed (if hts_verbose is 2 - # or more) if there are any trailing characters after the number. - long long hts_parse_decimal(const char *str, char **strend, int flags) - - # Parse a "CHR:START-END"-style region string - # @param str String to be parsed - # @param beg Set on return to the 0-based start of the region - # @param end Set on return to the 1-based end of the region - # @return Pointer to the colon or '\0' after the reference sequence name, - # or NULL if @a str could not be parsed. - const char *hts_parse_reg(const char *str, int *beg, int *end) - - hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, int beg, int end, hts_readrec_func *readrec) - void hts_itr_destroy(hts_itr_t *iter) - - ctypedef int (*hts_name2id_f)(void*, const char*) - ctypedef const char *(*hts_id2name_f)(void*, int) - ctypedef hts_itr_t *hts_itr_query_func( - const hts_idx_t *idx, - int tid, - int beg, - int end, - hts_readrec_func *readrec) - - hts_itr_t *hts_itr_querys( - const hts_idx_t *idx, - const char *reg, - hts_name2id_f getid, - void *hdr, - hts_itr_query_func *itr_query, - hts_readrec_func *readrec) - - int hts_itr_next(BGZF *fp, hts_itr_t *iter, void *r, void *data) - const char **hts_idx_seqnames(const hts_idx_t *idx, int *n, hts_id2name_f getid, void *hdr) # free only the array, not the values - - # hts_file_type() - Convenience function to determine file type - # @fname: the file name - # - # Returns one of the FT_* defines. - # - # DEPRECATED: This function has been replaced by hts_detect_format(). - # It and these FT_* macros will be removed in a future HTSlib release. - int FT_UNKN - int FT_GZ - int FT_VCF - int FT_VCF_GZ - int FT_BCF - int FT_BCF_GZ - int FT_STDIN - - int hts_file_type(const char *fname) - - inline int hts_reg2bin(int64_t beg, int64_t end, int min_shift, int n_lvls) - inline int hts_bin_bot(int bin, int n_lvls) - - # * Endianness * - inline int ed_is_big() - inline uint16_t ed_swap_2(uint16_t v) - inline void *ed_swap_2p(void *x) - inline uint32_t ed_swap_4(uint32_t v) - inline void *ed_swap_4p(void *x) - inline uint64_t ed_swap_8(uint64_t v) - inline void *ed_swap_8p(void *x) - - -cdef extern from "htslib/sam.h" nogil: - #********************** - #*** SAM/BAM header *** - #********************** - - # @abstract Structure for the alignment header. 
- # @field n_targets number of reference sequences - # @field l_text length of the plain text in the header - # @field target_len lengths of the reference sequences - # @field target_name names of the reference sequences - # @field text plain text - # @field sdict header dictionary - - ctypedef struct bam_hdr_t: - int32_t n_targets, ignore_sam_err - uint32_t l_text - uint32_t *target_len - uint8_t *cigar_tab - char **target_name - char *text - void *sdict - - #**************************** - #*** CIGAR related macros *** - #**************************** - - int BAM_CMATCH - int BAM_CINS - int BAM_CDEL - int BAM_CREF_SKIP - int BAM_CSOFT_CLIP - int BAM_CHARD_CLIP - int BAM_CPAD - int BAM_CEQUAL - int BAM_CDIFF - int BAM_CBACK - - char *BAM_CIGAR_STR - int BAM_CIGAR_SHIFT - uint32_t BAM_CIGAR_MASK - uint32_t BAM_CIGAR_TYPE - - char bam_cigar_op(uint32_t c) - uint32_t bam_cigar_oplen(uint32_t c) - char bam_cigar_opchr(uint32_t) - uint32_t bam_cigar_gen(char, uint32_t) - int bam_cigar_type(char o) - - # @abstract the read is paired in sequencing, no matter whether it is mapped in a pair - int BAM_FPAIRED - # @abstract the read is mapped in a proper pair - int BAM_FPROPER_PAIR - # @abstract the read itself is unmapped; conflictive with BAM_FPROPER_PAIR - int BAM_FUNMAP - # @abstract the mate is unmapped - int BAM_FMUNMAP - # @abstract the read is mapped to the reverse strand - int BAM_FREVERSE - # @abstract the mate is mapped to the reverse strand - int BAM_FMREVERSE - # @abstract this is read1 - int BAM_FREAD1 - # @abstract this is read2 - int BAM_FREAD2 - # @abstract not primary alignment - int BAM_FSECONDARY - # @abstract QC failure - int BAM_FQCFAIL - # @abstract optical or PCR duplicate - int BAM_FDUP - # @abstract supplementary alignment - int BAM_FSUPPLEMENTARY - - #************************* - #*** Alignment records *** - #************************* - - # @abstract Structure for core alignment information. - # @field tid chromosome ID, defined by bam_hdr_t - # @field pos 0-based leftmost coordinate - # @field bin bin calculated by bam_reg2bin() - # @field qual mapping quality - # @field l_qname length of the query name - # @field flag bitwise flag - # @field n_cigar number of CIGAR operations - # @field l_qseq length of the query sequence (read) - # @field mtid chromosome ID of next read in template, defined by bam_hdr_t - # @field mpos 0-based leftmost coordinate of next read in template - - ctypedef struct bam1_core_t: - int32_t tid - int32_t pos - uint16_t bin - uint8_t qual - uint8_t l_qname - uint16_t flag - uint16_t n_cigar - int32_t l_qseq - int32_t mtid - int32_t mpos - int32_t isize - - # @abstract Structure for one alignment. - # @field core core information about the alignment - # @field l_data current length of bam1_t::data - # @field m_data maximum length of bam1_t::data - # @field data all variable-length data, concatenated; structure: qname-cigar-seq-qual-aux - # - # @discussion Notes: - # - # 1. qname is zero tailing and core.l_qname includes the tailing '\0'. - # 2. l_qseq is calculated from the total length of an alignment block - # on reading or from CIGAR. - # 3. cigar data is encoded 4 bytes per CIGAR operation. - # 4. seq is nybble-encoded according to seq_nt16_table. 
- ctypedef struct bam1_t: - bam1_core_t core - int l_data, m_data - uint8_t *data - uint64_t id - - # @abstract Get whether the query is on the reverse strand - # @param b pointer to an alignment - # @return boolean true if query is on the reverse strand - int bam_is_rev(bam1_t *b) - - # @abstract Get whether the query's mate is on the reverse strand - # @param b pointer to an alignment - # @return boolean true if query's mate on the reverse strand - int bam_is_mrev(bam1_t *b) - - # @abstract Get the name of the query - # @param b pointer to an alignment - # @return pointer to the name string, null terminated - char *bam_get_qname(bam1_t *b) - - # @abstract Get the CIGAR array - # @param b pointer to an alignment - # @return pointer to the CIGAR array - # - # @discussion In the CIGAR array, each element is a 32-bit integer. The - # lower 4 bits gives a CIGAR operation and the higher 28 bits keep the - # length of a CIGAR. - uint32_t *bam_get_cigar(bam1_t *b) - - # @abstract Get query sequence - # @param b pointer to an alignment - # @return pointer to sequence - # - # @discussion Each base is encoded in 4 bits: 1 for A, 2 for C, 4 for G, - # 8 for T and 15 for N. Two bases are packed in one byte with the base - # at the higher 4 bits having smaller coordinate on the read. It is - # recommended to use bam_seqi() macro to get the base. - char *bam_get_seq(bam1_t *b) - - # @abstract Get query quality - # @param b pointer to an alignment - # @return pointer to quality string - uint8_t *bam_get_qual(bam1_t *b) - - # @abstract Get auxiliary data - # @param b pointer to an alignment - # @return pointer to the concatenated auxiliary data - uint8_t *bam_get_aux(bam1_t *b) - - # @abstract Get length of auxiliary data - # @param b pointer to an alignment - # @return length of the concatenated auxiliary data - int bam_get_l_aux(bam1_t *b) - - # @abstract Get a base on read - # @param s Query sequence returned by bam1_seq() - # @param i The i-th position, 0-based - # @return 4-bit integer representing the base. - char bam_seqi(char *s, int i) - - #************************** - #*** Exported functions *** - #************************** - - #*************** - #*** BAM I/O *** - #*************** - - bam_hdr_t *bam_hdr_init() - bam_hdr_t *bam_hdr_read(BGZF *fp) - int bam_hdr_write(BGZF *fp, const bam_hdr_t *h) - void bam_hdr_destroy(bam_hdr_t *h) - int bam_name2id(bam_hdr_t *h, const char *ref) - bam_hdr_t* bam_hdr_dup(const bam_hdr_t *h0) - - bam1_t *bam_init1() - void bam_destroy1(bam1_t *b) - int bam_read1(BGZF *fp, bam1_t *b) - int bam_write1(BGZF *fp, const bam1_t *b) - bam1_t *bam_copy1(bam1_t *bdst, const bam1_t *bsrc) - bam1_t *bam_dup1(const bam1_t *bsrc) - - int bam_cigar2qlen(int n_cigar, const uint32_t *cigar) - int bam_cigar2rlen(int n_cigar, const uint32_t *cigar) - - # @abstract Calculate the rightmost base position of an alignment on the - # reference genome. - - # @param b pointer to an alignment - # @return the coordinate of the first base after the alignment, 0-based - - # @discussion For a mapped read, this is just b->core.pos + bam_cigar2rlen. - # For an unmapped read (either according to its flags or if it has no cigar - # string), we return b->core.pos + 1 by convention. 
- int32_t bam_endpos(const bam1_t *b) - - int bam_str2flag(const char *str) # returns negative value on error - char *bam_flag2str(int flag) # The string must be freed by the user - - #************************* - #*** BAM/CRAM indexing *** - #************************* - - # These BAM iterator functions work only on BAM files. To work with either - # BAM or CRAM files use the sam_index_load() & sam_itr_*() functions. - void bam_itr_destroy(hts_itr_t *iter) - hts_itr_t *bam_itr_queryi(const hts_idx_t *idx, int tid, int beg, int end) - hts_itr_t *bam_itr_querys(const hts_idx_t *idx, bam_hdr_t *hdr, const char *region) - int bam_itr_next(htsFile *htsfp, hts_itr_t *itr, void *r) - - # Load/build .csi or .bai BAM index file. Does not work with CRAM. - # It is recommended to use the sam_index_* functions below instead. - hts_idx_t *bam_index_load(const char *fn) - int bam_index_build(const char *fn, int min_shift) - - # Load a BAM (.csi or .bai) or CRAM (.crai) index file - # @param fp File handle of the data file whose index is being opened - # @param fn BAM/CRAM/etc filename to search alongside for the index file - # @return The index, or NULL if an error occurred. - hts_idx_t *sam_index_load(htsFile *fp, const char *fn) - - # Load a specific BAM (.csi or .bai) or CRAM (.crai) index file - # @param fp File handle of the data file whose index is being opened - # @param fn BAM/CRAM/etc data file filename - # @param fnidx Index filename, or NULL to search alongside @a fn - # @return The index, or NULL if an error occurred. - hts_idx_t *sam_index_load2(htsFile *fp, const char *fn, const char *fnidx) - - # Generate and save an index file - # @param fn Input BAM/etc filename, to which .csi/etc will be added - # @param min_shift Positive to generate CSI, or 0 to generate BAI - # @return 0 if successful, or negative if an error occurred (usually -1; or - # -2: opening fn failed; -3: format not indexable) - int sam_index_build(const char *fn, int min_shift) - - # Generate and save an index to a specific file - # @param fn Input BAM/CRAM/etc filename - # @param fnidx Output filename, or NULL to add .bai/.csi/etc to @a fn - # @param min_shift Positive to generate CSI, or 0 to generate BAI - # @return 0 if successful, or negative if an error occurred. - int sam_index_build2(const char *fn, const char *fnidx, int min_shift) - - void sam_itr_destroy(hts_itr_t *iter) - hts_itr_t *sam_itr_queryi(const hts_idx_t *idx, int tid, int beg, int end) - hts_itr_t *sam_itr_querys(const hts_idx_t *idx, bam_hdr_t *hdr, const char *region) - int sam_itr_next(htsFile *htsfp, hts_itr_t *itr, void *r) - - #*************** - #*** SAM I/O *** - #*************** - - htsFile *sam_open(const char *fn, const char *mode) - htsFile *sam_open_format(const char *fn, const char *mode, const htsFormat *fmt) - int sam_close(htsFile *fp) - - int sam_open_mode(char *mode, const char *fn, const char *format) - - # A version of sam_open_mode that can handle ,key=value options. - # The format string is allocated and returned, to be freed by the caller. 
- # Prefix should be "r" or "w", - char *sam_open_mode_opts(const char *fn, const char *mode, const char *format) - - bam_hdr_t *sam_hdr_parse(int l_text, const char *text) - bam_hdr_t *sam_hdr_read(htsFile *fp) - int sam_hdr_write(htsFile *fp, const bam_hdr_t *h) - - int sam_parse1(kstring_t *s, bam_hdr_t *h, bam1_t *b) - int sam_format1(const bam_hdr_t *h, const bam1_t *b, kstring_t *str) - int sam_read1(htsFile *fp, bam_hdr_t *h, bam1_t *b) - int sam_write1(htsFile *fp, const bam_hdr_t *h, const bam1_t *b) - - #************************************* - #*** Manipulating auxiliary fields *** - #************************************* - - uint8_t *bam_aux_get(const bam1_t *b, const char *tag) - int32_t bam_aux2i(const uint8_t *s) - double bam_aux2f(const uint8_t *s) - char bam_aux2A(const uint8_t *s) - char *bam_aux2Z(const uint8_t *s) - - void bam_aux_append(bam1_t *b, const char *tag, char type, int len, uint8_t *data) - int bam_aux_del(bam1_t *b, uint8_t *s) - - #************************** - #*** Pileup and Mpileup *** - #************************** - - # @abstract Structure for one alignment covering the pileup position. - # @field b pointer to the alignment - # @field qpos position of the read base at the pileup site, 0-based - # @field indel indel length; 0 for no indel, positive for ins and negative for del - # @field level the level of the read in the "viewer" mode - # @field is_del 1 iff the base on the padded read is a deletion - # @field is_head ??? - # @field is_tail ??? - # @field is_refskip ??? - # @field aux ??? - # - # @discussion See also bam_plbuf_push() and bam_lplbuf_push(). The - # difference between the two functions is that the former does not - # set bam_pileup1_t::level, while the later does. Level helps the - # implementation of alignment viewers, but calculating this has some - # overhead. - # - # is_del, is_head, etc are a bit field, declaring as below should - # work as expected, see - # https://groups.google.com/forum/#!msg/cython-users/24tD1kwRY7A/pmoPuSmanM0J - - ctypedef struct bam_pileup1_t: - bam1_t *b - int32_t qpos - int indel, level - uint32_t is_del - uint32_t is_head - uint32_t is_tail - uint32_t is_refskip - uint32_t aux - - ctypedef int (*bam_plp_auto_f)(void *data, bam1_t *b) - ctypedef int (*bam_test_f)() - - ctypedef struct __bam_plp_t - ctypedef __bam_plp_t *bam_plp_t - - ctypedef struct __bam_mplp_t - ctypedef __bam_mplp_t *bam_mplp_t - - # bam_plp_init() - sets an iterator over multiple - # @func: see mplp_func in bam_plcmd.c in samtools for an example. Expected return - # status: 0 on success, -1 on end, < -1 on non-recoverable errors - # @data: user data to pass to @func - bam_plp_t bam_plp_init(bam_plp_auto_f func, void *data) - void bam_plp_destroy(bam_plp_t iter) - int bam_plp_push(bam_plp_t iter, const bam1_t *b) - const bam_pileup1_t *bam_plp_next(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp) - const bam_pileup1_t *bam_plp_auto(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp) - void bam_plp_set_maxcnt(bam_plp_t iter, int maxcnt) - void bam_plp_reset(bam_plp_t iter) - - bam_mplp_t bam_mplp_init(int n, bam_plp_auto_f func, void **data) - - # bam_mplp_init_overlaps() - if called, mpileup will detect overlapping - # read pairs and for each base pair set the base quality of the - # lower-quality base to zero, thus effectively discarding it from - # calling. If the two bases are identical, the quality of the other base - # is increased to the sum of their qualities (capped at 200), otherwise - # it is multiplied by 0.8. 
-    void bam_mplp_init_overlaps(bam_mplp_t iter)
-    void bam_mplp_destroy(bam_mplp_t iter)
-    void bam_mplp_set_maxcnt(bam_mplp_t iter, int maxcnt)
-    int bam_mplp_auto(bam_mplp_t iter, int *_tid, int *_pos, int *n_plp, const bam_pileup1_t **plp)
-
-    # Added by AH
-    # ctypedef bam_pileup1_t * const_bam_pileup1_t_ptr "const bam_pileup1_t *"
-
-
-cdef extern from "htslib/faidx.h" nogil:
-
-    ctypedef struct faidx_t:
-        pass
-
-    int fai_build(char *fn)
-
-    void fai_destroy(faidx_t *fai)
-
-    faidx_t *fai_load(char *fn)
-
-    char *fai_fetch(faidx_t *fai,
-                    char *reg,
-                    int *len)
-
-    int faidx_nseq(faidx_t *fai)
-
-    int faidx_has_seq(faidx_t *fai, const char *seq)
-
-    char *faidx_fetch_seq(faidx_t *fai,
-                          char *c_name,
-                          int p_beg_i,
-                          int p_end_i,
-                          int *len)
-
-    int faidx_seq_len(faidx_t *fai, const char *seq)
-
-
-# tabix support
-cdef extern from "htslib/tbx.h" nogil:
-
-    # tbx.h definitions
-    int8_t TBX_MAX_SHIFT
-    int8_t TBX_GENERIC
-    int8_t TBX_SAM
-    int8_t TBX_VCF
-    int8_t TBX_UCSC
-
-    ctypedef struct tbx_conf_t:
-        int32_t preset
-        int32_t sc, bc, ec    # seq col., beg col. and end col.
-        int32_t meta_char, line_skip
-
-    ctypedef struct tbx_t:
-        tbx_conf_t conf
-        hts_idx_t *idx
-        void * dict
-
-    tbx_conf_t tbx_conf_gff
-    tbx_conf_t tbx_conf_bed
-    tbx_conf_t tbx_conf_psltbl
-    tbx_conf_t tbx_conf_sam
-    tbx_conf_t tbx_conf_vcf
-
-    void tbx_itr_destroy(hts_itr_t * iter)
-    hts_itr_t * tbx_itr_queryi(tbx_t * t, int tid, int beg, int end)
-    hts_itr_t * tbx_itr_querys(tbx_t * t, char * s)
-    int tbx_itr_next(htsFile * fp, tbx_t * t, hts_itr_t * iter, void * data)
-
-    int tbx_name2id(tbx_t *tbx, char *ss)
-
-    int tbx_index_build(char *fn, int min_shift, tbx_conf_t *conf)
-    int tbx_index_build2(const char *fn, const char *fnidx, int min_shift, const tbx_conf_t *conf)
-
-    tbx_t * tbx_index_load(char *fn)
-    tbx_t *tbx_index_load2(const char *fn, const char *fnidx)
-
-    # free the array but not the values
-    char **tbx_seqnames(tbx_t *tbx, int *n)
-
-    void tbx_destroy(tbx_t *tbx)
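[Example] pysam wraps the faidx and tbx routines above in FastaFile and
TabixFile. A small usage sketch (the filenames are hypothetical; the .fai
and .tbi indices must exist or be creatable):

    import pysam

    fa = pysam.FastaFile("ref.fa")             # fai_load(); builds ref.fa.fai if absent
    print(fa.fetch("chr1", 9, 29))             # faidx_fetch_seq() underneath

    tbx = pysam.TabixFile("calls.vcf.gz")      # tbx_index_load() on calls.vcf.gz.tbi
    for row in tbx.fetch("chr1", 999, 2000):   # tbx_itr_queryi()/tbx_itr_next()
        print(row)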
-
-
-# VCF/BCF API
-cdef extern from "htslib/vcf.h" nogil:
-
-    # Header struct
-
-    uint8_t BCF_HL_FLT    # header line
-    uint8_t BCF_HL_INFO
-    uint8_t BCF_HL_FMT
-    uint8_t BCF_HL_CTG
-    uint8_t BCF_HL_STR    # structured header line TAG=<A=..,B=..>
-    uint8_t BCF_HL_GEN    # generic header line
-
-    uint8_t BCF_HT_FLAG   # header type
-    uint8_t BCF_HT_INT
-    uint8_t BCF_HT_REAL
-    uint8_t BCF_HT_STR
-
-    uint8_t BCF_VL_FIXED  # variable length
-    uint8_t BCF_VL_VAR
-    uint8_t BCF_VL_A
-    uint8_t BCF_VL_G
-    uint8_t BCF_VL_R
-
-    # === Dictionary ===
-    #
-    # The header keeps three dictionaries. The first keeps IDs in the
-    # "FILTER/INFO/FORMAT" lines, the second keeps the sequence names and lengths
-    # in the "contig" lines and the last keeps the sample names. bcf_hdr_t::dict[]
-    # is the actual hash table, which is opaque to the end users. In the hash
-    # table, the key is the ID or sample name as a C string and the value is a
-    # bcf_idinfo_t struct. bcf_hdr_t::id[] points to key-value pairs in the hash
-    # table in the order that they appear in the VCF header. bcf_hdr_t::n[] is the
-    # size of the hash table or, equivalently, the length of the id[] arrays.
-
-    uint8_t BCF_DT_ID     # dictionary type
-    uint8_t BCF_DT_CTG
-    uint8_t BCF_DT_SAMPLE
-
-    # Complete textual representation of a header line
-    ctypedef struct bcf_hrec_t:
-        int type          # One of the BCF_HL_* types
-        char *key         # The part before '=', i.e. FILTER/INFO/FORMAT/contig/fileformat etc.
-        char *value       # Set only for generic lines, NULL for FILTER/INFO, etc.
-        int nkeys         # Number of structured fields
-        char **keys       # The key=value pairs
-        char **vals
-
-    ctypedef struct bcf_idinfo_t:
-        uint32_t info[3]     # stores Number:20, var:4, Type:4, ColType:4 in info[0..2]
-        bcf_hrec_t *hrec[3]  # for BCF_HL_FLT,INFO,FMT and contig length in info[0] for BCF_HL_CTG
-        int id
-
-    ctypedef struct bcf_idpair_t:
-        const char *key
-        const bcf_idinfo_t *val
-
-    ctypedef struct bcf_hdr_t:
-        int32_t n[3]          # n: the size of the dictionary block in use (allocated size, m, is below to preserve ABI)
-        bcf_idpair_t *id[3]
-        void *dict[3]         # ID dictionary, contig dict and sample dict
-        char **samples
-        bcf_hrec_t **hrec
-        int nhrec, dirty
-        int ntransl
-        int *transl[2]        # for bcf_translate()
-        int nsamples_ori      # for bcf_hdr_set_samples()
-        uint8_t *keep_samples
-        kstring_t mem
-        int32_t m[3]          # m: allocated size of the dictionary block in use (see n above)
-
-    uint8_t bcf_type_shift[]
-
-    # * VCF record *
-
-    uint8_t BCF_BT_NULL
-    uint8_t BCF_BT_INT8
-    uint8_t BCF_BT_INT16
-    uint8_t BCF_BT_INT32
-    uint8_t BCF_BT_FLOAT
-    uint8_t BCF_BT_CHAR
-
-    uint8_t VCF_REF
-    uint8_t VCF_SNP
-    uint8_t VCF_MNP
-    uint8_t VCF_INDEL
-    uint8_t VCF_OTHER
-
-    ctypedef struct variant_t:
-        int type, n           # variant type and the number of bases affected, negative for deletions
-
-    ctypedef struct bcf_fmt_t:
-        int id                # id: numeric tag id, the corresponding string is bcf_hdr_t::id[BCF_DT_ID][$id].key
-        int n, size, type     # n: number of values per-sample; size: number of bytes per-sample; type: one of BCF_BT_* types
-        uint8_t *p            # same as vptr and vptr_* in bcf_info_t below
-        uint32_t p_len
-        uint32_t p_off
-        uint8_t p_free
-
-    union bcf_info_union_t:
-        int32_t i             # integer value
-        float f               # float value
-
-    ctypedef struct bcf_info_t:
-        int key               # key: numeric tag id, the corresponding string is bcf_hdr_t::id[BCF_DT_ID][$key].key
-        int type, len         # type: one of BCF_BT_* types; len: vector length, 1 for scalars
-
-        # v1 union only set if $len==1; for easier access
-        bcf_info_union_t v1
-        uint8_t *vptr         # pointer to data array in bcf1_t->shared.s, excluding the size+type and tag id bytes
-        uint32_t vptr_len     # length of the vptr block or, when set, of the vptr_mod block, excluding offset
-        uint32_t vptr_off     # vptr offset, i.e., the size of the INFO key plus size+type bytes
-        uint8_t vptr_free     # indicates that vptr-vptr_off must be freed; set only when modified and the new
-                              # data block is bigger than the original
-
-    uint8_t BCF1_DIRTY_ID
-    uint8_t BCF1_DIRTY_ALS
-    uint8_t BCF1_DIRTY_FLT
-    uint8_t BCF1_DIRTY_INF
-
-    ctypedef struct bcf_dec_t:
-        int m_fmt, m_info, m_id, m_als, m_allele, m_flt  # allocated size (high-water mark); do not change
-        int n_flt             # Number of FILTER fields
-        int *flt              # FILTER keys in the dictionary
-        char *id              # ID
-        char *als             # REF+ALT block (\0-separated)
-        char **allele         # allele[0] is the REF (allele[] pointers to the als block); all null terminated
-        bcf_info_t *info      # INFO
-        bcf_fmt_t *fmt        # FORMAT and individual sample
-        variant_t *var        # $var and $var_type set only when set_variant_types called
-        int n_var, var_type
-        int shared_dirty      # if set, shared.s must be recreated on BCF output
-        int indiv_dirty       # if set, indiv.s must be recreated on BCF output
-
-    uint8_t BCF_ERR_CTG_UNDEF
-    uint8_t BCF_ERR_TAG_UNDEF
-    uint8_t BCF_ERR_NCOLS
-    uint8_t BCF_ERR_LIMITS
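[Example] The bcf_fmt_t/bcf_info_t layout above is what pysam's VariantFile
surfaces as dictionary-like record attributes. A hedged sketch ("calls.bcf"
and the DP tag are assumptions about the input, not guarantees):

    import pysam

    vf = pysam.VariantFile("calls.bcf")
    for rec in vf:
        # rec.info is backed by the bcf_info_t array; scalar values
        # (len == 1) come out of the v1 union described above.
        print(rec.chrom, rec.pos, rec.info.get("DP"))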
-
-    # The bcf1_t structure corresponds to one VCF/BCF line. Reading from a VCF
-    # file is slower because the string must first be parsed and packed into a
-    # BCF line (done in vcf_parse), then unpacked into the internal bcf1_t
-    # structure. If it is known in advance that some of the fields will not be
-    # required (notably the sample columns), parsing of these can be skipped by
-    # setting max_unpack appropriately.
-    # Similarly, it is fast to output a BCF line because the columns (kept in
-    # shared.s, indiv.s, etc.) are written directly by bcf_write, whereas a VCF
-    # line must be formatted in vcf_format.
-
-    ctypedef struct bcf1_t:
-        int32_t rid           # CHROM
-        int32_t pos           # POS
-        int32_t rlen          # length of REF
-        float qual            # QUAL
-        uint32_t n_info, n_allele
-        uint32_t n_fmt, n_sample
-        kstring_t shared, indiv
-        bcf_dec_t d           # lazy evaluation: $d is not generated by bcf_read(), but by explicitly calling bcf_unpack()
-        int max_unpack        # Set to BCF_UN_STR, BCF_UN_FLT, or BCF_UN_INFO to boost performance of vcf_parse when some of the fields won't be needed
-        int unpacked          # remember what has been unpacked to allow calling bcf_unpack() repeatedly without redoing the work
-        int unpack_size[3]    # the original block size of ID, REF+ALT and FILTER
-        int errcode           # one of BCF_ERR_* codes
-
-    ####### API #######
-
-    # BCF and VCF I/O
-    #
-    # A note about naming conventions: htslib internally represents VCF
-    # records as bcf1_t data structures, therefore most functions are
-    # prefixed with bcf_. There are a few exceptions where the functions must
-    # be aware of both BCF and VCF worlds, such as bcf_parse vs vcf_parse. In
-    # these cases, functions prefixed with bcf_ are more general and work
-    # with both BCF and VCF.
-
-    # bcf_hdr_init() - create an empty BCF header.
-    # @param mode "r" or "w"
-    #
-    # When opened for writing, the mandatory fileFormat and
-    # FILTER=PASS lines are added automatically.
-    bcf_hdr_t *bcf_hdr_init(const char *mode)
-
-    # Destroy a BCF header struct
-    void bcf_hdr_destroy(bcf_hdr_t *h)
-
-    # Initialize a bcf1_t object; equivalent to calloc(1, sizeof(bcf1_t))
-    bcf1_t *bcf_init()
-
-    # Deallocate a bcf1_t object
-    void bcf_destroy(bcf1_t *v)
-
-    # Same as bcf_destroy() but frees only the memory allocated by bcf1_t,
-    # not the bcf1_t object itself.
-    void bcf_empty(bcf1_t *v)
-
-    # Make the bcf1_t object ready for next read. Intended mostly for
-    # internal use, the user should rarely need to call this function
-    # directly.
-    void bcf_clear(bcf1_t *v)
-
-    # Reads VCF or BCF header
-    bcf_hdr_t *bcf_hdr_read(htsFile *fp)
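[Example] In pysam, the bcf_hdr_read()/bcf_read()/bcf_write() cycle above
corresponds to opening, iterating, and writing VariantFile objects. A
minimal round-trip sketch (filenames are placeholders):

    import pysam

    inv = pysam.VariantFile("in.vcf.gz")                         # bcf_hdr_read()
    out = pysam.VariantFile("out.bcf", "wb", header=inv.header)
    for rec in inv:                                              # bcf_read()
        out.write(rec)                                           # bcf_write()
    out.close()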
-
-    # bcf_hdr_set_samples() - for more efficient VCF parsing when only one/few samples are needed
-    # @samples: samples to include or exclude from file or as a comma-separated string.
-    #     LIST|FILE  .. select samples in list/file
-    #     ^LIST|FILE .. exclude samples from list/file
-    #     -          .. include all samples
-    #     NULL       .. exclude all samples
-    # @is_file: @samples is a file (1) or a comma-separated list (0)
-    #
-    # The bottleneck of VCF reading is parsing of genotype fields. If the
-    # reader knows in advance that only subset of samples is needed (possibly
-    # no samples at all), the performance of bcf_read() can be significantly
-    # improved by calling bcf_hdr_set_samples after bcf_hdr_read().
-    # The function bcf_read() will subset the VCF/BCF records automatically
-    # with the notable exception when reading records via bcf_itr_next().
-    # In this case, bcf_subset_format() must be called explicitly, because
-    # bcf_readrec() does not see the header.
-    #
-    # Returns 0 on success, -1 on error or a positive integer if the list
-    # contains samples not present in the VCF header. In such a case, the
-    # return value is the index of the offending sample.
-    #
-    int bcf_hdr_set_samples(bcf_hdr_t *hdr, const char *samples, int is_file)
-    int bcf_subset_format(const bcf_hdr_t *hdr, bcf1_t *rec)
-
-    # Writes VCF or BCF header
-    int bcf_hdr_write(htsFile *fp, bcf_hdr_t *h)
-
-    # Parse VCF line contained in kstring and populate the bcf1_t struct
-    int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v)
-
-    # The opposite of vcf_parse. It should rarely be called directly, see vcf_write
-    int vcf_format(const bcf_hdr_t *h, const bcf1_t *v, kstring_t *s)
-
-    # bcf_read() - read next VCF or BCF record
-    #
-    # Returns -1 on critical errors, 0 otherwise. On errors which are not
-    # critical for reading, such as missing header definitions, v->errcode is
-    # set to one of BCF_ERR* code and must be checked before calling
-    # vcf_write().
-    int bcf_read(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
-
-    # bcf_unpack() - unpack/decode a BCF record (fills the bcf1_t::d field)
-    #
-    # Note that bcf_unpack() must be called even when reading VCF. It is safe
-    # to call the function repeatedly, it will not unpack the same field
-    # twice.
-    uint8_t BCF_UN_STR    # up to ALT inclusive
-    uint8_t BCF_UN_FLT    # up to FILTER
-    uint8_t BCF_UN_INFO   # up to INFO
-    uint8_t BCF_UN_SHR    # all shared information
-    uint8_t BCF_UN_FMT    # unpack format and each sample
-    uint8_t BCF_UN_IND    # a synonym of BCF_UN_FMT
-    uint8_t BCF_UN_ALL    # everything
-
-    int bcf_unpack(bcf1_t *b, int which)
-
-    # bcf_dup() - create a copy of BCF record.
-    #
-    # Note that bcf_unpack() must be called on the returned copy as if it was
-    # obtained from bcf_read(). Also note that bcf_dup() calls bcf_sync1(src)
-    # internally to reflect any changes made by bcf_update_* functions.
-    bcf1_t *bcf_dup(bcf1_t *src)
-    bcf1_t *bcf_copy(bcf1_t *dst, bcf1_t *src)
-
-    # bcf_write() - write one VCF or BCF record. The type is determined at the open() call.
-    int bcf_write(htsFile *fp, bcf_hdr_t *h, bcf1_t *v)
-
-    # The following functions work only with VCFs and should rarely be called
-    # directly. Usually one wants to use their bcf_* alternatives, which work
-    # transparently with both VCFs and BCFs.
-    bcf_hdr_t *vcf_hdr_read(htsFile *fp)
-    int vcf_hdr_write(htsFile *fp, const bcf_hdr_t *h)
-    int vcf_read(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
-    int vcf_write(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
-
-    #************************************************************************
-    # Header querying and manipulation routines
-    #************************************************************************
-
-    # Create a new header using the supplied template
-    bcf_hdr_t *bcf_hdr_dup(const bcf_hdr_t *hdr)
-
-    # Copy header lines from src to dst if not already present in dst. See also bcf_translate().
-    # Returns 0 on success or sets a bit on error:
-    #     1 .. conflicting definitions of tag length
-    #     # todo
-    int bcf_hdr_combine(bcf_hdr_t *dst, const bcf_hdr_t *src)
-
-    # bcf_hdr_merge() - copy header lines from src to dst, see also bcf_translate()
-    # @param dst: the destination header to be merged into, NULL on the first pass
-    # @param src: the source header
-    #
-    # Notes:
-    #    - use as:
-    #        bcf_hdr_t *dst = NULL;
-    #        for (i=0; i<nsrc; i++) dst = bcf_hdr_merge(dst, src[i]);
-    bcf_hdr_t *bcf_hdr_merge(bcf_hdr_t *dst, const bcf_hdr_t *src)
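[Example] pysam exposes bcf_hdr_set_samples() as
VariantFile.subset_samples(), which must likewise be called right after
opening, before any records are read. A sketch (sample and file names are
hypothetical):

    import pysam

    vf = pysam.VariantFile("cohort.bcf")
    vf.subset_samples(["NA00001"])           # wraps bcf_hdr_set_samples()
    for rec in vf:
        print(rec.pos, rec.samples["NA00001"]["GT"])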
-
-    # bcf_hdr_id2*() - macros for accessing bcf_idinfo_t
-    # @type:   one of BCF_HL_FLT, BCF_HL_INFO, BCF_HL_FMT
-    # @int_id: return value of bcf_hdr_id2int, must be >=0
-    #
-    # The returned values are:
-    #    bcf_hdr_id2length  .. whether the number of values is fixed or variable, one of BCF_VL_*
-    #    bcf_hdr_id2number  .. the number of values, 0xfffff for variable length fields
-    #    bcf_hdr_id2type    .. the field type, one of BCF_HT_*
-    #    bcf_hdr_id2coltype .. the column type, one of BCF_HL_*
-    #
-    # Notes: Prior to using the macros, the presence of the info should be
-    # tested with bcf_hdr_idinfo_exists().
-    #
-    int bcf_hdr_id2length(const bcf_hdr_t *hdr, int type, int int_id)
-    int bcf_hdr_id2number(const bcf_hdr_t *hdr, int type, int int_id)
-    int bcf_hdr_id2type(const bcf_hdr_t *hdr, int type, int int_id)
-    int bcf_hdr_id2coltype(const bcf_hdr_t *hdr, int type, int int_id)
-    int bcf_hdr_idinfo_exists(const bcf_hdr_t *hdr, int type, int int_id)
-    bcf_hrec_t *bcf_hdr_id2hrec(const bcf_hdr_t *hdr, int type, int col_type, int int_id)
-
-    void bcf_fmt_array(kstring_t *s, int n, int type, void *data)
-    uint8_t *bcf_fmt_sized_array(kstring_t *s, uint8_t *ptr)
-
-    void bcf_enc_vchar(kstring_t *s, int l, const char *a)
-    void bcf_enc_vint(kstring_t *s, int n, int32_t *a, int wsize)
-    void bcf_enc_vfloat(kstring_t *s, int n, float *a)
-
-    #************************************************************************
-    # BCF index
-    #
-    # Note that these functions work with BCFs only. See synced_bcf_reader.h
-    # which provides (amongst other things) an API to work transparently with
-    # both indexed BCFs and VCFs.
-    #************************************************************************
-
-    hts_idx_t *bcf_index_load2(const char *fn, const char *fnidx)
-    int bcf_index_build(const char *fn, int min_shift)
-    int bcf_index_build2(const char *fn, const char *fnidx, int min_shift)
-
-    #*******************
-    # Typed value I/O *
-    #*******************
-
-    # Note that in contrast with BCFv2.1 specification, HTSlib implementation
-    # allows missing values in vectors. For integer types, the values 0x80,
-    # 0x8000, 0x80000000 are interpreted as missing values and 0x81, 0x8001,
-    # 0x80000001 as end-of-vector indicators. Similarly for floats, the value of
-    # 0x7F800001 is interpreted as a missing value and 0x7F800002 as an
-    # end-of-vector indicator.
-    # Note that the end-of-vector byte is not part of the vector.
-
-    # This trial BCF version (v2.2) is compatible with the VCF specification and
-    # makes it possible to handle correctly vectors with different ploidy in the
-    # presence of missing values.
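[Example] The integer sentinels in the note above are plain two's-complement
bit patterns, so they can be illustrated directly in Python. This is only a
sketch of the encoding, not pysam API (pysam translates the sentinels to
None or truncated arrays for you):

    import struct

    bcf_int32_missing    = -(1 << 31)       # 0x80000000 read as signed int32
    bcf_int32_vector_end = -(1 << 31) + 1   # 0x80000001

    # pack a 4-value INFO vector as BCF would store it (little-endian)
    packed = struct.pack("<4i", 0, 1, bcf_int32_missing, bcf_int32_vector_end)
    assert struct.unpack("<4i", packed)[2] == bcf_int32_missing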
- - int32_t bcf_int8_vector_end - int32_t bcf_int16_vector_end - int32_t bcf_int32_vector_end - int32_t bcf_str_vector_end - int32_t bcf_int8_missing - int32_t bcf_int16_missing - int32_t bcf_int32_missing - int32_t bcf_str_missing - - uint32_t bcf_float_vector_end - uint32_t bcf_float_missing - - void bcf_float_set(float *ptr, uint32_t value) - void bcf_float_set_vector_end(float *x) - void bcf_float_set_missing(float *x) - - int bcf_float_is_missing(float f) - int bcf_float_is_vector_end(float f) - void bcf_format_gt(bcf_fmt_t *fmt, int isample, kstring_t *str) - void bcf_enc_size(kstring_t *s, int size, int type) - int bcf_enc_inttype(long x) - void bcf_enc_int1(kstring_t *s, int32_t x) - int32_t bcf_dec_int1(const uint8_t *p, int type, uint8_t **q) - int32_t bcf_dec_typed_int1(const uint8_t *p, uint8_t **q) - int32_t bcf_dec_size(const uint8_t *p, uint8_t **q, int *type) - - # These trivial wrappers are defined only for consistency with other parts of htslib - bcf1_t *bcf_init1() - int bcf_read1(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v) - int vcf_read1(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v) - int bcf_write1(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v) - int vcf_write1(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v) - void bcf_destroy1(bcf1_t *v) - void bcf_empty1(bcf1_t *v) - int vcf_parse1(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v) - void bcf_clear1(bcf1_t *v) - int vcf_format1(const bcf_hdr_t *h, const bcf1_t *v, kstring_t *s) - - # Other nice wrappers - void bcf_itr_destroy(hts_itr_t *iter) - hts_itr_t *bcf_itr_queryi(const hts_idx_t *idx, int tid, int beg, int end) - hts_itr_t *bcf_itr_querys(const hts_idx_t *idx, const bcf_hdr_t *hdr, char *s) - int bcf_itr_next(htsFile *fp, hts_itr_t *iter, void *r) - hts_idx_t *bcf_index_load(const char *fn) - const char **bcf_index_seqnames(const hts_idx_t *idx, const bcf_hdr_t *hdr, int *nptr) - - -# VCF/BCF utility functions -cdef extern from "htslib/vcfutils.h" nogil: - struct kbitset_t - - # bcf_trim_alleles() - remove ALT alleles unused in genotype fields - # @header: for access to BCF_DT_ID dictionary - # @line: VCF line obtain from vcf_parse1 - # - # Returns the number of removed alleles on success or negative - # on error: - # -1 .. some allele index is out of bounds - int bcf_trim_alleles(const bcf_hdr_t *header, bcf1_t *line) - - # bcf_remove_alleles() - remove ALT alleles according to bitmask @mask - # @header: for access to BCF_DT_ID dictionary - # @line: VCF line obtained from vcf_parse1 - # @mask: alleles to remove - # - # If you have more than 31 alleles, then the integer bit mask will - # overflow, so use bcf_remove_allele_set instead - void bcf_remove_alleles(const bcf_hdr_t *header, bcf1_t *line, int mask) - - # bcf_remove_allele_set() - remove ALT alleles according to bitset @rm_set - # @header: for access to BCF_DT_ID dictionary - # @line: VCF line obtained from vcf_parse1 - # @rm_set: pointer to kbitset_t object with bits set for allele - # indexes to remove - # - # Number=A,R,G INFO and FORMAT fields will be updated accordingly. - void bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, kbitset_t *rm_set) - - # bcf_calc_ac() - calculate the number of REF and ALT alleles - # @header: for access to BCF_DT_ID dictionary - # @line: VCF line obtained from vcf_parse1 - # @ac: array of length line->n_allele - # @which: determine if INFO/AN,AC and indv fields be used - # - # Returns 1 if the call succeeded, or 0 if the value could not - # be determined. 
-    #
-    # The value of @which determines if existing INFO/AC,AN can be
-    # used (BCF_UN_INFO) and if indv fields can be split
-    # (BCF_UN_FMT).
-    int bcf_calc_ac(const bcf_hdr_t *header, bcf1_t *line, int *ac, int which)
-
-    # bcf_gt_type() - determines type of the genotype
-    # @fmt_ptr: the GT format field as set for example by set_fmt_ptr
-    # @isample: sample index (starting from 0)
-    # @ial:     index of the 1st non-reference allele (starting from 1)
-    # @jal:     index of the 2nd non-reference allele (starting from 1)
-    #
-    # Returns the type of the genotype (one of GT_HOM_RR, GT_HET_RA,
-    # GT_HOM_AA, GT_HET_AA, GT_HAPL_R, GT_HAPL_A or GT_UNKN). If $ial
-    # is not NULL and the genotype has one or more non-reference
-    # alleles, $ial will be set. In case of GT_HET_AA, $ial is the
-    # position of the allele which appeared first in ALT. If $jal is
-    # not null and the genotype is GT_HET_AA, $jal will be set and is
-    # the position of the second allele in ALT.
-    uint8_t GT_HOM_RR    # note: the actual value of GT_* matters, used in dosage r2 calculation
-    uint8_t GT_HOM_AA
-    uint8_t GT_HET_RA
-    uint8_t GT_HET_AA
-    uint8_t GT_HAPL_R
-    uint8_t GT_HAPL_A
-    uint8_t GT_UNKN
-    int bcf_gt_type(bcf_fmt_t *fmt_ptr, int isample, int *ial, int *jal)
-
-    int bcf_acgt2int(char c)
-    char bcf_int2acgt(int i)
-
-    # bcf_ij2G() - common task: allele indexes to Number=G index (diploid)
-    # @i,j: allele indexes, 0-based, i<=j
-    # Returns index to the Number=G diploid array
-    uint32_t bcf_ij2G(uint32_t i, uint32_t j)
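[Example] The Number=G index computed by bcf_ij2G follows the ordering given
in the VCF specification: for 0-based allele indexes i <= j, the diploid
genotype i/j sits at position j*(j+1)/2 + i. A small Python rendering of the
same formula:

    def bcf_ij2G(i, j):
        # allele indexes are 0-based and require i <= j
        assert i <= j
        return j * (j + 1) // 2 + i

    # with REF plus two ALT alleles the G array has six entries,
    # ordered 0/0, 0/1, 1/1, 0/2, 1/2, 2/2
    assert [bcf_ij2G(i, j) for j in range(3) for i in range(j + 1)] == [0, 1, 2, 3, 4, 5]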
- """ - return hts_get_verbosity() - -__all__ = [ - "get_verbosity", - "set_verbosity"] - diff --git a/pysam/csamfile.pxd b/pysam/csamfile.pxd deleted file mode 100644 index a76a599..0000000 --- a/pysam/csamfile.pxd +++ /dev/null @@ -1,45 +0,0 @@ -from pysam.calignmentfile cimport AlignedSegment, AlignmentFile - -################################################# -# Compatibility Layer for pysam < 0.8 - -# import all declarations from htslib -from pysam.chtslib cimport * - -cdef class AlignedRead(AlignedSegment): - pass - -cdef class Samfile(AlignmentFile): - pass - -# import the conversion functions -cdef extern from "htslib_util.h": - - # add *nbytes* into the variable length data of *src* at *pos* - bam1_t * pysam_bam_update(bam1_t * b, - size_t nbytes_old, - size_t nbytes_new, - uint8_t * pos) - - # now: static - int aux_type2size(int) - - char * pysam_bam_get_qname(bam1_t * b) - uint32_t * pysam_bam_get_cigar(bam1_t * b) - uint8_t * pysam_bam_get_seq(bam1_t * b) - uint8_t * pysam_bam_get_qual(bam1_t * b) - uint8_t * pysam_bam_get_aux(bam1_t * b) - int pysam_bam_get_l_aux(bam1_t * b) - char pysam_bam_seqi(uint8_t * s, int i) - - uint16_t pysam_get_bin(bam1_t * b) - uint8_t pysam_get_qual(bam1_t * b) - uint8_t pysam_get_l_qname(bam1_t * b) - uint16_t pysam_get_flag(bam1_t * b) - uint16_t pysam_get_n_cigar(bam1_t * b) - void pysam_set_bin(bam1_t * b, uint16_t v) - void pysam_set_qual(bam1_t * b, uint8_t v) - void pysam_set_l_qname(bam1_t * b, uint8_t v) - void pysam_set_flag(bam1_t * b, uint16_t v) - void pysam_set_n_cigar(bam1_t * b, uint16_t v) - void pysam_update_flag(bam1_t * b, uint16_t v, uint16_t flag) diff --git a/pysam/csamfile.pyx b/pysam/csamfile.pyx deleted file mode 100644 index ed9d79b..0000000 --- a/pysam/csamfile.pyx +++ /dev/null @@ -1,43 +0,0 @@ -# cython: embedsignature=True -# cython: profile=True -# adds doc-strings for sphinx -import tempfile -import os -import sys -import types -import itertools -import struct -import ctypes -import collections -import re -import platform -import warnings -from cpython cimport PyErr_SetString, \ - PyBytes_Check, \ - PyUnicode_Check, \ - PyBytes_FromStringAndSize - -from cpython.version cimport PY_MAJOR_VERSION - -from pysam.calignmentfile cimport AlignmentFile, AlignedSegment - - -cdef class Samfile(AlignmentFile): - '''Deprecated alternative for :class:`~pysam.AlignmentFile` - - Added for backwards compatibility with pysam <= 0.8.0 - ''' - pass - - -cdef class AlignedRead(AlignedSegment): - '''Deprecated alternative for :class:`~pysam.AlignedSegment` - - Added for backwards compatibility with pysam <= 0.8.0 - ''' - pass - - -__all__ = ['Samfile', 'AlignedRead'] - - diff --git a/pysam/ctabix.pxd b/pysam/ctabix.pxd deleted file mode 100644 index 028090e..0000000 --- a/pysam/ctabix.pxd +++ /dev/null @@ -1,119 +0,0 @@ -from libc.stdint cimport int8_t, int16_t, int32_t, int64_t -from libc.stdint cimport uint8_t, uint16_t, uint32_t, uint64_t -from libc.stdlib cimport malloc, calloc, realloc, free -from libc.string cimport memcpy, memcmp, strncpy, strlen, strdup -from libc.stdio cimport FILE, printf - -# Note: this replaces python "open"! -cdef extern from "fcntl.h": - int open(char *pathname, int flags) - -cdef extern from "unistd.h" nogil: - ctypedef int ssize_t - ssize_t read(int fd, void *buf, size_t count) - int close(int fd) - -from pysam.chtslib cimport hts_idx_t, hts_itr_t, htsFile, \ - tbx_t, kstring_t, BGZF - -# These functions are put here and not in chtslib.pxd in order -# to avoid warnings for unused functions. 
-cdef extern from "pysam_stream.h" nogil: - - ctypedef struct kstream_t: - pass - - ctypedef struct kseq_t: - kstring_t name - kstring_t comment - kstring_t seq - kstring_t qual - - kseq_t *kseq_init(BGZF *) - int kseq_read(kseq_t *) - void kseq_destroy(kseq_t *) - kstream_t *ks_init(BGZF *) - void ks_destroy(kstream_t *) - - # Retrieve characters from stream until delimiter - # is reached placing results in str. - int ks_getuntil(kstream_t *, - int delimiter, - kstring_t * str, - int * dret) - - -cdef class tabix_file_iterator: - cdef BGZF * fh - cdef kstream_t * kstream - cdef kstring_t buffer - cdef size_t size - cdef Parser parser - cdef int fd - cdef int duplicated_fd - cdef infile - - cdef __cnext__(self) - -cdef class TabixFile: - - # pointer to tabixfile - cdef htsFile * tabixfile - # pointer to index structure - cdef tbx_t * index - - # flag indicating whether file is remote - cdef int is_remote - - cdef object _filename - cdef object _filename_index - - cdef Parser parser - - cdef encoding - -cdef class Parser: - cdef encoding - - cdef parse(self, char * buffer, int len) - -cdef class asTuple(Parser): - cdef parse(self, char * buffer, int len) - -cdef class asGTF(Parser): - pass - -cdef class asBed(Parser): - pass - -cdef class asVCF(Parser): - pass - -cdef class TabixIterator: - cdef hts_itr_t * iterator - cdef TabixFile tabixfile - cdef kstring_t buffer - cdef encoding - cdef int __cnext__(self) - -cdef class TabixIteratorParsed(TabixIterator): - cdef Parser parser - -cdef class GZIterator: - cdef object _filename - cdef BGZF * gzipfile - cdef kstream_t * kstream - cdef kstring_t buffer - cdef int __cnext__(self) - cdef encoding - -cdef class GZIteratorHead(GZIterator): - pass - -cdef class GZIteratorParsed(GZIterator): - cdef Parser parser - -# Compatibility Layer for pysam < 0.8 -cdef class Tabixfile(TabixFile): - pass - diff --git a/pysam/ctabix.pyx b/pysam/ctabix.pyx deleted file mode 100644 index a23fa87..0000000 --- a/pysam/ctabix.pyx +++ /dev/null @@ -1,1206 +0,0 @@ -# cython: embedsignature=True -# cython: profile=True -############################################################################### -############################################################################### -# Cython wrapper for access to tabix indexed files in bgzf format -############################################################################### -# The principal classes and functions defined in this module are: -# -# class TabixFile class wrapping tabix indexed files in bgzf format -# -# class asTuple Parser class for tuples -# class asGT Parser class for GTF formatted rows -# class asBed Parser class for Bed formatted rows -# class asVCF Parser class for VCF formatted rows -# -# class tabix_generic_iterator Streamed iterator of bgzf formatted files -# -# Additionally this module defines several additional classes that are part -# of the internal API. 
These are: -# -# class Parser base class for parsers of tab-separated rows -# class tabix_file_iterator -# class TabixIterator iterator class over rows in bgzf file -# class EmptyIterator -# -# For backwards compatibility, the following classes are also defined: -# -# class Tabixfile equivalent to TabixFile -# -############################################################################### -# -# The MIT License -# -# Copyright (c) 2015 Andreas Heger -# -# Permission is hereby granted, free of charge, to any person obtaining a -# copy of this software and associated documentation files (the "Software"), -# to deal in the Software without restriction, including without limitation -# the rights to use, copy, modify, merge, publish, distribute, sublicense, -# and/or sell copies of the Software, and to permit persons to whom the -# Software is furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -# DEALINGS IN THE SOFTWARE. -# -############################################################################### -import os -import sys - -from libc.stdio cimport printf, fprintf, stderr -from libc.string cimport strerror -from libc.errno cimport errno -from posix.unistd cimport dup - -from cpython cimport PyErr_SetString, PyBytes_Check, \ - PyUnicode_Check, PyBytes_FromStringAndSize, \ - PyObject_AsFileDescriptor - -from cpython.version cimport PY_MAJOR_VERSION - -cimport pysam.ctabixproxies as ctabixproxies - -from pysam.chtslib cimport htsFile, hts_open, hts_close, HTS_IDX_START,\ - BGZF, bgzf_open, bgzf_dopen, bgzf_close, bgzf_write, \ - tbx_index_build, tbx_index_load, tbx_itr_queryi, tbx_itr_querys, \ - tbx_conf_t, tbx_seqnames, tbx_itr_next, tbx_itr_destroy, \ - tbx_destroy, hisremote - -from pysam.cutils cimport force_bytes, force_str, charptr_to_str -from pysam.cutils cimport encode_filename, from_string_and_size - -cdef class Parser: - - def __init__(self, encoding="ascii"): - self.encoding = encoding - - def set_encoding(self, encoding): - self.encoding = encoding - - def get_encoding(self): - return self.encoding - - cdef parse(self, char * buffer, int length): - raise NotImplementedError( - 'parse method of %s not implemented' % str(self)) - - def __call__(self, char * buffer, int length): - return self.parse(buffer, length) - - -cdef class asTuple(Parser): - '''converts a :term:`tabix row` into a python tuple. - - A field in a row is accessed by numeric index. 
- ''' - cdef parse(self, char * buffer, int len): - cdef ctabixproxies.TupleProxy r - r = ctabixproxies.TupleProxy(self.encoding) - # need to copy - there were some - # persistence issues with "present" - r.copy(buffer, len) - return r - - -cdef class asGTF(Parser): - '''converts a :term:`tabix row` into a GTF record with the following - fields: - - +----------+----------+-------------------------------+ - |*Column* |*Name* |*Content* | - +----------+----------+-------------------------------+ - |1 |contig |the chromosome name | - +----------+----------+-------------------------------+ - |2 |feature |The feature type | - +----------+----------+-------------------------------+ - |3 |source |The feature source | - +----------+----------+-------------------------------+ - |4 |start |genomic start coordinate | - | | |(0-based) | - +----------+----------+-------------------------------+ - |5 |end |genomic end coordinate | - | | |(0-based) | - +----------+----------+-------------------------------+ - |6 |score |feature score | - +----------+----------+-------------------------------+ - |7 |strand |strand | - +----------+----------+-------------------------------+ - |8 |frame |frame | - +----------+----------+-------------------------------+ - |9 |attributes|the attribute field | - +----------+----------+-------------------------------+ - - GTF formatted entries also define the following fields that - are derived from the attributes field: - - +--------------------+------------------------------+ - |*Name* |*Content* | - +--------------------+------------------------------+ - |gene_id |the gene identifier | - +--------------------+------------------------------+ - |transcript_id |the transcript identifier | - +--------------------+------------------------------+ - - ''' - cdef parse(self, char * buffer, int len): - cdef ctabixproxies.GTFProxy r - r = ctabixproxies.GTFProxy(self.encoding) - r.copy(buffer, len) - return r - - -cdef class asBed(Parser): - '''converts a :term:`tabix row` into a bed record - with the following fields: - - +-----------+-----------+------------------------------------------+ - |*Column* |*Field* |*Contents* | - | | | | - +-----------+-----------+------------------------------------------+ - |1 |contig |contig | - | | | | - +-----------+-----------+------------------------------------------+ - |2 |start |genomic start coordinate (zero-based) | - +-----------+-----------+------------------------------------------+ - |3 |end |genomic end coordinate plus one | - | | |(zero-based) | - +-----------+-----------+------------------------------------------+ - |4 |name |name of feature. 
-    +-----------+-----------+------------------------------------------+
-    |5          |score      |score of feature                          |
-    +-----------+-----------+------------------------------------------+
-    |6          |strand     |strand of feature                         |
-    +-----------+-----------+------------------------------------------+
-    |7          |thickStart |thickStart                                |
-    +-----------+-----------+------------------------------------------+
-    |8          |thickEnd   |thickEnd                                  |
-    +-----------+-----------+------------------------------------------+
-    |9          |itemRGB    |itemRGB                                   |
-    +-----------+-----------+------------------------------------------+
-    |10         |blockCount |number of blocks                          |
-    +-----------+-----------+------------------------------------------+
-    |11         |blockSizes |',' separated string of block sizes       |
-    +-----------+-----------+------------------------------------------+
-    |12         |blockStarts|',' separated string of block genomic     |
-    |           |           |start positions                           |
-    +-----------+-----------+------------------------------------------+
-
-    Only the first three fields are required. Additional
-    fields are optional, but if one is defined, all the preceding
-    fields need to be defined as well.
-
-    '''
-    cdef parse(self, char * buffer, int len):
-        cdef ctabixproxies.BedProxy r
-        r = ctabixproxies.BedProxy(self.encoding)
-        r.copy(buffer, len)
-        return r
-
-
-cdef class asVCF(Parser):
-    '''converts a :term:`tabix row` into a VCF record with
-    the following fields:
-
-    +----------+---------+------------------------------------+
-    |*Column*  |*Field*  |*Contents*                          |
-    |          |         |                                    |
-    +----------+---------+------------------------------------+
-    |1         |contig   |chromosome                          |
-    +----------+---------+------------------------------------+
-    |2         |pos      |chromosomal position, zero-based    |
-    +----------+---------+------------------------------------+
-    |3         |id       |id                                  |
-    +----------+---------+------------------------------------+
-    |4         |ref      |reference allele                    |
-    +----------+---------+------------------------------------+
-    |5         |alt      |alternate alleles                   |
-    +----------+---------+------------------------------------+
-    |6         |qual     |quality                             |
-    +----------+---------+------------------------------------+
-    |7         |filter   |filter                              |
-    +----------+---------+------------------------------------+
-    |8         |info     |info                                |
-    +----------+---------+------------------------------------+
-    |9         |format   |format specifier.                   |
-    +----------+---------+------------------------------------+
-
-    Access to genotypes is via index::
-
-        contig = vcf.contig
-        first_sample_genotype = vcf[0]
-        second_sample_genotype = vcf[1]
-
-    '''
-    cdef parse(self, char * buffer, int len):
-        cdef ctabixproxies.VCFProxy r
-        r = ctabixproxies.VCFProxy(self.encoding)
-        r.copy(buffer, len)
-        return r
-
-
-cdef class TabixFile:
-    """Random access to bgzf formatted files that
-    have been indexed by :term:`tabix`.
-
-    The file is automatically opened. The index file of file
-    ``<filename>`` is expected to be called ``<filename>.tbi``
-    by default (see parameter `index`).
-
-    Parameters
-    ----------
-
-    filename : string
-        Filename of bgzf file to be opened.
-
-    index : string
-        The filename of the index. If not set, the default is to
-        assume that the index is called ``filename.tbi``
-
-    mode : char
-        The file opening mode. Currently, only ``r`` is permitted.
-
-    parser : :class:`pysam.Parser`
-
-        sets the default parser for this tabix file. If `parser`
-        is None, the results are returned as an unparsed string.
-        Otherwise, `parser` is assumed to be a functor that will return
-        parsed data (see for example :class:`~pysam.asTuple` and
-        :class:`~pysam.asGTF`).
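For typed access, a parser can be set once on the file; attribute access then follows the tables above. A minimal sketch, with a hypothetical ``annotations.gtf.gz``::

    import pysam

    with pysam.TabixFile("annotations.gtf.gz", parser=pysam.asGTF()) as tbx:
        for gtf in tbx.fetch("chr1", 10000, 20000):
            # fixed columns as attributes, plus derived attribute fields
            print(gtf.contig, gtf.start, gtf.end, gtf.gene_id)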
- - encoding : string - - The encoding passed to the parser - - Raises - ------ - - ValueError - if index file is missing. - - IOError - if file could not be opened - """ - def __cinit__(self, - filename, - mode = 'r', - parser=None, - index=None, - encoding="ascii", - *args, - **kwargs ): - - self.tabixfile = NULL - self.parser = parser - self._open(filename, mode, index, *args, **kwargs) - self.encoding = encoding - - def _open( self, - filename, - mode='r', - index=None, - ): - '''open a :term:`tabix file` for reading. - ''' - - assert mode in ("r",), "invalid file opening mode `%s`" % mode - - if self.tabixfile != NULL: - self.close() - self.tabixfile = NULL - - filename_index = index or (filename + ".tbi") - # encode all the strings to pass to tabix - self._filename = encode_filename(filename) - self._filename_index = encode_filename(filename_index) - - self.is_remote = hisremote(self._filename) - - if not self.is_remote: - if not os.path.exists(filename): - raise IOError("file `%s` not found" % filename) - - if not os.path.exists(filename_index): - raise IOError("index `%s` not found" % filename_index) - - # open file - cdef char *cfilename = self._filename - with nogil: - self.tabixfile = hts_open(cfilename, 'r') - - if self.tabixfile == NULL: - raise IOError("could not open file `%s`" % filename) - - cfilename = self._filename_index - with nogil: - self.index = tbx_index_load(cfilename) - - if self.index == NULL: - raise IOError("could not open index for `%s`" % filename) - - def _dup(self): - '''return a copy of this tabix file. - - The file is being re-opened. - ''' - return TabixFile(self._filename, - mode="r", - parser=self.parser, - index=self._filename_index, - encoding=self.encoding) - - def is_open(self): - '''return true if samfile has been opened.''' - return self.tabixfile != NULL - - - def fetch(self, - reference=None, - start=None, - end=None, - region=None, - parser=None, - multiple_iterators=False): - '''fetch one or more rows in a :term:`region` using 0-based - indexing. The region is specified by :term:`reference`, - *start* and *end*. Alternatively, a samtools :term:`region` - string can be supplied. - - Without *reference* or *region* all entries will be fetched. - - If only *reference* is set, all reads matching on *reference* - will be fetched. - - If *parser* is None, the default parser will be used for - parsing. - - Set *multiple_iterators* to true if you will be using multiple - iterators on the same file at the same time. The iterator - returned will receive its own copy of a filehandle to the file - effectively re-opening the file. Re-opening a file creates - some overhead, so beware. 
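A sketch of interleaving two fetch calls; ``multiple_iterators=True`` gives the second iterator its own re-opened file handle (file name hypothetical)::

    import pysam

    tbx = pysam.TabixFile("example.bed.gz")
    a = tbx.fetch("chr1")                           # shares the file handle
    b = tbx.fetch("chr2", multiple_iterators=True)  # re-opened handle of its own
    row_a, row_b = next(a), next(b)                 # safe to interleave
    tbx.close()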
-
-        '''
-        if not self.is_open():
-            raise ValueError("I/O operation on closed file")
-
-        # convert coordinates to region string, which is one-based
-        if reference:
-            if end is not None:
-                if end < 0:
-                    raise ValueError("end out of range (%i)" % end)
-                if start is None:
-                    start = 0
-
-                if start < 0:
-                    raise ValueError("start out of range (%i)" % start)
-                elif start > end:
-                    raise ValueError(
-                        'start (%i) > end (%i)' % (start, end))
-                elif start == end:
-                    return EmptyIterator()
-                else:
-                    region = '%s:%i-%i' % (reference, start + 1, end)
-            elif start is not None:
-                if start < 0:
-                    raise ValueError("start out of range (%i)" % start)
-                region = '%s:%i' % (reference, start + 1)
-            else:
-                region = reference
-
-        # get iterator
-        cdef hts_itr_t * itr
-        cdef char *cstr
-        cdef TabixFile fileobj
-
-        # reopen the same file if necessary
-        if multiple_iterators:
-            fileobj = self._dup()
-        else:
-            fileobj = self
-
-        if region is None:
-            # without region or reference - iterate from start
-            with nogil:
-                itr = tbx_itr_queryi(fileobj.index,
-                                     HTS_IDX_START,
-                                     0,
-                                     0)
-        else:
-            s = force_bytes(region, encoding=fileobj.encoding)
-            cstr = s
-            with nogil:
-                itr = tbx_itr_querys(fileobj.index, cstr)
-
-        if itr == NULL:
-            if region is None:
-                if len(self.contigs) > 0:
-                    # when accessing a tabix file created prior to tabix 1.0
-                    # the full-file iterator is empty.
-                    raise ValueError(
-                        "could not create iterator, possible "
-                        "tabix version mismatch")
-                else:
-                    # possible reason is that the file is empty -
-                    # return an empty iterator
-                    return EmptyIterator()
-            else:
-                raise ValueError(
-                    "could not create iterator for region '%s'" %
-                    region)
-
-        # use default parser if no parser is specified
-        if parser is None:
-            parser = fileobj.parser
-
-        cdef TabixIterator a
-        if parser is None:
-            a = TabixIterator(encoding=fileobj.encoding)
-        else:
-            parser.set_encoding(fileobj.encoding)
-            a = TabixIteratorParsed(parser)
-
-        a.tabixfile = fileobj
-        a.iterator = itr
-
-        return a
-
-    # context manager interface
-    def __enter__(self):
-        return self
-
-    def __exit__(self, exc_type, exc_value, traceback):
-        self.close()
-        return False
-
-    ###############################################################
-    ###############################################################
-    ###############################################################
-    ## properties
-    ###############################################################
-    property closed:
-        """bool indicating the current state of the file object.
-        This is a read-only attribute; the close() method changes the value.
-        """
-        def __get__(self):
-            return not self.is_open()
-
-    property filename:
-        '''filename associated with this object.'''
-        def __get__(self):
-            if not self.is_open():
-                raise ValueError("I/O operation on closed file")
-            return self._filename
-
-    property header:
-        '''the file header.
-
-        The file header consists of the lines at the beginning of a
-        file that are prefixed by the comment character ``#``.
-
-        .. note::
-            The header is returned as an iterator presenting lines
-            without the newline character.
-
-        .. note::
-            The header is only available for local files. For remote
-            files an AttributeError is raised.
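A sketch of reading header lines and indexed contig names from a local file (remote files raise an AttributeError for ``header``; file name hypothetical)::

    import pysam

    with pysam.TabixFile("example.vcf.gz") as tbx:
        for line in tbx.header:  # leading '#' lines, newline stripped
            print(line)
        print(tbx.contigs)       # contig names known to the index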
- - ''' - - def __get__(self): - if self.is_remote: - raise AttributeError( - "the header is not available for remote files") - return GZIteratorHead(self.filename) - - property contigs: - '''list of chromosome names''' - def __get__(self): - cdef char ** sequences - cdef int nsequences - - with nogil: - sequences = tbx_seqnames(self.index, &nsequences) - cdef int x - result = [] - for x from 0 <= x < nsequences: - result.append(force_str(sequences[x])) - - # htslib instructions: - # only free container, not the sequences themselves - free(sequences) - - return result - - def close(self): - ''' - closes the :class:`pysam.TabixFile`.''' - if self.tabixfile != NULL: - hts_close(self.tabixfile) - self.tabixfile = NULL - if self.index != NULL: - tbx_destroy(self.index) - self.index = NULL - - def __dealloc__( self ): - # remember: dealloc cannot call other python methods - # note: no doc string - # note: __del__ is not called. - if self.tabixfile != NULL: - hts_close(self.tabixfile) - self.tabixfile = NULL - if self.index != NULL: - tbx_destroy(self.index) - - -cdef class TabixIterator: - """iterates over rows in *tabixfile* in region - given by *tid*, *start* and *end*. - """ - - def __init__(self, encoding="ascii"): - self.encoding = encoding - - def __iter__(self): - self.buffer.s = NULL - self.buffer.l = 0 - self.buffer.m = 0 - - return self - - cdef int __cnext__(self): - '''iterate to next element. - - Return -5 if file has been closed when this function - was called. - ''' - if self.tabixfile.tabixfile == NULL: - return -5 - - cdef int retval - - while 1: - with nogil: - retval = tbx_itr_next( - self.tabixfile.tabixfile, - self.tabixfile.index, - self.iterator, - &self.buffer) - - if retval < 0: - break - - if self.buffer.s[0] != '#': - break - - return retval - - def __next__(self): - """python version of next(). - - pyrex uses this non-standard name instead of next() - """ - - cdef int retval = self.__cnext__() - if retval == -5: - raise IOError("iteration on closed file") - elif retval < 0: - raise StopIteration - - return charptr_to_str(self.buffer.s, self.encoding) - - def next(self): - return self.__next__() - - def __dealloc__(self): - if self.iterator != NULL: - tbx_itr_destroy(self.iterator) - if self.buffer.s != NULL: - free(self.buffer.s) - - -class EmptyIterator: - '''empty iterator''' - - def __iter__(self): - return self - - def next(self): - raise StopIteration() - - def __next__(self): - raise StopIteration() - - -cdef class TabixIteratorParsed(TabixIterator): - """iterates over mapped reads in a region. - - The *parser* determines the encoding. - - Returns parsed data. - """ - - def __init__(self, - Parser parser): - - TabixIterator.__init__(self) - self.parser = parser - - def __next__(self): - """python version of next(). - - pyrex uses this non-standard name instead of next() - """ - - cdef int retval = self.__cnext__() - if retval == -5: - raise IOError("iteration on closed file") - elif retval < 0: - raise StopIteration - - return self.parser.parse(self.buffer.s, - self.buffer.l) - - -cdef class GZIterator: - def __init__(self, filename, int buffer_size=65536, encoding="ascii"): - '''iterate line-by-line through gzip (or bgzip) - compressed file. 
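A minimal sketch of line-wise iteration over a compressed file (file name hypothetical; gzip or bgzip works, per the docstring above)::

    from pysam import GZIterator

    for line in GZIterator("example.bed.gz"):
        print(line)  # decoded str, without the newline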
- ''' - if not os.path.exists(filename): - raise IOError("No such file or directory: %s" % filename) - - filename = encode_filename(filename) - cdef char *cfilename = filename - with nogil: - self.gzipfile = bgzf_open(cfilename, "r") - self._filename = filename - self.kstream = ks_init(self.gzipfile) - self.encoding = encoding - - self.buffer.l = 0 - self.buffer.m = 0 - self.buffer.s = malloc(buffer_size) - - def __dealloc__(self): - '''close file.''' - if self.gzipfile != NULL: - bgzf_close(self.gzipfile) - self.gzipfile = NULL - if self.buffer.s != NULL: - free(self.buffer.s) - if self.kstream != NULL: - ks_destroy(self.kstream) - - def __iter__(self): - return self - - cdef int __cnext__(self): - cdef int dret = 0 - cdef int retval = 0 - while 1: - with nogil: - retval = ks_getuntil(self.kstream, '\n', &self.buffer, &dret) - - if retval < 0: - break - - return dret - return -1 - - def __next__(self): - """python version of next(). - """ - cdef int retval = self.__cnext__() - if retval < 0: - raise StopIteration - return force_str(self.buffer.s, self.encoding) - - -cdef class GZIteratorHead(GZIterator): - '''iterate line-by-line through gzip (or bgzip) - compressed file returning comments at top of file. - ''' - - def __next__(self): - """python version of next(). - """ - cdef int retval = self.__cnext__() - if retval < 0: - raise StopIteration - if self.buffer.s[0] == '#': - return self.buffer.s - else: - raise StopIteration - - -cdef class GZIteratorParsed(GZIterator): - '''iterate line-by-line through gzip (or bgzip) - compressed file returning comments at top of file. - ''' - - def __init__(self, parser): - self.parser = parser - - def __next__(self): - """python version of next(). - """ - cdef int retval = self.__cnext__() - if retval < 0: - raise StopIteration - - return self.parser.parse(self.buffer.s, - self.buffer.l) - - -def tabix_compress(filename_in, - filename_out, - force=False): - '''compress *filename_in* writing the output to *filename_out*. - - Raise an IOError if *filename_out* already exists, unless *force* - is set. - ''' - - if not force and os.path.exists(filename_out): - raise IOError( - "Filename '%s' already exists, use *force* to " - "overwrite" % filename_out) - - cdef int WINDOW_SIZE - cdef int c, r - cdef void * buffer - cdef BGZF * fp - cdef int fd_src - cdef bint is_empty = True - cdef int O_RDONLY - O_RDONLY = os.O_RDONLY - - WINDOW_SIZE = 64 * 1024 - - fn = encode_filename(filename_out) - cdef char *cfn = fn - with nogil: - fp = bgzf_open(cfn, "w") - if fp == NULL: - raise IOError("could not open '%s' for writing" % filename_out) - - fn = encode_filename(filename_in) - fd_src = open(fn, O_RDONLY) - if fd_src == 0: - raise IOError("could not open '%s' for reading" % filename_in) - - buffer = malloc(WINDOW_SIZE) - c = 1 - - while c > 0: - with nogil: - c = read(fd_src, buffer, WINDOW_SIZE) - if c > 0: - is_empty = False - r = bgzf_write(fp, buffer, c) - if r < 0: - free(buffer) - raise OSError("writing failed") - - free(buffer) - r = bgzf_close(fp) - if r < 0: - raise OSError("error %i when writing to file %s" % (r, filename_out)) - - r = close(fd_src) - # an empty file will return with -1, thus ignore this. - if r < 0: - if not (r == -1 and is_empty): - raise OSError("error %i when closing file %s" % (r, filename_in)) - - -def tabix_index( filename, - force = False, - seq_col = None, - start_col = None, - end_col = None, - preset = None, - meta_char = "#", - zerobased = False, - int min_shift = -1, - ): - '''index tab-separated *filename* using tabix. 
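A typical compress-then-index sketch (file names hypothetical); ``tabix_index`` writes the ``.tbi`` file next to the data::

    import pysam

    # bgzf-compress a plain BED file, then build regions.bed.gz.tbi
    pysam.tabix_compress("regions.bed", "regions.bed.gz", force=True)
    pysam.tabix_index("regions.bed.gz", preset="bed", force=True)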
- - An existing index will not be overwritten unless - *force* is set. - - The index will be built from coordinates - in columns *seq_col*, *start_col* and *end_col*. - - The contents of *filename* have to be sorted by - contig and position - the method does not check - if the file is sorted. - - Column indices are 0-based. Coordinates in the file - are assumed to be 1-based. - - If *preset* is provided, the column coordinates - are taken from a preset. Valid values for preset - are "gff", "bed", "sam", "vcf", psltbl", "pileup". - - Lines beginning with *meta_char* and the first - *line_skip* lines will be skipped. - - If *filename* does not end in ".gz", it will be automatically - compressed. The original file will be removed and only the - compressed file will be retained. - - If *filename* ends in *gz*, the file is assumed to be already - compressed with bgzf. - - *min-shift* sets the minimal interval size to 1<malloc( buffer_size ) -# self.size = buffer_size -# self.parser = parser - -# def __iter__(self): -# return self - -# cdef __cnext__(self): - -# cdef char * b -# cdef size_t nbytes -# b = self.buffer - -# while not feof( self.infile ): -# nbytes = getline( &b, &self.size, self.infile) - -# # stop at first error or eof -# if (nbytes == -1): break -# # skip comments -# if (b[0] == '#'): continue - -# # skip empty lines -# if b[0] == '\0' or b[0] == '\n' or b[0] == '\r': continue - -# # make sure that entry is complete -# if b[nbytes-1] != '\n' and b[nbytes-1] != '\r': -# result = b -# raise ValueError( "incomplete line at %s" % result ) - -# # make sure that this goes fully through C -# # otherwise buffer is copied to/from a -# # Python object causing segfaults as -# # the wrong memory is freed -# return self.parser.parse( b, nbytes ) - -# raise StopIteration - -# def __dealloc__(self): -# free(self.buffer) - -# def __next__(self): -# return self.__cnext__() - -######################################################### -######################################################### -######################################################### -## Iterators for parsing through unindexed files. -######################################################### -# cdef buildGzipError(void *gzfp): -# cdef int errnum = 0 -# cdef char *s = gzerror(gzfp, &errnum) -# return "error (%d): %s (%d: %s)" % (errno, strerror(errno), errnum, s) - - -cdef class tabix_file_iterator: - '''iterate over a compressed or uncompressed ``infile``. - ''' - - def __cinit__(self, - infile, - Parser parser, - int buffer_size=65536): - - if infile.closed: - raise ValueError("I/O operation on closed file.") - - self.infile = infile - - cdef int fd = PyObject_AsFileDescriptor(infile) - if fd == -1: - raise ValueError("I/O operation on closed file.") - - self.duplicated_fd = dup(fd) - - # From the manual: - # gzopen can be used to read a file which is not in gzip format; - # in this case gzread will directly read from the file without decompression. - # When reading, this will be detected automatically by looking - # for the magic two-byte gzip header. 
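A usage sketch: the iterator wraps an already-open file object and duplicates its file descriptor (file name hypothetical)::

    import pysam

    with open("regions.bed.gz", "rb") as f:
        for row in pysam.tabix_file_iterator(f, pysam.asTuple()):
            print(row[0])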
- self.fh = bgzf_dopen(self.duplicated_fd, 'r') - - if self.fh == NULL: - raise IOError('%s' % strerror(errno)) - - self.kstream = ks_init(self.fh) - - self.buffer.s = malloc(buffer_size) - #if self.buffer == NULL: - # raise MemoryError( "tabix_file_iterator: could not allocate %i bytes" % buffer_size) - #self.size = buffer_size - self.parser = parser - - def __iter__(self): - return self - - cdef __cnext__(self): - - cdef char * b - cdef int dret = 0 - cdef int retval = 0 - while 1: - with nogil: - retval = ks_getuntil(self.kstream, '\n', &self.buffer, &dret) - - if retval < 0: - break - #raise IOError('gzip error: %s' % buildGzipError( self.fh )) - - b = self.buffer.s - - # skip comments - if (b[0] == '#'): - continue - - # skip empty lines - if b[0] == '\0' or b[0] == '\n' or b[0] == '\r': - continue - - # gzgets terminates at \n, no need to test - - # parser creates a copy - return self.parser.parse(b, self.buffer.l) - - raise StopIteration - - def __dealloc__(self): - free(self.buffer.s) - ks_destroy(self.kstream) - bgzf_close(self.fh) - - def __next__(self): - return self.__cnext__() - - def next(self): - return self.__cnext__() - - -class tabix_generic_iterator: - '''iterate over ``infile``. - - Permits the use of file-like objects for example from the gzip module. - ''' - def __init__(self, infile, parser): - - self.infile = infile - if self.infile.closed: - raise ValueError("I/O operation on closed file.") - self.parser = parser - - def __iter__(self): - return self - - # cython version - required for python 3 - def __next__(self): - - cdef char * b - cdef char * cpy - cdef size_t nbytes - - encoding = self.parser.get_encoding() - - # note that GzipFile.close() does not close the file - # reading is still possible. - if self.infile.closed: - raise ValueError("I/O operation on closed file.") - - while 1: - - line = self.infile.readline() - if not line: - break - - s = force_bytes(line, encoding) - b = s - nbytes = len(line) - assert b[nbytes] == '\0' - - # skip comments - if b[0] == '#': - continue - - # skip empty lines - if b[0] == '\0' or b[0] == '\n' or b[0] == '\r': - continue - - # make sure that entry is complete - if b[nbytes-1] != '\n' and b[nbytes-1] != '\r': - raise ValueError("incomplete line at %s" % line) - - bytes_cpy = b - cpy = bytes_cpy - - return self.parser(cpy, nbytes) - - raise StopIteration - - # python version - required for python 2.7 - def next(self): - return self.__next__() - -def tabix_iterator(infile, parser): - """return an iterator over all entries in a file. - - Results are returned parsed as specified by the *parser*. If - *parser* is None, the results are returned as an unparsed string. - Otherwise, *parser* is assumed to be a functor that will return - parsed data (see for example :class:`~pysam.asTuple` and - :class:`~pysam.asGTF`). 
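A usage sketch with a file-like object from the gzip module; on Python 3 ``tabix_iterator`` dispatches to ``tabix_generic_iterator`` (file name hypothetical)::

    import gzip
    import pysam

    with gzip.open("regions.bed.gz", "rb") as f:
        for bed in pysam.tabix_iterator(f, pysam.asBed()):
            print(bed.contig, bed.start, bed.end)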
- - """ - if PY_MAJOR_VERSION >= 3: - return tabix_generic_iterator(infile, parser) - else: - return tabix_file_iterator(infile, parser) - - # file objects can use C stdio - # used to be: isinstance( infile, file): - # if PY_MAJOR_VERSION >= 3: - # if isinstance( infile, io.IOBase ): - # return tabix_copy_iterator( infile, parser ) - # else: - # return tabix_generic_iterator( infile, parser ) - # else: -# if isinstance( infile, file ): -# return tabix_copy_iterator( infile, parser ) -# else: -# return tabix_generic_iterator( infile, parser ) - -cdef class Tabixfile(TabixFile): - """Tabixfile is deprecated: use TabixFile instead""" - pass - - -__all__ = [ - "tabix_index", - "tabix_compress", - "TabixFile", - "Tabixfile", - "asTuple", - "asGTF", - "asVCF", - "asBed", - "GZIterator", - "GZIteratorHead", - "tabix_iterator", - "tabix_generic_iterator", - "tabix_file_iterator", -] diff --git a/pysam/ctabixproxies.pxd b/pysam/ctabixproxies.pxd deleted file mode 100644 index 5317b81..0000000 --- a/pysam/ctabixproxies.pxd +++ /dev/null @@ -1,59 +0,0 @@ -#cdef extern from "Python.h": -# ctypedef struct FILE - -from libc.stdint cimport uint8_t, int32_t, uint32_t, int64_t, uint64_t - -cdef class TupleProxy: - - cdef: - char * data - char ** fields - int nfields - int index - int nbytes - int offset - bint is_modified - - cdef encoding - - cpdef int getMaxFields(self) - cpdef int getMinFields(self) -# cdef char * _getindex(self, int idx) - - cdef take(self, char * buffer, size_t nbytes) - cdef present(self, char * buffer, size_t nbytes) - cdef copy(self, char * buffer, size_t nbytes, bint reset=*) - cdef update(self, char * buffer, size_t nbytes) - -cdef class GTFProxy(TupleProxy) : - - cdef: - char * _attributes - cdef bint hasOwnAttributes - - cpdef int getMaxFields(self) - cpdef int getMinFields(self) - cdef char * getAttributes(self) - -cdef class NamedTupleProxy(TupleProxy): - pass - -cdef class BedProxy(NamedTupleProxy): - - cdef: - char * contig - uint32_t start - uint32_t end - int bedfields - - cpdef int getMaxFields(self) - cpdef int getMinFields(self) - cdef update(self, char * buffer, size_t nbytes) - -cdef class VCFProxy(NamedTupleProxy) : - - cdef: - char * contig - uint32_t pos - - cdef update(self, char * buffer, size_t nbytes) diff --git a/pysam/ctabixproxies.pyx b/pysam/ctabixproxies.pyx deleted file mode 100644 index f5288cc..0000000 --- a/pysam/ctabixproxies.pyx +++ /dev/null @@ -1,827 +0,0 @@ -from cpython cimport PyBytes_FromStringAndSize - -from libc.stdio cimport printf, feof, fgets -from libc.string cimport strcpy, strlen, memcmp, memcpy, memchr, strstr, strchr -from libc.stdlib cimport free, malloc, calloc, realloc -from libc.stdlib cimport atoi, atol, atof - -from pysam.cutils cimport force_bytes, force_str, charptr_to_str -from pysam.cutils cimport encode_filename, from_string_and_size - -import collections - -cdef char *StrOrEmpty(char * buffer): - if buffer == NULL: - return "" - else: return buffer - -cdef int isNew(char * p, char * buffer, size_t nbytes): - """return True if `p` is located within `buffer` of size - `nbytes` - """ - if p == NULL: - return 0 - return not (buffer <= p < buffer + nbytes) - - -cdef class TupleProxy: - '''Proxy class for access to parsed row as a tuple. - - This class represents a table row for fast read-access. - - Access to individual fields is via the [] operator. - - Only read-only access is implemented. 
- - ''' - - def __cinit__(self, encoding="ascii"): - self.data = NULL - self.fields = NULL - self.index = 0 - self.nbytes = 0 - self.is_modified = 0 - self.nfields = 0 - # start counting at field offset - self.offset = 0 - self.encoding = encoding - - def __dealloc__(self): - cdef int x - if self.is_modified: - for x from 0 <= x < self.nfields: - if isNew(self.fields[x], self.data, self.nbytes): - free(self.fields[x]) - self.fields[x] = NULL - - if self.data != NULL: - free(self.data) - if self.fields != NULL: - free(self.fields) - - def __copy__(self): - if self.is_modified: - raise NotImplementedError( - "copying modified tuples is not implemented") - cdef TupleProxy n = type(self)() - n.copy(self.data, self.nbytes, reset=True) - return n - - def compare(self, TupleProxy other): - '''return -1,0,1, if contents in this are binary - <,=,> to *other* - - ''' - if self.is_modified or other.is_modified: - raise NotImplementedError( - 'comparison of modified TupleProxies is not implemented') - if self.data == other.data: - return 0 - - if self.nbytes < other.nbytes: - return -1 - elif self.nbytes > other.nbytes: - return 1 - return memcmp(self.data, other.data, self.nbytes) - - def __richcmp__(self, TupleProxy other, int op): - if op == 2: # == operator - return self.compare(other) == 0 - elif op == 3: # != operator - return self.compare(other) != 0 - else: - err_msg = "op {0} isn't implemented yet".format(op) - raise NotImplementedError(err_msg) - - cdef take(self, char * buffer, size_t nbytes): - '''start presenting buffer. - - Take ownership of the pointer. - ''' - self.data = buffer - self.nbytes = nbytes - self.update(buffer, nbytes) - - cdef present(self, char * buffer, size_t nbytes): - '''start presenting buffer. - - Do not take ownership of the pointer. - ''' - self.update(buffer, nbytes) - - cdef copy(self, char * buffer, size_t nbytes, bint reset=False): - '''start presenting buffer of size *nbytes*. - - Buffer is a '\0'-terminated string without the '\n'. - - Take a copy of buffer. - ''' - # +1 for '\0' - cdef int s = sizeof(char) * (nbytes + 1) - self.data = malloc(s) - if self.data == NULL: - raise ValueError("out of memory in TupleProxy.copy()") - memcpy(self.data, buffer, s) - - if reset: - for x from 0 <= x < nbytes: - if self.data[x] == '\0': - self.data[x] = '\t' - - self.update(self.data, nbytes) - - cpdef int getMinFields(self): - '''return minimum number of fields.''' - # 1 is not a valid tabix entry, but TupleProxy - # could be more generic. - return 1 - - cpdef int getMaxFields(self): - '''return maximum number of fields. Return - 0 for unknown length.''' - return 0 - - cdef update(self, char * buffer, size_t nbytes): - '''update internal data. - - *buffer* is a \0 terminated string. - - *nbytes* is the number of bytes in buffer (excluding - the \0) - - Update starts work in buffer, thus can be used - to collect any number of fields until nbytes - is exhausted. - - If max_fields is set, the number of fields is initialized to - max_fields. 
- - ''' - cdef char * pos - cdef char * old_pos - cdef int field - cdef int max_fields, min_fields, x - - assert strlen(buffer) == nbytes, \ - "length of buffer (%i) != number of bytes (%i)" % ( - strlen(buffer), nbytes) - - if buffer[nbytes] != 0: - raise ValueError("incomplete line at %s" % buffer) - - ################################# - # remove line breaks and feeds and update number of bytes - x = nbytes - 1 - while x > 0 and (buffer[x] == '\n' or buffer[x] == '\r'): - buffer[x] = '\0' - x -= 1 - self.nbytes = x + 1 - - ################################# - # clear data - if self.fields != NULL: - free(self.fields) - - for field from 0 <= field < self.nfields: - if isNew(self.fields[field], self.data, self.nbytes): - free(self.fields[field]) - - self.is_modified = self.nfields = 0 - - ################################# - # allocate new - max_fields = self.getMaxFields() - # pre-count fields - better would be - # to guess or dynamically grow - if max_fields == 0: - for x from 0 <= x < nbytes: - if buffer[x] == '\t': - max_fields += 1 - max_fields += 1 - - self.fields = calloc(max_fields, sizeof(char *)) - if self.fields == NULL: - raise ValueError("out of memory in TupleProxy.update()") - - ################################# - # start filling - field = 0 - self.fields[field] = pos = buffer - field += 1 - old_pos = pos - while 1: - - pos = memchr(pos, '\t', nbytes) - if pos == NULL: - break - if field >= max_fields: - raise ValueError( - "parsing error: more than %i fields in line: %s" % - (max_fields, buffer)) - - pos[0] = '\0' - pos += 1 - self.fields[field] = pos - field += 1 - nbytes -= pos - old_pos - if nbytes < 0: - break - old_pos = pos - self.nfields = field - if self.nfields < self.getMinFields(): - raise ValueError( - "parsing error: fewer that %i fields in line: %s" % - (self.getMinFields(), buffer)) - - def _getindex(self, int index): - '''return item at idx index''' - cdef int i = index - if i < 0: - i += self.nfields - if i < 0: - raise IndexError("list index out of range") - # apply offset - separating a fixed number - # of fields from a variable number such as in VCF - i += self.offset - if i >= self.nfields: - raise IndexError( - "list index out of range %i >= %i" % - (i, self.nfields)) - return force_str(self.fields[i], self.encoding) - - def __getitem__(self, key): - if type(key) == int: - return self._getindex(key) - # slice object - start, end, step = key.indices(self.nfields) - result = [] - for index in range(start, end, step): - result.append(self._getindex(index)) - return result - - def _setindex(self, index, value): - '''set item at idx index.''' - cdef int idx = index - if idx < 0: - raise IndexError("list index out of range") - if idx >= self.nfields: - raise IndexError("list index out of range") - - if isNew(self.fields[idx], self.data, self.nbytes): - free(self.fields[idx] ) - - self.is_modified = 1 - - if value is None: - self.fields[idx] = NULL - return - - # conversion with error checking - value = force_bytes(value) - cdef char * tmp = value - self.fields[idx] = malloc((strlen( tmp ) + 1) * sizeof(char)) - if self.fields[idx] == NULL: - raise ValueError("out of memory" ) - strcpy(self.fields[idx], tmp) - - def __setitem__(self, index, value): - '''set item at *index* to *value*''' - cdef int i = index - if i < 0: - i += self.nfields - i += self.offset - - self._setindex(i, value) - - def __len__(self): - return self.nfields - - def __iter__(self): - self.index = 0 - return self - - def __next__(self): - """python version of next(). 
- """ - if self.index >= self.nfields: - raise StopIteration - cdef char * retval = self.fields[self.index] - self.index += 1 - if retval == NULL: - return None - else: - return force_str(retval, self.encoding) - - def __str__(self): - '''return original data''' - # copy and replace \0 bytes with \t characters - cdef char * cpy - if self.is_modified: - # todo: treat NULL values - result = [] - for x in xrange(0, self.nfields): - result.append(StrOrEmpty(self.fields[x]).decode(self.encoding)) - return "\t".join(result) - else: - cpy = calloc(sizeof(char), self.nbytes+1) - if cpy == NULL: - raise ValueError("out of memory") - memcpy(cpy, self.data, self.nbytes+1) - for x from 0 <= x < self.nbytes: - if cpy[x] == '\0': - cpy[x] = '\t' - result = cpy[:self.nbytes] - free(cpy) - r = result.decode(self.encoding) - return r - -def toDot(v): - '''convert value to '.' if None''' - if v is None: - return "." - else: - return str(v) - -def quote(v): - '''return a quoted attribute.''' - if isinstance(v, str): - return '"%s"' % v - else: - return str(v) - - -cdef class GTFProxy(TupleProxy): - '''Proxy class for access to GTF fields. - - This class represents a GTF entry for fast read-access. - Write-access has been added as well, though some care must - be taken. If any of the string fields (contig, source, ...) - are set, the new value is tied to the lifetime of the - argument that was supplied. - - The only exception is the attributes field when set from - a dictionary - this field will manage its own memory. - ''' - - def __cinit__(self): - # automatically calls TupleProxy.__cinit__ - self.hasOwnAttributes = False - self._attributes = NULL - - def __dealloc__(self): - # automatically calls TupleProxy.__dealloc__ - if self.hasOwnAttributes: - free(self._attributes) - - cpdef int getMinFields(self): - '''return minimum number of fields.''' - return 9 - - cpdef int getMaxFields(self): - '''return max number of fields.''' - return 9 - - property contig: - '''contig of feature.''' - def __get__(self): - return self._getindex(0) - def __set__(self, value): - self._setindex(0, value) - - property source: - '''feature source.''' - def __get__(self): - return self._getindex(1) - def __set__(self, value): - if value is None: - value = "." - self._setindex(1, value) - - property feature: - '''feature name.''' - def __get__(self): - return self._getindex(2) - def __set__(self, value): - if value is None: - value = "." - self._setindex(2, value) - - property start: - '''feature start (in 0-based open/closed coordinates).''' - def __get__(self ): - return int( self._getindex(3)) - 1 - def __set__(self, value ): - self._setindex(3, str(value+1)) - - property end: - '''feature end (in 0-based open/closed coordinates).''' - def __get__(self): - return int(self._getindex(4)) - def __set__(self, value): - self._setindex(4, str(value)) - - property score: - '''feature score.''' - def __get__(self): - v = self._getindex(5) - if v == "" or v[0] == '.': - return None - else: - return float(v) - - def __set__(self, value): - if value is None: - value = "." - self._setindex(5, str(value)) - - property strand: - '''feature strand.''' - def __get__(self): - return self._getindex(6) - def __set__(self, value ): - if value is None: - value = "." - self._setindex(6, value) - - property frame: - '''feature frame.''' - def __get__(self): - v = self._getindex(7) - if v == "" or v[0] == '.': - return v - else: - return int(v) - - def __set__(self, value): - if value is None: - value = "." 
- self._setindex(7, str(value)) - - property attributes: - '''feature attributes (as a string).''' - def __get__(self): - if self.hasOwnAttributes: - return force_str(self._attributes) - else: - return force_str(self._getindex(8)) - def __set__( self, value): - if self.hasOwnAttributes: - free(self._attributes) - self._attributes = NULL - self.hasOwnAttributes = False - self._setindex(8, value) - - cdef char * getAttributes(self): - '''return pointer to attributes.''' - cdef char * attributes - if self.hasOwnAttributes: - attributes = self._attributes - else: - attributes = self.fields[8] - if attributes == NULL: - raise KeyError("no attributes defined GTF entry") - return attributes - - def asDict(self): - """parse attributes - return as dict - """ - - # remove comments - attributes = self.attributes - - # separate into fields - # Fields might contain a ";", for example in ENSEMBL GTF file - # for mouse, v78: - # ...; transcript_name "TXNRD2;-001"; .... - # The current heuristic is to split on a semicolon followed by a - # space, see also http://mblab.wustl.edu/GTF22.html - - # Remove white space to prevent a last empty field. - fields = [x.strip() for x in attributes.strip().split("; ")] - - result = collections.OrderedDict() - - for f in fields: - - # strip semicolon (GTF files without a space after the last semicolon) - if f.endswith(";"): - f = f[:-1] - - # split at most once in order to avoid separating - # multi-word values - d = [x.strip() for x in f.split(" ", 1)] - - n,v = d[0], d[1] - if len(d) > 2: - v = d[1:] - - if v[0] == '"' and v[-1] == '"': - v = v[1:-1] - else: - ## try to convert to a value - try: - v = float(v) - v = int(v) - except ValueError: - pass - except TypeError: - pass - - result[n] = v - - return result - - def fromDict(self, d): - '''set attributes from a dictionary.''' - cdef char * p - cdef int l - - # clean up if this field is set twice - if self.hasOwnAttributes: - free(self._attributes) - - aa = [] - for k,v in d.items(): - if isinstance(v, str): - aa.append( '%s "%s"' % (k,v) ) - else: - aa.append( '%s %s' % (k,str(v)) ) - - a = force_bytes("; ".join(aa) + ";") - p = a - l = len(a) - self._attributes = calloc(l + 1, sizeof(char)) - if self._attributes == NULL: - raise ValueError("out of memory") - memcpy(self._attributes, p, l) - - self.hasOwnAttributes = True - self.is_modified = True - - def __str__(self): - cdef char * cpy - cdef int x - - if self.is_modified: - return "\t".join( - (self.contig, - self.source, - self.feature, - str(self.start+1), - str(self.end), - toDot(self.score), - toDot(self.strand), - toDot(self.frame), - self.attributes)) - else: - return TupleProxy.__str__(self) - - def invert(self, int lcontig): - '''invert coordinates to negative strand coordinates - - This method will only act if the feature is on the - negative strand.''' - - if self.strand[0] == '-': - start = min(self.start, self.end) - end = max(self.start, self.end) - self.start, self.end = lcontig - end, lcontig - start - - def keys(self): - '''return a list of attributes defined in this entry.''' - r = self.attributes - return [x.strip().split(" ")[0] - # separator is ';' followed by space - for x in r.split("; ") if x.strip() != ''] - - def __getitem__(self, key): - return self.__getattr__(key) - - def __getattr__(self, item): - """Generic lookup of attribute from GFF/GTF attributes - Only called if there *isn't* an attribute with this name - """ - cdef char * start - cdef char * query - cdef char * cpy - cdef char * end - cdef int l - - # - # important to use the 
getAttributes function. - # Using the self.attributes property to access - # the attributes caused a hard-to-trace bug - # in which fields in the attribute string were - # set to 0. - # Running through valgrind complained that - # memory was accessed in the memory field - # that has been released. It is not clear - # why this happened and might be a cython bug - # (Version 0.16). The valgrind warnings - # disappeard after accessing the C data structures - # directly and so did the bug. - cdef char * attributes = self.getAttributes() - if attributes == NULL: - raise KeyError("key %s not found, no attributes" % item) - - # add space in order to make sure - # to not pick up a field that is a prefix of another field - r = force_bytes(item + " ") - query = r - start = strstr(attributes, query) - - if start == NULL: - raise AttributeError("'GTFProxy' has no attribute '%s'" % item) - - start += strlen(query) - # skip gaps before - while start[0] == ' ': - start += 1 - - if start[0] == '"': - start += 1 - end = start - while end[0] != '\0' and end[0] != '"': - end += 1 - l = end - start - result = force_str(PyBytes_FromStringAndSize(start, l), - self.encoding) - return result - else: - return force_str(start, self.encoding) - - def setAttribute(self, name, value): - '''convenience method to set an attribute.''' - r = self.asDict() - r[name] = value - self.fromDict(r) - - def __cmp__(self, other): - return (self.contig, self.strand, self.start) < \ - (other.contig, other.strand, other.start) - - # python 3 compatibility - def __richcmp__(GTFProxy self, GTFProxy other, int op): - if op == 0: - return (self.contig, self.strand, self.start) < \ - (other.contig, other.strand, other.start) - elif op == 1: - return (self.contig, self.strand, self.start) <= \ - (other.contig, other.strand, other.start) - elif op == 2: - return self.compare(other) == 0 - elif op == 3: - return self.compare(other) != 0 - else: - err_msg = "op {0} isn't implemented yet".format(op) - raise NotImplementedError(err_msg) - - -cdef class NamedTupleProxy(TupleProxy): - - map_key2field = {} - - def __setattr__(self, key, value): - '''set attribute.''' - cdef int idx - idx, f = self.map_key2field[key] - if self.nfields < idx: - raise KeyError("field %s not set" % key) - TupleProxy.__setitem__(self, idx, str(value)) - - def __getattr__(self, key): - cdef int idx - idx, f = self.map_key2field[key] - if self.nfields < idx: - raise KeyError("field %s not set" % key) - if f == str: - return force_str(self.fields[idx], - self.encoding) - return f(self.fields[idx]) - - -cdef class BedProxy(NamedTupleProxy): - '''Proxy class for access to Bed fields. - - This class represents a BED entry for fast read-access. - ''' - map_key2field = { - 'contig' : (0, str), - 'start' : (1, int), - 'end' : (2, int), - 'name' : (3, str), - 'score' : (4, float), - 'strand' : (5, str), - 'thickStart' : (6, int), - 'thickEnd' : (7, int), - 'itemRGB' : (8, str), - 'blockCount': (9, int), - 'blockSizes': (10, str), - 'blockStarts': (11, str), } - - cpdef int getMinFields(self): - '''return minimum number of fields.''' - return 3 - - cpdef int getMaxFields(self): - '''return max number of fields.''' - return 12 - - cdef update(self, char * buffer, size_t nbytes): - '''update internal data. - - nbytes does not include the terminal '\0'. 
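A sketch of GTF attribute handling (file name hypothetical); ``asDict``/``fromDict`` and ``__getattr__`` expose the attributes column::

    import pysam

    with pysam.TabixFile("annotations.gtf.gz", parser=pysam.asGTF()) as tbx:
        gtf = next(tbx.fetch("chr1"))
        attrs = gtf.asDict()               # attribute name -> value
        gene = gtf.gene_id                 # same lookup via __getattr__
        gtf.setAttribute("gene_id", "g1")  # round-trips through asDict()/fromDict()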
- ''' - TupleProxy.update(self, buffer, nbytes) - - if self.nfields < 3: - raise ValueError( - "bed format requires at least three columns") - - # determines bed format - self.bedfields = self.nfields - - # do automatic conversion - self.contig = self.fields[0] - self.start = atoi(self.fields[1]) - self.end = atoi(self.fields[2]) - - # __setattr__ in base class seems to take precedence - # hence implement setters in __setattr__ - #property start: - # def __get__( self ): return self.start - #property end: - # def __get__( self ): return self.end - - def __str__(self): - - cdef int save_fields = self.nfields - # ensure fields to use correct format - self.nfields = self.bedfields - retval = TupleProxy.__str__(self) - self.nfields = save_fields - return retval - - def __setattr__(self, key, value ): - '''set attribute.''' - if key == "start": - self.start = value - elif key == "end": - self.end = value - - cdef int idx - idx, f = self.map_key2field[key] - TupleProxy._setindex(self, idx, str(value) ) - -cdef class VCFProxy(NamedTupleProxy): - '''Proxy class for access to VCF fields. - - The genotypes are accessed via a numeric index. - Sample headers are not available. - ''' - map_key2field = { - 'contig' : (0, str), - 'pos' : (1, int), - 'id' : (2, str), - 'ref' : (3, str), - 'alt' : (4, str), - 'qual' : (5, str), - 'filter' : (6, str), - 'info' : (7, str), - 'format' : (8, str) } - - def __cinit__(self): - # automatically calls TupleProxy.__cinit__ - # start indexed access at genotypes - self.offset = 9 - - cdef update(self, char * buffer, size_t nbytes): - '''update internal data. - - nbytes does not include the terminal '\0'. - ''' - TupleProxy.update(self, buffer, nbytes) - - self.contig = self.fields[0] - # vcf counts from 1 - correct here - self.pos = atoi(self.fields[1]) - 1 - - def __len__(self): - '''return number of genotype fields.''' - return max(0, self.nfields - 9) - - property pos: - '''feature end (in 0-based open/closed coordinates).''' - def __get__(self): - return self.pos - - def __setattr__(self, key, value): - '''set attribute.''' - if key == "pos": - self.pos = value - value += 1 - - cdef int idx - idx, f = self.map_key2field[key] - TupleProxy._setindex(self, idx, str(value)) - diff --git a/pysam/cutils.pxd b/pysam/cutils.pxd deleted file mode 100644 index 81e544a..0000000 --- a/pysam/cutils.pxd +++ /dev/null @@ -1,38 +0,0 @@ -######################################################################### -# Utility functions used across pysam -######################################################################### -cimport cython -from cpython cimport array as c_array - -cpdef parse_region(reference=*, start=*, end=*, region=*) - -######################################################################### -# Utility functions for quality string conversions - -cpdef c_array.array qualitystring_to_array(input_str, int offset=*) -cpdef array_to_qualitystring(c_array.array arr, int offset=*) -cpdef qualities_to_qualitystring(qualities, int offset=*) - -######################################################################## -######################################################################## -######################################################################## -## Python 3 compatibility functions -######################################################################## -cdef charptr_to_str(const char *s, encoding=*) -cdef bytes charptr_to_bytes(const char *s, encoding=*) -cdef charptr_to_str_w_len(const char* s, size_t n, encoding=*) -cdef force_str(object s, 
encoding=*) -cdef bytes force_bytes(object s, encoding=*) -cdef bytes encode_filename(object filename) -cdef from_string_and_size(const char *s, size_t length) - -cdef extern from "pysam_util.h": - - int samtools_main(int argc, char *argv[]) - int bcftools_main(int argc, char *argv[]) - void pysam_set_stderr(int fd) - void pysam_unset_stderr() - void pysam_set_stdout(int fd) - void pysam_set_stdout_fn(const char *) - void pysam_unset_stdout() - void set_optind(int) diff --git a/pysam/cutils.pyx b/pysam/cutils.pyx deleted file mode 100644 index 7510727..0000000 --- a/pysam/cutils.pyx +++ /dev/null @@ -1,371 +0,0 @@ -import types -import sys -import string -import re -import tempfile -import os -import io -from contextlib import contextmanager - -from cpython.version cimport PY_MAJOR_VERSION -from cpython cimport PyBytes_Check, PyUnicode_Check -from cpython cimport array as c_array -from libc.stdlib cimport calloc, free -from libc.string cimport strncpy -from libc.stdio cimport fprintf, stderr, fflush -from libc.stdio cimport stdout as c_stdout -from posix.fcntl cimport open as c_open, O_WRONLY - -##################################################################### -# hard-coded constants -cdef int MAX_POS = 2 << 29 - -################################################################# -# Utility functions for quality string conversions -cpdef c_array.array qualitystring_to_array(input_str, int offset=33): - """convert a qualitystring to an array of quality values.""" - if input_str is None: - return None - qs = force_bytes(input_str) - cdef char i - return c_array.array('B', [i - offset for i in qs]) - - -cpdef array_to_qualitystring(c_array.array qualities, int offset=33): - """convert an array of quality values to a string.""" - if qualities is None: - return None - cdef int x - - cdef c_array.array result - result = c_array.clone(qualities, len(qualities), zero=False) - - for x from 0 <= x < len(qualities): - result[x] = qualities[x] + offset - return force_str(result.tostring()) - - -cpdef qualities_to_qualitystring(qualities, int offset=33): - """convert a list or array of quality scores to the string - representation used in the SAM format. - - Parameters - ---------- - offset : int - offset to be added to the quality scores to arrive at - the characters of the quality string (default=33). 
- - Returns - ------- - string - a quality string - - """ - cdef char x - if qualities is None: - return None - elif isinstance(qualities, c_array.array): - return array_to_qualitystring(qualities, offset=offset) - else: - # tuples and lists - return force_str("".join([chr(x + offset) for x in qualities])) - - -######################################################################## -######################################################################## -######################################################################## -## Python 3 compatibility functions -######################################################################## -cdef bint IS_PYTHON3 = PY_MAJOR_VERSION >= 3 - -cdef from_string_and_size(const char* s, size_t length): - if IS_PYTHON3: - return s[:length].decode("ascii") - else: - return s[:length] - -# filename encoding (copied from lxml.etree.pyx) -cdef str _FILENAME_ENCODING -_FILENAME_ENCODING = sys.getfilesystemencoding() -if _FILENAME_ENCODING is None: - _FILENAME_ENCODING = sys.getdefaultencoding() -if _FILENAME_ENCODING is None: - _FILENAME_ENCODING = 'ascii' - -#cdef char* _C_FILENAME_ENCODING -#_C_FILENAME_ENCODING = _FILENAME_ENCODING - -cdef bytes encode_filename(object filename): - """Make sure a filename is 8-bit encoded (or None).""" - if filename is None: - return None - elif PyBytes_Check(filename): - return filename - elif PyUnicode_Check(filename): - return filename.encode(_FILENAME_ENCODING) - else: - raise TypeError(u"Argument must be string or unicode.") - -cdef bytes force_bytes(object s, encoding="ascii"): - u"""convert string or unicode object to bytes, assuming - ascii encoding. - """ - if not IS_PYTHON3: - return s - elif s is None: - return None - elif PyBytes_Check(s): - return s - elif PyUnicode_Check(s): - return s.encode(encoding) - else: - raise TypeError(u"Argument must be string, bytes or unicode.") - -cdef charptr_to_str(const char* s, encoding="ascii"): - if s == NULL: - return None - if PY_MAJOR_VERSION < 3: - return s - else: - return s.decode(encoding) - -cdef charptr_to_str_w_len(const char* s, size_t n, encoding="ascii"): - if s == NULL: - return None - if PY_MAJOR_VERSION < 3: - return s[:n] - else: - return s[:n].decode(encoding) - -cdef bytes charptr_to_bytes(const char* s, encoding="ascii"): - if s == NULL: - return None - else: - return s - -cdef force_str(object s, encoding="ascii"): - """Return s converted to str type of current Python - (bytes in Py2, unicode in Py3)""" - if s is None: - return None - if PY_MAJOR_VERSION < 3: - return s - elif PyBytes_Check(s): - return s.decode(encoding) - else: - # assume unicode - return s - -cpdef parse_region(reference=None, - start=None, - end=None, - region=None): - """parse alternative ways to specify a genomic region. A region can - either be specified by :term:`reference`, `start` and - `end`. `start` and `end` denote 0-based, half-open - intervals. - - Alternatively, a samtools :term:`region` string can be - supplied. - - If any of the coordinates are missing they will be replaced by the - minimum (`start`) or maximum (`end`) coordinate. - - Note that region strings are 1-based, while `start` and `end` denote - an interval in python coordinates. - - Returns - ------- - - tuple : a tuple of `reference`, `start` and `end`. - - Raises - ------ - - ValueError - for invalid or out of bounds regions. 
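Worked examples of the three quality conversions at the default offset of 33::

    import pysam

    arr = pysam.qualitystring_to_array("II5!")           # array('B', [40, 40, 20, 0])
    qs = pysam.array_to_qualitystring(arr)               # "II5!"
    qs2 = pysam.qualities_to_qualitystring([0, 10, 20])  # "!+5"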
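A sketch of the coordinate handling in ``parse_region``; the import path is an assumption based on this module's name in this release::

    from pysam.cutils import parse_region  # assumed import path

    parse_region("chr1", 9, 30)        # -> (b'chr1', 9, 30)
    parse_region(region="chr1:10-30")  # 1-based string -> (b'chr1', 9, 30)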
- - """ - cdef int rtid - cdef long long rstart - cdef long long rend - - rtid = -1 - rstart = 0 - rend = MAX_POS - if start != None: - try: - rstart = start - except OverflowError: - raise ValueError('start out of range (%i)' % start) - - if end != None: - try: - rend = end - except OverflowError: - raise ValueError('end out of range (%i)' % end) - - if region: - region = force_str(region) - parts = re.split("[:-]", region) - reference = parts[0] - if len(parts) >= 2: - rstart = int(parts[1]) - 1 - if len(parts) >= 3: - rend = int(parts[2]) - - if not reference: - return None, 0, 0 - - if not 0 <= rstart < MAX_POS: - raise ValueError('start out of range (%i)' % rstart) - if not 0 <= rend <= MAX_POS: - raise ValueError('end out of range (%i)' % rend) - if rstart > rend: - raise ValueError( - 'invalid region: start (%i) > end (%i)' % (rstart, rend)) - - return force_bytes(reference), rstart, rend - - -def _pysam_dispatch(collection, - method, - args=None, - catch_stdout=True, - save_stdout=None): - '''call ``method`` in samtools/bcftools providing arguments in args. - - Catching of stdout can be turned off by setting *catch_stdout* to - False. - - ''' - - if method == "index": - if not os.path.exists(args[0]): - raise IOError("No such file or directory: '%s'" % args[0]) - - if args is None: - args = [] - else: - args = list(args) - - # redirect stderr to file - stderr_h, stderr_f = tempfile.mkstemp() - pysam_set_stderr(stderr_h) - - # redirect stdout to file - if save_stdout: - stdout_f = save_stdout - stdout_h = c_open(force_bytes(stdout_f), - O_WRONLY) - if stdout_h == -1: - raise OSError("error while opening {} for writing".format(stdout_f)) - - pysam_set_stdout_fn(force_bytes(stdout_f)) - pysam_set_stdout(stdout_h) - elif catch_stdout: - stdout_h, stdout_f = tempfile.mkstemp() - - MAP_STDOUT_OPTIONS = { - "samtools": { - "view": "-o {}", - "mpileup": "-o {}", - "depad": "-o {}", - "calmd": "", # uses pysam_stdout_fn - }, - "bcftools": {} - } - - stdout_option = None - if collection == "bcftools": - # in bcftools, most methods accept -o, the exceptions - # are below: - if method not in ("index", "roh", "stats"): - stdout_option = "-o {}" - elif method in MAP_STDOUT_OPTIONS[collection]: - stdout_option = MAP_STDOUT_OPTIONS[collection][method] - - if stdout_option is not None: - os.close(stdout_h) - pysam_set_stdout_fn(force_bytes(stdout_f)) - args.extend(stdout_option.format(stdout_f).split(" ")) - else: - pysam_set_stdout(stdout_h) - else: - pysam_set_stdout_fn("-") - - # setup the function call to samtools/bcftools main - cdef char ** cargs - cdef int i, n, retval, l - n = len(args) - method = force_bytes(method) - collection = force_bytes(collection) - args = [force_bytes(a) for a in args] - - # allocate two more for first (dummy) argument (contains command) - cdef int extra_args = 0 - if method == b"index": - extra_args = 1 - # add extra arguments for commands accepting optional arguments - # such as 'samtools index x.bam [out.index]' - cargs = calloc(n + 2 + extra_args, sizeof(char *)) - cargs[0] = collection - cargs[1] = method - - # create copies of strings - getopt for long options permutes - # arguments - for i from 0 <= i < n: - l = len(args[i]) - cargs[i + 2] = calloc(l + 1, sizeof(char)) - strncpy(cargs[i + 2], args[i], l) - - # reset getopt. 
On OsX there getopt reset is different - # between getopt and getopt_long - if method in [b'index', b'cat', b'quickcheck', - b'faidx', b'kprobaln']: - set_optind(1) - else: - set_optind(0) - - # call samtools/bcftools - if collection == b"samtools": - retval = samtools_main(n + 2, cargs) - elif collection == b"bcftools": - retval = bcftools_main(n + 2, cargs) - - for i from 0 <= i < n: - free(cargs[i + 2]) - free(cargs) - - # get error messages - def _collect(fn): - out = [] - try: - with open(fn, "r") as inf: - out = inf.read() - except UnicodeDecodeError: - with open(fn, "rb") as inf: - # read binary output - out = inf.read() - finally: - os.remove(fn) - return out - - pysam_unset_stderr() - out_stderr = _collect(stderr_f) - - if save_stdout: - pysam_unset_stdout() - out_stdout = None - elif catch_stdout: - pysam_unset_stdout() - out_stdout = _collect(stdout_f) - else: - out_stdout = None - - return retval, out_stderr, out_stdout - - -__all__ = ["qualitystring_to_array", - "array_to_qualitystring", - "qualities_to_qualitystring"] diff --git a/pysam/cvcf.pxd b/pysam/cvcf.pxd deleted file mode 100644 index 139597f..0000000 --- a/pysam/cvcf.pxd +++ /dev/null @@ -1,2 +0,0 @@ - - diff --git a/pysam/cvcf.pyx b/pysam/cvcf.pyx deleted file mode 100644 index 5e2fda2..0000000 --- a/pysam/cvcf.pyx +++ /dev/null @@ -1,1203 +0,0 @@ -# cython: embedsignature=True -# -# Code to read, write and edit VCF files -# -# VCF lines are encoded as a dictionary with these keys (note: all lowercase): -# 'chrom': string -# 'pos': integer -# 'id': string -# 'ref': string -# 'alt': list of strings -# 'qual': integer -# 'filter': None (missing value), or list of keys (strings); empty list parsed as ["PASS"] -# 'info': dictionary of values (see below) -# 'format': list of keys (strings) -# sample keys: dictionary of values (see below) -# -# The sample keys are accessible through vcf.getsamples() -# -# A dictionary of values contains value keys (defined in ##INFO or -# ##FORMAT lines) which map to a list, containing integers, floats, -# strings, or characters. Missing values are replaced by a particular -# value, often -1 or . -# -# Genotypes are not stored as a string, but as a list of 1 or 3 -# elements (for haploid and diploid samples), the first (and last) the -# integer representing an allele, and the second the separation -# character. Note that there is just one genotype per sample, but for -# consistency the single element is stored in a list. -# -# Header lines other than ##INFO, ##FORMAT and ##FILTER are stored as -# (key, value) pairs and are accessible through getheader() -# -# The VCF class can be instantiated with a 'regions' variable -# consisting of tuples (chrom,start,end) encoding 0-based half-open -# segments. Only variants with a position inside the segment will be -# parsed. A regions parser is available under parse_regions. -# -# When instantiated, a reference can be passed to the VCF class. This -# may be any class that supports a fetch(chrom, start, end) method. -# -# NOTE: the position that is returned to Python is 0-based, NOT -# 1-based as in the VCF file. -# NOTE: There is also preliminary VCF functionality in the VariantFile class. 
-# -# TODO: -# only v4.0 writing is complete; alleles are not converted to v3.3 format -# - -from collections import namedtuple, defaultdict -from operator import itemgetter -import sys, re, copy, bisect - -from libc.stdlib cimport atoi -from libc.stdint cimport int8_t, int16_t, int32_t, int64_t -from libc.stdint cimport uint8_t, uint16_t, uint32_t, uint64_t - -cimport pysam.ctabix as ctabix -cimport pysam.ctabixproxies as ctabixproxies - -from pysam.cutils cimport force_str - -import pysam - -gtsRegEx = re.compile("[|/\\\\]") -alleleRegEx = re.compile('^[ACGTN]+$') - -# Utility function. Uses 0-based coordinates -def get_sequence(chrom, start, end, fa): - # obtain sequence from .fa file, without truncation - if end<=start: return "" - if not fa: return "N"*(end-start) - if start<0: return "N"*(-start) + get_sequence(chrom, 0, end, fa).upper() - sequence = fa.fetch(chrom, start, end).upper() - if len(sequence) < end-start: sequence += "N"*(end-start-len(sequence)) - return sequence - -# Utility function. Parses a region string -def parse_regions( string ): - result = [] - for r in string.split(','): - elts = r.split(':') - chrom, start, end = elts[0], 0, 3000000000 - if len(elts)==1: pass - elif len(elts)==2: - if len(elts[1])>0: - ielts = elts[1].split('-') - if len(ielts) != 2: ValueError("Don't understand region string '%s'" % r) - try: start, end = int(ielts[0])-1, int(ielts[1]) - except: raise ValueError("Don't understand region string '%s'" % r) - else: - raise ValueError("Don't understand region string '%s'" % r) - result.append( (chrom,start,end) ) - return result - - -FORMAT = namedtuple('FORMAT','id numbertype number type description missingvalue') - -########################################################################################################### -# -# New class -# -########################################################################################################### - -cdef class VCFRecord( ctabixproxies.TupleProxy): - '''vcf record. - - initialized from data and vcf meta - ''' - - cdef vcf - cdef char * contig - cdef uint32_t pos - - def __init__(self, vcf): - self.vcf = vcf - self.encoding = vcf.encoding - - # if len(data) != len(self.vcf._samples): - # self.vcf.error(str(data), - # self.BAD_NUMBER_OF_COLUMNS, - # "expected %s for %s samples (%s), got %s" % \ - # (len(self.vcf._samples), - # len(self.vcf._samples), - # self.vcf._samples, - # len(data))) - - def __cinit__(self, vcf): - # start indexed access at genotypes - self.offset = 9 - - self.vcf = vcf - self.encoding = vcf.encoding - - def error(self, line, error, opt=None): - '''raise error.''' - # pass to vcf file for error handling - return self.vcf.error(line, error, opt) - - cdef update(self, char * buffer, size_t nbytes): - '''update internal data. - - nbytes does not include the terminal '\0'. 
- ''' - ctabixproxies.TupleProxy.update(self, buffer, nbytes) - - self.contig = self.fields[0] - # vcf counts from 1 - correct here - self.pos = atoi(self.fields[1]) - 1 - - def __len__(self): - return max(0, self.nfields - 9) - - property contig: - def __get__(self): return self.contig - - property pos: - def __get__(self): return self.pos - - property id: - def __get__(self): return self.fields[2] - - property ref: - def __get__(self): - return self.fields[3] - - property alt: - def __get__(self): - # convert v3.3 to v4.0 alleles below - alt = self.fields[4] - if alt == ".": alt = [] - else: alt = alt.upper().split(',') - return alt - - property qual: - def __get__(self): - qual = self.fields[5] - if qual == b".": qual = -1 - else: - try: qual = float(qual) - except: self.vcf.error(str(self),self.QUAL_NOT_NUMERICAL) - return qual - - property filter: - def __get__(self): - f = self.fields[6] - # postpone checking that filters exist. Encode missing filter or no filtering as empty list - if f == b"." or f == b"PASS" or f == b"0": return [] - else: return f.split(';') - - property info: - def __get__(self): - col = self.fields[7] - # dictionary of keys, and list of values - info = {} - if col != b".": - for blurp in col.split(';'): - elts = blurp.split('=') - if len(elts) == 1: v = None - elif len(elts) == 2: v = elts[1] - else: self.vcf.error(str(self),self.ERROR_INFO_STRING) - info[elts[0]] = self.vcf.parse_formatdata(elts[0], v, self.vcf._info, str(self.vcf)) - return info - - property format: - def __get__(self): - return self.fields[8].split(':') - - property samples: - def __get__(self): - return self.vcf._samples - - def __getitem__(self, key): - - # parse sample columns - values = self.fields[self.vcf._sample2column[key]].split(':') - alt = self.alt - format = self.format - - if len(values) > len(format): - self.vcf.error(str(self.line),self.BAD_NUMBER_OF_VALUES,"(found %s values in element %s; expected %s)" %\ - (len(values),key,len(format))) - - result = {} - for idx in range(len(format)): - expected = self.vcf.get_expected(format[idx], self.vcf._format, alt) - if idx < len(values): value = values[idx] - else: - if expected == -1: value = "." 
-                else: value = ",".join(["."]*expected)
-
-            result[format[idx]] = self.vcf.parse_formatdata(format[idx], value, self.vcf._format, str(self.data))
-            if expected != -1 and len(result[format[idx]]) != expected:
-                self.vcf.error(str(self.data),self.BAD_NUMBER_OF_PARAMETERS,
-                               "id=%s, expected %s parameters, got %s" % (format[idx],expected,result[format[idx]]))
-                if len(result[format[idx]] ) < expected: result[format[idx]] += [result[format[idx]][-1]]*(expected-len(result[format[idx]]))
-                result[format[idx]] = result[format[idx]][:expected]
-
-        return result
-
-
-cdef class asVCFRecord(ctabix.Parser):
-    '''converts a :term:`tabix row` into a VCF record.'''
-    cdef vcffile
-    def __init__(self, vcffile):
-        self.vcffile = vcffile
-
-    cdef parse(self, char * buffer, int len):
-        cdef VCFRecord r
-        r = VCFRecord(self.vcffile)
-        r.copy(buffer, len)
-        return r
-
-class VCF(object):
-
-    # types
-    NT_UNKNOWN = 0
-    NT_NUMBER = 1
-    NT_ALLELES = 2
-    NT_NR_ALLELES = 3
-    NT_GENOTYPES = 4
-    NT_PHASED_GENOTYPES = 5
-
-    _errors = { 0:"UNKNOWN_FORMAT_STRING:Unknown file format identifier",
-                1:"BADLY_FORMATTED_FORMAT_STRING:Formatting error in the format string",
-                2:"BADLY_FORMATTED_HEADING:Did not find 9 required headings (CHROM, POS, ..., FORMAT) %s",
-                3:"BAD_NUMBER_OF_COLUMNS:Wrong number of columns found (%s)",
-                4:"POS_NOT_NUMERICAL:Position column is not numerical",
-                5:"UNKNOWN_CHAR_IN_REF:Unknown character in reference field",
-                6:"V33_BAD_REF:Reference should be single-character in v3.3 VCF",
-                7:"V33_BAD_ALLELE:Cannot interpret allele for v3.3 VCF",
-                8:"POS_NOT_POSITIVE:Position field must be >0",
-                9:"QUAL_NOT_NUMERICAL:Quality field must be numerical, or '.'",
-                10:"ERROR_INFO_STRING:Error while parsing info field",
-                11:"ERROR_UNKNOWN_KEY:Unknown key (%s) found in formatted field (info; format; or filter)",
-                12:"ERROR_FORMAT_NOT_NUMERICAL:Expected integer or float in formatted field; got %s",
-                13:"ERROR_FORMAT_NOT_CHAR:Expected character in formatted field; got string",
-                14:"FILTER_NOT_DEFINED:Identifier (%s) in filter found which was not defined in header",
-                15:"FORMAT_NOT_DEFINED:Identifier (%s) in format found which was not defined in header",
-                16:"BAD_NUMBER_OF_VALUES:Found too many values in sample column (%s)",
-                17:"BAD_NUMBER_OF_PARAMETERS:Found unexpected number of parameters (%s)",
-                18:"BAD_GENOTYPE:Cannot parse genotype (%s)",
-                19:"V40_BAD_ALLELE:Bad allele found for v4.0 VCF (%s)",
-                20:"MISSING_REF:Reference allele missing",
-                21:"V33_UNMATCHED_DELETION:Deleted sequence does not match reference (%s)",
-                22:"V40_MISSING_ANGLE_BRACKETS:Format definition is not delimited by angle brackets",
-                23:"FORMAT_MISSING_QUOTES:Description field in format definition is not surrounded by quotes",
-                24:"V40_FORMAT_MUST_HAVE_NAMED_FIELDS:Fields in v4.0 VCF format definition must have named fields",
-                25:"HEADING_NOT_SEPARATED_BY_TABS:Heading line appears separated by spaces, not tabs",
-                26:"WRONG_REF:Wrong reference %s",
-                27:"ERROR_TRAILING_DATA:Numerical field ('%s') has semicolon-separated trailing data",
-                28:"BAD_CHR_TAG:Error calculating chr tag for %s",
-                29:"ZERO_LENGTH_ALLELE:Found zero-length allele",
-                30:"MISSING_INDEL_ALLELE_REF_BASE:Indel alleles must begin with single reference base",
-                31:"ZERO_FOR_NON_FLAG_FIELD: number set to 0, but type is not 'FLAG'",
-                32:"ERROR_FORMAT_NOT_INTEGER:Expected integer in formatted field; got %s",
-                33:"ERROR_FLAG_HAS_VALUE:Flag fields should not have a value",
-                }
-
-    # tag-value pairs; tags are not unique; does not include fileformat,
INFO, FILTER or FORMAT fields - _header = [] - - # version number; 33=v3.3; 40=v4.0 - _version = 40 - - # info, filter and format data - _info = {} - _filter = {} - _format = {} - - # header; and required columns - _required = ["CHROM","POS","ID","REF","ALT","QUAL","FILTER","INFO","FORMAT"] - _samples = [] - - # control behaviour - _ignored_errors = set([11,31]) # ERROR_UNKNOWN_KEY, ERROR_ZERO_FOR_NON_FLAG_FIELD - _warn_errors = set([]) - _leftalign = False - - # reference sequence - _reference = None - - # regions to include; None includes everything - _regions = None - - # statefull stuff - _lineno = -1 - _line = None - _lines = None - - def __init__(self, _copy=None, reference=None, regions=None, - lines=None, leftalign=False): - # make error identifiers accessible by name - for id in self._errors.keys(): - self.__dict__[self._errors[id].split(':')[0]] = id - if _copy != None: - self._leftalign = _copy._leftalign - self._header = _copy._header[:] - self._version = _copy._version - self._info = copy.deepcopy(_copy._info) - self._filter = copy.deepcopy(_copy._filter) - self._format = copy.deepcopy(_copy._format) - self._samples = _copy._samples[:] - self._sample2column = copy.deepcopy(_copy._sample2column) - self._ignored_errors = copy.deepcopy(_copy._ignored_errors) - self._warn_errors = copy.deepcopy(_copy._warn_errors) - self._reference = _copy._reference - self._regions = _copy._regions - if reference: self._reference = reference - if regions: self._regions = regions - if leftalign: self._leftalign = leftalign - self._lines = lines - self.encoding = "ascii" - self.tabixfile = None - - def error(self,line,error,opt=None): - if error in self._ignored_errors: return - errorlabel, errorstring = self._errors[error].split(':') - if opt: errorstring = errorstring % opt - errwarn = ["Error","Warning"][error in self._warn_errors] - errorstring += " in line %s: '%s'\n%s %s: %s\n" % (self._lineno,line,errwarn,errorlabel,errorstring) - if error in self._warn_errors: return - raise ValueError(errorstring) - - def parse_format(self,line,format,filter=False): - if self._version == 40: - if not format.startswith('<'): - self.error(line,self.V40_MISSING_ANGLE_BRACKETS) - format = "<"+format - if not format.endswith('>'): - self.error(line,self.V40_MISSING_ANGLE_BRACKETS) - format += ">" - format = format[1:-1] - data = {'id':None,'number':None,'type':None,'descr':None} - idx = 0 - while len(format.strip())>0: - elts = format.strip().split(',') - first, rest = elts[0], ','.join(elts[1:]) - if first.find('=') == -1 or (first.find('"')>=0 and first.find('=') > first.find('"')): - if self._version == 40: self.error(line,self.V40_FORMAT_MUST_HAVE_NAMED_FIELDS) - if idx == 4: self.error(line,self.BADLY_FORMATTED_FORMAT_STRING) - first = ["ID=","Number=","Type=","Description="][idx] + first - if first.startswith('ID='): data['id'] = first.split('=')[1] - elif first.startswith('Number='): data['number'] = first.split('=')[1] - elif first.startswith('Type='): data['type'] = first.split('=')[1] - elif first.startswith('Description='): - elts = format.split('"') - if len(elts)<3: - self.error(line,self.FORMAT_MISSING_QUOTES) - elts = first.split('=') + [rest] - data['descr'] = elts[1] - rest = '"'.join(elts[2:]) - if rest.startswith(','): rest = rest[1:] - else: - self.error(line,self.BADLY_FORMATTED_FORMAT_STRING) - format = rest - idx += 1 - if filter and idx==1: idx=3 # skip number and type fields for FILTER format strings - if not data['id']: self.error(line,self.BADLY_FORMATTED_FORMAT_STRING) - if 'descr' 
not in data: - # missing description - self.error(line,self.BADLY_FORMATTED_FORMAT_STRING) - data['descr'] = "" - if not data['type'] and not data['number']: - # fine, ##filter format - return FORMAT(data['id'],self.NT_NUMBER,0,"Flag",data['descr'],'.') - if not data['type'] in ["Integer","Float","Character","String","Flag"]: - self.error(line,self.BADLY_FORMATTED_FORMAT_STRING) - # I would like a missing-value field, but it isn't there - if data['type'] in ['Integer','Float']: data['missing'] = None # Do NOT use arbitrary int/float as missing value - else: data['missing'] = '.' - if not data['number']: self.error(line,self.BADLY_FORMATTED_FORMAT_STRING) - try: - n = int(data['number']) - t = self.NT_NUMBER - except ValueError: - n = -1 - if data['number'] == '.': t = self.NT_UNKNOWN - elif data['number'] == '#alleles': t = self.NT_ALLELES - elif data['number'] == '#nonref_alleles': t = self.NT_NR_ALLELES - elif data['number'] == '#genotypes': t = self.NT_GENOTYPES - elif data['number'] == '#phased_genotypes': t = self.NT_PHASED_GENOTYPES - elif data['number'] == '#phased_genotypes': t = self.NT_PHASED_GENOTYPES - # abbreviations added in VCF version v4.1 - elif data['number'] == 'A': t = self.NT_ALLELES - elif data['number'] == 'G': t = self.NT_GENOTYPES - else: - self.error(line,self.BADLY_FORMATTED_FORMAT_STRING) - # if number is 0 - type must be Flag - if n == 0 and data['type'] != 'Flag': - self.error( line, self.ZERO_FOR_NON_FLAG_FIELD) - # force type 'Flag' if no number - data['type'] = 'Flag' - - return FORMAT(data['id'],t,n,data['type'],data['descr'],data['missing']) - - def format_format( self, fmt, filter=False ): - values = [('ID',fmt.id)] - if fmt.number != None and not filter: - if fmt.numbertype == self.NT_UNKNOWN: nmb = "." - elif fmt.numbertype == self.NT_NUMBER: nmb = str(fmt.number) - elif fmt.numbertype == self.NT_ALLELES: nmb = "#alleles" - elif fmt.numbertype == self.NT_NR_ALLELES: nmb = "#nonref_alleles" - elif fmt.numbertype == self.NT_GENOTYPES: nmb = "#genotypes" - elif fmt.numbertype == self.NT_PHASED_GENOTYPES: nmb = "#phased_genotypes" - else: - raise ValueError("Unknown number type encountered: %s" % fmt.numbertype) - values.append( ('Number',nmb) ) - values.append( ('Type', fmt.type) ) - values.append( ('Description', '"' + fmt.description + '"') ) - if self._version == 33: - format = ",".join([v for k,v in values]) - else: - format = "<" + (",".join( ["%s=%s" % (k,v) for (k,v) in values] )) + ">" - return format - - def get_expected(self, format, formatdict, alt): - fmt = formatdict[format] - if fmt.numbertype == self.NT_UNKNOWN: return -1 - if fmt.numbertype == self.NT_NUMBER: return fmt.number - if fmt.numbertype == self.NT_ALLELES: return len(alt)+1 - if fmt.numbertype == self.NT_NR_ALLELES: return len(alt) - if fmt.numbertype == self.NT_GENOTYPES: return ((len(alt)+1)*(len(alt)+2)) // 2 - if fmt.numbertype == self.NT_PHASED_GENOTYPES: return (len(alt)+1)*(len(alt)+1) - return 0 - - - def _add_definition(self, formatdict, key, data, line ): - if key in formatdict: return - self.error(line,self.ERROR_UNKNOWN_KEY,key) - if data == None: - formatdict[key] = FORMAT(key,self.NT_NUMBER,0,"Flag","(Undefined tag)",".") - return - if data == []: data = [""] # unsure what type -- say string - if type(data[0]) == type(0.0): - formatdict[key] = FORMAT(key,self.NT_UNKNOWN,-1,"Float","(Undefined tag)",None) - return - if type(data[0]) == type(0): - formatdict[key] = FORMAT(key,self.NT_UNKNOWN,-1,"Integer","(Undefined tag)",None) - return - formatdict[key] = 
FORMAT(key,self.NT_UNKNOWN,-1,"String","(Undefined tag)",".") - - - # todo: trim trailing missing values - def format_formatdata( self, data, format, key=True, value=True, separator=":" ): - output, sdata = [], [] - if type(data) == type([]): # for FORMAT field, make data with dummy values - d = {} - for k in data: d[k] = [] - data = d - # convert missing values; and silently add definitions if required - for k in data: - self._add_definition( format, k, data[k], "(output)" ) - for idx,v in enumerate(data[k]): - if v == format[k].missingvalue: data[k][idx] = "." - # make sure GT comes first; and ensure fixed ordering; also convert GT data back to string - for k in data: - if k != 'GT': sdata.append( (k,data[k]) ) - sdata.sort() - if 'GT' in data: - sdata = [('GT',map(self.convertGTback,data['GT']))] + sdata - for k,v in sdata: - if v == []: v = None - if key and value: - if v != None: output.append( k+"="+','.join(map(str,v)) ) - else: output.append( k ) - elif key: output.append(k) - elif value: - if v != None: output.append( ','.join(map(str,v)) ) - else: output.append( "." ) # should not happen - # snip off trailing missing data - while len(output) > 1: - last = output[-1].replace(',','').replace('.','') - if len(last)>0: break - output = output[:-1] - return separator.join(output) - - - def enter_default_format(self): - for f in [FORMAT('GT',self.NT_NUMBER,1,'String','Genotype','.'), - FORMAT('DP',self.NT_NUMBER,1,'Integer','Read depth at this position for this sample',-1), - FORMAT('FT',self.NT_NUMBER,1,'String','Sample Genotype Filter','.'), - FORMAT('GL',self.NT_UNKNOWN,-1,'Float','Genotype likelihoods','.'), - FORMAT('GLE',self.NT_UNKNOWN,-1,'Float','Genotype likelihoods','.'), - FORMAT('GQ',self.NT_NUMBER,1,'Integer','Genotype Quality',-1), - FORMAT('PL',self.NT_GENOTYPES,-1,'Integer','Phred-scaled genotype likelihoods', '.'), - FORMAT('GP',self.NT_GENOTYPES,-1,'Float','Genotype posterior probabilities','.'), - FORMAT('GQ',self.NT_GENOTYPES,-1,'Integer','Conditional genotype quality','.'), - FORMAT('HQ',self.NT_UNKNOWN,-1,'Integer','Haplotype Quality',-1), # unknown number, since may be haploid - FORMAT('PS',self.NT_UNKNOWN,-1,'Integer','Phase set','.'), - FORMAT('PQ',self.NT_NUMBER,1,'Integer','Phasing quality',-1), - FORMAT('EC',self.NT_ALLELES,1,'Integer','Expected alternate allel counts',-1), - FORMAT('MQ',self.NT_NUMBER,1,'Integer','RMS mapping quality',-1), - ]: - if f.id not in self._format: - self._format[f.id] = f - - def parse_header(self, line): - - assert line.startswith('##') - elts = line[2:].split('=') - key = elts[0].strip() - value = '='.join(elts[1:]).strip() - if key == "fileformat": - if value == "VCFv3.3": - self._version = 33 - elif value == "VCFv4.0": - self._version = 40 - elif value == "VCFv4.1": - # AH - for testing - self._version = 40 - elif value == "VCFv4.2": - # AH - for testing - self._version = 40 - else: - self.error(line,self.UNKNOWN_FORMAT_STRING) - elif key == "INFO": - f = self.parse_format(line, value) - self._info[ f.id ] = f - elif key == "FILTER": - f = self.parse_format(line, value, filter=True) - self._filter[ f.id ] = f - elif key == "FORMAT": - f = self.parse_format(line, value) - self._format[ f.id ] = f - else: - # keep other keys in the header field - self._header.append( (key,value) ) - - - def write_header( self, stream ): - stream.write("##fileformat=VCFv%s.%s\n" % (self._version // 10, self._version % 10)) - for key,value in self._header: stream.write("##%s=%s\n" % (key,value)) - for var,label in 
[(self._info,"INFO"),(self._filter,"FILTER"),(self._format,"FORMAT")]: - for f in var.itervalues(): stream.write("##%s=%s\n" % (label,self.format_format(f,filter=(label=="FILTER")))) - - - def parse_heading( self, line ): - assert line.startswith('#') - assert not line.startswith('##') - headings = line[1:].split('\t') - # test for 8, as FORMAT field might be missing - if len(headings)==1 and len(line[1:].split()) >= 8: - self.error(line,self.HEADING_NOT_SEPARATED_BY_TABS) - headings = line[1:].split() - - for i,s in enumerate(self._required): - - if len(headings)<=i or headings[i] != s: - - if len(headings) <= i: - err = "(%sth entry not found)" % (i+1) - else: - err = "(found %s, expected %s)" % (headings[i],s) - - #self.error(line,self.BADLY_FORMATTED_HEADING,err) - # allow FORMAT column to be absent - if len(headings) == 8: - headings.append("FORMAT") - else: - self.error(line,self.BADLY_FORMATTED_HEADING,err) - - self._samples = headings[9:] - self._sample2column = dict( [(y,x+9) for x,y in enumerate( self._samples ) ] ) - - def write_heading( self, stream ): - stream.write("#" + "\t".join(self._required + self._samples) + "\n") - - def convertGT(self, GTstring): - if GTstring == ".": return ["."] - try: - gts = gtsRegEx.split(GTstring) - if len(gts) == 1: return [int(gts[0])] - if len(gts) != 2: raise ValueError() - if gts[0] == "." and gts[1] == ".": return [gts[0],GTstring[len(gts[0]):-len(gts[1])],gts[1]] - return [int(gts[0]),GTstring[len(gts[0]):-len(gts[1])],int(gts[1])] - except ValueError: - self.error(self._line,self.BAD_GENOTYPE,GTstring) - return [".","|","."] - - def convertGTback(self, GTdata): - return ''.join(map(str,GTdata)) - - def parse_formatdata( self, key, value, formatdict, line ): - # To do: check that the right number of values is present - f = formatdict.get(key,None) - if f == None: - self._add_definition(formatdict, key, value, line ) - f = formatdict[key] - if f.type == "Flag": - if value is not None: self.error(line,self.ERROR_FLAG_HAS_VALUE) - return [] - values = value.split(',') - # deal with trailing data in some early VCF files - if f.type in ["Float","Integer"] and len(values)>0 and values[-1].find(';') > -1: - self.error(line,self.ERROR_TRAILING_DATA,values[-1]) - values[-1] = values[-1].split(';')[0] - if f.type == "Integer": - for idx,v in enumerate(values): - try: - if v == ".": values[idx] = f.missingvalue - else: values[idx] = int(v) - except: - self.error(line,self.ERROR_FORMAT_NOT_INTEGER,"%s=%s" % (key, str(values))) - return [0] * len(values) - return values - elif f.type == "String": - self._line = line - if f.id == "GT": values = list(map( self.convertGT, values )) - return values - elif f.type == "Character": - for v in values: - if len(v) != 1: self.error(line,self.ERROR_FORMAT_NOT_CHAR) - return values - elif f.type == "Float": - for idx,v in enumerate(values): - if v == ".": values[idx] = f.missingvalue - try: return list(map(float,values)) - except: - self.error(line,self.ERROR_FORMAT_NOT_NUMERICAL,"%s=%s" % (key, str(values))) - return [0.0] * len(values) - else: - # can't happen - self.error(line,self.ERROR_INFO_STRING) - - def inregion(self, chrom, pos): - if not self._regions: return True - for r in self._regions: - if r[0] == chrom and r[1] <= pos < r[2]: return True - return False - - def parse_data( self, line, lineparse=False ): - cols = line.split('\t') - if len(cols) != len(self._samples)+9: - # gracefully deal with absent FORMAT column - # and those missing samples - if len(cols) == 8: - cols.append("") - else: - 
self.error(line, - self.BAD_NUMBER_OF_COLUMNS, - "expected %s for %s samples (%s), got %s" % (len(self._samples)+9, len(self._samples), self._samples, len(cols))) - - chrom = cols[0] - - # get 0-based position - try: pos = int(cols[1])-1 - except: self.error(line,self.POS_NOT_NUMERICAL) - if pos < 0: self.error(line,self.POS_NOT_POSITIVE) - - # implement filtering - if not self.inregion(chrom,pos): return None - - # end of first-pass parse for sortedVCF - if lineparse: return chrom, pos, line - - id = cols[2] - - ref = cols[3].upper() - if ref == ".": - self.error(line,self.MISSING_REF) - if self._version == 33: ref = get_sequence(chrom,pos,pos+1,self._reference) - else: ref = "" - else: - for c in ref: - if c not in "ACGTN": self.error(line,self.UNKNOWN_CHAR_IN_REF) - if "N" in ref: ref = get_sequence(chrom,pos,pos+len(ref),self._reference) - - # make sure reference is sane - if self._reference: - left = max(0,pos-100) - faref_leftflank = get_sequence(chrom,left,pos+len(ref),self._reference) - faref = faref_leftflank[pos-left:] - if faref != ref: self.error(line,self.WRONG_REF,"(reference is %s, VCF says %s)" % (faref,ref)) - ref = faref - - # convert v3.3 to v4.0 alleles below - if cols[4] == ".": alt = [] - else: alt = cols[4].upper().split(',') - - if cols[5] == ".": qual = -1 - else: - try: qual = float(cols[5]) - except: self.error(line,self.QUAL_NOT_NUMERICAL) - - # postpone checking that filters exist. Encode missing filter or no filtering as empty list - if cols[6] == "." or cols[6] == "PASS" or cols[6] == "0": filter = [] - else: filter = cols[6].split(';') - - # dictionary of keys, and list of values - info = {} - if cols[7] != ".": - for blurp in cols[7].split(';'): - elts = blurp.split('=') - if len(elts) == 1: v = None - elif len(elts) == 2: v = elts[1] - else: self.error(line,self.ERROR_INFO_STRING) - info[elts[0]] = self.parse_formatdata(elts[0], - v, - self._info, - line) - - # Gracefully deal with absent FORMAT column - if cols[8] == "": format = [] - else: format = cols[8].split(':') - - # check: all filters are defined - for f in filter: - if f not in self._filter: self.error(line,self.FILTER_NOT_DEFINED, f) - - # check: format fields are defined - if self._format: - for f in format: - if f not in self._format: self.error(line,self.FORMAT_NOT_DEFINED, f) - - # convert v3.3 alleles - if self._version == 33: - if len(ref) != 1: self.error(line,self.V33_BAD_REF) - newalts = [] - have_deletions = False - for a in alt: - if len(a) == 1: a = a + ref[1:] # SNP; add trailing reference - elif a.startswith('I'): a = ref[0] + a[1:] + ref[1:] # insertion just beyond pos; add first and trailing reference - elif a.startswith('D'): # allow D and D - have_deletions = True - try: - l = int(a[1:]) # throws ValueError if sequence - if len(ref) < l: # add to reference if necessary - addns = get_sequence(chrom,pos+len(ref),pos+l,self._reference) - ref += addns - for i,na in enumerate(newalts): newalts[i] = na+addns - a = ref[l:] # new deletion, deleting pos...pos+l - except ValueError: - s = a[1:] - if len(ref) < len(s): # add Ns to reference if necessary - addns = get_sequence(chrom,pos+len(ref),pos+len(s),self._reference) - if not s.endswith(addns) and addns != 'N'*len(addns): - self.error(line,self.V33_UNMATCHED_DELETION, - "(deletion is %s, reference is %s)" % (a,get_sequence(chrom,pos,pos+len(s),self._reference))) - ref += addns - for i,na in enumerate(newalts): newalts[i] = na+addns - a = ref[len(s):] # new deletion, deleting from pos - else: - self.error(line,self.V33_BAD_ALLELE) - 
newalts.append(a) - alt = newalts - # deletion alleles exist, add dummy 1st reference allele, and account for leading base - if have_deletions: - if pos == 0: - # Petr Danacek's: we can't have a leading nucleotide at (1-based) position 1 - addn = get_sequence(chrom,pos+len(ref),pos+len(ref)+1,self._reference) - ref += addn - alt = [allele+addn for allele in alt] - else: - addn = get_sequence(chrom,pos-1,pos,self._reference) - ref = addn + ref - alt = [addn + allele for allele in alt] - pos -= 1 - else: - # format v4.0 -- just check for nucleotides - for allele in alt: - if not alleleRegEx.match(allele): - self.error(line,self.V40_BAD_ALLELE,allele) - - # check for leading nucleotide in indel calls - for allele in alt: - if len(allele) != len(ref): - if len(allele) == 0: self.error(line,self.ZERO_LENGTH_ALLELE) - if ref[0].upper() != allele[0].upper() and "N" not in (ref[0]+allele[0]).upper(): - self.error(line,self.MISSING_INDEL_ALLELE_REF_BASE) - - # trim trailing bases in alleles - # AH: not certain why trimming this needs to be added - # disabled now for unit testing - # if alt: - # for i in range(1,min(len(ref),min(map(len,alt)))): - # if len(set(allele[-1].upper() for allele in alt)) > 1 or ref[-1].upper() != alt[0][-1].upper(): - # break - # ref, alt = ref[:-1], [allele[:-1] for allele in alt] - - # left-align alleles, if a reference is available - if self._leftalign and self._reference: - while left < pos: - movable = True - for allele in alt: - if len(allele) > len(ref): - longest, shortest = allele, ref - else: - longest, shortest = ref, allele - if len(longest) == len(shortest) or longest[:len(shortest)].upper() != shortest.upper(): - movable = False - if longest[-1].upper() != longest[len(shortest)-1].upper(): - movable = False - if not movable: - break - ref = ref[:-1] - alt = [allele[:-1] for allele in alt] - if min([len(allele) for allele in alt]) == 0 or len(ref) == 0: - ref = faref_leftflank[pos-left-1] + ref - alt = [faref_leftflank[pos-left-1] + allele for allele in alt] - pos -= 1 - - # parse sample columns - samples = [] - for sample in cols[9:]: - dict = {} - values = sample.split(':') - if len(values) > len(format): - self.error(line,self.BAD_NUMBER_OF_VALUES,"(found %s values in element %s; expected %s)" % (len(values),sample,len(format))) - for idx in range(len(format)): - expected = self.get_expected(format[idx], self._format, alt) - if idx < len(values): value = values[idx] - else: - if expected == -1: value = "." - else: value = ",".join(["."]*expected) - - dict[format[idx]] = self.parse_formatdata(format[idx], - value, - self._format, - line) - if expected != -1 and len(dict[format[idx]]) != expected: - self.error(line,self.BAD_NUMBER_OF_PARAMETERS, - "id=%s, expected %s parameters, got %s" % (format[idx],expected,dict[format[idx]])) - if len(dict[format[idx]] ) < expected: dict[format[idx]] += [dict[format[idx]][-1]]*(expected-len(dict[format[idx]])) - dict[format[idx]] = dict[format[idx]][:expected] - samples.append( dict ) - - # done - d = {'chrom':chrom, - 'pos':pos, # return 0-based position - 'id':id, - 'ref':ref, - 'alt':alt, - 'qual':qual, - 'filter':filter, - 'info':info, - 'format':format} - for key,value in zip(self._samples,samples): - d[key] = value - - return d - - - def write_data(self, stream, data): - required = ['chrom','pos','id','ref','alt','qual','filter','info','format'] + self._samples - for k in required: - if k not in data: raise ValueError("Required key %s not found in data" % str(k)) - if data['alt'] == []: alt = "." 
- else: alt = ",".join(data['alt']) - if data['filter'] == None: filter = "." - elif data['filter'] == []: - if self._version == 33: filter = "0" - else: filter = "PASS" - else: filter = ';'.join(data['filter']) - if data['qual'] == -1: qual = "." - else: qual = str(data['qual']) - - output = [data['chrom'], - str(data['pos']+1), # change to 1-based position - data['id'], - data['ref'], - alt, - qual, - filter, - self.format_formatdata( - data['info'], self._info, separator=";"), - self.format_formatdata( - data['format'], self._format, value=False)] - - for s in self._samples: - output.append(self.format_formatdata( - data[s], self._format, key=False)) - - stream.write( "\t".join(output) + "\n" ) - - def _parse_header(self, stream): - self._lineno = 0 - for line in stream: - line = force_str(line, self.encoding) - self._lineno += 1 - if line.startswith('##'): - self.parse_header(line.strip()) - elif line.startswith('#'): - self.parse_heading(line.strip()) - self.enter_default_format() - else: - break - return line - - def _parse(self, line, stream): - # deal with files with header only - if line.startswith("##"): return - if len(line.strip()) > 0: - d = self.parse_data( line.strip() ) - if d: yield d - for line in stream: - self._lineno += 1 - if self._lines and self._lineno > self._lines: raise StopIteration - d = self.parse_data( line.strip() ) - if d: yield d - - ###################################################################################################### - # - # API follows - # - ###################################################################################################### - - def getsamples(self): - """ List of samples in VCF file """ - return self._samples - - def setsamples(self,samples): - """ List of samples in VCF file """ - self._samples = samples - - def getheader(self): - """ List of header key-value pairs (strings) """ - return self._header - - def setheader(self,header): - """ List of header key-value pairs (strings) """ - self._header = header - - def getinfo(self): - """ Dictionary of ##INFO tags, as VCF.FORMAT values """ - return self._info - - def setinfo(self,info): - """ Dictionary of ##INFO tags, as VCF.FORMAT values """ - self._info = info - - def getformat(self): - """ Dictionary of ##FORMAT tags, as VCF.FORMAT values """ - return self._format - - def setformat(self,format): - """ Dictionary of ##FORMAT tags, as VCF.FORMAT values """ - self._format = format - - def getfilter(self): - """ Dictionary of ##FILTER tags, as VCF.FORMAT values """ - return self._filter - - def setfilter(self,filter): - """ Dictionary of ##FILTER tags, as VCF.FORMAT values """ - self._filter = filter - - def setversion(self, version): - if version != 33 and version != 40: raise ValueError("Can only handle v3.3 and v4.0 VCF files") - self._version = version - - def setregions(self, regions): - self._regions = regions - - def setreference(self, ref): - """ Provide a reference sequence; a Python class supporting a fetch(chromosome, start, end) method, e.g. PySam.FastaFile """ - self._reference = ref - - def ignoreerror(self, errorstring): - try: self._ignored_errors.add(self.__dict__[errorstring]) - except KeyError: raise ValueError("Invalid error string: %s" % errorstring) - - def warnerror(self, errorstring): - try: self._warn_errors.add(self.__dict__[errorstring]) - except KeyError: raise ValueError("Invalid error string: %s" % errorstring) - - def parse(self, stream): - """ Parse a stream of VCF-formatted lines. 
-        Initializes the class instance and returns a generator. """
-        last_line = self._parse_header(stream)
-        # now return a generator that does the actual work. In this way the pre-processing is done
-        # before the first piece of data is yielded
-        return self._parse(last_line, stream)
-
-    def write(self, stream, datagenerator):
-        """ Writes a VCF file to a stream, using a data generator (or list) """
-        self.write_header(stream)
-        self.write_heading(stream)
-        for data in datagenerator: self.write_data(stream,data)
-
-    def writeheader(self, stream):
-        """ Writes a VCF header """
-        self.write_header(stream)
-        self.write_heading(stream)
-
-    def compare_calls(self, pos1, ref1, alt1, pos2, ref2, alt2):
-        """ Utility function: compares two calls for equality """
-        # a variant should always be assigned to a unique position, one base before
-        # the leftmost position of the alignment gap. If this rule is implemented
-        # correctly, the two positions must be equal for the calls to be identical.
-        if pos1 != pos2: return False
-        # from both calls, trim rightmost bases when identical. Do this safely, i.e.
-        # only when the reference bases are not Ns
-        while len(ref1)>0 and len(alt1)>0 and ref1[-1] == alt1[-1]:
-            ref1 = ref1[:-1]
-            alt1 = alt1[:-1]
-        while len(ref2)>0 and len(alt2)>0 and ref2[-1] == alt2[-1]:
-            ref2 = ref2[:-1]
-            alt2 = alt2[:-1]
-        # now, the alternative alleles must be identical
-        return alt1 == alt2
-
-###########################################################################################################
-###########################################################################################################
-## API functions added by Andreas
-###########################################################################################################
-
-    def connect(self, filename, encoding="ascii"):
-        '''connect to tabix file.'''
-        self.encoding=encoding
-        self.tabixfile = pysam.Tabixfile(filename, encoding=encoding)
-        self._parse_header(self.tabixfile.header)
-
-    def __del__(self):
-        self.close()
-        self.tabixfile = None
-
-    def close(self):
-        if self.tabixfile:
-            self.tabixfile.close()
-            self.tabixfile = None
-
-    def fetch(self,
-              reference=None,
-              start=None,
-              end=None,
-              region=None ):
-        """ Fetch VCFRecord objects in the given region of the tabix-indexed
-        file opened with connect(); returns an iterator """
-        return self.tabixfile.fetch(
-            reference,
-            start,
-            end,
-            region,
-            parser = asVCFRecord(self))
-
-    def validate(self, record):
-        '''validate vcf record.
-
-        returns a validated record.
- ''' - - raise NotImplementedError("needs to be checked") - - chrom, pos = record.chrom, record.pos - - # check reference - ref = record.ref - if ref == ".": - self.error(str(record),self.MISSING_REF) - if self._version == 33: ref = get_sequence(chrom,pos,pos+1,self._reference) - else: ref = "" - else: - for c in ref: - if c not in "ACGTN": self.error(str(record),self.UNKNOWN_CHAR_IN_REF) - if "N" in ref: ref = get_sequence(chrom, - pos, - pos+len(ref), - self._reference) - - # make sure reference is sane - if self._reference: - left = max(0,self.pos-100) - faref_leftflank = get_sequence(chrom,left,self.pos+len(ref),self._reference) - faref = faref_leftflank[pos-left:] - if faref != ref: self.error(str(record),self.WRONG_REF,"(reference is %s, VCF says %s)" % (faref,ref)) - ref = faref - - # check: format fields are defined - for f in record.format: - if f not in self._format: self.error(str(record),self.FORMAT_NOT_DEFINED, f) - - # check: all filters are defined - for f in record.filter: - if f not in self._filter: self.error(str(record),self.FILTER_NOT_DEFINED, f) - - # convert v3.3 alleles - if self._version == 33: - if len(ref) != 1: self.error(str(record),self.V33_BAD_REF) - newalts = [] - have_deletions = False - for a in alt: - if len(a) == 1: a = a + ref[1:] # SNP; add trailing reference - elif a.startswith('I'): a = ref[0] + a[1:] + ref[1:] # insertion just beyond pos; add first and trailing reference - elif a.startswith('D'): # allow D and D - have_deletions = True - try: - l = int(a[1:]) # throws ValueError if sequence - if len(ref) < l: # add to reference if necessary - addns = get_sequence(chrom,pos+len(ref),pos+l,self._reference) - ref += addns - for i,na in enumerate(newalts): newalts[i] = na+addns - a = ref[l:] # new deletion, deleting pos...pos+l - except ValueError: - s = a[1:] - if len(ref) < len(s): # add Ns to reference if necessary - addns = get_sequence(chrom,pos+len(ref),pos+len(s),self._reference) - if not s.endswith(addns) and addns != 'N'*len(addns): - self.error(str(record),self.V33_UNMATCHED_DELETION, - "(deletion is %s, reference is %s)" % (a,get_sequence(chrom,pos,pos+len(s),self._reference))) - ref += addns - for i,na in enumerate(newalts): newalts[i] = na+addns - a = ref[len(s):] # new deletion, deleting from pos - else: - self.error(str(record),self.V33_BAD_ALLELE) - newalts.append(a) - alt = newalts - # deletion alleles exist, add dummy 1st reference allele, and account for leading base - if have_deletions: - if pos == 0: - # Petr Danacek's: we can't have a leading nucleotide at (1-based) position 1 - addn = get_sequence(chrom,pos+len(ref),pos+len(ref)+1,self._reference) - ref += addn - alt = [allele+addn for allele in alt] - else: - addn = get_sequence(chrom,pos-1,pos,self._reference) - ref = addn + ref - alt = [addn + allele for allele in alt] - pos -= 1 - else: - # format v4.0 -- just check for nucleotides - for allele in alt: - if not alleleRegEx.match(allele): - self.error(str(record),self.V40_BAD_ALLELE,allele) - - - # check for leading nucleotide in indel calls - for allele in alt: - if len(allele) != len(ref): - if len(allele) == 0: self.error(str(record),self.ZERO_LENGTH_ALLELE) - if ref[0].upper() != allele[0].upper() and "N" not in (ref[0]+allele[0]).upper(): - self.error(str(record),self.MISSING_INDEL_ALLELE_REF_BASE) - - # trim trailing bases in alleles - # AH: not certain why trimming this needs to be added - # disabled now for unit testing - # for i in range(1,min(len(ref),min(map(len,alt)))): - # if len(set(allele[-1].upper() for allele 
in alt)) > 1 or ref[-1].upper() != alt[0][-1].upper(): - # break - # ref, alt = ref[:-1], [allele[:-1] for allele in alt] - - # left-align alleles, if a reference is available - if self._leftalign and self._reference: - while left < pos: - movable = True - for allele in alt: - if len(allele) > len(ref): - longest, shortest = allele, ref - else: - longest, shortest = ref, allele - if len(longest) == len(shortest) or longest[:len(shortest)].upper() != shortest.upper(): - movable = False - if longest[-1].upper() != longest[len(shortest)-1].upper(): - movable = False - if not movable: - break - ref = ref[:-1] - alt = [allele[:-1] for allele in alt] - if min([len(allele) for allele in alt]) == 0 or len(ref) == 0: - ref = faref_leftflank[pos-left-1] + ref - alt = [faref_leftflank[pos-left-1] + allele for allele in alt] - pos -= 1 - -__all__ = [ - "VCF", "VCFRecord", ] diff --git a/pysam/libcalignedsegment.pxd b/pysam/libcalignedsegment.pxd new file mode 100644 index 0000000..f1d59d1 --- /dev/null +++ b/pysam/libcalignedsegment.pxd @@ -0,0 +1,91 @@ +from pysam.libchtslib cimport * + +cdef extern from "htslib_util.h": + + # add *nbytes* into the variable length data of *src* at *pos* + bam1_t * pysam_bam_update(bam1_t * b, + size_t nbytes_old, + size_t nbytes_new, + uint8_t * pos) + + # now: static + int aux_type2size(int) + + char * pysam_bam_get_qname(bam1_t * b) + uint32_t * pysam_bam_get_cigar(bam1_t * b) + uint8_t * pysam_bam_get_seq(bam1_t * b) + uint8_t * pysam_bam_get_qual(bam1_t * b) + uint8_t * pysam_bam_get_aux(bam1_t * b) + int pysam_bam_get_l_aux(bam1_t * b) + char pysam_bam_seqi(uint8_t * s, int i) + + uint16_t pysam_get_bin(bam1_t * b) + uint8_t pysam_get_qual(bam1_t * b) + uint8_t pysam_get_l_qname(bam1_t * b) + uint16_t pysam_get_flag(bam1_t * b) + uint16_t pysam_get_n_cigar(bam1_t * b) + void pysam_set_bin(bam1_t * b, uint16_t v) + void pysam_set_qual(bam1_t * b, uint8_t v) + void pysam_set_l_qname(bam1_t * b, uint8_t v) + void pysam_set_flag(bam1_t * b, uint16_t v) + void pysam_set_n_cigar(bam1_t * b, uint16_t v) + void pysam_update_flag(bam1_t * b, uint16_t v, uint16_t flag) + + +from pysam.libcalignmentfile cimport AlignmentFile +ctypedef AlignmentFile AlignmentFile_t + + +# Note: need to declare all C fields and methods here +cdef class AlignedSegment: + + # object that this AlignedSegment represents + cdef bam1_t * _delegate + + # the file from which this AlignedSegment originates (can be None) + cdef AlignmentFile _alignment_file + + # caching of array properties for quick access + cdef object cache_query_qualities + cdef object cache_query_alignment_qualities + cdef object cache_query_sequence + cdef object cache_query_alignment_sequence + + # add an alignment tag with value to the AlignedSegment + # an existing tag of the same name will be replaced. + cpdef set_tag(self, tag, value, value_type=?, replace=?) + + # add an alignment tag with value to the AlignedSegment + # an existing tag of the same name will be replaced. + cpdef get_tag(self, tag, with_value_type=?) 
+ + # return true if tag exists + cpdef has_tag(self, tag) + + # returns a valid sam alignment string + cpdef tostring(self, AlignmentFile_t handle) + + +cdef class PileupColumn: + cdef bam_pileup1_t ** plp + cdef int tid + cdef int pos + cdef int n_pu + cdef AlignmentFile _alignment_file + + +cdef class PileupRead: + cdef AlignedSegment _alignment + cdef int32_t _qpos + cdef int _indel + cdef int _level + cdef uint32_t _is_del + cdef uint32_t _is_head + cdef uint32_t _is_tail + cdef uint32_t _is_refskip + +# factor methods +cdef makeAlignedSegment(bam1_t * src, AlignmentFile alignment_file) +cdef makePileupColumn(bam_pileup1_t ** plp, int tid, int pos, int n_pu, AlignmentFile alignment_file) +cdef inline makePileupRead(bam_pileup1_t * src, AlignmentFile alignment_file) +cdef inline uint32_t get_alignment_length(bam1_t * src) diff --git a/pysam/libcalignedsegment.pyx b/pysam/libcalignedsegment.pyx new file mode 100644 index 0000000..c95bb13 --- /dev/null +++ b/pysam/libcalignedsegment.pyx @@ -0,0 +1,2483 @@ +# cython: embedsignature=True +# cython: profile=True +############################################################################### +############################################################################### +# Cython wrapper for SAM/BAM/CRAM files based on htslib +############################################################################### +# The principal classes defined in this module are: +# +# class AlignedSegment an aligned segment (read) +# +# class PileupColumn a collection of segments (PileupRead) aligned to +# a particular genomic position. +# +# class PileupRead an AlignedSegment aligned to a particular genomic +# position. Contains additional attributes with respect +# to this. +# +# Additionally this module defines numerous additional classes that are part +# of the internal API. These are: +# +# Various iterator classes to iterate over alignments in sequential (IteratorRow) +# or in a stacked fashion (IteratorColumn): +# +# class IteratorRow +# class IteratorRowRegion +# class IteratorRowHead +# class IteratorRowAll +# class IteratorRowAllRefs +# class IteratorRowSelection +# +############################################################################### +# +# The MIT License +# +# Copyright (c) 2015 Andreas Heger +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. 
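Before the implementation itself, a short usage sketch of the tag interface declared in libcalignedsegment.pxd above. "ex1.bam" is a placeholder path; the NM tag is only read if present.

import pysam

# Sketch of the set_tag/get_tag/has_tag interface declared above.
with pysam.AlignmentFile("ex1.bam", "rb") as bam:
    for read in bam:
        if read.has_tag("NM"):
            # with_value_type=True also returns the one-letter BAM type code
            nm, value_type = read.get_tag("NM", with_value_type=True)
        # set_tag replaces an existing tag of the same name by default;
        # value_type may be given explicitly or guessed from the value
        read.set_tag("XC", 42, value_type="i")
        break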
+# +############################################################################### +import re +import array +import ctypes +import struct + +cimport cython +from cpython cimport array as c_array +from cpython.version cimport PY_MAJOR_VERSION +from cpython cimport PyErr_SetString, PyBytes_FromStringAndSize +from libc.string cimport strchr +from cpython cimport array as c_array + +from pysam.libcutils cimport force_bytes, force_str, \ + charptr_to_str, charptr_to_bytes +from pysam.libcutils cimport qualities_to_qualitystring, qualitystring_to_array, \ + array_to_qualitystring + +# Constants for binary tag conversion +cdef char * htslib_types = 'cCsSiIf' +cdef char * parray_types = 'bBhHiIf' + +# translation tables + +# cigar code to character and vice versa +cdef char* CODE2CIGAR= "MIDNSHP=XB" +cdef int NCIGAR_CODES = 10 + +if PY_MAJOR_VERSION >= 3: + CIGAR2CODE = dict([y, x] for x, y in enumerate(CODE2CIGAR)) +else: + CIGAR2CODE = dict([ord(y), x] for x, y in enumerate(CODE2CIGAR)) + +CIGAR_REGEX = re.compile("(\d+)([MIDNSHP=XB])") + +##################################################################### +# C multiplication with wrapping around +cdef inline uint32_t c_mul(uint32_t a, uint32_t b): + return (a * b) & 0xffffffff + + +##################################################################### +# typecode guessing +cdef inline char map_typecode_htslib_to_python(uint8_t s): + """map an htslib typecode to the corresponding python typecode + to be used in the struct or array modules.""" + + # map type from htslib to python array + cdef char * f = strchr(htslib_types, s) + + if f == NULL: + return 0 + return parray_types[f - htslib_types] + +cdef inline uint8_t map_typecode_python_to_htslib(char s): + """determine value type from type code of array""" + cdef char * f = strchr(parray_types, s) + if f == NULL: + return 0 + return htslib_types[f - parray_types] + +# optional tag data manipulation +cdef convert_binary_tag(uint8_t * tag): + """return bytesize, number of values and array of values + in aux_data memory location pointed to by tag.""" + cdef uint8_t auxtype + cdef uint8_t byte_size + cdef int32_t nvalues + # get byte size + auxtype = tag[0] + byte_size = aux_type2size(auxtype) + tag += 1 + # get number of values in array + nvalues = (tag)[0] + tag += 4 + + # define python array + cdef c_array.array c_values = array.array( + chr(map_typecode_htslib_to_python(auxtype))) + c_array.resize(c_values, nvalues) + + # copy data + memcpy(c_values.data.as_voidptr, tag, nvalues * byte_size) + + # no need to check for endian-ness as bam1_core_t fields + # and aux_data are in host endian-ness. See sam.c and calls + # to swap_data + return byte_size, nvalues, c_values + + +cdef inline uint8_t get_value_code(value, value_type=None): + '''guess type code for a *value*. 
If *value_type* is None, + the type code will be inferred based on the Python type of + *value*''' + cdef uint8_t typecode + cdef char * _char_type + + if value_type is None: + if isinstance(value, int): + typecode = 'i' + elif isinstance(value, float): + typecode = 'd' + elif isinstance(value, str): + typecode = 'Z' + elif isinstance(value, bytes): + typecode = 'Z' + elif isinstance(value, array.array) or \ + isinstance(value, list) or \ + isinstance(value, tuple): + typecode = 'B' + else: + return 0 + else: + if value_type not in 'Zidf': + return 0 + value_type = force_bytes(value_type) + _char_type = value_type + typecode = (_char_type)[0] + + return typecode + + +cdef inline bytes getTypecode(value, maximum_value=None): + '''returns the value typecode of a value. + + If max is specified, the approprite type is + returned for a range where value is the minimum. + ''' + + if maximum_value is None: + maximum_value = value + + cdef bytes valuetype + + t = type(value) + + if t is float: + valuetype = b'f' + elif t is int: + # signed ints + if value < 0: + if value >= -128 and maximum_value < 128: + valuetype = b'c' + elif value >= -32768 and maximum_value < 32768: + valuetype = b's' + elif value < -2147483648 or maximum_value >= 2147483648: + raise ValueError( + "at least one signed integer out of range of " + "BAM/SAM specification") + else: + valuetype = b'i' + # unsigned ints + else: + if maximum_value < 256: + valuetype = b'C' + elif maximum_value < 65536: + valuetype = b'S' + elif maximum_value >= 4294967296: + raise ValueError( + "at least one integer out of range of BAM/SAM specification") + else: + valuetype = b'I' + else: + # Note: hex strings (H) are not supported yet + if t is not bytes: + value = value.encode('ascii') + if len(value) == 1: + valuetype = b'A' + else: + valuetype = b'Z' + + return valuetype + + +cdef inline packTags(tags): + """pack a list of tags. Each tag is a tuple of (tag, tuple). + + Values are packed into the most space efficient data structure + possible unless the tag contains a third field with the typecode. + + Returns a format string and the associated list of arguments + to be used in a call to struct.pack_into. + """ + fmts, args = ["<"], [] + + cdef char array_typecode + + datatype2format = { + b'c': ('b', 1), + b'C': ('B', 1), + b's': ('h', 2), + b'S': ('H', 2), + b'i': ('i', 4), + b'I': ('I', 4), + b'f': ('f', 4), + b'A': ('c', 1)} + + for tag in tags: + + if len(tag) == 2: + pytag, value = tag + valuetype = None + elif len(tag) == 3: + pytag, value, valuetype = tag + else: + raise ValueError("malformatted tag: %s" % str(tag)) + + pytag = force_bytes(pytag) + valuetype = force_bytes(valuetype) + t = type(value) + + if t is tuple or t is list: + # binary tags from tuples or lists + if valuetype is None: + # automatically determine value type - first value + # determines type. If there is a mix of types, the + # result is undefined. 
+ valuetype = getTypecode(min(value), max(value)) + + if valuetype not in datatype2format: + raise ValueError("invalid value type '%s'" % valuetype) + + datafmt = "2sccI%i%s" % (len(value), datatype2format[valuetype][0]) + args.extend([pytag[:2], + b"B", + valuetype, + len(value)] + list(value)) + + elif isinstance(value, array.array): + # binary tags from arrays + if valuetype is None: + array_typecode = map_typecode_python_to_htslib(ord(value.typecode)) + + if array_typecode == 0: + raise ValueError("unsupported type code '{}'" + .format(value.typecode)) + + valuetype = force_bytes(chr(array_typecode)) + + if valuetype not in datatype2format: + raise ValueError("invalid value type '%s' (%s)" % + (valuetype, type(valuetype))) + + # use array.tostring() to retrieve byte representation and + # save as bytes + datafmt = "2sccI%is" % (len(value) * datatype2format[valuetype][1]) + args.extend([pytag[:2], + b"B", + valuetype, + len(value), + force_bytes(value.tostring())]) + + else: + if valuetype is None: + valuetype = getTypecode(value) + + if valuetype in b"AZ": + value = force_bytes(value) + + if valuetype == b"Z": + datafmt = "2sc%is" % (len(value)+1) + else: + datafmt = "2sc%s" % datatype2format[valuetype][0] + + args.extend([pytag[:2], + valuetype, + value]) + + fmts.append(datafmt) + + return "".join(fmts), args + + +cdef inline int32_t calculateQueryLength(bam1_t * src): + """return query length computed from CIGAR alignment. + + Return 0 if there is no CIGAR alignment. + """ + + cdef uint32_t * cigar_p = pysam_bam_get_cigar(src) + + if cigar_p == NULL: + return 0 + + cdef uint32_t k, qpos + cdef int op + qpos = 0 + + for k from 0 <= k < pysam_get_n_cigar(src): + op = cigar_p[k] & BAM_CIGAR_MASK + + if op == BAM_CMATCH or \ + op == BAM_CINS or \ + op == BAM_CSOFT_CLIP or \ + op == BAM_CHARD_CLIP or \ + op == BAM_CEQUAL or \ + op == BAM_CDIFF: + qpos += cigar_p[k] >> BAM_CIGAR_SHIFT + + return qpos + + +cdef inline int32_t getQueryStart(bam1_t *src) except -1: + cdef uint32_t * cigar_p + cdef uint32_t k, op + cdef uint32_t start_offset = 0 + + if pysam_get_n_cigar(src): + cigar_p = pysam_bam_get_cigar(src); + for k from 0 <= k < pysam_get_n_cigar(src): + op = cigar_p[k] & BAM_CIGAR_MASK + if op == BAM_CHARD_CLIP: + if start_offset != 0 and start_offset != src.core.l_qseq: + PyErr_SetString(ValueError, 'Invalid clipping in CIGAR string') + return -1 + elif op == BAM_CSOFT_CLIP: + start_offset += cigar_p[k] >> BAM_CIGAR_SHIFT + else: + break + + return start_offset + + +cdef inline int32_t getQueryEnd(bam1_t *src) except -1: + cdef uint32_t * cigar_p + cdef uint32_t k, op + cdef uint32_t end_offset = src.core.l_qseq + + # if there is no sequence, compute length from cigar string + if end_offset == 0: + end_offset = calculateQueryLength(src) + + # walk backwards in cigar string + if pysam_get_n_cigar(src) > 1: + cigar_p = pysam_bam_get_cigar(src); + for k from pysam_get_n_cigar(src) > k >= 1: + op = cigar_p[k] & BAM_CIGAR_MASK + if op == BAM_CHARD_CLIP: + if end_offset != 0 and end_offset != src.core.l_qseq: + PyErr_SetString(ValueError, + 'Invalid clipping in CIGAR string') + return -1 + elif op == BAM_CSOFT_CLIP: + end_offset -= cigar_p[k] >> BAM_CIGAR_SHIFT + else: + break + + return end_offset + + +cdef inline bytes getSequenceInRange(bam1_t *src, + uint32_t start, + uint32_t end): + """return python string of the sequence in a bam1_t object. 
+ """ + + cdef uint8_t * p + cdef uint32_t k + cdef char * s + + if not src.core.l_qseq: + return None + + seq = PyBytes_FromStringAndSize(NULL, end - start) + s = seq + p = pysam_bam_get_seq(src) + + for k from start <= k < end: + # equivalent to seq_nt16_str[bam1_seqi(s, i)] (see bam.c) + # note: do not use string literal as it will be a python string + s[k-start] = seq_nt16_str[p[k/2] >> 4 * (1 - k%2) & 0xf] + + return charptr_to_bytes(seq) + + +cdef inline object getQualitiesInRange(bam1_t *src, + uint32_t start, + uint32_t end): + """return python array of quality values from a bam1_t object""" + + cdef uint8_t * p + cdef uint32_t k + + p = pysam_bam_get_qual(src) + if p[0] == 0xff: + return None + + # 'B': unsigned char + cdef c_array.array result = array.array('B', [0]) + c_array.resize(result, end - start) + + # copy data + memcpy(result.data.as_voidptr, &p[start], end - start) + + return result + + +##################################################################### +## private factory methods +cdef class AlignedSegment +cdef makeAlignedSegment(bam1_t * src, AlignmentFile alignment_file): + '''return an AlignedSegment object constructed from `src`''' + # note that the following does not call __init__ + cdef AlignedSegment dest = AlignedSegment.__new__(AlignedSegment) + dest._delegate = bam_dup1(src) + dest._alignment_file = alignment_file + return dest + + +cdef class PileupColumn +cdef makePileupColumn(bam_pileup1_t ** plp, int tid, int pos, + int n_pu, AlignmentFile alignment_file): + '''return a PileupColumn object constructed from pileup in `plp` and + setting additional attributes. + + ''' + # note that the following does not call __init__ + cdef PileupColumn dest = PileupColumn.__new__(PileupColumn) + dest._alignment_file = alignment_file + dest.plp = plp + dest.tid = tid + dest.pos = pos + dest.n_pu = n_pu + return dest + +cdef class PileupRead +cdef inline makePileupRead(bam_pileup1_t * src, AlignmentFile alignment_file): + '''return a PileupRead object construted from a bam_pileup1_t * object.''' + cdef PileupRead dest = PileupRead.__new__(PileupRead) + dest._alignment = makeAlignedSegment(src.b, alignment_file) + dest._qpos = src.qpos + dest._indel = src.indel + dest._level = src.level + dest._is_del = src.is_del + dest._is_head = src.is_head + dest._is_tail = src.is_tail + dest._is_refskip = src.is_refskip + return dest + + +cdef inline uint32_t get_alignment_length(bam1_t * src): + cdef int k = 0 + cdef uint32_t l = 0 + if src == NULL: + return 0 + cdef uint32_t * cigar_p = bam_get_cigar(src) + if cigar_p == NULL: + return 0 + cdef int op + cdef int n = pysam_get_n_cigar(src) + for k from 0 <= k < n: + op = cigar_p[k] & BAM_CIGAR_MASK + if op == BAM_CSOFT_CLIP or op == BAM_CHARD_CLIP: + continue + l += cigar_p[k] >> BAM_CIGAR_SHIFT + return l + + +# TODO: avoid string copying for getSequenceInRange, reconstituneSequenceFromMD, ... +cdef inline bytes build_alignment_sequence(bam1_t * src): + """return expanded sequence from MD tag. + + The sequence includes substitutions and both insertions in the + reference as well as deletions to the reference sequence. Combine + with the cigar string to reconstitute the query or the reference + sequence. + + Positions corresponding to `N` (skipped region from the reference) + in the CIGAR string will not appear in the returned sequence. The + MD should correspondingly not contain these. 
Thus proper tags are::
+
+ Deletion from the reference: cigar=5M1D5M MD=5^C5
+ Skipped region from reference: cigar=5M1N5M MD=10
+
+ Returns
+ -------
+
+ None, if no MD tag is present.
+
+ """
+ if src == NULL:
+ return None
+
+ cdef uint32_t start = getQueryStart(src)
+ cdef uint32_t end = getQueryEnd(src)
+ # get read sequence, taking into account soft-clipping
+ r = getSequenceInRange(src, start, end)
+ cdef char * read_sequence = r
+ cdef uint32_t * cigar_p = pysam_bam_get_cigar(src)
+ if cigar_p == NULL:
+ return None
+
+ cdef uint32_t r_idx = 0
+ cdef int op
+ cdef uint32_t k, i, l, x
+ cdef int nmatches = 0
+ cdef int s_idx = 0
+
+ cdef uint32_t max_len = get_alignment_length(src)
+ if max_len == 0:
+ raise ValueError("could not determine alignment length")
+
+ cdef char * s = calloc(max_len + 1, sizeof(char))
+ if s == NULL:
+ raise ValueError(
+ "could not allocate sequence of length %i" % max_len)
+
+ for k from 0 <= k < pysam_get_n_cigar(src):
+ op = cigar_p[k] & BAM_CIGAR_MASK
+ l = cigar_p[k] >> BAM_CIGAR_SHIFT
+ if op == BAM_CMATCH or op == BAM_CEQUAL or op == BAM_CDIFF:
+ for i from 0 <= i < l:
+ s[s_idx] = read_sequence[r_idx]
+ r_idx += 1
+ s_idx += 1
+ elif op == BAM_CDEL:
+ for i from 0 <= i < l:
+ s[s_idx] = '-'
+ s_idx += 1
+ elif op == BAM_CREF_SKIP:
+ pass
+ elif op == BAM_CINS:
+ for i from 0 <= i < l:
+ # encode insertions into reference as lowercase
+ s[s_idx] = read_sequence[r_idx] + 32
+ r_idx += 1
+ s_idx += 1
+ elif op == BAM_CSOFT_CLIP:
+ pass
+ elif op == BAM_CHARD_CLIP:
+ pass # advances neither
+ elif op == BAM_CPAD:
+ raise NotImplementedError(
+ "Padding (BAM_CPAD, 6) is currently not supported. "
+ "Please implement. Sorry about that.")
+
+ cdef uint8_t * md_tag_ptr = bam_aux_get(src, "MD")
+ if md_tag_ptr == NULL:
+ seq = PyBytes_FromStringAndSize(s, s_idx)
+ free(s)
+ return seq
+
+ cdef char * md_tag = bam_aux2Z(md_tag_ptr)
+ cdef int md_idx = 0
+ s_idx = 0
+
+ while md_tag[md_idx] != 0:
+ # c is numerical
+ if md_tag[md_idx] >= 48 and md_tag[md_idx] <= 57:
+ nmatches *= 10
+ nmatches += md_tag[md_idx] - 48
+ md_idx += 1
+ continue
+ else:
+ # save matches up to this point, skipping insertions
+ for x from 0 <= x < nmatches:
+ while s[s_idx] >= 'a':
+ s_idx += 1
+ s_idx += 1
+ while s[s_idx] >= 'a':
+ s_idx += 1
+
+ r_idx += nmatches
+ nmatches = 0
+ if md_tag[md_idx] == '^':
+ md_idx += 1
+ while md_tag[md_idx] >= 65 and md_tag[md_idx] <= 90:
+ assert s[s_idx] == '-'
+ s[s_idx] = md_tag[md_idx]
+ s_idx += 1
+ md_idx += 1
+ else:
+ # save mismatch and change to lower case
+ s[s_idx] = md_tag[md_idx] + 32
+ s_idx += 1
+ r_idx += 1
+ md_idx += 1
+
+ # save matches up to this point, skipping insertions
+ for x from 0 <= x < nmatches:
+ while s[s_idx] >= 'a':
+ s_idx += 1
+ s_idx += 1
+ while s[s_idx] >= 'a':
+ s_idx += 1
+
+ seq = PyBytes_FromStringAndSize(s, s_idx)
+ free(s)
+
+ return seq
+
+
+ cdef class AlignedSegment:
+ '''Class representing an aligned segment.
+
+ This class stores a handle to the samtools C-structure representing
+ an aligned read. Member read access is forwarded to the C-structure
+ and converted into python objects. This implementation should be fast,
+ as only the data needed is converted.
+
+ For write access, the C-structure is updated in-place. This is
+ not the most efficient way to build BAM entries, as the variable
+ length data is concatenated and thus needs to be resized if
+ a field is updated. Furthermore, the BAM entry might be
+ in an inconsistent state.
+ + One issue to look out for is that the sequence should always + be set *before* the quality scores. Setting the sequence will + also erase any quality scores that were set previously. + ''' + + # Now only called when instances are created from Python + def __init__(self): + # see bam_init1 + self._delegate = calloc(1, sizeof(bam1_t)) + # allocate some memory. If size is 0, calloc does not return a + # pointer that can be passed to free() so allocate 40 bytes + # for a new read + self._delegate.m_data = 40 + self._delegate.data = calloc( + self._delegate.m_data, 1) + self._delegate.l_data = 0 + # set some data to make read approximately legit. + # Note, SAM writing fails with q_name of length 0 + self._delegate.core.l_qname = 0 + self._delegate.core.tid = -1 + self._delegate.core.pos = -1 + self._delegate.core.mtid = -1 + self._delegate.core.mpos = -1 + + # caching for selected fields + self.cache_query_qualities = None + self.cache_query_alignment_qualities = None + self.cache_query_sequence = None + self.cache_query_alignment_sequence = None + + def __dealloc__(self): + bam_destroy1(self._delegate) + + def __str__(self): + """return string representation of alignment. + + The representation is an approximate :term:`SAM` format, because + an aligned read might not be associated with a :term:`AlignmentFile`. + As a result :term:`tid` is shown instead of the reference name. + Similarly, the tags field is returned in its parsed state. + + To get a valid SAM record, use :meth:`tostring`. + """ + # sam-parsing is done in sam.c/bam_format1_core which + # requires a valid header. + return "\t".join(map(str, (self.query_name, + self.flag, + self.reference_id, + self.reference_start, + self.mapping_quality, + self.cigarstring, + self.next_reference_id, + self.next_reference_start, + self.query_alignment_length, + self.query_sequence, + self.query_qualities, + self.tags))) + + def __copy__(self): + return makeAlignedSegment(self._delegate, self._alignment_file) + + def __deepcopy__(self, memo): + return makeAlignedSegment(self._delegate, self._alignment_file) + + def compare(self, AlignedSegment other): + '''return -1,0,1, if contents in this are binary + <,=,> to *other* + + ''' + + cdef int retval, x + cdef bam1_t *t + cdef bam1_t *o + + t = self._delegate + o = other._delegate + + # uncomment for debugging purposes + # cdef unsigned char * oo, * tt + # tt = (&t.core) + # oo = (&o.core) + # for x from 0 <= x < sizeof( bam1_core_t): print x, tt[x], oo[x] + # tt = (t.data) + # oo = (o.data) + # for x from 0 <= x < max(t.l_data, o.l_data): print x, tt[x], oo[x], chr(tt[x]), chr(oo[x]) + + # Fast-path test for object identity + if t == o: + return 0 + + retval = memcmp(&t.core, &o.core, sizeof(bam1_core_t)) + + if retval: + return retval + # cmp(t.l_data, o.l_data) + retval = (t.l_data > o.l_data) - (t.l_data < o.l_data) + if retval: + return retval + return memcmp(t.data, o.data, t.l_data) + + def __richcmp__(self, AlignedSegment other, int op): + if op == 2: # == operator + return self.compare(other) == 0 + elif op == 3: # != operator + return self.compare(other) != 0 + else: + return NotImplemented + + def __hash__(self): + cdef bam1_t * src = self._delegate + cdef int x + + # see http://effbot.org/zone/python-hash.htm + cdef uint8_t * c = &src.core + cdef uint32_t hash_value = c[0] + for x from 1 <= x < sizeof(bam1_core_t): + hash_value = c_mul(hash_value, 1000003) ^ c[x] + c = src.data + for x from 0 <= x < src.l_data: + hash_value = c_mul(hash_value, 1000003) ^ c[x] + + return hash_value + + 
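+ # A rough sketch of the comparison semantics above ("ex1.bam" is a
+ # hypothetical file name): two reads compare equal only if their
+ # bam1_t core fields and variable-length data match byte for byte.
+ #
+ # reads = list(pysam.AlignmentFile("ex1.bam", "rb"))
+ # assert reads[0] == reads[0]
+ # assert (reads[0] != reads[1]) == (reads[0].compare(reads[1]) != 0)
+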
cpdef tostring(self, AlignmentFile_t htsfile): + """returns a string representation of the aligned segment. + + The output format is valid SAM format. + + Parameters + ---------- + + htsfile -- AlignmentFile object to map numerical + identifiers to chromosome names. + """ + cdef int n_targets = htsfile.header.n_targets + + if self._delegate.core.tid >= n_targets \ + or self._delegate.core.mtid >= n_targets: + raise ValueError('htsfile does not match aligned segment') + + cdef kstring_t line + line.l = line.m = 0 + line.s = NULL + + if sam_format1(htsfile.header, self._delegate, &line) < 0: + if line.m: + free(line.s) + raise ValueError('sam_format failed') + + ret = force_str(line.s[:line.l]) + + if line.m: + free(line.s) + + return ret + + ######################################################## + ## Basic attributes in order of appearance in SAM format + property query_name: + """the query template name (None if not present)""" + def __get__(self): + cdef bam1_t * src + src = self._delegate + if pysam_get_l_qname(src) == 0: + return None + return charptr_to_str(pysam_bam_get_qname(src)) + + def __set__(self, qname): + if qname is None or len(qname) == 0: + return + + if len(qname) >= 255: + raise ValueError("query length out of range {} > 254".format( + len(qname))) + + qname = force_bytes(qname) + cdef bam1_t * src + cdef int l + cdef char * p + + src = self._delegate + p = pysam_bam_get_qname(src) + + # the qname is \0 terminated + l = len(qname) + 1 + pysam_bam_update(src, + pysam_get_l_qname(src), + l, + p) + + pysam_set_l_qname(src, l) + + # re-acquire pointer to location in memory + # as it might have moved + p = pysam_bam_get_qname(src) + + strncpy(p, qname, l) + + property flag: + """properties flag""" + def __get__(self): + return pysam_get_flag(self._delegate) + def __set__(self, flag): + pysam_set_flag(self._delegate, flag) + + property reference_name: + """:term:`reference` name (None if no AlignmentFile is associated)""" + def __get__(self): + if self._alignment_file is not None: + return self._alignment_file.getrname(self._delegate.core.tid) + return None + + property reference_id: + """:term:`reference` ID + + .. note:: + + This field contains the index of the reference sequence in + the sequence dictionary. To obtain the name of the + reference sequence, use + :meth:`pysam.AlignmentFile.getrname()` + + """ + def __get__(self): return self._delegate.core.tid + def __set__(self, tid): self._delegate.core.tid = tid + + property reference_start: + """0-based leftmost coordinate""" + def __get__(self): return self._delegate.core.pos + def __set__(self, pos): + ## setting the position requires updating the "bin" attribute + cdef bam1_t * src + src = self._delegate + src.core.pos = pos + if pysam_get_n_cigar(src): + pysam_set_bin(src, + hts_reg2bin( + src.core.pos, + bam_endpos(src), + 14, + 5)) + else: + pysam_set_bin(src, + hts_reg2bin( + src.core.pos, + src.core.pos + 1, + 14, + 5)) + + property mapping_quality: + """mapping quality""" + def __get__(self): + return pysam_get_qual(self._delegate) + def __set__(self, qual): + pysam_set_qual(self._delegate, qual) + + property cigarstring: + '''the :term:`cigar` alignment as a string. + + The cigar string is a string of alternating integers + and characters denoting the length and the type of + an operation. + + .. note:: + The order length,operation is specified in the + SAM format. It is different from the order of + the :attr:`cigar` property. + + Returns None if not present. 
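+
+ For example, a short sketch (values illustrative)::
+
+ read.cigarstring = "10M1D20M"
+ assert read.cigartuples == [(0, 10), (2, 1), (0, 20)]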
+
+
+ To unset the cigarstring, assign None or the
+ empty string.
+ '''
+ def __get__(self):
+ c = self.cigartuples
+ if c is None:
+ return None
+ # reverse order
+ else:
+ return "".join([ "%i%c" % (y,CODE2CIGAR[x]) for x,y in c])
+
+ def __set__(self, cigar):
+ if cigar is None or len(cigar) == 0:
+ self.cigartuples = []
+ else:
+ parts = CIGAR_REGEX.findall(cigar)
+ # reverse order
+ self.cigartuples = [(CIGAR2CODE[ord(y)], int(x)) for x,y in parts]
+
+ # TODO
+ # property cigar:
+ # """the cigar alignment"""
+
+ property next_reference_id:
+ """the :term:`reference` id of the mate/next read."""
+ def __get__(self): return self._delegate.core.mtid
+ def __set__(self, mtid):
+ self._delegate.core.mtid = mtid
+
+ property next_reference_name:
+ """:term:`reference` name of the mate/next read (None if no
+ AlignmentFile is associated)"""
+ def __get__(self):
+ if self._alignment_file is not None:
+ return self._alignment_file.getrname(self._delegate.core.mtid)
+ return None
+
+ property next_reference_start:
+ """the position of the mate/next read."""
+ def __get__(self):
+ return self._delegate.core.mpos
+ def __set__(self, mpos):
+ self._delegate.core.mpos = mpos
+
+ property query_length:
+ """the length of the query/read.
+
+ This value corresponds to the length of the sequence supplied
+ in the BAM/SAM file. The length of a query is 0 if there is no
+ sequence in the BAM/SAM file. In those cases, the read length
+ can be inferred from the CIGAR alignment, see
+ :meth:`pysam.AlignedSegment.infer_query_length`.
+
+ The length includes soft-clipped bases and is equal to
+ ``len(query_sequence)``.
+
+ This property is read-only; it is updated when a new query
+ sequence is assigned.
+
+ Returns 0 if not available.
+
+ """
+ def __get__(self):
+ return self._delegate.core.l_qseq
+
+ property template_length:
+ """the observed query template length"""
+ def __get__(self):
+ return self._delegate.core.isize
+ def __set__(self, isize):
+ self._delegate.core.isize = isize
+
+ property query_sequence:
+ """read sequence bases, including :term:`soft clipped` bases
+ (None if not present).
+
+ Note that assigning to seq will invalidate any quality scores.
+ Thus, to in-place edit the sequence and quality scores, copies of
+ the quality scores need to be taken. Consider trimming for example::
+
+ q = read.query_qualities
+ read.query_sequence = read.query_sequence[5:10]
+ read.query_qualities = q[5:10]
+
+ The sequence is returned as it is stored in the BAM file. Some mappers
+ might have stored a reverse complement of the original read
+ sequence.
+ """
+ def __get__(self):
+ if self.cache_query_sequence:
+ return self.cache_query_sequence
+
+ cdef bam1_t * src
+ cdef char * s
+ src = self._delegate
+
+ if src.core.l_qseq == 0:
+ return None
+
+ self.cache_query_sequence = force_str(getSequenceInRange(
+ src, 0, src.core.l_qseq))
+ return self.cache_query_sequence
+
+ def __set__(self, seq):
+ # samtools manages sequence and quality length memory together
+ # if no quality information is present, the first byte says 0xff.
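+ # Layout note: bases are packed two per byte (4 bits each), so a
+ # sequence of l bases takes (l + 1) / 2 bytes, followed directly by
+ # l quality bytes; hence the nbytes_new/nbytes_old sizes below.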
+
+ cdef bam1_t * src
+ cdef uint8_t * p
+ cdef char * s
+ cdef int l, k
+ cdef Py_ssize_t nbytes_new, nbytes_old
+
+ if seq is None:
+ l = 0
+ else:
+ l = len(seq)
+ seq = force_bytes(seq)
+
+ src = self._delegate
+
+ # as the sequence is stored in half-bytes, the total length (sequence
+ # plus quality scores) is (l+1)/2 + l
+ nbytes_new = (l + 1) / 2 + l
+ nbytes_old = (src.core.l_qseq + 1) / 2 + src.core.l_qseq
+
+ # acquire pointer to location in memory
+ p = pysam_bam_get_seq(src)
+ src.core.l_qseq = l
+
+ # change length of data field
+ pysam_bam_update(src,
+ nbytes_old,
+ nbytes_new,
+ p)
+
+ if l > 0:
+ # re-acquire pointer to location in memory
+ # as it might have moved
+ p = pysam_bam_get_seq(src)
+ for k from 0 <= k < nbytes_new:
+ p[k] = 0
+ # convert to C string
+ s = seq
+ for k from 0 <= k < l:
+ p[k/2] |= seq_nt16_table[s[k]] << 4 * (1 - k % 2)
+
+ # erase qualities
+ p = pysam_bam_get_qual(src)
+ p[0] = 0xff
+
+ self.cache_query_sequence = force_str(seq)
+
+ # clear cached values for quality values
+ self.cache_query_qualities = None
+ self.cache_query_alignment_qualities = None
+
+ property query_qualities:
+ """read sequence base qualities, including :term:`soft
+ clipped` bases (None if not present).
+
+ Quality scores are returned as a python array of unsigned
+ chars. Note that this is not the ASCII-encoded value typically
+ seen in FASTQ or SAM formatted files. Thus, no offset of 33
+ needs to be subtracted.
+
+ Note that to set quality scores the sequence has to be set
+ beforehand as this will determine the expected length of the
+ quality score array.
+
+ This method raises a ValueError if the length of the
+ quality scores and the sequence are not the same.
+
+ """
+ def __get__(self):
+
+ if self.cache_query_qualities:
+ return self.cache_query_qualities
+
+ cdef bam1_t * src
+ cdef char * q
+
+ src = self._delegate
+
+ if src.core.l_qseq == 0:
+ return None
+
+ self.cache_query_qualities = getQualitiesInRange(src, 0, src.core.l_qseq)
+ return self.cache_query_qualities
+
+ def __set__(self, qual):
+
+ # note that memory is already allocated via setting the sequence
+ # hence only the length match of sequence and quality is checked.
+ cdef bam1_t * src
+ cdef uint8_t * p
+ cdef int l
+
+ src = self._delegate
+ p = pysam_bam_get_qual(src)
+ if qual is None or len(qual) == 0:
+ # if absent and there is a sequence: set to 0xff
+ if src.core.l_qseq != 0:
+ p[0] = 0xff
+ return
+
+ # check for length match
+ l = len(qual)
+ if src.core.l_qseq != l:
+ raise ValueError(
+ "quality and sequence mismatch: %i != %i" %
+ (l, src.core.l_qseq))
+
+ # create a python array object filling it
+ # with the quality scores
+
+ # NB: should avoid this copying if qual is
+ # already of the correct type.
+ cdef c_array.array result = c_array.array('B', qual)
+
+ # copy data
+ memcpy(p, result.data.as_voidptr, l)
+
+ # save in cache
+ self.cache_query_qualities = qual
+
+ property bin:
+ """properties bin"""
+ def __get__(self):
+ return pysam_get_bin(self._delegate)
+ def __set__(self, bin):
+ pysam_set_bin(self._delegate, bin)
+
+
+ ##########################################################
+ # Derived simple attributes. These are simple attributes of
+ # AlignedSegment getting and setting values.
+ ##########################################################
+ # 1.
Flags
+ ##########################################################
+ property is_paired:
+ """true if read is paired in sequencing"""
+ def __get__(self):
+ return (self.flag & BAM_FPAIRED) != 0
+ def __set__(self,val):
+ pysam_update_flag(self._delegate, val, BAM_FPAIRED)
+
+ property is_proper_pair:
+ """true if read is mapped in a proper pair"""
+ def __get__(self):
+ return (self.flag & BAM_FPROPER_PAIR) != 0
+ def __set__(self,val):
+ pysam_update_flag(self._delegate, val, BAM_FPROPER_PAIR)
+ property is_unmapped:
+ """true if read itself is unmapped"""
+ def __get__(self):
+ return (self.flag & BAM_FUNMAP) != 0
+ def __set__(self, val):
+ pysam_update_flag(self._delegate, val, BAM_FUNMAP)
+ property mate_is_unmapped:
+ """true if the mate is unmapped"""
+ def __get__(self):
+ return (self.flag & BAM_FMUNMAP) != 0
+ def __set__(self,val):
+ pysam_update_flag(self._delegate, val, BAM_FMUNMAP)
+ property is_reverse:
+ """true if read is mapped to reverse strand"""
+ def __get__(self):
+ return (self.flag & BAM_FREVERSE) != 0
+ def __set__(self,val):
+ pysam_update_flag(self._delegate, val, BAM_FREVERSE)
+ property mate_is_reverse:
+ """true if the mate is mapped to the reverse strand"""
+ def __get__(self):
+ return (self.flag & BAM_FMREVERSE) != 0
+ def __set__(self,val):
+ pysam_update_flag(self._delegate, val, BAM_FMREVERSE)
+ property is_read1:
+ """true if this is read1"""
+ def __get__(self):
+ return (self.flag & BAM_FREAD1) != 0
+ def __set__(self,val):
+ pysam_update_flag(self._delegate, val, BAM_FREAD1)
+ property is_read2:
+ """true if this is read2"""
+ def __get__(self):
+ return (self.flag & BAM_FREAD2) != 0
+ def __set__(self, val):
+ pysam_update_flag(self._delegate, val, BAM_FREAD2)
+ property is_secondary:
+ """true if not primary alignment"""
+ def __get__(self):
+ return (self.flag & BAM_FSECONDARY) != 0
+ def __set__(self, val):
+ pysam_update_flag(self._delegate, val, BAM_FSECONDARY)
+ property is_qcfail:
+ """true if QC failure"""
+ def __get__(self):
+ return (self.flag & BAM_FQCFAIL) != 0
+ def __set__(self, val):
+ pysam_update_flag(self._delegate, val, BAM_FQCFAIL)
+ property is_duplicate:
+ """true if optical or PCR duplicate"""
+ def __get__(self):
+ return (self.flag & BAM_FDUP) != 0
+ def __set__(self, val):
+ pysam_update_flag(self._delegate, val, BAM_FDUP)
+ property is_supplementary:
+ """true if this is a supplementary alignment"""
+ def __get__(self):
+ return (self.flag & BAM_FSUPPLEMENTARY) != 0
+ def __set__(self, val):
+ pysam_update_flag(self._delegate, val, BAM_FSUPPLEMENTARY)
+
+ # 2. Coordinates and lengths
+ property reference_end:
+ '''aligned reference position of the read on the reference genome.
+
+ reference_end points to one past the last aligned residue.
+ Returns None if not available (read is unmapped or no cigar
+ alignment present).
+
+ '''
+ def __get__(self):
+ cdef bam1_t * src
+ src = self._delegate
+ if (self.flag & BAM_FUNMAP) or pysam_get_n_cigar(src) == 0:
+ return None
+ return bam_endpos(src)
+
+ property reference_length:
+ '''aligned length of the read on the reference genome.
+
+ This is equal to `aend - pos`. Returns None if not available.'''
+ def __get__(self):
+ cdef bam1_t * src
+ src = self._delegate
+ if (self.flag & BAM_FUNMAP) or pysam_get_n_cigar(src) == 0:
+ return None
+ return bam_endpos(src) - \
+ self._delegate.core.pos
+
+ property query_alignment_sequence:
+ """aligned portion of the read.
+
+
+ This is a substring of :attr:`seq` that excludes flanking
+ bases that were :term:`soft clipped` (None if not present). It
+ is equal to ``seq[qstart:qend]``.
+
+ SAM/BAM files may include extra flanking bases that are not
+ part of the alignment. These bases may be the result of the
+ Smith-Waterman or other algorithms, which may not require
+ alignments that begin at the first residue or end at the last.
+ In addition, extra sequencing adapters, multiplex identifiers,
+ and low-quality bases that were not considered for alignment
+ may have been retained.
+
+ """
+
+ def __get__(self):
+ if self.cache_query_alignment_sequence:
+ return self.cache_query_alignment_sequence
+
+ cdef bam1_t * src
+ cdef uint32_t start, end
+
+ src = self._delegate
+
+ if src.core.l_qseq == 0:
+ return None
+
+ start = getQueryStart(src)
+ end = getQueryEnd(src)
+
+ self.cache_query_alignment_sequence = force_str(
+ getSequenceInRange(src, start, end))
+ return self.cache_query_alignment_sequence
+
+ property query_alignment_qualities:
+ """aligned query sequence quality values (None if not present). These
+ are the quality values that correspond to :attr:`query`, that
+ is, they exclude qualities of :term:`soft clipped` bases. This
+ is equal to ``qual[qstart:qend]``.
+
+ Quality scores are returned as a python array of unsigned
+ chars. Note that this is not the ASCII-encoded value typically
+ seen in FASTQ or SAM formatted files. Thus, no offset of 33
+ needs to be subtracted.
+
+ This property is read-only.
+
+ """
+ def __get__(self):
+
+ if self.cache_query_alignment_qualities:
+ return self.cache_query_alignment_qualities
+
+ cdef bam1_t * src
+ cdef uint32_t start, end
+
+ src = self._delegate
+
+ if src.core.l_qseq == 0:
+ return None
+
+ start = getQueryStart(src)
+ end = getQueryEnd(src)
+ self.cache_query_alignment_qualities = \
+ getQualitiesInRange(src, start, end)
+ return self.cache_query_alignment_qualities
+
+ property query_alignment_start:
+ """start index of the aligned query portion of the sequence (0-based,
+ inclusive).
+
+ This is the index of the first base in :attr:`seq` that is not
+ soft-clipped.
+
+ """
+ def __get__(self):
+ return getQueryStart(self._delegate)
+
+ property query_alignment_end:
+ """end index of the aligned query portion of the sequence (0-based,
+ exclusive)"""
+ def __get__(self):
+ return getQueryEnd(self._delegate)
+
+ property query_alignment_length:
+ """length of the aligned query sequence.
+
+ This is equal to :attr:`qend` - :attr:`qstart`"""
+ def __get__(self):
+ cdef bam1_t * src
+ src = self._delegate
+ return getQueryEnd(src) - getQueryStart(src)
+
+ #####################################################
+ # Computed properties
+
+ def get_reference_positions(self, full_length=False):
+ """a list of reference positions that this read aligns to.
+
+ By default, this method only returns positions in the
+ reference that are within the alignment. If *full_length* is
+ set, None values will be included for any soft-clipped or
+ unaligned positions within the read. The returned list will
+ thus be of the same length as the read.
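+
+ A short sketch (coordinates illustrative): for a read with
+ cigar ``2S3M`` mapped at reference position 10::
+
+ read.get_reference_positions() # [10, 11, 12]
+ read.get_reference_positions(full_length=True)
+ # [None, None, 10, 11, 12]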
+ + """ + cdef uint32_t k, i, pos + cdef int op + cdef uint32_t * cigar_p + cdef bam1_t * src + cdef bint _full = full_length + + src = self._delegate + if pysam_get_n_cigar(src) == 0: + return [] + + result = [] + pos = src.core.pos + cigar_p = pysam_bam_get_cigar(src) + + for k from 0 <= k < pysam_get_n_cigar(src): + op = cigar_p[k] & BAM_CIGAR_MASK + l = cigar_p[k] >> BAM_CIGAR_SHIFT + + if op == BAM_CSOFT_CLIP or op == BAM_CINS: + if _full: + for i from 0 <= i < l: + result.append(None) + elif op == BAM_CMATCH: + for i from pos <= i < pos + l: + result.append(i) + pos += l + elif op == BAM_CDEL or op == BAM_CREF_SKIP: + pos += l + + return result + + def infer_query_length(self, always=True): + """inferred read length from CIGAR string. + + If *always* is set to True, the read length + will be always inferred. If set to False, the length + of the read sequence will be returned if it is + available. + + Returns None if CIGAR string is not present. + """ + + cdef uint32_t * cigar_p + cdef bam1_t * src + + src = self._delegate + + if not always and src.core.l_qseq: + return src.core.l_qseq + + return calculateQueryLength(src) + + def get_reference_sequence(self): + """return the reference sequence. + + This method requires the MD tag to be set. + """ + cdef uint32_t k, i + cdef int op + cdef bam1_t * src = self._delegate + ref_seq = force_str(build_alignment_sequence(src)) + if ref_seq is None: + raise ValueError("MD tag not present") + + cdef uint32_t * cigar_p = pysam_bam_get_cigar(src) + cdef uint32_t r_idx = 0 + result = [] + for k from 0 <= k < pysam_get_n_cigar(src): + op = cigar_p[k] & BAM_CIGAR_MASK + l = cigar_p[k] >> BAM_CIGAR_SHIFT + if op == BAM_CMATCH or op == BAM_CEQUAL or op == BAM_CDIFF: + for i from 0 <= i < l: + result.append(ref_seq[r_idx]) + r_idx += 1 + elif op == BAM_CDEL: + for i from 0 <= i < l: + result.append(ref_seq[r_idx]) + r_idx += 1 + elif op == BAM_CREF_SKIP: + pass + elif op == BAM_CINS: + r_idx += l + elif op == BAM_CSOFT_CLIP: + pass + elif op == BAM_CHARD_CLIP: + pass # advances neither + elif op == BAM_CPAD: + raise NotImplementedError( + "Padding (BAM_CPAD, 6) is currently not supported. " + "Please implement. Sorry about that.") + + return "".join(result) + + def get_aligned_pairs(self, matches_only=False, with_seq=False): + """a list of aligned read (query) and reference positions. + + For inserts, deletions, skipping either query or reference + position may be None. + + Padding is currently not supported and leads to an exception. + + Parameters + ---------- + + matches_only : bool + If True, only matched bases are returned - no None on either + side. + with_seq : bool + If True, return a third element in the tuple containing the + reference sequence. Substitutions are lower-case. This option + requires an MD tag to be present. + + Returns + ------- + + aligned_pairs : list of tuples + + """ + cdef uint32_t k, i, pos, qpos, r_idx, l + cdef int op + cdef uint32_t * cigar_p + cdef bam1_t * src = self._delegate + cdef bint _matches_only = bool(matches_only) + cdef bint _with_seq = bool(with_seq) + + # TODO: this method performs no checking and assumes that + # read sequence, cigar and MD tag are consistent. 
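+ #
+ # Illustrative expectation (coordinates made up): a read with cigar
+ # 3M1I mapped at reference position 100 yields
+ # [(0, 100), (1, 101), (2, 102), (3, None)] from get_aligned_pairs().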
+ + if _with_seq: + ref_seq = force_str(self.get_reference_sequence()) + if ref_seq is None: + raise ValueError("MD tag not present") + + r_idx = 0 + + if pysam_get_n_cigar(src) == 0: + return [] + + result = [] + pos = src.core.pos + qpos = 0 + cigar_p = pysam_bam_get_cigar(src) + for k from 0 <= k < pysam_get_n_cigar(src): + op = cigar_p[k] & BAM_CIGAR_MASK + l = cigar_p[k] >> BAM_CIGAR_SHIFT + + if op == BAM_CMATCH or op == BAM_CEQUAL or op == BAM_CDIFF: + if _with_seq: + for i from pos <= i < pos + l: + result.append((qpos, i, ref_seq[r_idx])) + r_idx += 1 + qpos += 1 + else: + for i from pos <= i < pos + l: + result.append((qpos, i)) + qpos += 1 + pos += l + + elif op == BAM_CINS or op == BAM_CSOFT_CLIP: + if not _matches_only: + if _with_seq: + for i from pos <= i < pos + l: + result.append((qpos, None, None)) + qpos += 1 + else: + for i from pos <= i < pos + l: + result.append((qpos, None)) + qpos += 1 + else: + qpos += l + + elif op == BAM_CDEL: + if not _matches_only: + if _with_seq: + for i from pos <= i < pos + l: + result.append((None, i, ref_seq[r_idx])) + r_idx += 1 + else: + for i from pos <= i < pos + l: + result.append((None, i)) + pos += l + + elif op == BAM_CHARD_CLIP: + pass # advances neither + + elif op == BAM_CREF_SKIP: + if not _matches_only: + if _with_seq: + for i from pos <= i < pos + l: + result.append((None, i, None)) + else: + for i from pos <= i < pos + l: + result.append((None, i)) + + pos += l + + elif op == BAM_CPAD: + raise NotImplementedError( + "Padding (BAM_CPAD, 6) is currently not supported. " + "Please implement. Sorry about that.") + + return result + + def get_blocks(self): + """ a list of start and end positions of + aligned gapless blocks. + + The start and end positions are in genomic + coordinates. + + Blocks are not normalized, i.e. two blocks + might be directly adjacent. This happens if + the two blocks are separated by an insertion + in the read. + """ + + cdef uint32_t k, pos, l + cdef int op + cdef uint32_t * cigar_p + cdef bam1_t * src + + src = self._delegate + if pysam_get_n_cigar(src) == 0: + return [] + + result = [] + pos = src.core.pos + cigar_p = pysam_bam_get_cigar(src) + l = 0 + + for k from 0 <= k < pysam_get_n_cigar(src): + op = cigar_p[k] & BAM_CIGAR_MASK + l = cigar_p[k] >> BAM_CIGAR_SHIFT + if op == BAM_CMATCH: + result.append((pos, pos + l)) + pos += l + elif op == BAM_CDEL or op == BAM_CREF_SKIP: + pos += l + + return result + + def get_overlap(self, uint32_t start, uint32_t end): + """return number of aligned bases of read overlapping the interval + *start* and *end* on the reference sequence. + + Return None if cigar alignment is not available. + """ + cdef uint32_t k, i, pos, overlap + cdef int op, o + cdef uint32_t * cigar_p + cdef bam1_t * src + + overlap = 0 + + src = self._delegate + if pysam_get_n_cigar(src) == 0: + return None + pos = src.core.pos + o = 0 + + cigar_p = pysam_bam_get_cigar(src) + for k from 0 <= k < pysam_get_n_cigar(src): + op = cigar_p[k] & BAM_CIGAR_MASK + l = cigar_p[k] >> BAM_CIGAR_SHIFT + + if op == BAM_CMATCH: + o = min( pos + l, end) - max( pos, start ) + if o > 0: overlap += o + + if op == BAM_CMATCH or op == BAM_CDEL or op == BAM_CREF_SKIP: + pos += l + + return overlap + + def get_cigar_stats(self): + """summary of operations in cigar string. + + The output order in the array is "MIDNSHP=X" followed by a + field for the NM tag. If the NM tag is not present, this + field will always be 0. 
+ + +-----+--------------+-----+ + |M |BAM_CMATCH |0 | + +-----+--------------+-----+ + |I |BAM_CINS |1 | + +-----+--------------+-----+ + |D |BAM_CDEL |2 | + +-----+--------------+-----+ + |N |BAM_CREF_SKIP |3 | + +-----+--------------+-----+ + |S |BAM_CSOFT_CLIP|4 | + +-----+--------------+-----+ + |H |BAM_CHARD_CLIP|5 | + +-----+--------------+-----+ + |P |BAM_CPAD |6 | + +-----+--------------+-----+ + |= |BAM_CEQUAL |7 | + +-----+--------------+-----+ + |X |BAM_CDIFF |8 | + +-----+--------------+-----+ + |NM |NM tag |9 | + +-----+--------------+-----+ + + If no cigar string is present, empty arrays will be returned. + + Parameters + ---------- + + Returns + ------- + + arrays : two arrays. The first contains the nucleotide counts within + each cigar operation, the second contains the number of blocks for + each cigar operation. + + """ + + cdef int nfields = NCIGAR_CODES + 1 + + cdef c_array.array base_counts = array.array( + "I", + [0] * nfields) + cdef uint32_t [:] base_view = base_counts + cdef c_array.array block_counts = array.array( + "I", + [0] * nfields) + cdef uint32_t [:] block_view = block_counts + + cdef bam1_t * src = self._delegate + cdef int op + cdef uint32_t l + cdef int32_t k + cdef uint32_t * cigar_p = pysam_bam_get_cigar(src) + + if cigar_p == NULL: + return None + + for k from 0 <= k < pysam_get_n_cigar(src): + op = cigar_p[k] & BAM_CIGAR_MASK + l = cigar_p[k] >> BAM_CIGAR_SHIFT + base_view[op] += l + block_view[op] += 1 + + cdef uint8_t * v = bam_aux_get(src, 'NM') + if v != NULL: + base_view[nfields - 1] = bam_aux2i(v) + + return base_counts, block_counts + + ##################################################### + ## Unsorted as yet + # TODO: capture in CIGAR object + property cigartuples: + """the :term:`cigar` alignment. The alignment + is returned as a list of tuples of (operation, length). + + If the alignment is not present, None is returned. + + The operations are: + + +-----+--------------+-----+ + |M |BAM_CMATCH |0 | + +-----+--------------+-----+ + |I |BAM_CINS |1 | + +-----+--------------+-----+ + |D |BAM_CDEL |2 | + +-----+--------------+-----+ + |N |BAM_CREF_SKIP |3 | + +-----+--------------+-----+ + |S |BAM_CSOFT_CLIP|4 | + +-----+--------------+-----+ + |H |BAM_CHARD_CLIP|5 | + +-----+--------------+-----+ + |P |BAM_CPAD |6 | + +-----+--------------+-----+ + |= |BAM_CEQUAL |7 | + +-----+--------------+-----+ + |X |BAM_CDIFF |8 | + +-----+--------------+-----+ + + .. note:: + The output is a list of (operation, length) tuples, such as + ``[(0, 30)]``. + This is different from the SAM specification and + the :attr:`cigarstring` property, which uses a + (length, operation) order, for example: ``30M``. + + To unset the cigar property, assign an empty list + or None. 
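+
+ For example (a sketch; values illustrative)::
+
+ read.cigartuples = [(0, 10), (1, 2), (0, 20)]
+ assert read.cigarstring == "10M2I20M"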
+ """
+ def __get__(self):
+ cdef uint32_t * cigar_p
+ cdef bam1_t * src
+ cdef uint32_t op, l
+ cdef int k
+
+ src = self._delegate
+ if pysam_get_n_cigar(src) == 0:
+ return None
+
+ cigar = []
+
+ cigar_p = pysam_bam_get_cigar(src);
+ for k from 0 <= k < pysam_get_n_cigar(src):
+ op = cigar_p[k] & BAM_CIGAR_MASK
+ l = cigar_p[k] >> BAM_CIGAR_SHIFT
+ cigar.append((op, l))
+ return cigar
+
+ def __set__(self, values):
+ cdef uint32_t * p
+ cdef bam1_t * src
+ cdef op, l
+ cdef int k, ncigar
+
+ k = 0
+
+ src = self._delegate
+
+ # get location of cigar string
+ p = pysam_bam_get_cigar(src)
+
+ # empty values for cigar string
+ if values is None:
+ values = []
+
+ ncigar = len(values)
+ # create space for cigar data within src.data
+ pysam_bam_update(src,
+ pysam_get_n_cigar(src) * 4,
+ ncigar * 4,
+ p)
+
+ # length is number of cigar operations, not bytes
+ pysam_set_n_cigar(src, ncigar)
+
+ # re-acquire pointer to location in memory
+ # as it might have moved
+ p = pysam_bam_get_cigar(src)
+
+ # insert cigar operations
+ for op, l in values:
+ p[k] = l << BAM_CIGAR_SHIFT | op
+ k += 1
+
+ ## setting the cigar string requires updating the bin
+ pysam_set_bin(src,
+ hts_reg2bin(
+ src.core.pos,
+ bam_endpos(src),
+ 14,
+ 5))
+
+
+ cpdef set_tag(self,
+ tag,
+ value,
+ value_type=None,
+ replace=True):
+ """sets a particular field *tag* to *value* in the optional alignment
+ section.
+
+ *value_type* describes the type of *value* that is to be entered
+ into the alignment record. It can be set explicitly to one
+ of the valid one-letter type codes. If unset, an appropriate
+ type will be chosen automatically.
+
+ An existing value of the same *tag* will be overwritten unless
+ replace is set to False. This is usually not recommended as a
+ tag may only appear once in the optional alignment section.
+
+ If *value* is None, the tag will be deleted.
+ """
+
+ cdef int value_size
+ cdef uint8_t * value_ptr
+ cdef uint8_t *existing_ptr
+ cdef uint8_t typecode
+ cdef float float_value
+ cdef double double_value
+ cdef int32_t int_value
+ cdef bam1_t * src = self._delegate
+ cdef char * _value_type
+ cdef c_array.array array_value
+ cdef object buffer
+
+ if len(tag) != 2:
+ raise ValueError('Invalid tag: %s' % tag)
+
+ tag = force_bytes(tag)
+ if replace:
+ existing_ptr = bam_aux_get(src, tag)
+ if existing_ptr:
+ bam_aux_del(src, existing_ptr)
+
+ # setting value to None deletes a tag
+ if value is None:
+ return
+
+ typecode = get_value_code(value, value_type)
+ if typecode == 0:
+ raise ValueError("can't guess type or invalid type code specified")
+
+ # Not Endian-safe, but then again neither is samtools!
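+ # The dispatch below copies the value into a local C variable of the
+ # matching width and hands its address to bam_aux_append, which copies
+ # value_size bytes into the record; 'B' arrays are packed via struct.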
+ if typecode == 'Z':
+ value = force_bytes(value)
+ value_ptr = value
+ value_size = len(value)+1
+ elif typecode == 'i':
+ int_value = value
+ value_ptr = &int_value
+ value_size = sizeof(int32_t)
+ elif typecode == 'd':
+ double_value = value
+ value_ptr = &double_value
+ value_size = sizeof(double)
+ elif typecode == 'f':
+ float_value = value
+ value_ptr = &float_value
+ value_size = sizeof(float)
+ elif typecode == 'B':
+ # the following goes through python, needs to be cleaned up
+ # pack array using struct
+ if value_type is None:
+ fmt, args = packTags([(tag, value)])
+ else:
+ fmt, args = packTags([(tag, value, value_type)])
+
+ # remove tag and type code as set by bam_aux_append
+ # first four chars of format (<2sc)
+ fmt = '<' + fmt[4:]
+ # first two values to pack
+ args = args[2:]
+ value_size = struct.calcsize(fmt)
+ # buffer will be freed when object goes out of scope
+ buffer = ctypes.create_string_buffer(value_size)
+ struct.pack_into(fmt, buffer, 0, *args)
+ # bam_aux_append copies data from value_ptr
+ bam_aux_append(src,
+ tag,
+ typecode,
+ value_size,
+ buffer.raw)
+ return
+ else:
+ raise ValueError('unsupported value_type in set_option')
+
+ bam_aux_append(src,
+ tag,
+ typecode,
+ value_size,
+ value_ptr)
+
+ cpdef has_tag(self, tag):
+ """returns true if the optional alignment section
+ contains a given *tag*."""
+ cdef uint8_t * v
+ cdef int nvalues
+ btag = force_bytes(tag)
+ v = bam_aux_get(self._delegate, btag)
+ return v != NULL
+
+ cpdef get_tag(self, tag, with_value_type=False):
+ """
+ retrieves data from the optional alignment section
+ given a two-letter *tag* denoting the field.
+
+ The returned value is cast into an appropriate python type.
+
+ This method is the fastest way to access the optional
+ alignment section if only a few tags need to be retrieved.
+
+ Parameters
+ ----------
+
+ tag :
+ data tag.
+
+ with_value_type : Optional[bool]
+ if set to True, the return value is a tuple of (tag value, type code).
+ (default False)
+
+ Returns
+ -------
+
+ A python object with the value of the `tag`. The type of the
+ object depends on the data type in the data record.
+
+ Raises
+ ------
+
+ KeyError
+ If `tag` is not present, a KeyError is raised.
+
+ """
+ cdef uint8_t * v
+ cdef int nvalues
+ btag = force_bytes(tag)
+ v = bam_aux_get(self._delegate, btag)
+ if v == NULL:
+ raise KeyError("tag '%s' not present" % tag)
+ if chr(v[0]) == "B":
+ auxtype = chr(v[0]) + chr(v[1])
+ else:
+ auxtype = chr(v[0])
+
+ if auxtype == 'c' or auxtype == 'C' or auxtype == 's' or auxtype == 'S':
+ value = bam_aux2i(v)
+ elif auxtype == 'i' or auxtype == 'I':
+ value = bam_aux2i(v)
+ elif auxtype == 'f' or auxtype == 'F':
+ value = bam_aux2f(v)
+ elif auxtype == 'd' or auxtype == 'D':
+ value = bam_aux2f(v)
+ elif auxtype == 'A':
+ # there might be a more efficient way
+ # to convert a char into a string
+ value = '%c' % bam_aux2A(v)
+ elif auxtype == 'Z':
+ value = charptr_to_str(bam_aux2Z(v))
+ elif auxtype[0] == 'B':
+ bytesize, nvalues, values = convert_binary_tag(v + 1)
+ value = values
+ else:
+ raise ValueError("unknown auxiliary type '%s'" % auxtype)
+
+ if with_value_type:
+ return (value, auxtype)
+ else:
+ return value
+
+ def get_tags(self, with_value_type=False):
+ """the fields in the optional alignment section.
+
+ Returns a list of all fields in the optional
+ alignment section. Values are converted to appropriate python
+ values.
For example:
+
+ [("NM", 2), ("RG", "GJP00TM04")]
+
+ If *with_value_type* is set, the value type as encoded in
+ the AlignedSegment record will be returned as well:
+
+ [("NM", 2, "i"), ("RG", "GJP00TM04", "Z")]
+
+ This method will convert all values in the optional alignment
+ section. When getting only one or a few tags, please see
+ :meth:`get_tag` for a quicker way to achieve this.
+
+ """
+
+ cdef char * ctag
+ cdef bam1_t * src
+ cdef uint8_t * s
+ cdef char auxtag[3]
+ cdef char auxtype
+ cdef uint8_t byte_size
+ cdef int32_t nvalues
+
+ src = self._delegate
+ if src.l_data == 0:
+ return []
+ s = pysam_bam_get_aux(src)
+ result = []
+ auxtag[2] = 0
+ while s < (src.data + src.l_data):
+ # get tag
+ auxtag[0] = s[0]
+ auxtag[1] = s[1]
+ s += 2
+ auxtype = s[0]
+ if auxtype in ('c', 'C'):
+ value = bam_aux2i(s)
+ s += 1
+ elif auxtype in ('s', 'S'):
+ value = bam_aux2i(s)
+ s += 2
+ elif auxtype in ('i', 'I'):
+ value = bam_aux2i(s)
+ s += 4
+ elif auxtype == 'f':
+ value = bam_aux2f(s)
+ s += 4
+ elif auxtype == 'd':
+ value = bam_aux2f(s)
+ s += 8
+ elif auxtype == 'A':
+ value = "%c" % bam_aux2A(s)
+ s += 1
+ elif auxtype in ('Z', 'H'):
+ value = charptr_to_str(bam_aux2Z(s))
+ # +1 for NULL terminated string
+ s += len(value) + 1
+ elif auxtype == 'B':
+ s += 1
+ byte_size, nvalues, value = convert_binary_tag(s)
+ # 5 for 1 char and 1 int
+ s += 5 + (nvalues * byte_size) - 1
+ else:
+ raise KeyError("unknown type '%s'" % auxtype)
+
+ s += 1
+
+ if with_value_type:
+ result.append((charptr_to_str(auxtag), value, chr(auxtype)))
+ else:
+ result.append((charptr_to_str(auxtag), value))
+
+ return result
+
+ def set_tags(self, tags):
+ """sets the fields in the optional alignment section with
+ a list of (tag, value) tuples.
+
+ The :term:`value type` of the values is determined from the
+ python type. Optionally, a type may be given explicitly as
+ a third value in the tuple. For example:
+
+ x.set_tags([("NM", 2, "i"), ("RG", "GJP00TM04", "Z")])
+
+ This method will not enforce the rule that the same tag may appear
+ only once in the optional alignment section.
+ """
+
+ cdef bam1_t * src
+ cdef uint8_t * s
+ cdef char * temp
+ cdef int new_size = 0
+ cdef int old_size
+ src = self._delegate
+
+ # convert and pack the data
+ if tags is not None and len(tags) > 0:
+ fmt, args = packTags(tags)
+ new_size = struct.calcsize(fmt)
+ buffer = ctypes.create_string_buffer(new_size)
+ struct.pack_into(fmt,
+ buffer,
+ 0,
+ *args)
+
+ # delete the old data and allocate new space.
+ # If total_size == 0, the aux field will be + # empty + old_size = pysam_bam_get_l_aux(src) + pysam_bam_update(src, + old_size, + new_size, + pysam_bam_get_aux(src)) + + # copy data only if there is any + if new_size > 0: + + # get location of new data + s = pysam_bam_get_aux(src) + + # check if there is direct path from buffer.raw to tmp + p = buffer.raw + # create handle to make sure buffer stays alive long + # enough for memcpy, see issue 129 + temp = p + memcpy(s, temp, new_size) + + + ######################################################## + # Compatibility Accessors + # Functions, properties for compatibility with pysam < 0.8 + # + # Several options + # change the factory functions according to API + # * requires code changes throughout, incl passing + # handles to factory functions + # subclass functions and add attributes at runtime + # e.g.: AlignedSegments.qname = AlignedSegments.query_name + # * will slow down the default interface + # explicit declaration of getters/setters + ######################################################## + property qname: + """deprecated, use query_name instead""" + def __get__(self): return self.query_name + def __set__(self, v): self.query_name = v + property tid: + """deprecated, use reference_id instead""" + def __get__(self): return self.reference_id + def __set__(self, v): self.reference_id = v + property pos: + """deprecated, use reference_start instead""" + def __get__(self): return self.reference_start + def __set__(self, v): self.reference_start = v + property mapq: + """deprecated, use mapping_quality instead""" + def __get__(self): return self.mapping_quality + def __set__(self, v): self.mapping_quality = v + property rnext: + """deprecated, use next_reference_id instead""" + def __get__(self): return self.next_reference_id + def __set__(self, v): self.next_reference_id = v + property pnext: + """deprecated, use next_reference_start instead""" + def __get__(self): + return self.next_reference_start + def __set__(self, v): + self.next_reference_start = v + property cigar: + """deprecated, use cigartuples instead""" + def __get__(self): + r = self.cigartuples + if r is None: + r = [] + return r + def __set__(self, v): self.cigartuples = v + property tlen: + """deprecated, use template_length instead""" + def __get__(self): + return self.template_length + def __set__(self, v): + self.template_length = v + property seq: + """deprecated, use query_sequence instead""" + def __get__(self): + return self.query_sequence + def __set__(self, v): + self.query_sequence = v + property qual: + """deprecated, query_qualities instead""" + def __get__(self): + return array_to_qualitystring(self.query_qualities) + def __set__(self, v): + self.query_qualities = qualitystring_to_array(v) + property alen: + """deprecated, reference_length instead""" + def __get__(self): + return self.reference_length + def __set__(self, v): + self.reference_length = v + property aend: + """deprecated, reference_end instead""" + def __get__(self): + return self.reference_end + def __set__(self, v): + self.reference_end = v + property rlen: + """deprecated, query_length instead""" + def __get__(self): + return self.query_length + def __set__(self, v): + self.query_length = v + property query: + """deprecated, query_alignment_sequence instead""" + def __get__(self): + return self.query_alignment_sequence + def __set__(self, v): + self.query_alignment_sequence = v + property qqual: + """deprecated, query_alignment_qualities instead""" + def __get__(self): + return 
array_to_qualitystring(self.query_alignment_qualities) + def __set__(self, v): + self.query_alignment_qualities = qualitystring_to_array(v) + property qstart: + """deprecated, use query_alignment_start instead""" + def __get__(self): + return self.query_alignment_start + def __set__(self, v): + self.query_alignment_start = v + property qend: + """deprecated, use query_alignment_end instead""" + def __get__(self): + return self.query_alignment_end + def __set__(self, v): + self.query_alignment_end = v + property qlen: + """deprecated, use query_alignment_length instead""" + def __get__(self): + return self.query_alignment_length + def __set__(self, v): + self.query_alignment_length = v + property mrnm: + """deprecated, use next_reference_id instead""" + def __get__(self): + return self.next_reference_id + def __set__(self, v): + self.next_reference_id = v + property mpos: + """deprecated, use next_reference_start instead""" + def __get__(self): + return self.next_reference_start + def __set__(self, v): + self.next_reference_start = v + property rname: + """deprecated, use reference_id instead""" + def __get__(self): + return self.reference_id + def __set__(self, v): + self.reference_id = v + property isize: + """deprecated, use template_length instead""" + def __get__(self): + return self.template_length + def __set__(self, v): + self.template_length = v + property blocks: + """deprecated, use get_blocks() instead""" + def __get__(self): + return self.get_blocks() + property aligned_pairs: + """deprecated, use get_aligned_pairs() instead""" + def __get__(self): + return self.get_aligned_pairs() + property inferred_length: + """deprecated, use infer_query_length() instead""" + def __get__(self): + return self.infer_query_length() + property positions: + """deprecated, use get_reference_positions() instead""" + def __get__(self): + return self.get_reference_positions() + property tags: + """deprecated, use get_tags() instead""" + def __get__(self): + return self.get_tags() + def __set__(self, tags): + self.set_tags(tags) + def overlap(self): + """deprecated, use get_overlap() instead""" + return self.get_overlap() + def opt(self, tag): + """deprecated, use get_tag() instead""" + return self.get_tag(tag) + def setTag(self, tag, value, value_type=None, replace=True): + """deprecated, use set_tag() instead""" + return self.set_tag(tag, value, value_type, replace) + + +cdef class PileupColumn: + '''A pileup of reads at a particular reference sequence position + (:term:`column`). A pileup column contains all the reads that map + to a certain target base. + + This class is a proxy for results returned by the samtools pileup + engine. If the underlying engine iterator advances, the results + of this column will change. 
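+
+ A minimal usage sketch ("ex1.bam" and the region are assumed)::
+
+ bf = pysam.AlignmentFile("ex1.bam", "rb")
+ for column in bf.pileup("chr1", 100, 120):
+ print(column.reference_pos, column.nsegments)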
+
+
+ '''
+ def __init__(self):
+ raise TypeError("this class cannot be instantiated from Python")
+
+ def __str__(self):
+ return "\t".join(map(str,
+ (self.reference_id,
+ self.reference_pos,
+ self.nsegments))) +\
+ "\n" +\
+ "\n".join(map(str, self.pileups))
+
+ property reference_id:
+ '''the reference sequence number as defined in the header'''
+ def __get__(self):
+ return self.tid
+
+ property reference_name:
+ """:term:`reference` name (None if no AlignmentFile is associated)"""
+ def __get__(self):
+ if self._alignment_file is not None:
+ return self._alignment_file.getrname(self.tid)
+ return None
+
+ property nsegments:
+ '''number of reads mapping to this column.'''
+ def __get__(self):
+ return self.n_pu
+ def __set__(self, n):
+ self.n_pu = n
+
+ property reference_pos:
+ '''the position in the reference sequence (0-based).'''
+ def __get__(self):
+ return self.pos
+
+ property pileups:
+ '''list of reads (:class:`pysam.PileupRead`) aligned to this column'''
+ def __get__(self):
+ cdef int x
+ pileups = []
+
+ if self.plp == NULL or self.plp[0] == NULL:
+ raise ValueError("PileupColumn accessed after iterator finished")
+
+ # warning: there could be problems if self.n and self.buf are
+ # out of sync.
+ for x from 0 <= x < self.n_pu:
+ pileups.append(makePileupRead(&(self.plp[0][x]),
+ self._alignment_file))
+ return pileups
+
+ ########################################################
+ # Compatibility Accessors
+ # Functions, properties for compatibility with pysam < 0.8
+ ########################################################
+ property pos:
+ def __get__(self):
+ return self.reference_pos
+ def __set__(self, v):
+ self.reference_pos = v
+
+ property tid:
+ def __get__(self):
+ return self.reference_id
+ def __set__(self, v):
+ self.reference_id = v
+
+ property n:
+ def __get__(self):
+ return self.nsegments
+ def __set__(self, v):
+ self.nsegments = v
+
+
+ cdef class PileupRead:
+ '''Representation of a read aligned to a particular position in the
+ reference sequence.
+
+ '''
+
+ def __init__(self):
+ raise TypeError(
+ "this class cannot be instantiated from Python")
+
+ def __str__(self):
+ return "\t".join(
+ map(str,
+ (self.alignment, self.query_position,
+ self.indel, self.level,
+ self.is_del, self.is_head,
+ self.is_tail, self.is_refskip)))
+
+ property alignment:
+ """a :class:`pysam.AlignedSegment` object of the aligned read"""
+ def __get__(self):
+ return self._alignment
+
+ property query_position:
+ """position of the read base at the pileup site, 0-based.
+ None if is_del or is_refskip is set.
+
+ """
+ def __get__(self):
+ if self.is_del or self.is_refskip:
+ return None
+ else:
+ return self._qpos
+
+ property query_position_or_next:
+ """position of the read base at the pileup site, 0-based.
+
+ If the current position is a deletion, returns the next
+ aligned base.
+
+ """
+ def __get__(self):
+ return self._qpos
+
+ property indel:
+ """indel length for the position following the current pileup site.
+
+ This quantity peeks ahead to the next cigar operation in this
+ alignment. If the next operation is an insertion, indel will
+ be positive. If the next operation is a deletion, it will be
+ negative. 0 if the next operation is not an indel.
+
+ """
+ def __get__(self):
+ return self._indel
+
+ property level:
+ """the level of the read in the "viewer" mode.
Note that this value + is currently not computed.""" + def __get__(self): + return self._level + + property is_del: + """1 iff the base on the padded read is a deletion""" + def __get__(self): + return self._is_del + + property is_head: + """1 iff the base on the padded read is the left-most base.""" + def __get__(self): + return self._is_head + + property is_tail: + """1 iff the base on the padded read is the right-most base.""" + def __get__(self): + return self._is_tail + + property is_refskip: + """1 iff the base on the padded read is part of CIGAR N op.""" + def __get__(self): + return self._is_refskip + +__all__ = [ + "AlignedSegment", + "PileupColumn", + "PileupRead"] diff --git a/pysam/libcalignmentfile.pxd b/pysam/libcalignmentfile.pxd new file mode 100644 index 0000000..6f32f47 --- /dev/null +++ b/pysam/libcalignmentfile.pxd @@ -0,0 +1,156 @@ +from libc.stdint cimport int8_t, int16_t, int32_t, int64_t +from libc.stdint cimport uint8_t, uint16_t, uint32_t, uint64_t +from libc.stdlib cimport malloc, calloc, realloc, free +from libc.string cimport memcpy, memcmp, strncpy, strlen, strdup +from libc.stdio cimport FILE, printf + +from pysam.libcfaidx cimport faidx_t, Fastafile +from pysam.libcalignedsegment cimport AlignedSegment +from pysam.libchtslib cimport * + +from cpython cimport array +cimport cython + +cdef extern from *: + ctypedef char* const_char_ptr "const char*" + +cdef extern from "htslib_util.h": + + char * pysam_bam_get_qname(bam1_t * b) + +cdef extern from "samfile_util.h": + + int bam_cap_mapQ(bam1_t *b, char *ref, int thres) + int bam_prob_realn(bam1_t *b, const char *ref) + +#################################################################### +# Utility types + +ctypedef struct __iterdata: + htsFile * htsfile + bam_hdr_t * header + hts_itr_t * iter + faidx_t * fastafile + int tid + char * seq + int seq_len + + +cdef class AlignmentFile(HTSFile): + cdef readonly object reference_filename + + # pointer to index + cdef hts_idx_t *index + # header structure + cdef bam_hdr_t * header + + # current read within iteration + cdef bam1_t * b + + cdef bam1_t * getCurrent(self) + cdef int cnext(self) + + # write an aligned read + cpdef int write(self, AlignedSegment read) except -1 + + +cdef class PileupColumn: + cdef bam_pileup1_t ** plp + cdef int tid + cdef int pos + cdef int n_pu + + +cdef class PileupRead: + cdef AlignedSegment _alignment + cdef int32_t _qpos + cdef int _indel + cdef int _level + cdef uint32_t _is_del + cdef uint32_t _is_head + cdef uint32_t _is_tail + cdef uint32_t _is_refskip + + +cdef class IteratorRow: + cdef int retval + cdef bam1_t * b + cdef AlignmentFile samfile + cdef htsFile * htsfile + cdef bam_hdr_t * header + cdef int owns_samfile + + +cdef class IteratorRowRegion(IteratorRow): + cdef hts_itr_t * iter + cdef bam1_t * getCurrent(self) + cdef int cnext(self) + +cdef class IteratorRowHead(IteratorRow): + cdef int max_rows + cdef int current_row + cdef bam1_t * getCurrent(self) + cdef int cnext(self) + +cdef class IteratorRowAll(IteratorRow): + cdef bam1_t * getCurrent(self) + cdef int cnext(self) + + +cdef class IteratorRowAllRefs(IteratorRow): + cdef int tid + cdef IteratorRowRegion rowiter + + +cdef class IteratorRowSelection(IteratorRow): + cdef int current_pos + cdef positions + cdef bam1_t * getCurrent(self) + cdef int cnext(self) + + +cdef class IteratorColumn: + + # result of the last plbuf_push + cdef IteratorRowRegion iter + cdef int tid + cdef int pos + cdef int n_plp + cdef int mask + cdef bam_pileup1_t * plp + cdef bam_plp_t 
pileup_iter + cdef __iterdata iterdata + cdef AlignmentFile samfile + cdef Fastafile fastafile + cdef stepper + cdef int max_depth + + cdef int cnext(self) + cdef char * getSequence(self) + cdef setMask(self, mask) + cdef setupIteratorData(self, + int tid, + int start, + int end, + int multiple_iterators=?) + + cdef reset(self, tid, start, end) + cdef _free_pileup_iter(self) + + +cdef class IteratorColumnRegion(IteratorColumn): + cdef int start + cdef int end + cdef int truncate + + +cdef class IteratorColumnAllRefs(IteratorColumn): + pass + + +cdef class IndexedReads: + cdef AlignmentFile samfile + cdef htsFile * htsfile + cdef index + cdef int owns_samfile + cdef bam_hdr_t * header diff --git a/pysam/libcalignmentfile.pyx b/pysam/libcalignmentfile.pyx new file mode 100644 index 0000000..2161f87 --- /dev/null +++ b/pysam/libcalignmentfile.pyx @@ -0,0 +1,2490 @@ +# cython: embedsignature=True +# cython: profile=True +######################################################## +######################################################## +# Cython wrapper for SAM/BAM/CRAM files based on htslib +######################################################## +# The principal classes defined in this module are: +# +# class AlignmentFile read/write access to SAM/BAM/CRAM formatted files +# +# class IndexedReads index a SAM/BAM/CRAM file by query name while keeping +# the original sort order intact +# +# Additionally this module defines numerous additional classes that +# are part of the internal API. These are: +# +# Various iterator classes to iterate over alignments in sequential +# (IteratorRow) or in a stacked fashion (IteratorColumn): +# +# class IteratorRow +# class IteratorRowRegion +# class IteratorRowHead +# class IteratorRowAll +# class IteratorRowAllRefs +# class IteratorRowSelection +# class IteratorColumn +# class IteratorColumnRegion +# class IteratorColumnAllRefs +# +######################################################## +# +# The MIT License +# +# Copyright (c) 2015 Andreas Heger +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. 
+#
+########################################################
+import os
+import collections
+import re
+import warnings
+import array
+
+from libc.errno cimport errno, EPIPE
+from libc.string cimport strcmp, strpbrk, strerror
+from cpython cimport array as c_array
+from cpython.version cimport PY_MAJOR_VERSION
+
+from pysam.libcutils cimport force_bytes, force_str, charptr_to_str
+from pysam.libcutils cimport encode_filename, from_string_and_size
+from pysam.libcalignedsegment cimport makeAlignedSegment, makePileupColumn
+from pysam.libchtslib cimport HTSFile, hisremote
+
+if PY_MAJOR_VERSION >= 3:
+    from io import StringIO
+else:
+    from StringIO import StringIO
+
+cimport cython
+
+########################################################
+## Constants and global variables
+
+# defines imported from samtools
+DEF SEEK_SET = 0
+DEF SEEK_CUR = 1
+DEF SEEK_END = 2
+
+# maximum genomic coordinate
+cdef int MAX_POS = 2 << 29
+
+# valid types for SAM headers
+VALID_HEADER_TYPES = {"HD" : dict,
+                      "SQ" : list,
+                      "RG" : list,
+                      "PG" : list,
+                      "CO" : list}
+
+# order of records within SAM headers
+VALID_HEADERS = ("HD", "SQ", "RG", "PG", "CO")
+
+# default type conversions within SAM header records
+KNOWN_HEADER_FIELDS = {"HD" : {"VN" : str, "SO" : str, "GO" : str},
+                       "SQ" : {"SN" : str, "LN" : int, "AS" : str,
+                               "M5" : str, "SP" : str, "UR" : str,
+                               "AH" : str,},
+                       "RG" : {"ID" : str, "CN" : str, "DS" : str,
+                               "DT" : str, "FO" : str, "KS" : str,
+                               "LB" : str, "PG" : str, "PI" : str,
+                               "PL" : str, "PM" : str, "PU" : str,
+                               "SM" : str,},
+                       "PG" : {"ID" : str, "PN" : str, "CL" : str,
+                               "PP" : str, "DS" : str, "VN" : str,},}
+
+# output order of fields within records. Ensure that CL is at
+# the end as parsing a CL will ignore any subsequent records.
+VALID_HEADER_ORDER = {"HD" : ("VN", "SO", "GO"),
+                      "SQ" : ("SN", "LN", "AS", "M5",
+                              "UR", "SP", "AH"),
+                      "RG" : ("ID", "CN", "SM", "LB",
+                              "PU", "PI", "DT", "DS",
+                              "PL", "FO", "KS", "PG",
+                              "PM"),
+                      "PG" : ("PN", "ID", "VN", "PP",
+                              "DS", "CL"),}
+
+
+def build_header_line(fields, record):
+    '''build a header line from `fields` dictionary for `record`'''
+
+    # TODO: add checking for field and sort order
+    line = ["@%s" % record]
+    # comment
+    if record == "CO":
+        line.append(fields)
+    # user tags
+    elif record.islower():
+        for key in sorted(fields):
+            line.append("%s:%s" % (key, str(fields[key])))
+    # defined tags
+    else:
+        # write fields of the specification
+        for key in VALID_HEADER_ORDER[record]:
+            if key in fields:
+                line.append("%s:%s" % (key, str(fields[key])))
+        # write user fields
+        for key in fields:
+            if not key.isupper():
+                line.append("%s:%s" % (key, str(fields[key])))
+
+    return "\t".join(line)
+
+cdef bam_hdr_t * build_header(new_header):
+    '''return a new header built from a dictionary in `new_header`.
+
+    This method inserts the text field, target_name and target_len.
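+
+    For example, a minimal header dictionary might look like this
+    (reference names and lengths are illustrative)::
+
+        {"HD" : {"VN" : "1.0"},
+         "SQ" : [{"SN" : "chr1", "LN" : 1575},
+                 {"SN" : "chr2", "LN" : 1584}]}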
+    '''
+
+    lines = []
+
+    # check if hash exists
+
+    # create new header and copy old data
+    cdef bam_hdr_t * dest
+
+    dest = bam_hdr_init()
+
+    # first: defined tags
+    for record in VALID_HEADERS:
+        if record in new_header:
+            ttype = VALID_HEADER_TYPES[record]
+            data = new_header[record]
+            if type(data) != type(ttype()):
+                raise ValueError(
+                    "invalid type for record %s: %s, expected %s" %
+                    (record, type(data), type(ttype())))
+            if type(data) is dict:
+                lines.append(build_header_line(data, record))
+            else:
+                for fields in new_header[record]:
+                    lines.append(build_header_line(fields, record))
+
+    # then: user tags (lower case), sorted alphabetically
+    for record, data in sorted(new_header.items()):
+        if record in VALID_HEADERS: continue
+        if type(data) is dict:
+            lines.append(build_header_line(data, record))
+        else:
+            for fields in new_header[record]:
+                lines.append(build_header_line(fields, record))
+
+    text = "\n".join(lines) + "\n"
+    if dest.text != NULL: free(dest.text)
+    dest.text = calloc(len(text), sizeof(char))
+    dest.l_text = len(text)
+    cdef bytes btext = text.encode('ascii')
+    strncpy(dest.text, btext, dest.l_text)
+
+    cdef bytes bseqname
+    # collect targets
+    if "SQ" in new_header:
+        seqs = []
+        for fields in new_header["SQ"]:
+            try:
+                seqs.append((fields["SN"], fields["LN"]))
+            except KeyError:
+                raise KeyError("incomplete sequence information in '%s'" % str(fields))
+
+        dest.n_targets = len(seqs)
+        dest.target_name = calloc(dest.n_targets, sizeof(char*))
+        dest.target_len = calloc(dest.n_targets, sizeof(uint32_t))
+
+        for x from 0 <= x < dest.n_targets:
+            seqname, seqlen = seqs[x]
+            dest.target_name[x] = calloc(
+                len(seqname) + 1, sizeof(char))
+            bseqname = seqname.encode('ascii')
+            strncpy(dest.target_name[x], bseqname,
+                    len(seqname) + 1)
+            dest.target_len[x] = seqlen
+
+    return dest
+
+
+cdef class AlignmentFile(HTSFile):
+    """AlignmentFile(filepath_or_object, mode=None, template=None,
+    reference_names=None, reference_lengths=None, text=NULL,
+    header=None, add_sq_text=False, check_header=True, check_sq=True,
+    reference_filename=None, filename=None, duplicate_filehandle=True)
+
+    A :term:`SAM`/:term:`BAM` formatted file.
+
+    If `filepath_or_object` is a string, the file is automatically
+    opened. If `filepath_or_object` is a python File object, the
+    already opened file will be used.
+
+    If the file is opened for reading and an index (.bai) exists, it
+    will be opened automatically. Without an index, random access via
+    :meth:`~pysam.AlignmentFile.fetch` and
+    :meth:`~pysam.AlignmentFile.pileup` is disabled.
+
+    For writing, the header of a :term:`SAM` file/:term:`BAM` file can
+    be constituted from several sources (see also the samtools format
+    specification):
+
+    1. If `template` is given, the header is copied from another
+       `AlignmentFile` (`template` must be a
+       :class:`~pysam.AlignmentFile`).
+
+    2. If `header` is given, the header is built from a
+       multi-level dictionary.
+
+    3. If `text` is given, new header text is copied from raw
+       text.
+
+    4. The names (`reference_names`) and lengths
+       (`reference_lengths`) are supplied directly as lists.
+
+    When reading or writing a CRAM file, the filename of a FASTA-formatted
+    reference can be specified with `reference_filename`.
+
+    By default, if a file is opened in mode 'r', it is checked
+    for a valid header (`check_header` = True) and a definition of
+    chromosome names (`check_sq` = True).
+
+    Parameters
+    ----------
+    mode : string
+        `mode` should be ``r`` for reading or ``w`` for writing. The
+        default is text mode (:term:`SAM`). For binary (:term:`BAM`)
+        I/O you should append ``b`` for compressed or ``u`` for
+        uncompressed :term:`BAM` output. Use ``h`` to output header
+        information in text (:term:`TAM`) mode. Use ``c`` for
+        :term:`CRAM` formatted files.
+
+        If ``b`` is present, it must immediately follow ``r`` or
+        ``w``. Valid modes are ``r``, ``w``, ``wh``, ``rb``, ``wb``,
+        ``wbu``, ``wb0``, ``rc`` and ``wc``. For instance, to open a
+        :term:`BAM` formatted file for reading, type::
+
+            f = pysam.AlignmentFile('ex1.bam', 'rb')
+
+        If mode is not specified, the method will try to auto-detect
+        in the order 'rb', 'r', thus both of the following should work::
+
+            f1 = pysam.AlignmentFile('ex1.bam')
+            f2 = pysam.AlignmentFile('ex1.sam')
+
+    template : AlignmentFile
+        when writing, copy header from `template`.
+
+    header : dict
+        when writing, build header from a multi-level dictionary. The
+        first level are the four types ('HD', 'SQ', ...). The second
+        level are a list of lines, with each line being a list of
+        tag-value pairs. The header is constructed first from all the
+        defined fields, followed by user tags in alphabetical order.
+
+    text : string
+        when writing, use the string provided as the header
+
+    reference_names : list
+        see reference_lengths
+
+    reference_lengths : list
+        when writing, build header from list of chromosome names and
+        lengths. By default, 'SQ' and 'LN' tags will be added to the
+        header text. This option can be changed by unsetting the flag
+        `add_sq_text`.
+
+    add_sq_text : bool
+        if False, do not add 'SQ' and 'LN' tags to the header. This
+        permits construction of :term:`SAM` formatted files without a
+        header.
+
+    check_header : bool
+        when reading, check if header is present (default=True)
+
+    check_sq : bool
+        when reading, check if SQ entries are present in header
+        (default=True)
+
+    reference_filename : string
+        Path to a FASTA-formatted reference file. Valid only for CRAM files.
+        When reading a CRAM file, this overrides both ``$REF_PATH`` and the URL
+        specified in the header (``UR`` tag), which are normally used to find
+        the reference.
+
+    filename : string
+        Alternative to filepath_or_object. Filename of the file
+        to be opened.
+
+    duplicate_filehandle: bool
+        By default, file handles passed either directly or through
+        File-like objects will be duplicated before passing them to
+        htslib. The duplication prevents issues where the same stream
+        will be closed by htslib and through destruction of the
+        high-level python object. Set to False to turn off
+        duplication.
+
+    """
+
+    def __cinit__(self, *args, **kwargs):
+        self.htsfile = NULL
+        self.filename = None
+        self.mode = None
+        self.is_stream = False
+        self.is_remote = False
+        self.index = NULL
+
+        if "filename" in kwargs:
+            args = [kwargs["filename"]]
+            del kwargs["filename"]
+
+        self._open(*args, **kwargs)
+
+        # allocate memory for iterator
+        self.b = calloc(1, sizeof(bam1_t))
+
+    def has_index(self):
+        """return True if htsfile has an existing (and opened) index.
+        """
+        return self.index != NULL
+
+    def check_index(self):
+        """return True if index is present.
+
+        Raises
+        ------
+
+        AttributeError
+            if htsfile is :term:`SAM` formatted and thus has no index.
+
+        ValueError
+            if htsfile is closed or index could not be opened.
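+
+        For example, a minimal guard against a missing index might look
+        like this (``ex1.bam`` and ``chr1`` are illustrative)::
+
+            with pysam.AlignmentFile("ex1.bam", "rb") as samfile:
+                samfile.check_index()   # raises ValueError if no index
+                for read in samfile.fetch("chr1"):
+                    pass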
+ """ + + if not self.is_open: + raise ValueError("I/O operation on closed file") + if not self.is_bam and not self.is_cram: + raise AttributeError( + "AlignmentFile.mapped only available in bam files") + if self.index == NULL: + raise ValueError( + "mapping information not recorded in index " + "or index not available") + return True + + def _open(self, + filepath_or_object, + mode=None, + AlignmentFile template=None, + reference_names=None, + reference_lengths=None, + reference_filename=None, + text=None, + header=None, + port=None, + add_sq_text=True, + check_header=True, + check_sq=True, + filepath_index=None, + referencenames=None, + referencelengths=None, + duplicate_filehandle=True): + '''open a sam, bam or cram formatted file. + + If _open is called on an existing file, the current file + will be closed and a new file will be opened. + ''' + cdef char *cfilename = NULL + cdef char *creference_filename = NULL + cdef char *cindexname = NULL + cdef char *cmode = NULL + + # for backwards compatibility: + if referencenames is not None: + reference_names = referencenames + if referencelengths is not None: + reference_lengths = referencelengths + + # close a previously opened file + if self.is_open: + self.close() + + # autodetection for read + if mode is None: + mode = "r" + + assert mode in ("r", "w", "rb", "wb", "wh", + "wbu", "rU", "wb0", + "rc", "wc"), \ + "invalid file opening mode `%s`" % mode + + self.duplicate_filehandle = duplicate_filehandle + + # StringIO not supported + if isinstance(filepath_or_object, StringIO): + raise NotImplementedError( + "access from StringIO objects not supported") + # reading from a file descriptor + elif isinstance(filepath_or_object, int): + self.filename = filepath_or_object + filename = None + self.is_remote = False + self.is_stream = True + # reading from a File object or other object with fileno + elif hasattr(filepath_or_object, "fileno"): + if filepath_or_object.closed: + raise ValueError('I/O operation on closed file') + self.filename = filepath_or_object + # .name can be TextIOWrapper + try: + filename = encode_filename(str(filepath_or_object.name)) + cfilename = filename + except AttributeError: + filename = None + self.is_remote = False + self.is_stream = True + # what remains is a filename + else: + self.filename = filename = encode_filename(filepath_or_object) + cfilename = filename + self.is_remote = hisremote(cfilename) + self.is_stream = self.filename == b'-' + + # for htslib, wbu seems to not work + if mode == "wbu": + mode = "wb0" + + self.mode = force_bytes(mode) + self.reference_filename = reference_filename = encode_filename( + reference_filename) + + cdef char * ctext + cdef hFILE * fp + ctext = NULL + + if mode[0] == 'w': + # open file for writing + + # header structure (used for writing) + if template: + self.header = bam_hdr_dup(template.header) + elif header: + self.header = build_header(header) + else: + # build header from a target names and lengths + assert reference_names and reference_lengths, \ + ("either supply options `template`, `header` " + "or both `reference_names` and `reference_lengths` " + "for writing") + assert len(reference_names) == len(reference_lengths), \ + "unequal names and lengths of reference sequences" + + # allocate and fill header + reference_names = [force_bytes(ref) for ref in reference_names] + self.header = bam_hdr_init() + self.header.n_targets = len(reference_names) + n = 0 + for x in reference_names: + n += len(x) + 1 + self.header.target_name = calloc(n, sizeof(char*)) + 
self.header.target_len = calloc(n, sizeof(uint32_t)) + for x from 0 <= x < self.header.n_targets: + self.header.target_len[x] = reference_lengths[x] + name = reference_names[x] + self.header.target_name[x] = calloc( + len(name) + 1, sizeof(char)) + strncpy(self.header.target_name[x], name, len(name)) + + # Optionally, if there is no text, add a SAM + # compatible header to output file. + if text is None and add_sq_text: + text = [] + for x from 0 <= x < self.header.n_targets: + text.append("@SQ\tSN:%s\tLN:%s\n" % \ + (force_str(reference_names[x]), + reference_lengths[x])) + text = ''.join(text) + + if text is not None: + # copy without \0 + text = force_bytes(text) + ctext = text + self.header.l_text = strlen(ctext) + self.header.text = calloc( + strlen(ctext), sizeof(char)) + memcpy(self.header.text, ctext, strlen(ctext)) + + self.htsfile = self._open_htsfile() + + # set filename with reference sequences. If no filename + # is given, the CRAM reference arrays will be built from + # the @SQ header in the header + if "c" in mode and reference_filename: + # note that fn_aux takes ownership, so create a copy + self.htsfile.fn_aux = strdup(self.reference_filename) + + # write header to htsfile + if "b" in mode or "c" in mode or "h" in mode: + with nogil: + sam_hdr_write(self.htsfile, self.header) + + elif mode[0] == "r": + # open file for reading + if not self._exists(): + raise IOError("file `%s` not found" % self.filename) + + self.htsfile = self._open_htsfile() + + if self.htsfile == NULL: + raise ValueError( + "could not open file (mode='%s') - " + "is it SAM/BAM format?" % mode) + + if self.htsfile.format.category != sequence_data: + raise ValueError("file does not contain alignment data") + + # bam files require a valid header + if self.is_bam or self.is_cram: + with nogil: + self.header = sam_hdr_read(self.htsfile) + if self.header == NULL: + raise ValueError( + "file does not have valid header (mode='%s') " + "- is it BAM format?" % mode ) + else: + # in sam files it is optional (htsfile full of + # unmapped reads) + if check_header: + with nogil: + self.header = sam_hdr_read(self.htsfile) + if self.header == NULL: + raise ValueError( + "file does not have valid header (mode='%s') " + "- is it SAM format?" % mode ) + # self.header.ignore_sam_err = True + + # set filename with reference sequences + if self.is_cram and reference_filename: + creference_filename = self.reference_filename + hts_set_opt(self.htsfile, + CRAM_OPT_REFERENCE, + creference_filename) + + if check_sq and self.header.n_targets == 0: + raise ValueError( + ("file has no sequences defined (mode='%s') - " + "is it SAM/BAM format? 
Consider opening with " + "check_sq=False") % mode) + + assert self.htsfile != NULL + + # check for index and open if present + cdef int format_index = -1 + if self.is_bam: + format_index = HTS_FMT_BAI + elif self.is_cram: + format_index = HTS_FMT_CRAI + + if mode[0] == "r" and (self.is_bam or self.is_cram): + # open index for remote files + if self.is_remote and not filepath_index: + with nogil: + self.index = hts_idx_load(cfilename, format_index) + if self.index == NULL: + warnings.warn( + "unable to open remote index for '%s'" % cfilename) + else: + has_index = True + if filepath_index: + if not os.path.exists(filepath_index): + warnings.warn( + "unable to open index at %s" % cfilename) + self.index = NULL + has_index = False + elif filename is not None: + if self.is_bam \ + and not os.path.exists(filename + b".bai") \ + and not os.path.exists(filename[:-4] + b".bai") \ + and not os.path.exists(filename + b".csi") \ + and not os.path.exists(filename[:-4] + b".csi"): + self.index = NULL + has_index = False + elif self.is_cram \ + and not os.path.exists(filename + b".crai") \ + and not os.path.exists(filename[:-5] + b".crai"): + self.index = NULL + has_index = False + else: + self.index = NULL + has_index = False + + if has_index: + # returns NULL if there is no index or index could + # not be opened + if filepath_index: + cindexname = filepath_index = encode_filename(filepath_index) + with nogil: + self.index = sam_index_load2(self.htsfile, + cfilename, + cindexname) + else: + with nogil: + self.index = sam_index_load(self.htsfile, + cfilename) + if self.index == NULL: + raise IOError( + "error while opening index for '%s'" % + filename) + + # save start of data section + if not self.is_stream: + self.start_offset = self.tell() + + def get_tid(self, reference): + """ + return the numerical :term:`tid` corresponding to + :term:`reference` + + returns -1 if reference is not known. + """ + if not self.is_open: + raise ValueError("I/O operation on closed file") + reference = force_bytes(reference) + return bam_name2id(self.header, reference) + + def get_reference_name(self, tid): + """ + return :term:`reference` name corresponding to numerical :term:`tid` + """ + if not self.is_open: + raise ValueError("I/O operation on closed file") + if not 0 <= tid < self.header.n_targets: + raise ValueError("reference_id %i out of range 0<=tid<%i" % + (tid, self.header.n_targets)) + return charptr_to_str(self.header.target_name[tid]) + + def parse_region(self, + reference=None, + start=None, + end=None, + region=None, + tid=None): + """parse alternative ways to specify a genomic region. A region can + either be specified by :term:`reference`, `start` and + `end`. `start` and `end` denote 0-based, half-open + intervals. + + Alternatively, a samtools :term:`region` string can be + supplied. + + If any of the coordinates are missing they will be replaced by the + minimum (`start`) or maximum (`end`) coordinate. + + Note that region strings are 1-based, while `start` and `end` denote + an interval in python coordinates. + + Returns + ------- + + tuple : a tuple of `flag`, :term:`tid`, `start` and `end`. The + flag indicates whether no coordinates were supplied and the + genomic region is the complete genomic space. + + Raises + ------ + + ValueError + for invalid or out of bounds regions. 
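+
+        For example, assuming ``chr1`` is the first reference in the
+        header, both of the following calls return ``(1, 0, 99, 200)``,
+        since region strings are 1-based while `start` and `end` are
+        0-based, half-open::
+
+            samfile.parse_region(region="chr1:100-200")
+            samfile.parse_region("chr1", 99, 200)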
+
+        """
+        cdef int rtid
+        cdef long long rstart
+        cdef long long rend
+
+        rtid = -1
+        rstart = 0
+        rend = MAX_POS
+        if start is not None:
+            try:
+                rstart = start
+            except OverflowError:
+                raise ValueError('start out of range (%i)' % start)
+
+        if end is not None:
+            try:
+                rend = end
+            except OverflowError:
+                raise ValueError('end out of range (%i)' % end)
+
+        if region:
+            region = force_str(region)
+            parts = re.split("[:-]", region)
+            reference = parts[0]
+            if len(parts) >= 2:
+                rstart = int(parts[1]) - 1
+            if len(parts) >= 3:
+                rend = int(parts[2])
+
+        if not reference:
+            return 0, 0, 0, 0
+
+        if tid is not None:
+            rtid = tid
+        else:
+            rtid = self.get_tid(reference)
+
+        if rtid < 0:
+            raise ValueError(
+                "invalid reference `%s`" % reference)
+        if rstart > rend:
+            raise ValueError(
+                'invalid coordinates: start (%i) > end (%i)' % (rstart, rend))
+        if not 0 <= rstart < MAX_POS:
+            raise ValueError('start out of range (%i)' % rstart)
+        if not 0 <= rend <= MAX_POS:
+            raise ValueError('end out of range (%i)' % rend)
+
+        return 1, rtid, rstart, rend
+
+    def fetch(self,
+              reference=None,
+              start=None,
+              end=None,
+              region=None,
+              tid=None,
+              until_eof=False,
+              multiple_iterators=False):
+        """fetch reads aligned in a :term:`region`.
+
+        See :meth:`AlignmentFile.parse_region` for more information
+        on genomic regions.
+
+        Without a `reference` or `region` all mapped reads in the file
+        will be fetched. The reads will be returned ordered by reference
+        sequence, which will not necessarily be the order within the
+        file. This mode of iteration still requires an index. If there is
+        no index, use `until_eof=True`.
+
+        If only `reference` is set, all reads aligned to `reference`
+        will be fetched.
+
+        A :term:`SAM` file does not allow random access. If `region`
+        or `reference` are given, an exception is raised.
+
+        Parameters
+        ----------
+
+        until_eof : bool
+
+            If `until_eof` is True, all reads from the current file
+            position will be returned in order as they are within the
+            file. Using this option will also fetch unmapped reads.
+
+        multiple_iterators : bool
+
+            If `multiple_iterators` is True, multiple
+            iterators on the same file can be used at the same time. The
+            iterator returned will receive its own copy of a filehandle to
+            the file, effectively re-opening the file. Re-opening a file
+            creates some overhead, so beware.
+
+        Returns
+        -------
+
+        An iterator over a collection of reads.
+
+        Raises
+        ------
+
+        ValueError
+            if the genomic coordinates are out of range or invalid or the
+            file does not permit random access to genomic coordinates.
+
+        """
+        cdef int rtid, rstart, rend, has_coord
+
+        if not self.is_open:
+            raise ValueError("I/O operation on closed file")
+
+        has_coord, rtid, rstart, rend = self.parse_region(
+            reference,
+            start,
+            end,
+            region,
+            tid)
+
+        # Turn off re-opening if htsfile is a stream
+        if self.is_stream:
+            multiple_iterators = False
+
+        if self.is_bam or self.is_cram:
+            if not until_eof and not self.is_remote:
+                if not self.has_index():
+                    raise ValueError(
+                        "fetch called on bamfile without index")
+
+            if has_coord:
+                return IteratorRowRegion(
+                    self, rtid, rstart, rend,
+                    multiple_iterators=multiple_iterators)
+            else:
+                if until_eof:
+                    return IteratorRowAll(
+                        self,
+                        multiple_iterators=multiple_iterators)
+                else:
+                    # AH: check - reason why no multiple_iterators for
+                    # AllRefs?
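+                    # IteratorRowAllRefs chains an IteratorRowRegion over
+                    # each reference in turn, so reads come back grouped
+                    # by reference rather than in strict file order.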
+                    return IteratorRowAllRefs(
+                        self,
+                        multiple_iterators=multiple_iterators)
+        else:
+            if has_coord:
+                raise ValueError(
+                    "fetching by region is not available for sam files")
+
+            if self.header == NULL:
+                raise ValueError(
+                    "fetch called for htsfile without header")
+
+            # check if targets are defined
+            # give warning, sam_read1 segfaults
+            if self.header.n_targets == 0:
+                warnings.warn("fetch called for htsfile without header")
+
+            return IteratorRowAll(self,
+                                  multiple_iterators=multiple_iterators)
+
+    def head(self, n, multiple_iterators=True):
+        '''return an iterator over the first n alignments.
+
+        This iterator is useful for inspecting the BAM file.
+
+        Parameters
+        ----------
+
+        multiple_iterators : bool
+
+            is set to True by default in order to
+            avoid changing the current file position.
+
+        Returns
+        -------
+
+        an iterator over a collection of reads
+
+        '''
+        return IteratorRowHead(self, n,
+                               multiple_iterators=multiple_iterators)
+
+    def mate(self, AlignedSegment read):
+        '''return the mate of :class:`~pysam.AlignedSegment` `read`.
+
+        .. note::
+
+            Calling this method will change the file position.
+            This might interfere with any iterators that have
+            not re-opened the file.
+
+        .. note::
+
+            This method is too slow for high-throughput processing.
+            If a read needs to be processed with its mate, work
+            from a read name sorted file or, better, cache reads.
+
+        Returns
+        -------
+
+        :class:`~pysam.AlignedSegment` : the mate
+
+        Raises
+        ------
+
+        ValueError
+            if the read is unpaired or the mate is unmapped
+
+        '''
+        cdef uint32_t flag = read._delegate.core.flag
+
+        if flag & BAM_FPAIRED == 0:
+            raise ValueError("read %s: is unpaired" %
+                             (read.query_name))
+        if flag & BAM_FMUNMAP != 0:
+            raise ValueError("mate %s: is unmapped" %
+                             (read.query_name))
+
+        # xor flags to get the other mate
+        cdef int x = BAM_FREAD1 + BAM_FREAD2
+        flag = (flag ^ x) & x
+
+        # Make sure to use a separate file to jump around
+        # to mate as otherwise the original file position
+        # will be lost
+        # The following code is not using the C API and
+        # could thus be made much quicker, for example
+        # by using tell and seek.
+        for mate in self.fetch(
+                read._delegate.core.mpos,
+                read._delegate.core.mpos + 1,
+                tid=read._delegate.core.mtid,
+                multiple_iterators=True):
+            if mate.flag & flag != 0 and \
+               mate.query_name == read.query_name:
+                break
+        else:
+            raise ValueError("mate not found")
+
+        return mate
+
+    def pileup(self,
+               reference=None,
+               start=None,
+               end=None,
+               region=None,
+               **kwargs):
+        """perform a :term:`pileup` within a :term:`region`. The region is
+        specified by :term:`reference`, 'start' and 'end' (using
+        0-based indexing). Alternatively, a samtools 'region' string
+        can be supplied.
+
+        Without 'reference' or 'region' all reads will be used for the
+        pileup. The reads will be returned ordered by
+        :term:`reference` sequence, which will not necessarily be the
+        order within the file.
+
+        Note that :term:`SAM` formatted files do not allow random
+        access. In these files, if a 'region' or 'reference' are
+        given an exception is raised.
+
+        .. note::
+
+            'all' reads which overlap the region are returned. The
+            first base returned will be the first base of the first
+            read 'not' necessarily the first base of the region used
+            in the query.
+
+        Parameters
+        ----------
+
+        stepper : string
+            The stepper controls how the iterator advances.
+            Possible options for the stepper are
+
+            ``all``
+                skip reads in which any of the following flags are set:
+                BAM_FUNMAP, BAM_FSECONDARY, BAM_FQCFAIL, BAM_FDUP
+
+            ``nofilter``
+                uses every single read
+
+            ``samtools``
+                same filter and read processing as in :term:`csamtools`
+                pileup. This requires a 'fastafile' to be given.
+
+
+        fastafile : :class:`~pysam.FastaFile` object.
+
+            This is required for some of the steppers.
+
+        max_depth : int
+            Maximum read depth permitted. The default limit is 8000.
+
+        truncate : bool
+
+            By default, the samtools pileup engine outputs all reads
+            overlapping a region. If truncate is True and a region is
+            given, only columns in the exact region specified are
+            returned.
+
+        Returns
+        -------
+
+        an iterator over genomic positions.
+
+        """
+        cdef int rtid, rstart, rend, has_coord
+
+        if not self.is_open:
+            raise ValueError("I/O operation on closed file")
+
+        has_coord, rtid, rstart, rend = self.parse_region(
+            reference, start, end, region)
+
+        if self.is_bam or self.is_cram:
+            if not self.has_index():
+                raise ValueError("no index available for pileup")
+
+            if has_coord:
+                return IteratorColumnRegion(self,
+                                            tid=rtid,
+                                            start=rstart,
+                                            end=rend,
+                                            **kwargs)
+            else:
+                return IteratorColumnAllRefs(self, **kwargs)
+
+        else:
+            raise NotImplementedError(
+                "pileup of samfiles not implemented yet")
+
+    def count(self,
+              reference=None,
+              start=None,
+              end=None,
+              region=None,
+              until_eof=False,
+              read_callback="nofilter"):
+        '''count the number of reads in :term:`region`
+
+        The region is specified by :term:`reference`, `start` and
+        `end`. Alternatively, a :term:`samtools` :term:`region` string
+        can be supplied.
+
+        A :term:`SAM` file does not allow random access and if
+        `region` or `reference` are given, an exception is raised.
+
+        Parameters
+        ----------
+
+        reference : string
+            reference_name of the genomic region (chromosome)
+
+        start : int
+            start of the genomic region
+
+        end : int
+            end of the genomic region
+
+        region : string
+            a region string in samtools format.
+
+        until_eof : bool
+            count until the end of the file, possibly including
+            unmapped reads as well.
+
+        read_callback: string or function
+
+            select a call-back to ignore reads when counting. It can
+            be either a string with the following values:
+
+            ``all``
+                skip reads in which any of the following
+                flags are set: BAM_FUNMAP, BAM_FSECONDARY, BAM_FQCFAIL,
+                BAM_FDUP
+
+            ``nofilter``
+                uses every single read
+
+            Alternatively, `read_callback` can be a function
+            ``check_read(read)`` that should return True only for
+            those reads that shall be included in the counting.
+
+        Raises
+        ------
+
+        ValueError
+            if the genomic coordinates are out of range or invalid.
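+
+        For example, a sketch that counts only properly-paired reads
+        in a region (coordinates are illustrative)::
+
+            n = samfile.count("chr1", 100, 200,
+                              read_callback=lambda read: read.is_proper_pair)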
+
+        '''
+        cdef AlignedSegment read
+        cdef long counter = 0
+
+        if not self.is_open:
+            raise ValueError("I/O operation on closed file")
+
+        cdef int filter_method = 0
+        if read_callback == "all":
+            filter_method = 1
+        elif read_callback == "nofilter":
+            filter_method = 2
+
+        for read in self.fetch(reference=reference,
+                               start=start,
+                               end=end,
+                               region=region,
+                               until_eof=until_eof):
+            # apply filter
+            if filter_method == 1:
+                # filter = "all"
+                if (read.flag & (0x4 | 0x100 | 0x200 | 0x400)):
+                    continue
+            elif filter_method == 2:
+                # filter = "nofilter"
+                pass
+            else:
+                if not read_callback(read):
+                    continue
+            counter += 1
+
+        return counter
+
+    @cython.boundscheck(False)  # we do manual bounds checking
+    def count_coverage(self,
+                       reference=None,
+                       start=None,
+                       end=None,
+                       region=None,
+                       quality_threshold=15,
+                       read_callback='all'):
+        """count the coverage of genomic positions by reads in :term:`region`.
+
+        The region is specified by :term:`reference`, `start` and
+        `end`. Alternatively, a :term:`samtools` :term:`region` string
+        can be supplied. The coverage is computed per-base [ACGT].
+
+        Parameters
+        ----------
+
+        reference : string
+            reference_name of the genomic region (chromosome)
+
+        start : int
+            start of the genomic region
+
+        end : int
+            end of the genomic region
+
+        region : string
+            a region string.
+
+        quality_threshold : int
+            quality_threshold is the minimum quality score (in phred) a
+            base has to reach to be counted.
+
+        read_callback: string or function
+
+            select a call-back to ignore reads when counting. It can
+            be either a string with the following values:
+
+            ``all``
+                skip reads in which any of the following
+                flags are set: BAM_FUNMAP, BAM_FSECONDARY, BAM_FQCFAIL,
+                BAM_FDUP
+
+            ``nofilter``
+                uses every single read
+
+            Alternatively, `read_callback` can be a function
+            ``check_read(read)`` that should return True only for
+            those reads that shall be included in the counting.
+
+        Raises
+        ------
+
+        ValueError
+            if the genomic coordinates are out of range or invalid.
+
+        Returns
+        -------
+
+        four array.arrays of the same length in order A C G T : tuple
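+
+        For example, a per-base depth track can be derived from the
+        four arrays (coordinates are illustrative)::
+
+            count_a, count_c, count_g, count_t = samfile.count_coverage(
+                "chr1", 100, 200)
+            depth = [a + c + g + t for a, c, g, t in
+                     zip(count_a, count_c, count_g, count_t)]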
+
+        """
+
+        cdef int _start = start
+        cdef int _stop = end
+        cdef int length = _stop - _start
+        cdef c_array.array int_array_template = array.array('L', [])
+        cdef c_array.array count_a
+        cdef c_array.array count_c
+        cdef c_array.array count_g
+        cdef c_array.array count_t
+        count_a = c_array.clone(int_array_template, length, zero=True)
+        count_c = c_array.clone(int_array_template, length, zero=True)
+        count_g = c_array.clone(int_array_template, length, zero=True)
+        count_t = c_array.clone(int_array_template, length, zero=True)
+
+        cdef AlignedSegment read
+        cdef cython.str seq
+        cdef c_array.array quality
+        cdef int qpos
+        cdef int refpos
+        cdef int c = 0
+        cdef int filter_method = 0
+        if read_callback == "all":
+            filter_method = 1
+        elif read_callback == "nofilter":
+            filter_method = 2
+
+        cdef int _threshold = quality_threshold
+        for read in self.fetch(reference=reference,
+                               start=start,
+                               end=end,
+                               region=region):
+            # apply filter
+            if filter_method == 1:
+                # filter = "all"
+                if (read.flag & (0x4 | 0x100 | 0x200 | 0x400)):
+                    continue
+            elif filter_method == 2:
+                # filter = "nofilter"
+                pass
+            else:
+                if not read_callback(read):
+                    continue
+
+            # count
+            seq = read.seq
+            quality = read.query_qualities
+            for qpos, refpos in read.get_aligned_pairs(True):
+                if qpos is not None and refpos is not None and \
+                   _start <= refpos < _stop:
+                    if quality[qpos] >= _threshold:
+                        if seq[qpos] == 'A':
+                            count_a.data.as_ulongs[refpos - _start] += 1
+                        if seq[qpos] == 'C':
+                            count_c.data.as_ulongs[refpos - _start] += 1
+                        if seq[qpos] == 'G':
+                            count_g.data.as_ulongs[refpos - _start] += 1
+                        if seq[qpos] == 'T':
+                            count_t.data.as_ulongs[refpos - _start] += 1
+
+        return count_a, count_c, count_g, count_t
+
+    def find_introns(self, read_iterator):
+        """Return a dictionary {(start, stop): count}
+        listing the intronic sites in the reads (identified by 'N' in the
+        cigar strings), and their support (= number of reads).
+
+        read_iterator can be the result of a .fetch(...) call.
+        Or it can be a generator filtering such reads. Example::
+
+            samfile.find_introns((read for read in samfile.fetch(...)
+                                  if read.is_reverse))
+        """
+        import collections
+        res = collections.Counter()
+        for r in read_iterator:
+            if 'N' in r.cigarstring:
+                last_read_pos = False
+                for read_loc, genome_loc in r.get_aligned_pairs():
+                    if read_loc is None and last_read_pos:
+                        start = genome_loc
+                    elif read_loc and last_read_pos is None:
+                        stop = genome_loc  # we are right-exclusive, so this is correct
+                        res[(start, stop)] += 1
+                        del start
+                        del stop
+                    last_read_pos = read_loc
+        return res
+
+    def close(self):
+        '''
+        closes the :class:`pysam.AlignmentFile`.'''
+
+        if self.htsfile == NULL:
+            return
+
+        cdef int ret = hts_close(self.htsfile)
+        hts_idx_destroy(self.index)
+        self.htsfile = NULL
+
+        if ret < 0:
+            global errno
+            if errno == EPIPE:
+                errno = 0
+            else:
+                raise OSError(errno, force_str(strerror(errno)))
+
+    def __dealloc__(self):
+        # remember: dealloc cannot call other methods
+        # note: no doc string
+        # note: __del__ is not called.
+
+        # FIXME[kbj]: isn't self.close a method? I've been duplicating
+        # close within __dealloc__ (see BCFFile.__dealloc__). Not a pretty
+        # solution and perhaps unnecessary given that calling self.close has
+        # been working for years.
+        # AH: I have removed the call to close. Even though it is working,
+        # it seems to be dangerous according to the documentation as the
+        # object may be partially deconstructed already.
+        cdef int ret = 0
+
+        if self.htsfile != NULL:
+            ret = hts_close(self.htsfile)
+            hts_idx_destroy(self.index)
+            self.htsfile = NULL
+
+        bam_destroy1(self.b)
+        if self.header != NULL:
+            bam_hdr_destroy(self.header)
+
+        if ret < 0:
+            global errno
+            if errno == EPIPE:
+                errno = 0
+            else:
+                raise OSError(errno, force_str(strerror(errno)))
+
+    cpdef int write(self, AlignedSegment read) except -1:
+        '''
+        write a single :class:`pysam.AlignedSegment` to disk.
+
+        Raises
+        ------
+        ValueError
+            if the writing failed
+
+        Returns
+        -------
+
+        int : the number of bytes written. If the file is closed,
+        this will be 0.
+        '''
+        if not self.is_open:
+            return 0
+
+        cdef int ret
+
+        with nogil:
+            ret = sam_write1(self.htsfile,
+                             self.header,
+                             read._delegate)
+
+        # kbj: Still need to raise an exception with except -1. Otherwise
+        # when ret == -1 we get a "SystemError: error return without
+        # exception set".
+        if ret < 0:
+            raise IOError(
+                "sam_write1 failed with error code {}".format(ret))
+
+        return ret
+
+    # context manager interface
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        self.close()
+        return False
+
+    ###############################################################
+    ###############################################################
+    ###############################################################
+    ## properties
+    ###############################################################
+    property nreferences:
+        """int with the number of :term:`reference` sequences in the file.
+        This is a read-only attribute."""
+        def __get__(self):
+            if not self.is_open:
+                raise ValueError("I/O operation on closed file")
+            return self.header.n_targets
+
+    property references:
+        """tuple with the names of :term:`reference` sequences. This is a
+        read-only attribute"""
+        def __get__(self):
+            if not self.is_open:
+                raise ValueError("I/O operation on closed file")
+            t = []
+            for x from 0 <= x < self.header.n_targets:
+                t.append(charptr_to_str(self.header.target_name[x]))
+            return tuple(t)
+
+    property lengths:
+        """tuple of the lengths of the :term:`reference` sequences. This is a
+        read-only attribute. The lengths are in the same order as
+        :attr:`pysam.AlignmentFile.references`
+
+        """
+        def __get__(self):
+            if not self.is_open:
+                raise ValueError("I/O operation on closed file")
+            t = []
+            for x from 0 <= x < self.header.n_targets:
+                t.append(self.header.target_len[x])
+            return tuple(t)
+
+    property mapped:
+        """int with total number of mapped alignments according to the
+        statistics recorded in the index. This is a read-only
+        attribute.
+        """
+        def __get__(self):
+            self.check_index()
+            cdef int tid
+            cdef uint64_t total = 0
+            cdef uint64_t mapped, unmapped
+            for tid from 0 <= tid < self.header.n_targets:
+                with nogil:
+                    hts_idx_get_stat(self.index, tid, &mapped, &unmapped)
+                total += mapped
+            return total
+
+    property unmapped:
+        """int with total number of unmapped reads according to the statistics
+        recorded in the index. This number of reads includes the number of reads
+        without coordinates. This is a read-only attribute.
+        """
+        def __get__(self):
+            self.check_index()
+            cdef int tid
+            cdef uint64_t total = hts_idx_get_n_no_coor(self.index)
+            cdef uint64_t mapped, unmapped
+            for tid from 0 <= tid < self.header.n_targets:
+                with nogil:
+                    hts_idx_get_stat(self.index, tid, &mapped, &unmapped)
+                total += unmapped
+            return total
+
+    property nocoordinate:
+        """int with total number of reads without coordinates according to
+        the statistics recorded in the index. This is a read-only attribute.
+        """
+        def __get__(self):
+            self.check_index()
+            cdef uint64_t n
+            with nogil:
+                n = hts_idx_get_n_no_coor(self.index)
+            return n
+
+    property text:
+        '''string with the full contents of the :term:`sam file` header as a
+        string.
+
+        This is a read-only attribute.
+
+        See :attr:`pysam.AlignmentFile.header` to get a parsed
+        representation of the header.
+        '''
+        def __get__(self):
+            if not self.is_open:
+                raise ValueError("I/O operation on closed file")
+            return from_string_and_size(self.header.text, self.header.l_text)
+
+    property header:
+        """two-level dictionary with header information from the file.
+
+        This is a read-only attribute.
+
+        The first level contains the record (``HD``, ``SQ``, etc) and
+        the second level contains the fields (``VN``, ``LN``, etc).
+
+        The parser is validating and will raise an AssertionError if
+        it encounters any record or field tags that are not part of
+        the SAM specification. Use the
+        :attr:`pysam.AlignmentFile.text` attribute to get the unparsed
+        header.
+
+        The parsing follows the SAM format specification with the
+        exception of the ``CL`` field. This option will consume the
+        rest of a header line irrespective of any additional fields.
+        This behaviour has been added to accommodate command line
+        options that contain characters that are not valid field
+        separators.
+
+        """
+        def __get__(self):
+            if not self.is_open:
+                raise ValueError("I/O operation on closed file")
+
+            result = {}
+
+            if self.header.text != NULL:
+                # convert to python string (note: call self.text to
+                # create 0-terminated string)
+                t = self.text
+                for line in t.split("\n"):
+                    if not line.strip(): continue
+                    assert line.startswith("@"), \
+                        "header line without '@': '%s'" % line
+                    fields = line[1:].split("\t")
+                    record = fields[0]
+                    assert record in VALID_HEADER_TYPES, \
+                        "header line with invalid type '%s': '%s'" % (record, line)
+
+                    # treat comments
+                    if record == "CO":
+                        if record not in result:
+                            result[record] = []
+                        result[record].append("\t".join(fields[1:]))
+                        continue
+                    # the following is clumsy as generators do not work?
+                    x = {}
+
+                    for idx, field in enumerate(fields[1:]):
+                        if ":" not in field:
+                            raise ValueError("malformed header: no ':' in field")
+                        key, value = field.split(":", 1)
+                        if key in ("CL",):
+                            # special treatment for command line
+                            # statements (CL). These might contain
+                            # characters that are non-conformant with
+                            # the valid field separators in the SAM
+                            # header. Thus, in contravention to the
+                            # SAM API, consume the rest of the line.
+                            key, value = "\t".join(fields[idx+1:]).split(":", 1)
+                            x[key] = KNOWN_HEADER_FIELDS[record][key](value)
+                            break
+
+                        # interpret type of known header record tags, default to str
+                        x[key] = KNOWN_HEADER_FIELDS[record].get(key, str)(value)
+
+                    if VALID_HEADER_TYPES[record] == dict:
+                        if record in result:
+                            raise ValueError(
+                                "multiple '%s' lines are not permitted" % record)
+
+                        result[record] = x
+                    elif VALID_HEADER_TYPES[record] == list:
+                        if record not in result: result[record] = []
+                        result[record].append(x)
+
+                # if there are no SQ lines in the header, add the
+                # reference names from the information in the bam
+                # file.
+                #
+                # Background: c-samtools keeps the textual part of the
+                # header separate from the list of reference names and
+                # lengths. Thus, if a header contains only SQ lines,
+                # the SQ information is not part of the textual header
+                # and thus are missing from the output. See issue 84.
+ if "SQ" not in result: + sq = [] + for ref, length in zip(self.references, self.lengths): + sq.append({'LN': length, 'SN': ref }) + result["SQ"] = sq + + return result + + ############################################################### + ## file-object like iterator access + ## note: concurrent access will cause errors (see IteratorRow + ## and multiple_iterators) + ## Possible solutions: deprecate or open new file handle + def __iter__(self): + if not self.is_open: + raise ValueError("I/O operation on closed file") + + if not self.is_bam and self.header.n_targets == 0: + raise NotImplementedError( + "can not iterate over samfile without header") + return self + + cdef bam1_t * getCurrent( self ): + return self.b + + cdef int cnext(self): + ''' + cversion of iterator. Used by :class:`pysam.AlignmentFile.IteratorColumn`. + ''' + cdef int ret + with nogil: + ret = sam_read1(self.htsfile, + self.header, + self.b) + return ret + + def __next__(self): + cdef int ret = self.cnext() + if (ret >= 0): + return makeAlignedSegment(self.b, self) + elif ret == -2: + raise IOError('truncated file') + else: + raise StopIteration + + # Compatibility functions for pysam < 0.8.3 + def gettid(self, reference): + """deprecated, use get_tid() instead""" + return self.get_tid(reference) + + def getrname(self, tid): + """deprecated, use get_reference_name() instead""" + return self.get_reference_name(tid) + + +cdef class IteratorRow: + '''abstract base class for iterators over mapped reads. + + Various iterators implement different behaviours for wrapping around + contig boundaries. Examples include: + + :class:`pysam.IteratorRowRegion` + iterate within a single contig and a defined region. + + :class:`pysam.IteratorRowAll` + iterate until EOF. This iterator will also include unmapped reads. + + :class:`pysam.IteratorRowAllRefs` + iterate over all reads in all reference sequences. + + The method :meth:`AlignmentFile.fetch` returns an IteratorRow. + + .. note:: + + It is usually not necessary to create an object of this class + explicitly. It is returned as a result of call to a + :meth:`AlignmentFile.fetch`. + + ''' + + def __init__(self, AlignmentFile samfile, int multiple_iterators=False): + cdef char *cfilename + cdef char *creference_filename + + if not samfile.is_open: + raise ValueError("I/O operation on closed file") + + # makes sure that samfile stays alive as long as the + # iterator is alive + self.samfile = samfile + + # reopen the file - note that this makes the iterator + # slow and causes pileup to slow down significantly. + if multiple_iterators: + cfilename = samfile.filename + with nogil: + self.htsfile = hts_open(cfilename, 'r') + assert self.htsfile != NULL + # read header - required for accurate positioning + # could a tell/seek work? 
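+            # (sam_hdr_read also advances the newly opened handle past
+            # the header block, so iteration presumably starts at the
+            # first alignment record.)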
+ with nogil: + self.header = sam_hdr_read(self.htsfile) + assert self.header != NULL + self.owns_samfile = True + # options specific to CRAM files + if samfile.is_cram and samfile.reference_filename: + creference_filename = samfile.reference_filename + hts_set_opt(self.htsfile, + CRAM_OPT_REFERENCE, + creference_filename) + + else: + self.htsfile = self.samfile.htsfile + self.owns_samfile = False + self.header = self.samfile.header + + self.retval = 0 + + self.b = bam_init1() + + def __dealloc__(self): + bam_destroy1(self.b) + if self.owns_samfile: + hts_close(self.htsfile) + bam_hdr_destroy(self.header) + + +cdef class IteratorRowRegion(IteratorRow): + """*(AlignmentFile samfile, int tid, int beg, int end, + int multiple_iterators=False)* + + iterate over mapped reads in a region. + + .. note:: + + It is usually not necessary to create an object of this class + explicitly. It is returned as a result of call to a + :meth:`AlignmentFile.fetch`. + + """ + + def __init__(self, AlignmentFile samfile, + int tid, int beg, int end, + int multiple_iterators=False): + + IteratorRow.__init__(self, samfile, + multiple_iterators=multiple_iterators) + + if not samfile.has_index(): + raise ValueError("no index available for iteration") + + with nogil: + self.iter = sam_itr_queryi( + self.samfile.index, + tid, + beg, + end) + + def __iter__(self): + return self + + cdef bam1_t * getCurrent(self): + return self.b + + cdef int cnext(self): + '''cversion of iterator. Used by IteratorColumn''' + with nogil: + self.retval = hts_itr_next(hts_get_bgzfp(self.htsfile), + self.iter, + self.b, + self.htsfile) + + def __next__(self): + self.cnext() + if self.retval >= 0: + return makeAlignedSegment(self.b, self.samfile) + elif self.retval == -2: + # Note: it is currently not the case that hts_iter_next + # returns -2 for a truncated file. + # See https://github.com/pysam-developers/pysam/pull/50#issuecomment-64928625 + raise IOError('truncated file') + else: + raise StopIteration + + def __dealloc__(self): + hts_itr_destroy(self.iter) + + +cdef class IteratorRowHead(IteratorRow): + """*(AlignmentFile samfile, n, int multiple_iterators=False)* + + iterate over first n reads in `samfile` + + .. note:: + It is usually not necessary to create an object of this class + explicitly. It is returned as a result of call to a + :meth:`AlignmentFile.head`. + + """ + + def __init__(self, AlignmentFile samfile, int n, + int multiple_iterators=False): + + IteratorRow.__init__(self, samfile, + multiple_iterators=multiple_iterators) + + self.max_rows = n + self.current_row = 0 + + def __iter__(self): + return self + + cdef bam1_t * getCurrent( self ): + return self.b + + cdef int cnext(self): + '''cversion of iterator. Used by IteratorColumn''' + cdef int ret + with nogil: + ret = sam_read1(self.htsfile, + self.samfile.header, + self.b) + return ret + + def __next__(self): + if self.current_row >= self.max_rows: + raise StopIteration + + cdef int ret = self.cnext() + if ret >= 0: + self.current_row += 1 + return makeAlignedSegment(self.b, self.samfile) + elif ret == -2: + raise IOError('truncated file') + else: + raise StopIteration + + +cdef class IteratorRowAll(IteratorRow): + """*(AlignmentFile samfile, int multiple_iterators=False)* + + iterate over all reads in `samfile` + + .. note:: + + It is usually not necessary to create an object of this class + explicitly. It is returned as a result of call to a + :meth:`AlignmentFile.fetch`. 
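+
+    A sketch of how this iterator is typically obtained (the filename
+    is illustrative)::
+
+        with pysam.AlignmentFile("ex1.bam", "rb") as samfile:
+            for read in samfile.fetch(until_eof=True):
+                pass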
+ + """ + + def __init__(self, AlignmentFile samfile, + int multiple_iterators=False): + + IteratorRow.__init__(self, samfile, + multiple_iterators=multiple_iterators) + + def __iter__(self): + return self + + cdef bam1_t * getCurrent( self ): + return self.b + + cdef int cnext(self): + '''cversion of iterator. Used by IteratorColumn''' + cdef int ret + with nogil: + ret = sam_read1(self.htsfile, + self.samfile.header, + self.b) + return ret + + def __next__(self): + cdef int ret = self.cnext() + if ret >= 0: + return makeAlignedSegment(self.b, self.samfile) + elif ret == -2: + raise IOError('truncated file') + else: + raise StopIteration + + +cdef class IteratorRowAllRefs(IteratorRow): + """iterates over all mapped reads by chaining iterators over each + reference + + .. note:: + It is usually not necessary to create an object of this class + explicitly. It is returned as a result of call to a + :meth:`AlignmentFile.fetch`. + + """ + + def __init__(self, AlignmentFile samfile, + multiple_iterators=False): + + IteratorRow.__init__(self, samfile, + multiple_iterators=multiple_iterators) + + if not samfile.has_index(): + raise ValueError("no index available for fetch") + + self.tid = -1 + + def nextiter(self): + # get a new iterator for a chromosome. The file + # will not be re-opened. + self.rowiter = IteratorRowRegion(self.samfile, + self.tid, + 0, + 1<<29) + # set htsfile and header of the rowiter + # to the values in this iterator to reflect multiple_iterators + self.rowiter.htsfile = self.htsfile + self.rowiter.header = self.header + + # make sure the iterator understand that IteratorRowAllRefs + # has ownership + self.rowiter.owns_samfile = False + + def __iter__(self): + return self + + def __next__(self): + # Create an initial iterator + if self.tid == -1: + if not self.samfile.nreferences: + raise StopIteration + self.tid = 0 + self.nextiter() + + while 1: + self.rowiter.cnext() + + # If current iterator is not exhausted, return aligned read + if self.rowiter.retval > 0: + return makeAlignedSegment(self.rowiter.b, self.samfile) + + self.tid += 1 + + # Otherwise, proceed to next reference or stop + if self.tid < self.samfile.nreferences: + self.nextiter() + else: + raise StopIteration + + +cdef class IteratorRowSelection(IteratorRow): + """*(AlignmentFile samfile)* + + iterate over reads in `samfile` at a given list of file positions. + + .. note:: + It is usually not necessary to create an object of this class + explicitly. It is returned as a result of call to a :meth:`AlignmentFile.fetch`. 
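+
+    For example, :class:`~pysam.IndexedReads` returns this iterator
+    from its ``find`` method (the query name is illustrative)::
+
+        index = pysam.IndexedReads(samfile)
+        index.build()
+        for read in index.find("read_12345"):
+            pass
+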
+    """
+
+    def __init__(self, AlignmentFile samfile, positions, int multiple_iterators=True):
+
+        IteratorRow.__init__(self, samfile, multiple_iterators=multiple_iterators)
+
+        self.positions = positions
+        self.current_pos = 0
+
+    def __iter__(self):
+        return self
+
+    cdef bam1_t * getCurrent(self):
+        return self.b
+
+    cdef int cnext(self):
+        '''cversion of iterator'''
+        # end iteration if out of positions
+        if self.current_pos >= len(self.positions): return -1
+
+        cdef uint64_t pos = self.positions[self.current_pos]
+        with nogil:
+            bgzf_seek(hts_get_bgzfp(self.htsfile),
+                      pos,
+                      0)
+        self.current_pos += 1
+
+        cdef int ret
+        with nogil:
+            ret = sam_read1(self.htsfile,
+                            self.samfile.header,
+                            self.b)
+        return ret
+
+    def __next__(self):
+        cdef int ret = self.cnext()
+        if (ret >= 0):
+            return makeAlignedSegment(self.b, self.samfile)
+        elif (ret == -2):
+            raise IOError('truncated file')
+        else:
+            raise StopIteration
+
+
+cdef int __advance_nofilter(void *data, bam1_t *b):
+    '''advance without any read filtering.
+    '''
+    cdef __iterdata * d
+    d = <__iterdata*>data
+    cdef int ret
+    with nogil:
+        ret = sam_itr_next(d.htsfile, d.iter, b)
+    return ret
+
+
+cdef int __advance_all(void *data, bam1_t *b):
+    '''only use reads for pileup passing basic
+    filters:
+
+    BAM_FUNMAP, BAM_FSECONDARY, BAM_FQCFAIL, BAM_FDUP
+    '''
+
+    cdef __iterdata * d
+    cdef mask = BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP
+    d = <__iterdata*>data
+    cdef int ret
+    with nogil:
+        ret = sam_itr_next(d.htsfile, d.iter, b)
+    while ret >= 0 and b.core.flag & mask:
+        with nogil:
+            ret = sam_itr_next(d.htsfile, d.iter, b)
+    return ret
+
+
+cdef int __advance_snpcalls(void * data, bam1_t * b):
+    '''advance using the same filter and read processing as in
+    the samtools pileup.
+    '''
+
+    # Note that this method requires access to some
+    # functions in the samtools code base and is thus
+    # not htslib only.
+    # The functions accessed in samtools are:
+    # 1. bam_prob_realn
+    # 2. bam_cap_mapQ
+    cdef __iterdata * d
+    d = <__iterdata*>data
+
+    cdef int ret
+    cdef int skip = 0
+    cdef int q
+    cdef int is_cns = 1
+    cdef int is_nobaq = 0
+    cdef int capQ_thres = 0
+
+    with nogil:
+        ret = sam_itr_next(d.htsfile, d.iter, b)
+
+    # reload sequence
+    if d.fastafile != NULL and b.core.tid != d.tid:
+        if d.seq != NULL:
+            free(d.seq)
+        d.tid = b.core.tid
+        with nogil:
+            d.seq = faidx_fetch_seq(
+                d.fastafile,
+                d.header.target_name[d.tid],
+                0, MAX_POS,
+                &d.seq_len)
+
+        if d.seq == NULL:
+            raise ValueError(
+                "reference sequence for '%s' (tid=%i) not found" % \
+                (d.header.target_name[d.tid],
+                 d.tid))
+
+    while ret >= 0:
+        skip = 0
+
+        # realign read - changes base qualities
+        if d.seq != NULL and is_cns and not is_nobaq:
+            bam_prob_realn(b, d.seq)
+
+        if d.seq != NULL and capQ_thres > 10:
+            q = bam_cap_mapQ(b, d.seq, capQ_thres)
+            if q < 0:
+                skip = 1
+            elif b.core.qual > q:
+                b.core.qual = q
+        if b.core.flag & BAM_FUNMAP:
+            skip = 1
+        elif b.core.flag & 1 and not b.core.flag & 2:
+            skip = 1
+
+        if not skip:
+            break
+        # additional filters
+
+        with nogil:
+            ret = sam_itr_next(d.htsfile, d.iter, b)
+
+    return ret
+
+cdef class IteratorColumn:
+    '''abstract base class for iterators over columns.
+
+    IteratorColumn objects wrap the pileup functionality of samtools.
+
+    For reasons of efficiency, the iterator points to the current
+    pileup buffer. The pileup buffer is updated at every iteration.
+    This might cause some unexpected behaviour.
For example, + consider the conversion to a list:: + + f = AlignmentFile("file.bam", "rb") + result = list( f.pileup() ) + + Here, ``result`` will contain ``n`` objects of type + :class:`~pysam.PileupColumn` for ``n`` columns, but each object in + ``result`` will contain the same information. + + The desired behaviour can be achieved by list comprehension:: + + result = [ x.pileups() for x in f.pileup() ] + + ``result`` will be a list of ``n`` lists of objects of type + :class:`~pysam.PileupRead`. + + If the iterator is associated with a :class:`~pysam.Fastafile` using the + :meth:`addReference` method, then the iterator will export the + current sequence via the methods :meth:`getSequence` and + :meth:`seq_len`. + + Optional kwargs to the iterator: + + stepper + The stepper controls how the iterator advances. + + Valid values are None, "all" (default), "nofilter" or "samtools". + + See AlignmentFile.pileup for description. + + fastafile + A :class:`~pysam.FastaFile` object + + max_depth + maximum read depth. The default is 8000. + + ''' + + def __cinit__( self, AlignmentFile samfile, **kwargs ): + self.samfile = samfile + self.fastafile = kwargs.get("fastafile", None) + self.stepper = kwargs.get("stepper", None) + self.max_depth = kwargs.get("max_depth", 8000) + self.iterdata.seq = NULL + self.tid = 0 + self.pos = 0 + self.n_plp = 0 + self.plp = NULL + self.pileup_iter = NULL + + def __iter__(self): + return self + + cdef int cnext(self): + '''perform next iteration. + ''' + # do not release gil here because of call-backs + self.plp = bam_plp_auto(self.pileup_iter, + &self.tid, + &self.pos, + &self.n_plp) + + cdef char * getSequence(self): + '''return current reference sequence underlying the iterator. + ''' + return self.iterdata.seq + + property seq_len: + '''current sequence length.''' + def __get__(self): + return self.iterdata.seq_len + + def addReference(self, Fastafile fastafile): + ''' + add reference sequences in `fastafile` to iterator.''' + self.fastafile = fastafile + if self.iterdata.seq != NULL: + free(self.iterdata.seq) + self.iterdata.tid = -1 + self.iterdata.fastafile = self.fastafile.fastafile + + def hasReference(self): + ''' + return true if iterator is associated with a reference''' + return self.fastafile + + cdef setMask(self, mask): + '''set masking flag in iterator. + + reads with bits set in `mask` will be skipped. 
+ ''' + raise NotImplementedError() + # self.mask = mask + # bam_plp_set_mask( self.pileup_iter, self.mask ) + + cdef setupIteratorData( self, + int tid, + int start, + int end, + int multiple_iterators=0 ): + '''setup the iterator structure''' + + self.iter = IteratorRowRegion(self.samfile, tid, start, end, multiple_iterators) + self.iterdata.htsfile = self.samfile.htsfile + self.iterdata.iter = self.iter.iter + self.iterdata.seq = NULL + self.iterdata.tid = -1 + self.iterdata.header = self.samfile.header + + if self.fastafile is not None: + self.iterdata.fastafile = self.fastafile.fastafile + else: + self.iterdata.fastafile = NULL + + # Free any previously allocated memory before reassigning + # pileup_iter + self._free_pileup_iter() + + if self.stepper is None or self.stepper == "all": + with nogil: + self.pileup_iter = bam_plp_init( + &__advance_all, + &self.iterdata) + elif self.stepper == "nofilter": + with nogil: + self.pileup_iter = bam_plp_init( + &__advance_nofilter, + &self.iterdata) + elif self.stepper == "samtools": + with nogil: + self.pileup_iter = bam_plp_init( + &__advance_snpcalls, + &self.iterdata) + else: + raise ValueError( + "unknown stepper option `%s` in IteratorColumn" % self.stepper) + + if self.max_depth: + with nogil: + bam_plp_set_maxcnt(self.pileup_iter, self.max_depth) + + # bam_plp_set_mask( self.pileup_iter, self.mask ) + + cdef reset( self, tid, start, end ): + '''reset iterator position. + + This permits using the iterator multiple times without + having to incur the full set-up costs. + ''' + self.iter = IteratorRowRegion( self.samfile, tid, start, end, multiple_iterators = 0 ) + self.iterdata.iter = self.iter.iter + + # invalidate sequence if different tid + if self.tid != tid: + if self.iterdata.seq != NULL: + free(self.iterdata.seq) + self.iterdata.seq = NULL + self.iterdata.tid = -1 + + # self.pileup_iter = bam_plp_init( &__advancepileup, &self.iterdata ) + with nogil: + bam_plp_reset(self.pileup_iter) + + cdef _free_pileup_iter(self): + '''free the memory alloc'd by bam_plp_init. + + This is needed before setupIteratorData allocates + another pileup_iter, or else memory will be lost. + ''' + if self.pileup_iter != NULL: + with nogil: + bam_plp_reset(self.pileup_iter) + bam_plp_destroy(self.pileup_iter) + self.pileup_iter = NULL + + def __dealloc__(self): + # reset in order to avoid memory leak messages for iterators + # that have not been fully consumed + self._free_pileup_iter() + self.plp = NULL + + if self.iterdata.seq != NULL: + free(self.iterdata.seq) + self.iterdata.seq = NULL + + +cdef class IteratorColumnRegion(IteratorColumn): + '''iterates over a region only. 
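+
+    Typically obtained from :meth:`AlignmentFile.pileup` when explicit
+    coordinates are given. With ``truncate=True`` only columns inside
+    the requested interval are returned (coordinates illustrative)::
+
+        for column in samfile.pileup("chr1", 100, 200, truncate=True):
+            assert 100 <= column.pos < 200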
+    '''
+    def __cinit__(self, AlignmentFile samfile,
+                  int tid = 0,
+                  int start = 0,
+                  int end = MAX_POS,
+                  int truncate = False,
+                  **kwargs ):
+
+        # initialize iterator
+        self.setupIteratorData(tid, start, end, 1)
+        self.start = start
+        self.end = end
+        self.truncate = truncate
+
+    def __next__(self):
+
+        while 1:
+            self.cnext()
+            if self.n_plp < 0:
+                raise ValueError("error during iteration")
+
+            if self.plp == NULL:
+                raise StopIteration
+
+            if self.truncate:
+                if self.start > self.pos: continue
+                if self.pos >= self.end: raise StopIteration
+
+            return makePileupColumn(&self.plp,
+                                    self.tid,
+                                    self.pos,
+                                    self.n_plp,
+                                    self.samfile)
+
+
+cdef class IteratorColumnAllRefs(IteratorColumn):
+    """iterates over all columns by chaining iterators over each reference
+    """
+
+    def __cinit__(self,
+                  AlignmentFile samfile,
+                  **kwargs):
+
+        # no iteration over empty files
+        if not samfile.nreferences:
+            raise StopIteration
+
+        # initialize iterator
+        self.setupIteratorData(self.tid, 0, MAX_POS, 1)
+
+    def __next__(self):
+
+        while 1:
+            self.cnext()
+
+            if self.n_plp < 0:
+                raise ValueError("error during iteration")
+
+            # return result, if within same reference
+            if self.plp != NULL:
+                return makePileupColumn(&self.plp,
+                                        self.tid,
+                                        self.pos,
+                                        self.n_plp,
+                                        self.samfile)
+
+            # otherwise, proceed to next reference or stop
+            self.tid += 1
+            if self.tid < self.samfile.nreferences:
+                self.setupIteratorData(self.tid, 0, MAX_POS, 0)
+            else:
+                raise StopIteration
+
+
+cdef class SNPCall:
+    '''the results of a SNP call.'''
+    cdef int _tid
+    cdef int _pos
+    cdef char _reference_base
+    cdef char _genotype
+    cdef int _consensus_quality
+    cdef int _snp_quality
+    cdef int _rms_mapping_quality
+    cdef int _coverage
+
+    property tid:
+        '''the chromosome ID as is defined in the header'''
+        def __get__(self):
+            return self._tid
+
+    property pos:
+        '''nucleotide position of SNP.'''
+        def __get__(self): return self._pos
+
+    property reference_base:
+        '''reference base at pos. ``N`` if no reference sequence supplied.'''
+        def __get__(self): return from_string_and_size( &self._reference_base, 1 )
+
+    property genotype:
+        '''the genotype called.'''
+        def __get__(self): return from_string_and_size( &self._genotype, 1 )
+
+    property consensus_quality:
+        '''the genotype quality (Phred-scaled).'''
+        def __get__(self): return self._consensus_quality
+
+    property snp_quality:
+        '''the snp quality (Phred scaled) - probability of consensus being
+        identical to reference sequence.'''
+        def __get__(self): return self._snp_quality
+
+    property mapping_quality:
+        '''the root mean square (rms) of the mapping quality of all reads
+        involved in the call.'''
+        def __get__(self): return self._rms_mapping_quality
+
+    property coverage:
+        '''coverage or read depth - the number of reads involved in the call.'''
+        def __get__(self): return self._coverage
+
+    def __str__(self):
+
+        return "\t".join( map(str, (
+            self.tid,
+            self.pos,
+            self.reference_base,
+            self.genotype,
+            self.consensus_quality,
+            self.snp_quality,
+            self.mapping_quality,
+            self.coverage ) ) )
+
+
+cdef class IndexedReads:
+    """*(AlignmentFile samfile, multiple_iterators=True)
+
+    Index a Sam/BAM-file by query name while keeping the
+    original sort order intact.
+
+    The index is kept in memory and can be substantial.
+
+    By default, the file is re-opened to avoid conflicts if multiple
+    operators work on the same file. Set `multiple_iterators` = False
+    to not re-open `samfile`.
+
+    Parameters
+    ----------
+
+    samfile : AlignmentFile
+        File to be indexed.
+
+    multiple_iterators : bool
+        Flag indicating whether the file should be reopened. Reopening prevents
+        existing iterators being affected by the indexing.
+
+    """
+
+    def __init__(self, AlignmentFile samfile, int multiple_iterators=True):
+        cdef char *cfilename
+
+        # makes sure that samfile stays alive as long as this
+        # object is alive.
+        self.samfile = samfile
+
+        assert samfile.is_bam, "can only apply IndexedReads to BAM files"
+
+        # re-open the file when multiple_iterators is set - note that
+        # this makes the iterator slow and causes pileup to slow down
+        # significantly.
+        if multiple_iterators:
+            cfilename = samfile.filename
+            with nogil:
+                self.htsfile = hts_open(cfilename, 'r')
+            assert self.htsfile != NULL
+            # read header - required for accurate positioning
+            with nogil:
+                self.header = sam_hdr_read(self.htsfile)
+            self.owns_samfile = True
+        else:
+            self.htsfile = self.samfile.htsfile
+            self.header = self.samfile.header
+            self.owns_samfile = False
+
+    def build(self):
+        '''build the index.'''
+
+        self.index = collections.defaultdict(list)
+
+        # this method will start indexing from the current file position
+        cdef int ret = 1
+        cdef bam1_t * b = <bam1_t*>calloc(1, sizeof(bam1_t))
+
+        cdef uint64_t pos
+
+        while ret > 0:
+            with nogil:
+                pos = bgzf_tell(hts_get_bgzfp(self.htsfile))
+                ret = sam_read1(self.htsfile,
+                                self.samfile.header,
+                                b)
+            if ret > 0:
+                qname = charptr_to_str(pysam_bam_get_qname(b))
+                self.index[qname].append(pos)
+
+        bam_destroy1(b)
+
+    def find(self, query_name):
+        '''find `query_name` in index.
+
+        Returns
+        -------
+
+        IteratorRowSelection
+            Returns an iterator over all reads with query_name.
+
+        Raises
+        ------
+
+        KeyError
+            if the `query_name` is not in the index.
+
+        '''
+        if query_name in self.index:
+            return IteratorRowSelection(
+                self.samfile,
+                self.index[query_name],
+                multiple_iterators = False)
+        else:
+            raise KeyError("read %s not found" % query_name)
+
+    def __dealloc__(self):
+        if self.owns_samfile:
+            hts_close(self.htsfile)
+            bam_hdr_destroy(self.header)
+
+__all__ = [
+    "AlignmentFile",
+    "IteratorRow",
+    "IteratorColumn",
+    "IndexedReads"]
diff --git a/pysam/libcbcf.pxd b/pysam/libcbcf.pxd
new file mode 100644
index 0000000..fc7f56c
--- /dev/null
+++ b/pysam/libcbcf.pxd
@@ -0,0 +1,144 @@
+###############################################################################
+###############################################################################
+## Cython wrapper for htslib VCF/BCF reader/writer
+###############################################################################
+#
+# The MIT License
+#
+# Copyright (c) 2015, 2016 Kevin Jacobs (jacobs@bioinformed.com)
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. +# +############################################################################### + +from libc.stdint cimport int8_t, int16_t, int32_t, int64_t +from libc.stdint cimport uint8_t, uint16_t, uint32_t, uint64_t +from libc.stdlib cimport malloc, calloc, realloc, free +from libc.string cimport memcpy, memcmp, memmove, strncpy, strlen, strdup + +from pysam.libchtslib cimport * + + +cdef class VariantHeader(object): + cdef bcf_hdr_t *ptr + + cpdef VariantRecord new_record(self) + cdef _subset_samples(self, include_samples) + + +cdef class VariantHeaderRecord(object): + cdef VariantHeader header + cdef bcf_hrec_t *ptr + + +cdef class VariantHeaderRecords(object): + cdef VariantHeader header + + +cdef class VariantHeaderContigs(object): + cdef VariantHeader header + + +cdef class VariantHeaderSamples(object): + cdef VariantHeader header + + +cdef class VariantContig(object): + cdef VariantHeader header + cdef int id + + +cdef class VariantMetadata(object): + cdef VariantHeader header + cdef int type + cdef int id + + +cdef class VariantHeaderMetadata(object): + cdef VariantHeader header + cdef int32_t type + + +cdef class VariantRecord(object): + cdef VariantHeader header + cdef bcf1_t *ptr + + +cdef class VariantRecordFilter(object): + cdef VariantRecord record + + +cdef class VariantRecordFormat(object): + cdef VariantRecord record + + +cdef class VariantRecordInfo(object): + cdef VariantRecord record + + +cdef class VariantRecordSamples(object): + cdef VariantRecord record + + +cdef class VariantRecordSample(object): + cdef VariantRecord record + cdef readonly int32_t index + + +cdef class BaseIndex(object): + cdef tuple refs + cdef dict refmap + + +cdef class BCFIndex(BaseIndex): + cdef VariantHeader header + cdef hts_idx_t *ptr + + +cdef class TabixIndex(BaseIndex): + cdef tbx_t *ptr + + +cdef class BaseIterator(object): + cdef VariantFile bcf + cdef hts_itr_t *iter + + +cdef class BCFIterator(BaseIterator): + cdef BCFIndex index + + +cdef class TabixIterator(BaseIterator): + cdef TabixIndex index + cdef kstring_t line_buffer + + +cdef class VariantFile(HTSFile): + cdef readonly VariantHeader header + cdef readonly BaseIndex index + + cdef readonly bint drop_samples # true if sample information is to be ignored + + # FIXME: Temporary, use htsFormat when it is available + cdef readonly bint is_reading # true if file has begun reading records + cdef readonly bint header_written # true if header has already been written + + cpdef VariantRecord new_record(self) + + cpdef int write(self, VariantRecord record) except -1 diff --git a/pysam/libcbcf.pyx b/pysam/libcbcf.pyx new file mode 100644 index 0000000..8f40451 --- /dev/null +++ b/pysam/libcbcf.pyx @@ -0,0 +1,3813 @@ +# cython: embedsignature=True +# cython: profile=True +############################################################################### +############################################################################### +## Cython wrapper for htslib VCF/BCF reader/writer +############################################################################### +# +# NOTICE: This code is incomplete and preliminary. It offers a nearly +# complete Pythonic interface to VCF/BCF metadata and data with +# reading and writing capability. 
Documentation and a unit test suite +# are in the works. The code is best tested under Python 2, but +# should also work with Python 3. Please report any remaining +# str/bytes issues on the github site when using Python 3 and I'll +# fix them promptly. +# +# Here is a minimal example of how to use the API: +# +# $ cat bcfview.py +# import sys +# from pysam import VariantFile +# +# bcf_in = VariantFile(sys.argv[1]) # auto-detect input format +# bcf_out = VariantFile('-', 'w', header=bcf_in.header) +# +# for rec in bcf_in: +# bcf_out.write(rec) +# +# Performance is fairly close to that of bcftools view. Here is an example +# using some 1k Genomes data: +# +# $ time python bcfview.py ALL.chr22.phase3_shapeit2_mvncall_integrated_v5.20130502.genotypes.bcf |wc -l +# 1103799 +# +# real 0m56.114s +# user 1m4.489s +# sys 0m3.102s +# +# $ time bcftools view ALL.chr22.phase3_shapeit2_mvncall_integrated_v5.20130502.genotypes.bcf |wc -l +# 1103800 # bcftools adds an extra header +# +# real 0m55.126s +# user 1m3.502s +# sys 0m3.459s +# +############################################################################### +# +# TODO list: +# +# * more genotype methods +# * unit test suite (perhaps py.test based) +# * documentation +# * pickle support +# * left/right locus normalization +# * fix reopen to re-use fd +# +############################################################################### +# +# The MIT License +# +# Copyright (c) 2015,2016 Kevin Jacobs (jacobs@bioinformed.com) +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. 
+# +############################################################################### + +from __future__ import division, print_function + +import os +import sys + +from libc.errno cimport errno, EPIPE +from libc.string cimport strcmp, strpbrk, strerror +from libc.stdint cimport INT8_MAX, INT16_MAX, INT32_MAX + +cimport cython + +from cpython.object cimport PyObject +from cpython.ref cimport Py_INCREF +from cpython.dict cimport PyDict_GetItemString, PyDict_SetItemString +from cpython.tuple cimport PyTuple_New, PyTuple_SET_ITEM +from cpython.bytes cimport PyBytes_FromStringAndSize +from cpython.unicode cimport PyUnicode_DecodeASCII +from cpython.version cimport PY_MAJOR_VERSION + +from pysam.libchtslib cimport HTSFile, hisremote + + +from warnings import warn + + +__all__ = ['VariantFile', + 'VariantHeader', + 'VariantHeaderRecord', + 'VariantRecord'] + +######################################################################## +######################################################################## +## Constants +######################################################################## + +cdef int MAX_POS = 2 << 29 +cdef tuple VALUE_TYPES = ('Flag', 'Integer', 'Float', 'String') +cdef tuple METADATA_TYPES = ('FILTER', 'INFO', 'FORMAT', 'CONTIG', 'STRUCTURED', 'GENERIC') +cdef tuple METADATA_LENGTHS = ('FIXED', 'VARIABLE', 'A', 'G', 'R') + + +######################################################################## +######################################################################## +## Python 3 compatibility functions +######################################################################## + +from pysam.libcutils cimport force_bytes, force_str, charptr_to_str, charptr_to_str_w_len +from pysam.libcutils cimport encode_filename, from_string_and_size + + +######################################################################## +######################################################################## +## VCF/BCF string intern system +######################################################################## + +cdef dict bcf_str_cache = {} + +cdef inline bcf_str_cache_get_charptr(const char* s): + if s == NULL: + return None + + cdef PyObject *pystr = PyDict_GetItemString(bcf_str_cache, s) + if pystr: + return pystr + + if PY_MAJOR_VERSION < 3: + val = s + else: + val = PyUnicode_DecodeASCII(s, strlen(s), NULL) + + PyDict_SetItemString(bcf_str_cache, s, val) + + return val + + +######################################################################## +######################################################################## +## Low level type conversion helpers +######################################################################## + + +cdef inline bint check_header_id(bcf_hdr_t *hdr, int hl_type, int id): + return id >= 0 and id < hdr.n[BCF_DT_ID] and bcf_hdr_idinfo_exists(hdr, hl_type, id) + + +cdef inline int is_gt_fmt(bcf_hdr_t *hdr, int fmt_id): + return strcmp(bcf_hdr_int2id(hdr, BCF_DT_ID, fmt_id), "GT") == 0 + + +cdef tuple char_array_to_tuple(const char **a, ssize_t n, int free_after=0): + if not a: + return None + try: + return tuple(charptr_to_str(a[i]) for i in range(n)) + finally: + if free_after and a: + free(a) + + +cdef bcf_array_to_object(void *data, int type, ssize_t n, ssize_t count, int scalar): + cdef char *datac + cdef int8_t *data8 + cdef int16_t *data16 + cdef int32_t *data32 + cdef float *dataf + cdef int i + + if not data or n <= 0: + return None + + if type == BCF_BT_CHAR: + datac = data + while n and datac[n-1] == bcf_str_vector_end: + n -= 1 + value = 
charptr_to_str_w_len(datac, n) if datac[0] != bcf_str_missing else None + # FIXME: Need to know length? Report errors? Pad with missing values? Not clear what to do. + + value = tuple(v or None for v in value.split(',')) if value else () + # FIXME: Need to know length? Report errors? Pad with missing values? Not clear what to do. + else: + value = [] + if type == BCF_BT_INT8: + data8 = data + for i in range(n): + if data8[i] == bcf_int8_vector_end: + break + value.append(data8[i] if data8[i] != bcf_int8_missing else None) + elif type == BCF_BT_INT16: + data16 = data + for i in range(n): + if data16[i] == bcf_int16_vector_end: + break + value.append(data16[i] if data16[i] != bcf_int16_missing else None) + elif type == BCF_BT_INT32: + data32 = data + for i in range(n): + if data32[i] == bcf_int32_vector_end: + break + value.append(data32[i] if data32[i] != bcf_int32_missing else None) + elif type == BCF_BT_FLOAT: + dataf = data + for i in range(n): + if bcf_float_is_vector_end(dataf[i]): + break + value.append(dataf[i] if not bcf_float_is_missing(dataf[i]) else None) + else: + raise TypeError('unsupported info type code') + + # FIXME: Need to know length? Report errors? Pad with missing values? Not clear what to do. + if not value: + if scalar: + value = None + elif count <= 0: + value = () + else: + value = (None,)*count + elif scalar and len(value) == 1: + value = value[0] + else: + value = tuple(value) + + return value + + +cdef bcf_object_to_array(values, void *data, int bt_type, ssize_t n, int vlen): + cdef char *datac + cdef int8_t *data8 + cdef int16_t *data16 + cdef int32_t *data32 + cdef float *dataf + cdef ssize_t i, value_count = len(values) + + assert(value_count <= n) + + if bt_type == BCF_BT_CHAR: + if not isinstance(values, (str, bytes)): + values = b','.join(force_bytes(v) if v is not None else b'' for v in values) + value_count = len(values) + assert(value_count <= n) + datac = data + memcpy(datac, values, value_count) + for i in range(value_count, n): + datac[i] = 0 + elif bt_type == BCF_BT_INT8: + datai8 = data + for i in range(value_count): + val = values[i] + datai8[i] = val if val is not None else bcf_int8_missing + for i in range(value_count, n): + datai8[i] = bcf_int8_vector_end + elif bt_type == BCF_BT_INT16: + datai16 = data + for i in range(value_count): + val = values[i] + datai16[i] = val if val is not None else bcf_int16_missing + for i in range(value_count, n): + datai16[i] = bcf_int16_vector_end + elif bt_type == BCF_BT_INT32: + datai32 = data + for i in range(value_count): + val = values[i] + datai32[i] = val if val is not None else bcf_int32_missing + for i in range(value_count, n): + datai32[i] = bcf_int32_vector_end + elif bt_type == BCF_BT_FLOAT: + dataf = data + for i in range(value_count): + val = values[i] + if val is None: + bcf_float_set(dataf + i, bcf_float_missing) + else: + dataf[i] = val + for i in range(value_count, n): + bcf_float_set(dataf + i, bcf_float_vector_end) + else: + raise TypeError('unsupported type') + + +cdef bcf_empty_array(int type, ssize_t n, int vlen): + cdef char *datac + cdef int32_t *data32 + cdef float *dataf + cdef int i + + if n <= 0: + raise ValueError('Cannot create empty array') + + if type == BCF_HT_STR: + value = PyBytes_FromStringAndSize(NULL, sizeof(char)*n) + datac = value + for i in range(n): + datac[i] = bcf_str_missing if not vlen else bcf_str_vector_end + elif type == BCF_HT_INT: + value = PyBytes_FromStringAndSize(NULL, sizeof(int32_t)*n) + data32 = value + for i in range(n): + data32[i] = bcf_int32_missing 
if not vlen else bcf_int32_vector_end
+    elif type == BCF_HT_REAL:
+        value = PyBytes_FromStringAndSize(NULL, sizeof(float)*n)
+        dataf = <float *>value
+        for i in range(n):
+            bcf_float_set(dataf + i, bcf_float_missing if not vlen else bcf_float_vector_end)
+    else:
+        raise TypeError('unsupported header type code')
+
+    return value
+
+
+cdef bcf_copy_expand_array(void *src_data, int src_type, ssize_t src_values,
+                           void *dst_data, int dst_type, ssize_t dst_values,
+                           int vlen):
+    cdef char *src_datac
+    cdef char *dst_datac
+    cdef int8_t *src_datai8
+    cdef int16_t *src_datai16
+    cdef int32_t *src_datai32
+    cdef int32_t *dst_datai
+    cdef float *src_dataf
+    cdef float *dst_dataf
+    cdef ssize_t src_size, dst_size, i, j
+    cdef int val
+
+    if src_values > dst_values:
+        raise ValueError('Cannot copy arrays with src_values={} > dst_values={}'.format(src_values, dst_values))
+
+    if src_type == dst_type == BCF_BT_CHAR:
+        src_datac = <char *>src_data
+        dst_datac = <char *>dst_data
+        # copy from source into destination (dst is the first memcpy argument)
+        memcpy(dst_datac, src_datac, src_values)
+        for i in range(src_values, dst_values):
+            dst_datac[i] = 0
+    elif src_type == BCF_BT_INT8 and dst_type == BCF_BT_INT32:
+        src_datai8 = <int8_t *>src_data
+        dst_datai = <int32_t *>dst_data
+        for i in range(src_values):
+            val = src_datai8[i]
+            if val == bcf_int8_missing:
+                val = bcf_int32_missing
+            elif val == bcf_int8_vector_end:
+                val = bcf_int32_vector_end
+            dst_datai[i] = val
+        for i in range(src_values, dst_values):
+            dst_datai[i] = bcf_int32_missing if not vlen else bcf_int32_vector_end
+    elif src_type == BCF_BT_INT16 and dst_type == BCF_BT_INT32:
+        src_datai16 = <int16_t *>src_data
+        dst_datai = <int32_t *>dst_data
+        for i in range(src_values):
+            val = src_datai16[i]
+            if val == bcf_int16_missing:
+                val = bcf_int32_missing
+            elif val == bcf_int16_vector_end:
+                val = bcf_int32_vector_end
+            dst_datai[i] = val
+        for i in range(src_values, dst_values):
+            dst_datai[i] = bcf_int32_missing if not vlen else bcf_int32_vector_end
+    elif src_type == BCF_BT_INT32 and dst_type == BCF_BT_INT32:
+        src_datai32 = <int32_t *>src_data
+        dst_datai = <int32_t *>dst_data
+        for i in range(src_values):
+            dst_datai[i] = src_datai32[i]
+        for i in range(src_values, dst_values):
+            dst_datai[i] = bcf_int32_missing if not vlen else bcf_int32_vector_end
+    elif src_type == BCF_BT_FLOAT and dst_type == BCF_BT_FLOAT:
+        src_dataf = <float *>src_data
+        dst_dataf = <float *>dst_data
+        for i in range(src_values):
+            dst_dataf[i] = src_dataf[i]
+        for i in range(src_values, dst_values):
+            bcf_float_set(dst_dataf + i, bcf_float_missing if not vlen else bcf_float_vector_end)
+    else:
+        raise TypeError('unsupported types')
+
+
+cdef bcf_get_value_count(VariantRecord record, int hl_type, int id, ssize_t *count, int *scalar):
+    if record is None:
+        raise ValueError('record must not be None')
+
+    cdef bcf_hdr_t *hdr = record.header.ptr
+    cdef bcf1_t *r = record.ptr
+
+    if not check_header_id(hdr, hl_type, id):
+        raise ValueError('Invalid header')
+
+    cdef int length = bcf_hdr_id2length(hdr, hl_type, id)
+    cdef int number = bcf_hdr_id2number(hdr, hl_type, id)
+
+    scalar[0] = 0
+
+    if hl_type == BCF_HL_FMT and is_gt_fmt(hdr, id):
+        count[0] = number
+    elif length == BCF_VL_FIXED:
+        if number == 1:
+            scalar[0] = 1
+        count[0] = number
+    elif length == BCF_VL_R:
+        count[0] = r.n_allele
+    elif length == BCF_VL_A:
+        count[0] = r.n_allele - 1
+    elif length == BCF_VL_G:
+        count[0] = r.n_allele * (r.n_allele + 1) // 2
+    elif length == BCF_VL_VAR:
+        count[0] = -1
+    else:
+        raise ValueError('Unknown format length')
+
+
+cdef object bcf_info_get_value(VariantRecord record, const bcf_info_t *z):
+    if record is None:
+        raise ValueError('record must
not be None') + + cdef bcf_hdr_t *hdr = record.header.ptr + + cdef char *s + cdef ssize_t count + cdef int scalar + + bcf_get_value_count(record, BCF_HL_INFO, z.key, &count, &scalar) + + if z.len == 0: + if bcf_hdr_id2type(hdr, BCF_HL_INFO, z.key) == BCF_HT_FLAG: + value = True + elif scalar: + value = None + else: + value = () + elif z.len == 1: + if z.type == BCF_BT_INT8: + value = z.v1.i if z.v1.i != bcf_int8_missing else None + elif z.type == BCF_BT_INT16: + value = z.v1.i if z.v1.i != bcf_int16_missing else None + elif z.type == BCF_BT_INT32: + value = z.v1.i if z.v1.i != bcf_int32_missing else None + elif z.type == BCF_BT_FLOAT: + value = z.v1.f if not bcf_float_is_missing(z.v1.f) else None + elif z.type == BCF_BT_CHAR: + value = force_str(chr(z.v1.i)) + else: + raise TypeError('unsupported info type code') + + if not scalar and value != (): + value = (value,) + else: + value = bcf_array_to_object(z.vptr, z.type, z.len, count, scalar) + + return value + + +cdef object bcf_check_values(VariantRecord record, value, int hl_type, int ht_type, + int id, int bt_type, ssize_t bt_len, + ssize_t *value_count, int *scalar, int *realloc): + + if record is None: + raise ValueError('record must not be None') + + bcf_get_value_count(record, hl_type, id, value_count, scalar) + + # Validate values now that we know the type and size + values = (value,) if not isinstance(value, (list, tuple)) else value + + # Validate values now that we know the type and size + if ht_type == BCF_HT_FLAG: + value_count[0] = 1 + elif hl_type == BCF_HL_FMT and is_gt_fmt(record.header.ptr, id): + # KBJ: htslib lies about the cardinality of GT fields-- they're really VLEN (-1) + value_count[0] = -1 + + if value_count[0] != -1 and value_count[0] != len(values): + if scalar[0]: + raise TypeError('value expected to be scalar'.format(value_count[0])) + else: + raise TypeError('values expected to be {:d}-tuple'.format(value_count[0])) + + if ht_type == BCF_HT_REAL: + for v in values: + if not(v is None or isinstance(v, (float, int))): + raise TypeError('invalid value for Float format') + elif ht_type == BCF_HT_INT: + for v in values: + if not(v is None or (isinstance(v, (float, int)) and int(v) == v)): + raise TypeError('invalid value for Integer format') + for v in values: + if not(v is None or bcf_int32_missing < v <= INT32_MAX): + raise ValueError('Integer value too small/large to store in VCF/BCF') + elif ht_type == BCF_HT_STR: + values = b','.join(force_bytes(v) if v is not None else b'' for v in values) + elif ht_type == BCF_HT_FLAG: + if values[0] not in (True, False, None, 1, 0): + raise ValueError('Flag values must be: True, False, None, 1, 0') + else: + raise TypeError('unsupported type') + + realloc[0] = 0 + if len(values) <= 1 and hl_type == BCF_HL_INFO: + realloc[0] = 0 + elif len(values) > bt_len: + realloc[0] = 1 + elif bt_type == BCF_BT_INT8: + for v in values: + if v is not None and not(bcf_int8_missing < v <= INT8_MAX): + realloc[0] = 1 + break + elif bt_type == BCF_BT_INT16: + for v in values: + if v is not None and not(bcf_int16_missing < v <= INT16_MAX): + realloc[0] = 1 + break + + return values + + +cdef bcf_encode_alleles(VariantRecord record, values): + if record is None: + raise ValueError('record must not be None') + + cdef bcf1_t *r = record.ptr + cdef int32_t nalleles = r.n_allele + cdef list gt_values = [] + cdef char *s + cdef int i + + if values is None: + return () + + if not isinstance(values, (list, tuple)): + values = (values,) + + for value in values: + if value is None: + 
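+            # a None allele is encoded as bcf_gt_missing, which htslib
+            # renders as '.' in VCF output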
gt_values.append(bcf_gt_missing)
+        elif isinstance(value, (str, bytes)):
+            bvalue = force_bytes(value)
+            s = bvalue
+            for i in range(r.n_allele):
+                if strcmp(r.d.allele[i], s) == 0:
+                    gt_values.append(bcf_gt_unphased(i))
+                    break
+            else:
+                raise ValueError('Unknown allele')
+        else:
+            i = value
+            if not (0 <= i < nalleles):
+                raise ValueError('Invalid allele index')
+            gt_values.append(bcf_gt_unphased(i))
+
+    return gt_values
+
+
+cdef bcf_info_set_value(VariantRecord record, key, value):
+    if record is None:
+        raise ValueError('record must not be None')
+
+    cdef bcf_hdr_t *hdr = record.header.ptr
+    cdef bcf1_t *r = record.ptr
+    cdef vdict_t *d
+    cdef khiter_t k
+    cdef int info_id, info_type, scalar, dst_type, realloc, vlen = 0
+    cdef ssize_t i, value_count, alloc_len, alloc_size, dst_size
+
+    if bcf_unpack(r, BCF_UN_INFO) < 0:
+        raise ValueError('Error unpacking VariantRecord')
+
+    bkey = force_bytes(key)
+    cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey)
+
+    if info:
+        info_id = info.key
+    else:
+        d = <vdict_t *>hdr.dict[BCF_DT_ID]
+        k = kh_get_vdict(d, bkey)
+
+        if k == kh_end(d) or kh_val_vdict(d, k).info[BCF_HL_INFO] & 0xF == 0xF:
+            raise KeyError('unknown INFO')
+
+        info_id = kh_val_vdict(d, k).id
+
+    if not check_header_id(hdr, BCF_HL_INFO, info_id):
+        raise ValueError('Invalid header')
+
+    info_type = bcf_hdr_id2type(hdr, BCF_HL_INFO, info_id)
+    values = bcf_check_values(record, value, BCF_HL_INFO, info_type, info_id,
+                              info.type if info else -1,
+                              info.len if info else -1,
+                              &value_count, &scalar, &realloc)
+
+    if info_type == BCF_HT_FLAG:
+        if bcf_update_info(hdr, r, bkey, NULL, bool(values[0]), info_type) < 0:
+            raise ValueError('Unable to update INFO values')
+        return
+
+    vlen = value_count < 0
+    value_count = len(values)
+
+    # If we can, write updated values to existing allocated storage
+    if info and not realloc:
+        r.d.shared_dirty |= BCF1_DIRTY_INF
+
+        if value_count == 0:
+            info.len = 0
+            # FIXME: Check if need to free vptr if info.len > 0?
+        elif value_count == 1:
+            # FIXME: Check if need to free vptr if info.len > 0?
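+            # The one-value case is stored inline in the bcf_info_t.v1
+            # union rather than in the vptr buffer, so it can be
+            # rewritten in place without reallocating.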
+ if info.type == BCF_BT_INT8 or info.type == BCF_BT_INT16 or info.type == BCF_BT_INT32: + bcf_object_to_array(values, &info.v1.i, BCF_BT_INT32, 1, vlen) + elif info.type == BCF_BT_FLOAT: + bcf_object_to_array(values, &info.v1.f, BCF_BT_FLOAT, 1, vlen) + else: + raise TypeError('unsupported info type code') + info.len = 1 + else: + bcf_object_to_array(values, info.vptr, info.type, info.len, vlen) + return + + alloc_len = max(1, value_count) + if info and info.len > alloc_len: + alloc_len = info.len + + new_values = bcf_empty_array(info_type, alloc_len, vlen) + cdef char *valp = new_values + + if info_type == BCF_HT_INT: + dst_type = BCF_BT_INT32 + elif info_type == BCF_HT_REAL: + dst_type = BCF_BT_FLOAT + elif info_type == BCF_HT_STR: + dst_type = BCF_BT_CHAR + else: + raise ValueError('Unsupported INFO type') + + bcf_object_to_array(values, valp, dst_type, alloc_len, vlen) + + if bcf_update_info(hdr, r, bkey, valp, alloc_len, info_type) < 0: + raise ValueError('Unable to update INFO values') + + +cdef bcf_info_del_value(VariantRecord record, key): + if record is None: + raise ValueError('record must not be None') + + cdef bcf_hdr_t *hdr = record.header.ptr + cdef bcf1_t *r = record.ptr + cdef ssize_t value_count + cdef int scalar + + if bcf_unpack(r, BCF_UN_INFO) < 0: + raise ValueError('Error unpacking VariantRecord') + + bkey = force_bytes(key) + cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey) + + if not info: + raise KeyError(key) + + bcf_get_value_count(record, BCF_HL_INFO, info.key, &value_count, &scalar) + + if value_count <= 0: + null_value = () + elif scalar: + null_value = None + else: + null_value = (None,)*value_count + + bcf_info_set_value(record, bkey, null_value) + + +cdef bcf_format_get_value(VariantRecordSample sample, key): + if sample is None: + raise ValueError('sample must not be None') + + cdef bcf_hdr_t *hdr = sample.record.header.ptr + cdef bcf1_t *r = sample.record.ptr + cdef ssize_t count + cdef int scalar + + if bcf_unpack(r, BCF_UN_ALL) < 0: + raise ValueError('Error unpacking VariantRecord') + + bkey = force_bytes(key) + cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey) + + if not fmt or not fmt.p: + raise KeyError('invalid FORMAT') + + if is_gt_fmt(hdr, fmt.id): + return bcf_format_get_allele_indices(sample) + + bcf_get_value_count(sample.record, BCF_HL_FMT, fmt.id, &count, &scalar) + + if fmt.p and fmt.n and fmt.size: + return bcf_array_to_object(fmt.p + sample.index * fmt.size, fmt.type, fmt.n, count, scalar) + elif scalar: + return None + elif count <= 0: + return () + else: + return (None,)*count + + +cdef bcf_format_set_value(VariantRecordSample sample, key, value): + if sample is None: + raise ValueError('sample must not be None') + + cdef bcf_hdr_t *hdr = sample.record.header.ptr + cdef bcf1_t *r = sample.record.ptr + cdef int fmt_id + cdef vdict_t *d + cdef khiter_t k + cdef int fmt_type, scalar, realloc, dst_type, vlen = 0 + cdef ssize_t i, n, value_count, alloc_size, alloc_len, dst_size + + if bcf_unpack(r, BCF_UN_ALL) < 0: + raise ValueError('Error unpacking VariantRecord') + + bkey = force_bytes(key) + cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey) + + if fmt: + fmt_id = fmt.id + else: + d = hdr.dict[BCF_DT_ID] + k = kh_get_vdict(d, bkey) + + if k == kh_end(d) or kh_val_vdict(d, k).info[BCF_HL_FMT] & 0xF == 0xF: + raise KeyError('unknown format') + + fmt_id = kh_val_vdict(d, k).id + + if not check_header_id(hdr, BCF_HL_FMT, fmt_id): + raise ValueError('Invalid header') + + fmt_type = bcf_hdr_id2type(hdr, BCF_HL_FMT, fmt_id) + + if fmt_type == 
BCF_HT_FLAG: + raise ValueError('Flag types are not allowed on FORMATs') + + if is_gt_fmt(hdr, fmt_id): + value = bcf_encode_alleles(sample.record, value) + # KBJ: GT field is considered to be a string by the VCF header but BCF represents it as INT. + fmt_type = BCF_HT_INT + + values = bcf_check_values(sample.record, value, BCF_HL_FMT, fmt_type, fmt_id, + fmt.type if fmt else -1, + fmt.n if fmt else -1, + &value_count, &scalar, &realloc) + + vlen = value_count < 0 + value_count = len(values) + + # If we can, write updated values to existing allocated storage + if fmt and not realloc: + r.d.indiv_dirty = 1 + bcf_object_to_array(values, fmt.p + sample.index * fmt.size, fmt.type, fmt.n, vlen) + return + + alloc_len = max(1, value_count) + if fmt and fmt.n > alloc_len: + alloc_len = fmt.n + + n = bcf_hdr_nsamples(hdr) + new_values = bcf_empty_array(fmt_type, n*alloc_len, vlen) + cdef char *valp = new_values + + if fmt_type == BCF_HT_INT: + dst_type = BCF_BT_INT32 + dst_size = sizeof(int32_t) * alloc_len + elif fmt_type == BCF_HT_REAL: + dst_type = BCF_BT_FLOAT + dst_size = sizeof(float) * alloc_len + elif fmt_type == BCF_HT_STR: + dst_type = BCF_BT_CHAR + dst_size = sizeof(char) * alloc_len + else: + raise ValueError('Unsupported FORMAT type') + + if fmt and n > 1: + for i in range(n): + bcf_copy_expand_array(fmt.p + i*fmt.size, fmt.type, fmt.n, + valp + i*dst_size, dst_type, alloc_len, + vlen) + + bcf_object_to_array(values, valp + sample.index*dst_size, dst_type, alloc_len, vlen) + + if bcf_update_format(hdr, r, bkey, valp, (n*alloc_len), fmt_type) < 0: + raise ValueError('Unable to update format values') + + +cdef bcf_format_del_value(VariantRecordSample sample, key): + if sample is None: + raise ValueError('sample must not be None') + + cdef bcf_hdr_t *hdr = sample.record.header.ptr + cdef bcf1_t *r = sample.record.ptr + cdef ssize_t value_count + cdef int scalar + + if bcf_unpack(r, BCF_UN_ALL) < 0: + raise ValueError('Error unpacking VariantRecord') + + bkey = force_bytes(key) + cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey) + + if not fmt or not fmt.p: + raise KeyError(key) + + bcf_get_value_count(sample.record, BCF_HL_FMT, fmt.id, &value_count, &scalar) + + if value_count <= 0: + null_value = () + elif scalar: + null_value = None + else: + null_value = (None,)*value_count + + bcf_format_set_value(sample, bkey, null_value) + + +cdef bcf_format_get_allele_indices(VariantRecordSample sample): + if sample is None: + raise ValueError('sample must not be None') + + cdef bcf_hdr_t *hdr = sample.record.header.ptr + cdef bcf1_t *r = sample.record.ptr + cdef int32_t n = bcf_hdr_nsamples(hdr) + + if bcf_unpack(r, BCF_UN_ALL) < 0: + raise ValueError('Error unpacking VariantRecord') + + if sample.index < 0 or sample.index >= n or not r.n_fmt: + return () + + cdef bcf_fmt_t *fmt0 = r.d.fmt + cdef int gt0 = is_gt_fmt(hdr, fmt0.id) + + if not gt0 or not fmt0.n: + return () + + cdef int8_t *data8 + cdef int16_t *data16 + cdef int32_t *data32 + cdef int32_t a, nalleles = r.n_allele + cdef list alleles = [] + + if fmt0.type == BCF_BT_INT8: + data8 = (fmt0.p + sample.index * fmt0.size) + for i in range(fmt0.n): + if data8[i] == bcf_int8_vector_end: + break + elif data8[i] == bcf_gt_missing: + a = -1 + else: + a = bcf_gt_allele(data8[i]) + alleles.append(a if 0 <= a < nalleles else None) + elif fmt0.type == BCF_BT_INT16: + data16 = (fmt0.p + sample.index * fmt0.size) + for i in range(fmt0.n): + if data16[i] == bcf_int16_vector_end: + break + elif data16[i] == bcf_gt_missing: + a = -1 + else: + a = 
bcf_gt_allele(data16[i]) + alleles.append(a if 0 <= a < nalleles else None) + elif fmt0.type == BCF_BT_INT32: + data32 = (fmt0.p + sample.index * fmt0.size) + for i in range(fmt0.n): + if data32[i] == bcf_int32_vector_end: + break + elif data32[i] == bcf_gt_missing: + a = -1 + else: + a = bcf_gt_allele(data32[i]) + alleles.append(a if 0 <= a < nalleles else None) + + return tuple(alleles) + + +cdef bcf_format_get_alleles(VariantRecordSample sample): + if sample is None: + raise ValueError('sample must not be None') + + cdef bcf_hdr_t *hdr = sample.record.header.ptr + cdef bcf1_t *r = sample.record.ptr + cdef int32_t nsamples = bcf_hdr_nsamples(hdr) + + if bcf_unpack(r, BCF_UN_ALL) < 0: + raise ValueError('Error unpacking VariantRecord') + + cdef int32_t nalleles = r.n_allele + + if sample.index < 0 or sample.index >= nsamples or not r.n_fmt: + return () + + cdef bcf_fmt_t *fmt0 = r.d.fmt + cdef int gt0 = is_gt_fmt(hdr, fmt0.id) + + if not gt0 or not fmt0.n: + return () + + cdef int32_t a + cdef int8_t *data8 + cdef int16_t *data16 + cdef int32_t *data32 + alleles = [] + if fmt0.type == BCF_BT_INT8: + data8 = (fmt0.p + sample.index * fmt0.size) + for i in range(fmt0.n): + if data8[i] == bcf_int8_vector_end: + break + a = bcf_gt_allele(data8[i]) + alleles.append(charptr_to_str(r.d.allele[a]) if 0 <= a < nalleles else None) + elif fmt0.type == BCF_BT_INT16: + data16 = (fmt0.p + sample.index * fmt0.size) + for i in range(fmt0.n): + if data16[i] == bcf_int16_vector_end: + break + a = bcf_gt_allele(data16[i]) + alleles.append(charptr_to_str(r.d.allele[a]) if 0 <= a < nalleles else None) + elif fmt0.type == BCF_BT_INT32: + data32 = (fmt0.p + sample.index * fmt0.size) + for i in range(fmt0.n): + if data32[i] == bcf_int32_vector_end: + break + a = bcf_gt_allele(data32[i]) + alleles.append(charptr_to_str(r.d.allele[a]) if 0 <= a < nalleles else None) + return tuple(alleles) + + +cdef bint bcf_sample_get_phased(VariantRecordSample sample): + if sample is None: + raise ValueError('sample must not be None') + + cdef bcf_hdr_t *hdr = sample.record.header.ptr + cdef bcf1_t *r = sample.record.ptr + cdef int32_t n = bcf_hdr_nsamples(hdr) + + if bcf_unpack(r, BCF_UN_ALL) < 0: + raise ValueError('Error unpacking VariantRecord') + + if sample.index < 0 or sample.index >= n or not r.n_fmt: + return False + + cdef bcf_fmt_t *fmt0 = r.d.fmt + cdef int gt0 = is_gt_fmt(hdr, fmt0.id) + + if not gt0 or not fmt0.n: + return False + + cdef int8_t *data8 + cdef int16_t *data16 + cdef int32_t *data32 + + cdef bint phased = False + + if fmt0.type == BCF_BT_INT8: + data8 = (fmt0.p + sample.index * fmt0.size) + for i in range(fmt0.n): + if data8[i] == bcf_int8_vector_end: + break + elif data8[i] == bcf_int8_missing: + continue + elif i and not bcf_gt_is_phased(data8[i]): + return False + else: + phased = True + elif fmt0.type == BCF_BT_INT16: + data16 = (fmt0.p + sample.index * fmt0.size) + for i in range(fmt0.n): + if data16[i] == bcf_int16_vector_end: + break + elif data16[i] == bcf_int16_missing: + continue + elif i and not bcf_gt_is_phased(data16[i]): + return False + else: + phased = True + elif fmt0.type == BCF_BT_INT32: + data32 = (fmt0.p + sample.index * fmt0.size) + for i in range(fmt0.n): + if data32[i] == bcf_int32_vector_end: + break + elif data32[i] == bcf_int32_missing: + continue + elif i and not bcf_gt_is_phased(data32[i]): + return False + else: + phased = True + + return phased + + +cdef bcf_sample_set_phased(VariantRecordSample sample, bint phased): + if sample is None: + raise ValueError('sample must 
not be None') + + cdef bcf_hdr_t *hdr = sample.record.header.ptr + cdef bcf1_t *r = sample.record.ptr + cdef int32_t n = bcf_hdr_nsamples(hdr) + + if bcf_unpack(r, BCF_UN_ALL) < 0: + raise ValueError('Error unpacking VariantRecord') + + if sample.index < 0 or sample.index >= n or not r.n_fmt: + return + + cdef bcf_fmt_t *fmt0 = r.d.fmt + cdef int gt0 = is_gt_fmt(hdr, fmt0.id) + + if not gt0 or not fmt0.n: + raise ValueError('Cannot set phased before genotype is set') + + cdef int8_t *data8 + cdef int16_t *data16 + cdef int32_t *data32 + + if fmt0.type == BCF_BT_INT8: + data8 = (fmt0.p + sample.index * fmt0.size) + for i in range(fmt0.n): + if data8[i] == bcf_int8_vector_end: + break + elif data8[i] == bcf_int8_missing: + continue + elif i: + data8[i] = (data8[i] & 0xFE) | phased + elif fmt0.type == BCF_BT_INT16: + data16 = (fmt0.p + sample.index * fmt0.size) + for i in range(fmt0.n): + if data16[i] == bcf_int16_vector_end: + break + elif data16[i] == bcf_int16_missing: + continue + elif i: + data16[i] = (data16[i] & 0xFFFE) | phased + elif fmt0.type == BCF_BT_INT32: + data32 = (fmt0.p + sample.index * fmt0.size) + for i in range(fmt0.n): + if data32[i] == bcf_int32_vector_end: + break + elif data32[i] == bcf_int32_missing: + continue + elif i: + data32[i] = (data32[i] & 0xFFFFFFFE) | phased + + +######################################################################## +######################################################################## +## Variant Header objects +######################################################################## + + +cdef bcf_header_remove_hrec(VariantHeader header, int i): + if header is None: + raise ValueError('header must not be None') + + cdef bcf_hdr_t *hdr = header.ptr + + if i < 0 or i >= hdr.nhrec: + raise ValueError('Invalid header record index') + + cdef bcf_hrec_t *hrec = hdr.hrec[i] + hdr.nhrec -= 1 + + if i < hdr.nhrec: + memmove(&hdr.hrec[i], &hdr.hrec[i+1], (hdr.nhrec-i)*sizeof(bcf_hrec_t*)) + + bcf_hrec_destroy(hrec) + hdr.hrec[hdr.nhrec] = NULL + hdr.dirty = 1 + + +#FIXME: implement a full mapping interface +#FIXME: passing bcf_hrec_t* is not safe, since we cannot control the +# object lifetime. +cdef class VariantHeaderRecord(object): + """header record from a :class:`VariantHeader` object""" + def __init__(self, *args, **kwargs): + raise TypeError('this class cannot be instantiated from Python') + + @property + def type(self): + """header type: FILTER, INFO, FORMAT, CONTIG, STRUCTURED, or GENERIC""" + cdef bcf_hrec_t *r = self.ptr + if not r: + return None + return METADATA_TYPES[r.type] + + @property + def key(self): + """header key (the part before '=', in FILTER/INFO/FORMAT/contig/fileformat etc.)""" + cdef bcf_hrec_t *r = self.ptr + return bcf_str_cache_get_charptr(r.key) if r and r.key else None + + @property + def value(self): + """header value. 
Set only for generic lines, None for FILTER/INFO, etc.""" + cdef bcf_hrec_t *r = self.ptr + return charptr_to_str(r.value) if r and r.value else None + + @property + def attrs(self): + """sequence of additional header attributes""" + cdef bcf_hrec_t *r = self.ptr + if not r: + return () + cdef int i + return tuple((bcf_str_cache_get_charptr(r.keys[i]) if r.keys[i] else None, + charptr_to_str(r.vals[i]) if r.vals[i] else None) + for i in range(r.nkeys)) + + def __len__(self): + cdef bcf_hrec_t *r = self.ptr + return r.nkeys if r else 0 + + def __bool__(self): + cdef bcf_hrec_t *r = self.ptr + return r != NULL and r.nkeys != 0 + + def __getitem__(self, key): + """get attribute value""" + cdef bcf_hrec_t *r = self.ptr + cdef int i + if r: + bkey = force_bytes(key) + for i in range(r.nkeys): + if r.keys[i] and r.keys[i] == bkey: + return charptr_to_str(r.vals[i]) if r.vals[i] else None + raise KeyError('cannot find metadata key') + + def __iter__(self): + cdef bcf_hrec_t *r = self.ptr + if not r: + return + cdef int i + for i in range(r.nkeys): + if r.keys[i]: + yield bcf_str_cache_get_charptr(r.keys[i]) + + def get(self, key, default=None): + """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None.""" + try: + return self[key] + except KeyError: + return default + + def __contains__(self, key): + try: + self[key] + except KeyError: + return False + else: + return True + + def iterkeys(self): + """D.iterkeys() -> an iterator over the keys of D""" + return iter(self) + + def itervalues(self): + """D.itervalues() -> an iterator over the values of D""" + cdef bcf_hrec_t *r = self.ptr + if not r: + return + cdef int i + for i in range(r.nkeys): + if r.keys[i]: + yield charptr_to_str(r.vals[i]) if r.vals[i] else None + + def iteritems(self): + """D.iteritems() -> an iterator over the (key, value) items of D""" + cdef bcf_hrec_t *r = self.ptr + if not r: + return + cdef int i + for i in range(r.nkeys): + if r.keys[i]: + yield (bcf_str_cache_get_charptr(r.keys[i]), charptr_to_str(r.vals[i]) if r.vals[i] else None) + + def keys(self): + """D.keys() -> list of D's keys""" + return list(self) + + def items(self): + """D.items() -> list of D's (key, value) pairs, as 2-tuples""" + return list(self.iteritems()) + + def values(self): + """D.values() -> list of D's values""" + return list(self.itervalues()) + + # Mappings are not hashable by default, but subclasses can change this + __hash__ = None + + #TODO: implement __richcmp__ + + def __str__(self): + cdef bcf_hrec_t *r = self.ptr + + if not r: + raise ValueError('cannot convert deleted record to str') + + cdef kstring_t hrec_str + hrec_str.l = hrec_str.m = 0 + hrec_str.s = NULL + + bcf_hrec_format(r, &hrec_str) + + ret = charptr_to_str_w_len(hrec_str.s, hrec_str.l) + + if hrec_str.m: + free(hrec_str.s) + + return ret + + # FIXME: Not safe -- causes trivial segfaults at the moment + def remove(self): + cdef bcf_hdr_t *hdr = self.header.ptr + cdef bcf_hrec_t *r = self.ptr + if not r: + return + assert(r.key) + cdef char *key = r.key if r.type == BCF_HL_GEN else r.value + print('Removing header type={} key={} value={} hdr={}'.format(METADATA_TYPES[r.type], r.key, r.value, key)) + bcf_hdr_remove(hdr, r.type, key) + self.ptr = NULL + + +cdef VariantHeaderRecord makeVariantHeaderRecord(VariantHeader header, bcf_hrec_t *hdr): + if not header: + raise ValueError('invalid VariantHeader') + + if not hdr: + return None + + cdef VariantHeaderRecord record = VariantHeaderRecord.__new__(VariantHeaderRecord) + record.header = header + record.ptr = hdr + + return 
record + + +cdef class VariantHeaderRecords(object): + """sequence of :class:`VariantHeaderRecord` object from a :class:`VariantHeader` object""" + def __init__(self, *args, **kwargs): + raise TypeError('this class cannot be instantiated from Python') + + def __len__(self): + return self.header.ptr.nhrec + + def __bool__(self): + return self.header.ptr.nhrec != 0 + + def __getitem__(self, index): + cdef int32_t i = index + if i < 0 or i >= self.header.ptr.nhrec: + raise IndexError('invalid header record index') + return makeVariantHeaderRecord(self.header, self.header.ptr.hrec[i]) + + def __iter__(self): + cdef int32_t i + for i in range(self.header.ptr.nhrec): + yield makeVariantHeaderRecord(self.header, self.header.ptr.hrec[i]) + + __hash__ = None + + +cdef VariantHeaderRecords makeVariantHeaderRecords(VariantHeader header): + if not header: + raise ValueError('invalid VariantHeader') + + cdef VariantHeaderRecords records = VariantHeaderRecords.__new__(VariantHeaderRecords) + records.header = header + return records + + +cdef class VariantMetadata(object): + """filter, info or format metadata record from a :class:`VariantHeader` object""" + def __init__(self, *args, **kwargs): + raise TypeError('this class cannot be instantiated from Python') + + @property + def name(self): + """metadata name""" + cdef bcf_hdr_t *hdr = self.header.ptr + return bcf_str_cache_get_charptr(hdr.id[BCF_DT_ID][self.id].key) + + # Q: Should this be exposed? + @property + def id(self): + """metadata internal header id number""" + return self.id + + @property + def number(self): + """metadata number (i.e. cardinality)""" + cdef bcf_hdr_t *hdr = self.header.ptr + + if not check_header_id(hdr, self.type, self.id): + raise ValueError('Invalid header id') + + if self.type == BCF_HL_FLT: + return None + + cdef int l = bcf_hdr_id2length(hdr, self.type, self.id) + if l == BCF_VL_FIXED: + return bcf_hdr_id2number(hdr, self.type, self.id) + elif l == BCF_VL_VAR: + return '.' 
+ else: + return METADATA_LENGTHS[l] + + @property + def type(self): + """metadata value type""" + cdef bcf_hdr_t *hdr = self.header.ptr + if not check_header_id(hdr, self.type, self.id): + raise ValueError('Invalid header id') + + if self.type == BCF_HL_FLT: + return None + return VALUE_TYPES[bcf_hdr_id2type(hdr, self.type, self.id)] + + @property + def description(self): + """metadata description (or None if not set)""" + descr = self.record.get('Description') + if descr: + descr = descr.strip('"') + return force_str(descr) + + @property + def record(self): + """:class:`VariantHeaderRecord` associated with this :class:`VariantMetadata` object""" + cdef bcf_hdr_t *hdr = self.header.ptr + if not check_header_id(hdr, self.type, self.id): + raise ValueError('Invalid header id') + cdef bcf_hrec_t *hrec = hdr.id[BCF_DT_ID][self.id].val.hrec[self.type] + if not hrec: + return None + return makeVariantHeaderRecord(self.header, hrec) + + def remove_header(self): + cdef bcf_hdr_t *hdr = self.header.ptr + cdef const char *bkey = hdr.id[BCF_DT_ID][self.id].key + bcf_hdr_remove(hdr, self.type, bkey) + + +cdef VariantMetadata makeVariantMetadata(VariantHeader header, int type, int id): + if not header: + raise ValueError('invalid VariantHeader') + + if type != BCF_HL_FLT and type != BCF_HL_INFO and type != BCF_HL_FMT: + raise ValueError('invalid metadata type') + + if id < 0 or id >= header.ptr.n[BCF_DT_ID]: + raise ValueError('invalid metadata id') + + cdef VariantMetadata meta = VariantMetadata.__new__(VariantMetadata) + meta.header = header + meta.type = type + meta.id = id + + return meta + + +cdef class VariantHeaderMetadata(object): + """mapping from filter, info or format name to :class:`VariantMetadata` object""" + def __init__(self, *args, **kwargs): + raise TypeError('this class cannot be instantiated from Python') + + def add(self, id, number, type, description, **kwargs): + """Add a new filter, info or format record""" + if id in self: + raise ValueError('Header already exists for id={}'.format(id)) + + if self.type == BCF_HL_FLT: + if number is not None: + raise ValueError('Number must be None when adding a filter') + if type is not None: + raise ValueError('Type must be None when adding a filter') + + items = [('ID', id), ('Description', description)] + else: + if type not in VALUE_TYPES: + raise ValueError('unknown type specified: {}'.format(type)) + if number is None: + number = '.' 
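+                # None is normalized to '.', the VCF spelling for an
+                # unspecified (variable-length) Number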
+ + items = [('ID', id), + ('Number', number), + ('Type', type), + ('Description', description)] + + items += kwargs.items() + self.header.add_meta(METADATA_TYPES[self.type], items=items) + + def __len__(self): + cdef bcf_hdr_t *hdr = self.header.ptr + cdef bcf_idpair_t *idpair + cdef int32_t i, n = 0 + + for i in range(hdr.n[BCF_DT_ID]): + idpair = hdr.id[BCF_DT_ID] + i + if idpair.key and idpair.val and idpair.val.info[self.type] & 0xF != 0xF: + n += 1 + return n + + def __bool__(self): + cdef bcf_hdr_t *hdr = self.header.ptr + cdef bcf_idpair_t *idpair + cdef int32_t i + + for i in range(hdr.n[BCF_DT_ID]): + idpair = hdr.id[BCF_DT_ID] + i + if idpair.key and idpair.val and idpair.val.info[self.type] & 0xF != 0xF: + return True + return False + + def __getitem__(self, key): + cdef bcf_hdr_t *hdr = self.header.ptr + cdef vdict_t *d = hdr.dict[BCF_DT_ID] + + bkey = force_bytes(key) + cdef khiter_t k = kh_get_vdict(d, bkey) + + if k == kh_end(d) or kh_val_vdict(d, k).info[self.type] & 0xF == 0xF: + raise KeyError('invalid key') + + return makeVariantMetadata(self.header, self.type, kh_val_vdict(d, k).id) + + def remove_header(self, key): + cdef bcf_hdr_t *hdr = self.header.ptr + cdef vdict_t *d = hdr.dict[BCF_DT_ID] + + bkey = force_bytes(key) + cdef khiter_t k = kh_get_vdict(d, bkey) + + if k == kh_end(d) or kh_val_vdict(d, k).info[self.type] & 0xF == 0xF: + raise KeyError('invalid key') + + bcf_hdr_remove(hdr, self.type, bkey) + #bcf_hdr_sync(hdr) + + def clear_header(self): + cdef bcf_hdr_t *hdr = self.header.ptr + bcf_hdr_remove(hdr, self.type, NULL) + #bcf_hdr_sync(hdr) + + def __iter__(self): + cdef bcf_hdr_t *hdr = self.header.ptr + cdef bcf_idpair_t *idpair + cdef int32_t i + + for i in range(hdr.n[BCF_DT_ID]): + idpair = hdr.id[BCF_DT_ID] + i + if idpair.key and idpair.val and idpair.val.info[self.type] & 0xF != 0xF: + yield bcf_str_cache_get_charptr(idpair.key) + + def get(self, key, default=None): + """D.get(k[,d]) -> D[k] if k in D, else d. 
d defaults to None.""" + try: + return self[key] + except KeyError: + return default + + def __contains__(self, key): + try: + self[key] + except KeyError: + return False + else: + return True + + def iterkeys(self): + """D.iterkeys() -> an iterator over the keys of D""" + return iter(self) + + def itervalues(self): + """D.itervalues() -> an iterator over the values of D""" + for key in self: + yield self[key] + + def iteritems(self): + """D.iteritems() -> an iterator over the (key, value) items of D""" + for key in self: + yield (key, self[key]) + + def keys(self): + """D.keys() -> list of D's keys""" + return list(self) + + def items(self): + """D.items() -> list of D's (key, value) pairs, as 2-tuples""" + return list(self.iteritems()) + + def values(self): + """D.values() -> list of D's values""" + return list(self.itervalues()) + + # Mappings are not hashable by default, but subclasses can change this + __hash__ = None + + #TODO: implement __richcmp__ + + +cdef VariantHeaderMetadata makeVariantHeaderMetadata(VariantHeader header, int32_t type): + if not header: + raise ValueError('invalid VariantHeader') + + cdef VariantHeaderMetadata meta = VariantHeaderMetadata.__new__(VariantHeaderMetadata) + meta.header = header + meta.type = type + + return meta + + +cdef class VariantContig(object): + """contig metadata from a :class:`VariantHeader`""" + def __init__(self, *args, **kwargs): + raise TypeError('this class cannot be instantiated from Python') + + @property + def name(self): + """contig name""" + cdef bcf_hdr_t *hdr = self.header.ptr + return bcf_str_cache_get_charptr(hdr.id[BCF_DT_CTG][self.id].key) + + @property + def id(self): + """contig internal id number""" + return self.id + + @property + def length(self): + """contig length or None if not available""" + cdef bcf_hdr_t *hdr = self.header.ptr + cdef uint32_t length = hdr.id[BCF_DT_CTG][self.id].val.info[0] + return length if length else None + + @property + def header(self): + """:class:`VariantHeaderRecord` associated with this :class:`VariantContig` object""" + cdef bcf_hdr_t *hdr = self.header.ptr + cdef bcf_hrec_t *hrec = hdr.id[BCF_DT_CTG][self.id].val.hrec[0] + return makeVariantHeaderRecord(self.header, hrec) + + def remove_header(self): + cdef bcf_hdr_t *hdr = self.header.ptr + cdef const char *bkey = hdr.id[BCF_DT_CTG][self.id].key + bcf_hdr_remove(hdr, BCF_HL_CTG, bkey) + + +cdef VariantContig makeVariantContig(VariantHeader header, int id): + if not header: + raise ValueError('invalid VariantHeader') + + if id < 0 or id >= header.ptr.n[BCF_DT_CTG]: + raise ValueError('invalid contig id') + + cdef VariantContig contig = VariantContig.__new__(VariantContig) + contig.header = header + contig.id = id + + return contig + + +cdef class VariantHeaderContigs(object): + """mapping from contig name or index to :class:`VariantContig` object.""" + def __init__(self, *args, **kwargs): + raise TypeError('this class cannot be instantiated from Python') + + def __len__(self): + cdef bcf_hdr_t *hdr = self.header.ptr + assert kh_size(hdr.dict[BCF_DT_CTG]) == hdr.n[BCF_DT_CTG] + return hdr.n[BCF_DT_CTG] + + def __bool__(self): + cdef bcf_hdr_t *hdr = self.header.ptr + assert kh_size(hdr.dict[BCF_DT_CTG]) == hdr.n[BCF_DT_CTG] + return hdr.n[BCF_DT_CTG] != 0 + + def __getitem__(self, key): + cdef bcf_hdr_t *hdr = self.header.ptr + cdef int index + + if isinstance(key, int): + index = key + if index < 0 or index >= hdr.n[BCF_DT_CTG]: + raise IndexError('invalid contig index') + return makeVariantContig(self.header, index) + + cdef vdict_t 
*d = hdr.dict[BCF_DT_CTG]
+        bkey = force_bytes(key)
+        cdef khiter_t k = kh_get_vdict(d, bkey)
+
+        if k == kh_end(d):
+            raise KeyError('invalid contig')
+
+        cdef int id = kh_val_vdict(d, k).id
+
+        return makeVariantContig(self.header, id)
+
+    def remove_header(self, key):
+        cdef bcf_hdr_t *hdr = self.header.ptr
+        cdef int index
+        cdef const char *bkey
+        cdef vdict_t *d
+        cdef khiter_t k
+
+        if isinstance(key, int):
+            index = key
+            if index < 0 or index >= hdr.n[BCF_DT_CTG]:
+                raise IndexError('invalid contig index')
+            bkey = hdr.id[BCF_DT_CTG][index].key
+        else:
+            d = hdr.dict[BCF_DT_CTG]
+            key = force_bytes(key)
+            if kh_get_vdict(d, key) == kh_end(d):
+                raise KeyError('invalid contig')
+            bkey = key
+
+        bcf_hdr_remove(hdr, BCF_HL_CTG, bkey)
+
+    def clear_header(self):
+        cdef bcf_hdr_t *hdr = self.header.ptr
+        bcf_hdr_remove(hdr, BCF_HL_CTG, NULL)
+        #bcf_hdr_sync(hdr)
+
+    def __iter__(self):
+        cdef bcf_hdr_t *hdr = self.header.ptr
+        cdef vdict_t *d = hdr.dict[BCF_DT_CTG]
+        cdef uint32_t n = kh_size(d)
+
+        assert n == hdr.n[BCF_DT_CTG]
+
+        for i in range(n):
+            yield bcf_str_cache_get_charptr(bcf_hdr_id2name(hdr, i))
+
+    def get(self, key, default=None):
+        """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
+        try:
+            return self[key]
+        except KeyError:
+            return default
+
+    def __contains__(self, key):
+        try:
+            self[key]
+        except KeyError:
+            return False
+        else:
+            return True
+
+    def iterkeys(self):
+        """D.iterkeys() -> an iterator over the keys of D"""
+        return iter(self)
+
+    def itervalues(self):
+        """D.itervalues() -> an iterator over the values of D"""
+        for key in self:
+            yield self[key]
+
+    def iteritems(self):
+        """D.iteritems() -> an iterator over the (key, value) items of D"""
+        for key in self:
+            yield (key, self[key])
+
+    def keys(self):
+        """D.keys() -> list of D's keys"""
+        return list(self)
+
+    def items(self):
+        """D.items() -> list of D's (key, value) pairs, as 2-tuples"""
+        return list(self.iteritems())
+
+    def values(self):
+        """D.values() -> list of D's values"""
+        return list(self.itervalues())
+
+    # Mappings are not hashable by default, but subclasses can change this
+    __hash__ = None
+
+    #TODO: implement __richcmp__
+
+    def add(self, id, **kwargs):
+        """Add a new contig record"""
+        if id in self:
+            raise ValueError('Header already exists for contig {}'.format(id))
+
+        # list(...) is required for Python 3, where dict.items() is a view
+        items = [('ID', id)] + list(kwargs.items())
+        self.header.add_meta('contig', items=items)
+
+
+cdef VariantHeaderContigs makeVariantHeaderContigs(VariantHeader header):
+    if not header:
+        raise ValueError('invalid VariantHeader')
+
+    cdef VariantHeaderContigs contigs = VariantHeaderContigs.__new__(VariantHeaderContigs)
+    contigs.header = header
+
+    return contigs
+
+
+cdef class VariantHeaderSamples(object):
+    """sequence of sample names from a :class:`VariantHeader` object"""
+    def __init__(self, *args, **kwargs):
+        raise TypeError('this class cannot be instantiated from Python')
+
+    def __len__(self):
+        return bcf_hdr_nsamples(self.header.ptr)
+
+    def __bool__(self):
+        return bcf_hdr_nsamples(self.header.ptr) != 0
+
+    def __getitem__(self, index):
+        cdef bcf_hdr_t *hdr = self.header.ptr
+        cdef int32_t n = bcf_hdr_nsamples(hdr)
+        cdef int32_t i = index
+
+        if i < 0 or i >= n:
+            raise IndexError('invalid sample index')
+
+        return charptr_to_str(hdr.samples[i])
+
+    def __iter__(self):
+        cdef bcf_hdr_t *hdr = self.header.ptr
+        cdef int32_t i, n = bcf_hdr_nsamples(hdr)
+
+        for i in range(n):
+            yield charptr_to_str(hdr.samples[i])
+
+    def __contains__(self, key):
+        cdef bcf_hdr_t *hdr = self.header.ptr
+
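+        # sample names are interned in the header's BCF_DT_SAMPLE
+        # dictionary, so membership testing is a single hash lookup
+        # rather than a scan of the sample list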
cdef vdict_t *d = hdr.dict[BCF_DT_SAMPLE] + bkey = force_bytes(key) + cdef khiter_t k = kh_get_vdict(d, bkey) + + return k != kh_end(d) + + # Mappings are not hashable by default, but subclasses can change this + __hash__ = None + + #TODO: implement __richcmp__ + + def add(self, name): + """Add a new sample""" + self.header.add_sample(name) + + +cdef VariantHeaderSamples makeVariantHeaderSamples(VariantHeader header): + if not header: + raise ValueError('invalid VariantHeader') + + cdef VariantHeaderSamples samples = VariantHeaderSamples.__new__(VariantHeaderSamples) + samples.header = header + + return samples + + +cdef class VariantHeader(object): + """header information for a :class:`VariantFile` object""" + #FIXME: Add structured proxy + #FIXME: Add generic proxy + #FIXME: Add mutable methods + + # See makeVariantHeader for C constructor + def __cinit__(self): + self.ptr = NULL + + # Python constructor + def __init__(self): + self.ptr = bcf_hdr_init(b'w') + if not self.ptr: + raise ValueError('cannot create VariantHeader') + + def __dealloc__(self): + if self.ptr: + bcf_hdr_destroy(self.ptr) + self.ptr = NULL + + def __bool__(self): + # self.ptr == NULL should be impossible + return self.ptr != NULL + + def copy(self): + return makeVariantHeader(bcf_hdr_dup(self.ptr)) + + def merge(self, VariantHeader header): + if header is None: + raise ValueError('header must not be None') + bcf_hdr_merge(self.ptr, header.ptr) + + @property + def version(self): + """VCF version""" + return force_str(bcf_hdr_get_version(self.ptr)) + + @property + def samples(self): + """samples (:class:`VariantHeaderSamples`)""" + return makeVariantHeaderSamples(self) + + @property + def records(self): + """header records (:class:`VariantHeaderRecords`)""" + return makeVariantHeaderRecords(self) + + @property + def contigs(self): + """contig information (:class:`VariantHeaderContigs`)""" + return makeVariantHeaderContigs(self) + + @property + def filters(self): + """filter metadata (:class:`VariantHeaderMetadata`)""" + return makeVariantHeaderMetadata(self, BCF_HL_FLT) + + @property + def info(self): + """info metadata (:class:`VariantHeaderMetadata`)""" + return makeVariantHeaderMetadata(self, BCF_HL_INFO) + + @property + def formats(self): + """format metadata (:class:`VariantHeaderMetadata`)""" + return makeVariantHeaderMetadata(self, BCF_HL_FMT) + + @property + def alts(self): + """alt metadata (:class:`dict` ID->record). + + The data returned just a snapshot of alt records, is created + every time the property is requested, and modifications will + not be reflected in the header metadata and vice versa. + + i.e. it is just a dict that reflects the state of alt records + at the time it is created. 
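+
+        For example, the following sketch (the filename is illustrative
+        only; any VCF with ALT header records would do) walks over the
+        snapshot::
+
+            vf = pysam.VariantFile('example.vcf')
+            for alt_id, rec in vf.header.alts.items():
+                print(alt_id, rec)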
+ """ + return {record['ID']:record for record in self.records + if record.key.upper() == 'ALT' } + + # only safe to do when opening an htsfile + cdef _subset_samples(self, include_samples): + keep_samples = set(self.samples) + include_samples = set(include_samples) + missing_samples = include_samples - keep_samples + keep_samples &= include_samples + + if missing_samples: + # FIXME: add specialized exception with payload + raise ValueError( + 'missing {:d} requested samples'.format( + len(missing_samples))) + + keep_samples = force_bytes(','.join(keep_samples)) + cdef char *keep = keep_samples if keep_samples else NULL + cdef ret = bcf_hdr_set_samples(self.ptr, keep, 0) + + if ret != 0: + raise ValueError( + 'bcf_hdr_set_samples failed: ret = {}'.format(ret)) + + def __str__(self): + cdef int hlen + cdef char *hstr = bcf_hdr_fmt_text(self.ptr, 0, &hlen) + + try: + return charptr_to_str_w_len(hstr, hlen) + finally: + free(hstr) + + cpdef VariantRecord new_record(self): + """Create a new empty VariantRecord""" + r = makeVariantRecord(self, bcf_init()) + r.ptr.n_sample = bcf_hdr_nsamples(self.ptr) + return r + + def add_record(self, VariantHeaderRecord record): + """Add an existing :class:`VariantHeaderRecord` to this header""" + if record is None: + raise ValueError('record must not be None') + + cdef bcf_hrec_t *hrec = bcf_hrec_dup(record.ptr) + + bcf_hdr_add_hrec(self.ptr, hrec) + + if self.ptr.dirty: + bcf_hdr_sync(self.ptr) + + def add_line(self, line): + """Add a metadata line to this header""" + bline = force_bytes(line) + if bcf_hdr_append(self.ptr, bline) < 0: + raise ValueError('invalid header line') + + if self.ptr.dirty: + bcf_hdr_sync(self.ptr) + + def add_meta(self, key, value=None, items=None): + """Add metadata to this header""" + if not ((value is not None) ^ (items is not None)): + raise ValueError('either value or items must be specified') + + cdef bcf_hrec_t *hrec = calloc(1, sizeof(bcf_hrec_t)) + cdef int quoted + + try: + key = force_bytes(key) + hrec.key = strdup(key) + + if value is not None: + hrec.value = strdup(force_bytes(value)) + else: + for key, value in items: + key = force_bytes(key) + bcf_hrec_add_key(hrec, key, len(key)) + + value = force_bytes(str(value)) + quoted = strpbrk(value, ' ;,"\t<>') != NULL + bcf_hrec_set_val(hrec, hrec.nkeys-1, value, len(value), quoted) + except: + bcf_hrec_destroy(hrec) + raise + + bcf_hdr_add_hrec(self.ptr, hrec) + + if self.ptr.dirty: + bcf_hdr_sync(self.ptr) + + def add_sample(self, name): + """Add a new sample to this header""" + bname = force_bytes(name) + if bcf_hdr_add_sample(self.ptr, bname) < 0: + raise ValueError('Duplicated sample name: {}'.format(name)) + if self.ptr.dirty: + bcf_hdr_sync(self.ptr) + + +cdef VariantHeader makeVariantHeader(bcf_hdr_t *hdr): + if not hdr: + raise ValueError('cannot create VariantHeader') + + cdef VariantHeader header = VariantHeader.__new__(VariantHeader) + header.ptr = hdr + + return header + + +######################################################################## +######################################################################## +## Variant Record objects +######################################################################## + +cdef class VariantRecordFilter(object): + """Filters set on a :class:`VariantRecord` object, presented as a mapping from + filter index or name to :class:`VariantMetadata` object""" + def __init__(self, *args, **kwargs): + raise TypeError('this class cannot be instantiated from Python') + + def __len__(self): + return self.record.ptr.d.n_flt + + 
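+    # Illustrative usage only ('rec' stands for a VariantRecord obtained
+    # from an opened VariantFile; it is not defined in this module):
+    #
+    #     rec.filter.add('PASS')    # set a filter by name
+    #     'PASS' in rec.filter      # membership test
+    #     list(rec.filter)          # names of all filters set on the record
+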
def __bool__(self): + return self.record.ptr.d.n_flt != 0 + + def __getitem__(self, key): + cdef bcf_hdr_t *hdr = self.record.header.ptr + cdef bcf1_t *r = self.record.ptr + cdef int index, id + cdef int n = r.d.n_flt + + if isinstance(key, int): + index = key + + if index < 0 or index >= n: + raise IndexError('invalid filter index') + + id = r.d.flt[index] + else: + if key == '.': + key = 'PASS' + + bkey = force_bytes(key) + id = bcf_hdr_id2int(hdr, BCF_DT_ID, bkey) + + if not check_header_id(hdr, BCF_HL_FLT, id) or not bcf_has_filter(hdr, r, bkey): + raise KeyError('Invalid filter') + + return makeVariantMetadata(self.record.header, BCF_HL_FLT, id) + + def add(self, key): + """Add a new filter""" + cdef bcf_hdr_t *hdr = self.record.header.ptr + cdef bcf1_t *r = self.record.ptr + cdef int id + + if key == '.': + key = 'PASS' + + bkey = force_bytes(key) + id = bcf_hdr_id2int(hdr, BCF_DT_ID, bkey) + + if not check_header_id(hdr, BCF_HL_FLT, id): + raise KeyError('Invalid filter') + + bcf_add_filter(hdr, r, id) + + def __delitem__(self, key): + cdef bcf_hdr_t *hdr = self.record.header.ptr + cdef bcf1_t *r = self.record.ptr + cdef int index, id + cdef int n = r.d.n_flt + + if isinstance(key, int): + index = key + + if index < 0 or index >= n: + raise IndexError('invalid filter index') + + id = r.d.flt[index] + else: + if key == '.': + key = 'PASS' + + bkey = force_bytes(key) + id = bcf_hdr_id2int(hdr, BCF_DT_ID, bkey) + + if not check_header_id(hdr, BCF_HL_FLT, id) or not bcf_has_filter(hdr, r, bkey): + raise KeyError('Invalid filter') + + bcf_remove_filter(hdr, r, id, 0) + + def clear(self): + """Clear all filters""" + cdef bcf1_t *r = self.record.ptr + r.d.shared_dirty |= BCF1_DIRTY_FLT + r.d.n_flt = 0 + + def __iter__(self): + cdef bcf_hdr_t *hdr = self.record.header.ptr + cdef bcf1_t *r = self.record.ptr + cdef int i + + for i in range(r.d.n_flt): + yield bcf_str_cache_get_charptr(bcf_hdr_int2id(hdr, BCF_DT_ID, r.d.flt[i])) + + def get(self, key, default=None): + """D.get(k[,d]) -> D[k] if k in D, else d. 
d defaults to None.""" + try: + return self[key] + except KeyError: + return default + + def __contains__(self, key): + cdef bcf_hdr_t *hdr = self.record.header.ptr + cdef bcf1_t *r = self.record.ptr + bkey = force_bytes(key) + return bcf_has_filter(hdr, r, bkey) == 1 + + def iterkeys(self): + """D.iterkeys() -> an iterator over the keys of D""" + return iter(self) + + def itervalues(self): + """D.itervalues() -> an iterator over the values of D""" + for key in self: + yield self[key] + + def iteritems(self): + """D.iteritems() -> an iterator over the (key, value) items of D""" + for key in self: + yield (key, self[key]) + + def keys(self): + """D.keys() -> list of D's keys""" + return list(self) + + def items(self): + """D.items() -> list of D's (key, value) pairs, as 2-tuples""" + return list(self.iteritems()) + + def values(self): + """D.values() -> list of D's values""" + return list(self.itervalues()) + + # Mappings are not hashable by default, but subclasses can change this + __hash__ = None + + #TODO: implement __richcmp__ + + +cdef VariantRecordFilter makeVariantRecordFilter(VariantRecord record): + if not record: + raise ValueError('invalid VariantRecord') + + cdef VariantRecordFilter filter = VariantRecordFilter.__new__(VariantRecordFilter) + filter.record = record + + return filter + + +cdef class VariantRecordFormat(object): + """Format data present for each sample in a :class:`VariantRecord` object, + presented as mapping from format name to :class:`VariantMetadata` object.""" + def __init__(self, *args, **kwargs): + raise TypeError('this class cannot be instantiated from Python') + + def __len__(self): + cdef bcf_hdr_t *hdr = self.record.header.ptr + cdef bcf1_t *r = self.record.ptr + cdef int i, n = 0 + + for i in range(r.n_fmt): + if r.d.fmt[i].p: + n += 1 + return n + + def __bool__(self): + cdef bcf_hdr_t *hdr = self.record.header.ptr + cdef bcf1_t *r = self.record.ptr + cdef int i + + for i in range(r.n_fmt): + if r.d.fmt[i].p: + return True + return False + + def __getitem__(self, key): + cdef bcf_hdr_t *hdr = self.record.header.ptr + cdef bcf1_t *r = self.record.ptr + + bkey = force_bytes(key) + cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey) + + if not fmt or not fmt.p: + raise KeyError('unknown format') + + return makeVariantMetadata(self.record.header, BCF_HL_FMT, fmt.id) + + def __delitem__(self, key): + cdef bcf_hdr_t *hdr = self.record.header.ptr + cdef bcf1_t *r = self.record.ptr + + bkey = force_bytes(key) + cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey) + + if not fmt or not fmt.p: + raise KeyError('unknown format') + + if bcf_update_format(hdr, r, bkey, fmt.p, 0, fmt.type) < 0: + raise ValueError('Unable to delete FORMAT') + + def clear(self): + """Clear all formats for all samples within the associated + :class:`VariantRecord` instance""" + cdef bcf_hdr_t *hdr = self.record.header.ptr + cdef bcf1_t *r = self.record.ptr + cdef bcf_fmt_t *fmt + cdef const char *key + cdef int i + + for i in reversed(range(r.n_fmt)): + fmt = &r.d.fmt[i] + if fmt.p: + key = bcf_hdr_int2id(hdr, BCF_DT_ID, fmt.id) + if bcf_update_format(hdr, r, key, fmt.p, 0, fmt.type) < 0: + raise ValueError('Unable to delete FORMAT') + + def __iter__(self): + cdef bcf_hdr_t *hdr = self.record.header.ptr + cdef bcf1_t *r = self.record.ptr + cdef bcf_fmt_t *fmt + cdef int i + + for i in range(r.n_fmt): + fmt = &r.d.fmt[i] + if fmt.p: + yield bcf_str_cache_get_charptr(bcf_hdr_int2id(hdr, BCF_DT_ID, fmt.id)) + + def get(self, key, default=None): + """D.get(k[,d]) -> D[k] if k in D, else d. 
d defaults to None.""" + try: + return self[key] + except KeyError: + return default + + def __contains__(self, key): + cdef bcf_hdr_t *hdr = self.record.header.ptr + cdef bcf1_t *r = self.record.ptr + bkey = force_bytes(key) + cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey) + return fmt != NULL and fmt.p != NULL + + def iterkeys(self): + """D.iterkeys() -> an iterator over the keys of D""" + return iter(self) + + def itervalues(self): + """D.itervalues() -> an iterator over the values of D""" + for key in self: + yield self[key] + + def iteritems(self): + """D.iteritems() -> an iterator over the (key, value) items of D""" + for key in self: + yield (key, self[key]) + + def keys(self): + """D.keys() -> list of D's keys""" + return list(self) + + def items(self): + """D.items() -> list of D's (key, value) pairs, as 2-tuples""" + return list(self.iteritems()) + + def values(self): + """D.values() -> list of D's values""" + return list(self.itervalues()) + + # Mappings are not hashable by default, but subclasses can change this + __hash__ = None + + #TODO: implement __richcmp__ + + +cdef VariantRecordFormat makeVariantRecordFormat(VariantRecord record): + if not record: + raise ValueError('invalid VariantRecord') + + cdef VariantRecordFormat format = VariantRecordFormat.__new__(VariantRecordFormat) + format.record = record + + return format + + +#TODO: Add a getmeta method to return the corresponding VariantMetadata? +cdef class VariantRecordInfo(object): + """Info data stored in a :class:`VariantRecord` object, presented as a + mapping from info metadata name to value.""" + + def __init__(self, *args, **kwargs): + raise TypeError('this class cannot be instantiated from Python') + + def __len__(self): + return self.record.ptr.n_info + + def __bool__(self): + return self.record.ptr.n_info != 0 + + def __getitem__(self, key): + cdef bcf_hdr_t *hdr = self.record.header.ptr + cdef bcf1_t *r = self.record.ptr + cdef vdict_t *d + cdef khiter_t k + cdef info_id + + if bcf_unpack(r, BCF_UN_INFO) < 0: + raise ValueError('Error unpacking VariantRecord') + + bkey = force_bytes(key) + cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey) + + if not info: + d = hdr.dict[BCF_DT_ID] + k = kh_get_vdict(d, bkey) + + if k == kh_end(d) or kh_val_vdict(d, k).info[BCF_HL_INFO] & 0xF == 0xF: + raise KeyError('Unknown INFO field: {}'.format(key)) + + info_id = kh_val_vdict(d, k).id + else: + info_id = info.key + + if not check_header_id(hdr, BCF_HL_INFO, info_id): + raise ValueError('Invalid header') + + if bcf_hdr_id2type(hdr, BCF_HL_INFO, info_id) == BCF_HT_FLAG: + return info != NULL and info.vptr != NULL + + if not info or not info.vptr: + raise KeyError('Invalid INFO field: {}'.format(key)) + + return bcf_info_get_value(self.record, info) + + def __setitem__(self, key, value): + bcf_info_set_value(self.record, key, value) + + def __delitem__(self, key): + cdef bcf_hdr_t *hdr = self.record.header.ptr + cdef bcf1_t *r = self.record.ptr + + if bcf_unpack(r, BCF_UN_INFO) < 0: + raise ValueError('Error unpacking VariantRecord') + + bkey = force_bytes(key) + cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey) + + if not info or not info.vptr: + raise KeyError('Unknown INFO field: {}'.format(key)) + + if bcf_update_info(hdr, r, bkey, NULL, 0, info.type) < 0: + raise ValueError('Unable to delete INFO') + + def clear(self): + """Clear all info data""" + cdef bcf_hdr_t *hdr = self.record.header.ptr + cdef bcf1_t *r = self.record.ptr + cdef bcf_info_t *info + cdef const char *key + cdef int i + + if bcf_unpack(r, 
BCF_UN_INFO) < 0: + raise ValueError('Error unpacking VariantRecord') + + for i in range(r.n_info): + info = &r.d.info[i] + if info and info.vptr: + key = bcf_hdr_int2id(hdr, BCF_DT_ID, info.key) + if bcf_update_info(hdr, r, key, NULL, 0, info.type) < 0: + raise ValueError('Unable to delete INFO') + + def __iter__(self): + cdef bcf_hdr_t *hdr = self.record.header.ptr + cdef bcf1_t *r = self.record.ptr + cdef bcf_info_t *info + cdef int i + + for i in range(r.n_info): + info = &r.d.info[i] + if info and info.vptr: + yield bcf_str_cache_get_charptr(bcf_hdr_int2id(hdr, BCF_DT_ID, info.key)) + + def get(self, key, default=None): + """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None.""" + try: + return self[key] + except KeyError: + return default + + def __contains__(self, key): + cdef bcf_hdr_t *hdr = self.record.header.ptr + cdef bcf1_t *r = self.record.ptr + + if bcf_unpack(r, BCF_UN_INFO) < 0: + raise ValueError('Error unpacking VariantRecord') + + bkey = force_bytes(key) + cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey) + + return info != NULL + + def iterkeys(self): + """D.iterkeys() -> an iterator over the keys of D""" + return iter(self) + + def itervalues(self): + """D.itervalues() -> an iterator over the values of D""" + cdef bcf1_t *r = self.record.ptr + cdef bcf_info_t *info + cdef int i + + for i in range(r.n_info): + info = &r.d.info[i] + if info and info.vptr: + yield bcf_info_get_value(self.record, info) + + def iteritems(self): + """D.iteritems() -> an iterator over the (key, value) items of D""" + cdef bcf_hdr_t *hdr = self.record.header.ptr + cdef bcf1_t *r = self.record.ptr + cdef bcf_info_t *info + cdef int i + + for i in range(r.n_info): + info = &r.d.info[i] + if info and info.vptr: + key = bcf_hdr_int2id(hdr, BCF_DT_ID, info.key) + value = bcf_info_get_value(self.record, info) + yield bcf_str_cache_get_charptr(key), value + + def keys(self): + """D.keys() -> list of D's keys""" + return list(self) + + def items(self): + """D.items() -> list of D's (key, value) pairs, as 2-tuples""" + return list(self.iteritems()) + + def values(self): + """D.values() -> list of D's values""" + return list(self.itervalues()) + + # Mappings are not hashable by default, but subclasses can change this + __hash__ = None + + #TODO: implement __richcmp__ + + +cdef VariantRecordInfo makeVariantRecordInfo(VariantRecord record): + if not record: + raise ValueError('invalid VariantRecord') + + cdef VariantRecordInfo info = VariantRecordInfo.__new__(VariantRecordInfo) + info.record = record + + return info + + +cdef class VariantRecordSamples(object): + """mapping from sample index or name to :class:`VariantRecordSample` object.""" + def __init__(self, *args, **kwargs): + raise TypeError('this class cannot be instantiated from Python') + + def __len__(self): + return bcf_hdr_nsamples(self.record.header.ptr) + + def __bool__(self): + return bcf_hdr_nsamples(self.record.header.ptr) != 0 + + def __getitem__(self, key): + cdef bcf_hdr_t *hdr = self.record.header.ptr + cdef bcf1_t *r = self.record.ptr + cdef int n = bcf_hdr_nsamples(hdr) + cdef int sample_index + cdef vdict_t *d + cdef khiter_t k + + if isinstance(key, int): + sample_index = key + else: + bkey = force_bytes(key) + sample_index = bcf_hdr_id2int(hdr, BCF_DT_SAMPLE, bkey) + if sample_index < 0: + raise KeyError('invalid sample name') + + if sample_index < 0 or sample_index >= n: + raise IndexError('invalid sample index') + + return makeVariantRecordSample(self.record, sample_index) + + def __iter__(self): + cdef bcf_hdr_t *hdr 
= self.record.header.ptr + cdef bcf1_t *r = self.record.ptr + cdef int32_t i, n = bcf_hdr_nsamples(hdr) + + for i in range(n): + yield charptr_to_str(hdr.samples[i]) + + def get(self, key, default=None): + """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None.""" + try: + return self[key] + except KeyError: + return default + + def __contains__(self, key): + cdef bcf_hdr_t *hdr = self.record.header.ptr + cdef bcf1_t *r = self.record.ptr + cdef int n = bcf_hdr_nsamples(hdr) + cdef int sample_index + cdef vdict_t *d + cdef khiter_t k + + if isinstance(key, int): + sample_index = key + else: + bkey = force_bytes(key) + sample_index = bcf_hdr_id2int(hdr, BCF_DT_SAMPLE, bkey) + if sample_index < 0: + raise KeyError('invalid sample name') + + return 0 <= sample_index < n + + def iterkeys(self): + """D.iterkeys() -> an iterator over the keys of D""" + return iter(self) + + def itervalues(self): + """D.itervalues() -> an iterator over the values of D""" + cdef bcf_hdr_t *hdr = self.record.header.ptr + cdef bcf1_t *r = self.record.ptr + cdef int32_t i, n = bcf_hdr_nsamples(hdr) + + for i in range(n): + yield makeVariantRecordSample(self.record, i) + + def iteritems(self): + """D.iteritems() -> an iterator over the (key, value) items of D""" + cdef bcf_hdr_t *hdr = self.record.header.ptr + cdef bcf1_t *r = self.record.ptr + cdef int32_t i, n = bcf_hdr_nsamples(hdr) + + for i in range(n): + yield (charptr_to_str(hdr.samples[i]), makeVariantRecordSample(self.record, i)) + + def keys(self): + """D.keys() -> list of D's keys""" + return list(self) + + def items(self): + """D.items() -> list of D's (key, value) pairs, as 2-tuples""" + return list(self.iteritems()) + + def values(self): + """D.values() -> list of D's values""" + return list(self.itervalues()) + + # Mappings are not hashable by default, but subclasses can change this + __hash__ = None + + #TODO: implement __richcmp__ + + +cdef VariantRecordSamples makeVariantRecordSamples(VariantRecord record): + if not record: + raise ValueError('invalid VariantRecord') + + cdef VariantRecordSamples samples = VariantRecordSamples.__new__( + VariantRecordSamples) + samples.record = record + + return samples + + +cdef class VariantRecord(object): + """Variant record""" + def __init__(self, *args, **kwargs): + raise TypeError('this class cannot be instantiated from Python') + + def __dealloc__(self): + if self.ptr: + bcf_destroy1(self.ptr) + self.ptr = NULL + + def copy(self): + """return a copy of this VariantRecord object""" + return makeVariantRecord(self.header, bcf_dup(self.ptr)) + + def translate(self, VariantHeader dst_header): + if dst_header is None: + raise ValueError('dst_header must not be None') + + cdef bcf_hdr_t *src_hdr = self.header.ptr + cdef bcf_hdr_t *dst_hdr = dst_header.ptr + + if src_hdr != dst_hdr: + if self.ptr.n_sample != bcf_hdr_nsamples(dst_hdr): + msg = 'Cannot translate record. 
Number of samples does not match header ({} vs {})' + raise ValueError(msg.format(self.ptr.n_sample, bcf_hdr_nsamples(dst_hdr))) + + bcf_translate(dst_hdr, src_hdr, self.ptr) + + @property + def rid(self): + """internal reference id number""" + return self.ptr.rid + + @rid.setter + def rid(self, value): + cdef bcf_hdr_t *hdr = self.header.ptr + cdef int r = value + if r < 0 or r >= hdr.n[BCF_DT_CTG] or not hdr.id[BCF_DT_CTG][r].val: + raise ValueError('invalid reference id') + self.ptr.rid = r + + @property + def chrom(self): + """chromosome/contig name""" + cdef bcf_hdr_t *hdr = self.header.ptr + cdef int rid = self.ptr.rid + if rid < 0 or rid >= hdr.n[BCF_DT_CTG]: + raise ValueError('Invalid header') + return bcf_str_cache_get_charptr(bcf_hdr_id2name(hdr, rid)) + + @chrom.setter + def chrom(self, value): + cdef vdict_t *d = self.header.ptr.dict[BCF_DT_CTG] + bchrom = force_bytes(value) + cdef khint_t k = kh_get_vdict(d, bchrom) + if k == kh_end(d): + raise ValueError('Invalid chromosome/contig') + self.ptr.rid = kh_val_vdict(d, k).id + + @property + def contig(self): + """chromosome/contig name""" + cdef bcf_hdr_t *hdr = self.header.ptr + cdef int rid = self.ptr.rid + if rid < 0 or rid >= hdr.n[BCF_DT_CTG]: + raise ValueError('Invalid header') + return bcf_str_cache_get_charptr(bcf_hdr_id2name(hdr, rid)) + + @contig.setter + def contig(self, value): + cdef vdict_t *d = self.header.ptr.dict[BCF_DT_CTG] + bchrom = force_bytes(value) + cdef khint_t k = kh_get_vdict(d, bchrom) + if k == kh_end(d): + raise ValueError('Invalid chromosome/contig') + self.ptr.rid = kh_val_vdict(d, k).id + + @property + def pos(self): + """record start position on chrom/contig (1-based inclusive)""" + return self.ptr.pos + 1 + + @pos.setter + def pos(self, value): + cdef int p = value + if p < 1: + raise ValueError('Position must be positive') + self.ptr.pos = p - 1 + + @property + def start(self): + """record start position on chrom/contig (0-based inclusive)""" + return self.ptr.pos + + @start.setter + def start(self, value): + cdef int s = value + if s < 0: + raise ValueError('Start coordinate must be non-negative') + self.ptr.pos = s + + @property + def stop(self): + """record stop position on chrom/contig (0-based exclusive)""" + return self.ptr.pos + self.ptr.rlen + + @stop.setter + def stop(self, value): + cdef int s = value + if s < self.ptr.pos: + raise ValueError('Stop coordinate must be greater than or equal to start') + self.ptr.rlen = s - self.ptr.pos + if self.ptr.rlen != len(self.ref) or 'END' in self.info: + self.info['END'] = s + + @property + def rlen(self): + """record length on chrom/contig (typically rec.stop - rec.start unless END info is supplied)""" + return self.ptr.rlen + + @rlen.setter + def rlen(self, value): + cdef int r = value + if r < 0: + raise ValueError('Reference length must be non-negative') + self.ptr.rlen = r + if r != len(self.ref) or 'END' in self.info: + self.info['END'] = self.ptr.pos + r + + @property + def qual(self): + """phred scaled quality score or None if not available""" + return self.ptr.qual if not bcf_float_is_missing(self.ptr.qual) else None + + @qual.setter + def qual(self, value): + if value is not None: + self.ptr.qual = value + else: + bcf_float_set(&self.ptr.qual, bcf_float_missing) + + +# @property +# def n_allele(self): +# return self.ptr.n_allele + +# @property +# def n_sample(self): +# return self.ptr.n_sample + + @property + def id(self): + """record identifier or None if not available""" + cdef bcf1_t *r = self.ptr + if bcf_unpack(r, BCF_UN_STR) < 
0: + raise ValueError('Error unpacking VariantRecord') + return bcf_str_cache_get_charptr(r.d.id) if r.d.id != b'.' else None + + @id.setter + def id(self, value): + cdef bcf1_t *r = self.ptr + if bcf_unpack(r, BCF_UN_STR) < 0: + raise ValueError('Error unpacking VariantRecord') + cdef char *idstr = NULL + if value is not None: + bid = force_bytes(value) + idstr = bid + if bcf_update_id(self.header.ptr, self.ptr, idstr) < 0: + raise ValueError('Error updating id') + + @property + def ref(self): + """reference allele""" + cdef bcf1_t *r = self.ptr + if bcf_unpack(r, BCF_UN_STR) < 0: + raise ValueError('Error unpacking VariantRecord') + return charptr_to_str(r.d.allele[0]) if r.d.allele else None + + @ref.setter + def ref(self, value): + cdef bcf1_t *r = self.ptr + if bcf_unpack(r, BCF_UN_STR) < 0: + raise ValueError('Error unpacking VariantRecord') + #FIXME: Set alleles directly -- this is stupid + if not value: + raise ValueError('ref allele must not be null') + value = force_bytes(value) + if r.d.allele and r.n_allele: + alleles = [r.d.allele[i] for i in range(r.n_allele)] + alleles[0] = value + else: + alleles = [value] + self.alleles = alleles + + @property + def alleles(self): + """tuple of reference allele followed by alt alleles""" + cdef bcf1_t *r = self.ptr + if bcf_unpack(r, BCF_UN_STR) < 0: + raise ValueError('Error unpacking VariantRecord') + if not r.d.allele: + return None + cdef tuple res = PyTuple_New(r.n_allele) + for i in range(r.n_allele): + a = charptr_to_str(r.d.allele[i]) + PyTuple_SET_ITEM(res, i, a) + Py_INCREF(a) + return res + + @alleles.setter + def alleles(self, value): + cdef bcf1_t *r = self.ptr + if bcf_unpack(r, BCF_UN_STR) < 0: + raise ValueError('Error unpacking VariantRecord') + value = [force_bytes(v) for v in value] + if b'' in value: + raise ValueError('cannot set null allele') + value = b','.join(value) + if bcf_update_alleles_str(self.header.ptr, r, value) < 0: + raise ValueError('Error updating alleles') + + @property + def alts(self): + """tuple of alt alleles""" + cdef bcf1_t *r = self.ptr + if bcf_unpack(r, BCF_UN_STR) < 0: + raise ValueError('Error unpacking VariantRecord') + if r.n_allele < 2 or not r.d.allele: + return None + cdef tuple res = PyTuple_New(r.n_allele - 1) + for i in range(1, r.n_allele): + a = charptr_to_str(r.d.allele[i]) + PyTuple_SET_ITEM(res, i - 1, a) + Py_INCREF(a) + return res + + @alts.setter + def alts(self, value): + #FIXME: Set alleles directly -- this is stupid + cdef bcf1_t *r = self.ptr + if bcf_unpack(r, BCF_UN_STR) < 0: + raise ValueError('Error unpacking VariantRecord') + value = [force_bytes(v) for v in value] + if b'' in value: + raise ValueError('cannot set null alt allele') + ref = [r.d.allele[0] if r.d.allele and r.n_allele else b'.'] + self.alleles = ref + value + + @property + def filter(self): + """filter information (see :class:`VariantRecordFilter`)""" + if bcf_unpack(self.ptr, BCF_UN_FLT) < 0: + raise ValueError('Error unpacking VariantRecord') + return makeVariantRecordFilter(self) + + @property + def info(self): + """info data (see :class:`VariantRecordInfo`)""" + if bcf_unpack(self.ptr, BCF_UN_INFO) < 0: + raise ValueError('Error unpacking VariantRecord') + return makeVariantRecordInfo(self) + + @property + def format(self): + """sample format metadata (see :class:`VariantRecordFormat`)""" + if bcf_unpack(self.ptr, BCF_UN_FMT) < 0: + raise ValueError('Error unpacking VariantRecord') + return makeVariantRecordFormat(self) + + @property + def samples(self): + """sample data (see 
:class:`VariantRecordSamples`)""" + if bcf_unpack(self.ptr, BCF_UN_ALL) < 0: + raise ValueError('Error unpacking VariantRecord') + return makeVariantRecordSamples(self) + + def __str__(self): + cdef kstring_t line + cdef char c + + line.l = line.m = 0 + line.s = NULL + + if vcf_format(self.header.ptr, self.ptr, &line) < 0: + if line.m: + free(line.s) + raise ValueError('vcf_format failed') + + # Strip CR/LF? + #while line.l: + # c = line.s[line.l - 1] + # if c != b'\n' and c != b'\r': + # break + # line.l -= 1 + + ret = charptr_to_str_w_len(line.s, line.l) + + if line.m: + free(line.s) + + return ret + + +cdef VariantRecord makeVariantRecord(VariantHeader header, bcf1_t *r): + if not header: + raise ValueError('invalid VariantHeader') + + if not r: + raise ValueError('cannot create VariantRecord') + + if r.errcode: + msg = [] + #if r.errcode & BCF_ERR_CTG_UNDEF: + # msg.append('undefined contig') + #if r.errcode & BCF_ERR_TAG_UNDEF: + # msg.append('undefined tag') + if r.errcode & BCF_ERR_NCOLS: + msg.append('invalid number of columns') + if r.errcode & BCF_ERR_LIMITS: + msg.append('limits violated') + if r.errcode & BCF_ERR_CHAR: + msg.append('invalid character found') + if r.errcode & BCF_ERR_CTG_INVALID: + msg.append('invalid contig') + if r.errcode & BCF_ERR_TAG_INVALID: + msg.append('invalid tag') + + if msg: + msg = ', '.join(msg) + raise ValueError('Error(s) reading record: {}'.format(msg)) + + cdef VariantRecord record = VariantRecord.__new__(VariantRecord) + record.header = header + record.ptr = r + + return record + + +######################################################################## +######################################################################## +## Variant Sampletype object +######################################################################## + + +cdef class VariantRecordSample(object): + """Data for a single sample from a :class:`VariantRecord` object. + Provides data accessors for genotypes and a mapping interface + from format name to values. + """ + def __init__(self, *args, **kwargs): + raise TypeError('this class cannot be instantiated from Python') + + @property + def name(self): + """sample name""" + cdef bcf_hdr_t *hdr = self.record.header.ptr + cdef bcf1_t *r = self.record.ptr + cdef int32_t n = bcf_hdr_nsamples(hdr) + + if self.index < 0 or self.index >= n: + raise ValueError('invalid sample index') + + return charptr_to_str(hdr.samples[self.index]) + + @property + def allele_indices(self): + """allele indices for called genotype, if present. Otherwise None""" + return bcf_format_get_allele_indices(self) + + @allele_indices.setter + def allele_indices(self, value): + self['GT'] = value + + @allele_indices.deleter + def allele_indices(self): + self['GT'] = () + + @property + def alleles(self): + """alleles for called genotype, if present. Otherwise None""" + return bcf_format_get_alleles(self) + + @alleles.setter + def alleles(self, value): + self['GT'] = value + + @alleles.deleter + def alleles(self): + self['GT'] = () + + @property + def phased(self): + """False if genotype is missing or any allele is unphased. 
Otherwise True."""
+        return bcf_sample_get_phased(self)
+
+    @phased.setter
+    def phased(self, value):
+        bcf_sample_set_phased(self, value)
+
+    def __len__(self):
+        cdef bcf_hdr_t *hdr = self.record.header.ptr
+        cdef bcf1_t *r = self.record.ptr
+        cdef int i, n = 0
+
+        if bcf_unpack(r, BCF_UN_FMT) < 0:
+            raise ValueError('Error unpacking VariantRecord')
+
+        for i in range(r.n_fmt):
+            if r.d.fmt[i].p:
+                n += 1
+        return n
+
+    def __bool__(self):
+        cdef bcf_hdr_t *hdr = self.record.header.ptr
+        cdef bcf1_t *r = self.record.ptr
+        cdef int i
+
+        if bcf_unpack(r, BCF_UN_FMT) < 0:
+            raise ValueError('Error unpacking VariantRecord')
+
+        for i in range(r.n_fmt):
+            if r.d.fmt[i].p:
+                return True
+        return False
+
+    def __getitem__(self, key):
+        return bcf_format_get_value(self, key)
+
+    def __setitem__(self, key, value):
+        bcf_format_set_value(self, key, value)
+
+    def __delitem__(self, key):
+        bcf_format_del_value(self, key)
+
+    def clear(self):
+        """Clear all format data (including genotype) for this sample"""
+        cdef bcf_hdr_t *hdr = self.record.header.ptr
+        cdef bcf1_t *r = self.record.ptr
+        cdef bcf_fmt_t *fmt
+        cdef int i
+
+        for i in range(r.n_fmt):
+            fmt = &r.d.fmt[i]
+            if fmt.p:
+                bcf_format_del_value(self, bcf_hdr_int2id(hdr, BCF_DT_ID, fmt.id))
+
+    def __iter__(self):
+        cdef bcf_hdr_t *hdr = self.record.header.ptr
+        cdef bcf1_t *r = self.record.ptr
+        cdef bcf_fmt_t *fmt
+        cdef int i
+
+        for i in range(r.n_fmt):
+            fmt = &r.d.fmt[i]
+            if r.d.fmt[i].p:
+                yield bcf_str_cache_get_charptr(bcf_hdr_int2id(hdr, BCF_DT_ID, fmt.id))
+
+    def get(self, key, default=None):
+        """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
+        try:
+            return self[key]
+        except KeyError:
+            return default
+
+    def __contains__(self, key):
+        cdef bcf_hdr_t *hdr = self.record.header.ptr
+        cdef bcf1_t *r = self.record.ptr
+        bkey = force_bytes(key)
+        cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey)
+        return fmt != NULL and fmt.p != NULL
+
+    def iterkeys(self):
+        """D.iterkeys() -> an iterator over the keys of D"""
+        return iter(self)
+
+    def itervalues(self):
+        """D.itervalues() -> an iterator over the values of D"""
+        for key in self:
+            yield self[key]
+
+    def iteritems(self):
+        """D.iteritems() -> an iterator over the (key, value) items of D"""
+        for key in self:
+            yield (key, self[key])
+
+    def keys(self):
+        """D.keys() -> list of D's keys"""
+        return list(self)
+
+    def items(self):
+        """D.items() -> list of D's (key, value) pairs, as 2-tuples"""
+        return list(self.iteritems())
+
+    def values(self):
+        """D.values() -> list of D's values"""
+        return list(self.itervalues())
+
+    # Mappings are not hashable by default, but subclasses can change this
+    __hash__ = None
+
+    #TODO: implement __richcmp__
+
+
+cdef VariantRecordSample makeVariantRecordSample(VariantRecord record, int32_t sample_index):
+    if not record or sample_index < 0:
+        raise ValueError('cannot create VariantRecordSample')
+
+    cdef VariantRecordSample sample = VariantRecordSample.__new__(VariantRecordSample)
+    sample.record = record
+    sample.index = sample_index
+
+    return sample
+
+
+########################################################################
+########################################################################
+## Index objects
+########################################################################
+
+
+cdef class BaseIndex(object):
+    def __init__(self):
+        self.refs = ()
+        self.refmap = {}
+
+    def __len__(self):
+        return len(self.refs)
+
+    def __bool__(self):
+        return len(self.refs) != 0
+
+    def __getitem__(self, key):
+        if isinstance(key, int):
+            return self.refs[key]
+        else:
+            return self.refmap[key]
+
+    def __iter__(self):
+        return iter(self.refs)
+
+    def get(self, key, default=None):
+        """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
+        try:
+            return self[key]
+        except KeyError:
+            return default
+
+    def __contains__(self, key):
+        try:
+            self[key]
+        except KeyError:
+            return False
+        else:
+            return True
+
+    def iterkeys(self):
+        """D.iterkeys() -> an iterator over the keys of D"""
+        return iter(self)
+
+    def itervalues(self):
+        """D.itervalues() -> an iterator over the values of D"""
+        for key in self:
+            yield self[key]
+
+    def iteritems(self):
+        """D.iteritems() -> an iterator over the (key, value) items of D"""
+        for key in self:
+            yield (key, self[key])
+
+    def keys(self):
+        """D.keys() -> list of D's keys"""
+        return list(self)
+
+    def items(self):
+        """D.items() -> list of D's (key, value) pairs, as 2-tuples"""
+        return list(self.iteritems())
+
+    def values(self):
+        """D.values() -> list of D's values"""
+        return list(self.itervalues())
+
+    # Mappings are not hashable by default, but subclasses can change this
+    __hash__ = None
+
+    #TODO: implement __richcmp__
+
+
+cdef class BCFIndex(object):
+    """CSI index data structure for BCF files"""
+    def __init__(self):
+        self.refs = ()
+        self.refmap = {}
+
+        if not self.ptr:
+            raise ValueError('Invalid index object')
+
+        cdef int n
+        cdef const char **refs = bcf_index_seqnames(self.ptr, self.header.ptr, &n)
+
+        self.refs = char_array_to_tuple(refs, n, free_after=1) if refs else ()
+        self.refmap = { r:i for i,r in enumerate(self.refs) }
+
+    def __dealloc__(self):
+        if self.ptr:
+            hts_idx_destroy(self.ptr)
+            self.ptr = NULL
+
+    def fetch(self, bcf, contig, start, stop, region, reopen):
+        return BCFIterator(bcf, contig, start, stop, region, reopen)
+
+
+cdef BCFIndex makeBCFIndex(VariantHeader header, hts_idx_t *idx):
+    if not idx:
+        return None
+
+    if not header:
+        raise ValueError('invalid VariantHeader')
+
+    cdef BCFIndex index = BCFIndex.__new__(BCFIndex)
+    index.header = header
+    index.ptr = idx
+    index.__init__()
+
+    return index
+
+
+cdef class TabixIndex(BaseIndex):
+    """Tabix index data structure for VCF files"""
+    def __init__(self):
+        self.refs = ()
+        self.refmap = {}
+
+        if not self.ptr:
+            raise ValueError('Invalid index object')
+
+        cdef int n
+        cdef const char **refs = tbx_seqnames(self.ptr, &n)
+
+        self.refs = char_array_to_tuple(refs, n, free_after=1) if refs else ()
+        self.refmap = { r:i for i,r in enumerate(self.refs) }
+
+    def __dealloc__(self):
+        if self.ptr:
+            tbx_destroy(self.ptr)
+            self.ptr = NULL
+
+    def fetch(self, bcf, contig, start, stop, region, reopen):
+        return TabixIterator(bcf, contig, start, stop, region, reopen)
+
+
+cdef TabixIndex makeTabixIndex(tbx_t *idx):
+    if not idx:
+        return None
+
+    cdef TabixIndex index = TabixIndex.__new__(TabixIndex)
+    index.ptr = idx
+    index.__init__()
+
+    return index
+
+
+########################################################################
+########################################################################
+## Iterators
+########################################################################
+
+
+cdef class BaseIterator(object):
+    pass
+
+
+# Internal function to clean up after iteration stop or failure.
+# This would be a nested function if it weren't a cdef function.
+cdef void _stop_BCFIterator(BCFIterator self, bcf1_t *record): + bcf_destroy1(record) + + # destroy iter so future calls to __next__ raise StopIteration + bcf_itr_destroy(self.iter) + self.iter = NULL + + +cdef class BCFIterator(BaseIterator): + def __init__(self, VariantFile bcf, contig=None, start=None, stop=None, region=None, reopen=True): + if bcf is None: + raise ValueError('bcf must not be None') + + if not isinstance(bcf.index, BCFIndex): + raise ValueError('bcf index required') + + cdef BCFIndex index = bcf.index + cdef int rid, cstart, cstop + cdef char *cregion + + if not index: + raise ValueError('bcf index required') + + if reopen: + bcf = bcf.copy() + + if region is not None: + if contig is not None or start is not None or stop is not None: + raise ValueError # FIXME + + bregion = force_bytes(region) + cregion = bregion + with nogil: + self.iter = bcf_itr_querys(index.ptr, bcf.header.ptr, cregion) + else: + if contig is None: + raise ValueError # FIXME + + try: + rid = index.refmap[contig] + except KeyError: + raise ValueError('Unknown contig specified') + + if start is None: + start = 0 + if stop is None: + stop = MAX_POS + + cstart, cstop = start, stop + + with nogil: + self.iter = bcf_itr_queryi(index.ptr, rid, cstart, cstop) + + # Do not fail on self.iter == NULL, since it signifies a null query. + + self.bcf = bcf + self.index = index + + def __dealloc__(self): + if self.iter: + bcf_itr_destroy(self.iter) + self.iter = NULL + + def __iter__(self): + return self + + def __next__(self): + if not self.iter: + raise StopIteration + + cdef bcf1_t *record = bcf_init1() + + record.pos = -1 + if self.bcf.drop_samples: + record.max_unpack = BCF_UN_SHR + + cdef int ret + + with nogil: + ret = bcf_itr_next(self.bcf.htsfile, self.iter, record) + + if ret < 0: + _stop_BCFIterator(self, record) + if ret == -1: + raise StopIteration + else: + raise ValueError('error reading BCF file') + + ret = bcf_subset_format(self.bcf.header.ptr, record) + + if ret < 0: + _stop_BCFIterator(self, record) + raise ValueError('error in bcf_subset_format') + + return makeVariantRecord(self.bcf.header, record) + + +cdef class TabixIterator(BaseIterator): + def __cinit__(self, *args, **kwargs): + self.line_buffer.l = 0 + self.line_buffer.m = 0 + self.line_buffer.s = NULL + + def __init__(self, VariantFile bcf, contig=None, start=None, stop=None, region=None, reopen=True): + if bcf is None: + raise ValueError('bcf must not be None') + + if not isinstance(bcf.index, TabixIndex): + raise ValueError('tabix index required') + + cdef TabixIndex index = bcf.index + + if not index: + raise ValueError('bcf index required') + + if reopen: + bcf = bcf.copy() + + if region is not None: + if contig is not None or start is not None or stop is not None: + raise ValueError # FIXME + + self.iter = tbx_itr_querys(index.ptr, region) + else: + if contig is None: + raise ValueError # FIXME + + rid = index.refmap.get(contig, -1) + + if start is None: + start = 0 + if stop is None: + stop = MAX_POS + + self.iter = tbx_itr_queryi(index.ptr, rid, start, stop) + + # Do not fail on self.iter == NULL, since it signifies a null query. 
+ + self.bcf = bcf + self.index = index + + def __dealloc__(self): + if self.iter: + tbx_itr_destroy(self.iter) + self.iter = NULL + + if self.line_buffer.m: + free(self.line_buffer.s) + + self.line_buffer.l = 0 + self.line_buffer.m = 0 + self.line_buffer.s = NULL + + def __iter__(self): + return self + + def __next__(self): + if not self.iter: + raise StopIteration + + cdef int ret + + with nogil: + ret = tbx_itr_next(self.bcf.htsfile, self.index.ptr, self.iter, &self.line_buffer) + + if ret < 0: + tbx_itr_destroy(self.iter) + self.iter = NULL + if ret == -1: + raise StopIteration + else: + raise ValueError('error reading indexed VCF file') + + cdef bcf1_t *record = bcf_init1() + + record.pos = -1 + if self.bcf.drop_samples: + record.max_unpack = BCF_UN_SHR + + ret = vcf_parse1(&self.line_buffer, self.bcf.header.ptr, record) + + # FIXME: stop iteration on parse failure? + if ret < 0: + bcf_destroy1(record) + raise ValueError('error in vcf_parse') + + return makeVariantRecord(self.bcf.header, record) + + +######################################################################## +######################################################################## +## Variant File +######################################################################## + + +cdef class VariantFile(HTSFile): + """*(filename, mode=None, index_filename=None, header=None, drop_samples=False, + duplicate_filehandle=True)* + + A :term:`VCF`/:term:`BCF` formatted file. The file is automatically + opened. + + If an index for a variant file exists (.csi or .tbi), it will be + opened automatically. Without an index random access to records + via :meth:`fetch` is disabled. + + For writing, a :class:`VariantHeader` object must be provided, + typically obtained from another :term:`VCF` file/:term:`BCF` + file. + + Parameters + ---------- + mode : string + *mode* should be ``r`` for reading or ``w`` for writing. The default is + text mode (:term:`VCF`). For binary (:term:`BCF`) I/O you should append + ``b`` for compressed or ``u`` for uncompressed :term:`BCF` output. + + If ``b`` is present, it must immediately follow ``r`` or ``w``. Valid + modes are ``r``, ``w``, ``wh``, ``rb``, ``wb``, ``wbu`` and ``wb0``. + For instance, to open a :term:`BCF` formatted file for reading, type:: + + f = pysam.VariantFile('ex1.bcf','r') + + If mode is not specified, we will try to auto-detect the file type. All + of the following should work:: + + f1 = pysam.VariantFile('ex1.bcf') + f2 = pysam.VariantFile('ex1.vcf') + f3 = pysam.VariantFile('ex1.vcf.gz') + + index_filename : string + Explicit path to an index file. + + header : VariantHeader + :class:`VariantHeader` object required for writing. + + drop_samples: bool + Ignore sample information when reading. + + duplicate_filehandle: bool + By default, file handles passed either directly or through + File-like objects will be duplicated before passing them to + htslib. The duplication prevents issues where the same stream + will be closed by htslib and through destruction of the + high-level python object. Set to False to turn off + duplication. 
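+
+    As a usage sketch (the filenames are illustrative only), records can
+    be copied from one file to another by reusing the input header::
+
+        vf_in = pysam.VariantFile('in.vcf.gz')
+        vf_out = pysam.VariantFile('out.bcf', 'wb', header=vf_in.header)
+        for rec in vf_in:
+            vf_out.write(rec)
+        vf_out.close()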
+ + """ + def __cinit__(self, *args, **kwargs): + self.htsfile = NULL + + def __init__(self, *args, **kwargs): + self.header = None + self.index = None + self.filename = None + self.mode = None + self.index_filename = None + self.is_stream = False + self.is_remote = False + self.is_reading = False + self.drop_samples = False + self.header_written = False + self.start_offset = -1 + + self.open(*args, **kwargs) + + def close(self): + """closes the :class:`pysam.VariantFile`.""" + cdef int ret = 0 + self.header = self.index = None + if self.htsfile: + # Write header if no records were written + if self.htsfile.is_write and not self.header_written: + self.header_written = True + with nogil: + bcf_hdr_write(self.htsfile, self.header.ptr) + + ret = hts_close(self.htsfile) + self.htsfile = NULL + + if ret < 0: + global errno + if errno == EPIPE: + errno = 0 + else: + raise OSError(errno, force_str(strerror(errno))) + + def __iter__(self): + if not self.is_open: + raise ValueError('I/O operation on closed file') + + if self.htsfile.is_write: + raise ValueError('cannot iterate over Variantfile opened for writing') + + self.is_reading = 1 + return self + + def __next__(self): + cdef int ret + cdef bcf1_t *record = bcf_init1() + + record.pos = -1 + if self.drop_samples: + record.max_unpack = BCF_UN_SHR + + with nogil: + ret = bcf_read1(self.htsfile, self.header.ptr, record) + + if ret < 0: + bcf_destroy1(record) + if ret == -1: + raise StopIteration + elif ret == -2: + raise IOError('truncated file') + else: + raise ValueError('Variant read failed') + + return makeVariantRecord(self.header, record) + + def copy(self): + if not self.is_open: + raise ValueError + + cdef VariantFile vars = VariantFile.__new__(VariantFile) + cdef bcf_hdr_t *hdr + + # FIXME: re-open using fd or else header and index could be invalid + vars.htsfile = self._open_htsfile() + + if not vars.htsfile: + raise ValueError('Cannot re-open htsfile') + + # minimize overhead by re-using header and index. This approach is + # currently risky, but see above for how this can be mitigated. + vars.header = self.header + vars.index = self.index + + vars.filename = self.filename + vars.mode = self.mode + vars.index_filename = self.index_filename + vars.drop_samples = self.drop_samples + vars.is_stream = self.is_stream + vars.is_remote = self.is_remote + vars.is_reading = self.is_reading + vars.start_offset = self.start_offset + vars.header_written = self.header_written + + if self.htsfile.is_bin: + vars.seek(self.tell()) + else: + with nogil: + hdr = bcf_hdr_read(vars.htsfile) + makeVariantHeader(hdr) + + return vars + + def open(self, filename, mode='r', + index_filename=None, + VariantHeader header=None, + drop_samples=False, + duplicate_filehandle=True): + """open a vcf/bcf file. + + If open is called on an existing VariantFile, the current file will be + closed and a new file will be opened. 
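+
+        For example (the filenames are illustrative only)::
+
+            vf = pysam.VariantFile('a.vcf')
+            vf.open('b.vcf')    # closes 'a.vcf' first, then opens 'b.vcf'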
+        """
+        cdef bcf_hdr_t *hdr
+        cdef BGZF *bgzfp
+        cdef hts_idx_t *idx
+        cdef tbx_t *tidx
+        cdef char *cfilename
+        cdef char *cindex_filename = NULL
+        cdef char *cmode
+
+        # close a previously opened file
+        if self.is_open:
+            self.close()
+
+        if not mode or mode[0] not in 'rwa':
+            raise ValueError('mode must begin with r, w or a')
+
+        self.duplicate_filehandle = duplicate_filehandle
+
+        format_modes = [m for m in mode[1:] if m in 'bcguz']
+        if len(format_modes) > 1:
+            raise ValueError('mode contains conflicting format specifiers: {}'.format(''.join(format_modes)))
+
+        invalid_modes = [m for m in mode[1:] if m not in 'bcguz0123456789ex']
+        if invalid_modes:
+            raise ValueError('invalid mode options: {}'.format(''.join(invalid_modes)))
+
+        # Autodetect mode from filename
+        if mode == 'w' and isinstance(filename, str):
+            if filename.endswith('.gz'):
+                mode = 'wz'
+            elif filename.endswith('.bcf'):
+                mode = 'wb'
+
+        # for htslib, wbu seems to not work
+        if mode == 'wbu':
+            mode = 'wb0'
+
+        self.mode = mode = force_bytes(mode)
+        try:
+            filename = encode_filename(filename)
+            self.is_remote = hisremote(filename)
+            self.is_stream = filename == b'-'
+        except TypeError:
+            filename = filename
+            self.is_remote = False
+            self.is_stream = True
+
+        self.filename = filename
+
+        if index_filename is not None:
+            self.index_filename = index_filename = encode_filename(index_filename)
+        else:
+            self.index_filename = None
+
+        self.drop_samples = bool(drop_samples)
+        self.header = None
+
+        self.header_written = False
+
+        if mode.startswith(b'w'):
+            # open file for writing
+            if index_filename is not None:
+                raise ValueError('Cannot specify an index filename when writing a VCF/BCF file')
+
+            # header structure (used for writing)
+            if header:
+                self.header = header.copy()
+            else:
+                self.header = VariantHeader()
+                #raise ValueError('a VariantHeader must be specified')
+
+            # Header is not written until the first write or on close
+            self.htsfile = self._open_htsfile()
+
+            if not self.htsfile:
+                raise ValueError("could not open file `{}` (mode='{}')".format(filename, mode))
+
+        elif mode.startswith(b'r'):
+            # open file for reading
+
+            if not self._exists():
+                raise IOError('file `{}` not found'.format(filename))
+
+            self.htsfile = self._open_htsfile()
+
+            if not self.htsfile:
+                raise ValueError("could not open file `{}` (mode='{}') - is it VCF/BCF format?".format(filename, mode))
+
+            if self.htsfile.format.format not in (bcf, vcf):
+                raise ValueError("invalid file `{}` (mode='{}') - is it VCF/BCF format?".format(filename, mode))
+
+            if self.htsfile.format.compression == bgzf:
+                bgzfp = hts_get_bgzfp(self.htsfile)
+                if bgzfp and bgzf_check_EOF(bgzfp) == 0:
+                    warn('[{}] Warning: no BGZF EOF marker; file may be truncated'.format(filename))
+
+            with nogil:
+                hdr = bcf_hdr_read(self.htsfile)
+
+            try:
+                self.header = makeVariantHeader(hdr)
+            except ValueError:
+                raise ValueError("file `{}` does not have valid header (mode='{}') - is it VCF/BCF format?".format(filename, mode))
+
+            if isinstance(self.filename, bytes):
+                cfilename = self.filename
+            else:
+                cfilename = NULL
+
+            # check for index and open if present
+            if self.htsfile.format.format == bcf and cfilename:
+                if index_filename is not None:
+                    cindex_filename = index_filename
+                with nogil:
+                    idx = bcf_index_load2(cfilename, cindex_filename)
+                self.index = makeBCFIndex(self.header, idx)
+
+            elif self.htsfile.format.compression == bgzf and cfilename:
+                if index_filename is not None:
+                    cindex_filename = index_filename
+                with nogil:
+                    tidx = tbx_index_load2(cfilename, cindex_filename)
+                self.index = makeTabixIndex(tidx)
+
+            if not self.is_stream:
+                self.start_offset = self.tell()
+        else:
+            raise ValueError("unknown mode {}".format(mode))
+
+    def reset(self):
+        """reset file position to beginning of file just after the header."""
+        return self.seek(self.start_offset)
+
+
+    def fetch(self, contig=None, start=None, stop=None, region=None, reopen=False):
+        """fetch records in a :term:`region` using 0-based indexing. The
+        region is specified by :term:`contig`, *start* and *stop*.
+        Alternatively, a samtools :term:`region` string can be supplied.
+
+        Without *contig* or *region* all mapped records will be fetched. The
+        records will be returned ordered by contig, which will not necessarily
+        be the order within the file.
+
+        Set *reopen* to true if you will be using multiple iterators on the
+        same file at the same time. The iterator returned will receive its
+        own copy of a filehandle to the file, effectively re-opening the
+        file. Re-opening a file incurs some overhead, so use with care.
+
+        If only *contig* is set, all records on *contig* will be fetched.
+        If both *region* and *contig* are given, an exception is raised.
+
+        Note that a bgzipped :term:`VCF`.gz file without a tabix/CSI index
+        (.tbi/.csi) or a :term:`BCF` file without a CSI index can only be
+        read sequentially.
+        """
+        if not self.is_open:
+            raise ValueError('I/O operation on closed file')
+
+        if self.htsfile.is_write:
+            raise ValueError('cannot fetch from Variantfile opened for writing')
+
+        if contig is None and region is None:
+            self.is_reading = 1
+            bcf = self.copy() if reopen else self
+            bcf.seek(self.start_offset)
+            return iter(bcf)
+
+        if not self.index:
+            raise ValueError('fetch requires an index')
+
+        self.is_reading = 1
+        return self.index.fetch(self, contig, start, stop, region, reopen)
+
+    cpdef VariantRecord new_record(self):
+        """Create a new empty VariantRecord"""
+        return self.header.new_record()
+
+    cpdef int write(self, VariantRecord record) except -1:
+        """
+        write a single :class:`pysam.VariantRecord` to disk.
+
+        returns the number of bytes written.
+        """
+        if record is None:
+            raise ValueError('record must not be None')
+
+        if not self.is_open:
+            raise ValueError('I/O operation on closed file')
+
+        if not self.htsfile.is_write:
+            raise ValueError('cannot write to a Variantfile opened for reading')
+
+        if not self.header_written:
+            self.header_written = True
+            with nogil:
+                bcf_hdr_write(self.htsfile, self.header.ptr)
+
+        #if record.header is not self.header:
+        #    record.translate(self.header)
+        #    raise ValueError('Writing records from a different VariantFile is not yet supported')
+
+        if record.ptr.n_sample != bcf_hdr_nsamples(self.header.ptr):
+            msg = 'Invalid VariantRecord. Number of samples does not match header ({} vs {})'
+            raise ValueError(msg.format(record.ptr.n_sample, bcf_hdr_nsamples(self.header.ptr)))
+
+        cdef int ret
+
+        with nogil:
+            ret = bcf_write1(self.htsfile, self.header.ptr, record.ptr)
+
+        if ret < 0:
+            raise IOError(errno, strerror(errno))
+
+        return ret
+
+    def subset_samples(self, include_samples):
+        """
+        Read only a subset of samples to reduce processing time and memory.
+        Must be called prior to retrieving records.
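+
+        For example (the filename and sample name are illustrative only)::
+
+            vf = pysam.VariantFile('example.vcf')
+            vf.subset_samples(['NA12878'])
+            for rec in vf:
+                print(rec.samples.keys())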
+ """ + if not self.is_open: + raise ValueError('I/O operation on closed file') + + if self.htsfile.is_write: + raise ValueError('cannot subset samples from Variantfile opened for writing') + + if self.is_reading: + raise ValueError('cannot subset samples after fetching records') + + self.header._subset_samples(include_samples) + + # potentially unnecessary optimization that also sets max_unpack + if not include_samples: + self.drop_samples = True diff --git a/pysam/libcbgzf.pyx b/pysam/libcbgzf.pyx new file mode 100644 index 0000000..558ceff --- /dev/null +++ b/pysam/libcbgzf.pyx @@ -0,0 +1,209 @@ +"""Functions that read and write block gzipped files. + +The user of the file doesn't have to worry about the compression +and random access is allowed if an index file is present.""" + +# based on Python 3.5's gzip module + +import io + +from libc.stdint cimport int8_t, int16_t, int32_t, int64_t +from libc.stdint cimport uint8_t, uint16_t, uint32_t, uint64_t +from libc.stdlib cimport malloc, calloc, realloc, free + +from cpython.object cimport PyObject +from cpython.bytes cimport PyBytes_FromStringAndSize, _PyBytes_Resize + +from pysam.libcutils cimport force_bytes, force_str, charptr_to_str, charptr_to_str_w_len +from pysam.libchtslib cimport * + + +__all__ = ["BGZFile"] + + +BUFFER_SIZE = io.DEFAULT_BUFFER_SIZE + + +cdef class BGZFile(object): + """The BGZFile class simulates most of the methods of a file object with + the exception of the truncate() method. + + This class only supports opening files in binary mode. If you need to open a + compressed file in text mode, use the gzip.open() function. + """ + cdef BGZF* bgzf + cdef bytes name, index + + def __init__(self, filename, mode=None, index=None): + """Constructor for the BGZFile class. + + The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', 'wb', 'x', or + 'xb' depending on whether the file will be read or written. The default + is the mode of fileobj if discernible; otherwise, the default is 'rb'. + A mode of 'r' is equivalent to one of 'rb', and similarly for 'w' and + 'wb', 'a' and 'ab', and 'x' and 'xb'. 
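+
+        For example (the filename is illustrative only)::
+
+            with BGZFile('data.txt.gz', 'wb') as f:
+                f.write(b'hello world\n')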
+ """ + if mode and ('t' in mode or 'U' in mode): + raise ValueError("Invalid mode: {!r}".format(mode)) + if not mode: + mode = 'rb' + if mode and 'b' not in mode: + mode += 'b' + self.name = force_bytes(filename) + self.index = force_bytes(index) if index is not None else None + self.bgzf = bgzf_open(self.name, mode) + + if self.bgzf.is_write and index is not None and bgzf_index_build_init(self.bgzf) < 0: + raise IOError('Error building bgzf index') + + def __dealloc__(self): + self.close() + + def write(self,data): + if not self.bgzf: + raise ValueError("write() on closed BGZFile object") + + if not self.bgzf.is_write: + import errno + raise OSError(errno.EBADF, "write() on read-only BGZFile object") + + if isinstance(data, bytes): + length = len(data) + else: + # accept any data that supports the buffer protocol + data = memoryview(data) + length = data.nbytes + + if length > 0 and bgzf_write(self.bgzf, data, length) < 0: + raise IOError('BGZFile write failed') + + return length + + def read(self, size=-1): + cdef ssize_t read_size + + if not self.bgzf: + raise ValueError("read() on closed BGZFile object") + + if self.bgzf.is_write: + import errno + raise OSError(errno.EBADF, "read() on write-only BGZFile object") + + if size < 0: + chunks = [] + while 1: + chunk = PyBytes_FromStringAndSize(NULL, BUFFER_SIZE) + cdata = chunk + read_size = bgzf_read(self.bgzf, chunk, BUFFER_SIZE) + if read_size < 0: + raise IOError('Error reading from BGZFile') + elif not read_size: + break + elif read_size < BUFFER_SIZE: + chunk = chunk[:read_size] + chunks.append(chunk) + return b''.join(chunks) + + elif size > 0: + chunk = PyBytes_FromStringAndSize(NULL, size) + read_size = bgzf_read(self.bgzf, chunk, size) + if read_size < 0: + raise IOError('Error reading from BGZFile') + elif read_size < size: + chunk = chunk[:size] + return chunk + else: + return b'' + + @property + def closed(self): + return self.bgzf == NULL + + def close(self): + if not self.bgzf: + return + + if self.bgzf.is_write and bgzf_flush(self.bgzf) < 0: + raise IOError('Error flushing BGZFile object') + + if self.index and bgzf_index_dump(self.bgzf, self.index, NULL) < 0: + raise IOError('Cannot write index') + + cdef ret = bgzf_close(self.bgzf) + self.bgzf = NULL + + if ret < 0: + raise IOError('Error closing BGZFile object') + + def __enter__(self): + return self + + def __exit__(self, type, value, tb): + self.close() + + def flush(self): + if not self.bgzf: + return + + if self.bgzf.is_write and bgzf_flush(self.bgzf) < 0: + raise IOError('Error flushing BGZFile object') + + def fileno(self): + """Invoke the underlying file object's fileno() method. + + This will raise AttributeError if the underlying file object + doesn't support fileno(). 
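+
+        BGZFile wraps a BGZF stream rather than a Python file object,
+        so this always raises AttributeError.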
+        """
+        raise AttributeError('fileno')
+
+    def rewind(self):
+        '''Return the uncompressed stream file position indicator to the
+        beginning of the file'''
+        if not self.bgzf:
+            raise ValueError("rewind() on closed BGZFile object")
+        if self.bgzf.is_write:
+            raise OSError("Can't rewind in write mode")
+        if bgzf_seek(self.bgzf, 0, SEEK_SET) < 0:
+            raise IOError('Error seeking BGZFile object')
+
+    def readable(self):
+        if not self.bgzf:
+            raise ValueError("readable() on closed BGZFile object")
+        return self.bgzf != NULL and not self.bgzf.is_write
+
+    def writable(self):
+        return self.bgzf != NULL and self.bgzf.is_write
+
+    def seekable(self):
+        return True
+
+    def seek(self, offset, whence=io.SEEK_SET):
+        if not self.bgzf:
+            raise ValueError("seek() on closed BGZFile object")
+        if whence != io.SEEK_SET:
+            raise ValueError('seek() only supports SEEK_SET')
+
+        cdef int64_t off = bgzf_seek(self.bgzf, offset, SEEK_SET)
+        if off < 0:
+            raise IOError('Error seeking BGZFile object')
+
+        return off
+
+    def readline(self, size=-1):
+        if not self.bgzf:
+            raise ValueError("readline() on closed BGZFile object")
+
+        cdef kstring_t line
+        cdef char c
+
+        line.l = line.m = 0
+        line.s = NULL
+        if bgzf_getline(self.bgzf, '\n', &line) < 0:
+            raise IOError('Error reading line in BGZFile object')
+
+        ret = charptr_to_str_w_len(line.s, line.l)
+
+        if line.m:
+            free(line.s)
+
+        return ret
diff --git a/pysam/libcfaidx.pxd b/pysam/libcfaidx.pxd
new file mode 100644
index 0000000..2f5f44b
--- /dev/null
+++ b/pysam/libcfaidx.pxd
@@ -0,0 +1,79 @@
+from libc.stdint cimport int8_t, int16_t, int32_t, int64_t
+from libc.stdint cimport uint8_t, uint16_t, uint32_t, uint64_t
+from libc.stdlib cimport malloc, calloc, realloc, free
+from libc.string cimport memcpy, memcmp, strncpy, strlen, strdup
+from libc.stdio cimport FILE, printf
+cimport cython
+
+from cpython cimport array
+from pysam.libchtslib cimport faidx_t, kstring_t, BGZF
+
+# These functions are put here and not in chtslib.pxd in order
+# to avoid warnings for unused functions.
+cdef extern from "pysam_stream.h" nogil:
+
+    ctypedef struct kstream_t:
+        pass
+
+    ctypedef struct kseq_t:
+        kstring_t name
+        kstring_t comment
+        kstring_t seq
+        kstring_t qual
+
+    kseq_t *kseq_init(BGZF *)
+    int kseq_read(kseq_t *)
+    void kseq_destroy(kseq_t *)
+    kstream_t *ks_init(BGZF *)
+    void ks_destroy(kstream_t *)
+
+    # Retrieve characters from stream until delimiter
+    # is reached, placing results in str.
+    int ks_getuntil(kstream_t *,
+                    int delimiter,
+                    kstring_t * str,
+                    int * dret)
+
+cdef class FastaFile:
+    cdef bint is_remote
+    cdef object _filename, _references, _lengths, reference2length
+    cdef faidx_t* fastafile
+    cdef char* _fetch(self, char* reference,
+                      int start, int end, int* length)
+
+
+cdef class FastqProxy:
+    cdef kseq_t * _delegate
+    cdef cython.str tostring(self)
+    cpdef array.array get_quality_array(self, int offset=*)
+
+
+cdef class PersistentFastqProxy:
+    """
+    Python container for pysam.libcfaidx.FastqProxy with persistence.
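+    Needed to compare multiple fastq records from the same file, since
+    a plain FastqProxy is only valid until the iteration advances.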
+    """
+    cdef public str comment, quality, sequence, name
+    cdef cython.str tostring(self)
+    cpdef array.array get_quality_array(self, int offset=*)
+
+
+cdef class FastxFile:
+    cdef object _filename
+    cdef BGZF * fastqfile
+    cdef kseq_t * entry
+    cdef bint persist
+    cdef bint is_remote
+
+    cdef kseq_t * getCurrent(self)
+    cdef int cnext(self)
+
+
+# Compatibility Layer for pysam 0.8.1
+cdef class FastqFile(FastxFile):
+    pass
+
+
+# Compatibility Layer for pysam < 0.8
+cdef class Fastafile(FastaFile):
+    pass
+
diff --git a/pysam/libcfaidx.pyx b/pysam/libcfaidx.pyx
new file mode 100644
index 0000000..774152d
--- /dev/null
+++ b/pysam/libcfaidx.pyx
@@ -0,0 +1,572 @@
+# cython: embedsignature=True
+# cython: profile=True
+###############################################################################
+###############################################################################
+# Cython wrapper for FASTA/FASTQ files based on htslib
+###############################################################################
+# The principal classes defined in this module are:
+#
+# class FastaFile   random read access to faidx indexed files
+# class FastxFile   streamed read access to fasta/fastq files
+#
+# Additionally this module defines several additional classes that are part
+# of the internal API. These are:
+#
+# class FastqProxy
+# class PersistentFastqProxy
+#
+# For backwards compatibility, the following classes are also defined:
+#
+# class Fastafile   equivalent to FastaFile
+# class FastqFile   equivalent to FastxFile
+#
+###############################################################################
+#
+# The MIT License
+#
+# Copyright (c) 2015 Andreas Heger
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+#
+###############################################################################
+import sys
+import os
+import re
+from cpython cimport array
+
+from cpython cimport PyErr_SetString, \
+    PyBytes_Check, \
+    PyUnicode_Check, \
+    PyBytes_FromStringAndSize
+
+from cpython.version cimport PY_MAJOR_VERSION
+
+from pysam.libchtslib cimport \
+    faidx_nseq, fai_load, fai_destroy, fai_fetch, \
+    faidx_seq_len, \
+    faidx_fetch_seq, hisremote, \
+    bgzf_open, bgzf_close
+
+from pysam.libcutils cimport force_bytes, force_str, charptr_to_str
+from pysam.libcutils cimport encode_filename, from_string_and_size
+from pysam.libcutils cimport qualitystring_to_array, parse_region
+
+cdef class FastqProxy
+cdef makeFastqProxy(kseq_t * src):
+    '''wrap the kseq_t struct in src in a FastqProxy.'''
+    cdef FastqProxy dest = FastqProxy.__new__(FastqProxy)
+    dest._delegate = src
+    return dest
+
+## TODO:
+## add automatic indexing.
+## add function to get sequence names.
+cdef class FastaFile:
+    """Random access to fasta formatted files that
+    have been indexed by :term:`faidx`.
+
+    The file is automatically opened. The index file of file
+    ``<filename>`` is expected to be called ``<filename>.fai``.
+
+    Parameters
+    ----------
+
+    filename : string
+        Filename of fasta file to be opened.
+
+    filepath_index : string
+        Optional, filename of the index. By default this is
+        the filename + ".fai".
+
+    Raises
+    ------
+
+    ValueError
+        if index file is missing
+
+    IOError
+        if file could not be opened
+
+    """
+
+    def __cinit__(self, *args, **kwargs):
+        self.fastafile = NULL
+        self._filename = None
+        self._references = None
+        self._lengths = None
+        self.reference2length = None
+        self._open(*args, **kwargs)
+
+    def is_open(self):
+        '''return true if the file has been opened.'''
+        return self.fastafile != NULL
+
+    def __len__(self):
+        if self.fastafile == NULL:
+            raise ValueError("calling len() on closed file")
+
+        return faidx_nseq(self.fastafile)
+
+    def _open(self, filename, filepath_index=None):
+        '''open an indexed fasta file.
+
+        This method expects an indexed fasta file.
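+        The index is typically created with ``samtools faidx <filename>``
+        or, from within pysam, with ``pysam.faidx(filename)``.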
+        '''
+
+        # close a previously opened file
+        if self.fastafile != NULL:
+            self.close()
+
+        self._filename = encode_filename(filename)
+        cdef char *cfilename = self._filename
+        self.is_remote = hisremote(cfilename)
+
+        if filepath_index is not None:
+            raise NotImplementedError(
+                "setting an explicit path for the index "
+                "is not implemented")
+
+        # open file for reading
+        if (self._filename != b"-"
+            and not self.is_remote
+            and not os.path.exists(filename)):
+            raise IOError("file `%s` not found" % filename)
+
+        with nogil:
+            self.fastafile = fai_load(cfilename)
+
+        if self.fastafile == NULL:
+            raise IOError("could not open file `%s`" % filename)
+
+        if self.is_remote:
+            filepath_index = os.path.basename(
+                re.sub("[^:]+:[/]*", "", filename)) + ".fai"
+        elif filepath_index is None:
+            filepath_index = filename + ".fai"
+
+        if not os.path.exists(filepath_index):
+            raise ValueError("could not locate index file {}".format(
+                filepath_index))
+
+        with open(filepath_index) as inf:
+            data = [x.split("\t") for x in inf]
+            self._references = tuple(x[0] for x in data)
+            self._lengths = tuple(int(x[1]) for x in data)
+            self.reference2length = dict(zip(self._references, self._lengths))
+
+    def close(self):
+        """close the file."""
+        if self.fastafile != NULL:
+            fai_destroy(self.fastafile)
+            self.fastafile = NULL
+
+    def __dealloc__(self):
+        if self.fastafile != NULL:
+            fai_destroy(self.fastafile)
+            self.fastafile = NULL
+
+    # context manager interface
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        self.close()
+        return False
+
+    property closed:
+        """bool indicating the current state of the file object.
+        This is a read-only attribute; the close() method changes the value.
+        """
+        def __get__(self):
+            return not self.is_open()
+
+    property filename:
+        """filename associated with this object. This is a read-only attribute."""
+        def __get__(self):
+            return self._filename
+
+    property references:
+        '''tuple with the names of :term:`reference` sequences.'''
+        def __get__(self):
+            return self._references
+
+    property nreferences:
+        """int with the number of :term:`reference` sequences in the file.
+        This is a read-only attribute."""
+        def __get__(self):
+            return len(self._references) if self.references else None
+
+    property lengths:
+        """tuple with the lengths of :term:`reference` sequences."""
+        def __get__(self):
+            return self._lengths
+
+    def fetch(self,
+              reference=None,
+              start=None,
+              end=None,
+              region=None):
+        """fetch sequences in a :term:`region`.
+
+        A region can either be specified by :term:`reference`, `start`
+        and `end`. `start` and `end` denote 0-based, half-open
+        intervals.
+
+        Alternatively, a samtools :term:`region` string can be
+        supplied. Note that region strings are 1-based, while `start`
+        and `end` denote an interval in python coordinates.
+
+        If any of the coordinates are missing they will be replaced by the
+        minimum (`start`) or maximum (`end`) coordinate.
+
+        Returns
+        -------
+
+        string : a string with the sequence specified by the region.
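+
+        Example (a sketch; assumes an indexed file ``ref.fa`` with a
+        contig named ``chr1``)::
+
+            fa = pysam.FastaFile("ref.fa")
+            fa.fetch("chr1", 0, 10)        # first ten bases, 0-based half-open
+            fa.fetch(region="chr1:1-10")   # same bases via a 1-based region string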
+ + Raises + ------ + + IndexError + if the coordinates are out of range + + ValueError + if the region is invalid + + """ + + if not self.is_open(): + raise ValueError("I/O operation on closed file" ) + + cdef int length + cdef char *seq + cdef char *ref + cdef int rstart, rend + + reference, rstart, rend = parse_region(reference, start, end, region) + + if reference is None: + raise ValueError("no sequence/region supplied.") + + if rstart == rend: + return "" + + ref = reference + with nogil: + length = faidx_seq_len(self.fastafile, ref) + if length == -1: + raise KeyError("sequence '%s' not present" % reference) + if rstart >= length: + return "" + + # fai_fetch adds a '\0' at the end + with nogil: + seq = faidx_fetch_seq(self.fastafile, + ref, + rstart, + rend-1, + &length) + + if seq == NULL: + raise ValueError( + "failure when retrieving sequence on '%s'" % reference) + + try: + return charptr_to_str(seq) + finally: + free(seq) + + cdef char * _fetch(self, char * reference, int start, int end, int * length): + '''fetch sequence for reference, start and end''' + + with nogil: + return faidx_fetch_seq(self.fastafile, + reference, + start, + end-1, + length) + + def get_reference_length(self, reference): + '''return the length of reference.''' + return self.reference2length[reference] + + def __getitem__(self, reference): + return self.fetch(reference) + + def __contains__(self, reference): + '''return true if reference in fasta file.''' + return reference in self.reference2length + + +cdef class FastqProxy: + """A single entry in a fastq file.""" + def __init__(self): pass + + property name: + """The name of each entry in the fastq file.""" + def __get__(self): + return charptr_to_str(self._delegate.name.s) + + property sequence: + """The sequence of each entry in the fastq file.""" + def __get__(self): + return charptr_to_str(self._delegate.seq.s) + + property comment: + def __get__(self): + if self._delegate.comment.l: + return charptr_to_str(self._delegate.comment.s) + else: + return None + + property quality: + """The quality score of each entry in the fastq file, represented as a string.""" + def __get__(self): + if self._delegate.qual.l: + return charptr_to_str(self._delegate.qual.s) + else: + return None + + cdef cython.str tostring(self): + if self.comment is None: + comment = "" + else: + comment = " %s" % self.comment + + if self.quality is None: + return ">%s%s\n%s" % (self.name, comment, self.sequence) + else: + return "@%s%s\n%s\n+\n%s" % (self.name, comment, + self.sequence, self.quality) + + def __str__(self): + return self.tostring() + + cpdef array.array get_quality_array(self, int offset=33): + '''return quality values as integer array after subtracting offset.''' + if self.quality is None: + return None + return qualitystring_to_array(force_bytes(self.quality), + offset=offset) + +cdef class PersistentFastqProxy: + """ + Python container for pysam.libcfaidx.FastqProxy with persistence. + Needed to compare multiple fastq records from the same file. 
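+
+    A minimal sketch of why persistence matters (assumes a file
+    ``reads.fastq`` exists)::
+
+        with pysam.FastxFile("reads.fastq", persist=True) as fh:
+            entries = list(fh)      # entries remain valid after iteration
+        names = [e.name for e in entries]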
+    """
+    def __init__(self, FastqProxy FastqRead):
+        self.comment = FastqRead.comment
+        self.quality = FastqRead.quality
+        self.sequence = FastqRead.sequence
+        self.name = FastqRead.name
+
+    cdef cython.str tostring(self):
+        if self.comment is None:
+            comment = ""
+        else:
+            comment = " %s" % self.comment
+
+        if self.quality is None:
+            return ">%s%s\n%s" % (self.name, comment, self.sequence)
+        else:
+            return "@%s%s\n%s\n+\n%s" % (self.name, comment,
+                                         self.sequence, self.quality)
+
+    def __str__(self):
+        return self.tostring()
+
+    cpdef array.array get_quality_array(self, int offset=33):
+        '''return quality values as array after subtracting offset.'''
+        if self.quality is None:
+            return None
+        return qualitystring_to_array(force_bytes(self.quality),
+                                      offset=offset)
+
+
+cdef class FastxFile:
+    """Stream access to :term:`fasta` or :term:`fastq` formatted files.
+
+    The file is automatically opened.
+
+    Entries in the file can be both fastq or fasta formatted or even a
+    mixture of the two.
+
+    This file object permits iterating over all entries in the
+    file. Random access is not implemented. The iteration returns
+    objects of type :class:`FastqProxy`.
+
+    Parameters
+    ----------
+
+    filename : string
+        Filename of fasta/fastq file to be opened.
+
+    persist : bool
+
+        If True (default) make a copy of the entry in the file during
+        iteration. If set to False, no copy will be made. This will
+        permit faster iteration, but an entry will not persist when
+        the iteration continues.
+
+    Notes
+    -----
+    Prior to version 0.8.2, this was called FastqFile.
+
+    Raises
+    ------
+
+    IOError
+        if file could not be opened
+
+
+    Examples
+    --------
+    >>> with pysam.FastxFile(filename) as fh:
+    ...    for entry in fh:
+    ...        print(entry.name)
+    ...        print(entry.sequence)
+    ...        print(entry.comment)
+    ...        print(entry.quality)
+
+    """
+    def __cinit__(self, *args, **kwargs):
+        # self.fastqfile = NULL
+        self._filename = None
+        self.entry = NULL
+        self._open(*args, **kwargs)
+
+    def is_open(self):
+        '''return true if the file has been opened.'''
+        return self.entry != NULL
+
+    def _open(self, filename, persist=True):
+        '''open a fastq/fasta file in *filename*
+
+        Parameters
+        ----------
+
+        persist : bool
+
+            if True return a copy of the underlying data (default
+            True). The copy will persist even if the iteration
+            on the file continues.
+
+        '''
+        if self.fastqfile != NULL:
+            self.close()
+
+        self._filename = encode_filename(filename)
+        cdef char *cfilename = self._filename
+        self.is_remote = hisremote(cfilename)
+
+        # open file for reading
+        if (self._filename != b"-"
+            and not self.is_remote
+            and not os.path.exists(filename)):
+            raise IOError("file `%s` not found" % filename)
+
+        self.persist = persist
+
+        with nogil:
+            self.fastqfile = bgzf_open(cfilename, "r")
+        self.entry = kseq_init(self.fastqfile)
+        self._filename = filename
+
+    def close(self):
+        '''close the file.'''
+        if self.fastqfile != NULL:
+            bgzf_close(self.fastqfile)
+            self.fastqfile = NULL
+        if self.entry != NULL:
+            kseq_destroy(self.entry)
+            self.entry = NULL
+
+    def __dealloc__(self):
+        if self.fastqfile != NULL:
+            bgzf_close(self.fastqfile)
+        if self.entry:
+            kseq_destroy(self.entry)
+
+    # context manager interface
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        self.close()
+        return False
+
+    property closed:
+        """bool indicating the current state of the file object.
+        This is a read-only attribute; the close() method changes the value.
+ """ + def __get__(self): + return not self.is_open() + + property filename: + """string with the filename associated with this object.""" + def __get__(self): + return self._filename + + def __iter__(self): + if not self.is_open(): + raise ValueError("I/O operation on closed file") + return self + + cdef kseq_t * getCurrent(self): + return self.entry + + cdef int cnext(self): + '''C version of iterator + ''' + with nogil: + return kseq_read(self.entry) + + def __next__(self): + """ + python version of next(). + """ + cdef int l + with nogil: + l = kseq_read(self.entry) + if (l >= 0): + if self.persist: + return PersistentFastqProxy(makeFastqProxy(self.entry)) + return makeFastqProxy(self.entry) + else: + raise StopIteration + +# Compatibility Layer for pysam 0.8.1 +cdef class FastqFile(FastxFile): + """FastqFile is deprecated: use FastxFile instead""" + pass + +# Compatibility Layer for pysam < 0.8 +cdef class Fastafile(FastaFile): + """Fastafile is deprecated: use FastaFile instead""" + pass + +__all__ = ["FastaFile", + "FastqFile", + "FastxFile", + "Fastafile", + "FastqProxy"] diff --git a/pysam/libchtslib.pxd b/pysam/libchtslib.pxd new file mode 100644 index 0000000..657a754 --- /dev/null +++ b/pysam/libchtslib.pxd @@ -0,0 +1,1916 @@ +from libc.stdint cimport int8_t, int16_t, int32_t, int64_t +from libc.stdint cimport uint8_t, uint16_t, uint32_t, uint64_t +from libc.stdlib cimport malloc, calloc, realloc, free +from libc.string cimport memcpy, memcmp, strncpy, strlen, strdup +from libc.stdio cimport FILE, printf +from posix.types cimport off_t + +cdef extern from "Python.h": + FILE* PyFile_AsFile(object) + + +cdef extern from "htslib/kstring.h" nogil: + ctypedef struct kstring_t: + size_t l, m + char *s + + +cdef extern from "htslib_util.h" nogil: + int hts_set_verbosity(int verbosity) + int hts_get_verbosity() + + ctypedef uint32_t khint32_t + ctypedef uint32_t khint_t + ctypedef khint_t khiter_t + + # Used to manage BCF Header info + ctypedef struct vdict_t: + khint_t n_buckets, size, n_occupied, upper_bound + khint32_t *flags + const char *keys + bcf_idinfo_t *vals + + # Used to manage indexed contigs in Tabix + ctypedef struct s2i_t: + khint_t n_buckets, size, n_occupied, upper_bound + khint32_t *flags + const char *keys + int64_t *vals + + # Generic khash methods + khint_t kh_size(void *d) + khint_t kh_begin(void *d) + khint_t kh_end(void *d) + int kh_exist(void *d, khiter_t i) + + # Specialized khash methods for vdict + khint_t kh_get_vdict(vdict_t *d, const char *key) + const char *kh_key_vdict "kh_key" (vdict_t *d, khint_t i) + bcf_idinfo_t kh_val_vdict "kh_val" (vdict_t *d, khint_t i) + + +cdef extern from "htslib/hfile.h" nogil: + ctypedef struct hFILE + + # @abstract Open the named file or URL as a stream + # @return An hFILE pointer, or NULL (with errno set) if an error occurred. + hFILE *hopen(const char *filename, const char *mode) + + # @abstract Associate a stream with an existing open file descriptor + # @return An hFILE pointer, or NULL (with errno set) if an error occurred. + # @notes For socket descriptors (on Windows), mode should contain 's'. + hFILE *hdopen(int fd, const char *mode) + + # @abstract Report whether the file name or URL denotes remote storage + # @return 0 if local, 1 if remote. + # @notes "Remote" means involving e.g. explicit network access, with the + # implication that callers may wish to cache such files' contents locally. 
+ int hisremote(const char *filename) + + # @abstract Flush (for output streams) and close the stream + # @return 0 if successful, or EOF (with errno set) if an error occurred. + int hclose(hFILE *fp) + + # @abstract Close the stream, without flushing or propagating errors + # @notes For use while cleaning up after an error only. Preserves errno. + void hclose_abruptly(hFILE *fp) + + # @abstract Return the stream's error indicator + # @return Non-zero (in fact, an errno value) if an error has occurred. + # @notes This would be called herror() and return true/false to parallel + # ferror(3), but a networking-related herror(3) function already exists. */ + int herrno(hFILE *fp) + + # @abstract Clear the stream's error indicator + void hclearerr(hFILE *fp) + + # @abstract Reposition the read/write stream offset + # @return The resulting offset within the stream (as per lseek(2)), + # or negative if an error occurred. + off_t hseek(hFILE *fp, off_t offset, int whence) + + # @abstract Report the current stream offset + # @return The offset within the stream, starting from zero. + off_t htell(hFILE *fp) + + # @abstract Read one character from the stream + # @return The character read, or EOF on end-of-file or error + int hgetc(hFILE *fp) + + # @abstract Peek at characters to be read without removing them from buffers + # @param fp The file stream + # @param buffer The buffer to which the peeked bytes will be written + # @param nbytes The number of bytes to peek at; limited by the size of the + # internal buffer, which could be as small as 4K. + # @return The number of bytes peeked, which may be less than nbytes if EOF + # is encountered; or negative, if there was an I/O error. + # @notes The characters peeked at remain in the stream's internal buffer, + # and will be returned by later hread() etc calls. + ssize_t hpeek(hFILE *fp, void *buffer, size_t nbytes) + + # @abstract Read a block of characters from the file + # @return The number of bytes read, or negative if an error occurred. + # @notes The full nbytes requested will be returned, except as limited + # by EOF or I/O errors. + ssize_t hread(hFILE *fp, void *buffer, size_t nbytes) + + # @abstract Write a character to the stream + # @return The character written, or EOF if an error occurred. + int hputc(int c, hFILE *fp) + + # @abstract Write a string to the stream + # @return 0 if successful, or EOF if an error occurred. + int hputs(const char *text, hFILE *fp) + + # @abstract Write a block of characters to the file + # @return Either nbytes, or negative if an error occurred. + # @notes In the absence of I/O errors, the full nbytes will be written. + ssize_t hwrite(hFILE *fp, const void *buffer, size_t nbytes) + + # @abstract For writing streams, flush buffered output to the underlying stream + # @return 0 if successful, or EOF if an error occurred. + int hflush(hFILE *fp) + + +cdef extern from "htslib/bgzf.h" nogil: + ctypedef struct bgzf_mtaux_t + ctypedef struct bgzidx_t + ctypedef struct z_stream + + ctypedef struct BGZF: + unsigned errcode + unsigned is_write + int is_be + int compress_level + int is_compressed + int is_gzip + int cache_size + int64_t block_address + int64_t uncompressed_address + void *uncompressed_block + void *compressed_block + void *cache + hFILE *fp + bgzf_mtaux_t *mt + bgzidx_t *idx + int idx_build_otf + z_stream *gz_stream + + #***************** + # Basic routines * + # *****************/ + + # Open an existing file descriptor for reading or writing. 
+ # + # @param fd file descriptor + # @param mode mode matching /[rwag][u0-9]+/: 'r' for reading, 'w' for + # writing, 'a' for appending, 'g' for gzip rather than BGZF + # compression (with 'w' only), and digit specifies the zlib + # compression level. + # Note that there is a distinction between 'u' and '0': the + # first yields plain uncompressed output whereas the latter + # outputs uncompressed data wrapped in the zlib format. + # @return BGZF file handler; 0 on error + + BGZF* bgzf_dopen(int fd, const char *mode) + BGZF* bgzf_fdopen(int fd, const char *mode) # for backward compatibility + + # Open the specified file for reading or writing. + BGZF* bgzf_open(const char* path, const char *mode) + + # Open an existing hFILE stream for reading or writing. + BGZF* bgzf_hopen(hFILE *fp, const char *mode) + + # Close the BGZF and free all associated resources. + # + # @param fp BGZF file handler + # @return 0 on success and -1 on error + int bgzf_close(BGZF *fp) + + # Read up to _length_ bytes from the file storing into _data_. + # + # @param fp BGZF file handler + # @param data data array to read into + # @param length size of data to read + # @return number of bytes actually read; 0 on end-of-file and -1 on error + ssize_t bgzf_read(BGZF *fp, void *data, size_t length) + + # Write _length_ bytes from _data_ to the file. If no I/O errors occur, + # the complete _length_ bytes will be written (or queued for writing). + # + # @param fp BGZF file handler + # @param data data array to write + # @param length size of data to write + # @return number of bytes written (i.e., _length_); negative on error + ssize_t bgzf_write(BGZF *fp, const void *data, size_t length) + + # Read up to _length_ bytes directly from the underlying stream without + # decompressing. Bypasses BGZF blocking, so must be used with care in + # specialised circumstances only. + # + # @param fp BGZF file handler + # @param data data array to read into + # @param length number of raw bytes to read + # @return number of bytes actually read; 0 on end-of-file and -1 on error + ssize_t bgzf_raw_read(BGZF *fp, void *data, size_t length) + + # Write _length_ bytes directly to the underlying stream without + # compressing. Bypasses BGZF blocking, so must be used with care + # in specialised circumstances only. + # + # @param fp BGZF file handler + # @param data data array to write + # @param length number of raw bytes to write + # @return number of bytes actually written; -1 on error + ssize_t bgzf_raw_write(BGZF *fp, const void *data, size_t length) + + # Write the data in the buffer to the file. + int bgzf_flush(BGZF *fp) + + int SEEK_SET + + # Return a virtual file pointer to the current location in the file. + # No interpetation of the value should be made, other than a subsequent + # call to bgzf_seek can be used to position the file at the same point. + # Return value is non-negative on success. + int64_t bgzf_tell(BGZF *fp) + + # Set the file to read from the location specified by _pos_. 
+    #
+    # @param fp     BGZF file handler
+    # @param pos    virtual file offset returned by bgzf_tell()
+    # @param whence must be SEEK_SET
+    # @return       0 on success and -1 on error
+    # /
+    int64_t bgzf_seek(BGZF *fp, int64_t pos, int whence)
+
+    # Check if the BGZF end-of-file (EOF) marker is present
+    #
+    # @param fp    BGZF file handler opened for reading
+    # @return      1 if the EOF marker is present and correct
+    #              2 if it can't be checked, e.g., because fp isn't seekable
+    #              0 if the EOF marker is absent
+    #              -1 (with errno set) on error
+    int bgzf_check_EOF(BGZF *fp)
+
+    # Check if a file is in the BGZF format
+    #
+    # @param fn    file name
+    # @return      1 if _fn_ is BGZF; 0 if not or on I/O error
+    int bgzf_is_bgzf(const char *fn)
+
+    #*********************
+    # Advanced routines *
+    #*********************
+
+    # Set the cache size. Only effective when compiled with -DBGZF_CACHE.
+    #
+    # @param fp    BGZF file handler
+    # @param size  size of cache in bytes; 0 to disable caching (default)
+    void bgzf_set_cache_size(BGZF *fp, int size)
+
+    # Flush the file if the remaining buffer size is smaller than _size_
+    # @return      0 if flushing succeeded or was not needed; negative on error
+    int bgzf_flush_try(BGZF *fp, ssize_t size)
+
+    # Read one byte from a BGZF file. It is faster than bgzf_read()
+    # @param fp     BGZF file handler
+    # @return       byte read; -1 on end-of-file or error
+    int bgzf_getc(BGZF *fp)
+
+    # Read one line from a BGZF file. It is faster than bgzf_getc()
+    #
+    # @param fp     BGZF file handler
+    # @param delim  delimiter
+    # @param str    string to write to; must be initialized
+    # @return       length of the string; 0 on end-of-file; negative on error
+    int bgzf_getline(BGZF *fp, int delim, kstring_t *str)
+
+    # Read the next BGZF block.
+    int bgzf_read_block(BGZF *fp)
+
+    # Enable multi-threading (only effective on writing and when the
+    # library was compiled with -DBGZF_MT)
+    #
+    # @param fp          BGZF file handler; must be opened for writing
+    # @param n_threads   #threads used for writing
+    # @param n_sub_blks  #blocks processed by each thread; a value 64-256 is recommended
+    int bgzf_mt(BGZF *fp, int n_threads, int n_sub_blks)
+
+
+    # Compress a single BGZF block.
+    #
+    # @param dst    output buffer (must have size >= BGZF_MAX_BLOCK_SIZE)
+    # @param dlen   size of output buffer; updated on return to the number
+    #               of bytes actually written to dst
+    # @param src    buffer to be compressed
+    # @param slen   size of data to compress (must be <= BGZF_BLOCK_SIZE)
+    # @param level  compression level
+    # @return       0 on success and negative on error
+    #
+    int bgzf_compress(void *dst, size_t *dlen, const void *src, size_t slen, int level)
+
+    #*******************
+    # bgzidx routines *
+    #*******************
+
+    # Position BGZF at the uncompressed offset
+    #
+    # @param fp           BGZF file handler; must be opened for reading
+    # @param uoffset      file offset in the uncompressed data
+    # @param where        SEEK_SET supported atm
+    #
+    # Returns 0 on success and -1 on error.
+    int bgzf_useek(BGZF *fp, long uoffset, int where)
+
+    # Position in uncompressed BGZF
+    #
+    # @param fp           BGZF file handler; must be opened for reading
+    #
+    # Returns the current offset on success and -1 on error.
+    long bgzf_utell(BGZF *fp)
+
+    # Tell BGZF to build index while compressing.
+    #
+    # @param fp          BGZF file handler; can be opened for reading or writing.
+    #
+    # Returns 0 on success and -1 on error.
+ int bgzf_index_build_init(BGZF *fp) + + # Load BGZF index + # + # @param fp BGZF file handler + # @param bname base name + # @param suffix suffix to add to bname (can be NULL) + # + # Returns 0 on success and -1 on error. + int bgzf_index_load(BGZF *fp, const char *bname, const char *suffix) + + # Save BGZF index + # + # @param fp BGZF file handler + # @param bname base name + # @param suffix suffix to add to bname (can be NULL) + # + # Returns 0 on success and -1 on error. + int bgzf_index_dump(BGZF *fp, const char *bname, const char *suffix) + + +cdef extern from "htslib/hts.h" nogil: + uint32_t kroundup32(uint32_t x) + + ctypedef struct cram_fd + + union FilePointerUnion: + BGZF *bgzf + cram_fd *cram + hFILE *hfile + void *voidp + + enum htsFormatCategory: + unknown_category + sequence_data # Sequence data -- SAM, BAM, CRAM, etc + variant_data # Variant calling data -- VCF, BCF, etc + index_file # Index file associated with some data file + region_list # Coordinate intervals or regions -- BED, etc + category_maximum + + enum htsExactFormat: + unknown_format + binary_format + text_format + sam, bam, bai, cram, crai, vcf, bcf, csi, gzi, tbi, bed + format_maximum + + enum htsCompression: + no_compression, gzip, bgzf, custom + compression_maximum + + enum hts_fmt_option: + CRAM_OPT_DECODE_MD, + CRAM_OPT_PREFIX, + CRAM_OPT_VERBOSITY, + CRAM_OPT_SEQS_PER_SLICE, + CRAM_OPT_SLICES_PER_CONTAINER, + CRAM_OPT_RANGE, + CRAM_OPT_VERSION, + CRAM_OPT_EMBED_REF, + CRAM_OPT_IGNORE_MD5, + CRAM_OPT_REFERENCE, + CRAM_OPT_MULTI_SEQ_PER_SLICE, + CRAM_OPT_NO_REF, + CRAM_OPT_USE_BZIP2, + CRAM_OPT_SHARED_REF, + CRAM_OPT_NTHREADS, + CRAM_OPT_THREAD_POOL, + CRAM_OPT_USE_LZMA, + CRAM_OPT_USE_RANS, + CRAM_OPT_REQUIRED_FIELDS, + HTS_OPT_COMPRESSION_LEVEL, + HTS_OPT_NTHREADS, + + ctypedef struct htsVersion: + short major, minor + + ctypedef struct htsFormat: + htsFormatCategory category + htsExactFormat format + htsVersion version + htsCompression compression + short compression_level + void *specific + + ctypedef struct htsFile: + uint8_t is_bin + uint8_t is_write + uint8_t is_be + uint8_t is_cram + int64_t lineno + kstring_t line + char *fn + char *fn_aux + FilePointerUnion fp + htsFormat format + + int hts_verbose + + # @abstract Table for converting a nucleotide character to 4-bit encoding. + # The input character may be either an IUPAC ambiguity code, '=' for 0, or + # '0'/'1'/'2'/'3' for a result of 1/2/4/8. The result is encoded as 1/2/4/8 + # for A/C/G/T or combinations of these bits for ambiguous bases. + const unsigned char *seq_nt16_table + + # @abstract Table for converting a 4-bit encoded nucleotide to an IUPAC + # ambiguity code letter (or '=' when given 0). + const char *seq_nt16_str + + # @abstract Table for converting a 4-bit encoded nucleotide to about 2 bits. + # Returns 0/1/2/3 for 1/2/4/8 (i.e., A/C/G/T), or 4 otherwise (0 or ambiguous). + const int *seq_nt16_int + + # @abstract Get the htslib version number + # @return For released versions, a string like "N.N[.N]"; or git describe + # output if using a library built within a Git repository. + const char *hts_version() + + # @abstract Determine format by peeking at the start of a file + # @param fp File opened for reading, positioned at the beginning + # @param fmt Format structure that will be filled out on return + # @return 0 for success, or negative if an error occurred. 
+ int hts_detect_format(hFILE *fp, htsFormat *fmt) + + # @abstract Get a human-readable description of the file format + # @return Description string, to be freed by the caller after use. + char *hts_format_description(const htsFormat *format) + + # @abstract Open a SAM/BAM/CRAM/VCF/BCF/etc file + # @param fn The file name or "-" for stdin/stdout + # @param mode Mode matching / [rwa][bceguxz0-9]* / + # @discussion + # With 'r' opens for reading; any further format mode letters are ignored + # as the format is detected by checking the first few bytes or BGZF blocks + # of the file. With 'w' or 'a' opens for writing or appending, with format + # specifier letters: + # b binary format (BAM, BCF, etc) rather than text (SAM, VCF, etc) + # c CRAM format + # g gzip compressed + # u uncompressed + # z bgzf compressed + # [0-9] zlib compression level + # and with non-format option letters (for any of 'r'/'w'/'a'): + # e close the file on exec(2) (opens with O_CLOEXEC, where supported) + # x create the file exclusively (opens with O_EXCL, where supported) + # Note that there is a distinction between 'u' and '0': the first yields + # plain uncompressed output whereas the latter outputs uncompressed data + # wrapped in the zlib format. + # @example + # [rw]b .. compressed BCF, BAM, FAI + # [rw]bu .. uncompressed BCF + # [rw]z .. compressed VCF + # [rw] .. uncompressed VCF + htsFile *hts_open(const char *fn, const char *mode) + + # @abstract Open a SAM/BAM/CRAM/VCF/BCF/etc file + # @param fn The file name or "-" for stdin/stdout + # @param mode Open mode, as per hts_open() + # @param fmt Optional format specific parameters + # @discussion + # See hts_open() for description of fn and mode. + # // TODO Update documentation for s/opts/fmt/ + # Opts contains a format string (sam, bam, cram, vcf, bcf) which will, + # if defined, override mode. Opts also contains a linked list of hts_opt + # structures to apply to the open file handle. These can contain things + # like pointers to the reference or information on compression levels, + # block sizes, etc. + htsFile *hts_open_format(const char *fn, const char *mode, const htsFormat *fmt) + + # @abstract Open an existing stream as a SAM/BAM/CRAM/VCF/BCF/etc file + # @param fp The already-open file handle + # @param fn The file name or "-" for stdin/stdout + # @param mode Open mode, as per hts_open() + htsFile *hts_hopen(hFILE *fp, const char *fn, const char *mode) + + # @abstract Close a file handle, flushing buffered data for output streams + # @param fp The file handle to be closed + # @return 0 for success, or negative if an error occurred. + int hts_close(htsFile *fp) + + # @abstract Returns the file's format information + # @param fp The file handle + # @return Read-only pointer to the file's htsFormat. + const htsFormat *hts_get_format(htsFile *fp) + + # @ abstract Returns a string containing the file format extension. + # @ param format Format structure containing the file type. + # @ return A string ("sam", "bam", etc) or "?" for unknown formats. + const char *hts_format_file_extension(const htsFormat *format) + + # @abstract Sets a specified CRAM option on the open file handle. + # @param fp The file handle open the open file. + # @param opt The CRAM_OPT_* option. + # @param ... Optional arguments, dependent on the option used. + # @return 0 for success, or negative if an error occurred. + int hts_set_opt(htsFile *fp, hts_fmt_option opt, ...) 
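+
+    # Illustrative use (a sketch, not part of the htslib header): the
+    # trailing vararg carries the option value, e.g.
+    #     hts_set_opt(fp, HTS_OPT_NTHREADS, 4)
+    #     hts_set_opt(fp, HTS_OPT_COMPRESSION_LEVEL, 6)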
+ + int hts_getline(htsFile *fp, int delimiter, kstring_t *str) + char **hts_readlines(const char *fn, int *_n) + + # @abstract Parse comma-separated list or read list from a file + # @param list File name or comma-separated list + # @param is_file + # @param _n Size of the output array (number of items read) + # @return NULL on failure or pointer to newly allocated array of + # strings + char **hts_readlist(const char *fn, int is_file, int *_n) + + # @abstract Create extra threads to aid compress/decompression for this file + # @param fp The file handle + # @param n The number of worker threads to create + # @return 0 for success, or negative if an error occurred. + # @notes THIS THREADING API IS LIKELY TO CHANGE IN FUTURE. + int hts_set_threads(htsFile *fp, int n) + + # @abstract Set .fai filename for a file opened for reading + # @return 0 for success, negative on failure + # @discussion + # Called before *_hdr_read(), this provides the name of a .fai file + # used to provide a reference list if the htsFile contains no @SQ headers. + int hts_set_fai_filename(htsFile *fp, const char *fn_aux) + + int8_t HTS_IDX_NOCOOR + int8_t HTS_IDX_START + int8_t HTS_IDX_REST + int8_t HTS_IDX_NONE + + int8_t HTS_FMT_CSI + int8_t HTS_FMT_BAI + int8_t HTS_FMT_TBI + int8_t HTS_FMT_CRAI + + BGZF *hts_get_bgzfp(htsFile *fp) + int hts_useek(htsFile *fp, long uoffset, int where) + long hts_utell(htsFile *fp) + + ctypedef struct hts_idx_t + + ctypedef struct hts_pair64_t: + uint64_t u, v + + ctypedef int hts_readrec_func(BGZF *fp, void *data, void *r, int *tid, int *beg, int *end) + + ctypedef struct hts_bins_t: + int n, m + int *a + + ctypedef struct hts_itr_t: + uint32_t read_rest + uint32_t finished + int tid, bed, end, n_off, i + int curr_tid, curr_beg, curr_end + uint64_t curr_off + hts_pair64_t *off + hts_readrec_func *readfunc + hts_bins_t bins + + hts_idx_t *hts_idx_init(int n, int fmt, uint64_t offset0, int min_shift, int n_lvls) + void hts_idx_destroy(hts_idx_t *idx) + int hts_idx_push(hts_idx_t *idx, int tid, int beg, int end, uint64_t offset, int is_mapped) + void hts_idx_finish(hts_idx_t *idx, uint64_t final_offset) + + #### Save an index to a file + # @param idx Index to be written + # @param fn Input BAM/BCF/etc filename, to which .bai/.csi/etc will be added + # @param fmt One of the HTS_FMT_* index formats + # @return 0 if successful, or negative if an error occurred. + int hts_idx_save(const hts_idx_t *idx, const char *fn, int fmt) + + #### Save an index to a specific file + # @param idx Index to be written + # @param fn Input BAM/BCF/etc filename + # @param fnidx Output filename, or NULL to add .bai/.csi/etc to @a fn + # @param fmt One of the HTS_FMT_* index formats + # @return 0 if successful, or negative if an error occurred. + int hts_idx_save_as(const hts_idx_t *idx, const char *fn, const char *fnidx, int fmt) + + #### Load an index file + # @param fn BAM/BCF/etc filename, to which .bai/.csi/etc will be added or + # the extension substituted, to search for an existing index file + # @param fmt One of the HTS_FMT_* index formats + # @return The index, or NULL if an error occurred. + hts_idx_t *hts_idx_load(const char *fn, int fmt) + + #### Load a specific index file + # @param fn Input BAM/BCF/etc filename + # @param fnidx The input index filename + # @return The index, or NULL if an error occurred. 
+ hts_idx_t *hts_idx_load2(const char *fn, const char *fnidx) + + uint8_t *hts_idx_get_meta(hts_idx_t *idx, int *l_meta) + void hts_idx_set_meta(hts_idx_t *idx, int l_meta, uint8_t *meta, int is_copy) + + int hts_idx_get_stat(const hts_idx_t* idx, int tid, + uint64_t* mapped, uint64_t* unmapped) + + uint64_t hts_idx_get_n_no_coor(const hts_idx_t* idx) + + int HTS_PARSE_THOUSANDS_SEP # Ignore ',' separators within numbers + + # Parse a numeric string + # The number may be expressed in scientific notation, and optionally may + # contain commas in the integer part (before any decimal point or E notation). + # @param str String to be parsed + # @param strend If non-NULL, set on return to point to the first character + # in @a str after those forming the parsed number + # @param flags Or'ed-together combination of HTS_PARSE_* flags + # @return Converted value of the parsed number. + # + # When @a strend is NULL, a warning will be printed (if hts_verbose is 2 + # or more) if there are any trailing characters after the number. + long long hts_parse_decimal(const char *str, char **strend, int flags) + + # Parse a "CHR:START-END"-style region string + # @param str String to be parsed + # @param beg Set on return to the 0-based start of the region + # @param end Set on return to the 1-based end of the region + # @return Pointer to the colon or '\0' after the reference sequence name, + # or NULL if @a str could not be parsed. + const char *hts_parse_reg(const char *str, int *beg, int *end) + + hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, int beg, int end, hts_readrec_func *readrec) + void hts_itr_destroy(hts_itr_t *iter) + + ctypedef int (*hts_name2id_f)(void*, const char*) + ctypedef const char *(*hts_id2name_f)(void*, int) + ctypedef hts_itr_t *hts_itr_query_func( + const hts_idx_t *idx, + int tid, + int beg, + int end, + hts_readrec_func *readrec) + + hts_itr_t *hts_itr_querys( + const hts_idx_t *idx, + const char *reg, + hts_name2id_f getid, + void *hdr, + hts_itr_query_func *itr_query, + hts_readrec_func *readrec) + + int hts_itr_next(BGZF *fp, hts_itr_t *iter, void *r, void *data) + const char **hts_idx_seqnames(const hts_idx_t *idx, int *n, hts_id2name_f getid, void *hdr) # free only the array, not the values + + # hts_file_type() - Convenience function to determine file type + # @fname: the file name + # + # Returns one of the FT_* defines. + # + # DEPRECATED: This function has been replaced by hts_detect_format(). + # It and these FT_* macros will be removed in a future HTSlib release. + int FT_UNKN + int FT_GZ + int FT_VCF + int FT_VCF_GZ + int FT_BCF + int FT_BCF_GZ + int FT_STDIN + + int hts_file_type(const char *fname) + + inline int hts_reg2bin(int64_t beg, int64_t end, int min_shift, int n_lvls) + inline int hts_bin_bot(int bin, int n_lvls) + + # * Endianness * + inline int ed_is_big() + inline uint16_t ed_swap_2(uint16_t v) + inline void *ed_swap_2p(void *x) + inline uint32_t ed_swap_4(uint32_t v) + inline void *ed_swap_4p(void *x) + inline uint64_t ed_swap_8(uint64_t v) + inline void *ed_swap_8p(void *x) + + +cdef extern from "htslib/sam.h" nogil: + #********************** + #*** SAM/BAM header *** + #********************** + + # @abstract Structure for the alignment header. 
+ # @field n_targets number of reference sequences + # @field l_text length of the plain text in the header + # @field target_len lengths of the reference sequences + # @field target_name names of the reference sequences + # @field text plain text + # @field sdict header dictionary + + ctypedef struct bam_hdr_t: + int32_t n_targets, ignore_sam_err + uint32_t l_text + uint32_t *target_len + uint8_t *cigar_tab + char **target_name + char *text + void *sdict + + #**************************** + #*** CIGAR related macros *** + #**************************** + + int BAM_CMATCH + int BAM_CINS + int BAM_CDEL + int BAM_CREF_SKIP + int BAM_CSOFT_CLIP + int BAM_CHARD_CLIP + int BAM_CPAD + int BAM_CEQUAL + int BAM_CDIFF + int BAM_CBACK + + char *BAM_CIGAR_STR + int BAM_CIGAR_SHIFT + uint32_t BAM_CIGAR_MASK + uint32_t BAM_CIGAR_TYPE + + char bam_cigar_op(uint32_t c) + uint32_t bam_cigar_oplen(uint32_t c) + char bam_cigar_opchr(uint32_t) + uint32_t bam_cigar_gen(char, uint32_t) + int bam_cigar_type(char o) + + # @abstract the read is paired in sequencing, no matter whether it is mapped in a pair + int BAM_FPAIRED + # @abstract the read is mapped in a proper pair + int BAM_FPROPER_PAIR + # @abstract the read itself is unmapped; conflictive with BAM_FPROPER_PAIR + int BAM_FUNMAP + # @abstract the mate is unmapped + int BAM_FMUNMAP + # @abstract the read is mapped to the reverse strand + int BAM_FREVERSE + # @abstract the mate is mapped to the reverse strand + int BAM_FMREVERSE + # @abstract this is read1 + int BAM_FREAD1 + # @abstract this is read2 + int BAM_FREAD2 + # @abstract not primary alignment + int BAM_FSECONDARY + # @abstract QC failure + int BAM_FQCFAIL + # @abstract optical or PCR duplicate + int BAM_FDUP + # @abstract supplementary alignment + int BAM_FSUPPLEMENTARY + + #************************* + #*** Alignment records *** + #************************* + + # @abstract Structure for core alignment information. + # @field tid chromosome ID, defined by bam_hdr_t + # @field pos 0-based leftmost coordinate + # @field bin bin calculated by bam_reg2bin() + # @field qual mapping quality + # @field l_qname length of the query name + # @field flag bitwise flag + # @field n_cigar number of CIGAR operations + # @field l_qseq length of the query sequence (read) + # @field mtid chromosome ID of next read in template, defined by bam_hdr_t + # @field mpos 0-based leftmost coordinate of next read in template + + ctypedef struct bam1_core_t: + int32_t tid + int32_t pos + uint16_t bin + uint8_t qual + uint8_t l_qname + uint16_t flag + uint16_t n_cigar + int32_t l_qseq + int32_t mtid + int32_t mpos + int32_t isize + + # @abstract Structure for one alignment. + # @field core core information about the alignment + # @field l_data current length of bam1_t::data + # @field m_data maximum length of bam1_t::data + # @field data all variable-length data, concatenated; structure: qname-cigar-seq-qual-aux + # + # @discussion Notes: + # + # 1. qname is zero tailing and core.l_qname includes the tailing '\0'. + # 2. l_qseq is calculated from the total length of an alignment block + # on reading or from CIGAR. + # 3. cigar data is encoded 4 bytes per CIGAR operation. + # 4. seq is nybble-encoded according to seq_nt16_table. 
+ ctypedef struct bam1_t: + bam1_core_t core + int l_data, m_data + uint8_t *data + uint64_t id + + # @abstract Get whether the query is on the reverse strand + # @param b pointer to an alignment + # @return boolean true if query is on the reverse strand + int bam_is_rev(bam1_t *b) + + # @abstract Get whether the query's mate is on the reverse strand + # @param b pointer to an alignment + # @return boolean true if query's mate on the reverse strand + int bam_is_mrev(bam1_t *b) + + # @abstract Get the name of the query + # @param b pointer to an alignment + # @return pointer to the name string, null terminated + char *bam_get_qname(bam1_t *b) + + # @abstract Get the CIGAR array + # @param b pointer to an alignment + # @return pointer to the CIGAR array + # + # @discussion In the CIGAR array, each element is a 32-bit integer. The + # lower 4 bits gives a CIGAR operation and the higher 28 bits keep the + # length of a CIGAR. + uint32_t *bam_get_cigar(bam1_t *b) + + # @abstract Get query sequence + # @param b pointer to an alignment + # @return pointer to sequence + # + # @discussion Each base is encoded in 4 bits: 1 for A, 2 for C, 4 for G, + # 8 for T and 15 for N. Two bases are packed in one byte with the base + # at the higher 4 bits having smaller coordinate on the read. It is + # recommended to use bam_seqi() macro to get the base. + char *bam_get_seq(bam1_t *b) + + # @abstract Get query quality + # @param b pointer to an alignment + # @return pointer to quality string + uint8_t *bam_get_qual(bam1_t *b) + + # @abstract Get auxiliary data + # @param b pointer to an alignment + # @return pointer to the concatenated auxiliary data + uint8_t *bam_get_aux(bam1_t *b) + + # @abstract Get length of auxiliary data + # @param b pointer to an alignment + # @return length of the concatenated auxiliary data + int bam_get_l_aux(bam1_t *b) + + # @abstract Get a base on read + # @param s Query sequence returned by bam1_seq() + # @param i The i-th position, 0-based + # @return 4-bit integer representing the base. + char bam_seqi(char *s, int i) + + #************************** + #*** Exported functions *** + #************************** + + #*************** + #*** BAM I/O *** + #*************** + + bam_hdr_t *bam_hdr_init() + bam_hdr_t *bam_hdr_read(BGZF *fp) + int bam_hdr_write(BGZF *fp, const bam_hdr_t *h) + void bam_hdr_destroy(bam_hdr_t *h) + int bam_name2id(bam_hdr_t *h, const char *ref) + bam_hdr_t* bam_hdr_dup(const bam_hdr_t *h0) + + bam1_t *bam_init1() + void bam_destroy1(bam1_t *b) + int bam_read1(BGZF *fp, bam1_t *b) + int bam_write1(BGZF *fp, const bam1_t *b) + bam1_t *bam_copy1(bam1_t *bdst, const bam1_t *bsrc) + bam1_t *bam_dup1(const bam1_t *bsrc) + + int bam_cigar2qlen(int n_cigar, const uint32_t *cigar) + int bam_cigar2rlen(int n_cigar, const uint32_t *cigar) + + # @abstract Calculate the rightmost base position of an alignment on the + # reference genome. + + # @param b pointer to an alignment + # @return the coordinate of the first base after the alignment, 0-based + + # @discussion For a mapped read, this is just b->core.pos + bam_cigar2rlen. + # For an unmapped read (either according to its flags or if it has no cigar + # string), we return b->core.pos + 1 by convention. 
+ int32_t bam_endpos(const bam1_t *b) + + int bam_str2flag(const char *str) # returns negative value on error + char *bam_flag2str(int flag) # The string must be freed by the user + + #************************* + #*** BAM/CRAM indexing *** + #************************* + + # These BAM iterator functions work only on BAM files. To work with either + # BAM or CRAM files use the sam_index_load() & sam_itr_*() functions. + void bam_itr_destroy(hts_itr_t *iter) + hts_itr_t *bam_itr_queryi(const hts_idx_t *idx, int tid, int beg, int end) + hts_itr_t *bam_itr_querys(const hts_idx_t *idx, bam_hdr_t *hdr, const char *region) + int bam_itr_next(htsFile *htsfp, hts_itr_t *itr, void *r) + + # Load/build .csi or .bai BAM index file. Does not work with CRAM. + # It is recommended to use the sam_index_* functions below instead. + hts_idx_t *bam_index_load(const char *fn) + int bam_index_build(const char *fn, int min_shift) + + # Load a BAM (.csi or .bai) or CRAM (.crai) index file + # @param fp File handle of the data file whose index is being opened + # @param fn BAM/CRAM/etc filename to search alongside for the index file + # @return The index, or NULL if an error occurred. + hts_idx_t *sam_index_load(htsFile *fp, const char *fn) + + # Load a specific BAM (.csi or .bai) or CRAM (.crai) index file + # @param fp File handle of the data file whose index is being opened + # @param fn BAM/CRAM/etc data file filename + # @param fnidx Index filename, or NULL to search alongside @a fn + # @return The index, or NULL if an error occurred. + hts_idx_t *sam_index_load2(htsFile *fp, const char *fn, const char *fnidx) + + # Generate and save an index file + # @param fn Input BAM/etc filename, to which .csi/etc will be added + # @param min_shift Positive to generate CSI, or 0 to generate BAI + # @return 0 if successful, or negative if an error occurred (usually -1; or + # -2: opening fn failed; -3: format not indexable) + int sam_index_build(const char *fn, int min_shift) + + # Generate and save an index to a specific file + # @param fn Input BAM/CRAM/etc filename + # @param fnidx Output filename, or NULL to add .bai/.csi/etc to @a fn + # @param min_shift Positive to generate CSI, or 0 to generate BAI + # @return 0 if successful, or negative if an error occurred. + int sam_index_build2(const char *fn, const char *fnidx, int min_shift) + + void sam_itr_destroy(hts_itr_t *iter) + hts_itr_t *sam_itr_queryi(const hts_idx_t *idx, int tid, int beg, int end) + hts_itr_t *sam_itr_querys(const hts_idx_t *idx, bam_hdr_t *hdr, const char *region) + int sam_itr_next(htsFile *htsfp, hts_itr_t *itr, void *r) + + #*************** + #*** SAM I/O *** + #*************** + + htsFile *sam_open(const char *fn, const char *mode) + htsFile *sam_open_format(const char *fn, const char *mode, const htsFormat *fmt) + int sam_close(htsFile *fp) + + int sam_open_mode(char *mode, const char *fn, const char *format) + + # A version of sam_open_mode that can handle ,key=value options. + # The format string is allocated and returned, to be freed by the caller. 
+ # Prefix should be "r" or "w", + char *sam_open_mode_opts(const char *fn, const char *mode, const char *format) + + bam_hdr_t *sam_hdr_parse(int l_text, const char *text) + bam_hdr_t *sam_hdr_read(htsFile *fp) + int sam_hdr_write(htsFile *fp, const bam_hdr_t *h) + + int sam_parse1(kstring_t *s, bam_hdr_t *h, bam1_t *b) + int sam_format1(const bam_hdr_t *h, const bam1_t *b, kstring_t *str) + int sam_read1(htsFile *fp, bam_hdr_t *h, bam1_t *b) + int sam_write1(htsFile *fp, const bam_hdr_t *h, const bam1_t *b) + + #************************************* + #*** Manipulating auxiliary fields *** + #************************************* + + uint8_t *bam_aux_get(const bam1_t *b, const char *tag) + int32_t bam_aux2i(const uint8_t *s) + double bam_aux2f(const uint8_t *s) + char bam_aux2A(const uint8_t *s) + char *bam_aux2Z(const uint8_t *s) + + void bam_aux_append(bam1_t *b, const char *tag, char type, int len, uint8_t *data) + int bam_aux_del(bam1_t *b, uint8_t *s) + + #************************** + #*** Pileup and Mpileup *** + #************************** + + # @abstract Structure for one alignment covering the pileup position. + # @field b pointer to the alignment + # @field qpos position of the read base at the pileup site, 0-based + # @field indel indel length; 0 for no indel, positive for ins and negative for del + # @field level the level of the read in the "viewer" mode + # @field is_del 1 iff the base on the padded read is a deletion + # @field is_head ??? + # @field is_tail ??? + # @field is_refskip ??? + # @field aux ??? + # + # @discussion See also bam_plbuf_push() and bam_lplbuf_push(). The + # difference between the two functions is that the former does not + # set bam_pileup1_t::level, while the later does. Level helps the + # implementation of alignment viewers, but calculating this has some + # overhead. + # + # is_del, is_head, etc are a bit field, declaring as below should + # work as expected, see + # https://groups.google.com/forum/#!msg/cython-users/24tD1kwRY7A/pmoPuSmanM0J + + ctypedef struct bam_pileup1_t: + bam1_t *b + int32_t qpos + int indel, level + uint32_t is_del + uint32_t is_head + uint32_t is_tail + uint32_t is_refskip + uint32_t aux + + ctypedef int (*bam_plp_auto_f)(void *data, bam1_t *b) + ctypedef int (*bam_test_f)() + + ctypedef struct __bam_plp_t + ctypedef __bam_plp_t *bam_plp_t + + ctypedef struct __bam_mplp_t + ctypedef __bam_mplp_t *bam_mplp_t + + # bam_plp_init() - sets an iterator over multiple + # @func: see mplp_func in bam_plcmd.c in samtools for an example. Expected return + # status: 0 on success, -1 on end, < -1 on non-recoverable errors + # @data: user data to pass to @func + bam_plp_t bam_plp_init(bam_plp_auto_f func, void *data) + void bam_plp_destroy(bam_plp_t iter) + int bam_plp_push(bam_plp_t iter, const bam1_t *b) + const bam_pileup1_t *bam_plp_next(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp) + const bam_pileup1_t *bam_plp_auto(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp) + void bam_plp_set_maxcnt(bam_plp_t iter, int maxcnt) + void bam_plp_reset(bam_plp_t iter) + + bam_mplp_t bam_mplp_init(int n, bam_plp_auto_f func, void **data) + + # bam_mplp_init_overlaps() - if called, mpileup will detect overlapping + # read pairs and for each base pair set the base quality of the + # lower-quality base to zero, thus effectively discarding it from + # calling. If the two bases are identical, the quality of the other base + # is increased to the sum of their qualities (capped at 200), otherwise + # it is multiplied by 0.8. 
+    void bam_mplp_init_overlaps(bam_mplp_t iter)
+    void bam_mplp_destroy(bam_mplp_t iter)
+    void bam_mplp_set_maxcnt(bam_mplp_t iter, int maxcnt)
+    int bam_mplp_auto(bam_mplp_t iter, int *_tid, int *_pos, int *n_plp, const bam_pileup1_t **plp)
+
+    # Added by AH
+    # ctypedef bam_pileup1_t * const_bam_pileup1_t_ptr "const bam_pileup1_t *"
+
+
+cdef extern from "htslib/faidx.h" nogil:
+
+    ctypedef struct faidx_t:
+        pass
+
+    int fai_build(char *fn)
+
+    void fai_destroy(faidx_t *fai)
+
+    faidx_t *fai_load(char *fn)
+
+    char *fai_fetch(faidx_t *fai,
+                    char *reg,
+                    int *len)
+
+    int faidx_nseq(faidx_t *fai)
+
+    int faidx_has_seq(faidx_t *fai, const char *seq)
+
+    char *faidx_fetch_seq(faidx_t *fai,
+                          char *c_name,
+                          int p_beg_i,
+                          int p_end_i,
+                          int *len)
+
+    int faidx_seq_len(faidx_t *fai, const char *seq)
+
+
+# tabix support
+cdef extern from "htslib/tbx.h" nogil:
+
+    # tbx.h definitions
+    int8_t TBX_MAX_SHIFT
+    int8_t TBX_GENERIC
+    int8_t TBX_SAM
+    int8_t TBX_VCF
+    int8_t TBX_UCSC
+
+    ctypedef struct tbx_conf_t:
+        int32_t preset
+        int32_t sc, bc, ec      # seq col., beg col. and end col.
+        int32_t meta_char, line_skip
+
+    ctypedef struct tbx_t:
+        tbx_conf_t conf
+        hts_idx_t *idx
+        void * dict
+
+    tbx_conf_t tbx_conf_gff
+    tbx_conf_t tbx_conf_bed
+    tbx_conf_t tbx_conf_psltbl
+    tbx_conf_t tbx_conf_sam
+    tbx_conf_t tbx_conf_vcf
+
+    void tbx_itr_destroy(hts_itr_t * iter)
+    hts_itr_t * tbx_itr_queryi(tbx_t * t, int tid, int beg, int end)
+    hts_itr_t * tbx_itr_querys(tbx_t * t, char * s)
+    int tbx_itr_next(htsFile * fp, tbx_t * t, hts_itr_t * iter, void * data)
+
+    int tbx_name2id(tbx_t *tbx, char *ss)
+
+    int tbx_index_build(char *fn, int min_shift, tbx_conf_t *conf)
+    int tbx_index_build2(const char *fn, const char *fnidx, int min_shift, const tbx_conf_t *conf)
+
+    tbx_t * tbx_index_load(char *fn)
+    tbx_t *tbx_index_load2(const char *fn, const char *fnidx)
+
+    # free the array but not the values
+    char **tbx_seqnames(tbx_t *tbx, int *n)
+
+    void tbx_destroy(tbx_t *tbx)
+
+
+# VCF/BCF API
+cdef extern from "htslib/vcf.h" nogil:
+
+    # Header struct
+
+    uint8_t BCF_HL_FLT    # header line
+    uint8_t BCF_HL_INFO
+    uint8_t BCF_HL_FMT
+    uint8_t BCF_HL_CTG
+    uint8_t BCF_HL_STR    # structured header line TAG=<A=..,B=..>
+    uint8_t BCF_HL_GEN    # generic header line
+
+    uint8_t BCF_HT_FLAG   # header type
+    uint8_t BCF_HT_INT
+    uint8_t BCF_HT_REAL
+    uint8_t BCF_HT_STR
+
+    uint8_t BCF_VL_FIXED  # variable length
+    uint8_t BCF_VL_VAR
+    uint8_t BCF_VL_A
+    uint8_t BCF_VL_G
+    uint8_t BCF_VL_R
+
+    # === Dictionary ===
+    #
+    # The header keeps three dictionaries. The first keeps IDs in the
+    # "FILTER/INFO/FORMAT" lines, the second keeps the sequence names and lengths
+    # in the "contig" lines and the last keeps the sample names. bcf_hdr_t::dict[]
+    # is the actual hash table, which is opaque to the end users. In the hash
+    # table, the key is the ID or sample name as a C string and the value is a
+    # bcf_idinfo_t struct. bcf_hdr_t::id[] points to key-value pairs in the hash
+    # table in the order that they appear in the VCF header. bcf_hdr_t::n[] is the
+    # size of the hash table or, equivalently, the length of the id[] arrays.
+
+    uint8_t BCF_DT_ID     # dictionary type
+    uint8_t BCF_DT_CTG
+    uint8_t BCF_DT_SAMPLE
+
+    # Complete textual representation of a header line
+    ctypedef struct bcf_hrec_t:
+        int type            # One of the BCF_HL_* type
+        char *key           # The part before '=', i.e. FILTER/INFO/FORMAT/contig/fileformat etc.
+        char *value         # Set only for generic lines, NULL for FILTER/INFO, etc.
+        int nkeys           # Number of structured fields
+        char **keys         # The key=value pairs
+        char **vals
+
+    ctypedef struct bcf_idinfo_t:
+        uint32_t info[3]    # stores Number:20, var:4, Type:4, ColType:4 in info[0..2]
+        bcf_hrec_t *hrec[3] # for BCF_HL_FLT,INFO,FMT and contig length in info[0] for BCF_HL_CTG
+        int id
+
+    ctypedef struct bcf_idpair_t:
+        const char *key
+        const bcf_idinfo_t *val
+
+    ctypedef struct bcf_hdr_t:
+        int32_t n[3]        # n: the size of the dictionary block in use, (allocated size, m, is below to preserve ABI)
+        bcf_idpair_t *id[3]
+        void *dict[3]       # ID dictionary, contig dict and sample dict
+        char **samples
+        bcf_hrec_t **hrec
+        int nhrec, dirty
+        int ntransl
+        int *transl[2]      # for bcf_translate()
+        int nsamples_ori    # for bcf_hdr_set_samples()
+        uint8_t *keep_samples
+        kstring_t mem
+        int32_t m[3]        # m: allocated size of the dictionary block in use (see n above)
+
+    uint8_t bcf_type_shift[]
+
+    # * VCF record *
+
+    uint8_t BCF_BT_NULL
+    uint8_t BCF_BT_INT8
+    uint8_t BCF_BT_INT16
+    uint8_t BCF_BT_INT32
+    uint8_t BCF_BT_FLOAT
+    uint8_t BCF_BT_CHAR
+
+    uint8_t VCF_REF
+    uint8_t VCF_SNP
+    uint8_t VCF_MNP
+    uint8_t VCF_INDEL
+    uint8_t VCF_OTHER
+
+    ctypedef struct variant_t:
+        int type, n         # variant type and the number of bases affected, negative for deletions
+
+    ctypedef struct bcf_fmt_t:
+        int id              # id: numeric tag id, the corresponding string is bcf_hdr_t::id[BCF_DT_ID][$id].key
+        int n, size, type   # n: number of values per-sample; size: number of bytes per-sample; type: one of BCF_BT_* types
+        uint8_t *p          # same as vptr and vptr_* in bcf_info_t below
+        uint32_t p_len
+        uint32_t p_off
+        uint8_t p_free
+
+    union bcf_info_union_t:
+        int32_t i           # integer value
+        float f             # float value
+
+    ctypedef struct bcf_info_t:
+        int key             # key: numeric tag id, the corresponding string is bcf_hdr_t::id[BCF_DT_ID][$key].key
+        int type, len       # type: one of BCF_BT_* types; len: vector length, 1 for scalars
+
+        # v1 union only set if $len==1; for easier access
+        bcf_info_union_t v1
+        uint8_t *vptr       # pointer to data array in bcf1_t->shared.s, excluding the size+type and tag id bytes
+        uint32_t vptr_len   # length of the vptr block or, when set, of the vptr_mod block, excluding offset
+        uint32_t vptr_off   # vptr offset, i.e., the size of the INFO key plus size+type bytes
+        uint8_t vptr_free   # indicates that vptr-vptr_off must be freed; set only when modified and the new
+                            # data block is bigger than the original
+
+    uint8_t BCF1_DIRTY_ID
+    uint8_t BCF1_DIRTY_ALS
+    uint8_t BCF1_DIRTY_FLT
+    uint8_t BCF1_DIRTY_INF
+
+    ctypedef struct bcf_dec_t:
+        int m_fmt, m_info, m_id, m_als, m_allele, m_flt  # allocated size (high-water mark); do not change
+        int n_flt           # Number of FILTER fields
+        int *flt            # FILTER keys in the dictionary
+        char *id            # ID
+        char *als           # REF+ALT block (\0-separated)
+        char **allele       # allele[0] is the REF (allele[] pointers to the als block); all null terminated
+        bcf_info_t *info    # INFO
+        bcf_fmt_t *fmt      # FORMAT and individual sample
+        variant_t *var      # $var and $var_type set only when set_variant_types called
+        int n_var, var_type
+        int shared_dirty    # if set, shared.s must be recreated on BCF output
+        int indiv_dirty     # if set, indiv.s must be recreated on BCF output
+
+    uint8_t BCF_ERR_CTG_UNDEF
+    uint8_t BCF_ERR_TAG_UNDEF
+    uint8_t BCF_ERR_NCOLS
+    uint8_t BCF_ERR_LIMITS
+    uint8_t BCF_ERR_CHAR
+    uint8_t BCF_ERR_CTG_INVALID
+    uint8_t BCF_ERR_TAG_INVALID
+
+    # The bcf1_t structure corresponds to one VCF/BCF line.
Reading from VCF file + # is slower because the string is first to be parsed, packed into BCF line + # (done in vcf_parse), then unpacked into internal bcf1_t structure. If it + # is known in advance that some of the fields will not be required (notably + # the sample columns), parsing of these can be skipped by setting max_unpack + # appropriately. + # Similarly, it is fast to output a BCF line because the columns (kept in + # shared.s, indiv.s, etc.) are written directly by bcf_write, whereas a VCF + # line must be formatted in vcf_format. + + ctypedef struct bcf1_t: + int32_t rid # CHROM + int32_t pos # POS + int32_t rlen # length of REF + float qual # QUAL + uint32_t n_info, n_allele + uint32_t n_fmt, n_sample + kstring_t shared, indiv + bcf_dec_t d # lazy evaluation: $d is not generated by bcf_read(), but by explicitly calling bcf_unpack() + int max_unpack # Set to BCF_UN_STR, BCF_UN_FLT, or BCF_UN_INFO to boost performance of vcf_parse when some of the fields won't be needed + int unpacked # remember what has been unpacked to allow calling bcf_unpack() repeatedly without redoing the work + int unpack_size[3] # the original block size of ID, REF+ALT and FILTER + int errcode # one of BCF_ERR_* codes + + ####### API ####### + + # BCF and VCF I/O + # + # A note about naming conventions: htslib internally represents VCF + # records as bcf1_t data structures, therefore most functions are + # prefixed with bcf_. There are a few exceptions where the functions must + # be aware of both BCF and VCF worlds, such as bcf_parse vs vcf_parse. In + # these cases, functions prefixed with bcf_ are more general and work + # with both BCF and VCF. + + # bcf_hdr_init() - create an empty BCF header. + # @param mode "r" or "w" + # + # When opened for writing, the mandatory fileFormat and + # FILTER=PASS lines are added automatically. + bcf_hdr_t *bcf_hdr_init(const char *mode) + + # Destroy a BCF header struct + void bcf_hdr_destroy(bcf_hdr_t *h) + + # Initialize a bcf1_t object; equivalent to calloc(1, sizeof(bcf1_t)) + bcf1_t *bcf_init() + + # Deallocate a bcf1_t object + void bcf_destroy(bcf1_t *v) + + # Same as bcf_destroy() but frees only the memory allocated by bcf1_t, + # not the bcf1_t object itself. + void bcf_empty(bcf1_t *v) + + # Make the bcf1_t object ready for next read. Intended mostly for + # internal use, the user should rarely need to call this function + # directly. + void bcf_clear(bcf1_t *v) + + # Reads VCF or BCF header + bcf_hdr_t *bcf_hdr_read(htsFile *fp) + + # bcf_hdr_set_samples() - for more efficient VCF parsing when only one/few samples are needed + # @samples: samples to include or exclude from file or as a comma-separated string. + # LIST|FILE .. select samples in list/file + # ^LIST|FILE .. exclude samples from list/file + # - .. include all samples + # NULL .. exclude all samples + # @is_file: @samples is a file (1) or a comma-separated list (0) + # + # The bottleneck of VCF reading is parsing of genotype fields. If the + # reader knows in advance that only subset of samples is needed (possibly + # no samples at all), the performance of bcf_read() can be significantly + # improved by calling bcf_hdr_set_samples after bcf_hdr_read(). + # The function bcf_read() will subset the VCF/BCF records automatically + # with the notable exception when reading records via bcf_itr_next(). + # In this case, bcf_subset_format() must be called explicitly, because + # bcf_readrec() does not see the header. 
+    #
+    # Returns 0 on success, -1 on error or a positive integer if the list
+    # contains samples not present in the VCF header. In such a case, the
+    # return value is the index of the offending sample.
+    #
+    int bcf_hdr_set_samples(bcf_hdr_t *hdr, const char *samples, int is_file)
+    int bcf_subset_format(const bcf_hdr_t *hdr, bcf1_t *rec)
+
+    # Writes VCF or BCF header
+    int bcf_hdr_write(htsFile *fp, bcf_hdr_t *h)
+
+    # Parse VCF line contained in kstring and populate the bcf1_t struct
+    int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v)
+
+    # The opposite of vcf_parse. It should rarely be called directly, see vcf_write
+    int vcf_format(const bcf_hdr_t *h, const bcf1_t *v, kstring_t *s)
+
+    # bcf_read() - read next VCF or BCF record
+    #
+    # Returns -1 on critical errors, 0 otherwise. On errors which are not
+    # critical for reading, such as missing header definitions, v->errcode is
+    # set to one of the BCF_ERR* codes and must be checked before calling
+    # vcf_write().
+    int bcf_read(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
+
+    # bcf_unpack() - unpack/decode a BCF record (fills the bcf1_t::d field)
+    #
+    # Note that bcf_unpack() must be called even when reading VCF. It is safe
+    # to call the function repeatedly, it will not unpack the same field
+    # twice.
+    uint8_t BCF_UN_STR    # up to ALT inclusive
+    uint8_t BCF_UN_FLT    # up to FILTER
+    uint8_t BCF_UN_INFO   # up to INFO
+    uint8_t BCF_UN_SHR    # all shared information
+    uint8_t BCF_UN_FMT    # unpack format and each sample
+    uint8_t BCF_UN_IND    # a synonym of BCF_UN_FMT
+    uint8_t BCF_UN_ALL    # everything
+
+    int bcf_unpack(bcf1_t *b, int which)
+
+    # bcf_dup() - create a copy of BCF record.
+    #
+    # Note that bcf_unpack() must be called on the returned copy as if it was
+    # obtained from bcf_read(). Also note that bcf_dup() calls bcf_sync1(src)
+    # internally to reflect any changes made by bcf_update_* functions.
+    bcf1_t *bcf_dup(bcf1_t *src)
+    bcf1_t *bcf_copy(bcf1_t *dst, bcf1_t *src)
+
+    # bcf_write() - write one VCF or BCF record. The type is determined at the open() call.
+    int bcf_write(htsFile *fp, bcf_hdr_t *h, bcf1_t *v)
+
+    # The following functions work only with VCFs and should rarely be called
+    # directly. Usually one wants to use their bcf_* alternatives, which work
+    # transparently with both VCFs and BCFs.
+    bcf_hdr_t *vcf_hdr_read(htsFile *fp)
+    int vcf_hdr_write(htsFile *fp, const bcf_hdr_t *h)
+    int vcf_read(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
+    int vcf_write(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
+
+    #************************************************************************
+    # Header querying and manipulation routines
+    #************************************************************************
+
+    # Create a new header using the supplied template
+    bcf_hdr_t *bcf_hdr_dup(const bcf_hdr_t *hdr)
+
+    # Copy header lines from src to dst if not already present in dst. See also bcf_translate().
+    # Returns 0 on success or sets a bit on error:
+    #     1 .. conflicting definitions of tag length
+    #     # todo
+    int bcf_hdr_combine(bcf_hdr_t *dst, const bcf_hdr_t *src)
+
+    # bcf_hdr_merge() - copy header lines from src to dst, see also bcf_translate()
+    # @param dst: the destination header to be merged into, NULL on the first pass
+    # @param src: the source header
+    #
+    # Notes:
+    #     - use as:
+    #           bcf_hdr_t *dst = NULL;
+    #           for (i=0; i<nsrc; i++) dst = bcf_hdr_merge(dst, src[i]);
+    bcf_hdr_t *bcf_hdr_merge(bcf_hdr_t *dst, const bcf_hdr_t *src)
+
+    # bcf_hdr_id2***() - Macros for accessing bcf_idinfo_t
+    # @type: one of BCF_HL_FLT, BCF_HL_INFO, BCF_HL_FMT
+    # @int_id: return value of bcf_hdr_id2int, must be >=0
+    #
+    # The returned values are:
+    #     bcf_hdr_id2length  .. whether the number of values is fixed or variable, one of BCF_VL_*
+    #     bcf_hdr_id2number  .. the number of values, 0xfffff for variable length fields
+    #     bcf_hdr_id2type    .. the field type, one of BCF_HT_*
+    #     bcf_hdr_id2coltype .. the column type, one of BCF_HL_*
+    #
+    # Notes: Prior to using the macros, the presence of the info should be
+    # tested with bcf_hdr_idinfo_exists().
+    #
+    int bcf_hdr_id2length(const bcf_hdr_t *hdr, int type, int int_id)
+    int bcf_hdr_id2number(const bcf_hdr_t *hdr, int type, int int_id)
+    int bcf_hdr_id2type(const bcf_hdr_t *hdr, int type, int int_id)
+    int bcf_hdr_id2coltype(const bcf_hdr_t *hdr, int type, int int_id)
+    int bcf_hdr_idinfo_exists(const bcf_hdr_t *hdr, int type, int int_id)
+    bcf_hrec_t *bcf_hdr_id2hrec(const bcf_hdr_t *hdr, int type, int col_type, int int_id)
+
+    void bcf_fmt_array(kstring_t *s, int n, int type, void *data)
+    uint8_t *bcf_fmt_sized_array(kstring_t *s, uint8_t *ptr)
+
+    void bcf_enc_vchar(kstring_t *s, int l, const char *a)
+    void bcf_enc_vint(kstring_t *s, int n, int32_t *a, int wsize)
+    void bcf_enc_vfloat(kstring_t *s, int n, float *a)
+
+    #************************************************************************
+    # BCF index
+    #
+    # Note that these functions work with BCFs only. See synced_bcf_reader.h
+    # which provides (amongst other things) an API to work transparently with
+    # both indexed BCFs and VCFs.
+    #************************************************************************
+
+    hts_idx_t *bcf_index_load2(const char *fn, const char *fnidx)
+    int bcf_index_build(const char *fn, int min_shift)
+    int bcf_index_build2(const char *fn, const char *fnidx, int min_shift)
+
+    #*******************
+    #* Typed value I/O *
+    #*******************
+
+    # Note that in contrast with BCFv2.1 specification, HTSlib implementation
+    # allows missing values in vectors. For integer types, the values 0x80,
+    # 0x8000, 0x80000000 are interpreted as missing values and 0x81, 0x8001,
+    # 0x80000001 as end-of-vector indicators. Similarly for floats, the value of
+    # 0x7F800001 is interpreted as a missing value and 0x7F800002 as an
+    # end-of-vector indicator.
+    # Note that the end-of-vector byte is not part of the vector.
+
+    # This trial BCF version (v2.2) is compatible with the VCF specification and
+    # makes it possible to handle vectors with different ploidy correctly in the
+    # presence of missing values.
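+
+    # A Python-level note (hedged, not part of this header): pysam's
+    # VariantFile generally presents these sentinels as missing entries
+    # rather than raw constants. Minimal sketch, assuming "in.vcf.gz"
+    # defines a per-sample integer FORMAT field "DP":
+    #
+    #     import pysam
+    #     for rec in pysam.VariantFile("in.vcf.gz"):
+    #         for name in rec.samples:
+    #             try:
+    #                 print(rec.pos, name, rec.samples[name]["DP"])
+    #             except KeyError:  # DP absent for this record/sample
+    #                 pass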
+
+    int32_t bcf_int8_vector_end
+    int32_t bcf_int16_vector_end
+    int32_t bcf_int32_vector_end
+    int32_t bcf_str_vector_end
+    int32_t bcf_int8_missing
+    int32_t bcf_int16_missing
+    int32_t bcf_int32_missing
+    int32_t bcf_str_missing
+
+    uint32_t bcf_float_vector_end
+    uint32_t bcf_float_missing
+
+    void bcf_float_set(float *ptr, uint32_t value)
+    void bcf_float_set_vector_end(float *x)
+    void bcf_float_set_missing(float *x)
+
+    int bcf_float_is_missing(float f)
+    int bcf_float_is_vector_end(float f)
+    void bcf_format_gt(bcf_fmt_t *fmt, int isample, kstring_t *str)
+    void bcf_enc_size(kstring_t *s, int size, int type)
+    int bcf_enc_inttype(long x)
+    void bcf_enc_int1(kstring_t *s, int32_t x)
+    int32_t bcf_dec_int1(const uint8_t *p, int type, uint8_t **q)
+    int32_t bcf_dec_typed_int1(const uint8_t *p, uint8_t **q)
+    int32_t bcf_dec_size(const uint8_t *p, uint8_t **q, int *type)
+
+    # These trivial wrappers are defined only for consistency with other parts of htslib
+    bcf1_t *bcf_init1()
+    int bcf_read1(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
+    int vcf_read1(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
+    int bcf_write1(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
+    int vcf_write1(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
+    void bcf_destroy1(bcf1_t *v)
+    void bcf_empty1(bcf1_t *v)
+    int vcf_parse1(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v)
+    void bcf_clear1(bcf1_t *v)
+    int vcf_format1(const bcf_hdr_t *h, const bcf1_t *v, kstring_t *s)
+
+    # Other nice wrappers
+    void bcf_itr_destroy(hts_itr_t *iter)
+    hts_itr_t *bcf_itr_queryi(const hts_idx_t *idx, int tid, int beg, int end)
+    hts_itr_t *bcf_itr_querys(const hts_idx_t *idx, const bcf_hdr_t *hdr, char *s)
+    int bcf_itr_next(htsFile *fp, hts_itr_t *iter, void *r)
+    hts_idx_t *bcf_index_load(const char *fn)
+    const char **bcf_index_seqnames(const hts_idx_t *idx, const bcf_hdr_t *hdr, int *nptr)
+
+
+# VCF/BCF utility functions
+cdef extern from "htslib/vcfutils.h" nogil:
+    struct kbitset_t
+
+    # bcf_trim_alleles() - remove ALT alleles unused in genotype fields
+    # @header: for access to BCF_DT_ID dictionary
+    # @line: VCF line obtained from vcf_parse1
+    #
+    # Returns the number of removed alleles on success or negative
+    # on error:
+    #     -1 .. some allele index is out of bounds
+    int bcf_trim_alleles(const bcf_hdr_t *header, bcf1_t *line)
+
+    # bcf_remove_alleles() - remove ALT alleles according to bitmask @mask
+    # @header: for access to BCF_DT_ID dictionary
+    # @line: VCF line obtained from vcf_parse1
+    # @mask: alleles to remove
+    #
+    # If you have more than 31 alleles, then the integer bit mask will
+    # overflow, so use bcf_remove_allele_set instead
+    void bcf_remove_alleles(const bcf_hdr_t *header, bcf1_t *line, int mask)
+
+    # bcf_remove_allele_set() - remove ALT alleles according to bitset @rm_set
+    # @header: for access to BCF_DT_ID dictionary
+    # @line: VCF line obtained from vcf_parse1
+    # @rm_set: pointer to kbitset_t object with bits set for allele
+    # indexes to remove
+    #
+    # Number=A,R,G INFO and FORMAT fields will be updated accordingly.
+    void bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, kbitset_t *rm_set)
+
+    # bcf_calc_ac() - calculate the number of REF and ALT alleles
+    # @header: for access to BCF_DT_ID dictionary
+    # @line: VCF line obtained from vcf_parse1
+    # @ac: array of length line->n_allele
+    # @which: determines whether INFO/AN,AC and indv fields should be used
+    #
+    # Returns 1 if the call succeeded, or 0 if the value could not
+    # be determined.
+    #
+    # The value of @which determines if existing INFO/AC,AN can be
+    # used (BCF_UN_INFO) and if indv fields can be split
+    # (BCF_UN_FMT).
+    int bcf_calc_ac(const bcf_hdr_t *header, bcf1_t *line, int *ac, int which)
+
+    # bcf_gt_type() - determines type of the genotype
+    # @fmt_ptr: the GT format field as set for example by set_fmt_ptr
+    # @isample: sample index (starting from 0)
+    # @ial: index of the 1st non-reference allele (starting from 1)
+    # @jal: index of the 2nd non-reference allele (starting from 1)
+    #
+    # Returns the type of the genotype (one of GT_HOM_RR, GT_HET_RA,
+    # GT_HOM_AA, GT_HET_AA, GT_HAPL_R, GT_HAPL_A or GT_UNKN). If $ial
+    # is not NULL and the genotype has one or more non-reference
+    # alleles, $ial will be set. In case of GT_HET_AA, $ial is the
+    # position of the allele which appeared first in ALT. If $jal is
+    # not null and the genotype is GT_HET_AA, $jal will be set and is
+    # the position of the second allele in ALT.
+    uint8_t GT_HOM_RR    # note: the actual value of GT_* matters, used in dosage r2 calculation
+    uint8_t GT_HOM_AA
+    uint8_t GT_HET_RA
+    uint8_t GT_HET_AA
+    uint8_t GT_HAPL_R
+    uint8_t GT_HAPL_A
+    uint8_t GT_UNKN
+    int bcf_gt_type(bcf_fmt_t *fmt_ptr, int isample, int *ial, int *jal)
+
+    int bcf_acgt2int(char c)
+    char bcf_int2acgt(int i)
+
+    # bcf_ij2G() - common task: allele indexes to Number=G index (diploid)
+    # @i,j: allele indexes, 0-based, i<=j
+    # Returns index to the Number=G diploid array
+    uint32_t bcf_ij2G(uint32_t i, uint32_t j)
+
+
+cdef class HTSFile(object):
+    cdef htsFile *htsfile           # pointer to htsFile structure
+    cdef int64_t start_offset       # BGZF offset of first record
+
+    cdef readonly object filename         # filename as supplied by user
+    cdef readonly object mode             # file opening mode
+    cdef readonly object index_filename   # filename of index, if supplied by user
+
+    cdef readonly bint is_stream          # Is htsfile a non-seekable stream
+    cdef readonly bint is_remote          # Is htsfile a remote stream
+    cdef readonly bint duplicate_filehandle   # Duplicate filehandle when opening via fh
+
+    cdef htsFile *_open_htsfile(self) except?
NULL diff --git a/pysam/libchtslib.pyx b/pysam/libchtslib.pyx new file mode 100644 index 0000000..7eea059 --- /dev/null +++ b/pysam/libchtslib.pyx @@ -0,0 +1,265 @@ +# cython: embedsignature=True +# cython: profile=True +# adds doc-strings for sphinx +import os + +from posix.unistd cimport dup + +from pysam.libchtslib cimport * + +from pysam.libcutils cimport force_bytes, force_str, charptr_to_str, charptr_to_str_w_len +from pysam.libcutils cimport encode_filename, from_string_and_size + + +__all__ = ["get_verbosity", "set_verbosity"] + + +######################################################################## +######################################################################## +## Constants +######################################################################## + +cdef int MAX_POS = 2 << 29 +cdef tuple FORMAT_CATEGORIES = ('UNKNOWN', 'ALIGNMENTS', 'VARIANTS', 'INDEX', 'REGIONS') +cdef tuple FORMATS = ('UNKNOWN', 'BINARY_FORMAT', 'TEXT_FORMAT', 'SAM', 'BAM', 'BAI', 'CRAM', 'CRAI', + 'VCF', 'BCF', 'CSI', 'GZI', 'TBI', 'BED') +cdef tuple COMPRESSION = ('NONE', 'GZIP', 'BGZF', 'CUSTOM') + + +cpdef set_verbosity(int verbosity): + """Set htslib's hts_verbose global variable to the specified value.""" + return hts_set_verbosity(verbosity) + +cpdef get_verbosity(): + """Return the value of htslib's hts_verbose global variable.""" + return hts_get_verbosity() + + +class CallableValue(object): + def __init__(self, value): + self.value = value + def __call__(self): + return self.value + def __bool__(self): + return self.value + def __nonzero__(self): + return self.value + def __eq__(self, other): + return self.value == other + def __ne__(self, other): + return self.value != other + + +CTrue = CallableValue(True) +CFalse = CallableValue(False) + + +cdef class HTSFile(object): + """ + Base class for HTS file types + """ + def __cinit__(self, *args, **kwargs): + self.htsfile = NULL + self.duplicate_filehandle = True + + def __dealloc__(self): + if self.htsfile: + hts_close(self.htsfile) + self.htsfile = NULL + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_value, traceback): + self.close() + return False + + @property + def category(self): + """General file format category. One of UNKNOWN, ALIGNMENTS, + VARIANTS, INDEX, REGIONS""" + if not self.htsfile: + raise ValueError('metadata not available on closed file') + return FORMAT_CATEGORIES[self.htsfile.format.category] + + @property + def format(self): + """File format. + + One of UNKNOWN, BINARY_FORMAT, TEXT_FORMAT, SAM, BAM, + BAI, CRAM, CRAI, VCF, BCF, CSI, GZI, TBI, BED. + """ + if not self.htsfile: + raise ValueError('metadata not available on closed file') + return FORMATS[self.htsfile.format.format] + + @property + def version(self): + """Tuple of file format version numbers (major, minor)""" + if not self.htsfile: + raise ValueError('metadata not available on closed file') + return self.htsfile.format.version.major, self.htsfile.format.version.minor + + @property + def compression(self): + """File compression. 
+
+        One of NONE, GZIP, BGZF, CUSTOM."""
+        if not self.htsfile:
+            raise ValueError('metadata not available on closed file')
+        return COMPRESSION[self.htsfile.format.compression]
+
+    @property
+    def description(self):
+        """Vaguely human readable description of the file format"""
+        if not self.htsfile:
+            raise ValueError('metadata not available on closed file')
+        cdef char *desc = hts_format_description(&self.htsfile.format)
+        try:
+            return charptr_to_str(desc)
+        finally:
+            free(desc)
+
+    @property
+    def is_open(self):
+        """return True if HTSFile is open and in a valid state."""
+        return CTrue if self.htsfile != NULL else CFalse
+
+    @property
+    def is_closed(self):
+        """return True if HTSFile is closed."""
+        return self.htsfile == NULL
+
+    @property
+    def closed(self):
+        """return True if HTSFile is closed."""
+        return self.htsfile == NULL
+
+    @property
+    def is_write(self):
+        """return True if HTSFile is open for writing"""
+        return self.htsfile != NULL and self.htsfile.is_write != 0
+
+    @property
+    def is_read(self):
+        """return True if HTSFile is open for reading"""
+        return self.htsfile != NULL and self.htsfile.is_write == 0
+
+    @property
+    def is_sam(self):
+        """return True if HTSFile is reading or writing a SAM alignment file"""
+        return self.htsfile != NULL and self.htsfile.format.format == sam
+
+    @property
+    def is_bam(self):
+        """return True if HTSFile is reading or writing a BAM alignment file"""
+        return self.htsfile != NULL and self.htsfile.format.format == bam
+
+    @property
+    def is_cram(self):
+        """return True if HTSFile is reading or writing a CRAM alignment file"""
+        return self.htsfile != NULL and self.htsfile.format.format == cram
+
+    @property
+    def is_vcf(self):
+        """return True if HTSFile is reading or writing a VCF variant file"""
+        return self.htsfile != NULL and self.htsfile.format.format == vcf
+
+    @property
+    def is_bcf(self):
+        """return True if HTSFile is reading or writing a BCF variant file"""
+        return self.htsfile != NULL and self.htsfile.format.format == bcf
+
+    def reset(self):
+        """reset file position to beginning of file just after the header.
+
+        Returns
+        -------
+
+        The file position after moving the file pointer.
+
+        """
+        return self.seek(self.start_offset)
+
+    def seek(self, uint64_t offset):
+        """move file pointer to position *offset*, see :meth:`pysam.HTSFile.tell`."""
+        if not self.is_open:
+            raise ValueError('I/O operation on closed file')
+        if self.is_stream:
+            raise OSError('seek not available in streams')
+
+        cdef int64_t ret
+        if self.htsfile.format.compression != no_compression:
+            with nogil:
+                ret = bgzf_seek(hts_get_bgzfp(self.htsfile), offset, SEEK_SET)
+        else:
+            with nogil:
+                ret = hts_useek(self.htsfile, offset, SEEK_SET)
+        return ret
+
+    def tell(self):
+        """return current file position, see :meth:`pysam.HTSFile.seek`."""
+        if not self.is_open:
+            raise ValueError('I/O operation on closed file')
+        if self.is_stream:
+            raise OSError('tell not available in streams')
+
+        cdef int64_t ret
+        if self.htsfile.format.compression != no_compression:
+            with nogil:
+                ret = bgzf_tell(hts_get_bgzfp(self.htsfile))
+        else:
+            with nogil:
+                ret = hts_utell(self.htsfile)
+        return ret
+
+    cdef htsFile *_open_htsfile(self) except? NULL:
+        cdef char *cfilename
+        cdef char *cmode = self.mode
+        cdef int fd, dup_fd
+
+        if isinstance(self.filename, bytes):
+            cfilename = self.filename
+            with nogil:
+                return hts_open(cfilename, cmode)
+        else:
+            if isinstance(self.filename, int):
+                fd = self.filename
+            else:
+                fd = self.filename.fileno()
+
+            if self.duplicate_filehandle:
+                dup_fd = dup(fd)
+            else:
+                dup_fd = fd
+
+            # Replicate mode normalization done in hts_open_format
+            smode = self.mode.replace(b'b', b'').replace(b'c', b'')
+            if b'b' in self.mode:
+                smode += b'b'
+            elif b'c' in self.mode:
+                smode += b'c'
+            cmode = smode
+
+            hfile = hdopen(dup_fd, cmode)
+            if hfile == NULL:
+                raise IOError('Cannot create hfile')
+
+            try:
+                # filename.name can be an int
+                filename = str(self.filename.name)
+            except AttributeError:
+                filename = '<fd:{}>'.format(fd)
+
+            filename = encode_filename(filename)
+            cfilename = filename
+            with nogil:
+                return hts_hopen(hfile, cfilename, cmode)
+
+    def _exists(self):
+        """return True unless *filename* is a local file path that does not exist.
+        """
+        return (not isinstance(self.filename, (str, bytes)) or
+                self.filename == b'-' or
+                self.is_remote or
+                os.path.exists(self.filename))
diff --git a/pysam/libcsamfile.pxd b/pysam/libcsamfile.pxd
new file mode 100644
index 0000000..de36998
--- /dev/null
+++ b/pysam/libcsamfile.pxd
@@ -0,0 +1,45 @@
+from pysam.libcalignmentfile cimport AlignedSegment, AlignmentFile
+
+#################################################
+# Compatibility Layer for pysam < 0.8
+
+# import all declarations from htslib
+from pysam.libchtslib cimport *
+
+cdef class AlignedRead(AlignedSegment):
+    pass
+
+cdef class Samfile(AlignmentFile):
+    pass
+
+# import the conversion functions
+cdef extern from "htslib_util.h":
+
+    # add *nbytes* into the variable length data of *src* at *pos*
+    bam1_t * pysam_bam_update(bam1_t * b,
+                              size_t nbytes_old,
+                              size_t nbytes_new,
+                              uint8_t * pos)
+
+    # now: static
+    int aux_type2size(int)
+
+    char * pysam_bam_get_qname(bam1_t * b)
+    uint32_t * pysam_bam_get_cigar(bam1_t * b)
+    uint8_t * pysam_bam_get_seq(bam1_t * b)
+    uint8_t * pysam_bam_get_qual(bam1_t * b)
+    uint8_t * pysam_bam_get_aux(bam1_t * b)
+    int pysam_bam_get_l_aux(bam1_t * b)
+    char pysam_bam_seqi(uint8_t * s, int i)
+
+    uint16_t pysam_get_bin(bam1_t * b)
+    uint8_t pysam_get_qual(bam1_t * b)
+    uint8_t pysam_get_l_qname(bam1_t * b)
+    uint16_t pysam_get_flag(bam1_t * b)
+    uint16_t pysam_get_n_cigar(bam1_t * b)
+    void pysam_set_bin(bam1_t * b, uint16_t v)
+    void pysam_set_qual(bam1_t * b, uint8_t v)
+    void pysam_set_l_qname(bam1_t * b, uint8_t v)
+    void pysam_set_flag(bam1_t * b, uint16_t v)
+    void pysam_set_n_cigar(bam1_t * b, uint16_t v)
+    void pysam_update_flag(bam1_t * b, uint16_t v, uint16_t flag)
diff --git a/pysam/libcsamfile.pyx b/pysam/libcsamfile.pyx
new file mode 100644
index 0000000..bde93d8
--- /dev/null
+++ b/pysam/libcsamfile.pyx
@@ -0,0 +1,43 @@
+# cython: embedsignature=True
+# cython: profile=True
+# adds doc-strings for sphinx
+import tempfile
+import os
+import sys
+import types
+import itertools
+import struct
+import ctypes
+import collections
+import re
+import platform
+import warnings
+from cpython cimport PyErr_SetString, \
+    PyBytes_Check, \
+    PyUnicode_Check, \
+    PyBytes_FromStringAndSize
+
+from cpython.version cimport PY_MAJOR_VERSION
+
+from pysam.libcalignmentfile cimport AlignmentFile, AlignedSegment
+
+
+cdef class Samfile(AlignmentFile):
+    '''Deprecated alternative for :class:`~pysam.AlignmentFile`
+
+    Added for backwards compatibility with pysam <= 0.8.0
+    '''
+    pass
+
+
+cdef class
AlignedRead(AlignedSegment): + '''Deprecated alternative for :class:`~pysam.AlignedSegment` + + Added for backwards compatibility with pysam <= 0.8.0 + ''' + pass + + +__all__ = ['Samfile', 'AlignedRead'] + + diff --git a/pysam/libctabix.pxd b/pysam/libctabix.pxd new file mode 100644 index 0000000..12cd9dd --- /dev/null +++ b/pysam/libctabix.pxd @@ -0,0 +1,123 @@ +from libc.stdint cimport int8_t, int16_t, int32_t, int64_t +from libc.stdint cimport uint8_t, uint16_t, uint32_t, uint64_t +from libc.stdlib cimport malloc, calloc, realloc, free +from libc.string cimport memcpy, memcmp, strncpy, strlen, strdup +from libc.stdio cimport FILE, printf + +# Note: this replaces python "open"! +cdef extern from "fcntl.h": + int open(char *pathname, int flags) + +cdef extern from "unistd.h" nogil: + ctypedef int ssize_t + ssize_t read(int fd, void *buf, size_t count) + int close(int fd) + +from pysam.libchtslib cimport hts_idx_t, hts_itr_t, htsFile, \ + tbx_t, kstring_t, BGZF, HTSFile + + +# These functions are put here and not in chtslib.pxd in order +# to avoid warnings for unused functions. +cdef extern from "pysam_stream.h" nogil: + + ctypedef struct kstream_t: + pass + + ctypedef struct kseq_t: + kstring_t name + kstring_t comment + kstring_t seq + kstring_t qual + + kseq_t *kseq_init(BGZF *) + int kseq_read(kseq_t *) + void kseq_destroy(kseq_t *) + kstream_t *ks_init(BGZF *) + void ks_destroy(kstream_t *) + + # Retrieve characters from stream until delimiter + # is reached placing results in str. + int ks_getuntil(kstream_t *, + int delimiter, + kstring_t * str, + int * dret) + + +cdef class tabix_file_iterator: + cdef BGZF * fh + cdef kstream_t * kstream + cdef kstring_t buffer + cdef size_t size + cdef Parser parser + cdef int fd + cdef int duplicated_fd + cdef infile + + cdef __cnext__(self) + + +cdef class TabixFile(HTSFile): + # pointer to index structure + cdef tbx_t * index + + cdef readonly object filename_index + + cdef Parser parser + + cdef encoding + + +cdef class Parser: + cdef encoding + cdef parse(self, char * buffer, int len) + + +cdef class asTuple(Parser): + cdef parse(self, char * buffer, int len) + + +cdef class asGTF(Parser): + pass + + +cdef class asBed(Parser): + pass + + +cdef class asVCF(Parser): + pass + + +cdef class TabixIterator: + cdef hts_itr_t * iterator + cdef TabixFile tabixfile + cdef kstring_t buffer + cdef encoding + cdef int __cnext__(self) + + +cdef class TabixIteratorParsed(TabixIterator): + cdef Parser parser + + +cdef class GZIterator: + cdef object _filename + cdef BGZF * gzipfile + cdef kstream_t * kstream + cdef kstring_t buffer + cdef int __cnext__(self) + cdef encoding + + +cdef class GZIteratorHead(GZIterator): + pass + + +cdef class GZIteratorParsed(GZIterator): + cdef Parser parser + + +# Compatibility Layer for pysam < 0.8 +cdef class Tabixfile(TabixFile): + pass diff --git a/pysam/libctabix.pyx b/pysam/libctabix.pyx new file mode 100644 index 0000000..10dc23b --- /dev/null +++ b/pysam/libctabix.pyx @@ -0,0 +1,1188 @@ +# cython: embedsignature=True +# cython: profile=True +############################################################################### +############################################################################### +# Cython wrapper for access to tabix indexed files in bgzf format +############################################################################### +# The principal classes and functions defined in this module are: +# +# class TabixFile class wrapping tabix indexed files in bgzf format +# +# class asTuple Parser class 
for tuples
+# class asGTF              Parser class for GTF formatted rows
+# class asBed              Parser class for Bed formatted rows
+# class asVCF              Parser class for VCF formatted rows
+#
+# class tabix_generic_iterator  Streamed iterator of bgzf formatted files
+#
+# Additionally this module defines several additional classes that are part
+# of the internal API. These are:
+#
+# class Parser             base class for parsers of tab-separated rows
+# class tabix_file_iterator
+# class TabixIterator      iterator class over rows in bgzf file
+# class EmptyIterator
+#
+# For backwards compatibility, the following classes are also defined:
+#
+# class Tabixfile          equivalent to TabixFile
+#
+###############################################################################
+#
+# The MIT License
+#
+# Copyright (c) 2015 Andreas Heger
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+#
+###############################################################################
+import os
+import sys
+
+from libc.stdio cimport printf, fprintf, stderr
+from libc.string cimport strerror
+from libc.errno cimport errno
+from posix.unistd cimport dup
+
+from cpython cimport PyErr_SetString, PyBytes_Check, \
+    PyUnicode_Check, PyBytes_FromStringAndSize, \
+    PyObject_AsFileDescriptor
+
+from cpython.version cimport PY_MAJOR_VERSION
+
+cimport pysam.libctabixproxies as ctabixproxies
+
+from pysam.libchtslib cimport htsFile, hts_open, hts_close, HTS_IDX_START,\
+    BGZF, bgzf_open, bgzf_dopen, bgzf_close, bgzf_write, \
+    tbx_index_build, tbx_index_load, tbx_itr_queryi, tbx_itr_querys, \
+    tbx_conf_t, tbx_seqnames, tbx_itr_next, tbx_itr_destroy, \
+    tbx_destroy, hisremote, region_list
+
+from pysam.libcutils cimport force_bytes, force_str, charptr_to_str
+from pysam.libcutils cimport encode_filename, from_string_and_size
+
+cdef class Parser:
+
+    def __init__(self, encoding="ascii"):
+        self.encoding = encoding
+
+    def set_encoding(self, encoding):
+        self.encoding = encoding
+
+    def get_encoding(self):
+        return self.encoding
+
+    cdef parse(self, char * buffer, int length):
+        raise NotImplementedError(
+            'parse method of %s not implemented' % str(self))
+
+    def __call__(self, char * buffer, int length):
+        return self.parse(buffer, length)
+
+
+cdef class asTuple(Parser):
+    '''converts a :term:`tabix row` into a python tuple.
+
+    A field in a row is accessed by numeric index.
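+
+    For example (a sketch, assuming ``example.bed.gz`` is a
+    tabix-indexed BED file)::
+
+        tbx = pysam.TabixFile("example.bed.gz", parser=pysam.asTuple())
+        for row in tbx.fetch("chr1", 999, 2000):
+            print(row[0], int(row[1]))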
+    '''
+    cdef parse(self, char * buffer, int len):
+        cdef ctabixproxies.TupleProxy r
+        r = ctabixproxies.TupleProxy(self.encoding)
+        # need to copy - there were some
+        # persistence issues with "present"
+        r.copy(buffer, len)
+        return r
+
+
+cdef class asGTF(Parser):
+    '''converts a :term:`tabix row` into a GTF record with the following
+    fields:
+
+    +----------+----------+-------------------------------+
+    |*Column*  |*Name*    |*Content*                      |
+    +----------+----------+-------------------------------+
+    |1         |contig    |the chromosome name            |
+    +----------+----------+-------------------------------+
+    |2         |source    |The feature source             |
+    +----------+----------+-------------------------------+
+    |3         |feature   |The feature type               |
+    +----------+----------+-------------------------------+
+    |4         |start     |genomic start coordinate       |
+    |          |          |(0-based)                      |
+    +----------+----------+-------------------------------+
+    |5         |end       |genomic end coordinate         |
+    |          |          |(0-based)                      |
+    +----------+----------+-------------------------------+
+    |6         |score     |feature score                  |
+    +----------+----------+-------------------------------+
+    |7         |strand    |strand                         |
+    +----------+----------+-------------------------------+
+    |8         |frame     |frame                          |
+    +----------+----------+-------------------------------+
+    |9         |attributes|the attribute field            |
+    +----------+----------+-------------------------------+
+
+    GTF formatted entries also define the following fields that
+    are derived from the attributes field:
+
+    +--------------------+------------------------------+
+    |*Name*              |*Content*                     |
+    +--------------------+------------------------------+
+    |gene_id             |the gene identifier           |
+    +--------------------+------------------------------+
+    |transcript_id       |the transcript identifier     |
+    +--------------------+------------------------------+
+
+    '''
+    cdef parse(self, char * buffer, int len):
+        cdef ctabixproxies.GTFProxy r
+        r = ctabixproxies.GTFProxy(self.encoding)
+        r.copy(buffer, len)
+        return r
+
+
+cdef class asBed(Parser):
+    '''converts a :term:`tabix row` into a bed record
+    with the following fields:
+
+    +-----------+-----------+------------------------------------------+
+    |*Column*   |*Field*    |*Contents*                                |
+    |           |           |                                          |
+    +-----------+-----------+------------------------------------------+
+    |1          |contig     |contig                                    |
+    |           |           |                                          |
+    +-----------+-----------+------------------------------------------+
+    |2          |start      |genomic start coordinate (zero-based)     |
+    +-----------+-----------+------------------------------------------+
+    |3          |end        |genomic end coordinate plus one           |
+    |           |           |(zero-based)                              |
+    +-----------+-----------+------------------------------------------+
+    |4          |name       |name of feature.                          |
+    +-----------+-----------+------------------------------------------+
+    |5          |score      |score of feature                          |
+    +-----------+-----------+------------------------------------------+
+    |6          |strand     |strand of feature                         |
+    +-----------+-----------+------------------------------------------+
+    |7          |thickStart |thickStart                                |
+    +-----------+-----------+------------------------------------------+
+    |8          |thickEnd   |thickEnd                                  |
+    +-----------+-----------+------------------------------------------+
+    |9          |itemRGB    |itemRGB                                   |
+    +-----------+-----------+------------------------------------------+
+    |10         |blockCount |number of blocks                          |
+    +-----------+-----------+------------------------------------------+
+    |11         |blockSizes |',' separated string of block sizes       |
+    +-----------+-----------+------------------------------------------+
+    |12         |blockStarts|',' separated string of block genomic     |
+    |           |           |start positions                           |
+    +-----------+-----------+------------------------------------------+
+
+    Only the first three fields are required. Additional
+    fields are optional, but if one is defined, all the preceding
+    need to be defined as well.
+
+    '''
+    cdef parse(self, char * buffer, int len):
+        cdef ctabixproxies.BedProxy r
+        r = ctabixproxies.BedProxy(self.encoding)
+        r.copy(buffer, len)
+        return r
+
+
+cdef class asVCF(Parser):
+    '''converts a :term:`tabix row` into a VCF record with
+    the following fields:
+
+    +----------+---------+------------------------------------+
+    |*Column*  |*Field*  |*Contents*                          |
+    |          |         |                                    |
+    +----------+---------+------------------------------------+
+    |1         |contig   |chromosome                          |
+    +----------+---------+------------------------------------+
+    |2         |pos      |chromosomal position, zero-based    |
+    +----------+---------+------------------------------------+
+    |3         |id       |id                                  |
+    +----------+---------+------------------------------------+
+    |4         |ref      |reference allele                    |
+    +----------+---------+------------------------------------+
+    |5         |alt      |alternate alleles                   |
+    +----------+---------+------------------------------------+
+    |6         |qual     |quality                             |
+    +----------+---------+------------------------------------+
+    |7         |filter   |filter                              |
+    +----------+---------+------------------------------------+
+    |8         |info     |info                                |
+    +----------+---------+------------------------------------+
+    |9         |format   |format specifier.                   |
+    +----------+---------+------------------------------------+
+
+    Access to genotypes is via index::
+
+        contig = vcf.contig
+        first_sample_genotype = vcf[0]
+        second_sample_genotype = vcf[1]
+
+    '''
+    cdef parse(self, char * buffer, int len):
+        cdef ctabixproxies.VCFProxy r
+        r = ctabixproxies.VCFProxy(self.encoding)
+        r.copy(buffer, len)
+        return r
+
+
+cdef class TabixFile:
+    """Random access to bgzf formatted files that
+    have been indexed by :term:`tabix`.
+
+    The file is automatically opened. The index file of file
+    ``<filename>`` is expected to be called ``<filename>.tbi``
+    by default (see parameter `index`).
+
+    Parameters
+    ----------
+
+    filename : string
+        Filename of bgzf file to be opened.
+
+    index : string
+        The filename of the index. If not set, the default is to
+        assume that the index is called ``filename.tbi``
+
+    mode : char
+        The file opening mode. Currently, only ``r`` is permitted.
+
+    parser : :class:`pysam.Parser`
+
+        sets the default parser for this tabix file. If `parser`
+        is None, the results are returned as an unparsed string.
+        Otherwise, `parser` is assumed to be a functor that will return
+        parsed data (see for example :class:`~pysam.asTuple` and
+        :class:`~pysam.asGTF`).
+ + encoding : string + + The encoding passed to the parser + + Raises + ------ + + ValueError + if index file is missing. + + IOError + if file could not be opened + """ + def __cinit__(self, + filename, + mode='r', + parser=None, + index=None, + encoding="ascii", + *args, + **kwargs ): + + self.htsfile = NULL + self.is_remote = False + self.is_stream = False + self.parser = parser + self._open(filename, mode, index, *args, **kwargs) + self.encoding = encoding + + def _open( self, + filename, + mode='r', + index=None, + ): + '''open a :term:`tabix file` for reading.''' + + if mode != 'r': + raise ValueError("invalid file opening mode `%s`" % mode) + + if self.htsfile != NULL: + self.close() + self.htsfile = NULL + + filename_index = index or (filename + ".tbi") + # encode all the strings to pass to tabix + self.filename = encode_filename(filename) + self.filename_index = encode_filename(filename_index) + + self.is_stream = self.filename == b'-' + self.is_remote = hisremote(self.filename) + + if not self.is_remote: + if not os.path.exists(filename): + raise IOError("file `%s` not found" % filename) + + if not os.path.exists(filename_index): + raise IOError("index `%s` not found" % filename_index) + + # open file + cdef char *cfilename = self.filename + with nogil: + self.htsfile = hts_open(cfilename, 'r') + + if self.htsfile == NULL: + raise IOError("could not open file `%s`" % filename) + + #if self.htsfile.format.category != region_list: + # raise ValueError("file does not contain region data") + + cfilename = self.filename_index + with nogil: + self.index = tbx_index_load(cfilename) + + if self.index == NULL: + raise IOError("could not open index for `%s`" % filename) + + if not self.is_stream: + self.start_offset = self.tell() + + def _dup(self): + '''return a copy of this tabix file. + + The file is being re-opened. + ''' + return TabixFile(self.filename, + mode="r", + parser=self.parser, + index=self.filename_index, + encoding=self.encoding) + + def fetch(self, + reference=None, + start=None, + end=None, + region=None, + parser=None, + multiple_iterators=False): + '''fetch one or more rows in a :term:`region` using 0-based + indexing. The region is specified by :term:`reference`, + *start* and *end*. Alternatively, a samtools :term:`region` + string can be supplied. + + Without *reference* or *region* all entries will be fetched. + + If only *reference* is set, all reads matching on *reference* + will be fetched. + + If *parser* is None, the default parser will be used for + parsing. + + Set *multiple_iterators* to true if you will be using multiple + iterators on the same file at the same time. The iterator + returned will receive its own copy of a filehandle to the file + effectively re-opening the file. Re-opening a file creates + some overhead, so beware. 
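+
+        Example (a sketch, assuming ``example.bed.gz`` and its
+        ``.tbi`` index exist)::
+
+            tbx = pysam.TabixFile("example.bed.gz")
+            for row in tbx.fetch("chr1", 999, 2000):
+                print(row)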
+
+        '''
+        if not self.is_open():
+            raise ValueError("I/O operation on closed file")
+
+        # convert coordinates to region string, which is one-based
+        if reference:
+            if end is not None:
+                if end < 0:
+                    raise ValueError("end out of range (%i)" % end)
+                if start is None:
+                    start = 0
+
+                if start < 0:
+                    raise ValueError("start out of range (%i)" % start)
+                elif start > end:
+                    raise ValueError(
+                        'start (%i) > end (%i)' % (start, end))
+                elif start == end:
+                    return EmptyIterator()
+                else:
+                    region = '%s:%i-%i' % (reference, start + 1, end)
+            elif start is not None:
+                if start < 0:
+                    raise ValueError("start out of range (%i)" % start)
+                region = '%s:%i' % (reference, start + 1)
+            else:
+                region = reference
+
+        # get iterator
+        cdef hts_itr_t * itr
+        cdef char *cstr
+        cdef TabixFile fileobj
+
+        # reopen the same file if necessary
+        if multiple_iterators:
+            fileobj = self._dup()
+        else:
+            fileobj = self
+
+        if region is None:
+            # without region or reference - iterate from start
+            with nogil:
+                itr = tbx_itr_queryi(fileobj.index,
+                                     HTS_IDX_START,
+                                     0,
+                                     0)
+        else:
+            s = force_bytes(region, encoding=fileobj.encoding)
+            cstr = s
+            with nogil:
+                itr = tbx_itr_querys(fileobj.index, cstr)
+
+        if itr == NULL:
+            if region is None:
+                if len(self.contigs) > 0:
+                    # when accessing a tabix file created prior tabix 1.0
+                    # the full-file iterator is empty.
+                    raise ValueError(
+                        "could not create iterator, possible "
+                        "tabix version mismatch")
+                else:
+                    # possible reason is that the file is empty -
+                    # return an empty iterator
+                    return EmptyIterator()
+            else:
+                raise ValueError(
+                    "could not create iterator for region '%s'" %
+                    region)
+
+        # use default parser if no parser is specified
+        if parser is None:
+            parser = fileobj.parser
+
+        cdef TabixIterator a
+        if parser is None:
+            a = TabixIterator(encoding=fileobj.encoding)
+        else:
+            parser.set_encoding(fileobj.encoding)
+            a = TabixIteratorParsed(parser)
+
+        a.tabixfile = fileobj
+        a.iterator = itr
+
+        return a
+
+    ###############################################################
+    ###############################################################
+    ###############################################################
+    ## properties
+    ###############################################################
+    property header:
+        '''the file header.
+
+        The file header consists of the lines at the beginning of a
+        file that are prefixed by the comment character ``#``.
+
+        .. note::
+            The header is returned as an iterator presenting lines
+            without the newline character.
+
+        .. note::
+            The header is only available for local files. For remote
+            files an AttributeError is raised.
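+
+        Example (a sketch, assuming a local tabix-indexed
+        ``example.vcf.gz``)::
+
+            for line in pysam.TabixFile("example.vcf.gz").header:
+                print(line)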
+ + ''' + + def __get__(self): + if self.is_remote: + raise AttributeError( + "the header is not available for remote files") + return GZIteratorHead(self.filename) + + property contigs: + '''list of chromosome names''' + def __get__(self): + cdef char ** sequences + cdef int nsequences + + with nogil: + sequences = tbx_seqnames(self.index, &nsequences) + cdef int x + result = [] + for x from 0 <= x < nsequences: + result.append(force_str(sequences[x])) + + # htslib instructions: + # only free container, not the sequences themselves + free(sequences) + + return result + + def close(self): + ''' + closes the :class:`pysam.TabixFile`.''' + if self.htsfile != NULL: + hts_close(self.htsfile) + self.htsfile = NULL + if self.index != NULL: + tbx_destroy(self.index) + self.index = NULL + + def __dealloc__( self ): + # remember: dealloc cannot call other python methods + # note: no doc string + # note: __del__ is not called. + if self.htsfile != NULL: + hts_close(self.htsfile) + self.htsfile = NULL + if self.index != NULL: + tbx_destroy(self.index) + + +cdef class TabixIterator: + """iterates over rows in *tabixfile* in region + given by *tid*, *start* and *end*. + """ + + def __init__(self, encoding="ascii"): + self.encoding = encoding + + def __iter__(self): + self.buffer.s = NULL + self.buffer.l = 0 + self.buffer.m = 0 + + return self + + cdef int __cnext__(self): + '''iterate to next element. + + Return -5 if file has been closed when this function + was called. + ''' + if self.tabixfile.htsfile == NULL: + return -5 + + cdef int retval + + while 1: + with nogil: + retval = tbx_itr_next( + self.tabixfile.htsfile, + self.tabixfile.index, + self.iterator, + &self.buffer) + + if retval < 0: + break + + if self.buffer.s[0] != '#': + break + + return retval + + def __next__(self): + """python version of next(). + + pyrex uses this non-standard name instead of next() + """ + + cdef int retval = self.__cnext__() + if retval == -5: + raise IOError("iteration on closed file") + elif retval < 0: + raise StopIteration + + return charptr_to_str(self.buffer.s, self.encoding) + + def next(self): + return self.__next__() + + def __dealloc__(self): + if self.iterator != NULL: + tbx_itr_destroy(self.iterator) + if self.buffer.s != NULL: + free(self.buffer.s) + + +class EmptyIterator: + '''empty iterator''' + + def __iter__(self): + return self + + def next(self): + raise StopIteration() + + def __next__(self): + raise StopIteration() + + +cdef class TabixIteratorParsed(TabixIterator): + """iterates over mapped reads in a region. + + The *parser* determines the encoding. + + Returns parsed data. + """ + + def __init__(self, + Parser parser): + + TabixIterator.__init__(self) + self.parser = parser + + def __next__(self): + """python version of next(). + + pyrex uses this non-standard name instead of next() + """ + + cdef int retval = self.__cnext__() + if retval == -5: + raise IOError("iteration on closed file") + elif retval < 0: + raise StopIteration + + return self.parser.parse(self.buffer.s, + self.buffer.l) + + +cdef class GZIterator: + def __init__(self, filename, int buffer_size=65536, encoding="ascii"): + '''iterate line-by-line through gzip (or bgzip) + compressed file. 
+        '''
+        if not os.path.exists(filename):
+            raise IOError("No such file or directory: %s" % filename)
+
+        filename = encode_filename(filename)
+        cdef char *cfilename = filename
+        with nogil:
+            self.gzipfile = bgzf_open(cfilename, "r")
+        self._filename = filename
+        self.kstream = ks_init(self.gzipfile)
+        self.encoding = encoding
+
+        self.buffer.l = 0
+        self.buffer.m = 0
+        self.buffer.s = <char *>malloc(buffer_size)
+
+    def __dealloc__(self):
+        '''close file.'''
+        if self.gzipfile != NULL:
+            bgzf_close(self.gzipfile)
+            self.gzipfile = NULL
+        if self.buffer.s != NULL:
+            free(self.buffer.s)
+        if self.kstream != NULL:
+            ks_destroy(self.kstream)
+
+    def __iter__(self):
+        return self
+
+    cdef int __cnext__(self):
+        cdef int dret = 0
+        cdef int retval = 0
+        while 1:
+            with nogil:
+                retval = ks_getuntil(self.kstream, '\n', &self.buffer, &dret)
+
+            if retval < 0:
+                break
+
+            return dret
+        return -1
+
+    def __next__(self):
+        """python version of next().
+        """
+        cdef int retval = self.__cnext__()
+        if retval < 0:
+            raise StopIteration
+        return force_str(self.buffer.s, self.encoding)
+
+
+cdef class GZIteratorHead(GZIterator):
+    '''iterate line-by-line through gzip (or bgzip)
+    compressed file returning comments at top of file.
+    '''
+
+    def __next__(self):
+        """python version of next().
+        """
+        cdef int retval = self.__cnext__()
+        if retval < 0:
+            raise StopIteration
+        if self.buffer.s[0] == '#':
+            return self.buffer.s
+        else:
+            raise StopIteration
+
+
+cdef class GZIteratorParsed(GZIterator):
+    '''iterate line-by-line through gzip (or bgzip)
+    compressed file returning parsed rows.
+    '''
+
+    def __init__(self, parser):
+        self.parser = parser
+
+    def __next__(self):
+        """python version of next().
+        """
+        cdef int retval = self.__cnext__()
+        if retval < 0:
+            raise StopIteration
+
+        return self.parser.parse(self.buffer.s,
+                                 self.buffer.l)
+
+
+def tabix_compress(filename_in,
+                   filename_out,
+                   force=False):
+    '''compress *filename_in* writing the output to *filename_out*.
+
+    Raise an IOError if *filename_out* already exists, unless *force*
+    is set.
+    '''
+
+    if not force and os.path.exists(filename_out):
+        raise IOError(
+            "Filename '%s' already exists, use *force* to "
+            "overwrite" % filename_out)
+
+    cdef int WINDOW_SIZE
+    cdef int c, r
+    cdef void * buffer
+    cdef BGZF * fp
+    cdef int fd_src
+    cdef bint is_empty = True
+    cdef int O_RDONLY
+    O_RDONLY = os.O_RDONLY
+
+    WINDOW_SIZE = 64 * 1024
+
+    fn = encode_filename(filename_out)
+    cdef char *cfn = fn
+    with nogil:
+        fp = bgzf_open(cfn, "w")
+    if fp == NULL:
+        raise IOError("could not open '%s' for writing" % filename_out)
+
+    fn = encode_filename(filename_in)
+    fd_src = open(fn, O_RDONLY)
+    if fd_src == 0:
+        raise IOError("could not open '%s' for reading" % filename_in)
+
+    buffer = malloc(WINDOW_SIZE)
+    c = 1
+
+    while c > 0:
+        with nogil:
+            c = read(fd_src, buffer, WINDOW_SIZE)
+            if c > 0:
+                is_empty = False
+                r = bgzf_write(fp, buffer, c)
+        if r < 0:
+            free(buffer)
+            raise OSError("writing failed")
+
+    free(buffer)
+    r = bgzf_close(fp)
+    if r < 0:
+        raise OSError("error %i when writing to file %s" % (r, filename_out))
+
+    r = close(fd_src)
+    # an empty file will return with -1, thus ignore this.
+    if r < 0:
+        if not (r == -1 and is_empty):
+            raise OSError("error %i when closing file %s" % (r, filename_in))
+
+
+def tabix_index(filename,
+                force=False,
+                seq_col=None,
+                start_col=None,
+                end_col=None,
+                preset=None,
+                meta_char="#",
+                zerobased=False,
+                int min_shift=-1,
+                ):
+    '''index tab-separated *filename* using tabix.
+
+    An existing index will not be overwritten unless
+    *force* is set.
+
+    The index will be built from coordinates
+    in columns *seq_col*, *start_col* and *end_col*.
+
+    The contents of *filename* have to be sorted by
+    contig and position - the method does not check
+    if the file is sorted.
+
+    Column indices are 0-based. Coordinates in the file
+    are assumed to be 1-based.
+
+    If *preset* is provided, the column coordinates
+    are taken from a preset. Valid values for preset
+    are "gff", "bed", "sam", "vcf", "psltbl", "pileup".
+
+    Lines beginning with *meta_char* will be skipped.
+
+    If *filename* does not end in ".gz", it will be automatically
+    compressed. The original file will be removed and only the
+    compressed file will be retained.
+
+    If *filename* ends in *gz*, the file is assumed to be already
+    compressed with bgzf.
+
+    *min_shift* sets the minimal interval size to 1<<min_shift;
+    0 for the old tabix index.
+    '''
+
+#         self.buffer = <char*>malloc( buffer_size )
+#         self.size = buffer_size
+#         self.parser = parser
+
+#     def __iter__(self):
+#         return self
+
+#     cdef __cnext__(self):
+
+#         cdef char * b
+#         cdef size_t nbytes
+#         b = self.buffer
+
+#         while not feof( self.infile ):
+#             nbytes = getline( &b, &self.size, self.infile)
+
+#             # stop at first error or eof
+#             if (nbytes == -1): break
+#             # skip comments
+#             if (b[0] == '#'): continue
+
+#             # skip empty lines
+#             if b[0] == '\0' or b[0] == '\n' or b[0] == '\r': continue
+
+#             # make sure that entry is complete
+#             if b[nbytes-1] != '\n' and b[nbytes-1] != '\r':
+#                 result = b
+#                 raise ValueError( "incomplete line at %s" % result )
+
+#             # make sure that this goes fully through C
+#             # otherwise buffer is copied to/from a
+#             # Python object causing segfaults as
+#             # the wrong memory is freed
+#             return self.parser.parse( b, nbytes )
+
+#         raise StopIteration
+
+#     def __dealloc__(self):
+#         free(self.buffer)
+
+#     def __next__(self):
+#         return self.__cnext__()
+
+#########################################################
+#########################################################
+#########################################################
+## Iterators for parsing through unindexed files.
+#########################################################
+# cdef buildGzipError(void *gzfp):
+#    cdef int errnum = 0
+#    cdef char *s = gzerror(gzfp, &errnum)
+#    return "error (%d): %s (%d: %s)" % (errno, strerror(errno), errnum, s)
+
+
+cdef class tabix_file_iterator:
+    '''iterate over a compressed or uncompressed ``infile``.
+    '''
+
+    def __cinit__(self,
+                  infile,
+                  Parser parser,
+                  int buffer_size=65536):
+
+        if infile.closed:
+            raise ValueError("I/O operation on closed file.")
+
+        self.infile = infile
+
+        cdef int fd = PyObject_AsFileDescriptor(infile)
+        if fd == -1:
+            raise ValueError("I/O operation on closed file.")
+
+        self.duplicated_fd = dup(fd)
+
+        # From the manual:
+        # gzopen can be used to read a file which is not in gzip format;
+        # in this case gzread will directly read from the file without decompression.
+        # When reading, this will be detected automatically by looking
+        # for the magic two-byte gzip header.
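+        #
+        # Note on the dup() above: bgzf_close() in __dealloc__ closes
+        # the duplicated descriptor only, so the Python-level ``infile``
+        # object keeps ownership of the original file descriptor.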
+ self.fh = bgzf_dopen(self.duplicated_fd, 'r') + + if self.fh == NULL: + raise IOError('%s' % strerror(errno)) + + self.kstream = ks_init(self.fh) + + self.buffer.s = malloc(buffer_size) + #if self.buffer == NULL: + # raise MemoryError( "tabix_file_iterator: could not allocate %i bytes" % buffer_size) + #self.size = buffer_size + self.parser = parser + + def __iter__(self): + return self + + cdef __cnext__(self): + + cdef char * b + cdef int dret = 0 + cdef int retval = 0 + while 1: + with nogil: + retval = ks_getuntil(self.kstream, '\n', &self.buffer, &dret) + + if retval < 0: + break + #raise IOError('gzip error: %s' % buildGzipError( self.fh )) + + b = self.buffer.s + + # skip comments + if (b[0] == '#'): + continue + + # skip empty lines + if b[0] == '\0' or b[0] == '\n' or b[0] == '\r': + continue + + # gzgets terminates at \n, no need to test + + # parser creates a copy + return self.parser.parse(b, self.buffer.l) + + raise StopIteration + + def __dealloc__(self): + free(self.buffer.s) + ks_destroy(self.kstream) + bgzf_close(self.fh) + + def __next__(self): + return self.__cnext__() + + def next(self): + return self.__cnext__() + + +class tabix_generic_iterator: + '''iterate over ``infile``. + + Permits the use of file-like objects for example from the gzip module. + ''' + def __init__(self, infile, parser): + + self.infile = infile + if self.infile.closed: + raise ValueError("I/O operation on closed file.") + self.parser = parser + + def __iter__(self): + return self + + # cython version - required for python 3 + def __next__(self): + + cdef char * b + cdef char * cpy + cdef size_t nbytes + + encoding = self.parser.get_encoding() + + # note that GzipFile.close() does not close the file + # reading is still possible. + if self.infile.closed: + raise ValueError("I/O operation on closed file.") + + while 1: + + line = self.infile.readline() + if not line: + break + + s = force_bytes(line, encoding) + b = s + nbytes = len(line) + assert b[nbytes] == '\0' + + # skip comments + if b[0] == '#': + continue + + # skip empty lines + if b[0] == '\0' or b[0] == '\n' or b[0] == '\r': + continue + + # make sure that entry is complete + if b[nbytes-1] != '\n' and b[nbytes-1] != '\r': + raise ValueError("incomplete line at %s" % line) + + bytes_cpy = b + cpy = bytes_cpy + + return self.parser(cpy, nbytes) + + raise StopIteration + + # python version - required for python 2.7 + def next(self): + return self.__next__() + +def tabix_iterator(infile, parser): + """return an iterator over all entries in a file. + + Results are returned parsed as specified by the *parser*. If + *parser* is None, the results are returned as an unparsed string. + Otherwise, *parser* is assumed to be a functor that will return + parsed data (see for example :class:`~pysam.asTuple` and + :class:`~pysam.asGTF`). 
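+
+    A short sketch (the filename is illustrative)::
+
+        import gzip
+        import pysam
+
+        with gzip.open("example.gtf.gz", "rb") as infile:
+            for gtf in pysam.tabix_iterator(infile, pysam.asGTF()):
+                print(gtf.contig, gtf.start, gtf.end)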
+ + """ + if PY_MAJOR_VERSION >= 3: + return tabix_generic_iterator(infile, parser) + else: + return tabix_file_iterator(infile, parser) + + # file objects can use C stdio + # used to be: isinstance( infile, file): + # if PY_MAJOR_VERSION >= 3: + # if isinstance( infile, io.IOBase ): + # return tabix_copy_iterator( infile, parser ) + # else: + # return tabix_generic_iterator( infile, parser ) + # else: +# if isinstance( infile, file ): +# return tabix_copy_iterator( infile, parser ) +# else: +# return tabix_generic_iterator( infile, parser ) + +cdef class Tabixfile(TabixFile): + """Tabixfile is deprecated: use TabixFile instead""" + pass + + +__all__ = [ + "tabix_index", + "tabix_compress", + "TabixFile", + "Tabixfile", + "asTuple", + "asGTF", + "asVCF", + "asBed", + "GZIterator", + "GZIteratorHead", + "tabix_iterator", + "tabix_generic_iterator", + "tabix_file_iterator", +] diff --git a/pysam/libctabixproxies.pxd b/pysam/libctabixproxies.pxd new file mode 100644 index 0000000..5317b81 --- /dev/null +++ b/pysam/libctabixproxies.pxd @@ -0,0 +1,59 @@ +#cdef extern from "Python.h": +# ctypedef struct FILE + +from libc.stdint cimport uint8_t, int32_t, uint32_t, int64_t, uint64_t + +cdef class TupleProxy: + + cdef: + char * data + char ** fields + int nfields + int index + int nbytes + int offset + bint is_modified + + cdef encoding + + cpdef int getMaxFields(self) + cpdef int getMinFields(self) +# cdef char * _getindex(self, int idx) + + cdef take(self, char * buffer, size_t nbytes) + cdef present(self, char * buffer, size_t nbytes) + cdef copy(self, char * buffer, size_t nbytes, bint reset=*) + cdef update(self, char * buffer, size_t nbytes) + +cdef class GTFProxy(TupleProxy) : + + cdef: + char * _attributes + cdef bint hasOwnAttributes + + cpdef int getMaxFields(self) + cpdef int getMinFields(self) + cdef char * getAttributes(self) + +cdef class NamedTupleProxy(TupleProxy): + pass + +cdef class BedProxy(NamedTupleProxy): + + cdef: + char * contig + uint32_t start + uint32_t end + int bedfields + + cpdef int getMaxFields(self) + cpdef int getMinFields(self) + cdef update(self, char * buffer, size_t nbytes) + +cdef class VCFProxy(NamedTupleProxy) : + + cdef: + char * contig + uint32_t pos + + cdef update(self, char * buffer, size_t nbytes) diff --git a/pysam/libctabixproxies.pyx b/pysam/libctabixproxies.pyx new file mode 100644 index 0000000..9a8a678 --- /dev/null +++ b/pysam/libctabixproxies.pyx @@ -0,0 +1,827 @@ +from cpython cimport PyBytes_FromStringAndSize + +from libc.stdio cimport printf, feof, fgets +from libc.string cimport strcpy, strlen, memcmp, memcpy, memchr, strstr, strchr +from libc.stdlib cimport free, malloc, calloc, realloc +from libc.stdlib cimport atoi, atol, atof + +from pysam.libcutils cimport force_bytes, force_str, charptr_to_str +from pysam.libcutils cimport encode_filename, from_string_and_size + +import collections + +cdef char *StrOrEmpty(char * buffer): + if buffer == NULL: + return "" + else: return buffer + +cdef int isNew(char * p, char * buffer, size_t nbytes): + """return True if `p` is located within `buffer` of size + `nbytes` + """ + if p == NULL: + return 0 + return not (buffer <= p < buffer + nbytes) + + +cdef class TupleProxy: + '''Proxy class for access to parsed row as a tuple. + + This class represents a table row for fast read-access. + + Access to individual fields is via the [] operator. + + Only read-only access is implemented. 
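+
+    For instance, for a proxy ``row`` built over the line
+    ``"chr1\t100\t200"`` (a hypothetical example)::
+
+        row[0]    # "chr1"
+        row[1]    # "100"  - fields are returned as strings
+        row[-1]   # "200"  - negative indices count from the end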
+ + ''' + + def __cinit__(self, encoding="ascii"): + self.data = NULL + self.fields = NULL + self.index = 0 + self.nbytes = 0 + self.is_modified = 0 + self.nfields = 0 + # start counting at field offset + self.offset = 0 + self.encoding = encoding + + def __dealloc__(self): + cdef int x + if self.is_modified: + for x from 0 <= x < self.nfields: + if isNew(self.fields[x], self.data, self.nbytes): + free(self.fields[x]) + self.fields[x] = NULL + + if self.data != NULL: + free(self.data) + if self.fields != NULL: + free(self.fields) + + def __copy__(self): + if self.is_modified: + raise NotImplementedError( + "copying modified tuples is not implemented") + cdef TupleProxy n = type(self)() + n.copy(self.data, self.nbytes, reset=True) + return n + + def compare(self, TupleProxy other): + '''return -1,0,1, if contents in this are binary + <,=,> to *other* + + ''' + if self.is_modified or other.is_modified: + raise NotImplementedError( + 'comparison of modified TupleProxies is not implemented') + if self.data == other.data: + return 0 + + if self.nbytes < other.nbytes: + return -1 + elif self.nbytes > other.nbytes: + return 1 + return memcmp(self.data, other.data, self.nbytes) + + def __richcmp__(self, TupleProxy other, int op): + if op == 2: # == operator + return self.compare(other) == 0 + elif op == 3: # != operator + return self.compare(other) != 0 + else: + err_msg = "op {0} isn't implemented yet".format(op) + raise NotImplementedError(err_msg) + + cdef take(self, char * buffer, size_t nbytes): + '''start presenting buffer. + + Take ownership of the pointer. + ''' + self.data = buffer + self.nbytes = nbytes + self.update(buffer, nbytes) + + cdef present(self, char * buffer, size_t nbytes): + '''start presenting buffer. + + Do not take ownership of the pointer. + ''' + self.update(buffer, nbytes) + + cdef copy(self, char * buffer, size_t nbytes, bint reset=False): + '''start presenting buffer of size *nbytes*. + + Buffer is a '\0'-terminated string without the '\n'. + + Take a copy of buffer. + ''' + # +1 for '\0' + cdef int s = sizeof(char) * (nbytes + 1) + self.data = malloc(s) + if self.data == NULL: + raise ValueError("out of memory in TupleProxy.copy()") + memcpy(self.data, buffer, s) + + if reset: + for x from 0 <= x < nbytes: + if self.data[x] == '\0': + self.data[x] = '\t' + + self.update(self.data, nbytes) + + cpdef int getMinFields(self): + '''return minimum number of fields.''' + # 1 is not a valid tabix entry, but TupleProxy + # could be more generic. + return 1 + + cpdef int getMaxFields(self): + '''return maximum number of fields. Return + 0 for unknown length.''' + return 0 + + cdef update(self, char * buffer, size_t nbytes): + '''update internal data. + + *buffer* is a \0 terminated string. + + *nbytes* is the number of bytes in buffer (excluding + the \0) + + Update starts work in buffer, thus can be used + to collect any number of fields until nbytes + is exhausted. + + If max_fields is set, the number of fields is initialized to + max_fields. 
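+
+        As a sketch: for the buffer ``"a\tbb\tccc"`` (nbytes == 8),
+        each tab is overwritten with '\0' in place and three field
+        pointers are recorded, so field 0 is "a", field 1 is "bb"
+        and field 2 is "ccc".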
+
+        '''
+        cdef char * pos
+        cdef char * old_pos
+        cdef int field
+        cdef int max_fields, min_fields, x
+
+        assert strlen(buffer) == nbytes, \
+            "length of buffer (%i) != number of bytes (%i)" % (
+            strlen(buffer), nbytes)
+
+        if buffer[nbytes] != 0:
+            raise ValueError("incomplete line at %s" % buffer)
+
+        #################################
+        # remove line breaks and feeds and update number of bytes
+        x = nbytes - 1
+        while x > 0 and (buffer[x] == '\n' or buffer[x] == '\r'):
+            buffer[x] = '\0'
+            x -= 1
+        self.nbytes = x + 1
+
+        #################################
+        # clear data
+        # free the individual field copies first - the fields array
+        # is dereferenced while doing so, hence it must be freed last
+        for field from 0 <= field < self.nfields:
+            if isNew(self.fields[field], self.data, self.nbytes):
+                free(self.fields[field])
+
+        if self.fields != NULL:
+            free(self.fields)
+            self.fields = NULL
+
+        self.is_modified = self.nfields = 0
+
+        #################################
+        # allocate new
+        max_fields = self.getMaxFields()
+        # pre-count fields - better would be
+        # to guess or dynamically grow
+        if max_fields == 0:
+            for x from 0 <= x < nbytes:
+                if buffer[x] == '\t':
+                    max_fields += 1
+            max_fields += 1
+
+        self.fields = <char **>calloc(max_fields, sizeof(char *))
+        if self.fields == NULL:
+            raise ValueError("out of memory in TupleProxy.update()")
+
+        #################################
+        # start filling
+        field = 0
+        self.fields[field] = pos = buffer
+        field += 1
+        old_pos = pos
+        while 1:
+
+            pos = <char *>memchr(pos, '\t', nbytes)
+            if pos == NULL:
+                break
+            if field >= max_fields:
+                raise ValueError(
+                    "parsing error: more than %i fields in line: %s" %
+                    (max_fields, buffer))
+
+            pos[0] = '\0'
+            pos += 1
+            self.fields[field] = pos
+            field += 1
+            nbytes -= pos - old_pos
+            if nbytes < 0:
+                break
+            old_pos = pos
+        self.nfields = field
+        if self.nfields < self.getMinFields():
+            raise ValueError(
+                "parsing error: fewer than %i fields in line: %s" %
+                (self.getMinFields(), buffer))
+
+    def _getindex(self, int index):
+        '''return item at idx index'''
+        cdef int i = index
+        if i < 0:
+            i += self.nfields
+        if i < 0:
+            raise IndexError("list index out of range")
+        # apply offset - separating a fixed number
+        # of fields from a variable number such as in VCF
+        i += self.offset
+        if i >= self.nfields:
+            raise IndexError(
+                "list index out of range %i >= %i" %
+                (i, self.nfields))
+        return force_str(self.fields[i], self.encoding)
+
+    def __getitem__(self, key):
+        if type(key) == int:
+            return self._getindex(key)
+        # slice object
+        start, end, step = key.indices(self.nfields)
+        result = []
+        for index in range(start, end, step):
+            result.append(self._getindex(index))
+        return result
+
+    def _setindex(self, index, value):
+        '''set item at idx index.'''
+        cdef int idx = index
+        if idx < 0:
+            raise IndexError("list index out of range")
+        if idx >= self.nfields:
+            raise IndexError("list index out of range")
+
+        if isNew(self.fields[idx], self.data, self.nbytes):
+            free(self.fields[idx])
+
+        self.is_modified = 1
+
+        if value is None:
+            self.fields[idx] = NULL
+            return
+
+        # conversion with error checking
+        value = force_bytes(value)
+        cdef char * tmp = value
+        self.fields[idx] = <char *>malloc((strlen(tmp) + 1) * sizeof(char))
+        if self.fields[idx] == NULL:
+            raise ValueError("out of memory")
+        strcpy(self.fields[idx], tmp)
+
+    def __setitem__(self, index, value):
+        '''set item at *index* to *value*'''
+        cdef int i = index
+        if i < 0:
+            i += self.nfields
+        i += self.offset
+
+        self._setindex(i, value)
+
+    def __len__(self):
+        return self.nfields
+
+    def __iter__(self):
+        self.index = 0
+        return self
+
+    def __next__(self):
+        """python version of next().
+ """ + if self.index >= self.nfields: + raise StopIteration + cdef char * retval = self.fields[self.index] + self.index += 1 + if retval == NULL: + return None + else: + return force_str(retval, self.encoding) + + def __str__(self): + '''return original data''' + # copy and replace \0 bytes with \t characters + cdef char * cpy + if self.is_modified: + # todo: treat NULL values + result = [] + for x in xrange(0, self.nfields): + result.append(StrOrEmpty(self.fields[x]).decode(self.encoding)) + return "\t".join(result) + else: + cpy = calloc(sizeof(char), self.nbytes+1) + if cpy == NULL: + raise ValueError("out of memory") + memcpy(cpy, self.data, self.nbytes+1) + for x from 0 <= x < self.nbytes: + if cpy[x] == '\0': + cpy[x] = '\t' + result = cpy[:self.nbytes] + free(cpy) + r = result.decode(self.encoding) + return r + +def toDot(v): + '''convert value to '.' if None''' + if v is None: + return "." + else: + return str(v) + +def quote(v): + '''return a quoted attribute.''' + if isinstance(v, str): + return '"%s"' % v + else: + return str(v) + + +cdef class GTFProxy(TupleProxy): + '''Proxy class for access to GTF fields. + + This class represents a GTF entry for fast read-access. + Write-access has been added as well, though some care must + be taken. If any of the string fields (contig, source, ...) + are set, the new value is tied to the lifetime of the + argument that was supplied. + + The only exception is the attributes field when set from + a dictionary - this field will manage its own memory. + ''' + + def __cinit__(self): + # automatically calls TupleProxy.__cinit__ + self.hasOwnAttributes = False + self._attributes = NULL + + def __dealloc__(self): + # automatically calls TupleProxy.__dealloc__ + if self.hasOwnAttributes: + free(self._attributes) + + cpdef int getMinFields(self): + '''return minimum number of fields.''' + return 9 + + cpdef int getMaxFields(self): + '''return max number of fields.''' + return 9 + + property contig: + '''contig of feature.''' + def __get__(self): + return self._getindex(0) + def __set__(self, value): + self._setindex(0, value) + + property source: + '''feature source.''' + def __get__(self): + return self._getindex(1) + def __set__(self, value): + if value is None: + value = "." + self._setindex(1, value) + + property feature: + '''feature name.''' + def __get__(self): + return self._getindex(2) + def __set__(self, value): + if value is None: + value = "." + self._setindex(2, value) + + property start: + '''feature start (in 0-based open/closed coordinates).''' + def __get__(self ): + return int( self._getindex(3)) - 1 + def __set__(self, value ): + self._setindex(3, str(value+1)) + + property end: + '''feature end (in 0-based open/closed coordinates).''' + def __get__(self): + return int(self._getindex(4)) + def __set__(self, value): + self._setindex(4, str(value)) + + property score: + '''feature score.''' + def __get__(self): + v = self._getindex(5) + if v == "" or v[0] == '.': + return None + else: + return float(v) + + def __set__(self, value): + if value is None: + value = "." + self._setindex(5, str(value)) + + property strand: + '''feature strand.''' + def __get__(self): + return self._getindex(6) + def __set__(self, value ): + if value is None: + value = "." + self._setindex(6, value) + + property frame: + '''feature frame.''' + def __get__(self): + v = self._getindex(7) + if v == "" or v[0] == '.': + return v + else: + return int(v) + + def __set__(self, value): + if value is None: + value = "." 
+ self._setindex(7, str(value)) + + property attributes: + '''feature attributes (as a string).''' + def __get__(self): + if self.hasOwnAttributes: + return force_str(self._attributes) + else: + return force_str(self._getindex(8)) + def __set__( self, value): + if self.hasOwnAttributes: + free(self._attributes) + self._attributes = NULL + self.hasOwnAttributes = False + self._setindex(8, value) + + cdef char * getAttributes(self): + '''return pointer to attributes.''' + cdef char * attributes + if self.hasOwnAttributes: + attributes = self._attributes + else: + attributes = self.fields[8] + if attributes == NULL: + raise KeyError("no attributes defined GTF entry") + return attributes + + def asDict(self): + """parse attributes - return as dict + """ + + # remove comments + attributes = self.attributes + + # separate into fields + # Fields might contain a ";", for example in ENSEMBL GTF file + # for mouse, v78: + # ...; transcript_name "TXNRD2;-001"; .... + # The current heuristic is to split on a semicolon followed by a + # space, see also http://mblab.wustl.edu/GTF22.html + + # Remove white space to prevent a last empty field. + fields = [x.strip() for x in attributes.strip().split("; ")] + + result = collections.OrderedDict() + + for f in fields: + + # strip semicolon (GTF files without a space after the last semicolon) + if f.endswith(";"): + f = f[:-1] + + # split at most once in order to avoid separating + # multi-word values + d = [x.strip() for x in f.split(" ", 1)] + + n,v = d[0], d[1] + if len(d) > 2: + v = d[1:] + + if v[0] == '"' and v[-1] == '"': + v = v[1:-1] + else: + ## try to convert to a value + try: + v = float(v) + v = int(v) + except ValueError: + pass + except TypeError: + pass + + result[n] = v + + return result + + def fromDict(self, d): + '''set attributes from a dictionary.''' + cdef char * p + cdef int l + + # clean up if this field is set twice + if self.hasOwnAttributes: + free(self._attributes) + + aa = [] + for k,v in d.items(): + if isinstance(v, str): + aa.append( '%s "%s"' % (k,v) ) + else: + aa.append( '%s %s' % (k,str(v)) ) + + a = force_bytes("; ".join(aa) + ";") + p = a + l = len(a) + self._attributes = calloc(l + 1, sizeof(char)) + if self._attributes == NULL: + raise ValueError("out of memory") + memcpy(self._attributes, p, l) + + self.hasOwnAttributes = True + self.is_modified = True + + def __str__(self): + cdef char * cpy + cdef int x + + if self.is_modified: + return "\t".join( + (self.contig, + self.source, + self.feature, + str(self.start+1), + str(self.end), + toDot(self.score), + toDot(self.strand), + toDot(self.frame), + self.attributes)) + else: + return TupleProxy.__str__(self) + + def invert(self, int lcontig): + '''invert coordinates to negative strand coordinates + + This method will only act if the feature is on the + negative strand.''' + + if self.strand[0] == '-': + start = min(self.start, self.end) + end = max(self.start, self.end) + self.start, self.end = lcontig - end, lcontig - start + + def keys(self): + '''return a list of attributes defined in this entry.''' + r = self.attributes + return [x.strip().split(" ")[0] + # separator is ';' followed by space + for x in r.split("; ") if x.strip() != ''] + + def __getitem__(self, key): + return self.__getattr__(key) + + def __getattr__(self, item): + """Generic lookup of attribute from GFF/GTF attributes + Only called if there *isn't* an attribute with this name + """ + cdef char * start + cdef char * query + cdef char * cpy + cdef char * end + cdef int l + + # + # important to use the 
getAttributes function. + # Using the self.attributes property to access + # the attributes caused a hard-to-trace bug + # in which fields in the attribute string were + # set to 0. + # Running through valgrind complained that + # memory was accessed in the memory field + # that has been released. It is not clear + # why this happened and might be a cython bug + # (Version 0.16). The valgrind warnings + # disappeard after accessing the C data structures + # directly and so did the bug. + cdef char * attributes = self.getAttributes() + if attributes == NULL: + raise KeyError("key %s not found, no attributes" % item) + + # add space in order to make sure + # to not pick up a field that is a prefix of another field + r = force_bytes(item + " ") + query = r + start = strstr(attributes, query) + + if start == NULL: + raise AttributeError("'GTFProxy' has no attribute '%s'" % item) + + start += strlen(query) + # skip gaps before + while start[0] == ' ': + start += 1 + + if start[0] == '"': + start += 1 + end = start + while end[0] != '\0' and end[0] != '"': + end += 1 + l = end - start + result = force_str(PyBytes_FromStringAndSize(start, l), + self.encoding) + return result + else: + return force_str(start, self.encoding) + + def setAttribute(self, name, value): + '''convenience method to set an attribute.''' + r = self.asDict() + r[name] = value + self.fromDict(r) + + def __cmp__(self, other): + return (self.contig, self.strand, self.start) < \ + (other.contig, other.strand, other.start) + + # python 3 compatibility + def __richcmp__(GTFProxy self, GTFProxy other, int op): + if op == 0: + return (self.contig, self.strand, self.start) < \ + (other.contig, other.strand, other.start) + elif op == 1: + return (self.contig, self.strand, self.start) <= \ + (other.contig, other.strand, other.start) + elif op == 2: + return self.compare(other) == 0 + elif op == 3: + return self.compare(other) != 0 + else: + err_msg = "op {0} isn't implemented yet".format(op) + raise NotImplementedError(err_msg) + + +cdef class NamedTupleProxy(TupleProxy): + + map_key2field = {} + + def __setattr__(self, key, value): + '''set attribute.''' + cdef int idx + idx, f = self.map_key2field[key] + if self.nfields < idx: + raise KeyError("field %s not set" % key) + TupleProxy.__setitem__(self, idx, str(value)) + + def __getattr__(self, key): + cdef int idx + idx, f = self.map_key2field[key] + if self.nfields < idx: + raise KeyError("field %s not set" % key) + if f == str: + return force_str(self.fields[idx], + self.encoding) + return f(self.fields[idx]) + + +cdef class BedProxy(NamedTupleProxy): + '''Proxy class for access to Bed fields. + + This class represents a BED entry for fast read-access. + ''' + map_key2field = { + 'contig' : (0, str), + 'start' : (1, int), + 'end' : (2, int), + 'name' : (3, str), + 'score' : (4, float), + 'strand' : (5, str), + 'thickStart' : (6, int), + 'thickEnd' : (7, int), + 'itemRGB' : (8, str), + 'blockCount': (9, int), + 'blockSizes': (10, str), + 'blockStarts': (11, str), } + + cpdef int getMinFields(self): + '''return minimum number of fields.''' + return 3 + + cpdef int getMaxFields(self): + '''return max number of fields.''' + return 12 + + cdef update(self, char * buffer, size_t nbytes): + '''update internal data. + + nbytes does not include the terminal '\0'. 
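+
+        As a sketch: after update() on the line ``"chr1\t9\t20\tname"``
+        (hypothetical), ``contig`` is "chr1", ``start`` is 9 and
+        ``end`` is 20 via the atoi() conversions below, and
+        ``bedfields`` is 4.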
+ ''' + TupleProxy.update(self, buffer, nbytes) + + if self.nfields < 3: + raise ValueError( + "bed format requires at least three columns") + + # determines bed format + self.bedfields = self.nfields + + # do automatic conversion + self.contig = self.fields[0] + self.start = atoi(self.fields[1]) + self.end = atoi(self.fields[2]) + + # __setattr__ in base class seems to take precedence + # hence implement setters in __setattr__ + #property start: + # def __get__( self ): return self.start + #property end: + # def __get__( self ): return self.end + + def __str__(self): + + cdef int save_fields = self.nfields + # ensure fields to use correct format + self.nfields = self.bedfields + retval = TupleProxy.__str__(self) + self.nfields = save_fields + return retval + + def __setattr__(self, key, value ): + '''set attribute.''' + if key == "start": + self.start = value + elif key == "end": + self.end = value + + cdef int idx + idx, f = self.map_key2field[key] + TupleProxy._setindex(self, idx, str(value) ) + +cdef class VCFProxy(NamedTupleProxy): + '''Proxy class for access to VCF fields. + + The genotypes are accessed via a numeric index. + Sample headers are not available. + ''' + map_key2field = { + 'contig' : (0, str), + 'pos' : (1, int), + 'id' : (2, str), + 'ref' : (3, str), + 'alt' : (4, str), + 'qual' : (5, str), + 'filter' : (6, str), + 'info' : (7, str), + 'format' : (8, str) } + + def __cinit__(self): + # automatically calls TupleProxy.__cinit__ + # start indexed access at genotypes + self.offset = 9 + + cdef update(self, char * buffer, size_t nbytes): + '''update internal data. + + nbytes does not include the terminal '\0'. + ''' + TupleProxy.update(self, buffer, nbytes) + + self.contig = self.fields[0] + # vcf counts from 1 - correct here + self.pos = atoi(self.fields[1]) - 1 + + def __len__(self): + '''return number of genotype fields.''' + return max(0, self.nfields - 9) + + property pos: + '''feature end (in 0-based open/closed coordinates).''' + def __get__(self): + return self.pos + + def __setattr__(self, key, value): + '''set attribute.''' + if key == "pos": + self.pos = value + value += 1 + + cdef int idx + idx, f = self.map_key2field[key] + TupleProxy._setindex(self, idx, str(value)) + diff --git a/pysam/libcutils.pxd b/pysam/libcutils.pxd new file mode 100644 index 0000000..81e544a --- /dev/null +++ b/pysam/libcutils.pxd @@ -0,0 +1,38 @@ +######################################################################### +# Utility functions used across pysam +######################################################################### +cimport cython +from cpython cimport array as c_array + +cpdef parse_region(reference=*, start=*, end=*, region=*) + +######################################################################### +# Utility functions for quality string conversions + +cpdef c_array.array qualitystring_to_array(input_str, int offset=*) +cpdef array_to_qualitystring(c_array.array arr, int offset=*) +cpdef qualities_to_qualitystring(qualities, int offset=*) + +######################################################################## +######################################################################## +######################################################################## +## Python 3 compatibility functions +######################################################################## +cdef charptr_to_str(const char *s, encoding=*) +cdef bytes charptr_to_bytes(const char *s, encoding=*) +cdef charptr_to_str_w_len(const char* s, size_t n, encoding=*) +cdef force_str(object s, 
encoding=*) +cdef bytes force_bytes(object s, encoding=*) +cdef bytes encode_filename(object filename) +cdef from_string_and_size(const char *s, size_t length) + +cdef extern from "pysam_util.h": + + int samtools_main(int argc, char *argv[]) + int bcftools_main(int argc, char *argv[]) + void pysam_set_stderr(int fd) + void pysam_unset_stderr() + void pysam_set_stdout(int fd) + void pysam_set_stdout_fn(const char *) + void pysam_unset_stdout() + void set_optind(int) diff --git a/pysam/libcutils.pyx b/pysam/libcutils.pyx new file mode 100644 index 0000000..80bd9e4 --- /dev/null +++ b/pysam/libcutils.pyx @@ -0,0 +1,375 @@ +import types +import sys +import string +import re +import tempfile +import os +import io +from contextlib import contextmanager + +from cpython.version cimport PY_MAJOR_VERSION, PY_MINOR_VERSION +from cpython cimport PyBytes_Check, PyUnicode_Check +from cpython cimport array as c_array +from libc.stdlib cimport calloc, free +from libc.string cimport strncpy +from libc.stdio cimport fprintf, stderr, fflush +from libc.stdio cimport stdout as c_stdout +from posix.fcntl cimport open as c_open, O_WRONLY + +##################################################################### +# hard-coded constants +cdef int MAX_POS = 2 << 29 + +################################################################# +# Utility functions for quality string conversions +cpdef c_array.array qualitystring_to_array(input_str, int offset=33): + """convert a qualitystring to an array of quality values.""" + if input_str is None: + return None + qs = force_bytes(input_str) + cdef char i + return c_array.array('B', [i - offset for i in qs]) + + +cpdef array_to_qualitystring(c_array.array qualities, int offset=33): + """convert an array of quality values to a string.""" + if qualities is None: + return None + cdef int x + + cdef c_array.array result + result = c_array.clone(qualities, len(qualities), zero=False) + + for x from 0 <= x < len(qualities): + result[x] = qualities[x] + offset + return force_str(result.tostring()) + + +cpdef qualities_to_qualitystring(qualities, int offset=33): + """convert a list or array of quality scores to the string + representation used in the SAM format. + + Parameters + ---------- + offset : int + offset to be added to the quality scores to arrive at + the characters of the quality string (default=33). 
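+
+        For example, the quality values [0, 10, 20, 30] with the
+        default offset of 33 map to chr(33), chr(43), chr(53) and
+        chr(63), i.e. the string "!+5?".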
+ + Returns + ------- + string + a quality string + + """ + cdef char x + if qualities is None: + return None + elif isinstance(qualities, c_array.array): + return array_to_qualitystring(qualities, offset=offset) + else: + # tuples and lists + return force_str("".join([chr(x + offset) for x in qualities])) + + +######################################################################## +######################################################################## +######################################################################## +## Python 3 compatibility functions +######################################################################## + +cdef bint IS_PYTHON3 = PY_MAJOR_VERSION >= 3 + +cdef from_string_and_size(const char* s, size_t length): + if IS_PYTHON3: + return s[:length].decode("ascii") + else: + return s[:length] + + +# filename encoding (adapted from lxml.etree.pyx) +cdef str FILENAME_ENCODING = sys.getfilesystemencoding() or sys.getdefaultencoding() or 'ascii' + + +cdef bytes encode_filename(object filename): + """Make sure a filename is 8-bit encoded (or None).""" + if filename is None: + return None + elif PY_MAJOR_VERSION >= 3 and PY_MINOR_VERSION >= 2: + # Added to support path-like objects + return os.fsencode(filename) + elif PyBytes_Check(filename): + return filename + elif PyUnicode_Check(filename): + return filename.encode(FILENAME_ENCODING) + else: + raise TypeError("Argument must be string or unicode.") + + +cdef bytes force_bytes(object s, encoding="ascii"): + """convert string or unicode object to bytes, assuming + ascii encoding. + """ + if s is None: + return None + elif PyBytes_Check(s): + return s + elif PyUnicode_Check(s): + return s.encode(encoding) + else: + raise TypeError("Argument must be string, bytes or unicode.") + + +cdef charptr_to_str(const char* s, encoding="ascii"): + if s == NULL: + return None + if PY_MAJOR_VERSION < 3: + return s + else: + return s.decode(encoding) + + +cdef charptr_to_str_w_len(const char* s, size_t n, encoding="ascii"): + if s == NULL: + return None + if PY_MAJOR_VERSION < 3: + return s[:n] + else: + return s[:n].decode(encoding) + + +cdef bytes charptr_to_bytes(const char* s, encoding="ascii"): + if s == NULL: + return None + else: + return s + + +cdef force_str(object s, encoding="ascii"): + """Return s converted to str type of current Python + (bytes in Py2, unicode in Py3)""" + if s is None: + return None + if PY_MAJOR_VERSION < 3: + return s + elif PyBytes_Check(s): + return s.decode(encoding) + else: + # assume unicode + return s + + +cpdef parse_region(reference=None, + start=None, + end=None, + region=None): + """parse alternative ways to specify a genomic region. A region can + either be specified by :term:`reference`, `start` and + `end`. `start` and `end` denote 0-based, half-open + intervals. + + Alternatively, a samtools :term:`region` string can be + supplied. + + If any of the coordinates are missing they will be replaced by the + minimum (`start`) or maximum (`end`) coordinate. + + Note that region strings are 1-based, while `start` and `end` denote + an interval in python coordinates. + + Returns + ------- + + tuple : a tuple of `reference`, `start` and `end`. + + Raises + ------ + + ValueError + for invalid or out of bounds regions. 
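+
+    For example (sketch)::
+
+        parse_region(region="chr1:10-20")    # (b"chr1", 9, 20)
+        parse_region("chr1", 9, 20)          # (b"chr1", 9, 20)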
+ + """ + cdef int rtid + cdef long long rstart + cdef long long rend + + rtid = -1 + rstart = 0 + rend = MAX_POS + if start != None: + try: + rstart = start + except OverflowError: + raise ValueError('start out of range (%i)' % start) + + if end != None: + try: + rend = end + except OverflowError: + raise ValueError('end out of range (%i)' % end) + + if region: + region = force_str(region) + parts = re.split("[:-]", region) + reference = parts[0] + if len(parts) >= 2: + rstart = int(parts[1]) - 1 + if len(parts) >= 3: + rend = int(parts[2]) + + if not reference: + return None, 0, 0 + + if not 0 <= rstart < MAX_POS: + raise ValueError('start out of range (%i)' % rstart) + if not 0 <= rend <= MAX_POS: + raise ValueError('end out of range (%i)' % rend) + if rstart > rend: + raise ValueError( + 'invalid region: start (%i) > end (%i)' % (rstart, rend)) + + return force_bytes(reference), rstart, rend + + +def _pysam_dispatch(collection, + method, + args=None, + catch_stdout=True, + save_stdout=None): + '''call ``method`` in samtools/bcftools providing arguments in args. + + Catching of stdout can be turned off by setting *catch_stdout* to + False. + + ''' + + if method == "index": + if not os.path.exists(args[0]): + raise IOError("No such file or directory: '%s'" % args[0]) + + if args is None: + args = [] + else: + args = list(args) + + # redirect stderr to file + stderr_h, stderr_f = tempfile.mkstemp() + pysam_set_stderr(stderr_h) + + # redirect stdout to file + if save_stdout: + stdout_f = save_stdout + stdout_h = c_open(force_bytes(stdout_f), + O_WRONLY) + if stdout_h == -1: + raise OSError("error while opening {} for writing".format(stdout_f)) + + pysam_set_stdout_fn(force_bytes(stdout_f)) + pysam_set_stdout(stdout_h) + elif catch_stdout: + stdout_h, stdout_f = tempfile.mkstemp() + + MAP_STDOUT_OPTIONS = { + "samtools": { + "view": "-o {}", + "mpileup": "-o {}", + "depad": "-o {}", + "calmd": "", # uses pysam_stdout_fn + }, + "bcftools": {} + } + + stdout_option = None + if collection == "bcftools": + # in bcftools, most methods accept -o, the exceptions + # are below: + if method not in ("index", "roh", "stats"): + stdout_option = "-o {}" + elif method in MAP_STDOUT_OPTIONS[collection]: + # special case - samtools view -c outputs on stdout + if not(method == "view" and "-c" in args): + stdout_option = MAP_STDOUT_OPTIONS[collection][method] + + if stdout_option is not None: + os.close(stdout_h) + pysam_set_stdout_fn(force_bytes(stdout_f)) + args.extend(stdout_option.format(stdout_f).split(" ")) + else: + pysam_set_stdout(stdout_h) + else: + pysam_set_stdout_fn("-") + + # setup the function call to samtools/bcftools main + cdef char ** cargs + cdef int i, n, retval, l + n = len(args) + method = force_bytes(method) + collection = force_bytes(collection) + args = [force_bytes(a) for a in args] + + # allocate two more for first (dummy) argument (contains command) + cdef int extra_args = 0 + if method == b"index": + extra_args = 1 + # add extra arguments for commands accepting optional arguments + # such as 'samtools index x.bam [out.index]' + cargs = calloc(n + 2 + extra_args, sizeof(char *)) + cargs[0] = collection + cargs[1] = method + + # create copies of strings - getopt for long options permutes + # arguments + for i from 0 <= i < n: + l = len(args[i]) + cargs[i + 2] = calloc(l + 1, sizeof(char)) + strncpy(cargs[i + 2], args[i], l) + + # reset getopt. 
On OsX there getopt reset is different + # between getopt and getopt_long + if method in [b'index', b'cat', b'quickcheck', + b'faidx', b'kprobaln']: + set_optind(1) + else: + set_optind(0) + + # call samtools/bcftools + if collection == b"samtools": + retval = samtools_main(n + 2, cargs) + elif collection == b"bcftools": + retval = bcftools_main(n + 2, cargs) + + for i from 0 <= i < n: + free(cargs[i + 2]) + free(cargs) + + # get error messages + def _collect(fn): + out = [] + try: + with open(fn, "r") as inf: + out = inf.read() + except UnicodeDecodeError: + with open(fn, "rb") as inf: + # read binary output + out = inf.read() + finally: + os.remove(fn) + return out + + pysam_unset_stderr() + out_stderr = _collect(stderr_f) + + if save_stdout: + pysam_unset_stdout() + out_stdout = None + elif catch_stdout: + pysam_unset_stdout() + out_stdout = _collect(stdout_f) + else: + out_stdout = None + + return retval, out_stderr, out_stdout + + +__all__ = ["qualitystring_to_array", + "array_to_qualitystring", + "qualities_to_qualitystring"] diff --git a/pysam/libcvcf.pxd b/pysam/libcvcf.pxd new file mode 100644 index 0000000..139597f --- /dev/null +++ b/pysam/libcvcf.pxd @@ -0,0 +1,2 @@ + + diff --git a/pysam/libcvcf.pyx b/pysam/libcvcf.pyx new file mode 100644 index 0000000..956f8a5 --- /dev/null +++ b/pysam/libcvcf.pyx @@ -0,0 +1,1203 @@ +# cython: embedsignature=True +# +# Code to read, write and edit VCF files +# +# VCF lines are encoded as a dictionary with these keys (note: all lowercase): +# 'chrom': string +# 'pos': integer +# 'id': string +# 'ref': string +# 'alt': list of strings +# 'qual': integer +# 'filter': None (missing value), or list of keys (strings); empty list parsed as ["PASS"] +# 'info': dictionary of values (see below) +# 'format': list of keys (strings) +# sample keys: dictionary of values (see below) +# +# The sample keys are accessible through vcf.getsamples() +# +# A dictionary of values contains value keys (defined in ##INFO or +# ##FORMAT lines) which map to a list, containing integers, floats, +# strings, or characters. Missing values are replaced by a particular +# value, often -1 or . +# +# Genotypes are not stored as a string, but as a list of 1 or 3 +# elements (for haploid and diploid samples), the first (and last) the +# integer representing an allele, and the second the separation +# character. Note that there is just one genotype per sample, but for +# consistency the single element is stored in a list. +# +# Header lines other than ##INFO, ##FORMAT and ##FILTER are stored as +# (key, value) pairs and are accessible through getheader() +# +# The VCF class can be instantiated with a 'regions' variable +# consisting of tuples (chrom,start,end) encoding 0-based half-open +# segments. Only variants with a position inside the segment will be +# parsed. A regions parser is available under parse_regions. +# +# When instantiated, a reference can be passed to the VCF class. This +# may be any class that supports a fetch(chrom, start, end) method. +# +# NOTE: the position that is returned to Python is 0-based, NOT +# 1-based as in the VCF file. +# NOTE: There is also preliminary VCF functionality in the VariantFile class. 
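+#
+# Example (sketch): given a header that defines DP as an Integer
+# INFO field, the line
+#     chr1  101  .  A  T,C  50  PASS  DP=10
+# parses to a record with pos == 100 (0-based), ref == "A",
+# alt == ["T", "C"], filter == [] (PASS) and info == {"DP": [10]}.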
+# +# TODO: +# only v4.0 writing is complete; alleles are not converted to v3.3 format +# + +from collections import namedtuple, defaultdict +from operator import itemgetter +import sys, re, copy, bisect + +from libc.stdlib cimport atoi +from libc.stdint cimport int8_t, int16_t, int32_t, int64_t +from libc.stdint cimport uint8_t, uint16_t, uint32_t, uint64_t + +cimport pysam.libctabix as libctabix +cimport pysam.libctabixproxies as libctabixproxies + +from pysam.libcutils cimport force_str + +import pysam + +gtsRegEx = re.compile("[|/\\\\]") +alleleRegEx = re.compile('^[ACGTN]+$') + +# Utility function. Uses 0-based coordinates +def get_sequence(chrom, start, end, fa): + # obtain sequence from .fa file, without truncation + if end<=start: return "" + if not fa: return "N"*(end-start) + if start<0: return "N"*(-start) + get_sequence(chrom, 0, end, fa).upper() + sequence = fa.fetch(chrom, start, end).upper() + if len(sequence) < end-start: sequence += "N"*(end-start-len(sequence)) + return sequence + +# Utility function. Parses a region string +def parse_regions( string ): + result = [] + for r in string.split(','): + elts = r.split(':') + chrom, start, end = elts[0], 0, 3000000000 + if len(elts)==1: pass + elif len(elts)==2: + if len(elts[1])>0: + ielts = elts[1].split('-') + if len(ielts) != 2: ValueError("Don't understand region string '%s'" % r) + try: start, end = int(ielts[0])-1, int(ielts[1]) + except: raise ValueError("Don't understand region string '%s'" % r) + else: + raise ValueError("Don't understand region string '%s'" % r) + result.append( (chrom,start,end) ) + return result + + +FORMAT = namedtuple('FORMAT','id numbertype number type description missingvalue') + +########################################################################################################### +# +# New class +# +########################################################################################################### + +cdef class VCFRecord(libctabixproxies.TupleProxy): + '''vcf record. + + initialized from data and vcf meta + ''' + + cdef vcf + cdef char * contig + cdef uint32_t pos + + def __init__(self, vcf): + self.vcf = vcf + self.encoding = vcf.encoding + + # if len(data) != len(self.vcf._samples): + # self.vcf.error(str(data), + # self.BAD_NUMBER_OF_COLUMNS, + # "expected %s for %s samples (%s), got %s" % \ + # (len(self.vcf._samples), + # len(self.vcf._samples), + # self.vcf._samples, + # len(data))) + + def __cinit__(self, vcf): + # start indexed access at genotypes + self.offset = 9 + + self.vcf = vcf + self.encoding = vcf.encoding + + def error(self, line, error, opt=None): + '''raise error.''' + # pass to vcf file for error handling + return self.vcf.error(line, error, opt) + + cdef update(self, char * buffer, size_t nbytes): + '''update internal data. + + nbytes does not include the terminal '\0'. 
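+
+        As a sketch: if the second column of the row holds "1001",
+        ``self.pos`` is set to 1000, implementing the 1-based to
+        0-based conversion described in the module header.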
+ ''' + libctabixproxies.TupleProxy.update(self, buffer, nbytes) + + self.contig = self.fields[0] + # vcf counts from 1 - correct here + self.pos = atoi(self.fields[1]) - 1 + + def __len__(self): + return max(0, self.nfields - 9) + + property contig: + def __get__(self): return self.contig + + property pos: + def __get__(self): return self.pos + + property id: + def __get__(self): return self.fields[2] + + property ref: + def __get__(self): + return self.fields[3] + + property alt: + def __get__(self): + # convert v3.3 to v4.0 alleles below + alt = self.fields[4] + if alt == ".": alt = [] + else: alt = alt.upper().split(',') + return alt + + property qual: + def __get__(self): + qual = self.fields[5] + if qual == b".": qual = -1 + else: + try: qual = float(qual) + except: self.vcf.error(str(self),self.QUAL_NOT_NUMERICAL) + return qual + + property filter: + def __get__(self): + f = self.fields[6] + # postpone checking that filters exist. Encode missing filter or no filtering as empty list + if f == b"." or f == b"PASS" or f == b"0": return [] + else: return f.split(';') + + property info: + def __get__(self): + col = self.fields[7] + # dictionary of keys, and list of values + info = {} + if col != b".": + for blurp in col.split(';'): + elts = blurp.split('=') + if len(elts) == 1: v = None + elif len(elts) == 2: v = elts[1] + else: self.vcf.error(str(self),self.ERROR_INFO_STRING) + info[elts[0]] = self.vcf.parse_formatdata(elts[0], v, self.vcf._info, str(self.vcf)) + return info + + property format: + def __get__(self): + return self.fields[8].split(':') + + property samples: + def __get__(self): + return self.vcf._samples + + def __getitem__(self, key): + + # parse sample columns + values = self.fields[self.vcf._sample2column[key]].split(':') + alt = self.alt + format = self.format + + if len(values) > len(format): + self.vcf.error(str(self.line),self.BAD_NUMBER_OF_VALUES,"(found %s values in element %s; expected %s)" %\ + (len(values),key,len(format))) + + result = {} + for idx in range(len(format)): + expected = self.vcf.get_expected(format[idx], self.vcf._format, alt) + if idx < len(values): value = values[idx] + else: + if expected == -1: value = "." 
+                else: value = ",".join(["."]*expected)
+
+            result[format[idx]] = self.vcf.parse_formatdata(format[idx], value, self.vcf._format, str(self.data))
+            if expected != -1 and len(result[format[idx]]) != expected:
+                self.vcf.error(str(self.data),self.BAD_NUMBER_OF_PARAMETERS,
+                               "id=%s, expected %s parameters, got %s" % (format[idx],expected,result[format[idx]]))
+                if len(result[format[idx]]) < expected: result[format[idx]] += [result[format[idx]][-1]]*(expected-len(result[format[idx]]))
+                result[format[idx]] = result[format[idx]][:expected]
+
+        return result
+
+
+cdef class asVCFRecord(libctabix.Parser):
+    '''converts a :term:`tabix row` into a VCF record.'''
+    cdef vcffile
+    def __init__(self, vcffile):
+        self.vcffile = vcffile
+
+    cdef parse(self, char * buffer, int len):
+        cdef VCFRecord r
+        r = VCFRecord(self.vcffile)
+        r.copy(buffer, len)
+        return r
+
+class VCF(object):
+
+    # types
+    NT_UNKNOWN = 0
+    NT_NUMBER = 1
+    NT_ALLELES = 2
+    NT_NR_ALLELES = 3
+    NT_GENOTYPES = 4
+    NT_PHASED_GENOTYPES = 5
+
+    _errors = { 0:"UNKNOWN_FORMAT_STRING:Unknown file format identifier",
+                1:"BADLY_FORMATTED_FORMAT_STRING:Formatting error in the format string",
+                2:"BADLY_FORMATTED_HEADING:Did not find 9 required headings (CHROM, POS, ..., FORMAT) %s",
+                3:"BAD_NUMBER_OF_COLUMNS:Wrong number of columns found (%s)",
+                4:"POS_NOT_NUMERICAL:Position column is not numerical",
+                5:"UNKNOWN_CHAR_IN_REF:Unknown character in reference field",
+                6:"V33_BAD_REF:Reference should be single-character in v3.3 VCF",
+                7:"V33_BAD_ALLELE:Cannot interpret allele for v3.3 VCF",
+                8:"POS_NOT_POSITIVE:Position field must be >0",
+                9:"QUAL_NOT_NUMERICAL:Quality field must be numerical, or '.'",
+                10:"ERROR_INFO_STRING:Error while parsing info field",
+                11:"ERROR_UNKNOWN_KEY:Unknown key (%s) found in formatted field (info; format; or filter)",
+                12:"ERROR_FORMAT_NOT_NUMERICAL:Expected integer or float in formatted field; got %s",
+                13:"ERROR_FORMAT_NOT_CHAR:Expected character in formatted field; got string",
+                14:"FILTER_NOT_DEFINED:Identifier (%s) in filter found which was not defined in header",
+                15:"FORMAT_NOT_DEFINED:Identifier (%s) in format found which was not defined in header",
+                16:"BAD_NUMBER_OF_VALUES:Found too many values in sample column (%s)",
+                17:"BAD_NUMBER_OF_PARAMETERS:Found unexpected number of parameters (%s)",
+                18:"BAD_GENOTYPE:Cannot parse genotype (%s)",
+                19:"V40_BAD_ALLELE:Bad allele found for v4.0 VCF (%s)",
+                20:"MISSING_REF:Reference allele missing",
+                21:"V33_UNMATCHED_DELETION:Deleted sequence does not match reference (%s)",
+                22:"V40_MISSING_ANGLE_BRACKETS:Format definition is not delimited by angle brackets",
+                23:"FORMAT_MISSING_QUOTES:Description field in format definition is not surrounded by quotes",
+                24:"V40_FORMAT_MUST_HAVE_NAMED_FIELDS:Fields in v4.0 VCF format definition must have named fields",
+                25:"HEADING_NOT_SEPARATED_BY_TABS:Heading line appears separated by spaces, not tabs",
+                26:"WRONG_REF:Wrong reference %s",
+                27:"ERROR_TRAILING_DATA:Numerical field ('%s') has semicolon-separated trailing data",
+                28:"BAD_CHR_TAG:Error calculating chr tag for %s",
+                29:"ZERO_LENGTH_ALLELE:Found zero-length allele",
+                30:"MISSING_INDEL_ALLELE_REF_BASE:Indel alleles must begin with single reference base",
+                31:"ZERO_FOR_NON_FLAG_FIELD: number set to 0, but type is not 'FLAG'",
+                32:"ERROR_FORMAT_NOT_INTEGER:Expected integer in formatted field; got %s",
+                33:"ERROR_FLAG_HAS_VALUE:Flag fields should not have a value",
+                }
+
+    # tag-value pairs; tags are not unique; does not include
fileformat, INFO, FILTER or FORMAT fields + _header = [] + + # version number; 33=v3.3; 40=v4.0 + _version = 40 + + # info, filter and format data + _info = {} + _filter = {} + _format = {} + + # header; and required columns + _required = ["CHROM","POS","ID","REF","ALT","QUAL","FILTER","INFO","FORMAT"] + _samples = [] + + # control behaviour + _ignored_errors = set([11,31]) # ERROR_UNKNOWN_KEY, ERROR_ZERO_FOR_NON_FLAG_FIELD + _warn_errors = set([]) + _leftalign = False + + # reference sequence + _reference = None + + # regions to include; None includes everything + _regions = None + + # statefull stuff + _lineno = -1 + _line = None + _lines = None + + def __init__(self, _copy=None, reference=None, regions=None, + lines=None, leftalign=False): + # make error identifiers accessible by name + for id in self._errors.keys(): + self.__dict__[self._errors[id].split(':')[0]] = id + if _copy != None: + self._leftalign = _copy._leftalign + self._header = _copy._header[:] + self._version = _copy._version + self._info = copy.deepcopy(_copy._info) + self._filter = copy.deepcopy(_copy._filter) + self._format = copy.deepcopy(_copy._format) + self._samples = _copy._samples[:] + self._sample2column = copy.deepcopy(_copy._sample2column) + self._ignored_errors = copy.deepcopy(_copy._ignored_errors) + self._warn_errors = copy.deepcopy(_copy._warn_errors) + self._reference = _copy._reference + self._regions = _copy._regions + if reference: self._reference = reference + if regions: self._regions = regions + if leftalign: self._leftalign = leftalign + self._lines = lines + self.encoding = "ascii" + self.tabixfile = None + + def error(self,line,error,opt=None): + if error in self._ignored_errors: return + errorlabel, errorstring = self._errors[error].split(':') + if opt: errorstring = errorstring % opt + errwarn = ["Error","Warning"][error in self._warn_errors] + errorstring += " in line %s: '%s'\n%s %s: %s\n" % (self._lineno,line,errwarn,errorlabel,errorstring) + if error in self._warn_errors: return + raise ValueError(errorstring) + + def parse_format(self,line,format,filter=False): + if self._version == 40: + if not format.startswith('<'): + self.error(line,self.V40_MISSING_ANGLE_BRACKETS) + format = "<"+format + if not format.endswith('>'): + self.error(line,self.V40_MISSING_ANGLE_BRACKETS) + format += ">" + format = format[1:-1] + data = {'id':None,'number':None,'type':None,'descr':None} + idx = 0 + while len(format.strip())>0: + elts = format.strip().split(',') + first, rest = elts[0], ','.join(elts[1:]) + if first.find('=') == -1 or (first.find('"')>=0 and first.find('=') > first.find('"')): + if self._version == 40: self.error(line,self.V40_FORMAT_MUST_HAVE_NAMED_FIELDS) + if idx == 4: self.error(line,self.BADLY_FORMATTED_FORMAT_STRING) + first = ["ID=","Number=","Type=","Description="][idx] + first + if first.startswith('ID='): data['id'] = first.split('=')[1] + elif first.startswith('Number='): data['number'] = first.split('=')[1] + elif first.startswith('Type='): data['type'] = first.split('=')[1] + elif first.startswith('Description='): + elts = format.split('"') + if len(elts)<3: + self.error(line,self.FORMAT_MISSING_QUOTES) + elts = first.split('=') + [rest] + data['descr'] = elts[1] + rest = '"'.join(elts[2:]) + if rest.startswith(','): rest = rest[1:] + else: + self.error(line,self.BADLY_FORMATTED_FORMAT_STRING) + format = rest + idx += 1 + if filter and idx==1: idx=3 # skip number and type fields for FILTER format strings + if not data['id']: self.error(line,self.BADLY_FORMATTED_FORMAT_STRING) + 
if 'descr' not in data: + # missing description + self.error(line,self.BADLY_FORMATTED_FORMAT_STRING) + data['descr'] = "" + if not data['type'] and not data['number']: + # fine, ##filter format + return FORMAT(data['id'],self.NT_NUMBER,0,"Flag",data['descr'],'.') + if not data['type'] in ["Integer","Float","Character","String","Flag"]: + self.error(line,self.BADLY_FORMATTED_FORMAT_STRING) + # I would like a missing-value field, but it isn't there + if data['type'] in ['Integer','Float']: data['missing'] = None # Do NOT use arbitrary int/float as missing value + else: data['missing'] = '.' + if not data['number']: self.error(line,self.BADLY_FORMATTED_FORMAT_STRING) + try: + n = int(data['number']) + t = self.NT_NUMBER + except ValueError: + n = -1 + if data['number'] == '.': t = self.NT_UNKNOWN + elif data['number'] == '#alleles': t = self.NT_ALLELES + elif data['number'] == '#nonref_alleles': t = self.NT_NR_ALLELES + elif data['number'] == '#genotypes': t = self.NT_GENOTYPES + elif data['number'] == '#phased_genotypes': t = self.NT_PHASED_GENOTYPES + elif data['number'] == '#phased_genotypes': t = self.NT_PHASED_GENOTYPES + # abbreviations added in VCF version v4.1 + elif data['number'] == 'A': t = self.NT_ALLELES + elif data['number'] == 'G': t = self.NT_GENOTYPES + else: + self.error(line,self.BADLY_FORMATTED_FORMAT_STRING) + # if number is 0 - type must be Flag + if n == 0 and data['type'] != 'Flag': + self.error( line, self.ZERO_FOR_NON_FLAG_FIELD) + # force type 'Flag' if no number + data['type'] = 'Flag' + + return FORMAT(data['id'],t,n,data['type'],data['descr'],data['missing']) + + def format_format( self, fmt, filter=False ): + values = [('ID',fmt.id)] + if fmt.number != None and not filter: + if fmt.numbertype == self.NT_UNKNOWN: nmb = "." 
+ elif fmt.numbertype == self.NT_NUMBER: nmb = str(fmt.number) + elif fmt.numbertype == self.NT_ALLELES: nmb = "#alleles" + elif fmt.numbertype == self.NT_NR_ALLELES: nmb = "#nonref_alleles" + elif fmt.numbertype == self.NT_GENOTYPES: nmb = "#genotypes" + elif fmt.numbertype == self.NT_PHASED_GENOTYPES: nmb = "#phased_genotypes" + else: + raise ValueError("Unknown number type encountered: %s" % fmt.numbertype) + values.append( ('Number',nmb) ) + values.append( ('Type', fmt.type) ) + values.append( ('Description', '"' + fmt.description + '"') ) + if self._version == 33: + format = ",".join([v for k,v in values]) + else: + format = "<" + (",".join( ["%s=%s" % (k,v) for (k,v) in values] )) + ">" + return format + + def get_expected(self, format, formatdict, alt): + fmt = formatdict[format] + if fmt.numbertype == self.NT_UNKNOWN: return -1 + if fmt.numbertype == self.NT_NUMBER: return fmt.number + if fmt.numbertype == self.NT_ALLELES: return len(alt)+1 + if fmt.numbertype == self.NT_NR_ALLELES: return len(alt) + if fmt.numbertype == self.NT_GENOTYPES: return ((len(alt)+1)*(len(alt)+2)) // 2 + if fmt.numbertype == self.NT_PHASED_GENOTYPES: return (len(alt)+1)*(len(alt)+1) + return 0 + + + def _add_definition(self, formatdict, key, data, line ): + if key in formatdict: return + self.error(line,self.ERROR_UNKNOWN_KEY,key) + if data == None: + formatdict[key] = FORMAT(key,self.NT_NUMBER,0,"Flag","(Undefined tag)",".") + return + if data == []: data = [""] # unsure what type -- say string + if type(data[0]) == type(0.0): + formatdict[key] = FORMAT(key,self.NT_UNKNOWN,-1,"Float","(Undefined tag)",None) + return + if type(data[0]) == type(0): + formatdict[key] = FORMAT(key,self.NT_UNKNOWN,-1,"Integer","(Undefined tag)",None) + return + formatdict[key] = FORMAT(key,self.NT_UNKNOWN,-1,"String","(Undefined tag)",".") + + + # todo: trim trailing missing values + def format_formatdata( self, data, format, key=True, value=True, separator=":" ): + output, sdata = [], [] + if type(data) == type([]): # for FORMAT field, make data with dummy values + d = {} + for k in data: d[k] = [] + data = d + # convert missing values; and silently add definitions if required + for k in data: + self._add_definition( format, k, data[k], "(output)" ) + for idx,v in enumerate(data[k]): + if v == format[k].missingvalue: data[k][idx] = "." + # make sure GT comes first; and ensure fixed ordering; also convert GT data back to string + for k in data: + if k != 'GT': sdata.append( (k,data[k]) ) + sdata.sort() + if 'GT' in data: + sdata = [('GT',map(self.convertGTback,data['GT']))] + sdata + for k,v in sdata: + if v == []: v = None + if key and value: + if v != None: output.append( k+"="+','.join(map(str,v)) ) + else: output.append( k ) + elif key: output.append(k) + elif value: + if v != None: output.append( ','.join(map(str,v)) ) + else: output.append( "." 
) # should not happen + # snip off trailing missing data + while len(output) > 1: + last = output[-1].replace(',','').replace('.','') + if len(last)>0: break + output = output[:-1] + return separator.join(output) + + + def enter_default_format(self): + for f in [FORMAT('GT',self.NT_NUMBER,1,'String','Genotype','.'), + FORMAT('DP',self.NT_NUMBER,1,'Integer','Read depth at this position for this sample',-1), + FORMAT('FT',self.NT_NUMBER,1,'String','Sample Genotype Filter','.'), + FORMAT('GL',self.NT_UNKNOWN,-1,'Float','Genotype likelihoods','.'), + FORMAT('GLE',self.NT_UNKNOWN,-1,'Float','Genotype likelihoods','.'), + FORMAT('GQ',self.NT_NUMBER,1,'Integer','Genotype Quality',-1), + FORMAT('PL',self.NT_GENOTYPES,-1,'Integer','Phred-scaled genotype likelihoods', '.'), + FORMAT('GP',self.NT_GENOTYPES,-1,'Float','Genotype posterior probabilities','.'), + FORMAT('GQ',self.NT_GENOTYPES,-1,'Integer','Conditional genotype quality','.'), + FORMAT('HQ',self.NT_UNKNOWN,-1,'Integer','Haplotype Quality',-1), # unknown number, since may be haploid + FORMAT('PS',self.NT_UNKNOWN,-1,'Integer','Phase set','.'), + FORMAT('PQ',self.NT_NUMBER,1,'Integer','Phasing quality',-1), + FORMAT('EC',self.NT_ALLELES,1,'Integer','Expected alternate allele counts',-1), + FORMAT('MQ',self.NT_NUMBER,1,'Integer','RMS mapping quality',-1), + ]: + if f.id not in self._format: + self._format[f.id] = f + + def parse_header(self, line): + + assert line.startswith('##') + elts = line[2:].split('=') + key = elts[0].strip() + value = '='.join(elts[1:]).strip() + if key == "fileformat": + if value == "VCFv3.3": + self._version = 33 + elif value == "VCFv4.0": + self._version = 40 + elif value == "VCFv4.1": + # AH - for testing + self._version = 40 + elif value == "VCFv4.2": + # AH - for testing + self._version = 40 + else: + self.error(line,self.UNKNOWN_FORMAT_STRING) + elif key == "INFO": + f = self.parse_format(line, value) + self._info[ f.id ] = f + elif key == "FILTER": + f = self.parse_format(line, value, filter=True) + self._filter[ f.id ] = f + elif key == "FORMAT": + f = self.parse_format(line, value) + self._format[ f.id ] = f + else: + # keep other keys in the header field + self._header.append( (key,value) ) + + + def write_header( self, stream ): + stream.write("##fileformat=VCFv%s.%s\n" % (self._version // 10, self._version % 10)) + for key,value in self._header: stream.write("##%s=%s\n" % (key,value)) + for var,label in [(self._info,"INFO"),(self._filter,"FILTER"),(self._format,"FORMAT")]: + for f in var.values(): stream.write("##%s=%s\n" % (label,self.format_format(f,filter=(label=="FILTER")))) + + + def parse_heading( self, line ): + assert line.startswith('#') + assert not line.startswith('##') + headings = line[1:].split('\t') + # test for 8, as FORMAT field might be missing + if len(headings)==1 and len(line[1:].split()) >= 8: + self.error(line,self.HEADING_NOT_SEPARATED_BY_TABS) + headings = line[1:].split() + + for i,s in enumerate(self._required): + + if len(headings)<=i or headings[i] != s: + + if len(headings) <= i: + err = "(%sth entry not found)" % (i+1) + else: + err = "(found %s, expected %s)" % (headings[i],s) + + #self.error(line,self.BADLY_FORMATTED_HEADING,err) + # allow FORMAT column to be absent + if len(headings) == 8: + headings.append("FORMAT") + else: + self.error(line,self.BADLY_FORMATTED_HEADING,err) + + self._samples = headings[9:] + self._sample2column = dict( [(y,x+9) for x,y in enumerate( self._samples ) ] ) + + def write_heading( self, stream ): + stream.write("#" + 
"\t".join(self._required + self._samples) + "\n") + + def convertGT(self, GTstring): + if GTstring == ".": return ["."] + try: + gts = gtsRegEx.split(GTstring) + if len(gts) == 1: return [int(gts[0])] + if len(gts) != 2: raise ValueError() + if gts[0] == "." and gts[1] == ".": return [gts[0],GTstring[len(gts[0]):-len(gts[1])],gts[1]] + return [int(gts[0]),GTstring[len(gts[0]):-len(gts[1])],int(gts[1])] + except ValueError: + self.error(self._line,self.BAD_GENOTYPE,GTstring) + return [".","|","."] + + def convertGTback(self, GTdata): + return ''.join(map(str,GTdata)) + + def parse_formatdata( self, key, value, formatdict, line ): + # To do: check that the right number of values is present + f = formatdict.get(key,None) + if f == None: + self._add_definition(formatdict, key, value, line ) + f = formatdict[key] + if f.type == "Flag": + if value is not None: self.error(line,self.ERROR_FLAG_HAS_VALUE) + return [] + values = value.split(',') + # deal with trailing data in some early VCF files + if f.type in ["Float","Integer"] and len(values)>0 and values[-1].find(';') > -1: + self.error(line,self.ERROR_TRAILING_DATA,values[-1]) + values[-1] = values[-1].split(';')[0] + if f.type == "Integer": + for idx,v in enumerate(values): + try: + if v == ".": values[idx] = f.missingvalue + else: values[idx] = int(v) + except: + self.error(line,self.ERROR_FORMAT_NOT_INTEGER,"%s=%s" % (key, str(values))) + return [0] * len(values) + return values + elif f.type == "String": + self._line = line + if f.id == "GT": values = list(map( self.convertGT, values )) + return values + elif f.type == "Character": + for v in values: + if len(v) != 1: self.error(line,self.ERROR_FORMAT_NOT_CHAR) + return values + elif f.type == "Float": + for idx,v in enumerate(values): + if v == ".": values[idx] = f.missingvalue + try: return list(map(float,values)) + except: + self.error(line,self.ERROR_FORMAT_NOT_NUMERICAL,"%s=%s" % (key, str(values))) + return [0.0] * len(values) + else: + # can't happen + self.error(line,self.ERROR_INFO_STRING) + + def inregion(self, chrom, pos): + if not self._regions: return True + for r in self._regions: + if r[0] == chrom and r[1] <= pos < r[2]: return True + return False + + def parse_data( self, line, lineparse=False ): + cols = line.split('\t') + if len(cols) != len(self._samples)+9: + # gracefully deal with absent FORMAT column + # and those missing samples + if len(cols) == 8: + cols.append("") + else: + self.error(line, + self.BAD_NUMBER_OF_COLUMNS, + "expected %s for %s samples (%s), got %s" % (len(self._samples)+9, len(self._samples), self._samples, len(cols))) + + chrom = cols[0] + + # get 0-based position + try: pos = int(cols[1])-1 + except: self.error(line,self.POS_NOT_NUMERICAL) + if pos < 0: self.error(line,self.POS_NOT_POSITIVE) + + # implement filtering + if not self.inregion(chrom,pos): return None + + # end of first-pass parse for sortedVCF + if lineparse: return chrom, pos, line + + id = cols[2] + + ref = cols[3].upper() + if ref == ".": + self.error(line,self.MISSING_REF) + if self._version == 33: ref = get_sequence(chrom,pos,pos+1,self._reference) + else: ref = "" + else: + for c in ref: + if c not in "ACGTN": self.error(line,self.UNKNOWN_CHAR_IN_REF) + if "N" in ref: ref = get_sequence(chrom,pos,pos+len(ref),self._reference) + + # make sure reference is sane + if self._reference: + left = max(0,pos-100) + faref_leftflank = get_sequence(chrom,left,pos+len(ref),self._reference) + faref = faref_leftflank[pos-left:] + if faref != ref: self.error(line,self.WRONG_REF,"(reference is 
%s, VCF says %s)" % (faref,ref)) + ref = faref + + # convert v3.3 to v4.0 alleles below + if cols[4] == ".": alt = [] + else: alt = cols[4].upper().split(',') + + if cols[5] == ".": qual = -1 + else: + try: qual = float(cols[5]) + except: self.error(line,self.QUAL_NOT_NUMERICAL) + + # postpone checking that filters exist. Encode missing filter or no filtering as empty list + if cols[6] == "." or cols[6] == "PASS" or cols[6] == "0": filter = [] + else: filter = cols[6].split(';') + + # dictionary of keys, and list of values + info = {} + if cols[7] != ".": + for blurp in cols[7].split(';'): + elts = blurp.split('=') + if len(elts) == 1: v = None + elif len(elts) == 2: v = elts[1] + else: self.error(line,self.ERROR_INFO_STRING) + info[elts[0]] = self.parse_formatdata(elts[0], + v, + self._info, + line) + + # Gracefully deal with absent FORMAT column + if cols[8] == "": format = [] + else: format = cols[8].split(':') + + # check: all filters are defined + for f in filter: + if f not in self._filter: self.error(line,self.FILTER_NOT_DEFINED, f) + + # check: format fields are defined + if self._format: + for f in format: + if f not in self._format: self.error(line,self.FORMAT_NOT_DEFINED, f) + + # convert v3.3 alleles + if self._version == 33: + if len(ref) != 1: self.error(line,self.V33_BAD_REF) + newalts = [] + have_deletions = False + for a in alt: + if len(a) == 1: a = a + ref[1:] # SNP; add trailing reference + elif a.startswith('I'): a = ref[0] + a[1:] + ref[1:] # insertion just beyond pos; add first and trailing reference + elif a.startswith('D'): # allow D<number> and D<sequence> + have_deletions = True + try: + l = int(a[1:]) # throws ValueError if sequence + if len(ref) < l: # add to reference if necessary + addns = get_sequence(chrom,pos+len(ref),pos+l,self._reference) + ref += addns + for i,na in enumerate(newalts): newalts[i] = na+addns + a = ref[l:] # new deletion, deleting pos...pos+l + except ValueError: + s = a[1:] + if len(ref) < len(s): # add Ns to reference if necessary + addns = get_sequence(chrom,pos+len(ref),pos+len(s),self._reference) + if not s.endswith(addns) and addns != 'N'*len(addns): + self.error(line,self.V33_UNMATCHED_DELETION, + "(deletion is %s, reference is %s)" % (a,get_sequence(chrom,pos,pos+len(s),self._reference))) + ref += addns + for i,na in enumerate(newalts): newalts[i] = na+addns + a = ref[len(s):] # new deletion, deleting from pos + else: + self.error(line,self.V33_BAD_ALLELE) + newalts.append(a) + alt = newalts + # deletion alleles exist, add dummy 1st reference allele, and account for leading base + if have_deletions: + if pos == 0: + # Petr Danacek's fix: we can't have a leading nucleotide at (1-based) position 1 + addn = get_sequence(chrom,pos+len(ref),pos+len(ref)+1,self._reference) + ref += addn + alt = [allele+addn for allele in alt] + else: + addn = get_sequence(chrom,pos-1,pos,self._reference) + ref = addn + ref + alt = [addn + allele for allele in alt] + pos -= 1 + else: + # format v4.0 -- just check for nucleotides + for allele in alt: + if not alleleRegEx.match(allele): + self.error(line,self.V40_BAD_ALLELE,allele) + + # check for leading nucleotide in indel calls + for allele in alt: + if len(allele) != len(ref): + if len(allele) == 0: self.error(line,self.ZERO_LENGTH_ALLELE) + if ref[0].upper() != allele[0].upper() and "N" not in (ref[0]+allele[0]).upper(): + self.error(line,self.MISSING_INDEL_ALLELE_REF_BASE) + + # trim trailing bases in alleles + # AH: not certain why trimming this needs to be added + # disabled now for unit testing + # if alt: + # for i in 
range(1,min(len(ref),min(map(len,alt)))): + # if len(set(allele[-1].upper() for allele in alt)) > 1 or ref[-1].upper() != alt[0][-1].upper(): + # break + # ref, alt = ref[:-1], [allele[:-1] for allele in alt] + + # left-align alleles, if a reference is available + if self._leftalign and self._reference: + while left < pos: + movable = True + for allele in alt: + if len(allele) > len(ref): + longest, shortest = allele, ref + else: + longest, shortest = ref, allele + if len(longest) == len(shortest) or longest[:len(shortest)].upper() != shortest.upper(): + movable = False + if longest[-1].upper() != longest[len(shortest)-1].upper(): + movable = False + if not movable: + break + ref = ref[:-1] + alt = [allele[:-1] for allele in alt] + if min([len(allele) for allele in alt]) == 0 or len(ref) == 0: + ref = faref_leftflank[pos-left-1] + ref + alt = [faref_leftflank[pos-left-1] + allele for allele in alt] + pos -= 1 + + # parse sample columns + samples = [] + for sample in cols[9:]: + dict = {} + values = sample.split(':') + if len(values) > len(format): + self.error(line,self.BAD_NUMBER_OF_VALUES,"(found %s values in element %s; expected %s)" % (len(values),sample,len(format))) + for idx in range(len(format)): + expected = self.get_expected(format[idx], self._format, alt) + if idx < len(values): value = values[idx] + else: + if expected == -1: value = "." + else: value = ",".join(["."]*expected) + + dict[format[idx]] = self.parse_formatdata(format[idx], + value, + self._format, + line) + if expected != -1 and len(dict[format[idx]]) != expected: + self.error(line,self.BAD_NUMBER_OF_PARAMETERS, + "id=%s, expected %s parameters, got %s" % (format[idx],expected,dict[format[idx]])) + if len(dict[format[idx]] ) < expected: dict[format[idx]] += [dict[format[idx]][-1]]*(expected-len(dict[format[idx]])) + dict[format[idx]] = dict[format[idx]][:expected] + samples.append( dict ) + + # done + d = {'chrom':chrom, + 'pos':pos, # return 0-based position + 'id':id, + 'ref':ref, + 'alt':alt, + 'qual':qual, + 'filter':filter, + 'info':info, + 'format':format} + for key,value in zip(self._samples,samples): + d[key] = value + + return d + + + def write_data(self, stream, data): + required = ['chrom','pos','id','ref','alt','qual','filter','info','format'] + self._samples + for k in required: + if k not in data: raise ValueError("Required key %s not found in data" % str(k)) + if data['alt'] == []: alt = "." + else: alt = ",".join(data['alt']) + if data['filter'] == None: filter = "." + elif data['filter'] == []: + if self._version == 33: filter = "0" + else: filter = "PASS" + else: filter = ';'.join(data['filter']) + if data['qual'] == -1: qual = "." 
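+ # Round-trip illustration: parse_data() above stores qual = -1 when the + # QUAL column is '.', so such records are serialized back to '.' here + # rather than as a literal -1.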
+ else: qual = str(data['qual']) + + output = [data['chrom'], + str(data['pos']+1), # change to 1-based position + data['id'], + data['ref'], + alt, + qual, + filter, + self.format_formatdata( + data['info'], self._info, separator=";"), + self.format_formatdata( + data['format'], self._format, value=False)] + + for s in self._samples: + output.append(self.format_formatdata( + data[s], self._format, key=False)) + + stream.write( "\t".join(output) + "\n" ) + + def _parse_header(self, stream): + self._lineno = 0 + for line in stream: + line = force_str(line, self.encoding) + self._lineno += 1 + if line.startswith('##'): + self.parse_header(line.strip()) + elif line.startswith('#'): + self.parse_heading(line.strip()) + self.enter_default_format() + else: + break + return line + + def _parse(self, line, stream): + # deal with files with header only + if line.startswith("##"): return + if len(line.strip()) > 0: + d = self.parse_data( line.strip() ) + if d: yield d + for line in stream: + self._lineno += 1 + if self._lines and self._lineno > self._lines: return + d = self.parse_data( line.strip() ) + if d: yield d + + ###################################################################################################### + # + # API follows + # + ###################################################################################################### + + def getsamples(self): + """ List of samples in VCF file """ + return self._samples + + def setsamples(self,samples): + """ List of samples in VCF file """ + self._samples = samples + + def getheader(self): + """ List of header key-value pairs (strings) """ + return self._header + + def setheader(self,header): + """ List of header key-value pairs (strings) """ + self._header = header + + def getinfo(self): + """ Dictionary of ##INFO tags, as VCF.FORMAT values """ + return self._info + + def setinfo(self,info): + """ Dictionary of ##INFO tags, as VCF.FORMAT values """ + self._info = info + + def getformat(self): + """ Dictionary of ##FORMAT tags, as VCF.FORMAT values """ + return self._format + + def setformat(self,format): + """ Dictionary of ##FORMAT tags, as VCF.FORMAT values """ + self._format = format + + def getfilter(self): + """ Dictionary of ##FILTER tags, as VCF.FORMAT values """ + return self._filter + + def setfilter(self,filter): + """ Dictionary of ##FILTER tags, as VCF.FORMAT values """ + self._filter = filter + + def setversion(self, version): + if version != 33 and version != 40: raise ValueError("Can only handle v3.3 and v4.0 VCF files") + self._version = version + + def setregions(self, regions): + self._regions = regions + + def setreference(self, ref): + """ Provide a reference sequence; a Python class supporting a fetch(chromosome, start, end) method, e.g. pysam.FastaFile """ + self._reference = ref + + def ignoreerror(self, errorstring): + try: self._ignored_errors.add(self.__dict__[errorstring]) + except KeyError: raise ValueError("Invalid error string: %s" % errorstring) + + def warnerror(self, errorstring): + try: self._warn_errors.add(self.__dict__[errorstring]) + except KeyError: raise ValueError("Invalid error string: %s" % errorstring) + + def parse(self, stream): + """ Parse a stream of VCF-formatted lines. Initializes the class instance and returns a generator """ + last_line = self._parse_header(stream) + # now return a generator that does the actual work. 
In this way the pre-processing is done + # before the first piece of data is yielded + return self._parse(last_line, stream) + + def write(self, stream, datagenerator): + """ Writes a VCF file to a stream, using a data generator (or list) """ + self.write_header(stream) + self.write_heading(stream) + for data in datagenerator: self.write_data(stream,data) + + def writeheader(self, stream): + """ Writes a VCF header """ + self.write_header(stream) + self.write_heading(stream) + + def compare_calls(self, pos1, ref1, alt1, pos2, ref2, alt2): + """ Utility function: compares two calls for equality """ + # a variant should always be assigned to a unique position, one base before + # the leftmost position of the alignment gap. If this rule is implemented + # correctly, the two positions must be equal for the calls to be identical. + if pos1 != pos2: return False + # from both calls, trim rightmost bases when identical. Do this safely, i.e. + # only when the reference bases are not Ns + while len(ref1)>0 and len(alt1)>0 and ref1[-1] == alt1[-1]: + ref1 = ref1[:-1] + alt1 = alt1[:-1] + while len(ref2)>0 and len(alt2)>0 and ref2[-1] == alt2[-1]: + ref2 = ref2[:-1] + alt2 = alt2[:-1] + # now, the alternative alleles must be identical + return alt1 == alt2 + +########################################################################################################### +########################################################################################################### +## API functions added by Andreas +########################################################################################################### + + def connect(self, filename, encoding="ascii"): + '''connect to tabix file.''' + self.encoding=encoding + self.tabixfile = pysam.Tabixfile(filename, encoding=encoding) + self._parse_header(self.tabixfile.header) + + def __del__(self): + self.close() + self.tabixfile = None + + def close(self): + if self.tabixfile: + self.tabixfile.close() + self.tabixfile = None + + def fetch(self, + reference=None, + start=None, + end=None, + region=None ): + """ Fetch VCFRecord objects for a region through the tabix index. + Returns an iterator """ + return self.tabixfile.fetch( + reference, + start, + end, + region, + parser = asVCFRecord(self)) + + def validate(self, record): + '''validate vcf record. + + returns a validated record. 
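+ + (Note: the body below is unfinished; the method raises + NotImplementedError before any of the checks run.)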
+ ''' + + raise NotImplementedError("needs to be checked") + + chrom, pos, alt = record.chrom, record.pos, record.alt + + # check reference + ref = record.ref + if ref == ".": + self.error(str(record),self.MISSING_REF) + if self._version == 33: ref = get_sequence(chrom,pos,pos+1,self._reference) + else: ref = "" + else: + for c in ref: + if c not in "ACGTN": self.error(str(record),self.UNKNOWN_CHAR_IN_REF) + if "N" in ref: ref = get_sequence(chrom, + pos, + pos+len(ref), + self._reference) + + # make sure reference is sane + if self._reference: + left = max(0,pos-100) + faref_leftflank = get_sequence(chrom,left,pos+len(ref),self._reference) + faref = faref_leftflank[pos-left:] + if faref != ref: self.error(str(record),self.WRONG_REF,"(reference is %s, VCF says %s)" % (faref,ref)) + ref = faref + + # check: format fields are defined + for f in record.format: + if f not in self._format: self.error(str(record),self.FORMAT_NOT_DEFINED, f) + + # check: all filters are defined + for f in record.filter: + if f not in self._filter: self.error(str(record),self.FILTER_NOT_DEFINED, f) + + # convert v3.3 alleles + if self._version == 33: + if len(ref) != 1: self.error(str(record),self.V33_BAD_REF) + newalts = [] + have_deletions = False + for a in alt: + if len(a) == 1: a = a + ref[1:] # SNP; add trailing reference + elif a.startswith('I'): a = ref[0] + a[1:] + ref[1:] # insertion just beyond pos; add first and trailing reference + elif a.startswith('D'): # allow D<number> and D<sequence> + have_deletions = True + try: + l = int(a[1:]) # throws ValueError if sequence + if len(ref) < l: # add to reference if necessary + addns = get_sequence(chrom,pos+len(ref),pos+l,self._reference) + ref += addns + for i,na in enumerate(newalts): newalts[i] = na+addns + a = ref[l:] # new deletion, deleting pos...pos+l + except ValueError: + s = a[1:] + if len(ref) < len(s): # add Ns to reference if necessary + addns = get_sequence(chrom,pos+len(ref),pos+len(s),self._reference) + if not s.endswith(addns) and addns != 'N'*len(addns): + self.error(str(record),self.V33_UNMATCHED_DELETION, + "(deletion is %s, reference is %s)" % (a,get_sequence(chrom,pos,pos+len(s),self._reference))) + ref += addns + for i,na in enumerate(newalts): newalts[i] = na+addns + a = ref[len(s):] # new deletion, deleting from pos + else: + self.error(str(record),self.V33_BAD_ALLELE) + newalts.append(a) + alt = newalts + # deletion alleles exist, add dummy 1st reference allele, and account for leading base + if have_deletions: + if pos == 0: + # Petr Danacek's fix: we can't have a leading nucleotide at (1-based) position 1 + addn = get_sequence(chrom,pos+len(ref),pos+len(ref)+1,self._reference) + ref += addn + alt = [allele+addn for allele in alt] + else: + addn = get_sequence(chrom,pos-1,pos,self._reference) + ref = addn + ref + alt = [addn + allele for allele in alt] + pos -= 1 + else: + # format v4.0 -- just check for nucleotides + for allele in alt: + if not alleleRegEx.match(allele): + self.error(str(record),self.V40_BAD_ALLELE,allele) + + + # check for leading nucleotide in indel calls + for allele in alt: + if len(allele) != len(ref): + if len(allele) == 0: self.error(str(record),self.ZERO_LENGTH_ALLELE) + if ref[0].upper() != allele[0].upper() and "N" not in (ref[0]+allele[0]).upper(): + self.error(str(record),self.MISSING_INDEL_ALLELE_REF_BASE) + + # trim trailing bases in alleles + # AH: not certain why trimming this needs to be added + # disabled now for unit testing + # for i in range(1,min(len(ref),min(map(len,alt)))): + # if len(set(allele[-1].upper() for allele 
in alt)) > 1 or ref[-1].upper() != alt[0][-1].upper(): + # break + # ref, alt = ref[:-1], [allele[:-1] for allele in alt] + + # left-align alleles, if a reference is available + if self._leftalign and self._reference: + while left < pos: + movable = True + for allele in alt: + if len(allele) > len(ref): + longest, shortest = allele, ref + else: + longest, shortest = ref, allele + if len(longest) == len(shortest) or longest[:len(shortest)].upper() != shortest.upper(): + movable = False + if longest[-1].upper() != longest[len(shortest)-1].upper(): + movable = False + if not movable: + break + ref = ref[:-1] + alt = [allele[:-1] for allele in alt] + if min([len(allele) for allele in alt]) == 0 or len(ref) == 0: + ref = faref_leftflank[pos-left-1] + ref + alt = [faref_leftflank[pos-left-1] + allele for allele in alt] + pos -= 1 + +__all__ = [ + "VCF", "VCFRecord", ] diff --git a/pysam/utils.py b/pysam/utils.py index c5bb539..5c045df 100644 --- a/pysam/utils.py +++ b/pysam/utils.py @@ -1,4 +1,4 @@ -from pysam.cutils import _pysam_dispatch +from pysam.libcutils import _pysam_dispatch class SamtoolsError(Exception): diff --git a/pysam/version.py b/pysam/version.py index 0a985de..facb3bb 100644 --- a/pysam/version.py +++ b/pysam/version.py @@ -1,7 +1,9 @@ # pysam versioning information -__version__ = "0.9.1.4" +__version__ = "0.10.0" __samtools_version__ = "1.3.1" -__htslib_version__ = "1.3.1" +__bcftools_version__ = "1.3.1" + +__htslib_version__ = "1.3.2" diff --git a/requirements.txt b/requirements.txt index 687929a..6e8fc44 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1 @@ -cython>=0.22 +cython>=0.24.1 diff --git a/run_tests_travis.sh b/run_tests_travis.sh index 414043e..a229ff5 100755 --- a/run_tests_travis.sh +++ b/run_tests_travis.sh @@ -6,75 +6,36 @@ WORKDIR=`pwd` #Install miniconda python if [ $TRAVIS_OS_NAME == "osx" ]; then - curl -O https://repo.continuum.io/miniconda/Miniconda3-latest-MacOSX-x86_64.sh - bash Miniconda3-latest-MacOSX-x86_64.sh -b + wget https://repo.continuum.io/miniconda/Miniconda3-latest-MacOSX-x86_64.sh -O Miniconda3.sh else - curl -O https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh - bash Miniconda3-latest-Linux-x86_64.sh -b + wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O Miniconda3.sh --no-check-certificate # Default OS versions are old and have SSL / CERT issues fi +bash Miniconda3.sh -b + # Create a new conda environment with the target python version ~/miniconda3/bin/conda install conda-build -y -~/miniconda3/bin/conda create -q -y --name testenv python=$CONDA_PY cython numpy nose +~/miniconda3/bin/conda create -q -y --name testenv python=$CONDA_PY cython numpy nose psutil pip + +# activate testenv environment +source ~/miniconda3/bin/activate testenv -# Add new conda environment to PATH -export PATH=~/miniconda3/envs/testenv/bin/:$PATH +conda config --add channels conda-forge +conda config --add channels defaults +conda config --add channels r +conda config --add channels bioconda -# Hack to force linking to anaconda libraries rather than system libraries -#export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:~/miniconda3/envs/testenv/lib/ -#export DYLD_LIBRARY_PATH=$DYLD_LIBRARY_PATH:~/miniconda3/envs/testenv/lib/ +conda install -y samtools bcftools htslib # Need to make C compiler and linker use the anaconda includes and libraries: export PREFIX=~/miniconda3/ export CFLAGS="-I${PREFIX}/include -L${PREFIX}/lib" export HTSLIB_CONFIGURE_OPTIONS="--disable-libcurl" -# create a new folder to store external 
tools -mkdir -p $WORKDIR/external-tools - -# install htslib -cd $WORKDIR/external-tools -curl -L https://github.com/samtools/htslib/releases/download/1.3.1/htslib-1.3.1.tar.bz2 > htslib-1.3.1.tar.bz2 -tar xjvf htslib-1.3.1.tar.bz2 -cd htslib-1.3.1 -make -PATH=$PATH:$WORKDIR/external-tools/htslib-1.3.1 -LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$WORKDIR/external-tools/htslib-1.3.1 - -# install samtools, compile against htslib -cd $WORKDIR/external-tools -curl -L http://downloads.sourceforge.net/project/samtools/samtools/1.3.1/samtools-1.3.1.tar.bz2 > samtools-1.3.1.tar.bz2 -tar xjvf samtools-1.3.1.tar.bz2 -cd samtools-1.3.1 -./configure --with-htslib=../htslib-1.3.1 -make -PATH=$PATH:$WORKDIR/external-tools/samtools-1.3.1 - -echo "installed samtools" samtools --version - -if [ $? != 0 ]; then - exit 1 -fi - -# install bcftools -cd $WORKDIR/external-tools -curl -L https://github.com/samtools/bcftools/releases/download/1.3.1/bcftools-1.3.1.tar.bz2 > bcftools-1.3.1.tar.bz2 -tar xjf bcftools-1.3.1.tar.bz2 -cd bcftools-1.3.1 -./configure --with-htslib=../htslib-1.3.1 -make -PATH=$PATH:$WORKDIR/external-tools/bcftools-1.3.1 - -echo "installed bcftools" +htslib --version bcftools --version -if [ $? != 0 ]; then - exit 1 -fi - -popd - # Try building conda recipe first ~/miniconda3/bin/conda-build ci/conda-recipe/ --python=$CONDA_PY @@ -105,9 +66,10 @@ if [ $? != 0 ]; then exit 1 fi -# build source tar-ball +# build source tar-ball. Make sure to build so that .pyx files +# are cythonized. cd .. -python setup.py sdist +python setup.py build sdist if [ $? != 0 ]; then exit 1 @@ -123,7 +85,7 @@ fi # test pip installation from tar-ball with cython echo "pip installing with cython" -pip install --verbose --no-deps --no-use-wheel dist/pysam-*.tar.gz +pip install --verbose --no-deps --no-binary=:all: dist/pysam-*.tar.gz if [ $? != 0 ]; then exit 1 @@ -131,10 +93,10 @@ fi # attempt pip installation without cython echo "pip installing without cython" -~/miniconda3/bin/conda remove cython +~/miniconda3/bin/conda remove -y cython ~/miniconda3/bin/conda list -echo "pthyon is" `which python` -pip install --verbose --no-deps --no-use-wheel --force-reinstall --upgrade dist/pysam-*.tar.gz +echo "python is" `which python` +pip install --verbose --no-deps --no-binary=:all: --force-reinstall --upgrade dist/pysam-*.tar.gz if [ $? != 0 ]; then exit 1 @@ -144,7 +106,7 @@ fi # command line options echo "pip installing without cython and no configure options" export HTSLIB_CONFIGURE_OPTIONS="" -pip install --verbose --no-deps --no-use-wheel --force-reinstall --upgrade dist/pysam-*.tar.gz +pip install --verbose --no-deps --no-binary=:all: --force-reinstall --upgrade dist/pysam-*.tar.gz if [ $? 
!= 0 ]; then exit 1 diff --git a/samtools/sam_view.c.pysam.c b/samtools/sam_view.c.pysam.c index 3d5ffa5..8c883b0 100644 --- a/samtools/sam_view.c.pysam.c +++ b/samtools/sam_view.c.pysam.c @@ -489,9 +489,9 @@ int main_samview(int argc, char *argv[]) } view_end: - if (is_count && ret == 0) + if (is_count && ret == 0) fprintf(pysam_stdout, "%" PRId64 "\n", count); - + // close files, free and return if (in) check_sam_close("view", in, fn_in, "standard input", &ret); if (out) check_sam_close("view", out, fn_out, "standard output", &ret); diff --git a/setup.py b/setup.py index e301f11..6d52617 100644 --- a/setup.py +++ b/setup.py @@ -60,13 +60,18 @@ def run_configure(option): def run_make_print_config(): - stdout = subprocess.check_output(["make", "print-config"]) + stdout = subprocess.check_output(["make", "-s", "print-config"]) if IS_PYTHON3: stdout = stdout.decode("ascii") - result = dict([[x.strip() for x in line.split("=")] - for line in stdout.splitlines()]) - return result + make_print_config = {} + for line in stdout.splitlines(): + if "=" in line: + row = line.split("=") + if len(row) == 2: + make_print_config.update( + {row[0].strip(): row[1].strip()}) + return make_print_config def configure_library(library_dir, env_options=None, options=[]): @@ -139,16 +144,12 @@ try: import cython HAVE_CYTHON = True print ("# pysam: cython is available - using cythonize if necessary") - source_pattern = "pysam/c%s.pyx" - if HTSLIB_MODE != "external": - HTSLIB_MODE = "shared" + source_pattern = "pysam/libc%s.pyx" except ImportError: HAVE_CYTHON = False print ("# pysam: no cython available - using pre-compiled C") # no Cython available - use existing C code - source_pattern = "pysam/c%s.c" - if HTSLIB_MODE != "external": - HTSLIB_MODE = "shared" + source_pattern = "pysam/libc%s.c" # collect pysam version sys.path.insert(0, "pysam") @@ -230,7 +231,6 @@ if HTSLIB_LIBRARY_DIR: chtslib_sources = [] htslib_library_dirs = [HTSLIB_LIBRARY_DIR] htslib_include_dirs = [HTSLIB_INCLUDE_DIR] - internal_htslib_libraries = [] external_htslib_libraries = ['z', 'hts'] elif HTSLIB_MODE == 'separate': @@ -240,7 +240,6 @@ elif HTSLIB_MODE == 'separate': shared_htslib_sources = htslib_sources htslib_library_dirs = [] htslib_include_dirs = ['htslib'] - internal_htslib_libraries = [] elif HTSLIB_MODE == 'shared': # link each pysam component against the same @@ -249,30 +248,15 @@ elif HTSLIB_MODE == 'shared': htslib_library_dirs = [ 'pysam', ".", - os.path.join("build", - distutils_dir_name("lib"), - "pysam")] + os.path.join("build", distutils_dir_name("lib"), "pysam")] htslib_include_dirs = ['htslib'] - if IS_PYTHON3: - if sys.version_info.minor >= 5: - internal_htslib_libraries = ["chtslib.{}".format( - sysconfig.get_config_var('SOABI'))] - else: - if sys.platform == "darwin": - # On OSX, python 3.3 and 3.4 Libs have no platform tags. - internal_htslib_libraries = ["chtslib"] - else: - internal_htslib_libraries = ["chtslib.{}{}".format( - sys.implementation.cache_tag, - sys.abiflags)] - else: - internal_htslib_libraries = ["chtslib"] - else: raise ValueError("unknown HTSLIB value '%s'" % HTSLIB_MODE) +internal_htslib_libraries = [os.path.splitext("chtslib{}".format(sysconfig.get_config_var('SO')))[0]] + # build config.py with open(os.path.join("pysam", "config.py"), "w") as outf: outf.write('HTSLIB = "{}"\n'.format(HTSLIB_SOURCE)) @@ -382,7 +366,7 @@ chtslib = Extension( # Selected ones have been copied into samfile_utils.c # Needs to be devolved somehow. 
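+# Illustration of the renaming pattern applied to every extension below: each +# module moves from 'pysam.cNAME' to 'pysam.libcNAME', so Cython code that +# cimports from pysam needs the new module paths, e.g. +# from pysam.libcalignmentfile cimport AlignmentFile, AlignedSegment +# (see the matching update to tests/_compile_test.pyx later in this diff).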
csamfile = Extension( - "pysam.csamfile", + "pysam.libcsamfile", [source_pattern % "samfile", "pysam/htslib_util.c", "pysam/samfile_util.c", @@ -402,7 +386,7 @@ csamfile = Extension( # Selected ones have been copied into samfile_utils.c # Needs to be devolved somehow. calignmentfile = Extension( - "pysam.calignmentfile", + "pysam.libcalignmentfile", [source_pattern % "alignmentfile", "pysam/htslib_util.c", "pysam/samfile_util.c", @@ -422,7 +406,7 @@ calignmentfile = Extension( # Selected ones have been copied into samfile_utils.c # Needs to be devolved somehow. calignedsegment = Extension( - "pysam.calignedsegment", + "pysam.libcalignedsegment", [source_pattern % "alignedsegment", "pysam/htslib_util.c", "pysam/samfile_util.c", @@ -438,7 +422,7 @@ calignedsegment = Extension( ) ctabix = Extension( - "pysam.ctabix", + "pysam.libctabix", [source_pattern % "tabix", "pysam/tabix_util.c"] + htslib_sources + @@ -452,7 +436,7 @@ ctabix = Extension( ) cutils = Extension( - "pysam.cutils", + "pysam.libcutils", [source_pattern % "utils", "pysam/pysam_util.c"] + glob.glob(os.path.join("samtools", "*.pysam.c")) + # glob.glob(os.path.join("samtools", "*", "*.pysam.c")) + @@ -470,7 +454,7 @@ cutils = Extension( ) cfaidx = Extension( - "pysam.cfaidx", + "pysam.libcfaidx", [source_pattern % "faidx"] + htslib_sources + os_c_files, @@ -483,7 +467,7 @@ cfaidx = Extension( ) ctabixproxies = Extension( - "pysam.ctabixproxies", + "pysam.libctabixproxies", [source_pattern % "tabixproxies"] + os_c_files, library_dirs=htslib_library_dirs, @@ -495,7 +479,7 @@ ctabixproxies = Extension( ) cvcf = Extension( - "pysam.cvcf", + "pysam.libcvcf", [source_pattern % "vcf"] + os_c_files, library_dirs=htslib_library_dirs, @@ -507,7 +491,7 @@ cvcf = Extension( ) cbcf = Extension( - "pysam.cbcf", + "pysam.libcbcf", [source_pattern % "bcf"] + htslib_sources + os_c_files, @@ -519,6 +503,19 @@ cbcf = Extension( define_macros=define_macros ) +cbgzf = Extension( + "pysam.libcbgzf", + [source_pattern % "bgzf"] + + htslib_sources + + os_c_files, + library_dirs=htslib_library_dirs, + include_dirs=["htslib", "."] + include_os + htslib_include_dirs, + libraries=external_htslib_libraries + internal_htslib_libraries, + language="c", + extra_compile_args=extra_compile_args, + define_macros=define_macros +) + metadata = { 'name': "pysam", 'version': version, @@ -539,6 +536,7 @@ metadata = { ctabixproxies, cvcf, cbcf, + cbgzf, cfaidx, cutils], 'cmdclass': cmdclass, diff --git a/tests/AlignedSegment_test.py b/tests/AlignedSegment_test.py index 94b2eb3..b0a3466 100644 --- a/tests/AlignedSegment_test.py +++ b/tests/AlignedSegment_test.py @@ -46,19 +46,19 @@ class TestAlignedSegment(ReadTest): self.assertEqual(a.query_sequence, None) self.assertEqual(pysam.qualities_to_qualitystring(a.query_qualities), None) self.assertEqual(a.flag, 0) - self.assertEqual(a.reference_id, 0) + self.assertEqual(a.reference_id, -1) self.assertEqual(a.mapping_quality, 0) self.assertEqual(a.cigartuples, None) self.assertEqual(a.tags, []) - self.assertEqual(a.next_reference_id, 0) - self.assertEqual(a.next_reference_start, 0) + self.assertEqual(a.next_reference_id, -1) + self.assertEqual(a.next_reference_start, -1) self.assertEqual(a.template_length, 0) def testStrOfEmptyRead(self): a = pysam.AlignedSegment() s = str(a) self.assertEqual( - "None\t0\t0\t0\t0\tNone\t0\t0\t0\tNone\tNone\t[]", + "None\t0\t-1\t-1\t0\tNone\t-1\t-1\t0\tNone\tNone\t[]", s) def testSettingTagInEmptyRead(self): @@ -231,6 +231,24 @@ class TestAlignedSegment(ReadTest): 
self.assertEqual(a.get_blocks(), [(20, 30), (31, 40), (40, 60)]) + def test_infer_query_length(self): + '''Test infer_query_length on M|=|X|I|D|H|S cigar ops''' + a = self.buildRead() + a.cigarstring = '15M' + self.assertEqual(a.infer_query_length(), 15) + a.cigarstring = '15=' + self.assertEqual(a.infer_query_length(), 15) + a.cigarstring = '15X' + self.assertEqual(a.infer_query_length(), 15) + a.cigarstring = '5M5I5M' + self.assertEqual(a.infer_query_length(), 15) + a.cigarstring = '5M5D5M' + self.assertEqual(a.infer_query_length(), 10) + a.cigarstring = '5H10M' + self.assertEqual(a.infer_query_length(), 15) + a.cigarstring = '5S10M' + self.assertEqual(a.infer_query_length(), 15) + def test_get_aligned_pairs_soft_clipping(self): a = self.buildRead() a.cigartuples = ((4, 2), (0, 35), (4, 3)) @@ -375,6 +393,18 @@ class TestAlignedSegment(ReadTest): a.cigarstring = "1S20M1S" self.assertEqual(a.query_alignment_length, 20) + def test_query_length_is_limited(self): + + a = self.buildRead() + a.query_name = "A" * 1 + a.query_name = "A" * 254 + self.assertRaises( + ValueError, + setattr, + a, + "query_name", + "A" * 255) + class TestCigarStats(ReadTest): @@ -754,5 +784,6 @@ class TestAsString(unittest.TestCase): for s, p in zip(reference, pysamf): self.assertEqual(s, p.tostring(pysamf)) + if __name__ == "__main__": unittest.main() diff --git a/tests/AlignmentFile_test.py b/tests/AlignmentFile_test.py index c042f4f..18fb05b 100644 --- a/tests/AlignmentFile_test.py +++ b/tests/AlignmentFile_test.py @@ -29,7 +29,8 @@ from TestUtils import checkBinaryEqual, checkURL, \ get_temp_filename -DATADIR = "pysam_data" +DATADIR = os.path.abspath(os.path.join(os.path.dirname(__file__), + "pysam_data")) ################################################## @@ -353,26 +354,53 @@ class BasicTestBAMFromFilename(BasicTestBAMFromFetch): class BasicTestBAMFromFile(BasicTestBAMFromFetch): def setUp(self): - f = open(os.path.join(DATADIR, "ex3.bam")) - self.samfile = pysam.AlignmentFile( - f, "rb") + with open(os.path.join(DATADIR, "ex3.bam")) as f: + self.samfile = pysam.AlignmentFile( + f, "rb") + self.reads = [r for r in self.samfile] + + +class BasicTestBAMFromFileNo(BasicTestBAMFromFetch): + + def setUp(self): + with open(os.path.join(DATADIR, "ex3.bam")) as f: + self.samfile = pysam.AlignmentFile( + f.fileno(), "rb") self.reads = [r for r in self.samfile] class BasicTestSAMFromFile(BasicTestBAMFromFetch): def setUp(self): - f = open(os.path.join(DATADIR, "ex3.sam")) - self.samfile = pysam.AlignmentFile( - f, "r") + with open(os.path.join(DATADIR, "ex3.sam")) as f: + self.samfile = pysam.AlignmentFile( + f, "r") + self.reads = [r for r in self.samfile] + + +class BasicTestSAMFromFileNo(BasicTestBAMFromFetch): + + def setUp(self): + with open(os.path.join(DATADIR, "ex3.sam")) as f: + self.samfile = pysam.AlignmentFile( + f.fileno(), "r") self.reads = [r for r in self.samfile] class BasicTestCRAMFromFile(BasicTestCRAMFromFetch): def setUp(self): - f = open(os.path.join(DATADIR, "ex3.cram")) - self.samfile = pysam.AlignmentFile(f, "rc") + with open(os.path.join(DATADIR, "ex3.cram")) as f: + self.samfile = pysam.AlignmentFile(f, "rc") + self.reads = [r for r in self.samfile] + + +class BasicTestCRAMFromFileNo(BasicTestCRAMFromFetch): + + def setUp(self): + with open(os.path.join(DATADIR, "ex3.cram")) as f: + self.samfile = pysam.AlignmentFile( + f.fileno(), "rc") self.reads = [r for r in self.samfile] @@ -690,7 +718,7 @@ class TestIO(unittest.TestCase): samfile = pysam.AlignmentFile(f, "rb") f.close() 
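+ # The AlignmentFile constructed above stays usable after the Python-level + # file object is closed, presumably because pysam reads through its own + # handle on the underlying descriptor; the checkEcho() call below relies + # on exactly that.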
self.assertTrue(f.closed) - # access to Samfile should still work + # access to Samfile still works self.checkEcho("ex1.bam", "ex1.bam", "tmp_ex1.bam", @@ -818,6 +846,15 @@ class TestIO(unittest.TestCase): mode="rb") self.assertEqual(len(list(samfile.fetch())), 3270) + def testBAMWithCSIIndex(self): + '''see issue 116''' + input_filename = os.path.join(DATADIR, "ex1_csi.bam") + samfile = pysam.AlignmentFile(input_filename, + "rb", + check_sq=False) + samfile.fetch('chr2') + + class TestAutoDetect(unittest.TestCase): @@ -1291,8 +1328,8 @@ class TestHeaderSAM(unittest.TestCase): """testing header manipulation""" - header = {'SQ': [{'LN': 1575, 'SN': 'chr1'}, - {'LN': 1584, 'SN': 'chr2'}], + header = {'SQ': [{'LN': 1575, 'SN': 'chr1', 'AH': 'chr1:5000000-5010000'}, + {'LN': 1584, 'SN': 'chr2', 'AH': '*'}], 'RG': [{'LB': 'SC_1', 'ID': 'L1', 'SM': 'NA12891', 'PU': 'SC_1_10', "CN": "name:with:colon"}, {'LB': 'SC_2', 'ID': 'L2', 'SM': 'NA12891', @@ -2343,6 +2380,46 @@ class TestPileupQueryPosition(unittest.TestCase): last[r.alignment.query_name] = r.query_position +class TestFindIntrons(unittest.TestCase): + samfilename = "pysam_data/ex_spliced.bam" + + def setUp(self): + self.samfile = pysam.AlignmentFile(self.samfilename) + + def tearDown(self): + self.samfile.close() + + def test_total(self): + all_read_counts = self.samfile.count() + splice_sites = self.samfile.find_introns(self.samfile.fetch()) + self.assertEqual(sum(splice_sites.values()), all_read_counts -1) # there is a single unspliced read in there + + def test_first(self): + reads = list(self.samfile.fetch())[:10] + splice_sites = self.samfile.find_introns(reads) + starts = [14792+38 - 1] + stops = [14792+38 + 140 - 1] + self.assertEqual(len(splice_sites), 1) + self.assertTrue((starts[0], stops[0]) in splice_sites) + self.assertEqual(splice_sites[(starts[0], stops[0])], 9) # first one is the unspliced read + + def test_all(self): + reads = list(self.samfile.fetch()) + splice_sites = self.samfile.find_introns(reads) + should = collections.Counter({ + (14829, 14969): 33, + (15038, 15795): 24, + (15947, 16606): 3, + (16765, 16857): 9, + (16765, 16875): 1, + (17055, 17232): 19, + (17055, 17605): 3, + (17055, 17914): 1, + (17368, 17605): 7, + }) + self.assertEqual(should, splice_sites) + + class TestLogging(unittest.TestCase): '''test around bug issue 42, @@ -2511,7 +2588,6 @@ class TestMappedUnmapped(unittest.TestCase): inf.mapped) - class TestSamtoolsProxy(unittest.TestCase): '''tests for sanity checking access to samtools functions.''' @@ -2592,6 +2668,34 @@ class TestVerbosity(unittest.TestCase): self.assertEqual(pysam.get_verbosity(), 3) +class TestSanityCheckingBAM(unittest.TestCase): + + mode = "wb" + + def check_write(self, read): + + fn = "tmp_test_sanity_check.bam" + names = ["chr1"] + lengths = [10000] + with pysam.AlignmentFile( + fn, + self.mode, + reference_names=names, + reference_lengths=lengths) as outf: + outf.write(read) + + if os.path.exists(fn): + os.unlink(fn) + + def test_empty_read_gives_value_error(self): + read = pysam.AlignedSegment() + self.check_write(read) + +# SAM writing fails, as query length is 0 +# class TestSanityCheckingSAM(TestSanityCheckingSAM): +# mode = "w" + + if __name__ == "__main__": # build data files print ("building data files") diff --git a/tests/SamFile_test.py b/tests/SamFile_test.py index 1fc88f3..ff13045 100644 --- a/tests/SamFile_test.py +++ b/tests/SamFile_test.py @@ -941,8 +941,8 @@ class TestIteratorColumn2(unittest.TestCase): class TestHeaderSam(unittest.TestCase): - header = {'SQ': 
[{'LN': 1575, 'SN': 'chr1'}, - {'LN': 1584, 'SN': 'chr2'}], + header = {'SQ': [{'LN': 1575, 'SN': 'chr1', 'AH': 'chr1:5000000-5010000'}, + {'LN': 1584, 'SN': 'chr2', 'AH': '*'}], 'RG': [{'LB': 'SC_1', 'ID': 'L1', 'SM': 'NA12891', 'PU': 'SC_1_10', "CN": "name:with:colon"}, {'LB': 'SC_2', 'ID': 'L2', 'SM': 'NA12891', 'PU': 'SC_2_12', "CN": "name:with:colon"}], 'PG': [{'ID': 'P1', 'VN': '1.0'}, {'ID': 'P2', 'VN': '1.1'}], @@ -1231,19 +1231,19 @@ class TestAlignedRead(ReadTest): self.assertEqual(a.seq, None) self.assertEqual(a.qual, None) self.assertEqual(a.flag, 0) - self.assertEqual(a.rname, 0) + self.assertEqual(a.rname, -1) self.assertEqual(a.mapq, 0) self.assertEqual(a.cigar, []) self.assertEqual(a.tags, []) - self.assertEqual(a.mrnm, 0) - self.assertEqual(a.mpos, 0) + self.assertEqual(a.mrnm, -1) + self.assertEqual(a.mpos, -1) self.assertEqual(a.isize, 0) def testStrOfEmptyRead(self): a = pysam.AlignedRead() s = str(a) self.assertEqual( - "None\t0\t0\t0\t0\tNone\t0\t0\t0\tNone\tNone\t[]", + "None\t0\t-1\t-1\t0\tNone\t-1\t-1\t0\tNone\tNone\t[]", s) def buildRead(self): diff --git a/tests/StreamFiledescriptors_test.py b/tests/StreamFiledescriptors_test.py new file mode 100644 index 0000000..ce59da7 --- /dev/null +++ b/tests/StreamFiledescriptors_test.py @@ -0,0 +1,82 @@ +import os +import subprocess +import threading +import errno +import unittest + +from pysam import AlignmentFile + +DATADIR = os.path.abspath(os.path.join( + os.path.dirname(__file__), + "pysam_data")) + + +def alignmentfile_writer_thread(infile, outfile): + def _writer_thread(infile, outfile): + """read from infile and write to outfile""" + try: + i = 0 + for record in infile: + outfile.write(record) + i += 1 + except IOError as e: + # a closed pipe downstream is expected; re-raise anything else + if e.errno != errno.EPIPE: + raise + finally: + outfile.close() + + writer = threading.Thread(target=_writer_thread, args=(infile, outfile)) + writer.daemon = True + writer.start() + return writer + + +class StreamTest(unittest.TestCase): + + def stream_process(self, proc, in_stream, out_stream, writer): + + with AlignmentFile(proc.stdout) as infile: + read = 0 + for record in infile: + read += 1 + return 0, read + + def test_text_processing(self): + + proc = subprocess.Popen('head -n200', + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + shell=True) + + in_stream = AlignmentFile('pysam_data/ex1.bam') + out_stream = AlignmentFile(proc.stdin, 'wh', header=in_stream.header) + writer = alignmentfile_writer_thread(in_stream, + out_stream) + + written, read = self.stream_process(proc, + in_stream, + out_stream, + writer) + self.assertEqual(read, 198) + + def test_samtools_processing(self): + + proc = subprocess.Popen('samtools view -b -f 4', + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + shell=True) + + in_stream = AlignmentFile('pysam_data/ex1.bam') + out_stream = AlignmentFile(proc.stdin, 'wb', header=in_stream.header) + writer = alignmentfile_writer_thread(in_stream, + out_stream) + + written, read = self.stream_process(proc, + in_stream, + out_stream, + writer) + self.assertEqual(read, 35) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/VariantFile_test.py b/tests/VariantFile_test.py index ef21245..aa82c66 100644 --- a/tests/VariantFile_test.py +++ b/tests/VariantFile_test.py @@ -1,8 +1,15 @@ import os +import sys import unittest import pysam import gzip import subprocess + +try: + from pathlib import Path +except ImportError: + Path = None + from TestUtils import get_temp_filename, check_lines_equal DATADIR="cbcf_data" @@ -75,6 +82,17 @@ class 
TestOpening(unittest.TestCase): os.unlink("tmp_testEmptyFile.vcf") + + if Path and sys.version_info >= (3,6): + def testEmptyFileVCFFromPath(self): + with open("tmp_testEmptyFile.vcf", "w"): + pass + + self.assertRaises(ValueError, pysam.VariantFile, + Path("tmp_testEmptyFile.vcf")) + + os.unlink("tmp_testEmptyFile.vcf") + def testEmptyFileVCFGZWithIndex(self): with open("tmp_testEmptyFile.vcf", "w"): pass @@ -171,12 +189,12 @@ class TestHeader(unittest.TestCase): # remove last header line starting with #CHROM ref.pop() ref = sorted(ref) - comp = sorted([str(x) for x in v.header.records]) + comp = sorted(str(x) for x in v.header.records) self.assertEqual(len(ref), len(comp)) for x, y in zip(ref, comp): - self.assertEqual(x[:-1], str(y)) + self.assertEqual(x, y) # These tests need to be separate and start from newly opened files. This @@ -195,6 +213,13 @@ class TestParsing(unittest.TestCase): chrom = [rec.chrom for rec in v] self.assertEqual(chrom, ['M', '17', '20', '20', '20']) + if Path and sys.version_info >= (3,6): + def testChromFromPath(self): + fn = os.path.join(DATADIR, self.filename) + v = pysam.VariantFile(Path(fn)) + chrom = [rec.chrom for rec in v] + self.assertEqual(chrom, ['M', '17', '20', '20', '20']) + def testPos(self): fn = os.path.join(DATADIR, self.filename) v = pysam.VariantFile(fn) @@ -330,9 +355,22 @@ class TestConstructionVCFWithContigs(unittest.TestCase): """construct VariantFile from scratch.""" filename = "example_vcf42_withcontigs.vcf" + compression = 'NONE' + description = 'VCF version 4.2 variant calling text' - def complete_check(self, fn_in, fn_out): + def testBase(self): + with pysam.VariantFile(os.path.join(DATADIR, self.filename)) as inf: + self.assertEqual(inf.category, 'VARIANTS') + self.assertEqual(inf.format, 'VCF') + self.assertEqual(inf.version, (4, 2)) + self.assertEqual(inf.compression, self.compression) + self.assertEqual(inf.description, self.description) + self.assertTrue(inf.is_open) + self.assertEqual(inf.is_read, True) + self.assertEqual(inf.is_write, False) + def complete_check(self, fn_in, fn_out): + self.maxDiff = None check_lines_equal( self, fn_in, fn_out, sort=True, filter_f=lambda x: x.startswith("##contig")) @@ -349,14 +387,15 @@ class TestConstructionVCFWithContigs(unittest.TestCase): for record in vcf_in.header.records: header.add_record(record) - fn = str("tmp_VariantFileTest_testConstructionWithRecords") + ".vcf" - vcf_out = pysam.VariantFile(fn, "w", header=header) + for sample in vcf_in.header.samples: + header.add_sample(sample) + + vcf_out = pysam.VariantFile(fn_out, "w", header=header) for record in vcf_in: - # currently segfaults here: - # vcf_out.write(record) - pass - return + record.translate(header) + vcf_out.write(record) + vcf_in.close() vcf_out.close() self.complete_check(fn_in, fn_out) @@ -370,6 +409,7 @@ class TestConstructionVCFWithContigs(unittest.TestCase): for record in vcf_in: vcf_out.write(record) + vcf_in.close() vcf_out.close() self.complete_check(fn_in, fn_out) @@ -397,8 +437,8 @@ class TestConstructionVCFWithContigs(unittest.TestCase): self.complete_check(fn_in, fn_out) -# Currently segfaults for VCFs without contigs -# class TestConstructionVCFWithoutContigs(TestConstructionVCFWithContigs): + +#class TestConstructionVCFWithoutContigs(TestConstructionVCFWithContigs): # """construct VariantFile from scratch.""" # filename = "example_vcf40.vcf" @@ -407,18 +447,33 @@ class TestConstructionVCFGZWithContigs(TestConstructionVCFWithContigs): """construct VariantFile from scratch.""" filename = 
"example_vcf42_withcontigs.vcf.gz" + compression = 'BGZF' + description = 'VCF version 4.2 BGZF-compressed variant calling data' class TestConstructionVCFGZWithoutContigs(TestConstructionVCFWithContigs): """construct VariantFile from scratch.""" filename = "example_vcf42.vcf.gz" + compression = 'BGZF' + description = 'VCF version 4.2 BGZF-compressed variant calling data' class TestSettingRecordValues(unittest.TestCase): filename = "example_vcf40.vcf" + def testBase(self): + with pysam.VariantFile(os.path.join(DATADIR, self.filename)) as inf: + self.assertEqual(inf.category, 'VARIANTS') + self.assertEqual(inf.format, 'VCF') + self.assertEqual(inf.version, (4, 0)) + self.assertEqual(inf.compression, 'NONE') + self.assertEqual(inf.description, 'VCF version 4.0 variant calling text') + self.assertTrue(inf.is_open) + self.assertEqual(inf.is_read, True) + self.assertEqual(inf.is_write, False) + def testSetQual(self): with pysam.VariantFile(os.path.join(DATADIR, self.filename)) as inf: record = next(inf) @@ -435,8 +490,7 @@ class TestSettingRecordValues(unittest.TestCase): sample = record.samples["NA00001"] print (sample["GT"]) self.assertEqual(sample["GT"], (0, 0)) -# Fails with TypeError -# sample["GT"] = sample["GT"] + sample["GT"] = sample["GT"] class TestSubsetting(unittest.TestCase): diff --git a/tests/_compile_test.pyx b/tests/_compile_test.pyx index db6b5b6..dfe7937 100644 --- a/tests/_compile_test.pyx +++ b/tests/_compile_test.pyx @@ -1,5 +1,5 @@ -from pysam.calignmentfile cimport AlignmentFile, AlignedSegment -from pysam.ctabix cimport Tabixfile +from pysam.libcalignmentfile cimport AlignmentFile, AlignedSegment +from pysam.libctabix cimport Tabixfile cdef AlignmentFile samfile cdef Tabixfile tabixfile diff --git a/tests/_cython_flagstat.pyx b/tests/_cython_flagstat.pyx index f0f03bb..8e376b0 100644 --- a/tests/_cython_flagstat.pyx +++ b/tests/_cython_flagstat.pyx @@ -1,6 +1,6 @@ -from pysam.calignmentfile cimport AlignmentFile, AlignedSegment -from pysam.calignmentfile cimport pysam_get_flag -from pysam.calignmentfile cimport BAM_FPROPER_PAIR, BAM_FPAIRED +from pysam.libcalignmentfile cimport AlignmentFile, AlignedSegment +from pysam.libcalignmentfile cimport BAM_FPROPER_PAIR, BAM_FPAIRED +from pysam.libcalignedsegment cimport pysam_get_flag def count(AlignmentFile samfile): cdef int is_proper = 0 diff --git a/tests/cython_flagstat.py b/tests/cython_flagstat.py deleted file mode 100644 index 851157a..0000000 --- a/tests/cython_flagstat.py +++ /dev/null @@ -1,11 +0,0 @@ -import pysam - -import pyximport -pyximport.install() -import _cython_flagstat - -is_paired, is_proper = _cython_flagstat.count( - pysam.AlignmentFile("ex1.bam", "rb")) - -print ("there are alignments of %i paired reads" % is_paired) -print ("there are %i proper paired alignments" % is_proper) diff --git a/tests/pysam_data/Makefile b/tests/pysam_data/Makefile index 89a4a0c..2ccedd2 100644 --- a/tests/pysam_data/Makefile +++ b/tests/pysam_data/Makefile @@ -18,7 +18,8 @@ all: ex1.pileup.gz \ empty.bam empty.bam.bai \ explicit_index.bam explicit_index.cram \ faidx_empty_seq.fq.gz \ - ex1.fa.gz ex1.fa.gz.fai + ex1.fa.gz ex1.fa.gz.fai \ + ex1_csi.bam # ex2.sam - as ex1.sam, but with header ex2.sam.gz: ex1.bam ex1.bam.bai @@ -44,6 +45,7 @@ uncompressed.bam: ex2.sam ex1.fa.fai:ex1.fa samtools faidx ex1.fa + ex1.bam:ex1.sam.gz ex1.fa.fai samtools import ex1.fa.fai ex1.sam.gz ex1.bam @@ -56,6 +58,10 @@ ex1.pileup.gz:ex1.bam ex1.fa ex2_truncated.bam: ex2.bam head -c 124000 ex2.bam > ex2_truncated.bam +ex1_csi.bam: ex1.bam + cp 
ex1.bam ex1_csi.bam + samtools index -c ex1_csi.bam + empty.bam: ex2.sam grep "^@" $< | samtools view -Sb - > $@ diff --git a/tests/pysam_data/ex3.sam b/tests/pysam_data/ex3.sam index 495d4fe..7a09188 100644 --- a/tests/pysam_data/ex3.sam +++ b/tests/pysam_data/ex3.sam @@ -1,6 +1,6 @@ @HD VN:1.0 -@SQ SN:chr1 LN:1575 -@SQ SN:chr2 LN:1584 +@SQ SN:chr1 LN:1575 AH:chr1:5000000-5010000 +@SQ SN:chr2 LN:1584 AH:* @RG ID:L1 PU:SC_1_10 LB:SC_1 SM:NA12891 CN:name:with:colon @RG ID:L2 PU:SC_2_12 LB:SC_2 SM:NA12891 CN:name:with:colon @PG ID:P1 VN:1.0 diff --git a/tests/pysam_data/ex_spliced.sam b/tests/pysam_data/ex_spliced.sam new file mode 100644 index 0000000..ae8086a --- /dev/null +++ b/tests/pysam_data/ex_spliced.sam @@ -0,0 +1,297 @@ +@HD VN:1.4 SO:coordinate +@SQ SN:1 LN:248956422 +@SQ SN:2 LN:242193529 +@SQ SN:3 LN:198295559 +@SQ SN:4 LN:190214555 +@SQ SN:5 LN:181538259 +@SQ SN:6 LN:170805979 +@SQ SN:7 LN:159345973 +@SQ SN:8 LN:145138636 +@SQ SN:9 LN:138394717 +@SQ SN:10 LN:133797422 +@SQ SN:11 LN:135086622 +@SQ SN:12 LN:133275309 +@SQ SN:13 LN:114364328 +@SQ SN:14 LN:107043718 +@SQ SN:15 LN:101991189 +@SQ SN:16 LN:90338345 +@SQ SN:17 LN:83257441 +@SQ SN:18 LN:80373285 +@SQ SN:19 LN:58617616 +@SQ SN:20 LN:64444167 +@SQ SN:21 LN:46709983 +@SQ SN:22 LN:50818468 +@SQ SN:X LN:156040895 +@SQ SN:Y LN:57227415 +@SQ SN:MT LN:16569 +@SQ SN:GL000008.2 LN:209709 +@SQ SN:GL000009.2 LN:201709 +@SQ SN:GL000194.1 LN:191469 +@SQ SN:GL000195.1 LN:182896 +@SQ SN:GL000205.2 LN:185591 +@SQ SN:GL000208.1 LN:92689 +@SQ SN:GL000213.1 LN:164239 +@SQ SN:GL000214.1 LN:137718 +@SQ SN:GL000216.2 LN:176608 +@SQ SN:GL000218.1 LN:161147 +@SQ SN:GL000219.1 LN:179198 +@SQ SN:GL000220.1 LN:161802 +@SQ SN:GL000221.1 LN:155397 +@SQ SN:GL000224.1 LN:179693 +@SQ SN:GL000225.1 LN:211173 +@SQ SN:GL000226.1 LN:15008 +@SQ SN:KI270302.1 LN:2274 +@SQ SN:KI270303.1 LN:1942 +@SQ SN:KI270304.1 LN:2165 +@SQ SN:KI270305.1 LN:1472 +@SQ SN:KI270310.1 LN:1201 +@SQ SN:KI270311.1 LN:12399 +@SQ SN:KI270312.1 LN:998 +@SQ SN:KI270315.1 LN:2276 +@SQ SN:KI270316.1 LN:1444 +@SQ SN:KI270317.1 LN:37690 +@SQ SN:KI270320.1 LN:4416 +@SQ SN:KI270322.1 LN:21476 +@SQ SN:KI270329.1 LN:1040 +@SQ SN:KI270330.1 LN:1652 +@SQ SN:KI270333.1 LN:2699 +@SQ SN:KI270334.1 LN:1368 +@SQ SN:KI270335.1 LN:1048 +@SQ SN:KI270336.1 LN:1026 +@SQ SN:KI270337.1 LN:1121 +@SQ SN:KI270338.1 LN:1428 +@SQ SN:KI270340.1 LN:1428 +@SQ SN:KI270362.1 LN:3530 +@SQ SN:KI270363.1 LN:1803 +@SQ SN:KI270364.1 LN:2855 +@SQ SN:KI270366.1 LN:8320 +@SQ SN:KI270371.1 LN:2805 +@SQ SN:KI270372.1 LN:1650 +@SQ SN:KI270373.1 LN:1451 +@SQ SN:KI270374.1 LN:2656 +@SQ SN:KI270375.1 LN:2378 +@SQ SN:KI270376.1 LN:1136 +@SQ SN:KI270378.1 LN:1048 +@SQ SN:KI270379.1 LN:1045 +@SQ SN:KI270381.1 LN:1930 +@SQ SN:KI270382.1 LN:4215 +@SQ SN:KI270383.1 LN:1750 +@SQ SN:KI270384.1 LN:1658 +@SQ SN:KI270385.1 LN:990 +@SQ SN:KI270386.1 LN:1788 +@SQ SN:KI270387.1 LN:1537 +@SQ SN:KI270388.1 LN:1216 +@SQ SN:KI270389.1 LN:1298 +@SQ SN:KI270390.1 LN:2387 +@SQ SN:KI270391.1 LN:1484 +@SQ SN:KI270392.1 LN:971 +@SQ SN:KI270393.1 LN:1308 +@SQ SN:KI270394.1 LN:970 +@SQ SN:KI270395.1 LN:1143 +@SQ SN:KI270396.1 LN:1880 +@SQ SN:KI270411.1 LN:2646 +@SQ SN:KI270412.1 LN:1179 +@SQ SN:KI270414.1 LN:2489 +@SQ SN:KI270417.1 LN:2043 +@SQ SN:KI270418.1 LN:2145 +@SQ SN:KI270419.1 LN:1029 +@SQ SN:KI270420.1 LN:2321 +@SQ SN:KI270422.1 LN:1445 +@SQ SN:KI270423.1 LN:981 +@SQ SN:KI270424.1 LN:2140 +@SQ SN:KI270425.1 LN:1884 +@SQ SN:KI270429.1 LN:1361 +@SQ SN:KI270435.1 LN:92983 +@SQ SN:KI270438.1 LN:112505 +@SQ SN:KI270442.1 LN:392061 +@SQ SN:KI270448.1 
+@SQ	SN:KI270465.1	LN:1774
+@SQ	SN:KI270466.1	LN:1233
+@SQ	SN:KI270467.1	LN:3920
+@SQ	SN:KI270468.1	LN:4055
+@SQ	SN:KI270507.1	LN:5353
+@SQ	SN:KI270508.1	LN:1951
+@SQ	SN:KI270509.1	LN:2318
+@SQ	SN:KI270510.1	LN:2415
+@SQ	SN:KI270511.1	LN:8127
+@SQ	SN:KI270512.1	LN:22689
+@SQ	SN:KI270515.1	LN:6361
+@SQ	SN:KI270516.1	LN:1300
+@SQ	SN:KI270517.1	LN:3253
+@SQ	SN:KI270518.1	LN:2186
+@SQ	SN:KI270519.1	LN:138126
+@SQ	SN:KI270521.1	LN:7642
+@SQ	SN:KI270522.1	LN:5674
+@SQ	SN:KI270528.1	LN:2983
+@SQ	SN:KI270529.1	LN:1899
+@SQ	SN:KI270530.1	LN:2168
+@SQ	SN:KI270538.1	LN:91309
+@SQ	SN:KI270539.1	LN:993
+@SQ	SN:KI270544.1	LN:1202
+@SQ	SN:KI270548.1	LN:1599
+@SQ	SN:KI270579.1	LN:31033
+@SQ	SN:KI270580.1	LN:1553
+@SQ	SN:KI270581.1	LN:7046
+@SQ	SN:KI270582.1	LN:6504
+@SQ	SN:KI270583.1	LN:1400
+@SQ	SN:KI270584.1	LN:4513
+@SQ	SN:KI270587.1	LN:2969
+@SQ	SN:KI270588.1	LN:6158
+@SQ	SN:KI270589.1	LN:44474
+@SQ	SN:KI270590.1	LN:4685
+@SQ	SN:KI270591.1	LN:5796
+@SQ	SN:KI270593.1	LN:3041
+@SQ	SN:KI270706.1	LN:175055
+@SQ	SN:KI270707.1	LN:32032
+@SQ	SN:KI270708.1	LN:127682
+@SQ	SN:KI270709.1	LN:66860
+@SQ	SN:KI270710.1	LN:40176
+@SQ	SN:KI270711.1	LN:42210
+@SQ	SN:KI270712.1	LN:176043
+@SQ	SN:KI270713.1	LN:40745
+@SQ	SN:KI270714.1	LN:41717
+@SQ	SN:KI270715.1	LN:161471
+@SQ	SN:KI270716.1	LN:153799
+@SQ	SN:KI270717.1	LN:40062
+@SQ	SN:KI270718.1	LN:38054
+@SQ	SN:KI270719.1	LN:176845
+@SQ	SN:KI270720.1	LN:39050
+@SQ	SN:KI270721.1	LN:100316
+@SQ	SN:KI270722.1	LN:194050
+@SQ	SN:KI270723.1	LN:38115
+@SQ	SN:KI270724.1	LN:39555
+@SQ	SN:KI270725.1	LN:172810
+@SQ	SN:KI270726.1	LN:43739
+@SQ	SN:KI270727.1	LN:448248
+@SQ	SN:KI270728.1	LN:1872759
+@SQ	SN:KI270729.1	LN:280839
+@SQ	SN:KI270730.1	LN:112551
+@SQ	SN:KI270731.1	LN:150754
+@SQ	SN:KI270732.1	LN:41543
+@SQ	SN:KI270733.1	LN:179772
+@SQ	SN:KI270734.1	LN:165050
+@SQ	SN:KI270735.1	LN:42811
+@SQ	SN:KI270736.1	LN:181920
+@SQ	SN:KI270737.1	LN:103838
+@SQ	SN:KI270738.1	LN:99375
+@SQ	SN:KI270739.1	LN:73985
+@SQ	SN:KI270740.1	LN:37240
+@SQ	SN:KI270741.1	LN:157432
+@SQ	SN:KI270742.1	LN:186739
+@SQ	SN:KI270743.1	LN:210658
+@SQ	SN:KI270744.1	LN:168472
+@SQ	SN:KI270745.1	LN:41891
+@SQ	SN:KI270746.1	LN:66486
+@SQ	SN:KI270747.1	LN:198735
+@SQ	SN:KI270748.1	LN:93321
+@SQ	SN:KI270749.1	LN:158759
+@SQ	SN:KI270750.1	LN:148850
+@SQ	SN:KI270751.1	LN:150742
+@SQ	SN:KI270752.1	LN:27745
+@SQ	SN:KI270753.1	LN:62944
+@SQ	SN:KI270754.1	LN:40191
+@SQ	SN:KI270755.1	LN:36723
+@SQ	SN:KI270756.1	LN:79590
+@SQ	SN:KI270757.1	LN:71251
+@PG	ID:STAR	PN:STAR	VN:STAR_2.4.1a
+HWI-C00113:131:HMHYWADXX:1:2202:17748:47494	272	1	14792	0	51M	*	0	0	GGGCCTCTCACCAGCCCCAGGTCCTTTCCCAGAGATGCCCTTGCGCCTCAT	CCCFFFFFHHHHHFHIIJJIJAFHJJJJJGJIIHGIJGGIJJIIJIIJJJG	NH:i:6	HI:i:3	AS:i:47	nM:i:1
+HWI-C00113:131:HMHYWADXX:1:2202:17748:47494	272	1	14792	0	38M140N13M	*	0	0	GGGCCTCTCACCAGCCCCAGGTCCTTTCCCAGAGATGCCCTTGCGCCTCAT	CCCFFFFFHHHHHFHIIJJIJAFHJJJJJGJIIHGIJGGIJJIIJIIJJJG	NH:i:6	HI:i:3	AS:i:47	nM:i:1
+HWI-C00113:131:HMHYWADXX:2:1214:7658:35836	272	1	14792	0	38M140N13M	*	0	0	GGGCCCCTCACCAGCCCCAGGTCTTTTCCCAGAGATGCCCTTGCGCCTCAT	CCCFFFFFHHHHHJJJJJJJJCGHIJJIJJJJJJIJJGIJJIJIJIJJJJI	NH:i:6	HI:i:3	AS:i:47	nM:i:1
+HWI-C00113:131:HMHYWADXX:1:2114:4116:44566	272	1	14794	0	36M140N15M	*	0	0	GCCCCTCACCAGCCCCAGGTCTTTTCCCAGAGATGCCCTTGCGCCTCATGA	<@@DDDDDDFHCFHEFGBE+2AFH@GIEGF=GGHII9FB?BFFDEE??BD?D@DADDDDBDDD@FGHFHFHIHGI@?DGC@CF	NH:i:7	HI:i:4	AS:i:47	nM:i:1
+HWI-C00113:131:HMHYWADXX:1:1105:1515:82248	272	1	14802	0	28M140N23M	*	0	0	CCAGCCCCAGGTCCTTTCCCAGAGATGCCCTTGCGCCTCATGACCAGCTTG	??:ADBDDDDD:A<+C?AFDB@E?F4<*?:?1:??):??0009??9?(8BC	NH:i:7	HI:i:4	AS:i:49	nM:i:0
+HWI-C00113:131:HMHYWADXX:1:1110:16355:5537	272	1	14802	0	28M140N23M	*	0	0	CCAGCCCCAGGTCCTTTCCCAGAGATGCCCTTGCGCCTCATGACCAGCTTG	@CCFFFFFHH?ADHGIJIJJJJJIIEHIJJJJJIJIGIIJJIJJIIJIJJJ	NH:i:7	HI:i:4	AS:i:49	nM:i:0
+HWI-C00113:131:HMHYWADXX:2:1102:17802:20689	272	1	14805	0	25M140N26M	*	0	0	GCTCCAGGTCCTTTCCCAGAGATGCCCTTGCGCCTCATGACCAGCTTGTTG	CCCFFFFFHHHHHJJJJJJIJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJI	NH:i:7	HI:i:4	AS:i:47	nM:i:1
+HWI-C00113:131:HMHYWADXX:2:1104:7670:95815	272	1	14805	0	25M140N26M	*	0	0	GCCCCAGGTCCTTTCCCAGAGATGCCCTTGCGCCTCATGACCAGCTTGTTG	@@@DBDDDHHBFDBFGEBBGHG@HIBHIDHBGGGEFBDDDFDGBBBGCHHI	NH:i:7	HI:i:4	AS:i:49	nM:i:0
+HWI-C00113:131:HMHYWADXX:2:1110:11368:101298	272	1	14805	0	25M140N26M	*	0	0	GCCCCAGGTCCTTTCCCAGAGATGCCCTTGCGCCTCATGACCAGCTTGTTG	BCCFFFFFCFHHHJJJJJIJJJJJJJJJJJJJGJJJJJJJJJJJJJJJIJJ	NH:i:7	HI:i:4	AS:i:49	nM:i:0
+HWI-C00113:131:HMHYWADXX:2:1115:2363:85646	272	1	14805	0	25M140N26M	*	0	0	GCTCCAGGTCCTTTCCCAGAGATGCCCTTGCGCCTCATGACCAGCTTGTTG	@C@FFFB?CFDFHDHGIIIIEGIIIIEDGIIIIIIIIIGGIIGIIGCGHIH	NH:i:7	HI:i:4	AS:i:47	nM:i:1
+HWI-C00113:131:HMHYWADXX:2:2213:6044:80821	272	1	14805	0	25M140N26M	*	0	0	GCTCCGGGTCCTTTCCCAGAGATGCCCTTGCGCCTCATGACCAGCTTGTTG	@@@FFFFFFFBFDGIIJIJGGFHIIIJIIJGIIEHICHIGEGFG?FGHGA>9B8BF@	NH:i:7	HI:i:4	AS:i:49	nM:i:0
+HWI-C00113:131:HMHYWADXX:2:1215:4185:31561	272	1	14815	0	15M140N36M	*	0	0	CTTTCCCAGAGATGCCCTTGCGCCTCATGACCAGCTTGTTGAAGAGATCCG	CCCFFFFFHHHHGJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJIJJJH	NH:i:7	HI:i:4	AS:i:49	nM:i:0
+HWI-C00113:131:HMHYWADXX:1:2108:1506:70629	272	1	14816	0	14M140N37M	*	0	0	TTTCCCAGAGATGCCCTTGCGCCTCATGACCAGCTTGTTGAAGAGATCCGA	?@@;BDDD=DFFDDDFGGFCA?)1@F?F+AAEEEBFFIIF@DE4=	NH:i:8	HI:i:2	AS:i:47	nM:i:1
+HWI-C00113:131:HMHYWADXX:1:1108:6828:32713	272	1	15003	0	36M757N15M	*	0	0	CCGGCATCAAGTCCCCACCTTGGCTCGTGGCTCTCACTTGCTCCTGCTCCT	?@@ADDDDD?CDD:CFB@:@G@ABGFGFGFBFEAFEEEFCFCF@F=)8=@>	NH:i:8	HI:i:2	AS:i:45	nM:i:2
+HWI-C00113:131:HMHYWADXX:1:1111:7491:39504	272	1	15003	0	36M757N15M	*	0	0	CCGACATCAAGTGCCCACCTTGGCTCGTGGCTCTCACTTGCTCCTGCTCCT	CCCFFDFFFHHHFIGEGHIGIGGDGIJFHEHGGIJJJIJIJJJJJIIIIGI	NH:i:8	HI:i:2	AS:i:49	nM:i:0
+HWI-C00113:131:HMHYWADXX:1:1212:16079:85811	272	1	15003	0	36M757N15M	*	0	0	CCGGCATCAAGTCCCCACCTTGGCTCGTGGCTCTCACTTGCTCCTGCTCCT	@CCFFFFFHGHHHJJJJJJJIIJJJJJIJIJJHIIJJJJIJJJJJJJIJJJ	NH:i:8	HI:i:2	AS:i:45	nM:i:2
+HWI-C00113:131:HMHYWADXX:1:2101:7167:50357	272	1	15003	0	36M757N15M	*	0	0	CCGACATCAAGTGCCCACCTTGGCTCGTGGCTCTCACTTGCTCCTGCTCCT	@@@DD?DDFHHHD@?CG?FHGIIIIG@??BGHIE;8@BFEG	NH:i:8	HI:i:2	AS:i:49	nM:i:0
+HWI-C00113:131:HMHYWADXX:2:1211:4828:84953	272	1	15003	0	36M757N15M	*	0	0	CCGACATCAAGTGCCCACCTTGGCTCGTGGCTCTCACTTGCTCCTGCTCCT	@BB?DFFFHHHHHIJIJJJJJJJJJJJHIJJIIJJJJJJIIIJJJJJJIJI	NH:i:8	HI:i:2	AS:i:49	nM:i:0
+HWI-C00113:131:HMHYWADXX:2:2107:20905:80208	272	1	15003	0	36M757N15M	*	0	0	CCGACATCAAGTGCCCACCTTGGCTCGTGGCTCTCACTTGCTCCTGCTCCT	@CCFFFFFHHHFBHIIEIIDIHGGGGG@GGHCFGHIIJIGGGGIJIGIGGH	NH:i:8	HI:i:2	AS:i:49	nM:i:0
+HWI-C00113:131:HMHYWADXX:2:2112:6263:84991	272	1	15003	0	36M757N15M	*	0	0	CCGACATCAAGTCCCCACCTTGGCTCGTGGCTCTCACTTGCTCCTGCTCCT	@@@?DDDDFBH?FHIGIGIIGG;GHBGCD?DCGIIGHEGBBFHGGIHBFIG	NH:i:8	HI:i:2	AS:i:47	nM:i:1
+HWI-C00113:131:HMHYWADXX:2:2202:10314:26844	272	1	15003	0	36M757N15M	*	0	0	CCGACATCAAGTGCCCACCTTGGCTCGTGGCTCTCACTTGCTCCTGCTCCT	CCCFFFFFHHHHHJJJJJJJJJJJJJJIJJJJIIJJJJJJJJJJJJJJJJJ	NH:i:8	HI:i:2	AS:i:49	nM:i:0
+HWI-C00113:131:HMHYWADXX:2:2213:21028:90280	272	1	15003	0	36M757N15M	*	0	0	CCGACATCAAGTCCCCACCTTGGCTCGTGGCTCTCACTTGCTCCTGCTCCT	@@@BDDDAD?FDF9GIBB@@FFG3CFF:DD)?BD*9D@@F4BDEEEFFF8=	NH:i:8	HI:i:2	AS:i:47	nM:i:1
+HWI-C00113:131:HMHYWADXX:1:1216:14847:22529	272	1	15004	0	35M757N16M	*	0	0	CGACATCAAGTGCCCACCTTGGCTCGTGGCTCTCACTTGCTCCTGCTCCTT	@@@FFFFDHHBDHIIIJJJJIIIIIIJJIJJGIJIFIJJIDHHGBEHIJJJ	NH:i:8	HI:i:2	AS:i:49	nM:i:0
+HWI-C00113:131:HMHYWADXX:2:2111:14281:81135	272	1	15007	0	32M757N19M	*	0	0	CATCAAGTGCCCACCTTGGCTCGTGGCTCTCACTTGCTCCTGCTCCTTCTG	@@@DDDBD42=:ACFFIE?FFGAFF@FFFDGEAG>D@DBB9BC3D@EDFFA	NH:i:8	HI:i:2	AS:i:49	nM:i:0
+HWI-C00113:131:HMHYWADXX:1:2203:4824:93169	272	1	15008	0	31M757N20M	*	0	0	ATCAAGTGCCCACCTTGGCTCGTGGCTCTCACTTGCTCCTGCTCCTTCTGC	CCCFFFFFHHHHHJJJJIJJJJHIJIJJJJJJJJGIJJJJI?DFGFHIHJF	NH:i:8	HI:i:2	AS:i:49	nM:i:0
+HWI-C00113:131:HMHYWADXX:1:1112:17298:87937	272	1	15925	1	23M659N28M	*	0	0	CACTTCCCTGGGAGCTCCCTGGACTGAAGGAGACGCGCTGCTGCTGCTGTC	?@;;BA;3ABC?C?6EGDGIIBA+AACACD>>:9:??2<	NH:i:6	HI:i:2	AS:i:49	nM:i:0
+HWI-C00113:131:HMHYWADXX:2:2109:14386:93817	272	1	16728	0	38M92N13M	*	0	0	GGGCGGTGGGGGTGGTGTTAGTACCCCATCTTGTAGGTCTTGAGAGGCTCG	@CCFFFDDHHHHDHIFHIJJJGHHIIJHHHHHHFFFFEFEEEECDDDDDDB	NH:i:6	HI:i:2	AS:i:49	nM:i:0
+HWI-C00113:131:HMHYWADXX:1:2203:14322:7218	272	1	16741	0	25M110N26M	*	0	0	GGTGTTAGTACCCCATCTTGTAGGTCTCAGTGTGGAAGGTGGGCAGTTCTG	?@?DDD?BFFHHFB7EFGGGEFHIHADB8D>822BDG?FHBGEH?FHGG3	NH:i:6	HI:i:1	AS:i:49	nM:i:0
+HWI-C00113:131:HMHYWADXX:1:2116:7403:96086	272	1	17020	0	36M177N15M	*	0	0	GCCCAGGTCTGGCACATAGAAGTAGTTCTCTGGGACCTGCTGTTCCAGCTG	:?=DDD=AAAC:+BDIIIIIIA?	NH:i:7	HI:i:5	AS:i:49	nM:i:0
+HWI-C00113:131:HMHYWADXX:2:1209:11002:81132	272	1	17020	0	36M177N15M	*	0	0	GCCCGGGTCTGGCACATAGAAGTAGTTCTCTGGGACCTGCTGTTCCAGCTG	@@@DD@A<@DDDF;BCGF<4CHCEG?EG@FGF9)?BB:?B?DBF>D?**9B	NH:i:7	HI:i:4	AS:i:47	nM:i:1
+HWI-C00113:131:HMHYWADXX:2:1115:8064:78307	272	1	17021	0	35M177N16M	*	0	0	CCCTGGTCTGGCACATAGAAGTAGTTCTCTGGGACCTGCTGTTCCAGCTGC	11844BBDD=FDFEFFFDFI?HEHAFBEHEEEFC?E:FDGDDDH9CBEHHHEEFB?F>GD@3?FB?BB@	NH:i:7	HI:i:6	AS:i:49	nM:i:0
+HWI-C00113:131:HMHYWADXX:2:1101:15891:42282	272	1	17028	0	28M177N23M	*	0	0	CTGGCACATAGAAGTAGTTCTCTGGGACCTGCTGTTCCAGCTGCTCTCTCT	CCCFFFFFHHHHHJHHIIJJJJJJJJJJIIJJJJIJJJIJJJJJJJJJJJJ	NH:i:7	HI:i:6	AS:i:49	nM:i:0
+HWI-C00113:131:HMHYWADXX:1:1107:10929:6659	272	1	17030	0	26M177N25M	*	0	0	GGCACATAGAAGTAGTTCTCTGGGACCTGCTGTTCCAGCTGCTCTCTCTTG	CCCFFFFFHHHHDHIHHJJGJJJJJJJIJJIJGIJJJIJJJIJJJJJIJJG	NH:i:7	HI:i:6	AS:i:49	nM:i:0
+HWI-C00113:131:HMHYWADXX:1:1114:7098:71178	272	1	17030	0	26M177N25M	*	0	0	GGCACATAGAAGTAGTTCTCTGGGACCTGCTGTTCCAGCTGCTCTCTCTTG	=?@BDEEFHBDHFBGIEGIHEHIGDHGEIIJIIIEHIHIIGHDGHIGIIH@	NH:i:7	HI:i:6	AS:i:49	nM:i:0
+HWI-C00113:131:HMHYWADXX:1:1209:3383:100724	272	1	17030	0	26M177N25M	*	0	0	GGCACATAGAAGTAGTTCTCTGGGACCTGCTGTTCCAGCTGCTCTCTCTTG	?@@ADDDDDHDH?EEFHEHIGIIGHGHIFII>BFIH?	NH:i:7	HI:i:6	AS:i:49	nM:i:0
+HWI-C00113:131:HMHYWADXX:1:2111:3771:31345	272	1	17030	0	26M177N25M	*	0	0	GGCACATAGAAGTAGTTCTCTGGGACCTGCTGTTCCAGCTGCTCTCTCTTG	@@@DFFFFGHDHHHJGIHJJJJGIJJIJIJIIJJIIJJIGHIJJJIJJIJ<	NH:i:7	HI:i:6	AS:i:49	nM:i:0
+HWI-C00113:131:HMHYWADXX:1:2205:14794:36455	272	1	17030	0	26M177N25M	*	0	0	GGCACATAGAAGTAGTTCTCTGGGACCTGCTGTTCCAGCTGCTCTCTCTTG	CCCFFFFFHHHHHIJJJJJJJJJJJJJJJJJJIJJJJIJJIJJJJJJJJJJ	NH:i:7	HI:i:6	AS:i:49	nM:i:0
+HWI-C00113:131:HMHYWADXX:2:1107:19701:64552	272	1	17030	0	26M177N25M	*	0	0	GGCACATAGAAGTAGTTCTCTGGGACCTGCTGTTCCAGCTGCTCTCTCTTG	CCCFFDFFHHHHDGIIJIJJJIIJDGHGJJJJJJIJJJJJJJGIJJJJJJF	NH:i:7	HI:i:6	AS:i:49	nM:i:0
+HWI-C00113:131:HMHYWADXX:2:1210:18711:88303	272	1	17030	0	26M177N25M	*	0	0	GGCACATAGAAGTAGTTCTCTGGGACCTGCTGTTCCAGCTGCTCTCTCTTG	CCCFFFFFHHHHHJJJJJJJJIJFIJJJEIIHIIJJIIJJGJJJIJJJJJE	NH:i:7	HI:i:6	AS:i:49	nM:i:0
+HWI-C00113:131:HMHYWADXX:2:2212:19113:15559	272	1	17030	0	26M177N25M	*	0	0	GGCACATAGAAGTAGTTCTCTGGGACCTGCTGTTCCAGCTGCTCTCTCTTG	@@@7B>DDC=@BFBGBAFCGBFDB@DHIHIDD>@@GHID	NH:i:7	HI:i:6	AS:i:49	nM:i:0
+HWI-C00113:131:HMHYWADXX:2:1212:15591:47491	272	1	17031	0	25M177N26M	*	0	0	GCACATAGAAGTAGTTCTCTGGGACCTGCTGTTCCAGCTGCTCTCTCTTGC	@@C+ADDDDHFFDEGEGIIIDFHIFHIIIIIGEHIIBH>FGGGHGHFGGII	NH:i:7	HI:i:6	AS:i:49	nM:i:0
+HWI-C00113:131:HMHYWADXX:2:2215:10125:81395	272	1	17031	0	25M859N26M	*	0	0	GCACATAGAAGTAGTTCTCTGGGACCTGCAGGGCCCGCTCGTCCAGGGGGC	CCCFFFFFGHHHHJJJJJJJJJHJJJJJJIJIIJJJHIJJJJJJJJJIJHE	NH:i:6	HI:i:1	AS:i:49	nM:i:0
+HWI-C00113:131:HMHYWADXX:1:2102:9065:90529	16	1	17033	0	2S23M550N26M	*	0	0	GTACATAGAAGTAGTTCTCTGGGACAGGTTCTCGGTGGTGTTGAAGAGCAG	C@CFFFFFHHHHHJJJJJJJJJJJJJJJJJJJJJJFHIFHIJIJJJJJJJJ	NH:i:5	HI:i:2	AS:i:47	nM:i:0
+HWI-C00113:131:HMHYWADXX:1:2204:7767:77376	16	1	17033	0	2S23M550N26M	*	0	0	GTACATAGAAGTAGTTCTCTGGGACAGGTTCTCGGTGGTGTTGAAGAGCAG	@@@FDFDDBFHADEHEIGIGIJIGHIHG?EDGHGGCFH:B?BD@FGFHGIH	NH:i:5	HI:i:2	AS:i:47	nM:i:0
+HWI-C00113:131:HMHYWADXX:2:1212:6793:42000	16	1	17033	0	2S23M550N26M	*	0	0	GTACATAGAAGTAGTTCTCTGGGACAGGTTCTCGGTGGTGTTGAAGAGCAG	@@?DADBD8CFADGFHIIIIE3A9?DH?FHGHH@EHGIEHGGIIIGGHIGHGFDEHGH=FHGIIH	NH:i:3	HI:i:1	AS:i:49	nM:i:0
+HWI-C00113:131:HMHYWADXX:2:2207:3786:78354	16	1	17340	1	29M237N22M	*	0	0	GGGGGTCCAGGAAGACATACTTCTTCTACAGGTTCTCGGTGGTGTTGAAGA	CCCFFFFFHHHHHJJJJJIIJJJJJJJJJJJJHHIJIHHBFIHIIJJJJJI	NH:i:3	HI:i:1	AS:i:49	nM:i:0
+HWI-C00113:131:HMHYWADXX:1:1115:8438:81914	16	1	17341	1	28M237N23M	*	0	0	GGGGTCCAGGAAGACATACTTCTTCTACAGGTTCTCGGTGGTGTTGAAGAG	@@CFFFFDHH?HDGGHIIGIGHIGHGIDIIIFGIIGHHDG:?DFHEHIIII	NH:i:3	HI:i:1	AS:i:49	nM:i:0
+HWI-C00113:131:HMHYWADXX:2:1114:13486:49038	16	1	17341	1	28M237N23M	*	0	0	GGGGTCCAGGAAGACATACTTCTTCTACAGGTTCTCGGTGGTGTTGAAGAG	?@@:D@DDFHAFFHGGFHFHH@CCHIIIII@:CFGFGGC?D)?8DHHGCGI	NH:i:3	HI:i:1	AS:i:49	nM:i:0
diff --git a/tests/python_flagstat.py b/tests/python_flagstat.py
deleted file mode 100644
index b14e52d..0000000
--- a/tests/python_flagstat.py
+++ /dev/null
@@ -1,11 +0,0 @@
-import pysam
-
-is_paired = 0
-is_proper = 0
-
-for read in pysam.AlignmentFile("ex1.bam", "rb"):
-    is_paired += read.is_paired
-    is_proper += read.is_proper_pair
-
-print ("there are alignments of %i paired reads" % is_paired)
-print ("there are %i proper paired alignments" % is_proper)
diff --git a/tests/samtools_test.py b/tests/samtools_test.py
index d5b2791..aa4c554 100644
--- a/tests/samtools_test.py
+++ b/tests/samtools_test.py
@@ -71,6 +71,7 @@ class SamtoolsTest(unittest.TestCase):
     # an output file.
     statements = [
         "view ex1.bam > %(out)s_ex1.view",
+        "view -c ex1.bam > %(out)s_ex1.count",
         # ("view -bT ex1.fa -o %(out)s_ex1.view2 ex1.sam",
         "sort ex1.bam -o %(out)s_ex1.sort.bam",
         "mpileup ex1.bam > %(out)s_ex1.pileup",
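
Aside on the TestSettingRecordValues.testBase addition earlier in this patch: the
attributes it asserts (category, format, version, compression, description,
is_open, is_read, is_write) are the file-format metadata that this release
exposes on VariantFile via the shared HTSFile layer. A minimal sketch of
querying them outside the test harness, assuming a plain-text VCF 4.0 file at
the hypothetical path example_vcf40.vcf:

    import pysam

    # Metadata comes from the common HTSFile layer, so the same properties
    # should also be available on AlignmentFile and TabixFile objects.
    with pysam.VariantFile("example_vcf40.vcf") as vf:
        print(vf.category)     # 'VARIANTS'
        print(vf.format)       # 'VCF'
        print(vf.version)      # (4, 0) - a (major, minor) tuple
        print(vf.compression)  # 'NONE' for plain text, 'BGZF' for a .vcf.gz
        print(vf.description)  # human-readable summary of the above
        assert vf.is_open and vf.is_read and not vf.is_write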
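
Aside on ex1_csi.bam and the new "view -c" test statement: the Makefile rule
copies ex1.bam and indexes it with a CSI index (samtools index -c) instead of
the default BAI, and samtools_test.py now also exercises the counting form of
view. A rough equivalent through pysam's samtools command dispatch, assuming
ex1.bam sits in the current directory (pysam.index and pysam.view forward
their arguments to the corresponding samtools commands; pysam.view returns the
captured standard output as a string):

    import pysam

    # Build a CSI index (ex1.bam.csi), as the Makefile rule does with -c;
    # htslib picks it up transparently when fetching regions later.
    pysam.index("-c", "ex1.bam")

    # "samtools view -c": count alignments instead of printing them.
    count = pysam.view("-c", "ex1.bam")
    print(int(count))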