From dadf905780b2c683a23346e1eece6aa56717ca88 Mon Sep 17 00:00:00 2001 From: Afif Elghraoui Date: Mon, 23 Jan 2017 19:09:05 -0800 Subject: [PATCH] Imported Upstream version 0.10.0+ds --- .travis.yml | 4 +- bcftools/vcfisec.c | 2 +- bcftools/vcfisec.c.pysam.c | 2 +- benchmark/cython_flagstat.py | 23 + benchmark/python_flagstat.py | 23 + buildwheels.sh | 69 + cy_build.py | 2 - doc/api.rst | 2 +- doc/release.rst | 29 + doc/usage.rst | 14 +- pysam/__init__.py | 77 +- pysam/chtslib.pyx | 19 - ...gnedsegment.pxd => libcalignedsegment.pxd} | 4 +- ...gnedsegment.pyx => libcalignedsegment.pyx} | 88 +- ...lignmentfile.pxd => libcalignmentfile.pxd} | 40 +- ...lignmentfile.pyx => libcalignmentfile.pyx} | 343 ++-- pysam/{cbcf.pxd => libcbcf.pxd} | 33 +- pysam/{cbcf.pyx => libcbcf.pyx} | 1505 +++++++++-------- pysam/libcbgzf.pyx | 209 +++ pysam/{cfaidx.pxd => libcfaidx.pxd} | 4 +- pysam/{cfaidx.pyx => libcfaidx.pyx} | 13 +- pysam/{chtslib.pxd => libchtslib.pxd} | 18 + pysam/libchtslib.pyx | 265 +++ pysam/{csamfile.pxd => libcsamfile.pxd} | 4 +- pysam/{csamfile.pyx => libcsamfile.pyx} | 2 +- pysam/{ctabix.pxd => libctabix.pxd} | 28 +- pysam/{ctabix.pyx => libctabix.pyx} | 94 +- ...ctabixproxies.pxd => libctabixproxies.pxd} | 0 ...ctabixproxies.pyx => libctabixproxies.pyx} | 4 +- pysam/{cutils.pxd => libcutils.pxd} | 0 pysam/{cutils.pyx => libcutils.pyx} | 44 +- pysam/{cvcf.pxd => libcvcf.pxd} | 0 pysam/{cvcf.pyx => libcvcf.pyx} | 12 +- pysam/utils.py | 2 +- pysam/version.py | 6 +- requirements.txt | 2 +- run_tests_travis.sh | 82 +- samtools/sam_view.c.pysam.c | 4 +- setup.py | 76 +- tests/AlignedSegment_test.py | 39 +- tests/AlignmentFile_test.py | 130 +- tests/SamFile_test.py | 12 +- tests/StreamFiledescriptors_test.py | 82 + tests/VariantFile_test.py | 80 +- tests/_compile_test.pyx | 4 +- tests/_cython_flagstat.pyx | 6 +- tests/cython_flagstat.py | 11 - tests/pysam_data/Makefile | 8 +- tests/pysam_data/ex3.sam | 4 +- tests/pysam_data/ex_spliced.sam | 297 ++++ tests/python_flagstat.py | 11 - tests/samtools_test.py | 1 + 52 files changed, 2539 insertions(+), 1294 deletions(-) create mode 100644 benchmark/cython_flagstat.py create mode 100644 benchmark/python_flagstat.py create mode 100755 buildwheels.sh delete mode 100644 pysam/chtslib.pyx rename pysam/{calignedsegment.pxd => libcalignedsegment.pxd} (97%) rename pysam/{calignedsegment.pyx => libcalignedsegment.pyx} (97%) rename pysam/{calignmentfile.pxd => libcalignmentfile.pxd} (83%) rename pysam/{calignmentfile.pyx => libcalignmentfile.pyx} (92%) rename pysam/{cbcf.pxd => libcbcf.pxd} (72%) rename pysam/{cbcf.pyx => libcbcf.pyx} (76%) create mode 100644 pysam/libcbgzf.pyx rename pysam/{cfaidx.pxd => libcfaidx.pxd} (94%) rename pysam/{cfaidx.pyx => libcfaidx.pyx} (97%) rename pysam/{chtslib.pxd => libchtslib.pxd} (99%) create mode 100644 pysam/libchtslib.pyx rename pysam/{csamfile.pxd => libcsamfile.pxd} (93%) rename pysam/{csamfile.pyx => libcsamfile.pyx} (92%) rename pysam/{ctabix.pxd => libctabix.pxd} (89%) rename pysam/{ctabix.pyx => libctabix.pyx} (94%) rename pysam/{ctabixproxies.pxd => libctabixproxies.pxd} (100%) rename pysam/{ctabixproxies.pyx => libctabixproxies.pyx} (99%) rename pysam/{cutils.pxd => libcutils.pxd} (100%) rename pysam/{cutils.pyx => libcutils.pyx} (92%) rename pysam/{cvcf.pxd => libcvcf.pxd} (100%) rename pysam/{cvcf.pyx => libcvcf.pyx} (99%) create mode 100644 tests/StreamFiledescriptors_test.py delete mode 100644 tests/cython_flagstat.py create mode 100644 tests/pysam_data/ex_spliced.sam delete mode 100644 tests/python_flagstat.py diff --git a/.travis.yml b/.travis.yml index 1482ed7..bfc5d1c 100644 --- a/.travis.yml +++ b/.travis.yml @@ -3,14 +3,14 @@ os: - osx language: c -sudo: required +sudo: false env: matrix: - CONDA_PY=2.7 - - CONDA_PY=3.3 - CONDA_PY=3.4 - CONDA_PY=3.5 + - CONDA_PY=3.6 addons: apt: diff --git a/bcftools/vcfisec.c b/bcftools/vcfisec.c index 9afe620..9eb3a7c 100644 --- a/bcftools/vcfisec.c +++ b/bcftools/vcfisec.c @@ -317,7 +317,7 @@ static void init_data(args_t *args) while (*p && *p!=',') p++; if ( *p==',' ) p++; } - if ( args->nwrite>1 && !args->prefix ) error("Expected -p when mutliple output files given: --write %s\n", args->write_files); + if ( args->nwrite>1 && !args->prefix ) error("Expected -p when multiple output files given: --write %s\n", args->write_files); if ( args->isec_op==OP_COMPLEMENT && args->nwrite ) { if ( args->nwrite>1 ) error("Multiple files to -w make no sense with -C\n"); diff --git a/bcftools/vcfisec.c.pysam.c b/bcftools/vcfisec.c.pysam.c index 758d475..e3890d5 100644 --- a/bcftools/vcfisec.c.pysam.c +++ b/bcftools/vcfisec.c.pysam.c @@ -319,7 +319,7 @@ static void init_data(args_t *args) while (*p && *p!=',') p++; if ( *p==',' ) p++; } - if ( args->nwrite>1 && !args->prefix ) error("Expected -p when mutliple output files given: --write %s\n", args->write_files); + if ( args->nwrite>1 && !args->prefix ) error("Expected -p when multiple output files given: --write %s\n", args->write_files); if ( args->isec_op==OP_COMPLEMENT && args->nwrite ) { if ( args->nwrite>1 ) error("Multiple files to -w make no sense with -C\n"); diff --git a/benchmark/cython_flagstat.py b/benchmark/cython_flagstat.py new file mode 100644 index 0000000..6a9b7df --- /dev/null +++ b/benchmark/cython_flagstat.py @@ -0,0 +1,23 @@ +"""compute number of reads/alignments from BAM file +=================================================== + +This is a benchmarking utility script with limited functionality. + +Compute simple flag stats on a BAM-file using +the pysam cython interface. + +""" + +import sys +import pysam +import pyximport +pyximport.install() +import _cython_flagstat + +assert len(sys.argv) == 2, "USAGE: {} filename.bam".format(sys.argv[0]) + +is_paired, is_proper = _cython_flagstat.count( + pysam.AlignmentFile(sys.argv[1], "rb")) + +print ("there are alignments of %i paired reads" % is_paired) +print ("there are %i proper paired alignments" % is_proper) diff --git a/benchmark/python_flagstat.py b/benchmark/python_flagstat.py new file mode 100644 index 0000000..0a14d80 --- /dev/null +++ b/benchmark/python_flagstat.py @@ -0,0 +1,23 @@ +"""compute number of reads/alignments from BAM file +=================================================== + +This is a benchmarking utility script with limited functionality. + +Compute simple flag stats on a BAM-file using +the pysam python interface. +""" + +import sys +import pysam + +assert len(sys.argv) == 2, "USAGE: {} filename.bam".format(sys.argv[0]) + +is_paired = 0 +is_proper = 0 + +for read in pysam.AlignmentFile(sys.argv[1], "rb"): + is_paired += read.is_paired + is_proper += read.is_proper_pair + +print ("there are alignments of %i paired reads" % is_paired) +print ("there are %i proper paired alignments" % is_proper) diff --git a/buildwheels.sh b/buildwheels.sh new file mode 100755 index 0000000..a5987f1 --- /dev/null +++ b/buildwheels.sh @@ -0,0 +1,69 @@ +#!/bin/bash +# +# Build manylinux1 wheels for pysam. Based on the example at +# +# +# It is best to run this in a fresh clone of the repository! +# +# Run this within the repository root: +# docker run --rm -v $(pwd):/io quay.io/pypa/manylinux1_x86_64 /io/buildwheels.sh +# +# The wheels will be put into the wheelhouse/ subdirectory. +# +# For interactive tests: +# docker run -it -v $(pwd):/io quay.io/pypa/manylinux1_x86_64 /bin/bash + +set -xeuo pipefail + +# For convenience, if this script is called from outside of a docker container, +# it starts a container and runs itself inside of it. +if ! grep -q docker /proc/1/cgroup; then + # We are not inside a container + exec docker run --rm -v $(pwd):/io quay.io/pypa/manylinux1_x86_64 /io/$0 +fi + +yum install -y zlib-devel + +# Python 2.6 is not supported +rm -r /opt/python/cp26* + +# Python 3.3 builds fail with: +# /opt/rh/devtoolset-2/root/usr/libexec/gcc/x86_64-CentOS-linux/4.8.2/ld: cannot find -lchtslib +rm -r /opt/python/cp33* + +# Without libcurl support, htslib can open files from HTTP and FTP URLs. +# With libcurl support, it also supports HTTPS and S3 URLs, but libcurl needs a +# current version of OpenSSL, and we do not want to be responsible for +# updating the wheels as soon as there are any security issues. So disable +# libcurl for now. +# See also . +# +export HTSLIB_CONFIGURE_OPTIONS="--disable-libcurl" + +PYBINS="/opt/python/*/bin" +for PYBIN in ${PYBINS}; do + ${PYBIN}/pip install -r /io/requirements.txt + ${PYBIN}/pip wheel /io/ -w wheelhouse/ +done + +# Bundle external shared libraries into the wheels +# +# The '-L .' option is a workaround. By default, auditwheel puts all external +# libraries (.so files) into a .libs directory and sets the RUNPATH to $ORIGIN/.libs. +# When HTSLIB_MODE is 'shared' (now the default), then all so libraries part of +# pysam require that RUNPATH is set to $ORIGIN (without the .libs). It seems +# auditwheel overwrites $ORIGIN with $ORIGIN/.libs. This workaround makes +# auditwheel set the RUNPATH to "$ORIGIN/." and it will work as desired. +# +for whl in wheelhouse/*.whl; do + auditwheel repair -L . $whl -w /io/wheelhouse/ +done + +# Created files are owned by root, so fix permissions. +chown -R --reference=/io/setup.py /io/wheelhouse/ + +# TODO Install packages and test them +#for PYBIN in ${PYBINS}; do +# ${PYBIN}/pip install pysam --no-index -f /io/wheelhouse +# (cd $HOME; ${PYBIN}/nosetests ...) +#done diff --git a/cy_build.py b/cy_build.py index 880b5cc..29af588 100644 --- a/cy_build.py +++ b/cy_build.py @@ -16,7 +16,6 @@ if sys.platform == 'darwin': config_vars = get_config_vars() config_vars['LDSHARED'] = config_vars['LDSHARED'].replace('-bundle', '') config_vars['SHLIB_EXT'] = '.so' - config_vars['SO'] = '.so' def is_pip_install(): @@ -61,7 +60,6 @@ class cy_build_ext(build_ext): ext.library_dirs.append(os.path.join(self.build_lib, "pysam")) if sys.platform == 'darwin': - relative_module_path = ext.name.replace(".", os.sep) + get_config_vars()["SO"] if "develop" in sys.argv or "test" in sys.argv: diff --git a/doc/api.rst b/doc/api.rst index 671fe4e..686c60d 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -201,7 +201,7 @@ Fastq files :members: -.. autoclass:: pysam.cfaidx.FastqProxy +.. autoclass:: pysam.FastqProxy :members: diff --git a/doc/release.rst b/doc/release.rst index f49b8f0..1d378f3 100644 --- a/doc/release.rst +++ b/doc/release.rst @@ -2,6 +2,35 @@ Release notes ============= +Release 0.10.0 +============== + +This release implements further functionality in the VariantFile API +and includes several bugfixes: + +* treat special case -c option in samtools view outputs to stdout even + if -o given, fixes #315 +* permit reading BAM files with CSI index, closes #370 +* raise Error if query name exceeds maximum length, fixes #373 +* new method to compute hash value for AlignedSegment +* AlignmentFile, VariantFile and TabixFile all inherit from HTSFile +* Avoid segfault by detecting out of range reference_id and + next_reference in AlignedSegment.tostring +* Issue #355: Implement streams using file descriptors for VariantFile +* upgrade to htslib 1.3.2 +* fix compilation with musl libc +* Issue #316, #360: Rename all Cython modules to have lib as a prefix +* Issue #332, hardclipped bases in cigar included by + pysam.AlignedSegment.infer_query_length() +* Added support for Python 3.6 filename encoding protocol +* Issue #371, fix incorrect parsing of scalar INFO and FORMAT fields in VariantRecord +* Issue #331, fix failure in VariantFile.reset() method +* Issue #314, add VariantHeader.new_record(), VariantFile.new_record() and + VariantRecord.copy() methods to create new VariantRecord objects +* Added VariantRecordFilter.add() method to allow setting new VariantRecord filters +* Preliminary (potentially unsafe) support for removing and altering header metadata +* Many minor fixes and improvements to VariantFile and related objects + Release 0.9.1 ============= diff --git a/doc/usage.rst b/doc/usage.rst index 90e7688..936f3bd 100644 --- a/doc/usage.rst +++ b/doc/usage.rst @@ -197,12 +197,22 @@ be retrieved by numeric index: for row in tbx.fetch("chr1", 1000, 2000): print ("chromosome is", row[0]) -By providing a parser argument to :class:`~pysam.AlignmentFile.fetch` +By providing a parser to :class:`~pysam.AlignmentFile.fetch` or :class:`~pysam.TabixFile`, the data will we presented in parsed -form: +form:: for row in tbx.fetch("chr1", 1000, 2000, parser=pysam.asTuple()): print ("chromosome is", row.contig) + print ("first field (chrom)=", row[0]) + +Pre-built parsers are available for :term:`bed` +(:class:`~pysam.asBed`) formatted files and :term:`gtf` +(:class:`~pysam.asGTF`) formatted files. Thus, additional fields +become available through named access, for example:: + + for row in tbx.fetch("chr1", 1000, 2000, parser=pysam.asBed()): + print ("name is", row.name) + .. Currently inactivated as pileup deprecated .. Using the samtools SNP caller diff --git a/pysam/__init__.py b/pysam/__init__.py index d1b5d41..ed17e04 100644 --- a/pysam/__init__.py +++ b/pysam/__init__.py @@ -3,22 +3,24 @@ import sys import sysconfig from pysam.libchtslib import * -from pysam.cutils import * -import pysam.cutils as cutils -import pysam.cfaidx as cfaidx -from pysam.cfaidx import * -import pysam.ctabix as ctabix -from pysam.ctabix import * -import pysam.csamfile as csamfile -from pysam.csamfile import * -import pysam.calignmentfile as calignmentfile -from pysam.calignmentfile import * -import pysam.calignedsegment as calignedsegment -from pysam.calignedsegment import * -import pysam.cvcf as cvcf -from pysam.cvcf import * -import pysam.cbcf as cbcf -from pysam.cbcf import * +from pysam.libcutils import * +import pysam.libcutils as libcutils +import pysam.libcfaidx as libcfaidx +from pysam.libcfaidx import * +import pysam.libctabix as libctabix +from pysam.libctabix import * +import pysam.libcsamfile as libcsamfile +from pysam.libcsamfile import * +import pysam.libcalignmentfile as libcalignmentfile +from pysam.libcalignmentfile import * +import pysam.libcalignedsegment as libcalignedsegment +from pysam.libcalignedsegment import * +import pysam.libcvcf as libcvcf +from pysam.libcvcf import * +import pysam.libcbcf as libcbcf +from pysam.libcbcf import * +import pysam.libcbgzf as libcbgzf +from pysam.libcbgzf import * from pysam.utils import SamtoolsError import pysam.Pileup as Pileup from pysam.samtools import * @@ -28,14 +30,15 @@ import pysam.config # export all the symbols from separate modules __all__ = \ libchtslib.__all__ +\ - cutils.__all__ +\ - ctabix.__all__ +\ - cvcf.__all__ +\ - cbcf.__all__ +\ - cfaidx.__all__ +\ - calignmentfile.__all__ +\ - calignedsegment.__all__ +\ - csamfile.__all__ +\ + libcutils.__all__ +\ + libctabix.__all__ +\ + libcvcf.__all__ +\ + libcbcf.__all__ +\ + libcbgzf.__all__ +\ + libcfaidx.__all__ +\ + libcalignmentfile.__all__ +\ + libcalignedsegment.__all__ +\ + libcsamfile.__all__ +\ ["SamtoolsError"] +\ ["Pileup"] @@ -75,25 +78,17 @@ def get_defines(): def get_libraries(): '''return a list of libraries to link against.''' - # Note that this list does not include csamtools.so as there are + # Note that this list does not include libcsamtools.so as there are # numerous name conflicts with libchtslib.so. dirname = os.path.abspath(os.path.join(os.path.dirname(__file__))) - pysam_libs = ['ctabixproxies', - 'cfaidx', - 'csamfile', - 'cvcf', - 'cbcf', - 'ctabix'] + pysam_libs = ['libctabixproxies', + 'libcfaidx', + 'libcsamfile', + 'libcvcf', + 'libcbcf', + 'libctabix'] if pysam.config.HTSLIB == "builtin": pysam_libs.append('libchtslib') - if sys.version_info.major >= 3: - if sys.version_info.minor >= 5: - return [os.path.join(dirname, x + ".{}.so".format( - sysconfig.get_config_var('SOABI'))) for x in pysam_libs] - else: - return [os.path.join(dirname, x + ".{}{}.so".format( - sys.implementation.cache_tag, - sys.abiflags)) for x in pysam_libs] - else: - return [os.path.join(dirname, x + ".so") for x in pysam_libs] + so = sysconfig.get_config_var('SO') + return [os.path.join(dirname, x + so) for x in pysam_libs] diff --git a/pysam/chtslib.pyx b/pysam/chtslib.pyx deleted file mode 100644 index eab229f..0000000 --- a/pysam/chtslib.pyx +++ /dev/null @@ -1,19 +0,0 @@ -# cython: embedsignature=True -# cython: profile=True -# adds doc-strings for sphinx -from pysam.chtslib cimport * - -cpdef set_verbosity(int verbosity): - u"""Set htslib's hts_verbose global variable to the specified value. - """ - return hts_set_verbosity(verbosity) - -cpdef get_verbosity(): - u"""Return the value of htslib's hts_verbose global variable. - """ - return hts_get_verbosity() - -__all__ = [ - "get_verbosity", - "set_verbosity"] - diff --git a/pysam/calignedsegment.pxd b/pysam/libcalignedsegment.pxd similarity index 97% rename from pysam/calignedsegment.pxd rename to pysam/libcalignedsegment.pxd index 0880bef..f1d59d1 100644 --- a/pysam/calignedsegment.pxd +++ b/pysam/libcalignedsegment.pxd @@ -1,4 +1,4 @@ -from pysam.chtslib cimport * +from pysam.libchtslib cimport * cdef extern from "htslib_util.h": @@ -32,7 +32,7 @@ cdef extern from "htslib_util.h": void pysam_update_flag(bam1_t * b, uint16_t v, uint16_t flag) -from pysam.calignmentfile cimport AlignmentFile +from pysam.libcalignmentfile cimport AlignmentFile ctypedef AlignmentFile AlignmentFile_t diff --git a/pysam/calignedsegment.pyx b/pysam/libcalignedsegment.pyx similarity index 97% rename from pysam/calignedsegment.pyx rename to pysam/libcalignedsegment.pyx index f4e0750..c95bb13 100644 --- a/pysam/calignedsegment.pyx +++ b/pysam/libcalignedsegment.pyx @@ -65,9 +65,9 @@ from cpython cimport PyErr_SetString, PyBytes_FromStringAndSize from libc.string cimport strchr from cpython cimport array as c_array -from pysam.cutils cimport force_bytes, force_str, \ +from pysam.libcutils cimport force_bytes, force_str, \ charptr_to_str, charptr_to_bytes -from pysam.cutils cimport qualities_to_qualitystring, qualitystring_to_array, \ +from pysam.libcutils cimport qualities_to_qualitystring, qualitystring_to_array, \ array_to_qualitystring # Constants for binary tag conversion @@ -87,6 +87,12 @@ else: CIGAR_REGEX = re.compile("(\d+)([MIDNSHP=XB])") +##################################################################### +# C multiplication with wrapping around +cdef inline uint32_t c_mul(uint32_t a, uint32_t b): + return (a * b) & 0xffffffff + + ##################################################################### # typecode guessing cdef inline char map_typecode_htslib_to_python(uint8_t s): @@ -230,7 +236,7 @@ cdef inline packTags(tags): to be used in a call to struct.pack_into. """ fmts, args = ["<"], [] - + cdef char array_typecode datatype2format = { @@ -284,7 +290,7 @@ cdef inline packTags(tags): .format(value.typecode)) valuetype = force_bytes(chr(array_typecode)) - + if valuetype not in datatype2format: raise ValueError("invalid value type '%s' (%s)" % (valuetype, type(valuetype))) @@ -337,9 +343,12 @@ cdef inline int32_t calculateQueryLength(bam1_t * src): for k from 0 <= k < pysam_get_n_cigar(src): op = cigar_p[k] & BAM_CIGAR_MASK - if op == BAM_CMATCH or op == BAM_CINS or \ + if op == BAM_CMATCH or \ + op == BAM_CINS or \ op == BAM_CSOFT_CLIP or \ - op == BAM_CEQUAL or op == BAM_CDIFF: + op == BAM_CHARD_CLIP or \ + op == BAM_CEQUAL or \ + op == BAM_CDIFF: qpos += cigar_p[k] >> BAM_CIGAR_SHIFT return qpos @@ -513,7 +522,7 @@ cdef inline bytes build_alignment_sequence(bam1_t * src): Positions corresponding to `N` (skipped region from the reference) in the CIGAR string will not appear in the returned sequence. The MD should correspondingly not contain these. Thus proper tags are:: - + Deletion from the reference: cigar=5M1D5M MD=5^C5 Skipped region from reference: cigar=5M1N5M MD=10 @@ -555,7 +564,7 @@ cdef inline bytes build_alignment_sequence(bam1_t * src): l = cigar_p[k] >> BAM_CIGAR_SHIFT if op == BAM_CMATCH or op == BAM_CEQUAL or op == BAM_CDIFF: for i from 0 <= i < l: - s[s_idx] = read_sequence[r_idx] + s[s_idx] = read_sequence[r_idx] r_idx += 1 s_idx += 1 elif op == BAM_CDEL: @@ -579,7 +588,7 @@ cdef inline bytes build_alignment_sequence(bam1_t * src): "Padding (BAM_CPAD, 6) is currently not supported. " "Please implement. Sorry about that.") - cdef uint8_t * md_tag_ptr = bam_aux_get(src, "MD") + cdef uint8_t * md_tag_ptr = bam_aux_get(src, "MD") if md_tag_ptr == NULL: seq = PyBytes_FromStringAndSize(s, s_idx) free(s) @@ -665,6 +674,13 @@ cdef class AlignedSegment: self._delegate.data = calloc( self._delegate.m_data, 1) self._delegate.l_data = 0 + # set some data to make read approximately legit. + # Note, SAM writing fails with q_name of length 0 + self._delegate.core.l_qname = 0 + self._delegate.core.tid = -1 + self._delegate.core.pos = -1 + self._delegate.core.mtid = -1 + self._delegate.core.mpos = -1 # caching for selected fields self.cache_query_qualities = None @@ -751,18 +767,17 @@ cdef class AlignedSegment: return NotImplemented def __hash__(self): - cdef bam1_t * src - src = self._delegate - # shift and xor values in the core structure - # make sure tid and mtid are shifted by different amounts - # should variable length data be included? - cdef uint32_t hash_value = src.core.tid << 24 ^ \ - src.core.pos << 16 ^ \ - src.core.qual << 8 ^ \ - src.core.flag ^ \ - src.core.isize << 24 ^ \ - src.core.mtid << 16 ^ \ - src.core.mpos << 8 + cdef bam1_t * src = self._delegate + cdef int x + + # see http://effbot.org/zone/python-hash.htm + cdef uint8_t * c = &src.core + cdef uint32_t hash_value = c[0] + for x from 1 <= x < sizeof(bam1_core_t): + hash_value = c_mul(hash_value, 1000003) ^ c[x] + c = src.data + for x from 0 <= x < src.l_data: + hash_value = c_mul(hash_value, 1000003) ^ c[x] return hash_value @@ -775,8 +790,13 @@ cdef class AlignedSegment: ---------- htsfile -- AlignmentFile object to map numerical - identifers to chromosome names. + identifiers to chromosome names. """ + cdef int n_targets = htsfile.header.n_targets + + if self._delegate.core.tid >= n_targets \ + or self._delegate.core.mtid >= n_targets: + raise ValueError('htsfile does not match aligned segment') cdef kstring_t line line.l = line.m = 0 @@ -788,7 +808,7 @@ cdef class AlignedSegment: raise ValueError('sam_format failed') ret = force_str(line.s[:line.l]) - + if line.m: free(line.s) @@ -808,6 +828,11 @@ cdef class AlignedSegment: def __set__(self, qname): if qname is None or len(qname) == 0: return + + if len(qname) >= 255: + raise ValueError("query length out of range {} > 254".format( + len(qname))) + qname = force_bytes(qname) cdef bam1_t * src cdef int l @@ -823,7 +848,6 @@ cdef class AlignedSegment: l, p) - pysam_set_l_qname(src, l) # re-acquire pointer to location in memory @@ -955,7 +979,7 @@ cdef class AlignedSegment: in the BAM/SAM file. The length of a query is 0 if there is no sequence in the BAM/SAM file. In those cases, the read length can be inferred from the CIGAR alignment, see - :meth:`pysam.AlignmentFile.infer_query_length.`. + :meth:`pysam.AlignedSegment.infer_query_length`. The length includes soft-clipped bases and is equal to ``len(query_sequence)``. @@ -1669,7 +1693,7 @@ cdef class AlignedSegment: each cigar operation. """ - + cdef int nfields = NCIGAR_CODES + 1 cdef c_array.array base_counts = array.array( @@ -2288,7 +2312,7 @@ cdef class AlignedSegment: cdef class PileupColumn: - '''A pileup of reads at a particular reference sequence postion + '''A pileup of reads at a particular reference sequence position (:term:`column`). A pileup column contains all the reads that map to a certain target base. @@ -2416,11 +2440,11 @@ cdef class PileupRead: return self._qpos property indel: - """indel length for the position follwing the current pileup site. + """indel length for the position following the current pileup site. This quantity peeks ahead to the next cigar operation in this - alignment. If the next operation is and insertion, indel will - be positve. If the next operation is a deletion, it will be + alignment. If the next operation is an insertion, indel will + be positive. If the next operation is a deletion, it will be negation. 0 if the next operation is not an indel. """ @@ -2428,7 +2452,8 @@ cdef class PileupRead: return self._indel property level: - """the level of the read in the "viewer" mode""" + """the level of the read in the "viewer" mode. Note that this value + is currently not computed.""" def __get__(self): return self._level @@ -2448,6 +2473,7 @@ cdef class PileupRead: return self._is_tail property is_refskip: + """1 iff the base on the padded read is part of CIGAR N op.""" def __get__(self): return self._is_refskip diff --git a/pysam/calignmentfile.pxd b/pysam/libcalignmentfile.pxd similarity index 83% rename from pysam/calignmentfile.pxd rename to pysam/libcalignmentfile.pxd index 3384e7e..6f32f47 100644 --- a/pysam/calignmentfile.pxd +++ b/pysam/libcalignmentfile.pxd @@ -4,9 +4,9 @@ from libc.stdlib cimport malloc, calloc, realloc, free from libc.string cimport memcpy, memcmp, strncpy, strlen, strdup from libc.stdio cimport FILE, printf -from pysam.cfaidx cimport faidx_t, Fastafile -from pysam.calignedsegment cimport AlignedSegment -from pysam.chtslib cimport * +from pysam.libcfaidx cimport faidx_t, Fastafile +from pysam.libcalignedsegment cimport AlignedSegment +from pysam.libchtslib cimport * from cpython cimport array cimport cython @@ -36,33 +36,16 @@ ctypedef struct __iterdata: int seq_len -cdef class AlignmentFile: - - cdef object _filename - cdef object _reference_filename - - # pointer to htsFile structure - cdef htsFile * htsfile +cdef class AlignmentFile(HTSFile): + cdef readonly object reference_filename # pointer to index cdef hts_idx_t *index # header structure cdef bam_hdr_t * header - # true if file is bam format - cdef readonly bint is_bam - # true if file is bam format - cdef readonly bint is_cram - # true if not a file but a stream - cdef readonly bint is_stream - # true if file is not on the local filesystem - cdef readonly bint is_remote + # current read within iteration cdef bam1_t * b - # file opening mode - cdef char * mode - - # beginning of read section - cdef int64_t start_offset cdef bam1_t * getCurrent(self) cdef int cnext(self) @@ -70,12 +53,14 @@ cdef class AlignmentFile: # write an aligned read cpdef int write(self, AlignedSegment read) except -1 + cdef class PileupColumn: cdef bam_pileup1_t ** plp cdef int tid cdef int pos cdef int n_pu + cdef class PileupRead: cdef AlignedSegment _alignment cdef int32_t _qpos @@ -86,6 +71,7 @@ cdef class PileupRead: cdef uint32_t _is_tail cdef uint32_t _is_refskip + cdef class IteratorRow: cdef int retval cdef bam1_t * b @@ -94,6 +80,7 @@ cdef class IteratorRow: cdef bam_hdr_t * header cdef int owns_samfile + cdef class IteratorRowRegion(IteratorRow): cdef hts_itr_t * iter cdef bam1_t * getCurrent(self) @@ -109,16 +96,19 @@ cdef class IteratorRowAll(IteratorRow): cdef bam1_t * getCurrent(self) cdef int cnext(self) + cdef class IteratorRowAllRefs(IteratorRow): cdef int tid cdef IteratorRowRegion rowiter + cdef class IteratorRowSelection(IteratorRow): cdef int current_pos cdef positions cdef bam1_t * getCurrent(self) cdef int cnext(self) + cdef class IteratorColumn: # result of the last plbuf_push @@ -147,18 +137,20 @@ cdef class IteratorColumn: cdef reset(self, tid, start, end) cdef _free_pileup_iter(self) + cdef class IteratorColumnRegion(IteratorColumn): cdef int start cdef int end cdef int truncate + cdef class IteratorColumnAllRefs(IteratorColumn): pass + cdef class IndexedReads: cdef AlignmentFile samfile cdef htsFile * htsfile cdef index cdef int owns_samfile cdef bam_hdr_t * header - diff --git a/pysam/calignmentfile.pyx b/pysam/libcalignmentfile.pyx similarity index 92% rename from pysam/calignmentfile.pyx rename to pysam/libcalignmentfile.pyx index ed5e584..2161f87 100644 --- a/pysam/calignmentfile.pyx +++ b/pysam/libcalignmentfile.pyx @@ -58,13 +58,15 @@ import re import warnings import array +from libc.errno cimport errno, EPIPE +from libc.string cimport strcmp, strpbrk, strerror from cpython cimport array as c_array from cpython.version cimport PY_MAJOR_VERSION -from pysam.cutils cimport force_bytes, force_str, charptr_to_str -from pysam.cutils cimport encode_filename, from_string_and_size -from pysam.calignedsegment cimport makeAlignedSegment, makePileupColumn -from pysam.chtslib cimport hisremote +from pysam.libcutils cimport force_bytes, force_str, charptr_to_str +from pysam.libcutils cimport encode_filename, from_string_and_size +from pysam.libcalignedsegment cimport makeAlignedSegment, makePileupColumn +from pysam.libchtslib cimport HTSFile, hisremote if PY_MAJOR_VERSION >= 3: from io import StringIO @@ -97,7 +99,8 @@ VALID_HEADERS = ("HD", "SQ", "RG", "PG", "CO") # default type conversions within SAM header records KNOWN_HEADER_FIELDS = {"HD" : {"VN" : str, "SO" : str, "GO" : str}, "SQ" : {"SN" : str, "LN" : int, "AS" : str, - "M5" : str, "SP" : str, "UR" : str,}, + "M5" : str, "SP" : str, "UR" : str, + "AH" : str,}, "RG" : {"ID" : str, "CN" : str, "DS" : str, "DT" : str, "FO" : str, "KS" : str, "LB" : str, "PG" : str, "PI" : str, @@ -110,7 +113,7 @@ KNOWN_HEADER_FIELDS = {"HD" : {"VN" : str, "SO" : str, "GO" : str}, # the end as parsing a CL will ignore any subsequent records. VALID_HEADER_ORDER = {"HD" : ("VN", "SO", "GO"), "SQ" : ("SN", "LN", "AS", "M5", - "UR", "SP"), + "UR", "SP", "AH"), "RG" : ("ID", "CN", "SM", "LB", "PU", "PI", "DT", "DS", "PL", "FO", "KS", "PG", @@ -216,11 +219,11 @@ cdef bam_hdr_t * build_header(new_header): return dest -cdef class AlignmentFile: +cdef class AlignmentFile(HTSFile): """AlignmentFile(filepath_or_object, mode=None, template=None, reference_names=None, reference_lengths=None, text=NULL, header=None, add_sq_text=False, check_header=True, check_sq=True, - reference_filename=None, filename=None) + reference_filename=None, filename=None, duplicate_filehandle=True) A :term:`SAM`/:term:`BAM` formatted file. @@ -323,16 +326,23 @@ cdef class AlignmentFile: Alternative to filepath_or_object. Filename of the file to be opened. + duplicate_filehandle: bool + By default, file handles passed either directly or through + File-like objects will be duplicated before passing them to + htslib. The duplication prevents issues where the same stream + will be closed by htslib and through destruction of the + high-level python object. Set to False to turn off + duplication. + """ def __cinit__(self, *args, **kwargs): - self.htsfile = NULL - self._filename = None - self.is_bam = False + self.filename = None + self.mode = None self.is_stream = False - self.is_cram = False self.is_remote = False + self.index = NULL if "filename" in kwargs: args = [kwargs["filename"]] @@ -343,10 +353,6 @@ cdef class AlignmentFile: # allocate memory for iterator self.b = calloc(1, sizeof(bam1_t)) - def is_open(self): - '''return true if htsfile has been opened.''' - return self.htsfile != NULL - def has_index(self): """return true if htsfile has an existing (and opened) index. """ @@ -365,7 +371,7 @@ cdef class AlignmentFile: if htsfile is closed or index could not be opened. """ - if not self.is_open(): + if not self.is_open: raise ValueError("I/O operation on closed file") if not self.is_bam and not self.is_cram: raise AttributeError( @@ -391,16 +397,17 @@ cdef class AlignmentFile: check_sq=True, filepath_index=None, referencenames=None, - referencelengths=None): + referencelengths=None, + duplicate_filehandle=True): '''open a sam, bam or cram formatted file. If _open is called on an existing file, the current file will be closed and a new file will be opened. ''' - cdef char *cfilename - cdef char *creference_filename - cdef char *cindexname - cdef char *cmode + cdef char *cfilename = NULL + cdef char *creference_filename = NULL + cdef char *cindexname = NULL + cdef char *cmode = NULL # for backwards compatibility: if referencenames is not None: @@ -408,6 +415,10 @@ cdef class AlignmentFile: if referencelengths is not None: reference_lengths = referencelengths + # close a previously opened file + if self.is_open: + self.close() + # autodetection for read if mode is None: mode = "r" @@ -417,38 +428,46 @@ cdef class AlignmentFile: "rc", "wc"), \ "invalid file opening mode `%s`" % mode - # close a previously opened file - if self.htsfile != NULL: - self.close() + self.duplicate_filehandle = duplicate_filehandle # StringIO not supported if isinstance(filepath_or_object, StringIO): - filename = "stringio" raise NotImplementedError( "access from StringIO objects not supported") - if filepath_or_object.closed: - raise ValueError('I/O operation on closed StringIO object') - # check if we are working with a File object + # reading from a file descriptor + elif isinstance(filepath_or_object, int): + self.filename = filepath_or_object + filename = None + self.is_remote = False + self.is_stream = True + # reading from a File object or other object with fileno elif hasattr(filepath_or_object, "fileno"): - filename = filepath_or_object.name if filepath_or_object.closed: raise ValueError('I/O operation on closed file') + self.filename = filepath_or_object + # .name can be TextIOWrapper + try: + filename = encode_filename(str(filepath_or_object.name)) + cfilename = filename + except AttributeError: + filename = None + self.is_remote = False + self.is_stream = True + # what remains is a filename else: - filename = filepath_or_object + self.filename = filename = encode_filename(filepath_or_object) + cfilename = filename + self.is_remote = hisremote(cfilename) + self.is_stream = self.filename == b'-' # for htslib, wbu seems to not work if mode == "wbu": mode = "wb0" - cdef bytes bmode = mode.encode('ascii') - self._filename = filename = encode_filename(filename) - self._reference_filename = reference_filename = encode_filename( + self.mode = force_bytes(mode) + self.reference_filename = reference_filename = encode_filename( reference_filename) - # FIXME: Use htsFormat when it is available - self.is_stream = filename == b"-" - self.is_remote = hisremote(filename) - cdef char * ctext cdef hFILE * fp ctext = NULL @@ -477,10 +496,8 @@ cdef class AlignmentFile: n = 0 for x in reference_names: n += len(x) + 1 - self.header.target_name = calloc( - n, sizeof(char*)) - self.header.target_len = calloc( - n, sizeof(uint32_t)) + self.header.target_name = calloc(n, sizeof(char*)) + self.header.target_len = calloc(n, sizeof(uint32_t)) for x from 0 <= x < self.header.n_targets: self.header.target_len[x] = reference_lengths[x] name = reference_names[x] @@ -507,57 +524,34 @@ cdef class AlignmentFile: strlen(ctext), sizeof(char)) memcpy(self.header.text, ctext, strlen(ctext)) - # open file (hts_open is synonym with sam_open) - cfilename, cmode = filename, bmode - if hasattr(filepath_or_object, "fileno"): - fp = hdopen(filepath_or_object.fileno(), cmode) - with nogil: - self.htsfile = hts_hopen(fp, cfilename, cmode) - else: - with nogil: - self.htsfile = hts_open(cfilename, cmode) - - # htsfile.format does not get set until writing, so use - # the format specifier explicitely given by the user. - self.is_bam = "b" in mode - self.is_cram = "c" in mode + self.htsfile = self._open_htsfile() # set filename with reference sequences. If no filename # is given, the CRAM reference arrays will be built from # the @SQ header in the header - if self.is_cram and reference_filename: + if "c" in mode and reference_filename: # note that fn_aux takes ownership, so create a copy - self.htsfile.fn_aux = strdup(self._reference_filename) + self.htsfile.fn_aux = strdup(self.reference_filename) # write header to htsfile - if self.is_bam or self.is_cram or "h" in mode: + if "b" in mode or "c" in mode or "h" in mode: with nogil: sam_hdr_write(self.htsfile, self.header) elif mode[0] == "r": # open file for reading - if (filename != b"-" - and not self.is_remote - and not os.path.exists(filename)): - raise IOError("file `%s` not found" % filename) - - # open file (hts_open is synonym with sam_open) - cfilename, cmode = filename, bmode - if hasattr(filepath_or_object, "fileno"): - fp = hdopen(filepath_or_object.fileno(), cmode) - with nogil: - self.htsfile = hts_hopen(fp, cfilename, cmode) - else: - with nogil: - self.htsfile = hts_open(cfilename, cmode) + if not self._exists(): + raise IOError("file `%s` not found" % self.filename) + + self.htsfile = self._open_htsfile() if self.htsfile == NULL: raise ValueError( "could not open file (mode='%s') - " "is it SAM/BAM format?" % mode) - self.is_bam = self.htsfile.format.format == bam - self.is_cram = self.htsfile.format.format == cram + if self.htsfile.format.category != sequence_data: + raise ValueError("file does not contain alignment data") # bam files require a valid header if self.is_bam or self.is_cram: @@ -581,7 +575,7 @@ cdef class AlignmentFile: # set filename with reference sequences if self.is_cram and reference_filename: - creference_filename = self._reference_filename + creference_filename = self.reference_filename hts_set_opt(self.htsfile, CRAM_OPT_REFERENCE, creference_filename) @@ -592,8 +586,7 @@ cdef class AlignmentFile: "is it SAM/BAM format? Consider opening with " "check_sq=False") % mode) - if self.htsfile == NULL: - raise IOError("could not open file `%s`" % filename ) + assert self.htsfile != NULL # check for index and open if present cdef int format_index = -1 @@ -603,11 +596,8 @@ cdef class AlignmentFile: format_index = HTS_FMT_CRAI if mode[0] == "r" and (self.is_bam or self.is_cram): - # open index for remote files if self.is_remote and not filepath_index: - cfilename = filename - with nogil: self.index = hts_idx_load(cfilename, format_index) if self.index == NULL: @@ -615,17 +605,18 @@ cdef class AlignmentFile: "unable to open remote index for '%s'" % cfilename) else: has_index = True - cfilename = filename if filepath_index: if not os.path.exists(filepath_index): warnings.warn( "unable to open index at %s" % cfilename) self.index = NULL has_index = False - else: + elif filename is not None: if self.is_bam \ and not os.path.exists(filename + b".bai") \ - and not os.path.exists(filename[:-4] + b".bai"): + and not os.path.exists(filename[:-4] + b".bai") \ + and not os.path.exists(filename + b".csi") \ + and not os.path.exists(filename[:-4] + b".csi"): self.index = NULL has_index = False elif self.is_cram \ @@ -633,6 +624,9 @@ cdef class AlignmentFile: and not os.path.exists(filename[:-5] + b".crai"): self.index = NULL has_index = False + else: + self.index = NULL + has_index = False if has_index: # returns NULL if there is no index or index could @@ -643,7 +637,6 @@ cdef class AlignmentFile: self.index = sam_index_load2(self.htsfile, cfilename, cindexname) - else: with nogil: self.index = sam_index_load(self.htsfile, @@ -664,7 +657,7 @@ cdef class AlignmentFile: returns -1 if reference is not known. """ - if not self.is_open(): + if not self.is_open: raise ValueError("I/O operation on closed file") reference = force_bytes(reference) return bam_name2id(self.header, reference) @@ -673,78 +666,13 @@ cdef class AlignmentFile: """ return :term:`reference` name corresponding to numerical :term:`tid` """ - if not self.is_open(): + if not self.is_open: raise ValueError("I/O operation on closed file") if not 0 <= tid < self.header.n_targets: raise ValueError("reference_id %i out of range 0<=tid<%i" % (tid, self.header.n_targets)) return charptr_to_str(self.header.target_name[tid]) - def reset(self): - """reset file position to beginning of file just after - the header. - - Returns - ------- - - The file position after moving the file pointer. - - """ - return self.seek(self.start_offset, 0) - - def seek(self, uint64_t offset, int where=0): - """move file pointer to position `offset`, see - :meth:`pysam.AlignmentFile.tell`. - - Parameters - ---------- - - offset : int - - position of the read/write pointer within the file. - - where : int - - optional and defaults to 0 which means absolute file - positioning, other values are 1 which means seek relative to - the current position and 2 means seek relative to the file's - end. - - Returns - ------- - - the file position after moving the file pointer - - """ - - if not self.is_open(): - raise ValueError("I/O operation on closed file") - if not self.is_bam: - raise NotImplementedError( - "seek only available in bam files") - if self.is_stream: - raise OSError("seek no available in streams") - - cdef uint64_t pos - with nogil: - pos = bgzf_seek(hts_get_bgzfp(self.htsfile), offset, where) - return pos - - def tell(self): - """ - return current file position. - """ - if not self.is_open(): - raise ValueError("I/O operation on closed file") - if not (self.is_bam or self.is_cram): - raise NotImplementedError( - "seek only available in bam files") - - cdef uint64_t pos - with nogil: - pos = bgzf_tell(hts_get_bgzfp(self.htsfile)) - return pos - def parse_region(self, reference=None, start=None, @@ -891,7 +819,7 @@ cdef class AlignmentFile: """ cdef int rtid, rstart, rend, has_coord - if not self.is_open(): + if not self.is_open: raise ValueError( "I/O operation on closed file" ) has_coord, rtid, rstart, rend = self.parse_region( @@ -1055,7 +983,7 @@ cdef class AlignmentFile: ---------- stepper : string - The stepper controlls how the iterator advances. + The stepper controls how the iterator advances. Possible options for the stepper are ``all`` @@ -1092,7 +1020,7 @@ cdef class AlignmentFile: """ cdef int rtid, rstart, rend, has_coord - if not self.is_open(): + if not self.is_open: raise ValueError("I/O operation on closed file") has_coord, rtid, rstart, rend = self.parse_region( @@ -1177,8 +1105,8 @@ cdef class AlignmentFile: cdef AlignedSegment read cdef long counter = 0 - if not self.is_open(): - raise ValueError( "I/O operation on closed file" ) + if not self.is_open: + raise ValueError("I/O operation on closed file") cdef int filter_method = 0 if read_callback == "all": @@ -1329,13 +1257,48 @@ cdef class AlignmentFile: return count_a, count_c, count_g, count_t + def find_introns(self, read_iterator): + """Return a dictionary {(start, stop): count} + Listing the intronic sites in the reads (identified by 'N' in the cigar strings), + and their support ( = number of reads ). + + read_iterator can be the result of a .fetch(...) call. + Or it can be a generator filtering such reads. Example + samfile.find_introns((read for read in samfile.fetch(...) if read.is_reverse) + """ + import collections + res = collections.Counter() + for r in read_iterator: + if 'N' in r.cigarstring: + last_read_pos = False + for read_loc, genome_loc in r.get_aligned_pairs(): + if read_loc is None and last_read_pos: + start = genome_loc + elif read_loc and last_read_pos is None: + stop = genome_loc # we are right exclusive ,so this is correct + res[(start, stop)] += 1 + del start + del stop + last_read_pos = read_loc + return res + def close(self): ''' closes the :class:`pysam.AlignmentFile`.''' - if self.htsfile != NULL: - hts_close(self.htsfile) - hts_idx_destroy(self.index); - self.htsfile = NULL + + if self.htsfile == NULL: + return + + cdef int ret = hts_close(self.htsfile) + hts_idx_destroy(self.index) + self.htsfile = NULL + + if ret < 0: + global errno + if errno == EPIPE: + errno = 0 + else: + raise OSError(errno, force_str(strerror(errno))) def __dealloc__(self): # remember: dealloc cannot call other methods @@ -1349,14 +1312,24 @@ cdef class AlignmentFile: # AH: I have removed the call to close. Even though it is working, # it seems to be dangerous according to the documentation as the # object be partially deconstructed already. + cdef int ret = 0 + if self.htsfile != NULL: - hts_close(self.htsfile) + ret = hts_close(self.htsfile) hts_idx_destroy(self.index); self.htsfile = NULL bam_destroy1(self.b) if self.header != NULL: bam_hdr_destroy(self.header) + + + if ret < 0: + global errno + if errno == EPIPE: + errno = 0 + else: + raise OSError(errno, force_str(strerror(errno))) cpdef int write(self, AlignedSegment read) except -1: ''' @@ -1373,7 +1346,7 @@ cdef class AlignmentFile: int : the number of bytes written. If the file is closed, this will be 0. ''' - if not self.is_open(): + if not self.is_open: return 0 cdef int ret @@ -1387,7 +1360,8 @@ cdef class AlignmentFile: # when ret == -1 we get a "SystemError: error return without # exception set". if ret < 0: - raise ValueError('sam write failed') + raise IOError( + "sam_write1 failed with error code {}".format(ret)) return ret @@ -1404,23 +1378,11 @@ cdef class AlignmentFile: ############################################################### ## properties ############################################################### - property closed: - """bool indicating the current state of the file object. - This is a read-only attribute; the close() method changes the value. - """ - def __get__(self): - return not self.is_open() - - property filename: - """filename associated with this object. This is a read-only attribute.""" - def __get__(self): - return self._filename - property nreferences: """"int with the number of :term:`reference` sequences in the file. This is a read-only attribute.""" def __get__(self): - if not self.is_open(): + if not self.is_open: raise ValueError("I/O operation on closed file") return self.header.n_targets @@ -1428,7 +1390,7 @@ cdef class AlignmentFile: """tuple with the names of :term:`reference` sequences. This is a read-only attribute""" def __get__(self): - if not self.is_open(): raise ValueError( "I/O operation on closed file" ) + if not self.is_open: raise ValueError( "I/O operation on closed file" ) t = [] for x from 0 <= x < self.header.n_targets: t.append(charptr_to_str(self.header.target_name[x])) @@ -1441,7 +1403,7 @@ cdef class AlignmentFile: """ def __get__(self): - if not self.is_open(): + if not self.is_open: raise ValueError("I/O operation on closed file") t = [] for x from 0 <= x < self.header.n_targets: @@ -1491,13 +1453,6 @@ cdef class AlignmentFile: n = hts_idx_get_n_no_coor(self.index) return n - property format: - '''string describing the file format''' - def __get__(self): - if not self.is_open(): - raise ValueError( "I/O operation on closed file" ) - return hts_format_description(&self.htsfile.format) - property text: '''string with the full contents of the :term:`sam file` header as a string. @@ -1508,7 +1463,7 @@ cdef class AlignmentFile: representation of the header. ''' def __get__(self): - if not self.is_open(): + if not self.is_open: raise ValueError( "I/O operation on closed file" ) return from_string_and_size(self.header.text, self.header.l_text) @@ -1535,7 +1490,7 @@ cdef class AlignmentFile: """ def __get__(self): - if not self.is_open(): + if not self.is_open: raise ValueError( "I/O operation on closed file" ) result = {} @@ -1613,7 +1568,7 @@ cdef class AlignmentFile: ## and multiple_iterators) ## Possible solutions: deprecate or open new file handle def __iter__(self): - if not self.is_open(): + if not self.is_open: raise ValueError("I/O operation on closed file") if not self.is_bam and self.header.n_targets == 0: @@ -1683,7 +1638,7 @@ cdef class IteratorRow: cdef char *cfilename cdef char *creference_filename - if not samfile.is_open(): + if not samfile.is_open: raise ValueError("I/O operation on closed file") # makes sure that samfile stays alive as long as the @@ -1693,7 +1648,7 @@ cdef class IteratorRow: # reopen the file - note that this makes the iterator # slow and causes pileup to slow down significantly. if multiple_iterators: - cfilename = samfile._filename + cfilename = samfile.filename with nogil: self.htsfile = hts_open(cfilename, 'r') assert self.htsfile != NULL @@ -1704,8 +1659,8 @@ cdef class IteratorRow: assert self.header != NULL self.owns_samfile = True # options specific to CRAM files - if samfile.is_cram and samfile._reference_filename: - creference_filename = samfile._reference_filename + if samfile.is_cram and samfile.reference_filename: + creference_filename = samfile.reference_filename hts_set_opt(self.htsfile, CRAM_OPT_REFERENCE, creference_filename) @@ -2462,7 +2417,7 @@ cdef class IndexedReads: # multiple_iterators the file - note that this makes the iterator # slow and causes pileup to slow down significantly. if multiple_iterators: - cfilename = samfile._filename + cfilename = samfile.filename with nogil: self.htsfile = hts_open(cfilename, 'r') assert self.htsfile != NULL diff --git a/pysam/cbcf.pxd b/pysam/libcbcf.pxd similarity index 72% rename from pysam/cbcf.pxd rename to pysam/libcbcf.pxd index b6e210a..fc7f56c 100644 --- a/pysam/cbcf.pxd +++ b/pysam/libcbcf.pxd @@ -3,18 +3,9 @@ ## Cython wrapper for htslib VCF/BCF reader/writer ############################################################################### # -# NOTICE: This code is incomplete and preliminary. It is nearly complete as -# an immutable interface, but has no capability (yet) to mutate the -# resulting data (beyond dropping all samples). Documentation still -# needs to be written and a unit test suite is in the works. The -# code is also specific to Python 2 and will require a bit of work -# to properly adapt to Python 3. -# -############################################################################### -# # The MIT License # -# Copyright (c) 2015 Kevin Jacobs (jacobs@bioinformed.com) +# Copyright (c) 2015, 2016 Kevin Jacobs (jacobs@bioinformed.com) # # Permission is hereby granted, free of charge, to any person obtaining a # copy of this software and associated documentation files (the "Software"), @@ -39,14 +30,15 @@ from libc.stdint cimport int8_t, int16_t, int32_t, int64_t from libc.stdint cimport uint8_t, uint16_t, uint32_t, uint64_t from libc.stdlib cimport malloc, calloc, realloc, free -from libc.string cimport memcpy, memcmp, strncpy, strlen, strdup +from libc.string cimport memcpy, memcmp, memmove, strncpy, strlen, strdup -from pysam.chtslib cimport * +from pysam.libchtslib cimport * cdef class VariantHeader(object): cdef bcf_hdr_t *ptr + cpdef VariantRecord new_record(self) cdef _subset_samples(self, include_samples) @@ -137,23 +129,16 @@ cdef class TabixIterator(BaseIterator): cdef kstring_t line_buffer -cdef class VariantFile(object): - cdef htsFile *htsfile # pointer to htsFile structure - cdef int64_t start_offset # BGZF offset of first record - - cdef readonly object filename # filename as supplied by user - cdef readonly object mode # file opening mode - cdef readonly object index_filename # filename of index, if supplied by user - +cdef class VariantFile(HTSFile): cdef readonly VariantHeader header cdef readonly BaseIndex index cdef readonly bint drop_samples # true if sample information is to be ignored # FIXME: Temporary, use htsFormat when it is available - cdef readonly bint is_bcf # true if file is a bcf file - cdef readonly bint is_stream # true if not a seekable file but a stream - cdef readonly bint is_remote # true if file is not on the local filesystem - cdef readonly bint is_reading # true if file has begun reading records + cdef readonly bint is_reading # true if file has begun reading records + cdef readonly bint header_written # true if header has already been written + + cpdef VariantRecord new_record(self) cpdef int write(self, VariantRecord record) except -1 diff --git a/pysam/cbcf.pyx b/pysam/libcbcf.pyx similarity index 76% rename from pysam/cbcf.pyx rename to pysam/libcbcf.pyx index 41fd44f..8f40451 100644 --- a/pysam/cbcf.pyx +++ b/pysam/libcbcf.pyx @@ -7,8 +7,7 @@ # # NOTICE: This code is incomplete and preliminary. It offers a nearly # complete Pythonic interface to VCF/BCF metadata and data with -# reading and writing capability. It has limited capability to to -# mutate the resulting data. Documentation and a unit test suite +# reading and writing capability. Documentation and a unit test suite # are in the works. The code is best tested under Python 2, but # should also work with Python 3. Please report any remaining # str/bytes issues on the github site when using Python 3 and I'll @@ -43,126 +42,22 @@ # user 1m3.502s # sys 0m3.459s # -# Here is a quick tour through the API:: -# -# VariantFile(filename, mode=None, header=None, drop_samples=False) -# -# Attributes / Properties -# -# htsfile: htsFile* [private] -# start_offset: BGZF offset of first record [private] -# filename: filename [read only] -# mode: mode [read only] -# header: VariantHeader object [read only] -# index: TabixIndex, BCFIndex or None [read only] -# drop_samples: sample information is to be ignored [read only] -# -# is_stream: file is stdin/stdout [read only] -# is_remote: file is not on the local filesystem [read only] -# is_reading: file has begun reading records [read only] -# category: file format general category [read only] -# format: file format [read only] -# version: tuple of (major, minor) format version [read only] -# compression: file compression -# description: vaguely human readable description of [read only] -# file format. -# -# Methods: -# copy() -# close() -# open(filename, mode=None, header=None, drop_samples=False) -# reset() -# seek(offset) -# tell() -# fetch(contig=None, start=None, stop=None, region=None, reopen=False) -# subset_samples(include_samples) -# -# VariantHeader() -# -# version: VCF version -# samples: sequence-like access to samples -# records: sequence-like access to partially parsed headers -# contigs: mapping-like object for contig name -> VariantContig -# -# filters: mapping-like object for filter name -> VariantMetadata -# info: mapping-like object for info name -> VariantMetadata -# formats: mapping-like object for formats name -> VariantMetadata -# -# VariantRecord(...) -# -# header: VariantHeader object -# rid: reference id (i.e. tid) -# chrom: chromosome/contig string -# contig: synonym for chrom -# pos: 1-based start position (inclusive) -# start: 0-based start position (inclusive) -# stop: 0-based stop position (exclusive) -# rlen: reference length (stop - start) -# id: record identifier -# ref: reference allele -# alleles: alleles (ref followed by alts) -# alts: alt alleles -# qual: quality (float) -# filter: mapping-like object for filter name -> type info -# info: mapping-like object for info name -> value -# format: mapping-like object for format name -> type info -# samples: mapping-like object of sample genotypes & attrs -# -# VariantRecordSample(...) -# -# name: sample name -# index: sample index -# allele_indices: tuple of allele indices (ref=0, alt=1..len(alts), missing=-1) -# alleles: tuple of alleles (missing=None) -# -# VariantRecordSample is also a mapping object from formats to values -# -# VariantContig(...) -# -# id: reference id (i.e. tid) -# name: chromosome/contig string -# length: contig length if provided, else None -# header: defining VariantHeaderRecord -# -# VariantMetadata(...) # for FILTER, INFO and FORMAT metadata -# -# id: internal id -# name: metadata name -# type: value data type -# number: number of values -# header: defining VariantHeaderRecord -# -# VariantHeaderRecord(...) # replace with single tuple of key/value pairs? -# -# type: record type -# key: first record key -# value: first record value -# attrs: remaining key/value pairs -# ############################################################################### # -# TODO list for next major sprint: +# TODO list: # # * more genotype methods # * unit test suite (perhaps py.test based) # * documentation -# * htslib 1.2 format info -# -# For later sprints: -# -# * ability to create indices -# * mutable header and record data # * pickle support -# * Python 3 support # * left/right locus normalization -# * parallel iteration (like synced_bcf_reader) # * fix reopen to re-use fd # ############################################################################### # # The MIT License # -# Copyright (c) 2015 Kevin Jacobs (jacobs@bioinformed.com) +# Copyright (c) 2015,2016 Kevin Jacobs (jacobs@bioinformed.com) # # Permission is hereby granted, free of charge, to any person obtaining a # copy of this software and associated documentation files (the "Software"), @@ -189,7 +84,8 @@ from __future__ import division, print_function import os import sys -from libc.string cimport strcmp, strpbrk +from libc.errno cimport errno, EPIPE +from libc.string cimport strcmp, strpbrk, strerror from libc.stdint cimport INT8_MAX, INT16_MAX, INT32_MAX cimport cython @@ -202,7 +98,7 @@ from cpython.bytes cimport PyBytes_FromStringAndSize from cpython.unicode cimport PyUnicode_DecodeASCII from cpython.version cimport PY_MAJOR_VERSION -from pysam.chtslib cimport hisremote +from pysam.libchtslib cimport HTSFile, hisremote from warnings import warn @@ -223,18 +119,14 @@ cdef tuple VALUE_TYPES = ('Flag', 'Integer', 'Float', 'String') cdef tuple METADATA_TYPES = ('FILTER', 'INFO', 'FORMAT', 'CONTIG', 'STRUCTURED', 'GENERIC') cdef tuple METADATA_LENGTHS = ('FIXED', 'VARIABLE', 'A', 'G', 'R') -cdef tuple FORMAT_CATEGORIES = ('UNKNOWN', 'ALIGNMENTS', 'VARIANTS', 'INDEX', 'REGIONS') -cdef tuple FORMATS = ('UNKNOWN', 'BINARY_FORMAT', 'TEXT_FORMAT', 'SAM', 'BAM', 'BAI', 'CRAM', 'CRAI', - 'VCF', 'BCF', 'CSI', 'GZI', 'TBI', 'BED') -cdef tuple COMPRESSION = ('NONE', 'GZIP', 'BGZF', 'CUSTOM') ######################################################################## ######################################################################## ## Python 3 compatibility functions ######################################################################## -from pysam.cutils cimport force_bytes, force_str, charptr_to_str, charptr_to_str_w_len -from pysam.cutils cimport encode_filename, from_string_and_size +from pysam.libcutils cimport force_bytes, force_str, charptr_to_str, charptr_to_str_w_len +from pysam.libcutils cimport encode_filename, from_string_and_size ######################################################################## @@ -268,6 +160,10 @@ cdef inline bcf_str_cache_get_charptr(const char* s): ######################################################################## +cdef inline bint check_header_id(bcf_hdr_t *hdr, int hl_type, int id): + return id >= 0 and id < hdr.n[BCF_DT_ID] and bcf_hdr_idinfo_exists(hdr, hl_type, id) + + cdef inline int is_gt_fmt(bcf_hdr_t *hdr, int fmt_id): return strcmp(bcf_hdr_int2id(hdr, BCF_DT_ID, fmt_id), "GT") == 0 @@ -497,8 +393,15 @@ cdef bcf_copy_expand_array(void *src_data, int src_type, ssize_t src_values, cdef bcf_get_value_count(VariantRecord record, int hl_type, int id, ssize_t *count, int *scalar): + if record is None: + raise ValueError('record must not be None') + cdef bcf_hdr_t *hdr = record.header.ptr cdef bcf1_t *r = record.ptr + + if not check_header_id(hdr, hl_type, id): + raise ValueError('Invalid header') + cdef int length = bcf_hdr_id2length(hdr, hl_type, id) cdef int number = bcf_hdr_id2number(hdr, hl_type, id) @@ -523,6 +426,9 @@ cdef bcf_get_value_count(VariantRecord record, int hl_type, int id, ssize_t *cou cdef object bcf_info_get_value(VariantRecord record, const bcf_info_t *z): + if record is None: + raise ValueError('record must not be None') + cdef bcf_hdr_t *hdr = record.header.ptr cdef char *s @@ -540,33 +446,13 @@ cdef object bcf_info_get_value(VariantRecord record, const bcf_info_t *z): value = () elif z.len == 1: if z.type == BCF_BT_INT8: - if z.v1.i == bcf_int8_missing: - value = None - elif z.v1.i == bcf_int8_vector_end: - value = () - else: - value = z.v1.i + value = z.v1.i if z.v1.i != bcf_int8_missing else None elif z.type == BCF_BT_INT16: - if z.v1.i == bcf_int16_missing: - value = None - elif z.v1.i == bcf_int16_vector_end: - value = () - else: - value = z.v1.i + value = z.v1.i if z.v1.i != bcf_int16_missing else None elif z.type == BCF_BT_INT32: - if z.v1.i == bcf_int32_missing: - value = None - elif z.v1.i == bcf_int32_vector_end: - value = () - else: - value = z.v1.i + value = z.v1.i if z.v1.i != bcf_int32_missing else None elif z.type == BCF_BT_FLOAT: - if bcf_float_is_missing(z.v1.f): - value = None - elif bcf_float_is_vector_end(z.v1.f): - value = () - else: - value = z.v1.f + value = z.v1.f if not bcf_float_is_missing(z.v1.f) else None elif z.type == BCF_BT_CHAR: value = force_str(chr(z.v1.i)) else: @@ -581,17 +467,23 @@ cdef object bcf_info_get_value(VariantRecord record, const bcf_info_t *z): cdef object bcf_check_values(VariantRecord record, value, int hl_type, int ht_type, - int id, int bt_type, ssize_t bt_len, ssize_t *value_count, - int *scalar, int *realloc): + int id, int bt_type, ssize_t bt_len, + ssize_t *value_count, int *scalar, int *realloc): + + if record is None: + raise ValueError('record must not be None') bcf_get_value_count(record, hl_type, id, value_count, scalar) # Validate values now that we know the type and size - values = (value,) if not isinstance(value, tuple) else value + values = (value,) if not isinstance(value, (list, tuple)) else value # Validate values now that we know the type and size if ht_type == BCF_HT_FLAG: value_count[0] = 1 + elif hl_type == BCF_HL_FMT and is_gt_fmt(record.header.ptr, id): + # KBJ: htslib lies about the cardinality of GT fields-- they're really VLEN (-1) + value_count[0] = -1 if value_count[0] != -1 and value_count[0] != len(values): if scalar[0]: @@ -638,13 +530,16 @@ cdef object bcf_check_values(VariantRecord record, value, int hl_type, int ht_ty cdef bcf_encode_alleles(VariantRecord record, values): + if record is None: + raise ValueError('record must not be None') + cdef bcf1_t *r = record.ptr cdef int32_t nalleles = r.n_allele cdef list gt_values = [] cdef char *s cdef int i - if not values: + if values is None: return () if not isinstance(values, (list, tuple)): @@ -652,7 +547,7 @@ cdef bcf_encode_alleles(VariantRecord record, values): for value in values: if value is None: - gt_values.append(None) + gt_values.append(bcf_gt_missing) elif isinstance(value, (str, bytes)): bvalue = force_bytes(value) s = bvalue @@ -672,6 +567,9 @@ cdef bcf_encode_alleles(VariantRecord record, values): cdef bcf_info_set_value(VariantRecord record, key, value): + if record is None: + raise ValueError('record must not be None') + cdef bcf_hdr_t *hdr = record.header.ptr cdef bcf1_t *r = record.ptr cdef vdict_t *d @@ -696,9 +594,13 @@ cdef bcf_info_set_value(VariantRecord record, key, value): info_id = kh_val_vdict(d, k).id + if not check_header_id(hdr, BCF_HL_INFO, info_id): + raise ValueError('Invalid header') + info_type = bcf_hdr_id2type(hdr, BCF_HL_INFO, info_id) values = bcf_check_values(record, value, BCF_HL_INFO, info_type, info_id, - info.type if info else -1, info.len if info else -1, + info.type if info else -1, + info.len if info else -1, &value_count, &scalar, &realloc) if info_type == BCF_HT_FLAG: @@ -752,6 +654,9 @@ cdef bcf_info_set_value(VariantRecord record, key, value): cdef bcf_info_del_value(VariantRecord record, key): + if record is None: + raise ValueError('record must not be None') + cdef bcf_hdr_t *hdr = record.header.ptr cdef bcf1_t *r = record.ptr cdef ssize_t value_count @@ -779,6 +684,9 @@ cdef bcf_info_del_value(VariantRecord record, key): cdef bcf_format_get_value(VariantRecordSample sample, key): + if sample is None: + raise ValueError('sample must not be None') + cdef bcf_hdr_t *hdr = sample.record.header.ptr cdef bcf1_t *r = sample.record.ptr cdef ssize_t count @@ -809,6 +717,9 @@ cdef bcf_format_get_value(VariantRecordSample sample, key): cdef bcf_format_set_value(VariantRecordSample sample, key, value): + if sample is None: + raise ValueError('sample must not be None') + cdef bcf_hdr_t *hdr = sample.record.header.ptr cdef bcf1_t *r = sample.record.ptr cdef int fmt_id @@ -834,6 +745,9 @@ cdef bcf_format_set_value(VariantRecordSample sample, key, value): fmt_id = kh_val_vdict(d, k).id + if not check_header_id(hdr, BCF_HL_FMT, fmt_id): + raise ValueError('Invalid header') + fmt_type = bcf_hdr_id2type(hdr, BCF_HL_FMT, fmt_id) if fmt_type == BCF_HT_FLAG: @@ -841,9 +755,12 @@ cdef bcf_format_set_value(VariantRecordSample sample, key, value): if is_gt_fmt(hdr, fmt_id): value = bcf_encode_alleles(sample.record, value) + # KBJ: GT field is considered to be a string by the VCF header but BCF represents it as INT. + fmt_type = BCF_HT_INT values = bcf_check_values(sample.record, value, BCF_HL_FMT, fmt_type, fmt_id, - fmt.type if fmt else -1, fmt.n if fmt else -1, + fmt.type if fmt else -1, + fmt.n if fmt else -1, &value_count, &scalar, &realloc) vlen = value_count < 0 @@ -888,6 +805,9 @@ cdef bcf_format_set_value(VariantRecordSample sample, key, value): cdef bcf_format_del_value(VariantRecordSample sample, key): + if sample is None: + raise ValueError('sample must not be None') + cdef bcf_hdr_t *hdr = sample.record.header.ptr cdef bcf1_t *r = sample.record.ptr cdef ssize_t value_count @@ -915,6 +835,9 @@ cdef bcf_format_del_value(VariantRecordSample sample, key): cdef bcf_format_get_allele_indices(VariantRecordSample sample): + if sample is None: + raise ValueError('sample must not be None') + cdef bcf_hdr_t *hdr = sample.record.header.ptr cdef bcf1_t *r = sample.record.ptr cdef int32_t n = bcf_hdr_nsamples(hdr) @@ -942,7 +865,7 @@ cdef bcf_format_get_allele_indices(VariantRecordSample sample): for i in range(fmt0.n): if data8[i] == bcf_int8_vector_end: break - elif data8[i] == bcf_int8_missing: + elif data8[i] == bcf_gt_missing: a = -1 else: a = bcf_gt_allele(data8[i]) @@ -952,7 +875,7 @@ cdef bcf_format_get_allele_indices(VariantRecordSample sample): for i in range(fmt0.n): if data16[i] == bcf_int16_vector_end: break - elif data16[i] == bcf_int16_missing: + elif data16[i] == bcf_gt_missing: a = -1 else: a = bcf_gt_allele(data16[i]) @@ -962,7 +885,7 @@ cdef bcf_format_get_allele_indices(VariantRecordSample sample): for i in range(fmt0.n): if data32[i] == bcf_int32_vector_end: break - elif data32[i] == bcf_int32_missing: + elif data32[i] == bcf_gt_missing: a = -1 else: a = bcf_gt_allele(data32[i]) @@ -972,6 +895,9 @@ cdef bcf_format_get_allele_indices(VariantRecordSample sample): cdef bcf_format_get_alleles(VariantRecordSample sample): + if sample is None: + raise ValueError('sample must not be None') + cdef bcf_hdr_t *hdr = sample.record.header.ptr cdef bcf1_t *r = sample.record.ptr cdef int32_t nsamples = bcf_hdr_nsamples(hdr) @@ -1020,6 +946,9 @@ cdef bcf_format_get_alleles(VariantRecordSample sample): cdef bint bcf_sample_get_phased(VariantRecordSample sample): + if sample is None: + raise ValueError('sample must not be None') + cdef bcf_hdr_t *hdr = sample.record.header.ptr cdef bcf1_t *r = sample.record.ptr cdef int32_t n = bcf_hdr_nsamples(hdr) @@ -1080,6 +1009,9 @@ cdef bint bcf_sample_get_phased(VariantRecordSample sample): cdef bcf_sample_set_phased(VariantRecordSample sample, bint phased): + if sample is None: + raise ValueError('sample must not be None') + cdef bcf_hdr_t *hdr = sample.record.header.ptr cdef bcf1_t *r = sample.record.ptr cdef int32_t n = bcf_hdr_nsamples(hdr) @@ -1134,59 +1066,89 @@ cdef bcf_sample_set_phased(VariantRecordSample sample, bint phased): ## Variant Header objects ######################################################################## + +cdef bcf_header_remove_hrec(VariantHeader header, int i): + if header is None: + raise ValueError('header must not be None') + + cdef bcf_hdr_t *hdr = header.ptr + + if i < 0 or i >= hdr.nhrec: + raise ValueError('Invalid header record index') + + cdef bcf_hrec_t *hrec = hdr.hrec[i] + hdr.nhrec -= 1 + + if i < hdr.nhrec: + memmove(&hdr.hrec[i], &hdr.hrec[i+1], (hdr.nhrec-i)*sizeof(bcf_hrec_t*)) + + bcf_hrec_destroy(hrec) + hdr.hrec[hdr.nhrec] = NULL + hdr.dirty = 1 + + #FIXME: implement a full mapping interface -#FIXME: passing bcf_hrec_t* may not be the safest approach once mutating -# operations are allowed. +#FIXME: passing bcf_hrec_t* is not safe, since we cannot control the +# object lifetime. cdef class VariantHeaderRecord(object): """header record from a :class:`VariantHeader` object""" + def __init__(self, *args, **kwargs): + raise TypeError('this class cannot be instantiated from Python') - property type: + @property + def type(self): """header type: FILTER, INFO, FORMAT, CONTIG, STRUCTURED, or GENERIC""" - def __get__(self): - cdef bcf_hrec_t *r = self.ptr - return METADATA_TYPES[r.type] + cdef bcf_hrec_t *r = self.ptr + if not r: + return None + return METADATA_TYPES[r.type] - property key: + @property + def key(self): """header key (the part before '=', in FILTER/INFO/FORMAT/contig/fileformat etc.)""" - def __get__(self): - cdef bcf_hrec_t *r = self.ptr - return bcf_str_cache_get_charptr(r.key) if r.key else None + cdef bcf_hrec_t *r = self.ptr + return bcf_str_cache_get_charptr(r.key) if r and r.key else None - property value: + @property + def value(self): """header value. Set only for generic lines, None for FILTER/INFO, etc.""" - def __get__(self): - cdef bcf_hrec_t *r = self.ptr - return charptr_to_str(r.value) if r.value else None + cdef bcf_hrec_t *r = self.ptr + return charptr_to_str(r.value) if r and r.value else None - property attrs: + @property + def attrs(self): """sequence of additional header attributes""" - def __get__(self): - cdef bcf_hrec_t *r = self.ptr - cdef int i - return tuple((bcf_str_cache_get_charptr(r.keys[i]) if r.keys[i] else None, - charptr_to_str(r.vals[i]) if r.vals[i] else None) - for i in range(r.nkeys)) + cdef bcf_hrec_t *r = self.ptr + if not r: + return () + cdef int i + return tuple((bcf_str_cache_get_charptr(r.keys[i]) if r.keys[i] else None, + charptr_to_str(r.vals[i]) if r.vals[i] else None) + for i in range(r.nkeys)) def __len__(self): cdef bcf_hrec_t *r = self.ptr - return r.nkeys + return r.nkeys if r else 0 def __bool__(self): cdef bcf_hrec_t *r = self.ptr - return r.nkeys != 0 + return r != NULL and r.nkeys != 0 def __getitem__(self, key): """get attribute value""" cdef bcf_hrec_t *r = self.ptr cdef int i - bkey = force_bytes(key) - for i in range(r.nkeys): - if r.keys[i] and r.keys[i] == bkey: - return charptr_to_str(r.vals[i]) if r.vals[i] else None + if r: + bkey = force_bytes(key) + for i in range(r.nkeys): + if r.keys[i] and r.keys[i] == bkey: + return charptr_to_str(r.vals[i]) if r.vals[i] else None raise KeyError('cannot find metadata key') def __iter__(self): cdef bcf_hrec_t *r = self.ptr + if not r: + return cdef int i for i in range(r.nkeys): if r.keys[i]: @@ -1214,6 +1176,8 @@ cdef class VariantHeaderRecord(object): def itervalues(self): """D.itervalues() -> an iterator over the values of D""" cdef bcf_hrec_t *r = self.ptr + if not r: + return cdef int i for i in range(r.nkeys): if r.keys[i]: @@ -1222,6 +1186,8 @@ cdef class VariantHeaderRecord(object): def iteritems(self): """D.iteritems() -> an iterator over the (key, value) items of D""" cdef bcf_hrec_t *r = self.ptr + if not r: + return cdef int i for i in range(r.nkeys): if r.keys[i]: @@ -1246,11 +1212,34 @@ cdef class VariantHeaderRecord(object): def __str__(self): cdef bcf_hrec_t *r = self.ptr - if r.type == BCF_HL_GEN: - return '##{}={}'.format(self.key, self.value) - else: - attrs = ','.join('{}={}'.format(k, v) for k,v in self.attrs if k != 'IDX') - return '##{}=<{}>'.format(self.key or self.type, attrs) + + if not r: + raise ValueError('cannot convert deleted record to str') + + cdef kstring_t hrec_str + hrec_str.l = hrec_str.m = 0 + hrec_str.s = NULL + + bcf_hrec_format(r, &hrec_str) + + ret = charptr_to_str_w_len(hrec_str.s, hrec_str.l) + + if hrec_str.m: + free(hrec_str.s) + + return ret + + # FIXME: Not safe -- causes trivial segfaults at the moment + def remove(self): + cdef bcf_hdr_t *hdr = self.header.ptr + cdef bcf_hrec_t *r = self.ptr + if not r: + return + assert(r.key) + cdef char *key = r.key if r.type == BCF_HL_GEN else r.value + print('Removing header type={} key={} value={} hdr={}'.format(METADATA_TYPES[r.type], r.key, r.value, key)) + bcf_hdr_remove(hdr, r.type, key) + self.ptr = NULL cdef VariantHeaderRecord makeVariantHeaderRecord(VariantHeader header, bcf_hrec_t *hdr): @@ -1269,6 +1258,8 @@ cdef VariantHeaderRecord makeVariantHeaderRecord(VariantHeader header, bcf_hrec_ cdef class VariantHeaderRecords(object): """sequence of :class:`VariantHeaderRecord` object from a :class:`VariantHeader` object""" + def __init__(self, *args, **kwargs): + raise TypeError('this class cannot be instantiated from Python') def __len__(self): return self.header.ptr.nhrec @@ -1300,63 +1291,75 @@ cdef VariantHeaderRecords makeVariantHeaderRecords(VariantHeader header): cdef class VariantMetadata(object): - """filter, info or format metadata record from a :class:`VariantHeader` - object""" + """filter, info or format metadata record from a :class:`VariantHeader` object""" + def __init__(self, *args, **kwargs): + raise TypeError('this class cannot be instantiated from Python') - property name: + @property + def name(self): """metadata name""" - def __get__(self): - cdef bcf_hdr_t *hdr = self.header.ptr - return bcf_str_cache_get_charptr(hdr.id[BCF_DT_ID][self.id].key) + cdef bcf_hdr_t *hdr = self.header.ptr + return bcf_str_cache_get_charptr(hdr.id[BCF_DT_ID][self.id].key) # Q: Should this be exposed? - property id: + @property + def id(self): """metadata internal header id number""" - def __get__(self): - return self.id + return self.id - property number: + @property + def number(self): """metadata number (i.e. cardinality)""" - def __get__(self): - cdef bcf_hdr_t *hdr = self.header.ptr - if not bcf_hdr_idinfo_exists(hdr, self.type, self.id) or self.type == BCF_HL_FLT: - return None - cdef int l = bcf_hdr_id2length(hdr, self.type, self.id) - if l == BCF_VL_FIXED: - return bcf_hdr_id2number(hdr, self.type, self.id) - elif l == BCF_VL_VAR: - return '.' - else: - return METADATA_LENGTHS[l] + cdef bcf_hdr_t *hdr = self.header.ptr + + if not check_header_id(hdr, self.type, self.id): + raise ValueError('Invalid header id') + + if self.type == BCF_HL_FLT: + return None - property type: + cdef int l = bcf_hdr_id2length(hdr, self.type, self.id) + if l == BCF_VL_FIXED: + return bcf_hdr_id2number(hdr, self.type, self.id) + elif l == BCF_VL_VAR: + return '.' + else: + return METADATA_LENGTHS[l] + + @property + def type(self): """metadata value type""" - def __get__(self): - cdef bcf_hdr_t *hdr = self.header.ptr - if not bcf_hdr_idinfo_exists(hdr, self.type, self.id) or \ - self.type == BCF_HL_FLT: - return None - return VALUE_TYPES[bcf_hdr_id2type(hdr, self.type, self.id)] - - property description: + cdef bcf_hdr_t *hdr = self.header.ptr + if not check_header_id(hdr, self.type, self.id): + raise ValueError('Invalid header id') + + if self.type == BCF_HL_FLT: + return None + return VALUE_TYPES[bcf_hdr_id2type(hdr, self.type, self.id)] + + @property + def description(self): """metadata description (or None if not set)""" - def __get__(self): - descr = self.record.get('Description') - if descr: - descr = descr.strip('"') - return force_str(descr) - - property record: - """:class:`VariantHeaderRecord` associated with this - :class:`VariantMetadata` object""" - def __get__(self): - cdef bcf_hdr_t *hdr = self.header.ptr - if not bcf_hdr_idinfo_exists(hdr, self.type, self.id): - return None - cdef bcf_hrec_t *hrec = hdr.id[BCF_DT_ID][self.id].val.hrec[self.type] - if not hrec: - return None - return makeVariantHeaderRecord(self.header, hrec) + descr = self.record.get('Description') + if descr: + descr = descr.strip('"') + return force_str(descr) + + @property + def record(self): + """:class:`VariantHeaderRecord` associated with this :class:`VariantMetadata` object""" + cdef bcf_hdr_t *hdr = self.header.ptr + if not check_header_id(hdr, self.type, self.id): + raise ValueError('Invalid header id') + cdef bcf_hrec_t *hrec = hdr.id[BCF_DT_ID][self.id].val.hrec[self.type] + if not hrec: + return None + return makeVariantHeaderRecord(self.header, hrec) + + def remove_header(self): + cdef bcf_hdr_t *hdr = self.header.ptr + cdef const char *bkey = hdr.id[BCF_DT_ID][self.id].key + bcf_hdr_remove(hdr, self.type, bkey) cdef VariantMetadata makeVariantMetadata(VariantHeader header, int type, int id): @@ -1379,6 +1382,8 @@ cdef VariantMetadata makeVariantMetadata(VariantHeader header, int type, int id) cdef class VariantHeaderMetadata(object): """mapping from filter, info or format name to :class:`VariantMetadata` object""" + def __init__(self, *args, **kwargs): + raise TypeError('this class cannot be instantiated from Python') def add(self, id, number, type, description, **kwargs): """Add a new filter, info or format record""" @@ -1436,10 +1441,28 @@ cdef class VariantHeaderMetadata(object): cdef khiter_t k = kh_get_vdict(d, bkey) if k == kh_end(d) or kh_val_vdict(d, k).info[self.type] & 0xF == 0xF: - raise KeyError('invalid filter') + raise KeyError('invalid key') return makeVariantMetadata(self.header, self.type, kh_val_vdict(d, k).id) + def remove_header(self, key): + cdef bcf_hdr_t *hdr = self.header.ptr + cdef vdict_t *d = hdr.dict[BCF_DT_ID] + + bkey = force_bytes(key) + cdef khiter_t k = kh_get_vdict(d, bkey) + + if k == kh_end(d) or kh_val_vdict(d, k).info[self.type] & 0xF == 0xF: + raise KeyError('invalid key') + + bcf_hdr_remove(hdr, self.type, bkey) + #bcf_hdr_sync(hdr) + + def clear_header(self): + cdef bcf_hdr_t *hdr = self.header.ptr + bcf_hdr_remove(hdr, self.type, NULL) + #bcf_hdr_sync(hdr) + def __iter__(self): cdef bcf_hdr_t *hdr = self.header.ptr cdef bcf_idpair_t *idpair @@ -1510,31 +1533,38 @@ cdef VariantHeaderMetadata makeVariantHeaderMetadata(VariantHeader header, int32 cdef class VariantContig(object): """contig metadata from a :class:`VariantHeader`""" + def __init__(self, *args, **kwargs): + raise TypeError('this class cannot be instantiated from Python') - property name: + @property + def name(self): """contig name""" - def __get__(self): - cdef bcf_hdr_t *hdr = self.header.ptr - return bcf_str_cache_get_charptr(hdr.id[BCF_DT_CTG][self.id].key) + cdef bcf_hdr_t *hdr = self.header.ptr + return bcf_str_cache_get_charptr(hdr.id[BCF_DT_CTG][self.id].key) - property id: + @property + def id(self): """contig internal id number""" - def __get__(self): - return self.id + return self.id - property length: + @property + def length(self): """contig length or None if not available""" - def __get__(self): - cdef bcf_hdr_t *hdr = self.header.ptr - cdef uint32_t length = hdr.id[BCF_DT_CTG][self.id].val.info[0] - return length if length else None + cdef bcf_hdr_t *hdr = self.header.ptr + cdef uint32_t length = hdr.id[BCF_DT_CTG][self.id].val.info[0] + return length if length else None - property header: + @property + def header(self): """:class:`VariantHeaderRecord` associated with this :class:`VariantContig` object""" - def __get__(self): - cdef bcf_hdr_t *hdr = self.header.ptr - cdef bcf_hrec_t *hrec = hdr.id[BCF_DT_CTG][self.id].val.hrec[0] - return makeVariantHeaderRecord(self.header, hrec) + cdef bcf_hdr_t *hdr = self.header.ptr + cdef bcf_hrec_t *hrec = hdr.id[BCF_DT_CTG][self.id].val.hrec[0] + return makeVariantHeaderRecord(self.header, hrec) + + def remove_header(self): + cdef bcf_hdr_t *hdr = self.header.ptr + cdef const char *bkey = hdr.id[BCF_DT_CTG][self.id].key + bcf_hdr_remove(hdr, BCF_HL_CTG, bkey) cdef VariantContig makeVariantContig(VariantHeader header, int id): @@ -1553,6 +1583,8 @@ cdef VariantContig makeVariantContig(VariantHeader header, int id): cdef class VariantHeaderContigs(object): """mapping from contig name or index to :class:`VariantContig` object.""" + def __init__(self, *args, **kwargs): + raise TypeError('this class cannot be instantiated from Python') def __len__(self): cdef bcf_hdr_t *hdr = self.header.ptr @@ -1585,6 +1617,32 @@ cdef class VariantHeaderContigs(object): return makeVariantContig(self.header, id) + def remove_header(self, key): + cdef bcf_hdr_t *hdr = self.header.ptr + cdef int index + cdef const char *bkey + cdef vdict_t *d + cdef khiter_t k + + if isinstance(key, int): + index = key + if index < 0 or index >= hdr.n[BCF_DT_CTG]: + raise IndexError('invalid contig index') + bkey = hdr.id[BCF_DT_CTG][self.id].key + else: + d = hdr.dict[BCF_DT_CTG] + key = force_bytes(key) + if kh_get_vdict(d, key) == kh_end(d): + raise KeyError('invalid contig') + bkey = key + + bcf_hdr_remove(hdr, BCF_HL_CTG, bkey) + + def clear_header(self): + cdef bcf_hdr_t *hdr = self.header.ptr + bcf_hdr_remove(hdr, BCF_HL_CTG, NULL) + #bcf_hdr_sync(hdr) + def __iter__(self): cdef bcf_hdr_t *hdr = self.header.ptr cdef vdict_t *d = hdr.dict[BCF_DT_CTG] @@ -1662,6 +1720,8 @@ cdef VariantHeaderContigs makeVariantHeaderContigs(VariantHeader header): cdef class VariantHeaderSamples(object): """sequence of sample names from a :class:`VariantHeader` object""" + def __init__(self, *args, **kwargs): + raise TypeError('this class cannot be instantiated from Python') def __len__(self): return bcf_hdr_nsamples(self.header.ptr) @@ -1716,7 +1776,6 @@ cdef VariantHeaderSamples makeVariantHeaderSamples(VariantHeader header): cdef class VariantHeader(object): """header information for a :class:`VariantFile` object""" - #FIXME: Add structured proxy #FIXME: Add generic proxy #FIXME: Add mutable methods @@ -1743,42 +1802,48 @@ cdef class VariantHeader(object): def copy(self): return makeVariantHeader(bcf_hdr_dup(self.ptr)) - property version: + def merge(self, VariantHeader header): + if header is None: + raise ValueError('header must not be None') + bcf_hdr_merge(self.ptr, header.ptr) + + @property + def version(self): """VCF version""" - def __get__(self): - return force_str(bcf_hdr_get_version(self.ptr)) + return force_str(bcf_hdr_get_version(self.ptr)) - property samples: + @property + def samples(self): """samples (:class:`VariantHeaderSamples`)""" - def __get__(self): - return makeVariantHeaderSamples(self) + return makeVariantHeaderSamples(self) - property records: + @property + def records(self): """header records (:class:`VariantHeaderRecords`)""" - def __get__(self): - return makeVariantHeaderRecords(self) + return makeVariantHeaderRecords(self) - property contigs: + @property + def contigs(self): """contig information (:class:`VariantHeaderContigs`)""" - def __get__(self): - return makeVariantHeaderContigs(self) + return makeVariantHeaderContigs(self) - property filters: + @property + def filters(self): """filter metadata (:class:`VariantHeaderMetadata`)""" - def __get__(self): - return makeVariantHeaderMetadata(self, BCF_HL_FLT) + return makeVariantHeaderMetadata(self, BCF_HL_FLT) - property info: + @property + def info(self): """info metadata (:class:`VariantHeaderMetadata`)""" - def __get__(self): - return makeVariantHeaderMetadata(self, BCF_HL_INFO) + return makeVariantHeaderMetadata(self, BCF_HL_INFO) - property formats: + @property + def formats(self): """format metadata (:class:`VariantHeaderMetadata`)""" - def __get__(self): - return makeVariantHeaderMetadata(self, BCF_HL_FMT) + return makeVariantHeaderMetadata(self, BCF_HL_FMT) - property alts: + @property + def alts(self): """alt metadata (:class:`dict` ID->record). The data returned just a snapshot of alt records, is created @@ -1787,12 +1852,9 @@ cdef class VariantHeader(object): i.e. it is just a dict that reflects the state of alt records at the time it is created. - """ - def __get__(self): - return {record['ID']:record for record in self.records - if record.key.upper() == 'ALT' } - + return {record['ID']:record for record in self.records + if record.key.upper() == 'ALT' } # only safe to do when opening an htsfile cdef _subset_samples(self, include_samples): @@ -1824,15 +1886,23 @@ cdef class VariantHeader(object): finally: free(hstr) + cpdef VariantRecord new_record(self): + """Create a new empty VariantRecord""" + r = makeVariantRecord(self, bcf_init()) + r.ptr.n_sample = bcf_hdr_nsamples(self.ptr) + return r + def add_record(self, VariantHeaderRecord record): """Add an existing :class:`VariantHeaderRecord` to this header""" - cdef bcf_hrec_t *r = record.ptr + if record is None: + raise ValueError('record must not be None') - if r.type == BCF_HL_GEN: - self.add_meta(r.key, r.value) - else: - items = [(k,v) for k,v in record.attrs if k != 'IDX'] - self.add_meta(r.key, items=items) + cdef bcf_hrec_t *hrec = bcf_hrec_dup(record.ptr) + + bcf_hdr_add_hrec(self.ptr, hrec) + + if self.ptr.dirty: + bcf_hdr_sync(self.ptr) def add_line(self, line): """Add a metadata line to this header""" @@ -1901,6 +1971,8 @@ cdef VariantHeader makeVariantHeader(bcf_hdr_t *hdr): cdef class VariantRecordFilter(object): """Filters set on a :class:`VariantRecord` object, presented as a mapping from filter index or name to :class:`VariantMetadata` object""" + def __init__(self, *args, **kwargs): + raise TypeError('this class cannot be instantiated from Python') def __len__(self): return self.record.ptr.d.n_flt @@ -1928,12 +2000,28 @@ cdef class VariantRecordFilter(object): bkey = force_bytes(key) id = bcf_hdr_id2int(hdr, BCF_DT_ID, bkey) - if not bcf_hdr_idinfo_exists(hdr, BCF_HL_FLT, id) \ - or not bcf_has_filter(hdr, self.record.ptr, bkey): + if not check_header_id(hdr, BCF_HL_FLT, id) or not bcf_has_filter(hdr, r, bkey): raise KeyError('Invalid filter') return makeVariantMetadata(self.record.header, BCF_HL_FLT, id) + def add(self, key): + """Add a new filter""" + cdef bcf_hdr_t *hdr = self.record.header.ptr + cdef bcf1_t *r = self.record.ptr + cdef int id + + if key == '.': + key = 'PASS' + + bkey = force_bytes(key) + id = bcf_hdr_id2int(hdr, BCF_DT_ID, bkey) + + if not check_header_id(hdr, BCF_HL_FLT, id): + raise KeyError('Invalid filter') + + bcf_add_filter(hdr, r, id) + def __delitem__(self, key): cdef bcf_hdr_t *hdr = self.record.header.ptr cdef bcf1_t *r = self.record.ptr @@ -1954,8 +2042,7 @@ cdef class VariantRecordFilter(object): bkey = force_bytes(key) id = bcf_hdr_id2int(hdr, BCF_DT_ID, bkey) - if not bcf_hdr_idinfo_exists(hdr, BCF_HL_FLT, id) \ - or not bcf_has_filter(hdr, self.record.ptr, bkey): + if not check_header_id(hdr, BCF_HL_FLT, id) or not bcf_has_filter(hdr, r, bkey): raise KeyError('Invalid filter') bcf_remove_filter(hdr, r, id, 0) @@ -2032,6 +2119,8 @@ cdef VariantRecordFilter makeVariantRecordFilter(VariantRecord record): cdef class VariantRecordFormat(object): """Format data present for each sample in a :class:`VariantRecord` object, presented as mapping from format name to :class:`VariantMetadata` object.""" + def __init__(self, *args, **kwargs): + raise TypeError('this class cannot be instantiated from Python') def __len__(self): cdef bcf_hdr_t *hdr = self.record.header.ptr @@ -2155,8 +2244,7 @@ cdef VariantRecordFormat makeVariantRecordFormat(VariantRecord record): if not record: raise ValueError('invalid VariantRecord') - cdef VariantRecordFormat format = VariantRecordFormat.__new__( - VariantRecordFormat) + cdef VariantRecordFormat format = VariantRecordFormat.__new__(VariantRecordFormat) format.record = record return format @@ -2167,6 +2255,9 @@ cdef class VariantRecordInfo(object): """Info data stored in a :class:`VariantRecord` object, presented as a mapping from info metadata name to value.""" + def __init__(self, *args, **kwargs): + raise TypeError('this class cannot be instantiated from Python') + def __len__(self): return self.record.ptr.n_info @@ -2197,6 +2288,9 @@ cdef class VariantRecordInfo(object): else: info_id = info.key + if not check_header_id(hdr, BCF_HL_INFO, info_id): + raise ValueError('Invalid header') + if bcf_hdr_id2type(hdr, BCF_HL_INFO, info_id) == BCF_HT_FLAG: return info != NULL and info.vptr != NULL @@ -2331,6 +2425,8 @@ cdef VariantRecordInfo makeVariantRecordInfo(VariantRecord record): cdef class VariantRecordSamples(object): """mapping from sample index or name to :class:`VariantRecordSample` object.""" + def __init__(self, *args, **kwargs): + raise TypeError('this class cannot be instantiated from Python') def __len__(self): return bcf_hdr_nsamples(self.record.header.ptr) @@ -2445,226 +2541,279 @@ cdef VariantRecordSamples makeVariantRecordSamples(VariantRecord record): cdef class VariantRecord(object): """Variant record""" + def __init__(self, *args, **kwargs): + raise TypeError('this class cannot be instantiated from Python') def __dealloc__(self): if self.ptr: bcf_destroy1(self.ptr) self.ptr = NULL - property rid: + def copy(self): + """return a copy of this VariantRecord object""" + return makeVariantRecord(self.header, bcf_dup(self.ptr)) + + def translate(self, VariantHeader dst_header): + if dst_header is None: + raise ValueError('dst_header must not be None') + + cdef bcf_hdr_t *src_hdr = self.header.ptr + cdef bcf_hdr_t *dst_hdr = dst_header.ptr + + if src_hdr != dst_hdr: + if self.ptr.n_sample != bcf_hdr_nsamples(dst_hdr): + msg = 'Cannot translate record. Number of samples does not match header ({} vs {})' + raise ValueError(msg.format(self.ptr.n_sample, bcf_hdr_nsamples(dst_hdr))) + + bcf_translate(dst_hdr, src_hdr, self.ptr) + + @property + def rid(self): """internal reference id number""" - def __get__(self): - return self.ptr.rid - def __set__(self, rid): - cdef bcf_hdr_t *hdr = self.header.ptr - cdef int r = rid - if rid < 0 or r >= hdr.n[BCF_DT_CTG] or not hdr.id[BCF_DT_CTG][r].val: - raise ValueError('invalid reference id') - self.ptr.rid = r - - property chrom: + return self.ptr.rid + + @rid.setter + def rid(self, value): + cdef bcf_hdr_t *hdr = self.header.ptr + cdef int r = value + if r < 0 or r >= hdr.n[BCF_DT_CTG] or not hdr.id[BCF_DT_CTG][r].val: + raise ValueError('invalid reference id') + self.ptr.rid = r + + @property + def chrom(self): """chromosome/contig name""" - def __get__(self): - return bcf_str_cache_get_charptr(bcf_hdr_id2name(self.header.ptr, self.ptr.rid)) - def __set__(self, chrom): - cdef vdict_t *d = self.header.ptr.dict[BCF_DT_CTG] - bchrom = force_bytes(chrom) - cdef khint_t k = kh_get_vdict(d, bchrom) - if k == kh_end(d): - raise ValueError('Invalid chromosome/contig') - self.ptr.rid = kh_val_vdict(d, k).id - - property contig: + cdef bcf_hdr_t *hdr = self.header.ptr + cdef int rid = self.ptr.rid + if rid < 0 or rid >= hdr.n[BCF_DT_CTG]: + raise ValueError('Invalid header') + return bcf_str_cache_get_charptr(bcf_hdr_id2name(hdr, rid)) + + @chrom.setter + def chrom(self, value): + cdef vdict_t *d = self.header.ptr.dict[BCF_DT_CTG] + bchrom = force_bytes(value) + cdef khint_t k = kh_get_vdict(d, bchrom) + if k == kh_end(d): + raise ValueError('Invalid chromosome/contig') + self.ptr.rid = kh_val_vdict(d, k).id + + @property + def contig(self): """chromosome/contig name""" - def __get__(self): - return bcf_str_cache_get_charptr(bcf_hdr_id2name(self.header.ptr, self.ptr.rid)) - def __set__(self, chrom): - cdef vdict_t *d = self.header.ptr.dict[BCF_DT_CTG] - bchrom = force_bytes(chrom) - cdef khint_t k = kh_get_vdict(d, bchrom) - if k == kh_end(d): - raise ValueError('Invalid chromosome/contig') - self.ptr.rid = kh_val_vdict(d, k).id - - property pos: + cdef bcf_hdr_t *hdr = self.header.ptr + cdef int rid = self.ptr.rid + if rid < 0 or rid >= hdr.n[BCF_DT_CTG]: + raise ValueError('Invalid header') + return bcf_str_cache_get_charptr(bcf_hdr_id2name(hdr, rid)) + + @contig.setter + def contig(self, value): + cdef vdict_t *d = self.header.ptr.dict[BCF_DT_CTG] + bchrom = force_bytes(value) + cdef khint_t k = kh_get_vdict(d, bchrom) + if k == kh_end(d): + raise ValueError('Invalid chromosome/contig') + self.ptr.rid = kh_val_vdict(d, k).id + + @property + def pos(self): """record start position on chrom/contig (1-based inclusive)""" - def __get__(self): - return self.ptr.pos + 1 - def __set__(self, pos): - if pos < 1: - raise ValueError('Position must be positive') - # FIXME: check start <= stop? - # KBJ: Can't or else certain mutating operations will become - # difficult or impossible. e.g. having to delete - # info['END'] before being able to reset pos is going to - # create subtle bugs. Better to check this when writing - # records. - self.ptr.pos = pos - 1 - - property start: + return self.ptr.pos + 1 + + @pos.setter + def pos(self, value): + cdef int p = value + if p < 1: + raise ValueError('Position must be positive') + self.ptr.pos = p - 1 + + @property + def start(self): """record start position on chrom/contig (0-based inclusive)""" - def __get__(self): - return self.ptr.pos - def __set__(self, start): - if start < 0: - raise ValueError('Start coordinate must be non-negative') - # FIXME: check start <= stop? - # KBJ: See above. - self.ptr.pos = start - - property stop: + return self.ptr.pos + + @start.setter + def start(self, value): + cdef int s = value + if s < 0: + raise ValueError('Start coordinate must be non-negative') + self.ptr.pos = s + + @property + def stop(self): """record stop position on chrom/contig (0-based exclusive)""" - def __get__(self): - return self.ptr.pos + self.ptr.rlen - def __set__(self, stop): - if stop < self.ptr.pos: - raise ValueError('Stop coordinate must be greater than or equal to start') - self.ptr.rlen = stop - self.ptr.pos - - property rlen: + return self.ptr.pos + self.ptr.rlen + + @stop.setter + def stop(self, value): + cdef int s = value + if s < self.ptr.pos: + raise ValueError('Stop coordinate must be greater than or equal to start') + self.ptr.rlen = s - self.ptr.pos + if self.ptr.rlen != len(self.ref) or 'END' in self.info: + self.info['END'] = s + + @property + def rlen(self): """record length on chrom/contig (typically rec.stop - rec.start unless END info is supplied)""" - def __get__(self): - return self.ptr.rlen - def __set__(self, rlen): - if rlen < 0: - raise ValueError('Reference length must be non-negative') - self.ptr.rlen = rlen - - property qual: + return self.ptr.rlen + + @rlen.setter + def rlen(self, value): + cdef int r = value + if r < 0: + raise ValueError('Reference length must be non-negative') + self.ptr.rlen = r + if r != len(self.ref) or 'END' in self.info: + self.info['END'] = self.ptr.pos + r + + @property + def qual(self): """phred scaled quality score or None if not available""" - def __get__(self): - return self.ptr.qual if not bcf_float_is_missing(self.ptr.qual) else None - def __set__(self, qual): - if qual is not None: - self.ptr.qual = qual - else: - bcf_float_set(&self.ptr.qual, bcf_float_missing) + return self.ptr.qual if not bcf_float_is_missing(self.ptr.qual) else None + + @qual.setter + def qual(self, value): + if value is not None: + self.ptr.qual = value + else: + bcf_float_set(&self.ptr.qual, bcf_float_missing) + -# property n_allele: -# def __get__(self): -# return self.ptr.n_allele +# @property +# def n_allele(self): +# return self.ptr.n_allele -# property n_sample: -# def __get__(self): -# return self.ptr.n_sample +# @property +# def n_sample(self): +# return self.ptr.n_sample - property id: + @property + def id(self): """record identifier or None if not available""" - def __get__(self): - cdef bcf1_t *r = self.ptr - if bcf_unpack(r, BCF_UN_STR) < 0: - raise ValueError('Error unpacking VariantRecord') - return bcf_str_cache_get_charptr(r.d.id) if r.d.id != b'.' else None - def __set__(self, id): - cdef bcf1_t *r = self.ptr - if bcf_unpack(r, BCF_UN_STR) < 0: - raise ValueError('Error unpacking VariantRecord') - cdef char *idstr = NULL - if id is not None: - bid = force_bytes(id) - idstr = bid - if bcf_update_id(self.header.ptr, self.ptr, idstr) < 0: - raise ValueError('Error updating id') - - property ref: + cdef bcf1_t *r = self.ptr + if bcf_unpack(r, BCF_UN_STR) < 0: + raise ValueError('Error unpacking VariantRecord') + return bcf_str_cache_get_charptr(r.d.id) if r.d.id != b'.' else None + + @id.setter + def id(self, value): + cdef bcf1_t *r = self.ptr + if bcf_unpack(r, BCF_UN_STR) < 0: + raise ValueError('Error unpacking VariantRecord') + cdef char *idstr = NULL + if value is not None: + bid = force_bytes(value) + idstr = bid + if bcf_update_id(self.header.ptr, self.ptr, idstr) < 0: + raise ValueError('Error updating id') + + @property + def ref(self): """reference allele""" - def __get__(self): - cdef bcf1_t *r = self.ptr - if bcf_unpack(r, BCF_UN_STR) < 0: - raise ValueError('Error unpacking VariantRecord') - return charptr_to_str(r.d.allele[0]) if r.d.allele else None - def __set__(self, ref): - cdef bcf1_t *r = self.ptr - if bcf_unpack(r, BCF_UN_STR) < 0: - raise ValueError('Error unpacking VariantRecord') - #FIXME: Set alleles directly -- this is stupid - if not ref: - raise ValueError('ref allele cannot be null') - ref = force_bytes(ref) - if r.d.allele and r.n_allele: - alleles = [r.d.allele[i] for i in range(r.n_allele)] - alleles[0] = ref - else: - alleles = [ref] - self.alleles = alleles + cdef bcf1_t *r = self.ptr + if bcf_unpack(r, BCF_UN_STR) < 0: + raise ValueError('Error unpacking VariantRecord') + return charptr_to_str(r.d.allele[0]) if r.d.allele else None - property alleles: + @ref.setter + def ref(self, value): + cdef bcf1_t *r = self.ptr + if bcf_unpack(r, BCF_UN_STR) < 0: + raise ValueError('Error unpacking VariantRecord') + #FIXME: Set alleles directly -- this is stupid + if not value: + raise ValueError('ref allele must not be null') + value = force_bytes(value) + if r.d.allele and r.n_allele: + alleles = [r.d.allele[i] for i in range(r.n_allele)] + alleles[0] = value + else: + alleles = [value] + self.alleles = alleles + + @property + def alleles(self): """tuple of reference allele followed by alt alleles""" - def __get__(self): - cdef bcf1_t *r = self.ptr - if bcf_unpack(r, BCF_UN_STR) < 0: - raise ValueError('Error unpacking VariantRecord') - if not r.d.allele: - return None - cdef tuple res = PyTuple_New(r.n_allele) - for i in range(r.n_allele): - a = charptr_to_str(r.d.allele[i]) - PyTuple_SET_ITEM(res, i, a) - Py_INCREF(a) - return res - def __set__(self, values): - cdef bcf1_t *r = self.ptr - if bcf_unpack(r, BCF_UN_STR) < 0: - raise ValueError('Error unpacking VariantRecord') - values = [force_bytes(v) for v in values] - if b'' in values: - raise ValueError('cannot set null allele') - values = b','.join(values) - if bcf_update_alleles_str(self.header.ptr, r, values) < 0: - raise ValueError('Error updating alleles') - - property alts: + cdef bcf1_t *r = self.ptr + if bcf_unpack(r, BCF_UN_STR) < 0: + raise ValueError('Error unpacking VariantRecord') + if not r.d.allele: + return None + cdef tuple res = PyTuple_New(r.n_allele) + for i in range(r.n_allele): + a = charptr_to_str(r.d.allele[i]) + PyTuple_SET_ITEM(res, i, a) + Py_INCREF(a) + return res + + @alleles.setter + def alleles(self, value): + cdef bcf1_t *r = self.ptr + if bcf_unpack(r, BCF_UN_STR) < 0: + raise ValueError('Error unpacking VariantRecord') + value = [force_bytes(v) for v in value] + if b'' in value: + raise ValueError('cannot set null allele') + value = b','.join(value) + if bcf_update_alleles_str(self.header.ptr, r, value) < 0: + raise ValueError('Error updating alleles') + + @property + def alts(self): """tuple of alt alleles""" - def __get__(self): - cdef bcf1_t *r = self.ptr - if bcf_unpack(r, BCF_UN_STR) < 0: - raise ValueError('Error unpacking VariantRecord') - if r.n_allele < 2 or not r.d.allele: - return None - cdef tuple res = PyTuple_New(r.n_allele - 1) - for i in range(1, r.n_allele): - a = charptr_to_str(r.d.allele[i]) - PyTuple_SET_ITEM(res, i - 1, a) - Py_INCREF(a) - return res - def __set__(self, values): - #FIXME: Set alleles directly -- this is stupid - cdef bcf1_t *r = self.ptr - if bcf_unpack(r, BCF_UN_STR) < 0: - raise ValueError('Error unpacking VariantRecord') - values = [force_bytes(v) for v in values] - if b'' in values: - raise ValueError('cannot set null alt allele') - ref = [r.d.allele[0] if r.d.allele and r.n_allele else b'.'] - self.alleles = ref + values - - property filter: + cdef bcf1_t *r = self.ptr + if bcf_unpack(r, BCF_UN_STR) < 0: + raise ValueError('Error unpacking VariantRecord') + if r.n_allele < 2 or not r.d.allele: + return None + cdef tuple res = PyTuple_New(r.n_allele - 1) + for i in range(1, r.n_allele): + a = charptr_to_str(r.d.allele[i]) + PyTuple_SET_ITEM(res, i - 1, a) + Py_INCREF(a) + return res + + @alts.setter + def alts(self, value): + #FIXME: Set alleles directly -- this is stupid + cdef bcf1_t *r = self.ptr + if bcf_unpack(r, BCF_UN_STR) < 0: + raise ValueError('Error unpacking VariantRecord') + value = [force_bytes(v) for v in value] + if b'' in value: + raise ValueError('cannot set null alt allele') + ref = [r.d.allele[0] if r.d.allele and r.n_allele else b'.'] + self.alleles = ref + value + + @property + def filter(self): """filter information (see :class:`VariantRecordFilter`)""" - def __get__(self): - if bcf_unpack(self.ptr, BCF_UN_FLT) < 0: - raise ValueError('Error unpacking VariantRecord') - return makeVariantRecordFilter(self) + if bcf_unpack(self.ptr, BCF_UN_FLT) < 0: + raise ValueError('Error unpacking VariantRecord') + return makeVariantRecordFilter(self) - property info: + @property + def info(self): """info data (see :class:`VariantRecordInfo`)""" - def __get__(self): - if bcf_unpack(self.ptr, BCF_UN_INFO) < 0: - raise ValueError('Error unpacking VariantRecord') - return makeVariantRecordInfo(self) + if bcf_unpack(self.ptr, BCF_UN_INFO) < 0: + raise ValueError('Error unpacking VariantRecord') + return makeVariantRecordInfo(self) - property format: + @property + def format(self): """sample format metadata (see :class:`VariantRecordFormat`)""" - def __get__(self): - if bcf_unpack(self.ptr, BCF_UN_FMT) < 0: - raise ValueError('Error unpacking VariantRecord') - return makeVariantRecordFormat(self) + if bcf_unpack(self.ptr, BCF_UN_FMT) < 0: + raise ValueError('Error unpacking VariantRecord') + return makeVariantRecordFormat(self) - property samples: + @property + def samples(self): """sample data (see :class:`VariantRecordSamples`)""" - def __get__(self): - if bcf_unpack(self.ptr, BCF_UN_ALL) < 0: - raise ValueError('Error unpacking VariantRecord') - return makeVariantRecordSamples(self) + if bcf_unpack(self.ptr, BCF_UN_ALL) < 0: + raise ValueError('Error unpacking VariantRecord') + return makeVariantRecordSamples(self) def __str__(self): cdef kstring_t line @@ -2700,6 +2849,27 @@ cdef VariantRecord makeVariantRecord(VariantHeader header, bcf1_t *r): if not r: raise ValueError('cannot create VariantRecord') + if r.errcode: + msg = [] + #if r.errcode & BCF_ERR_CTG_UNDEF: + # msg.append('undefined contig') + #if r.errcode & BCF_ERR_TAG_UNDEF: + # msg.append('undefined tag') + if r.errcode & BCF_ERR_NCOLS: + msg.append('invalid number of columns') + if r.errcode & BCF_ERR_LIMITS: + msg.append('limits violated') + if r.errcode & BCF_ERR_CHAR: + msg.append('invalid character found') + if r.errcode & BCF_ERR_CTG_INVALID: + msg.append('invalid contig') + if r.errcode & BCF_ERR_TAG_INVALID: + msg.append('invalid tag') + + if msg: + msg = ', '.join(msg) + raise ValueError('Error(s) reading record: {}'.format(msg)) + cdef VariantRecord record = VariantRecord.__new__(VariantRecord) record.header = header record.ptr = r @@ -2718,43 +2888,55 @@ cdef class VariantRecordSample(object): Provides data accessors for genotypes and a mapping interface from format name to values. """ + def __init__(self, *args, **kwargs): + raise TypeError('this class cannot be instantiated from Python') - property name: + @property + def name(self): """sample name""" - def __get__(self): - cdef bcf_hdr_t *hdr = self.record.header.ptr - cdef bcf1_t *r = self.record.ptr - cdef int32_t n = bcf_hdr_nsamples(hdr) + cdef bcf_hdr_t *hdr = self.record.header.ptr + cdef bcf1_t *r = self.record.ptr + cdef int32_t n = bcf_hdr_nsamples(hdr) - if self.index < 0 or self.index >= n: - raise ValueError('invalid sample index') + if self.index < 0 or self.index >= n: + raise ValueError('invalid sample index') - return charptr_to_str(hdr.samples[self.index]) + return charptr_to_str(hdr.samples[self.index]) - property allele_indices: + @property + def allele_indices(self): """allele indices for called genotype, if present. Otherwise None""" - def __get__(self): - return bcf_format_get_allele_indices(self) - def __set__(self, values): - self['GT'] = values - def __del__(self): - self['GT'] = () - - property alleles: + return bcf_format_get_allele_indices(self) + + @allele_indices.setter + def allele_indices(self, value): + self['GT'] = value + + @allele_indices.deleter + def allele_indices(self): + self['GT'] = () + + @property + def alleles(self): """alleles for called genotype, if present. Otherwise None""" - def __get__(self): - return bcf_format_get_alleles(self) - def __set__(self, values): - self['GT'] = values - def __del__(self): - self['GT'] = () - - property phased: + return bcf_format_get_alleles(self) + + @alleles.setter + def alleles(self, value): + self['GT'] = value + + @alleles.deleter + def alleles(self): + self['GT'] = () + + @property + def phased(self): """False if genotype is missing or any allele is unphased. Otherwise True.""" - def __get__(self): - return bcf_sample_get_phased(self) - def __set__(self, value): - bcf_sample_set_phased(self, value) + return bcf_sample_get_phased(self) + + @phased.setter + def phased(self, value): + bcf_sample_set_phased(self, value) def __len__(self): cdef bcf_hdr_t *hdr = self.record.header.ptr @@ -2956,10 +3138,7 @@ cdef class BCFIndex(object): cdef int n cdef const char **refs = bcf_index_seqnames(self.ptr, self.header.ptr, &n) - if not refs: - raise ValueError('Cannot retrieve reference sequence names') - - self.refs = char_array_to_tuple(refs, n, free_after=1) + self.refs = char_array_to_tuple(refs, n, free_after=1) if refs else () self.refmap = { r:i for i,r in enumerate(self.refs) } def __dealloc__(self): @@ -2998,10 +3177,7 @@ cdef class TabixIndex(BaseIndex): cdef int n cdef const char **refs = tbx_seqnames(self.ptr, &n) - if not refs: - raise ValueError('Cannot retrieve reference sequence names') - - self.refs = char_array_to_tuple(refs, n, free_after=1) + self.refs = char_array_to_tuple(refs, n, free_after=1) if refs else () self.refmap = { r:i for i,r in enumerate(self.refs) } def __dealloc__(self): @@ -3046,6 +3222,8 @@ cdef void _stop_BCFIterator(BCFIterator self, bcf1_t *record): cdef class BCFIterator(BaseIterator): def __init__(self, VariantFile bcf, contig=None, start=None, stop=None, region=None, reopen=True): + if bcf is None: + raise ValueError('bcf must not be None') if not isinstance(bcf.index, BCFIndex): raise ValueError('bcf index required') @@ -3075,7 +3253,7 @@ cdef class BCFIterator(BaseIterator): try: rid = index.refmap[contig] except KeyError: - raise('Unknown contig specified') + raise ValueError('Unknown contig specified') if start is None: start = 0 @@ -3138,6 +3316,9 @@ cdef class TabixIterator(BaseIterator): self.line_buffer.s = NULL def __init__(self, VariantFile bcf, contig=None, start=None, stop=None, region=None, reopen=True): + if bcf is None: + raise ValueError('bcf must not be None') + if not isinstance(bcf.index, TabixIndex): raise ValueError('tabix index required') @@ -3226,34 +3407,58 @@ cdef class TabixIterator(BaseIterator): ######################################################################## -cdef class VariantFile(object): - """*(filename, mode=None, index_filename=None, header=None, drop_samples=False)* +cdef class VariantFile(HTSFile): + """*(filename, mode=None, index_filename=None, header=None, drop_samples=False, + duplicate_filehandle=True)* A :term:`VCF`/:term:`BCF` formatted file. The file is automatically opened. - *mode* should be ``r`` for reading or ``w`` for writing. The default is - text mode (:term:`VCF`). For binary (:term:`BCF`) I/O you should append - ``b`` for compressed or ``u`` for uncompressed :term:`BCF` output. + If an index for a variant file exists (.csi or .tbi), it will be + opened automatically. Without an index random access to records + via :meth:`fetch` is disabled. + + For writing, a :class:`VariantHeader` object must be provided, + typically obtained from another :term:`VCF` file/:term:`BCF` + file. + + Parameters + ---------- + mode : string + *mode* should be ``r`` for reading or ``w`` for writing. The default is + text mode (:term:`VCF`). For binary (:term:`BCF`) I/O you should append + ``b`` for compressed or ``u`` for uncompressed :term:`BCF` output. + + If ``b`` is present, it must immediately follow ``r`` or ``w``. Valid + modes are ``r``, ``w``, ``wh``, ``rb``, ``wb``, ``wbu`` and ``wb0``. + For instance, to open a :term:`BCF` formatted file for reading, type:: - If ``b`` is present, it must immediately follow ``r`` or ``w``. Valid - modes are ``r``, ``w``, ``wh``, ``rb``, ``wb``, ``wbu`` and ``wb0``. - For instance, to open a :term:`BCF` formatted file for reading, type:: + f = pysam.VariantFile('ex1.bcf','r') - f = pysam.VariantFile('ex1.bcf','rb') + If mode is not specified, we will try to auto-detect the file type. All + of the following should work:: - If mode is not specified, we will try to auto-detect in the order 'rb', - 'r', thus both the following should work:: + f1 = pysam.VariantFile('ex1.bcf') + f2 = pysam.VariantFile('ex1.vcf') + f3 = pysam.VariantFile('ex1.vcf.gz') - f1 = pysam.VariantFile('ex1.bcf') - f2 = pysam.VariantFile('ex1.vcf') + index_filename : string + Explicit path to an index file. - If an index for a variant file exists (.csi or .tbi), it will be opened - automatically. Without an index random access to records via - :meth:`fetch` is disabled. + header : VariantHeader + :class:`VariantHeader` object required for writing. + + drop_samples: bool + Ignore sample information when reading. + + duplicate_filehandle: bool + By default, file handles passed either directly or through + File-like objects will be duplicated before passing them to + htslib. The duplication prevents issues where the same stream + will be closed by htslib and through destruction of the + high-level python object. Set to False to turn off + duplication. - For writing, a :class:`VariantHeader` object must be provided, typically - obtained from another :term:`VCF` file/:term:`BCF` file. """ def __cinit__(self, *args, **kwargs): self.htsfile = NULL @@ -3268,88 +3473,38 @@ cdef class VariantFile(object): self.is_remote = False self.is_reading = False self.drop_samples = False + self.header_written = False self.start_offset = -1 self.open(*args, **kwargs) - def __dealloc__(self): - if self.htsfile: - hts_close(self.htsfile) - self.htsfile = NULL - - def __enter__(self): - return self - - def __exit__(self, exc_type, exc_value, traceback): - self.close() - return False - - property category: - """General file format category. One of UNKNOWN, ALIGNMENTS, - VARIANTS, INDEX, REGIONS""" - def __get__(self): - if not self.htsfile: - raise ValueError('metadata not available on closed file') - return FORMAT_CATEGORIES[self.htsfile.format.category] - - property format: - """File format. - - One of UNKNOWN, BINARY_FORMAT, TEXT_FORMAT, SAM, BAM, - BAI, CRAM, CRAI, VCF, BCF, CSI, GZI, TBI, BED. - """ - def __get__(self): - if not self.htsfile: - raise ValueError('metadata not available on closed file') - return FORMATS[self.htsfile.format.format] - - property version: - """Tuple of file format version numbers (major, minor)""" - def __get__(self): - if not self.htsfile: - raise ValueError('metadata not available on closed file') - return (self.htsfile.format.version.major, - self.htsfile.format.version.minor) - - property compression: - """File compression. - - One of NONE, GZIP, BGZF, CUSTOM.""" - def __get__(self): - if not self.htsfile: - raise ValueError('metadata not available on closed file') - return COMPRESSION[self.htsfile.format.compression] - - property description: - """Vaguely human readable description of the file format""" - def __get__(self): - if not self.htsfile: - raise ValueError('metadata not available on closed file') - cdef char *desc = hts_format_description(&self.htsfile.format) - try: - return charptr_to_str(desc) - finally: - free(desc) - def close(self): """closes the :class:`pysam.VariantFile`.""" + cdef int ret = 0 + self.header = self.index = None if self.htsfile: - hts_close(self.htsfile) + # Write header if no records were written + if self.htsfile.is_write and not self.header_written: + self.header_written = True + with nogil: + bcf_hdr_write(self.htsfile, self.header.ptr) + + ret = hts_close(self.htsfile) self.htsfile = NULL - self.header = self.index = None - property is_open: - def __get__(self): - """return True if VariantFile is open and in a valid state.""" - return self.htsfile != NULL + if ret < 0: + global errno + if errno == EPIPE: + errno = 0 + else: + raise OSError(errno, force_str(strerror(errno))) def __iter__(self): if not self.is_open: raise ValueError('I/O operation on closed file') - if not self.mode.startswith(b'r'): - raise ValueError( - 'cannot iterate over Variantfile opened for writing') + if self.htsfile.is_write: + raise ValueError('cannot iterate over Variantfile opened for writing') self.is_reading = 1 return self @@ -3382,13 +3537,9 @@ cdef class VariantFile(object): cdef VariantFile vars = VariantFile.__new__(VariantFile) cdef bcf_hdr_t *hdr - cdef char *cfilename - cdef char *cmode # FIXME: re-open using fd or else header and index could be invalid - cfilename, cmode = self.filename, self.mode - with nogil: - vars.htsfile = hts_open(cfilename, cmode) + vars.htsfile = self._open_htsfile() if not vars.htsfile: raise ValueError('Cannot re-open htsfile') @@ -3406,6 +3557,7 @@ cdef class VariantFile(object): vars.is_remote = self.is_remote vars.is_reading = self.is_reading vars.start_offset = self.start_offset + vars.header_written = self.header_written if self.htsfile.is_bin: vars.seek(self.tell()) @@ -3416,10 +3568,11 @@ cdef class VariantFile(object): return vars - def open(self, filename, mode='rb', + def open(self, filename, mode='r', index_filename=None, VariantHeader header=None, - drop_samples=False): + drop_samples=False, + duplicate_filehandle=True): """open a vcf/bcf file. If open is called on an existing VariantFile, the current file will be @@ -3437,24 +3590,51 @@ cdef class VariantFile(object): if self.is_open: self.close() - if mode not in ('r','w','rb','wb', 'wh', 'wbu', 'rU', 'wb0'): - raise ValueError('invalid file opening mode `{}`'.format(mode)) + if not mode or mode[0] not in 'rwa': + raise ValueError('mode must begin with r, w or a') + + self.duplicate_filehandle = duplicate_filehandle + + format_modes = [m for m in mode[1:] if m in 'bcguz'] + if len(format_modes) > 1: + raise ValueError('mode contains conflicting format specifiers: {}'.format(''.join(format_modes))) + + invalid_modes = [m for m in mode[1:] if m not in 'bcguz0123456789ex'] + if invalid_modes: + raise ValueError('invalid mode options: {}'.format(''.join(invalid_modes))) + + # Autodetect mode from filename + if mode == 'w' and isinstance(filename, str): + if filename.endswith('.gz'): + mode = 'wz' + elif filename.endswith('.bcf'): + mode = 'wb' # for htslib, wbu seems to not work if mode == 'wbu': mode = 'wb0' self.mode = mode = force_bytes(mode) - self.filename = filename = encode_filename(filename) + try: + filename = encode_filename(filename) + self.is_remote = hisremote(filename) + self.is_stream = filename == b'-' + except TypeError: + filename = filename + self.is_remote = False + self.is_stream = True + + self.filename = filename + if index_filename is not None: self.index_filename = index_filename = encode_filename(index_filename) else: self.index_filename = None + self.drop_samples = bool(drop_samples) self.header = None - self.is_remote = hisremote(filename) - self.is_stream = filename == b'-' + self.header_written = False if mode.startswith(b'w'): # open file for writing @@ -3465,29 +3645,22 @@ cdef class VariantFile(object): if header: self.header = header.copy() else: - raise ValueError('a VariantHeader must be specified') + self.header = VariantHeader() + #raise ValueError('a VariantHeader must be specified') - # open file. Header gets written to file at the same time - # for bam files and sam files (in the latter case, the - # mode needs to be wh) - cfilename, cmode = filename, mode - with nogil: - self.htsfile = hts_open(cfilename, cmode) + # Header is not written until the first write or on close + self.htsfile = self._open_htsfile() if not self.htsfile: - raise ValueError("could not open file `{}` (mode='{}')".format((filename, mode))) - - with nogil: - bcf_hdr_write(self.htsfile, self.header.ptr) + raise ValueError("could not open file `{}` (mode='{}')".format(filename, mode)) elif mode.startswith(b'r'): # open file for reading - if filename != b'-' and not self.is_remote and not os.path.exists(filename): + + if not self._exists(): raise IOError('file `{}` not found'.format(filename)) - cfilename, cmode = filename, mode - with nogil: - self.htsfile = hts_open(cfilename, cmode) + self.htsfile = self._open_htsfile() if not self.htsfile: raise ValueError("could not open file `{}` (mode='{}') - is it VCF/BCF format?".format(filename, mode)) @@ -3508,15 +3681,20 @@ cdef class VariantFile(object): except ValueError: raise ValueError("file `{}` does not have valid header (mode='{}') - is it VCF/BCF format?".format(filename, mode)) + if isinstance(self.filename, bytes): + cfilename = self.filename + else: + cfilename = NULL + # check for index and open if present - if self.htsfile.format.format == bcf: + if self.htsfile.format.format == bcf and cfilename: if index_filename is not None: cindex_filename = index_filename with nogil: idx = bcf_index_load2(cfilename, cindex_filename) self.index = makeBCFIndex(self.header, idx) - elif self.htsfile.format.compression == bgzf: + elif self.htsfile.format.compression == bgzf and cfilename: if index_filename is not None: cindex_filename = index_filename with nogil: @@ -3530,40 +3708,8 @@ cdef class VariantFile(object): def reset(self): """reset file position to beginning of file just after the header.""" - return self.seek(self.start_offset, 0) + return self.seek(self.start_offset) - def seek(self, uint64_t offset): - """move file pointer to position *offset*, see - :meth:`pysam.VariantFile.tell`.""" - if not self.is_open: - raise ValueError('I/O operation on closed file') - if self.is_stream: - raise OSError('seek not available in streams') - - cdef int64_t ret - if self.htsfile.format.compression != no_compression: - with nogil: - ret = bgzf_seek(hts_get_bgzfp(self.htsfile), offset, SEEK_SET) - else: - with nogil: - ret = hts_useek(self.htsfile, offset, SEEK_SET) - return ret - - def tell(self): - """return current file position, see :meth:`pysam.VariantFile.seek`.""" - if not self.is_open: - raise ValueError('I/O operation on closed file') - if self.is_stream: - raise OSError('tell not available in streams') - - cdef int64_t ret - if self.htsfile.format.compression != no_compression: - with nogil: - ret = bgzf_tell(hts_get_bgzfp(self.htsfile)) - else: - with nogil: - ret = hts_utell(self.htsfile) - return ret def fetch(self, contig=None, start=None, stop=None, region=None, reopen=False): """fetch records in a :term:`region` using 0-based indexing. The @@ -3589,9 +3735,8 @@ cdef class VariantFile(object): if not self.is_open: raise ValueError('I/O operation on closed file') - if not self.mode.startswith(b'r'): - raise ValueError('cannot fetch from Variantfile opened ' - 'for writing') + if self.htsfile.is_write: + raise ValueError('cannot fetch from Variantfile opened for writing') if contig is None and region is None: self.is_reading = 1 @@ -3605,28 +3750,45 @@ cdef class VariantFile(object): self.is_reading = 1 return self.index.fetch(self, contig, start, stop, region, reopen) + cpdef VariantRecord new_record(self): + """Create a new empty VariantRecord""" + return self.header.new_record() + cpdef int write(self, VariantRecord record) except -1: """ write a single :class:`pysam.VariantRecord` to disk. returns the number of bytes written. """ + if record is None: + raise ValueError('record must not be None') + if not self.is_open: return ValueError('I/O operation on closed file') - if not self.mode.startswith(b'w'): + if not self.htsfile.is_write: raise ValueError('cannot write to a Variantfile opened for reading') + if not self.header_written: + self.header_written = True + with nogil: + bcf_hdr_write(self.htsfile, self.header.ptr) + #if record.header is not self.header: + # record.translate(self.header) # raise ValueError('Writing records from a different VariantFile is not yet supported') + if record.ptr.n_sample != bcf_hdr_nsamples(self.header.ptr): + msg = 'Invalid VariantRecord. Number of samples does not match header ({} vs {})' + raise ValueError(msg.format(record.ptr.n_sample, bcf_hdr_nsamples(self.header.ptr))) + cdef int ret with nogil: ret = bcf_write1(self.htsfile, self.header.ptr, record.ptr) if ret < 0: - raise ValueError('write failed') + raise IOError(errno, strerror(errno)) return ret @@ -3638,9 +3800,8 @@ cdef class VariantFile(object): if not self.is_open: raise ValueError('I/O operation on closed file') - if not self.mode.startswith(b'r'): - raise ValueError('cannot subset samples from Variantfile ' - 'opened for writing') + if self.htsfile.is_write: + raise ValueError('cannot subset samples from Variantfile opened for writing') if self.is_reading: raise ValueError('cannot subset samples after fetching records') diff --git a/pysam/libcbgzf.pyx b/pysam/libcbgzf.pyx new file mode 100644 index 0000000..558ceff --- /dev/null +++ b/pysam/libcbgzf.pyx @@ -0,0 +1,209 @@ +"""Functions that read and write block gzipped files. + +The user of the file doesn't have to worry about the compression +and random access is allowed if an index file is present.""" + +# based on Python 3.5's gzip module + +import io + +from libc.stdint cimport int8_t, int16_t, int32_t, int64_t +from libc.stdint cimport uint8_t, uint16_t, uint32_t, uint64_t +from libc.stdlib cimport malloc, calloc, realloc, free + +from cpython.object cimport PyObject +from cpython.bytes cimport PyBytes_FromStringAndSize, _PyBytes_Resize + +from pysam.libcutils cimport force_bytes, force_str, charptr_to_str, charptr_to_str_w_len +from pysam.libchtslib cimport * + + +__all__ = ["BGZFile"] + + +BUFFER_SIZE = io.DEFAULT_BUFFER_SIZE + + +cdef class BGZFile(object): + """The BGZFile class simulates most of the methods of a file object with + the exception of the truncate() method. + + This class only supports opening files in binary mode. If you need to open a + compressed file in text mode, use the gzip.open() function. + """ + cdef BGZF* bgzf + cdef bytes name, index + + def __init__(self, filename, mode=None, index=None): + """Constructor for the BGZFile class. + + The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', 'wb', 'x', or + 'xb' depending on whether the file will be read or written. The default + is the mode of fileobj if discernible; otherwise, the default is 'rb'. + A mode of 'r' is equivalent to one of 'rb', and similarly for 'w' and + 'wb', 'a' and 'ab', and 'x' and 'xb'. + """ + if mode and ('t' in mode or 'U' in mode): + raise ValueError("Invalid mode: {!r}".format(mode)) + if not mode: + mode = 'rb' + if mode and 'b' not in mode: + mode += 'b' + self.name = force_bytes(filename) + self.index = force_bytes(index) if index is not None else None + self.bgzf = bgzf_open(self.name, mode) + + if self.bgzf.is_write and index is not None and bgzf_index_build_init(self.bgzf) < 0: + raise IOError('Error building bgzf index') + + def __dealloc__(self): + self.close() + + def write(self,data): + if not self.bgzf: + raise ValueError("write() on closed BGZFile object") + + if not self.bgzf.is_write: + import errno + raise OSError(errno.EBADF, "write() on read-only BGZFile object") + + if isinstance(data, bytes): + length = len(data) + else: + # accept any data that supports the buffer protocol + data = memoryview(data) + length = data.nbytes + + if length > 0 and bgzf_write(self.bgzf, data, length) < 0: + raise IOError('BGZFile write failed') + + return length + + def read(self, size=-1): + cdef ssize_t read_size + + if not self.bgzf: + raise ValueError("read() on closed BGZFile object") + + if self.bgzf.is_write: + import errno + raise OSError(errno.EBADF, "read() on write-only BGZFile object") + + if size < 0: + chunks = [] + while 1: + chunk = PyBytes_FromStringAndSize(NULL, BUFFER_SIZE) + cdata = chunk + read_size = bgzf_read(self.bgzf, chunk, BUFFER_SIZE) + if read_size < 0: + raise IOError('Error reading from BGZFile') + elif not read_size: + break + elif read_size < BUFFER_SIZE: + chunk = chunk[:read_size] + chunks.append(chunk) + return b''.join(chunks) + + elif size > 0: + chunk = PyBytes_FromStringAndSize(NULL, size) + read_size = bgzf_read(self.bgzf, chunk, size) + if read_size < 0: + raise IOError('Error reading from BGZFile') + elif read_size < size: + chunk = chunk[:size] + return chunk + else: + return b'' + + @property + def closed(self): + return self.bgzf == NULL + + def close(self): + if not self.bgzf: + return + + if self.bgzf.is_write and bgzf_flush(self.bgzf) < 0: + raise IOError('Error flushing BGZFile object') + + if self.index and bgzf_index_dump(self.bgzf, self.index, NULL) < 0: + raise IOError('Cannot write index') + + cdef ret = bgzf_close(self.bgzf) + self.bgzf = NULL + + if ret < 0: + raise IOError('Error closing BGZFile object') + + def __enter__(self): + return self + + def __exit__(self, type, value, tb): + self.close() + + def flush(self): + if not self.bgzf: + return + + if self.bgzf.is_write and bgzf_flush(self.bgzf) < 0: + raise IOError('Error flushing BGZFile object') + + def fileno(self): + """Invoke the underlying file object's fileno() method. + + This will raise AttributeError if the underlying file object + doesn't support fileno(). + """ + raise AttributeError('fileno') + + def rewind(self): + '''Return the uncompressed stream file position indicator to the + beginning of the file''' + if not self.bgzf: + raise ValueError("rewind() on closed BGZFile object") + if not self.bgzf.is_write: + raise OSError("Can't rewind in write mode") + if bgzf_seek(self.bgzf, 0, SEEK_SET) < 0: + raise IOError('Error seeking BGZFFile object') + + def readable(self): + if not self.bgzf: + raise ValueError("readable() on closed BGZFile object") + return self.bgzf != NULL and not self.bgzf.is_write + + def writable(self): + return self.bgzf != NULL and self.bgzf.is_write + + def seekable(self): + return True + + def seek(self, offset, whence=io.SEEK_SET): + if not self.bgzf: + raise ValueError("seek() on closed BGZFile object") + if whence is not io.SEEK_SET: + raise ValueError('Seek from end not supported') + + cdef int64_t off = bgzf_seek(self.bgzf, offset, SEEK_SET) + if off < 0: + raise IOError('Error seeking BGZFFile object') + + return off + + def readline(self, size=-1): + if not self.bgzf: + raise ValueError("readline() on closed BGZFile object") + + cdef kstring_t line + cdef char c + + line.l = line.m = 0 + line.s = NULL + if bgzf_getline(self.bgzf, '\n', &line) < 0: + raise IOError('Error reading line in BGZFFile object') + + ret = charptr_to_str_w_len(line.s, line.l) + + if line.m: + free(line.s) + + return ret diff --git a/pysam/cfaidx.pxd b/pysam/libcfaidx.pxd similarity index 94% rename from pysam/cfaidx.pxd rename to pysam/libcfaidx.pxd index 7749274..2f5f44b 100644 --- a/pysam/cfaidx.pxd +++ b/pysam/libcfaidx.pxd @@ -6,7 +6,7 @@ from libc.stdio cimport FILE, printf cimport cython from cpython cimport array -from pysam.chtslib cimport faidx_t, kstring_t, BGZF +from pysam.libchtslib cimport faidx_t, kstring_t, BGZF # These functions are put here and not in chtslib.pxd in order # to avoid warnings for unused functions. @@ -50,7 +50,7 @@ cdef class FastqProxy: cdef class PersistentFastqProxy: """ - Python container for pysam.cfaidx.FastqProxy with persistence. + Python container for pysam.libcfaidx.FastqProxy with persistence. """ cdef public str comment, quality, sequence, name cdef cython.str tostring(self) diff --git a/pysam/cfaidx.pyx b/pysam/libcfaidx.pyx similarity index 97% rename from pysam/cfaidx.pyx rename to pysam/libcfaidx.pyx index 78f9aac..774152d 100644 --- a/pysam/cfaidx.pyx +++ b/pysam/libcfaidx.pyx @@ -57,15 +57,15 @@ from cpython cimport PyErr_SetString, \ from cpython.version cimport PY_MAJOR_VERSION -from pysam.chtslib cimport \ +from pysam.libchtslib cimport \ faidx_nseq, fai_load, fai_destroy, fai_fetch, \ faidx_seq_len, \ faidx_fetch_seq, hisremote, \ bgzf_open, bgzf_close -from pysam.cutils cimport force_bytes, force_str, charptr_to_str -from pysam.cutils cimport encode_filename, from_string_and_size -from pysam.cutils cimport qualitystring_to_array, parse_region +from pysam.libcutils cimport force_bytes, force_str, charptr_to_str +from pysam.libcutils cimport encode_filename, from_string_and_size +from pysam.libcutils cimport qualitystring_to_array, parse_region cdef class FastqProxy cdef makeFastqProxy(kseq_t * src): @@ -371,7 +371,7 @@ cdef class FastqProxy: cdef class PersistentFastqProxy: """ - Python container for pysam.cfaidx.FastqProxy with persistence. + Python container for pysam.libcfaidx.FastqProxy with persistence. Needed to compare multiple fastq records from the same file. """ def __init__(self, FastqProxy FastqRead): @@ -568,4 +568,5 @@ cdef class Fastafile(FastaFile): __all__ = ["FastaFile", "FastqFile", "FastxFile", - "Fastafile"] + "Fastafile", + "FastqProxy"] diff --git a/pysam/chtslib.pxd b/pysam/libchtslib.pxd similarity index 99% rename from pysam/chtslib.pxd rename to pysam/libchtslib.pxd index 33c1559..657a754 100644 --- a/pysam/chtslib.pxd +++ b/pysam/libchtslib.pxd @@ -1290,6 +1290,9 @@ cdef extern from "htslib/vcf.h" nogil: uint8_t BCF_ERR_TAG_UNDEF uint8_t BCF_ERR_NCOLS uint8_t BCF_ERR_LIMITS + uint8_t BCF_ERR_CHAR + uint8_t BCF_ERR_CTG_INVALID + uint8_t BCF_ERR_TAG_INVALID # The bcf1_t structure corresponds to one VCF/BCF line. Reading from VCF file # is slower because the string is first to be parsed, packed into BCF line @@ -1896,3 +1899,18 @@ cdef extern from "htslib/vcfutils.h" nogil: # @i,j: allele indexes, 0-based, i<=j # Returns index to the Number=G diploid array uint32_t bcf_ij2G(uint32_t i, uint32_t j) + + +cdef class HTSFile(object): + cdef htsFile *htsfile # pointer to htsFile structure + cdef int64_t start_offset # BGZF offset of first record + + cdef readonly object filename # filename as supplied by user + cdef readonly object mode # file opening mode + cdef readonly object index_filename # filename of index, if supplied by user + + cdef readonly bint is_stream # Is htsfile a non-seekable stream + cdef readonly bint is_remote # Is htsfile a remote stream + cdef readonly bint duplicate_filehandle # Duplicate filehandle when opening via fh + + cdef htsFile *_open_htsfile(self) except? NULL diff --git a/pysam/libchtslib.pyx b/pysam/libchtslib.pyx new file mode 100644 index 0000000..7eea059 --- /dev/null +++ b/pysam/libchtslib.pyx @@ -0,0 +1,265 @@ +# cython: embedsignature=True +# cython: profile=True +# adds doc-strings for sphinx +import os + +from posix.unistd cimport dup + +from pysam.libchtslib cimport * + +from pysam.libcutils cimport force_bytes, force_str, charptr_to_str, charptr_to_str_w_len +from pysam.libcutils cimport encode_filename, from_string_and_size + + +__all__ = ["get_verbosity", "set_verbosity"] + + +######################################################################## +######################################################################## +## Constants +######################################################################## + +cdef int MAX_POS = 2 << 29 +cdef tuple FORMAT_CATEGORIES = ('UNKNOWN', 'ALIGNMENTS', 'VARIANTS', 'INDEX', 'REGIONS') +cdef tuple FORMATS = ('UNKNOWN', 'BINARY_FORMAT', 'TEXT_FORMAT', 'SAM', 'BAM', 'BAI', 'CRAM', 'CRAI', + 'VCF', 'BCF', 'CSI', 'GZI', 'TBI', 'BED') +cdef tuple COMPRESSION = ('NONE', 'GZIP', 'BGZF', 'CUSTOM') + + +cpdef set_verbosity(int verbosity): + """Set htslib's hts_verbose global variable to the specified value.""" + return hts_set_verbosity(verbosity) + +cpdef get_verbosity(): + """Return the value of htslib's hts_verbose global variable.""" + return hts_get_verbosity() + + +class CallableValue(object): + def __init__(self, value): + self.value = value + def __call__(self): + return self.value + def __bool__(self): + return self.value + def __nonzero__(self): + return self.value + def __eq__(self, other): + return self.value == other + def __ne__(self, other): + return self.value != other + + +CTrue = CallableValue(True) +CFalse = CallableValue(False) + + +cdef class HTSFile(object): + """ + Base class for HTS file types + """ + def __cinit__(self, *args, **kwargs): + self.htsfile = NULL + self.duplicate_filehandle = True + + def __dealloc__(self): + if self.htsfile: + hts_close(self.htsfile) + self.htsfile = NULL + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_value, traceback): + self.close() + return False + + @property + def category(self): + """General file format category. One of UNKNOWN, ALIGNMENTS, + VARIANTS, INDEX, REGIONS""" + if not self.htsfile: + raise ValueError('metadata not available on closed file') + return FORMAT_CATEGORIES[self.htsfile.format.category] + + @property + def format(self): + """File format. + + One of UNKNOWN, BINARY_FORMAT, TEXT_FORMAT, SAM, BAM, + BAI, CRAM, CRAI, VCF, BCF, CSI, GZI, TBI, BED. + """ + if not self.htsfile: + raise ValueError('metadata not available on closed file') + return FORMATS[self.htsfile.format.format] + + @property + def version(self): + """Tuple of file format version numbers (major, minor)""" + if not self.htsfile: + raise ValueError('metadata not available on closed file') + return self.htsfile.format.version.major, self.htsfile.format.version.minor + + @property + def compression(self): + """File compression. + + One of NONE, GZIP, BGZF, CUSTOM.""" + if not self.htsfile: + raise ValueError('metadata not available on closed file') + return COMPRESSION[self.htsfile.format.compression] + + @property + def description(self): + """Vaguely human readable description of the file format""" + if not self.htsfile: + raise ValueError('metadata not available on closed file') + cdef char *desc = hts_format_description(&self.htsfile.format) + try: + return charptr_to_str(desc) + finally: + free(desc) + + @property + def is_open(self): + """return True if HTSFile is open and in a valid state.""" + return CTrue if self.htsfile != NULL else CFalse + + @property + def is_closed(self): + """return True if HTSFile is closed.""" + return self.htsfile == NULL + + @property + def closed(self): + """return True if HTSFile is closed.""" + return self.htsfile == NULL + + @property + def is_write(self): + """return True if HTSFile is open for writing""" + return self.htsfile != NULL and self.htsfile.is_write != 0 + + @property + def is_read(self): + """return True if HTSFile is open for reading""" + return self.htsfile != NULL and self.htsfile.is_write == 0 + + @property + def is_sam(self): + """return True if HTSFile is reading or writing a SAM alignment file""" + return self.htsfile != NULL and self.htsfile.format.format == sam + + @property + def is_bam(self): + """return True if HTSFile is reading or writing a BAM alignment file""" + return self.htsfile != NULL and self.htsfile.format.format == bam + + @property + def is_cram(self): + """return True if HTSFile is reading or writing a BAM alignment file""" + return self.htsfile != NULL and self.htsfile.format.format == cram + + @property + def is_vcf(self): + """return True if HTSFile is reading or writing a VCF variant file""" + return self.htsfile != NULL and self.htsfile.format.format == vcf + + @property + def is_bcf(self): + """return True if HTSFile is reading or writing a BCF variant file""" + return self.htsfile != NULL and self.htsfile.format.format == bcf + + def reset(self): + """reset file position to beginning of file just after the header. + + Returns + ------- + + The file position after moving the file pointer. + + """ + return self.seek(self.start_offset) + + def seek(self, uint64_t offset): + """move file pointer to position *offset*, see :meth:`pysam.HTSFile.tell`.""" + if not self.is_open: + raise ValueError('I/O operation on closed file') + if self.is_stream: + raise OSError('seek not available in streams') + + cdef int64_t ret + if self.htsfile.format.compression != no_compression: + with nogil: + ret = bgzf_seek(hts_get_bgzfp(self.htsfile), offset, SEEK_SET) + else: + with nogil: + ret = hts_useek(self.htsfile, offset, SEEK_SET) + return ret + + def tell(self): + """return current file position, see :meth:`pysam.HTSFile.seek`.""" + if not self.is_open: + raise ValueError('I/O operation on closed file') + if self.is_stream: + raise OSError('tell not available in streams') + + cdef int64_t ret + if self.htsfile.format.compression != no_compression: + with nogil: + ret = bgzf_tell(hts_get_bgzfp(self.htsfile)) + else: + with nogil: + ret = hts_utell(self.htsfile) + return ret + + cdef htsFile *_open_htsfile(self) except? NULL: + cdef char *cfilename + cdef char *cmode = self.mode + cdef int fd, dup_fd + + if isinstance(self.filename, bytes): + cfilename = self.filename + with nogil: + return hts_open(cfilename, cmode) + else: + if isinstance(self.filename, int): + fd = self.filename + else: + fd = self.filename.fileno() + + if self.duplicate_filehandle: + dup_fd = dup(fd) + else: + dup_fd = fd + + # Replicate mode normalization done in hts_open_format + smode = self.mode.replace(b'b', b'').replace(b'c', b'') + if b'b' in self.mode: + smode += b'b' + elif b'c' in self.mode: + smode += b'c' + cmode = smode + + hfile = hdopen(dup_fd, cmode) + if hfile == NULL: + raise IOError('Cannot create hfile') + + try: + # filename.name can be an int + filename = str(self.filename.name) + except AttributeError: + filename = ''.format(fd) + + filename = encode_filename(filename) + cfilename = filename + with nogil: + return hts_hopen(hfile, cfilename, cmode) + + def _exists(self): + """return False iff file is local, a file and exists. + """ + return (not isinstance(self.filename, (str, bytes)) or + self.filename == b'-' or + self.is_remote or + os.path.exists(self.filename)) diff --git a/pysam/csamfile.pxd b/pysam/libcsamfile.pxd similarity index 93% rename from pysam/csamfile.pxd rename to pysam/libcsamfile.pxd index a76a599..de36998 100644 --- a/pysam/csamfile.pxd +++ b/pysam/libcsamfile.pxd @@ -1,10 +1,10 @@ -from pysam.calignmentfile cimport AlignedSegment, AlignmentFile +from pysam.libcalignmentfile cimport AlignedSegment, AlignmentFile ################################################# # Compatibility Layer for pysam < 0.8 # import all declarations from htslib -from pysam.chtslib cimport * +from pysam.libchtslib cimport * cdef class AlignedRead(AlignedSegment): pass diff --git a/pysam/csamfile.pyx b/pysam/libcsamfile.pyx similarity index 92% rename from pysam/csamfile.pyx rename to pysam/libcsamfile.pyx index ed9d79b..bde93d8 100644 --- a/pysam/csamfile.pyx +++ b/pysam/libcsamfile.pyx @@ -19,7 +19,7 @@ from cpython cimport PyErr_SetString, \ from cpython.version cimport PY_MAJOR_VERSION -from pysam.calignmentfile cimport AlignmentFile, AlignedSegment +from pysam.libcalignmentfile cimport AlignmentFile, AlignedSegment cdef class Samfile(AlignmentFile): diff --git a/pysam/ctabix.pxd b/pysam/libctabix.pxd similarity index 89% rename from pysam/ctabix.pxd rename to pysam/libctabix.pxd index 028090e..12cd9dd 100644 --- a/pysam/ctabix.pxd +++ b/pysam/libctabix.pxd @@ -13,8 +13,9 @@ cdef extern from "unistd.h" nogil: ssize_t read(int fd, void *buf, size_t count) int close(int fd) -from pysam.chtslib cimport hts_idx_t, hts_itr_t, htsFile, \ - tbx_t, kstring_t, BGZF +from pysam.libchtslib cimport hts_idx_t, hts_itr_t, htsFile, \ + tbx_t, kstring_t, BGZF, HTSFile + # These functions are put here and not in chtslib.pxd in order # to avoid warnings for unused functions. @@ -55,40 +56,39 @@ cdef class tabix_file_iterator: cdef __cnext__(self) -cdef class TabixFile: - # pointer to tabixfile - cdef htsFile * tabixfile +cdef class TabixFile(HTSFile): # pointer to index structure cdef tbx_t * index - # flag indicating whether file is remote - cdef int is_remote - - cdef object _filename - cdef object _filename_index + cdef readonly object filename_index cdef Parser parser cdef encoding + cdef class Parser: cdef encoding - cdef parse(self, char * buffer, int len) + cdef class asTuple(Parser): cdef parse(self, char * buffer, int len) + cdef class asGTF(Parser): pass + cdef class asBed(Parser): pass + cdef class asVCF(Parser): pass + cdef class TabixIterator: cdef hts_itr_t * iterator cdef TabixFile tabixfile @@ -96,9 +96,11 @@ cdef class TabixIterator: cdef encoding cdef int __cnext__(self) + cdef class TabixIteratorParsed(TabixIterator): cdef Parser parser + cdef class GZIterator: cdef object _filename cdef BGZF * gzipfile @@ -107,13 +109,15 @@ cdef class GZIterator: cdef int __cnext__(self) cdef encoding + cdef class GZIteratorHead(GZIterator): pass + cdef class GZIteratorParsed(GZIterator): cdef Parser parser + # Compatibility Layer for pysam < 0.8 cdef class Tabixfile(TabixFile): pass - diff --git a/pysam/ctabix.pyx b/pysam/libctabix.pyx similarity index 94% rename from pysam/ctabix.pyx rename to pysam/libctabix.pyx index a23fa87..10dc23b 100644 --- a/pysam/ctabix.pyx +++ b/pysam/libctabix.pyx @@ -66,16 +66,16 @@ from cpython cimport PyErr_SetString, PyBytes_Check, \ from cpython.version cimport PY_MAJOR_VERSION -cimport pysam.ctabixproxies as ctabixproxies +cimport pysam.libctabixproxies as ctabixproxies -from pysam.chtslib cimport htsFile, hts_open, hts_close, HTS_IDX_START,\ +from pysam.libchtslib cimport htsFile, hts_open, hts_close, HTS_IDX_START,\ BGZF, bgzf_open, bgzf_dopen, bgzf_close, bgzf_write, \ tbx_index_build, tbx_index_load, tbx_itr_queryi, tbx_itr_querys, \ tbx_conf_t, tbx_seqnames, tbx_itr_next, tbx_itr_destroy, \ - tbx_destroy, hisremote + tbx_destroy, hisremote, region_list -from pysam.cutils cimport force_bytes, force_str, charptr_to_str -from pysam.cutils cimport encode_filename, from_string_and_size +from pysam.libcutils cimport force_bytes, force_str, charptr_to_str +from pysam.libcutils cimport encode_filename, from_string_and_size cdef class Parser: @@ -290,14 +290,16 @@ cdef class TabixFile: """ def __cinit__(self, filename, - mode = 'r', + mode='r', parser=None, index=None, encoding="ascii", *args, **kwargs ): - self.tabixfile = NULL + self.htsfile = NULL + self.is_remote = False + self.is_stream = False self.parser = parser self._open(filename, mode, index, *args, **kwargs) self.encoding = encoding @@ -307,21 +309,22 @@ cdef class TabixFile: mode='r', index=None, ): - '''open a :term:`tabix file` for reading. - ''' + '''open a :term:`tabix file` for reading.''' - assert mode in ("r",), "invalid file opening mode `%s`" % mode + if mode != 'r': + raise ValueError("invalid file opening mode `%s`" % mode) - if self.tabixfile != NULL: + if self.htsfile != NULL: self.close() - self.tabixfile = NULL + self.htsfile = NULL filename_index = index or (filename + ".tbi") # encode all the strings to pass to tabix - self._filename = encode_filename(filename) - self._filename_index = encode_filename(filename_index) + self.filename = encode_filename(filename) + self.filename_index = encode_filename(filename_index) - self.is_remote = hisremote(self._filename) + self.is_stream = self.filename == b'-' + self.is_remote = hisremote(self.filename) if not self.is_remote: if not os.path.exists(filename): @@ -331,36 +334,37 @@ cdef class TabixFile: raise IOError("index `%s` not found" % filename_index) # open file - cdef char *cfilename = self._filename + cdef char *cfilename = self.filename with nogil: - self.tabixfile = hts_open(cfilename, 'r') + self.htsfile = hts_open(cfilename, 'r') - if self.tabixfile == NULL: + if self.htsfile == NULL: raise IOError("could not open file `%s`" % filename) - cfilename = self._filename_index + #if self.htsfile.format.category != region_list: + # raise ValueError("file does not contain region data") + + cfilename = self.filename_index with nogil: self.index = tbx_index_load(cfilename) if self.index == NULL: raise IOError("could not open index for `%s`" % filename) + if not self.is_stream: + self.start_offset = self.tell() + def _dup(self): '''return a copy of this tabix file. The file is being re-opened. ''' - return TabixFile(self._filename, + return TabixFile(self.filename, mode="r", parser=self.parser, - index=self._filename_index, + index=self.filename_index, encoding=self.encoding) - def is_open(self): - '''return true if samfile has been opened.''' - return self.tabixfile != NULL - - def fetch(self, reference=None, start=None, @@ -472,33 +476,11 @@ cdef class TabixFile: return a - # context manager interface - def __enter__(self): - return self - - def __exit__(self, exc_type, exc_value, traceback): - self.close() - return False - ############################################################### ############################################################### ############################################################### ## properties ############################################################### - property closed: - """"bool indicating the current state of the file object. - This is a read-only attribute; the close() method changes the value. - """ - def __get__(self): - return not self.is_open() - - property filename: - '''filename associated with this object.''' - def __get__(self): - if not self.is_open(): - raise ValueError("I/O operation on closed file") - return self._filename - property header: '''the file header. @@ -543,9 +525,9 @@ cdef class TabixFile: def close(self): ''' closes the :class:`pysam.TabixFile`.''' - if self.tabixfile != NULL: - hts_close(self.tabixfile) - self.tabixfile = NULL + if self.htsfile != NULL: + hts_close(self.htsfile) + self.htsfile = NULL if self.index != NULL: tbx_destroy(self.index) self.index = NULL @@ -554,9 +536,9 @@ cdef class TabixFile: # remember: dealloc cannot call other python methods # note: no doc string # note: __del__ is not called. - if self.tabixfile != NULL: - hts_close(self.tabixfile) - self.tabixfile = NULL + if self.htsfile != NULL: + hts_close(self.htsfile) + self.htsfile = NULL if self.index != NULL: tbx_destroy(self.index) @@ -582,7 +564,7 @@ cdef class TabixIterator: Return -5 if file has been closed when this function was called. ''' - if self.tabixfile.tabixfile == NULL: + if self.tabixfile.htsfile == NULL: return -5 cdef int retval @@ -590,7 +572,7 @@ cdef class TabixIterator: while 1: with nogil: retval = tbx_itr_next( - self.tabixfile.tabixfile, + self.tabixfile.htsfile, self.tabixfile.index, self.iterator, &self.buffer) diff --git a/pysam/ctabixproxies.pxd b/pysam/libctabixproxies.pxd similarity index 100% rename from pysam/ctabixproxies.pxd rename to pysam/libctabixproxies.pxd diff --git a/pysam/ctabixproxies.pyx b/pysam/libctabixproxies.pyx similarity index 99% rename from pysam/ctabixproxies.pyx rename to pysam/libctabixproxies.pyx index f5288cc..9a8a678 100644 --- a/pysam/ctabixproxies.pyx +++ b/pysam/libctabixproxies.pyx @@ -5,8 +5,8 @@ from libc.string cimport strcpy, strlen, memcmp, memcpy, memchr, strstr, strchr from libc.stdlib cimport free, malloc, calloc, realloc from libc.stdlib cimport atoi, atol, atof -from pysam.cutils cimport force_bytes, force_str, charptr_to_str -from pysam.cutils cimport encode_filename, from_string_and_size +from pysam.libcutils cimport force_bytes, force_str, charptr_to_str +from pysam.libcutils cimport encode_filename, from_string_and_size import collections diff --git a/pysam/cutils.pxd b/pysam/libcutils.pxd similarity index 100% rename from pysam/cutils.pxd rename to pysam/libcutils.pxd diff --git a/pysam/cutils.pyx b/pysam/libcutils.pyx similarity index 92% rename from pysam/cutils.pyx rename to pysam/libcutils.pyx index 7510727..80bd9e4 100644 --- a/pysam/cutils.pyx +++ b/pysam/libcutils.pyx @@ -7,7 +7,7 @@ import os import io from contextlib import contextmanager -from cpython.version cimport PY_MAJOR_VERSION +from cpython.version cimport PY_MAJOR_VERSION, PY_MINOR_VERSION from cpython cimport PyBytes_Check, PyUnicode_Check from cpython cimport array as c_array from libc.stdlib cimport calloc, free @@ -36,10 +36,10 @@ cpdef array_to_qualitystring(c_array.array qualities, int offset=33): if qualities is None: return None cdef int x - + cdef c_array.array result result = c_array.clone(qualities, len(qualities), zero=False) - + for x from 0 <= x < len(qualities): result[x] = qualities[x] + offset return force_str(result.tostring()) @@ -76,6 +76,7 @@ cpdef qualities_to_qualitystring(qualities, int offset=33): ######################################################################## ## Python 3 compatibility functions ######################################################################## + cdef bint IS_PYTHON3 = PY_MAJOR_VERSION >= 3 cdef from_string_and_size(const char* s, size_t length): @@ -84,42 +85,39 @@ cdef from_string_and_size(const char* s, size_t length): else: return s[:length] -# filename encoding (copied from lxml.etree.pyx) -cdef str _FILENAME_ENCODING -_FILENAME_ENCODING = sys.getfilesystemencoding() -if _FILENAME_ENCODING is None: - _FILENAME_ENCODING = sys.getdefaultencoding() -if _FILENAME_ENCODING is None: - _FILENAME_ENCODING = 'ascii' -#cdef char* _C_FILENAME_ENCODING -#_C_FILENAME_ENCODING = _FILENAME_ENCODING +# filename encoding (adapted from lxml.etree.pyx) +cdef str FILENAME_ENCODING = sys.getfilesystemencoding() or sys.getdefaultencoding() or 'ascii' + cdef bytes encode_filename(object filename): """Make sure a filename is 8-bit encoded (or None).""" if filename is None: return None + elif PY_MAJOR_VERSION >= 3 and PY_MINOR_VERSION >= 2: + # Added to support path-like objects + return os.fsencode(filename) elif PyBytes_Check(filename): return filename elif PyUnicode_Check(filename): - return filename.encode(_FILENAME_ENCODING) + return filename.encode(FILENAME_ENCODING) else: - raise TypeError(u"Argument must be string or unicode.") + raise TypeError("Argument must be string or unicode.") + cdef bytes force_bytes(object s, encoding="ascii"): - u"""convert string or unicode object to bytes, assuming + """convert string or unicode object to bytes, assuming ascii encoding. """ - if not IS_PYTHON3: - return s - elif s is None: + if s is None: return None elif PyBytes_Check(s): return s elif PyUnicode_Check(s): return s.encode(encoding) else: - raise TypeError(u"Argument must be string, bytes or unicode.") + raise TypeError("Argument must be string, bytes or unicode.") + cdef charptr_to_str(const char* s, encoding="ascii"): if s == NULL: @@ -129,6 +127,7 @@ cdef charptr_to_str(const char* s, encoding="ascii"): else: return s.decode(encoding) + cdef charptr_to_str_w_len(const char* s, size_t n, encoding="ascii"): if s == NULL: return None @@ -137,12 +136,14 @@ cdef charptr_to_str_w_len(const char* s, size_t n, encoding="ascii"): else: return s[:n].decode(encoding) + cdef bytes charptr_to_bytes(const char* s, encoding="ascii"): if s == NULL: return None else: return s + cdef force_str(object s, encoding="ascii"): """Return s converted to str type of current Python (bytes in Py2, unicode in Py3)""" @@ -156,6 +157,7 @@ cdef force_str(object s, encoding="ascii"): # assume unicode return s + cpdef parse_region(reference=None, start=None, end=None, @@ -283,7 +285,9 @@ def _pysam_dispatch(collection, if method not in ("index", "roh", "stats"): stdout_option = "-o {}" elif method in MAP_STDOUT_OPTIONS[collection]: - stdout_option = MAP_STDOUT_OPTIONS[collection][method] + # special case - samtools view -c outputs on stdout + if not(method == "view" and "-c" in args): + stdout_option = MAP_STDOUT_OPTIONS[collection][method] if stdout_option is not None: os.close(stdout_h) diff --git a/pysam/cvcf.pxd b/pysam/libcvcf.pxd similarity index 100% rename from pysam/cvcf.pxd rename to pysam/libcvcf.pxd diff --git a/pysam/cvcf.pyx b/pysam/libcvcf.pyx similarity index 99% rename from pysam/cvcf.pyx rename to pysam/libcvcf.pyx index 5e2fda2..956f8a5 100644 --- a/pysam/cvcf.pyx +++ b/pysam/libcvcf.pyx @@ -54,10 +54,10 @@ from libc.stdlib cimport atoi from libc.stdint cimport int8_t, int16_t, int32_t, int64_t from libc.stdint cimport uint8_t, uint16_t, uint32_t, uint64_t -cimport pysam.ctabix as ctabix -cimport pysam.ctabixproxies as ctabixproxies +cimport pysam.libctabix as libctabix +cimport pysam.libctabixproxies as libctabixproxies -from pysam.cutils cimport force_str +from pysam.libcutils cimport force_str import pysam @@ -101,7 +101,7 @@ FORMAT = namedtuple('FORMAT','id numbertype number type description missingvalue # ########################################################################################################### -cdef class VCFRecord( ctabixproxies.TupleProxy): +cdef class VCFRecord(libctabixproxies.TupleProxy): '''vcf record. initialized from data and vcf meta @@ -141,7 +141,7 @@ cdef class VCFRecord( ctabixproxies.TupleProxy): nbytes does not include the terminal '\0'. ''' - ctabixproxies.TupleProxy.update(self, buffer, nbytes) + libctabixproxies.TupleProxy.update(self, buffer, nbytes) self.contig = self.fields[0] # vcf counts from 1 - correct here @@ -238,7 +238,7 @@ cdef class VCFRecord( ctabixproxies.TupleProxy): return result -cdef class asVCFRecord(ctabix.Parser): +cdef class asVCFRecord(libctabix.Parser): '''converts a :term:`tabix row` into a VCF record.''' cdef vcffile def __init__(self, vcffile): diff --git a/pysam/utils.py b/pysam/utils.py index c5bb539..5c045df 100644 --- a/pysam/utils.py +++ b/pysam/utils.py @@ -1,4 +1,4 @@ -from pysam.cutils import _pysam_dispatch +from pysam.libcutils import _pysam_dispatch class SamtoolsError(Exception): diff --git a/pysam/version.py b/pysam/version.py index 0a985de..facb3bb 100644 --- a/pysam/version.py +++ b/pysam/version.py @@ -1,7 +1,9 @@ # pysam versioning information -__version__ = "0.9.1.4" +__version__ = "0.10.0" __samtools_version__ = "1.3.1" -__htslib_version__ = "1.3.1" +__bcftools_version__ = "1.3.1" + +__htslib_version__ = "1.3.2" diff --git a/requirements.txt b/requirements.txt index 687929a..6e8fc44 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1 @@ -cython>=0.22 +cython>=0.24.1 diff --git a/run_tests_travis.sh b/run_tests_travis.sh index 414043e..a229ff5 100755 --- a/run_tests_travis.sh +++ b/run_tests_travis.sh @@ -6,75 +6,36 @@ WORKDIR=`pwd` #Install miniconda python if [ $TRAVIS_OS_NAME == "osx" ]; then - curl -O https://repo.continuum.io/miniconda/Miniconda3-latest-MacOSX-x86_64.sh - bash Miniconda3-latest-MacOSX-x86_64.sh -b + wget https://repo.continuum.io/miniconda/Miniconda3-latest-MacOSX-x86_64.sh -O Miniconda3.sh else - curl -O https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh - bash Miniconda3-latest-Linux-x86_64.sh -b + wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O Miniconda3.sh --no-check-certificate # Default OS versions are old and have SSL / CERT issues fi +bash Miniconda3.sh -b + # Create a new conda environment with the target python version ~/miniconda3/bin/conda install conda-build -y -~/miniconda3/bin/conda create -q -y --name testenv python=$CONDA_PY cython numpy nose +~/miniconda3/bin/conda create -q -y --name testenv python=$CONDA_PY cython numpy nose psutil pip + +# activate testenv environment +source ~/miniconda3/bin/activate testenv -# Add new conda environment to PATH -export PATH=~/miniconda3/envs/testenv/bin/:$PATH +conda config --add channels conda-forge +conda config --add channels defaults +conda config --add channels r +conda config --add channels bioconda -# Hack to force linking to anaconda libraries rather than system libraries -#export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:~/miniconda3/envs/testenv/lib/ -#export DYLD_LIBRARY_PATH=$DYLD_LIBRARY_PATH:~/miniconda3/envs/testenv/lib/ +conda install -y samtools bcftools htslib # Need to make C compiler and linker use the anaconda includes and libraries: export PREFIX=~/miniconda3/ export CFLAGS="-I${PREFIX}/include -L${PREFIX}/lib" export HTSLIB_CONFIGURE_OPTIONS="--disable-libcurl" -# create a new folder to store external tools -mkdir -p $WORKDIR/external-tools - -# install htslib -cd $WORKDIR/external-tools -curl -L https://github.com/samtools/htslib/releases/download/1.3.1/htslib-1.3.1.tar.bz2 > htslib-1.3.1.tar.bz2 -tar xjvf htslib-1.3.1.tar.bz2 -cd htslib-1.3.1 -make -PATH=$PATH:$WORKDIR/external-tools/htslib-1.3.1 -LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$WORKDIR/external-tools/htslib-1.3.1 - -# install samtools, compile against htslib -cd $WORKDIR/external-tools -curl -L http://downloads.sourceforge.net/project/samtools/samtools/1.3.1/samtools-1.3.1.tar.bz2 > samtools-1.3.1.tar.bz2 -tar xjvf samtools-1.3.1.tar.bz2 -cd samtools-1.3.1 -./configure --with-htslib=../htslib-1.3.1 -make -PATH=$PATH:$WORKDIR/external-tools/samtools-1.3.1 - -echo "installed samtools" samtools --version - -if [ $? != 0 ]; then - exit 1 -fi - -# install bcftools -cd $WORKDIR/external-tools -curl -L https://github.com/samtools/bcftools/releases/download/1.3.1/bcftools-1.3.1.tar.bz2 > bcftools-1.3.1.tar.bz2 -tar xjf bcftools-1.3.1.tar.bz2 -cd bcftools-1.3.1 -./configure --with-htslib=../htslib-1.3.1 -make -PATH=$PATH:$WORKDIR/external-tools/bcftools-1.3.1 - -echo "installed bcftools" +htslib --version bcftools --version -if [ $? != 0 ]; then - exit 1 -fi - -popd - # Try building conda recipe first ~/miniconda3/bin/conda-build ci/conda-recipe/ --python=$CONDA_PY @@ -105,9 +66,10 @@ if [ $? != 0 ]; then exit 1 fi -# build source tar-ball +# build source tar-ball. Make sure to build so that .pyx files +# are cythonized. cd .. -python setup.py sdist +python setup.py build sdist if [ $? != 0 ]; then exit 1 @@ -123,7 +85,7 @@ fi # test pip installation from tar-ball with cython echo "pip installing with cython" -pip install --verbose --no-deps --no-use-wheel dist/pysam-*.tar.gz +pip install --verbose --no-deps --no-binary=:all: dist/pysam-*.tar.gz if [ $? != 0 ]; then exit 1 @@ -131,10 +93,10 @@ fi # attempt pip installation without cython echo "pip installing without cython" -~/miniconda3/bin/conda remove cython +~/miniconda3/bin/conda remove -y cython ~/miniconda3/bin/conda list -echo "pthyon is" `which python` -pip install --verbose --no-deps --no-use-wheel --force-reinstall --upgrade dist/pysam-*.tar.gz +echo "python is" `which python` +pip install --verbose --no-deps --no-binary=:all: --force-reinstall --upgrade dist/pysam-*.tar.gz if [ $? != 0 ]; then exit 1 @@ -144,7 +106,7 @@ fi # command line options echo "pip installing without cython and no configure options" export HTSLIB_CONFIGURE_OPTIONS="" -pip install --verbose --no-deps --no-use-wheel --force-reinstall --upgrade dist/pysam-*.tar.gz +pip install --verbose --no-deps --no-binary=:all: --force-reinstall --upgrade dist/pysam-*.tar.gz if [ $? != 0 ]; then exit 1 diff --git a/samtools/sam_view.c.pysam.c b/samtools/sam_view.c.pysam.c index 3d5ffa5..8c883b0 100644 --- a/samtools/sam_view.c.pysam.c +++ b/samtools/sam_view.c.pysam.c @@ -489,9 +489,9 @@ int main_samview(int argc, char *argv[]) } view_end: - if (is_count && ret == 0) + if (is_count && ret == 0) fprintf(pysam_stdout, "%" PRId64 "\n", count); - + // close files, free and return if (in) check_sam_close("view", in, fn_in, "standard input", &ret); if (out) check_sam_close("view", out, fn_out, "standard output", &ret); diff --git a/setup.py b/setup.py index e301f11..6d52617 100644 --- a/setup.py +++ b/setup.py @@ -60,13 +60,18 @@ def run_configure(option): def run_make_print_config(): - stdout = subprocess.check_output(["make", "print-config"]) + stdout = subprocess.check_output(["make", "-s", "print-config"]) if IS_PYTHON3: stdout = stdout.decode("ascii") - result = dict([[x.strip() for x in line.split("=")] - for line in stdout.splitlines()]) - return result + make_print_config = {} + for line in stdout.splitlines(): + if "=" in line: + row = line.split("=") + if len(row) == 2: + make_print_config.update( + {row[0].strip(): row[1].strip()}) + return make_print_config def configure_library(library_dir, env_options=None, options=[]): @@ -139,16 +144,12 @@ try: import cython HAVE_CYTHON = True print ("# pysam: cython is available - using cythonize if necessary") - source_pattern = "pysam/c%s.pyx" - if HTSLIB_MODE != "external": - HTSLIB_MODE = "shared" + source_pattern = "pysam/libc%s.pyx" except ImportError: HAVE_CYTHON = False print ("# pysam: no cython available - using pre-compiled C") # no Cython available - use existing C code - source_pattern = "pysam/c%s.c" - if HTSLIB_MODE != "external": - HTSLIB_MODE = "shared" + source_pattern = "pysam/libc%s.c" # collect pysam version sys.path.insert(0, "pysam") @@ -230,7 +231,6 @@ if HTSLIB_LIBRARY_DIR: chtslib_sources = [] htslib_library_dirs = [HTSLIB_LIBRARY_DIR] htslib_include_dirs = [HTSLIB_INCLUDE_DIR] - internal_htslib_libraries = [] external_htslib_libraries = ['z', 'hts'] elif HTSLIB_MODE == 'separate': @@ -240,7 +240,6 @@ elif HTSLIB_MODE == 'separate': shared_htslib_sources = htslib_sources htslib_library_dirs = [] htslib_include_dirs = ['htslib'] - internal_htslib_libraries = [] elif HTSLIB_MODE == 'shared': # link each pysam component against the same @@ -249,30 +248,15 @@ elif HTSLIB_MODE == 'shared': htslib_library_dirs = [ 'pysam', ".", - os.path.join("build", - distutils_dir_name("lib"), - "pysam")] + os.path.join("build", distutils_dir_name("lib"), "pysam")] htslib_include_dirs = ['htslib'] - if IS_PYTHON3: - if sys.version_info.minor >= 5: - internal_htslib_libraries = ["chtslib.{}".format( - sysconfig.get_config_var('SOABI'))] - else: - if sys.platform == "darwin": - # On OSX, python 3.3 and 3.4 Libs have no platform tags. - internal_htslib_libraries = ["chtslib"] - else: - internal_htslib_libraries = ["chtslib.{}{}".format( - sys.implementation.cache_tag, - sys.abiflags)] - else: - internal_htslib_libraries = ["chtslib"] - else: raise ValueError("unknown HTSLIB value '%s'" % HTSLIB_MODE) +internal_htslib_libraries = [os.path.splitext("chtslib{}".format(sysconfig.get_config_var('SO')))[0]] + # build config.py with open(os.path.join("pysam", "config.py"), "w") as outf: outf.write('HTSLIB = "{}"\n'.format(HTSLIB_SOURCE)) @@ -382,7 +366,7 @@ chtslib = Extension( # Selected ones have been copied into samfile_utils.c # Needs to be devolved somehow. csamfile = Extension( - "pysam.csamfile", + "pysam.libcsamfile", [source_pattern % "samfile", "pysam/htslib_util.c", "pysam/samfile_util.c", @@ -402,7 +386,7 @@ csamfile = Extension( # Selected ones have been copied into samfile_utils.c # Needs to be devolved somehow. calignmentfile = Extension( - "pysam.calignmentfile", + "pysam.libcalignmentfile", [source_pattern % "alignmentfile", "pysam/htslib_util.c", "pysam/samfile_util.c", @@ -422,7 +406,7 @@ calignmentfile = Extension( # Selected ones have been copied into samfile_utils.c # Needs to be devolved somehow. calignedsegment = Extension( - "pysam.calignedsegment", + "pysam.libcalignedsegment", [source_pattern % "alignedsegment", "pysam/htslib_util.c", "pysam/samfile_util.c", @@ -438,7 +422,7 @@ calignedsegment = Extension( ) ctabix = Extension( - "pysam.ctabix", + "pysam.libctabix", [source_pattern % "tabix", "pysam/tabix_util.c"] + htslib_sources + @@ -452,7 +436,7 @@ ctabix = Extension( ) cutils = Extension( - "pysam.cutils", + "pysam.libcutils", [source_pattern % "utils", "pysam/pysam_util.c"] + glob.glob(os.path.join("samtools", "*.pysam.c")) + # glob.glob(os.path.join("samtools", "*", "*.pysam.c")) + @@ -470,7 +454,7 @@ cutils = Extension( ) cfaidx = Extension( - "pysam.cfaidx", + "pysam.libcfaidx", [source_pattern % "faidx"] + htslib_sources + os_c_files, @@ -483,7 +467,7 @@ cfaidx = Extension( ) ctabixproxies = Extension( - "pysam.ctabixproxies", + "pysam.libctabixproxies", [source_pattern % "tabixproxies"] + os_c_files, library_dirs=htslib_library_dirs, @@ -495,7 +479,7 @@ ctabixproxies = Extension( ) cvcf = Extension( - "pysam.cvcf", + "pysam.libcvcf", [source_pattern % "vcf"] + os_c_files, library_dirs=htslib_library_dirs, @@ -507,7 +491,7 @@ cvcf = Extension( ) cbcf = Extension( - "pysam.cbcf", + "pysam.libcbcf", [source_pattern % "bcf"] + htslib_sources + os_c_files, @@ -519,6 +503,19 @@ cbcf = Extension( define_macros=define_macros ) +cbgzf = Extension( + "pysam.libcbgzf", + [source_pattern % "bgzf"] + + htslib_sources + + os_c_files, + library_dirs=htslib_library_dirs, + include_dirs=["htslib", "."] + include_os + htslib_include_dirs, + libraries=external_htslib_libraries + internal_htslib_libraries, + language="c", + extra_compile_args=extra_compile_args, + define_macros=define_macros +) + metadata = { 'name': "pysam", 'version': version, @@ -539,6 +536,7 @@ metadata = { ctabixproxies, cvcf, cbcf, + cbgzf, cfaidx, cutils], 'cmdclass': cmdclass, diff --git a/tests/AlignedSegment_test.py b/tests/AlignedSegment_test.py index 94b2eb3..b0a3466 100644 --- a/tests/AlignedSegment_test.py +++ b/tests/AlignedSegment_test.py @@ -46,19 +46,19 @@ class TestAlignedSegment(ReadTest): self.assertEqual(a.query_sequence, None) self.assertEqual(pysam.qualities_to_qualitystring(a.query_qualities), None) self.assertEqual(a.flag, 0) - self.assertEqual(a.reference_id, 0) + self.assertEqual(a.reference_id, -1) self.assertEqual(a.mapping_quality, 0) self.assertEqual(a.cigartuples, None) self.assertEqual(a.tags, []) - self.assertEqual(a.next_reference_id, 0) - self.assertEqual(a.next_reference_start, 0) + self.assertEqual(a.next_reference_id, -1) + self.assertEqual(a.next_reference_start, -1) self.assertEqual(a.template_length, 0) def testStrOfEmptyRead(self): a = pysam.AlignedSegment() s = str(a) self.assertEqual( - "None\t0\t0\t0\t0\tNone\t0\t0\t0\tNone\tNone\t[]", + "None\t0\t-1\t-1\t0\tNone\t-1\t-1\t0\tNone\tNone\t[]", s) def testSettingTagInEmptyRead(self): @@ -231,6 +231,24 @@ class TestAlignedSegment(ReadTest): self.assertEqual(a.get_blocks(), [(20, 30), (31, 40), (40, 60)]) + def test_infer_query_length(self): + '''Test infer_query_length on M|=|X|I|D|H|S cigar ops''' + a = self.buildRead() + a.cigarstring = '15M' + self.assertEqual(a.infer_query_length(), 15) + a.cigarstring = '15=' + self.assertEqual(a.infer_query_length(), 15) + a.cigarstring = '15X' + self.assertEqual(a.infer_query_length(), 15) + a.cigarstring = '5M5I5M' + self.assertEqual(a.infer_query_length(), 15) + a.cigarstring = '5M5D5M' + self.assertEqual(a.infer_query_length(), 10) + a.cigarstring = '5H10M' + self.assertEqual(a.infer_query_length(), 15) + a.cigarstring = '5S10M' + self.assertEqual(a.infer_query_length(), 15) + def test_get_aligned_pairs_soft_clipping(self): a = self.buildRead() a.cigartuples = ((4, 2), (0, 35), (4, 3)) @@ -375,6 +393,18 @@ class TestAlignedSegment(ReadTest): a.cigarstring = "1S20M1S" self.assertEqual(a.query_alignment_length, 20) + def test_query_length_is_limited(self): + + a = self.buildRead() + a.query_name = "A" * 1 + a.query_name = "A" * 254 + self.assertRaises( + ValueError, + setattr, + a, + "query_name", + "A" * 255) + class TestCigarStats(ReadTest): @@ -754,5 +784,6 @@ class TestAsString(unittest.TestCase): for s, p in zip(reference, pysamf): self.assertEqual(s, p.tostring(pysamf)) + if __name__ == "__main__": unittest.main() diff --git a/tests/AlignmentFile_test.py b/tests/AlignmentFile_test.py index c042f4f..18fb05b 100644 --- a/tests/AlignmentFile_test.py +++ b/tests/AlignmentFile_test.py @@ -29,7 +29,8 @@ from TestUtils import checkBinaryEqual, checkURL, \ get_temp_filename -DATADIR = "pysam_data" +DATADIR = os.path.abspath(os.path.join(os.path.dirname(__file__), + "pysam_data")) ################################################## @@ -353,26 +354,53 @@ class BasicTestBAMFromFilename(BasicTestBAMFromFetch): class BasicTestBAMFromFile(BasicTestBAMFromFetch): def setUp(self): - f = open(os.path.join(DATADIR, "ex3.bam")) - self.samfile = pysam.AlignmentFile( - f, "rb") + with open(os.path.join(DATADIR, "ex3.bam")) as f: + self.samfile = pysam.AlignmentFile( + f, "rb") + self.reads = [r for r in self.samfile] + + +class BasicTestBAMFromFileNo(BasicTestBAMFromFetch): + + def setUp(self): + with open(os.path.join(DATADIR, "ex3.bam")) as f: + self.samfile = pysam.AlignmentFile( + f.fileno(), "rb") self.reads = [r for r in self.samfile] class BasicTestSAMFromFile(BasicTestBAMFromFetch): def setUp(self): - f = open(os.path.join(DATADIR, "ex3.sam")) - self.samfile = pysam.AlignmentFile( - f, "r") + with open(os.path.join(DATADIR, "ex3.sam")) as f: + self.samfile = pysam.AlignmentFile( + f, "r") + self.reads = [r for r in self.samfile] + + +class BasicTestSAMFromFileNo(BasicTestBAMFromFetch): + + def setUp(self): + with open(os.path.join(DATADIR, "ex3.sam")) as f: + self.samfile = pysam.AlignmentFile( + f.fileno(), "r") self.reads = [r for r in self.samfile] class BasicTestCRAMFromFile(BasicTestCRAMFromFetch): def setUp(self): - f = open(os.path.join(DATADIR, "ex3.cram")) - self.samfile = pysam.AlignmentFile(f, "rc") + with open(os.path.join(DATADIR, "ex3.cram")) as f: + self.samfile = pysam.AlignmentFile(f, "rc") + self.reads = [r for r in self.samfile] + + +class BasicTestCRAMFromFileNo(BasicTestCRAMFromFetch): + + def setUp(self): + with open(os.path.join(DATADIR, "ex3.cram")) as f: + self.samfile = pysam.AlignmentFile( + f.fileno(), "rc") self.reads = [r for r in self.samfile] @@ -690,7 +718,7 @@ class TestIO(unittest.TestCase): samfile = pysam.AlignmentFile(f, "rb") f.close() self.assertTrue(f.closed) - # access to Samfile should still work + # access to Samfile still works self.checkEcho("ex1.bam", "ex1.bam", "tmp_ex1.bam", @@ -818,6 +846,15 @@ class TestIO(unittest.TestCase): mode="rb") self.assertEqual(len(list(samfile.fetch())), 3270) + def testBAMWithCSIIndex(self): + '''see issue 116''' + input_filename = os.path.join(DATADIR, "ex1_csi.bam") + samfile = pysam.AlignmentFile(input_filename, + "rb", + check_sq=False) + samfile.fetch('chr2') + + class TestAutoDetect(unittest.TestCase): @@ -1291,8 +1328,8 @@ class TestHeaderSAM(unittest.TestCase): """testing header manipulation""" - header = {'SQ': [{'LN': 1575, 'SN': 'chr1'}, - {'LN': 1584, 'SN': 'chr2'}], + header = {'SQ': [{'LN': 1575, 'SN': 'chr1', 'AH': 'chr1:5000000-5010000'}, + {'LN': 1584, 'SN': 'chr2', 'AH': '*'}], 'RG': [{'LB': 'SC_1', 'ID': 'L1', 'SM': 'NA12891', 'PU': 'SC_1_10', "CN": "name:with:colon"}, {'LB': 'SC_2', 'ID': 'L2', 'SM': 'NA12891', @@ -2343,6 +2380,46 @@ class TestPileupQueryPosition(unittest.TestCase): last[r.alignment.query_name] = r.query_position +class TestFindIntrons(unittest.TestCase): + samfilename = "pysam_data/ex_spliced.bam" + + def setUp(self): + self.samfile = pysam.AlignmentFile(self.samfilename) + + def tearDown(self): + self.samfile.close() + + def test_total(self): + all_read_counts = self.samfile.count() + splice_sites = self.samfile.find_introns(self.samfile.fetch()) + self.assertEqual(sum(splice_sites.values()), all_read_counts -1) # there is a single unspliced read in there + + def test_first(self): + reads = list(self.samfile.fetch())[:10] + splice_sites = self.samfile.find_introns(reads) + starts = [14792+38 - 1] + stops = [14792+38 + 140 - 1] + self.assertEqual(len(splice_sites), 1) + self.assertTrue((starts[0], stops[0]) in splice_sites) + self.assertEqual(splice_sites[(starts[0], stops[0])], 9) # first one is the unspliced read + + def test_all(self): + reads = list(self.samfile.fetch()) + splice_sites = self.samfile.find_introns(reads) + should = collections.Counter({ + (14829, 14969): 33, + (15038, 15795): 24, + (15947, 16606): 3, + (16765, 16857): 9, + (16765, 16875): 1, + (17055, 17232): 19, + (17055, 17605): 3, + (17055, 17914): 1, + (17368, 17605): 7, + }) + self.assertEqual(should, splice_sites) + + class TestLogging(unittest.TestCase): '''test around bug issue 42, @@ -2511,7 +2588,6 @@ class TestMappedUnmapped(unittest.TestCase): inf.mapped) - class TestSamtoolsProxy(unittest.TestCase): '''tests for sanity checking access to samtools functions.''' @@ -2592,6 +2668,34 @@ class TestVerbosity(unittest.TestCase): self.assertEqual(pysam.get_verbosity(), 3) +class TestSanityCheckingBAM(unittest.TestCase): + + mode = "wb" + + def check_write(self, read): + + fn = "tmp_test_sanity_check.bam" + names = ["chr1"] + lengths = [10000] + with pysam.AlignmentFile( + fn, + self.mode, + reference_names=names, + reference_lengths=lengths) as outf: + outf.write(read) + + if os.path.exists(fn): + os.unlink(fn) + + def test_empty_read_gives_value_error(self): + read = pysam.AlignedSegment() + self.check_write(read) + +# SAM writing fails, as query length is 0 +# class TestSanityCheckingSAM(TestSanityCheckingSAM): +# mode = "w" + + if __name__ == "__main__": # build data files print ("building data files") diff --git a/tests/SamFile_test.py b/tests/SamFile_test.py index 1fc88f3..ff13045 100644 --- a/tests/SamFile_test.py +++ b/tests/SamFile_test.py @@ -941,8 +941,8 @@ class TestIteratorColumn2(unittest.TestCase): class TestHeaderSam(unittest.TestCase): - header = {'SQ': [{'LN': 1575, 'SN': 'chr1'}, - {'LN': 1584, 'SN': 'chr2'}], + header = {'SQ': [{'LN': 1575, 'SN': 'chr1', 'AH': 'chr1:5000000-5010000'}, + {'LN': 1584, 'SN': 'chr2', 'AH': '*'}], 'RG': [{'LB': 'SC_1', 'ID': 'L1', 'SM': 'NA12891', 'PU': 'SC_1_10', "CN": "name:with:colon"}, {'LB': 'SC_2', 'ID': 'L2', 'SM': 'NA12891', 'PU': 'SC_2_12', "CN": "name:with:colon"}], 'PG': [{'ID': 'P1', 'VN': '1.0'}, {'ID': 'P2', 'VN': '1.1'}], @@ -1231,19 +1231,19 @@ class TestAlignedRead(ReadTest): self.assertEqual(a.seq, None) self.assertEqual(a.qual, None) self.assertEqual(a.flag, 0) - self.assertEqual(a.rname, 0) + self.assertEqual(a.rname, -1) self.assertEqual(a.mapq, 0) self.assertEqual(a.cigar, []) self.assertEqual(a.tags, []) - self.assertEqual(a.mrnm, 0) - self.assertEqual(a.mpos, 0) + self.assertEqual(a.mrnm, -1) + self.assertEqual(a.mpos, -1) self.assertEqual(a.isize, 0) def testStrOfEmptyRead(self): a = pysam.AlignedRead() s = str(a) self.assertEqual( - "None\t0\t0\t0\t0\tNone\t0\t0\t0\tNone\tNone\t[]", + "None\t0\t-1\t-1\t0\tNone\t-1\t-1\t0\tNone\tNone\t[]", s) def buildRead(self): diff --git a/tests/StreamFiledescriptors_test.py b/tests/StreamFiledescriptors_test.py new file mode 100644 index 0000000..ce59da7 --- /dev/null +++ b/tests/StreamFiledescriptors_test.py @@ -0,0 +1,82 @@ +import os +import subprocess +import threading +import errno +import unittest + +from pysam import AlignmentFile + +DATADIR = os.path.abspath(os.path.join( + os.path.dirname(__file__), + "pysam_data")) + + +def alignmentfile_writer_thread(infile, outfile): + def _writer_thread(infile, outfile): + """read from infile and write to outfile""" + try: + i = 0 + for record in infile: + outfile.write(record) + i += 1 + except IOError as e: + if e.errno != errno.EPIPE: + pass + finally: + outfile.close() + + writer = threading.Thread(target=_writer_thread, args=(infile, outfile)) + writer.daemon = True + writer.start() + return writer + + +class StreamTest(unittest.TestCase): + + def stream_process(self, proc, in_stream, out_stream, writer): + + with AlignmentFile(proc.stdout) as infile: + read = 0 + for record in infile: + read += 1 + return 0, read + + def test_text_processing(self): + + proc = subprocess.Popen('head -n200', + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + shell=True) + + in_stream = AlignmentFile('pysam_data/ex1.bam') + out_stream = AlignmentFile(proc.stdin, 'wh', header=in_stream.header) + writer = alignmentfile_writer_thread(in_stream, + out_stream) + + written, read = self.stream_process(proc, + in_stream, + out_stream, + writer) + self.assertEqual(read, 198) + + def test_samtools_processing(self): + + proc = subprocess.Popen('samtools view -b -f 4', + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + shell=True) + + in_stream = AlignmentFile('pysam_data/ex1.bam') + out_stream = AlignmentFile(proc.stdin, 'wb', header=in_stream.header) + writer = alignmentfile_writer_thread(in_stream, + out_stream) + + written, read = self.stream_process(proc, + in_stream, + out_stream, + writer) + self.assertEqual(read, 35) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/VariantFile_test.py b/tests/VariantFile_test.py index ef21245..aa82c66 100644 --- a/tests/VariantFile_test.py +++ b/tests/VariantFile_test.py @@ -1,8 +1,15 @@ import os +import sys import unittest import pysam import gzip import subprocess + +try: + from pathlib import Path +except ImportError: + Path = None + from TestUtils import get_temp_filename, check_lines_equal DATADIR="cbcf_data" @@ -75,6 +82,17 @@ class TestOpening(unittest.TestCase): os.unlink("tmp_testEmptyFile.vcf") + + if Path and sys.version_info >= (3,6): + def testEmptyFileVCFFromPath(self): + with open("tmp_testEmptyFile.vcf", "w"): + pass + + self.assertRaises(ValueError, pysam.VariantFile, + Path("tmp_testEmptyFile.vcf")) + + os.unlink("tmp_testEmptyFile.vcf") + def testEmptyFileVCFGZWithIndex(self): with open("tmp_testEmptyFile.vcf", "w"): pass @@ -171,12 +189,12 @@ class TestHeader(unittest.TestCase): # remove last header line starting with #CHROM ref.pop() ref = sorted(ref) - comp = sorted([str(x) for x in v.header.records]) + comp = sorted(str(x) for x in v.header.records) self.assertEqual(len(ref), len(comp)) for x, y in zip(ref, comp): - self.assertEqual(x[:-1], str(y)) + self.assertEqual(x, y) # These tests need to be separate and start from newly opened files. This @@ -195,6 +213,13 @@ class TestParsing(unittest.TestCase): chrom = [rec.chrom for rec in v] self.assertEqual(chrom, ['M', '17', '20', '20', '20']) + if Path and sys.version_info >= (3,6): + def testChromFromPath(self): + fn = os.path.join(DATADIR, self.filename) + v = pysam.VariantFile(Path(fn)) + chrom = [rec.chrom for rec in v] + self.assertEqual(chrom, ['M', '17', '20', '20', '20']) + def testPos(self): fn = os.path.join(DATADIR, self.filename) v = pysam.VariantFile(fn) @@ -330,9 +355,22 @@ class TestConstructionVCFWithContigs(unittest.TestCase): """construct VariantFile from scratch.""" filename = "example_vcf42_withcontigs.vcf" + compression = 'NONE' + description = 'VCF version 4.2 variant calling text' - def complete_check(self, fn_in, fn_out): + def testBase(self): + with pysam.VariantFile(os.path.join(DATADIR, self.filename)) as inf: + self.assertEqual(inf.category, 'VARIANTS') + self.assertEqual(inf.format, 'VCF') + self.assertEqual(inf.version, (4, 2)) + self.assertEqual(inf.compression, self.compression) + self.assertEqual(inf.description, self.description) + self.assertTrue(inf.is_open) + self.assertEqual(inf.is_read, True) + self.assertEqual(inf.is_write, False) + def complete_check(self, fn_in, fn_out): + self.maxDiff = None check_lines_equal( self, fn_in, fn_out, sort=True, filter_f=lambda x: x.startswith("##contig")) @@ -349,14 +387,15 @@ class TestConstructionVCFWithContigs(unittest.TestCase): for record in vcf_in.header.records: header.add_record(record) - fn = str("tmp_VariantFileTest_testConstructionWithRecords") + ".vcf" - vcf_out = pysam.VariantFile(fn, "w", header=header) + for sample in vcf_in.header.samples: + header.add_sample(sample) + + vcf_out = pysam.VariantFile(fn_out, "w", header=header) for record in vcf_in: - # currently segfaults here: - # vcf_out.write(record) - pass - return + record.translate(header) + vcf_out.write(record) + vcf_in.close() vcf_out.close() self.complete_check(fn_in, fn_out) @@ -370,6 +409,7 @@ class TestConstructionVCFWithContigs(unittest.TestCase): for record in vcf_in: vcf_out.write(record) + vcf_in.close() vcf_out.close() self.complete_check(fn_in, fn_out) @@ -397,8 +437,8 @@ class TestConstructionVCFWithContigs(unittest.TestCase): self.complete_check(fn_in, fn_out) -# Currently segfaults for VCFs without contigs -# class TestConstructionVCFWithoutContigs(TestConstructionVCFWithContigs): + +#class TestConstructionVCFWithoutContigs(TestConstructionVCFWithContigs): # """construct VariantFile from scratch.""" # filename = "example_vcf40.vcf" @@ -407,18 +447,33 @@ class TestConstructionVCFGZWithContigs(TestConstructionVCFWithContigs): """construct VariantFile from scratch.""" filename = "example_vcf42_withcontigs.vcf.gz" + compression = 'BGZF' + description = 'VCF version 4.2 BGZF-compressed variant calling data' class TestConstructionVCFGZWithoutContigs(TestConstructionVCFWithContigs): """construct VariantFile from scratch.""" filename = "example_vcf42.vcf.gz" + compression = 'BGZF' + description = 'VCF version 4.2 BGZF-compressed variant calling data' class TestSettingRecordValues(unittest.TestCase): filename = "example_vcf40.vcf" + def testBase(self): + with pysam.VariantFile(os.path.join(DATADIR, self.filename)) as inf: + self.assertEqual(inf.category, 'VARIANTS') + self.assertEqual(inf.format, 'VCF') + self.assertEqual(inf.version, (4, 0)) + self.assertEqual(inf.compression, 'NONE') + self.assertEqual(inf.description, 'VCF version 4.0 variant calling text') + self.assertTrue(inf.is_open) + self.assertEqual(inf.is_read, True) + self.assertEqual(inf.is_write, False) + def testSetQual(self): with pysam.VariantFile(os.path.join(DATADIR, self.filename)) as inf: record = next(inf) @@ -435,8 +490,7 @@ class TestSettingRecordValues(unittest.TestCase): sample = record.samples["NA00001"] print (sample["GT"]) self.assertEqual(sample["GT"], (0, 0)) -# Fails with TypeError -# sample["GT"] = sample["GT"] + sample["GT"] = sample["GT"] class TestSubsetting(unittest.TestCase): diff --git a/tests/_compile_test.pyx b/tests/_compile_test.pyx index db6b5b6..dfe7937 100644 --- a/tests/_compile_test.pyx +++ b/tests/_compile_test.pyx @@ -1,5 +1,5 @@ -from pysam.calignmentfile cimport AlignmentFile, AlignedSegment -from pysam.ctabix cimport Tabixfile +from pysam.libcalignmentfile cimport AlignmentFile, AlignedSegment +from pysam.libctabix cimport Tabixfile cdef AlignmentFile samfile cdef Tabixfile tabixfile diff --git a/tests/_cython_flagstat.pyx b/tests/_cython_flagstat.pyx index f0f03bb..8e376b0 100644 --- a/tests/_cython_flagstat.pyx +++ b/tests/_cython_flagstat.pyx @@ -1,6 +1,6 @@ -from pysam.calignmentfile cimport AlignmentFile, AlignedSegment -from pysam.calignmentfile cimport pysam_get_flag -from pysam.calignmentfile cimport BAM_FPROPER_PAIR, BAM_FPAIRED +from pysam.libcalignmentfile cimport AlignmentFile, AlignedSegment +from pysam.libcalignmentfile cimport BAM_FPROPER_PAIR, BAM_FPAIRED +from pysam.libcalignedsegment cimport pysam_get_flag def count(AlignmentFile samfile): cdef int is_proper = 0 diff --git a/tests/cython_flagstat.py b/tests/cython_flagstat.py deleted file mode 100644 index 851157a..0000000 --- a/tests/cython_flagstat.py +++ /dev/null @@ -1,11 +0,0 @@ -import pysam - -import pyximport -pyximport.install() -import _cython_flagstat - -is_paired, is_proper = _cython_flagstat.count( - pysam.AlignmentFile("ex1.bam", "rb")) - -print ("there are alignments of %i paired reads" % is_paired) -print ("there are %i proper paired alignments" % is_proper) diff --git a/tests/pysam_data/Makefile b/tests/pysam_data/Makefile index 89a4a0c..2ccedd2 100644 --- a/tests/pysam_data/Makefile +++ b/tests/pysam_data/Makefile @@ -18,7 +18,8 @@ all: ex1.pileup.gz \ empty.bam empty.bam.bai \ explicit_index.bam explicit_index.cram \ faidx_empty_seq.fq.gz \ - ex1.fa.gz ex1.fa.gz.fai + ex1.fa.gz ex1.fa.gz.fai \ + ex1_csi.bam # ex2.sam - as ex1.sam, but with header ex2.sam.gz: ex1.bam ex1.bam.bai @@ -44,6 +45,7 @@ uncompressed.bam: ex2.sam ex1.fa.fai:ex1.fa samtools faidx ex1.fa + ex1.bam:ex1.sam.gz ex1.fa.fai samtools import ex1.fa.fai ex1.sam.gz ex1.bam @@ -56,6 +58,10 @@ ex1.pileup.gz:ex1.bam ex1.fa ex2_truncated.bam: ex2.bam head -c 124000 ex2.bam > ex2_truncated.bam +ex1_csi.bam: ex1.bam + cp ex1.bam ex1_csi.bam + samtools index -c ex1_csi.bam + empty.bam: ex2.sam grep "^@" $< | samtools view -Sb - > $@ diff --git a/tests/pysam_data/ex3.sam b/tests/pysam_data/ex3.sam index 495d4fe..7a09188 100644 --- a/tests/pysam_data/ex3.sam +++ b/tests/pysam_data/ex3.sam @@ -1,6 +1,6 @@ @HD VN:1.0 -@SQ SN:chr1 LN:1575 -@SQ SN:chr2 LN:1584 +@SQ SN:chr1 LN:1575 AH:chr1:5000000-5010000 +@SQ SN:chr2 LN:1584 AH:* @RG ID:L1 PU:SC_1_10 LB:SC_1 SM:NA12891 CN:name:with:colon @RG ID:L2 PU:SC_2_12 LB:SC_2 SM:NA12891 CN:name:with:colon @PG ID:P1 VN:1.0 diff --git a/tests/pysam_data/ex_spliced.sam b/tests/pysam_data/ex_spliced.sam new file mode 100644 index 0000000..ae8086a --- /dev/null +++ b/tests/pysam_data/ex_spliced.sam @@ -0,0 +1,297 @@ +@HD VN:1.4 SO:coordinate +@SQ SN:1 LN:248956422 +@SQ SN:2 LN:242193529 +@SQ SN:3 LN:198295559 +@SQ SN:4 LN:190214555 +@SQ SN:5 LN:181538259 +@SQ SN:6 LN:170805979 +@SQ SN:7 LN:159345973 +@SQ SN:8 LN:145138636 +@SQ SN:9 LN:138394717 +@SQ SN:10 LN:133797422 +@SQ SN:11 LN:135086622 +@SQ SN:12 LN:133275309 +@SQ SN:13 LN:114364328 +@SQ SN:14 LN:107043718 +@SQ SN:15 LN:101991189 +@SQ SN:16 LN:90338345 +@SQ SN:17 LN:83257441 +@SQ SN:18 LN:80373285 +@SQ SN:19 LN:58617616 +@SQ SN:20 LN:64444167 +@SQ SN:21 LN:46709983 +@SQ SN:22 LN:50818468 +@SQ SN:X LN:156040895 +@SQ SN:Y LN:57227415 +@SQ SN:MT LN:16569 +@SQ SN:GL000008.2 LN:209709 +@SQ SN:GL000009.2 LN:201709 +@SQ SN:GL000194.1 LN:191469 +@SQ SN:GL000195.1 LN:182896 +@SQ SN:GL000205.2 LN:185591 +@SQ SN:GL000208.1 LN:92689 +@SQ SN:GL000213.1 LN:164239 +@SQ SN:GL000214.1 LN:137718 +@SQ SN:GL000216.2 LN:176608 +@SQ SN:GL000218.1 LN:161147 +@SQ SN:GL000219.1 LN:179198 +@SQ SN:GL000220.1 LN:161802 +@SQ SN:GL000221.1 LN:155397 +@SQ SN:GL000224.1 LN:179693 +@SQ SN:GL000225.1 LN:211173 +@SQ SN:GL000226.1 LN:15008 +@SQ SN:KI270302.1 LN:2274 +@SQ SN:KI270303.1 LN:1942 +@SQ SN:KI270304.1 LN:2165 +@SQ SN:KI270305.1 LN:1472 +@SQ SN:KI270310.1 LN:1201 +@SQ SN:KI270311.1 LN:12399 +@SQ SN:KI270312.1 LN:998 +@SQ SN:KI270315.1 LN:2276 +@SQ SN:KI270316.1 LN:1444 +@SQ SN:KI270317.1 LN:37690 +@SQ SN:KI270320.1 LN:4416 +@SQ SN:KI270322.1 LN:21476 +@SQ SN:KI270329.1 LN:1040 +@SQ SN:KI270330.1 LN:1652 +@SQ SN:KI270333.1 LN:2699 +@SQ SN:KI270334.1 LN:1368 +@SQ SN:KI270335.1 LN:1048 +@SQ SN:KI270336.1 LN:1026 +@SQ SN:KI270337.1 LN:1121 +@SQ SN:KI270338.1 LN:1428 +@SQ SN:KI270340.1 LN:1428 +@SQ SN:KI270362.1 LN:3530 +@SQ SN:KI270363.1 LN:1803 +@SQ SN:KI270364.1 LN:2855 +@SQ SN:KI270366.1 LN:8320 +@SQ SN:KI270371.1 LN:2805 +@SQ SN:KI270372.1 LN:1650 +@SQ SN:KI270373.1 LN:1451 +@SQ SN:KI270374.1 LN:2656 +@SQ SN:KI270375.1 LN:2378 +@SQ SN:KI270376.1 LN:1136 +@SQ SN:KI270378.1 LN:1048 +@SQ SN:KI270379.1 LN:1045 +@SQ SN:KI270381.1 LN:1930 +@SQ SN:KI270382.1 LN:4215 +@SQ SN:KI270383.1 LN:1750 +@SQ SN:KI270384.1 LN:1658 +@SQ SN:KI270385.1 LN:990 +@SQ SN:KI270386.1 LN:1788 +@SQ SN:KI270387.1 LN:1537 +@SQ SN:KI270388.1 LN:1216 +@SQ SN:KI270389.1 LN:1298 +@SQ SN:KI270390.1 LN:2387 +@SQ SN:KI270391.1 LN:1484 +@SQ SN:KI270392.1 LN:971 +@SQ SN:KI270393.1 LN:1308 +@SQ SN:KI270394.1 LN:970 +@SQ SN:KI270395.1 LN:1143 +@SQ SN:KI270396.1 LN:1880 +@SQ SN:KI270411.1 LN:2646 +@SQ SN:KI270412.1 LN:1179 +@SQ SN:KI270414.1 LN:2489 +@SQ SN:KI270417.1 LN:2043 +@SQ SN:KI270418.1 LN:2145 +@SQ SN:KI270419.1 LN:1029 +@SQ SN:KI270420.1 LN:2321 +@SQ SN:KI270422.1 LN:1445 +@SQ SN:KI270423.1 LN:981 +@SQ SN:KI270424.1 LN:2140 +@SQ SN:KI270425.1 LN:1884 +@SQ SN:KI270429.1 LN:1361 +@SQ SN:KI270435.1 LN:92983 +@SQ SN:KI270438.1 LN:112505 +@SQ SN:KI270442.1 LN:392061 +@SQ SN:KI270448.1 LN:7992 +@SQ SN:KI270465.1 LN:1774 +@SQ SN:KI270466.1 LN:1233 +@SQ SN:KI270467.1 LN:3920 +@SQ SN:KI270468.1 LN:4055 +@SQ SN:KI270507.1 LN:5353 +@SQ SN:KI270508.1 LN:1951 +@SQ SN:KI270509.1 LN:2318 +@SQ SN:KI270510.1 LN:2415 +@SQ SN:KI270511.1 LN:8127 +@SQ SN:KI270512.1 LN:22689 +@SQ SN:KI270515.1 LN:6361 +@SQ SN:KI270516.1 LN:1300 +@SQ SN:KI270517.1 LN:3253 +@SQ SN:KI270518.1 LN:2186 +@SQ SN:KI270519.1 LN:138126 +@SQ SN:KI270521.1 LN:7642 +@SQ SN:KI270522.1 LN:5674 +@SQ SN:KI270528.1 LN:2983 +@SQ SN:KI270529.1 LN:1899 +@SQ SN:KI270530.1 LN:2168 +@SQ SN:KI270538.1 LN:91309 +@SQ SN:KI270539.1 LN:993 +@SQ SN:KI270544.1 LN:1202 +@SQ SN:KI270548.1 LN:1599 +@SQ SN:KI270579.1 LN:31033 +@SQ SN:KI270580.1 LN:1553 +@SQ SN:KI270581.1 LN:7046 +@SQ SN:KI270582.1 LN:6504 +@SQ SN:KI270583.1 LN:1400 +@SQ SN:KI270584.1 LN:4513 +@SQ SN:KI270587.1 LN:2969 +@SQ SN:KI270588.1 LN:6158 +@SQ SN:KI270589.1 LN:44474 +@SQ SN:KI270590.1 LN:4685 +@SQ SN:KI270591.1 LN:5796 +@SQ SN:KI270593.1 LN:3041 +@SQ SN:KI270706.1 LN:175055 +@SQ SN:KI270707.1 LN:32032 +@SQ SN:KI270708.1 LN:127682 +@SQ SN:KI270709.1 LN:66860 +@SQ SN:KI270710.1 LN:40176 +@SQ SN:KI270711.1 LN:42210 +@SQ SN:KI270712.1 LN:176043 +@SQ SN:KI270713.1 LN:40745 +@SQ SN:KI270714.1 LN:41717 +@SQ SN:KI270715.1 LN:161471 +@SQ SN:KI270716.1 LN:153799 +@SQ SN:KI270717.1 LN:40062 +@SQ SN:KI270718.1 LN:38054 +@SQ SN:KI270719.1 LN:176845 +@SQ SN:KI270720.1 LN:39050 +@SQ SN:KI270721.1 LN:100316 +@SQ SN:KI270722.1 LN:194050 +@SQ SN:KI270723.1 LN:38115 +@SQ SN:KI270724.1 LN:39555 +@SQ SN:KI270725.1 LN:172810 +@SQ SN:KI270726.1 LN:43739 +@SQ SN:KI270727.1 LN:448248 +@SQ SN:KI270728.1 LN:1872759 +@SQ SN:KI270729.1 LN:280839 +@SQ SN:KI270730.1 LN:112551 +@SQ SN:KI270731.1 LN:150754 +@SQ SN:KI270732.1 LN:41543 +@SQ SN:KI270733.1 LN:179772 +@SQ SN:KI270734.1 LN:165050 +@SQ SN:KI270735.1 LN:42811 +@SQ SN:KI270736.1 LN:181920 +@SQ SN:KI270737.1 LN:103838 +@SQ SN:KI270738.1 LN:99375 +@SQ SN:KI270739.1 LN:73985 +@SQ SN:KI270740.1 LN:37240 +@SQ SN:KI270741.1 LN:157432 +@SQ SN:KI270742.1 LN:186739 +@SQ SN:KI270743.1 LN:210658 +@SQ SN:KI270744.1 LN:168472 +@SQ SN:KI270745.1 LN:41891 +@SQ SN:KI270746.1 LN:66486 +@SQ SN:KI270747.1 LN:198735 +@SQ SN:KI270748.1 LN:93321 +@SQ SN:KI270749.1 LN:158759 +@SQ SN:KI270750.1 LN:148850 +@SQ SN:KI270751.1 LN:150742 +@SQ SN:KI270752.1 LN:27745 +@SQ SN:KI270753.1 LN:62944 +@SQ SN:KI270754.1 LN:40191 +@SQ SN:KI270755.1 LN:36723 +@SQ SN:KI270756.1 LN:79590 +@SQ SN:KI270757.1 LN:71251 +@PG ID:STAR PN:STAR VN:STAR_2.4.1a +HWI-C00113:131:HMHYWADXX:1:2202:17748:47494 272 1 14792 0 51M * 0 0 GGGCCTCTCACCAGCCCCAGGTCCTTTCCCAGAGATGCCCTTGCGCCTCAT CCCFFFFFHHHHHFHIIJJIJAFHJJJJJGJIIHGIJGGIJJIIJIIJJJG NH:i:6 HI:i:3 AS:i:47 nM:i:1 +HWI-C00113:131:HMHYWADXX:1:2202:17748:47494 272 1 14792 0 38M140N13M * 0 0 GGGCCTCTCACCAGCCCCAGGTCCTTTCCCAGAGATGCCCTTGCGCCTCAT CCCFFFFFHHHHHFHIIJJIJAFHJJJJJGJIIHGIJGGIJJIIJIIJJJG NH:i:6 HI:i:3 AS:i:47 nM:i:1 +HWI-C00113:131:HMHYWADXX:2:1214:7658:35836 272 1 14792 0 38M140N13M * 0 0 GGGCCCCTCACCAGCCCCAGGTCTTTTCCCAGAGATGCCCTTGCGCCTCAT CCCFFFFFHHHHHJJJJJJJJCGHIJJIJJJJJJIJJGIJJIJIJIJJJJI NH:i:6 HI:i:3 AS:i:47 nM:i:1 +HWI-C00113:131:HMHYWADXX:1:2114:4116:44566 272 1 14794 0 36M140N15M * 0 0 GCCCCTCACCAGCCCCAGGTCTTTTCCCAGAGATGCCCTTGCGCCTCATGA <@@DDDDDDFHCFHEFGBE+2AFH@GIEGF=GGHII9FB?BFFDEE??BD?D@DADDDDBDDD@FGHFHFHIHGI@?DGC@CF NH:i:7 HI:i:4 AS:i:47 nM:i:1 +HWI-C00113:131:HMHYWADXX:1:1105:1515:82248 272 1 14802 0 28M140N23M * 0 0 CCAGCCCCAGGTCCTTTCCCAGAGATGCCCTTGCGCCTCATGACCAGCTTG ??:ADBDDDDD:A<+C?AFDB@E?F4<*?:?1:??):??0009??9?(8BC NH:i:7 HI:i:4 AS:i:49 nM:i:0 +HWI-C00113:131:HMHYWADXX:1:1110:16355:5537 272 1 14802 0 28M140N23M * 0 0 CCAGCCCCAGGTCCTTTCCCAGAGATGCCCTTGCGCCTCATGACCAGCTTG @CCFFFFFHH?ADHGIJIJJJJJIIEHIJJJJJIJIGIIJJIJJIIJIJJJ NH:i:7 HI:i:4 AS:i:49 nM:i:0 +HWI-C00113:131:HMHYWADXX:2:1102:17802:20689 272 1 14805 0 25M140N26M * 0 0 GCTCCAGGTCCTTTCCCAGAGATGCCCTTGCGCCTCATGACCAGCTTGTTG CCCFFFFFHHHHHJJJJJJIJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJI NH:i:7 HI:i:4 AS:i:47 nM:i:1 +HWI-C00113:131:HMHYWADXX:2:1104:7670:95815 272 1 14805 0 25M140N26M * 0 0 GCCCCAGGTCCTTTCCCAGAGATGCCCTTGCGCCTCATGACCAGCTTGTTG @@@DBDDDHHBFDBFGEBBGHG@HIBHIDHBGGGEFBDDDFDGBBBGCHHI NH:i:7 HI:i:4 AS:i:49 nM:i:0 +HWI-C00113:131:HMHYWADXX:2:1110:11368:101298 272 1 14805 0 25M140N26M * 0 0 GCCCCAGGTCCTTTCCCAGAGATGCCCTTGCGCCTCATGACCAGCTTGTTG BCCFFFFFCFHHHJJJJJIJJJJJJJJJJJJJGJJJJJJJJJJJJJJJIJJ NH:i:7 HI:i:4 AS:i:49 nM:i:0 +HWI-C00113:131:HMHYWADXX:2:1115:2363:85646 272 1 14805 0 25M140N26M * 0 0 GCTCCAGGTCCTTTCCCAGAGATGCCCTTGCGCCTCATGACCAGCTTGTTG @C@FFFB?CFDFHDHGIIIIEGIIIIEDGIIIIIIIIIGGIIGIIGCGHIH NH:i:7 HI:i:4 AS:i:47 nM:i:1 +HWI-C00113:131:HMHYWADXX:2:2213:6044:80821 272 1 14805 0 25M140N26M * 0 0 GCTCCGGGTCCTTTCCCAGAGATGCCCTTGCGCCTCATGACCAGCTTGTTG @@@FFFFFFFBFDGIIJIJGGFHIIIJIIJGIIEHICHIGEGFG?FGHGA>9B8BF@ NH:i:7 HI:i:4 AS:i:49 nM:i:0 +HWI-C00113:131:HMHYWADXX:2:1215:4185:31561 272 1 14815 0 15M140N36M * 0 0 CTTTCCCAGAGATGCCCTTGCGCCTCATGACCAGCTTGTTGAAGAGATCCG CCCFFFFFHHHHGJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJIJJJH NH:i:7 HI:i:4 AS:i:49 nM:i:0 +HWI-C00113:131:HMHYWADXX:1:2108:1506:70629 272 1 14816 0 14M140N37M * 0 0 TTTCCCAGAGATGCCCTTGCGCCTCATGACCAGCTTGTTGAAGAGATCCGA ?@@;BDDD=DFFDDDFGGFCA?)1@F?F+AAEEEBFFIIF@DE4= NH:i:8 HI:i:2 AS:i:47 nM:i:1 +HWI-C00113:131:HMHYWADXX:1:1108:6828:32713 272 1 15003 0 36M757N15M * 0 0 CCGGCATCAAGTCCCCACCTTGGCTCGTGGCTCTCACTTGCTCCTGCTCCT ?@@ADDDDD?CDD:CFB@:@G@ABGFGFGFBFEAFEEEFCFCF@F=)8=@> NH:i:8 HI:i:2 AS:i:45 nM:i:2 +HWI-C00113:131:HMHYWADXX:1:1111:7491:39504 272 1 15003 0 36M757N15M * 0 0 CCGACATCAAGTGCCCACCTTGGCTCGTGGCTCTCACTTGCTCCTGCTCCT CCCFFDFFFHHHFIGEGHIGIGGDGIJFHEHGGIJJJIJIJJJJJIIIIGI NH:i:8 HI:i:2 AS:i:49 nM:i:0 +HWI-C00113:131:HMHYWADXX:1:1212:16079:85811 272 1 15003 0 36M757N15M * 0 0 CCGGCATCAAGTCCCCACCTTGGCTCGTGGCTCTCACTTGCTCCTGCTCCT @CCFFFFFHGHHHJJJJJJJIIJJJJJIJIJJHIIJJJJIJJJJJJJIJJJ NH:i:8 HI:i:2 AS:i:45 nM:i:2 +HWI-C00113:131:HMHYWADXX:1:2101:7167:50357 272 1 15003 0 36M757N15M * 0 0 CCGACATCAAGTGCCCACCTTGGCTCGTGGCTCTCACTTGCTCCTGCTCCT @@@DD?DDFHHHD@?CG?FHGIIIIG@??BGHIE;8@BFEG NH:i:8 HI:i:2 AS:i:49 nM:i:0 +HWI-C00113:131:HMHYWADXX:2:1211:4828:84953 272 1 15003 0 36M757N15M * 0 0 CCGACATCAAGTGCCCACCTTGGCTCGTGGCTCTCACTTGCTCCTGCTCCT @BB?DFFFHHHHHIJIJJJJJJJJJJJHIJJIIJJJJJJIIIJJJJJJIJI NH:i:8 HI:i:2 AS:i:49 nM:i:0 +HWI-C00113:131:HMHYWADXX:2:2107:20905:80208 272 1 15003 0 36M757N15M * 0 0 CCGACATCAAGTGCCCACCTTGGCTCGTGGCTCTCACTTGCTCCTGCTCCT @CCFFFFFHHHFBHIIEIIDIHGGGGG@GGHCFGHIIJIGGGGIJIGIGGH NH:i:8 HI:i:2 AS:i:49 nM:i:0 +HWI-C00113:131:HMHYWADXX:2:2112:6263:84991 272 1 15003 0 36M757N15M * 0 0 CCGACATCAAGTCCCCACCTTGGCTCGTGGCTCTCACTTGCTCCTGCTCCT @@@?DDDDFBH?FHIGIGIIGG;GHBGCD?DCGIIGHEGBBFHGGIHBFIG NH:i:8 HI:i:2 AS:i:47 nM:i:1 +HWI-C00113:131:HMHYWADXX:2:2202:10314:26844 272 1 15003 0 36M757N15M * 0 0 CCGACATCAAGTGCCCACCTTGGCTCGTGGCTCTCACTTGCTCCTGCTCCT CCCFFFFFHHHHHJJJJJJJJJJJJJJIJJJJIIJJJJJJJJJJJJJJJJJ NH:i:8 HI:i:2 AS:i:49 nM:i:0 +HWI-C00113:131:HMHYWADXX:2:2213:21028:90280 272 1 15003 0 36M757N15M * 0 0 CCGACATCAAGTCCCCACCTTGGCTCGTGGCTCTCACTTGCTCCTGCTCCT @@@BDDDAD?FDF9GIBB@@FFG3CFF:DD)?BD*9D@@F4BDEEEFFF8= NH:i:8 HI:i:2 AS:i:47 nM:i:1 +HWI-C00113:131:HMHYWADXX:1:1216:14847:22529 272 1 15004 0 35M757N16M * 0 0 CGACATCAAGTGCCCACCTTGGCTCGTGGCTCTCACTTGCTCCTGCTCCTT @@@FFFFDHHBDHIIIJJJJIIIIIIJJIJJGIJIFIJJIDHHGBEHIJJJ NH:i:8 HI:i:2 AS:i:49 nM:i:0 +HWI-C00113:131:HMHYWADXX:2:2111:14281:81135 272 1 15007 0 32M757N19M * 0 0 CATCAAGTGCCCACCTTGGCTCGTGGCTCTCACTTGCTCCTGCTCCTTCTG @@@DDDBD42=:ACFFIE?FFGAFF@FFFDGEAG>D@DBB9BC3D@EDFFA NH:i:8 HI:i:2 AS:i:49 nM:i:0 +HWI-C00113:131:HMHYWADXX:1:2203:4824:93169 272 1 15008 0 31M757N20M * 0 0 ATCAAGTGCCCACCTTGGCTCGTGGCTCTCACTTGCTCCTGCTCCTTCTGC CCCFFFFFHHHHHJJJJIJJJJHIJIJJJJJJJJGIJJJJI?DFGFHIHJF NH:i:8 HI:i:2 AS:i:49 nM:i:0 +HWI-C00113:131:HMHYWADXX:1:1112:17298:87937 272 1 15925 1 23M659N28M * 0 0 CACTTCCCTGGGAGCTCCCTGGACTGAAGGAGACGCGCTGCTGCTGCTGTC ?@;;BA;3ABC?C?6EGDGIIBA+AACACD>>:9:??2< NH:i:6 HI:i:2 AS:i:49 nM:i:0 +HWI-C00113:131:HMHYWADXX:2:2109:14386:93817 272 1 16728 0 38M92N13M * 0 0 GGGCGGTGGGGGTGGTGTTAGTACCCCATCTTGTAGGTCTTGAGAGGCTCG @CCFFFDDHHHHDHIFHIJJJGHHIIJHHHHHHFFFFEFEEEECDDDDDDB NH:i:6 HI:i:2 AS:i:49 nM:i:0 +HWI-C00113:131:HMHYWADXX:1:2203:14322:7218 272 1 16741 0 25M110N26M * 0 0 GGTGTTAGTACCCCATCTTGTAGGTCTCAGTGTGGAAGGTGGGCAGTTCTG ?@?DDD?BFFHHFB7EFGGGEFHIHADB8D>822BDG?FHBGEH?FHGG3 NH:i:6 HI:i:1 AS:i:49 nM:i:0 +HWI-C00113:131:HMHYWADXX:1:2116:7403:96086 272 1 17020 0 36M177N15M * 0 0 GCCCAGGTCTGGCACATAGAAGTAGTTCTCTGGGACCTGCTGTTCCAGCTG :?=DDD=AAAC:+BDIIIIIIA? NH:i:7 HI:i:5 AS:i:49 nM:i:0 +HWI-C00113:131:HMHYWADXX:2:1209:11002:81132 272 1 17020 0 36M177N15M * 0 0 GCCCGGGTCTGGCACATAGAAGTAGTTCTCTGGGACCTGCTGTTCCAGCTG @@@DD@A<@DDDF;BCGF<4CHCEG?EG@FGF9)?BB:?B?DBF>D?**9B NH:i:7 HI:i:4 AS:i:47 nM:i:1 +HWI-C00113:131:HMHYWADXX:2:1115:8064:78307 272 1 17021 0 35M177N16M * 0 0 CCCTGGTCTGGCACATAGAAGTAGTTCTCTGGGACCTGCTGTTCCAGCTGC 11844BBDD=FDFEFFFDFI?HEHAFBEHEEEFC?E:FDGDDDH9CBEHHHEEFB?F>GD@3?FB?BB@ NH:i:7 HI:i:6 AS:i:49 nM:i:0 +HWI-C00113:131:HMHYWADXX:2:1101:15891:42282 272 1 17028 0 28M177N23M * 0 0 CTGGCACATAGAAGTAGTTCTCTGGGACCTGCTGTTCCAGCTGCTCTCTCT CCCFFFFFHHHHHJHHIIJJJJJJJJJJIIJJJJIJJJIJJJJJJJJJJJJ NH:i:7 HI:i:6 AS:i:49 nM:i:0 +HWI-C00113:131:HMHYWADXX:1:1107:10929:6659 272 1 17030 0 26M177N25M * 0 0 GGCACATAGAAGTAGTTCTCTGGGACCTGCTGTTCCAGCTGCTCTCTCTTG CCCFFFFFHHHHDHIHHJJGJJJJJJJIJJIJGIJJJIJJJIJJJJJIJJG NH:i:7 HI:i:6 AS:i:49 nM:i:0 +HWI-C00113:131:HMHYWADXX:1:1114:7098:71178 272 1 17030 0 26M177N25M * 0 0 GGCACATAGAAGTAGTTCTCTGGGACCTGCTGTTCCAGCTGCTCTCTCTTG =?@BDEEFHBDHFBGIEGIHEHIGDHGEIIJIIIEHIHIIGHDGHIGIIH@ NH:i:7 HI:i:6 AS:i:49 nM:i:0 +HWI-C00113:131:HMHYWADXX:1:1209:3383:100724 272 1 17030 0 26M177N25M * 0 0 GGCACATAGAAGTAGTTCTCTGGGACCTGCTGTTCCAGCTGCTCTCTCTTG ?@@ADDDDDHDH?EEFHEHIGIIGHGHIFII>BFIH? NH:i:7 HI:i:6 AS:i:49 nM:i:0 +HWI-C00113:131:HMHYWADXX:1:2111:3771:31345 272 1 17030 0 26M177N25M * 0 0 GGCACATAGAAGTAGTTCTCTGGGACCTGCTGTTCCAGCTGCTCTCTCTTG @@@DFFFFGHDHHHJGIHJJJJGIJJIJIJIIJJIIJJIGHIJJJIJJIJ< NH:i:7 HI:i:6 AS:i:49 nM:i:0 +HWI-C00113:131:HMHYWADXX:1:2205:14794:36455 272 1 17030 0 26M177N25M * 0 0 GGCACATAGAAGTAGTTCTCTGGGACCTGCTGTTCCAGCTGCTCTCTCTTG CCCFFFFFHHHHHIJJJJJJJJJJJJJJJJJJIJJJJIJJIJJJJJJJJJJ NH:i:7 HI:i:6 AS:i:49 nM:i:0 +HWI-C00113:131:HMHYWADXX:2:1107:19701:64552 272 1 17030 0 26M177N25M * 0 0 GGCACATAGAAGTAGTTCTCTGGGACCTGCTGTTCCAGCTGCTCTCTCTTG CCCFFDFFHHHHDGIIJIJJJIIJDGHGJJJJJJIJJJJJJJGIJJJJJJF NH:i:7 HI:i:6 AS:i:49 nM:i:0 +HWI-C00113:131:HMHYWADXX:2:1210:18711:88303 272 1 17030 0 26M177N25M * 0 0 GGCACATAGAAGTAGTTCTCTGGGACCTGCTGTTCCAGCTGCTCTCTCTTG CCCFFFFFHHHHHJJJJJJJJIJFIJJJEIIHIIJJIIJJGJJJIJJJJJE NH:i:7 HI:i:6 AS:i:49 nM:i:0 +HWI-C00113:131:HMHYWADXX:2:2212:19113:15559 272 1 17030 0 26M177N25M * 0 0 GGCACATAGAAGTAGTTCTCTGGGACCTGCTGTTCCAGCTGCTCTCTCTTG @@@7B>DDC=@BFBGBAFCGBFDB@DHIHIDD>@@GHID NH:i:7 HI:i:6 AS:i:49 nM:i:0 +HWI-C00113:131:HMHYWADXX:2:1212:15591:47491 272 1 17031 0 25M177N26M * 0 0 GCACATAGAAGTAGTTCTCTGGGACCTGCTGTTCCAGCTGCTCTCTCTTGC @@C+ADDDDHFFDEGEGIIIDFHIFHIIIIIGEHIIBH>FGGGHGHFGGII NH:i:7 HI:i:6 AS:i:49 nM:i:0 +HWI-C00113:131:HMHYWADXX:2:2215:10125:81395 272 1 17031 0 25M859N26M * 0 0 GCACATAGAAGTAGTTCTCTGGGACCTGCAGGGCCCGCTCGTCCAGGGGGC CCCFFFFFGHHHHJJJJJJJJJHJJJJJJIJIIJJJHIJJJJJJJJJIJHE NH:i:6 HI:i:1 AS:i:49 nM:i:0 +HWI-C00113:131:HMHYWADXX:1:2102:9065:90529 16 1 17033 0 2S23M550N26M * 0 0 GTACATAGAAGTAGTTCTCTGGGACAGGTTCTCGGTGGTGTTGAAGAGCAG C@CFFFFFHHHHHJJJJJJJJJJJJJJJJJJJJJJFHIFHIJIJJJJJJJJ NH:i:5 HI:i:2 AS:i:47 nM:i:0 +HWI-C00113:131:HMHYWADXX:1:2204:7767:77376 16 1 17033 0 2S23M550N26M * 0 0 GTACATAGAAGTAGTTCTCTGGGACAGGTTCTCGGTGGTGTTGAAGAGCAG @@@FDFDDBFHADEHEIGIGIJIGHIHG?EDGHGGCFH:B?BD@FGFHGIH NH:i:5 HI:i:2 AS:i:47 nM:i:0 +HWI-C00113:131:HMHYWADXX:2:1212:6793:42000 16 1 17033 0 2S23M550N26M * 0 0 GTACATAGAAGTAGTTCTCTGGGACAGGTTCTCGGTGGTGTTGAAGAGCAG @@?DADBD8CFADGFHIIIIE3A9?DH?FHGHH@EHGIEHGGIIIGGHIGHGFDEHGH=FHGIIH NH:i:3 HI:i:1 AS:i:49 nM:i:0 +HWI-C00113:131:HMHYWADXX:2:2207:3786:78354 16 1 17340 1 29M237N22M * 0 0 GGGGGTCCAGGAAGACATACTTCTTCTACAGGTTCTCGGTGGTGTTGAAGA CCCFFFFFHHHHHJJJJJIIJJJJJJJJJJJJHHIJIHHBFIHIIJJJJJI NH:i:3 HI:i:1 AS:i:49 nM:i:0 +HWI-C00113:131:HMHYWADXX:1:1115:8438:81914 16 1 17341 1 28M237N23M * 0 0 GGGGTCCAGGAAGACATACTTCTTCTACAGGTTCTCGGTGGTGTTGAAGAG @@CFFFFDHH?HDGGHIIGIGHIGHGIDIIIFGIIGHHDG:?DFHEHIIII NH:i:3 HI:i:1 AS:i:49 nM:i:0 +HWI-C00113:131:HMHYWADXX:2:1114:13486:49038 16 1 17341 1 28M237N23M * 0 0 GGGGTCCAGGAAGACATACTTCTTCTACAGGTTCTCGGTGGTGTTGAAGAG ?@@:D@DDFHAFFHGGFHFHH@CCHIIIII@:CFGFGGC?D)?8DHHGCGI NH:i:3 HI:i:1 AS:i:49 nM:i:0 diff --git a/tests/python_flagstat.py b/tests/python_flagstat.py deleted file mode 100644 index b14e52d..0000000 --- a/tests/python_flagstat.py +++ /dev/null @@ -1,11 +0,0 @@ -import pysam - -is_paired = 0 -is_proper = 0 - -for read in pysam.AlignmentFile("ex1.bam", "rb"): - is_paired += read.is_paired - is_proper += read.is_proper_pair - -print ("there are alignments of %i paired reads" % is_paired) -print ("there are %i proper paired alignments" % is_proper) diff --git a/tests/samtools_test.py b/tests/samtools_test.py index d5b2791..aa4c554 100644 --- a/tests/samtools_test.py +++ b/tests/samtools_test.py @@ -71,6 +71,7 @@ class SamtoolsTest(unittest.TestCase): # an output file. statements = [ "view ex1.bam > %(out)s_ex1.view", + "view -c ex1.bam > %(out)s_ex1.count", # ("view -bT ex1.fa -o %(out)s_ex1.view2 ex1.sam", "sort ex1.bam -o %(out)s_ex1.sort.bam", "mpileup ex1.bam > %(out)s_ex1.pileup", -- 2.30.2