From: Michael R. Crusoe Date: Tue, 6 Aug 2019 17:37:29 +0000 (+0200) Subject: New upstream version 0.15.3+ds X-Git-Tag: archive/raspbian/0.22.0+ds-1+rpi1~1^2^2~12^2~7 X-Git-Url: https://dgit.raspbian.org/?a=commitdiff_plain;h=169a628899d883a30944ece2535f066881671cf4;p=python-pysam.git New upstream version 0.15.3+ds --- diff --git a/.gitignore b/.gitignore index fd28d6d..b07a532 100644 --- a/.gitignore +++ b/.gitignore @@ -17,11 +17,13 @@ tests/cbcf_data tests/tabix_data samtools/config.h -bcftools/config.g +bcftools/config.h htslib/config.status htslib/config.h htslib/config.log htslib/config.mk +htslib/htslib.pc.tmp +htslib/htslib-uninstalled.pc pysam/config.py # linking tests diff --git a/.travis.yml b/.travis.yml index 3642281..b30d9b8 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,5 +1,5 @@ os: - - linx + - linux - osx language: c @@ -7,11 +7,58 @@ language: c env: matrix: - CONDA_PY=2.7 - - CONDA_PY=3.4 - - CONDA_PY=3.5 - CONDA_PY=3.6 + - CONDA_PY=3.7 global: - PYSAM_LINKING_TEST=1 + - TWINE_USERNAME=grepall + - secure: 'OcwwP8/o21+SGW0UVAnnCQwllhGSCq2HJzpI9EhX3kh6J9RTkyx/+drkg45bx1Z5u8zymuAFappEYzlpzqZE886XezkjOYGVa/u+Coqr1oT/BEJHFCkCA4o26yESp7Zy8aNj/juhB7Rfa77pIDXBayqTzbALz/AURMtZapasB18=' + +_deploy_common: &deploy_common + if: tag IS present + install: + - python3 -m pip install cibuildwheel twine + +matrix: + include: + - stage: deploy + os: linux + language: python + python: '3.5' + services: + - docker + env: + - CIBW_BEFORE_BUILD="yum install -y zlib-devel bzip2-devel xz-devel && pip install -r requirements.txt" + - CIBW_ENVIRONMENT='HTSLIB_CONFIGURE_OPTIONS="--disable-libcurl"' + addons: + apt: + packages: + - gcc + - g++ + - libcurl4-openssl-dev # for libcurl support in sdist + - libssl-dev # for s3 support in sdist + <<: *deploy_common + script: + - set -e + - cibuildwheel --output-dir dist + - python3 -m pip install Cython + - python3 setup.py build_ext --inplace + - python3 setup.py sdist + - twine check dist/* + - twine upload --skip-existing dist/* + - stage: deploy + os: osx + language: generic + env: + - CIBW_BEFORE_BUILD="pip install -r requirements.txt" + - CIBW_ENVIRONMENT='HTSLIB_CONFIGURE_OPTIONS="--disable-libcurl"' + addons: {} + <<: *deploy_common + script: + - set -e + - cibuildwheel --output-dir dist + - twine check dist/* + - twine upload --skip-existing dist/* addons: apt: diff --git a/NEWS b/NEWS index 4861555..49ce485 100644 --- a/NEWS +++ b/NEWS @@ -5,6 +5,26 @@ http://pysam.readthedocs.io/en/latest/release.html Release notes ============= +Release 0.15.3 +============== + +Bugfix release. + +* [#824] allow reading of UTF-8 encoded text in VCF/BCF files. +* [#780] close all filehandles before opening new ones in pysam_dispatch +* [#773] do not cache VariantRecord.id to avoid memory leak +* [#781] default of multiple_iterators=True is changed to False for + CRAM files. +* [#825] fix collections.abc import +* [#825] use bcf_hdr_format instead of bcf_hdr_fmt_text, fix memcpy + bug when setting FORMAT fields. +* [#804] Use HTSlib's kstring_t, which reallocates and enlarges its + memory as needed, rather than a fixed-size char buffer. +* [#814] Build wheels and upload them to PyPI +* [#755] Allow passing flags and arguments to index methods +* [#763] Strip \0 in header check +* [#761] Test Tabix index contents, not the compression + Release 0.15.2 ============== diff --git a/README.rst b/README.rst index 2005eb6..f8a641d 100644 --- a/README.rst +++ b/README.rst @@ -25,6 +25,8 @@ as it resolves non-python dependencies and uses pre-configured compilation options. Especially for OS X this will potentially save a lot of trouble. +The current version of pysam wraps 3rd-party code from htslib-1.9, samtools-1.9, and bcftools-1.9. + Pysam is available through `pypi `_. To install, type:: diff --git a/bcftools/bcftools.pysam.c b/bcftools/bcftools.pysam.c index e3e251e..de8739d 100644 --- a/bcftools/bcftools.pysam.c +++ b/bcftools/bcftools.pysam.c @@ -10,7 +10,6 @@ FILE * bcftools_stderr = NULL; FILE * bcftools_stdout = NULL; const char * bcftools_stdout_fn = NULL; -int bcftools_stdout_fileno = STDOUT_FILENO; FILE * bcftools_set_stderr(int fd) @@ -21,11 +20,10 @@ FILE * bcftools_set_stderr(int fd) return bcftools_stderr; } -void bcftools_unset_stderr(void) +void bcftools_close_stderr(void) { - if (bcftools_stderr != NULL) - fclose(bcftools_stderr); - bcftools_stderr = fopen("/dev/null", "w"); + fclose(bcftools_stderr); + bcftools_stderr = NULL; } FILE * bcftools_set_stdout(int fd) @@ -37,7 +35,6 @@ FILE * bcftools_set_stdout(int fd) { fprintf(bcftools_stderr, "could not set stdout to fd %i", fd); } - bcftools_stdout_fileno = fd; return bcftools_stdout; } @@ -46,12 +43,10 @@ void bcftools_set_stdout_fn(const char *fn) bcftools_stdout_fn = fn; } -void bcftools_unset_stdout(void) +void bcftools_close_stdout(void) { - if (bcftools_stdout != NULL) - fclose(bcftools_stdout); - bcftools_stdout = fopen("/dev/null", "w"); - bcftools_stdout_fileno = STDOUT_FILENO; + fclose(bcftools_stdout); + bcftools_stdout = NULL; } int bcftools_puts(const char *s) diff --git a/bcftools/bcftools.pysam.h b/bcftools/bcftools.pysam.h index 92ab370..453567a 100644 --- a/bcftools/bcftools.pysam.h +++ b/bcftools/bcftools.pysam.h @@ -1,7 +1,7 @@ -#ifndef PYSAM_H -#define PYSAM_H +#ifndef bcftools_PYSAM_H +#define bcftools_PYSAM_H -#include "stdio.h" +#include extern FILE * bcftools_stderr; @@ -17,7 +17,7 @@ FILE * bcftools_set_stderr(int fd); /*! set pysam standard output to point to file descriptor - Setting the stderr will close the previous stdout. + Setting the stdout will close the previous stdout. */ FILE * bcftools_set_stdout(int fd); @@ -26,17 +26,15 @@ FILE * bcftools_set_stdout(int fd); */ void bcftools_set_stdout_fn(const char * fn); -/*! set pysam standard error to /dev/null. +/*! close pysam standard error and set to NULL - Unsetting the stderr will close the previous stderr. */ -void bcftools_unset_stderr(void); +void bcftools_close_stderr(void); -/*! set pysam standard error to /dev/null. +/*! close pysam standard output and set to NULL - Unsetting the stderr will close the previous stderr. */ -void bcftools_unset_stdout(void); +void bcftools_close_stdout(void); int bcftools_puts(const char *s); diff --git a/devtools/buildwheels.sh b/devtools/buildwheels.sh index ae0d953..bf71bf9 100755 --- a/devtools/buildwheels.sh +++ b/devtools/buildwheels.sh @@ -5,6 +5,10 @@ # # It is best to run this in a fresh clone of the repository! # +# Before running, make sure to update image: +# +# docker pull quay.io/pypa/manylinux1_x86_64 +# # Run this within the repository root: # docker run --rm -v $(pwd):/io quay.io/pypa/manylinux1_x86_64 /io/buildwheels.sh # @@ -24,13 +28,6 @@ fi yum install -y zlib-devel bzip2-devel xz-devel -# Python 2.6 is not supported -rm -r /opt/python/cp26* - -# Python 3.3 builds fail with: -# /opt/rh/devtoolset-2/root/usr/libexec/gcc/x86_64-CentOS-linux/4.8.2/ld: cannot find -lchtslib -rm -r /opt/python/cp33* - # Without libcurl support, htslib can open files from HTTP and FTP URLs. # With libcurl support, it also supports HTTPS and S3 URLs, but libcurl needs a # current version of OpenSSL, and we do not want to be responsible for diff --git a/devtools/import.py b/devtools/import.py index 44e606a..7430cc5 100644 --- a/devtools/import.py +++ b/devtools/import.py @@ -94,16 +94,16 @@ def _update_pysam_files(cf, destdir): outfile.write('#include "{}.pysam.h"\n\n'.format(basename)) subname, _ = os.path.splitext(os.path.basename(filename)) if subname in MAIN.get(basename, []): - lines = re.sub("int main\(", "int {}_main(".format( + lines = re.sub(r"int main\(", "int {}_main(".format( basename), lines) else: - lines = re.sub("int main\(", "int {}_{}_main(".format( + lines = re.sub(r"int main\(", "int {}_{}_main(".format( basename, subname), lines) lines = re.sub("stderr", "{}_stderr".format(basename), lines) lines = re.sub("stdout", "{}_stdout".format(basename), lines) - lines = re.sub(" printf\(", " fprintf({}_stdout, ".format(basename), lines) - lines = re.sub("([^kf])puts\(", r"\1{}_puts(".format(basename), lines) - lines = re.sub("putchar\(([^)]+)\)", + lines = re.sub(r" printf\(", " fprintf({}_stdout, ".format(basename), lines) + lines = re.sub(r"([^kf])puts\(", r"\1{}_puts(".format(basename), lines) + lines = re.sub(r"putchar\(([^)]+)\)", r"fputc(\1, {}_stdout)".format(basename), lines) fn = os.path.basename(filename) diff --git a/devtools/run_tests_travis.sh b/devtools/run_tests_travis.sh index 89988bb..b20c8a0 100755 --- a/devtools/run_tests_travis.sh +++ b/devtools/run_tests_travis.sh @@ -31,30 +31,33 @@ bash Miniconda3.sh -b # activate testenv environment source ~/miniconda3/bin/activate testenv -conda config --add channels r conda config --add channels defaults -conda config --add channels conda-forge conda config --add channels bioconda +conda config --add channels conda-forge # pin versions, so that tests do not fail when pysam/htslib out of step # add htslib dependencies -conda install -y "samtools=1.7" "bcftools=1.6" "htslib=1.7" xz curl bzip2 +# NB: we force conda-forge:ncurses due to bioconda/bioconda-recipes#13488 +conda install -y "samtools=1.9" "bcftools=1.9" "htslib=1.9" xz curl bzip2 conda-forge:ncurses # Need to make C compiler and linker use the anaconda includes and libraries: export PREFIX=~/miniconda3/ export CFLAGS="-I${PREFIX}/include -L${PREFIX}/lib" -export HTSLIB_CONFIGURE_OPTIONS="--disable-libcurl --disable-lzma" +export HTSLIB_CONFIGURE_OPTIONS="--disable-libcurl" +echo "show samtools, htslib, and bcftools versions" samtools --version -htslib --version +htsfile --version bcftools --version # Try building conda recipe first ~/miniconda3/bin/conda-build devtools/conda-recipe/ --python=$CONDA_PY # install code from the repository via setup.py -echo "installing via setup.py from repository" -python setup.py install +echo +echo "============ installing via setup.py from repository ============" +echo +python setup.py install || exit # create auxilliary data echo diff --git a/doc/api.rst b/doc/api.rst index d2a9a80..3f2c042 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -142,6 +142,9 @@ BAM/SAM formatted files. .. autoclass:: pysam.AlignmentFile :members: +.. autoclass:: pysam.AlignmentHeader + :members: + An :class:`~pysam.AlignedSegment` represents an aligned segment within a SAM/BAM file. diff --git a/doc/glossary.rst b/doc/glossary.rst index de3e032..4e9fa57 100644 --- a/doc/glossary.rst +++ b/doc/glossary.rst @@ -24,7 +24,7 @@ Glossary are 0-based, half-open intervals, i.e., the position 10,000 is part of the interval, but 20,000 is not. An exception are :term:`samtools` compatible region strings such as - 'chr1:10000:20000', which are closed, i.e., both positions 10,000 + 'chr1:10000-20000', which are closed, i.e., both positions 10,000 and 20,000 are part of the interval. column diff --git a/doc/release.rst b/doc/release.rst index 5adc437..ee1875b 100644 --- a/doc/release.rst +++ b/doc/release.rst @@ -2,7 +2,27 @@ Release notes ============= -Release 0.15.1 +Release 0.15.3 +============== + +Bugfix release. + +* [#824] allow reading of UTF-8 encoded text in VCF/BCF files. +* [#780] close all filehandles before opening new ones in pysam_dispatch +* [#773] do not cache VariantRecord.id to avoid memory leak +* [#781] default of multiple_iterators=True is changed to False for + CRAM files. +* [#825] fix collections.abc import +* [#825] use bcf_hdr_format instead of bcf_hdr_fmt_text, fix memcpy + bug when setting FORMAT fields. +* [#804] Use HTSlib's kstring_t, which reallocates and enlarges its + memory as needed, rather than a fixed-size char buffer. +* [#814] Build wheels and upload them to PyPI +* [#755] Allow passing flags and arguments to index methods +* [#763] Strip \0 in header check +* [#761] Test Tabix index contents, not the compression + +Release 0.15.2 ============== Bugfix release. diff --git a/doc/usage.rst b/doc/usage.rst index 6172329..f4b7498 100644 --- a/doc/usage.rst +++ b/doc/usage.rst @@ -189,7 +189,7 @@ region can be retrieved by calling :meth:`~pysam.TabixFile.fetch()`:: print (str(row)) This will return a tuple-like data structure in which columns can -be retrieved by numeric index: +be retrieved by numeric index:: for row in tbx.fetch("chr1", 1000, 2000): print ("chromosome is", row[0]) @@ -385,7 +385,7 @@ flagstat command and consists of three files: 2. The cython implementation :file:`_pysam_flagstat.pyx`. This script imports the pysam API via:: - from pysam.calignmentfile cimport AlignmentFile, AlignedSegment + from pysam.libcalignmentfile cimport AlignmentFile, AlignedSegment This statement imports, amongst others, :class:`AlignedSegment` into the namespace. Speed can be gained from declaring diff --git a/import/pysam.c b/import/pysam.c index d08cf50..5692622 100644 --- a/import/pysam.c +++ b/import/pysam.c @@ -10,7 +10,6 @@ FILE * @pysam@_stderr = NULL; FILE * @pysam@_stdout = NULL; const char * @pysam@_stdout_fn = NULL; -int @pysam@_stdout_fileno = STDOUT_FILENO; FILE * @pysam@_set_stderr(int fd) @@ -21,11 +20,10 @@ FILE * @pysam@_set_stderr(int fd) return @pysam@_stderr; } -void @pysam@_unset_stderr(void) +void @pysam@_close_stderr(void) { - if (@pysam@_stderr != NULL) - fclose(@pysam@_stderr); - @pysam@_stderr = fopen("/dev/null", "w"); + fclose(@pysam@_stderr); + @pysam@_stderr = NULL; } FILE * @pysam@_set_stdout(int fd) @@ -37,7 +35,6 @@ FILE * @pysam@_set_stdout(int fd) { fprintf(@pysam@_stderr, "could not set stdout to fd %i", fd); } - @pysam@_stdout_fileno = fd; return @pysam@_stdout; } @@ -46,12 +43,10 @@ void @pysam@_set_stdout_fn(const char *fn) @pysam@_stdout_fn = fn; } -void @pysam@_unset_stdout(void) +void @pysam@_close_stdout(void) { - if (@pysam@_stdout != NULL) - fclose(@pysam@_stdout); - @pysam@_stdout = fopen("/dev/null", "w"); - @pysam@_stdout_fileno = STDOUT_FILENO; + fclose(@pysam@_stdout); + @pysam@_stdout = NULL; } int @pysam@_puts(const char *s) diff --git a/import/pysam.h b/import/pysam.h index c3cc623..6abb884 100644 --- a/import/pysam.h +++ b/import/pysam.h @@ -1,7 +1,7 @@ -#ifndef PYSAM_H -#define PYSAM_H +#ifndef @pysam@_PYSAM_H +#define @pysam@_PYSAM_H -#include "stdio.h" +#include extern FILE * @pysam@_stderr; @@ -17,7 +17,7 @@ FILE * @pysam@_set_stderr(int fd); /*! set pysam standard output to point to file descriptor - Setting the stderr will close the previous stdout. + Setting the stdout will close the previous stdout. */ FILE * @pysam@_set_stdout(int fd); @@ -26,17 +26,15 @@ FILE * @pysam@_set_stdout(int fd); */ void @pysam@_set_stdout_fn(const char * fn); -/*! set pysam standard error to /dev/null. +/*! close pysam standard error and set to NULL - Unsetting the stderr will close the previous stderr. */ -void @pysam@_unset_stderr(void); +void @pysam@_close_stderr(void); -/*! set pysam standard error to /dev/null. +/*! close pysam standard output and set to NULL - Unsetting the stderr will close the previous stderr. */ -void @pysam@_unset_stdout(void); +void @pysam@_close_stdout(void); int @pysam@_puts(const char *s); diff --git a/pysam/htslib_util.h b/pysam/htslib_util.h index 25bd3e4..33c46c9 100644 --- a/pysam/htslib_util.h +++ b/pysam/htslib_util.h @@ -18,20 +18,6 @@ typedef khash_t(vdict) vdict_t; KHASH_DECLARE(s2i, kh_cstr_t, int64_t) typedef khash_t(s2i) s2i_t; -////////////////////////////////////////////////////////////////// -/*! set pysam standard error to point to file descriptor - - Setting the stderr will close the previous stderr. - */ -// FILE * pysam_set_stderr(int fd); - -////////////////////////////////////////////////////////////////// -/*! set pysam standard error to /dev/null. - - Unsetting the stderr will close the previous stderr. - */ -// void pysam_unset_stderr(); - ////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////// diff --git a/pysam/libcalignedsegment.pxd b/pysam/libcalignedsegment.pxd index 48ca93f..c964160 100644 --- a/pysam/libcalignedsegment.pxd +++ b/pysam/libcalignedsegment.pxd @@ -70,7 +70,7 @@ cdef class PileupColumn: cdef int n_pu cdef AlignmentHeader header cdef uint32_t min_base_quality - cdef uint8_t * buf + cdef kstring_t buf cdef char * reference_sequence cdef class PileupRead: diff --git a/pysam/libcalignedsegment.pyx b/pysam/libcalignedsegment.pyx index 7ad61ac..02ef5b6 100644 --- a/pysam/libcalignedsegment.pyx +++ b/pysam/libcalignedsegment.pyx @@ -69,7 +69,6 @@ from cpython cimport array as c_array from libc.stdint cimport INT8_MIN, INT16_MIN, INT32_MIN, \ INT8_MAX, INT16_MAX, INT32_MAX, \ UINT8_MAX, UINT16_MAX, UINT32_MAX -from libc.stdio cimport snprintf from pysam.libchtslib cimport HTS_IDX_NOCOOR from pysam.libcutils cimport force_bytes, force_str, \ @@ -89,9 +88,6 @@ cdef bint IS_PYTHON3 = PY_MAJOR_VERSION >= 3 cdef char* CODE2CIGAR= "MIDNSHP=XB" cdef int NCIGAR_CODES = 10 -# dimensioned for 8000 pileup limit (+ insertions/deletions) -cdef uint32_t MAX_PILEUP_BUFFER_SIZE = 10000 - if IS_PYTHON3: CIGAR2CODE = dict([y, x] for x, y in enumerate(CODE2CIGAR)) maketrans = str.maketrans @@ -632,9 +628,8 @@ cdef PileupColumn makePileupColumn(bam_pileup1_t ** plp, dest.n_pu = n_pu dest.min_base_quality = min_base_quality dest.reference_sequence = reference_sequence - dest.buf = calloc(MAX_PILEUP_BUFFER_SIZE, sizeof(uint8_t)) - if dest.buf == NULL: - raise MemoryError("could not allocate pileup buffer") + dest.buf.l = dest.buf.m = 0 + dest.buf.s = NULL return dest @@ -1853,10 +1848,9 @@ cdef class AlignedSegment: return s def get_forward_qualities(self): - """return the original read sequence. + """return base qualities of the read sequence. - Reads mapping to the reverse strand will be reverse - complemented. + Reads mapping to the reverse strand will be reversed. """ if self.is_reverse: return self.query_qualities[::-1] @@ -2799,8 +2793,7 @@ cdef class PileupColumn: "\n".join(map(str, self.pileups)) def __dealloc__(self): - if self.buf is not NULL: - free(self.buf) + free(self.buf.s) def set_min_base_quality(self, min_base_quality): """set the minimum base quality for this pileup column. @@ -2971,15 +2964,16 @@ cdef class PileupColumn: cdef uint32_t x = 0 cdef uint32_t j = 0 cdef uint32_t c = 0 - cdef uint32_t n = 0 cdef uint8_t cc = 0 cdef uint8_t rb = 0 - cdef uint8_t * buf = self.buf + cdef kstring_t * buf = &self.buf cdef bam_pileup1_t * p = NULL if self.plp == NULL or self.plp[0] == NULL: raise ValueError("PileupColumn accessed after iterator finished") + buf.l = 0 + # todo: reference sequence to count matches/mismatches # todo: convert assertions to exceptions for x from 0 <= x < self.n_pu: @@ -2992,16 +2986,12 @@ cdef class PileupColumn: continue # see samtools pileup_seq if mark_ends and p.is_head: - buf[n] = '^' - n += 1 - assert n < MAX_PILEUP_BUFFER_SIZE + kputc('^', buf) if p.b.core.qual > 93: - buf[n] = 126 + kputc(126, buf) else: - buf[n] = p.b.core.qual + 33 - n += 1 - assert n < MAX_PILEUP_BUFFER_SIZE + kputc(p.b.core.qual + 33, buf) if not p.is_del: if p.qpos < p.b.core.l_qseq: cc = seq_nt16_str[bam_seqi(bam_get_seq(p.b), p.qpos)] @@ -3012,68 +3002,44 @@ cdef class PileupColumn: rb = self.reference_sequence[self.reference_pos] if seq_nt16_table[cc] == seq_nt16_table[rb]: cc = "=" - buf[n] = strand_mark_char(cc, p.b) - n += 1 - assert n < MAX_PILEUP_BUFFER_SIZE + kputc(strand_mark_char(cc, p.b), buf) elif add_indels: if p.is_refskip: if bam_is_rev(p.b): - buf[n] = '<' + kputc('<', buf) else: - buf[n] = '>' + kputc('>', buf) else: - buf[n] = '*' - n += 1 - assert n < MAX_PILEUP_BUFFER_SIZE + kputc('*', buf) if add_indels: if p.indel > 0: - buf[n] = '+' - n += 1 - assert n < MAX_PILEUP_BUFFER_SIZE - n += snprintf(&(buf[n]), - MAX_PILEUP_BUFFER_SIZE - n, - "%i", - p.indel) - assert n < MAX_PILEUP_BUFFER_SIZE + kputc('+', buf) + kputw(p.indel, buf) for j from 1 <= j <= p.indel: cc = seq_nt16_str[bam_seqi(bam_get_seq(p.b), p.qpos + j)] - buf[n] = strand_mark_char(cc, p.b) - n += 1 - assert n < MAX_PILEUP_BUFFER_SIZE + kputc(strand_mark_char(cc, p.b), buf) elif p.indel < 0: - buf[n] = '-' - n += 1 - assert n < MAX_PILEUP_BUFFER_SIZE - n += snprintf(&(buf[n]), - MAX_PILEUP_BUFFER_SIZE - n, - "%i", - -p.indel) - assert n < MAX_PILEUP_BUFFER_SIZE + kputc('-', buf) + kputw(-p.indel, buf) for j from 1 <= j <= -p.indel: # TODO: out-of-range check here? if self.reference_sequence == NULL: cc = 'N' else: cc = self.reference_sequence[self.reference_pos + j] - buf[n] = strand_mark_char(cc, p.b) - n += 1 - assert n < MAX_PILEUP_BUFFER_SIZE + kputc(strand_mark_char(cc, p.b), buf) if mark_ends and p.is_tail: - buf[n] = '$' - n += 1 - assert n < MAX_PILEUP_BUFFER_SIZE + kputc('$', buf) - buf[n] = ':' - n += 1 - assert n < MAX_PILEUP_BUFFER_SIZE + kputc(':', buf) - if n == 0: + if buf.l == 0: # could be zero if all qualities are too low return "" else: # quicker to ensemble all and split than to encode all separately. # ignore last ":" - return force_str(PyBytes_FromStringAndSize(buf, n-1)).split(":") + return force_str(PyBytes_FromStringAndSize(buf.s, buf.l-1)).split(":") def get_query_qualities(self): """query base quality scores at pileup column position. diff --git a/pysam/libcalignmentfile.pyx b/pysam/libcalignmentfile.pyx index 4030868..d35b0db 100644 --- a/pysam/libcalignmentfile.pyx +++ b/pysam/libcalignmentfile.pyx @@ -56,6 +56,10 @@ ######################################################## import os import collections +try: + from collections.abc import Sequence, Mapping # noqa +except ImportError: + from collections import Sequence, Mapping # noqa import re import warnings import array @@ -99,11 +103,11 @@ IndexStats = collections.namedtuple("IndexStats", cdef int MAX_POS = (1 << 31) - 1 # valid types for SAM headers -VALID_HEADER_TYPES = {"HD" : collections.Mapping, - "SQ" : collections.Sequence, - "RG" : collections.Sequence, - "PG" : collections.Sequence, - "CO" : collections.Sequence} +VALID_HEADER_TYPES = {"HD" : Mapping, + "SQ" : Sequence, + "RG" : Sequence, + "PG" : Sequence, + "CO" : Sequence} # order of records within SAM headers VALID_HEADERS = ("HD", "SQ", "RG", "PG", "CO") @@ -293,7 +297,7 @@ cdef class AlignmentHeader(object): raise ValueError( "invalid type for record %s: %s, expected %s".format( record, type(data), VALID_HEADER_TYPES[record])) - if isinstance(data, collections.Mapping): + if isinstance(data, Mapping): lines.append(build_header_line(data, record)) else: for fields in header_dict[record]: @@ -303,7 +307,7 @@ cdef class AlignmentHeader(object): for record, data in sorted(header_dict.items()): if record in VALID_HEADERS: continue - if isinstance(data, collections.Mapping): + if isinstance(data, Mapping): lines.append(build_header_line(data, record)) else: for fields in header_dict[record]: @@ -346,7 +350,7 @@ cdef class AlignmentHeader(object): return makeAlignmentHeader(bam_hdr_dup(self.ptr)) property nreferences: - """"int with the number of :term:`reference` sequences in the file. + """int with the number of :term:`reference` sequences in the file. This is a read-only attribute.""" def __get__(self): @@ -418,7 +422,8 @@ cdef class AlignmentHeader(object): # convert to python string t = self.__str__() for line in t.split("\n"): - if not line.strip(): + line = line.strip(' \0') + if not line: continue assert line.startswith("@"), \ "header line without '@': '%s'" % line @@ -454,13 +459,13 @@ cdef class AlignmentHeader(object): # interpret type of known header record tags, default to str x[key] = KNOWN_HEADER_FIELDS[record].get(key, str)(value) - if VALID_HEADER_TYPES[record] == collections.Mapping: + if VALID_HEADER_TYPES[record] == Mapping: if record in result: raise ValueError( "multiple '%s' lines are not permitted" % record) result[record] = x - elif VALID_HEADER_TYPES[record] == collections.Sequence: + elif VALID_HEADER_TYPES[record] == Sequence: if record not in result: result[record] = [] result[record].append(x) @@ -889,7 +894,7 @@ cdef class AlignmentFile(HTSFile): self.header = template.header.copy() elif isinstance(header, AlignmentHeader): self.header = header.copy() - elif isinstance(header, collections.Mapping): + elif isinstance(header, Mapping): self.header = AlignmentHeader.from_dict(header) elif reference_names and reference_lengths: self.header = AlignmentHeader.from_references( @@ -1604,7 +1609,6 @@ cdef class AlignmentFile(HTSFile): Or it can be a generator filtering such reads. Example samfile.find_introns((read for read in samfile.fetch(...) if read.is_reverse) """ - import collections res = collections.Counter() for r in read_iterator: if 'N' in r.cigarstring: @@ -1635,7 +1639,6 @@ cdef class AlignmentFile(HTSFile): AlignedSegment r int BAM_CREF_SKIP = 3 #BAM_CREF_SKIP - import collections res = collections.Counter() match_or_deletion = {0, 2, 7, 8} # only M/=/X (0/7/8) and D (2) are related to genome position @@ -1898,7 +1901,7 @@ cdef class AlignmentFile(HTSFile): return self.header.get_reference_length(reference) property nreferences: - """"int with the number of :term:`reference` sequences in the file. + """int with the number of :term:`reference` sequences in the file. This is a read-only attribute.""" def __get__(self): if self.header: @@ -2610,10 +2613,16 @@ cdef class IteratorColumnRegion(IteratorColumn): int start = 0, int stop = MAX_POS, int truncate = False, + int multiple_iterators = True, **kwargs ): - # initialize iterator - self._setup_iterator(tid, start, stop, 1) + # initialize iterator. Multiple iterators not available + # for CRAM. + if multiple_iterators and samfile.is_cram: + warnings.warn("multiple_iterators not implemented for CRAM") + multiple_iterators = False + + self._setup_iterator(tid, start, stop, multiple_iterators) self.start = start self.stop = stop self.truncate = truncate diff --git a/pysam/libcbcf.pyx b/pysam/libcbcf.pyx index 9ac58dc..e40a801 100644 --- a/pysam/libcbcf.pyx +++ b/pysam/libcbcf.pyx @@ -95,7 +95,7 @@ from cpython.ref cimport Py_INCREF from cpython.dict cimport PyDict_GetItemString, PyDict_SetItemString from cpython.tuple cimport PyTuple_New, PyTuple_SET_ITEM from cpython.bytes cimport PyBytes_FromStringAndSize -from cpython.unicode cimport PyUnicode_DecodeASCII +from cpython.unicode cimport PyUnicode_DecodeUTF8 from cpython.version cimport PY_MAJOR_VERSION from pysam.libchtslib cimport HTSFile, hisremote @@ -151,7 +151,7 @@ cdef inline bcf_str_cache_get_charptr(const char* s): if PY_MAJOR_VERSION < 3: val = s else: - val = PyUnicode_DecodeASCII(s, strlen(s), NULL) + val = PyUnicode_DecodeUTF8(s, strlen(s), NULL) PyDict_SetItemString(bcf_str_cache, s, val) @@ -282,7 +282,7 @@ cdef bcf_array_to_object(void *data, int type, ssize_t n, ssize_t count, int sca else: # Otherwise, copy the entire block b = datac[:n] - value = tuple(v.decode('ascii') if v and v != bcf_str_missing else None for v in b.split(b',')) + value = tuple(v.decode('utf-8') if v and v != bcf_str_missing else None for v in b.split(b',')) else: value = [] if type == BCF_BT_INT8: @@ -412,9 +412,13 @@ cdef bcf_empty_array(int type, ssize_t n, int vlen): return value -cdef bcf_copy_expand_array(void *src_data, int src_type, ssize_t src_values, - void *dst_data, int dst_type, ssize_t dst_values, +cdef bcf_copy_expand_array(void *src_data, int src_type, size_t src_values, + void *dst_data, int dst_type, size_t dst_values, int vlen): + """copy data from src to dest where the size of the elements (src_type/dst_type) differ + as well as the number of elements (src_values/dst_values). + """ + cdef char *src_datac cdef char *dst_datac cdef int8_t *src_datai8 @@ -432,7 +436,7 @@ cdef bcf_copy_expand_array(void *src_data, int src_type, ssize_t src_values, if src_type == dst_type == BCF_BT_CHAR: src_datac = src_data dst_datac = dst_data - memcpy(src_datac, dst_datac, src_values) + memcpy(dst_datac, src_datac, src_values) for i in range(src_values, dst_values): dst_datac[i] = 0 elif src_type == BCF_BT_INT8 and dst_type == BCF_BT_INT32: @@ -574,7 +578,7 @@ cdef object bcf_check_values(VariantRecord record, value, int sample, cdef int given = len(values) if value_count[0] != -1 and value_count[0] != given: if scalar[0]: - raise TypeError('value expected to be scalar, given len={}'.format(value_count[0], given)) + raise TypeError('value expected to be scalar, given len={}'.format(given)) else: raise TypeError('values expected to be {}-tuple, given len={}'.format(value_count[0], given)) @@ -813,14 +817,14 @@ cdef bcf_format_set_value(VariantRecordSample sample, key, value): if key == 'phased': sample.phased = bool(value) return - + cdef bcf_hdr_t *hdr = sample.record.header.ptr cdef bcf1_t *r = sample.record.ptr cdef int fmt_id cdef vdict_t *d cdef khiter_t k cdef int fmt_type, scalar, realloc, dst_type, vlen = 0 - cdef ssize_t i, n, value_count, alloc_size, alloc_len, dst_size + cdef ssize_t i, nsamples, value_count, alloc_size, alloc_len, dst_size if bcf_unpack(r, BCF_UN_ALL) < 0: raise ValueError('Error unpacking VariantRecord') @@ -857,11 +861,10 @@ cdef bcf_format_set_value(VariantRecordSample sample, key, value): fmt.type if fmt else -1, fmt.n if fmt else -1, &value_count, &scalar, &realloc) - vlen = value_count < 0 value_count = len(values) - - # If we can, write updated values to existing allocated storage + + # If we can, write updated values to existing allocated storage. if fmt and not realloc: r.d.indiv_dirty = 1 bcf_object_to_array(values, fmt.p + sample.index * fmt.size, fmt.type, fmt.n, vlen) @@ -871,9 +874,9 @@ cdef bcf_format_set_value(VariantRecordSample sample, key, value): if fmt and fmt.n > alloc_len: alloc_len = fmt.n - n = r.n_sample - new_values = bcf_empty_array(fmt_type, n*alloc_len, vlen) - cdef char *valp = new_values + nsamples = r.n_sample + new_values = bcf_empty_array(fmt_type, nsamples * alloc_len, vlen) + cdef char *new_values_p = new_values if fmt_type == BCF_HT_INT: dst_type = BCF_BT_INT32 @@ -887,15 +890,15 @@ cdef bcf_format_set_value(VariantRecordSample sample, key, value): else: raise ValueError('Unsupported FORMAT type') - if fmt and n > 1: - for i in range(n): - bcf_copy_expand_array(fmt.p + i*fmt.size, fmt.type, fmt.n, - valp + i*dst_size, dst_type, alloc_len, + if fmt and nsamples > 1: + for i in range(nsamples): + bcf_copy_expand_array(fmt.p + i * fmt.size, fmt.type, fmt.n, + new_values_p + i * dst_size, dst_type, alloc_len, vlen) - bcf_object_to_array(values, valp + sample.index*dst_size, dst_type, alloc_len, vlen) + bcf_object_to_array(values, new_values_p + sample.index * dst_size, dst_type, alloc_len, vlen) - if bcf_update_format(hdr, r, bkey, valp, (n*alloc_len), fmt_type) < 0: + if bcf_update_format(hdr, r, bkey, new_values_p, (nsamples * alloc_len), fmt_type) < 0: raise ValueError('Unable to update format values') @@ -1160,7 +1163,7 @@ cdef inline bcf_sync_end(VariantRecord record): cdef bcf_hdr_t *hdr = record.header.ptr cdef bcf_info_t *info cdef int end_id = bcf_header_get_info_id(record.header.ptr, b'END') - cdef int ref_len + cdef int ref_len # allow missing ref when instantiating a new record if record.ref is not None: @@ -2036,12 +2039,20 @@ cdef class VariantHeader(object): def __str__(self): cdef int hlen - cdef char *hstr = bcf_hdr_fmt_text(self.ptr, 0, &hlen) + cdef kstring_t line + line.l = line.m = 0 + line.s = NULL - try: - return charptr_to_str_w_len(hstr, hlen) - finally: - free(hstr) + if bcf_hdr_format(self.ptr, 0, &line) < 0: + if line.m: + free(line.s) + raise ValueError('bcf_hdr_format failed') + + ret = charptr_to_str_w_len(line.s, line.l) + + if line.m: + free(line.s) + return ret def new_record(self, contig=None, start=0, stop=0, alleles=None, id=None, qual=None, filter=None, info=None, samples=None, @@ -3121,7 +3132,9 @@ cdef class VariantRecord(object): cdef bcf1_t *r = self.ptr if bcf_unpack(r, BCF_UN_STR) < 0: raise ValueError('Error unpacking VariantRecord') - return bcf_str_cache_get_charptr(r.d.id) if r.d.id != b'.' else None + # causes a memory leak https://github.com/pysam-developers/pysam/issues/773 + # return bcf_str_cache_get_charptr(r.d.id) if r.d.id != b'.' else None + return charptr_to_str(r.d.id) if r.d.id != b'.' else None @id.setter def id(self, value): @@ -3178,7 +3191,7 @@ cdef class VariantRecord(object): @alleles.setter def alleles(self, values): cdef bcf1_t *r = self.ptr - + # Cache rlen of symbolic alleles before call to bcf_update_alleles_str cdef int rlen = r.rlen diff --git a/pysam/libcbcftools.pxd b/pysam/libcbcftools.pxd index f1c8d76..62a6f3d 100644 --- a/pysam/libcbcftools.pxd +++ b/pysam/libcbcftools.pxd @@ -2,8 +2,8 @@ cdef extern from "bcftools.pysam.h": int bcftools_main(int argc, char *argv[]) void bcftools_set_stderr(int fd) - void bcftools_unset_stderr() + void bcftools_close_stderr() void bcftools_set_stdout(int fd) void bcftools_set_stdout_fn(const char *) - void bcftools_unset_stdout() + void bcftools_close_stdout() void bcftools_set_optind(int) diff --git a/pysam/libcfaidx.pyx b/pysam/libcfaidx.pyx index cf783b9..40d8430 100644 --- a/pysam/libcfaidx.pyx +++ b/pysam/libcfaidx.pyx @@ -214,7 +214,7 @@ cdef class FastaFile: return False property closed: - """"bool indicating the current state of the file object. + """bool indicating the current state of the file object. This is a read-only attribute; the close() method changes the value. """ def __get__(self): @@ -231,7 +231,7 @@ cdef class FastaFile: return self._references property nreferences: - """"int with the number of :term:`reference` sequences in the file. + """int with the number of :term:`reference` sequences in the file. This is a read-only attribute.""" def __get__(self): return len(self._references) if self.references else None @@ -610,7 +610,7 @@ cdef class FastxFile: return False property closed: - """"bool indicating the current state of the file object. + """bool indicating the current state of the file object. This is a read-only attribute; the close() method changes the value. """ def __get__(self): diff --git a/pysam/libchtslib.pxd b/pysam/libchtslib.pxd index 8bcf399..9dafee3 100644 --- a/pysam/libchtslib.pxd +++ b/pysam/libchtslib.pxd @@ -20,6 +20,11 @@ cdef extern from "htslib/kstring.h" nogil: size_t l, m char *s + int kputc(int c, kstring_t *s) + int kputw(int c, kstring_t *s) + int kputl(long c, kstring_t *s) + int ksprintf(kstring_t *s, const char *fmt, ...) + cdef extern from "htslib_util.h" nogil: int hts_set_verbosity(int verbosity) @@ -1703,6 +1708,12 @@ cdef extern from "htslib/vcf.h" nogil: # Read VCF header from a file and update the header int bcf_hdr_set(bcf_hdr_t *hdr, const char *fname) + # Appends formatted header text to _str_. + # If _is_bcf_ is zero, `IDX` fields are discarded. + # @return 0 if successful, or negative if an error occurred + # @since 1.4 + int bcf_hdr_format(const bcf_hdr_t *hdr, int is_bcf, kstring_t *str); + # Returns formatted header (newly allocated string) and its length, # excluding the terminating \0. If is_bcf parameter is unset, IDX # fields are discarded. diff --git a/pysam/libcsamtools.pxd b/pysam/libcsamtools.pxd index ff797f8..70fda60 100644 --- a/pysam/libcsamtools.pxd +++ b/pysam/libcsamtools.pxd @@ -2,8 +2,8 @@ cdef extern from "samtools.pysam.h": int samtools_main(int argc, char *argv[]) void samtools_set_stderr(int fd) - void samtools_unset_stderr() + void samtools_close_stderr() void samtools_set_stdout(int fd) void samtools_set_stdout_fn(const char *) - void samtools_unset_stdout() + void samtools_close_stdout() void samtools_set_optind(int) diff --git a/pysam/libcutils.pyx b/pysam/libcutils.pyx index 994a240..ab8e9a6 100644 --- a/pysam/libcutils.pyx +++ b/pysam/libcutils.pyx @@ -18,10 +18,10 @@ from libc.stdio cimport stdout as c_stdout from posix.fcntl cimport open as c_open, O_WRONLY from libcsamtools cimport samtools_main, samtools_set_stdout, samtools_set_stderr, \ - samtools_unset_stderr, samtools_unset_stdout, samtools_set_stdout_fn, samtools_set_optind + samtools_close_stdout, samtools_close_stderr, samtools_set_stdout_fn, samtools_set_optind from libcbcftools cimport bcftools_main, bcftools_set_stdout, bcftools_set_stderr, \ - bcftools_unset_stderr, bcftools_unset_stdout, bcftools_set_stdout_fn, bcftools_set_optind + bcftools_close_stdout, bcftools_close_stderr, bcftools_set_stdout_fn, bcftools_set_optind ##################################################################### # hard-coded constants @@ -91,14 +91,14 @@ cdef bint IS_PYTHON3 = PY_MAJOR_VERSION >= 3 cdef from_string_and_size(const char* s, size_t length): if IS_PYTHON3: - return s[:length].decode("ascii") + return s[:length].decode("utf8") else: return s[:length] # filename encoding (adapted from lxml.etree.pyx) cdef str FILENAME_ENCODING = sys.getfilesystemencoding() or sys.getdefaultencoding() or 'ascii' - +cdef str TEXT_ENCODING = 'utf-8' cdef bytes encode_filename(object filename): """Make sure a filename is 8-bit encoded (or None).""" @@ -115,9 +115,9 @@ cdef bytes encode_filename(object filename): raise TypeError("Argument must be string or unicode.") -cdef bytes force_bytes(object s, encoding="ascii"): +cdef bytes force_bytes(object s, encoding=TEXT_ENCODING): """convert string or unicode object to bytes, assuming - ascii encoding. + utf8 encoding. """ if s is None: return None @@ -129,7 +129,7 @@ cdef bytes force_bytes(object s, encoding="ascii"): raise TypeError("Argument must be string, bytes or unicode.") -cdef charptr_to_str(const char* s, encoding="ascii"): +cdef charptr_to_str(const char* s, encoding=TEXT_ENCODING): if s == NULL: return None if PY_MAJOR_VERSION < 3: @@ -138,7 +138,7 @@ cdef charptr_to_str(const char* s, encoding="ascii"): return s.decode(encoding) -cdef charptr_to_str_w_len(const char* s, size_t n, encoding="ascii"): +cdef charptr_to_str_w_len(const char* s, size_t n, encoding=TEXT_ENCODING): if s == NULL: return None if PY_MAJOR_VERSION < 3: @@ -147,14 +147,14 @@ cdef charptr_to_str_w_len(const char* s, size_t n, encoding="ascii"): return s[:n].decode(encoding) -cdef bytes charptr_to_bytes(const char* s, encoding="ascii"): +cdef bytes charptr_to_bytes(const char* s, encoding=TEXT_ENCODING): if s == NULL: return None else: return s -cdef force_str(object s, encoding="ascii"): +cdef force_str(object s, encoding=TEXT_ENCODING): """Return s converted to str type of current Python (bytes in Py2, unicode in Py3)""" if s is None: @@ -274,9 +274,25 @@ def _pysam_dispatch(collection, False. ''' - if method == "index": - if args and not os.path.exists(args[0]): - raise IOError("No such file or directory: '%s'" % args[0]) + if method == "index" and args: + # We make sure that at least 1 input file exists, + # and if it doesn't we raise an IOError. + SIMPLE_FLAGS = ['-c', '--csi', '-f', '--force', '-t', '--tbi', '-n', '--nstats', '-s', '--stats'] + ARGUMENTS = ['-m', '--min-shift', '-o', '--output-file', '--threads', '-@'] + skip_next = False + for arg in args: + if skip_next: + skip_next = False + continue + if arg in SIMPLE_FLAGS or (len(arg) > 2 and arg.startswith('-@')): + continue + if arg in ARGUMENTS: + skip_next = True + continue + if not os.path.exists(arg): + raise IOError("No such file or directory: '%s'" % arg) + else: + break if args is None: args = [] @@ -285,8 +301,6 @@ def _pysam_dispatch(collection, # redirect stderr to file stderr_h, stderr_f = tempfile.mkstemp() - samtools_set_stderr(stderr_h) - bcftools_set_stderr(stderr_h) # redirect stdout to file if save_stdout: @@ -297,9 +311,7 @@ def _pysam_dispatch(collection, raise IOError("error while opening {} for writing".format(stdout_f)) samtools_set_stdout_fn(force_bytes(stdout_f)) - samtools_set_stdout(stdout_h) bcftools_set_stdout_fn(force_bytes(stdout_f)) - bcftools_set_stdout(stdout_h) elif catch_stdout: stdout_h, stdout_f = tempfile.mkstemp() @@ -329,12 +341,11 @@ def _pysam_dispatch(collection, samtools_set_stdout_fn(force_bytes(stdout_f)) bcftools_set_stdout_fn(force_bytes(stdout_f)) args.extend(stdout_option.format(stdout_f).split(" ")) - else: - samtools_set_stdout(stdout_h) - bcftools_set_stdout(stdout_h) + stdout_h = c_open(b"/dev/null", O_WRONLY) else: samtools_set_stdout_fn("-") bcftools_set_stdout_fn("-") + stdout_h = c_open(b"/dev/null", O_WRONLY) # setup the function call to samtools/bcftools main cdef char ** cargs @@ -373,9 +384,17 @@ def _pysam_dispatch(collection, # call samtools/bcftools if collection == b"samtools": + samtools_set_stdout(stdout_h) + samtools_set_stderr(stderr_h) retval = samtools_main(n + 2, cargs) + samtools_close_stdout() + samtools_close_stderr() elif collection == b"bcftools": + bcftools_set_stdout(stdout_h) + bcftools_set_stderr(stderr_h) retval = bcftools_main(n + 2, cargs) + bcftools_close_stdout() + bcftools_close_stderr() for i from 0 <= i < n: free(cargs[i + 2]) @@ -395,13 +414,6 @@ def _pysam_dispatch(collection, os.remove(fn) return out - samtools_unset_stderr() - bcftools_unset_stderr() - - if save_stdout or catch_stdout: - samtools_unset_stdout() - bcftools_unset_stdout() - out_stderr = _collect(stderr_f) if save_stdout: out_stdout = None diff --git a/pysam/version.py b/pysam/version.py index d4ec161..49afa17 100644 --- a/pysam/version.py +++ b/pysam/version.py @@ -1,5 +1,5 @@ # pysam versioning information -__version__ = "0.15.2" +__version__ = "0.15.3" # TODO: upgrade number __samtools_version__ = "1.9" diff --git a/samtools/samtools.pysam.c b/samtools/samtools.pysam.c index edbd9a2..b26f892 100644 --- a/samtools/samtools.pysam.c +++ b/samtools/samtools.pysam.c @@ -10,7 +10,6 @@ FILE * samtools_stderr = NULL; FILE * samtools_stdout = NULL; const char * samtools_stdout_fn = NULL; -int samtools_stdout_fileno = STDOUT_FILENO; FILE * samtools_set_stderr(int fd) @@ -21,11 +20,10 @@ FILE * samtools_set_stderr(int fd) return samtools_stderr; } -void samtools_unset_stderr(void) +void samtools_close_stderr(void) { - if (samtools_stderr != NULL) - fclose(samtools_stderr); - samtools_stderr = fopen("/dev/null", "w"); + fclose(samtools_stderr); + samtools_stderr = NULL; } FILE * samtools_set_stdout(int fd) @@ -37,7 +35,6 @@ FILE * samtools_set_stdout(int fd) { fprintf(samtools_stderr, "could not set stdout to fd %i", fd); } - samtools_stdout_fileno = fd; return samtools_stdout; } @@ -46,12 +43,10 @@ void samtools_set_stdout_fn(const char *fn) samtools_stdout_fn = fn; } -void samtools_unset_stdout(void) +void samtools_close_stdout(void) { - if (samtools_stdout != NULL) - fclose(samtools_stdout); - samtools_stdout = fopen("/dev/null", "w"); - samtools_stdout_fileno = STDOUT_FILENO; + fclose(samtools_stdout); + samtools_stdout = NULL; } int samtools_puts(const char *s) diff --git a/samtools/samtools.pysam.h b/samtools/samtools.pysam.h index d97ee25..df8fd01 100644 --- a/samtools/samtools.pysam.h +++ b/samtools/samtools.pysam.h @@ -1,7 +1,7 @@ -#ifndef PYSAM_H -#define PYSAM_H +#ifndef samtools_PYSAM_H +#define samtools_PYSAM_H -#include "stdio.h" +#include extern FILE * samtools_stderr; @@ -17,7 +17,7 @@ FILE * samtools_set_stderr(int fd); /*! set pysam standard output to point to file descriptor - Setting the stderr will close the previous stdout. + Setting the stdout will close the previous stdout. */ FILE * samtools_set_stdout(int fd); @@ -26,17 +26,15 @@ FILE * samtools_set_stdout(int fd); */ void samtools_set_stdout_fn(const char * fn); -/*! set pysam standard error to /dev/null. +/*! close pysam standard error and set to NULL - Unsetting the stderr will close the previous stderr. */ -void samtools_unset_stderr(void); +void samtools_close_stderr(void); -/*! set pysam standard error to /dev/null. +/*! close pysam standard output and set to NULL - Unsetting the stderr will close the previous stderr. */ -void samtools_unset_stdout(void); +void samtools_close_stdout(void); int samtools_puts(const char *s); diff --git a/setup.py b/setup.py index 49261f9..4c97e87 100644 --- a/setup.py +++ b/setup.py @@ -13,8 +13,6 @@ This module provides a low-level wrapper around the htslib C-API as using cython and a high-level API for convenient access to the data within standard genomic file formats. -The current version wraps htslib-1.7, samtools-1.7 and bcftools-1.6. - See: http://www.htslib.org https://github.com/pysam-developers/pysam @@ -53,6 +51,7 @@ def changedir(path): def run_configure(option): + sys.stdout.flush() try: retcode = subprocess.call( " ".join(("./configure", option)), @@ -80,6 +79,25 @@ def run_make_print_config(): return make_print_config +@contextmanager +def set_compiler_envvars(): + tmp_vars = [] + for var in ['CC', 'CFLAGS', 'LDFLAGS']: + if var in os.environ: + print ("# pysam: (env) {}={}".format(var, os.environ[var])) + elif var in sysconfig.get_config_vars(): + value = sysconfig.get_config_var(var) + print ("# pysam: (sysconfig) {}={}".format(var, value)) + os.environ[var] = value + tmp_vars += [var] + + try: + yield + finally: + for var in tmp_vars: + del os.environ[var] + + def configure_library(library_dir, env_options=None, options=[]): configure_script = os.path.join(library_dir, "configure") @@ -93,7 +111,7 @@ def configure_library(library_dir, env_options=None, options=[]): raise ValueError( "configure script {} does not exist".format(configure_script)) - with changedir(library_dir): + with changedir(library_dir), set_compiler_envvars(): if env_options is not None: if run_configure(env_options): return env_options @@ -120,7 +138,7 @@ def get_pysam_version(): sys.path.insert(0, "pysam") import version return version.__version__ - + # How to link against HTSLIB # shared: build shared chtslib from builtin htslib code. @@ -412,7 +430,7 @@ Operating System :: POSIX Operating System :: Unix Operating System :: MacOS """ - + metadata = { 'name': "pysam", 'version': get_pysam_version(), diff --git a/tests/AlignmentFile_test.py b/tests/AlignmentFile_test.py index ec0f5e1..e0be5ef 100644 --- a/tests/AlignmentFile_test.py +++ b/tests/AlignmentFile_test.py @@ -22,7 +22,7 @@ from functools import partial import pysam import pysam.samtools -from TestUtils import checkBinaryEqual, check_url, \ +from TestUtils import checkBinaryEqual, checkGZBinaryEqual, check_url, \ check_samtools_view_equal, checkFieldEqual, force_str, \ get_temp_filename, BAM_DATADIR @@ -470,7 +470,8 @@ class TestIO(unittest.TestCase): self.checkEcho("ex2.bam", "ex2.bam", "tmp_ex2.bam", - "rb", "wb") + "rb", "wb", + checkf=checkGZBinaryEqual) def testCRAM2CRAM(self): # in some systems different reference sequence paths might be @@ -489,7 +490,8 @@ class TestIO(unittest.TestCase): self.checkEcho("ex2.sam", "ex2.bam", "tmp_ex2.bam", - "r", "wb") + "r", "wb", + checkf=checkGZBinaryEqual) def testBAM2SAM(self): self.checkEcho("ex2.bam", @@ -665,7 +667,8 @@ class TestIO(unittest.TestCase): self.checkEcho("ex1.bam", "ex1.bam", "tmp_ex1.bam", - "rb", "wb") + "rb", "wb", + checkf=checkGZBinaryEqual) f = open(os.path.join(BAM_DATADIR, "ex1.bam")) samfile = pysam.AlignmentFile(f, "rb") @@ -1372,7 +1375,7 @@ class TestDeNovoConstruction(unittest.TestCase): outfile.close() self.assertTrue( - checkBinaryEqual(tmpfilename, self.bamfile), + checkGZBinaryEqual(tmpfilename, self.bamfile), "mismatch when construction BAM file, see %s %s" % (tmpfilename, self.bamfile)) @@ -1407,6 +1410,11 @@ class TestEmptyHeader(unittest.TestCase): self.assertTrue("SQ" in s.header.to_dict()) self.assertTrue("@SQ" in str(s.header)) + def test_bam_without_seq_with_null_bytes_in_header(self): + s = pysam.AlignmentFile(os.path.join(BAM_DATADIR, "example_no_seq_in_header_null_bytes.bam")) + self.assertTrue("SQ" in s.header.to_dict()) + self.assertTrue("@SQ" in str(s.header)) + class TestMismatchingHeader(unittest.TestCase): '''see issue 716.''' diff --git a/tests/TestUtils.py b/tests/TestUtils.py index 7811b9e..f33761e 100644 --- a/tests/TestUtils.py +++ b/tests/TestUtils.py @@ -96,6 +96,19 @@ def checkBinaryEqual(filename1, filename2): return found +def checkGZBinaryEqual(filename1, filename2): + '''return true if the decompressed contents of the two files + are binary equal. + ''' + with gzip.open(filename1, "rb") as infile1: + d1 = infile1.read() + with gzip.open(filename2, "rb") as infile2: + d2 = infile2.read() + if d1 == d2: + return True + return False + + def check_samtools_view_equal( filename1, filename2, without_header=False): diff --git a/tests/VariantFile_test.py b/tests/VariantFile_test.py index 88bd34f..4458d1f 100644 --- a/tests/VariantFile_test.py +++ b/tests/VariantFile_test.py @@ -1,4 +1,7 @@ +# -*- coding: utf-8 -*- + import os +import glob import sys import unittest import pysam @@ -633,6 +636,37 @@ class TestSubsetting(unittest.TestCase): inf.subset_samples(["NA00001"]) +class TestVCFVersions(unittest.TestCase): + + def setUp(self): + self.files_to_test = (glob.glob(os.path.join(CBCF_DATADIR, "example_v*.vcf.gz")) + + glob.glob(os.path.join(CBCF_DATADIR, "example_v*.vcf")) + + glob.glob(os.path.join(CBCF_DATADIR, "example_v*.bcf"))) + + def test_all_records_can_be_fetched(self): + + for fn in self.files_to_test: + with pysam.VariantFile(fn) as inf: + records = list(inf.fetch()) + + +class TestUnicode(unittest.TestCase): + filename = "example_vcf43_with_utf8.vcf.gz" + + def test_record_with_unicode(self): + with pysam.VariantFile(os.path.join(CBCF_DATADIR, + self.filename)) as inf: + records = list(inf.fetch("20", 2345677, 2345678)) + self.assertEqual(len(records), 1) + record = records.pop() + self.assertEqual( + record.info["CLNVI"], + ('Institute_of_Human_Genetics', u'Friedrich-Alexander-Universität_Erlangen-Nürnberg')) + self.assertEqual(record.samples[0]["AN"], "alpha") + self.assertEqual(record.samples[1]["AN"], u"ä") + self.assertEqual(record.samples[2]["AN"], u"ü") + + if __name__ == "__main__": # build data files print("building data files") diff --git a/tests/VariantRecord_test.py b/tests/VariantRecord_test.py new file mode 100644 index 0000000..fd80a80 --- /dev/null +++ b/tests/VariantRecord_test.py @@ -0,0 +1,72 @@ +import os +import glob +import sys +import unittest +import pysam +import shutil +import gzip +import subprocess +import pytest + +try: + from pathlib import Path +except ImportError: + Path = None + +from TestUtils import get_temp_filename, check_lines_equal, load_and_convert, CBCF_DATADIR, get_temp_context + + +@pytest.fixture +def vcf_header(): + vcf_header = pysam.VariantHeader() + vcf_header.add_sample("sample1") + vcf_header.add_sample("sample2") + vcf_header.contigs.add("1") + return vcf_header + +## segfault without coordinates + +def test_ascii_annotation_can_be_added(vcf_header): + vcf_header.formats.add("AN", 1, "String", "An annotation") + record = vcf_header.new_record( + contig="1", + start=12, + stop=13, + samples=[ + {"AN": "anno1"}, + {"AN": "anno2"}]) + assert str(record)[:-1].split("\t")[-2:] == ["anno1", "anno2"] + + +def test_ascii_annotation_with_variable_length_can_be_added(vcf_header): + vcf_header.formats.add("AN", 1, "String", "An annotation") + record = vcf_header.new_record( + contig="1", + start=12, + stop=13, + samples=[ + {"AN": "anno1b"}, + {"AN": "anno1"}]) + assert str(record)[:-1].split("\t")[-2:] == ["anno1b", "anno1"] + record = vcf_header.new_record( + contig="1", + start=12, + stop=13, + samples=[ + {"AN": "anno2"}, + {"AN": "anno2b"}]) + assert str(record)[:-1].split("\t")[-2:] == ["anno2", "anno2b"] + + +def test_unicode_annotation_can_be_added(vcf_header): + vcf_header.formats.add("AN", 1, "String", "An annotation") + record = vcf_header.new_record( + contig="1", + start=12, + stop=13, + samples=[ + {"AN": "anno1"}, + {"AN": "Friedrich-Alexander-Universit\u00E4t_Erlangen-N\u00FCrnberg"}]) + assert str(record)[:-1].split("\t")[-2:] == [ + "anno1", + "Friedrich-Alexander-Universit\u00E4t_Erlangen-N\u00FCrnberg"] diff --git a/tests/cbcf_data/example_vcf43.vcf b/tests/cbcf_data/example_vcf43.vcf new file mode 100644 index 0000000..9010401 --- /dev/null +++ b/tests/cbcf_data/example_vcf43.vcf @@ -0,0 +1,24 @@ +##fileformat=VCFv4.3 +##fileDate=20090805 +##source=myImputationProgramV3.1 +##reference=file:///seq/references/1000GenomesPilot-NCBI36.fasta +##contig= +##phasing=partial +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##FILTER= +##FILTER= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003 +20 14370 rs6054257 G A 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 1/1:43:5:.,. +20 17330 . T A 3 q10 NS=3;DP=11;AF=0.017 GT:GQ:DP:HQ 0|0:49:3:58,50 0|1:3:5:65,3 0/0:41:3 +20 1110696 rs6040355 A G,T 67 PASS NS=2;DP=10;AF=0.333,0.667;AA=T;DB GT:GQ:DP:HQ 1|2:21:6:23,27 2|1:2:0:18,2 2/2:35:4 +20 1230237 . T . 47 PASS NS=3;DP=13;AA=T GT:GQ:DP:HQ 0|0:54:7:56,60 0|0:48:4:51,51 0/0:61:2 +20 1234567 microsat1 GTC G,GTCT 50 PASS NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/2:17:2 1/1:40:3 diff --git a/tests/cbcf_data/example_vcf43_with_utf8.vcf b/tests/cbcf_data/example_vcf43_with_utf8.vcf new file mode 100644 index 0000000..4826fc4 --- /dev/null +++ b/tests/cbcf_data/example_vcf43_with_utf8.vcf @@ -0,0 +1,27 @@ +##fileformat=VCFv4.3 +##fileDate=20090805 +##source=myImputationProgramV3.1 +##reference=file:///seq/references/1000GenomesPilot-NCBI36.fasta +##contig= +##phasing=partial +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##FILTER= +##FILTER= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003 +20 14370 rs6054257 G A 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 1/1:43:5:.,. +20 17330 . T A 3 q10 NS=3;DP=11;AF=0.017 GT:GQ:DP:HQ 0|0:49:3:58,50 0|1:3:5:65,3 0/0:41:3 +20 1110696 rs6040355 A G,T 67 PASS NS=2;DP=10;AF=0.333,0.667;AA=T;DB GT:GQ:DP:HQ 1|2:21:6:23,27 2|1:2:0:18,2 2/2:35:4 +20 1230237 . T . 47 PASS NS=3;DP=13;AA=T GT:GQ:DP:HQ 0|0:54:7:56,60 0|0:48:4:51,51 0/0:61:2 +20 1234567 microsat1 GTC G,GTCT 50 PASS NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/2:17:2 1/1:40:3 +20 2345678 rs123 G A 29 PASS NS=3;DP=14;AF=0.5;CLNVI=Institute_of_Human_Genetics,Friedrich-Alexander-Universität_Erlangen-Nürnberg;DB;H2 GT:GQ:DP:HQ:AN 0|0:48:1:51,51:alpha 1|0:48:8:51,51:ä 1/1:43:5:.,.:ü diff --git a/tests/pysam_data/example_no_seq_in_header_null_bytes.bam b/tests/pysam_data/example_no_seq_in_header_null_bytes.bam new file mode 100644 index 0000000..aaf4b84 Binary files /dev/null and b/tests/pysam_data/example_no_seq_in_header_null_bytes.bam differ diff --git a/tests/samtools_test.py b/tests/samtools_test.py index 40d84c0..b20a869 100644 --- a/tests/samtools_test.py +++ b/tests/samtools_test.py @@ -45,7 +45,7 @@ def get_version(executable): if IS_PYTHON3: lines = lines.decode('ascii') try: - x = re.search("Version:\s+(\S+)", lines).groups()[0] + x = re.search(r"Version:\s+(\S+)", lines).groups()[0] except AttributeError: raise ValueError("could not get version from %s" % lines) return x @@ -82,6 +82,7 @@ class SamtoolsTest(unittest.TestCase): # TODO: issues with file naming # "faidx ex1.fa; %(out)s_ex1.fa.fai", "index ex1.bam %(out)s_ex1.bam.fai", + "index -@2 ex1.bam %(out)s_ex1.bam.fai", "idxstats ex1.bam > %(out)s_ex1.idxstats", "fixmate ex1.bam %(out)s_ex1.fixmate.bam", "flagstat ex1.bam > %(out)s_ex1.flagstat", @@ -189,7 +190,7 @@ class SamtoolsTest(unittest.TestCase): pysam_method = getattr(self.module, command) # run samtools - full_statement = re.sub("%\(out\)s", self.executable, statement) + full_statement = re.sub(r"%\(out\)s", self.executable, statement) run_command(" ".join((self.executable, full_statement))) # sys.stdout.write("%s %s ok" % (command, self.executable)) @@ -199,7 +200,7 @@ class SamtoolsTest(unittest.TestCase): parts = parts[:-2] # avoid interpolation to preserve string quoting, tab chars, etc. - pysam_parts = [re.sub("%\(out\)s", "pysam", x) for x in parts[1:]] + pysam_parts = [re.sub(r"%\(out\)s", "pysam", x) for x in parts[1:]] output = pysam_method(*pysam_parts, raw=True, catch_stdout=True) @@ -272,7 +273,7 @@ class SamtoolsTest(unittest.TestCase): mapped_command = self.get_command(statement, map_to_internal=True) pysam_method = getattr(self.module, mapped_command) usage_msg = pysam_method.usage() - expected = "Usage:\s+{} {}".format(self.executable, command) + expected = r"Usage:\s+{} {}".format(self.executable, command) self.assertTrue(re.search(expected, usage_msg) is not None) def tearDown(self): @@ -287,6 +288,18 @@ class EmptyIndexTest(unittest.TestCase): self.assertRaises(IOError, pysam.samtools.index, "exdoesntexist.bam") + def testEmptyIndexWithExtraFlag(self): + self.assertRaises(IOError, pysam.samtools.index, + "-c", + "exdoesntexist.bam") + + def testEmptyIndexWithExtraArg(self): + self.assertRaises(IOError, pysam.samtools.index, + "-c", + "-m", + "14", + "exdoesntexist.bam") + if sys.platform != "darwin": # fails with segfault with htslib 1.5 on Osx, an issue with flockfile diff --git a/tests/tabix_test.py b/tests/tabix_test.py index fdd39b0..c17f7ff 100644 --- a/tests/tabix_test.py +++ b/tests/tabix_test.py @@ -14,9 +14,8 @@ import unittest import subprocess import glob import re -import copy -import tempfile -from TestUtils import check_url, load_and_convert, TABIX_DATADIR, get_temp_filename +from TestUtils import checkBinaryEqual, checkGZBinaryEqual, check_url, \ + load_and_convert, TABIX_DATADIR, get_temp_filename IS_PYTHON3 = sys.version_info[0] >= 3 @@ -40,30 +39,6 @@ def splitToBytes(s): return [x.encode("ascii") for x in s.split("\t")] -def checkBinaryEqual(filename1, filename2): - '''return true if the two files are binary equal.''' - if os.path.getsize(filename1) != os.path.getsize(filename2): - return False - - with open(filename1, "rb") as infile: - d1 = infile.read() - - with open(filename2, "rb") as infile: - d2 = infile.read() - - if len(d1) != len(d2): - return False - - found = False - for c1, c2 in zip(d1, d2): - if c1 != c2: - break - else: - found = True - - return found - - class TestIndexing(unittest.TestCase): filename = os.path.join(TABIX_DATADIR, "example.gtf.gz") filename_idx = os.path.join(TABIX_DATADIR, "example.gtf.gz.tbi") @@ -77,7 +52,7 @@ class TestIndexing(unittest.TestCase): '''test indexing via preset.''' pysam.tabix_index(self.tmpfilename, preset="gff") - self.assertTrue(checkBinaryEqual( + self.assertTrue(checkGZBinaryEqual( self.tmpfilename + ".tbi", self.filename_idx)) def test_indexing_to_custom_location_works(self): @@ -86,7 +61,7 @@ class TestIndexing(unittest.TestCase): index_path = get_temp_filename(suffix='custom.tbi') pysam.tabix_index(self.tmpfilename, preset="gff", index=index_path, force=True) - self.assertTrue(checkBinaryEqual(index_path, self.filename_idx)) + self.assertTrue(checkGZBinaryEqual(index_path, self.filename_idx)) os.unlink(index_path) def test_indexing_with_explict_columns_works(self): @@ -98,7 +73,7 @@ class TestIndexing(unittest.TestCase): end_col=4, line_skip=0, zerobased=False) - self.assertTrue(checkBinaryEqual( + self.assertTrue(checkGZBinaryEqual( self.tmpfilename + ".tbi", self.filename_idx)) def test_indexing_with_lineskipping_works(self): @@ -109,7 +84,7 @@ class TestIndexing(unittest.TestCase): end_col=4, line_skip=1, zerobased=False) - self.assertFalse(checkBinaryEqual( + self.assertFalse(checkGZBinaryEqual( self.tmpfilename + ".tbi", self.filename_idx)) def tearDown(self): diff --git a/tests/tabixproxies_test.py b/tests/tabixproxies_test.py index 32f8e42..7ad7db0 100644 --- a/tests/tabixproxies_test.py +++ b/tests/tabixproxies_test.py @@ -128,7 +128,7 @@ class TestGTF(TestParser): """build attribute string from dictionary d""" s = "; ".join(["{} \"{}\"".format(x, y) for (x, y) in d.items()]) + ";" # remove quotes around numeric values - s = re.sub("\"(\d+)\"", r"\1", s) + s = re.sub(r'"(\d+)"', r'\1', s) return s def testRead(self):