New upstream version 0.15.3+ds
authorMichael R. Crusoe <michael.crusoe@gmail.com>
Tue, 6 Aug 2019 17:37:29 +0000 (19:37 +0200)
committerMichael R. Crusoe <michael.crusoe@gmail.com>
Tue, 6 Aug 2019 17:37:29 +0000 (19:37 +0200)
39 files changed:
.gitignore
.travis.yml
NEWS
README.rst
bcftools/bcftools.pysam.c
bcftools/bcftools.pysam.h
devtools/buildwheels.sh
devtools/import.py
devtools/run_tests_travis.sh
doc/api.rst
doc/glossary.rst
doc/release.rst
doc/usage.rst
import/pysam.c
import/pysam.h
pysam/htslib_util.h
pysam/libcalignedsegment.pxd
pysam/libcalignedsegment.pyx
pysam/libcalignmentfile.pyx
pysam/libcbcf.pyx
pysam/libcbcftools.pxd
pysam/libcfaidx.pyx
pysam/libchtslib.pxd
pysam/libcsamtools.pxd
pysam/libcutils.pyx
pysam/version.py
samtools/samtools.pysam.c
samtools/samtools.pysam.h
setup.py
tests/AlignmentFile_test.py
tests/TestUtils.py
tests/VariantFile_test.py
tests/VariantRecord_test.py [new file with mode: 0644]
tests/cbcf_data/example_vcf43.vcf [new file with mode: 0644]
tests/cbcf_data/example_vcf43_with_utf8.vcf [new file with mode: 0644]
tests/pysam_data/example_no_seq_in_header_null_bytes.bam [new file with mode: 0644]
tests/samtools_test.py
tests/tabix_test.py
tests/tabixproxies_test.py

index fd28d6d2e22a76fc0840fa2db8ba2f8451aabf67..b07a532d0958bdbeb9909e9a7aabc2cffc5dbeed 100644 (file)
@@ -17,11 +17,13 @@ tests/cbcf_data
 tests/tabix_data
 
 samtools/config.h
-bcftools/config.g
+bcftools/config.h
 htslib/config.status
 htslib/config.h
 htslib/config.log
 htslib/config.mk
+htslib/htslib.pc.tmp
+htslib/htslib-uninstalled.pc
 pysam/config.py
 
 # linking tests
index 36422812fba168befeecbeb0700ac8dad3eca32f..b30d9b8cfdea0c11e829231d84aace2015cd5d4a 100644 (file)
@@ -1,5 +1,5 @@
 os:
-  - linx
+  - linux
   - osx
 
 language: c
@@ -7,11 +7,58 @@ language: c
 env:
   matrix:
     - CONDA_PY=2.7
-    - CONDA_PY=3.4
-    - CONDA_PY=3.5
     - CONDA_PY=3.6
+    - CONDA_PY=3.7
   global:
     - PYSAM_LINKING_TEST=1
+    - TWINE_USERNAME=grepall
+    - secure: 'OcwwP8/o21+SGW0UVAnnCQwllhGSCq2HJzpI9EhX3kh6J9RTkyx/+drkg45bx1Z5u8zymuAFappEYzlpzqZE886XezkjOYGVa/u+Coqr1oT/BEJHFCkCA4o26yESp7Zy8aNj/juhB7Rfa77pIDXBayqTzbALz/AURMtZapasB18='
+
+_deploy_common: &deploy_common
+  if: tag IS present
+  install:
+    - python3 -m pip install cibuildwheel twine
+
+matrix:
+  include:
+    - stage: deploy
+      os: linux
+      language: python
+      python: '3.5'
+      services:
+        - docker
+      env:
+        - CIBW_BEFORE_BUILD="yum install -y zlib-devel bzip2-devel xz-devel && pip install -r requirements.txt"
+        - CIBW_ENVIRONMENT='HTSLIB_CONFIGURE_OPTIONS="--disable-libcurl"'
+      addons:
+        apt:
+          packages:
+            - gcc
+            - g++
+            - libcurl4-openssl-dev  # for libcurl support in sdist
+            - libssl-dev  # for s3 support in sdist
+      <<: *deploy_common
+      script:
+        - set -e
+        - cibuildwheel --output-dir dist
+        - python3 -m pip install Cython
+        - python3 setup.py build_ext --inplace
+        - python3 setup.py sdist
+        - twine check dist/*
+        - twine upload --skip-existing dist/*
+    - stage: deploy
+      os: osx
+      language: generic
+      env:
+        - CIBW_BEFORE_BUILD="pip install -r requirements.txt"
+        - CIBW_ENVIRONMENT='HTSLIB_CONFIGURE_OPTIONS="--disable-libcurl"'
+      addons: {}
+      <<: *deploy_common
+      script:
+        - set -e
+        - cibuildwheel --output-dir dist
+        - twine check dist/*
+        - twine upload --skip-existing dist/*
 
 addons:
   apt:
diff --git a/NEWS b/NEWS
index 4861555a9451e699f4bfce6d4d45d84353ea4a77..49ce4857c47038225156f64d3e132508880847f4 100644 (file)
--- a/NEWS
+++ b/NEWS
@@ -5,6 +5,26 @@ http://pysam.readthedocs.io/en/latest/release.html
 Release notes
 =============
 
+Release 0.15.3
+==============
+
+Bugfix release.
+
+* [#824] allow reading of UTF-8 encoded text in VCF/BCF files.
+* [#780] close all filehandles before opening new ones in pysam_dispatch
+* [#773] do not cache VariantRecord.id to avoid memory leak
+* [#781] default of multiple_iterators=True is changed to False for
+  CRAM files.
+* [#825] fix collections.abc import
+* [#825] use bcf_hdr_format instead of bcf_hdr_fmt_text, fix memcpy
+  bug when setting FORMAT fields.
+* [#804] Use HTSlib's kstring_t, which reallocates and enlarges its
+  memory as needed, rather than a fixed-size char buffer.
+* [#814] Build wheels and upload them to PyPI
+* [#755] Allow passing flags and arguments to index methods
+* [#763] Strip \0 in header check
+* [#761] Test Tabix index contents, not the compression
+
 Release 0.15.2
 ==============
 
index 2005eb6d0db0857de99e213684f0628cd2212ce0..f8a641de0a4723e371c5c5b0f9028b84027a55bf 100644 (file)
@@ -25,6 +25,8 @@ as it resolves non-python dependencies and uses pre-configured
 compilation options. Especially for OS X this will potentially save a
 lot of trouble.
 
+The current version of pysam wraps 3rd-party code from htslib-1.9, samtools-1.9, and bcftools-1.9.
+
 Pysam is available through `pypi
 <https://pypi.python.org/pypi/pysam>`_. To install, type::
 
index e3e251ef499b3849988617cb8748937fba4a6b39..de8739d06fb89253a1832676a35bf85a48289b0d 100644 (file)
@@ -10,7 +10,6 @@
 FILE * bcftools_stderr = NULL;
 FILE * bcftools_stdout = NULL;
 const char * bcftools_stdout_fn = NULL;
-int bcftools_stdout_fileno = STDOUT_FILENO;
 
 
 FILE * bcftools_set_stderr(int fd)
@@ -21,11 +20,10 @@ FILE * bcftools_set_stderr(int fd)
   return bcftools_stderr;
 }
 
-void bcftools_unset_stderr(void)
+void bcftools_close_stderr(void)
 {
-  if (bcftools_stderr != NULL)
-    fclose(bcftools_stderr);
-  bcftools_stderr = fopen("/dev/null", "w");
+  fclose(bcftools_stderr);
+  bcftools_stderr = NULL;
 }
 
 FILE * bcftools_set_stdout(int fd)
@@ -37,7 +35,6 @@ FILE * bcftools_set_stdout(int fd)
     {
       fprintf(bcftools_stderr, "could not set stdout to fd %i", fd);
     }
-  bcftools_stdout_fileno = fd;
   return bcftools_stdout;
 }
 
@@ -46,12 +43,10 @@ void bcftools_set_stdout_fn(const char *fn)
   bcftools_stdout_fn = fn;
 }
 
-void bcftools_unset_stdout(void)
+void bcftools_close_stdout(void)
 {
-  if (bcftools_stdout != NULL)
-    fclose(bcftools_stdout);
-  bcftools_stdout = fopen("/dev/null", "w");
-  bcftools_stdout_fileno = STDOUT_FILENO;
+  fclose(bcftools_stdout);
+  bcftools_stdout = NULL;
 }
 
 int bcftools_puts(const char *s)
index 92ab370a2e30aeb4d5bb9711301eec805e8d21df..453567a54067d46fff33f0ad9962e840caca4292 100644 (file)
@@ -1,7 +1,7 @@
-#ifndef PYSAM_H
-#define PYSAM_H
+#ifndef bcftools_PYSAM_H
+#define bcftools_PYSAM_H
 
-#include "stdio.h"
+#include <stdio.h>
 
 extern FILE * bcftools_stderr;
 
@@ -17,7 +17,7 @@ FILE * bcftools_set_stderr(int fd);
 
 /*! set pysam standard output to point to file descriptor
 
-  Setting the stderr will close the previous stdout.
+  Setting the stdout will close the previous stdout.
  */
 FILE * bcftools_set_stdout(int fd);
 
@@ -26,17 +26,15 @@ FILE * bcftools_set_stdout(int fd);
  */
 void bcftools_set_stdout_fn(const char * fn);
 
-/*! set pysam standard error to /dev/null.
+/*! close pysam standard error and set to NULL
   
-  Unsetting the stderr will close the previous stderr.
  */
-void bcftools_unset_stderr(void);
+void bcftools_close_stderr(void);
 
-/*! set pysam standard error to /dev/null.
+/*! close pysam standard output and set to NULL
   
-  Unsetting the stderr will close the previous stderr.
  */
-void bcftools_unset_stdout(void);
+void bcftools_close_stdout(void);
 
 int bcftools_puts(const char *s);
 
index ae0d953e491552e59449a3a5e877f2ca0281256b..bf71bf9d7a136cb508237516747541f3b9917f8c 100755 (executable)
@@ -5,6 +5,10 @@
 #
 # It is best to run this in a fresh clone of the repository!
 #
+# Before running, make sure to update image:
+#
+#   docker pull quay.io/pypa/manylinux1_x86_64
+#
 # Run this within the repository root:
 #   docker run --rm -v $(pwd):/io quay.io/pypa/manylinux1_x86_64 /io/buildwheels.sh
 #
@@ -24,13 +28,6 @@ fi
 
 yum install -y zlib-devel bzip2-devel xz-devel
 
-# Python 2.6 is not supported
-rm -r /opt/python/cp26*
-
-# Python 3.3 builds fail with:
-#  /opt/rh/devtoolset-2/root/usr/libexec/gcc/x86_64-CentOS-linux/4.8.2/ld: cannot find -lchtslib
-rm -r /opt/python/cp33*
-
 # Without libcurl support, htslib can open files from HTTP and FTP URLs.
 # With libcurl support, it also supports HTTPS and S3 URLs, but libcurl needs a
 # current version of OpenSSL, and we do not want to be responsible for
index 44e606a22c1e36a1d1c6fa371cc15682d6aa6ab6..7430cc58a21a786153ee88c28af06332d166962a 100644 (file)
@@ -94,16 +94,16 @@ def _update_pysam_files(cf, destdir):
                 outfile.write('#include "{}.pysam.h"\n\n'.format(basename))
                 subname, _ = os.path.splitext(os.path.basename(filename))
                 if subname in MAIN.get(basename, []):
-                    lines = re.sub("int main\(", "int {}_main(".format(
+                    lines = re.sub(r"int main\(", "int {}_main(".format(
                         basename), lines)
                 else:
-                    lines = re.sub("int main\(", "int {}_{}_main(".format(
+                    lines = re.sub(r"int main\(", "int {}_{}_main(".format(
                         basename, subname), lines)
                 lines = re.sub("stderr", "{}_stderr".format(basename), lines)
                 lines = re.sub("stdout", "{}_stdout".format(basename), lines)
-                lines = re.sub(" printf\(", " fprintf({}_stdout, ".format(basename), lines)
-                lines = re.sub("([^kf])puts\(", r"\1{}_puts(".format(basename), lines)
-                lines = re.sub("putchar\(([^)]+)\)",
+                lines = re.sub(r" printf\(", " fprintf({}_stdout, ".format(basename), lines)
+                lines = re.sub(r"([^kf])puts\(", r"\1{}_puts(".format(basename), lines)
+                lines = re.sub(r"putchar\(([^)]+)\)",
                                r"fputc(\1, {}_stdout)".format(basename), lines)
 
                 fn = os.path.basename(filename)
index 89988bb690986f026a660689a5aa5b697a7d8519..b20c8a07534290484305be7d73713c7cf8605bef 100755 (executable)
@@ -31,30 +31,33 @@ bash Miniconda3.sh -b
 # activate testenv environment
 source ~/miniconda3/bin/activate testenv
 
-conda config --add channels r
 conda config --add channels defaults
-conda config --add channels conda-forge
 conda config --add channels bioconda
+conda config --add channels conda-forge
 
 # pin versions, so that tests do not fail when pysam/htslib out of step
 # add htslib dependencies
-conda install -y "samtools=1.7" "bcftools=1.6" "htslib=1.7" xz curl bzip2
+# NB: we force conda-forge:ncurses due to bioconda/bioconda-recipes#13488
+conda install -y "samtools=1.9" "bcftools=1.9" "htslib=1.9" xz curl bzip2 conda-forge:ncurses
 
 # Need to make C compiler and linker use the anaconda includes and libraries:
 export PREFIX=~/miniconda3/
 export CFLAGS="-I${PREFIX}/include -L${PREFIX}/lib"
-export HTSLIB_CONFIGURE_OPTIONS="--disable-libcurl --disable-lzma"
+export HTSLIB_CONFIGURE_OPTIONS="--disable-libcurl"
 
+echo "show samtools, htslib, and bcftools versions"
 samtools --version
-htslib --version
+htsfile --version
 bcftools --version
 
 # Try building conda recipe first
 ~/miniconda3/bin/conda-build devtools/conda-recipe/ --python=$CONDA_PY
 
 # install code from the repository via setup.py
-echo "installing via setup.py from repository"
-python setup.py install
+echo
+echo "============ installing via setup.py from repository ============"
+echo
+python setup.py install || exit
 
 # create auxilliary data
 echo
index d2a9a80fceb21a16698528c3b2f39d199dd177cc..3f2c042deefc071e6757a7b521afae1574b52718 100644 (file)
@@ -142,6 +142,9 @@ BAM/SAM formatted files.
 .. autoclass:: pysam.AlignmentFile
    :members:
 
+.. autoclass:: pysam.AlignmentHeader
+   :members:
+
 An :class:`~pysam.AlignedSegment` represents an aligned segment within
 a SAM/BAM file.
 
index de3e032622696cfdefeb4f69ceb7dfe4faba9f87..4e9fa57e664337f87d42e911d723a1e9cce58f10 100644 (file)
@@ -24,7 +24,7 @@ Glossary
       are 0-based, half-open intervals, i.e., the position 10,000 is
       part of the interval, but 20,000 is not. An exception are
       :term:`samtools` compatible region strings such as
-      'chr1:10000:20000', which are closed, i.e., both positions 10,000
+      'chr1:10000-20000', which are closed, i.e., both positions 10,000
       and 20,000 are part of the interval.
 
    column
index 5adc437bc54f518420d2b796d7a582c8a245b9ac..ee1875b1b4241bc37305172b5928e827dc5cd9d5 100644 (file)
@@ -2,7 +2,27 @@
 Release notes
 =============
 
-Release 0.15.1
+Release 0.15.3
+==============
+
+Bugfix release.
+
+* [#824] allow reading of UTF-8 encoded text in VCF/BCF files.
+* [#780] close all filehandles before opening new ones in pysam_dispatch
+* [#773] do not cache VariantRecord.id to avoid memory leak
+* [#781] default of multiple_iterators=True is changed to False for
+  CRAM files.
+* [#825] fix collections.abc import
+* [#825] use bcf_hdr_format instead of bcf_hdr_fmt_text, fix memcpy
+  bug when setting FORMAT fields.
+* [#804] Use HTSlib's kstring_t, which reallocates and enlarges its
+  memory as needed, rather than a fixed-size char buffer.
+* [#814] Build wheels and upload them to PyPI
+* [#755] Allow passing flags and arguments to index methods
+* [#763] Strip \0 in header check
+* [#761] Test Tabix index contents, not the compression
+
+Release 0.15.2
 ==============
 
 Bugfix release.
index 6172329ac42390c345afbbeebc1f60b845fd590e..f4b7498070c1ad48d8553f1e3148392b7f17db07 100644 (file)
@@ -189,7 +189,7 @@ region can be retrieved by calling :meth:`~pysam.TabixFile.fetch()`::
          print (str(row))
 
 This will return a tuple-like data structure in which columns can
-be retrieved by numeric index:
+be retrieved by numeric index::
 
     for row in tbx.fetch("chr1", 1000, 2000):
          print ("chromosome is", row[0])
@@ -385,7 +385,7 @@ flagstat command and consists of three files:
 2. The cython implementation :file:`_pysam_flagstat.pyx`. This script
    imports the pysam API via::
 
-      from pysam.calignmentfile cimport AlignmentFile, AlignedSegment
+      from pysam.libcalignmentfile cimport AlignmentFile, AlignedSegment
 
    This statement imports, amongst others, :class:`AlignedSegment`
    into the namespace. Speed can be gained from declaring
index d08cf50e5542366ddb63ffed52a5eb16c7b505fc..56926222a882c97bc9a178415df746c599f8fc72 100644 (file)
@@ -10,7 +10,6 @@
 FILE * @pysam@_stderr = NULL;
 FILE * @pysam@_stdout = NULL;
 const char * @pysam@_stdout_fn = NULL;
-int @pysam@_stdout_fileno = STDOUT_FILENO;
 
 
 FILE * @pysam@_set_stderr(int fd)
@@ -21,11 +20,10 @@ FILE * @pysam@_set_stderr(int fd)
   return @pysam@_stderr;
 }
 
-void @pysam@_unset_stderr(void)
+void @pysam@_close_stderr(void)
 {
-  if (@pysam@_stderr != NULL)
-    fclose(@pysam@_stderr);
-  @pysam@_stderr = fopen("/dev/null", "w");
+  fclose(@pysam@_stderr);
+  @pysam@_stderr = NULL;
 }
 
 FILE * @pysam@_set_stdout(int fd)
@@ -37,7 +35,6 @@ FILE * @pysam@_set_stdout(int fd)
     {
       fprintf(@pysam@_stderr, "could not set stdout to fd %i", fd);
     }
-  @pysam@_stdout_fileno = fd;
   return @pysam@_stdout;
 }
 
@@ -46,12 +43,10 @@ void @pysam@_set_stdout_fn(const char *fn)
   @pysam@_stdout_fn = fn;
 }
 
-void @pysam@_unset_stdout(void)
+void @pysam@_close_stdout(void)
 {
-  if (@pysam@_stdout != NULL)
-    fclose(@pysam@_stdout);
-  @pysam@_stdout = fopen("/dev/null", "w");
-  @pysam@_stdout_fileno = STDOUT_FILENO;
+  fclose(@pysam@_stdout);
+  @pysam@_stdout = NULL;
 }
 
 int @pysam@_puts(const char *s)
index c3cc6231462eea087048f53a1e8d80ae35cbbcea..6abb884e7ad0653ec293a25dceeb04603cbc8851 100644 (file)
@@ -1,7 +1,7 @@
-#ifndef PYSAM_H
-#define PYSAM_H
+#ifndef @pysam@_PYSAM_H
+#define @pysam@_PYSAM_H
 
-#include "stdio.h"
+#include <stdio.h>
 
 extern FILE * @pysam@_stderr;
 
@@ -17,7 +17,7 @@ FILE * @pysam@_set_stderr(int fd);
 
 /*! set pysam standard output to point to file descriptor
 
-  Setting the stderr will close the previous stdout.
+  Setting the stdout will close the previous stdout.
  */
 FILE * @pysam@_set_stdout(int fd);
 
@@ -26,17 +26,15 @@ FILE * @pysam@_set_stdout(int fd);
  */
 void @pysam@_set_stdout_fn(const char * fn);
 
-/*! set pysam standard error to /dev/null.
+/*! close pysam standard error and set to NULL
   
-  Unsetting the stderr will close the previous stderr.
  */
-void @pysam@_unset_stderr(void);
+void @pysam@_close_stderr(void);
 
-/*! set pysam standard error to /dev/null.
+/*! close pysam standard output and set to NULL
   
-  Unsetting the stderr will close the previous stderr.
  */
-void @pysam@_unset_stdout(void);
+void @pysam@_close_stdout(void);
 
 int @pysam@_puts(const char *s);
 
index 25bd3e496493d9c623361144ca4ad7e005cfbf56..33c46c98e0c999a34dc0ab34f9acc1d44b97a2bf 100644 (file)
@@ -18,20 +18,6 @@ typedef khash_t(vdict) vdict_t;
 KHASH_DECLARE(s2i, kh_cstr_t, int64_t)
 typedef khash_t(s2i) s2i_t;
                     
-//////////////////////////////////////////////////////////////////
-/*! set pysam standard error to point to file descriptor
-
-  Setting the stderr will close the previous stderr.
- */
-// FILE * pysam_set_stderr(int fd);
-
-//////////////////////////////////////////////////////////////////
-/*! set pysam standard error to /dev/null.
-  
-  Unsetting the stderr will close the previous stderr.
- */
-// void pysam_unset_stderr();
-
 //////////////////////////////////////////////////////////////////
 //////////////////////////////////////////////////////////////////
 //////////////////////////////////////////////////////////////////
index 48ca93f7a0e37374c11fc41d9bae0d49da88cdca..c964160382f41893cf7569f726255cb458a3ad8c 100644 (file)
@@ -70,7 +70,7 @@ cdef class PileupColumn:
     cdef int n_pu
     cdef AlignmentHeader header
     cdef uint32_t min_base_quality
-    cdef uint8_t * buf
+    cdef kstring_t buf
     cdef char * reference_sequence
 
 cdef class PileupRead:
index 7ad61ac25ac671c961f77fd2733c158a81afb999..02ef5b679902d24e017e8bc776d0e926ab5bb0bd 100644 (file)
@@ -69,7 +69,6 @@ from cpython cimport array as c_array
 from libc.stdint cimport INT8_MIN, INT16_MIN, INT32_MIN, \
     INT8_MAX, INT16_MAX, INT32_MAX, \
     UINT8_MAX, UINT16_MAX, UINT32_MAX
-from libc.stdio cimport snprintf
 
 from pysam.libchtslib cimport HTS_IDX_NOCOOR
 from pysam.libcutils cimport force_bytes, force_str, \
@@ -89,9 +88,6 @@ cdef bint IS_PYTHON3 = PY_MAJOR_VERSION >= 3
 cdef char* CODE2CIGAR= "MIDNSHP=XB"
 cdef int NCIGAR_CODES = 10
 
-# dimensioned for 8000 pileup limit (+ insertions/deletions)
-cdef uint32_t MAX_PILEUP_BUFFER_SIZE = 10000
-
 if IS_PYTHON3:
     CIGAR2CODE = dict([y, x] for x, y in enumerate(CODE2CIGAR))
     maketrans = str.maketrans
@@ -632,9 +628,8 @@ cdef PileupColumn makePileupColumn(bam_pileup1_t ** plp,
     dest.n_pu = n_pu
     dest.min_base_quality = min_base_quality
     dest.reference_sequence = reference_sequence
-    dest.buf = <uint8_t *>calloc(MAX_PILEUP_BUFFER_SIZE, sizeof(uint8_t))
-    if dest.buf == NULL:
-        raise MemoryError("could not allocate pileup buffer")
+    dest.buf.l = dest.buf.m = 0
+    dest.buf.s = NULL
 
     return dest
 
@@ -1853,10 +1848,9 @@ cdef class AlignedSegment:
         return s
 
     def get_forward_qualities(self):
-        """return the original read sequence.
+        """return base qualities of the read sequence.
         
-        Reads mapping to the reverse strand will be reverse
-        complemented.
+        Reads mapping to the reverse strand will be reversed.
         """
         if self.is_reverse:
             return self.query_qualities[::-1]
@@ -2799,8 +2793,7 @@ cdef class PileupColumn:
             "\n".join(map(str, self.pileups))
 
     def __dealloc__(self):
-        if self.buf is not NULL:
-            free(self.buf)
+        free(self.buf.s)
 
     def set_min_base_quality(self, min_base_quality):
         """set the minimum base quality for this pileup column.
@@ -2971,15 +2964,16 @@ cdef class PileupColumn:
         cdef uint32_t x = 0
         cdef uint32_t j = 0
         cdef uint32_t c = 0
-        cdef uint32_t n = 0
         cdef uint8_t cc = 0
         cdef uint8_t rb = 0
-        cdef uint8_t * buf = self.buf
+        cdef kstring_t * buf = &self.buf
         cdef bam_pileup1_t * p = NULL
         
         if self.plp == NULL or self.plp[0] == NULL:
             raise ValueError("PileupColumn accessed after iterator finished")
 
+        buf.l = 0
+
         # todo: reference sequence to count matches/mismatches
         # todo: convert assertions to exceptions
         for x from 0 <= x < self.n_pu:
@@ -2992,16 +2986,12 @@ cdef class PileupColumn:
                 continue
             # see samtools pileup_seq
             if mark_ends and p.is_head:
-                buf[n] = '^'
-                n += 1
-                assert n < MAX_PILEUP_BUFFER_SIZE
+                kputc('^', buf)
                 
                 if p.b.core.qual > 93:
-                    buf[n] = 126
+                    kputc(126, buf)
                 else:
-                    buf[n] = p.b.core.qual + 33
-                n += 1
-                assert n < MAX_PILEUP_BUFFER_SIZE
+                    kputc(p.b.core.qual + 33, buf)
             if not p.is_del:
                 if p.qpos < p.b.core.l_qseq:
                     cc = <uint8_t>seq_nt16_str[bam_seqi(bam_get_seq(p.b), p.qpos)]
@@ -3012,68 +3002,44 @@ cdef class PileupColumn:
                     rb = self.reference_sequence[self.reference_pos]
                     if seq_nt16_table[cc] == seq_nt16_table[rb]:
                         cc = "="
-                buf[n] = strand_mark_char(cc, p.b)
-                n += 1
-                assert n < MAX_PILEUP_BUFFER_SIZE
+                kputc(strand_mark_char(cc, p.b), buf)
             elif add_indels:
                 if p.is_refskip:
                     if bam_is_rev(p.b):
-                        buf[n] = '<'
+                        kputc('<', buf)
                     else:
-                        buf[n] = '>'
+                        kputc('>', buf)
                 else:
-                    buf[n] = '*'
-                n += 1
-                assert n < MAX_PILEUP_BUFFER_SIZE
+                    kputc('*', buf)
             if add_indels:
                 if p.indel > 0:
-                    buf[n] = '+'
-                    n += 1
-                    assert n < MAX_PILEUP_BUFFER_SIZE
-                    n += snprintf(<char *>&(buf[n]),
-                                  MAX_PILEUP_BUFFER_SIZE - n,
-                                  "%i",
-                                  p.indel)
-                    assert n < MAX_PILEUP_BUFFER_SIZE
+                    kputc('+', buf)
+                    kputw(p.indel, buf)
                     for j from 1 <= j <= p.indel:
                         cc = seq_nt16_str[bam_seqi(bam_get_seq(p.b), p.qpos + j)]
-                        buf[n] = strand_mark_char(cc, p.b)
-                        n += 1
-                        assert n < MAX_PILEUP_BUFFER_SIZE
+                        kputc(strand_mark_char(cc, p.b), buf)
                 elif p.indel < 0:
-                    buf[n] = '-'
-                    n += 1
-                    assert n < MAX_PILEUP_BUFFER_SIZE
-                    n += snprintf(<char *>&(buf[n]),
-                                  MAX_PILEUP_BUFFER_SIZE - n,
-                                  "%i",
-                                  -p.indel)
-                    assert n < MAX_PILEUP_BUFFER_SIZE
+                    kputc('-', buf)
+                    kputw(-p.indel, buf)
                     for j from 1 <= j <= -p.indel:
                         # TODO: out-of-range check here?
                         if self.reference_sequence == NULL:
                             cc = 'N'
                         else:
                             cc = self.reference_sequence[self.reference_pos + j]
-                        buf[n] = strand_mark_char(cc, p.b)
-                        n += 1
-                        assert n < MAX_PILEUP_BUFFER_SIZE
+                        kputc(strand_mark_char(cc, p.b), buf)
             if mark_ends and p.is_tail:
-                buf[n] = '$'
-                n += 1
-                assert n < MAX_PILEUP_BUFFER_SIZE
+                kputc('$', buf)
 
-            buf[n] = ':'
-            n += 1
-            assert n < MAX_PILEUP_BUFFER_SIZE
+            kputc(':', buf)
 
-        if n == 0:
+        if buf.l == 0:
             # could be zero if all qualities are too low
             return ""
         else:
             # quicker to ensemble all and split than to encode all separately.
             # ignore last ":"
-            return force_str(PyBytes_FromStringAndSize(<char*>buf, n-1)).split(":")
+            return force_str(PyBytes_FromStringAndSize(buf.s, buf.l-1)).split(":")
 
     def get_query_qualities(self):
         """query base quality scores at pileup column position.
index 4030868affdd8c11d1e9af0f688d6765a310aafa..d35b0db3ff295541390344239dae28fda4a3bf40 100644 (file)
 ########################################################
 import os
 import collections
+try:
+    from collections.abc import Sequence, Mapping  # noqa
+except ImportError:
+    from collections import Sequence, Mapping  # noqa
 import re
 import warnings
 import array
@@ -99,11 +103,11 @@ IndexStats = collections.namedtuple("IndexStats",
 cdef int MAX_POS = (1 << 31) - 1
 
 # valid types for SAM headers
-VALID_HEADER_TYPES = {"HD" : collections.Mapping,
-                      "SQ" : collections.Sequence,
-                      "RG" : collections.Sequence,
-                      "PG" : collections.Sequence,
-                      "CO" : collections.Sequence}
+VALID_HEADER_TYPES = {"HD" : Mapping,
+                      "SQ" : Sequence,
+                      "RG" : Sequence,
+                      "PG" : Sequence,
+                      "CO" : Sequence}
 
 # order of records within SAM headers
 VALID_HEADERS = ("HD", "SQ", "RG", "PG", "CO")
@@ -293,7 +297,7 @@ cdef class AlignmentHeader(object):
                     raise ValueError(
                         "invalid type for record %s: %s, expected %s".format(
                             record, type(data), VALID_HEADER_TYPES[record]))
-                if isinstance(data, collections.Mapping):
+                if isinstance(data, Mapping):
                     lines.append(build_header_line(data, record))
                 else:
                     for fields in header_dict[record]:
@@ -303,7 +307,7 @@ cdef class AlignmentHeader(object):
         for record, data in sorted(header_dict.items()):
             if record in VALID_HEADERS:
                 continue
-            if isinstance(data, collections.Mapping):
+            if isinstance(data, Mapping):
                 lines.append(build_header_line(data, record))
             else:
                 for fields in header_dict[record]:
@@ -346,7 +350,7 @@ cdef class AlignmentHeader(object):
         return makeAlignmentHeader(bam_hdr_dup(self.ptr))
 
     property nreferences:
-        """"int with the number of :term:`reference` sequences in the file.
+        """int with the number of :term:`reference` sequences in the file.
 
         This is a read-only attribute."""
         def __get__(self):
@@ -418,7 +422,8 @@ cdef class AlignmentHeader(object):
         # convert to python string
         t = self.__str__()
         for line in t.split("\n"):
-            if not line.strip():
+            line = line.strip(' \0')
+            if not line:
                 continue
             assert line.startswith("@"), \
                 "header line without '@': '%s'" % line
@@ -454,13 +459,13 @@ cdef class AlignmentHeader(object):
                 # interpret type of known header record tags, default to str
                 x[key] = KNOWN_HEADER_FIELDS[record].get(key, str)(value)
 
-            if VALID_HEADER_TYPES[record] == collections.Mapping:
+            if VALID_HEADER_TYPES[record] == Mapping:
                 if record in result:
                     raise ValueError(
                         "multiple '%s' lines are not permitted" % record)
 
                 result[record] = x
-            elif VALID_HEADER_TYPES[record] == collections.Sequence:
+            elif VALID_HEADER_TYPES[record] == Sequence:
                 if record not in result: result[record] = []
                 result[record].append(x)
 
@@ -889,7 +894,7 @@ cdef class AlignmentFile(HTSFile):
                 self.header = template.header.copy()
             elif isinstance(header, AlignmentHeader):
                 self.header = header.copy()
-            elif isinstance(header, collections.Mapping):
+            elif isinstance(header, Mapping):
                 self.header = AlignmentHeader.from_dict(header)
             elif reference_names and reference_lengths:
                 self.header = AlignmentHeader.from_references(
@@ -1604,7 +1609,6 @@ cdef class AlignmentFile(HTSFile):
         Or it can be a generator filtering such reads. Example
         samfile.find_introns((read for read in samfile.fetch(...) if read.is_reverse)
         """
-        import collections
         res = collections.Counter()
         for r in read_iterator:
             if 'N' in r.cigarstring:
@@ -1635,7 +1639,6 @@ cdef class AlignmentFile(HTSFile):
             AlignedSegment r
             int BAM_CREF_SKIP = 3 #BAM_CREF_SKIP
 
-        import collections
         res = collections.Counter()
 
         match_or_deletion = {0, 2, 7, 8} # only M/=/X (0/7/8) and D (2) are related to genome position
@@ -1898,7 +1901,7 @@ cdef class AlignmentFile(HTSFile):
         return self.header.get_reference_length(reference)
     
     property nreferences:
-        """"int with the number of :term:`reference` sequences in the file.
+        """int with the number of :term:`reference` sequences in the file.
         This is a read-only attribute."""
         def __get__(self):
             if self.header:
@@ -2610,10 +2613,16 @@ cdef class IteratorColumnRegion(IteratorColumn):
                   int start = 0,
                   int stop = MAX_POS,
                   int truncate = False,
+                  int multiple_iterators = True,
                   **kwargs ):
 
-        # initialize iterator
-        self._setup_iterator(tid, start, stop, 1)
+        # initialize iterator. Multiple iterators not available
+        # for CRAM.
+        if multiple_iterators and samfile.is_cram:
+            warnings.warn("multiple_iterators not implemented for CRAM")
+            multiple_iterators = False
+
+        self._setup_iterator(tid, start, stop, multiple_iterators)
         self.start = start
         self.stop = stop
         self.truncate = truncate
index 9ac58dc126cf664471c6656b1606fe3c3219d580..e40a8013035a39c33bda78933079981c126de0dc 100644 (file)
@@ -95,7 +95,7 @@ from cpython.ref     cimport Py_INCREF
 from cpython.dict    cimport PyDict_GetItemString, PyDict_SetItemString
 from cpython.tuple   cimport PyTuple_New, PyTuple_SET_ITEM
 from cpython.bytes   cimport PyBytes_FromStringAndSize
-from cpython.unicode cimport PyUnicode_DecodeASCII
+from cpython.unicode cimport PyUnicode_DecodeUTF8
 from cpython.version cimport PY_MAJOR_VERSION
 
 from pysam.libchtslib cimport HTSFile, hisremote
@@ -151,7 +151,7 @@ cdef inline bcf_str_cache_get_charptr(const char* s):
     if PY_MAJOR_VERSION < 3:
         val = s
     else:
-        val = PyUnicode_DecodeASCII(s, strlen(s), NULL)
+        val = PyUnicode_DecodeUTF8(s, strlen(s), NULL)
 
     PyDict_SetItemString(bcf_str_cache, s, val)
 
@@ -282,7 +282,7 @@ cdef bcf_array_to_object(void *data, int type, ssize_t n, ssize_t count, int sca
             else:
                 # Otherwise, copy the entire block
                 b = datac[:n]
-            value = tuple(v.decode('ascii') if v and v != bcf_str_missing else None for v in b.split(b','))
+            value = tuple(v.decode('utf-8') if v and v != bcf_str_missing else None for v in b.split(b','))
     else:
         value = []
         if type == BCF_BT_INT8:
@@ -412,9 +412,13 @@ cdef bcf_empty_array(int type, ssize_t n, int vlen):
     return value
 
 
-cdef bcf_copy_expand_array(void *src_data, int src_type, ssize_t src_values,
-                           void *dst_data, int dst_type, ssize_t dst_values,
+cdef bcf_copy_expand_array(void *src_data, int src_type, size_t src_values,
+                           void *dst_data, int dst_type, size_t dst_values,
                            int vlen):
+    """copy data from src to dest where the size of the elements (src_type/dst_type) differ
+    as well as the number of elements (src_values/dst_values).
+    """
+    
     cdef char    *src_datac
     cdef char    *dst_datac
     cdef int8_t  *src_datai8
@@ -432,7 +436,7 @@ cdef bcf_copy_expand_array(void *src_data, int src_type, ssize_t src_values,
     if src_type == dst_type == BCF_BT_CHAR:
         src_datac = <char *>src_data
         dst_datac = <char *>dst_data
-        memcpy(src_datac, dst_datac, src_values)
+        memcpy(dst_datac, src_datac, src_values)
         for i in range(src_values, dst_values):
             dst_datac[i] = 0
     elif src_type == BCF_BT_INT8 and dst_type == BCF_BT_INT32:
@@ -574,7 +578,7 @@ cdef object bcf_check_values(VariantRecord record, value, int sample,
     cdef int given = len(values)
     if value_count[0] != -1 and value_count[0] != given:
         if scalar[0]:
-            raise TypeError('value expected to be scalar, given len={}'.format(value_count[0], given))
+            raise TypeError('value expected to be scalar, given len={}'.format(given))
         else:
             raise TypeError('values expected to be {}-tuple, given len={}'.format(value_count[0], given))
 
@@ -813,14 +817,14 @@ cdef bcf_format_set_value(VariantRecordSample sample, key, value):
     if key == 'phased':
         sample.phased = bool(value)
         return
-
+    
     cdef bcf_hdr_t *hdr = sample.record.header.ptr
     cdef bcf1_t *r = sample.record.ptr
     cdef int fmt_id
     cdef vdict_t *d
     cdef khiter_t k
     cdef int fmt_type, scalar, realloc, dst_type, vlen = 0
-    cdef ssize_t i, n, value_count, alloc_size, alloc_len, dst_size
+    cdef ssize_t i, nsamples, value_count, alloc_size, alloc_len, dst_size
 
     if bcf_unpack(r, BCF_UN_ALL) < 0:
         raise ValueError('Error unpacking VariantRecord')
@@ -857,11 +861,10 @@ cdef bcf_format_set_value(VariantRecordSample sample, key, value):
                               fmt.type if fmt else -1,
                               fmt.n    if fmt else -1,
                               &value_count, &scalar, &realloc)
-
     vlen = value_count < 0
     value_count = len(values)
-
-    # If we can, write updated values to existing allocated storage
+    
+    # If we can, write updated values to existing allocated storage.
     if fmt and not realloc:
         r.d.indiv_dirty = 1
         bcf_object_to_array(values, fmt.p + sample.index * fmt.size, fmt.type, fmt.n, vlen)
@@ -871,9 +874,9 @@ cdef bcf_format_set_value(VariantRecordSample sample, key, value):
     if fmt and fmt.n > alloc_len:
         alloc_len = fmt.n
 
-    n = r.n_sample
-    new_values = bcf_empty_array(fmt_type, n*alloc_len, vlen)
-    cdef char *valp = <char *>new_values
+    nsamples = r.n_sample
+    new_values = bcf_empty_array(fmt_type, nsamples * alloc_len, vlen)
+    cdef char *new_values_p = <char *>new_values
 
     if fmt_type == BCF_HT_INT:
         dst_type = BCF_BT_INT32
@@ -887,15 +890,15 @@ cdef bcf_format_set_value(VariantRecordSample sample, key, value):
     else:
         raise ValueError('Unsupported FORMAT type')
 
-    if fmt and n > 1:
-        for i in range(n):
-            bcf_copy_expand_array(fmt.p + i*fmt.size, fmt.type, fmt.n,
-                                  valp  + i*dst_size, dst_type, alloc_len,
+    if fmt and nsamples > 1:
+        for i in range(nsamples):
+            bcf_copy_expand_array(fmt.p + i * fmt.size, fmt.type, fmt.n,
+                                  new_values_p  + i * dst_size, dst_type, alloc_len,
                                   vlen)
 
-    bcf_object_to_array(values, valp + sample.index*dst_size, dst_type, alloc_len, vlen)
+    bcf_object_to_array(values, new_values_p + sample.index * dst_size, dst_type, alloc_len, vlen)
 
-    if bcf_update_format(hdr, r, bkey, valp, <int>(n*alloc_len), fmt_type) < 0:
+    if bcf_update_format(hdr, r, bkey, new_values_p, <int>(nsamples * alloc_len), fmt_type) < 0:
         raise ValueError('Unable to update format values')
 
 
@@ -1160,7 +1163,7 @@ cdef inline bcf_sync_end(VariantRecord record):
     cdef bcf_hdr_t *hdr = record.header.ptr
     cdef bcf_info_t *info
     cdef int end_id = bcf_header_get_info_id(record.header.ptr, b'END')
-    cdef int ref_len 
+    cdef int ref_len
 
     # allow missing ref when instantiating a new record
     if record.ref is not None:
@@ -2036,12 +2039,20 @@ cdef class VariantHeader(object):
 
     def __str__(self):
         cdef int hlen
-        cdef char *hstr = bcf_hdr_fmt_text(self.ptr, 0, &hlen)
+        cdef kstring_t line
+        line.l = line.m = 0
+        line.s = NULL
 
-        try:
-            return charptr_to_str_w_len(hstr, hlen)
-        finally:
-            free(hstr)
+        if bcf_hdr_format(self.ptr, 0, &line) < 0:
+            if line.m:
+                free(line.s)
+            raise ValueError('bcf_hdr_format failed')
+
+        ret = charptr_to_str_w_len(line.s, line.l)
+
+        if line.m:
+            free(line.s)
+        return ret
 
     def new_record(self, contig=None, start=0, stop=0, alleles=None,
                          id=None, qual=None, filter=None, info=None, samples=None,
@@ -3121,7 +3132,9 @@ cdef class VariantRecord(object):
         cdef bcf1_t *r = self.ptr
         if bcf_unpack(r, BCF_UN_STR) < 0:
             raise ValueError('Error unpacking VariantRecord')
-        return bcf_str_cache_get_charptr(r.d.id) if r.d.id != b'.' else None
+        # causes a memory leak https://github.com/pysam-developers/pysam/issues/773
+        # return bcf_str_cache_get_charptr(r.d.id) if r.d.id != b'.' else None
+        return charptr_to_str(r.d.id) if r.d.id != b'.' else None
 
     @id.setter
     def id(self, value):
@@ -3178,7 +3191,7 @@ cdef class VariantRecord(object):
     @alleles.setter
     def alleles(self, values):
         cdef bcf1_t *r = self.ptr
-        
+
         # Cache rlen of symbolic alleles before call to bcf_update_alleles_str
         cdef int rlen = r.rlen
 
index f1c8d764dcd939152718cd6f5b4869fdcd18d5c0..62a6f3d4ad9c1362115de52c5859521a315b2cc6 100644 (file)
@@ -2,8 +2,8 @@ cdef extern from "bcftools.pysam.h":
 
     int bcftools_main(int argc, char *argv[])
     void bcftools_set_stderr(int fd)
-    void bcftools_unset_stderr()
+    void bcftools_close_stderr()
     void bcftools_set_stdout(int fd)
     void bcftools_set_stdout_fn(const char *)
-    void bcftools_unset_stdout()
+    void bcftools_close_stdout()
     void bcftools_set_optind(int)
index cf783b999493568cb13aacd0f6937cf687bee82d..40d8430fbdab48bb75d59833029b103f01482924 100644 (file)
@@ -214,7 +214,7 @@ cdef class FastaFile:
         return False
 
     property closed:
-        """"bool indicating the current state of the file object.
+        """bool indicating the current state of the file object.
         This is a read-only attribute; the close() method changes the value.
         """
         def __get__(self):
@@ -231,7 +231,7 @@ cdef class FastaFile:
             return self._references
 
     property nreferences:
-        """"int with the number of :term:`reference` sequences in the file.
+        """int with the number of :term:`reference` sequences in the file.
         This is a read-only attribute."""
         def __get__(self):
             return len(self._references) if self.references else None
@@ -610,7 +610,7 @@ cdef class FastxFile:
         return False
 
     property closed:
-        """"bool indicating the current state of the file object.
+        """bool indicating the current state of the file object.
         This is a read-only attribute; the close() method changes the value.
         """
         def __get__(self):
index 8bcf399f36266ef4c2d07fb053a39d5392da5690..9dafee30f8b00296acd1e5792cd4ff8f01fc16e6 100644 (file)
@@ -20,6 +20,11 @@ cdef extern from "htslib/kstring.h" nogil:
         size_t l, m
         char *s
 
+    int kputc(int c, kstring_t *s)
+    int kputw(int c, kstring_t *s)
+    int kputl(long c, kstring_t *s)
+    int ksprintf(kstring_t *s, const char *fmt, ...)
+
 
 cdef extern from "htslib_util.h" nogil:
     int hts_set_verbosity(int verbosity)
@@ -1703,6 +1708,12 @@ cdef extern from "htslib/vcf.h" nogil:
     # Read VCF header from a file and update the header
     int bcf_hdr_set(bcf_hdr_t *hdr, const char *fname)
 
+    # Appends formatted header text to _str_.
+    # If _is_bcf_ is zero, `IDX` fields are discarded.
+    #  @return 0 if successful, or negative if an error occurred
+    #  @since 1.4
+    int bcf_hdr_format(const bcf_hdr_t *hdr, int is_bcf, kstring_t *str);
+    
     # Returns formatted header (newly allocated string) and its length,
     # excluding the terminating \0. If is_bcf parameter is unset, IDX
     # fields are discarded.
index ff797f879eed63a3ba427855c09a214817761349..70fda608fd4fb85b279029fb4de390523136fbfe 100644 (file)
@@ -2,8 +2,8 @@ cdef extern from "samtools.pysam.h":
 
     int samtools_main(int argc, char *argv[])
     void samtools_set_stderr(int fd)
-    void samtools_unset_stderr()
+    void samtools_close_stderr()
     void samtools_set_stdout(int fd)
     void samtools_set_stdout_fn(const char *)
-    void samtools_unset_stdout()
+    void samtools_close_stdout()
     void samtools_set_optind(int)
index 994a24009777bae6f6c873d04b58162dff8ae1c8..ab8e9a63f6a17bec2fb8c36fe052faad23cdbe16 100644 (file)
@@ -18,10 +18,10 @@ from libc.stdio cimport stdout as c_stdout
 from posix.fcntl cimport open as c_open, O_WRONLY
 
 from libcsamtools cimport samtools_main, samtools_set_stdout, samtools_set_stderr, \
-    samtools_unset_stderr, samtools_unset_stdout, samtools_set_stdout_fn, samtools_set_optind
+    samtools_close_stdout, samtools_close_stderr, samtools_set_stdout_fn, samtools_set_optind
 
 from libcbcftools cimport bcftools_main, bcftools_set_stdout, bcftools_set_stderr, \
-    bcftools_unset_stderr, bcftools_unset_stdout, bcftools_set_stdout_fn, bcftools_set_optind
+    bcftools_close_stdout, bcftools_close_stderr, bcftools_set_stdout_fn, bcftools_set_optind
 
 #####################################################################
 # hard-coded constants
@@ -91,14 +91,14 @@ cdef bint IS_PYTHON3 = PY_MAJOR_VERSION >= 3
 
 cdef from_string_and_size(const char* s, size_t length):
     if IS_PYTHON3:
-        return s[:length].decode("ascii")
+        return s[:length].decode("utf8")
     else:
         return s[:length]
 
 
 # filename encoding (adapted from lxml.etree.pyx)
 cdef str FILENAME_ENCODING = sys.getfilesystemencoding() or sys.getdefaultencoding() or 'ascii'
-
+cdef str TEXT_ENCODING = 'utf-8'
 
 cdef bytes encode_filename(object filename):
     """Make sure a filename is 8-bit encoded (or None)."""
@@ -115,9 +115,9 @@ cdef bytes encode_filename(object filename):
         raise TypeError("Argument must be string or unicode.")
 
 
-cdef bytes force_bytes(object s, encoding="ascii"):
+cdef bytes force_bytes(object s, encoding=TEXT_ENCODING):
     """convert string or unicode object to bytes, assuming
-    ascii encoding.
+    utf8 encoding.
     """
     if s is None:
         return None
@@ -129,7 +129,7 @@ cdef bytes force_bytes(object s, encoding="ascii"):
         raise TypeError("Argument must be string, bytes or unicode.")
 
 
-cdef charptr_to_str(const char* s, encoding="ascii"):
+cdef charptr_to_str(const char* s, encoding=TEXT_ENCODING):
     if s == NULL:
         return None
     if PY_MAJOR_VERSION < 3:
@@ -138,7 +138,7 @@ cdef charptr_to_str(const char* s, encoding="ascii"):
         return s.decode(encoding)
 
 
-cdef charptr_to_str_w_len(const char* s, size_t n, encoding="ascii"):
+cdef charptr_to_str_w_len(const char* s, size_t n, encoding=TEXT_ENCODING):
     if s == NULL:
         return None
     if PY_MAJOR_VERSION < 3:
@@ -147,14 +147,14 @@ cdef charptr_to_str_w_len(const char* s, size_t n, encoding="ascii"):
         return s[:n].decode(encoding)
 
 
-cdef bytes charptr_to_bytes(const char* s, encoding="ascii"):
+cdef bytes charptr_to_bytes(const char* s, encoding=TEXT_ENCODING):
     if s == NULL:
         return None
     else:
         return s
 
 
-cdef force_str(object s, encoding="ascii"):
+cdef force_str(object s, encoding=TEXT_ENCODING):
     """Return s converted to str type of current Python
     (bytes in Py2, unicode in Py3)"""
     if s is None:
@@ -274,9 +274,25 @@ def _pysam_dispatch(collection,
     False.
     '''
 
-    if method == "index":
-        if args and not os.path.exists(args[0]):
-            raise IOError("No such file or directory: '%s'" % args[0])
+    if method == "index" and args:
+        # We make sure that at least 1 input file exists,
+        # and if it doesn't we raise an IOError.
+        SIMPLE_FLAGS = ['-c', '--csi', '-f', '--force', '-t', '--tbi', '-n', '--nstats', '-s', '--stats']
+        ARGUMENTS = ['-m', '--min-shift', '-o', '--output-file', '--threads', '-@']
+        skip_next = False
+        for arg in args:
+            if skip_next:
+                skip_next = False
+                continue
+            if arg in SIMPLE_FLAGS or (len(arg) > 2 and arg.startswith('-@')):
+                continue
+            if arg in ARGUMENTS:
+                skip_next = True
+                continue
+            if not os.path.exists(arg):
+                raise IOError("No such file or directory: '%s'" % arg)
+            else:
+                break
             
     if args is None:
         args = []
@@ -285,8 +301,6 @@ def _pysam_dispatch(collection,
 
     # redirect stderr to file
     stderr_h, stderr_f = tempfile.mkstemp()
-    samtools_set_stderr(stderr_h)
-    bcftools_set_stderr(stderr_h)
         
     # redirect stdout to file
     if save_stdout:
@@ -297,9 +311,7 @@ def _pysam_dispatch(collection,
             raise IOError("error while opening {} for writing".format(stdout_f))
 
         samtools_set_stdout_fn(force_bytes(stdout_f))
-        samtools_set_stdout(stdout_h)
         bcftools_set_stdout_fn(force_bytes(stdout_f))
-        bcftools_set_stdout(stdout_h)
             
     elif catch_stdout:
         stdout_h, stdout_f = tempfile.mkstemp()
@@ -329,12 +341,11 @@ def _pysam_dispatch(collection,
             samtools_set_stdout_fn(force_bytes(stdout_f))
             bcftools_set_stdout_fn(force_bytes(stdout_f))
             args.extend(stdout_option.format(stdout_f).split(" "))
-        else:
-            samtools_set_stdout(stdout_h)
-            bcftools_set_stdout(stdout_h)
+            stdout_h = c_open(b"/dev/null", O_WRONLY)
     else:
         samtools_set_stdout_fn("-")
         bcftools_set_stdout_fn("-")
+        stdout_h = c_open(b"/dev/null", O_WRONLY)
 
     # setup the function call to samtools/bcftools main
     cdef char ** cargs
@@ -373,9 +384,17 @@ def _pysam_dispatch(collection,
 
     # call samtools/bcftools
     if collection == b"samtools":
+        samtools_set_stdout(stdout_h)
+        samtools_set_stderr(stderr_h)
         retval = samtools_main(n + 2, cargs)
+        samtools_close_stdout()
+        samtools_close_stderr()
     elif collection == b"bcftools":
+        bcftools_set_stdout(stdout_h)
+        bcftools_set_stderr(stderr_h)
         retval = bcftools_main(n + 2, cargs)
+        bcftools_close_stdout()
+        bcftools_close_stderr()
 
     for i from 0 <= i < n:
         free(cargs[i + 2])
@@ -395,13 +414,6 @@ def _pysam_dispatch(collection,
             os.remove(fn)
         return out
 
-    samtools_unset_stderr()
-    bcftools_unset_stderr()
-
-    if save_stdout or catch_stdout:
-        samtools_unset_stdout()
-        bcftools_unset_stdout()
-
     out_stderr = _collect(stderr_f)
     if save_stdout:
         out_stdout = None
index d4ec16163031ec5f58a619710a15e6220b7465f5..49afa1770f0a0e20dfc159093304a4fa50f028df 100644 (file)
@@ -1,5 +1,5 @@
 # pysam versioning information
-__version__ = "0.15.2"
+__version__ = "0.15.3"
 
 # TODO: upgrade number
 __samtools_version__ = "1.9"
index edbd9a2d4821ff2e0fae274f548306f24942a3a8..b26f892a93c77f51244b8e26481c99df5c0df90c 100644 (file)
@@ -10,7 +10,6 @@
 FILE * samtools_stderr = NULL;
 FILE * samtools_stdout = NULL;
 const char * samtools_stdout_fn = NULL;
-int samtools_stdout_fileno = STDOUT_FILENO;
 
 
 FILE * samtools_set_stderr(int fd)
@@ -21,11 +20,10 @@ FILE * samtools_set_stderr(int fd)
   return samtools_stderr;
 }
 
-void samtools_unset_stderr(void)
+void samtools_close_stderr(void)
 {
-  if (samtools_stderr != NULL)
-    fclose(samtools_stderr);
-  samtools_stderr = fopen("/dev/null", "w");
+  fclose(samtools_stderr);
+  samtools_stderr = NULL;
 }
 
 FILE * samtools_set_stdout(int fd)
@@ -37,7 +35,6 @@ FILE * samtools_set_stdout(int fd)
     {
       fprintf(samtools_stderr, "could not set stdout to fd %i", fd);
     }
-  samtools_stdout_fileno = fd;
   return samtools_stdout;
 }
 
@@ -46,12 +43,10 @@ void samtools_set_stdout_fn(const char *fn)
   samtools_stdout_fn = fn;
 }
 
-void samtools_unset_stdout(void)
+void samtools_close_stdout(void)
 {
-  if (samtools_stdout != NULL)
-    fclose(samtools_stdout);
-  samtools_stdout = fopen("/dev/null", "w");
-  samtools_stdout_fileno = STDOUT_FILENO;
+  fclose(samtools_stdout);
+  samtools_stdout = NULL;
 }
 
 int samtools_puts(const char *s)
index d97ee250acc5da1e9837170f71a6d248308cc32b..df8fd0179234306c3d83ccafbd0596b9d2886f0c 100644 (file)
@@ -1,7 +1,7 @@
-#ifndef PYSAM_H
-#define PYSAM_H
+#ifndef samtools_PYSAM_H
+#define samtools_PYSAM_H
 
-#include "stdio.h"
+#include <stdio.h>
 
 extern FILE * samtools_stderr;
 
@@ -17,7 +17,7 @@ FILE * samtools_set_stderr(int fd);
 
 /*! set pysam standard output to point to file descriptor
 
-  Setting the stderr will close the previous stdout.
+  Setting the stdout will close the previous stdout.
  */
 FILE * samtools_set_stdout(int fd);
 
@@ -26,17 +26,15 @@ FILE * samtools_set_stdout(int fd);
  */
 void samtools_set_stdout_fn(const char * fn);
 
-/*! set pysam standard error to /dev/null.
+/*! close pysam standard error and set to NULL
   
-  Unsetting the stderr will close the previous stderr.
  */
-void samtools_unset_stderr(void);
+void samtools_close_stderr(void);
 
-/*! set pysam standard error to /dev/null.
+/*! close pysam standard output and set to NULL
   
-  Unsetting the stderr will close the previous stderr.
  */
-void samtools_unset_stdout(void);
+void samtools_close_stdout(void);
 
 int samtools_puts(const char *s);
 
index 49261f95bc151fa8dac8bfb18ef47bed42fe90ae..4c97e8746431fa587a03817f7aa99feb52aa9928 100644 (file)
--- a/setup.py
+++ b/setup.py
@@ -13,8 +13,6 @@ This module provides a low-level wrapper around the htslib C-API as
 using cython and a high-level API for convenient access to the data
 within standard genomic file formats.
 
-The current version wraps htslib-1.7, samtools-1.7 and bcftools-1.6.
-
 See:
 http://www.htslib.org
 https://github.com/pysam-developers/pysam
@@ -53,6 +51,7 @@ def changedir(path):
 
 
 def run_configure(option):
+    sys.stdout.flush()
     try:
         retcode = subprocess.call(
             " ".join(("./configure", option)),
@@ -80,6 +79,25 @@ def run_make_print_config():
     return make_print_config
 
 
+@contextmanager
+def set_compiler_envvars():
+    tmp_vars = []
+    for var in ['CC', 'CFLAGS', 'LDFLAGS']:
+        if var in os.environ:
+            print ("# pysam: (env) {}={}".format(var, os.environ[var]))
+        elif var in sysconfig.get_config_vars():
+            value = sysconfig.get_config_var(var)
+            print ("# pysam: (sysconfig) {}={}".format(var, value))
+            os.environ[var] = value
+            tmp_vars += [var]
+
+    try:
+        yield
+    finally:
+        for var in tmp_vars:
+            del os.environ[var]
+
+
 def configure_library(library_dir, env_options=None, options=[]):
 
     configure_script = os.path.join(library_dir, "configure")
@@ -93,7 +111,7 @@ def configure_library(library_dir, env_options=None, options=[]):
         raise ValueError(
             "configure script {} does not exist".format(configure_script))
 
-    with changedir(library_dir):
+    with changedir(library_dir), set_compiler_envvars():
         if env_options is not None:
             if run_configure(env_options):
                 return env_options
@@ -120,7 +138,7 @@ def get_pysam_version():
     sys.path.insert(0, "pysam")
     import version
     return version.__version__
-    
+
 
 # How to link against HTSLIB
 # shared:   build shared chtslib from builtin htslib code.
@@ -412,7 +430,7 @@ Operating System :: POSIX
 Operating System :: Unix
 Operating System :: MacOS
 """
-    
+
 metadata = {
     'name': "pysam",
     'version': get_pysam_version(),
index ec0f5e1d42ecf081a78e1f98f57a99b97c5c7629..e0be5ef3c5a95b128940036e521f0ccec47b15e4 100644 (file)
@@ -22,7 +22,7 @@ from functools import partial
 
 import pysam
 import pysam.samtools
-from TestUtils import checkBinaryEqual, check_url, \
+from TestUtils import checkBinaryEqual, checkGZBinaryEqual, check_url, \
     check_samtools_view_equal, checkFieldEqual, force_str, \
     get_temp_filename, BAM_DATADIR
 
@@ -470,7 +470,8 @@ class TestIO(unittest.TestCase):
         self.checkEcho("ex2.bam",
                        "ex2.bam",
                        "tmp_ex2.bam",
-                       "rb", "wb")
+                       "rb", "wb",
+                       checkf=checkGZBinaryEqual)
 
     def testCRAM2CRAM(self):
         # in some systems different reference sequence paths might be
@@ -489,7 +490,8 @@ class TestIO(unittest.TestCase):
         self.checkEcho("ex2.sam",
                        "ex2.bam",
                        "tmp_ex2.bam",
-                       "r", "wb")
+                       "r", "wb",
+                       checkf=checkGZBinaryEqual)
 
     def testBAM2SAM(self):
         self.checkEcho("ex2.bam",
@@ -665,7 +667,8 @@ class TestIO(unittest.TestCase):
         self.checkEcho("ex1.bam",
                        "ex1.bam",
                        "tmp_ex1.bam",
-                       "rb", "wb")
+                       "rb", "wb",
+                       checkf=checkGZBinaryEqual)
 
         f = open(os.path.join(BAM_DATADIR, "ex1.bam"))
         samfile = pysam.AlignmentFile(f, "rb")
@@ -1372,7 +1375,7 @@ class TestDeNovoConstruction(unittest.TestCase):
         outfile.close()
 
         self.assertTrue(
-            checkBinaryEqual(tmpfilename, self.bamfile),
+            checkGZBinaryEqual(tmpfilename, self.bamfile),
             "mismatch when construction BAM file, see %s %s" %
             (tmpfilename, self.bamfile))
 
@@ -1407,6 +1410,11 @@ class TestEmptyHeader(unittest.TestCase):
         self.assertTrue("SQ" in s.header.to_dict())
         self.assertTrue("@SQ" in str(s.header))
 
+    def test_bam_without_seq_with_null_bytes_in_header(self):
+        s = pysam.AlignmentFile(os.path.join(BAM_DATADIR, "example_no_seq_in_header_null_bytes.bam"))
+        self.assertTrue("SQ" in s.header.to_dict())
+        self.assertTrue("@SQ" in str(s.header))
+
 
 class TestMismatchingHeader(unittest.TestCase):
     '''see issue 716.'''
index 7811b9e1822070c6078967f185de927b275c7198..f33761e5836887e400710bfa3fc36c6fcd141244 100644 (file)
@@ -96,6 +96,19 @@ def checkBinaryEqual(filename1, filename2):
     return found
 
 
+def checkGZBinaryEqual(filename1, filename2):
+    '''return true if the decompressed contents of the two files
+    are binary equal.
+    '''
+    with gzip.open(filename1, "rb") as infile1:
+        d1 = infile1.read()
+        with gzip.open(filename2, "rb") as infile2:
+            d2 = infile2.read()
+        if d1 == d2:
+            return True
+    return False
+
+
 def check_samtools_view_equal(
         filename1, filename2,
         without_header=False):
index 88bd34ff4a8674033fc102362e3ec1f9ec613c50..4458d1f163569e186b023bf5cedc161a6ce228fb 100644 (file)
@@ -1,4 +1,7 @@
+# -*- coding: utf-8 -*-
+
 import os
+import glob
 import sys
 import unittest
 import pysam
@@ -633,6 +636,37 @@ class TestSubsetting(unittest.TestCase):
             inf.subset_samples(["NA00001"])
 
 
+class TestVCFVersions(unittest.TestCase):
+
+    def setUp(self):
+        self.files_to_test = (glob.glob(os.path.join(CBCF_DATADIR, "example_v*.vcf.gz")) +
+                              glob.glob(os.path.join(CBCF_DATADIR, "example_v*.vcf")) +
+                              glob.glob(os.path.join(CBCF_DATADIR, "example_v*.bcf")))
+    
+    def test_all_records_can_be_fetched(self):
+
+        for fn in self.files_to_test:
+            with pysam.VariantFile(fn) as inf:
+                records = list(inf.fetch())
+
+
+class TestUnicode(unittest.TestCase):
+    filename = "example_vcf43_with_utf8.vcf.gz"
+
+    def test_record_with_unicode(self):
+        with pysam.VariantFile(os.path.join(CBCF_DATADIR,
+                                            self.filename)) as inf:
+            records = list(inf.fetch("20", 2345677, 2345678))
+        self.assertEqual(len(records), 1)
+        record = records.pop()
+        self.assertEqual(
+            record.info["CLNVI"],
+            ('Institute_of_Human_Genetics', u'Friedrich-Alexander-Universität_Erlangen-Nürnberg'))
+        self.assertEqual(record.samples[0]["AN"], "alpha")
+        self.assertEqual(record.samples[1]["AN"], u"ä")
+        self.assertEqual(record.samples[2]["AN"], u"ü")
+                
+
 if __name__ == "__main__":
     # build data files
     print("building data files")
diff --git a/tests/VariantRecord_test.py b/tests/VariantRecord_test.py
new file mode 100644 (file)
index 0000000..fd80a80
--- /dev/null
@@ -0,0 +1,72 @@
+import os
+import glob
+import sys
+import unittest
+import pysam
+import shutil
+import gzip
+import subprocess
+import pytest
+
+try:
+    from pathlib import Path
+except ImportError:
+    Path = None
+
+from TestUtils import get_temp_filename, check_lines_equal, load_and_convert, CBCF_DATADIR, get_temp_context
+
+
+@pytest.fixture
+def vcf_header():
+    vcf_header = pysam.VariantHeader()
+    vcf_header.add_sample("sample1")
+    vcf_header.add_sample("sample2")
+    vcf_header.contigs.add("1")
+    return vcf_header
+
+## segfault without coordinates
+
+def test_ascii_annotation_can_be_added(vcf_header):
+    vcf_header.formats.add("AN", 1, "String", "An annotation")
+    record = vcf_header.new_record(
+        contig="1",
+        start=12,
+        stop=13,
+        samples=[
+            {"AN": "anno1"},
+            {"AN": "anno2"}])
+    assert str(record)[:-1].split("\t")[-2:] == ["anno1", "anno2"]
+
+
+def test_ascii_annotation_with_variable_length_can_be_added(vcf_header):
+    vcf_header.formats.add("AN", 1, "String", "An annotation")
+    record = vcf_header.new_record(
+        contig="1",
+        start=12,
+        stop=13,
+        samples=[
+            {"AN": "anno1b"},
+            {"AN": "anno1"}])
+    assert str(record)[:-1].split("\t")[-2:] == ["anno1b", "anno1"]
+    record = vcf_header.new_record(
+        contig="1",
+        start=12,
+        stop=13,
+        samples=[
+            {"AN": "anno2"},
+            {"AN": "anno2b"}])
+    assert str(record)[:-1].split("\t")[-2:] == ["anno2", "anno2b"]
+    
+
+def test_unicode_annotation_can_be_added(vcf_header):
+    vcf_header.formats.add("AN", 1, "String", "An annotation")
+    record = vcf_header.new_record(
+        contig="1",
+        start=12,
+        stop=13,
+        samples=[
+            {"AN": "anno1"},
+            {"AN": "Friedrich-Alexander-Universit\u00E4t_Erlangen-N\u00FCrnberg"}])
+    assert str(record)[:-1].split("\t")[-2:] == [
+        "anno1",
+        "Friedrich-Alexander-Universit\u00E4t_Erlangen-N\u00FCrnberg"]
diff --git a/tests/cbcf_data/example_vcf43.vcf b/tests/cbcf_data/example_vcf43.vcf
new file mode 100644 (file)
index 0000000..9010401
--- /dev/null
@@ -0,0 +1,24 @@
+##fileformat=VCFv4.3
+##fileDate=20090805
+##source=myImputationProgramV3.1
+##reference=file:///seq/references/1000GenomesPilot-NCBI36.fasta
+##contig=<ID=20,length=62435964,assembly=B36,md5=f126cdf8a6e0c7f379d618ff66beb2da,species="Homo sapiens",taxonomy=x>
+##phasing=partial
+##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of Samples With Data">
+##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">
+##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency">
+##INFO=<ID=AA,Number=1,Type=String,Description="Ancestral Allele">
+##INFO=<ID=DB,Number=0,Type=Flag,Description="dbSNP membership, build 129">
+##INFO=<ID=H2,Number=0,Type=Flag,Description="HapMap2 membership">
+##FILTER=<ID=q10,Description="Quality below 10">
+##FILTER=<ID=s50,Description="Less than 50% of samples have data">
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
+##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">
+##FORMAT=<ID=HQ,Number=2,Type=Integer,Description="Haplotype Quality">
+#CHROM POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001 NA00002 NA00003
+20     14370   rs6054257       G       A       29      PASS    NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ     0|0:48:1:51,51  1|0:48:8:51,51  1/1:43:5:.,.
+20     17330   .       T       A       3       q10     NS=3;DP=11;AF=0.017     GT:GQ:DP:HQ     0|0:49:3:58,50  0|1:3:5:65,3    0/0:41:3
+20     1110696 rs6040355       A       G,T     67      PASS    NS=2;DP=10;AF=0.333,0.667;AA=T;DB       GT:GQ:DP:HQ     1|2:21:6:23,27  2|1:2:0:18,2    2/2:35:4
+20     1230237 .       T       .       47      PASS    NS=3;DP=13;AA=T GT:GQ:DP:HQ     0|0:54:7:56,60  0|0:48:4:51,51  0/0:61:2
+20     1234567 microsat1       GTC     G,GTCT  50      PASS    NS=3;DP=9;AA=G  GT:GQ:DP        0/1:35:4        0/2:17:2        1/1:40:3
diff --git a/tests/cbcf_data/example_vcf43_with_utf8.vcf b/tests/cbcf_data/example_vcf43_with_utf8.vcf
new file mode 100644 (file)
index 0000000..4826fc4
--- /dev/null
@@ -0,0 +1,27 @@
+##fileformat=VCFv4.3
+##fileDate=20090805
+##source=myImputationProgramV3.1
+##reference=file:///seq/references/1000GenomesPilot-NCBI36.fasta
+##contig=<ID=20,length=62435964,assembly=B36,md5=f126cdf8a6e0c7f379d618ff66beb2da,species="Homo sapiens",taxonomy=x>
+##phasing=partial
+##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of Samples With Data">
+##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">
+##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency">
+##INFO=<ID=AA,Number=1,Type=String,Description="Ancestral Allele">
+##INFO=<ID=DB,Number=0,Type=Flag,Description="dbSNP membership, build 129">
+##INFO=<ID=H2,Number=0,Type=Flag,Description="HapMap2 membership">
+##INFO=<ID=CLNVI,Number=.,Type=String,Description="the variant's clinical sources reported as tag-value pairs of database and variant identifier">
+##FILTER=<ID=q10,Description="Quality below 10">
+##FILTER=<ID=s50,Description="Less than 50% of samples have data">
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
+##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">
+##FORMAT=<ID=HQ,Number=2,Type=Integer,Description="Haplotype Quality">
+##FORMAT=<ID=AN,Number=1,Type=String,Description="Some annotation">
+#CHROM POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA00001 NA00002 NA00003
+20     14370   rs6054257       G       A       29      PASS    NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ     0|0:48:1:51,51  1|0:48:8:51,51  1/1:43:5:.,.
+20     17330   .       T       A       3       q10     NS=3;DP=11;AF=0.017     GT:GQ:DP:HQ     0|0:49:3:58,50  0|1:3:5:65,3    0/0:41:3
+20     1110696 rs6040355       A       G,T     67      PASS    NS=2;DP=10;AF=0.333,0.667;AA=T;DB       GT:GQ:DP:HQ     1|2:21:6:23,27  2|1:2:0:18,2    2/2:35:4
+20     1230237 .       T       .       47      PASS    NS=3;DP=13;AA=T GT:GQ:DP:HQ     0|0:54:7:56,60  0|0:48:4:51,51  0/0:61:2
+20     1234567 microsat1       GTC     G,GTCT  50      PASS    NS=3;DP=9;AA=G  GT:GQ:DP        0/1:35:4        0/2:17:2        1/1:40:3
+20     2345678 rs123   G       A       29      PASS    NS=3;DP=14;AF=0.5;CLNVI=Institute_of_Human_Genetics,Friedrich-Alexander-Universität_Erlangen-Nürnberg;DB;H2   GT:GQ:DP:HQ:AN  0|0:48:1:51,51:alpha    1|0:48:8:51,51:ä       1/1:43:5:.,.:ü
diff --git a/tests/pysam_data/example_no_seq_in_header_null_bytes.bam b/tests/pysam_data/example_no_seq_in_header_null_bytes.bam
new file mode 100644 (file)
index 0000000..aaf4b84
Binary files /dev/null and b/tests/pysam_data/example_no_seq_in_header_null_bytes.bam differ
index 40d84c08f8a79db88cfbde19d211052dd1dba61d..b20a869013749ede42ed5ccb9d61adb2d31e8d1b 100644 (file)
@@ -45,7 +45,7 @@ def get_version(executable):
     if IS_PYTHON3:
         lines = lines.decode('ascii')
     try:
-        x = re.search("Version:\s+(\S+)", lines).groups()[0]
+        x = re.search(r"Version:\s+(\S+)", lines).groups()[0]
     except AttributeError:
         raise ValueError("could not get version from %s" % lines)
     return x
@@ -82,6 +82,7 @@ class SamtoolsTest(unittest.TestCase):
         # TODO: issues with file naming
         # "faidx ex1.fa; %(out)s_ex1.fa.fai",
         "index ex1.bam %(out)s_ex1.bam.fai",
+        "index -@2 ex1.bam %(out)s_ex1.bam.fai",
         "idxstats ex1.bam > %(out)s_ex1.idxstats",
         "fixmate ex1.bam %(out)s_ex1.fixmate.bam",
         "flagstat ex1.bam > %(out)s_ex1.flagstat",
@@ -189,7 +190,7 @@ class SamtoolsTest(unittest.TestCase):
         pysam_method = getattr(self.module, command)
 
         # run samtools
-        full_statement = re.sub("%\(out\)s", self.executable, statement)
+        full_statement = re.sub(r"%\(out\)s", self.executable, statement)
         run_command(" ".join((self.executable, full_statement)))
         # sys.stdout.write("%s %s ok" % (command, self.executable))
 
@@ -199,7 +200,7 @@ class SamtoolsTest(unittest.TestCase):
             parts = parts[:-2]
 
         # avoid interpolation to preserve string quoting, tab chars, etc.
-        pysam_parts = [re.sub("%\(out\)s", "pysam", x) for x in parts[1:]]
+        pysam_parts = [re.sub(r"%\(out\)s", "pysam", x) for x in parts[1:]]
         output = pysam_method(*pysam_parts,
                               raw=True,
                               catch_stdout=True)
@@ -272,7 +273,7 @@ class SamtoolsTest(unittest.TestCase):
             mapped_command = self.get_command(statement, map_to_internal=True)
             pysam_method = getattr(self.module, mapped_command)
             usage_msg = pysam_method.usage()
-            expected = "Usage:\s+{} {}".format(self.executable, command)
+            expected = r"Usage:\s+{} {}".format(self.executable, command)
             self.assertTrue(re.search(expected, usage_msg) is not None)
 
     def tearDown(self):
@@ -287,6 +288,18 @@ class EmptyIndexTest(unittest.TestCase):
         self.assertRaises(IOError, pysam.samtools.index,
                           "exdoesntexist.bam")
 
+    def testEmptyIndexWithExtraFlag(self):
+        self.assertRaises(IOError, pysam.samtools.index,
+                          "-c",
+                          "exdoesntexist.bam")
+
+    def testEmptyIndexWithExtraArg(self):
+        self.assertRaises(IOError, pysam.samtools.index,
+                          "-c",
+                          "-m",
+                          "14",
+                          "exdoesntexist.bam")
+
 
 if sys.platform != "darwin":
     # fails with segfault with htslib 1.5 on Osx, an issue with flockfile
index fdd39b02624275b10a889fd6b50ce12e02fe993e..c17f7ffe40b7c746144cc1dfe980632af3dc4e8e 100644 (file)
@@ -14,9 +14,8 @@ import unittest
 import subprocess
 import glob
 import re
-import copy
-import tempfile
-from TestUtils import check_url, load_and_convert, TABIX_DATADIR, get_temp_filename
+from TestUtils import checkBinaryEqual, checkGZBinaryEqual, check_url, \
+    load_and_convert, TABIX_DATADIR, get_temp_filename
 
 IS_PYTHON3 = sys.version_info[0] >= 3
 
@@ -40,30 +39,6 @@ def splitToBytes(s):
     return [x.encode("ascii") for x in s.split("\t")]
 
 
-def checkBinaryEqual(filename1, filename2):
-    '''return true if the two files are binary equal.'''
-    if os.path.getsize(filename1) != os.path.getsize(filename2):
-        return False
-
-    with open(filename1, "rb") as infile:
-        d1 = infile.read()
-
-    with open(filename2, "rb") as infile:
-        d2 = infile.read()
-
-    if len(d1) != len(d2):
-        return False
-
-    found = False
-    for c1, c2 in zip(d1, d2):
-        if c1 != c2:
-            break
-    else:
-        found = True
-
-    return found
-
-
 class TestIndexing(unittest.TestCase):
     filename = os.path.join(TABIX_DATADIR, "example.gtf.gz")
     filename_idx = os.path.join(TABIX_DATADIR, "example.gtf.gz.tbi")
@@ -77,7 +52,7 @@ class TestIndexing(unittest.TestCase):
         '''test indexing via preset.'''
 
         pysam.tabix_index(self.tmpfilename, preset="gff")
-        self.assertTrue(checkBinaryEqual(
+        self.assertTrue(checkGZBinaryEqual(
             self.tmpfilename + ".tbi", self.filename_idx))
 
     def test_indexing_to_custom_location_works(self):
@@ -86,7 +61,7 @@ class TestIndexing(unittest.TestCase):
         index_path = get_temp_filename(suffix='custom.tbi')
         pysam.tabix_index(self.tmpfilename, preset="gff",
                           index=index_path, force=True)
-        self.assertTrue(checkBinaryEqual(index_path, self.filename_idx))
+        self.assertTrue(checkGZBinaryEqual(index_path, self.filename_idx))
         os.unlink(index_path)
 
     def test_indexing_with_explict_columns_works(self):
@@ -98,7 +73,7 @@ class TestIndexing(unittest.TestCase):
                           end_col=4,
                           line_skip=0,
                           zerobased=False)
-        self.assertTrue(checkBinaryEqual(
+        self.assertTrue(checkGZBinaryEqual(
             self.tmpfilename + ".tbi", self.filename_idx))
 
     def test_indexing_with_lineskipping_works(self):
@@ -109,7 +84,7 @@ class TestIndexing(unittest.TestCase):
                           end_col=4,
                           line_skip=1,
                           zerobased=False)
-        self.assertFalse(checkBinaryEqual(
+        self.assertFalse(checkGZBinaryEqual(
             self.tmpfilename + ".tbi", self.filename_idx))
 
     def tearDown(self):
index 32f8e42dcd74d92c48fb1adf6e0030d39897adfd..7ad7db0d1a218ab49255b309d559f7fa14dbf03c 100644 (file)
@@ -128,7 +128,7 @@ class TestGTF(TestParser):
         """build attribute string from dictionary d"""
         s = "; ".join(["{} \"{}\"".format(x, y) for (x, y) in d.items()]) + ";"
         # remove quotes around numeric values
-        s = re.sub("\"(\d+)\"", r"\1", s)
+        s = re.sub(r'"(\d+)"', r'\1', s)
         return s
 
     def testRead(self):