tests/tabix_data
samtools/config.h
-bcftools/config.g
+bcftools/config.h
htslib/config.status
htslib/config.h
htslib/config.log
htslib/config.mk
+htslib/htslib.pc.tmp
+htslib/htslib-uninstalled.pc
pysam/config.py
# linking tests
os:
- - linx
+ - linux
- osx
language: c
env:
matrix:
- CONDA_PY=2.7
- - CONDA_PY=3.4
- - CONDA_PY=3.5
- CONDA_PY=3.6
+ - CONDA_PY=3.7
global:
- PYSAM_LINKING_TEST=1
+ - TWINE_USERNAME=grepall
+ - secure: 'OcwwP8/o21+SGW0UVAnnCQwllhGSCq2HJzpI9EhX3kh6J9RTkyx/+drkg45bx1Z5u8zymuAFappEYzlpzqZE886XezkjOYGVa/u+Coqr1oT/BEJHFCkCA4o26yESp7Zy8aNj/juhB7Rfa77pIDXBayqTzbALz/AURMtZapasB18='
+
+_deploy_common: &deploy_common
+ if: tag IS present
+ install:
+ - python3 -m pip install cibuildwheel twine
+
+matrix:
+ include:
+ - stage: deploy
+ os: linux
+ language: python
+ python: '3.5'
+ services:
+ - docker
+ env:
+ - CIBW_BEFORE_BUILD="yum install -y zlib-devel bzip2-devel xz-devel && pip install -r requirements.txt"
+ - CIBW_ENVIRONMENT='HTSLIB_CONFIGURE_OPTIONS="--disable-libcurl"'
+ addons:
+ apt:
+ packages:
+ - gcc
+ - g++
+ - libcurl4-openssl-dev # for libcurl support in sdist
+ - libssl-dev # for s3 support in sdist
+ <<: *deploy_common
+ script:
+ - set -e
+ - cibuildwheel --output-dir dist
+ - python3 -m pip install Cython
+ - python3 setup.py build_ext --inplace
+ - python3 setup.py sdist
+ - twine check dist/*
+ - twine upload --skip-existing dist/*
+ - stage: deploy
+ os: osx
+ language: generic
+ env:
+ - CIBW_BEFORE_BUILD="pip install -r requirements.txt"
+ - CIBW_ENVIRONMENT='HTSLIB_CONFIGURE_OPTIONS="--disable-libcurl"'
+ addons: {}
+ <<: *deploy_common
+ script:
+ - set -e
+ - cibuildwheel --output-dir dist
+ - twine check dist/*
+ - twine upload --skip-existing dist/*
addons:
apt:
Release notes
=============
+Release 0.15.3
+==============
+
+Bugfix release.
+
+* [#824] allow reading of UTF-8 encoded text in VCF/BCF files.
+* [#780] close all filehandles before opening new ones in pysam_dispatch
+* [#773] do not cache VariantRecord.id to avoid memory leak
+* [#781] the default of multiple_iterators is changed from True to
+  False for CRAM files.
+* [#825] fix collections.abc import
+* [#825] use bcf_hdr_format instead of bcf_hdr_fmt_text, fix memcpy
+ bug when setting FORMAT fields.
+* [#804] Use HTSlib's kstring_t, which reallocates and enlarges its
+ memory as needed, rather than a fixed-size char buffer.
+* [#814] Build wheels and upload them to PyPI
+* [#755] Allow passing flags and arguments to index methods
+* [#763] Strip \0 in header check
+* [#761] Test Tabix index contents, not the compression
+
Release 0.15.2
==============
compilation options. Especially for OS X this will potentially save a
lot of trouble.
+The current version of pysam wraps 3rd-party code from htslib-1.9, samtools-1.9, and bcftools-1.9.
+
Pysam is available through `pypi
<https://pypi.python.org/pypi/pysam>`_. To install, type::
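
   pip install pysam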
FILE * bcftools_stderr = NULL;
FILE * bcftools_stdout = NULL;
const char * bcftools_stdout_fn = NULL;
-int bcftools_stdout_fileno = STDOUT_FILENO;
FILE * bcftools_set_stderr(int fd)
return bcftools_stderr;
}
-void bcftools_unset_stderr(void)
+void bcftools_close_stderr(void)
{
- if (bcftools_stderr != NULL)
- fclose(bcftools_stderr);
- bcftools_stderr = fopen("/dev/null", "w");
+ fclose(bcftools_stderr);
+ bcftools_stderr = NULL;
}
FILE * bcftools_set_stdout(int fd)
{
fprintf(bcftools_stderr, "could not set stdout to fd %i", fd);
}
- bcftools_stdout_fileno = fd;
return bcftools_stdout;
}
bcftools_stdout_fn = fn;
}
-void bcftools_unset_stdout(void)
+void bcftools_close_stdout(void)
{
- if (bcftools_stdout != NULL)
- fclose(bcftools_stdout);
- bcftools_stdout = fopen("/dev/null", "w");
- bcftools_stdout_fileno = STDOUT_FILENO;
+ fclose(bcftools_stdout);
+ bcftools_stdout = NULL;
}
int bcftools_puts(const char *s)
-#ifndef PYSAM_H
-#define PYSAM_H
+#ifndef bcftools_PYSAM_H
+#define bcftools_PYSAM_H
-#include "stdio.h"
+#include <stdio.h>
extern FILE * bcftools_stderr;
/*! set pysam standard output to point to file descriptor
- Setting the stderr will close the previous stdout.
+ Setting the stdout will close the previous stdout.
*/
FILE * bcftools_set_stdout(int fd);
*/
void bcftools_set_stdout_fn(const char * fn);
-/*! set pysam standard error to /dev/null.
+/*! close pysam standard error and set to NULL
- Unsetting the stderr will close the previous stderr.
*/
-void bcftools_unset_stderr(void);
+void bcftools_close_stderr(void);
-/*! set pysam standard error to /dev/null.
+/*! close pysam standard output and set to NULL
- Unsetting the stderr will close the previous stderr.
*/
-void bcftools_unset_stdout(void);
+void bcftools_close_stdout(void);
int bcftools_puts(const char *s);
#
# It is best to run this in a fresh clone of the repository!
#
+# Before running, make sure to update image:
+#
+# docker pull quay.io/pypa/manylinux1_x86_64
+#
# Run this within the repository root:
# docker run --rm -v $(pwd):/io quay.io/pypa/manylinux1_x86_64 /io/buildwheels.sh
#
yum install -y zlib-devel bzip2-devel xz-devel
-# Python 2.6 is not supported
-rm -r /opt/python/cp26*
-
-# Python 3.3 builds fail with:
-# /opt/rh/devtoolset-2/root/usr/libexec/gcc/x86_64-CentOS-linux/4.8.2/ld: cannot find -lchtslib
-rm -r /opt/python/cp33*
-
# Without libcurl support, htslib can open files from HTTP and FTP URLs.
# With libcurl support, it also supports HTTPS and S3 URLs, but libcurl needs a
# current version of OpenSSL, and we do not want to be responsible for
outfile.write('#include "{}.pysam.h"\n\n'.format(basename))
subname, _ = os.path.splitext(os.path.basename(filename))
if subname in MAIN.get(basename, []):
- lines = re.sub("int main\(", "int {}_main(".format(
+ lines = re.sub(r"int main\(", "int {}_main(".format(
basename), lines)
else:
- lines = re.sub("int main\(", "int {}_{}_main(".format(
+ lines = re.sub(r"int main\(", "int {}_{}_main(".format(
basename, subname), lines)
lines = re.sub("stderr", "{}_stderr".format(basename), lines)
lines = re.sub("stdout", "{}_stdout".format(basename), lines)
- lines = re.sub(" printf\(", " fprintf({}_stdout, ".format(basename), lines)
- lines = re.sub("([^kf])puts\(", r"\1{}_puts(".format(basename), lines)
- lines = re.sub("putchar\(([^)]+)\)",
+ lines = re.sub(r" printf\(", " fprintf({}_stdout, ".format(basename), lines)
+ lines = re.sub(r"([^kf])puts\(", r"\1{}_puts(".format(basename), lines)
+ lines = re.sub(r"putchar\(([^)]+)\)",
r"fputc(\1, {}_stdout)".format(basename), lines)
fn = os.path.basename(filename)
# activate testenv environment
source ~/miniconda3/bin/activate testenv
-conda config --add channels r
conda config --add channels defaults
-conda config --add channels conda-forge
conda config --add channels bioconda
+conda config --add channels conda-forge
# pin versions, so that tests do not fail when pysam/htslib out of step
# add htslib dependencies
-conda install -y "samtools=1.7" "bcftools=1.6" "htslib=1.7" xz curl bzip2
+# NB: we force conda-forge::ncurses due to bioconda/bioconda-recipes#13488
+conda install -y "samtools=1.9" "bcftools=1.9" "htslib=1.9" xz curl bzip2 conda-forge::ncurses
# Need to make C compiler and linker use the anaconda includes and libraries:
export PREFIX=~/miniconda3/
export CFLAGS="-I${PREFIX}/include -L${PREFIX}/lib"
-export HTSLIB_CONFIGURE_OPTIONS="--disable-libcurl --disable-lzma"
+export HTSLIB_CONFIGURE_OPTIONS="--disable-libcurl"
+echo "show samtools, htslib, and bcftools versions"
samtools --version
-htslib --version
+htsfile --version
bcftools --version
# Try building conda recipe first
~/miniconda3/bin/conda-build devtools/conda-recipe/ --python=$CONDA_PY
# install code from the repository via setup.py
-echo "installing via setup.py from repository"
-python setup.py install
+echo
+echo "============ installing via setup.py from repository ============"
+echo
+python setup.py install || exit
# create auxiliary data
echo
.. autoclass:: pysam.AlignmentFile
:members:
+.. autoclass:: pysam.AlignmentHeader
+ :members:
+
An :class:`~pysam.AlignedSegment` represents an aligned segment within
a SAM/BAM file.
are 0-based, half-open intervals, i.e., the position 10,000 is
part of the interval, but 20,000 is not. Exceptions are
:term:`samtools` compatible region strings such as
- 'chr1:10000:20000', which are closed, i.e., both positions 10,000
+ 'chr1:10000-20000', which are closed, i.e., both positions 10,000
and 20,000 are part of the interval.
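
    For example (a minimal sketch, assuming an indexed file ``ex1.bam``
    with a contig named ``chr1``)::

        import pysam

        with pysam.AlignmentFile("ex1.bam", "rb") as samfile:
            # 0-based, half-open: position 10,000 is included, 20,000 is not
            reads = list(samfile.fetch("chr1", 10000, 20000))
            # samtools region string: closed, both endpoints included
            reads = list(samfile.fetch(region="chr1:10000-20000"))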
column
Release notes
=============
-Release 0.15.1
+Release 0.15.3
+==============
+
+Bugfix release.
+
+* [#824] allow reading of UTF-8 encoded text in VCF/BCF files.
+* [#780] close all filehandles before opening new ones in pysam_dispatch
+* [#773] do not cache VariantRecord.id to avoid memory leak
+* [#781] the default of multiple_iterators is changed from True to
+  False for CRAM files.
+* [#825] fix collections.abc import
+* [#825] use bcf_hdr_format instead of bcf_hdr_fmt_text, fix memcpy
+ bug when setting FORMAT fields.
+* [#804] Use HTSlib's kstring_t, which reallocates and enlarges its
+ memory as needed, rather than a fixed-size char buffer.
+* [#814] Build wheels and upload them to PyPI
+* [#755] Allow passing flags and arguments to index methods
+* [#763] Strip \0 in header check
+* [#761] Test Tabix index contents, not the compression
+
+Release 0.15.2
==============
Bugfix release.
print (str(row))
This will return a tuple-like data structure in which columns can
-be retrieved by numeric index:
+be retrieved by numeric index::
for row in tbx.fetch("chr1", 1000, 2000):
print ("chromosome is", row[0])
2. The cython implementation :file:`_pysam_flagstat.pyx`. This script
imports the pysam API via::
- from pysam.calignmentfile cimport AlignmentFile, AlignedSegment
+ from pysam.libcalignmentfile cimport AlignmentFile, AlignedSegment
This statement imports, amongst others, :class:`AlignedSegment`
into the namespace. Speed can be gained from declaring
FILE * @pysam@_stderr = NULL;
FILE * @pysam@_stdout = NULL;
const char * @pysam@_stdout_fn = NULL;
-int @pysam@_stdout_fileno = STDOUT_FILENO;
FILE * @pysam@_set_stderr(int fd)
return @pysam@_stderr;
}
-void @pysam@_unset_stderr(void)
+void @pysam@_close_stderr(void)
{
- if (@pysam@_stderr != NULL)
- fclose(@pysam@_stderr);
- @pysam@_stderr = fopen("/dev/null", "w");
+ fclose(@pysam@_stderr);
+ @pysam@_stderr = NULL;
}
FILE * @pysam@_set_stdout(int fd)
{
fprintf(@pysam@_stderr, "could not set stdout to fd %i", fd);
}
- @pysam@_stdout_fileno = fd;
return @pysam@_stdout;
}
@pysam@_stdout_fn = fn;
}
-void @pysam@_unset_stdout(void)
+void @pysam@_close_stdout(void)
{
- if (@pysam@_stdout != NULL)
- fclose(@pysam@_stdout);
- @pysam@_stdout = fopen("/dev/null", "w");
- @pysam@_stdout_fileno = STDOUT_FILENO;
+ fclose(@pysam@_stdout);
+ @pysam@_stdout = NULL;
}
int @pysam@_puts(const char *s)
-#ifndef PYSAM_H
-#define PYSAM_H
+#ifndef @pysam@_PYSAM_H
+#define @pysam@_PYSAM_H
-#include "stdio.h"
+#include <stdio.h>
extern FILE * @pysam@_stderr;
/*! set pysam standard output to point to file descriptor
- Setting the stderr will close the previous stdout.
+ Setting the stdout will close the previous stdout.
*/
FILE * @pysam@_set_stdout(int fd);
*/
void @pysam@_set_stdout_fn(const char * fn);
-/*! set pysam standard error to /dev/null.
+/*! close pysam standard error and set to NULL
- Unsetting the stderr will close the previous stderr.
*/
-void @pysam@_unset_stderr(void);
+void @pysam@_close_stderr(void);
-/*! set pysam standard error to /dev/null.
+/*! close pysam standard output and set to NULL
- Unsetting the stderr will close the previous stderr.
*/
-void @pysam@_unset_stdout(void);
+void @pysam@_close_stdout(void);
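+
+/* Typical call sequence (as used by pysam_dispatch): @pysam@_set_stdout(fd),
+   run the tool's main(), then @pysam@_close_stdout(). */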
int @pysam@_puts(const char *s);
KHASH_DECLARE(s2i, kh_cstr_t, int64_t)
typedef khash_t(s2i) s2i_t;
-//////////////////////////////////////////////////////////////////
-/*! set pysam standard error to point to file descriptor
-
- Setting the stderr will close the previous stderr.
- */
-// FILE * pysam_set_stderr(int fd);
-
-//////////////////////////////////////////////////////////////////
-/*! set pysam standard error to /dev/null.
-
- Unsetting the stderr will close the previous stderr.
- */
-// void pysam_unset_stderr();
-
//////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////
cdef int n_pu
cdef AlignmentHeader header
cdef uint32_t min_base_quality
- cdef uint8_t * buf
+ cdef kstring_t buf
cdef char * reference_sequence
cdef class PileupRead:
from libc.stdint cimport INT8_MIN, INT16_MIN, INT32_MIN, \
INT8_MAX, INT16_MAX, INT32_MAX, \
UINT8_MAX, UINT16_MAX, UINT32_MAX
-from libc.stdio cimport snprintf
from pysam.libchtslib cimport HTS_IDX_NOCOOR
from pysam.libcutils cimport force_bytes, force_str, \
cdef char* CODE2CIGAR= "MIDNSHP=XB"
cdef int NCIGAR_CODES = 10
-# dimensioned for 8000 pileup limit (+ insertions/deletions)
-cdef uint32_t MAX_PILEUP_BUFFER_SIZE = 10000
-
if IS_PYTHON3:
CIGAR2CODE = dict([y, x] for x, y in enumerate(CODE2CIGAR))
maketrans = str.maketrans
dest.n_pu = n_pu
dest.min_base_quality = min_base_quality
dest.reference_sequence = reference_sequence
- dest.buf = <uint8_t *>calloc(MAX_PILEUP_BUFFER_SIZE, sizeof(uint8_t))
- if dest.buf == NULL:
- raise MemoryError("could not allocate pileup buffer")
+ dest.buf.l = dest.buf.m = 0
+ dest.buf.s = NULL
return dest
return s
def get_forward_qualities(self):
- """return the original read sequence.
+ """return base qualities of the read sequence.
- Reads mapping to the reverse strand will be reverse
- complemented.
+ Reads mapping to the reverse strand will be reversed.
"""
if self.is_reverse:
return self.query_qualities[::-1]
"\n".join(map(str, self.pileups))
def __dealloc__(self):
- if self.buf is not NULL:
- free(self.buf)
+ free(self.buf.s)
def set_min_base_quality(self, min_base_quality):
"""set the minimum base quality for this pileup column.
cdef uint32_t x = 0
cdef uint32_t j = 0
cdef uint32_t c = 0
- cdef uint32_t n = 0
cdef uint8_t cc = 0
cdef uint8_t rb = 0
- cdef uint8_t * buf = self.buf
+ cdef kstring_t * buf = &self.buf
cdef bam_pileup1_t * p = NULL
if self.plp == NULL or self.plp[0] == NULL:
raise ValueError("PileupColumn accessed after iterator finished")
+ buf.l = 0
+
# todo: reference sequence to count matches/mismatches
# todo: convert assertions to exceptions
for x from 0 <= x < self.n_pu:
continue
# see samtools pileup_seq
if mark_ends and p.is_head:
- buf[n] = '^'
- n += 1
- assert n < MAX_PILEUP_BUFFER_SIZE
+ kputc('^', buf)
if p.b.core.qual > 93:
- buf[n] = 126
+ kputc(126, buf)
else:
- buf[n] = p.b.core.qual + 33
- n += 1
- assert n < MAX_PILEUP_BUFFER_SIZE
+ kputc(p.b.core.qual + 33, buf)
if not p.is_del:
if p.qpos < p.b.core.l_qseq:
cc = <uint8_t>seq_nt16_str[bam_seqi(bam_get_seq(p.b), p.qpos)]
rb = self.reference_sequence[self.reference_pos]
if seq_nt16_table[cc] == seq_nt16_table[rb]:
cc = "="
- buf[n] = strand_mark_char(cc, p.b)
- n += 1
- assert n < MAX_PILEUP_BUFFER_SIZE
+ kputc(strand_mark_char(cc, p.b), buf)
elif add_indels:
if p.is_refskip:
if bam_is_rev(p.b):
- buf[n] = '<'
+ kputc('<', buf)
else:
- buf[n] = '>'
+ kputc('>', buf)
else:
- buf[n] = '*'
- n += 1
- assert n < MAX_PILEUP_BUFFER_SIZE
+ kputc('*', buf)
if add_indels:
if p.indel > 0:
- buf[n] = '+'
- n += 1
- assert n < MAX_PILEUP_BUFFER_SIZE
- n += snprintf(<char *>&(buf[n]),
- MAX_PILEUP_BUFFER_SIZE - n,
- "%i",
- p.indel)
- assert n < MAX_PILEUP_BUFFER_SIZE
+ kputc('+', buf)
+ kputw(p.indel, buf)
for j from 1 <= j <= p.indel:
cc = seq_nt16_str[bam_seqi(bam_get_seq(p.b), p.qpos + j)]
- buf[n] = strand_mark_char(cc, p.b)
- n += 1
- assert n < MAX_PILEUP_BUFFER_SIZE
+ kputc(strand_mark_char(cc, p.b), buf)
elif p.indel < 0:
- buf[n] = '-'
- n += 1
- assert n < MAX_PILEUP_BUFFER_SIZE
- n += snprintf(<char *>&(buf[n]),
- MAX_PILEUP_BUFFER_SIZE - n,
- "%i",
- -p.indel)
- assert n < MAX_PILEUP_BUFFER_SIZE
+ kputc('-', buf)
+ kputw(-p.indel, buf)
for j from 1 <= j <= -p.indel:
# TODO: out-of-range check here?
if self.reference_sequence == NULL:
cc = 'N'
else:
cc = self.reference_sequence[self.reference_pos + j]
- buf[n] = strand_mark_char(cc, p.b)
- n += 1
- assert n < MAX_PILEUP_BUFFER_SIZE
+ kputc(strand_mark_char(cc, p.b), buf)
if mark_ends and p.is_tail:
- buf[n] = '$'
- n += 1
- assert n < MAX_PILEUP_BUFFER_SIZE
+ kputc('$', buf)
- buf[n] = ':'
- n += 1
- assert n < MAX_PILEUP_BUFFER_SIZE
+ kputc(':', buf)
- if n == 0:
+ if buf.l == 0:
# could be zero if all qualities are too low
return ""
else:
# quicker to assemble all and split than to encode all separately.
# ignore last ":"
- return force_str(PyBytes_FromStringAndSize(<char*>buf, n-1)).split(":")
+ return force_str(PyBytes_FromStringAndSize(buf.s, buf.l-1)).split(":")
def get_query_qualities(self):
"""query base quality scores at pileup column position.
########################################################
import os
import collections
+try:
+ from collections.abc import Sequence, Mapping # noqa
+except ImportError:
+ from collections import Sequence, Mapping # noqa
import re
import warnings
import array
cdef int MAX_POS = (1 << 31) - 1
# valid types for SAM headers
-VALID_HEADER_TYPES = {"HD" : collections.Mapping,
- "SQ" : collections.Sequence,
- "RG" : collections.Sequence,
- "PG" : collections.Sequence,
- "CO" : collections.Sequence}
+VALID_HEADER_TYPES = {"HD" : Mapping,
+ "SQ" : Sequence,
+ "RG" : Sequence,
+ "PG" : Sequence,
+ "CO" : Sequence}
# order of records within SAM headers
VALID_HEADERS = ("HD", "SQ", "RG", "PG", "CO")
raise ValueError(
"invalid type for record %s: %s, expected %s".format(
record, type(data), VALID_HEADER_TYPES[record]))
- if isinstance(data, collections.Mapping):
+ if isinstance(data, Mapping):
lines.append(build_header_line(data, record))
else:
for fields in header_dict[record]:
for record, data in sorted(header_dict.items()):
if record in VALID_HEADERS:
continue
- if isinstance(data, collections.Mapping):
+ if isinstance(data, Mapping):
lines.append(build_header_line(data, record))
else:
for fields in header_dict[record]:
return makeAlignmentHeader(bam_hdr_dup(self.ptr))
property nreferences:
- """"int with the number of :term:`reference` sequences in the file.
+ """int with the number of :term:`reference` sequences in the file.
This is a read-only attribute."""
def __get__(self):
# convert to python string
t = self.__str__()
for line in t.split("\n"):
- if not line.strip():
+ line = line.strip(' \0')
+ if not line:
continue
assert line.startswith("@"), \
"header line without '@': '%s'" % line
# interpret type of known header record tags, default to str
x[key] = KNOWN_HEADER_FIELDS[record].get(key, str)(value)
- if VALID_HEADER_TYPES[record] == collections.Mapping:
+ if VALID_HEADER_TYPES[record] == Mapping:
if record in result:
raise ValueError(
"multiple '%s' lines are not permitted" % record)
result[record] = x
- elif VALID_HEADER_TYPES[record] == collections.Sequence:
+ elif VALID_HEADER_TYPES[record] == Sequence:
if record not in result: result[record] = []
result[record].append(x)
self.header = template.header.copy()
elif isinstance(header, AlignmentHeader):
self.header = header.copy()
- elif isinstance(header, collections.Mapping):
+ elif isinstance(header, Mapping):
self.header = AlignmentHeader.from_dict(header)
elif reference_names and reference_lengths:
self.header = AlignmentHeader.from_references(
Or it can be a generator filtering such reads. Example
samfile.find_introns((read for read in samfile.fetch(...) if read.is_reverse))
"""
- import collections
res = collections.Counter()
for r in read_iterator:
if 'N' in r.cigarstring:
AlignedSegment r
int BAM_CREF_SKIP = 3 #BAM_CREF_SKIP
- import collections
res = collections.Counter()
match_or_deletion = {0, 2, 7, 8} # only M/=/X (0/7/8) and D (2) are related to genome position
return self.header.get_reference_length(reference)
property nreferences:
- """"int with the number of :term:`reference` sequences in the file.
+ """int with the number of :term:`reference` sequences in the file.
This is a read-only attribute."""
def __get__(self):
if self.header:
int start = 0,
int stop = MAX_POS,
int truncate = False,
+ int multiple_iterators = True,
**kwargs ):
- # initialize iterator
- self._setup_iterator(tid, start, stop, 1)
+ # initialize iterator. Multiple iterators not available
+ # for CRAM.
+ if multiple_iterators and samfile.is_cram:
+ warnings.warn("multiple_iterators not implemented for CRAM")
+ multiple_iterators = False
+
+ self._setup_iterator(tid, start, stop, multiple_iterators)
self.start = start
self.stop = stop
self.truncate = truncate
from cpython.dict cimport PyDict_GetItemString, PyDict_SetItemString
from cpython.tuple cimport PyTuple_New, PyTuple_SET_ITEM
from cpython.bytes cimport PyBytes_FromStringAndSize
-from cpython.unicode cimport PyUnicode_DecodeASCII
+from cpython.unicode cimport PyUnicode_DecodeUTF8
from cpython.version cimport PY_MAJOR_VERSION
from pysam.libchtslib cimport HTSFile, hisremote
if PY_MAJOR_VERSION < 3:
val = s
else:
- val = PyUnicode_DecodeASCII(s, strlen(s), NULL)
+ val = PyUnicode_DecodeUTF8(s, strlen(s), NULL)
PyDict_SetItemString(bcf_str_cache, s, val)
else:
# Otherwise, copy the entire block
b = datac[:n]
- value = tuple(v.decode('ascii') if v and v != bcf_str_missing else None for v in b.split(b','))
+ value = tuple(v.decode('utf-8') if v and v != bcf_str_missing else None for v in b.split(b','))
else:
value = []
if type == BCF_BT_INT8:
return value
-cdef bcf_copy_expand_array(void *src_data, int src_type, ssize_t src_values,
- void *dst_data, int dst_type, ssize_t dst_values,
+cdef bcf_copy_expand_array(void *src_data, int src_type, size_t src_values,
+ void *dst_data, int dst_type, size_t dst_values,
int vlen):
+    """copy data from src to dst, where both the element size (src_type/dst_type)
+    and the number of elements (src_values/dst_values) may differ.
+ """
+
cdef char *src_datac
cdef char *dst_datac
cdef int8_t *src_datai8
if src_type == dst_type == BCF_BT_CHAR:
src_datac = <char *>src_data
dst_datac = <char *>dst_data
- memcpy(src_datac, dst_datac, src_values)
+ memcpy(dst_datac, src_datac, src_values)
for i in range(src_values, dst_values):
dst_datac[i] = 0
elif src_type == BCF_BT_INT8 and dst_type == BCF_BT_INT32:
cdef int given = len(values)
if value_count[0] != -1 and value_count[0] != given:
if scalar[0]:
- raise TypeError('value expected to be scalar, given len={}'.format(value_count[0], given))
+ raise TypeError('value expected to be scalar, given len={}'.format(given))
else:
raise TypeError('values expected to be {}-tuple, given len={}'.format(value_count[0], given))
if key == 'phased':
sample.phased = bool(value)
return
-
+
cdef bcf_hdr_t *hdr = sample.record.header.ptr
cdef bcf1_t *r = sample.record.ptr
cdef int fmt_id
cdef vdict_t *d
cdef khiter_t k
cdef int fmt_type, scalar, realloc, dst_type, vlen = 0
- cdef ssize_t i, n, value_count, alloc_size, alloc_len, dst_size
+ cdef ssize_t i, nsamples, value_count, alloc_size, alloc_len, dst_size
if bcf_unpack(r, BCF_UN_ALL) < 0:
raise ValueError('Error unpacking VariantRecord')
fmt.type if fmt else -1,
fmt.n if fmt else -1,
&value_count, &scalar, &realloc)
-
vlen = value_count < 0
value_count = len(values)
-
- # If we can, write updated values to existing allocated storage
+
+ # If we can, write updated values to existing allocated storage.
if fmt and not realloc:
r.d.indiv_dirty = 1
bcf_object_to_array(values, fmt.p + sample.index * fmt.size, fmt.type, fmt.n, vlen)
if fmt and fmt.n > alloc_len:
alloc_len = fmt.n
- n = r.n_sample
- new_values = bcf_empty_array(fmt_type, n*alloc_len, vlen)
- cdef char *valp = <char *>new_values
+ nsamples = r.n_sample
+ new_values = bcf_empty_array(fmt_type, nsamples * alloc_len, vlen)
+ cdef char *new_values_p = <char *>new_values
if fmt_type == BCF_HT_INT:
dst_type = BCF_BT_INT32
else:
raise ValueError('Unsupported FORMAT type')
- if fmt and n > 1:
- for i in range(n):
- bcf_copy_expand_array(fmt.p + i*fmt.size, fmt.type, fmt.n,
- valp + i*dst_size, dst_type, alloc_len,
+ if fmt and nsamples > 1:
+ for i in range(nsamples):
+ bcf_copy_expand_array(fmt.p + i * fmt.size, fmt.type, fmt.n,
+ new_values_p + i * dst_size, dst_type, alloc_len,
vlen)
- bcf_object_to_array(values, valp + sample.index*dst_size, dst_type, alloc_len, vlen)
+ bcf_object_to_array(values, new_values_p + sample.index * dst_size, dst_type, alloc_len, vlen)
- if bcf_update_format(hdr, r, bkey, valp, <int>(n*alloc_len), fmt_type) < 0:
+ if bcf_update_format(hdr, r, bkey, new_values_p, <int>(nsamples * alloc_len), fmt_type) < 0:
raise ValueError('Unable to update format values')
cdef bcf_hdr_t *hdr = record.header.ptr
cdef bcf_info_t *info
cdef int end_id = bcf_header_get_info_id(record.header.ptr, b'END')
- cdef int ref_len
+ cdef int ref_len
# allow missing ref when instantiating a new record
if record.ref is not None:
def __str__(self):
cdef int hlen
- cdef char *hstr = bcf_hdr_fmt_text(self.ptr, 0, &hlen)
+ cdef kstring_t line
+ line.l = line.m = 0
+ line.s = NULL
- try:
- return charptr_to_str_w_len(hstr, hlen)
- finally:
- free(hstr)
+ if bcf_hdr_format(self.ptr, 0, &line) < 0:
+ if line.m:
+ free(line.s)
+ raise ValueError('bcf_hdr_format failed')
+
+ ret = charptr_to_str_w_len(line.s, line.l)
+
+ if line.m:
+ free(line.s)
+ return ret
def new_record(self, contig=None, start=0, stop=0, alleles=None,
id=None, qual=None, filter=None, info=None, samples=None,
cdef bcf1_t *r = self.ptr
if bcf_unpack(r, BCF_UN_STR) < 0:
raise ValueError('Error unpacking VariantRecord')
- return bcf_str_cache_get_charptr(r.d.id) if r.d.id != b'.' else None
+ # causes a memory leak https://github.com/pysam-developers/pysam/issues/773
+ # return bcf_str_cache_get_charptr(r.d.id) if r.d.id != b'.' else None
+ return charptr_to_str(r.d.id) if r.d.id != b'.' else None
@id.setter
def id(self, value):
@alleles.setter
def alleles(self, values):
cdef bcf1_t *r = self.ptr
-
+
# Cache rlen of symbolic alleles before call to bcf_update_alleles_str
cdef int rlen = r.rlen
int bcftools_main(int argc, char *argv[])
void bcftools_set_stderr(int fd)
- void bcftools_unset_stderr()
+ void bcftools_close_stderr()
void bcftools_set_stdout(int fd)
void bcftools_set_stdout_fn(const char *)
- void bcftools_unset_stdout()
+ void bcftools_close_stdout()
void bcftools_set_optind(int)
return False
property closed:
- """"bool indicating the current state of the file object.
+ """bool indicating the current state of the file object.
This is a read-only attribute; the close() method changes the value.
"""
def __get__(self):
return self._references
property nreferences:
- """"int with the number of :term:`reference` sequences in the file.
+ """int with the number of :term:`reference` sequences in the file.
This is a read-only attribute."""
def __get__(self):
return len(self._references) if self.references else None
return False
property closed:
- """"bool indicating the current state of the file object.
+ """bool indicating the current state of the file object.
This is a read-only attribute; the close() method changes the value.
"""
def __get__(self):
size_t l, m
char *s
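+    # kputc appends a single character to s, kputw/kputl a decimal
+    # integer; all grow s as needed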
+ int kputc(int c, kstring_t *s)
+ int kputw(int c, kstring_t *s)
+ int kputl(long c, kstring_t *s)
+ int ksprintf(kstring_t *s, const char *fmt, ...)
+
cdef extern from "htslib_util.h" nogil:
int hts_set_verbosity(int verbosity)
# Read VCF header from a file and update the header
int bcf_hdr_set(bcf_hdr_t *hdr, const char *fname)
+ # Appends formatted header text to _str_.
+ # If _is_bcf_ is zero, `IDX` fields are discarded.
+ # @return 0 if successful, or negative if an error occurred
+ # @since 1.4
+    int bcf_hdr_format(const bcf_hdr_t *hdr, int is_bcf, kstring_t *str)
+
# Returns formatted header (newly allocated string) and its length,
# excluding the terminating \0. If is_bcf parameter is unset, IDX
# fields are discarded.
int samtools_main(int argc, char *argv[])
void samtools_set_stderr(int fd)
- void samtools_unset_stderr()
+ void samtools_close_stderr()
void samtools_set_stdout(int fd)
void samtools_set_stdout_fn(const char *)
- void samtools_unset_stdout()
+ void samtools_close_stdout()
void samtools_set_optind(int)
from posix.fcntl cimport open as c_open, O_WRONLY
from libcsamtools cimport samtools_main, samtools_set_stdout, samtools_set_stderr, \
- samtools_unset_stderr, samtools_unset_stdout, samtools_set_stdout_fn, samtools_set_optind
+ samtools_close_stdout, samtools_close_stderr, samtools_set_stdout_fn, samtools_set_optind
from libcbcftools cimport bcftools_main, bcftools_set_stdout, bcftools_set_stderr, \
- bcftools_unset_stderr, bcftools_unset_stdout, bcftools_set_stdout_fn, bcftools_set_optind
+ bcftools_close_stdout, bcftools_close_stderr, bcftools_set_stdout_fn, bcftools_set_optind
#####################################################################
# hard-coded constants
cdef from_string_and_size(const char* s, size_t length):
if IS_PYTHON3:
- return s[:length].decode("ascii")
+ return s[:length].decode("utf8")
else:
return s[:length]
# filename encoding (adapted from lxml.etree.pyx)
cdef str FILENAME_ENCODING = sys.getfilesystemencoding() or sys.getdefaultencoding() or 'ascii'
-
+cdef str TEXT_ENCODING = 'utf-8'
cdef bytes encode_filename(object filename):
"""Make sure a filename is 8-bit encoded (or None)."""
raise TypeError("Argument must be string or unicode.")
-cdef bytes force_bytes(object s, encoding="ascii"):
+cdef bytes force_bytes(object s, encoding=TEXT_ENCODING):
"""convert string or unicode object to bytes, assuming
- ascii encoding.
+ utf8 encoding.
"""
if s is None:
return None
raise TypeError("Argument must be string, bytes or unicode.")
-cdef charptr_to_str(const char* s, encoding="ascii"):
+cdef charptr_to_str(const char* s, encoding=TEXT_ENCODING):
if s == NULL:
return None
if PY_MAJOR_VERSION < 3:
return s.decode(encoding)
-cdef charptr_to_str_w_len(const char* s, size_t n, encoding="ascii"):
+cdef charptr_to_str_w_len(const char* s, size_t n, encoding=TEXT_ENCODING):
if s == NULL:
return None
if PY_MAJOR_VERSION < 3:
return s[:n].decode(encoding)
-cdef bytes charptr_to_bytes(const char* s, encoding="ascii"):
+cdef bytes charptr_to_bytes(const char* s, encoding=TEXT_ENCODING):
if s == NULL:
return None
else:
return s
-cdef force_str(object s, encoding="ascii"):
+cdef force_str(object s, encoding=TEXT_ENCODING):
"""Return s converted to str type of current Python
(bytes in Py2, unicode in Py3)"""
if s is None:
False.
'''
- if method == "index":
- if args and not os.path.exists(args[0]):
- raise IOError("No such file or directory: '%s'" % args[0])
+ if method == "index" and args:
+ # We make sure that at least 1 input file exists,
+ # and if it doesn't we raise an IOError.
+ SIMPLE_FLAGS = ['-c', '--csi', '-f', '--force', '-t', '--tbi', '-n', '--nstats', '-s', '--stats']
+ ARGUMENTS = ['-m', '--min-shift', '-o', '--output-file', '--threads', '-@']
+ skip_next = False
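+        # e.g. for ("-c", "-m", "14", "ex1.bam"): skip "-c", skip "-m" and
+        # its value, then require that "ex1.bam" exists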
+ for arg in args:
+ if skip_next:
+ skip_next = False
+ continue
+ if arg in SIMPLE_FLAGS or (len(arg) > 2 and arg.startswith('-@')):
+ continue
+ if arg in ARGUMENTS:
+ skip_next = True
+ continue
+ if not os.path.exists(arg):
+ raise IOError("No such file or directory: '%s'" % arg)
+ else:
+ break
if args is None:
args = []
# redirect stderr to file
stderr_h, stderr_f = tempfile.mkstemp()
- samtools_set_stderr(stderr_h)
- bcftools_set_stderr(stderr_h)
# redirect stdout to file
if save_stdout:
raise IOError("error while opening {} for writing".format(stdout_f))
samtools_set_stdout_fn(force_bytes(stdout_f))
- samtools_set_stdout(stdout_h)
bcftools_set_stdout_fn(force_bytes(stdout_f))
- bcftools_set_stdout(stdout_h)
elif catch_stdout:
stdout_h, stdout_f = tempfile.mkstemp()
samtools_set_stdout_fn(force_bytes(stdout_f))
bcftools_set_stdout_fn(force_bytes(stdout_f))
args.extend(stdout_option.format(stdout_f).split(" "))
- else:
- samtools_set_stdout(stdout_h)
- bcftools_set_stdout(stdout_h)
+ stdout_h = c_open(b"/dev/null", O_WRONLY)
else:
samtools_set_stdout_fn("-")
bcftools_set_stdout_fn("-")
+ stdout_h = c_open(b"/dev/null", O_WRONLY)
# setup the function call to samtools/bcftools main
cdef char ** cargs
# call samtools/bcftools
if collection == b"samtools":
+ samtools_set_stdout(stdout_h)
+ samtools_set_stderr(stderr_h)
retval = samtools_main(n + 2, cargs)
+ samtools_close_stdout()
+ samtools_close_stderr()
elif collection == b"bcftools":
+ bcftools_set_stdout(stdout_h)
+ bcftools_set_stderr(stderr_h)
retval = bcftools_main(n + 2, cargs)
+ bcftools_close_stdout()
+ bcftools_close_stderr()
for i from 0 <= i < n:
free(cargs[i + 2])
os.remove(fn)
return out
- samtools_unset_stderr()
- bcftools_unset_stderr()
-
- if save_stdout or catch_stdout:
- samtools_unset_stdout()
- bcftools_unset_stdout()
-
out_stderr = _collect(stderr_f)
if save_stdout:
out_stdout = None
# pysam versioning information
-__version__ = "0.15.2"
+__version__ = "0.15.3"
# TODO: upgrade number
__samtools_version__ = "1.9"
FILE * samtools_stderr = NULL;
FILE * samtools_stdout = NULL;
const char * samtools_stdout_fn = NULL;
-int samtools_stdout_fileno = STDOUT_FILENO;
FILE * samtools_set_stderr(int fd)
return samtools_stderr;
}
-void samtools_unset_stderr(void)
+void samtools_close_stderr(void)
{
- if (samtools_stderr != NULL)
- fclose(samtools_stderr);
- samtools_stderr = fopen("/dev/null", "w");
+ fclose(samtools_stderr);
+ samtools_stderr = NULL;
}
FILE * samtools_set_stdout(int fd)
{
fprintf(samtools_stderr, "could not set stdout to fd %i", fd);
}
- samtools_stdout_fileno = fd;
return samtools_stdout;
}
samtools_stdout_fn = fn;
}
-void samtools_unset_stdout(void)
+void samtools_close_stdout(void)
{
- if (samtools_stdout != NULL)
- fclose(samtools_stdout);
- samtools_stdout = fopen("/dev/null", "w");
- samtools_stdout_fileno = STDOUT_FILENO;
+ fclose(samtools_stdout);
+ samtools_stdout = NULL;
}
int samtools_puts(const char *s)
-#ifndef PYSAM_H
-#define PYSAM_H
+#ifndef samtools_PYSAM_H
+#define samtools_PYSAM_H
-#include "stdio.h"
+#include <stdio.h>
extern FILE * samtools_stderr;
/*! set pysam standard output to point to file descriptor
- Setting the stderr will close the previous stdout.
+ Setting the stdout will close the previous stdout.
*/
FILE * samtools_set_stdout(int fd);
*/
void samtools_set_stdout_fn(const char * fn);
-/*! set pysam standard error to /dev/null.
+/*! close pysam standard error and set to NULL
- Unsetting the stderr will close the previous stderr.
*/
-void samtools_unset_stderr(void);
+void samtools_close_stderr(void);
-/*! set pysam standard error to /dev/null.
+/*! close pysam standard output and set to NULL
- Unsetting the stderr will close the previous stderr.
*/
-void samtools_unset_stdout(void);
+void samtools_close_stdout(void);
int samtools_puts(const char *s);
using cython and a high-level API for convenient access to the data
within standard genomic file formats.
-The current version wraps htslib-1.7, samtools-1.7 and bcftools-1.6.
-
See:
http://www.htslib.org
https://github.com/pysam-developers/pysam
def run_configure(option):
+ sys.stdout.flush()
try:
retcode = subprocess.call(
" ".join(("./configure", option)),
return make_print_config
+@contextmanager
+def set_compiler_envvars():
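+    """Temporarily export CC, CFLAGS and LDFLAGS from Python's sysconfig
+    into os.environ (unless already set) so that htslib's ./configure sees
+    the same toolchain; exported variables are removed again on exit."""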
+ tmp_vars = []
+ for var in ['CC', 'CFLAGS', 'LDFLAGS']:
+ if var in os.environ:
+ print ("# pysam: (env) {}={}".format(var, os.environ[var]))
+ elif var in sysconfig.get_config_vars():
+ value = sysconfig.get_config_var(var)
+ print ("# pysam: (sysconfig) {}={}".format(var, value))
+ os.environ[var] = value
+ tmp_vars += [var]
+
+ try:
+ yield
+ finally:
+ for var in tmp_vars:
+ del os.environ[var]
+
+
def configure_library(library_dir, env_options=None, options=[]):
configure_script = os.path.join(library_dir, "configure")
raise ValueError(
"configure script {} does not exist".format(configure_script))
- with changedir(library_dir):
+ with changedir(library_dir), set_compiler_envvars():
if env_options is not None:
if run_configure(env_options):
return env_options
sys.path.insert(0, "pysam")
import version
return version.__version__
-
+
# How to link against HTSLIB
# shared: build shared chtslib from builtin htslib code.
Operating System :: Unix
Operating System :: MacOS
"""
-
+
metadata = {
'name': "pysam",
'version': get_pysam_version(),
import pysam
import pysam.samtools
-from TestUtils import checkBinaryEqual, check_url, \
+from TestUtils import checkBinaryEqual, checkGZBinaryEqual, check_url, \
check_samtools_view_equal, checkFieldEqual, force_str, \
get_temp_filename, BAM_DATADIR
self.checkEcho("ex2.bam",
"ex2.bam",
"tmp_ex2.bam",
- "rb", "wb")
+ "rb", "wb",
+ checkf=checkGZBinaryEqual)
def testCRAM2CRAM(self):
# in some systems different reference sequence paths might be
self.checkEcho("ex2.sam",
"ex2.bam",
"tmp_ex2.bam",
- "r", "wb")
+ "r", "wb",
+ checkf=checkGZBinaryEqual)
def testBAM2SAM(self):
self.checkEcho("ex2.bam",
self.checkEcho("ex1.bam",
"ex1.bam",
"tmp_ex1.bam",
- "rb", "wb")
+ "rb", "wb",
+ checkf=checkGZBinaryEqual)
f = open(os.path.join(BAM_DATADIR, "ex1.bam"))
samfile = pysam.AlignmentFile(f, "rb")
outfile.close()
self.assertTrue(
- checkBinaryEqual(tmpfilename, self.bamfile),
+ checkGZBinaryEqual(tmpfilename, self.bamfile),
"mismatch when construction BAM file, see %s %s" %
(tmpfilename, self.bamfile))
self.assertTrue("SQ" in s.header.to_dict())
self.assertTrue("@SQ" in str(s.header))
+ def test_bam_without_seq_with_null_bytes_in_header(self):
+ s = pysam.AlignmentFile(os.path.join(BAM_DATADIR, "example_no_seq_in_header_null_bytes.bam"))
+ self.assertTrue("SQ" in s.header.to_dict())
+ self.assertTrue("@SQ" in str(s.header))
+
class TestMismatchingHeader(unittest.TestCase):
'''see issue 716.'''
return found
+def checkGZBinaryEqual(filename1, filename2):
+ '''return true if the decompressed contents of the two files
+ are binary equal.
+ '''
+ with gzip.open(filename1, "rb") as infile1:
+ d1 = infile1.read()
+ with gzip.open(filename2, "rb") as infile2:
+ d2 = infile2.read()
+ if d1 == d2:
+ return True
+ return False
+
+
def check_samtools_view_equal(
filename1, filename2,
without_header=False):
+# -*- coding: utf-8 -*-
+
import os
+import glob
import sys
import unittest
import pysam
inf.subset_samples(["NA00001"])
+class TestVCFVersions(unittest.TestCase):
+
+ def setUp(self):
+ self.files_to_test = (glob.glob(os.path.join(CBCF_DATADIR, "example_v*.vcf.gz")) +
+ glob.glob(os.path.join(CBCF_DATADIR, "example_v*.vcf")) +
+ glob.glob(os.path.join(CBCF_DATADIR, "example_v*.bcf")))
+
+ def test_all_records_can_be_fetched(self):
+
+ for fn in self.files_to_test:
+ with pysam.VariantFile(fn) as inf:
+ records = list(inf.fetch())
+
+
+class TestUnicode(unittest.TestCase):
+ filename = "example_vcf43_with_utf8.vcf.gz"
+
+ def test_record_with_unicode(self):
+ with pysam.VariantFile(os.path.join(CBCF_DATADIR,
+ self.filename)) as inf:
+ records = list(inf.fetch("20", 2345677, 2345678))
+ self.assertEqual(len(records), 1)
+ record = records.pop()
+ self.assertEqual(
+ record.info["CLNVI"],
+ ('Institute_of_Human_Genetics', u'Friedrich-Alexander-Universität_Erlangen-Nürnberg'))
+ self.assertEqual(record.samples[0]["AN"], "alpha")
+ self.assertEqual(record.samples[1]["AN"], u"ä")
+ self.assertEqual(record.samples[2]["AN"], u"ü")
+
+
if __name__ == "__main__":
# build data files
print("building data files")
--- /dev/null
+import os
+import glob
+import sys
+import unittest
+import pysam
+import shutil
+import gzip
+import subprocess
+import pytest
+
+try:
+ from pathlib import Path
+except ImportError:
+ Path = None
+
+from TestUtils import get_temp_filename, check_lines_equal, load_and_convert, CBCF_DATADIR, get_temp_context
+
+
+@pytest.fixture
+def vcf_header():
+ vcf_header = pysam.VariantHeader()
+ vcf_header.add_sample("sample1")
+ vcf_header.add_sample("sample2")
+ vcf_header.contigs.add("1")
+ return vcf_header
+
+## NB: new_record segfaults if no coordinates are supplied
+
+def test_ascii_annotation_can_be_added(vcf_header):
+ vcf_header.formats.add("AN", 1, "String", "An annotation")
+ record = vcf_header.new_record(
+ contig="1",
+ start=12,
+ stop=13,
+ samples=[
+ {"AN": "anno1"},
+ {"AN": "anno2"}])
+ assert str(record)[:-1].split("\t")[-2:] == ["anno1", "anno2"]
+
+
+def test_ascii_annotation_with_variable_length_can_be_added(vcf_header):
+ vcf_header.formats.add("AN", 1, "String", "An annotation")
+ record = vcf_header.new_record(
+ contig="1",
+ start=12,
+ stop=13,
+ samples=[
+ {"AN": "anno1b"},
+ {"AN": "anno1"}])
+ assert str(record)[:-1].split("\t")[-2:] == ["anno1b", "anno1"]
+ record = vcf_header.new_record(
+ contig="1",
+ start=12,
+ stop=13,
+ samples=[
+ {"AN": "anno2"},
+ {"AN": "anno2b"}])
+ assert str(record)[:-1].split("\t")[-2:] == ["anno2", "anno2b"]
+
+
+def test_unicode_annotation_can_be_added(vcf_header):
+ vcf_header.formats.add("AN", 1, "String", "An annotation")
+ record = vcf_header.new_record(
+ contig="1",
+ start=12,
+ stop=13,
+ samples=[
+ {"AN": "anno1"},
+ {"AN": "Friedrich-Alexander-Universit\u00E4t_Erlangen-N\u00FCrnberg"}])
+ assert str(record)[:-1].split("\t")[-2:] == [
+ "anno1",
+ "Friedrich-Alexander-Universit\u00E4t_Erlangen-N\u00FCrnberg"]
--- /dev/null
+##fileformat=VCFv4.3
+##fileDate=20090805
+##source=myImputationProgramV3.1
+##reference=file:///seq/references/1000GenomesPilot-NCBI36.fasta
+##contig=<ID=20,length=62435964,assembly=B36,md5=f126cdf8a6e0c7f379d618ff66beb2da,species="Homo sapiens",taxonomy=x>
+##phasing=partial
+##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of Samples With Data">
+##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">
+##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency">
+##INFO=<ID=AA,Number=1,Type=String,Description="Ancestral Allele">
+##INFO=<ID=DB,Number=0,Type=Flag,Description="dbSNP membership, build 129">
+##INFO=<ID=H2,Number=0,Type=Flag,Description="HapMap2 membership">
+##FILTER=<ID=q10,Description="Quality below 10">
+##FILTER=<ID=s50,Description="Less than 50% of samples have data">
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
+##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">
+##FORMAT=<ID=HQ,Number=2,Type=Integer,Description="Haplotype Quality">
+#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003
+20 14370 rs6054257 G A 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 1/1:43:5:.,.
+20 17330 . T A 3 q10 NS=3;DP=11;AF=0.017 GT:GQ:DP:HQ 0|0:49:3:58,50 0|1:3:5:65,3 0/0:41:3
+20 1110696 rs6040355 A G,T 67 PASS NS=2;DP=10;AF=0.333,0.667;AA=T;DB GT:GQ:DP:HQ 1|2:21:6:23,27 2|1:2:0:18,2 2/2:35:4
+20 1230237 . T . 47 PASS NS=3;DP=13;AA=T GT:GQ:DP:HQ 0|0:54:7:56,60 0|0:48:4:51,51 0/0:61:2
+20 1234567 microsat1 GTC G,GTCT 50 PASS NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/2:17:2 1/1:40:3
--- /dev/null
+##fileformat=VCFv4.3
+##fileDate=20090805
+##source=myImputationProgramV3.1
+##reference=file:///seq/references/1000GenomesPilot-NCBI36.fasta
+##contig=<ID=20,length=62435964,assembly=B36,md5=f126cdf8a6e0c7f379d618ff66beb2da,species="Homo sapiens",taxonomy=x>
+##phasing=partial
+##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of Samples With Data">
+##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">
+##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency">
+##INFO=<ID=AA,Number=1,Type=String,Description="Ancestral Allele">
+##INFO=<ID=DB,Number=0,Type=Flag,Description="dbSNP membership, build 129">
+##INFO=<ID=H2,Number=0,Type=Flag,Description="HapMap2 membership">
+##INFO=<ID=CLNVI,Number=.,Type=String,Description="the variant's clinical sources reported as tag-value pairs of database and variant identifier">
+##FILTER=<ID=q10,Description="Quality below 10">
+##FILTER=<ID=s50,Description="Less than 50% of samples have data">
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
+##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">
+##FORMAT=<ID=HQ,Number=2,Type=Integer,Description="Haplotype Quality">
+##FORMAT=<ID=AN,Number=1,Type=String,Description="Some annotation">
+#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003
+20 14370 rs6054257 G A 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 1/1:43:5:.,.
+20 17330 . T A 3 q10 NS=3;DP=11;AF=0.017 GT:GQ:DP:HQ 0|0:49:3:58,50 0|1:3:5:65,3 0/0:41:3
+20 1110696 rs6040355 A G,T 67 PASS NS=2;DP=10;AF=0.333,0.667;AA=T;DB GT:GQ:DP:HQ 1|2:21:6:23,27 2|1:2:0:18,2 2/2:35:4
+20 1230237 . T . 47 PASS NS=3;DP=13;AA=T GT:GQ:DP:HQ 0|0:54:7:56,60 0|0:48:4:51,51 0/0:61:2
+20 1234567 microsat1 GTC G,GTCT 50 PASS NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/2:17:2 1/1:40:3
+20 2345678 rs123 G A 29 PASS NS=3;DP=14;AF=0.5;CLNVI=Institute_of_Human_Genetics,Friedrich-Alexander-Universität_Erlangen-Nürnberg;DB;H2 GT:GQ:DP:HQ:AN 0|0:48:1:51,51:alpha 1|0:48:8:51,51:ä 1/1:43:5:.,.:ü
if IS_PYTHON3:
lines = lines.decode('ascii')
try:
- x = re.search("Version:\s+(\S+)", lines).groups()[0]
+ x = re.search(r"Version:\s+(\S+)", lines).groups()[0]
except AttributeError:
raise ValueError("could not get version from %s" % lines)
return x
# TODO: issues with file naming
# "faidx ex1.fa; %(out)s_ex1.fa.fai",
"index ex1.bam %(out)s_ex1.bam.fai",
+ "index -@2 ex1.bam %(out)s_ex1.bam.fai",
"idxstats ex1.bam > %(out)s_ex1.idxstats",
"fixmate ex1.bam %(out)s_ex1.fixmate.bam",
"flagstat ex1.bam > %(out)s_ex1.flagstat",
pysam_method = getattr(self.module, command)
# run samtools
- full_statement = re.sub("%\(out\)s", self.executable, statement)
+ full_statement = re.sub(r"%\(out\)s", self.executable, statement)
run_command(" ".join((self.executable, full_statement)))
# sys.stdout.write("%s %s ok" % (command, self.executable))
parts = parts[:-2]
# avoid interpolation to preserve string quoting, tab chars, etc.
- pysam_parts = [re.sub("%\(out\)s", "pysam", x) for x in parts[1:]]
+ pysam_parts = [re.sub(r"%\(out\)s", "pysam", x) for x in parts[1:]]
output = pysam_method(*pysam_parts,
raw=True,
catch_stdout=True)
mapped_command = self.get_command(statement, map_to_internal=True)
pysam_method = getattr(self.module, mapped_command)
usage_msg = pysam_method.usage()
- expected = "Usage:\s+{} {}".format(self.executable, command)
+ expected = r"Usage:\s+{} {}".format(self.executable, command)
self.assertTrue(re.search(expected, usage_msg) is not None)
def tearDown(self):
self.assertRaises(IOError, pysam.samtools.index,
"exdoesntexist.bam")
+ def testEmptyIndexWithExtraFlag(self):
+ self.assertRaises(IOError, pysam.samtools.index,
+ "-c",
+ "exdoesntexist.bam")
+
+ def testEmptyIndexWithExtraArg(self):
+ self.assertRaises(IOError, pysam.samtools.index,
+ "-c",
+ "-m",
+ "14",
+ "exdoesntexist.bam")
+
if sys.platform != "darwin":
# fails with segfault with htslib 1.5 on Osx, an issue with flockfile
import subprocess
import glob
import re
-import copy
-import tempfile
-from TestUtils import check_url, load_and_convert, TABIX_DATADIR, get_temp_filename
+from TestUtils import checkBinaryEqual, checkGZBinaryEqual, check_url, \
+ load_and_convert, TABIX_DATADIR, get_temp_filename
IS_PYTHON3 = sys.version_info[0] >= 3
return [x.encode("ascii") for x in s.split("\t")]
-def checkBinaryEqual(filename1, filename2):
- '''return true if the two files are binary equal.'''
- if os.path.getsize(filename1) != os.path.getsize(filename2):
- return False
-
- with open(filename1, "rb") as infile:
- d1 = infile.read()
-
- with open(filename2, "rb") as infile:
- d2 = infile.read()
-
- if len(d1) != len(d2):
- return False
-
- found = False
- for c1, c2 in zip(d1, d2):
- if c1 != c2:
- break
- else:
- found = True
-
- return found
-
-
class TestIndexing(unittest.TestCase):
filename = os.path.join(TABIX_DATADIR, "example.gtf.gz")
filename_idx = os.path.join(TABIX_DATADIR, "example.gtf.gz.tbi")
'''test indexing via preset.'''
pysam.tabix_index(self.tmpfilename, preset="gff")
- self.assertTrue(checkBinaryEqual(
+ self.assertTrue(checkGZBinaryEqual(
self.tmpfilename + ".tbi", self.filename_idx))
def test_indexing_to_custom_location_works(self):
index_path = get_temp_filename(suffix='custom.tbi')
pysam.tabix_index(self.tmpfilename, preset="gff",
index=index_path, force=True)
- self.assertTrue(checkBinaryEqual(index_path, self.filename_idx))
+ self.assertTrue(checkGZBinaryEqual(index_path, self.filename_idx))
os.unlink(index_path)
def test_indexing_with_explict_columns_works(self):
end_col=4,
line_skip=0,
zerobased=False)
- self.assertTrue(checkBinaryEqual(
+ self.assertTrue(checkGZBinaryEqual(
self.tmpfilename + ".tbi", self.filename_idx))
def test_indexing_with_lineskipping_works(self):
end_col=4,
line_skip=1,
zerobased=False)
- self.assertFalse(checkBinaryEqual(
+ self.assertFalse(checkGZBinaryEqual(
self.tmpfilename + ".tbi", self.filename_idx))
def tearDown(self):
"""build attribute string from dictionary d"""
s = "; ".join(["{} \"{}\"".format(x, y) for (x, y) in d.items()]) + ";"
# remove quotes around numeric values
- s = re.sub("\"(\d+)\"", r"\1", s)
+ s = re.sub(r'"(\d+)"', r'\1', s)
return s
def testRead(self):