tests/tabix_data
samtools/config.h
-bcftools/config.g
+bcftools/config.h
htslib/config.status
htslib/config.h
htslib/config.log
htslib/config.mk
+htslib/htslib.pc.tmp
+htslib/htslib-uninstalled.pc
pysam/config.py
# linking tests
os:
- - linx
+ - linux
- osx
language: c
env:
matrix:
- CONDA_PY=2.7
- - CONDA_PY=3.4
- - CONDA_PY=3.5
- CONDA_PY=3.6
+ - CONDA_PY=3.7
global:
- PYSAM_LINKING_TEST=1
+ - TWINE_USERNAME=grepall
+ - secure: 'OcwwP8/o21+SGW0UVAnnCQwllhGSCq2HJzpI9EhX3kh6J9RTkyx/+drkg45bx1Z5u8zymuAFappEYzlpzqZE886XezkjOYGVa/u+Coqr1oT/BEJHFCkCA4o26yESp7Zy8aNj/juhB7Rfa77pIDXBayqTzbALz/AURMtZapasB18='
+
+_deploy_common: &deploy_common
+ if: tag IS present
+ install:
+ - python3 -m pip install cibuildwheel twine
+
+matrix:
+ include:
+ - stage: deploy
+ os: linux
+ language: python
+ python: '3.5'
+ services:
+ - docker
+ env:
+ - CIBW_BEFORE_BUILD="yum install -y zlib-devel bzip2-devel xz-devel && pip install -r requirements.txt"
+ - CIBW_ENVIRONMENT='HTSLIB_CONFIGURE_OPTIONS="--disable-libcurl"'
+ addons:
+ apt:
+ packages:
+ - gcc
+ - g++
+ - libcurl4-openssl-dev # for libcurl support in sdist
+ - libssl-dev # for s3 support in sdist
+ <<: *deploy_common
+ script:
+ - set -e
+ - cibuildwheel --output-dir dist
+ - python3 -m pip install Cython
+ - python3 setup.py build_ext --inplace
+ - python3 setup.py sdist
+ - twine check dist/*
+ - twine upload --skip-existing dist/*
+ - stage: deploy
+ os: osx
+ language: generic
+ env:
+ - CIBW_BEFORE_BUILD="pip install -r requirements.txt"
+ - CIBW_ENVIRONMENT='HTSLIB_CONFIGURE_OPTIONS="--disable-libcurl"'
+ addons: {}
+ <<: *deploy_common
+ script:
+ - set -e
+ - cibuildwheel --output-dir dist
+ - twine check dist/*
+ - twine upload --skip-existing dist/*
addons:
apt:
Release notes
=============
+Release 0.15.3
+==============
+
+Bugfix release.
+
+* [#824] allow reading of UTF-8 encoded text in VCF/BCF files.
+* [#780] close all filehandles before opening new ones in pysam_dispatch
+* [#773] do not cache VariantRecord.id to avoid memory leak
+* [#781] the default of multiple_iterators is changed from True to
+  False for CRAM files.
+* [#825] fix collections.abc import
+* [#825] use bcf_hdr_format instead of bcf_hdr_fmt_text, fix memcpy
+ bug when setting FORMAT fields.
+* [#804] Use HTSlib's kstring_t, which reallocates and enlarges its
+ memory as needed, rather than a fixed-size char buffer.
+* [#814] Build wheels and upload them to PyPI
+* [#755] Allow passing flags and arguments to index methods
+* [#763] Strip \0 in header check
+* [#761] Test Tabix index contents, not the compression
+
Release 0.15.2
==============
compilation options. Especially for OS X this will potentially save a
lot of trouble.
+The current version of pysam wraps 3rd-party code from htslib-1.9, samtools-1.9, and bcftools-1.9.
+
Pysam is available through `pypi
<https://pypi.python.org/pypi/pysam>`_. To install, type::
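
   pip install pysam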
FILE * bcftools_stderr = NULL;
FILE * bcftools_stdout = NULL;
const char * bcftools_stdout_fn = NULL;
-int bcftools_stdout_fileno = STDOUT_FILENO;
FILE * bcftools_set_stderr(int fd)
return bcftools_stderr;
}
-void bcftools_unset_stderr(void)
+void bcftools_close_stderr(void)
{
- if (bcftools_stderr != NULL)
- fclose(bcftools_stderr);
- bcftools_stderr = fopen("/dev/null", "w");
+ fclose(bcftools_stderr);
+ bcftools_stderr = NULL;
}
FILE * bcftools_set_stdout(int fd)
{
fprintf(bcftools_stderr, "could not set stdout to fd %i", fd);
}
- bcftools_stdout_fileno = fd;
return bcftools_stdout;
}
bcftools_stdout_fn = fn;
}
-void bcftools_unset_stdout(void)
+void bcftools_close_stdout(void)
{
- if (bcftools_stdout != NULL)
- fclose(bcftools_stdout);
- bcftools_stdout = fopen("/dev/null", "w");
- bcftools_stdout_fileno = STDOUT_FILENO;
+ fclose(bcftools_stdout);
+ bcftools_stdout = NULL;
}
int bcftools_puts(const char *s)
-#ifndef PYSAM_H
-#define PYSAM_H
+#ifndef bcftools_PYSAM_H
+#define bcftools_PYSAM_H
-#include "stdio.h"
+#include <stdio.h>
extern FILE * bcftools_stderr;
/*! set pysam standard output to point to file descriptor
- Setting the stderr will close the previous stdout.
+ Setting the stdout will close the previous stdout.
*/
FILE * bcftools_set_stdout(int fd);
*/
void bcftools_set_stdout_fn(const char * fn);
-/*! set pysam standard error to /dev/null.
+/*! close pysam standard error and set to NULL
- Unsetting the stderr will close the previous stderr.
*/
-void bcftools_unset_stderr(void);
+void bcftools_close_stderr(void);
-/*! set pysam standard error to /dev/null.
+/*! close pysam standard output and set to NULL
- Unsetting the stderr will close the previous stderr.
*/
-void bcftools_unset_stdout(void);
+void bcftools_close_stdout(void);
int bcftools_puts(const char *s);
#
# It is best to run this in a fresh clone of the repository!
#
+# Before running, make sure to update image:
+#
+# docker pull quay.io/pypa/manylinux1_x86_64
+#
# Run this within the repository root:
# docker run --rm -v $(pwd):/io quay.io/pypa/manylinux1_x86_64 /io/buildwheels.sh
#
yum install -y zlib-devel bzip2-devel xz-devel
-# Python 2.6 is not supported
-rm -r /opt/python/cp26*
-
-# Python 3.3 builds fail with:
-# /opt/rh/devtoolset-2/root/usr/libexec/gcc/x86_64-CentOS-linux/4.8.2/ld: cannot find -lchtslib
-rm -r /opt/python/cp33*
-
# Without libcurl support, htslib can open files from HTTP and FTP URLs.
# With libcurl support, it also supports HTTPS and S3 URLs, but libcurl needs a
# current version of OpenSSL, and we do not want to be responsible for
outfile.write('#include "{}.pysam.h"\n\n'.format(basename))
subname, _ = os.path.splitext(os.path.basename(filename))
if subname in MAIN.get(basename, []):
- lines = re.sub("int main\(", "int {}_main(".format(
+ lines = re.sub(r"int main\(", "int {}_main(".format(
basename), lines)
else:
- lines = re.sub("int main\(", "int {}_{}_main(".format(
+ lines = re.sub(r"int main\(", "int {}_{}_main(".format(
basename, subname), lines)
lines = re.sub("stderr", "{}_stderr".format(basename), lines)
lines = re.sub("stdout", "{}_stdout".format(basename), lines)
- lines = re.sub(" printf\(", " fprintf({}_stdout, ".format(basename), lines)
- lines = re.sub("([^kf])puts\(", r"\1{}_puts(".format(basename), lines)
- lines = re.sub("putchar\(([^)]+)\)",
+ lines = re.sub(r" printf\(", " fprintf({}_stdout, ".format(basename), lines)
+ lines = re.sub(r"([^kf])puts\(", r"\1{}_puts(".format(basename), lines)
+ lines = re.sub(r"putchar\(([^)]+)\)",
r"fputc(\1, {}_stdout)".format(basename), lines)
fn = os.path.basename(filename)
# activate testenv environment
source ~/miniconda3/bin/activate testenv
-conda config --add channels r
conda config --add channels defaults
-conda config --add channels conda-forge
conda config --add channels bioconda
+conda config --add channels conda-forge
# pin versions, so that tests do not fail when pysam/htslib out of step
# add htslib dependencies
-conda install -y "samtools=1.7" "bcftools=1.6" "htslib=1.7" xz curl bzip2
+# NB: we force conda-forge::ncurses due to bioconda/bioconda-recipes#13488
+conda install -y "samtools=1.9" "bcftools=1.9" "htslib=1.9" xz curl bzip2 conda-forge::ncurses
# Need to make C compiler and linker use the anaconda includes and libraries:
export PREFIX=~/miniconda3/
export CFLAGS="-I${PREFIX}/include -L${PREFIX}/lib"
-export HTSLIB_CONFIGURE_OPTIONS="--disable-libcurl --disable-lzma"
+export HTSLIB_CONFIGURE_OPTIONS="--disable-libcurl"
+echo "show samtools, htslib, and bcftools versions"
samtools --version
-htslib --version
+htsfile --version
bcftools --version
# Try building conda recipe first
~/miniconda3/bin/conda-build devtools/conda-recipe/ --python=$CONDA_PY
# install code from the repository via setup.py
-echo "installing via setup.py from repository"
-python setup.py install
+echo
+echo "============ installing via setup.py from repository ============"
+echo
+python setup.py install || exit
# create auxiliary data
echo
.. autoclass:: pysam.AlignmentFile
:members:
+.. autoclass:: pysam.AlignmentHeader
+ :members:
+
An :class:`~pysam.AlignedSegment` represents an aligned segment within
a SAM/BAM file.
are 0-based, half-open intervals, i.e., the position 10,000 is
part of the interval, but 20,000 is not. Exceptions are
:term:`samtools` compatible region strings such as
- 'chr1:10000:20000', which are closed, i.e., both positions 10,000
+ 'chr1:10000-20000', which are closed, i.e., both positions 10,000
and 20,000 are part of the interval.
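
    For example (a minimal sketch, assuming an indexed file ``ex1.bam``
    with a contig named ``chr1``)::

        import pysam

        with pysam.AlignmentFile("ex1.bam", "rb") as samfile:
            # 0-based, half-open: position 10,000 is included, 20,000 is not
            reads = list(samfile.fetch("chr1", 10000, 20000))
            # samtools region string: closed, both endpoints included
            reads = list(samfile.fetch(region="chr1:10000-20000"))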
column
Release notes
=============
-Release 0.15.1
+Release 0.15.3
+==============
+
+Bugfix release.
+
+* [#824] allow reading of UTF-8 encoded text in VCF/BCF files.
+* [#780] close all filehandles before opening new ones in pysam_dispatch
+* [#773] do not cache VariantRecord.id to avoid memory leak
+* [#781] the default of multiple_iterators is changed from True to
+  False for CRAM files.
+* [#825] fix collections.abc import
+* [#825] use bcf_hdr_format instead of bcf_hdr_fmt_text, fix memcpy
+ bug when setting FORMAT fields.
+* [#804] Use HTSlib's kstring_t, which reallocates and enlarges its
+ memory as needed, rather than a fixed-size char buffer.
+* [#814] Build wheels and upload them to PyPI
+* [#755] Allow passing flags and arguments to index methods
+* [#763] Strip \0 in header check
+* [#761] Test Tabix index contents, not the compression
+
+Release 0.15.2
==============
Bugfix release.
print (str(row))
This will return a tuple-like data structure in which columns can
-be retrieved by numeric index:
+be retrieved by numeric index::
for row in tbx.fetch("chr1", 1000, 2000):
print ("chromosome is", row[0])
2. The cython implementation :file:`_pysam_flagstat.pyx`. This script
imports the pysam API via::
- from pysam.calignmentfile cimport AlignmentFile, AlignedSegment
+ from pysam.libcalignmentfile cimport AlignmentFile, AlignedSegment
This statement imports, amongst others, :class:`AlignedSegment`
into the namespace. Speed can be gained from declaring
FILE * @pysam@_stderr = NULL;
FILE * @pysam@_stdout = NULL;
const char * @pysam@_stdout_fn = NULL;
-int @pysam@_stdout_fileno = STDOUT_FILENO;
FILE * @pysam@_set_stderr(int fd)
return @pysam@_stderr;
}
-void @pysam@_unset_stderr(void)
+void @pysam@_close_stderr(void)
{
- if (@pysam@_stderr != NULL)
- fclose(@pysam@_stderr);
- @pysam@_stderr = fopen("/dev/null", "w");
+ fclose(@pysam@_stderr);
+ @pysam@_stderr = NULL;
}
FILE * @pysam@_set_stdout(int fd)
{
fprintf(@pysam@_stderr, "could not set stdout to fd %i", fd);
}
- @pysam@_stdout_fileno = fd;
return @pysam@_stdout;
}
@pysam@_stdout_fn = fn;
}
-void @pysam@_unset_stdout(void)
+void @pysam@_close_stdout(void)
{
- if (@pysam@_stdout != NULL)
- fclose(@pysam@_stdout);
- @pysam@_stdout = fopen("/dev/null", "w");
- @pysam@_stdout_fileno = STDOUT_FILENO;
+ fclose(@pysam@_stdout);
+ @pysam@_stdout = NULL;
}
int @pysam@_puts(const char *s)
-#ifndef PYSAM_H
-#define PYSAM_H
+#ifndef @pysam@_PYSAM_H
+#define @pysam@_PYSAM_H
-#include "stdio.h"
+#include <stdio.h>
extern FILE * @pysam@_stderr;
/*! set pysam standard output to point to file descriptor
- Setting the stderr will close the previous stdout.
+ Setting the stdout will close the previous stdout.
*/
FILE * @pysam@_set_stdout(int fd);
*/
void @pysam@_set_stdout_fn(const char * fn);
-/*! set pysam standard error to /dev/null.
+/*! close pysam standard error and set to NULL
- Unsetting the stderr will close the previous stderr.
*/
-void @pysam@_unset_stderr(void);
+void @pysam@_close_stderr(void);
-/*! set pysam standard error to /dev/null.
+/*! close pysam standard output and set to NULL
- Unsetting the stderr will close the previous stderr.
*/
-void @pysam@_unset_stdout(void);
+void @pysam@_close_stdout(void);
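+
+/* Typical call sequence (as used by pysam_dispatch): @pysam@_set_stdout(fd),
+   run the tool's main(), then @pysam@_close_stdout(). */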
int @pysam@_puts(const char *s);
KHASH_DECLARE(s2i, kh_cstr_t, int64_t)
typedef khash_t(s2i) s2i_t;
-//////////////////////////////////////////////////////////////////
-/*! set pysam standard error to point to file descriptor
-
- Setting the stderr will close the previous stderr.
- */
-// FILE * pysam_set_stderr(int fd);
-
-//////////////////////////////////////////////////////////////////
-/*! set pysam standard error to /dev/null.
-
- Unsetting the stderr will close the previous stderr.
- */
-// void pysam_unset_stderr();
-
//////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////
cdef int n_pu
cdef AlignmentHeader header
cdef uint32_t min_base_quality
- cdef uint8_t * buf
+ cdef kstring_t buf
cdef char * reference_sequence
cdef class PileupRead:
from libc.stdint cimport INT8_MIN, INT16_MIN, INT32_MIN, \
INT8_MAX, INT16_MAX, INT32_MAX, \
UINT8_MAX, UINT16_MAX, UINT32_MAX
-from libc.stdio cimport snprintf
from pysam.libchtslib cimport HTS_IDX_NOCOOR
from pysam.libcutils cimport force_bytes, force_str, \
cdef char* CODE2CIGAR= "MIDNSHP=XB"
cdef int NCIGAR_CODES = 10
-# dimensioned for 8000 pileup limit (+ insertions/deletions)
-cdef uint32_t MAX_PILEUP_BUFFER_SIZE = 10000
-
if IS_PYTHON3:
CIGAR2CODE = dict([y, x] for x, y in enumerate(CODE2CIGAR))
maketrans = str.maketrans
dest.n_pu = n_pu
dest.min_base_quality = min_base_quality
dest.reference_sequence = reference_sequence
- dest.buf = <uint8_t *>calloc(MAX_PILEUP_BUFFER_SIZE, sizeof(uint8_t))
- if dest.buf == NULL:
- raise MemoryError("could not allocate pileup buffer")
+ dest.buf.l = dest.buf.m = 0
+ dest.buf.s = NULL
return dest
return s
def get_forward_qualities(self):
- """return the original read sequence.
+ """return base qualities of the read sequence.
- Reads mapping to the reverse strand will be reverse
- complemented.
+ Reads mapping to the reverse strand will be reversed.
"""
if self.is_reverse:
return self.query_qualities[::-1]
"\n".join(map(str, self.pileups))
def __dealloc__(self):
- if self.buf is not NULL:
- free(self.buf)
+ free(self.buf.s)
def set_min_base_quality(self, min_base_quality):
"""set the minimum base quality for this pileup column.
cdef uint32_t x = 0
cdef uint32_t j = 0
cdef uint32_t c = 0
- cdef uint32_t n = 0
cdef uint8_t cc = 0
cdef uint8_t rb = 0
- cdef uint8_t * buf = self.buf
+ cdef kstring_t * buf = &self.buf
cdef bam_pileup1_t * p = NULL
if self.plp == NULL or self.plp[0] == NULL:
raise ValueError("PileupColumn accessed after iterator finished")
+ buf.l = 0
+
# todo: reference sequence to count matches/mismatches
# todo: convert assertions to exceptions
for x from 0 <= x < self.n_pu:
continue
# see samtools pileup_seq
if mark_ends and p.is_head:
- buf[n] = '^'
- n += 1
- assert n < MAX_PILEUP_BUFFER_SIZE
+ kputc('^', buf)
if p.b.core.qual > 93:
- buf[n] = 126
+ kputc(126, buf)
else:
- buf[n] = p.b.core.qual + 33
- n += 1
- assert n < MAX_PILEUP_BUFFER_SIZE
+ kputc(p.b.core.qual + 33, buf)
if not p.is_del:
if p.qpos < p.b.core.l_qseq:
cc = <uint8_t>seq_nt16_str[bam_seqi(bam_get_seq(p.b), p.qpos)]
rb = self.reference_sequence[self.reference_pos]
if seq_nt16_table[cc] == seq_nt16_table[rb]:
cc = "="
- buf[n] = strand_mark_char(cc, p.b)
- n += 1
- assert n < MAX_PILEUP_BUFFER_SIZE
+ kputc(strand_mark_char(cc, p.b), buf)
elif add_indels:
if p.is_refskip:
if bam_is_rev(p.b):
- buf[n] = '<'
+ kputc('<', buf)
else:
- buf[n] = '>'
+ kputc('>', buf)
else:
- buf[n] = '*'
- n += 1
- assert n < MAX_PILEUP_BUFFER_SIZE
+ kputc('*', buf)
if add_indels:
if p.indel > 0:
- buf[n] = '+'
- n += 1
- assert n < MAX_PILEUP_BUFFER_SIZE
- n += snprintf(<char *>&(buf[n]),
- MAX_PILEUP_BUFFER_SIZE - n,
- "%i",
- p.indel)
- assert n < MAX_PILEUP_BUFFER_SIZE
+ kputc('+', buf)
+ kputw(p.indel, buf)
for j from 1 <= j <= p.indel:
cc = seq_nt16_str[bam_seqi(bam_get_seq(p.b), p.qpos + j)]
- buf[n] = strand_mark_char(cc, p.b)
- n += 1
- assert n < MAX_PILEUP_BUFFER_SIZE
+ kputc(strand_mark_char(cc, p.b), buf)
elif p.indel < 0:
- buf[n] = '-'
- n += 1
- assert n < MAX_PILEUP_BUFFER_SIZE
- n += snprintf(<char *>&(buf[n]),
- MAX_PILEUP_BUFFER_SIZE - n,
- "%i",
- -p.indel)
- assert n < MAX_PILEUP_BUFFER_SIZE
+ kputc('-', buf)
+ kputw(-p.indel, buf)
for j from 1 <= j <= -p.indel:
# TODO: out-of-range check here?
if self.reference_sequence == NULL:
cc = 'N'
else:
cc = self.reference_sequence[self.reference_pos + j]
- buf[n] = strand_mark_char(cc, p.b)
- n += 1
- assert n < MAX_PILEUP_BUFFER_SIZE
+ kputc(strand_mark_char(cc, p.b), buf)
if mark_ends and p.is_tail:
- buf[n] = '$'
- n += 1
- assert n < MAX_PILEUP_BUFFER_SIZE
+ kputc('$', buf)
- buf[n] = ':'
- n += 1
- assert n < MAX_PILEUP_BUFFER_SIZE
+ kputc(':', buf)
- if n == 0:
+ if buf.l == 0:
# could be zero if all qualities are too low
return ""
else:
# quicker to assemble all and split than to encode all separately.
# ignore last ":"
- return force_str(PyBytes_FromStringAndSize(<char*>buf, n-1)).split(":")
+ return force_str(PyBytes_FromStringAndSize(buf.s, buf.l-1)).split(":")
def get_query_qualities(self):
"""query base quality scores at pileup column position.
########################################################
import os
import collections
+try:
+ from collections.abc import Sequence, Mapping # noqa
+except ImportError:
+ from collections import Sequence, Mapping # noqa
import re
import warnings
import array
cdef int MAX_POS = (1 << 31) - 1
# valid types for SAM headers
-VALID_HEADER_TYPES = {"HD" : collections.Mapping,
- "SQ" : collections.Sequence,
- "RG" : collections.Sequence,
- "PG" : collections.Sequence,
- "CO" : collections.Sequence}
+VALID_HEADER_TYPES = {"HD" : Mapping,
+ "SQ" : Sequence,
+ "RG" : Sequence,
+ "PG" : Sequence,
+ "CO" : Sequence}
# order of records within SAM headers
VALID_HEADERS = ("HD", "SQ", "RG", "PG", "CO")
raise ValueError(
"invalid type for record %s: %s, expected %s".format(
record, type(data), VALID_HEADER_TYPES[record]))
- if isinstance(data, collections.Mapping):
+ if isinstance(data, Mapping):
lines.append(build_header_line(data, record))
else:
for fields in header_dict[record]:
for record, data in sorted(header_dict.items()):
if record in VALID_HEADERS:
continue
- if isinstance(data, collections.Mapping):
+ if isinstance(data, Mapping):
lines.append(build_header_line(data, record))
else:
for fields in header_dict[record]:
return makeAlignmentHeader(bam_hdr_dup(self.ptr))
property nreferences:
- """"int with the number of :term:`reference` sequences in the file.
+ """int with the number of :term:`reference` sequences in the file.
This is a read-only attribute."""
def __get__(self):
# convert to python string
t = self.__str__()
for line in t.split("\n"):
- if not line.strip():
+ line = line.strip(' \0')
+ if not line:
continue
assert line.startswith("@"), \
"header line without '@': '%s'" % line
# interpret type of known header record tags, default to str
x[key] = KNOWN_HEADER_FIELDS[record].get(key, str)(value)
- if VALID_HEADER_TYPES[record] == collections.Mapping:
+ if VALID_HEADER_TYPES[record] == Mapping:
if record in result:
raise ValueError(
"multiple '%s' lines are not permitted" % record)
result[record] = x
- elif VALID_HEADER_TYPES[record] == collections.Sequence:
+ elif VALID_HEADER_TYPES[record] == Sequence:
if record not in result: result[record] = []
result[record].append(x)
self.header = template.header.copy()
elif isinstance(header, AlignmentHeader):
self.header = header.copy()
- elif isinstance(header, collections.Mapping):
+ elif isinstance(header, Mapping):
self.header = AlignmentHeader.from_dict(header)
elif reference_names and reference_lengths:
self.header = AlignmentHeader.from_references(
Or it can be a generator filtering such reads. Example
samfile.find_introns((read for read in samfile.fetch(...) if read.is_reverse))
"""
- import collections
res = collections.Counter()
for r in read_iterator:
if 'N' in r.cigarstring:
AlignedSegment r
int BAM_CREF_SKIP = 3 #BAM_CREF_SKIP
- import collections
res = collections.Counter()
match_or_deletion = {0, 2, 7, 8} # only M/=/X (0/7/8) and D (2) are related to genome position
return self.header.get_reference_length(reference)
property nreferences:
- """"int with the number of :term:`reference` sequences in the file.
+ """int with the number of :term:`reference` sequences in the file.
This is a read-only attribute."""
def __get__(self):
if self.header:
int start = 0,
int stop = MAX_POS,
int truncate = False,
+ int multiple_iterators = True,
**kwargs ):
- # initialize iterator
- self._setup_iterator(tid, start, stop, 1)
+ # initialize iterator. Multiple iterators not available
+ # for CRAM.
+ if multiple_iterators and samfile.is_cram:
+ warnings.warn("multiple_iterators not implemented for CRAM")
+ multiple_iterators = False
+
+ self._setup_iterator(tid, start, stop, multiple_iterators)
self.start = start
self.stop = stop
self.truncate = truncate
from cpython.dict cimport PyDict_GetItemString, PyDict_SetItemString
from cpython.tuple cimport PyTuple_New, PyTuple_SET_ITEM
from cpython.bytes cimport PyBytes_FromStringAndSize
-from cpython.unicode cimport PyUnicode_DecodeASCII
+from cpython.unicode cimport PyUnicode_DecodeUTF8
from cpython.version cimport PY_MAJOR_VERSION
from pysam.libchtslib cimport HTSFile, hisremote
if PY_MAJOR_VERSION < 3:
val = s
else:
- val = PyUnicode_DecodeASCII(s, strlen(s), NULL)
+ val = PyUnicode_DecodeUTF8(s, strlen(s), NULL)
PyDict_SetItemString(bcf_str_cache, s, val)
else:
# Otherwise, copy the entire block
b = datac[:n]
- value = tuple(v.decode('ascii') if v and v != bcf_str_missing else None for v in b.split(b','))
+ value = tuple(v.decode('utf-8') if v and v != bcf_str_missing else None for v in b.split(b','))
else:
value = []
if type == BCF_BT_INT8:
return value
-cdef bcf_copy_expand_array(void *src_data, int src_type, ssize_t src_values,
- void *dst_data, int dst_type, ssize_t dst_values,
+cdef bcf_copy_expand_array(void *src_data, int src_type, size_t src_values,
+ void *dst_data, int dst_type, size_t dst_values,
int vlen):
+    """copy data from src to dst, where both the element size (src_type/dst_type)
+    and the number of elements (src_values/dst_values) may differ.
+ """
+
cdef char *src_datac
cdef char *dst_datac
cdef int8_t *src_datai8
if src_type == dst_type == BCF_BT_CHAR:
src_datac = <char *>src_data
dst_datac = <char *>dst_data
- memcpy(src_datac, dst_datac, src_values)
+ memcpy(dst_datac, src_datac, src_values)
for i in range(src_values, dst_values):
dst_datac[i] = 0
elif src_type == BCF_BT_INT8 and dst_type == BCF_BT_INT32:
cdef int given = len(values)
if value_count[0] != -1 and value_count[0] != given:
if scalar[0]:
- raise TypeError('value expected to be scalar, given len={}'.format(value_count[0], given))
+ raise TypeError('value expected to be scalar, given len={}'.format(given))
else:
raise TypeError('values expected to be {}-tuple, given len={}'.format(value_count[0], given))
if key == 'phased':
sample.phased = bool(value)
return
-
+
cdef bcf_hdr_t *hdr = sample.record.header.ptr
cdef bcf1_t *r = sample.record.ptr
cdef int fmt_id
cdef vdict_t *d
cdef khiter_t k
cdef int fmt_type, scalar, realloc, dst_type, vlen = 0
- cdef ssize_t i, n, value_count, alloc_size, alloc_len, dst_size
+ cdef ssize_t i, nsamples, value_count, alloc_size, alloc_len, dst_size
if bcf_unpack(r, BCF_UN_ALL) < 0:
raise ValueError('Error unpacking VariantRecord')
fmt.type if fmt else -1,
fmt.n if fmt else -1,
&value_count, &scalar, &realloc)
-
vlen = value_count < 0
value_count = len(values)
-
- # If we can, write updated values to existing allocated storage
+
+ # If we can, write updated values to existing allocated storage.
if fmt and not realloc:
r.d.indiv_dirty = 1
bcf_object_to_array(values, fmt.p + sample.index * fmt.size, fmt.type, fmt.n, vlen)
if fmt and fmt.n > alloc_len:
alloc_len = fmt.n
- n = r.n_sample
- new_values = bcf_empty_array(fmt_type, n*alloc_len, vlen)
- cdef char *valp = <char *>new_values
+ nsamples = r.n_sample
+ new_values = bcf_empty_array(fmt_type, nsamples * alloc_len, vlen)
+ cdef char *new_values_p = <char *>new_values
if fmt_type == BCF_HT_INT:
dst_type = BCF_BT_INT32
else:
raise ValueError('Unsupported FORMAT type')
- if fmt and n > 1:
- for i in range(n):
- bcf_copy_expand_array(fmt.p + i*fmt.size, fmt.type, fmt.n,
- valp + i*dst_size, dst_type, alloc_len,
+ if fmt and nsamples > 1:
+ for i in range(nsamples):
+ bcf_copy_expand_array(fmt.p + i * fmt.size, fmt.type, fmt.n,
+ new_values_p + i * dst_size, dst_type, alloc_len,
vlen)
- bcf_object_to_array(values, valp + sample.index*dst_size, dst_type, alloc_len, vlen)
+ bcf_object_to_array(values, new_values_p + sample.index * dst_size, dst_type, alloc_len, vlen)
- if bcf_update_format(hdr, r, bkey, valp, <int>(n*alloc_len), fmt_type) < 0:
+ if bcf_update_format(hdr, r, bkey, new_values_p, <int>(nsamples * alloc_len), fmt_type) < 0:
raise ValueError('Unable to update format values')
cdef bcf_hdr_t *hdr = record.header.ptr
cdef bcf_info_t *info
cdef int end_id = bcf_header_get_info_id(record.header.ptr, b'END')
- cdef int ref_len
+ cdef int ref_len
# allow missing ref when instantiating a new record
if record.ref is not None:
def __str__(self):
cdef int hlen
- cdef char *hstr = bcf_hdr_fmt_text(self.ptr, 0, &hlen)
+ cdef kstring_t line
+ line.l = line.m = 0
+ line.s = NULL
- try:
- return charptr_to_str_w_len(hstr, hlen)
- finally:
- free(hstr)
+ if bcf_hdr_format(self.ptr, 0, &line) < 0:
+ if line.m:
+ free(line.s)
+ raise ValueError('bcf_hdr_format failed')
+
+ ret = charptr_to_str_w_len(line.s, line.l)
+
+ if line.m:
+ free(line.s)
+ return ret
def new_record(self, contig=None, start=0, stop=0, alleles=None,
id=None, qual=None, filter=None, info=None, samples=None,
cdef bcf1_t *r = self.ptr
if bcf_unpack(r, BCF_UN_STR) < 0:
raise ValueError('Error unpacking VariantRecord')
- return bcf_str_cache_get_charptr(r.d.id) if r.d.id != b'.' else None
+ # causes a memory leak https://github.com/pysam-developers/pysam/issues/773
+ # return bcf_str_cache_get_charptr(r.d.id) if r.d.id != b'.' else None
+ return charptr_to_str(r.d.id) if r.d.id != b'.' else None
@id.setter
def id(self, value):
@alleles.setter
def alleles(self, values):
cdef bcf1_t *r = self.ptr
-
+
# Cache rlen of symbolic alleles before call to bcf_update_alleles_str
cdef int rlen = r.rlen
int bcftools_main(int argc, char *argv[])
void bcftools_set_stderr(int fd)
- void bcftools_unset_stderr()
+ void bcftools_close_stderr()
void bcftools_set_stdout(int fd)
void bcftools_set_stdout_fn(const char *)
- void bcftools_unset_stdout()
+ void bcftools_close_stdout()
void bcftools_set_optind(int)
return False
property closed:
- """"bool indicating the current state of the file object.
+ """bool indicating the current state of the file object.
This is a read-only attribute; the close() method changes the value.
"""
def __get__(self):
return self._references
property nreferences:
- """"int with the number of :term:`reference` sequences in the file.
+ """int with the number of :term:`reference` sequences in the file.
This is a read-only attribute."""
def __get__(self):
return len(self._references) if self.references else None
return False
property closed:
- """"bool indicating the current state of the file object.
+ """bool indicating the current state of the file object.
This is a read-only attribute; the close() method changes the value.
"""
def __get__(self):
size_t l, m
char *s
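+    # kputc appends a single character to s, kputw/kputl a decimal
+    # integer; all grow s as needed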
+ int kputc(int c, kstring_t *s)
+ int kputw(int c, kstring_t *s)
+ int kputl(long c, kstring_t *s)
+ int ksprintf(kstring_t *s, const char *fmt, ...)
+
cdef extern from "htslib_util.h" nogil:
int hts_set_verbosity(int verbosity)
# Read VCF header from a file and update the header
int bcf_hdr_set(bcf_hdr_t *hdr, const char *fname)
+ # Appends formatted header text to _str_.
+ # If _is_bcf_ is zero, `IDX` fields are discarded.
+ # @return 0 if successful, or negative if an error occurred
+ # @since 1.4
+    int bcf_hdr_format(const bcf_hdr_t *hdr, int is_bcf, kstring_t *str)
+
# Returns formatted header (newly allocated string) and its length,
# excluding the terminating \0. If is_bcf parameter is unset, IDX
# fields are discarded.
int samtools_main(int argc, char *argv[])
void samtools_set_stderr(int fd)
- void samtools_unset_stderr()
+ void samtools_close_stderr()
void samtools_set_stdout(int fd)
void samtools_set_stdout_fn(const char *)
- void samtools_unset_stdout()
+ void samtools_close_stdout()
void samtools_set_optind(int)
from posix.fcntl cimport open as c_open, O_WRONLY
from libcsamtools cimport samtools_main, samtools_set_stdout, samtools_set_stderr, \
- samtools_unset_stderr, samtools_unset_stdout, samtools_set_stdout_fn, samtools_set_optind
+ samtools_close_stdout, samtools_close_stderr, samtools_set_stdout_fn, samtools_set_optind
from libcbcftools cimport bcftools_main, bcftools_set_stdout, bcftools_set_stderr, \
- bcftools_unset_stderr, bcftools_unset_stdout, bcftools_set_stdout_fn, bcftools_set_optind
+ bcftools_close_stdout, bcftools_close_stderr, bcftools_set_stdout_fn, bcftools_set_optind
#####################################################################
# hard-coded constants
cdef from_string_and_size(const char* s, size_t length):
if IS_PYTHON3:
- return s[:length].decode("ascii")
+ return s[:length].decode("utf8")
else:
return s[:length]
# filename encoding (adapted from lxml.etree.pyx)
cdef str FILENAME_ENCODING = sys.getfilesystemencoding() or sys.getdefaultencoding() or 'ascii'
-
+cdef str TEXT_ENCODING = 'utf-8'
cdef bytes encode_filename(object filename):
"""Make sure a filename is 8-bit encoded (or None)."""
raise TypeError("Argument must be string or unicode.")
-cdef bytes force_bytes(object s, encoding="ascii"):
+cdef bytes force_bytes(object s, encoding=TEXT_ENCODING):
"""convert string or unicode object to bytes, assuming
- ascii encoding.
+ utf8 encoding.
"""
if s is None:
return None
raise TypeError("Argument must be string, bytes or unicode.")
-cdef charptr_to_str(const char* s, encoding="ascii"):
+cdef charptr_to_str(const char* s, encoding=TEXT_ENCODING):
if s == NULL:
return None
if PY_MAJOR_VERSION < 3:
return s.decode(encoding)
-cdef charptr_to_str_w_len(const char* s, size_t n, encoding="ascii"):
+cdef charptr_to_str_w_len(const char* s, size_t n, encoding=TEXT_ENCODING):
if s == NULL:
return None
if PY_MAJOR_VERSION < 3:
return s[:n].decode(encoding)
-cdef bytes charptr_to_bytes(const char* s, encoding="ascii"):
+cdef bytes charptr_to_bytes(const char* s, encoding=TEXT_ENCODING):
if s == NULL:
return None
else:
return s
-cdef force_str(object s, encoding="ascii"):
+cdef force_str(object s, encoding=TEXT_ENCODING):
"""Return s converted to str type of current Python
(bytes in Py2, unicode in Py3)"""
if s is None:
False.
'''
- if method == "index":
- if args and not os.path.exists(args[0]):
- raise IOError("No such file or directory: '%s'" % args[0])
+ if method == "index" and args:
+ # We make sure that at least 1 input file exists,
+ # and if it doesn't we raise an IOError.
+ SIMPLE_FLAGS = ['-c', '--csi', '-f', '--force', '-t', '--tbi', '-n', '--nstats', '-s', '--stats']
+ ARGUMENTS = ['-m', '--min-shift', '-o', '--output-file', '--threads', '-@']
+ skip_next = False
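+        # e.g. for ("-c", "-m", "14", "ex1.bam"): skip "-c", skip "-m" and
+        # its value, then require that "ex1.bam" exists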
+ for arg in args:
+ if skip_next:
+ skip_next = False
+ continue
+ if arg in SIMPLE_FLAGS or (len(arg) > 2 and arg.startswith('-@')):
+ continue
+ if arg in ARGUMENTS:
+ skip_next = True
+ continue
+ if not os.path.exists(arg):
+ raise IOError("No such file or directory: '%s'" % arg)
+ else:
+ break
if args is None:
args = []
# redirect stderr to file
stderr_h, stderr_f = tempfile.mkstemp()
- samtools_set_stderr(stderr_h)
- bcftools_set_stderr(stderr_h)
# redirect stdout to file
if save_stdout:
raise IOError("error while opening {} for writing".format(stdout_f))
samtools_set_stdout_fn(force_bytes(stdout_f))
- samtools_set_stdout(stdout_h)
bcftools_set_stdout_fn(force_bytes(stdout_f))
- bcftools_set_stdout(stdout_h)
elif catch_stdout:
stdout_h, stdout_f = tempfile.mkstemp()
samtools_set_stdout_fn(force_bytes(stdout_f))
bcftools_set_stdout_fn(force_bytes(stdout_f))
args.extend(stdout_option.format(stdout_f).split(" "))
- else:
- samtools_set_stdout(stdout_h)
- bcftools_set_stdout(stdout_h)
+ stdout_h = c_open(b"/dev/null", O_WRONLY)
else:
samtools_set_stdout_fn("-")
bcftools_set_stdout_fn("-")
+ stdout_h = c_open(b"/dev/null", O_WRONLY)
# setup the function call to samtools/bcftools main
cdef char ** cargs
# call samtools/bcftools
if collection == b"samtools":
+ samtools_set_stdout(stdout_h)
+ samtools_set_stderr(stderr_h)
retval = samtools_main(n + 2, cargs)
+ samtools_close_stdout()
+ samtools_close_stderr()
elif collection == b"bcftools":
+ bcftools_set_stdout(stdout_h)
+ bcftools_set_stderr(stderr_h)
retval = bcftools_main(n + 2, cargs)
+ bcftools_close_stdout()
+ bcftools_close_stderr()
for i from 0 <= i < n:
free(cargs[i + 2])
os.remove(fn)
return out
- samtools_unset_stderr()
- bcftools_unset_stderr()
-
- if save_stdout or catch_stdout:
- samtools_unset_stdout()
- bcftools_unset_stdout()
-
out_stderr = _collect(stderr_f)
if save_stdout:
out_stdout = None
# pysam versioning information
-__version__ = "0.15.2"
+__version__ = "0.15.3"
# TODO: upgrade number
__samtools_version__ = "1.9"
FILE * samtools_stderr = NULL;
FILE * samtools_stdout = NULL;
const char * samtools_stdout_fn = NULL;
-int samtools_stdout_fileno = STDOUT_FILENO;
FILE * samtools_set_stderr(int fd)
return samtools_stderr;
}
-void samtools_unset_stderr(void)
+void samtools_close_stderr(void)
{
- if (samtools_stderr != NULL)
- fclose(samtools_stderr);
- samtools_stderr = fopen("/dev/null", "w");
+ fclose(samtools_stderr);
+ samtools_stderr = NULL;
}
FILE * samtools_set_stdout(int fd)
{
fprintf(samtools_stderr, "could not set stdout to fd %i", fd);
}
- samtools_stdout_fileno = fd;
return samtools_stdout;
}
samtools_stdout_fn = fn;
}
-void samtools_unset_stdout(void)
+void samtools_close_stdout(void)
{
- if (samtools_stdout != NULL)
- fclose(samtools_stdout);
- samtools_stdout = fopen("/dev/null", "w");
- samtools_stdout_fileno = STDOUT_FILENO;
+ fclose(samtools_stdout);
+ samtools_stdout = NULL;
}
int samtools_puts(const char *s)
-#ifndef PYSAM_H
-#define PYSAM_H
+#ifndef samtools_PYSAM_H
+#define samtools_PYSAM_H
-#include "stdio.h"
+#include <stdio.h>
extern FILE * samtools_stderr;
/*! set pysam standard output to point to file descriptor
- Setting the stderr will close the previous stdout.
+ Setting the stdout will close the previous stdout.
*/
FILE * samtools_set_stdout(int fd);
*/
void samtools_set_stdout_fn(const char * fn);
-/*! set pysam standard error to /dev/null.
+/*! close pysam standard error and set to NULL
- Unsetting the stderr will close the previous stderr.
*/
-void samtools_unset_stderr(void);
+void samtools_close_stderr(void);
-/*! set pysam standard error to /dev/null.
+/*! close pysam standard output and set to NULL
- Unsetting the stderr will close the previous stderr.
*/
-void samtools_unset_stdout(void);
+void samtools_close_stdout(void);
int samtools_puts(const char *s);
using cython and a high-level API for convenient access to the data
within standard genomic file formats.
-The current version wraps htslib-1.7, samtools-1.7 and bcftools-1.6.
-
See:
http://www.htslib.org
https://github.com/pysam-developers/pysam
def run_configure(option):
+ sys.stdout.flush()
try:
retcode = subprocess.call(
" ".join(("./configure", option)),
return make_print_config
+@contextmanager
+def set_compiler_envvars():
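+    """Temporarily export CC, CFLAGS and LDFLAGS from Python's sysconfig
+    into os.environ (unless already set) so that htslib's ./configure sees
+    the same toolchain; exported variables are removed again on exit."""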
+ tmp_vars = []
+ for var in ['CC', 'CFLAGS', 'LDFLAGS']:
+ if var in os.environ:
+ print ("# pysam: (env) {}={}".format(var, os.environ[var]))
+ elif var in sysconfig.get_config_vars():
+ value = sysconfig.get_config_var(var)
+ print ("# pysam: (sysconfig) {}={}".format(var, value))
+ os.environ[var] = value
+ tmp_vars += [var]
+
+ try:
+ yield
+ finally:
+ for var in tmp_vars:
+ del os.environ[var]
+
+
def configure_library(library_dir, env_options=None, options=[]):
configure_script = os.path.join(library_dir, "configure")
raise ValueError(
"configure script {} does not exist".format(configure_script))
- with changedir(library_dir):
+ with changedir(library_dir), set_compiler_envvars():
if env_options is not None:
if run_configure(env_options):
return env_options
sys.path.insert(0, "pysam")
import version
return version.__version__
-
+
# How to link against HTSLIB
# shared: build shared chtslib from builtin htslib code.
Operating System :: Unix
Operating System :: MacOS
"""
-
+
metadata = {
'name': "pysam",
'version': get_pysam_version(),
import pysam
import pysam.samtools
-from TestUtils import checkBinaryEqual, check_url, \
+from TestUtils import checkBinaryEqual, checkGZBinaryEqual, check_url, \
check_samtools_view_equal, checkFieldEqual, force_str, \
get_temp_filename, BAM_DATADIR
self.checkEcho("ex2.bam",
"ex2.bam",
"tmp_ex2.bam",
- "rb", "wb")
+ "rb", "wb",
+ checkf=checkGZBinaryEqual)
def testCRAM2CRAM(self):
# in some systems different reference sequence paths might be
self.checkEcho("ex2.sam",
"ex2.bam",
"tmp_ex2.bam",
- "r", "wb")
+ "r", "wb",
+ checkf=checkGZBinaryEqual)
def testBAM2SAM(self):
self.checkEcho("ex2.bam",
self.checkEcho("ex1.bam",
"ex1.bam",
"tmp_ex1.bam",
- "rb", "wb")
+ "rb", "wb",
+ checkf=checkGZBinaryEqual)
f = open(os.path.join(BAM_DATADIR, "ex1.bam"))
samfile = pysam.AlignmentFile(f, "rb")
outfile.close()
self.assertTrue(
- checkBinaryEqual(tmpfilename, self.bamfile),
+ checkGZBinaryEqual(tmpfilename, self.bamfile),
"mismatch when construction BAM file, see %s %s" %
(tmpfilename, self.bamfile))
self.assertTrue("SQ" in s.header.to_dict())
self.assertTrue("@SQ" in str(s.header))
+ def test_bam_without_seq_with_null_bytes_in_header(self):
+ s = pysam.AlignmentFile(os.path.join(BAM_DATADIR, "example_no_seq_in_header_null_bytes.bam"))
+ self.assertTrue("SQ" in s.header.to_dict())
+ self.assertTrue("@SQ" in str(s.header))
+
class TestMismatchingHeader(unittest.TestCase):
'''see issue 716.'''
return found
+def checkGZBinaryEqual(filename1, filename2):
+ '''return true if the decompressed contents of the two files
+ are binary equal.
+ '''
+ with gzip.open(filename1, "rb") as infile1:
+ d1 = infile1.read()
+ with gzip.open(filename2, "rb") as infile2:
+ d2 = infile2.read()
+ if d1 == d2:
+ return True
+ return False
+
+
def check_samtools_view_equal(
filename1, filename2,
without_header=False):
+# -*- coding: utf-8 -*-
+
import os
+import glob
import sys
import unittest
import pysam
inf.subset_samples(["NA00001"])
+class TestVCFVersions(unittest.TestCase):
+
+ def setUp(self):
+ self.files_to_test = (glob.glob(os.path.join(CBCF_DATADIR, "example_v*.vcf.gz")) +
+ glob.glob(os.path.join(CBCF_DATADIR, "example_v*.vcf")) +
+ glob.glob(os.path.join(CBCF_DATADIR, "example_v*.bcf")))
+
+ def test_all_records_can_be_fetched(self):
+
+ for fn in self.files_to_test:
+ with pysam.VariantFile(fn) as inf:
+ records = list(inf.fetch())
+
+
+class TestUnicode(unittest.TestCase):
+ filename = "example_vcf43_with_utf8.vcf.gz"
+
+ def test_record_with_unicode(self):
+ with pysam.VariantFile(os.path.join(CBCF_DATADIR,
+ self.filename)) as inf:
+ records = list(inf.fetch("20", 2345677, 2345678))
+ self.assertEqual(len(records), 1)
+ record = records.pop()
+ self.assertEqual(
+ record.info["CLNVI"],
+ ('Institute_of_Human_Genetics', u'Friedrich-Alexander-Universität_Erlangen-Nürnberg'))
+ self.assertEqual(record.samples[0]["AN"], "alpha")
+ self.assertEqual(record.samples[1]["AN"], u"ä")
+ self.assertEqual(record.samples[2]["AN"], u"ü")
+
+
if __name__ == "__main__":
# build data files
print("building data files")
--- /dev/null
+import os
+import glob
+import sys
+import unittest
+import pysam
+import shutil
+import gzip
+import subprocess
+import pytest
+
+try:
+ from pathlib import Path
+except ImportError:
+ Path = None
+
+from TestUtils import get_temp_filename, check_lines_equal, load_and_convert, CBCF_DATADIR, get_temp_context
+
+
+@pytest.fixture
+def vcf_header():
+ vcf_header = pysam.VariantHeader()
+ vcf_header.add_sample("sample1")
+ vcf_header.add_sample("sample2")
+ vcf_header.contigs.add("1")
+ return vcf_header
+
+## NB: new_record segfaults if no coordinates are supplied
+
+def test_ascii_annotation_can_be_added(vcf_header):
+ vcf_header.formats.add("AN", 1, "String", "An annotation")
+ record = vcf_header.new_record(
+ contig="1",
+ start=12,
+ stop=13,
+ samples=[
+ {"AN": "anno1"},
+ {"AN": "anno2"}])
+ assert str(record)[:-1].split("\t")[-2:] == ["anno1", "anno2"]
+
+
+def test_ascii_annotation_with_variable_length_can_be_added(vcf_header):
+ vcf_header.formats.add("AN", 1, "String", "An annotation")
+ record = vcf_header.new_record(
+ contig="1",
+ start=12,
+ stop=13,
+ samples=[
+ {"AN": "anno1b"},
+ {"AN": "anno1"}])
+ assert str(record)[:-1].split("\t")[-2:] == ["anno1b", "anno1"]
+ record = vcf_header.new_record(
+ contig="1",
+ start=12,
+ stop=13,
+ samples=[
+ {"AN": "anno2"},
+ {"AN": "anno2b"}])
+ assert str(record)[:-1].split("\t")[-2:] == ["anno2", "anno2b"]
+
+
+def test_unicode_annotation_can_be_added(vcf_header):
+ vcf_header.formats.add("AN", 1, "String", "An annotation")
+ record = vcf_header.new_record(
+ contig="1",
+ start=12,
+ stop=13,
+ samples=[
+ {"AN": "anno1"},
+ {"AN": "Friedrich-Alexander-Universit\u00E4t_Erlangen-N\u00FCrnberg"}])
+ assert str(record)[:-1].split("\t")[-2:] == [
+ "anno1",
+ "Friedrich-Alexander-Universit\u00E4t_Erlangen-N\u00FCrnberg"]
--- /dev/null
+##fileformat=VCFv4.3
+##fileDate=20090805
+##source=myImputationProgramV3.1
+##reference=file:///seq/references/1000GenomesPilot-NCBI36.fasta
+##contig=<ID=20,length=62435964,assembly=B36,md5=f126cdf8a6e0c7f379d618ff66beb2da,species="Homo sapiens",taxonomy=x>
+##phasing=partial
+##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of Samples With Data">
+##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">
+##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency">
+##INFO=<ID=AA,Number=1,Type=String,Description="Ancestral Allele">
+##INFO=<ID=DB,Number=0,Type=Flag,Description="dbSNP membership, build 129">
+##INFO=<ID=H2,Number=0,Type=Flag,Description="HapMap2 membership">
+##FILTER=<ID=q10,Description="Quality below 10">
+##FILTER=<ID=s50,Description="Less than 50% of samples have data">
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
+##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">
+##FORMAT=<ID=HQ,Number=2,Type=Integer,Description="Haplotype Quality">
+#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003
+20 14370 rs6054257 G A 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 1/1:43:5:.,.
+20 17330 . T A 3 q10 NS=3;DP=11;AF=0.017 GT:GQ:DP:HQ 0|0:49:3:58,50 0|1:3:5:65,3 0/0:41:3
+20 1110696 rs6040355 A G,T 67 PASS NS=2;DP=10;AF=0.333,0.667;AA=T;DB GT:GQ:DP:HQ 1|2:21:6:23,27 2|1:2:0:18,2 2/2:35:4
+20 1230237 . T . 47 PASS NS=3;DP=13;AA=T GT:GQ:DP:HQ 0|0:54:7:56,60 0|0:48:4:51,51 0/0:61:2
+20 1234567 microsat1 GTC G,GTCT 50 PASS NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/2:17:2 1/1:40:3
--- /dev/null
+##fileformat=VCFv4.3
+##fileDate=20090805
+##source=myImputationProgramV3.1
+##reference=file:///seq/references/1000GenomesPilot-NCBI36.fasta
+##contig=<ID=20,length=62435964,assembly=B36,md5=f126cdf8a6e0c7f379d618ff66beb2da,species="Homo sapiens",taxonomy=x>
+##phasing=partial
+##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of Samples With Data">
+##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">
+##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency">
+##INFO=<ID=AA,Number=1,Type=String,Description="Ancestral Allele">
+##INFO=<ID=DB,Number=0,Type=Flag,Description="dbSNP membership, build 129">
+##INFO=<ID=H2,Number=0,Type=Flag,Description="HapMap2 membership">
+##INFO=<ID=CLNVI,Number=.,Type=String,Description="the variant's clinical sources reported as tag-value pairs of database and variant identifier">
+##FILTER=<ID=q10,Description="Quality below 10">
+##FILTER=<ID=s50,Description="Less than 50% of samples have data">
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
+##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">
+##FORMAT=<ID=HQ,Number=2,Type=Integer,Description="Haplotype Quality">
+##FORMAT=<ID=AN,Number=1,Type=String,Description="Some annotation">
+#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003
+20 14370 rs6054257 G A 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 1/1:43:5:.,.
+20 17330 . T A 3 q10 NS=3;DP=11;AF=0.017 GT:GQ:DP:HQ 0|0:49:3:58,50 0|1:3:5:65,3 0/0:41:3
+20 1110696 rs6040355 A G,T 67 PASS NS=2;DP=10;AF=0.333,0.667;AA=T;DB GT:GQ:DP:HQ 1|2:21:6:23,27 2|1:2:0:18,2 2/2:35:4
+20 1230237 . T . 47 PASS NS=3;DP=13;AA=T GT:GQ:DP:HQ 0|0:54:7:56,60 0|0:48:4:51,51 0/0:61:2
+20 1234567 microsat1 GTC G,GTCT 50 PASS NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/2:17:2 1/1:40:3
+20 2345678 rs123 G A 29 PASS NS=3;DP=14;AF=0.5;CLNVI=Institute_of_Human_Genetics,Friedrich-Alexander-Universität_Erlangen-Nürnberg;DB;H2 GT:GQ:DP:HQ:AN 0|0:48:1:51,51:alpha 1|0:48:8:51,51:ä 1/1:43:5:.,.:ü
if IS_PYTHON3:
lines = lines.decode('ascii')
try:
- x = re.search("Version:\s+(\S+)", lines).groups()[0]
+ x = re.search(r"Version:\s+(\S+)", lines).groups()[0]
except AttributeError:
raise ValueError("could not get version from %s" % lines)
return x
# TODO: issues with file naming
# "faidx ex1.fa; %(out)s_ex1.fa.fai",
"index ex1.bam %(out)s_ex1.bam.fai",
+ "index -@2 ex1.bam %(out)s_ex1.bam.fai",
"idxstats ex1.bam > %(out)s_ex1.idxstats",
"fixmate ex1.bam %(out)s_ex1.fixmate.bam",
"flagstat ex1.bam > %(out)s_ex1.flagstat",
pysam_method = getattr(self.module, command)
# run samtools
- full_statement = re.sub("%\(out\)s", self.executable, statement)
+ full_statement = re.sub(r"%\(out\)s", self.executable, statement)
run_command(" ".join((self.executable, full_statement)))
# sys.stdout.write("%s %s ok" % (command, self.executable))
parts = parts[:-2]
# avoid interpolation to preserve string quoting, tab chars, etc.
- pysam_parts = [re.sub("%\(out\)s", "pysam", x) for x in parts[1:]]
+ pysam_parts = [re.sub(r"%\(out\)s", "pysam", x) for x in parts[1:]]
output = pysam_method(*pysam_parts,
raw=True,
catch_stdout=True)
mapped_command = self.get_command(statement, map_to_internal=True)
pysam_method = getattr(self.module, mapped_command)
usage_msg = pysam_method.usage()
- expected = "Usage:\s+{} {}".format(self.executable, command)
+ expected = r"Usage:\s+{} {}".format(self.executable, command)
self.assertTrue(re.search(expected, usage_msg) is not None)
def tearDown(self):
self.assertRaises(IOError, pysam.samtools.index,
"exdoesntexist.bam")
+ def testEmptyIndexWithExtraFlag(self):
+ self.assertRaises(IOError, pysam.samtools.index,
+ "-c",
+ "exdoesntexist.bam")
+
+ def testEmptyIndexWithExtraArg(self):
+ self.assertRaises(IOError, pysam.samtools.index,
+ "-c",
+ "-m",
+ "14",
+ "exdoesntexist.bam")
+
if sys.platform != "darwin":
# fails with segfault with htslib 1.5 on Osx, an issue with flockfile
import subprocess
import glob
import re
-import copy
-import tempfile
-from TestUtils import check_url, load_and_convert, TABIX_DATADIR, get_temp_filename
+from TestUtils import checkBinaryEqual, checkGZBinaryEqual, check_url, \
+ load_and_convert, TABIX_DATADIR, get_temp_filename
IS_PYTHON3 = sys.version_info[0] >= 3
return [x.encode("ascii") for x in s.split("\t")]
-def checkBinaryEqual(filename1, filename2):
- '''return true if the two files are binary equal.'''
- if os.path.getsize(filename1) != os.path.getsize(filename2):
- return False
-
- with open(filename1, "rb") as infile:
- d1 = infile.read()
-
- with open(filename2, "rb") as infile:
- d2 = infile.read()
-
- if len(d1) != len(d2):
- return False
-
- found = False
- for c1, c2 in zip(d1, d2):
- if c1 != c2:
- break
- else:
- found = True
-
- return found
-
-
class TestIndexing(unittest.TestCase):
filename = os.path.join(TABIX_DATADIR, "example.gtf.gz")
filename_idx = os.path.join(TABIX_DATADIR, "example.gtf.gz.tbi")
'''test indexing via preset.'''
pysam.tabix_index(self.tmpfilename, preset="gff")
- self.assertTrue(checkBinaryEqual(
+ self.assertTrue(checkGZBinaryEqual(
self.tmpfilename + ".tbi", self.filename_idx))
def test_indexing_to_custom_location_works(self):
index_path = get_temp_filename(suffix='custom.tbi')
pysam.tabix_index(self.tmpfilename, preset="gff",
index=index_path, force=True)
- self.assertTrue(checkBinaryEqual(index_path, self.filename_idx))
+ self.assertTrue(checkGZBinaryEqual(index_path, self.filename_idx))
os.unlink(index_path)
def test_indexing_with_explict_columns_works(self):
end_col=4,
line_skip=0,
zerobased=False)
- self.assertTrue(checkBinaryEqual(
+ self.assertTrue(checkGZBinaryEqual(
self.tmpfilename + ".tbi", self.filename_idx))
def test_indexing_with_lineskipping_works(self):
end_col=4,
line_skip=1,
zerobased=False)
- self.assertFalse(checkBinaryEqual(
+ self.assertFalse(checkGZBinaryEqual(
self.tmpfilename + ".tbi", self.filename_idx))
def tearDown(self):
"""build attribute string from dictionary d"""
s = "; ".join(["{} \"{}\"".format(x, y) for (x, y) in d.items()]) + ";"
# remove quotes around numeric values
- s = re.sub("\"(\d+)\"", r"\1", s)
+ s = re.sub(r'"(\d+)"', r'\1', s)
return s
def testRead(self):