- osx
language: c
-sudo: false
env:
matrix:
include htslib/htslib/*.h
include htslib/cram/*.c
include htslib/cram/*.h
-include htslib/win/*.c
-include htslib/win/*.h
+include htslib/os/*.c
+include htslib/os/*.h
include cy_build.py
include pysam.py
include requirements.txt
Release notes
=============
+Release 0.15.2
+==============
+
+Bugfix release.
+
+* [#746] catch pileup itorator out-of-scope segfaults
+* [#747] fix faixd fetch with region
+* [#748] increase max_pos to (1<<31)-1
+* [#645] Add missing macOS stub files in `MANIFEST.in`, @SoapZA
+* [#737] Fix bug in get_aligned_pairs, @bkohrn
+
Release 0.15.1
==============
Bugfix release.
+* [#746] catch pileup itorator out-of-scope segfaults
+* [#747] fix faixd fetch with region
+* [#748] increase max_pos to (1<<31)-1
+* [#645] Add missing macOS stub files in `MANIFEST.in`, @SoapZA
+* [#737] Fix bug in get_aligned_pairs, @bkohrn
+
+Release 0.15.1
+==============
+
+Bugfix release.
+
* [#716] raise ValueError if tid is out of range when writing
* [#697] release version using cython 0.28.5 for python 3.7
compatibility
# use array.tostring() to retrieve byte representation and
# save as bytes
datafmt = "2sBBI%is" % (len(value) * DATATYPE2FORMAT[typecode][1])
- args.extend([pytag[:2],
- ord("B"),
- typecode,
- len(value),
- force_bytes(value.tostring())])
+ if IS_PYTHON3:
+ args.extend([pytag[:2],
+ ord("B"),
+ typecode,
+ len(value),
+ value.tobytes()])
+ else:
+ args.extend([pytag[:2],
+ ord("B"),
+ typecode,
+ len(value),
+ force_bytes(value.tostring())])
else:
if typecode == 0:
else:
for i from pos <= i < pos + l:
result.append((None, i))
+ else:
+ r_idx += l
pos += l
elif op == BAM_CHARD_CLIP:
# out of sync.
for x from 0 <= x < self.n_pu:
p = &(self.plp[0][x])
+ if p == NULL:
+ raise ValueError(
+ "pileup buffer out of sync - most likely use of iterator "
+ "outside loop")
if pileup_base_qual_skip(p, self.min_base_quality):
continue
pileups.append(makePileupRead(p, self.header))
cdef uint32_t c = 0
cdef uint32_t cnt = 0
cdef bam_pileup1_t * p = NULL
+ if self.plp == NULL or self.plp[0] == NULL:
+ raise ValueError("PileupColumn accessed after iterator finished")
+
for x from 0 <= x < self.n_pu:
p = &(self.plp[0][x])
+ if p == NULL:
+ raise ValueError(
+ "pileup buffer out of sync - most likely use of iterator "
+ "outside loop")
if pileup_base_qual_skip(p, self.min_base_quality):
continue
cnt += 1
cdef uint8_t rb = 0
cdef uint8_t * buf = self.buf
cdef bam_pileup1_t * p = NULL
+
+ if self.plp == NULL or self.plp[0] == NULL:
+ raise ValueError("PileupColumn accessed after iterator finished")
# todo: reference sequence to count matches/mismatches
# todo: convert assertions to exceptions
for x from 0 <= x < self.n_pu:
p = &(self.plp[0][x])
+ if p == NULL:
+ raise ValueError(
+ "pileup buffer out of sync - most likely use of iterator "
+ "outside loop")
if pileup_base_qual_skip(p, self.min_base_quality):
continue
# see samtools pileup_seq
result = []
for x from 0 <= x < self.n_pu:
p = &(self.plp[0][x])
+ if p == NULL:
+ raise ValueError(
+ "pileup buffer out of sync - most likely use of iterator "
+ "outside loop")
+
if p.qpos < p.b.core.l_qseq:
c = bam_get_qual(p.b)[p.qpos]
else:
list: a list of quality scores
"""
+ if self.plp == NULL or self.plp[0] == NULL:
+ raise ValueError("PileupColumn accessed after iterator finished")
+
cdef uint32_t x = 0
cdef bam_pileup1_t * p = NULL
result = []
for x from 0 <= x < self.n_pu:
p = &(self.plp[0][x])
+ if p == NULL:
+ raise ValueError(
+ "pileup buffer out of sync - most likely use of iterator "
+ "outside loop")
+
if pileup_base_qual_skip(p, self.min_base_quality):
continue
result.append(p.b.core.qual)
list: a list of read positions
"""
+ if self.plp == NULL or self.plp[0] == NULL:
+ raise ValueError("PileupColumn accessed after iterator finished")
cdef uint32_t x = 0
cdef bam_pileup1_t * p = NULL
result = []
for x from 0 <= x < self.n_pu:
p = &(self.plp[0][x])
+ if p == NULL:
+ raise ValueError(
+ "pileup buffer out of sync - most likely use of iterator "
+ "outside loop")
+
if pileup_base_qual_skip(p, self.min_base_quality):
continue
result.append(p.qpos)
list: a list of query names at pileup column position.
"""
+ if self.plp == NULL or self.plp[0] == NULL:
+ raise ValueError("PileupColumn accessed after iterator finished")
+
cdef uint32_t x = 0
cdef bam_pileup1_t * p = NULL
result = []
for x from 0 <= x < self.n_pu:
p = &(self.plp[0][x])
+ if p == NULL:
+ raise ValueError(
+ "pileup buffer out of sync - most likely use of iterator "
+ "outside loop")
+
if pileup_base_qual_skip(p, self.min_base_quality):
continue
result.append(charptr_to_str(pysam_bam_get_qname(p.b)))
-
# cython: embedsignature=True
# cython: profile=True
########################################################
import array
from libc.errno cimport errno, EPIPE
from libc.string cimport strcmp, strpbrk, strerror
+from libc.stdint cimport INT32_MAX
+
from cpython cimport array as c_array
from cpython.version cimport PY_MAJOR_VERSION
########################################################
## global variables
# maximum genomic coordinace
-cdef int MAX_POS = 2 << 29
+# for some reason, using 'int' causes overlflow
+cdef int MAX_POS = (1 << 31) - 1
# valid types for SAM headers
VALID_HEADER_TYPES = {"HD" : collections.Mapping,
an iterator over genomic positions.
"""
- cdef int rtid, rstart, rstop, has_coord
-
+ cdef int rtid, has_coord
+ cdef int32_t rstart, rstop
+
if not self.is_open:
raise ValueError("I/O operation on closed file")
IteratorRow.__init__(self, samfile,
multiple_iterators=multiple_iterators)
-
with nogil:
self.iter = sam_itr_queryi(
self.index,
## Constants
########################################################################
-cdef int MAX_POS = 2 << 29
+cdef int MAX_POS = (1 << 31) - 1
cdef tuple VALUE_TYPES = ('Flag', 'Integer', 'Float', 'String')
cdef tuple METADATA_TYPES = ('FILTER', 'INFO', 'FORMAT', 'CONTIG', 'STRUCTURED', 'GENERIC')
cdef tuple METADATA_LENGTHS = ('FIXED', 'VARIABLE', 'A', 'G', 'R')
cdef char *ref
cdef int rstart, rend
- reference, rstart, rend = parse_region(reference, start, end, region)
+ contig, rstart, rend = parse_region(reference, start, end, region)
- if reference is None:
+ if contig is None:
raise ValueError("no sequence/region supplied.")
if rstart == rend:
return ""
- ref = reference
+ contig_b = force_bytes(contig)
+ ref = contig_b
with nogil:
length = faidx_seq_len(self.fastafile, ref)
if length == -1:
- raise KeyError("sequence '%s' not present" % reference)
+ raise KeyError("sequence '%s' not present" % contig)
if rstart >= length:
return ""
if errno:
raise IOError(errno, strerror(errno))
else:
- raise ValueError("failure when retrieving sequence on '%s'" % reference)
+ raise ValueError("failure when retrieving sequence on '%s'" % contig)
try:
return charptr_to_str(seq)
from posix.unistd cimport dup
from libc.errno cimport errno
+from libc.stdint cimport INT32_MAX
from cpython cimport PyBytes_FromStringAndSize
-
from pysam.libchtslib cimport *
-
from pysam.libcutils cimport force_bytes, force_str, charptr_to_str, charptr_to_str_w_len
from pysam.libcutils cimport encode_filename, from_string_and_size
import os
import io
import re
-from warnings import warn
+from warnings import warn
########################################################################
DEF SEEK_END = 2
# maximum genomic coordinace
-cdef int MAX_POS = 2 << 29
+cdef int MAX_POS = (1 << 31) - 1
cdef tuple FORMAT_CATEGORIES = ('UNKNOWN', 'ALIGNMENTS', 'VARIANTS', 'INDEX', 'REGIONS')
cdef tuple FORMATS = ('UNKNOWN', 'BINARY_FORMAT', 'TEXT_FORMAT', 'SAM', 'BAM', 'BAI', 'CRAM', 'CRAI',
"""
cdef int rtid
- cdef long long rstart
- cdef long long rstop
+ cdef int32_t rstart
+ cdef int32_t rstop
if reference is not None:
if contig is not None:
stop = end
if contig is None and tid is None and region is None:
- return 0, 0, 0, 0
+ return 0, 0, 0, MAX_POS
rtid = -1
rstart = 0
from cpython cimport array as c_array
from libc.stdlib cimport calloc, free
from libc.string cimport strncpy
+from libc.stdint cimport INT32_MAX, int32_t
from libc.stdio cimport fprintf, stderr, fflush
from libc.stdio cimport stdout as c_stdout
from posix.fcntl cimport open as c_open, O_WRONLY
#####################################################################
# hard-coded constants
-cdef int MAX_POS = 2 << 29
+cdef int MAX_POS = (1 << 31) - 1
#################################################################
# Utility functions for quality string conversions
for x from 0 <= x < len(qualities):
result[x] = qualities[x] + offset
- return force_str(result.tostring())
+ if IS_PYTHON3:
+ return force_str(result.tobytes())
+ else:
+ return result.tostring()
cpdef qualities_to_qualitystring(qualities, int offset=33):
for invalid or out of bounds regions.
"""
- cdef long long rstart
- cdef long long rend
+ cdef int32_t rstart
+ cdef int32_t rstop
- if contig is not None:
- reference = contig
- if stop is not None:
- end = stop
+
+ if reference is not None:
+ if contig is not None:
+ raise ValueError('contig and reference should not both be specified')
+ contig = reference
+
+ if contig is not None and region is not None:
+ raise ValueError('contig/reference and region should not both be specified')
+
+ if end is not None:
+ if stop is not None:
+ raise ValueError('stop and end should not both be specified')
+ stop = end
+
+ if contig is None and region is None:
+ raise ValueError("neither contig nor region are given")
rstart = 0
- rend = MAX_POS
- if start != None:
+ rstop = MAX_POS
+ if start is not None:
try:
rstart = start
except OverflowError:
raise ValueError('start out of range (%i)' % start)
- if end != None:
+ if stop is not None:
try:
- rend = end
+ rstop = stop
except OverflowError:
- raise ValueError('end out of range (%i)' % end)
+ raise ValueError('stop out of range (%i)' % stop)
if region:
- region = force_str(region)
if ":" in region:
contig, coord = region.split(":")
parts = coord.split("-")
rstart = int(parts[0]) - 1
if len(parts) >= 1:
- rend = int(parts[1])
+ rstop = int(parts[1])
else:
contig = region
- if not reference:
- return None, 0, 0
-
+ if rstart > rstop:
+ raise ValueError('invalid coordinates: start (%i) > stop (%i)' % (rstart, rstop))
if not 0 <= rstart < MAX_POS:
raise ValueError('start out of range (%i)' % rstart)
- if not 0 <= rend <= MAX_POS:
- raise ValueError('end out of range (%i)' % rend)
- if rstart > rend:
- raise ValueError(
- 'invalid region: start (%i) > end (%i)' % (rstart, rend))
+ if not 0 <= rstop <= MAX_POS:
+ raise ValueError('stop out of range (%i)' % rstop)
- return force_bytes(reference), rstart, rend
+ return contig, rstart, rstop
def _pysam_dispatch(collection,
# pysam versioning information
-__version__ = "0.15.0"
+__version__ = "0.15.2"
# TODO: upgrade number
__samtools_version__ = "1.9"
(6, 27), (7, 28),
(8, 29), (9, 30)])
+ def test_equivalence_matches_only_and_with_seq(self):
+ a = self.build_read()
+ a.query_sequence = "ACGT" * 2
+ a.cigarstring = "4M1D4M"
+ a.set_tag("MD", "4^x4")
+ full = (list(zip(range(0, 4), range(20, 24), "ACGT")) +
+ [(None, 24, "x")] +
+ list(zip(range(4, 8), range(25, 29), "ACGT")))
+ self.assertEqual(
+ a.get_aligned_pairs(matches_only=False, with_seq=True), full)
+
+ self.assertEqual(
+ a.get_aligned_pairs(matches_only=True, with_seq=True),
+ [x for x in full if x[0] is not None and x[1] is not None])
+
+ a = self.build_read()
+ a.query_sequence = "ACGT" * 2
+ a.cigarstring = "4M1N4M"
+ a.set_tag("MD", "8")
+ full = (list(zip(range(0, 4), range(20, 24), "ACGT")) +
+ [(None, 24, None)] +
+ list(zip(range(4, 8), range(25, 29), "ACGT")))
+ self.assertEqual(
+ a.get_aligned_pairs(matches_only=False, with_seq=True), full)
+
+ self.assertEqual(
+ a.get_aligned_pairs(matches_only=True, with_seq=True),
+ [x for x in full if x[0] is not None and x[1] is not None])
+
def test_get_aligned_pairs_lowercase_md(self):
a = self.build_read()
a.query_sequence = "A" * 10
self.compare_headers(header, self.header_without_text)
self.check_name_mapping(header)
+
class TestHeaderSAM(unittest.TestCase):
"""testing header manipulation"""
def check_read_write(self, flag_write, header):
fn = get_temp_filename()
+ print(fn)
with pysam.AlignmentFile(
fn,
flag_write,
with pysam.AlignmentFile(fn) as inf:
read_header = inf.header
- os.unlink(fn)
+ # os.unlink(fn)
self.compare_headers(header, read_header)
+ expected_lengths = dict([(x["SN"], x["LN"]) for x in header["SQ"]])
+ self.assertEqual(expected_lengths,
+ dict(zip(read_header.references,
+ read_header.lengths)))
def test_SAM(self):
self.check_read_write("wh", self.header)
def test_BAM(self):
self.check_read_write("wb", self.header)
-
+
def test_CRAM(self):
header = copy.copy(self.header)
- # for CRAM, \t needs to be quoted:
- header['PG'][1]['CL'] = re.sub(r"\t", r"\\\\t", header['PG'][1]['CL'])
+ if "PG" in header:
+ # for CRAM, \t needs to be quoted:
+ header['PG'][1]['CL'] = re.sub(r"\t", r"\\\\t", header['PG'][1]['CL'])
self.check_read_write("wc", header)
+
+
+class TestHeaderLargeContigs(TestHeaderWriteRead):
+ """see issue 741"""
+
+ header = {'SQ': [{'LN': 2147483647, 'SN': 'chr1'},
+ {'LN': 1584, 'SN': 'chr2'}],
+ 'HD': {'VN': '1.0'}}
fastafile=self.fastafile,
redo_baq=True)
self.check_equal(refs, iterator)
-
+
class TestPileupReadSelectionFastafile(TestPileupReadSelection):
'''test pileup functionality - backwards compatibility'''
'''test if exception is raised if pileup col is accessed after
iterator is exhausted.'''
+ max_n = 0
for pileupcol in self.samfile.pileup():
- pass
-
- self.assertRaises(ValueError, getattr, pileupcol, "pileups")
+ if max_n < pileupcol.n:
+ max_col = pileupcol
+ max_n = pileupcol.n
+
+ self.assertRaises(ValueError, getattr, max_col, "pileups")
+ self.assertRaises(ValueError, max_col.get_query_sequences)
+ self.assertRaises(ValueError, max_col.get_num_aligned)
+ self.assertRaises(ValueError, max_col.get_query_qualities)
+ self.assertRaises(ValueError, max_col.get_mapping_qualities)
+ self.assertRaises(ValueError, max_col.get_query_positions)
+ self.assertRaises(ValueError, max_col.get_query_names)
class TestIteratorColumnBAM(unittest.TestCase):
self.assertRaises(ValueError, self.file.fetch, "chr1", 20, 10)
self.assertRaises(KeyError, self.file.fetch, "chr3", 0, 100)
+ def test_fetch_with_region_and_contig_raises_exception(self):
+ self.assertRaises(ValueError, self.file.fetch, "chr1", 10, 20, "chr1:11-20")
+
+ def test_fetch_with_region_is_equivalent(self):
+ self.assertEqual(self.file.fetch("chr1", 10, 20),
+ self.file.fetch(region="chr1:11-20"))
+
def testLength(self):
self.assertEqual(len(self.file), 2)
filename = os.path.join(BAM_DATADIR, "ex1.fa")
data_suffix = ".fa"
-
+
def test_raise_exception_if_index_is_missing(self):
self.assertRaises(IOError,
pysam.FastaFile,
if not check_url(self.url):
return
- with pysam.Fastafile(self.url) as f:
- self.assertEqual(
- len(f.fetch("chr1", 0, 1000)),
- 1000)
-
+ try:
+ with pysam.Fastafile(self.url) as f:
+ self.assertEqual(
+ len(f.fetch("chr1", 0, 1000)),
+ 1000)
+ except (OSError, IOError):
+ pass
+
def test_sequence_lengths_are_available(self):
if not check_url(self.url):
return
- with pysam.Fastafile(self.url) as f:
- self.assertEqual(len(f.references), 3366)
- self.assertTrue("chr1" in f.references)
- self.assertEqual(f.lengths[0],
- 248956422)
- self.assertEqual(f.get_reference_length("chr1"),
- 248956422)
+ try:
+ with pysam.Fastafile(self.url) as f:
+ self.assertEqual(len(f.references), 3366)
+ self.assertTrue("chr1" in f.references)
+ self.assertEqual(f.lengths[0],
+ 248956422)
+ self.assertEqual(f.get_reference_length("chr1"),
+ 248956422)
+ except (OSError, IOError):
+ pass
class TestFastqRecord(unittest.TestCase):