From 804729d07198184729ad0e30d76070d1ce9e8fce Mon Sep 17 00:00:00 2001 From: "Michael R. Crusoe" Date: Wed, 16 Jan 2019 01:47:32 -0800 Subject: [PATCH] New upstream version 0.15.2+ds --- .travis.yml | 1 - MANIFEST.in | 4 +- NEWS | 11 ++++++ doc/release.rst | 11 ++++++ pysam/libcalignedsegment.pyx | 65 ++++++++++++++++++++++++++++--- pysam/libcalignmentfile.pyx | 12 +++--- pysam/libcbcf.pyx | 2 +- pysam/libcfaidx.pyx | 11 +++--- pysam/libchtslib.pyx | 13 +++---- pysam/libcutils.pyx | 59 ++++++++++++++++------------ pysam/version.py | 2 +- tests/AlignedSegment_test.py | 29 ++++++++++++++ tests/AlignmentFileHeader_test.py | 23 +++++++++-- tests/AlignmentFilePileup_test.py | 17 ++++++-- tests/faidx_test.py | 39 ++++++++++++------- 15 files changed, 227 insertions(+), 72 deletions(-) diff --git a/.travis.yml b/.travis.yml index beccc4d..3642281 100644 --- a/.travis.yml +++ b/.travis.yml @@ -3,7 +3,6 @@ os: - osx language: c -sudo: false env: matrix: diff --git a/MANIFEST.in b/MANIFEST.in index 531159d..aaacb22 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -50,8 +50,8 @@ include htslib/htslib.pc.in include htslib/htslib/*.h include htslib/cram/*.c include htslib/cram/*.h -include htslib/win/*.c -include htslib/win/*.h +include htslib/os/*.c +include htslib/os/*.h include cy_build.py include pysam.py include requirements.txt diff --git a/NEWS b/NEWS index 0ac7303..4861555 100644 --- a/NEWS +++ b/NEWS @@ -5,6 +5,17 @@ http://pysam.readthedocs.io/en/latest/release.html Release notes ============= +Release 0.15.2 +============== + +Bugfix release. + +* [#746] catch pileup itorator out-of-scope segfaults +* [#747] fix faixd fetch with region +* [#748] increase max_pos to (1<<31)-1 +* [#645] Add missing macOS stub files in `MANIFEST.in`, @SoapZA +* [#737] Fix bug in get_aligned_pairs, @bkohrn + Release 0.15.1 ============== diff --git a/doc/release.rst b/doc/release.rst index ed3afb2..5adc437 100644 --- a/doc/release.rst +++ b/doc/release.rst @@ -7,6 +7,17 @@ Release 0.15.1 Bugfix release. +* [#746] catch pileup itorator out-of-scope segfaults +* [#747] fix faixd fetch with region +* [#748] increase max_pos to (1<<31)-1 +* [#645] Add missing macOS stub files in `MANIFEST.in`, @SoapZA +* [#737] Fix bug in get_aligned_pairs, @bkohrn + +Release 0.15.1 +============== + +Bugfix release. + * [#716] raise ValueError if tid is out of range when writing * [#697] release version using cython 0.28.5 for python 3.7 compatibility diff --git a/pysam/libcalignedsegment.pyx b/pysam/libcalignedsegment.pyx index d0a20d5..7ad61ac 100644 --- a/pysam/libcalignedsegment.pyx +++ b/pysam/libcalignedsegment.pyx @@ -402,11 +402,18 @@ cdef inline pack_tags(tags): # use array.tostring() to retrieve byte representation and # save as bytes datafmt = "2sBBI%is" % (len(value) * DATATYPE2FORMAT[typecode][1]) - args.extend([pytag[:2], - ord("B"), - typecode, - len(value), - force_bytes(value.tostring())]) + if IS_PYTHON3: + args.extend([pytag[:2], + ord("B"), + typecode, + len(value), + value.tobytes()]) + else: + args.extend([pytag[:2], + ord("B"), + typecode, + len(value), + force_bytes(value.tostring())]) else: if typecode == 0: @@ -1945,6 +1952,8 @@ cdef class AlignedSegment: else: for i from pos <= i < pos + l: result.append((None, i)) + else: + r_idx += l pos += l elif op == BAM_CHARD_CLIP: @@ -2845,6 +2854,10 @@ cdef class PileupColumn: # out of sync. for x from 0 <= x < self.n_pu: p = &(self.plp[0][x]) + if p == NULL: + raise ValueError( + "pileup buffer out of sync - most likely use of iterator " + "outside loop") if pileup_base_qual_skip(p, self.min_base_quality): continue pileups.append(makePileupRead(p, self.header)) @@ -2887,8 +2900,15 @@ cdef class PileupColumn: cdef uint32_t c = 0 cdef uint32_t cnt = 0 cdef bam_pileup1_t * p = NULL + if self.plp == NULL or self.plp[0] == NULL: + raise ValueError("PileupColumn accessed after iterator finished") + for x from 0 <= x < self.n_pu: p = &(self.plp[0][x]) + if p == NULL: + raise ValueError( + "pileup buffer out of sync - most likely use of iterator " + "outside loop") if pileup_base_qual_skip(p, self.min_base_quality): continue cnt += 1 @@ -2956,11 +2976,18 @@ cdef class PileupColumn: cdef uint8_t rb = 0 cdef uint8_t * buf = self.buf cdef bam_pileup1_t * p = NULL + + if self.plp == NULL or self.plp[0] == NULL: + raise ValueError("PileupColumn accessed after iterator finished") # todo: reference sequence to count matches/mismatches # todo: convert assertions to exceptions for x from 0 <= x < self.n_pu: p = &(self.plp[0][x]) + if p == NULL: + raise ValueError( + "pileup buffer out of sync - most likely use of iterator " + "outside loop") if pileup_base_qual_skip(p, self.min_base_quality): continue # see samtools pileup_seq @@ -3062,6 +3089,11 @@ cdef class PileupColumn: result = [] for x from 0 <= x < self.n_pu: p = &(self.plp[0][x]) + if p == NULL: + raise ValueError( + "pileup buffer out of sync - most likely use of iterator " + "outside loop") + if p.qpos < p.b.core.l_qseq: c = bam_get_qual(p.b)[p.qpos] else: @@ -3079,11 +3111,19 @@ cdef class PileupColumn: list: a list of quality scores """ + if self.plp == NULL or self.plp[0] == NULL: + raise ValueError("PileupColumn accessed after iterator finished") + cdef uint32_t x = 0 cdef bam_pileup1_t * p = NULL result = [] for x from 0 <= x < self.n_pu: p = &(self.plp[0][x]) + if p == NULL: + raise ValueError( + "pileup buffer out of sync - most likely use of iterator " + "outside loop") + if pileup_base_qual_skip(p, self.min_base_quality): continue result.append(p.b.core.qual) @@ -3097,12 +3137,19 @@ cdef class PileupColumn: list: a list of read positions """ + if self.plp == NULL or self.plp[0] == NULL: + raise ValueError("PileupColumn accessed after iterator finished") cdef uint32_t x = 0 cdef bam_pileup1_t * p = NULL result = [] for x from 0 <= x < self.n_pu: p = &(self.plp[0][x]) + if p == NULL: + raise ValueError( + "pileup buffer out of sync - most likely use of iterator " + "outside loop") + if pileup_base_qual_skip(p, self.min_base_quality): continue result.append(p.qpos) @@ -3116,11 +3163,19 @@ cdef class PileupColumn: list: a list of query names at pileup column position. """ + if self.plp == NULL or self.plp[0] == NULL: + raise ValueError("PileupColumn accessed after iterator finished") + cdef uint32_t x = 0 cdef bam_pileup1_t * p = NULL result = [] for x from 0 <= x < self.n_pu: p = &(self.plp[0][x]) + if p == NULL: + raise ValueError( + "pileup buffer out of sync - most likely use of iterator " + "outside loop") + if pileup_base_qual_skip(p, self.min_base_quality): continue result.append(charptr_to_str(pysam_bam_get_qname(p.b))) diff --git a/pysam/libcalignmentfile.pyx b/pysam/libcalignmentfile.pyx index 1d42dda..4030868 100644 --- a/pysam/libcalignmentfile.pyx +++ b/pysam/libcalignmentfile.pyx @@ -1,4 +1,3 @@ - # cython: embedsignature=True # cython: profile=True ######################################################## @@ -62,6 +61,8 @@ import warnings import array from libc.errno cimport errno, EPIPE from libc.string cimport strcmp, strpbrk, strerror +from libc.stdint cimport INT32_MAX + from cpython cimport array as c_array from cpython.version cimport PY_MAJOR_VERSION @@ -94,7 +95,8 @@ IndexStats = collections.namedtuple("IndexStats", ######################################################## ## global variables # maximum genomic coordinace -cdef int MAX_POS = 2 << 29 +# for some reason, using 'int' causes overlflow +cdef int MAX_POS = (1 << 31) - 1 # valid types for SAM headers VALID_HEADER_TYPES = {"HD" : collections.Mapping, @@ -1314,8 +1316,9 @@ cdef class AlignmentFile(HTSFile): an iterator over genomic positions. """ - cdef int rtid, rstart, rstop, has_coord - + cdef int rtid, has_coord + cdef int32_t rstart, rstop + if not self.is_open: raise ValueError("I/O operation on closed file") @@ -2054,7 +2057,6 @@ cdef class IteratorRowRegion(IteratorRow): IteratorRow.__init__(self, samfile, multiple_iterators=multiple_iterators) - with nogil: self.iter = sam_itr_queryi( self.index, diff --git a/pysam/libcbcf.pyx b/pysam/libcbcf.pyx index b74ada1..9ac58dc 100644 --- a/pysam/libcbcf.pyx +++ b/pysam/libcbcf.pyx @@ -111,7 +111,7 @@ __all__ = ['VariantFile', ## Constants ######################################################################## -cdef int MAX_POS = 2 << 29 +cdef int MAX_POS = (1 << 31) - 1 cdef tuple VALUE_TYPES = ('Flag', 'Integer', 'Float', 'String') cdef tuple METADATA_TYPES = ('FILTER', 'INFO', 'FORMAT', 'CONTIG', 'STRUCTURED', 'GENERIC') cdef tuple METADATA_LENGTHS = ('FIXED', 'VARIABLE', 'A', 'G', 'R') diff --git a/pysam/libcfaidx.pyx b/pysam/libcfaidx.pyx index ca2f518..cf783b9 100644 --- a/pysam/libcfaidx.pyx +++ b/pysam/libcfaidx.pyx @@ -287,19 +287,20 @@ cdef class FastaFile: cdef char *ref cdef int rstart, rend - reference, rstart, rend = parse_region(reference, start, end, region) + contig, rstart, rend = parse_region(reference, start, end, region) - if reference is None: + if contig is None: raise ValueError("no sequence/region supplied.") if rstart == rend: return "" - ref = reference + contig_b = force_bytes(contig) + ref = contig_b with nogil: length = faidx_seq_len(self.fastafile, ref) if length == -1: - raise KeyError("sequence '%s' not present" % reference) + raise KeyError("sequence '%s' not present" % contig) if rstart >= length: return "" @@ -315,7 +316,7 @@ cdef class FastaFile: if errno: raise IOError(errno, strerror(errno)) else: - raise ValueError("failure when retrieving sequence on '%s'" % reference) + raise ValueError("failure when retrieving sequence on '%s'" % contig) try: return charptr_to_str(seq) diff --git a/pysam/libchtslib.pyx b/pysam/libchtslib.pyx index 040ad1f..c03c7cf 100644 --- a/pysam/libchtslib.pyx +++ b/pysam/libchtslib.pyx @@ -9,10 +9,9 @@ from posix.unistd cimport dup from libc.errno cimport errno +from libc.stdint cimport INT32_MAX from cpython cimport PyBytes_FromStringAndSize - from pysam.libchtslib cimport * - from pysam.libcutils cimport force_bytes, force_str, charptr_to_str, charptr_to_str_w_len from pysam.libcutils cimport encode_filename, from_string_and_size @@ -25,7 +24,7 @@ from pysam.libcutils cimport encode_filename, from_string_and_size import os import io import re -from warnings import warn +from warnings import warn ######################################################################## @@ -41,7 +40,7 @@ DEF SEEK_CUR = 1 DEF SEEK_END = 2 # maximum genomic coordinace -cdef int MAX_POS = 2 << 29 +cdef int MAX_POS = (1 << 31) - 1 cdef tuple FORMAT_CATEGORIES = ('UNKNOWN', 'ALIGNMENTS', 'VARIANTS', 'INDEX', 'REGIONS') cdef tuple FORMATS = ('UNKNOWN', 'BINARY_FORMAT', 'TEXT_FORMAT', 'SAM', 'BAM', 'BAI', 'CRAM', 'CRAI', @@ -630,8 +629,8 @@ cdef class HTSFile(object): """ cdef int rtid - cdef long long rstart - cdef long long rstop + cdef int32_t rstart + cdef int32_t rstop if reference is not None: if contig is not None: @@ -644,7 +643,7 @@ cdef class HTSFile(object): stop = end if contig is None and tid is None and region is None: - return 0, 0, 0, 0 + return 0, 0, 0, MAX_POS rtid = -1 rstart = 0 diff --git a/pysam/libcutils.pyx b/pysam/libcutils.pyx index 75989b1..994a240 100644 --- a/pysam/libcutils.pyx +++ b/pysam/libcutils.pyx @@ -12,6 +12,7 @@ from cpython cimport PyBytes_Check, PyUnicode_Check from cpython cimport array as c_array from libc.stdlib cimport calloc, free from libc.string cimport strncpy +from libc.stdint cimport INT32_MAX, int32_t from libc.stdio cimport fprintf, stderr, fflush from libc.stdio cimport stdout as c_stdout from posix.fcntl cimport open as c_open, O_WRONLY @@ -24,7 +25,7 @@ from libcbcftools cimport bcftools_main, bcftools_set_stdout, bcftools_set_stder ##################################################################### # hard-coded constants -cdef int MAX_POS = 2 << 29 +cdef int MAX_POS = (1 << 31) - 1 ################################################################# # Utility functions for quality string conversions @@ -48,7 +49,10 @@ cpdef array_to_qualitystring(c_array.array qualities, int offset=33): for x from 0 <= x < len(qualities): result[x] = qualities[x] + offset - return force_str(result.tostring()) + if IS_PYTHON3: + return force_str(result.tobytes()) + else: + return result.tostring() cpdef qualities_to_qualitystring(qualities, int offset=33): @@ -198,51 +202,58 @@ cpdef parse_region(contig=None, for invalid or out of bounds regions. """ - cdef long long rstart - cdef long long rend + cdef int32_t rstart + cdef int32_t rstop - if contig is not None: - reference = contig - if stop is not None: - end = stop + + if reference is not None: + if contig is not None: + raise ValueError('contig and reference should not both be specified') + contig = reference + + if contig is not None and region is not None: + raise ValueError('contig/reference and region should not both be specified') + + if end is not None: + if stop is not None: + raise ValueError('stop and end should not both be specified') + stop = end + + if contig is None and region is None: + raise ValueError("neither contig nor region are given") rstart = 0 - rend = MAX_POS - if start != None: + rstop = MAX_POS + if start is not None: try: rstart = start except OverflowError: raise ValueError('start out of range (%i)' % start) - if end != None: + if stop is not None: try: - rend = end + rstop = stop except OverflowError: - raise ValueError('end out of range (%i)' % end) + raise ValueError('stop out of range (%i)' % stop) if region: - region = force_str(region) if ":" in region: contig, coord = region.split(":") parts = coord.split("-") rstart = int(parts[0]) - 1 if len(parts) >= 1: - rend = int(parts[1]) + rstop = int(parts[1]) else: contig = region - if not reference: - return None, 0, 0 - + if rstart > rstop: + raise ValueError('invalid coordinates: start (%i) > stop (%i)' % (rstart, rstop)) if not 0 <= rstart < MAX_POS: raise ValueError('start out of range (%i)' % rstart) - if not 0 <= rend <= MAX_POS: - raise ValueError('end out of range (%i)' % rend) - if rstart > rend: - raise ValueError( - 'invalid region: start (%i) > end (%i)' % (rstart, rend)) + if not 0 <= rstop <= MAX_POS: + raise ValueError('stop out of range (%i)' % rstop) - return force_bytes(reference), rstart, rend + return contig, rstart, rstop def _pysam_dispatch(collection, diff --git a/pysam/version.py b/pysam/version.py index b3f022f..d4ec161 100644 --- a/pysam/version.py +++ b/pysam/version.py @@ -1,5 +1,5 @@ # pysam versioning information -__version__ = "0.15.0" +__version__ = "0.15.2" # TODO: upgrade number __samtools_version__ = "1.9" diff --git a/tests/AlignedSegment_test.py b/tests/AlignedSegment_test.py index 5a759a5..48589e6 100644 --- a/tests/AlignedSegment_test.py +++ b/tests/AlignedSegment_test.py @@ -430,6 +430,35 @@ class TestAlignedSegment(ReadTest): (6, 27), (7, 28), (8, 29), (9, 30)]) + def test_equivalence_matches_only_and_with_seq(self): + a = self.build_read() + a.query_sequence = "ACGT" * 2 + a.cigarstring = "4M1D4M" + a.set_tag("MD", "4^x4") + full = (list(zip(range(0, 4), range(20, 24), "ACGT")) + + [(None, 24, "x")] + + list(zip(range(4, 8), range(25, 29), "ACGT"))) + self.assertEqual( + a.get_aligned_pairs(matches_only=False, with_seq=True), full) + + self.assertEqual( + a.get_aligned_pairs(matches_only=True, with_seq=True), + [x for x in full if x[0] is not None and x[1] is not None]) + + a = self.build_read() + a.query_sequence = "ACGT" * 2 + a.cigarstring = "4M1N4M" + a.set_tag("MD", "8") + full = (list(zip(range(0, 4), range(20, 24), "ACGT")) + + [(None, 24, None)] + + list(zip(range(4, 8), range(25, 29), "ACGT"))) + self.assertEqual( + a.get_aligned_pairs(matches_only=False, with_seq=True), full) + + self.assertEqual( + a.get_aligned_pairs(matches_only=True, with_seq=True), + [x for x in full if x[0] is not None and x[1] is not None]) + def test_get_aligned_pairs_lowercase_md(self): a = self.build_read() a.query_sequence = "A" * 10 diff --git a/tests/AlignmentFileHeader_test.py b/tests/AlignmentFileHeader_test.py index b72c8ef..e6c4287 100644 --- a/tests/AlignmentFileHeader_test.py +++ b/tests/AlignmentFileHeader_test.py @@ -121,6 +121,7 @@ class TestHeaderConstruction(unittest.TestCase): self.compare_headers(header, self.header_without_text) self.check_name_mapping(header) + class TestHeaderSAM(unittest.TestCase): """testing header manipulation""" @@ -287,6 +288,7 @@ class TestHeaderWriteRead(unittest.TestCase): def check_read_write(self, flag_write, header): fn = get_temp_filename() + print(fn) with pysam.AlignmentFile( fn, flag_write, @@ -299,17 +301,30 @@ class TestHeaderWriteRead(unittest.TestCase): with pysam.AlignmentFile(fn) as inf: read_header = inf.header - os.unlink(fn) + # os.unlink(fn) self.compare_headers(header, read_header) + expected_lengths = dict([(x["SN"], x["LN"]) for x in header["SQ"]]) + self.assertEqual(expected_lengths, + dict(zip(read_header.references, + read_header.lengths))) def test_SAM(self): self.check_read_write("wh", self.header) def test_BAM(self): self.check_read_write("wb", self.header) - + def test_CRAM(self): header = copy.copy(self.header) - # for CRAM, \t needs to be quoted: - header['PG'][1]['CL'] = re.sub(r"\t", r"\\\\t", header['PG'][1]['CL']) + if "PG" in header: + # for CRAM, \t needs to be quoted: + header['PG'][1]['CL'] = re.sub(r"\t", r"\\\\t", header['PG'][1]['CL']) self.check_read_write("wc", header) + + +class TestHeaderLargeContigs(TestHeaderWriteRead): + """see issue 741""" + + header = {'SQ': [{'LN': 2147483647, 'SN': 'chr1'}, + {'LN': 1584, 'SN': 'chr2'}], + 'HD': {'VN': '1.0'}} diff --git a/tests/AlignmentFilePileup_test.py b/tests/AlignmentFilePileup_test.py index 6872b01..43072fa 100644 --- a/tests/AlignmentFilePileup_test.py +++ b/tests/AlignmentFilePileup_test.py @@ -115,7 +115,7 @@ class TestPileupReadSelection(unittest.TestCase): fastafile=self.fastafile, redo_baq=True) self.check_equal(refs, iterator) - + class TestPileupReadSelectionFastafile(TestPileupReadSelection): '''test pileup functionality - backwards compatibility''' @@ -206,10 +206,19 @@ class TestPileupObjects(unittest.TestCase): '''test if exception is raised if pileup col is accessed after iterator is exhausted.''' + max_n = 0 for pileupcol in self.samfile.pileup(): - pass - - self.assertRaises(ValueError, getattr, pileupcol, "pileups") + if max_n < pileupcol.n: + max_col = pileupcol + max_n = pileupcol.n + + self.assertRaises(ValueError, getattr, max_col, "pileups") + self.assertRaises(ValueError, max_col.get_query_sequences) + self.assertRaises(ValueError, max_col.get_num_aligned) + self.assertRaises(ValueError, max_col.get_query_qualities) + self.assertRaises(ValueError, max_col.get_mapping_qualities) + self.assertRaises(ValueError, max_col.get_query_positions) + self.assertRaises(ValueError, max_col.get_query_names) class TestIteratorColumnBAM(unittest.TestCase): diff --git a/tests/faidx_test.py b/tests/faidx_test.py index 3126e23..2cfdb76 100644 --- a/tests/faidx_test.py +++ b/tests/faidx_test.py @@ -45,6 +45,13 @@ class TestFastaFile(unittest.TestCase): self.assertRaises(ValueError, self.file.fetch, "chr1", 20, 10) self.assertRaises(KeyError, self.file.fetch, "chr3", 0, 100) + def test_fetch_with_region_and_contig_raises_exception(self): + self.assertRaises(ValueError, self.file.fetch, "chr1", 10, 20, "chr1:11-20") + + def test_fetch_with_region_is_equivalent(self): + self.assertEqual(self.file.fetch("chr1", 10, 20), + self.file.fetch(region="chr1:11-20")) + def testLength(self): self.assertEqual(len(self.file), 2) @@ -60,7 +67,7 @@ class TestFastaFilePathIndex(unittest.TestCase): filename = os.path.join(BAM_DATADIR, "ex1.fa") data_suffix = ".fa" - + def test_raise_exception_if_index_is_missing(self): self.assertRaises(IOError, pysam.FastaFile, @@ -223,22 +230,28 @@ class TestRemoteFileFTP(unittest.TestCase): if not check_url(self.url): return - with pysam.Fastafile(self.url) as f: - self.assertEqual( - len(f.fetch("chr1", 0, 1000)), - 1000) - + try: + with pysam.Fastafile(self.url) as f: + self.assertEqual( + len(f.fetch("chr1", 0, 1000)), + 1000) + except (OSError, IOError): + pass + def test_sequence_lengths_are_available(self): if not check_url(self.url): return - with pysam.Fastafile(self.url) as f: - self.assertEqual(len(f.references), 3366) - self.assertTrue("chr1" in f.references) - self.assertEqual(f.lengths[0], - 248956422) - self.assertEqual(f.get_reference_length("chr1"), - 248956422) + try: + with pysam.Fastafile(self.url) as f: + self.assertEqual(len(f.references), 3366) + self.assertTrue("chr1" in f.references) + self.assertEqual(f.lengths[0], + 248956422) + self.assertEqual(f.get_reference_length("chr1"), + 248956422) + except (OSError, IOError): + pass class TestFastqRecord(unittest.TestCase): -- 2.30.2