From 5aa6792bd7dd9b57187fe1816f1d1dee7028141e Mon Sep 17 00:00:00 2001 From: Andreas Tille Date: Sat, 17 Feb 2018 14:42:30 +0100 Subject: [PATCH] New upstream version 0.14+ds --- .gitignore | 2 + MANIFEST.in | 40 +- NEWS | 50 +- bcftools/HMM.c.pysam.c | 2 +- bcftools/bam2bcf.c.pysam.c | 10 +- bcftools/bam2bcf_indel.c.pysam.c | 22 +- bcftools/bam_sample.c.pysam.c | 2 +- bcftools/bcftools.pysam.c | 68 + bcftools/bcftools.pysam.h | 47 + bcftools/bin.c.pysam.c | 2 +- bcftools/ccall.c.pysam.c | 2 +- bcftools/consensus.c.pysam.c | 78 +- bcftools/convert.c.pysam.c | 10 +- bcftools/csq.c.pysam.c | 124 +- bcftools/em.c.pysam.c | 10 +- bcftools/filter.c.pysam.c | 22 +- bcftools/gvcf.c.pysam.c | 2 +- bcftools/hclust.c.pysam.c | 16 +- bcftools/kmin.c.pysam.c | 2 +- bcftools/main.c.pysam.c | 22 +- bcftools/mcall.c.pysam.c | 24 +- bcftools/mpileup.c.pysam.c | 74 +- bcftools/ploidy.c.pysam.c | 2 +- bcftools/prob1.c.pysam.c | 10 +- bcftools/pysam.h | 7 - bcftools/regidx.c.pysam.c | 18 +- bcftools/reheader.c | 521 ++++++ bcftools/reheader.c.pysam.c | 523 ++++++ bcftools/smpl_ilist.c.pysam.c | 2 +- bcftools/tabix.c.pysam.c | 36 +- bcftools/tsv2vcf.c.pysam.c | 2 +- bcftools/vcfannotate.c.pysam.c | 52 +- bcftools/vcfbuf.c.pysam.c | 2 +- bcftools/vcfcall.c.pysam.c | 126 +- bcftools/vcfcnv.c.pysam.c | 86 +- bcftools/vcfconcat.c.pysam.c | 60 +- bcftools/vcfconvert.c.pysam.c | 190 +-- bcftools/vcffilter.c.pysam.c | 46 +- bcftools/vcfgtcheck.c.pysam.c | 52 +- bcftools/vcfindex.c.pysam.c | 60 +- bcftools/vcfisec.c.pysam.c | 78 +- bcftools/vcfmerge.c.pysam.c | 102 +- bcftools/vcfnorm.c.pysam.c | 66 +- bcftools/vcfplugin.c.pysam.c | 94 +- bcftools/vcfquery.c.pysam.c | 54 +- bcftools/vcfroh.c.pysam.c | 100 +- bcftools/vcfsom.c.pysam.c | 58 +- bcftools/vcfsort.c.pysam.c | 30 +- bcftools/vcfstats.c.pysam.c | 348 ++-- bcftools/vcfview.c.pysam.c | 100 +- bcftools/vcmp.c.pysam.c | 2 +- bcftools/version.c.pysam.c | 4 +- benchmark/AlignedSegment_bench.py | 43 - benchmark/tabix_bench.py | 76 - 
benchmark/windows_small.bed.gz.tbi | Bin 34590 -> 0 bytes doc/benchmarking.rst | 68 + doc/developer.rst | 28 +- doc/index.rst | 5 +- doc/release.rst | 54 + import.py | 53 +- import/pysam.c | 68 + import/pysam.h | 47 + pysam/Pileup.py | 259 +-- pysam/__init__.py | 6 +- pysam/htslib_util.c | 8 +- pysam/htslib_util.h | 9 +- pysam/libcalignedsegment.pxd | 44 +- pysam/libcalignedsegment.pyx | 755 +++++++-- pysam/libcalignmentfile.pxd | 49 +- pysam/libcalignmentfile.pyx | 1468 +++++++++------- pysam/libcbcf.pyx | 27 +- pysam/libcbcftools.pxd | 8 +- pysam/libcfaidx.pxd | 2 + pysam/libcfaidx.pyx | 67 +- pysam/libchtslib.pxd | 33 +- pysam/libchtslib.pyx | 56 +- pysam/libcsamfile.pxd | 4 +- pysam/libcsamtools.pxd | 8 +- pysam/libctabix.pyx | 73 +- pysam/libctabixproxies.pyx | 19 +- pysam/libcutils.pxd | 10 - pysam/libcutils.pyx | 59 +- pysam/namedtuple.py | 117 -- pysam/pysam_util.c | 71 +- pysam/pysam_util.h | 36 - pysam/samfile_util.c | 16 - pysam/samfile_util.h | 7 - pysam/tabix_util.c | 28 - pysam/tabix_util.h | 12 - pysam/utils.py | 5 +- pysam/version.py | 6 +- run_tests_travis.sh | 2 +- samtools/LICENSE | 33 + samtools/README | 54 + samtools/bam.c.pysam.c | 4 +- samtools/bam.h | 2 +- samtools/bam2bcf.c.pysam.c | 10 +- samtools/bam2bcf_indel.c.pysam.c | 22 +- samtools/bam2depth.c.pysam.c | 68 +- samtools/bam_addrprg.c.pysam.c | 42 +- samtools/bam_aux.c.pysam.c | 2 +- samtools/bam_cat.c.pysam.c | 46 +- samtools/bam_color.c.pysam.c | 2 +- samtools/bam_flags.c.pysam.c | 42 +- samtools/bam_import.c.pysam.c | 4 +- samtools/bam_index.c.pysam.c | 14 +- samtools/bam_lpileup.c.pysam.c | 12 +- samtools/bam_markdup.c | 221 ++- samtools/bam_markdup.c.pysam.c | 277 ++- samtools/bam_mate.c.pysam.c | 20 +- samtools/bam_md.c.pysam.c | 26 +- samtools/bam_plbuf.c.pysam.c | 2 +- samtools/bam_plcmd.c.pysam.c | 70 +- samtools/bam_quickcheck.c | 62 +- samtools/bam_quickcheck.c.pysam.c | 86 +- samtools/bam_reheader.c.pysam.c | 38 +- samtools/bam_rmdup.c.pysam.c | 26 +- 
samtools/bam_rmdupse.c.pysam.c | 6 +- samtools/bam_sort.c | 42 +- samtools/bam_sort.c.pysam.c | 94 +- samtools/bam_split.c.pysam.c | 20 +- samtools/bam_stat.c.pysam.c | 42 +- samtools/bamshuf.c | 12 + samtools/bamshuf.c.pysam.c | 46 +- samtools/bamtk.c | 10 +- samtools/bamtk.c.pysam.c | 33 +- samtools/bedcov.c | 6 + samtools/bedcov.c.pysam.c | 24 +- samtools/bedidx.c | 410 ++++- samtools/bedidx.c.pysam.c | 416 ++++- samtools/bedidx.h | 20 + samtools/cut_target.c | 2 +- samtools/cut_target.c.pysam.c | 24 +- samtools/dict.c | 4 +- samtools/dict.c.pysam.c | 30 +- samtools/faidx.c.pysam.c | 22 +- samtools/lz4/LICENSE | 24 + samtools/lz4/lz4.c | 1478 ++++++++++++++++ samtools/lz4/lz4.c.pysam.c | 1480 +++++++++++++++++ samtools/lz4/lz4.h | 463 ++++++ samtools/misc/ace2sam.c.pysam.c | 36 +- samtools/padding.c.pysam.c | 104 +- samtools/phase.c.pysam.c | 84 +- samtools/pysam.h | 7 - samtools/sam.c.pysam.c | 12 +- samtools/sam_header.c | 2 +- samtools/sam_header.c.pysam.c | 10 +- samtools/sam_opts.c.pysam.c | 4 +- samtools/sam_utils.c | 28 + samtools/sam_utils.c.pysam.c | 44 +- samtools/sam_view.c | 171 +- samtools/sam_view.c.pysam.c | 267 +-- samtools/sample.c.pysam.c | 2 +- samtools/samtools.h | 1 + samtools/samtools.pysam.c | 68 + samtools/samtools.pysam.h | 47 + samtools/stats.c.pysam.c | 60 +- samtools/stats_isize.c.pysam.c | 4 +- .../test/merge/test_bam_translate.c.pysam.c | 168 +- .../test/merge/test_rtrans_build.c.pysam.c | 24 +- .../test/merge/test_trans_tbl_init.c.pysam.c | 136 +- samtools/test/split/test_count_rg.c.pysam.c | 30 +- .../split/test_expand_format_string.c.pysam.c | 30 +- samtools/test/split/test_filter_header_rg.c | 18 +- .../split/test_filter_header_rg.c.pysam.c | 66 +- samtools/test/split/test_parse_args.c.pysam.c | 118 +- samtools/test/test.c | 6 - samtools/test/test.c.pysam.c | 22 +- .../test/tview/test_get_rg_sample.c.pysam.c | 4 +- samtools/tmp_file.c | 507 ++++++ samtools/tmp_file.c.pysam.c | 509 ++++++ samtools/tmp_file.h | 144 ++ 
samtools/version.h | 2 +- setup.cfg | 2 + setup.py | 416 ++--- tests/AlignedSegment_bench.py | 30 + tests/AlignedSegment_test.py | 433 ++++- tests/AlignmentFileFetchTestUtils.py | 86 + tests/AlignmentFileFetch_bench.py | 98 ++ tests/AlignmentFileHeader_test.py | 325 ++++ tests/AlignmentFilePileup_bench.py | 147 ++ tests/AlignmentFilePileup_test.py | 384 +++++ tests/AlignmentFile_bench.py | 60 + tests/AlignmentFile_test.py | 1074 +++++------- tests/PileupTestUtils.py | 160 ++ tests/StreamFiledescriptors_test.py | 16 +- tests/TestUtils.py | 49 +- tests/VariantFileFetchTestUtils.py | 69 + tests/VariantFile_bench.py | 59 + tests/VariantFile_test.py | 249 ++- tests/cbcf_data/gnomad.vcf | 200 +++ tests/cbcf_data/gnomad_fixed.vcf | 200 +++ tests/compile_test.py | 11 +- tests/faidx_bench.py | 71 + tests/faidx_test.py | 82 +- tests/linking_test.py | 17 +- tests/pysam_data/Makefile | 11 +- tests/pysam_data/example_no_seq_in_header.bam | Bin 0 -> 953 bytes tests/samtools_test.py | 39 +- tests/tabix_bench.py | 180 ++ tests/tabix_data/example.bed | 164 ++ .../tabix_data/example_large.bed | 0 .../tabix_data/example_large.bed.gz | Bin tests/tabix_data/example_large.bed.gz.tbi | Bin 0 -> 7877 bytes tests/tabix_test.py | 81 +- tests/tabixproxies_test.py | 22 +- tests/test_samtools_python.py | 4 +- 207 files changed, 15472 insertions(+), 5063 deletions(-) create mode 100644 bcftools/bcftools.pysam.c create mode 100644 bcftools/bcftools.pysam.h delete mode 100644 bcftools/pysam.h create mode 100644 bcftools/reheader.c create mode 100644 bcftools/reheader.c.pysam.c delete mode 100644 benchmark/AlignedSegment_bench.py delete mode 100644 benchmark/tabix_bench.py delete mode 100644 benchmark/windows_small.bed.gz.tbi create mode 100644 doc/benchmarking.rst create mode 100644 import/pysam.c create mode 100644 import/pysam.h delete mode 100644 pysam/namedtuple.py delete mode 100644 pysam/samfile_util.c delete mode 100644 pysam/samfile_util.h delete mode 100644 pysam/tabix_util.c delete mode 
100644 pysam/tabix_util.h create mode 100644 samtools/LICENSE create mode 100644 samtools/README create mode 100644 samtools/bedidx.h create mode 100644 samtools/lz4/LICENSE create mode 100644 samtools/lz4/lz4.c create mode 100644 samtools/lz4/lz4.c.pysam.c create mode 100644 samtools/lz4/lz4.h delete mode 100644 samtools/pysam.h create mode 100644 samtools/samtools.pysam.c create mode 100644 samtools/samtools.pysam.h create mode 100644 samtools/tmp_file.c create mode 100644 samtools/tmp_file.c.pysam.c create mode 100644 samtools/tmp_file.h create mode 100644 tests/AlignedSegment_bench.py create mode 100644 tests/AlignmentFileFetchTestUtils.py create mode 100644 tests/AlignmentFileFetch_bench.py create mode 100644 tests/AlignmentFileHeader_test.py create mode 100644 tests/AlignmentFilePileup_bench.py create mode 100644 tests/AlignmentFilePileup_test.py create mode 100644 tests/AlignmentFile_bench.py create mode 100644 tests/PileupTestUtils.py create mode 100644 tests/VariantFileFetchTestUtils.py create mode 100644 tests/VariantFile_bench.py create mode 100644 tests/cbcf_data/gnomad.vcf create mode 100644 tests/cbcf_data/gnomad_fixed.vcf create mode 100644 tests/faidx_bench.py create mode 100644 tests/pysam_data/example_no_seq_in_header.bam create mode 100644 tests/tabix_bench.py create mode 100644 tests/tabix_data/example.bed rename benchmark/windows_small.bed => tests/tabix_data/example_large.bed (100%) rename benchmark/windows_small.bed.gz => tests/tabix_data/example_large.bed.gz (100%) create mode 100644 tests/tabix_data/example_large.bed.gz.tbi diff --git a/.gitignore b/.gitignore index f3e1e51..74fde57 100644 --- a/.gitignore +++ b/.gitignore @@ -14,6 +14,7 @@ tests/*.sam tests/*.fai tests/pysam_data tests/cbcf_data +tests/tabix_data samtools/config.h htslib/config.status @@ -52,6 +53,7 @@ var/ *.egg-info/ .installed.cfg *.egg +doc/_build # Installer logs pip-log.txt diff --git a/MANIFEST.in b/MANIFEST.in index 4c431ec..531159d 100644 --- a/MANIFEST.in +++ 
b/MANIFEST.in @@ -17,16 +17,22 @@ include pysam/libc*.c include pysam/*.c include pysam/*.h +# exclude tests from pypi tar-ball - they +# require additional data +prune tests/ + # samtools include samtools/configure include samtools/config.mk.in include samtools/config.h.in include samtools/*.h +include samtools/*.c exclude samtools/config.h include samtools/*/*.h # bcftools include bcftools/*.h +include bcftools/*.c exclude bcftools/config.h # htslib @@ -50,39 +56,5 @@ include cy_build.py include pysam.py include requirements.txt -# pysam tests -include tests/00README.txt -include tests/pysam_data -include tests/tabix_data -include tests/*.py -#ex1.fa -#include tests/ex1.sam.gz -#include tests/ex3.sam -#include tests/ex4.sam -#include tests/ex5.sam -#include tests/ex6.sam -#include tests/ex7.sam -#include tests/ex8.sam -#include tests/ex9_fail.bam -#include tests/ex9_nofail.bam -#include tests/ex10.sam -#include tests/example.py -#include tests/pysam_test.py -#include tests/segfault_tests.py -#include tests/example_*.sam -#include tests/example_btag.bam -#include tests/tag_bug.bam -#include tests/example.vcf40 -#include tests/example_empty_header.bam -#include tests/test_unaligned.bam -#include tests/issue100.bam - -# tabix tests -#include tests/example.gtf.gz -#include tests/example.gtf.gz.tbi -#include tests/example.bed.gz -#include tests/example.bed.gz.tbi -#include tests/vcf-examples/*.vcf - # documentation include doc/* diff --git a/NEWS b/NEWS index 528d750..bca0d0c 100644 --- a/NEWS +++ b/NEWS @@ -5,6 +5,55 @@ http://pysam.readthedocs.io/en/latest/release.html Release notes ============= +Release 0.14.0 +============== + +This release wraps htslib/samtools versions 1.7.0. + +* SAM/BAM/CRAM headers are now managed by a separate AlignmentHeader + class. +* AlignmentFile.header.as_dict() returns an ordered dictionary. +* Use "stop" instead of "end" to ensure consistency to + VariantFile. The end designations have been kept for backwards + compatibility. 
+ +* [#611] and [#293] CRAM repeated fetch now works, each iterator + reloads index if multiple_iterators=True +* [#608] pysam now wraps htslib 1.7 and samtools 1.7. +* [#580] reference_name and next_reference_name can now be set to "*" + (will be converted to None to indicate an unmapped location) +* [#302] providing no coordinate to count_coverage will not count from + start/end of contig. +* [#325] @SQ records will be automatically added to header if they are + absent from text section of header. +* [#529] add get_forward_sequence() and get_forward_qualities() + methods +* [#577] add from_string() and to_dict()/from_dict() methods to + AlignedSegment. Rename tostring() to to_string() throughout for + consistency +* [#589] return None from build_alignment_sequence if no MD tag is set +* [#528] add PileupColumn.__len__ method + +Backwards incompatible changes: + +* AlignmentFile.header now returns an AlignmentHeader object. Use + AlignmentFile.header.to_dict() to get the dictionary as + previously. Most dictionary accessor methods (keys(), values(), + __getitem__, ...) have been implemented to ensure some level of + backwards compatibility when only reading. + + The rationale for this change is to have consistency between + AlignmentFile and VariantFile. + +* AlignmentFile and FastaFile now raise IOError instead of OSError + +Medium term we plan to have a 1.0 release. The pysam +interface has grown over the years and the API is cluttered with +deprecated names (Samfile, getrname(), gettid(), ...). To work towards +this, the next release (0.15.0) will yield DeprecationWarnings +for any parts of the API that are considered obsolete and will not be +in 1.0. Once 1.0 has been reached, we will use semantic versioning. + Release 0.13.0 =============== @@ -20,7 +69,6 @@ contains a series of bugfixes. * [#537] allow tabix index files to be created in a custom location. 
* [#530] add get_index_statistics() method - Release 0.12.0.1 ================ diff --git a/bcftools/HMM.c.pysam.c b/bcftools/HMM.c.pysam.c index 998254c..2280c0d 100644 --- a/bcftools/HMM.c.pysam.c +++ b/bcftools/HMM.c.pysam.c @@ -1,4 +1,4 @@ -#include "pysam.h" +#include "bcftools.pysam.h" /* The MIT License diff --git a/bcftools/bam2bcf.c.pysam.c b/bcftools/bam2bcf.c.pysam.c index 4db42e4..e6d72dd 100644 --- a/bcftools/bam2bcf.c.pysam.c +++ b/bcftools/bam2bcf.c.pysam.c @@ -1,4 +1,4 @@ -#include "pysam.h" +#include "bcftools.pysam.h" /* bam2bcf.c -- variant calling. @@ -108,7 +108,7 @@ static int get_position(const bam_pileup1_t *p, int *len) if ( cig==BAM_CHARD_CLIP ) continue; if ( cig==BAM_CPAD ) continue; if ( cig==BAM_CREF_SKIP ) continue; - fprintf(pysam_stderr,"todo: cigar %d\n", cig); + fprintf(bcftools_stderr,"todo: cigar %d\n", cig); assert(0); } *len = n_tot_bases; @@ -504,7 +504,7 @@ void calc_SegBias(const bcf_callret1_t *bcr, bcf_call_t *call) double sum = 0; const double log2 = log(2.0); - // fprintf(pysam_stderr,"M=%.1f p=%e q=%e f=%f dp=%d\n",M,p,q,f,avg_dp); + // fprintf(bcftools_stderr,"M=%.1f p=%e q=%e f=%f dp=%d\n",M,p,q,f,avg_dp); int i; for (i=0; in; i++) { @@ -519,7 +519,7 @@ void calc_SegBias(const bcf_callret1_t *bcr, bcf_call_t *call) else tmp = log(2*f*(1-f)*exp(-q) + f*f*exp(-2*q) + (1-f)*(1-f)) + p; sum += tmp; - // fprintf(pysam_stderr,"oi=%d %e\n", oi,tmp); + // fprintf(bcftools_stderr,"oi=%d %e\n", oi,tmp); } call->seg_bias = sum; } @@ -683,7 +683,7 @@ int bcf_call_combine(int n, const bcf_callret1_t *calls, bcf_callaux_t *bca, int } } -// if (ref_base < 0) fprintf(pysam_stderr, "%d,%d,%f,%d\n", call->n_alleles, x, sum_min, call->unseen); +// if (ref_base < 0) fprintf(bcftools_stderr, "%d,%d,%f,%d\n", call->n_alleles, x, sum_min, call->unseen); call->shift = (int)(sum_min + .499); } // combine annotations diff --git a/bcftools/bam2bcf_indel.c.pysam.c b/bcftools/bam2bcf_indel.c.pysam.c index 4b37122..67fff21 100644 --- 
a/bcftools/bam2bcf_indel.c.pysam.c +++ b/bcftools/bam2bcf_indel.c.pysam.c @@ -1,4 +1,4 @@ -#include "pysam.h" +#include "bcftools.pysam.h" /* bam2bcf_indel.c -- indel caller. @@ -158,7 +158,7 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla free(aux); // TODO revisit how/whether to control printing this warning if (hts_verbose >= 2) - fprintf(pysam_stderr, "[%s] excessive INDEL alleles at position %d. Skip the position.\n", __func__, pos + 1); + fprintf(bcftools_stderr, "[%s] excessive INDEL alleles at position %d. Skip the position.\n", __func__, pos + 1); return -1; } types = (int*)calloc(n_types, sizeof(int)); @@ -231,7 +231,7 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla if ((double)(max2&0xffff) / ((max2&0xffff) + (max2>>16)) >= 0.7) max2_i = -1; if (max_i >= 0) r[max_i] = 15; if (max2_i >= 0) r[max2_i] = 15; - //for (i = 0; i < right - left; ++i) fputc("=ACMGRSVTWYHKDBN"[(int)r[i]], pysam_stderr); fputc('\n', pysam_stderr); + //for (i = 0; i < right - left; ++i) fputc("=ACMGRSVTWYHKDBN"[(int)r[i]], bcftools_stderr); fputc('\n', bcftools_stderr); } free(ref0); free(cns); } @@ -299,7 +299,7 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla else if (types[t] > 0) ir = est_indelreg(pos, ref, types[t], &inscns[t*max_ins]); else ir = est_indelreg(pos, ref, -types[t], 0); if (ir > bca->indelreg) bca->indelreg = ir; -// fprintf(pysam_stderr, "%d, %d, %d\n", pos, types[t], ir); +// fprintf(bcftools_stderr, "%d, %d, %d\n", pos, types[t], ir); // realignment for (s = K = 0; s < n; ++s) { // write ref2 @@ -361,11 +361,11 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla } #if 0 for (l = 0; l < tend - tbeg + abs(types[t]); ++l) - fputc("ACGTN"[(int)ref2[tbeg-left+l]], pysam_stderr); - fputc('\n', pysam_stderr); - for (l = 0; l < qend - qbeg; ++l) fputc("ACGTN"[(int)query[l]], pysam_stderr); - fputc('\n', pysam_stderr); - 
fprintf(pysam_stderr, "pos=%d type=%d read=%d:%d name=%s qbeg=%d tbeg=%d score=%d\n", pos, types[t], s, i, bam_get_qname(p->b), qbeg, tbeg, sc); + fputc("ACGTN"[(int)ref2[tbeg-left+l]], bcftools_stderr); + fputc('\n', bcftools_stderr); + for (l = 0; l < qend - qbeg; ++l) fputc("ACGTN"[(int)query[l]], bcftools_stderr); + fputc('\n', bcftools_stderr); + fprintf(bcftools_stderr, "pos=%d type=%d read=%d:%d name=%s qbeg=%d tbeg=%d score=%d\n", pos, types[t], s, i, bam_get_qname(p->b), qbeg, tbeg, sc); #endif } } @@ -424,7 +424,7 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla if (seqQ > 255) seqQ = 255; p->aux = (sc[0]&0x3f)<<16 | seqQ<<8 | indelQ; // use 22 bits in total sumq[sc[0]&0x3f] += indelQ < seqQ? indelQ : seqQ; -// fprintf(pysam_stderr, "pos=%d read=%d:%d name=%s call=%d indelQ=%d seqQ=%d\n", pos, s, i, bam1_qname(p->b), types[sc[0]&0x3f], indelQ, seqQ); +// fprintf(bcftools_stderr, "pos=%d read=%d:%d name=%s call=%d indelQ=%d seqQ=%d\n", pos, s, i, bam1_qname(p->b), types[sc[0]&0x3f], indelQ, seqQ); } } // determine bca->indel_types[] and bca->inscns @@ -456,7 +456,7 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla if (x == bca->indel_types[j]) break; p->aux = j<<16 | (j == 4? 
0 : (p->aux&0xffff)); if ((p->aux>>16&0x3f) > 0) ++n_alt; - //fprintf(pysam_stderr, "X pos=%d read=%d:%d name=%s call=%d type=%d seqQ=%d indelQ=%d\n", pos, s, i, bam_get_qname(p->b), (p->aux>>16)&0x3f, bca->indel_types[(p->aux>>16)&0x3f], (p->aux>>8)&0xff, p->aux&0xff); + //fprintf(bcftools_stderr, "X pos=%d read=%d:%d name=%s call=%d type=%d seqQ=%d indelQ=%d\n", pos, s, i, bam_get_qname(p->b), (p->aux>>16)&0x3f, bca->indel_types[(p->aux>>16)&0x3f], (p->aux>>8)&0xff, p->aux&0xff); } } diff --git a/bcftools/bam_sample.c.pysam.c b/bcftools/bam_sample.c.pysam.c index 76d7a61..c25358f 100644 --- a/bcftools/bam_sample.c.pysam.c +++ b/bcftools/bam_sample.c.pysam.c @@ -1,4 +1,4 @@ -#include "pysam.h" +#include "bcftools.pysam.h" /* bam_sample.c -- group data by sample. diff --git a/bcftools/bcftools.pysam.c b/bcftools/bcftools.pysam.c new file mode 100644 index 0000000..63dfed5 --- /dev/null +++ b/bcftools/bcftools.pysam.c @@ -0,0 +1,68 @@ +#include +#include +#include +#include +#include +#include + +#include "bcftools.pysam.h" + +FILE * bcftools_stderr = NULL; +FILE * bcftools_stdout = NULL; +const char * bcftools_stdout_fn = NULL; +int bcftools_stdout_fileno = STDOUT_FILENO; + + +FILE * bcftools_set_stderr(int fd) +{ + if (bcftools_stderr != NULL) + fclose(bcftools_stderr); + bcftools_stderr = fdopen(fd, "w"); + return bcftools_stderr; +} + +void bcftools_unset_stderr(void) +{ + if (bcftools_stderr != NULL) + fclose(bcftools_stderr); + bcftools_stderr = fopen("/dev/null", "w"); +} + +FILE * bcftools_set_stdout(int fd) +{ + if (bcftools_stdout != NULL) + fclose(bcftools_stdout); + bcftools_stdout = fdopen(fd, "w"); + if (bcftools_stdout == NULL) + { + fprintf(bcftools_stderr, "could not set stdout to fd %i", fd); + } + bcftools_stdout_fileno = fd; + return bcftools_stdout; +} + +void bcftools_set_stdout_fn(const char *fn) +{ + bcftools_stdout_fn = fn; +} + +void bcftools_unset_stdout(void) +{ + if (bcftools_stdout != NULL) + fclose(bcftools_stdout); + bcftools_stdout = 
fopen("/dev/null", "w"); + bcftools_stdout_fileno = STDOUT_FILENO; +} + +void bcftools_set_optind(int val) +{ + // setting this in cython via + // "from posix.unistd cimport optind" + // did not work. + // + // setting to 0 forces a complete re-initialization + optind = val; +} + + + diff --git a/bcftools/bcftools.pysam.h b/bcftools/bcftools.pysam.h new file mode 100644 index 0000000..4c3806c --- /dev/null +++ b/bcftools/bcftools.pysam.h @@ -0,0 +1,47 @@ +#ifndef BCFTOOLS_PYSAM_H +#define BCFTOOLS_PYSAM_H + +#include "stdio.h" + +extern FILE * bcftools_stderr; + +extern FILE * bcftools_stdout; + +extern const char * bcftools_stdout_fn; + +/*! set pysam standard error to point to file descriptor + + Setting the stderr will close the previous stderr. + */ +FILE * bcftools_set_stderr(int fd); + +/*! set pysam standard output to point to file descriptor + + Setting the stderr will close the previous stdout. + */ +FILE * bcftools_set_stdout(int fd); + +/*! set pysam standard output to point to filename + + */ +void bcftools_set_stdout_fn(const char * fn); + +/*! set pysam standard error to /dev/null. + + Unsetting the stderr will close the previous stderr. + */ +void bcftools_unset_stderr(void); + +/*! set pysam standard error to /dev/null. + + Unsetting the stderr will close the previous stderr. + */ +void bcftools_unset_stdout(void); + +int bcftools_dispatch(int argc, char *argv[]); + +void bcftools_set_optind(int); + +extern int bcftools_main(int argc, char *argv[]); + +#endif diff --git a/bcftools/bin.c.pysam.c b/bcftools/bin.c.pysam.c index 6469b57..4880231 100644 --- a/bcftools/bin.c.pysam.c +++ b/bcftools/bin.c.pysam.c @@ -1,4 +1,4 @@ -#include "pysam.h" +#include "bcftools.pysam.h" /* The MIT License diff --git a/bcftools/ccall.c.pysam.c b/bcftools/ccall.c.pysam.c index 1765d84..696b455 100644 --- a/bcftools/ccall.c.pysam.c +++ b/bcftools/ccall.c.pysam.c @@ -1,4 +1,4 @@ -#include "pysam.h" +#include "bcftools.pysam.h" /* ccall.c -- consensus variant calling. 
diff --git a/bcftools/consensus.c.pysam.c b/bcftools/consensus.c.pysam.c index 5250b4f..9ba5dd0 100644 --- a/bcftools/consensus.c.pysam.c +++ b/bcftools/consensus.c.pysam.c @@ -1,4 +1,4 @@ -#include "pysam.h" +#include "bcftools.pysam.h" /* The MIT License @@ -103,7 +103,7 @@ args_t; static chain_t* init_chain(chain_t *chain, int ref_ori_pos) { -// fprintf(pysam_stderr, "init_chain(*chain, ref_ori_pos=%d)\n", ref_ori_pos); +// fprintf(bcftools_stderr, "init_chain(*chain, ref_ori_pos=%d)\n", ref_ori_pos); chain = (chain_t*) calloc(1,sizeof(chain_t)); chain->num = 0; chain->block_lengths = NULL; @@ -173,7 +173,7 @@ static void print_chain(args_t *args) static void push_chain_gap(chain_t *chain, int ref_start, int ref_len, int alt_start, int alt_len) { -// fprintf(pysam_stderr, "push_chain_gap(*chain, ref_start=%d, ref_len=%d, alt_start=%d, alt_len=%d)\n", ref_start, ref_len, alt_start, alt_len); +// fprintf(bcftools_stderr, "push_chain_gap(*chain, ref_start=%d, ref_len=%d, alt_start=%d, alt_len=%d)\n", ref_start, ref_len, alt_start, alt_len); int num = chain->num; if (ref_start <= chain->ref_last_block_ori) { @@ -235,8 +235,8 @@ static void init_data(args_t *args) args->fp_out = fopen(args->output_fname,"w"); if ( ! 
args->fp_out ) error("Failed to create %s: %s\n", args->output_fname, strerror(errno)); } - else args->fp_out = pysam_stdout; - if ( args->isample<0 ) fprintf(pysam_stderr,"Note: the --sample option not given, applying all records\n"); + else args->fp_out = bcftools_stdout; + if ( args->isample<0 ) fprintf(bcftools_stderr,"Note: the --sample option not given, applying all records\n"); if ( args->filter_str ) args->filter = filter_init(args->hdr, args->filter_str); } @@ -279,7 +279,7 @@ static void init_region(args_t *args, char *line) } } args->rid = bcf_hdr_name2id(args->hdr,line); - if ( args->rid<0 ) fprintf(pysam_stderr,"Warning: Sequence \"%s\" not in %s\n", line,args->fname); + if ( args->rid<0 ) fprintf(bcftools_stderr,"Warning: Sequence \"%s\" not in %s\n", line,args->fname); args->fa_buf.l = 0; args->fa_length = 0; args->fa_end_pos = to; @@ -371,7 +371,7 @@ static void apply_variant(args_t *args, bcf1_t *rec) if ( rec->pos <= args->fa_frz_pos ) { - fprintf(pysam_stderr,"The site %s:%d overlaps with another variant, skipping...\n", bcf_seqname(args->hdr,rec),rec->pos+1); + fprintf(bcftools_stderr,"The site %s:%d overlaps with another variant, skipping...\n", bcf_seqname(args->hdr,rec),rec->pos+1); return; } if ( args->mask ) @@ -474,7 +474,7 @@ static void apply_variant(args_t *args, bcf1_t *rec) int idx = rec->pos - args->fa_ori_pos + args->fa_mod_off; if ( idx<0 ) { - fprintf(pysam_stderr,"Warning: ignoring overlapping variant starting at %s:%d\n", bcf_seqname(args->hdr,rec),rec->pos+1); + fprintf(bcftools_stderr,"Warning: ignoring overlapping variant starting at %s:%d\n", bcf_seqname(args->hdr,rec),rec->pos+1); return; } if ( rec->rlen > args->fa_buf.l - idx ) @@ -484,7 +484,7 @@ static void apply_variant(args_t *args, bcf1_t *rec) if ( alen > rec->rlen ) { rec->d.allele[ialt][rec->rlen] = 0; - fprintf(pysam_stderr,"Warning: trimming variant starting at %s:%d\n", bcf_seqname(args->hdr,rec),rec->pos+1); + fprintf(bcftools_stderr,"Warning: trimming variant 
starting at %s:%d\n", bcf_seqname(args->hdr,rec),rec->pos+1); } } if ( idx>=args->fa_buf.l ) @@ -502,7 +502,7 @@ static void apply_variant(args_t *args, bcf1_t *rec) } else if ( strncasecmp(rec->d.allele[0],args->fa_buf.s+idx,rec->rlen) ) { - // fprintf(pysam_stderr,"%d .. [%s], idx=%d ori=%d off=%d\n",args->fa_ori_pos,args->fa_buf.s,idx,args->fa_ori_pos,args->fa_mod_off); + // fprintf(bcftools_stderr,"%d .. [%s], idx=%d ori=%d off=%d\n",args->fa_ori_pos,args->fa_buf.s,idx,args->fa_ori_pos,args->fa_mod_off); char tmp = 0; if ( args->fa_buf.l - idx > rec->rlen ) { @@ -672,35 +672,35 @@ static void consensus(args_t *args) static void usage(args_t *args) { - fprintf(pysam_stderr, "\n"); - fprintf(pysam_stderr, "About: Create consensus sequence by applying VCF variants to a reference fasta\n"); - fprintf(pysam_stderr, " file. By default, the program will apply all ALT variants. Using the\n"); - fprintf(pysam_stderr, " --sample (and, optionally, --haplotype) option will apply genotype\n"); - fprintf(pysam_stderr, " (or haplotype) calls from FORMAT/GT. 
The program ignores allelic depth\n"); - fprintf(pysam_stderr, " information, such as INFO/AD or FORMAT/AD.\n"); - fprintf(pysam_stderr, "Usage: bcftools consensus [OPTIONS] \n"); - fprintf(pysam_stderr, "Options:\n"); - fprintf(pysam_stderr, " -c, --chain write a chain file for liftover\n"); - fprintf(pysam_stderr, " -e, --exclude exclude sites for which the expression is true (see man page for details)\n"); - fprintf(pysam_stderr, " -f, --fasta-ref reference sequence in fasta format\n"); - fprintf(pysam_stderr, " -H, --haplotype choose which allele to use from the FORMAT/GT field, note\n"); - fprintf(pysam_stderr, " the codes are case-insensitive:\n"); - fprintf(pysam_stderr, " 1: first allele from GT\n"); - fprintf(pysam_stderr, " 2: second allele\n"); - fprintf(pysam_stderr, " R: REF allele in het genotypes\n"); - fprintf(pysam_stderr, " A: ALT allele\n"); - fprintf(pysam_stderr, " LR,LA: longer allele and REF/ALT if equal length\n"); - fprintf(pysam_stderr, " SR,SA: shorter allele and REF/ALT if equal length\n"); - fprintf(pysam_stderr, " -i, --include select sites for which the expression is true (see man page for details)\n"); - fprintf(pysam_stderr, " -I, --iupac-codes output variants in the form of IUPAC ambiguity codes\n"); - fprintf(pysam_stderr, " -m, --mask replace regions with N\n"); - fprintf(pysam_stderr, " -o, --output write output to a file [standard output]\n"); - fprintf(pysam_stderr, " -s, --sample apply variants of the given sample\n"); - fprintf(pysam_stderr, "Examples:\n"); - fprintf(pysam_stderr, " # Get the consensus for one region. 
The fasta header lines are then expected\n"); - fprintf(pysam_stderr, " # in the form \">chr:from-to\".\n"); - fprintf(pysam_stderr, " samtools faidx ref.fa 8:11870-11890 | bcftools consensus in.vcf.gz > out.fa\n"); - fprintf(pysam_stderr, "\n"); + fprintf(bcftools_stderr, "\n"); + fprintf(bcftools_stderr, "About: Create consensus sequence by applying VCF variants to a reference fasta\n"); + fprintf(bcftools_stderr, " file. By default, the program will apply all ALT variants. Using the\n"); + fprintf(bcftools_stderr, " --sample (and, optionally, --haplotype) option will apply genotype\n"); + fprintf(bcftools_stderr, " (or haplotype) calls from FORMAT/GT. The program ignores allelic depth\n"); + fprintf(bcftools_stderr, " information, such as INFO/AD or FORMAT/AD.\n"); + fprintf(bcftools_stderr, "Usage: bcftools consensus [OPTIONS] \n"); + fprintf(bcftools_stderr, "Options:\n"); + fprintf(bcftools_stderr, " -c, --chain write a chain file for liftover\n"); + fprintf(bcftools_stderr, " -e, --exclude exclude sites for which the expression is true (see man page for details)\n"); + fprintf(bcftools_stderr, " -f, --fasta-ref reference sequence in fasta format\n"); + fprintf(bcftools_stderr, " -H, --haplotype choose which allele to use from the FORMAT/GT field, note\n"); + fprintf(bcftools_stderr, " the codes are case-insensitive:\n"); + fprintf(bcftools_stderr, " 1: first allele from GT\n"); + fprintf(bcftools_stderr, " 2: second allele\n"); + fprintf(bcftools_stderr, " R: REF allele in het genotypes\n"); + fprintf(bcftools_stderr, " A: ALT allele\n"); + fprintf(bcftools_stderr, " LR,LA: longer allele and REF/ALT if equal length\n"); + fprintf(bcftools_stderr, " SR,SA: shorter allele and REF/ALT if equal length\n"); + fprintf(bcftools_stderr, " -i, --include select sites for which the expression is true (see man page for details)\n"); + fprintf(bcftools_stderr, " -I, --iupac-codes output variants in the form of IUPAC ambiguity codes\n"); + fprintf(bcftools_stderr, " -m, 
--mask replace regions with N\n"); + fprintf(bcftools_stderr, " -o, --output write output to a file [standard output]\n"); + fprintf(bcftools_stderr, " -s, --sample apply variants of the given sample\n"); + fprintf(bcftools_stderr, "Examples:\n"); + fprintf(bcftools_stderr, " # Get the consensus for one region. The fasta header lines are then expected\n"); + fprintf(bcftools_stderr, " # in the form \">chr:from-to\".\n"); + fprintf(bcftools_stderr, " samtools faidx ref.fa 8:11870-11890 | bcftools consensus in.vcf.gz > out.fa\n"); + fprintf(bcftools_stderr, "\n"); exit(1); } diff --git a/bcftools/convert.c.pysam.c b/bcftools/convert.c.pysam.c index 95814b7..db7b6c1 100644 --- a/bcftools/convert.c.pysam.c +++ b/bcftools/convert.c.pysam.c @@ -1,4 +1,4 @@ -#include "pysam.h" +#include "bcftools.pysam.h" /* convert.c -- functions for converting between VCF/BCF and related formats. @@ -213,7 +213,7 @@ static void process_info(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isamp case BCF_BT_INT32: if ( info->v1.i==bcf_int32_missing ) kputc('.', str); else kputw(info->v1.i, str); break; case BCF_BT_FLOAT: if ( bcf_float_is_missing(info->v1.f) ) kputc('.', str); else kputd(info->v1.f, str); break; case BCF_BT_CHAR: kputc(info->v1.i, str); break; - default: fprintf(pysam_stderr,"todo: type %d\n", info->type); exit(1); break; + default: fprintf(bcftools_stderr,"todo: type %d\n", info->type); exit(1); break; } } else if ( fmt->subscript >=0 ) @@ -234,7 +234,7 @@ static void process_info(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isamp case BCF_BT_INT16: BRANCH(int16_t, val==bcf_int16_missing, val==bcf_int16_vector_end, kputw(val, str)); break; case BCF_BT_INT32: BRANCH(int32_t, val==bcf_int32_missing, val==bcf_int32_vector_end, kputw(val, str)); break; case BCF_BT_FLOAT: BRANCH(float, bcf_float_is_missing(val), bcf_float_is_vector_end(val), kputd(val, str)); break; - default: fprintf(pysam_stderr,"todo: type %d\n", info->type); exit(1); break; + default: 
fprintf(bcftools_stderr,"todo: type %d\n", info->type); exit(1); break; } #undef BRANCH } @@ -1015,7 +1015,7 @@ static fmt_t *register_tag(convert_t *convert, int type, char *key, int is_gtf) else if ( id>=0 && bcf_hdr_idinfo_exists(convert->header,BCF_HL_INFO,id) ) { fmt->type = T_INFO; - fprintf(pysam_stderr,"Warning: Assuming INFO/%s\n", key); + fprintf(bcftools_stderr,"Warning: Assuming INFO/%s\n", key); } } } @@ -1199,7 +1199,7 @@ convert_t *convert_init(bcf_hdr_t *hdr, int *samples, int nsamples, const char * char *p = convert->format_str; while ( *p ) { - //fprintf(pysam_stderr,"<%s>\n", p); + //fprintf(bcftools_stderr,"<%s>\n", p); switch (*p) { case '[': is_gtf = 1; p++; break; diff --git a/bcftools/csq.c.pysam.c b/bcftools/csq.c.pysam.c index 4a7810c..978bd59 100644 --- a/bcftools/csq.c.pysam.c +++ b/bcftools/csq.c.pysam.c @@ -1,4 +1,4 @@ -#include "pysam.h" +#include "bcftools.pysam.h" /* The MIT License @@ -887,7 +887,7 @@ void gff_parse_transcript(args_t *args, const char *line, char *ss, ftr_t *ftr) int biotype = gff_parse_biotype(ss); if ( biotype <= 0 ) { - if ( !gff_ignored_biotype(args, ss) && args->quiet<2 ) fprintf(pysam_stderr,"ignored transcript: %s\n",line); + if ( !gff_ignored_biotype(args, ss) && args->quiet<2 ) fprintf(bcftools_stderr,"ignored transcript: %s\n",line); return; } @@ -913,7 +913,7 @@ void gff_parse_gene(args_t *args, const char *line, char *ss, char *chr_beg, cha int biotype = gff_parse_biotype(ss); if ( biotype <= 0 ) { - if ( !gff_ignored_biotype(args, ss) && args->quiet<2 ) fprintf(pysam_stderr,"ignored gene: %s\n",line); + if ( !gff_ignored_biotype(args, ss) && args->quiet<2 ) fprintf(bcftools_stderr,"ignored gene: %s\n",line); return; } @@ -979,7 +979,7 @@ int gff_parse(args_t *args, char *line, ftr_t *ftr) if ( !ss ) return -1; // no ID, ignore the line if ( !strncmp("chromosome",ss+3,10) ) return -1; if ( !strncmp("supercontig",ss+3,11) ) return -1; - if ( args->quiet<2 ) fprintf(pysam_stderr,"ignored: %s\n", line); + 
if ( args->quiet<2 ) fprintf(bcftools_stderr,"ignored: %s\n", line); return -1; } @@ -1001,7 +1001,7 @@ int gff_parse(args_t *args, char *line, ftr_t *ftr) // 7. column: strand if ( *ss == '+' ) ftr->strand = STRAND_FWD; else if ( *ss == '-' ) ftr->strand = STRAND_REV; - else { if ( args->quiet<2 ) fprintf(pysam_stderr,"Skipping unknown strand: %c\n", *ss); return -1; } + else { if ( args->quiet<2 ) fprintf(bcftools_stderr,"Skipping unknown strand: %c\n", *ss); return -1; } ss += 2; // 8. column: phase (codon offset) @@ -1009,7 +1009,7 @@ int gff_parse(args_t *args, char *line, ftr_t *ftr) else if ( *ss == '1' ) ftr->phase = 1; else if ( *ss == '2' ) ftr->phase = 2; else if ( *ss == '.' ) ftr->phase = 0; // exons do not have phase - else { if ( args->quiet<2 ) fprintf(pysam_stderr,"Skipping unknown phase: %c, %s\n", *ss, line); return -1; } + else { if ( args->quiet<2 ) fprintf(bcftools_stderr,"Skipping unknown phase: %c, %s\n", *ss, line); return -1; } ss += 2; // substring search for "Parent=transcript:ENST00000437963" @@ -1274,7 +1274,7 @@ void init_gff(args_t *args) if ( !args->quiet ) { - fprintf(pysam_stderr,"Indexed %d transcripts, %d exons, %d CDSs, %d UTRs\n", + fprintf(bcftools_stderr,"Indexed %d transcripts, %d exons, %d CDSs, %d UTRs\n", regidx_nregs(args->idx_tscript), regidx_nregs(args->idx_exon), regidx_nregs(args->idx_cds), @@ -1291,11 +1291,11 @@ void init_gff(args_t *args) if ( args->quiet<2 && khash_str2int_size(aux->ignored_biotypes) ) { khash_t(str2int) *ign = (khash_t(str2int)*)aux->ignored_biotypes; - fprintf(pysam_stderr,"Ignored the following biotypes:\n"); + fprintf(bcftools_stderr,"Ignored the following biotypes:\n"); for (i = kh_begin(ign); i < kh_end(ign); i++) { if ( !kh_exist(ign,i)) continue; - fprintf(pysam_stderr,"\t%dx\t.. %s\n", kh_value(ign,i), kh_key(ign,i)); + fprintf(bcftools_stderr,"\t%dx\t.. 
%s\n", kh_value(ign,i), kh_key(ign,i)); } } khash_str2int_destroy_free(aux->ignored_biotypes); @@ -1305,7 +1305,7 @@ void init_data(args_t *args) { args->nfmt_bcsq = 1 + (args->ncsq_max - 1) / 32; - if ( !args->quiet ) fprintf(pysam_stderr,"Parsing %s ...\n", args->gff_fname); + if ( !args->quiet ) fprintf(bcftools_stderr,"Parsing %s ...\n", args->gff_fname); init_gff(args); args->rid = -1; @@ -1338,7 +1338,7 @@ void init_data(args_t *args) if ( args->output_type==FT_TAB_TEXT ) { - args->out = args->output_fname ? fopen(args->output_fname,"w") : pysam_stdout; + args->out = args->output_fname ? fopen(args->output_fname,"w") : bcftools_stdout; if ( !args->out ) error("Failed to open %s: %s\n", args->output_fname,strerror(errno)); fprintf(args->out,"# This file was produced by: bcftools +csq(%s+htslib-%s)\n", bcftools_version(),hts_version()); @@ -1366,7 +1366,7 @@ void init_data(args_t *args) bcf_hdr_printf(args->hdr,"##FORMAT=",args->bcsq_tag); bcf_hdr_write(args->out_fh, args->hdr); } - if ( !args->quiet ) fprintf(pysam_stderr,"Calling...\n"); + if ( !args->quiet ) fprintf(bcftools_stderr,"Calling...\n"); } void destroy_data(args_t *args) @@ -1398,7 +1398,7 @@ void destroy_data(args_t *args) ret = hts_close(args->out_fh); else ret = fclose(args->out); - if ( ret ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"pysam_stdout"); + if ( ret ) error("Error: close failed .. 
%s\n", args->output_fname?args->output_fname:"bcftools_stdout"); for (i=0; ivcf_rbuf.m; i++) { vbuf_t *vbuf = args->vcf_buf[i]; @@ -1491,7 +1491,7 @@ static inline void splice_build_hap(splice_t *splice, uint32_t beg, int len) #define XDBG 0 #if XDBG -fprintf(pysam_stderr,"build_hap: rbeg=%d + %d abeg=%d \n",rbeg,rlen,abeg); +fprintf(bcftools_stderr,"build_hap: rbeg=%d + %d abeg=%d \n",rbeg,rlen,abeg); #endif splice->kref.l = 0; splice->kalt.l = 0; @@ -1507,7 +1507,7 @@ fprintf(pysam_stderr,"build_hap: rbeg=%d + %d abeg=%d \n",rbeg,rlen,abeg); else roff = rbeg - splice->vcf.pos; #if XDBG -fprintf(pysam_stderr,"r1: %s roff=%d\n",splice->kref.s,roff); +fprintf(bcftools_stderr,"r1: %s roff=%d\n",splice->kref.s,roff); #endif if ( roff < splice->vcf.rlen && splice->kref.l < rlen ) @@ -1517,7 +1517,7 @@ fprintf(pysam_stderr,"r1: %s roff=%d\n",splice->kref.s,roff); kputsn(splice->vcf.ref + roff, len, &splice->kref); } #if XDBG -fprintf(pysam_stderr,"r2: %s\n",splice->kref.s); +fprintf(bcftools_stderr,"r2: %s\n",splice->kref.s); #endif uint32_t end = splice->vcf.pos + splice->vcf.rlen; // position just after the ref allele @@ -1529,7 +1529,7 @@ fprintf(pysam_stderr,"r2: %s\n",splice->kref.s); kputsn(splice->tr->ref + N_REF_PAD + end - splice->tr->beg, rlen - splice->kref.l, &splice->kref); } #if XDBG -fprintf(pysam_stderr,"r3: %s\n",splice->kref.s); +fprintf(bcftools_stderr,"r3: %s\n",splice->kref.s); #endif @@ -1543,7 +1543,7 @@ fprintf(pysam_stderr,"r3: %s\n",splice->kref.s); else aoff = abeg - splice->vcf.pos; #if XDBG -fprintf(pysam_stderr,"a1: %s aoff=%d\n",splice->kalt.s,aoff); +fprintf(bcftools_stderr,"a1: %s aoff=%d\n",splice->kalt.s,aoff); #endif if ( aoff < splice->vcf.alen && splice->kalt.l < alen ) @@ -1556,7 +1556,7 @@ fprintf(pysam_stderr,"a1: %s aoff=%d\n",splice->kalt.s,aoff); if ( aoff < 0 ) aoff = 0; else aoff--; #if XDBG -fprintf(pysam_stderr,"a2: %s aoff=%d\n",splice->kalt.s,aoff); +fprintf(bcftools_stderr,"a2: %s aoff=%d\n",splice->kalt.s,aoff); #endif 
end = splice->vcf.pos + splice->vcf.rlen; // position just after the ref allele @@ -1568,8 +1568,8 @@ fprintf(pysam_stderr,"a2: %s aoff=%d\n",splice->kalt.s,aoff); kputsn(splice->tr->ref + aoff + N_REF_PAD + end - splice->tr->beg, alen - splice->kalt.l, &splice->kalt); } #if XDBG -fprintf(pysam_stderr,"a3: %s\n",splice->kalt.s); -fprintf(pysam_stderr," [%s]\n [%s]\n\n",splice->kref.s,splice->kalt.s); +fprintf(bcftools_stderr,"a3: %s\n",splice->kalt.s); +fprintf(bcftools_stderr," [%s]\n [%s]\n\n",splice->kref.s,splice->kalt.s); #endif } void csq_stage(args_t *args, csq_t *csq, bcf1_t *rec); @@ -1596,7 +1596,7 @@ static inline int csq_stage_utr(args_t *args, regitr_t *itr, bcf1_t *rec, uint32 static inline void csq_stage_splice(args_t *args, bcf1_t *rec, tscript_t *tr, uint32_t type) { #if XDBG -fprintf(pysam_stderr,"csq_stage_splice %d: type=%d\n",rec->pos+1,type); +fprintf(bcftools_stderr,"csq_stage_splice %d: type=%d\n",rec->pos+1,type); #endif if ( !type ) return; csq_t csq; @@ -1625,7 +1625,7 @@ static inline int splice_csq_ins(args_t *args, splice_t *splice, uint32_t ex_beg splice->ref_end = splice->vcf.pos + splice->vcf.rlen - splice->tend; } #if XDBG -fprintf(pysam_stderr,"ins: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%d check_utr=%d start,stop,beg,end=%d,%d,%d,%d\n", splice->vcf.ref,splice->vcf.alt,ex_beg,ex_end,splice->ref_beg,splice->ref_end,splice->tbeg,splice->tend,splice->check_utr,splice->check_start,splice->check_stop,splice->check_region_beg,splice->check_region_end); +fprintf(bcftools_stderr,"ins: %s>%s .. 
ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%d check_utr=%d start,stop,beg,end=%d,%d,%d,%d\n", splice->vcf.ref,splice->vcf.alt,ex_beg,ex_end,splice->ref_beg,splice->ref_end,splice->tbeg,splice->tend,splice->check_utr,splice->check_start,splice->check_stop,splice->check_region_beg,splice->check_region_end); #endif int ret; @@ -1749,7 +1749,7 @@ static inline int splice_csq_del(args_t *args, splice_t *splice, uint32_t ex_beg splice->ref_end = splice->vcf.pos + splice->vcf.rlen - splice->tend - 1; // the last deleted base #if XDBG -fprintf(pysam_stderr,"del: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%d check_utr=%d start,stop,beg,end=%d,%d,%d,%d\n", splice->vcf.ref,splice->vcf.alt,ex_beg,ex_end,splice->ref_beg,splice->ref_end,splice->tbeg,splice->tend,splice->check_utr,splice->check_start,splice->check_stop,splice->check_region_beg,splice->check_region_end); +fprintf(bcftools_stderr,"del: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%d check_utr=%d start,stop,beg,end=%d,%d,%d,%d\n", splice->vcf.ref,splice->vcf.alt,ex_beg,ex_end,splice->ref_beg,splice->ref_end,splice->tbeg,splice->tend,splice->check_utr,splice->check_start,splice->check_stop,splice->check_region_beg,splice->check_region_end); #endif if ( splice->ref_beg + 1 < ex_beg ) // the part before the exon; ref_beg is off by -1 @@ -1895,7 +1895,7 @@ static inline int splice_csq_mnp(args_t *args, splice_t *splice, uint32_t ex_beg splice->ref_end = splice->vcf.pos + splice->vcf.rlen - splice->tend - 1; #if XDBG -fprintf(pysam_stderr,"mnp: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%d check_utr=%d start,stop,beg,end=%d,%d,%d,%d\n", splice->vcf.ref,splice->vcf.alt,ex_beg,ex_end,splice->ref_beg,splice->ref_end,splice->tbeg,splice->tend,splice->check_utr,splice->check_start,splice->check_stop,splice->check_region_beg,splice->check_region_end); +fprintf(bcftools_stderr,"mnp: %s>%s .. 
ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%d check_utr=%d start,stop,beg,end=%d,%d,%d,%d\n", splice->vcf.ref,splice->vcf.alt,ex_beg,ex_end,splice->ref_beg,splice->ref_end,splice->tbeg,splice->tend,splice->check_utr,splice->check_start,splice->check_stop,splice->check_region_beg,splice->check_region_end); #endif if ( splice->ref_beg < ex_beg ) // the part before the exon @@ -2049,11 +2049,11 @@ int hap_init(args_t *args, hap_node_t *parent, hap_node_t *child, gf_cds_t *cds, if ( child->icds!=tr->ncds-1 ) splice.check_region_end = 1; #if XDBG -fprintf(pysam_stderr,"\n%d [%s][%s] check start:%d,stop:%d\n",splice.vcf.pos+1,splice.vcf.ref,splice.vcf.alt,splice.check_start,splice.check_stop); +fprintf(bcftools_stderr,"\n%d [%s][%s] check start:%d,stop:%d\n",splice.vcf.pos+1,splice.vcf.ref,splice.vcf.alt,splice.check_start,splice.check_stop); #endif int ret = splice_csq(args, &splice, cds->beg, cds->beg + cds->len - 1); #if XDBG -fprintf(pysam_stderr,"cds splice_csq: %d [%s][%s] .. beg,end=%d %d, ret=%d, csq=%d\n\n",splice.vcf.pos+1,splice.kref.s,splice.kalt.s,splice.ref_beg+1,splice.ref_end+1,ret,splice.csq); +fprintf(bcftools_stderr,"cds splice_csq: %d [%s][%s] .. 
beg,end=%d %d, ret=%d, csq=%d\n\n",splice.vcf.pos+1,splice.kref.s,splice.kalt.s,splice.ref_beg+1,splice.ref_end+1,ret,splice.csq); #endif if ( ret==SPLICE_VAR_REF ) return 2; // not a variant, eg REF=CA ALT=CA @@ -2186,7 +2186,7 @@ void hap_destroy(hap_node_t *hap) void cds_translate(kstring_t *_ref, kstring_t *_seq, uint32_t sbeg, uint32_t rbeg, uint32_t rend, int strand, kstring_t *tseq, int fill) { #if XDBG -fprintf(pysam_stderr,"translate: %d %d %d fill=%d seq.l=%d\n",sbeg,rbeg,rend,fill,(int)_seq->l); +fprintf(bcftools_stderr,"translate: %d %d %d fill=%d seq.l=%d\n",sbeg,rbeg,rend,fill,(int)_seq->l); #endif char tmp[3], *codon, *end; int i, len, npad; @@ -2203,12 +2203,12 @@ fprintf(pysam_stderr,"translate: %d %d %d fill=%d seq.l=%d\n",sbeg,rbeg,rend,f #define DBG 0 #if DBG - fprintf(pysam_stderr,"translate: sbeg,rbeg,rend=%d %d %d fill=%d seq.l=%d\n",sbeg,rbeg,rend,fill,(int)_seq->l); - fprintf(pysam_stderr," ref: l=%d %s\n", (int)ref.l,ref.s); - fprintf(pysam_stderr," seq: l=%d m=%d ", (int)seq.l,(int)seq.m); - for (i=0; il); + fprintf(bcftools_stderr," ref: l=%d %s\n", (int)ref.l,ref.s); + fprintf(bcftools_stderr," seq: l=%d m=%d ", (int)seq.l,(int)seq.m); + for (i=0; i1 - fprintf(pysam_stderr," npad: %d\n",npad); + fprintf(bcftools_stderr," npad: %d\n",npad); #endif assert( npad<=rbeg ); @@ -2226,13 +2226,13 @@ fprintf(pysam_stderr,"translate: %d %d %d fill=%d seq.l=%d\n",sbeg,rbeg,rend,f tmp[i] = seq.s[i-npad]; len = seq.l - i + npad; // the remaining length of padded sseq #if DBG>1 - fprintf(pysam_stderr,"\t i=%d\n", i); + fprintf(bcftools_stderr,"\t i=%d\n", i); #endif if ( i==3 ) { kputc_(dna2aa(tmp), tseq); #if DBG>1 - fprintf(pysam_stderr,"[1]%c%c%c\n",tmp[0],tmp[1],tmp[2]); + fprintf(bcftools_stderr,"[1]%c%c%c\n",tmp[0],tmp[1],tmp[2]); #endif codon = seq.s + 3 - npad; // next codon end = codon + len - 1 - (len % 3); // last position of a valid codon @@ -2240,7 +2240,7 @@ fprintf(pysam_stderr,"translate: %d %d %d fill=%d seq.l=%d\n",sbeg,rbeg,rend,f 
{ kputc_(dna2aa(codon), tseq); #if DBG>1 - fprintf(pysam_stderr,"[2]%c%c%c\n",codon[0],codon[1],codon[2]); + fprintf(bcftools_stderr,"[2]%c%c%c\n",codon[0],codon[1],codon[2]); #endif codon += 3; } @@ -2253,8 +2253,8 @@ fprintf(pysam_stderr,"translate: %d %d %d fill=%d seq.l=%d\n",sbeg,rbeg,rend,f if ( i>0 ) { #if DBG>1 - if(i==1)fprintf(pysam_stderr,"[3]%c\n",tmp[0]); - if(i==2)fprintf(pysam_stderr,"[3]%c%c\n",tmp[0],tmp[1]); + if(i==1)fprintf(bcftools_stderr,"[3]%c\n",tmp[0]); + if(i==2)fprintf(bcftools_stderr,"[3]%c%c\n",tmp[0],tmp[1]); #endif for (; i<3; i++) { @@ -2263,7 +2263,7 @@ fprintf(pysam_stderr,"translate: %d %d %d fill=%d seq.l=%d\n",sbeg,rbeg,rend,f } kputc_(dna2aa(tmp), tseq); #if DBG>1 - fprintf(pysam_stderr,"[4]%c%c%c\n",tmp[0],tmp[1],tmp[2]); + fprintf(bcftools_stderr,"[4]%c%c%c\n",tmp[0],tmp[1],tmp[2]); #endif } if ( fill!=0 ) @@ -2273,7 +2273,7 @@ fprintf(pysam_stderr,"translate: %d %d %d fill=%d seq.l=%d\n",sbeg,rbeg,rend,f { kputc_(dna2aa(codon), tseq); #if DBG>1 - fprintf(pysam_stderr,"[5]%c%c%c\t%c\n",codon[0],codon[1],codon[2],dna2aa(codon)); + fprintf(bcftools_stderr,"[5]%c%c%c\t%c\n",codon[0],codon[1],codon[2],dna2aa(codon)); #endif codon += 3; } @@ -2284,9 +2284,9 @@ fprintf(pysam_stderr,"translate: %d %d %d fill=%d seq.l=%d\n",sbeg,rbeg,rend,f // right padding - number of bases to take from ref npad = (seq.m - (sbeg + seq.l)) % 3; #if DBG>1 - fprintf(pysam_stderr," npad: %d\n",npad); + fprintf(bcftools_stderr," npad: %d\n",npad); #endif -if ( !(npad>=0 && sbeg+seq.l+npad<=seq.m) ) fprintf(pysam_stderr,"sbeg=%d seq.l=%d seq.m=%d\n",sbeg,(int)seq.l,(int)seq.m); +if ( !(npad>=0 && sbeg+seq.l+npad<=seq.m) ) fprintf(bcftools_stderr,"sbeg=%d seq.l=%d seq.m=%d\n",sbeg,(int)seq.l,(int)seq.m); assert( npad>=0 && sbeg+seq.l+npad<=seq.m ); // todo: first codon on the rev strand if ( npad==2 ) @@ -2306,14 +2306,14 @@ if ( !(npad>=0 && sbeg+seq.l+npad<=seq.m) ) fprintf(pysam_stderr,"sbeg=%d seq.l end = seq.s + seq.l; for (; i>=0 && end>seq.s; i--) 
tmp[i] = *(--end); #if DBG>1 - fprintf(pysam_stderr,"\t i=%d\n", i); - if(i==1)fprintf(pysam_stderr,"[0] %c\n",tmp[2]); - if(i==0)fprintf(pysam_stderr,"[0] %c%c\n",tmp[1],tmp[2]); + fprintf(bcftools_stderr,"\t i=%d\n", i); + if(i==1)fprintf(bcftools_stderr,"[0] %c\n",tmp[2]); + if(i==0)fprintf(bcftools_stderr,"[0] %c%c\n",tmp[1],tmp[2]); #endif if ( i==-1 ) { #if DBG>1 - fprintf(pysam_stderr,"[1]%c%c%c\t%c\n",tmp[0],tmp[1],tmp[2], cdna2aa(tmp)); + fprintf(bcftools_stderr,"[1]%c%c%c\t%c\n",tmp[0],tmp[1],tmp[2], cdna2aa(tmp)); #endif kputc_(cdna2aa(tmp), tseq); codon = end - 3; @@ -2321,7 +2321,7 @@ if ( !(npad>=0 && sbeg+seq.l+npad<=seq.m) ) fprintf(pysam_stderr,"sbeg=%d seq.l { kputc_(cdna2aa(codon), tseq); #if DBG>1 - fprintf(pysam_stderr,"[2]%c%c%c\t%c\n",codon[0],codon[1],codon[2], cdna2aa(codon)); + fprintf(bcftools_stderr,"[2]%c%c%c\t%c\n",codon[0],codon[1],codon[2], cdna2aa(codon)); #endif codon -= 3; } @@ -2339,8 +2339,8 @@ if ( !(npad>=0 && sbeg+seq.l+npad<=seq.m) ) fprintf(pysam_stderr,"sbeg=%d seq.l else i = -1; #if DBG>1 - if(i==1)fprintf(pysam_stderr,"[3] %c\n",tmp[2]); - if(i==0)fprintf(pysam_stderr,"[3] %c%c\n",tmp[1],tmp[2]); + if(i==1)fprintf(bcftools_stderr,"[3] %c\n",tmp[2]); + if(i==0)fprintf(bcftools_stderr,"[3] %c%c\n",tmp[1],tmp[2]); #endif } // left padding @@ -2350,7 +2350,7 @@ if ( !(npad>=0 && sbeg+seq.l+npad<=seq.m) ) fprintf(pysam_stderr,"sbeg=%d seq.l for (; i>=0 && end>=ref.s; i--) tmp[i] = *(--end); kputc_(cdna2aa(tmp), tseq); #if DBG>1 - fprintf(pysam_stderr,"[4]%c%c%c\t%c\n",tmp[0],tmp[1],tmp[2],cdna2aa(tmp)); + fprintf(bcftools_stderr,"[4]%c%c%c\t%c\n",tmp[0],tmp[1],tmp[2],cdna2aa(tmp)); #endif } if ( fill!=0 ) @@ -2360,7 +2360,7 @@ if ( !(npad>=0 && sbeg+seq.l+npad<=seq.m) ) fprintf(pysam_stderr,"sbeg=%d seq.l { kputc_(cdna2aa(codon), tseq); #if DBG>1 - fprintf(pysam_stderr,"[5]%c%c%c\t%c\n",codon[0],codon[1],codon[2],cdna2aa(codon)); + fprintf(bcftools_stderr,"[5]%c%c%c\t%c\n",codon[0],codon[1],codon[2],cdna2aa(codon)); #endif 
codon -= 3; } @@ -2368,7 +2368,7 @@ if ( !(npad>=0 && sbeg+seq.l+npad<=seq.m) ) fprintf(pysam_stderr,"sbeg=%d seq.l } kputc_(0,tseq); tseq->l--; #if DBG - fprintf(pysam_stderr," tseq: %s\n", tseq->s); + fprintf(bcftools_stderr," tseq: %s\n", tseq->s); #endif } @@ -2400,7 +2400,7 @@ void tscript_splice_ref(tscript_t *tr) int csq_push(args_t *args, csq_t *csq, bcf1_t *rec) { #if XDBG -fprintf(pysam_stderr,"csq_push: %d .. %d\n",rec->pos+1,csq->type.type); +fprintf(bcftools_stderr,"csq_push: %d .. %d\n",rec->pos+1,csq->type.type); #endif khint_t k = kh_get(pos2vbuf, args->pos2vbuf, (int)csq->pos); vbuf_t *vbuf = (k == kh_end(args->pos2vbuf)) ? NULL : kh_val(args->pos2vbuf, k); @@ -2955,9 +2955,9 @@ static inline void hap_stage_vcf(args_t *args, tscript_t *tr, int ismpl, int iha } if ( print_warning ) { - fprintf(pysam_stderr,"Warning: --ncsq %d is too small to annotate %s at %s:%d with %d-th csq\n", + fprintf(bcftools_stderr,"Warning: --ncsq %d is too small to annotate %s at %s:%d with %d-th csq\n", args->ncsq_max/2,args->hdr->samples[ismpl],bcf_hdr_id2name(args->hdr,args->rid),vrec->line->pos+1,csq->idx+1); - if ( args->quiet ) fprintf(pysam_stderr,"(This warning is printed only once)\n"); + if ( args->quiet ) fprintf(bcftools_stderr,"(This warning is printed only once)\n"); } break; } @@ -3356,7 +3356,7 @@ int test_cds(args_t *args, bcf1_t *rec) if ( hap_ret==1 ) { if ( !args->quiet ) - fprintf(pysam_stderr,"Warning: Skipping overlapping variants at %s:%d\t%s>%s\n", chr,rec->pos+1,rec->d.allele[0],rec->d.allele[1]); + fprintf(bcftools_stderr,"Warning: Skipping overlapping variants at %s:%d\t%s>%s\n", chr,rec->pos+1,rec->d.allele[0],rec->d.allele[1]); if ( args->out ) fprintf(args->out,"LOG\tWarning: Skipping overlapping variants at %s:%d\t%s>%s\n", chr,rec->pos+1,rec->d.allele[0],rec->d.allele[1]); } @@ -3380,7 +3380,7 @@ int test_cds(args_t *args, bcf1_t *rec) if ( ngts!=1 && ngts!=2 ) { if ( !args->quiet ) - fprintf(pysam_stderr,"Warning: Skipping site with 
non-diploid/non-haploid genotypes at %s:%d\t%s>%s\n", chr,rec->pos+1,rec->d.allele[0],rec->d.allele[1]); + fprintf(bcftools_stderr,"Warning: Skipping site with non-diploid/non-haploid genotypes at %s:%d\t%s>%s\n", chr,rec->pos+1,rec->d.allele[0],rec->d.allele[1]); if ( args->out ) fprintf(args->out,"LOG\tWarning: Skipping site with non-diploid/non-haploid genotypes at %s:%d\t%s>%s\n", chr,rec->pos+1,rec->d.allele[0],rec->d.allele[1]); continue; @@ -3438,7 +3438,7 @@ int test_cds(args_t *args, bcf1_t *rec) if ( hap_ret==1 ) { if ( !args->quiet ) - fprintf(pysam_stderr,"Warning: Skipping overlapping variants at %s:%d, sample %s\t%s>%s\n", + fprintf(bcftools_stderr,"Warning: Skipping overlapping variants at %s:%d, sample %s\t%s>%s\n", chr,rec->pos+1,args->hdr->samples[args->smpl->idx[ismpl]],rec->d.allele[0],rec->d.allele[ial]); if ( args->out ) fprintf(args->out,"LOG\tWarning: Skipping overlapping variants at %s:%d, sample %s\t%s>%s\n", @@ -3523,9 +3523,9 @@ void csq_stage(args_t *args, csq_t *csq, bcf1_t *rec) } if ( print_warning ) { - fprintf(pysam_stderr,"Warning: --ncsq %d is too small to annotate %s at %s:%d with %d-th csq\n", + fprintf(bcftools_stderr,"Warning: --ncsq %d is too small to annotate %s at %s:%d with %d-th csq\n", args->ncsq_max/2,args->hdr->samples[ismpl],bcf_hdr_id2name(args->hdr,args->rid),vrec->line->pos+1,csq->idx+1); - if ( args->quiet ) fprintf(pysam_stderr,"(This warning is printed only once)\n"); + if ( args->quiet ) fprintf(bcftools_stderr,"(This warning is printed only once)\n"); } break; } diff --git a/bcftools/em.c.pysam.c b/bcftools/em.c.pysam.c index 8109152..db27d06 100644 --- a/bcftools/em.c.pysam.c +++ b/bcftools/em.c.pysam.c @@ -1,4 +1,4 @@ -#include "pysam.h" +#include "bcftools.pysam.h" /* em.c -- mathematical functions. 
@@ -74,7 +74,7 @@ static double prob1(double f, void *data) minaux1_t *a = (minaux1_t*)data; double p = 1., l = 0., f3[3]; int i; -// fprintf(pysam_stdout, "brent %lg\n", f); +// fprintf(bcftools_stdout, "brent %lg\n", f); if (f < 0 || f > 1) return 1e300; f3[0] = (1.-f)*(1.-f); f3[1] = 2.*f*(1.-f); f3[2] = f*f; for (i = a->beg; i < a->end; ++i) { @@ -90,7 +90,7 @@ static double freq_iter(double *f, const double *_pdg, int beg, int end) { double f0 = *f, f3[3], err; int i; -// fprintf(pysam_stdout, "em %lg\n", *f); +// fprintf(bcftools_stdout, "em %lg\n", *f); f3[0] = (1.-f0)*(1.-f0); f3[1] = 2.*f0*(1.-f0); f3[2] = f0*f0; for (i = beg, f0 = 0.; i < end; ++i) { const double *pdg = _pdg + i * 3; @@ -128,7 +128,7 @@ static double g3_iter(double g[3], const double *_pdg, int beg, int end) double err, gg[3]; int i; gg[0] = gg[1] = gg[2] = 0.; -// fprintf(pysam_stdout, "%lg,%lg,%lg\n", g[0], g[1], g[2]); +// fprintf(bcftools_stdout, "%lg,%lg,%lg\n", g[0], g[1], g[2]); for (i = beg; i < end; ++i) { double sum, tmp[3]; const double *pdg = _pdg + i * 3; @@ -237,7 +237,7 @@ static int pair_freq_iter(int n, double *pdg[2], double f[4]) { double ff[4]; int i, k, h; -// fprintf(pysam_stdout, "%lf,%lf,%lf,%lf\n", f[0], f[1], f[2], f[3]); +// fprintf(bcftools_stdout, "%lf,%lf,%lf,%lf\n", f[0], f[1], f[2], f[3]); memset(ff, 0, 4 * sizeof(double)); for (i = 0; i < n; ++i) { double *p[2], sum, tmp; diff --git a/bcftools/filter.c.pysam.c b/bcftools/filter.c.pysam.c index 81f8734..0beb592 100644 --- a/bcftools/filter.c.pysam.c +++ b/bcftools/filter.c.pysam.c @@ -1,4 +1,4 @@ -#include "pysam.h" +#include "bcftools.pysam.h" /* filter.c -- filter expressions. 
@@ -403,7 +403,7 @@ static int bcf_get_info_value(bcf1_t *line, int info_id, int ivec, void *value) case BCF_BT_INT16: BRANCH(int16_t, p[j]==bcf_int16_missing, p[j]==bcf_int16_vector_end, int64_t); break; case BCF_BT_INT32: BRANCH(int32_t, p[j]==bcf_int32_missing, p[j]==bcf_int32_vector_end, int64_t); break; case BCF_BT_FLOAT: BRANCH(float, bcf_float_is_missing(p[j]), bcf_float_is_vector_end(p[j]), double); break; - default: fprintf(pysam_stderr,"todo: type %d\n", info->type); exit(1); break; + default: fprintf(bcftools_stderr,"todo: type %d\n", info->type); exit(1); break; } #undef BRANCH return -1; // this shouldn't happen @@ -1320,7 +1320,7 @@ static int vector_logic_or(token_t *atok, token_t *btok, int or_type) { \ if ( (atok)->values[0] CMP_OP (btok)->values[0] ) { pass_site = 1; } \ } \ - /*fprintf(pysam_stderr,"pass=%d\n", pass_site);*/ \ + /*fprintf(bcftools_stderr,"pass=%d\n", pass_site);*/ \ (ret) = pass_site; \ } \ } @@ -1757,16 +1757,16 @@ static void filter_debug_print(token_t *toks, token_t **tok_ptrs, int ntoks) if ( tok->tok_type==TOK_VAL ) { if ( tok->key ) - fprintf(pysam_stderr,"%s", tok->key); + fprintf(bcftools_stderr,"%s", tok->key); else if ( tok->tag ) - fprintf(pysam_stderr,"%s", tok->tag); + fprintf(bcftools_stderr,"%s", tok->tag); else - fprintf(pysam_stderr,"%e", tok->threshold); + fprintf(bcftools_stderr,"%e", tok->threshold); } else - fprintf(pysam_stderr,"%c", TOKEN_STRING[tok->tok_type]); - if ( tok->setter ) fprintf(pysam_stderr,"\t[setter %p]", tok->setter); - fprintf(pysam_stderr,"\n"); + fprintf(bcftools_stderr,"%c", TOKEN_STRING[tok->tok_type]); + if ( tok->setter ) fprintf(bcftools_stderr,"\t[setter %p]", tok->setter); + fprintf(bcftools_stderr,"\n"); } } @@ -1795,8 +1795,8 @@ filter_t *filter_init(bcf_hdr_t *hdr, const char *str) ret = filters_next_token(&tmp, &len); if ( ret==-1 ) error("Missing quotes in: %s\n", str); - // fprintf(pysam_stderr,"token=[%c] .. 
[%s] %d\n", TOKEN_STRING[ret], tmp, len); - // int i; for (i=0; inrmme,clust->nclust); + fprintf(bcftools_stderr,"nrmme=%d nclust=%d\n", clust->nrmme,clust->nclust); for (i=0; inrmme; i++) { node_t *node = clust->rmme[i]; @@ -97,7 +97,7 @@ void hclust_debug(hclust_t *clust) int bkid = node->bkid ? node->bkid->id : -1; int akidx = node->akid ? node->akid->idx : -1; int bkidx = node->bkid ? node->bkid->idx : -1; - fprintf(pysam_stderr,"\t%d\t%d\t%f\t%d %d\t%d %d\n",node->id,node->idx,node->value,akid,bkid,akidx,bkidx); + fprintf(bcftools_stderr,"\t%d\t%d\t%f\t%d %d\t%d %d\n",node->id,node->idx,node->value,akid,bkid,akidx,bkidx); } int j; @@ -110,17 +110,17 @@ void hclust_debug(hclust_t *clust) if ( node->idx==i ) { active=1; break; } node = node->next; } - fprintf(pysam_stderr,"%2d%c ",i,active?'*':' '); + fprintf(bcftools_stderr,"%2d%c ",i,active?'*':' '); for (j=0; jpdist,i,j)==9 ) - fprintf(pysam_stderr," ----- "); + fprintf(bcftools_stderr," ----- "); else - fprintf(pysam_stderr," %f", PDIST(clust->pdist,i,j)); + fprintf(bcftools_stderr," %f", PDIST(clust->pdist,i,j)); } - fprintf(pysam_stderr,"\n"); + fprintf(bcftools_stderr,"\n"); } - for (j=0; jndat-1; j++) fprintf(pysam_stderr," %6d ",j); fprintf(pysam_stderr,"\n"); + for (j=0; jndat-1; j++) fprintf(bcftools_stderr," %6d ",j); fprintf(bcftools_stderr,"\n"); } #endif diff --git a/bcftools/kmin.c.pysam.c b/bcftools/kmin.c.pysam.c index ee7b512..f0ccb98 100644 --- a/bcftools/kmin.c.pysam.c +++ b/bcftools/kmin.c.pysam.c @@ -1,4 +1,4 @@ -#include "pysam.h" +#include "bcftools.pysam.h" /* The MIT License diff --git a/bcftools/main.c.pysam.c b/bcftools/main.c.pysam.c index 9d81ba1..019adc0 100644 --- a/bcftools/main.c.pysam.c +++ b/bcftools/main.c.pysam.c @@ -1,4 +1,4 @@ -#include "pysam.h" +#include "bcftools.pysam.h" /* main.c -- main bcftools command front-end. 
@@ -49,7 +49,7 @@ int main_vcfcall(int argc, char *argv[]); int main_vcfannotate(int argc, char *argv[]); int main_vcfroh(int argc, char *argv[]); int main_vcfconcat(int argc, char *argv[]); -int main_reheader(int argc, char *argv[]); +int main_bcftools_reheader(int argc, char *argv[]); int main_vcfconvert(int argc, char *argv[]); int main_vcfcnv(int argc, char *argv[]); #if USE_GPL @@ -125,7 +125,7 @@ static cmd_t cmds[] = .alias = "query", .help = "transform VCF/BCF into user-defined formats" }, - { .func = main_reheader, + { .func = main_bcftools_reheader, .alias = "reheader", .help = "modify VCF/BCF header, change sample names" }, @@ -239,24 +239,24 @@ static void usage(FILE *fp) int bcftools_main(int argc, char *argv[]) { - if (argc < 2) { usage(pysam_stderr); return 1; } + if (argc < 2) { usage(bcftools_stderr); return 1; } if (strcmp(argv[1], "version") == 0 || strcmp(argv[1], "--version") == 0 || strcmp(argv[1], "-v") == 0) { - fprintf(pysam_stdout, "bcftools %s\nUsing htslib %s\nCopyright (C) 2016 Genome Research Ltd.\n", bcftools_version(), hts_version()); + fprintf(bcftools_stdout, "bcftools %s\nUsing htslib %s\nCopyright (C) 2016 Genome Research Ltd.\n", bcftools_version(), hts_version()); #if USE_GPL - fprintf(pysam_stdout, "License GPLv3+: GNU GPL version 3 or later \n"); + fprintf(bcftools_stdout, "License GPLv3+: GNU GPL version 3 or later \n"); #else - fprintf(pysam_stdout, "License Expat: The MIT/Expat license\n"); + fprintf(bcftools_stdout, "License Expat: The MIT/Expat license\n"); #endif - fprintf(pysam_stdout, "This is free software: you are free to change and redistribute it.\nThere is NO WARRANTY, to the extent permitted by law.\n"); + fprintf(bcftools_stdout, "This is free software: you are free to change and redistribute it.\nThere is NO WARRANTY, to the extent permitted by law.\n"); return 0; } else if (strcmp(argv[1], "--version-only") == 0) { - fprintf(pysam_stdout, "%s+htslib-%s\n", bcftools_version(), hts_version()); + 
fprintf(bcftools_stdout, "%s+htslib-%s\n", bcftools_version(), hts_version()); return 0; } else if (strcmp(argv[1], "help") == 0 || strcmp(argv[1], "--help") == 0 || strcmp(argv[1], "-h") == 0) { - if (argc == 2) { usage(pysam_stdout); return 0; } + if (argc == 2) { usage(bcftools_stdout); return 0; } // Otherwise change "bcftools help COMMAND [...]" to "bcftools COMMAND"; // main_xyz() functions by convention display the subcommand's usage // when invoked without any arguments. @@ -281,7 +281,7 @@ int bcftools_main(int argc, char *argv[]) } i++; } - fprintf(pysam_stderr, "[E::%s] unrecognized command '%s'\n", __func__, argv[1]); + fprintf(bcftools_stderr, "[E::%s] unrecognized command '%s'\n", __func__, argv[1]); return 1; } diff --git a/bcftools/mcall.c.pysam.c b/bcftools/mcall.c.pysam.c index a315656..f05bed2 100644 --- a/bcftools/mcall.c.pysam.c +++ b/bcftools/mcall.c.pysam.c @@ -1,4 +1,4 @@ -#include "pysam.h" +#include "bcftools.pysam.h" /* mcall.c -- multiallelic and rare variant calling. @@ -298,7 +298,7 @@ void mcall_init(call_t *call) call->theta *= aM; if ( call->theta >= 1 ) { - fprintf(pysam_stderr,"The prior is too big (theta*aM=%.2f), going with 0.99\n", call->theta); + fprintf(bcftools_stderr,"The prior is too big (theta*aM=%.2f), going with 0.99\n", call->theta); call->theta = 0.99; } call->theta = log(call->theta); @@ -524,13 +524,13 @@ float calc_ICB(int nref, int nalt, int nhets, int ndiploid) double q = 2*fref*falt; // probability of a het, assuming HWE double mean = q*ndiploid; - //fprintf(pysam_stderr,"\np=%e N=%d k=%d .. nref=%d nalt=%d nhets=%d ndiploid=%d\n", q,ndiploid,nhets, nref,nalt,nhets,ndiploid); + //fprintf(bcftools_stderr,"\np=%e N=%d k=%d .. nref=%d nalt=%d nhets=%d ndiploid=%d\n", q,ndiploid,nhets, nref,nalt,nhets,ndiploid); // Can we use normal approximation? The second condition is for performance only // and is not well justified. 
if ( (mean>10 && (1-q)*ndiploid>10 ) || ndiploid>200 ) { - //fprintf(pysam_stderr,"out: mean=%e p=%e\n", mean,exp(-0.5*(nhets-mean)*(nhets-mean)/(mean*(1-q)))); + //fprintf(bcftools_stderr,"out: mean=%e p=%e\n", mean,exp(-0.5*(nhets-mean)*(nhets-mean)/(mean*(1-q)))); return exp(-0.5*(nhets-mean)*(nhets-mean)/(mean*(1-q))); } @@ -1045,12 +1045,12 @@ static void mcall_call_trio_genotypes(call_t *call, bcf1_t *rec, int nals, int n if ( igt==GT_SKIP ) continue; lk += gl[igt]; npresent++; - // fprintf(pysam_stderr," %e", gl[igt]); + // fprintf(bcftools_stderr," %e", gl[igt]); } - // fprintf(pysam_stderr,"\t\t"); + // fprintf(bcftools_stderr,"\t\t"); double Pkij = npresent==3 ? (double)2/(trio[itr]>>12) : 1; // with missing genotypes Pkij's are different lk += log(1 - trio_Pm * (1 - Pkij)); - // fprintf(pysam_stderr,"%d%d%d\t%e\t%.2f\n", trio[itr]>>8&0xf,trio[itr]>>4&0xf,trio[itr]&0xf, lk, Pkij); + // fprintf(bcftools_stderr,"%d%d%d\t%e\t%.2f\n", trio[itr]>>8&0xf,trio[itr]>>4&0xf,trio[itr]&0xf, lk, Pkij); if ( c_lk < lk ) { c_lk = lk; c_itr = trio[itr]; } if ( uc_itr==trio[itr] ) uc_is_mendelian = 1; } @@ -1058,10 +1058,10 @@ static void mcall_call_trio_genotypes(call_t *call, bcf1_t *rec, int nals, int n if ( !uc_is_mendelian ) { uc_lk += log(1 - trio_Pm); - // fprintf(pysam_stderr,"c_lk=%e uc_lk=%e c_itr=%d%d%d uc_itr=%d%d%d\n", c_lk,uc_lk,c_itr>>8&0xf,c_itr>>4&0xf,c_itr&0xf,uc_itr>>8&0xf,uc_itr>>4&0xf,uc_itr&0xf); + // fprintf(bcftools_stderr,"c_lk=%e uc_lk=%e c_itr=%d%d%d uc_itr=%d%d%d\n", c_lk,uc_lk,c_itr>>8&0xf,c_itr>>4&0xf,c_itr&0xf,uc_itr>>8&0xf,uc_itr>>4&0xf,uc_itr&0xf); if ( c_lk < uc_lk ) { c_lk = uc_lk; c_itr = uc_itr; } } - // fprintf(pysam_stderr,"best_lk=%e best_itr=%d%d%d uc_itr=%d%d%d\n", c_lk,c_itr>>8&0xf,c_itr>>4&0xf,c_itr&0xf,uc_itr>>8&0xf,uc_itr>>4&0xf,uc_itr&0xf); + // fprintf(bcftools_stderr,"best_lk=%e best_itr=%d%d%d uc_itr=%d%d%d\n", c_lk,c_itr>>8&0xf,c_itr>>4&0xf,c_itr&0xf,uc_itr>>8&0xf,uc_itr>>4&0xf,uc_itr&0xf); // Set genotypes for father, 
mother, child and calculate genotype qualities for (i=0; i<3; i++) @@ -1295,7 +1295,7 @@ static int mcall_constrain_alleles(call_t *call, bcf1_t *rec, int *unseen) call->als[nals] = tgt->als[i]; j = vcmp_find_allele(call->vcmp, rec->d.allele+1, rec->n_allele - 1, tgt->als[i]); - if ( j+1==*unseen ) { fprintf(pysam_stderr,"fixme? Cannot constrain to %s\n",tgt->als[i]); return -1; } + if ( j+1==*unseen ) { fprintf(bcftools_stderr,"fixme? Cannot constrain to %s\n",tgt->als[i]); return -1; } if ( j>=0 ) { @@ -1468,7 +1468,7 @@ int mcall(call_t *call, bcf1_t *rec) int out_als, nout; if ( nals > 8*sizeof(out_als) ) { - fprintf(pysam_stderr,"Too many alleles at %s:%d, skipping.\n", bcf_seqname(call->hdr,rec),rec->pos+1); + fprintf(bcftools_stderr,"Too many alleles at %s:%d, skipping.\n", bcf_seqname(call->hdr,rec),rec->pos+1); return 0; } nout = mcall_find_best_alleles(call, nals, &out_als); @@ -1512,7 +1512,7 @@ int mcall(call_t *call, bcf1_t *rec) { if ( nout>4 ) { - fprintf(pysam_stderr,"Too many alleles at %s:%d, skipping.\n", bcf_seqname(call->hdr,rec),rec->pos+1); + fprintf(bcftools_stderr,"Too many alleles at %s:%d, skipping.\n", bcf_seqname(call->hdr,rec),rec->pos+1); return 0; } mcall_call_trio_genotypes(call, rec, nals,nout,out_als); diff --git a/bcftools/mpileup.c.pysam.c b/bcftools/mpileup.c.pysam.c index 94286e9..47c684c 100644 --- a/bcftools/mpileup.c.pysam.c +++ b/bcftools/mpileup.c.pysam.c @@ -1,4 +1,4 @@ -#include "pysam.h" +#include "bcftools.pysam.h" /* mpileup.c -- mpileup subcommand. 
Previously bam_plcmd.c from samtools @@ -224,7 +224,7 @@ static int mplp_func(void *data, bam1_t *b) if (ma->conf->fai && b->core.tid >= 0) { has_ref = mplp_get_ref(ma, b->core.tid, &ref, &ref_len); if (has_ref && ref_len <= b->core.pos) { // exclude reads outside of the reference sequence - fprintf(pysam_stderr,"[%s] Skipping because %d is outside of %d [ref:%d]\n", + fprintf(bcftools_stderr,"[%s] Skipping because %d is outside of %d [ref:%d]\n", __func__, b->core.pos, ref_len, b->core.tid); continue; } @@ -357,7 +357,7 @@ static int mpileup_reg(mplp_conf_t *conf, uint32_t beg, uint32_t end) static int mpileup(mplp_conf_t *conf) { if (conf->nfiles == 0) { - fprintf(pysam_stderr,"[%s] no input file/data given\n", __func__); + fprintf(bcftools_stderr,"[%s] no input file/data given\n", __func__); exit(EXIT_FAILURE); } @@ -378,7 +378,7 @@ static int mpileup(mplp_conf_t *conf) { conf->reg = regidx_init(conf->reg_fname,NULL,NULL,0,NULL); if ( !conf->reg ) { - fprintf(pysam_stderr,"Could not parse the regions: %s\n", conf->reg_fname); + fprintf(bcftools_stderr,"Could not parse the regions: %s\n", conf->reg_fname); exit(EXIT_FAILURE); } } @@ -386,7 +386,7 @@ static int mpileup(mplp_conf_t *conf) { conf->reg = regidx_init(NULL,regidx_parse_reg,NULL,sizeof(char*),NULL); if ( regidx_insert_list(conf->reg,conf->reg_fname,',') !=0 ) { - fprintf(pysam_stderr,"Could not parse the regions: %s\n", conf->reg_fname); + fprintf(bcftools_stderr,"Could not parse the regions: %s\n", conf->reg_fname); exit(EXIT_FAILURE); } } @@ -405,15 +405,15 @@ static int mpileup(mplp_conf_t *conf) conf->mplp_data[i]->fp = sam_open(conf->files[i], "rb"); if ( !conf->mplp_data[i]->fp ) { - fprintf(pysam_stderr, "[%s] failed to open %s: %s\n", __func__, conf->files[i], strerror(errno)); + fprintf(bcftools_stderr, "[%s] failed to open %s: %s\n", __func__, conf->files[i], strerror(errno)); exit(EXIT_FAILURE); } if (hts_set_opt(conf->mplp_data[i]->fp, CRAM_OPT_DECODE_MD, 0)) { - fprintf(pysam_stderr, 
"Failed to set CRAM_OPT_DECODE_MD value\n"); + fprintf(bcftools_stderr, "Failed to set CRAM_OPT_DECODE_MD value\n"); exit(EXIT_FAILURE); } if (conf->fai_fname && hts_set_fai_filename(conf->mplp_data[i]->fp, conf->fai_fname) != 0) { - fprintf(pysam_stderr, "[%s] failed to process %s: %s\n", + fprintf(bcftools_stderr, "[%s] failed to process %s: %s\n", __func__, conf->fai_fname, strerror(errno)); exit(EXIT_FAILURE); } @@ -421,7 +421,7 @@ static int mpileup(mplp_conf_t *conf) conf->mplp_data[i]->ref = &mp_ref; h_tmp = sam_hdr_read(conf->mplp_data[i]->fp); if ( !h_tmp ) { - fprintf(pysam_stderr,"[%s] fail to read the header of %s\n", __func__, conf->files[i]); + fprintf(bcftools_stderr,"[%s] fail to read the header of %s\n", __func__, conf->files[i]); exit(EXIT_FAILURE); } conf->mplp_data[i]->h = i ? hdr : h_tmp; // for j==0, "h" has not been set yet @@ -441,7 +441,7 @@ static int mpileup(mplp_conf_t *conf) if (conf->reg) { hts_idx_t *idx = sam_index_load(conf->mplp_data[i]->fp, conf->files[i]); if (idx == NULL) { - fprintf(pysam_stderr, "[%s] fail to load index for %s\n", __func__, conf->files[i]); + fprintf(bcftools_stderr, "[%s] fail to load index for %s\n", __func__, conf->files[i]); exit(EXIT_FAILURE); } conf->buf.l = 0; @@ -451,10 +451,10 @@ static int mpileup(mplp_conf_t *conf) { conf->mplp_data[i]->iter = sam_itr_querys(idx, conf->mplp_data[i]->h, conf->reg_itr->seq); if ( conf->mplp_data[i]->iter ) { - fprintf(pysam_stderr,"[E::%s] fail to parse region '%s'\n", __func__, conf->buf.s); + fprintf(bcftools_stderr,"[E::%s] fail to parse region '%s'\n", __func__, conf->buf.s); exit(EXIT_FAILURE); } - fprintf(pysam_stderr,"[E::%s] the sequence \"%s\" not found: %s\n",__func__,conf->reg_itr->seq,conf->files[i]); + fprintf(bcftools_stderr,"[E::%s] the sequence \"%s\" not found: %s\n",__func__,conf->reg_itr->seq,conf->files[i]); exit(EXIT_FAILURE); } if ( nregs==1 ) // no need to keep the index in memory @@ -479,11 +479,11 @@ static int mpileup(mplp_conf_t *conf) 
conf->gplp->m_plp = (int*) calloc(conf->gplp->n, sizeof(int)); conf->gplp->plp = (bam_pileup1_t**) calloc(conf->gplp->n, sizeof(bam_pileup1_t*)); - fprintf(pysam_stderr, "[%s] %d samples in %d input files\n", __func__, conf->gplp->n, conf->nfiles); + fprintf(bcftools_stderr, "[%s] %d samples in %d input files\n", __func__, conf->gplp->n, conf->nfiles); // write the VCF header conf->bcf_fp = hts_open(conf->output_fname?conf->output_fname:"-", hts_bcf_wmode(conf->output_type)); if (conf->bcf_fp == NULL) { - fprintf(pysam_stderr, "[%s] failed to write to %s: %s\n", __func__, conf->output_fname? conf->output_fname : "standard output", strerror(errno)); + fprintf(bcftools_stderr, "[%s] failed to write to %s: %s\n", __func__, conf->output_fname? conf->output_fname : "standard output", strerror(errno)); exit(EXIT_FAILURE); } if ( conf->n_threads ) hts_set_threads(conf->bcf_fp, conf->n_threads); @@ -607,9 +607,9 @@ static int mpileup(mplp_conf_t *conf) conf->iter = bam_mplp_init(conf->nfiles, mplp_func, (void**)conf->mplp_data); if ( conf->flag & MPLP_SMART_OVERLAPS ) bam_mplp_init_overlaps(conf->iter); if ( (double)conf->max_depth * conf->nfiles > 1<<20) - fprintf(pysam_stderr, "Warning: Potential memory hog, up to %.0fM reads in the pileup!\n", (double)conf->max_depth*conf->nfiles); + fprintf(bcftools_stderr, "Warning: Potential memory hog, up to %.0fM reads in the pileup!\n", (double)conf->max_depth*conf->nfiles); if ( (double)conf->max_depth * conf->nfiles / nsmpl < 250 ) - fprintf(pysam_stderr, "Note: The maximum per-sample depth with -d %d is %.1fx\n", conf->max_depth,(double)conf->max_depth * conf->nfiles / nsmpl); + fprintf(bcftools_stderr, "Note: The maximum per-sample depth with -d %d is %.1fx\n", conf->max_depth,(double)conf->max_depth * conf->nfiles / nsmpl); bam_mplp_set_maxcnt(conf->iter, conf->max_depth); conf->max_indel_depth = conf->max_indel_depth * nsmpl; conf->bcf_rec = bcf_init1(); @@ -635,10 +635,10 @@ static int mpileup(mplp_conf_t *conf) { 
conf->mplp_data[i]->iter = sam_itr_querys(conf->mplp_data[i]->idx, conf->mplp_data[i]->h, conf->reg_itr->seq); if ( conf->mplp_data[i]->iter ) { - fprintf(pysam_stderr,"[E::%s] fail to parse region '%s'\n", __func__, conf->buf.s); + fprintf(bcftools_stderr,"[E::%s] fail to parse region '%s'\n", __func__, conf->buf.s); exit(EXIT_FAILURE); } - fprintf(pysam_stderr,"[E::%s] the sequence \"%s\" not found: %s\n",__func__,conf->reg_itr->seq,conf->files[i]); + fprintf(bcftools_stderr,"[E::%s] the sequence \"%s\" not found: %s\n",__func__,conf->reg_itr->seq,conf->files[i]); exit(EXIT_FAILURE); } bam_mplp_reset(conf->iter); @@ -708,7 +708,7 @@ int read_file_list(const char *file_list,int *n,char **argv[]) FILE *fh = fopen(file_list,"r"); if ( !fh ) { - fprintf(pysam_stderr,"%s: %s\n", file_list,strerror(errno)); + fprintf(bcftools_stderr,"%s: %s\n", file_list,strerror(errno)); return 1; } @@ -730,9 +730,9 @@ int read_file_list(const char *file_list,int *n,char **argv[]) for (i=0; i= b->n) return -1; if (b->M != b->n * 2) { - fprintf(pysam_stderr, "[%s] unable to set `n1' when there are haploid samples.\n", __func__); + fprintf(bcftools_stderr, "[%s] unable to set `n1' when there are haploid samples.\n", __func__); return -1; } b->n1 = n1; @@ -527,9 +527,9 @@ int bcf_p1_cal(call_t *call, bcf1_t *b, int do_contrast, bcf_p1aux_t *ma, bcf_p1 void bcf_p1_dump_afs(bcf_p1aux_t *ma) { int k; - fprintf(pysam_stderr, "[afs]"); + fprintf(bcftools_stderr, "[afs]"); for (k = 0; k <= ma->M; ++k) - fprintf(pysam_stderr, " %d:%.3lf", k, ma->afs[ma->M - k]); - fprintf(pysam_stderr, "\n"); + fprintf(bcftools_stderr, " %d:%.3lf", k, ma->afs[ma->M - k]); + fprintf(bcftools_stderr, "\n"); memset(ma->afs, 0, sizeof(double) * (ma->M + 1)); } diff --git a/bcftools/pysam.h b/bcftools/pysam.h deleted file mode 100644 index b0fc4fb..0000000 --- a/bcftools/pysam.h +++ /dev/null @@ -1,7 +0,0 @@ -#ifndef PYSAM_H -#define PYSAM_H -#include "stdio.h" -extern FILE * pysam_stderr; -extern FILE * 
pysam_stdout; -extern const char * pysam_stdout_fn; -#endif diff --git a/bcftools/regidx.c.pysam.c b/bcftools/regidx.c.pysam.c index 4d6dcda..62ef61f 100644 --- a/bcftools/regidx.c.pysam.c +++ b/bcftools/regidx.c.pysam.c @@ -1,4 +1,4 @@ -#include "pysam.h" +#include "bcftools.pysam.h" /* Copyright (C) 2014-2016 Genome Research Ltd. @@ -419,11 +419,11 @@ int regidx_parse_bed(const char *line, char **chr_beg, char **chr_end, uint32_t ss = se+1; *beg = strtod(ss, &se); - if ( ss==se ) { fprintf(pysam_stderr,"Could not parse bed line: %s\n", line); return -2; } + if ( ss==se ) { fprintf(bcftools_stderr,"Could not parse bed line: %s\n", line); return -2; } ss = se+1; *end = strtod(ss, &se) - 1; - if ( ss==se ) { fprintf(pysam_stderr,"Could not parse bed line: %s\n", line); return -2; } + if ( ss==se ) { fprintf(bcftools_stderr,"Could not parse bed line: %s\n", line); return -2; } return 0; } @@ -451,8 +451,8 @@ int regidx_parse_tab(const char *line, char **chr_beg, char **chr_end, uint32_t ss = se+1; *beg = strtod(ss, &se); - if ( ss==se ) { fprintf(pysam_stderr,"Could not parse tab line: %s\n", line); return -2; } - if ( *beg==0 ) { fprintf(pysam_stderr,"Could not parse tab line, expected 1-based coordinate: %s\n", line); return -2; } + if ( ss==se ) { fprintf(bcftools_stderr,"Could not parse tab line: %s\n", line); return -2; } + if ( *beg==0 ) { fprintf(bcftools_stderr,"Could not parse tab line, expected 1-based coordinate: %s\n", line); return -2; } (*beg)--; if ( !se[0] || !se[1] ) @@ -462,7 +462,7 @@ int regidx_parse_tab(const char *line, char **chr_beg, char **chr_end, uint32_t ss = se+1; *end = strtod(ss, &se); if ( ss==se ) *end = *beg; - else if ( *end==0 ) { fprintf(pysam_stderr,"Could not parse tab line, expected 1-based coordinate: %s\n", line); return -2; } + else if ( *end==0 ) { fprintf(bcftools_stderr,"Could not parse tab line, expected 1-based coordinate: %s\n", line); return -2; } else (*end)--; } return 0; @@ -490,8 +490,8 @@ int 
regidx_parse_reg(const char *line, char **chr_beg, char **chr_end, uint32_t ss = se+1; *beg = strtod(ss, &se); - if ( ss==se ) { fprintf(pysam_stderr,"Could not parse reg line: %s\n", line); return -2; } - if ( *beg==0 ) { fprintf(pysam_stderr,"Could not parse reg line, expected 1-based coordinate: %s\n", line); return -2; } + if ( ss==se ) { fprintf(bcftools_stderr,"Could not parse reg line: %s\n", line); return -2; } + if ( *beg==0 ) { fprintf(bcftools_stderr,"Could not parse reg line, expected 1-based coordinate: %s\n", line); return -2; } (*beg)--; if ( !se[0] || !se[1] ) @@ -501,7 +501,7 @@ int regidx_parse_reg(const char *line, char **chr_beg, char **chr_end, uint32_t ss = se+1; *end = strtod(ss, &se); if ( ss==se ) *end = *beg; - else if ( *end==0 ) { fprintf(pysam_stderr,"Could not parse reg line, expected 1-based coordinate: %s\n", line); return -2; } + else if ( *end==0 ) { fprintf(bcftools_stderr,"Could not parse reg line, expected 1-based coordinate: %s\n", line); return -2; } else (*end)--; } return 0; diff --git a/bcftools/reheader.c b/bcftools/reheader.c new file mode 100644 index 0000000..30a441c --- /dev/null +++ b/bcftools/reheader.c @@ -0,0 +1,521 @@ +/* reheader.c -- reheader subcommand. + + Copyright (C) 2014,2016 Genome Research Ltd. + + Author: Petr Danecek + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include // for hts_get_bgzfp() +#include +#include "bcftools.h" +#include "khash_str2str.h" + +typedef struct _args_t +{ + char **argv, *fname, *samples_fname, *header_fname, *output_fname; + htsFile *fp; + htsFormat type; + int argc; +} +args_t; + +static void read_header_file(char *fname, kstring_t *hdr) +{ + kstring_t tmp = {0,0,0}; + hdr->l = 0; + + htsFile *fp = hts_open(fname, "r"); + if ( !fp ) error("Could not read: %s\n", fname); + while ( hts_getline(fp, KS_SEP_LINE, &tmp) > 0 ) + { + kputsn(tmp.s,tmp.l,hdr); + kputc('\n',hdr); + } + if ( hts_close(fp) ) error("Close failed: %s\n", fname); + free(tmp.s); + + while ( hdr->l>0 && isspace(hdr->s[hdr->l-1]) ) hdr->l--; // remove trailing newlines + kputc('\n',hdr); +} + +static int set_sample_pairs(char **samples, int nsamples, kstring_t *hdr, int idx) +{ + int i, j, n; + kstring_t key = {0,0,0}; + kstring_t val = {0,0,0}; + + // Are these samples "old-name new-name" pairs? + void *hash = khash_str2str_init(); + for (i=0; il>0 && isspace(hdr->s[hdr->l-1]) ) hdr->l--; // remove trailing newlines + hdr->s[hdr->l] = 0; + + kstring_t tmp = {0,0,0}; + i = j = n = 0; + while ( hdr->s[idx+i] && hdr->s[idx+i]) + { + if ( hdr->s[idx+i]=='\t' ) + { + hdr->s[idx+i] = 0; + + if ( ++n>9 ) + { + char *ori = khash_str2str_get(hash,hdr->s+idx+j); + kputs(ori ? 
ori : hdr->s+idx+j, &tmp); + } + else + kputs(hdr->s+idx+j, &tmp); + + kputc('\t',&tmp); + + j = ++i; + continue; + } + i++; + } + char *ori = khash_str2str_get(hash,hdr->s+idx+j); + kputs(ori ? ori : hdr->s+idx+j, &tmp); + + khash_str2str_destroy_free_all(hash); + + hdr->l = idx; + kputs(tmp.s, hdr); + kputc('\n', hdr); + free(tmp.s); + + return 1; +} + +static void set_samples(char **samples, int nsamples, kstring_t *hdr) +{ + // Find the beginning of the #CHROM line + int i = hdr->l - 2, ncols = 0; + while ( i>=0 && hdr->s[i]!='\n' ) + { + if ( hdr->s[i]=='\t' ) ncols++; + i--; + } + if ( i<0 || strncmp(hdr->s+i+1,"#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT",45) ) error("Could not parse the header: %s\n", hdr->s); + + // Are the samples "old-sample new-sample" pairs? + if ( set_sample_pairs(samples,nsamples,hdr, i+1) ) return; + + // Replace all samples + if ( ncols!=nsamples+8 ) + fprintf(stderr, "Warning: different number of samples: %d vs %d\n", nsamples,ncols-8); + + ncols = 0; + while ( ncols!=9 ) + { + i++; + if ( hdr->s[i]=='\t' ) ncols++; + } + hdr->l = i; + + for (i=0; ifp); + if ( !fp || bgzf_read_block(fp) != 0 || !fp->block_length ) + error("Failed to read %s: %s\n", args->fname, strerror(errno)); + + kstring_t hdr = {0,0,0}; + char *buffer = (char*) fp->uncompressed_block; + + // Read the header and find the position of the data block + if ( buffer[0]!='#' ) error("Could not parse the header, expected '#', found '%c'\n", buffer[0]); + + int skip_until = 1; // end of the header in the current uncompressed block + while (1) + { + if ( buffer[skip_until]=='\n' ) + { + skip_until++; + if ( skip_until>=fp->block_length ) + { + kputsn(buffer,skip_until,&hdr); + if ( bgzf_read_block(fp) != 0 ) error("Error reading %s\n", args->fname); + if ( !fp->block_length ) break; + skip_until = 0; + } + // The header has finished + if ( buffer[skip_until]!='#' ) + { + kputsn(buffer,skip_until,&hdr); + break; + } + } + skip_until++; + if ( 
skip_until>=fp->block_length ) + { + kputsn(buffer,fp->block_length,&hdr); + if ( bgzf_read_block(fp) != 0 ) error("Error reading %s\n", args->fname); + if ( !fp->block_length ) break; + skip_until = 0; + } + } + + int nsamples = 0; + char **samples = NULL; + if ( args->samples_fname ) + samples = hts_readlines(args->samples_fname, &nsamples); + if ( args->header_fname ) + { + free(hdr.s); hdr.s = NULL; hdr.l = hdr.m = 0; + read_header_file(args->header_fname, &hdr); + } + if ( samples ) + { + set_samples(samples, nsamples, &hdr); + int i; + for (i=0; ioutput_fname ? args->output_fname : "-","w");; + if ( bgzf_write(bgzf_out, hdr.s, hdr.l) < 0 ) error("Can't write BGZF header (code %d)\n", bgzf_out->errcode); + free(hdr.s); + + // Output all remainig data read with the header block + if ( fp->block_length - skip_until > 0 ) + { + if ( bgzf_write(bgzf_out, buffer+skip_until, fp->block_length-skip_until)<0 ) error("Error: %d\n",fp->errcode); + } + if ( bgzf_flush(bgzf_out)<0 ) error("Error: %d\n",bgzf_out->errcode); + + // Stream the rest of the file without as it is, without decompressing + ssize_t nread; + const size_t page_size = 32768; + char *buf = (char*) malloc(page_size); + while (1) + { + nread = bgzf_raw_read(fp, buf, page_size); + if ( nread<=0 ) break; + + int count = bgzf_raw_write(bgzf_out, buf, nread); + if (count != nread) error("Write failed, wrote %d instead of %d bytes.\n", count,(int)nread); + } + if (bgzf_close(bgzf_out) < 0) error("Error closing %s: %d\n",args->output_fname ? 
args->output_fname : "-",bgzf_out->errcode); + if (hts_close(args->fp)) error("Error closing %s: %d\n",args->fname,fp->errcode); + free(buf); +} +static void reheader_vcf(args_t *args) +{ + kstring_t hdr = {0,0,0}; + htsFile *fp = args->fp; + while ( hts_getline(fp, KS_SEP_LINE, &fp->line) >=0 ) + { + kputc('\n',&fp->line); // hts_getline eats the newline character + if ( fp->line.s[0]!='#' ) break; + kputsn(fp->line.s,fp->line.l,&hdr); + } + + int nsamples = 0; + char **samples = NULL; + if ( args->samples_fname ) + samples = hts_readlines(args->samples_fname, &nsamples); + if ( args->header_fname ) + { + free(hdr.s); hdr.s = NULL; hdr.l = hdr.m = 0; + read_header_file(args->header_fname, &hdr); + } + if ( samples ) + { + set_samples(samples, nsamples, &hdr); + int i; + for (i=0; ioutput_fname ? open(args->output_fname, O_WRONLY|O_CREAT|O_TRUNC, 0666) : STDOUT_FILENO; + if ( out==-1 ) error("%s: %s\n", args->output_fname,strerror(errno)); + if ( write(out, hdr.s, hdr.l)!=hdr.l ) error("Failed to write %d bytes\n", hdr.l); + free(hdr.s); + if ( fp->line.l ) + { + if ( write(out, fp->line.s, fp->line.l)!=fp->line.l ) error("Failed to write %d bytes\n", fp->line.l); + } + while ( hts_getline(fp, KS_SEP_LINE, &fp->line) >=0 ) // uncompressed file implies small size, we don't worry about speed + { + kputc('\n',&fp->line); + if ( write(out, fp->line.s, fp->line.l)!=fp->line.l ) error("Failed to write %d bytes\n", fp->line.l); + } + hts_close(fp); + close(out); +} + +static bcf_hdr_t *strip_header(bcf_hdr_t *src, bcf_hdr_t *dst) +{ + bcf_hrec_t *src_hrec, *dst_hrec, *tmp; + bcf_hdr_t *out = bcf_hdr_init("r"); + int i; + for (i=0; inhrec; i++) + { + // first insert lines which do not code BCF ids, their order does not matter + dst_hrec = dst->hrec[i]; + if ( dst_hrec->type==BCF_HL_FLT || dst_hrec->type==BCF_HL_INFO || dst_hrec->type==BCF_HL_FMT || dst_hrec->type== BCF_HL_CTG ) continue; + bcf_hdr_add_hrec(out, bcf_hrec_dup(dst_hrec)); + } + for (i=0; inhrec; i++) + { + // 
now transfer header lines which define BCF ids + src_hrec = src->hrec[i]; + + if ( src_hrec->type==BCF_HL_FLT || src_hrec->type==BCF_HL_INFO || src_hrec->type==BCF_HL_FMT || src_hrec->type== BCF_HL_CTG ) + { + int j = bcf_hrec_find_key(src_hrec, "ID"); + dst_hrec = bcf_hdr_get_hrec(dst, src_hrec->type, "ID", src_hrec->vals[j], NULL); + if ( !dst_hrec ) continue; + + tmp = bcf_hrec_dup(dst_hrec); + + j = bcf_hrec_find_key(src_hrec, "IDX"); + if ( j>=0 ) + { + j = atoi(src_hrec->vals[j]); + hrec_add_idx(tmp, j); + } + bcf_hdr_add_hrec(out, tmp); + } + } + bcf_hdr_sync(out); + for (i=0; inhrec; i++) + { + // finally add new structured fields + dst_hrec = dst->hrec[i]; + if ( dst_hrec->type==BCF_HL_FLT || dst_hrec->type==BCF_HL_INFO || dst_hrec->type==BCF_HL_FMT || dst_hrec->type== BCF_HL_CTG ) + { + int j = bcf_hrec_find_key(dst_hrec, "ID"); + tmp = bcf_hdr_get_hrec(out, dst_hrec->type, "ID", dst_hrec->vals[j], NULL); + if ( !tmp ) + bcf_hdr_add_hrec(out, bcf_hrec_dup(dst_hrec)); + } + } + for (i=0; in[BCF_DT_SAMPLE]; i++) bcf_hdr_add_sample(out, dst->samples[i]); + bcf_hdr_destroy(dst); + return out; +} + +static void reheader_bcf(args_t *args, int is_compressed) +{ + htsFile *fp = args->fp; + bcf_hdr_t *hdr = bcf_hdr_read(fp); if ( !hdr ) error("Failed to read the header: %s\n", args->fname); + kstring_t htxt = {0,0,0}; + bcf_hdr_format(hdr, 1, &htxt); + + int i, nsamples = 0; + char **samples = NULL; + if ( args->samples_fname ) + samples = hts_readlines(args->samples_fname, &nsamples); + if ( args->header_fname ) + { + free(htxt.s); htxt.s = NULL; htxt.l = htxt.m = 0; + read_header_file(args->header_fname, &htxt); + } + if ( samples ) + { + set_samples(samples, nsamples, &htxt); + for (i=0; iheader_fname ) hdr_out = strip_header(hdr, hdr_out); + + // write the header and the body + htsFile *fp_out = hts_open(args->output_fname ? args->output_fname : "-",is_compressed ? "wb" : "wbu"); + if ( !fp_out ) error("%s: %s\n", args->output_fname ? 
args->output_fname : "-", strerror(errno)); + bcf_hdr_write(fp_out, hdr_out); + + bcf1_t *rec = bcf_init(); + while ( bcf_read(fp, hdr, rec)==0 ) + { + // sanity checking, this slows things down. Make it optional? + bcf_unpack(rec, BCF_UN_ALL); + if ( rec->rid >= hdr_out->n[BCF_DT_CTG] || strcmp(bcf_hdr_int2id(hdr,BCF_DT_CTG,rec->rid),bcf_hdr_int2id(hdr_out,BCF_DT_CTG,rec->rid)) ) + error("The CHROM is not defined: \"%s\"\n", bcf_hdr_int2id(hdr,BCF_DT_CTG,rec->rid)); + + for (i=0; id.n_flt; i++) + { + int id = rec->d.flt[i]; + if ( id >= hdr_out->n[BCF_DT_ID] ) break; + if ( !bcf_hdr_idinfo_exists(hdr_out,BCF_HL_FLT,id) ) break; + if ( strcmp(hdr->id[BCF_DT_ID][id].key,hdr_out->id[BCF_DT_ID][id].key) ) + error("FIXME: Broken FILTER ids: %s vs %s\n", hdr->id[BCF_DT_ID][id].key,hdr_out->id[BCF_DT_ID][id].key); + } + if ( i!=rec->d.n_flt ) + error("The FILTER is not defined: \"%s\"\n", bcf_hdr_int2id(hdr,BCF_DT_ID,rec->d.flt[i])); + + for (i=0; in_info; i++) + { + int id = rec->d.info[i].key; + if ( id >= hdr_out->n[BCF_DT_ID] ) break; + if ( !hdr_out->id[BCF_DT_ID][id].key ) break; + if ( !bcf_hdr_idinfo_exists(hdr_out,BCF_HL_INFO,id) ) break; + if ( strcmp(hdr->id[BCF_DT_ID][id].key,hdr_out->id[BCF_DT_ID][id].key) ) + error("FIXME: Broken INFO ids: %s vs %s\n", hdr->id[BCF_DT_ID][id].key,hdr_out->id[BCF_DT_ID][id].key); + } + if ( i!=rec->n_info ) + error("The INFO tag is not defined: \"%s\"\n", bcf_hdr_int2id(hdr,BCF_DT_ID,rec->d.info[i].key)); + + for (i=0; in_fmt; i++) + { + int id = rec->d.fmt[i].id; + if ( id >= hdr_out->n[BCF_DT_ID] ) break; + if ( !hdr_out->id[BCF_DT_ID][id].key ) break; + if ( !bcf_hdr_idinfo_exists(hdr_out,BCF_HL_FMT,id) ) break; + if ( strcmp(hdr->id[BCF_DT_ID][id].key,hdr_out->id[BCF_DT_ID][id].key) ) + error("FIXME: Broken FORMAT ids: %s vs %s\n", hdr->id[BCF_DT_ID][id].key,hdr_out->id[BCF_DT_ID][id].key); + } + if ( i!=rec->n_fmt ) + error("The FORMAT tag is not defined: \"%s\"\n", bcf_hdr_int2id(hdr,BCF_DT_ID,rec->d.fmt[i].id)); + + 
bcf_write(fp_out,hdr_out,rec); + } + bcf_destroy(rec); + + free(htxt.s); + hts_close(fp_out); + hts_close(fp); + bcf_hdr_destroy(hdr_out); + bcf_hdr_destroy(hdr); +} + + +static void usage(args_t *args) +{ + fprintf(stderr, "\n"); + fprintf(stderr, "About: Modify header of VCF/BCF files, change sample names.\n"); + fprintf(stderr, "Usage: bcftools reheader [OPTIONS] \n"); + fprintf(stderr, "\n"); + fprintf(stderr, "Options:\n"); + fprintf(stderr, " -h, --header new header\n"); + fprintf(stderr, " -o, --output write output to a file [standard output]\n"); + fprintf(stderr, " -s, --samples new sample names\n"); + fprintf(stderr, "\n"); + exit(1); +} + +int main_bcftools_reheader(int argc, char *argv[]) +{ + int c; + args_t *args = (args_t*) calloc(1,sizeof(args_t)); + args->argc = argc; args->argv = argv; + + static struct option loptions[] = + { + {"output",1,0,'o'}, + {"header",1,0,'h'}, + {"samples",1,0,'s'}, + {0,0,0,0} + }; + while ((c = getopt_long(argc, argv, "s:h:o:",loptions,NULL)) >= 0) + { + switch (c) + { + case 'o': args->output_fname = optarg; break; + case 's': args->samples_fname = optarg; break; + case 'h': args->header_fname = optarg; break; + case '?': usage(args); + default: error("Unknown argument: %s\n", optarg); + } + } + + if ( optind>=argc ) + { + if ( !isatty(fileno((FILE *)stdin)) ) args->fname = "-"; // reading from stdin + else usage(args); + } + else args->fname = argv[optind]; + + if ( !args->samples_fname && !args->header_fname ) usage(args); + if ( !args->fname ) usage(args); + + args->fp = hts_open(args->fname,"r"); + if ( !args->fp ) error("Failed to open: %s\n", args->fname); + args->type = *hts_get_format(args->fp); + + if ( args->type.format==vcf ) + { + if ( args->type.compression==bgzf || args->type.compression==gzip ) + reheader_vcf_gz(args); + else + reheader_vcf(args); + } + else + reheader_bcf(args, args->type.compression==bgzf || args->type.compression==gzip); + + free(args); + return 0; +} diff --git 
a/bcftools/reheader.c.pysam.c b/bcftools/reheader.c.pysam.c new file mode 100644 index 0000000..803b483 --- /dev/null +++ b/bcftools/reheader.c.pysam.c @@ -0,0 +1,523 @@ +#include "bcftools.pysam.h" + +/* reheader.c -- reheader subcommand. + + Copyright (C) 2014,2016 Genome Research Ltd. + + Author: Petr Danecek + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include // for hts_get_bgzfp() +#include +#include "bcftools.h" +#include "khash_str2str.h" + +typedef struct _args_t +{ + char **argv, *fname, *samples_fname, *header_fname, *output_fname; + htsFile *fp; + htsFormat type; + int argc; +} +args_t; + +static void read_header_file(char *fname, kstring_t *hdr) +{ + kstring_t tmp = {0,0,0}; + hdr->l = 0; + + htsFile *fp = hts_open(fname, "r"); + if ( !fp ) error("Could not read: %s\n", fname); + while ( hts_getline(fp, KS_SEP_LINE, &tmp) > 0 ) + { + kputsn(tmp.s,tmp.l,hdr); + kputc('\n',hdr); + } + if ( hts_close(fp) ) error("Close failed: %s\n", fname); + free(tmp.s); + + while ( hdr->l>0 && isspace(hdr->s[hdr->l-1]) ) hdr->l--; // remove trailing newlines + kputc('\n',hdr); +} + +static int set_sample_pairs(char **samples, int nsamples, kstring_t *hdr, int idx) +{ + int i, j, n; + kstring_t key = {0,0,0}; + kstring_t val = {0,0,0}; + + // Are these samples "old-name new-name" pairs? + void *hash = khash_str2str_init(); + for (i=0; il>0 && isspace(hdr->s[hdr->l-1]) ) hdr->l--; // remove trailing newlines + hdr->s[hdr->l] = 0; + + kstring_t tmp = {0,0,0}; + i = j = n = 0; + while ( hdr->s[idx+i] && hdr->s[idx+i]) + { + if ( hdr->s[idx+i]=='\t' ) + { + hdr->s[idx+i] = 0; + + if ( ++n>9 ) + { + char *ori = khash_str2str_get(hash,hdr->s+idx+j); + kputs(ori ? ori : hdr->s+idx+j, &tmp); + } + else + kputs(hdr->s+idx+j, &tmp); + + kputc('\t',&tmp); + + j = ++i; + continue; + } + i++; + } + char *ori = khash_str2str_get(hash,hdr->s+idx+j); + kputs(ori ? 
ori : hdr->s+idx+j, &tmp); + + khash_str2str_destroy_free_all(hash); + + hdr->l = idx; + kputs(tmp.s, hdr); + kputc('\n', hdr); + free(tmp.s); + + return 1; +} + +static void set_samples(char **samples, int nsamples, kstring_t *hdr) +{ + // Find the beginning of the #CHROM line + int i = hdr->l - 2, ncols = 0; + while ( i>=0 && hdr->s[i]!='\n' ) + { + if ( hdr->s[i]=='\t' ) ncols++; + i--; + } + if ( i<0 || strncmp(hdr->s+i+1,"#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT",45) ) error("Could not parse the header: %s\n", hdr->s); + + // Are the samples "old-sample new-sample" pairs? + if ( set_sample_pairs(samples,nsamples,hdr, i+1) ) return; + + // Replace all samples + if ( ncols!=nsamples+8 ) + fprintf(bcftools_stderr, "Warning: different number of samples: %d vs %d\n", nsamples,ncols-8); + + ncols = 0; + while ( ncols!=9 ) + { + i++; + if ( hdr->s[i]=='\t' ) ncols++; + } + hdr->l = i; + + for (i=0; ifp); + if ( !fp || bgzf_read_block(fp) != 0 || !fp->block_length ) + error("Failed to read %s: %s\n", args->fname, strerror(errno)); + + kstring_t hdr = {0,0,0}; + char *buffer = (char*) fp->uncompressed_block; + + // Read the header and find the position of the data block + if ( buffer[0]!='#' ) error("Could not parse the header, expected '#', found '%c'\n", buffer[0]); + + int skip_until = 1; // end of the header in the current uncompressed block + while (1) + { + if ( buffer[skip_until]=='\n' ) + { + skip_until++; + if ( skip_until>=fp->block_length ) + { + kputsn(buffer,skip_until,&hdr); + if ( bgzf_read_block(fp) != 0 ) error("Error reading %s\n", args->fname); + if ( !fp->block_length ) break; + skip_until = 0; + } + // The header has finished + if ( buffer[skip_until]!='#' ) + { + kputsn(buffer,skip_until,&hdr); + break; + } + } + skip_until++; + if ( skip_until>=fp->block_length ) + { + kputsn(buffer,fp->block_length,&hdr); + if ( bgzf_read_block(fp) != 0 ) error("Error reading %s\n", args->fname); + if ( !fp->block_length ) break; + skip_until = 0; + 
} + } + + int nsamples = 0; + char **samples = NULL; + if ( args->samples_fname ) + samples = hts_readlines(args->samples_fname, &nsamples); + if ( args->header_fname ) + { + free(hdr.s); hdr.s = NULL; hdr.l = hdr.m = 0; + read_header_file(args->header_fname, &hdr); + } + if ( samples ) + { + set_samples(samples, nsamples, &hdr); + int i; + for (i=0; ioutput_fname ? args->output_fname : "-","w");; + if ( bgzf_write(bgzf_out, hdr.s, hdr.l) < 0 ) error("Can't write BGZF header (code %d)\n", bgzf_out->errcode); + free(hdr.s); + + // Output all remainig data read with the header block + if ( fp->block_length - skip_until > 0 ) + { + if ( bgzf_write(bgzf_out, buffer+skip_until, fp->block_length-skip_until)<0 ) error("Error: %d\n",fp->errcode); + } + if ( bgzf_flush(bgzf_out)<0 ) error("Error: %d\n",bgzf_out->errcode); + + // Stream the rest of the file without as it is, without decompressing + ssize_t nread; + const size_t page_size = 32768; + char *buf = (char*) malloc(page_size); + while (1) + { + nread = bgzf_raw_read(fp, buf, page_size); + if ( nread<=0 ) break; + + int count = bgzf_raw_write(bgzf_out, buf, nread); + if (count != nread) error("Write failed, wrote %d instead of %d bytes.\n", count,(int)nread); + } + if (bgzf_close(bgzf_out) < 0) error("Error closing %s: %d\n",args->output_fname ? 
args->output_fname : "-",bgzf_out->errcode); + if (hts_close(args->fp)) error("Error closing %s: %d\n",args->fname,fp->errcode); + free(buf); +} +static void reheader_vcf(args_t *args) +{ + kstring_t hdr = {0,0,0}; + htsFile *fp = args->fp; + while ( hts_getline(fp, KS_SEP_LINE, &fp->line) >=0 ) + { + kputc('\n',&fp->line); // hts_getline eats the newline character + if ( fp->line.s[0]!='#' ) break; + kputsn(fp->line.s,fp->line.l,&hdr); + } + + int nsamples = 0; + char **samples = NULL; + if ( args->samples_fname ) + samples = hts_readlines(args->samples_fname, &nsamples); + if ( args->header_fname ) + { + free(hdr.s); hdr.s = NULL; hdr.l = hdr.m = 0; + read_header_file(args->header_fname, &hdr); + } + if ( samples ) + { + set_samples(samples, nsamples, &hdr); + int i; + for (i=0; ioutput_fname ? open(args->output_fname, O_WRONLY|O_CREAT|O_TRUNC, 0666) : STDOUT_FILENO; + if ( out==-1 ) error("%s: %s\n", args->output_fname,strerror(errno)); + if ( write(out, hdr.s, hdr.l)!=hdr.l ) error("Failed to write %d bytes\n", hdr.l); + free(hdr.s); + if ( fp->line.l ) + { + if ( write(out, fp->line.s, fp->line.l)!=fp->line.l ) error("Failed to write %d bytes\n", fp->line.l); + } + while ( hts_getline(fp, KS_SEP_LINE, &fp->line) >=0 ) // uncompressed file implies small size, we don't worry about speed + { + kputc('\n',&fp->line); + if ( write(out, fp->line.s, fp->line.l)!=fp->line.l ) error("Failed to write %d bytes\n", fp->line.l); + } + hts_close(fp); + close(out); +} + +static bcf_hdr_t *strip_header(bcf_hdr_t *src, bcf_hdr_t *dst) +{ + bcf_hrec_t *src_hrec, *dst_hrec, *tmp; + bcf_hdr_t *out = bcf_hdr_init("r"); + int i; + for (i=0; inhrec; i++) + { + // first insert lines which do not code BCF ids, their order does not matter + dst_hrec = dst->hrec[i]; + if ( dst_hrec->type==BCF_HL_FLT || dst_hrec->type==BCF_HL_INFO || dst_hrec->type==BCF_HL_FMT || dst_hrec->type== BCF_HL_CTG ) continue; + bcf_hdr_add_hrec(out, bcf_hrec_dup(dst_hrec)); + } + for (i=0; inhrec; i++) + { + // 
now transfer header lines which define BCF ids + src_hrec = src->hrec[i]; + + if ( src_hrec->type==BCF_HL_FLT || src_hrec->type==BCF_HL_INFO || src_hrec->type==BCF_HL_FMT || src_hrec->type== BCF_HL_CTG ) + { + int j = bcf_hrec_find_key(src_hrec, "ID"); + dst_hrec = bcf_hdr_get_hrec(dst, src_hrec->type, "ID", src_hrec->vals[j], NULL); + if ( !dst_hrec ) continue; + + tmp = bcf_hrec_dup(dst_hrec); + + j = bcf_hrec_find_key(src_hrec, "IDX"); + if ( j>=0 ) + { + j = atoi(src_hrec->vals[j]); + hrec_add_idx(tmp, j); + } + bcf_hdr_add_hrec(out, tmp); + } + } + bcf_hdr_sync(out); + for (i=0; inhrec; i++) + { + // finally add new structured fields + dst_hrec = dst->hrec[i]; + if ( dst_hrec->type==BCF_HL_FLT || dst_hrec->type==BCF_HL_INFO || dst_hrec->type==BCF_HL_FMT || dst_hrec->type== BCF_HL_CTG ) + { + int j = bcf_hrec_find_key(dst_hrec, "ID"); + tmp = bcf_hdr_get_hrec(out, dst_hrec->type, "ID", dst_hrec->vals[j], NULL); + if ( !tmp ) + bcf_hdr_add_hrec(out, bcf_hrec_dup(dst_hrec)); + } + } + for (i=0; in[BCF_DT_SAMPLE]; i++) bcf_hdr_add_sample(out, dst->samples[i]); + bcf_hdr_destroy(dst); + return out; +} + +static void reheader_bcf(args_t *args, int is_compressed) +{ + htsFile *fp = args->fp; + bcf_hdr_t *hdr = bcf_hdr_read(fp); if ( !hdr ) error("Failed to read the header: %s\n", args->fname); + kstring_t htxt = {0,0,0}; + bcf_hdr_format(hdr, 1, &htxt); + + int i, nsamples = 0; + char **samples = NULL; + if ( args->samples_fname ) + samples = hts_readlines(args->samples_fname, &nsamples); + if ( args->header_fname ) + { + free(htxt.s); htxt.s = NULL; htxt.l = htxt.m = 0; + read_header_file(args->header_fname, &htxt); + } + if ( samples ) + { + set_samples(samples, nsamples, &htxt); + for (i=0; iheader_fname ) hdr_out = strip_header(hdr, hdr_out); + + // write the header and the body + htsFile *fp_out = hts_open(args->output_fname ? args->output_fname : "-",is_compressed ? "wb" : "wbu"); + if ( !fp_out ) error("%s: %s\n", args->output_fname ? 
args->output_fname : "-", strerror(errno)); + bcf_hdr_write(fp_out, hdr_out); + + bcf1_t *rec = bcf_init(); + while ( bcf_read(fp, hdr, rec)==0 ) + { + // sanity checking, this slows things down. Make it optional? + bcf_unpack(rec, BCF_UN_ALL); + if ( rec->rid >= hdr_out->n[BCF_DT_CTG] || strcmp(bcf_hdr_int2id(hdr,BCF_DT_CTG,rec->rid),bcf_hdr_int2id(hdr_out,BCF_DT_CTG,rec->rid)) ) + error("The CHROM is not defined: \"%s\"\n", bcf_hdr_int2id(hdr,BCF_DT_CTG,rec->rid)); + + for (i=0; id.n_flt; i++) + { + int id = rec->d.flt[i]; + if ( id >= hdr_out->n[BCF_DT_ID] ) break; + if ( !bcf_hdr_idinfo_exists(hdr_out,BCF_HL_FLT,id) ) break; + if ( strcmp(hdr->id[BCF_DT_ID][id].key,hdr_out->id[BCF_DT_ID][id].key) ) + error("FIXME: Broken FILTER ids: %s vs %s\n", hdr->id[BCF_DT_ID][id].key,hdr_out->id[BCF_DT_ID][id].key); + } + if ( i!=rec->d.n_flt ) + error("The FILTER is not defined: \"%s\"\n", bcf_hdr_int2id(hdr,BCF_DT_ID,rec->d.flt[i])); + + for (i=0; in_info; i++) + { + int id = rec->d.info[i].key; + if ( id >= hdr_out->n[BCF_DT_ID] ) break; + if ( !hdr_out->id[BCF_DT_ID][id].key ) break; + if ( !bcf_hdr_idinfo_exists(hdr_out,BCF_HL_INFO,id) ) break; + if ( strcmp(hdr->id[BCF_DT_ID][id].key,hdr_out->id[BCF_DT_ID][id].key) ) + error("FIXME: Broken INFO ids: %s vs %s\n", hdr->id[BCF_DT_ID][id].key,hdr_out->id[BCF_DT_ID][id].key); + } + if ( i!=rec->n_info ) + error("The INFO tag is not defined: \"%s\"\n", bcf_hdr_int2id(hdr,BCF_DT_ID,rec->d.info[i].key)); + + for (i=0; in_fmt; i++) + { + int id = rec->d.fmt[i].id; + if ( id >= hdr_out->n[BCF_DT_ID] ) break; + if ( !hdr_out->id[BCF_DT_ID][id].key ) break; + if ( !bcf_hdr_idinfo_exists(hdr_out,BCF_HL_FMT,id) ) break; + if ( strcmp(hdr->id[BCF_DT_ID][id].key,hdr_out->id[BCF_DT_ID][id].key) ) + error("FIXME: Broken FORMAT ids: %s vs %s\n", hdr->id[BCF_DT_ID][id].key,hdr_out->id[BCF_DT_ID][id].key); + } + if ( i!=rec->n_fmt ) + error("The FORMAT tag is not defined: \"%s\"\n", bcf_hdr_int2id(hdr,BCF_DT_ID,rec->d.fmt[i].id)); + + 
bcf_write(fp_out,hdr_out,rec); + } + bcf_destroy(rec); + + free(htxt.s); + hts_close(fp_out); + hts_close(fp); + bcf_hdr_destroy(hdr_out); + bcf_hdr_destroy(hdr); +} + + +static void usage(args_t *args) +{ + fprintf(bcftools_stderr, "\n"); + fprintf(bcftools_stderr, "About: Modify header of VCF/BCF files, change sample names.\n"); + fprintf(bcftools_stderr, "Usage: bcftools reheader [OPTIONS] \n"); + fprintf(bcftools_stderr, "\n"); + fprintf(bcftools_stderr, "Options:\n"); + fprintf(bcftools_stderr, " -h, --header new header\n"); + fprintf(bcftools_stderr, " -o, --output write output to a file [standard output]\n"); + fprintf(bcftools_stderr, " -s, --samples new sample names\n"); + fprintf(bcftools_stderr, "\n"); + exit(1); +} + +int main_bcftools_reheader(int argc, char *argv[]) +{ + int c; + args_t *args = (args_t*) calloc(1,sizeof(args_t)); + args->argc = argc; args->argv = argv; + + static struct option loptions[] = + { + {"output",1,0,'o'}, + {"header",1,0,'h'}, + {"samples",1,0,'s'}, + {0,0,0,0} + }; + while ((c = getopt_long(argc, argv, "s:h:o:",loptions,NULL)) >= 0) + { + switch (c) + { + case 'o': args->output_fname = optarg; break; + case 's': args->samples_fname = optarg; break; + case 'h': args->header_fname = optarg; break; + case '?': usage(args); + default: error("Unknown argument: %s\n", optarg); + } + } + + if ( optind>=argc ) + { + if ( !isatty(fileno((FILE *)stdin)) ) args->fname = "-"; // reading from stdin + else usage(args); + } + else args->fname = argv[optind]; + + if ( !args->samples_fname && !args->header_fname ) usage(args); + if ( !args->fname ) usage(args); + + args->fp = hts_open(args->fname,"r"); + if ( !args->fp ) error("Failed to open: %s\n", args->fname); + args->type = *hts_get_format(args->fp); + + if ( args->type.format==vcf ) + { + if ( args->type.compression==bgzf || args->type.compression==gzip ) + reheader_vcf_gz(args); + else + reheader_vcf(args); + } + else + reheader_bcf(args, args->type.compression==bgzf || 
args->type.compression==gzip); + + free(args); + return 0; +} diff --git a/bcftools/smpl_ilist.c.pysam.c b/bcftools/smpl_ilist.c.pysam.c index f52b8ce..36f22a6 100644 --- a/bcftools/smpl_ilist.c.pysam.c +++ b/bcftools/smpl_ilist.c.pysam.c @@ -1,4 +1,4 @@ -#include "pysam.h" +#include "bcftools.pysam.h" /* Copyright (C) 2016 Genome Research Ltd. diff --git a/bcftools/tabix.c.pysam.c b/bcftools/tabix.c.pysam.c index b0c6e0e..ba9e1b3 100644 --- a/bcftools/tabix.c.pysam.c +++ b/bcftools/tabix.c.pysam.c @@ -1,4 +1,4 @@ -#include "pysam.h" +#include "bcftools.pysam.h" /* tabix.c -- tabix subcommand. @@ -52,25 +52,25 @@ int main_tabix(int argc, char *argv[]) else if (strcmp(optarg, "sam") == 0) conf = tbx_conf_sam; else if (strcmp(optarg, "vcf") == 0) conf = tbx_conf_vcf; else { - fprintf(pysam_stderr, "The type '%s' not recognised\n", optarg); + fprintf(bcftools_stderr, "The type '%s' not recognised\n", optarg); return 1; detect = 0; } } if (optind == argc) { - fprintf(pysam_stderr, "\nUsage: bcftools tabix [options] [reg1 [...]]\n\n"); - fprintf(pysam_stderr, "Options: -p STR preset: gff, bed, sam or vcf [gff]\n"); - fprintf(pysam_stderr, " -s INT column number for sequence names (suppressed by -p) [1]\n"); - fprintf(pysam_stderr, " -b INT column number for region start [4]\n"); - fprintf(pysam_stderr, " -e INT column number for region end (if no end, set INT to -b) [5]\n"); - fprintf(pysam_stderr, " -0 specify coordinates are zero-based\n"); - fprintf(pysam_stderr, " -S INT skip first INT lines [0]\n"); - fprintf(pysam_stderr, " -c CHAR skip lines starting with CHAR [null]\n"); - fprintf(pysam_stderr, " -a print all records\n"); - fprintf(pysam_stderr, " -f force to overwrite existing index\n"); - fprintf(pysam_stderr, " -m INT set the minimal interval size to 1< [reg1 [...]]\n\n"); + fprintf(bcftools_stderr, "Options: -p STR preset: gff, bed, sam or vcf [gff]\n"); + fprintf(bcftools_stderr, " -s INT column number for sequence names (suppressed by -p) [1]\n"); + 
fprintf(bcftools_stderr, " -b INT column number for region start [4]\n"); + fprintf(bcftools_stderr, " -e INT column number for region end (if no end, set INT to -b) [5]\n"); + fprintf(bcftools_stderr, " -0 specify coordinates are zero-based\n"); + fprintf(bcftools_stderr, " -S INT skip first INT lines [0]\n"); + fprintf(bcftools_stderr, " -c CHAR skip lines starting with CHAR [null]\n"); + fprintf(bcftools_stderr, " -a print all records\n"); + fprintf(bcftools_stderr, " -f force to overwrite existing index\n"); + fprintf(bcftools_stderr, " -m INT set the minimal interval size to 1<= 0) fputs(s.s, pysam_stdout) & fputc('\n', pysam_stdout); + while (bgzf_getline(fp, '\n', &s) >= 0) fputs(s.s, bcftools_stdout) & fputc('\n', bcftools_stdout); bgzf_close(fp); free(s.s); } else if (optind + 2 > argc) { // create index @@ -101,14 +101,14 @@ int main_tabix(int argc, char *argv[]) if ((fp = fopen(fn, "rb")) != 0) { fclose(fp); free(fn); - fprintf(pysam_stderr, "[E::%s] the index file exists; use option '-f' to overwrite\n", __func__); + fprintf(bcftools_stderr, "[E::%s] the index file exists; use option '-f' to overwrite\n", __func__); return 1; } free(fn); } if ( tbx_index_build(argv[optind], min_shift, &conf) ) { - fprintf(pysam_stderr,"tbx_index_build failed: Is the file bgzip-compressed? Was wrong -p [type] option used?\n"); + fprintf(bcftools_stderr,"tbx_index_build failed: Is the file bgzip-compressed? 
Was wrong -p [type] option used?\n"); return 1; } } else { // read with random access @@ -122,7 +122,7 @@ int main_tabix(int argc, char *argv[]) for (i = optind + 1; i < argc; ++i) { hts_itr_t *itr; if ((itr = tbx_itr_querys(tbx, argv[i])) == 0) continue; - while (tbx_bgzf_itr_next(fp, tbx, itr, &s) >= 0) fputs(s.s, pysam_stdout) & fputc('\n', pysam_stdout); + while (tbx_bgzf_itr_next(fp, tbx, itr, &s) >= 0) fputs(s.s, bcftools_stdout) & fputc('\n', bcftools_stdout); tbx_itr_destroy(itr); } free(s.s); diff --git a/bcftools/tsv2vcf.c.pysam.c b/bcftools/tsv2vcf.c.pysam.c index f5eff01..f6aabf5 100644 --- a/bcftools/tsv2vcf.c.pysam.c +++ b/bcftools/tsv2vcf.c.pysam.c @@ -1,4 +1,4 @@ -#include "pysam.h" +#include "bcftools.pysam.h" /* tsv2vcf.c -- convert from whitespace-separated fields to VCF diff --git a/bcftools/vcfannotate.c.pysam.c b/bcftools/vcfannotate.c.pysam.c index 09f76c2..72824f7 100644 --- a/bcftools/vcfannotate.c.pysam.c +++ b/bcftools/vcfannotate.c.pysam.c @@ -1,4 +1,4 @@ -#include "pysam.h" +#include "bcftools.pysam.h" /* vcfannotate.c -- Annotate and edit VCF/BCF files. 
@@ -280,7 +280,7 @@ static void init_remove_annots(args_t *args) int id = bcf_hdr_id2int(args->hdr,BCF_DT_ID,str.s); if ( !bcf_hdr_idinfo_exists(args->hdr,type,id) ) { - fprintf(pysam_stderr,"Warning: The tag \"%s\" not defined in the header\n", str.s); + fprintf(bcftools_stderr,"Warning: The tag \"%s\" not defined in the header\n", str.s); args->nrm--; } else if ( (type==BCF_HL_FMT && keep_fmt) || (type==BCF_HL_INFO && keep_info) ) @@ -1860,30 +1860,30 @@ static void annotate(args_t *args, bcf1_t *line) static void usage(args_t *args) { - fprintf(pysam_stderr, "\n"); - fprintf(pysam_stderr, "About: Annotate and edit VCF/BCF files.\n"); - fprintf(pysam_stderr, "Usage: bcftools annotate [options] \n"); - fprintf(pysam_stderr, "\n"); - fprintf(pysam_stderr, "Options:\n"); - fprintf(pysam_stderr, " -a, --annotations VCF file or tabix-indexed file with annotations: CHR\\tPOS[\\tVALUE]+\n"); - fprintf(pysam_stderr, " --collapse matching records by , see man page for details [some]\n"); - fprintf(pysam_stderr, " -c, --columns list of columns in the annotation file, e.g. CHROM,POS,REF,ALT,-,INFO/TAG. 
See man page for details\n"); - fprintf(pysam_stderr, " -e, --exclude exclude sites for which the expression is true (see man page for details)\n"); - fprintf(pysam_stderr, " -h, --header-lines lines which should be appended to the VCF header\n"); - fprintf(pysam_stderr, " -I, --set-id [+] set ID column, see man page for details\n"); - fprintf(pysam_stderr, " -i, --include select sites for which the expression is true (see man page for details)\n"); - fprintf(pysam_stderr, " -m, --mark-sites [+-] add INFO/tag flag to sites which are (\"+\") or are not (\"-\") listed in the -a file\n"); - fprintf(pysam_stderr, " --no-version do not append version and command line to the header\n"); - fprintf(pysam_stderr, " -o, --output write output to a file [standard output]\n"); - fprintf(pysam_stderr, " -O, --output-type b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n"); - fprintf(pysam_stderr, " -r, --regions restrict to comma-separated list of regions\n"); - fprintf(pysam_stderr, " -R, --regions-file restrict to regions listed in a file\n"); - fprintf(pysam_stderr, " --rename-chrs rename sequences according to map file: from\\tto\n"); - fprintf(pysam_stderr, " -s, --samples [^] comma separated list of samples to annotate (or exclude with \"^\" prefix)\n"); - fprintf(pysam_stderr, " -S, --samples-file [^] file of samples to annotate (or exclude with \"^\" prefix)\n"); - fprintf(pysam_stderr, " -x, --remove list of annotations to remove (e.g. ID,INFO/DP,FORMAT/DP,FILTER). 
See man page for details\n"); - fprintf(pysam_stderr, " --threads number of extra output compression threads [0]\n"); - fprintf(pysam_stderr, "\n"); + fprintf(bcftools_stderr, "\n"); + fprintf(bcftools_stderr, "About: Annotate and edit VCF/BCF files.\n"); + fprintf(bcftools_stderr, "Usage: bcftools annotate [options] \n"); + fprintf(bcftools_stderr, "\n"); + fprintf(bcftools_stderr, "Options:\n"); + fprintf(bcftools_stderr, " -a, --annotations VCF file or tabix-indexed file with annotations: CHR\\tPOS[\\tVALUE]+\n"); + fprintf(bcftools_stderr, " --collapse matching records by , see man page for details [some]\n"); + fprintf(bcftools_stderr, " -c, --columns list of columns in the annotation file, e.g. CHROM,POS,REF,ALT,-,INFO/TAG. See man page for details\n"); + fprintf(bcftools_stderr, " -e, --exclude exclude sites for which the expression is true (see man page for details)\n"); + fprintf(bcftools_stderr, " -h, --header-lines lines which should be appended to the VCF header\n"); + fprintf(bcftools_stderr, " -I, --set-id [+] set ID column, see man page for details\n"); + fprintf(bcftools_stderr, " -i, --include select sites for which the expression is true (see man page for details)\n"); + fprintf(bcftools_stderr, " -m, --mark-sites [+-] add INFO/tag flag to sites which are (\"+\") or are not (\"-\") listed in the -a file\n"); + fprintf(bcftools_stderr, " --no-version do not append version and command line to the header\n"); + fprintf(bcftools_stderr, " -o, --output write output to a file [standard output]\n"); + fprintf(bcftools_stderr, " -O, --output-type b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n"); + fprintf(bcftools_stderr, " -r, --regions restrict to comma-separated list of regions\n"); + fprintf(bcftools_stderr, " -R, --regions-file restrict to regions listed in a file\n"); + fprintf(bcftools_stderr, " --rename-chrs rename sequences according to map file: from\\tto\n"); + fprintf(bcftools_stderr, " -s, --samples [^] 
comma separated list of samples to annotate (or exclude with \"^\" prefix)\n"); + fprintf(bcftools_stderr, " -S, --samples-file [^] file of samples to annotate (or exclude with \"^\" prefix)\n"); + fprintf(bcftools_stderr, " -x, --remove list of annotations to remove (e.g. ID,INFO/DP,FORMAT/DP,FILTER). See man page for details\n"); + fprintf(bcftools_stderr, " --threads number of extra output compression threads [0]\n"); + fprintf(bcftools_stderr, "\n"); exit(1); } diff --git a/bcftools/vcfbuf.c.pysam.c b/bcftools/vcfbuf.c.pysam.c index 2dc3dae..85a2074 100644 --- a/bcftools/vcfbuf.c.pysam.c +++ b/bcftools/vcfbuf.c.pysam.c @@ -1,4 +1,4 @@ -#include "pysam.h" +#include "bcftools.pysam.h" /* The MIT License diff --git a/bcftools/vcfcall.c.pysam.c b/bcftools/vcfcall.c.pysam.c index 8e6721b..f4a0ce9 100644 --- a/bcftools/vcfcall.c.pysam.c +++ b/bcftools/vcfcall.c.pysam.c @@ -1,4 +1,4 @@ -#include "pysam.h" +#include "bcftools.pysam.h" /* vcfcall.c -- SNP/indel variant calling from VCF/BCF. @@ -296,8 +296,8 @@ static void set_samples(args_t *args, const char *fn, int is_file) char x = *se, *xptr = se; *se = 0; int ismpl = bcf_hdr_id2int(args->aux.hdr, BCF_DT_SAMPLE, ss); - if ( ismpl < 0 ) { fprintf(pysam_stderr,"Warning: No such sample in the VCF: %s\n",ss); continue; } - if ( old2new[ismpl] != -1 ) { fprintf(pysam_stderr,"Warning: The sample is listed multiple times: %s\n",ss); continue; } + if ( ismpl < 0 ) { fprintf(bcftools_stderr,"Warning: No such sample in the VCF: %s\n",ss); continue; } + if ( old2new[ismpl] != -1 ) { fprintf(bcftools_stderr,"Warning: The sample is listed multiple times: %s\n",ss); continue; } ss = se+1; while ( *ss && isspace(*ss) ) ss++; @@ -403,7 +403,7 @@ static void init_data(args_t *args) if ( args->aux.flag&CALL_CONSTR_TRIO ) { if ( 3*args->aux.nfams!=args->nsamples ) error("Expected only trios in %s, sorry!\n", args->samples_fname); - fprintf(pysam_stderr,"Detected %d samples in %d trio families\n", args->nsamples,args->aux.nfams); + 
fprintf(bcftools_stderr,"Detected %d samples in %d trio families\n", args->nsamples,args->aux.nfams); } } if ( args->ploidy ) @@ -538,7 +538,7 @@ static int parse_format_flag(const char *str) else if ( !strncasecmp(ss,"GP",se-ss) ) flag |= CALL_FMT_GP; else { - fprintf(pysam_stderr,"Could not parse \"%s\"\n", str); + fprintf(bcftools_stderr,"Could not parse \"%s\"\n", str); exit(1); } if ( !*se ) break; @@ -578,26 +578,26 @@ ploidy_t *init_ploidy(char *alias) if ( !pld->alias ) { - fprintf(pysam_stderr,"\nPRE-DEFINED PLOIDY FILES\n\n"); - fprintf(pysam_stderr," * Columns are: CHROM,FROM,TO,SEX,PLOIDY\n"); - fprintf(pysam_stderr," * Coordinates are 1-based inclusive.\n"); - fprintf(pysam_stderr," * A '*' means any value not otherwise defined.\n\n"); + fprintf(bcftools_stderr,"\nPRE-DEFINED PLOIDY FILES\n\n"); + fprintf(bcftools_stderr," * Columns are: CHROM,FROM,TO,SEX,PLOIDY\n"); + fprintf(bcftools_stderr," * Coordinates are 1-based inclusive.\n"); + fprintf(bcftools_stderr," * A '*' means any value not otherwise defined.\n\n"); pld = ploidy_predefs; while ( pld->alias ) { - fprintf(pysam_stderr,"%s\n .. %s\n\n", pld->alias,pld->about); + fprintf(bcftools_stderr,"%s\n .. %s\n\n", pld->alias,pld->about); if ( detailed ) - fprintf(pysam_stderr,"%s\n", pld->ploidy); + fprintf(bcftools_stderr,"%s\n", pld->ploidy); pld++; } - fprintf(pysam_stderr,"Run as --ploidy (e.g. --ploidy GRCh37).\n"); - fprintf(pysam_stderr,"To see the detailed ploidy definition, append a question mark (e.g. --ploidy GRCh37?).\n"); - fprintf(pysam_stderr,"\n"); + fprintf(bcftools_stderr,"Run as --ploidy (e.g. --ploidy GRCh37).\n"); + fprintf(bcftools_stderr,"To see the detailed ploidy definition, append a question mark (e.g. 
--ploidy GRCh37?).\n"); + fprintf(bcftools_stderr,"\n"); exit(-1); } else if ( detailed ) { - fprintf(pysam_stderr,"%s", pld->ploidy); + fprintf(bcftools_stderr,"%s", pld->ploidy); exit(-1); } return ploidy_init_string(pld->ploidy,2); @@ -605,53 +605,53 @@ ploidy_t *init_ploidy(char *alias) static void usage(args_t *args) { - fprintf(pysam_stderr, "\n"); - fprintf(pysam_stderr, "About: SNP/indel variant calling from VCF/BCF. To be used in conjunction with samtools mpileup.\n"); - fprintf(pysam_stderr, " This command replaces the former \"bcftools view\" caller. Some of the original\n"); - fprintf(pysam_stderr, " functionality has been temporarily lost in the process of transition to htslib,\n"); - fprintf(pysam_stderr, " but will be added back on popular demand. The original calling model can be\n"); - fprintf(pysam_stderr, " invoked with the -c option.\n"); - fprintf(pysam_stderr, "Usage: bcftools call [options] \n"); - fprintf(pysam_stderr, "\n"); - fprintf(pysam_stderr, "File format options:\n"); - fprintf(pysam_stderr, " --no-version do not append version and command line to the header\n"); - fprintf(pysam_stderr, " -o, --output write output to a file [standard output]\n"); - fprintf(pysam_stderr, " -O, --output-type output type: 'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]\n"); - fprintf(pysam_stderr, " --ploidy [?] predefined ploidy, 'list' to print available settings, append '?' 
for details\n"); - fprintf(pysam_stderr, " --ploidy-file space/tab-delimited list of CHROM,FROM,TO,SEX,PLOIDY\n"); - fprintf(pysam_stderr, " -r, --regions restrict to comma-separated list of regions\n"); - fprintf(pysam_stderr, " -R, --regions-file restrict to regions listed in a file\n"); - fprintf(pysam_stderr, " -s, --samples list of samples to include [all samples]\n"); - fprintf(pysam_stderr, " -S, --samples-file PED file or a file with an optional column with sex (see man page for details) [all samples]\n"); - fprintf(pysam_stderr, " -t, --targets similar to -r but streams rather than index-jumps\n"); - fprintf(pysam_stderr, " -T, --targets-file similar to -R but streams rather than index-jumps\n"); - fprintf(pysam_stderr, " --threads number of extra output compression threads [0]\n"); - fprintf(pysam_stderr, "\n"); - fprintf(pysam_stderr, "Input/output options:\n"); - fprintf(pysam_stderr, " -A, --keep-alts keep all possible alternate alleles at variant sites\n"); - fprintf(pysam_stderr, " -f, --format-fields output format fields: GQ,GP (lowercase allowed) []\n"); - fprintf(pysam_stderr, " -F, --prior-freqs use prior allele frequencies\n"); - fprintf(pysam_stderr, " -g, --gvcf ,[...] 
group non-variant sites into gVCF blocks by minimum per-sample DP\n"); - fprintf(pysam_stderr, " -i, --insert-missed output also sites missed by mpileup but present in -T\n"); - fprintf(pysam_stderr, " -M, --keep-masked-ref keep sites with masked reference allele (REF=N)\n"); - fprintf(pysam_stderr, " -V, --skip-variants skip indels/snps\n"); - fprintf(pysam_stderr, " -v, --variants-only output variant sites only\n"); - fprintf(pysam_stderr, "\n"); - fprintf(pysam_stderr, "Consensus/variant calling options:\n"); - fprintf(pysam_stderr, " -c, --consensus-caller the original calling method (conflicts with -m)\n"); - fprintf(pysam_stderr, " -C, --constrain one of: alleles, trio (see manual)\n"); - fprintf(pysam_stderr, " -m, --multiallelic-caller alternative model for multiallelic and rare-variant calling (conflicts with -c)\n"); - fprintf(pysam_stderr, " -n, --novel-rate ,[...] likelihood of novel mutation for constrained trio calling, see man page for details [1e-8,1e-9,1e-9]\n"); - fprintf(pysam_stderr, " -p, --pval-threshold variant if P(ref|D) mutation rate (use bigger for greater sensitivity), use with -m [1.1e-3]\n"); + fprintf(bcftools_stderr, "\n"); + fprintf(bcftools_stderr, "About: SNP/indel variant calling from VCF/BCF. To be used in conjunction with samtools mpileup.\n"); + fprintf(bcftools_stderr, " This command replaces the former \"bcftools view\" caller. Some of the original\n"); + fprintf(bcftools_stderr, " functionality has been temporarily lost in the process of transition to htslib,\n"); + fprintf(bcftools_stderr, " but will be added back on popular demand. 
The original calling model can be\n"); + fprintf(bcftools_stderr, " invoked with the -c option.\n"); + fprintf(bcftools_stderr, "Usage: bcftools call [options] \n"); + fprintf(bcftools_stderr, "\n"); + fprintf(bcftools_stderr, "File format options:\n"); + fprintf(bcftools_stderr, " --no-version do not append version and command line to the header\n"); + fprintf(bcftools_stderr, " -o, --output write output to a file [standard output]\n"); + fprintf(bcftools_stderr, " -O, --output-type output type: 'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]\n"); + fprintf(bcftools_stderr, " --ploidy [?] predefined ploidy, 'list' to print available settings, append '?' for details\n"); + fprintf(bcftools_stderr, " --ploidy-file space/tab-delimited list of CHROM,FROM,TO,SEX,PLOIDY\n"); + fprintf(bcftools_stderr, " -r, --regions restrict to comma-separated list of regions\n"); + fprintf(bcftools_stderr, " -R, --regions-file restrict to regions listed in a file\n"); + fprintf(bcftools_stderr, " -s, --samples list of samples to include [all samples]\n"); + fprintf(bcftools_stderr, " -S, --samples-file PED file or a file with an optional column with sex (see man page for details) [all samples]\n"); + fprintf(bcftools_stderr, " -t, --targets similar to -r but streams rather than index-jumps\n"); + fprintf(bcftools_stderr, " -T, --targets-file similar to -R but streams rather than index-jumps\n"); + fprintf(bcftools_stderr, " --threads number of extra output compression threads [0]\n"); + fprintf(bcftools_stderr, "\n"); + fprintf(bcftools_stderr, "Input/output options:\n"); + fprintf(bcftools_stderr, " -A, --keep-alts keep all possible alternate alleles at variant sites\n"); + fprintf(bcftools_stderr, " -f, --format-fields output format fields: GQ,GP (lowercase allowed) []\n"); + fprintf(bcftools_stderr, " -F, --prior-freqs use prior allele frequencies\n"); + fprintf(bcftools_stderr, " -g, --gvcf ,[...] 
group non-variant sites into gVCF blocks by minimum per-sample DP\n"); + fprintf(bcftools_stderr, " -i, --insert-missed output also sites missed by mpileup but present in -T\n"); + fprintf(bcftools_stderr, " -M, --keep-masked-ref keep sites with masked reference allele (REF=N)\n"); + fprintf(bcftools_stderr, " -V, --skip-variants skip indels/snps\n"); + fprintf(bcftools_stderr, " -v, --variants-only output variant sites only\n"); + fprintf(bcftools_stderr, "\n"); + fprintf(bcftools_stderr, "Consensus/variant calling options:\n"); + fprintf(bcftools_stderr, " -c, --consensus-caller the original calling method (conflicts with -m)\n"); + fprintf(bcftools_stderr, " -C, --constrain one of: alleles, trio (see manual)\n"); + fprintf(bcftools_stderr, " -m, --multiallelic-caller alternative model for multiallelic and rare-variant calling (conflicts with -c)\n"); + fprintf(bcftools_stderr, " -n, --novel-rate ,[...] likelihood of novel mutation for constrained trio calling, see man page for details [1e-8,1e-9,1e-9]\n"); + fprintf(bcftools_stderr, " -p, --pval-threshold variant if P(ref|D) mutation rate (use bigger for greater sensitivity), use with -m [1.1e-3]\n"); // todo (and more) - // fprintf(pysam_stderr, "\nContrast calling and association test options:\n"); - // fprintf(pysam_stderr, " -1 INT number of group-1 samples [0]\n"); - // fprintf(pysam_stderr, " -C FLOAT posterior constrast for LRTaux.min_lrt); - // fprintf(pysam_stderr, " -U INT number of permutations for association testing (effective with -1) [0]\n"); - // fprintf(pysam_stderr, " -X FLOAT only perform permutations for P(chi^2)aux.min_perm_p); - fprintf(pysam_stderr, "\n"); + // fprintf(bcftools_stderr, "\nContrast calling and association test options:\n"); + // fprintf(bcftools_stderr, " -1 INT number of group-1 samples [0]\n"); + // fprintf(bcftools_stderr, " -C FLOAT posterior constrast for LRTaux.min_lrt); + // fprintf(bcftools_stderr, " -U INT number of permutations for association testing (effective 
with -1) [0]\n"); + // fprintf(bcftools_stderr, " -X FLOAT only perform permutations for P(chi^2)aux.min_perm_p); + fprintf(bcftools_stderr, "\n"); exit(-1); } @@ -718,8 +718,8 @@ int main_vcfcall(int argc, char *argv[]) { case 2 : ploidy_fname = optarg; break; case 1 : ploidy = optarg; break; - case 'X': ploidy = "X"; fprintf(pysam_stderr,"Warning: -X will be deprecated, please use --ploidy instead.\n"); break; - case 'Y': ploidy = "Y"; fprintf(pysam_stderr,"Warning: -Y will be deprecated, please use --ploidy instead.\n"); break; + case 'X': ploidy = "X"; fprintf(bcftools_stderr,"Warning: -X will be deprecated, please use --ploidy instead.\n"); break; + case 'Y': ploidy = "Y"; fprintf(bcftools_stderr,"Warning: -Y will be deprecated, please use --ploidy instead.\n"); break; case 'f': args.aux.output_tags |= parse_format_flag(optarg); break; case 'M': args.flag &= ~CF_ACGT_ONLY; break; // keep sites where REF is N case 'N': args.flag |= CF_ACGT_ONLY; break; // omit sites where first base in REF is N (the new default) @@ -791,7 +791,7 @@ int main_vcfcall(int argc, char *argv[]) if ( !ploidy_fname && !ploidy ) { - if ( !args.samples_is_file ) fprintf(pysam_stderr,"Note: none of --samples-file, --ploidy or --ploidy-file given, assuming all sites are diploid\n"); + if ( !args.samples_is_file ) fprintf(bcftools_stderr,"Note: none of --samples-file, --ploidy or --ploidy-file given, assuming all sites are diploid\n"); args.ploidy = ploidy_init_string("* * * 0 0\n* * * 1 1\n* * * 2 2\n",2); } diff --git a/bcftools/vcfcnv.c.pysam.c b/bcftools/vcfcnv.c.pysam.c index 86ba48f..db4dffc 100644 --- a/bcftools/vcfcnv.c.pysam.c +++ b/bcftools/vcfcnv.c.pysam.c @@ -1,4 +1,4 @@ -#include "pysam.h" +#include "bcftools.pysam.h" /* The MIT License @@ -273,7 +273,7 @@ static void init_data(args_t *args) args->hmm = hmm_init(args->nstates, args->tprob, 10000); hmm_init_states(args->hmm, args->iprobs); - args->summary_fh = pysam_stdout; + args->summary_fh = bcftools_stdout; 
init_sample_files(&args->query_sample, args->output_dir); if ( args->control_sample.name ) { @@ -323,7 +323,7 @@ static void py_plot_cnv(char *script, float th) char *cmd = msprintf("python %s -p %f", script, th); int ret = system(cmd); - if ( ret) fprintf(pysam_stderr, "The command returned non-zero status %d: %s\n", ret, cmd); + if ( ret) fprintf(bcftools_stderr, "The command returned non-zero status %d: %s\n", ret, cmd); free(cmd); } @@ -659,7 +659,7 @@ static int set_observed_prob(args_t *args, sample_t *smpl, int isite) cn3_baf /= norm; #if DBG0 - if ( args->verbose ) fprintf(pysam_stderr,"%f\t%f %f %f\n", baf,cn1_baf,cn2_baf,cn3_baf); + if ( args->verbose ) fprintf(bcftools_stderr,"%f\t%f %f %f\n", baf,cn1_baf,cn2_baf,cn3_baf); #endif double cn1_lrr = exp(-(lrr + 0.45)*(lrr + 0.45)/smpl->lrr_dev2); @@ -884,7 +884,7 @@ static int update_sample_args(args_t *args, sample_t *smpl, int ismpl) baf_AA_dev2 /= norm_baf_AA_dev2; if ( baf_dev2 < baf_AA_dev2 ) baf_dev2 = baf_AA_dev2; double max_mean_cn3 = 0.5 - sqrt(baf_dev2)*1.644854; // R: qnorm(0.95)=1.644854 - //fprintf(pysam_stderr,"dev=%f AA_dev=%f max_mean_cn3=%f mean_cn3=%f\n", baf_dev2,baf_AA_dev2,max_mean_cn3,mean_cn3); + //fprintf(bcftools_stderr,"dev=%f AA_dev=%f max_mean_cn3=%f mean_cn3=%f\n", baf_dev2,baf_AA_dev2,max_mean_cn3,mean_cn3); assert( max_mean_cn3>0 ); double new_frac = 1./mean_cn3 - 2; @@ -954,13 +954,13 @@ static void cnv_flush_viterbi(args_t *args) if ( args->optimize_frac ) { int niter = 0; - fprintf(pysam_stderr,"Attempting to estimate the fraction of aberrant cells (chr %s):\n", bcf_hdr_id2name(args->hdr,args->prev_rid)); + fprintf(bcftools_stderr,"Attempting to estimate the fraction of aberrant cells (chr %s):\n", bcf_hdr_id2name(args->hdr,args->prev_rid)); do { - fprintf(pysam_stderr,"\t.. %f %f", args->query_sample.cell_frac,args->query_sample.baf_dev2); + fprintf(bcftools_stderr,"\t.. 
%f %f", args->query_sample.cell_frac,args->query_sample.baf_dev2); if ( args->control_sample.name ) - fprintf(pysam_stderr,"\t.. %f %f", args->control_sample.cell_frac,args->control_sample.baf_dev2); - fprintf(pysam_stderr,"\n"); + fprintf(bcftools_stderr,"\t.. %f %f", args->control_sample.cell_frac,args->control_sample.baf_dev2); + fprintf(bcftools_stderr,"\n"); set_emission_probs(args); hmm_run_fwd_bwd(hmm, args->nsites, args->eprob, args->sites); } @@ -976,10 +976,10 @@ static void cnv_flush_viterbi(args_t *args) if ( args->control_sample.name ) set_gauss_params(args, &args->control_sample); } - fprintf(pysam_stderr,"\t.. %f %f", args->query_sample.cell_frac,args->query_sample.baf_dev2); + fprintf(bcftools_stderr,"\t.. %f %f", args->query_sample.cell_frac,args->query_sample.baf_dev2); if ( args->control_sample.name ) - fprintf(pysam_stderr,"\t.. %f %f", args->control_sample.cell_frac,args->control_sample.baf_dev2); - fprintf(pysam_stderr,"\n"); + fprintf(bcftools_stderr,"\t.. %f %f", args->control_sample.cell_frac,args->control_sample.baf_dev2); + fprintf(bcftools_stderr,"\n"); fprintf(args->query_sample.summary_fh,"CF\t%s\t%d\t%d\t%.2f\t%f\n", bcf_hdr_id2name(args->hdr,args->prev_rid),args->sites[0]+1,args->sites[args->nsites-1]+1, @@ -1003,7 +1003,7 @@ static void cnv_flush_viterbi(args_t *args) double ori_ii = avg_ii_prob(nstates,hmm_get_tprob(hmm)); hmm_run_baum_welch(hmm, args->nsites, args->eprob, args->sites); double new_ii = avg_ii_prob(nstates,hmm_get_tprob(hmm)); - fprintf(pysam_stderr,"%e\t%e\t%e\n", ori_ii,new_ii,new_ii-ori_ii); + fprintf(bcftools_stderr,"%e\t%e\t%e\n", ori_ii,new_ii,new_ii-ori_ii); double *tprob = init_tprob_matrix(nstates, 1-new_ii, args->same_prob); hmm_set_tprob(args->hmm, tprob, 10000); double *tprob_arr = hmm_get_tprob(hmm); @@ -1015,9 +1015,9 @@ static void cnv_flush_viterbi(args_t *args) { for (j=0; j\n"); - fprintf(pysam_stderr, "General Options:\n"); - fprintf(pysam_stderr, " -c, --control-sample optional control sample 
name to highlight differences\n"); - fprintf(pysam_stderr, " -f, --AF-file read allele frequencies from file (CHR\\tPOS\\tREF,ALT\\tAF)\n"); - fprintf(pysam_stderr, " -o, --output-dir \n"); - fprintf(pysam_stderr, " -p, --plot-threshold plot aberrant chromosomes with quality at least 'float'\n"); - fprintf(pysam_stderr, " -r, --regions restrict to comma-separated list of regions\n"); - fprintf(pysam_stderr, " -R, --regions-file restrict to regions listed in a file\n"); - fprintf(pysam_stderr, " -s, --query-sample query samply name\n"); - fprintf(pysam_stderr, " -t, --targets similar to -r but streams rather than index-jumps\n"); - fprintf(pysam_stderr, " -T, --targets-file similar to -R but streams rather than index-jumps\n"); - fprintf(pysam_stderr, "HMM Options:\n"); - fprintf(pysam_stderr, " -a, --aberrant fraction of aberrant cells in query and control [1.0,1.0]\n"); - fprintf(pysam_stderr, " -b, --BAF-weight relative contribution from BAF [1]\n"); - fprintf(pysam_stderr, " -d, --BAF-dev expected BAF deviation in query and control [0.04,0.04]\n"); // experimental - fprintf(pysam_stderr, " -e, --err-prob uniform error probability [1e-4]\n"); - fprintf(pysam_stderr, " -k, --LRR-dev expected LRR deviation [0.2,0.2]\n"); // experimental - fprintf(pysam_stderr, " -l, --LRR-weight relative contribution from LRR [0.2]\n"); - fprintf(pysam_stderr, " -L, --LRR-smooth-win window of LRR moving average smoothing [10]\n"); - fprintf(pysam_stderr, " -O, --optimize estimate fraction of aberrant cells down to [1.0]\n"); - fprintf(pysam_stderr, " -P, --same-prob prior probability of -s/-c being the same [0.5]\n"); - fprintf(pysam_stderr, " -x, --xy-prob P(x|y) transition probability [1e-9]\n"); - fprintf(pysam_stderr, "\n"); + fprintf(bcftools_stderr, "\n"); + fprintf(bcftools_stderr, "About: Copy number variation caller, requires Illumina's B-allele frequency (BAF) and Log R\n"); + fprintf(bcftools_stderr, " Ratio intensity (LRR). 
The HMM considers the following copy number states: CN 2\n"); + fprintf(bcftools_stderr, " (normal), 1 (single-copy loss), 0 (complete loss), 3 (single-copy gain)\n"); + fprintf(bcftools_stderr, "Usage: bcftools cnv [OPTIONS] \n"); + fprintf(bcftools_stderr, "General Options:\n"); + fprintf(bcftools_stderr, " -c, --control-sample optional control sample name to highlight differences\n"); + fprintf(bcftools_stderr, " -f, --AF-file read allele frequencies from file (CHR\\tPOS\\tREF,ALT\\tAF)\n"); + fprintf(bcftools_stderr, " -o, --output-dir \n"); + fprintf(bcftools_stderr, " -p, --plot-threshold plot aberrant chromosomes with quality at least 'float'\n"); + fprintf(bcftools_stderr, " -r, --regions restrict to comma-separated list of regions\n"); + fprintf(bcftools_stderr, " -R, --regions-file restrict to regions listed in a file\n"); + fprintf(bcftools_stderr, " -s, --query-sample query samply name\n"); + fprintf(bcftools_stderr, " -t, --targets similar to -r but streams rather than index-jumps\n"); + fprintf(bcftools_stderr, " -T, --targets-file similar to -R but streams rather than index-jumps\n"); + fprintf(bcftools_stderr, "HMM Options:\n"); + fprintf(bcftools_stderr, " -a, --aberrant fraction of aberrant cells in query and control [1.0,1.0]\n"); + fprintf(bcftools_stderr, " -b, --BAF-weight relative contribution from BAF [1]\n"); + fprintf(bcftools_stderr, " -d, --BAF-dev expected BAF deviation in query and control [0.04,0.04]\n"); // experimental + fprintf(bcftools_stderr, " -e, --err-prob uniform error probability [1e-4]\n"); + fprintf(bcftools_stderr, " -k, --LRR-dev expected LRR deviation [0.2,0.2]\n"); // experimental + fprintf(bcftools_stderr, " -l, --LRR-weight relative contribution from LRR [0.2]\n"); + fprintf(bcftools_stderr, " -L, --LRR-smooth-win window of LRR moving average smoothing [10]\n"); + fprintf(bcftools_stderr, " -O, --optimize estimate fraction of aberrant cells down to [1.0]\n"); + fprintf(bcftools_stderr, " -P, --same-prob prior 
probability of -s/-c being the same [0.5]\n"); + fprintf(bcftools_stderr, " -x, --xy-prob P(x|y) transition probability [1e-9]\n"); + fprintf(bcftools_stderr, "\n"); exit(1); } @@ -1411,7 +1411,7 @@ int main_vcfcnv(int argc, char *argv[]) } cnv_next_line(args, NULL); create_plots(args); - fprintf(pysam_stderr,"Number of lines: total/processed: %d/%d\n", args->ntot,args->nused); + fprintf(bcftools_stderr,"Number of lines: total/processed: %d/%d\n", args->ntot,args->nused); destroy_data(args); free(args); return 0; diff --git a/bcftools/vcfconcat.c.pysam.c b/bcftools/vcfconcat.c.pysam.c index 4445a51..1a67b86 100644 --- a/bcftools/vcfconcat.c.pysam.c +++ b/bcftools/vcfconcat.c.pysam.c @@ -1,4 +1,4 @@ -#include "pysam.h" +#include "bcftools.pysam.h" /* vcfconcat.c -- Concatenate or combine VCF/BCF files. @@ -236,7 +236,7 @@ static void phased_flush(args_t *args) { if ( !gt_absent_warned ) { - fprintf(pysam_stderr,"GT is not present at %s:%d. (This warning is printed only once.)\n", bcf_seqname(ahdr,arec), arec->pos+1); + fprintf(bcftools_stderr,"GT is not present at %s:%d. (This warning is printed only once.)\n", bcf_seqname(ahdr,arec), arec->pos+1); gt_absent_warned = 1; } continue; @@ -247,7 +247,7 @@ static void phased_flush(args_t *args) { if ( !gt_absent_warned ) { - fprintf(pysam_stderr,"GT is not present at %s:%d. (This warning is printed only once.)\n", bcf_seqname(bhdr,brec), brec->pos+1); + fprintf(bcftools_stderr,"GT is not present at %s:%d. (This warning is printed only once.)\n", bcf_seqname(bhdr,brec), brec->pos+1); gt_absent_warned = 1; } continue; @@ -699,33 +699,33 @@ static void naive_concat(args_t *args) static void usage(args_t *args) { - fprintf(pysam_stderr, "\n"); - fprintf(pysam_stderr, "About: Concatenate or combine VCF/BCF files. All source files must have the same sample\n"); - fprintf(pysam_stderr, " columns appearing in the same order. 
The program can be used, for example, to\n"); - fprintf(pysam_stderr, " concatenate chromosome VCFs into one VCF, or combine a SNP VCF and an indel\n"); - fprintf(pysam_stderr, " VCF into one. The input files must be sorted by chr and position. The files\n"); - fprintf(pysam_stderr, " must be given in the correct order to produce sorted VCF on output unless\n"); - fprintf(pysam_stderr, " the -a, --allow-overlaps option is specified. With the --naive option, the files\n"); - fprintf(pysam_stderr, " are concatenated without being recompressed, which is very fast but dangerous\n"); - fprintf(pysam_stderr, " if the BCF headers differ.\n"); - fprintf(pysam_stderr, "Usage: bcftools concat [options] [ [...]]\n"); - fprintf(pysam_stderr, "\n"); - fprintf(pysam_stderr, "Options:\n"); - fprintf(pysam_stderr, " -a, --allow-overlaps First coordinate of the next file can precede last record of the current file.\n"); - fprintf(pysam_stderr, " -c, --compact-PS Do not output PS tag at each site, only at the start of a new phase set block.\n"); - fprintf(pysam_stderr, " -d, --rm-dups Output duplicate records present in multiple files only once: \n"); - fprintf(pysam_stderr, " -D, --remove-duplicates Alias for -d none\n"); - fprintf(pysam_stderr, " -f, --file-list Read the list of files from a file.\n"); - fprintf(pysam_stderr, " -l, --ligate Ligate phased VCFs by matching phase at overlapping haplotypes\n"); - fprintf(pysam_stderr, " --no-version Do not append version and command line to the header\n"); - fprintf(pysam_stderr, " -n, --naive Concatenate files without recompression (dangerous, use with caution)\n"); - fprintf(pysam_stderr, " -o, --output Write output to a file [standard output]\n"); - fprintf(pysam_stderr, " -O, --output-type b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n"); - fprintf(pysam_stderr, " -q, --min-PQ Break phase set if phasing quality is lower than [30]\n"); - fprintf(pysam_stderr, " -r, --regions Restrict to 
comma-separated list of regions\n"); - fprintf(pysam_stderr, " -R, --regions-file Restrict to regions listed in a file\n"); - fprintf(pysam_stderr, " --threads Number of extra output compression threads [0]\n"); - fprintf(pysam_stderr, "\n"); + fprintf(bcftools_stderr, "\n"); + fprintf(bcftools_stderr, "About: Concatenate or combine VCF/BCF files. All source files must have the same sample\n"); + fprintf(bcftools_stderr, " columns appearing in the same order. The program can be used, for example, to\n"); + fprintf(bcftools_stderr, " concatenate chromosome VCFs into one VCF, or combine a SNP VCF and an indel\n"); + fprintf(bcftools_stderr, " VCF into one. The input files must be sorted by chr and position. The files\n"); + fprintf(bcftools_stderr, " must be given in the correct order to produce sorted VCF on output unless\n"); + fprintf(bcftools_stderr, " the -a, --allow-overlaps option is specified. With the --naive option, the files\n"); + fprintf(bcftools_stderr, " are concatenated without being recompressed, which is very fast but dangerous\n"); + fprintf(bcftools_stderr, " if the BCF headers differ.\n"); + fprintf(bcftools_stderr, "Usage: bcftools concat [options] [ [...]]\n"); + fprintf(bcftools_stderr, "\n"); + fprintf(bcftools_stderr, "Options:\n"); + fprintf(bcftools_stderr, " -a, --allow-overlaps First coordinate of the next file can precede last record of the current file.\n"); + fprintf(bcftools_stderr, " -c, --compact-PS Do not output PS tag at each site, only at the start of a new phase set block.\n"); + fprintf(bcftools_stderr, " -d, --rm-dups Output duplicate records present in multiple files only once: \n"); + fprintf(bcftools_stderr, " -D, --remove-duplicates Alias for -d none\n"); + fprintf(bcftools_stderr, " -f, --file-list Read the list of files from a file.\n"); + fprintf(bcftools_stderr, " -l, --ligate Ligate phased VCFs by matching phase at overlapping haplotypes\n"); + fprintf(bcftools_stderr, " --no-version Do not append version and command 
line to the header\n"); + fprintf(bcftools_stderr, " -n, --naive Concatenate files without recompression (dangerous, use with caution)\n"); + fprintf(bcftools_stderr, " -o, --output Write output to a file [standard output]\n"); + fprintf(bcftools_stderr, " -O, --output-type b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n"); + fprintf(bcftools_stderr, " -q, --min-PQ Break phase set if phasing quality is lower than [30]\n"); + fprintf(bcftools_stderr, " -r, --regions Restrict to comma-separated list of regions\n"); + fprintf(bcftools_stderr, " -R, --regions-file Restrict to regions listed in a file\n"); + fprintf(bcftools_stderr, " --threads Number of extra output compression threads [0]\n"); + fprintf(bcftools_stderr, "\n"); exit(1); } diff --git a/bcftools/vcfconvert.c.pysam.c b/bcftools/vcfconvert.c.pysam.c index d1b15ba..a054ca8 100644 --- a/bcftools/vcfconvert.c.pysam.c +++ b/bcftools/vcfconvert.c.pysam.c @@ -1,4 +1,4 @@ -#include "pysam.h" +#include "bcftools.pysam.h" /* vcfconvert.c -- convert between VCF/BCF and related formats. 
@@ -212,13 +212,13 @@ static int tsv_setter_gt_gp(tsv_t *tsv, bcf1_t *rec, void *usr) { float aa,ab,bb; aa = strtod(tsv->ss, &tsv->se); - if ( tsv->ss==tsv->se ) { fprintf(pysam_stderr,"Could not parse first value of %d-th sample\n", i+1); return -1; } + if ( tsv->ss==tsv->se ) { fprintf(bcftools_stderr,"Could not parse first value of %d-th sample\n", i+1); return -1; } tsv->ss = tsv->se+1; ab = strtod(tsv->ss, &tsv->se); - if ( tsv->ss==tsv->se ) { fprintf(pysam_stderr,"Could not parse second value of %d-th sample\n", i+1); return -1; } + if ( tsv->ss==tsv->se ) { fprintf(bcftools_stderr,"Could not parse second value of %d-th sample\n", i+1); return -1; } tsv->ss = tsv->se+1; bb = strtod(tsv->ss, &tsv->se); - if ( tsv->ss==tsv->se ) { fprintf(pysam_stderr,"Could not parse third value of %d-th sample\n", i+1); return -1; } + if ( tsv->ss==tsv->se ) { fprintf(bcftools_stderr,"Could not parse third value of %d-th sample\n", i+1); return -1; } tsv->ss = tsv->se+1; if ( args->rev_als ) { float tmp = bb; bb = aa; aa = tmp; } @@ -264,7 +264,7 @@ static int tsv_setter_haps(tsv_t *tsv, bcf1_t *rec, void *usr) if ( !ss[0] || !ss[1] || !ss[2] || (up && (!ss[3] || !ss[4]) ) ) { - fprintf(pysam_stderr,"Wrong number of fields at %d-th sample ([%c][%c][%c]). ",i+1,ss[0],ss[1],ss[2]); + fprintf(bcftools_stderr,"Wrong number of fields at %d-th sample ([%c][%c][%c]). 
",i+1,ss[0],ss[1],ss[2]); return -1; } @@ -283,7 +283,7 @@ static int tsv_setter_haps(tsv_t *tsv, bcf1_t *rec, void *usr) args->gts[2*i+all] = bcf_int32_vector_end; break; default : - fprintf(pysam_stderr,"Could not parse: [%c][%s]\n", ss[all*2+up],tsv->ss); + fprintf(bcftools_stderr,"Could not parse: [%c][%s]\n", ss[all*2+up],tsv->ss); return -1; } if( ss[all*2+up+1]=='*' ) up = up + 1; @@ -291,7 +291,7 @@ static int tsv_setter_haps(tsv_t *tsv, bcf1_t *rec, void *usr) if(up && up != 2) { - fprintf(pysam_stderr,"Missing unphased marker '*': [%c][%s]", ss[2+up], tsv->ss); + fprintf(bcftools_stderr,"Missing unphased marker '*': [%c][%s]", ss[2+up], tsv->ss); return -1; } @@ -305,8 +305,8 @@ static int tsv_setter_haps(tsv_t *tsv, bcf1_t *rec, void *usr) } if ( tsv->ss[(nsamples-1)*4+3+nup] ) { - fprintf(pysam_stderr,"nup: %d", nup); - fprintf(pysam_stderr,"Wrong number of fields (%d-th column = [%c]). ", nsamples*2,tsv->ss[(nsamples-1)*4+nup]); + fprintf(bcftools_stderr,"nup: %d", nup); + fprintf(bcftools_stderr,"Wrong number of fields (%d-th column = [%c]). 
", nsamples*2,tsv->ss[(nsamples-1)*4+nup]); return -1; } @@ -419,7 +419,7 @@ static void gensample_to_vcf(args_t *args) free(args->flt); tsv_destroy(tsv); - fprintf(pysam_stderr,"Number of processed rows: \t%d\n", args->n.total); + fprintf(bcftools_stderr,"Number of processed rows: \t%d\n", args->n.total); } static void haplegendsample_to_vcf(args_t *args) @@ -557,7 +557,7 @@ static void haplegendsample_to_vcf(args_t *args) tsv_destroy(hap_tsv); tsv_destroy(leg_tsv); - fprintf(pysam_stderr,"Number of processed rows: \t%d\n", args->n.total); + fprintf(bcftools_stderr,"Number of processed rows: \t%d\n", args->n.total); } static void hapsample_to_vcf(args_t *args) @@ -657,7 +657,7 @@ static void hapsample_to_vcf(args_t *args) free(args->gts); tsv_destroy(tsv); - fprintf(pysam_stderr,"Number of processed rows: \t%d\n", args->n.total); + fprintf(bcftools_stderr,"Number of processed rows: \t%d\n", args->n.total); } char *init_sample2sex(bcf_hdr_t *hdr, char *sex_fname) @@ -740,8 +740,8 @@ static void vcf_to_gensample(args_t *args) if ( gen_fname && (strlen(gen_fname)<3 || strcasecmp(".gz",gen_fname+strlen(gen_fname)-3)) ) gen_compressed = 0; if ( sample_fname && strlen(sample_fname)>3 && strcasecmp(".gz",sample_fname+strlen(sample_fname)-3)==0 ) sample_compressed = 0; - if (gen_fname) fprintf(pysam_stderr, "Gen file: %s\n", gen_fname); - if (sample_fname) fprintf(pysam_stderr, "Sample file: %s\n", sample_fname); + if (gen_fname) fprintf(bcftools_stderr, "Gen file: %s\n", gen_fname); + if (sample_fname) fprintf(bcftools_stderr, "Sample file: %s\n", sample_fname); // write samples file if (sample_fname) @@ -793,7 +793,7 @@ static void vcf_to_gensample(args_t *args) // biallelic required if ( line->n_allele>2 ) { if (!non_biallelic) - fprintf(pysam_stderr, "Warning: non-biallelic records are skipped. Consider splitting multi-allelic records into biallelic records using 'bcftools norm -m-'.\n"); + fprintf(bcftools_stderr, "Warning: non-biallelic records are skipped. 
Consider splitting multi-allelic records into biallelic records using 'bcftools norm -m-'.\n"); non_biallelic++; continue; } @@ -812,7 +812,7 @@ static void vcf_to_gensample(args_t *args) nok++; } } - fprintf(pysam_stderr, "%d records written, %d skipped: %d/%d/%d/%d no-ALT/non-biallelic/filtered/duplicated\n", + fprintf(bcftools_stderr, "%d records written, %d skipped: %d/%d/%d/%d no-ALT/non-biallelic/filtered/duplicated\n", nok, no_alt+non_biallelic+filtered+ndup, no_alt, non_biallelic, filtered, ndup); if ( str.m ) free(str.s); @@ -864,9 +864,9 @@ static void vcf_to_haplegendsample(args_t *args) if ( legend_fname && (strlen(legend_fname)<3 || strcasecmp(".gz",legend_fname+strlen(legend_fname)-3)) ) legend_compressed = 0; if ( sample_fname && strlen(sample_fname)>3 && strcasecmp(".gz",sample_fname+strlen(sample_fname)-3)==0 ) sample_compressed = 0; - if (hap_fname) fprintf(pysam_stderr, "Hap file: %s\n", hap_fname); - if (legend_fname) fprintf(pysam_stderr, "Legend file: %s\n", legend_fname); - if (sample_fname) fprintf(pysam_stderr, "Sample file: %s\n", sample_fname); + if (hap_fname) fprintf(bcftools_stderr, "Hap file: %s\n", hap_fname); + if (legend_fname) fprintf(bcftools_stderr, "Legend file: %s\n", legend_fname); + if (sample_fname) fprintf(bcftools_stderr, "Sample file: %s\n", sample_fname); // write samples file if (sample_fname) @@ -923,7 +923,7 @@ static void vcf_to_haplegendsample(args_t *args) // biallelic required if ( line->n_allele>2 ) { if (!non_biallelic) - fprintf(pysam_stderr, "Warning: non-biallelic records are skipped. Consider splitting multi-allelic records into biallelic records using 'bcftools norm -m-'.\n"); + fprintf(bcftools_stderr, "Warning: non-biallelic records are skipped. 
Consider splitting multi-allelic records into biallelic records using 'bcftools norm -m-'.\n"); non_biallelic++; continue; } @@ -950,7 +950,7 @@ static void vcf_to_haplegendsample(args_t *args) } nok++; } - fprintf(pysam_stderr, "%d records written, %d skipped: %d/%d/%d no-ALT/non-biallelic/filtered\n", nok,no_alt+non_biallelic+filtered, no_alt, non_biallelic, filtered); + fprintf(bcftools_stderr, "%d records written, %d skipped: %d/%d/%d no-ALT/non-biallelic/filtered\n", nok,no_alt+non_biallelic+filtered, no_alt, non_biallelic, filtered); if ( str.m ) free(str.s); if ( hout && bgzf_close(hout)!=0 ) error("Error closing %s: %s\n", hap_fname, strerror(errno)); if ( lout && bgzf_close(lout)!=0 ) error("Error closing %s: %s\n", legend_fname, strerror(errno)); @@ -1012,8 +1012,8 @@ static void vcf_to_hapsample(args_t *args) if ( hap_fname && (strlen(hap_fname)<3 || strcasecmp(".gz",hap_fname+strlen(hap_fname)-3)) ) hap_compressed = 0; if ( sample_fname && strlen(sample_fname)>3 && strcasecmp(".gz",sample_fname+strlen(sample_fname)-3)==0 ) sample_compressed = 0; - if (hap_fname) fprintf(pysam_stderr, "Hap file: %s\n", hap_fname); - if (sample_fname) fprintf(pysam_stderr, "Sample file: %s\n", sample_fname); + if (hap_fname) fprintf(bcftools_stderr, "Hap file: %s\n", hap_fname); + if (sample_fname) fprintf(bcftools_stderr, "Sample file: %s\n", sample_fname); // write samples file if (sample_fname) @@ -1066,7 +1066,7 @@ static void vcf_to_hapsample(args_t *args) // biallelic required if ( line->n_allele>2 ) { if (!non_biallelic) - fprintf(pysam_stderr, "Warning: non-biallelic records are skipped. Consider splitting multi-allelic records into biallelic records using 'bcftools norm -m-'.\n"); + fprintf(bcftools_stderr, "Warning: non-biallelic records are skipped. 
Consider splitting multi-allelic records into biallelic records using 'bcftools norm -m-'.\n"); non_biallelic++; continue; } @@ -1082,7 +1082,7 @@ static void vcf_to_hapsample(args_t *args) } nok++; } - fprintf(pysam_stderr, "%d records written, %d skipped: %d/%d/%d no-ALT/non-biallelic/filtered\n", nok, no_alt+non_biallelic+filtered, no_alt, non_biallelic, filtered); + fprintf(bcftools_stderr, "%d records written, %d skipped: %d/%d/%d no-ALT/non-biallelic/filtered\n", nok, no_alt+non_biallelic+filtered, no_alt, non_biallelic, filtered); if ( str.m ) free(str.s); if ( hout && bgzf_close(hout)!=0 ) error("Error closing %s: %s\n", hap_fname, strerror(errno)); if (hap_fname) free(hap_fname); @@ -1250,13 +1250,13 @@ static void tsv_to_vcf(args_t *args) free(args->str.s); free(args->gts); - fprintf(pysam_stderr,"Rows total: \t%d\n", args->n.total); - fprintf(pysam_stderr,"Rows skipped: \t%d\n", args->n.skipped); - fprintf(pysam_stderr,"Missing GTs: \t%d\n", args->n.missing); - fprintf(pysam_stderr,"Hom RR: \t%d\n", args->n.hom_rr); - fprintf(pysam_stderr,"Het RA: \t%d\n", args->n.het_ra); - fprintf(pysam_stderr,"Hom AA: \t%d\n", args->n.hom_aa); - fprintf(pysam_stderr,"Het AA: \t%d\n", args->n.het_aa); + fprintf(bcftools_stderr,"Rows total: \t%d\n", args->n.total); + fprintf(bcftools_stderr,"Rows skipped: \t%d\n", args->n.skipped); + fprintf(bcftools_stderr,"Missing GTs: \t%d\n", args->n.missing); + fprintf(bcftools_stderr,"Hom RR: \t%d\n", args->n.hom_rr); + fprintf(bcftools_stderr,"Het RA: \t%d\n", args->n.het_ra); + fprintf(bcftools_stderr,"Hom AA: \t%d\n", args->n.hom_aa); + fprintf(bcftools_stderr,"Het AA: \t%d\n", args->n.het_aa); } static void vcf_to_vcf(args_t *args) @@ -1365,69 +1365,69 @@ static void gvcf_to_vcf(args_t *args) static void usage(void) { - fprintf(pysam_stderr, "\n"); - fprintf(pysam_stderr, "About: Converts VCF/BCF to other formats and back. See man page for file\n"); - fprintf(pysam_stderr, " formats details. 
When specifying output files explicitly instead\n"); - fprintf(pysam_stderr, " of with , one can use '-' for pysam_stdout and '.' to suppress.\n"); - fprintf(pysam_stderr, "Usage: bcftools convert [OPTIONS] \n"); - fprintf(pysam_stderr, "\n"); - fprintf(pysam_stderr, "VCF input options:\n"); - fprintf(pysam_stderr, " -e, --exclude exclude sites for which the expression is true\n"); - fprintf(pysam_stderr, " -i, --include select sites for which the expression is true\n"); - fprintf(pysam_stderr, " -r, --regions restrict to comma-separated list of regions\n"); - fprintf(pysam_stderr, " -R, --regions-file restrict to regions listed in a file\n"); - fprintf(pysam_stderr, " -s, --samples list of samples to include\n"); - fprintf(pysam_stderr, " -S, --samples-file file of samples to include\n"); - fprintf(pysam_stderr, " -t, --targets similar to -r but streams rather than index-jumps\n"); - fprintf(pysam_stderr, " -T, --targets-file similar to -R but streams rather than index-jumps\n"); - fprintf(pysam_stderr, "\n"); - fprintf(pysam_stderr, "VCF output options:\n"); - fprintf(pysam_stderr, " --no-version do not append version and command line to the header\n"); - fprintf(pysam_stderr, " -o, --output output file name [pysam_stdout]\n"); - fprintf(pysam_stderr, " -O, --output-type b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n"); - fprintf(pysam_stderr, " --threads number of extra output compression threads [0]\n"); - fprintf(pysam_stderr, "\n"); - fprintf(pysam_stderr, "GEN/SAMPLE conversion (input/output from IMPUTE2):\n"); - fprintf(pysam_stderr, " -G, --gensample2vcf <...> |,\n"); - fprintf(pysam_stderr, " -g, --gensample <...> |,\n"); - fprintf(pysam_stderr, " --tag tag to take values for .gen file: GT,PL,GL,GP [GT]\n"); - fprintf(pysam_stderr, " --chrom output chromosome in first column instead of CHROM:POS_REF_ALT\n"); - fprintf(pysam_stderr, " --sex output sex column in the sample-file, input format is: Sample\\t[MF]\n"); - 
fprintf(pysam_stderr, " --vcf-ids output VCF IDs in second column instead of CHROM:POS_REF_ALT\n"); - fprintf(pysam_stderr, "\n"); - fprintf(pysam_stderr, "gVCF conversion:\n"); - fprintf(pysam_stderr, " --gvcf2vcf expand gVCF reference blocks\n"); - fprintf(pysam_stderr, " -f, --fasta-ref reference sequence in fasta format\n"); - fprintf(pysam_stderr, "\n"); - fprintf(pysam_stderr, "HAP/SAMPLE conversion (output from SHAPEIT):\n"); - fprintf(pysam_stderr, " --hapsample2vcf <...> |,\n"); - fprintf(pysam_stderr, " --hapsample <...> |,\n"); - fprintf(pysam_stderr, " --haploid2diploid convert haploid genotypes to diploid homozygotes\n"); - fprintf(pysam_stderr, " --sex output sex column in the sample-file, input format is: Sample\\t[MF]\n"); - fprintf(pysam_stderr, " --vcf-ids output VCF IDs instead of CHROM:POS_REF_ALT\n"); - fprintf(pysam_stderr, "\n"); - fprintf(pysam_stderr, "HAP/LEGEND/SAMPLE conversion:\n"); - fprintf(pysam_stderr, " -H, --haplegendsample2vcf <...> |,,\n"); - fprintf(pysam_stderr, " -h, --haplegendsample <...> |,,\n"); - fprintf(pysam_stderr, " --haploid2diploid convert haploid genotypes to diploid homozygotes\n"); - fprintf(pysam_stderr, " --sex output sex column in the sample-file, input format is: Sample\\t[MF]\n"); - fprintf(pysam_stderr, " --vcf-ids output VCF IDs instead of CHROM:POS_REF_ALT\n"); - fprintf(pysam_stderr, "\n"); - fprintf(pysam_stderr, "TSV conversion:\n"); - fprintf(pysam_stderr, " --tsv2vcf \n"); - fprintf(pysam_stderr, " -c, --columns columns of the input tsv file [ID,CHROM,POS,AA]\n"); - fprintf(pysam_stderr, " -f, --fasta-ref reference sequence in fasta format\n"); - fprintf(pysam_stderr, " -s, --samples list of sample names\n"); - fprintf(pysam_stderr, " -S, --samples-file file of sample names\n"); - fprintf(pysam_stderr, "\n"); - // fprintf(pysam_stderr, "PLINK options:\n"); - // fprintf(pysam_stderr, " -p, --plink |,,|,,|,\n"); - // fprintf(pysam_stderr, " --tped make tped file instead\n"); - // fprintf(pysam_stderr, 
" --bin make binary bed/fam/bim files\n"); - // fprintf(pysam_stderr, "\n"); - // fprintf(pysam_stderr, "PBWT options:\n"); - // fprintf(pysam_stderr, " -b, --pbwt or ,,,\n"); - // fprintf(pysam_stderr, "\n"); + fprintf(bcftools_stderr, "\n"); + fprintf(bcftools_stderr, "About: Converts VCF/BCF to other formats and back. See man page for file\n"); + fprintf(bcftools_stderr, " formats details. When specifying output files explicitly instead\n"); + fprintf(bcftools_stderr, " of with , one can use '-' for bcftools_stdout and '.' to suppress.\n"); + fprintf(bcftools_stderr, "Usage: bcftools convert [OPTIONS] \n"); + fprintf(bcftools_stderr, "\n"); + fprintf(bcftools_stderr, "VCF input options:\n"); + fprintf(bcftools_stderr, " -e, --exclude exclude sites for which the expression is true\n"); + fprintf(bcftools_stderr, " -i, --include select sites for which the expression is true\n"); + fprintf(bcftools_stderr, " -r, --regions restrict to comma-separated list of regions\n"); + fprintf(bcftools_stderr, " -R, --regions-file restrict to regions listed in a file\n"); + fprintf(bcftools_stderr, " -s, --samples list of samples to include\n"); + fprintf(bcftools_stderr, " -S, --samples-file file of samples to include\n"); + fprintf(bcftools_stderr, " -t, --targets similar to -r but streams rather than index-jumps\n"); + fprintf(bcftools_stderr, " -T, --targets-file similar to -R but streams rather than index-jumps\n"); + fprintf(bcftools_stderr, "\n"); + fprintf(bcftools_stderr, "VCF output options:\n"); + fprintf(bcftools_stderr, " --no-version do not append version and command line to the header\n"); + fprintf(bcftools_stderr, " -o, --output output file name [bcftools_stdout]\n"); + fprintf(bcftools_stderr, " -O, --output-type b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n"); + fprintf(bcftools_stderr, " --threads number of extra output compression threads [0]\n"); + fprintf(bcftools_stderr, "\n"); + fprintf(bcftools_stderr, "GEN/SAMPLE 
conversion (input/output from IMPUTE2):\n"); + fprintf(bcftools_stderr, " -G, --gensample2vcf <...> |,\n"); + fprintf(bcftools_stderr, " -g, --gensample <...> |,\n"); + fprintf(bcftools_stderr, " --tag tag to take values for .gen file: GT,PL,GL,GP [GT]\n"); + fprintf(bcftools_stderr, " --chrom output chromosome in first column instead of CHROM:POS_REF_ALT\n"); + fprintf(bcftools_stderr, " --sex output sex column in the sample-file, input format is: Sample\\t[MF]\n"); + fprintf(bcftools_stderr, " --vcf-ids output VCF IDs in second column instead of CHROM:POS_REF_ALT\n"); + fprintf(bcftools_stderr, "\n"); + fprintf(bcftools_stderr, "gVCF conversion:\n"); + fprintf(bcftools_stderr, " --gvcf2vcf expand gVCF reference blocks\n"); + fprintf(bcftools_stderr, " -f, --fasta-ref reference sequence in fasta format\n"); + fprintf(bcftools_stderr, "\n"); + fprintf(bcftools_stderr, "HAP/SAMPLE conversion (output from SHAPEIT):\n"); + fprintf(bcftools_stderr, " --hapsample2vcf <...> |,\n"); + fprintf(bcftools_stderr, " --hapsample <...> |,\n"); + fprintf(bcftools_stderr, " --haploid2diploid convert haploid genotypes to diploid homozygotes\n"); + fprintf(bcftools_stderr, " --sex output sex column in the sample-file, input format is: Sample\\t[MF]\n"); + fprintf(bcftools_stderr, " --vcf-ids output VCF IDs instead of CHROM:POS_REF_ALT\n"); + fprintf(bcftools_stderr, "\n"); + fprintf(bcftools_stderr, "HAP/LEGEND/SAMPLE conversion:\n"); + fprintf(bcftools_stderr, " -H, --haplegendsample2vcf <...> |,,\n"); + fprintf(bcftools_stderr, " -h, --haplegendsample <...> |,,\n"); + fprintf(bcftools_stderr, " --haploid2diploid convert haploid genotypes to diploid homozygotes\n"); + fprintf(bcftools_stderr, " --sex output sex column in the sample-file, input format is: Sample\\t[MF]\n"); + fprintf(bcftools_stderr, " --vcf-ids output VCF IDs instead of CHROM:POS_REF_ALT\n"); + fprintf(bcftools_stderr, "\n"); + fprintf(bcftools_stderr, "TSV conversion:\n"); + fprintf(bcftools_stderr, " --tsv2vcf 
\n"); + fprintf(bcftools_stderr, " -c, --columns columns of the input tsv file [ID,CHROM,POS,AA]\n"); + fprintf(bcftools_stderr, " -f, --fasta-ref reference sequence in fasta format\n"); + fprintf(bcftools_stderr, " -s, --samples list of sample names\n"); + fprintf(bcftools_stderr, " -S, --samples-file file of sample names\n"); + fprintf(bcftools_stderr, "\n"); + // fprintf(bcftools_stderr, "PLINK options:\n"); + // fprintf(bcftools_stderr, " -p, --plink |,,|,,|,\n"); + // fprintf(bcftools_stderr, " --tped make tped file instead\n"); + // fprintf(bcftools_stderr, " --bin make binary bed/fam/bim files\n"); + // fprintf(bcftools_stderr, "\n"); + // fprintf(bcftools_stderr, "PBWT options:\n"); + // fprintf(bcftools_stderr, " -b, --pbwt or ,,,\n"); + // fprintf(bcftools_stderr, "\n"); exit(1); } diff --git a/bcftools/vcffilter.c.pysam.c b/bcftools/vcffilter.c.pysam.c index e603bde..f7d0a47 100644 --- a/bcftools/vcffilter.c.pysam.c +++ b/bcftools/vcffilter.c.pysam.c @@ -1,4 +1,4 @@ -#include "pysam.h" +#include "bcftools.pysam.h" /* vcffilter.c -- Apply fixed-threshold filters. 
@@ -132,7 +132,7 @@ static void init_data(args_t *args) kputs("\"IndelGap\"", &tmp); } if ( strncmp(tmp.s+1,args->soft_filter,tmp.l-2) ) - fprintf(pysam_stderr,"Warning: using %s filter name instead of \"%s\"\n", tmp.s,args->soft_filter); + fprintf(bcftools_stderr,"Warning: using %s filter name instead of \"%s\"\n", tmp.s,args->soft_filter); free(tmp.s); } @@ -401,27 +401,27 @@ static void set_genotypes(args_t *args, bcf1_t *line, int pass_site) static void usage(args_t *args) { - fprintf(pysam_stderr, "\n"); - fprintf(pysam_stderr, "About: Apply fixed-threshold filters.\n"); - fprintf(pysam_stderr, "Usage: bcftools filter [options] \n"); - fprintf(pysam_stderr, "\n"); - fprintf(pysam_stderr, "Options:\n"); - fprintf(pysam_stderr, " -e, --exclude exclude sites for which the expression is true (see man page for details)\n"); - fprintf(pysam_stderr, " -g, --SnpGap filter SNPs within base pairs of an indel\n"); - fprintf(pysam_stderr, " -G, --IndelGap filter clusters of indels separated by or fewer base pairs allowing only one to pass\n"); - fprintf(pysam_stderr, " -i, --include include only sites for which the expression is true (see man page for details\n"); - fprintf(pysam_stderr, " -m, --mode [+x] \"+\": do not replace but add to existing FILTER; \"x\": reset filters at sites which pass\n"); - fprintf(pysam_stderr, " --no-version do not append version and command line to the header\n"); - fprintf(pysam_stderr, " -o, --output write output to a file [standard output]\n"); - fprintf(pysam_stderr, " -O, --output-type b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n"); - fprintf(pysam_stderr, " -r, --regions restrict to comma-separated list of regions\n"); - fprintf(pysam_stderr, " -R, --regions-file restrict to regions listed in a file\n"); - fprintf(pysam_stderr, " -s, --soft-filter annotate FILTER column with or unique filter name (\"Filter%%d\") made up by the program (\"+\")\n"); - fprintf(pysam_stderr, " -S, --set-GTs <.|0> set 
genotypes of failed samples to missing (.) or ref (0)\n"); - fprintf(pysam_stderr, " -t, --targets similar to -r but streams rather than index-jumps\n"); - fprintf(pysam_stderr, " -T, --targets-file similar to -R but streams rather than index-jumps\n"); - fprintf(pysam_stderr, " --threads number of extra output compression threads [0]\n"); - fprintf(pysam_stderr, "\n"); + fprintf(bcftools_stderr, "\n"); + fprintf(bcftools_stderr, "About: Apply fixed-threshold filters.\n"); + fprintf(bcftools_stderr, "Usage: bcftools filter [options] \n"); + fprintf(bcftools_stderr, "\n"); + fprintf(bcftools_stderr, "Options:\n"); + fprintf(bcftools_stderr, " -e, --exclude exclude sites for which the expression is true (see man page for details)\n"); + fprintf(bcftools_stderr, " -g, --SnpGap filter SNPs within base pairs of an indel\n"); + fprintf(bcftools_stderr, " -G, --IndelGap filter clusters of indels separated by or fewer base pairs allowing only one to pass\n"); + fprintf(bcftools_stderr, " -i, --include include only sites for which the expression is true (see man page for details\n"); + fprintf(bcftools_stderr, " -m, --mode [+x] \"+\": do not replace but add to existing FILTER; \"x\": reset filters at sites which pass\n"); + fprintf(bcftools_stderr, " --no-version do not append version and command line to the header\n"); + fprintf(bcftools_stderr, " -o, --output write output to a file [standard output]\n"); + fprintf(bcftools_stderr, " -O, --output-type b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n"); + fprintf(bcftools_stderr, " -r, --regions restrict to comma-separated list of regions\n"); + fprintf(bcftools_stderr, " -R, --regions-file restrict to regions listed in a file\n"); + fprintf(bcftools_stderr, " -s, --soft-filter annotate FILTER column with or unique filter name (\"Filter%%d\") made up by the program (\"+\")\n"); + fprintf(bcftools_stderr, " -S, --set-GTs <.|0> set genotypes of failed samples to missing (.) 
or ref (0)\n"); + fprintf(bcftools_stderr, " -t, --targets similar to -r but streams rather than index-jumps\n"); + fprintf(bcftools_stderr, " -T, --targets-file similar to -R but streams rather than index-jumps\n"); + fprintf(bcftools_stderr, " --threads number of extra output compression threads [0]\n"); + fprintf(bcftools_stderr, "\n"); exit(1); } diff --git a/bcftools/vcfgtcheck.c.pysam.c b/bcftools/vcfgtcheck.c.pysam.c index 0bd6071..6a6fa58 100644 --- a/bcftools/vcfgtcheck.c.pysam.c +++ b/bcftools/vcfgtcheck.c.pysam.c @@ -1,4 +1,4 @@ -#include "pysam.h" +#include "bcftools.pysam.h" /* vcfgtcheck.c -- Check sample identity. @@ -64,7 +64,7 @@ void py_plot(char *script) int len = strlen(script); char *cmd = !strcmp(".py",script+len-3) ? msprintf("python %s", script) : msprintf("python %s.py", script); int ret = system(cmd); - if ( ret ) fprintf(pysam_stderr, "The command returned non-zero status %d: %s\n", ret, cmd); + if ( ret ) fprintf(bcftools_stderr, "The command returned non-zero status %d: %s\n", ret, cmd); free(cmd); } @@ -268,7 +268,7 @@ static int init_gt2ipl(args_t *args, bcf1_t *gt_line, bcf1_t *sm_line, int *gt2i gt2ipl[ bcf_ij2G(j,i) ] = k<=l ? bcf_ij2G(k,l) : bcf_ij2G(l,k); } } - //for (i=0; ism_hdr, BCF_DT_ID, "GT")<0 ) error("[E::%s] Neither PL nor GT present in the header of %s\n", __func__, args->files->readers[0].fname); if ( !args->no_PLs ) - fprintf(pysam_stderr,"Warning: PL not present in the header of %s, using GT instead\n", args->files->readers[0].fname); + fprintf(bcftools_stderr,"Warning: PL not present in the header of %s, using GT instead\n", args->files->readers[0].fname); fake_pls = 1; } - FILE *fp = args->plot ? open_file(NULL, "w", "%s.tab", args->plot) : pysam_stdout; + FILE *fp = args->plot ? 
open_file(NULL, "w", "%s.tab", args->plot) : bcftools_stdout; print_header(args, fp); int tgt_isample = -1, query_isample = 0; @@ -366,7 +366,7 @@ static void check_gt(args_t *args) { if ( tgt_isample==-1 ) { - fprintf(pysam_stderr,"No target sample selected for comparison, using the first sample in %s: %s\n", args->gt_fname,args->gt_hdr->samples[0]); + fprintf(bcftools_stderr,"No target sample selected for comparison, using the first sample in %s: %s\n", args->gt_fname,args->gt_hdr->samples[0]); tgt_isample = 0; } } @@ -614,7 +614,7 @@ static void cross_check_gts(args_t *args) if ( bcf_hdr_id2int(args->sm_hdr, BCF_DT_ID, "GT")<0 ) error("[E::%s] Neither PL nor GT present in the header of %s\n", __func__, args->files->readers[0].fname); if ( !args->no_PLs ) { - fprintf(pysam_stderr,"Warning: PL not present in the header of %s, using GT instead\n", args->files->readers[0].fname); + fprintf(bcftools_stderr,"Warning: PL not present in the header of %s, using GT instead\n", args->files->readers[0].fname); args->no_PLs = 99; } } @@ -637,7 +637,7 @@ static void cross_check_gts(args_t *args) process_PL(args,line,ntot,ndif); } - FILE *fp = pysam_stdout; + FILE *fp = bcftools_stdout; print_header(args, fp); float *tmp = (float*)malloc(sizeof(float)*args->nsmpl*(args->nsmpl-1)/2); @@ -709,24 +709,24 @@ static char *init_prefix(char *prefix) static void usage(void) { - fprintf(pysam_stderr, "\n"); - fprintf(pysam_stderr, "About: Check sample identity. 
With no -g BCF given, multi-sample cross-check is performed.\n"); - fprintf(pysam_stderr, "Usage: bcftools gtcheck [options] [-g ] \n"); - fprintf(pysam_stderr, "\n"); - fprintf(pysam_stderr, "Options:\n"); - fprintf(pysam_stderr, " -a, --all-sites output comparison for all sites\n"); - fprintf(pysam_stderr, " -c, --cluster min inter- and max intra-sample error [0.23,-0.3]\n"); - fprintf(pysam_stderr, " -g, --genotypes genotypes to compare against\n"); - fprintf(pysam_stderr, " -G, --GTs-only use GTs, ignore PLs, using for unseen genotypes [99]\n"); - fprintf(pysam_stderr, " -H, --homs-only homozygous genotypes only (useful for low coverage data)\n"); - fprintf(pysam_stderr, " -p, --plot plot\n"); - fprintf(pysam_stderr, " -r, --regions restrict to comma-separated list of regions\n"); - fprintf(pysam_stderr, " -R, --regions-file restrict to regions listed in a file\n"); - fprintf(pysam_stderr, " -s, --query-sample query sample (by default the first sample is checked)\n"); - fprintf(pysam_stderr, " -S, --target-sample target sample in the -g file (used only for plotting)\n"); - fprintf(pysam_stderr, " -t, --targets similar to -r but streams rather than index-jumps\n"); - fprintf(pysam_stderr, " -T, --targets-file similar to -R but streams rather than index-jumps\n"); - fprintf(pysam_stderr, "\n"); + fprintf(bcftools_stderr, "\n"); + fprintf(bcftools_stderr, "About: Check sample identity. 
With no -g BCF given, multi-sample cross-check is performed.\n"); + fprintf(bcftools_stderr, "Usage: bcftools gtcheck [options] [-g ] \n"); + fprintf(bcftools_stderr, "\n"); + fprintf(bcftools_stderr, "Options:\n"); + fprintf(bcftools_stderr, " -a, --all-sites output comparison for all sites\n"); + fprintf(bcftools_stderr, " -c, --cluster min inter- and max intra-sample error [0.23,-0.3]\n"); + fprintf(bcftools_stderr, " -g, --genotypes genotypes to compare against\n"); + fprintf(bcftools_stderr, " -G, --GTs-only use GTs, ignore PLs, using for unseen genotypes [99]\n"); + fprintf(bcftools_stderr, " -H, --homs-only homozygous genotypes only (useful for low coverage data)\n"); + fprintf(bcftools_stderr, " -p, --plot plot\n"); + fprintf(bcftools_stderr, " -r, --regions restrict to comma-separated list of regions\n"); + fprintf(bcftools_stderr, " -R, --regions-file restrict to regions listed in a file\n"); + fprintf(bcftools_stderr, " -s, --query-sample query sample (by default the first sample is checked)\n"); + fprintf(bcftools_stderr, " -S, --target-sample target sample in the -g file (used only for plotting)\n"); + fprintf(bcftools_stderr, " -t, --targets similar to -r but streams rather than index-jumps\n"); + fprintf(bcftools_stderr, " -T, --targets-file similar to -R but streams rather than index-jumps\n"); + fprintf(bcftools_stderr, "\n"); exit(1); } diff --git a/bcftools/vcfindex.c.pysam.c b/bcftools/vcfindex.c.pysam.c index 157fc8e..1e7578c 100644 --- a/bcftools/vcfindex.c.pysam.c +++ b/bcftools/vcfindex.c.pysam.c @@ -1,4 +1,4 @@ -#include "pysam.h" +#include "bcftools.pysam.h" /* vcfindex.c -- Index bgzip compressed VCF/BCF files for random access. @@ -41,22 +41,22 @@ DEALINGS IN THE SOFTWARE. 
*/ static void usage(void) { - fprintf(pysam_stderr, "\n"); - fprintf(pysam_stderr, "About: Index bgzip compressed VCF/BCF files for random access.\n"); - fprintf(pysam_stderr, "Usage: bcftools index [options] |\n"); - fprintf(pysam_stderr, "\n"); - fprintf(pysam_stderr, "Indexing options:\n"); - fprintf(pysam_stderr, " -c, --csi generate CSI-format index for VCF/BCF files [default]\n"); - fprintf(pysam_stderr, " -f, --force overwrite index if it already exists\n"); - fprintf(pysam_stderr, " -m, --min-shift INT set minimal interval size for CSI indices to 2^INT [14]\n"); - fprintf(pysam_stderr, " -o, --output-file FILE optional output index file name\n"); - fprintf(pysam_stderr, " -t, --tbi generate TBI-format index for VCF files\n"); - fprintf(pysam_stderr, " --threads sets the number of threads [0]\n"); - fprintf(pysam_stderr, "\n"); - fprintf(pysam_stderr, "Stats options:\n"); - fprintf(pysam_stderr, " -n, --nrecords print number of records based on existing index file\n"); - fprintf(pysam_stderr, " -s, --stats print per contig stats based on existing index file\n"); - fprintf(pysam_stderr, "\n"); + fprintf(bcftools_stderr, "\n"); + fprintf(bcftools_stderr, "About: Index bgzip compressed VCF/BCF files for random access.\n"); + fprintf(bcftools_stderr, "Usage: bcftools index [options] |\n"); + fprintf(bcftools_stderr, "\n"); + fprintf(bcftools_stderr, "Indexing options:\n"); + fprintf(bcftools_stderr, " -c, --csi generate CSI-format index for VCF/BCF files [default]\n"); + fprintf(bcftools_stderr, " -f, --force overwrite index if it already exists\n"); + fprintf(bcftools_stderr, " -m, --min-shift INT set minimal interval size for CSI indices to 2^INT [14]\n"); + fprintf(bcftools_stderr, " -o, --output-file FILE optional output index file name\n"); + fprintf(bcftools_stderr, " -t, --tbi generate TBI-format index for VCF files\n"); + fprintf(bcftools_stderr, " --threads sets the number of threads [0]\n"); + fprintf(bcftools_stderr, "\n"); + fprintf(bcftools_stderr, 
"Stats options:\n"); + fprintf(bcftools_stderr, " -n, --nrecords print number of records based on existing index file\n"); + fprintf(bcftools_stderr, " -s, --stats print per contig stats based on existing index file\n"); + fprintf(bcftools_stderr, "\n"); exit(1); } @@ -68,23 +68,23 @@ int vcf_index_stats(char *fname, int stats) hts_idx_t *idx = NULL; htsFile *fp = hts_open(fname,"r"); - if ( !fp ) { fprintf(pysam_stderr,"Could not read %s\n", fname); return 1; } + if ( !fp ) { fprintf(bcftools_stderr,"Could not read %s\n", fname); return 1; } bcf_hdr_t *hdr = bcf_hdr_read(fp); - if ( !hdr ) { fprintf(pysam_stderr,"Could not read the header: %s\n", fname); return 1; } + if ( !hdr ) { fprintf(bcftools_stderr,"Could not read the header: %s\n", fname); return 1; } if ( hts_get_format(fp)->format==vcf ) { tbx = tbx_index_load(fname); - if ( !tbx ) { fprintf(pysam_stderr,"Could not load index for VCF: %s\n", fname); return 1; } + if ( !tbx ) { fprintf(bcftools_stderr,"Could not load index for VCF: %s\n", fname); return 1; } } else if ( hts_get_format(fp)->format==bcf ) { idx = bcf_index_load(fname); - if ( !idx ) { fprintf(pysam_stderr,"Could not load index for BCF file: %s\n", fname); return 1; } + if ( !idx ) { fprintf(bcftools_stderr,"Could not load index for BCF file: %s\n", fname); return 1; } } else { - fprintf(pysam_stderr,"Could not detect the file type as VCF or BCF: %s\n", fname); + fprintf(bcftools_stderr,"Could not detect the file type as VCF or BCF: %s\n", fname); return 1; } @@ -98,7 +98,7 @@ int vcf_index_stats(char *fname, int stats) if (stats&2 || !records) continue; bcf_hrec_t *hrec = bcf_hdr_get_hrec(hdr, BCF_HL_CTG, "ID", seq[i], NULL); int hkey = hrec ? 
bcf_hrec_find_key(hrec, "length") : -1; - fprintf(pysam_stdout, "%s\t%s\t%" PRIu64 "\n", seq[i], hkey<0?".":hrec->vals[hkey], records); + fprintf(bcftools_stdout, "%s\t%s\t%" PRIu64 "\n", seq[i], hkey<0?".":hrec->vals[hkey], records); } if (!sum) { @@ -107,12 +107,12 @@ int vcf_index_stats(char *fname, int stats) bcf1_t *rec = bcf_init1(); if (bcf_read1(fp, hdr, rec) >= 0) { - fprintf(pysam_stderr,"index of %s does not contain any count metadata. Please re-index with a newer version of bcftools or tabix.\n", fname); + fprintf(bcftools_stderr,"index of %s does not contain any count metadata. Please re-index with a newer version of bcftools or tabix.\n", fname); return 1; } bcf_destroy1(rec); } - if (stats&2) fprintf(pysam_stdout, "%" PRIu64 "\n", sum); + if (stats&2) fprintf(bcftools_stdout, "%" PRIu64 "\n", sum); free(seq); hts_close(fp); bcf_hdr_destroy(hdr); @@ -166,17 +166,17 @@ int main_vcfindex(int argc, char *argv[]) } if (stats>2) { - fprintf(pysam_stderr, "[E::%s] expected only one of --stats or --nrecords options\n", __func__); + fprintf(bcftools_stderr, "[E::%s] expected only one of --stats or --nrecords options\n", __func__); return 1; } if (tbi && min_shift>0) { - fprintf(pysam_stderr, "[E::%s] min-shift option only expected for CSI indices \n", __func__); + fprintf(bcftools_stderr, "[E::%s] min-shift option only expected for CSI indices \n", __func__); return 1; } if (min_shift < 0 || min_shift > 30) { - fprintf(pysam_stderr, "[E::%s] expected min_shift in range [0,30] (%d)\n", __func__, min_shift); + fprintf(bcftools_stderr, "[E::%s] expected min_shift in range [0,30] (%d)\n", __func__, min_shift); return 1; } @@ -194,7 +194,7 @@ int main_vcfindex(int argc, char *argv[]) kputs(outfn,&idx_fname); else { - if (!strcmp(fname, "-")) { fprintf(pysam_stderr, "[E::%s] must specify an output path for index file when reading VCF/BCF from stdin\n", __func__); return 1; } + if (!strcmp(fname, "-")) { fprintf(bcftools_stderr, "[E::%s] must specify an output path 
for index file when reading VCF/BCF from stdin\n", __func__); return 1; } ksprintf(&idx_fname, "%s.%s", fname, tbi ? "tbi" : "csi"); } if (!force) @@ -206,7 +206,7 @@ int main_vcfindex(int argc, char *argv[]) stat(fname, &stat_file); if ( stat_file.st_mtime <= stat_tbi.st_mtime ) { - fprintf(pysam_stderr,"[E::%s] the index file exists. Please use '-f' to overwrite %s\n", __func__, idx_fname.s); + fprintf(bcftools_stderr,"[E::%s] the index file exists. Please use '-f' to overwrite %s\n", __func__, idx_fname.s); free(idx_fname.s); return 1; } diff --git a/bcftools/vcfisec.c.pysam.c b/bcftools/vcfisec.c.pysam.c index 15ef22d..d168457 100644 --- a/bcftools/vcfisec.c.pysam.c +++ b/bcftools/vcfisec.c.pysam.c @@ -1,4 +1,4 @@ -#include "pysam.h" +#include "bcftools.pysam.h" /* vcfisec.c -- Create intersections, unions and complements of VCF files. @@ -136,7 +136,7 @@ void isec_vcf(args_t *args) kstring_t str = {0,0,0}; htsFile *out_fh = NULL; - // When only one VCF is output, print VCF to pysam_stdout or -o file + // When only one VCF is output, print VCF to bcftools_stdout or -o file int out_std = 0; if ( args->nwrite==1 && !args->prefix ) out_std = 1; if ( args->targets_list && files->nreaders==1 ) out_std = 1; @@ -149,7 +149,7 @@ void isec_vcf(args_t *args) bcf_hdr_write(out_fh, files->readers[args->iwrite].header); } if ( !args->nwrite && !out_std && !args->prefix ) - fprintf(pysam_stderr,"Note: -w option not given, printing list of sites...\n"); + fprintf(bcftools_stderr,"Note: -w option not given, printing list of sites...\n"); int n; while ( (n=bcf_sr_next_line(files)) ) @@ -402,7 +402,7 @@ static void init_data(args_t *args) if ( args->fh_sites == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno)); } else - args->fh_sites = pysam_stdout; + args->fh_sites = bcftools_stdout; } } @@ -448,41 +448,41 @@ static void destroy_data(args_t *args) static void usage(void) { - fprintf(pysam_stderr, "\n"); - fprintf(pysam_stderr, "About: Create 
intersections, unions and complements of VCF files.\n"); - fprintf(pysam_stderr, "Usage: bcftools isec [options] [...]\n"); - fprintf(pysam_stderr, "\n"); - fprintf(pysam_stderr, "Options:\n"); - fprintf(pysam_stderr, " -c, --collapse treat as identical records with , see man page for details [none]\n"); - fprintf(pysam_stderr, " -C, --complement output positions present only in the first file but missing in the others\n"); - fprintf(pysam_stderr, " -e, --exclude exclude sites for which the expression is true\n"); - fprintf(pysam_stderr, " -f, --apply-filters require at least one of the listed FILTER strings (e.g. \"PASS,.\")\n"); - fprintf(pysam_stderr, " -i, --include include only sites for which the expression is true\n"); - fprintf(pysam_stderr, " --no-version do not append version and command line to the header\n"); - fprintf(pysam_stderr, " -n, --nfiles [+-=~] output positions present in this many (=), this many or more (+), this many or fewer (-), the exact (~) files\n"); - fprintf(pysam_stderr, " -o, --output write output to a file [standard output]\n"); - fprintf(pysam_stderr, " -O, --output-type b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n"); - fprintf(pysam_stderr, " -p, --prefix if given, subset each of the input files accordingly, see also -w\n"); - fprintf(pysam_stderr, " -r, --regions restrict to comma-separated list of regions\n"); - fprintf(pysam_stderr, " -R, --regions-file restrict to regions listed in a file\n"); - fprintf(pysam_stderr, " -t, --targets similar to -r but streams rather than index-jumps\n"); - fprintf(pysam_stderr, " -T, --targets-file similar to -R but streams rather than index-jumps\n"); - fprintf(pysam_stderr, " --threads number of extra output compression threads [0]\n"); - fprintf(pysam_stderr, " -w, --write list of files to write with -p given as 1-based indexes. 
By default, all files are written\n"); - fprintf(pysam_stderr, "\n"); - fprintf(pysam_stderr, "Examples:\n"); - fprintf(pysam_stderr, " # Create intersection and complements of two sets saving the output in dir/*\n"); - fprintf(pysam_stderr, " bcftools isec A.vcf.gz B.vcf.gz -p dir\n"); - fprintf(pysam_stderr, "\n"); - fprintf(pysam_stderr, " # Filter sites in A and B (but not in C) and create intersection\n"); - fprintf(pysam_stderr, " bcftools isec -e'MAF<0.01' -i'dbSNP=1' -e - A.vcf.gz B.vcf.gz C.vcf.gz -p dir\n"); - fprintf(pysam_stderr, "\n"); - fprintf(pysam_stderr, " # Extract and write records from A shared by both A and B using exact allele match\n"); - fprintf(pysam_stderr, " bcftools isec A.vcf.gz B.vcf.gz -p dir -n =2 -w 1\n"); - fprintf(pysam_stderr, "\n"); - fprintf(pysam_stderr, " # Extract records private to A or B comparing by position only\n"); - fprintf(pysam_stderr, " bcftools isec A.vcf.gz B.vcf.gz -p dir -n -1 -c all\n"); - fprintf(pysam_stderr, "\n"); + fprintf(bcftools_stderr, "\n"); + fprintf(bcftools_stderr, "About: Create intersections, unions and complements of VCF files.\n"); + fprintf(bcftools_stderr, "Usage: bcftools isec [options] [...]\n"); + fprintf(bcftools_stderr, "\n"); + fprintf(bcftools_stderr, "Options:\n"); + fprintf(bcftools_stderr, " -c, --collapse treat as identical records with , see man page for details [none]\n"); + fprintf(bcftools_stderr, " -C, --complement output positions present only in the first file but missing in the others\n"); + fprintf(bcftools_stderr, " -e, --exclude exclude sites for which the expression is true\n"); + fprintf(bcftools_stderr, " -f, --apply-filters require at least one of the listed FILTER strings (e.g. 
\"PASS,.\")\n"); + fprintf(bcftools_stderr, " -i, --include include only sites for which the expression is true\n"); + fprintf(bcftools_stderr, " --no-version do not append version and command line to the header\n"); + fprintf(bcftools_stderr, " -n, --nfiles [+-=~] output positions present in this many (=), this many or more (+), this many or fewer (-), the exact (~) files\n"); + fprintf(bcftools_stderr, " -o, --output write output to a file [standard output]\n"); + fprintf(bcftools_stderr, " -O, --output-type b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n"); + fprintf(bcftools_stderr, " -p, --prefix if given, subset each of the input files accordingly, see also -w\n"); + fprintf(bcftools_stderr, " -r, --regions restrict to comma-separated list of regions\n"); + fprintf(bcftools_stderr, " -R, --regions-file restrict to regions listed in a file\n"); + fprintf(bcftools_stderr, " -t, --targets similar to -r but streams rather than index-jumps\n"); + fprintf(bcftools_stderr, " -T, --targets-file similar to -R but streams rather than index-jumps\n"); + fprintf(bcftools_stderr, " --threads number of extra output compression threads [0]\n"); + fprintf(bcftools_stderr, " -w, --write list of files to write with -p given as 1-based indexes. 
By default, all files are written\n"); + fprintf(bcftools_stderr, "\n"); + fprintf(bcftools_stderr, "Examples:\n"); + fprintf(bcftools_stderr, " # Create intersection and complements of two sets saving the output in dir/*\n"); + fprintf(bcftools_stderr, " bcftools isec A.vcf.gz B.vcf.gz -p dir\n"); + fprintf(bcftools_stderr, "\n"); + fprintf(bcftools_stderr, " # Filter sites in A and B (but not in C) and create intersection\n"); + fprintf(bcftools_stderr, " bcftools isec -e'MAF<0.01' -i'dbSNP=1' -e - A.vcf.gz B.vcf.gz C.vcf.gz -p dir\n"); + fprintf(bcftools_stderr, "\n"); + fprintf(bcftools_stderr, " # Extract and write records from A shared by both A and B using exact allele match\n"); + fprintf(bcftools_stderr, " bcftools isec A.vcf.gz B.vcf.gz -p dir -n =2 -w 1\n"); + fprintf(bcftools_stderr, "\n"); + fprintf(bcftools_stderr, " # Extract records private to A or B comparing by position only\n"); + fprintf(bcftools_stderr, " bcftools isec A.vcf.gz B.vcf.gz -p dir -n -1 -c all\n"); + fprintf(bcftools_stderr, "\n"); exit(1); } diff --git a/bcftools/vcfmerge.c.pysam.c b/bcftools/vcfmerge.c.pysam.c index f12e0a6..e1047ae 100644 --- a/bcftools/vcfmerge.c.pysam.c +++ b/bcftools/vcfmerge.c.pysam.c @@ -1,4 +1,4 @@ -#include "pysam.h" +#include "bcftools.pysam.h" /* vcfmerge.c -- Merge multiple VCF/BCF files to create one multi-sample file. 
@@ -522,8 +522,8 @@ void merge_headers(bcf_hdr_t *hw, const bcf_hdr_t *hr, const char *clash_prefix, void debug_als(char **als, int nals) { - int k; for (k=0; knals; i++) { - fprintf(pysam_stdout, " %s [%d]", ma->als[i], ma->cnt[i]); + fprintf(bcftools_stdout, " %s [%d]", ma->als[i], ma->cnt[i]); } - fprintf(pysam_stdout, "\n"); + fprintf(bcftools_stdout, "\n"); } void merge_chrom2qual(args_t *args, bcf1_t *out) @@ -1094,7 +1094,7 @@ static void merge_AGR_info_tag(bcf_hdr_t *hdr, bcf1_t *line, bcf_info_t *info, i case BCF_BT_INT16: BRANCH(int16_t, *src==bcf_int16_missing, *src==bcf_int16_vector_end, int); break; case BCF_BT_INT32: BRANCH(int32_t, *src==bcf_int32_missing, *src==bcf_int32_vector_end, int); break; case BCF_BT_FLOAT: BRANCH(float, bcf_float_is_missing(*src), bcf_float_is_vector_end(*src), float); break; - default: fprintf(pysam_stderr,"TODO: %s:%d .. info->type=%d\n", __FILE__,__LINE__, info->type); exit(1); + default: fprintf(bcftools_stderr,"TODO: %s:%d .. info->type=%d\n", __FILE__,__LINE__, info->type); exit(1); } #undef BRANCH } @@ -1124,7 +1124,7 @@ static void merge_AGR_info_tag(bcf_hdr_t *hdr, bcf1_t *line, bcf_info_t *info, i case BCF_BT_INT16: BRANCH(int16_t, src[kori]==bcf_int16_missing, src[kori]==bcf_int16_vector_end, int); break; case BCF_BT_INT32: BRANCH(int32_t, src[kori]==bcf_int32_missing, src[kori]==bcf_int32_vector_end, int); break; case BCF_BT_FLOAT: BRANCH(float, bcf_float_is_missing(src[kori]), bcf_float_is_vector_end(src[kori]), float); break; - default: fprintf(pysam_stderr,"TODO: %s:%d .. info->type=%d\n", __FILE__,__LINE__, info->type); exit(1); + default: fprintf(bcftools_stderr,"TODO: %s:%d .. 
info->type=%d\n", __FILE__,__LINE__, info->type); exit(1); } #undef BRANCH } @@ -1970,27 +1970,27 @@ void debug_maux(args_t *args) maux_t *maux = args->maux; int j,k,l; - fprintf(pysam_stderr,"Alleles to merge at %d, nals=%d\n", maux->pos+1,maux->nals); + fprintf(bcftools_stderr,"Alleles to merge at %d, nals=%d\n", maux->pos+1,maux->nals); for (j=0; jnreaders; j++) { bcf_sr_t *reader = &files->readers[j]; buffer_t *buf = &maux->buf[j]; - fprintf(pysam_stderr," reader %d: ", j); + fprintf(bcftools_stderr," reader %d: ", j); for (k=buf->beg; kend; k++) { if ( buf->rec[k].skip & SKIP_DONE ) continue; bcf1_t *line = reader->buffer[k]; - fprintf(pysam_stderr,"\t"); - if ( buf->rec[k].skip ) fprintf(pysam_stderr,"["); // this record will not be merged in this round + fprintf(bcftools_stderr,"\t"); + if ( buf->rec[k].skip ) fprintf(bcftools_stderr,"["); // this record will not be merged in this round for (l=0; ln_allele; l++) - fprintf(pysam_stderr,"%s%s", l==0?"":",", line->d.allele[l]); - if ( buf->rec[k].skip ) fprintf(pysam_stderr,"]"); + fprintf(bcftools_stderr,"%s%s", l==0?"":",", line->d.allele[l]); + if ( buf->rec[k].skip ) fprintf(bcftools_stderr,"]"); } - fprintf(pysam_stderr,"\n"); + fprintf(bcftools_stderr,"\n"); } - fprintf(pysam_stderr," counts: "); - for (j=0; jnals; j++) fprintf(pysam_stderr,"%s %dx %s", j==0?"":",",maux->cnt[j], maux->als[j]); - fprintf(pysam_stderr,"\n\n"); + fprintf(bcftools_stderr," counts: "); + for (j=0; jnals; j++) fprintf(bcftools_stderr,"%s %dx %s", j==0?"":",",maux->cnt[j], maux->als[j]); + fprintf(bcftools_stderr,"\n\n"); } void debug_state(args_t *args) @@ -1999,23 +1999,23 @@ void debug_state(args_t *args) int i,j; for (i=0; ifiles->nreaders; i++) { - fprintf(pysam_stderr,"reader %d:\tcur,beg,end=% d,%d,%d", i,maux->buf[i].cur,maux->buf[i].beg,maux->buf[i].end); + fprintf(bcftools_stderr,"reader %d:\tcur,beg,end=% d,%d,%d", i,maux->buf[i].cur,maux->buf[i].beg,maux->buf[i].end); if ( maux->buf[i].cur >=0 ) { bcf_hdr_t *hdr = 
bcf_sr_get_header(args->files,i); const char *chr = bcf_hdr_id2name(hdr, maux->buf[i].rid); - fprintf(pysam_stderr,"\t"); - for (j=maux->buf[i].beg; jbuf[i].end; j++) fprintf(pysam_stderr," %s:%d",chr,maux->buf[i].lines[j]->pos+1); + fprintf(bcftools_stderr,"\t"); + for (j=maux->buf[i].beg; jbuf[i].end; j++) fprintf(bcftools_stderr," %s:%d",chr,maux->buf[i].lines[j]->pos+1); } - fprintf(pysam_stderr,"\n"); + fprintf(bcftools_stderr,"\n"); } for (i=0; ifiles->nreaders; i++) { - fprintf(pysam_stderr,"reader %d:\tgvcf_active=%d", i,maux->gvcf[i].active); - if ( maux->gvcf[i].active ) fprintf(pysam_stderr,"\tpos,end=%d,%d", maux->gvcf[i].line->pos+1,maux->gvcf[i].end+1); - fprintf(pysam_stderr,"\n"); + fprintf(bcftools_stderr,"reader %d:\tgvcf_active=%d", i,maux->gvcf[i].active); + if ( maux->gvcf[i].active ) fprintf(bcftools_stderr,"\tpos,end=%d,%d", maux->gvcf[i].line->pos+1,maux->gvcf[i].end+1); + fprintf(bcftools_stderr,"\n"); } - fprintf(pysam_stderr,"\n"); + fprintf(bcftools_stderr,"\n"); } @@ -2350,30 +2350,30 @@ void merge_vcf(args_t *args) static void usage(void) { - fprintf(pysam_stderr, "\n"); - fprintf(pysam_stderr, "About: Merge multiple VCF/BCF files from non-overlapping sample sets to create one multi-sample file.\n"); - fprintf(pysam_stderr, " Note that only records from different files can be merged, never from the same file. 
For\n"); - fprintf(pysam_stderr, " \"vertical\" merge take a look at \"bcftools norm\" instead.\n"); - fprintf(pysam_stderr, "Usage: bcftools merge [options] [...]\n"); - fprintf(pysam_stderr, "\n"); - fprintf(pysam_stderr, "Options:\n"); - fprintf(pysam_stderr, " --force-samples resolve duplicate sample names\n"); - fprintf(pysam_stderr, " --print-header print only the merged header and exit\n"); - fprintf(pysam_stderr, " --use-header use the provided header\n"); - fprintf(pysam_stderr, " -0 --missing-to-ref assume genotypes at missing sites are 0/0\n"); - fprintf(pysam_stderr, " -f, --apply-filters require at least one of the listed FILTER strings (e.g. \"PASS,.\")\n"); - fprintf(pysam_stderr, " -F, --filter-logic remove filters if some input is PASS (\"x\"), or apply all filters (\"+\") [+]\n"); - fprintf(pysam_stderr, " -g, --gvcf <-|ref.fa> merge gVCF blocks, INFO/END tag is expected. Implies -i QS:sum,MinDP:min,I16:sum,IDV:max,IMF:max\n"); - fprintf(pysam_stderr, " -i, --info-rules rules for merging INFO fields (method is one of sum,avg,min,max,join) or \"-\" to turn off the default [DP:sum,DP4:sum]\n"); - fprintf(pysam_stderr, " -l, --file-list read file names from the file\n"); - fprintf(pysam_stderr, " -m, --merge allow multiallelic records for , see man page for details [both]\n"); - fprintf(pysam_stderr, " --no-version do not append version and command line to the header\n"); - fprintf(pysam_stderr, " -o, --output write output to a file [standard output]\n"); - fprintf(pysam_stderr, " -O, --output-type 'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]\n"); - fprintf(pysam_stderr, " -r, --regions restrict to comma-separated list of regions\n"); - fprintf(pysam_stderr, " -R, --regions-file restrict to regions listed in a file\n"); - fprintf(pysam_stderr, " --threads number of extra output compression threads [0]\n"); - fprintf(pysam_stderr, "\n"); + fprintf(bcftools_stderr, "\n"); + fprintf(bcftools_stderr, "About: 
Merge multiple VCF/BCF files from non-overlapping sample sets to create one multi-sample file.\n"); + fprintf(bcftools_stderr, " Note that only records from different files can be merged, never from the same file. For\n"); + fprintf(bcftools_stderr, " \"vertical\" merge take a look at \"bcftools norm\" instead.\n"); + fprintf(bcftools_stderr, "Usage: bcftools merge [options] [...]\n"); + fprintf(bcftools_stderr, "\n"); + fprintf(bcftools_stderr, "Options:\n"); + fprintf(bcftools_stderr, " --force-samples resolve duplicate sample names\n"); + fprintf(bcftools_stderr, " --print-header print only the merged header and exit\n"); + fprintf(bcftools_stderr, " --use-header use the provided header\n"); + fprintf(bcftools_stderr, " -0 --missing-to-ref assume genotypes at missing sites are 0/0\n"); + fprintf(bcftools_stderr, " -f, --apply-filters require at least one of the listed FILTER strings (e.g. \"PASS,.\")\n"); + fprintf(bcftools_stderr, " -F, --filter-logic remove filters if some input is PASS (\"x\"), or apply all filters (\"+\") [+]\n"); + fprintf(bcftools_stderr, " -g, --gvcf <-|ref.fa> merge gVCF blocks, INFO/END tag is expected. 
Implies -i QS:sum,MinDP:min,I16:sum,IDV:max,IMF:max\n"); + fprintf(bcftools_stderr, " -i, --info-rules rules for merging INFO fields (method is one of sum,avg,min,max,join) or \"-\" to turn off the default [DP:sum,DP4:sum]\n"); + fprintf(bcftools_stderr, " -l, --file-list read file names from the file\n"); + fprintf(bcftools_stderr, " -m, --merge allow multiallelic records for , see man page for details [both]\n"); + fprintf(bcftools_stderr, " --no-version do not append version and command line to the header\n"); + fprintf(bcftools_stderr, " -o, --output write output to a file [standard output]\n"); + fprintf(bcftools_stderr, " -O, --output-type 'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]\n"); + fprintf(bcftools_stderr, " -r, --regions restrict to comma-separated list of regions\n"); + fprintf(bcftools_stderr, " -R, --regions-file restrict to regions listed in a file\n"); + fprintf(bcftools_stderr, " --threads number of extra output compression threads [0]\n"); + fprintf(bcftools_stderr, "\n"); exit(1); } diff --git a/bcftools/vcfnorm.c.pysam.c b/bcftools/vcfnorm.c.pysam.c index 9308e6b..5e3a5fb 100644 --- a/bcftools/vcfnorm.c.pysam.c +++ b/bcftools/vcfnorm.c.pysam.c @@ -1,4 +1,4 @@ -#include "pysam.h" +#include "bcftools.pysam.h" /* vcfnorm.c -- Left-align and normalize indels. @@ -294,7 +294,7 @@ static int realign(args_t *args, bcf1_t *line) if ( args->check_ref==CHECK_REF_EXIT ) error("Non-ACGTN reference allele at %s:%d .. 
REF_SEQ:'%s' vs VCF:'%s'\n", bcf_seqname(args->hdr,line),line->pos+1,ref,line->d.allele[0]); if ( args->check_ref & CHECK_REF_WARN ) - fprintf(pysam_stderr,"NON_ACGTN_REF\t%s\t%d\t%s\n", bcf_seqname(args->hdr,line),line->pos+1,line->d.allele[0]); + fprintf(bcftools_stderr,"NON_ACGTN_REF\t%s\t%d\t%s\n", bcf_seqname(args->hdr,line),line->pos+1,line->d.allele[0]); free(ref); return ERR_REF_MISMATCH; } @@ -303,7 +303,7 @@ static int realign(args_t *args, bcf1_t *line) if ( args->check_ref==CHECK_REF_EXIT ) error("Reference allele mismatch at %s:%d .. REF_SEQ:'%s' vs VCF:'%s'\n", bcf_seqname(args->hdr,line),line->pos+1,ref,line->d.allele[0]); if ( args->check_ref & CHECK_REF_WARN ) - fprintf(pysam_stderr,"REF_MISMATCH\t%s\t%d\t%s\n", bcf_seqname(args->hdr,line),line->pos+1,line->d.allele[0]); + fprintf(bcftools_stderr,"REF_MISMATCH\t%s\t%d\t%s\n", bcf_seqname(args->hdr,line),line->pos+1,line->d.allele[0]); free(ref); return ERR_REF_MISMATCH; } @@ -333,7 +333,7 @@ static int realign(args_t *args, bcf1_t *line) if ( args->check_ref==CHECK_REF_EXIT ) error("Non-ACGTN alternate allele at %s:%d .. 
REF_SEQ:'%s' vs VCF:'%s'\n", bcf_seqname(args->hdr,line),line->pos+1,ref,line->d.allele[i]); if ( args->check_ref & CHECK_REF_WARN ) - fprintf(pysam_stderr,"NON_ACGTN_ALT\t%s\t%d\t%s\n", bcf_seqname(args->hdr,line),line->pos+1,line->d.allele[i]); + fprintf(bcftools_stderr,"NON_ACGTN_ALT\t%s\t%d\t%s\n", bcf_seqname(args->hdr,line),line->pos+1,line->d.allele[i]); return ERR_REF_MISMATCH; } @@ -917,7 +917,7 @@ static void merge_info_numeric(args_t *args, bcf1_t **lines, int nlines, bcf_inf { \ /* expecting diploid gt in INFO */ \ if (nvals_ori!=lines[0]->n_allele*(lines[0]->n_allele+1)/2) { \ - fprintf(pysam_stderr, "todo: merge Number=G INFO fields for haploid sites\n"); \ + fprintf(bcftools_stderr, "todo: merge Number=G INFO fields for haploid sites\n"); \ error("vcfnorm: number of fields in first record at position %s:%d for INFO tag %s not as expected [found: %d vs expected:%d]\n", bcf_seqname(args->hdr,lines[0]),lines[0]->pos+1, tag, nvals_ori, lines[0]->n_allele*(lines[0]->n_allele+1)/2); \ } \ int nvals = dst->n_allele*(dst->n_allele+1)/2; \ @@ -1640,7 +1640,7 @@ static void normalize_line(args_t *args, bcf1_t **line_ptr) else if ( args->check_ref==CHECK_REF_EXIT ) error("Duplicate alleles at %s:%d; run with -cw to turn the error into warning or with -cs to fix.\n", bcf_seqname(args->hdr,line),line->pos+1); else if ( args->check_ref & CHECK_REF_WARN ) - fprintf(pysam_stderr,"ALT_DUP\t%s\t%d\n", bcf_seqname(args->hdr,line),line->pos+1); + fprintf(bcftools_stderr,"ALT_DUP\t%s\t%d\n", bcf_seqname(args->hdr,line),line->pos+1); } } } @@ -1729,37 +1729,37 @@ static void normalize_vcf(args_t *args) flush_buffer(args, out, args->rbuf.n); hts_close(out); - fprintf(pysam_stderr,"Lines total/split/realigned/skipped:\t%d/%d/%d/%d\n", args->ntotal,args->nsplit,args->nchanged,args->nskipped); + fprintf(bcftools_stderr,"Lines total/split/realigned/skipped:\t%d/%d/%d/%d\n", args->ntotal,args->nsplit,args->nchanged,args->nskipped); if ( args->check_ref & CHECK_REF_FIX ) - 
fprintf(pysam_stderr,"REF/ALT total/modified/added: \t%d/%d/%d\n", args->nref.tot,args->nref.swap,args->nref.set); + fprintf(bcftools_stderr,"REF/ALT total/modified/added: \t%d/%d/%d\n", args->nref.tot,args->nref.swap,args->nref.set); } static void usage(void) { - fprintf(pysam_stderr, "\n"); - fprintf(pysam_stderr, "About: Left-align and normalize indels; check if REF alleles match the reference;\n"); - fprintf(pysam_stderr, " split multiallelic sites into multiple rows; recover multiallelics from\n"); - fprintf(pysam_stderr, " multiple rows.\n"); - fprintf(pysam_stderr, "Usage: bcftools norm [options] \n"); - fprintf(pysam_stderr, "\n"); - fprintf(pysam_stderr, "Options:\n"); - fprintf(pysam_stderr, " -c, --check-ref check REF alleles and exit (e), warn (w), exclude (x), or set (s) bad sites [e]\n"); - fprintf(pysam_stderr, " -D, --remove-duplicates remove duplicate lines of the same type.\n"); - fprintf(pysam_stderr, " -d, --rm-dup remove duplicate snps|indels|both|any\n"); - fprintf(pysam_stderr, " -f, --fasta-ref reference sequence (MANDATORY)\n"); - fprintf(pysam_stderr, " -m, --multiallelics <-|+>[type] split multiallelics (-) or join biallelics (+), type: snps|indels|both|any [both]\n"); - fprintf(pysam_stderr, " --no-version do not append version and command line to the header\n"); - fprintf(pysam_stderr, " -N, --do-not-normalize do not normalize indels (with -m or -c s)\n"); - fprintf(pysam_stderr, " -o, --output write output to a file [standard output]\n"); - fprintf(pysam_stderr, " -O, --output-type 'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]\n"); - fprintf(pysam_stderr, " -r, --regions restrict to comma-separated list of regions\n"); - fprintf(pysam_stderr, " -R, --regions-file restrict to regions listed in a file\n"); - fprintf(pysam_stderr, " -s, --strict-filter when merging (-m+), merged site is PASS only if all sites being merged PASS\n"); - fprintf(pysam_stderr, " -t, --targets similar to -r but streams 
rather than index-jumps\n"); - fprintf(pysam_stderr, " -T, --targets-file similar to -R but streams rather than index-jumps\n"); - fprintf(pysam_stderr, " --threads number of extra (de)compression threads [0]\n"); - fprintf(pysam_stderr, " -w, --site-win buffer for sorting lines which changed position during realignment [1000]\n"); - fprintf(pysam_stderr, "\n"); + fprintf(bcftools_stderr, "\n"); + fprintf(bcftools_stderr, "About: Left-align and normalize indels; check if REF alleles match the reference;\n"); + fprintf(bcftools_stderr, " split multiallelic sites into multiple rows; recover multiallelics from\n"); + fprintf(bcftools_stderr, " multiple rows.\n"); + fprintf(bcftools_stderr, "Usage: bcftools norm [options] \n"); + fprintf(bcftools_stderr, "\n"); + fprintf(bcftools_stderr, "Options:\n"); + fprintf(bcftools_stderr, " -c, --check-ref check REF alleles and exit (e), warn (w), exclude (x), or set (s) bad sites [e]\n"); + fprintf(bcftools_stderr, " -D, --remove-duplicates remove duplicate lines of the same type.\n"); + fprintf(bcftools_stderr, " -d, --rm-dup remove duplicate snps|indels|both|any\n"); + fprintf(bcftools_stderr, " -f, --fasta-ref reference sequence (MANDATORY)\n"); + fprintf(bcftools_stderr, " -m, --multiallelics <-|+>[type] split multiallelics (-) or join biallelics (+), type: snps|indels|both|any [both]\n"); + fprintf(bcftools_stderr, " --no-version do not append version and command line to the header\n"); + fprintf(bcftools_stderr, " -N, --do-not-normalize do not normalize indels (with -m or -c s)\n"); + fprintf(bcftools_stderr, " -o, --output write output to a file [standard output]\n"); + fprintf(bcftools_stderr, " -O, --output-type 'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]\n"); + fprintf(bcftools_stderr, " -r, --regions restrict to comma-separated list of regions\n"); + fprintf(bcftools_stderr, " -R, --regions-file restrict to regions listed in a file\n"); + fprintf(bcftools_stderr, " -s, 
--strict-filter when merging (-m+), merged site is PASS only if all sites being merged PASS\n"); + fprintf(bcftools_stderr, " -t, --targets similar to -r but streams rather than index-jumps\n"); + fprintf(bcftools_stderr, " -T, --targets-file similar to -R but streams rather than index-jumps\n"); + fprintf(bcftools_stderr, " --threads number of extra (de)compression threads [0]\n"); + fprintf(bcftools_stderr, " -w, --site-win buffer for sorting lines which changed position during realignment [1000]\n"); + fprintf(bcftools_stderr, "\n"); exit(1); } @@ -1842,7 +1842,7 @@ int main_vcfnorm(int argc, char *argv[]) break; case 'o': args->output_fname = optarg; break; case 'D': - fprintf(pysam_stderr,"Warning: `-D` is functional but deprecated, replaced by `-d both`.\n"); + fprintf(bcftools_stderr,"Warning: `-D` is functional but deprecated, replaced by `-d both`.\n"); args->rmdup = COLLAPSE_NONE<<1; break; case 's': args->strict_filter = 1; break; diff --git a/bcftools/vcfplugin.c.pysam.c b/bcftools/vcfplugin.c.pysam.c index 6ea7eb6..f8e393b 100644 --- a/bcftools/vcfplugin.c.pysam.c +++ b/bcftools/vcfplugin.c.pysam.c @@ -1,4 +1,4 @@ -#include "pysam.h" +#include "bcftools.pysam.h" /* vcfplugin.c -- plugin modules for operating on VCF/BCF files. @@ -176,11 +176,11 @@ static void add_plugin_paths(args_t *args, const char *path) args->plugin_paths = (char**) realloc(args->plugin_paths,sizeof(char*)*(args->nplugin_paths+1)); args->plugin_paths[args->nplugin_paths] = dir; args->nplugin_paths++; - if ( args->verbose > 1 ) fprintf(pysam_stderr, "plugin directory %s .. ok\n", dir); + if ( args->verbose > 1 ) fprintf(bcftools_stderr, "plugin directory %s .. ok\n", dir); } else { - if ( args->verbose > 1 ) fprintf(pysam_stderr, "plugin directory %s .. %s\n", dir, strerror(errno)); + if ( args->verbose > 1 ) fprintf(bcftools_stderr, "plugin directory %s .. 
%s\n", dir, strerror(errno)); free(dir); } @@ -218,8 +218,8 @@ static void *dlopen_plugin(args_t *args, const char *fname) handle = dlopen(tmp, RTLD_NOW); // valgrind complains about unfreed memory, not our problem though if ( args->verbose > 1 ) { - if ( !handle ) fprintf(pysam_stderr,"%s:\n\tdlopen .. %s\n", tmp,dlerror()); - else fprintf(pysam_stderr,"%s:\n\tdlopen .. ok\n", tmp); + if ( !handle ) fprintf(bcftools_stderr,"%s:\n\tdlopen .. %s\n", tmp,dlerror()); + else fprintf(bcftools_stderr,"%s:\n\tdlopen .. ok\n", tmp); } free(tmp); if ( handle ) return handle; @@ -229,8 +229,8 @@ static void *dlopen_plugin(args_t *args, const char *fname) handle = dlopen(fname, RTLD_NOW); if ( args->verbose > 1 ) { - if ( !handle ) fprintf(pysam_stderr,"%s:\n\tdlopen .. %s\n", fname,dlerror()); - else fprintf(pysam_stderr,"%s:\n\tdlopen .. ok\n", fname); + if ( !handle ) fprintf(bcftools_stderr,"%s:\n\tdlopen .. %s\n", fname,dlerror()); + else fprintf(bcftools_stderr,"%s:\n\tdlopen .. ok\n", fname); } return handle; @@ -238,11 +238,11 @@ static void *dlopen_plugin(args_t *args, const char *fname) static void print_plugin_usage_hint(void) { - fprintf(pysam_stderr, "\nNo functional bcftools plugins were found"); + fprintf(bcftools_stderr, "\nNo functional bcftools plugins were found"); if ( !getenv("BCFTOOLS_PLUGINS") ) - fprintf(pysam_stderr,". The environment variable BCFTOOLS_PLUGINS is not set.\n\n"); + fprintf(bcftools_stderr,". The environment variable BCFTOOLS_PLUGINS is not set.\n\n"); else - fprintf(pysam_stderr, + fprintf(bcftools_stderr, " in\n\tBCFTOOLS_PLUGINS=\"%s\".\n\n" "- Is the plugin path correct?\n\n" "- Run \"bcftools plugin -lv\" for more detailed error output.\n" @@ -272,19 +272,19 @@ static int load_plugin(args_t *args, const char *fname, int exit_on_error, plugi if ( ret ) plugin->init = NULL; else - if ( args->verbose > 1 ) fprintf(pysam_stderr,"\tinit .. ok\n"); + if ( args->verbose > 1 ) fprintf(bcftools_stderr,"\tinit .. 
ok\n"); plugin->run = (dl_run_f) dlsym(plugin->handle, "run"); ret = dlerror(); if ( ret ) plugin->run = NULL; else - if ( args->verbose > 1 ) fprintf(pysam_stderr,"\trun .. ok\n"); + if ( args->verbose > 1 ) fprintf(bcftools_stderr,"\trun .. ok\n"); if ( !plugin->init && !plugin->run ) { if ( exit_on_error ) error("Could not initialize %s, neither run or init found \n", plugin->name); - else if ( args->verbose > 1 ) fprintf(pysam_stderr,"\tinit/run .. not found\n"); + else if ( args->verbose > 1 ) fprintf(bcftools_stderr,"\tinit/run .. not found\n"); return -1; } @@ -293,7 +293,7 @@ static int load_plugin(args_t *args, const char *fname, int exit_on_error, plugi if ( ret ) { if ( exit_on_error ) error("Could not initialize %s, version string not found\n", plugin->name); - else if ( args->verbose > 1 ) fprintf(pysam_stderr,"\tversion .. not found\n"); + else if ( args->verbose > 1 ) fprintf(bcftools_stderr,"\tversion .. not found\n"); return -1; } @@ -341,12 +341,12 @@ static void init_plugin(args_t *args) args->plugin.version(&bver, &hver); if ( strcmp(bver,bcftools_version()) && !warned_bcftools ) { - fprintf(pysam_stderr,"WARNING: bcftools version mismatch .. bcftools at %s, the plugin \"%s\" at %s\n", bcftools_version(),args->plugin.name,bver); + fprintf(bcftools_stderr,"WARNING: bcftools version mismatch .. bcftools at %s, the plugin \"%s\" at %s\n", bcftools_version(),args->plugin.name,bver); warned_bcftools = 1; } if ( strcmp(hver,hts_version()) && !warned_htslib ) { - fprintf(pysam_stderr,"WARNING: htslib version mismatch .. bcftools at %s, the plugin \"%s\" at %s\n", hts_version(),args->plugin.name,hver); + fprintf(bcftools_stderr,"WARNING: htslib version mismatch .. 
bcftools at %s, the plugin \"%s\" at %s\n", hts_version(),args->plugin.name,hver); warned_htslib = 1; } args->drop_header += ret; @@ -401,11 +401,11 @@ static int list_plugins(args_t *args) for (i=0; iverbose ) - fprintf(pysam_stdout, "\n-- %s --\n%s", plugins[i].name, plugins[i].about()); + fprintf(bcftools_stdout, "\n-- %s --\n%s", plugins[i].name, plugins[i].about()); else - fprintf(pysam_stdout, "%s\n", plugins[i].name); + fprintf(bcftools_stdout, "%s\n", plugins[i].name); } - if ( args->verbose ) fprintf(pysam_stdout, "\n"); + if ( args->verbose ) fprintf(bcftools_stdout, "\n"); } else print_plugin_usage_hint(); @@ -452,29 +452,29 @@ static void destroy_data(args_t *args) static void usage(args_t *args) { - fprintf(pysam_stderr, "\n"); - fprintf(pysam_stderr, "About: Run user defined plugin\n"); - fprintf(pysam_stderr, "Usage: bcftools plugin [OPTIONS] [-- PLUGIN_OPTIONS]\n"); - fprintf(pysam_stderr, " bcftools +name [OPTIONS] [-- PLUGIN_OPTIONS]\n"); - fprintf(pysam_stderr, "\n"); - fprintf(pysam_stderr, "VCF input options:\n"); - fprintf(pysam_stderr, " -e, --exclude exclude sites for which the expression is true\n"); - fprintf(pysam_stderr, " -i, --include select sites for which the expression is true\n"); - fprintf(pysam_stderr, " -r, --regions restrict to comma-separated list of regions\n"); - fprintf(pysam_stderr, " -R, --regions-file restrict to regions listed in a file\n"); - fprintf(pysam_stderr, " -t, --targets similar to -r but streams rather than index-jumps\n"); - fprintf(pysam_stderr, " -T, --targets-file similar to -R but streams rather than index-jumps\n"); - fprintf(pysam_stderr, "VCF output options:\n"); - fprintf(pysam_stderr, " --no-version do not append version and command line to the header\n"); - fprintf(pysam_stderr, " -o, --output write output to a file [standard output]\n"); - fprintf(pysam_stderr, " -O, --output-type 'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]\n"); - fprintf(pysam_stderr, " 
--threads number of extra output compression threads [0]\n"); - fprintf(pysam_stderr, "Plugin options:\n"); - fprintf(pysam_stderr, " -h, --help list plugin's options\n"); - fprintf(pysam_stderr, " -l, --list-plugins list available plugins. See BCFTOOLS_PLUGINS environment variable and man page for details\n"); - fprintf(pysam_stderr, " -v, --verbose print verbose information, -vv increases verbosity\n"); - fprintf(pysam_stderr, " -V, --version print version string and exit\n"); - fprintf(pysam_stderr, "\n"); + fprintf(bcftools_stderr, "\n"); + fprintf(bcftools_stderr, "About: Run user defined plugin\n"); + fprintf(bcftools_stderr, "Usage: bcftools plugin [OPTIONS] [-- PLUGIN_OPTIONS]\n"); + fprintf(bcftools_stderr, " bcftools +name [OPTIONS] [-- PLUGIN_OPTIONS]\n"); + fprintf(bcftools_stderr, "\n"); + fprintf(bcftools_stderr, "VCF input options:\n"); + fprintf(bcftools_stderr, " -e, --exclude exclude sites for which the expression is true\n"); + fprintf(bcftools_stderr, " -i, --include select sites for which the expression is true\n"); + fprintf(bcftools_stderr, " -r, --regions restrict to comma-separated list of regions\n"); + fprintf(bcftools_stderr, " -R, --regions-file restrict to regions listed in a file\n"); + fprintf(bcftools_stderr, " -t, --targets similar to -r but streams rather than index-jumps\n"); + fprintf(bcftools_stderr, " -T, --targets-file similar to -R but streams rather than index-jumps\n"); + fprintf(bcftools_stderr, "VCF output options:\n"); + fprintf(bcftools_stderr, " --no-version do not append version and command line to the header\n"); + fprintf(bcftools_stderr, " -o, --output write output to a file [standard output]\n"); + fprintf(bcftools_stderr, " -O, --output-type 'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]\n"); + fprintf(bcftools_stderr, " --threads number of extra output compression threads [0]\n"); + fprintf(bcftools_stderr, "Plugin options:\n"); + fprintf(bcftools_stderr, " -h, --help 
list plugin's options\n"); + fprintf(bcftools_stderr, " -l, --list-plugins list available plugins. See BCFTOOLS_PLUGINS environment variable and man page for details\n"); + fprintf(bcftools_stderr, " -v, --verbose print verbose information, -vv increases verbosity\n"); + fprintf(bcftools_stderr, " -V, --version print version string and exit\n"); + fprintf(bcftools_stderr, "\n"); exit(1); } @@ -584,17 +584,17 @@ int main_plugin(int argc, char *argv[]) { const char *bver, *hver; args->plugin.version(&bver, &hver); - fprintf(pysam_stdout, "bcftools %s using htslib %s\n", bcftools_version(), hts_version()); - fprintf(pysam_stdout, "plugin at %s using htslib %s\n\n", bver, hver); + fprintf(bcftools_stdout, "bcftools %s using htslib %s\n", bcftools_version(), hts_version()); + fprintf(bcftools_stdout, "plugin at %s using htslib %s\n\n", bver, hver); return 0; } if ( usage_only ) { if ( args->plugin.usage ) - fprintf(pysam_stderr,"%s",args->plugin.usage()); + fprintf(bcftools_stderr,"%s",args->plugin.usage()); else - fprintf(pysam_stderr,"Usage: bcftools +%s [General Options] -- [Plugin Options]\n",plugin_name); + fprintf(bcftools_stderr,"Usage: bcftools +%s [General Options] -- [Plugin Options]\n",plugin_name); return 0; } @@ -651,7 +651,7 @@ int main_plugin(int argc, char *argv[]) int main_plugin(int argc, char *argv[]) { - fprintf(pysam_stderr, "bcftools plugins are disabled. To use them, you will need to rebuild\n" + fprintf(bcftools_stderr, "bcftools plugins are disabled. To use them, you will need to rebuild\n" "bcftools from the source distribution with plugins enabled.\n"); return 1; } diff --git a/bcftools/vcfquery.c.pysam.c b/bcftools/vcfquery.c.pysam.c index 8fd7cf0..e9100e6 100644 --- a/bcftools/vcfquery.c.pysam.c +++ b/bcftools/vcfquery.c.pysam.c @@ -1,4 +1,4 @@ -#include "pysam.h" +#include "bcftools.pysam.h" /* vcfquery.c -- Extracts fields from VCF/BCF file. 
@@ -169,7 +169,7 @@ static void list_columns(args_t *args) for (i=0; iheader); i++) { if ( has_sample && !khash_str2int_has_key(has_sample, reader->header->samples[i]) ) continue; - fprintf(pysam_stdout, "%s\n", reader->header->samples[i]); + fprintf(bcftools_stdout, "%s\n", reader->header->samples[i]); } if ( has_sample ) @@ -195,30 +195,30 @@ static int compare_header(bcf_hdr_t *hdr, char **a, int na, char **b, int nb) static void usage(void) { - fprintf(pysam_stderr, "\n"); - fprintf(pysam_stderr, "About: Extracts fields from VCF/BCF file and prints them in user-defined format\n"); - fprintf(pysam_stderr, "Usage: bcftools query [options] [ [...]]\n"); - fprintf(pysam_stderr, "\n"); - fprintf(pysam_stderr, "Options:\n"); - fprintf(pysam_stderr, " -c, --collapse collapse lines with duplicate positions for , see man page [none]\n"); - fprintf(pysam_stderr, " -e, --exclude exclude sites for which the expression is true (see man page for details)\n"); - fprintf(pysam_stderr, " -f, --format see man page for details\n"); - fprintf(pysam_stderr, " -H, --print-header print header\n"); - fprintf(pysam_stderr, " -i, --include select sites for which the expression is true (see man page for details)\n"); - fprintf(pysam_stderr, " -l, --list-samples print the list of samples and exit\n"); - fprintf(pysam_stderr, " -o, --output-file output file name [pysam_stdout]\n"); - fprintf(pysam_stderr, " -r, --regions restrict to comma-separated list of regions\n"); - fprintf(pysam_stderr, " -R, --regions-file restrict to regions listed in a file\n"); - fprintf(pysam_stderr, " -s, --samples list of samples to include\n"); - fprintf(pysam_stderr, " -S, --samples-file file of samples to include\n"); - fprintf(pysam_stderr, " -t, --targets similar to -r but streams rather than index-jumps\n"); - fprintf(pysam_stderr, " -T, --targets-file similar to -R but streams rather than index-jumps\n"); - fprintf(pysam_stderr, " -u, --allow-undef-tags print \".\" for undefined tags\n"); - 
fprintf(pysam_stderr, " -v, --vcf-list process multiple VCFs listed in the file\n"); - fprintf(pysam_stderr, "\n"); - fprintf(pysam_stderr, "Examples:\n"); - fprintf(pysam_stderr, "\tbcftools query -f '%%CHROM\\t%%POS\\t%%REF\\t%%ALT[\\t%%SAMPLE=%%GT]\\n' file.vcf.gz\n"); - fprintf(pysam_stderr, "\n"); + fprintf(bcftools_stderr, "\n"); + fprintf(bcftools_stderr, "About: Extracts fields from VCF/BCF file and prints them in user-defined format\n"); + fprintf(bcftools_stderr, "Usage: bcftools query [options] [ [...]]\n"); + fprintf(bcftools_stderr, "\n"); + fprintf(bcftools_stderr, "Options:\n"); + fprintf(bcftools_stderr, " -c, --collapse collapse lines with duplicate positions for , see man page [none]\n"); + fprintf(bcftools_stderr, " -e, --exclude exclude sites for which the expression is true (see man page for details)\n"); + fprintf(bcftools_stderr, " -f, --format see man page for details\n"); + fprintf(bcftools_stderr, " -H, --print-header print header\n"); + fprintf(bcftools_stderr, " -i, --include select sites for which the expression is true (see man page for details)\n"); + fprintf(bcftools_stderr, " -l, --list-samples print the list of samples and exit\n"); + fprintf(bcftools_stderr, " -o, --output-file output file name [bcftools_stdout]\n"); + fprintf(bcftools_stderr, " -r, --regions restrict to comma-separated list of regions\n"); + fprintf(bcftools_stderr, " -R, --regions-file restrict to regions listed in a file\n"); + fprintf(bcftools_stderr, " -s, --samples list of samples to include\n"); + fprintf(bcftools_stderr, " -S, --samples-file file of samples to include\n"); + fprintf(bcftools_stderr, " -t, --targets similar to -r but streams rather than index-jumps\n"); + fprintf(bcftools_stderr, " -T, --targets-file similar to -R but streams rather than index-jumps\n"); + fprintf(bcftools_stderr, " -u, --allow-undef-tags print \".\" for undefined tags\n"); + fprintf(bcftools_stderr, " -v, --vcf-list process multiple VCFs listed in the file\n"); + 
fprintf(bcftools_stderr, "\n"); + fprintf(bcftools_stderr, "Examples:\n"); + fprintf(bcftools_stderr, "\tbcftools query -f '%%CHROM\\t%%POS\\t%%REF\\t%%ALT[\\t%%SAMPLE=%%GT]\\n' file.vcf.gz\n"); + fprintf(bcftools_stderr, "\n"); exit(1); } @@ -317,7 +317,7 @@ int main_vcfquery(int argc, char *argv[]) } if ( !args->format_str ) usage(); - args->out = args->fn_out ? fopen(args->fn_out, "w") : pysam_stdout; + args->out = args->fn_out ? fopen(args->fn_out, "w") : bcftools_stdout; if ( !args->out ) error("%s: %s\n", args->fn_out,strerror(errno)); if ( !args->vcf_list ) diff --git a/bcftools/vcfroh.c.pysam.c b/bcftools/vcfroh.c.pysam.c index 2bf45b7..b303a40 100644 --- a/bcftools/vcfroh.c.pysam.c +++ b/bcftools/vcfroh.c.pysam.c @@ -1,4 +1,4 @@ -#include "pysam.h" +#include "bcftools.pysam.h" /* vcfroh.c -- HMM model for detecting runs of autozygosity. @@ -243,11 +243,11 @@ static void init_data(args_t *args) if ( args->nbuf_olap<0 ) args->nbuf_olap = args->nbuf_max*0.01; } - fprintf(pysam_stderr,"Number of target samples: %d\n", args->roh_smpl->n); - fprintf(pysam_stderr,"Number of --estimate-AF samples: %d\n", args->af_smpl ? args->af_smpl->n : (args->estimate_AF ? bcf_hdr_nsamples(args->hdr) : 0)); - fprintf(pysam_stderr,"Number of sites in the buffer/overlap: "); - if ( args->nbuf_max ) fprintf(pysam_stderr,"%d/%d\n", args->nbuf_max,args->nbuf_olap); - else fprintf(pysam_stderr,"unlimited\n"); + fprintf(bcftools_stderr,"Number of target samples: %d\n", args->roh_smpl->n); + fprintf(bcftools_stderr,"Number of --estimate-AF samples: %d\n", args->af_smpl ? args->af_smpl->n : (args->estimate_AF ? 
bcf_hdr_nsamples(args->hdr) : 0)); + fprintf(bcftools_stderr,"Number of sites in the buffer/overlap: "); + if ( args->nbuf_max ) fprintf(bcftools_stderr,"%d/%d\n", args->nbuf_max,args->nbuf_olap); + else fprintf(bcftools_stderr,"unlimited\n"); args->smpl = (smpl_t*) calloc(args->roh_smpl->n,sizeof(smpl_t)); @@ -266,7 +266,7 @@ static void init_data(args_t *args) else if ( args->rec_rate > 0 ) hmm_set_tprob_func(args->hmm, set_tprob_rrate, args); - args->out = bgzf_open(strcmp("pysam_stdout",args->output_fname)?args->output_fname:"-", args->output_type&OUTPUT_GZ ? "wg" : "wu"); + args->out = bgzf_open(strcmp("bcftools_stdout",args->output_fname)?args->output_fname:"-", args->output_type&OUTPUT_GZ ? "wg" : "wu"); if ( !args->out ) error("Failed to open %s: %s\n", args->output_fname, strerror(errno)); // print header @@ -744,7 +744,7 @@ int estimate_AF_from_PL(args_t *args, bcf_fmt_t *fmt_pl, int ial, double *alt_fr case BCF_BT_INT8: BRANCH(int8_t); break; case BCF_BT_INT16: BRANCH(int16_t); break; case BCF_BT_INT32: BRANCH(int32_t); break; - default: fprintf(pysam_stderr,"Unknown format type for PL: %s:%d .. fmt->type=%d\n", __FILE__,__LINE__, fmt_pl->type); exit(1); + default: fprintf(bcftools_stderr,"Unknown format type for PL: %s:%d .. fmt->type=%d\n", __FILE__,__LINE__, fmt_pl->type); exit(1); } #undef BRANCH } @@ -774,7 +774,7 @@ int estimate_AF_from_PL(args_t *args, bcf_fmt_t *fmt_pl, int ial, double *alt_fr case BCF_BT_INT8: BRANCH(int8_t); break; case BCF_BT_INT16: BRANCH(int16_t); break; case BCF_BT_INT32: BRANCH(int32_t); break; - default: fprintf(pysam_stderr,"Unknown format type for PL: %s:%d .. fmt->type=%d\n", __FILE__,__LINE__, fmt_pl->type); exit(1); + default: fprintf(bcftools_stderr,"Unknown format type for PL: %s:%d .. 
fmt->type=%d\n", __FILE__,__LINE__, fmt_pl->type); exit(1); } #undef BRANCH } @@ -915,7 +915,7 @@ int process_line(args_t *args, bcf1_t *line, int ial) case BCF_BT_INT8: BRANCH(int8_t); break; case BCF_BT_INT16: BRANCH(int16_t); break; case BCF_BT_INT32: BRANCH(int32_t); break; - default: fprintf(pysam_stderr,"Unknown format type for PL: %s:%d .. fmt->type=%d\n", __FILE__,__LINE__, fmt_pl->type); exit(1); + default: fprintf(bcftools_stderr,"Unknown format type for PL: %s:%d .. fmt->type=%d\n", __FILE__,__LINE__, fmt_pl->type); exit(1); } #undef BRANCH } @@ -1021,7 +1021,7 @@ static void vcfroh(args_t *args, bcf1_t *line) if ( skip_rid ) { - fprintf(pysam_stderr,"Skipping the sequence, no genmap for %s\n", bcf_seqname(args->hdr,line)); + fprintf(bcftools_stderr,"Skipping the sequence, no genmap for %s\n", bcf_seqname(args->hdr,line)); args->skip_rid = line->rid; return; } @@ -1037,41 +1037,41 @@ static void vcfroh(args_t *args, bcf1_t *line) static void usage(args_t *args) { - fprintf(pysam_stderr, "\n"); - fprintf(pysam_stderr, "About: HMM model for detecting runs of autozygosity.\n"); - fprintf(pysam_stderr, "Usage: bcftools roh [options] \n"); - fprintf(pysam_stderr, "\n"); - fprintf(pysam_stderr, "General Options:\n"); - fprintf(pysam_stderr, " --AF-dflt if AF is not known, use this allele frequency [skip]\n"); - fprintf(pysam_stderr, " --AF-tag use TAG for allele frequency\n"); - fprintf(pysam_stderr, " --AF-file read allele frequencies from file (CHR\\tPOS\\tREF,ALT\\tAF)\n"); - fprintf(pysam_stderr, " -b --buffer-size buffer size and the number of overlapping sites, 0 for unlimited [0]\n"); - fprintf(pysam_stderr, " If the first number is negative, it is interpreted as the maximum memory to\n"); - fprintf(pysam_stderr, " use, in MB. 
The default overlap is set to roughly 1%% of the buffer size.\n"); - fprintf(pysam_stderr, " -e, --estimate-AF [TAG], estimate AF from FORMAT/TAG (GT or PL) of all samples (\"-\") or samples listed\n"); - fprintf(pysam_stderr, " in . If TAG is not given, the frequency is estimated from GT by default\n"); - fprintf(pysam_stderr, " -G, --GTs-only use GTs and ignore PLs, instead using for PL of the two least likely genotypes.\n"); - fprintf(pysam_stderr, " Safe value to use is 30 to account for GT errors.\n"); - fprintf(pysam_stderr, " -i, --ignore-homref skip hom-ref genotypes (0/0)\n"); - fprintf(pysam_stderr, " -I, --skip-indels skip indels as their genotypes are enriched for errors\n"); - fprintf(pysam_stderr, " -m, --genetic-map genetic map in IMPUTE2 format, single file or mask, where string \"{CHROM}\"\n"); - fprintf(pysam_stderr, " is replaced with chromosome name\n"); - fprintf(pysam_stderr, " -M, --rec-rate constant recombination rate per bp\n"); - fprintf(pysam_stderr, " -o, --output write output to a file [standard output]\n"); - fprintf(pysam_stderr, " -O, --output-type [srz] output s:per-site, r:regions, z:compressed [sr]\n"); - fprintf(pysam_stderr, " -r, --regions restrict to comma-separated list of regions\n"); - fprintf(pysam_stderr, " -R, --regions-file restrict to regions listed in a file\n"); - fprintf(pysam_stderr, " -s, --samples list of samples to analyze [all samples]\n"); - fprintf(pysam_stderr, " -S, --samples-file file of samples to analyze [all samples]\n"); - fprintf(pysam_stderr, " -t, --targets similar to -r but streams rather than index-jumps\n"); - fprintf(pysam_stderr, " -T, --targets-file similar to -R but streams rather than index-jumps\n"); - fprintf(pysam_stderr, " --threads number of extra decompression threads [0]\n"); - fprintf(pysam_stderr, "\n"); - fprintf(pysam_stderr, "HMM Options:\n"); - fprintf(pysam_stderr, " -a, --hw-to-az P(AZ|HW) transition probability from HW (Hardy-Weinberg) to AZ (autozygous) state [6.7e-8]\n"); - 
fprintf(pysam_stderr, " -H, --az-to-hw P(HW|AZ) transition probability from AZ to HW state [5e-9]\n"); - fprintf(pysam_stderr, " -V, --viterbi-training estimate HMM parameters, is the convergence threshold, e.g. 1e-10 (experimental)\n"); - fprintf(pysam_stderr, "\n"); + fprintf(bcftools_stderr, "\n"); + fprintf(bcftools_stderr, "About: HMM model for detecting runs of autozygosity.\n"); + fprintf(bcftools_stderr, "Usage: bcftools roh [options] \n"); + fprintf(bcftools_stderr, "\n"); + fprintf(bcftools_stderr, "General Options:\n"); + fprintf(bcftools_stderr, " --AF-dflt if AF is not known, use this allele frequency [skip]\n"); + fprintf(bcftools_stderr, " --AF-tag use TAG for allele frequency\n"); + fprintf(bcftools_stderr, " --AF-file read allele frequencies from file (CHR\\tPOS\\tREF,ALT\\tAF)\n"); + fprintf(bcftools_stderr, " -b --buffer-size buffer size and the number of overlapping sites, 0 for unlimited [0]\n"); + fprintf(bcftools_stderr, " If the first number is negative, it is interpreted as the maximum memory to\n"); + fprintf(bcftools_stderr, " use, in MB. The default overlap is set to roughly 1%% of the buffer size.\n"); + fprintf(bcftools_stderr, " -e, --estimate-AF [TAG], estimate AF from FORMAT/TAG (GT or PL) of all samples (\"-\") or samples listed\n"); + fprintf(bcftools_stderr, " in . 
If TAG is not given, the frequency is estimated from GT by default\n"); + fprintf(bcftools_stderr, " -G, --GTs-only use GTs and ignore PLs, instead using for PL of the two least likely genotypes.\n"); + fprintf(bcftools_stderr, " Safe value to use is 30 to account for GT errors.\n"); + fprintf(bcftools_stderr, " -i, --ignore-homref skip hom-ref genotypes (0/0)\n"); + fprintf(bcftools_stderr, " -I, --skip-indels skip indels as their genotypes are enriched for errors\n"); + fprintf(bcftools_stderr, " -m, --genetic-map genetic map in IMPUTE2 format, single file or mask, where string \"{CHROM}\"\n"); + fprintf(bcftools_stderr, " is replaced with chromosome name\n"); + fprintf(bcftools_stderr, " -M, --rec-rate constant recombination rate per bp\n"); + fprintf(bcftools_stderr, " -o, --output write output to a file [standard output]\n"); + fprintf(bcftools_stderr, " -O, --output-type [srz] output s:per-site, r:regions, z:compressed [sr]\n"); + fprintf(bcftools_stderr, " -r, --regions restrict to comma-separated list of regions\n"); + fprintf(bcftools_stderr, " -R, --regions-file restrict to regions listed in a file\n"); + fprintf(bcftools_stderr, " -s, --samples list of samples to analyze [all samples]\n"); + fprintf(bcftools_stderr, " -S, --samples-file file of samples to analyze [all samples]\n"); + fprintf(bcftools_stderr, " -t, --targets similar to -r but streams rather than index-jumps\n"); + fprintf(bcftools_stderr, " -T, --targets-file similar to -R but streams rather than index-jumps\n"); + fprintf(bcftools_stderr, " --threads number of extra decompression threads [0]\n"); + fprintf(bcftools_stderr, "\n"); + fprintf(bcftools_stderr, "HMM Options:\n"); + fprintf(bcftools_stderr, " -a, --hw-to-az P(AZ|HW) transition probability from HW (Hardy-Weinberg) to AZ (autozygous) state [6.7e-8]\n"); + fprintf(bcftools_stderr, " -H, --az-to-hw P(HW|AZ) transition probability from AZ to HW state [5e-9]\n"); + fprintf(bcftools_stderr, " -V, --viterbi-training estimate HMM 
parameters, is the convergence threshold, e.g. 1e-10 (experimental)\n"); + fprintf(bcftools_stderr, "\n"); exit(1); } @@ -1169,7 +1169,7 @@ int main_vcfroh(int argc, char *argv[]) default: error("Unknown argument: %s\n", optarg); } } - if ( !args->output_fname ) args->output_fname = "pysam_stdout"; + if ( !args->output_fname ) args->output_fname = "bcftools_stdout"; if ( !args->output_type ) args->output_type = OUTPUT_ST|OUTPUT_RG; char *fname = NULL; if ( optind==argc ) @@ -1212,11 +1212,11 @@ int main_vcfroh(int argc, char *argv[]) int i, nmin = 0; for (i=0; iroh_smpl->n; i++) if ( !i || args->smpl[i].nused < nmin ) nmin = args->smpl[i].nused; - fprintf(pysam_stderr,"Number of lines total/processed: %d/%d\n", args->ntot,nmin); + fprintf(bcftools_stderr,"Number of lines total/processed: %d/%d\n", args->ntot,nmin); if ( nmin==0 ) { - fprintf(pysam_stderr,"No usable sites were found."); - if ( !naf_opts && !args->dflt_AF ) fprintf(pysam_stderr, " Consider using one of the AF options.\n"); + fprintf(bcftools_stderr,"No usable sites were found."); + if ( !naf_opts && !args->dflt_AF ) fprintf(bcftools_stderr, " Consider using one of the AF options.\n"); } destroy_data(args); free(args); diff --git a/bcftools/vcfsom.c.pysam.c b/bcftools/vcfsom.c.pysam.c index 58875f6..d806c01 100644 --- a/bcftools/vcfsom.c.pysam.c +++ b/bcftools/vcfsom.c.pysam.c @@ -1,4 +1,4 @@ -#include "pysam.h" +#include "bcftools.pysam.h" /* vcfsom.c -- SOM (Self-Organizing Map) filtering. @@ -104,7 +104,7 @@ char *msprintf(const char *fmt, ...) 
/* * char *t, *p = str; * t = column_next(p, '\t'); - * if ( strlen("")==t-p && !strncmp(p,"",t-p) ) fprintf(pysam_stdout, "found!\n"); + * if ( strlen("")==t-p && !strncmp(p,"",t-p) ) fprintf(bcftools_stdout, "found!\n"); * * char *t; * t = column_next(str, '\t'); if ( !*t ) error("expected field\n", str); @@ -574,7 +574,7 @@ static void do_train(args_t *args) fprintf(fp,"%e\t%f\t%f\n", prev_score, (float)igood/ngood, (float)ibad/nbad); if ( !printed && (float)igood/ngood > 0.9 ) { - fprintf(pysam_stdout, "%.2f\t%.2f\t%e\t# %% of bad [1] and good [2] sites at a cutoff [3]\n", 100.*ibad/nbad,100.*igood/ngood,prev_score); + fprintf(bcftools_stdout, "%.2f\t%.2f\t%e\t# %% of bad [1] and good [2] sites at a cutoff [3]\n", 100.*ibad/nbad,100.*igood/ngood,prev_score); printed = 1; } @@ -582,7 +582,7 @@ static void do_train(args_t *args) else if ( igoodprefix,strerror(errno)); @@ -607,36 +607,36 @@ static void do_classify(args_t *args) case MERGE_MAX: score = get_max_score(args, -1); break; case MERGE_AVG: score = get_avg_score(args, -1); break; } - fprintf(pysam_stdout, "%e\n", 1.0 - score/max_score); + fprintf(bcftools_stdout, "%e\n", 1.0 - score/max_score); } annots_reader_close(args); } static void usage(void) { - fprintf(pysam_stderr, "\n"); - fprintf(pysam_stderr, "About: SOM (Self-Organizing Map) filtering.\n"); - fprintf(pysam_stderr, "Usage: bcftools som --train [options] \n"); - fprintf(pysam_stderr, " bcftools som --classify [options]\n"); - fprintf(pysam_stderr, "\n"); - fprintf(pysam_stderr, "Model training options:\n"); - fprintf(pysam_stderr, " -f, --nfold n-fold cross-validation (number of maps) [5]\n"); - fprintf(pysam_stderr, " -p, --prefix prefix of output files\n"); - fprintf(pysam_stderr, " -s, --size map size [20]\n"); - fprintf(pysam_stderr, " -t, --train \n"); - fprintf(pysam_stderr, "\n"); - fprintf(pysam_stderr, "Classifying options:\n"); - fprintf(pysam_stderr, " -c, --classify \n"); - fprintf(pysam_stderr, "\n"); - fprintf(pysam_stderr, 
"Experimental training options (no reason to change):\n"); - fprintf(pysam_stderr, " -b, --bmu-threshold threshold for selection of best-matching unit [0.9]\n"); - fprintf(pysam_stderr, " -d, --som-dimension SOM dimension [2]\n"); - fprintf(pysam_stderr, " -e, --exclude-bad exclude bad sites from training, use for evaluation only\n"); - fprintf(pysam_stderr, " -l, --learning-rate learning rate [1.0]\n"); - fprintf(pysam_stderr, " -m, --merge -f merge algorithm [avg]\n"); - fprintf(pysam_stderr, " -n, --ntrain-sites effective number of training sites [number of good sites]\n"); - fprintf(pysam_stderr, " -r, --random-seed random seed, 0 for time() [1]\n"); - fprintf(pysam_stderr, "\n"); + fprintf(bcftools_stderr, "\n"); + fprintf(bcftools_stderr, "About: SOM (Self-Organizing Map) filtering.\n"); + fprintf(bcftools_stderr, "Usage: bcftools som --train [options] \n"); + fprintf(bcftools_stderr, " bcftools som --classify [options]\n"); + fprintf(bcftools_stderr, "\n"); + fprintf(bcftools_stderr, "Model training options:\n"); + fprintf(bcftools_stderr, " -f, --nfold n-fold cross-validation (number of maps) [5]\n"); + fprintf(bcftools_stderr, " -p, --prefix prefix of output files\n"); + fprintf(bcftools_stderr, " -s, --size map size [20]\n"); + fprintf(bcftools_stderr, " -t, --train \n"); + fprintf(bcftools_stderr, "\n"); + fprintf(bcftools_stderr, "Classifying options:\n"); + fprintf(bcftools_stderr, " -c, --classify \n"); + fprintf(bcftools_stderr, "\n"); + fprintf(bcftools_stderr, "Experimental training options (no reason to change):\n"); + fprintf(bcftools_stderr, " -b, --bmu-threshold threshold for selection of best-matching unit [0.9]\n"); + fprintf(bcftools_stderr, " -d, --som-dimension SOM dimension [2]\n"); + fprintf(bcftools_stderr, " -e, --exclude-bad exclude bad sites from training, use for evaluation only\n"); + fprintf(bcftools_stderr, " -l, --learning-rate learning rate [1.0]\n"); + fprintf(bcftools_stderr, " -m, --merge -f merge algorithm [avg]\n"); + 
fprintf(bcftools_stderr, " -n, --ntrain-sites effective number of training sites [number of good sites]\n"); + fprintf(bcftools_stderr, " -r, --random-seed random seed, 0 for time() [1]\n"); + fprintf(bcftools_stderr, "\n"); exit(1); } @@ -692,7 +692,7 @@ int main_vcfsom(int argc, char *argv[]) case 'd': args->ndim = atoi(optarg); if ( args->ndim<2 ) error("Expected -d >=2, got %d\n", args->ndim); - if ( args->ndim>3 ) fprintf(pysam_stderr,"Warning: This will take a long time and is not going to make the results better: -d %d\n", args->ndim); + if ( args->ndim>3 ) fprintf(bcftools_stderr,"Warning: This will take a long time and is not going to make the results better: -d %d\n", args->ndim); break; case 't': args->action = SOM_TRAIN; break; case 'c': args->action = SOM_CLASSIFY; break; diff --git a/bcftools/vcfsort.c.pysam.c b/bcftools/vcfsort.c.pysam.c index a07cd92..4a0325f 100644 --- a/bcftools/vcfsort.c.pysam.c +++ b/bcftools/vcfsort.c.pysam.c @@ -1,4 +1,4 @@ -#include "pysam.h" +#include "bcftools.pysam.h" /* vcfsort.c -- sort subcommand @@ -163,7 +163,7 @@ void blk_read(khp_blk_t *bhp, bcf_hdr_t *hdr, blk_t *blk) void merge_blocks(args_t *args) { - fprintf(pysam_stderr,"Merging %d temporary files\n", (int)args->nblk); + fprintf(bcftools_stderr,"Merging %d temporary files\n", (int)args->nblk); khp_blk_t *bhp = khp_init(blk); @@ -190,7 +190,7 @@ void merge_blocks(args_t *args) } if ( hts_close(out)!=0 ) error("Close failed: %s\n", args->output_fname); - fprintf(pysam_stderr,"Cleaning\n"); + fprintf(bcftools_stderr,"Cleaning\n"); for (i=0; inblk; i++) { blk_t *blk = args->blk + i; @@ -201,21 +201,21 @@ void merge_blocks(args_t *args) rmdir(args->tmp_dir); free(args->blk); khp_destroy(blk, bhp); - fprintf(pysam_stderr,"Done\n"); + fprintf(bcftools_stderr,"Done\n"); } static void usage(args_t *args) { - fprintf(pysam_stderr, "\n"); - fprintf(pysam_stderr, "About: Sort VCF/BCF file.\n"); - fprintf(pysam_stderr, "Usage: bcftools sort [OPTIONS] \n"); - 
fprintf(pysam_stderr, "\n"); - fprintf(pysam_stderr, "Options:\n"); - fprintf(pysam_stderr, " -m, --max-mem [kMG] maximum memory to use [768M]\n"); // using metric units, 1M=1e6 - fprintf(pysam_stderr, " -o, --output-file output file name [pysam_stdout]\n"); - fprintf(pysam_stderr, " -O, --output-type b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n"); - fprintf(pysam_stderr, " -T, --temp-dir temporary files [/tmp/bcftools-sort.XXXXXX/]\n"); - fprintf(pysam_stderr, "\n"); + fprintf(bcftools_stderr, "\n"); + fprintf(bcftools_stderr, "About: Sort VCF/BCF file.\n"); + fprintf(bcftools_stderr, "Usage: bcftools sort [OPTIONS] \n"); + fprintf(bcftools_stderr, "\n"); + fprintf(bcftools_stderr, "Options:\n"); + fprintf(bcftools_stderr, " -m, --max-mem [kMG] maximum memory to use [768M]\n"); // using metric units, 1M=1e6 + fprintf(bcftools_stderr, " -o, --output-file output file name [bcftools_stdout]\n"); + fprintf(bcftools_stderr, " -O, --output-type b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n"); + fprintf(bcftools_stderr, " -T, --temp-dir temporary files [/tmp/bcftools-sort.XXXXXX/]\n"); + fprintf(bcftools_stderr, "\n"); exit(1); } @@ -244,7 +244,7 @@ void init(args_t *args) args->tmp_dir = strdup(args->tmp_dir); mkdir_p(args->tmp_dir); } - fprintf(pysam_stderr,"Writing to %s\n", args->tmp_dir); + fprintf(bcftools_stderr,"Writing to %s\n", args->tmp_dir); } void destroy(args_t *args) { diff --git a/bcftools/vcfstats.c.pysam.c b/bcftools/vcfstats.c.pysam.c index 57adbc0..875dd6a 100644 --- a/bcftools/vcfstats.c.pysam.c +++ b/bcftools/vcfstats.c.pysam.c @@ -1,4 +1,4 @@ -#include "pysam.h" +#include "bcftools.pysam.h" /* vcfstats.c -- Produces stats which can be plotted using plot-vcfstats. 
@@ -198,17 +198,17 @@ static inline int clip_nonnegative(float x, int limit) static void _indel_ctx_print1(_idc1_t *idc) { int i; - fprintf(pysam_stdout, "%d\t", idc->cnt); + fprintf(bcftools_stdout, "%d\t", idc->cnt); for (i=0; ilen; i++) - fputc(idc->seq[i], pysam_stdout); - fputc('\n', pysam_stdout); + fputc(idc->seq[i], bcftools_stdout); + fputc('\n', bcftools_stdout); } static void _indel_ctx_print(indel_ctx_t *ctx) { int i; for (i=0; indat; i++) _indel_ctx_print1(&ctx->dat[i]); - fputc('\n',pysam_stdout); + fputc('\n',bcftools_stdout); } #endif static int _indel_ctx_lookup(indel_ctx_t *ctx, char *seq, int seq_len, int *hit) @@ -320,9 +320,9 @@ int indel_ctx_type(indel_ctx_t *ctx, char *chr, int pos, char *ref, char *alt, i } #if IC_DBG - fprintf(pysam_stdout,"ref: %s\n", ref); - fprintf(pysam_stdout,"alt: %s\n", alt); - fprintf(pysam_stdout,"ctx: %s\n", fai_ref); + fprintf(bcftools_stdout,"ref: %s\n", ref); + fprintf(bcftools_stdout,"alt: %s\n", alt); + fprintf(bcftools_stdout,"ctx: %s\n", fai_ref); _indel_ctx_print(ctx); #endif @@ -951,7 +951,7 @@ static void do_sample_stats(args_t *args, stats_t *stats, bcf_sr_t *reader, int case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_missing, bcf_int8_vector_end); break; case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_missing, bcf_int16_vector_end); break; case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_missing, bcf_int32_vector_end); break; - default: fprintf(pysam_stderr, "[E::%s] todo: %d\n", __func__, fmt_ptr->type); exit(1); break; + default: fprintf(bcftools_stderr, "[E::%s] todo: %d\n", __func__, fmt_ptr->type); exit(1); break; } #undef BRANCH_INT } @@ -1015,7 +1015,7 @@ static void do_sample_stats(args_t *args, stats_t *stats, bcf_sr_t *reader, int { nmm++; bcf_sr_t *reader = &files->readers[0]; - fprintf(pysam_stdout, "DBG\t%s\t%d\t%s\t%d\t%d\n",reader->header->id[BCF_DT_CTG][reader->buffer[0]->rid].key,reader->buffer[0]->pos+1,files->samples[is],gt,gt2); + fprintf(bcftools_stdout, 
"DBG\t%s\t%d\t%s\t%d\t%d\n",reader->header->id[BCF_DT_CTG][reader->buffer[0]->rid].key,reader->buffer[0]->pos+1,files->samples[is],gt,gt2); } else { @@ -1024,7 +1024,7 @@ static void do_sample_stats(args_t *args, stats_t *stats, bcf_sr_t *reader, int } } float nrd = nrefm+nmm ? 100.*nmm/(nrefm+nmm) : 0; - fprintf(pysam_stdout, "PSD\t%s\t%d\t%d\t%d\t%f\n", reader->header->id[BCF_DT_CTG][reader->buffer[0]->rid].key,reader->buffer[0]->pos+1,nm,nmm,nrd); + fprintf(bcftools_stdout, "PSD\t%s\t%d\t%d\t%d\t%f\n", reader->header->id[BCF_DT_CTG][reader->buffer[0]->rid].key,reader->buffer[0]->pos+1,nm,nmm,nrd); } } } @@ -1094,38 +1094,38 @@ static void do_vcf_stats(args_t *args) static void print_header(args_t *args) { int i; - fprintf(pysam_stdout, "# This file was produced by bcftools stats (%s+htslib-%s) and can be plotted using plot-vcfstats.\n", bcftools_version(),hts_version()); - fprintf(pysam_stdout, "# The command line was:\tbcftools %s ", args->argv[0]); + fprintf(bcftools_stdout, "# This file was produced by bcftools stats (%s+htslib-%s) and can be plotted using plot-vcfstats.\n", bcftools_version(),hts_version()); + fprintf(bcftools_stdout, "# The command line was:\tbcftools %s ", args->argv[0]); for (i=1; iargc; i++) - fprintf(pysam_stdout, " %s",args->argv[i]); - fprintf(pysam_stdout, "\n#\n"); + fprintf(bcftools_stdout, " %s",args->argv[i]); + fprintf(bcftools_stdout, "\n#\n"); - fprintf(pysam_stdout, "# Definition of sets:\n# ID\t[2]id\t[3]tab-separated file names\n"); + fprintf(bcftools_stdout, "# Definition of sets:\n# ID\t[2]id\t[3]tab-separated file names\n"); if ( args->files->nreaders==1 ) { const char *fname = strcmp("-",args->files->readers[0].fname) ? 
args->files->readers[0].fname : ""; if ( args->split_by_id ) { - fprintf(pysam_stdout, "ID\t0\t%s:known (sites with ID different from \".\")\n", fname); - fprintf(pysam_stdout, "ID\t1\t%s:novel (sites where ID column is \".\")\n", fname); + fprintf(bcftools_stdout, "ID\t0\t%s:known (sites with ID different from \".\")\n", fname); + fprintf(bcftools_stdout, "ID\t1\t%s:novel (sites where ID column is \".\")\n", fname); } else - fprintf(pysam_stdout, "ID\t0\t%s\n", fname); + fprintf(bcftools_stdout, "ID\t0\t%s\n", fname); } else { const char *fname0 = strcmp("-",args->files->readers[0].fname) ? args->files->readers[0].fname : ""; const char *fname1 = strcmp("-",args->files->readers[1].fname) ? args->files->readers[1].fname : ""; - fprintf(pysam_stdout, "ID\t0\t%s\n", fname0); - fprintf(pysam_stdout, "ID\t1\t%s\n", fname1); - fprintf(pysam_stdout, "ID\t2\t%s\t%s\n", fname0,fname1); + fprintf(bcftools_stdout, "ID\t0\t%s\n", fname0); + fprintf(bcftools_stdout, "ID\t1\t%s\n", fname1); + fprintf(bcftools_stdout, "ID\t2\t%s\t%s\n", fname0,fname1); if ( args->verbose_sites ) { - fprintf(pysam_stdout, + fprintf(bcftools_stdout, "# Verbose per-site discordance output.\n" "# PSD\t[2]CHROM\t[3]POS\t[4]Number of matches\t[5]Number of mismatches\t[6]NRD\n"); - fprintf(pysam_stdout, + fprintf(bcftools_stdout, "# Verbose per-site and per-sample output. Genotype codes: %d:HomRefRef, %d:HomAltAlt, %d:HetAltRef, %d:HetAltAlt, %d:haploidRef, %d:haploidAlt\n" "# DBG\t[2]CHROM\t[3]POS\t[4]Sample\t[5]GT in %s\t[6]GT in %s\n", GT_HOM_RR, GT_HOM_AA, GT_HET_RA, GT_HET_AA, GT_HAPL_R, GT_HAPL_A, fname0,fname1); @@ -1137,57 +1137,57 @@ static void print_header(args_t *args) static void print_stats(args_t *args) { int i, j,k, id; - fprintf(pysam_stdout, "# SN, Summary numbers:\n"); - fprintf(pysam_stdout, "# number of records .. number of data rows in the VCF\n"); - fprintf(pysam_stdout, "# number of no-ALTs .. 
reference-only sites, ALT is either \".\" or identical to REF\n"); - fprintf(pysam_stdout, "# number of SNPs .. number of rows with a SNP\n"); - fprintf(pysam_stdout, "# number of MNPs .. number of rows with a MNP, such as CC>TT\n"); - fprintf(pysam_stdout, "# number of indels .. number of rows with an indel\n"); - fprintf(pysam_stdout, "# number of others .. number of rows with other type, for example a symbolic allele or\n"); - fprintf(pysam_stdout, "# a complex substitution, such as ACT>TCGA\n"); - fprintf(pysam_stdout, "# number of multiallelic sites .. number of rows with multiple alternate alleles\n"); - fprintf(pysam_stdout, "# number of multiallelic SNP sites .. number of rows with multiple alternate alleles, all SNPs\n"); - fprintf(pysam_stdout, "# \n"); - fprintf(pysam_stdout, "# Note that rows containing multiple types will be counted multiple times, in each\n"); - fprintf(pysam_stdout, "# counter. For example, a row with a SNP and an indel increments both the SNP and\n"); - fprintf(pysam_stdout, "# the indel counter.\n"); - fprintf(pysam_stdout, "# \n"); - fprintf(pysam_stdout, "# SN\t[2]id\t[3]key\t[4]value\n"); + fprintf(bcftools_stdout, "# SN, Summary numbers:\n"); + fprintf(bcftools_stdout, "# number of records .. number of data rows in the VCF\n"); + fprintf(bcftools_stdout, "# number of no-ALTs .. reference-only sites, ALT is either \".\" or identical to REF\n"); + fprintf(bcftools_stdout, "# number of SNPs .. number of rows with a SNP\n"); + fprintf(bcftools_stdout, "# number of MNPs .. number of rows with a MNP, such as CC>TT\n"); + fprintf(bcftools_stdout, "# number of indels .. number of rows with an indel\n"); + fprintf(bcftools_stdout, "# number of others .. number of rows with other type, for example a symbolic allele or\n"); + fprintf(bcftools_stdout, "# a complex substitution, such as ACT>TCGA\n"); + fprintf(bcftools_stdout, "# number of multiallelic sites .. 
number of rows with multiple alternate alleles\n"); + fprintf(bcftools_stdout, "# number of multiallelic SNP sites .. number of rows with multiple alternate alleles, all SNPs\n"); + fprintf(bcftools_stdout, "# \n"); + fprintf(bcftools_stdout, "# Note that rows containing multiple types will be counted multiple times, in each\n"); + fprintf(bcftools_stdout, "# counter. For example, a row with a SNP and an indel increments both the SNP and\n"); + fprintf(bcftools_stdout, "# the indel counter.\n"); + fprintf(bcftools_stdout, "# \n"); + fprintf(bcftools_stdout, "# SN\t[2]id\t[3]key\t[4]value\n"); for (id=0; idfiles->nreaders; id++) - fprintf(pysam_stdout, "SN\t%d\tnumber of samples:\t%d\n", id, bcf_hdr_nsamples(args->files->readers[id].header)); + fprintf(bcftools_stdout, "SN\t%d\tnumber of samples:\t%d\n", id, bcf_hdr_nsamples(args->files->readers[id].header)); for (id=0; idnstats; id++) { stats_t *stats = &args->stats[id]; - fprintf(pysam_stdout, "SN\t%d\tnumber of records:\t%d\n", id, stats->n_records); - fprintf(pysam_stdout, "SN\t%d\tnumber of no-ALTs:\t%d\n", id, stats->n_noalts); - fprintf(pysam_stdout, "SN\t%d\tnumber of SNPs:\t%d\n", id, stats->n_snps); - fprintf(pysam_stdout, "SN\t%d\tnumber of MNPs:\t%d\n", id, stats->n_mnps); - fprintf(pysam_stdout, "SN\t%d\tnumber of indels:\t%d\n", id, stats->n_indels); - fprintf(pysam_stdout, "SN\t%d\tnumber of others:\t%d\n", id, stats->n_others); - fprintf(pysam_stdout, "SN\t%d\tnumber of multiallelic sites:\t%d\n", id, stats->n_mals); - fprintf(pysam_stdout, "SN\t%d\tnumber of multiallelic SNP sites:\t%d\n", id, stats->n_snp_mals); + fprintf(bcftools_stdout, "SN\t%d\tnumber of records:\t%d\n", id, stats->n_records); + fprintf(bcftools_stdout, "SN\t%d\tnumber of no-ALTs:\t%d\n", id, stats->n_noalts); + fprintf(bcftools_stdout, "SN\t%d\tnumber of SNPs:\t%d\n", id, stats->n_snps); + fprintf(bcftools_stdout, "SN\t%d\tnumber of MNPs:\t%d\n", id, stats->n_mnps); + fprintf(bcftools_stdout, "SN\t%d\tnumber of indels:\t%d\n", 
id, stats->n_indels); + fprintf(bcftools_stdout, "SN\t%d\tnumber of others:\t%d\n", id, stats->n_others); + fprintf(bcftools_stdout, "SN\t%d\tnumber of multiallelic sites:\t%d\n", id, stats->n_mals); + fprintf(bcftools_stdout, "SN\t%d\tnumber of multiallelic SNP sites:\t%d\n", id, stats->n_snp_mals); } - fprintf(pysam_stdout, "# TSTV, transitions/transversions:\n# TSTV\t[2]id\t[3]ts\t[4]tv\t[5]ts/tv\t[6]ts (1st ALT)\t[7]tv (1st ALT)\t[8]ts/tv (1st ALT)\n"); + fprintf(bcftools_stdout, "# TSTV, transitions/transversions:\n# TSTV\t[2]id\t[3]ts\t[4]tv\t[5]ts/tv\t[6]ts (1st ALT)\t[7]tv (1st ALT)\t[8]ts/tv (1st ALT)\n"); for (id=0; idnstats; id++) { stats_t *stats = &args->stats[id]; int ts=0,tv=0; for (i=0; im_af; i++) { ts += stats->af_ts[i]; tv += stats->af_tv[i]; } - fprintf(pysam_stdout, "TSTV\t%d\t%d\t%d\t%.2f\t%d\t%d\t%.2f\n", id,ts,tv,tv?(float)ts/tv:0, stats->ts_alt1,stats->tv_alt1,stats->tv_alt1?(float)stats->ts_alt1/stats->tv_alt1:0); + fprintf(bcftools_stdout, "TSTV\t%d\t%d\t%d\t%.2f\t%d\t%d\t%.2f\n", id,ts,tv,tv?(float)ts/tv:0, stats->ts_alt1,stats->tv_alt1,stats->tv_alt1?(float)stats->ts_alt1/stats->tv_alt1:0); } if ( args->exons_fname ) { - fprintf(pysam_stdout, "# FS, Indel frameshifts:\n# FS\t[2]id\t[3]in-frame\t[4]out-frame\t[5]not applicable\t[6]out/(in+out) ratio\t[7]in-frame (1st ALT)\t[8]out-frame (1st ALT)\t[9]not applicable (1st ALT)\t[10]out/(in+out) ratio (1st ALT)\n"); + fprintf(bcftools_stdout, "# FS, Indel frameshifts:\n# FS\t[2]id\t[3]in-frame\t[4]out-frame\t[5]not applicable\t[6]out/(in+out) ratio\t[7]in-frame (1st ALT)\t[8]out-frame (1st ALT)\t[9]not applicable (1st ALT)\t[10]out/(in+out) ratio (1st ALT)\n"); for (id=0; idnstats; id++) { int in=args->stats[id].in_frame, out=args->stats[id].out_frame, na=args->stats[id].na_frame; int in1=args->stats[id].in_frame_alt1, out1=args->stats[id].out_frame_alt1, na1=args->stats[id].na_frame_alt1; - fprintf(pysam_stdout, "FS\t%d\t%d\t%d\t%d\t%.2f\t%d\t%d\t%d\t%.2f\n", id, 
in,out,na,out?(float)out/(in+out):0,in1,out1,na1,out1?(float)out1/(in1+out1):0); + fprintf(bcftools_stdout, "FS\t%d\t%d\t%d\t%d\t%.2f\t%d\t%d\t%d\t%.2f\n", id, in,out,na,out?(float)out/(in+out):0,in1,out1,na1,out1?(float)out1/(in1+out1):0); } } if ( args->indel_ctx ) { - fprintf(pysam_stdout, "# ICS, Indel context summary:\n# ICS\t[2]id\t[3]repeat-consistent\t[4]repeat-inconsistent\t[5]not applicable\t[6]c/(c+i) ratio\n"); + fprintf(bcftools_stdout, "# ICS, Indel context summary:\n# ICS\t[2]id\t[3]repeat-consistent\t[4]repeat-inconsistent\t[5]not applicable\t[6]c/(c+i) ratio\n"); for (id=0; idnstats; id++) { int nc = 0, ni = 0, na = args->stats[id].n_repeat_na; @@ -1196,25 +1196,25 @@ static void print_stats(args_t *args) nc += args->stats[id].n_repeat[i][0] + args->stats[id].n_repeat[i][2]; ni += args->stats[id].n_repeat[i][1] + args->stats[id].n_repeat[i][3]; } - fprintf(pysam_stdout, "ICS\t%d\t%d\t%d\t%d\t%.4f\n", id, nc,ni,na,nc+ni ? (float)nc/(nc+ni) : 0.0); + fprintf(bcftools_stdout, "ICS\t%d\t%d\t%d\t%d\t%.4f\n", id, nc,ni,na,nc+ni ? 
(float)nc/(nc+ni) : 0.0); } - fprintf(pysam_stdout, "# ICL, Indel context by length:\n# ICL\t[2]id\t[3]length of repeat element\t[4]repeat-consistent deletions)\t[5]repeat-inconsistent deletions\t[6]consistent insertions\t[7]inconsistent insertions\t[8]c/(c+i) ratio\n"); + fprintf(bcftools_stdout, "# ICL, Indel context by length:\n# ICL\t[2]id\t[3]length of repeat element\t[4]repeat-consistent deletions)\t[5]repeat-inconsistent deletions\t[6]consistent insertions\t[7]inconsistent insertions\t[8]c/(c+i) ratio\n"); for (id=0; idnstats; id++) { for (i=1; istats[id].n_repeat[i][0]+args->stats[id].n_repeat[i][2], ni = args->stats[id].n_repeat[i][1]+args->stats[id].n_repeat[i][3]; - fprintf(pysam_stdout, "ICL\t%d\t%d\t%d\t%d\t%d\t%d\t%.4f\n", id, i+1, + fprintf(bcftools_stdout, "ICL\t%d\t%d\t%d\t%d\t%d\t%d\t%.4f\n", id, i+1, args->stats[id].n_repeat[i][0],args->stats[id].n_repeat[i][1],args->stats[id].n_repeat[i][2],args->stats[id].n_repeat[i][3], nc+ni ? (float)nc/(nc+ni) : 0.0); } } } - fprintf(pysam_stdout, "# SiS, Singleton stats:\n# SiS\t[2]id\t[3]allele count\t[4]number of SNPs\t[5]number of transitions\t[6]number of transversions\t[7]number of indels\t[8]repeat-consistent\t[9]repeat-inconsistent\t[10]not applicable\n"); + fprintf(bcftools_stdout, "# SiS, Singleton stats:\n# SiS\t[2]id\t[3]allele count\t[4]number of SNPs\t[5]number of transitions\t[6]number of transversions\t[7]number of indels\t[8]repeat-consistent\t[9]repeat-inconsistent\t[10]not applicable\n"); for (id=0; idnstats; id++) { stats_t *stats = &args->stats[id]; - fprintf(pysam_stdout, "SiS\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\n", id,1,stats->af_snps[0],stats->af_ts[0],stats->af_tv[0], + fprintf(bcftools_stdout, "SiS\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\n", id,1,stats->af_snps[0],stats->af_ts[0],stats->af_tv[0], stats->af_repeats[0][0]+stats->af_repeats[1][0]+stats->af_repeats[2][0],stats->af_repeats[0][0],stats->af_repeats[1][0],stats->af_repeats[2][0]); // put the singletons stats into the first AF 
bin, note that not all of the stats is transferred (i.e. nrd mismatches) stats->af_snps[1] += stats->af_snps[0]; @@ -1242,7 +1242,7 @@ static void print_stats(args_t *args) args->af_gts_indels[1].n += args->af_gts_indels[0].n; } - fprintf(pysam_stdout, "# AF, Stats by non-reference allele frequency:\n# AF\t[2]id\t[3]allele frequency\t[4]number of SNPs\t[5]number of transitions\t[6]number of transversions\t[7]number of indels\t[8]repeat-consistent\t[9]repeat-inconsistent\t[10]not applicable\n"); + fprintf(bcftools_stdout, "# AF, Stats by non-reference allele frequency:\n# AF\t[2]id\t[3]allele frequency\t[4]number of SNPs\t[5]number of transitions\t[6]number of transversions\t[7]number of indels\t[8]repeat-consistent\t[9]repeat-inconsistent\t[10]not applicable\n"); for (id=0; idnstats; id++) { stats_t *stats = &args->stats[id]; @@ -1250,25 +1250,25 @@ static void print_stats(args_t *args) { if ( stats->af_snps[i]+stats->af_ts[i]+stats->af_tv[i]+stats->af_repeats[0][i]+stats->af_repeats[1][i]+stats->af_repeats[2][i] == 0 ) continue; double af = args->af_bins ? 
(bin_get_value(args->af_bins,i)+bin_get_value(args->af_bins,i-1))*0.5 : (double)(i-1)/(args->m_af-1); - fprintf(pysam_stdout, "AF\t%d\t%f\t%d\t%d\t%d\t%d\t%d\t%d\t%d\n", id,af,stats->af_snps[i],stats->af_ts[i],stats->af_tv[i], + fprintf(bcftools_stdout, "AF\t%d\t%f\t%d\t%d\t%d\t%d\t%d\t%d\t%d\n", id,af,stats->af_snps[i],stats->af_ts[i],stats->af_tv[i], stats->af_repeats[0][i]+stats->af_repeats[1][i]+stats->af_repeats[2][i],stats->af_repeats[0][i],stats->af_repeats[1][i],stats->af_repeats[2][i]); } } #if QUAL_STATS - fprintf(pysam_stdout, "# QUAL, Stats by quality:\n# QUAL\t[2]id\t[3]Quality\t[4]number of SNPs\t[5]number of transitions (1st ALT)\t[6]number of transversions (1st ALT)\t[7]number of indels\n"); + fprintf(bcftools_stdout, "# QUAL, Stats by quality:\n# QUAL\t[2]id\t[3]Quality\t[4]number of SNPs\t[5]number of transitions (1st ALT)\t[6]number of transversions (1st ALT)\t[7]number of indels\n"); for (id=0; idnstats; id++) { stats_t *stats = &args->stats[id]; for (i=0; im_qual; i++) { if ( stats->qual_snps[i]+stats->qual_ts[i]+stats->qual_tv[i]+stats->qual_indels[i] == 0 ) continue; - fprintf(pysam_stdout, "QUAL\t%d\t%d\t%d\t%d\t%d\t%d\n", id,i,stats->qual_snps[i],stats->qual_ts[i],stats->qual_tv[i],stats->qual_indels[i]); + fprintf(bcftools_stdout, "QUAL\t%d\t%d\t%d\t%d\t%d\t%d\n", id,i,stats->qual_snps[i],stats->qual_ts[i],stats->qual_tv[i],stats->qual_indels[i]); } } #endif for (i=0; inusr; i++) { - fprintf(pysam_stdout, "# USR:%s, Stats by %s:\n# USR:%s\t[2]id\t[3]%s\t[4]number of SNPs\t[5]number of transitions (1st ALT)\t[6]number of transversions (1st ALT)\n", + fprintf(bcftools_stdout, "# USR:%s, Stats by %s:\n# USR:%s\t[2]id\t[3]%s\t[4]number of SNPs\t[5]number of transitions (1st ALT)\t[6]number of transversions (1st ALT)\n", args->usr[i].tag,args->usr[i].tag,args->usr[i].tag,args->usr[i].tag); for (id=0; idnstats; id++) { @@ -1279,32 +1279,32 @@ static void print_stats(args_t *args) if ( usr->vals_ts[j]+usr->vals_tv[j] == 0 ) continue; // skip 
empty bins float val = usr->min + (usr->max - usr->min)*j/(usr->nbins-1); const char *fmt = usr->type==BCF_HT_REAL ? "USR:%s\t%d\t%e\t%d\t%d\t%d\n" : "USR:%s\t%d\t%.0f\t%d\t%d\t%d\n"; - fprintf(pysam_stdout, fmt,usr->tag,id,val,usr->vals_ts[j]+usr->vals_tv[j],usr->vals_ts[j],usr->vals_tv[j]); + fprintf(bcftools_stdout, fmt,usr->tag,id,val,usr->vals_ts[j]+usr->vals_tv[j],usr->vals_ts[j],usr->vals_tv[j]); } } } - fprintf(pysam_stdout, "# IDD, InDel distribution:\n# IDD\t[2]id\t[3]length (deletions negative)\t[4]count\n"); + fprintf(bcftools_stdout, "# IDD, InDel distribution:\n# IDD\t[2]id\t[3]length (deletions negative)\t[4]count\n"); for (id=0; idnstats; id++) { stats_t *stats = &args->stats[id]; for (i=stats->m_indel-1; i>=0; i--) - if ( stats->deletions[i] ) fprintf(pysam_stdout, "IDD\t%d\t%d\t%d\n", id,-i-1,stats->deletions[i]); + if ( stats->deletions[i] ) fprintf(bcftools_stdout, "IDD\t%d\t%d\t%d\n", id,-i-1,stats->deletions[i]); for (i=0; im_indel; i++) - if ( stats->insertions[i] ) fprintf(pysam_stdout, "IDD\t%d\t%d\t%d\n", id,i+1,stats->insertions[i]); + if ( stats->insertions[i] ) fprintf(bcftools_stdout, "IDD\t%d\t%d\t%d\n", id,i+1,stats->insertions[i]); } - fprintf(pysam_stdout, "# ST, Substitution types:\n# ST\t[2]id\t[3]type\t[4]count\n"); + fprintf(bcftools_stdout, "# ST, Substitution types:\n# ST\t[2]id\t[3]type\t[4]count\n"); for (id=0; idnstats; id++) { int t; for (t=0; t<15; t++) { if ( t>>2 == (t&3) ) continue; - fprintf(pysam_stdout, "ST\t%d\t%c>%c\t%d\n", id, bcf_int2acgt(t>>2),bcf_int2acgt(t&3),args->stats[id].subst[t]); + fprintf(bcftools_stdout, "ST\t%d\t%c>%c\t%d\n", id, bcf_int2acgt(t>>2),bcf_int2acgt(t&3),args->stats[id].subst[t]); } } if ( args->files->nreaders>1 && args->files->n_smpl ) { - fprintf(pysam_stdout, "SN\t%d\tnumber of samples:\t%d\n", 2, args->files->n_smpl); + fprintf(bcftools_stdout, "SN\t%d\tnumber of samples:\t%d\n", 2, args->files->n_smpl); int x; for (x=0; x<2; x++) // x=0: snps, x=1: indels @@ -1312,12 +1312,12 @@ 
static void print_stats(args_t *args) gtcmp_t *stats; if ( x==0 ) { - fprintf(pysam_stdout, "# GCsAF, Genotype concordance by non-reference allele frequency (SNPs)\n# GCsAF\t[2]id\t[3]allele frequency\t[4]RR Hom matches\t[5]RA Het matches\t[6]AA Hom matches\t[7]RR Hom mismatches\t[8]RA Het mismatches\t[9]AA Hom mismatches\t[10]dosage r-squared\t[11]number of genotypes\n"); + fprintf(bcftools_stdout, "# GCsAF, Genotype concordance by non-reference allele frequency (SNPs)\n# GCsAF\t[2]id\t[3]allele frequency\t[4]RR Hom matches\t[5]RA Het matches\t[6]AA Hom matches\t[7]RR Hom mismatches\t[8]RA Het mismatches\t[9]AA Hom mismatches\t[10]dosage r-squared\t[11]number of genotypes\n"); stats = args->af_gts_snps; } else { - fprintf(pysam_stdout, "# GCiAF, Genotype concordance by non-reference allele frequency (indels)\n# GCiAF\t[2]id\t[3]allele frequency\t[4]RR Hom matches\t[5]RA Het matches\t[6]AA Hom matches\t[7]RR Hom mismatches\t[8]RA Het mismatches\t[9]AA Hom mismatches\t[10]dosage r-squared\t[11]number of genotypes\n"); + fprintf(bcftools_stdout, "# GCiAF, Genotype concordance by non-reference allele frequency (indels)\n# GCiAF\t[2]id\t[3]allele frequency\t[4]RR Hom matches\t[5]RA Het matches\t[6]AA Hom matches\t[7]RR Hom mismatches\t[8]RA Het mismatches\t[9]AA Hom mismatches\t[10]dosage r-squared\t[11]number of genotypes\n"); stats = args->af_gts_indels; } uint64_t nrd_m[4] = {0,0,0,0}, nrd_mm[4] = {0,0,0,0}; // across all bins @@ -1351,30 +1351,30 @@ static void print_stats(args_t *args) r2 *= r2; } double af = args->af_bins ? (bin_get_value(args->af_bins,i)+bin_get_value(args->af_bins,i-1))*0.5 : (double)(i-1)/(args->m_af-1); - fprintf(pysam_stdout, "GC%cAF\t2\t%f", x==0 ? 
's' : 'i', af); - fprintf(pysam_stdout, "\t%"PRId64"\t%"PRId64"\t%"PRId64"", m[T2S(GT_HOM_RR)],m[T2S(GT_HET_RA)],m[T2S(GT_HOM_AA)]); - fprintf(pysam_stdout, "\t%"PRId64"\t%"PRId64"\t%"PRId64"", mm[T2S(GT_HOM_RR)],mm[T2S(GT_HET_RA)],mm[T2S(GT_HOM_AA)]); - if ( stats[i].n && !isnan(r2) ) fprintf(pysam_stdout, "\t%f", r2); - else fprintf(pysam_stdout, "\t"NA_STRING); - fprintf(pysam_stdout, "\t%.0f\n", stats[i].n); + fprintf(bcftools_stdout, "GC%cAF\t2\t%f", x==0 ? 's' : 'i', af); + fprintf(bcftools_stdout, "\t%"PRId64"\t%"PRId64"\t%"PRId64"", m[T2S(GT_HOM_RR)],m[T2S(GT_HET_RA)],m[T2S(GT_HOM_AA)]); + fprintf(bcftools_stdout, "\t%"PRId64"\t%"PRId64"\t%"PRId64"", mm[T2S(GT_HOM_RR)],mm[T2S(GT_HET_RA)],mm[T2S(GT_HOM_AA)]); + if ( stats[i].n && !isnan(r2) ) fprintf(bcftools_stdout, "\t%f", r2); + else fprintf(bcftools_stdout, "\t"NA_STRING); + fprintf(bcftools_stdout, "\t%.0f\n", stats[i].n); } if ( x==0 ) { - fprintf(pysam_stdout, "# NRD and discordance is calculated as follows:\n"); - fprintf(pysam_stdout, "# m .. number of matches\n"); - fprintf(pysam_stdout, "# x .. number of mismatches\n"); - fprintf(pysam_stdout, "# NRD = (xRR + xRA + xAA) / (xRR + xRA + xAA + mRA + mAA)\n"); - fprintf(pysam_stdout, "# RR discordance = xRR / (xRR + mRR)\n"); - fprintf(pysam_stdout, "# RA discordance = xRA / (xRA + mRA)\n"); - fprintf(pysam_stdout, "# AA discordance = xAA / (xAA + mAA)\n"); - fprintf(pysam_stdout, "# Non-Reference Discordance (NRD), SNPs\n# NRDs\t[2]id\t[3]NRD\t[4]Ref/Ref discordance\t[5]Ref/Alt discordance\t[6]Alt/Alt discordance\n"); + fprintf(bcftools_stdout, "# NRD and discordance is calculated as follows:\n"); + fprintf(bcftools_stdout, "# m .. number of matches\n"); + fprintf(bcftools_stdout, "# x .. 
number of mismatches\n"); + fprintf(bcftools_stdout, "# NRD = (xRR + xRA + xAA) / (xRR + xRA + xAA + mRA + mAA)\n"); + fprintf(bcftools_stdout, "# RR discordance = xRR / (xRR + mRR)\n"); + fprintf(bcftools_stdout, "# RA discordance = xRA / (xRA + mRA)\n"); + fprintf(bcftools_stdout, "# AA discordance = xAA / (xAA + mAA)\n"); + fprintf(bcftools_stdout, "# Non-Reference Discordance (NRD), SNPs\n# NRDs\t[2]id\t[3]NRD\t[4]Ref/Ref discordance\t[5]Ref/Alt discordance\t[6]Alt/Alt discordance\n"); } else - fprintf(pysam_stdout, "# Non-Reference Discordance (NRD), indels\n# NRDi\t[2]id\t[3]NRD\t[4]Ref/Ref discordance\t[5]Ref/Alt discordance\t[6]Alt/Alt discordance\n"); + fprintf(bcftools_stdout, "# Non-Reference Discordance (NRD), indels\n# NRDi\t[2]id\t[3]NRD\t[4]Ref/Ref discordance\t[5]Ref/Alt discordance\t[6]Alt/Alt discordance\n"); uint64_t m = nrd_m[T2S(GT_HET_RA)] + nrd_m[T2S(GT_HOM_AA)] + nrd_m[T2S(GT_HET_AA)]; uint64_t mm = nrd_mm[T2S(GT_HOM_RR)] + nrd_mm[T2S(GT_HET_RA)] + nrd_mm[T2S(GT_HOM_AA)] + nrd_mm[T2S(GT_HET_AA)]; - fprintf(pysam_stdout, "NRD%c\t2\t%f\t%f\t%f\t%f\n", x==0 ? 's' : 'i', + fprintf(bcftools_stdout, "NRD%c\t2\t%f\t%f\t%f\t%f\n", x==0 ? 's' : 'i', m+mm ? mm*100.0/(m+mm) : 0, nrd_m[T2S(GT_HOM_RR)]+nrd_mm[T2S(GT_HOM_RR)] ? nrd_mm[T2S(GT_HOM_RR)]*100.0/(nrd_m[T2S(GT_HOM_RR)]+nrd_mm[T2S(GT_HOM_RR)]) : 0, nrd_m[T2S(GT_HET_RA)]+nrd_mm[T2S(GT_HET_RA)] ? 
nrd_mm[T2S(GT_HET_RA)]*100.0/(nrd_m[T2S(GT_HET_RA)]+nrd_mm[T2S(GT_HET_RA)]) : 0, @@ -1387,12 +1387,12 @@ static void print_stats(args_t *args) gtcmp_t *stats; if ( x==0 ) { - fprintf(pysam_stdout, "# GCsS, Genotype concordance by sample (SNPs)\n# GCsS\t[2]id\t[3]sample\t[4]non-reference discordance rate\t[5]RR Hom matches\t[6]RA Het matches\t[7]AA Hom matches\t[8]RR Hom mismatches\t[9]RA Het mismatches\t[10]AA Hom mismatches\t[11]dosage r-squared\n"); + fprintf(bcftools_stdout, "# GCsS, Genotype concordance by sample (SNPs)\n# GCsS\t[2]id\t[3]sample\t[4]non-reference discordance rate\t[5]RR Hom matches\t[6]RA Het matches\t[7]AA Hom matches\t[8]RR Hom mismatches\t[9]RA Het mismatches\t[10]AA Hom mismatches\t[11]dosage r-squared\n"); stats = args->smpl_gts_snps; } else { - fprintf(pysam_stdout, "# GCiS, Genotype concordance by sample (indels)\n# GCiS\t[2]id\t[3]sample\t[4]non-reference discordance rate\t[5]RR Hom matches\t[6]RA Het matches\t[7]AA Hom matches\t[8]RR Hom mismatches\t[9]RA Het mismatches\t[10]AA Hom mismatches\t[11]dosage r-squared\n"); + fprintf(bcftools_stdout, "# GCiS, Genotype concordance by sample (indels)\n# GCiS\t[2]id\t[3]sample\t[4]non-reference discordance rate\t[5]RR Hom matches\t[6]RA Het matches\t[7]AA Hom matches\t[8]RR Hom mismatches\t[9]RA Het mismatches\t[10]AA Hom mismatches\t[11]dosage r-squared\n"); stats = args->smpl_gts_indels; } for (i=0; ifiles->n_smpl; i++) @@ -1410,17 +1410,17 @@ static void print_stats(args_t *args) r2 /= sqrt((stats[i].xx - stats[i].x*stats[i].x/stats[i].n) * (stats[i].yy - stats[i].y*stats[i].y/stats[i].n)); r2 *= r2; } - fprintf(pysam_stdout, "GC%cS\t2\t%s\t%.3f", x==0 ? 's' : 'i', args->files->samples[i], m+mm ? mm*100.0/(m+mm) : 0); - fprintf(pysam_stdout, "\t%"PRId64"\t%"PRId64"\t%"PRId64"", + fprintf(bcftools_stdout, "GC%cS\t2\t%s\t%.3f", x==0 ? 's' : 'i', args->files->samples[i], m+mm ? 
mm*100.0/(m+mm) : 0); + fprintf(bcftools_stdout, "\t%"PRId64"\t%"PRId64"\t%"PRId64"", stats[i].gt2gt[T2S(GT_HOM_RR)][T2S(GT_HOM_RR)], stats[i].gt2gt[T2S(GT_HET_RA)][T2S(GT_HET_RA)], stats[i].gt2gt[T2S(GT_HOM_AA)][T2S(GT_HOM_AA)]); - fprintf(pysam_stdout, "\t%"PRId64"\t%"PRId64"\t%"PRId64"", + fprintf(bcftools_stdout, "\t%"PRId64"\t%"PRId64"\t%"PRId64"", stats[i].gt2gt[T2S(GT_HOM_RR)][T2S(GT_HET_RA)] + stats[i].gt2gt[T2S(GT_HOM_RR)][T2S(GT_HOM_AA)], stats[i].gt2gt[T2S(GT_HET_RA)][T2S(GT_HOM_RR)] + stats[i].gt2gt[T2S(GT_HET_RA)][T2S(GT_HOM_AA)], stats[i].gt2gt[T2S(GT_HOM_AA)][T2S(GT_HOM_RR)] + stats[i].gt2gt[T2S(GT_HOM_AA)][T2S(GT_HET_RA)]); - if ( stats[i].n && !isnan(r2) ) fprintf(pysam_stdout, "\t%f\n", r2); - else fprintf(pysam_stdout, "\t"NA_STRING"\n"); + if ( stats[i].n && !isnan(r2) ) fprintf(bcftools_stdout, "\t%f\n", r2); + else fprintf(bcftools_stdout, "\t"NA_STRING"\n"); } } for (x=0; x<2; x++) // x=0: snps, x=1: indels @@ -1430,54 +1430,54 @@ static void print_stats(args_t *args) gtcmp_t *stats; if ( x==0 ) { - fprintf(pysam_stdout, "# GCTs, Genotype concordance table (SNPs)\n# GCTs"); + fprintf(bcftools_stdout, "# GCTs, Genotype concordance table (SNPs)\n# GCTs"); stats = args->smpl_gts_snps; } else { - fprintf(pysam_stdout, "# GCTi, Genotype concordance table (indels)\n# GCTi"); + fprintf(bcftools_stdout, "# GCTi, Genotype concordance table (indels)\n# GCTi"); stats = args->smpl_gts_indels; } i = 1; - fprintf(pysam_stdout, "\t[%d]sample", ++i); - fprintf(pysam_stdout, "\t[%d]RR Hom -> RR Hom", ++i); - fprintf(pysam_stdout, "\t[%d]RR Hom -> RA Het", ++i); - fprintf(pysam_stdout, "\t[%d]RR Hom -> AA Hom", ++i); - fprintf(pysam_stdout, "\t[%d]RR Hom -> AA Het", ++i); - fprintf(pysam_stdout, "\t[%d]RR Hom -> missing", ++i); - fprintf(pysam_stdout, "\t[%d]RA Het -> RR Hom", ++i); - fprintf(pysam_stdout, "\t[%d]RA Het -> RA Het", ++i); - fprintf(pysam_stdout, "\t[%d]RA Het -> AA Hom", ++i); - fprintf(pysam_stdout, "\t[%d]RA Het -> AA Het", ++i); - 
fprintf(pysam_stdout, "\t[%d]RA Het -> missing", ++i); - fprintf(pysam_stdout, "\t[%d]AA Hom -> RR Hom", ++i); - fprintf(pysam_stdout, "\t[%d]AA Hom -> RA Het", ++i); - fprintf(pysam_stdout, "\t[%d]AA Hom -> AA Hom", ++i); - fprintf(pysam_stdout, "\t[%d]AA Hom -> AA Het", ++i); - fprintf(pysam_stdout, "\t[%d]AA Hom -> missing", ++i); - fprintf(pysam_stdout, "\t[%d]AA Het -> RR Hom", ++i); - fprintf(pysam_stdout, "\t[%d]AA Het -> RA Het", ++i); - fprintf(pysam_stdout, "\t[%d]AA Het -> AA Hom", ++i); - fprintf(pysam_stdout, "\t[%d]AA Het -> AA Het", ++i); - fprintf(pysam_stdout, "\t[%d]AA Het -> missing", ++i); - fprintf(pysam_stdout, "\t[%d]missing -> RR Hom", ++i); - fprintf(pysam_stdout, "\t[%d]missing -> RA Het", ++i); - fprintf(pysam_stdout, "\t[%d]missing -> AA Hom", ++i); - fprintf(pysam_stdout, "\t[%d]missing -> AA Het", ++i); - fprintf(pysam_stdout, "\t[%d]missing -> missing\n", ++i); + fprintf(bcftools_stdout, "\t[%d]sample", ++i); + fprintf(bcftools_stdout, "\t[%d]RR Hom -> RR Hom", ++i); + fprintf(bcftools_stdout, "\t[%d]RR Hom -> RA Het", ++i); + fprintf(bcftools_stdout, "\t[%d]RR Hom -> AA Hom", ++i); + fprintf(bcftools_stdout, "\t[%d]RR Hom -> AA Het", ++i); + fprintf(bcftools_stdout, "\t[%d]RR Hom -> missing", ++i); + fprintf(bcftools_stdout, "\t[%d]RA Het -> RR Hom", ++i); + fprintf(bcftools_stdout, "\t[%d]RA Het -> RA Het", ++i); + fprintf(bcftools_stdout, "\t[%d]RA Het -> AA Hom", ++i); + fprintf(bcftools_stdout, "\t[%d]RA Het -> AA Het", ++i); + fprintf(bcftools_stdout, "\t[%d]RA Het -> missing", ++i); + fprintf(bcftools_stdout, "\t[%d]AA Hom -> RR Hom", ++i); + fprintf(bcftools_stdout, "\t[%d]AA Hom -> RA Het", ++i); + fprintf(bcftools_stdout, "\t[%d]AA Hom -> AA Hom", ++i); + fprintf(bcftools_stdout, "\t[%d]AA Hom -> AA Het", ++i); + fprintf(bcftools_stdout, "\t[%d]AA Hom -> missing", ++i); + fprintf(bcftools_stdout, "\t[%d]AA Het -> RR Hom", ++i); + fprintf(bcftools_stdout, "\t[%d]AA Het -> RA Het", ++i); + fprintf(bcftools_stdout, "\t[%d]AA 
Het -> AA Hom", ++i); + fprintf(bcftools_stdout, "\t[%d]AA Het -> AA Het", ++i); + fprintf(bcftools_stdout, "\t[%d]AA Het -> missing", ++i); + fprintf(bcftools_stdout, "\t[%d]missing -> RR Hom", ++i); + fprintf(bcftools_stdout, "\t[%d]missing -> RA Het", ++i); + fprintf(bcftools_stdout, "\t[%d]missing -> AA Hom", ++i); + fprintf(bcftools_stdout, "\t[%d]missing -> AA Het", ++i); + fprintf(bcftools_stdout, "\t[%d]missing -> missing\n", ++i); for (i=0; ifiles->n_smpl; i++) { - fprintf(pysam_stdout, "GCT%c\t%s", x==0 ? 's' : 'i', args->files->samples[i]); + fprintf(bcftools_stdout, "GCT%c\t%s", x==0 ? 's' : 'i', args->files->samples[i]); for (j=0; j<5; j++) for (k=0; k<5; k++) - fprintf(pysam_stdout, "\t%"PRId64, stats[i].gt2gt[j][k]); - fprintf(pysam_stdout, "\n"); + fprintf(bcftools_stdout, "\t%"PRId64, stats[i].gt2gt[j][k]); + fprintf(bcftools_stdout, "\n"); } } } - fprintf(pysam_stdout, "# DP, Depth distribution\n# DP\t[2]id\t[3]bin\t[4]number of genotypes\t[5]fraction of genotypes (%%)\t[6]number of sites\t[7]fraction of sites (%%)\n"); + fprintf(bcftools_stdout, "# DP, Depth distribution\n# DP\t[2]id\t[3]bin\t[4]number of genotypes\t[5]fraction of genotypes (%%)\t[6]number of sites\t[7]fraction of sites (%%)\n"); for (id=0; idnstats; id++) { stats_t *stats = &args->stats[id]; @@ -1486,19 +1486,19 @@ static void print_stats(args_t *args) for (i=0; idp.m_vals; i++) { if ( stats->dp.vals[i]==0 && stats->dp_sites.vals[i]==0 ) continue; - fprintf(pysam_stdout, "DP\t%d\t", id); - if ( i==0 ) fprintf(pysam_stdout, "<%d", stats->dp.min); - else if ( i+1==stats->dp.m_vals ) fprintf(pysam_stdout, ">%d", stats->dp.max); - else fprintf(pysam_stdout, "%d", idist_i2bin(&stats->dp,i)); - fprintf(pysam_stdout, "\t%"PRId64"\t%f", stats->dp.vals[i], sum ? stats->dp.vals[i]*100./sum : 0); - fprintf(pysam_stdout, "\t%"PRId64"\t%f\n", stats->dp_sites.vals[i], sum_sites ? 
stats->dp_sites.vals[i]*100./sum_sites : 0); + fprintf(bcftools_stdout, "DP\t%d\t", id); + if ( i==0 ) fprintf(bcftools_stdout, "<%d", stats->dp.min); + else if ( i+1==stats->dp.m_vals ) fprintf(bcftools_stdout, ">%d", stats->dp.max); + else fprintf(bcftools_stdout, "%d", idist_i2bin(&stats->dp,i)); + fprintf(bcftools_stdout, "\t%"PRId64"\t%f", stats->dp.vals[i], sum ? stats->dp.vals[i]*100./sum : 0); + fprintf(bcftools_stdout, "\t%"PRId64"\t%f\n", stats->dp_sites.vals[i], sum_sites ? stats->dp_sites.vals[i]*100./sum_sites : 0); } } if ( args->files->n_smpl ) { - fprintf(pysam_stdout, "# PSC, Per-sample counts. Note that the ref/het/hom counts include only SNPs, for indels see PSI. Haploid counts include both SNPs and indels.\n"); - fprintf(pysam_stdout, "# PSC\t[2]id\t[3]sample\t[4]nRefHom\t[5]nNonRefHom\t[6]nHets\t[7]nTransitions\t[8]nTransversions\t[9]nIndels\t[10]average depth\t[11]nSingletons" + fprintf(bcftools_stdout, "# PSC, Per-sample counts. Note that the ref/het/hom counts include only SNPs, for indels see PSI. Haploid counts include both SNPs and indels.\n"); + fprintf(bcftools_stdout, "# PSC\t[2]id\t[3]sample\t[4]nRefHom\t[5]nNonRefHom\t[6]nHets\t[7]nTransitions\t[8]nTransversions\t[9]nIndels\t[10]average depth\t[11]nSingletons" "\t[12]nHapRef\t[13]nHapAlt\n"); for (id=0; idnstats; id++) { @@ -1506,14 +1506,14 @@ static void print_stats(args_t *args) for (i=0; ifiles->n_smpl; i++) { float dp = stats->smpl_ndp[i] ? 
stats->smpl_dp[i]/(float)stats->smpl_ndp[i] : 0; - fprintf(pysam_stdout, "PSC\t%d\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%.1f\t%d\t%d\t%d\n", id,args->files->samples[i], + fprintf(bcftools_stdout, "PSC\t%d\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%.1f\t%d\t%d\t%d\n", id,args->files->samples[i], stats->smpl_homRR[i], stats->smpl_homAA[i], stats->smpl_hets[i], stats->smpl_ts[i], stats->smpl_tv[i], stats->smpl_indels[i],dp, stats->smpl_sngl[i], stats->smpl_hapRef[i], stats->smpl_hapAlt[i]); } } - fprintf(pysam_stdout, "# PSI, Per-Sample Indels\n# PSI\t[2]id\t[3]sample\t[4]in-frame\t[5]out-frame\t[6]not applicable\t[7]out/(in+out) ratio\t[8]nHets\t[9]nAA\n"); + fprintf(bcftools_stdout, "# PSI, Per-Sample Indels\n# PSI\t[2]id\t[3]sample\t[4]in-frame\t[5]out-frame\t[6]not applicable\t[7]out/(in+out) ratio\t[8]nHets\t[9]nAA\n"); for (id=0; idnstats; id++) { stats_t *stats = &args->stats[id]; @@ -1528,12 +1528,12 @@ static void print_stats(args_t *args) } int nhom = stats->smpl_indel_homs[i]; int nhet = stats->smpl_indel_hets[i]; - fprintf(pysam_stdout, "PSI\t%d\t%s\t%d\t%d\t%d\t%.2f\t%d\t%d\n", id,args->files->samples[i], in,out,na,in+out?1.0*out/(in+out):0,nhet,nhom); + fprintf(bcftools_stdout, "PSI\t%d\t%s\t%d\t%d\t%d\t%.2f\t%d\t%d\n", id,args->files->samples[i], in,out,na,in+out?1.0*out/(in+out):0,nhet,nhom); } } #ifdef HWE_STATS - fprintf(pysam_stdout, "# HWE\n# HWE\t[2]id\t[3]1st ALT allele frequency\t[4]Number of observations\t[5]25th percentile\t[6]median\t[7]75th percentile\n"); + fprintf(bcftools_stdout, "# HWE\n# HWE\t[2]id\t[3]1st ALT allele frequency\t[4]Number of observations\t[5]25th percentile\t[6]median\t[7]75th percentile\n"); for (id=0; idnstats; id++) { stats_t *stats = &args->stats[id]; @@ -1548,28 +1548,28 @@ static void print_stats(args_t *args) double af = args->af_bins ? 
(bin_get_value(args->af_bins,i)+bin_get_value(args->af_bins,i-1))*0.5 : (double)(i-1)/(args->m_af-1); int nprn = 3; - fprintf(pysam_stdout, "HWE\t%d\t%f\t%d",id,af,sum_tot); + fprintf(bcftools_stdout, "HWE\t%d\t%f\t%d",id,af,sum_tot); for (j=0; jnaf_hwe; j++) { sum_tmp += ptr[j]; float frac = (float)sum_tmp/sum_tot; if ( frac >= 0.75 ) { - while (nprn>0) { fprintf(pysam_stdout, "\t%f", (float)j/args->naf_hwe); nprn--; } + while (nprn>0) { fprintf(bcftools_stdout, "\t%f", (float)j/args->naf_hwe); nprn--; } break; } if ( frac >= 0.5 ) { - while (nprn>1) { fprintf(pysam_stdout, "\t%f", (float)j/args->naf_hwe); nprn--; } + while (nprn>1) { fprintf(bcftools_stdout, "\t%f", (float)j/args->naf_hwe); nprn--; } continue; } if ( frac >= 0.25 ) { - while (nprn>2) { fprintf(pysam_stdout, "\t%f", (float)j/args->naf_hwe); nprn--; } + while (nprn>2) { fprintf(bcftools_stdout, "\t%f", (float)j/args->naf_hwe); nprn--; } } } assert(nprn==0); - fprintf(pysam_stdout, "\n"); + fprintf(bcftools_stdout, "\n"); } } #endif @@ -1578,35 +1578,35 @@ static void print_stats(args_t *args) static void usage(void) { - fprintf(pysam_stderr, "\n"); - fprintf(pysam_stderr, "About: Parses VCF or BCF and produces stats which can be plotted using plot-vcfstats.\n"); - fprintf(pysam_stderr, " When two files are given, the program generates separate stats for intersection\n"); - fprintf(pysam_stderr, " and the complements. 
By default only sites are compared, -s/-S must given to include\n"); - fprintf(pysam_stderr, " also sample columns.\n"); - fprintf(pysam_stderr, "Usage: bcftools stats [options] []\n"); - fprintf(pysam_stderr, "\n"); - fprintf(pysam_stderr, "Options:\n"); - fprintf(pysam_stderr, " --af-bins allele frequency bins, a list (0.1,0.5,1) or a file (0.1\\n0.5\\n1)\n"); - fprintf(pysam_stderr, " --af-tag allele frequency tag to use, by default estimated from AN,AC or GT\n"); - fprintf(pysam_stderr, " -1, --1st-allele-only include only 1st allele at multiallelic sites\n"); - fprintf(pysam_stderr, " -c, --collapse treat as identical records with , see man page for details [none]\n"); - fprintf(pysam_stderr, " -d, --depth depth distribution: min,max,bin size [0,500,1]\n"); - fprintf(pysam_stderr, " -e, --exclude exclude sites for which the expression is true (see man page for details)\n"); - fprintf(pysam_stderr, " -E, --exons tab-delimited file with exons for indel frameshifts (chr,from,to; 1-based, inclusive, bgzip compressed)\n"); - fprintf(pysam_stderr, " -f, --apply-filters require at least one of the listed FILTER strings (e.g. 
\"PASS,.\")\n"); - fprintf(pysam_stderr, " -F, --fasta-ref faidx indexed reference sequence file to determine INDEL context\n"); - fprintf(pysam_stderr, " -i, --include select sites for which the expression is true (see man page for details)\n"); - fprintf(pysam_stderr, " -I, --split-by-ID collect stats for sites with ID separately (known vs novel)\n"); - fprintf(pysam_stderr, " -r, --regions restrict to comma-separated list of regions\n"); - fprintf(pysam_stderr, " -R, --regions-file restrict to regions listed in a file\n"); - fprintf(pysam_stderr, " -s, --samples list of samples for sample stats, \"-\" to include all samples\n"); - fprintf(pysam_stderr, " -S, --samples-file file of samples to include\n"); - fprintf(pysam_stderr, " -t, --targets similar to -r but streams rather than index-jumps\n"); - fprintf(pysam_stderr, " -T, --targets-file similar to -R but streams rather than index-jumps\n"); - fprintf(pysam_stderr, " -u, --user-tstv collect Ts/Tv stats for any tag using the given binning [0:1:100]\n"); - fprintf(pysam_stderr, " --threads number of extra decompression threads [0]\n"); - fprintf(pysam_stderr, " -v, --verbose produce verbose per-site and per-sample output\n"); - fprintf(pysam_stderr, "\n"); + fprintf(bcftools_stderr, "\n"); + fprintf(bcftools_stderr, "About: Parses VCF or BCF and produces stats which can be plotted using plot-vcfstats.\n"); + fprintf(bcftools_stderr, " When two files are given, the program generates separate stats for intersection\n"); + fprintf(bcftools_stderr, " and the complements. 
By default only sites are compared, -s/-S must given to include\n"); + fprintf(bcftools_stderr, " also sample columns.\n"); + fprintf(bcftools_stderr, "Usage: bcftools stats [options] []\n"); + fprintf(bcftools_stderr, "\n"); + fprintf(bcftools_stderr, "Options:\n"); + fprintf(bcftools_stderr, " --af-bins allele frequency bins, a list (0.1,0.5,1) or a file (0.1\\n0.5\\n1)\n"); + fprintf(bcftools_stderr, " --af-tag allele frequency tag to use, by default estimated from AN,AC or GT\n"); + fprintf(bcftools_stderr, " -1, --1st-allele-only include only 1st allele at multiallelic sites\n"); + fprintf(bcftools_stderr, " -c, --collapse treat as identical records with , see man page for details [none]\n"); + fprintf(bcftools_stderr, " -d, --depth depth distribution: min,max,bin size [0,500,1]\n"); + fprintf(bcftools_stderr, " -e, --exclude exclude sites for which the expression is true (see man page for details)\n"); + fprintf(bcftools_stderr, " -E, --exons tab-delimited file with exons for indel frameshifts (chr,from,to; 1-based, inclusive, bgzip compressed)\n"); + fprintf(bcftools_stderr, " -f, --apply-filters require at least one of the listed FILTER strings (e.g. 
\"PASS,.\")\n"); + fprintf(bcftools_stderr, " -F, --fasta-ref faidx indexed reference sequence file to determine INDEL context\n"); + fprintf(bcftools_stderr, " -i, --include select sites for which the expression is true (see man page for details)\n"); + fprintf(bcftools_stderr, " -I, --split-by-ID collect stats for sites with ID separately (known vs novel)\n"); + fprintf(bcftools_stderr, " -r, --regions restrict to comma-separated list of regions\n"); + fprintf(bcftools_stderr, " -R, --regions-file restrict to regions listed in a file\n"); + fprintf(bcftools_stderr, " -s, --samples list of samples for sample stats, \"-\" to include all samples\n"); + fprintf(bcftools_stderr, " -S, --samples-file file of samples to include\n"); + fprintf(bcftools_stderr, " -t, --targets similar to -r but streams rather than index-jumps\n"); + fprintf(bcftools_stderr, " -T, --targets-file similar to -R but streams rather than index-jumps\n"); + fprintf(bcftools_stderr, " -u, --user-tstv collect Ts/Tv stats for any tag using the given binning [0:1:100]\n"); + fprintf(bcftools_stderr, " --threads number of extra decompression threads [0]\n"); + fprintf(bcftools_stderr, " -v, --verbose produce verbose per-site and per-sample output\n"); + fprintf(bcftools_stderr, "\n"); exit(1); } diff --git a/bcftools/vcfview.c.pysam.c b/bcftools/vcfview.c.pysam.c index a471f37..4fbe35a 100644 --- a/bcftools/vcfview.c.pysam.c +++ b/bcftools/vcfview.c.pysam.c @@ -1,4 +1,4 @@ -#include "pysam.h" +#include "bcftools.pysam.h" /* vcfview.c -- VCF/BCF conversion, view, subset and filter VCF/BCF files. @@ -115,7 +115,7 @@ static void init_data(args_t *args) for (i=0; iforce_samples) { - fprintf(pysam_stderr, "Warn: exclude called for sample that does not exist in header: \"%s\"... skipping\n", smpl[i]); + fprintf(bcftools_stderr, "Warn: exclude called for sample that does not exist in header: \"%s\"... 
skipping\n", smpl[i]); } else { error("Error: exclude called for sample that does not exist in header: \"%s\". Use \"--force-samples\" to ignore this error.\n", smpl[i]); } @@ -136,7 +136,7 @@ static void init_data(args_t *args) for (i=0; iforce_samples) { - fprintf(pysam_stderr, "Warn: subset called for sample that does not exist in header: \"%s\"... skipping\n", smpl[i]); + fprintf(bcftools_stderr, "Warn: subset called for sample that does not exist in header: \"%s\"... skipping\n", smpl[i]); continue; } else { error("Error: subset called for sample that does not exist in header: \"%s\". Use \"--force-samples\" to ignore this error.\n", smpl[i]); @@ -150,7 +150,7 @@ static void init_data(args_t *args) free(smpl); khash_str2int_destroy(hdr_samples); if (args->n_samples == 0) { - fprintf(pysam_stderr, "Warn: subsetting has removed all samples\n"); + fprintf(bcftools_stderr, "Warn: subsetting has removed all samples\n"); args->sites_only = 1; } } @@ -161,7 +161,7 @@ static void init_data(args_t *args) // determine variant types to include/exclude if (args->include_types || args->exclude_types) { if (args->include_types && args->exclude_types) { - fprintf(pysam_stderr, "Error: only supply one of --include-types, --exclude-types options\n"); + fprintf(bcftools_stderr, "Error: only supply one of --include-types, --exclude-types options\n"); exit(1); } char **type_list = 0; @@ -191,8 +191,8 @@ static void init_data(args_t *args) else if (strcmp(type_list[i], "ref") == 0) args->include |= VCF_OTHER<<1; else if (strcmp(type_list[i], "bnd") == 0) args->include |= VCF_BND<<1; else { - fprintf(pysam_stderr, "[E::%s] unknown type\n", type_list[i]); - fprintf(pysam_stderr, "Accepted types are snps, indels, mnps, other\n"); + fprintf(bcftools_stderr, "[E::%s] unknown type\n", type_list[i]); + fprintf(bcftools_stderr, "Accepted types are snps, indels, mnps, other\n"); exit(1); } } @@ -207,8 +207,8 @@ static void init_data(args_t *args) else if (strcmp(type_list[i], "ref") == 0) 
args->exclude |= VCF_OTHER<<1; else if (strcmp(type_list[i], "bnd") == 0) args->exclude |= VCF_BND<<1; else { - fprintf(pysam_stderr, "[E::%s] unknown type\n", type_list[i]); - fprintf(pysam_stderr, "Accepted types are snps, indels, mnps, other\n"); + fprintf(bcftools_stderr, "[E::%s] unknown type\n", type_list[i]); + fprintf(bcftools_stderr, "Accepted types are snps, indels, mnps, other\n"); exit(1); } } @@ -298,7 +298,7 @@ int bcf_all_phased(const bcf_hdr_t *header, bcf1_t *line) case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_vector_end); break; case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_vector_end); break; case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_vector_end); break; - default: fprintf(pysam_stderr, "[E::%s] todo: fmt_type %d\n", __func__, fmt_ptr->type); exit(1); break; + default: fprintf(bcftools_stderr, "[E::%s] todo: fmt_type %d\n", __func__, fmt_ptr->type); exit(1); break; } #undef BRANCH_INT if (!sample_phased) { @@ -487,45 +487,45 @@ void set_allele_type (int *atype, char *atype_string) static void usage(args_t *args) { - fprintf(pysam_stderr, "\n"); - fprintf(pysam_stderr, "About: VCF/BCF conversion, view, subset and filter VCF/BCF files.\n"); - fprintf(pysam_stderr, "Usage: bcftools view [options] [region1 [...]]\n"); - fprintf(pysam_stderr, "\n"); - fprintf(pysam_stderr, "Output options:\n"); - fprintf(pysam_stderr, " -G, --drop-genotypes drop individual genotype information (after subsetting if -s option set)\n"); - fprintf(pysam_stderr, " -h/H, --header-only/--no-header print the header only/suppress the header in VCF output\n"); - fprintf(pysam_stderr, " -l, --compression-level [0-9] compression level: 0 uncompressed, 1 best speed, 9 best compression [%d]\n", args->clevel); - fprintf(pysam_stderr, " --no-version do not append version and command line to the header\n"); - fprintf(pysam_stderr, " -o, --output-file output file name [pysam_stdout]\n"); - fprintf(pysam_stderr, " -O, --output-type b: compressed BCF, u: uncompressed BCF, z: 
compressed VCF, v: uncompressed VCF [v]\n"); - fprintf(pysam_stderr, " -r, --regions restrict to comma-separated list of regions\n"); - fprintf(pysam_stderr, " -R, --regions-file restrict to regions listed in a file\n"); - fprintf(pysam_stderr, " -t, --targets [^] similar to -r but streams rather than index-jumps. Exclude regions with \"^\" prefix\n"); - fprintf(pysam_stderr, " -T, --targets-file [^] similar to -R but streams rather than index-jumps. Exclude regions with \"^\" prefix\n"); - fprintf(pysam_stderr, " --threads number of extra (de)compression threads [0]\n"); - fprintf(pysam_stderr, "\n"); - fprintf(pysam_stderr, "Subset options:\n"); - fprintf(pysam_stderr, " -a, --trim-alt-alleles trim alternate alleles not seen in the subset\n"); - fprintf(pysam_stderr, " -I, --no-update do not (re)calculate INFO fields for the subset (currently INFO/AC and INFO/AN)\n"); - fprintf(pysam_stderr, " -s, --samples [^] comma separated list of samples to include (or exclude with \"^\" prefix)\n"); - fprintf(pysam_stderr, " -S, --samples-file [^] file of samples to include (or exclude with \"^\" prefix)\n"); - fprintf(pysam_stderr, " --force-samples only warn about unknown subset samples\n"); - fprintf(pysam_stderr, "\n"); - fprintf(pysam_stderr, "Filter options:\n"); - fprintf(pysam_stderr, " -c/C, --min-ac/--max-ac [:] minimum/maximum count for non-reference (nref), 1st alternate (alt1), least frequent\n"); - fprintf(pysam_stderr, " (minor), most frequent (major) or sum of all but most frequent (nonmajor) alleles [nref]\n"); - fprintf(pysam_stderr, " -f, --apply-filters require at least one of the listed FILTER strings (e.g. 
\"PASS,.\")\n"); - fprintf(pysam_stderr, " -g, --genotype [^] require one or more hom/het/missing genotype or, if prefixed with \"^\", exclude sites with hom/het/missing genotypes\n"); - fprintf(pysam_stderr, " -i/e, --include/--exclude select/exclude sites for which the expression is true (see man page for details)\n"); - fprintf(pysam_stderr, " -k/n, --known/--novel select known/novel sites only (ID is not/is '.')\n"); - fprintf(pysam_stderr, " -m/M, --min-alleles/--max-alleles minimum/maximum number of alleles listed in REF and ALT (e.g. -m2 -M2 for biallelic sites)\n"); - fprintf(pysam_stderr, " -p/P, --phased/--exclude-phased select/exclude sites where all samples are phased\n"); - fprintf(pysam_stderr, " -q/Q, --min-af/--max-af [:] minimum/maximum frequency for non-reference (nref), 1st alternate (alt1), least frequent\n"); - fprintf(pysam_stderr, " (minor), most frequent (major) or sum of all but most frequent (nonmajor) alleles [nref]\n"); - fprintf(pysam_stderr, " -u/U, --uncalled/--exclude-uncalled select/exclude sites without a called genotype\n"); - fprintf(pysam_stderr, " -v/V, --types/--exclude-types select/exclude comma-separated list of variant types: snps,indels,mnps,ref,bnd,other [null]\n"); - fprintf(pysam_stderr, " -x/X, --private/--exclude-private select/exclude sites where the non-reference alleles are exclusive (private) to the subset samples\n"); - fprintf(pysam_stderr, "\n"); + fprintf(bcftools_stderr, "\n"); + fprintf(bcftools_stderr, "About: VCF/BCF conversion, view, subset and filter VCF/BCF files.\n"); + fprintf(bcftools_stderr, "Usage: bcftools view [options] [region1 [...]]\n"); + fprintf(bcftools_stderr, "\n"); + fprintf(bcftools_stderr, "Output options:\n"); + fprintf(bcftools_stderr, " -G, --drop-genotypes drop individual genotype information (after subsetting if -s option set)\n"); + fprintf(bcftools_stderr, " -h/H, --header-only/--no-header print the header only/suppress the header in VCF output\n"); + fprintf(bcftools_stderr, " 
-l, --compression-level [0-9] compression level: 0 uncompressed, 1 best speed, 9 best compression [%d]\n", args->clevel); + fprintf(bcftools_stderr, " --no-version do not append version and command line to the header\n"); + fprintf(bcftools_stderr, " -o, --output-file output file name [bcftools_stdout]\n"); + fprintf(bcftools_stderr, " -O, --output-type b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n"); + fprintf(bcftools_stderr, " -r, --regions restrict to comma-separated list of regions\n"); + fprintf(bcftools_stderr, " -R, --regions-file restrict to regions listed in a file\n"); + fprintf(bcftools_stderr, " -t, --targets [^] similar to -r but streams rather than index-jumps. Exclude regions with \"^\" prefix\n"); + fprintf(bcftools_stderr, " -T, --targets-file [^] similar to -R but streams rather than index-jumps. Exclude regions with \"^\" prefix\n"); + fprintf(bcftools_stderr, " --threads number of extra (de)compression threads [0]\n"); + fprintf(bcftools_stderr, "\n"); + fprintf(bcftools_stderr, "Subset options:\n"); + fprintf(bcftools_stderr, " -a, --trim-alt-alleles trim alternate alleles not seen in the subset\n"); + fprintf(bcftools_stderr, " -I, --no-update do not (re)calculate INFO fields for the subset (currently INFO/AC and INFO/AN)\n"); + fprintf(bcftools_stderr, " -s, --samples [^] comma separated list of samples to include (or exclude with \"^\" prefix)\n"); + fprintf(bcftools_stderr, " -S, --samples-file [^] file of samples to include (or exclude with \"^\" prefix)\n"); + fprintf(bcftools_stderr, " --force-samples only warn about unknown subset samples\n"); + fprintf(bcftools_stderr, "\n"); + fprintf(bcftools_stderr, "Filter options:\n"); + fprintf(bcftools_stderr, " -c/C, --min-ac/--max-ac [:] minimum/maximum count for non-reference (nref), 1st alternate (alt1), least frequent\n"); + fprintf(bcftools_stderr, " (minor), most frequent (major) or sum of all but most frequent (nonmajor) alleles [nref]\n"); + 
fprintf(bcftools_stderr, " -f, --apply-filters require at least one of the listed FILTER strings (e.g. \"PASS,.\")\n"); + fprintf(bcftools_stderr, " -g, --genotype [^] require one or more hom/het/missing genotype or, if prefixed with \"^\", exclude sites with hom/het/missing genotypes\n"); + fprintf(bcftools_stderr, " -i/e, --include/--exclude select/exclude sites for which the expression is true (see man page for details)\n"); + fprintf(bcftools_stderr, " -k/n, --known/--novel select known/novel sites only (ID is not/is '.')\n"); + fprintf(bcftools_stderr, " -m/M, --min-alleles/--max-alleles minimum/maximum number of alleles listed in REF and ALT (e.g. -m2 -M2 for biallelic sites)\n"); + fprintf(bcftools_stderr, " -p/P, --phased/--exclude-phased select/exclude sites where all samples are phased\n"); + fprintf(bcftools_stderr, " -q/Q, --min-af/--max-af [:] minimum/maximum frequency for non-reference (nref), 1st alternate (alt1), least frequent\n"); + fprintf(bcftools_stderr, " (minor), most frequent (major) or sum of all but most frequent (nonmajor) alleles [nref]\n"); + fprintf(bcftools_stderr, " -u/U, --uncalled/--exclude-uncalled select/exclude sites without a called genotype\n"); + fprintf(bcftools_stderr, " -v/V, --types/--exclude-types select/exclude comma-separated list of variant types: snps,indels,mnps,ref,bnd,other [null]\n"); + fprintf(bcftools_stderr, " -x/X, --private/--exclude-private select/exclude sites where the non-reference alleles are exclusive (private) to the subset samples\n"); + fprintf(bcftools_stderr, "\n"); exit(1); } @@ -756,7 +756,7 @@ int main_vcfview(int argc, char *argv[]) bcf_write1(args->out, out_hdr, line); } ret = args->files->errnum; - if ( ret ) fprintf(pysam_stderr,"Error: %s\n", bcf_sr_strerror(args->files->errnum)); + if ( ret ) fprintf(bcftools_stderr,"Error: %s\n", bcf_sr_strerror(args->files->errnum)); } hts_close(args->out); destroy_data(args); diff --git a/bcftools/vcmp.c.pysam.c b/bcftools/vcmp.c.pysam.c index 
f1345e2..80b2420 100644 --- a/bcftools/vcmp.c.pysam.c +++ b/bcftools/vcmp.c.pysam.c @@ -1,4 +1,4 @@ -#include "pysam.h" +#include "bcftools.pysam.h" /* vcmp.c -- reference allele utility functions. diff --git a/bcftools/version.c.pysam.c b/bcftools/version.c.pysam.c index af54532..236d935 100644 --- a/bcftools/version.c.pysam.c +++ b/bcftools/version.c.pysam.c @@ -1,4 +1,4 @@ -#include "pysam.h" +#include "bcftools.pysam.h" /* version.c -- report version numbers for plugins. @@ -41,7 +41,7 @@ void error(const char *format, ...) { va_list ap; va_start(ap, format); - vfprintf(pysam_stderr, format, ap); + vfprintf(bcftools_stderr, format, ap); va_end(ap); exit(-1); } diff --git a/benchmark/AlignedSegment_bench.py b/benchmark/AlignedSegment_bench.py deleted file mode 100644 index 98286d0..0000000 --- a/benchmark/AlignedSegment_bench.py +++ /dev/null @@ -1,43 +0,0 @@ -"""Benchmarking module for AlignedSegment functionality""" - -import timeit - -iterations = 10000 -repeats = 5 - -setup_binary_tag = """ -import pysam -import array -read = pysam.AlignedSegment() -read.set_tag('FZ', array.array('H', range(1000))) -""" - -setup_binary_tag_from_file = """ -import pysam -with pysam.AlignmentFile("../tests/pysam_data/example_btag.bam", "rb") as inf: - read = inf.fetch().next() -""" - -def test_read_binary_get_tag(read): - tags = read.get_tag('FZ') - -def test_read_and_process_binary_get_tag(read): - tags = sum(read.get_tag('FZ')) - -tests = ( - ("test_read_binary_get_tag", "setup_binary_tag"), - ("test_read_binary_get_tag", "setup_binary_tag_from_file"), - ("test_read_and_process_binary_get_tag", "setup_binary_tag"), - ) - -for repeat in range(repeats): - print ("# repeat=", repeat) - for testf, setup_name in tests: - setup = locals()[setup_name] - setup += """\nfrom __main__ import %s""" % testf - #try: - t = timeit.timeit("%s(read)" % testf, number=iterations, setup=setup) - #except AttributeError, msg: - # print msg - # continue - print ("%5.2f\t%s\t%s" % (t,testf, 
setup_name)) diff --git a/benchmark/tabix_bench.py b/benchmark/tabix_bench.py deleted file mode 100644 index 431cd6f..0000000 --- a/benchmark/tabix_bench.py +++ /dev/null @@ -1,76 +0,0 @@ -import gzip -import pysam -import timeit - -iterations = 5 -repeats = 100 -print ("repeats=", repeats, "iterations=", iterations) - -fn_compressed = '/tmp/windows_small.bed.gz' -fn_uncompressed = '/tmp/windows_small.bed' - -def test_python_compressed(): - '''iterate through with python.''' - f = gzip.open( fn_compressed) - l = len( [x.encode().split("\t") for x in f]) - -def test_python_uncompressed(): - '''iterate through with python.''' - f = open( "windows_small.bed") - l = len( [x.split("\t") for x in f]) - -def test_fetch_plain(): - """Stupid test function""" - f = pysam.Tabixfile(fn_compressed) - l = len( list(f.fetch()) ) - -def test_fetch_parsed(): - """Stupid test function""" - f = pysam.Tabixfile(fn_compressed) - l = len( list(f.fetch( parser = pysam.asBed())) ) - -def test_iterator_generic_compressed(): - f = gzip.open(fn_compressed) - l = len( list( pysam.tabix_generic_iterator( f, parser = pysam.asBed() ))) - -def test_iterator_generic_uncompressed(): - f = open("windows_small.bed") - l = len( list( pysam.tabix_generic_iterator( f, parser = pysam.asBed() ))) - -def test_iterator_parsed_compressed(): - f = gzip.open(fn_compressed) - l = len( list( pysam.tabix_iterator( f, parser = pysam.asBed() ))) - -def test_iterator_parsed_uncompressed(): - f = open("windows_small.bed") - l = len( list( pysam.tabix_iterator( f, parser = pysam.asBed() ))) - -def test_iterator_file_compressed(): - f = gzip.open("windows_small.bed") - l = len( list( pysam.tabix_file_iterator( f, parser = pysam.asBed() ))) - -def test_iterator_file_uncompressed(): - f = open("windows_small.bed") - l = len( list( pysam.tabix_file_iterator( f, parser = pysam.asBed() ))) - -tests = ( test_python_compressed, - test_python_uncompressed, - test_fetch_plain, - test_fetch_parsed, - 
test_iterator_generic_compressed, - test_iterator_generic_uncompressed, - test_iterator_parsed_compressed, - test_iterator_parsed_uncompressed, - test_iterator_file_compressed, - test_iterator_file_uncompressed ) - -for repeat in range( repeats ): - print ("# repeat=", repeat) - for test in tests: - try: - t = timeit.timeit( test, number = iterations ) - except AttributeError: - continue - print ("%5.2f\t%s" % (t,str(test))) - - diff --git a/benchmark/windows_small.bed.gz.tbi b/benchmark/windows_small.bed.gz.tbi deleted file mode 100644 index 500fbe742edec8df907c533568892f7b4d68fabc..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 34590 zcmX7Pc|cO>_x`78%4wS1mll=Ml+2Q8no?1rvLdsjvQkT7#&T@cNK8?dHdD}~va(V{ zGqc4c+eAgEgs3!dDX~(~iv@;?hzPR&Zr`8(T)FqX=UvZvpXWL6h2;PC>;HcKS!e%m zL-;TMI}{zym`kK$NUwypPb{?7e7cRtnUnR{_`T3#@J%ZlKF_6_@@F5bBe z-aQ-~aL4Cx#KMS3+sH!^`)sXDj98*mQ$EVnMx>P`(HV#@`gF#|F&#;&Qa;*$P`DxI zmlSi%vH{nbR+aU4tJK=(RBokNq}F-v3nBe~{`1LW#*F%yKlFjiW#W^A^|yDiX!V@0 z7L|3m$f_>fZ?&HKGwtxY*8S2Oc9j{A3Y%S+zj;TE-bfg-&bNuCHLGQhRPW9TrU?YA zLVi$ce{Otc>Q=c+r_QBC{9X5cAk9Y|o)R7Z&nmU8=>&euoH}Q+Gn{iZ6G~nIHwDx^ zBjI<)sRK;E;*ZT*H6OWvVefT)C{O+&NN?cr#F2@KLQYuy4cnH;?FNlhc4_4s*q7bL zZgphNdi1E0kniy4^IL_fYge?2(M|i2fenqZwn!(&DSE6Y(uNVYzveWPE^3IW$%8mw zz@lpo^0;`5IT7~;z4yW9K9~9JtIeN1?4Q6QI9t1vZwNP z!gBALjl7m#A@W^=U_g|3ay-K)%eW%rk-bG&|@mUrw;+2sOf2$E-akhGcwHCEX3Mp2eUfVydnrfH7{5d z@0M7qDt@Qr@vBQEih^J2=$eHLQo@Lm!#x8jS7soMId?R_=bIU-O?PIdy4eJ=I+m;l zx9RVf=kYHic)CcNocv0#4duST)2WV%5Bv^M27Y52 zt^n*v-`0W*Ol9TQyaS_^vM*E_XqS%?$idQAm*F^IiznE64hf$XgFCD7U}*0IbxaqE zMqf?iF}kY(ij~{}PT?z+chnPU3_SC+{x;a)wL3~jIGa(n4%J|fF1PUSPj#&F<{t*D zG(B}k%0u9On9?!MW@s2x1g>YsV(t(TmCRWz!gDL1Li3-NN}}H?4+F>K>0SuUbn)nF zmR;!w)i)w4m*d3+IGc0DPIA!jAd8wdT3P9urvY2sBLE(P3z&9Ni% z7gIx1Z`0C#20|J-kfGPh<#2E=S|eAnzc4B@~Jbx75cVjP%&}VG~MDhMxMay3e>{ymxd~Vto@Hvc`og;~+Qp zGwqvBn1KC$DXkcFg5(_Yo=r7>O|6aM?P_43JPhuOCZ3h1em)Wwqqq*OwQD!Ozh(g7 zl3eLMyJEh!)=qH;$=Ai(wBL{6L#MMlm;bOpvzql>WsCG<2q2FbcJ5?G-di#CUDHwG;+7u1&7VRX 
zTIm|PoD4MX+{28lUGippnlpzsm$Tx{byM2(O5c{zTG$$GLmM23HZ@aMy0}d}XLqF! zAq3wfZbo}%jKauD5%{cd&S>I>mb1&>Bx1=tK8e<9iTnd*7W2^J*NJ>|LaOISxtAB8 zOhgR12z<=Gvt2)F9<)6GFcRi%tO#TUyr4XlxaN@@i(3FlYW?eIa~G|+{#$!m;f)y!Q4xRg?!MXdCU})Kj3I2Fm$-`~ zik$ri>5}|nXpHHN_Q*2XDRyR0`fnb{t)}jum(8Jx5XRq8CXsdy!@$Y>nEo5%@?x?I zeqKc0*@pADRe;m zM8J%vx7X#*ht{rE^aS#mqm@qAsnwyi9q&@9JTPc>?Gkj=dMln~@K;ZD+&s#ETE;6+ z>!!-%vE9+wwR!YanB;0>uJ1(vT|>P1B@M2UYU{SMzWqM z>|crSEH`l{AC*=#ITK;;jhQlii!3_; zakS#i>yk;{7KQa>x5Y6RJf`mVmfi1&x9vqvO4xH0Rv|80M3m zzo=j|necXt3TVFY6EW2NO8y1@;%uzr{~?C{o$n-prY6Czsg8G_1R?6x9fx&!RqC{M z?pSZm{~>@s-q`)#!h&9}!(!o9a@`8@p>T_3b7HXRH~gXKx=`}p@W~~0XG!>tbL+gw z?r_7ub&g~&c)%6S1?Jhp?2R>GwCGu1bCy{cMcBq!gq+LhkE@|E^F>)3YW{;TK5efl z#GV$7`gPwM%JNRIZ&AOr%cf~M&>S12qlf8PY!?E4>{9Gpvan}g@G2S<_dhLI_e_%cft7n zdY5`VxFHc5dfLD+Ox{lfC*`;cK?cTmuD|V^9i%^qSXEpai?%9bIPB=YJ3X%n+`>=_ z`q*%&^fYhhu_TJqG%t?WGflTOZ#gy^ibDNqJuU+;$C7)$pinBunCYyntMcsN#1Q#( z^+#4S;B9G|!ny16M@jA-hp~ANNeSPA*Z(P=0cWRp4jFl_AqFFX#h~0!56CntEg8Up zDf-FYLMnsU+g<_X3rW504oE74L~M7c)`t-Lz6WpoL+sMnm{L}O_)bOZ*%eL{S!R*l z=T%uzmvNhko#`6KX>{Lew5G3KC#zq) zV`Q{3E*8#SCi|*&@YowqeE;*ZYvcgLgHYWgNq6#aS}~PI+9)h`>LNjQ-5}VC*Ux6B z?A;x|D@lD=vK@b9&#G>%17l}+?}%|h*~9bWy$3HT|JyD|k~r8jrF35t&2`GnMBthD ze`ezHRcV>FPJ*=S6=il|eH$;Ldx~83Go9&=~gbqD`1lNn@3Z4Jcz}hBd%^BsF;(@ z_W^?elbWiB7e=)vmcl`xfu-dVN%DqLd3I~5D)+4}l^;kVAy+Uc8{@}_)g|S%5$fYcMIN)kHb~mhQ7yOzj!$ne?ix#`U~7PosSc`};1lRuTe5B`JY4g(pX~Wk)4jCY`wt)b>ypD}jK~du%Z=M78ZUikDjBLXov-StDsZM= z3HbA-I{x+H>p*1qb{DW&9`Z#a-452XEp8PEV!3DaXFzD@?3HqgTN$l#MdC4)(GTV(GgRq!G-EEvQ<&Tcn{K%g#D_000MMz9B^UX}!RBQ~2NbA1-3M-9i9Jg|HNPadrDWy<8Nu)e9 zY)S!=ManUKn7}-h9g*GqL(VO%3^!8az^3xM0Joo)#p=@}{N!q&7_2kCG_xHv?Ea2D zt~Hao}cmo%hjz^*qmY>2dDShgSoEY-bHy$CDpXei>hh?H*X>OAQe=JJF~%RF&h zG?bN0lY&F^FDxJR6Wa#^x~Y!KtdnKnNZmN7z5(`#w<8n;t3sFq1q7@6mY1CX$EY|T z3C{I`)cl`>pkNPxp6d9lI%z6Z9Svd~2E-y=`jB?E`YQPLlL(Ul1eKpxl)a$aTDOFP zp%9gnmkhqvGYSNQEA5mUk43`2t_E0OQzsDWpNM;^L0``-vX>*SpJ207%DSAgIwgTixnK^N{Ryu1j!xA0^Ufa zjO@aKOTmQ{gaC)|y=v$tSjovalMn$n!_>FHx!i9^-Z; 
zVA+b<=Lg&_+j0|DoQF_t=Q_?*gRC$kFGnrwxLhUh0+D?Z|7$7iOlaT=FIOMLa}IE2 zIkjGvTwe4bPcTy1$K2sACH=mjBGI1g32(X{`wQ~%3p^&FZYlX~vtCEd=wS(Q4cHYq zkV4xzZYn-~LvEl$B)AEO!J1kT@CU(i8{`a+uUl7~V-`dm3a88_FN9YxVnqn}^iFU` z^@Xq83exX$B4T_^9=Hb&FW(E&V8RvI8OX=&a9(cg0^~EO+;20;tk2kv`uAT13AV@L zz0c`|^Ox;yp@*aF93IM`w`CWoYsRD7ao+l1(A~r|-9U9^G$6R8k5f#_`TO(@ipe>N zpp(|P@!|Seir#mL38n$RA?@F43`kOB-+d-}z*#6;*`EyYn4-ms!C5GUc7v?y0LE$2 z0JP$OYo6{aN(ggZ1kOT)A21V~g&!qDx!^1`Za41K63y`Y(V^$=_W9?V<4_ygIPR*HHyf$?3^&)=;@y>v<+K= z_yLkO)6!EXFqM5aJsk+R;Sb>_uq&-jkTguC6U8Y6bQEeaUpQJBZLtse>0mMZ0IFf&ZC4QzFEEcjZj9tayb_LgTIZ`AFW~ z$+Nz|!2g;vCbNBc*oo{#NRt`bfVOv}ToN}`-|2Us^&Fw_t9e~|ebhW9=c zROmyxBa%~`atBfWZLPO%E@+Wv>^2Mq<1))FW%u$sHMRL4et9-_$lm>x+^*l3^nw%Q zh-~~Frxonf%Uh~o&s~^<`e{U;CRM_;syQhmTQa}Y{*j4D|C=o^FTq6oAnK0m=?$<=&3ncL{?m!dXMS$lFm;mC%Fu| z<|^jXX5u{F_sY)nZ0bAxQNvai#D`mmO)bS6i}|zki;C_%y-u4;>dV{@SI@*{@>bo^Lz&DVn$hCOm3Gjx;nEf&~Y7&(wmhRk;?x6q%32It&r{nD?#^@T7~g??m6CWrrd5 zZ^=z}&^1J*7mG4CoBlwiFEy`@79dTCTiR~Fnrch04A%evYpHRbbV!67F@QX%)?dRP zsm**TQ`g}2NT9ejI_G2H<+#nE3izG~cw?FVn(zP{pAw?JBs_2xPsFN^(uHRxjRkH` z)ia?H)gPRAYW@Dj9_m_WRA-443W$X34*iI;oHN`J&cSov*?Kk)l>a z=Y27a2UbyJ%Ut2OiNQ+s>ec?ze_(#Lg7*3EOZ;xM+wwxugD8huFsgR_M*rmo)x zpCqX-XvV{bC&l~{Q46ff8%?XXyrl@Gho?#=EPHbr0v=mGQgn<7F4{=+xQ&JtA~>&! 
zEM3S1cipD(X1hze)`odfqbw70>$bi>fjG=UB)r)U?Xt+FyNpDO1OF?zaxw5>>hZlFQb?Y{)EC%n2x02OUTJuS(sgQ&|j6 z<9%oqSNji9cfSe3`GafZ^~EH(N=YBb&;hwB@#1oV7d&c_$<(6knVHO2S6MFxF5JHL{V?Ow<78?K6 zE!mUoZepnvNOSm@>i#QvpIFAdxr2@}9PFi^qzk>xzGIqu!|E;ATqE?zU=#K=Y0h)d zQ22P374Au?k?R_#8}%5dN~_QrGV3h8N=^%r!=_9chK)o{EMzqOowk%!6Ec-0H6Rc* zaxMJJY&|AWslBfYLB~>lq%-G{h_Uiv?NvLy{xPOg>D&VT&Q)kB7m37JsXRUXBKSK{ zu067SzD=BU+@wY-#hiee`fno5 z$%hM7T7L|*^EIKR(VoRSG|c&zlp|U5>?HR|$}8n8&+jUo;k1=ougjk9Cibfa&D^GW zTxqH??Br6ebdk|~Y`=V->VUcbDMNEXcZ1 zMdEK?oE<)*P`@?K=h;wC_Ho~3ukVM2esBC>oQbSg4QV=dz+XJlv&AUXu&J6a617%W zISHvPzbInlVcW*j`(q|B-p3HCaAG`0-Yg6OoIIXE#cO|pRv zu!;T)I0pliC!M#dw*57SA{-qBJZ5~6uljGED*7MGZ)R16x8RP=+P&OQ^`4|nZEy9r zyHS$W}^Q+ANu5v6hB78U2t;v^Uv9HtYuUb5(hQW9az+_KK~ zxWrUuZgN$$qhLN(_+Mo1N*CryPtHn)`b>|IFkXlMIHQiGS-c-^z1kxtT!Jz(6Wm48 zBE6$J*_82vowLW$hK<>KJ?i*k#3;Nk7H-OIN1^YA0r6FmgIAKNAhy`;i4cX2b(tqvkKJ5gyEu!U3XDaW9oG8Fy;0L(P}y-zmNwm0c$wVs z-(f(aiHH});pULrgH2cg%VasLeMiMmwhh7(w7@rMJbQ zSSGBfOwCIj1R2Ti!|-RXL2kRki#rMm4*dalLDKvy)gR=glOg;)AnM+GSnUB~>4ybu z5a`!)dU}=nhgBaUoFqZ4aA!ASJ-Zv^B((u|KcjN{yld0AmEK45K~+F;BU_*Yp}0N^ zm<_&eA*)G7#hnwlN)Tj!+1LxJ8_9pm*gx6r2Z7y4<Vqr|gkW=(dDyy+6RbOol5c3shE?4d~KA&eKF?g3PlnOmG}bRILI|fxNYW^-~py zTD16u){tnpFH;ove)SQMOcUVmScwbb$y@HAFdoqI#e2HV@N z1`uFF6L0H5*6I-!(W~NY&MunO{av zWtr_@rYdW@={@11C}FYZL^^SaB~g9U7!zXmz4NUj$xGp4Nc6ADCwpD9yt{`uE^AKY8|Ag87j? 
zAEag{c8C6)W_KrX4+_e(7lS5_D@^<3FeZZ<)_nk zwxQ38RwIY9h8uCO2$qm(N*w19r~?#dzMG~OF0-d(SSl-7nlw=9W&f`aZ5a$j30+)a zUr|t37wcrY2V++}@j9=IFwA7S)L6rY5o(y66fry*B6v8}Vd<5ES*YHViDxw2(8k+6 zT_iWj=fJ_cnIE*}G~nz*swBa#{oe{;o@6He;7avfji*O+%pdnQ^j99xXV3kN5(e7O zR7ZD2*nwKX>h}8=1z!5kijOveubi2;*4vgy0Ji4RVmoFF(|1zNI ztzeVNkiR09&mPRJ48P5H)bCV`3k6qxI$Z$jODV;ucotj7Gk)%2d6mhzNmMbUhFw|#s_^Ci>RSX8Q>ZO+x}*OaOepR=!O`3|DSFM&EGTb)Pd{bH>soBsjK-9^ur2%9I#F&tCMb?Z9Kdi zitwj$%KW=ug4y>Ea@9-Ny5#17{sRz3)0P@4gmZ&P39re4Tz^dIJ$5Pq15lQ`!6omP(=0E`b4L?LTr8XM)ag;9=%Z z@~)=#fZ6lxT>WVc2})=#;ovdJS0vej&7g$=a$Oy$5V$2ef#pr*Js&}7z_8DK3=uIE zLQO&z{U=hk3>h8StpwUyij-6qii!#O?I}BkE_8kM&I(exRx5m-Y0-eq&%s2;ftpHx4NCr zJJ45cSZdskcWZ*P9xe~4xu8D}I?dkbf*16{8gIlf=!WO1eW1(f4gV)0+^^{Q8BvZb zoq9%(I=*}gR4G?`I%b0o`gx_kxm2=J4RK{=FjpGCZ5-!a?kY&b(687o^tJ_MpL1`h-XoRaOgEGPURb!r z9tC!MNfAGpWa=1<=6B3+M}H5l4LlHvs9sh34@bD9@t?*Zfy(JKYgxXo6HLIbzCRO| zFaC_G7CQ)=Y|Ecwl8BAASC1G>cCk~0reR?WJ=ziyn8ieD|ID_6YH7Mr;J)o0DsiLrL8gz4Yx zwL^<=waVY&gLy$>?M%jav;F}IKY_z>!F!`9^%{u_{=Wcb8ki`wcyUu8jI_P+1C1vO z8z&yJ+ta^=YGt?CZ?!W$uEMwgX>AYE|C+h(I=j{sknjFbAMQb&qrpyh#J{O)FwOElR5fArLfP?|M*9u8&J$C zXWwGwsT!ktE7>=`01L%#?W=UOus+syWbz+Mzq0YH_KeBqhcEJ!@z61eJi0}22sjF! 
zR_cQZ`p1+98RsR*p~JGG4^rb(FmbGsFPfani-5uDtU;0z8-4W-zP@LAA_nd$?Z~=+1@eK z77uj%1~*ZO#+o1yR8vz46!)o?xk(BA|B4~~(ea&|ad&jH^15|*)7|KqW#eanzh2~3 za01zZ}a)ZZ*e&Zl2 z*w2`JIY&AJ=TON{UVy9&>9lbVr+~zDaX?l5IK$>n$p9niWU80L#9Pn+E`iiFcF|dn9gJm*j0iD%9 z6BYuOV$816A+uXvBr^bWiYt9*S`|y|Q1g>MIJ1JJt^T#;w}U<#gY1iqPclDAw99F8 zANoeAi9S@4`}cKNNw|w!Tx97Fv&VNw-Gj^z)0h~XT1V}27iVo^MRNv~Su+JbVT*mU zn7;w9U8yrNic}X}iq+*oea4OUZqpZn)}E&5&cI!=)=v^zWt!EYkioRet$Z&N=h9l* z++$GgP$rPr@0jlh(cS}ISMqO?k?Z4$4`K`SJwg|`F*mM^XYckdOSBj$Vje7=vQIYZ zmWY(FNJnP(e%w0}zu0WFmuoAk^2Ea|c3Z3{Uq;QBF23%IpgaanUZAn`I%vRQCpkmk z?m(NHwYOAeUYi%$x8k}F!V80|5L45d(X(;)A=GbFdPzB3;l?KvLIb?RYqk*I9~20- zJ7KlN9QMd)ypD=yuD6IC<=K{3hJ7sLqYcCq$7y5?@q>om$EPgc>%;vAF>Qs*Kd7^B zu!z2Up@?IvrmYwBjdFi}&cU5>DxFwl!Y&CIH=SdT!vweZLe}p1wj;O-IfTE-!||vS z^VxM178Y4&=%oCT0)_w8cr+>%Y*4@;P0v+*oBNv&cECw7QsiKL$mW|b5x$@nn;P}kW#TlCrkCkVl^r~s z%XEs#Jr4EdSI^#7PJ(pZ%KsvwQ6i6OYP`(jns-TopJS z`ds)ABJW+g?+$U&UIKSR{NKv2UxTHo(eYbXsXfF<%Trr!v!?kx@MJ6X;m;=3<@LOp zCKm%Hjv_&f%!@ooB3;!q-v-Y5=O1-Pmr9y4PI6=R#K+FVJpu=}O1k9KDXwI9{5FqO zA2fELmHY%MdR}xqk%VJ_gPU~Cp^~AR=TiJ-OS~!*IN6%tmnS~@fL)Nw*M(y5m$T1q zO|+9|M+&}$tw0Xi!RV=&4&d^wvK_u ztnKk8Gq9LqT}`Bv>MOtr`rR#8Z$wR)D+MdLT&LnM8{L0o{S}3qmI>&@X5WBwmO}mi zKFCe(onqCKOk39vn$wTCWKBcCci=Zk@qI(ELzjRKM!i%1IA=9=wTqkiFIl_9m^+L- z>|J<@bhDDpj}f4W=EB|a1Ky@EezH$A)|K{^X!Z?wd@e|&O`4-ky3g(dyA|qN2u0{u z3S2lV$Hui~>~RtBtE4#N9_%Qe!6iGnY0uwSM{I)&Hl^5SOdGbxuOI~$#bej*#IDV6 zBAI8;I+@&H9S$y}E3?)LSGTGwhnWlgMH%MCbN!@Z+WZGQGY@izOVw(2zQM1rYPk_z00QuB!v$8z1u;2goZpFIxS)T=Vv z{&$3OsniFE0V6{p_wjyu>r5+;w|c?sS;O*Qo0ob{Xqn;$=FyvSSdz0_0><=3k_Pl< zyy(^5y8-|Y@;-CRoQfxC?AMy&Bb}Gmg{}utwK%z1%h-Zx_?Ep5m&&5OP;v z0(J(m1Uv$On$C@bL4Ca!|&S^4@i1fA$Ds{7JrM&1267LF%YxTuU zX-nAUt$?50=rduqQ=>tg021{t)8tLK!<77(pS9m!7pf%kHgl!U)(OxF4jHPzqpdm5 z%e<^>)epex8z=^jrkM?#-AbeUj4tv2s?Tc7QG!js=Fb8ry6zsx`!P|t4i^E1NP-4b zv2Un|tQt1|0k8(kqDI2}FjMC6d4j5s&a9uW8yJ9#JljK%ltT@4N5}7docT#-H_y5F z>kiJaG6dJ~O))ov8@M-q{9NKotHC+%;WCWI9DW{ny^ogDTfL6bWc-pEB;D^HaQD%C 
zJ&!wEFe}Ds=7W<7N5zI)g^7b!ZAd!`vV689vm*j`KY{OMRi4`se{f~pZ)!pZt#vG* ze~M@MO(6K_0z!hG{Wt=ex~UJgDZY)AxC&@7`iqn~DH)+S^`z0p0kS3*T%Ur(u_iB( z5%y~quCM!BeN*fouyXy1!A#&Uub@=E zoCm5+_Y)&@=?=wxyH-RJ+y9s#-ObE3HtzEaU_Fm5)8{QEs~QshblK0Ba3959)+?rm zj7oq}m@32_#OBxoPHvY~sd-g>f!~9skoPta-~TmXUa1byU38F>7M6i0b1$$er$2^E zuI+x4@z@wK9ltxK4h|-t=S}mXR>(4e+hxw-R68;gULUbSiC6<9X6LP;+8HlN4?K(& znUC^)CS8vjjmQe_F}Kzx%0>1NA~8Xz`mUSaT{FO43tagu_A1c772ku!C2kTr6}$R( zdm}87f=!-|{td{&AQipiwEj-2vm51nXgb9m7aT6QPz#er(+l&j^#ukN=m$$PfTxt* z@tbR71!`}0N`d2|CT{6nPlk)?6n6CKM`~uBpL&V>ex;k>!C1X_z+`aUE#qi=#faT5 z@Qe^J>rI0WH7E^kBV%d?EujF1wdI~s(+(;m*}b{BQKhdk(;%L3-1S)S*oF^P{T&gH zH^kQ!8pqn>MpAc4NEwP6i6DElve_DOgO!%wkkY)QU#^V)J$-EZn-W&IV!$O46e1sA zD+t8qwqJfIFp`?r3m@A>hGU`aYNf!27`85(dHn>aRpRR|vuqR}?G~ig$`l2U5}hPR z@o%67fkwrM2XL-d5^|r{B>1oP=?KJ$9YcJpb@pI0ap1{}iJYEZ@CSLvvBAb%P#-xq zY`9Puw&KM}k8mLw1aT0?CGQufJi?(*rP~)tAkTEoR6JvSwRy#h=X6o%$`@&j^*_ZC z9O-Q(qSWncldb6jWhe2Y#plONqY8h*V_ohOY%~fyTV>R21W!tINccif={Pi{9zJMA z8V%2M+)b8@Q+nMo&kT>n0qV7NPeI9b*S3)XbAev>uSxX;j|1=Lkj^)LL1kl^(0E9D zfAX>~>HlNw-wPf*EO#f*sw*+mM(UkjbJxu%aQco~5bxI5l(PFjNZEr#H0;alZm+Po>3uG)rKKUw?qVsy613O5&0VN}imc?({!AE-x-j^9V=rmLcR>IfW(f zRm18wL_$C0C`ajn!;RRi{bH2V!LI8n7s}Lamgs)DU%&3~h-}DQ^yUe2pJmpMLCTc> z1Y(mc9@dll)@TitlV*OF3XMMsbe8my5}El(+0TDaGSWZ3nO67|No`r~6jt~dDKK7N zn#qGt`Kif^VVM;4lyMxfsEEUZ8V_Y!X>JM9&&=1xb=$tPGo`uKG2MXKBFA9Nn?{3(P8xMM5!#=;eKh61+U8ykCW3UNlj9Nk17U*d~UoUXZMd( z^gpKeY?B-|-0TYsZ8i3ExQ&o|KfAl>q?USEH;Dzy=7k&f^`UPKO^2Fqk983Rvg_P! zI;2R=uHto&a?+Z`UZnGNld2CIaTQNQI>E}~wClcGo-85Pe#kP}cZ+%7&GUvUPg-|9 z5zaM7jLpqzNxwjfC`Gff-8&$YQVTQQfOpMw@5q&ri*r0t+E_cwWAE1Oe&l>|=V_b^q=&6kai-Id4235m^A34x>H z{O6G?K_8n8GF;Wps)_6w!r=>a{Tvlqu0jux$Ox5uHNonc)ucZ|;e}+1+l<2>xP=pz z&ngmfG6-m{l~$*ygMKU-)m?=UIOCq3D(j0ZP8(pJuljT<%d^Fjd^lc$m zGTqf>?^U@7b$XEwNmf1iAj{J&Co}8%WXpDFNIYoIpwd@0!(<{=3%7fh>D2Mr#lm^w zM`liDJ$OB5c3<0-eo0 zMiz6`eq-`-RY;j~taS`~$L@>dbBrl%cz@w-7@^LXsgTV3)>r~q7S*zF@z~-9>2>9? 
zM2A`w!lJnPRq$=>>H6v}RMTtH^Z4rKs6RiERwoS?`NL`_araJRBjS^g8{s#&J)g#M zE0JrjSR~I8DU(eG1M@rXY}9RLdb4l|>Cye|d>wcw_7-(fukeB>B_IXQTc|1EtaZ23 zRjrc#R-u{kT5&LM>A$JeREfRgG82V`LNNj8mZA&pjYP*KFU3zlL!9m zNS)5n9U)CWUNx; zZ>v(NL+CaJ$m$y^lB%(ep?=9x-4M<+7o3^&w+wLZrmiaG@*!cK3UonIzrM)mFdRN< zQYVA2%O@j1m+}CZG~u5m195r0aNyQkoD4b7dy^2V=9 zKZWxB?bL_2B->Y%!;W5#ZBcsfns3*e+bc`ApFFUsGj&g^@|vQFR>68uC|S~ly~ZcC zHDgk@K(48h$26Jc*yTp2I1 z!Rgtd?JTDVWRx2?&nJO~RN`ydne;DQ3b9jm22t{d;ZP%30$tR%VPsg{e#;m(hRgQGO`IqGe36-$!Sp>m8wwi2b)kEL|oXTldb!{QxYLwqih zi;~^Z(g;y3?uLXACMNx*6W~0digGu>Aovx*WIkt5eyYumWA}Os-QN&#d)ST8Eu(QS z2aUA|GvhlTfKgrN88dn2PM2I=Ds(Y>Hna-l3Fuu5Vjl9T;Db~>*D{O6yp8t0f+%nc zt5pc|s-=lE39>WeGC%FiVX(+d{q(xqJR1)lx^zrUjk-+%d?-kg^L;FJwb`sFIEK3} zTnRVsS0`b4J<0FHJ{WMXgn6Ck<~DH29aekt-cq@`pwO^~_(?m@qg7KV2^GcZRB}&N z6SqTrX*2Kj2D;c9>J~Qw-+MkiY?HvpvpzHshkdoklR36}Ab2Z{ggfXvvnrWaJjMH6EuI&>~#7*tk*RZHqUD=z)DTXJsCRKdXcWf%C zg2fltA+&_#=B#DP4xNGXG$8D3x|})9d1?tQl>fsta-NEM#m(GtLUI@Zt`zX_35k7~ zuFBWE34-)30qw>+m5^w@#blCX8c~*NOLrPYnf1DHY<~A~id-WyjDm+jNGgp8ejJ$_ z@*ONPGNR%`i2d}svRmn&kYI&e_<4CI60ETBi&#UR0-J(gp{$^&3CRigzm#QSy+L4* z|6OKNrUCCTnR6{a;5)9S7Yg`NVt-t{`}@YBe6D95BKWMF#sgc6X&1X$$F+IdH-0rx zOP8FHE*P#^WNDQ%v@H$VxWG}3<$p>|X4EuY9HKna606GY1IzF(*C?BZ!S?oNmZiB) zWMR644TR)>Sw#n$Q)Vl1shTXdXHBkBV2ER7D_a@^%ZgofhPdv}hK9p)OtIZvo987b zEC)X$JLRlIH%Ga8v}Sz{QFp}#mb7Vln#R*&t%LF>NgU7X>5qfE;?PtGJU_jrCi&5T z_EF8zdEXjRd`v;zEW>n45-TQ&al~D@L-huo9Cl>ua^)}Tr|rY4yic53XND+EwopH- zX0=u$%jQJ=&A>g0tNEtARkKLj*bc|&8qTRc`}8YSpTk4NC8`#1gv1}n2kP$m38 zY(0S|bWsKtWn#>yB}bol3dN5W5&8m^tDaIEN-{Z&B9UF^<`4ky>i) zCJ9gtrux8RqE&AGt#Vv>JKNa-(t=*c>ad%mI8XUs!j5f3cis`)aGqmgvWct+&d5s` z54`RF1eO>Bp37q+Vu`<4|NLU~-Ym-qS3Ovapi*4>R{jd0H%PxAe{@s&)TyJ$G$pU% zQ;?@#(;+#M7p6XXVXK(sH}zUdtBpgAT$FcJO<{*rkOleDxy^OYgY2!#8WZ;8 z>@B}dcA)sLJei|aD7<~{0BoI2sBfqYmtXcCaXnn!%u4!P3;bzvJ)}xrkW2Zk?{-s= zJH`5ME%D?y(K9}ssm za#|NiK|3{qji&z;d)@(fNd*3Bjyw2;li_9J&u^(3qiI1-)?oanKtUXM(4sY!`ZSEZ zcS~{=4JN~`&qjV+YFeq7ag-l3E-x>sb&?MVZ%3xs%MC&0cE_CMgV;^gQi~FrZUx4QP0YNm&l7{JXCxhZ(i8YYMcZ4uf|i? 
z+4GSLNu#IP{B$bmz*C;1^d;}i6uHiiTU(J!2EV=T3$KSS4*);ZC=T=G4l8}(DjQrd zdD^M7DT{-h)_HFFjBr%m;))alp zGE$9wFhUGQ3X{niLk43T!wjA?eP7S>dOgqYkEefz``qU`*SWU$`x^Is3XV=*X-F$c zIr`W$V%LytD>5=I)~vF$8#rVuzh7Iutji1xEE?} zeWBGzvR=3jgYMI@)zY%t^Y|q-osim_vQPzZ>^g<}>BP~<4Zqg_3|g6_s7;SLXyZmKH<}nkVR0BI^o%WOt|2we zTksLPa7~V1Y+bn2B_t)aHAn%u$C<+p?QU<3|03j?#`>K ze`KR#i;^LO4*D_J1JcKpqLN{Jc7Ev4Ls;pZdG%>)`sJh-@)XWUo(YEGfVe_bD)#Ww zZgK6Z`zxc)CliwxV~21veabd9G;QU;stq$@@ps6q8i~Ci zr~D@ZUcAv!m^LrmFLs_d8n8+(d0BZVn#MeC_vlS1AB-cCD!nSk(IjA{P%uqYWjBXgo22P$>W6 zNoNpB2a-otvrEQ`$e}Y`r78AD;nT=c)V^8sSy}AemcSj?nnQcM8x(BEMP}c;(7^{z zOuzYeBjCv?keOOv#;EVPJ+K=U7i-S!9Si*C4IZ4bA#grP{=&-5tV1c_ttpeNBk|Oi zCBhk^CX*mHv!(**+=C)LDIt^9h9Om_vf9lrj#Qi+Po6jy`R?6a32dOo!cz(G9_4`jUS9>up#(&(eh>M0mKUIcm?S%)TXb`q;&NSruSFoaxBa`;%i(Yg0P znmIF{TLa#?0;$;s&M2uJyp#0Ai7rjp_M48R-$9zJ$b4PJVV~L?+^;|tB`#}aMeno0 zcz=a9g~ImO0tx@toQSgA_$t=HO*JulVSnzPKo=|q0yFhWD0;GdA-)I&UNO@(@rs)` z9=YlIhdZk8)Iw!I2luLZ3AtHO6E|>C_`0Y}dl4$_F3&2mm|rxdZ`D)vxk1Ls&#l-b z``xn9BQQzm!-LyMEZ46GZ|G`!7J<}9H5yS$8$^u{W8r2-hD)-Yv9rV{TAgZ_ONuoRZ1L8M0zcC^cWVp^9?gCLcf!Cbwy9{ z$_%+QTUw;3V*-^1YYS_2NE6z>MoF6xwxb*kI?77Phc)t1gyRc++ryVX}gin@#oNQohCgrD_4IaZDsPcxG^ z+fQwhhC05bzW}3UxG%Ydm{OCpBkp12X$ER}*n#!nz(03aU4cZA&UFO1x~zXQ;tROi zcWqhFJYTj%)Ul6e!%7+^nFo+7wy8QFUxu5@=IibqWS{vUOKk zWwS}z4qjP@2XiLFS53j3T*;1Ea8I_^k%=`R@KG)JV`gw*BMN-w1~(S_>KFku?kCDs zZ8SwTZgdt*D}p)d>R&=w0*OjhG$7CwORoZZXP)h^B`-N#UD>~7y9hS2ed5nYa5-w? 
z@glfc%fK)atS-ILOfLji)4MtZ3o$Zedf}moEwJvMd|EoVY*yZ33bZjAupomu4p8~1 z$uLCW#_;;I8(4Q;<%vxAcp4-_)n^rmS}mtP2Xo@Ebe%$sL#Dy)f1)U0-OZps3E=WI z_D_3}wVTP{N72MD-asmNC2-H=c>v>v<>cWYV&$o{o_Bm@{%NJ+^J zEFtvvpD5t;ovWw)P~@c(-%{Q^Mu4fPx8Q9Iu<~)mDBzTa2g-g}AigxHV*m=TR%$+R z1JANyX$w6OT+T_FW(R`7+tR9Fs>7p`kd3Bhg=lm2RV8q_H!lSVw%CO^f(15|9>0QT znX;mQo(iUlEPcThVql8R^`-Pm@Vj4uHXLlAwBhjta7PuSZx!4e`Q|0PnY{Gt4QAtL z01!2hd>92r)0UJVz@oprV(9T;>MhRgMzFk$)Y-jxV4NQo;PjBC`GhF|vY67wv?Cz) zjI$xKd%$u#8+TVT$V)Tb528J8C zb#4MgS^Vs@449)@aYz+0zNfkA@rGUwl*{<>)B&tpI`SkM?8gUd-^9wldB*evxETdM z!mkfE&o-MlyEg+iL^Gb&05?mY?^pz*?|sNj;Ki0E%cg}ehRPpDxy{D2V5M*N0BA)w zVs;<=vnLfeq}tIAo-TgD4S=?fz%7)F`O9l@t2o%>Lf?Ozq@#~DtZ#jXf#bvaW*+!4 ztL81`JN{vQgI#xLp2}j;+UO@(&S=T-?&IM2F_*w4{Xd7IF6{gaTQy@gU~hJ*-hxr# z#UsB{LZ)P{fI2GB=@8=+bmwfm3{r(5+yPJRZ&b;`sYV-xTeqS@nD18yO3Ot>l_sl= zvy~OCHnvLFpCo58cfY&|$vkAl+@1gLO-IjOR3NIcLy?|5AMX04aTQU*sr2~Hs${Xc zQt}TLvfc*oH3MXQCpX+`ehI^W%xD!n;QT0g3vf2__rspkVwFnG=WbLaMoU3!Tfku< z(!a_}?s7+KHe8pjQHkOXj1pP3XTCFiv!x}1qpFr>Yl*Gk1`M}6XJ?cFu-2vCm5FB4 zeOoZfhE>>cL4X6nUZAeS7tUycv&c{@rwn2MuztTR&{Y&8lqxDY6>`O4sdFX8p|){j zxbT4pXqLjxJ+Nd7Y!tUl$AjkXFlUDw-H33oX4;7UiTk>Y1fEQ-F0mFq_HxF(R9;Zb zX0rN{G3toTwD4ta)M1+`jA4z2hqLzj?I60~;n} zKIY#wE-YlHTVVAtvo-s*SRVjd_gBWJl2OyDmyG+5=}n>xt-&c09dM*U-q|ZtVqtdS zIK$O*OsS2hr=RLEQ|EKuOwZ8O2L-cJ3RW4#s+ETtC#}P5!Y8iRC`B2B6CbVe0)Nmy zGWN~mvM5_l-8A^(Nn=)!q>CMn{pdcqK`nAWozX3_yTVyjq*q9H|zx0 zXhq3yWEM!lm;}3T+~>cP>&sjmAe!vrCZ44IQl)(ip;VO2S{;&n94_TQmO-gVoRpgcT-W-rL#t{qCNNyX?+A&silN$aAFw!$4YwtJ5@6rHqPu(cUbF09E5N+KMltH>*IggoG?+l zwG*qSK~GDXo4&!krTp5`Cb^a>{7U}gR49w;`pCCDEsCkyJ`>FyWxerV+O!h=P3dcO z-Q)nOc_}&MFnJnoB%CZto{}&13=wnqg1X|c_;qV)Wa)H)v}{XKG-q>Jbb3Z#%siywiGg-Wpzxxe?#{XK7t7jR%y9*zFC)0kGl!gICXcUcHEi?B{n*E9N4K;E%zc)b!%ily z@sYVE?wLO<`sRJ-lk9%e_m-dLDvJJIdcn=3cL6D=BPUKri)3^L`lxB=hpHo8@ z-(0R_pt)>lWj@4<1D+$ca7EqYyy{R&_Su5|DE%Q3@uncI_F^&YBYyWPE{Z`$aO39O`7(n_B{`Rwc6y{esTPc5p(p~x~O9%I7^zl;jf-=4kXQ8clVIA`a zLc%d9mwqY912O?s$MF+g7W9OGT{V|J9(qQANNv>78*`+d3K{+)pz#HW0h!e1Rlwum 
zyX+PmI%U!yiBJ2Fnu*W-r9SD|ACGUzr3#V{&rM22HTy!`h5C_j*M6ycJQIF}cqs>t ziGV7*9(xWdCpl?0zdAw2wT16qBbVIo4iet$F1MwhkFzFxl8mcQCyK`X2TU`z7xxe- zO%fXNUv{Ta5*DVF%Kg8o76|?X5;r;b`Ew=O4JcP-&lJo9-kYj<9N7V z7T-ofpCS9rJ%)(R=;!LJ9XqQMFc*5_Zd`!3)9>FW(Wl5sueyI8L1$1jYTI~HKaAj$ zKA=D1bNh}t9w&yl*<8MBMav`JJD8Q;>=qow+1r~>wB)pU7U@ol=Er6=y9wQ+>-5^I zBU|E+f&F;1DSd5Gm)DHTbgh<@Ex(X>$EEt^iA68;JA7_XrK1Lr=xjqzrRdeZfg&=S z-9CrJ{cx?$U4pesoOcfniq~+R$Zdu?DHA>CVnm{p5Peef{GPq}IwYCl=aKGdg1?K5 zXyJ$aza6@3O$$Fm#yLTA8=q@nTUT5rayOkFBZ2%OZF)1l1ZM?|G-rzbgwOT1ag+p0k~K$JMiNO{a;eURHf{$g z2e|DUjM@Xi8(I{gh7|OG9++#qA6wwkpC2j^oD~V{kA(3ycH0+$M;l^cU{G>keeo)g z63QE#sZ%@4N09?-V!0ZDaT|{~2@^ftM!ZKY=kVhQbT&mVU=l`8r)UJw55u&8v8a}# zHgotIG8#o;@GoyFjat)oG`4n>De{QEihZ_d;Hcs9pU3xIqI1<>ersj_u!0{}Ej9tm zl_YWk2T12m{*$ZTSliwPMp`JokAY}4F)2l$I-Vc-HDBk2j8HakfY@*F;D{ZM4w22c z+g@y{NnA={eY*SSEvgtWgVAyA>$C!*EhG&r_c?b`9O!K0s%pX7g&3U{5b`DgKd`vh=r zs`unkATUMRj1@P52}3UZUMn2sy)EjE4eiH)OU5D85OBkaIa4=NC!kn3UxCptwRgP# z1#oQrq!)29HeWa?z{TKz&O`BhC5D02J@q5~>5=ZksHAnUxG5xR4@fC$n&Bq)a#E2|*0;JKk4@vWT$q_c>~7YONi8<+kHJ?{M%k?xxI z2{vH1a`u!aP?&M2&JHLX#ku@Ur*=_v>lSbeLTot$NIcbM2X+~Y^u^$Fapsc{V#wU6 zdL0NCovoeIBzeT{lXWd$EZ(wiWo3I4Bu#~R-Lp&U*ga(sjn@T*yY1C&hT7tRh+);K z6S?Yo=i70>P|IhjAY^LIx%ZzpyU}J7Zsw|UueX+hk>?TA9AKd)HhmR@-Lv&}Y64!c z`|IVYYs9Sw-9KMYGe9jUxuCBM45h;D4CKyLPaV%yA5(0r1+)9#QS*V^E`b3MGKfd* z{O8?iBS~Ulr$#|jr-7Y3B59{Ukb3i|f+|sSuIVIW{D97z<4kK>gU*L#?$A5H45JeE zz}T9f<40Wky%yNuAmGOI*0+FfDE_T4!ID+W)GT~%bMvG(u*OArs4*>1ha@}>;tasn z)?e6K>1`3dVkU9jwHkTC^GfSJ@aSggBqy--?p?aY9KLLo`W);e=w!rMAeP!L55xk{ z`i})#b>?6v0JL&}9ejl%RDizu!VcZ}Iv)BB(m_8>+EdYj4J}?0c0U%4}FJ!^M4T#|O1Y44)+dd@c$@-bvXc*QW zXspX-8Nk9Ufg#_*_I&ZghdLwDvrO$9^6Mg$#iPu)fSHHOjC(nS0sOVA9LyI1dbOwf z2Vw)nYG)mCF3sU-i)yj})<;ZplGoX4w}kNcupA;n#_hRIz$81qC+w*XQ`~JvKUpV> z1&Vp@`Pdku>)s2*p2Xaw?S4|xgkDZM!1X;@{FnMn;_GW~x4)9|^>R!fIzd2k4dI&d z8*`n~;jZ1bEWgrF<%xut@RI>7B6SI|wAAm`s1W#~9ckh)8OU=*n0 zc1lsdKVoAO*@{`N*m0>0aHj`C*Vn2aj-aCnal8n0#;4P2X1YyCOCOobWWWX4Q4afjJ%xV{?Jy0ilJP_=z<)!c! 
zvp@j{nB@i15n$FB$aa8P-mWan`kek3v%W&pn+?7+8*q++K$IN734uWGG(ndFMriFQ z50+$TjNSuFX7T`VQ*ZRa$+-4Yn!Bn!fsI?E(eFVH&<|7Ofcz-FMv${KDcZBh=(gsl zgNPn`?3_I|v6+=yGLz;&@Xl@Pov1zC7do{Hf)rK-+U1~u0u$`vJvFherqDrp1@ft1`cx^S#_ZcPV#BUc)sj~>Xg!Ko$kMSw8o+fQ2 zU@+;tZw|=YdVML7mR{;LESv?HWyr!T-`8NIT$gznY{!8=4d4iEB)0S~Xm$M$w4DEf z7IPDj9l!AdY3X5>zcc21nLO__s2TXB*@O&(z0_LWj zjY|QA!huh$w?%cvUKMHLQf=*9y&q!&HYIqe?grR3RUXD73|j58z*L|6d=r7-040EK zeH0SAHQ?&C{Cg)r^iS|xgJ?jW^i2isoiAqY_UHs`@YpIbK;Rs$0s+(6Y{0&s(QNRw zIjwE;y!&uLyd(?K#ip(RjeA||u7Sm(GNXmSW*CBX_vY|bHohsqd|N6^KvHy`9l!vF z#0_cp25AM09b=`{z@!jykxW|5X0Cj-$4)Z^Neb&nml=pU6JVYq>VsAJ5Wo#y-oinF z<%a`(tAXyDCUqcU(w_QafowhgryxQZfmAqH)xj+EBFHR#YB9*HQ!@H2P*G(Q?zI>D z-K3^}GJwI_dDh_ZmUD*CcOZl&-5iwsJ(2b^~|^8(P;qIJpaA(v{my_WItE}JTg z4BFsKScw(PBmtVUz@`eeGNy-90@qZy0l4K6e@NQ{nurl=0yGh{TGcH3GukA<4O8rt zYk&)0qLxt@j=x`*P_rnTtWGP`hL(2^#y1WUXc(0k0xE%o@CqnyHuCKZFil7`+5vn3 z(1HOb=-$c=Tn<313FLPk{0s$kxIl zn2;nHU^S6aYAI!t)ng@)@{1-{R*v5I^1mhv8>NF$lT>Z}oPJ6D z{)%6{g=Qn)mnFM8UdqkBY^Ea2@8|P2rlOLEKL7yQ?UMYlQ_-9$ptrN-6rCk~^qY{pRn4)lDv#{nl%Fyi~MR0Yj0R4*D+~&2yuLRQi zhl~|#u$j~8*ft_wN9}+Wiwg-Le}IAk;l;B2pE@^^NMVRCf8w^vZiaL<7qWSRZ1R-D z<$?4zkNpJF+jM9O10u>wrh>)p3IrFymd>Gy;z8~q*eh7Bw-H!mfm>GF3J`3|z6he+ zcrRuGe00SFjN81wyjg@Up=^ftYpo5)ubYy#l+i4jg(eUJSd( zH3K*F@I?dm{@D#_21blkMnR4epNr7|vCxB2K>egB@i17#LJ)wQ=bYoYQHMGF^B2A` zpqeyY2>jZTeaTFPo4dz8+jux{@*g0>+g1DS96sq0XyE{?>9SflulEO$K*rdg)+x|r z4?9N#V-&7T@qoLr$I^fVhKVz%aMNk-ZTDVl6H4FK-2&rUj z7m>CEeV$D~zB)08e+ky&RGkt9PHmN>gGzy0nY3c{V#al* zH0X))g>`$+$zbB*_wKwvUnGdQITP_LomvA~66k8ZdN@MBYBoW>p!uxTkSECoxEVu0 z6IVbi5MBnBqaBN3fCVBGo&E)&_6i_e$lO)l(2KMJ9iIel*J=}4`RqibOLecGJv$_8 zd!+m4b<{-uq67A`r25A9= z1C3bU$nViuwU;#gnZGbl5BZ(5pebIG$-@eg6=Q^8}M>943Q^i#PL8v05$$ zSqy`;=AIT5+1@-s3+!{#J`}Kod$42Uz(fzZnt+463V2sQ3hT|aBEamlZE6eX#Q3+# zEbA-B?z2*glN!syw$l-30_M6O(XRsIw1^sHMLKjf+Y1+U+ruaAk&~9d5K9h#V&oIi z(guK#Ypx^=fF->Vll)+4EQ$^Qrk|6V32a1GivWd=f@;?SOYY5(0FENf{&sJJ>Y#hm z3z!b0+tYS9qQ_0B@syzabtP&ket)*b0m&Oyw}S6{sdkJU?A9wC)f}+=;4=yN9|)yr 
zwjtkT*I;PULmgo?M7oo1KMp@G#@zY(j2gXvtu(fBsP)H! zEmMPC^3Xnou~C#29;ynX+&R>uB;bBD@v=$l(i_K5=c#vfMrS?gO(&&pA1vGFQmGJ9 z`W`3gi3<;!`m*9!#Knx)otvvRE4y!MwT?gz6&RqbK#6|lkJBI8hm-3?He;wKX`q6U9>&;KAIsDBdD`y>w;AB zr;%xw(Lcdq{yO?KN2=}${_=EM-&9+a3w2i~eLA)id~rc1EmH_(i$|8I^Lff3 z>&@D@&($qUZP+GwtK$5vr%D&8t)}x>PR0asQ>HZ#HI-SewjpH}eo*ptE62*zI!f)` zHgy|*3Vwg|G_>LcyBl5r<(e2v@x7}udO`JUp3Jx+8!mC`=_|A`?_CnlL}qNwn(rrG z+B*TyJMlX2aIf=x9lW*6RJ`nDvL5niq&mG;$RFXLdGaoPihW2?8)x^S2*_(YBA3;o zRdXl_#yjm_cAd&&(cccel@_W-#h4hkop5%VStY0)ZYdPNn8yy4;d~4{3I#3a+AX8% z9vzmdKRSF@DCqrU^AF#F``VsRuXRbBz0Mcdg#2;ja};<3 zE!Pq!@^I?O2BCmmb3mZhEk5J}&aS>#RrTX6@-%@c z{R3OdiOyNL;rEWY8Wy!eSB7z6FCDSU=yA;)If0qR$(V=Elf6V_?mNRE2(?s3Q7dDCXlER_S*6=ishmE0&3>xLTe7#!V_kC4G{<5R`V3y zPC~wGOZ-vtUi89`*oDQm_nT8$Nxl}&9^PF1#L3<)Ur$NxFGAANPM=iN0<1${>DdnQ zHz06xyN<)`)c%Wgo;(|Z$j{Hwzdh(qXZ7_UT=ItHV#9@e0evyT5x&2?=5x8nRAOdD zamFT7v9km$AI*Vtu{wT1vsLM(qX&QfM?p;<97p5Sdb65D?07u7_we1>Gx$&rh~7a? z_eu0Rdg1il4ZfzJV@dGV;}}O&BRcd*#1>m81oz`BdW8S(q6U6dRIVAxI8x=hvk@Ji zc1V)J|i@uy= zdmQo~mzXuO;8EN4pyJPvCVLqlZyw}@q|}#};sslLcrlpRAqv{Y*khT)Qi^&zr0$)V z6jAc$&ZS>$q40l5>IeEIgly6hItSI-I0sP30J(Mpdyibcb$pCir0=emb`<2JYRoZF zoV>|Y$*k{`;{1J1(?!P}LFym5I#+(FkAnAcXo3&jUsYc;)>6o@>+s#BGx!LOLhXar z`LWCE=)sDaH&&I-OxxuQTV}$PqF9pY=0e2`@`WGl&BNl`r6*t z0j^yp;{|2QI(|UI$jL=JU+V}>QgAV>c+XnAh({}YD&apgP9P^*?na>?3Ll>$)@qjG z^v2bqF>a4t@Y0U_1$e^eK0o2$Tj$FBr6_x2dYDwAf_$;cm4Qn$mp$qQ zmaG>9=NrBZ?5e;;-uk($Ju4*n6lfyw-h6Ias>y ztIc9qs0~yG{-0QJVwh6%8F)hWZ{@(*3GgT1Jv%P5*@-)oH}~ugoS2=^ICN*vj$>^_ zh@i0V!&d6gn5ZJrLKH9z&K~bQv_HbZXUQ|eW+!~ss^w07;);dOFGhw9?b57OGpQ`w zX2Tz0|DKekrG%OB?(q&q`7?f1lY&u-n6X)_Mv~iz0%kp@(iPz|n1R{h zdXi?g&78!nRsFU4%RYB}QX4aErVVDTLjSElsDi=H4r8q9XT2?ag2pmzNM^IcpB?Jo zNZ;~t?e+GmPpq``v6IZS84jMc`eagnS=z$K6q#xB&0p`n&t;$|L2h=q_D=mQ#mYwm z=!ri(JN$uUm1Wa&f7a^5z4}BiOCM=OhD~SGtd-yG`paiA$k}1ES^a6R+dc=!GHqH_ zW{2Sy>t`jcd^UT$?Nj{`*43opPl}iT=^k&}B7cVFThixDo6r7@Rz5!-W!ijnn;ot& zZ9e^;^kh>B<5=2LXd*POkHCM)1hSMzVH7PG+?1jW^@at>!f)DTbz#1~g(hO#Uo&T-xJA)CQ zrMo)>CO%Z~E5KM_3|rEG~F9xY(p#P~5_2KQVJG 
z%D>mVl=Npv2}7^wse~f^O$N$IJ^YFo>Tg;BsUdwV`2K8g!M%D@cT1mvu#7SPsM%oK z+x3F-n91L?muB@#ZZ~}}#LO`-`&ruki}jUqnD4)X^GMf+&R_^PdMef5`}g82Ns8+V znC_E3mFMb}WYo&65ze(>~QqKn^4BED* z@_meduVMx1-HWl?{wdc>NqOfLFt350XE6T!J>{hECucDEzk}0BkG?5la_~Ksr(gP; zd@UtiUsJ+7d(~4ZTkm}e^E9xh@?@!hN_Q1W!e0TCz|m7#jP~ch>a*{WF=h}x8;rVB z-#267vqH}p)4ensoJQ(-KBn!@Z~UIra83#1<<(Ouf$~r3tS03pD`H%J2R|h}8C1YH zp6#g=YYgwbT>s;Ug-=cQ(y{taCoy*}^ei1s_m}+gj%47cgfWfiS$gr<|K-0v&%2in zSNhv@6qAH%6fl?adzOT;{$sMQNz2b{r2StSm6OKR6fqi;j$o$)oj49t$fta6k^ zn5Qayaeqa5w`YIfYP`595u*VARnTJ-j9s&WzHYjljUlfM+m2@~W?*JSXT9s}!Wc_; z#)G*tFk@?0HsfyKhmp16`{P!qgtS|}Vm$VpfNXo=itOegG zZ8H6#1UIkj8M}pC3;tf-Bq*o|hs{=&H!Wpg^6$@9mfZ_Wd1yI)KRg4I5j9)sayyJ) z0WLFJS!x#6>v4Nr@n8tuk--q*47VJ2mk<4^8oU>t#-+16pb z$1KJleH}X+mU0?ieW7PeF?}t#=Ur2(zY@GCqGzn^@mlb;amBA=rz+QI-NjAH9~9uo z{GPFYuxm8=cTG#rG5Tx4#^p_Y=M>>^O3#=?_FAx9aZ@U<65Qc;Wy+PX-dEP+@9t(| z&Irv`CYy(u9JxLIqi`n*!CB&@JE{;odk8y;V6yF@@o6p3sSM}`0e7q(FvVCgl{503qGm_ZX(BH)zlJ% zkWQbRe z3qG`Z6m+i-B0Ly9NO+?ZW%UBI*FY8BrHyF14o45zP*znCH(m)7$s#oRQ)tU;_$XPA zw*4IX=!HiQdD3Zo8!Z>G^aPmdy3Z+#TFp(6b0SV;-rE?1uC z#W4)wVadcLbIy7A0V7kVj|aVZMaEi>>U|G-1v7Nk@>t+qGW#jkuw%4xQC!>*iKAW> zm+XT;DSjLUc!P3u*8*kG*u8cjjN-?Ru)UF8GjNQuE=&`5K}CN{r3COJNPwpYYB5M% zko~!VfM?kL!+W~M+l%>=sG_-n=q&rm>VRBq|D8Ri*1YnbZpb$HFWuI|uNF}Q5S;@{ zr(xAv%%<@{6q9t}<&)@e|FaM8{Nf*u>A_L9{R)zJb{`n z?0hh-J{`L3gkmhP)53C~SBNuyw9qGoPDNy*b3Sy>q`G$CC1q6z(PS=z9_XT+%Z0WS zR};xLwCz}EneO+w0Mx^W;U0K4<=hgeWFNa-J$S^K(_zq1&Gta&6Wr@t7M597F*{EP zw-4UOmS_~u*vx;vI1%!ZT|hJLd70WchaBd$ckyTFg}}HunTJ{)i@XJ9^9v>nqy3BG z5*(#Cr|GM@5@n&FGSIYh&76u@l>ypcwmwQ`RR*X^>EFtr-VGV4?J^DIXRusF@zeU% zWw7stLXOVBHpChFUk#iot5S%jf2+|0o|JRXq2py+qvUAH>S090^`hDVYl?azv<0wm zgj}Si#`j3})cYP^|E)Dd1kd*zYz})EC4Pnn$0e@Bm45x(Hw#P1A5+5ZT&wuuKZKk7 z4U2?u!WLBEzSC>or&{H(*5AVxp2(!MkNg)x5q}GM0|xzpu0jgWEtOW zM_22+0txifeLTwditV`VbRI(H`?xs|I@YNv^EscU&s^#6^|z0;O1O5XxKm(LAo-2e{%Z9bYSZ3(piduAwZ(3TSaj;{=lMPcFhGg3i%!5@DZTAhl%1 zF_7a%zJO&JVS$2>Lloi)$wP@01TR9!x8=q9k3i+<2q+9`N(tlt)!BlhglQqhjrZLq 
zki>utsEqqS`TTKOL?P4xoEAE-K`ZDkgx8W08!+1LW2k&BS!_dt5!G7=Lz8dUrW4fduPN$Sgu6) zw@drkY8lVnVsA>{!1G$J5h5(C4@n3TZVCJ}`Sy))M<^Vd6np)nqQFb13b`BaOcxFN zwk+U@Wpad{`Fm{Al_ibR8$rbdP6T$@UrtS8OM>Z)OO^Z*Am9l496ztrbk$J&^b5KT zSOo!*f64vB0IMJ%Z2!&vFfdCAIEJvrK}H4zDe4`N{RgPA1v6Sb61oh|Nm)2d3(JTs zbVQXCXYPf^Ms^Zsu#{EZ=C1}T=q?6jl|2$~P*l^EO<5I0L>R+9nD-5K79no%?=C## z67LZxB|1J3%9AXcFR5a?AX;|0Uhfg{yeeBlbBy-q}KEAc0L;GqiwiEdgScR322=e21%u~0KNl;wON z4V?W@OPtYfh>2uD5v}e8w5rULNDimfB}08m#zx(9pv!J3#{2|j9YE2Ct{KYeQ7F!U z1x1uqzUEE?5#o$DEflraWgt`2bjfTEu5UNU5epjzy}Zz0(uOAHIvuPWO-Azkadta6^wF{NVk=T<=<`T54ADQT*v=Ei7gJNPg=i zd>6RyO&H{B82d&ejPPNT!sGXm>3Yg7&l&Hh>K9#jtnE>MmlkkpSC0MbKH>3ou`k*98n$wa5?DtTgmTH-nl3AYaXOc}aGxWg6Ri;GFNN}774W+g56<5f`c zoZeeFn`vtCCk1bIY2}he!42X=V{ro6c|_N^)>wEq zGt%q<3R~0la3h4KlLGB`Me)x+pw&N(ta3y}qr3juO^@_(LA@l-ST)2)EN$|>tQh=?x`%XtJX7hT;?hFBU! z>+YNua7`FIt9H{kpglw1B3nj+-!LUsG@kZ?a7QA%cd**2_=tdY;yLw7-t#(Bg$___5aONi_l%}l-1LbID?6rE)?Zv?Z_q(gdQa6e5;2WEA(O%>Pu8)l-W`f zOWvIMm>r`ctIXeg%(3{SfOW}4>s`T|4dw6O->^#E1&yR(H-b|stJH0LLbjQ0fKt~< zO4ohpuooIFF2j?Kyjb~ik3-P8I85rJX>oc<03V{Tzc4aTlTp}JD266~p@bc`Yb&&_ zC4ZuXfskv#pvm48of{#wE0Ij{-W)9CF>Fs5+8Mok3WgGgp#nBe!m@jr`cq#@@H5nMnctZh}O=jBK*Q^|+?H0;;SVb&DHEx)3f4ZN?EAhcFAlRe#^17jW)Vu=E zE|<9C6_a+4hdFnJ$|3+mwbSsZ5K{i+A?;DkZ8u?5eT*xQFsH`keEKQr+L8zN+XaqI zyORGv)ZJ%X>>3~}#kI5riVLL0JikftJ1ae4ApfCjVsz%hjvgcb3xxc^7-rXD@E4K% z03sQU)C>qx&UO4By^+EHUvIPxdLtJUeExsEk$)kBVnq_kV`ifgoYfZpiFe>_2VN(OIu8k%@}RP)0b1000.0) 3;1 30.6023 (0.00) 30 1 + test_fetch_plain 43,747.1233 (466.20) 48,690.7829 (243.60) 44,573.8085 (463.56) 1,170.8588 (446.20) 44,067.5411 (460.46) 1,436.5837 (>1000.0) 3;1 22.4347 (0.00) 22 1 + test_build_query_positions_from_bam_with_samtoolspipe 46,316.5548 (493.58) 53,700.3577 (268.66) 48,235.6640 (501.64) 1,861.0317 (709.21) 47,625.2194 (497.63) 2,603.7074 (>1000.0) 6;1 20.7315 (0.00) 21 1 + test_build_query_bases_with_reference_from_bam_with_samtoolspysam 51,565.1479 (549.51) 53,523.1121 (267.78) 51,778.6650 (538.49) 424.4621 (161.76) 51,656.8925 (539.76) 
110.5051 (88.81) 1;4 19.3130 (0.00) 20 1 + test_build_query_bases_with_reference_from_bam_with_pysam 58,850.6740 (627.15) 62,164.5451 (311.01) 60,161.6779 (625.67) 1,120.6101 (427.05) 59,595.2785 (622.71) 1,995.0196 (>1000.0) 7;0 16.6219 (0.00) 16 1 + test_iterate_file_large_uncompressed 59,419.9076 (633.22) 69,053.0874 (345.48) 62,825.7126 (653.38) 3,805.1150 (>1000.0) 60,805.1391 (635.35) 6,407.6949 (>1000.0) 3;0 15.9170 (0.00) 14 1 + test_iterate_file_large_compressed 63,986.9571 (681.89) 74,156.9120 (371.01) 68,370.8835 (711.05) 3,254.6200 (>1000.0) 67,221.2075 (702.39) 4,770.3525 (>1000.0) 5;0 14.6261 (0.00) 16 1 + test_read_python_large_uncompressed 67,611.8080 (720.51) 89,112.1533 (445.83) 73,631.8916 (765.76) 5,978.7052 (>1000.0) 71,702.4822 (749.21) 7,856.0165 (>1000.0) 2;1 13.5811 (0.00) 13 1 + test_fetch_parsed 72,237.5344 (769.81) 81,976.9856 (410.13) 74,540.4099 (775.21) 2,326.3668 (886.54) 74,016.3419 (773.39) 1,579.0667 (>1000.0) 1;1 13.4155 (0.00) 14 1 + test_build_query_bases_from_bam_with_pysam_pileups 75,841.3766 (808.21) 87,975.1425 (440.14) 79,393.9784 (825.68) 3,516.2458 (>1000.0) 79,346.7313 (829.09) 5,217.5033 (>1000.0) 2;0 12.5954 (0.00) 13 1 + test_build_query_bases_with_reference_from_bam_with_samtoolspipe 131,162.9396 (>1000.0) 132,858.8147 (664.70) 131,838.0199 (>1000.0) 595.6090 (226.98) 131,801.5633 (>1000.0) 956.9665 (769.11) 2;0 7.5851 (0.00) 8 1 + test_iterate_parsed_large_uncompressed 132,991.8914 (>1000.0) 140,153.4099 (701.19) 134,101.5892 (>1000.0) 2,452.3161 (934.54) 133,218.1590 (>1000.0) 396.3616 (318.56) 1;1 7.4570 (0.00) 8 1 + test_iterate_generic_large_uncompressed 134,743.7277 (>1000.0) 142,329.4526 (712.08) 138,167.4954 (>1000.0) 3,346.7773 (>1000.0) 136,839.1849 (>1000.0) 6,541.1879 (>1000.0) 4;0 7.2376 (0.00) 8 1 + test_read_python_large_compressed 175,127.6311 (>1000.0) 190,855.1529 (954.86) 181,702.9339 (>1000.0) 5,756.2207 (>1000.0) 181,221.0185 (>1000.0) 8,577.6616 (>1000.0) 2;0 5.5035 (0.00) 6 1 + 
test_iterate_parsed_large_compressed 231,405.7611 (>1000.0) 243,728.8519 (>1000.0) 239,037.8296 (>1000.0) 5,212.7778 (>1000.0) 241,544.3324 (>1000.0) 8,062.6113 (>1000.0) 1;0 4.1834 (0.00) 5 1 + test_iterate_generic_large_compressed 235,042.3876 (>1000.0) 256,518.8371 (>1000.0) 242,535.5468 (>1000.0) 8,272.5197 (>1000.0) 240,163.0748 (>1000.0) 8,360.1568 (>1000.0) 1;0 4.1231 (0.00) 5 1 + --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + diff --git a/doc/developer.rst b/doc/developer.rst index 9931854..09ae832 100644 --- a/doc/developer.rst +++ b/doc/developer.rst @@ -17,22 +17,26 @@ directories: make -C doc html :file:`tests` - Code and data for testing + Code and data for testing and benchmarking :file:`htslib` Source code from htslib_ shipped with pysam. See - :file:`setup.py` about importing. + :file:`import.py` about importing. :file:`samtools` Source code from :term:`csamtools` shipped with pysam. See - :file:`setup.py` about importing. + :file:`import.py` about importing. + +:file:`bcftools` + Source code from :term:`cbcftools` shipped with pysam. See + :file:`import.py` about importing. Importing new versions of htslib and samtools ============================================= -See instructions in :file:`setup.py` to import the latest -version of htslib_ and samtools_. +See instructions in :file:`import.py` to import the latest +version of htslib_, samtools_ and bcftools_. Unit testing ============ @@ -40,10 +44,18 @@ Unit testing Unit tests are in the :file:`tests` directory. To run all unit tests, run:: - nosetests -s -v tests + pytest tests + +Benchmarking +============ + +To run the benchmarking suite, make sure that `pytest-benchmark +`_ is installed. 
To run +all benchmarks, type:: + + pytest tests/*_bench.py -Note to use the ``-s/--nocapture`` option to prevent nosetests from -captpuring standard output. +See :ref:`Benchmarking` for more on this topic. Contributors ============ diff --git a/doc/index.rst b/doc/index.rst index da36028..01b2a8e 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -18,8 +18,8 @@ This module provides a low-level wrapper around the htslib_ C-API as using cython and a high-level, pythonic API for convenient access to the data within genomic file formats. -The current version wraps *htslib-1.3*, *samtools-1.3* and -*bcftools-1.3*. +The current version wraps *htslib-1.7*, *samtools-1.7* and +*bcftools-1.6*. To install the latest release, type:: @@ -39,6 +39,7 @@ Contents faq.rst developer.rst release.rst + benchmarking.rst glossary.rst Indices and tables diff --git a/doc/release.rst b/doc/release.rst index 81cd274..d0ece25 100644 --- a/doc/release.rst +++ b/doc/release.rst @@ -2,6 +2,55 @@ Release notes ============= +Release 0.14.0 +============== + +This release wraps htslib/samtools versions 1.7.0. + +* SAM/BAM/CRAM headers are now managed by a separate AlignmentHeader + class. +* AlignmentFile.header.as_dict() returns an ordered dictionary. +* Use "stop" instead of "end" to ensure consistency to + VariantFile. The end designations have been kept for backwards + compatibility. + +* [#611] and [#293] CRAM repeated fetch now works, each iterator + reloads index if multiple_iterators=True +* [#608] pysam now wraps htslib 1.7 and samtools 1.7. +* [#580] reference_name and next_reference_name can now be set to "*" + (will be converted to None to indicate an unmapped location) +* [#302] providing no coordinate to count_coverage will not count from + start/end of contig. +* [#325] @SQ records will be automatically added to header if they are + absent from text section of header. 
+* [#529] add get_forward_sequence() and get_forward_qualities() + methods +* [#577] add from_string() and to_dict()/from_dict() methods to + AlignedSegment. Rename tostring() to to_string() throughout for + consistency +* [#589] return None from build_alignment_sequence if no MD tag is set +* [#528] add PileupColumn.__len__ method + +Backwards incompatible changes: + +* AlignmentFile.header now returns an AlignmentHeader object. Use + AlignmentFile.header.to_dict() to get the dictionary as + previously. Most dictionary accessor methods (keys(), values(), + __getitem__, ...) have been implemented to ensure some level of + backwards compatibility when only reading. + + The rationale for this change is to have consistency between + AlignmentFile and VariantFile. + +* AlignmentFile and FastaFile now raise IOError instead of OSError + +Medium term we plan to have a 1.0 release. The pysam +interface has grown over the years and the API is cluttered with +deprecated names (Samfile, getrname(), gettid(), ...). To work towards +this, the next release (0.15.0) will yield DeprecationWarnings +for any parts of the API that are considered obsolete and will not be +in 1.0. Once 1.0 has been reached, we will use semantic versioning. + Release 0.13.0 =============== @@ -11,6 +60,11 @@ contains a series of bugfixes. * [#544] reading header from remote TabixFiles now works. * [#531] add missing tag types H and A. A python float will now be added as 'f' type instead of 'd' type. +* [#543] use FastaFile instead of Fastafile in pileup. +* [#546] set is_modified flag in setAttribute so updated attributes + are output. +* [#537] allow tabix index files to be created in a custom location. 
+* [#530] add get_index_statistics() method Release 0.12.0.1 diff --git a/import.py b/import.py index 80e6d4b..89aa9f1 100644 --- a/import.py +++ b/import.py @@ -18,14 +18,15 @@ # modify config.h to set compatibility flags # # For bcftools, type: -# rm -rf bedtools -# python import.py bedtools download/bedtools +# rm -rf bcftools +# python import.py bcftools download/bedtools # git checkout -- bcftools/version.h # rm -rf bedtools/test bedtools/plugins import fnmatch import os import re +import itertools import shutil import sys import hashlib @@ -49,14 +50,13 @@ EXCLUDE = { "bamcheck.c", "chk_indel.c", "vcf-miniview.c", - "htslib-1.5", # do not import twice "hfile_irods.c", # requires irods library ), "bcftools": ( "test", "plugins", "peakfit.c", "peakfit.h", # needs to renamed, name conflict with samtools reheader - "reheader.c", + # "reheader.c", "polysomy.c"), "htslib": ( 'htslib/tabix.c', 'htslib/bgzip.c', @@ -91,7 +91,7 @@ def _update_pysam_files(cf, destdir): lines = "".join(infile.readlines()) with open(dest, "w", encoding="utf-8") as outfile: - outfile.write('#include "pysam.h"\n\n') + outfile.write('#include "{}.pysam.h"\n\n'.format(basename)) subname, _ = os.path.splitext(os.path.basename(filename)) if subname in MAIN.get(basename, []): lines = re.sub("int main\(", "int {}_main(".format( @@ -99,27 +99,27 @@ def _update_pysam_files(cf, destdir): else: lines = re.sub("int main\(", "int {}_{}_main(".format( basename, subname), lines) - lines = re.sub("stderr", "pysam_stderr", lines) - lines = re.sub("stdout", "pysam_stdout", lines) - lines = re.sub(" printf\(", " fprintf(pysam_stdout, ", lines) + lines = re.sub("stderr", "{}_stderr".format(basename), lines) + lines = re.sub("stdout", "{}_stdout".format(basename), lines) + lines = re.sub(" printf\(", " fprintf({}_stdout, ".format(basename), lines) lines = re.sub("([^kf])puts\(([^)]+)\)", - r"\1fputs(\2, pysam_stdout) & fputc('\\n', pysam_stdout)", + r"\1fputs(\2, {}_stdout) & fputc('\\n', 
{}_stdout)".format(basename, basename), lines) lines = re.sub("putchar\(([^)]+)\)", - r"fputc(\1, pysam_stdout)", lines) + r"fputc(\1, {}_stdout)".format(basename), lines) fn = os.path.basename(filename) # some specific fixes: SPECIFIC_SUBSTITUTIONS = { "bam_md.c": ( 'sam_open_format("-", mode_w', - 'sam_open_format(pysam_stdout_fn, mode_w'), + 'sam_open_format({}_stdout_fn, mode_w'.format(basename)), "phase.c": ( - 'putc("ACGT"[f->seq[j] == 1? (c&3, pysam_stdout) : (c>>16&3)]);', - 'putc("ACGT"[f->seq[j] == 1? (c&3) : (c>>16&3)], pysam_stdout);'), + 'putc("ACGT"[f->seq[j] == 1? (c&3, {}_stdout) : (c>>16&3)]);'.format(basename), + 'putc("ACGT"[f->seq[j] == 1? (c&3) : (c>>16&3)], {}_stdout);'.format(basename)), "cut_target.c": ( - 'putc(33 + (cns[j]>>8>>2, pysam_stdout));', - 'putc(33 + (cns[j]>>8>>2), pysam_stdout);') + 'putc(33 + (cns[j]>>8>>2, {}_stdout));'.format(basename), + 'putc(33 + (cns[j]>>8>>2), {}_stdout);'.format(basename)) } if fn in SPECIFIC_SUBSTITUTIONS: lines = lines.replace( @@ -127,15 +127,13 @@ def _update_pysam_files(cf, destdir): SPECIFIC_SUBSTITUTIONS[fn][1]) outfile.write(lines) - with open(os.path.join(destdir, "pysam.h"), "w")as outfile: - outfile.write("""#ifndef PYSAM_H -#define PYSAM_H -#include "stdio.h" -extern FILE * pysam_stderr; -extern FILE * pysam_stdout; -extern const char * pysam_stdout_fn; -#endif -""") + with open(os.path.join("import", "pysam.h")) as inf, \ + open(os.path.join(destdir, "{}.pysam.h".format(basename)), "w") as outf: + outf.write(re.sub("@pysam@", basename, inf.read())) + + with open(os.path.join("import", "pysam.c")) as inf, \ + open(os.path.join(destdir, "{}.pysam.c".format(basename)), "w") as outf: + outf.write(re.sub("@pysam@", basename, inf.read())) if len(sys.argv) >= 1: @@ -155,7 +153,8 @@ if len(sys.argv) >= 1: cfiles = locate("*.c", srcdir) hfiles = locate("*.h", srcdir) - + mfiles = itertools.chain(locate("README", srcdir), locate("LICENSE", srcdir)) + # remove unwanted files and htslib subdirectory. 
cfiles = [x for x in cfiles if os.path.basename(x) not in exclude and not re.search("htslib-", x)] @@ -191,6 +190,10 @@ if len(sys.argv) >= 1: _compareAndCopy(src_file, srcdir, destdir, exclude) ncopied += 1 + for src_file in mfiles: + _compareAndCopy(src_file, srcdir, destdir, exclude) + ncopied += 1 + cf = [] for src_file in cfiles: cf.append(_compareAndCopy(src_file, diff --git a/import/pysam.c b/import/pysam.c new file mode 100644 index 0000000..1642013 --- /dev/null +++ b/import/pysam.c @@ -0,0 +1,68 @@ +#include +#include +#include +#include +#include +#include + +#include "@pysam@.pysam.h" + +FILE * @pysam@_stderr = NULL; +FILE * @pysam@_stdout = NULL; +const char * @pysam@_stdout_fn = NULL; +int @pysam@_stdout_fileno = STDOUT_FILENO; + + +FILE * @pysam@_set_stderr(int fd) +{ + if (@pysam@_stderr != NULL) + fclose(@pysam@_stderr); + @pysam@_stderr = fdopen(fd, "w"); + return @pysam@_stderr; +} + +void @pysam@_unset_stderr(void) +{ + if (@pysam@_stderr != NULL) + fclose(@pysam@_stderr); + @pysam@_stderr = fopen("/dev/null", "w"); +} + +FILE * @pysam@_set_stdout(int fd) +{ + if (@pysam@_stdout != NULL) + fclose(@pysam@_stdout); + @pysam@_stdout = fdopen(fd, "w"); + if (@pysam@_stdout == NULL) + { + fprintf(@pysam@_stderr, "could not set stdout to fd %i", fd); + } + @pysam@_stdout_fileno = fd; + return @pysam@_stdout; +} + +void @pysam@_set_stdout_fn(const char *fn) +{ + @pysam@_stdout_fn = fn; +} + +void @pysam@_unset_stdout(void) +{ + if (@pysam@_stdout != NULL) + fclose(@pysam@_stdout); + @pysam@_stdout = fopen("/dev/null", "w"); + @pysam@_stdout_fileno = STDOUT_FILENO; +} + +void @pysam@_set_optind(int val) +{ + // setting this in cython via + // "from posix.unistd cimport optind" + // did not work. 
+ // + // setting to 0 forces a complete re-initialization + optind = val; +} + + + diff --git a/import/pysam.h b/import/pysam.h new file mode 100644 index 0000000..4a6ec29 --- /dev/null +++ b/import/pysam.h @@ -0,0 +1,47 @@ +#ifndef PYSAM_H +#define PYSAM_H + +#include "stdio.h" + +extern FILE * @pysam@_stderr; + +extern FILE * @pysam@_stdout; + +extern const char * @pysam@_stdout_fn; + +/*! set pysam standard error to point to file descriptor + + Setting the stderr will close the previous stderr. + */ +FILE * @pysam@_set_stderr(int fd); + +/*! set pysam standard output to point to file descriptor + + Setting the stderr will close the previous stdout. + */ +FILE * @pysam@_set_stdout(int fd); + +/*! set pysam standard output to point to filename + + */ +void @pysam@_set_stdout_fn(const char * fn); + +/*! set pysam standard error to /dev/null. + + Unsetting the stderr will close the previous stderr. + */ +void @pysam@_unset_stderr(void); + +/*! set pysam standard error to /dev/null. + + Unsetting the stderr will close the previous stderr. 
+ */ +void @pysam@_unset_stdout(void); + +int @pysam@_dispatch(int argc, char *argv[]); + +void @pysam@_set_optind(int); + +extern int @pysam@_main(int argc, char *argv[]); + +#endif diff --git a/pysam/Pileup.py b/pysam/Pileup.py index 998127b..1fe05ec 100644 --- a/pysam/Pileup.py +++ b/pysam/Pileup.py @@ -2,36 +2,37 @@ import collections import pysam -PileupSubstitution = collections.namedtuple( "PileupSubstitution", - " ".join( (\ - "chromosome", - "pos", - "reference_base", - "genotype", - "consensus_quality", - "snp_quality", - "mapping_quality", - "coverage", - "read_bases", - "base_qualities" ) ) ) - -PileupIndel = collections.namedtuple( "PileupIndel", - " ".join( (\ - "chromosome", - "pos", - "reference_base", - "genotype", - "consensus_quality", - "snp_quality", - "mapping_quality", - "coverage", - "first_allele", - "second_allele", - "reads_first", - "reads_second", - "reads_diff" ) ) ) - -def iterate( infile ): +PileupSubstitution = collections.namedtuple("PileupSubstitution", + " ".join(( + "chromosome", + "pos", + "reference_base", + "genotype", + "consensus_quality", + "snp_quality", + "mapping_quality", + "coverage", + "read_bases", + "base_qualities"))) + +PileupIndel = collections.namedtuple("PileupIndel", + " ".join(( + "chromosome", + "pos", + "reference_base", + "genotype", + "consensus_quality", + "snp_quality", + "mapping_quality", + "coverage", + "first_allele", + "second_allele", + "reads_first", + "reads_second", + "reads_diff"))) + + +def iterate(infile): '''iterate over ``samtools pileup -c`` formatted file. *infile* can be any iterator over a lines. @@ -39,25 +40,29 @@ def iterate( infile ): The function yields named tuples of the type :class:`pysam.Pileup.PileupSubstitution` or :class:`pysam.Pileup.PileupIndel`. - .. note:: + .. 
note:: + The parser converts to 0-based coordinates ''' - - conv_subst = (str,lambda x: int(x)-1,str,str,int,int,int,int,str,str) - conv_indel = (str,lambda x: int(x)-1,str,str,int,int,int,int,str,str,int,int,int) + + conv_subst = (str, lambda x: int(x) - 1, str, + str, int, int, int, int, str, str) + conv_indel = (str, lambda x: int(x) - 1, str, str, int, + int, int, int, str, str, int, int, int) for line in infile: d = line[:-1].split() if d[2] == "*": try: - yield PileupIndel( *[x(y) for x,y in zip(conv_indel,d) ] ) + yield PileupIndel(*[x(y) for x, y in zip(conv_indel, d)]) except TypeError: - raise pysam.SamtoolsError( "parsing error in line: `%s`" % line) + raise pysam.SamtoolsError("parsing error in line: `%s`" % line) else: try: - yield PileupSubstitution( *[x(y) for x,y in zip(conv_subst,d) ] ) + yield PileupSubstitution(*[x(y) for x, y in zip(conv_subst, d)]) except TypeError: - raise pysam.SamtoolsError( "parsing error in line: `%s`" % line) + raise pysam.SamtoolsError("parsing error in line: `%s`" % line) + ENCODE_GENOTYPE = { 'A': 'A', 'C': 'C', 'G': 'G', 'T': 'T', @@ -68,7 +73,7 @@ ENCODE_GENOTYPE = { 'GT': 'k', 'TG': 'K', 'CG': 's', 'GC': 'S', 'AT': 'w', 'TA': 'W', - } +} DECODE_GENOTYPE = { 'A': 'AA', @@ -81,59 +86,67 @@ DECODE_GENOTYPE = { 'k': 'GT', 'K': 'GT', 's': 'CG', 'S': 'CG', 'w': 'AT', 'W': 'AT', - } +} + +# ------------------------------------------------------------ + -##------------------------------------------------------------ -def encodeGenotype( code ): +def encodeGenotype(code): '''encode genotypes like GG, GA into a one-letter code. The returned code is lower case if code[0] < code[1], otherwise it is uppercase. ''' - return ENCODE_GENOTYPE[ code.upper() ] + return ENCODE_GENOTYPE[code.upper()] -def decodeGenotype( code ): + +def decodeGenotype(code): '''decode single letter genotypes like m, M into two letters. This is the reverse operation to :meth:`encodeGenotype`. 
''' - return DECODE_GENOTYPE[ code ] + return DECODE_GENOTYPE[code] + -def translateIndelGenotypeFromVCF( vcf_genotypes, ref ): +def translateIndelGenotypeFromVCF(vcf_genotypes, ref): '''translate indel from vcf to pileup format.''' # indels - def getPrefix( s1, s2 ): + def getPrefix(s1, s2): '''get common prefix of strings s1 and s2.''' - n = min( len( s1), len( s2 ) ) - for x in range( n ): - if s1[x] != s2[x]: return s1[:x] + n = min(len(s1), len(s2)) + for x in range(n): + if s1[x] != s2[x]: + return s1[:x] return s1[:n] - def getSuffix( s1, s2 ): + def getSuffix(s1, s2): '''get common sufix of strings s1 and s2.''' - n = min( len( s1), len( s2 ) ) - if s1[-1] != s2[-1]: return "" - for x in range( -2, -n - 1, -1 ): - if s1[x] != s2[x]: return s1[x+1:] + n = min(len(s1), len(s2)) + if s1[-1] != s2[-1]: + return "" + for x in range(-2, -n - 1, -1): + if s1[x] != s2[x]: + return s1[x + 1:] return s1[-n:] - def getGenotype( variant, ref ): + def getGenotype(variant, ref): + + if variant == ref: + return "*", 0 - if variant == ref: return "*", 0 - if len(ref) > len(variant): # is a deletion if ref.startswith(variant): return "-%s" % ref[len(variant):], len(variant) - 1 - elif ref.endswith( variant ): + elif ref.endswith(variant): return "-%s" % ref[:-len(variant)], -1 else: - prefix = getPrefix( ref, variant ) - suffix = getSuffix( ref, variant ) - shared = len(prefix) + len(suffix) - len(variant) + prefix = getPrefix(ref, variant) + suffix = getSuffix(ref, variant) + shared = len(prefix) + len(suffix) - len(variant) # print "-", prefix, suffix, ref, variant, shared, len(prefix), len(suffix), len(ref) if shared < 0: raise ValueError() - return "-%s" % ref[len(prefix):-(len(suffix)-shared)], len(prefix) - 1 + return "-%s" % ref[len(prefix):-(len(suffix) - shared)], len(prefix) - 1 elif len(ref) < len(variant): # is an insertion @@ -142,47 +155,49 @@ def translateIndelGenotypeFromVCF( vcf_genotypes, ref ): elif variant.endswith(ref): return "+%s" % 
variant[:len(ref)], 0 else: - prefix = getPrefix( ref, variant ) - suffix = getSuffix( ref, variant ) - shared = len(prefix) + len(suffix) - len(ref) + prefix = getPrefix(ref, variant) + suffix = getSuffix(ref, variant) + shared = len(prefix) + len(suffix) - len(ref) if shared < 0: raise ValueError() - return "+%s" % variant[len(prefix):-(len(suffix)-shared)], len(prefix) + return "+%s" % variant[len(prefix):-(len(suffix) - shared)], len(prefix) else: assert 0, "snp?" # in pileup, the position refers to the base # after the coordinate, hence subtract 1 - #pos -= 1 + # pos -= 1 genotypes, offsets = [], [] is_error = True for variant in vcf_genotypes: try: - g, offset = getGenotype( variant, ref ) + g, offset = getGenotype(variant, ref) except ValueError: break - genotypes.append( g ) - if g != "*": offsets.append( offset ) - - else: + genotypes.append(g) + if g != "*": + offsets.append(offset) + + else: is_error = False - if is_error: + if is_error: raise ValueError() - assert len(set(offsets )) == 1, "multiple offsets for indel" + assert len(set(offsets)) == 1, "multiple offsets for indel" offset = offsets[0] - genotypes = "/".join( genotypes ) + genotypes = "/".join(genotypes) return genotypes, offset -def vcf2pileup( vcf, sample ): + +def vcf2pileup(vcf, sample): '''convert vcf record to pileup record.''' - + chromosome = vcf.contig pos = vcf.pos reference = vcf.ref @@ -193,79 +208,75 @@ def vcf2pileup( vcf, sample ): # get genotype genotypes = data["GT"] if len(genotypes) > 1: - raise ValueError( "only single genotype per position, %s" % (str(vcf))) + raise ValueError("only single genotype per position, %s" % (str(vcf))) genotypes = genotypes[0] # not a variant - if genotypes[0] == ".": return None + if genotypes[0] == ".": + return None - genotypes = [ allelles[int(x)] for x in genotypes if x != "/" ] + genotypes = [allelles[int(x)] for x in genotypes if x != "/"] # snp_quality is "genotype quality" - snp_quality = consensus_quality = data.get( "GQ", [0])[0] - 
mapping_quality = vcf.info.get( "MQ", [0])[0] - coverage = data.get( "DP", 0) + snp_quality = consensus_quality = data.get("GQ", [0])[0] + mapping_quality = vcf.info.get("MQ", [0])[0] + coverage = data.get("DP", 0) - if len(reference) > 1 or max([len(x) for x in vcf.alt] ) > 1: + if len(reference) > 1 or max([len(x) for x in vcf.alt]) > 1: # indel - genotype, offset = translateIndelGenotypeFromVCF( genotypes, reference ) - - return PileupIndel( chromosome, - pos + offset, - "*", - genotype, - consensus_quality, - snp_quality, - mapping_quality, - coverage, - genotype, - "<" * len(genotype), - 0, - 0, - 0 ) - - else: - - genotype = encodeGenotype( "".join(genotypes) ) + genotype, offset = translateIndelGenotypeFromVCF(genotypes, reference) + + return PileupIndel(chromosome, + pos + offset, + "*", + genotype, + consensus_quality, + snp_quality, + mapping_quality, + coverage, + genotype, + "<" * len(genotype), + 0, + 0, + 0) - + else: + genotype = encodeGenotype("".join(genotypes)) read_bases = "" base_qualities = "" - return PileupSubstitution( chromosome, pos, reference, - genotype, - consensus_quality, - snp_quality, - mapping_quality, - coverage, read_bases, base_qualities ) + return PileupSubstitution(chromosome, pos, reference, + genotype, consensus_quality, + snp_quality, mapping_quality, + coverage, read_bases, + base_qualities) -def iterate_from_vcf( infile, sample ): +def iterate_from_vcf(infile, sample): '''iterate over a vcf-formatted file. *infile* can be any iterator over a lines. - The function yields named tuples of the type :class:`pysam.Pileup.PileupSubstitution` - or :class:`pysam.Pileup.PileupIndel`. + The function yields named tuples of the type + :class:`pysam.Pileup.PileupSubstitution` or + :class:`pysam.Pileup.PileupIndel`. - Positions without a snp will be skipped. + Positions without a snp will be skipped. - This method is wasteful and written to support same - legacy code that expects samtools pileup output. 
+ This method is wasteful and written to support same legacy code + that expects samtools pileup output. Better use the vcf parser directly. ''' - - vcf = pysam.VCF() - vcf.connect( infile ) + vcf.connect(infile) if sample not in vcf.getsamples(): - raise KeyErorr( "sample %s not vcf file" ) + raise KeyError("sample %s not vcf file") for row in vcf.fetch(): - result = vcf2pileup( row, sample ) - if result: yield result - + result = vcf2pileup(row, sample) + if result: + yield result diff --git a/pysam/__init__.py b/pysam/__init__.py index c142c6c..40877da 100644 --- a/pysam/__init__.py +++ b/pysam/__init__.py @@ -11,6 +11,8 @@ import pysam.libcfaidx as libcfaidx from pysam.libcfaidx import * import pysam.libctabix as libctabix from pysam.libctabix import * +# import pysam.libctabixproxies as libctabixproxies +# from pysam.libctabixproxies import * import pysam.libcsamfile as libcsamfile from pysam.libcsamfile import * import pysam.libcalignmentfile as libcalignmentfile @@ -38,6 +40,7 @@ __all__ = \ libcbcf.__all__ +\ libcbgzf.__all__ +\ libcfaidx.__all__ +\ + libctabixproxies.__all__ +\ libcalignmentfile.__all__ +\ libcalignedsegment.__all__ +\ libcsamfile.__all__ +\ @@ -74,8 +77,9 @@ def get_include(): def get_defines(): '''return a list of defined compilation parameters.''' - return [] #('_FILE_OFFSET_BITS', '64'), + # ('_FILE_OFFSET_BITS', '64'), # ('_USE_KNETFILE', '')] + return [] def get_libraries(): diff --git a/pysam/htslib_util.c b/pysam/htslib_util.c index c584a23..0830900 100644 --- a/pysam/htslib_util.c +++ b/pysam/htslib_util.c @@ -22,15 +22,12 @@ int hts_set_verbosity(int verbosity) return old_verbosity; } -int hts_get_verbosity() +int hts_get_verbosity(void) { return hts_verbose; } -int hts_get_hts_verbose(); - - // taken from samtools/bam_import.c static inline uint8_t * alloc_data(bam1_t *b, size_t size) { @@ -160,6 +157,3 @@ int aux_type2size(uint8_t type) return 0; } } - - - diff --git a/pysam/htslib_util.h b/pysam/htslib_util.h index 
cb72853..25bd3e4 100644 --- a/pysam/htslib_util.h +++ b/pysam/htslib_util.h @@ -9,7 +9,7 @@ int hts_useek(htsFile *fp, long uoffset, int where); long hts_utell(htsFile *fp); int hts_set_verbosity(int verbosity); -int hts_get_verbosity(); +int hts_get_verbosity(void); KHASH_MAP_INIT_STR(vdict, bcf_idinfo_t) @@ -95,15 +95,13 @@ static inline char pysam_bam_seqi(uint8_t * s, int i) { static inline uint8_t pysam_get_qual(bam1_t * b) { return b->core.qual;} - -static inline uint16_t pysam_get_n_cigar(bam1_t * b) { +static inline uint32_t pysam_get_n_cigar(bam1_t * b) { return b->core.n_cigar;} static inline void pysam_set_qual(bam1_t * b, uint8_t v) { b->core.qual=v;} - -static inline void pysam_set_n_cigar(bam1_t * b, uint16_t v) { +static inline void pysam_set_n_cigar(bam1_t * b, uint32_t v) { b->core.n_cigar=v;} static inline void pysam_update_flag(bam1_t * b, uint16_t v, uint16_t flag) { @@ -113,5 +111,4 @@ static inline void pysam_update_flag(bam1_t * b, uint16_t v, uint16_t flag) { b->core.flag &= ~flag; } - #endif diff --git a/pysam/libcalignedsegment.pxd b/pysam/libcalignedsegment.pxd index d65beee..48ca93f 100644 --- a/pysam/libcalignedsegment.pxd +++ b/pysam/libcalignedsegment.pxd @@ -20,13 +20,13 @@ cdef extern from "htslib_util.h": char pysam_bam_seqi(uint8_t * s, int i) uint8_t pysam_get_qual(bam1_t * b) - uint16_t pysam_get_n_cigar(bam1_t * b) + uint32_t pysam_get_n_cigar(bam1_t * b) void pysam_set_qual(bam1_t * b, uint8_t v) - void pysam_set_n_cigar(bam1_t * b, uint16_t v) + void pysam_set_n_cigar(bam1_t * b, uint32_t v) void pysam_update_flag(bam1_t * b, uint16_t v, uint16_t flag) -from pysam.libcalignmentfile cimport AlignmentFile +from pysam.libcalignmentfile cimport AlignmentFile, AlignmentHeader ctypedef AlignmentFile AlignmentFile_t @@ -36,8 +36,8 @@ cdef class AlignedSegment: # object that this AlignedSegment represents cdef bam1_t * _delegate - # the file from which this AlignedSegment originates (can be None) - cdef AlignmentFile _alignment_file 
+ # the header that a read is associated with + cdef readonly AlignmentHeader header # caching of array properties for quick access cdef object cache_query_qualities @@ -57,7 +57,10 @@ cdef class AlignedSegment: cpdef has_tag(self, tag) # returns a valid sam alignment string - cpdef tostring(self, AlignmentFile_t handle) + cpdef to_string(self) + + # returns a valid sam alignment string (deprecated) + cpdef tostring(self, htsfile=*) cdef class PileupColumn: @@ -65,12 +68,14 @@ cdef class PileupColumn: cdef int tid cdef int pos cdef int n_pu - cdef AlignmentFile _alignment_file - + cdef AlignmentHeader header + cdef uint32_t min_base_quality + cdef uint8_t * buf + cdef char * reference_sequence cdef class PileupRead: - cdef AlignedSegment _alignment cdef int32_t _qpos + cdef AlignedSegment _alignment cdef int _indel cdef int _level cdef uint32_t _is_del @@ -78,8 +83,21 @@ cdef class PileupRead: cdef uint32_t _is_tail cdef uint32_t _is_refskip -# factor methods -cdef makeAlignedSegment(bam1_t * src, AlignmentFile alignment_file) -cdef makePileupColumn(bam_pileup1_t ** plp, int tid, int pos, int n_pu, AlignmentFile alignment_file) -cdef makePileupRead(bam_pileup1_t * src, AlignmentFile alignment_file) +# factory methods +cdef AlignedSegment makeAlignedSegment( + bam1_t * src, + AlignmentHeader header) + +cdef PileupColumn makePileupColumn( + bam_pileup1_t ** plp, + int tid, + int pos, + int n_pu, + uint32_t min_base_quality, + char * reference_sequence, + AlignmentHeader header) + +cdef PileupRead makePileupRead(bam_pileup1_t * src, + AlignmentHeader header) + cdef uint32_t get_alignment_length(bam1_t * src) diff --git a/pysam/libcalignedsegment.pyx b/pysam/libcalignedsegment.pyx index 4b3b4dd..e94db54 100644 --- a/pysam/libcalignedsegment.pyx +++ b/pysam/libcalignedsegment.pyx @@ -55,6 +55,8 @@ ############################################################################### import re import array +import json +import string import ctypes import struct @@ -67,7 +69,9 
@@ from cpython cimport array as c_array from libc.stdint cimport INT8_MIN, INT16_MIN, INT32_MIN, \ INT8_MAX, INT16_MAX, INT32_MAX, \ UINT8_MAX, UINT16_MAX, UINT32_MAX +from libc.stdio cimport snprintf +from pysam.libchtslib cimport HTS_IDX_NOCOOR from pysam.libcutils cimport force_bytes, force_str, \ charptr_to_str, charptr_to_bytes from pysam.libcutils cimport qualities_to_qualitystring, qualitystring_to_array, \ @@ -85,21 +89,66 @@ cdef bint IS_PYTHON3 = PY_MAJOR_VERSION >= 3 cdef char* CODE2CIGAR= "MIDNSHP=XB" cdef int NCIGAR_CODES = 10 +# dimensioned for 8000 pileup limit (+ insertions/deletions) +cdef uint32_t MAX_PILEUP_BUFFER_SIZE = 10000 + if IS_PYTHON3: CIGAR2CODE = dict([y, x] for x, y in enumerate(CODE2CIGAR)) + maketrans = str.maketrans else: CIGAR2CODE = dict([ord(y), x] for x, y in enumerate(CODE2CIGAR)) + maketrans = string.maketrans CIGAR_REGEX = re.compile("(\d+)([MIDNSHP=XB])") +# names for keys in dictionary representation of an AlignedSegment +KEY_NAMES = ["name", "flag", "ref_name", "ref_pos", "map_quality", "cigar", + "next_ref_name", "next_ref_pos", "length", "seq", "qual", "tags"] + ##################################################################### # C multiplication with wrapping around cdef inline uint32_t c_mul(uint32_t a, uint32_t b): return (a * b) & 0xffffffff -##################################################################### -# typecode guessing +cdef inline uint8_t tolower(uint8_t ch): + if ch >= 65 and ch <= 90: + return ch + 32 + else: + return ch + + +cdef inline uint8_t toupper(uint8_t ch): + if ch >= 97 and ch <= 122: + return ch - 32 + else: + return ch + + +cdef inline uint8_t strand_mark_char(uint8_t ch, bam1_t *b): + if ch == '=': + if bam_is_rev(b): + return ',' + else: + return '.' 
+ else: + if bam_is_rev(b): + return tolower(ch) + else: + return toupper(ch) + + +cdef inline bint pileup_base_qual_skip(bam_pileup1_t * p, uint32_t threshold): + cdef uint32_t c + if p.qpos < p.b.core.l_qseq: + c = bam_get_qual(p.b)[p.qpos] + else: + c = 0 + if c < threshold: + return True + return False + + cdef inline char map_typecode_htslib_to_python(uint8_t s): """map an htslib typecode to the corresponding python typecode to be used in the struct or array modules.""" @@ -111,6 +160,7 @@ cdef inline char map_typecode_htslib_to_python(uint8_t s): return 0 return parray_types[f - htslib_types] + cdef inline uint8_t map_typecode_python_to_htslib(char s): """determine value type from type code of array""" cdef char * f = strchr(parray_types, s) @@ -118,6 +168,29 @@ cdef inline uint8_t map_typecode_python_to_htslib(char s): return 0 return htslib_types[f - parray_types] + +cdef inline void update_bin(bam1_t * src): + if src.core.flag & BAM_FUNMAP: + # treat alignment as length of 1 for unmapped reads + src.core.bin = hts_reg2bin( + src.core.pos, + src.core.pos + 1, + 14, + 5) + elif pysam_get_n_cigar(src): + src.core.bin = hts_reg2bin( + src.core.pos, + bam_endpos(src), + 14, + 5) + else: + src.core.bin = hts_reg2bin( + src.core.pos, + src.core.pos + 1, + 14, + 5) + + # optional tag data manipulation cdef convert_binary_tag(uint8_t * tag): """return bytesize, number of values and array of values @@ -519,38 +592,53 @@ cdef inline object getQualitiesInRange(bam1_t *src, ##################################################################### -## private factory methods +## factory methods for instantiating extension classes cdef class AlignedSegment -cdef makeAlignedSegment(bam1_t * src, AlignmentFile alignment_file): +cdef AlignedSegment makeAlignedSegment(bam1_t *src, + AlignmentHeader header): '''return an AlignedSegment object constructed from `src`''' # note that the following does not call __init__ cdef AlignedSegment dest = 
AlignedSegment.__new__(AlignedSegment) dest._delegate = bam_dup1(src) - dest._alignment_file = alignment_file + dest.header = header return dest cdef class PileupColumn -cdef makePileupColumn(bam_pileup1_t ** plp, int tid, int pos, - int n_pu, AlignmentFile alignment_file): +cdef PileupColumn makePileupColumn(bam_pileup1_t ** plp, + int tid, + int pos, + int n_pu, + uint32_t min_base_quality, + char * reference_sequence, + AlignmentHeader header): '''return a PileupColumn object constructed from pileup in `plp` and setting additional attributes. ''' # note that the following does not call __init__ cdef PileupColumn dest = PileupColumn.__new__(PileupColumn) - dest._alignment_file = alignment_file + dest.header = header dest.plp = plp dest.tid = tid dest.pos = pos dest.n_pu = n_pu + dest.min_base_quality = min_base_quality + dest.reference_sequence = reference_sequence + dest.buf = calloc(MAX_PILEUP_BUFFER_SIZE, sizeof(uint8_t)) + if dest.buf == NULL: + raise MemoryError("could not allocate pileup buffer") + return dest + cdef class PileupRead -cdef inline makePileupRead(bam_pileup1_t * src, AlignmentFile alignment_file): +cdef PileupRead makePileupRead(bam_pileup1_t *src, + AlignmentHeader header): '''return a PileupRead object construted from a bam_pileup1_t * object.''' + # note that the following does not call __init__ cdef PileupRead dest = PileupRead.__new__(PileupRead) - dest._alignment = makeAlignedSegment(src.b, alignment_file) + dest._alignment = makeAlignedSegment(src.b, header) dest._qpos = src.qpos dest._indel = src.indel dest._level = src.level @@ -561,8 +649,8 @@ cdef inline makePileupRead(bam_pileup1_t * src, AlignmentFile alignment_file): return dest -cdef inline uint32_t get_alignment_length(bam1_t * src): - cdef int k = 0 +cdef inline uint32_t get_alignment_length(bam1_t *src): + cdef uint32_t k = 0 cdef uint32_t l = 0 if src == NULL: return 0 @@ -570,7 +658,7 @@ cdef inline uint32_t get_alignment_length(bam1_t * src): if cigar_p == NULL: return 0 
cdef int op - cdef int n = pysam_get_n_cigar(src) + cdef uint32_t n = pysam_get_n_cigar(src) for k from 0 <= k < n: op = cigar_p[k] & BAM_CIGAR_MASK if op == BAM_CSOFT_CLIP or op == BAM_CHARD_CLIP: @@ -578,6 +666,7 @@ cdef inline uint32_t get_alignment_length(bam1_t * src): l += cigar_p[k] >> BAM_CIGAR_SHIFT return l + cdef inline uint32_t get_md_reference_length(char * md_tag): cdef int l = 0 cdef int md_idx = 0 @@ -629,6 +718,10 @@ cdef inline bytes build_alignment_sequence(bam1_t * src): if src == NULL: return None + cdef uint8_t * md_tag_ptr = bam_aux_get(src, "MD") + if md_tag_ptr == NULL: + return None + cdef uint32_t start = getQueryStart(src) cdef uint32_t end = getQueryEnd(src) # get read sequence, taking into account soft-clipping @@ -682,12 +775,6 @@ cdef inline bytes build_alignment_sequence(bam1_t * src): "Padding (BAM_CPAD, 6) is currently not supported. " "Please implement. Sorry about that.") - cdef uint8_t * md_tag_ptr = bam_aux_get(src, "MD") - if md_tag_ptr == NULL: - seq = PyBytes_FromStringAndSize(s, s_idx) - free(s) - return seq - cdef char * md_tag = bam_aux2Z(md_tag_ptr) cdef int md_idx = 0 s_idx = 0 @@ -695,7 +782,7 @@ cdef inline bytes build_alignment_sequence(bam1_t * src): # Check if MD tag is valid by matching CIGAR length to MD tag defined length # Insertions would be in addition to what is described by MD, so we calculate # the number of insertions seperately. 
- insertions = 0 + cdef int insertions = 0 while s[s_idx] != 0: if s[s_idx] >= 'a': @@ -705,7 +792,9 @@ cdef inline bytes build_alignment_sequence(bam1_t * src): cdef uint32_t md_len = get_md_reference_length(md_tag) if md_len + insertions > max_len: - raise AssertionError("Invalid MD tag: MD length {} mismatch with CIGAR length {}".format(md_len, max_len)) + raise AssertionError( + "Invalid MD tag: MD length {} mismatch with CIGAR length {} and {} insertions".format( + md_len, max_len, insertions)) while md_tag[md_idx] != 0: # c is numerical @@ -728,7 +817,7 @@ cdef inline bytes build_alignment_sequence(bam1_t * src): if md_tag[md_idx] == '^': md_idx += 1 while md_tag[md_idx] >= 65 and md_tag[md_idx] <= 90: - assert s[s_idx] == '-' + # assert s[s_idx] == '-' s[s_idx] = md_tag[md_idx] s_idx += 1 md_idx += 1 @@ -753,6 +842,60 @@ cdef inline bytes build_alignment_sequence(bam1_t * src): return seq +cdef inline bytes build_reference_sequence(bam1_t * src): + """return the reference sequence in the region that is covered by the + alignment of the read to the reference. + + This method requires the MD tag to be set. 
+ + """ + cdef uint32_t k, i, l + cdef int op + cdef int s_idx = 0 + ref_seq = build_alignment_sequence(src) + if ref_seq is None: + raise ValueError("MD tag not present") + + cdef char * s = calloc(len(ref_seq) + 1, sizeof(char)) + if s == NULL: + raise ValueError( + "could not allocate sequence of length %i" % len(ref_seq)) + + cdef char * cref_seq = ref_seq + cdef uint32_t * cigar_p = pysam_bam_get_cigar(src) + cdef uint32_t r_idx = 0 + for k from 0 <= k < pysam_get_n_cigar(src): + op = cigar_p[k] & BAM_CIGAR_MASK + l = cigar_p[k] >> BAM_CIGAR_SHIFT + if op == BAM_CMATCH or op == BAM_CEQUAL or op == BAM_CDIFF: + for i from 0 <= i < l: + s[s_idx] = cref_seq[r_idx] + r_idx += 1 + s_idx += 1 + elif op == BAM_CDEL: + for i from 0 <= i < l: + s[s_idx] = cref_seq[r_idx] + r_idx += 1 + s_idx += 1 + elif op == BAM_CREF_SKIP: + pass + elif op == BAM_CINS: + r_idx += l + elif op == BAM_CSOFT_CLIP: + pass + elif op == BAM_CHARD_CLIP: + pass # advances neither + elif op == BAM_CPAD: + raise NotImplementedError( + "Padding (BAM_CPAD, 6) is currently not supported. " + "Please implement. Sorry about that.") + + seq = PyBytes_FromStringAndSize(s, s_idx) + free(s) + + return seq + + cdef class AlignedSegment: '''Class representing an aligned segment. @@ -770,10 +913,17 @@ cdef class AlignedSegment: One issue to look out for is that the sequence should always be set *before* the quality scores. Setting the sequence will also erase any quality scores that were set previously. + + Parameters + ---------- + + header -- :class:`~pysam.AlignmentHeader` object to map numerical + identifiers to chromosome names. If not given, an empty + header is created. 
''' # Now only called when instances are created from Python - def __init__(self): + def __init__(self, AlignmentHeader header=None): # see bam_init1 self._delegate = calloc(1, sizeof(bam1_t)) if self._delegate == NULL: @@ -800,7 +950,9 @@ cdef class AlignedSegment: self.cache_query_alignment_qualities = None self.cache_query_sequence = None self.cache_query_alignment_sequence = None - + + self.header = header + def __dealloc__(self): bam_destroy1(self._delegate) @@ -812,7 +964,7 @@ cdef class AlignedSegment: As a result :term:`tid` is shown instead of the reference name. Similarly, the tags field is returned in its parsed state. - To get a valid SAM record, use :meth:`tostring`. + To get a valid SAM record, use :meth:`to_string`. """ # sam-parsing is done in sam.c/bam_format1_core which # requires a valid header. @@ -830,15 +982,14 @@ cdef class AlignedSegment: self.tags))) def __copy__(self): - return makeAlignedSegment(self._delegate, self._alignment_file) + return makeAlignedSegment(self._delegate, self.header) def __deepcopy__(self, memo): - return makeAlignedSegment(self._delegate, self._alignment_file) + return makeAlignedSegment(self._delegate, self.header) def compare(self, AlignedSegment other): '''return -1,0,1, if contents in this are binary <,=,> to *other* - ''' cdef int retval, x @@ -897,32 +1048,24 @@ cdef class AlignedSegment: return hash_value - cpdef tostring(self, AlignmentFile_t htsfile): + cpdef to_string(self): """returns a string representation of the aligned segment. - The output format is valid SAM format. - - Parameters - ---------- - - htsfile -- AlignmentFile object to map numerical - identifiers to chromosome names. + The output format is valid SAM format if a header is associated + with the AlignedSegment. 
""" - cdef int n_targets = htsfile.header.n_targets - - if self._delegate.core.tid >= n_targets \ - or self._delegate.core.mtid >= n_targets: - raise ValueError('htsfile does not match aligned segment') - cdef kstring_t line line.l = line.m = 0 line.s = NULL - if sam_format1(htsfile.header, self._delegate, &line) < 0: - if line.m: - free(line.s) - raise ValueError('sam_format failed') - + if self.header: + if sam_format1(self.header.ptr, self._delegate, &line) < 0: + if line.m: + free(line.s) + raise ValueError('sam_format failed') + else: + raise NotImplementedError("todo") + ret = force_str(line.s[:line.l]) if line.m: @@ -930,6 +1073,70 @@ cdef class AlignedSegment: return ret + @classmethod + def fromstring(cls, sam, AlignmentHeader header): + """parses a string representation of the aligned segment. + + The input format should be valid SAM format. + + Parameters + ---------- + sam -- :term:`SAM` formatted string + + """ + cdef AlignedSegment dest = cls.__new__(cls) + dest._delegate = calloc(1, sizeof(bam1_t)) + dest.header = header + + cdef kstring_t line + line.l = line.m = len(sam) + _sam = force_bytes(sam) + line.s = _sam + + sam_parse1(&line, dest.header.ptr, dest._delegate) + + return dest + + cpdef tostring(self, htsfile=None): + """deprecated, use :meth:`to_string()` instead. + + Parameters + ---------- + + htsfile -- (deprecated) AlignmentFile object to map numerical + identifiers to chromosome names. This parameter is present + for backwards compatibility and ignored. + """ + + return self.to_string() + + def to_dict(self): + """returns a json representation of the aligned segment. + + Field names are abbreviated versions of the class attributes. 
+ """ + # let htslib do the string conversions, but treat optional field properly as list + vals = self.to_string().split("\t") + n = len(KEY_NAMES) - 1 + return dict(list(zip(KEY_NAMES[:-1], vals[:n])) + [(KEY_NAMES[-1], vals[n:])]) + + @classmethod + def from_dict(cls, sam_dict, AlignmentHeader header): + """parses a dictionary representation of the aligned segment. + + Parameters + ---------- + sam_dict -- dictionary of alignment values, keys corresponding to output from + :meth:`todict()`. + + """ + # let htslib do the parsing + # the tags field can be missing + return cls.fromstring( + "\t".join((sam_dict[x] for x in KEY_NAMES[:-1])) + + "\t" + + "\t".join(sam_dict.get(KEY_NAMES[-1], [])), header) + ######################################################## ## Basic attributes in order of appearance in SAM format property query_name: @@ -993,11 +1200,26 @@ cdef class AlignedSegment: self._delegate.core.flag = flag property reference_name: - """:term:`reference` name (None if no AlignmentFile is associated)""" + """:term:`reference` name""" def __get__(self): - if self._alignment_file is not None: - return self._alignment_file.getrname(self._delegate.core.tid) - return None + if self._delegate.core.tid == -1: + return None + if self.header: + return self.header.get_reference_name(self._delegate.core.tid) + else: + raise ValueError("reference_name unknown if no header associated with record") + def __set__(self, reference): + cdef int tid + if reference is None or reference == "*": + self._delegate.core.tid = -1 + elif self.header: + tid = self.header.get_tid(reference) + if tid < 0: + raise ValueError("reference {} does not exist in header".format( + reference)) + self._delegate.core.tid = tid + else: + raise ValueError("reference_name can not be set if no header associated with record") property reference_id: """:term:`reference` ID @@ -1006,33 +1228,27 @@ cdef class AlignedSegment: This field contains the index of the reference sequence in the sequence 
dictionary. To obtain the name of the - reference sequence, use - :meth:`pysam.AlignmentFile.getrname()` + reference sequence, use :meth:`get_reference_name()` """ - def __get__(self): return self._delegate.core.tid - def __set__(self, tid): self._delegate.core.tid = tid + def __get__(self): + return self._delegate.core.tid + def __set__(self, tid): + if tid != -1 and self.header and not self.header.is_valid_tid(tid): + raise ValueError("reference id {} does not exist in header".format( + tid)) + self._delegate.core.tid = tid property reference_start: """0-based leftmost coordinate""" - def __get__(self): return self._delegate.core.pos + def __get__(self): + return self._delegate.core.pos def __set__(self, pos): ## setting the position requires updating the "bin" attribute cdef bam1_t * src src = self._delegate src.core.pos = pos - if pysam_get_n_cigar(src): - src.core.bin = hts_reg2bin( - src.core.pos, - bam_endpos(src), - 14, - 5) - else: - src.core.bin = hts_reg2bin( - src.core.pos, - src.core.pos + 1, - 14, - 5) + update_bin(src) property mapping_quality: """mapping quality""" @@ -1080,17 +1296,39 @@ cdef class AlignedSegment: property next_reference_id: """the :term:`reference` id of the mate/next read.""" - def __get__(self): return self._delegate.core.mtid + def __get__(self): + return self._delegate.core.mtid def __set__(self, mtid): + if mtid != -1 and self.header and not self.header.is_valid_tid(mtid): + raise ValueError("reference id {} does not exist in header".format( + mtid)) self._delegate.core.mtid = mtid property next_reference_name: """:term:`reference` name of the mate/next read (None if no AlignmentFile is associated)""" def __get__(self): - if self._alignment_file is not None: - return self._alignment_file.getrname(self._delegate.core.mtid) - return None + if self._delegate.core.mtid == -1: + return None + if self.header: + return self.header.get_reference_name(self._delegate.core.mtid) + else: + raise ValueError("next_reference_name unknown if 
no header associated with record") + + def __set__(self, reference): + cdef int mtid + if reference is None or reference == "*": + self._delegate.core.mtid = -1 + elif reference == "=": + self._delegate.core.mtid = self._delegate.core.tid + elif self.header: + mtid = self.header.get_tid(reference) + if mtid < 0: + raise ValueError("reference {} does not exist in header".format( + reference)) + self._delegate.core.mtid = mtid + else: + raise ValueError("next_reference_name can not be set if no header associated with record") property next_reference_start: """the position of the mate/next read.""" @@ -1316,6 +1554,10 @@ cdef class AlignedSegment: return (self.flag & BAM_FUNMAP) != 0 def __set__(self, val): pysam_update_flag(self._delegate, val, BAM_FUNMAP) + # setting the unmapped flag requires recalculation of + # bin as alignment length is now implicitely 1 + update_bin(self._delegate) + property mate_is_unmapped: """true if the mate is unmapped""" def __get__(self): @@ -1510,7 +1752,7 @@ cdef class AlignedSegment: thus be of the same length as the read. """ - cdef uint32_t k, i, pos + cdef uint32_t k, i, l, pos cdef int op cdef uint32_t * cigar_p cdef bam1_t * src @@ -1575,46 +1817,37 @@ cdef class AlignedSegment: return None def get_reference_sequence(self): - """return the reference sequence. + """return the reference sequence in the region that is covered by the + alignment of the read to the reference. This method requires the MD tag to be set. 
+ """ - cdef uint32_t k, i - cdef int op - cdef bam1_t * src = self._delegate - ref_seq = force_str(build_alignment_sequence(src)) - if ref_seq is None: - raise ValueError("MD tag not present") + return force_str(build_reference_sequence(self._delegate)) - cdef uint32_t * cigar_p = pysam_bam_get_cigar(src) - cdef uint32_t r_idx = 0 - result = [] - for k from 0 <= k < pysam_get_n_cigar(src): - op = cigar_p[k] & BAM_CIGAR_MASK - l = cigar_p[k] >> BAM_CIGAR_SHIFT - if op == BAM_CMATCH or op == BAM_CEQUAL or op == BAM_CDIFF: - for i from 0 <= i < l: - result.append(ref_seq[r_idx]) - r_idx += 1 - elif op == BAM_CDEL: - for i from 0 <= i < l: - result.append(ref_seq[r_idx]) - r_idx += 1 - elif op == BAM_CREF_SKIP: - pass - elif op == BAM_CINS: - r_idx += l - elif op == BAM_CSOFT_CLIP: - pass - elif op == BAM_CHARD_CLIP: - pass # advances neither - elif op == BAM_CPAD: - raise NotImplementedError( - "Padding (BAM_CPAD, 6) is currently not supported. " - "Please implement. Sorry about that.") + def get_forward_sequence(self): + """return the original read sequence. + + Reads mapping to the reverse strand will be reverse + complemented. + """ + s = force_str(self.query_sequence) + if self.is_reverse: + s = s.translate(maketrans("ACGTacgtNnXx", "TGCAtgcaNnXx"))[::-1] + return s - return "".join(result) + def get_forward_qualities(self): + """return the original read sequence. + + Reads mapping to the reverse strand will be reverse + complemented. + """ + if self.is_reverse: + return self.query_qualities[::-1] + else: + return self.query_qualities + def get_aligned_pairs(self, matches_only=False, with_seq=False): """a list of aligned read (query) and reference positions. @@ -1651,7 +1884,8 @@ cdef class AlignedSegment: # read sequence, cigar and MD tag are consistent. 
if _with_seq: - ref_seq = force_str(self.get_reference_sequence()) + # force_str required for py2/py3 compatibility + ref_seq = force_str(build_reference_sequence(src)) if ref_seq is None: raise ValueError("MD tag not present") @@ -1920,7 +2154,7 @@ cdef class AlignedSegment: cdef uint32_t * cigar_p cdef bam1_t * src cdef uint32_t op, l - cdef int k + cdef uint32_t k src = self._delegate if pysam_get_n_cigar(src) == 0: @@ -1939,7 +2173,7 @@ cdef class AlignedSegment: cdef uint32_t * p cdef bam1_t * src cdef op, l - cdef int k, ncigar + cdef int k k = 0 @@ -1952,8 +2186,8 @@ cdef class AlignedSegment: if values is None: values = [] - ncigar = len(values) - # create space for cigar data within src.data + cdef uint32_t ncigar = len(values) + cdef bam1_t * retval = pysam_bam_update(src, pysam_get_n_cigar(src) * 4, ncigar * 4, @@ -1975,12 +2209,7 @@ cdef class AlignedSegment: k += 1 ## setting the cigar string requires updating the bin - src.core.bin = hts_reg2bin( - src.core.pos, - bam_endpos(src), - 14, - 5) - + update_bin(src) cpdef set_tag(self, tag, @@ -2539,7 +2768,6 @@ cdef class PileupColumn: This class is a proxy for results returned by the samtools pileup engine. If the underlying engine iterator advances, the results of this column will change. - ''' def __init__(self): raise TypeError("this class cannot be instantiated from Python") @@ -2552,6 +2780,22 @@ cdef class PileupColumn: "\n" +\ "\n".join(map(str, self.pileups)) + def __dealloc__(self): + if self.buf is not NULL: + free(self.buf) + + def set_min_base_quality(self, min_base_quality): + """set the minimum base quality for this pileup column. + """ + self.min_base_quality = min_base_quality + + def __len__(self): + """return number of reads aligned to this column. 
+ + see :meth:`get_num_aligned` + """ + return self.get_num_aligned() + property reference_id: '''the reference sequence number as defined in the header''' def __get__(self): @@ -2565,7 +2809,9 @@ cdef class PileupColumn: return None property nsegments: - '''number of reads mapping to this column.''' + '''number of reads mapping to this column. + + Note that this number ignores the base quality filter.''' def __get__(self): return self.n_pu def __set__(self, n): @@ -2579,17 +2825,20 @@ cdef class PileupColumn: property pileups: '''list of reads (:class:`pysam.PileupRead`) aligned to this column''' def __get__(self): - cdef int x - pileups = [] - if self.plp == NULL or self.plp[0] == NULL: raise ValueError("PileupColumn accessed after iterator finished") + cdef int x + cdef bam_pileup1_t * p = NULL + pileups = [] + # warning: there could be problems if self.n and self.buf are # out of sync. for x from 0 <= x < self.n_pu: - pileups.append(makePileupRead(&(self.plp[0][x]), - self._alignment_file)) + p = &(self.plp[0][x]) + if pileup_base_qual_skip(p, self.min_base_quality): + continue + pileups.append(makePileupRead(p, self.header)) return pileups ######################################################## @@ -2597,23 +2846,274 @@ cdef class PileupColumn: # Functions, properties for compatibility with pysam < 0.8 ######################################################## property pos: + """deprecated: use reference_pos""" def __get__(self): return self.reference_pos def __set__(self, v): self.reference_pos = v property tid: + """deprecated: use reference_id""" def __get__(self): return self.reference_id def __set__(self, v): self.reference_id = v property n: + """deprecated: use nsegments""" def __get__(self): return self.nsegments def __set__(self, v): self.nsegments = v + def get_num_aligned(self): + """return number of aligned bases at pileup column position. 
+ + This method applies a base quality filter and the number is + equal to the size of :meth:`get_query_sequences`, + :meth:`get_mapping_qualities`, etc. + + """ + cdef uint32_t x = 0 + cdef uint32_t c = 0 + cdef uint32_t cnt = 0 + cdef bam_pileup1_t * p = NULL + for x from 0 <= x < self.n_pu: + p = &(self.plp[0][x]) + if pileup_base_qual_skip(p, self.min_base_quality): + continue + cnt += 1 + return cnt + + def get_query_sequences(self, bint mark_matches=False, bint mark_ends=False, bint add_indels=False): + """query bases/sequences at pileup column position. + + Optionally, the bases/sequences can be annotated according to the samtools + mpileup format. This is the format description from the samtools mpileup tool:: + + Information on match, mismatch, indel, strand, mapping + quality and start and end of a read are all encoded at the + read base column. At this column, a dot stands for a match + to the reference base on the forward strand, a comma for a + match on the reverse strand, a '>' or '<' for a reference + skip, `ACGTN' for a mismatch on the forward strand and + `acgtn' for a mismatch on the reverse strand. A pattern + `\\+[0-9]+[ACGTNacgtn]+' indicates there is an insertion + between this reference position and the next reference + position. The length of the insertion is given by the + integer in the pattern, followed by the inserted + sequence. Similarly, a pattern `-[0-9]+[ACGTNacgtn]+' + represents a deletion from the reference. The deleted bases + will be presented as `*' in the following lines. Also at + the read base column, a symbol `^' marks the start of a + read. The ASCII of the character following `^' minus 33 + gives the mapping quality. A symbol `$' marks the end of a + read segment + + To reproduce samtools mpileup format, set all of mark_matches, + mark_ends and add_indels to True. + + Parameters + ---------- + + mark_matches: bool + + If True, output bases matching the reference as "," or "." 
+ for forward and reverse strand, respectively. This mark + requires the reference sequence. If no reference is + present, this option is ignored. + + mark_ends : bool + + If True, add markers "^" and "$" for read start and end, respectively. + + add_indels : bool + + If True, add bases for bases inserted into the reference and + 'N's for base skipped from the reference. If a reference sequence + is given, add the actual bases. + + Returns + ------- + + list: a list of bases/sequences per read at pileup column position. + + """ + cdef uint32_t x = 0 + cdef uint32_t j = 0 + cdef uint32_t c = 0 + cdef uint32_t n = 0 + cdef uint8_t cc = 0 + cdef uint8_t rb = 0 + cdef uint8_t * buf = self.buf + cdef bam_pileup1_t * p = NULL + + # todo: reference sequence to count matches/mismatches + # todo: convert assertions to exceptions + for x from 0 <= x < self.n_pu: + p = &(self.plp[0][x]) + if pileup_base_qual_skip(p, self.min_base_quality): + continue + # see samtools pileup_seq + if mark_ends and p.is_head: + buf[n] = '^' + n += 1 + assert n < MAX_PILEUP_BUFFER_SIZE + + if p.b.core.qual > 93: + buf[n] = 126 + else: + buf[n] = p.b.core.qual + 33 + n += 1 + assert n < MAX_PILEUP_BUFFER_SIZE + + if not p.is_del: + if p.qpos < p.b.core.l_qseq: + cc = seq_nt16_str[bam_seqi(bam_get_seq(p.b), p.qpos)] + else: + cc = 'N' + + if mark_matches and self.reference_sequence != NULL: + rb = self.reference_sequence[self.reference_pos] + if seq_nt16_table[cc] == seq_nt16_table[rb]: + cc = "=" + buf[n] = strand_mark_char(cc, p.b) + n += 1 + assert n < MAX_PILEUP_BUFFER_SIZE + elif add_indels: + if p.is_refskip: + if bam_is_rev(p.b): + buf[n] = '<' + else: + buf[n] = '>' + else: + buf[n] = '*' + n += 1 + assert n < MAX_PILEUP_BUFFER_SIZE + if add_indels: + if p.indel > 0: + buf[n] = '+' + n += 1 + assert n < MAX_PILEUP_BUFFER_SIZE + n += snprintf(&(buf[n]), + MAX_PILEUP_BUFFER_SIZE - n, + "%i", + p.indel) + assert n < MAX_PILEUP_BUFFER_SIZE + for j from 1 <= j <= p.indel: + cc = 
seq_nt16_str[bam_seqi(bam_get_seq(p.b), p.qpos + j)] + buf[n] = strand_mark_char(cc, p.b) + n += 1 + assert n < MAX_PILEUP_BUFFER_SIZE + elif p.indel < 0: + buf[n] = '-' + n += 1 + assert n < MAX_PILEUP_BUFFER_SIZE + n += snprintf(&(buf[n]), + MAX_PILEUP_BUFFER_SIZE - n, + "%i", + -p.indel) + assert n < MAX_PILEUP_BUFFER_SIZE + for j from 1 <= j <= -p.indel: + # TODO: out-of-range check here? + if self.reference_sequence == NULL: + cc = 'N' + else: + cc = self.reference_sequence[self.reference_pos + j] + buf[n] = strand_mark_char(cc, p.b) + n += 1 + assert n < MAX_PILEUP_BUFFER_SIZE + if mark_ends and p.is_tail: + buf[n] = '$' + n += 1 + assert n < MAX_PILEUP_BUFFER_SIZE + + buf[n] = ':' + n += 1 + assert n < MAX_PILEUP_BUFFER_SIZE + + # quicker to ensemble all and split than to encode all separately. + # ignore last ":" + return force_str(PyBytes_FromStringAndSize(buf, n-1)).split(":") + + def get_query_qualities(self): + """query base quality scores at pileup column position. + + Returns + ------- + + list: a list of quality scores + """ + cdef uint32_t x = 0 + cdef bam_pileup1_t * p = NULL + cdef uint32_t c = 0 + result = [] + for x from 0 <= x < self.n_pu: + p = &(self.plp[0][x]) + if p.qpos < p.b.core.l_qseq: + c = bam_get_qual(p.b)[p.qpos] + else: + c = 0 + if c < self.min_base_quality: + continue + result.append(c) + return result + + def get_mapping_qualities(self): + """query mapping quality scores at pileup column position. + + Returns + ------- + + list: a list of quality scores + """ + cdef uint32_t x = 0 + cdef bam_pileup1_t * p = NULL + result = [] + for x from 0 <= x < self.n_pu: + p = &(self.plp[0][x]) + if pileup_base_qual_skip(p, self.min_base_quality): + continue + result.append(p.b.core.qual) + return result + + def get_query_positions(self): + """positions in read at pileup column position. 
+ + Returns + ------- + + list: a list of read positions + """ + + cdef uint32_t x = 0 + cdef bam_pileup1_t * p = NULL + result = [] + for x from 0 <= x < self.n_pu: + p = &(self.plp[0][x]) + if pileup_base_qual_skip(p, self.min_base_quality): + continue + result.append(p.qpos) + return result + + def get_query_names(self): + """query/read names aligned at pileup column position. + + Returns + ------- + + list: a list of query names at pileup column position. + """ + cdef uint32_t x = 0 + cdef bam_pileup1_t * p = NULL + result = [] + for x from 0 <= x < self.n_pu: + p = &(self.plp[0][x]) + if pileup_base_qual_skip(p, self.min_base_quality): + continue + result.append(charptr_to_str(pysam_bam_get_qname(p.b))) + return result + cdef class PileupRead: '''Representation of a read aligned to a particular position in the @@ -2697,6 +3197,7 @@ cdef class PileupRead: def __get__(self): return self._is_refskip + cpdef enum CIGAR_OPS: CMATCH = 0 @@ -2763,5 +3264,5 @@ __all__ = [ "FSECONDARY", "FQCFAIL", "FDUP", - "FSUPPLEMENTARY"] - + "FSUPPLEMENTARY", + "KEY_NAMES"] diff --git a/pysam/libcalignmentfile.pxd b/pysam/libcalignmentfile.pxd index fb2bd0c..7410230 100644 --- a/pysam/libcalignmentfile.pxd +++ b/pysam/libcalignmentfile.pxd @@ -18,11 +18,6 @@ cdef extern from "htslib_util.h": char * pysam_bam_get_qname(bam1_t * b) -cdef extern from "samfile_util.h": - - int bam_cap_mapQ(bam1_t *b, char *ref, int thres) - int bam_prob_realn(bam1_t *b, const char *ref) - #################################################################### # Utility types @@ -34,15 +29,24 @@ ctypedef struct __iterdata: int tid char * seq int seq_len + int min_mapping_quality + int flag_require + int flag_filter + bint compute_baq + bint redo_baq + bint ignore_orphans + int adjust_capq_threshold +cdef class AlignmentHeader(object): + cdef bam_hdr_t *ptr + cdef class AlignmentFile(HTSFile): cdef readonly object reference_filename + cdef readonly AlignmentHeader header # pointer to index cdef hts_idx_t 
*index - # header structure - cdef bam_hdr_t * header # current read within iteration cdef bam1_t * b @@ -77,7 +81,8 @@ cdef class IteratorRow: cdef bam1_t * b cdef AlignmentFile samfile cdef htsFile * htsfile - cdef bam_hdr_t * header + cdef hts_idx_t * index + cdef AlignmentHeader header cdef int owns_samfile @@ -86,12 +91,14 @@ cdef class IteratorRowRegion(IteratorRow): cdef bam1_t * getCurrent(self) cdef int cnext(self) + cdef class IteratorRowHead(IteratorRow): cdef int max_rows cdef int current_row cdef bam1_t * getCurrent(self) cdef int cnext(self) + cdef class IteratorRowAll(IteratorRow): cdef bam1_t * getCurrent(self) cdef int cnext(self) @@ -116,27 +123,29 @@ cdef class IteratorColumn: cdef int tid cdef int pos cdef int n_plp - cdef int mask + cdef uint32_t min_base_quality cdef bam_pileup1_t * plp - cdef bam_plp_t pileup_iter + cdef bam_mplp_t pileup_iter cdef __iterdata iterdata cdef AlignmentFile samfile cdef FastaFile fastafile cdef stepper cdef int max_depth + cdef bint ignore_overlaps cdef int cnext(self) - cdef char * getSequence(self) - cdef setMask(self, mask) - cdef setupIteratorData(self, - int tid, - int start, - int stop, - int multiple_iterators=?) + cdef char * get_sequence(self) + cdef _setup_iterator(self, + int tid, + int start, + int stop, + int multiple_iterators=?) 
cdef reset(self, tid, start, stop) cdef _free_pileup_iter(self) - + # backwards compatibility + cdef char * getSequence(self) + cdef class IteratorColumnRegion(IteratorColumn): cdef int start @@ -151,6 +160,6 @@ cdef class IteratorColumnAllRefs(IteratorColumn): cdef class IndexedReads: cdef AlignmentFile samfile cdef htsFile * htsfile - cdef index + cdef object index cdef int owns_samfile - cdef bam_hdr_t * header + cdef AlignmentHeader header diff --git a/pysam/libcalignmentfile.pyx b/pysam/libcalignmentfile.pyx index 1599dfa..439cc55 100644 --- a/pysam/libcalignmentfile.pyx +++ b/pysam/libcalignmentfile.pyx @@ -8,6 +8,8 @@ # # class AlignmentFile read/write access to SAM/BAM/CRAM formatted files # +# class AlignmentHeader manage SAM/BAM/CRAM header data +# # class IndexedReads index a SAM/BAM/CRAM file by query name while keeping # the original sort order intact # @@ -57,7 +59,6 @@ import collections import re import warnings import array - from libc.errno cimport errno, EPIPE from libc.string cimport strcmp, strpbrk, strerror from cpython cimport array as c_array @@ -76,6 +77,13 @@ else: cimport cython +__all__ = [ + "AlignmentFile", + "AlignmentHeader", + "IteratorRow", + "IteratorColumn", + "IndexedReads"] + IndexStats = collections.namedtuple("IndexStats", ("contig", "mapped", @@ -88,11 +96,11 @@ IndexStats = collections.namedtuple("IndexStats", cdef int MAX_POS = 2 << 29 # valid types for SAM headers -VALID_HEADER_TYPES = {"HD" : dict, - "SQ" : list, - "RG" : list, - "PG" : list, - "CO" : list} +VALID_HEADER_TYPES = {"HD" : collections.Mapping, + "SQ" : collections.Sequence, + "RG" : collections.Sequence, + "PG" : collections.Sequence, + "CO" : collections.Sequence} # order of records within SAM headers VALID_HEADERS = ("HD", "SQ", "RG", "PG", "CO") @@ -148,139 +156,409 @@ def build_header_line(fields, record): return "\t".join(line) -cdef bam_hdr_t * build_header_from_dict(new_header): - '''return a new header built from a dictionary in `new_header`. 
- This method inserts the text field, target_name and target_len. - ''' - cdef list lines = [] +cdef AlignmentHeader makeAlignmentHeader(bam_hdr_t *hdr): + if not hdr: + raise ValueError('cannot create AlignmentHeader, received NULL pointer') - # create new header and copy old data - cdef bam_hdr_t * dest = bam_hdr_init() + # check: is AlignmetHeader.__cinit__ called? + cdef AlignmentHeader header = AlignmentHeader.__new__(AlignmentHeader) + header.ptr = hdr - # first: defined tags - for record in VALID_HEADERS: - if record in new_header: - ttype = VALID_HEADER_TYPES[record] - data = new_header[record] - if type(data) != type(ttype()): - raise ValueError( - "invalid type for record %s: %s, expected %s" % - (record, type(data), type(ttype()))) - if type(data) is dict: + return header + + +# the following should be class-method for VariantHeader, but cdef @classmethods +# are not implemented in cython. +cdef int fill_AlignmentHeader_from_list(bam_hdr_t *dest, + reference_names, + reference_lengths, + add_sq_text=True, + text=None) except -1: + """build header from list of reference names and lengths. + """ + +cdef class AlignmentHeader(object): + """header information for a :class:`AlignmentFile` object + + Parameters + ---------- + header_dict : dict + build header from a multi-level dictionary. The + first level are the four types ('HD', 'SQ', ...). The second + level are a list of lines, with each line being a list of + tag-value pairs. The header is constructed first from all the + defined fields, followed by user tags in alphabetical + order. Alternatively, an :class:`~pysam.AlignmentHeader` + object can be passed directly. + + text : string + use the string provided as the header + + reference_names : list + see reference_lengths + + reference_lengths : list + build header from list of chromosome names and lengths. By + default, 'SQ' and 'LN' tags will be added to the header + text. This option can be changed by unsetting the flag + `add_sq_text`. 
+ + add_sq_text : bool + do not add 'SQ' and 'LN' tags to header. This option permits + construction :term:`SAM` formatted files without a header. + + """ + + # See makeVariantHeader for C constructor + def __cinit__(self): + self.ptr = NULL + + # Python constructor + def __init__(self): + self.ptr = bam_hdr_init() + if self.ptr is NULL: + raise MemoryError("could not create header") + + @classmethod + def _from_text_and_lengths(cls, text, reference_names, reference_lengths): + + cdef AlignmentHeader self = AlignmentHeader() + cdef char *ctext + cdef int l_text + cdef int n, x + if text is not None: + btext = force_bytes(text) + ctext = btext + l_text = len(btext) + self.ptr.text = calloc(l_text + 1, sizeof(char)) + if self.ptr.text == NULL: + raise MemoryError("could not allocate {} bytes".format(l_text + 1), sizeof(char)) + self.ptr.l_text = l_text + memcpy(self.ptr.text, ctext, l_text + 1) + + if reference_names and reference_lengths: + reference_names = [force_bytes(ref) for ref in reference_names] + + self.ptr.n_targets = len(reference_names) + + n = sum([len(reference_names) + 1]) + self.ptr.target_name = calloc(n, sizeof(char*)) + if self.ptr.target_name == NULL: + raise MemoryError("could not allocate {} bytes".format(n, sizeof(char *))) + + self.ptr.target_len = calloc(n, sizeof(uint32_t)) + if self.ptr.target_len == NULL: + raise MemoryError("could not allocate {} bytes".format(n, sizeof(uint32_t))) + + for x from 0 <= x < self.ptr.n_targets: + self.ptr.target_len[x] = reference_lengths[x] + name = reference_names[x] + self.ptr.target_name[x] = calloc(len(name) + 1, sizeof(char)) + if self.ptr.target_name[x] == NULL: + raise MemoryError("could not allocate {} bytes".format(len(name) + 1, sizeof(char))) + strncpy(self.ptr.target_name[x], name, len(name)) + + return self + + @classmethod + def from_text(cls, text): + + reference_names, reference_lengths = [], [] + for line in text.splitlines(): + if line.startswith("@SQ"): + fields = dict([x.split(":", 1) 
for x in line.split("\t")[1:]]) + try: + reference_names.append(fields["SN"]) + reference_lengths.append(int(fields["LN"])) + except KeyError: + raise KeyError("incomplete sequence information in '%s'" % str(fields)) + except ValueError: + raise ValueError("wrong sequence information in '%s'" % str(fields)) + + return cls._from_text_and_lengths(text, reference_names, reference_lengths) + + @classmethod + def from_dict(cls, header_dict): + + cdef list lines = [] + # first: defined tags + for record in VALID_HEADERS: + if record in header_dict: + data = header_dict[record] + if not isinstance(data, VALID_HEADER_TYPES[record]): + raise ValueError( + "invalid type for record %s: %s, expected %s".format( + record, type(data), VALID_HEADER_TYPES[record])) + if isinstance(data, collections.Mapping): + lines.append(build_header_line(data, record)) + else: + for fields in header_dict[record]: + lines.append(build_header_line(fields, record)) + + # then: user tags (lower case), sorted alphabetically + for record, data in sorted(header_dict.items()): + if record in VALID_HEADERS: + continue + if isinstance(data, collections.Mapping): lines.append(build_header_line(data, record)) else: - for fields in new_header[record]: + for fields in header_dict[record]: lines.append(build_header_line(fields, record)) - # then: user tags (lower case), sorted alphabetically - for record, data in sorted(new_header.items()): - if record in VALID_HEADERS: continue - if type(data) is dict: - lines.append(build_header_line(data, record)) - else: - for fields in new_header[record]: - lines.append(build_header_line(fields, record)) - - text = "\n".join(lines) + "\n" - if dest.text != NULL: - free(dest.text) - dest.text = calloc(len(text), sizeof(char)) - if dest.text == NULL: - raise MemoryError("could not allocate {} bytes".format(len(text) * sizeof(char))) - dest.l_text = len(text) - cdef bytes btext = text.encode('ascii') - strncpy(dest.text, btext, dest.l_text) - - cdef bytes bseqname - # 
collect targets - if "SQ" in new_header: - seqs = [] - for fields in new_header["SQ"]: - try: - seqs.append( (fields["SN"], fields["LN"] ) ) - except KeyError: - raise KeyError( "incomplete sequence information in '%s'" % str(fields)) - - dest.n_targets = len(seqs) - dest.target_name = calloc(dest.n_targets, sizeof(char*)) - if dest.target_name == NULL: - raise MemoryError("could not allocate {} bytes".format(dest.n_targets, sizeof(char *))) - dest.target_len = calloc(dest.n_targets, sizeof(uint32_t)) - if dest.target_len == NULL: - raise MemoryError("could not allocate {} bytes".format(dest.n_targets * sizeof(uint32_t))) - - for x from 0 <= x < dest.n_targets: - seqname, seqlen = seqs[x] - dest.target_name[x] = calloc( - len(seqname) + 1, sizeof(char)) - if dest.target_name[x] == NULL: - raise MemoryError("could not allocate {} bytes".format(len(seqname) + 1, sizeof(char))) - bseqname = seqname.encode('ascii') - strncpy(dest.target_name[x], bseqname, - len(seqname) + 1) - dest.target_len[x] = seqlen - - return dest - - -cdef bam_hdr_t * build_header_from_list(reference_names, - reference_lengths, - add_sq_text=True, - text=None): - - assert len(reference_names) == len(reference_lengths), \ - "unequal names and lengths of reference sequences" - - cdef bam_hdr_t * dest = bam_hdr_init() - - # allocate and fill header - reference_names = [force_bytes(ref) for ref in reference_names] - dest.n_targets = len(reference_names) - n = 0 - for x in reference_names: - n += len(x) + 1 - dest.target_name = calloc(n, sizeof(char*)) - if dest.target_name == NULL: - raise MemoryError("could not allocate {} bytes".format(n, sizeof(char *))) - - dest.target_len = calloc(n, sizeof(uint32_t)) - if dest.target_len == NULL: - raise MemoryError("could not allocate {} bytes".format(n, sizeof(uint32_t))) - - for x from 0 <= x < dest.n_targets: - dest.target_len[x] = reference_lengths[x] - name = reference_names[x] - dest.target_name[x] = calloc( - len(name) + 1, sizeof(char)) - if 
dest.target_name[x] == NULL: - raise MemoryError("could not allocate {} bytes".format(len(name) + 1, sizeof(char))) - strncpy(dest.target_name[x], name, len(name)) - - # Optionally, if there is no text, add a SAM - # compatible header to output file. - if text is None and add_sq_text: + text = "\n".join(lines) + "\n" + + reference_names, reference_lengths = [], [] + if "SQ" in header_dict: + for fields in header_dict["SQ"]: + try: + reference_names.append(fields["SN"]) + reference_lengths.append(fields["LN"]) + except KeyError: + raise KeyError("incomplete sequence information in '%s'" % str(fields)) + + return cls._from_text_and_lengths(text, reference_names, reference_lengths) + + @classmethod + def from_references(cls, reference_names, reference_lengths, text=None, add_sq_text=True): + + if len(reference_names) != len(reference_lengths): + raise ValueError("number of reference names and lengths do not match") + + # optionally, if there is no text, add a SAM compatible header to output file. + if text is None and add_sq_text: + text = "".join(["@SQ\tSN:{}\tLN:{}\n".format(x, y) for x, y in zip( + reference_names, reference_lengths)]) + + return cls._from_text_and_lengths(text, reference_names, reference_lengths) + + def __dealloc__(self): + bam_hdr_destroy(self.ptr) + self.ptr = NULL + + def __bool__(self): + return self.ptr != NULL + + def copy(self): + return makeAlignmentHeader(bam_hdr_dup(self.ptr)) + + property nreferences: + """"int with the number of :term:`reference` sequences in the file. + + This is a read-only attribute.""" + def __get__(self): + return self.ptr.n_targets + + property references: + """tuple with the names of :term:`reference` sequences. This is a + read-only attribute""" + def __get__(self): + t = [] + cdef int x + for x in range(self.ptr.n_targets): + t.append(charptr_to_str(self.ptr.target_name[x])) + return tuple(t) + + property lengths: + """tuple of the lengths of the :term:`reference` sequences. This is a + read-only attribute. 
The lengths are in the same order as + :attr:`pysam.AlignmentFile.references` + """ + def __get__(self): + t = [] + cdef int x + for x in range(self.ptr.n_targets): + t.append(self.ptr.target_len[x]) + return tuple(t) + + def _build_sequence_section(self): + """return sequence section of header. + + The sequence section is built from the list of reference names and + lengths stored in the BAM-file and not from any @SQ entries that + are part of the header's text section. + """ + + cdef int x text = [] - for x from 0 <= x < dest.n_targets: - text.append("@SQ\tSN:%s\tLN:%s\n" % \ - (force_str(reference_names[x]), - reference_lengths[x])) - text = ''.join(text) + for x in range(self.ptr.n_targets): + text.append("@SQ\tSN:{}\tLN:{}\n".format( + force_str(self.ptr.target_name[x]), + self.ptr.target_len[x])) + return "".join(text) + + def to_dict(self): + """return two-level dictionary with header information from the file. - cdef char * ctext = NULL + The first level contains the record (``HD``, ``SQ``, etc) and + the second level contains the fields (``VN``, ``LN``, etc). + + The parser is validating and will raise an AssertionError if + if encounters any record or field tags that are not part of + the SAM specification. Use the + :attr:`pysam.AlignmentFile.text` attribute to get the unparsed + header. + + The parsing follows the SAM format specification with the + exception of the ``CL`` field. This option will consume the + rest of a header line irrespective of any additional fields. + This behaviour has been added to accommodate command line + options that contain characters that are not valid field + separators. 
- if text is not None: - # copy without \0 - text = force_bytes(text) - ctext = text - dest.l_text = strlen(ctext) - dest.text = calloc( - strlen(ctext), sizeof(char)) - if dest.text == NULL: - raise MemoryError("could not allocate {} bytes".format(strlen(ctext), sizeof(char))) - memcpy(dest.text, ctext, strlen(ctext)) + If no @SQ entries are within the text section of the header, + this will be automatically added from the reference names and + lengths stored in the binary part of the header. + """ + result = collections.OrderedDict() + + # convert to python string + t = self.__str__() + for line in t.split("\n"): + if not line.strip(): + continue + assert line.startswith("@"), \ + "header line without '@': '%s'" % line + fields = line[1:].split("\t") + record = fields[0] + assert record in VALID_HEADER_TYPES, \ + "header line with invalid type '%s': '%s'" % (record, line) + + # treat comments + if record == "CO": + if record not in result: + result[record] = [] + result[record].append("\t".join( fields[1:])) + continue + # the following is clumsy as generators do not work? + x = {} + + for idx, field in enumerate(fields[1:]): + if ":" not in field: + raise ValueError("malformatted header: no ':' in field" ) + key, value = field.split(":", 1) + if key in ("CL",): + # special treatment for command line + # statements (CL). These might contain + # characters that are non-conformant with + # the valid field separators in the SAM + # header. Thus, in contravention to the + # SAM API, consume the rest of the line. 
+ key, value = "\t".join(fields[idx+1:]).split(":", 1) + x[key] = KNOWN_HEADER_FIELDS[record][key](value) + break + + # interpret type of known header record tags, default to str + x[key] = KNOWN_HEADER_FIELDS[record].get(key, str)(value) + + if VALID_HEADER_TYPES[record] == collections.Mapping: + if record in result: + raise ValueError( + "multiple '%s' lines are not permitted" % record) + + result[record] = x + elif VALID_HEADER_TYPES[record] == collections.Sequence: + if record not in result: result[record] = [] + result[record].append(x) + + # if there are no SQ lines in the header, add the + # reference names from the information in the bam + # file. + # + # Background: c-samtools keeps the textual part of the + # header separate from the list of reference names and + # lengths. Thus, if a header contains only SQ lines, + # the SQ information is not part of the textual header + # and thus are missing from the output. See issue 84. + if "SQ" not in result: + sq = [] + for ref, length in zip(self.references, self.lengths): + sq.append({'LN': length, 'SN': ref }) + result["SQ"] = sq + + return result + + def as_dict(self): + """deprecated: use :meth:`to_dict()`""" + return self.to_dict() - return dest + def get_reference_name(self, tid): + if tid == -1: + return None + if not 0 <= tid < self.ptr.n_targets: + raise ValueError("reference_id %i out of range 0<=tid<%i" % + (tid, self.ptr.n_targets)) + return charptr_to_str(self.ptr.target_name[tid]) + + def get_reference_length(self, reference): + cdef int tid = self.get_tid(reference) + if tid < 0: + raise KeyError("unknown reference {}".format(reference)) + else: + return self.ptr.target_len[tid] + + def is_valid_tid(self, int tid): + """ + return True if the numerical :term:`tid` is valid; False otherwise. + + Note that the unmapped tid code (-1) counts as an invalid. 
+ """ + return 0 <= tid < self.ptr.n_targets + + def get_tid(self, reference): + """ + return the numerical :term:`tid` corresponding to + :term:`reference` + + returns -1 if reference is not known. + """ + reference = force_bytes(reference) + return bam_name2id(self.ptr, reference) + + def __str__(self): + '''string with the full contents of the :term:`sam file` header as a + string. + + If no @SQ entries are within the text section of the header, + this will be automatically added from the reference names and + lengths stored in the binary part of the header. + + See :attr:`pysam.AlignmentFile.header.to_dict()` to get a parsed + representation of the header. + ''' + text = from_string_and_size(self.ptr.text, self.ptr.l_text) + if "@SQ" not in text: + text += "\n" + self._build_sequence_section() + return text + + # dictionary access methods, for backwards compatibility. + def __setitem__(self, key, value): + raise TypeError("AlignmentHeader does not support item assignment (use header.to_dict()") + + def __getitem__(self, key): + return self.to_dict().__getitem__(key) + + def items(self): + return self.to_dict().items() + + # PY2 compatibility + def iteritems(self): + return self.to_dict().items() + + def keys(self): + return self.to_dict().keys() + + def values(self): + return self.to_dict().values() + + def get(self, *args): + return self.to_dict().get(*args) + + def __len__(self): + return self.to_dict().__len__() + + def __contains__(self, key): + return self.to_dict().__contains__(key) cdef class AlignmentFile(HTSFile): @@ -353,14 +631,16 @@ cdef class AlignmentFile(HTSFile): f2 = pysam.AlignmentFile('ex1.sam') template : AlignmentFile - when writing, copy header frem `template`. + when writing, copy header from file `template`. - header : dict + header : dict or AlignmentHeader when writing, build header from a multi-level dictionary. The first level are the four types ('HD', 'SQ', ...). 
The second level are a list of lines, with each line being a list of tag-value pairs. The header is constructed first from all the - defined fields, followed by user tags in alphabetical order. + defined fields, followed by user tags in alphabetical + order. Alternatively, an :class:`~pysam.AlignmentHeader` + object can be passed directly. text : string when writing, use the string provided as the header @@ -427,6 +707,10 @@ cdef class AlignmentFile(HTSFile): Issue a warning, instead of raising an error if the current file appears to be truncated due to a missing EOF marker. Only applies to bgzipped formats. (Default=False) + + format_options: list + A list of key=value strings, as accepted by --input-fmt-option and + --output-fmt-option in samtools. """ def __cinit__(self, *args, **kwargs): @@ -445,7 +729,7 @@ cdef class AlignmentFile(HTSFile): # allocate memory for iterator self.b = calloc(1, sizeof(bam1_t)) - if self.b is NULL: + if self.b == NULL: raise MemoryError("could not allocate memory of size {}".format(sizeof(bam1_t))) def has_index(self): @@ -497,7 +781,8 @@ cdef class AlignmentFile(HTSFile): referencenames=None, referencelengths=None, duplicate_filehandle=True, - ignore_truncation=False): + ignore_truncation=False, + format_options=None): '''open a sam, bam or cram formatted file. 
If _open is called on an existing file, the current file @@ -508,7 +793,8 @@ cdef class AlignmentFile(HTSFile): cdef char *creference_filename = NULL cdef char *cindexname = NULL cdef char *cmode = NULL - + cdef bam_hdr_t * hdr = NULL + # for backwards compatibility: if referencenames is not None: reference_names = referencenames @@ -574,32 +860,42 @@ cdef class AlignmentFile(HTSFile): if mode[0] == 'w': # open file for writing - # header structure (used for writing) + if not (template or header or reference_names): + raise ValueError( + "either supply options `template`, `header` or both `reference_names` " + "and `reference_lengths` for writing") + if template: - self.header = bam_hdr_dup(template.header) - elif header: - self.header = build_header_from_dict(header) - else: - assert reference_names and reference_lengths, \ - ("either supply options `template`, `header` " - "or both `reference_names` and `reference_lengths` " - "for writing") - # build header from a target names and lengths - self.header = build_header_from_list( + # header is copied, though at the moment not strictly + # necessary as AlignmentHeader is immutable. + self.header = template.header.copy() + elif isinstance(header, AlignmentHeader): + self.header = header.copy() + elif isinstance(header, collections.Mapping): + self.header = AlignmentHeader.from_dict(header) + elif reference_names and reference_lengths: + self.header = AlignmentHeader.from_references( reference_names, reference_lengths, add_sq_text=add_sq_text, text=text) - + elif text: + self.header = AlignmentHeader.from_text(text) + else: + raise ValueError("not enough information to construct header. 
Please provide template, " + "header, text or reference_names/reference_lengths") + self.htsfile = self._open_htsfile() if self.htsfile == NULL: if errno: - raise IOError(errno, "could not open alignment file `{}`: {}".format(force_str(filename), - force_str(strerror(errno)))) + raise IOError(errno, "could not open alignment file `{}`: {}".format( + force_str(filename), + force_str(strerror(errno)))) else: raise ValueError("could not open alignment file `{}`".format(force_str(filename))) - + if format_options and len(format_options): + self.add_hts_options(format_options) # set filename with reference sequences. If no filename # is given, the CRAM reference arrays will be built from # the @SQ header in the header @@ -609,8 +905,9 @@ cdef class AlignmentFile(HTSFile): # write header to htsfile if "b" in mode or "c" in mode or "h" in mode: + hdr = self.header.ptr with nogil: - sam_hdr_write(self.htsfile, self.header) + sam_hdr_write(self.htsfile, hdr) elif mode[0] == "r": # open file for reading @@ -626,33 +923,40 @@ cdef class AlignmentFile(HTSFile): if self.htsfile.format.category != sequence_data: raise ValueError("file does not contain alignment data") + if format_options and len(format_options): + self.add_hts_options(format_options) + self.check_truncation(ignore_truncation) - # bam files require a valid header + # bam/cram files require a valid header if self.is_bam or self.is_cram: with nogil: - self.header = sam_hdr_read(self.htsfile) - - # in sam files a header is optional, but requires - # reference names and lengths - elif reference_names and reference_lengths: - self.header = build_header_from_list( - reference_names, - reference_lengths, - add_sq_text=add_sq_text, - text=text) - else: - with nogil: - self.header = sam_hdr_read(self.htsfile) - - if self.header == NULL: + hdr = sam_hdr_read(self.htsfile) + if hdr == NULL: raise ValueError( - "file `{}` does not have valid header, " - "please provide reference_names and 
reference_lengths".format(force_str(filename))) - - if self.header == NULL: - raise ValueError("file `{}` does not have valid header".format(force_str(filename))) - + "file does not have a valid header (mode='%s') " + "- is it BAM/CRAM format?" % mode) + self.header = makeAlignmentHeader(hdr) + else: + # in sam files a header is optional. If not given, + # user may provide reference names and lengths to built + # an on-the-fly header. + if reference_names and reference_lengths: + # build header from a target names and lengths + self.header = AlignmentHeader.from_references( + reference_names=reference_names, + reference_lengths=reference_lengths, + add_sq_text=add_sq_text, + text=text) + else: + with nogil: + hdr = sam_hdr_read(self.htsfile) + if hdr == NULL: + raise ValueError( + "SAM? file does not have a valid header (mode='%s'), " + "please provide reference_names and reference_lengths") + self.header = makeAlignmentHeader(hdr) + # set filename with reference sequences if self.is_cram and reference_filename: creference_filename = self.reference_filename @@ -660,19 +964,16 @@ cdef class AlignmentFile(HTSFile): CRAM_OPT_REFERENCE, creference_filename) - if check_sq and self.header.n_targets == 0: + if check_sq and self.header.nreferences == 0: raise ValueError( ("file has no sequences defined (mode='%s') - " "is it SAM/BAM format? 
Consider opening with " "check_sq=False") % mode) if self.is_bam or self.is_cram: - # open index for remote files - # returns NULL if there is no index or index could - # not be opened - index_filename = index_filename or filepath_index - if index_filename: - cindexname = bindex_filename = encode_filename(index_filename) + self.index_filename = index_filename or filepath_index + if self.index_filename: + cindexname = bfile_name = encode_filename(self.index_filename) if cfilename or cindexname: with nogil: @@ -682,7 +983,7 @@ cdef class AlignmentFile(HTSFile): if errno: raise IOError(errno, force_str(strerror(errno))) else: - raise IOError('unable to open index file `%s`' % index_filename) + raise IOError('unable to open index file `%s`' % self.index_filename) elif require_index: raise IOError('unable to open index file') @@ -691,35 +992,6 @@ cdef class AlignmentFile(HTSFile): if not self.is_stream: self.start_offset = self.tell() - def is_valid_tid(self, tid): - """ - return True if the numerical :term:`tid` is valid; False otherwise. - """ - return 0 <= tid < self.header.n_targets - - def get_tid(self, reference): - """ - return the numerical :term:`tid` corresponding to - :term:`reference` - - returns -1 if reference is not known. 
- """ - if not self.is_open: - raise ValueError("I/O operation on closed file") - reference = force_bytes(reference) - return bam_name2id(self.header, reference) - - def get_reference_name(self, tid): - """ - return :term:`reference` name corresponding to numerical :term:`tid` - """ - if not self.is_open: - raise ValueError("I/O operation on closed file") - if not 0 <= tid < self.header.n_targets: - raise ValueError("reference_id %i out of range 0<=tid<%i" % - (tid, self.header.n_targets)) - return charptr_to_str(self.header.target_name[tid]) - def fetch(self, contig=None, start=None, @@ -790,10 +1062,11 @@ cdef class AlignmentFile(HTSFile): if not self.is_open: raise ValueError( "I/O operation on closed file" ) - has_coord, rtid, rstart, rstop = self.parse_region(contig, start, stop, region, tid, - end=end, reference=reference) + has_coord, rtid, rstart, rstop = self.parse_region( + contig, start, stop, region, tid, + end=end, reference=reference) - # Turn of re-opening if htsfile is a stream + # Turn of re-opening if htsfile is a stream if self.is_stream: multiple_iterators = False @@ -945,6 +1218,16 @@ cdef class AlignmentFile(HTSFile): Parameters ---------- + truncate : bool + + By default, the samtools pileup engine outputs all reads + overlapping a region. If truncate is True and a region is + given, only columns in the exact region specificied are + returned. + + max_depth : int + Maximum read depth permitted. The default limit is '8000'. + stepper : string The stepper controls how the iterator advances. Possible options for the stepper are @@ -954,26 +1237,67 @@ cdef class AlignmentFile(HTSFile): BAM_FUNMAP, BAM_FSECONDARY, BAM_FQCFAIL, BAM_FDUP ``nofilter`` - uses every single read + uses every single read turning off any filtering. ``samtools`` same filter and read processing as in :term:`csamtools` - pileup. This requires a 'fastafile' to be given. - + pileup. For full compatibility, this requires a + 'fastafile' to be given. 
The following options all pertain + to filtering of the ``samtools`` stepper. fastafile : :class:`~pysam.FastaFile` object. This is required for some of the steppers. - max_depth : int - Maximum read depth permitted. The default limit is '8000'. + ignore_overlaps: bool - truncate : bool + If set to True, detect if read pairs overlap and only take + the higher quality base. This is the default. - By default, the samtools pileup engine outputs all reads - overlapping a region. If truncate is True and a region is - given, only columns in the exact region specificied are - returned. + flag_filter : int + + ignore reads where any of the bits in the flag are set. The default is + BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP. + + flag_require : int + + only use reads where certain flags are set. The default is 0. + + ignore_orphans: bool + + ignore orphans (paired reads that are not in a proper pair). + The default is to ignore orphans. + + min_base_quality: int + + Minimum base quality. Bases below the minimum quality will + not be output. + + adjust_capq_threshold: int + + adjust mapping quality. The default is 0 for no + adjustment. The recommended value for adjustment is 50. + + min_mapping_quality : int + + only use reads above a minimum mapping quality. The default is 0. + + compute_baq: bool + + re-alignment computing per-Base Alignment Qualities (BAQ). The + default is to do re-alignment. Realignment requires a reference + sequence. If none is present, no realignment will be performed. + + redo_baq: bool + + recompute per-Base Alignment Quality on the fly ignoring + existing base qualities. The default is False (use existing + base qualities). + + adjust_capq_threshold: int + + adjust mapping quality. The default is 0 for no + adjustment. The recommended value for adjustment is 50. 
Returns ------- @@ -1000,7 +1324,7 @@ cdef class AlignmentFile(HTSFile): stop=rstop, **kwargs) else: - return IteratorColumnAllRefs(self, **kwargs ) + return IteratorColumnAllRefs(self, **kwargs) else: raise NotImplementedError( @@ -1111,7 +1435,7 @@ cdef class AlignmentFile(HTSFile): @cython.boundscheck(False) # we do manual bounds checking def count_coverage(self, - contig=None, + contig, start=None, stop=None, region=None, @@ -1134,10 +1458,12 @@ cdef class AlignmentFile(HTSFile): reference_name of the genomic region (chromosome) start : int - start of the genomic region (0-based inclusive) + start of the genomic region (0-based inclusive). If not + given, count from the start of the chromosome. stop : int - end of the genomic region (0-based exclusive) + end of the genomic region (0-based exclusive). If not given, + count to the end of the chromosome. region : int a region string. @@ -1182,8 +1508,17 @@ cdef class AlignmentFile(HTSFile): """ - cdef int _start = start - cdef int _stop = stop if stop is not None else end + + cdef uint32_t contig_length = self.get_reference_length(contig) + cdef int _start = start if start is not None else 0 + cdef int _stop = stop if stop is not None else contig_length + _stop = _stop if _stop < contig_length else contig_length + + if _stop == _start: + raise ValueError("interval of size 0") + if _stop < _start: + raise ValueError("interval of size less than 0") + cdef int length = _stop - _start cdef c_array.array int_array_template = array.array('L', []) cdef c_array.array count_a @@ -1282,9 +1617,7 @@ cdef class AlignmentFile(HTSFile): hts_idx_destroy(self.index) self.index = NULL - if self.header != NULL: - bam_hdr_destroy(self.header) - self.header = NULL + self.header = None if ret < 0: global errno @@ -1304,9 +1637,7 @@ cdef class AlignmentFile(HTSFile): hts_idx_destroy(self.index) self.index = NULL - if self.header != NULL: - bam_hdr_destroy(self.header) - self.header = NULL + self.header = None if self.b: 
bam_destroy1(self.b) @@ -1341,7 +1672,7 @@ cdef class AlignmentFile(HTSFile): with nogil: ret = sam_write1(self.htsfile, - self.header, + self.header.ptr, read._delegate) # kbj: Still need to raise an exception with except -1. Otherwise @@ -1366,38 +1697,6 @@ cdef class AlignmentFile(HTSFile): ############################################################### ## properties ############################################################### - property nreferences: - """"int with the number of :term:`reference` sequences in the file. - This is a read-only attribute.""" - def __get__(self): - if not self.is_open: - raise ValueError("I/O operation on closed file") - return self.header.n_targets - - property references: - """tuple with the names of :term:`reference` sequences. This is a - read-only attribute""" - def __get__(self): - if not self.is_open: raise ValueError( "I/O operation on closed file" ) - t = [] - for x from 0 <= x < self.header.n_targets: - t.append(charptr_to_str(self.header.target_name[x])) - return tuple(t) - - property lengths: - """tuple of the lengths of the :term:`reference` sequences. This is a - read-only attribute. The lengths are in the same order as - :attr:`pysam.AlignmentFile.references` - - """ - def __get__(self): - if not self.is_open: - raise ValueError("I/O operation on closed file") - t = [] - for x from 0 <= x < self.header.n_targets: - t.append(self.header.target_len[x]) - return tuple(t) - property mapped: """int with total number of mapped alignments according to the statistics recorded in the index. 
This is a read-only @@ -1408,7 +1707,7 @@ cdef class AlignmentFile(HTSFile): cdef int tid cdef uint64_t total = 0 cdef uint64_t mapped, unmapped - for tid from 0 <= tid < self.header.n_targets: + for tid from 0 <= tid < self.header.nreferences: with nogil: hts_idx_get_stat(self.index, tid, &mapped, &unmapped) total += mapped @@ -1424,7 +1723,7 @@ cdef class AlignmentFile(HTSFile): cdef int tid cdef uint64_t total = hts_idx_get_n_no_coor(self.index) cdef uint64_t mapped, unmapped - for tid from 0 <= tid < self.header.n_targets: + for tid from 0 <= tid < self.header.nreferences: with nogil: hts_idx_get_stat(self.index, tid, &mapped, &unmapped) total += unmapped @@ -1468,115 +1767,6 @@ cdef class AlignmentFile(HTSFile): return results - property text: - '''string with the full contents of the :term:`sam file` header as a - string. - - This is a read-only attribute. - - See :attr:`pysam.AlignmentFile.header` to get a parsed - representation of the header. - ''' - def __get__(self): - if not self.is_open: - raise ValueError( "I/O operation on closed file" ) - return from_string_and_size(self.header.text, self.header.l_text) - - property header: - """two-level dictionay with header information from the file. - - This is a read-only attribute. - - The first level contains the record (``HD``, ``SQ``, etc) and - the second level contains the fields (``VN``, ``LN``, etc). - - The parser is validating and will raise an AssertionError if - if encounters any record or field tags that are not part of - the SAM specification. Use the - :attr:`pysam.AlignmentFile.text` attribute to get the unparsed - header. - - The parsing follows the SAM format specification with the - exception of the ``CL`` field. This option will consume the - rest of a header line irrespective of any additional fields. - This behaviour has been added to accommodate command line - options that contain characters that are not valid field - separators. 
- - """ - def __get__(self): - if not self.is_open: - raise ValueError( "I/O operation on closed file" ) - - result = {} - - if self.header.text != NULL: - # convert to python string (note: call self.text to - # create 0-terminated string) - t = self.text - for line in t.split("\n"): - if not line.strip(): continue - assert line.startswith("@"), \ - "header line without '@': '%s'" % line - fields = line[1:].split("\t") - record = fields[0] - assert record in VALID_HEADER_TYPES, \ - "header line with invalid type '%s': '%s'" % (record, line) - - # treat comments - if record == "CO": - if record not in result: - result[record] = [] - result[record].append("\t".join( fields[1:])) - continue - # the following is clumsy as generators do not work? - x = {} - - for idx, field in enumerate(fields[1:]): - if ":" not in field: - raise ValueError("malformatted header: no ':' in field" ) - key, value = field.split(":", 1) - if key in ("CL",): - # special treatment for command line - # statements (CL). These might contain - # characters that are non-conformant with - # the valid field separators in the SAM - # header. Thus, in contravention to the - # SAM API, consume the rest of the line. - key, value = "\t".join(fields[idx+1:]).split(":", 1) - x[key] = KNOWN_HEADER_FIELDS[record][key](value) - break - - # interpret type of known header record tags, default to str - x[key] = KNOWN_HEADER_FIELDS[record].get(key, str)(value) - - if VALID_HEADER_TYPES[record] == dict: - if record in result: - raise ValueError( - "multiple '%s' lines are not permitted" % record) - - result[record] = x - elif VALID_HEADER_TYPES[record] == list: - if record not in result: result[record] = [] - result[record].append(x) - - # if there are no SQ lines in the header, add the - # reference names from the information in the bam - # file. - # - # Background: c-samtools keeps the textual part of the - # header separate from the list of reference names and - # lengths. 
Thus, if a header contains only SQ lines, - # the SQ information is not part of the textual header - # and thus are missing from the output. See issue 84. - if "SQ" not in result: - sq = [] - for ref, length in zip(self.references, self.lengths): - sq.append({'LN': length, 'SN': ref }) - result["SQ"] = sq - - return result - ############################################################### ## file-object like iterator access ## note: concurrent access will cause errors (see IteratorRow @@ -1586,7 +1776,7 @@ cdef class AlignmentFile(HTSFile): if not self.is_open: raise ValueError("I/O operation on closed file") - if not self.is_bam and self.header.n_targets == 0: + if not self.is_bam and self.header.nreferences == 0: raise NotImplementedError( "can not iterate over samfile without header") return self @@ -1599,21 +1789,100 @@ cdef class AlignmentFile(HTSFile): cversion of iterator. Used by :class:`pysam.AlignmentFile.IteratorColumn`. ''' cdef int ret + cdef bam_hdr_t * hdr = self.header.ptr with nogil: ret = sam_read1(self.htsfile, - self.header, + hdr, self.b) return ret def __next__(self): cdef int ret = self.cnext() if (ret >= 0): - return makeAlignedSegment(self.b, self) + return makeAlignedSegment(self.b, self.header) elif ret == -2: raise IOError('truncated file') else: raise StopIteration + ########################################### + # methods/properties referencing the header + def is_valid_tid(self, int tid): + """ + return True if the numerical :term:`tid` is valid; False otherwise. + + Note that the unmapped tid code (-1) counts as an invalid. + """ + if self.header is None: + raise ValueError("header not available in closed files") + return self.header.is_valid_tid(tid) + + def get_tid(self, reference): + """ + return the numerical :term:`tid` corresponding to + :term:`reference` + + returns -1 if reference is not known. 
+ """ + if self.header is None: + raise ValueError("header not available in closed files") + return self.header.get_tid(reference) + + def get_reference_name(self, tid): + """ + return :term:`reference` name corresponding to numerical :term:`tid` + """ + if self.header is None: + raise ValueError("header not available in closed files") + return self.header.get_reference_name(tid) + + def get_reference_length(self, reference): + """ + return :term:`reference` name corresponding to numerical :term:`tid` + """ + if self.header is None: + raise ValueError("header not available in closed files") + return self.header.get_reference_length(reference) + + property nreferences: + """"int with the number of :term:`reference` sequences in the file. + This is a read-only attribute.""" + def __get__(self): + if self.header: + return self.header.nreferences + else: + raise ValueError("header not available in closed files") + + property references: + """tuple with the names of :term:`reference` sequences. This is a + read-only attribute""" + def __get__(self): + if self.header: + return self.header.references + else: + raise ValueError("header not available in closed files") + + property lengths: + """tuple of the lengths of the :term:`reference` sequences. This is a + read-only attribute. 
The lengths are in the same order as + :attr:`pysam.AlignmentFile.references` + + """ + def __get__(self): + if self.header: + return self.header.lengths + else: + raise ValueError("header not available in closed files") + + # Compatibility functions for pysam < 0.14 + property text: + """deprecated, use .header directly""" + def __get__(self): + if self.header: + return self.header.__str__() + else: + raise ValueError("header not available in closed files") + # Compatibility functions for pysam < 0.8.3 def gettid(self, reference): """deprecated, use get_tid() instead""" @@ -1652,7 +1921,8 @@ cdef class IteratorRow: def __init__(self, AlignmentFile samfile, int multiple_iterators=False): cdef char *cfilename cdef char *creference_filename - + cdef char *cindexname = NULL + if not samfile.is_open: raise ValueError("I/O operation on closed file") @@ -1663,16 +1933,30 @@ cdef class IteratorRow: # reopen the file - note that this makes the iterator # slow and causes pileup to slow down significantly. if multiple_iterators: + cfilename = samfile.filename with nogil: self.htsfile = hts_open(cfilename, 'r') assert self.htsfile != NULL - # read header - required for accurate positioning - # could a tell/seek work? + + if samfile.has_index(): + if samfile.index_filename: + cindexname = samfile.index_filename + with nogil: + self.index = sam_index_load2(self.htsfile, cfilename, cindexname) + else: + self.index = NULL + + # need to advance in newly opened file to position after header + # better: use seek/tell? 
with nogil: - self.header = sam_hdr_read(self.htsfile) - assert self.header != NULL + hdr = sam_hdr_read(self.htsfile) + if hdr is NULL: + raise IOError("unable to read header information") + self.header = makeAlignmentHeader(hdr) + self.owns_samfile = True + # options specific to CRAM files if samfile.is_cram and samfile.reference_filename: creference_filename = samfile.reference_filename @@ -1681,9 +1965,10 @@ cdef class IteratorRow: creference_filename) else: - self.htsfile = self.samfile.htsfile + self.htsfile = samfile.htsfile + self.index = samfile.index self.owns_samfile = False - self.header = self.samfile.header + self.header = samfile.header self.retval = 0 @@ -1693,7 +1978,7 @@ cdef class IteratorRow: bam_destroy1(self.b) if self.owns_samfile: hts_close(self.htsfile) - bam_hdr_destroy(self.header) + hts_idx_destroy(self.index) cdef class IteratorRowRegion(IteratorRow): @@ -1714,15 +1999,15 @@ cdef class IteratorRowRegion(IteratorRow): int tid, int beg, int stop, int multiple_iterators=False): - IteratorRow.__init__(self, samfile, - multiple_iterators=multiple_iterators) - if not samfile.has_index(): raise ValueError("no index available for iteration") + IteratorRow.__init__(self, samfile, + multiple_iterators=multiple_iterators) + with nogil: self.iter = sam_itr_queryi( - self.samfile.index, + self.index, tid, beg, stop) @@ -1744,15 +2029,17 @@ cdef class IteratorRowRegion(IteratorRow): def __next__(self): self.cnext() if self.retval >= 0: - return makeAlignedSegment(self.b, self.samfile) + return makeAlignedSegment(self.b, self.header) + elif self.retval == -1: + raise StopIteration elif self.retval == -2: # Note: it is currently not the case that hts_iter_next # returns -2 for a truncated file. 
# See https://github.com/pysam-developers/pysam/pull/50#issuecomment-64928625 raise IOError('truncated file') else: - raise StopIteration - + raise IOError("error while reading file {}: {}".format(self.samfile.filename, self.retval)) + def __dealloc__(self): hts_itr_destroy(self.iter) @@ -1769,7 +2056,9 @@ cdef class IteratorRowHead(IteratorRow): """ - def __init__(self, AlignmentFile samfile, int n, + def __init__(self, + AlignmentFile samfile, + int n, int multiple_iterators=False): IteratorRow.__init__(self, samfile, @@ -1787,9 +2076,10 @@ cdef class IteratorRowHead(IteratorRow): cdef int cnext(self): '''cversion of iterator. Used by IteratorColumn''' cdef int ret + cdef bam_hdr_t * hdr = self.header.ptr with nogil: ret = sam_read1(self.htsfile, - self.samfile.header, + hdr, self.b) return ret @@ -1800,7 +2090,7 @@ cdef class IteratorRowHead(IteratorRow): cdef int ret = self.cnext() if ret >= 0: self.current_row += 1 - return makeAlignedSegment(self.b, self.samfile) + return makeAlignedSegment(self.b, self.header) elif ret == -2: raise IOError('truncated file') else: @@ -1835,16 +2125,17 @@ cdef class IteratorRowAll(IteratorRow): cdef int cnext(self): '''cversion of iterator. 
Used by IteratorColumn''' cdef int ret + cdef bam_hdr_t * hdr = self.header.ptr with nogil: ret = sam_read1(self.htsfile, - self.samfile.header, + hdr, self.b) return ret def __next__(self): cdef int ret = self.cnext() if ret >= 0: - return makeAlignedSegment(self.b, self.samfile) + return makeAlignedSegment(self.b, self.header) elif ret == -2: raise IOError('truncated file') else: @@ -1905,7 +2196,7 @@ cdef class IteratorRowAllRefs(IteratorRow): # If current iterator is not exhausted, return aligned read if self.rowiter.retval > 0: - return makeAlignedSegment(self.rowiter.b, self.samfile) + return makeAlignedSegment(self.rowiter.b, self.header) self.tid += 1 @@ -1952,16 +2243,17 @@ cdef class IteratorRowSelection(IteratorRow): self.current_pos += 1 cdef int ret + cdef bam_hdr_t * hdr = self.header.ptr with nogil: ret = sam_read1(self.htsfile, - self.samfile.header, + hdr, self.b) return ret def __next__(self): cdef int ret = self.cnext() if ret >= 0: - return makeAlignedSegment(self.b, self.samfile) + return makeAlignedSegment(self.b, self.header) elif ret == -2: raise IOError('truncated file') else: @@ -1971,8 +2263,7 @@ cdef class IteratorRowSelection(IteratorRow): cdef int __advance_nofilter(void *data, bam1_t *b): '''advance without any read filtering. 
''' - cdef __iterdata * d - d = <__iterdata*>data + cdef __iterdata * d = <__iterdata*>data cdef int ret with nogil: ret = sam_itr_next(d.htsfile, d.iter, b) @@ -1980,95 +2271,86 @@ cdef int __advance_nofilter(void *data, bam1_t *b): cdef int __advance_all(void *data, bam1_t *b): - '''only use reads for pileup passing basic - filters: + '''only use reads for pileup passing basic filters such as BAM_FUNMAP, BAM_FSECONDARY, BAM_FQCFAIL, BAM_FDUP ''' - cdef __iterdata * d + cdef __iterdata * d = <__iterdata*>data cdef mask = BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP - d = <__iterdata*>data cdef int ret - with nogil: - ret = sam_itr_next(d.htsfile, d.iter, b) - while ret >= 0 and b.core.flag & mask: + while 1: with nogil: ret = sam_itr_next(d.htsfile, d.iter, b) + if ret < 0: + break + if b.core.flag & d.flag_filter: + continue + break return ret -cdef int __advance_snpcalls(void * data, bam1_t * b): +cdef int __advance_samtools(void * data, bam1_t * b): '''advance using same filter and read processing as in the samtools pileup. ''' - - # Note that this method requries acces to some - # functions in the samtools code base and is thus - # not htslib only. - # The functions accessed in samtools are: - # 1. bam_prob_realn - # 2. 
bam_cap_mapQ - cdef __iterdata * d - d = <__iterdata*>data - + cdef __iterdata * d = <__iterdata*>data cdef int ret - cdef int skip = 0 cdef int q - cdef int is_cns = 1 - cdef int is_nobaq = 0 - cdef int capQ_thres = 0 - with nogil: - ret = sam_itr_next(d.htsfile, d.iter, b) - - # reload sequence - if d.fastafile != NULL and b.core.tid != d.tid: - if d.seq != NULL: - free(d.seq) - d.tid = b.core.tid + while 1: with nogil: - d.seq = faidx_fetch_seq( - d.fastafile, - d.header.target_name[d.tid], - 0, MAX_POS, - &d.seq_len) - - if d.seq == NULL: - raise ValueError( - "reference sequence for '%s' (tid=%i) not found" % \ - (d.header.target_name[d.tid], - d.tid)) + ret = sam_itr_next(d.htsfile, d.iter, b) + if ret < 0: + break + if b.core.flag & d.flag_filter: + continue + if d.flag_require and not (b.core.flag & d.flag_require): + continue + + # reload sequence + if d.fastafile != NULL and b.core.tid != d.tid: + if d.seq != NULL: + free(d.seq) + d.tid = b.core.tid + with nogil: + d.seq = faidx_fetch_seq( + d.fastafile, + d.header.target_name[d.tid], + 0, MAX_POS, + &d.seq_len) - while ret >= 0: - skip = 0 + if d.seq == NULL: + raise ValueError( + "reference sequence for '{}' (tid={}) not found".format( + d.header.target_name[d.tid], d.tid)) # realign read - changes base qualities - if d.seq != NULL and is_cns and not is_nobaq: - # flag: - # apply_baq = flag&1, extend_baq = flag>>1&1, redo_baq = flag&4; - sam_prob_realn(b, d.seq, d.seq_len, 0) - - if d.seq != NULL and capQ_thres > 10: - q = sam_cap_mapq(b, d.seq, d.seq_len, capQ_thres) + if d.seq != NULL and d.compute_baq: + # 4th option to realign is flag: + # apply_baq = flag&1, extend_baq = flag&2, redo_baq = flag&4 + if d.redo_baq: + sam_prob_realn(b, d.seq, d.seq_len, 7) + else: + sam_prob_realn(b, d.seq, d.seq_len, 3) + + if d.seq != NULL and d.adjust_capq_threshold > 10: + q = sam_cap_mapq(b, d.seq, d.seq_len, d.adjust_capq_threshold) if q < 0: - skip = 1 + continue elif b.core.qual > q: b.core.qual = q - if 
b.core.flag & BAM_FUNMAP: - skip = 1 - elif b.core.flag & 1 and not b.core.flag & 2: - skip = 1 - - if not skip: - break - # additional filters - - with nogil: - ret = sam_itr_next(d.htsfile, d.iter, b) - + + if b.core.qual < d.min_mapping_quality: + continue + if d.ignore_orphans and b.core.flag & BAM_FPAIRED and not (b.core.flag & BAM_FPROPER_PAIR): + continue + + break + return ret + cdef class IteratorColumn: '''abstract base class for iterators over columns. @@ -2080,7 +2362,7 @@ cdef class IteratorColumn: consider the conversion to a list:: f = AlignmentFile("file.bam", "rb") - result = list( f.pileup() ) + result = list(f.pileup()) Here, ``result`` will contain ``n`` objects of type :class:`~pysam.PileupColumn` for ``n`` columns, but each object in @@ -2088,44 +2370,40 @@ cdef class IteratorColumn: The desired behaviour can be achieved by list comprehension:: - result = [ x.pileups() for x in f.pileup() ] + result = [x.pileups() for x in f.pileup()] ``result`` will be a list of ``n`` lists of objects of type :class:`~pysam.PileupRead`. - If the iterator is associated with a :class:`~pysam.Fastafile` using the - :meth:`addReference` method, then the iterator will export the - current sequence via the methods :meth:`getSequence` and - :meth:`seq_len`. - - Optional kwargs to the iterator: - - stepper - The stepper controls how the iterator advances. - - Valid values are None, "all" (default), "nofilter" or "samtools". - - See AlignmentFile.pileup for description. - - fastafile - A :class:`~pysam.FastaFile` object - - max_depth - maximum read depth. The default is 8000. + If the iterator is associated with a :class:`~pysam.Fastafile` + using the :meth:`add_reference` method, then the iterator will + export the current sequence via the methods :meth:`get_sequence` + and :meth:`seq_len`. + See :class:`~AlignmentFile.pileup` for kwargs to the iterator. 
''' - def __cinit__( self, AlignmentFile samfile, **kwargs ): + def __cinit__( self, AlignmentFile samfile, **kwargs): self.samfile = samfile self.fastafile = kwargs.get("fastafile", None) - self.stepper = kwargs.get("stepper", None) + self.stepper = kwargs.get("stepper", "samtools") self.max_depth = kwargs.get("max_depth", 8000) + self.ignore_overlaps = kwargs.get("ignore_overlaps", True) + self.min_base_quality = kwargs.get("min_base_quality", 13) self.iterdata.seq = NULL + self.iterdata.min_mapping_quality = kwargs.get("min_mapping_quality", 0) + self.iterdata.flag_require = kwargs.get("flag_require", 0) + self.iterdata.flag_filter = kwargs.get("flag_filter", BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP) + self.iterdata.adjust_capq_threshold = kwargs.get("adjust_capq_threshold", 0) + self.iterdata.compute_baq = kwargs.get("compute_baq", True) + self.iterdata.redo_baq = kwargs.get("redo_baq", False) + self.iterdata.ignore_orphans = kwargs.get("ignore_orphans", True) + self.tid = 0 self.pos = 0 self.n_plp = 0 self.plp = NULL - self.pileup_iter = NULL + self.pileup_iter = NULL def __iter__(self): return self @@ -2134,12 +2412,14 @@ cdef class IteratorColumn: '''perform next iteration. ''' # do not release gil here because of call-backs - self.plp = bam_plp_auto(self.pileup_iter, - &self.tid, - &self.pos, - &self.n_plp) + cdef int ret = bam_mplp_auto(self.pileup_iter, + &self.tid, + &self.pos, + &self.n_plp, + &self.plp) + return ret - cdef char * getSequence(self): + cdef char * get_sequence(self): '''return current reference sequence underlying the iterator. 
''' return self.iterdata.seq @@ -2149,7 +2429,7 @@ cdef class IteratorColumn: def __get__(self): return self.iterdata.seq_len - def addReference(self, FastaFile fastafile): + def add_reference(self, FastaFile fastafile): ''' add reference sequences in `fastafile` to iterator.''' self.fastafile = fastafile @@ -2158,25 +2438,16 @@ cdef class IteratorColumn: self.iterdata.tid = -1 self.iterdata.fastafile = self.fastafile.fastafile - def hasReference(self): + def has_reference(self): ''' return true if iterator is associated with a reference''' return self.fastafile - - cdef setMask(self, mask): - '''set masking flag in iterator. - - reads with bits set in `mask` will be skipped. - ''' - raise NotImplementedError() - # self.mask = mask - # bam_plp_set_mask( self.pileup_iter, self.mask ) - - cdef setupIteratorData( self, - int tid, - int start, - int stop, - int multiple_iterators=0 ): + + cdef _setup_iterator(self, + int tid, + int start, + int stop, + int multiple_iterators=0): '''setup the iterator structure''' self.iter = IteratorRowRegion(self.samfile, tid, start, stop, multiple_iterators) @@ -2184,7 +2455,7 @@ cdef class IteratorColumn: self.iterdata.iter = self.iter.iter self.iterdata.seq = NULL self.iterdata.tid = -1 - self.iterdata.header = self.samfile.header + self.iterdata.header = self.samfile.header.ptr if self.fastafile is not None: self.iterdata.fastafile = self.fastafile.fastafile @@ -2195,31 +2466,36 @@ cdef class IteratorColumn: # pileup_iter self._free_pileup_iter() + cdef void * data[1] + data[0] = &self.iterdata + if self.stepper is None or self.stepper == "all": with nogil: - self.pileup_iter = bam_plp_init( - &__advance_all, - &self.iterdata) + self.pileup_iter = bam_mplp_init(1, + &__advance_all, + data) elif self.stepper == "nofilter": with nogil: - self.pileup_iter = bam_plp_init( - &__advance_nofilter, - &self.iterdata) + self.pileup_iter = bam_mplp_init(1, + &__advance_nofilter, + data) elif self.stepper == "samtools": with nogil: - 
self.pileup_iter = bam_plp_init( - &__advance_snpcalls, - &self.iterdata) + self.pileup_iter = bam_mplp_init(1, + &__advance_samtools, + data) else: raise ValueError( "unknown stepper option `%s` in IteratorColumn" % self.stepper) if self.max_depth: with nogil: - bam_plp_set_maxcnt(self.pileup_iter, self.max_depth) - - # bam_plp_set_mask( self.pileup_iter, self.mask ) + bam_mplp_set_maxcnt(self.pileup_iter, self.max_depth) + if self.ignore_overlaps: + with nogil: + bam_mplp_init_overlaps(self.pileup_iter) + cdef reset(self, tid, start, stop): '''reset iterator position. @@ -2236,21 +2512,22 @@ cdef class IteratorColumn: self.iterdata.seq = NULL self.iterdata.tid = -1 - # self.pileup_iter = bam_plp_init( &__advancepileup, &self.iterdata ) + # self.pileup_iter = bam_mplp_init(1 + # &__advancepileup, + # &self.iterdata) with nogil: - bam_plp_reset(self.pileup_iter) - + bam_mplp_reset(self.pileup_iter) + cdef _free_pileup_iter(self): '''free the memory alloc'd by bam_plp_init. - This is needed before setupIteratorData allocates - another pileup_iter, or else memory will be lost. - ''' - if self.pileup_iter != NULL: + This is needed before setup_iterator allocates another + pileup_iter, or else memory will be lost. 
''' + if self.pileup_iter != NULL: with nogil: - bam_plp_reset(self.pileup_iter) - bam_plp_destroy(self.pileup_iter) - self.pileup_iter = NULL + bam_mplp_reset(self.pileup_iter) + bam_mplp_destroy(self.pileup_iter) + self.pileup_iter = NULL def __dealloc__(self): # reset in order to avoid memory leak messages for iterators @@ -2261,12 +2538,22 @@ cdef class IteratorColumn: if self.iterdata.seq != NULL: free(self.iterdata.seq) self.iterdata.seq = NULL + + # backwards compatibility + + def hasReference(self): + return self.has_reference() + cdef char * getSequence(self): + return self.get_sequence() + def addReference(self, FastaFile fastafile): + return self.add_reference(fastafile) - + cdef class IteratorColumnRegion(IteratorColumn): '''iterates over a region only. ''' - def __cinit__(self, AlignmentFile samfile, + def __cinit__(self, + AlignmentFile samfile, int tid = 0, int start = 0, int stop = MAX_POS, @@ -2274,30 +2561,36 @@ cdef class IteratorColumnRegion(IteratorColumn): **kwargs ): # initialize iterator - self.setupIteratorData(tid, start, stop, 1) + self._setup_iterator(tid, start, stop, 1) self.start = start self.stop = stop self.truncate = truncate def __next__(self): + cdef int n + while 1: - self.cnext() - if self.n_plp < 0: + n = self.cnext() + if n < 0: raise ValueError("error during iteration" ) - if self.plp == NULL: + if n == 0: raise StopIteration if self.truncate: - if self.start > self.pos: continue - if self.pos >= self.stop: raise StopIteration + if self.start > self.pos: + continue + if self.pos >= self.stop: + raise StopIteration return makePileupColumn(&self.plp, - self.tid, - self.pos, - self.n_plp, - self.samfile) + self.tid, + self.pos, + self.n_plp, + self.min_base_quality, + self.iterdata.seq, + self.samfile.header) cdef class IteratorColumnAllRefs(IteratorColumn): @@ -2313,30 +2606,33 @@ cdef class IteratorColumnAllRefs(IteratorColumn): raise StopIteration # initialize iterator - self.setupIteratorData(self.tid, 0, MAX_POS, 1) + 
self._setup_iterator(self.tid, 0, MAX_POS, 1) def __next__(self): + cdef int n while 1: - self.cnext() - - if self.n_plp < 0: - raise ValueError("error during iteration" ) + n = self.cnext() + if n < 0: + raise ValueError("error during iteration") + + # proceed to next reference or stop + if n == 0: + self.tid += 1 + if self.tid < self.samfile.nreferences: + self._setup_iterator(self.tid, 0, MAX_POS, 0) + else: + raise StopIteration + continue # return result, if within same reference - if self.plp != NULL: - return makePileupColumn(&self.plp, - self.tid, - self.pos, - self.n_plp, - self.samfile) - - # otherwise, proceed to next reference or stop - self.tid += 1 - if self.tid < self.samfile.nreferences: - self.setupIteratorData(self.tid, 0, MAX_POS, 0) - else: - raise StopIteration + return makePileupColumn(&self.plp, + self.tid, + self.pos, + self.n_plp, + self.min_base_quality, + self.iterdata.seq, + self.samfile.header) cdef class SNPCall: @@ -2428,8 +2724,8 @@ cdef class IndexedReads: # makes sure that samfile stays alive as long as this # object is alive. self.samfile = samfile - - assert samfile.is_bam, "can only IndexReads on bam files" + cdef bam_hdr_t * hdr = NULL + assert samfile.is_bam, "can only apply IndexReads on bam files" # multiple_iterators the file - note that this makes the iterator # slow and causes pileup to slow down significantly. @@ -2437,14 +2733,20 @@ cdef class IndexedReads: cfilename = samfile.filename with nogil: self.htsfile = hts_open(cfilename, 'r') - assert self.htsfile != NULL - # read header - required for accurate positioning + if self.htsfile == NULL: + raise OSError("unable to reopen htsfile") + + # need to advance in newly opened file to position after header + # better: use seek/tell? 
with nogil: - self.header = sam_hdr_read(self.htsfile) + hdr = sam_hdr_read(self.htsfile) + if hdr == NULL: + raise OSError("unable to read header information") + self.header = makeAlignmentHeader(hdr) self.owns_samfile = True else: self.htsfile = self.samfile.htsfile - self.header = self.samfile.header + self.header = samfile.header self.owns_samfile = False def build(self): @@ -2452,21 +2754,22 @@ cdef class IndexedReads: self.index = collections.defaultdict(list) - # this method will start indexing from the current file - # position if you decide + # this method will start indexing from the current file position cdef int ret = 1 cdef bam1_t * b = calloc(1, sizeof( bam1_t)) if b == NULL: - raise ValueError("could not allocate {} bytes".format(sizeof(bam1_t))) + raise MemoryError("could not allocate {} bytes".format(sizeof(bam1_t))) cdef uint64_t pos - + cdef bam_hdr_t * hdr = self.header.ptr + while ret > 0: with nogil: pos = bgzf_tell(hts_get_bgzfp(self.htsfile)) ret = sam_read1(self.htsfile, - self.samfile.header, + hdr, b) + if ret > 0: qname = charptr_to_str(pysam_bam_get_qname(b)) self.index[qname].append(pos) @@ -2500,10 +2803,3 @@ cdef class IndexedReads: def __dealloc__(self): if self.owns_samfile: hts_close(self.htsfile) - bam_hdr_destroy(self.header) - -__all__ = [ - "AlignmentFile", - "IteratorRow", - "IteratorColumn", - "IndexedReads"] diff --git a/pysam/libcbcf.pyx b/pysam/libcbcf.pyx index 67565f0..5087dff 100644 --- a/pysam/libcbcf.pyx +++ b/pysam/libcbcf.pyx @@ -222,6 +222,7 @@ cdef inline int is_gt_fmt(bcf_hdr_t *hdr, int fmt_id): cdef inline int bcf_genotype_count(bcf_hdr_t *hdr, bcf1_t *rec, int sample) except -1: + if sample < 0: raise ValueError('genotype is only valid as a format field') @@ -1168,7 +1169,8 @@ cdef inline bcf_sync_end(VariantRecord record): ref_len = 0 # Delete INFO/END if no alleles are present or if rlen is equal to len(ref) - if not record.ptr.n_allele or record.ptr.rlen == ref_len: + # Always keep END for symbolic alleles 
+ if not has_symbolic_allele(record) and (not record.ptr.n_allele or record.ptr.rlen == ref_len): # If INFO/END is not defined in the header, it doesn't exist in the record if end_id >= 0: info = bcf_get_info(hdr, record.ptr, b'END') @@ -1184,6 +1186,17 @@ cdef inline bcf_sync_end(VariantRecord record): bcf_info_set_value(record, b'END', record.ptr.pos + record.ptr.rlen) +cdef inline int has_symbolic_allele(VariantRecord record): + """Return index of first symbolic allele. 0 if no symbolic alleles.""" + + for i in range(1, record.ptr.n_allele): + alt = record.ptr.d.allele[i] + if alt[0] == b'<' and alt[len(alt) - 1] == b'>': + return i + + return 0 + + ######################################################################## ######################################################################## ## Variant Header objects @@ -3143,9 +3156,8 @@ cdef class VariantRecord(object): alleles = [r.d.allele[i] for i in range(r.n_allele)] alleles[0] = value else: - alleles = [value] + alleles = [value, ''] self.alleles = alleles - self.ptr.rlen = len(value) bcf_sync_end(self) @property @@ -3166,6 +3178,9 @@ cdef class VariantRecord(object): @alleles.setter def alleles(self, values): cdef bcf1_t *r = self.ptr + + # Cache rlen of symbolic alleles before call to bcf_update_alleles_str + cdef int rlen = r.rlen if bcf_unpack(r, BCF_UN_STR) < 0: raise ValueError('Error unpacking VariantRecord') @@ -3183,7 +3198,11 @@ cdef class VariantRecord(object): if bcf_update_alleles_str(self.header.ptr, r, value) < 0: raise ValueError('Error updating alleles') - self.ptr.rlen = len(values[0]) + # Reset rlen if alternate allele isn't symbolic, otherwise used cached + if has_symbolic_allele(self): + self.ptr.rlen = rlen + else: + self.ptr.rlen = len(values[0]) bcf_sync_end(self) @property diff --git a/pysam/libcbcftools.pxd b/pysam/libcbcftools.pxd index 7c8e632..f1c8d76 100644 --- a/pysam/libcbcftools.pxd +++ b/pysam/libcbcftools.pxd @@ -1,3 +1,9 @@ -cdef extern from "cbcftools_util.h": +cdef 
extern from "bcftools.pysam.h": int bcftools_main(int argc, char *argv[]) + void bcftools_set_stderr(int fd) + void bcftools_unset_stderr() + void bcftools_set_stdout(int fd) + void bcftools_set_stdout_fn(const char *) + void bcftools_unset_stdout() + void bcftools_set_optind(int) diff --git a/pysam/libcfaidx.pxd b/pysam/libcfaidx.pxd index 9ac09e6..53ad767 100644 --- a/pysam/libcfaidx.pxd +++ b/pysam/libcfaidx.pxd @@ -44,6 +44,7 @@ cdef class FastaFile: cdef class FastqProxy: cdef kseq_t * _delegate + cdef cython.str to_string(self) cdef cython.str tostring(self) cpdef array.array get_quality_array(self, int offset=*) @@ -53,6 +54,7 @@ cdef class FastxRecord: Python container for pysam.libcfaidx.FastqProxy with persistence. """ cdef public str comment, quality, sequence, name + cdef cython.str to_string(self) cdef cython.str tostring(self) cpdef array.array get_quality_array(self, int offset=*) diff --git a/pysam/libcfaidx.pyx b/pysam/libcfaidx.pyx index bf04217..ca2f518 100644 --- a/pysam/libcfaidx.pyx +++ b/pysam/libcfaidx.pyx @@ -63,7 +63,7 @@ from cpython cimport PyErr_SetString, \ from cpython.version cimport PY_MAJOR_VERSION from pysam.libchtslib cimport \ - faidx_nseq, fai_load, fai_destroy, fai_fetch, \ + faidx_nseq, fai_load, fai_load3, fai_destroy, fai_fetch, \ faidx_seq_len, faidx_iseq, faidx_seq_len, \ faidx_fetch_seq, hisremote, \ bgzf_open, bgzf_close @@ -99,6 +99,10 @@ cdef class FastaFile: Optional, filename of the index. By default this is the filename + ".fai". + filepath_index_compressed : string + Optional, filename of the index if fasta file is. By default this is + the filename + ".gzi". + Raises ------ @@ -128,7 +132,7 @@ cdef class FastaFile: return faidx_nseq(self.fastafile) - def _open(self, filename, filepath_index=None): + def _open(self, filename, filepath_index=None, filepath_index_compressed=None): '''open an indexed fasta file. This method expects an indexed fasta file. 
@@ -140,24 +144,43 @@ cdef class FastaFile: self._filename = encode_filename(filename) cdef char *cfilename = self._filename + cdef char *cindexname = NULL + cdef char *cindexname_compressed = NULL self.is_remote = hisremote(cfilename) - - if filepath_index is not None: - raise NotImplementedError( - "setting an explicit path for the index " - "is not implemented") - + # open file for reading if (self._filename != b"-" and not self.is_remote and not os.path.exists(filename)): raise IOError("file `%s` not found" % filename) - with nogil: - self.fastafile = fai_load(cfilename) + # 3 modes to open: + # compressed fa: fai_load3 with filename, index_fai and index_gzi + # uncompressed fa: fai_load3 with filename and index_fai + # uncompressed fa: fai_load with default index name + if filepath_index: + # when opening, set flags to 0 - do not automatically + # build index if it does not exist. + + if not os.path.exists(filepath_index): + raise IOError("filename {} does not exist".format(filepath_index)) + cindexname = bindex_filename = encode_filename(filepath_index) + + if filepath_index_compressed: + if not os.path.exists(filepath_index_compressed): + raise IOError("filename {} does not exist".format(filepath_index_compressed)) + cindexname_compressed = bindex_filename_compressed = encode_filename(filepath_index_compressed) + with nogil: + self.fastafile = fai_load3(cfilename, cindexname, cindexname_compressed, 0) + else: + with nogil: + self.fastafile = fai_load3(cfilename, cindexname, NULL, 0) + else: + with nogil: + self.fastafile = fai_load(cfilename) if self.fastafile == NULL: - raise IOError("could not open file `%s`" % filename) + raise IOError("error when opening file `%s`" % filename) cdef int nreferences = faidx_nseq(self.fastafile) cdef int x @@ -359,7 +382,7 @@ cdef class FastqProxy: else: return None - cdef cython.str tostring(self): + cdef cython.str to_string(self): if self.comment is None: comment = "" else: @@ -370,9 +393,13 @@ cdef class FastqProxy: 
else: return "@%s%s\n%s\n+\n%s" % (self.name, comment, self.sequence, self.quality) - + + cdef cython.str tostring(self): + """deprecated : use :meth:`to_string`""" + return self.to_string() + def __str__(self): - return self.tostring() + return self.to_string() cpdef array.array get_quality_array(self, int offset=33): '''return quality values as integer array after subtracting offset.''' @@ -411,7 +438,7 @@ cdef class FastxRecord: def __deepcopy__(self, memo): return FastxRecord(self.name, self.comment, self.sequence, self.quality) - cdef cython.str tostring(self): + cdef cython.str to_string(self): if self.name is None: raise ValueError("can not write record without name") @@ -428,6 +455,10 @@ cdef class FastxRecord: else: return "@%s%s\n%s\n+\n%s" % (self.name, comment, self.sequence, self.quality) + + cdef cython.str tostring(self): + """deprecated : use :meth:`to_string`""" + return self.to_string() def set_name(self, name): if name is None: @@ -452,7 +483,7 @@ cdef class FastxRecord: self.quality = None def __str__(self): - return self.tostring() + return self.to_string() cpdef array.array get_quality_array(self, int offset=33): '''return quality values as array after subtracting offset.''' @@ -484,8 +515,8 @@ cdef class FastxFile: If True (default) make a copy of the entry in the file during iteration. If set to False, no copy will be made. This will - permit faster iteration, but an entry will not persist when - the iteration continues or is not in-place modifyable. + permit much faster iteration, but an entry will not persist + when the iteration continues and an entry is read-only. 
Notes ----- diff --git a/pysam/libchtslib.pxd b/pysam/libchtslib.pxd index 7abd472..119dab2 100644 --- a/pysam/libchtslib.pxd +++ b/pysam/libchtslib.pxd @@ -424,7 +424,7 @@ cdef extern from "htslib/hts.h" nogil: no_compression, gzip, bgzf, custom compression_maximum - enum hts_fmt_option: + cdef enum hts_fmt_option: CRAM_OPT_DECODE_MD, CRAM_OPT_PREFIX, CRAM_OPT_VERBOSITY, @@ -472,6 +472,27 @@ cdef extern from "htslib/hts.h" nogil: int hts_verbose + cdef union hts_opt_val_union: + int i + char *s + + ctypedef struct hts_opt: + char *arg + hts_fmt_option opt + hts_opt_val_union val + void *next + + # @abstract Parses arg and appends it to the option list. + # @return 0 on success and -1 on failure + int hts_opt_add(hts_opt **opts, const char *c_arg) + + # @abstract Applies an hts_opt option list to a given htsFile. + # @return 0 on success and -1 on failure + int hts_opt_apply(htsFile *fp, hts_opt *opts) + + # @abstract Frees an hts_opt list. + void hts_opt_free(hts_opt *opts) + # @abstract Table for converting a nucleotide character to 4-bit encoding. # The input character may be either an IUPAC ambiguity code, '=' for 0, or # '0'/'1'/'2'/'3' for a result of 1/2/4/8. The result is encoded as 1/2/4/8 @@ -939,7 +960,8 @@ cdef extern from "htslib/sam.h" nogil: # 4. seq is nybble-encoded according to seq_nt16_table. 
ctypedef struct bam1_t: bam1_core_t core - int l_data, m_data + int l_data + uint32_t m_data uint8_t *data uint64_t id @@ -1203,7 +1225,12 @@ cdef extern from "htslib/sam.h" nogil: void bam_mplp_destroy(bam_mplp_t iter) void bam_mplp_set_maxcnt(bam_mplp_t iter, int maxcnt) int bam_mplp_auto(bam_mplp_t iter, int *_tid, int *_pos, int *n_plp, const bam_pileup1_t **plp) - + void bam_mplp_reset(bam_mplp_t iter) + void bam_mplp_constructor(bam_mplp_t iter, + int (*func)(void *data, const bam1_t *b, bam_pileup_cd *cd)) + void bam_mplp_destructor(bam_mplp_t iter, + int (*func)(void *data, const bam1_t *b, bam_pileup_cd *cd)) + # Added by AH # ctypedef bam_pileup1_t * const_bam_pileup1_t_ptr "const bam_pileup1_t *" diff --git a/pysam/libchtslib.pyx b/pysam/libchtslib.pyx index f6943ea..7096a99 100644 --- a/pysam/libchtslib.pyx +++ b/pysam/libchtslib.pyx @@ -562,30 +562,55 @@ cdef class HTSFile(object): with nogil: return hts_hopen(hfile, cfilename, cmode) + def add_hts_options(self, format_options=None): + """Given a list of key=value format option strings, add them to an open htsFile + """ + cdef int rval + cdef hts_opt *opts = NULL + + if format_options: + for format_option in format_options: + rval = hts_opt_add(&opts, format_option) + if rval != 0: + if opts != NULL: + hts_opt_free(opts) + raise RuntimeError('Invalid format option ({}) specified'.format(format_option)) + if opts != NULL: + rval = hts_opt_apply(self.htsfile, opts) + if rval != 0: + hts_opt_free(opts) + raise RuntimeError('An error occured while applying the requested format options') + hts_opt_free(opts) + def parse_region(self, contig=None, start=None, stop=None, region=None,tid=None, - reference=None, end=None): + reference=None, end=None): """parse alternative ways to specify a genomic region. A region can either be specified by :term:`contig`, `start` and `stop`. `start` and `stop` denote 0-based, half-open intervals. 
:term:`reference` and `end` are also accepted for - backward compatiblity as synonyms for :term:`contig` and `stop`, - respectively. + backward compatiblity as synonyms for :term:`contig` and + `stop`, respectively. Alternatively, a samtools :term:`region` string can be supplied. - If any of the coordinates are missing they will be replaced by the - minimum (`start`) or maximum (`stop`) coordinate. + If any of the coordinates are missing they will be replaced by + the minimum (`start`) or maximum (`stop`) coordinate. - Note that region strings are 1-based inclusive, while `start` and `stop` denote - an interval in 0-based, half-open coordinates (like BED files and Python slices). + Note that region strings are 1-based inclusive, while `start` + and `stop` denote an interval in 0-based, half-open + coordinates (like BED files and Python slices). + + If `contig` or `region` or are ``*``, unmapped reads at the end + of a BAM file will be returned. Setting either to ``.`` will + iterate from the beginning of the file. Returns ------- - tuple : a tuple of `flag`, :term:`tid`, `start` and `stop`. The - flag indicates whether no coordinates were supplied and the - genomic region is the complete genomic space. + tuple : a tuple of `flag`, :term:`tid`, `start` and + `stop`. The flag indicates whether no coordinates were + supplied and the genomic region is the complete genomic space. 
Raises ------ @@ -640,10 +665,15 @@ cdef class HTSFile(object): raise IndexError('invalid tid') rtid = tid else: - rtid = self.get_tid(contig) + if contig == "*": + rtid = HTS_IDX_NOCOOR + elif contig == ".": + rtid = HTS_IDX_START + else: + rtid = self.get_tid(contig) + if rtid < 0: + raise ValueError('invalid contig `%s`' % contig) - if rtid < 0: - raise ValueError('invalid contig `%s`' % contig) if rstart > rstop: raise ValueError('invalid coordinates: start (%i) > stop (%i)' % (rstart, rstop)) if not 0 <= rstart < MAX_POS: diff --git a/pysam/libcsamfile.pxd b/pysam/libcsamfile.pxd index de36998..dff1345 100644 --- a/pysam/libcsamfile.pxd +++ b/pysam/libcsamfile.pxd @@ -36,10 +36,10 @@ cdef extern from "htslib_util.h": uint8_t pysam_get_qual(bam1_t * b) uint8_t pysam_get_l_qname(bam1_t * b) uint16_t pysam_get_flag(bam1_t * b) - uint16_t pysam_get_n_cigar(bam1_t * b) + uint32_t pysam_get_n_cigar(bam1_t * b) void pysam_set_bin(bam1_t * b, uint16_t v) void pysam_set_qual(bam1_t * b, uint8_t v) void pysam_set_l_qname(bam1_t * b, uint8_t v) void pysam_set_flag(bam1_t * b, uint16_t v) - void pysam_set_n_cigar(bam1_t * b, uint16_t v) + void pysam_set_n_cigar(bam1_t * b, uint32_t v) void pysam_update_flag(bam1_t * b, uint16_t v, uint16_t flag) diff --git a/pysam/libcsamtools.pxd b/pysam/libcsamtools.pxd index 5fdc57f..ff797f8 100644 --- a/pysam/libcsamtools.pxd +++ b/pysam/libcsamtools.pxd @@ -1,3 +1,9 @@ -cdef extern from "csamtools_util.h": +cdef extern from "samtools.pysam.h": int samtools_main(int argc, char *argv[]) + void samtools_set_stderr(int fd) + void samtools_unset_stderr() + void samtools_set_stdout(int fd) + void samtools_set_stdout_fn(const char *) + void samtools_unset_stdout() + void samtools_set_optind(int) diff --git a/pysam/libctabix.pyx b/pysam/libctabix.pyx index 23e5832..10177ce 100644 --- a/pysam/libctabix.pyx +++ b/pysam/libctabix.pyx @@ -72,10 +72,11 @@ cimport pysam.libctabixproxies as ctabixproxies from pysam.libchtslib cimport htsFile, 
hts_open, hts_close, HTS_IDX_START,\ BGZF, bgzf_open, bgzf_dopen, bgzf_close, bgzf_write, \ - tbx_index_build2, tbx_index_load, tbx_itr_queryi, tbx_itr_querys, \ + tbx_index_build2, tbx_index_load2, tbx_itr_queryi, tbx_itr_querys, \ tbx_conf_t, tbx_seqnames, tbx_itr_next, tbx_itr_destroy, \ tbx_destroy, hisremote, region_list, hts_getline, \ - TBX_GENERIC, TBX_SAM, TBX_VCF, TBX_UCSC + TBX_GENERIC, TBX_SAM, TBX_VCF, TBX_UCSC, htsExactFormat, bcf, \ + bcf_index_build2 from pysam.libcutils cimport force_bytes, force_str, charptr_to_str from pysam.libcutils cimport encode_filename, from_string_and_size @@ -374,6 +375,7 @@ cdef class TabixFile: # open file cdef char *cfilename = self.filename + cdef char *cfilename_index = self.filename_index with nogil: self.htsfile = hts_open(cfilename, 'r') @@ -383,9 +385,8 @@ cdef class TabixFile: #if self.htsfile.format.category != region_list: # raise ValueError("file does not contain region data") - cfilename = self.filename_index with nogil: - self.index = tbx_index_load(cfilename) + self.index = tbx_index_load2(cfilename, cfilename_index) if self.index == NULL: raise IOError("could not open index for `%s`" % filename) @@ -534,6 +535,7 @@ cdef class TabixFile: def __get__(self): cdef char *cfilename = self.filename + cdef char *cfilename_index = self.filename_index cdef kstring_t buffer buffer.l = buffer.m = 0 @@ -550,7 +552,7 @@ cdef class TabixFile: raise OSError("could not open {} for reading header".format(self.filename)) with nogil: - tbx = tbx_index_load(cfilename) + tbx = tbx_index_load2(cfilename, cfilename_index) if tbx == NULL: raise OSError("could not load .tbi/.csi index of {}".format(self.filename)) @@ -887,6 +889,7 @@ def tabix_index(filename, int min_shift=-1, index=None, keep_original=False, + csi=False, ): '''index tab-separated *filename* using tabix. @@ -922,10 +925,14 @@ def tabix_index(filename, *index* controls the filename which should be used for creating the index. 
If not set, the default is to append ``.tbi`` to *filename*. + If *csi* is set, create a CSI index, the default is to create a + TBI index. + When automatically compressing files, if *keep_original* is set the uncompressed file will not be deleted. returns the filename of the compressed data + ''' if not os.path.exists(filename): @@ -939,15 +946,16 @@ def tabix_index(filename, if not is_gzip_file(filename): tabix_compress(filename, filename + ".gz", force=force) if not keep_original: - os.unlink( filename ) + os.unlink(filename) filename += ".gz" - index = index or filename + ".tbi" - - if not force and os.path.exists(index): - raise IOError( - "Filename '%s' already exists, use *force* to overwrite" % index) + fn = encode_filename(filename) + cdef char *cfn = fn + cdef htsFile *fp = hts_open(cfn, "r") + cdef htsExactFormat fmt = fp.format.format + hts_close(fp) + # columns (1-based): # preset-code, contig, start, end, metachar for # comments, lines to ignore at beginning @@ -959,8 +967,13 @@ def tabix_index(filename, 'sam' : (TBX_SAM, 3, 4, 0, ord('@'), 0), 'vcf' : (TBX_VCF, 1, 2, 0, ord('#'), 0), } - - if preset: + + conf_data = None + if preset == "bcf" or fmt == bcf: + csi = True + if min_shift == -1: + min_shift = 14 + elif preset: try: conf_data = preset2conf[preset] except KeyError: @@ -981,18 +994,35 @@ def tabix_index(filename, if zerobased: preset = preset | TBX_UCSC - conf_data = (preset, seq_col+1, start_col+1, end_col+1, ord(meta_char), line_skip) - - cdef tbx_conf_t conf - conf.preset, conf.sc, conf.bc, conf.ec, conf.meta_char, conf.line_skip = conf_data + conf_data = (preset, seq_col + 1, start_col + 1, end_col + 1, ord(meta_char), line_skip) + cdef tbx_conf_t conf + if conf_data: + conf.preset, conf.sc, conf.bc, conf.ec, conf.meta_char, conf.line_skip = conf_data - fn = encode_filename(filename) + if csi: + suffix = ".csi" + else: + suffix = ".tbi" + index = index or filename + suffix fn_index = encode_filename(index) - cdef char *cfn = fn + + if 
not force and os.path.exists(index): + raise IOError( + "filename '%s' already exists, use *force* to overwrite" % index) + cdef char *fnidx = fn_index - with nogil: - tbx_index_build2(cfn, fnidx, min_shift, &conf) + cdef int retval = 0 + + if csi and fmt == bcf: + with nogil: + retval = bcf_index_build2(cfn, fnidx, min_shift) + else: + with nogil: + retval = tbx_index_build2(cfn, fnidx, min_shift, &conf) + + if retval != 0: + raise OSError("building of index for {} failed".format(filename)) return filename @@ -1217,6 +1247,7 @@ class tabix_generic_iterator: # python version - required for python 2.7 def next(self): return self.__next__() + def tabix_iterator(infile, parser): """return an iterator over all entries in a file. diff --git a/pysam/libctabixproxies.pyx b/pysam/libctabixproxies.pyx index e2d7ef4..f95425a 100644 --- a/pysam/libctabixproxies.pyx +++ b/pysam/libctabixproxies.pyx @@ -451,7 +451,7 @@ cdef class GTFProxy(NamedTupleProxy): '''return max number of fields.''' return 9 - def as_dict(self): + def to_dict(self): """parse attributes - return as dict The dictionary can be modified to update attributes. 
@@ -461,7 +461,12 @@ cdef class GTFProxy(NamedTupleProxy): self.attributes) self.is_modified = True return self.attribute_dict - + + def as_dict(self): + """deprecated: use :meth:`to_dict` + """ + return self.to_dict() + def from_dict(self, d): '''set attributes from a dictionary.''' self.attribute_dict = None @@ -656,7 +661,7 @@ cdef class GTFProxy(NamedTupleProxy): # for backwards compatibility def asDict(self, *args, **kwargs): - return self.as_dict(*args, **kwargs) + return self.to_dict(*args, **kwargs) def fromDict(self, *args, **kwargs): return self.from_dict(*args, **kwargs) @@ -818,3 +823,11 @@ cdef class VCFProxy(NamedTupleProxy): idx, f = self.map_key2field[key] TupleProxy._setindex(self, idx, str(value)) + +__all__ = [ + "TupleProxy", + "NamedTupleProxy", + "GTFProxy", + "GFF3Proxy", + "BedProxy", + "VCFProxy"] diff --git a/pysam/libcutils.pxd b/pysam/libcutils.pxd index 479d337..f2d0aeb 100644 --- a/pysam/libcutils.pxd +++ b/pysam/libcutils.pxd @@ -26,13 +26,3 @@ cdef bytes force_bytes(object s, encoding=*) cdef bytes encode_filename(object filename) cdef from_string_and_size(const char *s, size_t length) -cdef extern from "pysam_util.h": - - void pysam_set_stderr(int fd) - void pysam_unset_stderr() - void pysam_set_stdout(int fd) - void pysam_set_stdout_fn(const char *) - void pysam_unset_stdout() - void set_optind(int) - extern int samtools_main(int argc, char *argv[]) - extern int bcftools_main(int argc, char *argv[]) diff --git a/pysam/libcutils.pyx b/pysam/libcutils.pyx index 3609c3b..66f9bf9 100644 --- a/pysam/libcutils.pyx +++ b/pysam/libcutils.pyx @@ -16,8 +16,11 @@ from libc.stdio cimport fprintf, stderr, fflush from libc.stdio cimport stdout as c_stdout from posix.fcntl cimport open as c_open, O_WRONLY -from libcbcftools cimport bcftools_main -from libcsamtools cimport samtools_main +from libcsamtools cimport samtools_main, samtools_set_stdout, samtools_set_stderr, \ + samtools_unset_stderr, samtools_unset_stdout, samtools_set_stdout_fn, 
samtools_set_optind + +from libcbcftools cimport bcftools_main, bcftools_set_stdout, bcftools_set_stderr, \ + bcftools_unset_stderr, bcftools_unset_stdout, bcftools_set_stdout_fn, bcftools_set_optind ##################################################################### # hard-coded constants @@ -167,17 +170,15 @@ cpdef parse_region(reference=None, region=None): """parse alternative ways to specify a genomic region. A region can either be specified by :term:`reference`, `start` and - `end`. `start` and `end` denote 0-based, half-open - intervals. + `end`. `start` and `end` denote 0-based, half-open intervals. - Alternatively, a samtools :term:`region` string can be - supplied. + Alternatively, a samtools :term:`region` string can be supplied. If any of the coordinates are missing they will be replaced by the minimum (`start`) or maximum (`end`) coordinate. - Note that region strings are 1-based, while `start` and `end` denote - an interval in python coordinates. + Note that region strings are 1-based, while `start` and `end` + denote an interval in python coordinates. Returns ------- @@ -191,11 +192,9 @@ cpdef parse_region(reference=None, for invalid or out of bounds regions. 
""" - cdef int rtid cdef long long rstart cdef long long rend - rtid = -1 rstart = 0 rend = MAX_POS if start != None: @@ -262,8 +261,9 @@ def _pysam_dispatch(collection, # redirect stderr to file stderr_h, stderr_f = tempfile.mkstemp() - pysam_set_stderr(stderr_h) - + samtools_set_stderr(stderr_h) + bcftools_set_stderr(stderr_h) + # redirect stdout to file if save_stdout: stdout_f = save_stdout @@ -272,8 +272,11 @@ def _pysam_dispatch(collection, if stdout_h == -1: raise IOError("error while opening {} for writing".format(stdout_f)) - pysam_set_stdout_fn(force_bytes(stdout_f)) - pysam_set_stdout(stdout_h) + samtools_set_stdout_fn(force_bytes(stdout_f)) + samtools_set_stdout(stdout_h) + bcftools_set_stdout_fn(force_bytes(stdout_f)) + bcftools_set_stdout(stdout_h) + elif catch_stdout: stdout_h, stdout_f = tempfile.mkstemp() MAP_STDOUT_OPTIONS = { @@ -299,12 +302,15 @@ def _pysam_dispatch(collection, if stdout_option is not None and not is_usage: os.close(stdout_h) - pysam_set_stdout_fn(force_bytes(stdout_f)) + samtools_set_stdout_fn(force_bytes(stdout_f)) + bcftools_set_stdout_fn(force_bytes(stdout_f)) args.extend(stdout_option.format(stdout_f).split(" ")) else: - pysam_set_stdout(stdout_h) + samtools_set_stdout(stdout_h) + bcftools_set_stdout(stdout_h) else: - pysam_set_stdout_fn("-") + samtools_set_stdout_fn("-") + bcftools_set_stdout_fn("-") # setup the function call to samtools/bcftools main cdef char ** cargs @@ -335,9 +341,11 @@ def _pysam_dispatch(collection, # between getopt and getopt_long if method in [b'index', b'cat', b'quickcheck', b'faidx', b'kprobaln']: - set_optind(1) + samtools_set_optind(1) + bcftools_set_optind(1) else: - set_optind(0) + samtools_set_optind(0) + bcftools_set_optind(0) # call samtools/bcftools if collection == b"samtools": @@ -363,18 +371,21 @@ def _pysam_dispatch(collection, os.remove(fn) return out - pysam_unset_stderr() - out_stderr = _collect(stderr_f) + samtools_unset_stderr() + bcftools_unset_stderr() + if save_stdout or 
catch_stdout: + samtools_unset_stdout() + bcftools_unset_stdout() + + out_stderr = _collect(stderr_f) if save_stdout: - pysam_unset_stdout() out_stdout = None elif catch_stdout: - pysam_unset_stdout() out_stdout = _collect(stdout_f) else: out_stdout = None - + return retval, out_stderr, out_stdout diff --git a/pysam/namedtuple.py b/pysam/namedtuple.py deleted file mode 100644 index a60fb1a..0000000 --- a/pysam/namedtuple.py +++ /dev/null @@ -1,117 +0,0 @@ -from operator import itemgetter as _itemgetter -from keyword import iskeyword as _iskeyword -import sys as _sys - -def namedtuple(typename, field_names, verbose=False, rename=False): - """Returns a new subclass of tuple with named fields. - - >>> Point = namedtuple('Point', 'x y') - >>> Point.__doc__ # docstring for the new class - 'Point(x, y)' - >>> p = Point(11, y=22) # instantiate with positional args or keywords - >>> p[0] + p[1] # indexable like a plain tuple - 33 - >>> x, y = p # unpack like a regular tuple - >>> x, y - (11, 22) - >>> p.x + p.y # fields also accessable by name - 33 - >>> d = p._asdict() # convert to a dictionary - >>> d['x'] - 11 - >>> Point(**d) # convert from a dictionary - Point(x=11, y=22) - >>> p._replace(x=100) # _replace() is like str.replace() but targets named fields - Point(x=100, y=22) - - """ - - # Parse and validate the field names. Validation serves two purposes, - # generating informative error messages and preventing template injection attacks. 
- if isinstance(field_names, basestring): - field_names = field_names.replace(',', ' ').split() # names separated by whitespace and/or commas - field_names = tuple(map(str, field_names)) - if rename: - names = list(field_names) - seen = set() - for i, name in enumerate(names): - if (not min(c.isalnum() or c=='_' for c in name) or _iskeyword(name) - or not name or name[0].isdigit() or name.startswith('_') - or name in seen): - names[i] = '_%d' % i - seen.add(name) - field_names = tuple(names) - for name in (typename,) + field_names: - if not min(c.isalnum() or c=='_' for c in name): - raise ValueError('Type names and field names can only contain alphanumeric characters and underscores: %r' % name) - if _iskeyword(name): - raise ValueError('Type names and field names cannot be a keyword: %r' % name) - if name[0].isdigit(): - raise ValueError('Type names and field names cannot start with a number: %r' % name) - seen_names = set() - for name in field_names: - if name.startswith('_') and not rename: - raise ValueError('Field names cannot start with an underscore: %r' % name) - if name in seen_names: - raise ValueError('Encountered duplicate field name: %r' % name) - seen_names.add(name) - - # Create and fill-in the class template - numfields = len(field_names) - argtxt = repr(field_names).replace("'", "")[1:-1] # tuple repr without parens or quotes - reprtxt = ', '.join('%s=%%r' % name for name in field_names) - template = '''class %(typename)s(tuple): - '%(typename)s(%(argtxt)s)' \n - __slots__ = () \n - _fields = %(field_names)r \n - def __new__(_cls, %(argtxt)s): - return _tuple.__new__(_cls, (%(argtxt)s)) \n - @classmethod - def _make(cls, iterable, new=tuple.__new__, len=len): - 'Make a new %(typename)s object from a sequence or iterable' - result = new(cls, iterable) - if len(result) != %(numfields)d: - raise TypeError('Expected %(numfields)d arguments, got %%d' %% len(result)) - return result \n - def __repr__(self): - return '%(typename)s(%(reprtxt)s)' %% self 
\n - def _asdict(self): - 'Return a new dict which maps field names to their values' - return dict(zip(self._fields, self)) \n - def _replace(_self, **kwds): - 'Return a new %(typename)s object replacing specified fields with new values' - result = _self._make(map(kwds.pop, %(field_names)r, _self)) - if kwds: - raise ValueError('Got unexpected field names: %%r' %% kwds.keys()) - return result \n - def __getnewargs__(self): - return tuple(self) \n\n''' % locals() - for i, name in enumerate(field_names): - template += ' %s = _property(_itemgetter(%d))\n' % (name, i) - if verbose: - print template - - # Execute the template string in a temporary namespace - namespace = dict(_itemgetter=_itemgetter, __name__='namedtuple_%s' % typename, - _property=property, _tuple=tuple) - try: - exec template in namespace - except SyntaxError, e: - raise SyntaxError(e.message + ':\n' + template) - result = namespace[typename] - - # For pickling to work, the __module__ variable needs to be set to the frame - # where the named tuple is created. Bypass this step in enviroments where - # sys._getframe is not defined (Jython for example) or sys._getframe is not - # defined for arguments greater than 0 (IronPython). 
- try: - result.__module__ = _sys._getframe(1).f_globals.get('__name__', '__main__') - except (AttributeError, ValueError): - pass - - return result - - - - - diff --git a/pysam/pysam_util.c b/pysam/pysam_util.c index 5940a35..349af44 100644 --- a/pysam/pysam_util.c +++ b/pysam/pysam_util.c @@ -2,72 +2,35 @@ #include #include #include - -/* #include "bam.h" */ -/* #include "bam_endian.h" */ +#include +#include #include "htslib/khash.h" #include "htslib/ksort.h" #include "htslib/knetfile.h" -#include "pysam_util.h" - - -FILE * pysam_stderr = NULL; -FILE * pysam_stdout = NULL; -const char * pysam_stdout_fn = NULL; -int PYSAM_STDOUT_FILENO = STDOUT_FILENO; - - -FILE * pysam_set_stderr(int fd) -{ - if (pysam_stderr != NULL) - fclose(pysam_stderr); - pysam_stderr = fdopen(fd, "w"); - return pysam_stderr; -} - -void pysam_unset_stderr(void) -{ - if (pysam_stderr != NULL) - fclose(pysam_stderr); - pysam_stderr = fopen("/dev/null", "w"); -} -FILE * pysam_set_stdout(int fd) +#if !(_POSIX_C_SOURCE >= 200809L || _XOPEN_SOURCE >= 700) +/* + * A rudimentary emulation of getline() for systems that dont support it + * natively. Since this is used for PPD file reading, it assumes (possibly + * falsely) that BUFSIZ is big enough. 
+ */ +ssize_t +getline(char **line, size_t *linelen, FILE *fp) { - if (pysam_stdout != NULL) - fclose(pysam_stdout); - pysam_stdout = fdopen(fd, "w"); - if (pysam_stdout == NULL) + if (*linelen == 0) { - fprintf(pysam_stderr, "could not set stdout to fd %i", fd); + *linelen = BUFSIZ; + *line = malloc(*linelen); } - PYSAM_STDOUT_FILENO = fd; - return pysam_stdout; -} -void pysam_set_stdout_fn(const char *fn) -{ - pysam_stdout_fn = fn; -} + memset(*line, 0, *linelen); + fgets(*line, *linelen, fp); -void pysam_unset_stdout(void) -{ - if (pysam_stdout != NULL) - fclose(pysam_stdout); - pysam_stdout = fopen("/dev/null", "w"); - PYSAM_STDOUT_FILENO = STDOUT_FILENO; -} + return (strlen(*line)); -void set_optind(int val) -{ - // setting this in cython via - // "from posix.unistd cimport optind" - // did not work. - // - // setting to 0 forces a complete re-initialization - optind = val; } +#endif diff --git a/pysam/pysam_util.h b/pysam/pysam_util.h index 8627d96..789e9d0 100644 --- a/pysam/pysam_util.h +++ b/pysam/pysam_util.h @@ -1,41 +1,5 @@ #ifndef PYSAM_UTIL_H #define PYSAM_UTIL_H -/*! set pysam standard error to point to file descriptor - - Setting the stderr will close the previous stderr. - */ -FILE * pysam_set_stderr(int fd); - -/*! set pysam standard output to point to file descriptor - - Setting the stderr will close the previous stdout. - */ -FILE * pysam_set_stdout(int fd); - -/*! set pysam standard output to point to filename - - */ -void pysam_set_stdout_fn(const char * fn); - -/*! set pysam standard error to /dev/null. - - Unsetting the stderr will close the previous stderr. - */ -void pysam_unset_stderr(void); - -/*! set pysam standard error to /dev/null. - - Unsetting the stderr will close the previous stderr. 
- */ -void pysam_unset_stdout(void); - -int pysam_dispatch(int argc, char *argv[]); - -void set_optind(int); - -extern int samtools_main(int argc, char *argv[]); - -extern int bcftools_main(int argc, char *argv[]); #endif diff --git a/pysam/samfile_util.c b/pysam/samfile_util.c deleted file mode 100644 index b6917ed..0000000 --- a/pysam/samfile_util.c +++ /dev/null @@ -1,16 +0,0 @@ -#include "samfile_util.h" -#include "htslib/sam.h" - -// taken from bam_md.c -// replace bam1_{qual,seq,cigar} with bam_get_{qual,seq,cigar} -// bam1_seqi -> bam_seqi -// bam_nt16_table -> seq_nt16_table - -#include -#include -#include - -char bam_nt16_nt4_table[] = { 4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4 }; - - - diff --git a/pysam/samfile_util.h b/pysam/samfile_util.h deleted file mode 100644 index 94ce096..0000000 --- a/pysam/samfile_util.h +++ /dev/null @@ -1,7 +0,0 @@ -#ifndef SAMFILE_UTIL_H -#define SAMFILE_UTIL_H - -#include "htslib/sam.h" - -#endif - diff --git a/pysam/tabix_util.c b/pysam/tabix_util.c deleted file mode 100644 index 319808a..0000000 --- a/pysam/tabix_util.c +++ /dev/null @@ -1,28 +0,0 @@ -#include -#include -#include -#include - -#if !(_POSIX_C_SOURCE >= 200809L || _XOPEN_SOURCE >= 700) -/* - * A rudimentary emulation of getline() for systems that dont support it - * natively. Since this is used for PPD file reading, it assumes (possibly - * falsely) that BUFSIZ is big enough. - */ -ssize_t -getline(char **line, size_t *linelen, FILE *fp) -{ - if (*linelen == 0) - { - *linelen = BUFSIZ; - *line = malloc(*linelen); - } - - memset(*line, 0, *linelen); - fgets(*line, *linelen, fp); - - return (strlen(*line)); - -} -#endif - diff --git a/pysam/tabix_util.h b/pysam/tabix_util.h deleted file mode 100644 index 65412ca..0000000 --- a/pysam/tabix_util.h +++ /dev/null @@ -1,12 +0,0 @@ -/* See issue 122 - On some MACOSX systems getline is not defined. 
- */ -#if !(_POSIX_C_SOURCE >= 200809L || _XOPEN_SOURCE >= 700) -#include "unistd.h" -ssize_t getline(char **line, size_t *linelen, FILE *fp); -#endif - - - - - diff --git a/pysam/utils.py b/pysam/utils.py index 239f5db..528c411 100644 --- a/pysam/utils.py +++ b/pysam/utils.py @@ -41,7 +41,7 @@ class PysamDispatcher(object): self.dispatch = dispatch self.parsers = parsers self.stderr = [] - + def __call__(self, *args, **kwargs): '''execute a samtools command. @@ -70,7 +70,7 @@ class PysamDispatcher(object): "%s returned with error %i: " "stdout=%s, stderr=%s" % (self.collection, - retval, + retval, stdout, stderr)) @@ -102,4 +102,3 @@ class PysamDispatcher(object): return stderr else: return stdout - diff --git a/pysam/version.py b/pysam/version.py index ab9aeaf..43da562 100644 --- a/pysam/version.py +++ b/pysam/version.py @@ -1,10 +1,10 @@ # pysam versioning information -__version__ = "0.13" +__version__ = "0.14" # TODO: upgrade number -__samtools_version__ = "1.6" +__samtools_version__ = "1.7" # TODO: upgrade code and number __bcftools_version__ = "1.6" -__htslib_version__ = "1.6" +__htslib_version__ = "1.7" diff --git a/run_tests_travis.sh b/run_tests_travis.sh index 2378fcd..b2659bc 100755 --- a/run_tests_travis.sh +++ b/run_tests_travis.sh @@ -38,7 +38,7 @@ conda config --add channels bioconda # pin versions, so that tests do not fail when pysam/htslib out of step # add htslib dependencies -conda install -y "samtools=1.6" "bcftools=1.6" "htslib=1.6" xz curl bzip2 +conda install -y "samtools=1.7" "bcftools=1.6" "htslib=1.7" xz curl bzip2 # Need to make C compiler and linker use the anaconda includes and libraries: export PREFIX=~/miniconda3/ diff --git a/samtools/LICENSE b/samtools/LICENSE new file mode 100644 index 0000000..aeaae3c --- /dev/null +++ b/samtools/LICENSE @@ -0,0 +1,33 @@ +The MIT/Expat License + +Copyright (C) 2008-2014 Genome Research Ltd. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. + + +[The use of a range of years within a copyright notice in this distribution +should be interpreted as being equivalent to a list of years including the +first and last year specified and all consecutive years between them. + +For example, a copyright notice that reads "Copyright (C) 2005, 2007-2009, +2011-2012" should be interpreted as being identical to a notice that reads +"Copyright (C) 2005, 2007, 2008, 2009, 2011, 2012" and a copyright notice +that reads "Copyright (C) 2005-2012" should be interpreted as being identical +to a notice that reads "Copyright (C) 2005, 2006, 2007, 2008, 2009, 2010, +2011, 2012".] 
diff --git a/samtools/README b/samtools/README new file mode 100644 index 0000000..7088f79 --- /dev/null +++ b/samtools/README @@ -0,0 +1,54 @@ +Samtools implements various utilities for post-processing alignments in the +SAM, BAM, and CRAM formats, including indexing, variant calling (in conjunction +with bcftools), and a simple alignment viewer. + + +Building samtools +================= + +The typical simple case of building Samtools using the HTSlib bundled within +this Samtools release tarball is done as follows: + + cd .../samtools-1.7 # Within the unpacked release directory + ./configure + make + +You may wish to copy the resulting samtools executable into somewhere on your +$PATH, or run it where it is. + +Rather than running-in-place like that, the next simplest typical case is to +install samtools etc properly into a directory of your choosing. Building for +installation using the HTSlib bundled within this Samtools release tarball, +and building the various HTSlib utilities such as bgzip is done as follows: + + cd .../samtools-1.7 # Within the unpacked release directory + ./configure --prefix=/path/to/location + make all all-htslib + make install install-htslib + +You will likely wish to add /path/to/location/bin to your $PATH. + +See INSTALL for full building and installation instructions and details. + + +Using an optimised zlib library +=============================== + +Samtools has been minimally tested against both the Intel-optimised and +CloudFlare-optimised zlibs and shown to work. + +They can be downloaded from: + + https://github.com/jtkukunas/zlib # Intel + https://github.com/cloudflare/zlib # CloudFlare + +Neither Samtools nor HTSlib needs recompiling to use these optimised libraries, +but the LD_LIBRARY_PATH environment variable should be set to a directory +containing the libz.so.1 file. 
+ +Benchmarks comparing the various zlibs are available at: + + http://www.htslib.org/benchmarks/zlib.html + +It is recommended that you perform your own rigorous tests for an entire +pipeline if you wish to switch to one of the optimised zlib implementations. diff --git a/samtools/bam.c.pysam.c b/samtools/bam.c.pysam.c index 188fe8c..982bf41 100644 --- a/samtools/bam.c.pysam.c +++ b/samtools/bam.c.pysam.c @@ -1,4 +1,4 @@ -#include "pysam.h" +#include "samtools.pysam.h" /* bam.c -- BAM format. @@ -51,7 +51,7 @@ int bam_view1(const bam_header_t *header, const bam1_t *b) char *s = bam_format1(header, b); int ret = -1; if (!s) return -1; - if (fputs(s, pysam_stdout) & fputc('\n', pysam_stdout) != EOF) ret = 0; + if (fputs(s, samtools_stdout) & fputc('\n', samtools_stdout) != EOF) ret = 0; free(s); return ret; } diff --git a/samtools/bam.h b/samtools/bam.h index 2120875..d4df937 100644 --- a/samtools/bam.h +++ b/samtools/bam.h @@ -38,7 +38,7 @@ DEALINGS IN THE SOFTWARE. */ @copyright Genome Research Ltd. */ -#define BAM_VERSION "1.6" +#define BAM_VERSION "1.7" #include #include diff --git a/samtools/bam2bcf.c.pysam.c b/samtools/bam2bcf.c.pysam.c index 3e3e01c..3bd623d 100644 --- a/samtools/bam2bcf.c.pysam.c +++ b/samtools/bam2bcf.c.pysam.c @@ -1,4 +1,4 @@ -#include "pysam.h" +#include "samtools.pysam.h" /* bam2bcf.c -- variant calling. 
@@ -110,7 +110,7 @@ static int get_position(const bam_pileup1_t *p, int *len) if ( cig==BAM_CHARD_CLIP ) continue; if ( cig==BAM_CPAD ) continue; if ( cig==BAM_CREF_SKIP ) continue; - fprintf(pysam_stderr,"todo: cigar %d\n", cig); + fprintf(samtools_stderr,"todo: cigar %d\n", cig); assert(0); } *len = n_tot_bases; @@ -481,7 +481,7 @@ void calc_SegBias(const bcf_callret1_t *bcr, bcf_call_t *call) double sum = 0; const double log2 = log(2.0); - // fprintf(pysam_stderr,"M=%.1f p=%e q=%e f=%f dp=%d\n",M,p,q,f,avg_dp); + // fprintf(samtools_stderr,"M=%.1f p=%e q=%e f=%f dp=%d\n",M,p,q,f,avg_dp); int i; for (i=0; in; i++) { @@ -496,7 +496,7 @@ void calc_SegBias(const bcf_callret1_t *bcr, bcf_call_t *call) else tmp = log(2*f*(1-f)*exp(-q) + f*f*exp(-2*q) + (1-f)*(1-f)) + p; sum += tmp; - // fprintf(pysam_stderr,"oi=%d %e\n", oi,tmp); + // fprintf(samtools_stderr,"oi=%d %e\n", oi,tmp); } call->seg_bias = sum; } @@ -660,7 +660,7 @@ int bcf_call_combine(int n, const bcf_callret1_t *calls, bcf_callaux_t *bca, int } } -// if (ref_base < 0) fprintf(pysam_stderr, "%d,%d,%f,%d\n", call->n_alleles, x, sum_min, call->unseen); +// if (ref_base < 0) fprintf(samtools_stderr, "%d,%d,%f,%d\n", call->n_alleles, x, sum_min, call->unseen); call->shift = (int)(sum_min + .499); } // combine annotations diff --git a/samtools/bam2bcf_indel.c.pysam.c b/samtools/bam2bcf_indel.c.pysam.c index fcbc90f..8241d37 100644 --- a/samtools/bam2bcf_indel.c.pysam.c +++ b/samtools/bam2bcf_indel.c.pysam.c @@ -1,4 +1,4 @@ -#include "pysam.h" +#include "samtools.pysam.h" /* bam2bcf_indel.c -- indel caller. @@ -227,7 +227,7 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla free(aux); // TODO revisit how/whether to control printing this warning if (hts_verbose >= 2) - fprintf(pysam_stderr, "[%s] excessive INDEL alleles at position %d. Skip the position.\n", __func__, pos + 1); + fprintf(samtools_stderr, "[%s] excessive INDEL alleles at position %d. 
Skip the position.\n", __func__, pos + 1); return -1; } types = (int*)calloc(n_types, sizeof(int)); @@ -300,7 +300,7 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla if ((double)(max2&0xffff) / ((max2&0xffff) + (max2>>16)) >= 0.7) max2_i = -1; if (max_i >= 0) r[max_i] = 15; if (max2_i >= 0) r[max2_i] = 15; - //for (i = 0; i < right - left; ++i) fputc("=ACMGRSVTWYHKDBN"[(int)r[i]], pysam_stderr); fputc('\n', pysam_stderr); + //for (i = 0; i < right - left; ++i) fputc("=ACMGRSVTWYHKDBN"[(int)r[i]], samtools_stderr); fputc('\n', samtools_stderr); } free(ref0); free(cns); } @@ -368,7 +368,7 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla else if (types[t] > 0) ir = est_indelreg(pos, ref, types[t], &inscns[t*max_ins]); else ir = est_indelreg(pos, ref, -types[t], 0); if (ir > bca->indelreg) bca->indelreg = ir; -// fprintf(pysam_stderr, "%d, %d, %d\n", pos, types[t], ir); +// fprintf(samtools_stderr, "%d, %d, %d\n", pos, types[t], ir); // realignment for (s = K = 0; s < n; ++s) { // write ref2 @@ -430,11 +430,11 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla } /* for (l = 0; l < tend - tbeg + abs(types[t]); ++l) - fputc("ACGTN"[(int)ref2[tbeg-left+l]], pysam_stderr); - fputc('\n', pysam_stderr); - for (l = 0; l < qend - qbeg; ++l) fputc("ACGTN"[(int)query[l]], pysam_stderr); - fputc('\n', pysam_stderr); - fprintf(pysam_stderr, "pos=%d type=%d read=%d:%d name=%s qbeg=%d tbeg=%d score=%d\n", pos, types[t], s, i, bam1_qname(p->b), qbeg, tbeg, sc); + fputc("ACGTN"[(int)ref2[tbeg-left+l]], samtools_stderr); + fputc('\n', samtools_stderr); + for (l = 0; l < qend - qbeg; ++l) fputc("ACGTN"[(int)query[l]], samtools_stderr); + fputc('\n', samtools_stderr); + fprintf(samtools_stderr, "pos=%d type=%d read=%d:%d name=%s qbeg=%d tbeg=%d score=%d\n", pos, types[t], s, i, bam1_qname(p->b), qbeg, tbeg, sc); */ } } @@ -493,7 +493,7 @@ int bcf_call_gap_prep(int n, int *n_plp, 
bam_pileup1_t **plp, int pos, bcf_calla if (seqQ > 255) seqQ = 255; p->aux = (sc[0]&0x3f)<<16 | seqQ<<8 | indelQ; // use 22 bits in total sumq[sc[0]&0x3f] += indelQ < seqQ? indelQ : seqQ; -// fprintf(pysam_stderr, "pos=%d read=%d:%d name=%s call=%d indelQ=%d seqQ=%d\n", pos, s, i, bam1_qname(p->b), types[sc[0]&0x3f], indelQ, seqQ); +// fprintf(samtools_stderr, "pos=%d read=%d:%d name=%s call=%d indelQ=%d seqQ=%d\n", pos, s, i, bam1_qname(p->b), types[sc[0]&0x3f], indelQ, seqQ); } } // determine bca->indel_types[] and bca->inscns @@ -525,7 +525,7 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla if (x == bca->indel_types[j]) break; p->aux = j<<16 | (j == 4? 0 : (p->aux&0xffff)); if ((p->aux>>16&0x3f) > 0) ++n_alt; - //fprintf(pysam_stderr, "X pos=%d read=%d:%d name=%s call=%d type=%d seqQ=%d indelQ=%d\n", pos, s, i, bam1_qname(p->b), (p->aux>>16)&0x3f, bca->indel_types[(p->aux>>16)&0x3f], (p->aux>>8)&0xff, p->aux&0xff); + //fprintf(samtools_stderr, "X pos=%d read=%d:%d name=%s call=%d type=%d seqQ=%d indelQ=%d\n", pos, s, i, bam1_qname(p->b), (p->aux>>16)&0x3f, bca->indel_types[(p->aux>>16)&0x3f], (p->aux>>8)&0xff, p->aux&0xff); } } diff --git a/samtools/bam2depth.c.pysam.c b/samtools/bam2depth.c.pysam.c index 4d9110b..ebe60d5 100644 --- a/samtools/bam2depth.c.pysam.c +++ b/samtools/bam2depth.c.pysam.c @@ -1,4 +1,4 @@ -#include "pysam.h" +#include "samtools.pysam.h" /* bam2depth.c -- depth subcommand. @@ -75,26 +75,26 @@ static int read_bam(void *data, bam1_t *b) // read level filters better go here int read_file_list(const char *file_list,int *n,char **argv[]); static int usage() { - fprintf(pysam_stderr, "\n"); - fprintf(pysam_stderr, "Usage: samtools depth [options] in1.bam [in2.bam [...]]\n"); - fprintf(pysam_stderr, "Options:\n"); - fprintf(pysam_stderr, " -a output all positions (including zero depth)\n"); - fprintf(pysam_stderr, " -a -a (or -aa) output absolutely all positions, including unused ref. 
sequences\n"); - fprintf(pysam_stderr, " -b list of positions or regions\n"); - fprintf(pysam_stderr, " -f list of input BAM filenames, one per line [null]\n"); - fprintf(pysam_stderr, " -l read length threshold (ignore reads shorter than ) [0]\n"); - fprintf(pysam_stderr, " -d/-m maximum coverage depth [8000]\n"); // the htslib's default - fprintf(pysam_stderr, " -q base quality threshold [0]\n"); - fprintf(pysam_stderr, " -Q mapping quality threshold [0]\n"); - fprintf(pysam_stderr, " -r region\n"); + fprintf(samtools_stderr, "\n"); + fprintf(samtools_stderr, "Usage: samtools depth [options] in1.bam [in2.bam [...]]\n"); + fprintf(samtools_stderr, "Options:\n"); + fprintf(samtools_stderr, " -a output all positions (including zero depth)\n"); + fprintf(samtools_stderr, " -a -a (or -aa) output absolutely all positions, including unused ref. sequences\n"); + fprintf(samtools_stderr, " -b list of positions or regions\n"); + fprintf(samtools_stderr, " -f list of input BAM filenames, one per line [null]\n"); + fprintf(samtools_stderr, " -l read length threshold (ignore reads shorter than ) [0]\n"); + fprintf(samtools_stderr, " -d/-m maximum coverage depth [8000]\n"); // the htslib's default + fprintf(samtools_stderr, " -q base quality threshold [0]\n"); + fprintf(samtools_stderr, " -Q mapping quality threshold [0]\n"); + fprintf(samtools_stderr, " -r region\n"); - sam_global_opt_help(pysam_stderr, "-.--.-"); + sam_global_opt_help(samtools_stderr, "-.--.-"); - fprintf(pysam_stderr, "\n"); - fprintf(pysam_stderr, "The output is a simple tab-separated table with three columns: reference name,\n"); - fprintf(pysam_stderr, "position, and coverage depth. 
Note that positions with zero coverage may be\n"); - fprintf(pysam_stderr, "omitted by default; see the -a option.\n"); - fprintf(pysam_stderr, "\n"); + fprintf(samtools_stderr, "\n"); + fprintf(samtools_stderr, "The output is a simple tab-separated table with three columns: reference name,\n"); + fprintf(samtools_stderr, "position, and coverage depth. Note that positions with zero coverage may be\n"); + fprintf(samtools_stderr, "omitted by default; see the -a option.\n"); + fprintf(samtools_stderr, "\n"); return 1; } @@ -164,18 +164,18 @@ int main_depth(int argc, char *argv[]) rf = SAM_FLAG | SAM_RNAME | SAM_POS | SAM_MAPQ | SAM_CIGAR | SAM_SEQ; if (baseQ) rf |= SAM_QUAL; if (hts_set_opt(data[i]->fp, CRAM_OPT_REQUIRED_FIELDS, rf)) { - fprintf(pysam_stderr, "Failed to set CRAM_OPT_REQUIRED_FIELDS value\n"); + fprintf(samtools_stderr, "Failed to set CRAM_OPT_REQUIRED_FIELDS value\n"); return 1; } if (hts_set_opt(data[i]->fp, CRAM_OPT_DECODE_MD, 0)) { - fprintf(pysam_stderr, "Failed to set CRAM_OPT_DECODE_MD value\n"); + fprintf(samtools_stderr, "Failed to set CRAM_OPT_DECODE_MD value\n"); return 1; } data[i]->min_mapQ = mapQ; // set the mapQ filter data[i]->min_len = min_len; // set the qlen filter data[i]->hdr = sam_hdr_read(data[i]->fp); // read the BAM header if (data[i]->hdr == NULL) { - fprintf(pysam_stderr, "Couldn't read header for \"%s\"\n", + fprintf(samtools_stderr, "Couldn't read header for \"%s\"\n", argv[optind+i]); status = EXIT_FAILURE; goto depth_end; @@ -221,10 +221,10 @@ int main_depth(int argc, char *argv[]) // Horribly inefficient, but the bed API is an obfuscated black box. 
if (bed && bed_overlap(bed, h->target_name[last_tid], last_pos, last_pos + 1) == 0) continue; - fputs(h->target_name[last_tid], pysam_stdout); fprintf(pysam_stdout, "\t%d", last_pos+1); + fputs(h->target_name[last_tid], samtools_stdout); fprintf(samtools_stdout, "\t%d", last_pos+1); for (i = 0; i < n; i++) - fputc('\t', pysam_stdout), fputc('0', pysam_stdout); - fputc('\n', pysam_stdout); + fputc('\t', samtools_stdout), fputc('0', samtools_stdout); + fputc('\n', samtools_stdout); } } last_tid++; @@ -238,17 +238,17 @@ int main_depth(int argc, char *argv[]) if (last_pos < beg) continue; // out of range; skip if (bed && bed_overlap(bed, h->target_name[tid], last_pos, last_pos + 1) == 0) continue; - fputs(h->target_name[tid], pysam_stdout); fprintf(pysam_stdout, "\t%d", last_pos+1); + fputs(h->target_name[tid], samtools_stdout); fprintf(samtools_stdout, "\t%d", last_pos+1); for (i = 0; i < n; i++) - fputc('\t', pysam_stdout), fputc('0', pysam_stdout); - fputc('\n', pysam_stdout); + fputc('\t', samtools_stdout), fputc('0', samtools_stdout); + fputc('\n', samtools_stdout); } last_tid = tid; last_pos = pos; } if (bed && bed_overlap(bed, h->target_name[tid], pos, pos + 1) == 0) continue; - fputs(h->target_name[tid], pysam_stdout); fprintf(pysam_stdout, "\t%d", pos+1); // a customized fprintf(pysam_stdout, ) would be faster + fputs(h->target_name[tid], samtools_stdout); fprintf(samtools_stdout, "\t%d", pos+1); // a customized fprintf(samtools_stdout, ) would be faster for (i = 0; i < n; ++i) { // base level filters have to go here int j, m = 0; for (j = 0; j < n_plp[i]; ++j) { @@ -256,9 +256,9 @@ int main_depth(int argc, char *argv[]) if (p->is_del || p->is_refskip) ++m; // having dels or refskips at tid:pos else if (bam_get_qual(p->b)[p->qpos] < baseQ) ++m; // low base quality } - fprintf(pysam_stdout, "\t%d", n_plp[i] - m); // this the depth to output + fprintf(samtools_stdout, "\t%d", n_plp[i] - m); // this the depth to output } - fputc('\n', pysam_stdout); + fputc('\n', 
samtools_stdout); } if (ret < 0) status = EXIT_FAILURE; free(n_plp); free(plp); @@ -275,10 +275,10 @@ int main_depth(int argc, char *argv[]) if (last_pos >= end) break; if (bed && bed_overlap(bed, h->target_name[last_tid], last_pos, last_pos + 1) == 0) continue; - fputs(h->target_name[last_tid], pysam_stdout); fprintf(pysam_stdout, "\t%d", last_pos+1); + fputs(h->target_name[last_tid], samtools_stdout); fprintf(samtools_stdout, "\t%d", last_pos+1); for (i = 0; i < n; i++) - fputc('\t', pysam_stdout), fputc('0', pysam_stdout); - fputc('\n', pysam_stdout); + fputc('\t', samtools_stdout), fputc('0', samtools_stdout); + fputc('\n', samtools_stdout); } last_tid++; last_pos = -1; diff --git a/samtools/bam_addrprg.c.pysam.c b/samtools/bam_addrprg.c.pysam.c index 56986dd..6d65ccb 100644 --- a/samtools/bam_addrprg.c.pysam.c +++ b/samtools/bam_addrprg.c.pysam.c @@ -1,4 +1,4 @@ -#include "pysam.h" +#include "samtools.pysam.h" /* bam_addrprg.c -- samtools command to add or replace readgroups. @@ -100,7 +100,7 @@ static char* basic_unescape(const char* in) if (*in == '\\') { ++in; if (*in == '\0') { - fprintf(pysam_stderr, "[%s] Unterminated escape sequence.\n", __func__); + fprintf(samtools_stderr, "[%s] Unterminated escape sequence.\n", __func__); free(out); return NULL; } @@ -112,11 +112,11 @@ static char* basic_unescape(const char* in) *ptr = '\t'; break; case 'n': - fprintf(pysam_stderr, "[%s] \\n in escape sequence is not supported.\n", __func__); + fprintf(samtools_stderr, "[%s] \\n in escape sequence is not supported.\n", __func__); free(out); return NULL; default: - fprintf(pysam_stderr, "[%s] Unsupported escape sequence.\n", __func__); + fprintf(samtools_stderr, "[%s] Unsupported escape sequence.\n", __func__); free(out); return NULL; } @@ -220,7 +220,7 @@ static void usage(FILE *fp) "\n" "Options:\n" " -m MODE Set the mode of operation from one of overwrite_all, orphan_only [overwrite_all]\n" - " -o FILE Where to write output to [pysam_stdout]\n" + " -o FILE Where to 
write output to [samtools_stdout]\n" " -r STRING @RG line text\n" " -R STRING ID of @RG line in existing header to use\n" ); @@ -232,11 +232,11 @@ static bool parse_args(int argc, char** argv, parsed_opts_t** opts) *opts = NULL; int n; - if (argc == 1) { usage(pysam_stdout); return true; } + if (argc == 1) { usage(samtools_stdout); return true; } parsed_opts_t* retval = calloc(1, sizeof(parsed_opts_t)); if (! retval ) { - fprintf(pysam_stderr, "[%s] Out of memory allocating parsed_opts_t\n", __func__); + fprintf(samtools_stderr, "[%s] Out of memory allocating parsed_opts_t\n", __func__); return false; } // Set defaults @@ -270,7 +270,7 @@ static bool parse_args(int argc, char** argv, parsed_opts_t** opts) } else if (strcmp(optarg, "orphan_only") == 0) { retval->mode = orphan_only; } else { - usage(pysam_stderr); + usage(samtools_stderr); return false; } break; @@ -279,17 +279,17 @@ static bool parse_args(int argc, char** argv, parsed_opts_t** opts) retval->output_name = strdup(optarg); break; case 'h': - usage(pysam_stdout); + usage(samtools_stdout); free(retval); return true; case '?': - usage(pysam_stderr); + usage(samtools_stderr); free(retval); return false; case 'O': default: if (parse_sam_global_opt(n, optarg, lopts, &retval->ga) == 0) break; - usage(pysam_stderr); + usage(samtools_stderr); free(retval); return false; } @@ -297,13 +297,13 @@ static bool parse_args(int argc, char** argv, parsed_opts_t** opts) retval->rg_line = ks_release(&rg_line); if (argc-optind < 1) { - fprintf(pysam_stderr, "You must specify an input file.\n"); - usage(pysam_stderr); + fprintf(samtools_stderr, "You must specify an input file.\n"); + usage(samtools_stderr); cleanup_opts(retval); return false; } if (retval->rg_id && retval->rg_line) { - fprintf(pysam_stderr, "The options -r and -R are mutually exclusive.\n"); + fprintf(samtools_stderr, "The options -r and -R are mutually exclusive.\n"); cleanup_opts(retval); return false; } @@ -313,7 +313,7 @@ static bool parse_args(int 
argc, char** argv, parsed_opts_t** opts) char* tmp = basic_unescape(retval->rg_line); if ((retval->rg_id = get_rg_id(tmp)) == NULL) { - fprintf(pysam_stderr, "[%s] The supplied RG line lacks an ID tag.\n", __func__); + fprintf(samtools_stderr, "[%s] The supplied RG line lacks an ID tag.\n", __func__); free(tmp); cleanup_opts(retval); return false; @@ -324,7 +324,7 @@ static bool parse_args(int argc, char** argv, parsed_opts_t** opts) if (retval->ga.nthreads > 0) { if (!(retval->p.pool = hts_tpool_init(retval->ga.nthreads))) { - fprintf(pysam_stderr, "Error creating thread pool\n"); + fprintf(samtools_stderr, "Error creating thread pool\n"); return false; } } @@ -362,7 +362,7 @@ static void orphan_only_func(const state_t* state, bam1_t* file_read) static bool init(const parsed_opts_t* opts, state_t** state_out) { state_t* retval = (state_t*) calloc(1, sizeof(state_t)); if (retval == NULL) { - fprintf(pysam_stderr, "[init] Out of memory allocating state struct.\n"); + fprintf(samtools_stderr, "[init] Out of memory allocating state struct.\n"); return false; } *state_out = retval; @@ -392,14 +392,14 @@ static bool init(const parsed_opts_t* opts, state_t** state_out) { // Append new RG line to header. // Check does not already exist if ( confirm_rg(retval->output_header, opts->rg_id) ) { - fprintf(pysam_stderr, "[init] ID of new RG line specified conflicts with that of an existing header RG line. Overwrite not yet implemented.\n"); + fprintf(samtools_stderr, "[init] ID of new RG line specified conflicts with that of an existing header RG line. 
Overwrite not yet implemented.\n"); return false; } retval->rg_id = strdup(opts->rg_id); size_t new_len = strlen( retval->output_header->text ) + strlen( opts->rg_line ) + 2; char* new_header = malloc(new_len); if (!new_header) { - fprintf(pysam_stderr, "[init] Out of memory whilst writing new header.\n"); + fprintf(samtools_stderr, "[init] Out of memory whilst writing new header.\n"); return false; } sprintf(new_header,"%s%s\n", retval->output_header->text, opts->rg_line); @@ -410,13 +410,13 @@ static bool init(const parsed_opts_t* opts, state_t** state_out) { if (opts->rg_id) { // Confirm what has been supplied exists if ( !confirm_rg(retval->output_header, opts->rg_id) ) { - fprintf(pysam_stderr, "RG ID supplied does not exist in header. Supply full @RG line with -r instead?\n"); + fprintf(samtools_stderr, "RG ID supplied does not exist in header. Supply full @RG line with -r instead?\n"); return false; } retval->rg_id = strdup(opts->rg_id); } else { if ((retval->rg_id = get_first_rgid(retval->output_header)) == NULL ) { - fprintf(pysam_stderr, "No RG specified on command line or in existing header.\n"); + fprintf(samtools_stderr, "No RG specified on command line or in existing header.\n"); return false; } } diff --git a/samtools/bam_aux.c.pysam.c b/samtools/bam_aux.c.pysam.c index c6bd0aa..637e766 100644 --- a/samtools/bam_aux.c.pysam.c +++ b/samtools/bam_aux.c.pysam.c @@ -1,4 +1,4 @@ -#include "pysam.h" +#include "samtools.pysam.h" /* bam_aux.c -- remaining aux field handling. diff --git a/samtools/bam_cat.c.pysam.c b/samtools/bam_cat.c.pysam.c index 20adbc1..4cf5540 100644 --- a/samtools/bam_cat.c.pysam.c +++ b/samtools/bam_cat.c.pysam.c @@ -1,4 +1,4 @@ -#include "pysam.h" +#include "samtools.pysam.h" /* bam_cat.c -- efficiently concatenates bam files. 
@@ -210,7 +210,7 @@ static bam_hdr_t *cram_cat_check_hdr(int nfn, char * const *fn, const bam_hdr_t int vmin = cram_minor_vers(in_c); if ((vers_maj != -1 && vers_maj != vmaj) || (vers_min != -1 && vers_min != vmin)) { - fprintf(pysam_stderr, "[%s] ERROR: input files have differing version numbers.\n", + fprintf(samtools_stderr, "[%s] ERROR: input files have differing version numbers.\n", __func__); return NULL; } @@ -230,7 +230,7 @@ static bam_hdr_t *cram_cat_check_hdr(int nfn, char * const *fn, const bam_hdr_t int added; new_rg = hash_s2i_inc(*rg2id, rg2id_in->id[ki], rg2id_in->line[ki], &added); - //fprintf(pysam_stderr, "RG %s: #%d -> #%d\n", + //fprintf(samtools_stderr, "RG %s: #%d -> #%d\n", // rg2id_in->id[ki], ki, new_rg); if (added) { @@ -246,7 +246,7 @@ static bam_hdr_t *cram_cat_check_hdr(int nfn, char * const *fn, const bam_hdr_t } if (new_rg != ki && rg2id_in->n_id > 1) { - fprintf(pysam_stderr, "[%s] ERROR: Same size @RG lists but differing order / contents\n", + fprintf(samtools_stderr, "[%s] ERROR: Same size @RG lists but differing order / contents\n", __func__); return NULL; } @@ -313,7 +313,7 @@ int cram_cat(int nfn, char * const *fn, const bam_hdr_t *h, const char* outcram) } out_c = out->fp.cram; cram_set_option(out_c, CRAM_OPT_VERSION, vers); - //fprintf(pysam_stderr, "Creating cram vers %s\n", vers); + //fprintf(samtools_stderr, "Creating cram vers %s\n", vers); cram_fd_set_header(out_c, sam_hdr_parse_(new_h->text, new_h->l_text)); // needed? if (sam_hdr_write(out, new_h) < 0) { @@ -374,7 +374,7 @@ int cram_cat(int nfn, char * const *fn, const bam_hdr_t *h, const char* outcram) // we need to edit the compression header. IF WE CAN. 
if (new_rg) { int zero = 0; - //fprintf(pysam_stderr, "Transcode RG %d to %d\n", 0, new_rg); + //fprintf(samtools_stderr, "Transcode RG %d to %d\n", 0, new_rg); cram_transcode_rg(in_c, out_c, c, 1, &zero, &new_rg); } else { int32_t num_slices; @@ -429,7 +429,7 @@ int bam_cat(int nfn, char * const *fn, const bam_hdr_t *h, const char* outbam) const int es=BGZF_EMPTY_BLOCK_SIZE; int i; - fp = strcmp(outbam, "-")? bgzf_open(outbam, "w") : bgzf_fdopen(fileno(pysam_stdout), "w"); + fp = strcmp(outbam, "-")? bgzf_open(outbam, "w") : bgzf_fdopen(fileno(samtools_stdout), "w"); if (fp == 0) { print_error_errno("cat", "fail to open output file '%s'", outbam); return -1; @@ -443,7 +443,7 @@ int bam_cat(int nfn, char * const *fn, const bam_hdr_t *h, const char* outbam) buf = (uint8_t*) malloc(BUF_SIZE); if (!buf) { - fprintf(pysam_stderr, "[%s] Couldn't allocate buffer\n", __func__); + fprintf(samtools_stderr, "[%s] Couldn't allocate buffer\n", __func__); goto fail; } for(i = 0; i < nfn; ++i){ @@ -459,7 +459,7 @@ int bam_cat(int nfn, char * const *fn, const bam_hdr_t *h, const char* outbam) old = bam_hdr_read(in); if (old == NULL) { - fprintf(pysam_stderr, "[%s] ERROR: couldn't read header for '%s'.\n", + fprintf(samtools_stderr, "[%s] ERROR: couldn't read header for '%s'.\n", __func__, fn[i]); goto fail; } @@ -480,7 +480,7 @@ int bam_cat(int nfn, char * const *fn, const bam_hdr_t *h, const char* outbam) if(len [... ]\n"); - fprintf(pysam_stderr, " samtools cat [options] [... ]\n\n"); - fprintf(pysam_stderr, "Concatenate BAM or CRAM files, first those in , then those\non the command line.\n\n"); - fprintf(pysam_stderr, "Options: -b FILE list of input BAM/CRAM file names, one per line\n"); - fprintf(pysam_stderr, " -h FILE copy the header from FILE [default is 1st input file]\n"); - fprintf(pysam_stderr, " -o FILE output BAM/CRAM\n"); + fprintf(samtools_stderr, "Usage: samtools cat [options] [... ]\n"); + fprintf(samtools_stderr, " samtools cat [options] [... 
]\n\n"); + fprintf(samtools_stderr, "Concatenate BAM or CRAM files, first those in , then those\non the command line.\n\n"); + fprintf(samtools_stderr, "Options: -b FILE list of input BAM/CRAM file names, one per line\n"); + fprintf(samtools_stderr, " -h FILE copy the header from FILE [default is 1st input file]\n"); + fprintf(samtools_stderr, " -o FILE output BAM/CRAM\n"); return 1; } @@ -618,7 +618,7 @@ int main_cat(int argc, char *argv[]) default: sam_close(in); - fprintf(pysam_stderr, "[%s] ERROR: input is not BAM or CRAM\n", __func__); + fprintf(samtools_stderr, "[%s] ERROR: input is not BAM or CRAM\n", __func__); return 1; } diff --git a/samtools/bam_color.c.pysam.c b/samtools/bam_color.c.pysam.c index 6bd12c4..762e83b 100644 --- a/samtools/bam_color.c.pysam.c +++ b/samtools/bam_color.c.pysam.c @@ -1,4 +1,4 @@ -#include "pysam.h" +#include "samtools.pysam.h" /* bam_color.c -- color-space support. diff --git a/samtools/bam_flags.c.pysam.c b/samtools/bam_flags.c.pysam.c index 4895f9a..9c6424f 100644 --- a/samtools/bam_flags.c.pysam.c +++ b/samtools/bam_flags.c.pysam.c @@ -1,4 +1,4 @@ -#include "pysam.h" +#include "samtools.pysam.h" /* bam_flags.c -- flags subcommand. @@ -37,24 +37,24 @@ DEALINGS IN THE SOFTWARE. */ static void usage(void) { - fprintf(pysam_stderr, "\n"); - fprintf(pysam_stderr, "About: Convert between textual and numeric flag representation\n"); - fprintf(pysam_stderr, "Usage: samtools flags INT|STR[,...]\n"); - fprintf(pysam_stderr, "\n"); - fprintf(pysam_stderr, "Flags:\n"); - fprintf(pysam_stderr, "\t0x%x\tPAIRED .. paired-end (or multiple-segment) sequencing technology\n", BAM_FPAIRED); - fprintf(pysam_stderr, "\t0x%x\tPROPER_PAIR .. each segment properly aligned according to the aligner\n", BAM_FPROPER_PAIR); - fprintf(pysam_stderr, "\t0x%x\tUNMAP .. segment unmapped\n", BAM_FUNMAP); - fprintf(pysam_stderr, "\t0x%x\tMUNMAP .. next segment in the template unmapped\n", BAM_FMUNMAP); - fprintf(pysam_stderr, "\t0x%x\tREVERSE .. 
SEQ is reverse complemented\n", BAM_FREVERSE); - fprintf(pysam_stderr, "\t0x%x\tMREVERSE .. SEQ of the next segment in the template is reversed\n", BAM_FMREVERSE); - fprintf(pysam_stderr, "\t0x%x\tREAD1 .. the first segment in the template\n", BAM_FREAD1); - fprintf(pysam_stderr, "\t0x%x\tREAD2 .. the last segment in the template\n", BAM_FREAD2); - fprintf(pysam_stderr, "\t0x%x\tSECONDARY .. secondary alignment\n", BAM_FSECONDARY); - fprintf(pysam_stderr, "\t0x%x\tQCFAIL .. not passing quality controls\n", BAM_FQCFAIL); - fprintf(pysam_stderr, "\t0x%x\tDUP .. PCR or optical duplicate\n", BAM_FDUP); - fprintf(pysam_stderr, "\t0x%x\tSUPPLEMENTARY .. supplementary alignment\n", BAM_FSUPPLEMENTARY); - fprintf(pysam_stderr, "\n"); + fprintf(samtools_stderr, "\n"); + fprintf(samtools_stderr, "About: Convert between textual and numeric flag representation\n"); + fprintf(samtools_stderr, "Usage: samtools flags INT|STR[,...]\n"); + fprintf(samtools_stderr, "\n"); + fprintf(samtools_stderr, "Flags:\n"); + fprintf(samtools_stderr, "\t0x%x\tPAIRED .. paired-end (or multiple-segment) sequencing technology\n", BAM_FPAIRED); + fprintf(samtools_stderr, "\t0x%x\tPROPER_PAIR .. each segment properly aligned according to the aligner\n", BAM_FPROPER_PAIR); + fprintf(samtools_stderr, "\t0x%x\tUNMAP .. segment unmapped\n", BAM_FUNMAP); + fprintf(samtools_stderr, "\t0x%x\tMUNMAP .. next segment in the template unmapped\n", BAM_FMUNMAP); + fprintf(samtools_stderr, "\t0x%x\tREVERSE .. SEQ is reverse complemented\n", BAM_FREVERSE); + fprintf(samtools_stderr, "\t0x%x\tMREVERSE .. SEQ of the next segment in the template is reversed\n", BAM_FMREVERSE); + fprintf(samtools_stderr, "\t0x%x\tREAD1 .. the first segment in the template\n", BAM_FREAD1); + fprintf(samtools_stderr, "\t0x%x\tREAD2 .. the last segment in the template\n", BAM_FREAD2); + fprintf(samtools_stderr, "\t0x%x\tSECONDARY .. secondary alignment\n", BAM_FSECONDARY); + fprintf(samtools_stderr, "\t0x%x\tQCFAIL .. 
not passing quality controls\n", BAM_FQCFAIL); + fprintf(samtools_stderr, "\t0x%x\tDUP .. PCR or optical duplicate\n", BAM_FDUP); + fprintf(samtools_stderr, "\t0x%x\tSUPPLEMENTARY .. supplementary alignment\n", BAM_FSUPPLEMENTARY); + fprintf(samtools_stderr, "\n"); } @@ -64,8 +64,8 @@ int main_flags(int argc, char *argv[]) else { int mask = bam_str2flag(argv[1]); - if ( mask<0 ) { fprintf(pysam_stderr,"Error: Could not parse \"%s\"\n", argv[1]); usage(); return 1; } - fprintf(pysam_stdout, "0x%x\t%d\t%s\n", mask, mask, bam_flag2str(mask)); + if ( mask<0 ) { fprintf(samtools_stderr,"Error: Could not parse \"%s\"\n", argv[1]); usage(); return 1; } + fprintf(samtools_stdout, "0x%x\t%d\t%s\n", mask, mask, bam_flag2str(mask)); } return 0; } diff --git a/samtools/bam_import.c.pysam.c b/samtools/bam_import.c.pysam.c index 3b5dd4a..f0a9730 100644 --- a/samtools/bam_import.c.pysam.c +++ b/samtools/bam_import.c.pysam.c @@ -1,4 +1,4 @@ -#include "pysam.h" +#include "samtools.pysam.h" /* bam_import.c -- SAM format parsing. @@ -62,6 +62,6 @@ bam_header_t *sam_header_read2(const char *fn) free(str->s); free(str); header = sam_hdr_parse(samstr.l, samstr.s? samstr.s : ""); free(samstr.s); - fprintf(pysam_stderr, "[sam_header_read2] %d sequences loaded.\n", n_targets); + fprintf(samtools_stderr, "[sam_header_read2] %d sequences loaded.\n", n_targets); return header; } diff --git a/samtools/bam_index.c.pysam.c b/samtools/bam_index.c.pysam.c index a91ee76..e13d453 100644 --- a/samtools/bam_index.c.pysam.c +++ b/samtools/bam_index.c.pysam.c @@ -1,4 +1,4 @@ -#include "pysam.h" +#include "samtools.pysam.h" /* bam_index.c -- index and idxstats subcommands. 
@@ -66,12 +66,12 @@ int bam_index(int argc, char *argv[]) case 'm': csi = 1; min_shift = atoi(optarg); break; case '@': n_threads = atoi(optarg); break; default: - index_usage(pysam_stderr); + index_usage(samtools_stderr); return 1; } if (optind == argc) { - index_usage(pysam_stdout); + index_usage(samtools_stdout); return 1; } @@ -110,7 +110,7 @@ int bam_idxstats(int argc, char *argv[]) samFile* fp; if (argc < 2) { - fprintf(pysam_stderr, "Usage: samtools idxstats \n"); + fprintf(samtools_stderr, "Usage: samtools idxstats \n"); return 1; } fp = sam_open(argv[1], "r"); @@ -132,14 +132,14 @@ int bam_idxstats(int argc, char *argv[]) int i; for (i = 0; i < header->n_targets; ++i) { // Print out contig name and length - fprintf(pysam_stdout, "%s\t%d", header->target_name[i], header->target_len[i]); + fprintf(samtools_stdout, "%s\t%d", header->target_name[i], header->target_len[i]); // Now fetch info about it from the meta bin uint64_t u, v; hts_idx_get_stat(idx, i, &u, &v); - fprintf(pysam_stdout, "\t%" PRIu64 "\t%" PRIu64 "\n", u, v); + fprintf(samtools_stdout, "\t%" PRIu64 "\t%" PRIu64 "\n", u, v); } // Dump information about unmapped reads - fprintf(pysam_stdout, "*\t0\t0\t%" PRIu64 "\n", hts_idx_get_n_no_coor(idx)); + fprintf(samtools_stdout, "*\t0\t0\t%" PRIu64 "\n", hts_idx_get_n_no_coor(idx)); bam_hdr_destroy(header); hts_idx_destroy(idx); sam_close(fp); diff --git a/samtools/bam_lpileup.c.pysam.c b/samtools/bam_lpileup.c.pysam.c index 93fde4f..8a1555c 100644 --- a/samtools/bam_lpileup.c.pysam.c +++ b/samtools/bam_lpileup.c.pysam.c @@ -1,4 +1,4 @@ -#include "pysam.h" +#include "samtools.pysam.h" /* bam_lpileup.c -- lplbuf routines. 
@@ -182,14 +182,14 @@ static int tview_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl } tv->n_pre = l; /* - fprintf(pysam_stderr, "%d\t", pos+1); + fprintf(samtools_stderr, "%d\t", pos+1); for (i = 0; i < n; ++i) { const bam_pileup1_t *p = pl + i; - if (p->is_head) fprintf(pysam_stderr, "^"); - if (p->is_tail) fprintf(pysam_stderr, "$"); - fprintf(pysam_stderr, "%d,", p->level); + if (p->is_head) fprintf(samtools_stderr, "^"); + if (p->is_tail) fprintf(samtools_stderr, "$"); + fprintf(samtools_stderr, "%d,", p->level); } - fprintf(pysam_stderr, "\n"); + fprintf(samtools_stderr, "\n"); */ return 0; } diff --git a/samtools/bam_markdup.c b/samtools/bam_markdup.c index cf6a82a..21bf90a 100644 --- a/samtools/bam_markdup.c +++ b/samtools/bam_markdup.c @@ -29,15 +29,18 @@ DEALINGS IN THE SOFTWARE #include #include #include -#include #include #include +#include +#include #include "htslib/thread_pool.h" #include "htslib/sam.h" #include "sam_opts.h" #include "samtools.h" #include "htslib/khash.h" #include "htslib/klist.h" +#include "htslib/kstring.h" +#include "tmp_file.h" typedef struct { int32_t single; @@ -126,6 +129,7 @@ static int key_equal(key_data_t a, key_data_t b) { KHASH_INIT(reads, key_data_t, in_hash_t, 1, hash_key, key_equal) // read map hash KLIST_INIT(read_queue, read_queue_t, __free_queue_element) // the reads buffer +KHASH_MAP_INIT_STR(duplicates, int) // map of duplicates for supplementary dup id /* Calculate the mate's unclipped start based on position and cigar string from MC tag. */ @@ -438,24 +442,53 @@ static void make_single_key(key_data_t *key, bam1_t *bam) { key->orientation = orientation; } +/* Add the duplicate name to a hash if it does not exist. 
*/ + +static int add_duplicate(khash_t(duplicates) *d_hash, bam1_t *dupe) { + khiter_t d; + int ret; + + d = kh_get(duplicates, d_hash, bam_get_qname(dupe)); + + if (d == kh_end(d_hash)) { + d = kh_put(duplicates, d_hash, strdup(bam_get_qname(dupe)), &ret); + + if (ret > 0) { + kh_value(d_hash, d) = 1; + } else if (ret == 0) { + kh_value(d_hash, d)++; + } else { + fprintf(stderr, "[markdup] error: unable to store supplementary duplicates.\n"); + return 1; + } + } + + return 0; +} + /* Compare the reads near each other (coordinate sorted) and try to spot the duplicates. Generally the highest quality scoring is chosen as the original and all others the duplicates. The score is based on the sum of the quality values (<= 15) of the read and its mate (if any). While single reads are compared to only one read of a pair, the pair will chosen as the original. - The comparison is done on position and orientation, see above for details. */ + The comparison is done on position and orientation, see above for details. + + Marking the supplementary reads of a duplicate as also duplicates takes an extra file read/write + step. 
This is because the duplicate can occur before the primary read.*/ -static int bam_mark_duplicates(samFile *in, samFile *out, int remove_dups, int32_t max_length, int do_stats) { +static int bam_mark_duplicates(samFile *in, samFile *out, char *prefix, int remove_dups, int32_t max_length, int do_stats, int supp, int tag) { bam_hdr_t *header; khiter_t k; khash_t(reads) *pair_hash = kh_init(reads); khash_t(reads) *single_hash = kh_init(reads); klist_t(read_queue) *read_buffer = kl_init(read_queue); kliter_t(read_queue) *rq; + khash_t(duplicates) *dup_hash = kh_init(duplicates); int32_t prev_tid, prev_coord; read_queue_t *in_read; int ret; int reading, writing, excluded, duplicate, single, pair, single_dup, examined; + tmp_file_t temp; if ((header = sam_hdr_read(in)) == NULL) { fprintf(stderr, "[markdup] error reading header\n"); @@ -489,6 +522,13 @@ static int bam_mark_duplicates(samFile *in, samFile *out, int remove_dups, int32 // get the buffer going in_read = kl_pushp(read_queue, read_buffer); + // handling supplementary reads needs a temporary file + if (supp) { + if (tmp_file_open_write(&temp, prefix, 1)) { + fprintf(stderr, "[markdup] error: unable to open tmp file %s.\n", prefix); + return 1; + } + } if ((in_read->b = bam_init1()) == NULL) { fprintf(stderr, "[markdup] error: unable to allocate memory for alignment.\n"); @@ -519,6 +559,7 @@ static int bam_mark_duplicates(samFile *in, samFile *out, int remove_dups, int32 if (!(in_read->b->core.flag & (BAM_FSECONDARY | BAM_FSUPPLEMENTARY | BAM_FUNMAP | BAM_FQCFAIL))) { examined++; + // look at the pairs first if ((in_read->b->core.flag & BAM_FPAIRED) && !(in_read->b->core.flag & BAM_FMUNMAP)) { int ret, mate_tmp; @@ -557,6 +598,21 @@ static int bam_mark_duplicates(samFile *in, samFile *out, int remove_dups, int32 bp->p = in_read->b; dup->core.flag |= BAM_FDUP; single_dup++; + + if (tag) { + if (bam_aux_append(dup, "do", 'Z', strlen(bam_get_qname(bp->p)) + 1, (uint8_t*)bam_get_qname(bp->p))) { + fprintf(stderr, 
"[markdup] error: unable to append 'do' tag.\n"); + return 1; + } + } + + if (supp) { + if (bam_aux_get(dup, "SA") || (dup->core.flag & BAM_FMUNMAP)) { + if (add_duplicate(dup_hash, dup)) { + return 1; + } + } + } } } else { fprintf(stderr, "[markdup] error: single hashing failure.\n"); @@ -611,6 +667,22 @@ static int bam_mark_duplicates(samFile *in, samFile *out, int remove_dups, int32 dup->core.flag |= BAM_FDUP; + if (tag) { + if (bam_aux_append(dup, "do", 'Z', strlen(bam_get_qname(bp->p)) + 1, (uint8_t*)bam_get_qname(bp->p))) { + fprintf(stderr, "[markdup] error: unable to append 'do' tag.\n"); + return 1; + } + + } + + if (supp) { + if (bam_aux_get(dup, "SA") || (dup->core.flag & BAM_FMUNMAP)) { + if (add_duplicate(dup_hash, dup)) { + return 1; + } + } + } + duplicate++; } else { fprintf(stderr, "[markdup] error: pair hashing failure.\n"); @@ -637,6 +709,22 @@ static int bam_mark_duplicates(samFile *in, samFile *out, int remove_dups, int32 if ((bp->p->core.flag & BAM_FPAIRED) && !(bp->p->core.flag & BAM_FMUNMAP)) { // if matched against one of a pair just mark as duplicate + + if (tag) { + if (bam_aux_append(in_read->b, "do", 'Z', strlen(bam_get_qname(bp->p)) + 1, (uint8_t*)bam_get_qname(bp->p))) { + fprintf(stderr, "[markdup] error: unable to append 'do' tag.\n"); + return 1; + } + } + + if (supp) { + if (bam_aux_get(in_read->b, "SA") || (in_read->b->core.flag & BAM_FMUNMAP)) { + if (add_duplicate(dup_hash, in_read->b)) { + return 1; + } + } + } + in_read->b->core.flag |= BAM_FDUP; } else { int64_t old_score, new_score; @@ -655,6 +743,21 @@ static int bam_mark_duplicates(samFile *in, samFile *out, int remove_dups, int32 } dup->core.flag |= BAM_FDUP; + + if (tag) { + if (bam_aux_append(dup, "do", 'Z', strlen(bam_get_qname(bp->p)) + 1, (uint8_t*)bam_get_qname(bp->p))) { + fprintf(stderr, "[markdup] error: unable to append 'do' tag.\n"); + return 1; + } + } + + if (supp) { + if (bam_aux_get(dup, "SA") || (dup->core.flag & BAM_FMUNMAP)) { + if 
(add_duplicate(dup_hash, dup)) { + return 1; + } + } + } } single_dup++; @@ -680,9 +783,16 @@ static int bam_mark_duplicates(samFile *in, samFile *out, int remove_dups, int32 } if (!remove_dups || !(in_read->b->core.flag & BAM_FDUP)) { - if (sam_write1(out, header, in_read->b) < 0) { - fprintf(stderr, "[markdup] error: writing output failed.\n"); - return 1; + if (supp) { + if (tmp_file_write(&temp, in_read->b)) { + fprintf(stderr, "[markdup] error: writing temp output failed.\n"); + return 1; + } + } else { + if (sam_write1(out, header, in_read->b) < 0) { + fprintf(stderr, "[markdup] error: writing output failed.\n"); + return 1; + } } writing++; @@ -725,9 +835,16 @@ static int bam_mark_duplicates(samFile *in, samFile *out, int remove_dups, int32 if (bam_get_qname(in_read->b)) { // last entry will be blank if (!remove_dups || !(in_read->b->core.flag & BAM_FDUP)) { - if (sam_write1(out, header, in_read->b) < 0) { - fprintf(stderr, "[markdup] error: writing final output failed.\n"); - return 1; + if (supp) { + if (tmp_file_write(&temp, in_read->b)) { + fprintf(stderr, "[markdup] error: writing temp output failed.\n"); + return 1; + } + } else { + if (sam_write1(out, header, in_read->b) < 0) { + fprintf(stderr, "[markdup] error: writing output failed.\n"); + return 1; + } } writing++; @@ -739,6 +856,56 @@ static int bam_mark_duplicates(samFile *in, samFile *out, int remove_dups, int32 rq = kl_begin(read_buffer); } + if (supp) { + bam1_t *b; + + if (tmp_file_end_write(&temp)) { + fprintf(stderr, "[markdup] error: unable to end tmp writing.\n"); + return 1; + } + + // read data from temp file and mark duplicate supplementary alignments + + if (tmp_file_begin_read(&temp, NULL)) { + return 1; + } + + b = bam_init1(); + + while ((ret = tmp_file_read(&temp, b)) > 0) { + + if ((b->core.flag & BAM_FSUPPLEMENTARY) || (b->core.flag & BAM_FUNMAP)) { + k = kh_get(duplicates, dup_hash, bam_get_qname(b)); + + if (k != kh_end(dup_hash)) { + b->core.flag |= BAM_FDUP; + } + } + + if 
(!remove_dups || !(b->core.flag & BAM_FDUP)) { + if (sam_write1(out, header, b) < 0) { + fprintf(stderr, "[markdup] error: writing final output failed.\n"); + return 1; + } + } + } + + if (ret == -1) { + fprintf(stderr, "[markdup] error: failed to read tmp file.\n"); + return 1; + } + + for (k = kh_begin(dup_hash); k != kh_end(dup_hash); ++k) { + if (kh_exist(dup_hash, k)) { + free((char *)kh_key(dup_hash, k)); + } + } + + tmp_file_destroy(&temp, b, 0); + kh_destroy(duplicates, dup_hash); + bam_destroy1(b); + } + if (do_stats) { fprintf(stderr, "READ %d WRITTEN %d \n" "EXCLUDED %d EXAMINED %d\n" @@ -762,8 +929,12 @@ static int markdup_usage(void) { fprintf(stderr, "Usage: samtools markdup \n\n"); fprintf(stderr, "Option: \n"); fprintf(stderr, " -r Remove duplicate reads\n"); - fprintf(stderr, " -l Max read length (default 300 bases)\n"); + fprintf(stderr, " -l INT Max read length (default 300 bases)\n"); + fprintf(stderr, " -S Mark supplemenary alignments of duplicates as duplicates (slower).\n"); fprintf(stderr, " -s Report stats.\n"); + fprintf(stderr, " -T PREFIX Write temporary files to PREFIX.samtools.nnnn.nnnn.tmp.\n"); + fprintf(stderr, " -t Mark primary duplicates with the name of the original in a \'do\' tag." 
+ " Mainly for information and debugging.\n"); sam_global_opt_help(stderr, "-.O..@"); @@ -775,23 +946,29 @@ static int markdup_usage(void) { int bam_markdup(int argc, char **argv) { - int c, ret, remove_dups = 0, report_stats = 0; + int c, ret, remove_dups = 0, report_stats = 0, include_supplementary = 0, tag_dup = 0; int32_t max_length = 300; samFile *in = NULL, *out = NULL; char wmode[3] = {'w', 'b', 0}; sam_global_args ga = SAM_GLOBAL_ARGS_INIT; htsThreadPool p = {NULL, 0}; + kstring_t tmpprefix = {0, 0, NULL}; + struct stat st; + unsigned int t; static const struct option lopts[] = { SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0, '@'), {NULL, 0, NULL, 0} }; - while ((c = getopt_long(argc, argv, "rsl:O:@:", lopts, NULL)) >= 0) { + while ((c = getopt_long(argc, argv, "rsl:StT:O:@:", lopts, NULL)) >= 0) { switch (c) { case 'r': remove_dups = 1; break; case 'l': max_length = atoi(optarg); break; case 's': report_stats = 1; break; + case 'T': kputs(optarg, &tmpprefix); break; + case 'S': include_supplementary = 1; break; + case 't': tag_dup = 1; break; default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; /* else fall-through */ case '?': return markdup_usage(); @@ -827,7 +1004,24 @@ int bam_markdup(int argc, char **argv) { } // actual stuff happens here - ret = bam_mark_duplicates(in, out, remove_dups, max_length, report_stats); + + // we need temp files so fix up the name here + if (tmpprefix.l == 0) { + + if (strcmp(argv[optind + 1], "-") != 0) + ksprintf(&tmpprefix, "%s.", argv[optind + 1]); + else + kputc('.', &tmpprefix); + } + + if (stat(tmpprefix.s, &st) == 0 && S_ISDIR(st.st_mode)) { + if (tmpprefix.s[tmpprefix.l-1] != '/') kputc('/', &tmpprefix); + } + + t = ((unsigned) time(NULL)) ^ ((unsigned) clock()); + ksprintf(&tmpprefix, "samtools.%d.%u.tmp", (int) getpid(), t % 10000); + + ret = bam_mark_duplicates(in, out, tmpprefix.s, remove_dups, max_length, report_stats, include_supplementary, tag_dup); sam_close(in); @@ -838,6 +1032,7 @@ int 
bam_markdup(int argc, char **argv) { if (p.pool) hts_tpool_destroy(p.pool); + free(tmpprefix.s); sam_global_args_free(&ga); return ret; diff --git a/samtools/bam_markdup.c.pysam.c b/samtools/bam_markdup.c.pysam.c index 11b298c..ce621d3 100644 --- a/samtools/bam_markdup.c.pysam.c +++ b/samtools/bam_markdup.c.pysam.c @@ -1,4 +1,4 @@ -#include "pysam.h" +#include "samtools.pysam.h" /* bam_markdup.c -- Mark duplicates from a coord sorted file that has gone through fixmates with the mate scoring option on. @@ -31,15 +31,18 @@ DEALINGS IN THE SOFTWARE #include #include #include -#include #include #include +#include +#include #include "htslib/thread_pool.h" #include "htslib/sam.h" #include "sam_opts.h" #include "samtools.h" #include "htslib/khash.h" #include "htslib/klist.h" +#include "htslib/kstring.h" +#include "tmp_file.h" typedef struct { int32_t single; @@ -128,6 +131,7 @@ static int key_equal(key_data_t a, key_data_t b) { KHASH_INIT(reads, key_data_t, in_hash_t, 1, hash_key, key_equal) // read map hash KLIST_INIT(read_queue, read_queue_t, __free_queue_element) // the reads buffer +KHASH_MAP_INIT_STR(duplicates, int) // map of duplicates for supplementary dup id /* Calculate the mate's unclipped start based on position and cigar string from MC tag. 
*/ @@ -274,7 +278,7 @@ static int64_t get_mate_score(bam1_t *b) { if ((data = bam_aux_get(b, "ms"))) { score = bam_aux2i(data); } else { - fprintf(pysam_stderr, "[markdup] error: no ms score tag.\n"); + fprintf(samtools_stderr, "[markdup] error: no ms score tag.\n"); return -1; } @@ -321,7 +325,7 @@ static int make_pair_key(key_data_t *key, bam1_t *bam) { other_end = unclipped_other_end(bam->core.mpos, cig); other_coord = unclipped_other_start(bam->core.mpos, cig); } else { - fprintf(pysam_stderr, "[markdup] error: no MC tag.\n"); + fprintf(samtools_stderr, "[markdup] error: no MC tag.\n"); return 1; } @@ -440,27 +444,56 @@ static void make_single_key(key_data_t *key, bam1_t *bam) { key->orientation = orientation; } +/* Add the duplicate name to a hash if it does not exist. */ + +static int add_duplicate(khash_t(duplicates) *d_hash, bam1_t *dupe) { + khiter_t d; + int ret; + + d = kh_get(duplicates, d_hash, bam_get_qname(dupe)); + + if (d == kh_end(d_hash)) { + d = kh_put(duplicates, d_hash, strdup(bam_get_qname(dupe)), &ret); + + if (ret > 0) { + kh_value(d_hash, d) = 1; + } else if (ret == 0) { + kh_value(d_hash, d)++; + } else { + fprintf(samtools_stderr, "[markdup] error: unable to store supplementary duplicates.\n"); + return 1; + } + } + + return 0; +} + /* Compare the reads near each other (coordinate sorted) and try to spot the duplicates. Generally the highest quality scoring is chosen as the original and all others the duplicates. The score is based on the sum of the quality values (<= 15) of the read and its mate (if any). While single reads are compared to only one read of a pair, the pair will chosen as the original. - The comparison is done on position and orientation, see above for details. */ + The comparison is done on position and orientation, see above for details. + + Marking the supplementary reads of a duplicate as also duplicates takes an extra file read/write + step. 
This is because the duplicate can occur before the primary read.*/ -static int bam_mark_duplicates(samFile *in, samFile *out, int remove_dups, int32_t max_length, int do_stats) { +static int bam_mark_duplicates(samFile *in, samFile *out, char *prefix, int remove_dups, int32_t max_length, int do_stats, int supp, int tag) { bam_hdr_t *header; khiter_t k; khash_t(reads) *pair_hash = kh_init(reads); khash_t(reads) *single_hash = kh_init(reads); klist_t(read_queue) *read_buffer = kl_init(read_queue); kliter_t(read_queue) *rq; + khash_t(duplicates) *dup_hash = kh_init(duplicates); int32_t prev_tid, prev_coord; read_queue_t *in_read; int ret; int reading, writing, excluded, duplicate, single, pair, single_dup, examined; + tmp_file_t temp; if ((header = sam_hdr_read(in)) == NULL) { - fprintf(pysam_stderr, "[markdup] error reading header\n"); + fprintf(samtools_stderr, "[markdup] error reading header\n"); return 1; } @@ -475,13 +508,13 @@ static int bam_mark_duplicates(samFile *in, samFile *out, int remove_dups, int32 // looking for SO:queryname within @HD only // (e.g. 
must ignore in a @CO comment line later in header) if ((p != 0) && (p < q)) { - fprintf(pysam_stderr, "[markdup] error: queryname sorted, must be sorted by coordinate.\n"); + fprintf(samtools_stderr, "[markdup] error: queryname sorted, must be sorted by coordinate.\n"); return 1; } } if (sam_hdr_write(out, header) < 0) { - fprintf(pysam_stderr, "[markdup] error writing header.\n"); + fprintf(samtools_stderr, "[markdup] error writing header.\n"); return 1; } @@ -491,9 +524,16 @@ static int bam_mark_duplicates(samFile *in, samFile *out, int remove_dups, int32 // get the buffer going in_read = kl_pushp(read_queue, read_buffer); + // handling supplementary reads needs a temporary file + if (supp) { + if (tmp_file_open_write(&temp, prefix, 1)) { + fprintf(samtools_stderr, "[markdup] error: unable to open tmp file %s.\n", prefix); + return 1; + } + } if ((in_read->b = bam_init1()) == NULL) { - fprintf(pysam_stderr, "[markdup] error: unable to allocate memory for alignment.\n"); + fprintf(samtools_stderr, "[markdup] error: unable to allocate memory for alignment.\n"); return 1; } @@ -505,7 +545,7 @@ static int bam_mark_duplicates(samFile *in, samFile *out, int remove_dups, int32 if (in_read->b->core.tid >= 0) { // -1 for unmapped reads if (in_read->b->core.tid < prev_tid || ((in_read->b->core.tid == prev_tid) && (in_read->b->core.pos < prev_coord))) { - fprintf(pysam_stderr, "[markdup] error: bad coordinate order.\n"); + fprintf(samtools_stderr, "[markdup] error: bad coordinate order.\n"); return 1; } } @@ -521,6 +561,7 @@ static int bam_mark_duplicates(samFile *in, samFile *out, int remove_dups, int32 if (!(in_read->b->core.flag & (BAM_FSECONDARY | BAM_FSUPPLEMENTARY | BAM_FUNMAP | BAM_FQCFAIL))) { examined++; + // look at the pairs first if ((in_read->b->core.flag & BAM_FPAIRED) && !(in_read->b->core.flag & BAM_FMUNMAP)) { int ret, mate_tmp; @@ -529,7 +570,7 @@ static int bam_mark_duplicates(samFile *in, samFile *out, int remove_dups, int32 in_hash_t *bp; if 
(make_pair_key(&pair_key, in_read->b)) { - fprintf(pysam_stderr, "[markdup] error: unable to assign pair hash key.\n"); + fprintf(samtools_stderr, "[markdup] error: unable to assign pair hash key.\n"); return 1; } @@ -559,9 +600,24 @@ static int bam_mark_duplicates(samFile *in, samFile *out, int remove_dups, int32 bp->p = in_read->b; dup->core.flag |= BAM_FDUP; single_dup++; + + if (tag) { + if (bam_aux_append(dup, "do", 'Z', strlen(bam_get_qname(bp->p)) + 1, (uint8_t*)bam_get_qname(bp->p))) { + fprintf(samtools_stderr, "[markdup] error: unable to append 'do' tag.\n"); + return 1; + } + } + + if (supp) { + if (bam_aux_get(dup, "SA") || (dup->core.flag & BAM_FMUNMAP)) { + if (add_duplicate(dup_hash, dup)) { + return 1; + } + } + } } } else { - fprintf(pysam_stderr, "[markdup] error: single hashing failure.\n"); + fprintf(samtools_stderr, "[markdup] error: single hashing failure.\n"); return 1; } @@ -580,14 +636,14 @@ static int bam_mark_duplicates(samFile *in, samFile *out, int remove_dups, int32 bp = &kh_val(pair_hash, k); if ((mate_tmp = get_mate_score(bp->p)) == -1) { - fprintf(pysam_stderr, "[markdup] error: no ms score tag.\n"); + fprintf(samtools_stderr, "[markdup] error: no ms score tag.\n"); return 1; } else { old_score = calc_score(bp->p) + mate_tmp; } if ((mate_tmp = get_mate_score(in_read->b)) == -1) { - fprintf(pysam_stderr, "[markdup] error: no ms score tag.\n"); + fprintf(samtools_stderr, "[markdup] error: no ms score tag.\n"); return 1; } else { new_score = calc_score(in_read->b) + mate_tmp; @@ -613,9 +669,25 @@ static int bam_mark_duplicates(samFile *in, samFile *out, int remove_dups, int32 dup->core.flag |= BAM_FDUP; + if (tag) { + if (bam_aux_append(dup, "do", 'Z', strlen(bam_get_qname(bp->p)) + 1, (uint8_t*)bam_get_qname(bp->p))) { + fprintf(samtools_stderr, "[markdup] error: unable to append 'do' tag.\n"); + return 1; + } + + } + + if (supp) { + if (bam_aux_get(dup, "SA") || (dup->core.flag & BAM_FMUNMAP)) { + if (add_duplicate(dup_hash, dup)) { 
+ return 1; + } + } + } + duplicate++; } else { - fprintf(pysam_stderr, "[markdup] error: pair hashing failure.\n"); + fprintf(samtools_stderr, "[markdup] error: pair hashing failure.\n"); return 1; } } else { // do the single (or effectively single) reads @@ -639,6 +711,22 @@ static int bam_mark_duplicates(samFile *in, samFile *out, int remove_dups, int32 if ((bp->p->core.flag & BAM_FPAIRED) && !(bp->p->core.flag & BAM_FMUNMAP)) { // if matched against one of a pair just mark as duplicate + + if (tag) { + if (bam_aux_append(in_read->b, "do", 'Z', strlen(bam_get_qname(bp->p)) + 1, (uint8_t*)bam_get_qname(bp->p))) { + fprintf(samtools_stderr, "[markdup] error: unable to append 'do' tag.\n"); + return 1; + } + } + + if (supp) { + if (bam_aux_get(in_read->b, "SA") || (in_read->b->core.flag & BAM_FMUNMAP)) { + if (add_duplicate(dup_hash, in_read->b)) { + return 1; + } + } + } + in_read->b->core.flag |= BAM_FDUP; } else { int64_t old_score, new_score; @@ -657,11 +745,26 @@ static int bam_mark_duplicates(samFile *in, samFile *out, int remove_dups, int32 } dup->core.flag |= BAM_FDUP; + + if (tag) { + if (bam_aux_append(dup, "do", 'Z', strlen(bam_get_qname(bp->p)) + 1, (uint8_t*)bam_get_qname(bp->p))) { + fprintf(samtools_stderr, "[markdup] error: unable to append 'do' tag.\n"); + return 1; + } + } + + if (supp) { + if (bam_aux_get(dup, "SA") || (dup->core.flag & BAM_FMUNMAP)) { + if (add_duplicate(dup_hash, dup)) { + return 1; + } + } + } } single_dup++; } else { - fprintf(pysam_stderr, "[markdup] error: single hashing failure.\n"); + fprintf(samtools_stderr, "[markdup] error: single hashing failure.\n"); return 1; } } @@ -682,9 +785,16 @@ static int bam_mark_duplicates(samFile *in, samFile *out, int remove_dups, int32 } if (!remove_dups || !(in_read->b->core.flag & BAM_FDUP)) { - if (sam_write1(out, header, in_read->b) < 0) { - fprintf(pysam_stderr, "[markdup] error: writing output failed.\n"); - return 1; + if (supp) { + if (tmp_file_write(&temp, in_read->b)) { + 
fprintf(samtools_stderr, "[markdup] error: writing temp output failed.\n"); + return 1; + } + } else { + if (sam_write1(out, header, in_read->b) < 0) { + fprintf(samtools_stderr, "[markdup] error: writing output failed.\n"); + return 1; + } } writing++; @@ -710,13 +820,13 @@ static int bam_mark_duplicates(samFile *in, samFile *out, int remove_dups, int32 in_read = kl_pushp(read_queue, read_buffer); if ((in_read->b = bam_init1()) == NULL) { - fprintf(pysam_stderr, "[markdup] error: unable to allocate memory for alignment.\n"); + fprintf(samtools_stderr, "[markdup] error: unable to allocate memory for alignment.\n"); return 1; } } if (ret < -1) { - fprintf(pysam_stderr, "[markdup] error: truncated input file.\n"); + fprintf(samtools_stderr, "[markdup] error: truncated input file.\n"); return 1; } @@ -727,9 +837,16 @@ static int bam_mark_duplicates(samFile *in, samFile *out, int remove_dups, int32 if (bam_get_qname(in_read->b)) { // last entry will be blank if (!remove_dups || !(in_read->b->core.flag & BAM_FDUP)) { - if (sam_write1(out, header, in_read->b) < 0) { - fprintf(pysam_stderr, "[markdup] error: writing final output failed.\n"); - return 1; + if (supp) { + if (tmp_file_write(&temp, in_read->b)) { + fprintf(samtools_stderr, "[markdup] error: writing temp output failed.\n"); + return 1; + } + } else { + if (sam_write1(out, header, in_read->b) < 0) { + fprintf(samtools_stderr, "[markdup] error: writing output failed.\n"); + return 1; + } } writing++; @@ -741,8 +858,58 @@ static int bam_mark_duplicates(samFile *in, samFile *out, int remove_dups, int32 rq = kl_begin(read_buffer); } + if (supp) { + bam1_t *b; + + if (tmp_file_end_write(&temp)) { + fprintf(samtools_stderr, "[markdup] error: unable to end tmp writing.\n"); + return 1; + } + + // read data from temp file and mark duplicate supplementary alignments + + if (tmp_file_begin_read(&temp, NULL)) { + return 1; + } + + b = bam_init1(); + + while ((ret = tmp_file_read(&temp, b)) > 0) { + + if ((b->core.flag & 
BAM_FSUPPLEMENTARY) || (b->core.flag & BAM_FUNMAP)) { + k = kh_get(duplicates, dup_hash, bam_get_qname(b)); + + if (k != kh_end(dup_hash)) { + b->core.flag |= BAM_FDUP; + } + } + + if (!remove_dups || !(b->core.flag & BAM_FDUP)) { + if (sam_write1(out, header, b) < 0) { + fprintf(samtools_stderr, "[markdup] error: writing final output failed.\n"); + return 1; + } + } + } + + if (ret == -1) { + fprintf(samtools_stderr, "[markdup] error: failed to read tmp file.\n"); + return 1; + } + + for (k = kh_begin(dup_hash); k != kh_end(dup_hash); ++k) { + if (kh_exist(dup_hash, k)) { + free((char *)kh_key(dup_hash, k)); + } + } + + tmp_file_destroy(&temp, b, 0); + kh_destroy(duplicates, dup_hash); + bam_destroy1(b); + } + if (do_stats) { - fprintf(pysam_stderr, "READ %d WRITTEN %d \n" + fprintf(samtools_stderr, "READ %d WRITTEN %d \n" "EXCLUDED %d EXAMINED %d\n" "PAIRED %d SINGLE %d\n" "DULPICATE PAIR %d DUPLICATE SINGLE %d\n" @@ -760,16 +927,20 @@ static int bam_mark_duplicates(samFile *in, samFile *out, int remove_dups, int32 static int markdup_usage(void) { - fprintf(pysam_stderr, "\n"); - fprintf(pysam_stderr, "Usage: samtools markdup \n\n"); - fprintf(pysam_stderr, "Option: \n"); - fprintf(pysam_stderr, " -r Remove duplicate reads\n"); - fprintf(pysam_stderr, " -l Max read length (default 300 bases)\n"); - fprintf(pysam_stderr, " -s Report stats.\n"); - - sam_global_opt_help(pysam_stderr, "-.O..@"); - - fprintf(pysam_stderr, "\nThe input file must be coordinate sorted and must have gone" + fprintf(samtools_stderr, "\n"); + fprintf(samtools_stderr, "Usage: samtools markdup \n\n"); + fprintf(samtools_stderr, "Option: \n"); + fprintf(samtools_stderr, " -r Remove duplicate reads\n"); + fprintf(samtools_stderr, " -l INT Max read length (default 300 bases)\n"); + fprintf(samtools_stderr, " -S Mark supplemenary alignments of duplicates as duplicates (slower).\n"); + fprintf(samtools_stderr, " -s Report stats.\n"); + fprintf(samtools_stderr, " -T PREFIX Write temporary files to 
PREFIX.samtools.nnnn.nnnn.tmp.\n"); + fprintf(samtools_stderr, " -t Mark primary duplicates with the name of the original in a \'do\' tag." + " Mainly for information and debugging.\n"); + + sam_global_opt_help(samtools_stderr, "-.O..@"); + + fprintf(samtools_stderr, "\nThe input file must be coordinate sorted and must have gone" " through fixmates with the mate scoring option on.\n"); return 1; @@ -777,23 +948,29 @@ static int markdup_usage(void) { int bam_markdup(int argc, char **argv) { - int c, ret, remove_dups = 0, report_stats = 0; + int c, ret, remove_dups = 0, report_stats = 0, include_supplementary = 0, tag_dup = 0; int32_t max_length = 300; samFile *in = NULL, *out = NULL; char wmode[3] = {'w', 'b', 0}; sam_global_args ga = SAM_GLOBAL_ARGS_INIT; htsThreadPool p = {NULL, 0}; + kstring_t tmpprefix = {0, 0, NULL}; + struct stat st; + unsigned int t; static const struct option lopts[] = { SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0, '@'), {NULL, 0, NULL, 0} }; - while ((c = getopt_long(argc, argv, "rsl:O:@:", lopts, NULL)) >= 0) { + while ((c = getopt_long(argc, argv, "rsl:StT:O:@:", lopts, NULL)) >= 0) { switch (c) { case 'r': remove_dups = 1; break; case 'l': max_length = atoi(optarg); break; case 's': report_stats = 1; break; + case 'T': kputs(optarg, &tmpprefix); break; + case 'S': include_supplementary = 1; break; + case 't': tag_dup = 1; break; default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; /* else fall-through */ case '?': return markdup_usage(); @@ -820,7 +997,7 @@ int bam_markdup(int argc, char **argv) { if (ga.nthreads > 0) { if (!(p.pool = hts_tpool_init(ga.nthreads))) { - fprintf(pysam_stderr, "[markdup] error creating thread pool\n"); + fprintf(samtools_stderr, "[markdup] error creating thread pool\n"); return 1; } @@ -829,17 +1006,35 @@ int bam_markdup(int argc, char **argv) { } // actual stuff happens here - ret = bam_mark_duplicates(in, out, remove_dups, max_length, report_stats); + + // we need temp files so fix up the 
name here + if (tmpprefix.l == 0) { + + if (strcmp(argv[optind + 1], "-") != 0) + ksprintf(&tmpprefix, "%s.", argv[optind + 1]); + else + kputc('.', &tmpprefix); + } + + if (stat(tmpprefix.s, &st) == 0 && S_ISDIR(st.st_mode)) { + if (tmpprefix.s[tmpprefix.l-1] != '/') kputc('/', &tmpprefix); + } + + t = ((unsigned) time(NULL)) ^ ((unsigned) clock()); + ksprintf(&tmpprefix, "samtools.%d.%u.tmp", (int) getpid(), t % 10000); + + ret = bam_mark_duplicates(in, out, tmpprefix.s, remove_dups, max_length, report_stats, include_supplementary, tag_dup); sam_close(in); if (sam_close(out) < 0) { - fprintf(pysam_stderr, "[markdup] error closing output file\n"); + fprintf(samtools_stderr, "[markdup] error closing output file\n"); ret = 1; } if (p.pool) hts_tpool_destroy(p.pool); + free(tmpprefix.s); sam_global_args_free(&ga); return ret; diff --git a/samtools/bam_mate.c.pysam.c b/samtools/bam_mate.c.pysam.c index 8857aeb..57159cc 100644 --- a/samtools/bam_mate.c.pysam.c +++ b/samtools/bam_mate.c.pysam.c @@ -1,4 +1,4 @@ -#include "pysam.h" +#include "samtools.pysam.h" /* bam_mate.c -- fix mate pairing information and clean up flags. @@ -262,7 +262,7 @@ static int bam_mating_core(samFile *in, samFile *out, int remove_reads, int prop str.l = str.m = 0; str.s = 0; header = sam_hdr_read(in); if (header == NULL) { - fprintf(pysam_stderr, "[bam_mating_core] ERROR: Couldn't read header\n"); + fprintf(samtools_stderr, "[bam_mating_core] ERROR: Couldn't read header\n"); return 1; } // Accept unknown, unsorted, or queryname sort order, but error on coordinate sorted. @@ -273,7 +273,7 @@ static int bam_mating_core(samFile *in, samFile *out, int remove_reads, int prop // Looking for SO:coordinate within the @HD line only // (e.g. 
must ignore in a @CO comment line later in header) if ((p != 0) && (p < q)) { - fprintf(pysam_stderr, "[bam_mating_core] ERROR: Coordinate sorted, require grouped/sorted by queryname.\n"); + fprintf(samtools_stderr, "[bam_mating_core] ERROR: Coordinate sorted, require grouped/sorted by queryname.\n"); goto fail; } } @@ -330,7 +330,7 @@ static int bam_mating_core(samFile *in, samFile *out, int remove_reads, int prop if (do_mate_scoring) { if ((add_mate_score(pre, cur) == -1) || (add_mate_score(cur, pre) == -1)) { - fprintf(pysam_stderr, "[bam_mating_core] ERROR: unable to add mate score.\n"); + fprintf(samtools_stderr, "[bam_mating_core] ERROR: unable to add mate score.\n"); goto fail; } } @@ -408,7 +408,7 @@ void usage(FILE* where) fprintf(where, "\n" -"As elsewhere in samtools, use '-' as the filename for stdin/pysam_stdout. The input\n" +"As elsewhere in samtools, use '-' as the filename for stdin/samtools_stdout. The input\n" "file must be grouped by read name (e.g. sorted by name). Coordinated sorted\n" "input is not accepted.\n"); } @@ -426,7 +426,7 @@ int bam_mating(int argc, char *argv[]) }; // parse args - if (argc == 1) { usage(pysam_stdout); return 0; } + if (argc == 1) { usage(samtools_stdout); return 0; } while ((c = getopt_long(argc, argv, "rpcmO:@:", lopts, NULL)) >= 0) { switch (c) { case 'r': remove_reads = 1; break; @@ -435,10 +435,10 @@ int bam_mating(int argc, char *argv[]) case 'm': mate_score = 1; break; default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; /* else fall-through */ - case '?': usage(pysam_stderr); goto fail; + case '?': usage(samtools_stderr); goto fail; } } - if (optind+1 >= argc) { usage(pysam_stderr); goto fail; } + if (optind+1 >= argc) { usage(samtools_stderr); goto fail; } // init if ((in = sam_open_format(argv[optind], "rb", &ga.in)) == NULL) { @@ -453,7 +453,7 @@ int bam_mating(int argc, char *argv[]) if (ga.nthreads > 0) { if (!(p.pool = hts_tpool_init(ga.nthreads))) { - fprintf(pysam_stderr, "Error 
creating thread pool\n"); + fprintf(samtools_stderr, "Error creating thread pool\n"); goto fail; } hts_set_opt(in, HTS_OPT_THREAD_POOL, &p); @@ -466,7 +466,7 @@ int bam_mating(int argc, char *argv[]) // cleanup sam_close(in); if (sam_close(out) < 0) { - fprintf(pysam_stderr, "[bam_mating] error while closing output file\n"); + fprintf(samtools_stderr, "[bam_mating] error while closing output file\n"); res = 1; } diff --git a/samtools/bam_md.c.pysam.c b/samtools/bam_md.c.pysam.c index 5e4cdb5..f266fe7 100644 --- a/samtools/bam_md.c.pysam.c +++ b/samtools/bam_md.c.pysam.c @@ -1,4 +1,4 @@ -#include "pysam.h" +#include "samtools.pysam.h" /* bam_md.c -- calmd subcommand. @@ -118,7 +118,7 @@ void bam_fillmd1_core(bam1_t *b, char *ref, int ref_len, int flag, int max_nm) if (old_nm) old_nm_i = bam_aux2i(old_nm); if (!old_nm) bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm); else if (nm != old_nm_i) { - fprintf(pysam_stderr, "[bam_fillmd1] different NM for read '%s': %d -> %d\n", bam_get_qname(b), old_nm_i, nm); + fprintf(samtools_stderr, "[bam_fillmd1] different NM for read '%s': %d -> %d\n", bam_get_qname(b), old_nm_i, nm); bam_aux_del(b, old_nm); bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm); } @@ -136,7 +136,7 @@ void bam_fillmd1_core(bam1_t *b, char *ref, int ref_len, int flag, int max_nm) if (i < str->l) is_diff = 1; } else is_diff = 1; if (is_diff) { - fprintf(pysam_stderr, "[bam_fillmd1] different MD for read '%s': '%s' -> '%s'\n", bam_get_qname(b), old_md+1, str->s); + fprintf(samtools_stderr, "[bam_fillmd1] different MD for read '%s': '%s' -> '%s'\n", bam_get_qname(b), old_md+1, str->s); bam_aux_del(b, old_md); bam_aux_append(b, "MD", 'Z', str->l + 1, (uint8_t*)str->s); } @@ -164,7 +164,7 @@ void bam_fillmd1(bam1_t *b, char *ref, int flag) } int calmd_usage() { - fprintf(pysam_stderr, + fprintf(samtools_stderr, "Usage: samtools calmd [-eubrAES] \n" "Options:\n" " -e change identical bases to '='\n" @@ -175,7 +175,7 @@ int calmd_usage() { " -r compute the BQ tag 
(without -A) or cap baseQ by BAQ (with -A)\n" " -E extended BAQ for better sensitivity but lower specificity\n"); - sam_global_opt_help(pysam_stderr, "-....@"); + sam_global_opt_help(samtools_stderr, "-....@"); return 1; } @@ -214,7 +214,7 @@ int bam_fillmd(int argc, char *argv[]) case 'A': baq_flag |= 1; break; case 'E': baq_flag |= 2; break; default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; - fprintf(pysam_stderr, "[bam_fillmd] unrecognized option '-%c'\n\n", c); + fprintf(samtools_stderr, "[bam_fillmd] unrecognized option '-%c'\n\n", c); /* else fall-through */ case '?': return calmd_usage(); } @@ -232,11 +232,11 @@ int bam_fillmd(int argc, char *argv[]) header = sam_hdr_read(fp); if (header == NULL || header->n_targets == 0) { - fprintf(pysam_stderr, "[bam_fillmd] input SAM does not have header. Abort!\n"); + fprintf(samtools_stderr, "[bam_fillmd] input SAM does not have header. Abort!\n"); goto fail; } - fpout = sam_open_format(pysam_stdout_fn, mode_w, &ga.out); + fpout = sam_open_format(samtools_stdout_fn, mode_w, &ga.out); if (fpout == NULL) { print_error_errno("calmd", "Failed to open output"); goto fail; @@ -248,7 +248,7 @@ int bam_fillmd(int argc, char *argv[]) if (ga.nthreads > 0) { if (!(p.pool = hts_tpool_init(ga.nthreads))) { - fprintf(pysam_stderr, "Error creating thread pool\n"); + fprintf(samtools_stderr, "Error creating thread pool\n"); goto fail; } hts_set_opt(fp, HTS_OPT_THREAD_POOL, &p); @@ -265,7 +265,7 @@ int bam_fillmd(int argc, char *argv[]) b = bam_init1(); if (!b) { - fprintf(pysam_stderr, "[bam_fillmd] Failed to allocate bam struct\n"); + fprintf(samtools_stderr, "[bam_fillmd] Failed to allocate bam struct\n"); goto fail; } while ((ret = sam_read1(fp, header, b)) >= 0) { @@ -275,7 +275,7 @@ int bam_fillmd(int argc, char *argv[]) ref = fai_fetch(fai, header->target_name[b->core.tid], &len); tid = b->core.tid; if (ref == 0) { // FIXME: Should this always be fatal? 
- fprintf(pysam_stderr, "[bam_fillmd] fail to find sequence '%s' in the reference.\n", + fprintf(samtools_stderr, "[bam_fillmd] fail to find sequence '%s' in the reference.\n", header->target_name[tid]); if (is_realn || capQ > 10) goto fail; // Would otherwise crash } @@ -293,7 +293,7 @@ int bam_fillmd(int argc, char *argv[]) } } if (ret < -1) { - fprintf(pysam_stderr, "[bam_fillmd] Error reading input.\n"); + fprintf(samtools_stderr, "[bam_fillmd] Error reading input.\n"); goto fail; } bam_destroy1(b); @@ -303,7 +303,7 @@ int bam_fillmd(int argc, char *argv[]) fai_destroy(fai); sam_close(fp); if (sam_close(fpout) < 0) { - fprintf(pysam_stderr, "[bam_fillmd] error when closing output file\n"); + fprintf(samtools_stderr, "[bam_fillmd] error when closing output file\n"); return 1; } if (p.pool) hts_tpool_destroy(p.pool); diff --git a/samtools/bam_plbuf.c.pysam.c b/samtools/bam_plbuf.c.pysam.c index 76c1ac1..a2e3d18 100644 --- a/samtools/bam_plbuf.c.pysam.c +++ b/samtools/bam_plbuf.c.pysam.c @@ -1,4 +1,4 @@ -#include "pysam.h" +#include "samtools.pysam.h" /* bam_plbuf.c -- plbuf routines (previously in bam_pileup.c). diff --git a/samtools/bam_plcmd.c.pysam.c b/samtools/bam_plcmd.c.pysam.c index 7fd5bea..77999e6 100644 --- a/samtools/bam_plcmd.c.pysam.c +++ b/samtools/bam_plcmd.c.pysam.c @@ -1,4 +1,4 @@ -#include "pysam.h" +#include "samtools.pysam.h" /* bam_plcmd.c -- mpileup subcommand. 
@@ -264,7 +264,7 @@ static int mplp_func(void *data, bam1_t *b) if (ma->conf->fai && b->core.tid >= 0) { has_ref = mplp_get_ref(ma, b->core.tid, &ref, &ref_len); if (has_ref && ref_len <= b->core.pos) { // exclude reads outside of the reference sequence - fprintf(pysam_stderr,"[%s] Skipping because %d is outside of %d [ref:%d]\n", + fprintf(samtools_stderr,"[%s] Skipping because %d is outside of %d [ref:%d]\n", __func__, b->core.pos, ref_len, b->core.tid); skip = 1; continue; @@ -301,7 +301,7 @@ static void group_smpl(mplp_pileup_t *m, bam_sample_t *sm, kstring_t *buf, if (id < 0) id = bam_smpl_rg2smid(sm, fn[i], 0, buf); if (id < 0 || id >= m->n) { assert(q); // otherwise a bug - fprintf(pysam_stderr, "[%s] Read group %s used in file %s but absent from the header or an alignment missing read group.\n", __func__, (char*)q+1, fn[i]); + fprintf(samtools_stderr, "[%s] Read group %s used in file %s but absent from the header or an alignment missing read group.\n", __func__, (char*)q+1, fn[i]); exit(EXIT_FAILURE); } if (m->n_plp[id] == m->m_plp[id]) { @@ -352,7 +352,7 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) sm = bam_smpl_init(); if (n == 0) { - fprintf(pysam_stderr,"[%s] no input file/data given\n", __func__); + fprintf(samtools_stderr,"[%s] no input file/data given\n", __func__); exit(EXIT_FAILURE); } @@ -363,15 +363,15 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) data[i]->fp = sam_open_format(fn[i], "rb", &conf->ga.in); if ( !data[i]->fp ) { - fprintf(pysam_stderr, "[%s] failed to open %s: %s\n", __func__, fn[i], strerror(errno)); + fprintf(samtools_stderr, "[%s] failed to open %s: %s\n", __func__, fn[i], strerror(errno)); exit(EXIT_FAILURE); } if (hts_set_opt(data[i]->fp, CRAM_OPT_DECODE_MD, 0)) { - fprintf(pysam_stderr, "Failed to set CRAM_OPT_DECODE_MD value\n"); + fprintf(samtools_stderr, "Failed to set CRAM_OPT_DECODE_MD value\n"); exit(EXIT_FAILURE); } if (conf->fai_fname && hts_set_fai_filename(data[i]->fp, conf->fai_fname) != 
0) { - fprintf(pysam_stderr, "[%s] failed to process %s: %s\n", + fprintf(samtools_stderr, "[%s] failed to process %s: %s\n", __func__, conf->fai_fname, strerror(errno)); exit(EXIT_FAILURE); } @@ -379,7 +379,7 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) data[i]->ref = &mp_ref; h_tmp = sam_hdr_read(data[i]->fp); if ( !h_tmp ) { - fprintf(pysam_stderr,"[%s] fail to read the header of %s\n", __func__, fn[i]); + fprintf(samtools_stderr,"[%s] fail to read the header of %s\n", __func__, fn[i]); exit(EXIT_FAILURE); } bam_smpl_add(sm, fn[i], (conf->flag&MPLP_IGNORE_RG)? 0 : h_tmp->text); @@ -388,11 +388,11 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) if (conf->reg) { hts_idx_t *idx = sam_index_load(data[i]->fp, fn[i]); if (idx == NULL) { - fprintf(pysam_stderr, "[%s] fail to load index for %s\n", __func__, fn[i]); + fprintf(samtools_stderr, "[%s] fail to load index for %s\n", __func__, fn[i]); exit(EXIT_FAILURE); } if ( (data[i]->iter=sam_itr_querys(idx, h_tmp, conf->reg)) == 0) { - fprintf(pysam_stderr, "[E::%s] fail to parse region '%s' with %s\n", __func__, conf->reg, fn[i]); + fprintf(samtools_stderr, "[E::%s] fail to parse region '%s' with %s\n", __func__, conf->reg, fn[i]); exit(EXIT_FAILURE); } if (i == 0) beg0 = data[i]->iter->beg, end0 = data[i]->iter->end, tid0 = data[i]->iter->tid; @@ -417,7 +417,7 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) gplp.m_plp = calloc(sm->n, sizeof(int)); gplp.plp = calloc(sm->n, sizeof(bam_pileup1_t*)); - fprintf(pysam_stderr, "[%s] %d samples in %d input files\n", __func__, sm->n, n); + fprintf(samtools_stderr, "[%s] %d samples in %d input files\n", __func__, sm->n, n); // write the VCF header if (conf->flag & MPLP_BCF) { @@ -429,7 +429,7 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) bcf_fp = bcf_open(conf->output_fname? conf->output_fname : "-", mode); if (bcf_fp == NULL) { - fprintf(pysam_stderr, "[%s] failed to write to %s: %s\n", __func__, conf->output_fname? 
conf->output_fname : "standard output", strerror(errno)); + fprintf(samtools_stderr, "[%s] failed to write to %s: %s\n", __func__, conf->output_fname? conf->output_fname : "standard output", strerror(errno)); exit(EXIT_FAILURE); } @@ -545,10 +545,10 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) } } else { - pileup_fp = conf->output_fname? fopen(conf->output_fname, "w") : pysam_stdout; + pileup_fp = conf->output_fname? fopen(conf->output_fname, "w") : samtools_stdout; if (pileup_fp == NULL) { - fprintf(pysam_stderr, "[%s] failed to write to %s: %s\n", __func__, conf->output_fname, strerror(errno)); + fprintf(samtools_stderr, "[%s] failed to write to %s: %s\n", __func__, conf->output_fname, strerror(errno)); exit(EXIT_FAILURE); } } @@ -558,10 +558,10 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) if ( conf->flag & MPLP_SMART_OVERLAPS ) bam_mplp_init_overlaps(iter); max_depth = conf->max_depth; if (max_depth * sm->n > 1<<20) - fprintf(pysam_stderr, "(%s) Max depth is above 1M. Potential memory hog!\n", __func__); + fprintf(samtools_stderr, "(%s) Max depth is above 1M. Potential memory hog!\n", __func__); if (max_depth * sm->n < 8000) { max_depth = 8000 / sm->n; - fprintf(pysam_stderr, "<%s> Set max per-file depth to %d\n", __func__, max_depth); + fprintf(samtools_stderr, "<%s> Set max per-file depth to %d\n", __func__, max_depth); } max_indel_depth = conf->max_indel_depth * sm->n; bam_mplp_set_maxcnt(iter, max_depth); @@ -698,7 +698,7 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) if ( c < conf->min_baseQ ) continue; if (n > 0) putc(',', pileup_fp); - fprintf(pileup_fp, "%d", plp[i][j].qpos + 1); // FIXME: fprintf(pysam_stdout, ) is very slow... + fprintf(pileup_fp, "%d", plp[i][j].qpos + 1); // FIXME: fprintf(samtools_stdout, ) is very slow... 
n++; } if (!n) putc('*', pileup_fp); @@ -799,7 +799,7 @@ int read_file_list(const char *file_list,int *n,char **argv[]) FILE *fh = fopen(file_list,"r"); if ( !fh ) { - fprintf(pysam_stderr,"%s: %s\n", file_list,strerror(errno)); + fprintf(samtools_stderr,"%s: %s\n", file_list,strerror(errno)); return 1; } @@ -821,9 +821,9 @@ int read_file_list(const char *file_list,int *n,char **argv[]) for (i=0; i 0) // this is not a good style, but forgive me... khash_str2int_inc(mplp.rghash, strdup(buf)); fclose(fp_rg); @@ -1104,7 +1104,7 @@ int bam_mpileup(int argc, char *argv[]) if (parse_sam_global_opt(c, optarg, lopts, &mplp.ga) == 0) break; /* else fall-through */ case '?': - print_usage(pysam_stderr, &mplp); + print_usage(samtools_stderr, &mplp); return 1; } } @@ -1116,13 +1116,13 @@ int bam_mpileup(int argc, char *argv[]) if ( !(mplp.flag&MPLP_REALN) && mplp.flag&MPLP_REDO_BAQ ) { - fprintf(pysam_stderr,"Error: The -B option cannot be combined with -E\n"); + fprintf(samtools_stderr,"Error: The -B option cannot be combined with -E\n"); return 1; } if (use_orphan) mplp.flag &= ~MPLP_NO_ORPHAN; if (argc == 1) { - print_usage(pysam_stderr, &mplp); + print_usage(samtools_stderr, &mplp); return 1; } int ret; diff --git a/samtools/bam_quickcheck.c b/samtools/bam_quickcheck.c index 02616fe..e7c2397 100644 --- a/samtools/bam_quickcheck.c +++ b/samtools/bam_quickcheck.c @@ -30,43 +30,66 @@ DEALINGS IN THE SOFTWARE. */ #include #include +/* File status flags (zero means OK). It's possible for more than one to be + * set on a single file. The final exit status is the bitwise-or of the + * status of all the files. */ +#define QC_FAIL_OPEN 2 +#define QC_NOT_SEQUENCE 4 +#define QC_BAD_HEADER 8 +#define QC_NO_EOF_BLOCK 16 +#define QC_FAIL_CLOSE 32 + static void usage_quickcheck(FILE *write_to) { fprintf(write_to, "Usage: samtools quickcheck [options] [...]\n" "Options:\n" " -v verbose output (repeat for more verbosity)\n" +" -q suppress warning messages\n" "\n" "Notes:\n" "\n" -"1. 
In order to use this command effectively, you should check its exit status;\n" -" without any -v options it will NOT print any output, even when some files\n" -" fail the check. One way to use quickcheck might be as a check that all\n" -" BAM files in a directory are okay:\n" +"1. By default quickcheck will emit a warning message if and only if a file\n" +" fails the checks, in which case the exit status is non-zero. Under normal\n" +" behaviour with valid data it will be silent and has a zero exit status.\n" +" The warning messages are purely for manual inspection and should not be \n" +" parsed by scripts.\n" +"\n" +"2. In order to use this command programmatically, you should check its exit\n" +" status. One way to use quickcheck might be as a check that all BAM files in\n" +" a directory are okay:\n" "\n" "\tsamtools quickcheck *.bam && echo 'all ok' \\\n" "\t || echo 'fail!'\n" "\n" -" To also determine which files have failed, use the -v option:\n" +" The first level of verbosity lists only files that fail to stdout.\n" +" To obtain a parsable list of files that have failed, use this option:\n" "\n" -"\tsamtools quickcheck -v *.bam > bad_bams.fofn \\\n" +"\tsamtools quickcheck -qv *.bam > bad_bams.fofn \\\n" "\t && echo 'all ok' \\\n" "\t || echo 'some files failed check, see bad_bams.fofn'\n" ); } +#define QC_ERR(state, v, msg, arg1) \ + file_state |= (state); \ + if (!quiet || verbose >= (v)) fprintf(stderr, (msg), (arg1)) + int main_quickcheck(int argc, char** argv) { - int verbose = 0; + int verbose = 0, quiet = 0; hts_verbose = 0; - const char* optstring = "v"; + const char* optstring = "vq"; int opt; while ((opt = getopt(argc, argv, optstring)) != -1) { switch (opt) { case 'v': verbose++; break; + case 'q': + quiet = 1; + break; default: usage_quickcheck(stderr); return 1; @@ -101,28 +124,24 @@ int main_quickcheck(int argc, char** argv) // attempt to open htsFile *hts_fp = hts_open(fn, "r"); if (hts_fp == NULL) { - if (verbose >= 2) fprintf(stderr, "%s 
could not be opened for reading.\n", fn); - file_state |= 2; + QC_ERR(QC_FAIL_OPEN, 2, "%s could not be opened for reading.\n", fn); } else { if (verbose >= 3) fprintf(stderr, "opened %s\n", fn); // make sure we have sequence data const htsFormat *fmt = hts_get_format(hts_fp); if (fmt->category != sequence_data ) { - if (verbose >= 2) fprintf(stderr, "%s was not identified as sequence data.\n", fn); - file_state |= 4; + QC_ERR(QC_NOT_SEQUENCE, 2, "%s was not identified as sequence data.\n", fn); } else { if (verbose >= 3) fprintf(stderr, "%s is sequence data\n", fn); // check header bam_hdr_t *header = sam_hdr_read(hts_fp); if (header == NULL) { - if (verbose >= 2) fprintf(stderr, "%s caused an error whilst reading its header.\n", fn); - file_state |= 8; + QC_ERR(QC_BAD_HEADER, 2, "%s caused an error whilst reading its header.\n", fn); } else { if (header->n_targets <= 0) { - if (verbose >= 2) fprintf(stderr, "%s had no targets in header.\n", fn); - file_state |= 8; + QC_ERR(QC_BAD_HEADER, 2, "%s had no targets in header.\n", fn); } else { if (verbose >= 3) fprintf(stderr, "%s has %d targets in header.\n", fn, header->n_targets); @@ -133,14 +152,12 @@ int main_quickcheck(int argc, char** argv) // check EOF on formats that support this int ret; if ((ret = hts_check_EOF(hts_fp)) < 0) { - if (verbose >= 2) fprintf(stderr, "%s caused an error whilst checking for EOF block.\n", fn); - file_state |= 16; - } + QC_ERR(QC_NO_EOF_BLOCK, 2, "%s caused an error whilst checking for EOF block.\n", fn); + } else { switch (ret) { case 0: - if (verbose >= 2) fprintf(stderr, "%s was missing EOF block when one should be present.\n", fn); - file_state |= 16; + QC_ERR(QC_NO_EOF_BLOCK, 2, "%s was missing EOF block when one should be present.\n", fn); break; case 1: if (verbose >= 3) fprintf(stderr, "%s has good EOF block.\n", fn); @@ -155,8 +172,7 @@ int main_quickcheck(int argc, char** argv) } if (hts_close(hts_fp) < 0) { - file_state |= 32; - if (verbose >= 2) fprintf(stderr, "%s did 
not close cleanly.\n", fn); + QC_ERR(QC_FAIL_CLOSE, 2, "%s did not close cleanly.\n", fn); } } diff --git a/samtools/bam_quickcheck.c.pysam.c b/samtools/bam_quickcheck.c.pysam.c index c9dc3d2..fc0af2e 100644 --- a/samtools/bam_quickcheck.c.pysam.c +++ b/samtools/bam_quickcheck.c.pysam.c @@ -1,4 +1,4 @@ -#include "pysam.h" +#include "samtools.pysam.h" /* bam_quickcheck.c -- quickcheck subcommand. @@ -32,45 +32,68 @@ DEALINGS IN THE SOFTWARE. */ #include #include +/* File status flags (zero means OK). It's possible for more than one to be + * set on a single file. The final exit status is the bitwise-or of the + * status of all the files. */ +#define QC_FAIL_OPEN 2 +#define QC_NOT_SEQUENCE 4 +#define QC_BAD_HEADER 8 +#define QC_NO_EOF_BLOCK 16 +#define QC_FAIL_CLOSE 32 + static void usage_quickcheck(FILE *write_to) { fprintf(write_to, "Usage: samtools quickcheck [options] [...]\n" "Options:\n" " -v verbose output (repeat for more verbosity)\n" +" -q suppress warning messages\n" "\n" "Notes:\n" "\n" -"1. In order to use this command effectively, you should check its exit status;\n" -" without any -v options it will NOT print any output, even when some files\n" -" fail the check. One way to use quickcheck might be as a check that all\n" -" BAM files in a directory are okay:\n" +"1. By default quickcheck will emit a warning message if and only if a file\n" +" fails the checks, in which case the exit status is non-zero. Under normal\n" +" behaviour with valid data it will be silent and has a zero exit status.\n" +" The warning messages are purely for manual inspection and should not be \n" +" parsed by scripts.\n" +"\n" +"2. In order to use this command programmatically, you should check its exit\n" +" status. 
One way to use quickcheck might be as a check that all BAM files in\n" +" a directory are okay:\n" "\n" "\tsamtools quickcheck *.bam && echo 'all ok' \\\n" "\t || echo 'fail!'\n" "\n" -" To also determine which files have failed, use the -v option:\n" +" The first level of verbosity lists only files that fail to samtools_stdout.\n" +" To obtain a parsable list of files that have failed, use this option:\n" "\n" -"\tsamtools quickcheck -v *.bam > bad_bams.fofn \\\n" +"\tsamtools quickcheck -qv *.bam > bad_bams.fofn \\\n" "\t && echo 'all ok' \\\n" "\t || echo 'some files failed check, see bad_bams.fofn'\n" ); } +#define QC_ERR(state, v, msg, arg1) \ + file_state |= (state); \ + if (!quiet || verbose >= (v)) fprintf(samtools_stderr, (msg), (arg1)) + int main_quickcheck(int argc, char** argv) { - int verbose = 0; + int verbose = 0, quiet = 0; hts_verbose = 0; - const char* optstring = "v"; + const char* optstring = "vq"; int opt; while ((opt = getopt(argc, argv, optstring)) != -1) { switch (opt) { case 'v': verbose++; break; + case 'q': + quiet = 1; + break; default: - usage_quickcheck(pysam_stderr); + usage_quickcheck(samtools_stderr); return 1; } } @@ -79,12 +102,12 @@ int main_quickcheck(int argc, char** argv) argv += optind; if (argc < 1) { - usage_quickcheck(pysam_stdout); + usage_quickcheck(samtools_stdout); return 1; } if (verbose >= 2) { - fprintf(pysam_stderr, "verbosity set to %d\n", verbose); + fprintf(samtools_stderr, "verbosity set to %d\n", verbose); } if (verbose >= 4) { @@ -98,36 +121,32 @@ int main_quickcheck(int argc, char** argv) char* fn = argv[i]; int file_state = 0; - if (verbose >= 3) fprintf(pysam_stderr, "checking %s\n", fn); + if (verbose >= 3) fprintf(samtools_stderr, "checking %s\n", fn); // attempt to open htsFile *hts_fp = hts_open(fn, "r"); if (hts_fp == NULL) { - if (verbose >= 2) fprintf(pysam_stderr, "%s could not be opened for reading.\n", fn); - file_state |= 2; + QC_ERR(QC_FAIL_OPEN, 2, "%s could not be opened for reading.\n", fn); 
} else { - if (verbose >= 3) fprintf(pysam_stderr, "opened %s\n", fn); + if (verbose >= 3) fprintf(samtools_stderr, "opened %s\n", fn); // make sure we have sequence data const htsFormat *fmt = hts_get_format(hts_fp); if (fmt->category != sequence_data ) { - if (verbose >= 2) fprintf(pysam_stderr, "%s was not identified as sequence data.\n", fn); - file_state |= 4; + QC_ERR(QC_NOT_SEQUENCE, 2, "%s was not identified as sequence data.\n", fn); } else { - if (verbose >= 3) fprintf(pysam_stderr, "%s is sequence data\n", fn); + if (verbose >= 3) fprintf(samtools_stderr, "%s is sequence data\n", fn); // check header bam_hdr_t *header = sam_hdr_read(hts_fp); if (header == NULL) { - if (verbose >= 2) fprintf(pysam_stderr, "%s caused an error whilst reading its header.\n", fn); - file_state |= 8; + QC_ERR(QC_BAD_HEADER, 2, "%s caused an error whilst reading its header.\n", fn); } else { if (header->n_targets <= 0) { - if (verbose >= 2) fprintf(pysam_stderr, "%s had no targets in header.\n", fn); - file_state |= 8; + QC_ERR(QC_BAD_HEADER, 2, "%s had no targets in header.\n", fn); } else { - if (verbose >= 3) fprintf(pysam_stderr, "%s has %d targets in header.\n", fn, header->n_targets); + if (verbose >= 3) fprintf(samtools_stderr, "%s has %d targets in header.\n", fn, header->n_targets); } bam_hdr_destroy(header); } @@ -135,35 +154,32 @@ int main_quickcheck(int argc, char** argv) // check EOF on formats that support this int ret; if ((ret = hts_check_EOF(hts_fp)) < 0) { - if (verbose >= 2) fprintf(pysam_stderr, "%s caused an error whilst checking for EOF block.\n", fn); - file_state |= 16; - } + QC_ERR(QC_NO_EOF_BLOCK, 2, "%s caused an error whilst checking for EOF block.\n", fn); + } else { switch (ret) { case 0: - if (verbose >= 2) fprintf(pysam_stderr, "%s was missing EOF block when one should be present.\n", fn); - file_state |= 16; + QC_ERR(QC_NO_EOF_BLOCK, 2, "%s was missing EOF block when one should be present.\n", fn); break; case 1: - if (verbose >= 3) 
fprintf(pysam_stderr, "%s has good EOF block.\n", fn); + if (verbose >= 3) fprintf(samtools_stderr, "%s has good EOF block.\n", fn); break; case 2: - if (verbose >= 3) fprintf(pysam_stderr, "%s cannot be checked for EOF block as it is not seekable.\n", fn); + if (verbose >= 3) fprintf(samtools_stderr, "%s cannot be checked for EOF block as it is not seekable.\n", fn); break; case 3: - if (verbose >= 3) fprintf(pysam_stderr, "%s cannot be checked for EOF block because its filetype does not contain one.\n", fn); + if (verbose >= 3) fprintf(samtools_stderr, "%s cannot be checked for EOF block because its filetype does not contain one.\n", fn); break; } } if (hts_close(hts_fp) < 0) { - file_state |= 32; - if (verbose >= 2) fprintf(pysam_stderr, "%s did not close cleanly.\n", fn); + QC_ERR(QC_FAIL_CLOSE, 2, "%s did not close cleanly.\n", fn); } } if (file_state > 0 && verbose >= 1) { - fprintf(pysam_stdout, "%s\n", fn); + fprintf(samtools_stdout, "%s\n", fn); } ret |= file_state; } diff --git a/samtools/bam_reheader.c.pysam.c b/samtools/bam_reheader.c.pysam.c index 562c8e4..4ba6353 100644 --- a/samtools/bam_reheader.c.pysam.c +++ b/samtools/bam_reheader.c.pysam.c @@ -1,4 +1,4 @@ -#include "pysam.h" +#include "samtools.pysam.h" /* bam_reheader.c -- reheader subcommand. 
@@ -54,11 +54,11 @@ int bam_reheader(BGZF *in, bam_hdr_t *h, int fd, if (in->is_write) return -1; buf = malloc(BUF_SIZE); if (!buf) { - fprintf(pysam_stderr, "Out of memory\n"); + fprintf(samtools_stderr, "Out of memory\n"); return -1; } if (bam_hdr_read(in) == NULL) { - fprintf(pysam_stderr, "Couldn't read header\n"); + fprintf(samtools_stderr, "Couldn't read header\n"); goto fail; } fp = bgzf_fdopen(fd, "w"); @@ -100,13 +100,13 @@ int bam_reheader(BGZF *in, bam_hdr_t *h, int fd, if (bgzf_raw_write(fp, buf, len) < 0) goto write_fail; } if (len < 0) { - fprintf(pysam_stderr, "[%s] Error reading input file\n", __func__); + fprintf(samtools_stderr, "[%s] Error reading input file\n", __func__); goto fail; } free(buf); fp->block_offset = in->block_offset = 0; if (bgzf_close(fp) < 0) { - fprintf(pysam_stderr, "[%s] Error closing output file\n", __func__); + fprintf(samtools_stderr, "[%s] Error closing output file\n", __func__); return -1; } return 0; @@ -121,7 +121,7 @@ int bam_reheader(BGZF *in, bam_hdr_t *h, int fd, } /* - * Reads a file and outputs a new CRAM file to pysam_stdout with 'h' + * Reads a file and outputs a new CRAM file to samtools_stdout with 'h' * replaced as the header. No checks are made to the validity. * * FIXME: error checking @@ -205,7 +205,7 @@ int cram_reheader_inplace2(cram_fd *fd, const bam_hdr_t *h, const char *arg_list if (cram_major_vers(fd) < 2 || cram_major_vers(fd) > 3) { - fprintf(pysam_stderr, "[%s] unsupported CRAM version %d\n", __func__, + fprintf(samtools_stderr, "[%s] unsupported CRAM version %d\n", __func__, cram_major_vers(fd)); goto err; } @@ -238,7 +238,7 @@ int cram_reheader_inplace2(cram_fd *fd, const bam_hdr_t *h, const char *arg_list goto err; if (cram_block_get_uncomp_size(b) < header_len+4) { - fprintf(pysam_stderr, "New header will not fit. Use non-inplace version (%d > %d)\n", + fprintf(samtools_stderr, "New header will not fit. 
Use non-inplace version (%d > %d)\n", header_len+4, cram_block_get_uncomp_size(b)); ret = -2; goto err; @@ -301,7 +301,7 @@ int cram_reheader_inplace3(cram_fd *fd, const bam_hdr_t *h, const char *arg_list if (cram_major_vers(fd) < 2 || cram_major_vers(fd) > 3) { - fprintf(pysam_stderr, "[%s] unsupported CRAM version %d\n", __func__, + fprintf(samtools_stderr, "[%s] unsupported CRAM version %d\n", __func__, cram_major_vers(fd)); goto err; } @@ -373,7 +373,7 @@ int cram_reheader_inplace3(cram_fd *fd, const bam_hdr_t *h, const char *arg_list goto err; if (old_container_sz != container_sz) { - fprintf(pysam_stderr, "Quirk of fate makes this troublesome! " + fprintf(samtools_stderr, "Quirk of fate makes this troublesome! " "Please use non-inplace version.\n"); goto err; } @@ -392,7 +392,7 @@ int cram_reheader_inplace3(cram_fd *fd, const bam_hdr_t *h, const char *arg_list goto err; if (cram_block_size(b) > cram_container_get_length(c)) { - fprintf(pysam_stderr, "New header will not fit. Use non-inplace version" + fprintf(samtools_stderr, "New header will not fit. 
Use non-inplace version" " (%d > %d)\n", (int)cram_block_size(b), cram_container_get_length(c)); ret = -2; @@ -430,7 +430,7 @@ int cram_reheader_inplace(cram_fd *fd, const bam_hdr_t *h, const char *arg_list, case 2: return cram_reheader_inplace2(fd, h, arg_list, add_PG); case 3: return cram_reheader_inplace3(fd, h, arg_list, add_PG); default: - fprintf(pysam_stderr, "[%s] unsupported CRAM version %d\n", __func__, + fprintf(samtools_stderr, "[%s] unsupported CRAM version %d\n", __func__, cram_major_vers(fd)); return -1; } @@ -444,7 +444,7 @@ static void usage(FILE *fp, int ret) { "Options:\n" " -P, --no-PG Do not generate an @PG header line.\n" " -i, --in-place Modify the bam/cram file directly.\n" - " (Defaults to outputting to pysam_stdout.)\n"); + " (Defaults to outputting to samtools_stdout.)\n"); exit(ret); } @@ -466,15 +466,15 @@ int main_reheader(int argc, char *argv[]) switch (c) { case 'P': add_PG = 0; break; case 'i': inplace = 1; break; - case 'h': usage(pysam_stdout, 0); break; + case 'h': usage(samtools_stdout, 0); break; default: - fprintf(pysam_stderr, "Invalid option '%c'\n", c); - usage(pysam_stderr, 1); + fprintf(samtools_stderr, "Invalid option '%c'\n", c); + usage(samtools_stderr, 1); } } if (argc - optind != 2) - usage(pysam_stderr, 1); + usage(samtools_stderr, 1); { // read the header samFile *fph = sam_open(argv[optind], "r"); @@ -485,7 +485,7 @@ int main_reheader(int argc, char *argv[]) h = sam_hdr_read(fph); sam_close(fph); if (h == NULL) { - fprintf(pysam_stderr, "[%s] failed to read the header for '%s'.\n", + fprintf(samtools_stderr, "[%s] failed to read the header for '%s'.\n", __func__, argv[1]); return 1; } @@ -496,7 +496,7 @@ int main_reheader(int argc, char *argv[]) return 1; } if (hts_get_format(in)->format == bam) { - r = bam_reheader(in->fp.bgzf, h, fileno(pysam_stdout), arg_list, add_PG); + r = bam_reheader(in->fp.bgzf, h, fileno(samtools_stdout), arg_list, add_PG); } else { if (inplace) r = cram_reheader_inplace(in->fp.cram, h, 
arg_list, add_PG); diff --git a/samtools/bam_rmdup.c.pysam.c b/samtools/bam_rmdup.c.pysam.c index 6742fc8..64326de 100644 --- a/samtools/bam_rmdup.c.pysam.c +++ b/samtools/bam_rmdup.c.pysam.c @@ -1,4 +1,4 @@ -#include "pysam.h" +#include "samtools.pysam.h" /* bam_rmdup.c -- duplicate read detection. @@ -156,7 +156,7 @@ int bam_rmdup_core(samFile *in, bam_hdr_t *hdr, samFile *out) if (c->tid != last_tid) { clear_best(aux, 0); if (kh_size(del_set)) { // check - fprintf(pysam_stderr, "[bam_rmdup_core] %llu unmatched pairs\n", (long long)kh_size(del_set)); + fprintf(samtools_stderr, "[bam_rmdup_core] %llu unmatched pairs\n", (long long)kh_size(del_set)); clear_del_set(del_set); } if ((int)c->tid == -1) { // append unmapped reads @@ -167,7 +167,7 @@ int bam_rmdup_core(samFile *in, bam_hdr_t *hdr, samFile *out) break; } last_tid = c->tid; - fprintf(pysam_stderr, "[bam_rmdup_core] processing reference %s...\n", hdr->target_name[c->tid]); + fprintf(samtools_stderr, "[bam_rmdup_core] processing reference %s...\n", hdr->target_name[c->tid]); } } if (!(c->flag&BAM_FPAIRED) || (c->flag&(BAM_FUNMAP|BAM_FMUNMAP)) || (c->mtid >= 0 && c->tid != c->mtid)) { @@ -189,7 +189,7 @@ int bam_rmdup_core(samFile *in, bam_hdr_t *hdr, samFile *out) bam_copy1(p, b); // replaced as b } else kh_put(name, del_set, strdup(bam_get_qname(b)), &ret); // b will be removed if (ret == 0) - fprintf(pysam_stderr, "[bam_rmdup_core] inconsistent BAM file for pair '%s'. Continue anyway.\n", bam_get_qname(b)); + fprintf(samtools_stderr, "[bam_rmdup_core] inconsistent BAM file for pair '%s'. 
Continue anyway.\n", bam_get_qname(b)); } else { // not found in best_hash kh_val(q->best_hash, k) = bam_dup1(b); stack_insert(&stack, kh_val(q->best_hash, k)); @@ -206,7 +206,7 @@ int bam_rmdup_core(samFile *in, bam_hdr_t *hdr, samFile *out) last_pos = c->pos; } if (r < -1) { - fprintf(pysam_stderr, "[%s] failed to read input file\n", __func__); + fprintf(samtools_stderr, "[%s] failed to read input file\n", __func__); goto fail; } @@ -214,7 +214,7 @@ int bam_rmdup_core(samFile *in, bam_hdr_t *hdr, samFile *out) if (kh_exist(aux, k)) { lib_aux_t *q = &kh_val(aux, k); if (dump_best(&stack, out, hdr) < 0) goto write_fail; - fprintf(pysam_stderr, "[bam_rmdup_core] %lld / %lld = %.4lf in library '%s'\n", (long long)q->n_removed, + fprintf(samtools_stderr, "[bam_rmdup_core] %lld / %lld = %.4lf in library '%s'\n", (long long)q->n_removed, (long long)q->n_checked, (double)q->n_removed/q->n_checked, kh_key(aux, k)); kh_destroy(pos, q->best_hash); free((char*)kh_key(aux, k)); @@ -255,12 +255,12 @@ int bam_rmdup_core(samFile *in, bam_hdr_t *hdr, samFile *out) int bam_rmdupse_core(samFile *in, bam_hdr_t *hdr, samFile *out, int force_se); static int rmdup_usage(void) { - fprintf(pysam_stderr, "\n"); - fprintf(pysam_stderr, "Usage: samtools rmdup [-sS] \n\n"); - fprintf(pysam_stderr, "Option: -s rmdup for SE reads\n"); - fprintf(pysam_stderr, " -S treat PE reads as SE in rmdup (force -s)\n"); + fprintf(samtools_stderr, "\n"); + fprintf(samtools_stderr, "Usage: samtools rmdup [-sS] \n\n"); + fprintf(samtools_stderr, "Option: -s rmdup for SE reads\n"); + fprintf(samtools_stderr, " -S treat PE reads as SE in rmdup (force -s)\n"); - sam_global_opt_help(pysam_stderr, "-....-"); + sam_global_opt_help(samtools_stderr, "-....-"); return 1; } @@ -296,7 +296,7 @@ int bam_rmdup(int argc, char *argv[]) } header = sam_hdr_read(in); if (header == NULL || header->n_targets == 0) { - fprintf(pysam_stderr, "[bam_rmdup] input SAM does not have header. 
Abort!\n"); + fprintf(samtools_stderr, "[bam_rmdup] input SAM does not have header. Abort!\n"); return 1; } @@ -317,7 +317,7 @@ int bam_rmdup(int argc, char *argv[]) bam_hdr_destroy(header); sam_close(in); if (sam_close(out) < 0) { - fprintf(pysam_stderr, "[bam_rmdup] error closing output file\n"); + fprintf(samtools_stderr, "[bam_rmdup] error closing output file\n"); ret = 1; } return ret; diff --git a/samtools/bam_rmdupse.c.pysam.c b/samtools/bam_rmdupse.c.pysam.c index 3a3d0d0..2038dbf 100644 --- a/samtools/bam_rmdupse.c.pysam.c +++ b/samtools/bam_rmdupse.c.pysam.c @@ -1,4 +1,4 @@ -#include "pysam.h" +#include "samtools.pysam.h" /* bam_rmdupse.c -- duplicate read detection for unpaired reads. @@ -188,7 +188,7 @@ int bam_rmdupse_core(samFile *in, bam_hdr_t *hdr, samFile *out, int force_se) } } if (r < -1) { - fprintf(pysam_stderr, "[%s] error reading input file\n", __func__); + fprintf(samtools_stderr, "[%s] error reading input file\n", __func__); goto fail; } @@ -197,7 +197,7 @@ int bam_rmdupse_core(samFile *in, bam_hdr_t *hdr, samFile *out, int force_se) for (k = kh_begin(aux); k != kh_end(aux); ++k) { if (kh_exist(aux, k)) { lib_aux_t *q = &kh_val(aux, k); - fprintf(pysam_stderr, "[bam_rmdupse_core] %lld / %lld = %.4lf in library '%s'\n", (long long)q->n_removed, + fprintf(samtools_stderr, "[bam_rmdupse_core] %lld / %lld = %.4lf in library '%s'\n", (long long)q->n_removed, (long long)q->n_checked, (double)q->n_removed/q->n_checked, kh_key(aux, k)); kh_destroy(best, q->left); kh_destroy(best, q->rght); free((char*)kh_key(aux, k)); diff --git a/samtools/bam_sort.c b/samtools/bam_sort.c index b1d5898..509c1d9 100644 --- a/samtools/bam_sort.c +++ b/samtools/bam_sort.c @@ -1787,41 +1787,6 @@ static int bam_merge_simple(int by_qname, char *sort_tag, const char *out, return -1; } -static int change_SO(bam_hdr_t *h, const char *so) -{ - char *p, *q, *beg = NULL, *end = NULL, *newtext; - if (h->l_text > 3) { - if (strncmp(h->text, "@HD", 3) == 0) { - if ((p = 
strchr(h->text, '\n')) == 0) return -1; - *p = '\0'; - if ((q = strstr(h->text, "\tSO:")) != 0) { - *p = '\n'; // change back - if (strncmp(q + 4, so, p - q - 4) != 0) { - beg = q; - for (q += 4; *q != '\n' && *q != '\t'; ++q); - end = q; - } else return 0; // no need to change - } else beg = end = p, *p = '\n'; - } - } - if (beg == NULL) { // no @HD - h->l_text += strlen(so) + 15; - newtext = (char*)malloc(h->l_text + 1); - if (!newtext) return -1; - snprintf(newtext, h->l_text + 1, - "@HD\tVN:1.3\tSO:%s\n%s", so, h->text); - } else { // has @HD but different or no SO - h->l_text = (beg - h->text) + (4 + strlen(so)) + (h->text + h->l_text - end); - newtext = (char*)malloc(h->l_text + 1); - if (!newtext) return -1; - snprintf(newtext, h->l_text + 1, "%.*s\tSO:%s%s", - (int) (beg - h->text), h->text, so, end); - } - free(h->text); - h->text = newtext; - return 0; -} - // Function to compare reads and determine which one is < or > the other // Handle sort-by-pos and sort-by-name. Used as the secondary sort in bam1_lt_by_tag, if reads are equivalent by tag. // Returns a value less than, equal to or greater than zero if a is less than, @@ -2120,11 +2085,16 @@ int bam_sort_core_ext(int is_by_qname, char* sort_by_tag, const char *fn, const else new_so = "coordinate"; - if (change_SO(header, new_so) != 0) { + if (sam_hdr_change_HD(header, "SO", new_so) != 0) { print_error("sort", "failed to change sort order header to '%s'\n", new_so); goto err; } + if (sam_hdr_change_HD(header, "GO", NULL) != 0) { + print_error("sort", + "failed to delete group order header\n"); + goto err; + } // No gain to using the thread pool here as the flow of this code // is such that we are *either* reading *or* sorting. 
Hence a shared diff --git a/samtools/bam_sort.c.pysam.c b/samtools/bam_sort.c.pysam.c index 8989fc5..d38a311 100644 --- a/samtools/bam_sort.c.pysam.c +++ b/samtools/bam_sort.c.pysam.c @@ -1,4 +1,4 @@ -#include "pysam.h" +#include "samtools.pysam.h" /* bam_sort.c -- sorting and merging. @@ -526,7 +526,7 @@ static int trans_tbl_add_sq(merged_header_t* merged_hdr, bam_hdr_t *translate, if (iter == kh_end(sq_tids)) { // Warn about this, but it's not really fatal. - fprintf(pysam_stderr, "[W::%s] @SQ SN (%.*s) found in text header but not binary header.\n", + fprintf(samtools_stderr, "[W::%s] @SQ SN (%.*s) found in text header but not binary header.\n", __func__, (int) (matches[1].rm_eo - matches[1].rm_so), text + matches[1].rm_so); @@ -763,7 +763,7 @@ static int finish_rg_pg(bool is_rg, klist_t(hdrln) *hdr_lines, idx = kh_get(c2c, pg_map, id); if (idx == kh_end(pg_map)) { // Not found, warn. - fprintf(pysam_stderr, "[W::%s] Tag %s%s not found in @PG records\n", + fprintf(samtools_stderr, "[W::%s] Tag %s%s not found in @PG records\n", __func__, search + 1, id); } else { // Remember new id and splice points on original string @@ -914,7 +914,7 @@ static bam_hdr_t * finish_merged_header(merged_header_t *merged_hdr) { + ks_len(&merged_hdr->out_pg) + ks_len(&merged_hdr->out_co)); if (txt_sz >= INT32_MAX) { - fprintf(pysam_stderr, "[%s] Output header text too long\n", __func__); + fprintf(samtools_stderr, "[%s] Output header text too long\n", __func__); return NULL; } @@ -1031,7 +1031,7 @@ static void bam_translate(bam1_t* b, trans_tbl_t* tbl) } } else { char *tmp = strdup(decoded_rg); - fprintf(pysam_stderr, + fprintf(samtools_stderr, "[bam_translate] RG tag \"%s\" on read \"%s\" encountered " "with no corresponding entry in header, tag lost. 
" "Unknown tags are only reported once per input file for " @@ -1061,7 +1061,7 @@ static void bam_translate(bam1_t* b, trans_tbl_t* tbl) } } else { char *tmp = strdup(decoded_pg); - fprintf(pysam_stderr, + fprintf(samtools_stderr, "[bam_translate] PG tag \"%s\" on read \"%s\" encountered " "with no corresponding entry in header, tag lost. " "Unknown tags are only reported once per input file for " @@ -1255,7 +1255,7 @@ int bam_merge_core2(int by_qname, char* sort_tag, const char *out, const char *m else { bam_hdr_destroy(hin); hdr[i] = NULL; } if ((translation_tbl+i)->lost_coord_sort && !by_qname) { - fprintf(pysam_stderr, "[bam_merge_core] Order of targets in file %s caused coordinate sort to be lost\n", fn[i]); + fprintf(samtools_stderr, "[bam_merge_core] Order of targets in file %s caused coordinate sort to be lost\n", fn[i]); } // Potential future improvement is to share headers between CRAM files for @@ -1271,7 +1271,7 @@ int bam_merge_core2(int by_qname, char* sort_tag, const char *out, const char *m // Did we get an @HD line? if (!merged_hdr->have_hd) { - fprintf(pysam_stderr, "[W::%s] No @HD tag found.\n", __func__); + fprintf(samtools_stderr, "[W::%s] No @HD tag found.\n", __func__); /* FIXME: Should we add an @HD line here, and if so what should we put in it? Ideally we want a way of getting htslib to tell us the SAM version number to assume given no @HD line. 
Is @@ -1308,8 +1308,8 @@ int bam_merge_core2(int by_qname, char* sort_tag, const char *out, const char *m end = INT_MAX; } if (tid < 0) { - if (name_lim) fprintf(pysam_stderr, "[%s] Region \"%s\" specifies an unknown reference name\n", __func__, reg); - else fprintf(pysam_stderr, "[%s] Badly formatted region: \"%s\"\n", __func__, reg); + if (name_lim) fprintf(samtools_stderr, "[%s] Region \"%s\" specifies an unknown reference name\n", __func__, reg); + else fprintf(samtools_stderr, "[%s] Badly formatted region: \"%s\"\n", __func__, reg); goto fail; } for (i = 0; i < n; ++i) { @@ -1317,7 +1317,7 @@ int bam_merge_core2(int by_qname, char* sort_tag, const char *out, const char *m // (rtrans[i*n+tid]) Look up what hout tid translates to in input tid space int mapped_tid = rtrans[i*hout->n_targets+tid]; if (idx == NULL) { - fprintf(pysam_stderr, "[%s] failed to load index for %s. Random alignment retrieval only works for indexed BAM or CRAM files.\n", + fprintf(samtools_stderr, "[%s] failed to load index for %s. 
Random alignment retrieval only works for indexed BAM or CRAM files.\n", __func__, fn[i]); goto fail; } @@ -1329,12 +1329,12 @@ int bam_merge_core2(int by_qname, char* sort_tag, const char *out, const char *m hts_idx_destroy(idx); if (iter[i] == NULL) { if (mapped_tid != INT32_MIN) { - fprintf(pysam_stderr, + fprintf(samtools_stderr, "[%s] failed to get iterator over " "{%s, %d, %d, %d}\n", __func__, fn[i], mapped_tid, beg, end); } else { - fprintf(pysam_stderr, + fprintf(samtools_stderr, "[%s] failed to get iterator over " "{%s, HTS_IDX_NONE, 0, 0}\n", __func__, fn[i]); @@ -1349,7 +1349,7 @@ int bam_merge_core2(int by_qname, char* sort_tag, const char *out, const char *m if (hdr[i] == NULL) { iter[i] = sam_itr_queryi(NULL, HTS_IDX_REST, 0, 0); if (iter[i] == NULL) { - fprintf(pysam_stderr, "[%s] failed to get iterator\n", __func__); + fprintf(samtools_stderr, "[%s] failed to get iterator\n", __func__); goto fail; } } @@ -1532,7 +1532,7 @@ int bam_merge(int argc, char *argv[]) }; if (argc == 1) { - merge_usage(pysam_stdout); + merge_usage(samtools_stdout); return 0; } @@ -1571,12 +1571,12 @@ int bam_merge(int argc, char *argv[]) default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; /* else fall-through */ - case '?': merge_usage(pysam_stderr); return 1; + case '?': merge_usage(samtools_stderr); return 1; } } if ( argc - optind < 1 ) { print_error("merge", "You must at least specify the output file"); - merge_usage(pysam_stderr); + merge_usage(samtools_stderr); return 1; } @@ -1585,7 +1585,7 @@ int bam_merge(int argc, char *argv[]) FILE *fp = fopen(argv[optind], "rb"); if (fp != NULL) { fclose(fp); - fprintf(pysam_stderr, "[%s] File '%s' exists. Please apply '-f' to overwrite. Abort.\n", __func__, argv[optind]); + fprintf(samtools_stderr, "[%s] File '%s' exists. Please apply '-f' to overwrite. 
Abort.\n", __func__, argv[optind]); return 1; } } @@ -1599,7 +1599,7 @@ int bam_merge(int argc, char *argv[]) } if (fn_size+nargcfiles < 1) { print_error("merge", "You must specify at least one (and usually two or more) input files"); - merge_usage(pysam_stderr); + merge_usage(samtools_stderr); return 1; } strcpy(mode, "wb"); @@ -1789,41 +1789,6 @@ static int bam_merge_simple(int by_qname, char *sort_tag, const char *out, return -1; } -static int change_SO(bam_hdr_t *h, const char *so) -{ - char *p, *q, *beg = NULL, *end = NULL, *newtext; - if (h->l_text > 3) { - if (strncmp(h->text, "@HD", 3) == 0) { - if ((p = strchr(h->text, '\n')) == 0) return -1; - *p = '\0'; - if ((q = strstr(h->text, "\tSO:")) != 0) { - *p = '\n'; // change back - if (strncmp(q + 4, so, p - q - 4) != 0) { - beg = q; - for (q += 4; *q != '\n' && *q != '\t'; ++q); - end = q; - } else return 0; // no need to change - } else beg = end = p, *p = '\n'; - } - } - if (beg == NULL) { // no @HD - h->l_text += strlen(so) + 15; - newtext = (char*)malloc(h->l_text + 1); - if (!newtext) return -1; - snprintf(newtext, h->l_text + 1, - "@HD\tVN:1.3\tSO:%s\n%s", so, h->text); - } else { // has @HD but different or no SO - h->l_text = (beg - h->text) + (4 + strlen(so)) + (h->text + h->l_text - end); - newtext = (char*)malloc(h->l_text + 1); - if (!newtext) return -1; - snprintf(newtext, h->l_text + 1, "%.*s\tSO:%s%s", - (int) (beg - h->text), h->text, so, end); - } - free(h->text); - h->text = newtext; - return 0; -} - // Function to compare reads and determine which one is < or > the other // Handle sort-by-pos and sort-by-name. Used as the secondary sort in bam1_lt_by_tag, if reads are equivalent by tag. 
// Returns a value less than, equal to or greater than zero if a is less than, @@ -2122,11 +2087,16 @@ int bam_sort_core_ext(int is_by_qname, char* sort_by_tag, const char *fn, const else new_so = "coordinate"; - if (change_SO(header, new_so) != 0) { + if (sam_hdr_change_HD(header, "SO", new_so) != 0) { print_error("sort", "failed to change sort order header to '%s'\n", new_so); goto err; } + if (sam_hdr_change_HD(header, "GO", NULL) != 0) { + print_error("sort", + "failed to delete group order header\n"); + goto err; + } // No gain to using the thread pool here as the flow of this code // is such that we are *either* reading *or* sorting. Hence a shared @@ -2212,7 +2182,7 @@ int bam_sort_core_ext(int is_by_qname, char* sort_by_tag, const char *fn, const goto err; } } else { // then merge - fprintf(pysam_stderr, + fprintf(samtools_stderr, "[bam_sort_core] merging from %d files and %d in-memory blocks...\n", n_files, num_in_mem); fns = (char**)calloc(n_files, sizeof(char*)); @@ -2284,7 +2254,7 @@ static void complain_about_memory_setting(size_t max_mem) { if (max_mem > nine_k) { max_mem >>= 10; suffix = "K"; } if (max_mem > nine_k) { max_mem >>= 10; suffix = "M"; } - fprintf(pysam_stderr, + fprintf(samtools_stderr, "[bam_sort] -m setting (%zu%s bytes) is less than the minimum required (%zuM).\n\n" "Trying to run with -m too small can lead to the creation of a very large number\n" "of temporary files. 
This may make sort fail due to it exceeding limits on the\n" @@ -2329,22 +2299,22 @@ int bam_sort(int argc, char *argv[]) default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; /* else fall-through */ - case '?': sort_usage(pysam_stderr); ret = EXIT_FAILURE; goto sort_end; + case '?': sort_usage(samtools_stderr); ret = EXIT_FAILURE; goto sort_end; } } nargs = argc - optind; if (nargs == 0 && isatty(STDIN_FILENO)) { - sort_usage(pysam_stdout); + sort_usage(samtools_stdout); ret = EXIT_SUCCESS; goto sort_end; } else if (nargs >= 2) { // If exactly two, user probably tried to specify legacy if (nargs == 2) - fprintf(pysam_stderr, "[bam_sort] Use -T PREFIX / -o FILE to specify temporary and final output files\n"); + fprintf(samtools_stderr, "[bam_sort] Use -T PREFIX / -o FILE to specify temporary and final output files\n"); - sort_usage(pysam_stderr); + sort_usage(samtools_stderr); ret = EXIT_FAILURE; goto sort_end; } @@ -2379,7 +2349,7 @@ int bam_sort(int argc, char *argv[]) // If we failed on opening the input file & it has no .bam/.cram/etc // extension, the user probably tried legacy -o if (ret == -2 && o_seen && nargs > 0 && sam_open_mode(dummy, argv[optind], NULL) < 0) - fprintf(pysam_stderr, "[bam_sort] Note the argument has been replaced by -T/-o options\n"); + fprintf(samtools_stderr, "[bam_sort] Note the argument has been replaced by -T/-o options\n"); ret = EXIT_FAILURE; } diff --git a/samtools/bam_split.c.pysam.c b/samtools/bam_split.c.pysam.c index 8a584ed..9395c81 100644 --- a/samtools/bam_split.c.pysam.c +++ b/samtools/bam_split.c.pysam.c @@ -1,4 +1,4 @@ -#include "pysam.h" +#include "samtools.pysam.h" /* bam_split.c -- split subcommand. 
@@ -99,7 +99,7 @@ static void usage(FILE *write_to) // Takes the command line options and turns them into something we can understand static parsed_opts_t* parse_args(int argc, char** argv) { - if (argc == 1) { usage(pysam_stdout); return NULL; } + if (argc == 1) { usage(samtools_stdout); return NULL; } const char* optstring = "vf:u:@:"; char* delim; @@ -137,7 +137,7 @@ static parsed_opts_t* parse_args(int argc, char** argv) if (parse_sam_global_opt(opt, optarg, lopts, &retval->ga) == 0) break; /* else fall-through */ case '?': - usage(pysam_stdout); + usage(samtools_stdout); free(retval); return NULL; } @@ -150,7 +150,7 @@ static parsed_opts_t* parse_args(int argc, char** argv) if (argc != 1) { print_error("split", "Invalid number of arguments: %d", argc); - usage(pysam_stderr); + usage(samtools_stderr); free(retval); return NULL; } @@ -191,11 +191,11 @@ static char* expand_format_string(const char* format_string, const char* basenam kputs("bam", &str); break; case '\0': - // Error is: fprintf(pysam_stderr, "bad format string, trailing %%\n"); + // Error is: fprintf(samtools_stderr, "bad format string, trailing %%\n"); free(str.s); return NULL; default: - // Error is: fprintf(pysam_stderr, "bad format string, unknown format specifier\n"); + // Error is: fprintf(samtools_stderr, "bad format string, unknown format specifier\n"); free(str.s); return NULL; } @@ -351,7 +351,7 @@ static state_t* init(parsed_opts_t* opts, const char *arg_list) if (opts->ga.nthreads > 0) { if (!(retval->p.pool = hts_tpool_init(opts->ga.nthreads))) { - fprintf(pysam_stderr, "Error creating thread pool\n"); + fprintf(samtools_stderr, "Error creating thread pool\n"); return NULL; } } @@ -402,7 +402,7 @@ static state_t* init(parsed_opts_t* opts, const char *arg_list) // Open output files for RGs if (!count_RG(retval->merged_input_header, &retval->output_count, &retval->rg_id)) return NULL; - if (opts->verbose) fprintf(pysam_stderr, "@RG's found %zu\n",retval->output_count); + if 
(opts->verbose) fprintf(samtools_stderr, "@RG's found %zu\n",retval->output_count); retval->rg_output_file_name = (char **)calloc(retval->output_count, sizeof(char *)); retval->rg_output_file = (samFile**)calloc(retval->output_count, sizeof(samFile*)); @@ -522,9 +522,9 @@ static bool split(state_t* state) // otherwise write to the unaccounted bam if there is one or fail if (state->unaccounted_file == NULL) { if (tag) { - fprintf(pysam_stderr, "Read \"%s\" with unaccounted for tag \"%s\".\n", bam_get_qname(file_read), bam_aux2Z(tag)); + fprintf(samtools_stderr, "Read \"%s\" with unaccounted for tag \"%s\".\n", bam_get_qname(file_read), bam_aux2Z(tag)); } else { - fprintf(pysam_stderr, "Read \"%s\" has no RG tag.\n", bam_get_qname(file_read)); + fprintf(samtools_stderr, "Read \"%s\" has no RG tag.\n", bam_get_qname(file_read)); } bam_destroy1(file_read); return false; diff --git a/samtools/bam_stat.c.pysam.c b/samtools/bam_stat.c.pysam.c index 31e9b28..40c17c4 100644 --- a/samtools/bam_stat.c.pysam.c +++ b/samtools/bam_stat.c.pysam.c @@ -1,4 +1,4 @@ -#include "pysam.h" +#include "samtools.pysam.h" /* bam_stat.c -- flagstat subcommand. @@ -84,7 +84,7 @@ bam_flagstat_t *bam_flagstat_core(samFile *fp, bam_hdr_t *h) flagstat_loop(s, c); bam_destroy1(b); if (ret != -1) - fprintf(pysam_stderr, "[bam_flagstat_core] Truncated file? Continue anyway.\n"); + fprintf(samtools_stderr, "[bam_flagstat_core] Truncated file? 
Continue anyway.\n"); return s; } @@ -125,13 +125,13 @@ int bam_flagstat(int argc, char *argv[]) default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; /* else fall-through */ case '?': - usage_exit(pysam_stderr, EXIT_FAILURE); + usage_exit(samtools_stderr, EXIT_FAILURE); } } if (argc != optind+1) { - if (argc == optind) usage_exit(pysam_stdout, EXIT_SUCCESS); - else usage_exit(pysam_stderr, EXIT_FAILURE); + if (argc == optind) usage_exit(samtools_stdout, EXIT_SUCCESS); + else usage_exit(samtools_stderr, EXIT_FAILURE); } fp = sam_open_format(argv[optind], "r", &ga.in); if (fp == NULL) { @@ -143,34 +143,34 @@ int bam_flagstat(int argc, char *argv[]) if (hts_set_opt(fp, CRAM_OPT_REQUIRED_FIELDS, SAM_FLAG | SAM_MAPQ | SAM_RNEXT)) { - fprintf(pysam_stderr, "Failed to set CRAM_OPT_REQUIRED_FIELDS value\n"); + fprintf(samtools_stderr, "Failed to set CRAM_OPT_REQUIRED_FIELDS value\n"); return 1; } if (hts_set_opt(fp, CRAM_OPT_DECODE_MD, 0)) { - fprintf(pysam_stderr, "Failed to set CRAM_OPT_DECODE_MD value\n"); + fprintf(samtools_stderr, "Failed to set CRAM_OPT_DECODE_MD value\n"); return 1; } header = sam_hdr_read(fp); if (header == NULL) { - fprintf(pysam_stderr, "Failed to read header for \"%s\"\n", argv[optind]); + fprintf(samtools_stderr, "Failed to read header for \"%s\"\n", argv[optind]); return 1; } s = bam_flagstat_core(fp, header); - fprintf(pysam_stdout, "%lld + %lld in total (QC-passed reads + QC-failed reads)\n", s->n_reads[0], s->n_reads[1]); - fprintf(pysam_stdout, "%lld + %lld secondary\n", s->n_secondary[0], s->n_secondary[1]); - fprintf(pysam_stdout, "%lld + %lld supplementary\n", s->n_supp[0], s->n_supp[1]); - fprintf(pysam_stdout, "%lld + %lld duplicates\n", s->n_dup[0], s->n_dup[1]); - fprintf(pysam_stdout, "%lld + %lld mapped (%s : %s)\n", s->n_mapped[0], s->n_mapped[1], percent(b0, s->n_mapped[0], s->n_reads[0]), percent(b1, s->n_mapped[1], s->n_reads[1])); - fprintf(pysam_stdout, "%lld + %lld paired in sequencing\n", s->n_pair_all[0], 
s->n_pair_all[1]); - fprintf(pysam_stdout, "%lld + %lld read1\n", s->n_read1[0], s->n_read1[1]); - fprintf(pysam_stdout, "%lld + %lld read2\n", s->n_read2[0], s->n_read2[1]); - fprintf(pysam_stdout, "%lld + %lld properly paired (%s : %s)\n", s->n_pair_good[0], s->n_pair_good[1], percent(b0, s->n_pair_good[0], s->n_pair_all[0]), percent(b1, s->n_pair_good[1], s->n_pair_all[1])); - fprintf(pysam_stdout, "%lld + %lld with itself and mate mapped\n", s->n_pair_map[0], s->n_pair_map[1]); - fprintf(pysam_stdout, "%lld + %lld singletons (%s : %s)\n", s->n_sgltn[0], s->n_sgltn[1], percent(b0, s->n_sgltn[0], s->n_pair_all[0]), percent(b1, s->n_sgltn[1], s->n_pair_all[1])); - fprintf(pysam_stdout, "%lld + %lld with mate mapped to a different chr\n", s->n_diffchr[0], s->n_diffchr[1]); - fprintf(pysam_stdout, "%lld + %lld with mate mapped to a different chr (mapQ>=5)\n", s->n_diffhigh[0], s->n_diffhigh[1]); + fprintf(samtools_stdout, "%lld + %lld in total (QC-passed reads + QC-failed reads)\n", s->n_reads[0], s->n_reads[1]); + fprintf(samtools_stdout, "%lld + %lld secondary\n", s->n_secondary[0], s->n_secondary[1]); + fprintf(samtools_stdout, "%lld + %lld supplementary\n", s->n_supp[0], s->n_supp[1]); + fprintf(samtools_stdout, "%lld + %lld duplicates\n", s->n_dup[0], s->n_dup[1]); + fprintf(samtools_stdout, "%lld + %lld mapped (%s : %s)\n", s->n_mapped[0], s->n_mapped[1], percent(b0, s->n_mapped[0], s->n_reads[0]), percent(b1, s->n_mapped[1], s->n_reads[1])); + fprintf(samtools_stdout, "%lld + %lld paired in sequencing\n", s->n_pair_all[0], s->n_pair_all[1]); + fprintf(samtools_stdout, "%lld + %lld read1\n", s->n_read1[0], s->n_read1[1]); + fprintf(samtools_stdout, "%lld + %lld read2\n", s->n_read2[0], s->n_read2[1]); + fprintf(samtools_stdout, "%lld + %lld properly paired (%s : %s)\n", s->n_pair_good[0], s->n_pair_good[1], percent(b0, s->n_pair_good[0], s->n_pair_all[0]), percent(b1, s->n_pair_good[1], s->n_pair_all[1])); + fprintf(samtools_stdout, "%lld + %lld with itself 
and mate mapped\n", s->n_pair_map[0], s->n_pair_map[1]); + fprintf(samtools_stdout, "%lld + %lld singletons (%s : %s)\n", s->n_sgltn[0], s->n_sgltn[1], percent(b0, s->n_sgltn[0], s->n_pair_all[0]), percent(b1, s->n_sgltn[1], s->n_pair_all[1])); + fprintf(samtools_stdout, "%lld + %lld with mate mapped to a different chr\n", s->n_diffchr[0], s->n_diffchr[1]); + fprintf(samtools_stdout, "%lld + %lld with mate mapped to a different chr (mapQ>=5)\n", s->n_diffhigh[0], s->n_diffhigh[1]); free(s); bam_hdr_destroy(header); sam_close(fp); diff --git a/samtools/bamshuf.c b/samtools/bamshuf.c index e24689e..c1c89fb 100644 --- a/samtools/bamshuf.c +++ b/samtools/bamshuf.c @@ -110,6 +110,18 @@ static int bamshuf(const char *fn, int n_files, const char *pre, int clevel, fprintf(stderr, "Couldn't read header for '%s'\n", fn); goto fail; } + + if (sam_hdr_change_HD(h, "SO", "unsorted") != 0) { + print_error("collate", + "failed to change sort order header to 'unsorted'\n"); + goto fail; + } + if (sam_hdr_change_HD(h, "GO", "query") != 0) { + print_error("collate", + "failed to change group order header to 'query'\n"); + goto fail; + } + fnt = (char**)calloc(n_files, sizeof(char*)); if (!fnt) goto mem_fail; fpt = (samFile**)calloc(n_files, sizeof(samFile*)); diff --git a/samtools/bamshuf.c.pysam.c b/samtools/bamshuf.c.pysam.c index 04cd37b..008aa0c 100644 --- a/samtools/bamshuf.c.pysam.c +++ b/samtools/bamshuf.c.pysam.c @@ -1,4 +1,4 @@ -#include "pysam.h" +#include "samtools.pysam.h" /* bamshuf.c -- collate subcommand. 
@@ -80,7 +80,7 @@ static inline int elem_lt(elem_t x, elem_t y) KSORT_INIT(bamshuf, elem_t, elem_lt) static int bamshuf(const char *fn, int n_files, const char *pre, int clevel, - int is_pysam_stdout, sam_global_args *ga) + int is_samtools_stdout, sam_global_args *ga) { samFile *fp, *fpw = NULL, **fpt = NULL; char **fnt = NULL, modew[8]; @@ -109,9 +109,21 @@ static int bamshuf(const char *fn, int n_files, const char *pre, int clevel, h = sam_hdr_read(fp); if (h == NULL) { - fprintf(pysam_stderr, "Couldn't read header for '%s'\n", fn); + fprintf(samtools_stderr, "Couldn't read header for '%s'\n", fn); goto fail; } + + if (sam_hdr_change_HD(h, "SO", "unsorted") != 0) { + print_error("collate", + "failed to change sort order header to 'unsorted'\n"); + goto fail; + } + if (sam_hdr_change_HD(h, "GO", "query") != 0) { + print_error("collate", + "failed to change group order header to 'query'\n"); + goto fail; + } + fnt = (char**)calloc(n_files, sizeof(char*)); if (!fnt) goto mem_fail; fpt = (samFile**)calloc(n_files, sizeof(samFile*)); @@ -149,7 +161,7 @@ static int bamshuf(const char *fn, int n_files, const char *pre, int clevel, bam_destroy1(b); b = NULL; if (r < -1) { - fprintf(pysam_stderr, "Error reading input file\n"); + fprintf(samtools_stderr, "Error reading input file\n"); goto fail; } for (i = 0; i < n_files; ++i) { @@ -157,7 +169,7 @@ static int bamshuf(const char *fn, int n_files, const char *pre, int clevel, r = sam_close(fpt[i]); fpt[i] = NULL; if (r < 0) { - fprintf(pysam_stderr, "Error on closing '%s'\n", fnt[i]); + fprintf(samtools_stderr, "Error on closing '%s'\n", fnt[i]); return 1; } @@ -170,7 +182,7 @@ static int bamshuf(const char *fn, int n_files, const char *pre, int clevel, fp = NULL; // merge sprintf(modew, "wb%d", (clevel >= 0 && clevel <= 9)? 
clevel : DEF_CLEVEL); - if (!is_pysam_stdout) { // output to a file + if (!is_samtools_stdout) { // output to a file char *fnw = (char*)calloc(l + 5, 1); if (!fnw) goto mem_fail; if (ga->out.format == unknown_format) @@ -179,9 +191,9 @@ static int bamshuf(const char *fn, int n_files, const char *pre, int clevel, sprintf(fnw, "%s.%s", pre, hts_format_file_extension(&ga->out)); fpw = sam_open_format(fnw, modew, &ga->out); free(fnw); - } else fpw = sam_open_format("-", modew, &ga->out); // output to pysam_stdout + } else fpw = sam_open_format("-", modew, &ga->out); // output to samtools_stdout if (fpw == NULL) { - if (is_pysam_stdout) print_error_errno("collate", "Cannot open standard output"); + if (is_samtools_stdout) print_error_errno("collate", "Cannot open standard output"); else print_error_errno("collate", "Cannot open output file \"%s.bam\"", pre); goto fail; } @@ -212,7 +224,7 @@ static int bamshuf(const char *fn, int n_files, const char *pre, int clevel, // Slurp in one of the split files for (j = 0; j < c; ++j) { if (sam_read1(fp, h, a[j].b) < 0) { - fprintf(pysam_stderr, "Error reading '%s'\n", fnt[i]); + fprintf(samtools_stderr, "Error reading '%s'\n", fnt[i]); goto fail; } a[j].key = hash_X31_Wang(bam_get_qname(a[j].b)); @@ -238,7 +250,7 @@ static int bamshuf(const char *fn, int n_files, const char *pre, int clevel, free(a); free(fnt); free(cnt); sam_global_args_free(ga); if (sam_close(fpw) < 0) { - fprintf(pysam_stderr, "Error on closing output\n"); + fprintf(samtools_stderr, "Error on closing output\n"); return 1; } @@ -246,7 +258,7 @@ static int bamshuf(const char *fn, int n_files, const char *pre, int clevel, return 0; mem_fail: - fprintf(pysam_stderr, "Out of memory\n"); + fprintf(samtools_stderr, "Out of memory\n"); fail: if (fp) sam_close(fp); @@ -273,7 +285,7 @@ static int usage(FILE *fp, int n_files) { fprintf(fp, "Usage: samtools collate [-Ou] [-n nFiles] [-l cLevel] \n\n" "Options:\n" - " -O output to pysam_stdout\n" + " -O output to 
samtools_stdout\n" " -u uncompressed BAM output\n" " -l INT compression level [%d]\n" // DEF_CLEVEL " -n INT number of temporary files [%d]\n", // n_files @@ -286,7 +298,7 @@ static int usage(FILE *fp, int n_files) { int main_bamshuf(int argc, char *argv[]) { - int c, n_files = 64, clevel = DEF_CLEVEL, is_pysam_stdout = 0, is_un = 0; + int c, n_files = 64, clevel = DEF_CLEVEL, is_samtools_stdout = 0, is_un = 0; sam_global_args ga = SAM_GLOBAL_ARGS_INIT; static const struct option lopts[] = { SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0, '@'), @@ -298,15 +310,15 @@ int main_bamshuf(int argc, char *argv[]) case 'n': n_files = atoi(optarg); break; case 'l': clevel = atoi(optarg); break; case 'u': is_un = 1; break; - case 'O': is_pysam_stdout = 1; break; + case 'O': is_samtools_stdout = 1; break; default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; /* else fall-through */ - case '?': return usage(pysam_stderr, n_files); + case '?': return usage(samtools_stderr, n_files); } } if (is_un) clevel = 0; if (optind + 2 > argc) - return usage(pysam_stderr, n_files); + return usage(samtools_stderr, n_files); - return bamshuf(argv[optind], n_files, argv[optind+1], clevel, is_pysam_stdout, &ga); + return bamshuf(argv[optind], n_files, argv[optind+1], clevel, is_samtools_stdout, &ga); } diff --git a/samtools/bamtk.c b/samtools/bamtk.c index 9316386..d1e89c6 100644 --- a/samtools/bamtk.c +++ b/samtools/bamtk.c @@ -1,6 +1,6 @@ /* bamtk.c -- main samtools command front-end. - Copyright (C) 2008-2017 Genome Research Ltd. + Copyright (C) 2008-2018 Genome Research Ltd. Author: Heng Li @@ -31,7 +31,6 @@ DEALINGS IN THE SOFTWARE. 
*/ #include "htslib/hts.h" #include "samtools.h" -#include "version.h" int bam_taf2baf(int argc, char *argv[]); int bam_mpileup(int argc, char *argv[]); @@ -64,10 +63,6 @@ int main_addreplacerg(int argc, char *argv[]); int faidx_main(int argc, char *argv[]); int dict_main(int argc, char *argv[]); -const char *samtools_version() -{ - return SAMTOOLS_VERSION; -} static void usage(FILE *fp) { @@ -90,7 +85,6 @@ static void usage(FILE *fp) " calmd recalculate MD/NM tags and '=' bases\n" " fixmate fix mate information\n" " reheader replace BAM header\n" -" rmdup remove PCR duplicates\n" " targetcut cut fosmid regions (for fosmid pool only)\n" " addreplacerg adds or replaces RG tags\n" " markdup mark duplicates\n" @@ -201,7 +195,7 @@ int main(int argc, char *argv[]) printf( "samtools %s\n" "Using htslib %s\n" -"Copyright (C) 2017 Genome Research Ltd.\n", +"Copyright (C) 2018 Genome Research Ltd.\n", samtools_version(), hts_version()); } else if (strcmp(argv[1], "--version-only") == 0) { diff --git a/samtools/bamtk.c.pysam.c b/samtools/bamtk.c.pysam.c index 67c09c8..e14f01c 100644 --- a/samtools/bamtk.c.pysam.c +++ b/samtools/bamtk.c.pysam.c @@ -1,8 +1,8 @@ -#include "pysam.h" +#include "samtools.pysam.h" /* bamtk.c -- main samtools command front-end. - Copyright (C) 2008-2017 Genome Research Ltd. + Copyright (C) 2008-2018 Genome Research Ltd. Author: Heng Li @@ -33,15 +33,13 @@ DEALINGS IN THE SOFTWARE. 
*/ #include "htslib/hts.h" #include "samtools.h" -#include "version.h" int bam_taf2baf(int argc, char *argv[]); int bam_mpileup(int argc, char *argv[]); int bam_merge(int argc, char *argv[]); int bam_index(int argc, char *argv[]); int bam_sort(int argc, char *argv[]); -/* AH: removed */ -/* int bam_tview_main(int argc, char *argv[]); */ +// int bam_tview_main(int argc, char *argv[]); int bam_mating(int argc, char *argv[]); int bam_rmdup(int argc, char *argv[]); int bam_flagstat(int argc, char *argv[]); @@ -67,10 +65,6 @@ int main_addreplacerg(int argc, char *argv[]); int faidx_main(int argc, char *argv[]); int dict_main(int argc, char *argv[]); -const char *samtools_version() -{ - return SAMTOOLS_VERSION; -} static void usage(FILE *fp) { @@ -93,7 +87,6 @@ static void usage(FILE *fp) " calmd recalculate MD/NM tags and '=' bases\n" " fixmate fix mate information\n" " reheader replace BAM header\n" -" rmdup remove PCR duplicates\n" " targetcut cut fosmid regions (for fosmid pool only)\n" " addreplacerg adds or replaces RG tags\n" " markdup mark duplicates\n" @@ -146,13 +139,13 @@ int _CRT_glob = 0; int samtools_main(int argc, char *argv[]) { #ifdef _WIN32 - setmode(fileno(pysam_stdout), O_BINARY); + setmode(fileno(samtools_stdout), O_BINARY); setmode(fileno(stdin), O_BINARY); #endif - if (argc < 2) { usage(pysam_stderr); return 1; } + if (argc < 2) { usage(samtools_stderr); return 1; } if (strcmp(argv[1], "help") == 0 || strcmp(argv[1], "--help") == 0) { - if (argc == 2) { usage(pysam_stdout); return 0; } + if (argc == 2) { usage(samtools_stdout); return 0; } // Otherwise change "samtools help COMMAND [...]" to "samtools COMMAND"; // main_xyz() functions by convention display the subcommand's usage @@ -196,24 +189,22 @@ int samtools_main(int argc, char *argv[]) else if (strcmp(argv[1], "quickcheck") == 0) ret = main_quickcheck(argc-1, argv+1); else if (strcmp(argv[1], "addreplacerg") == 0) ret = main_addreplacerg(argc-1, argv+1); else if (strcmp(argv[1], "pileup") == 
0) { - fprintf(pysam_stderr, "[main] The `pileup' command has been removed. Please use `mpileup' instead.\n"); + fprintf(samtools_stderr, "[main] The `pileup' command has been removed. Please use `mpileup' instead.\n"); return 1; } -/* AH: - else if (strcmp(argv[1], "tview") == 0) ret = bam_tview_main(argc-1, argv+1); -*/ + // else if (strcmp(argv[1], "tview") == 0) ret = bam_tview_main(argc-1, argv+1); else if (strcmp(argv[1], "--version") == 0) { - fprintf(pysam_stdout, + fprintf(samtools_stdout, "samtools %s\n" "Using htslib %s\n" -"Copyright (C) 2017 Genome Research Ltd.\n", +"Copyright (C) 2018 Genome Research Ltd.\n", samtools_version(), hts_version()); } else if (strcmp(argv[1], "--version-only") == 0) { - fprintf(pysam_stdout, "%s+htslib-%s\n", samtools_version(), hts_version()); + fprintf(samtools_stdout, "%s+htslib-%s\n", samtools_version(), hts_version()); } else { - fprintf(pysam_stderr, "[main] unrecognized command '%s'\n", argv[1]); + fprintf(samtools_stderr, "[main] unrecognized command '%s'\n", argv[1]); return 1; } return ret; diff --git a/samtools/bedcov.c b/samtools/bedcov.c index 1113e17..1098309 100644 --- a/samtools/bedcov.c +++ b/samtools/bedcov.c @@ -128,6 +128,12 @@ int main_bedcov(int argc, char *argv[]) int tid, beg, end, pos; bam_mplp_t mplp; + if (str.l == 0 || *str.s == '#') continue; /* empty or comment line */ + /* Track and browser lines. Also look for a trailing *space* in + case someone has badly-chosen a chromosome name (it would + be followed by a tab in that case). 
*/ + if (strncmp(str.s, "track ", 6) == 0) continue; + if (strncmp(str.s, "browser ", 8) == 0) continue; for (p = q = str.s; *p && *p != '\t'; ++p); if (*p != '\t') goto bed_error; *p = 0; tid = bam_name2id(aux[0]->header, q); *p = '\t'; diff --git a/samtools/bedcov.c.pysam.c b/samtools/bedcov.c.pysam.c index 3fd6d4c..fa7c9a2 100644 --- a/samtools/bedcov.c.pysam.c +++ b/samtools/bedcov.c.pysam.c @@ -1,4 +1,4 @@ -#include "pysam.h" +#include "samtools.pysam.h" /* bedcov.c -- bedcov subcommand. @@ -91,10 +91,10 @@ int main_bedcov(int argc, char *argv[]) if (usage) break; } if (usage || optind + 2 > argc) { - fprintf(pysam_stderr, "Usage: samtools bedcov [options] [...]\n\n"); - fprintf(pysam_stderr, "Options:\n"); - fprintf(pysam_stderr, " -Q mapping quality threshold [0]\n"); - sam_global_opt_help(pysam_stderr, "-.--.-"); + fprintf(samtools_stderr, "Usage: samtools bedcov [options] [...]\n\n"); + fprintf(samtools_stderr, "Options:\n"); + fprintf(samtools_stderr, " -Q mapping quality threshold [0]\n"); + sam_global_opt_help(samtools_stderr, "-.--.-"); return 1; } memset(&str, 0, sizeof(kstring_t)); @@ -108,13 +108,13 @@ int main_bedcov(int argc, char *argv[]) if (aux[i]->fp) idx[i] = sam_index_load(aux[i]->fp, argv[i+optind+1]); if (aux[i]->fp == 0 || idx[i] == 0) { - fprintf(pysam_stderr, "ERROR: fail to open index BAM file '%s'\n", argv[i+optind+1]); + fprintf(samtools_stderr, "ERROR: fail to open index BAM file '%s'\n", argv[i+optind+1]); return 2; } // TODO bgzf_set_cache_size(aux[i]->fp, 20); aux[i]->header = sam_hdr_read(aux[i]->fp); if (aux[i]->header == NULL) { - fprintf(pysam_stderr, "ERROR: failed to read header for '%s'\n", + fprintf(samtools_stderr, "ERROR: failed to read header for '%s'\n", argv[i+optind+1]); return 2; } @@ -130,6 +130,12 @@ int main_bedcov(int argc, char *argv[]) int tid, beg, end, pos; bam_mplp_t mplp; + if (str.l == 0 || *str.s == '#') continue; /* empty or comment line */ + /* Track and browser lines. 
Also look for a trailing *space* in + case someone has badly-chosen a chromosome name (it would + be followed by a tab in that case). */ + if (strncmp(str.s, "track ", 6) == 0) continue; + if (strncmp(str.s, "browser ", 8) == 0) continue; for (p = q = str.s; *p && *p != '\t'; ++p); if (*p != '\t') goto bed_error; *p = 0; tid = bam_name2id(aux[0]->header, q); *p = '\t'; @@ -157,12 +163,12 @@ int main_bedcov(int argc, char *argv[]) kputc('\t', &str); kputl(cnt[i], &str); } - fputs(str.s, pysam_stdout) & fputc('\n', pysam_stdout); + fputs(str.s, samtools_stdout) & fputc('\n', samtools_stdout); bam_mplp_destroy(mplp); continue; bed_error: - fprintf(pysam_stderr, "Errors in BED line '%s'\n", str.s); + fprintf(samtools_stderr, "Errors in BED line '%s'\n", str.s); } free(n_plp); free(plp); ks_destroy(ks); diff --git a/samtools/bedidx.c b/samtools/bedidx.c index 86d2338..3489c27 100644 --- a/samtools/bedidx.c +++ b/samtools/bedidx.c @@ -31,6 +31,7 @@ DEALINGS IN THE SOFTWARE. */ #include #include #include +#include "bedidx.h" #include "htslib/ksort.h" KSORT_INIT_GENERIC(uint64_t) @@ -38,48 +39,91 @@ KSORT_INIT_GENERIC(uint64_t) #include "htslib/kseq.h" KSTREAM_INIT(gzFile, gzread, 8192) +/*! @typedef + * @abstract bed_reglist_t - value type of the BED hash table + * This structure encodes the list of intervals (ranges) for the regions provided via BED file or + * command line arguments. + * @field *a pointer to the array of intervals (kept as 64 bit integers). The upper 32 bits + * encode the beginning of the interval, while the lower 32 bits encode the end, for easy sorting. 
+ * |-- 32 bits --|-- 32 bits --| + * |---- beg ----|---- end ----| + * @field n actual number of elements contained by a + * @field m number of allocated elements to a (n <= m) + * @field *idx index array for computing the minimum offset + */ typedef struct { int n, m; uint64_t *a; int *idx; + int filter; } bed_reglist_t; #include "htslib/khash.h" KHASH_MAP_INIT_STR(reg, bed_reglist_t) -#define LIDX_SHIFT 13 - typedef kh_reg_t reghash_t; -void bed_destroy(void *_h); +#if 0 +// Debug function +static void bed_print(void *reg_hash) { + reghash_t *h = (reghash_t *)reg_hash; + bed_reglist_t *p; + khint_t k; + int i; + const char *reg; + uint32_t beg, end; + if (!h) { + printf("Hash table is empty!\n"); + return; + } + for (k = kh_begin(h); k < kh_end(h); k++) { + if (kh_exist(h,k)) { + reg = kh_key(h,k); + printf("Region: '%s'\n", reg); + if ((p = &kh_val(h,k)) != NULL && p->n > 0) { + printf("Filter: %d\n", p->filter); + for (i=0; in; i++) { + beg = (uint32_t)(p->a[i]>>32); + end = (uint32_t)(p->a[i]); -int *bed_index_core(int n, uint64_t *a, int *n_idx) + printf("\tinterval[%d]: %d-%d\n",i,beg,end); + } + } else { + printf("Region '%s' has no intervals!\n", reg); + } + } + } +} +#endif + +static int *bed_index_core(int n, uint64_t *a) { - int i, j, m, *idx; - m = *n_idx = 0; idx = 0; + int i, j, l, *idx; + l = 0; idx = 0; for (i = 0; i < n; ++i) { int beg, end; beg = a[i]>>32 >> LIDX_SHIFT; end = ((uint32_t)a[i]) >> LIDX_SHIFT; - if (m < end + 1) { - int oldm = m; - m = end + 1; - kroundup32(m); - idx = realloc(idx, m * sizeof(int)); - for (j = oldm; j < m; ++j) idx[j] = -1; - } - if (beg == end) { - if (idx[beg] < 0) idx[beg] = i; - } else { - for (j = beg; j <= end; ++j) - if (idx[j] < 0) idx[j] = i; + if (l < end + 1) { + int old_l = l; + l = end + 1; + kroundup32(l); + idx = realloc(idx, l * sizeof(int)); + if (!idx) + return NULL; + + for (j = old_l; j < l; ++j) + idx[j] = -1; } - *n_idx = end + 1; + + for (j = beg; j < end+1; ++j) + if (idx[j] < 0) + idx[j] = 
i; } return idx; } -void bed_index(void *_h) +static void bed_index(void *_h) { reghash_t *h = (reghash_t*)_h; khint_t k; @@ -88,23 +132,36 @@ void bed_index(void *_h) bed_reglist_t *p = &kh_val(h, k); if (p->idx) free(p->idx); ks_introsort(uint64_t, p->n, p->a); - p->idx = bed_index_core(p->n, p->a, &p->m); + p->idx = bed_index_core(p->n, p->a); } } } -int bed_overlap_core(const bed_reglist_t *p, int beg, int end) +static int bed_minoff(const bed_reglist_t *p, unsigned int beg, unsigned int end) { + int i, min_off=0; + + if (p && p->idx) { + min_off = (beg>>LIDX_SHIFT >= p->n)? p->idx[p->n-1] : p->idx[beg>>LIDX_SHIFT]; + if (min_off < 0) { // TODO: this block can be improved, but speed should not matter too much here + int n = beg>>LIDX_SHIFT; + if (n > p->n) + n = p->n; + for (i = n - 1; i >= 0; --i) + if (p->idx[i] >= 0) + break; + min_off = i >= 0? p->idx[i] : 0; + } + } + + return min_off; +} + +static int bed_overlap_core(const bed_reglist_t *p, int beg, int end) { int i, min_off; if (p->n == 0) return 0; - min_off = (beg>>LIDX_SHIFT >= p->n)? p->idx[p->n-1] : p->idx[beg>>LIDX_SHIFT]; - if (min_off < 0) { // TODO: this block can be improved, but speed should not matter too much here - int n = beg>>LIDX_SHIFT; - if (n > p->n) n = p->n; - for (i = n - 1; i >= 0; --i) - if (p->idx[i] >= 0) break; - min_off = i >= 0? p->idx[i] : 0; - } + min_off = bed_minoff(p, beg, end); + for (i = min_off; i < p->n; ++i) { if ((int)(p->a[i]>>32) >= end) break; // out of range; no need to proceed if ((int32_t)p->a[i] > beg && (int32_t)(p->a[i]>>32) < end) @@ -123,6 +180,40 @@ int bed_overlap(const void *_h, const char *chr, int beg, int end) return bed_overlap_core(&kh_val(h, k), beg, end); } +/** @brief Trim a sorted interval list, inside a region hash table, + * by removing completely contained intervals and merging adjacent or + * overlapping intervals. 
+ * @param reg_hash the region hash table with interval lists as values + */ + +static void bed_unify(void *reg_hash) { + + int i, j, new_n; + reghash_t *h; + bed_reglist_t *p; + + if (!reg_hash) + return; + + h = (reghash_t *)reg_hash; + + for (i = kh_begin(h); i < kh_end(h); i++) { + if (!kh_exist(h,i) || !(p = &kh_val(h,i)) || !(p->n)) + continue; + + for (new_n = 0, j = 1; j < p->n; j++) { + if ((uint32_t)p->a[new_n] < (uint32_t)(p->a[j]>>32)) { + p->a[++new_n] = p->a[j]; + } else { + if ((uint32_t)p->a[new_n] < (uint32_t)p->a[j]) + p->a[new_n] = (p->a[new_n] & 0xFFFFFFFF00000000) | (uint32_t)(p->a[j]); + } + } + + p->n = ++new_n; + } +} + /* "BED" file reader, which actually reads two different formats. BED files contain between three and nine fields per line, of which @@ -217,8 +308,8 @@ void *bed_read(const char *fn) // Add begin,end to the list if (p->n == p->m) { - p->m = p->m? p->m<<1 : 4; - p->a = realloc(p->a, p->m * 8); + p->m = p->m ? p->m<<1 : 4; + p->a = realloc(p->a, p->m * sizeof(uint64_t)); if (NULL == p->a) goto fail; } p->a[p->n++] = (uint64_t)beg<<32 | end; @@ -230,6 +321,7 @@ void *bed_read(const char *fn) gzclose(fp); free(str.s); bed_index(h); + //bed_unify(h); return h; fail: fprintf(stderr, "[bed_read] Error reading %s : %s\n", fn, strerror(errno)); @@ -243,8 +335,13 @@ void *bed_read(const char *fn) void bed_destroy(void *_h) { - reghash_t *h = (reghash_t*)_h; + reghash_t *h; khint_t k; + + if (!_h) + return; + + h = (reghash_t*)_h; for (k = 0; k < kh_end(h); ++k) { if (kh_exist(h, k)) { free(kh_val(h, k).a); @@ -254,3 +351,250 @@ void bed_destroy(void *_h) } kh_destroy(reg, h); } + +static void *bed_insert(void *reg_hash, char *reg, unsigned int beg, unsigned int end) { + + reghash_t *h; + khint_t k; + bed_reglist_t *p; + + if (!reg_hash) + return NULL; + + h = (reghash_t *)reg_hash; + + // Put reg in the hash table if not already there + k = kh_get(reg, h, reg); //looks strange, but only the second reg is the actual region name. 
+ if (k == kh_end(h)) { // absent from the hash table + int ret; + char *s = strdup(reg); + if (NULL == s) goto fail; + k = kh_put(reg, h, s, &ret); + if (-1 == ret) { + free(s); + goto fail; + } + memset(&kh_val(h, k), 0, sizeof(bed_reglist_t)); + } + p = &kh_val(h, k); + + // Add beg and end to the list + if (p->n == p->m) { + p->m = p->m ? p->m<<1 : 4; + p->a = realloc(p->a, p->m * sizeof(uint64_t)); + if (NULL == p->a) goto fail; + } + p->a[p->n++] = (uint64_t)beg<<32 | end; + +fail: + return h; +} + +/* @brief Filter a region hash table (coming from the BED file) by another + * region hash table (coming from CLI), so that only intervals contained in + * both hash tables are kept. + * @param reg_hash the target region hash table + * @param tmp_hash the filter region hash table + * @return pointer to the filtered hash table + */ + +static void *bed_filter(void *reg_hash, void *tmp_hash) { + + reghash_t *h; + reghash_t *t; + bed_reglist_t *p, *q; + khint_t l, k; + uint64_t *new_a; + int i, j, new_n, min_off; + const char *reg; + uint32_t beg, end; + + h = (reghash_t *)reg_hash; + t = (reghash_t *)tmp_hash; + if (!h) + return NULL; + if (!t) + return h; + + for (l = kh_begin(t); l < kh_end(t); l++) { + if (!kh_exist(t,l) || !(q = &kh_val(t,l)) || !(q->n)) + continue; + + reg = kh_key(t,l); + k = kh_get(reg, h, reg); //looks strange, but only the second reg is a proper argument. 
+ if (k == kh_end(h) || !(p = &kh_val(h, k)) || !(p->n)) + continue; + + new_a = (uint64_t *)calloc(q->n + p->n, sizeof(uint64_t)); + if (!new_a) + return NULL; + new_n = 0; + + for (i = 0; i < q->n; i++) { + beg = (uint32_t)(q->a[i]>>32); + end = (uint32_t)(q->a[i]); + + min_off = bed_minoff(p, beg, end); + for (j = min_off; j < p->n; ++j) { + if ((uint32_t)(p->a[j]>>32) >= end) break; // out of range; no need to proceed + if ((uint32_t)(p->a[j]) > beg && (uint32_t)(p->a[j]>>32) < end) { + new_a[new_n++] = ((uint64_t)MAX((uint32_t)(p->a[j]>>32), beg) << 32) | MIN((uint32_t)p->a[j], end); + } + } + } + + if (new_n > 0) { + free(p->a); + p->a = new_a; + p->n = new_n; + p->m = new_n; + p->filter = FILTERED; + } else { + free(new_a); + p->filter = ALL; + } + } + + return h; +} + +void *bed_hash_regions(void *reg_hash, char **regs, int first, int last, int *op) { + + reghash_t *h = (reghash_t *)reg_hash; + reghash_t *t = NULL; + + int i; + char reg[1024]; + const char *q; + int beg, end; + + if (h) { + t = kh_init(reg); + if (!t) { + fprintf(stderr, "Error when creating the temporary region hash table!\n"); + return NULL; + } + } else { + h = kh_init(reg); + if (!h) { + fprintf(stderr, "Error when creating the region hash table!\n"); + return NULL; + } + *op = 1; + } + + for (i=first; i 1024) { + fprintf(stderr, "Region name '%s' is too long (bigger than %d).\n", regs[i], 1024); + continue; + } + strncpy(reg, regs[i], q - regs[i]); + reg[q - regs[i]] = 0; + } else { + // not parsable as a region, but possibly a sequence named "foo:a" + if (strlen(regs[i]) + 1 > 1024) { + fprintf(stderr, "Region name '%s' is too long (bigger than %d).\n", regs[i], 1024); + continue; + } + strcpy(reg, regs[i]); + beg = 0; end = INT_MAX; + } + + //if op==1 insert reg to the bed hash table + if (*op && !(bed_insert(h, reg, beg, end))) { + fprintf(stderr, "Error when inserting region='%s' in the bed hash table at address=%p!\n", regs[i], h); + } + //if op==0, first insert the regions in the 
temporary hash table, + //then filter the bed hash table using it + if (!(*op) && !(bed_insert(t, reg, beg, end))) { + fprintf(stderr, "Error when inserting region='%s' in the temporary hash table at address=%p!\n", regs[i], t); + } + } + + if (!(*op)) { + bed_index(t); + bed_unify(t); + h = bed_filter(h, t); + bed_destroy(t); + } + + if (h) { + bed_index(h); + bed_unify(h); + } + + return h; +} + +const char* bed_get(void *reg_hash, int i, int filter) { + + reghash_t *h; + bed_reglist_t *p; + + if (!reg_hash) + return NULL; + + h = (reghash_t *)reg_hash; + if (!kh_exist(h,i) || !(p = &kh_val(h,i)) || (p->filter < filter)) + return NULL; + + return kh_key(h, i); +} + +hts_reglist_t *bed_reglist(void *reg_hash, int filter, int *n_reg) { + + reghash_t *h; + bed_reglist_t *p; + khint_t i; + hts_reglist_t *reglist = NULL; + int count = 0; + int j; + + if (!reg_hash) + return NULL; + + h = (reghash_t *)reg_hash; + + for (i = kh_begin(h); i < kh_end(h); i++) { + if (!kh_exist(h,i) || !(p = &kh_val(h,i)) || (p->filter < filter)) + continue; + count++; + } + if (!count) + return NULL; + + reglist = (hts_reglist_t *)calloc(count, sizeof(hts_reglist_t)); + if (!reglist) + return NULL; + + *n_reg = count; + count = 0; + + for (i = kh_begin(h); i < kh_end(h) && count < *n_reg; i++) { + if (!kh_exist(h,i) || !(p = &kh_val(h,i)) || (p->filter < filter)) + continue; + + reglist[count].reg = kh_key(h,i); + reglist[count].intervals = (hts_pair32_t *)calloc(p->n, sizeof(hts_pair32_t)); + if(!(reglist[count].intervals)) { + hts_reglist_free(reglist, count); + return NULL; + } + reglist[count].count = p->n; + reglist[count].max_end = 0; + + for (j = 0; j < p->n; j++) { + reglist[count].intervals[j].beg = (uint32_t)(p->a[j]>>32); + reglist[count].intervals[j].end = (uint32_t)(p->a[j]); + + if (reglist[count].intervals[j].end > reglist[count].max_end) + reglist[count].max_end = reglist[count].intervals[j].end; + } + count++; + } + + return reglist; +} diff --git 
a/samtools/bedidx.c.pysam.c b/samtools/bedidx.c.pysam.c index 1998435..fa92fa0 100644 --- a/samtools/bedidx.c.pysam.c +++ b/samtools/bedidx.c.pysam.c @@ -1,4 +1,4 @@ -#include "pysam.h" +#include "samtools.pysam.h" /* bedidx.c -- BED file indexing. @@ -33,6 +33,7 @@ DEALINGS IN THE SOFTWARE. */ #include #include #include +#include "bedidx.h" #include "htslib/ksort.h" KSORT_INIT_GENERIC(uint64_t) @@ -40,48 +41,91 @@ KSORT_INIT_GENERIC(uint64_t) #include "htslib/kseq.h" KSTREAM_INIT(gzFile, gzread, 8192) +/*! @typedef + * @abstract bed_reglist_t - value type of the BED hash table + * This structure encodes the list of intervals (ranges) for the regions provided via BED file or + * command line arguments. + * @field *a pointer to the array of intervals (kept as 64 bit integers). The upper 32 bits + * encode the beginning of the interval, while the lower 32 bits encode the end, for easy sorting. + * |-- 32 bits --|-- 32 bits --| + * |---- beg ----|---- end ----| + * @field n actual number of elements contained by a + * @field m number of allocated elements to a (n <= m) + * @field *idx index array for computing the minimum offset + */ typedef struct { int n, m; uint64_t *a; int *idx; + int filter; } bed_reglist_t; #include "htslib/khash.h" KHASH_MAP_INIT_STR(reg, bed_reglist_t) -#define LIDX_SHIFT 13 - typedef kh_reg_t reghash_t; -void bed_destroy(void *_h); +#if 0 +// Debug function +static void bed_print(void *reg_hash) { + reghash_t *h = (reghash_t *)reg_hash; + bed_reglist_t *p; + khint_t k; + int i; + const char *reg; + uint32_t beg, end; + if (!h) { + fprintf(samtools_stdout, "Hash table is empty!\n"); + return; + } + for (k = kh_begin(h); k < kh_end(h); k++) { + if (kh_exist(h,k)) { + reg = kh_key(h,k); + fprintf(samtools_stdout, "Region: '%s'\n", reg); + if ((p = &kh_val(h,k)) != NULL && p->n > 0) { + fprintf(samtools_stdout, "Filter: %d\n", p->filter); + for (i=0; in; i++) { + beg = (uint32_t)(p->a[i]>>32); + end = (uint32_t)(p->a[i]); + + 
fprintf(samtools_stdout, "\tinterval[%d]: %d-%d\n",i,beg,end); + } + } else { + fprintf(samtools_stdout, "Region '%s' has no intervals!\n", reg); + } + } + } +} +#endif -int *bed_index_core(int n, uint64_t *a, int *n_idx) +static int *bed_index_core(int n, uint64_t *a) { - int i, j, m, *idx; - m = *n_idx = 0; idx = 0; + int i, j, l, *idx; + l = 0; idx = 0; for (i = 0; i < n; ++i) { int beg, end; beg = a[i]>>32 >> LIDX_SHIFT; end = ((uint32_t)a[i]) >> LIDX_SHIFT; - if (m < end + 1) { - int oldm = m; - m = end + 1; - kroundup32(m); - idx = realloc(idx, m * sizeof(int)); - for (j = oldm; j < m; ++j) idx[j] = -1; - } - if (beg == end) { - if (idx[beg] < 0) idx[beg] = i; - } else { - for (j = beg; j <= end; ++j) - if (idx[j] < 0) idx[j] = i; + if (l < end + 1) { + int old_l = l; + l = end + 1; + kroundup32(l); + idx = realloc(idx, l * sizeof(int)); + if (!idx) + return NULL; + + for (j = old_l; j < l; ++j) + idx[j] = -1; } - *n_idx = end + 1; + + for (j = beg; j < end+1; ++j) + if (idx[j] < 0) + idx[j] = i; } return idx; } -void bed_index(void *_h) +static void bed_index(void *_h) { reghash_t *h = (reghash_t*)_h; khint_t k; @@ -90,23 +134,36 @@ void bed_index(void *_h) bed_reglist_t *p = &kh_val(h, k); if (p->idx) free(p->idx); ks_introsort(uint64_t, p->n, p->a); - p->idx = bed_index_core(p->n, p->a, &p->m); + p->idx = bed_index_core(p->n, p->a); + } + } +} + +static int bed_minoff(const bed_reglist_t *p, unsigned int beg, unsigned int end) { + int i, min_off=0; + + if (p && p->idx) { + min_off = (beg>>LIDX_SHIFT >= p->n)? p->idx[p->n-1] : p->idx[beg>>LIDX_SHIFT]; + if (min_off < 0) { // TODO: this block can be improved, but speed should not matter too much here + int n = beg>>LIDX_SHIFT; + if (n > p->n) + n = p->n; + for (i = n - 1; i >= 0; --i) + if (p->idx[i] >= 0) + break; + min_off = i >= 0? 
p->idx[i] : 0; } } + + return min_off; } -int bed_overlap_core(const bed_reglist_t *p, int beg, int end) +static int bed_overlap_core(const bed_reglist_t *p, int beg, int end) { int i, min_off; if (p->n == 0) return 0; - min_off = (beg>>LIDX_SHIFT >= p->n)? p->idx[p->n-1] : p->idx[beg>>LIDX_SHIFT]; - if (min_off < 0) { // TODO: this block can be improved, but speed should not matter too much here - int n = beg>>LIDX_SHIFT; - if (n > p->n) n = p->n; - for (i = n - 1; i >= 0; --i) - if (p->idx[i] >= 0) break; - min_off = i >= 0? p->idx[i] : 0; - } + min_off = bed_minoff(p, beg, end); + for (i = min_off; i < p->n; ++i) { if ((int)(p->a[i]>>32) >= end) break; // out of range; no need to proceed if ((int32_t)p->a[i] > beg && (int32_t)(p->a[i]>>32) < end) @@ -125,6 +182,40 @@ int bed_overlap(const void *_h, const char *chr, int beg, int end) return bed_overlap_core(&kh_val(h, k), beg, end); } +/** @brief Trim a sorted interval list, inside a region hash table, + * by removing completely contained intervals and merging adjacent or + * overlapping intervals. + * @param reg_hash the region hash table with interval lists as values + */ + +static void bed_unify(void *reg_hash) { + + int i, j, new_n; + reghash_t *h; + bed_reglist_t *p; + + if (!reg_hash) + return; + + h = (reghash_t *)reg_hash; + + for (i = kh_begin(h); i < kh_end(h); i++) { + if (!kh_exist(h,i) || !(p = &kh_val(h,i)) || !(p->n)) + continue; + + for (new_n = 0, j = 1; j < p->n; j++) { + if ((uint32_t)p->a[new_n] < (uint32_t)(p->a[j]>>32)) { + p->a[++new_n] = p->a[j]; + } else { + if ((uint32_t)p->a[new_n] < (uint32_t)p->a[j]) + p->a[new_n] = (p->a[new_n] & 0xFFFFFFFF00000000) | (uint32_t)(p->a[j]); + } + } + + p->n = ++new_n; + } +} + /* "BED" file reader, which actually reads two different formats. BED files contain between three and nine fields per line, of which @@ -197,7 +288,7 @@ void *bed_read(const char *fn) // has called their reference "browser" or "track". 
if (0 == strcmp(ref, "browser")) continue; if (0 == strcmp(ref, "track")) continue; - fprintf(pysam_stderr, "[bed_read] Parse error reading %s at line %u\n", + fprintf(samtools_stderr, "[bed_read] Parse error reading %s at line %u\n", fn, line); goto fail_no_msg; } @@ -219,8 +310,8 @@ void *bed_read(const char *fn) // Add begin,end to the list if (p->n == p->m) { - p->m = p->m? p->m<<1 : 4; - p->a = realloc(p->a, p->m * 8); + p->m = p->m ? p->m<<1 : 4; + p->a = realloc(p->a, p->m * sizeof(uint64_t)); if (NULL == p->a) goto fail; } p->a[p->n++] = (uint64_t)beg<<32 | end; @@ -232,9 +323,10 @@ void *bed_read(const char *fn) gzclose(fp); free(str.s); bed_index(h); + //bed_unify(h); return h; fail: - fprintf(pysam_stderr, "[bed_read] Error reading %s : %s\n", fn, strerror(errno)); + fprintf(samtools_stderr, "[bed_read] Error reading %s : %s\n", fn, strerror(errno)); fail_no_msg: if (ks) ks_destroy(ks); if (fp) gzclose(fp); @@ -245,8 +337,13 @@ void *bed_read(const char *fn) void bed_destroy(void *_h) { - reghash_t *h = (reghash_t*)_h; + reghash_t *h; khint_t k; + + if (!_h) + return; + + h = (reghash_t*)_h; for (k = 0; k < kh_end(h); ++k) { if (kh_exist(h, k)) { free(kh_val(h, k).a); @@ -256,3 +353,250 @@ void bed_destroy(void *_h) } kh_destroy(reg, h); } + +static void *bed_insert(void *reg_hash, char *reg, unsigned int beg, unsigned int end) { + + reghash_t *h; + khint_t k; + bed_reglist_t *p; + + if (!reg_hash) + return NULL; + + h = (reghash_t *)reg_hash; + + // Put reg in the hash table if not already there + k = kh_get(reg, h, reg); //looks strange, but only the second reg is the actual region name. + if (k == kh_end(h)) { // absent from the hash table + int ret; + char *s = strdup(reg); + if (NULL == s) goto fail; + k = kh_put(reg, h, s, &ret); + if (-1 == ret) { + free(s); + goto fail; + } + memset(&kh_val(h, k), 0, sizeof(bed_reglist_t)); + } + p = &kh_val(h, k); + + // Add beg and end to the list + if (p->n == p->m) { + p->m = p->m ? 
p->m<<1 : 4; + p->a = realloc(p->a, p->m * sizeof(uint64_t)); + if (NULL == p->a) goto fail; + } + p->a[p->n++] = (uint64_t)beg<<32 | end; + +fail: + return h; +} + +/* @brief Filter a region hash table (coming from the BED file) by another + * region hash table (coming from CLI), so that only intervals contained in + * both hash tables are kept. + * @param reg_hash the target region hash table + * @param tmp_hash the filter region hash table + * @return pointer to the filtered hash table + */ + +static void *bed_filter(void *reg_hash, void *tmp_hash) { + + reghash_t *h; + reghash_t *t; + bed_reglist_t *p, *q; + khint_t l, k; + uint64_t *new_a; + int i, j, new_n, min_off; + const char *reg; + uint32_t beg, end; + + h = (reghash_t *)reg_hash; + t = (reghash_t *)tmp_hash; + if (!h) + return NULL; + if (!t) + return h; + + for (l = kh_begin(t); l < kh_end(t); l++) { + if (!kh_exist(t,l) || !(q = &kh_val(t,l)) || !(q->n)) + continue; + + reg = kh_key(t,l); + k = kh_get(reg, h, reg); //looks strange, but only the second reg is a proper argument. 
+ if (k == kh_end(h) || !(p = &kh_val(h, k)) || !(p->n)) + continue; + + new_a = (uint64_t *)calloc(q->n + p->n, sizeof(uint64_t)); + if (!new_a) + return NULL; + new_n = 0; + + for (i = 0; i < q->n; i++) { + beg = (uint32_t)(q->a[i]>>32); + end = (uint32_t)(q->a[i]); + + min_off = bed_minoff(p, beg, end); + for (j = min_off; j < p->n; ++j) { + if ((uint32_t)(p->a[j]>>32) >= end) break; // out of range; no need to proceed + if ((uint32_t)(p->a[j]) > beg && (uint32_t)(p->a[j]>>32) < end) { + new_a[new_n++] = ((uint64_t)MAX((uint32_t)(p->a[j]>>32), beg) << 32) | MIN((uint32_t)p->a[j], end); + } + } + } + + if (new_n > 0) { + free(p->a); + p->a = new_a; + p->n = new_n; + p->m = new_n; + p->filter = FILTERED; + } else { + free(new_a); + p->filter = ALL; + } + } + + return h; +} + +void *bed_hash_regions(void *reg_hash, char **regs, int first, int last, int *op) { + + reghash_t *h = (reghash_t *)reg_hash; + reghash_t *t = NULL; + + int i; + char reg[1024]; + const char *q; + int beg, end; + + if (h) { + t = kh_init(reg); + if (!t) { + fprintf(samtools_stderr, "Error when creating the temporary region hash table!\n"); + return NULL; + } + } else { + h = kh_init(reg); + if (!h) { + fprintf(samtools_stderr, "Error when creating the region hash table!\n"); + return NULL; + } + *op = 1; + } + + for (i=first; i 1024) { + fprintf(samtools_stderr, "Region name '%s' is too long (bigger than %d).\n", regs[i], 1024); + continue; + } + strncpy(reg, regs[i], q - regs[i]); + reg[q - regs[i]] = 0; + } else { + // not parsable as a region, but possibly a sequence named "foo:a" + if (strlen(regs[i]) + 1 > 1024) { + fprintf(samtools_stderr, "Region name '%s' is too long (bigger than %d).\n", regs[i], 1024); + continue; + } + strcpy(reg, regs[i]); + beg = 0; end = INT_MAX; + } + + //if op==1 insert reg to the bed hash table + if (*op && !(bed_insert(h, reg, beg, end))) { + fprintf(samtools_stderr, "Error when inserting region='%s' in the bed hash table at address=%p!\n", regs[i], h); + } 
+ //if op==0, first insert the regions in the temporary hash table, + //then filter the bed hash table using it + if (!(*op) && !(bed_insert(t, reg, beg, end))) { + fprintf(samtools_stderr, "Error when inserting region='%s' in the temporary hash table at address=%p!\n", regs[i], t); + } + } + + if (!(*op)) { + bed_index(t); + bed_unify(t); + h = bed_filter(h, t); + bed_destroy(t); + } + + if (h) { + bed_index(h); + bed_unify(h); + } + + return h; +} + +const char* bed_get(void *reg_hash, int i, int filter) { + + reghash_t *h; + bed_reglist_t *p; + + if (!reg_hash) + return NULL; + + h = (reghash_t *)reg_hash; + if (!kh_exist(h,i) || !(p = &kh_val(h,i)) || (p->filter < filter)) + return NULL; + + return kh_key(h, i); +} + +hts_reglist_t *bed_reglist(void *reg_hash, int filter, int *n_reg) { + + reghash_t *h; + bed_reglist_t *p; + khint_t i; + hts_reglist_t *reglist = NULL; + int count = 0; + int j; + + if (!reg_hash) + return NULL; + + h = (reghash_t *)reg_hash; + + for (i = kh_begin(h); i < kh_end(h); i++) { + if (!kh_exist(h,i) || !(p = &kh_val(h,i)) || (p->filter < filter)) + continue; + count++; + } + if (!count) + return NULL; + + reglist = (hts_reglist_t *)calloc(count, sizeof(hts_reglist_t)); + if (!reglist) + return NULL; + + *n_reg = count; + count = 0; + + for (i = kh_begin(h); i < kh_end(h) && count < *n_reg; i++) { + if (!kh_exist(h,i) || !(p = &kh_val(h,i)) || (p->filter < filter)) + continue; + + reglist[count].reg = kh_key(h,i); + reglist[count].intervals = (hts_pair32_t *)calloc(p->n, sizeof(hts_pair32_t)); + if(!(reglist[count].intervals)) { + hts_reglist_free(reglist, count); + return NULL; + } + reglist[count].count = p->n; + reglist[count].max_end = 0; + + for (j = 0; j < p->n; j++) { + reglist[count].intervals[j].beg = (uint32_t)(p->a[j]>>32); + reglist[count].intervals[j].end = (uint32_t)(p->a[j]); + + if (reglist[count].intervals[j].end > reglist[count].max_end) + reglist[count].max_end = reglist[count].intervals[j].end; + } + count++; + } + + 
return reglist; +} diff --git a/samtools/bedidx.h b/samtools/bedidx.h new file mode 100644 index 0000000..a33a65f --- /dev/null +++ b/samtools/bedidx.h @@ -0,0 +1,20 @@ +#ifndef BEDIDX_H +#define BEDIDX_H + +#include "htslib/hts.h" + +#define LIDX_SHIFT 13 +#define ALL 0 +#define FILTERED 1 + +#define MIN(A,B) ( ( (A) < (B) ) ? (A) : (B) ) +#define MAX(A,B) ( ( (A) > (B) ) ? (A) : (B) ) + +void *bed_read(const char *fn); +void bed_destroy(void *_h); +int bed_overlap(const void *_h, const char *chr, int beg, int end); +void *bed_hash_regions(void *reg_hash, char **regs, int first, int last, int *op); +const char* bed_get(void *reg_hash, int index, int filter); +hts_reglist_t *bed_reglist(void *reg_hash, int filter, int *count_regs); + +#endif diff --git a/samtools/cut_target.c b/samtools/cut_target.c index 7d541fa..6348a0a 100644 --- a/samtools/cut_target.c +++ b/samtools/cut_target.c @@ -123,7 +123,7 @@ static void process_cns(bam_hdr_t *h, int tid, int l, uint16_t *cns) s = b[i]>>s&1; } // print - for (i = 0, s = -1; i <= l; ++i) { + for (i = 0, s = -1; i < INT_MAX && i <= l; ++i) { if (i == l || ((b[i]>>2&3) == 0 && s >= 0)) { if (s >= 0) { int j; diff --git a/samtools/cut_target.c.pysam.c b/samtools/cut_target.c.pysam.c index e55f749..a156773 100644 --- a/samtools/cut_target.c.pysam.c +++ b/samtools/cut_target.c.pysam.c @@ -1,4 +1,4 @@ -#include "pysam.h" +#include "samtools.pysam.h" /* cut_target.c -- targetcut subcommand. 
@@ -125,22 +125,22 @@ static void process_cns(bam_hdr_t *h, int tid, int l, uint16_t *cns) s = b[i]>>s&1; } // print - for (i = 0, s = -1; i <= l; ++i) { + for (i = 0, s = -1; i < INT_MAX && i <= l; ++i) { if (i == l || ((b[i]>>2&3) == 0 && s >= 0)) { if (s >= 0) { int j; - fprintf(pysam_stdout, "%s:%d-%d\t0\t%s\t%d\t60\t%dM\t*\t0\t0\t", h->target_name[tid], s+1, i, h->target_name[tid], s+1, i-s); + fprintf(samtools_stdout, "%s:%d-%d\t0\t%s\t%d\t60\t%dM\t*\t0\t0\t", h->target_name[tid], s+1, i, h->target_name[tid], s+1, i-s); for (j = s; j < i; ++j) { int c = cns[j]>>8; - if (c == 0) fputc('N', pysam_stdout); - else fputc("ACGT"[c&3], pysam_stdout); + if (c == 0) fputc('N', samtools_stdout); + else fputc("ACGT"[c&3], samtools_stdout); } - fputc('\t', pysam_stdout); + fputc('\t', samtools_stdout); for (j = s; j < i; ++j) - fputc(33 + (cns[j]>>8>>2), pysam_stdout); - fputc('\n', pysam_stdout); + fputc(33 + (cns[j]>>8>>2), samtools_stdout); + fputc('\n', samtools_stdout); } - //if (s >= 0) fprintf(pysam_stdout, "%s\t%d\t%d\t%d\n", h->target_name[tid], s, i, i - s); + //if (s >= 0) fprintf(samtools_stdout, "%s\t%d\t%d\t%d\n", h->target_name[tid], s, i, i - s); s = -1; } else if ((b[i]>>2&3) && s < 0) s = i; } @@ -199,11 +199,11 @@ int main_cut_target(int argc, char *argv[]) } if (ga.reference) { g.fai = fai_load(ga.reference); - if (g.fai == 0) fprintf(pysam_stderr, "[%s] fail to load the fasta index.\n", __func__); + if (g.fai == 0) fprintf(samtools_stderr, "[%s] fail to load the fasta index.\n", __func__); } if (usage || argc == optind) { - fprintf(pysam_stderr, "Usage: samtools targetcut [-Q minQ] [-i inPen] [-0 em0] [-1 em1] [-2 em2] \n"); - sam_global_opt_help(pysam_stderr, "-.--f-"); + fprintf(samtools_stderr, "Usage: samtools targetcut [-Q minQ] [-i inPen] [-0 em0] [-1 em1] [-2 em2] \n"); + sam_global_opt_help(samtools_stderr, "-.--f-"); return 1; } l = max_l = 0; cns = 0; diff --git a/samtools/dict.c b/samtools/dict.c index cb5622e..7321a77 100644 --- 
a/samtools/dict.c +++ b/samtools/dict.c @@ -71,8 +71,8 @@ static void write_dict(const char *fn, args_t *args) if (args->header) fprintf(out, "@HD\tVN:1.0\tSO:unsorted\n"); while ((l = kseq_read(seq)) >= 0) { for (i = k = 0; i < seq->seq.l; ++i) { - if (islower(seq->seq.s[i])) seq->seq.s[k++] = toupper(seq->seq.s[i]); - else if (isupper(seq->seq.s[i])) seq->seq.s[k++] = seq->seq.s[i]; + if (seq->seq.s[i] >= '!' && seq->seq.s[i] <= '~') + seq->seq.s[k++] = toupper(seq->seq.s[i]); } hts_md5_reset(md5); hts_md5_update(md5, (unsigned char*)seq->seq.s, k); diff --git a/samtools/dict.c.pysam.c b/samtools/dict.c.pysam.c index c4e4045..f42e416 100644 --- a/samtools/dict.c.pysam.c +++ b/samtools/dict.c.pysam.c @@ -1,4 +1,4 @@ -#include "pysam.h" +#include "samtools.pysam.h" /* dict.c -- create a sequence dictionary file. @@ -54,14 +54,14 @@ static void write_dict(const char *fn, args_t *args) fp = strcmp(fn, "-") ? gzopen(fn, "r") : gzdopen(fileno(stdin), "r"); if (fp == 0) { - fprintf(pysam_stderr, "dict: %s: No such file or directory\n", fn); + fprintf(samtools_stderr, "dict: %s: No such file or directory\n", fn); exit(1); } - FILE *out = pysam_stdout; + FILE *out = samtools_stdout; if (args->output_fname) { out = fopen(args->output_fname, "w"); if (out == NULL) { - fprintf(pysam_stderr, "dict: %s: Cannot open file for writing\n", args->output_fname); + fprintf(samtools_stderr, "dict: %s: Cannot open file for writing\n", args->output_fname); exit(1); } } @@ -73,8 +73,8 @@ static void write_dict(const char *fn, args_t *args) if (args->header) fprintf(out, "@HD\tVN:1.0\tSO:unsorted\n"); while ((l = kseq_read(seq)) >= 0) { for (i = k = 0; i < seq->seq.l; ++i) { - if (islower(seq->seq.s[i])) seq->seq.s[k++] = toupper(seq->seq.s[i]); - else if (isupper(seq->seq.s[i])) seq->seq.s[k++] = seq->seq.s[i]; + if (seq->seq.s[i] >= '!' 
&& seq->seq.s[i] <= '~') + seq->seq.s[k++] = toupper(seq->seq.s[i]); } hts_md5_reset(md5); hts_md5_update(md5, (unsigned char*)seq->seq.s, k); @@ -104,15 +104,15 @@ static void write_dict(const char *fn, args_t *args) static int dict_usage(void) { - fprintf(pysam_stderr, "\n"); - fprintf(pysam_stderr, "About: Create a sequence dictionary file from a fasta file\n"); - fprintf(pysam_stderr, "Usage: samtools dict [options] \n\n"); - fprintf(pysam_stderr, "Options: -a, --assembly STR assembly\n"); - fprintf(pysam_stderr, " -H, --no-header do not print @HD line\n"); - fprintf(pysam_stderr, " -o, --output STR file to write out dict file [pysam_stdout]\n"); - fprintf(pysam_stderr, " -s, --species STR species\n"); - fprintf(pysam_stderr, " -u, --uri STR URI [file:///abs/path/to/file.fa]\n"); - fprintf(pysam_stderr, "\n"); + fprintf(samtools_stderr, "\n"); + fprintf(samtools_stderr, "About: Create a sequence dictionary file from a fasta file\n"); + fprintf(samtools_stderr, "Usage: samtools dict [options] \n\n"); + fprintf(samtools_stderr, "Options: -a, --assembly STR assembly\n"); + fprintf(samtools_stderr, " -H, --no-header do not print @HD line\n"); + fprintf(samtools_stderr, " -o, --output STR file to write out dict file [samtools_stdout]\n"); + fprintf(samtools_stderr, " -s, --species STR species\n"); + fprintf(samtools_stderr, " -u, --uri STR URI [file:///abs/path/to/file.fa]\n"); + fprintf(samtools_stderr, "\n"); return 1; } diff --git a/samtools/faidx.c.pysam.c b/samtools/faidx.c.pysam.c index ec8c90f..37eb247 100644 --- a/samtools/faidx.c.pysam.c +++ b/samtools/faidx.c.pysam.c @@ -1,4 +1,4 @@ -#include "pysam.h" +#include "samtools.pysam.h" /* faidx.c -- faidx subcommand. 
@@ -48,18 +48,18 @@ int faidx_main(int argc, char *argv[]) switch(c) { case 'h': - return usage(pysam_stdout, EXIT_SUCCESS); + return usage(samtools_stdout, EXIT_SUCCESS); default: - return usage(pysam_stderr, EXIT_FAILURE); + return usage(samtools_stderr, EXIT_FAILURE); } } if ( argc==optind ) - return usage(pysam_stdout, EXIT_SUCCESS); + return usage(samtools_stdout, EXIT_SUCCESS); if ( argc==2 ) { if (fai_build(argv[optind]) != 0) { - fprintf(pysam_stderr, "Could not build fai index %s.fai\n", argv[optind]); + fprintf(samtools_stderr, "Could not build fai index %s.fai\n", argv[optind]); return EXIT_FAILURE; } return 0; @@ -67,7 +67,7 @@ int faidx_main(int argc, char *argv[]) faidx_t *fai = fai_load(argv[optind]); if ( !fai ) { - fprintf(pysam_stderr, "Could not load fai index of %s\n", argv[optind]); + fprintf(samtools_stderr, "Could not load fai index of %s\n", argv[optind]); return EXIT_FAILURE; } @@ -75,11 +75,11 @@ int faidx_main(int argc, char *argv[]) while ( ++optind%s\n", argv[optind]); + fprintf(samtools_stdout, ">%s\n", argv[optind]); int seq_len; char *seq = fai_fetch(fai, argv[optind], &seq_len); if ( seq_len < 0 ) { - fprintf(pysam_stderr, "Failed to fetch sequence in %s\n", argv[optind]); + fprintf(samtools_stderr, "Failed to fetch sequence in %s\n", argv[optind]); exit_status = EXIT_FAILURE; break; } @@ -87,8 +87,8 @@ int faidx_main(int argc, char *argv[]) for (i=0; i 1 > 2) + */ +#ifndef LZ4_FORCE_MEMORY_ACCESS /* can be defined externally */ +# if defined(__GNUC__) && ( defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__) ) +# define LZ4_FORCE_MEMORY_ACCESS 2 +# elif defined(__INTEL_COMPILER) || defined(__GNUC__) +# define LZ4_FORCE_MEMORY_ACCESS 1 +# endif +#endif + +/* + * LZ4_FORCE_SW_BITCOUNT + * Define this parameter if your target system or compiler does not support hardware bit count + */ +#if defined(_MSC_VER) && 
defined(_WIN32_WCE) /* Visual Studio for Windows CE does not support Hardware bit count */ +# define LZ4_FORCE_SW_BITCOUNT +#endif + + +/*-************************************ +* Dependency +**************************************/ +#include "lz4.h" +/* see also "memory routines" below */ + + +/*-************************************ +* Compiler Options +**************************************/ +#ifdef _MSC_VER /* Visual Studio */ +# include +# pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */ +# pragma warning(disable : 4293) /* disable: C4293: too large shift (32-bits) */ +#endif /* _MSC_VER */ + +#ifndef FORCE_INLINE +# ifdef _MSC_VER /* Visual Studio */ +# define FORCE_INLINE static __forceinline +# else +# if defined (__cplusplus) || defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* C99 */ +# ifdef __GNUC__ +# define FORCE_INLINE static inline __attribute__((always_inline)) +# else +# define FORCE_INLINE static inline +# endif +# else +# define FORCE_INLINE static +# endif /* __STDC_VERSION__ */ +# endif /* _MSC_VER */ +#endif /* FORCE_INLINE */ + +#if (defined(__GNUC__) && (__GNUC__ >= 3)) || (defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 800)) || defined(__clang__) +# define expect(expr,value) (__builtin_expect ((expr),(value)) ) +#else +# define expect(expr,value) (expr) +#endif + +#define likely(expr) expect((expr) != 0, 1) +#define unlikely(expr) expect((expr) != 0, 0) + + +/*-************************************ +* Memory routines +**************************************/ +#include /* malloc, calloc, free */ +#define ALLOCATOR(n,s) calloc(n,s) +#define FREEMEM free +#include /* memset, memcpy */ +#define MEM_INIT memset + + +/*-************************************ +* Basic Types +**************************************/ +#if defined(__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) +# include + typedef uint8_t BYTE; + typedef uint16_t U16; + typedef uint32_t U32; + 
typedef int32_t S32; + typedef uint64_t U64; + typedef uintptr_t uptrval; +#else + typedef unsigned char BYTE; + typedef unsigned short U16; + typedef unsigned int U32; + typedef signed int S32; + typedef unsigned long long U64; + typedef size_t uptrval; /* generally true, except OpenVMS-64 */ +#endif + +#if defined(__x86_64__) + typedef U64 reg_t; /* 64-bits in x32 mode */ +#else + typedef size_t reg_t; /* 32-bits in x32 mode */ +#endif + +/*-************************************ +* Reading and writing into memory +**************************************/ +static unsigned LZ4_isLittleEndian(void) +{ + const union { U32 u; BYTE c[4]; } one = { 1 }; /* don't use static : performance detrimental */ + return one.c[0]; +} + + +#if defined(LZ4_FORCE_MEMORY_ACCESS) && (LZ4_FORCE_MEMORY_ACCESS==2) +/* lie to the compiler about data alignment; use with caution */ + +static U16 LZ4_read16(const void* memPtr) { return *(const U16*) memPtr; } +static U32 LZ4_read32(const void* memPtr) { return *(const U32*) memPtr; } +static reg_t LZ4_read_ARCH(const void* memPtr) { return *(const reg_t*) memPtr; } + +static void LZ4_write16(void* memPtr, U16 value) { *(U16*)memPtr = value; } +static void LZ4_write32(void* memPtr, U32 value) { *(U32*)memPtr = value; } + +#elif defined(LZ4_FORCE_MEMORY_ACCESS) && (LZ4_FORCE_MEMORY_ACCESS==1) + +/* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */ +/* currently only defined for gcc and icc */ +typedef union { U16 u16; U32 u32; reg_t uArch; } __attribute__((packed)) unalign; + +static U16 LZ4_read16(const void* ptr) { return ((const unalign*)ptr)->u16; } +static U32 LZ4_read32(const void* ptr) { return ((const unalign*)ptr)->u32; } +static reg_t LZ4_read_ARCH(const void* ptr) { return ((const unalign*)ptr)->uArch; } + +static void LZ4_write16(void* memPtr, U16 value) { ((unalign*)memPtr)->u16 = value; } +static void LZ4_write32(void* memPtr, U32 value) { ((unalign*)memPtr)->u32 = value; } + 
+#else /* safe and portable access through memcpy() */ + +static U16 LZ4_read16(const void* memPtr) +{ + U16 val; memcpy(&val, memPtr, sizeof(val)); return val; +} + +static U32 LZ4_read32(const void* memPtr) +{ + U32 val; memcpy(&val, memPtr, sizeof(val)); return val; +} + +static reg_t LZ4_read_ARCH(const void* memPtr) +{ + reg_t val; memcpy(&val, memPtr, sizeof(val)); return val; +} + +static void LZ4_write16(void* memPtr, U16 value) +{ + memcpy(memPtr, &value, sizeof(value)); +} + +static void LZ4_write32(void* memPtr, U32 value) +{ + memcpy(memPtr, &value, sizeof(value)); +} + +#endif /* LZ4_FORCE_MEMORY_ACCESS */ + + +static U16 LZ4_readLE16(const void* memPtr) +{ + if (LZ4_isLittleEndian()) { + return LZ4_read16(memPtr); + } else { + const BYTE* p = (const BYTE*)memPtr; + return (U16)((U16)p[0] + (p[1]<<8)); + } +} + +static void LZ4_writeLE16(void* memPtr, U16 value) +{ + if (LZ4_isLittleEndian()) { + LZ4_write16(memPtr, value); + } else { + BYTE* p = (BYTE*)memPtr; + p[0] = (BYTE) value; + p[1] = (BYTE)(value>>8); + } +} + +static void LZ4_copy8(void* dst, const void* src) +{ + memcpy(dst,src,8); +} + +/* customized variant of memcpy, which can overwrite up to 8 bytes beyond dstEnd */ +static void LZ4_wildCopy(void* dstPtr, const void* srcPtr, void* dstEnd) +{ + BYTE* d = (BYTE*)dstPtr; + const BYTE* s = (const BYTE*)srcPtr; + BYTE* const e = (BYTE*)dstEnd; + + do { LZ4_copy8(d,s); d+=8; s+=8; } while (d=2) +# include +# define DEBUGLOG(l, ...) { \ + if (l<=LZ4_DEBUG) { \ + fprintf(stderr, __FILE__ ": "); \ + fprintf(stderr, __VA_ARGS__); \ + fprintf(stderr, " \n"); \ + } } +#else +# define DEBUGLOG(l, ...) 
{} /* disabled */ +#endif + + +/*-************************************ +* Common functions +**************************************/ +static unsigned LZ4_NbCommonBytes (register reg_t val) +{ + if (LZ4_isLittleEndian()) { + if (sizeof(val)==8) { +# if defined(_MSC_VER) && defined(_WIN64) && !defined(LZ4_FORCE_SW_BITCOUNT) + unsigned long r = 0; + _BitScanForward64( &r, (U64)val ); + return (int)(r>>3); +# elif (defined(__clang__) || (defined(__GNUC__) && (__GNUC__>=3))) && !defined(LZ4_FORCE_SW_BITCOUNT) + return (__builtin_ctzll((U64)val) >> 3); +# else + static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2, 0, 3, 1, 3, 1, 4, 2, 7, 0, 2, 3, 6, 1, 5, 3, 5, 1, 3, 4, 4, 2, 5, 6, 7, 7, 0, 1, 2, 3, 3, 4, 6, 2, 6, 5, 5, 3, 4, 5, 6, 7, 1, 2, 4, 6, 4, 4, 5, 7, 2, 6, 5, 7, 6, 7, 7 }; + return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58]; +# endif + } else /* 32 bits */ { +# if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT) + unsigned long r; + _BitScanForward( &r, (U32)val ); + return (int)(r>>3); +# elif (defined(__clang__) || (defined(__GNUC__) && (__GNUC__>=3))) && !defined(LZ4_FORCE_SW_BITCOUNT) + return (__builtin_ctz((U32)val) >> 3); +# else + static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0, 3, 2, 2, 1, 3, 2, 0, 1, 3, 3, 1, 2, 2, 2, 2, 0, 3, 1, 2, 0, 1, 0, 1, 1 }; + return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27]; +# endif + } + } else /* Big Endian CPU */ { + if (sizeof(val)==8) { +# if defined(_MSC_VER) && defined(_WIN64) && !defined(LZ4_FORCE_SW_BITCOUNT) + unsigned long r = 0; + _BitScanReverse64( &r, val ); + return (unsigned)(r>>3); +# elif (defined(__clang__) || (defined(__GNUC__) && (__GNUC__>=3))) && !defined(LZ4_FORCE_SW_BITCOUNT) + return (__builtin_clzll((U64)val) >> 3); +# else + unsigned r; + if (!(val>>32)) { r=4; } else { r=0; val>>=32; } + if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; } + r += (!val); + return r; +# endif + } else /* 32 bits */ { +# if 
defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT) + unsigned long r = 0; + _BitScanReverse( &r, (unsigned long)val ); + return (unsigned)(r>>3); +# elif (defined(__clang__) || (defined(__GNUC__) && (__GNUC__>=3))) && !defined(LZ4_FORCE_SW_BITCOUNT) + return (__builtin_clz((U32)val) >> 3); +# else + unsigned r; + if (!(val>>16)) { r=2; val>>=8; } else { r=0; val>>=24; } + r += (!val); + return r; +# endif + } + } +} + +#define STEPSIZE sizeof(reg_t) +static unsigned LZ4_count(const BYTE* pIn, const BYTE* pMatch, const BYTE* pInLimit) +{ + const BYTE* const pStart = pIn; + + while (likely(pIn compression run slower on incompressible data */ + + +/*-************************************ +* Local Structures and types +**************************************/ +typedef enum { notLimited = 0, limitedOutput = 1 } limitedOutput_directive; +typedef enum { byPtr, byU32, byU16 } tableType_t; + +typedef enum { noDict = 0, withPrefix64k, usingExtDict } dict_directive; +typedef enum { noDictIssue = 0, dictSmall } dictIssue_directive; + +typedef enum { endOnOutputSize = 0, endOnInputSize = 1 } endCondition_directive; +typedef enum { full = 0, partial = 1 } earlyEnd_directive; + + +/*-************************************ +* Local Utils +**************************************/ +int LZ4_versionNumber (void) { return LZ4_VERSION_NUMBER; } +const char* LZ4_versionString(void) { return LZ4_VERSION_STRING; } +int LZ4_compressBound(int isize) { return LZ4_COMPRESSBOUND(isize); } +int LZ4_sizeofState() { return LZ4_STREAMSIZE; } + + +/*-****************************** +* Compression functions +********************************/ +static U32 LZ4_hash4(U32 sequence, tableType_t const tableType) +{ + if (tableType == byU16) + return ((sequence * 2654435761U) >> ((MINMATCH*8)-(LZ4_HASHLOG+1))); + else + return ((sequence * 2654435761U) >> ((MINMATCH*8)-LZ4_HASHLOG)); +} + +static U32 LZ4_hash5(U64 sequence, tableType_t const tableType) +{ + static const U64 prime5bytes = 889523592379ULL; + 
static const U64 prime8bytes = 11400714785074694791ULL; + const U32 hashLog = (tableType == byU16) ? LZ4_HASHLOG+1 : LZ4_HASHLOG; + if (LZ4_isLittleEndian()) + return (U32)(((sequence << 24) * prime5bytes) >> (64 - hashLog)); + else + return (U32)(((sequence >> 24) * prime8bytes) >> (64 - hashLog)); +} + +FORCE_INLINE U32 LZ4_hashPosition(const void* const p, tableType_t const tableType) +{ + if ((sizeof(reg_t)==8) && (tableType != byU16)) return LZ4_hash5(LZ4_read_ARCH(p), tableType); + return LZ4_hash4(LZ4_read32(p), tableType); +} + +static void LZ4_putPositionOnHash(const BYTE* p, U32 h, void* tableBase, tableType_t const tableType, const BYTE* srcBase) +{ + switch (tableType) + { + case byPtr: { const BYTE** hashTable = (const BYTE**)tableBase; hashTable[h] = p; return; } + case byU32: { U32* hashTable = (U32*) tableBase; hashTable[h] = (U32)(p-srcBase); return; } + case byU16: { U16* hashTable = (U16*) tableBase; hashTable[h] = (U16)(p-srcBase); return; } + } +} + +FORCE_INLINE void LZ4_putPosition(const BYTE* p, void* tableBase, tableType_t tableType, const BYTE* srcBase) +{ + U32 const h = LZ4_hashPosition(p, tableType); + LZ4_putPositionOnHash(p, h, tableBase, tableType, srcBase); +} + +static const BYTE* LZ4_getPositionOnHash(U32 h, void* tableBase, tableType_t tableType, const BYTE* srcBase) +{ + if (tableType == byPtr) { const BYTE** hashTable = (const BYTE**) tableBase; return hashTable[h]; } + if (tableType == byU32) { const U32* const hashTable = (U32*) tableBase; return hashTable[h] + srcBase; } + { const U16* const hashTable = (U16*) tableBase; return hashTable[h] + srcBase; } /* default, to ensure a return */ +} + +FORCE_INLINE const BYTE* LZ4_getPosition(const BYTE* p, void* tableBase, tableType_t tableType, const BYTE* srcBase) +{ + U32 const h = LZ4_hashPosition(p, tableType); + return LZ4_getPositionOnHash(h, tableBase, tableType, srcBase); +} + + +/** LZ4_compress_generic() : + inlined, to ensure branches are decided at compilation time */ 
+FORCE_INLINE int LZ4_compress_generic( + LZ4_stream_t_internal* const cctx, + const char* const source, + char* const dest, + const int inputSize, + const int maxOutputSize, + const limitedOutput_directive outputLimited, + const tableType_t tableType, + const dict_directive dict, + const dictIssue_directive dictIssue, + const U32 acceleration) +{ + const BYTE* ip = (const BYTE*) source; + const BYTE* base; + const BYTE* lowLimit; + const BYTE* const lowRefLimit = ip - cctx->dictSize; + const BYTE* const dictionary = cctx->dictionary; + const BYTE* const dictEnd = dictionary + cctx->dictSize; + const ptrdiff_t dictDelta = dictEnd - (const BYTE*)source; + const BYTE* anchor = (const BYTE*) source; + const BYTE* const iend = ip + inputSize; + const BYTE* const mflimit = iend - MFLIMIT; + const BYTE* const matchlimit = iend - LASTLITERALS; + + BYTE* op = (BYTE*) dest; + BYTE* const olimit = op + maxOutputSize; + + U32 forwardH; + + /* Init conditions */ + if ((U32)inputSize > (U32)LZ4_MAX_INPUT_SIZE) return 0; /* Unsupported inputSize, too large (or negative) */ + switch(dict) + { + case noDict: + default: + base = (const BYTE*)source; + lowLimit = (const BYTE*)source; + break; + case withPrefix64k: + base = (const BYTE*)source - cctx->currentOffset; + lowLimit = (const BYTE*)source - cctx->dictSize; + break; + case usingExtDict: + base = (const BYTE*)source - cctx->currentOffset; + lowLimit = (const BYTE*)source; + break; + } + if ((tableType == byU16) && (inputSize>=LZ4_64Klimit)) return 0; /* Size too large (not within 64K limit) */ + if (inputSizehashTable, tableType, base); + ip++; forwardH = LZ4_hashPosition(ip, tableType); + + /* Main Loop */ + for ( ; ; ) { + ptrdiff_t refDelta = 0; + const BYTE* match; + BYTE* token; + + /* Find a match */ + { const BYTE* forwardIp = ip; + unsigned step = 1; + unsigned searchMatchNb = acceleration << LZ4_skipTrigger; + do { + U32 const h = forwardH; + ip = forwardIp; + forwardIp += step; + step = (searchMatchNb++ >> 
LZ4_skipTrigger); + + if (unlikely(forwardIp > mflimit)) goto _last_literals; + + match = LZ4_getPositionOnHash(h, cctx->hashTable, tableType, base); + if (dict==usingExtDict) { + if (match < (const BYTE*)source) { + refDelta = dictDelta; + lowLimit = dictionary; + } else { + refDelta = 0; + lowLimit = (const BYTE*)source; + } } + forwardH = LZ4_hashPosition(forwardIp, tableType); + LZ4_putPositionOnHash(ip, h, cctx->hashTable, tableType, base); + + } while ( ((dictIssue==dictSmall) ? (match < lowRefLimit) : 0) + || ((tableType==byU16) ? 0 : (match + MAX_DISTANCE < ip)) + || (LZ4_read32(match+refDelta) != LZ4_read32(ip)) ); + } + + /* Catch up */ + while (((ip>anchor) & (match+refDelta > lowLimit)) && (unlikely(ip[-1]==match[refDelta-1]))) { ip--; match--; } + + /* Encode Literals */ + { unsigned const litLength = (unsigned)(ip - anchor); + token = op++; + if ((outputLimited) && /* Check output buffer overflow */ + (unlikely(op + litLength + (2 + 1 + LASTLITERALS) + (litLength/255) > olimit))) + return 0; + if (litLength >= RUN_MASK) { + int len = (int)litLength-RUN_MASK; + *token = (RUN_MASK<= 255 ; len-=255) *op++ = 255; + *op++ = (BYTE)len; + } + else *token = (BYTE)(litLength< matchlimit) limit = matchlimit; + matchCode = LZ4_count(ip+MINMATCH, match+MINMATCH, limit); + ip += MINMATCH + matchCode; + if (ip==limit) { + unsigned const more = LZ4_count(ip, (const BYTE*)source, matchlimit); + matchCode += more; + ip += more; + } + } else { + matchCode = LZ4_count(ip+MINMATCH, match+MINMATCH, matchlimit); + ip += MINMATCH + matchCode; + } + + if ( outputLimited && /* Check output buffer overflow */ + (unlikely(op + (1 + LASTLITERALS) + (matchCode>>8) > olimit)) ) + return 0; + if (matchCode >= ML_MASK) { + *token += ML_MASK; + matchCode -= ML_MASK; + LZ4_write32(op, 0xFFFFFFFF); + while (matchCode >= 4*255) op+=4, LZ4_write32(op, 0xFFFFFFFF), matchCode -= 4*255; + op += matchCode / 255; + *op++ = (BYTE)(matchCode % 255); + } else + *token += (BYTE)(matchCode); + } + 
+ anchor = ip; + + /* Test end of chunk */ + if (ip > mflimit) break; + + /* Fill table */ + LZ4_putPosition(ip-2, cctx->hashTable, tableType, base); + + /* Test next position */ + match = LZ4_getPosition(ip, cctx->hashTable, tableType, base); + if (dict==usingExtDict) { + if (match < (const BYTE*)source) { + refDelta = dictDelta; + lowLimit = dictionary; + } else { + refDelta = 0; + lowLimit = (const BYTE*)source; + } } + LZ4_putPosition(ip, cctx->hashTable, tableType, base); + if ( ((dictIssue==dictSmall) ? (match>=lowRefLimit) : 1) + && (match+MAX_DISTANCE>=ip) + && (LZ4_read32(match+refDelta)==LZ4_read32(ip)) ) + { token=op++; *token=0; goto _next_match; } + + /* Prepare next loop */ + forwardH = LZ4_hashPosition(++ip, tableType); + } + +_last_literals: + /* Encode Last Literals */ + { size_t const lastRun = (size_t)(iend - anchor); + if ( (outputLimited) && /* Check output buffer overflow */ + ((op - (BYTE*)dest) + lastRun + 1 + ((lastRun+255-RUN_MASK)/255) > (U32)maxOutputSize) ) + return 0; + if (lastRun >= RUN_MASK) { + size_t accumulator = lastRun - RUN_MASK; + *op++ = RUN_MASK << ML_BITS; + for(; accumulator >= 255 ; accumulator-=255) *op++ = 255; + *op++ = (BYTE) accumulator; + } else { + *op++ = (BYTE)(lastRun<internal_donotuse; + LZ4_resetStream((LZ4_stream_t*)state); + if (acceleration < 1) acceleration = ACCELERATION_DEFAULT; + + if (maxOutputSize >= LZ4_compressBound(inputSize)) { + if (inputSize < LZ4_64Klimit) + return LZ4_compress_generic(ctx, source, dest, inputSize, 0, notLimited, byU16, noDict, noDictIssue, acceleration); + else + return LZ4_compress_generic(ctx, source, dest, inputSize, 0, notLimited, (sizeof(void*)==8) ? 
byU32 : byPtr, noDict, noDictIssue, acceleration); + } else { + if (inputSize < LZ4_64Klimit) + return LZ4_compress_generic(ctx, source, dest, inputSize, maxOutputSize, limitedOutput, byU16, noDict, noDictIssue, acceleration); + else + return LZ4_compress_generic(ctx, source, dest, inputSize, maxOutputSize, limitedOutput, (sizeof(void*)==8) ? byU32 : byPtr, noDict, noDictIssue, acceleration); + } +} + + +int LZ4_compress_fast(const char* source, char* dest, int inputSize, int maxOutputSize, int acceleration) +{ +#if (LZ4_HEAPMODE) + void* ctxPtr = ALLOCATOR(1, sizeof(LZ4_stream_t)); /* malloc-calloc always properly aligned */ +#else + LZ4_stream_t ctx; + void* const ctxPtr = &ctx; +#endif + + int const result = LZ4_compress_fast_extState(ctxPtr, source, dest, inputSize, maxOutputSize, acceleration); + +#if (LZ4_HEAPMODE) + FREEMEM(ctxPtr); +#endif + return result; +} + + +int LZ4_compress_default(const char* source, char* dest, int inputSize, int maxOutputSize) +{ + return LZ4_compress_fast(source, dest, inputSize, maxOutputSize, 1); +} + + +/* hidden debug function */ +/* strangely enough, gcc generates faster code when this function is uncommented, even if unused */ +int LZ4_compress_fast_force(const char* source, char* dest, int inputSize, int maxOutputSize, int acceleration) +{ + LZ4_stream_t ctx; + LZ4_resetStream(&ctx); + + if (inputSize < LZ4_64Klimit) + return LZ4_compress_generic(&ctx.internal_donotuse, source, dest, inputSize, maxOutputSize, limitedOutput, byU16, noDict, noDictIssue, acceleration); + else + return LZ4_compress_generic(&ctx.internal_donotuse, source, dest, inputSize, maxOutputSize, limitedOutput, sizeof(void*)==8 ? 
byU32 : byPtr, noDict, noDictIssue, acceleration); +} + + +/*-****************************** +* *_destSize() variant +********************************/ + +static int LZ4_compress_destSize_generic( + LZ4_stream_t_internal* const ctx, + const char* const src, + char* const dst, + int* const srcSizePtr, + const int targetDstSize, + const tableType_t tableType) +{ + const BYTE* ip = (const BYTE*) src; + const BYTE* base = (const BYTE*) src; + const BYTE* lowLimit = (const BYTE*) src; + const BYTE* anchor = ip; + const BYTE* const iend = ip + *srcSizePtr; + const BYTE* const mflimit = iend - MFLIMIT; + const BYTE* const matchlimit = iend - LASTLITERALS; + + BYTE* op = (BYTE*) dst; + BYTE* const oend = op + targetDstSize; + BYTE* const oMaxLit = op + targetDstSize - 2 /* offset */ - 8 /* because 8+MINMATCH==MFLIMIT */ - 1 /* token */; + BYTE* const oMaxMatch = op + targetDstSize - (LASTLITERALS + 1 /* token */); + BYTE* const oMaxSeq = oMaxLit - 1 /* token */; + + U32 forwardH; + + + /* Init conditions */ + if (targetDstSize < 1) return 0; /* Impossible to store anything */ + if ((U32)*srcSizePtr > (U32)LZ4_MAX_INPUT_SIZE) return 0; /* Unsupported input size, too large (or negative) */ + if ((tableType == byU16) && (*srcSizePtr>=LZ4_64Klimit)) return 0; /* Size too large (not within 64K limit) */ + if (*srcSizePtrhashTable, tableType, base); + ip++; forwardH = LZ4_hashPosition(ip, tableType); + + /* Main Loop */ + for ( ; ; ) { + const BYTE* match; + BYTE* token; + + /* Find a match */ + { const BYTE* forwardIp = ip; + unsigned step = 1; + unsigned searchMatchNb = 1 << LZ4_skipTrigger; + + do { + U32 h = forwardH; + ip = forwardIp; + forwardIp += step; + step = (searchMatchNb++ >> LZ4_skipTrigger); + + if (unlikely(forwardIp > mflimit)) goto _last_literals; + + match = LZ4_getPositionOnHash(h, ctx->hashTable, tableType, base); + forwardH = LZ4_hashPosition(forwardIp, tableType); + LZ4_putPositionOnHash(ip, h, ctx->hashTable, tableType, base); + + } while ( 
((tableType==byU16) ? 0 : (match + MAX_DISTANCE < ip)) + || (LZ4_read32(match) != LZ4_read32(ip)) ); + } + + /* Catch up */ + while ((ip>anchor) && (match > lowLimit) && (unlikely(ip[-1]==match[-1]))) { ip--; match--; } + + /* Encode Literal length */ + { unsigned litLength = (unsigned)(ip - anchor); + token = op++; + if (op + ((litLength+240)/255) + litLength > oMaxLit) { + /* Not enough space for a last match */ + op--; + goto _last_literals; + } + if (litLength>=RUN_MASK) { + unsigned len = litLength - RUN_MASK; + *token=(RUN_MASK<= 255 ; len-=255) *op++ = 255; + *op++ = (BYTE)len; + } + else *token = (BYTE)(litLength< oMaxMatch) { + /* Match description too long : reduce it */ + matchLength = (15-1) + (oMaxMatch-op) * 255; + } + ip += MINMATCH + matchLength; + + if (matchLength>=ML_MASK) { + *token += ML_MASK; + matchLength -= ML_MASK; + while (matchLength >= 255) { matchLength-=255; *op++ = 255; } + *op++ = (BYTE)matchLength; + } + else *token += (BYTE)(matchLength); + } + + anchor = ip; + + /* Test end of block */ + if (ip > mflimit) break; + if (op > oMaxSeq) break; + + /* Fill table */ + LZ4_putPosition(ip-2, ctx->hashTable, tableType, base); + + /* Test next position */ + match = LZ4_getPosition(ip, ctx->hashTable, tableType, base); + LZ4_putPosition(ip, ctx->hashTable, tableType, base); + if ( (match+MAX_DISTANCE>=ip) + && (LZ4_read32(match)==LZ4_read32(ip)) ) + { token=op++; *token=0; goto _next_match; } + + /* Prepare next loop */ + forwardH = LZ4_hashPosition(++ip, tableType); + } + +_last_literals: + /* Encode Last Literals */ + { size_t lastRunSize = (size_t)(iend - anchor); + if (op + 1 /* token */ + ((lastRunSize+240)/255) /* litLength */ + lastRunSize /* literals */ > oend) { + /* adapt lastRunSize to fill 'dst' */ + lastRunSize = (oend-op) - 1; + lastRunSize -= (lastRunSize+240)/255; + } + ip = anchor + lastRunSize; + + if (lastRunSize >= RUN_MASK) { + size_t accumulator = lastRunSize - RUN_MASK; + *op++ = RUN_MASK << ML_BITS; + for(; accumulator 
>= 255 ; accumulator-=255) *op++ = 255; + *op++ = (BYTE) accumulator; + } else { + *op++ = (BYTE)(lastRunSize<= LZ4_compressBound(*srcSizePtr)) { /* compression success is guaranteed */ + return LZ4_compress_fast_extState(state, src, dst, *srcSizePtr, targetDstSize, 1); + } else { + if (*srcSizePtr < LZ4_64Klimit) + return LZ4_compress_destSize_generic(&state->internal_donotuse, src, dst, srcSizePtr, targetDstSize, byU16); + else + return LZ4_compress_destSize_generic(&state->internal_donotuse, src, dst, srcSizePtr, targetDstSize, sizeof(void*)==8 ? byU32 : byPtr); + } +} + + +int LZ4_compress_destSize(const char* src, char* dst, int* srcSizePtr, int targetDstSize) +{ +#if (LZ4_HEAPMODE) + LZ4_stream_t* ctx = (LZ4_stream_t*)ALLOCATOR(1, sizeof(LZ4_stream_t)); /* malloc-calloc always properly aligned */ +#else + LZ4_stream_t ctxBody; + LZ4_stream_t* ctx = &ctxBody; +#endif + + int result = LZ4_compress_destSize_extState(ctx, src, dst, srcSizePtr, targetDstSize); + +#if (LZ4_HEAPMODE) + FREEMEM(ctx); +#endif + return result; +} + + + +/*-****************************** +* Streaming functions +********************************/ + +LZ4_stream_t* LZ4_createStream(void) +{ + LZ4_stream_t* lz4s = (LZ4_stream_t*)ALLOCATOR(8, LZ4_STREAMSIZE_U64); + LZ4_STATIC_ASSERT(LZ4_STREAMSIZE >= sizeof(LZ4_stream_t_internal)); /* A compilation error here means LZ4_STREAMSIZE is not large enough */ + LZ4_resetStream(lz4s); + return lz4s; +} + +void LZ4_resetStream (LZ4_stream_t* LZ4_stream) +{ + MEM_INIT(LZ4_stream, 0, sizeof(LZ4_stream_t)); +} + +int LZ4_freeStream (LZ4_stream_t* LZ4_stream) +{ + if (!LZ4_stream) return 0; /* support free on NULL */ + FREEMEM(LZ4_stream); + return (0); +} + + +#define HASH_UNIT sizeof(reg_t) +int LZ4_loadDict (LZ4_stream_t* LZ4_dict, const char* dictionary, int dictSize) +{ + LZ4_stream_t_internal* dict = &LZ4_dict->internal_donotuse; + const BYTE* p = (const BYTE*)dictionary; + const BYTE* const dictEnd = p + dictSize; + const BYTE* base; + + if 
((dict->initCheck) || (dict->currentOffset > 1 GB)) /* Uninitialized structure, or reuse overflow */ + LZ4_resetStream(LZ4_dict); + + if (dictSize < (int)HASH_UNIT) { + dict->dictionary = NULL; + dict->dictSize = 0; + return 0; + } + + if ((dictEnd - p) > 64 KB) p = dictEnd - 64 KB; + dict->currentOffset += 64 KB; + base = p - dict->currentOffset; + dict->dictionary = p; + dict->dictSize = (U32)(dictEnd - p); + dict->currentOffset += dict->dictSize; + + while (p <= dictEnd-HASH_UNIT) { + LZ4_putPosition(p, dict->hashTable, byU32, base); + p+=3; + } + + return dict->dictSize; +} + + +static void LZ4_renormDictT(LZ4_stream_t_internal* LZ4_dict, const BYTE* src) +{ + if ((LZ4_dict->currentOffset > 0x80000000) || + ((uptrval)LZ4_dict->currentOffset > (uptrval)src)) { /* address space overflow */ + /* rescale hash table */ + U32 const delta = LZ4_dict->currentOffset - 64 KB; + const BYTE* dictEnd = LZ4_dict->dictionary + LZ4_dict->dictSize; + int i; + for (i=0; ihashTable[i] < delta) LZ4_dict->hashTable[i]=0; + else LZ4_dict->hashTable[i] -= delta; + } + LZ4_dict->currentOffset = 64 KB; + if (LZ4_dict->dictSize > 64 KB) LZ4_dict->dictSize = 64 KB; + LZ4_dict->dictionary = dictEnd - LZ4_dict->dictSize; + } +} + + +int LZ4_compress_fast_continue (LZ4_stream_t* LZ4_stream, const char* source, char* dest, int inputSize, int maxOutputSize, int acceleration) +{ + LZ4_stream_t_internal* streamPtr = &LZ4_stream->internal_donotuse; + const BYTE* const dictEnd = streamPtr->dictionary + streamPtr->dictSize; + + const BYTE* smallest = (const BYTE*) source; + if (streamPtr->initCheck) return 0; /* Uninitialized structure detected */ + if ((streamPtr->dictSize>0) && (smallest>dictEnd)) smallest = dictEnd; + LZ4_renormDictT(streamPtr, smallest); + if (acceleration < 1) acceleration = ACCELERATION_DEFAULT; + + /* Check overlapping input/dictionary space */ + { const BYTE* sourceEnd = (const BYTE*) source + inputSize; + if ((sourceEnd > streamPtr->dictionary) && (sourceEnd < dictEnd)) { 
+ streamPtr->dictSize = (U32)(dictEnd - sourceEnd); + if (streamPtr->dictSize > 64 KB) streamPtr->dictSize = 64 KB; + if (streamPtr->dictSize < 4) streamPtr->dictSize = 0; + streamPtr->dictionary = dictEnd - streamPtr->dictSize; + } + } + + /* prefix mode : source data follows dictionary */ + if (dictEnd == (const BYTE*)source) { + int result; + if ((streamPtr->dictSize < 64 KB) && (streamPtr->dictSize < streamPtr->currentOffset)) + result = LZ4_compress_generic(streamPtr, source, dest, inputSize, maxOutputSize, limitedOutput, byU32, withPrefix64k, dictSmall, acceleration); + else + result = LZ4_compress_generic(streamPtr, source, dest, inputSize, maxOutputSize, limitedOutput, byU32, withPrefix64k, noDictIssue, acceleration); + streamPtr->dictSize += (U32)inputSize; + streamPtr->currentOffset += (U32)inputSize; + return result; + } + + /* external dictionary mode */ + { int result; + if ((streamPtr->dictSize < 64 KB) && (streamPtr->dictSize < streamPtr->currentOffset)) + result = LZ4_compress_generic(streamPtr, source, dest, inputSize, maxOutputSize, limitedOutput, byU32, usingExtDict, dictSmall, acceleration); + else + result = LZ4_compress_generic(streamPtr, source, dest, inputSize, maxOutputSize, limitedOutput, byU32, usingExtDict, noDictIssue, acceleration); + streamPtr->dictionary = (const BYTE*)source; + streamPtr->dictSize = (U32)inputSize; + streamPtr->currentOffset += (U32)inputSize; + return result; + } +} + + +/* Hidden debug function, to force external dictionary mode */ +int LZ4_compress_forceExtDict (LZ4_stream_t* LZ4_dict, const char* source, char* dest, int inputSize) +{ + LZ4_stream_t_internal* streamPtr = &LZ4_dict->internal_donotuse; + int result; + const BYTE* const dictEnd = streamPtr->dictionary + streamPtr->dictSize; + + const BYTE* smallest = dictEnd; + if (smallest > (const BYTE*) source) smallest = (const BYTE*) source; + LZ4_renormDictT(streamPtr, smallest); + + result = LZ4_compress_generic(streamPtr, source, dest, inputSize, 0, 
notLimited, byU32, usingExtDict, noDictIssue, 1); + + streamPtr->dictionary = (const BYTE*)source; + streamPtr->dictSize = (U32)inputSize; + streamPtr->currentOffset += (U32)inputSize; + + return result; +} + + +/*! LZ4_saveDict() : + * If previously compressed data block is not guaranteed to remain available at its memory location, + * save it into a safer place (char* safeBuffer). + * Note : you don't need to call LZ4_loadDict() afterwards, + * dictionary is immediately usable, you can therefore call LZ4_compress_fast_continue(). + * Return : saved dictionary size in bytes (necessarily <= dictSize), or 0 if error. + */ +int LZ4_saveDict (LZ4_stream_t* LZ4_dict, char* safeBuffer, int dictSize) +{ + LZ4_stream_t_internal* const dict = &LZ4_dict->internal_donotuse; + const BYTE* const previousDictEnd = dict->dictionary + dict->dictSize; + + if ((U32)dictSize > 64 KB) dictSize = 64 KB; /* useless to define a dictionary > 64 KB */ + if ((U32)dictSize > dict->dictSize) dictSize = dict->dictSize; + + memmove(safeBuffer, previousDictEnd - dictSize, dictSize); + + dict->dictionary = (const BYTE*)safeBuffer; + dict->dictSize = (U32)dictSize; + + return dictSize; +} + + + +/*-***************************** +* Decompression functions +*******************************/ +/*! LZ4_decompress_generic() : + * This generic decompression function cover all use cases. + * It shall be instantiated several times, using different sets of directives + * Note that it is important this generic function is really inlined, + * in order to remove useless branches during compilation optimization. + */ +FORCE_INLINE int LZ4_decompress_generic( + const char* const source, + char* const dest, + int inputSize, + int outputSize, /* If endOnInput==endOnInputSize, this value is the max size of Output Buffer. 
*/ + + int endOnInput, /* endOnOutputSize, endOnInputSize */ + int partialDecoding, /* full, partial */ + int targetOutputSize, /* only used if partialDecoding==partial */ + int dict, /* noDict, withPrefix64k, usingExtDict */ + const BYTE* const lowPrefix, /* == dest when no prefix */ + const BYTE* const dictStart, /* only if dict==usingExtDict */ + const size_t dictSize /* note : = 0 if noDict */ + ) +{ + /* Local Variables */ + const BYTE* ip = (const BYTE*) source; + const BYTE* const iend = ip + inputSize; + + BYTE* op = (BYTE*) dest; + BYTE* const oend = op + outputSize; + BYTE* cpy; + BYTE* oexit = op + targetOutputSize; + const BYTE* const lowLimit = lowPrefix - dictSize; + + const BYTE* const dictEnd = (const BYTE*)dictStart + dictSize; + const unsigned dec32table[] = {0, 1, 2, 1, 4, 4, 4, 4}; + const int dec64table[] = {0, 0, 0, -1, 0, 1, 2, 3}; + + const int safeDecode = (endOnInput==endOnInputSize); + const int checkOffset = ((safeDecode) && (dictSize < (int)(64 KB))); + + + /* Special cases */ + if ((partialDecoding) && (oexit > oend-MFLIMIT)) oexit = oend-MFLIMIT; /* targetOutputSize too high => decode everything */ + if ((endOnInput) && (unlikely(outputSize==0))) return ((inputSize==1) && (*ip==0)) ? 0 : -1; /* Empty output buffer */ + if ((!endOnInput) && (unlikely(outputSize==0))) return (*ip==0?1:-1); + + /* Main Loop : decode sequences */ + while (1) { + size_t length; + const BYTE* match; + size_t offset; + + /* get literal length */ + unsigned const token = *ip++; + if ((length=(token>>ML_BITS)) == RUN_MASK) { + unsigned s; + do { + s = *ip++; + length += s; + } while ( likely(endOnInput ? 
ip(partialDecoding?oexit:oend-MFLIMIT)) || (ip+length>iend-(2+1+LASTLITERALS))) ) + || ((!endOnInput) && (cpy>oend-WILDCOPYLENGTH)) ) + { + if (partialDecoding) { + if (cpy > oend) goto _output_error; /* Error : write attempt beyond end of output buffer */ + if ((endOnInput) && (ip+length > iend)) goto _output_error; /* Error : read attempt beyond end of input buffer */ + } else { + if ((!endOnInput) && (cpy != oend)) goto _output_error; /* Error : block decoding must stop exactly there */ + if ((endOnInput) && ((ip+length != iend) || (cpy > oend))) goto _output_error; /* Error : input must be consumed */ + } + memcpy(op, ip, length); + ip += length; + op += length; + break; /* Necessarily EOF, due to parsing restrictions */ + } + LZ4_wildCopy(op, ip, cpy); + ip += length; op = cpy; + + /* get offset */ + offset = LZ4_readLE16(ip); ip+=2; + match = op - offset; + if ((checkOffset) && (unlikely(match < lowLimit))) goto _output_error; /* Error : offset outside buffers */ + LZ4_write32(op, (U32)offset); /* costs ~1%; silence an msan warning when offset==0 */ + + /* get matchlength */ + length = token & ML_MASK; + if (length == ML_MASK) { + unsigned s; + do { + s = *ip++; + if ((endOnInput) && (ip > iend-LASTLITERALS)) goto _output_error; + length += s; + } while (s==255); + if ((safeDecode) && unlikely((uptrval)(op)+length<(uptrval)op)) goto _output_error; /* overflow detection */ + } + length += MINMATCH; + + /* check external dictionary */ + if ((dict==usingExtDict) && (match < lowPrefix)) { + if (unlikely(op+length > oend-LASTLITERALS)) goto _output_error; /* doesn't respect parsing restriction */ + + if (length <= (size_t)(lowPrefix-match)) { + /* match can be copied as a single segment from external dictionary */ + memmove(op, dictEnd - (lowPrefix-match), length); + op += length; + } else { + /* match encompass external dictionary and current block */ + size_t const copySize = (size_t)(lowPrefix-match); + size_t const restSize = length - copySize; + memcpy(op, 
dictEnd - copySize, copySize); + op += copySize; + if (restSize > (size_t)(op-lowPrefix)) { /* overlap copy */ + BYTE* const endOfMatch = op + restSize; + const BYTE* copyFrom = lowPrefix; + while (op < endOfMatch) *op++ = *copyFrom++; + } else { + memcpy(op, lowPrefix, restSize); + op += restSize; + } } + continue; + } + + /* copy match within block */ + cpy = op + length; + if (unlikely(offset<8)) { + const int dec64 = dec64table[offset]; + op[0] = match[0]; + op[1] = match[1]; + op[2] = match[2]; + op[3] = match[3]; + match += dec32table[offset]; + memcpy(op+4, match, 4); + match -= dec64; + } else { LZ4_copy8(op, match); match+=8; } + op += 8; + + if (unlikely(cpy>oend-12)) { + BYTE* const oCopyLimit = oend-(WILDCOPYLENGTH-1); + if (cpy > oend-LASTLITERALS) goto _output_error; /* Error : last LASTLITERALS bytes must be literals (uncompressed) */ + if (op < oCopyLimit) { + LZ4_wildCopy(op, match, oCopyLimit); + match += oCopyLimit - op; + op = oCopyLimit; + } + while (op16) LZ4_wildCopy(op+8, match+8, cpy); + } + op=cpy; /* correction */ + } + + /* end of decoding */ + if (endOnInput) + return (int) (((char*)op)-dest); /* Nb of output bytes decoded */ + else + return (int) (((const char*)ip)-source); /* Nb of input bytes read */ + + /* Overflow error detected */ +_output_error: + return (int) (-(((const char*)ip)-source))-1; +} + + +int LZ4_decompress_safe(const char* source, char* dest, int compressedSize, int maxDecompressedSize) +{ + return LZ4_decompress_generic(source, dest, compressedSize, maxDecompressedSize, endOnInputSize, full, 0, noDict, (BYTE*)dest, NULL, 0); +} + +int LZ4_decompress_safe_partial(const char* source, char* dest, int compressedSize, int targetOutputSize, int maxDecompressedSize) +{ + return LZ4_decompress_generic(source, dest, compressedSize, maxDecompressedSize, endOnInputSize, partial, targetOutputSize, noDict, (BYTE*)dest, NULL, 0); +} + +int LZ4_decompress_fast(const char* source, char* dest, int originalSize) +{ + return 
LZ4_decompress_generic(source, dest, 0, originalSize, endOnOutputSize, full, 0, withPrefix64k, (BYTE*)(dest - 64 KB), NULL, 64 KB); +} + + +/*===== streaming decompression functions =====*/ + +LZ4_streamDecode_t* LZ4_createStreamDecode(void) +{ + LZ4_streamDecode_t* lz4s = (LZ4_streamDecode_t*) ALLOCATOR(1, sizeof(LZ4_streamDecode_t)); + return lz4s; +} + +int LZ4_freeStreamDecode (LZ4_streamDecode_t* LZ4_stream) +{ + if (!LZ4_stream) return 0; /* support free on NULL */ + FREEMEM(LZ4_stream); + return 0; +} + +/*! + * LZ4_setStreamDecode() : + * Use this function to instruct where to find the dictionary. + * This function is not necessary if previous data is still available where it was decoded. + * Loading a size of 0 is allowed (same effect as no dictionary). + * Return : 1 if OK, 0 if error + */ +int LZ4_setStreamDecode (LZ4_streamDecode_t* LZ4_streamDecode, const char* dictionary, int dictSize) +{ + LZ4_streamDecode_t_internal* lz4sd = &LZ4_streamDecode->internal_donotuse; + lz4sd->prefixSize = (size_t) dictSize; + lz4sd->prefixEnd = (const BYTE*) dictionary + dictSize; + lz4sd->externalDict = NULL; + lz4sd->extDictSize = 0; + return 1; +} + +/* +*_continue() : + These decoding functions allow decompression of multiple blocks in "streaming" mode. + Previously decoded blocks must still be available at the memory position where they were decoded. 
+ If it's not possible, save the relevant part of decoded data into a safe buffer, + and indicate where it stands using LZ4_setStreamDecode() +*/ +int LZ4_decompress_safe_continue (LZ4_streamDecode_t* LZ4_streamDecode, const char* source, char* dest, int compressedSize, int maxOutputSize) +{ + LZ4_streamDecode_t_internal* lz4sd = &LZ4_streamDecode->internal_donotuse; + int result; + + if (lz4sd->prefixEnd == (BYTE*)dest) { + result = LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, + endOnInputSize, full, 0, + usingExtDict, lz4sd->prefixEnd - lz4sd->prefixSize, lz4sd->externalDict, lz4sd->extDictSize); + if (result <= 0) return result; + lz4sd->prefixSize += result; + lz4sd->prefixEnd += result; + } else { + lz4sd->extDictSize = lz4sd->prefixSize; + lz4sd->externalDict = lz4sd->prefixEnd - lz4sd->extDictSize; + result = LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, + endOnInputSize, full, 0, + usingExtDict, (BYTE*)dest, lz4sd->externalDict, lz4sd->extDictSize); + if (result <= 0) return result; + lz4sd->prefixSize = result; + lz4sd->prefixEnd = (BYTE*)dest + result; + } + + return result; +} + +int LZ4_decompress_fast_continue (LZ4_streamDecode_t* LZ4_streamDecode, const char* source, char* dest, int originalSize) +{ + LZ4_streamDecode_t_internal* lz4sd = &LZ4_streamDecode->internal_donotuse; + int result; + + if (lz4sd->prefixEnd == (BYTE*)dest) { + result = LZ4_decompress_generic(source, dest, 0, originalSize, + endOnOutputSize, full, 0, + usingExtDict, lz4sd->prefixEnd - lz4sd->prefixSize, lz4sd->externalDict, lz4sd->extDictSize); + if (result <= 0) return result; + lz4sd->prefixSize += originalSize; + lz4sd->prefixEnd += originalSize; + } else { + lz4sd->extDictSize = lz4sd->prefixSize; + lz4sd->externalDict = lz4sd->prefixEnd - lz4sd->extDictSize; + result = LZ4_decompress_generic(source, dest, 0, originalSize, + endOnOutputSize, full, 0, + usingExtDict, (BYTE*)dest, lz4sd->externalDict, lz4sd->extDictSize); + if 
(result <= 0) return result; + lz4sd->prefixSize = originalSize; + lz4sd->prefixEnd = (BYTE*)dest + originalSize; + } + + return result; +} + + +/* +Advanced decoding functions : +*_usingDict() : + These decoding functions work the same as "_continue" ones, + the dictionary must be explicitly provided within parameters +*/ + +FORCE_INLINE int LZ4_decompress_usingDict_generic(const char* source, char* dest, int compressedSize, int maxOutputSize, int safe, const char* dictStart, int dictSize) +{ + if (dictSize==0) + return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, safe, full, 0, noDict, (BYTE*)dest, NULL, 0); + if (dictStart+dictSize == dest) { + if (dictSize >= (int)(64 KB - 1)) + return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, safe, full, 0, withPrefix64k, (BYTE*)dest-64 KB, NULL, 0); + return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, safe, full, 0, noDict, (BYTE*)dest-dictSize, NULL, 0); + } + return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, safe, full, 0, usingExtDict, (BYTE*)dest, (const BYTE*)dictStart, dictSize); +} + +int LZ4_decompress_safe_usingDict(const char* source, char* dest, int compressedSize, int maxOutputSize, const char* dictStart, int dictSize) +{ + return LZ4_decompress_usingDict_generic(source, dest, compressedSize, maxOutputSize, 1, dictStart, dictSize); +} + +int LZ4_decompress_fast_usingDict(const char* source, char* dest, int originalSize, const char* dictStart, int dictSize) +{ + return LZ4_decompress_usingDict_generic(source, dest, 0, originalSize, 0, dictStart, dictSize); +} + +/* debug function */ +int LZ4_decompress_safe_forceExtDict(const char* source, char* dest, int compressedSize, int maxOutputSize, const char* dictStart, int dictSize) +{ + return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, endOnInputSize, full, 0, usingExtDict, (BYTE*)dest, (const BYTE*)dictStart, dictSize); +} + + 
+/*=************************************************* +* Obsolete Functions +***************************************************/ +/* obsolete compression functions */ +int LZ4_compress_limitedOutput(const char* source, char* dest, int inputSize, int maxOutputSize) { return LZ4_compress_default(source, dest, inputSize, maxOutputSize); } +int LZ4_compress(const char* source, char* dest, int inputSize) { return LZ4_compress_default(source, dest, inputSize, LZ4_compressBound(inputSize)); } +int LZ4_compress_limitedOutput_withState (void* state, const char* src, char* dst, int srcSize, int dstSize) { return LZ4_compress_fast_extState(state, src, dst, srcSize, dstSize, 1); } +int LZ4_compress_withState (void* state, const char* src, char* dst, int srcSize) { return LZ4_compress_fast_extState(state, src, dst, srcSize, LZ4_compressBound(srcSize), 1); } +int LZ4_compress_limitedOutput_continue (LZ4_stream_t* LZ4_stream, const char* src, char* dst, int srcSize, int maxDstSize) { return LZ4_compress_fast_continue(LZ4_stream, src, dst, srcSize, maxDstSize, 1); } +int LZ4_compress_continue (LZ4_stream_t* LZ4_stream, const char* source, char* dest, int inputSize) { return LZ4_compress_fast_continue(LZ4_stream, source, dest, inputSize, LZ4_compressBound(inputSize), 1); } + +/* +These function names are deprecated and should no longer be used. +They are only provided here for compatibility with older user programs. 
+- LZ4_uncompress is totally equivalent to LZ4_decompress_fast +- LZ4_uncompress_unknownOutputSize is totally equivalent to LZ4_decompress_safe +*/ +int LZ4_uncompress (const char* source, char* dest, int outputSize) { return LZ4_decompress_fast(source, dest, outputSize); } +int LZ4_uncompress_unknownOutputSize (const char* source, char* dest, int isize, int maxOutputSize) { return LZ4_decompress_safe(source, dest, isize, maxOutputSize); } + + +/* Obsolete Streaming functions */ + +int LZ4_sizeofStreamState() { return LZ4_STREAMSIZE; } + +static void LZ4_init(LZ4_stream_t* lz4ds, BYTE* base) +{ + MEM_INIT(lz4ds, 0, sizeof(LZ4_stream_t)); + lz4ds->internal_donotuse.bufferStart = base; +} + +int LZ4_resetStreamState(void* state, char* inputBuffer) +{ + if ((((uptrval)state) & 3) != 0) return 1; /* Error : pointer is not aligned on 4-bytes boundary */ + LZ4_init((LZ4_stream_t*)state, (BYTE*)inputBuffer); + return 0; +} + +void* LZ4_create (char* inputBuffer) +{ + LZ4_stream_t* lz4ds = (LZ4_stream_t*)ALLOCATOR(8, sizeof(LZ4_stream_t)); + LZ4_init (lz4ds, (BYTE*)inputBuffer); + return lz4ds; +} + +char* LZ4_slideInputBuffer (void* LZ4_Data) +{ + LZ4_stream_t_internal* ctx = &((LZ4_stream_t*)LZ4_Data)->internal_donotuse; + int dictSize = LZ4_saveDict((LZ4_stream_t*)LZ4_Data, (char*)ctx->bufferStart, 64 KB); + return (char*)(ctx->bufferStart + dictSize); +} + +/* Obsolete streaming decompression functions */ + +int LZ4_decompress_safe_withPrefix64k(const char* source, char* dest, int compressedSize, int maxOutputSize) +{ + return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, endOnInputSize, full, 0, withPrefix64k, (BYTE*)dest - 64 KB, NULL, 64 KB); +} + +int LZ4_decompress_fast_withPrefix64k(const char* source, char* dest, int originalSize) +{ + return LZ4_decompress_generic(source, dest, 0, originalSize, endOnOutputSize, full, 0, withPrefix64k, (BYTE*)dest - 64 KB, NULL, 64 KB); +} + +#endif /* LZ4_COMMONDEFS_ONLY */ diff --git 
a/samtools/lz4/lz4.c.pysam.c b/samtools/lz4/lz4.c.pysam.c new file mode 100644 index 0000000..a4a8ab4 --- /dev/null +++ b/samtools/lz4/lz4.c.pysam.c @@ -0,0 +1,1480 @@ +#include "samtools.pysam.h" + +/* + LZ4 - Fast LZ compression algorithm + Copyright (C) 2011-2017, Yann Collet. + + BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following disclaimer + in the documentation and/or other materials provided with the + distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ + You can contact the author at : + - LZ4 homepage : http://www.lz4.org + - LZ4 source repository : https://github.com/lz4/lz4 +*/ + + +/*-************************************ +* Tuning parameters +**************************************/ +/* + * LZ4_HEAPMODE : + * Select how default compression functions will allocate memory for their hash table, + * in memory stack (0:default, fastest), or in memory heap (1:requires malloc()). + */ +#ifndef LZ4_HEAPMODE +# define LZ4_HEAPMODE 0 +#endif + +/* + * ACCELERATION_DEFAULT : + * Select "acceleration" for LZ4_compress_fast() when parameter value <= 0 + */ +#define ACCELERATION_DEFAULT 1 + + +/*-************************************ +* CPU Feature Detection +**************************************/ +/* LZ4_FORCE_MEMORY_ACCESS + * By default, access to unaligned memory is controlled by `memcpy()`, which is safe and portable. + * Unfortunately, on some target/compiler combinations, the generated assembly is sub-optimal. + * The below switch allow to select different access method for improved performance. + * Method 0 (default) : use `memcpy()`. Safe and portable. + * Method 1 : `__packed` statement. It depends on compiler extension (ie, not portable). + * This method is safe if your compiler supports it, and *generally* as fast or faster than `memcpy`. + * Method 2 : direct access. This method is portable but violate C standard. + * It can generate buggy code on targets which assembly generation depends on alignment. + * But in some circumstances, it's the only known way to get the most performance (ie GCC + ARMv6) + * See https://fastcompression.blogspot.fr/2015/08/accessing-unaligned-memory.html for details. 
+ * Prefer these methods in priority order (0 > 1 > 2) + */ +#ifndef LZ4_FORCE_MEMORY_ACCESS /* can be defined externally */ +# if defined(__GNUC__) && ( defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__) ) +# define LZ4_FORCE_MEMORY_ACCESS 2 +# elif defined(__INTEL_COMPILER) || defined(__GNUC__) +# define LZ4_FORCE_MEMORY_ACCESS 1 +# endif +#endif + +/* + * LZ4_FORCE_SW_BITCOUNT + * Define this parameter if your target system or compiler does not support hardware bit count + */ +#if defined(_MSC_VER) && defined(_WIN32_WCE) /* Visual Studio for Windows CE does not support Hardware bit count */ +# define LZ4_FORCE_SW_BITCOUNT +#endif + + +/*-************************************ +* Dependency +**************************************/ +#include "lz4.h" +/* see also "memory routines" below */ + + +/*-************************************ +* Compiler Options +**************************************/ +#ifdef _MSC_VER /* Visual Studio */ +# include +# pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */ +# pragma warning(disable : 4293) /* disable: C4293: too large shift (32-bits) */ +#endif /* _MSC_VER */ + +#ifndef FORCE_INLINE +# ifdef _MSC_VER /* Visual Studio */ +# define FORCE_INLINE static __forceinline +# else +# if defined (__cplusplus) || defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* C99 */ +# ifdef __GNUC__ +# define FORCE_INLINE static inline __attribute__((always_inline)) +# else +# define FORCE_INLINE static inline +# endif +# else +# define FORCE_INLINE static +# endif /* __STDC_VERSION__ */ +# endif /* _MSC_VER */ +#endif /* FORCE_INLINE */ + +#if (defined(__GNUC__) && (__GNUC__ >= 3)) || (defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 800)) || defined(__clang__) +# define expect(expr,value) (__builtin_expect ((expr),(value)) ) +#else +# define expect(expr,value) (expr) +#endif + +#define 
likely(expr) expect((expr) != 0, 1) +#define unlikely(expr) expect((expr) != 0, 0) + + +/*-************************************ +* Memory routines +**************************************/ +#include /* malloc, calloc, free */ +#define ALLOCATOR(n,s) calloc(n,s) +#define FREEMEM free +#include /* memset, memcpy */ +#define MEM_INIT memset + + +/*-************************************ +* Basic Types +**************************************/ +#if defined(__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) +# include + typedef uint8_t BYTE; + typedef uint16_t U16; + typedef uint32_t U32; + typedef int32_t S32; + typedef uint64_t U64; + typedef uintptr_t uptrval; +#else + typedef unsigned char BYTE; + typedef unsigned short U16; + typedef unsigned int U32; + typedef signed int S32; + typedef unsigned long long U64; + typedef size_t uptrval; /* generally true, except OpenVMS-64 */ +#endif + +#if defined(__x86_64__) + typedef U64 reg_t; /* 64-bits in x32 mode */ +#else + typedef size_t reg_t; /* 32-bits in x32 mode */ +#endif + +/*-************************************ +* Reading and writing into memory +**************************************/ +static unsigned LZ4_isLittleEndian(void) +{ + const union { U32 u; BYTE c[4]; } one = { 1 }; /* don't use static : performance detrimental */ + return one.c[0]; +} + + +#if defined(LZ4_FORCE_MEMORY_ACCESS) && (LZ4_FORCE_MEMORY_ACCESS==2) +/* lie to the compiler about data alignment; use with caution */ + +static U16 LZ4_read16(const void* memPtr) { return *(const U16*) memPtr; } +static U32 LZ4_read32(const void* memPtr) { return *(const U32*) memPtr; } +static reg_t LZ4_read_ARCH(const void* memPtr) { return *(const reg_t*) memPtr; } + +static void LZ4_write16(void* memPtr, U16 value) { *(U16*)memPtr = value; } +static void LZ4_write32(void* memPtr, U32 value) { *(U32*)memPtr = value; } + +#elif defined(LZ4_FORCE_MEMORY_ACCESS) && (LZ4_FORCE_MEMORY_ACCESS==1) + +/* __pack instructions are safer, but 
compiler specific, hence potentially problematic for some compilers */ +/* currently only defined for gcc and icc */ +typedef union { U16 u16; U32 u32; reg_t uArch; } __attribute__((packed)) unalign; + +static U16 LZ4_read16(const void* ptr) { return ((const unalign*)ptr)->u16; } +static U32 LZ4_read32(const void* ptr) { return ((const unalign*)ptr)->u32; } +static reg_t LZ4_read_ARCH(const void* ptr) { return ((const unalign*)ptr)->uArch; } + +static void LZ4_write16(void* memPtr, U16 value) { ((unalign*)memPtr)->u16 = value; } +static void LZ4_write32(void* memPtr, U32 value) { ((unalign*)memPtr)->u32 = value; } + +#else /* safe and portable access through memcpy() */ + +static U16 LZ4_read16(const void* memPtr) +{ + U16 val; memcpy(&val, memPtr, sizeof(val)); return val; +} + +static U32 LZ4_read32(const void* memPtr) +{ + U32 val; memcpy(&val, memPtr, sizeof(val)); return val; +} + +static reg_t LZ4_read_ARCH(const void* memPtr) +{ + reg_t val; memcpy(&val, memPtr, sizeof(val)); return val; +} + +static void LZ4_write16(void* memPtr, U16 value) +{ + memcpy(memPtr, &value, sizeof(value)); +} + +static void LZ4_write32(void* memPtr, U32 value) +{ + memcpy(memPtr, &value, sizeof(value)); +} + +#endif /* LZ4_FORCE_MEMORY_ACCESS */ + + +static U16 LZ4_readLE16(const void* memPtr) +{ + if (LZ4_isLittleEndian()) { + return LZ4_read16(memPtr); + } else { + const BYTE* p = (const BYTE*)memPtr; + return (U16)((U16)p[0] + (p[1]<<8)); + } +} + +static void LZ4_writeLE16(void* memPtr, U16 value) +{ + if (LZ4_isLittleEndian()) { + LZ4_write16(memPtr, value); + } else { + BYTE* p = (BYTE*)memPtr; + p[0] = (BYTE) value; + p[1] = (BYTE)(value>>8); + } +} + +static void LZ4_copy8(void* dst, const void* src) +{ + memcpy(dst,src,8); +} + +/* customized variant of memcpy, which can overwrite up to 8 bytes beyond dstEnd */ +static void LZ4_wildCopy(void* dstPtr, const void* srcPtr, void* dstEnd) +{ + BYTE* d = (BYTE*)dstPtr; + const BYTE* s = (const BYTE*)srcPtr; + BYTE* const e = 
(BYTE*)dstEnd; + + do { LZ4_copy8(d,s); d+=8; s+=8; } while (d=2) +# include +# define DEBUGLOG(l, ...) { \ + if (l<=LZ4_DEBUG) { \ + fprintf(samtools_stderr, __FILE__ ": "); \ + fprintf(samtools_stderr, __VA_ARGS__); \ + fprintf(samtools_stderr, " \n"); \ + } } +#else +# define DEBUGLOG(l, ...) {} /* disabled */ +#endif + + +/*-************************************ +* Common functions +**************************************/ +static unsigned LZ4_NbCommonBytes (register reg_t val) +{ + if (LZ4_isLittleEndian()) { + if (sizeof(val)==8) { +# if defined(_MSC_VER) && defined(_WIN64) && !defined(LZ4_FORCE_SW_BITCOUNT) + unsigned long r = 0; + _BitScanForward64( &r, (U64)val ); + return (int)(r>>3); +# elif (defined(__clang__) || (defined(__GNUC__) && (__GNUC__>=3))) && !defined(LZ4_FORCE_SW_BITCOUNT) + return (__builtin_ctzll((U64)val) >> 3); +# else + static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2, 0, 3, 1, 3, 1, 4, 2, 7, 0, 2, 3, 6, 1, 5, 3, 5, 1, 3, 4, 4, 2, 5, 6, 7, 7, 0, 1, 2, 3, 3, 4, 6, 2, 6, 5, 5, 3, 4, 5, 6, 7, 1, 2, 4, 6, 4, 4, 5, 7, 2, 6, 5, 7, 6, 7, 7 }; + return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58]; +# endif + } else /* 32 bits */ { +# if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT) + unsigned long r; + _BitScanForward( &r, (U32)val ); + return (int)(r>>3); +# elif (defined(__clang__) || (defined(__GNUC__) && (__GNUC__>=3))) && !defined(LZ4_FORCE_SW_BITCOUNT) + return (__builtin_ctz((U32)val) >> 3); +# else + static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0, 3, 2, 2, 1, 3, 2, 0, 1, 3, 3, 1, 2, 2, 2, 2, 0, 3, 1, 2, 0, 1, 0, 1, 1 }; + return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27]; +# endif + } + } else /* Big Endian CPU */ { + if (sizeof(val)==8) { +# if defined(_MSC_VER) && defined(_WIN64) && !defined(LZ4_FORCE_SW_BITCOUNT) + unsigned long r = 0; + _BitScanReverse64( &r, val ); + return (unsigned)(r>>3); +# elif (defined(__clang__) || (defined(__GNUC__) 
&& (__GNUC__>=3))) && !defined(LZ4_FORCE_SW_BITCOUNT) + return (__builtin_clzll((U64)val) >> 3); +# else + unsigned r; + if (!(val>>32)) { r=4; } else { r=0; val>>=32; } + if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; } + r += (!val); + return r; +# endif + } else /* 32 bits */ { +# if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT) + unsigned long r = 0; + _BitScanReverse( &r, (unsigned long)val ); + return (unsigned)(r>>3); +# elif (defined(__clang__) || (defined(__GNUC__) && (__GNUC__>=3))) && !defined(LZ4_FORCE_SW_BITCOUNT) + return (__builtin_clz((U32)val) >> 3); +# else + unsigned r; + if (!(val>>16)) { r=2; val>>=8; } else { r=0; val>>=24; } + r += (!val); + return r; +# endif + } + } +} + +#define STEPSIZE sizeof(reg_t) +static unsigned LZ4_count(const BYTE* pIn, const BYTE* pMatch, const BYTE* pInLimit) +{ + const BYTE* const pStart = pIn; + + while (likely(pIn compression run slower on incompressible data */ + + +/*-************************************ +* Local Structures and types +**************************************/ +typedef enum { notLimited = 0, limitedOutput = 1 } limitedOutput_directive; +typedef enum { byPtr, byU32, byU16 } tableType_t; + +typedef enum { noDict = 0, withPrefix64k, usingExtDict } dict_directive; +typedef enum { noDictIssue = 0, dictSmall } dictIssue_directive; + +typedef enum { endOnOutputSize = 0, endOnInputSize = 1 } endCondition_directive; +typedef enum { full = 0, partial = 1 } earlyEnd_directive; + + +/*-************************************ +* Local Utils +**************************************/ +int LZ4_versionNumber (void) { return LZ4_VERSION_NUMBER; } +const char* LZ4_versionString(void) { return LZ4_VERSION_STRING; } +int LZ4_compressBound(int isize) { return LZ4_COMPRESSBOUND(isize); } +int LZ4_sizeofState() { return LZ4_STREAMSIZE; } + + +/*-****************************** +* Compression functions +********************************/ +static U32 LZ4_hash4(U32 sequence, tableType_t const tableType) +{ + if 
(tableType == byU16) + return ((sequence * 2654435761U) >> ((MINMATCH*8)-(LZ4_HASHLOG+1))); + else + return ((sequence * 2654435761U) >> ((MINMATCH*8)-LZ4_HASHLOG)); +} + +static U32 LZ4_hash5(U64 sequence, tableType_t const tableType) +{ + static const U64 prime5bytes = 889523592379ULL; + static const U64 prime8bytes = 11400714785074694791ULL; + const U32 hashLog = (tableType == byU16) ? LZ4_HASHLOG+1 : LZ4_HASHLOG; + if (LZ4_isLittleEndian()) + return (U32)(((sequence << 24) * prime5bytes) >> (64 - hashLog)); + else + return (U32)(((sequence >> 24) * prime8bytes) >> (64 - hashLog)); +} + +FORCE_INLINE U32 LZ4_hashPosition(const void* const p, tableType_t const tableType) +{ + if ((sizeof(reg_t)==8) && (tableType != byU16)) return LZ4_hash5(LZ4_read_ARCH(p), tableType); + return LZ4_hash4(LZ4_read32(p), tableType); +} + +static void LZ4_putPositionOnHash(const BYTE* p, U32 h, void* tableBase, tableType_t const tableType, const BYTE* srcBase) +{ + switch (tableType) + { + case byPtr: { const BYTE** hashTable = (const BYTE**)tableBase; hashTable[h] = p; return; } + case byU32: { U32* hashTable = (U32*) tableBase; hashTable[h] = (U32)(p-srcBase); return; } + case byU16: { U16* hashTable = (U16*) tableBase; hashTable[h] = (U16)(p-srcBase); return; } + } +} + +FORCE_INLINE void LZ4_putPosition(const BYTE* p, void* tableBase, tableType_t tableType, const BYTE* srcBase) +{ + U32 const h = LZ4_hashPosition(p, tableType); + LZ4_putPositionOnHash(p, h, tableBase, tableType, srcBase); +} + +static const BYTE* LZ4_getPositionOnHash(U32 h, void* tableBase, tableType_t tableType, const BYTE* srcBase) +{ + if (tableType == byPtr) { const BYTE** hashTable = (const BYTE**) tableBase; return hashTable[h]; } + if (tableType == byU32) { const U32* const hashTable = (U32*) tableBase; return hashTable[h] + srcBase; } + { const U16* const hashTable = (U16*) tableBase; return hashTable[h] + srcBase; } /* default, to ensure a return */ +} + +FORCE_INLINE const BYTE* LZ4_getPosition(const 
BYTE* p, void* tableBase, tableType_t tableType, const BYTE* srcBase) +{ + U32 const h = LZ4_hashPosition(p, tableType); + return LZ4_getPositionOnHash(h, tableBase, tableType, srcBase); +} + + +/** LZ4_compress_generic() : + inlined, to ensure branches are decided at compilation time */ +FORCE_INLINE int LZ4_compress_generic( + LZ4_stream_t_internal* const cctx, + const char* const source, + char* const dest, + const int inputSize, + const int maxOutputSize, + const limitedOutput_directive outputLimited, + const tableType_t tableType, + const dict_directive dict, + const dictIssue_directive dictIssue, + const U32 acceleration) +{ + const BYTE* ip = (const BYTE*) source; + const BYTE* base; + const BYTE* lowLimit; + const BYTE* const lowRefLimit = ip - cctx->dictSize; + const BYTE* const dictionary = cctx->dictionary; + const BYTE* const dictEnd = dictionary + cctx->dictSize; + const ptrdiff_t dictDelta = dictEnd - (const BYTE*)source; + const BYTE* anchor = (const BYTE*) source; + const BYTE* const iend = ip + inputSize; + const BYTE* const mflimit = iend - MFLIMIT; + const BYTE* const matchlimit = iend - LASTLITERALS; + + BYTE* op = (BYTE*) dest; + BYTE* const olimit = op + maxOutputSize; + + U32 forwardH; + + /* Init conditions */ + if ((U32)inputSize > (U32)LZ4_MAX_INPUT_SIZE) return 0; /* Unsupported inputSize, too large (or negative) */ + switch(dict) + { + case noDict: + default: + base = (const BYTE*)source; + lowLimit = (const BYTE*)source; + break; + case withPrefix64k: + base = (const BYTE*)source - cctx->currentOffset; + lowLimit = (const BYTE*)source - cctx->dictSize; + break; + case usingExtDict: + base = (const BYTE*)source - cctx->currentOffset; + lowLimit = (const BYTE*)source; + break; + } + if ((tableType == byU16) && (inputSize>=LZ4_64Klimit)) return 0; /* Size too large (not within 64K limit) */ + if (inputSizehashTable, tableType, base); + ip++; forwardH = LZ4_hashPosition(ip, tableType); + + /* Main Loop */ + for ( ; ; ) { + ptrdiff_t 
refDelta = 0; + const BYTE* match; + BYTE* token; + + /* Find a match */ + { const BYTE* forwardIp = ip; + unsigned step = 1; + unsigned searchMatchNb = acceleration << LZ4_skipTrigger; + do { + U32 const h = forwardH; + ip = forwardIp; + forwardIp += step; + step = (searchMatchNb++ >> LZ4_skipTrigger); + + if (unlikely(forwardIp > mflimit)) goto _last_literals; + + match = LZ4_getPositionOnHash(h, cctx->hashTable, tableType, base); + if (dict==usingExtDict) { + if (match < (const BYTE*)source) { + refDelta = dictDelta; + lowLimit = dictionary; + } else { + refDelta = 0; + lowLimit = (const BYTE*)source; + } } + forwardH = LZ4_hashPosition(forwardIp, tableType); + LZ4_putPositionOnHash(ip, h, cctx->hashTable, tableType, base); + + } while ( ((dictIssue==dictSmall) ? (match < lowRefLimit) : 0) + || ((tableType==byU16) ? 0 : (match + MAX_DISTANCE < ip)) + || (LZ4_read32(match+refDelta) != LZ4_read32(ip)) ); + } + + /* Catch up */ + while (((ip>anchor) & (match+refDelta > lowLimit)) && (unlikely(ip[-1]==match[refDelta-1]))) { ip--; match--; } + + /* Encode Literals */ + { unsigned const litLength = (unsigned)(ip - anchor); + token = op++; + if ((outputLimited) && /* Check output buffer overflow */ + (unlikely(op + litLength + (2 + 1 + LASTLITERALS) + (litLength/255) > olimit))) + return 0; + if (litLength >= RUN_MASK) { + int len = (int)litLength-RUN_MASK; + *token = (RUN_MASK<= 255 ; len-=255) *op++ = 255; + *op++ = (BYTE)len; + } + else *token = (BYTE)(litLength< matchlimit) limit = matchlimit; + matchCode = LZ4_count(ip+MINMATCH, match+MINMATCH, limit); + ip += MINMATCH + matchCode; + if (ip==limit) { + unsigned const more = LZ4_count(ip, (const BYTE*)source, matchlimit); + matchCode += more; + ip += more; + } + } else { + matchCode = LZ4_count(ip+MINMATCH, match+MINMATCH, matchlimit); + ip += MINMATCH + matchCode; + } + + if ( outputLimited && /* Check output buffer overflow */ + (unlikely(op + (1 + LASTLITERALS) + (matchCode>>8) > olimit)) ) + return 0; + if 
(matchCode >= ML_MASK) { + *token += ML_MASK; + matchCode -= ML_MASK; + LZ4_write32(op, 0xFFFFFFFF); + while (matchCode >= 4*255) op+=4, LZ4_write32(op, 0xFFFFFFFF), matchCode -= 4*255; + op += matchCode / 255; + *op++ = (BYTE)(matchCode % 255); + } else + *token += (BYTE)(matchCode); + } + + anchor = ip; + + /* Test end of chunk */ + if (ip > mflimit) break; + + /* Fill table */ + LZ4_putPosition(ip-2, cctx->hashTable, tableType, base); + + /* Test next position */ + match = LZ4_getPosition(ip, cctx->hashTable, tableType, base); + if (dict==usingExtDict) { + if (match < (const BYTE*)source) { + refDelta = dictDelta; + lowLimit = dictionary; + } else { + refDelta = 0; + lowLimit = (const BYTE*)source; + } } + LZ4_putPosition(ip, cctx->hashTable, tableType, base); + if ( ((dictIssue==dictSmall) ? (match>=lowRefLimit) : 1) + && (match+MAX_DISTANCE>=ip) + && (LZ4_read32(match+refDelta)==LZ4_read32(ip)) ) + { token=op++; *token=0; goto _next_match; } + + /* Prepare next loop */ + forwardH = LZ4_hashPosition(++ip, tableType); + } + +_last_literals: + /* Encode Last Literals */ + { size_t const lastRun = (size_t)(iend - anchor); + if ( (outputLimited) && /* Check output buffer overflow */ + ((op - (BYTE*)dest) + lastRun + 1 + ((lastRun+255-RUN_MASK)/255) > (U32)maxOutputSize) ) + return 0; + if (lastRun >= RUN_MASK) { + size_t accumulator = lastRun - RUN_MASK; + *op++ = RUN_MASK << ML_BITS; + for(; accumulator >= 255 ; accumulator-=255) *op++ = 255; + *op++ = (BYTE) accumulator; + } else { + *op++ = (BYTE)(lastRun<internal_donotuse; + LZ4_resetStream((LZ4_stream_t*)state); + if (acceleration < 1) acceleration = ACCELERATION_DEFAULT; + + if (maxOutputSize >= LZ4_compressBound(inputSize)) { + if (inputSize < LZ4_64Klimit) + return LZ4_compress_generic(ctx, source, dest, inputSize, 0, notLimited, byU16, noDict, noDictIssue, acceleration); + else + return LZ4_compress_generic(ctx, source, dest, inputSize, 0, notLimited, (sizeof(void*)==8) ? 
byU32 : byPtr, noDict, noDictIssue, acceleration); + } else { + if (inputSize < LZ4_64Klimit) + return LZ4_compress_generic(ctx, source, dest, inputSize, maxOutputSize, limitedOutput, byU16, noDict, noDictIssue, acceleration); + else + return LZ4_compress_generic(ctx, source, dest, inputSize, maxOutputSize, limitedOutput, (sizeof(void*)==8) ? byU32 : byPtr, noDict, noDictIssue, acceleration); + } +} + + +int LZ4_compress_fast(const char* source, char* dest, int inputSize, int maxOutputSize, int acceleration) +{ +#if (LZ4_HEAPMODE) + void* ctxPtr = ALLOCATOR(1, sizeof(LZ4_stream_t)); /* malloc-calloc always properly aligned */ +#else + LZ4_stream_t ctx; + void* const ctxPtr = &ctx; +#endif + + int const result = LZ4_compress_fast_extState(ctxPtr, source, dest, inputSize, maxOutputSize, acceleration); + +#if (LZ4_HEAPMODE) + FREEMEM(ctxPtr); +#endif + return result; +} + + +int LZ4_compress_default(const char* source, char* dest, int inputSize, int maxOutputSize) +{ + return LZ4_compress_fast(source, dest, inputSize, maxOutputSize, 1); +} + + +/* hidden debug function */ +/* strangely enough, gcc generates faster code when this function is uncommented, even if unused */ +int LZ4_compress_fast_force(const char* source, char* dest, int inputSize, int maxOutputSize, int acceleration) +{ + LZ4_stream_t ctx; + LZ4_resetStream(&ctx); + + if (inputSize < LZ4_64Klimit) + return LZ4_compress_generic(&ctx.internal_donotuse, source, dest, inputSize, maxOutputSize, limitedOutput, byU16, noDict, noDictIssue, acceleration); + else + return LZ4_compress_generic(&ctx.internal_donotuse, source, dest, inputSize, maxOutputSize, limitedOutput, sizeof(void*)==8 ? 
byU32 : byPtr, noDict, noDictIssue, acceleration); +} + + +/*-****************************** +* *_destSize() variant +********************************/ + +static int LZ4_compress_destSize_generic( + LZ4_stream_t_internal* const ctx, + const char* const src, + char* const dst, + int* const srcSizePtr, + const int targetDstSize, + const tableType_t tableType) +{ + const BYTE* ip = (const BYTE*) src; + const BYTE* base = (const BYTE*) src; + const BYTE* lowLimit = (const BYTE*) src; + const BYTE* anchor = ip; + const BYTE* const iend = ip + *srcSizePtr; + const BYTE* const mflimit = iend - MFLIMIT; + const BYTE* const matchlimit = iend - LASTLITERALS; + + BYTE* op = (BYTE*) dst; + BYTE* const oend = op + targetDstSize; + BYTE* const oMaxLit = op + targetDstSize - 2 /* offset */ - 8 /* because 8+MINMATCH==MFLIMIT */ - 1 /* token */; + BYTE* const oMaxMatch = op + targetDstSize - (LASTLITERALS + 1 /* token */); + BYTE* const oMaxSeq = oMaxLit - 1 /* token */; + + U32 forwardH; + + + /* Init conditions */ + if (targetDstSize < 1) return 0; /* Impossible to store anything */ + if ((U32)*srcSizePtr > (U32)LZ4_MAX_INPUT_SIZE) return 0; /* Unsupported input size, too large (or negative) */ + if ((tableType == byU16) && (*srcSizePtr>=LZ4_64Klimit)) return 0; /* Size too large (not within 64K limit) */ + if (*srcSizePtrhashTable, tableType, base); + ip++; forwardH = LZ4_hashPosition(ip, tableType); + + /* Main Loop */ + for ( ; ; ) { + const BYTE* match; + BYTE* token; + + /* Find a match */ + { const BYTE* forwardIp = ip; + unsigned step = 1; + unsigned searchMatchNb = 1 << LZ4_skipTrigger; + + do { + U32 h = forwardH; + ip = forwardIp; + forwardIp += step; + step = (searchMatchNb++ >> LZ4_skipTrigger); + + if (unlikely(forwardIp > mflimit)) goto _last_literals; + + match = LZ4_getPositionOnHash(h, ctx->hashTable, tableType, base); + forwardH = LZ4_hashPosition(forwardIp, tableType); + LZ4_putPositionOnHash(ip, h, ctx->hashTable, tableType, base); + + } while ( 
((tableType==byU16) ? 0 : (match + MAX_DISTANCE < ip)) + || (LZ4_read32(match) != LZ4_read32(ip)) ); + } + + /* Catch up */ + while ((ip>anchor) && (match > lowLimit) && (unlikely(ip[-1]==match[-1]))) { ip--; match--; } + + /* Encode Literal length */ + { unsigned litLength = (unsigned)(ip - anchor); + token = op++; + if (op + ((litLength+240)/255) + litLength > oMaxLit) { + /* Not enough space for a last match */ + op--; + goto _last_literals; + } + if (litLength>=RUN_MASK) { + unsigned len = litLength - RUN_MASK; + *token=(RUN_MASK<= 255 ; len-=255) *op++ = 255; + *op++ = (BYTE)len; + } + else *token = (BYTE)(litLength< oMaxMatch) { + /* Match description too long : reduce it */ + matchLength = (15-1) + (oMaxMatch-op) * 255; + } + ip += MINMATCH + matchLength; + + if (matchLength>=ML_MASK) { + *token += ML_MASK; + matchLength -= ML_MASK; + while (matchLength >= 255) { matchLength-=255; *op++ = 255; } + *op++ = (BYTE)matchLength; + } + else *token += (BYTE)(matchLength); + } + + anchor = ip; + + /* Test end of block */ + if (ip > mflimit) break; + if (op > oMaxSeq) break; + + /* Fill table */ + LZ4_putPosition(ip-2, ctx->hashTable, tableType, base); + + /* Test next position */ + match = LZ4_getPosition(ip, ctx->hashTable, tableType, base); + LZ4_putPosition(ip, ctx->hashTable, tableType, base); + if ( (match+MAX_DISTANCE>=ip) + && (LZ4_read32(match)==LZ4_read32(ip)) ) + { token=op++; *token=0; goto _next_match; } + + /* Prepare next loop */ + forwardH = LZ4_hashPosition(++ip, tableType); + } + +_last_literals: + /* Encode Last Literals */ + { size_t lastRunSize = (size_t)(iend - anchor); + if (op + 1 /* token */ + ((lastRunSize+240)/255) /* litLength */ + lastRunSize /* literals */ > oend) { + /* adapt lastRunSize to fill 'dst' */ + lastRunSize = (oend-op) - 1; + lastRunSize -= (lastRunSize+240)/255; + } + ip = anchor + lastRunSize; + + if (lastRunSize >= RUN_MASK) { + size_t accumulator = lastRunSize - RUN_MASK; + *op++ = RUN_MASK << ML_BITS; + for(; accumulator 
>= 255 ; accumulator-=255) *op++ = 255; + *op++ = (BYTE) accumulator; + } else { + *op++ = (BYTE)(lastRunSize<= LZ4_compressBound(*srcSizePtr)) { /* compression success is guaranteed */ + return LZ4_compress_fast_extState(state, src, dst, *srcSizePtr, targetDstSize, 1); + } else { + if (*srcSizePtr < LZ4_64Klimit) + return LZ4_compress_destSize_generic(&state->internal_donotuse, src, dst, srcSizePtr, targetDstSize, byU16); + else + return LZ4_compress_destSize_generic(&state->internal_donotuse, src, dst, srcSizePtr, targetDstSize, sizeof(void*)==8 ? byU32 : byPtr); + } +} + + +int LZ4_compress_destSize(const char* src, char* dst, int* srcSizePtr, int targetDstSize) +{ +#if (LZ4_HEAPMODE) + LZ4_stream_t* ctx = (LZ4_stream_t*)ALLOCATOR(1, sizeof(LZ4_stream_t)); /* malloc-calloc always properly aligned */ +#else + LZ4_stream_t ctxBody; + LZ4_stream_t* ctx = &ctxBody; +#endif + + int result = LZ4_compress_destSize_extState(ctx, src, dst, srcSizePtr, targetDstSize); + +#if (LZ4_HEAPMODE) + FREEMEM(ctx); +#endif + return result; +} + + + +/*-****************************** +* Streaming functions +********************************/ + +LZ4_stream_t* LZ4_createStream(void) +{ + LZ4_stream_t* lz4s = (LZ4_stream_t*)ALLOCATOR(8, LZ4_STREAMSIZE_U64); + LZ4_STATIC_ASSERT(LZ4_STREAMSIZE >= sizeof(LZ4_stream_t_internal)); /* A compilation error here means LZ4_STREAMSIZE is not large enough */ + LZ4_resetStream(lz4s); + return lz4s; +} + +void LZ4_resetStream (LZ4_stream_t* LZ4_stream) +{ + MEM_INIT(LZ4_stream, 0, sizeof(LZ4_stream_t)); +} + +int LZ4_freeStream (LZ4_stream_t* LZ4_stream) +{ + if (!LZ4_stream) return 0; /* support free on NULL */ + FREEMEM(LZ4_stream); + return (0); +} + + +#define HASH_UNIT sizeof(reg_t) +int LZ4_loadDict (LZ4_stream_t* LZ4_dict, const char* dictionary, int dictSize) +{ + LZ4_stream_t_internal* dict = &LZ4_dict->internal_donotuse; + const BYTE* p = (const BYTE*)dictionary; + const BYTE* const dictEnd = p + dictSize; + const BYTE* base; + + if 
((dict->initCheck) || (dict->currentOffset > 1 GB)) /* Uninitialized structure, or reuse overflow */ + LZ4_resetStream(LZ4_dict); + + if (dictSize < (int)HASH_UNIT) { + dict->dictionary = NULL; + dict->dictSize = 0; + return 0; + } + + if ((dictEnd - p) > 64 KB) p = dictEnd - 64 KB; + dict->currentOffset += 64 KB; + base = p - dict->currentOffset; + dict->dictionary = p; + dict->dictSize = (U32)(dictEnd - p); + dict->currentOffset += dict->dictSize; + + while (p <= dictEnd-HASH_UNIT) { + LZ4_putPosition(p, dict->hashTable, byU32, base); + p+=3; + } + + return dict->dictSize; +} + + +static void LZ4_renormDictT(LZ4_stream_t_internal* LZ4_dict, const BYTE* src) +{ + if ((LZ4_dict->currentOffset > 0x80000000) || + ((uptrval)LZ4_dict->currentOffset > (uptrval)src)) { /* address space overflow */ + /* rescale hash table */ + U32 const delta = LZ4_dict->currentOffset - 64 KB; + const BYTE* dictEnd = LZ4_dict->dictionary + LZ4_dict->dictSize; + int i; + for (i=0; ihashTable[i] < delta) LZ4_dict->hashTable[i]=0; + else LZ4_dict->hashTable[i] -= delta; + } + LZ4_dict->currentOffset = 64 KB; + if (LZ4_dict->dictSize > 64 KB) LZ4_dict->dictSize = 64 KB; + LZ4_dict->dictionary = dictEnd - LZ4_dict->dictSize; + } +} + + +int LZ4_compress_fast_continue (LZ4_stream_t* LZ4_stream, const char* source, char* dest, int inputSize, int maxOutputSize, int acceleration) +{ + LZ4_stream_t_internal* streamPtr = &LZ4_stream->internal_donotuse; + const BYTE* const dictEnd = streamPtr->dictionary + streamPtr->dictSize; + + const BYTE* smallest = (const BYTE*) source; + if (streamPtr->initCheck) return 0; /* Uninitialized structure detected */ + if ((streamPtr->dictSize>0) && (smallest>dictEnd)) smallest = dictEnd; + LZ4_renormDictT(streamPtr, smallest); + if (acceleration < 1) acceleration = ACCELERATION_DEFAULT; + + /* Check overlapping input/dictionary space */ + { const BYTE* sourceEnd = (const BYTE*) source + inputSize; + if ((sourceEnd > streamPtr->dictionary) && (sourceEnd < dictEnd)) { 
+ streamPtr->dictSize = (U32)(dictEnd - sourceEnd); + if (streamPtr->dictSize > 64 KB) streamPtr->dictSize = 64 KB; + if (streamPtr->dictSize < 4) streamPtr->dictSize = 0; + streamPtr->dictionary = dictEnd - streamPtr->dictSize; + } + } + + /* prefix mode : source data follows dictionary */ + if (dictEnd == (const BYTE*)source) { + int result; + if ((streamPtr->dictSize < 64 KB) && (streamPtr->dictSize < streamPtr->currentOffset)) + result = LZ4_compress_generic(streamPtr, source, dest, inputSize, maxOutputSize, limitedOutput, byU32, withPrefix64k, dictSmall, acceleration); + else + result = LZ4_compress_generic(streamPtr, source, dest, inputSize, maxOutputSize, limitedOutput, byU32, withPrefix64k, noDictIssue, acceleration); + streamPtr->dictSize += (U32)inputSize; + streamPtr->currentOffset += (U32)inputSize; + return result; + } + + /* external dictionary mode */ + { int result; + if ((streamPtr->dictSize < 64 KB) && (streamPtr->dictSize < streamPtr->currentOffset)) + result = LZ4_compress_generic(streamPtr, source, dest, inputSize, maxOutputSize, limitedOutput, byU32, usingExtDict, dictSmall, acceleration); + else + result = LZ4_compress_generic(streamPtr, source, dest, inputSize, maxOutputSize, limitedOutput, byU32, usingExtDict, noDictIssue, acceleration); + streamPtr->dictionary = (const BYTE*)source; + streamPtr->dictSize = (U32)inputSize; + streamPtr->currentOffset += (U32)inputSize; + return result; + } +} + + +/* Hidden debug function, to force external dictionary mode */ +int LZ4_compress_forceExtDict (LZ4_stream_t* LZ4_dict, const char* source, char* dest, int inputSize) +{ + LZ4_stream_t_internal* streamPtr = &LZ4_dict->internal_donotuse; + int result; + const BYTE* const dictEnd = streamPtr->dictionary + streamPtr->dictSize; + + const BYTE* smallest = dictEnd; + if (smallest > (const BYTE*) source) smallest = (const BYTE*) source; + LZ4_renormDictT(streamPtr, smallest); + + result = LZ4_compress_generic(streamPtr, source, dest, inputSize, 0, 
notLimited, byU32, usingExtDict, noDictIssue, 1); + + streamPtr->dictionary = (const BYTE*)source; + streamPtr->dictSize = (U32)inputSize; + streamPtr->currentOffset += (U32)inputSize; + + return result; +} + + +/*! LZ4_saveDict() : + * If previously compressed data block is not guaranteed to remain available at its memory location, + * save it into a safer place (char* safeBuffer). + * Note : you don't need to call LZ4_loadDict() afterwards, + * dictionary is immediately usable, you can therefore call LZ4_compress_fast_continue(). + * Return : saved dictionary size in bytes (necessarily <= dictSize), or 0 if error. + */ +int LZ4_saveDict (LZ4_stream_t* LZ4_dict, char* safeBuffer, int dictSize) +{ + LZ4_stream_t_internal* const dict = &LZ4_dict->internal_donotuse; + const BYTE* const previousDictEnd = dict->dictionary + dict->dictSize; + + if ((U32)dictSize > 64 KB) dictSize = 64 KB; /* useless to define a dictionary > 64 KB */ + if ((U32)dictSize > dict->dictSize) dictSize = dict->dictSize; + + memmove(safeBuffer, previousDictEnd - dictSize, dictSize); + + dict->dictionary = (const BYTE*)safeBuffer; + dict->dictSize = (U32)dictSize; + + return dictSize; +} + + + +/*-***************************** +* Decompression functions +*******************************/ +/*! LZ4_decompress_generic() : + * This generic decompression function cover all use cases. + * It shall be instantiated several times, using different sets of directives + * Note that it is important this generic function is really inlined, + * in order to remove useless branches during compilation optimization. + */ +FORCE_INLINE int LZ4_decompress_generic( + const char* const source, + char* const dest, + int inputSize, + int outputSize, /* If endOnInput==endOnInputSize, this value is the max size of Output Buffer. 
*/ + + int endOnInput, /* endOnOutputSize, endOnInputSize */ + int partialDecoding, /* full, partial */ + int targetOutputSize, /* only used if partialDecoding==partial */ + int dict, /* noDict, withPrefix64k, usingExtDict */ + const BYTE* const lowPrefix, /* == dest when no prefix */ + const BYTE* const dictStart, /* only if dict==usingExtDict */ + const size_t dictSize /* note : = 0 if noDict */ + ) +{ + /* Local Variables */ + const BYTE* ip = (const BYTE*) source; + const BYTE* const iend = ip + inputSize; + + BYTE* op = (BYTE*) dest; + BYTE* const oend = op + outputSize; + BYTE* cpy; + BYTE* oexit = op + targetOutputSize; + const BYTE* const lowLimit = lowPrefix - dictSize; + + const BYTE* const dictEnd = (const BYTE*)dictStart + dictSize; + const unsigned dec32table[] = {0, 1, 2, 1, 4, 4, 4, 4}; + const int dec64table[] = {0, 0, 0, -1, 0, 1, 2, 3}; + + const int safeDecode = (endOnInput==endOnInputSize); + const int checkOffset = ((safeDecode) && (dictSize < (int)(64 KB))); + + + /* Special cases */ + if ((partialDecoding) && (oexit > oend-MFLIMIT)) oexit = oend-MFLIMIT; /* targetOutputSize too high => decode everything */ + if ((endOnInput) && (unlikely(outputSize==0))) return ((inputSize==1) && (*ip==0)) ? 0 : -1; /* Empty output buffer */ + if ((!endOnInput) && (unlikely(outputSize==0))) return (*ip==0?1:-1); + + /* Main Loop : decode sequences */ + while (1) { + size_t length; + const BYTE* match; + size_t offset; + + /* get literal length */ + unsigned const token = *ip++; + if ((length=(token>>ML_BITS)) == RUN_MASK) { + unsigned s; + do { + s = *ip++; + length += s; + } while ( likely(endOnInput ? 
ip(partialDecoding?oexit:oend-MFLIMIT)) || (ip+length>iend-(2+1+LASTLITERALS))) ) + || ((!endOnInput) && (cpy>oend-WILDCOPYLENGTH)) ) + { + if (partialDecoding) { + if (cpy > oend) goto _output_error; /* Error : write attempt beyond end of output buffer */ + if ((endOnInput) && (ip+length > iend)) goto _output_error; /* Error : read attempt beyond end of input buffer */ + } else { + if ((!endOnInput) && (cpy != oend)) goto _output_error; /* Error : block decoding must stop exactly there */ + if ((endOnInput) && ((ip+length != iend) || (cpy > oend))) goto _output_error; /* Error : input must be consumed */ + } + memcpy(op, ip, length); + ip += length; + op += length; + break; /* Necessarily EOF, due to parsing restrictions */ + } + LZ4_wildCopy(op, ip, cpy); + ip += length; op = cpy; + + /* get offset */ + offset = LZ4_readLE16(ip); ip+=2; + match = op - offset; + if ((checkOffset) && (unlikely(match < lowLimit))) goto _output_error; /* Error : offset outside buffers */ + LZ4_write32(op, (U32)offset); /* costs ~1%; silence an msan warning when offset==0 */ + + /* get matchlength */ + length = token & ML_MASK; + if (length == ML_MASK) { + unsigned s; + do { + s = *ip++; + if ((endOnInput) && (ip > iend-LASTLITERALS)) goto _output_error; + length += s; + } while (s==255); + if ((safeDecode) && unlikely((uptrval)(op)+length<(uptrval)op)) goto _output_error; /* overflow detection */ + } + length += MINMATCH; + + /* check external dictionary */ + if ((dict==usingExtDict) && (match < lowPrefix)) { + if (unlikely(op+length > oend-LASTLITERALS)) goto _output_error; /* doesn't respect parsing restriction */ + + if (length <= (size_t)(lowPrefix-match)) { + /* match can be copied as a single segment from external dictionary */ + memmove(op, dictEnd - (lowPrefix-match), length); + op += length; + } else { + /* match encompass external dictionary and current block */ + size_t const copySize = (size_t)(lowPrefix-match); + size_t const restSize = length - copySize; + memcpy(op, 
dictEnd - copySize, copySize); + op += copySize; + if (restSize > (size_t)(op-lowPrefix)) { /* overlap copy */ + BYTE* const endOfMatch = op + restSize; + const BYTE* copyFrom = lowPrefix; + while (op < endOfMatch) *op++ = *copyFrom++; + } else { + memcpy(op, lowPrefix, restSize); + op += restSize; + } } + continue; + } + + /* copy match within block */ + cpy = op + length; + if (unlikely(offset<8)) { + const int dec64 = dec64table[offset]; + op[0] = match[0]; + op[1] = match[1]; + op[2] = match[2]; + op[3] = match[3]; + match += dec32table[offset]; + memcpy(op+4, match, 4); + match -= dec64; + } else { LZ4_copy8(op, match); match+=8; } + op += 8; + + if (unlikely(cpy>oend-12)) { + BYTE* const oCopyLimit = oend-(WILDCOPYLENGTH-1); + if (cpy > oend-LASTLITERALS) goto _output_error; /* Error : last LASTLITERALS bytes must be literals (uncompressed) */ + if (op < oCopyLimit) { + LZ4_wildCopy(op, match, oCopyLimit); + match += oCopyLimit - op; + op = oCopyLimit; + } + while (op16) LZ4_wildCopy(op+8, match+8, cpy); + } + op=cpy; /* correction */ + } + + /* end of decoding */ + if (endOnInput) + return (int) (((char*)op)-dest); /* Nb of output bytes decoded */ + else + return (int) (((const char*)ip)-source); /* Nb of input bytes read */ + + /* Overflow error detected */ +_output_error: + return (int) (-(((const char*)ip)-source))-1; +} + + +int LZ4_decompress_safe(const char* source, char* dest, int compressedSize, int maxDecompressedSize) +{ + return LZ4_decompress_generic(source, dest, compressedSize, maxDecompressedSize, endOnInputSize, full, 0, noDict, (BYTE*)dest, NULL, 0); +} + +int LZ4_decompress_safe_partial(const char* source, char* dest, int compressedSize, int targetOutputSize, int maxDecompressedSize) +{ + return LZ4_decompress_generic(source, dest, compressedSize, maxDecompressedSize, endOnInputSize, partial, targetOutputSize, noDict, (BYTE*)dest, NULL, 0); +} + +int LZ4_decompress_fast(const char* source, char* dest, int originalSize) +{ + return 
LZ4_decompress_generic(source, dest, 0, originalSize, endOnOutputSize, full, 0, withPrefix64k, (BYTE*)(dest - 64 KB), NULL, 64 KB); +} + + +/*===== streaming decompression functions =====*/ + +LZ4_streamDecode_t* LZ4_createStreamDecode(void) +{ + LZ4_streamDecode_t* lz4s = (LZ4_streamDecode_t*) ALLOCATOR(1, sizeof(LZ4_streamDecode_t)); + return lz4s; +} + +int LZ4_freeStreamDecode (LZ4_streamDecode_t* LZ4_stream) +{ + if (!LZ4_stream) return 0; /* support free on NULL */ + FREEMEM(LZ4_stream); + return 0; +} + +/*! + * LZ4_setStreamDecode() : + * Use this function to instruct where to find the dictionary. + * This function is not necessary if previous data is still available where it was decoded. + * Loading a size of 0 is allowed (same effect as no dictionary). + * Return : 1 if OK, 0 if error + */ +int LZ4_setStreamDecode (LZ4_streamDecode_t* LZ4_streamDecode, const char* dictionary, int dictSize) +{ + LZ4_streamDecode_t_internal* lz4sd = &LZ4_streamDecode->internal_donotuse; + lz4sd->prefixSize = (size_t) dictSize; + lz4sd->prefixEnd = (const BYTE*) dictionary + dictSize; + lz4sd->externalDict = NULL; + lz4sd->extDictSize = 0; + return 1; +} + +/* +*_continue() : + These decoding functions allow decompression of multiple blocks in "streaming" mode. + Previously decoded blocks must still be available at the memory position where they were decoded. 
+ If it's not possible, save the relevant part of decoded data into a safe buffer, + and indicate where it stands using LZ4_setStreamDecode() +*/ +int LZ4_decompress_safe_continue (LZ4_streamDecode_t* LZ4_streamDecode, const char* source, char* dest, int compressedSize, int maxOutputSize) +{ + LZ4_streamDecode_t_internal* lz4sd = &LZ4_streamDecode->internal_donotuse; + int result; + + if (lz4sd->prefixEnd == (BYTE*)dest) { + result = LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, + endOnInputSize, full, 0, + usingExtDict, lz4sd->prefixEnd - lz4sd->prefixSize, lz4sd->externalDict, lz4sd->extDictSize); + if (result <= 0) return result; + lz4sd->prefixSize += result; + lz4sd->prefixEnd += result; + } else { + lz4sd->extDictSize = lz4sd->prefixSize; + lz4sd->externalDict = lz4sd->prefixEnd - lz4sd->extDictSize; + result = LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, + endOnInputSize, full, 0, + usingExtDict, (BYTE*)dest, lz4sd->externalDict, lz4sd->extDictSize); + if (result <= 0) return result; + lz4sd->prefixSize = result; + lz4sd->prefixEnd = (BYTE*)dest + result; + } + + return result; +} + +int LZ4_decompress_fast_continue (LZ4_streamDecode_t* LZ4_streamDecode, const char* source, char* dest, int originalSize) +{ + LZ4_streamDecode_t_internal* lz4sd = &LZ4_streamDecode->internal_donotuse; + int result; + + if (lz4sd->prefixEnd == (BYTE*)dest) { + result = LZ4_decompress_generic(source, dest, 0, originalSize, + endOnOutputSize, full, 0, + usingExtDict, lz4sd->prefixEnd - lz4sd->prefixSize, lz4sd->externalDict, lz4sd->extDictSize); + if (result <= 0) return result; + lz4sd->prefixSize += originalSize; + lz4sd->prefixEnd += originalSize; + } else { + lz4sd->extDictSize = lz4sd->prefixSize; + lz4sd->externalDict = lz4sd->prefixEnd - lz4sd->extDictSize; + result = LZ4_decompress_generic(source, dest, 0, originalSize, + endOnOutputSize, full, 0, + usingExtDict, (BYTE*)dest, lz4sd->externalDict, lz4sd->extDictSize); + if 
(result <= 0) return result; + lz4sd->prefixSize = originalSize; + lz4sd->prefixEnd = (BYTE*)dest + originalSize; + } + + return result; +} + + +/* +Advanced decoding functions : +*_usingDict() : + These decoding functions work the same as "_continue" ones, + the dictionary must be explicitly provided within parameters +*/ + +FORCE_INLINE int LZ4_decompress_usingDict_generic(const char* source, char* dest, int compressedSize, int maxOutputSize, int safe, const char* dictStart, int dictSize) +{ + if (dictSize==0) + return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, safe, full, 0, noDict, (BYTE*)dest, NULL, 0); + if (dictStart+dictSize == dest) { + if (dictSize >= (int)(64 KB - 1)) + return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, safe, full, 0, withPrefix64k, (BYTE*)dest-64 KB, NULL, 0); + return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, safe, full, 0, noDict, (BYTE*)dest-dictSize, NULL, 0); + } + return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, safe, full, 0, usingExtDict, (BYTE*)dest, (const BYTE*)dictStart, dictSize); +} + +int LZ4_decompress_safe_usingDict(const char* source, char* dest, int compressedSize, int maxOutputSize, const char* dictStart, int dictSize) +{ + return LZ4_decompress_usingDict_generic(source, dest, compressedSize, maxOutputSize, 1, dictStart, dictSize); +} + +int LZ4_decompress_fast_usingDict(const char* source, char* dest, int originalSize, const char* dictStart, int dictSize) +{ + return LZ4_decompress_usingDict_generic(source, dest, 0, originalSize, 0, dictStart, dictSize); +} + +/* debug function */ +int LZ4_decompress_safe_forceExtDict(const char* source, char* dest, int compressedSize, int maxOutputSize, const char* dictStart, int dictSize) +{ + return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, endOnInputSize, full, 0, usingExtDict, (BYTE*)dest, (const BYTE*)dictStart, dictSize); +} + + 
+/*=************************************************* +* Obsolete Functions +***************************************************/ +/* obsolete compression functions */ +int LZ4_compress_limitedOutput(const char* source, char* dest, int inputSize, int maxOutputSize) { return LZ4_compress_default(source, dest, inputSize, maxOutputSize); } +int LZ4_compress(const char* source, char* dest, int inputSize) { return LZ4_compress_default(source, dest, inputSize, LZ4_compressBound(inputSize)); } +int LZ4_compress_limitedOutput_withState (void* state, const char* src, char* dst, int srcSize, int dstSize) { return LZ4_compress_fast_extState(state, src, dst, srcSize, dstSize, 1); } +int LZ4_compress_withState (void* state, const char* src, char* dst, int srcSize) { return LZ4_compress_fast_extState(state, src, dst, srcSize, LZ4_compressBound(srcSize), 1); } +int LZ4_compress_limitedOutput_continue (LZ4_stream_t* LZ4_stream, const char* src, char* dst, int srcSize, int maxDstSize) { return LZ4_compress_fast_continue(LZ4_stream, src, dst, srcSize, maxDstSize, 1); } +int LZ4_compress_continue (LZ4_stream_t* LZ4_stream, const char* source, char* dest, int inputSize) { return LZ4_compress_fast_continue(LZ4_stream, source, dest, inputSize, LZ4_compressBound(inputSize), 1); } + +/* +These function names are deprecated and should no longer be used. +They are only provided here for compatibility with older user programs. 
+- LZ4_uncompress is totally equivalent to LZ4_decompress_fast +- LZ4_uncompress_unknownOutputSize is totally equivalent to LZ4_decompress_safe +*/ +int LZ4_uncompress (const char* source, char* dest, int outputSize) { return LZ4_decompress_fast(source, dest, outputSize); } +int LZ4_uncompress_unknownOutputSize (const char* source, char* dest, int isize, int maxOutputSize) { return LZ4_decompress_safe(source, dest, isize, maxOutputSize); } + + +/* Obsolete Streaming functions */ + +int LZ4_sizeofStreamState() { return LZ4_STREAMSIZE; } + +static void LZ4_init(LZ4_stream_t* lz4ds, BYTE* base) +{ + MEM_INIT(lz4ds, 0, sizeof(LZ4_stream_t)); + lz4ds->internal_donotuse.bufferStart = base; +} + +int LZ4_resetStreamState(void* state, char* inputBuffer) +{ + if ((((uptrval)state) & 3) != 0) return 1; /* Error : pointer is not aligned on 4-bytes boundary */ + LZ4_init((LZ4_stream_t*)state, (BYTE*)inputBuffer); + return 0; +} + +void* LZ4_create (char* inputBuffer) +{ + LZ4_stream_t* lz4ds = (LZ4_stream_t*)ALLOCATOR(8, sizeof(LZ4_stream_t)); + LZ4_init (lz4ds, (BYTE*)inputBuffer); + return lz4ds; +} + +char* LZ4_slideInputBuffer (void* LZ4_Data) +{ + LZ4_stream_t_internal* ctx = &((LZ4_stream_t*)LZ4_Data)->internal_donotuse; + int dictSize = LZ4_saveDict((LZ4_stream_t*)LZ4_Data, (char*)ctx->bufferStart, 64 KB); + return (char*)(ctx->bufferStart + dictSize); +} + +/* Obsolete streaming decompression functions */ + +int LZ4_decompress_safe_withPrefix64k(const char* source, char* dest, int compressedSize, int maxOutputSize) +{ + return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, endOnInputSize, full, 0, withPrefix64k, (BYTE*)dest - 64 KB, NULL, 64 KB); +} + +int LZ4_decompress_fast_withPrefix64k(const char* source, char* dest, int originalSize) +{ + return LZ4_decompress_generic(source, dest, 0, originalSize, endOnOutputSize, full, 0, withPrefix64k, (BYTE*)dest - 64 KB, NULL, 64 KB); +} + +#endif /* LZ4_COMMONDEFS_ONLY */ diff --git a/samtools/lz4/lz4.h 
b/samtools/lz4/lz4.h new file mode 100644 index 0000000..86ca0d5 --- /dev/null +++ b/samtools/lz4/lz4.h @@ -0,0 +1,463 @@ +/* + * LZ4 - Fast LZ compression algorithm + * Header File + * Copyright (C) 2011-2017, Yann Collet. + + BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following disclaimer + in the documentation and/or other materials provided with the + distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ + You can contact the author at : + - LZ4 homepage : http://www.lz4.org + - LZ4 source repository : https://github.com/lz4/lz4 +*/ +#if defined (__cplusplus) +extern "C" { +#endif + +#ifndef LZ4_H_2983827168210 +#define LZ4_H_2983827168210 + +/* --- Dependency --- */ +#include /* size_t */ + + +/** + Introduction + + LZ4 is lossless compression algorithm, providing compression speed at 400 MB/s per core, + scalable with multi-cores CPU. It features an extremely fast decoder, with speed in + multiple GB/s per core, typically reaching RAM speed limits on multi-core systems. + + The LZ4 compression library provides in-memory compression and decompression functions. + Compression can be done in: + - a single step (described as Simple Functions) + - a single step, reusing a context (described in Advanced Functions) + - unbounded multiple steps (described as Streaming compression) + + lz4.h provides block compression functions. It gives full buffer control to user. + Decompressing an lz4-compressed block also requires metadata (such as compressed size). + Each application is free to encode such metadata in whichever way it wants. + + An additional format, called LZ4 frame specification (doc/lz4_Frame_format.md), + take care of encoding standard metadata alongside LZ4-compressed blocks. + If your application requires interoperability, it's recommended to use it. + A library is provided to take care of it, see lz4frame.h. +*/ + +/*^*************************************************************** +* Export parameters +*****************************************************************/ +/* +* LZ4_DLL_EXPORT : +* Enable exporting of functions when building a Windows DLL +* LZ4LIB_API : +* Control library symbols visibility. 
+*/ +#if defined(LZ4_DLL_EXPORT) && (LZ4_DLL_EXPORT==1) +# define LZ4LIB_API __declspec(dllexport) +#elif defined(LZ4_DLL_IMPORT) && (LZ4_DLL_IMPORT==1) +# define LZ4LIB_API __declspec(dllimport) /* It isn't required but allows to generate better code, saving a function pointer load from the IAT and an indirect jump.*/ +#elif defined(__GNUC__) && (__GNUC__ >= 4) +# define LZ4LIB_API __attribute__ ((__visibility__ ("default"))) +#else +# define LZ4LIB_API +#endif + + +/*------ Version ------*/ +#define LZ4_VERSION_MAJOR 1 /* for breaking interface changes */ +#define LZ4_VERSION_MINOR 8 /* for new (non-breaking) interface capabilities */ +#define LZ4_VERSION_RELEASE 0 /* for tweaks, bug-fixes, or development */ + +#define LZ4_VERSION_NUMBER (LZ4_VERSION_MAJOR *100*100 + LZ4_VERSION_MINOR *100 + LZ4_VERSION_RELEASE) + +#define LZ4_LIB_VERSION LZ4_VERSION_MAJOR.LZ4_VERSION_MINOR.LZ4_VERSION_RELEASE +#define LZ4_QUOTE(str) #str +#define LZ4_EXPAND_AND_QUOTE(str) LZ4_QUOTE(str) +#define LZ4_VERSION_STRING LZ4_EXPAND_AND_QUOTE(LZ4_LIB_VERSION) + +LZ4LIB_API int LZ4_versionNumber (void); /**< library version number; to be used when checking dll version */ +LZ4LIB_API const char* LZ4_versionString (void); /**< library version string; to be used when checking dll version */ + + +/*-************************************ +* Tuning parameter +**************************************/ +/*! + * LZ4_MEMORY_USAGE : + * Memory usage formula : N->2^N Bytes (examples : 10 -> 1KB; 12 -> 4KB ; 16 -> 64KB; 20 -> 1MB; etc.) + * Increasing memory usage improves compression ratio + * Reduced memory usage can improve speed, due to cache effect + * Default value is 14, for 16KB, which nicely fits into Intel x86 L1 cache + */ +#ifndef LZ4_MEMORY_USAGE +# define LZ4_MEMORY_USAGE 14 +#endif + +/*-************************************ +* Simple Functions +**************************************/ +/*! 
LZ4_compress_default() : + Compresses 'sourceSize' bytes from buffer 'source' + into already allocated 'dest' buffer of size 'maxDestSize'. + Compression is guaranteed to succeed if 'maxDestSize' >= LZ4_compressBound(sourceSize). + It also runs faster, so it's a recommended setting. + If the function cannot compress 'source' into a more limited 'dest' budget, + compression stops *immediately*, and the function result is zero. + As a consequence, 'dest' content is not valid. + This function never writes outside 'dest' buffer, nor read outside 'source' buffer. + sourceSize : Max supported value is LZ4_MAX_INPUT_VALUE + maxDestSize : full or partial size of buffer 'dest' (which must be already allocated) + return : the number of bytes written into buffer 'dest' (necessarily <= maxOutputSize) + or 0 if compression fails */ +LZ4LIB_API int LZ4_compress_default(const char* source, char* dest, int sourceSize, int maxDestSize); + +/*! LZ4_decompress_safe() : + compressedSize : is the precise full size of the compressed block. + maxDecompressedSize : is the size of destination buffer, which must be already allocated. + return : the number of bytes decompressed into destination buffer (necessarily <= maxDecompressedSize) + If destination buffer is not large enough, decoding will stop and output an error code (<0). + If the source stream is detected malformed, the function will stop decoding and return a negative result. + This function is protected against buffer overflow exploits, including malicious data packets. + It never writes outside output buffer, nor reads outside input buffer. +*/ +LZ4LIB_API int LZ4_decompress_safe (const char* source, char* dest, int compressedSize, int maxDecompressedSize); + + +/*-************************************ +* Advanced Functions +**************************************/ +#define LZ4_MAX_INPUT_SIZE 0x7E000000 /* 2 113 929 216 bytes */ +#define LZ4_COMPRESSBOUND(isize) ((unsigned)(isize) > (unsigned)LZ4_MAX_INPUT_SIZE ? 
0 : (isize) + ((isize)/255) + 16) + +/*! +LZ4_compressBound() : + Provides the maximum size that LZ4 compression may output in a "worst case" scenario (input data not compressible) + This function is primarily useful for memory allocation purposes (destination buffer size). + Macro LZ4_COMPRESSBOUND() is also provided for compilation-time evaluation (stack memory allocation for example). + Note that LZ4_compress_default() compress faster when dest buffer size is >= LZ4_compressBound(srcSize) + inputSize : max supported value is LZ4_MAX_INPUT_SIZE + return : maximum output size in a "worst case" scenario + or 0, if input size is too large ( > LZ4_MAX_INPUT_SIZE) +*/ +LZ4LIB_API int LZ4_compressBound(int inputSize); + +/*! +LZ4_compress_fast() : + Same as LZ4_compress_default(), but allows to select an "acceleration" factor. + The larger the acceleration value, the faster the algorithm, but also the lesser the compression. + It's a trade-off. It can be fine tuned, with each successive value providing roughly +~3% to speed. + An acceleration value of "1" is the same as regular LZ4_compress_default() + Values <= 0 will be replaced by ACCELERATION_DEFAULT (see lz4.c), which is 1. +*/ +LZ4LIB_API int LZ4_compress_fast (const char* source, char* dest, int sourceSize, int maxDestSize, int acceleration); + + +/*! +LZ4_compress_fast_extState() : + Same compression function, just using an externally allocated memory space to store compression state. + Use LZ4_sizeofState() to know how much memory must be allocated, + and allocate it on 8-bytes boundaries (using malloc() typically). + Then, provide it as 'void* state' to compression function. +*/ +LZ4LIB_API int LZ4_sizeofState(void); +LZ4LIB_API int LZ4_compress_fast_extState (void* state, const char* source, char* dest, int inputSize, int maxDestSize, int acceleration); + + +/*! 
+LZ4_compress_destSize() : + Reverse the logic, by compressing as much data as possible from 'source' buffer + into already allocated buffer 'dest' of size 'targetDestSize'. + This function either compresses the entire 'source' content into 'dest' if it's large enough, + or fill 'dest' buffer completely with as much data as possible from 'source'. + *sourceSizePtr : will be modified to indicate how many bytes where read from 'source' to fill 'dest'. + New value is necessarily <= old value. + return : Nb bytes written into 'dest' (necessarily <= targetDestSize) + or 0 if compression fails +*/ +LZ4LIB_API int LZ4_compress_destSize (const char* source, char* dest, int* sourceSizePtr, int targetDestSize); + + +/*! +LZ4_decompress_fast() : + originalSize : is the original and therefore uncompressed size + return : the number of bytes read from the source buffer (in other words, the compressed size) + If the source stream is detected malformed, the function will stop decoding and return a negative result. + Destination buffer must be already allocated. Its size must be a minimum of 'originalSize' bytes. + note : This function fully respect memory boundaries for properly formed compressed data. + It is a bit faster than LZ4_decompress_safe(). + However, it does not provide any protection against intentionally modified data stream (malicious input). + Use this function in trusted environment only (data to decode comes from a trusted source). +*/ +LZ4LIB_API int LZ4_decompress_fast (const char* source, char* dest, int originalSize); + +/*! +LZ4_decompress_safe_partial() : + This function decompress a compressed block of size 'compressedSize' at position 'source' + into destination buffer 'dest' of size 'maxDecompressedSize'. + The function tries to stop decompressing operation as soon as 'targetOutputSize' has been reached, + reducing decompression time. 
+ return : the number of bytes decoded in the destination buffer (necessarily <= maxDecompressedSize) + Note : this number can be < 'targetOutputSize' should the compressed block to decode be smaller. + Always control how many bytes were decoded. + If the source stream is detected malformed, the function will stop decoding and return a negative result. + This function never writes outside of output buffer, and never reads outside of input buffer. It is therefore protected against malicious data packets +*/ +LZ4LIB_API int LZ4_decompress_safe_partial (const char* source, char* dest, int compressedSize, int targetOutputSize, int maxDecompressedSize); + + +/*-********************************************* +* Streaming Compression Functions +***********************************************/ +typedef union LZ4_stream_u LZ4_stream_t; /* incomplete type (defined later) */ + +/*! LZ4_createStream() and LZ4_freeStream() : + * LZ4_createStream() will allocate and initialize an `LZ4_stream_t` structure. + * LZ4_freeStream() releases its memory. + */ +LZ4LIB_API LZ4_stream_t* LZ4_createStream(void); +LZ4LIB_API int LZ4_freeStream (LZ4_stream_t* streamPtr); + +/*! LZ4_resetStream() : + * An LZ4_stream_t structure can be allocated once and re-used multiple times. + * Use this function to init an allocated `LZ4_stream_t` structure and start a new compression. + */ +LZ4LIB_API void LZ4_resetStream (LZ4_stream_t* streamPtr); + +/*! LZ4_loadDict() : + * Use this function to load a static dictionary into LZ4_stream. + * Any previous data will be forgotten, only 'dictionary' will remain in memory. + * Loading a size of 0 is allowed. + * Return : dictionary size, in bytes (necessarily <= 64 KB) + */ +LZ4LIB_API int LZ4_loadDict (LZ4_stream_t* streamPtr, const char* dictionary, int dictSize); + +/*! LZ4_compress_fast_continue() : + * Compress buffer content 'src', using data from previously compressed blocks as dictionary to improve compression ratio. 
+ * Important : Previous data blocks are assumed to remain present and unmodified ! + * 'dst' buffer must be already allocated. + * If dstCapacity >= LZ4_compressBound(srcSize), compression is guaranteed to succeed, and runs faster. + * If not, and if compressed data cannot fit into 'dst' buffer size, compression stops, and function @return==0. + * After an error, the stream status is invalid, it can only be reset or freed. + */ +LZ4LIB_API int LZ4_compress_fast_continue (LZ4_stream_t* streamPtr, const char* src, char* dst, int srcSize, int dstCapacity, int acceleration); + +/*! LZ4_saveDict() : + * If previously compressed data block is not guaranteed to remain available at its current memory location, + * save it into a safer place (char* safeBuffer). + * Note : it's not necessary to call LZ4_loadDict() after LZ4_saveDict(), dictionary is immediately usable. + * @return : saved dictionary size in bytes (necessarily <= dictSize), or 0 if error. + */ +LZ4LIB_API int LZ4_saveDict (LZ4_stream_t* streamPtr, char* safeBuffer, int dictSize); + + +/*-********************************************** +* Streaming Decompression Functions +* Bufferless synchronous API +************************************************/ +typedef union LZ4_streamDecode_u LZ4_streamDecode_t; /* incomplete type (defined later) */ + +/*! LZ4_createStreamDecode() and LZ4_freeStreamDecode() : + * creation / destruction of streaming decompression tracking structure */ +LZ4LIB_API LZ4_streamDecode_t* LZ4_createStreamDecode(void); +LZ4LIB_API int LZ4_freeStreamDecode (LZ4_streamDecode_t* LZ4_stream); + +/*! LZ4_setStreamDecode() : + * Use this function to instruct where to find the dictionary. + * Setting a size of 0 is allowed (same effect as reset). + * @return : 1 if OK, 0 if error + */ +LZ4LIB_API int LZ4_setStreamDecode (LZ4_streamDecode_t* LZ4_streamDecode, const char* dictionary, int dictSize); + +/*! 
LZ4_decompress_*_continue() : + * These decoding functions allow decompression of multiple blocks in "streaming" mode. + * Previously decoded blocks *must* remain available at the memory position where they were decoded (up to 64 KB) + * In the case of a ring buffers, decoding buffer must be either : + * - Exactly same size as encoding buffer, with same update rule (block boundaries at same positions) + * In which case, the decoding & encoding ring buffer can have any size, including very small ones ( < 64 KB). + * - Larger than encoding buffer, by a minimum of maxBlockSize more bytes. + * maxBlockSize is implementation dependent. It's the maximum size you intend to compress into a single block. + * In which case, encoding and decoding buffers do not need to be synchronized, + * and encoding ring buffer can have any size, including small ones ( < 64 KB). + * - _At least_ 64 KB + 8 bytes + maxBlockSize. + * In which case, encoding and decoding buffers do not need to be synchronized, + * and encoding ring buffer can have any size, including larger than decoding buffer. + * Whenever these conditions are not possible, save the last 64KB of decoded data into a safe buffer, + * and indicate where it is saved using LZ4_setStreamDecode() +*/ +LZ4LIB_API int LZ4_decompress_safe_continue (LZ4_streamDecode_t* LZ4_streamDecode, const char* source, char* dest, int compressedSize, int maxDecompressedSize); +LZ4LIB_API int LZ4_decompress_fast_continue (LZ4_streamDecode_t* LZ4_streamDecode, const char* source, char* dest, int originalSize); + + +/*! LZ4_decompress_*_usingDict() : + * These decoding functions work the same as + * a combination of LZ4_setStreamDecode() followed by LZ4_decompress_*_continue() + * They are stand-alone, and don't need an LZ4_streamDecode_t structure. 
+ */ +LZ4LIB_API int LZ4_decompress_safe_usingDict (const char* source, char* dest, int compressedSize, int maxDecompressedSize, const char* dictStart, int dictSize); +LZ4LIB_API int LZ4_decompress_fast_usingDict (const char* source, char* dest, int originalSize, const char* dictStart, int dictSize); + + +/*^********************************************** + * !!!!!! STATIC LINKING ONLY !!!!!! + ***********************************************/ +/*-************************************ + * Private definitions + ************************************** + * Do not use these definitions. + * They are exposed to allow static allocation of `LZ4_stream_t` and `LZ4_streamDecode_t`. + * Using these definitions will expose code to API and/or ABI break in future versions of the library. + **************************************/ +#define LZ4_HASHLOG (LZ4_MEMORY_USAGE-2) +#define LZ4_HASHTABLESIZE (1 << LZ4_MEMORY_USAGE) +#define LZ4_HASH_SIZE_U32 (1 << LZ4_HASHLOG) /* required as macro for static allocation */ + +#if defined(__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) +#include + +typedef struct { + uint32_t hashTable[LZ4_HASH_SIZE_U32]; + uint32_t currentOffset; + uint32_t initCheck; + const uint8_t* dictionary; + uint8_t* bufferStart; /* obsolete, used for slideInputBuffer */ + uint32_t dictSize; +} LZ4_stream_t_internal; + +typedef struct { + const uint8_t* externalDict; + size_t extDictSize; + const uint8_t* prefixEnd; + size_t prefixSize; +} LZ4_streamDecode_t_internal; + +#else + +typedef struct { + unsigned int hashTable[LZ4_HASH_SIZE_U32]; + unsigned int currentOffset; + unsigned int initCheck; + const unsigned char* dictionary; + unsigned char* bufferStart; /* obsolete, used for slideInputBuffer */ + unsigned int dictSize; +} LZ4_stream_t_internal; + +typedef struct { + const unsigned char* externalDict; + size_t extDictSize; + const unsigned char* prefixEnd; + size_t prefixSize; +} LZ4_streamDecode_t_internal; + +#endif + +/*! 
+ * LZ4_stream_t : + * information structure to track an LZ4 stream. + * init this structure before first use. + * note : only use in association with static linking ! + * this definition is not API/ABI safe, + * it may change in a future version ! + */ +#define LZ4_STREAMSIZE_U64 ((1 << (LZ4_MEMORY_USAGE-3)) + 4) +#define LZ4_STREAMSIZE (LZ4_STREAMSIZE_U64 * sizeof(unsigned long long)) +union LZ4_stream_u { + unsigned long long table[LZ4_STREAMSIZE_U64]; + LZ4_stream_t_internal internal_donotuse; +} ; /* previously typedef'd to LZ4_stream_t */ + + +/*! + * LZ4_streamDecode_t : + * information structure to track an LZ4 stream during decompression. + * init this structure using LZ4_setStreamDecode (or memset()) before first use + * note : only use in association with static linking ! + * this definition is not API/ABI safe, + * and may change in a future version ! + */ +#define LZ4_STREAMDECODESIZE_U64 4 +#define LZ4_STREAMDECODESIZE (LZ4_STREAMDECODESIZE_U64 * sizeof(unsigned long long)) +union LZ4_streamDecode_u { + unsigned long long table[LZ4_STREAMDECODESIZE_U64]; + LZ4_streamDecode_t_internal internal_donotuse; +} ; /* previously typedef'd to LZ4_streamDecode_t */ + + +/*-************************************ +* Obsolete Functions +**************************************/ + +/*! Deprecation warnings + Should deprecation warnings be a problem, + it is generally possible to disable them, + typically with -Wno-deprecated-declarations for gcc + or _CRT_SECURE_NO_WARNINGS in Visual. 
+ Otherwise, it's also possible to define LZ4_DISABLE_DEPRECATE_WARNINGS */ +#ifdef LZ4_DISABLE_DEPRECATE_WARNINGS +# define LZ4_DEPRECATED(message) /* disable deprecation warnings */ +#else +# define LZ4_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__) +# if defined (__cplusplus) && (__cplusplus >= 201402) /* C++14 or greater */ +# define LZ4_DEPRECATED(message) [[deprecated(message)]] +# elif (LZ4_GCC_VERSION >= 405) || defined(__clang__) +# define LZ4_DEPRECATED(message) __attribute__((deprecated(message))) +# elif (LZ4_GCC_VERSION >= 301) +# define LZ4_DEPRECATED(message) __attribute__((deprecated)) +# elif defined(_MSC_VER) +# define LZ4_DEPRECATED(message) __declspec(deprecated(message)) +# else +# pragma message("WARNING: You need to implement LZ4_DEPRECATED for this compiler") +# define LZ4_DEPRECATED(message) +# endif +#endif /* LZ4_DISABLE_DEPRECATE_WARNINGS */ + +/* Obsolete compression functions */ +LZ4LIB_API LZ4_DEPRECATED("use LZ4_compress_default() instead") int LZ4_compress (const char* source, char* dest, int sourceSize); +LZ4LIB_API LZ4_DEPRECATED("use LZ4_compress_default() instead") int LZ4_compress_limitedOutput (const char* source, char* dest, int sourceSize, int maxOutputSize); +LZ4LIB_API LZ4_DEPRECATED("use LZ4_compress_fast_extState() instead") int LZ4_compress_withState (void* state, const char* source, char* dest, int inputSize); +LZ4LIB_API LZ4_DEPRECATED("use LZ4_compress_fast_extState() instead") int LZ4_compress_limitedOutput_withState (void* state, const char* source, char* dest, int inputSize, int maxOutputSize); +LZ4LIB_API LZ4_DEPRECATED("use LZ4_compress_fast_continue() instead") int LZ4_compress_continue (LZ4_stream_t* LZ4_streamPtr, const char* source, char* dest, int inputSize); +LZ4LIB_API LZ4_DEPRECATED("use LZ4_compress_fast_continue() instead") int LZ4_compress_limitedOutput_continue (LZ4_stream_t* LZ4_streamPtr, const char* source, char* dest, int inputSize, int maxOutputSize); + +/* Obsolete decompression functions */ 
+LZ4LIB_API LZ4_DEPRECATED("use LZ4_decompress_fast() instead") int LZ4_uncompress (const char* source, char* dest, int outputSize); +LZ4LIB_API LZ4_DEPRECATED("use LZ4_decompress_safe() instead") int LZ4_uncompress_unknownOutputSize (const char* source, char* dest, int isize, int maxOutputSize); + +/* Obsolete streaming functions; use new streaming interface whenever possible */ +LZ4LIB_API LZ4_DEPRECATED("use LZ4_createStream() instead") void* LZ4_create (char* inputBuffer); +LZ4LIB_API LZ4_DEPRECATED("use LZ4_createStream() instead") int LZ4_sizeofStreamState(void); +LZ4LIB_API LZ4_DEPRECATED("use LZ4_resetStream() instead") int LZ4_resetStreamState(void* state, char* inputBuffer); +LZ4LIB_API LZ4_DEPRECATED("use LZ4_saveDict() instead") char* LZ4_slideInputBuffer (void* state); + +/* Obsolete streaming decoding functions */ +LZ4LIB_API LZ4_DEPRECATED("use LZ4_decompress_safe_usingDict() instead") int LZ4_decompress_safe_withPrefix64k (const char* src, char* dst, int compressedSize, int maxDstSize); +LZ4LIB_API LZ4_DEPRECATED("use LZ4_decompress_fast_usingDict() instead") int LZ4_decompress_fast_withPrefix64k (const char* src, char* dst, int originalSize); + +#endif /* LZ4_H_2983827168210 */ + + +#if defined (__cplusplus) +} +#endif diff --git a/samtools/misc/ace2sam.c.pysam.c b/samtools/misc/ace2sam.c.pysam.c index 02d2f58..1135967 100644 --- a/samtools/misc/ace2sam.c.pysam.c +++ b/samtools/misc/ace2sam.c.pysam.c @@ -1,4 +1,4 @@ -#include "pysam.h" +#include "samtools.pysam.h" /* The MIT License @@ -51,7 +51,7 @@ KSTREAM_INIT(gzFile, gzread, 16384) // a fatal error static void fatal(const char *msg) { - fprintf(pysam_stderr, "E %s\n", msg); + fprintf(samtools_stderr, "E %s\n", msg); exit(1); } // remove pads @@ -82,13 +82,13 @@ int samtools_ace2sam_main(int argc, char *argv[]) } } if (argc == optind) { - fprintf(pysam_stderr, "\nUsage: ace2sam [-pc] \n\n"); - fprintf(pysam_stderr, "Options: -p output padded SAM\n"); - fprintf(pysam_stderr, " -c write the contig 
sequence in SAM\n\n"); - fprintf(pysam_stderr, "Notes: 1. Fields must appear in the following order: (CO->[BQ]->(AF)->(RD->QA))\n"); - fprintf(pysam_stderr, " 2. The order of reads in AF and in RD must be identical\n"); - fprintf(pysam_stderr, " 3. Except in BQ, words and numbers must be separated by a single SPACE or TAB\n"); - fprintf(pysam_stderr, " 4. This program writes the headerless SAM to pysam_stdout and header to pysam_stderr\n\n"); + fprintf(samtools_stderr, "\nUsage: ace2sam [-pc] \n\n"); + fprintf(samtools_stderr, "Options: -p output padded SAM\n"); + fprintf(samtools_stderr, " -c write the contig sequence in SAM\n\n"); + fprintf(samtools_stderr, "Notes: 1. Fields must appear in the following order: (CO->[BQ]->(AF)->(RD->QA))\n"); + fprintf(samtools_stderr, " 2. The order of reads in AF and in RD must be identical\n"); + fprintf(samtools_stderr, " 3. Except in BQ, words and numbers must be separated by a single SPACE or TAB\n"); + fprintf(samtools_stderr, " 4. This program writes the headerless SAM to samtools_stdout and header to samtools_stderr\n\n"); return 1; } @@ -113,14 +113,14 @@ int samtools_ace2sam_main(int argc, char *argv[]) if (t[1].s[i] != '*') ++k; } // write out the SAM header and contig sequences - fprintf(pysam_stderr, "H @SQ\tSN:%s\tLN:%llu\n", t[0].s, (unsigned long long)(t[is_padded?1:2].l)); // The SAM header line + fprintf(samtools_stderr, "H @SQ\tSN:%s\tLN:%llu\n", t[0].s, (unsigned long long)(t[is_padded?1:2].l)); // The SAM header line cns = &t[is_padded?1:2]; - fprintf(pysam_stderr, "S >%s\n", t[0].s); + fprintf(samtools_stderr, "S >%s\n", t[0].s); for (i = 0; i < cns->l; i += LINE_LEN) { - fputs("S ", pysam_stderr); + fputs("S ", samtools_stderr); for (k = 0; k < LINE_LEN && i + k < cns->l; ++k) - fputc(cns->s[i + k], pysam_stderr); - fputc('\n', pysam_stderr); + fputc(cns->s[i + k], samtools_stderr); + fputc('\n', samtools_stderr); } #define __padded2cigar(sp) do { \ @@ -154,7 +154,7 @@ int samtools_ace2sam_main(int argc, 
char *argv[]) if (write_cns) t[4].s[--t[4].l] = 0; // remove the trailing "*" for (i = 0; i < t[2].l; ++i) { // read the consensus quality int q; - if (ks_getuntil(ks, 0, &s, &dret) < 0) fprintf(pysam_stderr, "E truncated contig quality\n"); + if (ks_getuntil(ks, 0, &s, &dret) < 0) fprintf(samtools_stderr, "E truncated contig quality\n"); if (s.l) { q = atoi(s.s) + 33; if (q > 126) q = 126; @@ -164,14 +164,14 @@ int samtools_ace2sam_main(int argc, char *argv[]) if (dret != '\n') ks_getuntil(ks, '\n', &s, &dret); ks_getuntil(ks, '\n', &s, &dret); // skip the empty line if (write_cns) { - if (t[4].l) fputs(t[4].s, pysam_stdout) & fputc('\n', pysam_stdout); + if (t[4].l) fputs(t[4].s, samtools_stdout) & fputc('\n', samtools_stdout); t[4].l = 0; } } else if (strcmp(s.s, "AF") == 0) { // padded read position int reversed, neg, pos; if (t[0].l == 0) fatal("come to 'AF' before reading 'CO'"); if (write_cns) { - if (t[4].l) fputs(t[4].s, pysam_stdout) & fputc('\n', pysam_stdout); + if (t[4].l) fputs(t[4].s, samtools_stdout) & fputc('\n', samtools_stdout); t[4].l = 0; } ks_getuntil(ks, 0, &s, &dret); // read name @@ -244,7 +244,7 @@ int samtools_ace2sam_main(int argc, char *argv[]) kputs("\t*\t0\t0\t", &t[4]); // empty MRNM, MPOS and TLEN kputsn(t[3].s, t[3].l, &t[4]); // unpadded SEQ kputs("\t*", &t[4]); // QUAL - fputs(t[4].s, pysam_stdout) & fputc('\n', pysam_stdout); // print to pysam_stdout + fputs(t[4].s, samtools_stdout) & fputc('\n', samtools_stdout); // print to samtools_stdout ++af_i; } else if (dret != '\n') ks_getuntil(ks, '\n', &s, &dret); } diff --git a/samtools/padding.c.pysam.c b/samtools/padding.c.pysam.c index 901f027..dfaedb7 100644 --- a/samtools/padding.c.pysam.c +++ b/samtools/padding.c.pysam.c @@ -1,4 +1,4 @@ -#include "pysam.h" +#include "samtools.pysam.h" /* padding.c -- depad subcommand. 
@@ -99,10 +99,10 @@ static int unpad_seq(bam1_t *b, kstring_t *s) for (i = 0; i < ol; ++i) s->s[s->l++] = 0; if (0 == cigar_n_warning) { cigar_n_warning = -1; - fprintf(pysam_stderr, "[depad] WARNING: CIGAR op N treated as op D in read %s\n", bam_get_qname(b)); + fprintf(samtools_stderr, "[depad] WARNING: CIGAR op N treated as op D in read %s\n", bam_get_qname(b)); } } else { - fprintf(pysam_stderr, "[depad] ERROR: Didn't expect CIGAR op %c in read %s\n", BAM_CIGAR_STR[op], bam_get_qname(b)); + fprintf(samtools_stderr, "[depad] ERROR: Didn't expect CIGAR op %c in read %s\n", BAM_CIGAR_STR[op], bam_get_qname(b)); return -1; } } @@ -117,7 +117,7 @@ int load_unpadded_ref(faidx_t *fai, char *ref_name, int ref_len, kstring_t *seq) fai_ref = fai_fetch(fai, ref_name, &fai_ref_len); if (fai_ref_len != ref_len) { - fprintf(pysam_stderr, "[depad] ERROR: FASTA sequence %s length %i, expected %i\n", ref_name, fai_ref_len, ref_len); + fprintf(samtools_stderr, "[depad] ERROR: FASTA sequence %s length %i, expected %i\n", ref_name, fai_ref_len, ref_len); free(fai_ref); return -1; } @@ -131,7 +131,7 @@ int load_unpadded_ref(faidx_t *fai, char *ref_name, int ref_len, kstring_t *seq) } else { int i = seq_nt16_table[(int)base]; if (i == 0 || i==16) { // Equals maps to 0, anything unexpected to 16 - fprintf(pysam_stderr, "[depad] ERROR: Invalid character %c (ASCII %i) in FASTA sequence %s\n", base, (int)base, ref_name); + fprintf(samtools_stderr, "[depad] ERROR: Invalid character %c (ASCII %i) in FASTA sequence %s\n", base, (int)base, ref_name); free(fai_ref); return -1; } @@ -152,19 +152,19 @@ int get_unpadded_len(faidx_t *fai, char *ref_name, int padded_len) fai_ref = fai_fetch(fai, ref_name, &fai_ref_len); if (fai_ref_len != padded_len) { - fprintf(pysam_stderr, "[depad] ERROR: FASTA sequence '%s' length %i, expected %i\n", ref_name, fai_ref_len, padded_len); + fprintf(samtools_stderr, "[depad] ERROR: FASTA sequence '%s' length %i, expected %i\n", ref_name, fai_ref_len, padded_len); 
free(fai_ref); return -1; } for (k = 0; k < padded_len; ++k) { - //fprintf(pysam_stderr, "[depad] checking base %i of %i or %i\n", k+1, ref_len, strlen(fai_ref)); + //fprintf(samtools_stderr, "[depad] checking base %i of %i or %i\n", k+1, ref_len, strlen(fai_ref)); base = fai_ref[k]; if (base == '-' || base == '*') { gaps += 1; } else { int i = seq_nt16_table[(int)base]; if (i == 0 || i==16) { // Equals maps to 0, anything unexpected to 16 - fprintf(pysam_stderr, "[depad] ERROR: Invalid character %c (ASCII %i) in FASTA sequence '%s'\n", base, (int)base, ref_name); + fprintf(samtools_stderr, "[depad] ERROR: Invalid character %c (ASCII %i) in FASTA sequence '%s'\n", base, (int)base, ref_name); free(fai_ref); return -1; } @@ -197,7 +197,7 @@ int bam_pad2unpad(samFile *in, samFile *out, bam_hdr_t *h, faidx_t *fai) b = bam_init1(); if (!b) { - fprintf(pysam_stderr, "[depad] Couldn't allocate bam struct\n"); + fprintf(samtools_stderr, "[depad] Couldn't allocate bam struct\n"); return -1; } r.l = r.m = q.l = q.m = 0; r.s = q.s = 0; @@ -210,20 +210,20 @@ int bam_pad2unpad(samFile *in, samFile *out, bam_hdr_t *h, faidx_t *fai) uint32_t *cigar = bam_get_cigar(b); n2 = 0; if (b->core.pos == 0 && b->core.tid >= 0 && strcmp(bam_get_qname(b), h->target_name[b->core.tid]) == 0) { - // fprintf(pysam_stderr, "[depad] Found embedded reference '%s'\n", bam_get_qname(b)); + // fprintf(samtools_stderr, "[depad] Found embedded reference '%s'\n", bam_get_qname(b)); r_tid = b->core.tid; if (0!=unpad_seq(b, &r)) { - fprintf(pysam_stderr, "[depad] ERROR: Problem parsing SEQ and/or CIGAR in reference %s\n", bam_get_qname(b)); + fprintf(samtools_stderr, "[depad] ERROR: Problem parsing SEQ and/or CIGAR in reference %s\n", bam_get_qname(b)); return -1; }; if (h->target_len[r_tid] != r.l) { - fprintf(pysam_stderr, "[depad] ERROR: (Padded) length of '%s' is %u in BAM header, but %llu in embedded reference\n", bam_get_qname(b), h->target_len[r_tid], (unsigned long long)(r.l)); + 
fprintf(samtools_stderr, "[depad] ERROR: (Padded) length of '%s' is %u in BAM header, but %llu in embedded reference\n", bam_get_qname(b), h->target_len[r_tid], (unsigned long long)(r.l)); return -1; } if (fai) { // Check the embedded reference matches the FASTA file if (load_unpadded_ref(fai, h->target_name[b->core.tid], h->target_len[b->core.tid], &q)) { - fprintf(pysam_stderr, "[depad] ERROR: Failed to load embedded reference '%s' from FASTA\n", h->target_name[b->core.tid]); + fprintf(samtools_stderr, "[depad] ERROR: Failed to load embedded reference '%s' from FASTA\n", h->target_name[b->core.tid]); return -1; } assert(r.l == q.l); @@ -231,7 +231,7 @@ int bam_pad2unpad(samFile *in, samFile *out, bam_hdr_t *h, faidx_t *fai) for (i = 0; i < r.l; ++i) { if (r.s[i] != q.s[i]) { // Show gaps as ASCII 45 - fprintf(pysam_stderr, "[depad] ERROR: Embedded sequence and reference FASTA don't match for %s base %i, '%c' vs '%c'\n", + fprintf(samtools_stderr, "[depad] ERROR: Embedded sequence and reference FASTA don't match for %s base %i, '%c' vs '%c'\n", h->target_name[b->core.tid], i+1, r.s[i] ? seq_nt16_str[(int)r.s[i]] : 45, q.s[i] ? 
seq_nt16_str[(int)q.s[i]] : 45); @@ -245,25 +245,25 @@ int bam_pad2unpad(samFile *in, samFile *out, bam_hdr_t *h, faidx_t *fai) } else if (b->core.n_cigar > 0) { int i, k, op; if (b->core.tid < 0) { - fprintf(pysam_stderr, "[depad] ERROR: Read '%s' has CIGAR but no RNAME\n", bam_get_qname(b)); + fprintf(samtools_stderr, "[depad] ERROR: Read '%s' has CIGAR but no RNAME\n", bam_get_qname(b)); return -1; } else if (b->core.tid == r_tid) { ; // good case, reference available - //fprintf(pysam_stderr, "[depad] Have ref '%s' for read '%s'\n", h->target_name[b->core.tid], bam_get_qname(b)); + //fprintf(samtools_stderr, "[depad] Have ref '%s' for read '%s'\n", h->target_name[b->core.tid], bam_get_qname(b)); } else if (fai) { if (load_unpadded_ref(fai, h->target_name[b->core.tid], h->target_len[b->core.tid], &r)) { - fprintf(pysam_stderr, "[depad] ERROR: Failed to load '%s' from reference FASTA\n", h->target_name[b->core.tid]); + fprintf(samtools_stderr, "[depad] ERROR: Failed to load '%s' from reference FASTA\n", h->target_name[b->core.tid]); return -1; } posmap = update_posmap(posmap, r); r_tid = b->core.tid; - // fprintf(pysam_stderr, "[depad] Loaded %s from FASTA file\n", h->target_name[b->core.tid]); + // fprintf(samtools_stderr, "[depad] Loaded %s from FASTA file\n", h->target_name[b->core.tid]); } else { - fprintf(pysam_stderr, "[depad] ERROR: Missing %s embedded reference sequence (and no FASTA file)\n", h->target_name[b->core.tid]); + fprintf(samtools_stderr, "[depad] ERROR: Missing %s embedded reference sequence (and no FASTA file)\n", h->target_name[b->core.tid]); return -1; } if (0!=unpad_seq(b, &q)) { - fprintf(pysam_stderr, "[depad] ERROR: Problem parsing SEQ and/or CIGAR in read %s\n", bam_get_qname(b)); + fprintf(samtools_stderr, "[depad] ERROR: Problem parsing SEQ and/or CIGAR in read %s\n", bam_get_qname(b)); return -1; }; if (bam_cigar_op(cigar[0]) == BAM_CSOFT_CLIP) { @@ -332,32 +332,32 @@ int bam_pad2unpad(samFile *in, samFile *out, bam_hdr_t *h, 
faidx_t *fai) if (b->core.pos != -1) b->core.pos = posmap[b->core.pos]; if (b->core.mtid < 0 || b->core.mpos < 0) { /* Nice case, no mate to worry about*/ - // fprintf(pysam_stderr, "[depad] Read '%s' mate not mapped\n", bam_get_qname(b)); + // fprintf(samtools_stderr, "[depad] Read '%s' mate not mapped\n", bam_get_qname(b)); /* TODO - Warning if FLAG says mate should be mapped? */ /* Clean up funny input where mate position is given but mate reference is missing: */ b->core.mtid = -1; b->core.mpos = -1; } else if (b->core.mtid == b->core.tid) { /* Nice case, same reference */ - // fprintf(pysam_stderr, "[depad] Read '%s' mate mapped to same ref\n", bam_get_qname(b)); + // fprintf(samtools_stderr, "[depad] Read '%s' mate mapped to same ref\n", bam_get_qname(b)); b->core.mpos = posmap[b->core.mpos]; } else { /* Nasty case, Must load alternative posmap */ - // fprintf(pysam_stderr, "[depad] Loading reference '%s' temporarily\n", h->target_name[b->core.mtid]); + // fprintf(samtools_stderr, "[depad] Loading reference '%s' temporarily\n", h->target_name[b->core.mtid]); if (!fai) { - fprintf(pysam_stderr, "[depad] ERROR: Needed reference %s sequence for mate (and no FASTA file)\n", h->target_name[b->core.mtid]); + fprintf(samtools_stderr, "[depad] ERROR: Needed reference %s sequence for mate (and no FASTA file)\n", h->target_name[b->core.mtid]); return -1; } /* Temporarily load the other reference sequence */ if (load_unpadded_ref(fai, h->target_name[b->core.mtid], h->target_len[b->core.mtid], &r)) { - fprintf(pysam_stderr, "[depad] ERROR: Failed to load '%s' from reference FASTA\n", h->target_name[b->core.mtid]); + fprintf(samtools_stderr, "[depad] ERROR: Failed to load '%s' from reference FASTA\n", h->target_name[b->core.mtid]); return -1; } posmap = update_posmap(posmap, r); b->core.mpos = posmap[b->core.mpos]; /* Restore the reference and posmap*/ if (load_unpadded_ref(fai, h->target_name[b->core.tid], h->target_len[b->core.tid], &r)) { - fprintf(pysam_stderr, 
"[depad] ERROR: Failed to load '%s' from reference FASTA\n", h->target_name[b->core.tid]); + fprintf(samtools_stderr, "[depad] ERROR: Failed to load '%s' from reference FASTA\n", h->target_name[b->core.tid]); return -1; } posmap = update_posmap(posmap, r); @@ -372,7 +372,7 @@ int bam_pad2unpad(samFile *in, samFile *out, bam_hdr_t *h, faidx_t *fai) } } if (read_ret < -1) { - fprintf(pysam_stderr, "[depad] truncated file.\n"); + fprintf(samtools_stderr, "[depad] truncated file.\n"); ret = 1; } free(r.s); free(q.s); free(posmap); @@ -390,10 +390,10 @@ bam_hdr_t * fix_header(bam_hdr_t *old, faidx_t *fai) for (i = 0; i < old->n_targets; ++i) { unpadded_len = get_unpadded_len(fai, old->target_name[i], old->target_len[i]); if (unpadded_len < 0) { - fprintf(pysam_stderr, "[depad] ERROR getting unpadded length of '%s', padded length %i\n", old->target_name[i], old->target_len[i]); + fprintf(samtools_stderr, "[depad] ERROR getting unpadded length of '%s', padded length %i\n", old->target_name[i], old->target_len[i]); } else { header->target_len[i] = unpadded_len; - //fprintf(pysam_stderr, "[depad] Recalculating '%s' length %i -> %i\n", old->target_name[i], old->target_len[i], header->target_len[i]); + //fprintf(samtools_stderr, "[depad] Recalculating '%s' length %i -> %i\n", old->target_name[i], old->target_len[i], header->target_len[i]); } } /* Duplicating the header allocated new buffer for header string */ @@ -415,7 +415,7 @@ bam_hdr_t * fix_header(bam_hdr_t *old, faidx_t *fai) char *name = strstr(text, "\tSN:"); char *name_end; if (!name) { - fprintf(pysam_stderr, "Unable to find SN: header field\n"); + fprintf(samtools_stderr, "Unable to find SN: header field\n"); return NULL; } name += 4; @@ -445,7 +445,7 @@ bam_hdr_t * fix_header(bam_hdr_t *old, faidx_t *fai) strcat(newtext, len_buf); } else { - fprintf(pysam_stderr, "LN value of the reference is larger than the original!\n"); + fprintf(samtools_stderr, "LN value of the reference is larger than the original!\n"); 
exit(1); } break; @@ -487,7 +487,7 @@ bam_hdr_t * fix_header(bam_hdr_t *old, faidx_t *fai) /* Check we didn't overflow the buffer */ assert (strlen(header->text) <= strlen(old->text)); if (strlen(header->text) < header->l_text) { - //fprintf(pysam_stderr, "[depad] Reallocating header buffer\n"); + //fprintf(samtools_stderr, "[depad] Reallocating header buffer\n"); assert (newtext == header->text); newtext = malloc(strlen(header->text) + 1); strcpy(newtext, header->text); @@ -495,7 +495,7 @@ bam_hdr_t * fix_header(bam_hdr_t *old, faidx_t *fai) header->text = newtext; header->l_text = strlen(newtext); } - //fprintf(pysam_stderr, "[depad] Here is the new header (pending @SQ lines),\n\n%s\n(end)\n", header->text); + //fprintf(samtools_stderr, "[depad] Here is the new header (pending @SQ lines),\n\n%s\n(end)\n", header->text); return header; } @@ -536,7 +536,7 @@ int main_pad2unpad(int argc, char *argv[]) break; case '?': is_long_help = 1; break; default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; - fprintf(pysam_stderr, "[bam_fillmd] unrecognized option '-%c'\n\n", c); + fprintf(samtools_stderr, "[bam_fillmd] unrecognized option '-%c'\n\n", c); return usage(is_long_help); } } @@ -561,19 +561,19 @@ int main_pad2unpad(int argc, char *argv[]) goto depad_end; } if (fn_list && hts_set_fai_filename(in, fn_list) != 0) { - fprintf(pysam_stderr, "[depad] failed to load reference file \"%s\".\n", fn_list); + fprintf(samtools_stderr, "[depad] failed to load reference file \"%s\".\n", fn_list); ret = 1; goto depad_end; } if ((h = sam_hdr_read(in)) == 0) { - fprintf(pysam_stderr, "[depad] failed to read the header from \"%s\".\n", argv[optind]); + fprintf(samtools_stderr, "[depad] failed to read the header from \"%s\".\n", argv[optind]); ret = 1; goto depad_end; } if (fai) { h_fix = fix_header(h, fai); } else { - fprintf(pysam_stderr, "[depad] Warning - reference lengths will not be corrected without FASTA reference\n"); + fprintf(samtools_stderr, "[depad] Warning 
- reference lengths will not be corrected without FASTA reference\n"); h_fix = h; } char wmode[2]; @@ -590,7 +590,7 @@ int main_pad2unpad(int argc, char *argv[]) hts_set_opt(out, CRAM_OPT_NO_REF, 1); if (sam_hdr_write(out, h_fix) != 0) { - fprintf(pysam_stderr, "[depad] failed to write header.\n"); + fprintf(samtools_stderr, "[depad] failed to write header.\n"); ret = 1; goto depad_end; } @@ -604,7 +604,7 @@ depad_end: if (h) bam_hdr_destroy(h); if (in) sam_close(in); if (out && sam_close(out) < 0) { - fprintf(pysam_stderr, "[depad] error on closing output file.\n"); + fprintf(samtools_stderr, "[depad] error on closing output file.\n"); ret = 1; } free(fn_list); free(fn_out); @@ -613,21 +613,21 @@ depad_end: static int usage(int is_long_help) { - fprintf(pysam_stderr, "\n"); - fprintf(pysam_stderr, "Usage: samtools depad \n\n"); - fprintf(pysam_stderr, "Options:\n"); - fprintf(pysam_stderr, " -s Output is SAM (default is BAM)\n"); - fprintf(pysam_stderr, " -S Input is SAM (default is BAM)\n"); - fprintf(pysam_stderr, " -u Uncompressed BAM output (can't use with -s)\n"); - fprintf(pysam_stderr, " -1 Fast compression BAM output (can't use with -s)\n"); - fprintf(pysam_stderr, " -T, --reference FILE\n"); - fprintf(pysam_stderr, " Padded reference sequence file [null]\n"); - fprintf(pysam_stderr, " -o FILE Output file name [pysam_stdout]\n"); - fprintf(pysam_stderr, " -? 
Longer help\n"); - sam_global_opt_help(pysam_stderr, "-...--"); + fprintf(samtools_stderr, "\n"); + fprintf(samtools_stderr, "Usage: samtools depad \n\n"); + fprintf(samtools_stderr, "Options:\n"); + fprintf(samtools_stderr, " -s Output is SAM (default is BAM)\n"); + fprintf(samtools_stderr, " -S Input is SAM (default is BAM)\n"); + fprintf(samtools_stderr, " -u Uncompressed BAM output (can't use with -s)\n"); + fprintf(samtools_stderr, " -1 Fast compression BAM output (can't use with -s)\n"); + fprintf(samtools_stderr, " -T, --reference FILE\n"); + fprintf(samtools_stderr, " Padded reference sequence file [null]\n"); + fprintf(samtools_stderr, " -o FILE Output file name [samtools_stdout]\n"); + fprintf(samtools_stderr, " -? Longer help\n"); + sam_global_opt_help(samtools_stderr, "-...--"); if (is_long_help) - fprintf(pysam_stderr, + fprintf(samtools_stderr, "Notes:\n" "\n" "1. Requires embedded reference sequences (before the reads for that reference),\n" diff --git a/samtools/phase.c.pysam.c b/samtools/phase.c.pysam.c index 2cfb3ae..f74ba48 100644 --- a/samtools/phase.c.pysam.c +++ b/samtools/phase.c.pysam.c @@ -1,4 +1,4 @@ -#include "pysam.h" +#include "samtools.pysam.h" /* phase.c -- phase subcommand. @@ -393,8 +393,8 @@ static int phase(phaseg_t *g, const char *chr, int vpos, uint64_t *cns, nseq_t * i = clean_seqs(vpos, hash); // i is true if hash has an element with its vpos >= vpos min_pos = i? 
cns[vpos]>>32 : 0x7fffffff; if (vpos == 1) { - fprintf(pysam_stdout, "PS\t%s\t%d\t%d\n", chr, (int)(cns[0]>>32) + 1, (int)(cns[0]>>32) + 1); - fprintf(pysam_stdout, "M0\t%s\t%d\t%d\t%c\t%c\t%d\t0\t0\t0\t0\n//\n", chr, (int)(cns[0]>>32) + 1, (int)(cns[0]>>32) + 1, + fprintf(samtools_stdout, "PS\t%s\t%d\t%d\n", chr, (int)(cns[0]>>32) + 1, (int)(cns[0]>>32) + 1); + fprintf(samtools_stdout, "M0\t%s\t%d\t%d\t%c\t%c\t%d\t0\t0\t0\t0\n//\n", chr, (int)(cns[0]>>32) + 1, (int)(cns[0]>>32) + 1, "ACGTX"[cns[0]&3], "ACGTX"[cns[0]>>16&3], g->vpos_shift + 1); for (k = 0; k < kh_end(hash); ++k) { if (kh_exist(hash, k)) { @@ -412,7 +412,7 @@ static int phase(phaseg_t *g, const char *chr, int vpos, uint64_t *cns, nseq_t * { // phase int **cnt; uint64_t *mask; - fprintf(pysam_stdout, "PS\t%s\t%d\t%d\n", chr, (int)(cns[0]>>32) + 1, (int)(cns[vpos-1]>>32) + 1); + fprintf(samtools_stdout, "PS\t%s\t%d\t%d\n", chr, (int)(cns[0]>>32) + 1, (int)(cns[vpos-1]>>32) + 1); sitemask = calloc(vpos, 1); cnt = count_all(g->k, vpos, hash); path = dynaprog(g->k, vpos, cnt); @@ -433,13 +433,13 @@ static int phase(phaseg_t *g, const char *chr, int vpos, uint64_t *cns, nseq_t * } } for (i = 0; i < n_masked; ++i) - fprintf(pysam_stdout, "FL\t%s\t%d\t%d\n", chr, (int)(regmask[i]>>32) + 1, (int)regmask[i] + 1); + fprintf(samtools_stdout, "FL\t%s\t%d\t%d\n", chr, (int)(regmask[i]>>32) + 1, (int)regmask[i] + 1); for (i = 0; i < vpos; ++i) { uint64_t x = pcnt[i]; int8_t c[2]; c[0] = (cns[i]&0xffff)>>2 == 0? 4 : (cns[i]&3); c[1] = (cns[i]>>16&0xffff)>>2 == 0? 
4 : (cns[i]>>16&3); - fprintf(pysam_stdout, "M%d\t%s\t%d\t%d\t%c\t%c\t%d\t%d\t%d\t%d\t%d\n", sitemask[i]+1, chr, (int)(cns[0]>>32) + 1, (int)(cns[i]>>32) + 1, "ACGTX"[c[path[i]]], "ACGTX"[c[1-path[i]]], + fprintf(samtools_stdout, "M%d\t%s\t%d\t%d\t%c\t%c\t%d\t%d\t%d\t%d\t%d\n", sitemask[i]+1, chr, (int)(cns[0]>>32) + 1, (int)(cns[i]>>32) + 1, "ACGTX"[c[path[i]]], "ACGTX"[c[1-path[i]]], i + g->vpos_shift + 1, (int)(x&0xffff), (int)(x>>16&0xffff), (int)(x>>32&0xffff), (int)(x>>48&0xffff)); } free(path); free(pcnt); free(regmask); free(sitemask); @@ -451,17 +451,17 @@ static int phase(phaseg_t *g, const char *chr, int vpos, uint64_t *cns, nseq_t * ks_introsort_rseq(n_seqs, seqs); for (i = 0; i < n_seqs; ++i) { frag_t *f = seqs[i]; - fprintf(pysam_stdout, "EV\t0\t%s\t%d\t40\t%dM\t*\t0\t0\t", chr, f->vpos + 1 + g->vpos_shift, f->vlen); + fprintf(samtools_stdout, "EV\t0\t%s\t%d\t40\t%dM\t*\t0\t0\t", chr, f->vpos + 1 + g->vpos_shift, f->vlen); for (j = 0; j < f->vlen; ++j) { uint32_t c = cns[f->vpos + j]; - if (f->seq[j] == 0) fputc('N', pysam_stdout); - else fputc("ACGT"[f->seq[j] == 1? (c&3) : (c>>16&3)], pysam_stdout); + if (f->seq[j] == 0) fputc('N', samtools_stdout); + else fputc("ACGT"[f->seq[j] == 1? 
(c&3) : (c>>16&3)], samtools_stdout); } - fprintf(pysam_stdout, "\t*\tYP:i:%d\tYF:i:%d\tYI:i:%d\tYO:i:%d\tYS:i:%d\n", f->phase, f->flip, f->in, f->out, f->beg+1); + fprintf(samtools_stdout, "\t*\tYP:i:%d\tYF:i:%d\tYI:i:%d\tYO:i:%d\tYS:i:%d\n", f->phase, f->flip, f->in, f->out, f->beg+1); } free(seqs); - fprintf(pysam_stdout, "//\n"); - fflush(pysam_stdout); + fprintf(samtools_stdout, "//\n"); + fflush(samtools_stdout); g->vpos_shift += vpos; if (dump_aln(g, min_pos, hash) < 0) return -1; return vpos; @@ -610,20 +610,20 @@ int main_phase(int argc, char *argv[]) if (usage) break; } if (usage || argc == optind) { - fprintf(pysam_stderr, "\n"); - fprintf(pysam_stderr, "Usage: samtools phase [options] \n\n"); - fprintf(pysam_stderr, "Options: -k INT block length [%d]\n", g.k); - fprintf(pysam_stderr, " -b STR prefix of BAMs to output [null]\n"); - fprintf(pysam_stderr, " -q INT min het phred-LOD [%d]\n", g.min_varLOD); - fprintf(pysam_stderr, " -Q INT min base quality in het calling [%d]\n", g.min_baseQ); - fprintf(pysam_stderr, " -D INT max read depth [%d]\n", g.max_depth); -// fprintf(pysam_stderr, " -l FILE list of sites to phase [null]\n"); - fprintf(pysam_stderr, " -F do not attempt to fix chimeras\n"); - fprintf(pysam_stderr, " -A drop reads with ambiguous phase\n"); -// fprintf(pysam_stderr, " -e do not discover SNPs (effective with -l)\n"); - fprintf(pysam_stderr, "\n"); - - sam_global_opt_help(pysam_stderr, "-....-"); + fprintf(samtools_stderr, "\n"); + fprintf(samtools_stderr, "Usage: samtools phase [options] \n\n"); + fprintf(samtools_stderr, "Options: -k INT block length [%d]\n", g.k); + fprintf(samtools_stderr, " -b STR prefix of BAMs to output [null]\n"); + fprintf(samtools_stderr, " -q INT min het phred-LOD [%d]\n", g.min_varLOD); + fprintf(samtools_stderr, " -Q INT min base quality in het calling [%d]\n", g.min_baseQ); + fprintf(samtools_stderr, " -D INT max read depth [%d]\n", g.max_depth); +// fprintf(samtools_stderr, " -l FILE list of sites to phase 
[null]\n"); + fprintf(samtools_stderr, " -F do not attempt to fix chimeras\n"); + fprintf(samtools_stderr, " -A drop reads with ambiguous phase\n"); +// fprintf(samtools_stderr, " -e do not discover SNPs (effective with -l)\n"); + fprintf(samtools_stderr, "\n"); + + sam_global_opt_help(samtools_stderr, "-....-"); return 1; } @@ -634,7 +634,7 @@ int main_phase(int argc, char *argv[]) } g.fp_hdr = sam_hdr_read(g.fp); if (g.fp_hdr == NULL) { - fprintf(pysam_stderr, "[%s] Failed to read header for '%s'\n", + fprintf(samtools_stderr, "[%s] Failed to read header for '%s'\n", __func__, argv[optind]); return 1; } @@ -657,20 +657,20 @@ int main_phase(int argc, char *argv[]) seqs = kh_init(64); em = errmod_init(1. - 0.83); bases = calloc(g.max_depth, 2); - fprintf(pysam_stdout, "CC\n"); - fprintf(pysam_stdout, "CC\tDescriptions:\nCC\n"); - fprintf(pysam_stdout, "CC\t CC comments\n"); - fprintf(pysam_stdout, "CC\t PS start of a phase set\n"); - fprintf(pysam_stdout, "CC\t FL filtered region\n"); - fprintf(pysam_stdout, "CC\t M[012] markers; 0 for singletons, 1 for phased and 2 for filtered\n"); - fprintf(pysam_stdout, "CC\t EV supporting reads; SAM format\n"); - fprintf(pysam_stdout, "CC\t // end of a phase set\nCC\n"); - fprintf(pysam_stdout, "CC\tFormats of PS, FL and M[012] lines (1-based coordinates):\nCC\n"); - fprintf(pysam_stdout, "CC\t PS chr phaseSetStart phaseSetEnd\n"); - fprintf(pysam_stdout, "CC\t FL chr filterStart filterEnd\n"); - fprintf(pysam_stdout, "CC\t M? 
chr PS pos allele0 allele1 hetIndex #supports0 #errors0 #supp1 #err1\n"); - fprintf(pysam_stdout, "CC\nCC\n"); - fflush(pysam_stdout); + fprintf(samtools_stdout, "CC\n"); + fprintf(samtools_stdout, "CC\tDescriptions:\nCC\n"); + fprintf(samtools_stdout, "CC\t CC comments\n"); + fprintf(samtools_stdout, "CC\t PS start of a phase set\n"); + fprintf(samtools_stdout, "CC\t FL filtered region\n"); + fprintf(samtools_stdout, "CC\t M[012] markers; 0 for singletons, 1 for phased and 2 for filtered\n"); + fprintf(samtools_stdout, "CC\t EV supporting reads; SAM format\n"); + fprintf(samtools_stdout, "CC\t // end of a phase set\nCC\n"); + fprintf(samtools_stdout, "CC\tFormats of PS, FL and M[012] lines (1-based coordinates):\nCC\n"); + fprintf(samtools_stdout, "CC\t PS chr phaseSetStart phaseSetEnd\n"); + fprintf(samtools_stdout, "CC\t FL chr filterStart filterEnd\n"); + fprintf(samtools_stdout, "CC\t M? chr PS pos allele0 allele1 hetIndex #supports0 #errors0 #supp1 #err1\n"); + fprintf(samtools_stdout, "CC\nCC\n"); + fflush(samtools_stdout); while ((plp = bam_plp_auto(iter, &tid, &pos, &n)) != 0) { int i, k, c, tmp, dophase = 1, in_set = 0; float q[16]; @@ -777,7 +777,7 @@ int main_phase(int argc, char *argv[]) int res = 0; for (c = 0; c <= 2; ++c) { if (sam_close(g.out[c]) < 0) { - fprintf(pysam_stderr, "[%s] error on closing '%s'\n", + fprintf(samtools_stderr, "[%s] error on closing '%s'\n", __func__, g.out_name[c]); res = 1; } diff --git a/samtools/pysam.h b/samtools/pysam.h deleted file mode 100644 index b0fc4fb..0000000 --- a/samtools/pysam.h +++ /dev/null @@ -1,7 +0,0 @@ -#ifndef PYSAM_H -#define PYSAM_H -#include "stdio.h" -extern FILE * pysam_stderr; -extern FILE * pysam_stdout; -extern const char * pysam_stdout_fn; -#endif diff --git a/samtools/sam.c.pysam.c b/samtools/sam.c.pysam.c index f7db820..457769d 100644 --- a/samtools/sam.c.pysam.c +++ b/samtools/sam.c.pysam.c @@ -1,4 +1,4 @@ -#include "pysam.h" +#include "samtools.pysam.h" /* sam.c -- format-neutral SAM/BAM 
API. @@ -68,7 +68,7 @@ samfile_t *samopen(const char *fn, const char *mode, const void *aux) } fp->is_write = 0; if (fp->header->n_targets == 0 && bam_verbose >= 1) - fprintf(pysam_stderr, "[samopen] no @SQ lines in the header.\n"); + fprintf(samtools_stderr, "[samopen] no @SQ lines in the header.\n"); } else { enum htsExactFormat fmt = hts_get_format(fp->file)->format; @@ -77,7 +77,7 @@ samfile_t *samopen(const char *fn, const char *mode, const void *aux) if (!(fmt == text_format || fmt == sam) || strchr(mode, 'h')) { if (sam_hdr_write(fp->file, fp->header) < 0) { if (bam_verbose >= 1) - fprintf(pysam_stderr, "[samopen] Couldn't write header\n"); + fprintf(samtools_stderr, "[samopen] Couldn't write header\n"); sam_close(hts_fp); free(fp); return NULL; @@ -136,11 +136,11 @@ char *samfaipath(const char *fn_ref) strcat(strcpy(fn_list, fn_ref), ".fai"); if (access(fn_list, R_OK) == -1) { // fn_list is unreadable if (access(fn_ref, R_OK) == -1) { - fprintf(pysam_stderr, "[samfaipath] fail to read file %s.\n", fn_ref); + fprintf(samtools_stderr, "[samfaipath] fail to read file %s.\n", fn_ref); } else { - if (bam_verbose >= 3) fprintf(pysam_stderr, "[samfaipath] build FASTA index...\n"); + if (bam_verbose >= 3) fprintf(samtools_stderr, "[samfaipath] build FASTA index...\n"); if (fai_build(fn_ref) == -1) { - fprintf(pysam_stderr, "[samfaipath] fail to build FASTA index.\n"); + fprintf(samtools_stderr, "[samfaipath] fail to build FASTA index.\n"); free(fn_list); fn_list = 0; } } diff --git a/samtools/sam_header.c b/samtools/sam_header.c index 64da68f..defa5c3 100644 --- a/samtools/sam_header.c +++ b/samtools/sam_header.c @@ -713,7 +713,7 @@ void *sam_header2key_val(void *iter, const char type[2], const char key_tag[2], HeaderTag *key, *value; key = header_line_has_tag(hline,key_tag); value = header_line_has_tag(hline,value_tag); - if ( !key && !value ) + if ( !key || !value ) { l = l->next; continue; diff --git a/samtools/sam_header.c.pysam.c 
b/samtools/sam_header.c.pysam.c index e39807d..32332bd 100644 --- a/samtools/sam_header.c.pysam.c +++ b/samtools/sam_header.c.pysam.c @@ -1,4 +1,4 @@ -#include "pysam.h" +#include "samtools.pysam.h" /* sam_header.c -- basic SAM/BAM header API. @@ -83,7 +83,7 @@ static void debug(const char *format, ...) { va_list ap; va_start(ap, format); - vfprintf(pysam_stderr, format, ap); + vfprintf(samtools_stderr, format, ap); va_end(ap); } @@ -715,7 +715,7 @@ void *sam_header2key_val(void *iter, const char type[2], const char key_tag[2], HeaderTag *key, *value; key = header_line_has_tag(hline,key_tag); value = header_line_has_tag(hline,value_tag); - if ( !key && !value ) + if ( !key || !value ) { l = l->next; continue; @@ -777,8 +777,8 @@ void *sam_header_merge(int n, const void **_dicts) if ( status==2 ) { - print_header_line(pysam_stderr,tmpl_hlines->data); - print_header_line(pysam_stderr,out_hlines->data); + print_header_line(samtools_stderr,tmpl_hlines->data); + print_header_line(samtools_stderr,out_hlines->data); debug("Conflicting lines, cannot merge the headers.\n"); return 0; } diff --git a/samtools/sam_opts.c.pysam.c b/samtools/sam_opts.c.pysam.c index aed4869..d965208 100644 --- a/samtools/sam_opts.c.pysam.c +++ b/samtools/sam_opts.c.pysam.c @@ -1,4 +1,4 @@ -#include "pysam.h" +#include "samtools.pysam.h" /* sam_opts.c -- utilities to aid parsing common command line options. @@ -84,7 +84,7 @@ int parse_sam_global_opt(int c, const char *optarg, const struct option *lopt, } if (!lopt->name) { - fprintf(pysam_stderr, "Unexpected global option: %s\n", lopt->name); + fprintf(samtools_stderr, "Unexpected global option: %s\n", lopt->name); return -1; } diff --git a/samtools/sam_utils.c b/samtools/sam_utils.c index 4f8964a..efa6e2f 100644 --- a/samtools/sam_utils.c +++ b/samtools/sam_utils.c @@ -28,8 +28,10 @@ DEALINGS IN THE SOFTWARE. 
*/ #include #include #include +#include #include "samtools.h" +#include "version.h" static void vprint_error_core(const char *subcommand, const char *format, va_list args, const char *extra) { @@ -58,3 +60,29 @@ void print_error_errno(const char *subcommand, const char *format, ...) vprint_error_core(subcommand, format, args, err? strerror(err) : NULL); va_end(args); } + +const char *samtools_version() +{ + return SAMTOOLS_VERSION; +} + +const char *samtools_version_short() +{ + char *sv, *hyph, *v; + int len; + + v = SAMTOOLS_VERSION; + hyph = strchr(v, '-'); + if (!hyph) + return strdup(v); + + len = hyph - v; + sv = (char *)malloc(len+1); + if (!sv) + return NULL; + + strncpy(sv, v, len); + sv[len] = '\0'; + + return (const char*)sv; +} diff --git a/samtools/sam_utils.c.pysam.c b/samtools/sam_utils.c.pysam.c index 0a78619..53f1763 100644 --- a/samtools/sam_utils.c.pysam.c +++ b/samtools/sam_utils.c.pysam.c @@ -1,4 +1,4 @@ -#include "pysam.h" +#include "samtools.pysam.h" /* sam_utils.c -- various utilities internal to samtools. @@ -30,18 +30,20 @@ DEALINGS IN THE SOFTWARE. */ #include #include #include +#include #include "samtools.h" +#include "version.h" static void vprint_error_core(const char *subcommand, const char *format, va_list args, const char *extra) { - fflush(pysam_stdout); - if (subcommand && *subcommand) fprintf(pysam_stderr, "samtools %s: ", subcommand); - else fprintf(pysam_stderr, "samtools: "); - vfprintf(pysam_stderr, format, args); - if (extra) fprintf(pysam_stderr, ": %s\n", extra); - else fprintf(pysam_stderr, "\n"); - fflush(pysam_stderr); + fflush(samtools_stdout); + if (subcommand && *subcommand) fprintf(samtools_stderr, "samtools %s: ", subcommand); + else fprintf(samtools_stderr, "samtools: "); + vfprintf(samtools_stderr, format, args); + if (extra) fprintf(samtools_stderr, ": %s\n", extra); + else fprintf(samtools_stderr, "\n"); + fflush(samtools_stderr); } void print_error(const char *subcommand, const char *format, ...) 
@@ -60,3 +62,29 @@ void print_error_errno(const char *subcommand, const char *format, ...) vprint_error_core(subcommand, format, args, err? strerror(err) : NULL); va_end(args); } + +const char *samtools_version() +{ + return SAMTOOLS_VERSION; +} + +const char *samtools_version_short() +{ + char *sv, *hyph, *v; + int len; + + v = SAMTOOLS_VERSION; + hyph = strchr(v, '-'); + if (!hyph) + return strdup(v); + + len = hyph - v; + sv = (char *)malloc(len+1); + if (!sv) + return NULL; + + strncpy(sv, v, len); + sv[len] = '\0'; + + return (const char*)sv; +} diff --git a/samtools/sam_view.c b/samtools/sam_view.c index ceb1080..bce2c06 100644 --- a/samtools/sam_view.c +++ b/samtools/sam_view.c @@ -45,6 +45,7 @@ DEALINGS IN THE SOFTWARE. */ #include "htslib/bgzf.h" #include "samtools.h" #include "sam_opts.h" +#include "bedidx.h" #define DEFAULT_BARCODE_TAG "BC" #define DEFAULT_QUALITY_TAG "QT" @@ -70,6 +71,7 @@ typedef struct samview_settings { void* bed; size_t remove_aux_len; char** remove_aux; + int multi_region; } samview_settings_t; @@ -77,9 +79,6 @@ typedef struct samview_settings { extern const char *bam_get_library(bam_hdr_t *header, const bam1_t *b); extern int bam_remove_B(bam1_t *b); extern char *samfaipath(const char *fn_ref); -void *bed_read(const char *fn); -void bed_destroy(void *_h); -int bed_overlap(const void *_h, const char *chr, int beg, int end); // Returns 0 to indicate read should be output 1 otherwise static int process_aln(const bam_hdr_t *h, bam1_t *b, samview_settings_t* settings) @@ -97,7 +96,7 @@ static int process_aln(const bam_hdr_t *h, bam1_t *b, samview_settings_t* settin return 1; if (settings->flag_alloff && ((b->core.flag & settings->flag_alloff) == settings->flag_alloff)) return 1; - if (settings->bed && (b->core.tid < 0 || !bed_overlap(settings->bed, h->target_name[b->core.tid], b->core.pos, bam_endpos(b)))) + if (!settings->multi_region && settings->bed && (b->core.tid < 0 || !bed_overlap(settings->bed, h->target_name[b->core.tid], 
b->core.pos, bam_endpos(b)))) return 1; if (settings->subsam_frac > 0.) { uint32_t k = __ac_Wang_hash(__ac_X31_hash_string(bam_get_qname(b)) ^ settings->subsam_seed); @@ -254,6 +253,8 @@ int main_samview(int argc, char *argv[]) char *fn_in = 0, *fn_out = 0, *fn_list = 0, *q, *fn_un_out = 0; sam_global_args ga = SAM_GLOBAL_ARGS_INIT; htsThreadPool p = {NULL, 0}; + int filter_state = ALL, filter_op = 0; + int result; samview_settings_t settings = { .rghash = NULL, @@ -267,6 +268,7 @@ int main_samview(int argc, char *argv[]) .subsam_frac = -1., .library = NULL, .bed = NULL, + .multi_region = 0 }; static const struct option lopts[] = { @@ -278,7 +280,7 @@ int main_samview(int argc, char *argv[]) strcpy(out_mode, "w"); strcpy(out_un_mode, "w"); while ((c = getopt_long(argc, argv, - "SbBcCt:h1Ho:O:q:f:F:G:ul:r:?T:R:L:s:@:m:x:U:", + "SbBcCt:h1Ho:O:q:f:F:G:ul:r:?T:R:L:s:@:m:x:U:M", lopts, NULL)) >= 0) { switch (c) { case 's': @@ -342,7 +344,7 @@ int main_samview(int argc, char *argv[]) settings.remove_aux[settings.remove_aux_len-1] = optarg; } break; - + case 'M': settings.multi_region = 1; break; default: if (parse_sam_global_opt(c, optarg, lopts, &ga) != 0) return usage(stderr, EXIT_FAILURE, is_long_help); @@ -465,45 +467,69 @@ int main_samview(int argc, char *argv[]) } if (is_header_only) goto view_end; // no need to print alignments - if (optind + 1 >= argc) { // convert/print the entire file + if (settings.multi_region) { + if (optind < argc - 1) { //regions have been specified in the command line + settings.bed = bed_hash_regions(settings.bed, argv, optind+1, argc, &filter_op); //insert(1) or filter out(0) the regions from the command line in the same hash table as the bed file + if (!filter_op) + filter_state = FILTERED; + } + bam1_t *b = bam_init1(); - int r; - while ((r = sam_read1(in, header, b)) >= 0) { // read one alignment from `in' - if (!process_aln(header, b, &settings)) { - if (!is_count) { if (check_sam_write1(out, header, b, fn_out, &ret) < 0) break; } - 
count++; + if (settings.bed == NULL) { // index is unavailable or no regions have been specified + while ((result = sam_read1(in, header, b)) >= 0) { // read one alignment from `in' + if (!process_aln(header, b, &settings)) { + if (!is_count) { if (check_sam_write1(out, header, b, fn_out, &ret) < 0) break; } + count++; + } else { + if (un_out) { if (check_sam_write1(un_out, header, b, fn_un_out, &ret) < 0) break; } + } + } + if (result < -1) { + fprintf(stderr, "[main_samview] truncated file.\n"); + ret = 1; + } + } else { + hts_idx_t *idx = sam_index_load(in, fn_in); // load index + if (idx != NULL) { + + int regcount = 0; + + hts_reglist_t *reglist = bed_reglist(settings.bed, filter_state, ®count); + if(reglist) { + hts_itr_multi_t *iter = sam_itr_regions(idx, header, reglist, regcount); + if (iter) { + // fetch alignments + while ((result = sam_itr_multi_next(in, iter, b)) >= 0) { + if (!process_aln(header, b, &settings)) { + if (!is_count) { if (check_sam_write1(out, header, b, fn_out, &ret) < 0) break; } + count++; + } else { + if (un_out) { if (check_sam_write1(un_out, header, b, fn_un_out, &ret) < 0) break; } + } + } + if (result < -1) { + fprintf(stderr, "[main_samview] retrieval of region %d failed due to truncated file or corrupt BAM index file\n", iter->curr_tid); + ret = 1; + } + + hts_itr_multi_destroy(iter); + } else { + fprintf(stderr, "[main_samview] iterator could not be created. Aborting.\n"); + } + } else { + fprintf(stderr, "[main_samview] region list is empty or could not be created. 
Aborting.\n"); + } + hts_idx_destroy(idx); // destroy the BAM index } else { - if (un_out) { if (check_sam_write1(un_out, header, b, fn_un_out, &ret) < 0) break; } + fprintf(stderr, "[main_samview] random alignment retrieval only works for indexed BAM or CRAM files.\n"); } } - if (r < -1) { - fprintf(stderr, "[main_samview] truncated file.\n"); - ret = 1; - } bam_destroy1(b); - } else { // retrieve alignments in specified regions - int i; - bam1_t *b; - hts_idx_t *idx = sam_index_load(in, fn_in); // load index - if (idx == 0) { // index is unavailable - fprintf(stderr, "[main_samview] random alignment retrieval only works for indexed BAM or CRAM files.\n"); - ret = 1; - goto view_end; - } - b = bam_init1(); - for (i = optind + 1; i < argc; ++i) { - int result; - hts_itr_t *iter = sam_itr_querys(idx, header, argv[i]); // parse a region in the format like `chr2:100-200' - if (iter == NULL) { // region invalid or reference name not found - int beg, end; - if (hts_parse_reg(argv[i], &beg, &end)) - fprintf(stderr, "[main_samview] region \"%s\" specifies an unknown reference name. Continue anyway.\n", argv[i]); - else - fprintf(stderr, "[main_samview] region \"%s\" could not be parsed. 
Continue anyway.\n", argv[i]); - continue; - } - // fetch alignments - while ((result = sam_itr_next(in, iter, b)) >= 0) { + } else { + if (optind + 1 >= argc) { // convert/print the entire file + bam1_t *b = bam_init1(); + int r; + while ((r = sam_read1(in, header, b)) >= 0) { // read one alignment from `in' if (!process_aln(header, b, &settings)) { if (!is_count) { if (check_sam_write1(out, header, b, fn_out, &ret) < 0) break; } count++; @@ -511,15 +537,51 @@ int main_samview(int argc, char *argv[]) if (un_out) { if (check_sam_write1(un_out, header, b, fn_un_out, &ret) < 0) break; } } } - hts_itr_destroy(iter); - if (result < -1) { - fprintf(stderr, "[main_samview] retrieval of region \"%s\" failed due to truncated file or corrupt BAM index file\n", argv[i]); + if (r < -1) { + fprintf(stderr, "[main_samview] truncated file.\n"); ret = 1; - break; } + bam_destroy1(b); + } else { // retrieve alignments in specified regions + int i; + bam1_t *b; + hts_idx_t *idx = sam_index_load(in, fn_in); // load index + if (idx == 0) { // index is unavailable + fprintf(stderr, "[main_samview] random alignment retrieval only works for indexed BAM or CRAM files.\n"); + ret = 1; + goto view_end; + } + b = bam_init1(); + for (i = optind + 1; i < argc; ++i) { + int result; + hts_itr_t *iter = sam_itr_querys(idx, header, argv[i]); // parse a region in the format like `chr2:100-200' + if (iter == NULL) { // region invalid or reference name not found + int beg, end; + if (hts_parse_reg(argv[i], &beg, &end)) + fprintf(stderr, "[main_samview] region \"%s\" specifies an unknown reference name. Continue anyway.\n", argv[i]); + else + fprintf(stderr, "[main_samview] region \"%s\" could not be parsed. 
Continue anyway.\n", argv[i]); + continue; + } + // fetch alignments + while ((result = sam_itr_next(in, iter, b)) >= 0) { + if (!process_aln(header, b, &settings)) { + if (!is_count) { if (check_sam_write1(out, header, b, fn_out, &ret) < 0) break; } + count++; + } else { + if (un_out) { if (check_sam_write1(un_out, header, b, fn_un_out, &ret) < 0) break; } + } + } + hts_itr_destroy(iter); + if (result < -1) { + fprintf(stderr, "[main_samview] retrieval of region \"%s\" failed due to truncated file or corrupt BAM index file\n", argv[i]); + ret = 1; + break; + } + } + bam_destroy1(b); + hts_idx_destroy(idx); // destroy the BAM index } - bam_destroy1(b); - hts_idx_destroy(idx); // destroy the BAM index } view_end: @@ -589,6 +651,8 @@ static int usage(FILE *fp, int exit_status, int is_long_help) " -G INT only EXCLUDE reads with all of the FLAGs in INT present [0]\n" // !(F&x == x) " -s FLOAT subsample reads (given INT.FRAC option value, 0.FRAC is the\n" " fraction of templates/read pairs to keep; INT part sets seed)\n" +" -M use the multi-region iterator (increases the speed, removes\n" +" duplicates and outputs the reads as they are ordered in the file)\n" // read processing " -x STR read tag to strip (repeatable) [null]\n" " -B collapse the backward CIGAR operation\n" @@ -1069,15 +1133,12 @@ static bool bam1_to_fq(const bam1_t *b, kstring_t *linebuf, const bam2fq_state_t char *seq = get_read(b); if (!seq) return false; - if (state->use_oq) { - oq = bam_aux_get(b, "OQ"); - if (oq) { - oq++; - qual = strdup(bam_aux2Z(oq)); - if (!qual) goto fail; - if (b->core.flag & BAM_FREVERSE) { // read is reverse complemented - reverse(qual); - } + if (state->use_oq) oq = bam_aux_get(b, "OQ"); + if (oq && *oq=='Z') { + qual = strdup(bam_aux2Z(oq)); + if (!qual) goto fail; + if (b->core.flag & BAM_FREVERSE) { // read is reverse complemented + reverse(qual); } } else { if (get_quality(b, &qual) < 0) goto fail; diff --git a/samtools/sam_view.c.pysam.c b/samtools/sam_view.c.pysam.c 
index 5113339..4e3f8ab 100644 --- a/samtools/sam_view.c.pysam.c +++ b/samtools/sam_view.c.pysam.c @@ -1,4 +1,4 @@ -#include "pysam.h" +#include "samtools.pysam.h" /* sam_view.c -- SAM<->BAM<->CRAM conversion. @@ -47,6 +47,7 @@ DEALINGS IN THE SOFTWARE. */ #include "htslib/bgzf.h" #include "samtools.h" #include "sam_opts.h" +#include "bedidx.h" #define DEFAULT_BARCODE_TAG "BC" #define DEFAULT_QUALITY_TAG "QT" @@ -72,6 +73,7 @@ typedef struct samview_settings { void* bed; size_t remove_aux_len; char** remove_aux; + int multi_region; } samview_settings_t; @@ -79,9 +81,6 @@ typedef struct samview_settings { extern const char *bam_get_library(bam_hdr_t *header, const bam1_t *b); extern int bam_remove_B(bam1_t *b); extern char *samfaipath(const char *fn_ref); -void *bed_read(const char *fn); -void bed_destroy(void *_h); -int bed_overlap(const void *_h, const char *chr, int beg, int end); // Returns 0 to indicate read should be output 1 otherwise static int process_aln(const bam_hdr_t *h, bam1_t *b, samview_settings_t* settings) @@ -99,7 +98,7 @@ static int process_aln(const bam_hdr_t *h, bam1_t *b, samview_settings_t* settin return 1; if (settings->flag_alloff && ((b->core.flag & settings->flag_alloff) == settings->flag_alloff)) return 1; - if (settings->bed && (b->core.tid < 0 || !bed_overlap(settings->bed, h->target_name[b->core.tid], b->core.pos, bam_endpos(b)))) + if (!settings->multi_region && settings->bed && (b->core.tid < 0 || !bed_overlap(settings->bed, h->target_name[b->core.tid], b->core.pos, bam_endpos(b)))) return 1; if (settings->subsam_frac > 0.) 
{ uint32_t k = __ac_Wang_hash(__ac_X31_hash_string(bam_get_qname(b)) ^ settings->subsam_seed); @@ -256,6 +255,8 @@ int main_samview(int argc, char *argv[]) char *fn_in = 0, *fn_out = 0, *fn_list = 0, *q, *fn_un_out = 0; sam_global_args ga = SAM_GLOBAL_ARGS_INIT; htsThreadPool p = {NULL, 0}; + int filter_state = ALL, filter_op = 0; + int result; samview_settings_t settings = { .rghash = NULL, @@ -269,6 +270,7 @@ int main_samview(int argc, char *argv[]) .subsam_frac = -1., .library = NULL, .bed = NULL, + .multi_region = 0 }; static const struct option lopts[] = { @@ -280,7 +282,7 @@ int main_samview(int argc, char *argv[]) strcpy(out_mode, "w"); strcpy(out_un_mode, "w"); while ((c = getopt_long(argc, argv, - "SbBcCt:h1Ho:O:q:f:F:G:ul:r:?T:R:L:s:@:m:x:U:", + "SbBcCt:h1Ho:O:q:f:F:G:ul:r:?T:R:L:s:@:m:x:U:M", lopts, NULL)) >= 0) { switch (c) { case 's': @@ -337,17 +339,17 @@ int main_samview(int argc, char *argv[]) case 'x': { if (strlen(optarg) != 2) { - fprintf(pysam_stderr, "main_samview: Error parsing -x auxiliary tags should be exactly two characters long.\n"); - return usage(pysam_stderr, EXIT_FAILURE, is_long_help); + fprintf(samtools_stderr, "main_samview: Error parsing -x auxiliary tags should be exactly two characters long.\n"); + return usage(samtools_stderr, EXIT_FAILURE, is_long_help); } settings.remove_aux = (char**)realloc(settings.remove_aux, sizeof(char*) * (++settings.remove_aux_len)); settings.remove_aux[settings.remove_aux_len-1] = optarg; } break; - + case 'M': settings.multi_region = 1; break; default: if (parse_sam_global_opt(c, optarg, lopts, &ga) != 0) - return usage(pysam_stderr, EXIT_FAILURE, is_long_help); + return usage(samtools_stderr, EXIT_FAILURE, is_long_help); break; } } @@ -367,7 +369,7 @@ int main_samview(int argc, char *argv[]) strcat(out_mode, tmp); strcat(out_un_mode, tmp); } - if (argc == optind && isatty(STDIN_FILENO)) return usage(pysam_stdout, EXIT_SUCCESS, is_long_help); // potential memory leak... 
+ if (argc == optind && isatty(STDIN_FILENO)) return usage(samtools_stdout, EXIT_SUCCESS, is_long_help); // potential memory leak... fn_in = (optind < argc)? argv[optind] : "-"; // generate the fn_list if necessary @@ -381,13 +383,13 @@ int main_samview(int argc, char *argv[]) if (fn_list) { if (hts_set_fai_filename(in, fn_list) != 0) { - fprintf(pysam_stderr, "[main_samview] failed to use reference \"%s\".\n", fn_list); + fprintf(samtools_stderr, "[main_samview] failed to use reference \"%s\".\n", fn_list); ret = 1; goto view_end; } } if ((header = sam_hdr_read(in)) == 0) { - fprintf(pysam_stderr, "[main_samview] fail to read the header from \"%s\".\n", fn_in); + fprintf(samtools_stderr, "[main_samview] fail to read the header from \"%s\".\n", fn_in); ret = 1; goto view_end; } @@ -407,7 +409,7 @@ int main_samview(int argc, char *argv[]) } if (fn_list) { if (hts_set_fai_filename(out, fn_list) != 0) { - fprintf(pysam_stderr, "[main_samview] failed to use reference \"%s\".\n", fn_list); + fprintf(samtools_stderr, "[main_samview] failed to use reference \"%s\".\n", fn_list); ret = 1; goto view_end; } @@ -416,7 +418,7 @@ int main_samview(int argc, char *argv[]) out_mode[1] == 'b' || out_mode[1] == 'c' || (ga.out.format != sam && ga.out.format != unknown_format)) { if (sam_hdr_write(out, header) != 0) { - fprintf(pysam_stderr, "[main_samview] failed to write the SAM header\n"); + fprintf(samtools_stderr, "[main_samview] failed to write the SAM header\n"); ret = 1; goto view_end; } @@ -429,7 +431,7 @@ int main_samview(int argc, char *argv[]) } if (fn_list) { if (hts_set_fai_filename(un_out, fn_list) != 0) { - fprintf(pysam_stderr, "[main_samview] failed to use reference \"%s\".\n", fn_list); + fprintf(samtools_stderr, "[main_samview] failed to use reference \"%s\".\n", fn_list); ret = 1; goto view_end; } @@ -438,7 +440,7 @@ int main_samview(int argc, char *argv[]) out_un_mode[1] == 'b' || out_un_mode[1] == 'c' || (ga.out.format != sam && ga.out.format != unknown_format)) 
{ if (sam_hdr_write(un_out, header) != 0) { - fprintf(pysam_stderr, "[main_samview] failed to write the SAM header\n"); + fprintf(samtools_stderr, "[main_samview] failed to write the SAM header\n"); ret = 1; goto view_end; } @@ -458,7 +460,7 @@ int main_samview(int argc, char *argv[]) if (ga.nthreads > 1) { if (!(p.pool = hts_tpool_init(ga.nthreads))) { - fprintf(pysam_stderr, "Error creating thread pool\n"); + fprintf(samtools_stderr, "Error creating thread pool\n"); ret = 1; goto view_end; } @@ -467,45 +469,69 @@ int main_samview(int argc, char *argv[]) } if (is_header_only) goto view_end; // no need to print alignments - if (optind + 1 >= argc) { // convert/print the entire file + if (settings.multi_region) { + if (optind < argc - 1) { //regions have been specified in the command line + settings.bed = bed_hash_regions(settings.bed, argv, optind+1, argc, &filter_op); //insert(1) or filter out(0) the regions from the command line in the same hash table as the bed file + if (!filter_op) + filter_state = FILTERED; + } + bam1_t *b = bam_init1(); - int r; - while ((r = sam_read1(in, header, b)) >= 0) { // read one alignment from `in' - if (!process_aln(header, b, &settings)) { - if (!is_count) { if (check_sam_write1(out, header, b, fn_out, &ret) < 0) break; } - count++; + if (settings.bed == NULL) { // index is unavailable or no regions have been specified + while ((result = sam_read1(in, header, b)) >= 0) { // read one alignment from `in' + if (!process_aln(header, b, &settings)) { + if (!is_count) { if (check_sam_write1(out, header, b, fn_out, &ret) < 0) break; } + count++; + } else { + if (un_out) { if (check_sam_write1(un_out, header, b, fn_un_out, &ret) < 0) break; } + } + } + if (result < -1) { + fprintf(samtools_stderr, "[main_samview] truncated file.\n"); + ret = 1; + } + } else { + hts_idx_t *idx = sam_index_load(in, fn_in); // load index + if (idx != NULL) { + + int regcount = 0; + + hts_reglist_t *reglist = bed_reglist(settings.bed, filter_state, ®count); + 
if(reglist) { + hts_itr_multi_t *iter = sam_itr_regions(idx, header, reglist, regcount); + if (iter) { + // fetch alignments + while ((result = sam_itr_multi_next(in, iter, b)) >= 0) { + if (!process_aln(header, b, &settings)) { + if (!is_count) { if (check_sam_write1(out, header, b, fn_out, &ret) < 0) break; } + count++; + } else { + if (un_out) { if (check_sam_write1(un_out, header, b, fn_un_out, &ret) < 0) break; } + } + } + if (result < -1) { + fprintf(samtools_stderr, "[main_samview] retrieval of region %d failed due to truncated file or corrupt BAM index file\n", iter->curr_tid); + ret = 1; + } + + hts_itr_multi_destroy(iter); + } else { + fprintf(samtools_stderr, "[main_samview] iterator could not be created. Aborting.\n"); + } + } else { + fprintf(samtools_stderr, "[main_samview] region list is empty or could not be created. Aborting.\n"); + } + hts_idx_destroy(idx); // destroy the BAM index } else { - if (un_out) { if (check_sam_write1(un_out, header, b, fn_un_out, &ret) < 0) break; } + fprintf(samtools_stderr, "[main_samview] random alignment retrieval only works for indexed BAM or CRAM files.\n"); } } - if (r < -1) { - fprintf(pysam_stderr, "[main_samview] truncated file.\n"); - ret = 1; - } bam_destroy1(b); - } else { // retrieve alignments in specified regions - int i; - bam1_t *b; - hts_idx_t *idx = sam_index_load(in, fn_in); // load index - if (idx == 0) { // index is unavailable - fprintf(pysam_stderr, "[main_samview] random alignment retrieval only works for indexed BAM or CRAM files.\n"); - ret = 1; - goto view_end; - } - b = bam_init1(); - for (i = optind + 1; i < argc; ++i) { - int result; - hts_itr_t *iter = sam_itr_querys(idx, header, argv[i]); // parse a region in the format like `chr2:100-200' - if (iter == NULL) { // region invalid or reference name not found - int beg, end; - if (hts_parse_reg(argv[i], &beg, &end)) - fprintf(pysam_stderr, "[main_samview] region \"%s\" specifies an unknown reference name. 
Continue anyway.\n", argv[i]); - else - fprintf(pysam_stderr, "[main_samview] region \"%s\" could not be parsed. Continue anyway.\n", argv[i]); - continue; - } - // fetch alignments - while ((result = sam_itr_next(in, iter, b)) >= 0) { + } else { + if (optind + 1 >= argc) { // convert/print the entire file + bam1_t *b = bam_init1(); + int r; + while ((r = sam_read1(in, header, b)) >= 0) { // read one alignment from `in' if (!process_aln(header, b, &settings)) { if (!is_count) { if (check_sam_write1(out, header, b, fn_out, &ret) < 0) break; } count++; @@ -513,20 +539,56 @@ int main_samview(int argc, char *argv[]) if (un_out) { if (check_sam_write1(un_out, header, b, fn_un_out, &ret) < 0) break; } } } - hts_itr_destroy(iter); - if (result < -1) { - fprintf(pysam_stderr, "[main_samview] retrieval of region \"%s\" failed due to truncated file or corrupt BAM index file\n", argv[i]); + if (r < -1) { + fprintf(samtools_stderr, "[main_samview] truncated file.\n"); ret = 1; - break; } + bam_destroy1(b); + } else { // retrieve alignments in specified regions + int i; + bam1_t *b; + hts_idx_t *idx = sam_index_load(in, fn_in); // load index + if (idx == 0) { // index is unavailable + fprintf(samtools_stderr, "[main_samview] random alignment retrieval only works for indexed BAM or CRAM files.\n"); + ret = 1; + goto view_end; + } + b = bam_init1(); + for (i = optind + 1; i < argc; ++i) { + int result; + hts_itr_t *iter = sam_itr_querys(idx, header, argv[i]); // parse a region in the format like `chr2:100-200' + if (iter == NULL) { // region invalid or reference name not found + int beg, end; + if (hts_parse_reg(argv[i], &beg, &end)) + fprintf(samtools_stderr, "[main_samview] region \"%s\" specifies an unknown reference name. Continue anyway.\n", argv[i]); + else + fprintf(samtools_stderr, "[main_samview] region \"%s\" could not be parsed. 
Continue anyway.\n", argv[i]); + continue; + } + // fetch alignments + while ((result = sam_itr_next(in, iter, b)) >= 0) { + if (!process_aln(header, b, &settings)) { + if (!is_count) { if (check_sam_write1(out, header, b, fn_out, &ret) < 0) break; } + count++; + } else { + if (un_out) { if (check_sam_write1(un_out, header, b, fn_un_out, &ret) < 0) break; } + } + } + hts_itr_destroy(iter); + if (result < -1) { + fprintf(samtools_stderr, "[main_samview] retrieval of region \"%s\" failed due to truncated file or corrupt BAM index file\n", argv[i]); + ret = 1; + break; + } + } + bam_destroy1(b); + hts_idx_destroy(idx); // destroy the BAM index } - bam_destroy1(b); - hts_idx_destroy(idx); // destroy the BAM index } view_end: if (is_count && ret == 0) { - if (fprintf(fn_out? fp_out : pysam_stdout, "%" PRId64 "\n", count) < 0) { + if (fprintf(fn_out? fp_out : samtools_stdout, "%" PRId64 "\n", count) < 0) { if (fn_out) print_error_errno("view", "writing to \"%s\" failed", fn_out); else print_error_errno("view", "writing to standard output failed"); ret = EXIT_FAILURE; @@ -574,7 +636,7 @@ static int usage(FILE *fp, int exit_status, int is_long_help) " -h include header in SAM output\n" " -H print SAM header only (no alignments)\n" " -c print only the count of matching records\n" -" -o FILE output file name [pysam_stdout]\n" +" -o FILE output file name [samtools_stdout]\n" " -U FILE output reads not selected by filters to FILE [null]\n" // extra input " -t FILE FILE listing reference names and lengths (see long help) [null]\n" @@ -591,6 +653,8 @@ static int usage(FILE *fp, int exit_status, int is_long_help) " -G INT only EXCLUDE reads with all of the FLAGs in INT present [0]\n" // !(F&x == x) " -s FLOAT subsample reads (given INT.FRAC option value, 0.FRAC is the\n" " fraction of templates/read pairs to keep; INT part sets seed)\n" +" -M use the multi-region iterator (increases the speed, removes\n" +" duplicates and outputs the reads as they are ordered in the file)\n" // 
read processing " -x STR read tag to strip (repeatable) [null]\n" " -B collapse the backward CIGAR operation\n" @@ -646,7 +710,7 @@ int main_import(int argc, char *argv[]) int argc2, ret; char **argv2; if (argc != 4) { - fprintf(pysam_stderr, "Usage: samtools import \n"); + fprintf(samtools_stderr, "Usage: samtools import \n"); return 1; } argc2 = 6; @@ -729,7 +793,7 @@ typedef struct bam2fq_state { BGZF *fpse; BGZF *fpr[3]; BGZF *fpi[2]; - BGZF *hpysam_stdout; + BGZF *hsamtools_stdout; bam_hdr_t *h; bool has12, use_oq, copy_tags, illumina_tag; int flag_on, flag_off, flag_alloff; @@ -933,7 +997,7 @@ static bool make_fq_line(const bam1_t *rec, char *seq, char *qual, kstring_t *li if (state->copy_tags) { for (i = 0; copied_tags[i]; ++i) { if (!copy_tag(copied_tags[i], rec, linebuf)) { - fprintf(pysam_stderr, "Problem copying aux tags: [%s]\n", linebuf->s); + fprintf(samtools_stderr, "Problem copying aux tags: [%s]\n", linebuf->s); return false; } } @@ -943,7 +1007,7 @@ static bool make_fq_line(const bam1_t *rec, char *seq, char *qual, kstring_t *li kliter_t(ktaglist) *p; for (p = kl_begin(state->taglist); p != kl_end(state->taglist); p = kl_next(p)) { if (!copy_tag(kl_val(p), rec, linebuf)) { - fprintf(pysam_stderr, "Problem copying aux tags: [%s]\n", linebuf->s); + fprintf(samtools_stderr, "Problem copying aux tags: [%s]\n", linebuf->s); return false; } } @@ -1071,15 +1135,12 @@ static bool bam1_to_fq(const bam1_t *b, kstring_t *linebuf, const bam2fq_state_t char *seq = get_read(b); if (!seq) return false; - if (state->use_oq) { - oq = bam_aux_get(b, "OQ"); - if (oq) { - oq++; - qual = strdup(bam_aux2Z(oq)); - if (!qual) goto fail; - if (b->core.flag & BAM_FREVERSE) { // read is reverse complemented - reverse(qual); - } + if (state->use_oq) oq = bam_aux_get(b, "OQ"); + if (oq && *oq=='Z') { + qual = strdup(bam_aux2Z(oq)); + if (!qual) goto fail; + if (b->core.flag & BAM_FREVERSE) { // read is reverse complemented + reverse(qual); } } else { if (get_quality(b, &qual) 
< 0) goto fail; @@ -1160,10 +1221,10 @@ static bool parse_opts(int argc, char *argv[], bam2fq_opts_t** opts_out) case 'c': opts->compression_level = atoi(optarg); break; case 'T': opts->extra_tags = strdup(optarg); break; case 'v': opts->def_qual = atoi(optarg); break; - case '?': bam2fq_usage(pysam_stderr, argv[0]); free_opts(opts); return false; + case '?': bam2fq_usage(samtools_stderr, argv[0]); free_opts(opts); return false; default: if (parse_sam_global_opt(c, optarg, lopts, &opts->ga) != 0) { - bam2fq_usage(pysam_stderr, argv[0]); free_opts(opts); return false; + bam2fq_usage(samtools_stderr, argv[0]); free_opts(opts); return false; } break; } @@ -1183,43 +1244,43 @@ static bool parse_opts(int argc, char *argv[], bam2fq_opts_t** opts_out) } } if (nIndex>2) { - fprintf(pysam_stderr,"Invalid index format: more than 2 indexes\n"); - bam2fq_usage(pysam_stderr, argv[0]); + fprintf(samtools_stderr,"Invalid index format: more than 2 indexes\n"); + bam2fq_usage(samtools_stderr, argv[0]); free_opts(opts); return false; } if (opts->index_file[1] && !opts->index_file[0]) { - fprintf(pysam_stderr, "Index one specified, but index two not given\n"); - bam2fq_usage(pysam_stderr, argv[0]); + fprintf(samtools_stderr, "Index one specified, but index two not given\n"); + bam2fq_usage(samtools_stderr, argv[0]); free_opts(opts); return false; } if (nIndex==2 && !opts->index_file[1]) { - fprintf(pysam_stderr, "index_format specifies two indexes, but only one index file given\n"); - bam2fq_usage(pysam_stderr, argv[0]); + fprintf(samtools_stderr, "index_format specifies two indexes, but only one index file given\n"); + bam2fq_usage(samtools_stderr, argv[0]); free_opts(opts); return false; } if (nIndex==1 && !opts->index_file[0]) { - fprintf(pysam_stderr, "index_format specifies an index, but no index file given\n"); - bam2fq_usage(pysam_stderr, argv[0]); + fprintf(samtools_stderr, "index_format specifies an index, but no index file given\n"); + bam2fq_usage(samtools_stderr, 
argv[0]); free_opts(opts); return false; } if (nIndex==0 && opts->index_file[0]) { - fprintf(pysam_stderr, "index_format not specified, but index file given\n"); - bam2fq_usage(pysam_stderr, argv[0]); + fprintf(samtools_stderr, "index_format not specified, but index file given\n"); + bam2fq_usage(samtools_stderr, argv[0]); free_opts(opts); return false; } if (opts->def_qual < 0 || 93 < opts->def_qual) { - fprintf(pysam_stderr, "Invalid -v default quality %i, allowed range 0 to 93\n", opts->def_qual); - bam2fq_usage(pysam_stderr, argv[0]); + fprintf(samtools_stderr, "Invalid -v default quality %i, allowed range 0 to 93\n", opts->def_qual); + bam2fq_usage(samtools_stderr, argv[0]); free_opts(opts); return false; } @@ -1231,21 +1292,21 @@ static bool parse_opts(int argc, char *argv[], bam2fq_opts_t** opts_out) opts->filetype = FASTA; } else { print_error("bam2fq", "Unrecognised type call \"%s\", this should be impossible... but you managed it!", type_str); - bam2fq_usage(pysam_stderr, argv[0]); + bam2fq_usage(samtools_stderr, argv[0]); free_opts(opts); return false; } if ((argc - (optind)) == 0) { - fprintf(pysam_stderr, "No input file specified.\n"); - bam2fq_usage(pysam_stdout, argv[0]); + fprintf(samtools_stderr, "No input file specified.\n"); + bam2fq_usage(samtools_stdout, argv[0]); free_opts(opts); return false; } if ((argc - (optind)) != 1) { - fprintf(pysam_stderr, "Too many arguments.\n"); - bam2fq_usage(pysam_stderr, argv[0]); + fprintf(samtools_stderr, "Too many arguments.\n"); + bam2fq_usage(samtools_stderr, argv[0]); free_opts(opts); return false; } @@ -1285,7 +1346,7 @@ static bool init_state(const bam2fq_opts_t* opts, bam2fq_state_t** state_out) state->filetype = opts->filetype; state->def_qual = opts->def_qual; state->index_sequence = NULL; - state->hpysam_stdout = bgzf_dopen(fileno(pysam_stdout), "wu"); + state->hsamtools_stdout = bgzf_dopen(fileno(samtools_stdout), "wu"); state->compression_level = opts->compression_level; state->taglist = 
kl_init(ktaglist); @@ -1294,7 +1355,7 @@ static bool init_state(const bam2fq_opts_t* opts, bam2fq_state_t** state_out) char *s = strtok_r(opts->extra_tags, ",", &save_p); while (s) { if (strlen(s) != 2) { - fprintf(pysam_stderr, "Parsing extra tags - '%s' is not two characters\n", s); + fprintf(samtools_stderr, "Parsing extra tags - '%s' is not two characters\n", s); free(state); return false; } @@ -1315,12 +1376,12 @@ static bool init_state(const bam2fq_opts_t* opts, bam2fq_state_t** state_out) uint32_t rf = SAM_QNAME | SAM_FLAG | SAM_SEQ | SAM_QUAL; if (opts->use_oq || opts->extra_tags || opts->index_file[0]) rf |= SAM_AUX; if (hts_set_opt(state->fp, CRAM_OPT_REQUIRED_FIELDS, rf)) { - fprintf(pysam_stderr, "Failed to set CRAM_OPT_REQUIRED_FIELDS value\n"); + fprintf(samtools_stderr, "Failed to set CRAM_OPT_REQUIRED_FIELDS value\n"); free(state); return false; } if (hts_set_opt(state->fp, CRAM_OPT_DECODE_MD, 0)) { - fprintf(pysam_stderr, "Failed to set CRAM_OPT_DECODE_MD value\n"); + fprintf(samtools_stderr, "Failed to set CRAM_OPT_DECODE_MD value\n"); free(state); return false; } @@ -1343,7 +1404,7 @@ static bool init_state(const bam2fq_opts_t* opts, bam2fq_state_t** state_out) return false; } } else { - state->fpr[i] = state->hpysam_stdout; + state->fpr[i] = state->hsamtools_stdout; } } for (i = 0; i < 2; i++) { @@ -1360,7 +1421,7 @@ static bool init_state(const bam2fq_opts_t* opts, bam2fq_state_t** state_out) state->h = sam_hdr_read(state->fp); if (state->h == NULL) { - fprintf(pysam_stderr, "Failed to read header for \"%s\"\n", opts->fn_input); + fprintf(samtools_stderr, "Failed to read header for \"%s\"\n", opts->fn_input); free(state); return false; } @@ -1377,7 +1438,7 @@ static bool destroy_state(const bam2fq_opts_t *opts, bam2fq_state_t *state, int* if (state->fpse && bgzf_close(state->fpse)) { print_error_errno("bam2fq", "Error closing singleton file \"%s\"", opts->fnse); valid = false; } int i; for (i = 0; i < 3; ++i) { - if (state->fpr[i] == 
state->hpysam_stdout) { + if (state->fpr[i] == state->hsamtools_stdout) { if (i==0 && bgzf_close(state->fpr[i])) { print_error_errno("bam2fq", "Error closing STDOUT"); valid = false; } } else { if (bgzf_close(state->fpr[i])) { print_error_errno("bam2fq", "Error closing r%d file \"%s\"", i, opts->fnr[i]); valid = false; } @@ -1423,7 +1484,7 @@ static bool bam2fq_mainloop(bam2fq_state_t *state, bam2fq_opts_t* opts) while (true) { int res = sam_read1(state->fp, state->h, b); if (res < -1) { - fprintf(pysam_stderr, "[bam2fq_mainloop] Failed to read bam record.\n"); + fprintf(samtools_stderr, "[bam2fq_mainloop] Failed to read bam record.\n"); return false; } at_eof = res < 0; @@ -1481,7 +1542,7 @@ static bool bam2fq_mainloop(bam2fq_state_t *state, bam2fq_opts_t* opts) if (state->fpi[0]) if (!tags2fq(b, state, opts)) return false; records[which_readpart(b)] = b; if(!bam1_to_fq(b, &linebuf[which_readpart(b)], state)) { - fprintf(pysam_stderr, "[%s] Error converting read to FASTA/Q\n", __func__); + fprintf(samtools_stderr, "[%s] Error converting read to FASTA/Q\n", __func__); return false; } score[which_readpart(b)] = b_score; @@ -1496,8 +1557,8 @@ static bool bam2fq_mainloop(bam2fq_state_t *state, bam2fq_opts_t* opts) free(linebuf[0].s); free(linebuf[1].s); free(linebuf[2].s); - fprintf(pysam_stderr, "[M::%s] discarded %" PRId64 " singletons\n", __func__, n_singletons); - fprintf(pysam_stderr, "[M::%s] processed %" PRId64 " reads\n", __func__, n_reads); + fprintf(samtools_stderr, "[M::%s] discarded %" PRId64 " singletons\n", __func__, n_singletons); + fprintf(samtools_stderr, "[M::%s] processed %" PRId64 " reads\n", __func__, n_reads); return valid; } diff --git a/samtools/sample.c.pysam.c b/samtools/sample.c.pysam.c index dff8188..8b39a90 100644 --- a/samtools/sample.c.pysam.c +++ b/samtools/sample.c.pysam.c @@ -1,4 +1,4 @@ -#include "pysam.h" +#include "samtools.pysam.h" /* sample.c -- group data by sample. 
diff --git a/samtools/samtools.h b/samtools/samtools.h index 1e72654..7a406a2 100644 --- a/samtools/samtools.h +++ b/samtools/samtools.h @@ -26,6 +26,7 @@ DEALINGS IN THE SOFTWARE. */ #define SAMTOOLS_H const char *samtools_version(void); +const char *samtools_version_short(void); #if defined __GNUC__ && __GNUC__ >= 2 #define CHECK_PRINTF(fmt,args) __attribute__ ((format (printf, fmt, args))) diff --git a/samtools/samtools.pysam.c b/samtools/samtools.pysam.c new file mode 100644 index 0000000..c276a8a --- /dev/null +++ b/samtools/samtools.pysam.c @@ -0,0 +1,68 @@ +#include +#include +#include +#include +#include +#include + +#include "samtools.pysam.h" + +FILE * samtools_stderr = NULL; +FILE * samtools_stdout = NULL; +const char * samtools_stdout_fn = NULL; +int samtools_stdout_fileno = STDOUT_FILENO; + + +FILE * samtools_set_stderr(int fd) +{ + if (samtools_stderr != NULL) + fclose(samtools_stderr); + samtools_stderr = fdopen(fd, "w"); + return samtools_stderr; +} + +void samtools_unset_stderr(void) +{ + if (samtools_stderr != NULL) + fclose(samtools_stderr); + samtools_stderr = fopen("/dev/null", "w"); +} + +FILE * samtools_set_stdout(int fd) +{ + if (samtools_stdout != NULL) + fclose(samtools_stdout); + samtools_stdout = fdopen(fd, "w"); + if (samtools_stdout == NULL) + { + fprintf(samtools_stderr, "could not set stdout to fd %i", fd); + } + samtools_stdout_fileno = fd; + return samtools_stdout; +} + +void samtools_set_stdout_fn(const char *fn) +{ + samtools_stdout_fn = fn; +} + +void samtools_unset_stdout(void) +{ + if (samtools_stdout != NULL) + fclose(samtools_stdout); + samtools_stdout = fopen("/dev/null", "w"); + samtools_stdout_fileno = STDOUT_FILENO; +} + +void samtools_set_optind(int val) +{ + // setting this in cython via + // "from posix.unistd cimport optind" + // did not work. 
+ // + // setting to 0 forces a complete re-initialization + optind = val; +} + + + diff --git a/samtools/samtools.pysam.h b/samtools/samtools.pysam.h new file mode 100644 index 0000000..e2bfd85 --- /dev/null +++ b/samtools/samtools.pysam.h @@ -0,0 +1,47 @@ +#ifndef PYSAM_H +#define PYSAM_H + +#include "stdio.h" + +extern FILE * samtools_stderr; + +extern FILE * samtools_stdout; + +extern const char * samtools_stdout_fn; + +/*! set pysam standard error to point to file descriptor + + Setting the stderr will close the previous stderr. + */ +FILE * samtools_set_stderr(int fd); + +/*! set pysam standard output to point to file descriptor + + Setting the stderr will close the previous stdout. + */ +FILE * samtools_set_stdout(int fd); + +/*! set pysam standard output to point to filename + + */ +void samtools_set_stdout_fn(const char * fn); + +/*! set pysam standard error to /dev/null. + + Unsetting the stderr will close the previous stderr. + */ +void samtools_unset_stderr(void); + +/*! set pysam standard error to /dev/null. + + Unsetting the stderr will close the previous stderr. + */ +void samtools_unset_stdout(void); + +int samtools_dispatch(int argc, char *argv[]); + +void samtools_set_optind(int); + +extern int samtools_main(int argc, char *argv[]); + +#endif diff --git a/samtools/stats.c.pysam.c b/samtools/stats.c.pysam.c index bbae50c..1c94a10 100644 --- a/samtools/stats.c.pysam.c +++ b/samtools/stats.c.pysam.c @@ -1,4 +1,4 @@ -#include "pysam.h" +#include "samtools.pysam.h" /* stats.c -- This is the former bamcheck integrated into samtools/htslib. @@ -1242,7 +1242,7 @@ void init_regions(stats_t *stats, const char *file) if ( tid < 0 ) { if ( !warned ) - fprintf(pysam_stderr,"Warning: Some sequences not present in the BAM, e.g. \"%s\". This message is printed only once.\n", line.s); + fprintf(samtools_stderr,"Warning: Some sequences not present in the BAM, e.g. \"%s\". 
This message is printed only once.\n", line.s); warned = 1; continue; } @@ -1336,7 +1336,7 @@ void init_group_id(stats_t *stats, const char *id) { khiter_t k = kh_get(kh_rg, stats->rg_hash, key); if ( k != kh_end(stats->rg_hash) ) - fprintf(pysam_stderr, "[init_group_id] The group ID not unique: \"%s\"\n", key); + fprintf(samtools_stderr, "[init_group_id] The group ID not unique: \"%s\"\n", key); int ret; k = kh_put(kh_rg, stats->rg_hash, key, &ret); kh_value(stats->rg_hash, k) = val; @@ -1346,7 +1346,7 @@ void init_group_id(stats_t *stats, const char *id) if ( !n ) error("The sample or read group \"%s\" not present.\n", id); #else - fprintf(pysam_stderr, "Samtools-htslib: init_group_id() header parsing not yet implemented\n"); + fprintf(samtools_stderr, "Samtools-htslib: init_group_id() header parsing not yet implemented\n"); abort(); #endif } @@ -1356,35 +1356,35 @@ static void error(const char *format, ...) { if ( !format ) { - fprintf(pysam_stdout, "About: The program collects statistics from BAM files. The output can be visualized using plot-bamstats.\n"); - fprintf(pysam_stdout, "Usage: samtools stats [OPTIONS] file.bam\n"); - fprintf(pysam_stdout, " samtools stats [OPTIONS] file.bam chr:from-to\n"); - fprintf(pysam_stdout, "Options:\n"); - fprintf(pysam_stdout, " -c, --coverage ,, Coverage distribution min,max,step [1,1000,1]\n"); - fprintf(pysam_stdout, " -d, --remove-dups Exclude from statistics reads marked as duplicates\n"); - fprintf(pysam_stdout, " -f, --required-flag Required flag, 0 for unset. See also `samtools flags` [0]\n"); - fprintf(pysam_stdout, " -F, --filtering-flag Filtering flag, 0 for unset. 
See also `samtools flags` [0]\n"); - fprintf(pysam_stdout, " --GC-depth the size of GC-depth bins (decreasing bin size increases memory requirement) [2e4]\n"); - fprintf(pysam_stdout, " -h, --help This help message\n"); - fprintf(pysam_stdout, " -i, --insert-size Maximum insert size [8000]\n"); - fprintf(pysam_stdout, " -I, --id Include only listed read group or sample name\n"); - fprintf(pysam_stdout, " -l, --read-length Include in the statistics only reads with the given read length []\n"); - fprintf(pysam_stdout, " -m, --most-inserts Report only the main part of inserts [0.99]\n"); - fprintf(pysam_stdout, " -P, --split-prefix Path or string prefix for filepaths output by -S (default is input filename)\n"); - fprintf(pysam_stdout, " -q, --trim-quality The BWA trimming parameter [0]\n"); - fprintf(pysam_stdout, " -r, --ref-seq Reference sequence (required for GC-depth and mismatches-per-cycle calculation).\n"); - fprintf(pysam_stdout, " -s, --sam Ignored (input format is auto-detected).\n"); - fprintf(pysam_stdout, " -S, --split Also write statistics to separate files split by tagged field.\n"); - fprintf(pysam_stdout, " -t, --target-regions Do stats in these regions only. Tab-delimited file chr,from,to, 1-based, inclusive.\n"); - fprintf(pysam_stdout, " -x, --sparse Suppress outputting IS rows where there are no insertions.\n"); - sam_global_opt_help(pysam_stdout, "-.--.@"); - fprintf(pysam_stdout, "\n"); + fprintf(samtools_stdout, "About: The program collects statistics from BAM files. 
The output can be visualized using plot-bamstats.\n"); + fprintf(samtools_stdout, "Usage: samtools stats [OPTIONS] file.bam\n"); + fprintf(samtools_stdout, " samtools stats [OPTIONS] file.bam chr:from-to\n"); + fprintf(samtools_stdout, "Options:\n"); + fprintf(samtools_stdout, " -c, --coverage ,, Coverage distribution min,max,step [1,1000,1]\n"); + fprintf(samtools_stdout, " -d, --remove-dups Exclude from statistics reads marked as duplicates\n"); + fprintf(samtools_stdout, " -f, --required-flag Required flag, 0 for unset. See also `samtools flags` [0]\n"); + fprintf(samtools_stdout, " -F, --filtering-flag Filtering flag, 0 for unset. See also `samtools flags` [0]\n"); + fprintf(samtools_stdout, " --GC-depth the size of GC-depth bins (decreasing bin size increases memory requirement) [2e4]\n"); + fprintf(samtools_stdout, " -h, --help This help message\n"); + fprintf(samtools_stdout, " -i, --insert-size Maximum insert size [8000]\n"); + fprintf(samtools_stdout, " -I, --id Include only listed read group or sample name\n"); + fprintf(samtools_stdout, " -l, --read-length Include in the statistics only reads with the given read length []\n"); + fprintf(samtools_stdout, " -m, --most-inserts Report only the main part of inserts [0.99]\n"); + fprintf(samtools_stdout, " -P, --split-prefix Path or string prefix for filepaths output by -S (default is input filename)\n"); + fprintf(samtools_stdout, " -q, --trim-quality The BWA trimming parameter [0]\n"); + fprintf(samtools_stdout, " -r, --ref-seq Reference sequence (required for GC-depth and mismatches-per-cycle calculation).\n"); + fprintf(samtools_stdout, " -s, --sam Ignored (input format is auto-detected).\n"); + fprintf(samtools_stdout, " -S, --split Also write statistics to separate files split by tagged field.\n"); + fprintf(samtools_stdout, " -t, --target-regions Do stats in these regions only. 
Tab-delimited file chr,from,to, 1-based, inclusive.\n"); + fprintf(samtools_stdout, " -x, --sparse Suppress outputting IS rows where there are no insertions.\n"); + sam_global_opt_help(samtools_stdout, "-.--.@"); + fprintf(samtools_stdout, "\n"); } else { va_list ap; va_start(ap, format); - vfprintf(pysam_stderr, format, ap); + vfprintf(samtools_stderr, format, ap); va_end(ap); } exit(1); @@ -1712,13 +1712,13 @@ int main_stats(int argc, char *argv[]) } if (ret < -1) { - fprintf(pysam_stderr, "Failure while decoding file\n"); + fprintf(samtools_stderr, "Failure while decoding file\n"); return 1; } } round_buffer_flush(all_stats, -1); - output_stats(pysam_stdout, all_stats, sparse); + output_stats(samtools_stdout, all_stats, sparse); if (info->split_tag) output_split_stats(split_hash, bam_fname, sparse); diff --git a/samtools/stats_isize.c.pysam.c b/samtools/stats_isize.c.pysam.c index 6ae9088..492780b 100644 --- a/samtools/stats_isize.c.pysam.c +++ b/samtools/stats_isize.c.pysam.c @@ -1,4 +1,4 @@ -#include "pysam.h" +#include "samtools.pysam.h" /* stats_isize.c -- generalised insert size calculation for samtools stats. @@ -96,7 +96,7 @@ static void sparse_set_f(isize_data_t data, int at, isize_insert_t field, uint64 kh_value(h, it) = rec; a->max = max(at, a->max); } else { - fprintf(pysam_stderr, "%s\n", "Failed to allocate memory for isize_sparse_record_t"); + fprintf(samtools_stderr, "%s\n", "Failed to allocate memory for isize_sparse_record_t"); exit(11); } } else { diff --git a/samtools/test/merge/test_bam_translate.c.pysam.c b/samtools/test/merge/test_bam_translate.c.pysam.c index 193954d..21db13c 100644 --- a/samtools/test/merge/test_bam_translate.c.pysam.c +++ b/samtools/test/merge/test_bam_translate.c.pysam.c @@ -1,4 +1,4 @@ -#include "pysam.h" +#include "samtools.pysam.h" /* test/merge/test_bam_translate.c -- header merging test harness. @@ -35,40 +35,40 @@ DEALINGS IN THE SOFTWARE. 
*/ #include void dump_read(bam1_t* b) { - fprintf(pysam_stdout, "->core.tid:(%d)\n", b->core.tid); - fprintf(pysam_stdout, "->core.pos:(%d)\n", b->core.pos); - fprintf(pysam_stdout, "->core.bin:(%d)\n", b->core.bin); - fprintf(pysam_stdout, "->core.qual:(%d)\n", b->core.qual); - fprintf(pysam_stdout, "->core.l_qname:(%d)\n", b->core.l_qname); - fprintf(pysam_stdout, "->core.flag:(%d)\n", b->core.flag); - fprintf(pysam_stdout, "->core.n_cigar:(%d)\n", b->core.n_cigar); - fprintf(pysam_stdout, "->core.l_qseq:(%d)\n", b->core.l_qseq); - fprintf(pysam_stdout, "->core.mtid:(%d)\n", b->core.mtid); - fprintf(pysam_stdout, "->core.mpos:(%d)\n", b->core.mpos); - fprintf(pysam_stdout, "->core.isize:(%d)\n", b->core.isize); + fprintf(samtools_stdout, "->core.tid:(%d)\n", b->core.tid); + fprintf(samtools_stdout, "->core.pos:(%d)\n", b->core.pos); + fprintf(samtools_stdout, "->core.bin:(%d)\n", b->core.bin); + fprintf(samtools_stdout, "->core.qual:(%d)\n", b->core.qual); + fprintf(samtools_stdout, "->core.l_qname:(%d)\n", b->core.l_qname); + fprintf(samtools_stdout, "->core.flag:(%d)\n", b->core.flag); + fprintf(samtools_stdout, "->core.n_cigar:(%d)\n", b->core.n_cigar); + fprintf(samtools_stdout, "->core.l_qseq:(%d)\n", b->core.l_qseq); + fprintf(samtools_stdout, "->core.mtid:(%d)\n", b->core.mtid); + fprintf(samtools_stdout, "->core.mpos:(%d)\n", b->core.mpos); + fprintf(samtools_stdout, "->core.isize:(%d)\n", b->core.isize); if (b->data) { - fprintf(pysam_stdout, "->data:"); + fprintf(samtools_stdout, "->data:"); int i; for (i = 0; i < b->l_data; ++i) { - fprintf(pysam_stdout, "%x ", b->data[i]); + fprintf(samtools_stdout, "%x ", b->data[i]); } - fprintf(pysam_stdout, "\n"); + fprintf(samtools_stdout, "\n"); } if (b->core.l_qname) { - fprintf(pysam_stdout, "qname: %s\n",bam_get_qname(b)); + fprintf(samtools_stdout, "qname: %s\n",bam_get_qname(b)); } if (b->core.l_qseq) { - fprintf(pysam_stdout, "qseq:"); + fprintf(samtools_stdout, "qseq:"); int i; for (i = 0; i < 
b->core.l_qseq; ++i) { - fprintf(pysam_stdout, "%c",seq_nt16_str[seq_nt16_table[bam_seqi(bam_get_seq(b),i)]]); + fprintf(samtools_stdout, "%c",seq_nt16_str[seq_nt16_table[bam_seqi(bam_get_seq(b),i)]]); } - fprintf(pysam_stdout, "\n"); - fprintf(pysam_stdout, "qual:"); + fprintf(samtools_stdout, "\n"); + fprintf(samtools_stdout, "qual:"); for (i = 0; i < b->core.l_qseq; ++i) { - fprintf(pysam_stdout, "%c",bam_get_qual(b)[i]); + fprintf(samtools_stdout, "%c",bam_get_qual(b)[i]); } - fprintf(pysam_stdout, "\n"); + fprintf(samtools_stdout, "\n"); } @@ -77,18 +77,18 @@ void dump_read(bam1_t* b) { uint8_t* aux = bam_get_aux(b); while (i < bam_get_l_aux(b)) { - fprintf(pysam_stdout, "%.2s:%c:",aux+i,*(aux+i+2)); + fprintf(samtools_stdout, "%.2s:%c:",aux+i,*(aux+i+2)); i += 2; switch (*(aux+i)) { case 'Z': - while (*(aux+1+i) != '\0') { putc(*(aux+1+i), pysam_stdout); ++i; } + while (*(aux+1+i) != '\0') { putc(*(aux+1+i), samtools_stdout); ++i; } break; } - putc('\n',pysam_stdout); + putc('\n',samtools_stdout); ++i;++i; } } - fprintf(pysam_stdout, "\n"); + fprintf(samtools_stdout, "\n"); } void trans_tbl_test_init(trans_tbl_t* tbl, int32_t n_targets) @@ -357,30 +357,30 @@ int samtools_test_bam_translate_main(int argc, char**argv) bam1_t* b; - // Setup pysam_stderr redirect + // Setup samtools_stderr redirect kstring_t res = { 0, 0, NULL }; - FILE* orig_pysam_stderr = fdopen(dup(STDERR_FILENO), "a"); // Save pysam_stderr + FILE* orig_samtools_stderr = fdopen(dup(STDERR_FILENO), "a"); // Save samtools_stderr char* tempfname = (optind < argc)? 
argv[optind] : "test_bam_translate.tmp"; FILE* check = NULL; // setup - if (verbose) fprintf(pysam_stdout, "BEGIN test 1\n"); // TID test + if (verbose) fprintf(samtools_stdout, "BEGIN test 1\n"); // TID test trans_tbl_t tbl1; setup_test_1(&b,&tbl1); if (verbose > 1) { - fprintf(pysam_stdout, "b\n"); + fprintf(samtools_stdout, "b\n"); dump_read(b); } - if (verbose) fprintf(pysam_stdout, "RUN test 1\n"); + if (verbose) fprintf(samtools_stdout, "RUN test 1\n"); // test - xfreopen(tempfname, "w", pysam_stderr); // Redirect pysam_stderr to pipe + xfreopen(tempfname, "w", samtools_stderr); // Redirect samtools_stderr to pipe bam_translate(b, &tbl1); - fclose(pysam_stderr); + fclose(samtools_stderr); - if (verbose) fprintf(pysam_stdout, "END RUN test 1\n"); + if (verbose) fprintf(samtools_stdout, "END RUN test 1\n"); if (verbose > 1) { - fprintf(pysam_stdout, "b\n"); + fprintf(samtools_stdout, "b\n"); dump_read(b); } @@ -392,33 +392,33 @@ int samtools_test_bam_translate_main(int argc, char**argv) ++success; } else { ++failure; - if (verbose) fprintf(pysam_stdout, "FAIL test 1\n"); + if (verbose) fprintf(samtools_stdout, "FAIL test 1\n"); } fclose(check); // teardown bam_destroy1(b); trans_tbl_destroy(&tbl1); - if (verbose) fprintf(pysam_stdout, "END test 1\n"); + if (verbose) fprintf(samtools_stdout, "END test 1\n"); // setup - if (verbose) fprintf(pysam_stdout, "BEGIN test 2\n"); // RG exists and translate test + if (verbose) fprintf(samtools_stdout, "BEGIN test 2\n"); // RG exists and translate test trans_tbl_t tbl2; setup_test_2(&b,&tbl2); if (verbose > 1) { - fprintf(pysam_stdout, "b\n"); + fprintf(samtools_stdout, "b\n"); dump_read(b); } - if (verbose) fprintf(pysam_stdout, "RUN test 2\n"); + if (verbose) fprintf(samtools_stdout, "RUN test 2\n"); // test - xfreopen(tempfname, "w", pysam_stderr); // Redirect pysam_stderr to pipe + xfreopen(tempfname, "w", samtools_stderr); // Redirect samtools_stderr to pipe bam_translate(b, &tbl2); - fclose(pysam_stderr); + 
fclose(samtools_stderr); - if (verbose) fprintf(pysam_stdout, "END RUN test 2\n"); + if (verbose) fprintf(samtools_stdout, "END RUN test 2\n"); if (verbose > 1) { - fprintf(pysam_stdout, "b\n"); + fprintf(samtools_stdout, "b\n"); dump_read(b); } @@ -430,33 +430,33 @@ int samtools_test_bam_translate_main(int argc, char**argv) ++success; } else { ++failure; - if (verbose) fprintf(pysam_stdout, "FAIL test 2\n"); + if (verbose) fprintf(samtools_stdout, "FAIL test 2\n"); } fclose(check); // teardown bam_destroy1(b); trans_tbl_destroy(&tbl2); - if (verbose) fprintf(pysam_stdout, "END test 2\n"); + if (verbose) fprintf(samtools_stdout, "END test 2\n"); - if (verbose) fprintf(pysam_stdout, "BEGIN test 3\n"); // PG exists and translate test + if (verbose) fprintf(samtools_stdout, "BEGIN test 3\n"); // PG exists and translate test // setup trans_tbl_t tbl3; setup_test_3(&b,&tbl3); if (verbose > 1) { - fprintf(pysam_stdout, "b\n"); + fprintf(samtools_stdout, "b\n"); dump_read(b); } - if (verbose) fprintf(pysam_stdout, "RUN test 3\n"); + if (verbose) fprintf(samtools_stdout, "RUN test 3\n"); // test - xfreopen(tempfname, "w", pysam_stderr); // Redirect pysam_stderr to pipe + xfreopen(tempfname, "w", samtools_stderr); // Redirect samtools_stderr to pipe bam_translate(b, &tbl3); - fclose(pysam_stderr); + fclose(samtools_stderr); - if (verbose) fprintf(pysam_stdout, "END RUN test 3\n"); + if (verbose) fprintf(samtools_stdout, "END RUN test 3\n"); if (verbose > 1) { - fprintf(pysam_stdout, "b\n"); + fprintf(samtools_stdout, "b\n"); dump_read(b); } @@ -468,33 +468,33 @@ int samtools_test_bam_translate_main(int argc, char**argv) ++success; } else { ++failure; - if (verbose) fprintf(pysam_stdout, "FAIL test 3\n"); + if (verbose) fprintf(samtools_stdout, "FAIL test 3\n"); } fclose(check); // teardown bam_destroy1(b); trans_tbl_destroy(&tbl3); - if (verbose) fprintf(pysam_stdout, "END test 3\n"); + if (verbose) fprintf(samtools_stdout, "END test 3\n"); - if (verbose) 
fprintf(pysam_stdout, "BEGIN test 4\n"); // RG test non-existent + if (verbose) fprintf(samtools_stdout, "BEGIN test 4\n"); // RG test non-existent // setup trans_tbl_t tbl4; setup_test_4(&b,&tbl4); if (verbose > 1) { - fprintf(pysam_stdout, "b\n"); + fprintf(samtools_stdout, "b\n"); dump_read(b); } - if (verbose) fprintf(pysam_stdout, "RUN test 4\n"); + if (verbose) fprintf(samtools_stdout, "RUN test 4\n"); // test - xfreopen(tempfname, "w", pysam_stderr); // Redirect pysam_stderr to pipe + xfreopen(tempfname, "w", samtools_stderr); // Redirect samtools_stderr to pipe bam_translate(b, &tbl4); - fclose(pysam_stderr); + fclose(samtools_stderr); - if (verbose) fprintf(pysam_stdout, "END RUN test 4\n"); + if (verbose) fprintf(samtools_stdout, "END RUN test 4\n"); if (verbose > 1) { - fprintf(pysam_stdout, "b\n"); + fprintf(samtools_stdout, "b\n"); dump_read(b); } // check result @@ -505,32 +505,32 @@ int samtools_test_bam_translate_main(int argc, char**argv) ++success; } else { ++failure; - if (verbose) fprintf(pysam_stdout, "FAIL test 4\n"); + if (verbose) fprintf(samtools_stdout, "FAIL test 4\n"); } fclose(check); // teardown bam_destroy1(b); trans_tbl_destroy(&tbl4); - if (verbose) fprintf(pysam_stdout, "END test 4\n"); + if (verbose) fprintf(samtools_stdout, "END test 4\n"); - if (verbose) fprintf(pysam_stdout, "BEGIN test 5\n"); // PG test non-existent + if (verbose) fprintf(samtools_stdout, "BEGIN test 5\n"); // PG test non-existent // setup trans_tbl_t tbl5; setup_test_5(&b,&tbl5); if (verbose > 1) { - fprintf(pysam_stdout, "b\n"); + fprintf(samtools_stdout, "b\n"); dump_read(b); - fprintf(pysam_stdout, "RUN test 5\n"); + fprintf(samtools_stdout, "RUN test 5\n"); } // test - xfreopen(tempfname, "w", pysam_stderr); // Redirect pysam_stderr to pipe + xfreopen(tempfname, "w", samtools_stderr); // Redirect samtools_stderr to pipe bam_translate(b, &tbl5); - fclose(pysam_stderr); + fclose(samtools_stderr); - if (verbose) fprintf(pysam_stdout, "END RUN test 5\n"); + 
if (verbose) fprintf(samtools_stdout, "END RUN test 5\n"); if (verbose > 1) { - fprintf(pysam_stdout, "b\n"); + fprintf(samtools_stdout, "b\n"); dump_read(b); } @@ -542,33 +542,33 @@ int samtools_test_bam_translate_main(int argc, char**argv) ++success; } else { ++failure; - if (verbose) fprintf(pysam_stdout, "FAIL test 5\n"); + if (verbose) fprintf(samtools_stdout, "FAIL test 5\n"); } fclose(check); // teardown bam_destroy1(b); trans_tbl_destroy(&tbl5); - if (verbose) fprintf(pysam_stdout, "END test 5\n"); + if (verbose) fprintf(samtools_stdout, "END test 5\n"); - if (verbose) fprintf(pysam_stdout, "BEGIN test 6\n"); // RG and PG exists and translate test + if (verbose) fprintf(samtools_stdout, "BEGIN test 6\n"); // RG and PG exists and translate test // setup trans_tbl_t tbl6; setup_test_6(&b,&tbl6); if (verbose > 1) { - fprintf(pysam_stdout, "b\n"); + fprintf(samtools_stdout, "b\n"); dump_read(b); } - if (verbose) fprintf(pysam_stdout, "RUN test 6\n"); + if (verbose) fprintf(samtools_stdout, "RUN test 6\n"); // test - xfreopen(tempfname, "w", pysam_stderr); // Redirect pysam_stderr to pipe + xfreopen(tempfname, "w", samtools_stderr); // Redirect samtools_stderr to pipe bam_translate(b, &tbl6); - fclose(pysam_stderr); + fclose(samtools_stderr); - if (verbose) fprintf(pysam_stdout, "END RUN test 6\n"); + if (verbose) fprintf(samtools_stdout, "END RUN test 6\n"); if (verbose > 1) { - fprintf(pysam_stdout, "b\n"); + fprintf(samtools_stdout, "b\n"); dump_read(b); } @@ -580,21 +580,21 @@ int samtools_test_bam_translate_main(int argc, char**argv) ++success; } else { ++failure; - if (verbose) fprintf(pysam_stdout, "FAIL test 6\n"); + if (verbose) fprintf(samtools_stdout, "FAIL test 6\n"); } fclose(check); // teardown bam_destroy1(b); trans_tbl_destroy(&tbl6); - if (verbose) fprintf(pysam_stdout, "END test 6\n"); + if (verbose) fprintf(samtools_stdout, "END test 6\n"); // Cleanup free(res.s); remove(tempfname); if (failure > 0) - fprintf(orig_pysam_stderr, "%d failures %d 
successes\n", failure, success); - fclose(orig_pysam_stderr); + fprintf(orig_samtools_stderr, "%d failures %d successes\n", failure, success); + fclose(orig_samtools_stderr); return (success == NUM_TESTS)? EXIT_SUCCESS : EXIT_FAILURE; } diff --git a/samtools/test/merge/test_rtrans_build.c.pysam.c b/samtools/test/merge/test_rtrans_build.c.pysam.c index 0ac1367..5ba47e7 100644 --- a/samtools/test/merge/test_rtrans_build.c.pysam.c +++ b/samtools/test/merge/test_rtrans_build.c.pysam.c @@ -1,4 +1,4 @@ -#include "pysam.h" +#include "samtools.pysam.h" /* test/merge/test_rtrans_build.c -- header translation test harness. @@ -29,13 +29,13 @@ DEALINGS IN THE SOFTWARE. */ #include "../../bam_sort.c" void dump_rtrans(int* rtrans, int n, int n_targets) { - fprintf(pysam_stdout, "->n_targets:(%d)\n", n_targets); + fprintf(samtools_stdout, "->n_targets:(%d)\n", n_targets); int i, j; for (i = 0; i < n; ++i) { - fprintf(pysam_stderr, "%d",rtrans[i*n_targets+0]); + fprintf(samtools_stderr, "%d",rtrans[i*n_targets+0]); for (j = 1; j < n_targets; ++j) - fprintf(pysam_stderr, "\t%d",rtrans[i*n_targets+j]); - fprintf(pysam_stderr, "\n"); + fprintf(samtools_stderr, "\t%d",rtrans[i*n_targets+j]); + fprintf(samtools_stderr, "\n"); } } @@ -83,7 +83,7 @@ int samtools_test_rtrans_build_main(int argc, char**argv) const long GIMMICK_SEED = 0x1234330e; srand48(GIMMICK_SEED); - if (verbose) fprintf(pysam_stdout, "BEGIN test 1\n"); + if (verbose) fprintf(samtools_stdout, "BEGIN test 1\n"); // setup trans_tbl_t tbl_1[2]; int n_targets_1 = 3; @@ -94,29 +94,29 @@ int samtools_test_rtrans_build_main(int argc, char**argv) if (verbose > 1) { // dump_trans_tid } - if (verbose) fprintf(pysam_stdout, "RUN test 1\n"); + if (verbose) fprintf(samtools_stdout, "RUN test 1\n"); rtrans_1 = rtrans_build(n_1, n_targets_1, &tbl_1[0]); - if (verbose) fprintf(pysam_stdout, "END RUN test 1\n"); + if (verbose) fprintf(samtools_stdout, "END RUN test 1\n"); if (verbose > 1) { - fprintf(pysam_stdout, "rtrans\n"); + 
fprintf(samtools_stdout, "rtrans\n"); dump_rtrans(rtrans_1, n_1, n_targets_1); } if (check_test_1(&tbl_1[0], rtrans_1)) { ++success; } else { ++failure; - if (verbose) fprintf(pysam_stdout, "FAIL test 1\n"); + if (verbose) fprintf(samtools_stdout, "FAIL test 1\n"); } // teardown trans_tbl_destroy(&tbl_1[0]); trans_tbl_destroy(&tbl_1[1]); free(rtrans_1); - if (verbose) fprintf(pysam_stdout, "END test 1\n"); + if (verbose) fprintf(samtools_stdout, "END test 1\n"); if (success == NUM_TESTS) { return 0; } else { - fprintf(pysam_stderr, "%d failures %d successes\n", failure, success); + fprintf(samtools_stderr, "%d failures %d successes\n", failure, success); return 1; } } diff --git a/samtools/test/merge/test_trans_tbl_init.c.pysam.c b/samtools/test/merge/test_trans_tbl_init.c.pysam.c index af8af43..f3abf71 100644 --- a/samtools/test/merge/test_trans_tbl_init.c.pysam.c +++ b/samtools/test/merge/test_trans_tbl_init.c.pysam.c @@ -1,4 +1,4 @@ -#include "pysam.h" +#include "samtools.pysam.h" /* test/merge/test_trans_tbl_init.c -- merge test harness. 
@@ -36,16 +36,16 @@ typedef struct refseq_info { } refseq_info_t; void dump_header(bam_hdr_t* hdr) { - fprintf(pysam_stdout, "->n_targets:(%d)\n", hdr->n_targets); + fprintf(samtools_stdout, "->n_targets:(%d)\n", hdr->n_targets); int i; for (i = 0; i < hdr->n_targets; ++i) { - fprintf(pysam_stdout, "->target_name[%d]:(%s)\n",i,hdr->target_name[i]); - fprintf(pysam_stdout, "->target_len[%d]:(%d)\n",i,hdr->target_len[i]); + fprintf(samtools_stdout, "->target_name[%d]:(%s)\n",i,hdr->target_name[i]); + fprintf(samtools_stdout, "->target_len[%d]:(%d)\n",i,hdr->target_len[i]); } - fprintf(pysam_stdout, "->text:("); - fwrite((void*)hdr->text, (size_t) hdr->l_text, 1, pysam_stdout); - fprintf(pysam_stdout, ")\n"); + fprintf(samtools_stdout, "->text:("); + fwrite((void*)hdr->text, (size_t) hdr->l_text, 1, samtools_stdout); + fprintf(samtools_stdout, ")\n"); } static int populate_merged_header(bam_hdr_t *hdr, merged_header_t *merged_hdr) { @@ -351,7 +351,7 @@ int samtools_test_trans_tbl_init_main(int argc, char**argv) bam_hdr_t* out; bam_hdr_t* translate; - if (verbose) fprintf(pysam_stdout, "BEGIN test 1\n"); + if (verbose) fprintf(samtools_stdout, "BEGIN test 1\n"); // setup trans_tbl_t tbl_1; merged_header_t *merged_hdr = init_merged_header(); @@ -359,36 +359,36 @@ int samtools_test_trans_tbl_init_main(int argc, char**argv) assert(translate); // test if (verbose > 1) { - fprintf(pysam_stdout, "translate\n"); + fprintf(samtools_stdout, "translate\n"); dump_header(translate); } - if (verbose) fprintf(pysam_stdout, "RUN test 1\n"); + if (verbose) fprintf(samtools_stdout, "RUN test 1\n"); trans_tbl_init(merged_hdr, translate, &tbl_1, false, false, true, NULL); out = finish_merged_header(merged_hdr); free_merged_header(merged_hdr); - if (verbose) fprintf(pysam_stdout, "END RUN test 1\n"); + if (verbose) fprintf(samtools_stdout, "END RUN test 1\n"); if (verbose > 1) { - fprintf(pysam_stdout, "translate\n"); + fprintf(samtools_stdout, "translate\n"); dump_header(translate); - 
fprintf(pysam_stdout, "out\n"); + fprintf(samtools_stdout, "out\n"); dump_header(out); } if (check_test_1(translate, out, &tbl_1)) { - if (verbose) fprintf(pysam_stdout, "Test 1 : PASS\n"); + if (verbose) fprintf(samtools_stdout, "Test 1 : PASS\n"); ++success; } else { - if (verbose) fprintf(pysam_stdout, "Test 1 : FAIL\n"); - fprintf(pysam_stderr, "Test 1 : FAIL\n"); + if (verbose) fprintf(samtools_stdout, "Test 1 : FAIL\n"); + fprintf(samtools_stderr, "Test 1 : FAIL\n"); ++failure; } // teardown bam_hdr_destroy(translate); bam_hdr_destroy(out); trans_tbl_destroy(&tbl_1); - if (verbose) fprintf(pysam_stdout, "END test 1\n"); + if (verbose) fprintf(samtools_stdout, "END test 1\n"); // test - if (verbose) fprintf(pysam_stdout, "BEGIN test 2\n"); + if (verbose) fprintf(samtools_stdout, "BEGIN test 2\n"); // reinit trans_tbl_t tbl_2; @@ -396,108 +396,108 @@ int samtools_test_trans_tbl_init_main(int argc, char**argv) translate = setup_test_2(merged_hdr); assert(translate); if (verbose > 1) { - fprintf(pysam_stdout, "translate\n"); + fprintf(samtools_stdout, "translate\n"); dump_header(translate); } - if (verbose) fprintf(pysam_stdout, "RUN test 2\n"); + if (verbose) fprintf(samtools_stdout, "RUN test 2\n"); trans_tbl_init(merged_hdr, translate, &tbl_2, false, false, true, NULL); out = finish_merged_header(merged_hdr); free_merged_header(merged_hdr); - if (verbose) fprintf(pysam_stdout, "END RUN test 2\n"); + if (verbose) fprintf(samtools_stdout, "END RUN test 2\n"); if (verbose > 1) { - fprintf(pysam_stdout, "translate\n"); + fprintf(samtools_stdout, "translate\n"); dump_header(translate); - fprintf(pysam_stdout, "out\n"); + fprintf(samtools_stdout, "out\n"); dump_header(out); } if (check_test_2(translate, out, &tbl_2)) { - if (verbose) fprintf(pysam_stdout, "Test 2 : PASS\n"); + if (verbose) fprintf(samtools_stdout, "Test 2 : PASS\n"); ++success; } else { - if (verbose) fprintf(pysam_stdout, "Test 2 : FAIL\n"); - fprintf(pysam_stderr, "Test 2 : FAIL\n"); + if 
(verbose) fprintf(samtools_stdout, "Test 2 : FAIL\n"); + fprintf(samtools_stderr, "Test 2 : FAIL\n"); ++failure; } // teardown bam_hdr_destroy(translate); bam_hdr_destroy(out); trans_tbl_destroy(&tbl_2); - if (verbose) fprintf(pysam_stdout, "END test 2\n"); + if (verbose) fprintf(samtools_stdout, "END test 2\n"); // test - if (verbose) fprintf(pysam_stdout, "BEGIN test 3\n"); + if (verbose) fprintf(samtools_stdout, "BEGIN test 3\n"); // reinit trans_tbl_t tbl_3; merged_hdr = init_merged_header(); translate = setup_test_3(merged_hdr); assert(translate); if (verbose > 1) { - fprintf(pysam_stdout, "translate\n"); + fprintf(samtools_stdout, "translate\n"); dump_header(translate); } - if (verbose) fprintf(pysam_stdout, "RUN test 3\n"); + if (verbose) fprintf(samtools_stdout, "RUN test 3\n"); trans_tbl_init(merged_hdr, translate, &tbl_3, false, false, true, NULL); out = finish_merged_header(merged_hdr); free_merged_header(merged_hdr); - if (verbose) fprintf(pysam_stdout, "END RUN test 3\n"); + if (verbose) fprintf(samtools_stdout, "END RUN test 3\n"); if (verbose > 1) { - fprintf(pysam_stdout, "translate\n"); + fprintf(samtools_stdout, "translate\n"); dump_header(translate); - fprintf(pysam_stdout, "out\n"); + fprintf(samtools_stdout, "out\n"); dump_header(out); } if (check_test_3(translate, out, &tbl_3)) { - if (verbose) fprintf(pysam_stdout, "Test 3 : PASS\n"); + if (verbose) fprintf(samtools_stdout, "Test 3 : PASS\n"); ++success; } else { - if (verbose) fprintf(pysam_stdout, "Test 3 : FAIL\n"); - fprintf(pysam_stderr, "Test 3 : FAIL\n"); + if (verbose) fprintf(samtools_stdout, "Test 3 : FAIL\n"); + fprintf(samtools_stderr, "Test 3 : FAIL\n"); ++failure; } // teardown bam_hdr_destroy(translate); bam_hdr_destroy(out); trans_tbl_destroy(&tbl_3); - if (verbose) fprintf(pysam_stdout, "END test 3\n"); + if (verbose) fprintf(samtools_stdout, "END test 3\n"); // test - if (verbose) fprintf(pysam_stdout, "BEGIN test 4\n"); + if (verbose) fprintf(samtools_stdout, "BEGIN test 
4\n"); // reinit trans_tbl_t tbl_4; merged_hdr = init_merged_header(); translate = setup_test_4(merged_hdr); assert(translate); if (verbose > 1) { - fprintf(pysam_stdout, "translate\n"); + fprintf(samtools_stdout, "translate\n"); dump_header(translate); } - if (verbose) fprintf(pysam_stdout, "RUN test 4\n"); + if (verbose) fprintf(samtools_stdout, "RUN test 4\n"); trans_tbl_init(merged_hdr, translate, &tbl_4, false, false, true, NULL); out = finish_merged_header(merged_hdr); free_merged_header(merged_hdr); - if (verbose) fprintf(pysam_stdout, "END RUN test 4\n"); + if (verbose) fprintf(samtools_stdout, "END RUN test 4\n"); if (verbose > 1) { - fprintf(pysam_stdout, "translate\n"); + fprintf(samtools_stdout, "translate\n"); dump_header(translate); - fprintf(pysam_stdout, "out\n"); + fprintf(samtools_stdout, "out\n"); dump_header(out); } if (check_test_4(translate, out, &tbl_4)) { - if (verbose) fprintf(pysam_stdout, "Test 4 : PASS\n"); + if (verbose) fprintf(samtools_stdout, "Test 4 : PASS\n"); ++success; } else { - if (verbose) fprintf(pysam_stdout, "Test 4 : FAIL\n"); - fprintf(pysam_stderr, "Test 4 : FAIL\n"); + if (verbose) fprintf(samtools_stdout, "Test 4 : FAIL\n"); + fprintf(samtools_stderr, "Test 4 : FAIL\n"); ++failure; } // teardown bam_hdr_destroy(translate); bam_hdr_destroy(out); trans_tbl_destroy(&tbl_4); - if (verbose) fprintf(pysam_stdout, "END test 4\n"); + if (verbose) fprintf(samtools_stdout, "END test 4\n"); // test - if (verbose) fprintf(pysam_stdout, "BEGIN test 5\n"); + if (verbose) fprintf(samtools_stdout, "BEGIN test 5\n"); // reinit trans_tbl_t tbl_5; merged_hdr = init_merged_header(); @@ -505,74 +505,74 @@ int samtools_test_trans_tbl_init_main(int argc, char**argv) assert(translate); if (verbose > 1) { - fprintf(pysam_stdout, "translate\n"); + fprintf(samtools_stdout, "translate\n"); dump_header(translate); } - if (verbose) fprintf(pysam_stdout, "RUN test 5\n"); + if (verbose) fprintf(samtools_stdout, "RUN test 5\n"); 
trans_tbl_init(merged_hdr, translate, &tbl_5, false, false, true, NULL); out = finish_merged_header(merged_hdr); free_merged_header(merged_hdr); - if (verbose) fprintf(pysam_stdout, "END RUN test 5\n"); + if (verbose) fprintf(samtools_stdout, "END RUN test 5\n"); if (verbose > 1) { - fprintf(pysam_stdout, "translate\n"); + fprintf(samtools_stdout, "translate\n"); dump_header(translate); - fprintf(pysam_stdout, "out\n"); + fprintf(samtools_stdout, "out\n"); dump_header(out); } if (check_test_5(translate, out, &tbl_5)) { - if (verbose) fprintf(pysam_stdout, "Test 5 : PASS\n"); + if (verbose) fprintf(samtools_stdout, "Test 5 : PASS\n"); ++success; } else { - if (verbose) fprintf(pysam_stdout, "Test 5 : FAIL\n"); - fprintf(pysam_stderr, "Test 5 : FAIL\n"); + if (verbose) fprintf(samtools_stdout, "Test 5 : FAIL\n"); + fprintf(samtools_stderr, "Test 5 : FAIL\n"); ++failure; } // teardown bam_hdr_destroy(translate); bam_hdr_destroy(out); trans_tbl_destroy(&tbl_5); - if (verbose) fprintf(pysam_stdout, "END test 5\n"); + if (verbose) fprintf(samtools_stdout, "END test 5\n"); // test - if (verbose) fprintf(pysam_stdout, "BEGIN test 6\n"); + if (verbose) fprintf(samtools_stdout, "BEGIN test 6\n"); // reinit trans_tbl_t tbl_6; merged_hdr = init_merged_header(); translate = setup_test_6(merged_hdr); assert(translate); if (verbose > 1) { - fprintf(pysam_stdout, "translate\n"); + fprintf(samtools_stdout, "translate\n"); dump_header(translate); } - if (verbose) fprintf(pysam_stdout, "RUN test 6\n"); + if (verbose) fprintf(samtools_stdout, "RUN test 6\n"); trans_tbl_init(merged_hdr, translate, &tbl_6, false, false, true, "filename"); out = finish_merged_header(merged_hdr); free_merged_header(merged_hdr); - if (verbose) fprintf(pysam_stdout, "END RUN test 6\n"); + if (verbose) fprintf(samtools_stdout, "END RUN test 6\n"); if (verbose > 1) { - fprintf(pysam_stdout, "translate\n"); + fprintf(samtools_stdout, "translate\n"); dump_header(translate); - fprintf(pysam_stdout, "out\n"); + 
fprintf(samtools_stdout, "out\n"); dump_header(out); } if (check_test_6(translate, out, &tbl_6)) { - if (verbose) fprintf(pysam_stdout, "Test 6 : PASS\n"); + if (verbose) fprintf(samtools_stdout, "Test 6 : PASS\n"); ++success; } else { - if (verbose) fprintf(pysam_stdout, "Test 6 : FAIL\n"); - fprintf(pysam_stderr, "Test 6 : FAIL\n"); + if (verbose) fprintf(samtools_stdout, "Test 6 : FAIL\n"); + fprintf(samtools_stderr, "Test 6 : FAIL\n"); ++failure; } // teardown bam_hdr_destroy(translate); bam_hdr_destroy(out); trans_tbl_destroy(&tbl_6); - if (verbose) fprintf(pysam_stdout, "END test 6\n"); + if (verbose) fprintf(samtools_stdout, "END test 6\n"); if (success == NUM_TESTS) { return 0; } else { - fprintf(pysam_stderr, "%d failures %d successes\n", failure, success); + fprintf(samtools_stderr, "%d failures %d successes\n", failure, success); return 1; } } diff --git a/samtools/test/split/test_count_rg.c.pysam.c b/samtools/test/split/test_count_rg.c.pysam.c index 25131a8..c6f7fef 100644 --- a/samtools/test/split/test_count_rg.c.pysam.c +++ b/samtools/test/split/test_count_rg.c.pysam.c @@ -1,4 +1,4 @@ -#include "pysam.h" +#include "samtools.pysam.h" /* test/split/test_count_rg.c -- split test cases. @@ -57,7 +57,7 @@ int samtools_test_count_rg_main(int argc, char**argv) ++verbose; break; default: - fprintf(pysam_stdout, + fprintf(samtools_stdout, "usage: test_count_rg [-v]\n\n" " -v verbose output\n" ); @@ -66,32 +66,32 @@ int samtools_test_count_rg_main(int argc, char**argv) } - // Setup pysam_stderr redirect + // Setup samtools_stderr redirect kstring_t res = { 0, 0, NULL }; - FILE* orig_pysam_stderr = fdopen(dup(STDERR_FILENO), "a"); // Save pysam_stderr + FILE* orig_samtools_stderr = fdopen(dup(STDERR_FILENO), "a"); // Save samtools_stderr char* tempfname = (optind < argc)? 
argv[optind] : "test_count_rg.tmp"; FILE* check = NULL; // setup - if (verbose) fprintf(pysam_stdout, "BEGIN test 1\n"); // TID test + if (verbose) fprintf(samtools_stdout, "BEGIN test 1\n"); // TID test bam_hdr_t* hdr1; size_t count; char** output; setup_test_1(&hdr1); if (verbose > 1) { - fprintf(pysam_stdout, "hdr1\n"); + fprintf(samtools_stdout, "hdr1\n"); dump_hdr(hdr1); } - if (verbose) fprintf(pysam_stdout, "RUN test 1\n"); + if (verbose) fprintf(samtools_stdout, "RUN test 1\n"); // test - xfreopen(tempfname, "w", pysam_stderr); // Redirect pysam_stderr to pipe + xfreopen(tempfname, "w", samtools_stderr); // Redirect samtools_stderr to pipe bool result_1 = count_RG(hdr1, &count, &output); - fclose(pysam_stderr); + fclose(samtools_stderr); - if (verbose) fprintf(pysam_stdout, "END RUN test 1\n"); + if (verbose) fprintf(samtools_stdout, "END RUN test 1\n"); if (verbose > 1) { - fprintf(pysam_stdout, "b\n"); + fprintf(samtools_stdout, "b\n"); dump_hdr(hdr1); } @@ -103,7 +103,7 @@ int samtools_test_count_rg_main(int argc, char**argv) ++success; } else { ++failure; - if (verbose) fprintf(pysam_stdout, "FAIL test 1\n"); + if (verbose) fprintf(samtools_stdout, "FAIL test 1\n"); } fclose(check); @@ -114,14 +114,14 @@ int samtools_test_count_rg_main(int argc, char**argv) } free(output); bam_hdr_destroy(hdr1); - if (verbose) fprintf(pysam_stdout, "END test 1\n"); + if (verbose) fprintf(samtools_stdout, "END test 1\n"); // Cleanup free(res.s); remove(tempfname); if (failure > 0) - fprintf(orig_pysam_stderr, "%d failures %d successes\n", failure, success); - fclose(orig_pysam_stderr); + fprintf(orig_samtools_stderr, "%d failures %d successes\n", failure, success); + fclose(orig_samtools_stderr); return (success == NUM_TESTS)? 
EXIT_SUCCESS : EXIT_FAILURE; } diff --git a/samtools/test/split/test_expand_format_string.c.pysam.c b/samtools/test/split/test_expand_format_string.c.pysam.c index fe9a426..1583818 100644 --- a/samtools/test/split/test_expand_format_string.c.pysam.c +++ b/samtools/test/split/test_expand_format_string.c.pysam.c @@ -1,4 +1,4 @@ -#include "pysam.h" +#include "samtools.pysam.h" /* test/split/test_expand_format_string.c -- split format string test cases. @@ -57,7 +57,7 @@ int samtools_test_expand_format_string_main(int argc, char**argv) ++verbose; break; default: - fprintf(pysam_stdout, + fprintf(samtools_stdout, "usage: test_expand_format_string [-v]\n\n" " -v verbose output\n" ); @@ -66,34 +66,34 @@ int samtools_test_expand_format_string_main(int argc, char**argv) } - // Setup pysam_stderr redirect + // Setup samtools_stderr redirect kstring_t res = { 0, 0, NULL }; - FILE* orig_pysam_stderr = fdopen(dup(STDERR_FILENO), "a"); // Save pysam_stderr + FILE* orig_samtools_stderr = fdopen(dup(STDERR_FILENO), "a"); // Save samtools_stderr char* tempfname = (optind < argc)? 
argv[optind] : "test_expand_format_string.tmp"; FILE* check = NULL; // setup - if (verbose) fprintf(pysam_stdout, "BEGIN test 1\n"); // default format string test + if (verbose) fprintf(samtools_stdout, "BEGIN test 1\n"); // default format string test const char* format_string_1 = "%*_%#.bam"; const char* basename_1 = "basename"; const char* rg_id_1 = "1#2.3"; const int rg_idx_1 = 4; if (verbose > 1) { - fprintf(pysam_stdout, "format_string:%s\n" + fprintf(samtools_stdout, "format_string:%s\n" "basename:%s\n" "rg_id:%s\n" "rg_idx:%d\n", format_string_1, basename_1, rg_id_1, rg_idx_1); } - if (verbose) fprintf(pysam_stdout, "RUN test 1\n"); + if (verbose) fprintf(samtools_stdout, "RUN test 1\n"); // test - xfreopen(tempfname, "w", pysam_stderr); // Redirect pysam_stderr to pipe + xfreopen(tempfname, "w", samtools_stderr); // Redirect samtools_stderr to pipe char* output_1 = expand_format_string(format_string_1, basename_1, rg_id_1, rg_idx_1, NULL); - fclose(pysam_stderr); + fclose(samtools_stderr); - if (verbose) fprintf(pysam_stdout, "END RUN test 1\n"); + if (verbose) fprintf(samtools_stdout, "END RUN test 1\n"); if (verbose > 1) { - fprintf(pysam_stdout, "format_string:%s\n" + fprintf(samtools_stdout, "format_string:%s\n" "basename:%s\n" "rg_id:%s\n" "rg_idx:%d\n", format_string_1, basename_1, rg_id_1, rg_idx_1); @@ -108,20 +108,20 @@ int samtools_test_expand_format_string_main(int argc, char**argv) ++success; } else { ++failure; - if (verbose) fprintf(pysam_stdout, "FAIL test 1\n"); + if (verbose) fprintf(samtools_stdout, "FAIL test 1\n"); } fclose(check); // teardown free(output_1); - if (verbose) fprintf(pysam_stdout, "END test 1\n"); + if (verbose) fprintf(samtools_stdout, "END test 1\n"); // Cleanup test harness free(res.s); remove(tempfname); if (failure > 0) - fprintf(orig_pysam_stderr, "%d failures %d successes\n", failure, success); - fclose(orig_pysam_stderr); + fprintf(orig_samtools_stderr, "%d failures %d successes\n", failure, success); + 
fclose(orig_samtools_stderr); return (success == NUM_TESTS)? EXIT_SUCCESS : EXIT_FAILURE; } diff --git a/samtools/test/split/test_filter_header_rg.c b/samtools/test/split/test_filter_header_rg.c index cccf0e9..3792ab5 100644 --- a/samtools/test/split/test_filter_header_rg.c +++ b/samtools/test/split/test_filter_header_rg.c @@ -40,10 +40,11 @@ void setup_test_1(bam_hdr_t** hdr_in) } bool check_test_1(const bam_hdr_t* hdr) { - const char *test1_res = + char test1_res[200]; + snprintf(test1_res, 199, "@HD\tVN:1.4\n" "@SQ\tSN:blah\n" - "@PG\tID:samtools\tPN:samtools\tVN:x.y.test\tCL:test_filter_header_rg foo bar baz\n"; + "@PG\tID:samtools\tPN:samtools\tVN:%s\tCL:test_filter_header_rg foo bar baz\n", samtools_version()); if (strcmp(hdr->text, test1_res)) { return false; @@ -63,11 +64,12 @@ void setup_test_2(bam_hdr_t** hdr_in) } bool check_test_2(const bam_hdr_t* hdr) { - const char *test2_res = + char test2_res[200]; + snprintf(test2_res, 199, "@HD\tVN:1.4\n" "@SQ\tSN:blah\n" "@RG\tID:fish\n" - "@PG\tID:samtools\tPN:samtools\tVN:x.y.test\tCL:test_filter_header_rg foo bar baz\n"; + "@PG\tID:samtools\tPN:samtools\tVN:%s\tCL:test_filter_header_rg foo bar baz\n", samtools_version()); if (strcmp(hdr->text, test2_res)) { return false; @@ -112,7 +114,7 @@ int main(int argc, char *argv[]) bam_hdr_t* hdr1; const char* id_to_keep_1 = "1#2.3"; setup_test_1(&hdr1); - if (verbose > 1) { + if (verbose > 0) { printf("hdr1\n"); dump_hdr(hdr1); } @@ -124,7 +126,7 @@ int main(int argc, char *argv[]) fclose(stderr); if (verbose) printf("END RUN test 1\n"); - if (verbose > 1) { + if (verbose > 0) { printf("hdr1\n"); dump_hdr(hdr1); } @@ -151,7 +153,7 @@ int main(int argc, char *argv[]) bam_hdr_t* hdr2; const char* id_to_keep_2 = "fish"; setup_test_2(&hdr2); - if (verbose > 1) { + if (verbose > 0) { printf("hdr2\n"); dump_hdr(hdr2); } @@ -163,7 +165,7 @@ int main(int argc, char *argv[]) fclose(stderr); if (verbose) printf("END RUN test 2\n"); - if (verbose > 1) { + if (verbose > 0) { 
printf("hdr2\n"); dump_hdr(hdr2); } diff --git a/samtools/test/split/test_filter_header_rg.c.pysam.c b/samtools/test/split/test_filter_header_rg.c.pysam.c index c9284f6..54227fc 100644 --- a/samtools/test/split/test_filter_header_rg.c.pysam.c +++ b/samtools/test/split/test_filter_header_rg.c.pysam.c @@ -1,4 +1,4 @@ -#include "pysam.h" +#include "samtools.pysam.h" /* test/split/test_filter_header_rg.c -- split test cases. @@ -42,10 +42,11 @@ void setup_test_1(bam_hdr_t** hdr_in) } bool check_test_1(const bam_hdr_t* hdr) { - const char *test1_res = + char test1_res[200]; + snprintf(test1_res, 199, "@HD\tVN:1.4\n" "@SQ\tSN:blah\n" - "@PG\tID:samtools\tPN:samtools\tVN:x.y.test\tCL:test_filter_header_rg foo bar baz\n"; + "@PG\tID:samtools\tPN:samtools\tVN:%s\tCL:test_filter_header_rg foo bar baz\n", samtools_version()); if (strcmp(hdr->text, test1_res)) { return false; @@ -65,11 +66,12 @@ void setup_test_2(bam_hdr_t** hdr_in) } bool check_test_2(const bam_hdr_t* hdr) { - const char *test2_res = + char test2_res[200]; + snprintf(test2_res, 199, "@HD\tVN:1.4\n" "@SQ\tSN:blah\n" "@RG\tID:fish\n" - "@PG\tID:samtools\tPN:samtools\tVN:x.y.test\tCL:test_filter_header_rg foo bar baz\n"; + "@PG\tID:samtools\tPN:samtools\tVN:%s\tCL:test_filter_header_rg foo bar baz\n", samtools_version()); if (strcmp(hdr->text, test2_res)) { return false; @@ -94,7 +96,7 @@ int samtools_test_filter_header_rg_main(int argc, char *argv[]) ++verbose; break; default: - fprintf(pysam_stdout, + fprintf(samtools_stdout, "usage: test_filter_header_rg [-v]\n\n" " -v verbose output\n" ); @@ -103,31 +105,31 @@ int samtools_test_filter_header_rg_main(int argc, char *argv[]) } - // Setup pysam_stderr redirect + // Setup samtools_stderr redirect kstring_t res = { 0, 0, NULL }; - FILE* orig_pysam_stderr = fdopen(dup(STDERR_FILENO), "a"); // Save pysam_stderr + FILE* orig_samtools_stderr = fdopen(dup(STDERR_FILENO), "a"); // Save samtools_stderr char* tempfname = (optind < argc)? 
argv[optind] : "test_count_rg.tmp"; FILE* check = NULL; // setup - if (verbose) fprintf(pysam_stdout, "BEGIN test 1\n"); // test eliminating a tag that isn't there + if (verbose) fprintf(samtools_stdout, "BEGIN test 1\n"); // test eliminating a tag that isn't there bam_hdr_t* hdr1; const char* id_to_keep_1 = "1#2.3"; setup_test_1(&hdr1); - if (verbose > 1) { - fprintf(pysam_stdout, "hdr1\n"); + if (verbose > 0) { + fprintf(samtools_stdout, "hdr1\n"); dump_hdr(hdr1); } - if (verbose) fprintf(pysam_stdout, "RUN test 1\n"); + if (verbose) fprintf(samtools_stdout, "RUN test 1\n"); // test - xfreopen(tempfname, "w", pysam_stderr); // Redirect pysam_stderr to pipe + xfreopen(tempfname, "w", samtools_stderr); // Redirect samtools_stderr to pipe bool result_1 = filter_header_rg(hdr1, id_to_keep_1, arg_list); - fclose(pysam_stderr); + fclose(samtools_stderr); - if (verbose) fprintf(pysam_stdout, "END RUN test 1\n"); - if (verbose > 1) { - fprintf(pysam_stdout, "hdr1\n"); + if (verbose) fprintf(samtools_stdout, "END RUN test 1\n"); + if (verbose > 0) { + fprintf(samtools_stdout, "hdr1\n"); dump_hdr(hdr1); } @@ -141,32 +143,32 @@ int samtools_test_filter_header_rg_main(int argc, char *argv[]) ++success; } else { ++failure; - if (verbose) fprintf(pysam_stdout, "FAIL test 1\n"); + if (verbose) fprintf(samtools_stdout, "FAIL test 1\n"); } fclose(check); // teardown bam_hdr_destroy(hdr1); - if (verbose) fprintf(pysam_stdout, "END test 1\n"); + if (verbose) fprintf(samtools_stdout, "END test 1\n"); - if (verbose) fprintf(pysam_stdout, "BEGIN test 2\n"); // test eliminating a tag that is there + if (verbose) fprintf(samtools_stdout, "BEGIN test 2\n"); // test eliminating a tag that is there bam_hdr_t* hdr2; const char* id_to_keep_2 = "fish"; setup_test_2(&hdr2); - if (verbose > 1) { - fprintf(pysam_stdout, "hdr2\n"); + if (verbose > 0) { + fprintf(samtools_stdout, "hdr2\n"); dump_hdr(hdr2); } - if (verbose) fprintf(pysam_stdout, "RUN test 2\n"); + if (verbose) 
fprintf(samtools_stdout, "RUN test 2\n"); // test - xfreopen(tempfname, "w", pysam_stderr); // Redirect pysam_stderr to pipe + xfreopen(tempfname, "w", samtools_stderr); // Redirect samtools_stderr to pipe bool result_2 = filter_header_rg(hdr2, id_to_keep_2, arg_list); - fclose(pysam_stderr); + fclose(samtools_stderr); - if (verbose) fprintf(pysam_stdout, "END RUN test 2\n"); - if (verbose > 1) { - fprintf(pysam_stdout, "hdr2\n"); + if (verbose) fprintf(samtools_stdout, "END RUN test 2\n"); + if (verbose > 0) { + fprintf(samtools_stdout, "hdr2\n"); dump_hdr(hdr2); } @@ -180,13 +182,13 @@ int samtools_test_filter_header_rg_main(int argc, char *argv[]) ++success; } else { ++failure; - if (verbose) fprintf(pysam_stdout, "FAIL test 2\n"); + if (verbose) fprintf(samtools_stdout, "FAIL test 2\n"); } fclose(check); // teardown bam_hdr_destroy(hdr2); - if (verbose) fprintf(pysam_stdout, "END test 2\n"); + if (verbose) fprintf(samtools_stdout, "END test 2\n"); // Cleanup @@ -194,8 +196,8 @@ int samtools_test_filter_header_rg_main(int argc, char *argv[]) free(arg_list); remove(tempfname); if (failure > 0) - fprintf(orig_pysam_stderr, "%d failures %d successes\n", failure, success); - fclose(orig_pysam_stderr); + fprintf(orig_samtools_stderr, "%d failures %d successes\n", failure, success); + fclose(orig_samtools_stderr); return (success == NUM_TESTS)? EXIT_SUCCESS : EXIT_FAILURE; } diff --git a/samtools/test/split/test_parse_args.c.pysam.c b/samtools/test/split/test_parse_args.c.pysam.c index 2c3e749..01d9bcb 100644 --- a/samtools/test/split/test_parse_args.c.pysam.c +++ b/samtools/test/split/test_parse_args.c.pysam.c @@ -1,4 +1,4 @@ -#include "pysam.h" +#include "samtools.pysam.h" /* test/split/test_parse_args.c -- split test cases. 
@@ -82,7 +82,7 @@ int samtools_test_parse_args_main(int argc, char**argv) ++verbose; break; default: - fprintf(pysam_stdout, + fprintf(samtools_stdout, "usage: test_parse_args [-v]\n\n" " -v verbose output\n" ); @@ -90,58 +90,58 @@ int samtools_test_parse_args_main(int argc, char**argv) } } - // Setup pysam_stdout and pysam_stderr redirect - kstring_t res_pysam_stdout = { 0, 0, NULL }; - kstring_t res_pysam_stderr = { 0, 0, NULL }; - FILE* orig_pysam_stdout = fdopen(dup(STDOUT_FILENO), "a"); // Save pysam_stderr - FILE* orig_pysam_stderr = fdopen(dup(STDERR_FILENO), "a"); // Save pysam_stderr - char* tempfname_pysam_stdout = (optind < argc)? argv[optind] : "test_parse_args.tmp.o"; - char* tempfname_pysam_stderr = (optind < argc)? argv[optind] : "test_parse_args.tmp.e"; - FILE* check_pysam_stdout = NULL; - FILE* check_pysam_stderr = NULL; + // Setup samtools_stdout and samtools_stderr redirect + kstring_t res_samtools_stdout = { 0, 0, NULL }; + kstring_t res_samtools_stderr = { 0, 0, NULL }; + FILE* orig_samtools_stdout = fdopen(dup(STDOUT_FILENO), "a"); // Save samtools_stderr + FILE* orig_samtools_stderr = fdopen(dup(STDERR_FILENO), "a"); // Save samtools_stderr + char* tempfname_samtools_stdout = (optind < argc)? argv[optind] : "test_parse_args.tmp.o"; + char* tempfname_samtools_stderr = (optind < argc)? 
argv[optind] : "test_parse_args.tmp.e"; + FILE* check_samtools_stdout = NULL; + FILE* check_samtools_stderr = NULL; // Cleanup getopt optind = 1; // setup - if (verbose) fprintf(orig_pysam_stdout,"BEGIN test 1\n"); // test eliminating a tag that isn't there + if (verbose) fprintf(orig_samtools_stdout,"BEGIN test 1\n"); // test eliminating a tag that isn't there int argc_1; char** argv_1; setup_test_1(&argc_1, &argv_1); if (verbose > 1) { - fprintf(orig_pysam_stdout, "argc: %d\n", argc_1); + fprintf(orig_samtools_stdout, "argc: %d\n", argc_1); } - if (verbose) fprintf(orig_pysam_stdout,"RUN test 1\n"); + if (verbose) fprintf(orig_samtools_stdout,"RUN test 1\n"); // test - xfreopen(tempfname_pysam_stdout, "w", pysam_stdout); // Redirect pysam_stdout to pipe - xfreopen(tempfname_pysam_stderr, "w", pysam_stderr); // Redirect pysam_stderr to pipe + xfreopen(tempfname_samtools_stdout, "w", samtools_stdout); // Redirect samtools_stdout to pipe + xfreopen(tempfname_samtools_stderr, "w", samtools_stderr); // Redirect samtools_stderr to pipe parsed_opts_t* result_1 = parse_args(argc_1, argv_1); - fclose(pysam_stdout); - fclose(pysam_stderr); + fclose(samtools_stdout); + fclose(samtools_stderr); - if (verbose) fprintf(orig_pysam_stdout, "END RUN test 1\n"); + if (verbose) fprintf(orig_samtools_stdout, "END RUN test 1\n"); if (verbose > 1) { - fprintf(orig_pysam_stdout, "argc: %d\n", argc_1); + fprintf(orig_samtools_stdout, "argc: %d\n", argc_1); } // check result - res_pysam_stdout.l = res_pysam_stderr.l = 0; - check_pysam_stdout = fopen(tempfname_pysam_stdout, "r"); - check_pysam_stderr = fopen(tempfname_pysam_stderr, "r"); + res_samtools_stdout.l = res_samtools_stderr.l = 0; + check_samtools_stdout = fopen(tempfname_samtools_stdout, "r"); + check_samtools_stderr = fopen(tempfname_samtools_stderr, "r"); if ( !result_1 - && kgetline(&res_pysam_stdout, (kgets_func *)fgets, check_pysam_stdout) >= 0 - && !feof(check_pysam_stdout) - && res_pysam_stdout.l > 0 - && 
kgetline(&res_pysam_stderr, (kgets_func *)fgets, check_pysam_stderr) < 0 - && (feof(check_pysam_stderr) || res_pysam_stderr.l == 0)) { + && kgetline(&res_samtools_stdout, (kgets_func *)fgets, check_samtools_stdout) >= 0 + && !feof(check_samtools_stdout) + && res_samtools_stdout.l > 0 + && kgetline(&res_samtools_stderr, (kgets_func *)fgets, check_samtools_stderr) < 0 + && (feof(check_samtools_stderr) || res_samtools_stderr.l == 0)) { ++success; } else { ++failure; - if (verbose) fprintf(orig_pysam_stdout, "FAIL test 1\n"); + if (verbose) fprintf(orig_samtools_stdout, "FAIL test 1\n"); } - fclose(check_pysam_stderr); - fclose(check_pysam_stdout); + fclose(check_samtools_stderr); + fclose(check_samtools_stdout); // teardown cleanup_opts(result_1); @@ -150,49 +150,49 @@ int samtools_test_parse_args_main(int argc, char**argv) free(argv_1[i]); } free(argv_1); - if (verbose) fprintf(orig_pysam_stdout, "END test 1\n"); + if (verbose) fprintf(orig_samtools_stdout, "END test 1\n"); // Cleanup getopt optind = 1; - if (verbose) fprintf(orig_pysam_stdout, "BEGIN test 2\n"); // test eliminating a tag that is there + if (verbose) fprintf(orig_samtools_stdout, "BEGIN test 2\n"); // test eliminating a tag that is there int argc_2; char** argv_2; setup_test_2(&argc_2, &argv_2); if (verbose > 1) { - fprintf(orig_pysam_stdout, "argc: %d\n", argc_2); + fprintf(orig_samtools_stdout, "argc: %d\n", argc_2); } - if (verbose) fprintf(orig_pysam_stdout, "RUN test 2\n"); + if (verbose) fprintf(orig_samtools_stdout, "RUN test 2\n"); // test - xfreopen(tempfname_pysam_stdout, "w", pysam_stdout); // Redirect pysam_stdout to pipe - xfreopen(tempfname_pysam_stderr, "w", pysam_stderr); // Redirect pysam_stderr to pipe + xfreopen(tempfname_samtools_stdout, "w", samtools_stdout); // Redirect samtools_stdout to pipe + xfreopen(tempfname_samtools_stderr, "w", samtools_stderr); // Redirect samtools_stderr to pipe parsed_opts_t* result_2 = parse_args(argc_2, argv_2); - fclose(pysam_stdout); - 
fclose(pysam_stderr); + fclose(samtools_stdout); + fclose(samtools_stderr); - if (verbose) fprintf(orig_pysam_stdout, "END RUN test 2\n"); + if (verbose) fprintf(orig_samtools_stdout, "END RUN test 2\n"); if (verbose > 1) { - fprintf(orig_pysam_stdout, "argc: %d\n", argc_2); + fprintf(orig_samtools_stdout, "argc: %d\n", argc_2); } // check result - res_pysam_stdout.l = res_pysam_stderr.l = 0; - check_pysam_stdout = fopen(tempfname_pysam_stdout, "r"); - check_pysam_stderr = fopen(tempfname_pysam_stderr, "r"); + res_samtools_stdout.l = res_samtools_stderr.l = 0; + check_samtools_stdout = fopen(tempfname_samtools_stdout, "r"); + check_samtools_stderr = fopen(tempfname_samtools_stderr, "r"); if ( result_2 && check_test_2(result_2) - && kgetline(&res_pysam_stdout, (kgets_func *)fgets, check_pysam_stdout) < 0 - && (feof(check_pysam_stdout) || res_pysam_stdout.l == 0) - && kgetline(&res_pysam_stderr, (kgets_func *)fgets, check_pysam_stderr) < 0 - && (feof(check_pysam_stderr) || res_pysam_stderr.l == 0)) { + && kgetline(&res_samtools_stdout, (kgets_func *)fgets, check_samtools_stdout) < 0 + && (feof(check_samtools_stdout) || res_samtools_stdout.l == 0) + && kgetline(&res_samtools_stderr, (kgets_func *)fgets, check_samtools_stderr) < 0 + && (feof(check_samtools_stderr) || res_samtools_stderr.l == 0)) { ++success; } else { ++failure; - if (verbose) fprintf(orig_pysam_stdout, "FAIL test 2\n"); + if (verbose) fprintf(orig_samtools_stdout, "FAIL test 2\n"); } - fclose(check_pysam_stdout); - fclose(check_pysam_stderr); + fclose(check_samtools_stdout); + fclose(check_samtools_stderr); // teardown cleanup_opts(result_2); @@ -202,18 +202,18 @@ int samtools_test_parse_args_main(int argc, char**argv) } free(argv_2); - if (verbose) fprintf(orig_pysam_stdout, "END test 2\n"); + if (verbose) fprintf(orig_samtools_stdout, "END test 2\n"); // Cleanup - free(res_pysam_stdout.s); - free(res_pysam_stderr.s); - remove(tempfname_pysam_stdout); - remove(tempfname_pysam_stderr); - 
fclose(orig_pysam_stdout); + free(res_samtools_stdout.s); + free(res_samtools_stderr.s); + remove(tempfname_samtools_stdout); + remove(tempfname_samtools_stderr); + fclose(orig_samtools_stdout); if (failure > 0) - fprintf(orig_pysam_stderr, "%d failures %d successes\n", failure, success); - fclose(orig_pysam_stderr); + fprintf(orig_samtools_stderr, "%d failures %d successes\n", failure, success); + fclose(orig_samtools_stderr); return (success == NUM_TESTS)? EXIT_SUCCESS : EXIT_FAILURE; } diff --git a/samtools/test/test.c b/samtools/test/test.c index fb0b549..0b4d585 100644 --- a/samtools/test/test.c +++ b/samtools/test/test.c @@ -53,9 +53,3 @@ void dump_hdr(const bam_hdr_t* hdr) } printf("text: \"%s\"\n", hdr->text); } - -// For tests, just return a constant that can be embedded in expected output. -const char *samtools_version(void) -{ - return "x.y.test"; -} diff --git a/samtools/test/test.c.pysam.c b/samtools/test/test.c.pysam.c index bf460e8..df87fbb 100644 --- a/samtools/test/test.c.pysam.c +++ b/samtools/test/test.c.pysam.c @@ -1,4 +1,4 @@ -#include "pysam.h" +#include "samtools.pysam.h" /* test/test.c -- test harness utility routines. @@ -37,7 +37,7 @@ DEALINGS IN THE SOFTWARE. 
*/ void xfreopen(const char *path, const char *mode, FILE *stream) { if (freopen(path, mode, stream) == NULL) { - fprintf(pysam_stderr, __FILE__": error reopening %s: %s\n", + fprintf(samtools_stderr, __FILE__": error reopening %s: %s\n", path, strerror(errno)); exit(2); } @@ -45,19 +45,13 @@ void xfreopen(const char *path, const char *mode, FILE *stream) void dump_hdr(const bam_hdr_t* hdr) { - fprintf(pysam_stdout, "n_targets: %d\n", hdr->n_targets); - fprintf(pysam_stdout, "ignore_sam_err: %d\n", hdr->ignore_sam_err); - fprintf(pysam_stdout, "l_text: %u\n", hdr->l_text); - fprintf(pysam_stdout, "idx\ttarget_len\ttarget_name:\n"); + fprintf(samtools_stdout, "n_targets: %d\n", hdr->n_targets); + fprintf(samtools_stdout, "ignore_sam_err: %d\n", hdr->ignore_sam_err); + fprintf(samtools_stdout, "l_text: %u\n", hdr->l_text); + fprintf(samtools_stdout, "idx\ttarget_len\ttarget_name:\n"); int32_t target; for (target = 0; target < hdr->n_targets; ++target) { - fprintf(pysam_stdout, "%d\t%u\t\"%s\"\n", target, hdr->target_len[target], hdr->target_name[target]); + fprintf(samtools_stdout, "%d\t%u\t\"%s\"\n", target, hdr->target_len[target], hdr->target_name[target]); } - fprintf(pysam_stdout, "text: \"%s\"\n", hdr->text); -} - -// For tests, just return a constant that can be embedded in expected output. -const char *samtools_version(void) -{ - return "x.y.test"; + fprintf(samtools_stdout, "text: \"%s\"\n", hdr->text); } diff --git a/samtools/test/tview/test_get_rg_sample.c.pysam.c b/samtools/test/tview/test_get_rg_sample.c.pysam.c index 8c441f9..0256876 100644 --- a/samtools/test/tview/test_get_rg_sample.c.pysam.c +++ b/samtools/test/tview/test_get_rg_sample.c.pysam.c @@ -1,4 +1,4 @@ -#include "pysam.h" +#include "samtools.pysam.h" /* test/tview/test_get_rg_sample.c -- tview test cases. 
@@ -79,7 +79,7 @@ int samtools_test_get_rg_sample_main(int argc, char** argv) if (success == NUM_TESTS) { return 0; } else { - fprintf(pysam_stderr, "%d failures %d successes\n", failure, success); + fprintf(samtools_stderr, "%d failures %d successes\n", failure, success); return 1; } } diff --git a/samtools/tmp_file.c b/samtools/tmp_file.c new file mode 100644 index 0000000..85d2822 --- /dev/null +++ b/samtools/tmp_file.c @@ -0,0 +1,507 @@ +/* + tmp_file.c - write to and read from a temporary binary file + for fast storage plus added compression. + + Copyright (C) 2017 Genome Research Ltd. + + Author: Andrew Whitwham + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE +*/ + +#include + +#include +#include +#include +#include +#include +#include +#include + +#ifdef _WIN32 +#include +#endif /* _WIN32 */ + +#include "tmp_file.h" +#include "htslib/sam.h" + + +static void tmp_print_error(tmp_file_t *tmp, const char *fmt, ...) 
{ + va_list argp; + + if (tmp->verbose) { + va_start(argp, fmt); + vfprintf(stderr, fmt, argp); + va_end(argp); + } +} + + +static int tmp_file_init(tmp_file_t *tmp, int verbose) { + tmp->stream = LZ4_createStream(); + tmp->data_size = 0; + tmp->group_size = TMP_SAM_GROUP_SIZE; + tmp->input_size = 0; + tmp->read_size = 0; + tmp->output_size = 0; + tmp->entry_number = 0; + tmp->offset = 0; + tmp->max_data_size = TMP_SAM_MAX_DATA + sizeof(bam1_t); // arbitrary but growable + tmp->ring_buffer_size = TMP_SAM_RING_SIZE; // arbitrary (min 64K) but growable + tmp->comp_buffer_size = LZ4_COMPRESSBOUND(tmp->max_data_size * tmp->group_size); + tmp->data = NULL; + tmp->ring_buffer = malloc(sizeof(uint8_t) * tmp->ring_buffer_size); + tmp->ring_index = tmp->ring_buffer; + tmp->comp_buffer = malloc(tmp->comp_buffer_size); + tmp->verbose = verbose; + tmp->dict = NULL; + tmp->groups_written = 0; + + if (!tmp->ring_buffer || !tmp->comp_buffer || !tmp->stream) { + tmp_print_error(tmp, "[tmp_file] Error: unable to allocate compression buffers.\n"); + return TMP_SAM_MEM_ERROR; + } + + return TMP_SAM_OK; +} + + +/* + * Opens the temp file and initialises memory. + * Verbose mode prints out error messages to stderr. + * Returns 0 on success, a negative number on failure. 
+ */ +int tmp_file_open_write(tmp_file_t *tmp, char *tmp_name, int verbose) { + int ret; + unsigned int count = 1; + const unsigned int max_count = 100000; // more tries than this then something else is wrong + int fd; + + if ((ret = tmp_file_init(tmp, verbose))) { + return ret; + } + + // make space to write extended file name + if ((tmp->name = malloc(strlen(tmp_name) + 7)) == NULL) { + tmp_print_error(tmp, "[tmp_file] Error: unable to allocate memory for %s.\n", tmp_name); + return TMP_SAM_MEM_ERROR; + } + + // make sure temp file has a unique name + while (count < max_count) { + sprintf(tmp->name, "%s.%d", tmp_name, count); + + + #ifdef _WIN32 + if ((fd = _open(tmp->name, O_RDWR|O_CREAT|O_EXCL|O_BINARY|O_TEMPORARY, 0600)) == -1) { + #else + if ((fd = open(tmp->name, O_RDWR|O_CREAT|O_EXCL, 0600)) == -1) { + #endif /* _WIN32 */ + + if (errno != EEXIST) { + tmp_print_error(tmp, "[tmp_file] Error: unable to create tmp file %s.\n", tmp->name); + return TMP_SAM_FILE_ERROR; + } + + count++; + continue; + } + + break; + } + + if (count >= max_count) { + tmp_print_error(tmp, "[tmp_file] Error: unable to create unique temp file.\n"); + return TMP_SAM_FILE_ERROR; + } + + if ((tmp->fp = fdopen(fd, "w+b")) == NULL) { + tmp_print_error(tmp, "[tmp_file] Error: unable to open write file %s.\n", tmp->name); + return TMP_SAM_FILE_ERROR; + } + + #ifndef _WIN32 + unlink(tmp->name); // should auto delete when closed on linux + #endif + + return TMP_SAM_OK; +} + + +/* + * The ring buffer stores precompressionn/post decompression data. LZ4 requires that + * previous data (64K worth) be available for efficient compression. This function grows + * the ring buffer when needed. + * Returns 0 on success, a negative number on failure. 
+ */ +static int tmp_file_grow_ring_buffer(tmp_file_t *tmp, size_t new_size) { + // save the dictionary so lz4 can continue to function + int dict_size = 64 * 1024; // 64K max size + + if (tmp->groups_written) { + // if compression has been done then there is a dictionary to save + + if (tmp->dict == NULL) { + + if ((tmp->dict = malloc(sizeof(char) * dict_size)) == NULL) { + tmp_print_error(tmp, "[tmp_file] Error: unable to allocate memory for compression dictionary.\n"); + return TMP_SAM_MEM_ERROR; + } + } + + if (LZ4_saveDict(tmp->stream, tmp->dict, dict_size) == 0) { + tmp_print_error(tmp, "[tmp_file] Error: unable to save compression dictionary.\n"); + return TMP_SAM_LZ4_ERROR; + } + } + + if ((tmp->ring_buffer = realloc(tmp->ring_buffer, sizeof(char) * new_size)) == NULL) { + tmp_print_error(tmp, "[tmp_file] Error: unable to reallocate ring buffer.\n"); + return TMP_SAM_MEM_ERROR; + } + + tmp->ring_buffer_size = new_size; + + return TMP_SAM_OK; +} + + +/* + * This does the actual compression and writing to disk. On disk format consists of a + * single size_t for the size of the compressed data followed by the data itself. + * Returns 0 on success, a negative number on failure. 
+ */
+static int tmp_file_write_to_file(tmp_file_t *tmp) {
+    size_t comp_size;
+
+    if (tmp->input_size > tmp->max_data_size) {
+        // work out the new sizes first and only commit them once the buffer has
+        // really been enlarged, so a failed realloc() neither loses the old
+        // buffer nor leaves the recorded sizes ahead of the allocation
+        size_t new_max_size = tmp->max_data_size + tmp->input_size + sizeof(bam1_t);
+        size_t new_comp_size = LZ4_COMPRESSBOUND(new_max_size);
+        char *new_comp_buffer = realloc(tmp->comp_buffer, sizeof(char) * new_comp_size);
+
+        if (new_comp_buffer == NULL) {
+            tmp_print_error(tmp, "[tmp_file] Error: unable to reallocate compression buffer.\n");
+            return TMP_SAM_MEM_ERROR;
+        }
+
+        tmp->comp_buffer = new_comp_buffer;
+        tmp->comp_buffer_size = new_comp_size;
+        tmp->max_data_size = new_max_size;
+
+        // make sure the ring buffer is big enough to accommodate the new max_data_size
+        if (tmp->ring_buffer_size < tmp->max_data_size * 5) {
+            int ret;
+            if ((ret = tmp_file_grow_ring_buffer(tmp, tmp->max_data_size * 5))) {
+                return ret;
+            }
+        }
+    }
+
+    // the group gathered by tmp_file_write() starts at offset in the ring buffer
+    tmp->ring_index = tmp->ring_buffer + tmp->offset;
+
+    comp_size = LZ4_compress_fast_continue(tmp->stream, (const char *)tmp->ring_index,
+            tmp->comp_buffer, tmp->input_size, tmp->comp_buffer_size, 1);
+
+    if (comp_size == 0) {
+        tmp_print_error(tmp, "[tmp_file] Error: compression failed.\n");
+        return TMP_SAM_LZ4_ERROR;
+    }
+
+    // on disk: the compressed size followed by the compressed bytes
+    if (fwrite(&comp_size, sizeof(size_t), 1, tmp->fp) < 1) {
+        tmp_print_error(tmp, "[tmp_file] Error: tmp file write size failed.\n");
+        return TMP_SAM_FILE_ERROR;
+    }
+
+    if (fwrite(tmp->comp_buffer, sizeof(char), comp_size, tmp->fp) < comp_size) {
+        tmp_print_error(tmp, "[tmp_file] Error: tmp file write data failed.\n");
+        return TMP_SAM_FILE_ERROR;
+    }
+
+    tmp->offset += tmp->input_size;
+
+    // wrap to the start when less than max_data_size is left in the ring buffer
+    if (tmp->offset >= tmp->ring_buffer_size - tmp->max_data_size)
+        tmp->offset = 0;
+
+    tmp->input_size = 0;
+    tmp->entry_number = 0;
+    tmp->groups_written++;
+
+    return TMP_SAM_OK;
+}
+
+
+/*
+ * Stores an in memory bam structure for writing and if enough are gathered together writes
+ * it to disk. Multiple alignments compress better than single ones though after a certain number
+ * there is a law of diminishing returns.
+ * Returns 0 on success, a negative number on failure.
+ */
+int tmp_file_write(tmp_file_t *tmp, bam1_t *inbam) {
+
+    if ((tmp->input_size + sizeof(bam1_t) + inbam->l_data) >= tmp->ring_buffer_size) {
+        int ret;
+
+        if ((ret = tmp_file_grow_ring_buffer(tmp, (tmp->input_size + sizeof(bam1_t) + inbam->l_data) * 5))) {
+            // the value is a size_t, so print it as an unsigned long
+            tmp_print_error(tmp, "[tmp_file] Error: input line too big. (%lu).\n",
+                (unsigned long)(tmp->input_size + inbam->l_data));
+
+            return ret;
+        }
+    }
+
+    // append after the entries already gathered for the current group
+    tmp->ring_index = tmp->ring_buffer + tmp->offset + tmp->input_size;
+
+    // copy data into the ring buffer: the bam1_t struct followed by its data block
+    memcpy(tmp->ring_index, inbam, sizeof(bam1_t));
+    memcpy(tmp->ring_index + sizeof(bam1_t) , inbam->data, inbam->l_data);
+    tmp->input_size += sizeof(bam1_t) + inbam->l_data;
+    tmp->entry_number++;
+
+    if (tmp->entry_number == tmp->group_size) {
+        // actually write out the data
+        int ret;
+
+        if ((ret = tmp_file_write_to_file(tmp))) {
+            return ret;
+        }
+    }
+
+    return TMP_SAM_OK;
+}
+
+
+/*
+ * Closes the file after writing out any remaining alignments. Adds a size_t 0 to
+ * mark the end of the file. Companion function to tmp_file_open_read below.
+ * Returns 0 on success, a negative number on failure.
+ */
+int tmp_file_close_write(tmp_file_t *tmp) {
+    size_t terminator = 0;
+
+    // flush a partially filled group
+    if (tmp->entry_number) {
+        int ret;
+
+        if ((ret = tmp_file_write_to_file(tmp))) {
+            return ret;
+        }
+    }
+
+    if (fwrite(&terminator, sizeof(size_t), 1, tmp->fp) < 1) {
+        tmp_print_error(tmp, "[tmp_file] Error: tmp file write terminator failed.\n");
+        return TMP_SAM_FILE_ERROR;
+    }
+
+    if (fclose(tmp->fp)) {
+        tmp_print_error(tmp, "[tmp_file] Error: closing tmp file %s failed.\n", tmp->name);
+        return TMP_SAM_FILE_ERROR;
+    }
+
+    LZ4_freeStream(tmp->stream);
+
+    return TMP_SAM_OK;
+}
+
+
+/*
+ * Opens the file for reading. Optionally, if given a pointer to an existing
+ * bam1_t structure, it will free the data entry to prevent memory leaks.
+ * Companion function to tmp_file_close_write above.
+ * Returns 0 on success, a negative number on failure.
+ */
+int tmp_file_open_read(tmp_file_t *tmp, bam1_t *inbam) {
+
+    if ((tmp->fp = fopen(tmp->name, "rb")) == NULL) {
+        tmp_print_error(tmp, "[tmp_file] Error: unable to open read file %s.\n", tmp->name);
+        return TMP_SAM_FILE_ERROR;
+    }
+
+    tmp->dstream = LZ4_createStreamDecode();
+    tmp->offset  = 0;
+    // tmp_file_read() only loads a group from disk when entry_number equals
+    // group_size and the write side leaves entry_number at 0, so start there
+    // exactly as tmp_file_begin_read() does
+    tmp->entry_number = tmp->group_size;
+
+    if (inbam) {
+        free(inbam->data);
+    }
+
+    if (!tmp->dstream) {
+        tmp_print_error(tmp, "[tmp_file] Error: unable to allocate compression stream.\n");
+        return TMP_SAM_MEM_ERROR;
+    }
+
+
+    return TMP_SAM_OK;
+}
+
+
+/*
+ * An alternative to tmp_file_close_write that does the same job without actually
+ * closing the file. Companion function to tmp_file_begin_read below.
+ * Returns 0 on success, a negative number on failure.
+ */
+int tmp_file_end_write(tmp_file_t *tmp) {
+    size_t terminator = 0;
+
+    // flush a partially filled group
+    if (tmp->entry_number) {
+        int ret;
+
+        if ((ret = tmp_file_write_to_file(tmp))) {
+            return ret;
+        }
+    }
+
+    if (fwrite(&terminator, sizeof(size_t), 1, tmp->fp) < 1) {
+        tmp_print_error(tmp, "[tmp_file] Error: tmp file write terminator failed.\n");
+        return TMP_SAM_FILE_ERROR;
+    }
+
+    // the file is read back through the same stream, so a failed flush means
+    // the data is not all there; report it like the other write errors
+    if (fflush(tmp->fp)) {
+        tmp_print_error(tmp, "[tmp_file] Error: tmp file flush failed.\n");
+        return TMP_SAM_FILE_ERROR;
+    }
+
+    LZ4_freeStream(tmp->stream);
+
+    return TMP_SAM_OK;
+}
+
+
+/*
+ * An alternative to tmp_file_open_read but works on an open file.
+ * Companion function to tmp_file_end_write above.
+ * Returns 0 on success, a negative number on failure.
+ */
+int tmp_file_begin_read(tmp_file_t *tmp, bam1_t *inbam) {
+
+    rewind(tmp->fp);
+
+    tmp->dstream = LZ4_createStreamDecode();
+    tmp->offset  = 0;
+    // makes the first tmp_file_read() load a group from disk
+    tmp->entry_number = tmp->group_size;
+
+    if (inbam) {
+        free(inbam->data);
+    }
+
+    if (!tmp->dstream) {
+        tmp_print_error(tmp, "[tmp_file] Error: unable to allocate compression stream.\n");
+        return TMP_SAM_MEM_ERROR;
+    }
+
+    return TMP_SAM_OK;
+}
+
+
+/*
+ * Read the next alignment, either from memory or from disk.
+ * Returns size of entry on success, 0 on end of file or a negative on error.
+ */
+int tmp_file_read(tmp_file_t *tmp, bam1_t *inbam) {
+    int entry_size;
+
+    if (tmp->entry_number == tmp->group_size) {
+        // read more data
+        size_t comp_size;
+        int decomp_size;
+
+        // a missing or zero size marks the end of the file
+        if (fread(&comp_size, sizeof(size_t), 1, tmp->fp) == 0 || comp_size == 0) {
+            return TMP_SAM_OK;
+        }
+
+        if (tmp->offset >= tmp->ring_buffer_size - tmp->max_data_size)
+            tmp->offset = 0;
+
+        tmp->ring_index = tmp->ring_buffer + tmp->offset;
+
+        // fread() never returns more than it was asked for, so a truncated
+        // group can only be detected by testing for inequality
+        if (fread(tmp->comp_buffer, sizeof(char), comp_size, tmp->fp) != comp_size) {
+            tmp_print_error(tmp, "[tmp_file] Error: error reading compressed data.\n");
+            return TMP_SAM_FILE_ERROR;
+        }
+
+        // LZ4 reports failure with a negative value; keep the result signed until
+        // it has been checked, otherwise the error wraps to a huge size_t
+        decomp_size = LZ4_decompress_safe_continue(tmp->dstream, tmp->comp_buffer,
+                (char *)tmp->ring_index, comp_size, tmp->max_data_size);
+
+        if (decomp_size <= 0) {
+            tmp_print_error(tmp, "[tmp_file] Error: decompression failed.\n");
+            return TMP_SAM_LZ4_ERROR;
+        }
+
+        tmp->output_size = decomp_size;
+        tmp->entry_number = 0;
+        tmp->read_size = 0;
+    }
+
+    // each entry is a bam1_t struct followed by l_data bytes of alignment data
+    tmp->ring_index = tmp->ring_buffer + tmp->offset;
+    memcpy(inbam, tmp->ring_index, sizeof(bam1_t));
+
+    if ((unsigned int)inbam->l_data > tmp->data_size) {
+        if ((tmp->data = realloc(tmp->data, sizeof(uint8_t) * inbam->l_data)) == NULL) {
+            tmp_print_error(tmp, "[tmp_file] Error: unable to allocate tmp data memory.\n");
+            return TMP_SAM_MEM_ERROR;
+        }
+
+        tmp->data_size = inbam->l_data;
+    }
+
+    // the data pointer copied from the ring buffer is stale; point the record
+    // at the buffer owned by tmp instead
+    inbam->data = tmp->data;
+    entry_size = sizeof(bam1_t);
+
+    memcpy(inbam->data, tmp->ring_index + entry_size, inbam->l_data);
+    entry_size += inbam->l_data;
+
+    tmp->offset += entry_size;
+    tmp->read_size += entry_size;
+    tmp->entry_number++;
+
+    if (tmp->read_size > tmp->output_size) {
+        // the counters are size_t, so print them as unsigned long
+        tmp_print_error(tmp, "[tmp_file] Error: wrong size of data returned RS:%lu OS:%lu EN:%lu GS:%lu.\n",
+            (unsigned long)tmp->read_size, (unsigned long)tmp->output_size,
+            (unsigned long)tmp->entry_number, (unsigned long)tmp->group_size);
+        return TMP_SAM_LZ4_ERROR;
+    }
+
+    if (tmp->read_size == tmp->output_size && tmp->entry_number != tmp->group_size) {
+        // hopefully the last entries in the read file
+        tmp->entry_number = tmp->group_size;
+    }
+
+    return entry_size;
+}
+
+
+/*
+ * Frees up memory, closes the file and optionally deletes it. Giving this function
+ * pointer to the bam1_t structure used for reading will set its data value to null,
+ * preventing bam_destroy1() from trying to free already freed memory.
+ * Returns 0 on success, a negative number or EOF on failure.
+ */
+int tmp_file_destroy(tmp_file_t *tmp, bam1_t *inbam, int delete) {
+    int ret = 0;
+
+    ret = fclose(tmp->fp);
+
+    // the file is only removed when it was closed cleanly
+    if (delete && ret == 0) {
+        if (unlink(tmp->name)) {
+            tmp_print_error(tmp, "[tmp_file] Error: unable to delete file %s.\n", tmp->name);
+            ret = TMP_SAM_FILE_ERROR;
+        }
+    }
+
+    LZ4_freeStreamDecode(tmp->dstream);
+    free(tmp->ring_buffer);
+    free(tmp->comp_buffer);
+    free(tmp->name);
+    free(tmp->data);
+    free(tmp->dict);
+
+
+    // inbam->data was pointing at tmp->data, which has just been freed
+    if (inbam) {
+        inbam->data = NULL;
+    }
+
+    return ret;
+}
diff --git a/samtools/tmp_file.c.pysam.c b/samtools/tmp_file.c.pysam.c
new file mode 100644
index 0000000..3a00b38
--- /dev/null
+++ b/samtools/tmp_file.c.pysam.c
@@ -0,0 +1,509 @@
+#include "samtools.pysam.h"
+
+/*
+    tmp_file.c - write to and read from a temporary binary file
+    for fast storage plus added compression.
+
+    Copyright (C) 2017 Genome Research Ltd.
+
+    Author: Andrew Whitwham
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE
+*/
+
+#include <config.h>
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdarg.h>
+#include <errno.h>
+#include <string.h>
+#include <unistd.h>
+#include <fcntl.h>
+
+#ifdef _WIN32
+#include <io.h>
+#endif /* _WIN32 */
+
+#include "tmp_file.h"
+#include "htslib/sam.h"
+
+
+/*
+ * printf-style error reporting to samtools_stderr.
+ * Prints nothing unless tmp->verbose is set.
+ */
+static void tmp_print_error(tmp_file_t *tmp, const char *fmt, ...) {
+    va_list argp;
+
+    if (tmp->verbose) {
+        va_start(argp, fmt);
+        vfprintf(samtools_stderr, fmt, argp);
+        va_end(argp);
+    }
+}
+
+
+/*
+ * Sets the counters and sizes of tmp to their starting values and allocates the
+ * LZ4 stream, the ring buffer and the compression buffer.
+ * Returns 0 on success, TMP_SAM_MEM_ERROR if any allocation fails.
+ */
+static int tmp_file_init(tmp_file_t *tmp, int verbose) {
+    tmp->stream = LZ4_createStream();
+    tmp->data_size = 0;
+    tmp->group_size = TMP_SAM_GROUP_SIZE;
+    tmp->input_size = 0;
+    tmp->read_size = 0;
+    tmp->output_size = 0;
+    tmp->entry_number = 0;
+    tmp->offset = 0;
+    tmp->max_data_size = TMP_SAM_MAX_DATA + sizeof(bam1_t); // arbitrary but growable
+    tmp->ring_buffer_size = TMP_SAM_RING_SIZE; // arbitrary (min 64K) but growable
+    tmp->comp_buffer_size = LZ4_COMPRESSBOUND(tmp->max_data_size * tmp->group_size);
+    tmp->data = NULL;
+    tmp->ring_buffer = malloc(sizeof(uint8_t) * tmp->ring_buffer_size);
+    tmp->ring_index = tmp->ring_buffer;
+    tmp->comp_buffer = malloc(tmp->comp_buffer_size);
+    tmp->verbose = verbose; // must be set before tmp_print_error() is used below
+    tmp->dict = NULL;
+    tmp->groups_written = 0;
+
+    if (!tmp->ring_buffer || !tmp->comp_buffer || !tmp->stream) {
+        tmp_print_error(tmp, "[tmp_file] Error: unable to allocate compression buffers.\n");
+        return TMP_SAM_MEM_ERROR;
+    }
+
+    return TMP_SAM_OK;
+}
+
+
+/*
+ * Opens the temp file and initialises memory.
+ * Verbose mode prints out error messages to samtools_stderr.
+ * Returns 0 on success, a negative number on failure.
+ */
+int tmp_file_open_write(tmp_file_t *tmp, char *tmp_name, int verbose) {
+    int ret;
+    unsigned int count = 1;
+    const unsigned int max_count = 100000; // more tries than this then something else is wrong
+    int fd;
+
+    if ((ret = tmp_file_init(tmp, verbose))) {
+        return ret;
+    }
+
+    // make space to write extended file name: '.', up to five digits
+    // (count stays below max_count) and the terminating NUL
+    if ((tmp->name = malloc(strlen(tmp_name) + 7)) == NULL) {
+        tmp_print_error(tmp, "[tmp_file] Error: unable to allocate memory for %s.\n", tmp_name);
+        return TMP_SAM_MEM_ERROR;
+    }
+
+    // make sure temp file has a unique name
+    while (count < max_count) {
+        // count is unsigned, so it is formatted with %u
+        sprintf(tmp->name, "%s.%u", tmp_name, count);
+
+
+        #ifdef _WIN32
+        if ((fd = _open(tmp->name, O_RDWR|O_CREAT|O_EXCL|O_BINARY|O_TEMPORARY, 0600)) == -1) {
+        #else
+        if ((fd = open(tmp->name, O_RDWR|O_CREAT|O_EXCL, 0600)) == -1) {
+        #endif /* _WIN32 */
+
+            if (errno != EEXIST) {
+                tmp_print_error(tmp, "[tmp_file] Error: unable to create tmp file %s.\n", tmp->name);
+                return TMP_SAM_FILE_ERROR;
+            }
+
+            // the name is already taken, try the next suffix
+            count++;
+            continue;
+        }
+
+        break;
+    }
+
+    if (count >= max_count) {
+        tmp_print_error(tmp, "[tmp_file] Error: unable to create unique temp file.\n");
+        return TMP_SAM_FILE_ERROR;
+    }
+
+    if ((tmp->fp = fdopen(fd, "w+b")) == NULL) {
+        tmp_print_error(tmp, "[tmp_file] Error: unable to open write file %s.\n", tmp->name);
+        close(fd); // fdopen() leaves the descriptor open on failure
+        return TMP_SAM_FILE_ERROR;
+    }
+
+    #ifndef _WIN32
+    unlink(tmp->name); // should auto delete when closed on linux
+    #endif
+
+    return TMP_SAM_OK;
+}
+
+
+/*
+ * The ring buffer stores precompression/post decompression data. LZ4 requires that
+ * previous data (64K worth) be available for efficient compression. This function grows
+ * the ring buffer when needed.
+ * Returns 0 on success, a negative number on failure.
+ */
+static int tmp_file_grow_ring_buffer(tmp_file_t *tmp, size_t new_size) {
+    // save the dictionary so lz4 can continue to function
+    int dict_size = 64 * 1024; // 64K max size
+    uint8_t *new_buffer;
+
+    if (tmp->groups_written) {
+        // if compression has been done then there is a dictionary to save
+
+        if (tmp->dict == NULL) {
+
+            if ((tmp->dict = malloc(sizeof(char) * dict_size)) == NULL) {
+                tmp_print_error(tmp, "[tmp_file] Error: unable to allocate memory for compression dictionary.\n");
+                return TMP_SAM_MEM_ERROR;
+            }
+        }
+
+        if (LZ4_saveDict(tmp->stream, tmp->dict, dict_size) == 0) {
+            tmp_print_error(tmp, "[tmp_file] Error: unable to save compression dictionary.\n");
+            return TMP_SAM_LZ4_ERROR;
+        }
+    }
+
+    // realloc() leaves the old block untouched on failure, so only replace the
+    // stored pointer once the call has succeeded; otherwise the existing buffer
+    // would be lost and could no longer be freed
+    if ((new_buffer = realloc(tmp->ring_buffer, sizeof(char) * new_size)) == NULL) {
+        tmp_print_error(tmp, "[tmp_file] Error: unable to reallocate ring buffer.\n");
+        return TMP_SAM_MEM_ERROR;
+    }
+
+    tmp->ring_buffer = new_buffer;
+    tmp->ring_buffer_size = new_size;
+
+    return TMP_SAM_OK;
+}
+
+
+/*
+ * This does the actual compression and writing to disk. On disk format consists of a
+ * single size_t for the size of the compressed data followed by the data itself.
+ * Returns 0 on success, a negative number on failure.
+ */
+static int tmp_file_write_to_file(tmp_file_t *tmp) {
+    size_t comp_size;
+
+    if (tmp->input_size > tmp->max_data_size) {
+        // work out the new sizes first and only commit them once the buffer has
+        // really been enlarged, so a failed realloc() neither loses the old
+        // buffer nor leaves the recorded sizes ahead of the allocation
+        size_t new_max_size = tmp->max_data_size + tmp->input_size + sizeof(bam1_t);
+        size_t new_comp_size = LZ4_COMPRESSBOUND(new_max_size);
+        char *new_comp_buffer = realloc(tmp->comp_buffer, sizeof(char) * new_comp_size);
+
+        if (new_comp_buffer == NULL) {
+            tmp_print_error(tmp, "[tmp_file] Error: unable to reallocate compression buffer.\n");
+            return TMP_SAM_MEM_ERROR;
+        }
+
+        tmp->comp_buffer = new_comp_buffer;
+        tmp->comp_buffer_size = new_comp_size;
+        tmp->max_data_size = new_max_size;
+
+        // make sure the ring buffer is big enough to accommodate the new max_data_size
+        if (tmp->ring_buffer_size < tmp->max_data_size * 5) {
+            int ret;
+            if ((ret = tmp_file_grow_ring_buffer(tmp, tmp->max_data_size * 5))) {
+                return ret;
+            }
+        }
+    }
+
+    // the group gathered by tmp_file_write() starts at offset in the ring buffer
+    tmp->ring_index = tmp->ring_buffer + tmp->offset;
+
+    comp_size = LZ4_compress_fast_continue(tmp->stream, (const char *)tmp->ring_index,
+            tmp->comp_buffer, tmp->input_size, tmp->comp_buffer_size, 1);
+
+    if (comp_size == 0) {
+        tmp_print_error(tmp, "[tmp_file] Error: compression failed.\n");
+        return TMP_SAM_LZ4_ERROR;
+    }
+
+    // on disk: the compressed size followed by the compressed bytes
+    if (fwrite(&comp_size, sizeof(size_t), 1, tmp->fp) < 1) {
+        tmp_print_error(tmp, "[tmp_file] Error: tmp file write size failed.\n");
+        return TMP_SAM_FILE_ERROR;
+    }
+
+    if (fwrite(tmp->comp_buffer, sizeof(char), comp_size, tmp->fp) < comp_size) {
+        tmp_print_error(tmp, "[tmp_file] Error: tmp file write data failed.\n");
+        return TMP_SAM_FILE_ERROR;
+    }
+
+    tmp->offset += tmp->input_size;
+
+    // wrap to the start when less than max_data_size is left in the ring buffer
+    if (tmp->offset >= tmp->ring_buffer_size - tmp->max_data_size)
+        tmp->offset = 0;
+
+    tmp->input_size = 0;
+    tmp->entry_number = 0;
+    tmp->groups_written++;
+
+    return TMP_SAM_OK;
+}
+
+
+/*
+ * Stores an in memory bam structure for writing and if enough are gathered together writes
+ * it to disk. Multiple alignments compress better than single ones though after a certain number
+ * there is a law of diminishing returns.
+ * Returns 0 on success, a negative number on failure.
+ */
+int tmp_file_write(tmp_file_t *tmp, bam1_t *inbam) {
+
+    if ((tmp->input_size + sizeof(bam1_t) + inbam->l_data) >= tmp->ring_buffer_size) {
+        int ret;
+
+        if ((ret = tmp_file_grow_ring_buffer(tmp, (tmp->input_size + sizeof(bam1_t) + inbam->l_data) * 5))) {
+            // the value is a size_t, so print it as an unsigned long
+            tmp_print_error(tmp, "[tmp_file] Error: input line too big. (%lu).\n",
+                (unsigned long)(tmp->input_size + inbam->l_data));
+
+            return ret;
+        }
+    }
+
+    // append after the entries already gathered for the current group
+    tmp->ring_index = tmp->ring_buffer + tmp->offset + tmp->input_size;
+
+    // copy data into the ring buffer: the bam1_t struct followed by its data block
+    memcpy(tmp->ring_index, inbam, sizeof(bam1_t));
+    memcpy(tmp->ring_index + sizeof(bam1_t) , inbam->data, inbam->l_data);
+    tmp->input_size += sizeof(bam1_t) + inbam->l_data;
+    tmp->entry_number++;
+
+    if (tmp->entry_number == tmp->group_size) {
+        // actually write out the data
+        int ret;
+
+        if ((ret = tmp_file_write_to_file(tmp))) {
+            return ret;
+        }
+    }
+
+    return TMP_SAM_OK;
+}
+
+
+/*
+ * Closes the file after writing out any remaining alignments. Adds a size_t 0 to
+ * mark the end of the file. Companion function to tmp_file_open_read below.
+ * Returns 0 on success, a negative number on failure.
+ */
+int tmp_file_close_write(tmp_file_t *tmp) {
+    size_t terminator = 0;
+
+    // flush a partially filled group
+    if (tmp->entry_number) {
+        int ret;
+
+        if ((ret = tmp_file_write_to_file(tmp))) {
+            return ret;
+        }
+    }
+
+    if (fwrite(&terminator, sizeof(size_t), 1, tmp->fp) < 1) {
+        tmp_print_error(tmp, "[tmp_file] Error: tmp file write terminator failed.\n");
+        return TMP_SAM_FILE_ERROR;
+    }
+
+    if (fclose(tmp->fp)) {
+        tmp_print_error(tmp, "[tmp_file] Error: closing tmp file %s failed.\n", tmp->name);
+        return TMP_SAM_FILE_ERROR;
+    }
+
+    LZ4_freeStream(tmp->stream);
+
+    return TMP_SAM_OK;
+}
+
+
+/*
+ * Opens the file for reading. Optionally, if given a pointer to an existing
+ * bam1_t structure, it will free the data entry to prevent memory leaks.
+ * Companion function to tmp_file_close_write above.
+ * Returns 0 on success, a negative number on failure.
+ */
+int tmp_file_open_read(tmp_file_t *tmp, bam1_t *inbam) {
+
+    if ((tmp->fp = fopen(tmp->name, "rb")) == NULL) {
+        tmp_print_error(tmp, "[tmp_file] Error: unable to open read file %s.\n", tmp->name);
+        return TMP_SAM_FILE_ERROR;
+    }
+
+    tmp->dstream = LZ4_createStreamDecode();
+    tmp->offset  = 0;
+    // tmp_file_read() only loads a group from disk when entry_number equals
+    // group_size and the write side leaves entry_number at 0, so start there
+    // exactly as tmp_file_begin_read() does
+    tmp->entry_number = tmp->group_size;
+
+    if (inbam) {
+        free(inbam->data);
+    }
+
+    if (!tmp->dstream) {
+        tmp_print_error(tmp, "[tmp_file] Error: unable to allocate compression stream.\n");
+        return TMP_SAM_MEM_ERROR;
+    }
+
+
+    return TMP_SAM_OK;
+}
+
+
+/*
+ * An alternative to tmp_file_close_write that does the same job without actually
+ * closing the file. Companion function to tmp_file_begin_read below.
+ * Returns 0 on success, a negative number on failure.
+ */
+int tmp_file_end_write(tmp_file_t *tmp) {
+    size_t terminator = 0;
+
+    // flush a partially filled group
+    if (tmp->entry_number) {
+        int ret;
+
+        if ((ret = tmp_file_write_to_file(tmp))) {
+            return ret;
+        }
+    }
+
+    if (fwrite(&terminator, sizeof(size_t), 1, tmp->fp) < 1) {
+        tmp_print_error(tmp, "[tmp_file] Error: tmp file write terminator failed.\n");
+        return TMP_SAM_FILE_ERROR;
+    }
+
+    // the file is read back through the same stream, so a failed flush means
+    // the data is not all there; report it like the other write errors
+    if (fflush(tmp->fp)) {
+        tmp_print_error(tmp, "[tmp_file] Error: tmp file flush failed.\n");
+        return TMP_SAM_FILE_ERROR;
+    }
+
+    LZ4_freeStream(tmp->stream);
+
+    return TMP_SAM_OK;
+}
+
+
+/*
+ * An alternative to tmp_file_open_read but works on an open file.
+ * Companion function to tmp_file_end_write above.
+ * Returns 0 on success, a negative number on failure.
+ */
+int tmp_file_begin_read(tmp_file_t *tmp, bam1_t *inbam) {
+
+    rewind(tmp->fp);
+
+    tmp->dstream = LZ4_createStreamDecode();
+    tmp->offset  = 0;
+    // makes the first tmp_file_read() load a group from disk
+    tmp->entry_number = tmp->group_size;
+
+    if (inbam) {
+        free(inbam->data);
+    }
+
+    if (!tmp->dstream) {
+        tmp_print_error(tmp, "[tmp_file] Error: unable to allocate compression stream.\n");
+        return TMP_SAM_MEM_ERROR;
+    }
+
+    return TMP_SAM_OK;
+}
+
+
+/*
+ * Read the next alignment, either from memory or from disk.
+ * Returns size of entry on success, 0 on end of file or a negative on error.
+ */
+int tmp_file_read(tmp_file_t *tmp, bam1_t *inbam) {
+    int entry_size;
+
+    if (tmp->entry_number == tmp->group_size) {
+        // read more data
+        size_t comp_size;
+        int decomp_size;
+
+        // a missing or zero size marks the end of the file
+        if (fread(&comp_size, sizeof(size_t), 1, tmp->fp) == 0 || comp_size == 0) {
+            return TMP_SAM_OK;
+        }
+
+        if (tmp->offset >= tmp->ring_buffer_size - tmp->max_data_size)
+            tmp->offset = 0;
+
+        tmp->ring_index = tmp->ring_buffer + tmp->offset;
+
+        // fread() never returns more than it was asked for, so a truncated
+        // group can only be detected by testing for inequality
+        if (fread(tmp->comp_buffer, sizeof(char), comp_size, tmp->fp) != comp_size) {
+            tmp_print_error(tmp, "[tmp_file] Error: error reading compressed data.\n");
+            return TMP_SAM_FILE_ERROR;
+        }
+
+        // LZ4 reports failure with a negative value; keep the result signed until
+        // it has been checked, otherwise the error wraps to a huge size_t
+        decomp_size = LZ4_decompress_safe_continue(tmp->dstream, tmp->comp_buffer,
+                (char *)tmp->ring_index, comp_size, tmp->max_data_size);
+
+        if (decomp_size <= 0) {
+            tmp_print_error(tmp, "[tmp_file] Error: decompression failed.\n");
+            return TMP_SAM_LZ4_ERROR;
+        }
+
+        tmp->output_size = decomp_size;
+        tmp->entry_number = 0;
+        tmp->read_size = 0;
+    }
+
+    // each entry is a bam1_t struct followed by l_data bytes of alignment data
+    tmp->ring_index = tmp->ring_buffer + tmp->offset;
+    memcpy(inbam, tmp->ring_index, sizeof(bam1_t));
+
+    if ((unsigned int)inbam->l_data > tmp->data_size) {
+        if ((tmp->data = realloc(tmp->data, sizeof(uint8_t) * inbam->l_data)) == NULL) {
+            tmp_print_error(tmp, "[tmp_file] Error: unable to allocate tmp data memory.\n");
+            return TMP_SAM_MEM_ERROR;
+        }
+
+        tmp->data_size = inbam->l_data;
+    }
+
+    // the data pointer copied from the ring buffer is stale; point the record
+    // at the buffer owned by tmp instead
+    inbam->data = tmp->data;
+    entry_size = sizeof(bam1_t);
+
+    memcpy(inbam->data, tmp->ring_index + entry_size, inbam->l_data);
+    entry_size += inbam->l_data;
+
+    tmp->offset += entry_size;
+    tmp->read_size += entry_size;
+    tmp->entry_number++;
+
+    if (tmp->read_size > tmp->output_size) {
+        // the counters are size_t, so print them as unsigned long
+        tmp_print_error(tmp, "[tmp_file] Error: wrong size of data returned RS:%lu OS:%lu EN:%lu GS:%lu.\n",
+            (unsigned long)tmp->read_size, (unsigned long)tmp->output_size,
+            (unsigned long)tmp->entry_number, (unsigned long)tmp->group_size);
+        return TMP_SAM_LZ4_ERROR;
+    }
+
+    if (tmp->read_size == tmp->output_size && tmp->entry_number != tmp->group_size) {
+        // hopefully the last entries in the read file
+        tmp->entry_number = tmp->group_size;
+    }
+
+    return entry_size;
+}
+
+
+/*
+ * Frees up memory, closes the file and optionally deletes it. Giving this function
+ * pointer to the bam1_t structure used for reading will set its data value to null,
+ * preventing bam_destroy1() from trying to free already freed memory.
+ * Returns 0 on success, a negative number or EOF on failure.
+ */
+int tmp_file_destroy(tmp_file_t *tmp, bam1_t *inbam, int delete) {
+    int ret = 0;
+
+    ret = fclose(tmp->fp);
+
+    // the file is only removed when it was closed cleanly
+    if (delete && ret == 0) {
+        if (unlink(tmp->name)) {
+            tmp_print_error(tmp, "[tmp_file] Error: unable to delete file %s.\n", tmp->name);
+            ret = TMP_SAM_FILE_ERROR;
+        }
+    }
+
+    LZ4_freeStreamDecode(tmp->dstream);
+    free(tmp->ring_buffer);
+    free(tmp->comp_buffer);
+    free(tmp->name);
+    free(tmp->data);
+    free(tmp->dict);
+
+
+    // inbam->data was pointing at tmp->data, which has just been freed
+    if (inbam) {
+        inbam->data = NULL;
+    }
+
+    return ret;
+}
diff --git a/samtools/tmp_file.h b/samtools/tmp_file.h
new file mode 100644
index 0000000..74e4126
--- /dev/null
+++ b/samtools/tmp_file.h
@@ -0,0 +1,144 @@
+/*
+    tmp_file.h - write to and read from a temporary binary file
+    for fast storage plus added compression.
+
+    Copyright (C) 2017 Genome Research Ltd.
+
+    Author: Andrew Whitwham
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE
+*/
+
+#ifndef _TMP_SAM_FILE_H_
+#define _TMP_SAM_FILE_H_
+
+#include <lz4.h>
+#include "htslib/sam.h"
+
+// must be the same macro as the one guarding the closing brace at the end of
+// this header, otherwise a C++ compiler sees an unmatched '}'
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Group size that seems to give reasonable compression.
+#define TMP_SAM_GROUP_SIZE 100
+
+// Arbitrary initial size values but growable.
+#define TMP_SAM_MAX_DATA 1024
+#define TMP_SAM_RING_SIZE 1048576
+
+// Error numbers.
+#define TMP_SAM_OK 0
+#define TMP_SAM_MEM_ERROR -1
+#define TMP_SAM_FILE_ERROR -2
+#define TMP_SAM_LZ4_ERROR -3
+#define TMP_SAM_INPUT_ERROR -4
+
+typedef struct {
+    FILE *fp;                    // the temporary file
+    LZ4_stream_t *stream;        // compression state used while writing
+    LZ4_streamDecode_t *dstream; // decompression state used while reading
+    size_t data_size;            // allocated size of data
+    size_t max_data_size;        // largest uncompressed group size catered for
+    size_t ring_buffer_size;     // allocated size of ring_buffer
+    size_t comp_buffer_size;     // allocated size of comp_buffer
+    size_t offset;               // current position within ring_buffer
+    uint8_t *data;               // buffer handed out as inbam->data by tmp_file_read
+    uint8_t *ring_buffer;        // uncompressed data, before compression or after decompression
+    uint8_t *ring_index;         // working pointer into ring_buffer
+    char *comp_buffer;           // compressed data on its way to or from the file
+    char *name;                  // name of the temporary file
+    size_t group_size;           // number of alignments compressed together
+    size_t input_size;           // bytes gathered for the group being written
+    size_t read_size;            // bytes consumed from the group being read
+    size_t output_size;          // decompressed size of the group being read
+    size_t entry_number;         // alignments written to or read from the current group
+    int verbose;                 // non-zero to print error messages
+    char *dict;                  // LZ4 dictionary saved when the ring buffer is grown
+    size_t groups_written;       // number of groups compressed so far
+} tmp_file_t;
+
+
+/*
+ * Opens the temp file and initialises memory.
+ * Verbose mode prints out error messages to stderr.
+ * Returns 0 on success, a negative number on failure.
+ */
+int tmp_file_open_write(tmp_file_t *tmp, char *tmp_name, int verbose);
+
+
+/*
+ * Stores an in memory bam structure for writing and if enough are gathered together writes
+ * it to disk. Multiple alignments compress better than single ones though after a certain number
+ * there is a law of diminishing returns.
+ * Returns 0 on success, a negative number on failure.
+ */
+int tmp_file_write(tmp_file_t *tmp, bam1_t *inbam);
+
+
+/*
+ * Closes the file after writing out any remaining alignments. Adds a size_t 0 to
+ * mark the end of the file.
Companion function to tmp_file_open_read below.
+ * Returns 0 on success, a negative number on failure.
+ */
+int tmp_file_close_write(tmp_file_t *tmp);
+
+
+/*
+ * Opens the file for reading. Optionally, if given a pointer to an existing
+ * bam1_t structure, it will free the data entry to prevent memory leaks.
+ * Companion function to tmp_file_close_write above.
+ * Returns 0 on success, a negative number on failure.
+ */
+int tmp_file_open_read(tmp_file_t *tmp, bam1_t *inbam);
+
+
+/*
+ * An alternative to tmp_file_close_write that does the same job without actually
+ * closing the file. Companion function to tmp_file_begin_read below.
+ * Returns 0 on success, a negative number on failure.
+ */
+int tmp_file_end_write(tmp_file_t *tmp);
+
+/*
+ * An alternative to tmp_file_open_read but works on an open file.
+ * Companion function to tmp_file_end_write above.
+ * Returns 0 on success, a negative number on failure.
+ */
+int tmp_file_begin_read(tmp_file_t *tmp, bam1_t *inbam);
+
+/*
+ * Read the next alignment, either from memory or from disk.
+ * On success inbam->data points at a buffer owned by tmp that is reused by later
+ * reads and freed by tmp_file_destroy, so the caller must not free it.
+ * Returns size of entry on success, 0 on end of file or a negative on error.
+ */
+int tmp_file_read(tmp_file_t *tmp, bam1_t *inbam);
+
+
+/*
+ * Frees up memory, closes the file and optionally deletes it. Giving this function
+ * pointer to the bam1_t structure used for reading will set its data value to null,
+ * preventing bam_destroy1() from trying to free already freed memory.
+ * Returns 0 on success, a negative number or EOF on failure.
+ */ +int tmp_file_destroy(tmp_file_t *tmp, bam1_t *inbam, int delete); + +#ifdef __cplusplus +} +#endif + +#endif /* _TMP_SAM_FILE_H_ */ diff --git a/samtools/version.h b/samtools/version.h index e74ad87..9dcb73f 100644 --- a/samtools/version.h +++ b/samtools/version.h @@ -1 +1 @@ -#define SAMTOOLS_VERSION "1.6" +#define SAMTOOLS_VERSION "1.7" diff --git a/setup.cfg b/setup.cfg index 5cb6c3f..1f061e5 100644 --- a/setup.cfg +++ b/setup.cfg @@ -6,3 +6,5 @@ universal = 0 # -v: verbose output addopts = -s -v testpaths = pysam tests +pep8maxlinelength = 120 +pep8ignore = E402 diff --git a/setup.py b/setup.py index 608badb..49261f9 100644 --- a/setup.py +++ b/setup.py @@ -13,7 +13,7 @@ This module provides a low-level wrapper around the htslib C-API as using cython and a high-level API for convenient access to the data within standard genomic file formats. -The current version wraps htslib-1.4.1, samtools-1.4.1 and bcftools-1.4.1. +The current version wraps htslib-1.7, samtools-1.7 and bcftools-1.6. See: http://www.htslib.org @@ -32,6 +32,12 @@ import sys import sysconfig from contextlib import contextmanager from setuptools import Extension, setup +from cy_build import CyExtension as Extension, cy_build_ext as build_ext +try: + import cython + HAVE_CYTHON = True +except ImportError: + HAVE_CYTHON = False IS_PYTHON3 = sys.version_info.major >= 3 @@ -109,17 +115,23 @@ def distutils_dir_name(dname): platform=sysconfig.get_platform(), version=sys.version_info) + +def get_pysam_version(): + sys.path.insert(0, "pysam") + import version + return version.__version__ + + # How to link against HTSLIB -# separate: use included htslib and include in each extension -# module. No dependencies between modules and works -# with setup.py install, but wasteful in terms of -# memory and compilation time. -# shared: share chtslib across extension modules. This would be -# the ideal method, but currently requires -# LD_LIBRARY_PATH to be set correctly when using -# pysam. 
+# shared: build shared chtslib from builtin htslib code. # external: use shared libhts.so compiled outside of # pysam +# separate: use included htslib and include in each extension +# module. No dependencies between modules and works with +# setup.py install, but wasteful in terms of memory and +# compilation time. Fallback if shared module compilation +# fails. + HTSLIB_MODE = os.environ.get("HTSLIB_MODE", "shared") HTSLIB_LIBRARY_DIR = os.environ.get("HTSLIB_LIBRARY_DIR", None) HTSLIB_INCLUDE_DIR = os.environ.get("HTSLIB_INCLUDE_DIR", None) @@ -141,40 +153,30 @@ package_dirs = {'pysam': 'pysam', config_headers = ["samtools/config.h", "bcftools/config.h"] -from cy_build import CyExtension as Extension, cy_build_ext as build_ext - cmdclass = {'build_ext': build_ext} -# Check if cython is available -# # If cython is available, the pysam will be built using cython from # the .pyx files. If no cython is available, the C-files included in the # distribution will be used. -try: - import cython - HAVE_CYTHON = True +if HAVE_CYTHON: print ("# pysam: cython is available - using cythonize if necessary") source_pattern = "pysam/libc%s.pyx" -except ImportError: - HAVE_CYTHON = False +else: print ("# pysam: no cython available - using pre-compiled C") - # no Cython available - use existing C code source_pattern = "pysam/libc%s.c" -# collect pysam version -sys.path.insert(0, "pysam") -import version -version = version.__version__ +# Exit if there are no pre-compiled files and no cython available +fn = source_pattern % "htslib" +if not os.path.exists(fn): + raise ValueError( + "no cython installed, but can not find {}." 
+ "Make sure that cython is installed when building " + "from the repository" + .format(fn)) # exclude sources that contain a main function EXCLUDE = { "samtools": ( - "razip.c", "bgzip.c", "main.c", - "calDepth.c", "bam2bed.c", "wgsim.c", - "md5fa.c", "md5sum-lite.c", "maq2sam.c", - "bamcheck.c", "chk_indel.c", "vcf-miniview.c", - "htslib-1.3", # do not import twice - "hfile_irods.c", # requires irods library ), "bcftools": ( "test", "plugins", "peakfit.c", @@ -183,8 +185,9 @@ EXCLUDE = { "reheader.c", "polysomy.c"), "htslib": ( - 'htslib/tabix.c', 'htslib/bgzip.c', - 'htslib/htsfile.c', 'htslib/hfile_irods.c'), + 'htslib/tabix.c', + 'htslib/bgzip.c', + 'htslib/htsfile.c'), } print ("# pysam: htslib mode is {}".format(HTSLIB_MODE)) @@ -242,7 +245,6 @@ if HTSLIB_LIBRARY_DIR: htslib_library_dirs = [HTSLIB_LIBRARY_DIR] htslib_include_dirs = [HTSLIB_INCLUDE_DIR] external_htslib_libraries = ['z', 'hts'] - elif HTSLIB_MODE == 'separate': # add to each pysam component a separately compiled # htslib @@ -250,7 +252,6 @@ elif HTSLIB_MODE == 'separate': shared_htslib_sources = htslib_sources htslib_library_dirs = [] htslib_include_dirs = ['htslib'] - elif HTSLIB_MODE == 'shared': # link each pysam component against the same # htslib built from sources included in the pysam @@ -261,20 +262,9 @@ elif HTSLIB_MODE == 'shared': os.path.join("build", distutils_dir_name("lib"), "pysam")] htslib_include_dirs = ['htslib'] - else: raise ValueError("unknown HTSLIB value '%s'" % HTSLIB_MODE) -suffix = sysconfig.get_config_var('EXT_SUFFIX') -if not suffix: - suffix = sysconfig.get_config_var('SO') -internal_htslib_libraries = [os.path.splitext("chtslib{}".format(suffix))[0]] - -internal_tools_libraries = [ - os.path.splitext("csamtools{}".format(suffix))[0], - os.path.splitext("cbcftools{}".format(suffix))[0], - ] - # build config.py with open(os.path.join("pysam", "config.py"), "w") as outf: outf.write('HTSLIB = "{}"\n'.format(HTSLIB_SOURCE)) @@ -307,42 +297,6 @@ for fn in 
config_headers: outf.write( "/* conservative compilation options */\n") -parts = ["samtools", - "bcftools", - "htslib", - "tabix", - "faidx", - "samfile", - "utils", - "alignmentfile", - "tabixproxies", - "vcf", - "bcf"] - -# Exit if there are no pre-compiled files and no cython available -fn = source_pattern % "htslib" -if not os.path.exists(fn): - raise ValueError( - "no cython installed, but can not find {}." - "Make sure that cython is installed when building " - "from the repository" - .format(fn)) - - -####################################################### -classifiers = """ -Development Status :: 3 - Beta -Operating System :: MacOS :: MacOS X -Operating System :: POSIX -Operating System :: POSIX :: Linux -Operating System :: Unix -Programming Language :: Python -Topic :: Scientific/Engineering -Topic :: Scientific/Engineering :: Bioinformatics -""" - -####################################################### - ####################################################### # Windows compatibility - untested if platform.system() == 'Windows': @@ -364,227 +318,115 @@ else: define_macros = [] -samtools_include_dirs = [os.path.abspath("samtools")] - -chtslib = Extension( - "pysam.libchtslib", - [source_pattern % "htslib", - "pysam/htslib_util.c"] + - shared_htslib_sources + - os_c_files, - library_dirs=htslib_library_dirs, - include_dirs=["pysam", "."] + include_os + htslib_include_dirs, - libraries=external_htslib_libraries, - language="c", - extra_compile_args=extra_compile_args, - define_macros=define_macros -) - -# samfile requires functions defined in bam_md.c -# for __advance_samtools method. -# Selected ones have been copied into samfile_utils.c -# Needs to be devolved somehow. 
-csamfile = Extension( - "pysam.libcsamfile", - [source_pattern % "samfile", - "pysam/htslib_util.c", - "pysam/samfile_util.c"] + - htslib_sources + - os_c_files, - library_dirs=htslib_library_dirs, - include_dirs=["pysam", "."] + samtools_include_dirs + include_os + htslib_include_dirs, - libraries=external_htslib_libraries + internal_htslib_libraries, - language="c", - extra_compile_args=extra_compile_args, - define_macros=define_macros -) - -# alignmentfile requires functions defined in bam_md.c -# for __advance_samtools method. -# Selected ones have been copied into samfile_utils.c -# Needs to be devolved somehow. -calignmentfile = Extension( - "pysam.libcalignmentfile", - [source_pattern % "alignmentfile", - "pysam/htslib_util.c", - "pysam/samfile_util.c"] + - htslib_sources + - os_c_files, - library_dirs=htslib_library_dirs, - include_dirs=["pysam"] + samtools_include_dirs + include_os + htslib_include_dirs, - libraries=external_htslib_libraries + internal_htslib_libraries, - language="c", - extra_compile_args=extra_compile_args, - define_macros=define_macros -) - -# alignmentfile requires functions defined in bam_md.c -# for __advance_samtools method. -# Selected ones have been copied into samfile_utils.c -# Needs to be devolved somehow. 
-calignedsegment = Extension( - "pysam.libcalignedsegment", - [source_pattern % "alignedsegment", - "pysam/htslib_util.c", - "pysam/samfile_util.c"] + - htslib_sources + - os_c_files, - library_dirs=htslib_library_dirs, - include_dirs=["pysam", "."] + samtools_include_dirs + include_os + htslib_include_dirs, - libraries=external_htslib_libraries + internal_htslib_libraries, - language="c", - extra_compile_args=extra_compile_args, - define_macros=define_macros -) - -ctabix = Extension( - "pysam.libctabix", - [source_pattern % "tabix", - "pysam/tabix_util.c"] + - htslib_sources + - os_c_files, - library_dirs=["pysam"] + htslib_library_dirs, - include_dirs=["pysam", "."] + include_os + htslib_include_dirs, - libraries=external_htslib_libraries + internal_htslib_libraries, - language="c", - extra_compile_args=extra_compile_args, - define_macros=define_macros -) - - +suffix = sysconfig.get_config_var('EXT_SUFFIX') +if not suffix: + suffix = sysconfig.get_config_var('SO') -cutils = Extension( - "pysam.libcutils", - [source_pattern % "utils", "pysam/pysam_util.c"] + - htslib_sources + - os_c_files, - library_dirs=["pysam"] + htslib_library_dirs, - include_dirs=["pysam", "."] + - include_os + htslib_include_dirs, - libraries=external_htslib_libraries + internal_htslib_libraries + internal_tools_libraries, - language="c", - extra_compile_args=extra_compile_args, - define_macros=define_macros -) - -csamtools = Extension( - "pysam.libcsamtools", - [source_pattern % "samtools"] + - glob.glob(os.path.join("samtools", "*.pysam.c")) + - htslib_sources + - os_c_files, - library_dirs=["pysam"] + htslib_library_dirs, - include_dirs=["pysam", "."] + samtools_include_dirs + - include_os + htslib_include_dirs, - libraries=external_htslib_libraries + internal_htslib_libraries, - language="c", - extra_compile_args=extra_compile_args, - define_macros=define_macros -) - -cbcftools = Extension( - "pysam.libcbcftools", - [source_pattern % "bcftools"] + - glob.glob(os.path.join("bcftools", 
"*.pysam.c")) + - htslib_sources + - os_c_files, - library_dirs=["pysam"] + htslib_library_dirs, - include_dirs=["bcftools", "pysam", "."] + samtools_include_dirs + - include_os + htslib_include_dirs, - libraries=external_htslib_libraries + internal_htslib_libraries, - language="c", - extra_compile_args=extra_compile_args, - define_macros=define_macros -) - -cfaidx = Extension( - "pysam.libcfaidx", - [source_pattern % "faidx"] + - htslib_sources + - os_c_files, - library_dirs=["pysam"] + htslib_library_dirs, - include_dirs=["pysam", "."] + include_os + htslib_include_dirs, - libraries=external_htslib_libraries + internal_htslib_libraries, - language="c", - extra_compile_args=extra_compile_args, - define_macros=define_macros -) - -ctabixproxies = Extension( - "pysam.libctabixproxies", - [source_pattern % "tabixproxies"] + - os_c_files, - library_dirs=htslib_library_dirs, - include_dirs=include_os, - libraries=external_htslib_libraries + internal_htslib_libraries, - language="c", - extra_compile_args=extra_compile_args, - define_macros=define_macros -) - -cvcf = Extension( - "pysam.libcvcf", - [source_pattern % "vcf"] + - os_c_files, - library_dirs=htslib_library_dirs, - include_dirs=["htslib", "."] + include_os + htslib_include_dirs, - libraries=external_htslib_libraries + internal_htslib_libraries, - language="c", - extra_compile_args=extra_compile_args, - define_macros=define_macros -) - -cbcf = Extension( - "pysam.libcbcf", - [source_pattern % "bcf"] + - htslib_sources + - os_c_files, - library_dirs=htslib_library_dirs, - include_dirs=["htslib", "."] + include_os + htslib_include_dirs, - libraries=external_htslib_libraries + internal_htslib_libraries, - language="c", - extra_compile_args=extra_compile_args, - define_macros=define_macros -) - -cbgzf = Extension( - "pysam.libcbgzf", - [source_pattern % "bgzf"] + - htslib_sources + - os_c_files, - library_dirs=htslib_library_dirs, - include_dirs=["htslib", "."] + include_os + htslib_include_dirs, - 
libraries=external_htslib_libraries + internal_htslib_libraries, +internal_htslib_libraries = [ + os.path.splitext("chtslib{}".format(suffix))[0]] +internal_samtools_libraries = [ + os.path.splitext("csamtools{}".format(suffix))[0], + os.path.splitext("cbcftools{}".format(suffix))[0], + ] +internal_pysamutil_libraries = [ + os.path.splitext("cutils{}".format(suffix))[0]] + +libraries_for_pysam_module = external_htslib_libraries + internal_htslib_libraries + internal_pysamutil_libraries + +# Order of modules matters in order to make sure that dependencies are resolved. +# The structures of dependencies is as follows: +# libchtslib: htslib utility functions and htslib itself if builtin is set. +# libcsamtools: samtools code (builtin) +# libcbcftools: bcftools code (builtin) +# libcutils: General utility functions, depends on all of the above +# libcXXX (pysam module): depends on libchtslib and libcutils + +# The list below uses the union of include_dirs and library_dirs for +# reasons of simplicity. 
+ +modules = [ + dict(name="pysam.libchtslib", + sources=[source_pattern % "htslib", "pysam/htslib_util.c"] + shared_htslib_sources + os_c_files, + libraries=external_htslib_libraries), + dict(name="pysam.libcsamtools", + sources=[source_pattern % "samtools"] + glob.glob(os.path.join("samtools", "*.pysam.c")) + + [os.path.join("samtools", "lz4", "lz4.c")] + htslib_sources + os_c_files, + libraries=external_htslib_libraries + internal_htslib_libraries), + dict(name="pysam.libcbcftools", + sources=[source_pattern % "bcftools"] + glob.glob(os.path.join("bcftools", "*.pysam.c")) + htslib_sources + os_c_files, + libraries=external_htslib_libraries + internal_htslib_libraries), + dict(name="pysam.libcutils", + sources=[source_pattern % "utils", "pysam/pysam_util.c"] + htslib_sources + os_c_files, + libraries=external_htslib_libraries + internal_htslib_libraries + internal_samtools_libraries), + dict(name="pysam.libcalignmentfile", + sources=[source_pattern % "alignmentfile"] + htslib_sources + os_c_files, + libraries=libraries_for_pysam_module), + dict(name="pysam.libcsamfile", + sources=[source_pattern % "samfile"] + htslib_sources + os_c_files, + libraries=libraries_for_pysam_module), + dict(name="pysam.libcalignedsegment", + sources=[source_pattern % "alignedsegment"] + htslib_sources + os_c_files, + libraries=libraries_for_pysam_module), + dict(name="pysam.libctabix", + sources=[source_pattern % "tabix"] + htslib_sources + os_c_files, + libraries=libraries_for_pysam_module), + dict(name="pysam.libcfaidx", + sources=[source_pattern % "faidx"] + htslib_sources + os_c_files, + libraries=libraries_for_pysam_module), + dict(name="pysam.libcbcf", + sources=[source_pattern % "bcf"] + htslib_sources + os_c_files, + libraries=libraries_for_pysam_module), + dict(name="pysam.libcbgzf", + sources=[source_pattern % "bgzf"] + htslib_sources + os_c_files, + libraries=libraries_for_pysam_module), + dict(name="pysam.libctabixproxies", + sources=[source_pattern % "tabixproxies"] + 
htslib_sources + os_c_files, + libraries=libraries_for_pysam_module), + dict(name="pysam.libcvcf", + sources=[source_pattern % "vcf"] + htslib_sources + os_c_files, + libraries=libraries_for_pysam_module), +] + +common_options = dict( language="c", extra_compile_args=extra_compile_args, - define_macros=define_macros -) + define_macros=define_macros, + # for out-of-tree compilation, use absolute paths + library_dirs=[os.path.abspath(x) for x in ["pysam"] + htslib_library_dirs], + include_dirs=[os.path.abspath(x) for x in htslib_include_dirs + \ + ["samtools", "samtools/lz4", "bcftools", "pysam", "."] + include_os]) + +# add common options (in python >3.5, could use n = {**a, **b} +for module in modules: + module.update(**common_options) +classifiers = """ +Development Status :: 4 - Beta +Intended Audience :: Science/Research +Intended Audience :: Developers +License :: OSI Approved +Programming Language :: Python +Topic :: Software Development +Topic :: Scientific/Engineering +Operating System :: POSIX +Operating System :: Unix +Operating System :: MacOS +""" + metadata = { 'name': "pysam", - 'version': version, + 'version': get_pysam_version(), 'description': "pysam", 'long_description': __doc__, 'author': "Andreas Heger", 'author_email': "andreas.heger@gmail.com", 'license': "MIT", - 'platforms': "ALL", + 'platforms': ["POSIX", "UNIX", "MacOS"], + 'classifiers': [_f for _f in classifiers.split("\n") if _f], 'url': "https://github.com/pysam-developers/pysam", 'packages': package_list, 'requires': ['cython (>=0.21)'], - 'ext_modules': [chtslib, - csamfile, - calignmentfile, - calignedsegment, - ctabix, - ctabixproxies, - cvcf, - cbcf, - cbgzf, - cfaidx, - csamtools, - cbcftools, - cutils], + 'ext_modules': [Extension(**opts) for opts in modules], 'cmdclass': cmdclass, 'package_dir': package_dirs, 'package_data': {'': ['*.pxd', '*.h'], }, diff --git a/tests/AlignedSegment_bench.py b/tests/AlignedSegment_bench.py new file mode 100644 index 0000000..f5bccd6 --- 
/dev/null +++ b/tests/AlignedSegment_bench.py @@ -0,0 +1,30 @@ +"""Benchmarking module for AlignedSegment functionality""" +import os +import array +import pysam + + +from TestUtils import BAM_DATADIR + + +def set_binary_tag(): + read = pysam.AlignedSegment() + read.set_tag('FZ', array.array('H', range(1000))) + return len(read.get_tag('FZ')) + + +def read_binary_tag(fn): + with pysam.AlignmentFile(fn) as inf: + read = next(inf.fetch()) + return len(read.get_tag('FZ')) + + +def test_set_binary_tag(benchmark): + result = benchmark(set_binary_tag) + assert result == 1000 + + +def test_read_binary_tag(benchmark): + result = benchmark(read_binary_tag, os.path.join( + BAM_DATADIR, "example_btag.bam")) + assert result == 260 diff --git a/tests/AlignedSegment_test.py b/tests/AlignedSegment_test.py index 920ddbc..3327279 100644 --- a/tests/AlignedSegment_test.py +++ b/tests/AlignedSegment_test.py @@ -1,22 +1,32 @@ import os import pysam import unittest +import json import collections +import string import copy import array -from TestUtils import checkFieldEqual, BAM_DATADIR, WORKDIR +from TestUtils import checkFieldEqual, BAM_DATADIR, get_temp_filename, get_temp_context, IS_PYTHON3 +if IS_PYTHON3: + maketrans = str.maketrans +else: + maketrans = string.maketrans class ReadTest(unittest.TestCase): def build_read(self): '''build an example read.''' - a = pysam.AlignedSegment() + header = pysam.AlignmentHeader.from_references( + ["chr1", "chr2"], + [10000000, 10000000]) + + a = pysam.AlignedSegment(header) a.query_name = "read_12345" - a.query_sequence = "ACGT" * 10 + a.query_sequence = "ATGC" * 10 a.flag = 0 a.reference_id = 0 a.reference_start = 20 @@ -36,10 +46,12 @@ class TestAlignedSegment(ReadTest): ''' def testEmpty(self): + a = pysam.AlignedSegment() self.assertEqual(a.query_name, None) self.assertEqual(a.query_sequence, None) - self.assertEqual(pysam.qualities_to_qualitystring(a.query_qualities), None) + self.assertEqual(pysam.qualities_to_qualitystring( + 
a.query_qualities), None) self.assertEqual(a.flag, 0) self.assertEqual(a.reference_id, -1) self.assertEqual(a.mapping_quality, 0) @@ -48,7 +60,7 @@ class TestAlignedSegment(ReadTest): self.assertEqual(a.next_reference_id, -1) self.assertEqual(a.next_reference_start, -1) self.assertEqual(a.template_length, 0) - + def testStrOfEmptyRead(self): a = pysam.AlignedSegment() s = str(a) @@ -75,7 +87,7 @@ class TestAlignedSegment(ReadTest): self.assertFalse(a != b) self.assertFalse(b != a) - b.tid = 2 + b.tid = 1 self.assertFalse(a == b) self.assertFalse(b == a) self.assertTrue(a != b) @@ -85,7 +97,7 @@ class TestAlignedSegment(ReadTest): a = self.build_read() b = self.build_read() self.assertEqual(hash(a), hash(b)) - b.tid = 2 + b.tid = 1 self.assertNotEqual(hash(a), hash(b)) def testUpdate(self): @@ -111,15 +123,15 @@ class TestAlignedSegment(ReadTest): checkFieldEqual(self, a, b) # check seq - b.query_sequence = "ACGT" + b.query_sequence = "ATGC" checkFieldEqual(self, a, b, ("query_sequence", "query_qualities", "query_length")) - b.query_sequence = "ACGT" * 3 + b.query_sequence = "ATGC" * 3 checkFieldEqual(self, a, b, ("query_sequence", "query_qualities", "query_length")) - b.query_sequence = "ACGT" * 10 + b.query_sequence = "ATGC" * 10 checkFieldEqual(self, a, b, ("query_qualities",)) # reset qual @@ -148,23 +160,25 @@ class TestAlignedSegment(ReadTest): ''' a = self.build_read() a.query_sequence = a.query_sequence[5:10] - self.assertEqual(pysam.qualities_to_qualitystring(a.query_qualities), None) + self.assertEqual(pysam.qualities_to_qualitystring( + a.query_qualities), None) a = self.build_read() s = pysam.qualities_to_qualitystring(a.query_qualities) a.query_sequence = a.query_sequence[5:10] a.query_qualities = pysam.qualitystring_to_array(s[5:10]) - self.assertEqual(pysam.qualities_to_qualitystring(a.query_qualities), s[5:10]) + self.assertEqual(pysam.qualities_to_qualitystring( + a.query_qualities), s[5:10]) def testLargeRead(self): '''build an example read.''' a = 
pysam.AlignedSegment() a.query_name = "read_12345" - a.query_sequence = "ACGT" * 200 + a.query_sequence = "ATGC" * 200 a.flag = 0 - a.reference_id = 0 + a.reference_id = -1 a.reference_start = 20 a.mapping_quality = 20 a.cigartuples = ((0, 4 * 200), ) @@ -362,7 +376,7 @@ class TestAlignedSegment(ReadTest): (3, 23, 'A'), (4, 24, 'c'), (None, 25, 'T'), (None, 26, 'T'), (5, 27, 'A'), (6, 28, 'A'), (7, 29, 'A'), (8, 30, 'A')] - ) + ) a.cigarstring = "5M2D2I2M" a.set_tag("MD", "4C^TT2") @@ -373,7 +387,20 @@ class TestAlignedSegment(ReadTest): (None, 25, 'T'), (None, 26, 'T'), (5, None, None), (6, None, None), (7, 27, 'A'), (8, 28, 'A')] - ) + ) + + def test_get_aligned_pairs_with_malformed_MD_tag(self): + + a = self.build_read() + a.query_sequence = "A" * 9 + + # out of range issue, see issue #560 + a.cigarstring = "64M2D85M2S" + a.set_tag("MD", "64^TG86A0") + self.assertRaises( + AssertionError, + a.get_aligned_pairs, + with_seq=True) def test_get_aligned_pairs_skip_reference(self): a = self.build_read() @@ -425,7 +452,6 @@ class TestAlignedSegment(ReadTest): self.assertEqual(a.query_alignment_length, 20) def test_query_length_is_limited(self): - a = self.build_read() a.query_name = "A" * 1 a.query_name = "A" * 251 @@ -436,9 +462,184 @@ class TestAlignedSegment(ReadTest): "query_name", "A" * 252) + def test_header_accessible(self): + a = self.build_read() + self.assertTrue(isinstance(a.header, pysam.AlignmentHeader)) + + def test_bin_values_for_unmapped_reads_ignore_length(self): + a = self.build_read() + # use a long read + a.cigarstring="2000000M" + self.assertEqual(a.bin, 9) + # changing unmapped flag changes bin because length is 0 + a.is_unmapped = True + self.assertTrue(a.is_unmapped) + self.assertEqual(a.bin, 4681) + + # unmapped read without chromosomal location + a.reference_start = -1 + self.assertEqual(a.reference_start, -1) + self.assertEqual(a.bin, 4680) + + def test_bin_values_for_mapped_reads_are_updated(self): + a = self.build_read() + a.pos = 20000 + 
self.assertFalse(a.is_unmapped) + self.assertEqual(a.bin, 4682) + + # updating length updates bin + a.cigarstring="2000000M" + self.assertEqual(a.bin, 9) + + # updating length updates bin + a.cigarstring="20M" + self.assertEqual(a.bin, 4682) + # updating length updates bin + a.reference_start = 2000000 + self.assertEqual(a.bin, 4803) + + +class TestTidMapping(ReadTest): + + def test_reference_name_can_be_set_to_none(self): + a = self.build_read() + a.reference_name = None + self.assertEqual(a.reference_name, None) + self.assertEqual(a.reference_id, -1) + + def test_reference_name_can_be_set_to_asterisk(self): + a = self.build_read() + a.reference_name = "*" + self.assertEqual(a.reference_name, None) + self.assertEqual(a.reference_id, -1) + + def test_reference_name_can_be_set_to_chromosome(self): + a = self.build_read() + a.reference_name = "chr1" + self.assertEqual(a.reference_name, "chr1") + self.assertEqual(a.reference_id, 0) + + def test_reference_name_can_not_be_set_to_unknown_chromosome(self): + a = self.build_read() + self.assertRaises(ValueError, + setattr, + a, + "reference_name", + "chrX") + + def test_tid_can_be_set_to_missing(self): + a = self.build_read() + a.reference_id = -1 + self.assertEqual(a.reference_id, -1) + self.assertEqual(a.reference_name, None) + + def test_tid_can_be_set_to_missing_without_header(self): + a = pysam.AlignedSegment() + a.reference_id = -1 + self.assertEqual(a.reference_id, -1) + self.assertEqual(a.reference_name, None) + + def test_tid_can_be_set_without_header(self): + a = pysam.AlignedSegment() + a.reference_id = 1 + self.assertRaises(ValueError, getattr, a, "reference_name") + + def test_tid_can_be_set_to_chromosome(self): + a = self.build_read() + a.reference_id = 0 + self.assertEqual(a.reference_id, 0) + self.assertEqual(a.reference_name, "chr1") + + def test_tid_can_not_be_set_to_unknown_chromosome(self): + a = self.build_read() + self.assertRaises(ValueError, + setattr, + a, + "reference_id", + 2) + + def 
test_unmapped_tid_is_asterisk_in_output(self): + a = self.build_read() + a.reference_id = -1 + self.assertEqual(a.to_string().split("\t")[2], "*") + + +class TestNextTidMapping(ReadTest): + + def test_next_reference_name_can_be_set_to_none(self): + a = self.build_read() + a.next_reference_name = None + self.assertEqual(a.next_reference_name, None) + self.assertEqual(a.next_reference_id, -1) + + def test_next_reference_name_can_be_set_to_asterisk(self): + a = self.build_read() + a.next_reference_name = "*" + self.assertEqual(a.next_reference_name, None) + self.assertEqual(a.next_reference_id, -1) + + def test_next_reference_name_can_be_set_to_chromosome(self): + a = self.build_read() + a.next_reference_name = "chr1" + self.assertEqual(a.next_reference_name, "chr1") + self.assertEqual(a.next_reference_id, 0) + + def test_next_reference_name_can_not_be_set_to_unknown_chromosome(self): + a = self.build_read() + self.assertRaises(ValueError, + setattr, + a, + "next_reference_name", + "chrX") + + def test_next_tid_can_be_set_to_missing(self): + a = self.build_read() + a.next_reference_id = -1 + self.assertEqual(a.next_reference_id, -1) + self.assertEqual(a.next_reference_name, None) + + def test_next_tid_can_be_set_to_equal(self): + a = self.build_read() + a.reference_name = "chr1" + a.next_reference_name = "=" + self.assertEqual(a.next_reference_id, a.reference_id) + self.assertEqual(a.next_reference_name, a.reference_name) + self.assertEqual(a.to_string().split("\t")[6], "=") + + def test_next_tid_can_be_set_to_missing_without_header(self): + a = pysam.AlignedSegment() + a.next_reference_id = -1 + self.assertEqual(a.next_reference_id, -1) + self.assertEqual(a.next_reference_name, None) + + def test_next_tid_can_be_set_without_header(self): + a = pysam.AlignedSegment() + a.next_reference_id = 1 + self.assertRaises(ValueError, getattr, a, "next_reference_name") + + def test_next_tid_can_be_set_to_chromosome(self): + a = self.build_read() + a.next_reference_id = 0 + 
self.assertEqual(a.next_reference_id, 0) + self.assertEqual(a.next_reference_name, "chr1") + + def test_next_tid_can_not_be_set_to_unknown_chromosome(self): + a = self.build_read() + self.assertRaises(ValueError, + setattr, + a, + "next_reference_id", + 2) + + def test_next_unmapped_tid_is_asterisk_in_output(self): + a = self.build_read() + a.next_reference_id = -1 + self.assertEqual(a.to_string().split("\t")[6], "*") + + class TestCigar(ReadTest): - + def testCigarString(self): r = self.build_read() self.assertEqual(r.cigarstring, "10M1D9M1I20M") @@ -450,16 +651,16 @@ class TestCigar(ReadTest): def testCigar(self): r = self.build_read() - self.assertEqual(r.cigartuples, [(0, 10), (2, 1), (0, 9), (1, 1), (0, 20)]) + self.assertEqual( + r.cigartuples, [(0, 10), (2, 1), (0, 9), (1, 1), (0, 20)]) # unsetting cigar string r.cigartuples = None self.assertEqual(r.cigartuples, None) class TestCigarStats(ReadTest): - + def testStats(self): - a = self.build_read() a.cigarstring = None @@ -563,7 +764,7 @@ class TestTags(ReadTest): read.set_tag, key, array.array(dtype, range(10))) - + def testAddTagsType(self): a = self.build_read() a.tags = None @@ -834,7 +1035,7 @@ class TestTags(ReadTest): tags = [('XC', 85), ('XT', 'M'), ('NM', 5), ('SM', 29), ('AM', 29), ('XM', 1), ('XO', 1), ('XG', 4), ('MD', '37^ACCC29T18'), - ('XA', '5,+11707,36M1I48M,2;21,-48119779,46M1I38M,2;hs37d5,-10060835,40M1D45M,3;5,+11508,36M1I48M,3;hs37d5,+6743812,36M1I48M,3;19,-59118894,46M1I38M,3;4,-191044002,6M1I78M,3;')] + ('XA', '5,+11707,36M1I48M,2;21,-48119779,46M1I38M,2;hs37d5,-10060835,40M1D45M,3;5,+11508,36M1I48M,3;hs37d5,+6743812,36M1I48M,3;19,-59118894,46M1I38M,3;4,-191044002,6M1I78M,3;')] # noqa r.tags = tags r.tags += [("RG", rg)] * 100 @@ -853,17 +1054,15 @@ class TestTags(ReadTest): r = self.build_read() x = -2 r.tags = [("XD", x)] - with pysam.AlignmentFile( - "tests/test.bam", - "wb", - referencenames=("chr1",), - referencelengths = (1000,)) as outf: - outf.write(r) - with 
pysam.AlignmentFile("tests/test.bam") as inf: - r = next(inf) - - self.assertEqual(r.tags, [("XD", x)]) - os.unlink("tests/test.bam") + with get_temp_context("negative_integers.bam") as fn: + with pysam.AlignmentFile(fn, + "wb", + referencenames=("chr1",), + referencelengths=(1000,)) as outf: + outf.write(r) + with pysam.AlignmentFile(fn) as inf: + r = next(inf) + self.assertEqual(r.tags, [("XD", x)]) class TestCopy(ReadTest): @@ -905,7 +1104,7 @@ class TestSetTagGetTag(ReadTest): self.assertEqual(t, alt_value_type) else: self.assertEqual(t, value_type) - + def test_set_tag_with_A(self): self.check_tag('TT', "x", value_type="A") @@ -940,10 +1139,12 @@ class TestSetTagGetTag(ReadTest): self.check_tag('TT', "AE12", value_type="H") def test_set_tag_with_automated_type_detection(self): - self.check_tag('TT', -(1 << 7), value_type=None, alt_value_type="c") - self.check_tag('TT', -(1 << 7) - 1, value_type=None, alt_value_type="s") - self.check_tag('TT', -(1 << 15), value_type=None, alt_value_type="s") - self.check_tag('TT', -(1 << 15) - 1, value_type=None, alt_value_type="i") + self.check_tag('TT', -(1 << 7), value_type=None, alt_value_type="c") + self.check_tag('TT', -(1 << 7) - 1, + value_type=None, alt_value_type="s") + self.check_tag('TT', -(1 << 15), value_type=None, alt_value_type="s") + self.check_tag('TT', -(1 << 15) - 1, + value_type=None, alt_value_type="i") self.check_tag('TT', -(1 << 31), value_type=None, alt_value_type="i") self.assertRaises( ValueError, @@ -952,12 +1153,14 @@ class TestSetTagGetTag(ReadTest): -(1 << 31) - 1, value_type=None, alt_value_type="i") - + self.check_tag('TT', (1 << 8) - 1, value_type=None, alt_value_type="C") self.check_tag('TT', (1 << 8), value_type=None, alt_value_type="S") - self.check_tag('TT', (1 << 16) - 1, value_type=None, alt_value_type="S") + self.check_tag('TT', (1 << 16) - 1, + value_type=None, alt_value_type="S") self.check_tag('TT', (1 << 16), value_type=None, alt_value_type="I") - self.check_tag('TT', (1 << 32) - 1, 
value_type=None, alt_value_type="I") + self.check_tag('TT', (1 << 32) - 1, + value_type=None, alt_value_type="I") self.assertRaises( ValueError, self.check_tag, @@ -978,22 +1181,10 @@ class TestSetTagsGetTag(TestSetTagGetTag): else: self.assertEqual(t, value_type) self.assertEqual(v, value) - - -class TestAsString(unittest.TestCase): - - def testAsString(self): - with open(os.path.join(BAM_DATADIR, "ex2.sam")) as samf: - reference = [x[:-1] for x in samf if not x.startswith("@")] - - with pysam.AlignmentFile( - os.path.join(BAM_DATADIR, "ex2.bam"), "r") as pysamf: - for s, p in zip(reference, pysamf): - self.assertEqual(s, p.tostring(pysamf)) class TestEnums(unittest.TestCase): - + def test_cigar_enums_are_defined(self): self.assertEqual(pysam.CMATCH, 0) self.assertEqual(pysam.CINS, 1) @@ -1021,5 +1212,137 @@ class TestEnums(unittest.TestCase): self.assertEqual(pysam.FSUPPLEMENTARY, 2048) +class TestBuildingReadsWithoutHeader(unittest.TestCase): + + def build_read(self): + '''build an example read, but without header information.''' + + a = pysam.AlignedSegment() + a.query_name = "read_12345" + a.query_sequence = "ATGC" * 10 + a.flag = 0 + a.reference_id = -1 + a.reference_start = 20 + a.mapping_quality = 20 + a.cigartuples = ((0, 10), (2, 1), (0, 9), (1, 1), (0, 20)) + a.next_reference_id = 0 + a.next_reference_start = 200 + a.template_length = 167 + a.query_qualities = pysam.qualitystring_to_array("1234") * 10 + # todo: create tags + return a + + def test_read_can_be_constructed_without_header(self): + read = self.build_read() + self.assertEqual(read.query_name, "read_12345") + + def test_reference_id_can_be_set(self): + read = self.build_read() + read.reference_id = 2 + self.assertEqual(read.reference_id, 2) + + def test_reference_name_is_not_available(self): + read = self.build_read() + self.assertRaises( + ValueError, + setattr, + read, + "reference_name", + "chr2") + + def test_read_can_be_written_to_file(self): + tmpfilename = get_temp_filename(".bam") + 
with pysam.AlignmentFile(tmpfilename, "wb", + reference_names=["chr1", "chr2", "chr3"], + reference_lengths=[1000, 2000, 3000]) as outf: + read = self.build_read() + read.reference_id = 2 + outf.write(read) + + stdout = pysam.samtools.view(tmpfilename) + chromosome = stdout.split("\t")[2] + self.assertEqual(chromosome, "chr3") + os.unlink(tmpfilename) + + +class TestForwardStrandValues(ReadTest): + + def test_sequence_is_complemented(self): + a = self.build_read() + a.is_reverse = False + fwd_seq = a.query_sequence + + rev_seq = fwd_seq.translate(maketrans("ACGTacgtNnXx", "TGCAtgcaNnXx"))[::-1] + self.assertEqual(fwd_seq, a.get_forward_sequence()) + a.is_reverse = True + self.assertEqual(fwd_seq, a.query_sequence) + self.assertEqual(rev_seq, a.get_forward_sequence()) + + def test_qualities_are_complemented(self): + a = self.build_read() + a.is_reverse = False + fwd_qual = a.query_qualities + rev_qual = fwd_qual[::-1] + self.assertEqual(fwd_qual, a.get_forward_qualities()) + a.is_reverse = True + self.assertEqual(fwd_qual, a.query_qualities) + self.assertEqual(rev_qual, a.get_forward_qualities()) + + +class TestExportImport(ReadTest): + + def test_string_export(self): + a = self.build_read() + self.assertEqual(a.to_string(), + "read_12345\t0\tchr1\t21\t20\t10M1D9M1I20M\t=\t201\t167\t" + "ATGCATGCATGCATGCATGCATGCATGCATGCATGCATGC\t1234123412341234123412341234123412341234") + + def test_string_export_import_without_tags(self): + a = self.build_read() + a.tags = [] + b = pysam.AlignedSegment.fromstring(a.to_string(), a.header) + self.assertEqual(a, b) + + def test_string_export_import_with_tags(self): + a = self.build_read() + a.tags = [("XD", 12), ("RF", "abc")] + b = pysam.AlignedSegment.fromstring(a.to_string(), a.header) + self.assertEqual(a, b) + + def test_to_string_without_alignment_file(self): + with open(os.path.join(BAM_DATADIR, "ex2.sam")) as samf: + reference = [x[:-1] for x in samf if not x.startswith("@")] + + with pysam.AlignmentFile( + 
os.path.join(BAM_DATADIR, "ex2.bam"), "r") as pysamf: + for s, p in zip(reference, pysamf): + self.assertEqual(s, p.to_string()) + + def test_dict_export(self): + a = self.build_read() + a.tags = [("XD", 12), ("RF", "abc")] + + self.assertEqual( + a.to_dict(), + json.loads( + '{"name": "read_12345", "flag": "0", "ref_name": "chr1", "ref_pos": "21", ' + '"map_quality": "20", "cigar": "10M1D9M1I20M", "next_ref_name": "=", ' + '"next_ref_pos": "201", "length": "167", ' + '"seq": "ATGCATGCATGCATGCATGCATGCATGCATGCATGCATGC", ' + '"qual": "1234123412341234123412341234123412341234", "tags": ["XD:i:12", "RF:Z:abc"]}')) + + def test_string_export_import_without_tags(self): + a = self.build_read() + a.tags = [] + b = pysam.AlignedSegment.from_dict(a.to_dict(), a.header) + self.assertEqual(a, b) + + def test_string_export_import_with_tags(self): + a = self.build_read() + a.tags = [("XD", 12), ("RF", "abc")] + b = pysam.AlignedSegment.from_dict(a.to_dict(), a.header) + self.assertEqual(a, b) + + if __name__ == "__main__": unittest.main() diff --git a/tests/AlignmentFileFetchTestUtils.py b/tests/AlignmentFileFetchTestUtils.py new file mode 100644 index 0000000..100b405 --- /dev/null +++ b/tests/AlignmentFileFetchTestUtils.py @@ -0,0 +1,86 @@ +import os +import subprocess +import pysam + +from TestUtils import BAM_DATADIR, force_str + +def build_fetch_with_samtoolsshell(fn): + retval = os.popen("samtools view {} 2> /dev/null | wc -l".format(fn)).read() + return int(retval.strip()) + + +def build_fetch_with_samtoolspipe(fn): + FNULL = open(os.devnull, 'w') + with subprocess.Popen(["samtools", "view", fn], + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=FNULL) as proc: + return len(proc.stdout.readlines()) + + +def build_fetch_with_pysam(*args, **kwargs): + with pysam.AlignmentFile(*args, **kwargs) as inf: + return len(list(inf.fetch())) + + +def build_query_sequences_with_samtoolsshell(fn): + retval = os.popen("samtools view {} 2> /dev/null | cut -f 
11".format(fn)).read() + return force_str(retval).splitlines() + + +def build_query_sequences_with_samtoolspipe(fn): + FNULL = open(os.devnull, 'w') + with subprocess.Popen(["samtools", "view", fn], + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=FNULL) as proc: + data = [force_str(x).split()[10] for x in proc.stdout.readlines()] + return data + + +def build_query_sequences_with_pysam(*args, **kwargs): + with pysam.AlignmentFile(*args, **kwargs) as inf: + data = [x.query_sequence for x in inf] + return data + + +def build_query_qualities_with_pysam(*args, **kwargs): + with pysam.AlignmentFile(*args, **kwargs) as inf: + data = [x.query_qualities for x in inf] + return data + + +def build_query_sequences_flagfilter_with_samtoolsshell(fn): + retval = os.popen("samtools view -f 2 {} 2> /dev/null | cut -f 11".format(fn)).read() + return force_str(retval).splitlines() + + +def build_query_sequences_flagfilter_with_samtoolspipe(fn): + FNULL = open(os.devnull, 'w') + with subprocess.Popen(["samtools", "view", "-f", "2", fn], + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=FNULL) as proc: + data = [force_str(x).split()[10] for x in proc.stdout.readlines()] + return data + +def build_query_sequences_flagfilter_with_pysam(*args, **kwargs): + with pysam.AlignmentFile(*args, **kwargs) as inf: + data = [x.query_sequence for x in inf if x.is_proper_pair] + return data + + +def build_query_sequences_directflagfilter_with_pysam(*args, **kwargs): + with pysam.AlignmentFile(*args, **kwargs) as inf: + data = [x.query_sequence for x in inf if x.flag & 2] + return data + + +def build_aligned_pairs_with_pysam(*args, **kwargs): + matches_only = kwargs.pop("matches_only", False) + with_seq = kwargs.pop("with_seq", False) + with pysam.AlignmentFile(*args, **kwargs) as inf: + data = [x.get_aligned_pairs(matches_only=matches_only, with_seq=with_seq) + for x in inf if not x.is_unmapped] + return data + diff --git a/tests/AlignmentFileFetch_bench.py 
b/tests/AlignmentFileFetch_bench.py new file mode 100644 index 0000000..bb8ce43 --- /dev/null +++ b/tests/AlignmentFileFetch_bench.py @@ -0,0 +1,98 @@ +"""Benchmarking module for AlignmentFile functionality""" +import os +import pytest + + +from TestUtils import BAM_DATADIR, force_str, flatten_nested_list +from AlignmentFileFetchTestUtils import * + + +def test_build_fetch_from_bam_with_samtoolsshell(benchmark): + result = benchmark(build_fetch_with_samtoolsshell, + os.path.join(BAM_DATADIR, "ex2.bam")) + assert result == 3270 + + +def test_build_fetch_from_bam_with_samtoolspipe(benchmark): + result = benchmark(build_fetch_with_samtoolspipe, + os.path.join(BAM_DATADIR, "ex2.bam")) + assert result == 3270 + + +def test_build_fetch_from_bam_with_pysam(benchmark): + result = benchmark(build_fetch_with_pysam, + os.path.join(BAM_DATADIR, "ex2.bam")) + assert result == 3270 + + +def test_build_query_sequences_from_bam_with_samtoolsshell(benchmark): + result = benchmark(build_query_sequences_with_samtoolsshell, + os.path.join(BAM_DATADIR, "ex2.bam")) + assert len(result) == 3270 + + +def test_build_query_sequences_from_bam_with_samtoolspipe(benchmark): + result = benchmark(build_query_sequences_with_samtoolspipe, + os.path.join(BAM_DATADIR, "ex2.bam")) + assert len(result) == 3270 + + +def test_build_query_sequences_from_bam_with_pysam(benchmark): + result = benchmark(build_query_sequences_with_pysam, + os.path.join(BAM_DATADIR, "ex2.bam")) + assert len(result) == 3270 + + +def test_build_query_qualities_from_bam_with_pysam(benchmark): + result = benchmark(build_query_qualities_with_pysam, + os.path.join(BAM_DATADIR, "ex2.bam")) + assert len(result) == 3270 + + +def test_build_query_sequences_from_bam_flagfilter_with_samtoolsshell(benchmark): + result = benchmark(build_query_sequences_flagfilter_with_samtoolsshell, + os.path.join(BAM_DATADIR, "ex2.bam")) + assert len(result) == 3124 + + +def test_build_query_sequences_from_bam_flagfilter_with_samtoolspipe(benchmark): + 
result = benchmark(build_query_sequences_flagfilter_with_samtoolspipe, + os.path.join(BAM_DATADIR, "ex2.bam")) + assert len(result) == 3124 + + +def test_build_query_sequences_from_bam_flagfilter_with_pysam(benchmark): + result = benchmark(build_query_sequences_flagfilter_with_pysam, + os.path.join(BAM_DATADIR, "ex2.bam")) + assert len(result) == 3124 + + +def test_build_query_sequences_from_bam_directflagfilter_with_pysam(benchmark): + result = benchmark(build_query_sequences_flagfilter_with_pysam, + os.path.join(BAM_DATADIR, "ex2.bam")) + assert len(result) == 3124 + + +@pytest.mark.aligned_pairs +def test_build_aligned_pairs_default_with_pysam(benchmark): + result = benchmark(build_aligned_pairs_with_pysam, + os.path.join(BAM_DATADIR, "with_md.bam")) + assert len(result) == 3235 + + +@pytest.mark.aligned_pairs +def test_build_aligned_pairs_matchesonly_with_pysam(benchmark): + result = benchmark(build_aligned_pairs_with_pysam, + os.path.join(BAM_DATADIR, "with_md.bam"), + matches_only=True) + assert len(result) == 3235 + + +@pytest.mark.aligned_pairs +def test_build_aligned_pairs_withseq_with_pysam(benchmark): + result = benchmark(build_aligned_pairs_with_pysam, + os.path.join(BAM_DATADIR, "with_md.bam"), + with_seq=True) + assert len(result) == 3235 + + diff --git a/tests/AlignmentFileHeader_test.py b/tests/AlignmentFileHeader_test.py new file mode 100644 index 0000000..1cdbb69 --- /dev/null +++ b/tests/AlignmentFileHeader_test.py @@ -0,0 +1,325 @@ +#!/usr/bin/env python +'''unit testing code for pysam. + +Execute in the :file:`tests` directory as it requires the Makefile +and data files located there. 
+''' + +import unittest +import os +import shutil +import sys +import re +import copy +import collections +from collections import OrderedDict as odict +import subprocess +import logging +import array +if sys.version_info.major >= 3: + from io import StringIO +else: + from StringIO import StringIO + +from functools import partial + +import pysam +import pysam.samtools +from TestUtils import checkBinaryEqual, checkURL, \ + check_samtools_view_equal, checkFieldEqual, force_str, \ + get_temp_filename, BAM_DATADIR + + +class TestHeaderConstruction(unittest.TestCase): + """testing header construction.""" + + header_dict = odict( + [('SQ', [odict([('LN', 1575), ('SN', 'chr1'), ('AH', 'chr1:5000000-5010000')]), + odict([('LN', 1584), ('SN', 'chr2'), ('AH', '*')])]), + ('RG', [odict([('LB', 'SC_1'), ('ID', 'L1'), ('SM', 'NA12891'), + ('PU', 'SC_1_10'), ("CN", "name:with:colon")]), + odict([('LB', 'SC_2'), ('ID', 'L2'), ('SM', 'NA12891'), + ('PU', 'SC_2_12'), ("CN", "name:with:colon")])]), + ('PG', [odict([('ID', 'P1'), ('VN', '1.0')]), + odict([('ID', 'P2'), ('VN', '1.1')])]), + ('HD', odict([('VN', '1.0')])), + ('CO', ['this is a comment', 'this is another comment']), + ]) + + header_text = ("@HD\tVN:1.0\n" + "@SQ\tSN:chr1\tLN:1575\tAH:chr1:5000000-5010000\n" + "@SQ\tSN:chr2\tLN:1584\tAH:*\n" + "@RG\tID:L1\tPU:SC_1_10\tLB:SC_1\tSM:NA12891\tCN:name:with:colon\n" + "@RG\tID:L2\tPU:SC_2_12\tLB:SC_2\tSM:NA12891\tCN:name:with:colon\n" + "@PG\tID:P1\tVN:1.0\n" + "@PG\tID:P2\tVN:1.1\n" + "@CO\tthis is a comment\n" + "@CO\tthis is another comment\n") + + + header_from_references = odict( + [('SQ', [odict([('LN', 1575), ('SN', 'chr1')]), + odict([('LN', 1584), ('SN', 'chr2')])]), + ('RG', [odict([('LB', 'SC_1'), ('ID', 'L1'), ('SM', 'NA12891'), + ('PU', 'SC_1_10'), ("CN", "name:with:colon")]), + odict([('LB', 'SC_2'), ('ID', 'L2'), ('SM', 'NA12891'), + ('PU', 'SC_2_12'), ("CN", "name:with:colon")])]), + ('PG', [odict([('ID', 'P1'), ('VN', '1.0')]), + odict([('ID', 'P2'), ('VN', 
'1.1')])]), + ('HD', odict([('VN', '1.0')])), + ('CO', ['this is a comment', 'this is another comment']), + ]) + + header_without_text = odict( + [('SQ', [odict([('LN', 1575), ('SN', 'chr1')]), + odict([('LN', 1584), ('SN', 'chr2')])]), + ]) + + def compare_headers(self, test_header, ref_header=None): + '''compare two headers a and b.''' + test_header_dict = test_header.as_dict() + if ref_header is None: + ref_header = self.header_dict + + for ak, av in test_header_dict.items(): + self.assertTrue(ak in self.header_dict, "key '%s' not in '%s' " % (ak, ref_header)) + self.assertEqual(av, ref_header[ak]) + for ak, av in ref_header.items(): + self.assertTrue(ak in test_header_dict, "key '%s' not in '%s' " % (ak, test_header_dict)) + self.assertEqual(av, test_header_dict[ak]) + + def check_name_mapping(self, test_header): + for x, y in enumerate(("chr1", "chr2")): + tid = test_header.get_tid(y) + ref = test_header.get_reference_name(x) + self.assertEqual(tid, x) + self.assertEqual(ref, y) + + self.assertEqual(test_header.get_tid("chr?"), -1) + self.assertRaises(ValueError, test_header.get_reference_name, 2) + + def test_header_constructed_from_dict(self): + header = pysam.AlignmentHeader.from_dict(self.header_dict) + self.compare_headers(header) + self.check_name_mapping(header) + + def test_header_constructed_from_text(self): + header = pysam.AlignmentHeader.from_text(self.header_text) + self.compare_headers(header) + self.check_name_mapping(header) + + def test_header_constructed_from_header(self): + header = pysam.AlignmentHeader.from_text(self.header_text) + self.compare_headers(header.copy()) + self.check_name_mapping(header) + + def test_header_constructed_from_references(self): + text = re.sub("@SQ[^\n]+\n", "", self.header_text) + assert "@SQ" not in text + header = pysam.AlignmentHeader.from_references( + reference_names=["chr1", "chr2"], + reference_lengths=[1575, 1584], + text=text) + self.compare_headers(header, self.header_from_references) + 
self.check_name_mapping(header) + + def test_header_constructed_from_references_without_text(self): + header = pysam.AlignmentHeader.from_references( + reference_names=["chr1", "chr2"], + reference_lengths=[1575, 1584]) + self.compare_headers(header, self.header_without_text) + self.check_name_mapping(header) + + +class TestHeaderSAM(unittest.TestCase): + """testing header manipulation""" + + header = {'SQ': [{'LN': 1575, 'SN': 'chr1', 'AH': 'chr1:5000000-5010000'}, + {'LN': 1584, 'SN': 'chr2', 'AH': '*'}], + 'RG': [{'LB': 'SC_1', 'ID': 'L1', 'SM': 'NA12891', + 'PU': 'SC_1_10', "CN": "name:with:colon"}, + {'LB': 'SC_2', 'ID': 'L2', 'SM': 'NA12891', + 'PU': 'SC_2_12', "CN": "name:with:colon"}], + 'PG': [{'ID': 'P1', 'VN': '1.0'}, {'ID': 'P2', 'VN': '1.1'}], + 'HD': {'VN': '1.0'}, + 'CO': ['this is a comment', 'this is another comment'], + } + + def compare_headers(self, a, b): + '''compare two headers a and b.''' + for ak, av in a.items(): + self.assertTrue(ak in b, "key '%s' not in '%s' " % (ak, b)) + self.assertEqual(av, b[ak]) + + def setUp(self): + self.samfile = pysam.AlignmentFile( + os.path.join(BAM_DATADIR, "ex3.sam"), + "r") + + def test_header_content_is_as_expected(self): + self.compare_headers(self.header, self.samfile.header.to_dict()) + self.compare_headers(self.samfile.header.to_dict(), self.header) + + def test_text_access_works(self): + self.assertEqual(self.samfile.text, self.samfile.header.__str__()) + + def test_name_mapping(self): + for x, y in enumerate(("chr1", "chr2")): + tid = self.samfile.gettid(y) + ref = self.samfile.getrname(x) + self.assertEqual(tid, x) + self.assertEqual(ref, y) + + self.assertEqual(self.samfile.gettid("chr?"), -1) + self.assertRaises(ValueError, self.samfile.getrname, 2) + + def test_dictionary_access_works(self): + for key in self.header.keys(): + self.compare_headers({key: self.header[key]}, + {key: self.samfile.header[key]}) + + def test_dictionary_setting_raises_error(self): + self.assertRaises(TypeError, + 
self.samfile.header.__setitem__, + "CO", + ["This is a final comment"]) + + def test_dictionary_len_works(self): + self.assertEqual(len(self.header), len(self.samfile.header)) + + def test_dictionary_keys_works(self): + # sort for py2.7 + self.assertEqual(sorted(self.header.keys()), + sorted(self.samfile.header.keys())) + + def test_dictionary_values_works(self): + self.assertEqual(len(self.header.values()), len(self.samfile.header.values())) + + def test_dictionary_get_works(self): + self.assertEqual(self.header.get("HD"), {'VN': '1.0'}) + self.assertEqual(self.header.get("UK", "xyz"), "xyz") + self.assertEqual(self.header.get("UK"), None) + + def test_dictionary_contains_works(self): + self.assertTrue("HD" in self.header) + self.assertFalse("UK" in self.header) + + def tearDown(self): + self.samfile.close() + + +class TestHeaderBAM(TestHeaderSAM): + + def setUp(self): + self.samfile = pysam.AlignmentFile( + os.path.join(BAM_DATADIR, "ex3.bam"), + "rb") + + +class TestHeaderCRAM(TestHeaderSAM): + + def setUp(self): + self.samfile = pysam.AlignmentFile( + os.path.join(BAM_DATADIR, "ex3.cram"), + "rc") + + def compare_headers(self, a, b): + '''compare two headers a and b.''' + def _strip(dd): + for x in dd: + for y in ("M5", "UR"): + if y in x: + del x[y] + for ak, av in a.items(): + _strip(av) + self.assertTrue(ak in b, "key '%s' not in '%s' " % (ak, b)) + _strip(b[ak]) + + self.assertEqual(av, b[ak]) + + +class TestHeaderFromRefs(unittest.TestCase): + '''see issue 144 + + reference names need to be converted to string for python 3 + ''' + + # def testHeader( self ): + # refs = ['chr1', 'chr2'] + # tmpfile = "tmp_%i" % id(self) + # s = pysam.AlignmentFile(tmpfile, 'wb', + # referencenames=refs, + # referencelengths=[100]*len(refs)) + # s.close() + + # self.assertTrue( checkBinaryEqual( 'issue144.bam', tmpfile ), + # 'bam files differ') + # os.unlink( tmpfile ) + + + +class TestHeaderWriteRead(unittest.TestCase): + header = {'SQ': [{'LN': 1575, 'SN': 'chr1'}, + 
{'LN': 1584, 'SN': 'chr2'}], + 'RG': [{'LB': 'SC_1', 'ID': 'L1', 'SM': 'NA12891', + 'PU': 'SC_1_10', "CN": "name:with:colon"}, + {'LB': 'SC_2', 'ID': 'L2', 'SM': 'NA12891', + 'PU': 'SC_2_12', "CN": "name:with:colon"}], + 'PG': [{'ID': 'P1', 'VN': '1.0', 'CL': 'tool'}, + {'ID': 'P2', 'VN': '1.1', 'CL': 'tool with in option -R a\tb', + 'PP': 'P1'}], + 'HD': {'VN': '1.0'}, + 'CO': ['this is a comment', 'this is another comment'], + } + + def compare_headers(self, a, header_b): + '''compare two headers a and b. + + Ignore M5 and UR field as they are set application specific. + ''' + b = header_b.to_dict() + for ak, av in a.items(): + self.assertTrue(ak in b, "key '%s' not in '%s' " % (ak, b)) + self.assertEqual( + len(av), len(b[ak]), + "unequal number of entries for key {}: {} vs {}" + .format(ak, av, b[ak])) + + for row_a, row_b in zip(av, b[ak]): + if isinstance(row_b, dict): + for x in ["M5", "UR"]: + try: + del row_b[x] + except KeyError: + pass + self.assertEqual(row_a, row_b) + + def check_read_write(self, flag_write, header): + + fn = get_temp_filename() + with pysam.AlignmentFile( + fn, + flag_write, + header=header, + reference_filename=os.path.join(BAM_DATADIR, "ex1.fa")) as outf: + a = pysam.AlignedSegment() + a.query_name = "abc" + outf.write(a) + + with pysam.AlignmentFile(fn) as inf: + read_header = inf.header + + os.unlink(fn) + self.compare_headers(header, read_header) + + def test_SAM(self): + self.check_read_write("wh", self.header) + + def test_BAM(self): + self.check_read_write("wb", self.header) + + def test_CRAM(self): + header = copy.copy(self.header) + # for CRAM, \t needs to be quoted: + header['PG'][1]['CL'] = re.sub(r"\t", r"\\\\t", header['PG'][1]['CL']) + self.check_read_write("wc", header) diff --git a/tests/AlignmentFilePileup_bench.py b/tests/AlignmentFilePileup_bench.py new file mode 100644 index 0000000..24a06cb --- /dev/null +++ b/tests/AlignmentFilePileup_bench.py @@ -0,0 +1,147 @@ +"""Benchmarking module for AlignmentFile 
functionality""" +import os + +from TestUtils import BAM_DATADIR, force_str, flatten_nested_list +from PileupTestUtils import * + + +def test_build_pileup_from_bam_with_samtoolsshell(benchmark): + result = benchmark(build_pileup_with_samtoolsshell, + os.path.join(BAM_DATADIR, "ex2.bam")) + assert result == 2998 + + +def test_build_pileup_from_bam_with_samtoolspipe(benchmark): + result = benchmark(build_pileup_with_samtoolspipe, + os.path.join(BAM_DATADIR, "ex2.bam")) + assert result == 2998 + + +def test_build_pileup_from_bam_with_pysam(benchmark): + result = benchmark(build_pileup_with_pysam, + os.path.join(BAM_DATADIR, "ex2.bam")) + assert result == 2998 + + +def test_build_depth_from_bam_with_samtoolsshell(benchmark): + result = benchmark(build_depth_with_samtoolsshell, + os.path.join(BAM_DATADIR, "ex2.bam")) + assert result == 107241 + + +def test_build_depth_from_bam_with_samtoolspipe(benchmark): + result = benchmark(build_depth_with_samtoolspipe, + os.path.join(BAM_DATADIR, "ex2.bam")) + assert sum(result) == 107241 + + +def test_build_depth_from_bam_with_pysam(benchmark): + result = benchmark(build_depth_with_pysam, + os.path.join(BAM_DATADIR, "ex2.bam")) + # different value, as samtools filters with a minimum + # base quality of 13 + assert sum(result) == 110015 + + +def test_build_depth_with_filter_from_bam_with_pysam(benchmark): + result = benchmark(build_depth_with_filter_with_pysam, + os.path.join(BAM_DATADIR, "ex2.bam")) + assert sum(result) == 107241 + + +def test_build_query_bases_from_bam_with_samtoolsshell(benchmark): + result = benchmark(build_query_bases_with_samtoolsshell, + os.path.join(BAM_DATADIR, "ex2.bam")) + assert result == 116308 + + +def test_build_query_bases_from_bam_with_samtoolspysam(benchmark): + result = benchmark(build_query_bases_with_samtoolspysam, + os.path.join(BAM_DATADIR, "ex2.bam")) + assert len("".join(flatten_nested_list(result))) == 116308 + + +def test_build_query_bases_from_bam_with_samtoolspipe(benchmark): + result = 
benchmark(build_query_bases_with_samtoolspipe, + os.path.join(BAM_DATADIR, "ex2.bam")) + assert len("".join(flatten_nested_list(result))) == 116308 + + +def test_build_query_bases_from_bam_with_pysam_pileups(benchmark): + # note that there is no overlap detection here + result = benchmark(build_query_bases_with_pysam_pileups, + os.path.join(BAM_DATADIR, "ex2.bam")) + assert len("".join(flatten_nested_list(result))) == 107241 + + +def test_build_query_bases_from_bam_with_pysam(benchmark): + result = benchmark(build_query_bases_with_pysam, + os.path.join(BAM_DATADIR, "ex2.bam")) + assert len("".join(flatten_nested_list(result))) == 116308 + + +# note that pileups with/without reference sequence will differ due to +# realignment. +def test_build_query_bases_with_reference_from_bam_with_samtoolspipe(benchmark): + result = benchmark(build_query_bases_with_samtoolspipe, + os.path.join(BAM_DATADIR, "ex2.bam"), + "-f", os.path.join(BAM_DATADIR, "ex1.fa")) + assert len("".join(flatten_nested_list(result))) == 115924 + + +def test_build_query_bases_with_reference_from_bam_with_pysam(benchmark): + with pysam.FastaFile(os.path.join(BAM_DATADIR, "ex1.fa")) as fasta: + result = benchmark(build_query_bases_with_pysam, + os.path.join(BAM_DATADIR, "ex2.bam"), + fastafile=fasta) + assert len("".join(flatten_nested_list(result))) == 115924 + + +def test_build_query_bases_with_reference_from_bam_with_samtoolspysam(benchmark): + result = benchmark(build_query_bases_with_samtoolspysam, + os.path.join(BAM_DATADIR, "ex2.bam"), + "-f", os.path.join(BAM_DATADIR, "ex1.fa")) + assert len("".join(flatten_nested_list(result))) == 115924 + + +def test_build_query_qualities_from_bam_with_samtoolspipe(benchmark): + result = benchmark(build_query_qualities_with_samtoolspipe, + os.path.join(BAM_DATADIR, "ex2.bam")) + assert len("".join(result)) == 107241 + + +def test_build_query_qualities_from_bam_with_pysam(benchmark): + result = benchmark(build_query_qualities_with_pysam, + 
os.path.join(BAM_DATADIR, "ex2.bam")) + assert sum([len(x) for x in result]) == 107241 + + +def test_build_query_names_from_bam_with_pysam(benchmark): + result = benchmark(build_query_names_with_pysam, + os.path.join(BAM_DATADIR, "ex2.bam")) + assert len("".join([x for column in result for x in column])) == 2307343 + + +def test_build_mapping_qualities_from_bam_with_samtoolspipe(benchmark): + result = benchmark(build_mapping_qualities_with_samtoolspipe, + os.path.join(BAM_DATADIR, "ex2.bam")) + assert len("".join(result)) == 107241 + + +def test_build_mapping_qualities_from_bam_with_pysam(benchmark): + result = benchmark(build_mapping_qualities_with_pysam, + os.path.join(BAM_DATADIR, "ex2.bam")) + assert sum([len(x) for x in result]) == 107241 + + +def test_build_query_positions_from_bam_with_samtoolspipe(benchmark): + result = benchmark(build_query_positions_with_samtoolspipe, + os.path.join(BAM_DATADIR, "ex2.bam")) + # positions output by samtools are 1-based + assert sum([sum(x) - len(x) for x in result]) == 1841699 + + +def test_build_query_positions_from_bam_with_pysam(benchmark): + result = benchmark(build_query_positions_with_pysam, + os.path.join(BAM_DATADIR, "ex2.bam")) + assert sum([sum(x) for x in result]) == 1841699 diff --git a/tests/AlignmentFilePileup_test.py b/tests/AlignmentFilePileup_test.py new file mode 100644 index 0000000..1851da8 --- /dev/null +++ b/tests/AlignmentFilePileup_test.py @@ -0,0 +1,384 @@ +"""Benchmarking module for AlignmentFile functionality""" +import os +import re +import unittest +from TestUtils import BAM_DATADIR, IS_PYTHON3, force_str, flatten_nested_list +from PileupTestUtils import * + + +class TestPileupReadSelection(unittest.TestCase): + '''test pileup functionality.''' + + samfilename = os.path.join(BAM_DATADIR, "ex1.bam") + fastafilename = os.path.join(BAM_DATADIR, "ex1.fa") + + def setUp(self): + + self.samfile = pysam.AlignmentFile(self.samfilename) + self.fastafile = pysam.FastaFile(self.fastafilename) + + def 
tearDown(self): + self.samfile.close() + self.fastafile.close() + + def check_equal(self, references, iterator): + + for x, column in enumerate(iterator): + v = references[x][:-1].split("\t") + self.assertEqual( + len(v), 6, + "expected 6 values, got {}".format(v)) + (contig, pos, reference_base, + read_bases, read_qualities, alignment_mapping_qualities) \ + = v + self.assertEqual(int(pos) - 1, column.reference_pos) + + def test_samtools_stepper(self): + refs = force_str( + pysam.samtools.mpileup( + "-f", self.fastafilename, + self.samfilename)).splitlines(True) + iterator = self.samfile.pileup( + stepper="samtools", + fastafile=self.fastafile) + self.check_equal(refs, iterator) + + def test_all_stepper(self): + refs = force_str( + pysam.samtools.mpileup( + "-f", self.fastafilename, + "-A", "-B", + self.samfilename)).splitlines(True) + + iterator = self.samfile.pileup( + stepper="all", + fastafile=self.fastafile) + self.check_equal(refs, iterator) + + def test_ignore_overlaps(self): + refs = force_str( + pysam.samtools.mpileup( + "-f", self.fastafilename, + "-A", "-B", "-x", + self.samfilename)).splitlines(True) + + iterator = self.samfile.pileup( + stepper="all", + fastafile=self.fastafile, + ignore_overlaps=False) + self.check_equal(refs, iterator) + + def test_samtools_stepper_mapping_quality_threshold(self): + refs = force_str( + pysam.samtools.mpileup( + "-f", self.fastafilename, + "--min-MQ", "15", + self.samfilename)).splitlines(True) + iterator = self.samfile.pileup( + stepper="samtools", + fastafile=self.fastafile, + min_mapping_quality=15) + self.check_equal(refs, iterator) + + def test_samtools_stepper_base_quality_threshold(self): + refs = force_str( + pysam.samtools.mpileup( + "-f", self.fastafilename, + "--min-BQ", "20", + self.samfilename)).splitlines(True) + iterator = self.samfile.pileup( + stepper="samtools", + fastafile=self.fastafile, + min_base_quality=20) + self.check_equal(refs, iterator) + + def test_samtools_stepper_ignore_orphans(self): + 
refs = force_str( + pysam.samtools.mpileup( + "-f", self.fastafilename, + "--count-orphans", + self.samfilename)).splitlines(True) + iterator = self.samfile.pileup( + stepper="samtools", + fastafile=self.fastafile, + ignore_orphans=False) + self.check_equal(refs, iterator) + + def test_samtools_stepper_redo_baq(self): + refs = force_str( + pysam.samtools.mpileup( + "-f", self.fastafilename, + "--redo-BAQ", + self.samfilename)).splitlines(True) + iterator = self.samfile.pileup( + stepper="samtools", + fastafile=self.fastafile, + redo_baq=True) + self.check_equal(refs, iterator) + + +class TestPileupReadSelectionFastafile(TestPileupReadSelection): + '''test pileup functionality - backwards compatibility''' + + samfilename = os.path.join(BAM_DATADIR, "ex1.bam") + fastafilename = os.path.join(BAM_DATADIR, "ex1.fa") + + def setUp(self): + + self.samfile = pysam.AlignmentFile(self.samfilename) + self.fastafile = pysam.Fastafile(self.fastafilename) + + +class TestPileupQueryPosition(unittest.TestCase): + + filename = "test_query_position.bam" + + def testPileup(self): + last = {} + with pysam.AlignmentFile(os.path.join(BAM_DATADIR, self.filename)) as inf: + for col in inf.pileup(): + for r in col.pileups: + # print r.alignment.query_name + # print r.query_position, r.query_position_or_next, r.is_del + if r.is_del: + self.assertEqual(r.query_position, None) + self.assertEqual(r.query_position_or_next, + last[r.alignment.query_name] + 1) + else: + self.assertNotEqual(r.query_position, None) + last[r.alignment.query_name] = r.query_position + + +class TestPileupObjects(unittest.TestCase): + + def setUp(self): + self.samfile = pysam.AlignmentFile(os.path.join(BAM_DATADIR, "ex1.bam"), + "rb") + + def testPileupColumn(self): + for pcolumn1 in self.samfile.pileup(region="chr1:105-106"): + if pcolumn1.reference_pos == 104: + self.assertEqual( + pcolumn1.reference_id, 0, + "chromosome/target id mismatch in position 1: %s != %s" % + (pcolumn1.reference_id, 0)) + self.assertEqual( + 
pcolumn1.reference_pos, 105 - 1, + "position mismatch in position 1: %s != %s" % + (pcolumn1.reference_pos, 105 - 1)) + self.assertEqual( + pcolumn1.nsegments, 1, + "# reads mismatch in position 1: %s != %s" % + (pcolumn1.nsegments, 1)) + self.assertEqual( + len(pcolumn1.pileups), 1, + "# reads aligned to column mismatch in position 1" + ": %s != %s" % + (len(pcolumn1.pileups), 1)) + + for pcolumn2 in self.samfile.pileup(region="chr2:1480-1481"): + if pcolumn2.reference_pos == 1479: + self.assertEqual( + pcolumn2.reference_id, 1, + "chromosome/target id mismatch in position 1: %s != %s" % + (pcolumn2.reference_id, 1)) + self.assertEqual( + pcolumn2.reference_pos, 1480 - 1, + "position mismatch in position 1: %s != %s" % + (pcolumn2.reference_pos, 1480 - 1)) + self.assertEqual( + pcolumn2.nsegments, 12, + "# reads mismatch in position 1: %s != %s" % + (pcolumn2.nsegments, 12)) + + def tearDown(self): + self.samfile.close() + + def testIteratorOutOfScope(self): + '''test if exception is raised if pileup col is accessed after + iterator is exhausted.''' + + for pileupcol in self.samfile.pileup(): + pass + + self.assertRaises(ValueError, getattr, pileupcol, "pileups") + + +class TestIteratorColumnBAM(unittest.TestCase): + + '''test iterator column against contents of ex4.bam.''' + + # note that samfile contains 1-based coordinates + # 1D means deletion with respect to reference sequence + # + mCoverages = {'chr1': [0] * 20 + [1] * 36 + [0] * (100 - 20 - 35), + 'chr2': [0] * 20 + [1] * 35 + [0] * (100 - 20 - 35), + } + + def setUp(self): + self.samfile = pysam.AlignmentFile(os.path.join(BAM_DATADIR, "ex4.bam"), + "rb") + + def checkRange(self, contig, start=None, end=None, truncate=False): + '''compare results from iterator with those from samtools.''' + # check if the same reads are returned and in the same order + for column in self.samfile.pileup( + contig, start, end, truncate=truncate, min_base_quality=0): + if truncate: + 
self.assertGreaterEqual(column.reference_pos, start) + self.assertLess(column.reference_pos, end) + thiscov = len(column.pileups) + refcov = self.mCoverages[ + self.samfile.getrname(column.reference_id)][column.reference_pos] + self.assertEqual(thiscov, refcov, + "wrong coverage at pos %s:%i %i should be %i" % ( + self.samfile.getrname(column.reference_id), + column.reference_pos, thiscov, refcov)) + + def testIterateAll(self): + '''check random access per contig''' + self.checkRange(None) + + def testIteratePerContig(self): + '''check random access per contig''' + for contig in self.samfile.references: + self.checkRange(contig) + + def testIterateRanges(self): + '''check random access per range''' + for contig, length in zip( + self.samfile.references, self.samfile.lengths): + for start in range(1, length, 90): + # this includes empty ranges + self.checkRange(contig, start, start + 90) + + def testInverse(self): + '''test the inverse, is point-wise pileup accurate.''' + for contig, refseq in list(self.mCoverages.items()): + refcolumns = sum(refseq) + for pos, refcov in enumerate(refseq): + columns = list(self.samfile.pileup(contig, pos, pos + 1)) + if refcov == 0: + # if no read, no coverage + self.assertEqual( + len(columns), + refcov, + "wrong number of pileup columns returned for position %s:%i, %i should be %i" % ( + contig, pos, + len(columns), refcov)) + elif refcov == 1: + # one read, all columns of the read are returned + self.assertEqual( + len(columns), + refcolumns, + "pileup incomplete at position %i: got %i, expected %i " % + (pos, len(columns), refcolumns)) + + def testIterateTruncate(self): + '''check random access per range''' + for contig, length in zip(self.samfile.references, + self.samfile.lengths): + for start in range(1, length, 90): + # this includes empty ranges + self.checkRange(contig, start, start + 90, truncate=True) + + def tearDown(self): + self.samfile.close() + + +class TestIteratorColumn2(unittest.TestCase): + + '''test iterator 
column against contents of ex1.bam.''' + + def setUp(self): + self.samfile = pysam.AlignmentFile( + os.path.join(BAM_DATADIR, "ex1.bam"), + "rb") + + def testStart(self): + # print self.samfile.fetch().next().reference_start + # print self.samfile.pileup().next().reference_start + pass + + def testTruncate(self): + '''see issue 107.''' + # note that ranges in regions start from 1 + p = self.samfile.pileup(region='chr1:170:172', truncate=True) + columns = [x.reference_pos for x in p] + self.assertEqual(len(columns), 3) + self.assertEqual(columns, [169, 170, 171]) + + p = self.samfile.pileup('chr1', 169, 172, truncate=True) + columns = [x.reference_pos for x in p] + + self.assertEqual(len(columns), 3) + self.assertEqual(columns, [169, 170, 171]) + + def testAccessOnClosedIterator(self): + '''see issue 131 + + Accessing pileup data after iterator has closed. + ''' + pcolumn = self.samfile.pileup('chr1', 170, 180).__next__() + self.assertRaises(ValueError, getattr, pcolumn, "pileups") + + def testStr(self): + '''test if PileupRead can be printed.''' + iter = self.samfile.pileup('chr1', 170, 180) + pcolumn = iter.__next__() + s = str(pcolumn) + self.assertEqual(len(s.split("\n")), 2) + + +@unittest.skipIf(not IS_PYTHON3, + "tests requires at least python3 for subprocess context manager") +class PileUpColumnTests(unittest.TestCase): + + fn = os.path.join(BAM_DATADIR, "ex2.bam") + fn_fasta = os.path.join(BAM_DATADIR, "ex1.fa") + + def test_pileup_depths_are_equal(self): + samtools_result = build_depth_with_samtoolspipe(self.fn) + pysam_result = build_depth_with_filter_with_pysam(self.fn) + self.assertEqual(pysam_result, samtools_result) + + def test_pileup_query_bases_without_reference_are_equal(self): + samtools_result = build_query_bases_with_samtoolspipe(self.fn) + pysam_result = build_query_bases_with_pysam(self.fn) + self.assertEqual(["".join(x) for x in pysam_result], samtools_result) + + def test_pileup_query_bases_with_reference_are_equal(self): + samtools_result 
= build_query_bases_with_samtoolspipe(self.fn, "-f", self.fn_fasta) + with pysam.FastaFile(self.fn_fasta) as fasta: + pysam_result = build_query_bases_with_pysam(self.fn, fastafile=fasta, stepper="samtools") + self.assertEqual(["".join(x) for x in pysam_result], samtools_result) + + def test_pileup_query_qualities_are_equal(self): + samtools_result = build_query_qualities_with_samtoolspipe(self.fn) + pysam_result = build_query_qualities_with_pysam(self.fn) + pysam_result = [ + [chr(min(126, x + 33)) for x in l] for l in pysam_result] + self.assertEqual("".join(flatten_nested_list(pysam_result)), + "".join(flatten_nested_list(samtools_result))) + + def test_pileup_mapping_qualities_are_equal(self): + samtools_result = build_mapping_qualities_with_samtoolspipe(self.fn) + pysam_result = build_mapping_qualities_with_pysam(self.fn) + # convert to chars + pysam_result = [ + [chr(min(126, x + 33)) for x in l] for l in pysam_result] + + self.assertEqual("".join(flatten_nested_list(pysam_result)), + "".join(flatten_nested_list(samtools_result))) + + def test_pileup_query_qualities_from_pileups_are_equal(self): + samtools_result = build_query_qualities_with_samtoolspipe(self.fn) + pysam_result = build_query_qualities_with_pysam_pileups(self.fn) + pysam_result = [ + "".join([chr(min(126, x + 33)) for x in l]) for l in pysam_result] + + self.assertEqual(pysam_result, samtools_result) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/AlignmentFile_bench.py b/tests/AlignmentFile_bench.py new file mode 100644 index 0000000..275a5fb --- /dev/null +++ b/tests/AlignmentFile_bench.py @@ -0,0 +1,60 @@ +"""Benchmarking module for AlignmentFile functionality""" +import os +import subprocess +import pysam + + +from TestUtils import BAM_DATADIR + + +def count_number_lines_with_samtools(fn): + os.system("samtools view {} | wc -l > /dev/null".format(fn)) + return 3270 + + +def count_number_lines_with_samtoolspipe(fn): + with subprocess.Popen(["samtools", "view", fn], + 
stdin=subprocess.PIPE, + stdout=subprocess.PIPE) as proc: + return len(proc.stdout.readlines()) + + +def count_number_lines_with_pysam(*args, **kwargs): + with pysam.AlignmentFile(*args, **kwargs) as inf: + return len(list(inf.fetch())) + + +def test_count_number_lines_from_sam_with_samtools(benchmark): + result = benchmark(count_number_lines_with_samtools, + os.path.join(BAM_DATADIR, "ex2.sam")) + assert result == 3270 + + +def test_count_number_lines_from_sam_with_samtoolspipe(benchmark): + result = benchmark(count_number_lines_with_samtoolspipe, + os.path.join(BAM_DATADIR, "ex2.sam")) + assert result == 3270 + + +def test_count_number_lines_from_sam_with_pysam(benchmark): + result = benchmark(count_number_lines_with_pysam, + os.path.join(BAM_DATADIR, "ex2.sam"), "r") + assert result == 3270 + + +def test_count_number_lines_from_bam_with_samtools(benchmark): + result = benchmark(count_number_lines_with_samtools, + os.path.join(BAM_DATADIR, "ex2.bam")) + assert result == 3270 + + +def test_count_number_lines_from_bam_with_samtoolspipe(benchmark): + result = benchmark(count_number_lines_with_samtoolspipe, + os.path.join(BAM_DATADIR, "ex2.bam")) + assert result == 3270 + + +def test_count_number_lines_from_bam_with_pysam(benchmark): + result = benchmark(count_number_lines_with_pysam, + os.path.join(BAM_DATADIR, "ex2.bam")) + assert result == 3270 diff --git a/tests/AlignmentFile_test.py b/tests/AlignmentFile_test.py index e6f9bdb..700018c 100644 --- a/tests/AlignmentFile_test.py +++ b/tests/AlignmentFile_test.py @@ -53,99 +53,46 @@ class BasicTestBAMFromFetch(unittest.TestCase): self.samfile.close() def testARqname(self): - self.assertEqual( - self.reads[0].query_name, - "read_28833_29006_6945", - "read name mismatch in read 1: %s != %s" % ( - self.reads[0].query_name, "read_28833_29006_6945")) - self.assertEqual( - self.reads[1].query_name, - "read_28701_28881_323b", - "read name mismatch in read 2: %s != %s" % ( - self.reads[1].query_name, 
"read_28701_28881_323b")) + self.assertEqual(self.reads[0].query_name, + "read_28833_29006_6945") + self.assertEqual(self.reads[1].query_name, + "read_28701_28881_323b") def testARflag(self): - self.assertEqual( - self.reads[0].flag, 99, - "flag mismatch in read 1: %s != %s" % ( - self.reads[0].flag, 99)) - self.assertEqual( - self.reads[1].flag, 147, - "flag mismatch in read 2: %s != %s" % ( - self.reads[1].flag, 147)) + self.assertEqual(self.reads[0].flag, 99) + self.assertEqual(self.reads[1].flag, 147) def testARrname(self): - self.assertEqual( - self.reads[0].reference_id, 0, - "chromosome/target id mismatch in read 1: %s != %s" % - (self.reads[0].reference_id, 0)) - self.assertEqual( - self.reads[1].reference_id, 1, - "chromosome/target id mismatch in read 2: %s != %s" % - (self.reads[1].reference_id, 1)) + self.assertEqual(self.reads[0].reference_id, 0) + self.assertEqual(self.reads[1].reference_id, 1) def testARpos(self): - self.assertEqual( - self.reads[0].reference_start, 33 - 1, - "mapping position mismatch in read 1: %s != %s" % - (self.reads[0].reference_start, 33 - 1)) - self.assertEqual( - self.reads[1].reference_start, 88 - 1, - "mapping position mismatch in read 2: %s != %s" % - (self.reads[1].reference_start, 88 - 1)) + self.assertEqual(self.reads[0].reference_start, 33 - 1) + self.assertEqual(self.reads[1].reference_start, 88 - 1) def testARmapq(self): - self.assertEqual( - self.reads[0].mapping_quality, 20, - "mapping quality mismatch in read 1: %s != %s" % - (self.reads[0].mapping_quality, 20)) - self.assertEqual( - self.reads[1].mapping_quality, 30, - "mapping quality mismatch in read 2: %s != %s" % ( - self.reads[1].mapping_quality, 30)) + self.assertEqual(self.reads[0].mapping_quality, 20) + self.assertEqual(self.reads[1].mapping_quality, 30) def testARcigar(self): - self.assertEqual( - self.reads[0].cigartuples, - [(0, 10), (2, 1), (0, 25)], - "read name length mismatch in read 1: %s != %s" % - (self.reads[0].cigartuples, [(0, 10), (2, 1), 
(0, 25)])) - self.assertEqual( - self.reads[1].cigartuples, [(0, 35)], - "read name length mismatch in read 2: %s != %s" % - (self.reads[1].cigartuples, [(0, 35)])) + self.assertEqual(self.reads[0].cigartuples, [(0, 10), (2, 1), (0, 25)]) + self.assertEqual(self.reads[1].cigartuples, [(0, 35)]) def testARcigarstring(self): self.assertEqual(self.reads[0].cigarstring, '10M1D25M') self.assertEqual(self.reads[1].cigarstring, '35M') def testARmrnm(self): - self.assertEqual( - self.reads[0].next_reference_id, 0, - "mate reference sequence name mismatch in read 1: %s != %s" % - (self.reads[0].next_reference_id, 0)) - self.assertEqual( - self.reads[1].next_reference_id, 1, - "mate reference sequence name mismatch in read 2: %s != %s" % - (self.reads[1].next_reference_id, 1)) - self.assertEqual( - self.reads[0].next_reference_id, 0, - "mate reference sequence name mismatch in read 1: %s != %s" % - (self.reads[0].next_reference_id, 0)) - self.assertEqual( - self.reads[1].next_reference_id, 1, - "mate reference sequence name mismatch in read 2: %s != %s" % - (self.reads[1].next_reference_id, 1)) + self.assertEqual(self.reads[0].next_reference_id, 0) + self.assertEqual(self.reads[1].next_reference_id, 1) + self.assertEqual(self.reads[0].next_reference_id, 0) + self.assertEqual(self.reads[1].next_reference_id, 1) def testARmpos(self): - self.assertEqual(self.reads[ - 0].next_reference_start, 200 - 1, "mate mapping position mismatch in read 1: %s != %s" % (self.reads[0].next_reference_start, 200 - 1)) - self.assertEqual(self.reads[ - 1].next_reference_start, 500 - 1, "mate mapping position mismatch in read 2: %s != %s" % (self.reads[1].next_reference_start, 500 - 1)) - self.assertEqual(self.reads[ - 0].next_reference_start, 200 - 1, "mate mapping position mismatch in read 1: %s != %s" % (self.reads[0].next_reference_start, 200 - 1)) - self.assertEqual(self.reads[ - 1].next_reference_start, 500 - 1, "mate mapping position mismatch in read 2: %s != %s" % 
(self.reads[1].next_reference_start, 500 - 1)) + self.assertEqual(self.reads[0].next_reference_start, 200 - 1) + self.assertEqual(self.reads[1].next_reference_start, 500 - 1) + self.assertEqual(self.reads[0].next_reference_start, 200 - 1) + self.assertEqual(self.reads[1].next_reference_start, 500 - 1) def testARQueryLength(self): self.assertEqual( @@ -166,12 +113,15 @@ class BasicTestBAMFromFetch(unittest.TestCase): (self.reads[1].query_length, 35)) def testARseq(self): - self.assertEqual(self.reads[0].query_sequence, "AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG", "sequence mismatch in read 1: %s != %s" % ( - self.reads[0].query_sequence, "AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG")) - self.assertEqual(self.reads[1].query_sequence, "ACCTATATCTTGGCCTTGGCCGATGCGGCCTTGCA", "sequence size mismatch in read 2: %s != %s" % ( - self.reads[1].query_sequence, "ACCTATATCTTGGCCTTGGCCGATGCGGCCTTGCA")) - self.assertEqual(self.reads[3].query_sequence, "AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG", "sequence mismatch in read 4: %s != %s" % ( - self.reads[3].query_sequence, "AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG")) + self.assertEqual( + self.reads[0].query_sequence, + "AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG") + self.assertEqual( + self.reads[1].query_sequence, + "ACCTATATCTTGGCCTTGGCCGATGCGGCCTTGCA") + self.assertEqual( + self.reads[3].query_sequence, + "AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG") def testARqual(self): self.assertEqual( @@ -206,19 +156,22 @@ class BasicTestBAMFromFetch(unittest.TestCase): def testARqqual(self): self.assertEqual( - pysam.qualities_to_qualitystring(self.reads[0].query_alignment_qualities), + pysam.qualities_to_qualitystring( + self.reads[0].query_alignment_qualities), "<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<", "qquality string mismatch in read 1: %s != %s" % (pysam.qualities_to_qualitystring(self.reads[0].query_alignment_qualities), "<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<")) self.assertEqual( - pysam.qualities_to_qualitystring(self.reads[1].query_alignment_qualities), + 
pysam.qualities_to_qualitystring( + self.reads[1].query_alignment_qualities), "<<<<<;<<<<7;:<<<6;<<<<<<<<<<<<7<<<<", "qquality string mismatch in read 2: %s != %s" % (pysam.qualities_to_qualitystring(self.reads[1].query_alignment_qualities), "<<<<<;<<<<7;:<<<6;<<<<<<<<<<<<7<<<<")) self.assertEqual( - pysam.qualities_to_qualitystring(self.reads[3].query_alignment_qualities), + pysam.qualities_to_qualitystring( + self.reads[3].query_alignment_qualities), "<<<<<<<<<<<<<<<<<:<9/,&,22", "qquality string mismatch in read 3: %s != %s" % (pysam.qualities_to_qualitystring(self.reads[3].query_alignment_qualities), @@ -404,7 +357,7 @@ class BasicTestSAMFromStringIO(BasicTestBAMFromFetch): def testRaises(self): statement = "samtools view -h {}".format( - os.path.join(BAM_DATADIR, "ex3.bam")) + os.path.join(BAM_DATADIR, "ex3.bam")) stdout = subprocess.check_output(statement.split(" ")) bam = StringIO() if sys.version_info.major >= 3: @@ -522,12 +475,17 @@ class TestIO(unittest.TestCase): "rb", "wb") def testCRAM2CRAM(self): + # in some systems different reference sequence paths might be + # embedded in the CRAM files which will result in different headers + # see #542 self.checkEcho("ex2.cram", "ex2.cram", "tmp_ex2.cram", "rc", "wc", sequence_filename=os.path.join(BAM_DATADIR, "ex1.fa"), - checkf=check_samtools_view_equal) + checkf=partial( + check_samtools_view_equal, + without_header=True)) def testSAM2BAM(self): self.checkEcho("ex2.sam", @@ -613,9 +571,9 @@ class TestIO(unittest.TestCase): check_header=True) with pysam.AlignmentFile( - input_filename, - check_header=False, - check_sq=False) as infile: + input_filename, + check_header=False, + check_sq=False) as infile: result = list(infile.fetch(until_eof=True)) self.assertEqual(2, len(result)) @@ -638,7 +596,7 @@ class TestIO(unittest.TestCase): check_header=True) with pysam.AlignmentFile( - input_filename, check_sq=False) as infile: + input_filename, check_sq=False) as infile: result = list(infile.fetch(until_eof=True)) 
def test_fail_read_sam_without_header(self): @@ -718,12 +676,13 @@ class TestIO(unittest.TestCase): # python file needs to be closed separately self.assertFalse(f.closed) - def testClosedFile(self): + def test_accessing_attributes_in_closed_file_raises_errors(self): '''test that access to a closed samfile raises ValueError.''' samfile = pysam.AlignmentFile(os.path.join(BAM_DATADIR, "ex1.bam"), "rb") samfile.close() + self.assertRaises(ValueError, samfile.fetch, 'chr1', 100, 120) self.assertRaises(ValueError, samfile.pileup, 'chr1', 100, 120) self.assertRaises(ValueError, samfile.getrname, 0) @@ -733,13 +692,36 @@ class TestIO(unittest.TestCase): self.assertRaises(ValueError, getattr, samfile, "nreferences") self.assertRaises(ValueError, getattr, samfile, "references") self.assertRaises(ValueError, getattr, samfile, "lengths") - self.assertRaises(ValueError, getattr, samfile, "text") - self.assertRaises(ValueError, getattr, samfile, "header") + self.assertEqual(samfile.header, None) # write on closed file self.assertEqual(0, samfile.write(None)) + def test_header_available_after_closing_file(self): + + def load_bam(): + with pysam.AlignmentFile(os.path.join(BAM_DATADIR, "ex1.bam"), "rb") as inf: + header = inf.header + return header + + header = load_bam() + self.assertTrue(header) + self.assertEqual(header.nreferences, 2) + self.assertEqual(header.references, ("chr1", "chr2")) + def test_reference_name_available_after_closing_file(self): + """read tids can be mapped to references after AlignmentFile has been closed. + + see issue #517""" + + def load_bam(): + with pysam.AlignmentFile(os.path.join(BAM_DATADIR, "ex1.bam"), "rb") as inf: + read = next(inf) + return read + + read = load_bam() + self.assertEqual(read.reference_name, "chr1") + # TOOD # def testReadingFromSamFileWithoutHeader(self): # '''read from samfile without header. 
@@ -752,15 +734,16 @@ class TestIO(unittest.TestCase): def testReadingFromFileWithoutIndex(self): '''read from bam file without index.''' + dest = get_temp_filename("tmp_ex2.bam") shutil.copyfile(os.path.join(BAM_DATADIR, "ex2.bam"), - 'tests/tmp_ex2.bam') - samfile = pysam.AlignmentFile('tests/tmp_ex2.bam', + dest) + samfile = pysam.AlignmentFile(dest, "rb") self.assertRaises(ValueError, samfile.fetch) self.assertEqual( len(list(samfile.fetch(until_eof=True))), 3270) - os.unlink('tests/tmp_ex2.bam') + os.unlink(dest) # def testReadingUniversalFileMode(self): # '''read from samfile without header. @@ -855,9 +838,19 @@ class TestIO(unittest.TestCase): IndexError, samfile.fetch, tid=-1) - self.assertEqual(len(list(samfile.fetch('chr1',start=1000, end=2000))), + self.assertEqual(len(list(samfile.fetch('chr1', start=1000, end=2000))), len(list(samfile.fetch(tid=0, start=1000, end=2000)))) + def test_write_bam_to_unknown_path_fails(self): + '''see issue 116''' + input_filename = os.path.join(BAM_DATADIR, "ex1.bam") + with pysam.AlignmentFile(input_filename) as inf: + self.assertRaises(IOError, + pysam.AlignmentFile, + "missing_directory/new_file.bam", + "wb", + template=inf) + class TestAutoDetect(unittest.TestCase): @@ -993,88 +986,6 @@ class TestIteratorRowAllBAM(unittest.TestCase): self.samfile.close() -class TestIteratorColumnBAM(unittest.TestCase): - - '''test iterator column against contents of ex4.bam.''' - - # note that samfile contains 1-based coordinates - # 1D means deletion with respect to reference sequence - # - mCoverages = {'chr1': [0] * 20 + [1] * 36 + [0] * (100 - 20 - 35), - 'chr2': [0] * 20 + [1] * 35 + [0] * (100 - 20 - 35), - } - - def setUp(self): - self.samfile = pysam.AlignmentFile(os.path.join(BAM_DATADIR, "ex4.bam"), - "rb") - - def checkRange(self, contig, start=None, end=None, truncate=False): - '''compare results from iterator with those from samtools.''' - # check if the same reads are returned and in the same order - for column in 
self.samfile.pileup( - contig, start, end, truncate=truncate): - if truncate: - self.assertGreaterEqual(column.reference_pos, start) - self.assertLess(column.reference_pos, end) - thiscov = len(column.pileups) - refcov = self.mCoverages[ - self.samfile.getrname(column.reference_id)][column.reference_pos] - self.assertEqual(thiscov, refcov, - "wrong coverage at pos %s:%i %i should be %i" % ( - self.samfile.getrname(column.reference_id), - column.reference_pos, thiscov, refcov)) - - def testIterateAll(self): - '''check random access per contig''' - self.checkRange(None) - - def testIteratePerContig(self): - '''check random access per contig''' - for contig in self.samfile.references: - self.checkRange(contig) - - def testIterateRanges(self): - '''check random access per range''' - for contig, length in zip( - self.samfile.references, self.samfile.lengths): - for start in range(1, length, 90): - # this includes empty ranges - self.checkRange(contig, start, start + 90) - - def testInverse(self): - '''test the inverse, is point-wise pileup accurate.''' - for contig, refseq in list(self.mCoverages.items()): - refcolumns = sum(refseq) - for pos, refcov in enumerate(refseq): - columns = list(self.samfile.pileup(contig, pos, pos + 1)) - if refcov == 0: - # if no read, no coverage - self.assertEqual( - len(columns), - refcov, - "wrong number of pileup columns returned for position %s:%i, %i should be %i" % ( - contig, pos, - len(columns), refcov)) - elif refcov == 1: - # one read, all columns of the read are returned - self.assertEqual( - len(columns), - refcolumns, - "pileup incomplete at position %i: got %i, expected %i " % - (pos, len(columns), refcolumns)) - - def testIterateTruncate(self): - '''check random access per range''' - for contig, length in zip(self.samfile.references, - self.samfile.lengths): - for start in range(1, length, 90): - # this includes empty ranges - self.checkRange(contig, start, start + 90, truncate=True) - - def tearDown(self): - 
self.samfile.close() - - class TestIteratorRowCRAM(TestIteratorRowBAM): filename = os.path.join(BAM_DATADIR, "ex2.cram") mode = "rc" @@ -1084,9 +995,6 @@ class TestIteratorRowCRAMWithReferenceFilename(TestIteratorRowCRAM): reference_filename = os.path.join(BAM_DATADIR, "ex1.fa") -########################################################## -########################################################## -########################################################## # needs to be implemented # class TestAlignedSegmentFromSamWithoutHeader(TestAlignedSegmentFromBam): # @@ -1095,50 +1003,6 @@ class TestIteratorRowCRAMWithReferenceFilename(TestIteratorRowCRAM): # self.reads=list(self.samfile.fetch()) -class TestIteratorColumn2(unittest.TestCase): - - '''test iterator column against contents of ex1.bam.''' - - def setUp(self): - self.samfile = pysam.AlignmentFile( - os.path.join(BAM_DATADIR, "ex1.bam"), - "rb") - - def testStart(self): - # print self.samfile.fetch().next().reference_start - # print self.samfile.pileup().next().reference_start - pass - - def testTruncate(self): - '''see issue 107.''' - # note that ranges in regions start from 1 - p = self.samfile.pileup(region='chr1:170:172', truncate=True) - columns = [x.reference_pos for x in p] - self.assertEqual(len(columns), 3) - self.assertEqual(columns, [169, 170, 171]) - - p = self.samfile.pileup('chr1', 169, 172, truncate=True) - columns = [x.reference_pos for x in p] - - self.assertEqual(len(columns), 3) - self.assertEqual(columns, [169, 170, 171]) - - def testAccessOnClosedIterator(self): - '''see issue 131 - - Accessing pileup data after iterator has closed. 
- ''' - pcolumn = self.samfile.pileup('chr1', 170, 180).__next__() - self.assertRaises(ValueError, getattr, pcolumn, "pileups") - - def testStr(self): - '''test if PileupRead can be printed.''' - iter = self.samfile.pileup('chr1', 170, 180) - pcolumn = iter.__next__() - s = str(pcolumn) - self.assertEqual(len(s.split("\n")), 2) - - class TestFloatTagBug(unittest.TestCase): '''see issue 71''' @@ -1163,13 +1027,14 @@ class TestLargeFieldBug(unittest.TestCase): causes an error: NotImplementedError: tags field too large ''' - samfile = pysam.AlignmentFile(os.path.join(BAM_DATADIR, "issue100.bam")) + samfile = pysam.AlignmentFile( + os.path.join(BAM_DATADIR, "issue100.bam")) read = next(samfile.fetch(until_eof=True)) new_read = pysam.AlignedSegment() new_read.tags = read.tags self.assertEqual(new_read.tags, read.tags) - + class TestClipping(unittest.TestCase): def testClipping(self): @@ -1186,7 +1051,8 @@ class TestClipping(unittest.TestCase): self.assertEqual(pysam.qualities_to_qualitystring(read.query_qualities), None) self.assertEqual( - pysam.qualities_to_qualitystring(read.query_alignment_qualities), + pysam.qualities_to_qualitystring( + read.query_alignment_qualities), None) elif read.query_name == "r002": @@ -1197,7 +1063,8 @@ class TestClipping(unittest.TestCase): pysam.qualities_to_qualitystring(read.query_qualities), '01234567890') self.assertEqual( - pysam.qualities_to_qualitystring(read.query_alignment_qualities), + pysam.qualities_to_qualitystring( + read.query_alignment_qualities), '567890') elif read.query_name == "r003": @@ -1208,7 +1075,8 @@ class TestClipping(unittest.TestCase): pysam.qualities_to_qualitystring(read.query_qualities), '01234567890') self.assertEqual( - pysam.qualities_to_qualitystring(read.query_alignment_qualities), + pysam.qualities_to_qualitystring( + read.query_alignment_qualities), '012345') elif read.query_name == "r004": @@ -1219,262 +1087,32 @@ class TestClipping(unittest.TestCase): 
pysam.qualities_to_qualitystring(read.query_qualities), '01234') self.assertEqual( - pysam.qualities_to_qualitystring(read.query_alignment_qualities), + pysam.qualities_to_qualitystring( + read.query_alignment_qualities), '01234') -class TestHeaderSAM(unittest.TestCase): - - """testing header manipulation""" - - header = {'SQ': [{'LN': 1575, 'SN': 'chr1', 'AH': 'chr1:5000000-5010000'}, - {'LN': 1584, 'SN': 'chr2', 'AH': '*'}], - 'RG': [{'LB': 'SC_1', 'ID': 'L1', 'SM': 'NA12891', - 'PU': 'SC_1_10', "CN": "name:with:colon"}, - {'LB': 'SC_2', 'ID': 'L2', 'SM': 'NA12891', - 'PU': 'SC_2_12', "CN": "name:with:colon"}], - 'PG': [{'ID': 'P1', 'VN': '1.0'}, {'ID': 'P2', 'VN': '1.1'}], - 'HD': {'VN': '1.0'}, - 'CO': ['this is a comment', 'this is another comment'], - } - - def compareHeaders(self, a, b): - '''compare two headers a and b.''' - for ak, av in a.items(): - self.assertTrue(ak in b, "key '%s' not in '%s' " % (ak, b)) - self.assertEqual(av, b[ak]) - - def setUp(self): - self.samfile = pysam.AlignmentFile( - os.path.join(BAM_DATADIR, "ex3.sam"), - "r") - - def testHeaders(self): - self.compareHeaders(self.header, self.samfile.header) - self.compareHeaders(self.samfile.header, self.header) - - def testNameMapping(self): - for x, y in enumerate(("chr1", "chr2")): - tid = self.samfile.gettid(y) - ref = self.samfile.getrname(x) - self.assertEqual(tid, x) - self.assertEqual(ref, y) - - self.assertEqual(self.samfile.gettid("chr?"), -1) - self.assertRaises(ValueError, self.samfile.getrname, 2) - - def tearDown(self): - self.samfile.close() - - -class TestHeaderBAM(TestHeaderSAM): - - def setUp(self): - self.samfile = pysam.AlignmentFile( - os.path.join(BAM_DATADIR, "ex3.bam"), - "rb") - - -class TestHeaderCRAM(TestHeaderSAM): - - def setUp(self): - self.samfile = pysam.AlignmentFile( - os.path.join(BAM_DATADIR, "ex3.cram"), - "rc") - - def compareHeaders(self, a, b): - '''compare two headers a and b.''' - def _strip(dd): - for x in dd: - for y in ("M5", "UR"): - if y in x: 
- del x[y] - - for ak, av in a.items(): - _strip(av) - self.assertTrue(ak in b, "key '%s' not in '%s' " % (ak, b)) - _strip(b[ak]) - - self.assertEqual(av, b[ak]) - - -class TestHeaderFromRefs(unittest.TestCase): - - '''see issue 144 +class TestUnmappedReadsRetrieval(unittest.TestCase): - reference names need to be converted to string for python 3 - ''' - - # def testHeader( self ): - # refs = ['chr1', 'chr2'] - # tmpfile = "tmp_%i" % id(self) - # s = pysam.AlignmentFile(tmpfile, 'wb', - # referencenames=refs, - # referencelengths=[100]*len(refs)) - # s.close() - - # self.assertTrue( checkBinaryEqual( 'issue144.bam', tmpfile ), - # 'bam files differ') - # os.unlink( tmpfile ) - - -class TestHeader1000Genomes(unittest.TestCase): - - '''see issue 110''' - # bamfile = "http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/phase2b_alignment/data/NA07048/exome_alignment/NA07048.unmapped.ILLUMINA.bwa.CEU.exome.20120522_p2b.bam" - bamfile = "http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/phase3_EX_or_LC_only_alignment/data/HG00104/alignment/HG00104.chrom11.ILLUMINA.bwa.GBR.low_coverage.20130415.bam" - - def testRead(self): - - if not checkURL(self.bamfile): - return - - f = pysam.AlignmentFile(self.bamfile, "rb") - data = f.header.copy() - self.assertTrue(data) - - -class TestHeaderWriteRead(unittest.TestCase): - header = {'SQ': [{'LN': 1575, 'SN': 'chr1'}, - {'LN': 1584, 'SN': 'chr2'}], - 'RG': [{'LB': 'SC_1', 'ID': 'L1', 'SM': 'NA12891', - 'PU': 'SC_1_10', "CN": "name:with:colon"}, - {'LB': 'SC_2', 'ID': 'L2', 'SM': 'NA12891', - 'PU': 'SC_2_12', "CN": "name:with:colon"}], - 'PG': [{'ID': 'P1', 'VN': '1.0', 'CL': 'tool'}, - {'ID': 'P2', 'VN': '1.1', 'CL': 'tool with in option -R a\tb', - 'PP': 'P1'}], - 'HD': {'VN': '1.0'}, - 'CO': ['this is a comment', 'this is another comment'], - } - - def compare_headers(self, a, b): - '''compare two headers a and b. - - Ignore M5 and UR field as they are set application specific. 
- ''' - for ak, av in a.items(): - self.assertTrue(ak in b, "key '%s' not in '%s' " % (ak, b)) - self.assertEqual( - len(av), len(b[ak]), - "unequal number of entries for key {}: {} vs {}" - .format(ak, av, b[ak])) - - for row_a, row_b in zip(av, b[ak]): - if isinstance(row_b, dict): - for x in ["M5", "UR"]: - try: - del row_b[x] - except KeyError: - pass - self.assertEqual(row_a, row_b) - - def check_read_write(self, flag_write, header): - - fn = get_temp_filename() - with pysam.AlignmentFile( - fn, - flag_write, - header=header, - reference_filename=os.path.join(BAM_DATADIR, "ex1.fa")) as outf: - a = pysam.AlignedSegment() - a.query_name = "abc" - outf.write(a) - - with pysam.AlignmentFile(fn) as inf: - read_header = inf.header - - os.unlink(fn) - self.compare_headers(header, read_header) - - def test_SAM(self): - self.check_read_write("wh", self.header) - - def test_BAM(self): - self.check_read_write("wb", self.header) - - def test_CRAM(self): - header = copy.copy(self.header) - # for CRAM, \t needs to be quoted: - header['PG'][1]['CL'] = re.sub(r"\t", r"\\\\t", header['PG'][1]['CL']) - self.check_read_write("wc", header) - - -class TestUnmappedReads(unittest.TestCase): - - # TODO - # def testSAM(self): - # samfile = pysam.AlignmentFile(os.path.join(BAM_DATADIR, "ex5.sam"), - # "r") - # self.assertEqual(len(list(samfile.fetch(until_eof=True))), 2) - # samfile.close() - - def testBAM(self): - samfile = pysam.AlignmentFile(os.path.join(BAM_DATADIR, "ex5.bam"), - "rb") - self.assertEqual(len(list(samfile.fetch(until_eof=True))), 2) - samfile.close() - - -class TestPileupObjects(unittest.TestCase): - - def setUp(self): - self.samfile = pysam.AlignmentFile(os.path.join(BAM_DATADIR, "ex1.bam"), - "rb") - - def testPileupColumn(self): - for pcolumn1 in self.samfile.pileup(region="chr1:105"): - if pcolumn1.reference_pos == 104: - self.assertEqual( - pcolumn1.reference_id, 0, - "chromosome/target id mismatch in position 1: %s != %s" % - (pcolumn1.reference_id, 0)) - 
self.assertEqual( - pcolumn1.reference_pos, 105 - 1, - "position mismatch in position 1: %s != %s" % - (pcolumn1.reference_pos, 105 - 1)) - self.assertEqual( - pcolumn1.nsegments, 2, - "# reads mismatch in position 1: %s != %s" % - (pcolumn1.nsegments, 2)) - for pcolumn2 in self.samfile.pileup(region="chr2:1480"): - if pcolumn2.reference_pos == 1479: - self.assertEqual( - pcolumn2.reference_id, 1, - "chromosome/target id mismatch in position 1: %s != %s" % - (pcolumn2.reference_id, 1)) - self.assertEqual( - pcolumn2.reference_pos, 1480 - 1, - "position mismatch in position 1: %s != %s" % - (pcolumn2.reference_pos, 1480 - 1)) - self.assertEqual( - pcolumn2.nsegments, 12, - "# reads mismatch in position 1: %s != %s" % - (pcolumn2.nsegments, 12)) - - def testPileupRead(self): - for pcolumn1 in self.samfile.pileup(region="chr1:105"): - if pcolumn1.reference_pos == 104: - self.assertEqual( - len(pcolumn1.pileups), 2, - "# reads aligned to column mismatch in position 1" - ": %s != %s" % - (len(pcolumn1.pileups), 2)) - - # self.assertEqual( pcolumn1.pileups[0] # need to test additional - # properties here - - def tearDown(self): - self.samfile.close() + def test_fetch_from_sam_with_until_eof_reads_unmapped_reads(self): + with pysam.AlignmentFile(os.path.join(BAM_DATADIR, "ex5.sam"), + "rb") as samfile: + self.assertEqual(len(list(samfile.fetch(until_eof=True))), 2) - def testIteratorOutOfScope(self): - '''test if exception is raised if pileup col is accessed after - iterator is exhausted.''' + def test_fetch_from_bam_with_until_eof_reads_unmapped_reads(self): + with pysam.AlignmentFile(os.path.join(BAM_DATADIR, "ex5.bam"), + "rb") as samfile: + self.assertEqual(len(list(samfile.fetch(until_eof=True))), 2) - for pileupcol in self.samfile.pileup(): - pass + def test_fetch_with_asterisk_only_returns_unmapped_reads(self): + with pysam.AlignmentFile(os.path.join(BAM_DATADIR, "test_mapped_unmapped.bam"), + "rb") as samfile: + 
self.assertEqual(len(list(samfile.fetch(region="*"))), 4) - self.assertRaises(ValueError, getattr, pileupcol, "pileups") + def test_fetch_with_asterisk_only_returns_unmapped_reads_by_contig(self): + with pysam.AlignmentFile(os.path.join(BAM_DATADIR, "test_mapped_unmapped.bam"), + "rb") as samfile: + self.assertEqual(len(list(samfile.fetch(contig="*"))), 4) class TestContextManager(unittest.TestCase): @@ -1572,7 +1210,7 @@ class TestWrongFormat(unittest.TestCase): def testOpenBamAsSam(self): # test fails, needs to be implemented. # sam.fetch() fails on reading, not on opening - #self.assertRaises(ValueError, pysam.AlignmentFile, + # self.assertRaises(ValueError, pysam.AlignmentFile, # os.path.join(BAM_DATADIR, 'ex1.bam'), # 'r') pass @@ -1598,7 +1236,7 @@ class TestDeNovoConstruction(unittest.TestCase): read_28833_29006_6945 99 chr1 33 20 10M1D25M = 200 167 AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG <<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<< NM:i:1 RG:Z:L1 read_28701_28881_323b 147 chr2 88 30 35M = 500 412 ACCTATATCTTGGCCTTGGCCGATGCGGCCTTGCA <<<<<;<<<<7;:<<<6;<<<<<<<<<<<<7<<<< MF:i:18 RG:Z:L2 - ''' + ''' # noqa header = {'HD': {'VN': '1.0'}, 'SQ': [{'LN': 1575, 'SN': 'chr1'}, @@ -1609,7 +1247,9 @@ class TestDeNovoConstruction(unittest.TestCase): def setUp(self): - a = pysam.AlignedSegment() + header = pysam.AlignmentHeader.from_dict(self.header) + + a = pysam.AlignedSegment(header) a.query_name = "read_28833_29006_6945" a.query_sequence = "AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG" a.flag = 99 @@ -1625,7 +1265,7 @@ class TestDeNovoConstruction(unittest.TestCase): a.tags = (("NM", 1), ("RG", "L1")) - b = pysam.AlignedSegment() + b = pysam.AlignedSegment(header) b.query_name = "read_28701_28881_323b" b.query_sequence = "ACCTATATCTTGGCCTTGGCCGATGCGGCCTTGCA" b.flag = 147 @@ -1667,8 +1307,6 @@ class TestDeNovoConstruction(unittest.TestCase): references = list(infile) for denovo, reference in zip(references, self.reads): checkFieldEqual(self, reference, denovo) - print("reference", 
str(reference), reference.get_tags(with_value_type=True)) - print("denovo", str(denovo), denovo.get_tags(with_value_type=True)) self.assertEqual(reference.compare(denovo), 0) # TODO @@ -1715,15 +1353,19 @@ class TestDeNovoConstructionUserTags(TestDeNovoConstruction): class TestEmptyHeader(unittest.TestCase): - '''see issue 84.''' def testEmptyHeader(self): s = pysam.AlignmentFile(os.path.join(BAM_DATADIR, 'example_empty_header.bam')) - self.assertEqual(s.header, {'SQ': [{'LN': 1000, 'SN': 'chr1'}]}) + self.assertEqual(s.header.to_dict(), {'SQ': [{'LN': 1000, 'SN': 'chr1'}]}) + def test_bam_without_seq_in_header(self): + s = pysam.AlignmentFile(os.path.join(BAM_DATADIR, "example_no_seq_in_header.bam")) + self.assertTrue("SQ" in s.header.to_dict()) + self.assertTrue("@SQ" in str(s.header)) + class TestHeaderWithProgramOptions(unittest.TestCase): '''see issue 39.''' @@ -1732,7 +1374,7 @@ class TestHeaderWithProgramOptions(unittest.TestCase): s = pysam.AlignmentFile(os.path.join(BAM_DATADIR, 'rg_with_tab.bam')) self.assertEqual( - s.header, + s.header.to_dict(), {'SQ': [{'LN': 1575, 'SN': 'chr1'}, {'LN': 1584, 'SN': 'chr2'}], 'PG': [{'PN': 'bwa', @@ -1755,7 +1397,9 @@ class TestTruncatedBAM(unittest.TestCase): def testTruncatedBam2(self): s = pysam.AlignmentFile(os.path.join(BAM_DATADIR, 'ex2_truncated.bam'), ignore_truncation=True) - iterall = lambda x: len([a for a in x]) + + def iterall(x): + return len([a for a in x]) self.assertRaises(IOError, iterall, s) @@ -1783,6 +1427,7 @@ COMPARE_BTAG = [100, 1, 91, 0, 7, 101, 0, 201, 96, 204, 295, 0, 200, 16, 172, 3, 16, 182, 3, 11, 0, 0, 223, 111, 103, 0, 5, 225, 0, 95] + class TestBTagSam(unittest.TestCase): '''see issue 81.''' @@ -1855,22 +1500,43 @@ class TestDoubleFetchBAM(unittest.TestCase): samfile1.fetch(multiple_iterators=True)): self.assertEqual(a.compare(b), 0) - def testDoubleFetchWithRegion(self): + def testDoubleFetchWithRegionTrueTrue(self): with pysam.AlignmentFile(self.filename, self.mode) as samfile1: - 
contig, start, stop = 'chr1', 200, 3000000 + contig, start, stop = 'chr2', 200, 3000000 # just making sure the test has something to catch self.assertTrue(len(list(samfile1.fetch(contig, start, stop))) > 0) - # see Issue #293 - # The following fails for CRAM files, but works for BAM - # files when the first is multiple_iterators=False: for a, b in zip(samfile1.fetch(contig, start, stop, multiple_iterators=True), samfile1.fetch(contig, start, stop, multiple_iterators=True)): self.assertEqual(a.compare(b), 0) + def testDoubleFetchWithRegionFalseTrue(self): + with pysam.AlignmentFile(self.filename, self.mode) as samfile1: + contig, start, stop = 'chr2', 200, 3000000 + # just making sure the test has something to catch + self.assertTrue(len(list(samfile1.fetch(contig, start, stop))) > 0) + + for a, b in zip(samfile1.fetch(contig, start, stop, + multiple_iterators=False), + samfile1.fetch(contig, start, stop, + multiple_iterators=True)): + self.assertEqual(a.compare(b), 0) + + def testDoubleFetchWithRegionTrueFalse(self): + with pysam.AlignmentFile(self.filename, self.mode) as samfile1: + contig, start, stop = 'chr2', 200, 3000000 + # just making sure the test has something to catch + self.assertTrue(len(list(samfile1.fetch(contig, start, stop))) > 0) + + for a, b in zip(samfile1.fetch(contig, start, stop, + multiple_iterators=True), + samfile1.fetch(contig, start, stop, + multiple_iterators=False)): + self.assertEqual(a.compare(b), 0) + def testDoubleFetchUntilEOF(self): with pysam.AlignmentFile(self.filename, self.mode) as samfile1: @@ -2015,69 +1681,6 @@ class TestLargeOptValues(unittest.TestCase): self.check(samfile) -class TestPileup(unittest.TestCase): - - '''test pileup functionality.''' - - samfilename = os.path.join(BAM_DATADIR, "ex1.bam") - fastafilename = os.path.join(BAM_DATADIR, "ex1.fa") - - def setUp(self): - - self.samfile = pysam.AlignmentFile(self.samfilename) - self.fastafile = pysam.FastaFile(self.fastafilename) - - def tearDown(self): - 
self.samfile.close() - self.fastafile.close() - - def checkEqual(self, references, iterator): - - for x, column in enumerate(iterator): - v = references[x][:-1].split("\t") - self.assertEqual( - len(v), 6, - "expected 6 values, got {}".format(v)) - (contig, pos, reference_base, - read_bases, read_qualities, alignment_mapping_qualities) \ - = v - self.assertEqual(int(pos) - 1, column.reference_pos) - - def testSamtoolsStepper(self): - refs = force_str( - pysam.samtools.mpileup( - "-f", self.fastafilename, - self.samfilename)).splitlines(True) - iterator = self.samfile.pileup( - stepper="samtools", - fastafile=self.fastafile) - self.checkEqual(refs, iterator) - - def testAllStepper(self): - refs = force_str( - pysam.samtools.mpileup( - "-f", self.fastafilename, - "-A", "-B", - self.samfilename)).splitlines(True) - - iterator = self.samfile.pileup( - stepper="all", - fastafile=self.fastafile) - self.checkEqual(refs, iterator) - - -class TestPileupFastafile(TestPileup): - '''test pileup functionality - backwards compatibility''' - - samfilename = os.path.join(BAM_DATADIR, "ex1.bam") - fastafilename = os.path.join(BAM_DATADIR, "ex1.fa") - - def setUp(self): - - self.samfile = pysam.AlignmentFile(self.samfilename) - self.fastafile = pysam.Fastafile(self.fastafilename) - - class TestCountCoverage(unittest.TestCase): samfilename = os.path.join(BAM_DATADIR, "ex1.bam") @@ -2085,41 +1688,46 @@ class TestCountCoverage(unittest.TestCase): def setUp(self): - self.samfile = pysam.AlignmentFile(self.samfilename) - self.fastafile = pysam.FastaFile(self.fastafilename) - - samfile = pysam.AlignmentFile( - "tests/test_count_coverage_read_all.bam", 'wb', - template=self.samfile) - for ii, read in enumerate(self.samfile.fetch()): - # if ii % 2 == 0: # setting BFUNMAP makes no sense... 
- #read.flag = read.flag | 0x4 - if ii % 3 == 0: - read.flag = read.flag | 0x100 - if ii % 5 == 0: - read.flag = read.flag | 0x200 - if ii % 7 == 0: - read.flag = read.flag | 0x400 - samfile.write(read) - samfile.close() - pysam.samtools.index("tests/test_count_coverage_read_all.bam") + self.fastafile = pysam.Fastafile(self.fastafilename) + self.tmpfilename = get_temp_filename(".bam") + + with pysam.AlignmentFile(self.samfilename) as inf: + with pysam.AlignmentFile( + self.tmpfilename, + 'wb', + template=inf) as outf: + for ii, read in enumerate(inf.fetch()): + # if ii % 2 == 0: # setting BFUNMAP makes no sense... + #read.flag = read.flag | 0x4 + if ii % 3 == 0: + read.flag = read.flag | 0x100 + if ii % 5 == 0: + read.flag = read.flag | 0x200 + if ii % 7 == 0: + read.flag = read.flag | 0x400 + outf.write(read) + pysam.samtools.index(self.tmpfilename) def tearDown(self): - self.samfile.close() self.fastafile.close() - os.unlink("tests/test_count_coverage_read_all.bam") - os.unlink("tests/test_count_coverage_read_all.bam.bai") + os.unlink(self.tmpfilename) + os.unlink(self.tmpfilename + ".bai") - def count_coverage_python(self, bam, chrom, start, stop, + def count_coverage_python(self, + bam, chrom, start, stop, read_callback, quality_threshold=15): + stop = min(stop, bam.get_reference_length(chrom)) l = stop - start count_a = array.array('L', [0] * l) count_c = array.array('L', [0] * l) count_g = array.array('L', [0] * l) count_t = array.array('L', [0] * l) - for p in bam.pileup(chrom, start, stop, truncate=True, - stepper='nofilter'): + for p in bam.pileup(chrom, start, stop, + truncate=True, + stepper='nofilter', + min_base_quality=quality_threshold, + ignore_overlaps=False): rpos = p.reference_pos - start for read in p.pileups: if not read.is_del and not read.is_refskip and \ @@ -2140,18 +1748,56 @@ class TestCountCoverage(unittest.TestCase): pass return count_a, count_c, count_g, count_t - def test_count_coverage(self): + def 
test_count_coverage_with_coordinates_works(self): + + with pysam.AlignmentFile(self.samfilename) as inf: + c = inf.count_coverage("chr1") + self.assertEqual(len(c[0]), inf.get_reference_length("chr1")) + self.assertEqual(len(c[0]), 1575) + + c = inf.count_coverage("chr1", 100) + self.assertEqual(len(c[0]), inf.get_reference_length("chr1") - 100) + + c = inf.count_coverage("chr1", 100, 200) + self.assertEqual(len(c[0]), 200 - 100) + + c = inf.count_coverage("chr1", None, 200) + self.assertEqual(len(c[0]), 200) + + c = inf.count_coverage("chr1", None, inf.get_reference_length("chr1") + 10000) + self.assertEqual(len(c[0]), inf.get_reference_length("chr1")) + + def test_count_coverage_without_coordinates_fails(self): + with pysam.AlignmentFile(self.samfilename) as inf: + self.assertRaises(TypeError, inf.count_coverage) + + def test_count_coverage_wrong_coordinates_fails(self): + with pysam.AlignmentFile(self.samfilename) as inf: + self.assertRaises(ValueError, inf.count_coverage, "chr1", 200, 100) + self.assertRaises(KeyError, inf.count_coverage, "chrUnknown", 100, 200) + + def test_counting_the_same_region_works(self): + + with pysam.AlignmentFile(self.samfilename) as inf: + c1 = inf.count_coverage("chr1") + c2 = inf.count_coverage("chr1") + self.assertEqual(c1, c2) + + def test_count_coverage_counts_as_expected(self): chrom = 'chr1' start = 0 - stop = 2000 - manual_counts = self.count_coverage_python( - self.samfile, chrom, start, stop, - lambda read: True, - quality_threshold=0) - fast_counts = self.samfile.count_coverage( - chrom, start, stop, - read_callback=lambda read: True, - quality_threshold=0) + stop = 1000 + + with pysam.AlignmentFile(self.samfilename) as inf: + manual_counts = self.count_coverage_python( + inf, chrom, start, stop, + lambda read: True, + quality_threshold=0) + + fast_counts = inf.count_coverage( + chrom, start, stop, + read_callback=lambda read: True, + quality_threshold=0) self.assertEqual(list(fast_counts[0]), list(manual_counts[0])) 
self.assertEqual(list(fast_counts[1]), list(manual_counts[1])) @@ -2161,15 +1807,17 @@ class TestCountCoverage(unittest.TestCase): def test_count_coverage_quality_filter(self): chrom = 'chr1' start = 0 - stop = 2000 - manual_counts = self.count_coverage_python( - self.samfile, chrom, start, stop, - lambda read: True, - quality_threshold=0) - fast_counts = self.samfile.count_coverage( - chrom, start, stop, - read_callback=lambda read: True, - quality_threshold=15) + stop = 1000 + with pysam.AlignmentFile(self.samfilename) as inf: + manual_counts = self.count_coverage_python( + inf, chrom, start, stop, + lambda read: True, + quality_threshold=0) + fast_counts = inf.count_coverage( + chrom, start, stop, + read_callback=lambda read: True, + quality_threshold=15) + # we filtered harder, should be less for i in range(4): for r in range(start, stop): @@ -2178,22 +1826,25 @@ class TestCountCoverage(unittest.TestCase): def test_count_coverage_read_callback(self): chrom = 'chr1' start = 0 - stop = 2000 - manual_counts = self.count_coverage_python( - self.samfile, chrom, start, stop, - lambda read: read.flag & 0x10, - quality_threshold=0) - fast_counts = self.samfile.count_coverage( - chrom, start, stop, - read_callback=lambda read: True, - quality_threshold=0) - for i in range(4): - for r in range(start, stop): - self.assertTrue(fast_counts[i][r] >= manual_counts[i][r]) - fast_counts = self.samfile.count_coverage( - chrom, start, stop, - read_callback=lambda read: read.flag & 0x10, - quality_threshold=0) + stop = 1000 + with pysam.AlignmentFile(self.samfilename) as inf: + manual_counts = self.count_coverage_python( + inf, chrom, start, stop, + lambda read: read.flag & 0x10, + quality_threshold=0) + fast_counts = inf.count_coverage( + chrom, start, stop, + read_callback=lambda read: True, + quality_threshold=0) + + for i in range(4): + for r in range(start, stop): + self.assertTrue(fast_counts[i][r] >= manual_counts[i][r]) + + fast_counts = inf.count_coverage( + chrom, start, 
stop, + read_callback=lambda read: read.flag & 0x10, + quality_threshold=0) self.assertEqual(fast_counts[0], manual_counts[0]) self.assertEqual(fast_counts[1], manual_counts[1]) @@ -2204,17 +1855,16 @@ class TestCountCoverage(unittest.TestCase): chrom = 'chr1' start = 0 - stop = 2000 + stop = 1000 def filter(read): return not (read.flag & (0x4 | 0x100 | 0x200 | 0x400)) - with pysam.AlignmentFile("tests/test_count_coverage_read_all.bam") as samfile: - + with pysam.AlignmentFile(self.tmpfilename) as samfile: fast_counts = samfile.count_coverage( chrom, start, stop, read_callback='all', - #read_callback = lambda read: ~(read.flag & (0x4 | 0x100 | 0x200 | 0x400)), + # read_callback = lambda read: ~(read.flag & (0x4 | 0x100 | 0x200 | 0x400)), quality_threshold=0) manual_counts = samfile.count_coverage( chrom, start, stop, @@ -2228,61 +1878,41 @@ class TestCountCoverage(unittest.TestCase): self.assertEqual(fast_counts[3], manual_counts[3]) def test_count_coverage_nofilter(self): - samfile = pysam.AlignmentFile( - "tests/test_count_coverage_nofilter.bam", 'wb', template=self.samfile) - for ii, read in enumerate(self.samfile.fetch()): - # if ii % 2 == 0: # setting BFUNMAP makes no sense... - #read.flag = read.flag | 0x4 - if ii % 3 == 0: - read.flag = read.flag | 0x100 - if ii % 5 == 0: - read.flag = read.flag | 0x200 - if ii % 7 == 0: - read.flag = read.flag | 0x400 - samfile.write(read) - samfile.close() - pysam.samtools.index("tests/test_count_coverage_nofilter.bam") + + with pysam.AlignmentFile(self.samfilename) as inf: + with pysam.AlignmentFile( + self.tmpfilename, 'wb', template=inf) as outf: + + for ii, read in enumerate(inf.fetch()): + # if ii % 2 == 0: # setting BFUNMAP makes no sense... 
+ # read.flag = read.flag | 0x4 + if ii % 3 == 0: + read.flag = read.flag | 0x100 + if ii % 5 == 0: + read.flag = read.flag | 0x200 + if ii % 7 == 0: + read.flag = read.flag | 0x400 + outf.write(read) + + pysam.samtools.index(self.tmpfilename) chr = 'chr1' start = 0 - stop = 2000 + stop = 1000 - with pysam.AlignmentFile("tests/test_count_coverage_nofilter.bam") as samfile: + with pysam.AlignmentFile(self.tmpfilename) as inf: + fast_counts = inf.count_coverage(chr, start, stop, + read_callback='nofilter', + quality_threshold=0) - fast_counts = samfile.count_coverage(chr, start, stop, - read_callback='nofilter', - quality_threshold=0) - - manual_counts = self.count_coverage_python(samfile, chr, start, stop, + manual_counts = self.count_coverage_python(inf, chr, start, stop, read_callback=lambda x: True, quality_threshold=0) - os.unlink("tests/test_count_coverage_nofilter.bam") - os.unlink("tests/test_count_coverage_nofilter.bam.bai") self.assertEqual(fast_counts[0], manual_counts[0]) self.assertEqual(fast_counts[1], manual_counts[1]) self.assertEqual(fast_counts[2], manual_counts[2]) self.assertEqual(fast_counts[3], manual_counts[3]) - - -class TestPileupQueryPosition(unittest.TestCase): - - filename = "test_query_position.bam" - - def testPileup(self): - last = {} - with pysam.AlignmentFile(os.path.join(BAM_DATADIR, self.filename)) as inf: - for col in inf.pileup(): - for r in col.pileups: - # print r.alignment.query_name - # print r.query_position, r.query_position_or_next, r.is_del - if r.is_del: - self.assertEqual(r.query_position, None) - self.assertEqual(r.query_position_or_next, - last[r.alignment.query_name] + 1) - else: - self.assertNotEqual(r.query_position, None) - last[r.alignment.query_name] = r.query_position - + class TestFindIntrons(unittest.TestCase): samfilename = os.path.join(BAM_DATADIR, "ex_spliced.bam") @@ -2296,16 +1926,18 @@ class TestFindIntrons(unittest.TestCase): def test_total(self): all_read_counts = self.samfile.count() splice_sites = 
self.samfile.find_introns(self.samfile.fetch()) - self.assertEqual(sum(splice_sites.values()), all_read_counts -1) # there is a single unspliced read in there + # there is a single unspliced read in there + self.assertEqual(sum(splice_sites.values()), all_read_counts - 1) def test_first(self): reads = list(self.samfile.fetch())[:10] splice_sites = self.samfile.find_introns(reads) - starts = [14792+38 - 1] - stops = [14792+38 + 140 - 1] + starts = [14792 + 38 - 1] + stops = [14792 + 38 + 140 - 1] self.assertEqual(len(splice_sites), 1) self.assertTrue((starts[0], stops[0]) in splice_sites) - self.assertEqual(splice_sites[(starts[0], stops[0])], 9) # first one is the unspliced read + # first one is the unspliced read + self.assertEqual(splice_sites[(starts[0], stops[0])], 9) def test_all(self): reads = list(self.samfile.fetch()) @@ -2320,8 +1952,8 @@ class TestFindIntrons(unittest.TestCase): (17055, 17605): 3, (17055, 17914): 1, (17368, 17605): 7, - }) - self.assertEqual(should, splice_sites) + }) + self.assertEqual(should, splice_sites) class TestLogging(unittest.TestCase): @@ -2501,7 +2133,7 @@ class TestMappedUnmapped(unittest.TestCase): counts_contigs = [x.contig for x in counts] self.assertEqual(sorted(counts_contigs), sorted(inf.references)) - + for contig in inf.references: unmapped_flag = 0 unmapped_nopos = 0 @@ -2517,6 +2149,7 @@ class TestMappedUnmapped(unittest.TestCase): self.assertEqual(cc.unmapped, unmapped_flag) self.assertEqual(cc.total, mapped_flag + unmapped_flag) + class TestSamtoolsProxy(unittest.TestCase): '''tests for sanity checking access to samtools functions.''' @@ -2620,15 +2253,104 @@ class TestSanityCheckingBAM(unittest.TestCase): read = pysam.AlignedSegment() self.check_write(read) + +class TestHeader1000Genomes(unittest.TestCase): + + '''see issue 110''' + bamfile = "http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/phase3_EX_or_LC_only_alignment/data/HG00104/alignment/HG00104.chrom11.ILLUMINA.bwa.GBR.low_coverage.20130415.bam" # noqa 
+ + def testRead(self): + + if not checkURL(self.bamfile): + return + + f = pysam.AlignmentFile(self.bamfile, "rb") + data = f.header.copy() + self.assertTrue(data) + + +class TestLargeCigar(unittest.TestCase): + + def setUp(self): + self.read_length = 70000 + self.header = pysam.AlignmentHeader.from_references( + ["chr1", "chr2"], + [self.read_length * 2, self.read_length * 2]) + + def build_read(self): + '''build an example read.''' + + a = pysam.AlignedSegment(self.header) + l = self.read_length + a.query_name = "read_12345" + a.query_sequence = "A" * (l + 1) + a.flag = 0 + a.reference_id = 0 + a.reference_start = 20 + a.mapping_quality = 20 + a.cigarstring = "1M1D" * l + "1M" + self.assertEqual(len(a.cigartuples), 2 * l + 1) + a.next_reference_id = 0 + a.next_reference_start = 0 + a.template_length = l + a.query_qualities = pysam.qualitystring_to_array("1") * (l + 1) + return a + + def check_read(self, read, mode="bam"): + fn = get_temp_filename("tmp_largecigar.{}".format(mode)) + fn_reference = get_temp_filename("tmp_largecigar.fa") + + nrows = int(self.read_length * 2 / 80) + + s = "\n".join(["A" * 80 for x in range(nrows)]) + with open(fn_reference, "w") as outf: + outf.write(">chr1\n{seq}\n>chr2\n{seq}\n".format( + seq=s)) + + if mode == "bam": + write_mode = "wb" + elif mode == "sam": + write_mode = "w" + elif mode == "cram": + write_mode = "wc" + + with pysam.AlignmentFile(fn, write_mode, + header=self.header, + reference_filename=fn_reference) as outf: + outf.write(read) + + with pysam.AlignmentFile(fn) as inf: + ref_read = next(inf) + + if mode == "cram": + # in CRAM, the tag field is kept, while it is emptied by the BAM/SAM reader + self.assertEqual(read.cigarstring, ref_read.cigarstring) + else: + self.assertEqual(read, ref_read) + + os.unlink(fn) + os.unlink(fn_reference) + + def test_reading_writing_sam(self): + read = self.build_read() + self.check_read(read, mode="sam") + + def test_reading_writing_bam(self): + read = self.build_read() + 
self.check_read(read, mode="bam") + + def test_reading_writing_cram(self): + read = self.build_read() + self.check_read(read, mode="cram") + # SAM writing fails, as query length is 0 # class TestSanityCheckingSAM(TestSanityCheckingSAM): # mode = "w" - if __name__ == "__main__": # build data files - print ("building data files") + print("building data files") subprocess.call("make -C %s" % BAM_DATADIR, shell=True) - print ("starting tests") + print("starting tests") unittest.main() - print ("completed tests") + print("completed tests") diff --git a/tests/PileupTestUtils.py b/tests/PileupTestUtils.py new file mode 100644 index 0000000..652bd5b --- /dev/null +++ b/tests/PileupTestUtils.py @@ -0,0 +1,160 @@ +import os +import subprocess +import pysam + +from TestUtils import BAM_DATADIR, force_str + +def build_pileup_with_samtoolsshell(fn): + os.system("samtools mpileup {} 2> /dev/null | wc -l > /dev/null".format(fn)) + return 2998 + + +def build_pileup_with_samtoolspipe(fn): + FNULL = open(os.devnull, 'w') + with subprocess.Popen(["samtools", "mpileup", fn], + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=FNULL) as proc: + return len(proc.stdout.readlines()) + + +def build_pileup_with_pysam(*args, **kwargs): + with pysam.AlignmentFile(*args, **kwargs) as inf: + return len(list(inf.pileup(stepper="samtools"))) + + +def build_depth_with_samtoolsshell(fn): + os.system( + "samtools mpileup {} 2> /dev/null | awk '{{a += $4}} END {{print a}}' > /dev/null".format(fn)) + return 107241 + + +def build_depth_with_samtoolspipe(fn): + FNULL = open(os.devnull, 'w') + with subprocess.Popen(["samtools", "mpileup", fn], + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=FNULL) as proc: + data = [x.split() for x in proc.stdout.readlines()] + return [int(x[3]) for x in data] + + +def build_depth_with_filter_with_pysam(*args, **kwargs): + with pysam.AlignmentFile(*args, **kwargs) as inf: + return [x.get_num_aligned() for x in inf.pileup(stepper="samtools")] + + +def 
build_depth_with_pysam(*args, **kwargs): + with pysam.AlignmentFile(*args, **kwargs) as inf: + return [x.nsegments for x in inf.pileup(stepper="samtools")] + + +def build_query_bases_with_samtoolsshell(fn): + os.system("samtools mpileup {} 2> /dev/null | awk '{{a = a $5}} END {{print a}}' | wc -c > /dev/null".format(fn)) + return 116308 + + +def build_query_bases_with_samtoolspipe(fn, *args, **kwargs): + FNULL = open(os.devnull, 'w') + with subprocess.Popen(["samtools", "mpileup", fn] + list(args), + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=FNULL) as proc: + stdout = proc.stdout.read().decode() + return [x.split()[4] for x in stdout.splitlines()] + + +def build_query_bases_with_samtoolspysam(fn, *args): + return [x.split()[4] for x in pysam.samtools.mpileup(fn, *args).splitlines()] + + +def build_query_bases_with_pysam_pileups(*args, **kwargs): + total_pileup = [] + with pysam.AlignmentFile(*args, **kwargs) as inf: + total_pileup = [ + [r.alignment.query_sequence[r.query_position_or_next] + for r in column.pileups if r.query_position_or_next is not None] + for column in inf.pileup(stepper="samtools")] + return total_pileup + + +def build_query_qualities_with_pysam_pileups(*args, **kwargs): + total_pileup = [] + with pysam.AlignmentFile(*args, **kwargs) as inf: + total_pileup = [ + [r.alignment.query_qualities[r.query_position_or_next] + for r in column.pileups if r.query_position_or_next is not None] + for column in inf.pileup(stepper="samtools")] + return total_pileup + + +def build_query_bases_with_pysam(fn, *args, **kwargs): + total_pileup = [] + with pysam.AlignmentFile(fn) as inf: + total_pileup = [column.get_query_sequences( + mark_ends=True, add_indels=True, mark_matches=True) for column in + inf.pileup(*args, **kwargs)] + return total_pileup + + +def build_query_names_with_pysam(*args, **kwargs): + total_pileup = [] + with pysam.AlignmentFile(*args, **kwargs) as inf: + total_pileup = [column.get_query_names() for column in + 
inf.pileup(stepper="samtools")] + return total_pileup + + +def build_query_qualities_with_samtoolspipe(fn): + FNULL = open(os.devnull, 'w') + with subprocess.Popen(["samtools", "mpileup", fn], + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=FNULL) as proc: + data = [force_str(x).split()[5] for x in proc.stdout.readlines()] + return data + + +def build_query_qualities_with_pysam(*args, **kwargs): + total_pileup = [] + with pysam.AlignmentFile(*args, **kwargs) as inf: + total_pileup = [column.get_query_qualities() for column in + inf.pileup(stepper="samtools")] + return total_pileup + + +def build_mapping_qualities_with_samtoolspipe(fn): + FNULL = open(os.devnull, 'w') + with subprocess.Popen(["samtools", "mpileup", "-s", fn], + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=FNULL) as proc: + data = [force_str(x).split()[6] for x in proc.stdout.readlines()] + return data + + +def build_mapping_qualities_with_pysam(*args, **kwargs): + total_pileup = [] + with pysam.AlignmentFile(*args, **kwargs) as inf: + total_pileup = [column.get_mapping_qualities() for column in + inf.pileup(stepper="samtools")] + return total_pileup + + +def build_query_positions_with_samtoolspipe(fn): + FNULL = open(os.devnull, 'w') + with subprocess.Popen(["samtools", "mpileup", "-O", fn], + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=FNULL) as proc: + data = [list(map(int, force_str(x).split()[6].split(","))) + for x in proc.stdout.readlines()] + return data + + +def build_query_positions_with_pysam(*args, **kwargs): + total_pileup = [] + with pysam.AlignmentFile(*args, **kwargs) as inf: + total_pileup = [column.get_query_positions() for column in + inf.pileup(stepper="samtools")] + return total_pileup diff --git a/tests/StreamFiledescriptors_test.py b/tests/StreamFiledescriptors_test.py index f6c5ced..f09ef37 100644 --- a/tests/StreamFiledescriptors_test.py +++ b/tests/StreamFiledescriptors_test.py @@ -4,13 +4,11 @@ import subprocess import threading import 
errno import unittest - from pysam import AlignmentFile +from TestUtils import BAM_DATADIR IS_PYTHON2 = sys.version_info[0] == 2 -from TestUtils import BAM_DATADIR - def alignmentfile_writer_thread(infile, outfile): def _writer_thread(infile, outfile): @@ -51,7 +49,8 @@ class StreamTest(unittest.TestCase): shell=True) as proc: in_stream = AlignmentFile(os.path.join(BAM_DATADIR, 'ex1.bam')) - out_stream = AlignmentFile(proc.stdin, 'wh', header=in_stream.header) + out_stream = AlignmentFile( + proc.stdin, 'wh', header=in_stream.header) writer = alignmentfile_writer_thread(in_stream, out_stream) @@ -63,7 +62,7 @@ class StreamTest(unittest.TestCase): @unittest.skip("test contains bug") def test_samtools_processing(self): - + # The following test causes the suite to hang # as the stream_processor raises: # ValueError: file has no sequences defined (mode='r') - is it SAM/BAM format? @@ -72,9 +71,10 @@ class StreamTest(unittest.TestCase): stdin=subprocess.PIPE, stdout=subprocess.PIPE, shell=True) as proc: - + in_stream = AlignmentFile(os.path.join(BAM_DATADIR, 'ex1.bam')) - out_stream = AlignmentFile(proc.stdin, 'wb', header=in_stream.header) + out_stream = AlignmentFile( + proc.stdin, 'wb', header=in_stream.header) writer = alignmentfile_writer_thread(in_stream, out_stream) @@ -83,7 +83,7 @@ class StreamTest(unittest.TestCase): out_stream, writer) self.assertEqual(read, 35) - + if __name__ == "__main__": unittest.main() diff --git a/tests/TestUtils.py b/tests/TestUtils.py index c5572d3..f4fe8e3 100644 --- a/tests/TestUtils.py +++ b/tests/TestUtils.py @@ -1,24 +1,27 @@ import sys import os -import pysam +import glob import difflib import gzip +import contextlib import inspect import tempfile +import pysam WORKDIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "pysam_test_work")) BAM_DATADIR = os.path.abspath(os.path.join(os.path.dirname(__file__), - "pysam_data")) + "pysam_data")) TABIX_DATADIR = os.path.abspath(os.path.join(os.path.dirname(__file__), 
"tabix_data")) CBCF_DATADIR = os.path.abspath(os.path.join(os.path.dirname(__file__), - "cbcf_data")) + "cbcf_data")) -LINKDIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "linker_tests")) +LINKDIR = os.path.abspath(os.path.join( + os.path.dirname(__file__), "..", "linker_tests")) TESTS_TEMPDIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "tmp")) @@ -41,6 +44,7 @@ if IS_PYTHON3: return s.decode('ascii') except AttributeError: return s + def force_bytes(s): try: return s.encode('ascii') @@ -49,6 +53,7 @@ if IS_PYTHON3: else: def force_str(s): return s + def force_bytes(s): return s @@ -117,9 +122,9 @@ def check_samtools_view_equal( l1 = sorted(l1[:-1].split("\t")) l2 = sorted(l2[:-1].split("\t")) if l1 != l2: - print ("mismatch in line %i" % n) - print (l1) - print (l2) + print("mismatch in line %i" % n) + print(l1) + print(l2) return False else: return False @@ -200,14 +205,38 @@ def get_temp_filename(suffix=""): os.makedirs(TESTS_TEMPDIR) except OSError: pass + f = tempfile.NamedTemporaryFile( - prefix="tmp_{}_".format(caller_name), + prefix="pysamtests_tmp_{}_".format(caller_name), suffix=suffix, delete=False, dir=TESTS_TEMPDIR) + f.close() return f.name +@contextlib.contextmanager +def get_temp_context(suffix="", keep=False): + caller_name = inspect.getouterframes(inspect.currentframe(), 3)[1][3] + try: + os.makedirs(TESTS_TEMPDIR) + except OSError: + pass + + f = tempfile.NamedTemporaryFile( + prefix="pysamtests_tmp_{}_".format(caller_name), + suffix=suffix, + delete=False, + dir=TESTS_TEMPDIR) + + f.close() + yield f.name + + if not keep: + # clear up any indices as well + for f in glob.glob(f.name + "*"): + os.unlink(f) + def load_and_convert(filename, encode=True): '''load data from filename and convert all fields to string. 
@@ -232,3 +261,7 @@ def load_and_convert(filename, encode=True): data.append(d) return data + + +def flatten_nested_list(l): + return [i for ll in l for i in ll] diff --git a/tests/VariantFileFetchTestUtils.py b/tests/VariantFileFetchTestUtils.py new file mode 100644 index 0000000..1aaca37 --- /dev/null +++ b/tests/VariantFileFetchTestUtils.py @@ -0,0 +1,69 @@ +import os +import subprocess +import pysam + +try: + import cyvcf2 +except ImportError: + pass + + +from TestUtils import CBCF_DATADIR, force_str + +def build_filter_from_vcf_with_samtoolsshell(fn): + retval = os.popen( + "bcftools filter -e \"N_ALT != 1 || QUAL < 20 || maf[0]>0.05\" {} | grep -cv ^# ".format(fn)).read() + return int(retval.strip()) + + +def build_filter_from_vcf_with_bcftoolspipe(fn): + FNULL = open(os.devnull, 'w') + with subprocess.Popen([ + "bcftools", + "filter", + "-e", + "N_ALT != 1 || QUAL < 20 || maf[0]>0.05", + fn], + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=FNULL) as proc: + data = [line for line in proc.stdout.readlines() if not line.startswith(b"#")] + return len(data) + + +def build_filter_from_vcf_with_cyvcf2(fn): + n = 0 + try: + for v in cyvcf2.VCF(fn): + if len(v.ALT) > 1: + continue + if v.QUAL < 20: + continue + if v.aaf > 0.05: + continue + n += 1 + except NameError: + n = 9120 + return n + + +def build_filter_from_vcf_with_pysam(fn): + n = 0 + with pysam.VariantFile(fn) as vcf: + for v in vcf: + # the two commands below take >1s out of 19s total + if len(v.alts) > 1: + continue + if v.qual < 20: + continue + # this takes 12s out of 19s total + gts = [s['GT'] for s in v.samples.values()] + # the lines below take 6s out of 19s total + an = sum(len(gt) for gt in gts) + ac = sum(sum(gt) for gt in gts) + aaf = (float(ac) / float(an)) + if aaf > 0.05: + continue + n += 1 + return n + diff --git a/tests/VariantFile_bench.py b/tests/VariantFile_bench.py new file mode 100644 index 0000000..d48760c --- /dev/null +++ b/tests/VariantFile_bench.py @@ -0,0 +1,59 @@ 
+"""Benchmarking module for AlignmentFile functionality""" +import os +import pytest + + +from TestUtils import BAM_DATADIR, force_str, flatten_nested_list +from VariantFileFetchTestUtils import * + + +GENOMES_URL = "ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/ALL.chr{chrom}.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz" + +CHROM = 22 + + +@pytest.fixture +def genomes_data(): + url = GENOMES_URL.format(chrom=CHROM) + fn = os.path.basename(url) + print(fn) + if not os.path.exists(fn): + os.system("wget {}".format(url)) + if not os.path.exists(fn + ".tbi"): + os.system("wget {}".format(url + ".tbi")) + + fn_small = "small.vcf.gz" + if not os.path.exists(fn_small): + os.system("bcftools view {} | head -n 10000 | bgzip > {}".format(fn, fn_small)) + os.system("tabix -p vcf {}".format(fn_small)) + + return fn_small + + +@pytest.mark.benchmark(min_rounds=1) +def test_build_filter_from_vcf_with_bcftoolsshell(benchmark, genomes_data): + result = benchmark(build_filter_from_vcf_with_samtoolsshell, genomes_data) + assert result == 9120 + + +@pytest.mark.benchmark(min_rounds=1) +def test_build_filter_from_vcf_with_bcftoolpipe(benchmark, genomes_data): + result = benchmark(build_filter_from_vcf_with_bcftoolspipe, genomes_data) + assert result == 9120 + + +@pytest.mark.benchmark(min_rounds=1) +def test_build_filter_from_vcf_with_cyvcf2(benchmark, genomes_data): + result = benchmark(build_filter_from_vcf_with_cyvcf2, genomes_data) + # note: inconsistent with bcftools + assert result == 9114 + + +@pytest.mark.benchmark(min_rounds=1) +def test_build_filter_from_vcf_with_pysam(benchmark, genomes_data): + result = benchmark(build_filter_from_vcf_with_pysam, genomes_data) + # note: inconsistent with bcftools + assert result == 9114 + + + diff --git a/tests/VariantFile_test.py b/tests/VariantFile_test.py index dd8df5b..eedcc9a 100644 --- a/tests/VariantFile_test.py +++ b/tests/VariantFile_test.py @@ -2,6 +2,7 @@ import os import sys import unittest 
import pysam +import shutil import gzip import subprocess @@ -10,7 +11,7 @@ try: except ImportError: Path = None -from TestUtils import get_temp_filename, check_lines_equal, load_and_convert, CBCF_DATADIR +from TestUtils import get_temp_filename, check_lines_equal, load_and_convert, CBCF_DATADIR, get_temp_context def read_header(filename): @@ -44,12 +45,12 @@ class TestMissingGenotypes(unittest.TestCase): self.assertEqual(True, os.path.exists(fn)) v = pysam.VariantFile(fn) for site in v: - for ss,rec in site.samples.items(): + for ss, rec in site.samples.items(): a, b = ss, rec v = pysam.VariantFile(fn) for x, site in enumerate(v): - for ss,rec in site.samples.items(): + for ss, rec in site.samples.items(): a, b = ss, rec.alleles a, b = ss, rec.allele_indices @@ -60,6 +61,41 @@ class TestMissingGenotypes(unittest.TestCase): self.check(self.filename + ".gz") +class TestMissingSamples(unittest.TestCase): + + filename = "gnomad.vcf" + + def setUp(self): + self.compare = load_and_convert( + os.path.join(CBCF_DATADIR, self.filename), + encode=False) + + def check(self, filename): + """see issue #593""" + fn = os.path.join(CBCF_DATADIR, filename) + self.assertEqual(True, os.path.exists(fn)) + expect_fail = not "fixed" in self.filename + with pysam.VariantFile(fn) as inf: + rec = next(inf.fetch()) + if expect_fail: + self.assertRaises(ValueError, rec.info.__getitem__, "GC") + else: + self.assertEqual(rec.info["GC"], (27, 35, 16)) + + def testVCF(self): + self.check(self.filename) + + def testVCFGZ(self): + self.check(self.filename + ".gz") + + +class TestMissingSamplesFixed(TestMissingSamples): + # workaround for NUMBER=G in INFO records: + # perl 's/Number=G/Number=./ if (/INFO/)' + + filename = "gnomad_fixed.vcf" + + class TestOpening(unittest.TestCase): def testMissingFile(self): @@ -71,52 +107,40 @@ class TestOpening(unittest.TestCase): "missing_file.vcf.gz") def testEmptyFileVCF(self): - with open("tests/tmp_testEmptyFile.vcf", "w"): - pass - - 
self.assertRaises(ValueError, pysam.VariantFile, - "tests/tmp_testEmptyFile.vcf") - - os.unlink("tests/tmp_testEmptyFile.vcf") - - - if Path and sys.version_info >= (3,6): - def testEmptyFileVCFFromPath(self): - with open("tests/tmp_testEmptyFile.vcf", "w"): + with get_temp_context("tmp_testEmptyFile.vcf") as fn: + with open(fn, "w"): pass + self.assertRaises(ValueError, pysam.VariantFile, fn) - self.assertRaises(ValueError, pysam.VariantFile, - Path("tests/tmp_testEmptyFile.vcf")) - - os.unlink("tests/tmp_testEmptyFile.vcf") + if Path and sys.version_info >= (3, 6): + def testEmptyFileVCFFromPath(self): + with get_temp_context("tmp_testEmptyFile.vcf") as fn: + with open(fn, "w"): + pass + self.assertRaises(ValueError, pysam.VariantFile, + Path(fn)) def testEmptyFileVCFGZWithIndex(self): - with open("tests/tmp_testEmptyFile.vcf", "w"): - pass - - pysam.tabix_index("tests/tmp_testEmptyFile.vcf", - preset="vcf", - force=True) - - self.assertRaises(ValueError, pysam.VariantFile, - "tests/tmp_testEmptyFile.vcf.gz") + with get_temp_context("tmp_testEmptyFile.vcf") as fn: + with open(fn, "w"): + pass + # tabix_index will automatically compress + pysam.tabix_index(fn, + preset="vcf", + force=True) - os.unlink("tests/tmp_testEmptyFile.vcf.gz") - os.unlink("tests/tmp_testEmptyFile.vcf.gz.tbi") + self.assertRaises(ValueError, pysam.VariantFile, fn + ".gz") def testEmptyFileVCFGZWithoutIndex(self): - with open("tests/tmp_testEmptyFileWithoutIndex.vcf", "w"): - pass - - pysam.tabix_compress("tests/tmp_testEmptyFileWithoutIndex.vcf", - "tests/tmp_testEmptyFileWithoutIndex.vcf.gz", - force=True) + with get_temp_context("tmp_testEmptyFileWithoutIndex.vcf") as fn: + with open(fn, "w"): + pass - self.assertRaises(ValueError, pysam.VariantFile, - "tests/tmp_testEmptyFileWithoutIndex.vcf.gz") + pysam.tabix_compress(fn, + fn + ".gz", + force=True) - os.unlink("tests/tmp_testEmptyFileWithoutIndex.vcf") - os.unlink("tests/tmp_testEmptyFileWithoutIndex.vcf.gz") + 
self.assertRaises(ValueError, pysam.VariantFile, fn + ".gz") def testEmptyFileVCFOnlyHeader(self): with pysam.VariantFile(os.path.join( @@ -132,7 +156,7 @@ class TestOpening(unittest.TestCase): def testDetectVCF(self): with pysam.VariantFile(os.path.join(CBCF_DATADIR, - "example_vcf40.vcf")) as inf: + "example_vcf40.vcf")) as inf: self.assertEqual(inf.category, 'VARIANTS') self.assertEqual(inf.format, 'VCF') self.assertEqual(inf.compression, 'NONE') @@ -142,7 +166,7 @@ class TestOpening(unittest.TestCase): def testDetectVCFGZ(self): with pysam.VariantFile(os.path.join(CBCF_DATADIR, - "example_vcf40.vcf.gz")) as inf: + "example_vcf40.vcf.gz")) as inf: self.assertEqual(inf.category, 'VARIANTS') self.assertEqual(inf.format, 'VCF') self.assertEqual(inf.compression, 'BGZF') @@ -162,6 +186,67 @@ class TestOpening(unittest.TestCase): self.assertEqual(len(list(inf.fetch())), 5) +class TestIndexFormatsVCF(unittest.TestCase): + + vcf_filename = os.path.join(CBCF_DATADIR, "example_vcf40.vcf") + bcf_filename = os.path.join(CBCF_DATADIR, "example_vcf40.bcf") + + def test_vcf_with_tbi_index(self): + with get_temp_context("tmp_fn.vcf") as fn: + shutil.copyfile(self.vcf_filename, fn) + pysam.tabix_index(fn, preset="vcf", force=True) + self.assertTrue(os.path.exists(fn + ".gz" + ".tbi")) + self.assertFalse(os.path.exists(fn + ".gz" + ".csi")) + + with pysam.VariantFile(fn + ".gz") as inf: + self.assertEqual(len(list(inf.fetch("20"))), 3) + + def test_vcf_with_csi_index(self): + with get_temp_context("tmp_fn.vcf") as fn: + shutil.copyfile(self.vcf_filename, fn) + + pysam.tabix_index(fn, preset="vcf", force=True, csi=True) + self.assertTrue(os.path.exists(fn + ".gz" + ".csi")) + self.assertFalse(os.path.exists(fn + ".gz" + ".tbi")) + + with pysam.VariantFile(fn + ".gz") as inf: + self.assertEqual(len(list(inf.fetch("20"))), 3) + + def test_bcf_with_prebuilt_csi(self): + with get_temp_context("tmp_fn.bcf") as fn: + shutil.copyfile(self.bcf_filename, fn) + 
shutil.copyfile(self.bcf_filename + ".csi", fn + ".csi") + + self.assertTrue(os.path.exists(fn + ".csi")) + self.assertFalse(os.path.exists(fn + ".tbi")) + + with pysam.VariantFile(fn) as inf: + self.assertEqual(len(list(inf.fetch("20"))), 3) + + def test_bcf_with_tbi_index_will_produce_csi(self): + with get_temp_context("tmp_fn.bcf") as fn: + shutil.copyfile(self.bcf_filename, fn) + + pysam.tabix_index(fn, preset="bcf", force=True, csi=False) + self.assertTrue(os.path.exists(fn + ".csi")) + self.assertFalse(os.path.exists(fn + ".tbi")) + + with pysam.VariantFile(fn) as inf: + self.assertEqual(len(list(inf.fetch("20"))), 3) + + def test_bcf_with_csi_index(self): + with get_temp_context("tmp_fn.bcf") as fn: + shutil.copyfile(self.bcf_filename, fn) + + pysam.tabix_index(fn, preset="vcf", force=True, csi=True) + + self.assertTrue(os.path.exists(fn + ".csi")) + self.assertFalse(os.path.exists(fn + ".tbi")) + + with pysam.VariantFile(fn) as inf: + self.assertEqual(len(list(inf.fetch("20"))), 3) + + class TestHeader(unittest.TestCase): filename = "example_vcf40.vcf" @@ -210,7 +295,7 @@ class TestParsing(unittest.TestCase): chrom = [rec.chrom for rec in v] self.assertEqual(chrom, ['M', '17', '20', '20', '20']) - if Path and sys.version_info >= (3,6): + if Path and sys.version_info >= (3, 6): def testChromFromPath(self): fn = os.path.join(CBCF_DATADIR, self.filename) v = pysam.VariantFile(Path(fn)) @@ -239,7 +324,8 @@ class TestParsing(unittest.TestCase): fn = os.path.join(CBCF_DATADIR, self.filename) v = pysam.VariantFile(fn) ids = [rec.id for rec in v] - self.assertEqual(ids, [None, 'rs6054257', None, 'rs6040355', 'microsat1']) + self.assertEqual( + ids, [None, 'rs6054257', None, 'rs6040355', 'microsat1']) def testRef(self): fn = os.path.join(CBCF_DATADIR, self.filename) @@ -251,13 +337,15 @@ class TestParsing(unittest.TestCase): fn = os.path.join(CBCF_DATADIR, self.filename) v = pysam.VariantFile(fn) alts = [rec.alts for rec in v] - self.assertEqual(alts, [None, ('A',), 
('A',), ('G', 'T'), ('G', 'GTACT')]) + self.assertEqual(alts, [None, ('A',), ('A',), + ('G', 'T'), ('G', 'GTACT')]) def testAlleles(self): fn = os.path.join(CBCF_DATADIR, self.filename) v = pysam.VariantFile(fn) alleles = [rec.alleles for rec in v] - self.assertEqual(alleles, [('T',), ('G', 'A'), ('T', 'A'), ('A', 'G', 'T'), ('GTCT', 'G', 'GTACT')]) + self.assertEqual(alleles, [ + ('T',), ('G', 'A'), ('T', 'A'), ('A', 'G', 'T'), ('GTCT', 'G', 'GTACT')]) def testQual(self): fn = os.path.join(CBCF_DATADIR, self.filename) @@ -269,17 +357,20 @@ class TestParsing(unittest.TestCase): fn = os.path.join(CBCF_DATADIR, self.filename) v = pysam.VariantFile(fn) filter = [rec.filter.keys() for rec in v] - self.assertEqual(filter, [['PASS'], ['PASS'], ['q10'], ['PASS'], ['PASS']]) + self.assertEqual(filter, [['PASS'], ['PASS'], + ['q10'], ['PASS'], ['PASS']]) def testInfo(self): fn = os.path.join(CBCF_DATADIR, self.filename) v = pysam.VariantFile(fn) info = [rec.info.items() for rec in v] self.assertEqual(info, [[('NS', 3), ('DP', 13), ('AA', 'T')], - [('NS', 3), ('DP', 14), ('AF', (0.5,)), ('DB', True), ('H2', True)], - [('NS', 3), ('DP', 11), ('AF', (0.017000000923871994,))], + [('NS', 3), ('DP', 14), ('AF', (0.5,)), + ('DB', True), ('H2', True)], + [('NS', 3), ('DP', 11), + ('AF', (0.017000000923871994,))], [('NS', 2), ('DP', 10), ('AF', (0.3330000042915344, 0.6669999957084656)), - ('AA', 'T'), ('DB', True)], + ('AA', 'T'), ('DB', True)], [('NS', 3), ('DP', 9), ('AA', 'G')]]) def testFormat(self): @@ -308,17 +399,28 @@ class TestParsing(unittest.TestCase): v = pysam.VariantFile(fn) format = [s.items() for rec in v for s in rec.samples.values()] self.assertEqual(format, [[('GT', (0, 0)), ('GQ', 54), ('DP', 7), ('HQ', (56, 60))], - [('GT', (0, 0)), ('GQ', 48), ('DP', 4), ('HQ', (51, 51))], - [('GT', (0, 0)), ('GQ', 61), ('DP', 2), ('HQ', (None,))], - [('GT', (0, 0)), ('GQ', 48), ('DP', 1), ('HQ', (51, 51))], - [('GT', (1, 0)), ('GQ', 48), ('DP', 8), ('HQ', (51, 51))], - [('GT', 
(1, 1)), ('GQ', 43), ('DP', 5), ('HQ', (None, None))], - [('GT', (0, 0)), ('GQ', 49), ('DP', 3), ('HQ', (58, 50))], - [('GT', (0, 1)), ('GQ', 3), ('DP', 5), ('HQ', (65, 3))], - [('GT', (0, 0)), ('GQ', 41), ('DP', 3), ('HQ', (None,))], - [('GT', (1, 2)), ('GQ', 21), ('DP', 6), ('HQ', (23, 27))], - [('GT', (2, 1)), ('GQ', 2), ('DP', 0), ('HQ', (18, 2))], - [('GT', (2, 2)), ('GQ', 35), ('DP', 4), ('HQ', (None,))], + [('GT', (0, 0)), ('GQ', 48), + ('DP', 4), ('HQ', (51, 51))], + [('GT', (0, 0)), ('GQ', 61), + ('DP', 2), ('HQ', (None,))], + [('GT', (0, 0)), ('GQ', 48), + ('DP', 1), ('HQ', (51, 51))], + [('GT', (1, 0)), ('GQ', 48), + ('DP', 8), ('HQ', (51, 51))], + [('GT', (1, 1)), ('GQ', 43), + ('DP', 5), ('HQ', (None, None))], + [('GT', (0, 0)), ('GQ', 49), + ('DP', 3), ('HQ', (58, 50))], + [('GT', (0, 1)), ('GQ', 3), + ('DP', 5), ('HQ', (65, 3))], + [('GT', (0, 0)), ('GQ', 41), + ('DP', 3), ('HQ', (None,))], + [('GT', (1, 2)), ('GQ', 21), + ('DP', 6), ('HQ', (23, 27))], + [('GT', (2, 1)), ('GQ', 2), + ('DP', 0), ('HQ', (18, 2))], + [('GT', (2, 2)), ('GQ', 35), + ('DP', 4), ('HQ', (None,))], [('GT', (0, 1)), ('GQ', 35), ('DP', 4)], [('GT', (0, 2)), ('GQ', 17), ('DP', 2)], [('GT', (1, 1)), ('GQ', 40), ('DP', 3)]]) @@ -336,16 +438,15 @@ class TestIndexFilename(unittest.TestCase): filenames = [('example_vcf40.vcf.gz', 'example_vcf40.vcf.gz.tbi'), ('example_vcf40.vcf.gz', 'example_vcf40.vcf.gz.csi'), - ('example_vcf40.bcf', 'example_vcf40.bcf.csi')] + ('example_vcf40.bcf', 'example_vcf40.bcf.csi')] def testOpen(self): for fn, idx_fn in self.filenames: fn = os.path.join(CBCF_DATADIR, fn) idx_fn = os.path.join(CBCF_DATADIR, idx_fn) - v = pysam.VariantFile(fn, index_filename=idx_fn) - - self.assertEqual(len(list(v.fetch('20'))), 3) + with pysam.VariantFile(fn, index_filename=idx_fn) as inf: + self.assertEqual(len(list(inf.fetch('20'))), 3) class TestConstructionVCFWithContigs(unittest.TestCase): @@ -435,7 +536,7 @@ class TestConstructionVCFWithContigs(unittest.TestCase): 
self.complete_check(fn_in, fn_out) -#class TestConstructionVCFWithoutContigs(TestConstructionVCFWithContigs): +# class TestConstructionVCFWithoutContigs(TestConstructionVCFWithContigs): # """construct VariantFile from scratch.""" # filename = "example_vcf40.vcf" @@ -466,7 +567,8 @@ class TestSettingRecordValues(unittest.TestCase): self.assertEqual(inf.format, 'VCF') self.assertEqual(inf.version, (4, 0)) self.assertEqual(inf.compression, 'NONE') - self.assertEqual(inf.description, 'VCF version 4.0 variant calling text') + self.assertEqual( + inf.description, 'VCF version 4.0 variant calling text') self.assertTrue(inf.is_open) self.assertEqual(inf.is_read, True) self.assertEqual(inf.is_write, False) @@ -485,14 +587,15 @@ class TestSettingRecordValues(unittest.TestCase): with pysam.VariantFile(os.path.join(CBCF_DATADIR, self.filename)) as inf: record = next(inf) sample = record.samples["NA00001"] - print (sample["GT"]) + print(sample["GT"]) self.assertEqual(sample["GT"], (0, 0)) sample["GT"] = sample["GT"] + class TestSubsetting(unittest.TestCase): - + filename = "example_vcf42.vcf.gz" - + def testSubsetting(self): with pysam.VariantFile(os.path.join(CBCF_DATADIR, self.filename)) as inf: @@ -501,8 +604,8 @@ class TestSubsetting(unittest.TestCase): if __name__ == "__main__": # build data files - print ("building data files") + print("building data files") subprocess.call("make -C %s" % CBCF_DATADIR, shell=True) - print ("starting tests") + print("starting tests") unittest.main() - print ("completed tests") + print("completed tests") diff --git a/tests/cbcf_data/gnomad.vcf b/tests/cbcf_data/gnomad.vcf new file mode 100644 index 0000000..9875c03 --- /dev/null +++ b/tests/cbcf_data/gnomad.vcf @@ -0,0 +1,200 @@ +##fileformat=VCFv4.2 +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FILTER= +##FILTER== 20, DP >= 10, AB => 0.2 for het calls))"> +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= 
+##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##reference=file:///seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta +#CHROM POS ID REF ALT QUAL FILTER INFO +22 16050036 rs374742143 A C 442156.34 RF 
AC=67;AF=4.29487e-01;AN=156;BaseQRankSum=7.36000e-01;ClippingRankSum=2.96000e-01;DB;DP=50165;FS=7.05600e+00;InbreedingCoeff=3.82000e-01;MQ=2.71500e+01;MQRankSum=-1.02600e+00;QD=2.47500e+01;ReadPosRankSum=-2.11000e-01;SOR=1.26750e+01;VQSLOD=-9.58600e+02;VQSR_culprit=MQ;GQ_HIST_ALT=16|1279|299|254|155|24|16|28|50|135|78|4|6|11|32|43|6|4|3|35;DP_HIST_ALT=1769|653|51|5|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0;AB_HIST_ALT=0|0|0|2|5|24|36|11|89|2|125|31|61|85|24|60|34|8|3|7;GQ_HIST_ALL=2359|2810|730|651|296|51|34|33|53|135|78|4|6|11|32|43|6|4|3|35;DP_HIST_ALL=5518|1756|94|6|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0;AB_HIST_ALL=0|0|0|2|5|24|36|11|89|2|125|31|61|85|24|60|34|8|3|7;AC_AFR=3;AC_AMR=7;AC_ASJ=0;AC_EAS=0;AC_FIN=50;AC_NFE=3;AC_OTH=4;AC_Male=32;AC_Female=35;AN_AFR=14;AN_AMR=10;AN_ASJ=0;AN_EAS=0;AN_FIN=84;AN_NFE=42;AN_OTH=6;AN_Male=80;AN_Female=76;AF_AFR=2.14286e-01;AF_AMR=7.00000e-01;AF_ASJ=.;AF_EAS=.;AF_FIN=5.95238e-01;AF_NFE=7.14286e-02;AF_OTH=6.66667e-01;AF_Male=4.00000e-01;AF_Female=4.60526e-01;GC_AFR=4,3,0;GC_AMR=0,3,2;GC_ASJ=0,0,0;GC_EAS=0,0,0;GC_FIN=4,26,12;GC_NFE=18,3,0;GC_OTH=1,0,2;GC_Male=16,16,8;GC_Female=11,19,8;AC_raw=4349;AN_raw=14748;AF_raw=2.94887e-01;GC_raw=4896,607,1871;GC=27,35,16;Hom_AFR=0;Hom_AMR=2;Hom_ASJ=0;Hom_EAS=0;Hom_FIN=12;Hom_NFE=0;Hom_OTH=2;Hom_Male=8;Hom_Female=8;Hom_raw=1871;Hom=16;POPMAX=AMR;AC_POPMAX=7;AN_POPMAX=10;AF_POPMAX=7.00000e-01;DP_MEDIAN=3;DREF_MEDIAN=5.60406e-10;GQ_MEDIAN=12;AB_MEDIAN=5.55556e-01;AS_RF=4.71609e-02;AS_FilterStatus=RF;CSQ=C|intergenic_variant|MODIFIER|||||||||||||||rs374742143|1||||SNV|1|||||||||||||||||||||||||||||||||||||||||||| +22 16050068 . 
A G 82.46 RF;AC0 AC=0;AF=0.00000e+00;AN=2708;DP=111486;FS=0.00000e+00;InbreedingCoeff=-3.63000e-02;MQ=3.20200e+01;QD=1.64900e+01;SOR=3.61100e+00;VQSLOD=-9.28900e+01;VQSR_culprit=MQ;GQ_HIST_ALT=0|0|0|1|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0;DP_HIST_ALT=0|1|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0;AB_HIST_ALT=0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0;GQ_HIST_ALL=1227|4189|2184|3537|2071|618|693|346|97|92|61|9|18|0|7|4|4|0|0|0;DP_HIST_ALL=7502|6289|1163|169|26|8|0|0|0|0|0|0|0|0|0|0|0|0|0|0;AB_HIST_ALL=0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0;AC_AFR=0;AC_AMR=0;AC_ASJ=0;AC_EAS=0;AC_FIN=0;AC_NFE=0;AC_OTH=0;AC_Male=0;AC_Female=0;AN_AFR=280;AN_AMR=162;AN_ASJ=34;AN_EAS=74;AN_FIN=748;AN_NFE=1300;AN_OTH=110;AN_Male=1584;AN_Female=1124;AF_AFR=0.00000e+00;AF_AMR=0.00000e+00;AF_ASJ=0.00000e+00;AF_EAS=0.00000e+00;AF_FIN=0.00000e+00;AF_NFE=0.00000e+00;AF_OTH=0.00000e+00;AF_Male=0.00000e+00;AF_Female=0.00000e+00;GC_AFR=140,0,0;GC_AMR=81,0,0;GC_ASJ=17,0,0;GC_EAS=37,0,0;GC_FIN=374,0,0;GC_NFE=650,0,0;GC_OTH=55,0,0;GC_Male=792,0,0;GC_Female=562,0,0;AC_raw=2;AN_raw=30314;AF_raw=6.59761e-05;GC_raw=15156,0,1;GC=1354,0,0;Hom_AFR=0;Hom_AMR=0;Hom_ASJ=0;Hom_EAS=0;Hom_FIN=0;Hom_NFE=0;Hom_OTH=0;Hom_Male=0;Hom_Female=0;Hom_raw=1;Hom=0;POPMAX=.;AC_POPMAX=.;AN_POPMAX=.;AF_POPMAX=.;DP_MEDIAN=5;DREF_MEDIAN=1.22034e-16;GQ_MEDIAN=15;AB_MEDIAN=5.00000e-01;AS_RF=5.87186e-02;AS_FilterStatus=RF|AC0;CSQ=G|intergenic_variant|MODIFIER||||||||||||||||1||||SNV|1|||||||||||||||||||||||||||||||||||||||||||| +22 16050069 . 
C T 43.44 RF;AC0 AC=0;AF=0.00000e+00;AN=2802;BaseQRankSum=7.20000e-01;ClippingRankSum=-1.38000e+00;DP=112972;FS=0.00000e+00;InbreedingCoeff=-3.70000e-02;MQ=3.67700e+01;MQRankSum=7.20000e-01;QD=7.24000e+00;ReadPosRankSum=1.38000e+00;SOR=1.32900e+00;VQSLOD=-3.27400e+01;VQSR_culprit=MQ;GQ_HIST_ALT=0|0|0|0|0|0|0|1|0|0|0|0|0|0|0|0|0|0|0|0;DP_HIST_ALT=0|1|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0;AB_HIST_ALT=0|0|0|0|0|0|0|0|0|0|0|0|0|1|0|0|0|0|0|0;GQ_HIST_ALL=1200|4098|2219|3540|2117|622|710|352|104|101|64|10|20|1|8|3|4|0|0|0;DP_HIST_ALL=7405|6348|1203|181|28|8|0|0|0|0|0|0|0|0|0|0|0|0|0|0;AB_HIST_ALL=0|0|0|0|0|0|0|0|0|0|0|0|0|1|0|0|0|0|0|0;AC_AFR=0;AC_AMR=0;AC_ASJ=0;AC_EAS=0;AC_FIN=0;AC_NFE=0;AC_OTH=0;AC_Male=0;AC_Female=0;AN_AFR=296;AN_AMR=164;AN_ASJ=36;AN_EAS=70;AN_FIN=766;AN_NFE=1354;AN_OTH=116;AN_Male=1638;AN_Female=1164;AF_AFR=0.00000e+00;AF_AMR=0.00000e+00;AF_ASJ=0.00000e+00;AF_EAS=0.00000e+00;AF_FIN=0.00000e+00;AF_NFE=0.00000e+00;AF_OTH=0.00000e+00;AF_Male=0.00000e+00;AF_Female=0.00000e+00;GC_AFR=148,0,0;GC_AMR=82,0,0;GC_ASJ=18,0,0;GC_EAS=35,0,0;GC_FIN=383,0,0;GC_NFE=677,0,0;GC_OTH=58,0,0;GC_Male=819,0,0;GC_Female=582,0,0;AC_raw=1;AN_raw=30346;AF_raw=3.29533e-05;GC_raw=15172,1,0;GC=1401,0,0;Hom_AFR=0;Hom_AMR=0;Hom_ASJ=0;Hom_EAS=0;Hom_FIN=0;Hom_NFE=0;Hom_OTH=0;Hom_Male=0;Hom_Female=0;Hom_raw=0;Hom=0;POPMAX=.;AC_POPMAX=.;AN_POPMAX=.;AF_POPMAX=.;DP_MEDIAN=6;DREF_MEDIAN=3.16178e-11;GQ_MEDIAN=38;AB_MEDIAN=6.66667e-01;AS_RF=9.22976e-02;AS_FilterStatus=RF|AC0;CSQ=T|intergenic_variant|MODIFIER||||||||||||||||1||||SNV|1|||||||||||||||||||||||||||||||||||||||||||| +22 16050098 . 
G A 242.82 RF AC=2;AF=2.40964e-04;AN=8300;BaseQRankSum=2.45000e+00;ClippingRankSum=1.03000e-01;DP=167171;FS=0.00000e+00;InbreedingCoeff=-1.46000e-02;MQ=3.06100e+01;MQRankSum=-9.35000e-01;QD=1.05600e+01;ReadPosRankSum=6.60000e-01;SOR=3.84000e-01;VQSLOD=-4.11500e+01;VQSR_culprit=MQ;GQ_HIST_ALT=0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|2;DP_HIST_ALT=0|0|2|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0;AB_HIST_ALT=0|0|0|0|0|0|0|0|0|1|1|0|0|0|0|0|0|0|0|0;GQ_HIST_ALL=350|1697|1483|3462|3106|1262|1746|994|348|436|237|88|141|14|39|5|22|4|8|7;DP_HIST_ALL=3368|7876|3165|776|212|33|16|3|0|0|0|0|0|0|0|0|0|0|0|0;AB_HIST_ALL=0|0|0|0|0|0|0|0|0|1|1|0|0|0|0|0|0|0|0|0;AC_AFR=0;AC_AMR=0;AC_ASJ=0;AC_EAS=0;AC_FIN=0;AC_NFE=2;AC_OTH=0;AC_Male=2;AC_Female=0;AN_AFR=1298;AN_AMR=334;AN_ASJ=84;AN_EAS=380;AN_FIN=1460;AN_NFE=4412;AN_OTH=332;AN_Male=4748;AN_Female=3552;AF_AFR=0.00000e+00;AF_AMR=0.00000e+00;AF_ASJ=0.00000e+00;AF_EAS=0.00000e+00;AF_FIN=0.00000e+00;AF_NFE=4.53309e-04;AF_OTH=0.00000e+00;AF_Male=4.21230e-04;AF_Female=0.00000e+00;GC_AFR=649,0,0;GC_AMR=167,0,0;GC_ASJ=42,0,0;GC_EAS=190,0,0;GC_FIN=730,0,0;GC_NFE=2204,2,0;GC_OTH=166,0,0;GC_Male=2372,2,0;GC_Female=1776,0,0;AC_raw=2;AN_raw=30898;AF_raw=6.47291e-05;GC_raw=15447,2,0;GC=4148,2,0;Hom_AFR=0;Hom_AMR=0;Hom_ASJ=0;Hom_EAS=0;Hom_FIN=0;Hom_NFE=0;Hom_OTH=0;Hom_Male=0;Hom_Female=0;Hom_raw=0;Hom=0;POPMAX=NFE;AC_POPMAX=2;AN_POPMAX=4412;AF_POPMAX=4.53309e-04;DP_MEDIAN=11;DREF_MEDIAN=6.29479e-16;GQ_MEDIAN=99;AB_MEDIAN=4.80769e-01;AS_RF=3.30908e-01;AS_FilterStatus=RF;CSQ=A|intergenic_variant|MODIFIER||||||||||||||||1||||SNV|1|||||||||||||||||||||||||||||||||||||||||||| +22 16050115 rs587755077 G A 10684.53 RF 
AC=31;AF=2.54057e-03;AN=12202;BaseQRankSum=1.59000e+00;ClippingRankSum=6.70000e-02;DP=196906;FS=0.00000e+00;InbreedingCoeff=1.40000e-02;MQ=3.37700e+01;MQRankSum=4.06000e-01;QD=8.23000e+00;ReadPosRankSum=3.22000e-01;SOR=4.23100e+00;VQSLOD=-1.16300e+02;VQSR_culprit=MQ;GQ_HIST_ALT=3|6|10|5|3|3|6|13|17|16|2|3|9|12|8|11|5|3|3|14;DP_HIST_ALT=22|95|32|3|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0;AB_HIST_ALT=0|0|0|4|6|16|20|8|21|1|28|5|7|7|7|7|8|0|0|0;GQ_HIST_ALL=237|959|984|2740|3098|1328|2215|1483|527|685|442|163|289|52|79|25|61|6|27|39;DP_HIST_ALL=1972|7287|4327|1314|407|79|44|9|0|0|0|0|0|0|0|0|0|0|0|0;AB_HIST_ALL=0|0|0|4|6|16|20|8|21|1|28|5|7|7|7|7|8|0|0|0;AC_AFR=30;AC_AMR=1;AC_ASJ=0;AC_EAS=0;AC_FIN=0;AC_NFE=0;AC_OTH=0;AC_Male=16;AC_Female=15;AN_AFR=2236;AN_AMR=436;AN_ASJ=144;AN_EAS=640;AN_FIN=1862;AN_NFE=6410;AN_OTH=474;AN_Male=6874;AN_Female=5328;AF_AFR=1.34168e-02;AF_AMR=2.29358e-03;AF_ASJ=0.00000e+00;AF_EAS=0.00000e+00;AF_FIN=0.00000e+00;AF_NFE=0.00000e+00;AF_OTH=0.00000e+00;AF_Male=2.32761e-03;AF_Female=2.81532e-03;GC_AFR=1088,30,0;GC_AMR=217,1,0;GC_ASJ=72,0,0;GC_EAS=320,0,0;GC_FIN=931,0,0;GC_NFE=3205,0,0;GC_OTH=237,0,0;GC_Male=3421,16,0;GC_Female=2649,15,0;AC_raw=159;AN_raw=30878;AF_raw=5.14930e-03;GC_raw=15287,145,7;GC=6070,31,0;Hom_AFR=0;Hom_AMR=0;Hom_ASJ=0;Hom_EAS=0;Hom_FIN=0;Hom_NFE=0;Hom_OTH=0;Hom_Male=0;Hom_Female=0;Hom_raw=7;Hom=0;POPMAX=AFR;AC_POPMAX=30;AN_POPMAX=2236;AF_POPMAX=1.34168e-02;DP_MEDIAN=7;DREF_MEDIAN=1.74246e-08;GQ_MEDIAN=48;AB_MEDIAN=4.44444e-01;AS_RF=5.02621e-02;AS_FilterStatus=RF;CSQ=A|intergenic_variant|MODIFIER|||||||||||||||rs587755077|1||||SNV|1||||||||||||||||A:0.0064|||||||||||||||||||||||||||| +22 16050116 . 
G C 28395.88 RF AC=246;AF=2.51431e-02;AN=9784;BaseQRankSum=2.48000e-01;ClippingRankSum=0.00000e+00;DP=201089;FS=0.00000e+00;InbreedingCoeff=-7.09000e-02;MQ=3.23400e+01;MQRankSum=-1.23100e+00;QD=3.27000e+00;ReadPosRankSum=2.48000e-01;SOR=7.16400e+00;VQSLOD=-3.11400e+02;VQSR_culprit=MQ;GQ_HIST_ALT=0|3|6|9|32|41|110|119|73|75|49|48|27|32|38|16|4|9|19|27;DP_HIST_ALT=21|356|285|60|12|3|0|0|0|0|0|0|0|0|0|0|0|0|0|0;AB_HIST_ALT=0|0|11|105|184|174|98|40|64|2|30|5|11|8|2|2|0|0|0|0;GQ_HIST_ALL=2616|992|935|2244|2407|1109|1723|1219|458|593|382|175|259|62|105|27|46|11|38|55;DP_HIST_ALL=1838|7215|4403|1404|443|92|50|11|0|0|0|0|0|0|0|0|0|0|0|0;AB_HIST_ALL=0|0|11|105|184|174|98|40|64|2|30|5|11|8|2|2|0|0|0|0;AC_AFR=18;AC_AMR=5;AC_ASJ=4;AC_EAS=3;AC_FIN=5;AC_NFE=202;AC_OTH=9;AC_Male=141;AC_Female=105;AN_AFR=2166;AN_AMR=402;AN_ASJ=88;AN_EAS=650;AN_FIN=1722;AN_NFE=4390;AN_OTH=366;AN_Male=5450;AN_Female=4334;AF_AFR=8.31025e-03;AF_AMR=1.24378e-02;AF_ASJ=4.54545e-02;AF_EAS=4.61538e-03;AF_FIN=2.90360e-03;AF_NFE=4.60137e-02;AF_OTH=2.45902e-02;AF_Male=2.58716e-02;AF_Female=2.42270e-02;GC_AFR=1065,18,0;GC_AMR=196,5,0;GC_ASJ=40,4,0;GC_EAS=322,3,0;GC_FIN=856,5,0;GC_NFE=1993,202,0;GC_OTH=174,9,0;GC_Male=2584,141,0;GC_Female=2062,105,0;AC_raw=738;AN_raw=30912;AF_raw=2.38742e-02;GC_raw=14719,736,1;GC=4646,246,0;Hom_AFR=0;Hom_AMR=0;Hom_ASJ=0;Hom_EAS=0;Hom_FIN=0;Hom_NFE=0;Hom_OTH=0;Hom_Male=0;Hom_Female=0;Hom_raw=1;Hom=0;POPMAX=NFE;AC_POPMAX=202;AN_POPMAX=4390;AF_POPMAX=4.60137e-02;DP_MEDIAN=9;DREF_MEDIAN=6.30918e-05;GQ_MEDIAN=42;AB_MEDIAN=2.50000e-01;AS_RF=8.65786e-03;AS_FilterStatus=RF;CSQ=C|intergenic_variant|MODIFIER||||||||||||||||1||||SNV|1|||||||||||||||||||||||||||||||||||||||||||| +22 16050129 . 
G GACA,C 1164.92 PASS AC=3,0;AF=1.81378e-04,0.00000e+00;AN=16540;BaseQRankSum=-2.24000e-01;ClippingRankSum=-2.63000e-01;DP=230703;FS=8.86700e+00;InbreedingCoeff=5.80000e-03;MQ=3.46800e+01;MQRankSum=1.43000e-01;QD=1.01300e+01;ReadPosRankSum=-2.13000e-01;SOR=3.30000e-02;VQSLOD=-1.37800e+00;VQSR_culprit=SOR;VQSR_NEGATIVE_TRAIN_SITE;GQ_HIST_ALT=0|0|0|0|0|0|0|0|0|0|0|0|0|0|1|0|1|0|0|4,0|0|0|0|0|0|0|0|0|0|0|0|0|0|1|0|0|0|0|0;DP_HIST_ALT=0|3|2|1|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0,0|1|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0;AB_HIST_ALT=0|0|0|0|0|0|2|0|1|1|1|0|0|0|0|1|0|0|0|0,0|0|0|0|0|0|0|0|0|0|0|0|1|0|0|0|0|0|0|0;GQ_HIST_ALL=104|464|608|1985|2741|1441|2526|1859|721|1069|690|260|538|62|159|47|92|8|46|68;DP_HIST_ALL=1018|6123|5177|2097|791|161|97|24|0|0|0|0|0|0|0|0|0|0|0|0;AB_HIST_ALL=0|0|0|0|0|0|2|0|1|1|1|0|1|0|0|1|0|0|0|0;AC_AFR=0,0;AC_AMR=0,0;AC_ASJ=0,0;AC_EAS=0,0;AC_FIN=0,0;AC_NFE=3,0;AC_OTH=0,0;AC_Male=3,0;AC_Female=0,0;AN_AFR=3614;AN_AMR=528;AN_ASJ=180;AN_EAS=892;AN_FIN=2256;AN_NFE=8466;AN_OTH=604;AN_Male=9308;AN_Female=7232;AF_AFR=0.00000e+00,0.00000e+00;AF_AMR=0.00000e+00,0.00000e+00;AF_ASJ=0.00000e+00,0.00000e+00;AF_EAS=0.00000e+00,0.00000e+00;AF_FIN=0.00000e+00,0.00000e+00;AF_NFE=3.54359e-04,0.00000e+00;AF_OTH=0.00000e+00,0.00000e+00;AF_Male=3.22303e-04,0.00000e+00;AF_Female=0.00000e+00,0.00000e+00;GC_AFR=1807,0,0,0,0,0;GC_AMR=264,0,0,0,0,0;GC_ASJ=90,0,0,0,0,0;GC_EAS=446,0,0,0,0,0;GC_FIN=1128,0,0,0,0,0;GC_NFE=4230,3,0,0,0,0;GC_OTH=302,0,0,0,0,0;GC_Male=4651,3,0,0,0,0;GC_Female=3616,0,0,0,0,0;AC_raw=6,1;AN_raw=30976;AF_raw=1.93698e-04,3.22831e-05;GC_raw=15481,6,0,1,0,0;GC=8267,3,0,0,0,0;Hom_AFR=0,0;Hom_AMR=0,0;Hom_ASJ=0,0;Hom_EAS=0,0;Hom_FIN=0,0;Hom_NFE=0,0;Hom_OTH=0,0;Hom_Male=0,0;Hom_Female=0,0;Hom_raw=0,0;Hom=0,0;POPMAX=NFE,.;AC_POPMAX=3,.;AN_POPMAX=8466,.;AF_POPMAX=3.54359e-04,.;DP_MEDIAN=10,8;DREF_MEDIAN=3.15558e-20,2.51189e-13;GQ_MEDIAN=99,72;AB_MEDIAN=4.41558e-01,6.25000e-01;AS_RF=5.16800e-01,3.29197e-01;AS_FilterStatus=PASS,RF|AC0;CSQ=C|intergenic_variant|MODIFIER|
|||||||||||||||2||||insertion|1|||||||||||||||||||||||||||||||||||||||||||| +22 16050141 . C A 108.57 PASS AC=1;AF=5.09632e-05;AN=19622;BaseQRankSum=-7.51000e-01;ClippingRankSum=-1.43000e-01;DP=255559;FS=0.00000e+00;InbreedingCoeff=-4.70000e-03;MQ=3.40100e+01;MQRankSum=-3.32000e-01;QD=9.87000e+00;ReadPosRankSum=6.60000e-02;SOR=9.90000e-02;VQSLOD=-3.45100e+01;VQSR_culprit=MQ;GQ_HIST_ALT=0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|1;DP_HIST_ALT=0|0|1|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0;AB_HIST_ALT=0|0|0|0|0|0|0|0|0|0|1|0|0|0|0|0|0|0|0|0;GQ_HIST_ALL=90|295|443|1443|2398|1347|2546|2110|845|1318|842|356|727|79|237|59|159|7|90|99;DP_HIST_ALL=626|4935|5582|2741|1103|289|169|45|0|0|0|0|0|0|0|0|0|0|0|0;AB_HIST_ALL=0|0|0|0|0|0|0|0|0|0|1|0|0|0|0|0|0|0|0|0;AC_AFR=0;AC_AMR=0;AC_ASJ=0;AC_EAS=0;AC_FIN=0;AC_NFE=1;AC_OTH=0;AC_Male=1;AC_Female=0;AN_AFR=4696;AN_AMR=588;AN_ASJ=206;AN_EAS=1092;AN_FIN=2392;AN_NFE=9986;AN_OTH=662;AN_Male=10956;AN_Female=8666;AF_AFR=0.00000e+00;AF_AMR=0.00000e+00;AF_ASJ=0.00000e+00;AF_EAS=0.00000e+00;AF_FIN=0.00000e+00;AF_NFE=1.00140e-04;AF_OTH=0.00000e+00;AF_Male=9.12742e-05;AF_Female=0.00000e+00;GC_AFR=2348,0,0;GC_AMR=294,0,0;GC_ASJ=103,0,0;GC_EAS=546,0,0;GC_FIN=1196,0,0;GC_NFE=4992,1,0;GC_OTH=331,0,0;GC_Male=5477,1,0;GC_Female=4333,0,0;AC_raw=1;AN_raw=30980;AF_raw=3.22789e-05;GC_raw=15489,1,0;GC=9810,1,0;Hom_AFR=0;Hom_AMR=0;Hom_ASJ=0;Hom_EAS=0;Hom_FIN=0;Hom_NFE=0;Hom_OTH=0;Hom_Male=0;Hom_Female=0;Hom_raw=0;Hom=0;POPMAX=NFE;AC_POPMAX=1;AN_POPMAX=9986;AF_POPMAX=1.00140e-04;DP_MEDIAN=11;DREF_MEDIAN=7.94328e-18;GQ_MEDIAN=99;AB_MEDIAN=5.45455e-01;AS_RF=4.14273e-01;AS_FilterStatus=PASS;CSQ=A|intergenic_variant|MODIFIER||||||||||||||||1||||SNV|1|||||||||||||||||||||||||||||||||||||||||||| +22 16050146 . 
A T 48.32 RF;AC0 AC=0;AF=0.00000e+00;AN=20636;BaseQRankSum=-3.54000e-01;ClippingRankSum=5.50000e-01;DP=264413;FS=2.76200e+00;InbreedingCoeff=-4.10000e-03;MQ=3.73000e+01;MQRankSum=2.00000e+00;QD=6.90000e+00;ReadPosRankSum=2.00000e+00;SOR=1.53600e+00;VQSLOD=-3.42700e+01;VQSR_culprit=MQ;GQ_HIST_ALT=0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|1|0|0|0|0;DP_HIST_ALT=0|1|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0;AB_HIST_ALT=0|0|0|0|0|0|0|0|0|0|0|1|0|0|0|0|0|0|0|0;GQ_HIST_ALL=84|258|359|1323|2262|1285|2499|2192|869|1402|937|374|819|84|270|69|176|10|92|126;DP_HIST_ALL=524|4533|5628|2926|1267|363|187|58|4|0|0|0|0|0|0|0|0|0|0|0;AB_HIST_ALL=0|0|0|0|0|0|0|0|0|0|0|1|0|0|0|0|0|0|0|0;AC_AFR=0;AC_AMR=0;AC_ASJ=0;AC_EAS=0;AC_FIN=0;AC_NFE=0;AC_OTH=0;AC_Male=0;AC_Female=0;AN_AFR=5074;AN_AMR=602;AN_ASJ=216;AN_EAS=1154;AN_FIN=2448;AN_NFE=10458;AN_OTH=684;AN_Male=11480;AN_Female=9156;AF_AFR=0.00000e+00;AF_AMR=0.00000e+00;AF_ASJ=0.00000e+00;AF_EAS=0.00000e+00;AF_FIN=0.00000e+00;AF_NFE=0.00000e+00;AF_OTH=0.00000e+00;AF_Male=0.00000e+00;AF_Female=0.00000e+00;GC_AFR=2537,0,0;GC_AMR=301,0,0;GC_ASJ=108,0,0;GC_EAS=577,0,0;GC_FIN=1224,0,0;GC_NFE=5229,0,0;GC_OTH=342,0,0;GC_Male=5740,0,0;GC_Female=4578,0,0;AC_raw=1;AN_raw=30980;AF_raw=3.22789e-05;GC_raw=15489,1,0;GC=10318,0,0;Hom_AFR=0;Hom_AMR=0;Hom_ASJ=0;Hom_EAS=0;Hom_FIN=0;Hom_NFE=0;Hom_OTH=0;Hom_Male=0;Hom_Female=0;Hom_raw=0;Hom=0;POPMAX=.;AC_POPMAX=.;AN_POPMAX=.;AF_POPMAX=.;DP_MEDIAN=7;DREF_MEDIAN=1.00000e-11;GQ_MEDIAN=78;AB_MEDIAN=5.71429e-01;AS_RF=2.11047e-01;AS_FilterStatus=RF|AC0;CSQ=T|intergenic_variant|MODIFIER||||||||||||||||1||||SNV|1|||||||||||||||||||||||||||||||||||||||||||| diff --git a/tests/cbcf_data/gnomad_fixed.vcf b/tests/cbcf_data/gnomad_fixed.vcf new file mode 100644 index 0000000..552a41a --- /dev/null +++ b/tests/cbcf_data/gnomad_fixed.vcf @@ -0,0 +1,200 @@ +##fileformat=VCFv4.2 +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FILTER= +##FILTER== 20, DP >= 10, AB => 0.2 for het calls))"> +##FILTER= +##FILTER= +##FILTER= +##FILTER= 
+##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##reference=file:///seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta +#CHROM POS ID REF ALT QUAL FILTER INFO +22 16050036 rs374742143 A C 442156.34 RF 
AC=67;AF=4.29487e-01;AN=156;BaseQRankSum=7.36000e-01;ClippingRankSum=2.96000e-01;DB;DP=50165;FS=7.05600e+00;InbreedingCoeff=3.82000e-01;MQ=2.71500e+01;MQRankSum=-1.02600e+00;QD=2.47500e+01;ReadPosRankSum=-2.11000e-01;SOR=1.26750e+01;VQSLOD=-9.58600e+02;VQSR_culprit=MQ;GQ_HIST_ALT=16|1279|299|254|155|24|16|28|50|135|78|4|6|11|32|43|6|4|3|35;DP_HIST_ALT=1769|653|51|5|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0;AB_HIST_ALT=0|0|0|2|5|24|36|11|89|2|125|31|61|85|24|60|34|8|3|7;GQ_HIST_ALL=2359|2810|730|651|296|51|34|33|53|135|78|4|6|11|32|43|6|4|3|35;DP_HIST_ALL=5518|1756|94|6|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0;AB_HIST_ALL=0|0|0|2|5|24|36|11|89|2|125|31|61|85|24|60|34|8|3|7;AC_AFR=3;AC_AMR=7;AC_ASJ=0;AC_EAS=0;AC_FIN=50;AC_NFE=3;AC_OTH=4;AC_Male=32;AC_Female=35;AN_AFR=14;AN_AMR=10;AN_ASJ=0;AN_EAS=0;AN_FIN=84;AN_NFE=42;AN_OTH=6;AN_Male=80;AN_Female=76;AF_AFR=2.14286e-01;AF_AMR=7.00000e-01;AF_ASJ=.;AF_EAS=.;AF_FIN=5.95238e-01;AF_NFE=7.14286e-02;AF_OTH=6.66667e-01;AF_Male=4.00000e-01;AF_Female=4.60526e-01;GC_AFR=4,3,0;GC_AMR=0,3,2;GC_ASJ=0,0,0;GC_EAS=0,0,0;GC_FIN=4,26,12;GC_NFE=18,3,0;GC_OTH=1,0,2;GC_Male=16,16,8;GC_Female=11,19,8;AC_raw=4349;AN_raw=14748;AF_raw=2.94887e-01;GC_raw=4896,607,1871;GC=27,35,16;Hom_AFR=0;Hom_AMR=2;Hom_ASJ=0;Hom_EAS=0;Hom_FIN=12;Hom_NFE=0;Hom_OTH=2;Hom_Male=8;Hom_Female=8;Hom_raw=1871;Hom=16;POPMAX=AMR;AC_POPMAX=7;AN_POPMAX=10;AF_POPMAX=7.00000e-01;DP_MEDIAN=3;DREF_MEDIAN=5.60406e-10;GQ_MEDIAN=12;AB_MEDIAN=5.55556e-01;AS_RF=4.71609e-02;AS_FilterStatus=RF;CSQ=C|intergenic_variant|MODIFIER|||||||||||||||rs374742143|1||||SNV|1|||||||||||||||||||||||||||||||||||||||||||| +22 16050068 . 
A G 82.46 RF;AC0 AC=0;AF=0.00000e+00;AN=2708;DP=111486;FS=0.00000e+00;InbreedingCoeff=-3.63000e-02;MQ=3.20200e+01;QD=1.64900e+01;SOR=3.61100e+00;VQSLOD=-9.28900e+01;VQSR_culprit=MQ;GQ_HIST_ALT=0|0|0|1|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0;DP_HIST_ALT=0|1|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0;AB_HIST_ALT=0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0;GQ_HIST_ALL=1227|4189|2184|3537|2071|618|693|346|97|92|61|9|18|0|7|4|4|0|0|0;DP_HIST_ALL=7502|6289|1163|169|26|8|0|0|0|0|0|0|0|0|0|0|0|0|0|0;AB_HIST_ALL=0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0;AC_AFR=0;AC_AMR=0;AC_ASJ=0;AC_EAS=0;AC_FIN=0;AC_NFE=0;AC_OTH=0;AC_Male=0;AC_Female=0;AN_AFR=280;AN_AMR=162;AN_ASJ=34;AN_EAS=74;AN_FIN=748;AN_NFE=1300;AN_OTH=110;AN_Male=1584;AN_Female=1124;AF_AFR=0.00000e+00;AF_AMR=0.00000e+00;AF_ASJ=0.00000e+00;AF_EAS=0.00000e+00;AF_FIN=0.00000e+00;AF_NFE=0.00000e+00;AF_OTH=0.00000e+00;AF_Male=0.00000e+00;AF_Female=0.00000e+00;GC_AFR=140,0,0;GC_AMR=81,0,0;GC_ASJ=17,0,0;GC_EAS=37,0,0;GC_FIN=374,0,0;GC_NFE=650,0,0;GC_OTH=55,0,0;GC_Male=792,0,0;GC_Female=562,0,0;AC_raw=2;AN_raw=30314;AF_raw=6.59761e-05;GC_raw=15156,0,1;GC=1354,0,0;Hom_AFR=0;Hom_AMR=0;Hom_ASJ=0;Hom_EAS=0;Hom_FIN=0;Hom_NFE=0;Hom_OTH=0;Hom_Male=0;Hom_Female=0;Hom_raw=1;Hom=0;POPMAX=.;AC_POPMAX=.;AN_POPMAX=.;AF_POPMAX=.;DP_MEDIAN=5;DREF_MEDIAN=1.22034e-16;GQ_MEDIAN=15;AB_MEDIAN=5.00000e-01;AS_RF=5.87186e-02;AS_FilterStatus=RF|AC0;CSQ=G|intergenic_variant|MODIFIER||||||||||||||||1||||SNV|1|||||||||||||||||||||||||||||||||||||||||||| +22 16050069 . 
C T 43.44 RF;AC0 AC=0;AF=0.00000e+00;AN=2802;BaseQRankSum=7.20000e-01;ClippingRankSum=-1.38000e+00;DP=112972;FS=0.00000e+00;InbreedingCoeff=-3.70000e-02;MQ=3.67700e+01;MQRankSum=7.20000e-01;QD=7.24000e+00;ReadPosRankSum=1.38000e+00;SOR=1.32900e+00;VQSLOD=-3.27400e+01;VQSR_culprit=MQ;GQ_HIST_ALT=0|0|0|0|0|0|0|1|0|0|0|0|0|0|0|0|0|0|0|0;DP_HIST_ALT=0|1|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0;AB_HIST_ALT=0|0|0|0|0|0|0|0|0|0|0|0|0|1|0|0|0|0|0|0;GQ_HIST_ALL=1200|4098|2219|3540|2117|622|710|352|104|101|64|10|20|1|8|3|4|0|0|0;DP_HIST_ALL=7405|6348|1203|181|28|8|0|0|0|0|0|0|0|0|0|0|0|0|0|0;AB_HIST_ALL=0|0|0|0|0|0|0|0|0|0|0|0|0|1|0|0|0|0|0|0;AC_AFR=0;AC_AMR=0;AC_ASJ=0;AC_EAS=0;AC_FIN=0;AC_NFE=0;AC_OTH=0;AC_Male=0;AC_Female=0;AN_AFR=296;AN_AMR=164;AN_ASJ=36;AN_EAS=70;AN_FIN=766;AN_NFE=1354;AN_OTH=116;AN_Male=1638;AN_Female=1164;AF_AFR=0.00000e+00;AF_AMR=0.00000e+00;AF_ASJ=0.00000e+00;AF_EAS=0.00000e+00;AF_FIN=0.00000e+00;AF_NFE=0.00000e+00;AF_OTH=0.00000e+00;AF_Male=0.00000e+00;AF_Female=0.00000e+00;GC_AFR=148,0,0;GC_AMR=82,0,0;GC_ASJ=18,0,0;GC_EAS=35,0,0;GC_FIN=383,0,0;GC_NFE=677,0,0;GC_OTH=58,0,0;GC_Male=819,0,0;GC_Female=582,0,0;AC_raw=1;AN_raw=30346;AF_raw=3.29533e-05;GC_raw=15172,1,0;GC=1401,0,0;Hom_AFR=0;Hom_AMR=0;Hom_ASJ=0;Hom_EAS=0;Hom_FIN=0;Hom_NFE=0;Hom_OTH=0;Hom_Male=0;Hom_Female=0;Hom_raw=0;Hom=0;POPMAX=.;AC_POPMAX=.;AN_POPMAX=.;AF_POPMAX=.;DP_MEDIAN=6;DREF_MEDIAN=3.16178e-11;GQ_MEDIAN=38;AB_MEDIAN=6.66667e-01;AS_RF=9.22976e-02;AS_FilterStatus=RF|AC0;CSQ=T|intergenic_variant|MODIFIER||||||||||||||||1||||SNV|1|||||||||||||||||||||||||||||||||||||||||||| +22 16050098 . 
G A 242.82 RF AC=2;AF=2.40964e-04;AN=8300;BaseQRankSum=2.45000e+00;ClippingRankSum=1.03000e-01;DP=167171;FS=0.00000e+00;InbreedingCoeff=-1.46000e-02;MQ=3.06100e+01;MQRankSum=-9.35000e-01;QD=1.05600e+01;ReadPosRankSum=6.60000e-01;SOR=3.84000e-01;VQSLOD=-4.11500e+01;VQSR_culprit=MQ;GQ_HIST_ALT=0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|2;DP_HIST_ALT=0|0|2|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0;AB_HIST_ALT=0|0|0|0|0|0|0|0|0|1|1|0|0|0|0|0|0|0|0|0;GQ_HIST_ALL=350|1697|1483|3462|3106|1262|1746|994|348|436|237|88|141|14|39|5|22|4|8|7;DP_HIST_ALL=3368|7876|3165|776|212|33|16|3|0|0|0|0|0|0|0|0|0|0|0|0;AB_HIST_ALL=0|0|0|0|0|0|0|0|0|1|1|0|0|0|0|0|0|0|0|0;AC_AFR=0;AC_AMR=0;AC_ASJ=0;AC_EAS=0;AC_FIN=0;AC_NFE=2;AC_OTH=0;AC_Male=2;AC_Female=0;AN_AFR=1298;AN_AMR=334;AN_ASJ=84;AN_EAS=380;AN_FIN=1460;AN_NFE=4412;AN_OTH=332;AN_Male=4748;AN_Female=3552;AF_AFR=0.00000e+00;AF_AMR=0.00000e+00;AF_ASJ=0.00000e+00;AF_EAS=0.00000e+00;AF_FIN=0.00000e+00;AF_NFE=4.53309e-04;AF_OTH=0.00000e+00;AF_Male=4.21230e-04;AF_Female=0.00000e+00;GC_AFR=649,0,0;GC_AMR=167,0,0;GC_ASJ=42,0,0;GC_EAS=190,0,0;GC_FIN=730,0,0;GC_NFE=2204,2,0;GC_OTH=166,0,0;GC_Male=2372,2,0;GC_Female=1776,0,0;AC_raw=2;AN_raw=30898;AF_raw=6.47291e-05;GC_raw=15447,2,0;GC=4148,2,0;Hom_AFR=0;Hom_AMR=0;Hom_ASJ=0;Hom_EAS=0;Hom_FIN=0;Hom_NFE=0;Hom_OTH=0;Hom_Male=0;Hom_Female=0;Hom_raw=0;Hom=0;POPMAX=NFE;AC_POPMAX=2;AN_POPMAX=4412;AF_POPMAX=4.53309e-04;DP_MEDIAN=11;DREF_MEDIAN=6.29479e-16;GQ_MEDIAN=99;AB_MEDIAN=4.80769e-01;AS_RF=3.30908e-01;AS_FilterStatus=RF;CSQ=A|intergenic_variant|MODIFIER||||||||||||||||1||||SNV|1|||||||||||||||||||||||||||||||||||||||||||| +22 16050115 rs587755077 G A 10684.53 RF 
AC=31;AF=2.54057e-03;AN=12202;BaseQRankSum=1.59000e+00;ClippingRankSum=6.70000e-02;DP=196906;FS=0.00000e+00;InbreedingCoeff=1.40000e-02;MQ=3.37700e+01;MQRankSum=4.06000e-01;QD=8.23000e+00;ReadPosRankSum=3.22000e-01;SOR=4.23100e+00;VQSLOD=-1.16300e+02;VQSR_culprit=MQ;GQ_HIST_ALT=3|6|10|5|3|3|6|13|17|16|2|3|9|12|8|11|5|3|3|14;DP_HIST_ALT=22|95|32|3|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0;AB_HIST_ALT=0|0|0|4|6|16|20|8|21|1|28|5|7|7|7|7|8|0|0|0;GQ_HIST_ALL=237|959|984|2740|3098|1328|2215|1483|527|685|442|163|289|52|79|25|61|6|27|39;DP_HIST_ALL=1972|7287|4327|1314|407|79|44|9|0|0|0|0|0|0|0|0|0|0|0|0;AB_HIST_ALL=0|0|0|4|6|16|20|8|21|1|28|5|7|7|7|7|8|0|0|0;AC_AFR=30;AC_AMR=1;AC_ASJ=0;AC_EAS=0;AC_FIN=0;AC_NFE=0;AC_OTH=0;AC_Male=16;AC_Female=15;AN_AFR=2236;AN_AMR=436;AN_ASJ=144;AN_EAS=640;AN_FIN=1862;AN_NFE=6410;AN_OTH=474;AN_Male=6874;AN_Female=5328;AF_AFR=1.34168e-02;AF_AMR=2.29358e-03;AF_ASJ=0.00000e+00;AF_EAS=0.00000e+00;AF_FIN=0.00000e+00;AF_NFE=0.00000e+00;AF_OTH=0.00000e+00;AF_Male=2.32761e-03;AF_Female=2.81532e-03;GC_AFR=1088,30,0;GC_AMR=217,1,0;GC_ASJ=72,0,0;GC_EAS=320,0,0;GC_FIN=931,0,0;GC_NFE=3205,0,0;GC_OTH=237,0,0;GC_Male=3421,16,0;GC_Female=2649,15,0;AC_raw=159;AN_raw=30878;AF_raw=5.14930e-03;GC_raw=15287,145,7;GC=6070,31,0;Hom_AFR=0;Hom_AMR=0;Hom_ASJ=0;Hom_EAS=0;Hom_FIN=0;Hom_NFE=0;Hom_OTH=0;Hom_Male=0;Hom_Female=0;Hom_raw=7;Hom=0;POPMAX=AFR;AC_POPMAX=30;AN_POPMAX=2236;AF_POPMAX=1.34168e-02;DP_MEDIAN=7;DREF_MEDIAN=1.74246e-08;GQ_MEDIAN=48;AB_MEDIAN=4.44444e-01;AS_RF=5.02621e-02;AS_FilterStatus=RF;CSQ=A|intergenic_variant|MODIFIER|||||||||||||||rs587755077|1||||SNV|1||||||||||||||||A:0.0064|||||||||||||||||||||||||||| +22 16050116 . 
G C 28395.88 RF AC=246;AF=2.51431e-02;AN=9784;BaseQRankSum=2.48000e-01;ClippingRankSum=0.00000e+00;DP=201089;FS=0.00000e+00;InbreedingCoeff=-7.09000e-02;MQ=3.23400e+01;MQRankSum=-1.23100e+00;QD=3.27000e+00;ReadPosRankSum=2.48000e-01;SOR=7.16400e+00;VQSLOD=-3.11400e+02;VQSR_culprit=MQ;GQ_HIST_ALT=0|3|6|9|32|41|110|119|73|75|49|48|27|32|38|16|4|9|19|27;DP_HIST_ALT=21|356|285|60|12|3|0|0|0|0|0|0|0|0|0|0|0|0|0|0;AB_HIST_ALT=0|0|11|105|184|174|98|40|64|2|30|5|11|8|2|2|0|0|0|0;GQ_HIST_ALL=2616|992|935|2244|2407|1109|1723|1219|458|593|382|175|259|62|105|27|46|11|38|55;DP_HIST_ALL=1838|7215|4403|1404|443|92|50|11|0|0|0|0|0|0|0|0|0|0|0|0;AB_HIST_ALL=0|0|11|105|184|174|98|40|64|2|30|5|11|8|2|2|0|0|0|0;AC_AFR=18;AC_AMR=5;AC_ASJ=4;AC_EAS=3;AC_FIN=5;AC_NFE=202;AC_OTH=9;AC_Male=141;AC_Female=105;AN_AFR=2166;AN_AMR=402;AN_ASJ=88;AN_EAS=650;AN_FIN=1722;AN_NFE=4390;AN_OTH=366;AN_Male=5450;AN_Female=4334;AF_AFR=8.31025e-03;AF_AMR=1.24378e-02;AF_ASJ=4.54545e-02;AF_EAS=4.61538e-03;AF_FIN=2.90360e-03;AF_NFE=4.60137e-02;AF_OTH=2.45902e-02;AF_Male=2.58716e-02;AF_Female=2.42270e-02;GC_AFR=1065,18,0;GC_AMR=196,5,0;GC_ASJ=40,4,0;GC_EAS=322,3,0;GC_FIN=856,5,0;GC_NFE=1993,202,0;GC_OTH=174,9,0;GC_Male=2584,141,0;GC_Female=2062,105,0;AC_raw=738;AN_raw=30912;AF_raw=2.38742e-02;GC_raw=14719,736,1;GC=4646,246,0;Hom_AFR=0;Hom_AMR=0;Hom_ASJ=0;Hom_EAS=0;Hom_FIN=0;Hom_NFE=0;Hom_OTH=0;Hom_Male=0;Hom_Female=0;Hom_raw=1;Hom=0;POPMAX=NFE;AC_POPMAX=202;AN_POPMAX=4390;AF_POPMAX=4.60137e-02;DP_MEDIAN=9;DREF_MEDIAN=6.30918e-05;GQ_MEDIAN=42;AB_MEDIAN=2.50000e-01;AS_RF=8.65786e-03;AS_FilterStatus=RF;CSQ=C|intergenic_variant|MODIFIER||||||||||||||||1||||SNV|1|||||||||||||||||||||||||||||||||||||||||||| +22 16050129 . 
G GACA,C 1164.92 PASS AC=3,0;AF=1.81378e-04,0.00000e+00;AN=16540;BaseQRankSum=-2.24000e-01;ClippingRankSum=-2.63000e-01;DP=230703;FS=8.86700e+00;InbreedingCoeff=5.80000e-03;MQ=3.46800e+01;MQRankSum=1.43000e-01;QD=1.01300e+01;ReadPosRankSum=-2.13000e-01;SOR=3.30000e-02;VQSLOD=-1.37800e+00;VQSR_culprit=SOR;VQSR_NEGATIVE_TRAIN_SITE;GQ_HIST_ALT=0|0|0|0|0|0|0|0|0|0|0|0|0|0|1|0|1|0|0|4,0|0|0|0|0|0|0|0|0|0|0|0|0|0|1|0|0|0|0|0;DP_HIST_ALT=0|3|2|1|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0,0|1|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0;AB_HIST_ALT=0|0|0|0|0|0|2|0|1|1|1|0|0|0|0|1|0|0|0|0,0|0|0|0|0|0|0|0|0|0|0|0|1|0|0|0|0|0|0|0;GQ_HIST_ALL=104|464|608|1985|2741|1441|2526|1859|721|1069|690|260|538|62|159|47|92|8|46|68;DP_HIST_ALL=1018|6123|5177|2097|791|161|97|24|0|0|0|0|0|0|0|0|0|0|0|0;AB_HIST_ALL=0|0|0|0|0|0|2|0|1|1|1|0|1|0|0|1|0|0|0|0;AC_AFR=0,0;AC_AMR=0,0;AC_ASJ=0,0;AC_EAS=0,0;AC_FIN=0,0;AC_NFE=3,0;AC_OTH=0,0;AC_Male=3,0;AC_Female=0,0;AN_AFR=3614;AN_AMR=528;AN_ASJ=180;AN_EAS=892;AN_FIN=2256;AN_NFE=8466;AN_OTH=604;AN_Male=9308;AN_Female=7232;AF_AFR=0.00000e+00,0.00000e+00;AF_AMR=0.00000e+00,0.00000e+00;AF_ASJ=0.00000e+00,0.00000e+00;AF_EAS=0.00000e+00,0.00000e+00;AF_FIN=0.00000e+00,0.00000e+00;AF_NFE=3.54359e-04,0.00000e+00;AF_OTH=0.00000e+00,0.00000e+00;AF_Male=3.22303e-04,0.00000e+00;AF_Female=0.00000e+00,0.00000e+00;GC_AFR=1807,0,0,0,0,0;GC_AMR=264,0,0,0,0,0;GC_ASJ=90,0,0,0,0,0;GC_EAS=446,0,0,0,0,0;GC_FIN=1128,0,0,0,0,0;GC_NFE=4230,3,0,0,0,0;GC_OTH=302,0,0,0,0,0;GC_Male=4651,3,0,0,0,0;GC_Female=3616,0,0,0,0,0;AC_raw=6,1;AN_raw=30976;AF_raw=1.93698e-04,3.22831e-05;GC_raw=15481,6,0,1,0,0;GC=8267,3,0,0,0,0;Hom_AFR=0,0;Hom_AMR=0,0;Hom_ASJ=0,0;Hom_EAS=0,0;Hom_FIN=0,0;Hom_NFE=0,0;Hom_OTH=0,0;Hom_Male=0,0;Hom_Female=0,0;Hom_raw=0,0;Hom=0,0;POPMAX=NFE,.;AC_POPMAX=3,.;AN_POPMAX=8466,.;AF_POPMAX=3.54359e-04,.;DP_MEDIAN=10,8;DREF_MEDIAN=3.15558e-20,2.51189e-13;GQ_MEDIAN=99,72;AB_MEDIAN=4.41558e-01,6.25000e-01;AS_RF=5.16800e-01,3.29197e-01;AS_FilterStatus=PASS,RF|AC0;CSQ=C|intergenic_variant|MODIFIER|
|||||||||||||||2||||insertion|1|||||||||||||||||||||||||||||||||||||||||||| +22 16050141 . C A 108.57 PASS AC=1;AF=5.09632e-05;AN=19622;BaseQRankSum=-7.51000e-01;ClippingRankSum=-1.43000e-01;DP=255559;FS=0.00000e+00;InbreedingCoeff=-4.70000e-03;MQ=3.40100e+01;MQRankSum=-3.32000e-01;QD=9.87000e+00;ReadPosRankSum=6.60000e-02;SOR=9.90000e-02;VQSLOD=-3.45100e+01;VQSR_culprit=MQ;GQ_HIST_ALT=0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|1;DP_HIST_ALT=0|0|1|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0;AB_HIST_ALT=0|0|0|0|0|0|0|0|0|0|1|0|0|0|0|0|0|0|0|0;GQ_HIST_ALL=90|295|443|1443|2398|1347|2546|2110|845|1318|842|356|727|79|237|59|159|7|90|99;DP_HIST_ALL=626|4935|5582|2741|1103|289|169|45|0|0|0|0|0|0|0|0|0|0|0|0;AB_HIST_ALL=0|0|0|0|0|0|0|0|0|0|1|0|0|0|0|0|0|0|0|0;AC_AFR=0;AC_AMR=0;AC_ASJ=0;AC_EAS=0;AC_FIN=0;AC_NFE=1;AC_OTH=0;AC_Male=1;AC_Female=0;AN_AFR=4696;AN_AMR=588;AN_ASJ=206;AN_EAS=1092;AN_FIN=2392;AN_NFE=9986;AN_OTH=662;AN_Male=10956;AN_Female=8666;AF_AFR=0.00000e+00;AF_AMR=0.00000e+00;AF_ASJ=0.00000e+00;AF_EAS=0.00000e+00;AF_FIN=0.00000e+00;AF_NFE=1.00140e-04;AF_OTH=0.00000e+00;AF_Male=9.12742e-05;AF_Female=0.00000e+00;GC_AFR=2348,0,0;GC_AMR=294,0,0;GC_ASJ=103,0,0;GC_EAS=546,0,0;GC_FIN=1196,0,0;GC_NFE=4992,1,0;GC_OTH=331,0,0;GC_Male=5477,1,0;GC_Female=4333,0,0;AC_raw=1;AN_raw=30980;AF_raw=3.22789e-05;GC_raw=15489,1,0;GC=9810,1,0;Hom_AFR=0;Hom_AMR=0;Hom_ASJ=0;Hom_EAS=0;Hom_FIN=0;Hom_NFE=0;Hom_OTH=0;Hom_Male=0;Hom_Female=0;Hom_raw=0;Hom=0;POPMAX=NFE;AC_POPMAX=1;AN_POPMAX=9986;AF_POPMAX=1.00140e-04;DP_MEDIAN=11;DREF_MEDIAN=7.94328e-18;GQ_MEDIAN=99;AB_MEDIAN=5.45455e-01;AS_RF=4.14273e-01;AS_FilterStatus=PASS;CSQ=A|intergenic_variant|MODIFIER||||||||||||||||1||||SNV|1|||||||||||||||||||||||||||||||||||||||||||| +22 16050146 . 
A T 48.32 RF;AC0 AC=0;AF=0.00000e+00;AN=20636;BaseQRankSum=-3.54000e-01;ClippingRankSum=5.50000e-01;DP=264413;FS=2.76200e+00;InbreedingCoeff=-4.10000e-03;MQ=3.73000e+01;MQRankSum=2.00000e+00;QD=6.90000e+00;ReadPosRankSum=2.00000e+00;SOR=1.53600e+00;VQSLOD=-3.42700e+01;VQSR_culprit=MQ;GQ_HIST_ALT=0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|1|0|0|0|0;DP_HIST_ALT=0|1|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0;AB_HIST_ALT=0|0|0|0|0|0|0|0|0|0|0|1|0|0|0|0|0|0|0|0;GQ_HIST_ALL=84|258|359|1323|2262|1285|2499|2192|869|1402|937|374|819|84|270|69|176|10|92|126;DP_HIST_ALL=524|4533|5628|2926|1267|363|187|58|4|0|0|0|0|0|0|0|0|0|0|0;AB_HIST_ALL=0|0|0|0|0|0|0|0|0|0|0|1|0|0|0|0|0|0|0|0;AC_AFR=0;AC_AMR=0;AC_ASJ=0;AC_EAS=0;AC_FIN=0;AC_NFE=0;AC_OTH=0;AC_Male=0;AC_Female=0;AN_AFR=5074;AN_AMR=602;AN_ASJ=216;AN_EAS=1154;AN_FIN=2448;AN_NFE=10458;AN_OTH=684;AN_Male=11480;AN_Female=9156;AF_AFR=0.00000e+00;AF_AMR=0.00000e+00;AF_ASJ=0.00000e+00;AF_EAS=0.00000e+00;AF_FIN=0.00000e+00;AF_NFE=0.00000e+00;AF_OTH=0.00000e+00;AF_Male=0.00000e+00;AF_Female=0.00000e+00;GC_AFR=2537,0,0;GC_AMR=301,0,0;GC_ASJ=108,0,0;GC_EAS=577,0,0;GC_FIN=1224,0,0;GC_NFE=5229,0,0;GC_OTH=342,0,0;GC_Male=5740,0,0;GC_Female=4578,0,0;AC_raw=1;AN_raw=30980;AF_raw=3.22789e-05;GC_raw=15489,1,0;GC=10318,0,0;Hom_AFR=0;Hom_AMR=0;Hom_ASJ=0;Hom_EAS=0;Hom_FIN=0;Hom_NFE=0;Hom_OTH=0;Hom_Male=0;Hom_Female=0;Hom_raw=0;Hom=0;POPMAX=.;AC_POPMAX=.;AN_POPMAX=.;AF_POPMAX=.;DP_MEDIAN=7;DREF_MEDIAN=1.00000e-11;GQ_MEDIAN=78;AB_MEDIAN=5.71429e-01;AS_RF=2.11047e-01;AS_FilterStatus=RF|AC0;CSQ=T|intergenic_variant|MODIFIER||||||||||||||||1||||SNV|1|||||||||||||||||||||||||||||||||||||||||||| diff --git a/tests/compile_test.py b/tests/compile_test.py index f91e180..f56adb7 100644 --- a/tests/compile_test.py +++ b/tests/compile_test.py @@ -8,21 +8,20 @@ pysam and tabix works. 
# clean up previous compilation import os +import unittest +import pysam +from TestUtils import BAM_DATADIR, TABIX_DATADIR + try: os.unlink('tests/_compile_test.c') os.unlink('tests/_compile_test.pyxbldc') except OSError: pass - import pyximport pyximport.install(build_in_temp=False) import _compile_test -import unittest -import pysam -from TestUtils import BAM_DATADIR, TABIX_DATADIR - class BAMTest(unittest.TestCase): @@ -43,7 +42,7 @@ class GTFTest(unittest.TestCase): nread = _compile_test.testCountGTF( pysam.Tabixfile(self.input_filename)) self.assertEqual(nread, 237) - + if __name__ == "__main__": unittest.main() diff --git a/tests/faidx_bench.py b/tests/faidx_bench.py new file mode 100644 index 0000000..c167336 --- /dev/null +++ b/tests/faidx_bench.py @@ -0,0 +1,71 @@ +"""Benchmarking the cfaidx module. Usage:: + +pytest benchmark/faidx_bench.py +""" +import os +import pysam + + +from TestUtils import BAM_DATADIR + + +def iterate_over_fastx(fn, persist=True): + return len(list(pysam.FastxFile(fn, persist=persist))) + + +def iterate_over_fastx_as_file(fn): + with open(fn) as inf: + return len(inf.read()) + + +def test_fasta_iteration_short_sequences(benchmark): + result = benchmark(iterate_over_fastx, os.path.join( + BAM_DATADIR, "faidx_ex1.fa")) + assert result == 3270 + + +def test_fasta_iteration_long_sequences(benchmark): + result = benchmark(iterate_over_fastx, os.path.join(BAM_DATADIR, "ex1.fa")) + assert result == 2 + + +def test_fasta_iteration_short_sequences_without_persistence(benchmark): + result = benchmark(iterate_over_fastx, os.path.join( + BAM_DATADIR, "faidx_ex1.fa"), persist=False) + assert result == 3270 + + +def test_fasta_iteration_long_sequences_without_persistence(benchmark): + result = benchmark(iterate_over_fastx, os.path.join( + BAM_DATADIR, "ex1.fa"), persist=False) + assert result == 2 + + +def test_fasta_iteration_short_sequences_as_file(benchmark): + result = benchmark(iterate_over_fastx_as_file, + os.path.join(BAM_DATADIR, 
"faidx_ex1.fa")) + assert result == 195399 + + +def test_fasta_iteration_long_sequences_as_file(benchmark): + result = benchmark(iterate_over_fastx_as_file, + os.path.join(BAM_DATADIR, "ex1.fa")) + assert result == 3225 + + +def test_fastq_iteration_short_sequences(benchmark): + result = benchmark(iterate_over_fastx, os.path.join( + BAM_DATADIR, "faidx_ex1.fq")) + assert result == 3270 + + +def test_fastq_iteration_short_sequences_without_persistence(benchmark): + result = benchmark(iterate_over_fastx, os.path.join( + BAM_DATADIR, "faidx_ex1.fq"), persist=False) + assert result == 3270 + + +def test_fastq_iteration_short_sequences_as_file(benchmark): + result = benchmark(iterate_over_fastx_as_file, + os.path.join(BAM_DATADIR, "faidx_ex1.fq")) + assert result == 320458 diff --git a/tests/faidx_test.py b/tests/faidx_test.py index 9df34b6..c618a92 100644 --- a/tests/faidx_test.py +++ b/tests/faidx_test.py @@ -5,16 +5,16 @@ import gzip import copy import shutil -from TestUtils import checkURL, BAM_DATADIR +from TestUtils import checkURL, BAM_DATADIR, get_temp_filename class TestFastaFile(unittest.TestCase): sequences = { 'chr1': - 
"CACTAGTGGCTCATTGTAAATGTGTGGTTTAACTCGTCCATGGCCCAGCATTAGGGAGCTGTGGACCCTGCAGCCTGGCTGTGGGGGCCGCAGTGGCTGAGGGGTGCAGAGCCGAGTCACGGGGTTGCCAGCACAGGGGCTTAACCTCTGGTGACTGCCAGAGCTGCTGGCAAGCTAGAGTCCCATTTGGAGCCCCTCTAAGCCGTTCTATTTGTAATGAAAACTATATTTATGCTATTCAGTTCTAAATATAGAAATTGAAACAGCTGTGTTTAGTGCCTTTGTTCAACCCCCTTGCAACAACCTTGAGAACCCCAGGGAATTTGTCAATGTCAGGGAAGGAGCATTTTGTCAGTTACCAAATGTGTTTATTACCAGAGGGATGGAGGGAAGAGGGACGCTGAAGAACTTTGATGCCCTCTTCTTCCAAAGATGAAACGCGTAACTGCGCTCTCATTCACTCCAGCTCCCTGTCACCCAATGGACCTGTGATATCTGGATTCTGGGAAATTCTTCATCCTGGACCCTGAGAGATTCTGCAGCCCAGCTCCAGATTGCTTGTGGTCTGACAGGCTGCAACTGTGAGCCATCACAATGAACAACAGGAAGAAAAGGTCTTTCAAAAGGTGATGTGTGTTCTCATCAACCTCATACACACACATGGTTTAGGGGTATAATACCTCTACATGGCTGATTATGAAAACAATGTTCCCCAGATACCATCCCTGTCTTACTTCCAGCTCCCCAGAGGGAAAGCTTTCAACGCTTCTAGCCATTTCTTTTGGCATTTGCCTTCAGACCCTACACGAATGCGTCTCTACCACAGGGGGCTGCGCGGTTTCCCATCATGAAGCACTGAACTTCCACGTCTCATCTAGGGGAACAGGGAGGTGCACTAATGCGCTCCACGCCCAAGCCCTTCTCACAGTTTCTGCCCCCAGCATGGTTGTACTGGGCAATACATGAGATTATTAGGAAATGCTTTACTGTCATAACTATGAAGAGACTATTGCCAGATGAACCACACATTAATACTATGTTTCTTATCTGCACATTACTACCCTGCAATTAATATAATTGTGTCCATGTACACACGCTGTCCTATGTACTTATCATGACTCTATCCCAAATTCCCAATTACGTCCTATCTTCTTCTTAGGGAAGAACAGCTTAGGTATCAATTTGGTGTTCTGTGTAAAGTCTCAGGGAGCCGTCCGTGTCCTCCCATCTGGCCTCGTCCACACTGGTTCTCTTGAAAGCTTGGGCTGTAATGATGCCCCTTGGCCATCACCCAGTCCCTGCCCCATCTCTTGTAATCTCTCTCCTTTTTGCTGCATCCCTGTCTTCCTCTGTCTTGATTTACTTGTTGTTGGTTTTCTGTTTCTTTGTTTGATTTGGTGGAAGACATAATCCCACGCTTCCTATGGAAAGGTTGTTGGGAGATTTTTAATGATTCCTCAATGTTAAAATGTCTATTTTTGTCTTGACACCCAACTAATATTTGTCTGAGCAAAACAGTCTAGATGAGAGAGAACTTCCCTGGAGGTCTGATGGCGTTTCTCCCTCGTCTTCTTA", + 
"CACTAGTGGCTCATTGTAAATGTGTGGTTTAACTCGTCCATGGCCCAGCATTAGGGAGCTGTGGACCCTGCAGCCTGGCTGTGGGGGCCGCAGTGGCTGAGGGGTGCAGAGCCGAGTCACGGGGTTGCCAGCACAGGGGCTTAACCTCTGGTGACTGCCAGAGCTGCTGGCAAGCTAGAGTCCCATTTGGAGCCCCTCTAAGCCGTTCTATTTGTAATGAAAACTATATTTATGCTATTCAGTTCTAAATATAGAAATTGAAACAGCTGTGTTTAGTGCCTTTGTTCAACCCCCTTGCAACAACCTTGAGAACCCCAGGGAATTTGTCAATGTCAGGGAAGGAGCATTTTGTCAGTTACCAAATGTGTTTATTACCAGAGGGATGGAGGGAAGAGGGACGCTGAAGAACTTTGATGCCCTCTTCTTCCAAAGATGAAACGCGTAACTGCGCTCTCATTCACTCCAGCTCCCTGTCACCCAATGGACCTGTGATATCTGGATTCTGGGAAATTCTTCATCCTGGACCCTGAGAGATTCTGCAGCCCAGCTCCAGATTGCTTGTGGTCTGACAGGCTGCAACTGTGAGCCATCACAATGAACAACAGGAAGAAAAGGTCTTTCAAAAGGTGATGTGTGTTCTCATCAACCTCATACACACACATGGTTTAGGGGTATAATACCTCTACATGGCTGATTATGAAAACAATGTTCCCCAGATACCATCCCTGTCTTACTTCCAGCTCCCCAGAGGGAAAGCTTTCAACGCTTCTAGCCATTTCTTTTGGCATTTGCCTTCAGACCCTACACGAATGCGTCTCTACCACAGGGGGCTGCGCGGTTTCCCATCATGAAGCACTGAACTTCCACGTCTCATCTAGGGGAACAGGGAGGTGCACTAATGCGCTCCACGCCCAAGCCCTTCTCACAGTTTCTGCCCCCAGCATGGTTGTACTGGGCAATACATGAGATTATTAGGAAATGCTTTACTGTCATAACTATGAAGAGACTATTGCCAGATGAACCACACATTAATACTATGTTTCTTATCTGCACATTACTACCCTGCAATTAATATAATTGTGTCCATGTACACACGCTGTCCTATGTACTTATCATGACTCTATCCCAAATTCCCAATTACGTCCTATCTTCTTCTTAGGGAAGAACAGCTTAGGTATCAATTTGGTGTTCTGTGTAAAGTCTCAGGGAGCCGTCCGTGTCCTCCCATCTGGCCTCGTCCACACTGGTTCTCTTGAAAGCTTGGGCTGTAATGATGCCCCTTGGCCATCACCCAGTCCCTGCCCCATCTCTTGTAATCTCTCTCCTTTTTGCTGCATCCCTGTCTTCCTCTGTCTTGATTTACTTGTTGTTGGTTTTCTGTTTCTTTGTTTGATTTGGTGGAAGACATAATCCCACGCTTCCTATGGAAAGGTTGTTGGGAGATTTTTAATGATTCCTCAATGTTAAAATGTCTATTTTTGTCTTGACACCCAACTAATATTTGTCTGAGCAAAACAGTCTAGATGAGAGAGAACTTCCCTGGAGGTCTGATGGCGTTTCTCCCTCGTCTTCTTA", # noqa 'chr2': - 
"TTCAAATGAACTTCTGTAATTGAAAAATTCATTTAAGAAATTACAAAATATAGTTGAAAGCTCTAACAATAGACTAAACCAAGCAGAAGAAAGAGGTTCAGAACTTGAAGACAAGTCTCTTATGAATTAACCCAGTCAGACAAAAATAAAGAAAAAAATTTTAAAAATGAACAGAGCTTTCAAGAAGTATGAGATTATGTAAAGTAACTGAACCTATGAGTCACAGGTATTCCTGAGGAAAAAGAAAAAGTGAGAAGTTTGGAAAAACTATTTGAGGAAGTAATTGGGGAAAACCTCTTTAGTCTTGCTAGAGATTTAGACATCTAAATGAAAGAGGCTCAAAGAATGCCAGGAAGATACATTGCAAGACAGACTTCATCAAGATATGTAGTCATCAGACTATCTAAAGTCAACATGAAGGAAAAAAATTCTAAAATCAGCAAGAGAAAAGCATACAGTCATCTATAAAGGAAATCCCATCAGAATAACAATGGGCTTCTCAGCAGAAACCTTACAAGCCAGAAGAGATTGGATCTAATTTTTGGACTTCTTAAAGAAAAAAAAACCTGTCAAACACGAATGTTATGCCCTGCTAAACTAAGCATCATAAATGAAGGGGAAATAAAGTCAAGTCTTTCCTGACAAGCAAATGCTAAGATAATTCATCATCACTAAACCAGTCCTATAAGAAATGCTCAAAAGAATTGTAAAAGTCAAAATTAAAGTTCAATACTCACCATCATAAATACACACAAAAGTACAAAACTCACAGGTTTTATAAAACAATTGAGACTACAGAGCAACTAGGTAAAAAATTAACATTACAACAGGAACAAAACCTCATATATCAATATTAACTTTGAATAAAAAGGGATTAAATTCCCCCACTTAAGAGATATAGATTGGCAGAACAGATTTAAAAACATGAACTAACTATATGCTGTTTACAAGAAACTCATTAATAAAGACATGAGTTCAGGTAAAGGGGTGGAAAAAGATGTTCTACGCAAACAGAAACCAAATGAGAGAAGGAGTAGCTATACTTATATCAGATAAAGCACACTTTAAATCAACAACAGTAAAATAAAACAAAGGAGGTCATCATACAATGATAAAAAGATCAATTCAGCAAGAAGATATAACCATCCTACTAAATACATATGCACCTAACACAAGACTACCCAGATTCATAAAACAAATACTACTAGACCTAAGAGGGATGAGAAATTACCTAATTGGTACAATGTACAATATTCTGATGATGGTTACACTAAAAGCCCATACTTTACTGCTACTCAATATATCCATGTAACAAATCTGCGCTTGTACTTCTAAATCTATAAAAAAATTAAAATTTAACAAAAGTAAATAAAACACATAGCTAAAACTAAAAAAGCAAAAACAAAAACTATGCTAAGTATTGGTAAAGATGTGGGGAAAAAAGTAAACTCTCAAATATTGCTAGTGGGAGTATAAATTGTTTTCCACTTTGGAAAACAATTTGGTAATTTCGTTTTTTTTTTTTTCTTTTCTCTTTTTTTTTTTTTTTTTTTTGCATGCCAGAAAAAAATATTTACAGTAACT", + 
"TTCAAATGAACTTCTGTAATTGAAAAATTCATTTAAGAAATTACAAAATATAGTTGAAAGCTCTAACAATAGACTAAACCAAGCAGAAGAAAGAGGTTCAGAACTTGAAGACAAGTCTCTTATGAATTAACCCAGTCAGACAAAAATAAAGAAAAAAATTTTAAAAATGAACAGAGCTTTCAAGAAGTATGAGATTATGTAAAGTAACTGAACCTATGAGTCACAGGTATTCCTGAGGAAAAAGAAAAAGTGAGAAGTTTGGAAAAACTATTTGAGGAAGTAATTGGGGAAAACCTCTTTAGTCTTGCTAGAGATTTAGACATCTAAATGAAAGAGGCTCAAAGAATGCCAGGAAGATACATTGCAAGACAGACTTCATCAAGATATGTAGTCATCAGACTATCTAAAGTCAACATGAAGGAAAAAAATTCTAAAATCAGCAAGAGAAAAGCATACAGTCATCTATAAAGGAAATCCCATCAGAATAACAATGGGCTTCTCAGCAGAAACCTTACAAGCCAGAAGAGATTGGATCTAATTTTTGGACTTCTTAAAGAAAAAAAAACCTGTCAAACACGAATGTTATGCCCTGCTAAACTAAGCATCATAAATGAAGGGGAAATAAAGTCAAGTCTTTCCTGACAAGCAAATGCTAAGATAATTCATCATCACTAAACCAGTCCTATAAGAAATGCTCAAAAGAATTGTAAAAGTCAAAATTAAAGTTCAATACTCACCATCATAAATACACACAAAAGTACAAAACTCACAGGTTTTATAAAACAATTGAGACTACAGAGCAACTAGGTAAAAAATTAACATTACAACAGGAACAAAACCTCATATATCAATATTAACTTTGAATAAAAAGGGATTAAATTCCCCCACTTAAGAGATATAGATTGGCAGAACAGATTTAAAAACATGAACTAACTATATGCTGTTTACAAGAAACTCATTAATAAAGACATGAGTTCAGGTAAAGGGGTGGAAAAAGATGTTCTACGCAAACAGAAACCAAATGAGAGAAGGAGTAGCTATACTTATATCAGATAAAGCACACTTTAAATCAACAACAGTAAAATAAAACAAAGGAGGTCATCATACAATGATAAAAAGATCAATTCAGCAAGAAGATATAACCATCCTACTAAATACATATGCACCTAACACAAGACTACCCAGATTCATAAAACAAATACTACTAGACCTAAGAGGGATGAGAAATTACCTAATTGGTACAATGTACAATATTCTGATGATGGTTACACTAAAAGCCCATACTTTACTGCTACTCAATATATCCATGTAACAAATCTGCGCTTGTACTTCTAAATCTATAAAAAAATTAAAATTTAACAAAAGTAAATAAAACACATAGCTAAAACTAAAAAAGCAAAAACAAAAACTATGCTAAGTATTGGTAAAGATGTGGGGAAAAAAGTAAACTCTCAAATATTGCTAGTGGGAGTATAAATTGTTTTCCACTTTGGAAAACAATTTGGTAATTTCGTTTTTTTTTTTTTCTTTTCTCTTTTTTTTTTTTTTTTTTTTGCATGCCAGAAAAAAATATTTACAGTAACT", # noqa } def setUp(self): @@ -59,49 +59,46 @@ class TestFastaFile(unittest.TestCase): class TestFastaFilePathIndex(unittest.TestCase): filename = os.path.join(BAM_DATADIR, "ex1.fa") - - def testGarbageIndex(self): - self.assertRaises(NotImplementedError, + data_suffix = ".fa" + + def test_raise_exception_if_index_is_missing(self): + self.assertRaises(IOError, pysam.FastaFile, self.filename, - 
filepath_index="garbage.fa.fai") - return + filepath_index="garbage" + self.data_suffix + ".fai") - self.assertRaises(ValueError, - pysam.FastaFile, - self.filename, - filepath_index="garbage.fa.fai") + def test_open_file_without_index_succeeds(self): + with pysam.FastaFile(self.filename) as inf: + self.assertEqual(len(inf), 2) - def testOpenWithoutIndex(self): - faidx = pysam.FastaFile(self.filename) - faidx.close() + def test_open_file_with_explicit_index_succeeds(self): + with pysam.FastaFile(self.filename, + filepath_index=self.filename + ".fai") as inf: + self.assertEqual(len(inf), 2) - def testOpenWithStandardIndex(self): - self.assertRaises(NotImplementedError, - pysam.FastaFile, - self.filename, - filepath_index=self.filename + ".fai") - return + def test_open_file_with_explicit_abritrarily_named_index_succeeds(self): + tmpfilename = get_temp_filename(self.data_suffix) + shutil.copyfile(self.filename, tmpfilename) - faidx = pysam.FastaFile(self.filename, - filepath_index=self.filename + ".fai") - faidx.close() + filepath_index = self.filename + ".fai" + filepath_index_compressed = self.filename + ".gzi" + if not os.path.exists(filepath_index_compressed): + filepath_index_compressed = None + with pysam.FastaFile(tmpfilename, + filepath_index=filepath_index, + filepath_index_compressed=filepath_index_compressed) as inf: + self.assertEqual(len(inf), 2) - def testOpenWithOtherIndex(self): - return - tmpfilename = "tmp_" + os.path.basename(self.filename) - shutil.copyfile(self.filename, tmpfilename) - faidx = pysam.FastaFile(tmpfilename, - filepath_index=self.filename + ".fai") - faidx.close() # index should not be auto-generated self.assertFalse(os.path.exists(tmpfilename + ".fai")) os.unlink(tmpfilename) + class TestFastaFilePathIndexCompressed(TestFastaFilePathIndex): - - filename = os.path.join(BAM_DATADIR, "ex1.fa.gz") + filename = os.path.join(BAM_DATADIR, "ex1.fa.gz") + data_suffix = ".fa.gz" + class TestFastxFileFastq(unittest.TestCase): @@ -219,7 +216,8 
@@ class TestRemoteFileFTP(unittest.TestCase): '''test remote access. ''' - url = "ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/technical/reference/GRCh38_reference_genome/GRCh38_full_analysis_set_plus_decoy_hla.fa" + url = ("ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/technical/reference/" + "GRCh38_reference_genome/GRCh38_full_analysis_set_plus_decoy_hla.fa") def testFTPView(self): if not checkURL(self.url): @@ -242,22 +240,24 @@ class TestRemoteFileFTP(unittest.TestCase): self.assertEqual(f.get_reference_length("chr1"), 248956422) + class TestFastqRecord(unittest.TestCase): filetype = pysam.FastxFile filename = "faidx_ex1.fq" - + def setUp(self): with self.filetype(os.path.join(BAM_DATADIR, self.filename), persist=True) as inf: self.record = next(inf) - + def test_fastx_record_sequence_can_be_modified(self): old_sequence = self.record.sequence new_record = copy.copy(self.record) new_sequence = "AAAC" new_record.set_sequence(new_sequence) - self.assertEqual(str(new_record), ">{}\n{}".format(self.record.name, new_sequence)) + self.assertEqual(str(new_record), ">{}\n{}".format( + self.record.name, new_sequence)) self.assertEqual(self.record.sequence, old_sequence) self.assertEqual(new_record.sequence, new_sequence) @@ -273,7 +273,7 @@ class TestFastqRecord(unittest.TestCase): self.assertRaises(ValueError, self.record.set_name, None) - + def test_fastx_record_comment_can_be_modified(self): old_comment = self.record.comment new_comment = "this is a new comment" @@ -289,7 +289,7 @@ class TestFastqRecord(unittest.TestCase): new_record.set_comment(new_comment) self.assertEqual(new_record.comment, new_comment) self.assertEqual(self.record.comment, old_comment) - + def test_fastx_record_quality_can_be_modified(self): old_quality = self.record.quality new_quality = "A" * len(old_quality) @@ -314,7 +314,7 @@ class TestFastqRecord(unittest.TestCase): fastx_record) fastx_record.set_sequence("sequence") self.assertEqual(str(fastx_record), ">name\nsequence") - - + + if 
__name__ == "__main__": unittest.main() diff --git a/tests/linking_test.py b/tests/linking_test.py index 25b9b04..15fd91a 100644 --- a/tests/linking_test.py +++ b/tests/linking_test.py @@ -15,7 +15,8 @@ def check_import(statement): statement, stderr=subprocess.STDOUT, shell=True) except subprocess.CalledProcessError as exc: if b"ImportError" in exc.output: - raise ImportError("module could not be imported: {}".format(str(exc.output))) + raise ImportError( + "module could not be imported: {}".format(str(exc.output))) else: raise @@ -40,11 +41,12 @@ class TestLinking(unittest.TestCase): def setUp(self): self.workdir = os.path.join(LINKDIR, self.package_name) - + def test_package_can_be_installed(self): subprocess.check_output( - "cd {} && rm -rf build && python setup.py install".format(self.workdir), - shell=True) + "cd {} && rm -rf build && python setup.py install".format( + self.workdir), + shell=True) @unittest.skipUnless( @@ -53,7 +55,7 @@ class TestLinking(unittest.TestCase): class TestLinkWithRpath(TestLinking): package_name = "link_with_rpath" - + def test_package_tests_pass(self): self.assertTrue(check_pass( "cd {} && python test_module.py".format(os.path.join(self.workdir, "tests")))) @@ -76,14 +78,15 @@ class TestLinkWithoutRpath(TestLinking): def test_package_tests_pass_if_ld_library_path_set(self): pysam_libraries = pysam.get_libraries() - pysam_libdirs, pysam_libs = zip(*[os.path.split(x) for x in pysam_libraries]) + pysam_libdirs, pysam_libs = zip( + *[os.path.split(x) for x in pysam_libraries]) pysam_libdir = pysam_libdirs[0] self.assertTrue(check_pass( "export LD_LIBRARY_PATH={}:$PATH && cd {} && python test_module.py".format( pysam_libdir, os.path.join(self.workdir, "tests")))) - + if __name__ == "__main__": unittest.main() diff --git a/tests/pysam_data/Makefile b/tests/pysam_data/Makefile index 2ccedd2..b48a27c 100644 --- a/tests/pysam_data/Makefile +++ b/tests/pysam_data/Makefile @@ -9,6 +9,7 @@ CRAI=$(CRAM:%.cram=%.cram.crai) all: ex1.pileup.gz \ 
ex1.sam ex1.bam \ ex2.sam.gz ex2.sam ex2.bam ex2.bam.bai \ + with_md.sam.gz with_md.bam with_md.bam.bai \ uncompressed.bam \ $(BAM) $(BAI) \ $(CRAM) $(CRAI) \ @@ -18,13 +19,17 @@ all: ex1.pileup.gz \ empty.bam empty.bam.bai \ explicit_index.bam explicit_index.cram \ faidx_empty_seq.fq.gz \ - ex1.fa.gz ex1.fa.gz.fai \ - ex1_csi.bam + ex1.fa.gz ex1.fa.gz.csi \ + ex1_csi.bam \ + example_reverse_complement.bam # ex2.sam - as ex1.sam, but with header ex2.sam.gz: ex1.bam ex1.bam.bai samtools view -h ex1.bam | gzip > ex2.sam.gz +with_md.sam.gz: ex2.bam ex1.fa + samtools calmd --output-fmt BAM $^ > $@ + #%.bam: %.sam ex1.fa.fai # samtools import ex1.fa.fai $< $@ @@ -93,5 +98,5 @@ clean: %.fa.gz: %.fa bgzip < $< > $@ -%.fa.gz.fai: %.fa.gz +%.fa.gz.csi: %.fa.gz samtools faidx $< diff --git a/tests/pysam_data/example_no_seq_in_header.bam b/tests/pysam_data/example_no_seq_in_header.bam new file mode 100644 index 0000000000000000000000000000000000000000..72de636194cf09dae6b47ee000ff954c2751b9a2 GIT binary patch literal 953 zcmV;q14jHGiwFb&00000{{{d;LjnN60cFrTNW)MRfZ?O1t*MIO;?O1FprA<_oAeQ- zRxLijXMHOo(MDQmlQv1U)xp6*5eG+cbxK-mcZM0;b*#GoSbK4x@Cu?*AJ5}xRA57QS@#*A#QYLm~rC^23AijXuiny^&N zYl3ZLld3H&q;&Gia-3zPwSp@0elC|+(-zAa?1HJLbWRf`UglPdlFTE{vLW{WaVb5a z6@*MC{C{vswd@?fn72Lkzni;$pT(pp`k-O^%8ez$uCI$$3n&qVmLK*KP3~%}c~jLB zMg~s3E_#>*+4wRYkRc*Bxv&Djw>~&=Jb>f-HGjJV;pXy~k{}Yt9z#h8+G|rztS~&5 zA32<);OO(ZvWf%_kKEJ(0WGl7aqA)Bhr|92hG=@3(gC2Bpz)#`frKD#l_QX$v{i>d zM(}Jo5d`4Xe5oK21ZI+IdbRIOszdPAeCH-0gy8(M3PBilq8frQg0*`Wf&nPC1CZZ0 zuA}I#m_acGXHRdq#1w*$HZOu{m_2+40;UnnrPm!lGN?9x@&W(=ABzYC000000RIL6 zLPG)o+X1cAPixdb7zXf3^kB=@vRf6z-Rw^OOc$h(cV;rl40th@?V*dHAPBYxL3%8H zg1WSI>D{9r!INJ@K@=1`w%?=r&TazfCWk0KLx#BoemwIgCxnm=jk&J>L3OVCcy=0a(m zC;1@HUpylDlikj*4>!*W#}9P)Z5>>gL@90nAWhR%$=hDfpC7ZOB4m+WVH8DmNxzg# z_b=Rc6-m%mBC!&wgfgHqCE{kvHip?P++Kt=l=2@anE}qZG?2zxW_8N!@%;GqecB&* z-h)1M>nSK7G_7v^B+OY{NhTLME$_S#O4&(BY6SGoaLKh73!{`uk5#*~!np z>4ak^iI%Yet%ZhFN^k6s@p@QsE%ttJYQhG}W=VN?>D@Z3P+HHd1iT)|QX{0$wNkp& 
zN?0`!V>G-M+!+O4RnjRXdpIcPRmqL^Gb@F_X0k-<+MINkofL&I+!h-DdM6G_T$H>z zd;{8QUVlDv&i|zsg#(*GW # 0.1.12 @@ -148,15 +149,17 @@ class SamtoolsTest(unittest.TestCase): ''' self.check_version() - if not os.path.exists(WORKDIR): - os.makedirs(WORKDIR) + self.workdir = os.path.join(WORKDIR, "samtools_test") + + if not os.path.exists(self.workdir): + os.makedirs(self.workdir) for f in self.requisites: shutil.copy(os.path.join(BAM_DATADIR, f), - os.path.join(WORKDIR, f)) + os.path.join(self.workdir, f)) self.savedir = os.getcwd() - os.chdir(WORKDIR) + os.chdir(self.workdir) return @@ -184,7 +187,7 @@ class SamtoolsTest(unittest.TestCase): pysam_targets = [x % r_pysam for x in targets] pysam_method = getattr(self.module, command) - + # run samtools full_statement = re.sub("%\(out\)s", self.executable, statement) run_command(" ".join((self.executable, full_statement))) @@ -220,7 +223,8 @@ class SamtoolsTest(unittest.TestCase): for s, p in zip(samtools_files, pysam_files): binary_equal = checkBinaryEqual(s, p) - error_msg = "%s failed: files %s and %s are not the same" % (command, s, p) + error_msg = "%s failed: files %s and %s are not the same" % ( + command, s, p) if binary_equal: continue elif s.endswith(".bam"): @@ -232,7 +236,7 @@ class SamtoolsTest(unittest.TestCase): check_lines_equal( self, s, p, filter_f=lambda x: x.startswith("#"), - msg=error_msg) + msg=error_msg) def testStatements(self): for statement in self.statements: @@ -241,9 +245,9 @@ class SamtoolsTest(unittest.TestCase): # bioconda samtools will be available. if command in ("bedcov", "stats", "dict", "bam2fq"): continue - - if (command == "calmd" and - list(sys.version_info[:2]) == [3, 3]): + + if (command == "calmd" and + list(sys.version_info[:2]) == [3, 3]): # skip calmd test, fails only on python 3.3.5 # in linux (empty output). 
Works in OsX and passes # for 3.4 and 3.5, see issue #293 @@ -256,7 +260,7 @@ class SamtoolsTest(unittest.TestCase): if self.executable == "bcftools": # bcftools usage messages end with exit(1) return - + for statement in self.statements: command = self.get_command(statement, map_to_internal=False) # ignore commands that exit or cause other failures @@ -271,9 +275,8 @@ class SamtoolsTest(unittest.TestCase): self.assertTrue(re.search(expected, usage_msg) is not None) def tearDown(self): - return - if os.path.exists(WORKDIR): - shutil.rmtree(WORKDIR) + if os.path.exists(self.workdir): + shutil.rmtree(self.workdir) os.chdir(self.savedir) @@ -300,7 +303,8 @@ if sys.platform != "darwin": self.assertTrue(isinstance(retval, basestring)) def testReturnValueData(self): - args = "-O BAM {}".format(os.path.join(BAM_DATADIR, "ex1.bam")).split(" ") + args = "-O BAM {}".format(os.path.join(BAM_DATADIR, + "ex1.bam")).split(" ") retval = pysam.view(*args) if IS_PYTHON3: @@ -310,7 +314,6 @@ if sys.platform != "darwin": self.assertTrue(isinstance(retval, bytes)) self.assertTrue(isinstance(retval, basestring)) - class StdoutTest(unittest.TestCase): '''test if stdout can be redirected.''' @@ -344,9 +347,9 @@ if sys.platform != "darwin": self.assertTrue(len(r) > 0) class PysamTest(SamtoolsTest): - """check access to samtools command in the pysam + """check access to samtools command in the pysam main package. - + This is for backwards capability. 
""" diff --git a/tests/tabix_bench.py b/tests/tabix_bench.py new file mode 100644 index 0000000..ce7077d --- /dev/null +++ b/tests/tabix_bench.py @@ -0,0 +1,180 @@ +import gzip +import os +import pysam + +from TestUtils import TABIX_DATADIR + +FN_COMPRESSED = "example.bed.gz" +FN_UNCOMPRESSED = "example.bed" +FN_LARGE_COMPRESSED = "example_large.bed.gz" +FN_LARGE_UNCOMPRESSED = "example_large.bed" + + +def read_python_compressed(fn): + '''iterate through with python.''' + with gzip.open(fn, mode="r") as f: + return len([x.split(b"\t") for x in f]) + + +def read_python_uncompressed(fn): + with open(fn) as f: + return len([x.split("\t") for x in f]) + + +def fetch_plain(fn): + with pysam.Tabixfile(fn) as f: + return len(list(f.fetch())) + + +def fetch_parsed(fn): + with pysam.Tabixfile(fn) as f: + return len(list(f.fetch(parser=pysam.asBed()))) + + +def iterate_generic_compressed(fn): + with gzip.open(fn) as f: + return len(list(pysam.tabix_generic_iterator(f, parser=pysam.asBed()))) + + +def iterate_generic_uncompressed(fn): + with open(fn) as f: + return len(list(pysam.tabix_generic_iterator(f, parser=pysam.asBed()))) + + +def iterate_parsed_compressed(fn): + with gzip.open(fn) as f: + return len(list(pysam.tabix_iterator(f, parser=pysam.asBed()))) + + +def iterate_parsed_uncompressed(fn): + with open(fn) as f: + return len(list(pysam.tabix_iterator(f, parser=pysam.asBed()))) + + +def iterate_file_compressed(fn): + with gzip.open(fn) as f: + return len(list(pysam.tabix_file_iterator(f, parser=pysam.asBed()))) + + +def iterate_file_uncompressed(fn): + with open(fn) as f: + return len(list(pysam.tabix_file_iterator(f, parser=pysam.asBed()))) + + +def test_read_python_compressed(benchmark): + result = benchmark(read_python_compressed, + os.path.join(TABIX_DATADIR, FN_COMPRESSED)) + assert result == 164 + + +def test_read_python_uncompressed(benchmark): + result = benchmark(read_python_uncompressed, + os.path.join(TABIX_DATADIR, FN_UNCOMPRESSED)) + assert result == 164 
+ + +def test_fetch_plain(benchmark): + result = benchmark(fetch_plain, os.path.join(TABIX_DATADIR, FN_COMPRESSED)) + assert result == 164 + + +def test_fetch_parsed(benchmark): + result = benchmark(fetch_parsed, os.path.join( + TABIX_DATADIR, FN_COMPRESSED)) + assert result == 164 + + +def test_iterate_generic_compressed(benchmark): + result = benchmark(iterate_generic_compressed, + os.path.join(TABIX_DATADIR, FN_COMPRESSED)) + assert result == 164 + + +def test_iterate_generic_uncompressed(benchmark): + result = benchmark(iterate_generic_uncompressed, + os.path.join(TABIX_DATADIR, FN_UNCOMPRESSED)) + assert result == 164 + + +def test_iterate_parsed_compressed(benchmark): + result = benchmark(iterate_parsed_compressed, + os.path.join(TABIX_DATADIR, FN_COMPRESSED)) + assert result == 164 + + +def test_iterate_parsed_uncompressed(benchmark): + result = benchmark(iterate_parsed_uncompressed, + os.path.join(TABIX_DATADIR, FN_UNCOMPRESSED)) + assert result == 164 + + +def test_iterate_file_compressed(benchmark): + result = benchmark(iterate_file_compressed, + os.path.join(TABIX_DATADIR, FN_COMPRESSED)) + assert result == 164 + + +def test_iterate_file_uncompressed(benchmark): + result = benchmark(iterate_file_uncompressed, + os.path.join(TABIX_DATADIR, FN_UNCOMPRESSED)) + assert result == 164 + + +def test_read_python_large_compressed(benchmark): + result = benchmark(read_python_compressed, os.path.join( + TABIX_DATADIR, FN_LARGE_COMPRESSED)) + assert result == 100000 + + +def test_read_python_large_uncompressed(benchmark): + result = benchmark(read_python_uncompressed, os.path.join( + TABIX_DATADIR, FN_LARGE_UNCOMPRESSED)) + assert result == 100000 + + +def test_fetch_plain(benchmark): + result = benchmark(fetch_plain, os.path.join( + TABIX_DATADIR, FN_LARGE_COMPRESSED)) + assert result == 100000 + + +def test_fetch_parsed(benchmark): + result = benchmark(fetch_parsed, os.path.join( + TABIX_DATADIR, FN_LARGE_COMPRESSED)) + assert result == 100000 + + +def 
test_iterate_generic_large_compressed(benchmark): + result = benchmark(iterate_generic_compressed, os.path.join( + TABIX_DATADIR, FN_LARGE_COMPRESSED)) + assert result == 100000 + + +def test_iterate_generic_large_uncompressed(benchmark): + result = benchmark(iterate_generic_uncompressed, os.path.join( + TABIX_DATADIR, FN_LARGE_UNCOMPRESSED)) + assert result == 100000 + + +def test_iterate_parsed_large_compressed(benchmark): + result = benchmark(iterate_parsed_compressed, os.path.join( + TABIX_DATADIR, FN_LARGE_COMPRESSED)) + assert result == 100000 + + +def test_iterate_parsed_large_uncompressed(benchmark): + result = benchmark(iterate_parsed_uncompressed, os.path.join( + TABIX_DATADIR, FN_LARGE_UNCOMPRESSED)) + assert result == 100000 + + +def test_iterate_file_large_compressed(benchmark): + result = benchmark(iterate_file_compressed, os.path.join( + TABIX_DATADIR, FN_LARGE_COMPRESSED)) + assert result == 100000 + + +def test_iterate_file_large_uncompressed(benchmark): + result = benchmark(iterate_file_uncompressed, os.path.join( + TABIX_DATADIR, FN_LARGE_UNCOMPRESSED)) + assert result == 100000 diff --git a/tests/tabix_data/example.bed b/tests/tabix_data/example.bed new file mode 100644 index 0000000..544e42d --- /dev/null +++ b/tests/tabix_data/example.bed @@ -0,0 +1,164 @@ +chr1 1737 2090 +chr1 1737 4275 +chr1 1873 1920 +chr1 1873 3533 +chr1 2042 2090 +chr1 2476 2560 +chr1 2476 2584 +chr1 2838 2915 +chr1 3084 3237 +chr1 3084 4021 +chr1 3084 4275 +chr1 3316 3533 +chr1 4022 4024 +chr1 4022 4249 +chr1 4226 4561 +chr1 4226 4692 +chr1 4226 19233 +chr1 4226 19433 +chr1 4250 4252 +chr1 4250 4275 +chr1 4267 4364 +chr1 4267 19433 +chr1 4559 4561 +chr1 4562 4692 +chr1 4833 4901 +chr1 4868 4901 +chr1 5659 5764 +chr1 5659 5810 +chr1 5767 5810 +chr1 6470 6628 +chr1 6717 6918 +chr1 6721 6918 +chr1 7096 7141 +chr1 7096 7227 +chr1 7096 7231 +chr1 7139 7141 +chr1 7142 7231 +chr1 7414 7416 +chr1 7414 7605 +chr1 7414 19206 +chr1 7417 7605 +chr1 7465 7605 +chr1 7469 7605 +chr1 
7778 7924 +chr1 8131 8226 +chr1 8131 8229 +chr1 8227 8229 +chr1 8776 8868 +chr1 8776 8938 +chr1 8866 8868 +chr1 8869 8938 +chr1 14601 14706 +chr1 14601 14754 +chr1 14704 14706 +chr1 14707 14754 +chr1 19184 19206 +chr1 19184 19233 +chr1 19397 19433 +chr1 19417 19902 +chr1 19417 20960 +chr1 19417 20972 +chr1 20130 20530 +chr1 20130 20972 +chr1 20229 20366 +chr1 20427 20530 +chr1 20839 20960 +chr1 20839 20972 +chr1 24417 25003 +chr1 24417 25037 +chr1 24417 25944 +chr1 25001 25003 +chr1 25004 25037 +chr1 25108 25344 +chr1 25108 25936 +chr1 25140 25344 +chr1 25584 25599 +chr1 25584 25936 +chr1 25584 25944 +chr1 25597 25599 +chr1 25600 25944 +chr1 42912 42930 +chr1 42912 44799 +chr1 44693 44796 +chr1 44693 44799 +chr1 44797 44799 +chr1 52811 53750 +chr1 58918 58953 +chr1 58918 59971 +chr1 58954 58956 +chr1 58954 59868 +chr1 59869 59871 +chr1 59869 59971 +chr1 79158 81492 +chr1 79158 110795 +chr1 79158 123429 +chr1 79414 79913 +chr1 79414 80968 +chr1 80150 80968 +chr1 81954 82103 +chr1 82093 82103 +chr1 82093 119080 +chr1 100816 101220 +chr1 100816 119036 +chr1 102563 102667 +chr1 110584 110795 +chr1 110638 110795 +chr1 118918 119036 +chr1 118918 119080 +chr1 118944 119086 +chr1 118944 123429 +chr1 120967 123786 +chr1 123237 123429 +chr1 125110 125869 +chr1 125110 127902 +chr1 125110 129483 +chr1 127146 127148 +chr1 127146 127938 +chr1 127146 129483 +chr1 127149 127938 +chr1 127432 127902 +chr1 129119 129483 +chr1 129481 129483 +chr1 129653 129710 +chr1 129653 130202 +chr1 129938 130202 +chr1 131337 132874 +chr1 131337 139570 +chr1 132671 132874 +chr1 132671 136694 +chr1 136249 136372 +chr1 136249 139570 +chr1 136505 136694 +chr1 147647 147749 +chr1 147647 147750 +chr1 150309 150553 +chr1 150309 151388 +chr1 151177 151388 +chr1 154268 154654 +chr1 154268 163727 +chr1 155747 155805 +chr1 155752 155805 +chr1 155752 158630 +chr1 157963 158028 +chr1 158473 158630 +chr1 158912 159127 +chr1 162420 162551 +chr1 163616 163727 +chr2 28814 31610 +chr2 28814 31627 +chr2 28814 36385 
+chr2 28814 36870 +chr2 31220 31627 +chr2 31220 32952 +chr2 31221 31627 +chr2 31221 36870 +chr2 31608 31610 +chr2 31611 31627 +chr2 32809 32952 +chr2 35440 36385 +chr2 36383 36385 +chr2 36807 36870 +chr2 187569 189901 +chr2 187569 192605 +chr2 190164 192605 diff --git a/benchmark/windows_small.bed b/tests/tabix_data/example_large.bed similarity index 100% rename from benchmark/windows_small.bed rename to tests/tabix_data/example_large.bed diff --git a/benchmark/windows_small.bed.gz b/tests/tabix_data/example_large.bed.gz similarity index 100% rename from benchmark/windows_small.bed.gz rename to tests/tabix_data/example_large.bed.gz diff --git a/tests/tabix_data/example_large.bed.gz.tbi b/tests/tabix_data/example_large.bed.gz.tbi new file mode 100644 index 0000000000000000000000000000000000000000..35c9cd2b01f5ce162a056ce3032f93512e2df718 GIT binary patch literal 7877 zcmZ{pd0bKn-^H70(zmjl#>!C9%smxHG))w(+y%9=Tv02bBDWOW$aJd7$`nNu!xS?> zrgF(8_hrfo6%)BOn&ncV!W}CmF?W4$&*%B$o#*pD{J|gm$vx+L&iP%f+yC>^hVP$s zg$+j>f86kgX7I3LtzcffE;Dwy1z$d{bHL}(AoUpHWYjLt^3wE|O_|4zCPjttgF64u5FRkEg57sqFNC5vq?Q7$*04+K;{ zZ*9Tfy7b_~SolC{W#tPnM?4lT#QPn2)lkS>e*S{6J|CDo$!#%fkGgW@dQ4{MC^Eec z{Yb+sD<^#*yjlO;vY$ce1nwd@5p(s5I4pl#+DdY9Eg7IUw)|<~KgU?OY$D|{}>+^9|QGG^taYc?VV?B(Qy(_j@UJ~fVkRW3tua^nRkj2-9W&LvLO|i{OVIN)44<%rzKT;7MQg9 zkFL8Yv;wWizxkU^RpccNkP0(_a#$M(*@ zw0oy>BVv7b+nedMccJqq-aM-Aj|-S%SPnC;*v77f#=gHi^pV!HoBn|pE8c1&$_|<4 z2Gq5=8M&t~(=iAn~3FC?Hw2h}3!qgmtdH1oypi+0KD?udw8t+zN ztLG!^Mo0*Ax4-rh8mAv(n-Pj5Ii}G9jf9Eo?f9k3HjA`~U36}7*RmPDZ^mvmS~nqn zNQYN;H)yiUAZus#=#kjOV?%Ynn}m9V7@Mw6jQ4!4_SOp!?JM@pD^ZJ$yNa#Qx%)c* zx_Y-;SR?O&#yed61*1oGj5I4`pKouf_x4YFgUF1bPsrhuQJ)Y7DE#1P^@-{X)I`Y- zv0ZzIB5#(Vv1+cJ0-Xs!6#M4j(Ale&-Z@#k&PbmLIjlP0HBi#|#xBXuvm(&M)if`Y zf$86$M0zs(#J4xZ%r+v2@d_FB$gVZGcSL)6nsMugfaG?&OGhh|yEnUXR~s@Mw5N-5 z6nnd!CETg|*oJ+i5V~rm;8p#2-#a8&VWOU=JS{NI8h?FqchZ+yU#PEvcb5{lDY|A}TY^^KmS(5-Wg zlWMih6hC5yf#-aP&#_uQJod&vk_1OfAAD)XnfxF(bzG&IK-_vINRbi}O zl#Olir=v#$w+I@}d?jzTsn7is>Y`sjkZt)Iq1QON&)@|8TLH3-(c3 
zX{Li+8tUWMJ&~7d3R_OreH8$<6RRW0J=tGYtNGdi^N)K-SWO#5tHJn8fAwdZfVHbF z@yC`bJDG#{V)g1XPWG*)jtLWg|A9@}d)JO}y|}Z}qS3%B>k9vD-gQ!^n^&WO^}E?F zuT6x%o&iTQ_sb%!AlrXxN+0d9`{qB}HZs>179Pw|Zz&>0lcR3K72_51VHFH2!)Oh;bWmFqwtZ`BcF!1AI4eXsyuX;^B~X z!vq|S6^>j?E7PC}qXqN?uG#(~Wqd!sTB?$?%*3f$S0qgeBdWU*HuyOlGDx%0tQc!- zsJM$^db>+Q_VtJnGsMNqO5nQ-4|acTb5?ldp5QseW|Rf_O%nH7sCh)I*t9Ss-W_42 z28D#1u11dc#8+o|`hGGjj%R2$<5p=JBaT*}(bjny4d2Ks*04PquV(USdtRE5aO(F$ z2aoLxb-JCmU62|8k{m$b|lws}6KC8;o$-!YG(T^*3tkCD0`AaLF z>o*pZi;6|&U-EV^Q=@9SIU08`sIQ&5D?r$@eSjZM!K3>k(Zql~!L6#cY)6#;CEG@P zWLh+bf4ECsP+N*@ZB(jzK7eFrW;~&n&U3jcDSK6#JuW4Hv8SRzFUDvJ;PA7epzkPb z5vS(Ty&yVBWvNIH-l9eox(J!%LT7$s!*8tH(uHmC)`lEbxU`W2?ot!LJuejH$ygzeoP5~z+E6|* zo|)@|iWTH0`yZuj!GxN_Zu0#I4P_Z^`k8s0p{sv!o|M-9mDAhw=1{3s4l%l0?v#vC z2=w3*Sh2tS0LS!)DYtW%nQxDl9(Z~d*?L{6&D0`iCKwiO<_YI; zYp}IqS76k!iQr-!#pIO&RtS1rOoU_3jvuO-0b5nN(fuL z4dpW9O&Mj2$H>Vy*ZXE+XmRg&%HoNTBfiYsjU;zhO=eCg1?%s$lrA{r4nt&4;A>kf zeHoL3+$X3u?Z_wG8W#4+nK0(SiBjwQr;Im&XKQIy@qoOomi$Z^f5wkEjR@ZU$#kR( zc;~&O`0*4>ZNE-2ro#>9pvD)GeuNu{M4A0k8#=W{dQerhkF*2cUcy%O14!G@gQA#? 
z@P3&hC5kQ1A`f485RtUt%7FPzS1_rNWdJ6oQBSXAvhu;iJnEoqH#)jshJdpdeq&{U zi@8)Y>G?j==;k59aWAmb)aUEz!=RDLEN8vz4~dXr~Vz zQESVVg@FfIRF#-5XdV!w07oCZ!I}j-!+(B;xd=o1^^p$12Z0!L)P2cWR1}x3l>m^G z;q!oO!x~noBVV~rHIOE%2}QF%!p%hH8&i1_Pt@9BzU_i4db(e>176%GvpL)N>)x7k zR!n(1d5?J|)@U#KI;YZ*k$vSj66fOb%Kxz1BRA4XV!3wYW`9@Il;Cw5vV5}@9W=$y zL^5f=Pj~xYBL2PCw1{vY?vV3M6Zp%gY5BbY@Gf-8Ax~R1MPH|bdpb8m%aMO#C5J@^ z7k0sU;z4<#CqIdd?RX)_dGII5D-5dr3YDb}E}-G{@8c<7?unzm0}HJzlKl$L5}E(b zlO{2!5-@Q`W#6nCJi5>PYAP_R312T2JFeVd38k?4Q|@Q^Wk@5%$sDR3QmwFTi%;Z%iCOj_L@$Ci7IW+Uai@`Q_hei{v#Vo7C7enNTOXRGr>Bz^cfQ49icPqGh{ z;^=Bt!I7v@D%fHLRH;~5KJ3ER0voob-j~>;G&x~EX92SZ(cXYe3Er`wjyCI;?T1&2 zO(~qvoug7-Y*hDyf8Z^}R`6xRH8W~@%>^kHO ztGIfSP@85skTVIS@42_+^# zv{KU@apWXGv|2lcv`nKyn7ZjBwaKGeNTpA0tx(jfwb`|3$nQ@fBK>h+PXi+ z7{1y-XI+vu_eq=e9&0ry`lvDP7Ugb{tfbk#B0k*g$*Y3Xa zC~C)Mc_EfxDLn{pFO@`~df8!Wh*HUM6oMz-kE(bYCm%ko$k#0+ZtM|TRVBN?LKC^O1@!fj;L23A$o9?0vRL_X20V-O?p6kAwr zR|fTs^Go~O(l5ZrKSz<~0dJSx2M=kOj5u=VNuW2a#Oz{OMg~u;i)xBdL!Sd;cEcqf zf21Jpi>=A>j~Ua^)?3v zBLrDhS!)~f8+lvK7j3{+$<=M$5T*v19Od=gNM7i{hmqy)H7LD2@evf_DM5}qEQ|)F zY;i&Zjdd4n$fKqVyL*p@zaPf%H!fJ9v-)OLQEAe|Eka-xijV(~Rbk}hv>5(b!ndCN z+bTN~p%C1>34dH0G)&jSdl-R6I+xFu?cJD1Scrp_BQhaY6-;5m9f{#rUvX-#V=o7l z>V0TNwzev@nOUH^-YC8~SwL~acr)K9I?%qY0do(?%`+At9I>iHIByB(fB1>wRVwx) z7g9s}Nmg))$b6e>KgkH5-ygFJKKLF*iMl7ACl6nJ2R2@#HiLWqT$S57@jJ+NF8olL zE?U@Uj^*D@yv`bwCSIdL)rvJQU9g60eRPl~dk9s@mQLRY@jX`>@yVLZ>`;B?5J*!N>JjEaO`WaNuV}%=}Ss6}RMrmu}TC))9mMwO2g?8yVCo@W1z}!~A}--3?V~ z5|io&Cc>jdvsUnFz&u@9x_F1>EnO(22DqR*E$>>6=gJxhN9;`u>02Jy%*j~pe=Szug9DXvLmm7_vC=r_FspHf)zL%B_}m`5 zLQhOf>a`F$Q&TrpnhUp0-g+F8`Vl?wIeOr;_SBs6)ZEU{$5CmXyF%M0`f?ouV^3=` z?}Wbf1Wx{R84M4^wgj>|n3zlD%YH(8_d~)On?a&00T~J& z{UMSv$_>kq%acGxFA*dEWpt1&rl6vpy^;@m zxH`ZSK5A0FaK%5M)-qhBi(Ao-WIJa*De(#`UwY>-e@v#2UcV=Xqt>p^gNrv=<|uo1 zSmEeKO~HQ8O1Im`^e12Z2!yybrF$f7HqmP(aK=m=ci}n6HpI3A_~fhDKd#LYjyrh_#qm*Ef!Vmpp>BM(OLDzvVwD{;HXjk6l-~A$ae` zG%uivr3-pqr4q>LUi``!D>UTvOzQ0zBXoX0337U$%y!sK2nEP>>bn?4G!!6*_^XLG 
zSvSBWC_n(2F015w5OT!3#w|8QSWEYmh>H`mSgHLoZTNbL*eCh0cL(z`6Hao&AbNCE zW--m~P3fm%K!i~%m)owv$S?7TsQ)!2Q&!-QarxWh+R2yJFH4anjo1rD@baomEYSdM z1x^`m44!d>ebHxYVZx%ZjL8%Lb@VMHD{uu@TH0A_hF|j6FuSHgc)1gHH{C*tzTHa# zTQ;bQmeo<0V4+-I$D>HXx^XNlZxlm_;=ojFGPl`)9jP{a~VNk>2gUnMm0s z@DPL}tM1qO(JCJLJ2t6s>Z$TUej4(%lS_enN6;IOm+d+#Z@W439QMX|hEY7Y=(a>T z~m`H{Jt1i#XU*A zTrQH?qN1KpgN@MR02?+Y^jjR_ZzQk4zUK!8%8zvFIhh^W43KSsSAN(=agi<)`e$JD{fe#TQ z(QeU^Wd^IhunQ(So>h&OEv8~R{#i5f0lLi$UWD&f1p0S1V~QOWD1wW~Z*DvAdn4@K<;$TTE#25KfU>ocZq(^ZzNU4^~Wgji9#hPEe_Y z{`4x6{p)UeRU&b}sVRgjY?`9R*90`m3$jOEgti06&OL1F65J}>|$ ztZjrpZe>`~)9N4RWu&_5*XFrTySplWGV%EY%+lm@f3lE$u4xW5(b~cT%k8wu(*Wwo zTbB=eKqx};$%PrAA`%MfiQqIALHd)<|P zue)&J$0W*km!m>D)a4m{GVg!jYPeyJc{cBX4L%UTWGT)H?zx}cNvrzkf4rdE?RJ2^ zZ}zwDg(mXJ{4ds=daD6?$|I!%3|i?MoZ2Wu=8pUj%zUjmmQVFUPHY6h^bxL5dt)o% zAZ)oydWcANpFMXF7wGEo4l28B@lSLwTQI_Qa3Pzz-X}9^yT?0Ryz5Ru z@Y;O9RbnH7VsHQM@x30yj2C#5kOB`3#>T`<#n`>sx#=Nv*ecU&jpMdoQ9AI6{%=`s5>n4cPA%x64f%Y*1D1oBh70#L%AlU_@bY#ZZhqH;*IUv>?}O;fkD`*Y*YZ9vUKrChS#?%;&ql02J=6 zi-+XAhO2NhxzNeg+KVm0qI&O(akxXme@}GW_5c48_wO~Ol**O3qo6glCzbt=A>zu1 z1orcKiNpuhc8`6!-njo0(v*i4nqFJ{E-sU>7n}#W3S=CbOmxF+ACSMAGAj}eU zGJ##t)rt0SBt<~X68vt3d{9Jw^B7{*KD3rcD0GTbpvacnq|bkd(9-R2#FOt>mcCa7 z?0f_Hn0HQv`-*%(yKB9gg^tsUT67oK)L zJ70z@a8%2@<5#fNEnJ&uoUX^I$;F!L^!FfJfAnQM9VEEK!7gnK$M{?(5)D=Ep0XfD zYX?VT5_05iw?A{&FT?)U)zLcTN0e%dFLnQJzix2}i22e;dFz#yrNt_M?k2q3KG= zSHcL1c5nmWKTdIw(b@S_Rs1B{6XoD}O+gl?X+@|-oW_^rlD!#W& yq!f2-CAT}xD^b>|*5I1r$aMEQVqLkoFVOedsSO*n{^Lcp`i71FdT;&x>wf|8F!@RV literal 0 HcmV?d00001 diff --git a/tests/tabix_test.py b/tests/tabix_test.py index 890130d..013ff86 100644 --- a/tests/tabix_test.py +++ b/tests/tabix_test.py @@ -11,6 +11,7 @@ import shutil import gzip import pysam import unittest +import subprocess import glob import re import copy @@ -52,7 +53,7 @@ def checkBinaryEqual(filename1, filename2): if len(d1) != len(d2): return False - + found = False for c1, c2 in zip(d1, d2): if c1 != c2: @@ -76,17 +77,18 @@ class TestIndexing(unittest.TestCase): 
'''test indexing via preset.''' pysam.tabix_index(self.tmpfilename, preset="gff") - self.assertTrue(checkBinaryEqual(self.tmpfilename + ".tbi", self.filename_idx)) + self.assertTrue(checkBinaryEqual( + self.tmpfilename + ".tbi", self.filename_idx)) def test_indexing_to_custom_location_works(self): '''test indexing a file with a non-default location.''' index_path = get_temp_filename(suffix='custom.tbi') - pysam.tabix_index(self.tmpfilename, preset="gff", index=index_path, force=True) + pysam.tabix_index(self.tmpfilename, preset="gff", + index=index_path, force=True) self.assertTrue(checkBinaryEqual(index_path, self.filename_idx)) os.unlink(index_path) - def test_indexing_with_explict_columns_works(self): '''test indexing via preset.''' @@ -96,7 +98,8 @@ class TestIndexing(unittest.TestCase): end_col=4, line_skip=0, zerobased=False) - self.assertTrue(checkBinaryEqual(self.tmpfilename + ".tbi", self.filename_idx)) + self.assertTrue(checkBinaryEqual( + self.tmpfilename + ".tbi", self.filename_idx)) def test_indexing_with_lineskipping_works(self): '''test indexing via preset and lineskip.''' @@ -106,8 +109,9 @@ class TestIndexing(unittest.TestCase): end_col=4, line_skip=1, zerobased=False) - self.assertFalse(checkBinaryEqual(self.tmpfilename + ".tbi", self.filename_idx)) - + self.assertFalse(checkBinaryEqual( + self.tmpfilename + ".tbi", self.filename_idx)) + def tearDown(self): os.unlink(self.tmpfilename) if os.path.exists(self.tmpfilename + ".tbi"): @@ -122,7 +126,7 @@ class TestCompression(unittest.TestCase): def setUp(self): self.tmpfilename = get_temp_filename(suffix="gtf") with gzip.open(self.filename, "rb") as infile, \ - open(self.tmpfilename, "wb") as outfile: + open(self.tmpfilename, "wb") as outfile: outfile.write(infile.read()) def testCompression(self): @@ -206,19 +210,19 @@ class IterationTest(unittest.TestCase): if start is not None and end is None: # until end of contig subset = [x[3] - for x in self.compare if x[0] == contig - and x[2] > start] + for x 
in self.compare if x[0] == contig and + x[2] > start] elif start is None and end is not None: # from start of contig subset = [x[3] - for x in self.compare if x[0] == contig - and x[1] <= end] + for x in self.compare if x[0] == contig and + x[1] <= end] elif start is None and end is None: subset = [x[3] for x in self.compare if x[0] == contig] else: # all within interval - subset = [x[3] for x in self.compare if x[0] == contig - and min(x[2], end) - max(x[1], start) > 0] + subset = [x[3] for x in self.compare if x[0] == contig and + min(x[2], end) - max(x[1], start) > 0] if self.with_comments: subset.extend(self.comments) @@ -401,7 +405,7 @@ class TestIterationWithComments(TestIterationWithoutComments): def setUp(self): TestIterationWithoutComments.setUp(self) - + class TestIterators(unittest.TestCase): filename = os.path.join(TABIX_DATADIR, "example.gtf.gz") @@ -415,7 +419,7 @@ class TestIterators(unittest.TestCase): self.compare = load_and_convert(self.filename) self.tmpfilename_uncompressed = 'tmp_TestIterators' with gzip.open(self.filename, "rb") as infile, \ - open(self.tmpfilename_uncompressed, "wb") as outfile: + open(self.tmpfilename_uncompressed, "wb") as outfile: outfile.write(infile.read()) def tearDown(self): @@ -591,7 +595,8 @@ if IS_PYTHON3: with pysam.TabixFile( self.tmpfilename + ".gz", encoding="ascii") as t: results = list(t.fetch(parser=pysam.asVCF())) - self.assertRaises(UnicodeDecodeError, getattr, results[1], "id") + self.assertRaises(UnicodeDecodeError, + getattr, results[1], "id") with pysam.TabixFile( self.tmpfilename + ".gz", encoding="utf-8") as t: @@ -625,7 +630,7 @@ class TestVCFFromTabix(TestVCF): def tearDown(self): self.tabix.close() TestVCF.tearDown(self) - + def testRead(self): ncolumns = len(self.columns) @@ -748,7 +753,7 @@ class TestVCFFromVCF(TestVCF): for x, msg in self.fail_on_parsing: if "{}.vcf".format(x) == fn: return "parsing" - + for x, msg in self.fail_on_samples: if "{}.vcf".format(x) == fn: return "samples" @@ -995,7 
+1000,8 @@ class TestVCFFromVariantFile(TestVCFFromVCF): v = smp.values() if 'GT' in smp: - alleles = [str(a) if a is not None else '.' for a in smp.allele_indices] + alleles = [ + str(a) if a is not None else '.' for a in smp.allele_indices] v[0] = '/|'[smp.phased].join(alleles) comp = ":".join(map(convert_field, v)) @@ -1035,7 +1041,7 @@ for vcf_file in vcf_files: class TestRemoteFileHTTP(unittest.TestCase): - url = "http://www.cgat.org/downloads/public/pysam/test/example.gtf.gz" + url = "http://genserv.anat.ox.ac.uk/downloads/pysam/test/example.gtf.gz" region = "chr1:1-1000" local = os.path.join(TABIX_DATADIR, "example.gtf.gz") @@ -1044,7 +1050,7 @@ class TestRemoteFileHTTP(unittest.TestCase): self.remote_file = None else: self.remote_file = pysam.TabixFile(self.url, "r") - + self.local_file = pysam.TabixFile(self.local, "r") def tearDown(self): @@ -1074,7 +1080,7 @@ class TestRemoteFileHTTP(unittest.TestCase): class TestRemoteFileHTTPWithHeader(TestRemoteFileHTTP): - url = "http://www.cgat.org/downloads/public/pysam/test/example_comments.gtf.gz" + url = "http://genserv.anat.ox.ac.uk/downloads/pysam/test/example_comments.gtf.gz" region = "chr1:1-1000" local = os.path.join(TABIX_DATADIR, "example_comments.gtf.gz") @@ -1091,7 +1097,7 @@ class TestRemoteFileHTTPWithHeader(TestRemoteFileHTTP): self.assertEqual(list(self.local_file.header), ["# comment at start"]) self.assertEqual(list(self.local_file.header), self.remote_file.header) - + class TestIndexArgument(unittest.TestCase): @@ -1099,6 +1105,7 @@ class TestIndexArgument(unittest.TestCase): filename_dst = "tmp_example.vcf.gz" index_src = os.path.join(TABIX_DATADIR, "example.vcf.gz.tbi") index_dst = "tmp_index_example.vcf.gz.tbi" + index_dst_dat = "tmp_index_example.vcf.gz.tbi.dat" preset = "vcf" def testFetchAll(self): @@ -1120,6 +1127,25 @@ class TestIndexArgument(unittest.TestCase): os.unlink(self.filename_dst) os.unlink(self.index_dst) + def testLoadIndexWithoutTbiExtension(self): + 
shutil.copyfile(self.filename_src, self.filename_dst) + shutil.copyfile(self.index_src, self.index_dst_dat) + + with pysam.TabixFile( + self.filename_src, "r", index=self.index_src) as same_basename_file: + same_basename_results = list(same_basename_file.fetch()) + + with pysam.TabixFile( + self.filename_dst, "r", index=self.index_dst_dat) as diff_index_file: + diff_index_result = list(diff_index_file.fetch()) + + self.assertEqual(len(same_basename_results), len(diff_index_result)) + for x, y in zip(same_basename_results, diff_index_result): + self.assertEqual(x, y) + + os.unlink(self.filename_dst) + os.unlink(self.index_dst_dat) + def _TestMultipleIteratorsHelper(filename, multiple_iterators): '''open file within scope, return iterator.''' @@ -1188,8 +1214,10 @@ class TestMultipleIterators(unittest.TestCase): def testDisjointIterators(self): # two iterators working on the same file with pysam.TabixFile(self.filename) as tabix: - a = tabix.fetch(parser=pysam.asGTF(), multiple_iterators=True).next() - b = tabix.fetch(parser=pysam.asGTF(), multiple_iterators=True).next() + a = tabix.fetch(parser=pysam.asGTF(), + multiple_iterators=True).next() + b = tabix.fetch(parser=pysam.asGTF(), + multiple_iterators=True).next() # both iterators are at top of file self.assertEqual(str(a), str(b)) @@ -1223,4 +1251,5 @@ class TestContextManager(unittest.TestCase): if __name__ == "__main__": + subprocess.call("make -C %s" % TABIX_DATADIR, shell=True) unittest.main() diff --git a/tests/tabixproxies_test.py b/tests/tabixproxies_test.py index 35ad8fc..32f8e42 100644 --- a/tests/tabixproxies_test.py +++ b/tests/tabixproxies_test.py @@ -89,7 +89,7 @@ class TestParser(unittest.TestCase): '''test iteration from uncompressed file.''' tmpfilename = 'tmp_testIteratorUncompressed' with gzip.open(self.filename, "rb") as infile, \ - open(tmpfilename, "wb") as outfile: + open(tmpfilename, "wb") as outfile: outfile.write(infile.read()) with open(tmpfilename) as infile: @@ -130,7 +130,7 @@ class 
TestGTF(TestParser): # remove quotes around numeric values s = re.sub("\"(\d+)\"", r"\1", s) return s - + def testRead(self): for x, r in enumerate(self.tabix.fetch(parser=self.parser())): @@ -149,7 +149,7 @@ class TestGTF(TestParser): r = self.tabix.fetch(parser=self.parser()).next() - r.contig = r.contig + "_test_contig" + r.contig = r.contig + "_test_contig" r.source = r.source + "_test_source" r.feature = r.feature + "_test_feature" r.start += 10 @@ -173,7 +173,7 @@ class TestGTF(TestParser): sr = str(r) self.assertEqual(r.transcript_id, "abcd") self.assertTrue("transcript_id \"abcd\"" in sr) - + def test_added_attribute_is_output(self): r = self.tabix.fetch(parser=self.parser()).next() @@ -187,14 +187,14 @@ class TestGTF(TestParser): self.assertTrue("new_text_attribute \"abc\"" in str(r).split("\t")[8]) def test_setting_start_is_one_based(self): - + r = self.tabix.fetch(parser=self.parser()).next() r.start = 1800 self.assertEqual(r.start, 1800) self.assertEqual(str(r).split("\t")[3], "1801") def test_setting_end_is_one_based(self): - + r = self.tabix.fetch(parser=self.parser()).next() r.end = 2100 self.assertEqual(r.end, 2100) @@ -281,14 +281,14 @@ class TestGTF(TestParser): def test_asdict_contains_attributes(self): r = self.tabix.fetch(parser=self.parser()).next() - d = r.as_dict() + d = r.to_dict() c = self.compare[0] s = self.build_attribute_string(d) self.assertEqual(s, c[8]) def test_asdict_can_be_modified(self): r = self.tabix.fetch(parser=self.parser()).next() - d = r.as_dict() + d = r.to_dict() d["gene_id"] = "new_gene_id" self.assertTrue("gene_id \"new_gene_id\"", str(r)) @@ -322,7 +322,7 @@ class TestGFF3(TestGTF): def test_setting_fields(self): for r in self.tabix.fetch(parser=self.parser()): - r.contig = r.contig + "_test_contig" + r.contig = r.contig + "_test_contig" r.source = "test_source" r.feature = "test_feature" r.start += 10 @@ -330,7 +330,7 @@ class TestGFF3(TestGTF): r.score = 20 r.strand = "+" r.frame = 0 - r.ID="test" + r.ID = "test" 
sr = str(r) self.assertTrue("test_contig" in sr) self.assertTrue("test_source" in sr) @@ -357,6 +357,6 @@ class TestGFF3(TestGTF): r.new_text_attribute = "abc" self.assertTrue("new_text_attribute=abc" in str(r).split("\t")[8]) - + if __name__ == "__main__": unittest.main() diff --git a/tests/test_samtools_python.py b/tests/test_samtools_python.py index f7a351b..6088ce7 100644 --- a/tests/test_samtools_python.py +++ b/tests/test_samtools_python.py @@ -6,7 +6,7 @@ from TestUtils import BAM_DATADIR def test_idxstats_parse_split_lines(): bam_filename = os.path.join(BAM_DATADIR, "ex2.bam") # Test pysam 0.8.X style output, which returns a list of lines - lines = pysam.idxstats(bam_filename, split_lines=True) + lines = pysam.idxstats(bam_filename, split_lines=True) for line in lines: _seqname, _seqlen, nmapped, _nunmapped = line.split() @@ -37,7 +37,7 @@ def test_bedcov(): bam_filename = os.path.join(BAM_DATADIR, "ex1.bam") bed_filename = os.path.join(BAM_DATADIR, "ex1.bed") # Test pysam 0.9.X style output, which returns a string that needs to be split by \n - bedcov_string = pysam.bedcov(bed_filename, bam_filename, split_lines=False) + bedcov_string = pysam.bedcov(bed_filename, bam_filename, split_lines=False) lines = bedcov_string.splitlines() for line in lines: fields = line.split('\t') -- 2.30.2