From: Andreas Tille Date: Tue, 7 Dec 2021 14:07:01 +0000 (+0100) Subject: Drop outdated and unused tests X-Git-Tag: archive/raspbian/0.22.0+ds-1+rpi1~1^2^2~29 X-Git-Url: https://dgit.raspbian.org/?a=commitdiff_plain;h=e39be3eb9e6586ce37b5c52d5dd09f56db3bfdc7;p=python-pysam.git Drop outdated and unused tests --- diff --git a/debian/changelog b/debian/changelog index 0cef3be..49ef31d 100644 --- a/debian/changelog +++ b/debian/changelog @@ -7,6 +7,7 @@ python-pysam (0.18.0+ds-1) UNRELEASED; urgency=medium * Remove constraints unnecessary since buster: + Build-Depends: Drop versioned constraint on bcftools, libhts-dev and samtools. + * Drop outdated and unused tests -- Andreas Tille Tue, 07 Dec 2021 09:21:04 +0100 diff --git a/debian/patches/bcftools_v1.10_full b/debian/patches/bcftools_v1.10_full deleted file mode 100644 index fad5c40..0000000 --- a/debian/patches/bcftools_v1.10_full +++ /dev/null @@ -1,34431 +0,0 @@ -Author: Michael R. Crusoe -Description: sync with bcftools 1.10 - -use devtools/import.py and the contents of the bcftools -Debian package with its patches fully applied - ---- python-pysam.orig/bcftools/LICENSE -+++ python-pysam/bcftools/LICENSE -@@ -723,3 +723,26 @@ - - ----------------------------------------------------------------------------- - -+LICENSE FOR VariantKey (https://github.com/Genomicsplc/variantkey) -+ -+The MIT License -+ -+Copyright (c) 2017-2018 GENOMICS plc -+ -+Permission is hereby granted, free of charge, to any person obtaining a copy -+of this software and associated documentation files (the "Software"), to deal -+in the Software without restriction, including without limitation the rights -+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -+copies of the Software, and to permit persons to whom the Software is -+furnished to do so, subject to the following conditions: -+ -+The above copyright notice and this permission notice shall be included in -+all copies or substantial portions of the Software. 
-+ -+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -+THE SOFTWARE. ---- python-pysam.orig/bcftools/bam2bcf.c -+++ python-pysam/bcftools/bam2bcf.c -@@ -125,6 +125,7 @@ - memset(bca->rev_mqs,0,sizeof(int)*bca->nqual); - if ( call->ADF ) memset(call->ADF,0,sizeof(int32_t)*(call->n+1)*B2B_MAX_ALLELES); - if ( call->ADR ) memset(call->ADR,0,sizeof(int32_t)*(call->n+1)*B2B_MAX_ALLELES); -+ if ( call->SCR ) memset(call->SCR,0,sizeof(*call->SCR)*(call->n+1)); - } - - /* -@@ -152,6 +153,7 @@ - memset(r->qsum,0,sizeof(float)*4); - memset(r->anno,0,sizeof(double)*16); - memset(r->p,0,sizeof(float)*25); -+ r->SCR = 0; - - if (ref_base >= 0) { - ref4 = seq_nt16_int[ref_base]; -@@ -199,6 +201,7 @@ - if (q > 63) q = 63; - if (q < 4) q = 4; // MQ=0 reads count as BQ=4 - bca->bases[n++] = q<<5 | (int)bam_is_rev(p->b)<<4 | b; -+ if ( bca->fmt_flag&(B2B_INFO_SCR|B2B_FMT_SCR) && PLP_HAS_SOFT_CLIP(p->cd.i) ) r->SCR++; - // collect annotations - if (b < 4) - { -@@ -225,8 +228,12 @@ - // collect for bias tests - if ( baseQ > 59 ) baseQ = 59; - if ( mapQ > 59 ) mapQ = 59; -- int len, pos = get_position(p, &len); -- int epos = (double)pos/(len+1) * bca->npos; -+ int len, epos = 0; -+ if ( bca->fmt_flag & (B2B_INFO_RPB|B2B_INFO_VDB) ) -+ { -+ int pos = get_position(p, &len); -+ epos = (double)pos/(len+1) * bca->npos; -+ } - int ibq = baseQ/60. * bca->nqual; - int imq = mapQ/60. 
* bca->nqual; - if ( bam_is_rev(p->b) ) bca->rev_mqs[imq]++; -@@ -650,6 +657,14 @@ - call->DP4[4*i+3] = calls[i].anno[3]; - } - } -+ if ( call->SCR ) -+ { -+ for (i=0; iSCR[0] += calls[i].SCR; -+ call->SCR[1+i] = calls[i].SCR; -+ } -+ } - if ( call->ADF ) - { - assert( call->n_alleles<=B2B_MAX_ALLELES ); // this is always true for SNPs and so far for indels as well -@@ -702,19 +717,23 @@ - // calc_chisq_bias("XMQ", call->bcf_hdr->id[BCF_DT_CTG][call->tid].key, call->pos, bca->ref_mq, bca->alt_mq, bca->nqual); - // calc_chisq_bias("XBQ", call->bcf_hdr->id[BCF_DT_CTG][call->tid].key, call->pos, bca->ref_bq, bca->alt_bq, bca->nqual); - -- call->mwu_pos = calc_mwu_bias(bca->ref_pos, bca->alt_pos, bca->npos); -+ if ( bca->fmt_flag & B2B_INFO_RPB ) -+ call->mwu_pos = calc_mwu_bias(bca->ref_pos, bca->alt_pos, bca->npos); - call->mwu_mq = calc_mwu_bias(bca->ref_mq, bca->alt_mq, bca->nqual); - call->mwu_bq = calc_mwu_bias(bca->ref_bq, bca->alt_bq, bca->nqual); - call->mwu_mqs = calc_mwu_bias(bca->fwd_mqs, bca->rev_mqs, bca->nqual); - - #if CDF_MWU_TESTS -- call->mwu_pos_cdf = calc_mwu_bias_cdf(bca->ref_pos, bca->alt_pos, bca->npos); -+ // CDF version of MWU tests is not calculated by default -+ if ( bca->fmt_flag & B2B_INFO_RPB ) -+ call->mwu_pos_cdf = calc_mwu_bias_cdf(bca->ref_pos, bca->alt_pos, bca->npos); - call->mwu_mq_cdf = calc_mwu_bias_cdf(bca->ref_mq, bca->alt_mq, bca->nqual); - call->mwu_bq_cdf = calc_mwu_bias_cdf(bca->ref_bq, bca->alt_bq, bca->nqual); - call->mwu_mqs_cdf = calc_mwu_bias_cdf(bca->fwd_mqs, bca->rev_mqs, bca->nqual); - #endif - -- call->vdb = calc_vdb(bca->alt_pos, bca->npos); -+ if ( bca->fmt_flag & B2B_INFO_VDB ) -+ call->vdb = calc_vdb(bca->alt_pos, bca->npos); - - return 0; - } -@@ -790,6 +809,8 @@ - if ( fmt_flag&B2B_INFO_DPR ) - bcf_update_info_int32(hdr, rec, "DPR", bc->ADF, rec->n_allele); - } -+ if ( fmt_flag&B2B_INFO_SCR ) -+ bcf_update_info_int32(hdr, rec, "SCR", bc->SCR, 1); - - float tmpf[16]; - for (i=0; i<16; i++) tmpf[i] = 
bc->anno[i]; -@@ -861,6 +882,8 @@ - if ( fmt_flag&B2B_FMT_DPR ) - bcf_update_format_int32(hdr, rec, "DPR", bc->ADF+B2B_MAX_ALLELES, rec->n_sample*rec->n_allele); - } -+ if ( fmt_flag&B2B_FMT_SCR ) -+ bcf_update_format_int32(hdr, rec, "SCR", bc->SCR+1, rec->n_sample); - - return 0; - } ---- python-pysam.orig/bcftools/bam2bcf.c.pysam.c -+++ python-pysam/bcftools/bam2bcf.c.pysam.c -@@ -127,6 +127,7 @@ - memset(bca->rev_mqs,0,sizeof(int)*bca->nqual); - if ( call->ADF ) memset(call->ADF,0,sizeof(int32_t)*(call->n+1)*B2B_MAX_ALLELES); - if ( call->ADR ) memset(call->ADR,0,sizeof(int32_t)*(call->n+1)*B2B_MAX_ALLELES); -+ if ( call->SCR ) memset(call->SCR,0,sizeof(*call->SCR)*(call->n+1)); - } - - /* -@@ -154,6 +155,7 @@ - memset(r->qsum,0,sizeof(float)*4); - memset(r->anno,0,sizeof(double)*16); - memset(r->p,0,sizeof(float)*25); -+ r->SCR = 0; - - if (ref_base >= 0) { - ref4 = seq_nt16_int[ref_base]; -@@ -201,6 +203,7 @@ - if (q > 63) q = 63; - if (q < 4) q = 4; // MQ=0 reads count as BQ=4 - bca->bases[n++] = q<<5 | (int)bam_is_rev(p->b)<<4 | b; -+ if ( bca->fmt_flag&(B2B_INFO_SCR|B2B_FMT_SCR) && PLP_HAS_SOFT_CLIP(p->cd.i) ) r->SCR++; - // collect annotations - if (b < 4) - { -@@ -227,8 +230,12 @@ - // collect for bias tests - if ( baseQ > 59 ) baseQ = 59; - if ( mapQ > 59 ) mapQ = 59; -- int len, pos = get_position(p, &len); -- int epos = (double)pos/(len+1) * bca->npos; -+ int len, epos = 0; -+ if ( bca->fmt_flag & (B2B_INFO_RPB|B2B_INFO_VDB) ) -+ { -+ int pos = get_position(p, &len); -+ epos = (double)pos/(len+1) * bca->npos; -+ } - int ibq = baseQ/60. * bca->nqual; - int imq = mapQ/60. 
* bca->nqual; - if ( bam_is_rev(p->b) ) bca->rev_mqs[imq]++; -@@ -652,6 +659,14 @@ - call->DP4[4*i+3] = calls[i].anno[3]; - } - } -+ if ( call->SCR ) -+ { -+ for (i=0; iSCR[0] += calls[i].SCR; -+ call->SCR[1+i] = calls[i].SCR; -+ } -+ } - if ( call->ADF ) - { - assert( call->n_alleles<=B2B_MAX_ALLELES ); // this is always true for SNPs and so far for indels as well -@@ -704,19 +719,23 @@ - // calc_chisq_bias("XMQ", call->bcf_hdr->id[BCF_DT_CTG][call->tid].key, call->pos, bca->ref_mq, bca->alt_mq, bca->nqual); - // calc_chisq_bias("XBQ", call->bcf_hdr->id[BCF_DT_CTG][call->tid].key, call->pos, bca->ref_bq, bca->alt_bq, bca->nqual); - -- call->mwu_pos = calc_mwu_bias(bca->ref_pos, bca->alt_pos, bca->npos); -+ if ( bca->fmt_flag & B2B_INFO_RPB ) -+ call->mwu_pos = calc_mwu_bias(bca->ref_pos, bca->alt_pos, bca->npos); - call->mwu_mq = calc_mwu_bias(bca->ref_mq, bca->alt_mq, bca->nqual); - call->mwu_bq = calc_mwu_bias(bca->ref_bq, bca->alt_bq, bca->nqual); - call->mwu_mqs = calc_mwu_bias(bca->fwd_mqs, bca->rev_mqs, bca->nqual); - - #if CDF_MWU_TESTS -- call->mwu_pos_cdf = calc_mwu_bias_cdf(bca->ref_pos, bca->alt_pos, bca->npos); -+ // CDF version of MWU tests is not calculated by default -+ if ( bca->fmt_flag & B2B_INFO_RPB ) -+ call->mwu_pos_cdf = calc_mwu_bias_cdf(bca->ref_pos, bca->alt_pos, bca->npos); - call->mwu_mq_cdf = calc_mwu_bias_cdf(bca->ref_mq, bca->alt_mq, bca->nqual); - call->mwu_bq_cdf = calc_mwu_bias_cdf(bca->ref_bq, bca->alt_bq, bca->nqual); - call->mwu_mqs_cdf = calc_mwu_bias_cdf(bca->fwd_mqs, bca->rev_mqs, bca->nqual); - #endif - -- call->vdb = calc_vdb(bca->alt_pos, bca->npos); -+ if ( bca->fmt_flag & B2B_INFO_VDB ) -+ call->vdb = calc_vdb(bca->alt_pos, bca->npos); - - return 0; - } -@@ -792,6 +811,8 @@ - if ( fmt_flag&B2B_INFO_DPR ) - bcf_update_info_int32(hdr, rec, "DPR", bc->ADF, rec->n_allele); - } -+ if ( fmt_flag&B2B_INFO_SCR ) -+ bcf_update_info_int32(hdr, rec, "SCR", bc->SCR, 1); - - float tmpf[16]; - for (i=0; i<16; i++) tmpf[i] = 
bc->anno[i]; -@@ -863,6 +884,8 @@ - if ( fmt_flag&B2B_FMT_DPR ) - bcf_update_format_int32(hdr, rec, "DPR", bc->ADF+B2B_MAX_ALLELES, rec->n_sample*rec->n_allele); - } -+ if ( fmt_flag&B2B_FMT_SCR ) -+ bcf_update_format_int32(hdr, rec, "SCR", bc->SCR+1, rec->n_sample); - - return 0; - } ---- python-pysam.orig/bcftools/bam2bcf.h -+++ python-pysam/bcftools/bam2bcf.h -@@ -55,10 +55,18 @@ - #define B2B_INFO_AD (1<<9) - #define B2B_INFO_ADF (1<<10) - #define B2B_INFO_ADR (1<<11) -+#define B2B_INFO_SCR (1<<12) -+#define B2B_FMT_SCR (1<<13) -+#define B2B_INFO_VDB (1<<14) -+#define B2B_INFO_RPB (1<<15) - - #define B2B_MAX_ALLELES 5 - -+#define PLP_HAS_SOFT_CLIP(i) ((i)&1) -+#define PLP_SAMPLE_ID(i) ((i)>>1) -+ - typedef struct __bcf_callaux_t { -+ int fmt_flag; - int capQ, min_baseQ; - int openQ, extQ, tandemQ; // for indels - uint32_t min_support, max_support; // for collecting indel candidates -@@ -77,10 +85,11 @@ - void *rghash; - } bcf_callaux_t; - -+// per-sample values - typedef struct { - uint32_t ori_depth; - unsigned int mq0; -- int32_t *ADF, *ADR; -+ int32_t *ADF, *ADR, SCR; - float qsum[4]; - // The fields are: - // depth fwd .. ref (0) and non-ref (2) -@@ -98,6 +107,7 @@ - float p[25]; // phred-scaled likelihood of each genotype - } bcf_callret1_t; - -+// values for all samples - typedef struct { - int tid, pos; - bcf_hdr_t *bcf_hdr; -@@ -107,7 +117,7 @@ - int n_supp; // number of supporting non-reference reads - double anno[16]; - unsigned int depth, ori_depth, mq0; -- int32_t *PL, *DP4, *ADR, *ADF; -+ int32_t *PL, *DP4, *ADR, *ADF, *SCR; - uint8_t *fmt_arr; - float vdb; // variant distance bias - float mwu_pos, mwu_mq, mwu_bq, mwu_mqs; ---- python-pysam.orig/bcftools/bcftools.h -+++ python-pysam/bcftools/bcftools.h -@@ -39,7 +39,15 @@ - #define FT_STDIN (1<<3) - - char *bcftools_version(void); -+ -+/// Report an error and exit -1 - void error(const char *format, ...) HTS_NORETURN HTS_FORMAT(HTS_PRINTF_FMT, 1, 2); -+ -+/// Report an error and exit -1. 
If errno != 0, appends strerror(errno). -+// Note: unlike error() above, the message should not end with "\n" as a -+// newline will be added by the function. -+void error_errno(const char *format, ...) HTS_NORETURN HTS_FORMAT(HTS_PRINTF_FMT, 1, 2); -+ - void bcf_hdr_append_version(bcf_hdr_t *hdr, int argc, char **argv, const char *cmd); - const char *hts_bcf_wmode(int file_type); - ---- python-pysam.orig/bcftools/call.h -+++ python-pysam/bcftools/call.h -@@ -49,12 +49,35 @@ - } - family_t; - -+// For the single-sample and grouped -G calling -+typedef struct -+{ -+ float *qsum; // QS(quality sum) values -+ int nqsum, dp; -+ double fa,fb,fc,fa2,fb2,fc2,fab,fac,fbc; -+} -+grp1_t; -+typedef struct -+{ -+ grp1_t *grp; -+ int ngrp; -+ int *smpl2grp; -+} -+grp_t; -+ -+// For the `-C alleles -i` constrained calling -+typedef struct -+{ -+ uint32_t n:31, used:1; -+ char **allele; -+} -+tgt_als_t; -+ - typedef struct _ccall_t ccall_t; - typedef struct - { - // mcall only -- float *qsum; // QS(sum) values -- int nqsum, npdg; -+ int npdg; - int *als_map, nals_map; // mapping from full set of alleles to trimmed set of alleles (old -> new) - int *pl_map, npl_map; // same as above for PLs, but reverse (new -> old) - char **als; // array to hold the trimmed set of alleles to appear on output -@@ -65,14 +88,19 @@ - uint16_t *trio[5][5]; // family type, second index: allele count (2-4, first two are unused) - double *GLs; - float *GPs; // FORMAT/GP: posterior probabilities -- int32_t *GQs; // FORMAT/GQ: genotype qualities -+ int32_t *GQs, *ADs; // FORMAT/GQ: genotype qualities; AD: allelic depth for -G - int32_t *itmp; // temporary int array, used for new PLs with CALL_CONSTR_ALLELES -- int n_itmp, nGPs; -+ int n_itmp, nGPs, nADs; - vcmp_t *vcmp; - double trio_Pm_SNPs, trio_Pm_del, trio_Pm_ins; // P(mendelian) for trio calling, see mcall_call_trio_genotypes() - int32_t *ugts, *cgts; // unconstraind and constrained GTs - uint32_t output_tags; - char *prior_AN, *prior_AC; // 
reference panel AF tags (AF=AC/AN) -+ tgt_als_t *tgt_als; // for CALL_CONSTR_ALLELES -+ char *sample_groups; // for single-sample or grouped calling with -G -+ grp_t smpl_grp; -+ float *qsum; -+ int nqsum; - - // ccall only - double indel_frac, min_perm_p, min_lrt; ---- /dev/null -+++ python-pysam/bcftools/cols.c -@@ -0,0 +1,109 @@ -+/* -+ Copyright (C) 2019 Genome Research Ltd. -+ -+ Author: Petr Danecek -+ -+ Permission is hereby granted, free of charge, to any person obtaining a copy -+ of this software and associated documentation files (the "Software"), to deal -+ in the Software without restriction, including without limitation the rights -+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -+ copies of the Software, and to permit persons to whom the Software is -+ furnished to do so, subject to the following conditions: -+ -+ The above copyright notice and this permission notice shall be included in -+ all copies or substantial portions of the Software. -+ -+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+ THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -+ DEALINGS IN THE SOFTWARE. 
-+*/ -+ -+#include -+#include "cols.h" -+ -+cols_t *cols_split(const char *line, cols_t *cols, char delim) -+{ -+ if ( !cols ) cols = (cols_t*) calloc(1,sizeof(cols_t)); -+ if ( cols->rmme ) free(cols->rmme); -+ cols->n = 0; -+ cols->rmme = strdup(line); -+ char *ss = cols->rmme; -+ while (1) -+ { -+ char *se = ss; -+ while ( *se && *se!=delim ) se++; -+ char tmp = *se; -+ *se = 0; -+ cols->n++; -+ if ( cols->n > cols->m ) -+ { -+ cols->m += 10; -+ cols->off = (char**) realloc(cols->off, sizeof(*cols->off)*cols->m); -+ } -+ cols->off[ cols->n - 1 ] = ss; -+ if ( !tmp ) break; -+ ss = se + 1; -+ } -+ return cols; -+} -+ -+void cols_append(cols_t *cols, char *str) -+{ -+ if ( cols->rmme ) -+ { -+ size_t str_len = strlen(str); -+ size_t lst_len = strlen(cols->off[ cols->n - 1 ]); -+ size_t tot_len = 2 + str_len + lst_len + (cols->off[ cols->n - 1 ] - cols->rmme); -+ -+ cols_t *tmp_cols = (cols_t*)calloc(1,sizeof(cols_t)); -+ tmp_cols->rmme = (char*) calloc(tot_len,1); -+ tmp_cols->off = (char**) calloc(cols->n+1,sizeof(*tmp_cols->off)); -+ -+ char *ptr = tmp_cols->rmme; -+ int i; -+ for (i=0; in; i++) -+ { -+ size_t len = strlen(cols->off[i]); -+ memcpy(ptr, cols->off[i], len); -+ tmp_cols->off[i] = ptr; -+ ptr += len + 1; -+ } -+ memcpy(ptr, str, str_len); -+ tmp_cols->off[i] = ptr; -+ -+ free(cols->off); -+ free(cols->rmme); -+ cols->rmme = tmp_cols->rmme; -+ cols->off = tmp_cols->off; -+ cols->n = cols->n+1; -+ cols->m = cols->n; -+ free(tmp_cols); -+ return; -+ } -+ cols->n++; -+ if ( cols->n > cols->m ) -+ { -+ cols->m++; -+ cols->off = (char**) realloc(cols->off,sizeof(*cols->off)*cols->m); -+ } -+ cols->off[cols->n-1] = str; -+} -+void cols_clear(cols_t *cols) -+{ -+ if ( !cols ) return; -+ free(cols->rmme); -+ free(cols->off); -+ cols->rmme = NULL; -+ cols->off = NULL; -+} -+void cols_destroy(cols_t *cols) -+{ -+ if ( !cols ) return; -+ cols_clear(cols); -+ free(cols); -+} -+ ---- /dev/null -+++ python-pysam/bcftools/cols.c.pysam.c -@@ -0,0 +1,111 @@ 
-+#include "bcftools.pysam.h" -+ -+/* -+ Copyright (C) 2019 Genome Research Ltd. -+ -+ Author: Petr Danecek -+ -+ Permission is hereby granted, free of charge, to any person obtaining a copy -+ of this software and associated documentation files (the "Software"), to deal -+ in the Software without restriction, including without limitation the rights -+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -+ copies of the Software, and to permit persons to whom the Software is -+ furnished to do so, subject to the following conditions: -+ -+ The above copyright notice and this permission notice shall be included in -+ all copies or substantial portions of the Software. -+ -+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+ THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -+ DEALINGS IN THE SOFTWARE. 
-+*/ -+ -+#include -+#include "cols.h" -+ -+cols_t *cols_split(const char *line, cols_t *cols, char delim) -+{ -+ if ( !cols ) cols = (cols_t*) calloc(1,sizeof(cols_t)); -+ if ( cols->rmme ) free(cols->rmme); -+ cols->n = 0; -+ cols->rmme = strdup(line); -+ char *ss = cols->rmme; -+ while (1) -+ { -+ char *se = ss; -+ while ( *se && *se!=delim ) se++; -+ char tmp = *se; -+ *se = 0; -+ cols->n++; -+ if ( cols->n > cols->m ) -+ { -+ cols->m += 10; -+ cols->off = (char**) realloc(cols->off, sizeof(*cols->off)*cols->m); -+ } -+ cols->off[ cols->n - 1 ] = ss; -+ if ( !tmp ) break; -+ ss = se + 1; -+ } -+ return cols; -+} -+ -+void cols_append(cols_t *cols, char *str) -+{ -+ if ( cols->rmme ) -+ { -+ size_t str_len = strlen(str); -+ size_t lst_len = strlen(cols->off[ cols->n - 1 ]); -+ size_t tot_len = 2 + str_len + lst_len + (cols->off[ cols->n - 1 ] - cols->rmme); -+ -+ cols_t *tmp_cols = (cols_t*)calloc(1,sizeof(cols_t)); -+ tmp_cols->rmme = (char*) calloc(tot_len,1); -+ tmp_cols->off = (char**) calloc(cols->n+1,sizeof(*tmp_cols->off)); -+ -+ char *ptr = tmp_cols->rmme; -+ int i; -+ for (i=0; in; i++) -+ { -+ size_t len = strlen(cols->off[i]); -+ memcpy(ptr, cols->off[i], len); -+ tmp_cols->off[i] = ptr; -+ ptr += len + 1; -+ } -+ memcpy(ptr, str, str_len); -+ tmp_cols->off[i] = ptr; -+ -+ free(cols->off); -+ free(cols->rmme); -+ cols->rmme = tmp_cols->rmme; -+ cols->off = tmp_cols->off; -+ cols->n = cols->n+1; -+ cols->m = cols->n; -+ free(tmp_cols); -+ return; -+ } -+ cols->n++; -+ if ( cols->n > cols->m ) -+ { -+ cols->m++; -+ cols->off = (char**) realloc(cols->off,sizeof(*cols->off)*cols->m); -+ } -+ cols->off[cols->n-1] = str; -+} -+void cols_clear(cols_t *cols) -+{ -+ if ( !cols ) return; -+ free(cols->rmme); -+ free(cols->off); -+ cols->rmme = NULL; -+ cols->off = NULL; -+} -+void cols_destroy(cols_t *cols) -+{ -+ if ( !cols ) return; -+ cols_clear(cols); -+ free(cols); -+} -+ ---- /dev/null -+++ python-pysam/bcftools/cols.h -@@ -0,0 +1,51 @@ -+/* -+ Copyright 
(C) 2019 Genome Research Ltd. -+ -+ Author: Petr Danecek -+ -+ Permission is hereby granted, free of charge, to any person obtaining a copy -+ of this software and associated documentation files (the "Software"), to deal -+ in the Software without restriction, including without limitation the rights -+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -+ copies of the Software, and to permit persons to whom the Software is -+ furnished to do so, subject to the following conditions: -+ -+ The above copyright notice and this permission notice shall be included in -+ all copies or substantial portions of the Software. -+ -+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+ THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -+ DEALINGS IN THE SOFTWARE. -+*/ -+ -+#ifndef __COLS_H__ -+#define __COLS_H__ -+ -+#include -+ -+typedef struct -+{ -+ int n,m; -+ char **off, *rmme; -+} -+cols_t; -+ -+/* -+ cols_split() can be called repeatedly to split new strings, memory is allocated -+ and deallocated automatically -+*/ -+cols_t *cols_split(const char *line, cols_t *cols, char delim); -+ -+/* -+ Although cols_append() can be combined with cols_split(), it is much slower and -+ the string must exist throughout the life of cols unless initialized with cols_split(). 
-+*/ -+void cols_append(cols_t *cols, char *str); -+void cols_clear(cols_t *cols); -+void cols_destroy(cols_t *cols); -+ -+#endif ---- python-pysam.orig/bcftools/consensus.c -+++ python-pysam/bcftools/consensus.c -@@ -50,6 +50,7 @@ - #define PICK_ALT 2 - #define PICK_LONG 4 - #define PICK_SHORT 8 -+#define PICK_IUPAC 16 - - typedef struct - { -@@ -76,11 +77,12 @@ - int fa_src_pos; // last genomic coordinate read from the input fasta (0-based) - char prev_base; // this is only to validate the REF allele in the VCF - the modified fa_buf cannot be used for inserts following deletions, see 600#issuecomment-383186778 - int prev_base_pos; // the position of prev_base -+ int prev_is_insert; - - rbuf_t vcf_rbuf; - bcf1_t **vcf_buf; - int nvcf_buf, rid; -- char *chr; -+ char *chr, *chr_prefix; - - regidx_t *mask; - regitr_t *itr; -@@ -98,7 +100,7 @@ - FILE *fp_out; - FILE *fp_chain; - char **argv; -- int argc, output_iupac, haplotype, allele, isample; -+ int argc, output_iupac, haplotype, allele, isample, napplied; - char *fname, *ref_fname, *sample, *output_fname, *mask_fname, *chain_fname, missing_allele; - } - args_t; -@@ -207,7 +209,7 @@ - { - args->files = bcf_sr_init(); - args->files->require_index = 1; -- if ( !bcf_sr_add_reader(args->files,args->fname) ) error("Failed to open %s: %s\n", args->fname, bcf_sr_strerror(args->files->errnum)); -+ if ( !bcf_sr_add_reader(args->files,args->fname) ) error("Failed to read from %s: %s\n", !strcmp("-",args->fname)?"standard input":args->fname, bcf_sr_strerror(args->files->errnum)); - args->hdr = args->files->readers[0].header; - args->isample = -1; - if ( args->sample ) -@@ -299,7 +301,7 @@ - args->vcf_rbuf.n = 0; - bcf_sr_seek(args->files,line,args->fa_ori_pos); - if ( tmp_ptr ) *tmp_ptr = tmp; -- fprintf(args->fp_out,">%s\n",line); -+ fprintf(args->fp_out,">%s%s\n",args->chr_prefix?args->chr_prefix:"",line); - if (args->chain_fname ) - { - args->chain = init_chain(args->chain, args->fa_ori_pos); -@@ -331,7 +333,7 @@ - { - 
bcf1_t *rec = *rec_ptr; - if ( args->vcf_rbuf.n >= args->vcf_rbuf.m ) -- error("FIXME: too many overlapping records near %s:%d\n", bcf_seqname(args->hdr,rec),rec->pos+1); -+ error("FIXME: too many overlapping records near %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); - - // Insert the new record in the buffer. The line would be overwritten in - // the next bcf_sr_next_line call, therefore we need to swap it with an -@@ -395,9 +397,18 @@ - if ( !fmt ) return; - - if ( fmt->type!=BCF_BT_INT8 ) -- error("Todo: GT field represented with BCF_BT_INT8, too many alleles at %s:%d?\n",bcf_seqname(args->hdr,rec),rec->pos+1); -+ error("Todo: GT field represented with BCF_BT_INT8, too many alleles at %s:%"PRId64"?\n",bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); - uint8_t *ptr = fmt->p + fmt->size*args->isample; -- if ( args->haplotype ) -+ -+ enum { use_hap, use_iupac, pick_one } action = use_hap; -+ if ( args->allele==PICK_IUPAC ) -+ { -+ if ( !bcf_gt_is_phased(ptr[0]) && !bcf_gt_is_phased(ptr[fmt->n-1]) ) action = use_iupac; -+ } -+ else if ( args->output_iupac ) action = use_iupac; -+ else if ( !args->haplotype ) action = pick_one; -+ -+ if ( action==use_hap ) - { - if ( args->haplotype > fmt->n ) - { -@@ -410,7 +421,7 @@ - { - if ( !warned_haplotype ) - { -- fprintf(stderr, "Can't apply %d-th haplotype at %s:%d. (This warning is printed only once.)\n", args->haplotype,bcf_seqname(args->hdr,rec),rec->pos+1); -+ fprintf(stderr, "Can't apply %d-th haplotype at %s:%"PRId64". 
(This warning is printed only once.)\n", args->haplotype,bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); - warned_haplotype = 1; - } - return; -@@ -428,7 +439,7 @@ - ialt = bcf_gt_allele(ialt); - } - } -- else if ( args->output_iupac ) -+ else if ( action==use_iupac ) - { - ialt = ptr[0]; - if ( bcf_gt_is_missing(ialt) || ialt==bcf_int32_vector_end ) -@@ -456,7 +467,7 @@ - - if ( ialt>=0 ) - { -- if ( rec->n_allele <= ialt || rec->n_allele <= jalt ) error("Invalid VCF, too few ALT alleles at %s:%d\n", bcf_seqname(args->hdr,rec),rec->pos+1); -+ if ( rec->n_allele <= ialt || rec->n_allele <= jalt ) error("Invalid VCF, too few ALT alleles at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); - if ( ialt!=jalt && !rec->d.allele[ialt][1] && !rec->d.allele[jalt][1] ) // is this a het snp? - { - char ial = rec->d.allele[ialt][0]; -@@ -488,7 +499,7 @@ - { - if ( ptr[i]==(uint8_t)bcf_int8_vector_end ) break; - jalt = bcf_gt_allele(ptr[i]); -- if ( rec->n_allele <= jalt ) error("Broken VCF, too few alts at %s:%d\n", bcf_seqname(args->hdr,rec),rec->pos+1); -+ if ( rec->n_allele <= jalt ) error("Broken VCF, too few alts at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); - if ( args->allele & (PICK_LONG|PICK_SHORT) ) - { - int len = jalt==0 ? rec->rlen : strlen(rec->d.allele[jalt]); -@@ -510,7 +521,7 @@ - } - } - if ( !ialt ) return; // ref allele -- if ( rec->n_allele <= ialt ) error("Broken VCF, too few alts at %s:%d\n", bcf_seqname(args->hdr,rec),rec->pos+1); -+ if ( rec->n_allele <= ialt ) error("Broken VCF, too few alts at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); - } - else if ( args->output_iupac && !rec->d.allele[0][1] && !rec->d.allele[1][1] ) - { -@@ -531,18 +542,29 @@ - ialt = 1; - } - -- // Overlapping variant? Can be still OK iff this is an insertion -- if ( rec->pos <= args->fa_frz_pos && (rec->pos!=args->fa_frz_pos || rec->d.allele[0][0]!=rec->d.allele[ialt][0]) ) -+ // Overlapping variant? 
-+ if ( rec->pos <= args->fa_frz_pos ) - { -- fprintf(stderr,"The site %s:%d overlaps with another variant, skipping...\n", bcf_seqname(args->hdr,rec),rec->pos+1); -- return; -+ // Can be still OK iff this is an insertion (and which does not follow another insertion, see #888). -+ // This still may not be enough for more complicated cases with multiple duplicate positions -+ // and other types in between. In such case let the user normalize the VCF and remove duplicates. -+ int overlap = 0; -+ if ( rec->pos < args->fa_frz_pos || !(bcf_get_variant_type(rec,ialt) & VCF_INDEL) ) overlap = 1; -+ else if ( rec->d.var[ialt].n <= 0 || args->prev_is_insert ) overlap = 1; -+ -+ if ( overlap ) -+ { -+ fprintf(stderr,"The site %s:%"PRId64" overlaps with another variant, skipping...\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); -+ return; -+ } -+ - } - - int len_diff = 0, alen = 0; - int idx = rec->pos - args->fa_ori_pos + args->fa_mod_off; - if ( idx<0 ) - { -- fprintf(stderr,"Warning: ignoring overlapping variant starting at %s:%d\n", bcf_seqname(args->hdr,rec),rec->pos+1); -+ fprintf(stderr,"Warning: ignoring overlapping variant starting at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); - return; - } - if ( rec->rlen > args->fa_buf.l - idx ) -@@ -552,17 +574,17 @@ - if ( alen > rec->rlen ) - { - rec->d.allele[ialt][rec->rlen] = 0; -- fprintf(stderr,"Warning: trimming variant starting at %s:%d\n", bcf_seqname(args->hdr,rec),rec->pos+1); -+ fprintf(stderr,"Warning: trimming variant starting at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); - } - } - if ( idx>=args->fa_buf.l ) -- error("FIXME: %s:%d .. idx=%d, ori_pos=%d, len=%"PRIu64", off=%d\n",bcf_seqname(args->hdr,rec),rec->pos+1,idx,args->fa_ori_pos,(uint64_t)args->fa_buf.l,args->fa_mod_off); -+ error("FIXME: %s:%"PRId64" .. 
idx=%d, ori_pos=%d, len=%"PRIu64", off=%d\n",bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1,idx,args->fa_ori_pos,(uint64_t)args->fa_buf.l,args->fa_mod_off); - - // sanity check the reference base - if ( rec->d.allele[ialt][0]=='<' ) - { - if ( strcasecmp(rec->d.allele[ialt], "") ) -- error("Symbolic alleles other than are currently not supported: %s at %s:%d\n",rec->d.allele[ialt],bcf_seqname(args->hdr,rec),rec->pos+1); -+ error("Symbolic alleles other than are currently not supported: %s at %s:%"PRId64"\n",rec->d.allele[ialt],bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); - assert( rec->d.allele[0][1]==0 ); // todo: for now expecting strlen(REF) = 1 - len_diff = 1-rec->rlen; - rec->d.allele[ialt] = rec->d.allele[0]; // according to VCF spec, REF must precede the event -@@ -570,7 +592,7 @@ - } - else if ( strncasecmp(rec->d.allele[0],args->fa_buf.s+idx,rec->rlen) ) - { -- // This is hacky, handle a special case: if insert follows a deletion (AAC>A, C>CAA), -+ // This is hacky, handle a special case: if SNP or an insert follows a deletion (AAC>A, C>CAA), - // the reference base in fa_buf is lost and the check fails. 
We do not keep a buffer - // with the original sequence as it should not be necessary, we should encounter max - // one base overlap -@@ -591,11 +613,11 @@ - args->fa_buf.s[idx+rec->rlen] = 0; - } - error( -- "The fasta sequence does not match the REF allele at %s:%d:\n" -- " .vcf: [%s]\n" -+ "The fasta sequence does not match the REF allele at %s:%"PRId64":\n" -+ " .vcf: [%s] <- (REF)\n" - " .vcf: [%s] <- (ALT)\n" - " .fa: [%s]%c%s\n", -- bcf_seqname(args->hdr,rec),rec->pos+1, rec->d.allele[0], rec->d.allele[ialt], args->fa_buf.s+idx, -+ bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1, rec->d.allele[0], rec->d.allele[ialt], args->fa_buf.s+idx, - tmp?tmp:' ',tmp?args->fa_buf.s+idx+rec->rlen+1:"" - ); - } -@@ -618,19 +640,31 @@ - // deletion or same size event - for (i=0; ifa_buf.s[idx+i] = rec->d.allele[ialt][i]; -+ - if ( len_diff ) -- { -- args->prev_base = rec->d.allele[0][rec->rlen - 1]; -- args->prev_base_pos = rec->pos + rec->rlen - 1; - memmove(args->fa_buf.s+idx+alen,args->fa_buf.s+idx+rec->rlen,args->fa_buf.l-idx-rec->rlen); -- } -+ -+ args->prev_base = rec->d.allele[0][rec->rlen - 1]; -+ args->prev_base_pos = rec->pos + rec->rlen - 1; -+ args->prev_is_insert = 0; - } - else - { -+ args->prev_is_insert = 1; -+ args->prev_base_pos = rec->pos; -+ - // insertion - ks_resize(&args->fa_buf, args->fa_buf.l + len_diff); - memmove(args->fa_buf.s + idx + rec->rlen + len_diff, args->fa_buf.s + idx + rec->rlen, args->fa_buf.l - idx - rec->rlen); -- for (i=0; id.allele[0][ibeg]==rec->d.allele[ialt][ibeg] && rec->pos + ibeg <= args->prev_base_pos ) ibeg++; -+ for (i=ibeg; ifa_buf.s[idx+i] = rec->d.allele[ialt][i]; - } - if (args->chain && len_diff != 0) -@@ -650,6 +684,7 @@ - args->fa_buf.l += len_diff; - args->fa_mod_off += len_diff; - args->fa_frz_pos = rec->pos + rec->rlen - 1; -+ args->napplied++; - } - - -@@ -755,6 +790,7 @@ - flush_fa_buffer(args, 0); - bgzf_close(fasta); - free(str.s); -+ fprintf(stderr,"Applied %d variants\n", args->napplied); - } - - static 
void usage(args_t *args) -@@ -772,17 +808,19 @@ - fprintf(stderr, " -f, --fasta-ref reference sequence in fasta format\n"); - fprintf(stderr, " -H, --haplotype choose which allele to use from the FORMAT/GT field, note\n"); - fprintf(stderr, " the codes are case-insensitive:\n"); -- fprintf(stderr, " 1: first allele from GT\n"); -- fprintf(stderr, " 2: second allele\n"); -+ fprintf(stderr, " 1: first allele from GT, regardless of phasing\n"); -+ fprintf(stderr, " 2: second allele from GT, regardless of phasing\n"); - fprintf(stderr, " R: REF allele in het genotypes\n"); - fprintf(stderr, " A: ALT allele\n"); - fprintf(stderr, " LR,LA: longer allele and REF/ALT if equal length\n"); - fprintf(stderr, " SR,SA: shorter allele and REF/ALT if equal length\n"); -+ fprintf(stderr, " 1pIu,2pIu: first/second allele for phased and IUPAC code for unphased GTs\n"); - fprintf(stderr, " -i, --include select sites for which the expression is true (see man page for details)\n"); - fprintf(stderr, " -I, --iupac-codes output variants in the form of IUPAC ambiguity codes\n"); - fprintf(stderr, " -m, --mask replace regions with N\n"); - fprintf(stderr, " -M, --missing output instead of skipping the missing genotypes\n"); - fprintf(stderr, " -o, --output write output to a file [standard output]\n"); -+ fprintf(stderr, " -p, --prefix prefix to add to output sequence names\n"); - fprintf(stderr, " -s, --sample apply variants of the given sample\n"); - fprintf(stderr, "Examples:\n"); - fprintf(stderr, " # Get the consensus for one region. 
The fasta header lines are then expected\n"); -@@ -809,13 +847,15 @@ - {"mask",1,0,'m'}, - {"missing",1,0,'M'}, - {"chain",1,0,'c'}, -+ {"prefix",required_argument,0,'p'}, - {0,0,0,0} - }; - int c; -- while ((c = getopt_long(argc, argv, "h?s:1Ii:e:H:f:o:m:c:M:",loptions,NULL)) >= 0) -+ while ((c = getopt_long(argc, argv, "h?s:1Ii:e:H:f:o:m:c:M:p:",loptions,NULL)) >= 0) - { - switch (c) - { -+ case 'p': args->chr_prefix = optarg; break; - case 's': args->sample = optarg; break; - case 'o': args->output_fname = optarg; break; - case 'I': args->output_iupac = 1; break; -@@ -837,10 +877,14 @@ - else if ( !strcasecmp(optarg,"LA") ) args->allele |= PICK_LONG|PICK_ALT; - else if ( !strcasecmp(optarg,"SR") ) args->allele |= PICK_SHORT|PICK_REF; - else if ( !strcasecmp(optarg,"SA") ) args->allele |= PICK_SHORT|PICK_ALT; -+ else if ( !strcasecmp(optarg,"1pIu") ) args->allele |= PICK_IUPAC, args->haplotype = 1; -+ else if ( !strcasecmp(optarg,"2pIu") ) args->allele |= PICK_IUPAC, args->haplotype = 2; - else - { -- args->haplotype = optarg[0] - '0'; -- if ( args->haplotype <=0 ) error("Expected positive integer with --haplotype\n"); -+ char *tmp; -+ args->haplotype = strtol(optarg, &tmp, 10); -+ if ( tmp==optarg || *tmp ) error("Error: Could not parse --haplotype %s, expected numeric argument\n", optarg); -+ if ( args->haplotype <=0 ) error("Error: Expected positive integer with --haplotype\n"); - } - break; - default: usage(args); break; ---- python-pysam.orig/bcftools/consensus.c.pysam.c -+++ python-pysam/bcftools/consensus.c.pysam.c -@@ -52,6 +52,7 @@ - #define PICK_ALT 2 - #define PICK_LONG 4 - #define PICK_SHORT 8 -+#define PICK_IUPAC 16 - - typedef struct - { -@@ -78,11 +79,12 @@ - int fa_src_pos; // last genomic coordinate read from the input fasta (0-based) - char prev_base; // this is only to validate the REF allele in the VCF - the modified fa_buf cannot be used for inserts following deletions, see 600#issuecomment-383186778 - int prev_base_pos; // the position of 
prev_base -+ int prev_is_insert; - - rbuf_t vcf_rbuf; - bcf1_t **vcf_buf; - int nvcf_buf, rid; -- char *chr; -+ char *chr, *chr_prefix; - - regidx_t *mask; - regitr_t *itr; -@@ -100,7 +102,7 @@ - FILE *fp_out; - FILE *fp_chain; - char **argv; -- int argc, output_iupac, haplotype, allele, isample; -+ int argc, output_iupac, haplotype, allele, isample, napplied; - char *fname, *ref_fname, *sample, *output_fname, *mask_fname, *chain_fname, missing_allele; - } - args_t; -@@ -209,7 +211,7 @@ - { - args->files = bcf_sr_init(); - args->files->require_index = 1; -- if ( !bcf_sr_add_reader(args->files,args->fname) ) error("Failed to open %s: %s\n", args->fname, bcf_sr_strerror(args->files->errnum)); -+ if ( !bcf_sr_add_reader(args->files,args->fname) ) error("Failed to read from %s: %s\n", !strcmp("-",args->fname)?"standard input":args->fname, bcf_sr_strerror(args->files->errnum)); - args->hdr = args->files->readers[0].header; - args->isample = -1; - if ( args->sample ) -@@ -301,7 +303,7 @@ - args->vcf_rbuf.n = 0; - bcf_sr_seek(args->files,line,args->fa_ori_pos); - if ( tmp_ptr ) *tmp_ptr = tmp; -- fprintf(args->fp_out,">%s\n",line); -+ fprintf(args->fp_out,">%s%s\n",args->chr_prefix?args->chr_prefix:"",line); - if (args->chain_fname ) - { - args->chain = init_chain(args->chain, args->fa_ori_pos); -@@ -333,7 +335,7 @@ - { - bcf1_t *rec = *rec_ptr; - if ( args->vcf_rbuf.n >= args->vcf_rbuf.m ) -- error("FIXME: too many overlapping records near %s:%d\n", bcf_seqname(args->hdr,rec),rec->pos+1); -+ error("FIXME: too many overlapping records near %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); - - // Insert the new record in the buffer. 
The line would be overwritten in - // the next bcf_sr_next_line call, therefore we need to swap it with an -@@ -397,9 +399,18 @@ - if ( !fmt ) return; - - if ( fmt->type!=BCF_BT_INT8 ) -- error("Todo: GT field represented with BCF_BT_INT8, too many alleles at %s:%d?\n",bcf_seqname(args->hdr,rec),rec->pos+1); -+ error("Todo: GT field represented with BCF_BT_INT8, too many alleles at %s:%"PRId64"?\n",bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); - uint8_t *ptr = fmt->p + fmt->size*args->isample; -- if ( args->haplotype ) -+ -+ enum { use_hap, use_iupac, pick_one } action = use_hap; -+ if ( args->allele==PICK_IUPAC ) -+ { -+ if ( !bcf_gt_is_phased(ptr[0]) && !bcf_gt_is_phased(ptr[fmt->n-1]) ) action = use_iupac; -+ } -+ else if ( args->output_iupac ) action = use_iupac; -+ else if ( !args->haplotype ) action = pick_one; -+ -+ if ( action==use_hap ) - { - if ( args->haplotype > fmt->n ) - { -@@ -412,7 +423,7 @@ - { - if ( !warned_haplotype ) - { -- fprintf(bcftools_stderr, "Can't apply %d-th haplotype at %s:%d. (This warning is printed only once.)\n", args->haplotype,bcf_seqname(args->hdr,rec),rec->pos+1); -+ fprintf(bcftools_stderr, "Can't apply %d-th haplotype at %s:%"PRId64". 
(This warning is printed only once.)\n", args->haplotype,bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); - warned_haplotype = 1; - } - return; -@@ -430,7 +441,7 @@ - ialt = bcf_gt_allele(ialt); - } - } -- else if ( args->output_iupac ) -+ else if ( action==use_iupac ) - { - ialt = ptr[0]; - if ( bcf_gt_is_missing(ialt) || ialt==bcf_int32_vector_end ) -@@ -458,7 +469,7 @@ - - if ( ialt>=0 ) - { -- if ( rec->n_allele <= ialt || rec->n_allele <= jalt ) error("Invalid VCF, too few ALT alleles at %s:%d\n", bcf_seqname(args->hdr,rec),rec->pos+1); -+ if ( rec->n_allele <= ialt || rec->n_allele <= jalt ) error("Invalid VCF, too few ALT alleles at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); - if ( ialt!=jalt && !rec->d.allele[ialt][1] && !rec->d.allele[jalt][1] ) // is this a het snp? - { - char ial = rec->d.allele[ialt][0]; -@@ -490,7 +501,7 @@ - { - if ( ptr[i]==(uint8_t)bcf_int8_vector_end ) break; - jalt = bcf_gt_allele(ptr[i]); -- if ( rec->n_allele <= jalt ) error("Broken VCF, too few alts at %s:%d\n", bcf_seqname(args->hdr,rec),rec->pos+1); -+ if ( rec->n_allele <= jalt ) error("Broken VCF, too few alts at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); - if ( args->allele & (PICK_LONG|PICK_SHORT) ) - { - int len = jalt==0 ? rec->rlen : strlen(rec->d.allele[jalt]); -@@ -512,7 +523,7 @@ - } - } - if ( !ialt ) return; // ref allele -- if ( rec->n_allele <= ialt ) error("Broken VCF, too few alts at %s:%d\n", bcf_seqname(args->hdr,rec),rec->pos+1); -+ if ( rec->n_allele <= ialt ) error("Broken VCF, too few alts at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); - } - else if ( args->output_iupac && !rec->d.allele[0][1] && !rec->d.allele[1][1] ) - { -@@ -533,18 +544,29 @@ - ialt = 1; - } - -- // Overlapping variant? Can be still OK iff this is an insertion -- if ( rec->pos <= args->fa_frz_pos && (rec->pos!=args->fa_frz_pos || rec->d.allele[0][0]!=rec->d.allele[ialt][0]) ) -+ // Overlapping variant? 
-+ if ( rec->pos <= args->fa_frz_pos ) - { -- fprintf(bcftools_stderr,"The site %s:%d overlaps with another variant, skipping...\n", bcf_seqname(args->hdr,rec),rec->pos+1); -- return; -+ // Can be still OK iff this is an insertion (and which does not follow another insertion, see #888). -+ // This still may not be enough for more complicated cases with multiple duplicate positions -+ // and other types in between. In such case let the user normalize the VCF and remove duplicates. -+ int overlap = 0; -+ if ( rec->pos < args->fa_frz_pos || !(bcf_get_variant_type(rec,ialt) & VCF_INDEL) ) overlap = 1; -+ else if ( rec->d.var[ialt].n <= 0 || args->prev_is_insert ) overlap = 1; -+ -+ if ( overlap ) -+ { -+ fprintf(bcftools_stderr,"The site %s:%"PRId64" overlaps with another variant, skipping...\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); -+ return; -+ } -+ - } - - int len_diff = 0, alen = 0; - int idx = rec->pos - args->fa_ori_pos + args->fa_mod_off; - if ( idx<0 ) - { -- fprintf(bcftools_stderr,"Warning: ignoring overlapping variant starting at %s:%d\n", bcf_seqname(args->hdr,rec),rec->pos+1); -+ fprintf(bcftools_stderr,"Warning: ignoring overlapping variant starting at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); - return; - } - if ( rec->rlen > args->fa_buf.l - idx ) -@@ -554,17 +576,17 @@ - if ( alen > rec->rlen ) - { - rec->d.allele[ialt][rec->rlen] = 0; -- fprintf(bcftools_stderr,"Warning: trimming variant starting at %s:%d\n", bcf_seqname(args->hdr,rec),rec->pos+1); -+ fprintf(bcftools_stderr,"Warning: trimming variant starting at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); - } - } - if ( idx>=args->fa_buf.l ) -- error("FIXME: %s:%d .. idx=%d, ori_pos=%d, len=%"PRIu64", off=%d\n",bcf_seqname(args->hdr,rec),rec->pos+1,idx,args->fa_ori_pos,(uint64_t)args->fa_buf.l,args->fa_mod_off); -+ error("FIXME: %s:%"PRId64" .. 
idx=%d, ori_pos=%d, len=%"PRIu64", off=%d\n",bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1,idx,args->fa_ori_pos,(uint64_t)args->fa_buf.l,args->fa_mod_off); - - // sanity check the reference base - if ( rec->d.allele[ialt][0]=='<' ) - { - if ( strcasecmp(rec->d.allele[ialt], "") ) -- error("Symbolic alleles other than are currently not supported: %s at %s:%d\n",rec->d.allele[ialt],bcf_seqname(args->hdr,rec),rec->pos+1); -+ error("Symbolic alleles other than are currently not supported: %s at %s:%"PRId64"\n",rec->d.allele[ialt],bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); - assert( rec->d.allele[0][1]==0 ); // todo: for now expecting strlen(REF) = 1 - len_diff = 1-rec->rlen; - rec->d.allele[ialt] = rec->d.allele[0]; // according to VCF spec, REF must precede the event -@@ -572,7 +594,7 @@ - } - else if ( strncasecmp(rec->d.allele[0],args->fa_buf.s+idx,rec->rlen) ) - { -- // This is hacky, handle a special case: if insert follows a deletion (AAC>A, C>CAA), -+ // This is hacky, handle a special case: if SNP or an insert follows a deletion (AAC>A, C>CAA), - // the reference base in fa_buf is lost and the check fails. 
We do not keep a buffer - // with the original sequence as it should not be necessary, we should encounter max - // one base overlap -@@ -593,11 +615,11 @@ - args->fa_buf.s[idx+rec->rlen] = 0; - } - error( -- "The fasta sequence does not match the REF allele at %s:%d:\n" -- " .vcf: [%s]\n" -+ "The fasta sequence does not match the REF allele at %s:%"PRId64":\n" -+ " .vcf: [%s] <- (REF)\n" - " .vcf: [%s] <- (ALT)\n" - " .fa: [%s]%c%s\n", -- bcf_seqname(args->hdr,rec),rec->pos+1, rec->d.allele[0], rec->d.allele[ialt], args->fa_buf.s+idx, -+ bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1, rec->d.allele[0], rec->d.allele[ialt], args->fa_buf.s+idx, - tmp?tmp:' ',tmp?args->fa_buf.s+idx+rec->rlen+1:"" - ); - } -@@ -620,19 +642,31 @@ - // deletion or same size event - for (i=0; ifa_buf.s[idx+i] = rec->d.allele[ialt][i]; -+ - if ( len_diff ) -- { -- args->prev_base = rec->d.allele[0][rec->rlen - 1]; -- args->prev_base_pos = rec->pos + rec->rlen - 1; - memmove(args->fa_buf.s+idx+alen,args->fa_buf.s+idx+rec->rlen,args->fa_buf.l-idx-rec->rlen); -- } -+ -+ args->prev_base = rec->d.allele[0][rec->rlen - 1]; -+ args->prev_base_pos = rec->pos + rec->rlen - 1; -+ args->prev_is_insert = 0; - } - else - { -+ args->prev_is_insert = 1; -+ args->prev_base_pos = rec->pos; -+ - // insertion - ks_resize(&args->fa_buf, args->fa_buf.l + len_diff); - memmove(args->fa_buf.s + idx + rec->rlen + len_diff, args->fa_buf.s + idx + rec->rlen, args->fa_buf.l - idx - rec->rlen); -- for (i=0; id.allele[0][ibeg]==rec->d.allele[ialt][ibeg] && rec->pos + ibeg <= args->prev_base_pos ) ibeg++; -+ for (i=ibeg; ifa_buf.s[idx+i] = rec->d.allele[ialt][i]; - } - if (args->chain && len_diff != 0) -@@ -652,6 +686,7 @@ - args->fa_buf.l += len_diff; - args->fa_mod_off += len_diff; - args->fa_frz_pos = rec->pos + rec->rlen - 1; -+ args->napplied++; - } - - -@@ -757,6 +792,7 @@ - flush_fa_buffer(args, 0); - bgzf_close(fasta); - free(str.s); -+ fprintf(bcftools_stderr,"Applied %d variants\n", args->napplied); - } - - 
static void usage(args_t *args) -@@ -774,17 +810,19 @@ - fprintf(bcftools_stderr, " -f, --fasta-ref reference sequence in fasta format\n"); - fprintf(bcftools_stderr, " -H, --haplotype choose which allele to use from the FORMAT/GT field, note\n"); - fprintf(bcftools_stderr, " the codes are case-insensitive:\n"); -- fprintf(bcftools_stderr, " 1: first allele from GT\n"); -- fprintf(bcftools_stderr, " 2: second allele\n"); -+ fprintf(bcftools_stderr, " 1: first allele from GT, regardless of phasing\n"); -+ fprintf(bcftools_stderr, " 2: second allele from GT, regardless of phasing\n"); - fprintf(bcftools_stderr, " R: REF allele in het genotypes\n"); - fprintf(bcftools_stderr, " A: ALT allele\n"); - fprintf(bcftools_stderr, " LR,LA: longer allele and REF/ALT if equal length\n"); - fprintf(bcftools_stderr, " SR,SA: shorter allele and REF/ALT if equal length\n"); -+ fprintf(bcftools_stderr, " 1pIu,2pIu: first/second allele for phased and IUPAC code for unphased GTs\n"); - fprintf(bcftools_stderr, " -i, --include select sites for which the expression is true (see man page for details)\n"); - fprintf(bcftools_stderr, " -I, --iupac-codes output variants in the form of IUPAC ambiguity codes\n"); - fprintf(bcftools_stderr, " -m, --mask replace regions with N\n"); - fprintf(bcftools_stderr, " -M, --missing output instead of skipping the missing genotypes\n"); - fprintf(bcftools_stderr, " -o, --output write output to a file [standard output]\n"); -+ fprintf(bcftools_stderr, " -p, --prefix prefix to add to output sequence names\n"); - fprintf(bcftools_stderr, " -s, --sample apply variants of the given sample\n"); - fprintf(bcftools_stderr, "Examples:\n"); - fprintf(bcftools_stderr, " # Get the consensus for one region. 
The fasta header lines are then expected\n"); -@@ -811,13 +849,15 @@ - {"mask",1,0,'m'}, - {"missing",1,0,'M'}, - {"chain",1,0,'c'}, -+ {"prefix",required_argument,0,'p'}, - {0,0,0,0} - }; - int c; -- while ((c = getopt_long(argc, argv, "h?s:1Ii:e:H:f:o:m:c:M:",loptions,NULL)) >= 0) -+ while ((c = getopt_long(argc, argv, "h?s:1Ii:e:H:f:o:m:c:M:p:",loptions,NULL)) >= 0) - { - switch (c) - { -+ case 'p': args->chr_prefix = optarg; break; - case 's': args->sample = optarg; break; - case 'o': args->output_fname = optarg; break; - case 'I': args->output_iupac = 1; break; -@@ -839,10 +879,14 @@ - else if ( !strcasecmp(optarg,"LA") ) args->allele |= PICK_LONG|PICK_ALT; - else if ( !strcasecmp(optarg,"SR") ) args->allele |= PICK_SHORT|PICK_REF; - else if ( !strcasecmp(optarg,"SA") ) args->allele |= PICK_SHORT|PICK_ALT; -+ else if ( !strcasecmp(optarg,"1pIu") ) args->allele |= PICK_IUPAC, args->haplotype = 1; -+ else if ( !strcasecmp(optarg,"2pIu") ) args->allele |= PICK_IUPAC, args->haplotype = 2; - else - { -- args->haplotype = optarg[0] - '0'; -- if ( args->haplotype <=0 ) error("Expected positive integer with --haplotype\n"); -+ char *tmp; -+ args->haplotype = strtol(optarg, &tmp, 10); -+ if ( tmp==optarg || *tmp ) error("Error: Could not parse --haplotype %s, expected numeric argument\n", optarg); -+ if ( args->haplotype <=0 ) error("Error: Expected positive integer with --haplotype\n"); - } - break; - default: usage(args); break; ---- python-pysam.orig/bcftools/convert.c -+++ python-pysam/bcftools/convert.c -@@ -30,12 +30,15 @@ - #include - #include - #include -+#define __STDC_FORMAT_MACROS - #include - #include - #include - #include - #include -+#include - #include "bcftools.h" -+#include "variantkey.h" - #include "convert.h" - - #define T_CHROM 1 -@@ -67,6 +70,9 @@ - #define T_END 27 - #define T_POS0 28 - #define T_END0 29 -+#define T_RSX 30 // RSID HEX -+#define T_VKX 31 // VARIANTKEY HEX -+#define T_PBINOM 32 - - typedef struct _fmt_t - { -@@ -196,13 +202,44 @@ - 
} - static void process_info(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) - { -+ int i; -+ if ( !fmt->key ) // the whole INFO column -+ { -+ int first = 1; -+ for (i=0; in_info; i++) -+ { -+ bcf_info_t *inf = &line->d.info[i]; -+ if ( !inf->vptr ) continue; -+ if ( !first ) kputc(';', str); -+ first = 0; -+ if ( inf->key >= convert->header->n[BCF_DT_ID] ) continue; -+ kputs(convert->header->id[BCF_DT_ID][inf->key].key, str); -+ if ( inf->len <= 0 ) continue; -+ kputc('=', str); -+ if ( inf->len == 1 ) -+ { -+ switch (inf->type) -+ { -+ case BCF_BT_INT8: if ( inf->v1.i==bcf_int8_missing ) kputc('.', str); else kputw(inf->v1.i, str); break; -+ case BCF_BT_INT16: if ( inf->v1.i==bcf_int16_missing ) kputc('.', str); else kputw(inf->v1.i, str); break; -+ case BCF_BT_INT32: if ( inf->v1.i==bcf_int32_missing ) kputc('.', str); else kputw(inf->v1.i, str); break; -+ case BCF_BT_FLOAT: if ( bcf_float_is_missing(inf->v1.f) ) kputc('.', str); else kputd(inf->v1.f, str); break; -+ case BCF_BT_CHAR: kputc(inf->v1.i, str); break; -+ default: error("Unexpected type %d", inf->type); break; -+ } -+ } -+ else bcf_fmt_array(str, inf->len, inf->type, inf->vptr); -+ } -+ if ( first ) kputc('.', str); -+ return; -+ } -+ - if ( fmt->id<0 ) - { - kputc('.', str); - return; - } - -- int i; - for (i=0; in_info; i++) - if ( line->d.info[i].key == fmt->id ) break; - -@@ -276,6 +313,50 @@ - - fmt->ready = 1; - } -+static void process_complete_format(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) -+{ -+ if ( convert->nsamples ) -+ { -+ int i,j; -+ if ( line->n_fmt) -+ { -+ int gt_i = -1; -+ bcf_fmt_t *fmt = line->d.fmt; -+ int first = 1; -+ for (i=0; i<(int)line->n_fmt; i++) -+ { -+ if ( !fmt[i].p || fmt[i].id<0 ) continue; -+ if ( !first ) kputc(':', str); -+ first = 0; -+ kputs(convert->header->id[BCF_DT_ID][fmt[i].id].key, str); -+ if ( strcmp(convert->header->id[BCF_DT_ID][fmt[i].id].key, "GT") == 0) gt_i = i; -+ } -+ if ( first ) 
kputc('.', str); -+ for (j=0; jnsamples; j++) -+ { -+ kputc('\t', str); -+ first = 1; -+ for (i=0; i<(int)line->n_fmt; i++) -+ { -+ bcf_fmt_t *f = &fmt[i]; -+ if ( !f->p ) continue; -+ if ( !first ) kputc(':', str); -+ first = 0; -+ if (gt_i == i) -+ bcf_format_gt(f,convert->samples[j],str); -+ else -+ bcf_fmt_array(str, f->n, f->type, f->p + convert->samples[j] * f->size); -+ } -+ if ( first ) kputc('.', str); -+ } -+ } -+ else -+ for (j=0; j<=line->n_sample; j++) -+ kputs("\t.", str); -+ } -+ else -+ kputc('.',str); -+} - static void process_format(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) - { - if ( !fmt->ready ) -@@ -555,6 +636,7 @@ - if ( line_type & VCF_INDEL ) { if (i) kputc(',',str); kputs("INDEL", str); i++; } - if ( line_type & VCF_OTHER ) { if (i) kputc(',',str); kputs("OTHER", str); i++; } - if ( line_type & VCF_BND ) { if (i) kputc(',',str); kputs("BND", str); i++; } -+ if ( line_type & VCF_OVERLAP ) { if (i) kputc(',',str); kputs("OVERLAP", str); i++; } - } - static void process_line(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) - { -@@ -590,7 +672,7 @@ - // for (i=0; insamples; i++) kputs(" 0.33 0.33 0.33", str); - // return; - -- error("Error parsing GT tag at %s:%d\n", bcf_seqname(convert->header,line),line->pos+1); -+ error("Error parsing GT tag at %s:%"PRId64"\n", bcf_seqname(convert->header,line),(int64_t) line->pos+1); - } - - n /= convert->nsamples; -@@ -641,7 +723,7 @@ - // for (i=0; insamples; i++) kputs(" 0.33 0.33 0.33", str); - // return; - -- error("Error parsing PL tag at %s:%d\n", bcf_seqname(convert->header,line),line->pos+1); -+ error("Error parsing PL tag at %s:%"PRId64"\n", bcf_seqname(convert->header,line),(int64_t) line->pos+1); - } - - n /= convert->nsamples; -@@ -690,7 +772,7 @@ - // for (i=0; insamples; i++) kputs(" 0.33 0.33 0.33", str); - // return; - -- error("Error parsing GP tag at %s:%d\n", bcf_seqname(convert->header,line),line->pos+1); -+ error("Error 
parsing GP tag at %s:%"PRId64"\n", bcf_seqname(convert->header,line),(int64_t) line->pos+1); - } - - n /= convert->nsamples; -@@ -702,7 +784,7 @@ - { - if ( ptr[j]==bcf_int32_vector_end ) break; - if ( ptr[j]==bcf_int32_missing ) { ptr[j]=0; continue; } -- if ( ptr[j]<0 || ptr[j]>1 ) error("[%s:%d:%f] GP value outside range [0,1]; bcftools convert expects the VCF4.3+ spec for the GP field encoding genotype posterior probabilities", bcf_seqname(convert->header,line),line->pos+1,ptr[j]); -+ if ( ptr[j]<0 || ptr[j]>1 ) error("[%s:%"PRId64":%f] GP value outside range [0,1]; bcftools convert expects the VCF4.3+ spec for the GP field encoding genotype posterior probabilities", bcf_seqname(convert->header,line),(int64_t) line->pos+1,ptr[j]); - sum+=ptr[j]; - } - if ( j==line->n_allele ) -@@ -745,24 +827,24 @@ - - int i, gt_id = bcf_hdr_id2int(convert->header, BCF_DT_ID, "GT"); - if ( !bcf_hdr_idinfo_exists(convert->header,BCF_HL_FMT,gt_id) ) -- error("FORMAT/GT tag not present at %s:%d\n", bcf_seqname(convert->header, line), line->pos+1); -+ error("FORMAT/GT tag not present at %s:%"PRId64"\n", bcf_seqname(convert->header, line),(int64_t) line->pos+1); - if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT); - bcf_fmt_t *fmt_gt = NULL; - for (i=0; in_fmt; i++) - if ( line->d.fmt[i].id==gt_id ) { fmt_gt = &line->d.fmt[i]; break; } - if ( !fmt_gt ) -- error("FORMAT/GT tag not present at %s:%d\n", bcf_seqname(convert->header, line), line->pos+1); -+ error("FORMAT/GT tag not present at %s:%"PRId64"\n", bcf_seqname(convert->header, line),(int64_t) line->pos+1); - - // Alloc all memory in advance to avoid kput routines. 
The biggest allowed allele index is 99 - if ( line->n_allele > 100 ) -- error("Too many alleles (%d) at %s:%d\n", line->n_allele, bcf_seqname(convert->header, line), line->pos+1); -+ error("Too many alleles (%d) at %s:%"PRId64"\n", line->n_allele, bcf_seqname(convert->header, line),(int64_t) line->pos+1); - if ( ks_resize(str, str->l+convert->nsamples*8) != 0 ) -- error("Could not alloc %"PRIu64" bytes\n", (uint64_t)(str->l + convert->nsamples*8)); -+ error("Could not alloc %" PRIu64 " bytes\n", (uint64_t)(str->l + convert->nsamples*8)); - - if ( fmt_gt->type!=BCF_BT_INT8 ) // todo: use BRANCH_INT if the VCF is valid -- error("Uh, too many alleles (%d) or redundant BCF representation at %s:%d\n", line->n_allele, bcf_seqname(convert->header, line), line->pos+1); -+ error("Uh, too many alleles (%d) or redundant BCF representation at %s:%"PRId64"\n", line->n_allele, bcf_seqname(convert->header, line),(int64_t) line->pos+1); - if ( fmt_gt->n!=1 && fmt_gt->n!=2 ) -- error("Uh, ploidy of %d not supported, see %s:%d\n", fmt_gt->n, bcf_seqname(convert->header, line), line->pos+1); -+ error("Uh, ploidy of %d not supported, see %s:%"PRId64"\n", fmt_gt->n, bcf_seqname(convert->header, line),(int64_t) line->pos+1); - - int8_t *ptr = ((int8_t*) fmt_gt->p) - fmt_gt->n; - for (i=0; insamples; i++) -@@ -899,22 +981,22 @@ - - int i, gt_id = bcf_hdr_id2int(convert->header, BCF_DT_ID, "GT"); - if ( !bcf_hdr_idinfo_exists(convert->header,BCF_HL_FMT,gt_id) ) -- error("FORMAT/GT tag not present at %s:%d\n", bcf_seqname(convert->header, line), line->pos+1); -+ error("FORMAT/GT tag not present at %s:%"PRId64"\n", bcf_seqname(convert->header, line),(int64_t) line->pos+1); - if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT); - bcf_fmt_t *fmt_gt = NULL; - for (i=0; in_fmt; i++) - if ( line->d.fmt[i].id==gt_id ) { fmt_gt = &line->d.fmt[i]; break; } - if ( !fmt_gt ) -- error("FORMAT/GT tag not present at %s:%d\n", bcf_seqname(convert->header, line), line->pos+1); -+ 
error("FORMAT/GT tag not present at %s:%"PRId64"\n", bcf_seqname(convert->header, line),(int64_t) line->pos+1); - - // Alloc all memory in advance to avoid kput routines. The biggest allowed allele index is 99 - if ( line->n_allele > 100 ) -- error("Too many alleles (%d) at %s:%d\n", line->n_allele, bcf_seqname(convert->header, line), line->pos+1); -+ error("Too many alleles (%d) at %s:%"PRId64"\n", line->n_allele, bcf_seqname(convert->header, line),(int64_t) line->pos+1); - if ( ks_resize(str, str->l+convert->nsamples*8) != 0 ) -- error("Could not alloc %"PRIu64" bytes\n", (uint64_t)(str->l + convert->nsamples*8)); -+ error("Could not alloc %" PRIu64 " bytes\n", (uint64_t)(str->l + convert->nsamples*8)); - - if ( fmt_gt->type!=BCF_BT_INT8 ) // todo: use BRANCH_INT if the VCF is valid -- error("Uh, too many alleles (%d) or redundant BCF representation at %s:%d\n", line->n_allele, bcf_seqname(convert->header, line), line->pos+1); -+ error("Uh, too many alleles (%d) or redundant BCF representation at %s:%"PRId64"\n", line->n_allele, bcf_seqname(convert->header, line),(int64_t) line->pos+1); - - int8_t *ptr = ((int8_t*) fmt_gt->p) - fmt_gt->n; - for (i=0; insamples; i++) -@@ -1020,6 +1102,91 @@ - str->s[--str->l] = 0; // delete the last space - } - -+static void process_rsid_hex(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) -+{ -+ char *ptr = line->d.id; -+ ptr += 2; // remove 'rs' -+ ksprintf(str, "%08" PRIx32 "", (uint32_t)strtoul(ptr, NULL, 10)); -+} -+ -+static void process_variantkey_hex(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) -+{ -+ uint64_t vk = variantkey( -+ convert->header->id[BCF_DT_CTG][line->rid].key, -+ strlen(convert->header->id[BCF_DT_CTG][line->rid].key), -+ line->pos, -+ line->d.allele[0], -+ strlen(line->d.allele[0]), -+ line->d.allele[1], -+ strlen(line->d.allele[1])); -+ ksprintf(str, "%016" PRIx64 "", vk); -+} -+ -+static void process_pbinom(convert_t *convert, bcf1_t *line, fmt_t 
*fmt, int isample, kstring_t *str) -+{ -+ int i; -+ if ( !fmt->ready ) -+ { -+ fmt->fmt = NULL; // AD -+ fmt->usr = NULL; // GT -+ -+ for (i=0; i<(int)line->n_fmt; i++) -+ if ( line->d.fmt[i].id==fmt->id ) { fmt->fmt = &line->d.fmt[i]; break; } -+ -+ // Check that the first field is GT -+ int gt_id = bcf_hdr_id2int(convert->header, BCF_DT_ID, "GT"); -+ if ( !bcf_hdr_idinfo_exists(convert->header, BCF_HL_FMT, fmt->id) ) error("Error: FORMAT/GT is not defined in the header\n"); -+ for (i=0; i<(int)line->n_fmt; i++) -+ if ( line->d.fmt[i].id==gt_id ) { fmt->usr = &line->d.fmt[i]; break; } // it should always be first according to VCF spec, but... -+ -+ if ( fmt->usr && line->d.fmt[i].type!=BCF_BT_INT8 ) // skip sites with many alleles -+ fmt->usr = NULL; -+ -+ fmt->ready = 1; -+ } -+ bcf_fmt_t *gt_fmt = (bcf_fmt_t*) fmt->usr; -+ if ( !fmt->fmt || !gt_fmt || gt_fmt->n!=2 ) goto invalid; -+ -+ int n[2] = {0,0}; -+ int8_t *gt = (int8_t*)(gt_fmt->p + isample*gt_fmt->size); -+ for (i=0; i<2; i++) -+ { -+ if ( bcf_gt_is_missing(gt[i]) || gt[i] == bcf_int8_vector_end ) goto invalid; -+ int al = bcf_gt_allele(gt[i]); -+ if ( al > line->n_allele || al >= fmt->fmt->n ) goto invalid; -+ -+ #define BRANCH(type_t, missing, vector_end) { \ -+ type_t val = ((type_t *) fmt->fmt->p)[al + isample*fmt->fmt->n]; \ -+ if ( val==missing || val==vector_end ) goto invalid; \ -+ else n[i] = val; \ -+ } -+ switch (fmt->fmt->type) -+ { -+ case BCF_BT_INT8: BRANCH(int8_t, bcf_int8_missing, bcf_int8_vector_end); break; -+ case BCF_BT_INT16: BRANCH(int16_t, bcf_int16_missing, bcf_int16_vector_end); break; -+ case BCF_BT_INT32: BRANCH(int32_t, bcf_int32_missing, bcf_int32_vector_end); break; -+ default: goto invalid; break; -+ } -+ #undef BRANCH -+ } -+ -+ if ( n[0]==n[1] ) kputc(n[0]==0 ? '.':'0', str); -+ else -+ { -+ double pval = n[0] < n[1] ? 
kf_betai(n[1], n[0] + 1, 0.5) : kf_betai(n[0], n[1] + 1, 0.5); -+ pval *= 2; -+ assert( pval-1 < 1e-10 ); -+ if ( pval>=1 ) pval = 0; // this can happen, machine precision error, eg. kf_betai(1,0,0.5) -+ else -+ pval = -4.34294481903*log(pval); -+ kputd(pval, str); -+ } -+ return; -+ -+invalid: -+ kputc('.', str); -+} -+ - static fmt_t *register_tag(convert_t *convert, int type, char *key, int is_gtf) - { - convert->nfmt++; -@@ -1054,11 +1221,14 @@ - else if ( !strcmp("QUAL",key) ) { fmt->type = T_QUAL; } - else if ( !strcmp("FILTER",key) ) { fmt->type = T_FILTER; } - else if ( !strcmp("_CHROM_POS_ID",key) ) { fmt->type = T_CHROM_POS_ID; } -- else if ( id>=0 && bcf_hdr_idinfo_exists(convert->header,BCF_HL_INFO,id) ) -- { -- fmt->type = T_INFO; -- fprintf(stderr,"Warning: Assuming INFO/%s\n", key); -- } -+ else if ( !strcmp("RSX",key) ) { fmt->type = T_RSX; } -+ else if ( !strcmp("VKX",key) ) { fmt->type = T_VKX; } -+ else if ( id>=0 && bcf_hdr_idinfo_exists(convert->header,BCF_HL_INFO,id) ) { fmt->type = T_INFO; } -+ } -+ if ( fmt->type==T_PBINOM ) -+ { -+ fmt->id = bcf_hdr_id2int(convert->header, BCF_DT_ID, fmt->key); -+ if ( !bcf_hdr_idinfo_exists(convert->header,BCF_HL_FMT, fmt->id) ) error("No such FORMAT tag defined in the header: %s\n", fmt->key); - } - } - -@@ -1072,15 +1242,15 @@ - case T_CHROM: fmt->handler = &process_chrom; break; - case T_POS: fmt->handler = &process_pos; break; - case T_POS0: fmt->handler = &process_pos0; break; -- case T_END: fmt->handler = &process_end; break; -- case T_END0: fmt->handler = &process_end0; break; -+ case T_END: fmt->handler = &process_end; convert->max_unpack |= BCF_UN_INFO; break; -+ case T_END0: fmt->handler = &process_end0; convert->max_unpack |= BCF_UN_INFO; break; - case T_ID: fmt->handler = &process_id; break; - case T_REF: fmt->handler = &process_ref; break; - case T_ALT: fmt->handler = &process_alt; break; - case T_QUAL: fmt->handler = &process_qual; break; - case T_FILTER: fmt->handler = &process_filter; 
convert->max_unpack |= BCF_UN_FLT; break; - case T_INFO: fmt->handler = &process_info; convert->max_unpack |= BCF_UN_INFO; break; -- case T_FORMAT: fmt->handler = &process_format; convert->max_unpack |= BCF_UN_FMT; break; -+ case T_FORMAT: fmt->handler = fmt->key ? &process_format : &process_complete_format; convert->max_unpack |= BCF_UN_FMT; break; - case T_SAMPLE: fmt->handler = &process_sample; break; - case T_SEP: fmt->handler = &process_sep; break; - case T_IS_TS: fmt->handler = &process_is_ts; break; -@@ -1093,6 +1263,9 @@ - case T_GT_TO_HAP2: fmt->handler = &process_gt_to_hap2; convert->max_unpack |= BCF_UN_FMT; break; - case T_TBCSQ: fmt->handler = &process_tbcsq; fmt->destroy = &destroy_tbcsq; convert->max_unpack |= BCF_UN_FMT; break; - case T_LINE: fmt->handler = &process_line; convert->max_unpack |= BCF_UN_FMT; break; -+ case T_RSX: fmt->handler = &process_rsid_hex; break; -+ case T_VKX: fmt->handler = &process_variantkey_hex; break; -+ case T_PBINOM: fmt->handler = &process_pbinom; convert->max_unpack |= BCF_UN_FMT; break; - default: error("TODO: handler for type %d\n", fmt->type); - } - if ( key && fmt->type==T_INFO ) -@@ -1144,7 +1317,14 @@ - else if ( !strcmp(str.s, "IUPACGT") ) register_tag(convert, T_IUPAC_GT, "GT", is_gtf); - else if ( !strcmp(str.s, "INFO") ) - { -- if ( *q!='/' ) error("Could not parse format string: %s\n", convert->format_str); -+ if ( *q!='/' ) -+ { -+ int id = bcf_hdr_id2int(convert->header, BCF_DT_ID, str.s); -+ if ( bcf_hdr_idinfo_exists(convert->header,BCF_HL_INFO,id) ) -+ error("Could not parse format string \"%s\". 
Did you mean %%INFO/%s?\n", convert->format_str,str.s); -+ else -+ error("Could not parse format string: %s\n", convert->format_str); -+ } - p = ++q; - str.l = 0; - while ( *q && (isalnum(*q) || *q=='_' || *q=='.') ) q++; -@@ -1153,6 +1333,17 @@ - fmt_t *fmt = register_tag(convert, T_INFO, str.s, is_gtf); - fmt->subscript = parse_subscript(&q); - } -+ else if ( !strcmp(str.s,"PBINOM") ) -+ { -+ if ( *q!='(' ) error("Could not parse the expression: %s\n", convert->format_str); -+ p = ++q; -+ str.l = 0; -+ while ( *q && *q!=')' ) q++; -+ if ( q-p==0 ) error("Could not parse format string: %s\n", convert->format_str); -+ kputsn(p, q-p, &str); -+ register_tag(convert, T_PBINOM, str.s, is_gtf); -+ q++; -+ } - else - { - fmt_t *fmt = register_tag(convert, T_FORMAT, str.s, is_gtf); -@@ -1187,17 +1378,26 @@ - else if ( !strcmp(str.s, "_GP_TO_PROB3") ) register_tag(convert, T_GP_TO_PROB3, str.s, is_gtf); - else if ( !strcmp(str.s, "_GT_TO_HAP") ) register_tag(convert, T_GT_TO_HAP, str.s, is_gtf); - else if ( !strcmp(str.s, "_GT_TO_HAP2") ) register_tag(convert, T_GT_TO_HAP2, str.s, is_gtf); -+ else if ( !strcmp(str.s, "RSX") ) register_tag(convert, T_RSX, str.s, is_gtf); -+ else if ( !strcmp(str.s, "VKX") ) register_tag(convert, T_VKX, str.s, is_gtf); -+ else if ( !strcmp(str.s,"pbinom") ) error("Error: pbinom() is currently supported only with FORMAT tags. 
(todo)\n"); - else if ( !strcmp(str.s, "INFO") ) - { -- if ( *q!='/' ) error("Could not parse format string: %s\n", convert->format_str); -- p = ++q; -- str.l = 0; -- while ( *q && (isalnum(*q) || *q=='_' || *q=='.') ) q++; -- if ( q-p==0 ) error("Could not parse format string: %s\n", convert->format_str); -- kputsn(p, q-p, &str); -- fmt_t *fmt = register_tag(convert, T_INFO, str.s, is_gtf); -- fmt->subscript = parse_subscript(&q); -+ if ( *q=='/' ) -+ { -+ p = ++q; -+ str.l = 0; -+ while ( *q && (isalnum(*q) || *q=='_' || *q=='.') ) q++; -+ if ( q-p==0 ) error("Could not parse format string: %s\n", convert->format_str); -+ kputsn(p, q-p, &str); -+ fmt_t *fmt = register_tag(convert, T_INFO, str.s, is_gtf); -+ fmt->subscript = parse_subscript(&q); -+ } -+ else -+ register_tag(convert, T_INFO, NULL, is_gtf); // the whole INFO - } -+ else if ( !strcmp(str.s, "FORMAT") ) -+ register_tag(convert, T_FORMAT, NULL, 0); - else - { - fmt_t *fmt = register_tag(convert, T_INFO, str.s, is_gtf); -@@ -1336,7 +1536,15 @@ - int convert_line(convert_t *convert, bcf1_t *line, kstring_t *str) - { - if ( !convert->allow_undef_tags && convert->undef_info_tag ) -- error("Error: no such tag defined in the VCF header: INFO/%s. FORMAT fields must be in square brackets, e.g. \"[ %s]\"\n", convert->undef_info_tag,convert->undef_info_tag); -+ { -+ kstring_t msg = {0,0,0}; -+ ksprintf(&msg,"Error: no such tag defined in the VCF header: INFO/%s", convert->undef_info_tag); -+ -+ int hdr_id = bcf_hdr_id2int(convert->header,BCF_DT_ID,convert->undef_info_tag); -+ if ( hdr_id>=0 && bcf_hdr_idinfo_exists(convert->header,BCF_HL_FMT,hdr_id) ) -+ ksprintf(&msg,". FORMAT fields must be enclosed in square brackets, e.g. 
\"[ %%%s]\"", convert->undef_info_tag); -+ error("%s\n", msg.s); -+ } - - int l_ori = str->l; - bcf_unpack(line, convert->max_unpack); -@@ -1357,7 +1565,7 @@ - for (js=0; jsnsamples; js++) - { - // Skip samples when filtering was requested -- if ( *convert->subset_samples && !(*convert->subset_samples)[js] ) continue; -+ if ( convert->subset_samples && *convert->subset_samples && !(*convert->subset_samples)[js] ) continue; - - // Here comes a hack designed for TBCSQ. When running on large files, - // such as 1000GP, there are too many empty fields in the output and ---- python-pysam.orig/bcftools/convert.c.pysam.c -+++ python-pysam/bcftools/convert.c.pysam.c -@@ -32,12 +32,15 @@ - #include - #include - #include -+#define __STDC_FORMAT_MACROS - #include - #include - #include - #include - #include -+#include - #include "bcftools.h" -+#include "variantkey.h" - #include "convert.h" - - #define T_CHROM 1 -@@ -69,6 +72,9 @@ - #define T_END 27 - #define T_POS0 28 - #define T_END0 29 -+#define T_RSX 30 // RSID HEX -+#define T_VKX 31 // VARIANTKEY HEX -+#define T_PBINOM 32 - - typedef struct _fmt_t - { -@@ -198,13 +204,44 @@ - } - static void process_info(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) - { -+ int i; -+ if ( !fmt->key ) // the whole INFO column -+ { -+ int first = 1; -+ for (i=0; in_info; i++) -+ { -+ bcf_info_t *inf = &line->d.info[i]; -+ if ( !inf->vptr ) continue; -+ if ( !first ) kputc(';', str); -+ first = 0; -+ if ( inf->key >= convert->header->n[BCF_DT_ID] ) continue; -+ kputs(convert->header->id[BCF_DT_ID][inf->key].key, str); -+ if ( inf->len <= 0 ) continue; -+ kputc('=', str); -+ if ( inf->len == 1 ) -+ { -+ switch (inf->type) -+ { -+ case BCF_BT_INT8: if ( inf->v1.i==bcf_int8_missing ) kputc('.', str); else kputw(inf->v1.i, str); break; -+ case BCF_BT_INT16: if ( inf->v1.i==bcf_int16_missing ) kputc('.', str); else kputw(inf->v1.i, str); break; -+ case BCF_BT_INT32: if ( inf->v1.i==bcf_int32_missing ) kputc('.', str); 
else kputw(inf->v1.i, str); break; -+ case BCF_BT_FLOAT: if ( bcf_float_is_missing(inf->v1.f) ) kputc('.', str); else kputd(inf->v1.f, str); break; -+ case BCF_BT_CHAR: kputc(inf->v1.i, str); break; -+ default: error("Unexpected type %d", inf->type); break; -+ } -+ } -+ else bcf_fmt_array(str, inf->len, inf->type, inf->vptr); -+ } -+ if ( first ) kputc('.', str); -+ return; -+ } -+ - if ( fmt->id<0 ) - { - kputc('.', str); - return; - } - -- int i; - for (i=0; in_info; i++) - if ( line->d.info[i].key == fmt->id ) break; - -@@ -278,6 +315,50 @@ - - fmt->ready = 1; - } -+static void process_complete_format(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) -+{ -+ if ( convert->nsamples ) -+ { -+ int i,j; -+ if ( line->n_fmt) -+ { -+ int gt_i = -1; -+ bcf_fmt_t *fmt = line->d.fmt; -+ int first = 1; -+ for (i=0; i<(int)line->n_fmt; i++) -+ { -+ if ( !fmt[i].p || fmt[i].id<0 ) continue; -+ if ( !first ) kputc(':', str); -+ first = 0; -+ kputs(convert->header->id[BCF_DT_ID][fmt[i].id].key, str); -+ if ( strcmp(convert->header->id[BCF_DT_ID][fmt[i].id].key, "GT") == 0) gt_i = i; -+ } -+ if ( first ) kputc('.', str); -+ for (j=0; jnsamples; j++) -+ { -+ kputc('\t', str); -+ first = 1; -+ for (i=0; i<(int)line->n_fmt; i++) -+ { -+ bcf_fmt_t *f = &fmt[i]; -+ if ( !f->p ) continue; -+ if ( !first ) kputc(':', str); -+ first = 0; -+ if (gt_i == i) -+ bcf_format_gt(f,convert->samples[j],str); -+ else -+ bcf_fmt_array(str, f->n, f->type, f->p + convert->samples[j] * f->size); -+ } -+ if ( first ) kputc('.', str); -+ } -+ } -+ else -+ for (j=0; j<=line->n_sample; j++) -+ kputs("\t.", str); -+ } -+ else -+ kputc('.',str); -+} - static void process_format(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) - { - if ( !fmt->ready ) -@@ -557,6 +638,7 @@ - if ( line_type & VCF_INDEL ) { if (i) kputc(',',str); kputs("INDEL", str); i++; } - if ( line_type & VCF_OTHER ) { if (i) kputc(',',str); kputs("OTHER", str); i++; } - if ( line_type & 
VCF_BND ) { if (i) kputc(',',str); kputs("BND", str); i++; } -+ if ( line_type & VCF_OVERLAP ) { if (i) kputc(',',str); kputs("OVERLAP", str); i++; } - } - static void process_line(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) - { -@@ -592,7 +674,7 @@ - // for (i=0; insamples; i++) kputs(" 0.33 0.33 0.33", str); - // return; - -- error("Error parsing GT tag at %s:%d\n", bcf_seqname(convert->header,line),line->pos+1); -+ error("Error parsing GT tag at %s:%"PRId64"\n", bcf_seqname(convert->header,line),(int64_t) line->pos+1); - } - - n /= convert->nsamples; -@@ -643,7 +725,7 @@ - // for (i=0; insamples; i++) kputs(" 0.33 0.33 0.33", str); - // return; - -- error("Error parsing PL tag at %s:%d\n", bcf_seqname(convert->header,line),line->pos+1); -+ error("Error parsing PL tag at %s:%"PRId64"\n", bcf_seqname(convert->header,line),(int64_t) line->pos+1); - } - - n /= convert->nsamples; -@@ -692,7 +774,7 @@ - // for (i=0; insamples; i++) kputs(" 0.33 0.33 0.33", str); - // return; - -- error("Error parsing GP tag at %s:%d\n", bcf_seqname(convert->header,line),line->pos+1); -+ error("Error parsing GP tag at %s:%"PRId64"\n", bcf_seqname(convert->header,line),(int64_t) line->pos+1); - } - - n /= convert->nsamples; -@@ -704,7 +786,7 @@ - { - if ( ptr[j]==bcf_int32_vector_end ) break; - if ( ptr[j]==bcf_int32_missing ) { ptr[j]=0; continue; } -- if ( ptr[j]<0 || ptr[j]>1 ) error("[%s:%d:%f] GP value outside range [0,1]; bcftools convert expects the VCF4.3+ spec for the GP field encoding genotype posterior probabilities", bcf_seqname(convert->header,line),line->pos+1,ptr[j]); -+ if ( ptr[j]<0 || ptr[j]>1 ) error("[%s:%"PRId64":%f] GP value outside range [0,1]; bcftools convert expects the VCF4.3+ spec for the GP field encoding genotype posterior probabilities", bcf_seqname(convert->header,line),(int64_t) line->pos+1,ptr[j]); - sum+=ptr[j]; - } - if ( j==line->n_allele ) -@@ -747,24 +829,24 @@ - - int i, gt_id = bcf_hdr_id2int(convert->header, 
BCF_DT_ID, "GT"); - if ( !bcf_hdr_idinfo_exists(convert->header,BCF_HL_FMT,gt_id) ) -- error("FORMAT/GT tag not present at %s:%d\n", bcf_seqname(convert->header, line), line->pos+1); -+ error("FORMAT/GT tag not present at %s:%"PRId64"\n", bcf_seqname(convert->header, line),(int64_t) line->pos+1); - if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT); - bcf_fmt_t *fmt_gt = NULL; - for (i=0; in_fmt; i++) - if ( line->d.fmt[i].id==gt_id ) { fmt_gt = &line->d.fmt[i]; break; } - if ( !fmt_gt ) -- error("FORMAT/GT tag not present at %s:%d\n", bcf_seqname(convert->header, line), line->pos+1); -+ error("FORMAT/GT tag not present at %s:%"PRId64"\n", bcf_seqname(convert->header, line),(int64_t) line->pos+1); - - // Alloc all memory in advance to avoid kput routines. The biggest allowed allele index is 99 - if ( line->n_allele > 100 ) -- error("Too many alleles (%d) at %s:%d\n", line->n_allele, bcf_seqname(convert->header, line), line->pos+1); -+ error("Too many alleles (%d) at %s:%"PRId64"\n", line->n_allele, bcf_seqname(convert->header, line),(int64_t) line->pos+1); - if ( ks_resize(str, str->l+convert->nsamples*8) != 0 ) -- error("Could not alloc %"PRIu64" bytes\n", (uint64_t)(str->l + convert->nsamples*8)); -+ error("Could not alloc %" PRIu64 " bytes\n", (uint64_t)(str->l + convert->nsamples*8)); - - if ( fmt_gt->type!=BCF_BT_INT8 ) // todo: use BRANCH_INT if the VCF is valid -- error("Uh, too many alleles (%d) or redundant BCF representation at %s:%d\n", line->n_allele, bcf_seqname(convert->header, line), line->pos+1); -+ error("Uh, too many alleles (%d) or redundant BCF representation at %s:%"PRId64"\n", line->n_allele, bcf_seqname(convert->header, line),(int64_t) line->pos+1); - if ( fmt_gt->n!=1 && fmt_gt->n!=2 ) -- error("Uh, ploidy of %d not supported, see %s:%d\n", fmt_gt->n, bcf_seqname(convert->header, line), line->pos+1); -+ error("Uh, ploidy of %d not supported, see %s:%"PRId64"\n", fmt_gt->n, bcf_seqname(convert->header, line),(int64_t) 
line->pos+1); - - int8_t *ptr = ((int8_t*) fmt_gt->p) - fmt_gt->n; - for (i=0; insamples; i++) -@@ -901,22 +983,22 @@ - - int i, gt_id = bcf_hdr_id2int(convert->header, BCF_DT_ID, "GT"); - if ( !bcf_hdr_idinfo_exists(convert->header,BCF_HL_FMT,gt_id) ) -- error("FORMAT/GT tag not present at %s:%d\n", bcf_seqname(convert->header, line), line->pos+1); -+ error("FORMAT/GT tag not present at %s:%"PRId64"\n", bcf_seqname(convert->header, line),(int64_t) line->pos+1); - if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT); - bcf_fmt_t *fmt_gt = NULL; - for (i=0; in_fmt; i++) - if ( line->d.fmt[i].id==gt_id ) { fmt_gt = &line->d.fmt[i]; break; } - if ( !fmt_gt ) -- error("FORMAT/GT tag not present at %s:%d\n", bcf_seqname(convert->header, line), line->pos+1); -+ error("FORMAT/GT tag not present at %s:%"PRId64"\n", bcf_seqname(convert->header, line),(int64_t) line->pos+1); - - // Alloc all memory in advance to avoid kput routines. The biggest allowed allele index is 99 - if ( line->n_allele > 100 ) -- error("Too many alleles (%d) at %s:%d\n", line->n_allele, bcf_seqname(convert->header, line), line->pos+1); -+ error("Too many alleles (%d) at %s:%"PRId64"\n", line->n_allele, bcf_seqname(convert->header, line),(int64_t) line->pos+1); - if ( ks_resize(str, str->l+convert->nsamples*8) != 0 ) -- error("Could not alloc %"PRIu64" bytes\n", (uint64_t)(str->l + convert->nsamples*8)); -+ error("Could not alloc %" PRIu64 " bytes\n", (uint64_t)(str->l + convert->nsamples*8)); - - if ( fmt_gt->type!=BCF_BT_INT8 ) // todo: use BRANCH_INT if the VCF is valid -- error("Uh, too many alleles (%d) or redundant BCF representation at %s:%d\n", line->n_allele, bcf_seqname(convert->header, line), line->pos+1); -+ error("Uh, too many alleles (%d) or redundant BCF representation at %s:%"PRId64"\n", line->n_allele, bcf_seqname(convert->header, line),(int64_t) line->pos+1); - - int8_t *ptr = ((int8_t*) fmt_gt->p) - fmt_gt->n; - for (i=0; insamples; i++) -@@ -1022,6 +1104,91 @@ - 
str->s[--str->l] = 0; // delete the last space - } - -+static void process_rsid_hex(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) -+{ -+ char *ptr = line->d.id; -+ ptr += 2; // remove 'rs' -+ ksprintf(str, "%08" PRIx32 "", (uint32_t)strtoul(ptr, NULL, 10)); -+} -+ -+static void process_variantkey_hex(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) -+{ -+ uint64_t vk = variantkey( -+ convert->header->id[BCF_DT_CTG][line->rid].key, -+ strlen(convert->header->id[BCF_DT_CTG][line->rid].key), -+ line->pos, -+ line->d.allele[0], -+ strlen(line->d.allele[0]), -+ line->d.allele[1], -+ strlen(line->d.allele[1])); -+ ksprintf(str, "%016" PRIx64 "", vk); -+} -+ -+static void process_pbinom(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) -+{ -+ int i; -+ if ( !fmt->ready ) -+ { -+ fmt->fmt = NULL; // AD -+ fmt->usr = NULL; // GT -+ -+ for (i=0; i<(int)line->n_fmt; i++) -+ if ( line->d.fmt[i].id==fmt->id ) { fmt->fmt = &line->d.fmt[i]; break; } -+ -+ // Check that the first field is GT -+ int gt_id = bcf_hdr_id2int(convert->header, BCF_DT_ID, "GT"); -+ if ( !bcf_hdr_idinfo_exists(convert->header, BCF_HL_FMT, fmt->id) ) error("Error: FORMAT/GT is not defined in the header\n"); -+ for (i=0; i<(int)line->n_fmt; i++) -+ if ( line->d.fmt[i].id==gt_id ) { fmt->usr = &line->d.fmt[i]; break; } // it should always be first according to VCF spec, but... 
-+ -+ if ( fmt->usr && line->d.fmt[i].type!=BCF_BT_INT8 ) // skip sites with many alleles -+ fmt->usr = NULL; -+ -+ fmt->ready = 1; -+ } -+ bcf_fmt_t *gt_fmt = (bcf_fmt_t*) fmt->usr; -+ if ( !fmt->fmt || !gt_fmt || gt_fmt->n!=2 ) goto invalid; -+ -+ int n[2] = {0,0}; -+ int8_t *gt = (int8_t*)(gt_fmt->p + isample*gt_fmt->size); -+ for (i=0; i<2; i++) -+ { -+ if ( bcf_gt_is_missing(gt[i]) || gt[i] == bcf_int8_vector_end ) goto invalid; -+ int al = bcf_gt_allele(gt[i]); -+ if ( al > line->n_allele || al >= fmt->fmt->n ) goto invalid; -+ -+ #define BRANCH(type_t, missing, vector_end) { \ -+ type_t val = ((type_t *) fmt->fmt->p)[al + isample*fmt->fmt->n]; \ -+ if ( val==missing || val==vector_end ) goto invalid; \ -+ else n[i] = val; \ -+ } -+ switch (fmt->fmt->type) -+ { -+ case BCF_BT_INT8: BRANCH(int8_t, bcf_int8_missing, bcf_int8_vector_end); break; -+ case BCF_BT_INT16: BRANCH(int16_t, bcf_int16_missing, bcf_int16_vector_end); break; -+ case BCF_BT_INT32: BRANCH(int32_t, bcf_int32_missing, bcf_int32_vector_end); break; -+ default: goto invalid; break; -+ } -+ #undef BRANCH -+ } -+ -+ if ( n[0]==n[1] ) kputc(n[0]==0 ? '.':'0', str); -+ else -+ { -+ double pval = n[0] < n[1] ? kf_betai(n[1], n[0] + 1, 0.5) : kf_betai(n[0], n[1] + 1, 0.5); -+ pval *= 2; -+ assert( pval-1 < 1e-10 ); -+ if ( pval>=1 ) pval = 0; // this can happen, machine precision error, eg. 
kf_betai(1,0,0.5) -+ else -+ pval = -4.34294481903*log(pval); -+ kputd(pval, str); -+ } -+ return; -+ -+invalid: -+ kputc('.', str); -+} -+ - static fmt_t *register_tag(convert_t *convert, int type, char *key, int is_gtf) - { - convert->nfmt++; -@@ -1056,11 +1223,14 @@ - else if ( !strcmp("QUAL",key) ) { fmt->type = T_QUAL; } - else if ( !strcmp("FILTER",key) ) { fmt->type = T_FILTER; } - else if ( !strcmp("_CHROM_POS_ID",key) ) { fmt->type = T_CHROM_POS_ID; } -- else if ( id>=0 && bcf_hdr_idinfo_exists(convert->header,BCF_HL_INFO,id) ) -- { -- fmt->type = T_INFO; -- fprintf(bcftools_stderr,"Warning: Assuming INFO/%s\n", key); -- } -+ else if ( !strcmp("RSX",key) ) { fmt->type = T_RSX; } -+ else if ( !strcmp("VKX",key) ) { fmt->type = T_VKX; } -+ else if ( id>=0 && bcf_hdr_idinfo_exists(convert->header,BCF_HL_INFO,id) ) { fmt->type = T_INFO; } -+ } -+ if ( fmt->type==T_PBINOM ) -+ { -+ fmt->id = bcf_hdr_id2int(convert->header, BCF_DT_ID, fmt->key); -+ if ( !bcf_hdr_idinfo_exists(convert->header,BCF_HL_FMT, fmt->id) ) error("No such FORMAT tag defined in the header: %s\n", fmt->key); - } - } - -@@ -1074,15 +1244,15 @@ - case T_CHROM: fmt->handler = &process_chrom; break; - case T_POS: fmt->handler = &process_pos; break; - case T_POS0: fmt->handler = &process_pos0; break; -- case T_END: fmt->handler = &process_end; break; -- case T_END0: fmt->handler = &process_end0; break; -+ case T_END: fmt->handler = &process_end; convert->max_unpack |= BCF_UN_INFO; break; -+ case T_END0: fmt->handler = &process_end0; convert->max_unpack |= BCF_UN_INFO; break; - case T_ID: fmt->handler = &process_id; break; - case T_REF: fmt->handler = &process_ref; break; - case T_ALT: fmt->handler = &process_alt; break; - case T_QUAL: fmt->handler = &process_qual; break; - case T_FILTER: fmt->handler = &process_filter; convert->max_unpack |= BCF_UN_FLT; break; - case T_INFO: fmt->handler = &process_info; convert->max_unpack |= BCF_UN_INFO; break; -- case T_FORMAT: fmt->handler = &process_format; 
convert->max_unpack |= BCF_UN_FMT; break; -+ case T_FORMAT: fmt->handler = fmt->key ? &process_format : &process_complete_format; convert->max_unpack |= BCF_UN_FMT; break; - case T_SAMPLE: fmt->handler = &process_sample; break; - case T_SEP: fmt->handler = &process_sep; break; - case T_IS_TS: fmt->handler = &process_is_ts; break; -@@ -1095,6 +1265,9 @@ - case T_GT_TO_HAP2: fmt->handler = &process_gt_to_hap2; convert->max_unpack |= BCF_UN_FMT; break; - case T_TBCSQ: fmt->handler = &process_tbcsq; fmt->destroy = &destroy_tbcsq; convert->max_unpack |= BCF_UN_FMT; break; - case T_LINE: fmt->handler = &process_line; convert->max_unpack |= BCF_UN_FMT; break; -+ case T_RSX: fmt->handler = &process_rsid_hex; break; -+ case T_VKX: fmt->handler = &process_variantkey_hex; break; -+ case T_PBINOM: fmt->handler = &process_pbinom; convert->max_unpack |= BCF_UN_FMT; break; - default: error("TODO: handler for type %d\n", fmt->type); - } - if ( key && fmt->type==T_INFO ) -@@ -1146,7 +1319,14 @@ - else if ( !strcmp(str.s, "IUPACGT") ) register_tag(convert, T_IUPAC_GT, "GT", is_gtf); - else if ( !strcmp(str.s, "INFO") ) - { -- if ( *q!='/' ) error("Could not parse format string: %s\n", convert->format_str); -+ if ( *q!='/' ) -+ { -+ int id = bcf_hdr_id2int(convert->header, BCF_DT_ID, str.s); -+ if ( bcf_hdr_idinfo_exists(convert->header,BCF_HL_INFO,id) ) -+ error("Could not parse format string \"%s\". 
Did you mean %%INFO/%s?\n", convert->format_str,str.s); -+ else -+ error("Could not parse format string: %s\n", convert->format_str); -+ } - p = ++q; - str.l = 0; - while ( *q && (isalnum(*q) || *q=='_' || *q=='.') ) q++; -@@ -1155,6 +1335,17 @@ - fmt_t *fmt = register_tag(convert, T_INFO, str.s, is_gtf); - fmt->subscript = parse_subscript(&q); - } -+ else if ( !strcmp(str.s,"PBINOM") ) -+ { -+ if ( *q!='(' ) error("Could not parse the expression: %s\n", convert->format_str); -+ p = ++q; -+ str.l = 0; -+ while ( *q && *q!=')' ) q++; -+ if ( q-p==0 ) error("Could not parse format string: %s\n", convert->format_str); -+ kputsn(p, q-p, &str); -+ register_tag(convert, T_PBINOM, str.s, is_gtf); -+ q++; -+ } - else - { - fmt_t *fmt = register_tag(convert, T_FORMAT, str.s, is_gtf); -@@ -1189,17 +1380,26 @@ - else if ( !strcmp(str.s, "_GP_TO_PROB3") ) register_tag(convert, T_GP_TO_PROB3, str.s, is_gtf); - else if ( !strcmp(str.s, "_GT_TO_HAP") ) register_tag(convert, T_GT_TO_HAP, str.s, is_gtf); - else if ( !strcmp(str.s, "_GT_TO_HAP2") ) register_tag(convert, T_GT_TO_HAP2, str.s, is_gtf); -+ else if ( !strcmp(str.s, "RSX") ) register_tag(convert, T_RSX, str.s, is_gtf); -+ else if ( !strcmp(str.s, "VKX") ) register_tag(convert, T_VKX, str.s, is_gtf); -+ else if ( !strcmp(str.s,"pbinom") ) error("Error: pbinom() is currently supported only with FORMAT tags. 
(todo)\n"); - else if ( !strcmp(str.s, "INFO") ) - { -- if ( *q!='/' ) error("Could not parse format string: %s\n", convert->format_str); -- p = ++q; -- str.l = 0; -- while ( *q && (isalnum(*q) || *q=='_' || *q=='.') ) q++; -- if ( q-p==0 ) error("Could not parse format string: %s\n", convert->format_str); -- kputsn(p, q-p, &str); -- fmt_t *fmt = register_tag(convert, T_INFO, str.s, is_gtf); -- fmt->subscript = parse_subscript(&q); -+ if ( *q=='/' ) -+ { -+ p = ++q; -+ str.l = 0; -+ while ( *q && (isalnum(*q) || *q=='_' || *q=='.') ) q++; -+ if ( q-p==0 ) error("Could not parse format string: %s\n", convert->format_str); -+ kputsn(p, q-p, &str); -+ fmt_t *fmt = register_tag(convert, T_INFO, str.s, is_gtf); -+ fmt->subscript = parse_subscript(&q); -+ } -+ else -+ register_tag(convert, T_INFO, NULL, is_gtf); // the whole INFO - } -+ else if ( !strcmp(str.s, "FORMAT") ) -+ register_tag(convert, T_FORMAT, NULL, 0); - else - { - fmt_t *fmt = register_tag(convert, T_INFO, str.s, is_gtf); -@@ -1338,7 +1538,15 @@ - int convert_line(convert_t *convert, bcf1_t *line, kstring_t *str) - { - if ( !convert->allow_undef_tags && convert->undef_info_tag ) -- error("Error: no such tag defined in the VCF header: INFO/%s. FORMAT fields must be in square brackets, e.g. \"[ %s]\"\n", convert->undef_info_tag,convert->undef_info_tag); -+ { -+ kstring_t msg = {0,0,0}; -+ ksprintf(&msg,"Error: no such tag defined in the VCF header: INFO/%s", convert->undef_info_tag); -+ -+ int hdr_id = bcf_hdr_id2int(convert->header,BCF_DT_ID,convert->undef_info_tag); -+ if ( hdr_id>=0 && bcf_hdr_idinfo_exists(convert->header,BCF_HL_FMT,hdr_id) ) -+ ksprintf(&msg,". FORMAT fields must be enclosed in square brackets, e.g. 
\"[ %%%s]\"", convert->undef_info_tag); -+ error("%s\n", msg.s); -+ } - - int l_ori = str->l; - bcf_unpack(line, convert->max_unpack); -@@ -1359,7 +1567,7 @@ - for (js=0; jsnsamples; js++) - { - // Skip samples when filtering was requested -- if ( *convert->subset_samples && !(*convert->subset_samples)[js] ) continue; -+ if ( convert->subset_samples && *convert->subset_samples && !(*convert->subset_samples)[js] ) continue; - - // Here comes a hack designed for TBCSQ. When running on large files, - // such as 1000GP, there are too many empty fields in the output and ---- python-pysam.orig/bcftools/csq.c -+++ python-pysam/bcftools/csq.c -@@ -1,3 +1,6 @@ -+//$bt csq -f $ref -g $gff -p r -Ou -o /dev/null /lustre/scratch116/vr/projects/g1k/phase3/release/ALL.chr4.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz -+ -+ - /* The MIT License - - Copyright (c) 2016-2018 Genome Research Ltd. -@@ -25,6 +28,7 @@ - */ - /* - Things that would be nice to have -+ - dynamic N_REF_PAD - - for stop-lost events (also in frameshifts) report the number of truncated aa's - - memory could be greatly reduced by indexing gff (but it is quite compact already) - - deletions that go beyond transcript boundaries are not checked at sequence level -@@ -95,6 +99,7 @@ - splice_region_variant .. change within 1-3 bases of the exon or 3-8 bases of the intron - synonymous_variant .. DNA sequence variant resulting in no amino acid change - stop_retained_variant .. different stop codon -+ start_retained_variant .. start codon retained by indel realignment - non_coding_variant .. 
variant in non-coding sequence, such as RNA gene - 5_prime_UTR_variant - 3_prime_UTR_variant -@@ -133,6 +138,7 @@ - #include - #include - #include -+#include - #include - #include - #include -@@ -142,7 +148,6 @@ - #include - #include - #include --#include - #include - #include "bcftools.h" - #include "filter.h" -@@ -208,13 +213,15 @@ - #define CSQ_UPSTREAM_STOP (1<<19) // adds * in front of the csq string - #define CSQ_INCOMPLETE_CDS (1<<20) // to remove START/STOP in incomplete CDS, see ENSG00000173376/synon.vcf - #define CSQ_CODING_SEQUENCE (1<<21) // cannot tell exactly what it is, but it does affect the coding sequence -+#define CSQ_ELONGATION (1<<22) // symbolic insertion -+#define CSQ_START_RETAINED (1<<23) - - // Haplotype-aware consequences, printed in one vcf record only, the rest has a reference @12345 - #define CSQ_COMPOUND (CSQ_SYNONYMOUS_VARIANT|CSQ_MISSENSE_VARIANT|CSQ_STOP_LOST|CSQ_STOP_GAINED| \ - CSQ_INFRAME_DELETION|CSQ_INFRAME_INSERTION|CSQ_FRAMESHIFT_VARIANT| \ - CSQ_START_LOST|CSQ_STOP_RETAINED|CSQ_INFRAME_ALTERING|CSQ_INCOMPLETE_CDS| \ -- CSQ_UPSTREAM_STOP) --#define CSQ_START_STOP (CSQ_STOP_LOST|CSQ_STOP_GAINED|CSQ_STOP_RETAINED|CSQ_START_LOST) -+ CSQ_UPSTREAM_STOP|CSQ_START_RETAINED) -+#define CSQ_START_STOP (CSQ_STOP_LOST|CSQ_STOP_GAINED|CSQ_STOP_RETAINED|CSQ_START_LOST|CSQ_START_RETAINED) - - #define CSQ_PRN_STRAND(csq) ((csq)&CSQ_COMPOUND && !((csq)&(CSQ_SPLICE_ACCEPTOR|CSQ_SPLICE_DONOR|CSQ_SPLICE_REGION))) - #define CSQ_PRN_TSCRIPT (~(CSQ_INTRON|CSQ_NON_CODING)) -@@ -244,7 +251,9 @@ - "inframe_altering", - NULL, - NULL, -- "coding_sequence" -+ "coding_sequence", -+ "feature_elongation", -+ "start_retained" - }; - - -@@ -339,7 +348,7 @@ - typedef struct - { - char *name; // human readable name, e.g. 
ORF45 -- uint8_t iseq; -+ uint32_t iseq; - } - gf_gene_t; - typedef struct -@@ -392,7 +401,8 @@ - { - bcf1_t *line; - uint32_t *smpl; // bitmask of sample consequences with first/second haplotype interleaved -- uint32_t nfmt:4, nvcsq:28, mvcsq; -+ uint32_t nfmt:4, // the bitmask size (the number of integers per sample) -+ nvcsq:28, mvcsq; - vcsq_t *vcsq; // there can be multiple consequences for a single VCF record - } - vrec_t; -@@ -408,6 +418,7 @@ - { - vrec_t **vrec; // buffer of VCF lines with the same position - int n, m; -+ uint32_t keep_until; // the maximum transcript end position - }; - KHASH_MAP_INIT_INT(pos2vbuf, vbuf_t*) - -@@ -580,9 +591,10 @@ - char *outdir, **argv, *fa_fname, *gff_fname, *output_fname; - char *bcsq_tag; - int argc, output_type; -- int phase, quiet, local_csq; -+ int phase, verbosity, local_csq, record_cmd_line; - int ncsq_max, nfmt_bcsq; // maximum number of csq per site that can be accessed from FORMAT/BCSQ - int ncsq_small_warned; -+ int brief_predictions; - - int rid; // current chromosome - tr_heap_t *active_tr; // heap of active transcripts for quick flushing -@@ -596,6 +608,7 @@ - int ncsq_buf, mcsq_buf; - id_tbl_t tscript_ids; // mapping between transcript id (eg. Zm00001d027245_T001) and a numeric idx - int force; // force run under various conditions. 
Currently only to skip out-of-phase transcripts -+ int n_threads; // extra compression/decompression threads - - faidx_t *fai; - kstring_t str, str2; -@@ -671,7 +684,7 @@ - aux->seq[aux->nseq] = strdup(chr_beg); - iseq = khash_str2int_inc(aux->seq2int, aux->seq[aux->nseq]); - aux->nseq++; -- assert( aux->nseq < 256 ); // see gf_gene_t.iseq -+ assert( aux->nseq < 1<<29 ); // see gf_gene_t.iseq and ftr_t.iseq - } - chr_end[1] = c; - return iseq; -@@ -886,7 +899,7 @@ - int biotype = gff_parse_biotype(ss); - if ( biotype <= 0 ) - { -- if ( !gff_ignored_biotype(args, ss) && args->quiet<2 ) fprintf(stderr,"ignored transcript: %s\n",line); -+ if ( !gff_ignored_biotype(args, ss) && args->verbosity > 0 ) fprintf(stderr,"ignored transcript: %s\n",line); - return; - } - -@@ -912,7 +925,7 @@ - int biotype = gff_parse_biotype(ss); - if ( biotype <= 0 ) - { -- if ( !gff_ignored_biotype(args, ss) && args->quiet<2 ) fprintf(stderr,"ignored gene: %s\n",line); -+ if ( !gff_ignored_biotype(args, ss) && args->verbosity > 0 ) fprintf(stderr,"ignored gene: %s\n",line); - return; - } - -@@ -978,7 +991,7 @@ - if ( !ss ) return -1; // no ID, ignore the line - if ( !strncmp("chromosome",ss+3,10) ) return -1; - if ( !strncmp("supercontig",ss+3,11) ) return -1; -- if ( args->quiet<2 ) fprintf(stderr,"ignored: %s\n", line); -+ if ( args->verbosity > 0 ) fprintf(stderr,"ignored: %s\n", line); - return -1; - } - -@@ -1000,7 +1013,7 @@ - // 7. column: strand - if ( *ss == '+' ) ftr->strand = STRAND_FWD; - else if ( *ss == '-' ) ftr->strand = STRAND_REV; -- else { if ( args->quiet<2 ) fprintf(stderr,"Skipping unknown strand: %c\n", *ss); return -1; } -+ else { if ( args->verbosity > 0 ) fprintf(stderr,"Skipping unknown strand: %c\n", *ss); return -1; } - ss += 2; - - // 8. column: phase (codon offset) -@@ -1008,7 +1021,7 @@ - else if ( *ss == '1' ) ftr->phase = 1; - else if ( *ss == '2' ) ftr->phase = 2; - else if ( *ss == '.' 
) ftr->phase = 0; // exons do not have phase -- else { if ( args->quiet<2 ) fprintf(stderr,"Skipping unknown phase: %c, %s\n", *ss, line); return -1; } -+ else { if ( args->verbosity > 0 ) fprintf(stderr,"Skipping unknown phase: %c, %s\n", *ss, line); return -1; } - ss += 2; - - // substring search for "Parent=transcript:ENST00000437963" -@@ -1122,7 +1135,7 @@ - { - if ( args->force ) - { -- if ( args->quiet < 2 ) -+ if ( args->verbosity > 0 ) - fprintf(stderr,"Warning: GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d)\n",args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len); - tscript_ok = 0; - break; -@@ -1160,7 +1173,7 @@ - { - if ( args->force ) - { -- if ( args->quiet < 2 ) -+ if ( args->verbosity > 0 ) - fprintf(stderr,"Warning: GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d)\n",args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len); - tscript_ok = 0; - break; -@@ -1293,7 +1306,7 @@ - } - tscript_init_cds(args); - -- if ( !args->quiet ) -+ if ( args->verbosity > 0 ) - { - fprintf(stderr,"Indexed %d transcripts, %d exons, %d CDSs, %d UTRs\n", - regidx_nregs(args->idx_tscript), -@@ -1309,14 +1322,16 @@ - free(aux->seq); - gff_id_destroy(&aux->gene_ids); - -- if ( args->quiet<2 && khash_str2int_size(aux->ignored_biotypes) ) -+ if ( args->verbosity > 0 && khash_str2int_size(aux->ignored_biotypes) ) - { - khash_t(str2int) *ign = (khash_t(str2int)*)aux->ignored_biotypes; - fprintf(stderr,"Ignored the following biotypes:\n"); - for (i = kh_begin(ign); i < kh_end(ign); i++) - { - if ( !kh_exist(ign,i)) continue; -- fprintf(stderr,"\t%dx\t.. %s\n", kh_value(ign,i), kh_key(ign,i)); -+ const char *biotype = kh_key(ign,i); -+ if ( !strcmp(biotype,"TCE") ) biotype = "TCE (\"To be Experimentally Confirmed\")"; -+ fprintf(stderr,"\t%dx\t.. 
%s\n", kh_value(ign,i), biotype); - } - } - khash_str2int_destroy_free(aux->ignored_biotypes); -@@ -1326,7 +1341,7 @@ - { - args->nfmt_bcsq = 1 + (args->ncsq_max - 1) / 32; - -- if ( !args->quiet ) fprintf(stderr,"Parsing %s ...\n", args->gff_fname); -+ if ( args->verbosity > 0 ) fprintf(stderr,"Parsing %s ...\n", args->gff_fname); - init_gff(args); - - args->rid = -1; -@@ -1349,7 +1364,8 @@ - if ( args->output_type==FT_TAB_TEXT ) - { - // significant speedup for plain VCFs -- bcf_hdr_set_samples(args->hdr,NULL,0); -+ if (bcf_hdr_set_samples(args->hdr,NULL,0) < 0) -+ error_errno("[%s] Couldn't build sample filter", __func__); - } - args->phase = PHASE_DROP_GT; - } -@@ -1360,7 +1376,7 @@ - if ( args->output_type==FT_TAB_TEXT ) - { - args->out = args->output_fname ? fopen(args->output_fname,"w") : stdout; -- if ( !args->out ) error("Failed to open %s: %s\n", args->output_fname,strerror(errno)); -+ if ( !args->out ) error("Failed to write to %s: %s\n", !strcmp("-",args->output_fname)?"standard output":args->output_fname,strerror(errno)); - - fprintf(args->out,"# This file was produced by: bcftools +csq(%s+htslib-%s)\n", bcftools_version(),hts_version()); - fprintf(args->out,"# The command line was:\tbcftools +%s", args->argv[0]); -@@ -1380,14 +1396,16 @@ - else - { - args->out_fh = hts_open(args->output_fname? args->output_fname : "-",hts_bcf_wmode(args->output_type)); -- if ( args->out_fh == NULL ) error("Can't write to %s: %s\n", args->output_fname? args->output_fname : "standard output", strerror(errno)); -- bcf_hdr_append_version(args->hdr,args->argc,args->argv,"bcftools/csq"); -- bcf_hdr_printf(args->hdr,"##INFO=",args->bcsq_tag, args->local_csq ? "Local" : "Haplotype-aware"); -+ if ( args->out_fh == NULL ) error("[%s] Error: cannot write to %s: %s\n", __func__,args->output_fname? 
args->output_fname : "standard output", strerror(errno)); -+ if ( args->n_threads > 0) -+ hts_set_opt(args->out_fh, HTS_OPT_THREAD_POOL, args->sr->p); -+ if ( args->record_cmd_line ) bcf_hdr_append_version(args->hdr,args->argc,args->argv,"bcftools/csq"); -+ bcf_hdr_printf(args->hdr,"##INFO=",args->bcsq_tag, args->local_csq ? "Local" : "Haplotype-aware"); - if ( args->hdr_nsmpl ) - bcf_hdr_printf(args->hdr,"##FORMAT=",args->bcsq_tag); -- bcf_hdr_write(args->out_fh, args->hdr); -+ if ( bcf_hdr_write(args->out_fh, args->hdr)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->output_fname?args->output_fname:"standard output"); - } -- if ( !args->quiet ) fprintf(stderr,"Calling...\n"); -+ if ( args->verbosity > 0 ) fprintf(stderr,"Calling...\n"); - } - - void destroy_data(args_t *args) -@@ -1487,6 +1505,7 @@ - splice->vcf.pos = rec->pos; - splice->vcf.rlen = rec->rlen; - splice->vcf.ref = rec->d.allele[0]; -+ splice->csq = 0; - } - static inline void splice_build_hap(splice_t *splice, uint32_t beg, int len) - { -@@ -1594,7 +1613,7 @@ - #endif - } - void csq_stage(args_t *args, csq_t *csq, bcf1_t *rec); --static inline int csq_stage_utr(args_t *args, regitr_t *itr, bcf1_t *rec, uint32_t trid) -+static inline int csq_stage_utr(args_t *args, regitr_t *itr, bcf1_t *rec, uint32_t trid, uint32_t type) - { - while ( regitr_overlap(itr) ) - { -@@ -1604,7 +1623,7 @@ - csq_t csq; - memset(&csq, 0, sizeof(csq_t)); - csq.pos = rec->pos; -- csq.type.type = utr->which==prime5 ? CSQ_UTR5 : CSQ_UTR3; -+ csq.type.type = (utr->which==prime5 ? 
CSQ_UTR5 : CSQ_UTR3) | type; - csq.type.biotype = tr->type; - csq.type.strand = tr->strand; - csq.type.trid = tr->id; -@@ -1658,7 +1677,7 @@ - const char *chr = bcf_seqname(args->hdr,splice->vcf.rec); - if ( regidx_overlap(args->idx_utr,chr,splice->ref_beg+1,splice->ref_beg+1, itr) ) // adjacent utr - { -- ret = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id); -+ ret = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq); - if ( ret!=0 ) - { - regitr_destroy(itr); -@@ -1696,7 +1715,7 @@ - const char *chr = bcf_seqname(args->hdr,splice->vcf.rec); - if ( regidx_overlap(args->idx_utr,chr,splice->ref_end-1,splice->ref_end-1, itr) ) // adjacent utr - { -- ret = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id); -+ ret = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq); - if ( ret!=0 ) - { - regitr_destroy(itr); -@@ -1763,14 +1782,105 @@ - return SPLICE_INSIDE; - } - -+int shifted_del_synonymous(args_t *args, splice_t *splice, uint32_t ex_beg, uint32_t ex_end) -+{ -+ static int small_ref_padding_warned = 0; -+ tscript_t *tr = splice->tr; -+ -+ // We know the VCF record overlaps the exon, but does it overlap the start codon? -+ if ( tr->strand==STRAND_REV && splice->vcf.pos + splice->vcf.rlen + 2 <= ex_end ) return 0; -+ if ( tr->strand==STRAND_FWD && splice->vcf.pos >= ex_beg + 3 ) return 0; -+ -+#if XDBG -+ fprintf(stderr,"shifted_del_synonymous: %d-%d %s\n",ex_beg,ex_end, tr->strand==STRAND_FWD?"fwd":"rev"); -+ fprintf(stderr," %d .. %s > %s\n",splice->vcf.pos+1,splice->vcf.ref,splice->vcf.alt); -+#endif -+ -+ // is there enough ref sequence for the extension? 
All coordinates are 0-based -+ int ref_len = strlen(splice->vcf.ref); -+ int alt_len = strlen(splice->vcf.alt); -+ assert( ref_len > alt_len ); -+ int ndel = ref_len - alt_len; -+ -+ if ( tr->strand==STRAND_REV ) -+ { -+ int32_t vcf_ref_end = splice->vcf.pos + ref_len - 1; // end pos of the VCF REF allele -+ int32_t tr_ref_end = splice->tr->end + N_REF_PAD; // the end pos of accessible cached ref seq -+ if ( vcf_ref_end + ndel > tr_ref_end ) -+ { -+ if ( !small_ref_padding_warned ) -+ { -+ fprintf(stderr,"Warning: Could not verify synonymous start/stop at %s:%d due to small N_REF_PAD. (Improve me?)\n",bcf_seqname(args->hdr,splice->vcf.rec),splice->vcf.pos+1); -+ small_ref_padding_warned = 1; -+ } -+ return 0; -+ } -+ -+ char *ptr_vcf = splice->vcf.ref + alt_len; // the first deleted base in the VCF REF allele -+ char *ptr_ref = splice->tr->ref + N_REF_PAD + (vcf_ref_end + 1 - splice->tr->beg); // the first ref base after the ndel bases deleted -+#if XDBG -+ fprintf(stderr,"vcf: %s\nref: %s\n",ptr_vcf,ptr_ref); -+#endif -+ int i = 0; -+ while ( ptr_vcf[i] && ptr_vcf[i]==ptr_ref[i] ) i++; -+ if ( ptr_vcf[i] ) return 0; // the deleted sequence cannot be replaced -+ } -+ else -+ { -+ // STRAND_FWD -+ int32_t vcf_block_beg = splice->vcf.pos + ref_len - 2*ndel; // the position of the first base of the ref block that could potentially replace the deletion -+ if ( vcf_block_beg < 0 ) return 0; -+ -+#if XDBG -+ fprintf(stderr,"vcf_block_beg: %d\n",vcf_block_beg+1); -+#endif -+ -+ if ( N_REF_PAD + vcf_block_beg < ex_beg ) -+ { -+ if ( !small_ref_padding_warned ) -+ { -+ fprintf(stderr,"Warning: Could not verify synonymous start/stop at %s:%d due to small N_REF_PAD. 
(Improve me?)\n",bcf_seqname(args->hdr,splice->vcf.rec),splice->vcf.pos+1); -+ small_ref_padding_warned = 1; -+ } -+ return 0; -+ } -+ -+ char *ptr_vcf = splice->vcf.ref + alt_len; // the first deleted base in the VCF REF allele -+ char *ptr_ref = splice->tr->ref + N_REF_PAD + vcf_block_beg - splice->tr->beg; // the replacement ref block -+#if XDBG -+ fprintf(stderr,"vcf: %s\nref: %s\n",ptr_vcf,ptr_ref); -+#endif -+ -+ int i = 0; -+ while ( ptr_vcf[i] && ptr_vcf[i]==ptr_ref[i] ) i++; -+ if ( ptr_vcf[i] ) return 0; // the deleted sequence cannot be replaced -+ } -+ -+ return 1; -+} -+ - static inline int splice_csq_del(args_t *args, splice_t *splice, uint32_t ex_beg, uint32_t ex_end) - { -+ if ( splice->check_start ) -+ { -+ // check for synonymous start -+ // test/csq/ENST00000375992/incorrect-synon-del-not-start-lost.txt -+ // test/csq/ENST00000368801.2/start-lost.txt -+ // test/csq/ENST00000318249.2/synonymous-start-lost.txt -+ int is_synonymous = shifted_del_synonymous(args, splice, ex_beg, ex_end); -+ if ( is_synonymous ) -+ { -+ splice->csq |= CSQ_START_RETAINED; -+ return SPLICE_OVERLAP; -+ } -+ } -+ - // coordinates that matter for consequences, eg AC>ACG trimmed to C>CG - splice->ref_beg = splice->vcf.pos + splice->tbeg - 1; // 1b before the deleted base - splice->ref_end = splice->vcf.pos + splice->vcf.rlen - splice->tend - 1; // the last deleted base - - #if XDBG --fprintf(stderr,"del: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%d check_utr=%d start,stop,beg,end=%d,%d,%d,%d\n", splice->vcf.ref,splice->vcf.alt,ex_beg,ex_end,splice->ref_beg,splice->ref_end,splice->tbeg,splice->tend,splice->check_utr,splice->check_start,splice->check_stop,splice->check_region_beg,splice->check_region_end); -+fprintf(stderr,"splice_csq_del: %s>%s .. 
ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%d check_utr=%d start,stop,beg,end=%d,%d,%d,%d\n", splice->vcf.ref,splice->vcf.alt,ex_beg,ex_end,splice->ref_beg,splice->ref_end,splice->tbeg,splice->tend,splice->check_utr,splice->check_start,splice->check_stop,splice->check_region_beg,splice->check_region_end); - #endif - - if ( splice->ref_beg + 1 < ex_beg ) // the part before the exon; ref_beg is off by -1 -@@ -1783,7 +1893,7 @@ - regitr_t *itr = regitr_init(NULL); - const char *chr = bcf_seqname(args->hdr,splice->vcf.rec); - if ( regidx_overlap(args->idx_utr,chr,splice->ref_beg,ex_beg-1, itr) ) // adjacent utr -- csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id); -+ csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq); - regitr_destroy(itr); - } - if ( !csq ) -@@ -1839,7 +1949,7 @@ - regitr_t *itr = regitr_init(NULL); - const char *chr = bcf_seqname(args->hdr,splice->vcf.rec); - if ( regidx_overlap(args->idx_utr,chr,ex_end+1,splice->ref_end, itr) ) // adjacent utr -- csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id); -+ csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq); - regitr_destroy(itr); - } - if ( !csq ) -@@ -1874,7 +1984,6 @@ - csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq); - return SPLICE_OUTSIDE; - } -- - if ( splice->ref_beg < ex_beg + 2 ) // ref_beg is off by -1 - { - if ( splice->check_region_beg ) splice->csq |= CSQ_SPLICE_REGION; -@@ -1929,7 +2038,7 @@ - regitr_t *itr = regitr_init(NULL); - const char *chr = bcf_seqname(args->hdr,splice->vcf.rec); - if ( regidx_overlap(args->idx_utr,chr,splice->ref_beg,ex_beg-1, itr) ) // adjacent utr -- csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id); -+ csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq); - regitr_destroy(itr); - } - if ( !csq ) -@@ -1959,7 +2068,7 @@ - regitr_t *itr = regitr_init(NULL); - const char *chr = bcf_seqname(args->hdr,splice->vcf.rec); - if ( 
regidx_overlap(args->idx_utr,chr,ex_end+1,splice->ref_end, itr) ) // adjacent utr -- csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id); -+ csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq); - regitr_destroy(itr); - } - if ( !csq ) -@@ -2008,7 +2117,6 @@ - } - static inline int splice_csq(args_t *args, splice_t *splice, uint32_t ex_beg, uint32_t ex_end) - { -- splice->csq = 0; - splice->vcf.alen = strlen(splice->vcf.alt); - - int rlen1 = splice->vcf.rlen - 1, alen1 = splice->vcf.alen - 1, i = 0; -@@ -2038,6 +2146,7 @@ - return 0; - } - -+ - // return value: 0 added, 1 overlapping variant, 2 silent discard (intronic,alt=ref) - int hap_init(args_t *args, hap_node_t *parent, hap_node_t *child, gf_cds_t *cds, bcf1_t *rec, int ial) - { -@@ -2070,7 +2179,7 @@ - if ( child->icds!=tr->ncds-1 ) splice.check_region_end = 1; - - #if XDBG --fprintf(stderr,"\n%d [%s][%s] check start:%d,stop:%d\n",splice.vcf.pos+1,splice.vcf.ref,splice.vcf.alt,splice.check_start,splice.check_stop); -+fprintf(stderr,"\nhap_init: %d [%s][%s] check start:%d,stop:%d\n",splice.vcf.pos+1,splice.vcf.ref,splice.vcf.alt,splice.check_start,splice.check_stop); - #endif - int ret = splice_csq(args, &splice, cds->beg, cds->beg + cds->len - 1); - #if XDBG -@@ -2078,7 +2187,7 @@ - #endif - - if ( ret==SPLICE_VAR_REF ) return 2; // not a variant, eg REF=CA ALT=CA -- if ( ret==SPLICE_OUTSIDE || ret==SPLICE_OVERLAP ) // not a coding csq -+ if ( ret==SPLICE_OUTSIDE || ret==SPLICE_OVERLAP || splice.csq==CSQ_START_LOST ) // not a coding csq - { - free(splice.kref.s); - free(splice.kalt.s); -@@ -2136,6 +2245,8 @@ - if ( len < 0 ) // overlapping variants - { - free(str.s); -+ free(splice.kref.s); -+ free(splice.kalt.s); - return 1; - } - kputsn_(tr->ref + N_REF_PAD + parent->rbeg + parent->rlen - tr->beg, len, &str); -@@ -2173,6 +2284,7 @@ - if ( !child->csq ) child->csq |= CSQ_CODING_SEQUENCE; // hack, specifically for ENST00000390520/deletion-overlap.vcf - } - -+ - 
free(splice.kref.s); - free(splice.kalt.s); - return 0; -@@ -2206,7 +2318,7 @@ - void cds_translate(kstring_t *_ref, kstring_t *_seq, uint32_t sbeg, uint32_t rbeg, uint32_t rend, int strand, kstring_t *tseq, int fill) - { - #if XDBG --fprintf(stderr,"translate: %d %d %d fill=%d seq.l=%d\n",sbeg,rbeg,rend,fill,(int)_seq->l); -+fprintf(stderr,"\ntranslate: %d %d %d fill=%d seq.l=%d\n",sbeg,rbeg,rend,fill,(int)_seq->l); - #endif - char tmp[3], *codon, *end; - int i, len, npad; -@@ -2306,7 +2418,7 @@ - #if DBG>1 - fprintf(stderr," npad: %d\n",npad); - #endif --if ( !(npad>=0 && sbeg+seq.l+npad<=seq.m) ) fprintf(stderr,"sbeg=%d seq.l=%d seq.m=%d\n",sbeg,(int)seq.l,(int)seq.m); -+ if ( !(npad>=0 && sbeg+seq.l+npad<=seq.m) ) fprintf(stderr,"sbeg=%d seq.l=%d seq.m=%d npad=%d\n",sbeg,(int)seq.l,(int)seq.m,npad); - assert( npad>=0 && sbeg+seq.l+npad<=seq.m ); // todo: first codon on the rev strand - - if ( npad==2 ) -@@ -2327,8 +2439,8 @@ - for (; i>=0 && end>seq.s; i--) tmp[i] = *(--end); - #if DBG>1 - fprintf(stderr,"\t i=%d\n", i); -- if(i==1)fprintf(stderr,"[0] %c\n",tmp[2]); -- if(i==0)fprintf(stderr,"[0] %c%c\n",tmp[1],tmp[2]); -+ if(i==1)fprintf(stderr,"[0] %c\n",tmp[2]); -+ if(i==0)fprintf(stderr,"[0] %c%c\n",tmp[1],tmp[2]); - #endif - if ( i==-1 ) - { -@@ -2569,12 +2681,25 @@ - kputs(csq->vstr.s, str); - } - -+void kprint_aa_prediction(args_t *args, int beg, kstring_t *aa, kstring_t *str) -+{ -+ if ( !args->brief_predictions ) -+ kputs(aa->s, str); -+ else -+ { -+ int len = aa->l; -+ if ( aa->s[len-1]=='*' ) len--; -+ kputc(aa->s[0], str); -+ kputs("..", str); -+ kputw(beg+len, str); -+ } -+} -+ - void hap_add_csq(args_t *args, hap_t *hap, hap_node_t *node, int tlen, int ibeg, int iend, int dlen, int indel) - { - int i; - tscript_t *tr = hap->tr; - int ref_node = tr->strand==STRAND_FWD ? 
ibeg : iend; -- - int icsq = node->ncsq_list++; - hts_expand0(csq_t,node->ncsq_list,node->mcsq_list,node->csq_list); - csq_t *csq = &node->csq_list[icsq]; -@@ -2678,12 +2803,12 @@ - int aa_sbeg = tr->strand==STRAND_FWD ? node2sbeg(ibeg)/3+1 : (tlen - node2send(iend))/3+1; - kputc_('|', &str); - kputw(aa_rbeg, &str); -- kputs(hap->tref.s, &str); -+ kprint_aa_prediction(args,aa_rbeg,&hap->tref,&str); - if ( !(csq->type.type & CSQ_SYNONYMOUS_VARIANT) ) - { - kputc_('>', &str); - kputw(aa_sbeg, &str); -- kputs(hap->tseq.s, &str); -+ kprint_aa_prediction(args,aa_sbeg,&hap->tseq,&str); - } - kputc_('|', &str); - -@@ -2961,18 +3086,15 @@ - int icsq = 2*csq->idx + ihap; - if ( icsq >= args->ncsq_max ) // more than ncsq_max consequences, so can't fit it in FMT - { -- int print_warning = 1; -- if ( args->quiet ) -+ if ( args->verbosity && (!args->ncsq_small_warned || args->verbosity > 1) ) - { -- if ( args->quiet > 1 || args->ncsq_small_warned ) print_warning = 0; -+ fprintf(stderr, -+ "Warning: Too many consequences for sample %s at %s:%"PRId64", keeping the first %d and skipping the rest.\n", -+ args->hdr->samples[ismpl],bcf_hdr_id2name(args->hdr,args->rid),(int64_t) vrec->line->pos+1,csq->idx); -+ if ( !args->ncsq_small_warned ) -+ fprintf(stderr," The limit can be increased by setting the --ncsq parameter. 
This warning is printed only once.\n"); - args->ncsq_small_warned = 1; - } -- if ( print_warning ) -- { -- fprintf(stderr,"Warning: --ncsq %d is too small to annotate %s at %s:%d with %d-th csq\n", -- args->ncsq_max/2,args->hdr->samples[ismpl],bcf_hdr_id2name(args->hdr,args->rid),vrec->line->pos+1,csq->idx+1); -- if ( args->quiet ) fprintf(stderr,"(This warning is printed only once)\n"); -- } - break; - } - if ( vrec->nfmt < 1 + icsq/32 ) vrec->nfmt = 1 + icsq/32; -@@ -2984,12 +3106,10 @@ - { - int i,j; - tr_heap_t *heap = args->active_tr; -- - while ( heap->ndat && heap->dat[0]->end<=pos ) - { - tscript_t *tr = heap->dat[0]; - khp_delete(trhp, heap); -- - args->hap->tr = tr; - if ( tr->root && tr->root->nchild ) // normal, non-localized calling - { -@@ -3028,7 +3148,7 @@ - - #define SWAP(type_t, a, b) { type_t t = a; a = b; b = t; } - --void vbuf_push(args_t *args, bcf1_t **rec_ptr) -+vbuf_t *vbuf_push(args_t *args, bcf1_t **rec_ptr) - { - int i; - -@@ -3044,6 +3164,7 @@ - i = rbuf_append(&args->vcf_rbuf); - if ( !args->vcf_buf[i] ) args->vcf_buf[i] = (vbuf_t*) calloc(1,sizeof(vbuf_t)); - args->vcf_buf[i]->n = 0; -+ args->vcf_buf[i]->keep_until = 0; - } - vbuf_t *vbuf = args->vcf_buf[i]; - vbuf->n++; -@@ -3063,16 +3184,29 @@ - int ret; - khint_t k = kh_put(pos2vbuf, args->pos2vbuf, (int)rec->pos, &ret); - kh_val(args->pos2vbuf,k) = vbuf; -+ -+ return vbuf; - } - --void vbuf_flush(args_t *args) -+void vbuf_flush(args_t *args, uint32_t pos) - { -- if ( args->active_tr->ndat ) return; // cannot output buffered VCF lines (args.vbuf) until all active transcripts are gone -- - int i,j; -- while ( (i=rbuf_shift(&args->vcf_rbuf))>=0 ) -+ while ( args->vcf_rbuf.n ) - { -- vbuf_t *vbuf = args->vcf_buf[i]; -+ vbuf_t *vbuf; -+ if ( !args->local_csq && args->active_tr->ndat ) -+ { -+ // check if the first active transcript starts beyond the first buffered VCF record, -+ // cannot output buffered VCF lines (args.vbuf) until the active transcripts are gone -+ vbuf = 
args->vcf_buf[ args->vcf_rbuf.f ]; -+ if ( vbuf->keep_until > pos ) break; -+ assert( vbuf->n ); -+ } -+ -+ i = rbuf_shift(&args->vcf_rbuf); -+ assert( i>=0 ); -+ vbuf = args->vcf_buf[i]; -+ int pos = vbuf->n ? vbuf->vrec[0]->line->pos : -1; - for (i=0; in; i++) - { - vrec_t *vrec = vbuf->vrec[i]; -@@ -3083,7 +3217,10 @@ - } - if ( !vrec->nvcsq ) - { -- bcf_write(args->out_fh, args->hdr, vrec->line); -+ if ( bcf_write(args->out_fh, args->hdr, vrec->line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname?args->output_fname:"standard output"); -+ int save_pos = vrec->line->pos; -+ bcf_empty(vrec->line); -+ vrec->line->pos = save_pos; // this is necessary for compound variants - continue; - } - -@@ -3098,19 +3235,24 @@ - if ( args->hdr_nsmpl ) - { - if ( vrec->nfmt < args->nfmt_bcsq ) -- for (j=1; jhdr_nsmpl; j++) memcpy(vrec->smpl+j*vrec->nfmt, vrec->smpl+j*args->nfmt_bcsq, vrec->nfmt*sizeof(*vrec->smpl)); -+ for (j=1; jhdr_nsmpl; j++) -+ memmove(&vrec->smpl[j*vrec->nfmt], &vrec->smpl[j*args->nfmt_bcsq], vrec->nfmt*sizeof(*vrec->smpl)); - bcf_update_format_int32(args->hdr, vrec->line, args->bcsq_tag, vrec->smpl, args->hdr_nsmpl*vrec->nfmt); - } - vrec->nvcsq = 0; -- bcf_write(args->out_fh, args->hdr, vrec->line); -+ if ( bcf_write(args->out_fh, args->hdr, vrec->line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname?args->output_fname:"standard output"); -+ int save_pos = vrec->line->pos; -+ bcf_empty(vrec->line); -+ vrec->line->pos = save_pos; - } -- if ( vbuf->n ) -+ if ( pos!=-1 ) - { -- khint_t k = kh_get(pos2vbuf, args->pos2vbuf, vbuf->vrec[0]->line->pos); -+ khint_t k = kh_get(pos2vbuf, args->pos2vbuf, pos); - if ( k != kh_end(args->pos2vbuf) ) kh_del(pos2vbuf, args->pos2vbuf, k); - } - vbuf->n = 0; - } -+ if ( args->active_tr->ndat ) return; - - for (i=0; inrm_tr; i++) - { -@@ -3137,10 +3279,12 @@ - int pad_end = len - (tr->end - tr->beg + 1 + pad_beg); - if ( pad_beg + pad_end != 2*N_REF_PAD ) - { -- char *ref 
= (char*) malloc(tr->end - tr->beg + 1 + 2*N_REF_PAD); -+ char *ref = (char*) malloc(tr->end - tr->beg + 1 + 2*N_REF_PAD + 1); - for (i=0; i < N_REF_PAD - pad_beg; i++) ref[i] = 'N'; - memcpy(ref+i, tr->ref, len); -+ len += i; - for (i=0; i < N_REF_PAD - pad_end; i++) ref[i+len] = 'N'; -+ ref[i+len] = 0; - free(tr->ref); - tr->ref = ref; - } -@@ -3148,15 +3292,19 @@ - - static void sanity_check_ref(args_t *args, tscript_t *tr, bcf1_t *rec) - { -- char *ref = tr->ref + (rec->pos + N_REF_PAD >= tr->beg ? rec->pos - tr->beg + N_REF_PAD : 0); -- char *vcf = rec->d.allele[0] + (rec->pos + N_REF_PAD >= tr->beg ? 0 : tr->beg - N_REF_PAD - rec->pos); -- assert( vcf - rec->d.allele[0] < strlen(rec->d.allele[0]) ); -- while ( *ref && *vcf ) -- { -- if ( *ref!=*vcf && toupper(*ref)!=toupper(*vcf) ) -- error("Error: the fasta reference does not match the VCF REF allele at %s:%d .. %s\n", bcf_seqname(args->hdr,rec),rec->pos+1,rec->d.allele[0]); -- ref++; -- vcf++; -+ int vbeg = 0; -+ int rbeg = rec->pos - tr->beg + N_REF_PAD; -+ if ( rbeg < 0 ) { vbeg += abs(rbeg); rbeg = 0; } -+ char *ref = tr->ref + rbeg; -+ char *vcf = rec->d.allele[0] + vbeg; -+ assert( vcf - rec->d.allele[0] < strlen(rec->d.allele[0]) && ref - tr->ref < tr->end - tr->beg + 2*N_REF_PAD ); -+ int i = 0; -+ while ( ref[i] && vcf[i] ) -+ { -+ if ( ref[i]!=vcf[i] && toupper(ref[i])!=toupper(vcf[i]) ) -+ error("Error: the fasta reference does not match the VCF REF allele at %s:%"PRId64" .. fasta=%c vcf=%c\n", -+ bcf_seqname(args->hdr,rec),(int64_t) rec->pos+vbeg+1,ref[i],vcf[i]); -+ i++; - } - } - -@@ -3195,6 +3343,7 @@ - - for (i=1; in_allele; i++) - { -+ if ( rec->d.allele[i][0]=='<' || rec->d.allele[i][0]=='*' ) { continue; } - if ( hap_init(args, &root, &node, cds, rec, i)!=0 ) continue; - - csq_t csq; -@@ -3294,12 +3443,12 @@ - int aa_sbeg = tr->strand==STRAND_FWD ? 
node.sbeg/3+1 : (tr->nsref - 2*N_REF_PAD + node.dlen - node.sbeg - alen)/3+1; - kputc_('|', &str); - kputw(aa_rbeg, &str); -- kputs(tref->s, &str); -+ kprint_aa_prediction(args,aa_rbeg,tref,&str); - if ( !(csq_type & CSQ_SYNONYMOUS_VARIANT) ) - { - kputc_('>', &str); - kputw(aa_sbeg, &str); -- kputs(tseq->s, &str); -+ kprint_aa_prediction(args,aa_sbeg,tseq,&str); - } - kputc_('|', &str); - kputw(rec->pos+1, &str); -@@ -3330,8 +3479,10 @@ - return ret; - } - --int test_cds(args_t *args, bcf1_t *rec) -+int test_cds(args_t *args, bcf1_t *rec, vbuf_t *vbuf) - { -+ static int overlaps_warned = 0, multiploid_warned = 0; -+ - int i, ret = 0, hap_ret; - const char *chr = bcf_seqname(args->hdr,rec); - // note that the off-by-one extension of rlen is deliberate to account for insertions -@@ -3341,6 +3492,7 @@ - gf_cds_t *cds = regitr_payload(args->itr,gf_cds_t*); - tscript_t *tr = cds->tr; - if ( !GF_is_coding(tr->type) ) continue; -+ if ( vbuf->keep_until < tr->end ) vbuf->keep_until = tr->end; - ret = 1; - if ( !tr->root ) - { -@@ -3370,10 +3522,17 @@ - // overlapping or intron variant, cannot apply - if ( hap_ret==1 ) - { -- if ( !args->quiet ) -- fprintf(stderr,"Warning: Skipping overlapping variants at %s:%d\t%s>%s\n", chr,rec->pos+1,rec->d.allele[0],rec->d.allele[1]); -+ if ( args->verbosity && (!overlaps_warned || args->verbosity > 1) ) -+ { -+ fprintf(stderr, -+ "Warning: Skipping overlapping variants at %s:%"PRId64"\t%s>%s.\n", -+ chr,(int64_t) rec->pos+1,rec->d.allele[0],rec->d.allele[1]); -+ if ( !overlaps_warned ) -+ fprintf(stderr," This message is printed only once, the verbosity can be increased with `--verbose 2`\n"); -+ overlaps_warned = 1; -+ } - if ( args->out ) -- fprintf(args->out,"LOG\tWarning: Skipping overlapping variants at %s:%d\t%s>%s\n", chr,rec->pos+1,rec->d.allele[0],rec->d.allele[1]); -+ fprintf(args->out,"LOG\tWarning: Skipping overlapping variants at %s:%"PRId64"\t%s>%s\n", chr,(int64_t) rec->pos+1,rec->d.allele[0],rec->d.allele[1]); - } - 
else ret = 1; // prevent reporting as intron in test_tscript - hap_destroy(child); -@@ -3409,10 +3568,17 @@ - ngts /= bcf_hdr_nsamples(args->hdr); - if ( ngts!=1 && ngts!=2 ) - { -- if ( !args->quiet ) -- fprintf(stderr,"Warning: Skipping site with non-diploid/non-haploid genotypes at %s:%d\t%s>%s\n", chr,rec->pos+1,rec->d.allele[0],rec->d.allele[1]); -+ if ( args->verbosity && (!multiploid_warned || args->verbosity > 1) ) -+ { -+ fprintf(stderr, -+ "Warning: Skipping site with non-diploid/non-haploid genotypes at %s:%"PRId64"\t%s>%s.\n", -+ chr,(int64_t) rec->pos+1,rec->d.allele[0],rec->d.allele[1]); -+ if ( !multiploid_warned ) -+ fprintf(stderr," This message is printed only once, the verbosity can be increased with `--verbose 2`\n"); -+ multiploid_warned = 1; -+ } - if ( args->out ) -- fprintf(args->out,"LOG\tWarning: Skipping site with non-diploid/non-haploid genotypes at %s:%d\t%s>%s\n", chr,rec->pos+1,rec->d.allele[0],rec->d.allele[1]); -+ fprintf(args->out,"LOG\tWarning: Skipping site with non-diploid/non-haploid genotypes at %s:%"PRId64"\t%s>%s\n", chr,(int64_t) rec->pos+1,rec->d.allele[0],rec->d.allele[1]); - continue; - } - for (ismpl=0; ismplsmpl->n; ismpl++) -@@ -3429,7 +3595,7 @@ - if ( !bcf_gt_is_phased(gt[0]) && !bcf_gt_is_phased(gt[1]) ) - { - if ( args->phase==PHASE_REQUIRE ) -- error("Unphased heterozygous genotype at %s:%d, sample %s. See the --phase option.\n", chr,rec->pos+1,args->hdr->samples[args->smpl->idx[ismpl]]); -+ error("Unphased heterozygous genotype at %s:%"PRId64", sample %s. 
See the --phase option.\n", chr,(int64_t) rec->pos+1,args->hdr->samples[args->smpl->idx[ismpl]]); - if ( args->phase==PHASE_SKIP ) - continue; - if ( args->phase==PHASE_NON_REF ) -@@ -3468,12 +3634,18 @@ - // overlapping or intron variant, cannot apply - if ( hap_ret==1 ) - { -- if ( !args->quiet ) -- fprintf(stderr,"Warning: Skipping overlapping variants at %s:%d, sample %s\t%s>%s\n", -- chr,rec->pos+1,args->hdr->samples[args->smpl->idx[ismpl]],rec->d.allele[0],rec->d.allele[ial]); -+ if ( args->verbosity && (!overlaps_warned || args->verbosity > 1) ) -+ { -+ fprintf(stderr, -+ "Warning: Skipping overlapping variants at %s:%"PRId64", sample %s\t%s>%s.\n", -+ chr,(int64_t) rec->pos+1,args->hdr->samples[args->smpl->idx[ismpl]],rec->d.allele[0],rec->d.allele[ial]); -+ if ( !overlaps_warned ) -+ fprintf(stderr," This message is printed only once, the verbosity can be increased with `--verbose 2`\n"); -+ overlaps_warned = 1; -+ } - if ( args->out ) -- fprintf(args->out,"LOG\tWarning: Skipping overlapping variants at %s:%d, sample %s\t%s>%s\n", -- chr,rec->pos+1,args->hdr->samples[args->smpl->idx[ismpl]],rec->d.allele[0],rec->d.allele[ial]); -+ fprintf(args->out,"LOG\tWarning: Skipping overlapping variants at %s:%"PRId64", sample %s\t%s>%s\n", -+ chr,(int64_t) rec->pos+1,args->hdr->samples[args->smpl->idx[ismpl]],rec->d.allele[0],rec->d.allele[ial]); - } - hap_destroy(child); - continue; -@@ -3559,19 +3731,15 @@ - if ( icsq >= args->ncsq_max ) // more than ncsq_max consequences, so can't fit it in FMT - { - int ismpl = args->smpl->idx[i]; -- int print_warning = 1; -- if ( args->quiet ) -+ if ( args->verbosity && (!args->ncsq_small_warned || args->verbosity > 1) ) - { -- if ( args->quiet > 1 || args->ncsq_small_warned ) print_warning = 0; -+ fprintf(stderr, -+ "Warning: Too many consequences for sample %s at %s:%"PRId64", keeping the first %d and skipping the rest.\n", -+ args->hdr->samples[ismpl],bcf_hdr_id2name(args->hdr,args->rid),(int64_t) vrec->line->pos+1,icsq+1); 
-+ if ( !args->ncsq_small_warned ) -+ fprintf(stderr," The limit can be increased by setting the --ncsq parameter. This warning is printed only once.\n"); - args->ncsq_small_warned = 1; - } -- if ( print_warning ) -- { -- fprintf(stderr,"Warning: --ncsq %d is too small to annotate %s at %s:%d with %d-th csq\n", -- args->ncsq_max/2,args->hdr->samples[ismpl],bcf_hdr_id2name(args->hdr,args->rid),vrec->line->pos+1,csq->idx+1); -- if ( args->quiet ) fprintf(stderr,"(This warning is printed only once)\n"); -- } -- break; - } - if ( vrec->nfmt < 1 + icsq/32 ) vrec->nfmt = 1 + icsq/32; - vrec->smpl[i*args->nfmt_bcsq + icsq/32] |= 1 << (icsq % 32); -@@ -3594,8 +3762,9 @@ - tscript_t *tr = splice.tr = utr->tr; - for (i=1; in_allele; i++) - { -- if ( rec->d.allele[1][0]=='<' || rec->d.allele[1][0]=='*' ) { continue; } -+ if ( rec->d.allele[i][0]=='<' || rec->d.allele[i][0]=='*' ) { continue; } - splice.vcf.alt = rec->d.allele[i]; -+ splice.csq = 0; - int splice_ret = splice_csq(args, &splice, utr->beg, utr->end); - if ( splice_ret!=SPLICE_INSIDE && splice_ret!=SPLICE_OVERLAP ) continue; - csq_t csq; -@@ -3637,6 +3806,7 @@ - { - if ( rec->d.allele[1][0]=='<' || rec->d.allele[1][0]=='*' ) { continue; } - splice.vcf.alt = rec->d.allele[i]; -+ splice.csq = 0; - splice_csq(args, &splice, exon->beg, exon->end); - if ( splice.csq ) ret = 1; - } -@@ -3659,8 +3829,9 @@ - tscript_t *tr = splice.tr = regitr_payload(args->itr, tscript_t*); - for (i=1; in_allele; i++) - { -- if ( rec->d.allele[1][0]=='<' || rec->d.allele[1][0]=='*' ) { continue; } -+ if ( rec->d.allele[i][0]=='<' || rec->d.allele[i][0]=='*' ) { continue; } - splice.vcf.alt = rec->d.allele[i]; -+ splice.csq = 0; - int splice_ret = splice_csq(args, &splice, tr->beg, tr->end); - if ( splice_ret!=SPLICE_INSIDE && splice_ret!=SPLICE_OVERLAP ) continue; // SPLICE_OUTSIDE or SPLICE_REF - csq_t csq; -@@ -3680,22 +3851,151 @@ - return ret; - } - --void process(args_t *args, bcf1_t **rec_ptr) -+void test_symbolic_alt(args_t *args, 
bcf1_t *rec) -+{ -+ static int warned = 0; -+ if ( args->verbosity && (!warned && args->verbosity > 0) ) -+ { -+ fprintf(stderr,"Warning: The support for symbolic ALT insertions is experimental.\n"); -+ warned = 1; -+ } -+ -+ const char *chr = bcf_seqname(args->hdr,rec); -+ -+ // only insertions atm -+ int beg = rec->pos + 1; -+ int end = beg; -+ int csq_class = CSQ_ELONGATION; -+ -+ int hit = 0; -+ if ( regidx_overlap(args->idx_cds,chr,beg,end, args->itr) ) -+ { -+ while ( regitr_overlap(args->itr) ) -+ { -+ csq_t csq; -+ memset(&csq, 0, sizeof(csq_t)); -+ gf_cds_t *cds = regitr_payload(args->itr,gf_cds_t*); -+ tscript_t *tr = cds->tr; -+ csq.type.type = (GF_is_coding(tr->type) ? CSQ_CODING_SEQUENCE : CSQ_NON_CODING) | csq_class; -+ csq.pos = rec->pos; -+ csq.type.biotype = tr->type; -+ csq.type.strand = tr->strand; -+ csq.type.trid = tr->id; -+ csq.type.gene = tr->gene->name; -+ csq_stage(args, &csq, rec); -+ hit = 1; -+ } -+ } -+ if ( regidx_overlap(args->idx_utr,chr,beg,end, args->itr) ) -+ { -+ while ( regitr_overlap(args->itr) ) -+ { -+ csq_t csq; -+ memset(&csq, 0, sizeof(csq_t)); -+ gf_utr_t *utr = regitr_payload(args->itr, gf_utr_t*); -+ tscript_t *tr = utr->tr; -+ csq.type.type = (utr->which==prime5 ? CSQ_UTR5 : CSQ_UTR3) | csq_class; -+ csq.pos = rec->pos; -+ csq.type.biotype = tr->type; -+ csq.type.strand = tr->strand; -+ csq.type.trid = tr->id; -+ csq.type.gene = tr->gene->name; -+ csq_stage(args, &csq, rec); -+ hit = 1; -+ } -+ } -+ if ( regidx_overlap(args->idx_exon,chr,beg,end, args->itr) ) -+ { -+ splice_t splice; -+ splice_init(&splice, rec); -+ splice.check_acceptor = splice.check_donor = 1; -+ -+ while ( regitr_overlap(args->itr) ) -+ { -+ gf_exon_t *exon = regitr_payload(args->itr, gf_exon_t*); -+ splice.tr = exon->tr; -+ if ( !splice.tr->ncds ) continue; // not a coding transcript, no interest in splice sites -+ splice.check_region_beg = splice.tr->beg==exon->beg ? 0 : 1; -+ splice.check_region_end = splice.tr->end==exon->end ? 
0 : 1; -+ splice.vcf.alt = rec->d.allele[1]; -+ splice.csq = csq_class; -+ splice_csq(args, &splice, exon->beg, exon->end); -+ if ( splice.csq ) hit = 1; -+ } -+ } -+ if ( !hit && regidx_overlap(args->idx_tscript,chr,beg,end, args->itr) ) -+ { -+ splice_t splice; -+ splice_init(&splice, rec); -+ -+ while ( regitr_overlap(args->itr) ) -+ { -+ csq_t csq; -+ memset(&csq, 0, sizeof(csq_t)); -+ tscript_t *tr = splice.tr = regitr_payload(args->itr, tscript_t*); -+ splice.vcf.alt = rec->d.allele[1]; -+ splice.csq = csq_class; -+ int splice_ret = splice_csq(args, &splice, tr->beg, tr->end); -+ if ( splice_ret!=SPLICE_INSIDE && splice_ret!=SPLICE_OVERLAP ) continue; // SPLICE_OUTSIDE or SPLICE_REF -+ csq.type.type = (GF_is_coding(tr->type) ? CSQ_INTRON : CSQ_NON_CODING) | csq_class; -+ csq.pos = rec->pos; -+ csq.type.biotype = tr->type; -+ csq.type.strand = tr->strand; -+ csq.type.trid = tr->id; -+ csq.type.gene = tr->gene->name; -+ csq_stage(args, &csq, rec); -+ } -+ } -+} -+ -+void debug_print_buffers(args_t *args, int pos) -+{ -+ int i,j; -+ fprintf(stderr,"debug_print_buffers at %d\n", pos); -+ fprintf(stderr,"vbufs:\n"); -+ for (i=0; ivcf_rbuf.n; i++) -+ { -+ int k = rbuf_kth(&args->vcf_rbuf, i); -+ vbuf_t *vbuf = args->vcf_buf[k]; -+ -+ fprintf(stderr,"\tvbuf %d:\n", i); -+ for (j=0; jn; j++) -+ { -+ vrec_t *vrec = vbuf->vrec[j]; -+ fprintf(stderr,"\t\t%"PRId64" .. 
nvcsq=%d\n", (int64_t) vrec->line->pos+1, vrec->nvcsq); -+ } -+ } -+ fprintf(stderr,"pos2vbuf:"); -+ khint_t k; -+ for (k = 0; k < kh_end(args->pos2vbuf); ++k) -+ if (kh_exist(args->pos2vbuf, k)) fprintf(stderr," %d",1+(int)kh_key(args->pos2vbuf, k)); -+ fprintf(stderr,"\n"); -+ fprintf(stderr,"active_tr: %d\n", args->active_tr->ndat); -+} -+ -+static void process(args_t *args, bcf1_t **rec_ptr) - { - if ( !rec_ptr ) - { - hap_flush(args, REGIDX_MAX); -- vbuf_flush(args); -+ vbuf_flush(args, REGIDX_MAX); - return; - } - - bcf1_t *rec = *rec_ptr; -+ static int32_t prev_rid = -1, prev_pos = -1; -+ if ( prev_rid!=rec->rid ) { prev_rid = rec->rid; prev_pos = rec->pos; } -+ if ( prev_pos > rec->pos ) -+ error("Error: The file is not sorted, %s:%d comes before %s:%"PRId64"\n",bcf_seqname(args->hdr,rec),prev_pos+1,bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); - - int call_csq = 1; -- if ( !rec->n_allele ) call_csq = 0; // no alternate allele -- else if ( rec->n_allele==2 && (rec->d.allele[1][0]=='<' || rec->d.allele[1][0]=='*') ) call_csq = 0; // gVCF, no alt allele -- else if ( rec->d.allele[1][0]=='<' && rec->d.allele[1][0]!='*') call_csq = 0; // a symbolic allele, not ready for CNVs etc -- else if ( args->filter ) -+ if ( rec->n_allele < 2 ) call_csq = 0; // no alternate allele -+ else if ( rec->n_allele==2 && (rec->d.allele[1][0]=='*' || rec->d.allele[1][1]=='*') ) call_csq = 0; // gVCF, not an alt allele -+ else if ( rec->d.allele[1][0]=='<' ) -+ { -+ if ( strncmp("d.allele[1], 4) ) call_csq = 0; // only is supported at the moment -+ } -+ if ( call_csq && args->filter ) - { - call_csq = filter_test(args->filter, rec, NULL); - if ( args->filter_logic==FLT_EXCLUDE ) call_csq = call_csq ? 
0 : 1; -@@ -3704,25 +4004,34 @@ - { - if ( !args->out_fh ) return; // not a VCF output - vbuf_push(args, rec_ptr); -- vbuf_flush(args); -+ hap_flush(args, rec->pos-1); -+ vbuf_flush(args, rec->pos-1); - return; - } - - if ( args->rid != rec->rid ) - { - hap_flush(args, REGIDX_MAX); -- vbuf_flush(args); -+ vbuf_flush(args, REGIDX_MAX); - } - args->rid = rec->rid; -- vbuf_push(args, rec_ptr); -+ vbuf_t *vbuf = vbuf_push(args, rec_ptr); - -- int hit = args->local_csq ? test_cds_local(args, rec) : test_cds(args, rec); -- hit += test_utr(args, rec); -- hit += test_splice(args, rec); -- if ( !hit ) test_tscript(args, rec); -+ if ( rec->d.allele[1][0]!='<' ) -+ { -+ int hit = args->local_csq ? test_cds_local(args, rec) : test_cds(args, rec, vbuf); -+ hit += test_utr(args, rec); -+ hit += test_splice(args, rec); -+ if ( !hit ) test_tscript(args, rec); -+ } -+ else -+ test_symbolic_alt(args, rec); - -- hap_flush(args, rec->pos-1); -- vbuf_flush(args); -+ if ( rec->pos > 0 ) -+ { -+ hap_flush(args, rec->pos-1); -+ vbuf_flush(args, rec->pos-1); -+ } - - return; - } -@@ -3739,6 +4048,7 @@ - " -g, --gff-annot gff3 annotation file\n" - "\n" - "CSQ options:\n" -+ " -b, --brief-predictions annotate with abbreviated protein-changing predictions\n" - " -c, --custom-tag use this tag instead of the default BCSQ\n" - " -l, --local-csq localized predictions, consider only one VCF record at a time\n" - " -n, --ncsq maximum number of consequences to consider per site [16]\n" -@@ -3752,16 +4062,18 @@ - " -e, --exclude exclude sites for which the expression is true\n" - " --force run even if some sanity checks fail\n" - " -i, --include select sites for which the expression is true\n" -+ " --no-version do not append version and command line to the header\n" - " -o, --output write output to a file [standard output]\n" - " -O, --output-type b: compressed BCF, u: uncompressed BCF, z: compressed VCF\n" - " v: uncompressed VCF, t: plain tab-delimited text output [v]\n" -- " -q, --quiet suppress 
warning messages. Can be given two times for even less messages\n" - " -r, --regions restrict to comma-separated list of regions\n" - " -R, --regions-file restrict to regions listed in a file\n" - " -s, --samples <-|list> samples to include or \"-\" to apply all variants and ignore samples\n" - " -S, --samples-file samples to include\n" - " -t, --targets similar to -r but streams rather than index-jumps\n" - " -T, --targets-file similar to -R but streams rather than index-jumps\n" -+ " --threads use multithreading with worker threads [0]\n" -+ " -v, --verbose verbosity level 0-2 [1]\n" - "\n" - "Example:\n" - " bcftools csq -f hs37d5.fa -g Homo_sapiens.GRCh37.82.gff3.gz in.vcf\n" -@@ -3779,12 +4091,16 @@ - args->output_type = FT_VCF; - args->bcsq_tag = "BCSQ"; - args->ncsq_max = 2*16; -+ args->verbosity = 1; -+ args->record_cmd_line = 1; - - static struct option loptions[] = - { - {"force",0,0,1}, -+ {"threads",required_argument,NULL,2}, - {"help",0,0,'h'}, - {"ncsq",1,0,'n'}, -+ {"brief-predictions",0,0,'b'}, - {"custom-tag",1,0,'c'}, - {"local-csq",0,0,'l'}, - {"gff-annot",1,0,'g'}, -@@ -3795,24 +4111,36 @@ - {"output-type",1,NULL,'O'}, - {"phase",1,0,'p'}, - {"quiet",0,0,'q'}, -+ {"verbose",1,0,'v'}, - {"regions",1,0,'r'}, - {"regions-file",1,0,'R'}, - {"samples",1,0,'s'}, - {"samples-file",1,0,'S'}, - {"targets",1,0,'t'}, - {"targets-file",1,0,'T'}, -+ {"no-version",no_argument,NULL,3}, - {0,0,0,0} - }; - int c, targets_is_file = 0, regions_is_file = 0; -- char *targets_list = NULL, *regions_list = NULL; -- while ((c = getopt_long(argc, argv, "?hr:R:t:T:i:e:f:o:O:g:s:S:p:qc:ln:",loptions,NULL)) >= 0) -+ char *targets_list = NULL, *regions_list = NULL, *tmp; -+ while ((c = getopt_long(argc, argv, "?hr:R:t:T:i:e:f:o:O:g:s:S:p:qc:ln:bv:",loptions,NULL)) >= 0) - { - switch (c) - { - case 1 : args->force = 1; break; -+ case 2 : -+ args->n_threads = strtol(optarg,&tmp,10); -+ if ( *tmp ) error("Could not parse argument: --threads %s\n", optarg); -+ break; -+ case 3 : 
args->record_cmd_line = 0; break; -+ case 'b': args->brief_predictions = 1; break; - case 'l': args->local_csq = 1; break; - case 'c': args->bcsq_tag = optarg; break; -- case 'q': args->quiet++; break; -+ case 'q': error("Error: the -q option has been deprecated, use -v, --verbose instead.\n"); break; -+ case 'v': -+ args->verbosity = atoi(optarg); -+ if ( args->verbosity<0 || args->verbosity>2 ) error("Error: expected integer 0-2 with -v, --verbose\n"); -+ break; - case 'p': - switch (optarg[0]) - { -@@ -3869,8 +4197,9 @@ - error("Failed to read the targets: %s\n", targets_list); - if ( regions_list && bcf_sr_set_regions(args->sr, regions_list, regions_is_file)<0 ) - error("Failed to read the regions: %s\n", regions_list); -+ if ( bcf_sr_set_threads(args->sr, args->n_threads)<0 ) error("Failed to create %d extra threads\n", args->n_threads); - if ( !bcf_sr_add_reader(args->sr, fname) ) -- error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->sr->errnum)); -+ error("Failed to read from %s: %s\n", !strcmp("-",fname)?"standard input":fname,bcf_sr_strerror(args->sr->errnum)); - args->hdr = bcf_sr_get_header(args->sr,0); - - init_data(args); -@@ -3883,7 +4212,6 @@ - destroy_data(args); - bcf_sr_destroy(args->sr); - free(args); -- - return 0; - } - ---- python-pysam.orig/bcftools/csq.c.pysam.c -+++ python-pysam/bcftools/csq.c.pysam.c -@@ -1,5 +1,8 @@ - #include "bcftools.pysam.h" - -+//$bt csq -f $ref -g $gff -p r -Ou -o /dev/null /lustre/scratch116/vr/projects/g1k/phase3/release/ALL.chr4.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz -+ -+ - /* The MIT License - - Copyright (c) 2016-2018 Genome Research Ltd. 
-@@ -27,6 +30,7 @@ - */ - /* - Things that would be nice to have -+ - dynamic N_REF_PAD - - for stop-lost events (also in frameshifts) report the number of truncated aa's - - memory could be greatly reduced by indexing gff (but it is quite compact already) - - deletions that go beyond transcript boundaries are not checked at sequence level -@@ -97,6 +101,7 @@ - splice_region_variant .. change within 1-3 bases of the exon or 3-8 bases of the intron - synonymous_variant .. DNA sequence variant resulting in no amino acid change - stop_retained_variant .. different stop codon -+ start_retained_variant .. start codon retained by indel realignment - non_coding_variant .. variant in non-coding sequence, such as RNA gene - 5_prime_UTR_variant - 3_prime_UTR_variant -@@ -135,6 +140,7 @@ - #include - #include - #include -+#include - #include - #include - #include -@@ -144,7 +150,6 @@ - #include - #include - #include --#include - #include - #include "bcftools.h" - #include "filter.h" -@@ -210,13 +215,15 @@ - #define CSQ_UPSTREAM_STOP (1<<19) // adds * in front of the csq string - #define CSQ_INCOMPLETE_CDS (1<<20) // to remove START/STOP in incomplete CDS, see ENSG00000173376/synon.vcf - #define CSQ_CODING_SEQUENCE (1<<21) // cannot tell exactly what it is, but it does affect the coding sequence -+#define CSQ_ELONGATION (1<<22) // symbolic insertion -+#define CSQ_START_RETAINED (1<<23) - - // Haplotype-aware consequences, printed in one vcf record only, the rest has a reference @12345 - #define CSQ_COMPOUND (CSQ_SYNONYMOUS_VARIANT|CSQ_MISSENSE_VARIANT|CSQ_STOP_LOST|CSQ_STOP_GAINED| \ - CSQ_INFRAME_DELETION|CSQ_INFRAME_INSERTION|CSQ_FRAMESHIFT_VARIANT| \ - CSQ_START_LOST|CSQ_STOP_RETAINED|CSQ_INFRAME_ALTERING|CSQ_INCOMPLETE_CDS| \ -- CSQ_UPSTREAM_STOP) --#define CSQ_START_STOP (CSQ_STOP_LOST|CSQ_STOP_GAINED|CSQ_STOP_RETAINED|CSQ_START_LOST) -+ CSQ_UPSTREAM_STOP|CSQ_START_RETAINED) -+#define CSQ_START_STOP 
(CSQ_STOP_LOST|CSQ_STOP_GAINED|CSQ_STOP_RETAINED|CSQ_START_LOST|CSQ_START_RETAINED) - - #define CSQ_PRN_STRAND(csq) ((csq)&CSQ_COMPOUND && !((csq)&(CSQ_SPLICE_ACCEPTOR|CSQ_SPLICE_DONOR|CSQ_SPLICE_REGION))) - #define CSQ_PRN_TSCRIPT (~(CSQ_INTRON|CSQ_NON_CODING)) -@@ -246,7 +253,9 @@ - "inframe_altering", - NULL, - NULL, -- "coding_sequence" -+ "coding_sequence", -+ "feature_elongation", -+ "start_retained" - }; - - -@@ -341,7 +350,7 @@ - typedef struct - { - char *name; // human readable name, e.g. ORF45 -- uint8_t iseq; -+ uint32_t iseq; - } - gf_gene_t; - typedef struct -@@ -394,7 +403,8 @@ - { - bcf1_t *line; - uint32_t *smpl; // bitmask of sample consequences with first/second haplotype interleaved -- uint32_t nfmt:4, nvcsq:28, mvcsq; -+ uint32_t nfmt:4, // the bitmask size (the number of integers per sample) -+ nvcsq:28, mvcsq; - vcsq_t *vcsq; // there can be multiple consequences for a single VCF record - } - vrec_t; -@@ -410,6 +420,7 @@ - { - vrec_t **vrec; // buffer of VCF lines with the same position - int n, m; -+ uint32_t keep_until; // the maximum transcript end position - }; - KHASH_MAP_INIT_INT(pos2vbuf, vbuf_t*) - -@@ -582,9 +593,10 @@ - char *outdir, **argv, *fa_fname, *gff_fname, *output_fname; - char *bcsq_tag; - int argc, output_type; -- int phase, quiet, local_csq; -+ int phase, verbosity, local_csq, record_cmd_line; - int ncsq_max, nfmt_bcsq; // maximum number of csq per site that can be accessed from FORMAT/BCSQ - int ncsq_small_warned; -+ int brief_predictions; - - int rid; // current chromosome - tr_heap_t *active_tr; // heap of active transcripts for quick flushing -@@ -598,6 +610,7 @@ - int ncsq_buf, mcsq_buf; - id_tbl_t tscript_ids; // mapping between transcript id (eg. Zm00001d027245_T001) and a numeric idx - int force; // force run under various conditions. 
Currently only to skip out-of-phase transcripts -+ int n_threads; // extra compression/decompression threads - - faidx_t *fai; - kstring_t str, str2; -@@ -673,7 +686,7 @@ - aux->seq[aux->nseq] = strdup(chr_beg); - iseq = khash_str2int_inc(aux->seq2int, aux->seq[aux->nseq]); - aux->nseq++; -- assert( aux->nseq < 256 ); // see gf_gene_t.iseq -+ assert( aux->nseq < 1<<29 ); // see gf_gene_t.iseq and ftr_t.iseq - } - chr_end[1] = c; - return iseq; -@@ -888,7 +901,7 @@ - int biotype = gff_parse_biotype(ss); - if ( biotype <= 0 ) - { -- if ( !gff_ignored_biotype(args, ss) && args->quiet<2 ) fprintf(bcftools_stderr,"ignored transcript: %s\n",line); -+ if ( !gff_ignored_biotype(args, ss) && args->verbosity > 0 ) fprintf(bcftools_stderr,"ignored transcript: %s\n",line); - return; - } - -@@ -914,7 +927,7 @@ - int biotype = gff_parse_biotype(ss); - if ( biotype <= 0 ) - { -- if ( !gff_ignored_biotype(args, ss) && args->quiet<2 ) fprintf(bcftools_stderr,"ignored gene: %s\n",line); -+ if ( !gff_ignored_biotype(args, ss) && args->verbosity > 0 ) fprintf(bcftools_stderr,"ignored gene: %s\n",line); - return; - } - -@@ -980,7 +993,7 @@ - if ( !ss ) return -1; // no ID, ignore the line - if ( !strncmp("chromosome",ss+3,10) ) return -1; - if ( !strncmp("supercontig",ss+3,11) ) return -1; -- if ( args->quiet<2 ) fprintf(bcftools_stderr,"ignored: %s\n", line); -+ if ( args->verbosity > 0 ) fprintf(bcftools_stderr,"ignored: %s\n", line); - return -1; - } - -@@ -1002,7 +1015,7 @@ - // 7. column: strand - if ( *ss == '+' ) ftr->strand = STRAND_FWD; - else if ( *ss == '-' ) ftr->strand = STRAND_REV; -- else { if ( args->quiet<2 ) fprintf(bcftools_stderr,"Skipping unknown strand: %c\n", *ss); return -1; } -+ else { if ( args->verbosity > 0 ) fprintf(bcftools_stderr,"Skipping unknown strand: %c\n", *ss); return -1; } - ss += 2; - - // 8. column: phase (codon offset) -@@ -1010,7 +1023,7 @@ - else if ( *ss == '1' ) ftr->phase = 1; - else if ( *ss == '2' ) ftr->phase = 2; - else if ( *ss == '.' 
) ftr->phase = 0; // exons do not have phase -- else { if ( args->quiet<2 ) fprintf(bcftools_stderr,"Skipping unknown phase: %c, %s\n", *ss, line); return -1; } -+ else { if ( args->verbosity > 0 ) fprintf(bcftools_stderr,"Skipping unknown phase: %c, %s\n", *ss, line); return -1; } - ss += 2; - - // substring search for "Parent=transcript:ENST00000437963" -@@ -1124,7 +1137,7 @@ - { - if ( args->force ) - { -- if ( args->quiet < 2 ) -+ if ( args->verbosity > 0 ) - fprintf(bcftools_stderr,"Warning: GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d)\n",args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len); - tscript_ok = 0; - break; -@@ -1162,7 +1175,7 @@ - { - if ( args->force ) - { -- if ( args->quiet < 2 ) -+ if ( args->verbosity > 0 ) - fprintf(bcftools_stderr,"Warning: GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d)\n",args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len); - tscript_ok = 0; - break; -@@ -1295,7 +1308,7 @@ - } - tscript_init_cds(args); - -- if ( !args->quiet ) -+ if ( args->verbosity > 0 ) - { - fprintf(bcftools_stderr,"Indexed %d transcripts, %d exons, %d CDSs, %d UTRs\n", - regidx_nregs(args->idx_tscript), -@@ -1311,14 +1324,16 @@ - free(aux->seq); - gff_id_destroy(&aux->gene_ids); - -- if ( args->quiet<2 && khash_str2int_size(aux->ignored_biotypes) ) -+ if ( args->verbosity > 0 && khash_str2int_size(aux->ignored_biotypes) ) - { - khash_t(str2int) *ign = (khash_t(str2int)*)aux->ignored_biotypes; - fprintf(bcftools_stderr,"Ignored the following biotypes:\n"); - for (i = kh_begin(ign); i < kh_end(ign); i++) - { - if ( !kh_exist(ign,i)) continue; -- fprintf(bcftools_stderr,"\t%dx\t.. %s\n", kh_value(ign,i), kh_key(ign,i)); -+ const char *biotype = kh_key(ign,i); -+ if ( !strcmp(biotype,"TCE") ) biotype = "TCE (\"To be Experimentally Confirmed\")"; -+ fprintf(bcftools_stderr,"\t%dx\t.. 
%s\n", kh_value(ign,i), biotype); - } - } - khash_str2int_destroy_free(aux->ignored_biotypes); -@@ -1328,7 +1343,7 @@ - { - args->nfmt_bcsq = 1 + (args->ncsq_max - 1) / 32; - -- if ( !args->quiet ) fprintf(bcftools_stderr,"Parsing %s ...\n", args->gff_fname); -+ if ( args->verbosity > 0 ) fprintf(bcftools_stderr,"Parsing %s ...\n", args->gff_fname); - init_gff(args); - - args->rid = -1; -@@ -1351,7 +1366,8 @@ - if ( args->output_type==FT_TAB_TEXT ) - { - // significant speedup for plain VCFs -- bcf_hdr_set_samples(args->hdr,NULL,0); -+ if (bcf_hdr_set_samples(args->hdr,NULL,0) < 0) -+ error_errno("[%s] Couldn't build sample filter", __func__); - } - args->phase = PHASE_DROP_GT; - } -@@ -1362,7 +1378,7 @@ - if ( args->output_type==FT_TAB_TEXT ) - { - args->out = args->output_fname ? fopen(args->output_fname,"w") : bcftools_stdout; -- if ( !args->out ) error("Failed to open %s: %s\n", args->output_fname,strerror(errno)); -+ if ( !args->out ) error("Failed to write to %s: %s\n", !strcmp("-",args->output_fname)?"standard output":args->output_fname,strerror(errno)); - - fprintf(args->out,"# This file was produced by: bcftools +csq(%s+htslib-%s)\n", bcftools_version(),hts_version()); - fprintf(args->out,"# The command line was:\tbcftools +%s", args->argv[0]); -@@ -1382,14 +1398,16 @@ - else - { - args->out_fh = hts_open(args->output_fname? args->output_fname : "-",hts_bcf_wmode(args->output_type)); -- if ( args->out_fh == NULL ) error("Can't write to %s: %s\n", args->output_fname? args->output_fname : "standard output", strerror(errno)); -- bcf_hdr_append_version(args->hdr,args->argc,args->argv,"bcftools/csq"); -- bcf_hdr_printf(args->hdr,"##INFO=",args->bcsq_tag, args->local_csq ? "Local" : "Haplotype-aware"); -+ if ( args->out_fh == NULL ) error("[%s] Error: cannot write to %s: %s\n", __func__,args->output_fname? 
args->output_fname : "standard output", strerror(errno)); -+ if ( args->n_threads > 0) -+ hts_set_opt(args->out_fh, HTS_OPT_THREAD_POOL, args->sr->p); -+ if ( args->record_cmd_line ) bcf_hdr_append_version(args->hdr,args->argc,args->argv,"bcftools/csq"); -+ bcf_hdr_printf(args->hdr,"##INFO=",args->bcsq_tag, args->local_csq ? "Local" : "Haplotype-aware"); - if ( args->hdr_nsmpl ) - bcf_hdr_printf(args->hdr,"##FORMAT=",args->bcsq_tag); -- bcf_hdr_write(args->out_fh, args->hdr); -+ if ( bcf_hdr_write(args->out_fh, args->hdr)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->output_fname?args->output_fname:"standard output"); - } -- if ( !args->quiet ) fprintf(bcftools_stderr,"Calling...\n"); -+ if ( args->verbosity > 0 ) fprintf(bcftools_stderr,"Calling...\n"); - } - - void destroy_data(args_t *args) -@@ -1489,6 +1507,7 @@ - splice->vcf.pos = rec->pos; - splice->vcf.rlen = rec->rlen; - splice->vcf.ref = rec->d.allele[0]; -+ splice->csq = 0; - } - static inline void splice_build_hap(splice_t *splice, uint32_t beg, int len) - { -@@ -1596,7 +1615,7 @@ - #endif - } - void csq_stage(args_t *args, csq_t *csq, bcf1_t *rec); --static inline int csq_stage_utr(args_t *args, regitr_t *itr, bcf1_t *rec, uint32_t trid) -+static inline int csq_stage_utr(args_t *args, regitr_t *itr, bcf1_t *rec, uint32_t trid, uint32_t type) - { - while ( regitr_overlap(itr) ) - { -@@ -1606,7 +1625,7 @@ - csq_t csq; - memset(&csq, 0, sizeof(csq_t)); - csq.pos = rec->pos; -- csq.type.type = utr->which==prime5 ? CSQ_UTR5 : CSQ_UTR3; -+ csq.type.type = (utr->which==prime5 ? 
CSQ_UTR5 : CSQ_UTR3) | type; - csq.type.biotype = tr->type; - csq.type.strand = tr->strand; - csq.type.trid = tr->id; -@@ -1660,7 +1679,7 @@ - const char *chr = bcf_seqname(args->hdr,splice->vcf.rec); - if ( regidx_overlap(args->idx_utr,chr,splice->ref_beg+1,splice->ref_beg+1, itr) ) // adjacent utr - { -- ret = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id); -+ ret = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq); - if ( ret!=0 ) - { - regitr_destroy(itr); -@@ -1698,7 +1717,7 @@ - const char *chr = bcf_seqname(args->hdr,splice->vcf.rec); - if ( regidx_overlap(args->idx_utr,chr,splice->ref_end-1,splice->ref_end-1, itr) ) // adjacent utr - { -- ret = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id); -+ ret = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq); - if ( ret!=0 ) - { - regitr_destroy(itr); -@@ -1765,14 +1784,105 @@ - return SPLICE_INSIDE; - } - -+int shifted_del_synonymous(args_t *args, splice_t *splice, uint32_t ex_beg, uint32_t ex_end) -+{ -+ static int small_ref_padding_warned = 0; -+ tscript_t *tr = splice->tr; -+ -+ // We know the VCF record overlaps the exon, but does it overlap the start codon? -+ if ( tr->strand==STRAND_REV && splice->vcf.pos + splice->vcf.rlen + 2 <= ex_end ) return 0; -+ if ( tr->strand==STRAND_FWD && splice->vcf.pos >= ex_beg + 3 ) return 0; -+ -+#if XDBG -+ fprintf(bcftools_stderr,"shifted_del_synonymous: %d-%d %s\n",ex_beg,ex_end, tr->strand==STRAND_FWD?"fwd":"rev"); -+ fprintf(bcftools_stderr," %d .. %s > %s\n",splice->vcf.pos+1,splice->vcf.ref,splice->vcf.alt); -+#endif -+ -+ // is there enough ref sequence for the extension? 
All coordinates are 0-based -+ int ref_len = strlen(splice->vcf.ref); -+ int alt_len = strlen(splice->vcf.alt); -+ assert( ref_len > alt_len ); -+ int ndel = ref_len - alt_len; -+ -+ if ( tr->strand==STRAND_REV ) -+ { -+ int32_t vcf_ref_end = splice->vcf.pos + ref_len - 1; // end pos of the VCF REF allele -+ int32_t tr_ref_end = splice->tr->end + N_REF_PAD; // the end pos of accessible cached ref seq -+ if ( vcf_ref_end + ndel > tr_ref_end ) -+ { -+ if ( !small_ref_padding_warned ) -+ { -+ fprintf(bcftools_stderr,"Warning: Could not verify synonymous start/stop at %s:%d due to small N_REF_PAD. (Improve me?)\n",bcf_seqname(args->hdr,splice->vcf.rec),splice->vcf.pos+1); -+ small_ref_padding_warned = 1; -+ } -+ return 0; -+ } -+ -+ char *ptr_vcf = splice->vcf.ref + alt_len; // the first deleted base in the VCF REF allele -+ char *ptr_ref = splice->tr->ref + N_REF_PAD + (vcf_ref_end + 1 - splice->tr->beg); // the first ref base after the ndel bases deleted -+#if XDBG -+ fprintf(bcftools_stderr,"vcf: %s\nref: %s\n",ptr_vcf,ptr_ref); -+#endif -+ int i = 0; -+ while ( ptr_vcf[i] && ptr_vcf[i]==ptr_ref[i] ) i++; -+ if ( ptr_vcf[i] ) return 0; // the deleted sequence cannot be replaced -+ } -+ else -+ { -+ // STRAND_FWD -+ int32_t vcf_block_beg = splice->vcf.pos + ref_len - 2*ndel; // the position of the first base of the ref block that could potentially replace the deletion -+ if ( vcf_block_beg < 0 ) return 0; -+ -+#if XDBG -+ fprintf(bcftools_stderr,"vcf_block_beg: %d\n",vcf_block_beg+1); -+#endif -+ -+ if ( N_REF_PAD + vcf_block_beg < ex_beg ) -+ { -+ if ( !small_ref_padding_warned ) -+ { -+ fprintf(bcftools_stderr,"Warning: Could not verify synonymous start/stop at %s:%d due to small N_REF_PAD. 
(Improve me?)\n",bcf_seqname(args->hdr,splice->vcf.rec),splice->vcf.pos+1); -+ small_ref_padding_warned = 1; -+ } -+ return 0; -+ } -+ -+ char *ptr_vcf = splice->vcf.ref + alt_len; // the first deleted base in the VCF REF allele -+ char *ptr_ref = splice->tr->ref + N_REF_PAD + vcf_block_beg - splice->tr->beg; // the replacement ref block -+#if XDBG -+ fprintf(bcftools_stderr,"vcf: %s\nref: %s\n",ptr_vcf,ptr_ref); -+#endif -+ -+ int i = 0; -+ while ( ptr_vcf[i] && ptr_vcf[i]==ptr_ref[i] ) i++; -+ if ( ptr_vcf[i] ) return 0; // the deleted sequence cannot be replaced -+ } -+ -+ return 1; -+} -+ - static inline int splice_csq_del(args_t *args, splice_t *splice, uint32_t ex_beg, uint32_t ex_end) - { -+ if ( splice->check_start ) -+ { -+ // check for synonymous start -+ // test/csq/ENST00000375992/incorrect-synon-del-not-start-lost.txt -+ // test/csq/ENST00000368801.2/start-lost.txt -+ // test/csq/ENST00000318249.2/synonymous-start-lost.txt -+ int is_synonymous = shifted_del_synonymous(args, splice, ex_beg, ex_end); -+ if ( is_synonymous ) -+ { -+ splice->csq |= CSQ_START_RETAINED; -+ return SPLICE_OVERLAP; -+ } -+ } -+ - // coordinates that matter for consequences, eg AC>ACG trimmed to C>CG - splice->ref_beg = splice->vcf.pos + splice->tbeg - 1; // 1b before the deleted base - splice->ref_end = splice->vcf.pos + splice->vcf.rlen - splice->tend - 1; // the last deleted base - - #if XDBG --fprintf(bcftools_stderr,"del: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%d check_utr=%d start,stop,beg,end=%d,%d,%d,%d\n", splice->vcf.ref,splice->vcf.alt,ex_beg,ex_end,splice->ref_beg,splice->ref_end,splice->tbeg,splice->tend,splice->check_utr,splice->check_start,splice->check_stop,splice->check_region_beg,splice->check_region_end); -+fprintf(bcftools_stderr,"splice_csq_del: %s>%s .. 
ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%d check_utr=%d start,stop,beg,end=%d,%d,%d,%d\n", splice->vcf.ref,splice->vcf.alt,ex_beg,ex_end,splice->ref_beg,splice->ref_end,splice->tbeg,splice->tend,splice->check_utr,splice->check_start,splice->check_stop,splice->check_region_beg,splice->check_region_end); - #endif - - if ( splice->ref_beg + 1 < ex_beg ) // the part before the exon; ref_beg is off by -1 -@@ -1785,7 +1895,7 @@ - regitr_t *itr = regitr_init(NULL); - const char *chr = bcf_seqname(args->hdr,splice->vcf.rec); - if ( regidx_overlap(args->idx_utr,chr,splice->ref_beg,ex_beg-1, itr) ) // adjacent utr -- csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id); -+ csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq); - regitr_destroy(itr); - } - if ( !csq ) -@@ -1841,7 +1951,7 @@ - regitr_t *itr = regitr_init(NULL); - const char *chr = bcf_seqname(args->hdr,splice->vcf.rec); - if ( regidx_overlap(args->idx_utr,chr,ex_end+1,splice->ref_end, itr) ) // adjacent utr -- csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id); -+ csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq); - regitr_destroy(itr); - } - if ( !csq ) -@@ -1876,7 +1986,6 @@ - csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq); - return SPLICE_OUTSIDE; - } -- - if ( splice->ref_beg < ex_beg + 2 ) // ref_beg is off by -1 - { - if ( splice->check_region_beg ) splice->csq |= CSQ_SPLICE_REGION; -@@ -1931,7 +2040,7 @@ - regitr_t *itr = regitr_init(NULL); - const char *chr = bcf_seqname(args->hdr,splice->vcf.rec); - if ( regidx_overlap(args->idx_utr,chr,splice->ref_beg,ex_beg-1, itr) ) // adjacent utr -- csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id); -+ csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq); - regitr_destroy(itr); - } - if ( !csq ) -@@ -1961,7 +2070,7 @@ - regitr_t *itr = regitr_init(NULL); - const char *chr = bcf_seqname(args->hdr,splice->vcf.rec); - if ( 
regidx_overlap(args->idx_utr,chr,ex_end+1,splice->ref_end, itr) ) // adjacent utr -- csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id); -+ csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq); - regitr_destroy(itr); - } - if ( !csq ) -@@ -2010,7 +2119,6 @@ - } - static inline int splice_csq(args_t *args, splice_t *splice, uint32_t ex_beg, uint32_t ex_end) - { -- splice->csq = 0; - splice->vcf.alen = strlen(splice->vcf.alt); - - int rlen1 = splice->vcf.rlen - 1, alen1 = splice->vcf.alen - 1, i = 0; -@@ -2040,6 +2148,7 @@ - return 0; - } - -+ - // return value: 0 added, 1 overlapping variant, 2 silent discard (intronic,alt=ref) - int hap_init(args_t *args, hap_node_t *parent, hap_node_t *child, gf_cds_t *cds, bcf1_t *rec, int ial) - { -@@ -2072,7 +2181,7 @@ - if ( child->icds!=tr->ncds-1 ) splice.check_region_end = 1; - - #if XDBG --fprintf(bcftools_stderr,"\n%d [%s][%s] check start:%d,stop:%d\n",splice.vcf.pos+1,splice.vcf.ref,splice.vcf.alt,splice.check_start,splice.check_stop); -+fprintf(bcftools_stderr,"\nhap_init: %d [%s][%s] check start:%d,stop:%d\n",splice.vcf.pos+1,splice.vcf.ref,splice.vcf.alt,splice.check_start,splice.check_stop); - #endif - int ret = splice_csq(args, &splice, cds->beg, cds->beg + cds->len - 1); - #if XDBG -@@ -2080,7 +2189,7 @@ - #endif - - if ( ret==SPLICE_VAR_REF ) return 2; // not a variant, eg REF=CA ALT=CA -- if ( ret==SPLICE_OUTSIDE || ret==SPLICE_OVERLAP ) // not a coding csq -+ if ( ret==SPLICE_OUTSIDE || ret==SPLICE_OVERLAP || splice.csq==CSQ_START_LOST ) // not a coding csq - { - free(splice.kref.s); - free(splice.kalt.s); -@@ -2138,6 +2247,8 @@ - if ( len < 0 ) // overlapping variants - { - free(str.s); -+ free(splice.kref.s); -+ free(splice.kalt.s); - return 1; - } - kputsn_(tr->ref + N_REF_PAD + parent->rbeg + parent->rlen - tr->beg, len, &str); -@@ -2175,6 +2286,7 @@ - if ( !child->csq ) child->csq |= CSQ_CODING_SEQUENCE; // hack, specifically for ENST00000390520/deletion-overlap.vcf - } 
- -+ - free(splice.kref.s); - free(splice.kalt.s); - return 0; -@@ -2208,7 +2320,7 @@ - void cds_translate(kstring_t *_ref, kstring_t *_seq, uint32_t sbeg, uint32_t rbeg, uint32_t rend, int strand, kstring_t *tseq, int fill) - { - #if XDBG --fprintf(bcftools_stderr,"translate: %d %d %d fill=%d seq.l=%d\n",sbeg,rbeg,rend,fill,(int)_seq->l); -+fprintf(bcftools_stderr,"\ntranslate: %d %d %d fill=%d seq.l=%d\n",sbeg,rbeg,rend,fill,(int)_seq->l); - #endif - char tmp[3], *codon, *end; - int i, len, npad; -@@ -2308,7 +2420,7 @@ - #if DBG>1 - fprintf(bcftools_stderr," npad: %d\n",npad); - #endif --if ( !(npad>=0 && sbeg+seq.l+npad<=seq.m) ) fprintf(bcftools_stderr,"sbeg=%d seq.l=%d seq.m=%d\n",sbeg,(int)seq.l,(int)seq.m); -+ if ( !(npad>=0 && sbeg+seq.l+npad<=seq.m) ) fprintf(bcftools_stderr,"sbeg=%d seq.l=%d seq.m=%d npad=%d\n",sbeg,(int)seq.l,(int)seq.m,npad); - assert( npad>=0 && sbeg+seq.l+npad<=seq.m ); // todo: first codon on the rev strand - - if ( npad==2 ) -@@ -2329,8 +2441,8 @@ - for (; i>=0 && end>seq.s; i--) tmp[i] = *(--end); - #if DBG>1 - fprintf(bcftools_stderr,"\t i=%d\n", i); -- if(i==1)fprintf(bcftools_stderr,"[0] %c\n",tmp[2]); -- if(i==0)fprintf(bcftools_stderr,"[0] %c%c\n",tmp[1],tmp[2]); -+ if(i==1)fprintf(bcftools_stderr,"[0] %c\n",tmp[2]); -+ if(i==0)fprintf(bcftools_stderr,"[0] %c%c\n",tmp[1],tmp[2]); - #endif - if ( i==-1 ) - { -@@ -2571,12 +2683,25 @@ - kputs(csq->vstr.s, str); - } - -+void kprint_aa_prediction(args_t *args, int beg, kstring_t *aa, kstring_t *str) -+{ -+ if ( !args->brief_predictions ) -+ kputs(aa->s, str); -+ else -+ { -+ int len = aa->l; -+ if ( aa->s[len-1]=='*' ) len--; -+ kputc(aa->s[0], str); -+ kputs("..", str); -+ kputw(beg+len, str); -+ } -+} -+ - void hap_add_csq(args_t *args, hap_t *hap, hap_node_t *node, int tlen, int ibeg, int iend, int dlen, int indel) - { - int i; - tscript_t *tr = hap->tr; - int ref_node = tr->strand==STRAND_FWD ? 
ibeg : iend; -- - int icsq = node->ncsq_list++; - hts_expand0(csq_t,node->ncsq_list,node->mcsq_list,node->csq_list); - csq_t *csq = &node->csq_list[icsq]; -@@ -2680,12 +2805,12 @@ - int aa_sbeg = tr->strand==STRAND_FWD ? node2sbeg(ibeg)/3+1 : (tlen - node2send(iend))/3+1; - kputc_('|', &str); - kputw(aa_rbeg, &str); -- kputs(hap->tref.s, &str); -+ kprint_aa_prediction(args,aa_rbeg,&hap->tref,&str); - if ( !(csq->type.type & CSQ_SYNONYMOUS_VARIANT) ) - { - kputc_('>', &str); - kputw(aa_sbeg, &str); -- kputs(hap->tseq.s, &str); -+ kprint_aa_prediction(args,aa_sbeg,&hap->tseq,&str); - } - kputc_('|', &str); - -@@ -2963,18 +3088,15 @@ - int icsq = 2*csq->idx + ihap; - if ( icsq >= args->ncsq_max ) // more than ncsq_max consequences, so can't fit it in FMT - { -- int print_warning = 1; -- if ( args->quiet ) -+ if ( args->verbosity && (!args->ncsq_small_warned || args->verbosity > 1) ) - { -- if ( args->quiet > 1 || args->ncsq_small_warned ) print_warning = 0; -+ fprintf(bcftools_stderr, -+ "Warning: Too many consequences for sample %s at %s:%"PRId64", keeping the first %d and skipping the rest.\n", -+ args->hdr->samples[ismpl],bcf_hdr_id2name(args->hdr,args->rid),(int64_t) vrec->line->pos+1,csq->idx); -+ if ( !args->ncsq_small_warned ) -+ fprintf(bcftools_stderr," The limit can be increased by setting the --ncsq parameter. 
This warning is printed only once.\n"); - args->ncsq_small_warned = 1; - } -- if ( print_warning ) -- { -- fprintf(bcftools_stderr,"Warning: --ncsq %d is too small to annotate %s at %s:%d with %d-th csq\n", -- args->ncsq_max/2,args->hdr->samples[ismpl],bcf_hdr_id2name(args->hdr,args->rid),vrec->line->pos+1,csq->idx+1); -- if ( args->quiet ) fprintf(bcftools_stderr,"(This warning is printed only once)\n"); -- } - break; - } - if ( vrec->nfmt < 1 + icsq/32 ) vrec->nfmt = 1 + icsq/32; -@@ -2986,12 +3108,10 @@ - { - int i,j; - tr_heap_t *heap = args->active_tr; -- - while ( heap->ndat && heap->dat[0]->end<=pos ) - { - tscript_t *tr = heap->dat[0]; - khp_delete(trhp, heap); -- - args->hap->tr = tr; - if ( tr->root && tr->root->nchild ) // normal, non-localized calling - { -@@ -3030,7 +3150,7 @@ - - #define SWAP(type_t, a, b) { type_t t = a; a = b; b = t; } - --void vbuf_push(args_t *args, bcf1_t **rec_ptr) -+vbuf_t *vbuf_push(args_t *args, bcf1_t **rec_ptr) - { - int i; - -@@ -3046,6 +3166,7 @@ - i = rbuf_append(&args->vcf_rbuf); - if ( !args->vcf_buf[i] ) args->vcf_buf[i] = (vbuf_t*) calloc(1,sizeof(vbuf_t)); - args->vcf_buf[i]->n = 0; -+ args->vcf_buf[i]->keep_until = 0; - } - vbuf_t *vbuf = args->vcf_buf[i]; - vbuf->n++; -@@ -3065,16 +3186,29 @@ - int ret; - khint_t k = kh_put(pos2vbuf, args->pos2vbuf, (int)rec->pos, &ret); - kh_val(args->pos2vbuf,k) = vbuf; -+ -+ return vbuf; - } - --void vbuf_flush(args_t *args) -+void vbuf_flush(args_t *args, uint32_t pos) - { -- if ( args->active_tr->ndat ) return; // cannot output buffered VCF lines (args.vbuf) until all active transcripts are gone -- - int i,j; -- while ( (i=rbuf_shift(&args->vcf_rbuf))>=0 ) -+ while ( args->vcf_rbuf.n ) - { -- vbuf_t *vbuf = args->vcf_buf[i]; -+ vbuf_t *vbuf; -+ if ( !args->local_csq && args->active_tr->ndat ) -+ { -+ // check if the first active transcript starts beyond the first buffered VCF record, -+ // cannot output buffered VCF lines (args.vbuf) until the active transcripts are gone -+ 
vbuf = args->vcf_buf[ args->vcf_rbuf.f ]; -+ if ( vbuf->keep_until > pos ) break; -+ assert( vbuf->n ); -+ } -+ -+ i = rbuf_shift(&args->vcf_rbuf); -+ assert( i>=0 ); -+ vbuf = args->vcf_buf[i]; -+ int pos = vbuf->n ? vbuf->vrec[0]->line->pos : -1; - for (i=0; in; i++) - { - vrec_t *vrec = vbuf->vrec[i]; -@@ -3085,7 +3219,10 @@ - } - if ( !vrec->nvcsq ) - { -- bcf_write(args->out_fh, args->hdr, vrec->line); -+ if ( bcf_write(args->out_fh, args->hdr, vrec->line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname?args->output_fname:"standard output"); -+ int save_pos = vrec->line->pos; -+ bcf_empty(vrec->line); -+ vrec->line->pos = save_pos; // this is necessary for compound variants - continue; - } - -@@ -3100,19 +3237,24 @@ - if ( args->hdr_nsmpl ) - { - if ( vrec->nfmt < args->nfmt_bcsq ) -- for (j=1; jhdr_nsmpl; j++) memcpy(vrec->smpl+j*vrec->nfmt, vrec->smpl+j*args->nfmt_bcsq, vrec->nfmt*sizeof(*vrec->smpl)); -+ for (j=1; jhdr_nsmpl; j++) -+ memmove(&vrec->smpl[j*vrec->nfmt], &vrec->smpl[j*args->nfmt_bcsq], vrec->nfmt*sizeof(*vrec->smpl)); - bcf_update_format_int32(args->hdr, vrec->line, args->bcsq_tag, vrec->smpl, args->hdr_nsmpl*vrec->nfmt); - } - vrec->nvcsq = 0; -- bcf_write(args->out_fh, args->hdr, vrec->line); -+ if ( bcf_write(args->out_fh, args->hdr, vrec->line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname?args->output_fname:"standard output"); -+ int save_pos = vrec->line->pos; -+ bcf_empty(vrec->line); -+ vrec->line->pos = save_pos; - } -- if ( vbuf->n ) -+ if ( pos!=-1 ) - { -- khint_t k = kh_get(pos2vbuf, args->pos2vbuf, vbuf->vrec[0]->line->pos); -+ khint_t k = kh_get(pos2vbuf, args->pos2vbuf, pos); - if ( k != kh_end(args->pos2vbuf) ) kh_del(pos2vbuf, args->pos2vbuf, k); - } - vbuf->n = 0; - } -+ if ( args->active_tr->ndat ) return; - - for (i=0; inrm_tr; i++) - { -@@ -3139,10 +3281,12 @@ - int pad_end = len - (tr->end - tr->beg + 1 + pad_beg); - if ( pad_beg + pad_end != 2*N_REF_PAD ) - { -- 
char *ref = (char*) malloc(tr->end - tr->beg + 1 + 2*N_REF_PAD); -+ char *ref = (char*) malloc(tr->end - tr->beg + 1 + 2*N_REF_PAD + 1); - for (i=0; i < N_REF_PAD - pad_beg; i++) ref[i] = 'N'; - memcpy(ref+i, tr->ref, len); -+ len += i; - for (i=0; i < N_REF_PAD - pad_end; i++) ref[i+len] = 'N'; -+ ref[i+len] = 0; - free(tr->ref); - tr->ref = ref; - } -@@ -3150,15 +3294,19 @@ - - static void sanity_check_ref(args_t *args, tscript_t *tr, bcf1_t *rec) - { -- char *ref = tr->ref + (rec->pos + N_REF_PAD >= tr->beg ? rec->pos - tr->beg + N_REF_PAD : 0); -- char *vcf = rec->d.allele[0] + (rec->pos + N_REF_PAD >= tr->beg ? 0 : tr->beg - N_REF_PAD - rec->pos); -- assert( vcf - rec->d.allele[0] < strlen(rec->d.allele[0]) ); -- while ( *ref && *vcf ) -- { -- if ( *ref!=*vcf && toupper(*ref)!=toupper(*vcf) ) -- error("Error: the fasta reference does not match the VCF REF allele at %s:%d .. %s\n", bcf_seqname(args->hdr,rec),rec->pos+1,rec->d.allele[0]); -- ref++; -- vcf++; -+ int vbeg = 0; -+ int rbeg = rec->pos - tr->beg + N_REF_PAD; -+ if ( rbeg < 0 ) { vbeg += abs(rbeg); rbeg = 0; } -+ char *ref = tr->ref + rbeg; -+ char *vcf = rec->d.allele[0] + vbeg; -+ assert( vcf - rec->d.allele[0] < strlen(rec->d.allele[0]) && ref - tr->ref < tr->end - tr->beg + 2*N_REF_PAD ); -+ int i = 0; -+ while ( ref[i] && vcf[i] ) -+ { -+ if ( ref[i]!=vcf[i] && toupper(ref[i])!=toupper(vcf[i]) ) -+ error("Error: the fasta reference does not match the VCF REF allele at %s:%"PRId64" .. fasta=%c vcf=%c\n", -+ bcf_seqname(args->hdr,rec),(int64_t) rec->pos+vbeg+1,ref[i],vcf[i]); -+ i++; - } - } - -@@ -3197,6 +3345,7 @@ - - for (i=1; in_allele; i++) - { -+ if ( rec->d.allele[i][0]=='<' || rec->d.allele[i][0]=='*' ) { continue; } - if ( hap_init(args, &root, &node, cds, rec, i)!=0 ) continue; - - csq_t csq; -@@ -3296,12 +3445,12 @@ - int aa_sbeg = tr->strand==STRAND_FWD ? 
node.sbeg/3+1 : (tr->nsref - 2*N_REF_PAD + node.dlen - node.sbeg - alen)/3+1; - kputc_('|', &str); - kputw(aa_rbeg, &str); -- kputs(tref->s, &str); -+ kprint_aa_prediction(args,aa_rbeg,tref,&str); - if ( !(csq_type & CSQ_SYNONYMOUS_VARIANT) ) - { - kputc_('>', &str); - kputw(aa_sbeg, &str); -- kputs(tseq->s, &str); -+ kprint_aa_prediction(args,aa_sbeg,tseq,&str); - } - kputc_('|', &str); - kputw(rec->pos+1, &str); -@@ -3332,8 +3481,10 @@ - return ret; - } - --int test_cds(args_t *args, bcf1_t *rec) -+int test_cds(args_t *args, bcf1_t *rec, vbuf_t *vbuf) - { -+ static int overlaps_warned = 0, multiploid_warned = 0; -+ - int i, ret = 0, hap_ret; - const char *chr = bcf_seqname(args->hdr,rec); - // note that the off-by-one extension of rlen is deliberate to account for insertions -@@ -3343,6 +3494,7 @@ - gf_cds_t *cds = regitr_payload(args->itr,gf_cds_t*); - tscript_t *tr = cds->tr; - if ( !GF_is_coding(tr->type) ) continue; -+ if ( vbuf->keep_until < tr->end ) vbuf->keep_until = tr->end; - ret = 1; - if ( !tr->root ) - { -@@ -3372,10 +3524,17 @@ - // overlapping or intron variant, cannot apply - if ( hap_ret==1 ) - { -- if ( !args->quiet ) -- fprintf(bcftools_stderr,"Warning: Skipping overlapping variants at %s:%d\t%s>%s\n", chr,rec->pos+1,rec->d.allele[0],rec->d.allele[1]); -+ if ( args->verbosity && (!overlaps_warned || args->verbosity > 1) ) -+ { -+ fprintf(bcftools_stderr, -+ "Warning: Skipping overlapping variants at %s:%"PRId64"\t%s>%s.\n", -+ chr,(int64_t) rec->pos+1,rec->d.allele[0],rec->d.allele[1]); -+ if ( !overlaps_warned ) -+ fprintf(bcftools_stderr," This message is printed only once, the verbosity can be increased with `--verbose 2`\n"); -+ overlaps_warned = 1; -+ } - if ( args->out ) -- fprintf(args->out,"LOG\tWarning: Skipping overlapping variants at %s:%d\t%s>%s\n", chr,rec->pos+1,rec->d.allele[0],rec->d.allele[1]); -+ fprintf(args->out,"LOG\tWarning: Skipping overlapping variants at %s:%"PRId64"\t%s>%s\n", chr,(int64_t) 
rec->pos+1,rec->d.allele[0],rec->d.allele[1]); - } - else ret = 1; // prevent reporting as intron in test_tscript - hap_destroy(child); -@@ -3411,10 +3570,17 @@ - ngts /= bcf_hdr_nsamples(args->hdr); - if ( ngts!=1 && ngts!=2 ) - { -- if ( !args->quiet ) -- fprintf(bcftools_stderr,"Warning: Skipping site with non-diploid/non-haploid genotypes at %s:%d\t%s>%s\n", chr,rec->pos+1,rec->d.allele[0],rec->d.allele[1]); -+ if ( args->verbosity && (!multiploid_warned || args->verbosity > 1) ) -+ { -+ fprintf(bcftools_stderr, -+ "Warning: Skipping site with non-diploid/non-haploid genotypes at %s:%"PRId64"\t%s>%s.\n", -+ chr,(int64_t) rec->pos+1,rec->d.allele[0],rec->d.allele[1]); -+ if ( !multiploid_warned ) -+ fprintf(bcftools_stderr," This message is printed only once, the verbosity can be increased with `--verbose 2`\n"); -+ multiploid_warned = 1; -+ } - if ( args->out ) -- fprintf(args->out,"LOG\tWarning: Skipping site with non-diploid/non-haploid genotypes at %s:%d\t%s>%s\n", chr,rec->pos+1,rec->d.allele[0],rec->d.allele[1]); -+ fprintf(args->out,"LOG\tWarning: Skipping site with non-diploid/non-haploid genotypes at %s:%"PRId64"\t%s>%s\n", chr,(int64_t) rec->pos+1,rec->d.allele[0],rec->d.allele[1]); - continue; - } - for (ismpl=0; ismplsmpl->n; ismpl++) -@@ -3431,7 +3597,7 @@ - if ( !bcf_gt_is_phased(gt[0]) && !bcf_gt_is_phased(gt[1]) ) - { - if ( args->phase==PHASE_REQUIRE ) -- error("Unphased heterozygous genotype at %s:%d, sample %s. See the --phase option.\n", chr,rec->pos+1,args->hdr->samples[args->smpl->idx[ismpl]]); -+ error("Unphased heterozygous genotype at %s:%"PRId64", sample %s. 
See the --phase option.\n", chr,(int64_t) rec->pos+1,args->hdr->samples[args->smpl->idx[ismpl]]); - if ( args->phase==PHASE_SKIP ) - continue; - if ( args->phase==PHASE_NON_REF ) -@@ -3470,12 +3636,18 @@ - // overlapping or intron variant, cannot apply - if ( hap_ret==1 ) - { -- if ( !args->quiet ) -- fprintf(bcftools_stderr,"Warning: Skipping overlapping variants at %s:%d, sample %s\t%s>%s\n", -- chr,rec->pos+1,args->hdr->samples[args->smpl->idx[ismpl]],rec->d.allele[0],rec->d.allele[ial]); -+ if ( args->verbosity && (!overlaps_warned || args->verbosity > 1) ) -+ { -+ fprintf(bcftools_stderr, -+ "Warning: Skipping overlapping variants at %s:%"PRId64", sample %s\t%s>%s.\n", -+ chr,(int64_t) rec->pos+1,args->hdr->samples[args->smpl->idx[ismpl]],rec->d.allele[0],rec->d.allele[ial]); -+ if ( !overlaps_warned ) -+ fprintf(bcftools_stderr," This message is printed only once, the verbosity can be increased with `--verbose 2`\n"); -+ overlaps_warned = 1; -+ } - if ( args->out ) -- fprintf(args->out,"LOG\tWarning: Skipping overlapping variants at %s:%d, sample %s\t%s>%s\n", -- chr,rec->pos+1,args->hdr->samples[args->smpl->idx[ismpl]],rec->d.allele[0],rec->d.allele[ial]); -+ fprintf(args->out,"LOG\tWarning: Skipping overlapping variants at %s:%"PRId64", sample %s\t%s>%s\n", -+ chr,(int64_t) rec->pos+1,args->hdr->samples[args->smpl->idx[ismpl]],rec->d.allele[0],rec->d.allele[ial]); - } - hap_destroy(child); - continue; -@@ -3561,19 +3733,15 @@ - if ( icsq >= args->ncsq_max ) // more than ncsq_max consequences, so can't fit it in FMT - { - int ismpl = args->smpl->idx[i]; -- int print_warning = 1; -- if ( args->quiet ) -+ if ( args->verbosity && (!args->ncsq_small_warned || args->verbosity > 1) ) - { -- if ( args->quiet > 1 || args->ncsq_small_warned ) print_warning = 0; -+ fprintf(bcftools_stderr, -+ "Warning: Too many consequences for sample %s at %s:%"PRId64", keeping the first %d and skipping the rest.\n", -+ 
args->hdr->samples[ismpl],bcf_hdr_id2name(args->hdr,args->rid),(int64_t) vrec->line->pos+1,icsq+1); -+ if ( !args->ncsq_small_warned ) -+ fprintf(bcftools_stderr," The limit can be increased by setting the --ncsq parameter. This warning is printed only once.\n"); - args->ncsq_small_warned = 1; - } -- if ( print_warning ) -- { -- fprintf(bcftools_stderr,"Warning: --ncsq %d is too small to annotate %s at %s:%d with %d-th csq\n", -- args->ncsq_max/2,args->hdr->samples[ismpl],bcf_hdr_id2name(args->hdr,args->rid),vrec->line->pos+1,csq->idx+1); -- if ( args->quiet ) fprintf(bcftools_stderr,"(This warning is printed only once)\n"); -- } -- break; - } - if ( vrec->nfmt < 1 + icsq/32 ) vrec->nfmt = 1 + icsq/32; - vrec->smpl[i*args->nfmt_bcsq + icsq/32] |= 1 << (icsq % 32); -@@ -3596,8 +3764,9 @@ - tscript_t *tr = splice.tr = utr->tr; - for (i=1; in_allele; i++) - { -- if ( rec->d.allele[1][0]=='<' || rec->d.allele[1][0]=='*' ) { continue; } -+ if ( rec->d.allele[i][0]=='<' || rec->d.allele[i][0]=='*' ) { continue; } - splice.vcf.alt = rec->d.allele[i]; -+ splice.csq = 0; - int splice_ret = splice_csq(args, &splice, utr->beg, utr->end); - if ( splice_ret!=SPLICE_INSIDE && splice_ret!=SPLICE_OVERLAP ) continue; - csq_t csq; -@@ -3639,6 +3808,7 @@ - { - if ( rec->d.allele[1][0]=='<' || rec->d.allele[1][0]=='*' ) { continue; } - splice.vcf.alt = rec->d.allele[i]; -+ splice.csq = 0; - splice_csq(args, &splice, exon->beg, exon->end); - if ( splice.csq ) ret = 1; - } -@@ -3661,8 +3831,9 @@ - tscript_t *tr = splice.tr = regitr_payload(args->itr, tscript_t*); - for (i=1; in_allele; i++) - { -- if ( rec->d.allele[1][0]=='<' || rec->d.allele[1][0]=='*' ) { continue; } -+ if ( rec->d.allele[i][0]=='<' || rec->d.allele[i][0]=='*' ) { continue; } - splice.vcf.alt = rec->d.allele[i]; -+ splice.csq = 0; - int splice_ret = splice_csq(args, &splice, tr->beg, tr->end); - if ( splice_ret!=SPLICE_INSIDE && splice_ret!=SPLICE_OVERLAP ) continue; // SPLICE_OUTSIDE or SPLICE_REF - csq_t csq; -@@ 
-3682,22 +3853,151 @@ - return ret; - } - --void process(args_t *args, bcf1_t **rec_ptr) -+void test_symbolic_alt(args_t *args, bcf1_t *rec) -+{ -+ static int warned = 0; -+ if ( args->verbosity && (!warned && args->verbosity > 0) ) -+ { -+ fprintf(bcftools_stderr,"Warning: The support for symbolic ALT insertions is experimental.\n"); -+ warned = 1; -+ } -+ -+ const char *chr = bcf_seqname(args->hdr,rec); -+ -+ // only insertions atm -+ int beg = rec->pos + 1; -+ int end = beg; -+ int csq_class = CSQ_ELONGATION; -+ -+ int hit = 0; -+ if ( regidx_overlap(args->idx_cds,chr,beg,end, args->itr) ) -+ { -+ while ( regitr_overlap(args->itr) ) -+ { -+ csq_t csq; -+ memset(&csq, 0, sizeof(csq_t)); -+ gf_cds_t *cds = regitr_payload(args->itr,gf_cds_t*); -+ tscript_t *tr = cds->tr; -+ csq.type.type = (GF_is_coding(tr->type) ? CSQ_CODING_SEQUENCE : CSQ_NON_CODING) | csq_class; -+ csq.pos = rec->pos; -+ csq.type.biotype = tr->type; -+ csq.type.strand = tr->strand; -+ csq.type.trid = tr->id; -+ csq.type.gene = tr->gene->name; -+ csq_stage(args, &csq, rec); -+ hit = 1; -+ } -+ } -+ if ( regidx_overlap(args->idx_utr,chr,beg,end, args->itr) ) -+ { -+ while ( regitr_overlap(args->itr) ) -+ { -+ csq_t csq; -+ memset(&csq, 0, sizeof(csq_t)); -+ gf_utr_t *utr = regitr_payload(args->itr, gf_utr_t*); -+ tscript_t *tr = utr->tr; -+ csq.type.type = (utr->which==prime5 ? 
CSQ_UTR5 : CSQ_UTR3) | csq_class; -+ csq.pos = rec->pos; -+ csq.type.biotype = tr->type; -+ csq.type.strand = tr->strand; -+ csq.type.trid = tr->id; -+ csq.type.gene = tr->gene->name; -+ csq_stage(args, &csq, rec); -+ hit = 1; -+ } -+ } -+ if ( regidx_overlap(args->idx_exon,chr,beg,end, args->itr) ) -+ { -+ splice_t splice; -+ splice_init(&splice, rec); -+ splice.check_acceptor = splice.check_donor = 1; -+ -+ while ( regitr_overlap(args->itr) ) -+ { -+ gf_exon_t *exon = regitr_payload(args->itr, gf_exon_t*); -+ splice.tr = exon->tr; -+ if ( !splice.tr->ncds ) continue; // not a coding transcript, no interest in splice sites -+ splice.check_region_beg = splice.tr->beg==exon->beg ? 0 : 1; -+ splice.check_region_end = splice.tr->end==exon->end ? 0 : 1; -+ splice.vcf.alt = rec->d.allele[1]; -+ splice.csq = csq_class; -+ splice_csq(args, &splice, exon->beg, exon->end); -+ if ( splice.csq ) hit = 1; -+ } -+ } -+ if ( !hit && regidx_overlap(args->idx_tscript,chr,beg,end, args->itr) ) -+ { -+ splice_t splice; -+ splice_init(&splice, rec); -+ -+ while ( regitr_overlap(args->itr) ) -+ { -+ csq_t csq; -+ memset(&csq, 0, sizeof(csq_t)); -+ tscript_t *tr = splice.tr = regitr_payload(args->itr, tscript_t*); -+ splice.vcf.alt = rec->d.allele[1]; -+ splice.csq = csq_class; -+ int splice_ret = splice_csq(args, &splice, tr->beg, tr->end); -+ if ( splice_ret!=SPLICE_INSIDE && splice_ret!=SPLICE_OVERLAP ) continue; // SPLICE_OUTSIDE or SPLICE_REF -+ csq.type.type = (GF_is_coding(tr->type) ? 
CSQ_INTRON : CSQ_NON_CODING) | csq_class; -+ csq.pos = rec->pos; -+ csq.type.biotype = tr->type; -+ csq.type.strand = tr->strand; -+ csq.type.trid = tr->id; -+ csq.type.gene = tr->gene->name; -+ csq_stage(args, &csq, rec); -+ } -+ } -+} -+ -+void debug_print_buffers(args_t *args, int pos) -+{ -+ int i,j; -+ fprintf(bcftools_stderr,"debug_print_buffers at %d\n", pos); -+ fprintf(bcftools_stderr,"vbufs:\n"); -+ for (i=0; ivcf_rbuf.n; i++) -+ { -+ int k = rbuf_kth(&args->vcf_rbuf, i); -+ vbuf_t *vbuf = args->vcf_buf[k]; -+ -+ fprintf(bcftools_stderr,"\tvbuf %d:\n", i); -+ for (j=0; jn; j++) -+ { -+ vrec_t *vrec = vbuf->vrec[j]; -+ fprintf(bcftools_stderr,"\t\t%"PRId64" .. nvcsq=%d\n", (int64_t) vrec->line->pos+1, vrec->nvcsq); -+ } -+ } -+ fprintf(bcftools_stderr,"pos2vbuf:"); -+ khint_t k; -+ for (k = 0; k < kh_end(args->pos2vbuf); ++k) -+ if (kh_exist(args->pos2vbuf, k)) fprintf(bcftools_stderr," %d",1+(int)kh_key(args->pos2vbuf, k)); -+ fprintf(bcftools_stderr,"\n"); -+ fprintf(bcftools_stderr,"active_tr: %d\n", args->active_tr->ndat); -+} -+ -+static void process(args_t *args, bcf1_t **rec_ptr) - { - if ( !rec_ptr ) - { - hap_flush(args, REGIDX_MAX); -- vbuf_flush(args); -+ vbuf_flush(args, REGIDX_MAX); - return; - } - - bcf1_t *rec = *rec_ptr; -+ static int32_t prev_rid = -1, prev_pos = -1; -+ if ( prev_rid!=rec->rid ) { prev_rid = rec->rid; prev_pos = rec->pos; } -+ if ( prev_pos > rec->pos ) -+ error("Error: The file is not sorted, %s:%d comes before %s:%"PRId64"\n",bcf_seqname(args->hdr,rec),prev_pos+1,bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); - - int call_csq = 1; -- if ( !rec->n_allele ) call_csq = 0; // no alternate allele -- else if ( rec->n_allele==2 && (rec->d.allele[1][0]=='<' || rec->d.allele[1][0]=='*') ) call_csq = 0; // gVCF, no alt allele -- else if ( rec->d.allele[1][0]=='<' && rec->d.allele[1][0]!='*') call_csq = 0; // a symbolic allele, not ready for CNVs etc -- else if ( args->filter ) -+ if ( rec->n_allele < 2 ) call_csq = 0; // no 
alternate allele -+ else if ( rec->n_allele==2 && (rec->d.allele[1][0]=='*' || rec->d.allele[1][1]=='*') ) call_csq = 0; // gVCF, not an alt allele -+ else if ( rec->d.allele[1][0]=='<' ) -+ { -+ if ( strncmp("d.allele[1], 4) ) call_csq = 0; // only is supported at the moment -+ } -+ if ( call_csq && args->filter ) - { - call_csq = filter_test(args->filter, rec, NULL); - if ( args->filter_logic==FLT_EXCLUDE ) call_csq = call_csq ? 0 : 1; -@@ -3706,25 +4006,34 @@ - { - if ( !args->out_fh ) return; // not a VCF output - vbuf_push(args, rec_ptr); -- vbuf_flush(args); -+ hap_flush(args, rec->pos-1); -+ vbuf_flush(args, rec->pos-1); - return; - } - - if ( args->rid != rec->rid ) - { - hap_flush(args, REGIDX_MAX); -- vbuf_flush(args); -+ vbuf_flush(args, REGIDX_MAX); - } - args->rid = rec->rid; -- vbuf_push(args, rec_ptr); -+ vbuf_t *vbuf = vbuf_push(args, rec_ptr); - -- int hit = args->local_csq ? test_cds_local(args, rec) : test_cds(args, rec); -- hit += test_utr(args, rec); -- hit += test_splice(args, rec); -- if ( !hit ) test_tscript(args, rec); -+ if ( rec->d.allele[1][0]!='<' ) -+ { -+ int hit = args->local_csq ? 
test_cds_local(args, rec) : test_cds(args, rec, vbuf); -+ hit += test_utr(args, rec); -+ hit += test_splice(args, rec); -+ if ( !hit ) test_tscript(args, rec); -+ } -+ else -+ test_symbolic_alt(args, rec); - -- hap_flush(args, rec->pos-1); -- vbuf_flush(args); -+ if ( rec->pos > 0 ) -+ { -+ hap_flush(args, rec->pos-1); -+ vbuf_flush(args, rec->pos-1); -+ } - - return; - } -@@ -3741,6 +4050,7 @@ - " -g, --gff-annot gff3 annotation file\n" - "\n" - "CSQ options:\n" -+ " -b, --brief-predictions annotate with abbreviated protein-changing predictions\n" - " -c, --custom-tag use this tag instead of the default BCSQ\n" - " -l, --local-csq localized predictions, consider only one VCF record at a time\n" - " -n, --ncsq maximum number of consequences to consider per site [16]\n" -@@ -3754,16 +4064,18 @@ - " -e, --exclude exclude sites for which the expression is true\n" - " --force run even if some sanity checks fail\n" - " -i, --include select sites for which the expression is true\n" -+ " --no-version do not append version and command line to the header\n" - " -o, --output write output to a file [standard output]\n" - " -O, --output-type b: compressed BCF, u: uncompressed BCF, z: compressed VCF\n" - " v: uncompressed VCF, t: plain tab-delimited text output [v]\n" -- " -q, --quiet suppress warning messages. 
Can be given two times for even less messages\n" - " -r, --regions restrict to comma-separated list of regions\n" - " -R, --regions-file restrict to regions listed in a file\n" - " -s, --samples <-|list> samples to include or \"-\" to apply all variants and ignore samples\n" - " -S, --samples-file samples to include\n" - " -t, --targets similar to -r but streams rather than index-jumps\n" - " -T, --targets-file similar to -R but streams rather than index-jumps\n" -+ " --threads use multithreading with worker threads [0]\n" -+ " -v, --verbose verbosity level 0-2 [1]\n" - "\n" - "Example:\n" - " bcftools csq -f hs37d5.fa -g Homo_sapiens.GRCh37.82.gff3.gz in.vcf\n" -@@ -3781,12 +4093,16 @@ - args->output_type = FT_VCF; - args->bcsq_tag = "BCSQ"; - args->ncsq_max = 2*16; -+ args->verbosity = 1; -+ args->record_cmd_line = 1; - - static struct option loptions[] = - { - {"force",0,0,1}, -+ {"threads",required_argument,NULL,2}, - {"help",0,0,'h'}, - {"ncsq",1,0,'n'}, -+ {"brief-predictions",0,0,'b'}, - {"custom-tag",1,0,'c'}, - {"local-csq",0,0,'l'}, - {"gff-annot",1,0,'g'}, -@@ -3797,24 +4113,36 @@ - {"output-type",1,NULL,'O'}, - {"phase",1,0,'p'}, - {"quiet",0,0,'q'}, -+ {"verbose",1,0,'v'}, - {"regions",1,0,'r'}, - {"regions-file",1,0,'R'}, - {"samples",1,0,'s'}, - {"samples-file",1,0,'S'}, - {"targets",1,0,'t'}, - {"targets-file",1,0,'T'}, -+ {"no-version",no_argument,NULL,3}, - {0,0,0,0} - }; - int c, targets_is_file = 0, regions_is_file = 0; -- char *targets_list = NULL, *regions_list = NULL; -- while ((c = getopt_long(argc, argv, "?hr:R:t:T:i:e:f:o:O:g:s:S:p:qc:ln:",loptions,NULL)) >= 0) -+ char *targets_list = NULL, *regions_list = NULL, *tmp; -+ while ((c = getopt_long(argc, argv, "?hr:R:t:T:i:e:f:o:O:g:s:S:p:qc:ln:bv:",loptions,NULL)) >= 0) - { - switch (c) - { - case 1 : args->force = 1; break; -+ case 2 : -+ args->n_threads = strtol(optarg,&tmp,10); -+ if ( *tmp ) error("Could not parse argument: --threads %s\n", optarg); -+ break; -+ case 3 : 
args->record_cmd_line = 0; break; -+ case 'b': args->brief_predictions = 1; break; - case 'l': args->local_csq = 1; break; - case 'c': args->bcsq_tag = optarg; break; -- case 'q': args->quiet++; break; -+ case 'q': error("Error: the -q option has been deprecated, use -v, --verbose instead.\n"); break; -+ case 'v': -+ args->verbosity = atoi(optarg); -+ if ( args->verbosity<0 || args->verbosity>2 ) error("Error: expected integer 0-2 with -v, --verbose\n"); -+ break; - case 'p': - switch (optarg[0]) - { -@@ -3871,8 +4199,9 @@ - error("Failed to read the targets: %s\n", targets_list); - if ( regions_list && bcf_sr_set_regions(args->sr, regions_list, regions_is_file)<0 ) - error("Failed to read the regions: %s\n", regions_list); -+ if ( bcf_sr_set_threads(args->sr, args->n_threads)<0 ) error("Failed to create %d extra threads\n", args->n_threads); - if ( !bcf_sr_add_reader(args->sr, fname) ) -- error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->sr->errnum)); -+ error("Failed to read from %s: %s\n", !strcmp("-",fname)?"standard input":fname,bcf_sr_strerror(args->sr->errnum)); - args->hdr = bcf_sr_get_header(args->sr,0); - - init_data(args); -@@ -3885,7 +4214,6 @@ - destroy_data(args); - bcf_sr_destroy(args->sr); - free(args); -- - return 0; - } - ---- python-pysam.orig/bcftools/filter.c -+++ python-pysam/bcftools/filter.c -@@ -28,7 +28,10 @@ - #include - #include - #include -+#include -+#ifndef _WIN32 - #include -+#endif - #include - #include - #include -@@ -53,8 +56,8 @@ - # define __FUNCTION__ __func__ - #endif - --uint64_t bcf_double_missing = 0x7ff0000000000001; --uint64_t bcf_double_vector_end = 0x7ff0000000000002; -+static const uint64_t bcf_double_missing = 0x7ff0000000000001; -+static const uint64_t bcf_double_vector_end = 0x7ff0000000000002; - static inline void bcf_double_set(double *ptr, uint64_t value) - { - union { uint64_t i; double d; } u; -@@ -71,6 +74,7 @@ - #define bcf_double_set_missing(x) bcf_double_set(&(x),bcf_double_missing) - #define 
bcf_double_is_vector_end(x) bcf_double_test((x),bcf_double_vector_end) - #define bcf_double_is_missing(x) bcf_double_test((x),bcf_double_missing) -+#define bcf_double_is_missing_or_vector_end(x) (bcf_double_test((x),bcf_double_missing) || bcf_double_test((x),bcf_double_vector_end)) - - - typedef struct _token_t -@@ -82,7 +86,7 @@ - char *tag; // for debugging and printout only, VCF tag name - double threshold; // filtering threshold - int is_constant; // the threshold is set -- int hdr_id, type; // BCF header lookup ID and one of BCF_HT_* types -+ int hdr_id, tag_type; // BCF header lookup ID and one of BCF_HL_* types - int idx; // 0-based index to VCF vectors, - // -2: list (e.g. [0,1,2] or [1..3] or [1..] or any field[*], which is equivalent to [0..]) - int *idxs; // set indexes to 0 to exclude, to 1 to include, and last element negative if unlimited -@@ -151,11 +155,14 @@ - #define TOK_CNT 26 - #define TOK_PERLSUB 27 - #define TOK_BINOM 28 -- --// 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 --// ( ) [ < = > ] ! | & + - * / M m a A O ~ ^ S . l f c p --static int op_prec[] = {0,1,1,5,5,5,5,5,5,2,3, 6, 6, 7, 7, 8, 8, 8, 3, 2, 5, 5, 8, 8, 8, 8, 8, 8}; --#define TOKEN_STRING "x()[<=>]!|&+-*/MmaAO~^S.lfcp" -+#define TOK_PHRED 29 -+#define TOK_MEDIAN 30 -+#define TOK_STDEV 31 -+ -+// 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 -+// ( ) [ < = > ] ! | & + - * / M m a A O ~ ^ S . 
l f c p b P i s -+static int op_prec[] = {0,1,1,5,5,5,5,5,5,2,3, 6, 6, 7, 7, 8, 8, 8, 3, 2, 5, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8}; -+#define TOKEN_STRING "x()[<=>]!|&+-*/MmaAO~^S.lfcpis" - - // Return negative values if it is a function with variable number of arguments - static int filters_next_token(char **str, int *len) -@@ -179,12 +186,16 @@ - - if ( !strncasecmp(tmp,"MAX(",4) ) { (*str) += 3; return TOK_MAX; } - if ( !strncasecmp(tmp,"MIN(",4) ) { (*str) += 3; return TOK_MIN; } -+ if ( !strncasecmp(tmp,"MEAN(",5) ) { (*str) += 4; return TOK_AVG; } -+ if ( !strncasecmp(tmp,"MEDIAN(",7) ) { (*str) += 6; return TOK_MEDIAN; } - if ( !strncasecmp(tmp,"AVG(",4) ) { (*str) += 3; return TOK_AVG; } -+ if ( !strncasecmp(tmp,"STDEV(",6) ) { (*str) += 5; return TOK_STDEV; } - if ( !strncasecmp(tmp,"SUM(",4) ) { (*str) += 3; return TOK_SUM; } - if ( !strncasecmp(tmp,"ABS(",4) ) { (*str) += 3; return TOK_ABS; } - if ( !strncasecmp(tmp,"COUNT(",4) ) { (*str) += 5; return TOK_CNT; } - if ( !strncasecmp(tmp,"STRLEN(",7) ) { (*str) += 6; return TOK_LEN; } - if ( !strncasecmp(tmp,"BINOM(",6) ) { (*str) += 5; return -TOK_BINOM; } -+ if ( !strncasecmp(tmp,"PHRED(",6) ) { (*str) += 5; return TOK_PHRED; } - if ( !strncasecmp(tmp,"%MAX(",5) ) { (*str) += 4; return TOK_MAX; } // for backward compatibility - if ( !strncasecmp(tmp,"%MIN(",5) ) { (*str) += 4; return TOK_MIN; } // for backward compatibility - if ( !strncasecmp(tmp,"%AVG(",5) ) { (*str) += 4; return TOK_AVG; } // for backward compatibility -@@ -195,6 +206,7 @@ - if ( !strncasecmp(tmp,"PERL.",5) ) { (*str) += 5; return -TOK_PERLSUB; } - if ( !strncasecmp(tmp,"N_PASS(",7) ) { *len = 6; (*str) += 6; return -TOK_FUNC; } - if ( !strncasecmp(tmp,"F_PASS(",7) ) { *len = 6; (*str) += 6; return -TOK_FUNC; } -+ if ( !strncasecmp(tmp,"%ILEN",5) ) { *len = 5; return TOK_VAL; } // to be able to distinguish between INFO/ILEN and on-the-fly ILEN - - if ( tmp[0]=='@' ) // file name - { -@@ -280,28 +292,30 @@ - } - - --/* -+/* - Simple path 
expansion, expands ~/, ~user, $var. The result must be freed by the caller. - -- Based on jkb's staden code with some adjustements. -+ Based on jkb's staden code with some adjustments. - https://sourceforge.net/p/staden/code/HEAD/tree/staden/trunk/src/Misc/getfile.c#l123 - */ - char *expand_path(char *path) - { --#ifdef _WIN32 -- return strdup(path); // windows expansion: todo --#endif -- - kstring_t str = {0,0,0}; - - if ( path[0] == '~' ) - { - if ( !path[1] || path[1] == '/' ) - { -+#ifdef _WIN32 -+ kputs(getenv("HOMEDRIVE"), &str); -+ kputs(getenv("HOMEPATH"), &str); -+#else - // ~ or ~/path - kputs(getenv("HOME"), &str); - if ( path[1] ) kputs(path+1, &str); -+#endif - } -+#ifndef _WIN32 - else - { - // user name: ~pd3/path -@@ -315,13 +329,18 @@ - else kputs(pwentry->pw_dir, &str); - kputs(end, &str); - } -- return str.s; -+#endif -+ return ks_release(&str); - } - if ( path[0] == '$' ) - { - char *var = getenv(path+1); -- if ( var ) path = var; -+ if ( var ) { -+ kputs(var, &str); -+ return ks_release(&str); -+ } - } -+ - return strdup(path); - } - -@@ -444,6 +463,8 @@ - return; - } - -+ if ( !btok->str_value.l ) error("Error occurred while evaluating the expression\n"); -+ - if ( rtok->tok_type==TOK_EQ ) - rtok->pass_site = strcmp(btok->str_value.s,line->d.id) ? 0 : 1; - else -@@ -499,6 +520,14 @@ - return -1; // this shouldn't happen - } - -+static void filters_set_chrom(filter_t *flt, bcf1_t *line, token_t *tok) -+{ -+ tok->str_value.l = 0; -+ kputs(bcf_seqname(flt->hdr,line), &tok->str_value); -+ tok->nvalues = tok->str_value.l; -+ tok->is_str = 1; -+} -+ - static void filters_set_pos(filter_t *flt, bcf1_t *line, token_t *tok) - { - tok->values[0] = line->pos+1; -@@ -640,7 +669,7 @@ - static void filters_set_format_int(filter_t *flt, bcf1_t *line, token_t *tok) - { - if ( line->n_sample != tok->nsamples ) -- error("Incorrect number of FORMAT fields at %s:%d .. 
%s, %d vs %d\n", bcf_seqname(flt->hdr,line),line->pos+1,tok->tag,line->n_sample,tok->nsamples); -+ error("Incorrect number of FORMAT fields at %s:%"PRId64" .. %s, %d vs %d\n", bcf_seqname(flt->hdr,line),(int64_t) line->pos+1,tok->tag,line->n_sample,tok->nsamples); - - int nvals; - if ( (nvals=bcf_get_format_int32(flt->hdr,line,tok->tag,&flt->tmpi,&flt->mtmpi))<0 ) -@@ -659,8 +688,10 @@ - { - if ( !tok->usmpl[i] ) continue; - int32_t *ptr = flt->tmpi + i*nsrc1; -- if ( tok->idx>=nsrc1 || ptr[tok->idx]==bcf_int32_missing || ptr[tok->idx]==bcf_int32_vector_end ) -+ if ( tok->idx>=nsrc1 || ptr[tok->idx]==bcf_int32_missing ) - bcf_double_set_missing(tok->values[i]); -+ else if ( ptr[tok->idx]==bcf_int32_vector_end ) -+ bcf_double_set_vector_end(tok->values[i]); - else - tok->values[i] = ptr[tok->idx]; - } -@@ -677,24 +708,31 @@ - for (k=0; knidxs && !tok->idxs[k] ) continue; -- if ( src[k]==bcf_int32_missing || src[k]==bcf_int32_vector_end ) -+ if ( src[k]==bcf_int32_missing ) - bcf_double_set_missing(dst[j]); -+ else if ( src[k]==bcf_int32_vector_end ) -+ bcf_double_set_vector_end(dst[j]); - else - dst[j] = src[k]; - j++; - } -- while (j < tok->nval1) -+ if ( j==0 ) - { - bcf_double_set_missing(dst[j]); - j++; - } -+ while (j < tok->nval1) -+ { -+ bcf_double_set_vector_end(dst[j]); -+ j++; -+ } - } - } - } - static void filters_set_format_float(filter_t *flt, bcf1_t *line, token_t *tok) - { - if ( line->n_sample != tok->nsamples ) -- error("Incorrect number of FORMAT fields at %s:%d .. %s, %d vs %d\n", bcf_seqname(flt->hdr,line),line->pos+1,tok->tag,line->n_sample,tok->nsamples); -+ error("Incorrect number of FORMAT fields at %s:%"PRId64" .. 
%s, %d vs %d\n", bcf_seqname(flt->hdr,line),(int64_t) line->pos+1,tok->tag,line->n_sample,tok->nsamples); - - int nvals; - if ( (nvals=bcf_get_format_float(flt->hdr,line,tok->tag,&flt->tmpf,&flt->mtmpf))<0 ) -@@ -713,8 +751,10 @@ - { - if ( !tok->usmpl[i] ) continue; - float *ptr = flt->tmpf + i*nsrc1; -- if ( tok->idx>=nsrc1 || bcf_float_is_missing(ptr[tok->idx]) || bcf_float_is_vector_end(ptr[tok->idx]) ) -+ if ( tok->idx>=nsrc1 || bcf_float_is_missing(ptr[tok->idx]) ) - bcf_double_set_missing(tok->values[i]); -+ else if ( bcf_float_is_vector_end(ptr[tok->idx]) ) -+ bcf_double_set_vector_end(tok->values[i]); - else - tok->values[i] = ptr[tok->idx]; - } -@@ -731,24 +771,31 @@ - for (k=0; knidxs && !tok->idxs[k] ) continue; -- if ( bcf_float_is_missing(src[k]) || bcf_float_is_vector_end(src[k]) ) -+ if ( bcf_float_is_missing(src[k]) ) - bcf_double_set_missing(dst[j]); -+ else if ( bcf_float_is_vector_end(src[k]) ) -+ bcf_double_set_vector_end(dst[j]); - else - dst[j] = src[k]; - j++; - } -- while (j < tok->nval1) -+ if ( j==0 ) - { - bcf_double_set_missing(dst[j]); - j++; - } -+ while (j < tok->nval1) -+ { -+ bcf_double_set_vector_end(dst[j]); -+ j++; -+ } - } - } - } - static void filters_set_format_string(filter_t *flt, bcf1_t *line, token_t *tok) - { - if ( line->n_sample != tok->nsamples ) -- error("Incorrect number of FORMAT fields at %s:%d .. %s, %d vs %d\n", bcf_seqname(flt->hdr,line),line->pos+1,tok->tag,line->n_sample,tok->nsamples); -+ error("Incorrect number of FORMAT fields at %s:%"PRId64" .. 
%s, %d vs %d\n", bcf_seqname(flt->hdr,line),(int64_t) line->pos+1,tok->tag,line->n_sample,tok->nsamples); - - int i, ndim = tok->str_value.m; - int nstr = bcf_get_format_char(flt->hdr, line, tok->tag, &tok->str_value.s, &ndim); -@@ -868,7 +915,7 @@ - case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_vector_end); break; - case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_vector_end); break; - case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_vector_end); break; -- default: error("The GT type is not lineognised: %d at %s:%d\n",fmt->type, bcf_seqname(flt->hdr,line),line->pos+1); break; -+ default: error("The GT type is not lineognised: %d at %s:%"PRId64"\n",fmt->type, bcf_seqname(flt->hdr,line),(int64_t) line->pos+1); break; - } - #undef BRANCH_INT - assert( tok->nsamples == nsmpl ); -@@ -916,6 +963,19 @@ - tok->nvalues = tok->str_value.l; - tok->nval1 = blen; - } -+static void filters_set_ilen(filter_t *flt, bcf1_t *line, token_t *tok) -+{ -+ tok->nvalues = line->n_allele - 1; -+ hts_expand(double,tok->nvalues,tok->mvalues,tok->values); -+ -+ int i, rlen = strlen(line->d.allele[0]); -+ for (i=1; in_allele; i++) -+ { -+ int alen = strlen(line->d.allele[i]); -+ if ( rlen==alen ) bcf_double_set_missing(tok->values[i-1]); -+ else tok->values[i-1] = alen - rlen; -+ } -+} - static void filters_set_ref_string(filter_t *flt, bcf1_t *line, token_t *tok) - { - tok->str_value.l = 0; -@@ -1014,10 +1074,16 @@ - if ( rtok->pass_samples[i] ) npass++; - } - -- assert( rtok->values ); -- rtok->nvalues = 1; -- rtok->values[0] = rtok->tag[0]=='N' ? npass : (line->n_sample ? 1.0*npass/line->n_sample : 0); -- rtok->nsamples = 0; -+ hts_expand(double,rtok->nsamples,rtok->mvalues,rtok->values); -+ double value = rtok->tag[0]=='N' ? npass : (line->n_sample ? 1.0*npass/line->n_sample : 0); -+ rtok->nval1 = 1; -+ rtok->nvalues = rtok->nsamples; -+ -+ // Set per-sample status so that `query -i 'F_PASS(GT!="mis" & GQ >= 20) > 0.5'` or +trio-stats -+ // consider only the passing site AND samples. 
The values for failed samples is set to -1 so -+ // that it can never conflict with valid expressions. -+ for (i=0; insamples; i++) -+ rtok->values[i] = rtok->pass_samples[i] ? value : -1; - - return 1; - } -@@ -1103,7 +1169,7 @@ - int i, has_value = 0; - for (i=0; invalues; i++) - { -- if ( bcf_double_is_missing(tok->values[i]) || bcf_double_is_vector_end(tok->values[i]) ) continue; -+ if ( bcf_double_is_missing_or_vector_end(tok->values[i]) ) continue; - has_value = 1; - if ( val < tok->values[i] ) val = tok->values[i]; - } -@@ -1123,7 +1189,7 @@ - int i, has_value = 0; - for (i=0; invalues; i++) - { -- if ( bcf_double_is_missing(tok->values[i]) || bcf_double_is_vector_end(tok->values[i]) ) continue; -+ if ( bcf_double_is_missing_or_vector_end(tok->values[i]) ) continue; - has_value = 1; - if ( val > tok->values[i] ) val = tok->values[i]; - } -@@ -1142,7 +1208,7 @@ - double val = 0; - int i, n = 0; - for (i=0; invalues; i++) -- if ( !bcf_double_is_missing(tok->values[i]) ) { val += tok->values[i]; n++; } -+ if ( !bcf_double_is_missing_or_vector_end(tok->values[i]) ) { val += tok->values[i]; n++; } - if ( n ) - { - rtok->values[0] = val / n; -@@ -1150,6 +1216,61 @@ - } - return 1; - } -+static int compare_doubles(const void *lhs, const void *rhs) -+{ -+ double arg1 = *(const double*) lhs; -+ double arg2 = *(const double*) rhs; -+ if (arg1 < arg2) return -1; -+ if (arg1 > arg2) return 1; -+ return 0; -+} -+static int func_median(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack) -+{ -+ token_t *tok = stack[nstack - 1]; -+ rtok->nvalues = 0; -+ if ( !tok->nvalues ) return 1; -+ int i, n = 0; -+ for (i=0; invalues; i++) -+ { -+ if ( bcf_double_is_missing_or_vector_end(tok->values[i]) ) continue; -+ if ( n < i ) tok->values[n] = tok->values[i]; -+ n++; -+ } -+ if ( !n ) return 1; -+ if ( n==1 ) rtok->values[0] = tok->values[0]; -+ else -+ { -+ qsort(tok->values, n, sizeof(double), compare_doubles); -+ rtok->values[0] = n % 2 ? 
tok->values[n/2] : (tok->values[n/2-1] + tok->values[n/2]) * 0.5; -+ } -+ rtok->nvalues = 1; -+ return 1; -+} -+static int func_stddev(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack) -+{ -+ token_t *tok = stack[nstack - 1]; -+ rtok->nvalues = 0; -+ if ( !tok->nvalues ) return 1; -+ int i, n = 0; -+ for (i=0; invalues; i++) -+ { -+ if ( bcf_double_is_missing_or_vector_end(tok->values[i]) ) continue; -+ if ( n < i ) tok->values[n] = tok->values[i]; -+ n++; -+ } -+ if ( !n ) return 1; -+ if ( n==1 ) rtok->values[0] = 0; -+ else -+ { -+ double sdev = 0, avg = 0; -+ for (i=0; ivalues[n]; -+ avg /= n; -+ for (i=0; ivalues[n] - avg) * (tok->values[n] - avg); -+ rtok->values[0] = sqrt(sdev/n); -+ } -+ rtok->nvalues = 1; -+ return 1; -+} - static int func_sum(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack) - { - rtok->nvalues = 0; -@@ -1158,7 +1279,7 @@ - double val = 0; - int i, n = 0; - for (i=0; invalues; i++) -- if ( !bcf_double_is_missing(tok->values[i]) ) { val += tok->values[i]; n++; } -+ if ( !bcf_double_is_missing_or_vector_end(tok->values[i]) ) { val += tok->values[i]; n++; } - if ( n ) - { - rtok->values[0] = val; -@@ -1177,17 +1298,28 @@ - int i; - for (i=0; invalues; i++) - if ( bcf_double_is_missing(tok->values[i]) ) bcf_double_set_missing(rtok->values[i]); -- else rtok->values[i] = fabs(tok->values[i]); -+ else if ( !bcf_double_is_vector_end(tok->values[i]) ) rtok->values[i] = fabs(tok->values[i]); - return 1; - } - static int func_count(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack) - { - token_t *tok = stack[nstack - 1]; -- if ( !tok->nsamples ) error("COUNT() can be applied only on FORMAT fields\n"); -- - int i, cnt = 0; -- for (i=0; insamples; i++) -- if ( tok->pass_samples[i] ) cnt++; -+ if ( !tok->nsamples ) -+ { -+ if ( tok->is_str ) -+ { -+ if ( tok->str_value.l ) cnt = 1; -+ for (i=0; istr_value.l; i++) if ( tok->str_value.s[i]==',' ) cnt++; -+ } -+ else -+ cnt = 
tok->nvalues; -+ } -+ else -+ { -+ for (i=0; insamples; i++) -+ if ( tok->pass_samples[i] ) cnt++; -+ } - - rtok->nvalues = 1; - rtok->values[0] = cnt; -@@ -1303,10 +1435,10 @@ - } - int idx1 = bcf_gt_allele(ptr[0]); - int idx2 = bcf_gt_allele(ptr[1]); -- if ( idx1>=line->n_allele ) error("Incorrect allele index at %s:%d, sample %s\n", bcf_seqname(flt->hdr,line),line->pos+1,flt->hdr->samples[i]); -- if ( idx2>=line->n_allele ) error("Incorrect allele index at %s:%d, sample %s\n", bcf_seqname(flt->hdr,line),line->pos+1,flt->hdr->samples[i]); -+ if ( idx1>=line->n_allele ) error("Incorrect allele index at %s:%"PRId64", sample %s\n", bcf_seqname(flt->hdr,line),(int64_t) line->pos+1,flt->hdr->samples[i]); -+ if ( idx2>=line->n_allele ) error("Incorrect allele index at %s:%"PRId64", sample %s\n", bcf_seqname(flt->hdr,line),(int64_t) line->pos+1,flt->hdr->samples[i]); - double *vals = tok->values + tok->nval1*i; -- if ( bcf_double_is_missing(vals[idx1]) || bcf_double_is_missing(vals[idx2]) ) -+ if ( bcf_double_is_missing_or_vector_end(vals[idx1]) || bcf_double_is_missing_or_vector_end(vals[idx2]) ) - { - bcf_double_set_missing(rtok->values[i]); - continue; -@@ -1324,13 +1456,13 @@ - // the fields given explicitly: binom(AD[:0],AD[:1]) - token_t *tok2 = stack[istack+1]; - if ( tok->nval1!=1 || tok2->nval1!=1 ) -- error("Expected one value per binom() argument, found %d and %d at %s:%d\n",tok->nval1,tok2->nval1, bcf_seqname(flt->hdr,line),line->pos+1); -+ error("Expected one value per binom() argument, found %d and %d at %s:%"PRId64"\n",tok->nval1,tok2->nval1, bcf_seqname(flt->hdr,line),(int64_t) line->pos+1); - for (i=0; insamples; i++) - { - if ( !rtok->usmpl[i] ) continue; - double *ptr1 = tok->values + tok->nval1*i; - double *ptr2 = tok2->values + tok2->nval1*i; -- if ( bcf_double_is_missing(ptr1[0]) || bcf_double_is_missing(ptr2[0]) ) -+ if ( bcf_double_is_missing_or_vector_end(ptr1[0]) || bcf_double_is_missing_or_vector_end(ptr2[0]) ) - { - 
bcf_double_set_missing(rtok->values[i]); - continue; -@@ -1370,7 +1502,7 @@ - ptr2 = &tok2->values[0]; - } - } -- if ( !ptr1 || !ptr2 || bcf_double_is_missing(ptr1[0]) || bcf_double_is_missing(ptr2[0]) ) -+ if ( !ptr1 || !ptr2 || bcf_double_is_missing_or_vector_end(ptr1[0]) || bcf_double_is_missing_or_vector_end(ptr2[0]) ) - bcf_double_set_missing(rtok->values[0]); - else - { -@@ -1381,6 +1513,31 @@ - } - return rtok->nargs; - } -+static int func_phred(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack) -+{ -+ token_t *tok = stack[nstack - 1]; -+ if ( tok->is_str ) error("PHRED() can be applied only on numeric values\n"); -+ -+ rtok->nsamples = tok->nsamples; -+ rtok->nval1 = tok->nval1; -+ memcpy(rtok->pass_samples, tok->pass_samples, rtok->nsamples*sizeof(*rtok->pass_samples)); -+ assert(tok->usmpl); -+ if ( !rtok->usmpl ) -+ { -+ rtok->usmpl = (uint8_t*) malloc(tok->nsamples*sizeof(*rtok->usmpl)); -+ memcpy(rtok->usmpl, tok->usmpl, tok->nsamples*sizeof(*rtok->usmpl)); -+ } -+ rtok->nvalues = tok->nvalues; -+ if ( !tok->nvalues ) return 1; -+ -+ hts_expand(double, rtok->nvalues, rtok->mvalues, rtok->values); -+ int i; -+ for (i=0; invalues; i++) -+ if ( bcf_double_is_missing_or_vector_end(tok->values[i]) ) bcf_double_set_missing(rtok->values[i]); -+ else rtok->values[i] = -4.34294481903*log(tok->values[i]); -+ -+ return 1; -+} - inline static void tok_init_values(token_t *atok, token_t *btok, token_t *rtok) - { - token_t *tok = atok->nvalues > btok->nvalues ? atok : btok; -@@ -1414,7 +1571,7 @@ - assert( atok->nsamples==btok->nsamples ); \ - for (i=0; invalues; i++) \ - { \ -- if ( bcf_double_is_missing(atok->values[i]) || bcf_double_is_missing(btok->values[i]) ) \ -+ if ( bcf_double_is_missing_or_vector_end(atok->values[i]) || bcf_double_is_missing_or_vector_end(btok->values[i]) ) \ - { \ - bcf_double_set_missing(rtok->values[i]); \ - continue; \ -@@ -1428,11 +1585,11 @@ - token_t *xtok = atok->nsamples ? 
atok : btok; \ - token_t *ytok = atok->nsamples ? btok : atok; \ - assert( ytok->nvalues==1 ); \ -- if ( !bcf_double_is_missing(ytok->values[0]) ) \ -+ if ( !bcf_double_is_missing_or_vector_end(ytok->values[0]) ) \ - { \ - for (i=0; invalues; i++) \ - { \ -- if ( bcf_double_is_missing(xtok->values[i]) ) \ -+ if ( bcf_double_is_missing_or_vector_end(xtok->values[i]) ) \ - { \ - bcf_double_set_missing(rtok->values[i]); \ - continue; \ -@@ -1566,7 +1723,6 @@ - { \ - token_t *rtok = _rtok; \ - int i, j, k; \ -- assert( !atok->nsamples || !btok->nsamples ); \ - tok_init_samples(atok, btok, rtok); \ - if ( !atok->nsamples && !btok->nsamples ) \ - { \ -@@ -1576,7 +1732,7 @@ - token_t *tok = atok->nvalues ? atok : btok; \ - for (j=0; jnvalues; j++) \ - { \ -- if ( bcf_double_is_missing(tok->values[j]) ) \ -+ if ( bcf_double_is_missing_or_vector_end(tok->values[j]) ) \ - { \ - if ( missing_logic[2] ) { rtok->pass_site = 1; break; } \ - } \ -@@ -1587,15 +1743,19 @@ - { \ - for (i=0; invalues; i++) \ - { \ -- int amiss = bcf_double_is_missing(atok->values[i]) ? 1 : 0; \ -+ int amiss = bcf_double_is_missing_or_vector_end(atok->values[i]) ? 1 : 0; \ - for (j=0; jnvalues; j++) \ - { \ -- int nmiss = amiss + (bcf_double_is_missing(btok->values[j]) ? 1 : 0); \ -+ int nmiss = amiss + (bcf_double_is_missing_or_vector_end(btok->values[j]) ? 
1 : 0); \ - if ( nmiss ) \ - { \ - if ( missing_logic[nmiss] ) { rtok->pass_site = 1; i = atok->nvalues; break; } \ - } \ -- else if ( atok->values[i] CMP_OP btok->values[j] ) { rtok->pass_site = 1; i = atok->nvalues; break; } \ -+ else if ( atok->values[i] > 16777216 || btok->values[j] > 16777216 ) /* Ugly, see #871 */ \ -+ { \ -+ if ( atok->values[i] CMP_OP btok->values[j] ) { rtok->pass_site = 1; i = atok->nvalues; break; } \ -+ } \ -+ else if ( (float)atok->values[i] CMP_OP (float)btok->values[j] ) { rtok->pass_site = 1; i = atok->nvalues; break; } \ - } \ - } \ - } \ -@@ -1617,7 +1777,7 @@ - { \ - int miss = 0; \ - for (j=0; jnvalues; j++) \ -- miss |= bcf_double_is_missing(tok->values[j]) ? 1 : 0; \ -+ miss |= bcf_double_is_missing_or_vector_end(tok->values[j]) ? 1 : 0; \ - if ( missing_logic[++miss] ) \ - { \ - for (i=0; insamples; i++) \ -@@ -1631,10 +1791,36 @@ - double *ptr = tok->values + i*tok->nval1; \ - int miss = 0; \ - for (j=0; jnval1; j++) \ -- miss |= bcf_double_is_missing(ptr[j]) ? 1 : 0; \ -+ miss |= bcf_double_is_missing_or_vector_end(ptr[j]) ? 1 : 0; \ - if ( missing_logic[++miss] ) { rtok->pass_samples[i] = missing_logic[miss]; rtok->pass_site = 1; } \ - } \ - } \ -+ else if ( atok->nsamples && btok->nsamples ) \ -+ { \ -+ if ( atok->nval1!=btok->nval1 ) error("Incompatible number of per-sample values in comparison: %d vs %d\n",atok->nval1,btok->nval1); \ -+ if ( atok->nsamples!=btok->nsamples ) error("Incompatible number samples in comparison: %d vs %d\n",atok->nsamples,btok->nsamples); \ -+ for (i=0; insamples; i++) \ -+ { \ -+ if ( !atok->usmpl[i] || !btok->usmpl[i] ) { rtok->usmpl[i] = 0; continue; } \ -+ double *aptr = atok->values + i*atok->nval1; \ -+ double *bptr = btok->values + i*btok->nval1; \ -+ for (j=0; jnval1; j++) \ -+ { \ -+ int nmiss = bcf_double_is_missing_or_vector_end(aptr[j]) ? 
1 : 0; \ -+ if ( nmiss && !missing_logic[0] ) continue; /* any is missing => result is false */ \ -+ nmiss += (bcf_double_is_missing_or_vector_end(bptr[j]) ? 1 : 0); \ -+ if ( nmiss ) \ -+ { \ -+ if ( missing_logic[nmiss] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; break; } \ -+ } \ -+ else if ( aptr[j] > 16777216 || bptr[j] > 16777216 ) /* Ugly, see #871 */ \ -+ { \ -+ if ( aptr[j] CMP_OP bptr[j] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; break; } \ -+ } \ -+ else if ( (float)aptr[j] CMP_OP (float)bptr[j] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; break; } \ -+ } \ -+ } \ -+ } \ - else \ - { \ - token_t *xtok = atok->nsamples ? atok : btok; \ -@@ -1646,16 +1832,20 @@ - double *yptr = ytok->values + i*ytok->nval1; \ - for (j=0; jnval1; j++) \ - { \ -- int miss = bcf_double_is_missing(xptr[j]) ? 1 : 0; \ -+ int miss = bcf_double_is_missing_or_vector_end(xptr[j]) ? 1 : 0; \ - if ( miss && !missing_logic[0] ) continue; /* any is missing => result is false */ \ - for (k=0; knvalues; k++) \ - { \ -- int nmiss = miss + (bcf_double_is_missing(yptr[k]) ? 1 : 0); \ -+ int nmiss = miss + (bcf_double_is_missing_or_vector_end(yptr[k]) ? 
1 : 0); \ - if ( nmiss ) \ - { \ - if ( missing_logic[nmiss] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; j = xtok->nval1; break; } \ - } \ -- else if ( xptr[j] CMP_OP yptr[k] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; j = xtok->nval1; break; } \ -+ else if ( xptr[j] > 16777216 || yptr[k] > 16777216 ) /* Ugly, see #871 */ \ -+ { \ -+ if ( xptr[j] CMP_OP yptr[k] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; j = xtok->nval1; break; } \ -+ } \ -+ else if ( (float)xptr[j] CMP_OP (float)yptr[k] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; j = xtok->nval1; break; } \ - } \ - } \ - } \ -@@ -1874,11 +2064,15 @@ - int *idxs2 = NULL, nidxs2 = 0, idx2 = 0; - - int set_samples = 0; -- char *colon = rindex(tag_idx, ':'); -+ char *colon = strrchr(tag_idx, ':'); - if ( tag_idx[0]=='@' ) // file list with sample names - { - if ( !is_fmt ) error("Could not parse \"%s\". (Not a FORMAT tag yet a sample list provided.)\n", ori); - char *fname = expand_path(tag_idx+1); -+#ifdef _WIN32 -+ if (fname && strlen(fname) > 2 && fname[1] == ':') // Deal with Windows paths, such as 'C:\..' 
-+ colon = strrchr(fname+2, ':'); -+#endif - int nsmpl; - char **list = hts_readlist(fname, 1, &nsmpl); - if ( !list && colon ) -@@ -1887,7 +2081,7 @@ - tok->idxs = idxs2; - tok->nidxs = nidxs2; - tok->idx = idx2; -- colon = rindex(fname, ':'); -+ colon = strrchr(fname, ':'); - *colon = 0; - list = hts_readlist(fname, 1, &nsmpl); - } -@@ -1995,6 +2189,7 @@ - } - static int filters_init1(filter_t *filter, char *str, int len, token_t *tok) - { -+ tok->tag_type = -1; - tok->tok_type = TOK_VAL; - tok->hdr_id = -1; - tok->pass_site = -1; -@@ -2065,6 +2260,7 @@ - tok->comparator = filters_cmp_filter; - tok->tag = strdup("FILTER"); - filter->max_unpack |= BCF_UN_FLT; -+ tok->tag_type = BCF_HL_FLT; - return 0; - } - else if ( !strncasecmp(str,"ID",len) || !strncasecmp(str,"%ID",len) /* for backward compatibility */ ) -@@ -2073,6 +2269,12 @@ - tok->tag = strdup("ID"); - return 0; - } -+ else if ( !strncasecmp(str,"CHROM",len) ) -+ { -+ tok->setter = &filters_set_chrom; -+ tok->tag = strdup("CHROM"); -+ return 0; -+ } - else if ( !strncasecmp(str,"POS",len) ) - { - tok->setter = &filters_set_pos; -@@ -2111,12 +2313,14 @@ - } - else if ( !strncasecmp(str,"N_MISSING",len) ) - { -+ filter->max_unpack |= BCF_UN_FMT; - tok->setter = &filters_set_nmissing; - tok->tag = strdup("N_MISSING"); - return 0; - } - else if ( !strncasecmp(str,"F_MISSING",len) ) - { -+ filter->max_unpack |= BCF_UN_FMT; - tok->setter = &filters_set_nmissing; - tok->tag = strdup("F_MISSING"); - return 0; -@@ -2154,7 +2358,7 @@ - for (i=0; insamples; i++) tok->usmpl[i] = 1; - } - -- tok->type = is_fmt ? BCF_HL_FMT : BCF_HL_INFO; -+ tok->tag_type = is_fmt ? 
BCF_HL_FMT : BCF_HL_INFO; - if ( is_fmt ) filter->max_unpack |= BCF_UN_FMT; - if ( tok->hdr_id>=0 ) - { -@@ -2264,17 +2468,26 @@ - free(tmp.s); - return 0; - } -+ else if ( !strcasecmp(tmp.s,"ILEN") || !strcasecmp(tmp.s,"%ILEN") ) -+ { -+ filter->max_unpack |= BCF_UN_STR; -+ tok->setter = &filters_set_ilen; -+ tok->tag = strdup("ILEN"); -+ free(tmp.s); -+ return 0; -+ } - - // is it a value? Here we parse as integer/float separately and use strtof - // rather than strtod, because the more accurate double representation - // would invalidate floating point comparisons like QUAL=59.2, obtained via -- // htslib/vcf parser -+ // htslib/vcf parser. -+ // Update: use strtod() and force floats only in comparisons - char *end; - tok->threshold = strtol(tmp.s, &end, 10); // integer? - if ( end - tmp.s != strlen(tmp.s) ) - { - errno = 0; -- tok->threshold = strtof(tmp.s, &end); // float? -+ tok->threshold = strtod(tmp.s, &end); // float? - if ( errno!=0 || end!=tmp.s+len ) error("[%s:%d %s] Error: the tag \"%s\" is not defined in the VCF header\n", __FILE__,__LINE__,__FUNCTION__,tmp.s); - } - tok->is_constant = 1; -@@ -2455,7 +2668,7 @@ - if ( ret==-1 ) error("Missing quotes in: %s\n", str); - - // fprintf(stderr,"token=[%c] .. 
[%s] %d\n", TOKEN_STRING[ret], tmp, len); -- // int i; for (i=0; ihdr_id = -1; - tok->pass_site = -1; - tok->threshold = -1.0; -- if ( !strncasecmp(tmp-len,"N_PASS",6) ) { tok->func = func_npass; tok->tag = strdup("N_PASS"); } -- else if ( !strncasecmp(tmp-len,"F_PASS",6) ) { tok->func = func_npass; tok->tag = strdup("F_PASS"); } -+ if ( !strncasecmp(tmp-len,"N_PASS",6) ) -+ { -+ filter->max_unpack |= BCF_UN_FMT; -+ tok->func = func_npass; -+ tok->tag = strdup("N_PASS"); -+ } -+ else if ( !strncasecmp(tmp-len,"F_PASS",6) ) -+ { -+ filter->max_unpack |= BCF_UN_FMT; -+ tok->func = func_npass; -+ tok->tag = strdup("F_PASS"); -+ } - else error("The function \"%s\" is not supported\n", tmp-len); - continue; - } -@@ -2607,7 +2830,8 @@ - // list of operators and convert the strings (e.g. "PASS") to BCF ids. The string value token must be - // just before or after the FILTER token and they must be followed with a comparison operator. - // At this point we also initialize regex expressions which, in RPN, must preceed the LIKE/NLIKE operator. -- // Additionally, treat "." as missing value rather than a string in numeric equalities. -+ // Additionally, treat "." as missing value rather than a string in numeric equalities; that -+ // @file is only used with ID; etc. - // This code is fragile: improve me. - int i; - for (i=0; istr); - -+ if ( out[i].hash ) -+ { -+ int j = out[i+1].tok_type==TOK_VAL ? i+1 : i-1; -+ if ( out[j].comparator!=filters_cmp_id ) -+ error("Error: could not parse the expression. 
Note that the \"@file_name\" syntax can be currently used with ID column only.\n"); -+ } - if ( out[i].tok_type==TOK_OR || out[i].tok_type==TOK_OR_VEC ) - out[i].func = vector_logic_or; - if ( out[i].tok_type==TOK_AND || out[i].tok_type==TOK_AND_VEC ) -@@ -2629,7 +2859,7 @@ - int set_missing = 0; - if ( out[k].hdr_id>0 ) - { -- int type = bcf_hdr_id2type(filter->hdr,out[k].type,out[k].hdr_id); -+ int type = bcf_hdr_id2type(filter->hdr,out[k].tag_type,out[k].hdr_id); - if ( type==BCF_HT_INT ) set_missing = 1; - else if ( type==BCF_HT_REAL ) set_missing = 1; - } -@@ -2655,7 +2885,7 @@ - } - if ( out[i].tok_type!=TOK_VAL ) continue; - if ( !out[i].tag ) continue; -- if ( !strcmp(out[i].tag,"TYPE") ) -+ if ( out[i].setter==filters_set_type ) - { - if ( i+1==nout ) error("Could not parse the expression: %s\n", filter->str); - int itok, ival; -@@ -2669,6 +2899,7 @@ - else if ( !strcasecmp(out[ival].key,"mnp") || !strcasecmp(out[ival].key,"mnps") ) { out[ival].threshold = VCF_MNP<<1; out[ival].is_str = 0; } - else if ( !strcasecmp(out[ival].key,"other") ) { out[ival].threshold = VCF_OTHER<<1; out[ival].is_str = 0; } - else if ( !strcasecmp(out[ival].key,"bnd") ) { out[ival].threshold = VCF_BND<<1; out[ival].is_str = 0; } -+ else if ( !strcasecmp(out[ival].key,"overlap") ) { out[ival].threshold = VCF_OVERLAP<<1; out[ival].is_str = 0; } - else if ( !strcasecmp(out[ival].key,"ref") ) { out[ival].threshold = 1; out[ival].is_str = 0; } - else error("The type \"%s\" not recognised: %s\n", out[ival].key, filter->str); - if ( out[itok].tok_type==TOK_LIKE || out[itok].tok_type==TOK_NLIKE ) out[itok].comparator = filters_cmp_bit_and; -@@ -2703,7 +2934,7 @@ - else if ( !strcasecmp(out[ival].key,"r") ) { out[i].setter = filters_set_genotype2; out[ival].key[0]='r'; out[ival].key[1]=0; } // r - continue; - } -- if ( !strcmp(out[i].tag,"FILTER") ) -+ if ( out[i].tag_type==BCF_HL_FLT ) - { - if ( i+1==nout ) error("Could not parse the expression: %s\n", filter->str); - int itok = i, 
ival; -@@ -2732,13 +2963,17 @@ - filter->nsamples = filter->max_unpack&BCF_UN_FMT ? bcf_hdr_nsamples(filter->hdr) : 0; - for (i=0; ifilters[i].tok_type == TOK_EQ ) { missing_logic[0] = missing_logic[2] = 1; } -@@ -2893,7 +3130,6 @@ - CMP_VECTORS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],>=,missing_logic) - else - error("todo: %s:%d .. type=%d\n", __FILE__,__LINE__,filter->filters[i].tok_type); -- - } - filter->flt_stack[nstack-2] = &filter->filters[i]; - nstack--; ---- python-pysam.orig/bcftools/filter.c.pysam.c -+++ python-pysam/bcftools/filter.c.pysam.c -@@ -30,7 +30,10 @@ - #include - #include - #include -+#include -+#ifndef _WIN32 - #include -+#endif - #include - #include - #include -@@ -55,8 +58,8 @@ - # define __FUNCTION__ __func__ - #endif - --uint64_t bcf_double_missing = 0x7ff0000000000001; --uint64_t bcf_double_vector_end = 0x7ff0000000000002; -+static const uint64_t bcf_double_missing = 0x7ff0000000000001; -+static const uint64_t bcf_double_vector_end = 0x7ff0000000000002; - static inline void bcf_double_set(double *ptr, uint64_t value) - { - union { uint64_t i; double d; } u; -@@ -73,6 +76,7 @@ - #define bcf_double_set_missing(x) bcf_double_set(&(x),bcf_double_missing) - #define bcf_double_is_vector_end(x) bcf_double_test((x),bcf_double_vector_end) - #define bcf_double_is_missing(x) bcf_double_test((x),bcf_double_missing) -+#define bcf_double_is_missing_or_vector_end(x) (bcf_double_test((x),bcf_double_missing) || bcf_double_test((x),bcf_double_vector_end)) - - - typedef struct _token_t -@@ -84,7 +88,7 @@ - char *tag; // for debugging and printout only, VCF tag name - double threshold; // filtering threshold - int is_constant; // the threshold is set -- int hdr_id, type; // BCF header lookup ID and one of BCF_HT_* types -+ int hdr_id, tag_type; // BCF header lookup ID and one of BCF_HL_* types - int idx; // 0-based index to VCF vectors, - // -2: list (e.g. [0,1,2] or [1..3] or [1..] 
or any field[*], which is equivalent to [0..]) - int *idxs; // set indexes to 0 to exclude, to 1 to include, and last element negative if unlimited -@@ -153,11 +157,14 @@ - #define TOK_CNT 26 - #define TOK_PERLSUB 27 - #define TOK_BINOM 28 -- --// 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 --// ( ) [ < = > ] ! | & + - * / M m a A O ~ ^ S . l f c p --static int op_prec[] = {0,1,1,5,5,5,5,5,5,2,3, 6, 6, 7, 7, 8, 8, 8, 3, 2, 5, 5, 8, 8, 8, 8, 8, 8}; --#define TOKEN_STRING "x()[<=>]!|&+-*/MmaAO~^S.lfcp" -+#define TOK_PHRED 29 -+#define TOK_MEDIAN 30 -+#define TOK_STDEV 31 -+ -+// 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 -+// ( ) [ < = > ] ! | & + - * / M m a A O ~ ^ S . l f c p b P i s -+static int op_prec[] = {0,1,1,5,5,5,5,5,5,2,3, 6, 6, 7, 7, 8, 8, 8, 3, 2, 5, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8}; -+#define TOKEN_STRING "x()[<=>]!|&+-*/MmaAO~^S.lfcpis" - - // Return negative values if it is a function with variable number of arguments - static int filters_next_token(char **str, int *len) -@@ -181,12 +188,16 @@ - - if ( !strncasecmp(tmp,"MAX(",4) ) { (*str) += 3; return TOK_MAX; } - if ( !strncasecmp(tmp,"MIN(",4) ) { (*str) += 3; return TOK_MIN; } -+ if ( !strncasecmp(tmp,"MEAN(",5) ) { (*str) += 4; return TOK_AVG; } -+ if ( !strncasecmp(tmp,"MEDIAN(",7) ) { (*str) += 6; return TOK_MEDIAN; } - if ( !strncasecmp(tmp,"AVG(",4) ) { (*str) += 3; return TOK_AVG; } -+ if ( !strncasecmp(tmp,"STDEV(",6) ) { (*str) += 5; return TOK_STDEV; } - if ( !strncasecmp(tmp,"SUM(",4) ) { (*str) += 3; return TOK_SUM; } - if ( !strncasecmp(tmp,"ABS(",4) ) { (*str) += 3; return TOK_ABS; } - if ( !strncasecmp(tmp,"COUNT(",4) ) { (*str) += 5; return TOK_CNT; } - if ( !strncasecmp(tmp,"STRLEN(",7) ) { (*str) += 6; return TOK_LEN; } - if ( !strncasecmp(tmp,"BINOM(",6) ) { (*str) += 5; return -TOK_BINOM; } -+ if ( !strncasecmp(tmp,"PHRED(",6) ) { (*str) += 5; return TOK_PHRED; } - if ( !strncasecmp(tmp,"%MAX(",5) ) { 
(*str) += 4; return TOK_MAX; } // for backward compatibility - if ( !strncasecmp(tmp,"%MIN(",5) ) { (*str) += 4; return TOK_MIN; } // for backward compatibility - if ( !strncasecmp(tmp,"%AVG(",5) ) { (*str) += 4; return TOK_AVG; } // for backward compatibility -@@ -197,6 +208,7 @@ - if ( !strncasecmp(tmp,"PERL.",5) ) { (*str) += 5; return -TOK_PERLSUB; } - if ( !strncasecmp(tmp,"N_PASS(",7) ) { *len = 6; (*str) += 6; return -TOK_FUNC; } - if ( !strncasecmp(tmp,"F_PASS(",7) ) { *len = 6; (*str) += 6; return -TOK_FUNC; } -+ if ( !strncasecmp(tmp,"%ILEN",5) ) { *len = 5; return TOK_VAL; } // to be able to distinguish between INFO/ILEN and on-the-fly ILEN - - if ( tmp[0]=='@' ) // file name - { -@@ -282,28 +294,30 @@ - } - - --/* -+/* - Simple path expansion, expands ~/, ~user, $var. The result must be freed by the caller. - -- Based on jkb's staden code with some adjustements. -+ Based on jkb's staden code with some adjustments. - https://sourceforge.net/p/staden/code/HEAD/tree/staden/trunk/src/Misc/getfile.c#l123 - */ - char *expand_path(char *path) - { --#ifdef _WIN32 -- return strdup(path); // windows expansion: todo --#endif -- - kstring_t str = {0,0,0}; - - if ( path[0] == '~' ) - { - if ( !path[1] || path[1] == '/' ) - { -+#ifdef _WIN32 -+ kputs(getenv("HOMEDRIVE"), &str); -+ kputs(getenv("HOMEPATH"), &str); -+#else - // ~ or ~/path - kputs(getenv("HOME"), &str); - if ( path[1] ) kputs(path+1, &str); -+#endif - } -+#ifndef _WIN32 - else - { - // user name: ~pd3/path -@@ -317,13 +331,18 @@ - else kputs(pwentry->pw_dir, &str); - kputs(end, &str); - } -- return str.s; -+#endif -+ return ks_release(&str); - } - if ( path[0] == '$' ) - { - char *var = getenv(path+1); -- if ( var ) path = var; -+ if ( var ) { -+ kputs(var, &str); -+ return ks_release(&str); -+ } - } -+ - return strdup(path); - } - -@@ -446,6 +465,8 @@ - return; - } - -+ if ( !btok->str_value.l ) error("Error occurred while evaluating the expression\n"); -+ - if ( rtok->tok_type==TOK_EQ ) - 
rtok->pass_site = strcmp(btok->str_value.s,line->d.id) ? 0 : 1; - else -@@ -501,6 +522,14 @@ - return -1; // this shouldn't happen - } - -+static void filters_set_chrom(filter_t *flt, bcf1_t *line, token_t *tok) -+{ -+ tok->str_value.l = 0; -+ kputs(bcf_seqname(flt->hdr,line), &tok->str_value); -+ tok->nvalues = tok->str_value.l; -+ tok->is_str = 1; -+} -+ - static void filters_set_pos(filter_t *flt, bcf1_t *line, token_t *tok) - { - tok->values[0] = line->pos+1; -@@ -642,7 +671,7 @@ - static void filters_set_format_int(filter_t *flt, bcf1_t *line, token_t *tok) - { - if ( line->n_sample != tok->nsamples ) -- error("Incorrect number of FORMAT fields at %s:%d .. %s, %d vs %d\n", bcf_seqname(flt->hdr,line),line->pos+1,tok->tag,line->n_sample,tok->nsamples); -+ error("Incorrect number of FORMAT fields at %s:%"PRId64" .. %s, %d vs %d\n", bcf_seqname(flt->hdr,line),(int64_t) line->pos+1,tok->tag,line->n_sample,tok->nsamples); - - int nvals; - if ( (nvals=bcf_get_format_int32(flt->hdr,line,tok->tag,&flt->tmpi,&flt->mtmpi))<0 ) -@@ -661,8 +690,10 @@ - { - if ( !tok->usmpl[i] ) continue; - int32_t *ptr = flt->tmpi + i*nsrc1; -- if ( tok->idx>=nsrc1 || ptr[tok->idx]==bcf_int32_missing || ptr[tok->idx]==bcf_int32_vector_end ) -+ if ( tok->idx>=nsrc1 || ptr[tok->idx]==bcf_int32_missing ) - bcf_double_set_missing(tok->values[i]); -+ else if ( ptr[tok->idx]==bcf_int32_vector_end ) -+ bcf_double_set_vector_end(tok->values[i]); - else - tok->values[i] = ptr[tok->idx]; - } -@@ -679,24 +710,31 @@ - for (k=0; knidxs && !tok->idxs[k] ) continue; -- if ( src[k]==bcf_int32_missing || src[k]==bcf_int32_vector_end ) -+ if ( src[k]==bcf_int32_missing ) - bcf_double_set_missing(dst[j]); -+ else if ( src[k]==bcf_int32_vector_end ) -+ bcf_double_set_vector_end(dst[j]); - else - dst[j] = src[k]; - j++; - } -- while (j < tok->nval1) -+ if ( j==0 ) - { - bcf_double_set_missing(dst[j]); - j++; - } -+ while (j < tok->nval1) -+ { -+ bcf_double_set_vector_end(dst[j]); -+ j++; -+ } - } - } - } - 
static void filters_set_format_float(filter_t *flt, bcf1_t *line, token_t *tok) - { - if ( line->n_sample != tok->nsamples ) -- error("Incorrect number of FORMAT fields at %s:%d .. %s, %d vs %d\n", bcf_seqname(flt->hdr,line),line->pos+1,tok->tag,line->n_sample,tok->nsamples); -+ error("Incorrect number of FORMAT fields at %s:%"PRId64" .. %s, %d vs %d\n", bcf_seqname(flt->hdr,line),(int64_t) line->pos+1,tok->tag,line->n_sample,tok->nsamples); - - int nvals; - if ( (nvals=bcf_get_format_float(flt->hdr,line,tok->tag,&flt->tmpf,&flt->mtmpf))<0 ) -@@ -715,8 +753,10 @@ - { - if ( !tok->usmpl[i] ) continue; - float *ptr = flt->tmpf + i*nsrc1; -- if ( tok->idx>=nsrc1 || bcf_float_is_missing(ptr[tok->idx]) || bcf_float_is_vector_end(ptr[tok->idx]) ) -+ if ( tok->idx>=nsrc1 || bcf_float_is_missing(ptr[tok->idx]) ) - bcf_double_set_missing(tok->values[i]); -+ else if ( bcf_float_is_vector_end(ptr[tok->idx]) ) -+ bcf_double_set_vector_end(tok->values[i]); - else - tok->values[i] = ptr[tok->idx]; - } -@@ -733,24 +773,31 @@ - for (k=0; knidxs && !tok->idxs[k] ) continue; -- if ( bcf_float_is_missing(src[k]) || bcf_float_is_vector_end(src[k]) ) -+ if ( bcf_float_is_missing(src[k]) ) - bcf_double_set_missing(dst[j]); -+ else if ( bcf_float_is_vector_end(src[k]) ) -+ bcf_double_set_vector_end(dst[j]); - else - dst[j] = src[k]; - j++; - } -- while (j < tok->nval1) -+ if ( j==0 ) - { - bcf_double_set_missing(dst[j]); - j++; - } -+ while (j < tok->nval1) -+ { -+ bcf_double_set_vector_end(dst[j]); -+ j++; -+ } - } - } - } - static void filters_set_format_string(filter_t *flt, bcf1_t *line, token_t *tok) - { - if ( line->n_sample != tok->nsamples ) -- error("Incorrect number of FORMAT fields at %s:%d .. %s, %d vs %d\n", bcf_seqname(flt->hdr,line),line->pos+1,tok->tag,line->n_sample,tok->nsamples); -+ error("Incorrect number of FORMAT fields at %s:%"PRId64" .. 
%s, %d vs %d\n", bcf_seqname(flt->hdr,line),(int64_t) line->pos+1,tok->tag,line->n_sample,tok->nsamples); - - int i, ndim = tok->str_value.m; - int nstr = bcf_get_format_char(flt->hdr, line, tok->tag, &tok->str_value.s, &ndim); -@@ -870,7 +917,7 @@ - case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_vector_end); break; - case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_vector_end); break; - case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_vector_end); break; -- default: error("The GT type is not lineognised: %d at %s:%d\n",fmt->type, bcf_seqname(flt->hdr,line),line->pos+1); break; -+ default: error("The GT type is not lineognised: %d at %s:%"PRId64"\n",fmt->type, bcf_seqname(flt->hdr,line),(int64_t) line->pos+1); break; - } - #undef BRANCH_INT - assert( tok->nsamples == nsmpl ); -@@ -918,6 +965,19 @@ - tok->nvalues = tok->str_value.l; - tok->nval1 = blen; - } -+static void filters_set_ilen(filter_t *flt, bcf1_t *line, token_t *tok) -+{ -+ tok->nvalues = line->n_allele - 1; -+ hts_expand(double,tok->nvalues,tok->mvalues,tok->values); -+ -+ int i, rlen = strlen(line->d.allele[0]); -+ for (i=1; in_allele; i++) -+ { -+ int alen = strlen(line->d.allele[i]); -+ if ( rlen==alen ) bcf_double_set_missing(tok->values[i-1]); -+ else tok->values[i-1] = alen - rlen; -+ } -+} - static void filters_set_ref_string(filter_t *flt, bcf1_t *line, token_t *tok) - { - tok->str_value.l = 0; -@@ -1016,10 +1076,16 @@ - if ( rtok->pass_samples[i] ) npass++; - } - -- assert( rtok->values ); -- rtok->nvalues = 1; -- rtok->values[0] = rtok->tag[0]=='N' ? npass : (line->n_sample ? 1.0*npass/line->n_sample : 0); -- rtok->nsamples = 0; -+ hts_expand(double,rtok->nsamples,rtok->mvalues,rtok->values); -+ double value = rtok->tag[0]=='N' ? npass : (line->n_sample ? 1.0*npass/line->n_sample : 0); -+ rtok->nval1 = 1; -+ rtok->nvalues = rtok->nsamples; -+ -+ // Set per-sample status so that `query -i 'F_PASS(GT!="mis" & GQ >= 20) > 0.5'` or +trio-stats -+ // consider only the passing site AND samples. 
The values for failed samples is set to -1 so -+ // that it can never conflict with valid expressions. -+ for (i=0; insamples; i++) -+ rtok->values[i] = rtok->pass_samples[i] ? value : -1; - - return 1; - } -@@ -1105,7 +1171,7 @@ - int i, has_value = 0; - for (i=0; invalues; i++) - { -- if ( bcf_double_is_missing(tok->values[i]) || bcf_double_is_vector_end(tok->values[i]) ) continue; -+ if ( bcf_double_is_missing_or_vector_end(tok->values[i]) ) continue; - has_value = 1; - if ( val < tok->values[i] ) val = tok->values[i]; - } -@@ -1125,7 +1191,7 @@ - int i, has_value = 0; - for (i=0; invalues; i++) - { -- if ( bcf_double_is_missing(tok->values[i]) || bcf_double_is_vector_end(tok->values[i]) ) continue; -+ if ( bcf_double_is_missing_or_vector_end(tok->values[i]) ) continue; - has_value = 1; - if ( val > tok->values[i] ) val = tok->values[i]; - } -@@ -1144,7 +1210,7 @@ - double val = 0; - int i, n = 0; - for (i=0; invalues; i++) -- if ( !bcf_double_is_missing(tok->values[i]) ) { val += tok->values[i]; n++; } -+ if ( !bcf_double_is_missing_or_vector_end(tok->values[i]) ) { val += tok->values[i]; n++; } - if ( n ) - { - rtok->values[0] = val / n; -@@ -1152,6 +1218,61 @@ - } - return 1; - } -+static int compare_doubles(const void *lhs, const void *rhs) -+{ -+ double arg1 = *(const double*) lhs; -+ double arg2 = *(const double*) rhs; -+ if (arg1 < arg2) return -1; -+ if (arg1 > arg2) return 1; -+ return 0; -+} -+static int func_median(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack) -+{ -+ token_t *tok = stack[nstack - 1]; -+ rtok->nvalues = 0; -+ if ( !tok->nvalues ) return 1; -+ int i, n = 0; -+ for (i=0; invalues; i++) -+ { -+ if ( bcf_double_is_missing_or_vector_end(tok->values[i]) ) continue; -+ if ( n < i ) tok->values[n] = tok->values[i]; -+ n++; -+ } -+ if ( !n ) return 1; -+ if ( n==1 ) rtok->values[0] = tok->values[0]; -+ else -+ { -+ qsort(tok->values, n, sizeof(double), compare_doubles); -+ rtok->values[0] = n % 2 ? 
tok->values[n/2] : (tok->values[n/2-1] + tok->values[n/2]) * 0.5; -+ } -+ rtok->nvalues = 1; -+ return 1; -+} -+static int func_stddev(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack) -+{ -+ token_t *tok = stack[nstack - 1]; -+ rtok->nvalues = 0; -+ if ( !tok->nvalues ) return 1; -+ int i, n = 0; -+ for (i=0; invalues; i++) -+ { -+ if ( bcf_double_is_missing_or_vector_end(tok->values[i]) ) continue; -+ if ( n < i ) tok->values[n] = tok->values[i]; -+ n++; -+ } -+ if ( !n ) return 1; -+ if ( n==1 ) rtok->values[0] = 0; -+ else -+ { -+ double sdev = 0, avg = 0; -+ for (i=0; ivalues[n]; -+ avg /= n; -+ for (i=0; ivalues[n] - avg) * (tok->values[n] - avg); -+ rtok->values[0] = sqrt(sdev/n); -+ } -+ rtok->nvalues = 1; -+ return 1; -+} - static int func_sum(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack) - { - rtok->nvalues = 0; -@@ -1160,7 +1281,7 @@ - double val = 0; - int i, n = 0; - for (i=0; invalues; i++) -- if ( !bcf_double_is_missing(tok->values[i]) ) { val += tok->values[i]; n++; } -+ if ( !bcf_double_is_missing_or_vector_end(tok->values[i]) ) { val += tok->values[i]; n++; } - if ( n ) - { - rtok->values[0] = val; -@@ -1179,17 +1300,28 @@ - int i; - for (i=0; invalues; i++) - if ( bcf_double_is_missing(tok->values[i]) ) bcf_double_set_missing(rtok->values[i]); -- else rtok->values[i] = fabs(tok->values[i]); -+ else if ( !bcf_double_is_vector_end(tok->values[i]) ) rtok->values[i] = fabs(tok->values[i]); - return 1; - } - static int func_count(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack) - { - token_t *tok = stack[nstack - 1]; -- if ( !tok->nsamples ) error("COUNT() can be applied only on FORMAT fields\n"); -- - int i, cnt = 0; -- for (i=0; insamples; i++) -- if ( tok->pass_samples[i] ) cnt++; -+ if ( !tok->nsamples ) -+ { -+ if ( tok->is_str ) -+ { -+ if ( tok->str_value.l ) cnt = 1; -+ for (i=0; istr_value.l; i++) if ( tok->str_value.s[i]==',' ) cnt++; -+ } -+ else -+ cnt = 
tok->nvalues; -+ } -+ else -+ { -+ for (i=0; insamples; i++) -+ if ( tok->pass_samples[i] ) cnt++; -+ } - - rtok->nvalues = 1; - rtok->values[0] = cnt; -@@ -1305,10 +1437,10 @@ - } - int idx1 = bcf_gt_allele(ptr[0]); - int idx2 = bcf_gt_allele(ptr[1]); -- if ( idx1>=line->n_allele ) error("Incorrect allele index at %s:%d, sample %s\n", bcf_seqname(flt->hdr,line),line->pos+1,flt->hdr->samples[i]); -- if ( idx2>=line->n_allele ) error("Incorrect allele index at %s:%d, sample %s\n", bcf_seqname(flt->hdr,line),line->pos+1,flt->hdr->samples[i]); -+ if ( idx1>=line->n_allele ) error("Incorrect allele index at %s:%"PRId64", sample %s\n", bcf_seqname(flt->hdr,line),(int64_t) line->pos+1,flt->hdr->samples[i]); -+ if ( idx2>=line->n_allele ) error("Incorrect allele index at %s:%"PRId64", sample %s\n", bcf_seqname(flt->hdr,line),(int64_t) line->pos+1,flt->hdr->samples[i]); - double *vals = tok->values + tok->nval1*i; -- if ( bcf_double_is_missing(vals[idx1]) || bcf_double_is_missing(vals[idx2]) ) -+ if ( bcf_double_is_missing_or_vector_end(vals[idx1]) || bcf_double_is_missing_or_vector_end(vals[idx2]) ) - { - bcf_double_set_missing(rtok->values[i]); - continue; -@@ -1326,13 +1458,13 @@ - // the fields given explicitly: binom(AD[:0],AD[:1]) - token_t *tok2 = stack[istack+1]; - if ( tok->nval1!=1 || tok2->nval1!=1 ) -- error("Expected one value per binom() argument, found %d and %d at %s:%d\n",tok->nval1,tok2->nval1, bcf_seqname(flt->hdr,line),line->pos+1); -+ error("Expected one value per binom() argument, found %d and %d at %s:%"PRId64"\n",tok->nval1,tok2->nval1, bcf_seqname(flt->hdr,line),(int64_t) line->pos+1); - for (i=0; insamples; i++) - { - if ( !rtok->usmpl[i] ) continue; - double *ptr1 = tok->values + tok->nval1*i; - double *ptr2 = tok2->values + tok2->nval1*i; -- if ( bcf_double_is_missing(ptr1[0]) || bcf_double_is_missing(ptr2[0]) ) -+ if ( bcf_double_is_missing_or_vector_end(ptr1[0]) || bcf_double_is_missing_or_vector_end(ptr2[0]) ) - { - 
bcf_double_set_missing(rtok->values[i]); - continue; -@@ -1372,7 +1504,7 @@ - ptr2 = &tok2->values[0]; - } - } -- if ( !ptr1 || !ptr2 || bcf_double_is_missing(ptr1[0]) || bcf_double_is_missing(ptr2[0]) ) -+ if ( !ptr1 || !ptr2 || bcf_double_is_missing_or_vector_end(ptr1[0]) || bcf_double_is_missing_or_vector_end(ptr2[0]) ) - bcf_double_set_missing(rtok->values[0]); - else - { -@@ -1383,6 +1515,31 @@ - } - return rtok->nargs; - } -+static int func_phred(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack) -+{ -+ token_t *tok = stack[nstack - 1]; -+ if ( tok->is_str ) error("PHRED() can be applied only on numeric values\n"); -+ -+ rtok->nsamples = tok->nsamples; -+ rtok->nval1 = tok->nval1; -+ memcpy(rtok->pass_samples, tok->pass_samples, rtok->nsamples*sizeof(*rtok->pass_samples)); -+ assert(tok->usmpl); -+ if ( !rtok->usmpl ) -+ { -+ rtok->usmpl = (uint8_t*) malloc(tok->nsamples*sizeof(*rtok->usmpl)); -+ memcpy(rtok->usmpl, tok->usmpl, tok->nsamples*sizeof(*rtok->usmpl)); -+ } -+ rtok->nvalues = tok->nvalues; -+ if ( !tok->nvalues ) return 1; -+ -+ hts_expand(double, rtok->nvalues, rtok->mvalues, rtok->values); -+ int i; -+ for (i=0; invalues; i++) -+ if ( bcf_double_is_missing_or_vector_end(tok->values[i]) ) bcf_double_set_missing(rtok->values[i]); -+ else rtok->values[i] = -4.34294481903*log(tok->values[i]); -+ -+ return 1; -+} - inline static void tok_init_values(token_t *atok, token_t *btok, token_t *rtok) - { - token_t *tok = atok->nvalues > btok->nvalues ? atok : btok; -@@ -1416,7 +1573,7 @@ - assert( atok->nsamples==btok->nsamples ); \ - for (i=0; invalues; i++) \ - { \ -- if ( bcf_double_is_missing(atok->values[i]) || bcf_double_is_missing(btok->values[i]) ) \ -+ if ( bcf_double_is_missing_or_vector_end(atok->values[i]) || bcf_double_is_missing_or_vector_end(btok->values[i]) ) \ - { \ - bcf_double_set_missing(rtok->values[i]); \ - continue; \ -@@ -1430,11 +1587,11 @@ - token_t *xtok = atok->nsamples ? 
atok : btok; \ - token_t *ytok = atok->nsamples ? btok : atok; \ - assert( ytok->nvalues==1 ); \ -- if ( !bcf_double_is_missing(ytok->values[0]) ) \ -+ if ( !bcf_double_is_missing_or_vector_end(ytok->values[0]) ) \ - { \ - for (i=0; invalues; i++) \ - { \ -- if ( bcf_double_is_missing(xtok->values[i]) ) \ -+ if ( bcf_double_is_missing_or_vector_end(xtok->values[i]) ) \ - { \ - bcf_double_set_missing(rtok->values[i]); \ - continue; \ -@@ -1568,7 +1725,6 @@ - { \ - token_t *rtok = _rtok; \ - int i, j, k; \ -- assert( !atok->nsamples || !btok->nsamples ); \ - tok_init_samples(atok, btok, rtok); \ - if ( !atok->nsamples && !btok->nsamples ) \ - { \ -@@ -1578,7 +1734,7 @@ - token_t *tok = atok->nvalues ? atok : btok; \ - for (j=0; jnvalues; j++) \ - { \ -- if ( bcf_double_is_missing(tok->values[j]) ) \ -+ if ( bcf_double_is_missing_or_vector_end(tok->values[j]) ) \ - { \ - if ( missing_logic[2] ) { rtok->pass_site = 1; break; } \ - } \ -@@ -1589,15 +1745,19 @@ - { \ - for (i=0; invalues; i++) \ - { \ -- int amiss = bcf_double_is_missing(atok->values[i]) ? 1 : 0; \ -+ int amiss = bcf_double_is_missing_or_vector_end(atok->values[i]) ? 1 : 0; \ - for (j=0; jnvalues; j++) \ - { \ -- int nmiss = amiss + (bcf_double_is_missing(btok->values[j]) ? 1 : 0); \ -+ int nmiss = amiss + (bcf_double_is_missing_or_vector_end(btok->values[j]) ? 
1 : 0); \ - if ( nmiss ) \ - { \ - if ( missing_logic[nmiss] ) { rtok->pass_site = 1; i = atok->nvalues; break; } \ - } \ -- else if ( atok->values[i] CMP_OP btok->values[j] ) { rtok->pass_site = 1; i = atok->nvalues; break; } \ -+ else if ( atok->values[i] > 16777216 || btok->values[j] > 16777216 ) /* Ugly, see #871 */ \ -+ { \ -+ if ( atok->values[i] CMP_OP btok->values[j] ) { rtok->pass_site = 1; i = atok->nvalues; break; } \ -+ } \ -+ else if ( (float)atok->values[i] CMP_OP (float)btok->values[j] ) { rtok->pass_site = 1; i = atok->nvalues; break; } \ - } \ - } \ - } \ -@@ -1619,7 +1779,7 @@ - { \ - int miss = 0; \ - for (j=0; jnvalues; j++) \ -- miss |= bcf_double_is_missing(tok->values[j]) ? 1 : 0; \ -+ miss |= bcf_double_is_missing_or_vector_end(tok->values[j]) ? 1 : 0; \ - if ( missing_logic[++miss] ) \ - { \ - for (i=0; insamples; i++) \ -@@ -1633,10 +1793,36 @@ - double *ptr = tok->values + i*tok->nval1; \ - int miss = 0; \ - for (j=0; jnval1; j++) \ -- miss |= bcf_double_is_missing(ptr[j]) ? 1 : 0; \ -+ miss |= bcf_double_is_missing_or_vector_end(ptr[j]) ? 1 : 0; \ - if ( missing_logic[++miss] ) { rtok->pass_samples[i] = missing_logic[miss]; rtok->pass_site = 1; } \ - } \ - } \ -+ else if ( atok->nsamples && btok->nsamples ) \ -+ { \ -+ if ( atok->nval1!=btok->nval1 ) error("Incompatible number of per-sample values in comparison: %d vs %d\n",atok->nval1,btok->nval1); \ -+ if ( atok->nsamples!=btok->nsamples ) error("Incompatible number samples in comparison: %d vs %d\n",atok->nsamples,btok->nsamples); \ -+ for (i=0; insamples; i++) \ -+ { \ -+ if ( !atok->usmpl[i] || !btok->usmpl[i] ) { rtok->usmpl[i] = 0; continue; } \ -+ double *aptr = atok->values + i*atok->nval1; \ -+ double *bptr = btok->values + i*btok->nval1; \ -+ for (j=0; jnval1; j++) \ -+ { \ -+ int nmiss = bcf_double_is_missing_or_vector_end(aptr[j]) ? 
1 : 0; \ -+ if ( nmiss && !missing_logic[0] ) continue; /* any is missing => result is false */ \ -+ nmiss += (bcf_double_is_missing_or_vector_end(bptr[j]) ? 1 : 0); \ -+ if ( nmiss ) \ -+ { \ -+ if ( missing_logic[nmiss] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; break; } \ -+ } \ -+ else if ( aptr[j] > 16777216 || bptr[j] > 16777216 ) /* Ugly, see #871 */ \ -+ { \ -+ if ( aptr[j] CMP_OP bptr[j] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; break; } \ -+ } \ -+ else if ( (float)aptr[j] CMP_OP (float)bptr[j] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; break; } \ -+ } \ -+ } \ -+ } \ - else \ - { \ - token_t *xtok = atok->nsamples ? atok : btok; \ -@@ -1648,16 +1834,20 @@ - double *yptr = ytok->values + i*ytok->nval1; \ - for (j=0; jnval1; j++) \ - { \ -- int miss = bcf_double_is_missing(xptr[j]) ? 1 : 0; \ -+ int miss = bcf_double_is_missing_or_vector_end(xptr[j]) ? 1 : 0; \ - if ( miss && !missing_logic[0] ) continue; /* any is missing => result is false */ \ - for (k=0; knvalues; k++) \ - { \ -- int nmiss = miss + (bcf_double_is_missing(yptr[k]) ? 1 : 0); \ -+ int nmiss = miss + (bcf_double_is_missing_or_vector_end(yptr[k]) ? 
1 : 0); \ - if ( nmiss ) \ - { \ - if ( missing_logic[nmiss] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; j = xtok->nval1; break; } \ - } \ -- else if ( xptr[j] CMP_OP yptr[k] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; j = xtok->nval1; break; } \ -+ else if ( xptr[j] > 16777216 || yptr[k] > 16777216 ) /* Ugly, see #871 */ \ -+ { \ -+ if ( xptr[j] CMP_OP yptr[k] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; j = xtok->nval1; break; } \ -+ } \ -+ else if ( (float)xptr[j] CMP_OP (float)yptr[k] ) { rtok->pass_samples[i] = 1; rtok->pass_site = 1; j = xtok->nval1; break; } \ - } \ - } \ - } \ -@@ -1876,11 +2066,15 @@ - int *idxs2 = NULL, nidxs2 = 0, idx2 = 0; - - int set_samples = 0; -- char *colon = rindex(tag_idx, ':'); -+ char *colon = strrchr(tag_idx, ':'); - if ( tag_idx[0]=='@' ) // file list with sample names - { - if ( !is_fmt ) error("Could not parse \"%s\". (Not a FORMAT tag yet a sample list provided.)\n", ori); - char *fname = expand_path(tag_idx+1); -+#ifdef _WIN32 -+ if (fname && strlen(fname) > 2 && fname[1] == ':') // Deal with Windows paths, such as 'C:\..' 
-+ colon = strrchr(fname+2, ':'); -+#endif - int nsmpl; - char **list = hts_readlist(fname, 1, &nsmpl); - if ( !list && colon ) -@@ -1889,7 +2083,7 @@ - tok->idxs = idxs2; - tok->nidxs = nidxs2; - tok->idx = idx2; -- colon = rindex(fname, ':'); -+ colon = strrchr(fname, ':'); - *colon = 0; - list = hts_readlist(fname, 1, &nsmpl); - } -@@ -1997,6 +2191,7 @@ - } - static int filters_init1(filter_t *filter, char *str, int len, token_t *tok) - { -+ tok->tag_type = -1; - tok->tok_type = TOK_VAL; - tok->hdr_id = -1; - tok->pass_site = -1; -@@ -2067,6 +2262,7 @@ - tok->comparator = filters_cmp_filter; - tok->tag = strdup("FILTER"); - filter->max_unpack |= BCF_UN_FLT; -+ tok->tag_type = BCF_HL_FLT; - return 0; - } - else if ( !strncasecmp(str,"ID",len) || !strncasecmp(str,"%ID",len) /* for backward compatibility */ ) -@@ -2075,6 +2271,12 @@ - tok->tag = strdup("ID"); - return 0; - } -+ else if ( !strncasecmp(str,"CHROM",len) ) -+ { -+ tok->setter = &filters_set_chrom; -+ tok->tag = strdup("CHROM"); -+ return 0; -+ } - else if ( !strncasecmp(str,"POS",len) ) - { - tok->setter = &filters_set_pos; -@@ -2113,12 +2315,14 @@ - } - else if ( !strncasecmp(str,"N_MISSING",len) ) - { -+ filter->max_unpack |= BCF_UN_FMT; - tok->setter = &filters_set_nmissing; - tok->tag = strdup("N_MISSING"); - return 0; - } - else if ( !strncasecmp(str,"F_MISSING",len) ) - { -+ filter->max_unpack |= BCF_UN_FMT; - tok->setter = &filters_set_nmissing; - tok->tag = strdup("F_MISSING"); - return 0; -@@ -2156,7 +2360,7 @@ - for (i=0; insamples; i++) tok->usmpl[i] = 1; - } - -- tok->type = is_fmt ? BCF_HL_FMT : BCF_HL_INFO; -+ tok->tag_type = is_fmt ? 
BCF_HL_FMT : BCF_HL_INFO; - if ( is_fmt ) filter->max_unpack |= BCF_UN_FMT; - if ( tok->hdr_id>=0 ) - { -@@ -2266,17 +2470,26 @@ - free(tmp.s); - return 0; - } -+ else if ( !strcasecmp(tmp.s,"ILEN") || !strcasecmp(tmp.s,"%ILEN") ) -+ { -+ filter->max_unpack |= BCF_UN_STR; -+ tok->setter = &filters_set_ilen; -+ tok->tag = strdup("ILEN"); -+ free(tmp.s); -+ return 0; -+ } - - // is it a value? Here we parse as integer/float separately and use strtof - // rather than strtod, because the more accurate double representation - // would invalidate floating point comparisons like QUAL=59.2, obtained via -- // htslib/vcf parser -+ // htslib/vcf parser. -+ // Update: use strtod() and force floats only in comparisons - char *end; - tok->threshold = strtol(tmp.s, &end, 10); // integer? - if ( end - tmp.s != strlen(tmp.s) ) - { - errno = 0; -- tok->threshold = strtof(tmp.s, &end); // float? -+ tok->threshold = strtod(tmp.s, &end); // float? - if ( errno!=0 || end!=tmp.s+len ) error("[%s:%d %s] Error: the tag \"%s\" is not defined in the VCF header\n", __FILE__,__LINE__,__FUNCTION__,tmp.s); - } - tok->is_constant = 1; -@@ -2457,7 +2670,7 @@ - if ( ret==-1 ) error("Missing quotes in: %s\n", str); - - // fprintf(bcftools_stderr,"token=[%c] .. 
[%s] %d\n", TOKEN_STRING[ret], tmp, len); -- // int i; for (i=0; ihdr_id = -1; - tok->pass_site = -1; - tok->threshold = -1.0; -- if ( !strncasecmp(tmp-len,"N_PASS",6) ) { tok->func = func_npass; tok->tag = strdup("N_PASS"); } -- else if ( !strncasecmp(tmp-len,"F_PASS",6) ) { tok->func = func_npass; tok->tag = strdup("F_PASS"); } -+ if ( !strncasecmp(tmp-len,"N_PASS",6) ) -+ { -+ filter->max_unpack |= BCF_UN_FMT; -+ tok->func = func_npass; -+ tok->tag = strdup("N_PASS"); -+ } -+ else if ( !strncasecmp(tmp-len,"F_PASS",6) ) -+ { -+ filter->max_unpack |= BCF_UN_FMT; -+ tok->func = func_npass; -+ tok->tag = strdup("F_PASS"); -+ } - else error("The function \"%s\" is not supported\n", tmp-len); - continue; - } -@@ -2609,7 +2832,8 @@ - // list of operators and convert the strings (e.g. "PASS") to BCF ids. The string value token must be - // just before or after the FILTER token and they must be followed with a comparison operator. - // At this point we also initialize regex expressions which, in RPN, must preceed the LIKE/NLIKE operator. -- // Additionally, treat "." as missing value rather than a string in numeric equalities. -+ // Additionally, treat "." as missing value rather than a string in numeric equalities; that -+ // @file is only used with ID; etc. - // This code is fragile: improve me. - int i; - for (i=0; istr); - -+ if ( out[i].hash ) -+ { -+ int j = out[i+1].tok_type==TOK_VAL ? i+1 : i-1; -+ if ( out[j].comparator!=filters_cmp_id ) -+ error("Error: could not parse the expression. 
Note that the \"@file_name\" syntax can be currently used with ID column only.\n"); -+ } - if ( out[i].tok_type==TOK_OR || out[i].tok_type==TOK_OR_VEC ) - out[i].func = vector_logic_or; - if ( out[i].tok_type==TOK_AND || out[i].tok_type==TOK_AND_VEC ) -@@ -2631,7 +2861,7 @@ - int set_missing = 0; - if ( out[k].hdr_id>0 ) - { -- int type = bcf_hdr_id2type(filter->hdr,out[k].type,out[k].hdr_id); -+ int type = bcf_hdr_id2type(filter->hdr,out[k].tag_type,out[k].hdr_id); - if ( type==BCF_HT_INT ) set_missing = 1; - else if ( type==BCF_HT_REAL ) set_missing = 1; - } -@@ -2657,7 +2887,7 @@ - } - if ( out[i].tok_type!=TOK_VAL ) continue; - if ( !out[i].tag ) continue; -- if ( !strcmp(out[i].tag,"TYPE") ) -+ if ( out[i].setter==filters_set_type ) - { - if ( i+1==nout ) error("Could not parse the expression: %s\n", filter->str); - int itok, ival; -@@ -2671,6 +2901,7 @@ - else if ( !strcasecmp(out[ival].key,"mnp") || !strcasecmp(out[ival].key,"mnps") ) { out[ival].threshold = VCF_MNP<<1; out[ival].is_str = 0; } - else if ( !strcasecmp(out[ival].key,"other") ) { out[ival].threshold = VCF_OTHER<<1; out[ival].is_str = 0; } - else if ( !strcasecmp(out[ival].key,"bnd") ) { out[ival].threshold = VCF_BND<<1; out[ival].is_str = 0; } -+ else if ( !strcasecmp(out[ival].key,"overlap") ) { out[ival].threshold = VCF_OVERLAP<<1; out[ival].is_str = 0; } - else if ( !strcasecmp(out[ival].key,"ref") ) { out[ival].threshold = 1; out[ival].is_str = 0; } - else error("The type \"%s\" not recognised: %s\n", out[ival].key, filter->str); - if ( out[itok].tok_type==TOK_LIKE || out[itok].tok_type==TOK_NLIKE ) out[itok].comparator = filters_cmp_bit_and; -@@ -2705,7 +2936,7 @@ - else if ( !strcasecmp(out[ival].key,"r") ) { out[i].setter = filters_set_genotype2; out[ival].key[0]='r'; out[ival].key[1]=0; } // r - continue; - } -- if ( !strcmp(out[i].tag,"FILTER") ) -+ if ( out[i].tag_type==BCF_HL_FLT ) - { - if ( i+1==nout ) error("Could not parse the expression: %s\n", filter->str); - int itok = i, 
ival; -@@ -2734,13 +2965,17 @@ - filter->nsamples = filter->max_unpack&BCF_UN_FMT ? bcf_hdr_nsamples(filter->hdr) : 0; - for (i=0; ifilters[i].tok_type == TOK_EQ ) { missing_logic[0] = missing_logic[2] = 1; } -@@ -2895,7 +3132,6 @@ - CMP_VECTORS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],>=,missing_logic) - else - error("todo: %s:%d .. type=%d\n", __FILE__,__LINE__,filter->filters[i].tok_type); -- - } - filter->flt_stack[nstack-2] = &filter->filters[i]; - nstack--; ---- python-pysam.orig/bcftools/gvcf.c -+++ python-pysam/bcftools/gvcf.c -@@ -156,7 +156,7 @@ - if ( gvcf->npl>0 ) - bcf_update_format_int32(hdr, gvcf->line, "PL", gvcf->pl, gvcf->npl); - bcf_update_format_int32(hdr, gvcf->line, "DP", gvcf->dp, nsmpl); -- bcf_write1(fh, hdr, gvcf->line); -+ if ( bcf_write1(fh, hdr, gvcf->line)!=0 ) error("[%s] Error: failed to write the record\n", __func__); - gvcf->prev_range = 0; - gvcf->rid = -1; - gvcf->npl = 0; ---- python-pysam.orig/bcftools/gvcf.c.pysam.c -+++ python-pysam/bcftools/gvcf.c.pysam.c -@@ -158,7 +158,7 @@ - if ( gvcf->npl>0 ) - bcf_update_format_int32(hdr, gvcf->line, "PL", gvcf->pl, gvcf->npl); - bcf_update_format_int32(hdr, gvcf->line, "DP", gvcf->dp, nsmpl); -- bcf_write1(fh, hdr, gvcf->line); -+ if ( bcf_write1(fh, hdr, gvcf->line)!=0 ) error("[%s] Error: failed to write the record\n", __func__); - gvcf->prev_range = 0; - gvcf->rid = -1; - gvcf->npl = 0; ---- /dev/null -+++ python-pysam/bcftools/hex.h -@@ -0,0 +1,95 @@ -+// VariantKey -+// -+// hex.h -+// -+// @category Libraries -+// @author Nicola Asuni -+// @copyright 2017-2018 GENOMICS plc -+// @license MIT (see LICENSE) -+// @link https://github.com/genomicsplc/variantkey -+// -+// LICENSE -+// -+// Copyright (c) 2017-2018 GENOMICS plc -+// -+// Permission is hereby granted, free of charge, to any person obtaining a copy -+// of this software and associated documentation files (the "Software"), to deal -+// in the Software without restriction, including without 
limitation the rights -+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -+// copies of the Software, and to permit persons to whom the Software is -+// furnished to do so, subject to the following conditions: -+// -+// The above copyright notice and this permission notice shall be included in -+// all copies or substantial portions of the Software. -+// -+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -+// THE SOFTWARE. -+ -+/** -+ * @file hex.h -+ * @brief Utility functions to manipulate strings. -+ * -+ * Collection of utility functions to manipulate strings. -+ */ -+ -+#ifndef ASTRING_H -+#define ASTRING_H -+ -+#include -+#include -+ -+/** @brief Returns uint64_t hexadecimal string (16 characters). -+ * -+ * @param n Number to parse -+ * @param str String buffer to be returned (it must be sized 17 bytes at least). -+ * -+ * @return Upon successful return, these function returns the number of characters processed -+ * (excluding the null byte used to end output to strings). -+ * If the buffer size is not sufficient, then the return value is the number of characters required for -+ * buffer string, including the terminating null byte. -+ */ -+static inline size_t hex_uint64_t(uint64_t n, char *str) -+{ -+ return sprintf(str, "%016" PRIx64, n); -+} -+ -+/** @brief Parses a 16 chars hexadecimal string and returns the code. -+ * -+ * @param s Hexadecimal string to parse (it must contain 16 hexadecimal characters). -+ * -+ * @return uint64_t unsigned integer number. 
-+ */ -+static inline uint64_t parse_hex_uint64_t(const char *s) -+{ -+ uint64_t v = 0; -+ uint8_t b; -+ size_t i; -+ for (i = 0; i < 16; i++) -+ { -+ b = s[i]; -+ if (b >= 'a') -+ { -+ b -= ('a' - 10); // a-f -+ } -+ else -+ { -+ if (b >= 'A') -+ { -+ b -= ('A' - 10); // A-F -+ } -+ else -+ { -+ b -= '0'; // 0-9 -+ } -+ } -+ v = ((v << 4) | b); -+ } -+ return v; -+} -+ -+#endif // ASTRING_H ---- python-pysam.orig/bcftools/htslib-1.9/LICENSE -+++ /dev/null -@@ -1,69 +0,0 @@ --[Files in this distribution outwith the cram/ subdirectory are distributed --according to the terms of the following MIT/Expat license.] -- --The MIT/Expat License -- --Copyright (C) 2012-2018 Genome Research Ltd. -- --Permission is hereby granted, free of charge, to any person obtaining a copy --of this software and associated documentation files (the "Software"), to deal --in the Software without restriction, including without limitation the rights --to use, copy, modify, merge, publish, distribute, sublicense, and/or sell --copies of the Software, and to permit persons to whom the Software is --furnished to do so, subject to the following conditions: -- --The above copyright notice and this permission notice shall be included in --all copies or substantial portions of the Software. -- --THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR --IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, --FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL --THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER --LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING --FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER --DEALINGS IN THE SOFTWARE. -- -- --[Files within the cram/ subdirectory in this distribution are distributed --according to the terms of the following Modified 3-Clause BSD license.] -- --The Modified-BSD License -- --Copyright (C) 2012-2018 Genome Research Ltd. 
-- --Redistribution and use in source and binary forms, with or without --modification, are permitted provided that the following conditions are met: -- --1. Redistributions of source code must retain the above copyright notice, -- this list of conditions and the following disclaimer. -- --2. Redistributions in binary form must reproduce the above copyright notice, -- this list of conditions and the following disclaimer in the documentation -- and/or other materials provided with the distribution. -- --3. Neither the names Genome Research Ltd and Wellcome Trust Sanger Institute -- nor the names of its contributors may be used to endorse or promote products -- derived from this software without specific prior written permission. -- --THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" --AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE --IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE --DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR ITS CONTRIBUTORS BE LIABLE --FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL --DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR --SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER --CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, --OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE --OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -- -- --[The use of a range of years within a copyright notice in this distribution --should be interpreted as being equivalent to a list of years including the --first and last year specified and all consecutive years between them. 
-- --For example, a copyright notice that reads "Copyright (C) 2005, 2007-2009, --2011-2012" should be interpreted as being identical to a notice that reads --"Copyright (C) 2005, 2007, 2008, 2009, 2011, 2012" and a copyright notice --that reads "Copyright (C) 2005-2012" should be interpreted as being identical --to a notice that reads "Copyright (C) 2005, 2006, 2007, 2008, 2009, 2010, --2011, 2012".] ---- python-pysam.orig/bcftools/htslib-1.9/README -+++ /dev/null -@@ -1,5 +0,0 @@ --HTSlib is an implementation of a unified C library for accessing common file --formats, such as SAM, CRAM, VCF, and BCF, used for high-throughput sequencing --data. It is the core library used by samtools and bcftools. -- --See INSTALL for building and installation instructions. ---- python-pysam.orig/bcftools/main.c -+++ python-pysam/bcftools/main.c -@@ -53,7 +53,9 @@ - #if USE_GPL - int main_polysomy(int argc, char *argv[]); - #endif -+#ifdef ENABLE_BCF_PLUGINS - int main_plugin(int argc, char *argv[]); -+#endif - int main_consensus(int argc, char *argv[]); - int main_csq(int argc, char *argv[]); - int bam_mpileup(int argc, char *argv[]); -@@ -110,15 +112,12 @@ - .alias = "norm", - .help = "left-align and normalize indels" - }, -+#ifdef ENABLE_BCF_PLUGINS - { .func = main_plugin, - .alias = "plugin", --#ifdef ENABLE_BCF_PLUGINS - .help = "user-defined plugins" --#else -- /* Do not advertise when plugins disabled. */ -- .help = "-user-defined plugins" --#endif - }, -+#endif - { .func = main_vcfquery, - .alias = "query", - .help = "transform VCF/BCF into user-defined formats" -@@ -235,12 +234,24 @@ - fprintf(fp,"\n"); - } - -+// This is a tricky one, but on Windows the filename wildcard expansion is done by -+// the application and not by the shell, as traditionally it never had a "shell". -+// Even now, DOS and Powershell do not do this expansion (but bash does). -+// -+// This means that Mingw/Msys implements code before main() that takes e.g. 
"*" and -+// expands it up to a list of matching filenames. This in turn breaks things like -+// specifying "*" as a region (all the unmapped reads). We take a hard line here - -+// filename expansion is the task of the shell, not our application! -+#ifdef _WIN32 -+int _CRT_glob = 0; -+#endif -+ - int main(int argc, char *argv[]) - { - if (argc < 2) { usage(stderr); return 1; } - - if (strcmp(argv[1], "version") == 0 || strcmp(argv[1], "--version") == 0 || strcmp(argv[1], "-v") == 0) { -- printf("bcftools %s\nUsing htslib %s\nCopyright (C) 2018 Genome Research Ltd.\n", bcftools_version(), hts_version()); -+ printf("bcftools %s\nUsing htslib %s\nCopyright (C) 2019 Genome Research Ltd.\n", bcftools_version(), hts_version()); - #if USE_GPL - printf("License GPLv3+: GNU GPL version 3 or later \n"); - #else ---- python-pysam.orig/bcftools/main.c.pysam.c -+++ python-pysam/bcftools/main.c.pysam.c -@@ -55,7 +55,9 @@ - #if USE_GPL - int main_polysomy(int argc, char *argv[]); - #endif -+#ifdef ENABLE_BCF_PLUGINS - int main_plugin(int argc, char *argv[]); -+#endif - int main_consensus(int argc, char *argv[]); - int main_csq(int argc, char *argv[]); - int bam_mpileup(int argc, char *argv[]); -@@ -112,15 +114,12 @@ - .alias = "norm", - .help = "left-align and normalize indels" - }, -+#ifdef ENABLE_BCF_PLUGINS - { .func = main_plugin, - .alias = "plugin", --#ifdef ENABLE_BCF_PLUGINS - .help = "user-defined plugins" --#else -- /* Do not advertise when plugins disabled. */ -- .help = "-user-defined plugins" --#endif - }, -+#endif - { .func = main_vcfquery, - .alias = "query", - .help = "transform VCF/BCF into user-defined formats" -@@ -237,12 +236,24 @@ - fprintf(fp,"\n"); - } - -+// This is a tricky one, but on Windows the filename wildcard expansion is done by -+// the application and not by the shell, as traditionally it never had a "shell". -+// Even now, DOS and Powershell do not do this expansion (but bash does). 
-+// -+// This means that Mingw/Msys implements code before main() that takes e.g. "*" and -+// expands it up to a list of matching filenames. This in turn breaks things like -+// specifying "*" as a region (all the unmapped reads). We take a hard line here - -+// filename expansion is the task of the shell, not our application! -+#ifdef _WIN32 -+int _CRT_glob = 0; -+#endif -+ - int bcftools_main(int argc, char *argv[]) - { - if (argc < 2) { usage(bcftools_stderr); return 1; } - - if (strcmp(argv[1], "version") == 0 || strcmp(argv[1], "--version") == 0 || strcmp(argv[1], "-v") == 0) { -- fprintf(bcftools_stdout, "bcftools %s\nUsing htslib %s\nCopyright (C) 2018 Genome Research Ltd.\n", bcftools_version(), hts_version()); -+ fprintf(bcftools_stdout, "bcftools %s\nUsing htslib %s\nCopyright (C) 2019 Genome Research Ltd.\n", bcftools_version(), hts_version()); - #if USE_GPL - fprintf(bcftools_stdout, "License GPLv3+: GNU GPL version 3 or later \n"); - #else ---- python-pysam.orig/bcftools/mcall.c -+++ python-pysam/bcftools/mcall.c -@@ -23,7 +23,9 @@ - THE SOFTWARE. */ - - #include -+#include - #include -+#include - #include "call.h" - - // Using priors for GTs does not seem to be mathematically justified. Although -@@ -36,9 +38,6 @@ - // genotypes is reported instead. 
- #define FLAT_PDG_FOR_MISSING 0 - --// Estimate QS (combined quality and allele frequencies) from PLs --#define QS_FROM_PDG 0 -- - - void qcall_init(call_t *call) { return; } - void qcall_destroy(call_t *call) { return; } -@@ -244,12 +243,84 @@ - free(call->trio[j][i]); - } - -+static void init_sample_groups(call_t *call) -+{ -+ int i, nsmpl = bcf_hdr_nsamples(call->hdr); -+ if ( !call->sample_groups ) -+ { -+ // standard pooled calling, all samples in the same group -+ grp_t *grps = &call->smpl_grp; -+ grps->ngrp = 1; -+ grps->grp = (grp1_t*)calloc(grps->ngrp, sizeof(grp1_t)); -+ grps->smpl2grp = (int*)calloc(nsmpl,sizeof(int)); -+ } -+ else if ( !strcmp("-",call->sample_groups) ) -+ { -+ // single-sample calling, each sample creates its own group -+ grp_t *grps = &call->smpl_grp; -+ grps->ngrp = nsmpl; -+ grps->grp = (grp1_t*)calloc(grps->ngrp, sizeof(grp1_t)); -+ grps->smpl2grp = (int*)malloc(nsmpl*sizeof(int)); -+ for (i=0; ismpl2grp[i] = i; -+ } -+ else -+ { -+ int nlines; -+ char **lines = hts_readlist(call->sample_groups, 1, &nlines); -+ if ( !lines ) error("Could not read the file: %s\n", call->sample_groups); -+ -+ uint32_t *smpl2grp1 = (uint32_t*)calloc(nsmpl,sizeof(uint32_t)); -+ void *grp2idx = khash_str2int_init(); -+ -+ grp_t *grps = &call->smpl_grp; -+ for (i=0; isample_groups,lines[i]); -+ *ptr = 0; -+ int ismpl = bcf_hdr_id2int(call->hdr, BCF_DT_SAMPLE, lines[i]); -+ if ( ismpl<0 ) continue; -+ if ( smpl2grp1[ismpl] ) error("Error: the sample \"%s\" is listed twice in %s\n", lines[i],call->sample_groups); -+ if ( !khash_str2int_has_key(grp2idx,ptr+1) ) -+ { -+ khash_str2int_inc(grp2idx, ptr+1); -+ grps->ngrp++; -+ } -+ int igrp; -+ if ( khash_str2int_get(grp2idx, ptr+1, &igrp)==0 ) -+ smpl2grp1[ismpl] = igrp+1; -+ else -+ error("This should not happen, fixme: %s\n",ptr+1); -+ } -+ khash_str2int_destroy(grp2idx); -+ -+ grps->grp = (grp1_t*)calloc(grps->ngrp, sizeof(grp1_t)); -+ grps->smpl2grp = (int*)malloc(nsmpl*sizeof(int)); -+ for (i=0; 
ihdr->samples[i],call->sample_groups); -+ grps->smpl2grp[i] = smpl2grp1[i] - 1; -+ } -+ free(smpl2grp1); -+ for (i=0; ismpl_grp; -+ for (i=0; ingrp; i++) -+ free(grps->grp[i].qsum); -+ free(grps->grp); -+ free(grps->smpl2grp); -+} -+ - void mcall_init(call_t *call) - { - call_init_pl2p(call); - -- call->nqsum = 5; -- call->qsum = (float*) malloc(sizeof(float)*call->nqsum); // will be expanded later if ncessary - call->nals_map = 5; - call->als_map = (int*) malloc(sizeof(int)*call->nals_map); - call->npl_map = 5*(5+1)/2; // will be expanded later if necessary -@@ -302,26 +373,28 @@ - call->theta = log(call->theta); - } - -- return; -+ init_sample_groups(call); - } - - void mcall_destroy(call_t *call) - { -+ destroy_sample_groups(call); - if (call->vcmp) vcmp_destroy(call->vcmp); - free(call->itmp); - mcall_destroy_trios(call); - free(call->GPs); -+ free(call->ADs); - free(call->GLs); - free(call->GQs); - free(call->anno16); - free(call->PLs); -- free(call->qsum); - free(call->als_map); - free(call->pl_map); - free(call->gts); free(call->cgts); free(call->ugts); - free(call->pdg); - free(call->als); - free(call->ac); -+ free(call->qsum); - return; - } - -@@ -431,40 +504,6 @@ - } - } - --/* -- Allele frequency estimated as: -- #A = \sum_i (2*P_AA + P_AB) -- F_A = #A / ( #A + #B ) -- where i runs across all samples --*/ --void estimate_qsum(call_t *call, bcf1_t *rec) --{ -- double *pdg = call->pdg; -- int ngts = rec->n_allele*(rec->n_allele+1)/2; -- int i,nsmpl = bcf_hdr_nsamples(call->hdr); -- -- hts_expand(float,rec->n_allele,call->nqsum,call->qsum); -- for (i=0; in_allele; i++) call->qsum[i] = 0; -- -- for (i=0; in_allele; a++) -- { -- for (b=0; b<=a; b++) -- { -- call->qsum[a] += pdg[k]; -- call->qsum[b] += pdg[k]; -- k++; -- } -- } -- pdg += ngts; -- } -- float sum = 0; -- for (i=0; in_allele; i++) sum += call->qsum[i]; -- if ( sum ) for (i=0; in_allele; i++) call->qsum[i] /= sum; --} -- - // Create mapping between old and new (trimmed) alleles - void 
init_allele_trimming_maps(call_t *call, int als, int nals) - { -@@ -581,6 +620,7 @@ - // at most tri-allelic sites are considered. Returns the number of alleles. - static int mcall_find_best_alleles(call_t *call, int nals, int *out_als) - { -+ int j; - int ia,ib,ic; // iterators over up to three alleles - int max_als=0; // most likely combination of alleles - double ref_lk = 0, max_lk = -HUGE_VAL; // likelihood of the reference and of most likely combination of alleles -@@ -606,32 +646,46 @@ - UPDATE_MAX_LKs(1<0 && lk_tot_set); - } - -+ grp_t *grps = &call->smpl_grp; -+ - // Two alleles - if ( nals>1 ) - { - for (ia=0; iaqsum[ia]==0 ) continue; -+ if ( grps->ngrp==1 && grps->grp[0].qsum[ia]==0 ) continue; - int iaa = (ia+1)*(ia+2)/2-1; - for (ib=0; ibqsum[ib]==0 ) continue; -+ if ( grps->ngrp==1 && grps->grp[0].qsum[ib]==0 ) continue; - double lk_tot = 0; - int lk_tot_set = 0; -- double fa = call->qsum[ia]/(call->qsum[ia]+call->qsum[ib]); -- double fb = call->qsum[ib]/(call->qsum[ia]+call->qsum[ib]); -- double fa2 = fa*fa; -- double fb2 = fb*fb; -- double fab = 2*fa*fb; -+ int ia_cov = 0, ib_cov = 0; -+ for (j=0; jngrp; j++) -+ { -+ grp1_t *grp = &grps->grp[j]; -+ if ( grp->qsum[ia] ) ia_cov = 1; -+ if ( grp->qsum[ib] ) ib_cov = 1; -+ if ( !grp->qsum[ia] && !grp->qsum[ib] ) { grp->dp = 0; continue; } -+ grp->dp = 1; -+ grp->fa = grp->qsum[ia]/(grp->qsum[ia]+grp->qsum[ib]); -+ grp->fb = grp->qsum[ib]/(grp->qsum[ia]+grp->qsum[ib]); -+ grp->fa2 = grp->fa*grp->fa; -+ grp->fb2 = grp->fb*grp->fb; -+ grp->fab = 2*grp->fa*grp->fb; -+ } -+ if ( !ia_cov || !ib_cov ) continue; - int isample, ibb = (ib+1)*(ib+2)/2-1, iab = iaa - ia + ib; - double *pdg = call->pdg; - for (isample=0; isamplegrp[grps->smpl2grp[isample]]; -+ if ( !grp->dp ) continue; - double val = 0; - if ( !call->ploidy || call->ploidy[isample]==2 ) -- val = fa2*pdg[iaa] + fb2*pdg[ibb] + fab*pdg[iab]; -+ val = grp->fa2*pdg[iaa] + grp->fb2*pdg[ibb] + grp->fab*pdg[iab]; - else if ( call->ploidy && 
call->ploidy[isample]==1 ) -- val = fa*pdg[iaa] + fb*pdg[ibb]; -+ val = grp->fa*pdg[iaa] + grp->fb*pdg[ibb]; - if ( val ) { lk_tot += log(val); lk_tot_set = 1; } - pdg += ngts; - } -@@ -647,35 +701,48 @@ - { - for (ia=0; iaqsum[ia]==0 ) continue; -+ if ( grps->ngrp==1 && grps->grp[0].qsum[ia]==0 ) continue; - int iaa = (ia+1)*(ia+2)/2-1; - for (ib=0; ibqsum[ib]==0 ) continue; -+ if ( grps->ngrp==1 && grps->grp[0].qsum[ib]==0 ) continue; - int ibb = (ib+1)*(ib+2)/2-1; - int iab = iaa - ia + ib; - for (ic=0; icqsum[ic]==0 ) continue; -+ if ( grps->ngrp==1 && grps->grp[0].qsum[ic]==0 ) continue; - double lk_tot = 0; - int lk_tot_set = 1; -- double fa = call->qsum[ia]/(call->qsum[ia]+call->qsum[ib]+call->qsum[ic]); -- double fb = call->qsum[ib]/(call->qsum[ia]+call->qsum[ib]+call->qsum[ic]); -- double fc = call->qsum[ic]/(call->qsum[ia]+call->qsum[ib]+call->qsum[ic]); -- double fa2 = fa*fa; -- double fb2 = fb*fb; -- double fc2 = fc*fc; -- double fab = 2*fa*fb, fac = 2*fa*fc, fbc = 2*fb*fc; -+ int ia_cov = 0, ib_cov = 0, ic_cov = 0; -+ for (j=0; jngrp; j++) -+ { -+ grp1_t *grp = &grps->grp[j]; -+ if ( grp->qsum[ia] ) ia_cov = 1; -+ if ( grp->qsum[ib] ) ib_cov = 1; -+ if ( grp->qsum[ic] ) ic_cov = 1; -+ if ( !grp->qsum[ia] && !grp->qsum[ib] && !grp->qsum[ic] ) { grp->dp = 0; continue; } -+ grp->dp = 1; -+ grp->fa = grp->qsum[ia]/(grp->qsum[ia]+grp->qsum[ib]+grp->qsum[ic]); -+ grp->fb = grp->qsum[ib]/(grp->qsum[ia]+grp->qsum[ib]+grp->qsum[ic]); -+ grp->fc = grp->qsum[ic]/(grp->qsum[ia]+grp->qsum[ib]+grp->qsum[ic]); -+ grp->fa2 = grp->fa*grp->fa; -+ grp->fb2 = grp->fb*grp->fb; -+ grp->fc2 = grp->fc*grp->fc; -+ grp->fab = 2*grp->fa*grp->fb, grp->fac = 2*grp->fa*grp->fc, grp->fbc = 2*grp->fb*grp->fc; -+ } -+ if ( !ia_cov || !ib_cov || !ic_cov ) continue; - int isample, icc = (ic+1)*(ic+2)/2-1; - int iac = iaa - ia + ic, ibc = ibb - ib + ic; - double *pdg = call->pdg; - for (isample=0; isamplegrp[grps->smpl2grp[isample]]; -+ if ( !grp->dp ) continue; - double val = 0; - if ( 
!call->ploidy || call->ploidy[isample]==2 ) -- val = fa2*pdg[iaa] + fb2*pdg[ibb] + fc2*pdg[icc] + fab*pdg[iab] + fac*pdg[iac] + fbc*pdg[ibc]; -+ val = grp->fa2*pdg[iaa] + grp->fb2*pdg[ibb] + grp->fc2*pdg[icc] + grp->fab*pdg[iab] + grp->fac*pdg[iac] + grp->fbc*pdg[ibc]; - else if ( call->ploidy && call->ploidy[isample]==1 ) -- val = fa*pdg[iaa] + fb*pdg[ibb] + fc*pdg[icc]; -+ val = grp->fa*pdg[iaa] + grp->fb*pdg[ibb] + grp->fc*pdg[icc]; - if ( val ) { lk_tot += log(val); lk_tot_set = 1; } - pdg += ngts; - } -@@ -788,12 +855,13 @@ - gts[1] = ploidy==2 ? bcf_gt_unphased(0) : bcf_int32_vector_end; - - // Non-zero depth, determine the most likely genotype -+ grp1_t *grp = &call->smpl_grp.grp[call->smpl_grp.smpl2grp[isample]]; - double best_lk = 0; - for (ia=0; iaqsum[ia]*call->qsum[ia] : pdg[iaa]*call->qsum[ia]; -+ double lk = ploidy==2 ? pdg[iaa]*grp->qsum[ia]*grp->qsum[ia] : pdg[iaa]*grp->qsum[ia]; - #if USE_PRIOR_FOR_GTS - if ( ia!=0 ) lk *= prior; - #endif -@@ -816,7 +884,7 @@ - { - if ( !(out_als & 1<qsum[ia]*call->qsum[ib]; -+ double lk = 2*pdg[iab]*grp->qsum[ia]*grp->qsum[ib]; - #if USE_PRIOR_FOR_GTS - if ( ia!=0 ) lk *= prior; - if ( ib!=0 ) lk *= prior; -@@ -940,6 +1008,7 @@ - - for (i=0; ismpl_grp.grp[call->smpl_grp.smpl2grp[isample]]; - double sum_lk = 0; - double best_lk = 0; - for (ia=0; iaals_map[ia],call->als_map[ia]); -- double lk = ploidy==2 ? pdg[iaa]*call->qsum[ia]*call->qsum[ia] : pdg[iaa]*call->qsum[ia]; -+ double lk = ploidy==2 ? 
pdg[iaa]*grp->qsum[ia]*grp->qsum[ia] : pdg[iaa]*grp->qsum[ia]; - sum_lk += lk; - gls[idx] = lk; - if ( best_lk < lk ) -@@ -966,7 +1035,7 @@ - if ( !(out_als & 1<als_map[ia],call->als_map[ib]); -- double lk = 2*pdg[iab]*call->qsum[ia]*call->qsum[ib]; -+ double lk = 2*pdg[iab]*grp->qsum[ia]*grp->qsum[ib]; - sum_lk += lk; - gls[idx] = lk; - if ( best_lk < lk ) -@@ -1272,28 +1341,37 @@ - // - static int mcall_constrain_alleles(call_t *call, bcf1_t *rec, int *unseen) - { -- bcf_sr_regions_t *tgt = call->srs->targets; -- if ( tgt->nals>5 ) error("Maximum accepted number of alleles is 5, got %d\n", tgt->nals); -- hts_expand(char*,tgt->nals+1,call->nals,call->als); -+ assert( call->tgt_als->n ); -+ if ( call->tgt_als->n>5 ) error("Maximum accepted number of alleles is 5, got %d\n", call->tgt_als->n); -+ hts_expand(char*,call->tgt_als->n+1,call->nals,call->als); - - int has_new = 0; - - int i, j, nals = 1; - for (i=1; inals_map; i++) call->als_map[i] = -1; - -- if ( vcmp_set_ref(call->vcmp, rec->d.allele[0], tgt->als[0]) < 0 ) -- error("The reference alleles are not compatible at %s:%d .. %s vs %s\n", call->hdr->id[BCF_DT_CTG][rec->rid].key,rec->pos+1,tgt->als[0],rec->d.allele[0]); -+ if ( vcmp_set_ref(call->vcmp, rec->d.allele[0], call->tgt_als->allele[0]) < 0 ) -+ error("The reference alleles are not compatible at %s:%d .. %s vs %s\n", call->hdr->id[BCF_DT_CTG][rec->rid].key,rec->pos+1,call->tgt_als->allele[0],rec->d.allele[0]); - - // create mapping from new to old alleles -- call->als[0] = tgt->als[0]; -+ call->als[0] = call->tgt_als->allele[0]; - call->als_map[0] = 0; - -- for (i=1; inals; i++) -+ for (i=1; itgt_als->n; i++) - { -- call->als[nals] = tgt->als[i]; -- j = vcmp_find_allele(call->vcmp, rec->d.allele+1, rec->n_allele - 1, tgt->als[i]); -+ call->als[nals] = call->tgt_als->allele[i]; -+ j = vcmp_find_allele(call->vcmp, rec->d.allele+1, rec->n_allele - 1, call->tgt_als->allele[i]); - -- if ( j+1==*unseen ) { fprintf(stderr,"fixme? 
Cannot constrain to %s\n",tgt->als[i]); return -1; } -+ if ( j+1==*unseen ) -+ { -+ fprintf(stderr,"Fixme? Cannot constrain to %d-th allele (%s). VCF=",i,call->tgt_als->allele[i]); -+ int k; -+ for (k=0; kn_allele; k++) fprintf(stderr,"%s%s",k==0?"":",",rec->d.allele[k]); -+ fprintf(stderr,"\tTAB="); -+ for (k=0; ktgt_als->n; k++) fprintf(stderr,"%s%s",k==0?"":",",call->tgt_als->allele[k]); -+ fprintf(stderr,"\n"); -+ return -1; -+ } - - if ( j>=0 ) - { -@@ -1364,11 +1442,51 @@ - bcf_update_format_int32(call->hdr, rec, "PL", call->itmp, npls_new*nsmpl); - - // update QS -- float qsum[5]; -- int nqs = bcf_get_info_float(call->hdr, rec, "QS", &call->qsum, &call->nqsum); -+ int nqs = bcf_get_info_float(call->hdr, rec, "QS", &call->smpl_grp.grp[0].qsum, &call->smpl_grp.grp[0].nqsum); -+ hts_expand(float,nals,call->nqsum,call->qsum); - for (i=0; ials_map[i]qsum[call->als_map[i]] : 0; -- bcf_update_info_float(call->hdr, rec, "QS", qsum, nals); -+ call->qsum[i] = call->als_map[i]smpl_grp.grp[0].qsum[call->als_map[i]] : 0; -+ bcf_update_info_float(call->hdr, rec, "QS", call->qsum, nals); -+ -+ // update any Number=R tags -+ void *tmp_ori = call->itmp, *tmp_new = call->PLs; // reusing PLs storage which is not used at this point -+ int ntmp_ori = call->n_itmp, ntmp_new = call->mPLs; -+ for (i=0; in_fmt; i++) -+ { -+ bcf_fmt_t *fmt = &rec->d.fmt[i]; -+ int vlen = bcf_hdr_id2length(call->hdr,BCF_HL_FMT,fmt->id); -+ if ( vlen!=BCF_VL_R ) continue; // not a Number=R tag -+ -+ // NB:works only for BCF_HT_INT and BCF_HT_REAL -+ int type = bcf_hdr_id2type(call->hdr,BCF_HL_FMT,fmt->id); -+ assert( type==BCF_HT_INT || type==BCF_HT_REAL ); -+ assert( sizeof(float)==sizeof(int32_t) ); -+ -+ const char *key = bcf_hdr_int2id(call->hdr,BCF_DT_ID,fmt->id); -+ int nret = bcf_get_format_values(call->hdr, rec, key, &tmp_ori, &ntmp_ori, type); -+ if (nret<=0) continue; -+ int nsmpl = bcf_hdr_nsamples(call->hdr); -+ int size1 = sizeof(float); -+ hts_expand(float, nsmpl * nals, ntmp_new, 
tmp_new); -+ for (j=0; jn; -+ uint8_t *ptr_new = (uint8_t *) tmp_new + j*nals*size1; -+ for (k=0; kals_map[k]; -+ memcpy(dst,src,size1); -+ } -+ } -+ nret = bcf_update_format(call->hdr, rec, key, tmp_new, nsmpl*nals, type); -+ assert( nret==0 ); -+ } -+ call->PLs = (int32_t*) tmp_new; -+ call->mPLs = ntmp_new; -+ call->itmp = (int32_t*) tmp_ori; -+ call->n_itmp = ntmp_ori; -+ - - if ( *unseen ) *unseen = nals-1; - return 0; -@@ -1383,7 +1501,7 @@ - */ - int mcall(call_t *call, bcf1_t *rec) - { -- int i, unseen = call->unseen; -+ int i,j, unseen = call->unseen; - - // Force alleles when calling genotypes given alleles was requested - if ( call->flag & CALL_CONSTR_ALLELES && mcall_constrain_alleles(call, rec, &unseen)!=0 ) return -2; -@@ -1404,61 +1522,83 @@ - hts_expand(double, call->nPLs, call->npdg, call->pdg); - set_pdg(call->pl2p, call->PLs, call->pdg, nsmpl, ngts, unseen); - -- #if QS_FROM_PDG -- estimate_qsum(call, rec); -- #else -- // Get sum of qualities, serves as an AF estimate, f_x = QS/N in Eq. 1 in call-m math notes. -- int nqs = bcf_get_info_float(call->hdr, rec, "QS", &call->qsum, &call->nqsum); -+ // Get sum of qualities, serves as an AF estimate, f_x = QS/N in Eq. 1 in call-m math notes. -+ if ( call->smpl_grp.ngrp == 1 ) -+ { -+ int nqs = bcf_get_info_float(call->hdr, rec, "QS", &call->smpl_grp.grp[0].qsum, &call->smpl_grp.grp[0].nqsum); - if ( nqs<=0 ) error("The QS annotation not present at %s:%d\n", bcf_seqname(call->hdr,rec),rec->pos+1); - if ( nqs < nals ) - { - // Some of the listed alleles do not have the corresponding QS field. This is -- // typically ref-only site with X in ALT. -+ // typically ref-only site with <*> in ALT. 
-+ hts_expand(float,nals,call->smpl_grp.grp[0].nqsum,call->smpl_grp.grp[0].qsum); -+ for (i=nqs; ismpl_grp.grp[0].qsum[i] = 0; -+ } -+ } -+ else -+ { -+ for (j=0; jsmpl_grp.ngrp; j++) -+ { -+ hts_expand(float,nals,call->smpl_grp.grp[j].nqsum,call->smpl_grp.grp[j].qsum); -+ memset(call->smpl_grp.grp[j].qsum, 0, sizeof(float)*nals); -+ } - -- hts_expand(float,nals,call->nqsum,call->qsum); -- for (i=nqs; iqsum[i] = 0; -+ int nad = bcf_get_format_int32(call->hdr, rec, "AD", &call->ADs, &call->nADs); -+ if ( nad<1 ) error("Error: FORMAT/AD is required with the -G option, mpileup must be run with -a AD\n"); -+ nad /= bcf_hdr_nsamples(call->hdr); -+ hts_expand(float,nals,call->nqsum,call->qsum); -+ float qsum = 0; -+ for (i=0; ihdr); i++) -+ { -+ int32_t *ptr = call->ADs + i*nad; -+ for (j=0; jqsum[j] = 0; -+ else { call->qsum[j] = ptr[j]; qsum += ptr[j]; } -+ } -+ for (; jqsum[j] = 0; -+ if ( qsum ) -+ for (j=0; jqsum[j] /= qsum; -+ -+ grp1_t *grp = &call->smpl_grp.grp[call->smpl_grp.smpl2grp[i]]; -+ for (j=0; jqsum[j] += call->qsum[j]; - } -+ } - -- // If available, take into account reference panel AFs -- if ( call->prior_AN && bcf_get_info_int32(call->hdr, rec, call->prior_AN ,&call->ac, &call->nac)==1 ) -+ // If available, take into account reference panel AFs -+ if ( call->prior_AN && bcf_get_info_int32(call->hdr, rec, call->prior_AN ,&call->ac, &call->nac)==1 ) -+ { -+ int an = call->ac[0]; -+ if ( bcf_get_info_int32(call->hdr, rec, call->prior_AC ,&call->ac, &call->nac)==nals-1 ) - { -- int an = call->ac[0]; -- if ( bcf_get_info_int32(call->hdr, rec, call->prior_AC ,&call->ac, &call->nac)==nals-1 ) -+ int ac0 = an; // number of alleles in the reference population -+ for (i=0; iac[i]==bcf_int32_vector_end ) break; -- if ( call->ac[i]==bcf_int32_missing ) continue; -- ac0 -= call->ac[i]; -- call->qsum[i+1] += call->ac[i]*0.5; -- } -- if ( ac0<0 ) error("Incorrect %s,%s values at %s:%d\n", call->prior_AN,call->prior_AC,bcf_seqname(call->hdr,rec),rec->pos+1); -- 
call->qsum[0] += ac0*0.5; -- for (i=0; iqsum[i] /= nsmpl + 0.5*an; -+ if ( call->ac[i]==bcf_int32_vector_end ) break; -+ if ( call->ac[i]==bcf_int32_missing ) continue; -+ ac0 -= call->ac[i]; -+ for (j=0; jsmpl_grp.ngrp; j++) -+ call->smpl_grp.grp[j].qsum[i+1] += call->ac[i]*0.5; -+ } -+ if ( ac0<0 ) error("Incorrect %s,%s values at %s:%d\n", call->prior_AN,call->prior_AC,bcf_seqname(call->hdr,rec),rec->pos+1); -+ for (j=0; jsmpl_grp.ngrp; j++) -+ call->smpl_grp.grp[j].qsum[0] += ac0*0.5; -+ for (i=0; ismpl_grp.ngrp; j++) -+ call->smpl_grp.grp[j].qsum[i] /= nsmpl + 0.5*an; - } - } -+ } - -+ for (j=0; jsmpl_grp.ngrp; j++) -+ { - float qsum_tot = 0; -- for (i=0; iqsum[i]; -- -- // Is this still necessary?? -- // -- // if (0&& !call->qsum[0] ) -- // { -- // // As P(RR)!=0 even for QS(ref)=0, we set QS(ref) to a small value, -- // // an equivalent of a single reference read. -- // if ( bcf_get_info_int32(call->hdr, rec, "DP", &call->itmp, &call->n_itmp)!=1 ) -- // error("Could not read DP at %s:%d\n", call->hdr->id[BCF_DT_CTG][rec->rid].key,rec->pos+1); -- // if ( call->itmp[0] ) -- // { -- // call->qsum[0] = 1.0 / call->itmp[0] / nsmpl; -- // qsum_tot += call->qsum[0]; -- // } -- // } -- -- if ( qsum_tot ) for (i=0; iqsum[i] /= qsum_tot; -- #endif -+ for (i=0; ismpl_grp.grp[j].qsum[i]; -+ if ( qsum_tot ) for (i=0; ismpl_grp.grp[j].qsum[i] /= qsum_tot; -+ } - - bcf_update_info_int32(call->hdr, rec, "QS", NULL, 0); // remove QS tag - -@@ -1466,7 +1606,7 @@ - int out_als, nout; - if ( nals > 8*sizeof(out_als) ) - { -- fprintf(stderr,"Too many alleles at %s:%d, skipping.\n", bcf_seqname(call->hdr,rec),rec->pos+1); -+ fprintf(stderr,"Too many alleles at %s:%"PRId64", skipping.\n", bcf_seqname(call->hdr,rec),(int64_t) rec->pos+1); - return 0; - } - nout = mcall_find_best_alleles(call, nals, &out_als); -@@ -1510,7 +1650,7 @@ - { - if ( nout>4 ) - { -- fprintf(stderr,"Too many alleles at %s:%d, skipping.\n", bcf_seqname(call->hdr,rec),rec->pos+1); -+ fprintf(stderr,"Too many 
alleles at %s:%"PRId64", skipping.\n", bcf_seqname(call->hdr,rec),(int64_t) rec->pos+1); - return 0; - } - mcall_call_trio_genotypes(call, rec, nals,nout,out_als); ---- python-pysam.orig/bcftools/mcall.c.pysam.c -+++ python-pysam/bcftools/mcall.c.pysam.c -@@ -25,7 +25,9 @@ - THE SOFTWARE. */ - - #include -+#include - #include -+#include - #include "call.h" - - // Using priors for GTs does not seem to be mathematically justified. Although -@@ -38,9 +40,6 @@ - // genotypes is reported instead. - #define FLAT_PDG_FOR_MISSING 0 - --// Estimate QS (combined quality and allele frequencies) from PLs --#define QS_FROM_PDG 0 -- - - void qcall_init(call_t *call) { return; } - void qcall_destroy(call_t *call) { return; } -@@ -246,12 +245,84 @@ - free(call->trio[j][i]); - } - -+static void init_sample_groups(call_t *call) -+{ -+ int i, nsmpl = bcf_hdr_nsamples(call->hdr); -+ if ( !call->sample_groups ) -+ { -+ // standard pooled calling, all samples in the same group -+ grp_t *grps = &call->smpl_grp; -+ grps->ngrp = 1; -+ grps->grp = (grp1_t*)calloc(grps->ngrp, sizeof(grp1_t)); -+ grps->smpl2grp = (int*)calloc(nsmpl,sizeof(int)); -+ } -+ else if ( !strcmp("-",call->sample_groups) ) -+ { -+ // single-sample calling, each sample creates its own group -+ grp_t *grps = &call->smpl_grp; -+ grps->ngrp = nsmpl; -+ grps->grp = (grp1_t*)calloc(grps->ngrp, sizeof(grp1_t)); -+ grps->smpl2grp = (int*)malloc(nsmpl*sizeof(int)); -+ for (i=0; ismpl2grp[i] = i; -+ } -+ else -+ { -+ int nlines; -+ char **lines = hts_readlist(call->sample_groups, 1, &nlines); -+ if ( !lines ) error("Could not read the file: %s\n", call->sample_groups); -+ -+ uint32_t *smpl2grp1 = (uint32_t*)calloc(nsmpl,sizeof(uint32_t)); -+ void *grp2idx = khash_str2int_init(); -+ -+ grp_t *grps = &call->smpl_grp; -+ for (i=0; isample_groups,lines[i]); -+ *ptr = 0; -+ int ismpl = bcf_hdr_id2int(call->hdr, BCF_DT_SAMPLE, lines[i]); -+ if ( ismpl<0 ) continue; -+ if ( smpl2grp1[ismpl] ) error("Error: the sample \"%s\" is listed 
twice in %s\n", lines[i],call->sample_groups); -+ if ( !khash_str2int_has_key(grp2idx,ptr+1) ) -+ { -+ khash_str2int_inc(grp2idx, ptr+1); -+ grps->ngrp++; -+ } -+ int igrp; -+ if ( khash_str2int_get(grp2idx, ptr+1, &igrp)==0 ) -+ smpl2grp1[ismpl] = igrp+1; -+ else -+ error("This should not happen, fixme: %s\n",ptr+1); -+ } -+ khash_str2int_destroy(grp2idx); -+ -+ grps->grp = (grp1_t*)calloc(grps->ngrp, sizeof(grp1_t)); -+ grps->smpl2grp = (int*)malloc(nsmpl*sizeof(int)); -+ for (i=0; ihdr->samples[i],call->sample_groups); -+ grps->smpl2grp[i] = smpl2grp1[i] - 1; -+ } -+ free(smpl2grp1); -+ for (i=0; ismpl_grp; -+ for (i=0; ingrp; i++) -+ free(grps->grp[i].qsum); -+ free(grps->grp); -+ free(grps->smpl2grp); -+} -+ - void mcall_init(call_t *call) - { - call_init_pl2p(call); - -- call->nqsum = 5; -- call->qsum = (float*) malloc(sizeof(float)*call->nqsum); // will be expanded later if ncessary - call->nals_map = 5; - call->als_map = (int*) malloc(sizeof(int)*call->nals_map); - call->npl_map = 5*(5+1)/2; // will be expanded later if necessary -@@ -304,26 +375,28 @@ - call->theta = log(call->theta); - } - -- return; -+ init_sample_groups(call); - } - - void mcall_destroy(call_t *call) - { -+ destroy_sample_groups(call); - if (call->vcmp) vcmp_destroy(call->vcmp); - free(call->itmp); - mcall_destroy_trios(call); - free(call->GPs); -+ free(call->ADs); - free(call->GLs); - free(call->GQs); - free(call->anno16); - free(call->PLs); -- free(call->qsum); - free(call->als_map); - free(call->pl_map); - free(call->gts); free(call->cgts); free(call->ugts); - free(call->pdg); - free(call->als); - free(call->ac); -+ free(call->qsum); - return; - } - -@@ -433,40 +506,6 @@ - } - } - --/* -- Allele frequency estimated as: -- #A = \sum_i (2*P_AA + P_AB) -- F_A = #A / ( #A + #B ) -- where i runs across all samples --*/ --void estimate_qsum(call_t *call, bcf1_t *rec) --{ -- double *pdg = call->pdg; -- int ngts = rec->n_allele*(rec->n_allele+1)/2; -- int i,nsmpl = 
bcf_hdr_nsamples(call->hdr); -- -- hts_expand(float,rec->n_allele,call->nqsum,call->qsum); -- for (i=0; in_allele; i++) call->qsum[i] = 0; -- -- for (i=0; in_allele; a++) -- { -- for (b=0; b<=a; b++) -- { -- call->qsum[a] += pdg[k]; -- call->qsum[b] += pdg[k]; -- k++; -- } -- } -- pdg += ngts; -- } -- float sum = 0; -- for (i=0; in_allele; i++) sum += call->qsum[i]; -- if ( sum ) for (i=0; in_allele; i++) call->qsum[i] /= sum; --} -- - // Create mapping between old and new (trimmed) alleles - void init_allele_trimming_maps(call_t *call, int als, int nals) - { -@@ -583,6 +622,7 @@ - // at most tri-allelic sites are considered. Returns the number of alleles. - static int mcall_find_best_alleles(call_t *call, int nals, int *out_als) - { -+ int j; - int ia,ib,ic; // iterators over up to three alleles - int max_als=0; // most likely combination of alleles - double ref_lk = 0, max_lk = -HUGE_VAL; // likelihood of the reference and of most likely combination of alleles -@@ -608,32 +648,46 @@ - UPDATE_MAX_LKs(1<0 && lk_tot_set); - } - -+ grp_t *grps = &call->smpl_grp; -+ - // Two alleles - if ( nals>1 ) - { - for (ia=0; iaqsum[ia]==0 ) continue; -+ if ( grps->ngrp==1 && grps->grp[0].qsum[ia]==0 ) continue; - int iaa = (ia+1)*(ia+2)/2-1; - for (ib=0; ibqsum[ib]==0 ) continue; -+ if ( grps->ngrp==1 && grps->grp[0].qsum[ib]==0 ) continue; - double lk_tot = 0; - int lk_tot_set = 0; -- double fa = call->qsum[ia]/(call->qsum[ia]+call->qsum[ib]); -- double fb = call->qsum[ib]/(call->qsum[ia]+call->qsum[ib]); -- double fa2 = fa*fa; -- double fb2 = fb*fb; -- double fab = 2*fa*fb; -+ int ia_cov = 0, ib_cov = 0; -+ for (j=0; jngrp; j++) -+ { -+ grp1_t *grp = &grps->grp[j]; -+ if ( grp->qsum[ia] ) ia_cov = 1; -+ if ( grp->qsum[ib] ) ib_cov = 1; -+ if ( !grp->qsum[ia] && !grp->qsum[ib] ) { grp->dp = 0; continue; } -+ grp->dp = 1; -+ grp->fa = grp->qsum[ia]/(grp->qsum[ia]+grp->qsum[ib]); -+ grp->fb = grp->qsum[ib]/(grp->qsum[ia]+grp->qsum[ib]); -+ grp->fa2 = grp->fa*grp->fa; -+ grp->fb2 
= grp->fb*grp->fb; -+ grp->fab = 2*grp->fa*grp->fb; -+ } -+ if ( !ia_cov || !ib_cov ) continue; - int isample, ibb = (ib+1)*(ib+2)/2-1, iab = iaa - ia + ib; - double *pdg = call->pdg; - for (isample=0; isamplegrp[grps->smpl2grp[isample]]; -+ if ( !grp->dp ) continue; - double val = 0; - if ( !call->ploidy || call->ploidy[isample]==2 ) -- val = fa2*pdg[iaa] + fb2*pdg[ibb] + fab*pdg[iab]; -+ val = grp->fa2*pdg[iaa] + grp->fb2*pdg[ibb] + grp->fab*pdg[iab]; - else if ( call->ploidy && call->ploidy[isample]==1 ) -- val = fa*pdg[iaa] + fb*pdg[ibb]; -+ val = grp->fa*pdg[iaa] + grp->fb*pdg[ibb]; - if ( val ) { lk_tot += log(val); lk_tot_set = 1; } - pdg += ngts; - } -@@ -649,35 +703,48 @@ - { - for (ia=0; iaqsum[ia]==0 ) continue; -+ if ( grps->ngrp==1 && grps->grp[0].qsum[ia]==0 ) continue; - int iaa = (ia+1)*(ia+2)/2-1; - for (ib=0; ibqsum[ib]==0 ) continue; -+ if ( grps->ngrp==1 && grps->grp[0].qsum[ib]==0 ) continue; - int ibb = (ib+1)*(ib+2)/2-1; - int iab = iaa - ia + ib; - for (ic=0; icqsum[ic]==0 ) continue; -+ if ( grps->ngrp==1 && grps->grp[0].qsum[ic]==0 ) continue; - double lk_tot = 0; - int lk_tot_set = 1; -- double fa = call->qsum[ia]/(call->qsum[ia]+call->qsum[ib]+call->qsum[ic]); -- double fb = call->qsum[ib]/(call->qsum[ia]+call->qsum[ib]+call->qsum[ic]); -- double fc = call->qsum[ic]/(call->qsum[ia]+call->qsum[ib]+call->qsum[ic]); -- double fa2 = fa*fa; -- double fb2 = fb*fb; -- double fc2 = fc*fc; -- double fab = 2*fa*fb, fac = 2*fa*fc, fbc = 2*fb*fc; -+ int ia_cov = 0, ib_cov = 0, ic_cov = 0; -+ for (j=0; jngrp; j++) -+ { -+ grp1_t *grp = &grps->grp[j]; -+ if ( grp->qsum[ia] ) ia_cov = 1; -+ if ( grp->qsum[ib] ) ib_cov = 1; -+ if ( grp->qsum[ic] ) ic_cov = 1; -+ if ( !grp->qsum[ia] && !grp->qsum[ib] && !grp->qsum[ic] ) { grp->dp = 0; continue; } -+ grp->dp = 1; -+ grp->fa = grp->qsum[ia]/(grp->qsum[ia]+grp->qsum[ib]+grp->qsum[ic]); -+ grp->fb = grp->qsum[ib]/(grp->qsum[ia]+grp->qsum[ib]+grp->qsum[ic]); -+ grp->fc = 
grp->qsum[ic]/(grp->qsum[ia]+grp->qsum[ib]+grp->qsum[ic]); -+ grp->fa2 = grp->fa*grp->fa; -+ grp->fb2 = grp->fb*grp->fb; -+ grp->fc2 = grp->fc*grp->fc; -+ grp->fab = 2*grp->fa*grp->fb, grp->fac = 2*grp->fa*grp->fc, grp->fbc = 2*grp->fb*grp->fc; -+ } -+ if ( !ia_cov || !ib_cov || !ic_cov ) continue; - int isample, icc = (ic+1)*(ic+2)/2-1; - int iac = iaa - ia + ic, ibc = ibb - ib + ic; - double *pdg = call->pdg; - for (isample=0; isamplegrp[grps->smpl2grp[isample]]; -+ if ( !grp->dp ) continue; - double val = 0; - if ( !call->ploidy || call->ploidy[isample]==2 ) -- val = fa2*pdg[iaa] + fb2*pdg[ibb] + fc2*pdg[icc] + fab*pdg[iab] + fac*pdg[iac] + fbc*pdg[ibc]; -+ val = grp->fa2*pdg[iaa] + grp->fb2*pdg[ibb] + grp->fc2*pdg[icc] + grp->fab*pdg[iab] + grp->fac*pdg[iac] + grp->fbc*pdg[ibc]; - else if ( call->ploidy && call->ploidy[isample]==1 ) -- val = fa*pdg[iaa] + fb*pdg[ibb] + fc*pdg[icc]; -+ val = grp->fa*pdg[iaa] + grp->fb*pdg[ibb] + grp->fc*pdg[icc]; - if ( val ) { lk_tot += log(val); lk_tot_set = 1; } - pdg += ngts; - } -@@ -790,12 +857,13 @@ - gts[1] = ploidy==2 ? bcf_gt_unphased(0) : bcf_int32_vector_end; - - // Non-zero depth, determine the most likely genotype -+ grp1_t *grp = &call->smpl_grp.grp[call->smpl_grp.smpl2grp[isample]]; - double best_lk = 0; - for (ia=0; iaqsum[ia]*call->qsum[ia] : pdg[iaa]*call->qsum[ia]; -+ double lk = ploidy==2 ? pdg[iaa]*grp->qsum[ia]*grp->qsum[ia] : pdg[iaa]*grp->qsum[ia]; - #if USE_PRIOR_FOR_GTS - if ( ia!=0 ) lk *= prior; - #endif -@@ -818,7 +886,7 @@ - { - if ( !(out_als & 1<qsum[ia]*call->qsum[ib]; -+ double lk = 2*pdg[iab]*grp->qsum[ia]*grp->qsum[ib]; - #if USE_PRIOR_FOR_GTS - if ( ia!=0 ) lk *= prior; - if ( ib!=0 ) lk *= prior; -@@ -942,6 +1010,7 @@ - - for (i=0; ismpl_grp.grp[call->smpl_grp.smpl2grp[isample]]; - double sum_lk = 0; - double best_lk = 0; - for (ia=0; iaals_map[ia],call->als_map[ia]); -- double lk = ploidy==2 ? pdg[iaa]*call->qsum[ia]*call->qsum[ia] : pdg[iaa]*call->qsum[ia]; -+ double lk = ploidy==2 ? 
pdg[iaa]*grp->qsum[ia]*grp->qsum[ia] : pdg[iaa]*grp->qsum[ia]; - sum_lk += lk; - gls[idx] = lk; - if ( best_lk < lk ) -@@ -968,7 +1037,7 @@ - if ( !(out_als & 1<als_map[ia],call->als_map[ib]); -- double lk = 2*pdg[iab]*call->qsum[ia]*call->qsum[ib]; -+ double lk = 2*pdg[iab]*grp->qsum[ia]*grp->qsum[ib]; - sum_lk += lk; - gls[idx] = lk; - if ( best_lk < lk ) -@@ -1274,28 +1343,37 @@ - // - static int mcall_constrain_alleles(call_t *call, bcf1_t *rec, int *unseen) - { -- bcf_sr_regions_t *tgt = call->srs->targets; -- if ( tgt->nals>5 ) error("Maximum accepted number of alleles is 5, got %d\n", tgt->nals); -- hts_expand(char*,tgt->nals+1,call->nals,call->als); -+ assert( call->tgt_als->n ); -+ if ( call->tgt_als->n>5 ) error("Maximum accepted number of alleles is 5, got %d\n", call->tgt_als->n); -+ hts_expand(char*,call->tgt_als->n+1,call->nals,call->als); - - int has_new = 0; - - int i, j, nals = 1; - for (i=1; inals_map; i++) call->als_map[i] = -1; - -- if ( vcmp_set_ref(call->vcmp, rec->d.allele[0], tgt->als[0]) < 0 ) -- error("The reference alleles are not compatible at %s:%d .. %s vs %s\n", call->hdr->id[BCF_DT_CTG][rec->rid].key,rec->pos+1,tgt->als[0],rec->d.allele[0]); -+ if ( vcmp_set_ref(call->vcmp, rec->d.allele[0], call->tgt_als->allele[0]) < 0 ) -+ error("The reference alleles are not compatible at %s:%d .. %s vs %s\n", call->hdr->id[BCF_DT_CTG][rec->rid].key,rec->pos+1,call->tgt_als->allele[0],rec->d.allele[0]); - - // create mapping from new to old alleles -- call->als[0] = tgt->als[0]; -+ call->als[0] = call->tgt_als->allele[0]; - call->als_map[0] = 0; - -- for (i=1; inals; i++) -+ for (i=1; itgt_als->n; i++) - { -- call->als[nals] = tgt->als[i]; -- j = vcmp_find_allele(call->vcmp, rec->d.allele+1, rec->n_allele - 1, tgt->als[i]); -+ call->als[nals] = call->tgt_als->allele[i]; -+ j = vcmp_find_allele(call->vcmp, rec->d.allele+1, rec->n_allele - 1, call->tgt_als->allele[i]); - -- if ( j+1==*unseen ) { fprintf(bcftools_stderr,"fixme? 
Cannot constrain to %s\n",tgt->als[i]); return -1; } -+ if ( j+1==*unseen ) -+ { -+ fprintf(bcftools_stderr,"Fixme? Cannot constrain to %d-th allele (%s). VCF=",i,call->tgt_als->allele[i]); -+ int k; -+ for (k=0; kn_allele; k++) fprintf(bcftools_stderr,"%s%s",k==0?"":",",rec->d.allele[k]); -+ fprintf(bcftools_stderr,"\tTAB="); -+ for (k=0; ktgt_als->n; k++) fprintf(bcftools_stderr,"%s%s",k==0?"":",",call->tgt_als->allele[k]); -+ fprintf(bcftools_stderr,"\n"); -+ return -1; -+ } - - if ( j>=0 ) - { -@@ -1366,11 +1444,51 @@ - bcf_update_format_int32(call->hdr, rec, "PL", call->itmp, npls_new*nsmpl); - - // update QS -- float qsum[5]; -- int nqs = bcf_get_info_float(call->hdr, rec, "QS", &call->qsum, &call->nqsum); -+ int nqs = bcf_get_info_float(call->hdr, rec, "QS", &call->smpl_grp.grp[0].qsum, &call->smpl_grp.grp[0].nqsum); -+ hts_expand(float,nals,call->nqsum,call->qsum); - for (i=0; ials_map[i]qsum[call->als_map[i]] : 0; -- bcf_update_info_float(call->hdr, rec, "QS", qsum, nals); -+ call->qsum[i] = call->als_map[i]smpl_grp.grp[0].qsum[call->als_map[i]] : 0; -+ bcf_update_info_float(call->hdr, rec, "QS", call->qsum, nals); -+ -+ // update any Number=R tags -+ void *tmp_ori = call->itmp, *tmp_new = call->PLs; // reusing PLs storage which is not used at this point -+ int ntmp_ori = call->n_itmp, ntmp_new = call->mPLs; -+ for (i=0; in_fmt; i++) -+ { -+ bcf_fmt_t *fmt = &rec->d.fmt[i]; -+ int vlen = bcf_hdr_id2length(call->hdr,BCF_HL_FMT,fmt->id); -+ if ( vlen!=BCF_VL_R ) continue; // not a Number=R tag -+ -+ // NB:works only for BCF_HT_INT and BCF_HT_REAL -+ int type = bcf_hdr_id2type(call->hdr,BCF_HL_FMT,fmt->id); -+ assert( type==BCF_HT_INT || type==BCF_HT_REAL ); -+ assert( sizeof(float)==sizeof(int32_t) ); -+ -+ const char *key = bcf_hdr_int2id(call->hdr,BCF_DT_ID,fmt->id); -+ int nret = bcf_get_format_values(call->hdr, rec, key, &tmp_ori, &ntmp_ori, type); -+ if (nret<=0) continue; -+ int nsmpl = bcf_hdr_nsamples(call->hdr); -+ int size1 = sizeof(float); -+ 
hts_expand(float, nsmpl * nals, ntmp_new, tmp_new); -+ for (j=0; jn; -+ uint8_t *ptr_new = (uint8_t *) tmp_new + j*nals*size1; -+ for (k=0; kals_map[k]; -+ memcpy(dst,src,size1); -+ } -+ } -+ nret = bcf_update_format(call->hdr, rec, key, tmp_new, nsmpl*nals, type); -+ assert( nret==0 ); -+ } -+ call->PLs = (int32_t*) tmp_new; -+ call->mPLs = ntmp_new; -+ call->itmp = (int32_t*) tmp_ori; -+ call->n_itmp = ntmp_ori; -+ - - if ( *unseen ) *unseen = nals-1; - return 0; -@@ -1385,7 +1503,7 @@ - */ - int mcall(call_t *call, bcf1_t *rec) - { -- int i, unseen = call->unseen; -+ int i,j, unseen = call->unseen; - - // Force alleles when calling genotypes given alleles was requested - if ( call->flag & CALL_CONSTR_ALLELES && mcall_constrain_alleles(call, rec, &unseen)!=0 ) return -2; -@@ -1406,61 +1524,83 @@ - hts_expand(double, call->nPLs, call->npdg, call->pdg); - set_pdg(call->pl2p, call->PLs, call->pdg, nsmpl, ngts, unseen); - -- #if QS_FROM_PDG -- estimate_qsum(call, rec); -- #else -- // Get sum of qualities, serves as an AF estimate, f_x = QS/N in Eq. 1 in call-m math notes. -- int nqs = bcf_get_info_float(call->hdr, rec, "QS", &call->qsum, &call->nqsum); -+ // Get sum of qualities, serves as an AF estimate, f_x = QS/N in Eq. 1 in call-m math notes. -+ if ( call->smpl_grp.ngrp == 1 ) -+ { -+ int nqs = bcf_get_info_float(call->hdr, rec, "QS", &call->smpl_grp.grp[0].qsum, &call->smpl_grp.grp[0].nqsum); - if ( nqs<=0 ) error("The QS annotation not present at %s:%d\n", bcf_seqname(call->hdr,rec),rec->pos+1); - if ( nqs < nals ) - { - // Some of the listed alleles do not have the corresponding QS field. This is -- // typically ref-only site with X in ALT. -+ // typically ref-only site with <*> in ALT. 
-+ hts_expand(float,nals,call->smpl_grp.grp[0].nqsum,call->smpl_grp.grp[0].qsum); -+ for (i=nqs; ismpl_grp.grp[0].qsum[i] = 0; -+ } -+ } -+ else -+ { -+ for (j=0; jsmpl_grp.ngrp; j++) -+ { -+ hts_expand(float,nals,call->smpl_grp.grp[j].nqsum,call->smpl_grp.grp[j].qsum); -+ memset(call->smpl_grp.grp[j].qsum, 0, sizeof(float)*nals); -+ } - -- hts_expand(float,nals,call->nqsum,call->qsum); -- for (i=nqs; iqsum[i] = 0; -+ int nad = bcf_get_format_int32(call->hdr, rec, "AD", &call->ADs, &call->nADs); -+ if ( nad<1 ) error("Error: FORMAT/AD is required with the -G option, mpileup must be run with -a AD\n"); -+ nad /= bcf_hdr_nsamples(call->hdr); -+ hts_expand(float,nals,call->nqsum,call->qsum); -+ float qsum = 0; -+ for (i=0; ihdr); i++) -+ { -+ int32_t *ptr = call->ADs + i*nad; -+ for (j=0; jqsum[j] = 0; -+ else { call->qsum[j] = ptr[j]; qsum += ptr[j]; } -+ } -+ for (; jqsum[j] = 0; -+ if ( qsum ) -+ for (j=0; jqsum[j] /= qsum; -+ -+ grp1_t *grp = &call->smpl_grp.grp[call->smpl_grp.smpl2grp[i]]; -+ for (j=0; jqsum[j] += call->qsum[j]; - } -+ } - -- // If available, take into account reference panel AFs -- if ( call->prior_AN && bcf_get_info_int32(call->hdr, rec, call->prior_AN ,&call->ac, &call->nac)==1 ) -+ // If available, take into account reference panel AFs -+ if ( call->prior_AN && bcf_get_info_int32(call->hdr, rec, call->prior_AN ,&call->ac, &call->nac)==1 ) -+ { -+ int an = call->ac[0]; -+ if ( bcf_get_info_int32(call->hdr, rec, call->prior_AC ,&call->ac, &call->nac)==nals-1 ) - { -- int an = call->ac[0]; -- if ( bcf_get_info_int32(call->hdr, rec, call->prior_AC ,&call->ac, &call->nac)==nals-1 ) -+ int ac0 = an; // number of alleles in the reference population -+ for (i=0; iac[i]==bcf_int32_vector_end ) break; -- if ( call->ac[i]==bcf_int32_missing ) continue; -- ac0 -= call->ac[i]; -- call->qsum[i+1] += call->ac[i]*0.5; -- } -- if ( ac0<0 ) error("Incorrect %s,%s values at %s:%d\n", call->prior_AN,call->prior_AC,bcf_seqname(call->hdr,rec),rec->pos+1); -- 
call->qsum[0] += ac0*0.5; -- for (i=0; iqsum[i] /= nsmpl + 0.5*an; -+ if ( call->ac[i]==bcf_int32_vector_end ) break; -+ if ( call->ac[i]==bcf_int32_missing ) continue; -+ ac0 -= call->ac[i]; -+ for (j=0; jsmpl_grp.ngrp; j++) -+ call->smpl_grp.grp[j].qsum[i+1] += call->ac[i]*0.5; -+ } -+ if ( ac0<0 ) error("Incorrect %s,%s values at %s:%d\n", call->prior_AN,call->prior_AC,bcf_seqname(call->hdr,rec),rec->pos+1); -+ for (j=0; jsmpl_grp.ngrp; j++) -+ call->smpl_grp.grp[j].qsum[0] += ac0*0.5; -+ for (i=0; ismpl_grp.ngrp; j++) -+ call->smpl_grp.grp[j].qsum[i] /= nsmpl + 0.5*an; - } - } -+ } - -+ for (j=0; jsmpl_grp.ngrp; j++) -+ { - float qsum_tot = 0; -- for (i=0; iqsum[i]; -- -- // Is this still necessary?? -- // -- // if (0&& !call->qsum[0] ) -- // { -- // // As P(RR)!=0 even for QS(ref)=0, we set QS(ref) to a small value, -- // // an equivalent of a single reference read. -- // if ( bcf_get_info_int32(call->hdr, rec, "DP", &call->itmp, &call->n_itmp)!=1 ) -- // error("Could not read DP at %s:%d\n", call->hdr->id[BCF_DT_CTG][rec->rid].key,rec->pos+1); -- // if ( call->itmp[0] ) -- // { -- // call->qsum[0] = 1.0 / call->itmp[0] / nsmpl; -- // qsum_tot += call->qsum[0]; -- // } -- // } -- -- if ( qsum_tot ) for (i=0; iqsum[i] /= qsum_tot; -- #endif -+ for (i=0; ismpl_grp.grp[j].qsum[i]; -+ if ( qsum_tot ) for (i=0; ismpl_grp.grp[j].qsum[i] /= qsum_tot; -+ } - - bcf_update_info_int32(call->hdr, rec, "QS", NULL, 0); // remove QS tag - -@@ -1468,7 +1608,7 @@ - int out_als, nout; - if ( nals > 8*sizeof(out_als) ) - { -- fprintf(bcftools_stderr,"Too many alleles at %s:%d, skipping.\n", bcf_seqname(call->hdr,rec),rec->pos+1); -+ fprintf(bcftools_stderr,"Too many alleles at %s:%"PRId64", skipping.\n", bcf_seqname(call->hdr,rec),(int64_t) rec->pos+1); - return 0; - } - nout = mcall_find_best_alleles(call, nals, &out_als); -@@ -1512,7 +1652,7 @@ - { - if ( nout>4 ) - { -- fprintf(bcftools_stderr,"Too many alleles at %s:%d, skipping.\n", bcf_seqname(call->hdr,rec),rec->pos+1); 
-+ fprintf(bcftools_stderr,"Too many alleles at %s:%"PRId64", skipping.\n", bcf_seqname(call->hdr,rec),(int64_t) rec->pos+1); - return 0; - } - mcall_call_trio_genotypes(call, rec, nals,nout,out_als); ---- python-pysam.orig/bcftools/mpileup.c -+++ python-pysam/bcftools/mpileup.c -@@ -1,6 +1,6 @@ - /* mpileup.c -- mpileup subcommand. Previously bam_plcmd.c from samtools - -- Copyright (C) 2008-2017 Genome Research Ltd. -+ Copyright (C) 2008-2018 Genome Research Ltd. - Portions copyright (C) 2009-2012 Broad Institute. - - Author: Heng Li -@@ -31,6 +31,7 @@ - #include - #include - #include -+#include - #include - #include - #include -@@ -222,8 +223,8 @@ - if (ma->conf->fai && b->core.tid >= 0) { - has_ref = mplp_get_ref(ma, b->core.tid, &ref, &ref_len); - if (has_ref && ref_len <= b->core.pos) { // exclude reads outside of the reference sequence -- fprintf(stderr,"[%s] Skipping because %d is outside of %d [ref:%d]\n", -- __func__, b->core.pos, ref_len, b->core.tid); -+ fprintf(stderr,"[%s] Skipping because %"PRId64" is outside of %d [ref:%d]\n", -+ __func__, (int64_t) b->core.pos, ref_len, b->core.tid); - continue; - } - } else { -@@ -246,13 +247,28 @@ - - // Called once per new bam added to the pileup. - // We cache sample information here so we don't have to keep recomputing this --// on each and every pileup column. -+// on each and every pileup column. If FMT/SCR annotation is requested, a flag -+// is set to indicate the presence of a soft clip. - // - // Cd is an arbitrary block of data we can write into, which ends up in --// the pileup structures. We stash the sample ID there. --static int pileup_constructor(void *data, const bam1_t *b, bam_pileup_cd *cd) { -+// the pileup structures. We stash the sample ID there: -+// has_soft_clip .. cd->i & 1 -+// sample_id .. 
cd->i >> 1 -+static int pileup_constructor(void *data, const bam1_t *b, bam_pileup_cd *cd) -+{ - mplp_aux_t *ma = (mplp_aux_t *)data; -- cd->i = bam_smpl_get_sample_id(ma->conf->bsmpl, ma->bam_id, (bam1_t *)b); -+ cd->i = bam_smpl_get_sample_id(ma->conf->bsmpl, ma->bam_id, (bam1_t *)b) << 1; -+ if ( ma->conf->fmt_flag & (B2B_INFO_SCR|B2B_FMT_SCR) ) -+ { -+ int i; -+ for (i=0; icore.n_cigar; i++) -+ { -+ int cig = bam_get_cigar(b)[i] & BAM_CIGAR_MASK; -+ if ( cig!=BAM_CSOFT_CLIP ) continue; -+ cd->i |= 1; -+ break; -+ } -+ } - return 0; - } - -@@ -265,7 +281,7 @@ - for (j = 0; j < n_plp[i]; ++j) // iterate over all reads available at this position - { - const bam_pileup1_t *p = plp[i] + j; -- int id = p->cd.i; -+ int id = PLP_SAMPLE_ID(p->cd.i); - if (m->n_plp[id] == m->m_plp[id]) - { - m->m_plp[id] = m->m_plp[id]? m->m_plp[id]<<1 : 8; -@@ -280,7 +296,7 @@ - { - if ( !conf->gvcf ) - { -- if ( rec ) bcf_write1(fp, hdr, rec); -+ if ( rec && bcf_write1(fp, hdr, rec)!=0 ) error("[%s] Error: failed to write the record to %s\n", __func__,conf->output_fname?conf->output_fname:"standard output"); - return; - } - -@@ -298,7 +314,7 @@ - if ( rec->d.allele[1][0]=='<' && rec->d.allele[1][1]=='*' && rec->d.allele[1][2]=='>' ) is_ref = 1; - } - rec = gvcf_write(conf->gvcf, fp, hdr, rec, is_ref); -- if ( rec ) bcf_write1(fp,hdr,rec); -+ if ( rec && bcf_write1(fp,hdr,rec)!=0 ) error("[%s] Error: failed to write the record to %s\n", __func__,conf->output_fname?conf->output_fname:"standard output"); - } - - static int mpileup_reg(mplp_conf_t *conf, uint32_t beg, uint32_t end) -@@ -310,7 +326,7 @@ - - while ( (ret=bam_mplp_auto(conf->iter, &tid, &pos, conf->n_plp, conf->plp)) > 0) - { -- if ( end && (posend) ) continue; -+ if ( posend ) continue; - if ( conf->bed && tid >= 0 ) - { - int overlap = regidx_overlap(conf->bed, hdr->target_name[tid], pos, pos, NULL); -@@ -521,11 +537,13 @@ - - bcf_hdr_append(conf->bcf_hdr,"##ALT="); - bcf_hdr_append(conf->bcf_hdr,"##INFO="); -- 
bcf_hdr_append(conf->bcf_hdr,"##INFO="); -- bcf_hdr_append(conf->bcf_hdr,"##INFO="); -+ bcf_hdr_append(conf->bcf_hdr,"##INFO="); -+ bcf_hdr_append(conf->bcf_hdr,"##INFO="); - bcf_hdr_append(conf->bcf_hdr,"##INFO="); -- bcf_hdr_append(conf->bcf_hdr,"##INFO="); -- bcf_hdr_append(conf->bcf_hdr,"##INFO="); -+ if ( conf->fmt_flag&B2B_INFO_VDB ) -+ bcf_hdr_append(conf->bcf_hdr,"##INFO="); -+ if ( conf->fmt_flag&B2B_INFO_RPB ) -+ bcf_hdr_append(conf->bcf_hdr,"##INFO="); - bcf_hdr_append(conf->bcf_hdr,"##INFO="); - bcf_hdr_append(conf->bcf_hdr,"##INFO="); - bcf_hdr_append(conf->bcf_hdr,"##INFO="); -@@ -553,17 +571,21 @@ - if ( conf->fmt_flag&B2B_FMT_SP ) - bcf_hdr_append(conf->bcf_hdr,"##FORMAT="); - if ( conf->fmt_flag&B2B_FMT_AD ) -- bcf_hdr_append(conf->bcf_hdr,"##FORMAT="); -+ bcf_hdr_append(conf->bcf_hdr,"##FORMAT="); - if ( conf->fmt_flag&B2B_FMT_ADF ) -- bcf_hdr_append(conf->bcf_hdr,"##FORMAT="); -+ bcf_hdr_append(conf->bcf_hdr,"##FORMAT="); - if ( conf->fmt_flag&B2B_FMT_ADR ) -- bcf_hdr_append(conf->bcf_hdr,"##FORMAT="); -+ bcf_hdr_append(conf->bcf_hdr,"##FORMAT="); - if ( conf->fmt_flag&B2B_INFO_AD ) -- bcf_hdr_append(conf->bcf_hdr,"##INFO="); -+ bcf_hdr_append(conf->bcf_hdr,"##INFO="); - if ( conf->fmt_flag&B2B_INFO_ADF ) -- bcf_hdr_append(conf->bcf_hdr,"##INFO="); -+ bcf_hdr_append(conf->bcf_hdr,"##INFO="); -+ if ( conf->fmt_flag&B2B_INFO_SCR ) -+ bcf_hdr_append(conf->bcf_hdr,"##INFO="); -+ if ( conf->fmt_flag&B2B_FMT_SCR ) -+ bcf_hdr_append(conf->bcf_hdr,"##FORMAT="); - if ( conf->fmt_flag&B2B_INFO_ADR ) -- bcf_hdr_append(conf->bcf_hdr,"##INFO="); -+ bcf_hdr_append(conf->bcf_hdr,"##INFO="); - if ( conf->gvcf ) - gvcf_update_header(conf->gvcf, conf->bcf_hdr); - -@@ -571,7 +593,7 @@ - const char **smpl = bam_smpl_get_samples(conf->bsmpl, &nsmpl); - for (i=0; ibcf_hdr, smpl[i]); -- bcf_hdr_write(conf->bcf_fp, conf->bcf_hdr); -+ if ( bcf_hdr_write(conf->bcf_fp, conf->bcf_hdr)!=0 ) error("[%s] Error: failed to write the header to 
%s\n",__func__,conf->output_fname?conf->output_fname:"standard output"); - - conf->bca = bcf_call_init(-1., conf->min_baseQ); - conf->bcr = (bcf_callret1_t*) calloc(nsmpl, sizeof(bcf_callret1_t)); -@@ -579,6 +601,7 @@ - conf->bca->min_frac = conf->min_frac; - conf->bca->min_support = conf->min_support; - conf->bca->per_sample_flt = conf->flag & MPLP_PER_SAMPLE; -+ conf->bca->fmt_flag = conf->fmt_flag; - - conf->bc.bcf_hdr = conf->bcf_hdr; - conf->bc.n = nsmpl; -@@ -599,11 +622,14 @@ - conf->bcr[i].ADF = conf->bc.ADF + (i+1)*B2B_MAX_ALLELES; - } - } -+ if ( conf->fmt_flag&(B2B_INFO_SCR|B2B_FMT_SCR) ) -+ conf->bc.SCR = (int32_t*) malloc((nsmpl+1)*sizeof(*conf->bc.SCR)); - } - - // init mpileup - conf->iter = bam_mplp_init(conf->nfiles, mplp_func, (void**)conf->mplp_data); - if ( conf->flag & MPLP_SMART_OVERLAPS ) bam_mplp_init_overlaps(conf->iter); -+ fprintf(stderr, "[%s] maximum number of reads per input file set to -d %d\n", __func__, conf->max_depth); - if ( (double)conf->max_depth * conf->nfiles > 1<<20) - fprintf(stderr, "Warning: Potential memory hog, up to %.0fM reads in the pileup!\n", (double)conf->max_depth*conf->nfiles); - if ( (double)conf->max_depth * conf->nfiles / nsmpl < 250 ) -@@ -623,7 +649,7 @@ - if ( ireg++ > 0 ) - { - conf->buf.l = 0; -- ksprintf(&conf->buf,"%s:%u-%u",conf->reg_itr->seq,conf->reg_itr->beg,conf->reg_itr->end); -+ ksprintf(&conf->buf,"%s:%u-%u",conf->reg_itr->seq,conf->reg_itr->beg+1,conf->reg_itr->end+1); - - for (i=0; infiles; i++) - { -@@ -647,7 +673,7 @@ - while ( regitr_loop(conf->reg_itr) ); - } - else -- mpileup_reg(conf,0,0); -+ mpileup_reg(conf,0,UINT32_MAX); - - flush_bcf_records(conf, conf->bcf_fp, conf->bcf_hdr, NULL); - -@@ -656,13 +682,14 @@ - bcf_destroy1(conf->bcf_rec); - if (conf->bcf_fp) - { -- hts_close(conf->bcf_fp); -+ if ( hts_close(conf->bcf_fp)!=0 ) error("[%s] Error: close failed .. 
%s\n", __func__,conf->output_fname); - bcf_hdr_destroy(conf->bcf_hdr); - bcf_call_destroy(conf->bca); - free(conf->bc.PL); - free(conf->bc.DP4); - free(conf->bc.ADR); - free(conf->bc.ADF); -+ free(conf->bc.SCR); - free(conf->bc.fmt_arr); - free(conf->bcr); - } -@@ -738,7 +765,7 @@ - files = (char**) realloc(files,nfiles*sizeof(char*)); - files[nfiles-1] = strdup(buf); - } -- fclose(fh); -+ if ( fclose(fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,file_list); - if ( !nfiles ) - { - fprintf(stderr,"No files read from %s\n", file_list); -@@ -765,6 +792,8 @@ - else if ( !strcasecmp(tags[i],"AD") || !strcasecmp(tags[i],"FORMAT/AD") || !strcasecmp(tags[i],"FMT/AD") ) flag |= B2B_FMT_AD; - else if ( !strcasecmp(tags[i],"ADF") || !strcasecmp(tags[i],"FORMAT/ADF") || !strcasecmp(tags[i],"FMT/ADF") ) flag |= B2B_FMT_ADF; - else if ( !strcasecmp(tags[i],"ADR") || !strcasecmp(tags[i],"FORMAT/ADR") || !strcasecmp(tags[i],"FMT/ADR") ) flag |= B2B_FMT_ADR; -+ else if ( !strcasecmp(tags[i],"SCR") || !strcasecmp(tags[i],"FORMAT/SCR") || !strcasecmp(tags[i],"FMT/SCR") ) flag |= B2B_FMT_SCR; -+ else if ( !strcasecmp(tags[i],"INFO/SCR") ) flag |= B2B_INFO_SCR; - else if ( !strcasecmp(tags[i],"INFO/AD") ) flag |= B2B_INFO_AD; - else if ( !strcasecmp(tags[i],"INFO/ADF") ) flag |= B2B_INFO_ADF; - else if ( !strcasecmp(tags[i],"INFO/ADR") ) flag |= B2B_INFO_ADR; -@@ -779,6 +808,9 @@ - return flag; - } - -+// todo: make it possible to turn off some annotations or change the defaults, -+// specifically RPB, VDB, MWU, SGB tests. It would be good to do some -+// benchmarking first to see if it's worth it. - static void list_annotations(FILE *fp) - { - fprintf(fp, -@@ -790,12 +822,14 @@ - " FORMAT/ADR .. Allelic depths on the reverse strand (Number=R,Type=Integer)\n" - " FORMAT/DP .. Number of high-quality bases (Number=1,Type=Integer)\n" - " FORMAT/SP .. Phred-scaled strand bias P-value (Number=1,Type=Integer)\n" -+" FORMAT/SCR .. 
Number of soft-clipped reads (Number=1,Type=Integer)\n" - "\n" - "INFO annotation tags available:\n" - "\n" - " INFO/AD .. Total allelic depth (Number=R,Type=Integer)\n" - " INFO/ADF .. Total allelic depths on the forward strand (Number=R,Type=Integer)\n" - " INFO/ADR .. Total allelic depths on the reverse strand (Number=R,Type=Integer)\n" -+" INFO/SCR .. Number of soft-clipped reads (Number=1,Type=Integer)\n" - "\n"); - } - -@@ -818,7 +852,7 @@ - " -b, --bam-list FILE list of input BAM filenames, one per line\n" - " -B, --no-BAQ disable BAQ (per-Base Alignment Quality)\n" - " -C, --adjust-MQ INT adjust mapping quality; recommended:50, disable:0 [0]\n" --" -d, --max-depth INT max per-file depth; avoids excessive memory usage [%d]\n", mplp->max_depth); -+" -d, --max-depth INT max raw per-file depth; avoids excessive memory usage [%d]\n", mplp->max_depth); - fprintf(fp, - " -E, --redo-BAQ recalculate BAQ on the fly, ignore existing BQs\n" - " -f, --fasta-ref FILE faidx indexed reference sequence file\n" -@@ -850,7 +884,7 @@ - " -o, --output FILE write output to FILE [standard output]\n" - " -O, --output-type TYPE 'b' compressed BCF; 'u' uncompressed BCF;\n" - " 'z' compressed VCF; 'v' uncompressed VCF [v]\n" --" --threads INT number of extra output compression threads [0]\n" -+" --threads INT use multithreading with INT worker threads [0]\n" - "\n" - "SNP/INDEL genotype likelihoods options:\n" - " -e, --ext-prob INT Phred-scaled gap extension seq error probability [%d]\n", mplp->extQ); -@@ -870,6 +904,10 @@ - " -P, --platforms STR comma separated list of platforms for indels [all]\n" - "\n" - "Notes: Assuming diploid individuals.\n" -+"\n" -+"Example:\n" -+" # See also http://samtools.github.io/bcftools/howtos/variant-calling.html\n" -+" bcftools mpileup -f reference.fa alignments.bam | bcftools call -mv -Ob -o calls.bcf\n" - "\n"); - - free(tmp_require); -@@ -897,6 +935,7 @@ - mplp.record_cmd_line = 1; - mplp.n_threads = 0; - mplp.bsmpl = bam_smpl_init(); -+ 
mplp.fmt_flag = B2B_INFO_VDB|B2B_INFO_RPB; // the default to be changed in future, see also parse_format_flag() - - static const struct option lopts[] = - { -@@ -1049,7 +1088,7 @@ - - if ( mplp.gvcf && !(mplp.fmt_flag&B2B_FMT_DP) ) - { -- fprintf(stderr,"[warning] The -t DP option is required with --gvcf, switching on.\n"); -+ fprintf(stderr,"[warning] The -a DP option is required with --gvcf, switching on.\n"); - mplp.fmt_flag |= B2B_FMT_DP; - } - if ( mplp.flag&(MPLP_BCF|MPLP_VCF|MPLP_NO_COMP) ) ---- python-pysam.orig/bcftools/mpileup.c.pysam.c -+++ python-pysam/bcftools/mpileup.c.pysam.c -@@ -2,7 +2,7 @@ - - /* mpileup.c -- mpileup subcommand. Previously bam_plcmd.c from samtools - -- Copyright (C) 2008-2017 Genome Research Ltd. -+ Copyright (C) 2008-2018 Genome Research Ltd. - Portions copyright (C) 2009-2012 Broad Institute. - - Author: Heng Li -@@ -33,6 +33,7 @@ - #include - #include - #include -+#include - #include - #include - #include -@@ -224,8 +225,8 @@ - if (ma->conf->fai && b->core.tid >= 0) { - has_ref = mplp_get_ref(ma, b->core.tid, &ref, &ref_len); - if (has_ref && ref_len <= b->core.pos) { // exclude reads outside of the reference sequence -- fprintf(bcftools_stderr,"[%s] Skipping because %d is outside of %d [ref:%d]\n", -- __func__, b->core.pos, ref_len, b->core.tid); -+ fprintf(bcftools_stderr,"[%s] Skipping because %"PRId64" is outside of %d [ref:%d]\n", -+ __func__, (int64_t) b->core.pos, ref_len, b->core.tid); - continue; - } - } else { -@@ -248,13 +249,28 @@ - - // Called once per new bam added to the pileup. - // We cache sample information here so we don't have to keep recomputing this --// on each and every pileup column. -+// on each and every pileup column. If FMT/SCR annotation is requested, a flag -+// is set to indicate the presence of a soft clip. - // - // Cd is an arbitrary block of data we can write into, which ends up in --// the pileup structures. We stash the sample ID there. 
--static int pileup_constructor(void *data, const bam1_t *b, bam_pileup_cd *cd) { -+// the pileup structures. We stash the sample ID there: -+// has_soft_clip .. cd->i & 1 -+// sample_id .. cd->i >> 1 -+static int pileup_constructor(void *data, const bam1_t *b, bam_pileup_cd *cd) -+{ - mplp_aux_t *ma = (mplp_aux_t *)data; -- cd->i = bam_smpl_get_sample_id(ma->conf->bsmpl, ma->bam_id, (bam1_t *)b); -+ cd->i = bam_smpl_get_sample_id(ma->conf->bsmpl, ma->bam_id, (bam1_t *)b) << 1; -+ if ( ma->conf->fmt_flag & (B2B_INFO_SCR|B2B_FMT_SCR) ) -+ { -+ int i; -+ for (i=0; icore.n_cigar; i++) -+ { -+ int cig = bam_get_cigar(b)[i] & BAM_CIGAR_MASK; -+ if ( cig!=BAM_CSOFT_CLIP ) continue; -+ cd->i |= 1; -+ break; -+ } -+ } - return 0; - } - -@@ -267,7 +283,7 @@ - for (j = 0; j < n_plp[i]; ++j) // iterate over all reads available at this position - { - const bam_pileup1_t *p = plp[i] + j; -- int id = p->cd.i; -+ int id = PLP_SAMPLE_ID(p->cd.i); - if (m->n_plp[id] == m->m_plp[id]) - { - m->m_plp[id] = m->m_plp[id]? 
m->m_plp[id]<<1 : 8; -@@ -282,7 +298,7 @@ - { - if ( !conf->gvcf ) - { -- if ( rec ) bcf_write1(fp, hdr, rec); -+ if ( rec && bcf_write1(fp, hdr, rec)!=0 ) error("[%s] Error: failed to write the record to %s\n", __func__,conf->output_fname?conf->output_fname:"standard output"); - return; - } - -@@ -300,7 +316,7 @@ - if ( rec->d.allele[1][0]=='<' && rec->d.allele[1][1]=='*' && rec->d.allele[1][2]=='>' ) is_ref = 1; - } - rec = gvcf_write(conf->gvcf, fp, hdr, rec, is_ref); -- if ( rec ) bcf_write1(fp,hdr,rec); -+ if ( rec && bcf_write1(fp,hdr,rec)!=0 ) error("[%s] Error: failed to write the record to %s\n", __func__,conf->output_fname?conf->output_fname:"standard output"); - } - - static int mpileup_reg(mplp_conf_t *conf, uint32_t beg, uint32_t end) -@@ -312,7 +328,7 @@ - - while ( (ret=bam_mplp_auto(conf->iter, &tid, &pos, conf->n_plp, conf->plp)) > 0) - { -- if ( end && (posend) ) continue; -+ if ( posend ) continue; - if ( conf->bed && tid >= 0 ) - { - int overlap = regidx_overlap(conf->bed, hdr->target_name[tid], pos, pos, NULL); -@@ -523,11 +539,13 @@ - - bcf_hdr_append(conf->bcf_hdr,"##ALT="); - bcf_hdr_append(conf->bcf_hdr,"##INFO="); -- bcf_hdr_append(conf->bcf_hdr,"##INFO="); -- bcf_hdr_append(conf->bcf_hdr,"##INFO="); -+ bcf_hdr_append(conf->bcf_hdr,"##INFO="); -+ bcf_hdr_append(conf->bcf_hdr,"##INFO="); - bcf_hdr_append(conf->bcf_hdr,"##INFO="); -- bcf_hdr_append(conf->bcf_hdr,"##INFO="); -- bcf_hdr_append(conf->bcf_hdr,"##INFO="); -+ if ( conf->fmt_flag&B2B_INFO_VDB ) -+ bcf_hdr_append(conf->bcf_hdr,"##INFO="); -+ if ( conf->fmt_flag&B2B_INFO_RPB ) -+ bcf_hdr_append(conf->bcf_hdr,"##INFO="); - bcf_hdr_append(conf->bcf_hdr,"##INFO="); - bcf_hdr_append(conf->bcf_hdr,"##INFO="); - bcf_hdr_append(conf->bcf_hdr,"##INFO="); -@@ -555,17 +573,21 @@ - if ( conf->fmt_flag&B2B_FMT_SP ) - bcf_hdr_append(conf->bcf_hdr,"##FORMAT="); - if ( conf->fmt_flag&B2B_FMT_AD ) -- bcf_hdr_append(conf->bcf_hdr,"##FORMAT="); -+ bcf_hdr_append(conf->bcf_hdr,"##FORMAT="); - if ( 
conf->fmt_flag&B2B_FMT_ADF ) -- bcf_hdr_append(conf->bcf_hdr,"##FORMAT="); -+ bcf_hdr_append(conf->bcf_hdr,"##FORMAT="); - if ( conf->fmt_flag&B2B_FMT_ADR ) -- bcf_hdr_append(conf->bcf_hdr,"##FORMAT="); -+ bcf_hdr_append(conf->bcf_hdr,"##FORMAT="); - if ( conf->fmt_flag&B2B_INFO_AD ) -- bcf_hdr_append(conf->bcf_hdr,"##INFO="); -+ bcf_hdr_append(conf->bcf_hdr,"##INFO="); - if ( conf->fmt_flag&B2B_INFO_ADF ) -- bcf_hdr_append(conf->bcf_hdr,"##INFO="); -+ bcf_hdr_append(conf->bcf_hdr,"##INFO="); -+ if ( conf->fmt_flag&B2B_INFO_SCR ) -+ bcf_hdr_append(conf->bcf_hdr,"##INFO="); -+ if ( conf->fmt_flag&B2B_FMT_SCR ) -+ bcf_hdr_append(conf->bcf_hdr,"##FORMAT="); - if ( conf->fmt_flag&B2B_INFO_ADR ) -- bcf_hdr_append(conf->bcf_hdr,"##INFO="); -+ bcf_hdr_append(conf->bcf_hdr,"##INFO="); - if ( conf->gvcf ) - gvcf_update_header(conf->gvcf, conf->bcf_hdr); - -@@ -573,7 +595,7 @@ - const char **smpl = bam_smpl_get_samples(conf->bsmpl, &nsmpl); - for (i=0; ibcf_hdr, smpl[i]); -- bcf_hdr_write(conf->bcf_fp, conf->bcf_hdr); -+ if ( bcf_hdr_write(conf->bcf_fp, conf->bcf_hdr)!=0 ) error("[%s] Error: failed to write the header to %s\n",__func__,conf->output_fname?conf->output_fname:"standard output"); - - conf->bca = bcf_call_init(-1., conf->min_baseQ); - conf->bcr = (bcf_callret1_t*) calloc(nsmpl, sizeof(bcf_callret1_t)); -@@ -581,6 +603,7 @@ - conf->bca->min_frac = conf->min_frac; - conf->bca->min_support = conf->min_support; - conf->bca->per_sample_flt = conf->flag & MPLP_PER_SAMPLE; -+ conf->bca->fmt_flag = conf->fmt_flag; - - conf->bc.bcf_hdr = conf->bcf_hdr; - conf->bc.n = nsmpl; -@@ -601,11 +624,14 @@ - conf->bcr[i].ADF = conf->bc.ADF + (i+1)*B2B_MAX_ALLELES; - } - } -+ if ( conf->fmt_flag&(B2B_INFO_SCR|B2B_FMT_SCR) ) -+ conf->bc.SCR = (int32_t*) malloc((nsmpl+1)*sizeof(*conf->bc.SCR)); - } - - // init mpileup - conf->iter = bam_mplp_init(conf->nfiles, mplp_func, (void**)conf->mplp_data); - if ( conf->flag & MPLP_SMART_OVERLAPS ) bam_mplp_init_overlaps(conf->iter); -+ 
fprintf(bcftools_stderr, "[%s] maximum number of reads per input file set to -d %d\n", __func__, conf->max_depth); - if ( (double)conf->max_depth * conf->nfiles > 1<<20) - fprintf(bcftools_stderr, "Warning: Potential memory hog, up to %.0fM reads in the pileup!\n", (double)conf->max_depth*conf->nfiles); - if ( (double)conf->max_depth * conf->nfiles / nsmpl < 250 ) -@@ -625,7 +651,7 @@ - if ( ireg++ > 0 ) - { - conf->buf.l = 0; -- ksprintf(&conf->buf,"%s:%u-%u",conf->reg_itr->seq,conf->reg_itr->beg,conf->reg_itr->end); -+ ksprintf(&conf->buf,"%s:%u-%u",conf->reg_itr->seq,conf->reg_itr->beg+1,conf->reg_itr->end+1); - - for (i=0; infiles; i++) - { -@@ -649,7 +675,7 @@ - while ( regitr_loop(conf->reg_itr) ); - } - else -- mpileup_reg(conf,0,0); -+ mpileup_reg(conf,0,UINT32_MAX); - - flush_bcf_records(conf, conf->bcf_fp, conf->bcf_hdr, NULL); - -@@ -658,13 +684,14 @@ - bcf_destroy1(conf->bcf_rec); - if (conf->bcf_fp) - { -- hts_close(conf->bcf_fp); -+ if ( hts_close(conf->bcf_fp)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,conf->output_fname); - bcf_hdr_destroy(conf->bcf_hdr); - bcf_call_destroy(conf->bca); - free(conf->bc.PL); - free(conf->bc.DP4); - free(conf->bc.ADR); - free(conf->bc.ADF); -+ free(conf->bc.SCR); - free(conf->bc.fmt_arr); - free(conf->bcr); - } -@@ -740,7 +767,7 @@ - files = (char**) realloc(files,nfiles*sizeof(char*)); - files[nfiles-1] = strdup(buf); - } -- fclose(fh); -+ if ( fclose(fh)!=0 ) error("[%s] Error: close failed .. 
%s\n", __func__,file_list); - if ( !nfiles ) - { - fprintf(bcftools_stderr,"No files read from %s\n", file_list); -@@ -767,6 +794,8 @@ - else if ( !strcasecmp(tags[i],"AD") || !strcasecmp(tags[i],"FORMAT/AD") || !strcasecmp(tags[i],"FMT/AD") ) flag |= B2B_FMT_AD; - else if ( !strcasecmp(tags[i],"ADF") || !strcasecmp(tags[i],"FORMAT/ADF") || !strcasecmp(tags[i],"FMT/ADF") ) flag |= B2B_FMT_ADF; - else if ( !strcasecmp(tags[i],"ADR") || !strcasecmp(tags[i],"FORMAT/ADR") || !strcasecmp(tags[i],"FMT/ADR") ) flag |= B2B_FMT_ADR; -+ else if ( !strcasecmp(tags[i],"SCR") || !strcasecmp(tags[i],"FORMAT/SCR") || !strcasecmp(tags[i],"FMT/SCR") ) flag |= B2B_FMT_SCR; -+ else if ( !strcasecmp(tags[i],"INFO/SCR") ) flag |= B2B_INFO_SCR; - else if ( !strcasecmp(tags[i],"INFO/AD") ) flag |= B2B_INFO_AD; - else if ( !strcasecmp(tags[i],"INFO/ADF") ) flag |= B2B_INFO_ADF; - else if ( !strcasecmp(tags[i],"INFO/ADR") ) flag |= B2B_INFO_ADR; -@@ -781,6 +810,9 @@ - return flag; - } - -+// todo: make it possible to turn off some annotations or change the defaults, -+// specifically RPB, VDB, MWU, SGB tests. It would be good to do some -+// benchmarking first to see if it's worth it. - static void list_annotations(FILE *fp) - { - fprintf(fp, -@@ -792,12 +824,14 @@ - " FORMAT/ADR .. Allelic depths on the reverse strand (Number=R,Type=Integer)\n" - " FORMAT/DP .. Number of high-quality bases (Number=1,Type=Integer)\n" - " FORMAT/SP .. Phred-scaled strand bias P-value (Number=1,Type=Integer)\n" -+" FORMAT/SCR .. Number of soft-clipped reads (Number=1,Type=Integer)\n" - "\n" - "INFO annotation tags available:\n" - "\n" - " INFO/AD .. Total allelic depth (Number=R,Type=Integer)\n" - " INFO/ADF .. Total allelic depths on the forward strand (Number=R,Type=Integer)\n" - " INFO/ADR .. Total allelic depths on the reverse strand (Number=R,Type=Integer)\n" -+" INFO/SCR .. 
Number of soft-clipped reads (Number=1,Type=Integer)\n" - "\n"); - } - -@@ -820,7 +854,7 @@ - " -b, --bam-list FILE list of input BAM filenames, one per line\n" - " -B, --no-BAQ disable BAQ (per-Base Alignment Quality)\n" - " -C, --adjust-MQ INT adjust mapping quality; recommended:50, disable:0 [0]\n" --" -d, --max-depth INT max per-file depth; avoids excessive memory usage [%d]\n", mplp->max_depth); -+" -d, --max-depth INT max raw per-file depth; avoids excessive memory usage [%d]\n", mplp->max_depth); - fprintf(fp, - " -E, --redo-BAQ recalculate BAQ on the fly, ignore existing BQs\n" - " -f, --fasta-ref FILE faidx indexed reference sequence file\n" -@@ -852,7 +886,7 @@ - " -o, --output FILE write output to FILE [standard output]\n" - " -O, --output-type TYPE 'b' compressed BCF; 'u' uncompressed BCF;\n" - " 'z' compressed VCF; 'v' uncompressed VCF [v]\n" --" --threads INT number of extra output compression threads [0]\n" -+" --threads INT use multithreading with INT worker threads [0]\n" - "\n" - "SNP/INDEL genotype likelihoods options:\n" - " -e, --ext-prob INT Phred-scaled gap extension seq error probability [%d]\n", mplp->extQ); -@@ -872,6 +906,10 @@ - " -P, --platforms STR comma separated list of platforms for indels [all]\n" - "\n" - "Notes: Assuming diploid individuals.\n" -+"\n" -+"Example:\n" -+" # See also http://samtools.github.io/bcftools/howtos/variant-calling.html\n" -+" bcftools mpileup -f reference.fa alignments.bam | bcftools call -mv -Ob -o calls.bcf\n" - "\n"); - - free(tmp_require); -@@ -899,6 +937,7 @@ - mplp.record_cmd_line = 1; - mplp.n_threads = 0; - mplp.bsmpl = bam_smpl_init(); -+ mplp.fmt_flag = B2B_INFO_VDB|B2B_INFO_RPB; // the default to be changed in future, see also parse_format_flag() - - static const struct option lopts[] = - { -@@ -1051,7 +1090,7 @@ - - if ( mplp.gvcf && !(mplp.fmt_flag&B2B_FMT_DP) ) - { -- fprintf(bcftools_stderr,"[warning] The -t DP option is required with --gvcf, switching on.\n"); -+ 
fprintf(bcftools_stderr,"[warning] The -a DP option is required with --gvcf, switching on.\n"); - mplp.fmt_flag |= B2B_FMT_DP; - } - if ( mplp.flag&(MPLP_BCF|MPLP_VCF|MPLP_NO_COMP) ) ---- python-pysam.orig/bcftools/plugins/GTisec.c -+++ python-pysam/bcftools/plugins/GTisec.c -@@ -320,7 +320,7 @@ - int gte_smp = 0; // number GT array entries per sample (should be 2, one entry per allele) - if ( (gte_smp = bcf_get_genotypes(args.hdr, rec, &(args.gt_arr), &(args.ngt_arr) ) ) <= 0 ) - { -- error("GT not present at %s: %d\n", args.hdr->id[BCF_DT_CTG][rec->rid].key, rec->pos+1); -+ error("GT not present at %s: %"PRId64"\n", args.hdr->id[BCF_DT_CTG][rec->rid].key, (int64_t) rec->pos+1); - } - - gte_smp /= args.nsmp; // divide total number of genotypes array entries (= args.ngt_arr) by number of samples ---- python-pysam.orig/bcftools/plugins/GTisec.c.pysam.c -+++ python-pysam/bcftools/plugins/GTisec.c.pysam.c -@@ -322,7 +322,7 @@ - int gte_smp = 0; // number GT array entries per sample (should be 2, one entry per allele) - if ( (gte_smp = bcf_get_genotypes(args.hdr, rec, &(args.gt_arr), &(args.ngt_arr) ) ) <= 0 ) - { -- error("GT not present at %s: %d\n", args.hdr->id[BCF_DT_CTG][rec->rid].key, rec->pos+1); -+ error("GT not present at %s: %"PRId64"\n", args.hdr->id[BCF_DT_CTG][rec->rid].key, (int64_t) rec->pos+1); - } - - gte_smp /= args.nsmp; // divide total number of genotypes array entries (= args.ngt_arr) by number of samples ---- python-pysam.orig/bcftools/plugins/GTsubset.c -+++ python-pysam/bcftools/plugins/GTsubset.c -@@ -163,7 +163,7 @@ - args.ngt_arr = 0; /*! 
hold the number of current GT array entries */ - if ( (gte_smp = bcf_get_genotypes(args.hdr, rec, &(args.gt_arr), &(args.ngt_arr) ) ) <= 0 ) - { -- error("GT not present at %s: %d\n", args.hdr->id[BCF_DT_CTG][rec->rid].key, rec->pos+1); -+ error("GT not present at %s: %"PRId64"\n", args.hdr->id[BCF_DT_CTG][rec->rid].key, (int64_t) rec->pos+1); - } - - gte_smp /= args.nsmp; // divide total number of genotypes array entries (= args.ngt_arr) by number of samples ---- python-pysam.orig/bcftools/plugins/GTsubset.c.pysam.c -+++ python-pysam/bcftools/plugins/GTsubset.c.pysam.c -@@ -165,7 +165,7 @@ - args.ngt_arr = 0; /*! hold the number of current GT array entries */ - if ( (gte_smp = bcf_get_genotypes(args.hdr, rec, &(args.gt_arr), &(args.ngt_arr) ) ) <= 0 ) - { -- error("GT not present at %s: %d\n", args.hdr->id[BCF_DT_CTG][rec->rid].key, rec->pos+1); -+ error("GT not present at %s: %"PRId64"\n", args.hdr->id[BCF_DT_CTG][rec->rid].key, (int64_t) rec->pos+1); - } - - gte_smp /= args.nsmp; // divide total number of genotypes array entries (= args.ngt_arr) by number of samples ---- python-pysam.orig/bcftools/plugins/ad-bias.c -+++ python-pysam/bcftools/plugins/ad-bias.c -@@ -26,6 +26,7 @@ - - #include - #include -+#include - #include - #include - #include -@@ -55,6 +56,7 @@ - convert_t *convert; - kstring_t str; - uint64_t nsite,ncmp; -+ int variant_type; - } - args_t; - -@@ -75,11 +77,12 @@ - " run \"bcftools plugin\" for a list of common options\n" - "\n" - "Plugin options:\n" -- " -a, --min-alt-dp Minimum required alternate allele depth [1]\n" -- " -d, --min-dp Minimum required depth [0]\n" -- " -f, --format Optional tags to append to output (`bcftools query` style of format)\n" -- " -s, --samples List of sample pairs, one tab-delimited pair per line\n" -- " -t, --threshold Output only hits with p-value smaller than [1e-3]\n" -+ " -a, --min-alt-dp Minimum required alternate allele depth [1]\n" -+ " -d, --min-dp Minimum required depth [0]\n" -+ " -f, --format Optional 
tags to append to output (`bcftools query` style of format)\n" -+ " -s, --samples List of sample pairs, one tab-delimited pair per line\n" -+ " -t, --threshold Output only hits with p-value smaller than [1e-3]\n" -+ " -v, --variant-type Consider only variants of this type. (By default all variants are considered.)\n" - "\n" - "Example:\n" - " bcftools +ad-bias file.bcf -- -t 1e-3 -s samples.txt\n" -@@ -117,7 +120,7 @@ - - free(str.s); - free(off); -- hts_close(fp); -+ if ( hts_close(fp)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,fname); - } - - int init(int argc, char **argv, bcf_hdr_t *in, bcf_hdr_t *out) -@@ -134,11 +137,12 @@ - {"format",required_argument,NULL,'f'}, - {"samples",required_argument,NULL,'s'}, - {"threshold",required_argument,NULL,'t'}, -+ {"variant-type",required_argument,NULL,'v'}, - {NULL,0,NULL,0} - }; - int c; - char *tmp; -- while ((c = getopt_long(argc, argv, "?hs:t:f:d:a:",loptions,NULL)) >= 0) -+ while ((c = getopt_long(argc, argv, "?hs:t:f:d:a:v:",loptions,NULL)) >= 0) - { - switch (c) - { -@@ -155,6 +159,11 @@ - if ( *tmp ) error("Could not parse: -t %s\n", optarg); - break; - case 's': fname = optarg; break; -+ case 'v': -+ if ( !strcasecmp(optarg,"snp") || !strcasecmp(optarg,"snps") ) args.variant_type = VCF_SNP; -+ else if ( !strcasecmp(optarg,"indel") || !strcasecmp(optarg,"indels") ) args.variant_type = VCF_INDEL; -+ else error("Error: Variant type \"%s\" is not supported\n",optarg); -+ break; - case 'f': format = optarg; break; - case 'h': - case '?': -@@ -168,14 +177,29 @@ - printf("# The command line was:\tbcftools +ad-bias %s", argv[0]); - for (c=1; cn_allele < 2 ) return NULL; -+ - int nad = bcf_get_format_int32(args.hdr, rec, "AD", &args.ad_arr, &args.mad_arr); - if ( nad<0 ) return NULL; - nad /= bcf_hdr_nsamples(args.hdr); -@@ -183,30 +207,78 @@ - if ( args.convert ) convert_line(args.convert, rec, &args.str); - args.nsite++; - -- int i; -+ int i,j; - for (i=0; ismpl; - int32_t *bptr = args.ad_arr + 
nad*pair->ctrl; - -- if ( aptr[0]==bcf_int32_missing ) continue; -- if ( bptr[0]==bcf_int32_missing ) continue; -- if ( aptr[0]+aptr[1] < args.min_dp ) continue; -- if ( bptr[0]+bptr[1] < args.min_dp ) continue; -- if ( aptr[1] < args.min_alt_dp && bptr[1] < args.min_alt_dp ) continue; -+ // Find the two most frequent alleles -+ int nbig=-1,nsmall=-1,ibig=-1,ismall=-1; -+ for (j=0; jd.allele[ibig])!=strlen(rec->d.allele[ismall]) ) continue; -+ if ( args.variant_type==VCF_INDEL && strlen(rec->d.allele[ibig])==strlen(rec->d.allele[ismall]) ) continue; -+ } -+ -+ int iref,ialt,nalt; -+ if ( ibig > ismall ) ialt = ibig, iref = ismall, nalt = nbig; -+ else ialt = ismall, iref = ibig, nalt = nsmall; -+ -+ if ( nalt < args.min_alt_dp ) continue; - - args.ncmp++; - -- int n11 = aptr[0], n12 = aptr[1]; -- int n21 = bptr[0], n22 = bptr[1]; -+ int n11 = aptr[iref], n12 = aptr[ialt]; -+ int n21 = bptr[iref], n22 = bptr[ialt]; - double left, right, fisher; - kt_fisher_exact(n11,n12,n21,n22, &left,&right,&fisher); - if ( fisher >= args.th ) continue; - -- printf("FT\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\t%e", -+ printf("FT\t%s\t%s\t%s\t%"PRId64"\t%s\t%s\t%d\t%d\t%d\t%d\t%e", - pair->smpl_name,pair->ctrl_name, -- bcf_hdr_id2name(args.hdr,rec->rid), rec->pos+1, -+ bcf_hdr_id2name(args.hdr,rec->rid), (int64_t) rec->pos+1, -+ rec->d.allele[iref],rec->d.allele[ialt], - n11,n12,n21,n22, fisher - ); - if ( args.convert ) printf("\t%s", args.str.s); ---- python-pysam.orig/bcftools/plugins/ad-bias.c.pysam.c -+++ python-pysam/bcftools/plugins/ad-bias.c.pysam.c -@@ -28,6 +28,7 @@ - - #include - #include -+#include - #include - #include - #include -@@ -57,6 +58,7 @@ - convert_t *convert; - kstring_t str; - uint64_t nsite,ncmp; -+ int variant_type; - } - args_t; - -@@ -77,11 +79,12 @@ - " run \"bcftools plugin\" for a list of common options\n" - "\n" - "Plugin options:\n" -- " -a, --min-alt-dp Minimum required alternate allele depth [1]\n" -- " -d, --min-dp Minimum required depth [0]\n" -- " -f, 
--format Optional tags to append to output (`bcftools query` style of format)\n" -- " -s, --samples List of sample pairs, one tab-delimited pair per line\n" -- " -t, --threshold Output only hits with p-value smaller than [1e-3]\n" -+ " -a, --min-alt-dp Minimum required alternate allele depth [1]\n" -+ " -d, --min-dp Minimum required depth [0]\n" -+ " -f, --format Optional tags to append to output (`bcftools query` style of format)\n" -+ " -s, --samples List of sample pairs, one tab-delimited pair per line\n" -+ " -t, --threshold Output only hits with p-value smaller than [1e-3]\n" -+ " -v, --variant-type Consider only variants of this type. (By default all variants are considered.)\n" - "\n" - "Example:\n" - " bcftools +ad-bias file.bcf -- -t 1e-3 -s samples.txt\n" -@@ -119,7 +122,7 @@ - - free(str.s); - free(off); -- hts_close(fp); -+ if ( hts_close(fp)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,fname); - } - - int init(int argc, char **argv, bcf_hdr_t *in, bcf_hdr_t *out) -@@ -136,11 +139,12 @@ - {"format",required_argument,NULL,'f'}, - {"samples",required_argument,NULL,'s'}, - {"threshold",required_argument,NULL,'t'}, -+ {"variant-type",required_argument,NULL,'v'}, - {NULL,0,NULL,0} - }; - int c; - char *tmp; -- while ((c = getopt_long(argc, argv, "?hs:t:f:d:a:",loptions,NULL)) >= 0) -+ while ((c = getopt_long(argc, argv, "?hs:t:f:d:a:v:",loptions,NULL)) >= 0) - { - switch (c) - { -@@ -157,6 +161,11 @@ - if ( *tmp ) error("Could not parse: -t %s\n", optarg); - break; - case 's': fname = optarg; break; -+ case 'v': -+ if ( !strcasecmp(optarg,"snp") || !strcasecmp(optarg,"snps") ) args.variant_type = VCF_SNP; -+ else if ( !strcasecmp(optarg,"indel") || !strcasecmp(optarg,"indels") ) args.variant_type = VCF_INDEL; -+ else error("Error: Variant type \"%s\" is not supported\n",optarg); -+ break; - case 'f': format = optarg; break; - case 'h': - case '?': -@@ -170,14 +179,29 @@ - fprintf(bcftools_stdout, "# The command line was:\tbcftools +ad-bias %s", 
argv[0]); - for (c=1; cn_allele < 2 ) return NULL; -+ - int nad = bcf_get_format_int32(args.hdr, rec, "AD", &args.ad_arr, &args.mad_arr); - if ( nad<0 ) return NULL; - nad /= bcf_hdr_nsamples(args.hdr); -@@ -185,30 +209,78 @@ - if ( args.convert ) convert_line(args.convert, rec, &args.str); - args.nsite++; - -- int i; -+ int i,j; - for (i=0; ismpl; - int32_t *bptr = args.ad_arr + nad*pair->ctrl; - -- if ( aptr[0]==bcf_int32_missing ) continue; -- if ( bptr[0]==bcf_int32_missing ) continue; -- if ( aptr[0]+aptr[1] < args.min_dp ) continue; -- if ( bptr[0]+bptr[1] < args.min_dp ) continue; -- if ( aptr[1] < args.min_alt_dp && bptr[1] < args.min_alt_dp ) continue; -+ // Find the two most frequent alleles -+ int nbig=-1,nsmall=-1,ibig=-1,ismall=-1; -+ for (j=0; jd.allele[ibig])!=strlen(rec->d.allele[ismall]) ) continue; -+ if ( args.variant_type==VCF_INDEL && strlen(rec->d.allele[ibig])==strlen(rec->d.allele[ismall]) ) continue; -+ } -+ -+ int iref,ialt,nalt; -+ if ( ibig > ismall ) ialt = ibig, iref = ismall, nalt = nbig; -+ else ialt = ismall, iref = ibig, nalt = nsmall; -+ -+ if ( nalt < args.min_alt_dp ) continue; - - args.ncmp++; - -- int n11 = aptr[0], n12 = aptr[1]; -- int n21 = bptr[0], n22 = bptr[1]; -+ int n11 = aptr[iref], n12 = aptr[ialt]; -+ int n21 = bptr[iref], n22 = bptr[ialt]; - double left, right, fisher; - kt_fisher_exact(n11,n12,n21,n22, &left,&right,&fisher); - if ( fisher >= args.th ) continue; - -- fprintf(bcftools_stdout, "FT\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\t%e", -+ fprintf(bcftools_stdout, "FT\t%s\t%s\t%s\t%"PRId64"\t%s\t%s\t%d\t%d\t%d\t%d\t%e", - pair->smpl_name,pair->ctrl_name, -- bcf_hdr_id2name(args.hdr,rec->rid), rec->pos+1, -+ bcf_hdr_id2name(args.hdr,rec->rid), (int64_t) rec->pos+1, -+ rec->d.allele[iref],rec->d.allele[ialt], - n11,n12,n21,n22, fisher - ); - if ( args.convert ) fprintf(bcftools_stdout, "\t%s", args.str.s); ---- /dev/null -+++ python-pysam/bcftools/plugins/add-variantkey.c -@@ -0,0 +1,86 @@ -+/* plugins/add-variantkey.c 
-- add VariantKey INFO field. -+ -+ Copyright (C) 2017-2018 GENOMICS plc. -+ -+ Author: Nicola Asuni -+ -+Permission is hereby granted, free of charge, to any person obtaining a copy -+of this software and associated documentation files (the "Software"), to deal -+in the Software without restriction, including without limitation the rights -+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -+copies of the Software, and to permit persons to whom the Software is -+furnished to do so, subject to the following conditions: -+ -+The above copyright notice and this permission notice shall be included in -+all copies or substantial portions of the Software. -+ -+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -+DEALINGS IN THE SOFTWARE. 
*/ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include "../variantkey.h" -+ -+bcf_hdr_t *in_hdr, *out_hdr; -+ -+const char *about(void) -+{ -+ return "Add VariantKey INFO fields VKX and RSX.\n"; -+} -+ -+const char *usage(void) -+{ -+ return -+ "\n" -+ "About: Add VKX and RSX columns.\n" -+ "Usage: bcftools +add-variantkey [General Options] \n" -+ "Options:\n" -+ " run \"bcftools plugin\" for a list of common options\n" -+ "\n" -+ "Example:\n" -+ " bcftools +add-variantkey in.vcf\n" -+ "\n"; -+} -+ -+int init(int argc, char **argv, bcf_hdr_t *in, bcf_hdr_t *out) -+{ -+ in_hdr = in; -+ out_hdr = out; -+ bcf_hdr_append(out_hdr, "##INFO="); -+ bcf_hdr_append(out_hdr, "##INFO="); -+ return 0; -+} -+ -+bcf1_t *process(bcf1_t *rec) -+{ -+ uint64_t vk = variantkey( -+ in_hdr->id[BCF_DT_CTG][rec->rid].key, -+ strlen(in_hdr->id[BCF_DT_CTG][rec->rid].key), -+ rec->pos, -+ rec->d.allele[0], -+ strlen(rec->d.allele[0]), -+ rec->d.allele[1], -+ strlen(rec->d.allele[1])); -+ char vs[17]; -+ variantkey_hex(vk, vs); -+ bcf_update_info_string(out_hdr, rec, "VKX", vs); -+ char rsid[9]; -+ char *ptr = rec->d.id; -+ ptr += 2; // remove 'rs' -+ sprintf(rsid, "%08" PRIx32, (uint32_t)strtoul(ptr, NULL, 10)); -+ bcf_update_info_string(out_hdr, rec, "RSX", rsid); -+ return rec; -+} -+ -+void destroy(void) -+{ -+} ---- /dev/null -+++ python-pysam/bcftools/plugins/add-variantkey.c.pysam.c -@@ -0,0 +1,88 @@ -+#include "bcftools.pysam.h" -+ -+/* plugins/add-variantkey.c -- add VariantKey INFO field. -+ -+ Copyright (C) 2017-2018 GENOMICS plc. 
-+ -+ Author: Nicola Asuni -+ -+Permission is hereby granted, free of charge, to any person obtaining a copy -+of this software and associated documentation files (the "Software"), to deal -+in the Software without restriction, including without limitation the rights -+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -+copies of the Software, and to permit persons to whom the Software is -+furnished to do so, subject to the following conditions: -+ -+The above copyright notice and this permission notice shall be included in -+all copies or substantial portions of the Software. -+ -+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -+DEALINGS IN THE SOFTWARE. 
*/ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include "../variantkey.h" -+ -+bcf_hdr_t *in_hdr, *out_hdr; -+ -+const char *about(void) -+{ -+ return "Add VariantKey INFO fields VKX and RSX.\n"; -+} -+ -+const char *usage(void) -+{ -+ return -+ "\n" -+ "About: Add VKX and RSX columns.\n" -+ "Usage: bcftools +add-variantkey [General Options] \n" -+ "Options:\n" -+ " run \"bcftools plugin\" for a list of common options\n" -+ "\n" -+ "Example:\n" -+ " bcftools +add-variantkey in.vcf\n" -+ "\n"; -+} -+ -+int init(int argc, char **argv, bcf_hdr_t *in, bcf_hdr_t *out) -+{ -+ in_hdr = in; -+ out_hdr = out; -+ bcf_hdr_append(out_hdr, "##INFO="); -+ bcf_hdr_append(out_hdr, "##INFO="); -+ return 0; -+} -+ -+bcf1_t *process(bcf1_t *rec) -+{ -+ uint64_t vk = variantkey( -+ in_hdr->id[BCF_DT_CTG][rec->rid].key, -+ strlen(in_hdr->id[BCF_DT_CTG][rec->rid].key), -+ rec->pos, -+ rec->d.allele[0], -+ strlen(rec->d.allele[0]), -+ rec->d.allele[1], -+ strlen(rec->d.allele[1])); -+ char vs[17]; -+ variantkey_hex(vk, vs); -+ bcf_update_info_string(out_hdr, rec, "VKX", vs); -+ char rsid[9]; -+ char *ptr = rec->d.id; -+ ptr += 2; // remove 'rs' -+ sprintf(rsid, "%08" PRIx32, (uint32_t)strtoul(ptr, NULL, 10)); -+ bcf_update_info_string(out_hdr, rec, "RSX", rsid); -+ return rec; -+} -+ -+void destroy(void) -+{ -+} ---- python-pysam.orig/bcftools/plugins/af-dist.c -+++ python-pysam/bcftools/plugins/af-dist.c -@@ -170,12 +170,12 @@ - if ( dosage==1 ) - { - args->prob_dist[iRA]++; -- if ( list_RA ) printf("GT\t%s\t%d\t%s\t1\t%f\n",chr,rec->pos+1,args->hdr->samples[i],pRA); -+ if ( list_RA ) printf("GT\t%s\t%"PRId64"\t%s\t1\t%f\n",chr,(int64_t) rec->pos+1,args->hdr->samples[i],pRA); - } - else if ( dosage==2 ) - { - args->prob_dist[iAA]++; -- if ( list_AA ) printf("GT\t%s\t%d\t%s\t2\t%f\n",chr,rec->pos+1,args->hdr->samples[i],pAA); -+ if ( list_AA ) printf("GT\t%s\t%"PRId64"\t%s\t2\t%f\n",chr,(int64_t) rec->pos+1,args->hdr->samples[i],pAA); - } - } - ---- 
python-pysam.orig/bcftools/plugins/af-dist.c.pysam.c -+++ python-pysam/bcftools/plugins/af-dist.c.pysam.c -@@ -172,12 +172,12 @@ - if ( dosage==1 ) - { - args->prob_dist[iRA]++; -- if ( list_RA ) fprintf(bcftools_stdout, "GT\t%s\t%d\t%s\t1\t%f\n",chr,rec->pos+1,args->hdr->samples[i],pRA); -+ if ( list_RA ) fprintf(bcftools_stdout, "GT\t%s\t%"PRId64"\t%s\t1\t%f\n",chr,(int64_t) rec->pos+1,args->hdr->samples[i],pRA); - } - else if ( dosage==2 ) - { - args->prob_dist[iAA]++; -- if ( list_AA ) fprintf(bcftools_stdout, "GT\t%s\t%d\t%s\t2\t%f\n",chr,rec->pos+1,args->hdr->samples[i],pAA); -+ if ( list_AA ) fprintf(bcftools_stdout, "GT\t%s\t%"PRId64"\t%s\t2\t%f\n",chr,(int64_t) rec->pos+1,args->hdr->samples[i],pAA); - } - } - ---- /dev/null -+++ python-pysam/bcftools/plugins/allele-length.c -@@ -0,0 +1,113 @@ -+/* plugins/allele-length.c -- Calculate stats about the length of alleles -+ -+ Copyright (C) 2017-2018 GENOMICS plc. -+ -+ Author: Nicola Asuni -+ -+Permission is hereby granted, free of charge, to any person obtaining a copy -+of this software and associated documentation files (the "Software"), to deal -+in the Software without restriction, including without limitation the rights -+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -+copies of the Software, and to permit persons to whom the Software is -+furnished to do so, subject to the following conditions: -+ -+The above copyright notice and this permission notice shall be included in -+all copies or substantial portions of the Software. -+ -+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL -+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -+DEALINGS IN THE SOFTWARE. */ -+ -+#include -+#include -+#include -+#include -+ -+#define MAXLEN 512 -+ -+static uint64_t numvar; -+static uint64_t numxvar; -+static uint64_t reflen[MAXLEN]; -+static uint64_t altlen[MAXLEN]; -+static uint64_t refaltlen[MAXLEN]; -+static uint64_t xrefaltlen[MAXLEN]; -+ -+const char *about(void) -+{ -+ return "Count the frequency of the length of REF, ALT and REF+ALT\n"; -+} -+ -+const char *usage(void) -+{ -+ return -+ "\n" -+ "About: Count the frequency of the length of alleles.\n" -+ "Usage: bcftools +allele-length [General Options] \n" -+ "Options:\n" -+ " run \"bcftools plugin\" for a list of common options\n" -+ "\n" -+ "Example:\n" -+ " bcftools +allele-length in.vcf\n" -+ "\n"; -+} -+ -+// return 0 if the string contains characters other than standard ACGT base letters -+int contain_non_base(const char *str) -+{ -+ int c; -+ while ((c = *str++)) -+ { -+ if ((c != 'A') && (c != 'a') && (c != 'C') && (c != 'c') && (c != 'G') && (c != 'g') && (c != 'T') && (c != 't')) -+ { -+ return 1; -+ } -+ } -+ return 0; -+} -+ -+// Called once at startup, allows to initialize local variables. -+// Return 1 to suppress VCF/BCF header from printing, 0 otherwise. -+int init(int argc, char **argv, bcf_hdr_t *in, bcf_hdr_t *out) -+{ -+ numvar = 0; -+ int i = 0; -+ for(i = 0; i < MAXLEN; i++) { -+ reflen[i] = 0; -+ altlen[i] = 0; -+ refaltlen[i] = 0; -+ xrefaltlen[i] = 0; -+ } -+ return 1; -+} -+ -+// Called for each VCF record. Return rec to output the line or NULL to suppress output. 
-+bcf1_t *process(bcf1_t *rec) -+{ -+ int rl = strlen(rec->d.allele[0]); -+ int al = strlen(rec->d.allele[1]); -+ reflen[rl] += 1; -+ altlen[al] += 1; -+ refaltlen[(rl + al)] += 1; -+ if ((contain_non_base(rec->d.allele[0])) || (contain_non_base(rec->d.allele[1]))) -+ { -+ xrefaltlen[(rl + al)] += 1; -+ numxvar++; -+ } -+ numvar++; -+ return NULL; -+} -+ -+// Print final output -+void destroy(void) -+{ -+ int i = 0; -+ printf("LENGTH\tREF\tALT\tREF+ALT\tREF+ALT WITH NON-BASE NUCLEOTIDES\n"); -+ for(i = 0; i < MAXLEN; i++) { -+ printf("%d\t%"PRIu64"\t%"PRIu64"\t%"PRIu64"\t%"PRIu64"\n", i, reflen[i], altlen[i], refaltlen[i], xrefaltlen[i]); -+ } -+ printf("\t\t\t%"PRIu64"\t%"PRIu64"\n", numvar, numxvar); -+} ---- /dev/null -+++ python-pysam/bcftools/plugins/allele-length.c.pysam.c -@@ -0,0 +1,115 @@ -+#include "bcftools.pysam.h" -+ -+/* plugins/allele-length.c -- Calculate stats about the length of alleles -+ -+ Copyright (C) 2017-2018 GENOMICS plc. -+ -+ Author: Nicola Asuni -+ -+Permission is hereby granted, free of charge, to any person obtaining a copy -+of this software and associated documentation files (the "Software"), to deal -+in the Software without restriction, including without limitation the rights -+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -+copies of the Software, and to permit persons to whom the Software is -+furnished to do so, subject to the following conditions: -+ -+The above copyright notice and this permission notice shall be included in -+all copies or substantial portions of the Software. -+ -+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL -+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -+DEALINGS IN THE SOFTWARE. */ -+ -+#include -+#include -+#include -+#include -+ -+#define MAXLEN 512 -+ -+static uint64_t numvar; -+static uint64_t numxvar; -+static uint64_t reflen[MAXLEN]; -+static uint64_t altlen[MAXLEN]; -+static uint64_t refaltlen[MAXLEN]; -+static uint64_t xrefaltlen[MAXLEN]; -+ -+const char *about(void) -+{ -+ return "Count the frequency of the length of REF, ALT and REF+ALT\n"; -+} -+ -+const char *usage(void) -+{ -+ return -+ "\n" -+ "About: Count the frequency of the length of alleles.\n" -+ "Usage: bcftools +allele-length [General Options] \n" -+ "Options:\n" -+ " run \"bcftools plugin\" for a list of common options\n" -+ "\n" -+ "Example:\n" -+ " bcftools +allele-length in.vcf\n" -+ "\n"; -+} -+ -+// return 0 if the string contains characters other than standard ACGT base letters -+int contain_non_base(const char *str) -+{ -+ int c; -+ while ((c = *str++)) -+ { -+ if ((c != 'A') && (c != 'a') && (c != 'C') && (c != 'c') && (c != 'G') && (c != 'g') && (c != 'T') && (c != 't')) -+ { -+ return 1; -+ } -+ } -+ return 0; -+} -+ -+// Called once at startup, allows to initialize local variables. -+// Return 1 to suppress VCF/BCF header from printing, 0 otherwise. -+int init(int argc, char **argv, bcf_hdr_t *in, bcf_hdr_t *out) -+{ -+ numvar = 0; -+ int i = 0; -+ for(i = 0; i < MAXLEN; i++) { -+ reflen[i] = 0; -+ altlen[i] = 0; -+ refaltlen[i] = 0; -+ xrefaltlen[i] = 0; -+ } -+ return 1; -+} -+ -+// Called for each VCF record. Return rec to output the line or NULL to suppress output. 
-+bcf1_t *process(bcf1_t *rec) -+{ -+ int rl = strlen(rec->d.allele[0]); -+ int al = strlen(rec->d.allele[1]); -+ reflen[rl] += 1; -+ altlen[al] += 1; -+ refaltlen[(rl + al)] += 1; -+ if ((contain_non_base(rec->d.allele[0])) || (contain_non_base(rec->d.allele[1]))) -+ { -+ xrefaltlen[(rl + al)] += 1; -+ numxvar++; -+ } -+ numvar++; -+ return NULL; -+} -+ -+// Print final output -+void destroy(void) -+{ -+ int i = 0; -+ fprintf(bcftools_stdout, "LENGTH\tREF\tALT\tREF+ALT\tREF+ALT WITH NON-BASE NUCLEOTIDES\n"); -+ for(i = 0; i < MAXLEN; i++) { -+ fprintf(bcftools_stdout, "%d\t%"PRIu64"\t%"PRIu64"\t%"PRIu64"\t%"PRIu64"\n", i, reflen[i], altlen[i], refaltlen[i], xrefaltlen[i]); -+ } -+ fprintf(bcftools_stdout, "\t\t\t%"PRIu64"\t%"PRIu64"\n", numvar, numxvar); -+} ---- python-pysam.orig/bcftools/plugins/check-ploidy.c -+++ python-pysam/bcftools/plugins/check-ploidy.c -@@ -101,7 +101,7 @@ - if ( !fmt_gt ) return NULL; // no GT tag - - if ( args->ndat != rec->n_sample ) -- error("Incorrect number of samples at %s:%d .. found %d, expected %d\n",bcf_seqname(args->hdr,rec),rec->pos+1,rec->n_sample,args->ndat); -+ error("Incorrect number of samples at %s:%"PRId64" .. 
found %d, expected %d\n",bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1,rec->n_sample,args->ndat); - - if ( args->rid!=rec->rid && args->rid!=-1 ) - { -@@ -143,7 +143,7 @@ - case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_vector_end); break; - case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_vector_end); break; - case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_vector_end); break; -- default: error("The GT type is not recognised: %d at %s:%d\n",fmt_gt->type, bcf_seqname(args->hdr,rec),rec->pos+1); break; -+ default: error("The GT type is not recognised: %d at %s:%"PRId64"\n",fmt_gt->type, bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); break; - } - #undef BRANCH_INT - ---- python-pysam.orig/bcftools/plugins/check-ploidy.c.pysam.c -+++ python-pysam/bcftools/plugins/check-ploidy.c.pysam.c -@@ -103,7 +103,7 @@ - if ( !fmt_gt ) return NULL; // no GT tag - - if ( args->ndat != rec->n_sample ) -- error("Incorrect number of samples at %s:%d .. found %d, expected %d\n",bcf_seqname(args->hdr,rec),rec->pos+1,rec->n_sample,args->ndat); -+ error("Incorrect number of samples at %s:%"PRId64" .. 
found %d, expected %d\n",bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1,rec->n_sample,args->ndat); - - if ( args->rid!=rec->rid && args->rid!=-1 ) - { -@@ -145,7 +145,7 @@ - case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_vector_end); break; - case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_vector_end); break; - case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_vector_end); break; -- default: error("The GT type is not recognised: %d at %s:%d\n",fmt_gt->type, bcf_seqname(args->hdr,rec),rec->pos+1); break; -+ default: error("The GT type is not recognised: %d at %s:%"PRId64"\n",fmt_gt->type, bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); break; - } - #undef BRANCH_INT - ---- python-pysam.orig/bcftools/plugins/check-sparsity.c -+++ python-pysam/bcftools/plugins/check-sparsity.c -@@ -129,7 +129,7 @@ - if ( args->itr ) hts_itr_destroy(args->itr); - if ( args->tbx ) tbx_destroy(args->tbx); - if ( args->idx ) hts_idx_destroy(args->idx); -- hts_close(args->fp); -+ if ( hts_close(args->fp)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->fname); - } - - static void report(args_t *args, const char *reg) -@@ -247,7 +247,7 @@ - args->min_sites = strtol(optarg,&tmp,10); - if ( *tmp ) error("Could not parse: -n %s\n", optarg); - break; -- case 'R': args->region_is_file = 1; -+ case 'R': args->region_is_file = 1; // fall-through - case 'r': args->region = optarg; break; - case 'h': - case '?': ---- python-pysam.orig/bcftools/plugins/check-sparsity.c.pysam.c -+++ python-pysam/bcftools/plugins/check-sparsity.c.pysam.c -@@ -131,7 +131,7 @@ - if ( args->itr ) hts_itr_destroy(args->itr); - if ( args->tbx ) tbx_destroy(args->tbx); - if ( args->idx ) hts_idx_destroy(args->idx); -- hts_close(args->fp); -+ if ( hts_close(args->fp)!=0 ) error("[%s] Error: close failed .. 
%s\n", __func__,args->fname); - } - - static void report(args_t *args, const char *reg) -@@ -249,7 +249,7 @@ - args->min_sites = strtol(optarg,&tmp,10); - if ( *tmp ) error("Could not parse: -n %s\n", optarg); - break; -- case 'R': args->region_is_file = 1; -+ case 'R': args->region_is_file = 1; // fall-through - case 'r': args->region = optarg; break; - case 'h': - case '?': ---- python-pysam.orig/bcftools/plugins/contrast.c -+++ python-pysam/bcftools/plugins/contrast.c -@@ -27,12 +27,15 @@ - #include - #include - #include -+#include - #include - #include // for isatty -+#include - #include - #include - #include - #include -+#include - #include - #include "bcftools.h" - #include "filter.h" -@@ -42,21 +45,29 @@ - #define FLT_INCLUDE 1 - #define FLT_EXCLUDE 2 - -+#define PRINT_PASSOC (1<<0) -+#define PRINT_FASSOC (1<<1) -+#define PRINT_NASSOC (1<<2) -+#define PRINT_NOVELAL (1<<3) -+#define PRINT_NOVELGT (1<<4) -+ - typedef struct - { -- int argc, filter_logic, regions_is_file, targets_is_file, output_type; -- char **argv, *output_fname, *fname, *regions, *targets, *filter_str; -- char *bg_samples_str, *novel_samples_str; -- int *bg_smpl, *novel_smpl, nbg_smpl, nnovel_smpl; -+ int argc, filter_logic, regions_is_file, targets_is_file, output_type, force_samples; -+ uint32_t annots; -+ char **argv, *output_fname, *fname, *regions, *targets, *filter_str, *annots_str; -+ char *control_samples_str, *case_samples_str, *max_AC_str; -+ int *control_smpl, *case_smpl, ncontrol_smpl, ncase_smpl; - filter_t *filter; - bcf_srs_t *sr; - bcf_hdr_t *hdr, *hdr_out; - htsFile *out_fh; - int32_t *gts; - int mgts; -- uint32_t *bg_gts; -- int nbg_gts, mbg_gts, ntotal, nskipped, ntested, nnovel_al, nnovel_gt; -- kstring_t novel_als_smpl, novel_gts_smpl; -+ uint32_t *control_gts; -+ int ncontrol_gts, mcontrol_gts, ntotal, nskipped, ntested, ncase_al, ncase_gt; -+ kstring_t case_als_smpl, case_gts_smpl; -+ int max_AC, nals[4]; // nals: number of control-ref, control-alt, case-ref and 
case-alt alleles in the region - } - args_t; - -@@ -71,30 +82,110 @@ - { - return - "\n" -- "About: Finds novel alleles and genotypes in two groups of samples. Adds\n" -- " an annotation which lists samples with a novel allele (INFO/NOVELAL)\n" -- " or a novel genotype (INFO/NOVELGT)\n" -+ "About: Runs a basic association test, per-site or in a region, and checks for novel alleles and\n" -+ " genotypes in two groups of samples. Adds the following INFO annotations:\n" -+ " - PASSOC .. Fisher's exact test probability of genotypic association (REF vs non-REF allele)\n" -+ " - FASSOC .. proportion of non-REF allele in controls and cases\n" -+ " - NASSOC .. number of control-ref, control-alt, case-ref and case-alt alleles\n" -+ " - NOVELAL .. lists samples with a novel allele not observed in the control group\n" -+ " - NOVELGT .. lists samples with a novel genotype not observed in the control group\n" - "Usage: bcftools +contrast [Plugin Options]\n" - "Plugin options:\n" -- " -0, --bg-samples list of background samples\n" -- " -1, --novel-samples list of samples where novel allele or genotype are expected\n" -- " -e, --exclude EXPR exclude sites and samples for which the expression is true\n" -- " -i, --include EXPR include sites and samples for which the expression is true\n" -- " -o, --output FILE output file name [stdout]\n" -- " -O, --output-type b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n" -- " -r, --regions REG restrict to comma-separated list of regions\n" -- " -R, --regions-file FILE restrict to regions listed in a file\n" -- " -t, --targets REG similar to -r but streams rather than index-jumps\n" -- " -T, --targets-file FILE similar to -R but streams rather than index-jumps\n" -+ " -a, --annots list of annotations to output [PASSOC,FASSOC,NOVELAL]\n" -+ " -0, --control-samples file or comma-separated list of control (background) samples\n" -+ " -1, --case-samples file or comma-separated list of samples where novel allele 
or genotype is expected\n" -+ " -e, --exclude EXPR exclude sites and samples for which the expression is true\n" -+ " -f, --max-allele-freq NUM calculate enrichment of rare alleles. Floating point numbers between 0 and 1 are\n" -+ " interpreted as ALT allele frequencies, integers as ALT allele counts\n" -+ " --force-samples continue even if some samples listed in the -0,-1 files are missing from the VCF\n" -+ " -i, --include EXPR include sites and samples for which the expression is true\n" -+ " -o, --output FILE output file name [stdout]\n" -+ " -O, --output-type b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n" -+ " -r, --regions REG restrict to comma-separated list of regions\n" -+ " -R, --regions-file FILE restrict to regions listed in a file\n" -+ " -t, --targets REG similar to -r but streams rather than index-jumps\n" -+ " -T, --targets-file FILE similar to -R but streams rather than index-jumps\n" - "\n" - "Example:\n" - " # Test if any of the samples a,b is different from the samples c,d,e\n" - " bcftools +contrast -0 c,d,e -1 a,b file.bcf\n" -+ "\n" -+ " # Same as above, but read samples from a file. In case of a name collision, the sample name\n" -+ " # has precedence: the existence of a file with a list of samples is not checked unless no such\n" -+ " # sample exists in the VCF. Use a full path (e.g. 
\"./string\" instead of \"string\") to avoid\n" -+ " # name clashes\n" -+ " bcftools +contrast -0 samples0.txt -1 samples1.txt file.bcf\n" -+ "\n" -+ " # The same as above but checks for enrichment of rare alleles, AF<0.001 in this example, in a region\n" -+ " bcftools +contrast -r 20:1000-2000 -f 0.001 -0 samples0.txt -1 samples1.txt file.bcf\n" - "\n"; - } - -+static int cmp_int(const void *a, const void *b) -+{ -+ if ( *((int*)a) < *((int*)b) ) return -1; -+ if ( *((int*)a) > *((int*)b) ) return -1; -+ return 0; -+} -+static void read_sample_list_or_file(bcf_hdr_t *hdr, const char *str, int **smpl, int *nsmpl, int force_samples) -+{ -+ char **str_list = NULL; -+ int i,j, *list, nlist = 0, is_file, nskipped = 0; -+ -+ for (is_file=0; is_file<=1; is_file++) -+ { -+ if ( str_list ) -+ { -+ for (i=0; i= 0 ) continue; -+ if ( is_file ) -+ { -+ if ( !force_samples ) error("The sample \"%s\" is not present in the VCF. Use --force-samples to proceed anyway.\n", str_list[i]); -+ j--; -+ nskipped++; -+ continue; -+ } -+ break; -+ } -+ if ( i==nlist ) break; -+ } -+ for (i=0; i1?"s":"",nskipped,str,nskipped>1?"are":"is"); -+ free(str_list); -+ qsort(list,nlist,sizeof(*list),cmp_int); -+ *smpl = list; -+ *nsmpl = nlist; -+} -+ - static void init_data(args_t *args) - { -+ int ntmp, i; -+ char **tmp = hts_readlist(args->annots_str, 0, &ntmp); -+ for (i=0; iannots |= PRINT_PASSOC; -+ else if ( !strcasecmp("FASSOC",tmp[i]) ) args->annots |= PRINT_FASSOC; -+ else if ( !strcasecmp("NASSOC",tmp[i]) ) args->annots |= PRINT_NASSOC; -+ else if ( !strcasecmp("NOVELAL",tmp[i]) ) args->annots |= PRINT_NOVELAL; -+ else if ( !strcasecmp("NOVELGT",tmp[i]) ) args->annots |= PRINT_NOVELGT; -+ else error("The annotation is not recognised: %s\n", tmp[i]); -+ free(tmp[i]); -+ } -+ free(tmp); -+ - args->sr = bcf_sr_init(); - if ( args->regions ) - { -@@ -105,47 +196,51 @@ - if ( !bcf_sr_add_reader(args->sr,args->fname) ) error("Error: %s\n", bcf_sr_strerror(args->sr->errnum)); - args->hdr = 
bcf_sr_get_header(args->sr,0); - args->hdr_out = bcf_hdr_dup(args->hdr); -- bcf_hdr_append(args->hdr_out, "##INFO="); -- bcf_hdr_append(args->hdr_out, "##INFO="); -+ if ( args->annots & PRINT_PASSOC ) -+ bcf_hdr_append(args->hdr_out, "##INFO="); -+ if ( args->annots & PRINT_FASSOC ) -+ bcf_hdr_append(args->hdr_out, "##INFO="); -+ if ( args->annots & PRINT_NASSOC ) -+ bcf_hdr_append(args->hdr_out, "##INFO="); -+ if ( args->annots & PRINT_NOVELAL ) -+ bcf_hdr_append(args->hdr_out, "##INFO="); -+ if ( args->annots & PRINT_NOVELGT ) -+ bcf_hdr_append(args->hdr_out, "##INFO="); - - if ( args->filter_str ) - args->filter = filter_init(args->hdr, args->filter_str); - -- int i; -- char **smpl = hts_readlist(args->bg_samples_str, 0, &args->nbg_smpl); -- args->bg_smpl = (int*) malloc(sizeof(int)*args->nbg_smpl); -- for (i=0; inbg_smpl; i++) -- { -- args->bg_smpl[i] = bcf_hdr_id2int(args->hdr, BCF_DT_SAMPLE, smpl[i]); -- if ( args->bg_smpl[i]<0 ) error("The sample not present in the VCF: \"%s\"\n", smpl[i]); -- free(smpl[i]); -- } -- free(smpl); -- -- smpl = hts_readlist(args->novel_samples_str, 0, &args->nnovel_smpl); -- args->novel_smpl = (int*) malloc(sizeof(int)*args->nnovel_smpl); -- for (i=0; innovel_smpl; i++) -- { -- args->novel_smpl[i] = bcf_hdr_id2int(args->hdr, BCF_DT_SAMPLE, smpl[i]); -- if ( args->novel_smpl[i]<0 ) error("The sample not present in the VCF: \"%s\"\n", smpl[i]); -- free(smpl[i]); -- } -- free(smpl); -+ read_sample_list_or_file(args->hdr, args->control_samples_str, &args->control_smpl, &args->ncontrol_smpl, args->force_samples); -+ read_sample_list_or_file(args->hdr, args->case_samples_str, &args->case_smpl, &args->ncase_smpl, args->force_samples); - - args->out_fh = hts_open(args->output_fname,hts_bcf_wmode(args->output_type)); - if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno)); -- bcf_hdr_write(args->out_fh, args->hdr_out); -+ if ( bcf_hdr_write(args->out_fh, args->hdr_out)!=0 ) error("[%s] 
Error: cannot write to %s\n", __func__,args->output_fname); -+ -+ if ( args->max_AC_str ) -+ { -+ char *tmp; -+ args->max_AC = strtol(args->max_AC_str, &tmp, 10); -+ if ( tmp==args->max_AC_str || *tmp ) -+ { -+ double val = strtod(args->max_AC_str, &tmp); -+ if ( tmp==args->max_AC_str || *tmp ) error("Could not parse the argument: -f, --max-allele-freq %s\n", args->max_AC_str); -+ if ( val<0 || val>1 ) error("Expected integer or float from the range [0,1]: -f, --max-allele-freq %s\n", args->max_AC_str); -+ args->max_AC = val * bcf_hdr_nsamples(args->hdr); -+ if ( !args->max_AC ) args->max_AC = 1; -+ } -+ } - } - static void destroy_data(args_t *args) - { - bcf_hdr_destroy(args->hdr_out); -- hts_close(args->out_fh); -- free(args->novel_als_smpl.s); -- free(args->novel_gts_smpl.s); -+ if ( hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname); -+ free(args->case_als_smpl.s); -+ free(args->case_gts_smpl.s); - free(args->gts); -- free(args->bg_gts); -- free(args->bg_smpl); -- free(args->novel_smpl); -+ free(args->control_gts); -+ free(args->control_smpl); -+ free(args->case_smpl); - if ( args->filter ) filter_destroy(args->filter); - bcf_sr_destroy(args->sr); - free(args); -@@ -191,13 +286,14 @@ - ngts /= rec->n_sample; - if ( ngts>2 ) error("todo: ploidy=%d\n", ngts); - -- args->nbg_gts = 0; -- uint32_t bg_als = 0; -+ args->ncontrol_gts = 0; -+ uint32_t control_als = 0; -+ int32_t nals[4] = {0,0,0,0}; // ctrl-ref, ctrl-alt, case-ref, case-alt - int i,j; -- for (i=0; inbg_smpl; i++) -+ for (i=0; incontrol_smpl; i++) - { - uint32_t gt = 0; -- int32_t *ptr = args->gts + args->bg_smpl[i]*ngts; -+ int32_t *ptr = args->gts + args->control_smpl[i]*ngts; - for (j=0; j32) at %s:%d, skipping. 
(todo?)\n", bcf_seqname(args->hdr,rec),rec->pos+1); -+ fprintf(stderr,"Too many alleles (>32) at %s:%"PRId64", skipping the site.\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); - warned = 1; - } - args->nskipped++; - return -1; - } -- bg_als |= 1<bg_gts, &args->nbg_gts, &args->mbg_gts); -+ if ( args->annots & PRINT_NOVELGT ) -+ binary_insert(gt, &args->control_gts, &args->ncontrol_gts, &args->mcontrol_gts); - } -- if ( !bg_als ) -+ if ( !control_als ) - { - // all are missing - args->nskipped++; - return -1; - } - -- args->novel_als_smpl.l = 0; -- args->novel_gts_smpl.l = 0; -+ args->case_als_smpl.l = 0; -+ args->case_gts_smpl.l = 0; - - int has_gt = 0; -- for (i=0; innovel_smpl; i++) -+ for (i=0; incase_smpl; i++) - { -- int novel_al = 0; -+ int case_al = 0; - uint32_t gt = 0; -- int32_t *ptr = args->gts + args->novel_smpl[i]*ngts; -+ int32_t *ptr = args->gts + args->case_smpl[i]*ngts; - for (j=0; j32) at %s:%d, skipping. (todo?)\n", bcf_seqname(args->hdr,rec),rec->pos+1); -+ fprintf(stderr,"Too many alleles (>32) at %s:%"PRId64", skipping. 
(todo?)\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); - warned = 1; - } - args->nskipped++; - return -1; - } -- if ( !(bg_als & (1<hdr->samples[ args->novel_smpl[i] ]; -- if ( novel_al ) -+ char *smpl = args->hdr->samples[ args->case_smpl[i] ]; -+ if ( case_al ) - { -- if ( args->novel_als_smpl.l ) kputc(',', &args->novel_als_smpl); -- kputs(smpl, &args->novel_als_smpl); -+ if ( args->annots & PRINT_NOVELAL ) -+ { -+ if ( args->case_als_smpl.l ) kputc(',', &args->case_als_smpl); -+ kputs(smpl, &args->case_als_smpl); -+ } - } -- else if ( !binary_search(gt, args->bg_gts, args->nbg_gts) ) -+ else if ( (args->annots & PRINT_NOVELGT) && !binary_search(gt, args->control_gts, args->ncontrol_gts) ) - { -- if ( args->novel_gts_smpl.l ) kputc(',', &args->novel_gts_smpl); -- kputs(smpl, &args->novel_gts_smpl); -+ if ( args->case_gts_smpl.l ) kputc(',', &args->case_gts_smpl); -+ kputs(smpl, &args->case_gts_smpl); - } - } - if ( !has_gt ) -@@ -273,15 +377,54 @@ - args->nskipped++; - return -1; - } -- if ( args->novel_als_smpl.l ) -+ -+ if ( args->max_AC ) - { -- bcf_update_info_string(args->hdr_out, rec, "NOVELAL", args->novel_als_smpl.s); -- args->nnovel_al++; -+ if ( nals[0]+nals[2] > nals[1]+nals[3] ) -+ { -+ if ( nals[1]+nals[3] <= args->max_AC ) -+ for (i=0; i<4; i++) args->nals[i] += nals[i]; -+ } -+ else -+ { -+ if ( nals[0]+nals[2] <= args->max_AC ) -+ { -+ args->nals[0] += nals[1]; -+ args->nals[1] += nals[0]; -+ args->nals[2] += nals[3]; -+ args->nals[3] += nals[2]; -+ } -+ } -+ } -+ -+ float vals[2]; -+ if ( args->annots & PRINT_PASSOC ) -+ { -+ double left, right, fisher; -+ kt_fisher_exact(nals[0],nals[1],nals[2],nals[3], &left,&right,&fisher); -+ vals[0] = fisher; -+ bcf_update_info_float(args->hdr_out, rec, "PASSOC", vals, 1); - } -- if ( args->novel_gts_smpl.l ) -+ if ( args->annots & PRINT_FASSOC ) - { -- bcf_update_info_string(args->hdr_out, rec, "NOVELGT", args->novel_gts_smpl.s); -- args->nnovel_gt++; -+ if ( nals[0]+nals[1] ) vals[0] = 
(float)nals[1]/(nals[0]+nals[1]); -+ else bcf_float_set_missing(vals[0]); -+ if ( nals[2]+nals[3] ) vals[1] = (float)nals[3]/(nals[2]+nals[3]); -+ else bcf_float_set_missing(vals[1]); -+ bcf_update_info_float(args->hdr_out, rec, "FASSOC", vals, 2); -+ } -+ if ( args->annots & PRINT_NASSOC ) -+ bcf_update_info_int32(args->hdr_out, rec, "NASSOC", nals, 4); -+ -+ if ( args->case_als_smpl.l ) -+ { -+ bcf_update_info_string(args->hdr_out, rec, "NOVELAL", args->case_als_smpl.s); -+ args->ncase_al++; -+ } -+ if ( args->case_gts_smpl.l ) -+ { -+ bcf_update_info_string(args->hdr_out, rec, "NOVELGT", args->case_gts_smpl.s); -+ args->ncase_gt++; - } - args->ntested++; - return 0; -@@ -292,10 +435,16 @@ - args_t *args = (args_t*) calloc(1,sizeof(args_t)); - args->argc = argc; args->argv = argv; - args->output_fname = "-"; -+ args->annots_str = "PASSOC,FASSOC"; - static struct option loptions[] = - { -- {"bg-samples",required_argument,0,'0'}, -- {"novel-samples",required_argument,0,'1'}, -+ {"max-allele-freq",required_argument,0,'f'}, -+ {"annots",required_argument,0,'a'}, -+ {"force-samples",no_argument,0,1}, -+ {"bg-samples",required_argument,0,'0'}, // renamed to --control-samples, leaving it in for backward compatibility -+ {"control-samples",required_argument,0,'0'}, -+ {"novel-samples",required_argument,0,'1'}, // renamed to --case-samples, leaving it in for backward compatibility -+ {"case-samples",required_argument,0,'1'}, - {"include",required_argument,0,'i'}, - {"exclude",required_argument,0,'e'}, - {"output",required_argument,NULL,'o'}, -@@ -307,12 +456,15 @@ - {NULL,0,NULL,0} - }; - int c; -- while ((c = getopt_long(argc, argv, "O:o:i:e:r:R:t:T:0:1:",loptions,NULL)) >= 0) -+ while ((c = getopt_long(argc, argv, "O:o:i:e:r:R:t:T:0:1:a:f:",loptions,NULL)) >= 0) - { - switch (c) - { -- case '0': args->bg_samples_str = optarg; break; -- case '1': args->novel_samples_str = optarg; break; -+ case 1 : args->force_samples = 1; break; -+ case 'f': args->max_AC_str = optarg; 
break; -+ case 'a': args->annots_str = optarg; break; -+ case '0': args->control_samples_str = optarg; break; -+ case '1': args->case_samples_str = optarg; break; - case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; - case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; - case 't': args->targets = optarg; break; -@@ -354,10 +506,18 @@ - if ( !pass ) continue; - } - process_record(args, rec); -- bcf_write(args->out_fh, args->hdr_out, rec); -+ if ( bcf_write(args->out_fh, args->hdr_out, rec)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); - } - -- fprintf(stderr,"Total/processed/skipped/novel_allele/novel_gt:\t%d\t%d\t%d\t%d\t%d\n", args->ntotal, args->ntested, args->nskipped, args->nnovel_al, args->nnovel_gt); -+ fprintf(stderr,"Total/processed/skipped/case_allele/case_gt:\t%d\t%d\t%d\t%d\t%d\n", args->ntotal, args->ntested, args->nskipped, args->ncase_al, args->ncase_gt); -+ if ( args->max_AC ) -+ { -+ double val1, val2, fisher; -+ kt_fisher_exact(args->nals[0],args->nals[1],args->nals[2],args->nals[3], &val1,&val2,&fisher); -+ val1 = args->nals[0]+args->nals[1] ? (float)args->nals[1]/(args->nals[0]+args->nals[1]) : 0; -+ val2 = args->nals[2]+args->nals[3] ? 
(float)args->nals[3]/(args->nals[2]+args->nals[3]) : 0; -+ fprintf(stderr,"max_AC/PASSOC/FASSOC/NASSOC:\t%d\t%e\t%f,%f\t%d,%d,%d,%d\n",args->max_AC,fisher,val1,val2,args->nals[0],args->nals[1],args->nals[2],args->nals[3]); -+ } - destroy_data(args); - - return 0; ---- python-pysam.orig/bcftools/plugins/contrast.c.pysam.c -+++ python-pysam/bcftools/plugins/contrast.c.pysam.c -@@ -29,12 +29,15 @@ - #include - #include - #include -+#include - #include - #include // for isatty -+#include - #include - #include - #include - #include -+#include - #include - #include "bcftools.h" - #include "filter.h" -@@ -44,21 +47,29 @@ - #define FLT_INCLUDE 1 - #define FLT_EXCLUDE 2 - -+#define PRINT_PASSOC (1<<0) -+#define PRINT_FASSOC (1<<1) -+#define PRINT_NASSOC (1<<2) -+#define PRINT_NOVELAL (1<<3) -+#define PRINT_NOVELGT (1<<4) -+ - typedef struct - { -- int argc, filter_logic, regions_is_file, targets_is_file, output_type; -- char **argv, *output_fname, *fname, *regions, *targets, *filter_str; -- char *bg_samples_str, *novel_samples_str; -- int *bg_smpl, *novel_smpl, nbg_smpl, nnovel_smpl; -+ int argc, filter_logic, regions_is_file, targets_is_file, output_type, force_samples; -+ uint32_t annots; -+ char **argv, *output_fname, *fname, *regions, *targets, *filter_str, *annots_str; -+ char *control_samples_str, *case_samples_str, *max_AC_str; -+ int *control_smpl, *case_smpl, ncontrol_smpl, ncase_smpl; - filter_t *filter; - bcf_srs_t *sr; - bcf_hdr_t *hdr, *hdr_out; - htsFile *out_fh; - int32_t *gts; - int mgts; -- uint32_t *bg_gts; -- int nbg_gts, mbg_gts, ntotal, nskipped, ntested, nnovel_al, nnovel_gt; -- kstring_t novel_als_smpl, novel_gts_smpl; -+ uint32_t *control_gts; -+ int ncontrol_gts, mcontrol_gts, ntotal, nskipped, ntested, ncase_al, ncase_gt; -+ kstring_t case_als_smpl, case_gts_smpl; -+ int max_AC, nals[4]; // nals: number of control-ref, control-alt, case-ref and case-alt alleles in the region - } - args_t; - -@@ -73,30 +84,110 @@ - { - return - "\n" -- "About: Finds 
novel alleles and genotypes in two groups of samples. Adds\n" -- " an annotation which lists samples with a novel allele (INFO/NOVELAL)\n" -- " or a novel genotype (INFO/NOVELGT)\n" -+ "About: Runs a basic association test, per-site or in a region, and checks for novel alleles and\n" -+ " genotypes in two groups of samples. Adds the following INFO annotations:\n" -+ " - PASSOC .. Fisher's exact test probability of genotypic association (REF vs non-REF allele)\n" -+ " - FASSOC .. proportion of non-REF allele in controls and cases\n" -+ " - NASSOC .. number of control-ref, control-alt, case-ref and case-alt alleles\n" -+ " - NOVELAL .. lists samples with a novel allele not observed in the control group\n" -+ " - NOVELGT .. lists samples with a novel genotype not observed in the control group\n" - "Usage: bcftools +contrast [Plugin Options]\n" - "Plugin options:\n" -- " -0, --bg-samples list of background samples\n" -- " -1, --novel-samples list of samples where novel allele or genotype are expected\n" -- " -e, --exclude EXPR exclude sites and samples for which the expression is true\n" -- " -i, --include EXPR include sites and samples for which the expression is true\n" -- " -o, --output FILE output file name [bcftools_stdout]\n" -- " -O, --output-type b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n" -- " -r, --regions REG restrict to comma-separated list of regions\n" -- " -R, --regions-file FILE restrict to regions listed in a file\n" -- " -t, --targets REG similar to -r but streams rather than index-jumps\n" -- " -T, --targets-file FILE similar to -R but streams rather than index-jumps\n" -+ " -a, --annots list of annotations to output [PASSOC,FASSOC,NOVELAL]\n" -+ " -0, --control-samples file or comma-separated list of control (background) samples\n" -+ " -1, --case-samples file or comma-separated list of samples where novel allele or genotype is expected\n" -+ " -e, --exclude EXPR exclude sites and samples for which the 
expression is true\n" -+ " -f, --max-allele-freq NUM calculate enrichment of rare alleles. Floating point numbers between 0 and 1 are\n" -+ " interpreted as ALT allele frequencies, integers as ALT allele counts\n" -+ " --force-samples continue even if some samples listed in the -0,-1 files are missing from the VCF\n" -+ " -i, --include EXPR include sites and samples for which the expression is true\n" -+ " -o, --output FILE output file name [bcftools_stdout]\n" -+ " -O, --output-type b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n" -+ " -r, --regions REG restrict to comma-separated list of regions\n" -+ " -R, --regions-file FILE restrict to regions listed in a file\n" -+ " -t, --targets REG similar to -r but streams rather than index-jumps\n" -+ " -T, --targets-file FILE similar to -R but streams rather than index-jumps\n" - "\n" - "Example:\n" - " # Test if any of the samples a,b is different from the samples c,d,e\n" - " bcftools +contrast -0 c,d,e -1 a,b file.bcf\n" -+ "\n" -+ " # Same as above, but read samples from a file. In case of a name collision, the sample name\n" -+ " # has precedence: the existence of a file with a list of samples is not checked unless no such\n" -+ " # sample exists in the VCF. Use a full path (e.g. 
\"./string\" instead of \"string\") to avoid\n" -+ " # name clashes\n" -+ " bcftools +contrast -0 samples0.txt -1 samples1.txt file.bcf\n" -+ "\n" -+ " # The same as above but checks for enrichment of rare alleles, AF<0.001 in this example, in a region\n" -+ " bcftools +contrast -r 20:1000-2000 -f 0.001 -0 samples0.txt -1 samples1.txt file.bcf\n" - "\n"; - } - -+static int cmp_int(const void *a, const void *b) -+{ -+ if ( *((int*)a) < *((int*)b) ) return -1; -+ if ( *((int*)a) > *((int*)b) ) return -1; -+ return 0; -+} -+static void read_sample_list_or_file(bcf_hdr_t *hdr, const char *str, int **smpl, int *nsmpl, int force_samples) -+{ -+ char **str_list = NULL; -+ int i,j, *list, nlist = 0, is_file, nskipped = 0; -+ -+ for (is_file=0; is_file<=1; is_file++) -+ { -+ if ( str_list ) -+ { -+ for (i=0; i= 0 ) continue; -+ if ( is_file ) -+ { -+ if ( !force_samples ) error("The sample \"%s\" is not present in the VCF. Use --force-samples to proceed anyway.\n", str_list[i]); -+ j--; -+ nskipped++; -+ continue; -+ } -+ break; -+ } -+ if ( i==nlist ) break; -+ } -+ for (i=0; i1?"s":"",nskipped,str,nskipped>1?"are":"is"); -+ free(str_list); -+ qsort(list,nlist,sizeof(*list),cmp_int); -+ *smpl = list; -+ *nsmpl = nlist; -+} -+ - static void init_data(args_t *args) - { -+ int ntmp, i; -+ char **tmp = hts_readlist(args->annots_str, 0, &ntmp); -+ for (i=0; iannots |= PRINT_PASSOC; -+ else if ( !strcasecmp("FASSOC",tmp[i]) ) args->annots |= PRINT_FASSOC; -+ else if ( !strcasecmp("NASSOC",tmp[i]) ) args->annots |= PRINT_NASSOC; -+ else if ( !strcasecmp("NOVELAL",tmp[i]) ) args->annots |= PRINT_NOVELAL; -+ else if ( !strcasecmp("NOVELGT",tmp[i]) ) args->annots |= PRINT_NOVELGT; -+ else error("The annotation is not recognised: %s\n", tmp[i]); -+ free(tmp[i]); -+ } -+ free(tmp); -+ - args->sr = bcf_sr_init(); - if ( args->regions ) - { -@@ -107,47 +198,51 @@ - if ( !bcf_sr_add_reader(args->sr,args->fname) ) error("Error: %s\n", bcf_sr_strerror(args->sr->errnum)); - args->hdr = 
bcf_sr_get_header(args->sr,0); - args->hdr_out = bcf_hdr_dup(args->hdr); -- bcf_hdr_append(args->hdr_out, "##INFO="); -- bcf_hdr_append(args->hdr_out, "##INFO="); -+ if ( args->annots & PRINT_PASSOC ) -+ bcf_hdr_append(args->hdr_out, "##INFO="); -+ if ( args->annots & PRINT_FASSOC ) -+ bcf_hdr_append(args->hdr_out, "##INFO="); -+ if ( args->annots & PRINT_NASSOC ) -+ bcf_hdr_append(args->hdr_out, "##INFO="); -+ if ( args->annots & PRINT_NOVELAL ) -+ bcf_hdr_append(args->hdr_out, "##INFO="); -+ if ( args->annots & PRINT_NOVELGT ) -+ bcf_hdr_append(args->hdr_out, "##INFO="); - - if ( args->filter_str ) - args->filter = filter_init(args->hdr, args->filter_str); - -- int i; -- char **smpl = hts_readlist(args->bg_samples_str, 0, &args->nbg_smpl); -- args->bg_smpl = (int*) malloc(sizeof(int)*args->nbg_smpl); -- for (i=0; inbg_smpl; i++) -- { -- args->bg_smpl[i] = bcf_hdr_id2int(args->hdr, BCF_DT_SAMPLE, smpl[i]); -- if ( args->bg_smpl[i]<0 ) error("The sample not present in the VCF: \"%s\"\n", smpl[i]); -- free(smpl[i]); -- } -- free(smpl); -- -- smpl = hts_readlist(args->novel_samples_str, 0, &args->nnovel_smpl); -- args->novel_smpl = (int*) malloc(sizeof(int)*args->nnovel_smpl); -- for (i=0; innovel_smpl; i++) -- { -- args->novel_smpl[i] = bcf_hdr_id2int(args->hdr, BCF_DT_SAMPLE, smpl[i]); -- if ( args->novel_smpl[i]<0 ) error("The sample not present in the VCF: \"%s\"\n", smpl[i]); -- free(smpl[i]); -- } -- free(smpl); -+ read_sample_list_or_file(args->hdr, args->control_samples_str, &args->control_smpl, &args->ncontrol_smpl, args->force_samples); -+ read_sample_list_or_file(args->hdr, args->case_samples_str, &args->case_smpl, &args->ncase_smpl, args->force_samples); - - args->out_fh = hts_open(args->output_fname,hts_bcf_wmode(args->output_type)); - if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno)); -- bcf_hdr_write(args->out_fh, args->hdr_out); -+ if ( bcf_hdr_write(args->out_fh, args->hdr_out)!=0 ) error("[%s] 
Error: cannot write to %s\n", __func__,args->output_fname); -+ -+ if ( args->max_AC_str ) -+ { -+ char *tmp; -+ args->max_AC = strtol(args->max_AC_str, &tmp, 10); -+ if ( tmp==args->max_AC_str || *tmp ) -+ { -+ double val = strtod(args->max_AC_str, &tmp); -+ if ( tmp==args->max_AC_str || *tmp ) error("Could not parse the argument: -f, --max-allele-freq %s\n", args->max_AC_str); -+ if ( val<0 || val>1 ) error("Expected integer or float from the range [0,1]: -f, --max-allele-freq %s\n", args->max_AC_str); -+ args->max_AC = val * bcf_hdr_nsamples(args->hdr); -+ if ( !args->max_AC ) args->max_AC = 1; -+ } -+ } - } - static void destroy_data(args_t *args) - { - bcf_hdr_destroy(args->hdr_out); -- hts_close(args->out_fh); -- free(args->novel_als_smpl.s); -- free(args->novel_gts_smpl.s); -+ if ( hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname); -+ free(args->case_als_smpl.s); -+ free(args->case_gts_smpl.s); - free(args->gts); -- free(args->bg_gts); -- free(args->bg_smpl); -- free(args->novel_smpl); -+ free(args->control_gts); -+ free(args->control_smpl); -+ free(args->case_smpl); - if ( args->filter ) filter_destroy(args->filter); - bcf_sr_destroy(args->sr); - free(args); -@@ -193,13 +288,14 @@ - ngts /= rec->n_sample; - if ( ngts>2 ) error("todo: ploidy=%d\n", ngts); - -- args->nbg_gts = 0; -- uint32_t bg_als = 0; -+ args->ncontrol_gts = 0; -+ uint32_t control_als = 0; -+ int32_t nals[4] = {0,0,0,0}; // ctrl-ref, ctrl-alt, case-ref, case-alt - int i,j; -- for (i=0; inbg_smpl; i++) -+ for (i=0; incontrol_smpl; i++) - { - uint32_t gt = 0; -- int32_t *ptr = args->gts + args->bg_smpl[i]*ngts; -+ int32_t *ptr = args->gts + args->control_smpl[i]*ngts; - for (j=0; j32) at %s:%d, skipping. 
(todo?)\n", bcf_seqname(args->hdr,rec),rec->pos+1); -+ fprintf(bcftools_stderr,"Too many alleles (>32) at %s:%"PRId64", skipping the site.\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); - warned = 1; - } - args->nskipped++; - return -1; - } -- bg_als |= 1<bg_gts, &args->nbg_gts, &args->mbg_gts); -+ if ( args->annots & PRINT_NOVELGT ) -+ binary_insert(gt, &args->control_gts, &args->ncontrol_gts, &args->mcontrol_gts); - } -- if ( !bg_als ) -+ if ( !control_als ) - { - // all are missing - args->nskipped++; - return -1; - } - -- args->novel_als_smpl.l = 0; -- args->novel_gts_smpl.l = 0; -+ args->case_als_smpl.l = 0; -+ args->case_gts_smpl.l = 0; - - int has_gt = 0; -- for (i=0; innovel_smpl; i++) -+ for (i=0; incase_smpl; i++) - { -- int novel_al = 0; -+ int case_al = 0; - uint32_t gt = 0; -- int32_t *ptr = args->gts + args->novel_smpl[i]*ngts; -+ int32_t *ptr = args->gts + args->case_smpl[i]*ngts; - for (j=0; j32) at %s:%d, skipping. (todo?)\n", bcf_seqname(args->hdr,rec),rec->pos+1); -+ fprintf(bcftools_stderr,"Too many alleles (>32) at %s:%"PRId64", skipping. 
(todo?)\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); - warned = 1; - } - args->nskipped++; - return -1; - } -- if ( !(bg_als & (1<hdr->samples[ args->novel_smpl[i] ]; -- if ( novel_al ) -+ char *smpl = args->hdr->samples[ args->case_smpl[i] ]; -+ if ( case_al ) - { -- if ( args->novel_als_smpl.l ) kputc(',', &args->novel_als_smpl); -- kputs(smpl, &args->novel_als_smpl); -+ if ( args->annots & PRINT_NOVELAL ) -+ { -+ if ( args->case_als_smpl.l ) kputc(',', &args->case_als_smpl); -+ kputs(smpl, &args->case_als_smpl); -+ } - } -- else if ( !binary_search(gt, args->bg_gts, args->nbg_gts) ) -+ else if ( (args->annots & PRINT_NOVELGT) && !binary_search(gt, args->control_gts, args->ncontrol_gts) ) - { -- if ( args->novel_gts_smpl.l ) kputc(',', &args->novel_gts_smpl); -- kputs(smpl, &args->novel_gts_smpl); -+ if ( args->case_gts_smpl.l ) kputc(',', &args->case_gts_smpl); -+ kputs(smpl, &args->case_gts_smpl); - } - } - if ( !has_gt ) -@@ -275,15 +379,54 @@ - args->nskipped++; - return -1; - } -- if ( args->novel_als_smpl.l ) -+ -+ if ( args->max_AC ) - { -- bcf_update_info_string(args->hdr_out, rec, "NOVELAL", args->novel_als_smpl.s); -- args->nnovel_al++; -+ if ( nals[0]+nals[2] > nals[1]+nals[3] ) -+ { -+ if ( nals[1]+nals[3] <= args->max_AC ) -+ for (i=0; i<4; i++) args->nals[i] += nals[i]; -+ } -+ else -+ { -+ if ( nals[0]+nals[2] <= args->max_AC ) -+ { -+ args->nals[0] += nals[1]; -+ args->nals[1] += nals[0]; -+ args->nals[2] += nals[3]; -+ args->nals[3] += nals[2]; -+ } -+ } -+ } -+ -+ float vals[2]; -+ if ( args->annots & PRINT_PASSOC ) -+ { -+ double left, right, fisher; -+ kt_fisher_exact(nals[0],nals[1],nals[2],nals[3], &left,&right,&fisher); -+ vals[0] = fisher; -+ bcf_update_info_float(args->hdr_out, rec, "PASSOC", vals, 1); - } -- if ( args->novel_gts_smpl.l ) -+ if ( args->annots & PRINT_FASSOC ) - { -- bcf_update_info_string(args->hdr_out, rec, "NOVELGT", args->novel_gts_smpl.s); -- args->nnovel_gt++; -+ if ( nals[0]+nals[1] ) vals[0] = 
(float)nals[1]/(nals[0]+nals[1]); -+ else bcf_float_set_missing(vals[0]); -+ if ( nals[2]+nals[3] ) vals[1] = (float)nals[3]/(nals[2]+nals[3]); -+ else bcf_float_set_missing(vals[1]); -+ bcf_update_info_float(args->hdr_out, rec, "FASSOC", vals, 2); -+ } -+ if ( args->annots & PRINT_NASSOC ) -+ bcf_update_info_int32(args->hdr_out, rec, "NASSOC", nals, 4); -+ -+ if ( args->case_als_smpl.l ) -+ { -+ bcf_update_info_string(args->hdr_out, rec, "NOVELAL", args->case_als_smpl.s); -+ args->ncase_al++; -+ } -+ if ( args->case_gts_smpl.l ) -+ { -+ bcf_update_info_string(args->hdr_out, rec, "NOVELGT", args->case_gts_smpl.s); -+ args->ncase_gt++; - } - args->ntested++; - return 0; -@@ -294,10 +437,16 @@ - args_t *args = (args_t*) calloc(1,sizeof(args_t)); - args->argc = argc; args->argv = argv; - args->output_fname = "-"; -+ args->annots_str = "PASSOC,FASSOC"; - static struct option loptions[] = - { -- {"bg-samples",required_argument,0,'0'}, -- {"novel-samples",required_argument,0,'1'}, -+ {"max-allele-freq",required_argument,0,'f'}, -+ {"annots",required_argument,0,'a'}, -+ {"force-samples",no_argument,0,1}, -+ {"bg-samples",required_argument,0,'0'}, // renamed to --control-samples, leaving it in for backward compatibility -+ {"control-samples",required_argument,0,'0'}, -+ {"novel-samples",required_argument,0,'1'}, // renamed to --case-samples, leaving it in for backward compatibility -+ {"case-samples",required_argument,0,'1'}, - {"include",required_argument,0,'i'}, - {"exclude",required_argument,0,'e'}, - {"output",required_argument,NULL,'o'}, -@@ -309,12 +458,15 @@ - {NULL,0,NULL,0} - }; - int c; -- while ((c = getopt_long(argc, argv, "O:o:i:e:r:R:t:T:0:1:",loptions,NULL)) >= 0) -+ while ((c = getopt_long(argc, argv, "O:o:i:e:r:R:t:T:0:1:a:f:",loptions,NULL)) >= 0) - { - switch (c) - { -- case '0': args->bg_samples_str = optarg; break; -- case '1': args->novel_samples_str = optarg; break; -+ case 1 : args->force_samples = 1; break; -+ case 'f': args->max_AC_str = optarg; 
break; -+ case 'a': args->annots_str = optarg; break; -+ case '0': args->control_samples_str = optarg; break; -+ case '1': args->case_samples_str = optarg; break; - case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; - case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; - case 't': args->targets = optarg; break; -@@ -356,10 +508,18 @@ - if ( !pass ) continue; - } - process_record(args, rec); -- bcf_write(args->out_fh, args->hdr_out, rec); -+ if ( bcf_write(args->out_fh, args->hdr_out, rec)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); - } - -- fprintf(bcftools_stderr,"Total/processed/skipped/novel_allele/novel_gt:\t%d\t%d\t%d\t%d\t%d\n", args->ntotal, args->ntested, args->nskipped, args->nnovel_al, args->nnovel_gt); -+ fprintf(bcftools_stderr,"Total/processed/skipped/case_allele/case_gt:\t%d\t%d\t%d\t%d\t%d\n", args->ntotal, args->ntested, args->nskipped, args->ncase_al, args->ncase_gt); -+ if ( args->max_AC ) -+ { -+ double val1, val2, fisher; -+ kt_fisher_exact(args->nals[0],args->nals[1],args->nals[2],args->nals[3], &val1,&val2,&fisher); -+ val1 = args->nals[0]+args->nals[1] ? (float)args->nals[1]/(args->nals[0]+args->nals[1]) : 0; -+ val2 = args->nals[2]+args->nals[3] ? (float)args->nals[3]/(args->nals[2]+args->nals[3]) : 0; -+ fprintf(bcftools_stderr,"max_AC/PASSOC/FASSOC/NASSOC:\t%d\t%e\t%f,%f\t%d,%d,%d,%d\n",args->max_AC,fisher,val1,val2,args->nals[0],args->nals[1],args->nals[2],args->nals[3]); -+ } - destroy_data(args); - - return 0; ---- python-pysam.orig/bcftools/plugins/counts.c -+++ python-pysam/bcftools/plugins/counts.c -@@ -1,6 +1,6 @@ - /* plugins/counts.c -- counts SNPs, Indels, and total number of sites. - -- Copyright (C) 2013, 2014 Genome Research Ltd. -+ Copyright (C) 2013-2018 Genome Research Ltd. 
- - Author: Petr Danecek - -@@ -24,9 +24,10 @@ - - #include - #include -+#include - #include - --int nsamples, nsnps, nindels, nmnps, nothers, nsites; -+uint64_t nsamples, nsnps, nindels, nmnps, nothers, nsites; - - /* - This short description is used to generate the output of `bcftools plugin -l`. -@@ -71,12 +72,12 @@ - */ - void destroy(void) - { -- printf("Number of samples: %d\n", nsamples); -- printf("Number of SNPs: %d\n", nsnps); -- printf("Number of INDELs: %d\n", nindels); -- printf("Number of MNPs: %d\n", nmnps); -- printf("Number of others: %d\n", nothers); -- printf("Number of sites: %d\n", nsites); -+ printf("Number of samples: %"PRIu64"\n", nsamples); -+ printf("Number of SNPs: %"PRIu64"\n", nsnps); -+ printf("Number of INDELs: %"PRIu64"\n", nindels); -+ printf("Number of MNPs: %"PRIu64"\n", nmnps); -+ printf("Number of others: %"PRIu64"\n", nothers); -+ printf("Number of sites: %"PRIu64"\n", nsites); - } - - ---- python-pysam.orig/bcftools/plugins/counts.c.pysam.c -+++ python-pysam/bcftools/plugins/counts.c.pysam.c -@@ -2,7 +2,7 @@ - - /* plugins/counts.c -- counts SNPs, Indels, and total number of sites. - -- Copyright (C) 2013, 2014 Genome Research Ltd. -+ Copyright (C) 2013-2018 Genome Research Ltd. - - Author: Petr Danecek - -@@ -26,9 +26,10 @@ - - #include - #include -+#include - #include - --int nsamples, nsnps, nindels, nmnps, nothers, nsites; -+uint64_t nsamples, nsnps, nindels, nmnps, nothers, nsites; - - /* - This short description is used to generate the output of `bcftools plugin -l`. 
-@@ -73,12 +74,12 @@ - */ - void destroy(void) - { -- fprintf(bcftools_stdout, "Number of samples: %d\n", nsamples); -- fprintf(bcftools_stdout, "Number of SNPs: %d\n", nsnps); -- fprintf(bcftools_stdout, "Number of INDELs: %d\n", nindels); -- fprintf(bcftools_stdout, "Number of MNPs: %d\n", nmnps); -- fprintf(bcftools_stdout, "Number of others: %d\n", nothers); -- fprintf(bcftools_stdout, "Number of sites: %d\n", nsites); -+ fprintf(bcftools_stdout, "Number of samples: %"PRIu64"\n", nsamples); -+ fprintf(bcftools_stdout, "Number of SNPs: %"PRIu64"\n", nsnps); -+ fprintf(bcftools_stdout, "Number of INDELs: %"PRIu64"\n", nindels); -+ fprintf(bcftools_stdout, "Number of MNPs: %"PRIu64"\n", nmnps); -+ fprintf(bcftools_stdout, "Number of others: %"PRIu64"\n", nothers); -+ fprintf(bcftools_stdout, "Number of sites: %"PRIu64"\n", nsites); - } - - ---- python-pysam.orig/bcftools/plugins/dosage.c -+++ python-pysam/bcftools/plugins/dosage.c -@@ -1,6 +1,6 @@ - /* plugins/dosage.c -- prints genotype dosage. - -- Copyright (C) 2014 Genome Research Ltd. -+ Copyright (C) 2014-2018 Genome Research Ltd. 
- - Author: Petr Danecek - -@@ -27,6 +27,7 @@ - #include - #include - #include -+#include - #include "bcftools.h" - - -@@ -87,7 +88,7 @@ - for (j=0; jn_allele); \ - int k, l = 0; \ - for (j=0; jn_allele; j++) \ -@@ -103,11 +105,12 @@ - { \ - dsg[j] += vals[l]; \ - dsg[k] += vals[l]; \ -+ l++; \ - } \ - } \ - } \ - for (j=1; jn_allele; j++) \ -- printf("%c%.1f",j==1?'\t':',',dsg[j]); \ -+ printf("%c%f",j==1?'\t':',',dsg[j]); \ - ptr += nret; \ - } \ - } -@@ -122,7 +125,7 @@ - - int calc_dosage_GL(bcf1_t *rec) - { -- int i, j, nret = bcf_get_format_values(in_hdr,rec,"GL",(void**)&buf,&nbuf,pl_type); -+ int i, j, nret = bcf_get_format_values(in_hdr,rec,"GL",(void**)&buf,&nbuf,gl_type); - if ( nret<0 ) return -1; - - nret /= rec->n_sample; -@@ -138,15 +141,15 @@ - for (j=0; jn_allele; j++) dsg[j] = -1; \ - else \ - { \ -- for (; jn_allele); \ - int k, l = 0; \ - for (j=0; jn_allele; j++) \ -@@ -155,15 +158,16 @@ - { \ - dsg[j] += vals[l]; \ - dsg[k] += vals[l]; \ -+ l++; \ - } \ - } \ - } \ - for (j=1; jn_allele; j++) \ -- printf("%c%.1f",j==1?'\t':',',dsg[j]); \ -+ printf("%c%f",j==1?'\t':',',dsg[j]); \ - ptr += nret; \ - } \ - } -- switch (pl_type) -+ switch (gl_type) - { - case BCF_HT_INT: BRANCH(int32_t,ptr[j]==bcf_int32_missing,ptr[j]==bcf_int32_vector_end); break; - case BCF_HT_REAL: BRANCH(float,bcf_float_is_missing(ptr[j]),bcf_float_is_vector_end(ptr[j])); break; -@@ -187,7 +191,7 @@ - { - if ( ptr[j]==bcf_int32_vector_end || bcf_gt_is_missing(ptr[j]) ) break; - int idx = bcf_gt_allele(ptr[j]); -- if ( idx > rec->n_allele ) error("The allele index is out of range at %s:%d\n", bcf_seqname(in_hdr,rec),rec->pos+1); -+ if ( idx > rec->n_allele ) error("The allele index is out of range at %s:%"PRId64"\n", bcf_seqname(in_hdr,rec),(int64_t) rec->pos+1); - dsg[idx] += 1; - } - if ( !j ) -@@ -300,7 +304,7 @@ - { - int i,j, ret; - -- printf("%s\t%d\t%s", bcf_seqname(in_hdr,rec),rec->pos+1,rec->d.allele[0]); -+ printf("%s\t%"PRId64"\t%s", bcf_seqname(in_hdr,rec),(int64_t) 
rec->pos+1,rec->d.allele[0]); - if ( rec->n_allele == 1 ) printf("\t."); - else for (i=1; in_allele; i++) printf("%c%s", i==1?'\t':',', rec->d.allele[i]); - if ( rec->n_allele==1 ) ---- python-pysam.orig/bcftools/plugins/dosage.c.pysam.c -+++ python-pysam/bcftools/plugins/dosage.c.pysam.c -@@ -2,7 +2,7 @@ - - /* plugins/dosage.c -- prints genotype dosage. - -- Copyright (C) 2014 Genome Research Ltd. -+ Copyright (C) 2014-2018 Genome Research Ltd. - - Author: Petr Danecek - -@@ -29,6 +29,7 @@ - #include - #include - #include -+#include - #include "bcftools.h" - - -@@ -89,7 +90,7 @@ - for (j=0; jn_allele); \ - int k, l = 0; \ - for (j=0; jn_allele; j++) \ -@@ -105,11 +107,12 @@ - { \ - dsg[j] += vals[l]; \ - dsg[k] += vals[l]; \ -+ l++; \ - } \ - } \ - } \ - for (j=1; jn_allele; j++) \ -- fprintf(bcftools_stdout, "%c%.1f",j==1?'\t':',',dsg[j]); \ -+ fprintf(bcftools_stdout, "%c%f",j==1?'\t':',',dsg[j]); \ - ptr += nret; \ - } \ - } -@@ -124,7 +127,7 @@ - - int calc_dosage_GL(bcf1_t *rec) - { -- int i, j, nret = bcf_get_format_values(in_hdr,rec,"GL",(void**)&buf,&nbuf,pl_type); -+ int i, j, nret = bcf_get_format_values(in_hdr,rec,"GL",(void**)&buf,&nbuf,gl_type); - if ( nret<0 ) return -1; - - nret /= rec->n_sample; -@@ -140,15 +143,15 @@ - for (j=0; jn_allele; j++) dsg[j] = -1; \ - else \ - { \ -- for (; jn_allele); \ - int k, l = 0; \ - for (j=0; jn_allele; j++) \ -@@ -157,15 +160,16 @@ - { \ - dsg[j] += vals[l]; \ - dsg[k] += vals[l]; \ -+ l++; \ - } \ - } \ - } \ - for (j=1; jn_allele; j++) \ -- fprintf(bcftools_stdout, "%c%.1f",j==1?'\t':',',dsg[j]); \ -+ fprintf(bcftools_stdout, "%c%f",j==1?'\t':',',dsg[j]); \ - ptr += nret; \ - } \ - } -- switch (pl_type) -+ switch (gl_type) - { - case BCF_HT_INT: BRANCH(int32_t,ptr[j]==bcf_int32_missing,ptr[j]==bcf_int32_vector_end); break; - case BCF_HT_REAL: BRANCH(float,bcf_float_is_missing(ptr[j]),bcf_float_is_vector_end(ptr[j])); break; -@@ -189,7 +193,7 @@ - { - if ( ptr[j]==bcf_int32_vector_end || 
bcf_gt_is_missing(ptr[j]) ) break; - int idx = bcf_gt_allele(ptr[j]); -- if ( idx > rec->n_allele ) error("The allele index is out of range at %s:%d\n", bcf_seqname(in_hdr,rec),rec->pos+1); -+ if ( idx > rec->n_allele ) error("The allele index is out of range at %s:%"PRId64"\n", bcf_seqname(in_hdr,rec),(int64_t) rec->pos+1); - dsg[idx] += 1; - } - if ( !j ) -@@ -302,7 +306,7 @@ - { - int i,j, ret; - -- fprintf(bcftools_stdout, "%s\t%d\t%s", bcf_seqname(in_hdr,rec),rec->pos+1,rec->d.allele[0]); -+ fprintf(bcftools_stdout, "%s\t%"PRId64"\t%s", bcf_seqname(in_hdr,rec),(int64_t) rec->pos+1,rec->d.allele[0]); - if ( rec->n_allele == 1 ) fprintf(bcftools_stdout, "\t."); - else for (i=1; in_allele; i++) fprintf(bcftools_stdout, "%c%s", i==1?'\t':',', rec->d.allele[i]); - if ( rec->n_allele==1 ) ---- python-pysam.orig/bcftools/plugins/fill-AN-AC.c -+++ python-pysam/bcftools/plugins/fill-AN-AC.c -@@ -33,7 +33,7 @@ - - const char *about(void) - { -- return "Fill INFO fields AN and AC.\n"; -+ return "Fill INFO fields AN and AC. This plugin is DEPRECATED, use fill-tags instead.\n"; - } - - int init(int argc, char **argv, bcf_hdr_t *in, bcf_hdr_t *out) ---- python-pysam.orig/bcftools/plugins/fill-AN-AC.c.pysam.c -+++ python-pysam/bcftools/plugins/fill-AN-AC.c.pysam.c -@@ -35,7 +35,7 @@ - - const char *about(void) - { -- return "Fill INFO fields AN and AC.\n"; -+ return "Fill INFO fields AN and AC. 
This plugin is DEPRECATED, use fill-tags instead.\n"; - } - - int init(int argc, char **argv, bcf_hdr_t *in, bcf_hdr_t *out) ---- python-pysam.orig/bcftools/plugins/fill-from-fasta.c -+++ python-pysam/bcftools/plugins/fill-from-fasta.c -@@ -26,6 +26,7 @@ - #include - #include - #include -+#include - #include - #include - #include -@@ -54,6 +55,7 @@ - " -h, --header-lines optional file containing header lines to append\n" - " -i, --include annotate only records passing filter expression\n" - " -e, --exclude annotate only records failing filter expression\n" -+" -N, --replace-non-ACGTN replace non-ACGTN characters with N\n" - - "\n" - "Examples:\n" -@@ -74,6 +76,7 @@ - faidx_t *faidx; - int anno = 0; - char *column = NULL; -+int replace_nonACGTN = 0; - - #define ANNO_REF 1 - #define ANNO_STRING 2 -@@ -92,6 +95,7 @@ - char *ref_fname = NULL, *header_fname = NULL; - static struct option loptions[] = - { -+ {"replace-non-ACGTN",no_argument,NULL,'N'}, - {"exclude",required_argument,NULL,'e'}, - {"include",required_argument,NULL,'i'}, - {"column",required_argument,NULL,'c'}, -@@ -99,12 +103,13 @@ - {"header-lines",required_argument,NULL,'h'}, - {NULL,0,NULL,0} - }; -- while ((c = getopt_long(argc, argv, "c:f:?h:i:e:",loptions,NULL)) >= 0) -+ while ((c = getopt_long(argc, argv, "c:f:?h:i:e:N",loptions,NULL)) >= 0) - { - switch (c) - { - case 'e': filter_str = optarg; filter_logic |= FLT_EXCLUDE; break; - case 'i': filter_str = optarg; filter_logic |= FLT_INCLUDE; break; -+ case 'N': replace_nonACGTN = 1; break; - case 'c': column = optarg; break; - case 'f': ref_fname = optarg; break; - case 'h': header_fname = optarg; break; -@@ -132,7 +137,8 @@ - } - hts_close(file); - free(str.s); -- bcf_hdr_sync(out_hdr); -+ if (bcf_hdr_sync(out_hdr) < 0) -+ error_errno("[%s] Failed to update header", __func__); - } - if (!strcasecmp("REF", column)) anno = ANNO_REF; - else { -@@ -181,9 +187,12 @@ - // could be sped up here by fetching the whole chromosome? 
could assume - // sorted, but revert to this when non-sorted records found? - char *fa = faidx_fetch_seq(faidx, bcf_seqname(in_hdr,rec), rec->pos, rec->pos+ref_len-1, &fa_len); -- if ( !fa ) error("faidx_fetch_seq failed at %s:%d\n", bcf_hdr_id2name(in_hdr,rec->rid), rec->pos+1); -+ if ( !fa ) error("faidx_fetch_seq failed at %s:%"PRId64"\n", bcf_hdr_id2name(in_hdr,rec->rid),(int64_t) rec->pos+1); - for (i=0; i96 ) fa[i] -= 32; -+ if ( replace_nonACGTN && fa[i]!='A' && fa[i]!='C' && fa[i]!='G' && fa[i]!='T' && fa[i]!='N' ) fa[i] = 'N'; -+ } - - assert(ref_len == fa_len); - if (anno==ANNO_REF) ---- python-pysam.orig/bcftools/plugins/fill-from-fasta.c.pysam.c -+++ python-pysam/bcftools/plugins/fill-from-fasta.c.pysam.c -@@ -28,6 +28,7 @@ - #include - #include - #include -+#include - #include - #include - #include -@@ -56,6 +57,7 @@ - " -h, --header-lines optional file containing header lines to append\n" - " -i, --include annotate only records passing filter expression\n" - " -e, --exclude annotate only records failing filter expression\n" -+" -N, --replace-non-ACGTN replace non-ACGTN characters with N\n" - - "\n" - "Examples:\n" -@@ -76,6 +78,7 @@ - faidx_t *faidx; - int anno = 0; - char *column = NULL; -+int replace_nonACGTN = 0; - - #define ANNO_REF 1 - #define ANNO_STRING 2 -@@ -94,6 +97,7 @@ - char *ref_fname = NULL, *header_fname = NULL; - static struct option loptions[] = - { -+ {"replace-non-ACGTN",no_argument,NULL,'N'}, - {"exclude",required_argument,NULL,'e'}, - {"include",required_argument,NULL,'i'}, - {"column",required_argument,NULL,'c'}, -@@ -101,12 +105,13 @@ - {"header-lines",required_argument,NULL,'h'}, - {NULL,0,NULL,0} - }; -- while ((c = getopt_long(argc, argv, "c:f:?h:i:e:",loptions,NULL)) >= 0) -+ while ((c = getopt_long(argc, argv, "c:f:?h:i:e:N",loptions,NULL)) >= 0) - { - switch (c) - { - case 'e': filter_str = optarg; filter_logic |= FLT_EXCLUDE; break; - case 'i': filter_str = optarg; filter_logic |= FLT_INCLUDE; break; -+ case 'N': 
replace_nonACGTN = 1; break; - case 'c': column = optarg; break; - case 'f': ref_fname = optarg; break; - case 'h': header_fname = optarg; break; -@@ -134,7 +139,8 @@ - } - hts_close(file); - free(str.s); -- bcf_hdr_sync(out_hdr); -+ if (bcf_hdr_sync(out_hdr) < 0) -+ error_errno("[%s] Failed to update header", __func__); - } - if (!strcasecmp("REF", column)) anno = ANNO_REF; - else { -@@ -183,9 +189,12 @@ - // could be sped up here by fetching the whole chromosome? could assume - // sorted, but revert to this when non-sorted records found? - char *fa = faidx_fetch_seq(faidx, bcf_seqname(in_hdr,rec), rec->pos, rec->pos+ref_len-1, &fa_len); -- if ( !fa ) error("faidx_fetch_seq failed at %s:%d\n", bcf_hdr_id2name(in_hdr,rec->rid), rec->pos+1); -+ if ( !fa ) error("faidx_fetch_seq failed at %s:%"PRId64"\n", bcf_hdr_id2name(in_hdr,rec->rid),(int64_t) rec->pos+1); - for (i=0; i96 ) fa[i] -= 32; -+ if ( replace_nonACGTN && fa[i]!='A' && fa[i]!='C' && fa[i]!='G' && fa[i]!='T' && fa[i]!='N' ) fa[i] = 'N'; -+ } - - assert(ref_len == fa_len); - if (anno==ANNO_REF) ---- python-pysam.orig/bcftools/plugins/fill-tags.c -+++ python-pysam/bcftools/plugins/fill-tags.c -@@ -1,6 +1,6 @@ - /* The MIT License - -- Copyright (c) 2015 Genome Research Ltd. -+ Copyright (c) 2015-2019 Genome Research Ltd. 
- - Author: Petr Danecek - -@@ -29,10 +29,12 @@ - #include - #include - #include -+#include - #include - #include - #include - #include -+#include - #include "bcftools.h" - - #define SET_AN (1<<0) -@@ -45,6 +47,17 @@ - #define SET_MAF (1<<7) - #define SET_HWE (1<<8) - #define SET_ExcHet (1<<9) -+#define SET_FUNC (1<<10) -+ -+typedef struct _args_t args_t; -+typedef struct _ftf_t ftf_t; -+typedef int (*fill_tag_f)(args_t *, bcf1_t *, ftf_t *); -+struct _ftf_t -+{ -+ char *src_tag, *dst_tag; -+ fill_tag_f func; -+ int *pop_vals; // for now assuming only 1 integer value per annotation -+}; - - typedef struct - { -@@ -62,7 +75,7 @@ - } - pop_t; - --typedef struct -+struct _args_t - { - bcf_hdr_t *in_hdr, *out_hdr; - int npop, tags, drop_missing, gt_id; -@@ -72,21 +85,24 @@ - double *hwe_probs; - int mhwe_probs; - kstring_t str; --} --args_t; -+ kbitset_t *bset; -+ ftf_t *ftf; -+ int nftf; -+}; - - static args_t *args; - - const char *about(void) - { -- return "Set INFO tags AF, AC, AC_Hemi, AC_Hom, AC_Het, AN, ExcHet, HWE, MAF, NS.\n"; -+ return "Set INFO tags AF, AC, AC_Hemi, AC_Hom, AC_Het, AN, ExcHet, HWE, MAF, NS and more.\n"; - } - - const char *usage(void) - { - return - "\n" -- "About: Set INFO tags AF, AC, AC_Hemi, AC_Hom, AC_Het, AN, ExcHet, HWE, MAF, NS.\n" -+ "About: Set INFO tags AF, AC, AC_Hemi, AC_Hom, AC_Het, AN, ExcHet, HWE, MAF, NS\n" -+ " or custom INFO/TAG=func(FMT/TAG), use -l for detailed description\n" - "Usage: bcftools +fill-tags [General Options] -- [Plugin Options]\n" - "Options:\n" - " run \"bcftools plugin\" for a list of common options\n" -@@ -94,14 +110,24 @@ - "Plugin options:\n" - " -d, --drop-missing do not count half-missing genotypes \"./1\" as hemizygous\n" - " -l, --list-tags list available tags with description\n" -- " -t, --tags LIST list of output tags. 
By default, all tags are filled.\n" -+ " -t, --tags LIST list of output tags, \"all\" for all tags\n" - " -S, --samples-file FILE list of samples (first column) and comma-separated list of populations (second column)\n" - "\n" - "Example:\n" -- " bcftools +fill-tags in.bcf -Ob -o out.bcf\n" -+ " # Print a detailed list of available tags\n" -+ " bcftools +fill-tags -- -l\n" -+ "\n" -+ " # Fill INFO/AN and INFO/AC\n" - " bcftools +fill-tags in.bcf -Ob -o out.bcf -- -t AN,AC\n" -- " bcftools +fill-tags in.bcf -Ob -o out.bcf -- -d\n" -+ "\n" -+ " # Fill all available tags\n" -+ " bcftools +fill-tags in.bcf -Ob -o out.bcf -- -t all\n" -+ "\n" -+ " # Calculate HWE for sample groups (possibly multiple) read from a file\n" - " bcftools +fill-tags in.bcf -Ob -o out.bcf -- -S sample-group.txt -t HWE\n" -+ "\n" -+ " # Calculate total read depth (INFO/DP) from per-sample depths (FORMAT/DP)\n" -+ " bcftools +fill-tags in.bcf -Ob -o out.bcf -- -t 'DP=sum(DP)'\n" - "\n"; - } - -@@ -180,7 +206,7 @@ - khash_str2int_destroy_free(smpli); - free(str.s); - free(off); -- hts_close(fp); -+ if ( hts_close(fp)!=0 ) error("[%s] Error: close failed .. 
%s\n", __func__,fname); - } - - void init_pops(args_t *args) -@@ -211,13 +237,118 @@ - } - } - -+void ftf_destroy(args_t *args) -+{ -+ int i; -+ for (i=0; inftf; i++) -+ { -+ ftf_t *ftf = &args->ftf[i]; -+ free(ftf->src_tag); -+ free(ftf->dst_tag); -+ free(ftf->pop_vals); -+ } -+ free(args->ftf); -+} -+int ftf_sum(args_t *args, bcf1_t *rec, ftf_t *ftf) -+{ -+ int nsmpl = bcf_hdr_nsamples(args->in_hdr); -+ int nval = bcf_get_format_int32(args->in_hdr, rec, ftf->src_tag, &args->iarr, &args->miarr); -+ if ( nval<=0 ) return 0; -+ nval /= nsmpl; -+ -+ int i; -+ for (i=0; inpop; i++) -+ ftf->pop_vals[i] = -1; -+ -+ for (i=0; iiarr[i*nval]==bcf_int32_missing || args->iarr[i*nval]==bcf_int32_vector_end ) continue; -+ -+ pop_t **pop = &args->smpl2pop[i*(args->npop+1)]; -+ while ( *pop ) -+ { -+ int ipop = (int)(*pop - args->pop); -+ if ( ftf->pop_vals[ipop]<0 ) ftf->pop_vals[ipop] = 0; -+ ftf->pop_vals[ipop] += args->iarr[i*nval]; -+ pop++; -+ } -+ } -+ -+ for (i=0; inpop; i++) -+ { -+ if ( ftf->pop_vals[i]<0 ) continue; -+ args->str.l = 0; -+ ksprintf(&args->str, "%s%s", ftf->dst_tag,args->pop[i].suffix); -+ if ( bcf_update_info_int32(args->out_hdr,rec,args->str.s,ftf->pop_vals+i,1)!=0 ) -+ error("Error occurred while updating %s at %s:%"PRId64"\n", args->str.s,bcf_seqname(args->in_hdr,rec),(int64_t) rec->pos+1); -+ } -+ -+ return 0; -+} -+ -+void hdr_append(args_t *args, char *fmt) -+{ -+ int i; -+ for (i=0; inpop; i++) -+ bcf_hdr_printf(args->out_hdr, fmt, args->pop[i].suffix,*args->pop[i].name ? 
" in " : "",args->pop[i].name); -+} -+ -+int parse_func(args_t *args, char *tag, char *expr) -+{ -+ args->nftf++; -+ args->ftf = (ftf_t *)realloc(args->ftf,sizeof(*args->ftf)*args->nftf); -+ ftf_t *ftf = &args->ftf[ args->nftf - 1 ]; -+ -+ ftf->pop_vals = (int*)calloc(args->npop,sizeof(*ftf->pop_vals)); -+ ftf->dst_tag = (char*)calloc(expr-tag,1); -+ memcpy(ftf->dst_tag, tag, expr-tag-1); -+ -+ if ( !strncasecmp(expr,"sum(",4) ) { ftf->func = ftf_sum; expr += 4; } -+ else error("Error: the expression not recognised: %s\n",tag); -+ -+ char *tmp = expr; -+ while ( *tmp && *tmp!=')' ) tmp++; -+ if ( !*tmp ) error("Error: could not parse: %s\n",tag); -+ -+ ftf->src_tag = (char*)calloc(tmp-expr+2,1); -+ memcpy(ftf->src_tag, expr, tmp-expr); -+ -+ int id = bcf_hdr_id2int(args->in_hdr,BCF_DT_ID,ftf->src_tag); -+ if ( !bcf_hdr_idinfo_exists(args->in_hdr,BCF_HL_FMT,id) ) error("Error: the field FORMAT/%s is not present\n",ftf->src_tag); -+ -+ int i = 0; -+ for (i=0; inpop; i++) -+ { -+ args->str.l = 0; -+ ksprintf(&args->str, "%s%s", ftf->dst_tag,args->pop[i].suffix); -+ id = bcf_hdr_id2int(args->in_hdr,BCF_DT_ID,args->str.s); -+ if ( bcf_hdr_idinfo_exists(args->in_hdr,BCF_HL_FMT,id) ) -+ { -+ if ( bcf_hdr_id2length(args->in_hdr,BCF_HL_FMT,id)!=BCF_VL_FIXED ) -+ error("Error: the field INFO/%s already exists with a definition different from Number=1\n",args->str.s); -+ if ( bcf_hdr_id2number(args->in_hdr,BCF_HL_FMT,id)!=1 ) -+ error("Error: the field INFO/%s already exists with a definition different from Number=1\n",args->str.s); -+ if ( bcf_hdr_id2type(args->in_hdr,BCF_HT_INT,id)!=BCF_HT_INT ) -+ error("Error: the field INFO/%s already exists with a definition different from Type=Integer\n",args->str.s); -+ } -+ else -+ bcf_hdr_printf(args->out_hdr, "##INFO=",args->str.s,tag,*args->pop[i].name ? 
" in " : "",args->pop[i].name); -+ } -+ return SET_FUNC; -+} - int parse_tags(args_t *args, const char *str) - { -- int i, flag = 0, n_tags; -- char **tags = hts_readlist(str, 0, &n_tags); -+ if ( !args->in_hdr ) error("%s", usage()); -+ -+ int i,j, flag = 0, n_tags; -+ char **tags = hts_readlist(str, 0, &n_tags), *ptr; - for(i=0; inpop; i++) -- bcf_hdr_printf(args->out_hdr, fmt, args->pop[i].suffix,*args->pop[i].name ? " in " : "",args->pop[i].name); --} -- - void list_tags(void) - { - error( -@@ -256,8 +381,10 @@ - "INFO/AC_Hemi Number:A Type:Integer .. Allele counts in hemizygous genotypes\n" - "INFO/AF Number:A Type:Float .. Allele frequency\n" - "INFO/MAF Number:A Type:Float .. Minor Allele frequency\n" -- "INFO/HWE Number:A Type:Float .. HWE test (PMID:15789306)\n" -- "INFO/ExcHet Number:A Type:Float .. Probability of excess heterozygosity\n" -+ "INFO/HWE Number:A Type:Float .. HWE test (PMID:15789306); 1=good, 0=bad\n" -+ "INFO/ExcHet Number:A Type:Float .. Test excess heterozygosity; 1=good, 0=bad\n" -+ "TAG=func(TAG) Number:1 Type:Integer .. Experimental support for user-defined\n" -+ " expressions such as \"DP=sum(DP)\". 
This is currently very basic, to be extended.\n" - ); - } - -@@ -266,7 +393,7 @@ - args = (args_t*) calloc(1,sizeof(args_t)); - args->in_hdr = in; - args->out_hdr = out; -- char *samples_fname = NULL; -+ char *samples_fname = NULL, *tags_str = "all"; - static struct option loptions[] = - { - {"list-tags",0,0,'l'}, -@@ -282,7 +409,7 @@ - { - case 'l': list_tags(); break; - case 'd': args->drop_missing = 1; break; -- case 't': args->tags |= parse_tags(args,optarg); break; -+ case 't': tags_str = optarg; break; - case 'S': samples_fname = optarg; break; - case 'h': - case '?': -@@ -295,12 +422,11 @@ - args->gt_id = bcf_hdr_id2int(args->in_hdr,BCF_DT_ID,"GT"); - if ( args->gt_id<0 ) error("Error: GT field is not present\n"); - -- if ( !args->tags ) -- for (c=0; c<=9; c++) args->tags |= 1<tags |= parse_tags(args,tags_str); -+ - if ( args->tags & SET_AN ) hdr_append(args, "##INFO="); - if ( args->tags & SET_AC ) hdr_append(args, "##INFO="); - if ( args->tags & SET_NS ) hdr_append(args, "##INFO="); -@@ -309,8 +435,8 @@ - if ( args->tags & SET_AC_Hemi ) hdr_append(args, "##INFO="); - if ( args->tags & SET_AF ) hdr_append(args, "##INFO="); - if ( args->tags & SET_MAF ) hdr_append(args, "##INFO="); -- if ( args->tags & SET_HWE ) hdr_append(args, "##INFO="); -- if ( args->tags & SET_ExcHet ) hdr_append(args, "##INFO="); -+ if ( args->tags & SET_HWE ) hdr_append(args, "##INFO="); -+ if ( args->tags & SET_ExcHet ) hdr_append(args, "##INFO="); - - return 0; - } -@@ -340,7 +466,7 @@ - double *probs = args->hwe_probs; - - // start at midpoint -- int mid = nrare * (nref + nalt - nrare) / (nref + nalt); -+ int mid = (double)nrare * (nref + nalt - nrare) / (nref + nalt); - - // check to ensure that midpoint and rare alleles have same parity - if ( (nrare & 1) ^ (mid & 1) ) mid++; -@@ -389,19 +515,17 @@ - *p_hwe = prob; - } - --static inline void set_counts(pop_t *pop, int is_half, int is_hom, int is_hemi, int als) -+static inline void set_counts(pop_t *pop, int is_half, int is_hom, 
int is_hemi, kbitset_t *bset) - { -- int ial; -- for (ial=0; als; ial++) -+ kbitset_iter_t itr; -+ int i; -+ kbs_start(&itr); -+ while ((i = kbs_next(bset, &itr)) >= 0) - { -- if ( als&1 ) -- { -- if ( is_half ) pop->counts[ial].nac++; -- else if ( !is_hom ) pop->counts[ial].nhet++; -- else if ( !is_hemi ) pop->counts[ial].nhom += 2; -- else pop->counts[ial].nhemi++; -- } -- als >>= 1; -+ if ( is_half ) pop->counts[i].nac++; -+ else if ( !is_hom ) pop->counts[i].nhet++; -+ else if ( !is_hemi ) pop->counts[i].nhom += 2; -+ else pop->counts[i].nhemi++; - } - pop->ns++; - } -@@ -413,9 +537,13 @@ - - bcf1_t *process(bcf1_t *rec) - { -+ bcf_unpack(rec, BCF_UN_FMT); -+ - int i,j, nsmpl = bcf_hdr_nsamples(args->in_hdr);; - -- bcf_unpack(rec, BCF_UN_FMT); -+ for (i=0; inftf; i++) -+ args->ftf[i].func(args, rec, &args->ftf[i]); -+ - bcf_fmt_t *fmt_gt = NULL; - for (i=0; in_fmt; i++) - if ( rec->d.fmt[i].id==args->gt_id ) { fmt_gt = &rec->d.fmt[i]; break; } -@@ -429,14 +557,15 @@ - for (i=0; inpop; i++) - clean_counts(&args->pop[i], rec->n_allele); - -- assert( rec->n_allele < 8*sizeof(int) ); -+ if ( kbs_resize(&args->bset, rec->n_allele) < 0 ) error("kbs_resize: failed to store %d bits\n", rec->n_allele); - - #define BRANCH_INT(type_t,vector_end) \ - { \ - for (i=0; ip + i*fmt_gt->size); \ -- int ial, als = 0, nals = 0, is_half, is_hom, is_hemi; \ -+ int ial, nbits = 0, nals = 0, is_half, is_hom, is_hemi; \ -+ kbs_clear(args->bset); \ - for (ial=0; ialn; ial++) \ - { \ - if ( p[ial]==vector_end ) break; /* smaller ploidy */ \ -@@ -445,11 +574,12 @@ - nals++; \ - \ - if ( idx >= rec->n_allele ) \ -- error("Incorrect allele (\"%d\") in %s at %s:%d\n",idx,args->in_hdr->samples[i],bcf_seqname(args->in_hdr,rec),rec->pos+1); \ -- als |= (1<in_hdr->samples[i],bcf_seqname(args->in_hdr,rec),(int64_t) rec->pos+1); \ -+ if ( !kbs_exists(args->bset, idx) ) nbits++; \ -+ kbs_insert(args->bset, idx); \ - } \ - if ( nals==0 ) continue; /* missing genotype */ \ -- is_hom = als && !(als & 
(als-1)); /* only one bit is set */ \ -+ is_hom = nbits==1 ? 1 : 0; /* only one bit is set for homs */ \ - if ( nals!=ial ) \ - { \ - if ( args->drop_missing ) is_hemi = 0, is_half = 1; \ -@@ -458,14 +588,14 @@ - else if ( nals==1 ) is_hemi = 1, is_half = 0; \ - else is_hemi = 0, is_half = 0; \ - pop_t **pop = &args->smpl2pop[i*(args->npop+1)]; \ -- while ( *pop ) { set_counts(*pop,is_half,is_hom,is_hemi,als); pop++; }\ -+ while ( *pop ) { set_counts(*pop,is_half,is_hom,is_hemi,args->bset); pop++; } \ - } \ - } - switch (fmt_gt->type) { - case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_vector_end); break; - case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_vector_end); break; - case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_vector_end); break; -- default: error("The GT type is not recognised: %d at %s:%d\n",fmt_gt->type, bcf_seqname(args->in_hdr,rec),rec->pos+1); break; -+ default: error("The GT type is not recognised: %d at %s:%"PRId64"\n",fmt_gt->type, bcf_seqname(args->in_hdr,rec),(int64_t) rec->pos+1); break; - } - #undef BRANCH_INT - -@@ -476,7 +606,7 @@ - args->str.l = 0; - ksprintf(&args->str, "NS%s", args->pop[i].suffix); - if ( bcf_update_info_int32(args->out_hdr,rec,args->str.s,&args->pop[i].ns,1)!=0 ) -- error("Error occurred while updating %s at %s:%d\n", args->str.s,bcf_seqname(args->in_hdr,rec),rec->pos+1); -+ error("Error occurred while updating %s at %s:%"PRId64"\n", args->str.s,bcf_seqname(args->in_hdr,rec),(int64_t) rec->pos+1); - } - } - if ( args->tags & SET_AN ) -@@ -491,7 +621,7 @@ - args->str.l = 0; - ksprintf(&args->str, "AN%s", args->pop[i].suffix); - if ( bcf_update_info_int32(args->out_hdr,rec,args->str.s,&an,1)!=0 ) -- error("Error occurred while updating %s at %s:%d\n", args->str.s,bcf_seqname(args->in_hdr,rec),rec->pos+1); -+ error("Error occurred while updating %s at %s:%"PRId64"\n", args->str.s,bcf_seqname(args->in_hdr,rec),(int64_t) rec->pos+1); - } - } - if ( args->tags & (SET_AF | SET_MAF) ) -@@ -507,25 +637,29 @@ - args->farr[j-1] 
+= pop->counts[j].nhet + pop->counts[j].nhom + pop->counts[j].nhemi + pop->counts[j].nac; - an = pop->counts[0].nhet + pop->counts[0].nhom + pop->counts[0].nhemi + pop->counts[0].nac; - for (j=1; jn_allele; j++) an += args->farr[j-1]; -- if ( !an ) continue; -- for (j=1; jn_allele; j++) args->farr[j-1] /= an; -+ if ( an ) -+ for (j=1; jn_allele; j++) args->farr[j-1] /= an; -+ else -+ for (j=1; jn_allele; j++) bcf_float_set_missing(args->farr[j-1]); - } - if ( args->tags & SET_AF ) - { - args->str.l = 0; - ksprintf(&args->str, "AF%s", args->pop[i].suffix); - if ( bcf_update_info_float(args->out_hdr,rec,args->str.s,args->farr,rec->n_allele-1)!=0 ) -- error("Error occurred while updating %s at %s:%d\n", args->str.s,bcf_seqname(args->in_hdr,rec),rec->pos+1); -+ error("Error occurred while updating %s at %s:%"PRId64"\n", args->str.s,bcf_seqname(args->in_hdr,rec),(int64_t) rec->pos+1); - } - if ( args->tags & SET_MAF ) - { -- if ( !an ) continue; -- for (j=1; jn_allele; j++) -- if ( args->farr[j-1] > 0.5 ) args->farr[j-1] = 1 - args->farr[j-1]; // todo: this is incorrect for multiallelic sites -+ if ( an ) -+ { -+ for (j=1; jn_allele; j++) -+ if ( args->farr[j-1] > 0.5 ) args->farr[j-1] = 1 - args->farr[j-1]; // todo: this is incorrect for multiallelic sites -+ } - args->str.l = 0; - ksprintf(&args->str, "MAF%s", args->pop[i].suffix); - if ( bcf_update_info_float(args->out_hdr,rec,args->str.s,args->farr,rec->n_allele-1)!=0 ) -- error("Error occurred while updating %s at %s:%d\n", args->str.s,bcf_seqname(args->in_hdr,rec),rec->pos+1); -+ error("Error occurred while updating %s at %s:%"PRId64"\n", args->str.s,bcf_seqname(args->in_hdr,rec),(int64_t) rec->pos+1); - } - } - } -@@ -543,7 +677,7 @@ - args->str.l = 0; - ksprintf(&args->str, "AC%s", args->pop[i].suffix); - if ( bcf_update_info_int32(args->out_hdr,rec,args->str.s,args->iarr,rec->n_allele-1)!=0 ) -- error("Error occurred while updating %s at %s:%d\n", args->str.s,bcf_seqname(args->in_hdr,rec),rec->pos+1); -+ 
error("Error occurred while updating %s at %s:%"PRId64"\n", args->str.s,bcf_seqname(args->in_hdr,rec),(int64_t) rec->pos+1); - } - } - if ( args->tags & SET_AC_Het ) -@@ -560,7 +694,7 @@ - args->str.l = 0; - ksprintf(&args->str, "AC_Het%s", args->pop[i].suffix); - if ( bcf_update_info_int32(args->out_hdr,rec,args->str.s,args->iarr,rec->n_allele-1)!=0 ) -- error("Error occurred while updating %s at %s:%d\n", args->str.s,bcf_seqname(args->in_hdr,rec),rec->pos+1); -+ error("Error occurred while updating %s at %s:%"PRId64"\n", args->str.s,bcf_seqname(args->in_hdr,rec),(int64_t) rec->pos+1); - } - } - if ( args->tags & SET_AC_Hom ) -@@ -577,7 +711,7 @@ - args->str.l = 0; - ksprintf(&args->str, "AC_Hom%s", args->pop[i].suffix); - if ( bcf_update_info_int32(args->out_hdr,rec,args->str.s,args->iarr,rec->n_allele-1)!=0 ) -- error("Error occurred while updating %s at %s:%d\n", args->str.s,bcf_seqname(args->in_hdr,rec),rec->pos+1); -+ error("Error occurred while updating %s at %s:%"PRId64"\n", args->str.s,bcf_seqname(args->in_hdr,rec),(int64_t) rec->pos+1); - } - } - if ( args->tags & SET_AC_Hemi && rec->n_allele > 1 ) -@@ -594,7 +728,7 @@ - args->str.l = 0; - ksprintf(&args->str, "AC_Hemi%s", args->pop[i].suffix); - if ( bcf_update_info_int32(args->out_hdr,rec,args->str.s,args->iarr,rec->n_allele-1)!=0 ) -- error("Error occurred while updating %s at %s:%d\n", args->str.s,bcf_seqname(args->in_hdr,rec),rec->pos+1); -+ error("Error occurred while updating %s at %s:%"PRId64"\n", args->str.s,bcf_seqname(args->in_hdr,rec),(int64_t) rec->pos+1); - } - } - if ( args->tags & (SET_HWE|SET_ExcHet) ) -@@ -625,14 +759,14 @@ - args->str.l = 0; - ksprintf(&args->str, "HWE%s", args->pop[i].suffix); - if ( bcf_update_info_float(args->out_hdr,rec,args->str.s,fhwe,rec->n_allele-1)!=0 ) -- error("Error occurred while updating %s at %s:%d\n", args->str.s,bcf_seqname(args->in_hdr,rec),rec->pos+1); -+ error("Error occurred while updating %s at %s:%"PRId64"\n", 
args->str.s,bcf_seqname(args->in_hdr,rec),(int64_t) rec->pos+1); - } - if ( args->tags & SET_ExcHet ) - { - args->str.l = 0; - ksprintf(&args->str, "ExcHet%s", args->pop[i].suffix); - if ( bcf_update_info_float(args->out_hdr,rec,args->str.s,fexc_het,rec->n_allele-1)!=0 ) -- error("Error occurred while updating %s at %s:%d\n", args->str.s,bcf_seqname(args->in_hdr,rec),rec->pos+1); -+ error("Error occurred while updating %s at %s:%"PRId64"\n", args->str.s,bcf_seqname(args->in_hdr,rec),(int64_t) rec->pos+1); - } - } - } -@@ -650,12 +784,14 @@ - free(args->pop[i].smpl); - free(args->pop[i].counts); - } -+ kbs_destroy(args->bset); - free(args->str.s); - free(args->pop); - free(args->smpl2pop); - free(args->iarr); - free(args->farr); - free(args->hwe_probs); -+ ftf_destroy(args); - free(args); - } - ---- python-pysam.orig/bcftools/plugins/fill-tags.c.pysam.c -+++ python-pysam/bcftools/plugins/fill-tags.c.pysam.c -@@ -2,7 +2,7 @@ - - /* The MIT License - -- Copyright (c) 2015 Genome Research Ltd. -+ Copyright (c) 2015-2019 Genome Research Ltd. 
- - Author: Petr Danecek - -@@ -31,10 +31,12 @@ - #include - #include - #include -+#include - #include - #include - #include - #include -+#include - #include "bcftools.h" - - #define SET_AN (1<<0) -@@ -47,6 +49,17 @@ - #define SET_MAF (1<<7) - #define SET_HWE (1<<8) - #define SET_ExcHet (1<<9) -+#define SET_FUNC (1<<10) -+ -+typedef struct _args_t args_t; -+typedef struct _ftf_t ftf_t; -+typedef int (*fill_tag_f)(args_t *, bcf1_t *, ftf_t *); -+struct _ftf_t -+{ -+ char *src_tag, *dst_tag; -+ fill_tag_f func; -+ int *pop_vals; // for now assuming only 1 integer value per annotation -+}; - - typedef struct - { -@@ -64,7 +77,7 @@ - } - pop_t; - --typedef struct -+struct _args_t - { - bcf_hdr_t *in_hdr, *out_hdr; - int npop, tags, drop_missing, gt_id; -@@ -74,21 +87,24 @@ - double *hwe_probs; - int mhwe_probs; - kstring_t str; --} --args_t; -+ kbitset_t *bset; -+ ftf_t *ftf; -+ int nftf; -+}; - - static args_t *args; - - const char *about(void) - { -- return "Set INFO tags AF, AC, AC_Hemi, AC_Hom, AC_Het, AN, ExcHet, HWE, MAF, NS.\n"; -+ return "Set INFO tags AF, AC, AC_Hemi, AC_Hom, AC_Het, AN, ExcHet, HWE, MAF, NS and more.\n"; - } - - const char *usage(void) - { - return - "\n" -- "About: Set INFO tags AF, AC, AC_Hemi, AC_Hom, AC_Het, AN, ExcHet, HWE, MAF, NS.\n" -+ "About: Set INFO tags AF, AC, AC_Hemi, AC_Hom, AC_Het, AN, ExcHet, HWE, MAF, NS\n" -+ " or custom INFO/TAG=func(FMT/TAG), use -l for detailed description\n" - "Usage: bcftools +fill-tags [General Options] -- [Plugin Options]\n" - "Options:\n" - " run \"bcftools plugin\" for a list of common options\n" -@@ -96,14 +112,24 @@ - "Plugin options:\n" - " -d, --drop-missing do not count half-missing genotypes \"./1\" as hemizygous\n" - " -l, --list-tags list available tags with description\n" -- " -t, --tags LIST list of output tags. 
By default, all tags are filled.\n" -+ " -t, --tags LIST list of output tags, \"all\" for all tags\n" - " -S, --samples-file FILE list of samples (first column) and comma-separated list of populations (second column)\n" - "\n" - "Example:\n" -- " bcftools +fill-tags in.bcf -Ob -o out.bcf\n" -+ " # Print a detailed list of available tags\n" -+ " bcftools +fill-tags -- -l\n" -+ "\n" -+ " # Fill INFO/AN and INFO/AC\n" - " bcftools +fill-tags in.bcf -Ob -o out.bcf -- -t AN,AC\n" -- " bcftools +fill-tags in.bcf -Ob -o out.bcf -- -d\n" -+ "\n" -+ " # Fill all available tags\n" -+ " bcftools +fill-tags in.bcf -Ob -o out.bcf -- -t all\n" -+ "\n" -+ " # Calculate HWE for sample groups (possibly multiple) read from a file\n" - " bcftools +fill-tags in.bcf -Ob -o out.bcf -- -S sample-group.txt -t HWE\n" -+ "\n" -+ " # Calculate total read depth (INFO/DP) from per-sample depths (FORMAT/DP)\n" -+ " bcftools +fill-tags in.bcf -Ob -o out.bcf -- -t 'DP=sum(DP)'\n" - "\n"; - } - -@@ -182,7 +208,7 @@ - khash_str2int_destroy_free(smpli); - free(str.s); - free(off); -- hts_close(fp); -+ if ( hts_close(fp)!=0 ) error("[%s] Error: close failed .. 
%s\n", __func__,fname); - } - - void init_pops(args_t *args) -@@ -213,13 +239,118 @@ - } - } - -+void ftf_destroy(args_t *args) -+{ -+ int i; -+ for (i=0; inftf; i++) -+ { -+ ftf_t *ftf = &args->ftf[i]; -+ free(ftf->src_tag); -+ free(ftf->dst_tag); -+ free(ftf->pop_vals); -+ } -+ free(args->ftf); -+} -+int ftf_sum(args_t *args, bcf1_t *rec, ftf_t *ftf) -+{ -+ int nsmpl = bcf_hdr_nsamples(args->in_hdr); -+ int nval = bcf_get_format_int32(args->in_hdr, rec, ftf->src_tag, &args->iarr, &args->miarr); -+ if ( nval<=0 ) return 0; -+ nval /= nsmpl; -+ -+ int i; -+ for (i=0; inpop; i++) -+ ftf->pop_vals[i] = -1; -+ -+ for (i=0; iiarr[i*nval]==bcf_int32_missing || args->iarr[i*nval]==bcf_int32_vector_end ) continue; -+ -+ pop_t **pop = &args->smpl2pop[i*(args->npop+1)]; -+ while ( *pop ) -+ { -+ int ipop = (int)(*pop - args->pop); -+ if ( ftf->pop_vals[ipop]<0 ) ftf->pop_vals[ipop] = 0; -+ ftf->pop_vals[ipop] += args->iarr[i*nval]; -+ pop++; -+ } -+ } -+ -+ for (i=0; inpop; i++) -+ { -+ if ( ftf->pop_vals[i]<0 ) continue; -+ args->str.l = 0; -+ ksprintf(&args->str, "%s%s", ftf->dst_tag,args->pop[i].suffix); -+ if ( bcf_update_info_int32(args->out_hdr,rec,args->str.s,ftf->pop_vals+i,1)!=0 ) -+ error("Error occurred while updating %s at %s:%"PRId64"\n", args->str.s,bcf_seqname(args->in_hdr,rec),(int64_t) rec->pos+1); -+ } -+ -+ return 0; -+} -+ -+void hdr_append(args_t *args, char *fmt) -+{ -+ int i; -+ for (i=0; inpop; i++) -+ bcf_hdr_printf(args->out_hdr, fmt, args->pop[i].suffix,*args->pop[i].name ? 
" in " : "",args->pop[i].name); -+} -+ -+int parse_func(args_t *args, char *tag, char *expr) -+{ -+ args->nftf++; -+ args->ftf = (ftf_t *)realloc(args->ftf,sizeof(*args->ftf)*args->nftf); -+ ftf_t *ftf = &args->ftf[ args->nftf - 1 ]; -+ -+ ftf->pop_vals = (int*)calloc(args->npop,sizeof(*ftf->pop_vals)); -+ ftf->dst_tag = (char*)calloc(expr-tag,1); -+ memcpy(ftf->dst_tag, tag, expr-tag-1); -+ -+ if ( !strncasecmp(expr,"sum(",4) ) { ftf->func = ftf_sum; expr += 4; } -+ else error("Error: the expression not recognised: %s\n",tag); -+ -+ char *tmp = expr; -+ while ( *tmp && *tmp!=')' ) tmp++; -+ if ( !*tmp ) error("Error: could not parse: %s\n",tag); -+ -+ ftf->src_tag = (char*)calloc(tmp-expr+2,1); -+ memcpy(ftf->src_tag, expr, tmp-expr); -+ -+ int id = bcf_hdr_id2int(args->in_hdr,BCF_DT_ID,ftf->src_tag); -+ if ( !bcf_hdr_idinfo_exists(args->in_hdr,BCF_HL_FMT,id) ) error("Error: the field FORMAT/%s is not present\n",ftf->src_tag); -+ -+ int i = 0; -+ for (i=0; inpop; i++) -+ { -+ args->str.l = 0; -+ ksprintf(&args->str, "%s%s", ftf->dst_tag,args->pop[i].suffix); -+ id = bcf_hdr_id2int(args->in_hdr,BCF_DT_ID,args->str.s); -+ if ( bcf_hdr_idinfo_exists(args->in_hdr,BCF_HL_FMT,id) ) -+ { -+ if ( bcf_hdr_id2length(args->in_hdr,BCF_HL_FMT,id)!=BCF_VL_FIXED ) -+ error("Error: the field INFO/%s already exists with a definition different from Number=1\n",args->str.s); -+ if ( bcf_hdr_id2number(args->in_hdr,BCF_HL_FMT,id)!=1 ) -+ error("Error: the field INFO/%s already exists with a definition different from Number=1\n",args->str.s); -+ if ( bcf_hdr_id2type(args->in_hdr,BCF_HT_INT,id)!=BCF_HT_INT ) -+ error("Error: the field INFO/%s already exists with a definition different from Type=Integer\n",args->str.s); -+ } -+ else -+ bcf_hdr_printf(args->out_hdr, "##INFO=",args->str.s,tag,*args->pop[i].name ? 
" in " : "",args->pop[i].name); -+ } -+ return SET_FUNC; -+} - int parse_tags(args_t *args, const char *str) - { -- int i, flag = 0, n_tags; -- char **tags = hts_readlist(str, 0, &n_tags); -+ if ( !args->in_hdr ) error("%s", usage()); -+ -+ int i,j, flag = 0, n_tags; -+ char **tags = hts_readlist(str, 0, &n_tags), *ptr; - for(i=0; inpop; i++) -- bcf_hdr_printf(args->out_hdr, fmt, args->pop[i].suffix,*args->pop[i].name ? " in " : "",args->pop[i].name); --} -- - void list_tags(void) - { - error( -@@ -258,8 +383,10 @@ - "INFO/AC_Hemi Number:A Type:Integer .. Allele counts in hemizygous genotypes\n" - "INFO/AF Number:A Type:Float .. Allele frequency\n" - "INFO/MAF Number:A Type:Float .. Minor Allele frequency\n" -- "INFO/HWE Number:A Type:Float .. HWE test (PMID:15789306)\n" -- "INFO/ExcHet Number:A Type:Float .. Probability of excess heterozygosity\n" -+ "INFO/HWE Number:A Type:Float .. HWE test (PMID:15789306); 1=good, 0=bad\n" -+ "INFO/ExcHet Number:A Type:Float .. Test excess heterozygosity; 1=good, 0=bad\n" -+ "TAG=func(TAG) Number:1 Type:Integer .. Experimental support for user-defined\n" -+ " expressions such as \"DP=sum(DP)\". 
This is currently very basic, to be extended.\n" - ); - } - -@@ -268,7 +395,7 @@ - args = (args_t*) calloc(1,sizeof(args_t)); - args->in_hdr = in; - args->out_hdr = out; -- char *samples_fname = NULL; -+ char *samples_fname = NULL, *tags_str = "all"; - static struct option loptions[] = - { - {"list-tags",0,0,'l'}, -@@ -284,7 +411,7 @@ - { - case 'l': list_tags(); break; - case 'd': args->drop_missing = 1; break; -- case 't': args->tags |= parse_tags(args,optarg); break; -+ case 't': tags_str = optarg; break; - case 'S': samples_fname = optarg; break; - case 'h': - case '?': -@@ -297,12 +424,11 @@ - args->gt_id = bcf_hdr_id2int(args->in_hdr,BCF_DT_ID,"GT"); - if ( args->gt_id<0 ) error("Error: GT field is not present\n"); - -- if ( !args->tags ) -- for (c=0; c<=9; c++) args->tags |= 1<tags |= parse_tags(args,tags_str); -+ - if ( args->tags & SET_AN ) hdr_append(args, "##INFO="); - if ( args->tags & SET_AC ) hdr_append(args, "##INFO="); - if ( args->tags & SET_NS ) hdr_append(args, "##INFO="); -@@ -311,8 +437,8 @@ - if ( args->tags & SET_AC_Hemi ) hdr_append(args, "##INFO="); - if ( args->tags & SET_AF ) hdr_append(args, "##INFO="); - if ( args->tags & SET_MAF ) hdr_append(args, "##INFO="); -- if ( args->tags & SET_HWE ) hdr_append(args, "##INFO="); -- if ( args->tags & SET_ExcHet ) hdr_append(args, "##INFO="); -+ if ( args->tags & SET_HWE ) hdr_append(args, "##INFO="); -+ if ( args->tags & SET_ExcHet ) hdr_append(args, "##INFO="); - - return 0; - } -@@ -342,7 +468,7 @@ - double *probs = args->hwe_probs; - - // start at midpoint -- int mid = nrare * (nref + nalt - nrare) / (nref + nalt); -+ int mid = (double)nrare * (nref + nalt - nrare) / (nref + nalt); - - // check to ensure that midpoint and rare alleles have same parity - if ( (nrare & 1) ^ (mid & 1) ) mid++; -@@ -391,19 +517,17 @@ - *p_hwe = prob; - } - --static inline void set_counts(pop_t *pop, int is_half, int is_hom, int is_hemi, int als) -+static inline void set_counts(pop_t *pop, int is_half, int is_hom, 
int is_hemi, kbitset_t *bset) - { -- int ial; -- for (ial=0; als; ial++) -+ kbitset_iter_t itr; -+ int i; -+ kbs_start(&itr); -+ while ((i = kbs_next(bset, &itr)) >= 0) - { -- if ( als&1 ) -- { -- if ( is_half ) pop->counts[ial].nac++; -- else if ( !is_hom ) pop->counts[ial].nhet++; -- else if ( !is_hemi ) pop->counts[ial].nhom += 2; -- else pop->counts[ial].nhemi++; -- } -- als >>= 1; -+ if ( is_half ) pop->counts[i].nac++; -+ else if ( !is_hom ) pop->counts[i].nhet++; -+ else if ( !is_hemi ) pop->counts[i].nhom += 2; -+ else pop->counts[i].nhemi++; - } - pop->ns++; - } -@@ -415,9 +539,13 @@ - - bcf1_t *process(bcf1_t *rec) - { -+ bcf_unpack(rec, BCF_UN_FMT); -+ - int i,j, nsmpl = bcf_hdr_nsamples(args->in_hdr);; - -- bcf_unpack(rec, BCF_UN_FMT); -+ for (i=0; inftf; i++) -+ args->ftf[i].func(args, rec, &args->ftf[i]); -+ - bcf_fmt_t *fmt_gt = NULL; - for (i=0; in_fmt; i++) - if ( rec->d.fmt[i].id==args->gt_id ) { fmt_gt = &rec->d.fmt[i]; break; } -@@ -431,14 +559,15 @@ - for (i=0; inpop; i++) - clean_counts(&args->pop[i], rec->n_allele); - -- assert( rec->n_allele < 8*sizeof(int) ); -+ if ( kbs_resize(&args->bset, rec->n_allele) < 0 ) error("kbs_resize: failed to store %d bits\n", rec->n_allele); - - #define BRANCH_INT(type_t,vector_end) \ - { \ - for (i=0; ip + i*fmt_gt->size); \ -- int ial, als = 0, nals = 0, is_half, is_hom, is_hemi; \ -+ int ial, nbits = 0, nals = 0, is_half, is_hom, is_hemi; \ -+ kbs_clear(args->bset); \ - for (ial=0; ialn; ial++) \ - { \ - if ( p[ial]==vector_end ) break; /* smaller ploidy */ \ -@@ -447,11 +576,12 @@ - nals++; \ - \ - if ( idx >= rec->n_allele ) \ -- error("Incorrect allele (\"%d\") in %s at %s:%d\n",idx,args->in_hdr->samples[i],bcf_seqname(args->in_hdr,rec),rec->pos+1); \ -- als |= (1<in_hdr->samples[i],bcf_seqname(args->in_hdr,rec),(int64_t) rec->pos+1); \ -+ if ( !kbs_exists(args->bset, idx) ) nbits++; \ -+ kbs_insert(args->bset, idx); \ - } \ - if ( nals==0 ) continue; /* missing genotype */ \ -- is_hom = als && !(als & 
(als-1)); /* only one bit is set */ \ -+ is_hom = nbits==1 ? 1 : 0; /* only one bit is set for homs */ \ - if ( nals!=ial ) \ - { \ - if ( args->drop_missing ) is_hemi = 0, is_half = 1; \ -@@ -460,14 +590,14 @@ - else if ( nals==1 ) is_hemi = 1, is_half = 0; \ - else is_hemi = 0, is_half = 0; \ - pop_t **pop = &args->smpl2pop[i*(args->npop+1)]; \ -- while ( *pop ) { set_counts(*pop,is_half,is_hom,is_hemi,als); pop++; }\ -+ while ( *pop ) { set_counts(*pop,is_half,is_hom,is_hemi,args->bset); pop++; } \ - } \ - } - switch (fmt_gt->type) { - case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_vector_end); break; - case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_vector_end); break; - case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_vector_end); break; -- default: error("The GT type is not recognised: %d at %s:%d\n",fmt_gt->type, bcf_seqname(args->in_hdr,rec),rec->pos+1); break; -+ default: error("The GT type is not recognised: %d at %s:%"PRId64"\n",fmt_gt->type, bcf_seqname(args->in_hdr,rec),(int64_t) rec->pos+1); break; - } - #undef BRANCH_INT - -@@ -478,7 +608,7 @@ - args->str.l = 0; - ksprintf(&args->str, "NS%s", args->pop[i].suffix); - if ( bcf_update_info_int32(args->out_hdr,rec,args->str.s,&args->pop[i].ns,1)!=0 ) -- error("Error occurred while updating %s at %s:%d\n", args->str.s,bcf_seqname(args->in_hdr,rec),rec->pos+1); -+ error("Error occurred while updating %s at %s:%"PRId64"\n", args->str.s,bcf_seqname(args->in_hdr,rec),(int64_t) rec->pos+1); - } - } - if ( args->tags & SET_AN ) -@@ -493,7 +623,7 @@ - args->str.l = 0; - ksprintf(&args->str, "AN%s", args->pop[i].suffix); - if ( bcf_update_info_int32(args->out_hdr,rec,args->str.s,&an,1)!=0 ) -- error("Error occurred while updating %s at %s:%d\n", args->str.s,bcf_seqname(args->in_hdr,rec),rec->pos+1); -+ error("Error occurred while updating %s at %s:%"PRId64"\n", args->str.s,bcf_seqname(args->in_hdr,rec),(int64_t) rec->pos+1); - } - } - if ( args->tags & (SET_AF | SET_MAF) ) -@@ -509,25 +639,29 @@ - args->farr[j-1] 
+= pop->counts[j].nhet + pop->counts[j].nhom + pop->counts[j].nhemi + pop->counts[j].nac; - an = pop->counts[0].nhet + pop->counts[0].nhom + pop->counts[0].nhemi + pop->counts[0].nac; - for (j=1; jn_allele; j++) an += args->farr[j-1]; -- if ( !an ) continue; -- for (j=1; jn_allele; j++) args->farr[j-1] /= an; -+ if ( an ) -+ for (j=1; jn_allele; j++) args->farr[j-1] /= an; -+ else -+ for (j=1; jn_allele; j++) bcf_float_set_missing(args->farr[j-1]); - } - if ( args->tags & SET_AF ) - { - args->str.l = 0; - ksprintf(&args->str, "AF%s", args->pop[i].suffix); - if ( bcf_update_info_float(args->out_hdr,rec,args->str.s,args->farr,rec->n_allele-1)!=0 ) -- error("Error occurred while updating %s at %s:%d\n", args->str.s,bcf_seqname(args->in_hdr,rec),rec->pos+1); -+ error("Error occurred while updating %s at %s:%"PRId64"\n", args->str.s,bcf_seqname(args->in_hdr,rec),(int64_t) rec->pos+1); - } - if ( args->tags & SET_MAF ) - { -- if ( !an ) continue; -- for (j=1; jn_allele; j++) -- if ( args->farr[j-1] > 0.5 ) args->farr[j-1] = 1 - args->farr[j-1]; // todo: this is incorrect for multiallelic sites -+ if ( an ) -+ { -+ for (j=1; jn_allele; j++) -+ if ( args->farr[j-1] > 0.5 ) args->farr[j-1] = 1 - args->farr[j-1]; // todo: this is incorrect for multiallelic sites -+ } - args->str.l = 0; - ksprintf(&args->str, "MAF%s", args->pop[i].suffix); - if ( bcf_update_info_float(args->out_hdr,rec,args->str.s,args->farr,rec->n_allele-1)!=0 ) -- error("Error occurred while updating %s at %s:%d\n", args->str.s,bcf_seqname(args->in_hdr,rec),rec->pos+1); -+ error("Error occurred while updating %s at %s:%"PRId64"\n", args->str.s,bcf_seqname(args->in_hdr,rec),(int64_t) rec->pos+1); - } - } - } -@@ -545,7 +679,7 @@ - args->str.l = 0; - ksprintf(&args->str, "AC%s", args->pop[i].suffix); - if ( bcf_update_info_int32(args->out_hdr,rec,args->str.s,args->iarr,rec->n_allele-1)!=0 ) -- error("Error occurred while updating %s at %s:%d\n", args->str.s,bcf_seqname(args->in_hdr,rec),rec->pos+1); -+ 
error("Error occurred while updating %s at %s:%"PRId64"\n", args->str.s,bcf_seqname(args->in_hdr,rec),(int64_t) rec->pos+1); - } - } - if ( args->tags & SET_AC_Het ) -@@ -562,7 +696,7 @@ - args->str.l = 0; - ksprintf(&args->str, "AC_Het%s", args->pop[i].suffix); - if ( bcf_update_info_int32(args->out_hdr,rec,args->str.s,args->iarr,rec->n_allele-1)!=0 ) -- error("Error occurred while updating %s at %s:%d\n", args->str.s,bcf_seqname(args->in_hdr,rec),rec->pos+1); -+ error("Error occurred while updating %s at %s:%"PRId64"\n", args->str.s,bcf_seqname(args->in_hdr,rec),(int64_t) rec->pos+1); - } - } - if ( args->tags & SET_AC_Hom ) -@@ -579,7 +713,7 @@ - args->str.l = 0; - ksprintf(&args->str, "AC_Hom%s", args->pop[i].suffix); - if ( bcf_update_info_int32(args->out_hdr,rec,args->str.s,args->iarr,rec->n_allele-1)!=0 ) -- error("Error occurred while updating %s at %s:%d\n", args->str.s,bcf_seqname(args->in_hdr,rec),rec->pos+1); -+ error("Error occurred while updating %s at %s:%"PRId64"\n", args->str.s,bcf_seqname(args->in_hdr,rec),(int64_t) rec->pos+1); - } - } - if ( args->tags & SET_AC_Hemi && rec->n_allele > 1 ) -@@ -596,7 +730,7 @@ - args->str.l = 0; - ksprintf(&args->str, "AC_Hemi%s", args->pop[i].suffix); - if ( bcf_update_info_int32(args->out_hdr,rec,args->str.s,args->iarr,rec->n_allele-1)!=0 ) -- error("Error occurred while updating %s at %s:%d\n", args->str.s,bcf_seqname(args->in_hdr,rec),rec->pos+1); -+ error("Error occurred while updating %s at %s:%"PRId64"\n", args->str.s,bcf_seqname(args->in_hdr,rec),(int64_t) rec->pos+1); - } - } - if ( args->tags & (SET_HWE|SET_ExcHet) ) -@@ -627,14 +761,14 @@ - args->str.l = 0; - ksprintf(&args->str, "HWE%s", args->pop[i].suffix); - if ( bcf_update_info_float(args->out_hdr,rec,args->str.s,fhwe,rec->n_allele-1)!=0 ) -- error("Error occurred while updating %s at %s:%d\n", args->str.s,bcf_seqname(args->in_hdr,rec),rec->pos+1); -+ error("Error occurred while updating %s at %s:%"PRId64"\n", 
args->str.s,bcf_seqname(args->in_hdr,rec),(int64_t) rec->pos+1); - } - if ( args->tags & SET_ExcHet ) - { - args->str.l = 0; - ksprintf(&args->str, "ExcHet%s", args->pop[i].suffix); - if ( bcf_update_info_float(args->out_hdr,rec,args->str.s,fexc_het,rec->n_allele-1)!=0 ) -- error("Error occurred while updating %s at %s:%d\n", args->str.s,bcf_seqname(args->in_hdr,rec),rec->pos+1); -+ error("Error occurred while updating %s at %s:%"PRId64"\n", args->str.s,bcf_seqname(args->in_hdr,rec),(int64_t) rec->pos+1); - } - } - } -@@ -652,12 +786,14 @@ - free(args->pop[i].smpl); - free(args->pop[i].counts); - } -+ kbs_destroy(args->bset); - free(args->str.s); - free(args->pop); - free(args->smpl2pop); - free(args->iarr); - free(args->farr); - free(args->hwe_probs); -+ ftf_destroy(args); - free(args); - } - ---- python-pysam.orig/bcftools/plugins/fixploidy.c -+++ python-pysam/bcftools/plugins/fixploidy.c -@@ -190,7 +190,7 @@ - return rec; // GT field not present - - if ( ngts % n_sample ) -- error("Error at %s:%d: wrong number of GT fields\n",bcf_seqname(in_hdr,rec),rec->pos+1); -+ error("Error at %s:%"PRId64": wrong number of GT fields\n",bcf_seqname(in_hdr,rec),(int64_t) rec->pos+1); - - if ( force_ploidy==-1 ) - ploidy_query(ploidy, (char*)bcf_seqname(in_hdr,rec), rec->pos, sex2ploidy,NULL,&max_ploidy); -@@ -215,7 +215,7 @@ - while ( jpos+1); -+ error("Could not update GT field at %s:%"PRId64"\n", bcf_seqname(in_hdr,rec),(int64_t) rec->pos+1); - } - else if ( ngts!=1 || max_ploidy!=1 ) - { -@@ -232,7 +232,7 @@ - while ( jpos+1); -+ error("Could not update GT field at %s:%"PRId64"\n", bcf_seqname(in_hdr,rec),(int64_t) rec->pos+1); - } - return rec; - } ---- python-pysam.orig/bcftools/plugins/fixploidy.c.pysam.c -+++ python-pysam/bcftools/plugins/fixploidy.c.pysam.c -@@ -192,7 +192,7 @@ - return rec; // GT field not present - - if ( ngts % n_sample ) -- error("Error at %s:%d: wrong number of GT fields\n",bcf_seqname(in_hdr,rec),rec->pos+1); -+ error("Error at %s:%"PRId64": 
wrong number of GT fields\n",bcf_seqname(in_hdr,rec),(int64_t) rec->pos+1); - - if ( force_ploidy==-1 ) - ploidy_query(ploidy, (char*)bcf_seqname(in_hdr,rec), rec->pos, sex2ploidy,NULL,&max_ploidy); -@@ -217,7 +217,7 @@ - while ( jpos+1); -+ error("Could not update GT field at %s:%"PRId64"\n", bcf_seqname(in_hdr,rec),(int64_t) rec->pos+1); - } - else if ( ngts!=1 || max_ploidy!=1 ) - { -@@ -234,7 +234,7 @@ - while ( jpos+1); -+ error("Could not update GT field at %s:%"PRId64"\n", bcf_seqname(in_hdr,rec),(int64_t) rec->pos+1); - } - return rec; - } ---- python-pysam.orig/bcftools/plugins/fixref.c -+++ python-pysam/bcftools/plugins/fixref.c -@@ -76,6 +76,7 @@ - #include - #include - #include -+#include - #include - #include - #include -@@ -90,6 +91,7 @@ - #define MODE_TOP2FWD 2 - #define MODE_FLIP2FWD 3 - #define MODE_USE_ID 4 -+#define MODE_REF_ALT 5 - - typedef struct - { -@@ -128,16 +130,20 @@ - "\n" - "About: This tool helps to determine and fix strand orientation.\n" - " Currently the following modes are recognised:\n" -- " flip .. flips non-ambiguous SNPs and ignores the rest\n" -- " id .. swap REF/ALT and GTs using the ID column to determine the REF allele\n" -- " stats .. collect and print stats\n" -- " top .. converts from Illumina TOP strand to fwd\n" -+ " flip .. flip REF/ALT columns and GTs for non-ambiguous SNPs and ignore the rest\n" -+ " id .. swap REF/ALT columns and GTs using the ID column to determine the REF allele\n" -+ " ref-alt .. swap REF/ALT columns to match the reference but not modify the genotypes\n" -+ " stats .. collect and print stats\n" -+ " top .. convert from Illumina TOP strand to fwd\n" - "\n" - " WARNING: Do not use the program blindly, make an effort to\n" - " understand what strand convention your data uses! 
Make sure\n" - " the reason for mismatching REF alleles is not a different\n" - " reference build!!\n" - "\n" -+ " Please check this page before messing up your VCF even more\n" -+ " http://samtools.github.io/bcftools/howtos/plugin.fixref.html\n" -+ "\n" - "Usage: bcftools +fixref [General Options] -- [Plugin Options]\n" - "Options:\n" - " run \"bcftools plugin\" for a list of common options\n" -@@ -148,7 +154,7 @@ - " -i, --use-id Swap REF/ALT using the ID column to determine the REF allele, implies -m id.\n" - " Download the dbSNP file from\n" - " https://www.ncbi.nlm.nih.gov/variation/docs/human_variation_vcf\n" -- " -m, --mode Collect stats (\"stats\") or convert (\"flip\", \"id\", \"top\") [stats]\n" -+ " -m, --mode Collect stats (\"stats\") or convert (\"flip\", \"id\", \"ref-alt\", \"top\") [stats]\n" - "\n" - "Examples:\n" - " # run stats\n" -@@ -189,6 +195,7 @@ - if ( !strcasecmp(optarg,"top") ) args.mode = MODE_TOP2FWD; - else if ( !strcasecmp(optarg,"flip") ) args.mode = MODE_FLIP2FWD; - else if ( !strcasecmp(optarg,"id") ) args.mode = MODE_USE_ID; -+ else if ( !strcasecmp(optarg,"ref-alt") ) args.mode = MODE_REF_ALT; - else if ( !strcasecmp(optarg,"stats") ) args.mode = MODE_STATS; - else error("The source strand convention not recognised: %s\n", optarg); - break; -@@ -217,6 +224,8 @@ - if ( !swap ) return rec; // only fix the alleles, leaving GTs unchanged - - int ngts = bcf_get_genotypes(args->hdr, rec, &args->gts, &args->ngts); -+ if ( ngts<=0 ) return rec; // no samples, no genotypes -+ - int i, j, nsmpl = bcf_hdr_nsamples(args->hdr); - ngts /= nsmpl; - for (i=0; iskip_rid = rec->rid; - return -2; - } -- error("faidx_fetch_seq failed at %s:%d\n", bcf_seqname(args->hdr,rec),rec->pos+1); -+ error("faidx_fetch_seq failed at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); - } - int ir = nt2int(*ref); - free(ref); -@@ -288,6 +297,7 @@ - args->i2m = kh_init(i2m); - bcf_srs_t *sr = bcf_sr_init(); - if ( bcf_sr_set_regions(sr, chr, 0) != 
0 ) goto done; -+ if ( !args->dbsnp_fname ) error("No ID file specified, use -i/--use-id\n"); - if ( !bcf_sr_add_reader(sr,args->dbsnp_fname) ) error("Failed to open %s: %s\n", args->dbsnp_fname,bcf_sr_strerror(sr->errnum)); - while ( bcf_sr_next_line(sr) ) - { -@@ -330,7 +340,7 @@ - - ref = kh_val(args->i2m, k).ref; - if ( ref!=ir ) -- error("Reference base mismatch at %s:%d .. %c vs %c\n",bcf_seqname(args->hdr,rec),rec->pos+1,int2nt(ref),int2nt(ir)); -+ error("Reference base mismatch at %s:%"PRId64" .. %c vs %c\n",bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1,int2nt(ref),int2nt(ir)); - - if ( ia==ref ) return rec; - if ( ib==ref ) { args->nswap++; return set_ref_alt(args,rec,int2nt(ib),int2nt(ia),1); } -@@ -408,14 +418,22 @@ - if ( !args.unsorted && args.pos > rec->pos ) - { - fprintf(stderr, -- "Warning: corrected position(s) results in unsorted VCF, for example %s:%d comes after %s:%d\n" -+ "Warning: corrected position(s) results in unsorted VCF, for example %s:%"PRId64" comes after %s:%d\n" - " The standard unix `sort` or `vcf-sort` from vcftools can be used to fix the order.\n", -- bcf_seqname(args.hdr,rec),rec->pos+1,bcf_seqname(args.hdr,rec),args.pos); -+ bcf_seqname(args.hdr,rec),(int64_t) rec->pos+1,bcf_seqname(args.hdr,rec),args.pos); - args.unsorted = 1; - } - args.pos = rec->pos; - return ret; - } -+ else if ( args.mode==MODE_REF_ALT ) // only change the REF/ALT column, leave the genotypes as is -+ { -+ if ( ir==ia ) return ret; -+ if ( ir==ib ) { args.nswap++; return set_ref_alt(&args,rec,int2nt(ib),int2nt(ia),0); } -+ if ( ir==revint(ia) ) { args.nflip++; return set_ref_alt(&args,rec,int2nt(revint(ia)),int2nt(revint(ib)),0); } -+ if ( ir==revint(ib) ) { args.nflip_swap++; return set_ref_alt(&args,rec,int2nt(revint(ib)),int2nt(revint(ia)),0); } -+ error("FIXME: this should not happen %s:%"PRId64"\n", bcf_seqname(args.hdr,rec),(int64_t) rec->pos+1); -+ } - else if ( args.mode==MODE_FLIP2FWD ) - { - int pair = 1 << ia | 1 << ib; -@@ -428,7 +446,7 @@ - 
if ( ir==ib ) { args.nswap++; return set_ref_alt(&args,rec,int2nt(ib),int2nt(ia),1); } - if ( ir==revint(ia) ) { args.nflip++; return set_ref_alt(&args,rec,int2nt(revint(ia)),int2nt(revint(ib)),0); } - if ( ir==revint(ib) ) { args.nflip_swap++; return set_ref_alt(&args,rec,int2nt(revint(ib)),int2nt(revint(ia)),1); } -- error("FIXME: this should not happen %s:%d\n", bcf_seqname(args.hdr,rec),rec->pos+1); -+ error("FIXME: this should not happen %s:%"PRId64"\n", bcf_seqname(args.hdr,rec),(int64_t) rec->pos+1); - } - else if ( args.mode==MODE_TOP2FWD ) - { -@@ -457,8 +475,8 @@ - { - int len, win = rec->pos > 100 ? 100 : rec->pos, beg = rec->pos - win, end = rec->pos + win; - char *ref = faidx_fetch_seq(args.fai, (char*)bcf_seqname(args.hdr,rec), beg,end, &len); -- if ( !ref ) error("faidx_fetch_seq failed at %s:%d\n", bcf_seqname(args.hdr,rec),rec->pos+1); -- if ( end - beg + 1 != len ) error("FIXME: check win=%d,len=%d at %s:%d (%d %d)\n", win,len, bcf_seqname(args.hdr,rec),rec->pos+1, end,beg); -+ if ( !ref ) error("faidx_fetch_seq failed at %s:%"PRId64"\n", bcf_seqname(args.hdr,rec),(int64_t) rec->pos+1); -+ if ( end - beg + 1 != len ) error("FIXME: check win=%d,len=%d at %s:%"PRId64" (%d %d)\n", win,len, bcf_seqname(args.hdr,rec),(int64_t) rec->pos+1, end,beg); - - int i, mid = rec->pos - beg, strand = 0; - for (i=1; i<=win; i++) ---- python-pysam.orig/bcftools/plugins/fixref.c.pysam.c -+++ python-pysam/bcftools/plugins/fixref.c.pysam.c -@@ -78,6 +78,7 @@ - #include - #include - #include -+#include - #include - #include - #include -@@ -92,6 +93,7 @@ - #define MODE_TOP2FWD 2 - #define MODE_FLIP2FWD 3 - #define MODE_USE_ID 4 -+#define MODE_REF_ALT 5 - - typedef struct - { -@@ -130,16 +132,20 @@ - "\n" - "About: This tool helps to determine and fix strand orientation.\n" - " Currently the following modes are recognised:\n" -- " flip .. flips non-ambiguous SNPs and ignores the rest\n" -- " id .. 
swap REF/ALT and GTs using the ID column to determine the REF allele\n" -- " stats .. collect and print stats\n" -- " top .. converts from Illumina TOP strand to fwd\n" -+ " flip .. flip REF/ALT columns and GTs for non-ambiguous SNPs and ignore the rest\n" -+ " id .. swap REF/ALT columns and GTs using the ID column to determine the REF allele\n" -+ " ref-alt .. swap REF/ALT columns to match the reference but not modify the genotypes\n" -+ " stats .. collect and print stats\n" -+ " top .. convert from Illumina TOP strand to fwd\n" - "\n" - " WARNING: Do not use the program blindly, make an effort to\n" - " understand what strand convention your data uses! Make sure\n" - " the reason for mismatching REF alleles is not a different\n" - " reference build!!\n" - "\n" -+ " Please check this page before messing up your VCF even more\n" -+ " http://samtools.github.io/bcftools/howtos/plugin.fixref.html\n" -+ "\n" - "Usage: bcftools +fixref [General Options] -- [Plugin Options]\n" - "Options:\n" - " run \"bcftools plugin\" for a list of common options\n" -@@ -150,7 +156,7 @@ - " -i, --use-id Swap REF/ALT using the ID column to determine the REF allele, implies -m id.\n" - " Download the dbSNP file from\n" - " https://www.ncbi.nlm.nih.gov/variation/docs/human_variation_vcf\n" -- " -m, --mode Collect stats (\"stats\") or convert (\"flip\", \"id\", \"top\") [stats]\n" -+ " -m, --mode Collect stats (\"stats\") or convert (\"flip\", \"id\", \"ref-alt\", \"top\") [stats]\n" - "\n" - "Examples:\n" - " # run stats\n" -@@ -191,6 +197,7 @@ - if ( !strcasecmp(optarg,"top") ) args.mode = MODE_TOP2FWD; - else if ( !strcasecmp(optarg,"flip") ) args.mode = MODE_FLIP2FWD; - else if ( !strcasecmp(optarg,"id") ) args.mode = MODE_USE_ID; -+ else if ( !strcasecmp(optarg,"ref-alt") ) args.mode = MODE_REF_ALT; - else if ( !strcasecmp(optarg,"stats") ) args.mode = MODE_STATS; - else error("The source strand convention not recognised: %s\n", optarg); - break; -@@ -219,6 +226,8 @@ - if ( !swap ) 
return rec; // only fix the alleles, leaving GTs unchanged - - int ngts = bcf_get_genotypes(args->hdr, rec, &args->gts, &args->ngts); -+ if ( ngts<=0 ) return rec; // no samples, no genotypes -+ - int i, j, nsmpl = bcf_hdr_nsamples(args->hdr); - ngts /= nsmpl; - for (i=0; iskip_rid = rec->rid; - return -2; - } -- error("faidx_fetch_seq failed at %s:%d\n", bcf_seqname(args->hdr,rec),rec->pos+1); -+ error("faidx_fetch_seq failed at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); - } - int ir = nt2int(*ref); - free(ref); -@@ -290,6 +299,7 @@ - args->i2m = kh_init(i2m); - bcf_srs_t *sr = bcf_sr_init(); - if ( bcf_sr_set_regions(sr, chr, 0) != 0 ) goto done; -+ if ( !args->dbsnp_fname ) error("No ID file specified, use -i/--use-id\n"); - if ( !bcf_sr_add_reader(sr,args->dbsnp_fname) ) error("Failed to open %s: %s\n", args->dbsnp_fname,bcf_sr_strerror(sr->errnum)); - while ( bcf_sr_next_line(sr) ) - { -@@ -332,7 +342,7 @@ - - ref = kh_val(args->i2m, k).ref; - if ( ref!=ir ) -- error("Reference base mismatch at %s:%d .. %c vs %c\n",bcf_seqname(args->hdr,rec),rec->pos+1,int2nt(ref),int2nt(ir)); -+ error("Reference base mismatch at %s:%"PRId64" .. 
%c vs %c\n",bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1,int2nt(ref),int2nt(ir)); - - if ( ia==ref ) return rec; - if ( ib==ref ) { args->nswap++; return set_ref_alt(args,rec,int2nt(ib),int2nt(ia),1); } -@@ -410,14 +420,22 @@ - if ( !args.unsorted && args.pos > rec->pos ) - { - fprintf(bcftools_stderr, -- "Warning: corrected position(s) results in unsorted VCF, for example %s:%d comes after %s:%d\n" -+ "Warning: corrected position(s) results in unsorted VCF, for example %s:%"PRId64" comes after %s:%d\n" - " The standard unix `sort` or `vcf-sort` from vcftools can be used to fix the order.\n", -- bcf_seqname(args.hdr,rec),rec->pos+1,bcf_seqname(args.hdr,rec),args.pos); -+ bcf_seqname(args.hdr,rec),(int64_t) rec->pos+1,bcf_seqname(args.hdr,rec),args.pos); - args.unsorted = 1; - } - args.pos = rec->pos; - return ret; - } -+ else if ( args.mode==MODE_REF_ALT ) // only change the REF/ALT column, leave the genotypes as is -+ { -+ if ( ir==ia ) return ret; -+ if ( ir==ib ) { args.nswap++; return set_ref_alt(&args,rec,int2nt(ib),int2nt(ia),0); } -+ if ( ir==revint(ia) ) { args.nflip++; return set_ref_alt(&args,rec,int2nt(revint(ia)),int2nt(revint(ib)),0); } -+ if ( ir==revint(ib) ) { args.nflip_swap++; return set_ref_alt(&args,rec,int2nt(revint(ib)),int2nt(revint(ia)),0); } -+ error("FIXME: this should not happen %s:%"PRId64"\n", bcf_seqname(args.hdr,rec),(int64_t) rec->pos+1); -+ } - else if ( args.mode==MODE_FLIP2FWD ) - { - int pair = 1 << ia | 1 << ib; -@@ -430,7 +448,7 @@ - if ( ir==ib ) { args.nswap++; return set_ref_alt(&args,rec,int2nt(ib),int2nt(ia),1); } - if ( ir==revint(ia) ) { args.nflip++; return set_ref_alt(&args,rec,int2nt(revint(ia)),int2nt(revint(ib)),0); } - if ( ir==revint(ib) ) { args.nflip_swap++; return set_ref_alt(&args,rec,int2nt(revint(ib)),int2nt(revint(ia)),1); } -- error("FIXME: this should not happen %s:%d\n", bcf_seqname(args.hdr,rec),rec->pos+1); -+ error("FIXME: this should not happen %s:%"PRId64"\n", bcf_seqname(args.hdr,rec),(int64_t) 
rec->pos+1); - } - else if ( args.mode==MODE_TOP2FWD ) - { -@@ -459,8 +477,8 @@ - { - int len, win = rec->pos > 100 ? 100 : rec->pos, beg = rec->pos - win, end = rec->pos + win; - char *ref = faidx_fetch_seq(args.fai, (char*)bcf_seqname(args.hdr,rec), beg,end, &len); -- if ( !ref ) error("faidx_fetch_seq failed at %s:%d\n", bcf_seqname(args.hdr,rec),rec->pos+1); -- if ( end - beg + 1 != len ) error("FIXME: check win=%d,len=%d at %s:%d (%d %d)\n", win,len, bcf_seqname(args.hdr,rec),rec->pos+1, end,beg); -+ if ( !ref ) error("faidx_fetch_seq failed at %s:%"PRId64"\n", bcf_seqname(args.hdr,rec),(int64_t) rec->pos+1); -+ if ( end - beg + 1 != len ) error("FIXME: check win=%d,len=%d at %s:%"PRId64" (%d %d)\n", win,len, bcf_seqname(args.hdr,rec),(int64_t) rec->pos+1, end,beg); - - int i, mid = rec->pos - beg, strand = 0; - for (i=1; i<=win; i++) ---- python-pysam.orig/bcftools/plugins/guess-ploidy.c -+++ python-pysam/bcftools/plugins/guess-ploidy.c -@@ -387,7 +387,7 @@ - counts->pdip += log(pdip); - counts->ncount++; - if ( args->verbose>1 ) -- printf("DBG\t%s\t%d\t%s\t%e\t%e\t%e\t%e\t%e\t%e\n", bcf_seqname(args->hdr,rec),rec->pos+1,bcf_hdr_int2id(args->hdr,BCF_DT_SAMPLE,ismpl), -+ printf("DBG\t%s\t%"PRId64"\t%s\t%e\t%e\t%e\t%e\t%e\t%e\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1,bcf_hdr_int2id(args->hdr,BCF_DT_SAMPLE,ismpl), - freq[1],tmp[0],tmp[1],tmp[2],phap,pdip); - } - } -@@ -444,7 +444,7 @@ - else if ( !strcasecmp(optarg,"hg38") ) region = "chrX:2781480-155701381"; - else error("The argument not recognised, expected --genome b37, b38, hg19 or hg38: %s\n", optarg); - break; -- case 'R': region_is_file = 1; -+ case 'R': region_is_file = 1; // fall-through - case 'r': region = optarg; break; - case 'v': args->verbose++; break; - case 't': ---- python-pysam.orig/bcftools/plugins/guess-ploidy.c.pysam.c -+++ python-pysam/bcftools/plugins/guess-ploidy.c.pysam.c -@@ -389,7 +389,7 @@ - counts->pdip += log(pdip); - counts->ncount++; - if ( args->verbose>1 ) -- 
fprintf(bcftools_stdout, "DBG\t%s\t%d\t%s\t%e\t%e\t%e\t%e\t%e\t%e\n", bcf_seqname(args->hdr,rec),rec->pos+1,bcf_hdr_int2id(args->hdr,BCF_DT_SAMPLE,ismpl), -+ fprintf(bcftools_stdout, "DBG\t%s\t%"PRId64"\t%s\t%e\t%e\t%e\t%e\t%e\t%e\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1,bcf_hdr_int2id(args->hdr,BCF_DT_SAMPLE,ismpl), - freq[1],tmp[0],tmp[1],tmp[2],phap,pdip); - } - } -@@ -446,7 +446,7 @@ - else if ( !strcasecmp(optarg,"hg38") ) region = "chrX:2781480-155701381"; - else error("The argument not recognised, expected --genome b37, b38, hg19 or hg38: %s\n", optarg); - break; -- case 'R': region_is_file = 1; -+ case 'R': region_is_file = 1; // fall-through - case 'r': region = optarg; break; - case 'v': args->verbose++; break; - case 't': ---- /dev/null -+++ python-pysam/bcftools/plugins/gvcfz.c -@@ -0,0 +1,378 @@ -+/* -+ Copyright (C) 2017 Genome Research Ltd. -+ -+ Author: Petr Danecek -+ -+ Permission is hereby granted, free of charge, to any person obtaining a copy -+ of this software and associated documentation files (the "Software"), to deal -+ in the Software without restriction, including without limitation the rights -+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -+ copies of the Software, and to permit persons to whom the Software is -+ furnished to do so, subject to the following conditions: -+ -+ The above copyright notice and this permission notice shall be included in -+ all copies or substantial portions of the Software. -+ -+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -+ THE SOFTWARE. 
-+*/ -+/* -+ Compress gVCF file by resizing gVCF blocks according to specified criteria. -+*/ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include "bcftools.h" -+#include "filter.h" -+ -+#define FLT_INCLUDE 1 -+#define FLT_EXCLUDE 2 -+ -+#define GQ_KEY_NONE NULL -+#define GQ_KEY_GQ "GQ" -+#define GQ_KEY_RGQ "RGQ" -+ -+typedef struct -+{ -+ int32_t end, min_dp, gq, pl[3], grp; -+ char *gq_key; -+ bcf1_t *rec; -+} -+block_t; -+typedef struct -+{ -+ char *expr; // expression -+ int flt_id; // filter id, -1 for PASS -+ filter_t *flt; // filter -+} -+grp_t; -+typedef struct -+{ -+ filter_t *filter; -+ char *filter_str; -+ int filter_logic; -+ block_t gvcf; -+ htsFile *fh_out; -+ int ngrp; -+ grp_t *grp; -+ char *group_by; -+ int argc, region_is_file, target_is_file, output_type, trim_alts; -+ int32_t *tmpi, mtmpi, mean_min_dp_reported; -+ char **argv, *region, *target, *fname, *output_fname, *keep_tags; -+ bcf_hdr_t *hdr_in, *hdr_out; -+ bcf_srs_t *sr; -+} -+args_t; -+ -+const char *about(void) -+{ -+ return "Compress gVCF file by resizing gVCF blocks according to specified criteria.\n"; -+} -+ -+static const char *usage_text(void) -+{ -+ return -+ "\n" -+ "About: Compress gVCF file by resizing gVCF blocks according to specified criteria.\n" -+ "\n" -+ "Usage: bcftools +gvcfz [Options]\n" -+ "Plugin options:\n" -+ " -a, --trim-alt-alleles trim alternate alleles not seen in the genotypes\n" -+ " -e, --exclude exclude sites for which the expression is true\n" -+ " -i, --include include sites for which the expression is true\n" -+ " -g, --group-by EXPR group gVCF blocks according to the expression\n" -+ " -o, --output FILE write gVCF output to the FILE\n" -+ " -O, --output-type b|u|z|v b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n" -+ "Examples:\n" -+ " # Compress blocks by GQ and DP. 
Multiple blocks separated by a semicolon can be defined\n" -+ " bcftools +gvcfz input.bcf -g'PASS:GQ>60 & DP<20; PASS:GQ>40 & DP<15; Flt1:QG>20; Flt2:-'\n" -+ "\n" -+ " # Compress all non-reference sites into a single block, remove unused alternate alleles\n" -+ " bcftools +gvcfz input.bcf -a -g'PASS:GT!=\"alt\"'\n" -+ "\n"; -+} -+ -+static void init_groups(args_t *args) -+{ -+ args->hdr_out = bcf_hdr_dup(args->hdr_in); -+ bcf_hdr_printf(args->hdr_out, "##INFO="); -+ -+ // avoid nested double quotes in FILTER description -+ char *hdr_str = strdup(args->group_by); -+ char *tmp = hdr_str; -+ while (*tmp) -+ { -+ if ( *tmp=='"' ) *tmp = '\''; -+ tmp++; -+ } -+ -+ char *rmme_str = strdup(args->group_by), *beg = rmme_str; -+ while ( *beg ) -+ { -+ while ( *beg && isspace(*beg) ) beg++; -+ if ( !beg ) break; -+ char *end = beg; -+ while ( *end && *end!=':' ) end++; -+ if ( *end!=':' ) error("Could not parse the expression: \"%s\"\n", args->group_by); -+ *end = 0; -+ char *flt = beg; -+ beg = ++end; -+ while ( *end && *end!=';' ) end++; -+ char tmp = *end; *end = 0; -+ if ( strcmp(flt,"PASS") ) -+ { -+ bcf_hdr_printf(args->hdr_out, "##FILTER=", flt, hdr_str); -+ if (bcf_hdr_sync(args->hdr_out) < 0) -+ error_errno("[%s] Failed to update header", __func__); -+ } -+ args->ngrp++; -+ args->grp = (grp_t*) realloc(args->grp,sizeof(grp_t)*args->ngrp); -+ grp_t *grp = args->grp + args->ngrp - 1; -+ grp->expr = strdup(beg); -+ grp->flt_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, flt); -+ if ( !bcf_hdr_idinfo_exists(args->hdr_out, BCF_HL_FLT, grp->flt_id) ) error("Could not initialize the filter \"%s\"\n", flt); -+ if ( !strcmp(flt,"PASS") ) grp->flt_id = -1; -+ -+ // remove trailing spaces -+ beg = grp->expr + strlen(grp->expr); while ( beg >= grp->expr && isspace(*beg) ) { *beg = 0; beg--; } -+ beg = grp->expr; while ( *beg && isspace(*beg) ) beg++; -+ -+ grp->flt = strcmp("-",beg) ? 
filter_init(args->hdr_in, grp->expr) : NULL; -+ -+ if ( !tmp ) break; -+ beg = end + 1; -+ } -+ free(rmme_str); -+ free(hdr_str); -+} -+ -+static void destroy_data(args_t *args) -+{ -+ int i; -+ for (i=0; ingrp; i++) -+ { -+ if ( args->grp[i].flt ) filter_destroy(args->grp[i].flt); -+ free(args->grp[i].expr); -+ } -+ free(args->grp); -+ -+ if ( args->filter ) filter_destroy(args->filter); -+ if ( hts_close(args->fh_out)!=0 ) error("failed to close %s\n", args->output_fname); -+ -+ bcf_sr_destroy(args->sr); -+ if ( args->hdr_out ) bcf_hdr_destroy(args->hdr_out); -+ if ( args->gvcf.rec ) bcf_destroy(args->gvcf.rec); -+ free(args->tmpi); -+ free(args); -+} -+ -+static void flush_block(args_t *args, bcf1_t *rec) -+{ -+ block_t *gvcf = &args->gvcf; -+ if ( gvcf->grp < 0 ) return; -+ if ( rec && gvcf->end - 1 >= rec->pos ) gvcf->end = rec->pos; // NB: end is 1-based, rec->pos is 0-based -+ -+ if ( gvcf->rec->pos+1 < gvcf->end && bcf_update_info_int32(args->hdr_out,gvcf->rec,"END",&gvcf->end,1) != 0 ) -+ error("Could not update INFO/END at %s:%"PRId64"\n", bcf_seqname(args->hdr_out,gvcf->rec),(int64_t) gvcf->rec->pos+1); -+ if ( bcf_update_format_int32(args->hdr_out,gvcf->rec,"DP",&gvcf->min_dp,1) != 0 ) -+ error("Could not update FORMAT/DP at %s:%"PRId64"\n", bcf_seqname(args->hdr_out,gvcf->rec),(int64_t) gvcf->rec->pos+1); -+ if ( gvcf->gq_key ) -+ { -+ if ( bcf_update_format_int32(args->hdr_out,gvcf->rec,gvcf->gq_key,&gvcf->gq,1) != 0 ) -+ error("Could not update FORMAT/%s at %s:%"PRId64"\n", gvcf->gq_key, bcf_seqname(args->hdr_out,gvcf->rec),(int64_t) gvcf->rec->pos+1); -+ } -+ if ( gvcf->pl[0] >=0 ) -+ { -+ if ( bcf_update_format_int32(args->hdr_out,gvcf->rec,"PL",&gvcf->pl,3) != 0 ) -+ error("Could not update FORMAT/PL at %s:%"PRId64"\n", bcf_seqname(args->hdr_out,gvcf->rec),(int64_t) gvcf->rec->pos+1); -+ } -+ if ( gvcf->grp < args->ngrp && args->grp[gvcf->grp].flt_id >= 0 ) -+ bcf_add_filter(args->hdr_out, gvcf->rec, args->grp[gvcf->grp].flt_id); -+ -+ if ( 
bcf_write(args->fh_out, args->hdr_out, gvcf->rec)!=0 ) error("Failed to write the header\n"); -+ -+ gvcf->grp = -1; -+} -+static void process_gvcf(args_t *args) -+{ -+ bcf1_t *rec = bcf_sr_get_line(args->sr,0); -+ -+ if ( args->filter ) -+ { -+ int pass = filter_test(args->filter, rec, NULL); -+ if ( args->filter_logic & FLT_EXCLUDE ) pass = pass ? 0 : 1; -+ if ( !pass ) return; -+ } -+ -+ if ( rec->n_allele > 2 || (rec->n_allele == 2 && strcmp("",rec->d.allele[1]) && strcmp("<*>",rec->d.allele[1])) ) -+ { -+ if ( args->trim_alts ) -+ { -+ bcf_unpack(rec, BCF_UN_ALL); -+ if ( bcf_trim_alleles(args->hdr_in, rec)<0 ) -+ error("Error: Could not trim alleles at %s:%"PRId64"\n", bcf_seqname(args->hdr_in, rec),(int64_t) rec->pos+1); -+ -+ // trim the ref allele if necessary -+ if ( rec->d.allele[0][1] ) -+ { -+ rec->d.allele[0][1] = 0; -+ bcf_update_alleles(args->hdr_in, rec, (const char**)rec->d.allele, 1); -+ } -+ -+ } -+ if ( rec->n_allele > 2 || (rec->n_allele == 2 && strcmp("",rec->d.allele[1]) && strcmp("<*>",rec->d.allele[1])) ) -+ { -+ // not a gvcf block -+ flush_block(args, rec); -+ if ( bcf_write(args->fh_out, args->hdr_out, rec)!=0 ) error("Failed to write\n"); -+ return; -+ } -+ } -+ -+ int ret = bcf_get_info_int32(args->hdr_in,rec,"END",&args->tmpi,&args->mtmpi); -+ int32_t end = ret==1 ? args->tmpi[0] : rec->pos + 1; -+ -+ char *gq_key = GQ_KEY_GQ; -+ ret = bcf_get_format_int32(args->hdr_in,rec,gq_key,&args->tmpi,&args->mtmpi); -+ if ( ret!=1 ) -+ { -+ gq_key = GQ_KEY_RGQ; -+ if ( ret<1 ) ret = bcf_get_format_int32(args->hdr_in,rec,gq_key,&args->tmpi,&args->mtmpi); -+ if ( ret!=1 ) gq_key = GQ_KEY_NONE; -+ } -+ int32_t gq = ret==1 ? 
args->tmpi[0] : 0; -+ -+ int32_t min_dp = 0; -+ if ( bcf_get_format_int32(args->hdr_in,rec,"MIN_DP",&args->tmpi,&args->mtmpi)==1 ) -+ min_dp = args->tmpi[0]; -+ else if ( bcf_get_format_int32(args->hdr_in,rec,"DP",&args->tmpi,&args->mtmpi)==1 ) -+ min_dp = args->tmpi[0]; -+ else -+ error("Expected one FORMAT/MIN_DP or FORMAT/DP value at %s:%"PRId64"\n", bcf_seqname(args->hdr_in,rec),(int64_t) rec->pos+1); -+ -+ int32_t pl[3] = {-1,-1,-1}; -+ ret = bcf_get_format_int32(args->hdr_in,rec,"PL",&args->tmpi,&args->mtmpi); -+ if ( ret>3 ) error("Expected three FORMAT/PL values at %s:%"PRId64"\n", bcf_seqname(args->hdr_in,rec),(int64_t) rec->pos+1); -+ else if ( ret==3 ) -+ { -+ pl[0] = args->tmpi[0]; -+ pl[1] = args->tmpi[1]; -+ pl[2] = args->tmpi[2]; -+ } -+ -+ int i; -+ for (i=0; ingrp; i++) -+ if ( !args->grp[i].flt || filter_test(args->grp[i].flt, rec, NULL)==1 ) break; -+ -+ if ( args->gvcf.grp != i ) flush_block(args, rec); // new block -+ if ( args->gvcf.grp >= 0 && args->gvcf.rec->rid != rec->rid ) flush_block(args, NULL); // new chromosome -+ -+ if ( args->gvcf.grp >= 0 ) // extend an existing block -+ { -+ if ( args->gvcf.end < end ) args->gvcf.end = end; -+ if ( args->gvcf.gq_key!=GQ_KEY_NONE && gq_key!=GQ_KEY_NONE && args->gvcf.gq > gq ) args->gvcf.gq = gq; -+ if ( args->gvcf.min_dp > min_dp ) args->gvcf.min_dp = min_dp; -+ if ( args->gvcf.pl[0] > pl[0] ) args->gvcf.pl[0] = pl[0]; -+ if ( args->gvcf.pl[1] > pl[1] ) args->gvcf.pl[1] = pl[1]; -+ if ( args->gvcf.pl[2] > pl[2] ) args->gvcf.pl[2] = pl[2]; -+ return; -+ } -+ -+ // start a new block -+ args->gvcf.rec = bcf_copy(args->gvcf.rec, rec); -+ args->gvcf.grp = i; -+ args->gvcf.min_dp = min_dp; -+ args->gvcf.end = end; -+ args->gvcf.pl[0] = pl[0]; -+ args->gvcf.pl[1] = pl[1]; -+ args->gvcf.pl[2] = pl[2]; -+ args->gvcf.gq_key = gq_key; -+ if ( gq_key!=GQ_KEY_NONE ) args->gvcf.gq = gq; -+} -+ -+int run(int argc, char **argv) -+{ -+ args_t *args = (args_t*) calloc(1,sizeof(args_t)); -+ args->argc = argc; 
args->argv = argv; -+ args->output_type = FT_VCF; -+ args->output_fname = "-"; -+ static struct option loptions[] = -+ { -+ {"trim-alt-alleles",required_argument,0,'a'}, -+ {"include",required_argument,0,'i'}, -+ {"exclude",required_argument,0,'e'}, -+ {"group-by",required_argument,NULL,'g'}, -+ {"stats",required_argument,NULL,'s'}, -+ {"output",required_argument,NULL,'o'}, -+ {"output-type",required_argument,NULL,'O'}, -+ {NULL,0,NULL,0} -+ }; -+ int c; -+ while ((c = getopt_long(argc, argv, "vr:R:t:T:o:O:g:i:e:a",loptions,NULL)) >= 0) -+ { -+ switch (c) -+ { -+ case 'a': args->trim_alts = 1; break; -+ case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; -+ case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; -+ case 'g': args->group_by = optarg; break; -+ case 'o': args->output_fname = optarg; break; -+ case 'O': -+ switch (optarg[0]) { -+ case 'b': args->output_type = FT_BCF_GZ; break; -+ case 'u': args->output_type = FT_BCF; break; -+ case 'z': args->output_type = FT_VCF_GZ; break; -+ case 'v': args->output_type = FT_VCF; break; -+ default: error("The output type \"%s\" not recognised\n", optarg); -+ } -+ break; -+ case 'h': -+ case '?': -+ default: error("%s", usage_text()); break; -+ } -+ } -+ if ( optind==argc ) -+ { -+ if ( !isatty(fileno((FILE *)stdin)) ) args->fname = "-"; // reading from stdin -+ else { error("%s", usage_text()); } -+ } -+ else if ( optind+1!=argc ) error("%s", usage_text()); -+ else args->fname = argv[optind]; -+ -+ if ( !args->group_by ) error("Missing the -g option\n"); -+ -+ args->gvcf.rec = bcf_init(); -+ args->gvcf.grp = -1; // the block is inactive -+ args->sr = bcf_sr_init(); -+ if ( !bcf_sr_add_reader(args->sr,args->fname) ) error("Error: %s\n", bcf_sr_strerror(args->sr->errnum)); -+ args->hdr_in = bcf_sr_get_header(args->sr,0); -+ if ( args->filter_str ) -+ args->filter = filter_init(args->hdr_in, args->filter_str); -+ init_groups(args); -+ args->fh_out = 
hts_open(args->output_fname,hts_bcf_wmode(args->output_type)); -+ if ( bcf_hdr_write(args->fh_out, args->hdr_out)!=0 ) error("Failed to write the header\n"); -+ while ( bcf_sr_next_line(args->sr) ) process_gvcf(args); -+ flush_block(args, NULL); -+ -+ destroy_data(args); -+ return 0; -+} -+ -+ ---- /dev/null -+++ python-pysam/bcftools/plugins/gvcfz.c.pysam.c -@@ -0,0 +1,380 @@ -+#include "bcftools.pysam.h" -+ -+/* -+ Copyright (C) 2017 Genome Research Ltd. -+ -+ Author: Petr Danecek -+ -+ Permission is hereby granted, free of charge, to any person obtaining a copy -+ of this software and associated documentation files (the "Software"), to deal -+ in the Software without restriction, including without limitation the rights -+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -+ copies of the Software, and to permit persons to whom the Software is -+ furnished to do so, subject to the following conditions: -+ -+ The above copyright notice and this permission notice shall be included in -+ all copies or substantial portions of the Software. -+ -+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -+ THE SOFTWARE. -+*/ -+/* -+ Compress gVCF file by resizing gVCF blocks according to specified criteria. 
-+*/ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include "bcftools.h" -+#include "filter.h" -+ -+#define FLT_INCLUDE 1 -+#define FLT_EXCLUDE 2 -+ -+#define GQ_KEY_NONE NULL -+#define GQ_KEY_GQ "GQ" -+#define GQ_KEY_RGQ "RGQ" -+ -+typedef struct -+{ -+ int32_t end, min_dp, gq, pl[3], grp; -+ char *gq_key; -+ bcf1_t *rec; -+} -+block_t; -+typedef struct -+{ -+ char *expr; // expression -+ int flt_id; // filter id, -1 for PASS -+ filter_t *flt; // filter -+} -+grp_t; -+typedef struct -+{ -+ filter_t *filter; -+ char *filter_str; -+ int filter_logic; -+ block_t gvcf; -+ htsFile *fh_out; -+ int ngrp; -+ grp_t *grp; -+ char *group_by; -+ int argc, region_is_file, target_is_file, output_type, trim_alts; -+ int32_t *tmpi, mtmpi, mean_min_dp_reported; -+ char **argv, *region, *target, *fname, *output_fname, *keep_tags; -+ bcf_hdr_t *hdr_in, *hdr_out; -+ bcf_srs_t *sr; -+} -+args_t; -+ -+const char *about(void) -+{ -+ return "Compress gVCF file by resizing gVCF blocks according to specified criteria.\n"; -+} -+ -+static const char *usage_text(void) -+{ -+ return -+ "\n" -+ "About: Compress gVCF file by resizing gVCF blocks according to specified criteria.\n" -+ "\n" -+ "Usage: bcftools +gvcfz [Options]\n" -+ "Plugin options:\n" -+ " -a, --trim-alt-alleles trim alternate alleles not seen in the genotypes\n" -+ " -e, --exclude exclude sites for which the expression is true\n" -+ " -i, --include include sites for which the expression is true\n" -+ " -g, --group-by EXPR group gVCF blocks according to the expression\n" -+ " -o, --output FILE write gVCF output to the FILE\n" -+ " -O, --output-type b|u|z|v b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n" -+ "Examples:\n" -+ " # Compress blocks by GQ and DP. 
Multiple blocks separated by a semicolon can be defined\n" -+ " bcftools +gvcfz input.bcf -g'PASS:GQ>60 & DP<20; PASS:GQ>40 & DP<15; Flt1:QG>20; Flt2:-'\n" -+ "\n" -+ " # Compress all non-reference sites into a single block, remove unused alternate alleles\n" -+ " bcftools +gvcfz input.bcf -a -g'PASS:GT!=\"alt\"'\n" -+ "\n"; -+} -+ -+static void init_groups(args_t *args) -+{ -+ args->hdr_out = bcf_hdr_dup(args->hdr_in); -+ bcf_hdr_printf(args->hdr_out, "##INFO="); -+ -+ // avoid nested double quotes in FILTER description -+ char *hdr_str = strdup(args->group_by); -+ char *tmp = hdr_str; -+ while (*tmp) -+ { -+ if ( *tmp=='"' ) *tmp = '\''; -+ tmp++; -+ } -+ -+ char *rmme_str = strdup(args->group_by), *beg = rmme_str; -+ while ( *beg ) -+ { -+ while ( *beg && isspace(*beg) ) beg++; -+ if ( !beg ) break; -+ char *end = beg; -+ while ( *end && *end!=':' ) end++; -+ if ( *end!=':' ) error("Could not parse the expression: \"%s\"\n", args->group_by); -+ *end = 0; -+ char *flt = beg; -+ beg = ++end; -+ while ( *end && *end!=';' ) end++; -+ char tmp = *end; *end = 0; -+ if ( strcmp(flt,"PASS") ) -+ { -+ bcf_hdr_printf(args->hdr_out, "##FILTER=", flt, hdr_str); -+ if (bcf_hdr_sync(args->hdr_out) < 0) -+ error_errno("[%s] Failed to update header", __func__); -+ } -+ args->ngrp++; -+ args->grp = (grp_t*) realloc(args->grp,sizeof(grp_t)*args->ngrp); -+ grp_t *grp = args->grp + args->ngrp - 1; -+ grp->expr = strdup(beg); -+ grp->flt_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, flt); -+ if ( !bcf_hdr_idinfo_exists(args->hdr_out, BCF_HL_FLT, grp->flt_id) ) error("Could not initialize the filter \"%s\"\n", flt); -+ if ( !strcmp(flt,"PASS") ) grp->flt_id = -1; -+ -+ // remove trailing spaces -+ beg = grp->expr + strlen(grp->expr); while ( beg >= grp->expr && isspace(*beg) ) { *beg = 0; beg--; } -+ beg = grp->expr; while ( *beg && isspace(*beg) ) beg++; -+ -+ grp->flt = strcmp("-",beg) ? 
filter_init(args->hdr_in, grp->expr) : NULL; -+ -+ if ( !tmp ) break; -+ beg = end + 1; -+ } -+ free(rmme_str); -+ free(hdr_str); -+} -+ -+static void destroy_data(args_t *args) -+{ -+ int i; -+ for (i=0; ingrp; i++) -+ { -+ if ( args->grp[i].flt ) filter_destroy(args->grp[i].flt); -+ free(args->grp[i].expr); -+ } -+ free(args->grp); -+ -+ if ( args->filter ) filter_destroy(args->filter); -+ if ( hts_close(args->fh_out)!=0 ) error("failed to close %s\n", args->output_fname); -+ -+ bcf_sr_destroy(args->sr); -+ if ( args->hdr_out ) bcf_hdr_destroy(args->hdr_out); -+ if ( args->gvcf.rec ) bcf_destroy(args->gvcf.rec); -+ free(args->tmpi); -+ free(args); -+} -+ -+static void flush_block(args_t *args, bcf1_t *rec) -+{ -+ block_t *gvcf = &args->gvcf; -+ if ( gvcf->grp < 0 ) return; -+ if ( rec && gvcf->end - 1 >= rec->pos ) gvcf->end = rec->pos; // NB: end is 1-based, rec->pos is 0-based -+ -+ if ( gvcf->rec->pos+1 < gvcf->end && bcf_update_info_int32(args->hdr_out,gvcf->rec,"END",&gvcf->end,1) != 0 ) -+ error("Could not update INFO/END at %s:%"PRId64"\n", bcf_seqname(args->hdr_out,gvcf->rec),(int64_t) gvcf->rec->pos+1); -+ if ( bcf_update_format_int32(args->hdr_out,gvcf->rec,"DP",&gvcf->min_dp,1) != 0 ) -+ error("Could not update FORMAT/DP at %s:%"PRId64"\n", bcf_seqname(args->hdr_out,gvcf->rec),(int64_t) gvcf->rec->pos+1); -+ if ( gvcf->gq_key ) -+ { -+ if ( bcf_update_format_int32(args->hdr_out,gvcf->rec,gvcf->gq_key,&gvcf->gq,1) != 0 ) -+ error("Could not update FORMAT/%s at %s:%"PRId64"\n", gvcf->gq_key, bcf_seqname(args->hdr_out,gvcf->rec),(int64_t) gvcf->rec->pos+1); -+ } -+ if ( gvcf->pl[0] >=0 ) -+ { -+ if ( bcf_update_format_int32(args->hdr_out,gvcf->rec,"PL",&gvcf->pl,3) != 0 ) -+ error("Could not update FORMAT/PL at %s:%"PRId64"\n", bcf_seqname(args->hdr_out,gvcf->rec),(int64_t) gvcf->rec->pos+1); -+ } -+ if ( gvcf->grp < args->ngrp && args->grp[gvcf->grp].flt_id >= 0 ) -+ bcf_add_filter(args->hdr_out, gvcf->rec, args->grp[gvcf->grp].flt_id); -+ -+ if ( 
bcf_write(args->fh_out, args->hdr_out, gvcf->rec)!=0 ) error("Failed to write the header\n"); -+ -+ gvcf->grp = -1; -+} -+static void process_gvcf(args_t *args) -+{ -+ bcf1_t *rec = bcf_sr_get_line(args->sr,0); -+ -+ if ( args->filter ) -+ { -+ int pass = filter_test(args->filter, rec, NULL); -+ if ( args->filter_logic & FLT_EXCLUDE ) pass = pass ? 0 : 1; -+ if ( !pass ) return; -+ } -+ -+ if ( rec->n_allele > 2 || (rec->n_allele == 2 && strcmp("",rec->d.allele[1]) && strcmp("<*>",rec->d.allele[1])) ) -+ { -+ if ( args->trim_alts ) -+ { -+ bcf_unpack(rec, BCF_UN_ALL); -+ if ( bcf_trim_alleles(args->hdr_in, rec)<0 ) -+ error("Error: Could not trim alleles at %s:%"PRId64"\n", bcf_seqname(args->hdr_in, rec),(int64_t) rec->pos+1); -+ -+ // trim the ref allele if necessary -+ if ( rec->d.allele[0][1] ) -+ { -+ rec->d.allele[0][1] = 0; -+ bcf_update_alleles(args->hdr_in, rec, (const char**)rec->d.allele, 1); -+ } -+ -+ } -+ if ( rec->n_allele > 2 || (rec->n_allele == 2 && strcmp("",rec->d.allele[1]) && strcmp("<*>",rec->d.allele[1])) ) -+ { -+ // not a gvcf block -+ flush_block(args, rec); -+ if ( bcf_write(args->fh_out, args->hdr_out, rec)!=0 ) error("Failed to write\n"); -+ return; -+ } -+ } -+ -+ int ret = bcf_get_info_int32(args->hdr_in,rec,"END",&args->tmpi,&args->mtmpi); -+ int32_t end = ret==1 ? args->tmpi[0] : rec->pos + 1; -+ -+ char *gq_key = GQ_KEY_GQ; -+ ret = bcf_get_format_int32(args->hdr_in,rec,gq_key,&args->tmpi,&args->mtmpi); -+ if ( ret!=1 ) -+ { -+ gq_key = GQ_KEY_RGQ; -+ if ( ret<1 ) ret = bcf_get_format_int32(args->hdr_in,rec,gq_key,&args->tmpi,&args->mtmpi); -+ if ( ret!=1 ) gq_key = GQ_KEY_NONE; -+ } -+ int32_t gq = ret==1 ? 
args->tmpi[0] : 0; -+ -+ int32_t min_dp = 0; -+ if ( bcf_get_format_int32(args->hdr_in,rec,"MIN_DP",&args->tmpi,&args->mtmpi)==1 ) -+ min_dp = args->tmpi[0]; -+ else if ( bcf_get_format_int32(args->hdr_in,rec,"DP",&args->tmpi,&args->mtmpi)==1 ) -+ min_dp = args->tmpi[0]; -+ else -+ error("Expected one FORMAT/MIN_DP or FORMAT/DP value at %s:%"PRId64"\n", bcf_seqname(args->hdr_in,rec),(int64_t) rec->pos+1); -+ -+ int32_t pl[3] = {-1,-1,-1}; -+ ret = bcf_get_format_int32(args->hdr_in,rec,"PL",&args->tmpi,&args->mtmpi); -+ if ( ret>3 ) error("Expected three FORMAT/PL values at %s:%"PRId64"\n", bcf_seqname(args->hdr_in,rec),(int64_t) rec->pos+1); -+ else if ( ret==3 ) -+ { -+ pl[0] = args->tmpi[0]; -+ pl[1] = args->tmpi[1]; -+ pl[2] = args->tmpi[2]; -+ } -+ -+ int i; -+ for (i=0; ingrp; i++) -+ if ( !args->grp[i].flt || filter_test(args->grp[i].flt, rec, NULL)==1 ) break; -+ -+ if ( args->gvcf.grp != i ) flush_block(args, rec); // new block -+ if ( args->gvcf.grp >= 0 && args->gvcf.rec->rid != rec->rid ) flush_block(args, NULL); // new chromosome -+ -+ if ( args->gvcf.grp >= 0 ) // extend an existing block -+ { -+ if ( args->gvcf.end < end ) args->gvcf.end = end; -+ if ( args->gvcf.gq_key!=GQ_KEY_NONE && gq_key!=GQ_KEY_NONE && args->gvcf.gq > gq ) args->gvcf.gq = gq; -+ if ( args->gvcf.min_dp > min_dp ) args->gvcf.min_dp = min_dp; -+ if ( args->gvcf.pl[0] > pl[0] ) args->gvcf.pl[0] = pl[0]; -+ if ( args->gvcf.pl[1] > pl[1] ) args->gvcf.pl[1] = pl[1]; -+ if ( args->gvcf.pl[2] > pl[2] ) args->gvcf.pl[2] = pl[2]; -+ return; -+ } -+ -+ // start a new block -+ args->gvcf.rec = bcf_copy(args->gvcf.rec, rec); -+ args->gvcf.grp = i; -+ args->gvcf.min_dp = min_dp; -+ args->gvcf.end = end; -+ args->gvcf.pl[0] = pl[0]; -+ args->gvcf.pl[1] = pl[1]; -+ args->gvcf.pl[2] = pl[2]; -+ args->gvcf.gq_key = gq_key; -+ if ( gq_key!=GQ_KEY_NONE ) args->gvcf.gq = gq; -+} -+ -+int run(int argc, char **argv) -+{ -+ args_t *args = (args_t*) calloc(1,sizeof(args_t)); -+ args->argc = argc; 
args->argv = argv; -+ args->output_type = FT_VCF; -+ args->output_fname = "-"; -+ static struct option loptions[] = -+ { -+ {"trim-alt-alleles",required_argument,0,'a'}, -+ {"include",required_argument,0,'i'}, -+ {"exclude",required_argument,0,'e'}, -+ {"group-by",required_argument,NULL,'g'}, -+ {"stats",required_argument,NULL,'s'}, -+ {"output",required_argument,NULL,'o'}, -+ {"output-type",required_argument,NULL,'O'}, -+ {NULL,0,NULL,0} -+ }; -+ int c; -+ while ((c = getopt_long(argc, argv, "vr:R:t:T:o:O:g:i:e:a",loptions,NULL)) >= 0) -+ { -+ switch (c) -+ { -+ case 'a': args->trim_alts = 1; break; -+ case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; -+ case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; -+ case 'g': args->group_by = optarg; break; -+ case 'o': args->output_fname = optarg; break; -+ case 'O': -+ switch (optarg[0]) { -+ case 'b': args->output_type = FT_BCF_GZ; break; -+ case 'u': args->output_type = FT_BCF; break; -+ case 'z': args->output_type = FT_VCF_GZ; break; -+ case 'v': args->output_type = FT_VCF; break; -+ default: error("The output type \"%s\" not recognised\n", optarg); -+ } -+ break; -+ case 'h': -+ case '?': -+ default: error("%s", usage_text()); break; -+ } -+ } -+ if ( optind==argc ) -+ { -+ if ( !isatty(fileno((FILE *)stdin)) ) args->fname = "-"; // reading from stdin -+ else { error("%s", usage_text()); } -+ } -+ else if ( optind+1!=argc ) error("%s", usage_text()); -+ else args->fname = argv[optind]; -+ -+ if ( !args->group_by ) error("Missing the -g option\n"); -+ -+ args->gvcf.rec = bcf_init(); -+ args->gvcf.grp = -1; // the block is inactive -+ args->sr = bcf_sr_init(); -+ if ( !bcf_sr_add_reader(args->sr,args->fname) ) error("Error: %s\n", bcf_sr_strerror(args->sr->errnum)); -+ args->hdr_in = bcf_sr_get_header(args->sr,0); -+ if ( args->filter_str ) -+ args->filter = filter_init(args->hdr_in, args->filter_str); -+ init_groups(args); -+ args->fh_out = 
hts_open(args->output_fname,hts_bcf_wmode(args->output_type)); -+ if ( bcf_hdr_write(args->fh_out, args->hdr_out)!=0 ) error("Failed to write the header\n"); -+ while ( bcf_sr_next_line(args->sr) ) process_gvcf(args); -+ flush_block(args, NULL); -+ -+ destroy_data(args); -+ return 0; -+} -+ -+ ---- /dev/null -+++ python-pysam/bcftools/plugins/indel-stats.c -@@ -0,0 +1,753 @@ -+/* The MIT License -+ -+ Copyright (c) 2018 Genome Research Ltd. -+ -+ Author: Petr Danecek -+ -+ Permission is hereby granted, free of charge, to any person obtaining a copy -+ of this software and associated documentation files (the "Software"), to deal -+ in the Software without restriction, including without limitation the rights -+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -+ copies of the Software, and to permit persons to whom the Software is -+ furnished to do so, subject to the following conditions: -+ -+ The above copyright notice and this permission notice shall be included in -+ all copies or substantial portions of the Software. -+ -+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -+ THE SOFTWARE. -+ -+ */ -+ -+#include -+#include -+#include -+#include // for isatty -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include "bcftools.h" -+#include "filter.h" -+ -+ -+// Logic of the filters: include or exclude sites which match the filters? 
-+#define FLT_INCLUDE 1 -+#define FLT_EXCLUDE 2 -+ -+static int NVAF = 20; -+static int MAX_LEN = 20; -+ -+static inline int len2bin(int len) -+{ -+ if ( len < -MAX_LEN ) return 0; -+ if ( len > MAX_LEN ) return 2*MAX_LEN; -+ return MAX_LEN + len; -+} -+HTS_UNUSED static inline int bin2len(int bin) -+{ -+ return bin - MAX_LEN; -+} -+static inline int vaf2bin(float vaf) -+{ -+ return vaf*(NVAF-1); -+} -+HTS_UNUSED static inline float bin2vaf(int bin) -+{ -+ return (float)bin/(NVAF-1); -+} -+ -+typedef struct -+{ -+ uint32_t -+ *nvaf, // number of indels genotypes with low VAF (<=0.2) and high VAF (>0.2); use vaf2bin and bin2vaf -+ *nlen, // length distribution (-MAX_LEN,MAX_LEN); use len2bin and bin2len; site-wise unless samples are present -+ npass_gt, // number of indel genotypes passing the filter -+ npass, // number of sites passing the filter -+ nsites, // number of sites total -+ nins, ndel, // number of insertions and deletions, site-wise, not genotype-wise -+ nframeshift, ninframe, // site-wise -+ *nfrac; // number of het indels contributing to dfrac -+ double -+ *dfrac; // minor allele fraction at HET indel genotypes, determined from FORMAT/AD -+} -+stats_t; -+ -+typedef struct -+{ -+ stats_t stats; -+ filter_t *filter; -+ char *expr; -+} -+flt_stats_t; -+ -+#define iCHILD 0 -+#define iFATHER 1 -+#define iMOTHER 2 -+ -+typedef struct -+{ -+ int idx[3]; // VCF sample index for father, mother and child -+ int pass; // do all three pass the filters? 
-+} -+trio_t; -+ -+typedef struct -+{ -+ int argc, filter_logic, regions_is_file, targets_is_file; -+ int nflt_str; -+ char *filter_str, **flt_str; -+ char **argv, *output_fname, *fname, *regions, *targets, *csq_tag, *ped_fname; -+ trio_t *trio; -+ int ntrio, mtrio; -+ bcf_srs_t *sr; -+ bcf_hdr_t *hdr; -+ flt_stats_t *filters; -+ int nfilters, nsmpl; -+ char *csq_str; -+ int32_t *gt_arr, *ad_arr, *ac; -+ int mgt_arr, mad_arr, mac, mcsq_str; -+ int ngt, ngt1, nad, nad1; -+ int allow_alt2ref_DNMs; // is "0/0 0/1 1/1" (child,father,mother) a valid DNM? -+} -+args_t; -+ -+args_t args; -+ -+const char *about(void) -+{ -+ return "Calculate indel stats scanning over a range of thresholds simultaneously.\n"; -+} -+ -+static const char *usage_text(void) -+{ -+ return -+ "\n" -+ "About: Calculates indel stats. Use curly brackets to scan a range of values simultaneously\n" -+ "Usage: bcftools +indel-stats [Plugin Options]\n" -+ "Plugin options:\n" -+ " --alt2ref-DNM consider GT errors such as 0/1 + 1/1 -> 0/0 a valid DNM\n" -+ " -c, --csq-tag STR VEP or BCSQ tag to determine inframe and frameshift variants [CSQ]\n" -+ " -e, --exclude EXPR exclude sites and samples for which the expression is true\n" -+ " -i, --include EXPR include sites and samples for which the expression is true\n" -+ " --max-len INT maximum indel length to consider [20]\n" -+ " --nvaf INT number of variant allele frequency bins [20]\n" -+ " -o, --output FILE output file name [stdout]\n" -+ " -p, --ped FILE limit the stats to de novo indels\n" -+ " -r, --regions REG restrict to comma-separated list of regions\n" -+ " -R, --regions-file FILE restrict to regions listed in a file\n" -+ " -t, --targets REG similar to -r but streams rather than index-jumps\n" -+ " -T, --targets-file FILE similar to -R but streams rather than index-jumps\n" -+ "\n" -+ "Example:\n" -+ " bcftools +indel-stats -i 'GQ>{10,20,30,40,50}' file.bcf\n" -+ "\n"; -+} -+ -+static void parse_filters(args_t *args) -+{ -+ if ( !args->filter_str 
) return; -+ int mflt = 1; -+ args->nflt_str = 1; -+ args->flt_str = (char**) malloc(sizeof(char*)); -+ args->flt_str[0] = strdup(args->filter_str); -+ while (1) -+ { -+ int i, expanded = 0; -+ for (i=args->nflt_str-1; i>=0; i--) -+ { -+ char *exp_beg = strchr(args->flt_str[i], '{'); -+ if ( !exp_beg ) continue; -+ char *exp_end = strchr(exp_beg+1, '}'); -+ if ( !exp_end ) error("Could not parse the expression: %s\n", args->filter_str); -+ char *beg = exp_beg+1, *mid = beg; -+ while ( midflt_str[i], exp_beg - args->flt_str[i], &tmp); -+ kputsn(beg, mid - beg, &tmp); -+ kputs(exp_end+1, &tmp); -+ args->nflt_str++; -+ hts_expand(char*, args->nflt_str, mflt, args->flt_str); -+ args->flt_str[args->nflt_str-1] = tmp.s; -+ beg = ++mid; -+ } -+ expanded = 1; -+ free(args->flt_str[i]); -+ memmove(&args->flt_str[i], &args->flt_str[i+1], (args->nflt_str-i-1)*sizeof(*args->flt_str)); -+ args->nflt_str--; -+ args->flt_str[args->nflt_str] = NULL; -+ } -+ if ( !expanded ) break; -+ } -+ -+ fprintf(stderr,"Collecting data for %d filtering expressions\n", args->nflt_str); -+} -+ -+static int cmp_trios(const void *_a, const void *_b) -+{ -+ trio_t *a = (trio_t *) _a; -+ trio_t *b = (trio_t *) _b; -+ int i; -+ int amin = a->idx[0]; -+ for (i=1; i<3; i++) -+ if ( amin > a->idx[i] ) amin = a->idx[i]; -+ int bmin = b->idx[0]; -+ for (i=1; i<3; i++) -+ if ( bmin > b->idx[i] ) bmin = b->idx[i]; -+ if ( amin < bmin ) return -1; -+ if ( amin > bmin ) return 1; -+ return 0; -+} -+static void parse_ped(args_t *args, char *fname) -+{ -+ htsFile *fp = hts_open(fname, "r"); -+ if ( !fp ) error("Could not read: %s\n", fname); -+ -+ kstring_t str = {0,0,0}; -+ if ( hts_getline(fp, KS_SEP_LINE, &str) <= 0 ) error("Empty file: %s\n", fname); -+ -+ int moff = 0, *off = NULL; -+ do -+ { -+ // familyID sampleID paternalID maternalID sex phenotype population relationship siblings secondOrder thirdOrder children comment -+ // BB03 HG01884 HG01885 HG01956 2 0 ACB child 0 0 0 0 -+ int ncols = 
ksplit_core(str.s,0,&moff,&off); -+ if ( ncols<4 ) error("Could not parse the ped file: %s\n", str.s); -+ -+ int father = bcf_hdr_id2int(args->hdr,BCF_DT_SAMPLE,&str.s[off[2]]); -+ if ( father<0 ) continue; -+ int mother = bcf_hdr_id2int(args->hdr,BCF_DT_SAMPLE,&str.s[off[3]]); -+ if ( mother<0 ) continue; -+ int child = bcf_hdr_id2int(args->hdr,BCF_DT_SAMPLE,&str.s[off[1]]); -+ if ( child<0 ) continue; -+ -+ args->ntrio++; -+ hts_expand0(trio_t,args->ntrio,args->mtrio,args->trio); -+ trio_t *trio = &args->trio[args->ntrio-1]; -+ trio->idx[iFATHER] = father; -+ trio->idx[iMOTHER] = mother; -+ trio->idx[iCHILD] = child; -+ } -+ while ( hts_getline(fp, KS_SEP_LINE, &str)>=0 ); -+ -+ fprintf(stderr,"Identified %d complete trios in the VCF file\n", args->ntrio); -+ if ( !args->ntrio ) error("No complete trio identified\n"); -+ -+ // sort the sample by index so that they are accessed more or less sequentially -+ qsort(args->trio,args->ntrio,sizeof(trio_t),cmp_trios); -+ -+ free(str.s); -+ free(off); -+ if ( hts_close(fp)!=0 ) error("[%s] Error: close failed .. 
%s\n", __func__,fname); -+} -+ -+static void init_data(args_t *args) -+{ -+ args->sr = bcf_sr_init(); -+ if ( args->regions ) -+ { -+ args->sr->require_index = 1; -+ if ( bcf_sr_set_regions(args->sr, args->regions, args->regions_is_file)<0 ) error("Failed to read the regions: %s\n",args->regions); -+ } -+ if ( args->targets && bcf_sr_set_targets(args->sr, args->targets, args->targets_is_file, 0)<0 ) error("Failed to read the targets: %s\n",args->targets); -+ if ( !bcf_sr_add_reader(args->sr,args->fname) ) error("Error: %s\n", bcf_sr_strerror(args->sr->errnum)); -+ args->hdr = bcf_sr_get_header(args->sr,0); -+ -+ if ( args->ped_fname ) -+ parse_ped(args, args->ped_fname); -+ -+ parse_filters(args); -+ -+ int i; -+ if ( !args->nflt_str ) -+ { -+ args->filters = (flt_stats_t*) calloc(1, sizeof(flt_stats_t)); -+ args->nfilters = 1; -+ args->filters[0].expr = strdup("all"); -+ args->filters[0].stats.nvaf = (uint32_t*) calloc(NVAF, sizeof(uint32_t)); -+ args->filters[0].stats.nlen = (uint32_t*) calloc(MAX_LEN*2+1, sizeof(uint32_t)); -+ args->filters[0].stats.nfrac = (uint32_t*) calloc(MAX_LEN*2+1, sizeof(uint32_t)); -+ args->filters[0].stats.dfrac = (double*) calloc(MAX_LEN*2+1, sizeof(double)); -+ } -+ else -+ { -+ args->nfilters = args->nflt_str; -+ args->filters = (flt_stats_t*) calloc(args->nfilters, sizeof(flt_stats_t)); -+ for (i=0; infilters; i++) -+ { -+ args->filters[i].filter = filter_init(args->hdr, args->flt_str[i]); -+ args->filters[i].expr = strdup(args->flt_str[i]); -+ args->filters[i].stats.nvaf = (uint32_t*) calloc(NVAF, sizeof(uint32_t)); -+ args->filters[i].stats.nlen = (uint32_t*) calloc(MAX_LEN*2+1, sizeof(uint32_t)); -+ args->filters[i].stats.nfrac = (uint32_t*) calloc(MAX_LEN*2+1, sizeof(uint32_t)); -+ args->filters[i].stats.dfrac = (double*) calloc(MAX_LEN*2+1, sizeof(double)); -+ -+ // replace tab's with spaces so that the output stays parsable -+ char *tmp = args->filters[i].expr; -+ while ( *tmp ) -+ { -+ if ( *tmp=='\t' ) *tmp = ' '; -+ tmp++; 
-+ } -+ } -+ } -+ args->nsmpl = bcf_hdr_nsamples(args->hdr); -+} -+static void destroy_data(args_t *args) -+{ -+ int i; -+ for (i=0; infilters; i++) -+ { -+ if ( args->filters[i].filter ) filter_destroy(args->filters[i].filter); -+ free(args->filters[i].stats.nvaf); -+ free(args->filters[i].stats.nlen); -+ free(args->filters[i].stats.nfrac); -+ free(args->filters[i].stats.dfrac); -+ free(args->filters[i].expr); -+ } -+ free(args->filters); -+ for (i=0; inflt_str; i++) free(args->flt_str[i]); -+ free(args->flt_str); -+ bcf_sr_destroy(args->sr); -+ free(args->ac); -+ free(args->trio); -+ free(args->csq_str); -+ free(args->gt_arr); -+ free(args->ad_arr); -+ free(args); -+} -+static void report_stats(args_t *args) -+{ -+ int i = 0,j; -+ FILE *fh = !args->output_fname || !strcmp("-",args->output_fname) ? stdout : fopen(args->output_fname,"w"); -+ if ( !fh ) error("Could not open the file for writing: %s\n", args->output_fname); -+ fprintf(fh,"# CMD line shows the command line used to generate this output\n"); -+ fprintf(fh,"# DEF lines define expressions for all tested thresholds\n"); -+ fprintf(fh,"# SN* summary number for every threshold:\n"); -+ fprintf(fh,"# %d) SN*, filter id\n", ++i); -+ fprintf(fh,"# %d) number of samples (or trios with -p)\n", ++i); -+ fprintf(fh,"# %d) number of indel sites total\n", ++i); -+ fprintf(fh,"# %d) number of indel sites that pass the filter (and, with -p, have a de novo indel)\n", ++i); -+ fprintf(fh,"# %d) number of indel genotypes that pass the filter (and, with -p, are de novo)\n", ++i); -+ fprintf(fh,"# %d) number of insertions (site-wise, not genotype-wise)\n", ++i); -+ fprintf(fh,"# %d) number of deletions (site-wise, not genotype-wise)\n", ++i); -+ fprintf(fh,"# %d) number of frameshifts (site-wise, not genotype-wise)\n", ++i); -+ fprintf(fh,"# %d) number of inframe indels (site-wise, not genotype-wise)\n", ++i); -+ fprintf(fh,"#\n"); -+ i = 0; -+ fprintf(fh,"# DVAF* lines report indel variant allele frequency (VAF) 
distribution for every threshold,\n"); -+ fprintf(fh,"# k-th bin corresponds to the frequency k/(nVAF-1):\n"); -+ fprintf(fh,"# %d) DVAF*, filter id\n", ++i); -+ fprintf(fh,"# %d) nVAF, number of bins which split the [0,1] VAF interval.\n", ++i); -+ fprintf(fh,"# %d-%d) counts of indel genotypes in the VAF bin. For non-reference hets, the VAF of the less supported allele is recorded\n", i+1, i+NVAF); -+ fprintf(fh,"#\n"); -+ i = 0; -+ fprintf(fh,"# DLEN* lines report indel length distribution for every threshold. When genotype fields are available,\n"); -+ fprintf(fh,"# the counts correspond to the number of genotypes, otherwise the number of sites are given.\n"); -+ fprintf(fh,"# The k-th bin corresponds to the indel size k-MAX_LEN, negative for deletions, positive for insertions.\n"); -+ fprintf(fh,"# The firt/last bin contains also all deletions/insertions larger than MAX_LEN:\n"); -+ fprintf(fh,"# %d) DLEN*, filter id\n", ++i); -+ fprintf(fh,"# %d) maximum indel length\n", ++i); -+ fprintf(fh,"# %d-%d) counts of indel lengths (-max,..,0,..,max), all unique alleles in a genotype are recorded (alt hets increase the counters 2x, alt homs 1x)\n", i+1, i+MAX_LEN*2+1); -+ fprintf(fh,"#\n"); -+ i = 0; -+ fprintf(fh,"# DFRAC* lines report the mean minor allele fraction at HET indel genotypes as a function of indel size.\n"); -+ fprintf(fh,"# The format is the same as for DLEN:\n"); -+ fprintf(fh,"# %d) DFRAC*, filter id\n", ++i); -+ fprintf(fh,"# %d) maximum indel length\n", ++i); -+ fprintf(fh,"# %d-%d) mean fraction at indel lengths (-max,..,0,..,max)\n", i+1, i+MAX_LEN*2+1); -+ fprintf(fh,"#\n"); -+ i = 0; -+ fprintf(fh,"# NFRAC* lines report the number of indels informing the DFRAC distribution.\n"); -+ fprintf(fh,"# %d) NFRAC*, filter id\n", ++i); -+ fprintf(fh,"# %d) maximum indel length\n", ++i); -+ fprintf(fh,"# %d-%d) counts at indel lengths (-max,..,0,..,max)\n", i+1, i+MAX_LEN*2+1); -+ fprintf(fh,"#\n"); -+ fprintf(fh, "CMD\t%s", args->argv[0]); -+ for (i=1; 
iargc; i++) fprintf(fh, " %s",args->argv[i]); -+ fprintf(fh, "\n"); -+ for (i=0; infilters; i++) -+ { -+ flt_stats_t *flt = &args->filters[i]; -+ fprintf(fh,"DEF\tFLT%d\t%s\n", i, flt->expr); -+ } -+ for (i=0; infilters; i++) -+ { -+ stats_t *stats = &args->filters[i].stats; -+ -+ fprintf(fh,"SN%d", i); -+ fprintf(fh,"\t%u", args->ntrio ? args->ntrio : args->nsmpl); -+ fprintf(fh,"\t%u", stats->nsites); -+ fprintf(fh,"\t%u", stats->npass); -+ fprintf(fh,"\t%u", stats->npass_gt); -+ fprintf(fh,"\t%u", stats->nins); -+ fprintf(fh,"\t%u", stats->ndel); -+ fprintf(fh,"\t%u", stats->nframeshift); -+ fprintf(fh,"\t%u", stats->ninframe); -+ fprintf(fh,"\n"); -+ -+ fprintf(fh,"DVAF%d", i); -+ fprintf(fh,"\t%d", NVAF); -+ for (j=0; jnvaf[j]); -+ fprintf(fh,"\n"); -+ -+ fprintf(fh,"DLEN%d", i); -+ fprintf(fh,"\t%d", MAX_LEN); -+ for (j=0; jnlen[j]); -+ fprintf(fh,"\n"); -+ -+ fprintf(fh,"DFRAC%d", i); -+ fprintf(fh,"\t%d", MAX_LEN); -+ for (j=0; jnfrac[j] ) fprintf(fh,"\t%.2f",stats->dfrac[j]/stats->nfrac[j]); -+ else fprintf(fh,"\t."); -+ fprintf(fh,"\n"); -+ -+ fprintf(fh,"NFRAC%d", i); -+ fprintf(fh,"\t%d", MAX_LEN); -+ for (j=0; jnfrac[j]); -+ fprintf(fh,"\n"); -+ } -+ if ( fclose(fh)!=0 ) error("Close failed: %s\n", (!args->output_fname || !strcmp("-",args->output_fname)) ? "stdout" : args->output_fname); -+} -+ -+static inline int parse_genotype(int32_t *arr, int ngt1, int idx, int als[2]) -+{ -+ int32_t *ptr = arr + ngt1 * idx; -+ if ( bcf_gt_is_missing(ptr[0]) ) return -1; -+ als[0] = bcf_gt_allele(ptr[0]); -+ -+ if ( ngt1==1 || ptr[1]==bcf_int32_vector_end ) { ptr[1] = ptr[0]; return -2; } -+ -+ if ( bcf_gt_is_missing(ptr[1]) ) return -1; -+ als[1] = bcf_gt_allele(ptr[1]); -+ -+ return 0; -+} -+ -+static inline void update_indel_stats(args_t *args, bcf1_t *rec, stats_t *stats, int ismpl, int *als) -+{ -+ int j; -+ if ( als[0] >= args->nad1 || als[1] >= args->nad1 ) error("Incorrect GT allele at %s:%"PRId64" .. 
%d/%d\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1,als[0],als[1]); -+ int32_t *ad_ptr = args->ad_arr + ismpl*args->nad1; -+ -+ // find the allele with most support -+ uint32_t ntot = 0; -+ for (j=0; jnad1; j++) -+ { -+ if ( ad_ptr[j]==bcf_int32_missing ) continue; -+ if ( ad_ptr[j]==bcf_int32_vector_end ) break; -+ ntot += ad_ptr[j]; -+ } -+ if ( !ntot ) return; -+ -+ // Find the alternate allele fraction, total and relative. Set al0 to be the more frequent indel allele. -+ // The genotypes have been already sanitized in parse_genotype(). -+ int al0 = als[0], al1 = als[1]; -+ if ( !(bcf_get_variant_type(rec,al0) & VCF_INDEL) ) -+ { -+ if ( !(bcf_get_variant_type(rec,al1) & VCF_INDEL) ) error("FIXME: this should not happen .. %s:%"PRId64" .. %d/%d\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1,al0,al1); -+ al0 = als[1]; al1 = als[0]; -+ } -+ else if ( (bcf_get_variant_type(rec,al1) & VCF_INDEL) && al0!=al1 ) -+ { -+ // Select the more frequent indel allele. -+ if ( ad_ptr[al0] < ad_ptr[al1] ) al0 = als[1], al1 = als[0]; -+ -+ // Record length of both indel alleles -+ int bin = len2bin(rec->d.var[al1].n); -+ if ( bin >= 0 ) stats->nlen[bin]++; -+ } -+ -+ float vaf = (float)ad_ptr[al0] / ntot; -+ int bin = vaf2bin(vaf); -+ stats->nvaf[bin]++; -+ -+ // al0 is now the major indel allele -+ int len_bin = len2bin(rec->d.var[al0].n); -+ if ( len_bin < 0 ) return; -+ stats->nlen[len_bin]++; -+ -+ if ( al0!=al1 ) -+ { -+ ntot = ad_ptr[al0] + ad_ptr[al1]; -+ if ( ntot ) -+ { -+ stats->nfrac[len_bin]++; -+ stats->dfrac[len_bin]+= (double)ad_ptr[al0] / ntot; -+ } -+ } -+} -+ -+static void process_record(args_t *args, bcf1_t *rec, flt_stats_t *flt) -+{ -+ int i,j; -+ uint8_t *smpl_pass = NULL; -+ -+ stats_t *stats = &flt->stats; -+ stats->nsites++; -+ -+ // Find out which samples/trios pass and if the site passes -+ if ( flt->filter ) -+ { -+ int pass_site = filter_test(flt->filter, rec, (const uint8_t**) &smpl_pass); -+ if ( args->ntrio ) -+ { -+ if ( 
args->filter_logic & FLT_EXCLUDE ) -+ { -+ if ( pass_site ) -+ { -+ if ( !smpl_pass ) return; -+ pass_site = 0; -+ for (i=0; intrio; i++) -+ { -+ int pass_trio = 1; -+ for (j=0; j<3; j++) -+ { -+ int idx = args->trio[i].idx[j]; -+ if ( smpl_pass[idx] ) { pass_trio = 0; break; } -+ } -+ args->trio[i].pass = pass_trio; -+ if ( pass_trio ) pass_site = 1; -+ } -+ if ( !pass_site ) return; -+ } -+ else -+ for (i=0; intrio; i++) args->trio[i].pass = 1; -+ } -+ else if ( !pass_site ) return; -+ else if ( smpl_pass ) -+ { -+ pass_site = 0; -+ for (i=0; intrio; i++) -+ { -+ int pass_trio = 1; -+ for (j=0; j<3; j++) -+ { -+ int idx = args->trio[i].idx[j]; -+ if ( !smpl_pass[idx] ) { pass_trio = 0; break; } -+ } -+ args->trio[i].pass = pass_trio; -+ if ( pass_trio ) pass_site = 1; -+ } -+ if ( !pass_site ) return; -+ } -+ else -+ for (i=0; intrio; i++) args->trio[i].pass = 1; -+ } -+ else -+ { -+ if ( args->filter_logic & FLT_EXCLUDE ) -+ { -+ if ( pass_site ) -+ { -+ if ( !smpl_pass ) return; -+ pass_site = 0; -+ for (i=0; insmpl; i++) -+ { -+ if ( smpl_pass[i] ) smpl_pass[i] = 0; -+ else { smpl_pass[i] = 1; pass_site = 1; } -+ } -+ if ( !pass_site ) return; -+ } -+ else -+ for (i=0; insmpl; i++) smpl_pass[i] = 1; -+ } -+ else if ( !pass_site ) return; -+ } -+ } -+ -+ args->ngt = 0; -+ if ( args->nsmpl ) -+ { -+ // Get the genotypes -+ args->ngt = bcf_get_genotypes(args->hdr, rec, &args->gt_arr, &args->mgt_arr); -+ args->ngt1 = args->ngt / rec->n_sample; -+ -+ if ( args->ngt>0 ) -+ { -+ // Get the AD counts -+ args->nad = bcf_get_format_int32(args->hdr, rec, "AD", &args->ad_arr, &args->mad_arr); -+ args->nad1 = args->nad / rec->n_sample; -+ if ( args->nad>0 && args->nad1 != rec->n_allele ) error("Incorrect number of FORMAT/AD values at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); -+ } -+ } -+ -+ // Is there a star allele? 
Don't count overlapping deletions twice -+ int star_allele = -1; -+ for (i=1; in_allele; i++) -+ if ( !rec->d.allele[i][1] && rec->d.allele[i][0]=='*' ) { star_allele = i; break; } -+ -+ -+ if ( args->ngt>0 && args->ntrio ) -+ { -+ int is_dnm = 0; -+ for (i=0; intrio; i++) -+ { -+ if ( flt->filter && !args->trio[i].pass ) continue; -+ -+ // Determine the alternate allele and the genotypes, skip if any of the alleles is missing. -+ // the order is: child, father, mother -+ int als[6], *als_child = als, *als_father = als+2, *als_mother = als+4; -+ if ( parse_genotype(args->gt_arr, args->ngt1, args->trio[i].idx[iCHILD], als_child) < 0 ) continue; -+ if ( parse_genotype(args->gt_arr, args->ngt1, args->trio[i].idx[iFATHER], als_father) < 0 ) continue; -+ if ( parse_genotype(args->gt_arr, args->ngt1, args->trio[i].idx[iMOTHER], als_mother) < 0 ) continue; -+ -+ // Is it a DNM? -+ if ( !args->allow_alt2ref_DNMs && als_child[0]==0 && als_child[1]==0 ) continue; -+ if ( (als_child[0]==als_father[0] || als_child[0]==als_father[1]) && (als_child[1]==als_mother[0] || als_child[1]==als_mother[1]) ) continue; -+ if ( (als_child[1]==als_father[0] || als_child[1]==als_father[1]) && (als_child[0]==als_mother[0] || als_child[0]==als_mother[1]) ) continue; -+ if ( als_child[0]==star_allele || als_child[1]==star_allele ) continue; // don't count the same event multiple times -+ if ( als_father[0]==star_allele || als_father[1]==star_allele ) continue; -+ if ( als_mother[0]==star_allele || als_mother[1]==star_allele ) continue; -+ -+ int child_is_indel = (bcf_get_variant_type(rec,als_child[0]) & VCF_INDEL) || (bcf_get_variant_type(rec,als_child[1]) & VCF_INDEL) ? 
1 : 0; -+ -+ if ( !args->allow_alt2ref_DNMs ) -+ { -+ if ( !child_is_indel ) continue; -+ } -+ else -+ { -+ if ( !child_is_indel && -+ !(bcf_get_variant_type(rec,als_father[0]) & VCF_INDEL) && -+ !(bcf_get_variant_type(rec,als_father[1]) & VCF_INDEL) && -+ !(bcf_get_variant_type(rec,als_mother[0]) & VCF_INDEL) && -+ !(bcf_get_variant_type(rec,als_mother[1]) & VCF_INDEL) ) continue; // not an indel, in any sample -+ } -+ -+ if ( child_is_indel ) -+ update_indel_stats(args, rec, stats, args->trio[i].idx[iCHILD], als_child); -+ -+ //printf("MERR\t%s\t%d\t%s\n", bcf_seqname(args->hdr,rec),rec->pos+1,args->hdr->samples[args->trio[i].idx[iCHILD]]); -+ -+ stats->npass_gt++; -+ -+ is_dnm = 1; -+ } -+ if ( !is_dnm ) return; -+ } -+ else if ( args->ngt>0 ) -+ { -+ for (i=0; insmpl; i++) -+ { -+ if ( smpl_pass && !smpl_pass[i] ) continue; -+ -+ // Determine the alternate allele and the genotypes, skip if any of the alleles is missing. -+ int als[2] = {0,0}; -+ int ret = parse_genotype(args->gt_arr, args->ngt1, i, als); -+ if ( ret==-1 ) continue; // missing genotype -+ if ( !(bcf_get_variant_type(rec,als[0]) & VCF_INDEL) && !(bcf_get_variant_type(rec,als[1]) & VCF_INDEL) ) continue; // not an indel -+ -+ update_indel_stats(args, rec, stats, i, als); -+ -+ stats->npass_gt++; -+ } -+ } -+ -+ if ( bcf_get_info_string(args->hdr,rec,args->csq_tag,&args->csq_str,&args->mcsq_str) > 0 ) -+ { -+ if ( strstr(args->csq_str,"inframe") ) stats->ninframe++; -+ if ( strstr(args->csq_str,"frameshift") ) stats->nframeshift++; -+ } -+ -+ for (i=1; in_allele; i++) -+ { -+ if ( !(bcf_get_variant_type(rec,i) & VCF_INDEL) ) continue; -+ if ( rec->d.var[i].n < 0 ) stats->ndel++; -+ else if ( rec->d.var[i].n > 0 ) stats->nins++; -+ if ( args->ngt <= 0 ) -+ { -+ int bin = len2bin(rec->d.var[i].n); -+ if ( bin >= 0 ) stats->nlen[bin]++; -+ } -+ } -+ stats->npass++; -+} -+ -+int run(int argc, char **argv) -+{ -+ args_t *args = (args_t*) calloc(1,sizeof(args_t)); -+ args->argc = argc; args->argv = argv; 
-+ args->output_fname = "-"; -+ args->csq_tag = "CSQ"; -+ static struct option loptions[] = -+ { -+ {"max-len",required_argument,0,1}, -+ {"nvaf",required_argument,0,2}, -+ {"alt2ref-DNM",no_argument,0,3}, -+ {"ped",required_argument,0,'p'}, -+ {"csq-tag",required_argument,0,'c'}, -+ {"include",required_argument,0,'i'}, -+ {"exclude",required_argument,0,'e'}, -+ {"output",required_argument,NULL,'o'}, -+ {"regions",1,0,'r'}, -+ {"regions-file",1,0,'R'}, -+ {"targets",1,0,'t'}, -+ {"targets-file",1,0,'T'}, -+ {NULL,0,NULL,0} -+ }; -+ char *tmp; -+ int c, i; -+ while ((c = getopt_long(argc, argv, "o:s:i:e:r:R:t:T:c:p:",loptions,NULL)) >= 0) -+ { -+ switch (c) -+ { -+ case 1 : -+ MAX_LEN = strtod(optarg,&tmp); -+ if ( *tmp ) error("Could not parse: --max-len %s\n", optarg); -+ if ( MAX_LEN<=0 ) error("Expected value bigger than 0 --max-len\n"); -+ break; -+ case 2 : -+ NVAF = strtod(optarg,&tmp); -+ if ( *tmp ) error("Could not parse: --max-len %s\n", optarg); -+ if ( NVAF<0 || NVAF>1 ) error("Expected value from the interval [0,1] with --nvaf\n"); -+ break; -+ case 3 : args->allow_alt2ref_DNMs = 1; break; -+ case 'p': args->ped_fname = optarg; break; -+ case 'c': args->csq_tag = optarg; break; -+ case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; -+ case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; -+ case 't': args->targets = optarg; break; -+ case 'T': args->targets = optarg; args->targets_is_file = 1; break; -+ case 'r': args->regions = optarg; break; -+ case 'R': args->regions = optarg; args->regions_is_file = 1; break; -+ case 'o': args->output_fname = optarg; break; -+ case 'h': -+ case '?': -+ default: error("%s", usage_text()); break; -+ } -+ } -+ if ( optind==argc ) -+ { -+ if ( !isatty(fileno((FILE *)stdin)) ) args->fname = "-"; // reading from stdin -+ else { error("%s",usage_text()); } -+ } -+ else if ( optind+1!=argc ) error("%s",usage_text()); -+ else args->fname = argv[optind]; -+ -+ 
init_data(args); -+ -+ while ( bcf_sr_next_line(args->sr) ) -+ { -+ bcf1_t *rec = bcf_sr_get_line(args->sr,0); -+ if ( !(bcf_get_variant_types(rec) & VCF_INDEL) ) continue; -+ for (i=0; infilters; i++) -+ process_record(args, rec, &args->filters[i]); -+ } -+ -+ report_stats(args); -+ destroy_data(args); -+ -+ return 0; -+} ---- /dev/null -+++ python-pysam/bcftools/plugins/indel-stats.c.pysam.c -@@ -0,0 +1,755 @@ -+#include "bcftools.pysam.h" -+ -+/* The MIT License -+ -+ Copyright (c) 2018 Genome Research Ltd. -+ -+ Author: Petr Danecek -+ -+ Permission is hereby granted, free of charge, to any person obtaining a copy -+ of this software and associated documentation files (the "Software"), to deal -+ in the Software without restriction, including without limitation the rights -+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -+ copies of the Software, and to permit persons to whom the Software is -+ furnished to do so, subject to the following conditions: -+ -+ The above copyright notice and this permission notice shall be included in -+ all copies or substantial portions of the Software. -+ -+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -+ THE SOFTWARE. -+ -+ */ -+ -+#include -+#include -+#include -+#include // for isatty -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include "bcftools.h" -+#include "filter.h" -+ -+ -+// Logic of the filters: include or exclude sites which match the filters? 
-+#define FLT_INCLUDE 1 -+#define FLT_EXCLUDE 2 -+ -+static int NVAF = 20; -+static int MAX_LEN = 20; -+ -+static inline int len2bin(int len) -+{ -+ if ( len < -MAX_LEN ) return 0; -+ if ( len > MAX_LEN ) return 2*MAX_LEN; -+ return MAX_LEN + len; -+} -+HTS_UNUSED static inline int bin2len(int bin) -+{ -+ return bin - MAX_LEN; -+} -+static inline int vaf2bin(float vaf) -+{ -+ return vaf*(NVAF-1); -+} -+HTS_UNUSED static inline float bin2vaf(int bin) -+{ -+ return (float)bin/(NVAF-1); -+} -+ -+typedef struct -+{ -+ uint32_t -+ *nvaf, // number of indels genotypes with low VAF (<=0.2) and high VAF (>0.2); use vaf2bin and bin2vaf -+ *nlen, // length distribution (-MAX_LEN,MAX_LEN); use len2bin and bin2len; site-wise unless samples are present -+ npass_gt, // number of indel genotypes passing the filter -+ npass, // number of sites passing the filter -+ nsites, // number of sites total -+ nins, ndel, // number of insertions and deletions, site-wise, not genotype-wise -+ nframeshift, ninframe, // site-wise -+ *nfrac; // number of het indels contributing to dfrac -+ double -+ *dfrac; // minor allele fraction at HET indel genotypes, determined from FORMAT/AD -+} -+stats_t; -+ -+typedef struct -+{ -+ stats_t stats; -+ filter_t *filter; -+ char *expr; -+} -+flt_stats_t; -+ -+#define iCHILD 0 -+#define iFATHER 1 -+#define iMOTHER 2 -+ -+typedef struct -+{ -+ int idx[3]; // VCF sample index for father, mother and child -+ int pass; // do all three pass the filters? 
-+} -+trio_t; -+ -+typedef struct -+{ -+ int argc, filter_logic, regions_is_file, targets_is_file; -+ int nflt_str; -+ char *filter_str, **flt_str; -+ char **argv, *output_fname, *fname, *regions, *targets, *csq_tag, *ped_fname; -+ trio_t *trio; -+ int ntrio, mtrio; -+ bcf_srs_t *sr; -+ bcf_hdr_t *hdr; -+ flt_stats_t *filters; -+ int nfilters, nsmpl; -+ char *csq_str; -+ int32_t *gt_arr, *ad_arr, *ac; -+ int mgt_arr, mad_arr, mac, mcsq_str; -+ int ngt, ngt1, nad, nad1; -+ int allow_alt2ref_DNMs; // is "0/0 0/1 1/1" (child,father,mother) a valid DNM? -+} -+args_t; -+ -+args_t args; -+ -+const char *about(void) -+{ -+ return "Calculate indel stats scanning over a range of thresholds simultaneously.\n"; -+} -+ -+static const char *usage_text(void) -+{ -+ return -+ "\n" -+ "About: Calculates indel stats. Use curly brackets to scan a range of values simultaneously\n" -+ "Usage: bcftools +indel-stats [Plugin Options]\n" -+ "Plugin options:\n" -+ " --alt2ref-DNM consider GT errors such as 0/1 + 1/1 -> 0/0 a valid DNM\n" -+ " -c, --csq-tag STR VEP or BCSQ tag to determine inframe and frameshift variants [CSQ]\n" -+ " -e, --exclude EXPR exclude sites and samples for which the expression is true\n" -+ " -i, --include EXPR include sites and samples for which the expression is true\n" -+ " --max-len INT maximum indel length to consider [20]\n" -+ " --nvaf INT number of variant allele frequency bins [20]\n" -+ " -o, --output FILE output file name [bcftools_stdout]\n" -+ " -p, --ped FILE limit the stats to de novo indels\n" -+ " -r, --regions REG restrict to comma-separated list of regions\n" -+ " -R, --regions-file FILE restrict to regions listed in a file\n" -+ " -t, --targets REG similar to -r but streams rather than index-jumps\n" -+ " -T, --targets-file FILE similar to -R but streams rather than index-jumps\n" -+ "\n" -+ "Example:\n" -+ " bcftools +indel-stats -i 'GQ>{10,20,30,40,50}' file.bcf\n" -+ "\n"; -+} -+ -+static void parse_filters(args_t *args) -+{ -+ if ( 
!args->filter_str ) return; -+ int mflt = 1; -+ args->nflt_str = 1; -+ args->flt_str = (char**) malloc(sizeof(char*)); -+ args->flt_str[0] = strdup(args->filter_str); -+ while (1) -+ { -+ int i, expanded = 0; -+ for (i=args->nflt_str-1; i>=0; i--) -+ { -+ char *exp_beg = strchr(args->flt_str[i], '{'); -+ if ( !exp_beg ) continue; -+ char *exp_end = strchr(exp_beg+1, '}'); -+ if ( !exp_end ) error("Could not parse the expression: %s\n", args->filter_str); -+ char *beg = exp_beg+1, *mid = beg; -+ while ( midflt_str[i], exp_beg - args->flt_str[i], &tmp); -+ kputsn(beg, mid - beg, &tmp); -+ kputs(exp_end+1, &tmp); -+ args->nflt_str++; -+ hts_expand(char*, args->nflt_str, mflt, args->flt_str); -+ args->flt_str[args->nflt_str-1] = tmp.s; -+ beg = ++mid; -+ } -+ expanded = 1; -+ free(args->flt_str[i]); -+ memmove(&args->flt_str[i], &args->flt_str[i+1], (args->nflt_str-i-1)*sizeof(*args->flt_str)); -+ args->nflt_str--; -+ args->flt_str[args->nflt_str] = NULL; -+ } -+ if ( !expanded ) break; -+ } -+ -+ fprintf(bcftools_stderr,"Collecting data for %d filtering expressions\n", args->nflt_str); -+} -+ -+static int cmp_trios(const void *_a, const void *_b) -+{ -+ trio_t *a = (trio_t *) _a; -+ trio_t *b = (trio_t *) _b; -+ int i; -+ int amin = a->idx[0]; -+ for (i=1; i<3; i++) -+ if ( amin > a->idx[i] ) amin = a->idx[i]; -+ int bmin = b->idx[0]; -+ for (i=1; i<3; i++) -+ if ( bmin > b->idx[i] ) bmin = b->idx[i]; -+ if ( amin < bmin ) return -1; -+ if ( amin > bmin ) return 1; -+ return 0; -+} -+static void parse_ped(args_t *args, char *fname) -+{ -+ htsFile *fp = hts_open(fname, "r"); -+ if ( !fp ) error("Could not read: %s\n", fname); -+ -+ kstring_t str = {0,0,0}; -+ if ( hts_getline(fp, KS_SEP_LINE, &str) <= 0 ) error("Empty file: %s\n", fname); -+ -+ int moff = 0, *off = NULL; -+ do -+ { -+ // familyID sampleID paternalID maternalID sex phenotype population relationship siblings secondOrder thirdOrder children comment -+ // BB03 HG01884 HG01885 HG01956 2 0 ACB child 0 0 0 0 
-+ int ncols = ksplit_core(str.s,0,&moff,&off); -+ if ( ncols<4 ) error("Could not parse the ped file: %s\n", str.s); -+ -+ int father = bcf_hdr_id2int(args->hdr,BCF_DT_SAMPLE,&str.s[off[2]]); -+ if ( father<0 ) continue; -+ int mother = bcf_hdr_id2int(args->hdr,BCF_DT_SAMPLE,&str.s[off[3]]); -+ if ( mother<0 ) continue; -+ int child = bcf_hdr_id2int(args->hdr,BCF_DT_SAMPLE,&str.s[off[1]]); -+ if ( child<0 ) continue; -+ -+ args->ntrio++; -+ hts_expand0(trio_t,args->ntrio,args->mtrio,args->trio); -+ trio_t *trio = &args->trio[args->ntrio-1]; -+ trio->idx[iFATHER] = father; -+ trio->idx[iMOTHER] = mother; -+ trio->idx[iCHILD] = child; -+ } -+ while ( hts_getline(fp, KS_SEP_LINE, &str)>=0 ); -+ -+ fprintf(bcftools_stderr,"Identified %d complete trios in the VCF file\n", args->ntrio); -+ if ( !args->ntrio ) error("No complete trio identified\n"); -+ -+ // sort the sample by index so that they are accessed more or less sequentially -+ qsort(args->trio,args->ntrio,sizeof(trio_t),cmp_trios); -+ -+ free(str.s); -+ free(off); -+ if ( hts_close(fp)!=0 ) error("[%s] Error: close failed .. 
%s\n", __func__,fname); -+} -+ -+static void init_data(args_t *args) -+{ -+ args->sr = bcf_sr_init(); -+ if ( args->regions ) -+ { -+ args->sr->require_index = 1; -+ if ( bcf_sr_set_regions(args->sr, args->regions, args->regions_is_file)<0 ) error("Failed to read the regions: %s\n",args->regions); -+ } -+ if ( args->targets && bcf_sr_set_targets(args->sr, args->targets, args->targets_is_file, 0)<0 ) error("Failed to read the targets: %s\n",args->targets); -+ if ( !bcf_sr_add_reader(args->sr,args->fname) ) error("Error: %s\n", bcf_sr_strerror(args->sr->errnum)); -+ args->hdr = bcf_sr_get_header(args->sr,0); -+ -+ if ( args->ped_fname ) -+ parse_ped(args, args->ped_fname); -+ -+ parse_filters(args); -+ -+ int i; -+ if ( !args->nflt_str ) -+ { -+ args->filters = (flt_stats_t*) calloc(1, sizeof(flt_stats_t)); -+ args->nfilters = 1; -+ args->filters[0].expr = strdup("all"); -+ args->filters[0].stats.nvaf = (uint32_t*) calloc(NVAF, sizeof(uint32_t)); -+ args->filters[0].stats.nlen = (uint32_t*) calloc(MAX_LEN*2+1, sizeof(uint32_t)); -+ args->filters[0].stats.nfrac = (uint32_t*) calloc(MAX_LEN*2+1, sizeof(uint32_t)); -+ args->filters[0].stats.dfrac = (double*) calloc(MAX_LEN*2+1, sizeof(double)); -+ } -+ else -+ { -+ args->nfilters = args->nflt_str; -+ args->filters = (flt_stats_t*) calloc(args->nfilters, sizeof(flt_stats_t)); -+ for (i=0; infilters; i++) -+ { -+ args->filters[i].filter = filter_init(args->hdr, args->flt_str[i]); -+ args->filters[i].expr = strdup(args->flt_str[i]); -+ args->filters[i].stats.nvaf = (uint32_t*) calloc(NVAF, sizeof(uint32_t)); -+ args->filters[i].stats.nlen = (uint32_t*) calloc(MAX_LEN*2+1, sizeof(uint32_t)); -+ args->filters[i].stats.nfrac = (uint32_t*) calloc(MAX_LEN*2+1, sizeof(uint32_t)); -+ args->filters[i].stats.dfrac = (double*) calloc(MAX_LEN*2+1, sizeof(double)); -+ -+ // replace tab's with spaces so that the output stays parsable -+ char *tmp = args->filters[i].expr; -+ while ( *tmp ) -+ { -+ if ( *tmp=='\t' ) *tmp = ' '; -+ tmp++; 
-+ } -+ } -+ } -+ args->nsmpl = bcf_hdr_nsamples(args->hdr); -+} -+static void destroy_data(args_t *args) -+{ -+ int i; -+ for (i=0; infilters; i++) -+ { -+ if ( args->filters[i].filter ) filter_destroy(args->filters[i].filter); -+ free(args->filters[i].stats.nvaf); -+ free(args->filters[i].stats.nlen); -+ free(args->filters[i].stats.nfrac); -+ free(args->filters[i].stats.dfrac); -+ free(args->filters[i].expr); -+ } -+ free(args->filters); -+ for (i=0; inflt_str; i++) free(args->flt_str[i]); -+ free(args->flt_str); -+ bcf_sr_destroy(args->sr); -+ free(args->ac); -+ free(args->trio); -+ free(args->csq_str); -+ free(args->gt_arr); -+ free(args->ad_arr); -+ free(args); -+} -+static void report_stats(args_t *args) -+{ -+ int i = 0,j; -+ FILE *fh = !args->output_fname || !strcmp("-",args->output_fname) ? bcftools_stdout : fopen(args->output_fname,"w"); -+ if ( !fh ) error("Could not open the file for writing: %s\n", args->output_fname); -+ fprintf(fh,"# CMD line shows the command line used to generate this output\n"); -+ fprintf(fh,"# DEF lines define expressions for all tested thresholds\n"); -+ fprintf(fh,"# SN* summary number for every threshold:\n"); -+ fprintf(fh,"# %d) SN*, filter id\n", ++i); -+ fprintf(fh,"# %d) number of samples (or trios with -p)\n", ++i); -+ fprintf(fh,"# %d) number of indel sites total\n", ++i); -+ fprintf(fh,"# %d) number of indel sites that pass the filter (and, with -p, have a de novo indel)\n", ++i); -+ fprintf(fh,"# %d) number of indel genotypes that pass the filter (and, with -p, are de novo)\n", ++i); -+ fprintf(fh,"# %d) number of insertions (site-wise, not genotype-wise)\n", ++i); -+ fprintf(fh,"# %d) number of deletions (site-wise, not genotype-wise)\n", ++i); -+ fprintf(fh,"# %d) number of frameshifts (site-wise, not genotype-wise)\n", ++i); -+ fprintf(fh,"# %d) number of inframe indels (site-wise, not genotype-wise)\n", ++i); -+ fprintf(fh,"#\n"); -+ i = 0; -+ fprintf(fh,"# DVAF* lines report indel variant allele frequency (VAF) 
distribution for every threshold,\n"); -+ fprintf(fh,"# k-th bin corresponds to the frequency k/(nVAF-1):\n"); -+ fprintf(fh,"# %d) DVAF*, filter id\n", ++i); -+ fprintf(fh,"# %d) nVAF, number of bins which split the [0,1] VAF interval.\n", ++i); -+ fprintf(fh,"# %d-%d) counts of indel genotypes in the VAF bin. For non-reference hets, the VAF of the less supported allele is recorded\n", i+1, i+NVAF); -+ fprintf(fh,"#\n"); -+ i = 0; -+ fprintf(fh,"# DLEN* lines report indel length distribution for every threshold. When genotype fields are available,\n"); -+ fprintf(fh,"# the counts correspond to the number of genotypes, otherwise the number of sites are given.\n"); -+ fprintf(fh,"# The k-th bin corresponds to the indel size k-MAX_LEN, negative for deletions, positive for insertions.\n"); -+ fprintf(fh,"# The firt/last bin contains also all deletions/insertions larger than MAX_LEN:\n"); -+ fprintf(fh,"# %d) DLEN*, filter id\n", ++i); -+ fprintf(fh,"# %d) maximum indel length\n", ++i); -+ fprintf(fh,"# %d-%d) counts of indel lengths (-max,..,0,..,max), all unique alleles in a genotype are recorded (alt hets increase the counters 2x, alt homs 1x)\n", i+1, i+MAX_LEN*2+1); -+ fprintf(fh,"#\n"); -+ i = 0; -+ fprintf(fh,"# DFRAC* lines report the mean minor allele fraction at HET indel genotypes as a function of indel size.\n"); -+ fprintf(fh,"# The format is the same as for DLEN:\n"); -+ fprintf(fh,"# %d) DFRAC*, filter id\n", ++i); -+ fprintf(fh,"# %d) maximum indel length\n", ++i); -+ fprintf(fh,"# %d-%d) mean fraction at indel lengths (-max,..,0,..,max)\n", i+1, i+MAX_LEN*2+1); -+ fprintf(fh,"#\n"); -+ i = 0; -+ fprintf(fh,"# NFRAC* lines report the number of indels informing the DFRAC distribution.\n"); -+ fprintf(fh,"# %d) NFRAC*, filter id\n", ++i); -+ fprintf(fh,"# %d) maximum indel length\n", ++i); -+ fprintf(fh,"# %d-%d) counts at indel lengths (-max,..,0,..,max)\n", i+1, i+MAX_LEN*2+1); -+ fprintf(fh,"#\n"); -+ fprintf(fh, "CMD\t%s", args->argv[0]); -+ for (i=1; 
iargc; i++) fprintf(fh, " %s",args->argv[i]); -+ fprintf(fh, "\n"); -+ for (i=0; infilters; i++) -+ { -+ flt_stats_t *flt = &args->filters[i]; -+ fprintf(fh,"DEF\tFLT%d\t%s\n", i, flt->expr); -+ } -+ for (i=0; infilters; i++) -+ { -+ stats_t *stats = &args->filters[i].stats; -+ -+ fprintf(fh,"SN%d", i); -+ fprintf(fh,"\t%u", args->ntrio ? args->ntrio : args->nsmpl); -+ fprintf(fh,"\t%u", stats->nsites); -+ fprintf(fh,"\t%u", stats->npass); -+ fprintf(fh,"\t%u", stats->npass_gt); -+ fprintf(fh,"\t%u", stats->nins); -+ fprintf(fh,"\t%u", stats->ndel); -+ fprintf(fh,"\t%u", stats->nframeshift); -+ fprintf(fh,"\t%u", stats->ninframe); -+ fprintf(fh,"\n"); -+ -+ fprintf(fh,"DVAF%d", i); -+ fprintf(fh,"\t%d", NVAF); -+ for (j=0; jnvaf[j]); -+ fprintf(fh,"\n"); -+ -+ fprintf(fh,"DLEN%d", i); -+ fprintf(fh,"\t%d", MAX_LEN); -+ for (j=0; jnlen[j]); -+ fprintf(fh,"\n"); -+ -+ fprintf(fh,"DFRAC%d", i); -+ fprintf(fh,"\t%d", MAX_LEN); -+ for (j=0; jnfrac[j] ) fprintf(fh,"\t%.2f",stats->dfrac[j]/stats->nfrac[j]); -+ else fprintf(fh,"\t."); -+ fprintf(fh,"\n"); -+ -+ fprintf(fh,"NFRAC%d", i); -+ fprintf(fh,"\t%d", MAX_LEN); -+ for (j=0; jnfrac[j]); -+ fprintf(fh,"\n"); -+ } -+ if ( fclose(fh)!=0 ) error("Close failed: %s\n", (!args->output_fname || !strcmp("-",args->output_fname)) ? "bcftools_stdout" : args->output_fname); -+} -+ -+static inline int parse_genotype(int32_t *arr, int ngt1, int idx, int als[2]) -+{ -+ int32_t *ptr = arr + ngt1 * idx; -+ if ( bcf_gt_is_missing(ptr[0]) ) return -1; -+ als[0] = bcf_gt_allele(ptr[0]); -+ -+ if ( ngt1==1 || ptr[1]==bcf_int32_vector_end ) { ptr[1] = ptr[0]; return -2; } -+ -+ if ( bcf_gt_is_missing(ptr[1]) ) return -1; -+ als[1] = bcf_gt_allele(ptr[1]); -+ -+ return 0; -+} -+ -+static inline void update_indel_stats(args_t *args, bcf1_t *rec, stats_t *stats, int ismpl, int *als) -+{ -+ int j; -+ if ( als[0] >= args->nad1 || als[1] >= args->nad1 ) error("Incorrect GT allele at %s:%"PRId64" .. 
%d/%d\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1,als[0],als[1]); -+ int32_t *ad_ptr = args->ad_arr + ismpl*args->nad1; -+ -+ // find the allele with most support -+ uint32_t ntot = 0; -+ for (j=0; jnad1; j++) -+ { -+ if ( ad_ptr[j]==bcf_int32_missing ) continue; -+ if ( ad_ptr[j]==bcf_int32_vector_end ) break; -+ ntot += ad_ptr[j]; -+ } -+ if ( !ntot ) return; -+ -+ // Find the alternate allele fraction, total and relative. Set al0 to be the more frequent indel allele. -+ // The genotypes have been already sanitized in parse_genotype(). -+ int al0 = als[0], al1 = als[1]; -+ if ( !(bcf_get_variant_type(rec,al0) & VCF_INDEL) ) -+ { -+ if ( !(bcf_get_variant_type(rec,al1) & VCF_INDEL) ) error("FIXME: this should not happen .. %s:%"PRId64" .. %d/%d\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1,al0,al1); -+ al0 = als[1]; al1 = als[0]; -+ } -+ else if ( (bcf_get_variant_type(rec,al1) & VCF_INDEL) && al0!=al1 ) -+ { -+ // Select the more frequent indel allele. -+ if ( ad_ptr[al0] < ad_ptr[al1] ) al0 = als[1], al1 = als[0]; -+ -+ // Record length of both indel alleles -+ int bin = len2bin(rec->d.var[al1].n); -+ if ( bin >= 0 ) stats->nlen[bin]++; -+ } -+ -+ float vaf = (float)ad_ptr[al0] / ntot; -+ int bin = vaf2bin(vaf); -+ stats->nvaf[bin]++; -+ -+ // al0 is now the major indel allele -+ int len_bin = len2bin(rec->d.var[al0].n); -+ if ( len_bin < 0 ) return; -+ stats->nlen[len_bin]++; -+ -+ if ( al0!=al1 ) -+ { -+ ntot = ad_ptr[al0] + ad_ptr[al1]; -+ if ( ntot ) -+ { -+ stats->nfrac[len_bin]++; -+ stats->dfrac[len_bin]+= (double)ad_ptr[al0] / ntot; -+ } -+ } -+} -+ -+static void process_record(args_t *args, bcf1_t *rec, flt_stats_t *flt) -+{ -+ int i,j; -+ uint8_t *smpl_pass = NULL; -+ -+ stats_t *stats = &flt->stats; -+ stats->nsites++; -+ -+ // Find out which samples/trios pass and if the site passes -+ if ( flt->filter ) -+ { -+ int pass_site = filter_test(flt->filter, rec, (const uint8_t**) &smpl_pass); -+ if ( args->ntrio ) -+ { -+ if ( 
args->filter_logic & FLT_EXCLUDE ) -+ { -+ if ( pass_site ) -+ { -+ if ( !smpl_pass ) return; -+ pass_site = 0; -+ for (i=0; intrio; i++) -+ { -+ int pass_trio = 1; -+ for (j=0; j<3; j++) -+ { -+ int idx = args->trio[i].idx[j]; -+ if ( smpl_pass[idx] ) { pass_trio = 0; break; } -+ } -+ args->trio[i].pass = pass_trio; -+ if ( pass_trio ) pass_site = 1; -+ } -+ if ( !pass_site ) return; -+ } -+ else -+ for (i=0; intrio; i++) args->trio[i].pass = 1; -+ } -+ else if ( !pass_site ) return; -+ else if ( smpl_pass ) -+ { -+ pass_site = 0; -+ for (i=0; intrio; i++) -+ { -+ int pass_trio = 1; -+ for (j=0; j<3; j++) -+ { -+ int idx = args->trio[i].idx[j]; -+ if ( !smpl_pass[idx] ) { pass_trio = 0; break; } -+ } -+ args->trio[i].pass = pass_trio; -+ if ( pass_trio ) pass_site = 1; -+ } -+ if ( !pass_site ) return; -+ } -+ else -+ for (i=0; intrio; i++) args->trio[i].pass = 1; -+ } -+ else -+ { -+ if ( args->filter_logic & FLT_EXCLUDE ) -+ { -+ if ( pass_site ) -+ { -+ if ( !smpl_pass ) return; -+ pass_site = 0; -+ for (i=0; insmpl; i++) -+ { -+ if ( smpl_pass[i] ) smpl_pass[i] = 0; -+ else { smpl_pass[i] = 1; pass_site = 1; } -+ } -+ if ( !pass_site ) return; -+ } -+ else -+ for (i=0; insmpl; i++) smpl_pass[i] = 1; -+ } -+ else if ( !pass_site ) return; -+ } -+ } -+ -+ args->ngt = 0; -+ if ( args->nsmpl ) -+ { -+ // Get the genotypes -+ args->ngt = bcf_get_genotypes(args->hdr, rec, &args->gt_arr, &args->mgt_arr); -+ args->ngt1 = args->ngt / rec->n_sample; -+ -+ if ( args->ngt>0 ) -+ { -+ // Get the AD counts -+ args->nad = bcf_get_format_int32(args->hdr, rec, "AD", &args->ad_arr, &args->mad_arr); -+ args->nad1 = args->nad / rec->n_sample; -+ if ( args->nad>0 && args->nad1 != rec->n_allele ) error("Incorrect number of FORMAT/AD values at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); -+ } -+ } -+ -+ // Is there a star allele? 
Don't count overlapping deletions twice -+ int star_allele = -1; -+ for (i=1; in_allele; i++) -+ if ( !rec->d.allele[i][1] && rec->d.allele[i][0]=='*' ) { star_allele = i; break; } -+ -+ -+ if ( args->ngt>0 && args->ntrio ) -+ { -+ int is_dnm = 0; -+ for (i=0; intrio; i++) -+ { -+ if ( flt->filter && !args->trio[i].pass ) continue; -+ -+ // Determine the alternate allele and the genotypes, skip if any of the alleles is missing. -+ // the order is: child, father, mother -+ int als[6], *als_child = als, *als_father = als+2, *als_mother = als+4; -+ if ( parse_genotype(args->gt_arr, args->ngt1, args->trio[i].idx[iCHILD], als_child) < 0 ) continue; -+ if ( parse_genotype(args->gt_arr, args->ngt1, args->trio[i].idx[iFATHER], als_father) < 0 ) continue; -+ if ( parse_genotype(args->gt_arr, args->ngt1, args->trio[i].idx[iMOTHER], als_mother) < 0 ) continue; -+ -+ // Is it a DNM? -+ if ( !args->allow_alt2ref_DNMs && als_child[0]==0 && als_child[1]==0 ) continue; -+ if ( (als_child[0]==als_father[0] || als_child[0]==als_father[1]) && (als_child[1]==als_mother[0] || als_child[1]==als_mother[1]) ) continue; -+ if ( (als_child[1]==als_father[0] || als_child[1]==als_father[1]) && (als_child[0]==als_mother[0] || als_child[0]==als_mother[1]) ) continue; -+ if ( als_child[0]==star_allele || als_child[1]==star_allele ) continue; // don't count the same event multiple times -+ if ( als_father[0]==star_allele || als_father[1]==star_allele ) continue; -+ if ( als_mother[0]==star_allele || als_mother[1]==star_allele ) continue; -+ -+ int child_is_indel = (bcf_get_variant_type(rec,als_child[0]) & VCF_INDEL) || (bcf_get_variant_type(rec,als_child[1]) & VCF_INDEL) ? 
1 : 0; -+ -+ if ( !args->allow_alt2ref_DNMs ) -+ { -+ if ( !child_is_indel ) continue; -+ } -+ else -+ { -+ if ( !child_is_indel && -+ !(bcf_get_variant_type(rec,als_father[0]) & VCF_INDEL) && -+ !(bcf_get_variant_type(rec,als_father[1]) & VCF_INDEL) && -+ !(bcf_get_variant_type(rec,als_mother[0]) & VCF_INDEL) && -+ !(bcf_get_variant_type(rec,als_mother[1]) & VCF_INDEL) ) continue; // not an indel, in any sample -+ } -+ -+ if ( child_is_indel ) -+ update_indel_stats(args, rec, stats, args->trio[i].idx[iCHILD], als_child); -+ -+ //printf("MERR\t%s\t%d\t%s\n", bcf_seqname(args->hdr,rec),rec->pos+1,args->hdr->samples[args->trio[i].idx[iCHILD]]); -+ -+ stats->npass_gt++; -+ -+ is_dnm = 1; -+ } -+ if ( !is_dnm ) return; -+ } -+ else if ( args->ngt>0 ) -+ { -+ for (i=0; insmpl; i++) -+ { -+ if ( smpl_pass && !smpl_pass[i] ) continue; -+ -+ // Determine the alternate allele and the genotypes, skip if any of the alleles is missing. -+ int als[2] = {0,0}; -+ int ret = parse_genotype(args->gt_arr, args->ngt1, i, als); -+ if ( ret==-1 ) continue; // missing genotype -+ if ( !(bcf_get_variant_type(rec,als[0]) & VCF_INDEL) && !(bcf_get_variant_type(rec,als[1]) & VCF_INDEL) ) continue; // not an indel -+ -+ update_indel_stats(args, rec, stats, i, als); -+ -+ stats->npass_gt++; -+ } -+ } -+ -+ if ( bcf_get_info_string(args->hdr,rec,args->csq_tag,&args->csq_str,&args->mcsq_str) > 0 ) -+ { -+ if ( strstr(args->csq_str,"inframe") ) stats->ninframe++; -+ if ( strstr(args->csq_str,"frameshift") ) stats->nframeshift++; -+ } -+ -+ for (i=1; in_allele; i++) -+ { -+ if ( !(bcf_get_variant_type(rec,i) & VCF_INDEL) ) continue; -+ if ( rec->d.var[i].n < 0 ) stats->ndel++; -+ else if ( rec->d.var[i].n > 0 ) stats->nins++; -+ if ( args->ngt <= 0 ) -+ { -+ int bin = len2bin(rec->d.var[i].n); -+ if ( bin >= 0 ) stats->nlen[bin]++; -+ } -+ } -+ stats->npass++; -+} -+ -+int run(int argc, char **argv) -+{ -+ args_t *args = (args_t*) calloc(1,sizeof(args_t)); -+ args->argc = argc; args->argv = argv; 
-+ args->output_fname = "-"; -+ args->csq_tag = "CSQ"; -+ static struct option loptions[] = -+ { -+ {"max-len",required_argument,0,1}, -+ {"nvaf",required_argument,0,2}, -+ {"alt2ref-DNM",no_argument,0,3}, -+ {"ped",required_argument,0,'p'}, -+ {"csq-tag",required_argument,0,'c'}, -+ {"include",required_argument,0,'i'}, -+ {"exclude",required_argument,0,'e'}, -+ {"output",required_argument,NULL,'o'}, -+ {"regions",1,0,'r'}, -+ {"regions-file",1,0,'R'}, -+ {"targets",1,0,'t'}, -+ {"targets-file",1,0,'T'}, -+ {NULL,0,NULL,0} -+ }; -+ char *tmp; -+ int c, i; -+ while ((c = getopt_long(argc, argv, "o:s:i:e:r:R:t:T:c:p:",loptions,NULL)) >= 0) -+ { -+ switch (c) -+ { -+ case 1 : -+ MAX_LEN = strtod(optarg,&tmp); -+ if ( *tmp ) error("Could not parse: --max-len %s\n", optarg); -+ if ( MAX_LEN<=0 ) error("Expected value bigger than 0 --max-len\n"); -+ break; -+ case 2 : -+ NVAF = strtod(optarg,&tmp); -+ if ( *tmp ) error("Could not parse: --max-len %s\n", optarg); -+ if ( NVAF<0 || NVAF>1 ) error("Expected value from the interval [0,1] with --nvaf\n"); -+ break; -+ case 3 : args->allow_alt2ref_DNMs = 1; break; -+ case 'p': args->ped_fname = optarg; break; -+ case 'c': args->csq_tag = optarg; break; -+ case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; -+ case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; -+ case 't': args->targets = optarg; break; -+ case 'T': args->targets = optarg; args->targets_is_file = 1; break; -+ case 'r': args->regions = optarg; break; -+ case 'R': args->regions = optarg; args->regions_is_file = 1; break; -+ case 'o': args->output_fname = optarg; break; -+ case 'h': -+ case '?': -+ default: error("%s", usage_text()); break; -+ } -+ } -+ if ( optind==argc ) -+ { -+ if ( !isatty(fileno((FILE *)stdin)) ) args->fname = "-"; // reading from stdin -+ else { error("%s",usage_text()); } -+ } -+ else if ( optind+1!=argc ) error("%s",usage_text()); -+ else args->fname = argv[optind]; -+ -+ 
init_data(args); -+ -+ while ( bcf_sr_next_line(args->sr) ) -+ { -+ bcf1_t *rec = bcf_sr_get_line(args->sr,0); -+ if ( !(bcf_get_variant_types(rec) & VCF_INDEL) ) continue; -+ for (i=0; infilters; i++) -+ process_record(args, rec, &args->filters[i]); -+ } -+ -+ report_stats(args); -+ destroy_data(args); -+ -+ return 0; -+} ---- python-pysam.orig/bcftools/plugins/isecGT.c -+++ python-pysam/bcftools/plugins/isecGT.c -@@ -131,14 +131,14 @@ - smpl_ilist_t *smpl = smpl_ilist_map(args->hdr_a, args->hdr_b, SMPL_STRICT); - args->out_fh = hts_open(args->output_fname, hts_bcf_wmode(args->output_type)); - if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno)); -- bcf_hdr_write(args->out_fh, args->hdr_a); -+ if ( bcf_hdr_write(args->out_fh, args->hdr_a)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); - - while ( bcf_sr_next_line(args->sr) ) - { - if ( !bcf_sr_has_line(args->sr,0) ) continue; - if ( !bcf_sr_has_line(args->sr,1) ) - { -- bcf_write(args->out_fh, args->hdr_a, bcf_sr_get_line(args->sr,0)); -+ if ( bcf_write(args->out_fh, args->hdr_a, bcf_sr_get_line(args->sr,0))!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); - continue; - } - -@@ -163,7 +163,7 @@ - } - } - if ( dirty ) bcf_update_genotypes(args->hdr_a, line_a, args->arr_a, ngt_a*smpl->n); -- bcf_write(args->out_fh, args->hdr_a, line_a); -+ if ( bcf_write(args->out_fh, args->hdr_a, line_a)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); - } - - if ( hts_close(args->out_fh)!=0 ) error("Close failed: %s\n",args->output_fname); ---- python-pysam.orig/bcftools/plugins/isecGT.c.pysam.c -+++ python-pysam/bcftools/plugins/isecGT.c.pysam.c -@@ -133,14 +133,14 @@ - smpl_ilist_t *smpl = smpl_ilist_map(args->hdr_a, args->hdr_b, SMPL_STRICT); - args->out_fh = hts_open(args->output_fname, hts_bcf_wmode(args->output_type)); - if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", 
args->output_fname, strerror(errno)); -- bcf_hdr_write(args->out_fh, args->hdr_a); -+ if ( bcf_hdr_write(args->out_fh, args->hdr_a)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); - - while ( bcf_sr_next_line(args->sr) ) - { - if ( !bcf_sr_has_line(args->sr,0) ) continue; - if ( !bcf_sr_has_line(args->sr,1) ) - { -- bcf_write(args->out_fh, args->hdr_a, bcf_sr_get_line(args->sr,0)); -+ if ( bcf_write(args->out_fh, args->hdr_a, bcf_sr_get_line(args->sr,0))!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); - continue; - } - -@@ -165,7 +165,7 @@ - } - } - if ( dirty ) bcf_update_genotypes(args->hdr_a, line_a, args->arr_a, ngt_a*smpl->n); -- bcf_write(args->out_fh, args->hdr_a, line_a); -+ if ( bcf_write(args->out_fh, args->hdr_a, line_a)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); - } - - if ( hts_close(args->out_fh)!=0 ) error("Close failed: %s\n",args->output_fname); ---- python-pysam.orig/bcftools/plugins/mendelian.c -+++ python-pysam/bcftools/plugins/mendelian.c -@@ -1,6 +1,6 @@ - /* The MIT License - -- Copyright (c) 2015 Genome Research Ltd. -+ Copyright (c) 2015-2018 Genome Research Ltd. - - Author: Petr Danecek - -@@ -27,16 +27,18 @@ - #include - #include - #include -+#include - #include - #include -+#include - #include - #include - #include - #include - #include - #include // for isatty --#include "bcftools.h" --#include "regidx.h" -+#include "../bcftools.h" -+#include "../regidx.h" - - #define MODE_COUNT 1 - #define MODE_LIST_GOOD 2 -@@ -148,7 +150,7 @@ - " -r, --rules [?] predefined rules, 'list' to print available settings, append '?' 
for details\n" - " -R, --rules-file inheritance rules, see example below\n" - " -t, --trio names of mother, father and the child\n" -- " -T, --trio-file list of trios, one per line\n" -+ " -T, --trio-file list of trios, one per line (mother,father,child)\n" - "\n" - "Example:\n" - " # Default inheritance patterns, override with -r\n" -@@ -363,13 +365,22 @@ - if ( !args.mode ) error("Expected one of the -c, -d or -l options\n"); - if ( args.mode&MODE_DELETE && !(args.mode&(MODE_LIST_GOOD|MODE_LIST_BAD)) ) args.mode |= MODE_LIST_GOOD|MODE_LIST_BAD; - -+ FILE *log_fh = stderr; -+ if ( args.mode==MODE_COUNT ) -+ { -+ log_fh = strcmp("-",args.output_fname) ? fopen(args.output_fname,"w") : stdout; -+ if ( !log_fh ) error("Error: cannot write to %s\n", args.output_fname); -+ } -+ - args.sr = bcf_sr_init(); -- if ( !bcf_sr_add_reader(args.sr, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args.sr->errnum)); -+ if ( !bcf_sr_add_reader(args.sr, fname) ) error("Failed to read from %s: %s\n", !strcmp("-",fname)?"standard input":fname,bcf_sr_strerror(args.sr->errnum)); - args.hdr = bcf_sr_get_header(args.sr, 0); -- args.out_fh = hts_open(args.output_fname,hts_bcf_wmode(args.output_type)); -- if ( args.out_fh == NULL ) error("Can't write to \"%s\": %s\n", args.output_fname, strerror(errno)); -- bcf_hdr_write(args.out_fh, args.hdr); -- -+ if ( args.mode!=MODE_COUNT ) -+ { -+ args.out_fh = hts_open(args.output_fname,hts_bcf_wmode(args.output_type)); -+ if ( args.out_fh == NULL ) error("Can't write to \"%s\": %s\n", args.output_fname, strerror(errno)); -+ if ( bcf_hdr_write(args.out_fh, args.hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args.output_fname); -+ } - - int i, n = 0; - char **list; -@@ -420,29 +431,30 @@ - if ( line ) - { - if ( line->errcode ) error("TODO: Unchecked error (%d), exiting\n",line->errcode); -- bcf_write1(args.out_fh, args.hdr, line); -+ if ( args.out_fh && bcf_write1(args.out_fh, args.hdr, line)!=0 ) error("[%s] Error: 
cannot write to %s\n", __func__,args.output_fname); - } - } -+ if ( args.out_fh && hts_close(args.out_fh)!=0 ) error("Error: close failed\n"); - -- -- fprintf(stderr,"# [1]nOK\t[2]nBad\t[3]nSkipped\t[4]Trio\n"); -+ fprintf(log_fh,"# [1]nOK\t[2]nBad\t[3]nSkipped\t[4]Trio (mother,father,child)\n"); - for (i=0; inok,trio->nbad,args.nrec-(trio->nok+trio->nbad), - bcf_hdr_int2id(args.hdr, BCF_DT_SAMPLE, trio->imother), - bcf_hdr_int2id(args.hdr, BCF_DT_SAMPLE, trio->ifather), - bcf_hdr_int2id(args.hdr, BCF_DT_SAMPLE, trio->ichild) - ); - } -+ if ( log_fh!=stderr && log_fh!=stdout && fclose(log_fh) ) error("Error: close failed for %s\n", args.output_fname); -+ - free(args.gt_arr); - free(args.trios); - regitr_destroy(args.itr); - regitr_destroy(args.itr_ori); - regidx_destroy(args.rules); - bcf_sr_destroy(args.sr); -- if ( hts_close(args.out_fh)!=0 ) error("Error: close failed\n"); - return 0; - } - -@@ -450,7 +462,7 @@ - { - static int warned = 0; - if ( warned ) return; -- fprintf(stderr,"Incorrect ploidy at %s:%d, skipping the trio. (This warning is printed only once.)\n", bcf_seqname(args.hdr,rec),rec->pos+1); -+ fprintf(stderr,"Incorrect ploidy at %s:%"PRId64", skipping the trio. (This warning is printed only once.)\n", bcf_seqname(args.hdr,rec),(int64_t) rec->pos+1); - warned = 1; - } - -@@ -555,7 +567,7 @@ - } - - if ( needs_update && bcf_update_genotypes(args.hdr,rec,args.gt_arr,ngt*bcf_hdr_nsamples(args.hdr)) ) -- error("Could not update GT field at %s:%d\n", bcf_seqname(args.hdr,rec),rec->pos+1); -+ error("Could not update GT field at %s:%"PRId64"\n", bcf_seqname(args.hdr,rec),(int64_t) rec->pos+1); - - if ( args.mode&MODE_DELETE ) return rec; - if ( args.mode&MODE_LIST_GOOD ) return has_bad ? NULL : rec; ---- python-pysam.orig/bcftools/plugins/mendelian.c.pysam.c -+++ python-pysam/bcftools/plugins/mendelian.c.pysam.c -@@ -2,7 +2,7 @@ - - /* The MIT License - -- Copyright (c) 2015 Genome Research Ltd. -+ Copyright (c) 2015-2018 Genome Research Ltd. 
- - Author: Petr Danecek - -@@ -29,16 +29,18 @@ - #include - #include - #include -+#include - #include - #include -+#include - #include - #include - #include - #include - #include - #include // for isatty --#include "bcftools.h" --#include "regidx.h" -+#include "../bcftools.h" -+#include "../regidx.h" - - #define MODE_COUNT 1 - #define MODE_LIST_GOOD 2 -@@ -150,7 +152,7 @@ - " -r, --rules [?] predefined rules, 'list' to print available settings, append '?' for details\n" - " -R, --rules-file inheritance rules, see example below\n" - " -t, --trio names of mother, father and the child\n" -- " -T, --trio-file list of trios, one per line\n" -+ " -T, --trio-file list of trios, one per line (mother,father,child)\n" - "\n" - "Example:\n" - " # Default inheritance patterns, override with -r\n" -@@ -365,13 +367,22 @@ - if ( !args.mode ) error("Expected one of the -c, -d or -l options\n"); - if ( args.mode&MODE_DELETE && !(args.mode&(MODE_LIST_GOOD|MODE_LIST_BAD)) ) args.mode |= MODE_LIST_GOOD|MODE_LIST_BAD; - -+ FILE *log_fh = bcftools_stderr; -+ if ( args.mode==MODE_COUNT ) -+ { -+ log_fh = strcmp("-",args.output_fname) ? 
fopen(args.output_fname,"w") : bcftools_stdout; -+ if ( !log_fh ) error("Error: cannot write to %s\n", args.output_fname); -+ } -+ - args.sr = bcf_sr_init(); -- if ( !bcf_sr_add_reader(args.sr, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args.sr->errnum)); -+ if ( !bcf_sr_add_reader(args.sr, fname) ) error("Failed to read from %s: %s\n", !strcmp("-",fname)?"standard input":fname,bcf_sr_strerror(args.sr->errnum)); - args.hdr = bcf_sr_get_header(args.sr, 0); -- args.out_fh = hts_open(args.output_fname,hts_bcf_wmode(args.output_type)); -- if ( args.out_fh == NULL ) error("Can't write to \"%s\": %s\n", args.output_fname, strerror(errno)); -- bcf_hdr_write(args.out_fh, args.hdr); -- -+ if ( args.mode!=MODE_COUNT ) -+ { -+ args.out_fh = hts_open(args.output_fname,hts_bcf_wmode(args.output_type)); -+ if ( args.out_fh == NULL ) error("Can't write to \"%s\": %s\n", args.output_fname, strerror(errno)); -+ if ( bcf_hdr_write(args.out_fh, args.hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args.output_fname); -+ } - - int i, n = 0; - char **list; -@@ -422,29 +433,30 @@ - if ( line ) - { - if ( line->errcode ) error("TODO: Unchecked error (%d), exiting\n",line->errcode); -- bcf_write1(args.out_fh, args.hdr, line); -+ if ( args.out_fh && bcf_write1(args.out_fh, args.hdr, line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args.output_fname); - } - } -+ if ( args.out_fh && hts_close(args.out_fh)!=0 ) error("Error: close failed\n"); - -- -- fprintf(bcftools_stderr,"# [1]nOK\t[2]nBad\t[3]nSkipped\t[4]Trio\n"); -+ fprintf(log_fh,"# [1]nOK\t[2]nBad\t[3]nSkipped\t[4]Trio (mother,father,child)\n"); - for (i=0; inok,trio->nbad,args.nrec-(trio->nok+trio->nbad), - bcf_hdr_int2id(args.hdr, BCF_DT_SAMPLE, trio->imother), - bcf_hdr_int2id(args.hdr, BCF_DT_SAMPLE, trio->ifather), - bcf_hdr_int2id(args.hdr, BCF_DT_SAMPLE, trio->ichild) - ); - } -+ if ( log_fh!=bcftools_stderr && log_fh!=bcftools_stdout && fclose(log_fh) ) error("Error: close failed 
for %s\n", args.output_fname); -+ - free(args.gt_arr); - free(args.trios); - regitr_destroy(args.itr); - regitr_destroy(args.itr_ori); - regidx_destroy(args.rules); - bcf_sr_destroy(args.sr); -- if ( hts_close(args.out_fh)!=0 ) error("Error: close failed\n"); - return 0; - } - -@@ -452,7 +464,7 @@ - { - static int warned = 0; - if ( warned ) return; -- fprintf(bcftools_stderr,"Incorrect ploidy at %s:%d, skipping the trio. (This warning is printed only once.)\n", bcf_seqname(args.hdr,rec),rec->pos+1); -+ fprintf(bcftools_stderr,"Incorrect ploidy at %s:%"PRId64", skipping the trio. (This warning is printed only once.)\n", bcf_seqname(args.hdr,rec),(int64_t) rec->pos+1); - warned = 1; - } - -@@ -557,7 +569,7 @@ - } - - if ( needs_update && bcf_update_genotypes(args.hdr,rec,args.gt_arr,ngt*bcf_hdr_nsamples(args.hdr)) ) -- error("Could not update GT field at %s:%d\n", bcf_seqname(args.hdr,rec),rec->pos+1); -+ error("Could not update GT field at %s:%"PRId64"\n", bcf_seqname(args.hdr,rec),(int64_t) rec->pos+1); - - if ( args.mode&MODE_DELETE ) return rec; - if ( args.mode&MODE_LIST_GOOD ) return has_bad ? 
NULL : rec; ---- python-pysam.orig/bcftools/plugins/missing2ref.c -+++ python-pysam/bcftools/plugins/missing2ref.c -@@ -109,7 +109,7 @@ - } - } - else{ -- fprintf(stderr,"Warning: Could not calculate allele count at position %d\n", rec->pos); -+ fprintf(stderr,"Warning: Could not calculate allele count at position %"PRId64"\n", (int64_t) rec->pos); - exit(1); - } - ---- python-pysam.orig/bcftools/plugins/missing2ref.c.pysam.c -+++ python-pysam/bcftools/plugins/missing2ref.c.pysam.c -@@ -111,7 +111,7 @@ - } - } - else{ -- fprintf(bcftools_stderr,"Warning: Could not calculate allele count at position %d\n", rec->pos); -+ fprintf(bcftools_stderr,"Warning: Could not calculate allele count at position %"PRId64"\n", (int64_t) rec->pos); - exit(1); - } - ---- /dev/null -+++ python-pysam/bcftools/plugins/parental-origin.c -@@ -0,0 +1,410 @@ -+/* The MIT License -+ -+ Copyright (c) 2019 Genome Research Ltd. -+ -+ Author: Petr Danecek -+ -+ Permission is hereby granted, free of charge, to any person obtaining a copy -+ of this software and associated documentation files (the "Software"), to deal -+ in the Software without restriction, including without limitation the rights -+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -+ copies of the Software, and to permit persons to whom the Software is -+ furnished to do so, subject to the following conditions: -+ -+ The above copyright notice and this permission notice shall be included in -+ all copies or substantial portions of the Software. -+ -+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -+ THE SOFTWARE. -+ -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include // for isatty -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include "bcftools.h" -+#include "filter.h" -+ -+// Logic of the filters: include or exclude sites which match the filters? -+#define FLT_INCLUDE 1 -+#define FLT_EXCLUDE 2 -+ -+#define CNV_DEL 0 -+#define CNV_DUP 1 -+ -+#define iCHILD 0 -+#define iFATHER 1 -+#define iMOTHER 2 -+ -+typedef struct -+{ -+ int idx[3]; // VCF sample index for child, father, mother -+ int pass; // do all three pass the filters? -+} -+trio_t; -+ -+typedef struct -+{ -+ int argc, filter_logic, cnv_type, debug, greedy; -+ filter_t *filter; -+ char *filter_str; -+ char **argv, *pfm, *fname, *region; -+ bcf_srs_t *sr; -+ bcf_hdr_t *hdr; -+ trio_t trio; -+ int32_t *pl, *ad, *gt; // input FMT/PL, AD, and GT values -+ int mpl, mad, mgt; -+ double ppat,pmat; // method 1: probability of paternal/maternal origin -+ int ntest; // number of informative sites -+ int nmat, npat; // method 2: number of pat/mat sites based on simple ad[0] < ad[1] comparison -+ double min_pbinom; // minimum binomial probability of paternal hets -+} -+args_t; -+ -+args_t args; -+ -+const char *about(void) -+{ -+ return "Determine parental origin of a CNV region in a trio.\n"; -+} -+ -+static const char *usage_text(void) -+{ -+ return -+ "\n" -+ "About: Determine parental origin of a CNV region\n" -+ "Usage: bcftools +parental-origin [Plugin Options]\n" -+ "Plugin options:\n" -+ " -b, --min-binom-prob FLOAT exclude parental HETs with skewed ALT allele fraction [1e-2]\n" -+ " -d, --debug list informative sites\n" -+ " -e, --exclude EXPR exclude sites and samples for 
which the expression is true\n" -+ " -g, --greedy use also ambigous sites, e.g. het+hom parents for deletions\n" -+ " -i, --include EXPR include sites and samples for which the expression is true\n" -+ " -p, --pfm P,F,M sample names of proband, father, and mother\n" -+ " -r, --region REGION chr:beg-end\n" -+ " -t, --type the CNV type\n" -+ "\n" -+ "Example:\n" -+ " bcftools +parental-origin -p proband,father,mother -t dup -r 14:22671179-22947951 file.bcf\n" -+ "\n"; -+} -+ -+static void init_data(args_t *args) -+{ -+ args->sr = bcf_sr_init(); -+ if ( args->region ) -+ { -+ args->sr->require_index = 1; -+ if ( bcf_sr_set_regions(args->sr, args->region, 0)<0 ) error("Failed to read the region: %s\n",args->region); -+ } -+ if ( !bcf_sr_add_reader(args->sr,args->fname) ) error("Error: %s\n", bcf_sr_strerror(args->sr->errnum)); -+ args->hdr = bcf_sr_get_header(args->sr,0); -+ -+ int id; -+ if ( (id=bcf_hdr_id2int(args->hdr, BCF_DT_ID, "PL"))<0 || !bcf_hdr_idinfo_exists(args->hdr,BCF_HL_FMT,id) ) -+ error("Error: the tag FORMAT/PL is not present in %s\n", args->fname); -+ if ( (id=bcf_hdr_id2int(args->hdr, BCF_DT_ID, "AD"))<0 || !bcf_hdr_idinfo_exists(args->hdr,BCF_HL_FMT,id) ) -+ error("Error: the tag FORMAT/AD is not present in %s\n", args->fname); -+ if ( (id=bcf_hdr_id2int(args->hdr, BCF_DT_ID, "GT"))<0 || !bcf_hdr_idinfo_exists(args->hdr,BCF_HL_FMT,id) ) -+ error("Error: the tag FORMAT/GT is not present in %s\n", args->fname); -+ -+ if ( args->filter_str ) -+ args->filter = filter_init(args->hdr, args->filter_str); -+ -+ int i, n = 0; -+ char **list; -+ list = hts_readlist(args->pfm, 0, &n); -+ if ( n!=3 ) error("Expected three sample names with -t\n"); -+ args->trio.idx[iCHILD] = bcf_hdr_id2int(args->hdr, BCF_DT_SAMPLE, list[0]); -+ args->trio.idx[iFATHER] = bcf_hdr_id2int(args->hdr, BCF_DT_SAMPLE, list[1]); -+ args->trio.idx[iMOTHER] = bcf_hdr_id2int(args->hdr, BCF_DT_SAMPLE, list[2]); -+ for (i=0; itrio.idx[i] < 0 ) error("The sample is not present: %s\n", 
list[i]); -+ free(list[i]); -+ } -+ free(list); -+} -+static void destroy_data(args_t *args) -+{ -+ if ( args->filter ) filter_destroy(args->filter); -+ free(args->pl); -+ free(args->ad); -+ free(args->gt); -+ bcf_sr_destroy(args->sr); -+ free(args); -+} -+static inline double calc_binom_two_sided(int na, int nb, double aprob) -+{ -+ double prob = na > nb ? 2 * kf_betai(na, nb+1, aprob) : 2 * kf_betai(nb, na+1, aprob); -+ if ( prob > 1 ) prob = 1; -+ return prob; -+} -+static inline double calc_binom_one_sided(int na, int nb, double aprob, int ge) -+{ -+ return ge ? kf_betai(na, nb + 1, aprob) : kf_betai(nb, na + 1, 1 - aprob); -+} -+static void process_record(args_t *args, bcf1_t *rec) -+{ -+ if ( rec->n_allele!=2 || bcf_get_variant_types(rec)!=VCF_SNP ) return; -+ -+ int i,j; -+ if ( args->filter ) -+ { -+ uint8_t *smpl_pass = NULL; -+ int pass_site = filter_test(args->filter, rec, (const uint8_t**) &smpl_pass); -+ if ( args->filter_logic & FLT_EXCLUDE ) -+ { -+ if ( pass_site ) -+ { -+ if ( !smpl_pass ) return; -+ pass_site = 0; -+ for (i=0; i<3; i++) -+ { -+ if ( smpl_pass[args->trio.idx[i]] ) smpl_pass[args->trio.idx[i]] = 0; -+ else { smpl_pass[args->trio.idx[i]] = 1; pass_site = 1; } -+ } -+ if ( !pass_site ) return; -+ } -+ else -+ for (i=0; i<3; i++) smpl_pass[args->trio.idx[i]] = 1; -+ } -+ else if ( !pass_site ) return; -+ -+ if ( smpl_pass ) -+ { -+ for (i=0; i<3; i++) -+ if ( !smpl_pass[args->trio.idx[i]] ) return; -+ } -+ } -+ -+ int nsmpl = bcf_hdr_nsamples(args->hdr); -+ int nret = bcf_get_format_int32(args->hdr,rec,"AD",&args->ad,&args->mad); -+ if ( nret<=0 ) -+ { -+ printf("The FORMAT/AD tag not present at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); -+ return; -+ } -+ int nad1 = nret/nsmpl; -+ -+ nret = bcf_get_format_int32(args->hdr,rec,"PL",&args->pl,&args->mpl); -+ if ( nret<=0 ) error("The FORMAT/PL tag not present at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); -+ int npl1 = nret/nsmpl; -+ if ( 
npl1!=rec->n_allele*(rec->n_allele+1)/2 ) -+ { -+ printf("todo: not a diploid site at %s:%"PRId64": %d alleles, %d PLs\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1,rec->n_allele,npl1); -+ return; -+ } -+ -+ nret = bcf_get_genotypes(args->hdr,rec,&args->gt,&args->mgt); -+ if ( nret<=0 ) error("The FORMAT/GT tag not present at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); -+ int ngt1 = nret/nsmpl; -+ if ( ngt1!=2 ) error("Todo: assuming diploid fields for now .. %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); -+ -+ // number of ref and alt alleles in the proband -+ int32_t ad[6], *adP = ad, *adF = ad+2, *adM = ad+4; -+ int32_t dsg[3], *dsgP = dsg, *dsgF = dsg+1, *dsgM = dsg+2; -+ double gl[9], *glP = gl, *glF = gl+3, *glM = gl+6; -+ for (i=0; i<3; i++) // trio -+ { -+ int isum = 0; -+ int32_t *src = args->pl + npl1*args->trio.idx[i]; -+ double *gl_dst = gl + 3*i; -+ double sum = 0; -+ for (j=0; j<3; j++) // iterate over PL -+ { -+ if ( src[j]==bcf_int32_missing || src[j]==bcf_int32_vector_end ) return; -+ gl_dst[j] = pow(10,-0.1*src[j]); -+ sum += gl_dst[j]; -+ isum += src[j]; -+ } -+ if ( isum==0 ) return; -+ for (j=0; j<3; j++) gl_dst[j] /= sum; -+ -+ int32_t *gt = args->gt + ngt1*args->trio.idx[i]; -+ dsg[i] = 0; -+ for (j=0; jad + nad1*args->trio.idx[i]; -+ ad[2*i] = src[0]; -+ ad[2*i+1] = src[1]; -+ } -+ -+ #define is_RR(x) (x[0]==0) -+ #define is_RA(x) (x[1]==0) -+ #define is_AA(x) (x[2]==0) -+ if ( args->cnv_type==CNV_DEL ) -+ { -+ if ( *dsgP!=0 && *dsgP!=2 ) return; // proband not a hom -+ if ( *dsgF == *dsgM ) return; // cannot distinguish between parents -+ if ( !args->greedy ) -+ { -+ if ( *dsgF==1 && *dsgP==*dsgM ) return; // both parents have the proband's allele -+ if ( *dsgM==1 && *dsgP==*dsgF ) return; -+ } -+ double pmat = glP[0]*(0.5*glM[0]*glF[0] + 2/3.*glM[0]*glF[1] + glM[0]*glF[2] + 1/3.*glM[1]*glF[0] + 0.5*glM[1]*glF[1] + glM[1]*glF[2]) + -+ glP[2]*(0.5*glM[2]*glF[2] + 2/3.*glM[2]*glF[1] + 
glM[2]*glF[0] + 1/3.*glM[1]*glF[2] + 0.5*glM[1]*glF[1] + glM[1]*glF[0]); -+ double ppat = glP[0]*(0.5*glM[0]*glF[0] + 2/3.*glM[1]*glF[0] + glM[2]*glF[0] + 1/3.*glM[0]*glF[1] + 0.5*glM[1]*glF[1] + glM[2]*glF[1]) + -+ glP[2]*(0.5*glM[2]*glF[2] + 2/3.*glM[1]*glF[2] + glM[0]*glF[2] + 1/3.*glM[2]*glF[1] + 0.5*glM[1]*glF[1] + glM[0]*glF[1]); -+ -+ // NB: pmat/ppat is the probability of parental origin of the observed, not the deleted allele; -+ // args->pmat/ppat is the probability of parental origin of the deleted allele -+ args->pmat += log(ppat); -+ args->ppat += log(pmat); -+ args->ntest++; -+ -+ if ( args->debug ) -+ { -+ // output: position, paternal probability, maternal probability, PLs of child, father, mother -+ printf("DBG\t%"PRId64"\t%e\t%e\t", (int64_t) rec->pos+1,ppat,pmat); -+ for (i=0; i<3; i++) -+ { -+ for (j=0; j<3; j++) printf(" %d",args->pl[npl1*args->trio.idx[i]+j]); -+ printf("\t"); -+ } -+ printf("\n"); -+ } -+ } -+ if ( args->cnv_type==CNV_DUP ) -+ { -+ if ( !adP[0] || !adP[1] ) return; // proband is homozygous or has no coverage -+ if ( adP[0] == adP[1] ) return; // proband's alleles are not informative, any or none could have been duplicated -+ if ( *dsgP!=1 ) return; // the proband's genotype is not a het -+ if ( *dsgF == *dsgM ) return; // cannot distinguish between parents -+ -+ if ( args->min_pbinom!=0 ) -+ { -+ // exclude parental hets with skewed ALT allele proportion -+ if ( *dsgF==1 && adF[0] && adF[1] && calc_binom_two_sided(adF[0],adF[1],0.5) < args->min_pbinom ) return; -+ if ( *dsgM==1 && adM[0] && adM[1] && calc_binom_two_sided(adM[0],adM[1],0.5) < args->min_pbinom ) return; -+ } -+ -+ double prra = glP[1] * calc_binom_one_sided(adP[1],adP[0],1/3.,1); -+ double praa = glP[1] * calc_binom_one_sided(adP[1],adP[0],2/3.,0); -+ double ppat = prra*(glM[1]*glF[0] + glM[2]*glF[0] + 0.5*glM[1]*glF[1] + glM[2]*glF[1]) + -+ praa*(glM[1]*glF[2] + glM[0]*glF[2] + 0.5*glM[1]*glF[1] + glM[0]*glF[1]); -+ double pmat = prra*(glM[0]*glF[1] + 
glM[0]*glF[2] + 0.5*glM[1]*glF[1] + glM[1]*glF[2]) + -+ praa*(glM[2]*glF[1] + glM[2]*glF[0] + 0.5*glM[1]*glF[1] + glM[1]*glF[0]); -+ args->pmat += log(pmat); -+ args->ppat += log(ppat); -+ args->ntest++; -+ -+ if ( args->debug ) -+ { -+ // output: position; paternal probability; maternal probability; ADs of child, father,mother; PLs of child, father, mother -+ printf("DBG\t%"PRId64"\t%e\t%e\t", (int64_t) rec->pos+1,ppat,pmat); -+ for (i=0; i<3; i++) -+ { -+ printf("%d %d\t",ad[2*i],ad[2*i+1]); -+ } -+ for (i=0; i<3; i++) -+ { -+ for (j=0; j<3; j++) printf(" %d",args->pl[npl1*args->trio.idx[i]+j]); -+ printf("\t"); -+ } -+ printf("\n"); -+ } -+ } -+} -+ -+int run(int argc, char **argv) -+{ -+ args_t *args = (args_t*) calloc(1,sizeof(args_t)); -+ args->argc = argc; args->argv = argv; -+ args->min_pbinom = 1e-2; -+ static struct option loptions[] = -+ { -+ {"include",required_argument,0,'i'}, -+ {"exclude",required_argument,0,'e'}, -+ {"pfm",required_argument,NULL,'p'}, -+ {"region",required_argument,0,'r'}, -+ {"type",required_argument,0,'t'}, -+ {"debug",no_argument,0,'d'}, -+ {"greedy",no_argument,0,'g'}, -+ {"min-binom-prob",required_argument,0,'b'}, -+ {NULL,0,NULL,0} -+ }; -+ int c; -+ char *tmp; -+ while ((c = getopt_long(argc, argv, "h?e:i:p:r:t:dgb:",loptions,NULL)) >= 0) -+ { -+ switch (c) -+ { -+ case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; -+ case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; -+ case 't': -+ if ( !strcasecmp("dup",optarg) ) args->cnv_type = CNV_DUP; -+ else if ( !strcasecmp("del",optarg) ) args->cnv_type = CNV_DEL; -+ break; -+ case 'r': args->region = optarg; break; -+ case 'p': args->pfm = optarg; break; -+ case 'd': args->debug = 1; break; -+ case 'g': args->greedy = 1; break; -+ case 'b': -+ args->min_pbinom = strtod(optarg,&tmp); -+ if ( *tmp ) error("Could not parse: -b %s\n", optarg); -+ if ( args->min_pbinom<0 || args->min_pbinom>1 ) error("Expected value from the 
interval [0,1] with --min-binom-prob\n"); -+ break; -+ case 'h': -+ case '?': -+ default: error("%s", usage_text()); break; -+ } -+ } -+ if ( optind==argc ) -+ { -+ if ( !isatty(fileno((FILE *)stdin)) ) args->fname = "-"; // reading from stdin -+ else { error("%s", usage_text()); } -+ } -+ else if ( optind+1!=argc ) error("%s", usage_text()); -+ else args->fname = argv[optind]; -+ -+ if ( !args->pfm ) error("Missing the -p option\n"); -+ -+ init_data(args); -+ if ( args->debug ) -+ { -+ if ( args->cnv_type==CNV_DEL ) printf("# DBG: position; paternal probability; maternal probability; PLs of child, father, mother\n"); -+ else printf("# DBG: position; paternal probability; maternal probability; ADs of child, father, mother; PLs of child, father, mother\n"); -+ } -+ -+ while ( bcf_sr_next_line(args->sr) ) -+ process_record(args, bcf_sr_get_line(args->sr,0)); -+ -+ double qual = 4.3429*fabs(args->ppat - args->pmat); -+ char *origin = "uncertain"; -+ if ( args->ppat > args->pmat ) origin = "paternal"; -+ else if ( args->ppat < args->pmat ) origin = "maternal"; -+ -+ int i; -+ printf("# bcftools +%s", args->argv[0]); -+ for (i=1; iargc; i++) printf(" %s",args->argv[i]); -+ printf("\n"); -+ printf("# [1]type\t[2]predicted_origin\t[3]quality\t[4]nmarkers\n"); -+ printf("%s\t%s\t%f\t%d\n", args->cnv_type==CNV_DUP ? "dup" : "del", origin, qual, args->ntest); -+ -+ destroy_data(args); -+ -+ return 0; -+} ---- /dev/null -+++ python-pysam/bcftools/plugins/parental-origin.c.pysam.c -@@ -0,0 +1,412 @@ -+#include "bcftools.pysam.h" -+ -+/* The MIT License -+ -+ Copyright (c) 2019 Genome Research Ltd. 
-+ -+ Author: Petr Danecek -+ -+ Permission is hereby granted, free of charge, to any person obtaining a copy -+ of this software and associated documentation files (the "Software"), to deal -+ in the Software without restriction, including without limitation the rights -+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -+ copies of the Software, and to permit persons to whom the Software is -+ furnished to do so, subject to the following conditions: -+ -+ The above copyright notice and this permission notice shall be included in -+ all copies or substantial portions of the Software. -+ -+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -+ THE SOFTWARE. -+ -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include // for isatty -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include "bcftools.h" -+#include "filter.h" -+ -+// Logic of the filters: include or exclude sites which match the filters? -+#define FLT_INCLUDE 1 -+#define FLT_EXCLUDE 2 -+ -+#define CNV_DEL 0 -+#define CNV_DUP 1 -+ -+#define iCHILD 0 -+#define iFATHER 1 -+#define iMOTHER 2 -+ -+typedef struct -+{ -+ int idx[3]; // VCF sample index for child, father, mother -+ int pass; // do all three pass the filters? 
-+} -+trio_t; -+ -+typedef struct -+{ -+ int argc, filter_logic, cnv_type, debug, greedy; -+ filter_t *filter; -+ char *filter_str; -+ char **argv, *pfm, *fname, *region; -+ bcf_srs_t *sr; -+ bcf_hdr_t *hdr; -+ trio_t trio; -+ int32_t *pl, *ad, *gt; // input FMT/PL, AD, and GT values -+ int mpl, mad, mgt; -+ double ppat,pmat; // method 1: probability of paternal/maternal origin -+ int ntest; // number of informative sites -+ int nmat, npat; // method 2: number of pat/mat sites based on simple ad[0] < ad[1] comparison -+ double min_pbinom; // minimum binomial probability of paternal hets -+} -+args_t; -+ -+args_t args; -+ -+const char *about(void) -+{ -+ return "Determine parental origin of a CNV region in a trio.\n"; -+} -+ -+static const char *usage_text(void) -+{ -+ return -+ "\n" -+ "About: Determine parental origin of a CNV region\n" -+ "Usage: bcftools +parental-origin [Plugin Options]\n" -+ "Plugin options:\n" -+ " -b, --min-binom-prob FLOAT exclude parental HETs with skewed ALT allele fraction [1e-2]\n" -+ " -d, --debug list informative sites\n" -+ " -e, --exclude EXPR exclude sites and samples for which the expression is true\n" -+ " -g, --greedy use also ambigous sites, e.g. 
het+hom parents for deletions\n" -+ " -i, --include EXPR include sites and samples for which the expression is true\n" -+ " -p, --pfm P,F,M sample names of proband, father, and mother\n" -+ " -r, --region REGION chr:beg-end\n" -+ " -t, --type the CNV type\n" -+ "\n" -+ "Example:\n" -+ " bcftools +parental-origin -p proband,father,mother -t dup -r 14:22671179-22947951 file.bcf\n" -+ "\n"; -+} -+ -+static void init_data(args_t *args) -+{ -+ args->sr = bcf_sr_init(); -+ if ( args->region ) -+ { -+ args->sr->require_index = 1; -+ if ( bcf_sr_set_regions(args->sr, args->region, 0)<0 ) error("Failed to read the region: %s\n",args->region); -+ } -+ if ( !bcf_sr_add_reader(args->sr,args->fname) ) error("Error: %s\n", bcf_sr_strerror(args->sr->errnum)); -+ args->hdr = bcf_sr_get_header(args->sr,0); -+ -+ int id; -+ if ( (id=bcf_hdr_id2int(args->hdr, BCF_DT_ID, "PL"))<0 || !bcf_hdr_idinfo_exists(args->hdr,BCF_HL_FMT,id) ) -+ error("Error: the tag FORMAT/PL is not present in %s\n", args->fname); -+ if ( (id=bcf_hdr_id2int(args->hdr, BCF_DT_ID, "AD"))<0 || !bcf_hdr_idinfo_exists(args->hdr,BCF_HL_FMT,id) ) -+ error("Error: the tag FORMAT/AD is not present in %s\n", args->fname); -+ if ( (id=bcf_hdr_id2int(args->hdr, BCF_DT_ID, "GT"))<0 || !bcf_hdr_idinfo_exists(args->hdr,BCF_HL_FMT,id) ) -+ error("Error: the tag FORMAT/GT is not present in %s\n", args->fname); -+ -+ if ( args->filter_str ) -+ args->filter = filter_init(args->hdr, args->filter_str); -+ -+ int i, n = 0; -+ char **list; -+ list = hts_readlist(args->pfm, 0, &n); -+ if ( n!=3 ) error("Expected three sample names with -t\n"); -+ args->trio.idx[iCHILD] = bcf_hdr_id2int(args->hdr, BCF_DT_SAMPLE, list[0]); -+ args->trio.idx[iFATHER] = bcf_hdr_id2int(args->hdr, BCF_DT_SAMPLE, list[1]); -+ args->trio.idx[iMOTHER] = bcf_hdr_id2int(args->hdr, BCF_DT_SAMPLE, list[2]); -+ for (i=0; itrio.idx[i] < 0 ) error("The sample is not present: %s\n", list[i]); -+ free(list[i]); -+ } -+ free(list); -+} -+static void destroy_data(args_t 
*args) -+{ -+ if ( args->filter ) filter_destroy(args->filter); -+ free(args->pl); -+ free(args->ad); -+ free(args->gt); -+ bcf_sr_destroy(args->sr); -+ free(args); -+} -+static inline double calc_binom_two_sided(int na, int nb, double aprob) -+{ -+ double prob = na > nb ? 2 * kf_betai(na, nb+1, aprob) : 2 * kf_betai(nb, na+1, aprob); -+ if ( prob > 1 ) prob = 1; -+ return prob; -+} -+static inline double calc_binom_one_sided(int na, int nb, double aprob, int ge) -+{ -+ return ge ? kf_betai(na, nb + 1, aprob) : kf_betai(nb, na + 1, 1 - aprob); -+} -+static void process_record(args_t *args, bcf1_t *rec) -+{ -+ if ( rec->n_allele!=2 || bcf_get_variant_types(rec)!=VCF_SNP ) return; -+ -+ int i,j; -+ if ( args->filter ) -+ { -+ uint8_t *smpl_pass = NULL; -+ int pass_site = filter_test(args->filter, rec, (const uint8_t**) &smpl_pass); -+ if ( args->filter_logic & FLT_EXCLUDE ) -+ { -+ if ( pass_site ) -+ { -+ if ( !smpl_pass ) return; -+ pass_site = 0; -+ for (i=0; i<3; i++) -+ { -+ if ( smpl_pass[args->trio.idx[i]] ) smpl_pass[args->trio.idx[i]] = 0; -+ else { smpl_pass[args->trio.idx[i]] = 1; pass_site = 1; } -+ } -+ if ( !pass_site ) return; -+ } -+ else -+ for (i=0; i<3; i++) smpl_pass[args->trio.idx[i]] = 1; -+ } -+ else if ( !pass_site ) return; -+ -+ if ( smpl_pass ) -+ { -+ for (i=0; i<3; i++) -+ if ( !smpl_pass[args->trio.idx[i]] ) return; -+ } -+ } -+ -+ int nsmpl = bcf_hdr_nsamples(args->hdr); -+ int nret = bcf_get_format_int32(args->hdr,rec,"AD",&args->ad,&args->mad); -+ if ( nret<=0 ) -+ { -+ fprintf(bcftools_stdout, "The FORMAT/AD tag not present at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); -+ return; -+ } -+ int nad1 = nret/nsmpl; -+ -+ nret = bcf_get_format_int32(args->hdr,rec,"PL",&args->pl,&args->mpl); -+ if ( nret<=0 ) error("The FORMAT/PL tag not present at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); -+ int npl1 = nret/nsmpl; -+ if ( npl1!=rec->n_allele*(rec->n_allele+1)/2 ) -+ { -+ 
fprintf(bcftools_stdout, "todo: not a diploid site at %s:%"PRId64": %d alleles, %d PLs\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1,rec->n_allele,npl1); -+ return; -+ } -+ -+ nret = bcf_get_genotypes(args->hdr,rec,&args->gt,&args->mgt); -+ if ( nret<=0 ) error("The FORMAT/GT tag not present at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); -+ int ngt1 = nret/nsmpl; -+ if ( ngt1!=2 ) error("Todo: assuming diploid fields for now .. %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); -+ -+ // number of ref and alt alleles in the proband -+ int32_t ad[6], *adP = ad, *adF = ad+2, *adM = ad+4; -+ int32_t dsg[3], *dsgP = dsg, *dsgF = dsg+1, *dsgM = dsg+2; -+ double gl[9], *glP = gl, *glF = gl+3, *glM = gl+6; -+ for (i=0; i<3; i++) // trio -+ { -+ int isum = 0; -+ int32_t *src = args->pl + npl1*args->trio.idx[i]; -+ double *gl_dst = gl + 3*i; -+ double sum = 0; -+ for (j=0; j<3; j++) // iterate over PL -+ { -+ if ( src[j]==bcf_int32_missing || src[j]==bcf_int32_vector_end ) return; -+ gl_dst[j] = pow(10,-0.1*src[j]); -+ sum += gl_dst[j]; -+ isum += src[j]; -+ } -+ if ( isum==0 ) return; -+ for (j=0; j<3; j++) gl_dst[j] /= sum; -+ -+ int32_t *gt = args->gt + ngt1*args->trio.idx[i]; -+ dsg[i] = 0; -+ for (j=0; jad + nad1*args->trio.idx[i]; -+ ad[2*i] = src[0]; -+ ad[2*i+1] = src[1]; -+ } -+ -+ #define is_RR(x) (x[0]==0) -+ #define is_RA(x) (x[1]==0) -+ #define is_AA(x) (x[2]==0) -+ if ( args->cnv_type==CNV_DEL ) -+ { -+ if ( *dsgP!=0 && *dsgP!=2 ) return; // proband not a hom -+ if ( *dsgF == *dsgM ) return; // cannot distinguish between parents -+ if ( !args->greedy ) -+ { -+ if ( *dsgF==1 && *dsgP==*dsgM ) return; // both parents have the proband's allele -+ if ( *dsgM==1 && *dsgP==*dsgF ) return; -+ } -+ double pmat = glP[0]*(0.5*glM[0]*glF[0] + 2/3.*glM[0]*glF[1] + glM[0]*glF[2] + 1/3.*glM[1]*glF[0] + 0.5*glM[1]*glF[1] + glM[1]*glF[2]) + -+ glP[2]*(0.5*glM[2]*glF[2] + 2/3.*glM[2]*glF[1] + glM[2]*glF[0] + 1/3.*glM[1]*glF[2] + 
0.5*glM[1]*glF[1] + glM[1]*glF[0]); -+ double ppat = glP[0]*(0.5*glM[0]*glF[0] + 2/3.*glM[1]*glF[0] + glM[2]*glF[0] + 1/3.*glM[0]*glF[1] + 0.5*glM[1]*glF[1] + glM[2]*glF[1]) + -+ glP[2]*(0.5*glM[2]*glF[2] + 2/3.*glM[1]*glF[2] + glM[0]*glF[2] + 1/3.*glM[2]*glF[1] + 0.5*glM[1]*glF[1] + glM[0]*glF[1]); -+ -+ // NB: pmat/ppat is the probability of parental origin of the observed, not the deleted allele; -+ // args->pmat/ppat is the probability of parental origin of the deleted allele -+ args->pmat += log(ppat); -+ args->ppat += log(pmat); -+ args->ntest++; -+ -+ if ( args->debug ) -+ { -+ // output: position, paternal probability, maternal probability, PLs of child, father, mother -+ fprintf(bcftools_stdout, "DBG\t%"PRId64"\t%e\t%e\t", (int64_t) rec->pos+1,ppat,pmat); -+ for (i=0; i<3; i++) -+ { -+ for (j=0; j<3; j++) fprintf(bcftools_stdout, " %d",args->pl[npl1*args->trio.idx[i]+j]); -+ fprintf(bcftools_stdout, "\t"); -+ } -+ fprintf(bcftools_stdout, "\n"); -+ } -+ } -+ if ( args->cnv_type==CNV_DUP ) -+ { -+ if ( !adP[0] || !adP[1] ) return; // proband is homozygous or has no coverage -+ if ( adP[0] == adP[1] ) return; // proband's alleles are not informative, any or none could have been duplicated -+ if ( *dsgP!=1 ) return; // the proband's genotype is not a het -+ if ( *dsgF == *dsgM ) return; // cannot distinguish between parents -+ -+ if ( args->min_pbinom!=0 ) -+ { -+ // exclude parental hets with skewed ALT allele proportion -+ if ( *dsgF==1 && adF[0] && adF[1] && calc_binom_two_sided(adF[0],adF[1],0.5) < args->min_pbinom ) return; -+ if ( *dsgM==1 && adM[0] && adM[1] && calc_binom_two_sided(adM[0],adM[1],0.5) < args->min_pbinom ) return; -+ } -+ -+ double prra = glP[1] * calc_binom_one_sided(adP[1],adP[0],1/3.,1); -+ double praa = glP[1] * calc_binom_one_sided(adP[1],adP[0],2/3.,0); -+ double ppat = prra*(glM[1]*glF[0] + glM[2]*glF[0] + 0.5*glM[1]*glF[1] + glM[2]*glF[1]) + -+ praa*(glM[1]*glF[2] + glM[0]*glF[2] + 0.5*glM[1]*glF[1] + glM[0]*glF[1]); -+ double 
pmat = prra*(glM[0]*glF[1] + glM[0]*glF[2] + 0.5*glM[1]*glF[1] + glM[1]*glF[2]) + -+ praa*(glM[2]*glF[1] + glM[2]*glF[0] + 0.5*glM[1]*glF[1] + glM[1]*glF[0]); -+ args->pmat += log(pmat); -+ args->ppat += log(ppat); -+ args->ntest++; -+ -+ if ( args->debug ) -+ { -+ // output: position; paternal probability; maternal probability; ADs of child, father,mother; PLs of child, father, mother -+ fprintf(bcftools_stdout, "DBG\t%"PRId64"\t%e\t%e\t", (int64_t) rec->pos+1,ppat,pmat); -+ for (i=0; i<3; i++) -+ { -+ fprintf(bcftools_stdout, "%d %d\t",ad[2*i],ad[2*i+1]); -+ } -+ for (i=0; i<3; i++) -+ { -+ for (j=0; j<3; j++) fprintf(bcftools_stdout, " %d",args->pl[npl1*args->trio.idx[i]+j]); -+ fprintf(bcftools_stdout, "\t"); -+ } -+ fprintf(bcftools_stdout, "\n"); -+ } -+ } -+} -+ -+int run(int argc, char **argv) -+{ -+ args_t *args = (args_t*) calloc(1,sizeof(args_t)); -+ args->argc = argc; args->argv = argv; -+ args->min_pbinom = 1e-2; -+ static struct option loptions[] = -+ { -+ {"include",required_argument,0,'i'}, -+ {"exclude",required_argument,0,'e'}, -+ {"pfm",required_argument,NULL,'p'}, -+ {"region",required_argument,0,'r'}, -+ {"type",required_argument,0,'t'}, -+ {"debug",no_argument,0,'d'}, -+ {"greedy",no_argument,0,'g'}, -+ {"min-binom-prob",required_argument,0,'b'}, -+ {NULL,0,NULL,0} -+ }; -+ int c; -+ char *tmp; -+ while ((c = getopt_long(argc, argv, "h?e:i:p:r:t:dgb:",loptions,NULL)) >= 0) -+ { -+ switch (c) -+ { -+ case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; -+ case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; -+ case 't': -+ if ( !strcasecmp("dup",optarg) ) args->cnv_type = CNV_DUP; -+ else if ( !strcasecmp("del",optarg) ) args->cnv_type = CNV_DEL; -+ break; -+ case 'r': args->region = optarg; break; -+ case 'p': args->pfm = optarg; break; -+ case 'd': args->debug = 1; break; -+ case 'g': args->greedy = 1; break; -+ case 'b': -+ args->min_pbinom = strtod(optarg,&tmp); -+ if ( *tmp ) error("Could 
not parse: -b %s\n", optarg); -+ if ( args->min_pbinom<0 || args->min_pbinom>1 ) error("Expected value from the interval [0,1] with --min-binom-prob\n"); -+ break; -+ case 'h': -+ case '?': -+ default: error("%s", usage_text()); break; -+ } -+ } -+ if ( optind==argc ) -+ { -+ if ( !isatty(fileno((FILE *)stdin)) ) args->fname = "-"; // reading from stdin -+ else { error("%s", usage_text()); } -+ } -+ else if ( optind+1!=argc ) error("%s", usage_text()); -+ else args->fname = argv[optind]; -+ -+ if ( !args->pfm ) error("Missing the -p option\n"); -+ -+ init_data(args); -+ if ( args->debug ) -+ { -+ if ( args->cnv_type==CNV_DEL ) fprintf(bcftools_stdout, "# DBG: position; paternal probability; maternal probability; PLs of child, father, mother\n"); -+ else fprintf(bcftools_stdout, "# DBG: position; paternal probability; maternal probability; ADs of child, father, mother; PLs of child, father, mother\n"); -+ } -+ -+ while ( bcf_sr_next_line(args->sr) ) -+ process_record(args, bcf_sr_get_line(args->sr,0)); -+ -+ double qual = 4.3429*fabs(args->ppat - args->pmat); -+ char *origin = "uncertain"; -+ if ( args->ppat > args->pmat ) origin = "paternal"; -+ else if ( args->ppat < args->pmat ) origin = "maternal"; -+ -+ int i; -+ fprintf(bcftools_stdout, "# bcftools +%s", args->argv[0]); -+ for (i=1; iargc; i++) fprintf(bcftools_stdout, " %s",args->argv[i]); -+ fprintf(bcftools_stdout, "\n"); -+ fprintf(bcftools_stdout, "# [1]type\t[2]predicted_origin\t[3]quality\t[4]nmarkers\n"); -+ fprintf(bcftools_stdout, "%s\t%s\t%f\t%d\n", args->cnv_type==CNV_DUP ? 
"dup" : "del", origin, qual, args->ntest); -+ -+ destroy_data(args); -+ -+ return 0; -+} ---- python-pysam.orig/bcftools/plugins/prune.c -+++ python-pysam/bcftools/plugins/prune.c -@@ -129,7 +129,7 @@ - bcf_hdr_printf(args->hdr,"##INFO=%e upstream\">",args->info_pos,args->max_ld); - bcf_hdr_printf(args->hdr,"##INFO=%e upstream\">",args->info_r2,args->max_ld); - } -- bcf_hdr_write(args->out_fh, args->hdr); -+ if ( bcf_hdr_write(args->out_fh, args->hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); - if ( args->filter_r2 ) - args->filter_r2_id = bcf_hdr_id2int(args->hdr, BCF_DT_ID, args->filter_r2); - -@@ -147,7 +147,7 @@ - { - if ( args->filter ) - filter_destroy(args->filter); -- hts_close(args->out_fh); -+ if ( hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname); - vcfbuf_destroy(args->vcfbuf); - bcf_sr_destroy(args->sr); - free(args->info_pos); -@@ -158,7 +158,7 @@ - { - bcf1_t *rec; - while ( (rec = vcfbuf_flush(args->vcfbuf, flush_all)) ) -- bcf_write1(args->out_fh, args->hdr, rec); -+ if ( bcf_write1(args->out_fh, args->hdr, rec)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); - } - static void process(args_t *args) - { -@@ -251,9 +251,9 @@ - else if ( !strcasecmp("kb",tmp) ) args->ld_win *= -1000; - else error("Could not parse: --window %s\n", optarg); - break; -- case 'T': args->target_is_file = 1; -+ case 'T': args->target_is_file = 1; // fall-through - case 't': args->target = optarg; break; -- case 'R': args->region_is_file = 1; -+ case 'R': args->region_is_file = 1; // fall-through - case 'r': args->region = optarg; break; - case 'o': args->output_fname = optarg; break; - case 'O': ---- python-pysam.orig/bcftools/plugins/prune.c.pysam.c -+++ python-pysam/bcftools/plugins/prune.c.pysam.c -@@ -131,7 +131,7 @@ - bcf_hdr_printf(args->hdr,"##INFO=%e upstream\">",args->info_pos,args->max_ld); - bcf_hdr_printf(args->hdr,"##INFO=%e 
upstream\">",args->info_r2,args->max_ld); - } -- bcf_hdr_write(args->out_fh, args->hdr); -+ if ( bcf_hdr_write(args->out_fh, args->hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); - if ( args->filter_r2 ) - args->filter_r2_id = bcf_hdr_id2int(args->hdr, BCF_DT_ID, args->filter_r2); - -@@ -149,7 +149,7 @@ - { - if ( args->filter ) - filter_destroy(args->filter); -- hts_close(args->out_fh); -+ if ( hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname); - vcfbuf_destroy(args->vcfbuf); - bcf_sr_destroy(args->sr); - free(args->info_pos); -@@ -160,7 +160,7 @@ - { - bcf1_t *rec; - while ( (rec = vcfbuf_flush(args->vcfbuf, flush_all)) ) -- bcf_write1(args->out_fh, args->hdr, rec); -+ if ( bcf_write1(args->out_fh, args->hdr, rec)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); - } - static void process(args_t *args) - { -@@ -253,9 +253,9 @@ - else if ( !strcasecmp("kb",tmp) ) args->ld_win *= -1000; - else error("Could not parse: --window %s\n", optarg); - break; -- case 'T': args->target_is_file = 1; -+ case 'T': args->target_is_file = 1; // fall-through - case 't': args->target = optarg; break; -- case 'R': args->region_is_file = 1; -+ case 'R': args->region_is_file = 1; // fall-through - case 'r': args->region = optarg; break; - case 'o': args->output_fname = optarg; break; - case 'O': ---- /dev/null -+++ python-pysam/bcftools/plugins/remove-overlaps.c -@@ -0,0 +1,219 @@ -+/* -+ Copyright (C) 2017-2019 Genome Research Ltd. 
-+ -+ Author: Petr Danecek -+ -+ Permission is hereby granted, free of charge, to any person obtaining a copy -+ of this software and associated documentation files (the "Software"), to deal -+ in the Software without restriction, including without limitation the rights -+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -+ copies of the Software, and to permit persons to whom the Software is -+ furnished to do so, subject to the following conditions: -+ -+ The above copyright notice and this permission notice shall be included in -+ all copies or substantial portions of the Software. -+ -+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -+ THE SOFTWARE. 
-+*/ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include "bcftools.h" -+#include "vcfbuf.h" -+#include "filter.h" -+ -+#define FLT_INCLUDE 1 -+#define FLT_EXCLUDE 2 -+ -+typedef struct -+{ -+ filter_t *filter; -+ char *filter_str; -+ int filter_logic; // one of FLT_INCLUDE/FLT_EXCLUDE (-i or -e) -+ vcfbuf_t *vcfbuf; -+ int argc, region_is_file, target_is_file, output_type, verbose, nrm, ntot, print_overlaps, rmdup; -+ char **argv, *region, *target, *fname, *output_fname; -+ htsFile *out_fh; -+ bcf_hdr_t *hdr; -+ bcf_srs_t *sr; -+} -+args_t; -+ -+const char *about(void) -+{ -+ return "Remove overlapping variants\n"; -+} -+ -+static const char *usage_text(void) -+{ -+ return -+ "\n" -+ "About: Remove overlapping variants.\n" -+ "\n" -+ "Usage: bcftools +remove-overlaps [Options]\n" -+ "Plugin options:\n" -+ " -d, --rm-dup remove only duplicate sites and remove them completely\n" -+ " -p, --print-overlaps do the opposite and print only overlapping sites\n" -+ " -v, --verbose print a list of removed sites\n" -+ "Standard options:\n" -+ " -e, --exclude EXPR exclude sites for which the expression is true\n" -+ " -i, --include EXPR include only sites for which the expression is true\n" -+ " -o, --output FILE write output to the FILE [standard output]\n" -+ " -O, --output-type b|u|z|v b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n" -+ " -r, --regions REGION restrict to comma-separated list of regions\n" -+ " -R, --regions-file FILE restrict to regions listed in a file\n" -+ " -t, --targets REGION similar to -r but streams rather than index-jumps\n" -+ " -T, --targets-file FILE similar to -R but streams rather than index-jumps\n" -+ "\n"; -+} -+ -+static void init_data(args_t *args) -+{ -+ args->sr = bcf_sr_init(); -+ if ( args->region ) -+ { -+ args->sr->require_index = 1; -+ if ( bcf_sr_set_regions(args->sr, args->region, 
args->region_is_file)<0 ) error("Failed to read the regions: %s\n",args->region); -+ } -+ if ( args->target && bcf_sr_set_targets(args->sr, args->target, args->target_is_file, 0)<0 ) error("Failed to read the targets: %s\n",args->target); -+ if ( !bcf_sr_add_reader(args->sr,args->fname) ) error("Error: %s\n", bcf_sr_strerror(args->sr->errnum)); -+ args->hdr = bcf_sr_get_header(args->sr,0); -+ -+ args->out_fh = hts_open(args->output_fname,hts_bcf_wmode(args->output_type)); -+ if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno)); -+ if ( bcf_hdr_write(args->out_fh, args->hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); -+ -+ args->vcfbuf = vcfbuf_init(args->hdr, 0); -+ if ( args->rmdup ) -+ vcfbuf_set_opt(args->vcfbuf,int,VCFBUF_RMDUP,1) -+ else -+ vcfbuf_set_opt(args->vcfbuf,int,VCFBUF_OVERLAP_WIN,1) -+ -+ if ( args->filter_str ) -+ args->filter = filter_init(args->hdr, args->filter_str); -+} -+static void destroy_data(args_t *args) -+{ -+ if ( args->filter ) -+ filter_destroy(args->filter); -+ if ( hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. 
%s\n", __func__,args->output_fname); -+ vcfbuf_destroy(args->vcfbuf); -+ bcf_sr_destroy(args->sr); -+ free(args); -+} -+static void flush(args_t *args, int flush_all) -+{ -+ int nbuf = vcfbuf_nsites(args->vcfbuf); -+ bcf1_t *rec; -+ while ( (rec = vcfbuf_flush(args->vcfbuf, flush_all)) ) -+ { -+ if ( nbuf>2 || (nbuf>1 && flush_all) ) -+ { -+ args->nrm++; -+ if ( args->verbose ) printf("%s\t%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); -+ if ( args->print_overlaps && bcf_write1(args->out_fh, args->hdr, rec)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); -+ continue; // skip overlapping variants -+ } -+ if ( !args->print_overlaps && bcf_write1(args->out_fh, args->hdr, rec)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); -+ } -+} -+static void process(args_t *args) -+{ -+ args->ntot++; -+ bcf1_t *rec = bcf_sr_get_line(args->sr,0); -+ if ( args->filter ) -+ { -+ int ret = filter_test(args->filter, rec, NULL); -+ if ( args->filter_logic==FLT_INCLUDE ) { if ( !ret ) return; } -+ else if ( ret ) return; -+ } -+ bcf_sr_t *sr = bcf_sr_get_reader(args->sr, 0); -+ sr->buffer[0] = vcfbuf_push(args->vcfbuf, rec, 1); -+ flush(args,0); -+} -+ -+int run(int argc, char **argv) -+{ -+ args_t *args = (args_t*) calloc(1,sizeof(args_t)); -+ args->argc = argc; args->argv = argv; -+ args->output_type = FT_VCF; -+ args->output_fname = "-"; -+ static struct option loptions[] = -+ { -+ {"rm-dup",no_argument,NULL,'d'}, -+ {"print-overlaps",no_argument,NULL,'p'}, -+ {"exclude",required_argument,NULL,'e'}, -+ {"include",required_argument,NULL,'i'}, -+ {"regions",required_argument,NULL,'r'}, -+ {"regions-file",required_argument,NULL,'R'}, -+ {"output",required_argument,NULL,'o'}, -+ {"output-type",required_argument,NULL,'O'}, -+ {"verbose",no_argument,NULL,'v'}, -+ {NULL,0,NULL,0} -+ }; -+ int c; -+ while ((c = getopt_long(argc, argv, "r:R:t:T:o:O:i:e:vpd",loptions,NULL)) >= 0) -+ { -+ switch (c) -+ { -+ case 'd': 
args->rmdup = 1; break; -+ case 'p': args->print_overlaps = 1; break; -+ case 'v': args->verbose = 1; break; -+ case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; -+ case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; -+ case 'T': args->target_is_file = 1; // fall-through -+ case 't': args->target = optarg; break; -+ case 'R': args->region_is_file = 1; // fall-through -+ case 'r': args->region = optarg; break; -+ case 'o': args->output_fname = optarg; break; -+ case 'O': -+ switch (optarg[0]) { -+ case 'b': args->output_type = FT_BCF_GZ; break; -+ case 'u': args->output_type = FT_BCF; break; -+ case 'z': args->output_type = FT_VCF_GZ; break; -+ case 'v': args->output_type = FT_VCF; break; -+ default: error("The output type \"%s\" not recognised\n", optarg); -+ } -+ break; -+ case 'h': -+ case '?': -+ default: error("%s", usage_text()); break; -+ } -+ } -+ if ( args->filter_logic == (FLT_EXCLUDE|FLT_INCLUDE) ) error("Only one of -i or -e can be given.\n"); -+ if ( optind==argc ) -+ { -+ if ( !isatty(fileno((FILE *)stdin)) ) args->fname = "-"; // reading from stdin -+ else { error("%s",usage_text()); } -+ } -+ else if ( optind+1!=argc ) error("%s",usage_text()); -+ else args->fname = argv[optind]; -+ -+ init_data(args); -+ -+ while ( bcf_sr_next_line(args->sr) ) process(args); -+ flush(args,1); -+ -+ fprintf(stderr,"Processed/Removed\t%d\t%d\n",args->ntot,args->nrm); -+ -+ destroy_data(args); -+ return 0; -+} -+ -+ ---- /dev/null -+++ python-pysam/bcftools/plugins/remove-overlaps.c.pysam.c -@@ -0,0 +1,221 @@ -+#include "bcftools.pysam.h" -+ -+/* -+ Copyright (C) 2017-2019 Genome Research Ltd. 
-+ -+ Author: Petr Danecek -+ -+ Permission is hereby granted, free of charge, to any person obtaining a copy -+ of this software and associated documentation files (the "Software"), to deal -+ in the Software without restriction, including without limitation the rights -+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -+ copies of the Software, and to permit persons to whom the Software is -+ furnished to do so, subject to the following conditions: -+ -+ The above copyright notice and this permission notice shall be included in -+ all copies or substantial portions of the Software. -+ -+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -+ THE SOFTWARE. 
-+*/ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include "bcftools.h" -+#include "vcfbuf.h" -+#include "filter.h" -+ -+#define FLT_INCLUDE 1 -+#define FLT_EXCLUDE 2 -+ -+typedef struct -+{ -+ filter_t *filter; -+ char *filter_str; -+ int filter_logic; // one of FLT_INCLUDE/FLT_EXCLUDE (-i or -e) -+ vcfbuf_t *vcfbuf; -+ int argc, region_is_file, target_is_file, output_type, verbose, nrm, ntot, print_overlaps, rmdup; -+ char **argv, *region, *target, *fname, *output_fname; -+ htsFile *out_fh; -+ bcf_hdr_t *hdr; -+ bcf_srs_t *sr; -+} -+args_t; -+ -+const char *about(void) -+{ -+ return "Remove overlapping variants\n"; -+} -+ -+static const char *usage_text(void) -+{ -+ return -+ "\n" -+ "About: Remove overlapping variants.\n" -+ "\n" -+ "Usage: bcftools +remove-overlaps [Options]\n" -+ "Plugin options:\n" -+ " -d, --rm-dup remove only duplicate sites and remove them completely\n" -+ " -p, --print-overlaps do the opposite and print only overlapping sites\n" -+ " -v, --verbose print a list of removed sites\n" -+ "Standard options:\n" -+ " -e, --exclude EXPR exclude sites for which the expression is true\n" -+ " -i, --include EXPR include only sites for which the expression is true\n" -+ " -o, --output FILE write output to the FILE [standard output]\n" -+ " -O, --output-type b|u|z|v b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n" -+ " -r, --regions REGION restrict to comma-separated list of regions\n" -+ " -R, --regions-file FILE restrict to regions listed in a file\n" -+ " -t, --targets REGION similar to -r but streams rather than index-jumps\n" -+ " -T, --targets-file FILE similar to -R but streams rather than index-jumps\n" -+ "\n"; -+} -+ -+static void init_data(args_t *args) -+{ -+ args->sr = bcf_sr_init(); -+ if ( args->region ) -+ { -+ args->sr->require_index = 1; -+ if ( bcf_sr_set_regions(args->sr, args->region, 
args->region_is_file)<0 ) error("Failed to read the regions: %s\n",args->region); -+ } -+ if ( args->target && bcf_sr_set_targets(args->sr, args->target, args->target_is_file, 0)<0 ) error("Failed to read the targets: %s\n",args->target); -+ if ( !bcf_sr_add_reader(args->sr,args->fname) ) error("Error: %s\n", bcf_sr_strerror(args->sr->errnum)); -+ args->hdr = bcf_sr_get_header(args->sr,0); -+ -+ args->out_fh = hts_open(args->output_fname,hts_bcf_wmode(args->output_type)); -+ if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno)); -+ if ( bcf_hdr_write(args->out_fh, args->hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); -+ -+ args->vcfbuf = vcfbuf_init(args->hdr, 0); -+ if ( args->rmdup ) -+ vcfbuf_set_opt(args->vcfbuf,int,VCFBUF_RMDUP,1) -+ else -+ vcfbuf_set_opt(args->vcfbuf,int,VCFBUF_OVERLAP_WIN,1) -+ -+ if ( args->filter_str ) -+ args->filter = filter_init(args->hdr, args->filter_str); -+} -+static void destroy_data(args_t *args) -+{ -+ if ( args->filter ) -+ filter_destroy(args->filter); -+ if ( hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. 
%s\n", __func__,args->output_fname); -+ vcfbuf_destroy(args->vcfbuf); -+ bcf_sr_destroy(args->sr); -+ free(args); -+} -+static void flush(args_t *args, int flush_all) -+{ -+ int nbuf = vcfbuf_nsites(args->vcfbuf); -+ bcf1_t *rec; -+ while ( (rec = vcfbuf_flush(args->vcfbuf, flush_all)) ) -+ { -+ if ( nbuf>2 || (nbuf>1 && flush_all) ) -+ { -+ args->nrm++; -+ if ( args->verbose ) fprintf(bcftools_stdout, "%s\t%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); -+ if ( args->print_overlaps && bcf_write1(args->out_fh, args->hdr, rec)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); -+ continue; // skip overlapping variants -+ } -+ if ( !args->print_overlaps && bcf_write1(args->out_fh, args->hdr, rec)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); -+ } -+} -+static void process(args_t *args) -+{ -+ args->ntot++; -+ bcf1_t *rec = bcf_sr_get_line(args->sr,0); -+ if ( args->filter ) -+ { -+ int ret = filter_test(args->filter, rec, NULL); -+ if ( args->filter_logic==FLT_INCLUDE ) { if ( !ret ) return; } -+ else if ( ret ) return; -+ } -+ bcf_sr_t *sr = bcf_sr_get_reader(args->sr, 0); -+ sr->buffer[0] = vcfbuf_push(args->vcfbuf, rec, 1); -+ flush(args,0); -+} -+ -+int run(int argc, char **argv) -+{ -+ args_t *args = (args_t*) calloc(1,sizeof(args_t)); -+ args->argc = argc; args->argv = argv; -+ args->output_type = FT_VCF; -+ args->output_fname = "-"; -+ static struct option loptions[] = -+ { -+ {"rm-dup",no_argument,NULL,'d'}, -+ {"print-overlaps",no_argument,NULL,'p'}, -+ {"exclude",required_argument,NULL,'e'}, -+ {"include",required_argument,NULL,'i'}, -+ {"regions",required_argument,NULL,'r'}, -+ {"regions-file",required_argument,NULL,'R'}, -+ {"output",required_argument,NULL,'o'}, -+ {"output-type",required_argument,NULL,'O'}, -+ {"verbose",no_argument,NULL,'v'}, -+ {NULL,0,NULL,0} -+ }; -+ int c; -+ while ((c = getopt_long(argc, argv, "r:R:t:T:o:O:i:e:vpd",loptions,NULL)) >= 0) -+ { -+ switch (c) -+ { 
-+ case 'd': args->rmdup = 1; break; -+ case 'p': args->print_overlaps = 1; break; -+ case 'v': args->verbose = 1; break; -+ case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; -+ case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; -+ case 'T': args->target_is_file = 1; // fall-through -+ case 't': args->target = optarg; break; -+ case 'R': args->region_is_file = 1; // fall-through -+ case 'r': args->region = optarg; break; -+ case 'o': args->output_fname = optarg; break; -+ case 'O': -+ switch (optarg[0]) { -+ case 'b': args->output_type = FT_BCF_GZ; break; -+ case 'u': args->output_type = FT_BCF; break; -+ case 'z': args->output_type = FT_VCF_GZ; break; -+ case 'v': args->output_type = FT_VCF; break; -+ default: error("The output type \"%s\" not recognised\n", optarg); -+ } -+ break; -+ case 'h': -+ case '?': -+ default: error("%s", usage_text()); break; -+ } -+ } -+ if ( args->filter_logic == (FLT_EXCLUDE|FLT_INCLUDE) ) error("Only one of -i or -e can be given.\n"); -+ if ( optind==argc ) -+ { -+ if ( !isatty(fileno((FILE *)stdin)) ) args->fname = "-"; // reading from stdin -+ else { error("%s",usage_text()); } -+ } -+ else if ( optind+1!=argc ) error("%s",usage_text()); -+ else args->fname = argv[optind]; -+ -+ init_data(args); -+ -+ while ( bcf_sr_next_line(args->sr) ) process(args); -+ flush(args,1); -+ -+ fprintf(bcftools_stderr,"Processed/Removed\t%d\t%d\n",args->ntot,args->nrm); -+ -+ destroy_data(args); -+ return 0; -+} -+ -+ ---- python-pysam.orig/bcftools/plugins/setGT.c -+++ python-pysam/bcftools/plugins/setGT.c -@@ -320,7 +320,7 @@ - hts_expand(int,rec->n_allele,args->marr,args->arr); - int ret = bcf_calc_ac(args->in_hdr,rec,args->arr,BCF_UN_FMT); - if ( ret<= 0 ) -- error("Could not calculate allele count at %s:%d\n", bcf_seqname(args->in_hdr,rec),rec->pos+1); -+ error("Could not calculate allele count at %s:%"PRId64"\n", bcf_seqname(args->in_hdr,rec),(int64_t) rec->pos+1); - - for(i=0; i < 
rec->n_allele; ++i) - { -@@ -353,8 +353,8 @@ - int ia = bcf_gt_allele(ptr[0]); - int ib = bcf_gt_allele(ptr[1]); - if ( ia>=nbinom || ib>=nbinom ) -- error("The sample %s has incorrect number of %s fields at %s:%d\n", -- args->in_hdr->samples[i],args->binom_tag,bcf_seqname(args->in_hdr,rec),rec->pos+1); -+ error("The sample %s has incorrect number of %s fields at %s:%"PRId64"\n", -+ args->in_hdr->samples[i],args->binom_tag,bcf_seqname(args->in_hdr,rec),(int64_t) rec->pos+1); - - double prob = calc_binom(args->iarr[i*nbinom+ia],args->iarr[i*nbinom+ib]); - if ( !args->binom_cmp(prob,args->binom_val) ) continue; -@@ -391,7 +391,7 @@ - - for (i=0; in_sample; i++) - { -- if ( !args->smpl_pass[i] ) continue; -+ if ( args->smpl_pass && !args->smpl_pass[i] ) continue; - if ( args->new_mask>_UNPHASED ) - changed += unphase_gt(args->gts + i*ngts, ngts); - else if ( args->new_mask==GT_PHASED ) ---- python-pysam.orig/bcftools/plugins/setGT.c.pysam.c -+++ python-pysam/bcftools/plugins/setGT.c.pysam.c -@@ -322,7 +322,7 @@ - hts_expand(int,rec->n_allele,args->marr,args->arr); - int ret = bcf_calc_ac(args->in_hdr,rec,args->arr,BCF_UN_FMT); - if ( ret<= 0 ) -- error("Could not calculate allele count at %s:%d\n", bcf_seqname(args->in_hdr,rec),rec->pos+1); -+ error("Could not calculate allele count at %s:%"PRId64"\n", bcf_seqname(args->in_hdr,rec),(int64_t) rec->pos+1); - - for(i=0; i < rec->n_allele; ++i) - { -@@ -355,8 +355,8 @@ - int ia = bcf_gt_allele(ptr[0]); - int ib = bcf_gt_allele(ptr[1]); - if ( ia>=nbinom || ib>=nbinom ) -- error("The sample %s has incorrect number of %s fields at %s:%d\n", -- args->in_hdr->samples[i],args->binom_tag,bcf_seqname(args->in_hdr,rec),rec->pos+1); -+ error("The sample %s has incorrect number of %s fields at %s:%"PRId64"\n", -+ args->in_hdr->samples[i],args->binom_tag,bcf_seqname(args->in_hdr,rec),(int64_t) rec->pos+1); - - double prob = calc_binom(args->iarr[i*nbinom+ia],args->iarr[i*nbinom+ib]); - if ( !args->binom_cmp(prob,args->binom_val) ) 
continue; -@@ -393,7 +393,7 @@ - - for (i=0; in_sample; i++) - { -- if ( !args->smpl_pass[i] ) continue; -+ if ( args->smpl_pass && !args->smpl_pass[i] ) continue; - if ( args->new_mask>_UNPHASED ) - changed += unphase_gt(args->gts + i*ngts, ngts); - else if ( args->new_mask==GT_PHASED ) ---- python-pysam.orig/bcftools/plugins/smpl-stats.c -+++ python-pysam/bcftools/plugins/smpl-stats.c -@@ -28,6 +28,7 @@ - #include - #include - #include // for isatty -+#include - #include - #include - #include -@@ -230,11 +231,11 @@ - fprintf(fh,"# %d) number of indels\n", ++i); - fprintf(fh,"# %d) number of singletons\n", ++i); - fprintf(fh,"# %d) number of missing genotypes (./., ., ./0, etc)\n", ++i); -- fprintf(fh,"# %d) number of transitions (genotypes such as \"1/2\" are counted twice)\n", ++i); -- fprintf(fh,"# %d) number of transversions (genotypes such as \"1/2\" are counted twice)\n", ++i); -+ fprintf(fh,"# %d) number of transitions (alt het genotypes such as \"1/2\" are counted twice)\n", ++i); -+ fprintf(fh,"# %d) number of transversions (alt het genotypes such as \"1/2\" are counted twice)\n", ++i); - fprintf(fh,"# %d) overall ts/tv\n", ++i); - i = 0; -- fprintf(fh,"# SITE* lines report numbers for every threshold and site:\n"); -+ fprintf(fh,"# SITE* lines report numbers for every threshold:\n"); - fprintf(fh,"# %d) filter id\n", ++i); - fprintf(fh,"# %d) number of sites which pass the filter\n", ++i); - fprintf(fh,"# %d) number of SNVs\n", ++i); -@@ -390,7 +391,7 @@ - { - if ( als[j]==0 || als[j]==star_allele ) continue; - if ( als[j] >= rec->n_allele ) -- error("The GT index is out of range at %s:%d in %s\n", bcf_seqname(args->hdr,rec),rec->pos+1,args->hdr->samples[j]); -+ error("The GT index is out of range at %s:%"PRId64" in %s\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1,args->hdr->samples[j]); - - if ( args->ac[als[j]]==1 ) { stats->nsingleton++; site_singleton = 1; } - ---- python-pysam.orig/bcftools/plugins/smpl-stats.c.pysam.c -+++ 
python-pysam/bcftools/plugins/smpl-stats.c.pysam.c -@@ -30,6 +30,7 @@ - #include - #include - #include // for isatty -+#include - #include - #include - #include -@@ -232,11 +233,11 @@ - fprintf(fh,"# %d) number of indels\n", ++i); - fprintf(fh,"# %d) number of singletons\n", ++i); - fprintf(fh,"# %d) number of missing genotypes (./., ., ./0, etc)\n", ++i); -- fprintf(fh,"# %d) number of transitions (genotypes such as \"1/2\" are counted twice)\n", ++i); -- fprintf(fh,"# %d) number of transversions (genotypes such as \"1/2\" are counted twice)\n", ++i); -+ fprintf(fh,"# %d) number of transitions (alt het genotypes such as \"1/2\" are counted twice)\n", ++i); -+ fprintf(fh,"# %d) number of transversions (alt het genotypes such as \"1/2\" are counted twice)\n", ++i); - fprintf(fh,"# %d) overall ts/tv\n", ++i); - i = 0; -- fprintf(fh,"# SITE* lines report numbers for every threshold and site:\n"); -+ fprintf(fh,"# SITE* lines report numbers for every threshold:\n"); - fprintf(fh,"# %d) filter id\n", ++i); - fprintf(fh,"# %d) number of sites which pass the filter\n", ++i); - fprintf(fh,"# %d) number of SNVs\n", ++i); -@@ -392,7 +393,7 @@ - { - if ( als[j]==0 || als[j]==star_allele ) continue; - if ( als[j] >= rec->n_allele ) -- error("The GT index is out of range at %s:%d in %s\n", bcf_seqname(args->hdr,rec),rec->pos+1,args->hdr->samples[j]); -+ error("The GT index is out of range at %s:%"PRId64" in %s\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1,args->hdr->samples[j]); - - if ( args->ac[als[j]]==1 ) { stats->nsingleton++; site_singleton = 1; } - ---- /dev/null -+++ python-pysam/bcftools/plugins/split-vep.c -@@ -0,0 +1,934 @@ -+/* The MIT License -+ -+ Copyright (c) 2019 Genome Research Ltd. 
-+ -+ Author: Petr Danecek -+ -+ Permission is hereby granted, free of charge, to any person obtaining a copy -+ of this software and associated documentation files (the "Software"), to deal -+ in the Software without restriction, including without limitation the rights -+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -+ copies of the Software, and to permit persons to whom the Software is -+ furnished to do so, subject to the following conditions: -+ -+ The above copyright notice and this permission notice shall be included in -+ all copies or substantial portions of the Software. -+ -+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -+ THE SOFTWARE. -+ -+ */ -+ -+#include -+#include -+#include -+#include -+#include // for isatty -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include "../bcftools.h" -+#include "../filter.h" -+#include "../convert.h" -+#include "../cols.h" -+ -+ -+// Logic of the filters: include or exclude sites which match the filters? -+#define FLT_INCLUDE 1 -+#define FLT_EXCLUDE 2 -+ -+#define SELECT_TR_ALL 0 -+#define SELECT_TR_WORST 1 -+#define SELECT_TR_PRIMARY 2 -+#define SELECT_CSQ_ANY -1 -+ -+typedef struct -+{ -+ char *field; // the name of the VEP field, e.g. Consequence,Gene,etc. 
-+ char *tag; // the name of the VCF tag: the annot_t.field with the -p prefix -+ int idx; // 0-based index within the VEP annotation string -+ int type; // annotation type, one of the BCF_HT_* types -+ kstring_t str; // annotation value, ready to pass to bcf_update_info_* -+} -+annot_t; -+ -+typedef struct -+{ -+ convert_t *convert; -+ filter_t *filter; -+ int argc, filter_logic, regions_is_file, targets_is_file, list_hdr; -+ kstring_t kstr; -+ char *filter_str, -+ *vep_tag; // the --annotation INFO tag to process -+ char **argv, *output_fname, *fname, *regions, *targets, *format_str; -+ int output_type; -+ htsFile *fh_vcf; -+ BGZF *fh_bgzf; -+ bcf_srs_t *sr; -+ bcf_hdr_t *hdr, *hdr_out; -+ int nfield; // number of all available VEP fields -+ char **field; // list of all available VEP fields -+ int nannot; // number of requested fields -+ annot_t *annot; // requested fields -+ int nscale; // number of items in the severity scale -+ char **scale; // severity scale (list) -+ int ncsq_str; // the length of csq_str allocated by bcf_get_info_string() -+ char *csq_str; // the current bcf_get_info_string() result -+ int csq_idx, // the index of the Consequence field; for the --select CSQ option -+ primary_id; // the index of the CANONICAL field; for the --select TR option -+ char *severity, // the --severity scale option -+ *select, // the --select option -+ *column_str, // the --columns option -+ *annot_prefix; // the --annot-prefix option -+ void *field2idx, // VEP field name to index, used in initialization -+ *csq2severity; // consequence type to severity score -+ cols_t *cols_tr, // the current CSQ tag split into transcripts -+ *cols_csq; // the current CSQ transcript split into fields -+ int min_severity, max_severity; // ignore consequences outside this severity range -+ int drop_sites; // the -x, --drop-sites option -+ int select_tr; // one of SELECT_TR_* -+ uint8_t *smpl_pass; // for filtering at sample level, used with -f -+ int duplicate; // the -d, 
--duplicate option is set -+ char *all_fields_delim; // the -A, --all-fields option is set -+ float *farr; // helper arrays for bcf_update_* functions -+ int32_t *iarr; -+ int niarr,miarr, nfarr,mfarr; -+} -+args_t; -+ -+args_t args; -+ -+const char *about(void) -+{ -+ return "Query structured annotations such as the CSQ created by VEP.\n"; -+} -+ -+static const char *default_severity(void) -+{ -+ return -+ "# Default consequence substrings ordered in ascending order by severity.\n" -+ "# Consequences with the same severity can be put on the same line in arbitrary order.\n" -+ "intergenic\n" -+ "downstream upstream\n" -+ "intron\n" -+ "non_coding\n" -+ "regulatory\n" -+ "5_prime_utr 3_prime_utr\n" -+ "stop_retained start_retained synonymous\n" -+ "splice_region\n" -+ "coding_sequence\n" -+ "missense\n" -+ "inframe\n" -+ "exon_loss\n" -+ "disruptive\n" -+ "splice_acceptor splice_donor\n" -+ "start_lost stop_lost stop_gained frameshift\n"; -+} -+static const char *usage_text(void) -+{ -+ return -+ "\n" -+ "About: Query structured annotations such INFO/CSQ created by bcftools/csq or VEP. For more\n" -+ " more information and pointers see http://samtools.github.io/bcftools/howtos/plugin.split-vep.html\n" -+ "Usage: bcftools +split-vep [Plugin Options]\n" -+ "Plugin options:\n" -+ " -a, --annotation STR INFO annotation to parse [CSQ]\n" -+ " -A, --all-fields DELIM Output all fields replacing the -a tag (\"%CSQ\" by default) in the -f\n" -+ " filtering expression using the output field delimiter DELIM. This can be\n" -+ " \"tab\", \"space\" or an arbitrary string.\n" -+ " -c, --columns LIST[:type] Extract the fields listed either as indexes or names. 
The default type\n" -+ " of the new annotation is String but can be also Integer/Int or Float/Real.\n" -+ " -d, --duplicate Output per transcript/allele consequences on a new line rather rather than\n" -+ " as comma-separated fields on a single line\n" -+ " -f, --format Formatting expression for non-VCF/BCF output, same as `bcftools query -f`\n" -+ " -l, --list Parse the VCF header and list the annotation fields\n" -+ " -p, --annot-prefix Prefix of INFO annotations to be created after splitting the CSQ string\n" -+ " -s, --select TR:CSQ Select transcripts to extract by type and/or consequence. (See also the -x switch.)\n" -+ " TR, transcript: worst,primary(*),all [all]\n" -+ " CSQ, consequence: any,missense,missense+,etc [any]\n" -+ " (*) Primary transcripts have the field \"CANONICAL\" set to \"YES\"\n" -+ " -S, --severity -|FILE Pass \"-\" to print the default severity scale or FILE to override\n" -+ " the default scale\n" -+ " -x, --drop-sites Drop sites with none of the consequences matching the severity specified by -s.\n" -+ " This switch is intended for use with VCF/BCF output (i.e. 
-f not given).\n" -+ "Common options:\n" -+ " -e, --exclude EXPR Exclude sites and samples for which the expression is true\n" -+ " -i, --include EXPR Include sites and samples for which the expression is true\n" -+ " -o, --output FILE Output file name [stdout]\n" -+ " -O, --output-type b|u|z|v b: compressed BCF, u: uncompressed BCF, z: compressed VCF or text, v: uncompressed VCF or text [v]\n" -+ " -r, --regions REG Restrict to comma-separated list of regions\n" -+ " -R, --regions-file FILE Restrict to regions listed in a file\n" -+ " -t, --targets REG Similar to -r but streams rather than index-jumps\n" -+ " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n" -+ "\n" -+ "Examples:\n" -+ " # List available fields of the INFO/CSQ annotation\n" -+ " bcftools +split-vep -l file.vcf.gz\n" -+ "\n" -+ " # List the default severity scale\n" -+ " bcftools +split-vep -S -\n" -+ "\n" -+ " # Extract Consequence, IMPACT and gene SYMBOL of the most severe consequence into\n" -+ " # INFO annotations starting with the prefix \"vep\". For brevity, the columns can\n" -+ " # be given also as 0-based indexes\n" -+ " bcftools +split-vep -c Consequence,IMPACT,SYMBOL -s worst -p vep file.vcf.gz\n" -+ " bcftools +split-vep -c 1-3 -s worst -p vep file.vcf.gz\n" -+ "\n" -+ " # Same as above but use the text output of the \"bcftools query\" format\n" -+ " bcftools +split-vep -s worst -f '%CHROM %POS %Consequence %IMPACT %SYMBOL\\n' file.vcf.gz\n" -+ "\n" -+ " # Print all subfields (tab-delimited) in place of %CSQ, each consequence on a new line\n" -+ " bcftools +split-vep -f '%CHROM %POS %CSQ\\n' -d -A tab file.vcf.gz\n" -+ "\n" -+ " # Extract gnomAD_AF subfield into a new INFO/gnomAD_AF annotation of Type=Float so that\n" -+ " # numeric filtering can be used.\n" -+ " bcftools +split-vep -c gnomAD_AF:Float file.vcf.gz -i'gnomAD_AF<0.001'\n" -+ "\n" -+ " # Similar to above, but add the annotation only if the consequence severity is missense\n" -+ " # or equivalent. 
In order to drop sites with different consequences completely, we add\n" -+ " # the -x switch. See the online documentation referenced above for more examples.\n" -+ " bcftools +split-vep -c gnomAD_AF:Float -s :missense file.vcf.gz\n" -+ " bcftools +split-vep -c gnomAD_AF:Float -s :missense -x file.vcf.gz\n" -+ "\n"; -+} -+ -+static void expand_csq_expression(args_t *args, kstring_t *str) -+{ -+ if ( !args->all_fields_delim ) return; -+ -+ str->l = 0; -+ kputc('%',str); -+ kputs(args->vep_tag,str); -+ char *ptr = strstr(args->format_str,str->s); -+ if ( !ptr ) return; -+ char *end = ptr + str->l, tmp = *end; -+ if ( isalnum(tmp) || tmp=='_' || tmp=='.' ) return; -+ *end = 0; -+ -+ str->l = 0; -+ kputsn(args->format_str, ptr - args->format_str, str); -+ -+ int i; -+ for (i=0; infield; i++) -+ { -+ if ( i>0 ) kputs(args->all_fields_delim, str); -+ kputc('%', str); -+ kputs(args->field[i], str); -+ } -+ -+ *end = tmp; -+ kputs(end, str); -+ -+ free(args->format_str); -+ args->format_str = str->s; -+ str->l = str->m = 0; -+ str->s = NULL; -+} -+ -+static void init_data(args_t *args) -+{ -+ args->sr = bcf_sr_init(); -+ if ( args->regions ) -+ { -+ args->sr->require_index = 1; -+ if ( bcf_sr_set_regions(args->sr, args->regions, args->regions_is_file)<0 ) error("Failed to read the regions: %s\n",args->regions); -+ } -+ if ( args->targets && bcf_sr_set_targets(args->sr, args->targets, args->targets_is_file, 0)<0 ) error("Failed to read the targets: %s\n",args->targets); -+ if ( !bcf_sr_add_reader(args->sr,args->fname) ) error("Error: %s\n", bcf_sr_strerror(args->sr->errnum)); -+ args->hdr = bcf_sr_get_header(args->sr,0); -+ args->hdr_out = bcf_hdr_dup(args->hdr); -+ -+ // Parse the header CSQ line, must contain Description with "Format: ..." 
declaration -+ bcf_hrec_t *hrec = bcf_hdr_get_hrec(args->hdr, BCF_HL_INFO, NULL, args->vep_tag, NULL); -+ if ( !hrec ) error("The tag INFO/%s not found in the header\n", args->vep_tag); -+ int ret = bcf_hrec_find_key(hrec, "Description"); -+ if ( ret<0 ) error("No \"Description\" field was found for the tag INFO/%s in the header\n", args->vep_tag); -+ char *format = strstr(hrec->vals[ret], "Format: "); -+ if ( !format ) error("Expected \"Format: \" substring in the header INFO/%s/Description, found: %s\n", args->vep_tag,hrec->vals[ret]); -+ format += 8; -+ char *ep = format; -+ while ( *ep ) -+ { -+ char *bp = ep; -+ while ( *ep && *ep!='|' ) ep++; -+ char tmp = *ep; -+ *ep = 0; -+ args->nfield++; -+ args->field = (char**)realloc(args->field,args->nfield*sizeof(*args->field)); -+ args->field[args->nfield-1] = strdup(bp); -+ if ( !tmp ) break; -+ ep++; -+ } -+ if ( !args->nfield ) error("Could not parse Description of INFO/%s: %s\n", args->vep_tag,hrec->vals[ret]); -+ int len = strlen(args->field[args->nfield-1]); -+ if ( args->field[args->nfield-1][len-1]=='"' ) args->field[args->nfield-1][len-1] = 0; // remove the trailing doublequote character -+ args->field2idx = khash_str2int_init(); -+ int i,j; -+ for (i=0; infield; i++) -+ { -+ if ( khash_str2int_has_key(args->field2idx, args->field[i]) ) -+ { -+ fprintf(stderr,"Warning: duplicate INFO/%s key \"%s\"\n", args->vep_tag,args->field[i]); -+ continue; -+ } -+ khash_str2int_set(args->field2idx, args->field[i], i); -+ } -+ -+ // Create a text output as with `bcftools query -f`. 
For this we need to determine the fields to be extracted -+ // from the formatting expression -+ kstring_t str = {0,0,0}; -+ if ( args->format_str && !args->column_str ) -+ { -+ // Special case: -A was given, extract all fields, for this the -a tag (%CSQ) must be present -+ if ( args->all_fields_delim ) expand_csq_expression(args, &str); -+ -+ for (i=0; infield; i++) -+ { -+ str.l = 0; -+ kputc('%',&str); -+ kputs(args->field[i],&str); -+ char end, *ptr = args->format_str; -+ while ( ptr ) -+ { -+ ptr = strstr(ptr,str.s); -+ if ( !ptr ) break; -+ end = ptr[str.l]; -+ if ( isalnum(end) || end=='_' || end=='.' ) -+ { -+ ptr++; -+ continue; -+ } -+ break; -+ } -+ if ( !ptr ) continue; -+ ptr[str.l] = 0; -+ int tag_id = bcf_hdr_id2int(args->hdr, BCF_DT_ID, ptr+1); -+ if ( bcf_hdr_idinfo_exists(args->hdr,BCF_HL_INFO,tag_id) ) -+ fprintf(stderr,"Note: ambigous key %s, using the %s subfield of %s, not the INFO/%s tag\n", ptr,ptr+1,args->vep_tag,ptr+1); -+ -+ int olen = args->column_str ? strlen(args->column_str) : 0; -+ int nlen = strlen(ptr) - 1; -+ args->column_str = (char*)realloc(args->column_str, olen + nlen + 2); -+ if ( olen ) -+ { -+ memcpy(args->column_str+olen,",",1); -+ olen++; -+ } -+ memcpy(args->column_str+olen,ptr+1,nlen); -+ args->column_str[olen+nlen] = 0; -+ -+ ptr[str.l] = end; -+ } -+ } -+ -+ // The "Consequence" column to look up severity, its name is hardwired for now -+ if ( khash_str2int_get(args->field2idx,"Consequence",&args->csq_idx)!=0 ) -+ error("The field \"Consequence\" is not present in INFO/%s: %s\n", args->vep_tag,hrec->vals[ret]); -+ -+ // Columns to extract: given as names, 0-based indexes or ranges of indexes -+ if ( args->column_str ) -+ { -+ int *column = NULL; -+ int *types = NULL; -+ ep = args->column_str; -+ while ( *ep ) -+ { -+ char *tp, *bp = ep; -+ while ( *ep && *ep!=',' ) ep++; -+ char tmp = *ep; -+ *ep = 0; -+ int type = BCF_HT_STR; -+ int idx_beg, idx_end; -+ if ( khash_str2int_get(args->field2idx, bp, &idx_beg)==0 ) -+ 
idx_end = idx_beg; -+ else if ( (tp=strrchr(bp,':')) ) -+ { -+ *tp = 0; -+ if ( khash_str2int_get(args->field2idx, bp, &idx_beg)!=0 ) -+ { -+ *tp = ':'; -+ error("No such column: \"%s\"\n", bp); -+ } -+ idx_end = idx_beg; -+ *tp = ':'; -+ if ( !strcasecmp(tp+1,"string") ) type = BCF_HT_STR; -+ else if ( !strcasecmp(tp+1,"float") || !strcasecmp(tp+1,"real") ) type = BCF_HT_REAL; -+ else if ( !strcasecmp(tp+1,"integer") || !strcasecmp(tp+1,"int") ) type = BCF_HT_INT; -+ else if ( !strcasecmp(tp+1,"flag") ) type = BCF_HT_FLAG; -+ else error("The type \"%s\" (or column \"%s\"?) not recognised\n", tp+1,bp); -+ } -+ else -+ { -+ char *mp; -+ idx_beg = strtol(bp,&mp,10); -+ if ( !*mp ) idx_end = idx_beg; -+ else if ( *mp=='-' ) -+ idx_end = strtol(mp+1,&mp,10); -+ if ( *mp ) -+ { -+ if ( *mp==':' ) -+ { -+ idx_end = idx_beg; -+ if ( !strcasecmp(mp+1,"string") ) type = BCF_HT_STR; -+ else if ( !strcasecmp(mp+1,"float") || !strcasecmp(mp+1,"real") ) type = BCF_HT_REAL; -+ else if ( !strcasecmp(mp+1,"integer") || !strcasecmp(mp+1,"int") ) type = BCF_HT_INT; -+ else if ( !strcasecmp(mp+1,"flag") ) type = BCF_HT_FLAG; -+ else error("The type \"%s\" (or column \"%s\"?) not recognised\n", mp+1,bp); -+ } -+ else -+ error("No such column: \"%s\"\n", bp); -+ } -+ } -+ -+ i = args->nannot; -+ args->nannot += idx_end - idx_beg + 1; -+ column = (int*)realloc(column,args->nannot*sizeof(*column)); -+ types = (int*)realloc(types,args->nannot*sizeof(*types)); -+ for (j=idx_beg; j<=idx_end; j++) -+ { -+ if ( j >= args->nfield ) error("The index is too big: %d\n", j); -+ column[i] = j; -+ types[i] = type; -+ i++; -+ } -+ if ( !tmp ) break; -+ ep++; -+ } -+ args->annot = (annot_t*)calloc(args->nannot,sizeof(*args->annot)); -+ int len = args->annot_prefix ? 
strlen(args->annot_prefix) : 0; -+ for (i=0; inannot; i++) -+ { -+ annot_t *ann = &args->annot[i]; -+ ann->type = types[i]; -+ ann->idx = j = column[i]; -+ ann->field = strdup(args->field[j]); -+ int clen = strlen(args->field[j]); -+ ann->tag = (char*)malloc(clen+len+1); -+ if ( len ) memcpy(ann->tag,args->annot_prefix,len); -+ memcpy(ann->tag+len,ann->field,clen); -+ ann->tag[len+clen] = 0; -+ args->kstr.l = 0; -+ char *type = "String"; -+ if ( ann->type==BCF_HT_REAL ) type = "Float"; -+ else if ( ann->type==BCF_HT_INT ) type = "Integer"; -+ else if ( ann->type==BCF_HT_FLAG ) type = "Flag"; -+ ksprintf(&args->kstr,"##INFO=",type); -+ bcf_hdr_printf(args->hdr_out, args->kstr.s, ann->tag,ann->field,args->vep_tag); -+ } -+ free(column); -+ free(types); -+ -+ if ( bcf_hdr_sync(args->hdr_out)<0 ) -+ error_errno("[%s] Failed to update header", __func__); -+ } -+ if ( args->format_str ) -+ { -+ if ( !args->column_str && !args->select ) error("Error: No %s field selected in the formatting expression and -s not given: a typo?\n",args->vep_tag); -+ args->convert = convert_init(args->hdr_out, NULL, 0, args->format_str); -+ if ( !args->convert ) error("Could not parse the expression: %s\n", args->format_str); -+ } -+ if ( args->filter_str ) -+ { -+ int max_unpack = args->convert ? 
convert_max_unpack(args->convert) : 0; -+ args->filter = filter_init(args->hdr_out, args->filter_str); -+ max_unpack |= filter_max_unpack(args->filter); -+ args->sr->max_unpack = max_unpack; -+ if ( max_unpack & BCF_UN_FMT ) -+ convert_set_option(args->convert, subset_samples, &args->smpl_pass); -+ } -+ -+ // Severity scale -+ args->csq2severity = khash_str2int_init(); -+ int severity = 0; -+ str.l = 0; -+ if ( args->severity ) -+ { -+ kstring_t tmp = {0,0,0}; -+ htsFile *fp = hts_open(args->severity,"r"); -+ if ( !fp ) error("Cannot read %s\n", args->severity); -+ while ( hts_getline(fp, KS_SEP_LINE, &tmp) > 0 ) -+ { -+ kputs(tmp.s, &str); -+ kputc('\n', &str); -+ } -+ free(tmp.s); -+ } -+ else -+ kputs(default_severity(),&str); -+ ep = str.s; -+ while ( *ep ) -+ { -+ if ( *ep=='#' ) -+ { -+ while ( *ep && *ep!='\n' ) { *ep = tolower(*ep); ep++; } -+ if ( !*ep ) break; -+ ep++; -+ continue; -+ } -+ char *bp = ep; -+ while ( *ep && !isspace(*ep) ) { *ep = tolower(*ep); ep++; } -+ char tmp = *ep; -+ *ep = 0; -+ args->nscale++; -+ args->scale = (char**) realloc(args->scale,args->nscale*sizeof(*args->scale)); -+ args->scale[args->nscale-1] = strdup(bp); -+ if ( !khash_str2int_has_key(args->csq2severity,args->scale[args->nscale-1]) ) -+ khash_str2int_set(args->csq2severity,args->scale[args->nscale-1], severity); -+ if ( !tmp ) break; -+ if ( tmp=='\n' ) severity++; -+ ep++; -+ while ( *ep && isspace(*ep) ) ep++; -+ } -+ free(str.s); -+ -+ // Transcript and/or consequence selection -+ if ( !args->select ) args->select = "all:any"; -+ cols_t *cols = cols_split(args->select, NULL, ':'); -+ char *sel_tr = cols->off[0][0] ? cols->off[0] : "all"; -+ char *sel_csq = cols->n==2 && cols->off[1][0] ? 
cols->off[1] : "any"; -+ if ( !strcasecmp(sel_tr,"all") ) args->select_tr = SELECT_TR_ALL; -+ else if ( !strcasecmp(sel_tr,"worst") ) args->select_tr = SELECT_TR_WORST; -+ else if ( !strcasecmp(sel_tr,"primary") ) args->select_tr = SELECT_TR_PRIMARY; -+ else error("Error: the transcript selection key \"%s\" is not recognised.\n", sel_tr); -+ if ( !strcasecmp(sel_csq,"any") ) { args->min_severity = args->max_severity = SELECT_CSQ_ANY; } // to avoid unnecessary lookups -+ else -+ { -+ int len = strlen(sel_csq); -+ int severity, modifier = '='; -+ if ( sel_csq[len-1]=='+' ) { modifier = '+'; sel_csq[len-1] = 0; } -+ else if ( sel_csq[len-1]=='-' ) { modifier = '-'; sel_csq[len-1] = 0; } -+ if ( khash_str2int_get(args->csq2severity, sel_csq, &severity)!=0 ) -+ error("Error: the consequence \"%s\" is not recognised. Run \"bcftools +split-vep -S ?\" to see the default list.\n", sel_csq); -+ if ( modifier=='=' ) { args->min_severity = severity; args->max_severity = severity; } -+ else if ( modifier=='+' ) { args->min_severity = severity; args->max_severity = INT_MAX; } -+ else if ( modifier=='-' ) { args->min_severity = 0; args->max_severity = severity; } -+ } -+ cols_destroy(cols); -+ -+ // The 'CANONICAL' column to look up severity, its name is hardwired for now -+ if ( args->select_tr==SELECT_TR_PRIMARY && khash_str2int_get(args->field2idx,"CANONICAL",&args->primary_id)!=0 ) -+ error("The primary transcript was requested but the field \"CANONICAL\" is not present in INFO/%s: %s\n",args->vep_tag,hrec->vals[ret]); -+} -+static void destroy_data(args_t *args) -+{ -+ free(args->farr); -+ free(args->iarr); -+ free(args->kstr.s); -+ free(args->column_str); -+ free(args->format_str); -+ cols_destroy(args->cols_csq); -+ cols_destroy(args->cols_tr); -+ int i; -+ for (i=0; inscale; i++) free(args->scale[i]); -+ free(args->scale); -+ for (i=0; infield; i++) free(args->field[i]); -+ free(args->field); -+ for (i=0; inannot; i++) -+ { -+ annot_t *ann = &args->annot[i]; -+ 
free(ann->field); -+ free(ann->tag); -+ free(ann->str.s); -+ } -+ free(args->annot); -+ if ( args->field2idx ) khash_str2int_destroy(args->field2idx); -+ if ( args->csq2severity ) khash_str2int_destroy(args->csq2severity); -+ bcf_sr_destroy(args->sr); -+ bcf_hdr_destroy(args->hdr_out); -+ free(args->csq_str); -+ if ( args->filter ) filter_destroy(args->filter); -+ if ( args->convert ) convert_destroy(args->convert); -+ if ( args->fh_vcf && hts_close(args->fh_vcf)!=0 ) error("Error: close failed .. %s\n",args->output_fname); -+ if ( args->fh_bgzf && bgzf_close(args->fh_bgzf)!=0 ) error("Error: close failed .. %s\n",args->output_fname); -+ free(args); -+} -+static void list_header(args_t *args) -+{ -+ int i; -+ for (i=0; infield; i++) printf("%d\t%s\n", i,args->field[i]); -+} -+ -+static void csq_to_severity(args_t *args, char *csq, int *min_severity, int *max_severity, int exact_match) -+{ -+ *min_severity = INT_MAX; -+ *max_severity = -1; -+ char *ep = csq; -+ while ( *ep ) -+ { -+ char *bp = ep; -+ while ( *ep && *ep!='&' ) { *ep = tolower(*ep); ep++; } -+ char tmp = *ep; -+ *ep = 0; -+ -+ int i, severity = -1; -+ if ( khash_str2int_get(args->csq2severity, bp, &severity)!=0 ) -+ { -+ for (i=0; inscale; i++) -+ if ( strstr(bp,args->scale[i]) ) break; -+ -+ if ( i!=args->nscale ) -+ khash_str2int_get(args->csq2severity, args->scale[i], &severity); -+ else -+ severity = args->nscale + 1; -+ -+ args->nscale++; -+ args->scale = (char**) realloc(args->scale,args->nscale*sizeof(*args->scale)); -+ args->scale[args->nscale-1] = strdup(bp); -+ khash_str2int_set(args->csq2severity,args->scale[args->nscale-1], severity); -+ if ( i==args->nscale ) -+ fprintf(stderr,"Note: assigning a (high) severity score to a new consequence, use -S to override: %s -> %d\n",args->scale[args->nscale-1],args->nscale); -+ -+ if ( khash_str2int_get(args->csq2severity, bp, &severity)!=0 ) error("FIXME: failed to look up the consequence \"%s\"\n", bp); -+ } -+ if ( exact_match < 0 ) -+ { -+ if ( 
*min_severity > severity ) *min_severity = severity; -+ if ( *max_severity < severity ) *max_severity = severity; -+ } -+ else -+ { -+ if ( severity==exact_match ) -+ { -+ *min_severity = *max_severity = severity; -+ *ep = tmp; -+ return; -+ } -+ } -+ -+ if ( !tmp ) break; -+ *ep = tmp; -+ ep++; -+ } -+} -+ -+static int csq_severity_pass(args_t *args, char *csq) -+{ -+ if ( args->min_severity==args->max_severity && args->min_severity==SELECT_CSQ_ANY ) return 1; -+ -+ int min_severity, max_severity, exact_match = args->min_severity==args->max_severity ? args->min_severity : -1; -+ csq_to_severity(args, csq, &min_severity, &max_severity, exact_match); -+ if ( max_severity < args->min_severity ) return 0; -+ if ( min_severity > args->max_severity ) return 0; -+ return 1; -+} -+ -+static int get_primary_transcript(args_t *args, bcf1_t *rec, cols_t *cols_tr) // modifies args->cols_csq! -+{ -+ int i; -+ for (i=0; in; i++) -+ { -+ args->cols_csq = cols_split(cols_tr->off[i], args->cols_csq, '|'); -+ if ( args->primary_id >= args->cols_csq->n ) -+ error("Too few columns at %s:%"PRId64" .. %d (Consequence) >= %d\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1,args->primary_id,args->cols_csq->n); -+ if ( !strcmp("YES",args->cols_csq->off[args->primary_id]) ) return i; -+ } -+ return -1; -+} -+static int get_worst_transcript(args_t *args, bcf1_t *rec, cols_t *cols_tr) // modifies args->cols_csq! -+{ -+ int i, max_severity = -1, imax_severity = 0; -+ for (i=0; in; i++) -+ { -+ args->cols_csq = cols_split(cols_tr->off[i], args->cols_csq, '|'); -+ if ( args->csq_idx >= args->cols_csq->n ) -+ error("Too few columns at %s:%"PRId64" .. 
%d (Consequence) >= %d\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1,args->csq_idx,args->cols_csq->n); -+ char *csq = args->cols_csq->off[args->csq_idx]; -+ -+ int min, max; -+ csq_to_severity(args, csq, &min, &max, -1); -+ if ( max_severity < max ) { imax_severity = i; max_severity = max; } -+ } -+ return imax_severity; -+} -+static void annot_reset(annot_t *annot, int nannot) -+{ -+ int i; -+ for (i=0; istr.l ) kputc(',',&ann->str); -+ kputs(value, &ann->str); -+} -+static inline void parse_array_real(char *str, float **arr, int *marr, int *narr) -+{ -+ char *bp = str, *ep; -+ float *ptr = *arr; -+ int i, n = 1, m = *marr; -+ for (i=0; *bp; bp++) -+ if ( *bp == ',' ) n++; -+ -+ hts_expand(float*,n,m,ptr); -+ -+ i = 0; -+ bp = str; -+ while ( *bp ) -+ { -+ ptr[i] = strtod(bp, &ep); -+ if ( bp==ep ) -+ bcf_float_set_missing(ptr[i]); -+ i++; -+ while ( *ep && *ep!=',' ) ep++; -+ bp = *ep ? ep + 1 : ep; -+ } -+ *narr = i; -+ *marr = m; -+ *arr = ptr; -+} -+static inline void parse_array_int32(char *str, int **arr, int *marr, int *narr) -+{ -+ char *bp = str, *ep; -+ int32_t *ptr = *arr; -+ int i, n = 1, m = *marr; -+ for (i=0; *bp; bp++) -+ if ( *bp == ',' ) n++; -+ -+ hts_expand(int32_t*,n,m,ptr); -+ -+ i = 0; -+ bp = str; -+ while ( *bp ) -+ { -+ ptr[i] = strtol(bp, &ep, 10); -+ if ( bp==ep ) -+ ptr[i] = bcf_int32_missing; -+ i++; -+ while ( *ep && *ep!=',' ) ep++; -+ bp = *ep ? 
ep + 1 : ep; -+ } -+ *narr = i; -+ *marr = m; -+ *arr = ptr; -+} -+static void filter_and_output(args_t *args, bcf1_t *rec, int severity_pass, int all_missing) -+{ -+ int i, updated = 0; -+ for (i=0; inannot; i++) -+ { -+ annot_t *ann = &args->annot[i]; -+ if ( !ann->str.l ) continue; -+ if ( ann->type==BCF_HT_REAL ) -+ { -+ parse_array_real(ann->str.s,&args->farr,&args->mfarr,&args->nfarr); -+ bcf_update_info_float(args->hdr_out,rec,ann->tag,args->farr,args->nfarr); -+ } -+ else if ( ann->type==BCF_HT_INT ) -+ { -+ parse_array_int32(ann->str.s,&args->iarr,&args->miarr,&args->niarr); -+ bcf_update_info_int32(args->hdr_out,rec,ann->tag,args->iarr,args->niarr); -+ } -+ else -+ bcf_update_info_string(args->hdr_out,rec,ann->tag,ann->str.s); -+ updated++; -+ } -+ if ( args->filter ) -+ { -+ int pass = filter_test(args->filter, rec, (const uint8_t**) &args->smpl_pass); -+ if ( args->filter_logic & FLT_EXCLUDE ) pass = pass ? 0 : 1; -+ if ( !pass ) return; -+ } -+ if ( args->format_str ) -+ { -+ if ( args->nannot ) -+ { -+ if ( !updated || all_missing ) return; // the standard case: using -f to print the CSQ subfields, skipping if missing -+ } -+ else -+ { -+ if ( !severity_pass ) return; // request to print only non-CSQ tags at sites that pass severity -+ } -+ -+ args->kstr.l = 0; -+ convert_line(args->convert, rec, &args->kstr); -+ if ( args->kstr.l && bgzf_write(args->fh_bgzf, args->kstr.s, args->kstr.l)!=args->kstr.l ) -+ error("Failed to write to %s\n", args->output_fname); -+ return; -+ } -+ if ( bcf_write(args->fh_vcf, args->hdr_out,rec)!=0 ) -+ error("Failed to write to %s\n", args->output_fname); -+} -+static void process_record(args_t *args, bcf1_t *rec) -+{ -+ int len = bcf_get_info_string(args->hdr,rec,args->vep_tag,&args->csq_str,&args->ncsq_str); -+ if ( len<=0 ) return; -+ -+ args->cols_tr = cols_split(args->csq_str, args->cols_tr, ','); -+ -+ int i,j, itr_min = 0, itr_max = args->cols_tr->n - 1; -+ if ( args->select_tr==SELECT_TR_PRIMARY ) -+ { -+ itr_min 
= itr_max = get_primary_transcript(args, rec, args->cols_tr); -+ if ( itr_min<0 ) itr_max = itr_min - 1; -+ } -+ else if ( args->select_tr==SELECT_TR_WORST ) -+ itr_min = itr_max = get_worst_transcript(args, rec, args->cols_tr); -+ -+ annot_reset(args->annot, args->nannot); -+ int severity_pass = 0; // consequence severity requested via the -s option (BCF record may be output but not annotated) -+ int all_missing = 1; // transcripts with all requested annotations missing will be discarded if -f was given -+ static int too_few_fields_warned = 0; -+ for (i=itr_min; i<=itr_max; i++) -+ { -+ args->cols_csq = cols_split(args->cols_tr->off[i], args->cols_csq, '|'); -+ if ( args->csq_idx >= args->cols_csq->n ) -+ error("Too few columns at %s:%"PRId64" .. %d (Consequence) >= %d\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1,args->csq_idx,args->cols_csq->n); -+ -+ char *csq = args->cols_csq->off[args->csq_idx]; -+ if ( !csq_severity_pass(args, csq) ) continue; -+ severity_pass = 1; -+ -+ for (j=0; jnannot; j++) -+ { -+ annot_t *ann = &args->annot[j]; -+ if ( ann->idx >= args->cols_csq->n ) -+ { -+ if ( !too_few_fields_warned ) -+ { -+ fprintf(stderr, "Warning: fewer %s fields than expected at %s:%"PRId64", filling with dots. 
This warning is printed only once.\n", args->vep_tag,bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); -+ too_few_fields_warned = 1; -+ } -+ annot_append(ann, "."); -+ continue; -+ } -+ -+ if ( !*args->cols_csq->off[ann->idx] ) -+ annot_append(ann, "."); // missing value -+ else -+ { -+ annot_append(ann, args->cols_csq->off[ann->idx]); -+ all_missing = 0; -+ } -+ } -+ -+ if ( args->duplicate ) -+ { -+ filter_and_output(args, rec, severity_pass, all_missing); -+ annot_reset(args->annot, args->nannot); -+ all_missing = 1; -+ severity_pass = 0; -+ } -+ } -+ if ( !severity_pass && args->drop_sites ) return; -+ if ( !args->duplicate ) -+ filter_and_output(args, rec, severity_pass, all_missing); -+} -+ -+int run(int argc, char **argv) -+{ -+ args_t *args = (args_t*) calloc(1,sizeof(args_t)); -+ args->argc = argc; args->argv = argv; -+ args->output_fname = "-"; -+ args->output_type = FT_VCF; -+ args->vep_tag = "CSQ"; -+ static struct option loptions[] = -+ { -+ {"drop-sites",no_argument,0,'x'}, -+ {"all-fields",no_argument,0,'A'}, -+ {"duplicate",no_argument,0,'d'}, -+ {"format",required_argument,0,'f'}, -+ {"annotation",required_argument,0,'a'}, -+ {"annot-prefix",required_argument,0,'p'}, -+ {"columns",required_argument,0,'c'}, -+ {"select",required_argument,0,'s'}, -+ {"severity",required_argument,0,'S'}, -+ {"list",no_argument,0,'l'}, -+ {"include",required_argument,0,'i'}, -+ {"exclude",required_argument,0,'e'}, -+ {"output",required_argument,NULL,'o'}, -+ {"output-type",required_argument,NULL,'O'}, -+ {"regions",1,0,'r'}, -+ {"regions-file",1,0,'R'}, -+ {"targets",1,0,'t'}, -+ {"targets-file",1,0,'T'}, -+ {NULL,0,NULL,0} -+ }; -+ int c; -+ while ((c = getopt_long(argc, argv, "o:O:i:e:r:R:t:T:lS:s:c:p:a:f:dA:x",loptions,NULL)) >= 0) -+ { -+ switch (c) -+ { -+ case 'A': -+ if ( !strcasecmp(optarg,"tab") ) args->all_fields_delim = "\t"; -+ else if ( !strcasecmp(optarg,"space") ) args->all_fields_delim = " "; -+ else args->all_fields_delim = optarg; -+ break; -+ case 
'x': args->drop_sites = 1; break; -+ case 'd': args->duplicate = 1; break; -+ case 'f': args->format_str = strdup(optarg); break; -+ case 'a': args->vep_tag = optarg; break; -+ case 'p': args->annot_prefix = optarg; break; -+ case 'c': args->column_str = strdup(optarg); break; -+ case 'S': args->severity = optarg; break; -+ case 's': args->select = optarg; break; -+ case 'l': args->list_hdr = 1; break; -+ case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; -+ case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; -+ case 't': args->targets = optarg; break; -+ case 'T': args->targets = optarg; args->targets_is_file = 1; break; -+ case 'r': args->regions = optarg; break; -+ case 'R': args->regions = optarg; args->regions_is_file = 1; break; -+ case 'o': args->output_fname = optarg; break; -+ case 'O': -+ switch (optarg[0]) { -+ case 'b': args->output_type = FT_BCF_GZ; break; -+ case 'u': args->output_type = FT_BCF; break; -+ case 'z': args->output_type = FT_VCF_GZ; break; -+ case 'v': args->output_type = FT_VCF; break; -+ default: error("The output type \"%s\" not recognised\n", optarg); -+ } -+ break; -+ case 'h': -+ case '?': -+ default: error("%s", usage_text()); break; -+ } -+ } -+ if ( args->drop_sites && args->format_str ) error("Error: the -x behavior is the default (and only supported) with -f\n"); -+ if ( args->all_fields_delim && !args->format_str ) error("Error: the -A option must be used with -f\n"); -+ if ( args->severity && (!strcmp("?",args->severity) || !strcmp("-",args->severity)) ) error("%s", default_severity()); -+ if ( optind==argc ) -+ { -+ if ( !isatty(fileno((FILE *)stdin)) ) args->fname = "-"; // reading from stdin -+ else { error("%s", usage_text()); } -+ } -+ else if ( optind+1!=argc ) error("%s", usage_text()); -+ else args->fname = argv[optind]; -+ -+ init_data(args); -+ -+ if ( args->list_hdr ) -+ list_header(args); -+ else -+ { -+ if ( !args->format_str && !args->column_str ) -+ { -+ if 
( args->min_severity==SELECT_CSQ_ANY && args->max_severity==SELECT_CSQ_ANY ) -+ error("Error: none of the -c,-f,-s options was given, why not use \"bcftools view\" instead?\n"); -+ else if ( !args->drop_sites ) -+ error("Error: when the -s option is used without -x, everything is printed; why not use \"bcftools view\" instead?\n"); -+ } -+ -+ if ( args->format_str ) -+ args->fh_bgzf = bgzf_open(args->output_fname, args->output_type&FT_GZ ? "wg" : "wu"); -+ else -+ { -+ args->fh_vcf = hts_open(args->output_fname, hts_bcf_wmode(args->output_type)); -+ if ( bcf_hdr_write(args->fh_vcf, args->hdr_out)!=0 ) error("Failed to write the header to %s\n", args->output_fname); -+ } -+ while ( bcf_sr_next_line(args->sr) ) -+ process_record(args, bcf_sr_get_line(args->sr,0)); -+ } -+ -+ destroy_data(args); -+ -+ return 0; -+} ---- /dev/null -+++ python-pysam/bcftools/plugins/split-vep.c.pysam.c -@@ -0,0 +1,936 @@ -+#include "bcftools.pysam.h" -+ -+/* The MIT License -+ -+ Copyright (c) 2019 Genome Research Ltd. -+ -+ Author: Petr Danecek -+ -+ Permission is hereby granted, free of charge, to any person obtaining a copy -+ of this software and associated documentation files (the "Software"), to deal -+ in the Software without restriction, including without limitation the rights -+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -+ copies of the Software, and to permit persons to whom the Software is -+ furnished to do so, subject to the following conditions: -+ -+ The above copyright notice and this permission notice shall be included in -+ all copies or substantial portions of the Software. -+ -+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -+ THE SOFTWARE. -+ -+ */ -+ -+#include -+#include -+#include -+#include -+#include // for isatty -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include "../bcftools.h" -+#include "../filter.h" -+#include "../convert.h" -+#include "../cols.h" -+ -+ -+// Logic of the filters: include or exclude sites which match the filters? -+#define FLT_INCLUDE 1 -+#define FLT_EXCLUDE 2 -+ -+#define SELECT_TR_ALL 0 -+#define SELECT_TR_WORST 1 -+#define SELECT_TR_PRIMARY 2 -+#define SELECT_CSQ_ANY -1 -+ -+typedef struct -+{ -+ char *field; // the name of the VEP field, e.g. Consequence,Gene,etc. -+ char *tag; // the name of the VCF tag: the annot_t.field with the -p prefix -+ int idx; // 0-based index within the VEP annotation string -+ int type; // annotation type, one of the BCF_HT_* types -+ kstring_t str; // annotation value, ready to pass to bcf_update_info_* -+} -+annot_t; -+ -+typedef struct -+{ -+ convert_t *convert; -+ filter_t *filter; -+ int argc, filter_logic, regions_is_file, targets_is_file, list_hdr; -+ kstring_t kstr; -+ char *filter_str, -+ *vep_tag; // the --annotation INFO tag to process -+ char **argv, *output_fname, *fname, *regions, *targets, *format_str; -+ int output_type; -+ htsFile *fh_vcf; -+ BGZF *fh_bgzf; -+ bcf_srs_t *sr; -+ bcf_hdr_t *hdr, *hdr_out; -+ int nfield; // number of all available VEP fields -+ char **field; // list of all available VEP fields -+ int nannot; // number of requested fields -+ annot_t *annot; // requested fields -+ int nscale; // number of items in the severity scale -+ char **scale; // severity scale (list) -+ int ncsq_str; // the length of csq_str allocated by bcf_get_info_string() -+ char *csq_str; // the current 
bcf_get_info_string() result -+ int csq_idx, // the index of the Consequence field; for the --select CSQ option -+ primary_id; // the index of the CANONICAL field; for the --select TR option -+ char *severity, // the --severity scale option -+ *select, // the --select option -+ *column_str, // the --columns option -+ *annot_prefix; // the --annot-prefix option -+ void *field2idx, // VEP field name to index, used in initialization -+ *csq2severity; // consequence type to severity score -+ cols_t *cols_tr, // the current CSQ tag split into transcripts -+ *cols_csq; // the current CSQ transcript split into fields -+ int min_severity, max_severity; // ignore consequences outside this severity range -+ int drop_sites; // the -x, --drop-sites option -+ int select_tr; // one of SELECT_TR_* -+ uint8_t *smpl_pass; // for filtering at sample level, used with -f -+ int duplicate; // the -d, --duplicate option is set -+ char *all_fields_delim; // the -A, --all-fields option is set -+ float *farr; // helper arrays for bcf_update_* functions -+ int32_t *iarr; -+ int niarr,miarr, nfarr,mfarr; -+} -+args_t; -+ -+args_t args; -+ -+const char *about(void) -+{ -+ return "Query structured annotations such as the CSQ created by VEP.\n"; -+} -+ -+static const char *default_severity(void) -+{ -+ return -+ "# Default consequence substrings ordered in ascending order by severity.\n" -+ "# Consequences with the same severity can be put on the same line in arbitrary order.\n" -+ "intergenic\n" -+ "downstream upstream\n" -+ "intron\n" -+ "non_coding\n" -+ "regulatory\n" -+ "5_prime_utr 3_prime_utr\n" -+ "stop_retained start_retained synonymous\n" -+ "splice_region\n" -+ "coding_sequence\n" -+ "missense\n" -+ "inframe\n" -+ "exon_loss\n" -+ "disruptive\n" -+ "splice_acceptor splice_donor\n" -+ "start_lost stop_lost stop_gained frameshift\n"; -+} -+static const char *usage_text(void) -+{ -+ return -+ "\n" -+ "About: Query structured annotations such INFO/CSQ created by bcftools/csq or VEP. 
For more\n" -+ " more information and pointers see http://samtools.github.io/bcftools/howtos/plugin.split-vep.html\n" -+ "Usage: bcftools +split-vep [Plugin Options]\n" -+ "Plugin options:\n" -+ " -a, --annotation STR INFO annotation to parse [CSQ]\n" -+ " -A, --all-fields DELIM Output all fields replacing the -a tag (\"%CSQ\" by default) in the -f\n" -+ " filtering expression using the output field delimiter DELIM. This can be\n" -+ " \"tab\", \"space\" or an arbitrary string.\n" -+ " -c, --columns LIST[:type] Extract the fields listed either as indexes or names. The default type\n" -+ " of the new annotation is String but can be also Integer/Int or Float/Real.\n" -+ " -d, --duplicate Output per transcript/allele consequences on a new line rather rather than\n" -+ " as comma-separated fields on a single line\n" -+ " -f, --format Formatting expression for non-VCF/BCF output, same as `bcftools query -f`\n" -+ " -l, --list Parse the VCF header and list the annotation fields\n" -+ " -p, --annot-prefix Prefix of INFO annotations to be created after splitting the CSQ string\n" -+ " -s, --select TR:CSQ Select transcripts to extract by type and/or consequence. (See also the -x switch.)\n" -+ " TR, transcript: worst,primary(*),all [all]\n" -+ " CSQ, consequence: any,missense,missense+,etc [any]\n" -+ " (*) Primary transcripts have the field \"CANONICAL\" set to \"YES\"\n" -+ " -S, --severity -|FILE Pass \"-\" to print the default severity scale or FILE to override\n" -+ " the default scale\n" -+ " -x, --drop-sites Drop sites with none of the consequences matching the severity specified by -s.\n" -+ " This switch is intended for use with VCF/BCF output (i.e. 
-f not given).\n" -+ "Common options:\n" -+ " -e, --exclude EXPR Exclude sites and samples for which the expression is true\n" -+ " -i, --include EXPR Include sites and samples for which the expression is true\n" -+ " -o, --output FILE Output file name [bcftools_stdout]\n" -+ " -O, --output-type b|u|z|v b: compressed BCF, u: uncompressed BCF, z: compressed VCF or text, v: uncompressed VCF or text [v]\n" -+ " -r, --regions REG Restrict to comma-separated list of regions\n" -+ " -R, --regions-file FILE Restrict to regions listed in a file\n" -+ " -t, --targets REG Similar to -r but streams rather than index-jumps\n" -+ " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n" -+ "\n" -+ "Examples:\n" -+ " # List available fields of the INFO/CSQ annotation\n" -+ " bcftools +split-vep -l file.vcf.gz\n" -+ "\n" -+ " # List the default severity scale\n" -+ " bcftools +split-vep -S -\n" -+ "\n" -+ " # Extract Consequence, IMPACT and gene SYMBOL of the most severe consequence into\n" -+ " # INFO annotations starting with the prefix \"vep\". 
For brevity, the columns can\n" -+ " # be given also as 0-based indexes\n" -+ " bcftools +split-vep -c Consequence,IMPACT,SYMBOL -s worst -p vep file.vcf.gz\n" -+ " bcftools +split-vep -c 1-3 -s worst -p vep file.vcf.gz\n" -+ "\n" -+ " # Same as above but use the text output of the \"bcftools query\" format\n" -+ " bcftools +split-vep -s worst -f '%CHROM %POS %Consequence %IMPACT %SYMBOL\\n' file.vcf.gz\n" -+ "\n" -+ " # Print all subfields (tab-delimited) in place of %CSQ, each consequence on a new line\n" -+ " bcftools +split-vep -f '%CHROM %POS %CSQ\\n' -d -A tab file.vcf.gz\n" -+ "\n" -+ " # Extract gnomAD_AF subfield into a new INFO/gnomAD_AF annotation of Type=Float so that\n" -+ " # numeric filtering can be used.\n" -+ " bcftools +split-vep -c gnomAD_AF:Float file.vcf.gz -i'gnomAD_AF<0.001'\n" -+ "\n" -+ " # Similar to above, but add the annotation only if the consequence severity is missense\n" -+ " # or equivalent. In order to drop sites with different consequences completely, we add\n" -+ " # the -x switch. See the online documentation referenced above for more examples.\n" -+ " bcftools +split-vep -c gnomAD_AF:Float -s :missense file.vcf.gz\n" -+ " bcftools +split-vep -c gnomAD_AF:Float -s :missense -x file.vcf.gz\n" -+ "\n"; -+} -+ -+static void expand_csq_expression(args_t *args, kstring_t *str) -+{ -+ if ( !args->all_fields_delim ) return; -+ -+ str->l = 0; -+ kputc('%',str); -+ kputs(args->vep_tag,str); -+ char *ptr = strstr(args->format_str,str->s); -+ if ( !ptr ) return; -+ char *end = ptr + str->l, tmp = *end; -+ if ( isalnum(tmp) || tmp=='_' || tmp=='.' 
) return; -+ *end = 0; -+ -+ str->l = 0; -+ kputsn(args->format_str, ptr - args->format_str, str); -+ -+ int i; -+ for (i=0; infield; i++) -+ { -+ if ( i>0 ) kputs(args->all_fields_delim, str); -+ kputc('%', str); -+ kputs(args->field[i], str); -+ } -+ -+ *end = tmp; -+ kputs(end, str); -+ -+ free(args->format_str); -+ args->format_str = str->s; -+ str->l = str->m = 0; -+ str->s = NULL; -+} -+ -+static void init_data(args_t *args) -+{ -+ args->sr = bcf_sr_init(); -+ if ( args->regions ) -+ { -+ args->sr->require_index = 1; -+ if ( bcf_sr_set_regions(args->sr, args->regions, args->regions_is_file)<0 ) error("Failed to read the regions: %s\n",args->regions); -+ } -+ if ( args->targets && bcf_sr_set_targets(args->sr, args->targets, args->targets_is_file, 0)<0 ) error("Failed to read the targets: %s\n",args->targets); -+ if ( !bcf_sr_add_reader(args->sr,args->fname) ) error("Error: %s\n", bcf_sr_strerror(args->sr->errnum)); -+ args->hdr = bcf_sr_get_header(args->sr,0); -+ args->hdr_out = bcf_hdr_dup(args->hdr); -+ -+ // Parse the header CSQ line, must contain Description with "Format: ..." 
declaration -+ bcf_hrec_t *hrec = bcf_hdr_get_hrec(args->hdr, BCF_HL_INFO, NULL, args->vep_tag, NULL); -+ if ( !hrec ) error("The tag INFO/%s not found in the header\n", args->vep_tag); -+ int ret = bcf_hrec_find_key(hrec, "Description"); -+ if ( ret<0 ) error("No \"Description\" field was found for the tag INFO/%s in the header\n", args->vep_tag); -+ char *format = strstr(hrec->vals[ret], "Format: "); -+ if ( !format ) error("Expected \"Format: \" substring in the header INFO/%s/Description, found: %s\n", args->vep_tag,hrec->vals[ret]); -+ format += 8; -+ char *ep = format; -+ while ( *ep ) -+ { -+ char *bp = ep; -+ while ( *ep && *ep!='|' ) ep++; -+ char tmp = *ep; -+ *ep = 0; -+ args->nfield++; -+ args->field = (char**)realloc(args->field,args->nfield*sizeof(*args->field)); -+ args->field[args->nfield-1] = strdup(bp); -+ if ( !tmp ) break; -+ ep++; -+ } -+ if ( !args->nfield ) error("Could not parse Description of INFO/%s: %s\n", args->vep_tag,hrec->vals[ret]); -+ int len = strlen(args->field[args->nfield-1]); -+ if ( args->field[args->nfield-1][len-1]=='"' ) args->field[args->nfield-1][len-1] = 0; // remove the trailing doublequote character -+ args->field2idx = khash_str2int_init(); -+ int i,j; -+ for (i=0; infield; i++) -+ { -+ if ( khash_str2int_has_key(args->field2idx, args->field[i]) ) -+ { -+ fprintf(bcftools_stderr,"Warning: duplicate INFO/%s key \"%s\"\n", args->vep_tag,args->field[i]); -+ continue; -+ } -+ khash_str2int_set(args->field2idx, args->field[i], i); -+ } -+ -+ // Create a text output as with `bcftools query -f`. 
For this we need to determine the fields to be extracted -+ // from the formatting expression -+ kstring_t str = {0,0,0}; -+ if ( args->format_str && !args->column_str ) -+ { -+ // Special case: -A was given, extract all fields, for this the -a tag (%CSQ) must be present -+ if ( args->all_fields_delim ) expand_csq_expression(args, &str); -+ -+ for (i=0; infield; i++) -+ { -+ str.l = 0; -+ kputc('%',&str); -+ kputs(args->field[i],&str); -+ char end, *ptr = args->format_str; -+ while ( ptr ) -+ { -+ ptr = strstr(ptr,str.s); -+ if ( !ptr ) break; -+ end = ptr[str.l]; -+ if ( isalnum(end) || end=='_' || end=='.' ) -+ { -+ ptr++; -+ continue; -+ } -+ break; -+ } -+ if ( !ptr ) continue; -+ ptr[str.l] = 0; -+ int tag_id = bcf_hdr_id2int(args->hdr, BCF_DT_ID, ptr+1); -+ if ( bcf_hdr_idinfo_exists(args->hdr,BCF_HL_INFO,tag_id) ) -+ fprintf(bcftools_stderr,"Note: ambigous key %s, using the %s subfield of %s, not the INFO/%s tag\n", ptr,ptr+1,args->vep_tag,ptr+1); -+ -+ int olen = args->column_str ? 
strlen(args->column_str) : 0; -+ int nlen = strlen(ptr) - 1; -+ args->column_str = (char*)realloc(args->column_str, olen + nlen + 2); -+ if ( olen ) -+ { -+ memcpy(args->column_str+olen,",",1); -+ olen++; -+ } -+ memcpy(args->column_str+olen,ptr+1,nlen); -+ args->column_str[olen+nlen] = 0; -+ -+ ptr[str.l] = end; -+ } -+ } -+ -+ // The "Consequence" column to look up severity, its name is hardwired for now -+ if ( khash_str2int_get(args->field2idx,"Consequence",&args->csq_idx)!=0 ) -+ error("The field \"Consequence\" is not present in INFO/%s: %s\n", args->vep_tag,hrec->vals[ret]); -+ -+ // Columns to extract: given as names, 0-based indexes or ranges of indexes -+ if ( args->column_str ) -+ { -+ int *column = NULL; -+ int *types = NULL; -+ ep = args->column_str; -+ while ( *ep ) -+ { -+ char *tp, *bp = ep; -+ while ( *ep && *ep!=',' ) ep++; -+ char tmp = *ep; -+ *ep = 0; -+ int type = BCF_HT_STR; -+ int idx_beg, idx_end; -+ if ( khash_str2int_get(args->field2idx, bp, &idx_beg)==0 ) -+ idx_end = idx_beg; -+ else if ( (tp=strrchr(bp,':')) ) -+ { -+ *tp = 0; -+ if ( khash_str2int_get(args->field2idx, bp, &idx_beg)!=0 ) -+ { -+ *tp = ':'; -+ error("No such column: \"%s\"\n", bp); -+ } -+ idx_end = idx_beg; -+ *tp = ':'; -+ if ( !strcasecmp(tp+1,"string") ) type = BCF_HT_STR; -+ else if ( !strcasecmp(tp+1,"float") || !strcasecmp(tp+1,"real") ) type = BCF_HT_REAL; -+ else if ( !strcasecmp(tp+1,"integer") || !strcasecmp(tp+1,"int") ) type = BCF_HT_INT; -+ else if ( !strcasecmp(tp+1,"flag") ) type = BCF_HT_FLAG; -+ else error("The type \"%s\" (or column \"%s\"?) 
not recognised\n", tp+1,bp); -+ } -+ else -+ { -+ char *mp; -+ idx_beg = strtol(bp,&mp,10); -+ if ( !*mp ) idx_end = idx_beg; -+ else if ( *mp=='-' ) -+ idx_end = strtol(mp+1,&mp,10); -+ if ( *mp ) -+ { -+ if ( *mp==':' ) -+ { -+ idx_end = idx_beg; -+ if ( !strcasecmp(mp+1,"string") ) type = BCF_HT_STR; -+ else if ( !strcasecmp(mp+1,"float") || !strcasecmp(mp+1,"real") ) type = BCF_HT_REAL; -+ else if ( !strcasecmp(mp+1,"integer") || !strcasecmp(mp+1,"int") ) type = BCF_HT_INT; -+ else if ( !strcasecmp(mp+1,"flag") ) type = BCF_HT_FLAG; -+ else error("The type \"%s\" (or column \"%s\"?) not recognised\n", mp+1,bp); -+ } -+ else -+ error("No such column: \"%s\"\n", bp); -+ } -+ } -+ -+ i = args->nannot; -+ args->nannot += idx_end - idx_beg + 1; -+ column = (int*)realloc(column,args->nannot*sizeof(*column)); -+ types = (int*)realloc(types,args->nannot*sizeof(*types)); -+ for (j=idx_beg; j<=idx_end; j++) -+ { -+ if ( j >= args->nfield ) error("The index is too big: %d\n", j); -+ column[i] = j; -+ types[i] = type; -+ i++; -+ } -+ if ( !tmp ) break; -+ ep++; -+ } -+ args->annot = (annot_t*)calloc(args->nannot,sizeof(*args->annot)); -+ int len = args->annot_prefix ? 
strlen(args->annot_prefix) : 0; -+ for (i=0; inannot; i++) -+ { -+ annot_t *ann = &args->annot[i]; -+ ann->type = types[i]; -+ ann->idx = j = column[i]; -+ ann->field = strdup(args->field[j]); -+ int clen = strlen(args->field[j]); -+ ann->tag = (char*)malloc(clen+len+1); -+ if ( len ) memcpy(ann->tag,args->annot_prefix,len); -+ memcpy(ann->tag+len,ann->field,clen); -+ ann->tag[len+clen] = 0; -+ args->kstr.l = 0; -+ char *type = "String"; -+ if ( ann->type==BCF_HT_REAL ) type = "Float"; -+ else if ( ann->type==BCF_HT_INT ) type = "Integer"; -+ else if ( ann->type==BCF_HT_FLAG ) type = "Flag"; -+ ksprintf(&args->kstr,"##INFO=",type); -+ bcf_hdr_printf(args->hdr_out, args->kstr.s, ann->tag,ann->field,args->vep_tag); -+ } -+ free(column); -+ free(types); -+ -+ if ( bcf_hdr_sync(args->hdr_out)<0 ) -+ error_errno("[%s] Failed to update header", __func__); -+ } -+ if ( args->format_str ) -+ { -+ if ( !args->column_str && !args->select ) error("Error: No %s field selected in the formatting expression and -s not given: a typo?\n",args->vep_tag); -+ args->convert = convert_init(args->hdr_out, NULL, 0, args->format_str); -+ if ( !args->convert ) error("Could not parse the expression: %s\n", args->format_str); -+ } -+ if ( args->filter_str ) -+ { -+ int max_unpack = args->convert ? 
convert_max_unpack(args->convert) : 0; -+ args->filter = filter_init(args->hdr_out, args->filter_str); -+ max_unpack |= filter_max_unpack(args->filter); -+ args->sr->max_unpack = max_unpack; -+ if ( max_unpack & BCF_UN_FMT ) -+ convert_set_option(args->convert, subset_samples, &args->smpl_pass); -+ } -+ -+ // Severity scale -+ args->csq2severity = khash_str2int_init(); -+ int severity = 0; -+ str.l = 0; -+ if ( args->severity ) -+ { -+ kstring_t tmp = {0,0,0}; -+ htsFile *fp = hts_open(args->severity,"r"); -+ if ( !fp ) error("Cannot read %s\n", args->severity); -+ while ( hts_getline(fp, KS_SEP_LINE, &tmp) > 0 ) -+ { -+ kputs(tmp.s, &str); -+ kputc('\n', &str); -+ } -+ free(tmp.s); -+ } -+ else -+ kputs(default_severity(),&str); -+ ep = str.s; -+ while ( *ep ) -+ { -+ if ( *ep=='#' ) -+ { -+ while ( *ep && *ep!='\n' ) { *ep = tolower(*ep); ep++; } -+ if ( !*ep ) break; -+ ep++; -+ continue; -+ } -+ char *bp = ep; -+ while ( *ep && !isspace(*ep) ) { *ep = tolower(*ep); ep++; } -+ char tmp = *ep; -+ *ep = 0; -+ args->nscale++; -+ args->scale = (char**) realloc(args->scale,args->nscale*sizeof(*args->scale)); -+ args->scale[args->nscale-1] = strdup(bp); -+ if ( !khash_str2int_has_key(args->csq2severity,args->scale[args->nscale-1]) ) -+ khash_str2int_set(args->csq2severity,args->scale[args->nscale-1], severity); -+ if ( !tmp ) break; -+ if ( tmp=='\n' ) severity++; -+ ep++; -+ while ( *ep && isspace(*ep) ) ep++; -+ } -+ free(str.s); -+ -+ // Transcript and/or consequence selection -+ if ( !args->select ) args->select = "all:any"; -+ cols_t *cols = cols_split(args->select, NULL, ':'); -+ char *sel_tr = cols->off[0][0] ? cols->off[0] : "all"; -+ char *sel_csq = cols->n==2 && cols->off[1][0] ? 
cols->off[1] : "any"; -+ if ( !strcasecmp(sel_tr,"all") ) args->select_tr = SELECT_TR_ALL; -+ else if ( !strcasecmp(sel_tr,"worst") ) args->select_tr = SELECT_TR_WORST; -+ else if ( !strcasecmp(sel_tr,"primary") ) args->select_tr = SELECT_TR_PRIMARY; -+ else error("Error: the transcript selection key \"%s\" is not recognised.\n", sel_tr); -+ if ( !strcasecmp(sel_csq,"any") ) { args->min_severity = args->max_severity = SELECT_CSQ_ANY; } // to avoid unnecessary lookups -+ else -+ { -+ int len = strlen(sel_csq); -+ int severity, modifier = '='; -+ if ( sel_csq[len-1]=='+' ) { modifier = '+'; sel_csq[len-1] = 0; } -+ else if ( sel_csq[len-1]=='-' ) { modifier = '-'; sel_csq[len-1] = 0; } -+ if ( khash_str2int_get(args->csq2severity, sel_csq, &severity)!=0 ) -+ error("Error: the consequence \"%s\" is not recognised. Run \"bcftools +split-vep -S ?\" to see the default list.\n", sel_csq); -+ if ( modifier=='=' ) { args->min_severity = severity; args->max_severity = severity; } -+ else if ( modifier=='+' ) { args->min_severity = severity; args->max_severity = INT_MAX; } -+ else if ( modifier=='-' ) { args->min_severity = 0; args->max_severity = severity; } -+ } -+ cols_destroy(cols); -+ -+ // The 'CANONICAL' column to look up severity, its name is hardwired for now -+ if ( args->select_tr==SELECT_TR_PRIMARY && khash_str2int_get(args->field2idx,"CANONICAL",&args->primary_id)!=0 ) -+ error("The primary transcript was requested but the field \"CANONICAL\" is not present in INFO/%s: %s\n",args->vep_tag,hrec->vals[ret]); -+} -+static void destroy_data(args_t *args) -+{ -+ free(args->farr); -+ free(args->iarr); -+ free(args->kstr.s); -+ free(args->column_str); -+ free(args->format_str); -+ cols_destroy(args->cols_csq); -+ cols_destroy(args->cols_tr); -+ int i; -+ for (i=0; inscale; i++) free(args->scale[i]); -+ free(args->scale); -+ for (i=0; infield; i++) free(args->field[i]); -+ free(args->field); -+ for (i=0; inannot; i++) -+ { -+ annot_t *ann = &args->annot[i]; -+ 
free(ann->field); -+ free(ann->tag); -+ free(ann->str.s); -+ } -+ free(args->annot); -+ if ( args->field2idx ) khash_str2int_destroy(args->field2idx); -+ if ( args->csq2severity ) khash_str2int_destroy(args->csq2severity); -+ bcf_sr_destroy(args->sr); -+ bcf_hdr_destroy(args->hdr_out); -+ free(args->csq_str); -+ if ( args->filter ) filter_destroy(args->filter); -+ if ( args->convert ) convert_destroy(args->convert); -+ if ( args->fh_vcf && hts_close(args->fh_vcf)!=0 ) error("Error: close failed .. %s\n",args->output_fname); -+ if ( args->fh_bgzf && bgzf_close(args->fh_bgzf)!=0 ) error("Error: close failed .. %s\n",args->output_fname); -+ free(args); -+} -+static void list_header(args_t *args) -+{ -+ int i; -+ for (i=0; infield; i++) fprintf(bcftools_stdout, "%d\t%s\n", i,args->field[i]); -+} -+ -+static void csq_to_severity(args_t *args, char *csq, int *min_severity, int *max_severity, int exact_match) -+{ -+ *min_severity = INT_MAX; -+ *max_severity = -1; -+ char *ep = csq; -+ while ( *ep ) -+ { -+ char *bp = ep; -+ while ( *ep && *ep!='&' ) { *ep = tolower(*ep); ep++; } -+ char tmp = *ep; -+ *ep = 0; -+ -+ int i, severity = -1; -+ if ( khash_str2int_get(args->csq2severity, bp, &severity)!=0 ) -+ { -+ for (i=0; inscale; i++) -+ if ( strstr(bp,args->scale[i]) ) break; -+ -+ if ( i!=args->nscale ) -+ khash_str2int_get(args->csq2severity, args->scale[i], &severity); -+ else -+ severity = args->nscale + 1; -+ -+ args->nscale++; -+ args->scale = (char**) realloc(args->scale,args->nscale*sizeof(*args->scale)); -+ args->scale[args->nscale-1] = strdup(bp); -+ khash_str2int_set(args->csq2severity,args->scale[args->nscale-1], severity); -+ if ( i==args->nscale ) -+ fprintf(bcftools_stderr,"Note: assigning a (high) severity score to a new consequence, use -S to override: %s -> %d\n",args->scale[args->nscale-1],args->nscale); -+ -+ if ( khash_str2int_get(args->csq2severity, bp, &severity)!=0 ) error("FIXME: failed to look up the consequence \"%s\"\n", bp); -+ } -+ if ( 
exact_match < 0 ) -+ { -+ if ( *min_severity > severity ) *min_severity = severity; -+ if ( *max_severity < severity ) *max_severity = severity; -+ } -+ else -+ { -+ if ( severity==exact_match ) -+ { -+ *min_severity = *max_severity = severity; -+ *ep = tmp; -+ return; -+ } -+ } -+ -+ if ( !tmp ) break; -+ *ep = tmp; -+ ep++; -+ } -+} -+ -+static int csq_severity_pass(args_t *args, char *csq) -+{ -+ if ( args->min_severity==args->max_severity && args->min_severity==SELECT_CSQ_ANY ) return 1; -+ -+ int min_severity, max_severity, exact_match = args->min_severity==args->max_severity ? args->min_severity : -1; -+ csq_to_severity(args, csq, &min_severity, &max_severity, exact_match); -+ if ( max_severity < args->min_severity ) return 0; -+ if ( min_severity > args->max_severity ) return 0; -+ return 1; -+} -+ -+static int get_primary_transcript(args_t *args, bcf1_t *rec, cols_t *cols_tr) // modifies args->cols_csq! -+{ -+ int i; -+ for (i=0; in; i++) -+ { -+ args->cols_csq = cols_split(cols_tr->off[i], args->cols_csq, '|'); -+ if ( args->primary_id >= args->cols_csq->n ) -+ error("Too few columns at %s:%"PRId64" .. %d (Consequence) >= %d\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1,args->primary_id,args->cols_csq->n); -+ if ( !strcmp("YES",args->cols_csq->off[args->primary_id]) ) return i; -+ } -+ return -1; -+} -+static int get_worst_transcript(args_t *args, bcf1_t *rec, cols_t *cols_tr) // modifies args->cols_csq! -+{ -+ int i, max_severity = -1, imax_severity = 0; -+ for (i=0; in; i++) -+ { -+ args->cols_csq = cols_split(cols_tr->off[i], args->cols_csq, '|'); -+ if ( args->csq_idx >= args->cols_csq->n ) -+ error("Too few columns at %s:%"PRId64" .. 
%d (Consequence) >= %d\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1,args->csq_idx,args->cols_csq->n); -+ char *csq = args->cols_csq->off[args->csq_idx]; -+ -+ int min, max; -+ csq_to_severity(args, csq, &min, &max, -1); -+ if ( max_severity < max ) { imax_severity = i; max_severity = max; } -+ } -+ return imax_severity; -+} -+static void annot_reset(annot_t *annot, int nannot) -+{ -+ int i; -+ for (i=0; istr.l ) kputc(',',&ann->str); -+ kputs(value, &ann->str); -+} -+static inline void parse_array_real(char *str, float **arr, int *marr, int *narr) -+{ -+ char *bp = str, *ep; -+ float *ptr = *arr; -+ int i, n = 1, m = *marr; -+ for (i=0; *bp; bp++) -+ if ( *bp == ',' ) n++; -+ -+ hts_expand(float*,n,m,ptr); -+ -+ i = 0; -+ bp = str; -+ while ( *bp ) -+ { -+ ptr[i] = strtod(bp, &ep); -+ if ( bp==ep ) -+ bcf_float_set_missing(ptr[i]); -+ i++; -+ while ( *ep && *ep!=',' ) ep++; -+ bp = *ep ? ep + 1 : ep; -+ } -+ *narr = i; -+ *marr = m; -+ *arr = ptr; -+} -+static inline void parse_array_int32(char *str, int **arr, int *marr, int *narr) -+{ -+ char *bp = str, *ep; -+ int32_t *ptr = *arr; -+ int i, n = 1, m = *marr; -+ for (i=0; *bp; bp++) -+ if ( *bp == ',' ) n++; -+ -+ hts_expand(int32_t*,n,m,ptr); -+ -+ i = 0; -+ bp = str; -+ while ( *bp ) -+ { -+ ptr[i] = strtol(bp, &ep, 10); -+ if ( bp==ep ) -+ ptr[i] = bcf_int32_missing; -+ i++; -+ while ( *ep && *ep!=',' ) ep++; -+ bp = *ep ? 
ep + 1 : ep; -+ } -+ *narr = i; -+ *marr = m; -+ *arr = ptr; -+} -+static void filter_and_output(args_t *args, bcf1_t *rec, int severity_pass, int all_missing) -+{ -+ int i, updated = 0; -+ for (i=0; inannot; i++) -+ { -+ annot_t *ann = &args->annot[i]; -+ if ( !ann->str.l ) continue; -+ if ( ann->type==BCF_HT_REAL ) -+ { -+ parse_array_real(ann->str.s,&args->farr,&args->mfarr,&args->nfarr); -+ bcf_update_info_float(args->hdr_out,rec,ann->tag,args->farr,args->nfarr); -+ } -+ else if ( ann->type==BCF_HT_INT ) -+ { -+ parse_array_int32(ann->str.s,&args->iarr,&args->miarr,&args->niarr); -+ bcf_update_info_int32(args->hdr_out,rec,ann->tag,args->iarr,args->niarr); -+ } -+ else -+ bcf_update_info_string(args->hdr_out,rec,ann->tag,ann->str.s); -+ updated++; -+ } -+ if ( args->filter ) -+ { -+ int pass = filter_test(args->filter, rec, (const uint8_t**) &args->smpl_pass); -+ if ( args->filter_logic & FLT_EXCLUDE ) pass = pass ? 0 : 1; -+ if ( !pass ) return; -+ } -+ if ( args->format_str ) -+ { -+ if ( args->nannot ) -+ { -+ if ( !updated || all_missing ) return; // the standard case: using -f to print the CSQ subfields, skipping if missing -+ } -+ else -+ { -+ if ( !severity_pass ) return; // request to print only non-CSQ tags at sites that pass severity -+ } -+ -+ args->kstr.l = 0; -+ convert_line(args->convert, rec, &args->kstr); -+ if ( args->kstr.l && bgzf_write(args->fh_bgzf, args->kstr.s, args->kstr.l)!=args->kstr.l ) -+ error("Failed to write to %s\n", args->output_fname); -+ return; -+ } -+ if ( bcf_write(args->fh_vcf, args->hdr_out,rec)!=0 ) -+ error("Failed to write to %s\n", args->output_fname); -+} -+static void process_record(args_t *args, bcf1_t *rec) -+{ -+ int len = bcf_get_info_string(args->hdr,rec,args->vep_tag,&args->csq_str,&args->ncsq_str); -+ if ( len<=0 ) return; -+ -+ args->cols_tr = cols_split(args->csq_str, args->cols_tr, ','); -+ -+ int i,j, itr_min = 0, itr_max = args->cols_tr->n - 1; -+ if ( args->select_tr==SELECT_TR_PRIMARY ) -+ { -+ itr_min 
= itr_max = get_primary_transcript(args, rec, args->cols_tr); -+ if ( itr_min<0 ) itr_max = itr_min - 1; -+ } -+ else if ( args->select_tr==SELECT_TR_WORST ) -+ itr_min = itr_max = get_worst_transcript(args, rec, args->cols_tr); -+ -+ annot_reset(args->annot, args->nannot); -+ int severity_pass = 0; // consequence severity requested via the -s option (BCF record may be output but not annotated) -+ int all_missing = 1; // transcripts with all requested annotations missing will be discarded if -f was given -+ static int too_few_fields_warned = 0; -+ for (i=itr_min; i<=itr_max; i++) -+ { -+ args->cols_csq = cols_split(args->cols_tr->off[i], args->cols_csq, '|'); -+ if ( args->csq_idx >= args->cols_csq->n ) -+ error("Too few columns at %s:%"PRId64" .. %d (Consequence) >= %d\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1,args->csq_idx,args->cols_csq->n); -+ -+ char *csq = args->cols_csq->off[args->csq_idx]; -+ if ( !csq_severity_pass(args, csq) ) continue; -+ severity_pass = 1; -+ -+ for (j=0; jnannot; j++) -+ { -+ annot_t *ann = &args->annot[j]; -+ if ( ann->idx >= args->cols_csq->n ) -+ { -+ if ( !too_few_fields_warned ) -+ { -+ fprintf(bcftools_stderr, "Warning: fewer %s fields than expected at %s:%"PRId64", filling with dots. 
This warning is printed only once.\n", args->vep_tag,bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); -+ too_few_fields_warned = 1; -+ } -+ annot_append(ann, "."); -+ continue; -+ } -+ -+ if ( !*args->cols_csq->off[ann->idx] ) -+ annot_append(ann, "."); // missing value -+ else -+ { -+ annot_append(ann, args->cols_csq->off[ann->idx]); -+ all_missing = 0; -+ } -+ } -+ -+ if ( args->duplicate ) -+ { -+ filter_and_output(args, rec, severity_pass, all_missing); -+ annot_reset(args->annot, args->nannot); -+ all_missing = 1; -+ severity_pass = 0; -+ } -+ } -+ if ( !severity_pass && args->drop_sites ) return; -+ if ( !args->duplicate ) -+ filter_and_output(args, rec, severity_pass, all_missing); -+} -+ -+int run(int argc, char **argv) -+{ -+ args_t *args = (args_t*) calloc(1,sizeof(args_t)); -+ args->argc = argc; args->argv = argv; -+ args->output_fname = "-"; -+ args->output_type = FT_VCF; -+ args->vep_tag = "CSQ"; -+ static struct option loptions[] = -+ { -+ {"drop-sites",no_argument,0,'x'}, -+ {"all-fields",no_argument,0,'A'}, -+ {"duplicate",no_argument,0,'d'}, -+ {"format",required_argument,0,'f'}, -+ {"annotation",required_argument,0,'a'}, -+ {"annot-prefix",required_argument,0,'p'}, -+ {"columns",required_argument,0,'c'}, -+ {"select",required_argument,0,'s'}, -+ {"severity",required_argument,0,'S'}, -+ {"list",no_argument,0,'l'}, -+ {"include",required_argument,0,'i'}, -+ {"exclude",required_argument,0,'e'}, -+ {"output",required_argument,NULL,'o'}, -+ {"output-type",required_argument,NULL,'O'}, -+ {"regions",1,0,'r'}, -+ {"regions-file",1,0,'R'}, -+ {"targets",1,0,'t'}, -+ {"targets-file",1,0,'T'}, -+ {NULL,0,NULL,0} -+ }; -+ int c; -+ while ((c = getopt_long(argc, argv, "o:O:i:e:r:R:t:T:lS:s:c:p:a:f:dA:x",loptions,NULL)) >= 0) -+ { -+ switch (c) -+ { -+ case 'A': -+ if ( !strcasecmp(optarg,"tab") ) args->all_fields_delim = "\t"; -+ else if ( !strcasecmp(optarg,"space") ) args->all_fields_delim = " "; -+ else args->all_fields_delim = optarg; -+ break; -+ case 
'x': args->drop_sites = 1; break; -+ case 'd': args->duplicate = 1; break; -+ case 'f': args->format_str = strdup(optarg); break; -+ case 'a': args->vep_tag = optarg; break; -+ case 'p': args->annot_prefix = optarg; break; -+ case 'c': args->column_str = strdup(optarg); break; -+ case 'S': args->severity = optarg; break; -+ case 's': args->select = optarg; break; -+ case 'l': args->list_hdr = 1; break; -+ case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; -+ case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; -+ case 't': args->targets = optarg; break; -+ case 'T': args->targets = optarg; args->targets_is_file = 1; break; -+ case 'r': args->regions = optarg; break; -+ case 'R': args->regions = optarg; args->regions_is_file = 1; break; -+ case 'o': args->output_fname = optarg; break; -+ case 'O': -+ switch (optarg[0]) { -+ case 'b': args->output_type = FT_BCF_GZ; break; -+ case 'u': args->output_type = FT_BCF; break; -+ case 'z': args->output_type = FT_VCF_GZ; break; -+ case 'v': args->output_type = FT_VCF; break; -+ default: error("The output type \"%s\" not recognised\n", optarg); -+ } -+ break; -+ case 'h': -+ case '?': -+ default: error("%s", usage_text()); break; -+ } -+ } -+ if ( args->drop_sites && args->format_str ) error("Error: the -x behavior is the default (and only supported) with -f\n"); -+ if ( args->all_fields_delim && !args->format_str ) error("Error: the -A option must be used with -f\n"); -+ if ( args->severity && (!strcmp("?",args->severity) || !strcmp("-",args->severity)) ) error("%s", default_severity()); -+ if ( optind==argc ) -+ { -+ if ( !isatty(fileno((FILE *)stdin)) ) args->fname = "-"; // reading from stdin -+ else { error("%s", usage_text()); } -+ } -+ else if ( optind+1!=argc ) error("%s", usage_text()); -+ else args->fname = argv[optind]; -+ -+ init_data(args); -+ -+ if ( args->list_hdr ) -+ list_header(args); -+ else -+ { -+ if ( !args->format_str && !args->column_str ) -+ { -+ if 
( args->min_severity==SELECT_CSQ_ANY && args->max_severity==SELECT_CSQ_ANY ) -+ error("Error: none of the -c,-f,-s options was given, why not use \"bcftools view\" instead?\n"); -+ else if ( !args->drop_sites ) -+ error("Error: when the -s option is used without -x, everything is printed; why not use \"bcftools view\" instead?\n"); -+ } -+ -+ if ( args->format_str ) -+ args->fh_bgzf = bgzf_open(args->output_fname, args->output_type&FT_GZ ? "wg" : "wu"); -+ else -+ { -+ args->fh_vcf = hts_open(args->output_fname, hts_bcf_wmode(args->output_type)); -+ if ( bcf_hdr_write(args->fh_vcf, args->hdr_out)!=0 ) error("Failed to write the header to %s\n", args->output_fname); -+ } -+ while ( bcf_sr_next_line(args->sr) ) -+ process_record(args, bcf_sr_get_line(args->sr,0)); -+ } -+ -+ destroy_data(args); -+ -+ return 0; -+} ---- python-pysam.orig/bcftools/plugins/split.c -+++ python-pysam/bcftools/plugins/split.c -@@ -178,26 +178,6 @@ - if ( !nsmpl ) error("No samples to split: %s\n", args->fname); - args->fh = (htsFile**)calloc(nsmpl,sizeof(*args->fh)); - args->bnames = set_file_base_names(args); -- kstring_t str = {0,0,0}; -- for (i=0; ibnames[i] ) continue; -- str.l = 0; -- kputs(args->output_dir, &str); -- if ( str.s[str.l-1] != '/' ) kputc('/', &str); -- int k, l = str.l; -- kputs(args->bnames[i], &str); -- for (k=l; koutput_type & FT_BCF ) kputs(".bcf", &str); -- else if ( args->output_type & FT_GZ ) kputs(".vcf.gz", &str); -- else kputs(".vcf", &str); -- args->fh[i] = hts_open(str.s, hts_bcf_wmode(args->output_type)); -- if ( args->fh[i] == NULL ) error("Can't write to \"%s\": %s\n", str.s, strerror(errno)); -- bcf_hdr_nsamples(args->hdr_out) = 1; -- args->hdr_out->samples[0] = args->bnames[i]; -- bcf_hdr_write(args->fh[i], args->hdr_out); -- } -- free(str.s); - - // parse tags - int is_info = 0, is_fmt = 0; -@@ -235,6 +215,57 @@ - { - args->keep_info = args->keep_fmt = 1; - } -+ if ( !args->keep_fmt && !args->nfmt_tags ) args->keep_fmt = 1; -+ if ( !args->keep_info || 
args->ninfo_tags || args->nfmt_tags ) -+ { -+ int j; -+ for (j=args->hdr_out->nhrec-1; j>=0; j--) -+ { -+ bcf_hrec_t *hrec = args->hdr_out->hrec[j]; -+ if ( hrec->type!=BCF_HL_INFO && hrec->type!=BCF_HL_FMT ) continue; -+ int k = bcf_hrec_find_key(hrec,"ID"); -+ assert( k>=0 ); // this should always be true for valid VCFs -+ int remove = 0; -+ if ( hrec->type==BCF_HL_INFO && (!args->keep_info || args->ninfo_tags) ) -+ { -+ int id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, hrec->vals[k]); -+ if ( !args->keep_info || id >= args->ninfo_tags || !args->info_tags[id] ) remove = 1; -+ } -+ if ( hrec->type==BCF_HL_FMT && args->nfmt_tags ) -+ { -+ int id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, hrec->vals[k]); -+ if ( id >= args->nfmt_tags || !args->fmt_tags[id] ) remove = 1; -+ } -+ if ( remove ) -+ { -+ char *str = strdup(hrec->vals[k]); -+ bcf_hdr_remove(args->hdr_out,hrec->type,str); -+ free(str); -+ } -+ } -+ if ( bcf_hdr_sync(args->hdr_out)!=0 ) error("Failed to update the VCF header\n"); -+ } -+ -+ kstring_t str = {0,0,0}; -+ for (i=0; ibnames[i] ) continue; -+ str.l = 0; -+ kputs(args->output_dir, &str); -+ if ( str.s[str.l-1] != '/' ) kputc('/', &str); -+ int k, l = str.l; -+ kputs(args->bnames[i], &str); -+ for (k=l; koutput_type & FT_BCF ) kputs(".bcf", &str); -+ else if ( args->output_type & FT_GZ ) kputs(".vcf.gz", &str); -+ else kputs(".vcf", &str); -+ args->fh[i] = hts_open(str.s, hts_bcf_wmode(args->output_type)); -+ if ( args->fh[i] == NULL ) error("[%s] Error: cannot write to \"%s\": %s\n", __func__, str.s, strerror(errno)); -+ bcf_hdr_nsamples(args->hdr_out) = 1; -+ args->hdr_out->samples[0] = args->bnames[i]; -+ if ( bcf_hdr_write(args->fh[i], args->hdr_out)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,str.s); -+ } -+ free(str.s); - } - static void destroy_data(args_t *args) - { -@@ -245,7 +276,7 @@ - int i, nsmpl = bcf_hdr_nsamples(args->hdr_in); - for (i=0; ifh[i] && hts_close(args->fh[i])!=0 ) error("Error: close failed!\n"); 
-+ if ( args->fh[i] && hts_close(args->fh[i])!=0 ) error("Error: close failed .. %s\n",args->bnames[i]); - free(args->bnames[i]); - } - free(args->bnames); -@@ -307,7 +338,7 @@ - { - bcf_fmt_t *fmt = &src->d.fmt[i]; - int id = fmt->id; -- if ( !args->keep_fmt && !args->fmt_tags[id] ) continue; -+ if ( !args->keep_fmt && (id>=args->nfmt_tags || !args->fmt_tags[id]) ) continue; - - bcf_enc_int1(&tmp, id); - bcf_enc_size(&tmp, fmt->n, fmt->type); -@@ -343,7 +374,7 @@ - } - if ( !out ) out = rec_set_info(args, rec); - rec_set_format(args, rec, i, out); -- bcf_write(args->fh[i], args->hdr_out, out); -+ if ( bcf_write(args->fh[i], args->hdr_out, out)!=0 ) error("[%s] Error: failed to write the record\n", __func__); - } - if ( out ) bcf_destroy(out); - } ---- python-pysam.orig/bcftools/plugins/split.c.pysam.c -+++ python-pysam/bcftools/plugins/split.c.pysam.c -@@ -180,26 +180,6 @@ - if ( !nsmpl ) error("No samples to split: %s\n", args->fname); - args->fh = (htsFile**)calloc(nsmpl,sizeof(*args->fh)); - args->bnames = set_file_base_names(args); -- kstring_t str = {0,0,0}; -- for (i=0; ibnames[i] ) continue; -- str.l = 0; -- kputs(args->output_dir, &str); -- if ( str.s[str.l-1] != '/' ) kputc('/', &str); -- int k, l = str.l; -- kputs(args->bnames[i], &str); -- for (k=l; koutput_type & FT_BCF ) kputs(".bcf", &str); -- else if ( args->output_type & FT_GZ ) kputs(".vcf.gz", &str); -- else kputs(".vcf", &str); -- args->fh[i] = hts_open(str.s, hts_bcf_wmode(args->output_type)); -- if ( args->fh[i] == NULL ) error("Can't write to \"%s\": %s\n", str.s, strerror(errno)); -- bcf_hdr_nsamples(args->hdr_out) = 1; -- args->hdr_out->samples[0] = args->bnames[i]; -- bcf_hdr_write(args->fh[i], args->hdr_out); -- } -- free(str.s); - - // parse tags - int is_info = 0, is_fmt = 0; -@@ -237,6 +217,57 @@ - { - args->keep_info = args->keep_fmt = 1; - } -+ if ( !args->keep_fmt && !args->nfmt_tags ) args->keep_fmt = 1; -+ if ( !args->keep_info || args->ninfo_tags || args->nfmt_tags ) -+ { -+ int 
j; -+ for (j=args->hdr_out->nhrec-1; j>=0; j--) -+ { -+ bcf_hrec_t *hrec = args->hdr_out->hrec[j]; -+ if ( hrec->type!=BCF_HL_INFO && hrec->type!=BCF_HL_FMT ) continue; -+ int k = bcf_hrec_find_key(hrec,"ID"); -+ assert( k>=0 ); // this should always be true for valid VCFs -+ int remove = 0; -+ if ( hrec->type==BCF_HL_INFO && (!args->keep_info || args->ninfo_tags) ) -+ { -+ int id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, hrec->vals[k]); -+ if ( !args->keep_info || id >= args->ninfo_tags || !args->info_tags[id] ) remove = 1; -+ } -+ if ( hrec->type==BCF_HL_FMT && args->nfmt_tags ) -+ { -+ int id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, hrec->vals[k]); -+ if ( id >= args->nfmt_tags || !args->fmt_tags[id] ) remove = 1; -+ } -+ if ( remove ) -+ { -+ char *str = strdup(hrec->vals[k]); -+ bcf_hdr_remove(args->hdr_out,hrec->type,str); -+ free(str); -+ } -+ } -+ if ( bcf_hdr_sync(args->hdr_out)!=0 ) error("Failed to update the VCF header\n"); -+ } -+ -+ kstring_t str = {0,0,0}; -+ for (i=0; ibnames[i] ) continue; -+ str.l = 0; -+ kputs(args->output_dir, &str); -+ if ( str.s[str.l-1] != '/' ) kputc('/', &str); -+ int k, l = str.l; -+ kputs(args->bnames[i], &str); -+ for (k=l; koutput_type & FT_BCF ) kputs(".bcf", &str); -+ else if ( args->output_type & FT_GZ ) kputs(".vcf.gz", &str); -+ else kputs(".vcf", &str); -+ args->fh[i] = hts_open(str.s, hts_bcf_wmode(args->output_type)); -+ if ( args->fh[i] == NULL ) error("[%s] Error: cannot write to \"%s\": %s\n", __func__, str.s, strerror(errno)); -+ bcf_hdr_nsamples(args->hdr_out) = 1; -+ args->hdr_out->samples[0] = args->bnames[i]; -+ if ( bcf_hdr_write(args->fh[i], args->hdr_out)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,str.s); -+ } -+ free(str.s); - } - static void destroy_data(args_t *args) - { -@@ -247,7 +278,7 @@ - int i, nsmpl = bcf_hdr_nsamples(args->hdr_in); - for (i=0; ifh[i] && hts_close(args->fh[i])!=0 ) error("Error: close failed!\n"); -+ if ( args->fh[i] && hts_close(args->fh[i])!=0 ) 
error("Error: close failed .. %s\n",args->bnames[i]); - free(args->bnames[i]); - } - free(args->bnames); -@@ -309,7 +340,7 @@ - { - bcf_fmt_t *fmt = &src->d.fmt[i]; - int id = fmt->id; -- if ( !args->keep_fmt && !args->fmt_tags[id] ) continue; -+ if ( !args->keep_fmt && (id>=args->nfmt_tags || !args->fmt_tags[id]) ) continue; - - bcf_enc_int1(&tmp, id); - bcf_enc_size(&tmp, fmt->n, fmt->type); -@@ -345,7 +376,7 @@ - } - if ( !out ) out = rec_set_info(args, rec); - rec_set_format(args, rec, i, out); -- bcf_write(args->fh[i], args->hdr_out, out); -+ if ( bcf_write(args->fh[i], args->hdr_out, out)!=0 ) error("[%s] Error: failed to write the record\n", __func__); - } - if ( out ) bcf_destroy(out); - } ---- python-pysam.orig/bcftools/plugins/tag2tag.c -+++ python-pysam/bcftools/plugins/tag2tag.c -@@ -26,6 +26,7 @@ - #include - #include - #include -+#include - #include - #include - #include "bcftools.h" -@@ -217,8 +218,8 @@ - } - - if ( j!=nals*(nals+1)/2 ) -- error("Wrong number of GP values for diploid genotype at %s:%d, expected %d, found %d\n", -- bcf_seqname(in_hdr,rec),rec->pos+1, nals*(nals+1)/2,j); -+ error("Wrong number of GP values for diploid genotype at %s:%"PRId64", expected %d, found %d\n", -+ bcf_seqname(in_hdr,rec),(int64_t) rec->pos+1, nals*(nals+1)/2,j); - - if (ptr[jmax] < 1-thresh) - { ---- python-pysam.orig/bcftools/plugins/tag2tag.c.pysam.c -+++ python-pysam/bcftools/plugins/tag2tag.c.pysam.c -@@ -28,6 +28,7 @@ - #include - #include - #include -+#include - #include - #include - #include "bcftools.h" -@@ -219,8 +220,8 @@ - } - - if ( j!=nals*(nals+1)/2 ) -- error("Wrong number of GP values for diploid genotype at %s:%d, expected %d, found %d\n", -- bcf_seqname(in_hdr,rec),rec->pos+1, nals*(nals+1)/2,j); -+ error("Wrong number of GP values for diploid genotype at %s:%"PRId64", expected %d, found %d\n", -+ bcf_seqname(in_hdr,rec),(int64_t) rec->pos+1, nals*(nals+1)/2,j); - - if (ptr[jmax] < 1-thresh) - { ---- /dev/null -+++ 
python-pysam/bcftools/plugins/trio-dnm.c -@@ -0,0 +1,444 @@ -+/* The MIT License -+ -+ Copyright (c) 2018-2019 Genome Research Ltd. -+ -+ Author: Petr Danecek -+ -+ Permission is hereby granted, free of charge, to any person obtaining a copy -+ of this software and associated documentation files (the "Software"), to deal -+ in the Software without restriction, including without limitation the rights -+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -+ copies of the Software, and to permit persons to whom the Software is -+ furnished to do so, subject to the following conditions: -+ -+ The above copyright notice and this permission notice shall be included in -+ all copies or substantial portions of the Software. -+ -+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -+ THE SOFTWARE. -+ -+ */ -+ -+#include -+#include -+#include -+#include -+#include // for isatty -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include "bcftools.h" -+#include "filter.h" -+ -+ -+// Logic of the filters: include or exclude sites which match the filters? -+#define FLT_INCLUDE 1 -+#define FLT_EXCLUDE 2 -+ -+#define iCHILD 0 -+#define iFATHER 1 -+#define iMOTHER 2 -+ -+typedef struct -+{ -+ int idx[3]; // VCF sample index for child, father, mother -+ int pass; // do all three pass the filters? 
-+} -+trio_t; -+ -+typedef struct -+{ -+ int argc, filter_logic, regions_is_file, targets_is_file, output_type; -+ char *filter_str; -+ char **argv, *ped_fname, *pfm, *output_fname, *fname, *regions, *targets; -+ htsFile *out_fh; -+ bcf_srs_t *sr; -+ bcf_hdr_t *hdr, *hdr_out; -+ trio_t *trio; -+ int has_fmt_ad; -+ int ntrio, mtrio; -+ int32_t *pl, *ad, *dnm_qual, *vaf; // input FMT/PL and AD values, output DNM and VAF -+ int mpl, mad; -+ double min_score; -+ double *aprob; // proband's allele probabilities -+ double *pl3; // normalized PLs converted to probs for proband,father,mother -+ int maprob, mpl3, midx, *idx, force_ad; -+} -+args_t; -+ -+args_t args; -+ -+const char *about(void) -+{ -+ return "Screen variants for possible de-novo mutations in trios.\n"; -+} -+ -+static const char *usage_text(void) -+{ -+ return -+ "\n" -+ "About: Screen variants for possible de-novo mutations in trios\n" -+ "Usage: bcftools +trio-dnm [Plugin Options]\n" -+ "Plugin options:\n" -+ " -e, --exclude EXPR exclude sites and samples for which the expression is true\n" -+ " --force-AD calculate VAF even if the number of FMT/AD fields is incorrect. 
Use at your own risk!\n" -+ " -i, --include EXPR include sites and samples for which the expression is true\n" -+ " -m, --min-score NUM do not add FMT/DNM annotation if the score is smaller than NUM\n" -+ " -o, --output FILE output file name [stdout]\n" -+ " -O, --output-type b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n" -+ " -p, --pfm P,F,M sample names of proband, father, and mother\n" -+ " -P, --ped FILE PED file\n" -+ " -r, --regions REG restrict to comma-separated list of regions\n" -+ " -R, --regions-file FILE restrict to regions listed in a file\n" -+ " -t, --targets REG similar to -r but streams rather than index-jumps\n" -+ " -T, --targets-file FILE similar to -R but streams rather than index-jumps\n" -+ "\n" -+ "Example:\n" -+ " # Annotate VCF with FORMAT/DNM, run for a single trio\n" -+ " bcftools +trio-dnm -p proband,father,mother file.bcf\n" -+ "\n" -+ " # Same as above, but read the trio(s) from a PED file\n" -+ " bcftools +trio-dnm -P file.ped file.bcf\n" -+ "\n" -+ " # Same as above plus extract a list of significant DNMs using the bcftools/query command\n" -+ " bcftools +trio-dnm -P file.ped file.bcf -Ou | bcftools query -i'DNM>10' -f'[%CHROM:%POS %SAMPLE %DNM\\n]'\n" -+ "\n"; -+} -+ -+static int cmp_trios(const void *_a, const void *_b) -+{ -+ trio_t *a = (trio_t *) _a; -+ trio_t *b = (trio_t *) _b; -+ int i; -+ int amin = a->idx[0]; -+ for (i=1; i<3; i++) -+ if ( amin > a->idx[i] ) amin = a->idx[i]; -+ int bmin = b->idx[0]; -+ for (i=1; i<3; i++) -+ if ( bmin > b->idx[i] ) bmin = b->idx[i]; -+ if ( amin < bmin ) return -1; -+ if ( amin > bmin ) return 1; -+ return 0; -+} -+static void parse_ped(args_t *args, char *fname) -+{ -+ htsFile *fp = hts_open(fname, "r"); -+ if ( !fp ) error("Could not read: %s\n", fname); -+ -+ kstring_t str = {0,0,0}; -+ if ( hts_getline(fp, KS_SEP_LINE, &str) <= 0 ) error("Empty file: %s\n", fname); -+ -+ int moff = 0, *off = NULL; -+ do -+ { -+ // familyID sampleID paternalID 
maternalID sex phenotype population relationship siblings secondOrder thirdOrder children comment -+ // BB03 HG01884 HG01885 HG01956 2 0 ACB child 0 0 0 0 -+ int ncols = ksplit_core(str.s,0,&moff,&off); -+ if ( ncols<4 ) error("Could not parse the ped file: %s\n", str.s); -+ -+ int father = bcf_hdr_id2int(args->hdr,BCF_DT_SAMPLE,&str.s[off[2]]); -+ if ( father<0 ) continue; -+ int mother = bcf_hdr_id2int(args->hdr,BCF_DT_SAMPLE,&str.s[off[3]]); -+ if ( mother<0 ) continue; -+ int child = bcf_hdr_id2int(args->hdr,BCF_DT_SAMPLE,&str.s[off[1]]); -+ if ( child<0 ) continue; -+ -+ args->ntrio++; -+ hts_expand0(trio_t,args->ntrio,args->mtrio,args->trio); -+ trio_t *trio = &args->trio[args->ntrio-1]; -+ trio->idx[iFATHER] = father; -+ trio->idx[iMOTHER] = mother; -+ trio->idx[iCHILD] = child; -+ } -+ while ( hts_getline(fp, KS_SEP_LINE, &str)>=0 ); -+ -+ fprintf(stderr,"Identified %d complete trio%s in the VCF file\n", args->ntrio,args->ntrio==1?"":"s"); -+ -+ // sort the sample by index so that they are accessed more or less sequentially -+ qsort(args->trio,args->ntrio,sizeof(trio_t),cmp_trios); -+ -+ free(str.s); -+ free(off); -+ if ( hts_close(fp)!=0 ) error("[%s] Error: close failed .. 
%s\n", __func__,fname); -+} -+static void init_data(args_t *args) -+{ -+ args->sr = bcf_sr_init(); -+ if ( args->regions ) -+ { -+ args->sr->require_index = 1; -+ if ( bcf_sr_set_regions(args->sr, args->regions, args->regions_is_file)<0 ) error("Failed to read the regions: %s\n",args->regions); -+ } -+ if ( args->targets && bcf_sr_set_targets(args->sr, args->targets, args->targets_is_file, 0)<0 ) error("Failed to read the targets: %s\n",args->targets); -+ if ( !bcf_sr_add_reader(args->sr,args->fname) ) error("Error: %s\n", bcf_sr_strerror(args->sr->errnum)); -+ args->hdr = bcf_sr_get_header(args->sr,0); -+ -+ int id; -+ if ( (id=bcf_hdr_id2int(args->hdr, BCF_DT_ID, "PL"))<0 || !bcf_hdr_idinfo_exists(args->hdr,BCF_HL_FMT,id) ) -+ error("Error: the tag FORMAT/PL is not present in %s\n", args->fname); -+ if ( (id=bcf_hdr_id2int(args->hdr, BCF_DT_ID, "AD"))<0 || !bcf_hdr_idinfo_exists(args->hdr,BCF_HL_FMT,id) ) -+ fprintf(stderr, "Warning: the tag FORMAT/AD is not present in %s, the output tag FORMAT/VAF will not be added\n", args->fname); -+ else -+ args->has_fmt_ad = 1; -+ -+ args->hdr_out = bcf_hdr_dup(args->hdr); -+ bcf_hdr_append(args->hdr_out, "##FORMAT="); -+ if ( args->has_fmt_ad ) -+ bcf_hdr_append(args->hdr_out, "##FORMAT="); -+ -+ int i, n = 0; -+ char **list; -+ if ( args->pfm ) -+ { -+ args->ntrio = 1; -+ args->trio = (trio_t*) calloc(1,sizeof(trio_t)); -+ list = hts_readlist(args->pfm, 0, &n); -+ if ( n!=3 ) error("Expected three sample names with -t\n"); -+ args->trio[0].idx[iCHILD] = bcf_hdr_id2int(args->hdr, BCF_DT_SAMPLE, list[0]); -+ args->trio[0].idx[iFATHER] = bcf_hdr_id2int(args->hdr, BCF_DT_SAMPLE, list[1]); -+ args->trio[0].idx[iMOTHER] = bcf_hdr_id2int(args->hdr, BCF_DT_SAMPLE, list[2]); -+ for (i=0; itrio[0].idx[i] < 0 ) error("The sample is not present: %s\n", list[i]); -+ free(list[i]); -+ } -+ free(list); -+ } -+ else -+ { -+ parse_ped(args,args->ped_fname); -+ if ( !args->ntrio ) error("No complete trio present\n"); -+ } -+ -+ args->out_fh 
= hts_open(args->output_fname,hts_bcf_wmode(args->output_type)); -+ if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno)); -+ if ( bcf_hdr_write(args->out_fh, args->hdr_out)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); -+ -+ args->dnm_qual = (int32_t*) malloc(sizeof(*args->dnm_qual)*bcf_hdr_nsamples(args->hdr)); -+ args->vaf = (int32_t*) malloc(sizeof(*args->vaf)*bcf_hdr_nsamples(args->hdr)); -+} -+static void destroy_data(args_t *args) -+{ -+ free(args->pl3); -+ free(args->aprob); -+ free(args->idx); -+ free(args->dnm_qual); -+ free(args->vaf); -+ free(args->trio); -+ free(args->pl); -+ free(args->ad); -+ if ( hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname); -+ bcf_hdr_destroy(args->hdr_out); -+ bcf_sr_destroy(args->sr); -+ free(args); -+} -+static float process_trio(args_t *args, int nals, double *pl[3], int npl, int *al0, int *al1) -+{ -+ assert( nals>1 ); -+ -+ // determine the two most likely proband's alleles -+ int i,j,k = 0,tmp; -+ -+ hts_expand(int,nals,args->midx,args->idx); -+ hts_expand(double,nals,args->maprob,args->aprob); -+ for (i=0; iaprob[i] = 0; -+ for (i=0; iaprob[i] += pl[iCHILD][k]; -+ args->aprob[j] += pl[iCHILD][k]; -+ k++; -+ } -+ } -+ -+ // sort in descendent order -+ double *arr = args->aprob; -+ int *idx = args->idx; -+ for (i=0; i0 && arr[idx[j]] > arr[idx[j-1]]; j--) -+ tmp = idx[j], idx[j] = idx[j-1], idx[j-1] = tmp; -+ -+ if ( idx[0] < idx[1] ) { *al0 = idx[0]; *al1 = idx[1]; } -+ else { *al0 = idx[1]; *al1 = idx[0]; } -+ -+ // Calculate the probability of inheriting the 00, 01, and 11 genotype. 
For DNM they all will be small -+ int k00 = bcf_alleles2gt(idx[0],idx[0]); -+ int k01 = bcf_alleles2gt(idx[0],idx[1]); -+ int k11 = bcf_alleles2gt(idx[1],idx[1]); -+ double pd00 = pl[iCHILD][k00] * (pl[iFATHER][k00] + 0.5*pl[iFATHER][k01]) * (pl[iMOTHER][k00] + 0.5*pl[iMOTHER][k01]); -+ double pd11 = pl[iCHILD][k11] * (pl[iFATHER][k11] + 0.5*pl[iFATHER][k01]) * (pl[iMOTHER][k11] + 0.5*pl[iMOTHER][k01]); -+ double pd01 = pl[iCHILD][k01] * (pl[iFATHER][k00] * (pl[iMOTHER][k11] + 0.5*pl[iMOTHER][k01]) + pl[iFATHER][k11] * (pl[iMOTHER][k00] + 0.5*pl[iMOTHER][k01]) -+ + 0.5*pl[iFATHER][k01] * (pl[iMOTHER][k00] + pl[iMOTHER][k01] + pl[iMOTHER][k11])); -+ -+ double max = pd01; -+ if ( max < pd00 ) max = pd00; -+ if ( max < pd11 ) max = pd11; -+ return fabs(4.3429 * log(max)); -+} -+static void process_record(args_t *args, bcf1_t *rec) -+{ -+ if ( rec->n_allele==1 ) -+ { -+ if ( bcf_write(args->out_fh, args->hdr_out, rec)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); -+ return; -+ } -+ static int n_ad_warned = 0; -+ int nret, nsmpl = bcf_hdr_nsamples(args->hdr), n_ad = args->has_fmt_ad; -+ if ( n_ad ) -+ { -+ nret = bcf_get_format_int32(args->hdr,rec,"AD",&args->ad,&args->mad); -+ if ( nret<=0 ) n_ad = 0; -+ else -+ { -+ n_ad = nret / nsmpl; -+ if ( nret != nsmpl * rec->n_allele ) -+ { -+ if ( !n_ad_warned ) -+ { -+ hts_log_warning("Incorrect number of fields for FORMAT/AD at %s:%"PRId64". 
This warning is printed only once", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); -+ n_ad_warned = 1; -+ } -+ if ( !args->force_ad ) n_ad = 0; -+ } -+ } -+ } -+ nret = bcf_get_format_int32(args->hdr,rec,"PL",&args->pl,&args->mpl); -+ if ( nret<=0 ) error("The FORMAT/PL tag not present at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); -+ int npl1 = nret/nsmpl; -+ if ( npl1!=rec->n_allele*(rec->n_allele+1)/2 ) -+ error("fixme: not a diploid site at %s:%"PRId64": %d alleles, %d PLs\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1,rec->n_allele,npl1); -+ hts_expand(double,3*npl1,args->mpl3,args->pl3); -+ int i, j, k, al0, al1, write_dnm = 0, ad_set = 0; -+ for (i=0; idnm_qual[i] = bcf_int32_missing; -+ for (i=0; intrio; i++) -+ { -+ double *ppl[3]; -+ for (j=0; j<3; j++) -+ { -+ int32_t *src = args->pl + npl1 * args->trio[i].idx[j]; -+ double *dst = ppl[j] = args->pl3 + j*npl1; -+ double sum = 0; -+ for (k=0; kn_allele, ppl, npl1, &al0, &al1); -+ if ( score >= args->min_score ) -+ { -+ write_dnm = 1; -+ args->dnm_qual[ args->trio[i].idx[iCHILD] ] = score; -+ } -+ -+ if ( n_ad ) -+ { -+ if ( al0 < n_ad && al1 < n_ad ) -+ { -+ ad_set = 1; -+ for (j=0; j<3; j++) -+ { -+ int32_t *src = args->ad + n_ad * args->trio[i].idx[j]; -+ args->vaf[ args->trio[i].idx[j] ] = src[al0]+src[al1] ? 
round(src[al1]*100./(src[al0]+src[al1])) : 0; -+ } -+ } -+ else -+ for (j=0; j<3; j++) args->vaf[ args->trio[i].idx[j] ] = bcf_int32_missing; -+ } -+ } -+ if ( write_dnm ) -+ { -+ if ( bcf_update_format_int32(args->hdr_out,rec,"DNM",args->dnm_qual,nsmpl)!=0 ) -+ error("Failed to write FORMAT/DNM at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); -+ if ( ad_set ) -+ { -+ if ( bcf_update_format_int32(args->hdr_out,rec,"VAF",args->vaf,nsmpl)!=0 ) -+ error("Failed to write FORMAT/VAF at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); -+ } -+ } -+ if ( bcf_write(args->out_fh, args->hdr_out, rec)!=0 ) error("[%s] Error: cannot write to %s at %s:%"PRId64"\n", __func__,args->output_fname,bcf_seqname(args->hdr,rec),(int64_t)rec->pos+1); -+} -+ -+int run(int argc, char **argv) -+{ -+ args_t *args = (args_t*) calloc(1,sizeof(args_t)); -+ args->argc = argc; args->argv = argv; -+ args->output_fname = "-"; -+ static struct option loptions[] = -+ { -+ {"force-AD",no_argument,0,1}, -+ {"min-score",required_argument,0,'m'}, -+ {"include",required_argument,0,'i'}, -+ {"exclude",required_argument,0,'e'}, -+ {"output",required_argument,NULL,'o'}, -+ {"output-type",required_argument,NULL,'O'}, -+ {"ped",required_argument,NULL,'P'}, -+ {"pfm",required_argument,NULL,'p'}, -+ {"regions",1,0,'r'}, -+ {"regions-file",1,0,'R'}, -+ {"targets",1,0,'t'}, -+ {"targets-file",1,0,'T'}, -+ {NULL,0,NULL,0} -+ }; -+ int c; -+ char *tmp; -+ while ((c = getopt_long(argc, argv, "p:P:o:O:s:i:e:r:R:t:T:m:",loptions,NULL)) >= 0) -+ { -+ switch (c) -+ { -+ case 1 : args->force_ad = 1; break; -+ case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; -+ case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; -+ case 't': args->targets = optarg; break; -+ case 'T': args->targets = optarg; args->targets_is_file = 1; break; -+ case 'r': args->regions = optarg; break; -+ case 'R': args->regions = optarg; args->regions_is_file = 
1; break; -+ case 'o': args->output_fname = optarg; break; -+ case 'O': -+ switch (optarg[0]) { -+ case 'b': args->output_type = FT_BCF_GZ; break; -+ case 'u': args->output_type = FT_BCF; break; -+ case 'z': args->output_type = FT_VCF_GZ; break; -+ case 'v': args->output_type = FT_VCF; break; -+ default: error("The output type \"%s\" not recognised\n", optarg); -+ }; -+ break; -+ case 'P': args->ped_fname = optarg; break; -+ case 'p': args->pfm = optarg; break; -+ case 'm': args->min_score = strtod(optarg,&tmp); -+ if ( *tmp ) error("Could not parse: --min-score %s\n", optarg); -+ break; -+ case 'h': -+ case '?': -+ default: error("%s", usage_text()); break; -+ } -+ } -+ if ( optind==argc ) -+ { -+ if ( !isatty(fileno((FILE *)stdin)) ) args->fname = "-"; // reading from stdin -+ else { error("%s", usage_text()); } -+ } -+ else if ( optind+1!=argc ) error("%s", usage_text()); -+ else args->fname = argv[optind]; -+ -+ if ( !args->ped_fname && !args->pfm ) error("Missing the -p or -P option\n"); -+ if ( args->ped_fname && args->pfm ) error("Expected only -p or -P option, not both\n"); -+ -+ init_data(args); -+ -+ while ( bcf_sr_next_line(args->sr) ) -+ process_record(args, bcf_sr_get_line(args->sr,0)); -+ -+ destroy_data(args); -+ -+ return 0; -+} ---- /dev/null -+++ python-pysam/bcftools/plugins/trio-dnm.c.pysam.c -@@ -0,0 +1,446 @@ -+#include "bcftools.pysam.h" -+ -+/* The MIT License -+ -+ Copyright (c) 2018-2019 Genome Research Ltd. 
-+ -+ Author: Petr Danecek -+ -+ Permission is hereby granted, free of charge, to any person obtaining a copy -+ of this software and associated documentation files (the "Software"), to deal -+ in the Software without restriction, including without limitation the rights -+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -+ copies of the Software, and to permit persons to whom the Software is -+ furnished to do so, subject to the following conditions: -+ -+ The above copyright notice and this permission notice shall be included in -+ all copies or substantial portions of the Software. -+ -+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -+ THE SOFTWARE. -+ -+ */ -+ -+#include -+#include -+#include -+#include -+#include // for isatty -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include "bcftools.h" -+#include "filter.h" -+ -+ -+// Logic of the filters: include or exclude sites which match the filters? -+#define FLT_INCLUDE 1 -+#define FLT_EXCLUDE 2 -+ -+#define iCHILD 0 -+#define iFATHER 1 -+#define iMOTHER 2 -+ -+typedef struct -+{ -+ int idx[3]; // VCF sample index for child, father, mother -+ int pass; // do all three pass the filters? 
-+} -+trio_t; -+ -+typedef struct -+{ -+ int argc, filter_logic, regions_is_file, targets_is_file, output_type; -+ char *filter_str; -+ char **argv, *ped_fname, *pfm, *output_fname, *fname, *regions, *targets; -+ htsFile *out_fh; -+ bcf_srs_t *sr; -+ bcf_hdr_t *hdr, *hdr_out; -+ trio_t *trio; -+ int has_fmt_ad; -+ int ntrio, mtrio; -+ int32_t *pl, *ad, *dnm_qual, *vaf; // input FMT/PL and AD values, output DNM and VAF -+ int mpl, mad; -+ double min_score; -+ double *aprob; // proband's allele probabilities -+ double *pl3; // normalized PLs converted to probs for proband,father,mother -+ int maprob, mpl3, midx, *idx, force_ad; -+} -+args_t; -+ -+args_t args; -+ -+const char *about(void) -+{ -+ return "Screen variants for possible de-novo mutations in trios.\n"; -+} -+ -+static const char *usage_text(void) -+{ -+ return -+ "\n" -+ "About: Screen variants for possible de-novo mutations in trios\n" -+ "Usage: bcftools +trio-dnm [Plugin Options]\n" -+ "Plugin options:\n" -+ " -e, --exclude EXPR exclude sites and samples for which the expression is true\n" -+ " --force-AD calculate VAF even if the number of FMT/AD fields is incorrect. 
Use at your own risk!\n" -+ " -i, --include EXPR include sites and samples for which the expression is true\n" -+ " -m, --min-score NUM do not add FMT/DNM annotation if the score is smaller than NUM\n" -+ " -o, --output FILE output file name [bcftools_stdout]\n" -+ " -O, --output-type b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n" -+ " -p, --pfm P,F,M sample names of proband, father, and mother\n" -+ " -P, --ped FILE PED file\n" -+ " -r, --regions REG restrict to comma-separated list of regions\n" -+ " -R, --regions-file FILE restrict to regions listed in a file\n" -+ " -t, --targets REG similar to -r but streams rather than index-jumps\n" -+ " -T, --targets-file FILE similar to -R but streams rather than index-jumps\n" -+ "\n" -+ "Example:\n" -+ " # Annotate VCF with FORMAT/DNM, run for a single trio\n" -+ " bcftools +trio-dnm -p proband,father,mother file.bcf\n" -+ "\n" -+ " # Same as above, but read the trio(s) from a PED file\n" -+ " bcftools +trio-dnm -P file.ped file.bcf\n" -+ "\n" -+ " # Same as above plus extract a list of significant DNMs using the bcftools/query command\n" -+ " bcftools +trio-dnm -P file.ped file.bcf -Ou | bcftools query -i'DNM>10' -f'[%CHROM:%POS %SAMPLE %DNM\\n]'\n" -+ "\n"; -+} -+ -+static int cmp_trios(const void *_a, const void *_b) -+{ -+ trio_t *a = (trio_t *) _a; -+ trio_t *b = (trio_t *) _b; -+ int i; -+ int amin = a->idx[0]; -+ for (i=1; i<3; i++) -+ if ( amin > a->idx[i] ) amin = a->idx[i]; -+ int bmin = b->idx[0]; -+ for (i=1; i<3; i++) -+ if ( bmin > b->idx[i] ) bmin = b->idx[i]; -+ if ( amin < bmin ) return -1; -+ if ( amin > bmin ) return 1; -+ return 0; -+} -+static void parse_ped(args_t *args, char *fname) -+{ -+ htsFile *fp = hts_open(fname, "r"); -+ if ( !fp ) error("Could not read: %s\n", fname); -+ -+ kstring_t str = {0,0,0}; -+ if ( hts_getline(fp, KS_SEP_LINE, &str) <= 0 ) error("Empty file: %s\n", fname); -+ -+ int moff = 0, *off = NULL; -+ do -+ { -+ // familyID sampleID 
paternalID maternalID sex phenotype population relationship siblings secondOrder thirdOrder children comment -+ // BB03 HG01884 HG01885 HG01956 2 0 ACB child 0 0 0 0 -+ int ncols = ksplit_core(str.s,0,&moff,&off); -+ if ( ncols<4 ) error("Could not parse the ped file: %s\n", str.s); -+ -+ int father = bcf_hdr_id2int(args->hdr,BCF_DT_SAMPLE,&str.s[off[2]]); -+ if ( father<0 ) continue; -+ int mother = bcf_hdr_id2int(args->hdr,BCF_DT_SAMPLE,&str.s[off[3]]); -+ if ( mother<0 ) continue; -+ int child = bcf_hdr_id2int(args->hdr,BCF_DT_SAMPLE,&str.s[off[1]]); -+ if ( child<0 ) continue; -+ -+ args->ntrio++; -+ hts_expand0(trio_t,args->ntrio,args->mtrio,args->trio); -+ trio_t *trio = &args->trio[args->ntrio-1]; -+ trio->idx[iFATHER] = father; -+ trio->idx[iMOTHER] = mother; -+ trio->idx[iCHILD] = child; -+ } -+ while ( hts_getline(fp, KS_SEP_LINE, &str)>=0 ); -+ -+ fprintf(bcftools_stderr,"Identified %d complete trio%s in the VCF file\n", args->ntrio,args->ntrio==1?"":"s"); -+ -+ // sort the sample by index so that they are accessed more or less sequentially -+ qsort(args->trio,args->ntrio,sizeof(trio_t),cmp_trios); -+ -+ free(str.s); -+ free(off); -+ if ( hts_close(fp)!=0 ) error("[%s] Error: close failed .. 
%s\n", __func__,fname); -+} -+static void init_data(args_t *args) -+{ -+ args->sr = bcf_sr_init(); -+ if ( args->regions ) -+ { -+ args->sr->require_index = 1; -+ if ( bcf_sr_set_regions(args->sr, args->regions, args->regions_is_file)<0 ) error("Failed to read the regions: %s\n",args->regions); -+ } -+ if ( args->targets && bcf_sr_set_targets(args->sr, args->targets, args->targets_is_file, 0)<0 ) error("Failed to read the targets: %s\n",args->targets); -+ if ( !bcf_sr_add_reader(args->sr,args->fname) ) error("Error: %s\n", bcf_sr_strerror(args->sr->errnum)); -+ args->hdr = bcf_sr_get_header(args->sr,0); -+ -+ int id; -+ if ( (id=bcf_hdr_id2int(args->hdr, BCF_DT_ID, "PL"))<0 || !bcf_hdr_idinfo_exists(args->hdr,BCF_HL_FMT,id) ) -+ error("Error: the tag FORMAT/PL is not present in %s\n", args->fname); -+ if ( (id=bcf_hdr_id2int(args->hdr, BCF_DT_ID, "AD"))<0 || !bcf_hdr_idinfo_exists(args->hdr,BCF_HL_FMT,id) ) -+ fprintf(bcftools_stderr, "Warning: the tag FORMAT/AD is not present in %s, the output tag FORMAT/VAF will not be added\n", args->fname); -+ else -+ args->has_fmt_ad = 1; -+ -+ args->hdr_out = bcf_hdr_dup(args->hdr); -+ bcf_hdr_append(args->hdr_out, "##FORMAT="); -+ if ( args->has_fmt_ad ) -+ bcf_hdr_append(args->hdr_out, "##FORMAT="); -+ -+ int i, n = 0; -+ char **list; -+ if ( args->pfm ) -+ { -+ args->ntrio = 1; -+ args->trio = (trio_t*) calloc(1,sizeof(trio_t)); -+ list = hts_readlist(args->pfm, 0, &n); -+ if ( n!=3 ) error("Expected three sample names with -t\n"); -+ args->trio[0].idx[iCHILD] = bcf_hdr_id2int(args->hdr, BCF_DT_SAMPLE, list[0]); -+ args->trio[0].idx[iFATHER] = bcf_hdr_id2int(args->hdr, BCF_DT_SAMPLE, list[1]); -+ args->trio[0].idx[iMOTHER] = bcf_hdr_id2int(args->hdr, BCF_DT_SAMPLE, list[2]); -+ for (i=0; itrio[0].idx[i] < 0 ) error("The sample is not present: %s\n", list[i]); -+ free(list[i]); -+ } -+ free(list); -+ } -+ else -+ { -+ parse_ped(args,args->ped_fname); -+ if ( !args->ntrio ) error("No complete trio present\n"); -+ } -+ -+ 
args->out_fh = hts_open(args->output_fname,hts_bcf_wmode(args->output_type)); -+ if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno)); -+ if ( bcf_hdr_write(args->out_fh, args->hdr_out)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); -+ -+ args->dnm_qual = (int32_t*) malloc(sizeof(*args->dnm_qual)*bcf_hdr_nsamples(args->hdr)); -+ args->vaf = (int32_t*) malloc(sizeof(*args->vaf)*bcf_hdr_nsamples(args->hdr)); -+} -+static void destroy_data(args_t *args) -+{ -+ free(args->pl3); -+ free(args->aprob); -+ free(args->idx); -+ free(args->dnm_qual); -+ free(args->vaf); -+ free(args->trio); -+ free(args->pl); -+ free(args->ad); -+ if ( hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname); -+ bcf_hdr_destroy(args->hdr_out); -+ bcf_sr_destroy(args->sr); -+ free(args); -+} -+static float process_trio(args_t *args, int nals, double *pl[3], int npl, int *al0, int *al1) -+{ -+ assert( nals>1 ); -+ -+ // determine the two most likely proband's alleles -+ int i,j,k = 0,tmp; -+ -+ hts_expand(int,nals,args->midx,args->idx); -+ hts_expand(double,nals,args->maprob,args->aprob); -+ for (i=0; iaprob[i] = 0; -+ for (i=0; iaprob[i] += pl[iCHILD][k]; -+ args->aprob[j] += pl[iCHILD][k]; -+ k++; -+ } -+ } -+ -+ // sort in descendent order -+ double *arr = args->aprob; -+ int *idx = args->idx; -+ for (i=0; i0 && arr[idx[j]] > arr[idx[j-1]]; j--) -+ tmp = idx[j], idx[j] = idx[j-1], idx[j-1] = tmp; -+ -+ if ( idx[0] < idx[1] ) { *al0 = idx[0]; *al1 = idx[1]; } -+ else { *al0 = idx[1]; *al1 = idx[0]; } -+ -+ // Calculate the probability of inheriting the 00, 01, and 11 genotype. 
For DNM they all will be small -+ int k00 = bcf_alleles2gt(idx[0],idx[0]); -+ int k01 = bcf_alleles2gt(idx[0],idx[1]); -+ int k11 = bcf_alleles2gt(idx[1],idx[1]); -+ double pd00 = pl[iCHILD][k00] * (pl[iFATHER][k00] + 0.5*pl[iFATHER][k01]) * (pl[iMOTHER][k00] + 0.5*pl[iMOTHER][k01]); -+ double pd11 = pl[iCHILD][k11] * (pl[iFATHER][k11] + 0.5*pl[iFATHER][k01]) * (pl[iMOTHER][k11] + 0.5*pl[iMOTHER][k01]); -+ double pd01 = pl[iCHILD][k01] * (pl[iFATHER][k00] * (pl[iMOTHER][k11] + 0.5*pl[iMOTHER][k01]) + pl[iFATHER][k11] * (pl[iMOTHER][k00] + 0.5*pl[iMOTHER][k01]) -+ + 0.5*pl[iFATHER][k01] * (pl[iMOTHER][k00] + pl[iMOTHER][k01] + pl[iMOTHER][k11])); -+ -+ double max = pd01; -+ if ( max < pd00 ) max = pd00; -+ if ( max < pd11 ) max = pd11; -+ return fabs(4.3429 * log(max)); -+} -+static void process_record(args_t *args, bcf1_t *rec) -+{ -+ if ( rec->n_allele==1 ) -+ { -+ if ( bcf_write(args->out_fh, args->hdr_out, rec)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); -+ return; -+ } -+ static int n_ad_warned = 0; -+ int nret, nsmpl = bcf_hdr_nsamples(args->hdr), n_ad = args->has_fmt_ad; -+ if ( n_ad ) -+ { -+ nret = bcf_get_format_int32(args->hdr,rec,"AD",&args->ad,&args->mad); -+ if ( nret<=0 ) n_ad = 0; -+ else -+ { -+ n_ad = nret / nsmpl; -+ if ( nret != nsmpl * rec->n_allele ) -+ { -+ if ( !n_ad_warned ) -+ { -+ hts_log_warning("Incorrect number of fields for FORMAT/AD at %s:%"PRId64". 
This warning is printed only once", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); -+ n_ad_warned = 1; -+ } -+ if ( !args->force_ad ) n_ad = 0; -+ } -+ } -+ } -+ nret = bcf_get_format_int32(args->hdr,rec,"PL",&args->pl,&args->mpl); -+ if ( nret<=0 ) error("The FORMAT/PL tag not present at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); -+ int npl1 = nret/nsmpl; -+ if ( npl1!=rec->n_allele*(rec->n_allele+1)/2 ) -+ error("fixme: not a diploid site at %s:%"PRId64": %d alleles, %d PLs\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1,rec->n_allele,npl1); -+ hts_expand(double,3*npl1,args->mpl3,args->pl3); -+ int i, j, k, al0, al1, write_dnm = 0, ad_set = 0; -+ for (i=0; idnm_qual[i] = bcf_int32_missing; -+ for (i=0; intrio; i++) -+ { -+ double *ppl[3]; -+ for (j=0; j<3; j++) -+ { -+ int32_t *src = args->pl + npl1 * args->trio[i].idx[j]; -+ double *dst = ppl[j] = args->pl3 + j*npl1; -+ double sum = 0; -+ for (k=0; kn_allele, ppl, npl1, &al0, &al1); -+ if ( score >= args->min_score ) -+ { -+ write_dnm = 1; -+ args->dnm_qual[ args->trio[i].idx[iCHILD] ] = score; -+ } -+ -+ if ( n_ad ) -+ { -+ if ( al0 < n_ad && al1 < n_ad ) -+ { -+ ad_set = 1; -+ for (j=0; j<3; j++) -+ { -+ int32_t *src = args->ad + n_ad * args->trio[i].idx[j]; -+ args->vaf[ args->trio[i].idx[j] ] = src[al0]+src[al1] ? 
round(src[al1]*100./(src[al0]+src[al1])) : 0; -+ } -+ } -+ else -+ for (j=0; j<3; j++) args->vaf[ args->trio[i].idx[j] ] = bcf_int32_missing; -+ } -+ } -+ if ( write_dnm ) -+ { -+ if ( bcf_update_format_int32(args->hdr_out,rec,"DNM",args->dnm_qual,nsmpl)!=0 ) -+ error("Failed to write FORMAT/DNM at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); -+ if ( ad_set ) -+ { -+ if ( bcf_update_format_int32(args->hdr_out,rec,"VAF",args->vaf,nsmpl)!=0 ) -+ error("Failed to write FORMAT/VAF at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); -+ } -+ } -+ if ( bcf_write(args->out_fh, args->hdr_out, rec)!=0 ) error("[%s] Error: cannot write to %s at %s:%"PRId64"\n", __func__,args->output_fname,bcf_seqname(args->hdr,rec),(int64_t)rec->pos+1); -+} -+ -+int run(int argc, char **argv) -+{ -+ args_t *args = (args_t*) calloc(1,sizeof(args_t)); -+ args->argc = argc; args->argv = argv; -+ args->output_fname = "-"; -+ static struct option loptions[] = -+ { -+ {"force-AD",no_argument,0,1}, -+ {"min-score",required_argument,0,'m'}, -+ {"include",required_argument,0,'i'}, -+ {"exclude",required_argument,0,'e'}, -+ {"output",required_argument,NULL,'o'}, -+ {"output-type",required_argument,NULL,'O'}, -+ {"ped",required_argument,NULL,'P'}, -+ {"pfm",required_argument,NULL,'p'}, -+ {"regions",1,0,'r'}, -+ {"regions-file",1,0,'R'}, -+ {"targets",1,0,'t'}, -+ {"targets-file",1,0,'T'}, -+ {NULL,0,NULL,0} -+ }; -+ int c; -+ char *tmp; -+ while ((c = getopt_long(argc, argv, "p:P:o:O:s:i:e:r:R:t:T:m:",loptions,NULL)) >= 0) -+ { -+ switch (c) -+ { -+ case 1 : args->force_ad = 1; break; -+ case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; -+ case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; -+ case 't': args->targets = optarg; break; -+ case 'T': args->targets = optarg; args->targets_is_file = 1; break; -+ case 'r': args->regions = optarg; break; -+ case 'R': args->regions = optarg; args->regions_is_file = 
1; break; -+ case 'o': args->output_fname = optarg; break; -+ case 'O': -+ switch (optarg[0]) { -+ case 'b': args->output_type = FT_BCF_GZ; break; -+ case 'u': args->output_type = FT_BCF; break; -+ case 'z': args->output_type = FT_VCF_GZ; break; -+ case 'v': args->output_type = FT_VCF; break; -+ default: error("The output type \"%s\" not recognised\n", optarg); -+ }; -+ break; -+ case 'P': args->ped_fname = optarg; break; -+ case 'p': args->pfm = optarg; break; -+ case 'm': args->min_score = strtod(optarg,&tmp); -+ if ( *tmp ) error("Could not parse: --min-score %s\n", optarg); -+ break; -+ case 'h': -+ case '?': -+ default: error("%s", usage_text()); break; -+ } -+ } -+ if ( optind==argc ) -+ { -+ if ( !isatty(fileno((FILE *)stdin)) ) args->fname = "-"; // reading from stdin -+ else { error("%s", usage_text()); } -+ } -+ else if ( optind+1!=argc ) error("%s", usage_text()); -+ else args->fname = argv[optind]; -+ -+ if ( !args->ped_fname && !args->pfm ) error("Missing the -p or -P option\n"); -+ if ( args->ped_fname && args->pfm ) error("Expected only -p or -P option, not both\n"); -+ -+ init_data(args); -+ -+ while ( bcf_sr_next_line(args->sr) ) -+ process_record(args, bcf_sr_get_line(args->sr,0)); -+ -+ destroy_data(args); -+ -+ return 0; -+} ---- python-pysam.orig/bcftools/plugins/trio-stats.c -+++ python-pysam/bcftools/plugins/trio-stats.c -@@ -1,6 +1,6 @@ - /* The MIT License - -- Copyright (c) 2018 Genome Research Ltd. -+ Copyright (c) 2018-2019 Genome Research Ltd. 
- - Author: Petr Danecek - -@@ -26,14 +26,17 @@ - - #include - #include -+#include - #include - #include // for isatty -+#include - #include - #include - #include - #include - #include - #include -+#include - #include "bcftools.h" - #include "filter.h" - -@@ -46,6 +49,9 @@ - #define iFATHER 1 - #define iMOTHER 2 - -+#define VERBOSE_MENDEL 1 -+#define VERBOSE_TRANSMITTED 2 -+ - typedef struct - { - int idx[3]; // VCF sample index for father, mother and child -@@ -58,11 +64,13 @@ - uint32_t - npass, // number of genotypes passing the filter - nnon_ref, // number of non-reference genotypes -- nmendel_err, // number of mendelian errors -+ nmendel_err, // number of DNMs / mendelian errors - nnovel, // a singleton allele, but observed only in the child. Counted as mendel_err as well. - nsingleton, // het mother or father different from everyone else -- ndoubleton, // het mother+child or father+child different from everyone else -- nts, ntv; // number of transitions and transversions -+ ndoubleton, // het mother+child or father+child different from everyone else (transmitted alleles) -+ nts, ntv, // number of transitions and transversions -+ ndnm_recurrent, // number of recurrent DNMs / mendelian errors (counted as GTs, not sites; in ambiguous cases the allele with smaller AF is chosen) -+ ndnm_hom; // number of homozygous DNMs / mendelian errors - } - trio_stats_t; - -@@ -76,18 +84,33 @@ - - typedef struct - { -+ kbitset_t *sd_bset; // singleton (1) or doubleton (0) trio? 
-+ uint32_t -+ nalt, // number of all alternate trios -+ nsd, // number of singleton or doubleton trios -+ *idx; // indexes of the singleton and doubleon trios -+} -+alt_trios_t; // for one alt allele -+ -+typedef struct -+{ -+ int max_alt_trios; // maximum number of alternate trios [1] -+ int malt_trios; -+ alt_trios_t *alt_trios; - int argc, filter_logic, regions_is_file, targets_is_file; - int nflt_str; - char *filter_str, **flt_str; -- char **argv, *ped_fname, *output_fname, *fname, *regions, *targets; -+ char **argv, *ped_fname, *pfm, *output_fname, *fname, *regions, *targets; - bcf_srs_t *sr; - bcf_hdr_t *hdr; - trio_t *trio; - int ntrio, mtrio; - flt_stats_t *filters; - int nfilters; -- int32_t *gt_arr, *ac, *ac_trio; -- int mgt_arr, mac, mac_trio; -+ int32_t *gt_arr, *ac, *ac_trio, *dnm_als; -+ int mgt_arr, mac, mac_trio, mdnm_als; -+ int verbose; -+ FILE *fp_out; - } - args_t; - -@@ -106,10 +129,14 @@ - " a range of values simultaneously\n" - "Usage: bcftools +trio-stats [Plugin Options]\n" - "Plugin options:\n" -+ " -a, --alt-trios INT for transmission rate consider only sites with at most this\n" -+ " many alternate trios, 0 for unlimited [0]\n" -+ " -d, --debug TYPE comma-separted list of features: {mendel-errors,transmitted}\n" - " -e, --exclude EXPR exclude sites and samples for which the expression is true\n" - " -i, --include EXPR include sites and samples for which the expression is true\n" - " -o, --output FILE output file name [stdout]\n" - " -p, --ped FILE PED file\n" -+ " -P, --pfm P,F,M sample names of proband, father, and mother\n" - " -r, --regions REG restrict to comma-separated list of regions\n" - " -R, --regions-file FILE restrict to regions listed in a file\n" - " -t, --targets REG similar to -r but streams rather than index-jumps\n" -@@ -169,13 +196,14 @@ - while ( hts_getline(fp, KS_SEP_LINE, &str)>=0 ); - - fprintf(stderr,"Identified %d complete trios in the VCF file\n", args->ntrio); -+ if ( !args->ntrio ) error("No complete trio 
identified\n"); - - // sort the sample by index so that they are accessed more or less sequentially - qsort(args->trio,args->ntrio,sizeof(trio_t),cmp_trios); - - free(str.s); - free(off); -- hts_close(fp); -+ if ( hts_close(fp)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,fname); - } - - static void parse_filters(args_t *args) -@@ -231,7 +259,33 @@ - if ( !bcf_sr_add_reader(args->sr,args->fname) ) error("Error: %s\n", bcf_sr_strerror(args->sr->errnum)); - args->hdr = bcf_sr_get_header(args->sr,0); - -- parse_ped(args, args->ped_fname); -+ if ( args->ped_fname ) -+ parse_ped(args, args->ped_fname); -+ else -+ { -+ args->ntrio = 1; -+ args->trio = (trio_t*) calloc(1,sizeof(trio_t)); -+ int ibeg, iend = 0; -+ while ( args->pfm[iend] && args->pfm[iend]!=',' ) iend++; -+ if ( !args->pfm[iend] ) error("Could not parse -P %s\n", args->pfm); -+ args->pfm[iend] = 0; -+ int child = bcf_hdr_id2int(args->hdr,BCF_DT_SAMPLE,args->pfm); -+ if ( child<0 ) error("No such sample: \"%s\"\n", args->pfm); -+ args->pfm[iend] = ','; -+ ibeg = ++iend; -+ while ( args->pfm[iend] && args->pfm[iend]!=',' ) iend++; -+ if ( !args->pfm[iend] ) error("Could not parse -P %s\n", args->pfm); -+ args->pfm[iend] = 0; -+ int father = bcf_hdr_id2int(args->hdr,BCF_DT_SAMPLE,args->pfm+ibeg); -+ if ( father<0 ) error("No such sample: \"%s\"\n", args->pfm+ibeg); -+ args->pfm[iend] = ','; -+ ibeg = ++iend; -+ int mother = bcf_hdr_id2int(args->hdr,BCF_DT_SAMPLE,args->pfm+ibeg); -+ if ( mother<0 ) error("No such sample: \"%s\"\n", args->pfm+ibeg); -+ args->trio[0].idx[iFATHER] = father; -+ args->trio[0].idx[iMOTHER] = mother; -+ args->trio[0].idx[iCHILD] = child; -+ } - parse_filters(args); - - int i; -@@ -261,6 +315,66 @@ - } - for (i=0; infilters; i++) - args->filters[i].stats = (trio_stats_t*) calloc(args->ntrio,sizeof(trio_stats_t)); -+ -+ args->fp_out = !args->output_fname || !strcmp("-",args->output_fname) ? 
stdout : fopen(args->output_fname,"w"); -+ if ( !args->fp_out ) error("Could not open the file for writing: %s\n", args->output_fname); -+ fprintf(args->fp_out,"# CMD line shows the command line used to generate this output\n"); -+ fprintf(args->fp_out,"# DEF lines define expressions for all tested thresholds\n"); -+ fprintf(args->fp_out,"# FLT* lines report numbers for every threshold and every trio:\n"); -+ i = 0; -+ fprintf(args->fp_out,"# %d) filter id\n", ++i); -+ fprintf(args->fp_out,"# %d) child\n", ++i); -+ fprintf(args->fp_out,"# %d) father\n", ++i); -+ fprintf(args->fp_out,"# %d) mother\n", ++i); -+ fprintf(args->fp_out,"# %d) number of valid trio genotypes (all trio members pass filters, all non-missing)\n", ++i); -+ fprintf(args->fp_out,"# %d) number of non-reference trio GTs (at least one trio member carries an alternate allele)\n", ++i); -+ fprintf(args->fp_out,"# %d) number of DNMs/Mendelian errors\n", ++i); -+ fprintf(args->fp_out,"# %d) number of novel singleton alleles in the child (counted also as DNM / Mendelian error)\n", ++i); -+ fprintf(args->fp_out,"# %d) number of untransmitted trio singletons (one alternate allele present in one parent)\n", ++i); -+ fprintf(args->fp_out,"# %d) number of transmitted trio singletons (one alternate allele present in one parent and the child)\n", ++i); -+ fprintf(args->fp_out,"# %d) number of transitions, all distinct ALT alleles present in the trio are considered\n", ++i); -+ fprintf(args->fp_out,"# %d) number of transversions, all distinct ALT alleles present in the trio are considered\n", ++i); -+ fprintf(args->fp_out,"# %d) overall ts/tv, all distinct ALT alleles present in the trio are considered\n", ++i); -+ fprintf(args->fp_out,"# %d) number of homozygous DNMs/Mendelian errors (likely genotyping errors)\n", ++i); -+ fprintf(args->fp_out,"# %d) number of recurrent DNMs/Mendelian errors (non-inherited alleles present in other samples; counts GTs, not sites)\n", ++i); -+ fprintf(args->fp_out, "CMD\t%s", 
args->argv[0]); -+ for (i=1; iargc; i++) fprintf(args->fp_out, " %s",args->argv[i]); -+ fprintf(args->fp_out, "\n"); -+} -+static void alt_trios_reset(args_t *args, int nals) -+{ -+ int i; -+ hts_expand0(alt_trios_t, nals, args->malt_trios, args->alt_trios); -+ for (i=0; ialt_trios[i]; -+ if ( !tr->idx ) -+ { -+ tr->idx = (uint32_t*)malloc(sizeof(*tr->idx)*args->ntrio); -+ tr->sd_bset = kbs_init(args->ntrio); -+ } -+ else -+ kbs_clear(tr->sd_bset); -+ tr->nsd = 0; -+ tr->nalt = 0; -+ } -+} -+static void alt_trios_destroy(args_t *args) -+{ -+ if ( !args->max_alt_trios ) return; -+ int i; -+ for (i=0; imalt_trios; i++) -+ { -+ free(args->alt_trios[i].idx); -+ kbs_destroy(args->alt_trios[i].sd_bset); -+ } -+ free(args->alt_trios); -+} -+static inline void alt_trios_add(args_t *args, int itrio, int ial, int is_singleton) -+{ -+ alt_trios_t *tr = &args->alt_trios[ial]; -+ if ( is_singleton ) kbs_insert(tr->sd_bset, tr->nsd); -+ tr->idx[ tr->nsd++ ] = itrio; - } - static void destroy_data(args_t *args) - { -@@ -275,64 +389,47 @@ - for (i=0; inflt_str; i++) free(args->flt_str[i]); - free(args->flt_str); - bcf_sr_destroy(args->sr); -+ alt_trios_destroy(args); - free(args->trio); - free(args->ac); - free(args->ac_trio); - free(args->gt_arr); -+ free(args->dnm_als); -+ if ( fclose(args->fp_out)!=0 ) error("Close failed: %s\n", (!args->output_fname || !strcmp("-",args->output_fname)) ? "stdout" : args->output_fname); - free(args); - } - static void report_stats(args_t *args) - { - int i = 0,j; -- FILE *fh = !args->output_fname || !strcmp("-",args->output_fname) ? 
stdout : fopen(args->output_fname,"w"); -- if ( !fh ) error("Could not open the file for writing: %s\n", args->output_fname); -- fprintf(fh,"# CMD line shows the command line used to generate this output\n"); -- fprintf(fh,"# DEF lines define expressions for all tested thresholds\n"); -- fprintf(fh,"# FLT* lines report numbers for every threshold and every trio:\n"); -- fprintf(fh,"# %d) filter id\n", ++i); -- fprintf(fh,"# %d) child\n", ++i); -- fprintf(fh,"# %d) father\n", ++i); -- fprintf(fh,"# %d) mother\n", ++i); -- fprintf(fh,"# %d) number of valid trio genotypes (all trio members pass filters, all non-missing)\n", ++i); -- fprintf(fh,"# %d) number of non-reference trio GTs (at least one trio member carries an alternate allele)\n", ++i); -- fprintf(fh,"# %d) number of Mendelian errors\n", ++i); -- fprintf(fh,"# %d) number of novel singleton alleles in the child (counted also as a Mendelian error)\n", ++i); -- fprintf(fh,"# %d) number of untransmitted singletons, present only in one parent\n", ++i); -- fprintf(fh,"# %d) number of transmitted singletons, present only in one parent and the child\n", ++i); -- fprintf(fh,"# %d) number of transitions, all ALT alleles present in the trio are considered\n", ++i); -- fprintf(fh,"# %d) number of transversions, all ALT alleles present in the trio are considered\n", ++i); -- fprintf(fh,"# %d) overall ts/tv, all ALT alleles present in the trio are considered\n", ++i); -- fprintf(fh, "CMD\t%s", args->argv[0]); -- for (i=1; iargc; i++) fprintf(fh, " %s",args->argv[i]); -- fprintf(fh, "\n"); - for (i=0; infilters; i++) - { - flt_stats_t *flt = &args->filters[i]; -- fprintf(fh,"DEF\tFLT%d\t%s\n", i, flt->expr); -+ fprintf(args->fp_out,"DEF\tFLT%d\t%s\n", i, flt->expr); - } - for (i=0; infilters; i++) - { - flt_stats_t *flt = &args->filters[i]; - for (j=0; jntrio; j++) - { -- fprintf(fh,"FLT%d", i); -- fprintf(fh,"\t%s",args->hdr->samples[args->trio[j].idx[iCHILD]]); -- 
fprintf(fh,"\t%s",args->hdr->samples[args->trio[j].idx[iFATHER]]); -- fprintf(fh,"\t%s",args->hdr->samples[args->trio[j].idx[iMOTHER]]); -+ fprintf(args->fp_out,"FLT%d", i); -+ fprintf(args->fp_out,"\t%s",args->hdr->samples[args->trio[j].idx[iCHILD]]); -+ fprintf(args->fp_out,"\t%s",args->hdr->samples[args->trio[j].idx[iFATHER]]); -+ fprintf(args->fp_out,"\t%s",args->hdr->samples[args->trio[j].idx[iMOTHER]]); - trio_stats_t *stats = &flt->stats[j]; -- fprintf(fh,"\t%d", stats->npass); -- fprintf(fh,"\t%d", stats->nnon_ref); -- fprintf(fh,"\t%d", stats->nmendel_err); -- fprintf(fh,"\t%d", stats->nnovel); -- fprintf(fh,"\t%d", stats->nsingleton); -- fprintf(fh,"\t%d", stats->ndoubleton); -- fprintf(fh,"\t%d", stats->nts); -- fprintf(fh,"\t%d", stats->ntv); -- fprintf(fh,"\t%.2f", stats->ntv ? (float)stats->nts/stats->ntv : INFINITY); -- fprintf(fh,"\n"); -+ fprintf(args->fp_out,"\t%d", stats->npass); -+ fprintf(args->fp_out,"\t%d", stats->nnon_ref); -+ fprintf(args->fp_out,"\t%d", stats->nmendel_err); -+ fprintf(args->fp_out,"\t%d", stats->nnovel); -+ fprintf(args->fp_out,"\t%d", stats->nsingleton); -+ fprintf(args->fp_out,"\t%d", stats->ndoubleton); -+ fprintf(args->fp_out,"\t%d", stats->nts); -+ fprintf(args->fp_out,"\t%d", stats->ntv); -+ fprintf(args->fp_out,"\t%.2f", stats->ntv ? (float)stats->nts/stats->ntv : INFINITY); -+ fprintf(args->fp_out,"\t%d", stats->ndnm_hom); -+ fprintf(args->fp_out,"\t%d", stats->ndnm_recurrent); -+ fprintf(args->fp_out,"\n"); - } - } -- if ( fclose(fh)!=0 ) error("Close failed: %s\n", (!args->output_fname || !strcmp("-",args->output_fname)) ? 
"stdout" : args->output_fname); - } - - static inline int parse_genotype(int32_t *arr, int ngt1, int idx, int als[2]) -@@ -406,6 +503,7 @@ - hts_expand(int, rec->n_allele, args->mac, args->ac); - if ( !bcf_calc_ac(args->hdr, rec, args->ac, BCF_UN_INFO|BCF_UN_FMT) ) return; - hts_expand(int, rec->n_allele, args->mac_trio, args->ac_trio); -+ hts_expand(int, rec->n_allele, args->mdnm_als, args->dnm_als); - - // Get the genotypes - int ngt = bcf_get_genotypes(args->hdr, rec, &args->gt_arr, &args->mgt_arr); -@@ -420,6 +518,9 @@ - for (i=1; in_allele; i++) - if ( !rec->d.allele[i][1] && rec->d.allele[i][0]=='*' ) { star_allele = i; break; } - -+ // number of non-reference trios -+ if ( args->max_alt_trios ) alt_trios_reset(args, rec->n_allele); -+ - // Run the stats - for (i=0; intrio; i++) - { -@@ -441,8 +542,7 @@ - for (j=0; j<6; j++) - { - if ( als[j]==star_allele ) { has_star_allele = 1; continue; } -- if ( als[j]==0 ) continue; -- has_nonref = 1; -+ if ( als[j]!=0 ) has_nonref = 1; - args->ac_trio[ als[j] ]++; - } - if ( !has_nonref ) continue; // only ref or * in this trio -@@ -457,7 +557,7 @@ - { - if ( als[j]==0 || als[j]==star_allele ) continue; - if ( als[j] >= rec->n_allele ) -- error("The GT index is out of range at %s:%d in %s\n", bcf_seqname(args->hdr,rec),rec->pos+1,args->hdr->samples[args->trio[i].idx[j/2]]); -+ error("The GT index is out of range at %s:%"PRId64" in %s\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1,args->hdr->samples[args->trio[i].idx[j/2]]); - if ( rec->d.allele[als[j]][1] ) continue; - - int alt = bcf_acgt2int(rec->d.allele[als[j]][0]); -@@ -473,21 +573,111 @@ - if ( has_star_allele ) continue; - - // Detect mendelian errors -- int mendel_ok = (als_child[0]==als_father[0] || als_child[0]==als_father[1]) && (als_child[1]==als_mother[0] || als_child[1]==als_mother[1]) ? 1 : 0; -- if ( !mendel_ok ) mendel_ok = (als_child[1]==als_father[0] || als_child[1]==als_father[1]) && (als_child[0]==als_mother[0] || als_child[0]==als_mother[1]) ? 
1 : 0; -- if ( !mendel_ok ) stats->nmendel_err++; -+ int a0F = als_child[0]==als_father[0] || als_child[0]==als_father[1] ? 1 : 0; -+ int a1M = als_child[1]==als_mother[0] || als_child[1]==als_mother[1] ? 1 : 0; -+ if ( !a0F || !a1M ) -+ { -+ int a0M = als_child[0]==als_mother[0] || als_child[0]==als_mother[1] ? 1 : 0; -+ int a1F = als_child[1]==als_father[0] || als_child[1]==als_father[1] ? 1 : 0; -+ if ( !a0M || !a1F ) -+ { -+ stats->nmendel_err++; -+ -+ int dnm_hom = 0; -+ if ( als_child[0]==als_child[1] ) { stats->ndnm_hom++; dnm_hom = 1; } -+ -+ int culprit; // neglecting the unlikely possibility of alt het 1/2 DNM genotype -+ if ( !a0F && !a0M ) culprit = als_child[0]; -+ else if ( !a1F && !a1M ) culprit = als_child[1]; -+ else if ( args->ac[als_child[0]] < args->ac[als_child[1]] ) culprit = als_child[0]; -+ else culprit = als_child[1]; -+ -+ int dnm_recurrent = 0; -+ if ( (!dnm_hom && args->ac[culprit]>1) || (dnm_hom && args->ac[culprit]>2) ) { stats->ndnm_recurrent++; dnm_recurrent = 1; } -+ -+ if ( args->verbose & VERBOSE_MENDEL ) -+ fprintf(args->fp_out,"MERR\t%s\t%"PRId64"\t%s\t%s\t%s\t%s\t%s\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1, -+ args->hdr->samples[args->trio[i].idx[iCHILD]], -+ args->hdr->samples[args->trio[i].idx[iFATHER]], -+ args->hdr->samples[args->trio[i].idx[iMOTHER]], -+ dnm_hom ? "HOM" : "-", -+ dnm_recurrent ? "RECURRENT" : "-" -+ ); -+ } -+ } - - // Is this a singleton, doubleton, neither? 
-- for (j=1; jn_allele; j++) -+ for (j=0; jn_allele; j++) - { -- if ( args->ac_trio[j]==1 && args->ac[j]==1 ) // singleton (in parent) or novel (in child) -+ if ( !args->ac_trio[j] ) continue; -+ if ( args->max_alt_trios ) args->alt_trios[j].nalt++; -+ -+ if ( args->ac_trio[j]==1 ) // singleton (in parent) or novel (in child) - { - if ( als_child[0]==j || als_child[1]==j ) stats->nnovel++; -- else stats->nsingleton++; -+ else -+ { -+ if ( !args->max_alt_trios ) -+ { -+ stats->nsingleton++; -+ if ( args->verbose & VERBOSE_TRANSMITTED ) -+ fprintf(args->fp_out,"TRANSMITTED\t%s\t%"PRId64"\t%s\t%s\t%s\tNO\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1, -+ args->hdr->samples[args->trio[i].idx[iCHILD]], -+ args->hdr->samples[args->trio[i].idx[iFATHER]], -+ args->hdr->samples[args->trio[i].idx[iMOTHER]] -+ ); -+ } -+ else alt_trios_add(args, i,j,1); -+ } -+ } -+ else if ( args->ac_trio[j]==2 ) // possibly a doubleton -+ { -+ if ( (als_child[0]!=j && als_child[1]!=j) || (als_child[0]==j && als_child[1]==j) ) continue; -+ if ( (als_father[0]==j && als_father[1]==j) || (als_mother[0]==j && als_mother[1]==j) ) continue; -+ if ( !args->max_alt_trios ) -+ { -+ stats->ndoubleton++; -+ if ( args->verbose & VERBOSE_TRANSMITTED ) -+ fprintf(args->fp_out,"TRANSMITTED\t%s\t%"PRId64"\t%s\t%s\t%s\tYES\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1, -+ args->hdr->samples[args->trio[i].idx[iCHILD]], -+ args->hdr->samples[args->trio[i].idx[iFATHER]], -+ args->hdr->samples[args->trio[i].idx[iMOTHER]] -+ ); -+ } -+ else alt_trios_add(args, i,j,0); - } -- else if ( args->ac_trio[j]==2 && args->ac[j]==2 ) // possibly a doubleton -+ } -+ } -+ if ( args->max_alt_trios ) -+ { -+ for (j=0; jn_allele; j++) -+ { -+ alt_trios_t *tr = &args->alt_trios[j]; -+ if ( !tr->nsd || tr->nalt > args->max_alt_trios ) continue; -+ for (i=0; insd; i++) - { -- if ( (als_child[0]==j || als_child[1]==j) && (als_child[0]!=j || als_child[1]!=j) ) stats->ndoubleton++; -+ int itr = tr->idx[i]; -+ trio_stats_t 
*stats = &flt->stats[itr]; -+ if ( kbs_exists(tr->sd_bset,i) ) -+ { -+ stats->nsingleton++; -+ if ( args->verbose & VERBOSE_TRANSMITTED ) -+ fprintf(args->fp_out,"TRANSMITTED\t%s\t%"PRId64"\t%s\t%s\t%s\tNO\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1, -+ args->hdr->samples[args->trio[itr].idx[iCHILD]], -+ args->hdr->samples[args->trio[itr].idx[iFATHER]], -+ args->hdr->samples[args->trio[itr].idx[iMOTHER]] -+ ); -+ } -+ else -+ { -+ stats->ndoubleton++; -+ if ( args->verbose & VERBOSE_TRANSMITTED ) -+ fprintf(args->fp_out,"TRANSMITTED\t%s\t%"PRId64"\t%s\t%s\t%s\tYES\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1, -+ args->hdr->samples[args->trio[itr].idx[iCHILD]], -+ args->hdr->samples[args->trio[itr].idx[iFATHER]], -+ args->hdr->samples[args->trio[itr].idx[iMOTHER]] -+ ); -+ } - } - } - } -@@ -500,10 +690,13 @@ - args->output_fname = "-"; - static struct option loptions[] = - { -+ {"debug",required_argument,0,'d'}, -+ {"alt-trios",required_argument,0,'a'}, - {"include",required_argument,0,'i'}, - {"exclude",required_argument,0,'e'}, - {"output",required_argument,NULL,'o'}, - {"ped",required_argument,NULL,'p'}, -+ {"pfm",required_argument,NULL,'P'}, - {"regions",1,0,'r'}, - {"regions-file",1,0,'R'}, - {"targets",1,0,'t'}, -@@ -511,10 +704,25 @@ - {NULL,0,NULL,0} - }; - int c, i; -- while ((c = getopt_long(argc, argv, "p:o:s:i:e:r:R:t:T:",loptions,NULL)) >= 0) -+ while ((c = getopt_long(argc, argv, "P:p:o:s:i:e:r:R:t:T:a:d:",loptions,NULL)) >= 0) - { - switch (c) - { -+ case 'd': -+ { -+ int n; -+ char **tmp = hts_readlist(optarg, 0, &n); -+ for(i=0; iverbose |= VERBOSE_MENDEL; -+ else if ( !strcasecmp(tmp[i],"transmitted") ) args->verbose |= VERBOSE_TRANSMITTED; -+ else error("Error: The argument \"%s\" to option --debug is not recognised\n", tmp[i]); -+ free(tmp[i]); -+ } -+ free(tmp); -+ break; -+ } -+ case 'a': args->max_alt_trios = atoi(optarg); break; - case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; - case 'i': 
args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; - case 't': args->targets = optarg; break; -@@ -523,6 +731,7 @@ - case 'R': args->regions = optarg; args->regions_is_file = 1; break; - case 'o': args->output_fname = optarg; break; - case 'p': args->ped_fname = optarg; break; -+ case 'P': args->pfm = optarg; break; - case 'h': - case '?': - default: error("%s", usage_text()); break; -@@ -536,7 +745,7 @@ - else if ( optind+1!=argc ) error("%s", usage_text()); - else args->fname = argv[optind]; - -- if ( !args->ped_fname ) error("Missing the -p, --ped option\n"); -+ if ( !args->ped_fname && !args->pfm ) error("Missing the -p or -P option\n"); - - init_data(args); - ---- python-pysam.orig/bcftools/plugins/trio-stats.c.pysam.c -+++ python-pysam/bcftools/plugins/trio-stats.c.pysam.c -@@ -2,7 +2,7 @@ - - /* The MIT License - -- Copyright (c) 2018 Genome Research Ltd. -+ Copyright (c) 2018-2019 Genome Research Ltd. - - Author: Petr Danecek - -@@ -28,14 +28,17 @@ - - #include - #include -+#include - #include - #include // for isatty -+#include - #include - #include - #include - #include - #include - #include -+#include - #include "bcftools.h" - #include "filter.h" - -@@ -48,6 +51,9 @@ - #define iFATHER 1 - #define iMOTHER 2 - -+#define VERBOSE_MENDEL 1 -+#define VERBOSE_TRANSMITTED 2 -+ - typedef struct - { - int idx[3]; // VCF sample index for father, mother and child -@@ -60,11 +66,13 @@ - uint32_t - npass, // number of genotypes passing the filter - nnon_ref, // number of non-reference genotypes -- nmendel_err, // number of mendelian errors -+ nmendel_err, // number of DNMs / mendelian errors - nnovel, // a singleton allele, but observed only in the child. Counted as mendel_err as well. 
- nsingleton, // het mother or father different from everyone else -- ndoubleton, // het mother+child or father+child different from everyone else -- nts, ntv; // number of transitions and transversions -+ ndoubleton, // het mother+child or father+child different from everyone else (transmitted alleles) -+ nts, ntv, // number of transitions and transversions -+ ndnm_recurrent, // number of recurrent DNMs / mendelian errors (counted as GTs, not sites; in ambiguous cases the allele with smaller AF is chosen) -+ ndnm_hom; // number of homozygous DNMs / mendelian errors - } - trio_stats_t; - -@@ -78,18 +86,33 @@ - - typedef struct - { -+ kbitset_t *sd_bset; // singleton (1) or doubleton (0) trio? -+ uint32_t -+ nalt, // number of all alternate trios -+ nsd, // number of singleton or doubleton trios -+ *idx; // indexes of the singleton and doubleon trios -+} -+alt_trios_t; // for one alt allele -+ -+typedef struct -+{ -+ int max_alt_trios; // maximum number of alternate trios [1] -+ int malt_trios; -+ alt_trios_t *alt_trios; - int argc, filter_logic, regions_is_file, targets_is_file; - int nflt_str; - char *filter_str, **flt_str; -- char **argv, *ped_fname, *output_fname, *fname, *regions, *targets; -+ char **argv, *ped_fname, *pfm, *output_fname, *fname, *regions, *targets; - bcf_srs_t *sr; - bcf_hdr_t *hdr; - trio_t *trio; - int ntrio, mtrio; - flt_stats_t *filters; - int nfilters; -- int32_t *gt_arr, *ac, *ac_trio; -- int mgt_arr, mac, mac_trio; -+ int32_t *gt_arr, *ac, *ac_trio, *dnm_als; -+ int mgt_arr, mac, mac_trio, mdnm_als; -+ int verbose; -+ FILE *fp_out; - } - args_t; - -@@ -108,10 +131,14 @@ - " a range of values simultaneously\n" - "Usage: bcftools +trio-stats [Plugin Options]\n" - "Plugin options:\n" -+ " -a, --alt-trios INT for transmission rate consider only sites with at most this\n" -+ " many alternate trios, 0 for unlimited [0]\n" -+ " -d, --debug TYPE comma-separted list of features: {mendel-errors,transmitted}\n" - " -e, --exclude EXPR exclude sites 
and samples for which the expression is true\n" - " -i, --include EXPR include sites and samples for which the expression is true\n" - " -o, --output FILE output file name [bcftools_stdout]\n" - " -p, --ped FILE PED file\n" -+ " -P, --pfm P,F,M sample names of proband, father, and mother\n" - " -r, --regions REG restrict to comma-separated list of regions\n" - " -R, --regions-file FILE restrict to regions listed in a file\n" - " -t, --targets REG similar to -r but streams rather than index-jumps\n" -@@ -171,13 +198,14 @@ - while ( hts_getline(fp, KS_SEP_LINE, &str)>=0 ); - - fprintf(bcftools_stderr,"Identified %d complete trios in the VCF file\n", args->ntrio); -+ if ( !args->ntrio ) error("No complete trio identified\n"); - - // sort the sample by index so that they are accessed more or less sequentially - qsort(args->trio,args->ntrio,sizeof(trio_t),cmp_trios); - - free(str.s); - free(off); -- hts_close(fp); -+ if ( hts_close(fp)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,fname); - } - - static void parse_filters(args_t *args) -@@ -233,7 +261,33 @@ - if ( !bcf_sr_add_reader(args->sr,args->fname) ) error("Error: %s\n", bcf_sr_strerror(args->sr->errnum)); - args->hdr = bcf_sr_get_header(args->sr,0); - -- parse_ped(args, args->ped_fname); -+ if ( args->ped_fname ) -+ parse_ped(args, args->ped_fname); -+ else -+ { -+ args->ntrio = 1; -+ args->trio = (trio_t*) calloc(1,sizeof(trio_t)); -+ int ibeg, iend = 0; -+ while ( args->pfm[iend] && args->pfm[iend]!=',' ) iend++; -+ if ( !args->pfm[iend] ) error("Could not parse -P %s\n", args->pfm); -+ args->pfm[iend] = 0; -+ int child = bcf_hdr_id2int(args->hdr,BCF_DT_SAMPLE,args->pfm); -+ if ( child<0 ) error("No such sample: \"%s\"\n", args->pfm); -+ args->pfm[iend] = ','; -+ ibeg = ++iend; -+ while ( args->pfm[iend] && args->pfm[iend]!=',' ) iend++; -+ if ( !args->pfm[iend] ) error("Could not parse -P %s\n", args->pfm); -+ args->pfm[iend] = 0; -+ int father = 
bcf_hdr_id2int(args->hdr,BCF_DT_SAMPLE,args->pfm+ibeg); -+ if ( father<0 ) error("No such sample: \"%s\"\n", args->pfm+ibeg); -+ args->pfm[iend] = ','; -+ ibeg = ++iend; -+ int mother = bcf_hdr_id2int(args->hdr,BCF_DT_SAMPLE,args->pfm+ibeg); -+ if ( mother<0 ) error("No such sample: \"%s\"\n", args->pfm+ibeg); -+ args->trio[0].idx[iFATHER] = father; -+ args->trio[0].idx[iMOTHER] = mother; -+ args->trio[0].idx[iCHILD] = child; -+ } - parse_filters(args); - - int i; -@@ -263,6 +317,66 @@ - } - for (i=0; infilters; i++) - args->filters[i].stats = (trio_stats_t*) calloc(args->ntrio,sizeof(trio_stats_t)); -+ -+ args->fp_out = !args->output_fname || !strcmp("-",args->output_fname) ? bcftools_stdout : fopen(args->output_fname,"w"); -+ if ( !args->fp_out ) error("Could not open the file for writing: %s\n", args->output_fname); -+ fprintf(args->fp_out,"# CMD line shows the command line used to generate this output\n"); -+ fprintf(args->fp_out,"# DEF lines define expressions for all tested thresholds\n"); -+ fprintf(args->fp_out,"# FLT* lines report numbers for every threshold and every trio:\n"); -+ i = 0; -+ fprintf(args->fp_out,"# %d) filter id\n", ++i); -+ fprintf(args->fp_out,"# %d) child\n", ++i); -+ fprintf(args->fp_out,"# %d) father\n", ++i); -+ fprintf(args->fp_out,"# %d) mother\n", ++i); -+ fprintf(args->fp_out,"# %d) number of valid trio genotypes (all trio members pass filters, all non-missing)\n", ++i); -+ fprintf(args->fp_out,"# %d) number of non-reference trio GTs (at least one trio member carries an alternate allele)\n", ++i); -+ fprintf(args->fp_out,"# %d) number of DNMs/Mendelian errors\n", ++i); -+ fprintf(args->fp_out,"# %d) number of novel singleton alleles in the child (counted also as DNM / Mendelian error)\n", ++i); -+ fprintf(args->fp_out,"# %d) number of untransmitted trio singletons (one alternate allele present in one parent)\n", ++i); -+ fprintf(args->fp_out,"# %d) number of transmitted trio singletons (one alternate allele present in one parent 
and the child)\n", ++i); -+ fprintf(args->fp_out,"# %d) number of transitions, all distinct ALT alleles present in the trio are considered\n", ++i); -+ fprintf(args->fp_out,"# %d) number of transversions, all distinct ALT alleles present in the trio are considered\n", ++i); -+ fprintf(args->fp_out,"# %d) overall ts/tv, all distinct ALT alleles present in the trio are considered\n", ++i); -+ fprintf(args->fp_out,"# %d) number of homozygous DNMs/Mendelian errors (likely genotyping errors)\n", ++i); -+ fprintf(args->fp_out,"# %d) number of recurrent DNMs/Mendelian errors (non-inherited alleles present in other samples; counts GTs, not sites)\n", ++i); -+ fprintf(args->fp_out, "CMD\t%s", args->argv[0]); -+ for (i=1; iargc; i++) fprintf(args->fp_out, " %s",args->argv[i]); -+ fprintf(args->fp_out, "\n"); -+} -+static void alt_trios_reset(args_t *args, int nals) -+{ -+ int i; -+ hts_expand0(alt_trios_t, nals, args->malt_trios, args->alt_trios); -+ for (i=0; ialt_trios[i]; -+ if ( !tr->idx ) -+ { -+ tr->idx = (uint32_t*)malloc(sizeof(*tr->idx)*args->ntrio); -+ tr->sd_bset = kbs_init(args->ntrio); -+ } -+ else -+ kbs_clear(tr->sd_bset); -+ tr->nsd = 0; -+ tr->nalt = 0; -+ } -+} -+static void alt_trios_destroy(args_t *args) -+{ -+ if ( !args->max_alt_trios ) return; -+ int i; -+ for (i=0; imalt_trios; i++) -+ { -+ free(args->alt_trios[i].idx); -+ kbs_destroy(args->alt_trios[i].sd_bset); -+ } -+ free(args->alt_trios); -+} -+static inline void alt_trios_add(args_t *args, int itrio, int ial, int is_singleton) -+{ -+ alt_trios_t *tr = &args->alt_trios[ial]; -+ if ( is_singleton ) kbs_insert(tr->sd_bset, tr->nsd); -+ tr->idx[ tr->nsd++ ] = itrio; - } - static void destroy_data(args_t *args) - { -@@ -277,64 +391,47 @@ - for (i=0; inflt_str; i++) free(args->flt_str[i]); - free(args->flt_str); - bcf_sr_destroy(args->sr); -+ alt_trios_destroy(args); - free(args->trio); - free(args->ac); - free(args->ac_trio); - free(args->gt_arr); -+ free(args->dnm_als); -+ if ( 
fclose(args->fp_out)!=0 ) error("Close failed: %s\n", (!args->output_fname || !strcmp("-",args->output_fname)) ? "bcftools_stdout" : args->output_fname); - free(args); - } - static void report_stats(args_t *args) - { - int i = 0,j; -- FILE *fh = !args->output_fname || !strcmp("-",args->output_fname) ? bcftools_stdout : fopen(args->output_fname,"w"); -- if ( !fh ) error("Could not open the file for writing: %s\n", args->output_fname); -- fprintf(fh,"# CMD line shows the command line used to generate this output\n"); -- fprintf(fh,"# DEF lines define expressions for all tested thresholds\n"); -- fprintf(fh,"# FLT* lines report numbers for every threshold and every trio:\n"); -- fprintf(fh,"# %d) filter id\n", ++i); -- fprintf(fh,"# %d) child\n", ++i); -- fprintf(fh,"# %d) father\n", ++i); -- fprintf(fh,"# %d) mother\n", ++i); -- fprintf(fh,"# %d) number of valid trio genotypes (all trio members pass filters, all non-missing)\n", ++i); -- fprintf(fh,"# %d) number of non-reference trio GTs (at least one trio member carries an alternate allele)\n", ++i); -- fprintf(fh,"# %d) number of Mendelian errors\n", ++i); -- fprintf(fh,"# %d) number of novel singleton alleles in the child (counted also as a Mendelian error)\n", ++i); -- fprintf(fh,"# %d) number of untransmitted singletons, present only in one parent\n", ++i); -- fprintf(fh,"# %d) number of transmitted singletons, present only in one parent and the child\n", ++i); -- fprintf(fh,"# %d) number of transitions, all ALT alleles present in the trio are considered\n", ++i); -- fprintf(fh,"# %d) number of transversions, all ALT alleles present in the trio are considered\n", ++i); -- fprintf(fh,"# %d) overall ts/tv, all ALT alleles present in the trio are considered\n", ++i); -- fprintf(fh, "CMD\t%s", args->argv[0]); -- for (i=1; iargc; i++) fprintf(fh, " %s",args->argv[i]); -- fprintf(fh, "\n"); - for (i=0; infilters; i++) - { - flt_stats_t *flt = &args->filters[i]; -- fprintf(fh,"DEF\tFLT%d\t%s\n", i, flt->expr); -+ 
fprintf(args->fp_out,"DEF\tFLT%d\t%s\n", i, flt->expr); - } - for (i=0; infilters; i++) - { - flt_stats_t *flt = &args->filters[i]; - for (j=0; jntrio; j++) - { -- fprintf(fh,"FLT%d", i); -- fprintf(fh,"\t%s",args->hdr->samples[args->trio[j].idx[iCHILD]]); -- fprintf(fh,"\t%s",args->hdr->samples[args->trio[j].idx[iFATHER]]); -- fprintf(fh,"\t%s",args->hdr->samples[args->trio[j].idx[iMOTHER]]); -+ fprintf(args->fp_out,"FLT%d", i); -+ fprintf(args->fp_out,"\t%s",args->hdr->samples[args->trio[j].idx[iCHILD]]); -+ fprintf(args->fp_out,"\t%s",args->hdr->samples[args->trio[j].idx[iFATHER]]); -+ fprintf(args->fp_out,"\t%s",args->hdr->samples[args->trio[j].idx[iMOTHER]]); - trio_stats_t *stats = &flt->stats[j]; -- fprintf(fh,"\t%d", stats->npass); -- fprintf(fh,"\t%d", stats->nnon_ref); -- fprintf(fh,"\t%d", stats->nmendel_err); -- fprintf(fh,"\t%d", stats->nnovel); -- fprintf(fh,"\t%d", stats->nsingleton); -- fprintf(fh,"\t%d", stats->ndoubleton); -- fprintf(fh,"\t%d", stats->nts); -- fprintf(fh,"\t%d", stats->ntv); -- fprintf(fh,"\t%.2f", stats->ntv ? (float)stats->nts/stats->ntv : INFINITY); -- fprintf(fh,"\n"); -+ fprintf(args->fp_out,"\t%d", stats->npass); -+ fprintf(args->fp_out,"\t%d", stats->nnon_ref); -+ fprintf(args->fp_out,"\t%d", stats->nmendel_err); -+ fprintf(args->fp_out,"\t%d", stats->nnovel); -+ fprintf(args->fp_out,"\t%d", stats->nsingleton); -+ fprintf(args->fp_out,"\t%d", stats->ndoubleton); -+ fprintf(args->fp_out,"\t%d", stats->nts); -+ fprintf(args->fp_out,"\t%d", stats->ntv); -+ fprintf(args->fp_out,"\t%.2f", stats->ntv ? (float)stats->nts/stats->ntv : INFINITY); -+ fprintf(args->fp_out,"\t%d", stats->ndnm_hom); -+ fprintf(args->fp_out,"\t%d", stats->ndnm_recurrent); -+ fprintf(args->fp_out,"\n"); - } - } -- if ( fclose(fh)!=0 ) error("Close failed: %s\n", (!args->output_fname || !strcmp("-",args->output_fname)) ? 
"bcftools_stdout" : args->output_fname); - } - - static inline int parse_genotype(int32_t *arr, int ngt1, int idx, int als[2]) -@@ -408,6 +505,7 @@ - hts_expand(int, rec->n_allele, args->mac, args->ac); - if ( !bcf_calc_ac(args->hdr, rec, args->ac, BCF_UN_INFO|BCF_UN_FMT) ) return; - hts_expand(int, rec->n_allele, args->mac_trio, args->ac_trio); -+ hts_expand(int, rec->n_allele, args->mdnm_als, args->dnm_als); - - // Get the genotypes - int ngt = bcf_get_genotypes(args->hdr, rec, &args->gt_arr, &args->mgt_arr); -@@ -422,6 +520,9 @@ - for (i=1; in_allele; i++) - if ( !rec->d.allele[i][1] && rec->d.allele[i][0]=='*' ) { star_allele = i; break; } - -+ // number of non-reference trios -+ if ( args->max_alt_trios ) alt_trios_reset(args, rec->n_allele); -+ - // Run the stats - for (i=0; intrio; i++) - { -@@ -443,8 +544,7 @@ - for (j=0; j<6; j++) - { - if ( als[j]==star_allele ) { has_star_allele = 1; continue; } -- if ( als[j]==0 ) continue; -- has_nonref = 1; -+ if ( als[j]!=0 ) has_nonref = 1; - args->ac_trio[ als[j] ]++; - } - if ( !has_nonref ) continue; // only ref or * in this trio -@@ -459,7 +559,7 @@ - { - if ( als[j]==0 || als[j]==star_allele ) continue; - if ( als[j] >= rec->n_allele ) -- error("The GT index is out of range at %s:%d in %s\n", bcf_seqname(args->hdr,rec),rec->pos+1,args->hdr->samples[args->trio[i].idx[j/2]]); -+ error("The GT index is out of range at %s:%"PRId64" in %s\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1,args->hdr->samples[args->trio[i].idx[j/2]]); - if ( rec->d.allele[als[j]][1] ) continue; - - int alt = bcf_acgt2int(rec->d.allele[als[j]][0]); -@@ -475,21 +575,111 @@ - if ( has_star_allele ) continue; - - // Detect mendelian errors -- int mendel_ok = (als_child[0]==als_father[0] || als_child[0]==als_father[1]) && (als_child[1]==als_mother[0] || als_child[1]==als_mother[1]) ? 
1 : 0; -- if ( !mendel_ok ) mendel_ok = (als_child[1]==als_father[0] || als_child[1]==als_father[1]) && (als_child[0]==als_mother[0] || als_child[0]==als_mother[1]) ? 1 : 0; -- if ( !mendel_ok ) stats->nmendel_err++; -+ int a0F = als_child[0]==als_father[0] || als_child[0]==als_father[1] ? 1 : 0; -+ int a1M = als_child[1]==als_mother[0] || als_child[1]==als_mother[1] ? 1 : 0; -+ if ( !a0F || !a1M ) -+ { -+ int a0M = als_child[0]==als_mother[0] || als_child[0]==als_mother[1] ? 1 : 0; -+ int a1F = als_child[1]==als_father[0] || als_child[1]==als_father[1] ? 1 : 0; -+ if ( !a0M || !a1F ) -+ { -+ stats->nmendel_err++; -+ -+ int dnm_hom = 0; -+ if ( als_child[0]==als_child[1] ) { stats->ndnm_hom++; dnm_hom = 1; } -+ -+ int culprit; // neglecting the unlikely possibility of alt het 1/2 DNM genotype -+ if ( !a0F && !a0M ) culprit = als_child[0]; -+ else if ( !a1F && !a1M ) culprit = als_child[1]; -+ else if ( args->ac[als_child[0]] < args->ac[als_child[1]] ) culprit = als_child[0]; -+ else culprit = als_child[1]; -+ -+ int dnm_recurrent = 0; -+ if ( (!dnm_hom && args->ac[culprit]>1) || (dnm_hom && args->ac[culprit]>2) ) { stats->ndnm_recurrent++; dnm_recurrent = 1; } -+ -+ if ( args->verbose & VERBOSE_MENDEL ) -+ fprintf(args->fp_out,"MERR\t%s\t%"PRId64"\t%s\t%s\t%s\t%s\t%s\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1, -+ args->hdr->samples[args->trio[i].idx[iCHILD]], -+ args->hdr->samples[args->trio[i].idx[iFATHER]], -+ args->hdr->samples[args->trio[i].idx[iMOTHER]], -+ dnm_hom ? "HOM" : "-", -+ dnm_recurrent ? "RECURRENT" : "-" -+ ); -+ } -+ } - - // Is this a singleton, doubleton, neither? 
-- for (j=1; jn_allele; j++) -+ for (j=0; jn_allele; j++) - { -- if ( args->ac_trio[j]==1 && args->ac[j]==1 ) // singleton (in parent) or novel (in child) -+ if ( !args->ac_trio[j] ) continue; -+ if ( args->max_alt_trios ) args->alt_trios[j].nalt++; -+ -+ if ( args->ac_trio[j]==1 ) // singleton (in parent) or novel (in child) - { - if ( als_child[0]==j || als_child[1]==j ) stats->nnovel++; -- else stats->nsingleton++; -+ else -+ { -+ if ( !args->max_alt_trios ) -+ { -+ stats->nsingleton++; -+ if ( args->verbose & VERBOSE_TRANSMITTED ) -+ fprintf(args->fp_out,"TRANSMITTED\t%s\t%"PRId64"\t%s\t%s\t%s\tNO\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1, -+ args->hdr->samples[args->trio[i].idx[iCHILD]], -+ args->hdr->samples[args->trio[i].idx[iFATHER]], -+ args->hdr->samples[args->trio[i].idx[iMOTHER]] -+ ); -+ } -+ else alt_trios_add(args, i,j,1); -+ } -+ } -+ else if ( args->ac_trio[j]==2 ) // possibly a doubleton -+ { -+ if ( (als_child[0]!=j && als_child[1]!=j) || (als_child[0]==j && als_child[1]==j) ) continue; -+ if ( (als_father[0]==j && als_father[1]==j) || (als_mother[0]==j && als_mother[1]==j) ) continue; -+ if ( !args->max_alt_trios ) -+ { -+ stats->ndoubleton++; -+ if ( args->verbose & VERBOSE_TRANSMITTED ) -+ fprintf(args->fp_out,"TRANSMITTED\t%s\t%"PRId64"\t%s\t%s\t%s\tYES\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1, -+ args->hdr->samples[args->trio[i].idx[iCHILD]], -+ args->hdr->samples[args->trio[i].idx[iFATHER]], -+ args->hdr->samples[args->trio[i].idx[iMOTHER]] -+ ); -+ } -+ else alt_trios_add(args, i,j,0); - } -- else if ( args->ac_trio[j]==2 && args->ac[j]==2 ) // possibly a doubleton -+ } -+ } -+ if ( args->max_alt_trios ) -+ { -+ for (j=0; jn_allele; j++) -+ { -+ alt_trios_t *tr = &args->alt_trios[j]; -+ if ( !tr->nsd || tr->nalt > args->max_alt_trios ) continue; -+ for (i=0; insd; i++) - { -- if ( (als_child[0]==j || als_child[1]==j) && (als_child[0]!=j || als_child[1]!=j) ) stats->ndoubleton++; -+ int itr = tr->idx[i]; -+ trio_stats_t 
*stats = &flt->stats[itr]; -+ if ( kbs_exists(tr->sd_bset,i) ) -+ { -+ stats->nsingleton++; -+ if ( args->verbose & VERBOSE_TRANSMITTED ) -+ fprintf(args->fp_out,"TRANSMITTED\t%s\t%"PRId64"\t%s\t%s\t%s\tNO\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1, -+ args->hdr->samples[args->trio[itr].idx[iCHILD]], -+ args->hdr->samples[args->trio[itr].idx[iFATHER]], -+ args->hdr->samples[args->trio[itr].idx[iMOTHER]] -+ ); -+ } -+ else -+ { -+ stats->ndoubleton++; -+ if ( args->verbose & VERBOSE_TRANSMITTED ) -+ fprintf(args->fp_out,"TRANSMITTED\t%s\t%"PRId64"\t%s\t%s\t%s\tYES\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1, -+ args->hdr->samples[args->trio[itr].idx[iCHILD]], -+ args->hdr->samples[args->trio[itr].idx[iFATHER]], -+ args->hdr->samples[args->trio[itr].idx[iMOTHER]] -+ ); -+ } - } - } - } -@@ -502,10 +692,13 @@ - args->output_fname = "-"; - static struct option loptions[] = - { -+ {"debug",required_argument,0,'d'}, -+ {"alt-trios",required_argument,0,'a'}, - {"include",required_argument,0,'i'}, - {"exclude",required_argument,0,'e'}, - {"output",required_argument,NULL,'o'}, - {"ped",required_argument,NULL,'p'}, -+ {"pfm",required_argument,NULL,'P'}, - {"regions",1,0,'r'}, - {"regions-file",1,0,'R'}, - {"targets",1,0,'t'}, -@@ -513,10 +706,25 @@ - {NULL,0,NULL,0} - }; - int c, i; -- while ((c = getopt_long(argc, argv, "p:o:s:i:e:r:R:t:T:",loptions,NULL)) >= 0) -+ while ((c = getopt_long(argc, argv, "P:p:o:s:i:e:r:R:t:T:a:d:",loptions,NULL)) >= 0) - { - switch (c) - { -+ case 'd': -+ { -+ int n; -+ char **tmp = hts_readlist(optarg, 0, &n); -+ for(i=0; iverbose |= VERBOSE_MENDEL; -+ else if ( !strcasecmp(tmp[i],"transmitted") ) args->verbose |= VERBOSE_TRANSMITTED; -+ else error("Error: The argument \"%s\" to option --debug is not recognised\n", tmp[i]); -+ free(tmp[i]); -+ } -+ free(tmp); -+ break; -+ } -+ case 'a': args->max_alt_trios = atoi(optarg); break; - case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; - case 'i': 
args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; - case 't': args->targets = optarg; break; -@@ -525,6 +733,7 @@ - case 'R': args->regions = optarg; args->regions_is_file = 1; break; - case 'o': args->output_fname = optarg; break; - case 'p': args->ped_fname = optarg; break; -+ case 'P': args->pfm = optarg; break; - case 'h': - case '?': - default: error("%s", usage_text()); break; -@@ -538,7 +747,7 @@ - else if ( optind+1!=argc ) error("%s", usage_text()); - else args->fname = argv[optind]; - -- if ( !args->ped_fname ) error("Missing the -p, --ped option\n"); -+ if ( !args->ped_fname && !args->pfm ) error("Missing the -p or -P option\n"); - - init_data(args); - ---- python-pysam.orig/bcftools/plugins/trio-switch-rate.c -+++ python-pysam/bcftools/plugins/trio-switch-rate.c -@@ -141,7 +141,7 @@ - khash_str2int_destroy(pop2i); - free(str.s); - free(off); -- hts_close(fp); -+ if ( hts_close(fp)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,fname); - } - - int init(int argc, char **argv, bcf_hdr_t *in, bcf_hdr_t *out) ---- python-pysam.orig/bcftools/plugins/trio-switch-rate.c.pysam.c -+++ python-pysam/bcftools/plugins/trio-switch-rate.c.pysam.c -@@ -143,7 +143,7 @@ - khash_str2int_destroy(pop2i); - free(str.s); - free(off); -- hts_close(fp); -+ if ( hts_close(fp)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,fname); - } - - int init(int argc, char **argv, bcf_hdr_t *in, bcf_hdr_t *out) ---- /dev/null -+++ python-pysam/bcftools/plugins/variantkey-hex.c -@@ -0,0 +1,136 @@ -+/* plugins/variantkey-hex.c -- Generate unsorted VariantKey lookup tables files in hexadecimal format. -+ -+ Copyright (C) 2017-2018 GENOMICS plc. 
-+ -+ Author: Nicola Asuni -+ -+Permission is hereby granted, free of charge, to any person obtaining a copy -+of this software and associated documentation files (the "Software"), to deal -+in the Software without restriction, including without limitation the rights -+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -+copies of the Software, and to permit persons to whom the Software is -+furnished to do so, subject to the following conditions: -+ -+The above copyright notice and this permission notice shall be included in -+all copies or substantial portions of the Software. -+ -+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -+DEALINGS IN THE SOFTWARE. 
*/ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include "../variantkey.h" -+ -+const char *FILE_VKRS = "vkrs.unsorted.hex"; -+const char *FILE_RSVK = "rsvk.unsorted.hex"; -+const char *FILE_NRVK = "nrvk.unsorted.tsv"; -+ -+FILE *fp_vkrs; // VariantKey -> rsID -+FILE *fp_rsvk; // rsID -> VariantKey -+FILE *fp_nrvk; // VariantKey non-reversible map (maps VariantKey to REF and ALT) -+ -+static uint64_t numvar; // number of variants -+static uint64_t nrv; // number of non-reversible variants -+ -+bcf_hdr_t *in_hdr; -+ -+const char *about(void) -+{ -+ return "Generate VariantKey index files\n"; -+} -+ -+const char *usage(void) -+{ -+ return -+ "\n" -+ "About: Generate unsorted VariantKey lookup tables files in hexadecimal format.\n" -+ "Usage: bcftools +variantkey-hex [General Options] \n" -+ "Options:\n" -+ " run \"bcftools plugin\" for a list of common options\n" -+ "\n" -+ "Example:\n" -+ " bcftools +variantkey-hex in.vcf\n" -+ "\n"; -+} -+ -+// Called once at startup, allows to initialize local variables. -+// Return 1 to suppress VCF/BCF header from printing, 0 otherwise. -+int init(int argc, char **argv, bcf_hdr_t *in, bcf_hdr_t *out) -+{ -+ in_hdr = in; -+ numvar = 0; -+ char path[1024]; -+ char dir[1024] = "./"; -+ if (argc > 1) -+ { -+ strcpy(dir, argv[1]); -+ } -+ strcpy(path, dir); -+ strcat(path, FILE_VKRS); -+ fp_vkrs = fopen(path, "w"); -+ if (!fp_vkrs) -+ { -+ fprintf(stderr, "%s: %s\n", path, strerror(errno)); -+ } -+ strcpy(path, dir); -+ strcat(path, FILE_RSVK); -+ fp_rsvk = fopen(path, "w"); -+ if (!fp_rsvk) -+ { -+ fprintf(stderr, "%s: %s\n", path, strerror(errno)); -+ } -+ strcpy(path, dir); -+ strcat(path, FILE_NRVK); -+ fp_nrvk = fopen(path, "w"); -+ if (!fp_nrvk) -+ { -+ fprintf(stderr, "%s: %s\n", path, strerror(errno)); -+ } -+ return 1; -+} -+ -+// Called for each VCF record. Return rec to output the line or NULL to suppress output. 
-+bcf1_t *process(bcf1_t *rec) -+{ -+ int len_ref = strlen(rec->d.allele[0]); -+ int len_alt = strlen(rec->d.allele[1]); -+ uint64_t vk = variantkey( -+ in_hdr->id[BCF_DT_CTG][rec->rid].key, -+ strlen(in_hdr->id[BCF_DT_CTG][rec->rid].key), -+ rec->pos, -+ rec->d.allele[0], -+ len_ref, -+ rec->d.allele[1], -+ len_alt); -+ char *ptr = rec->d.id; -+ ptr += 2; // remove 'rs' -+ uint32_t rs = (uint32_t)strtoul(ptr, NULL, 10); -+ fprintf(fp_vkrs, "%016" PRIx64 "\t%08" PRIx32 "\n", vk, rs); // map VariantKey to rsID -+ fprintf(fp_rsvk, "%08" PRIx32 "\t%016" PRIx64 "\n", rs, vk); // map rsID to VariantKey -+ if (vk & 1) -+ { -+ // map VariantKey to REF and ALT -+ fprintf(fp_nrvk, "%016" PRIx64 "\t%s\t%s\n", vk, rec->d.allele[0], rec->d.allele[1]); -+ nrv++; -+ } -+ numvar++; -+ return NULL; -+} -+ -+void destroy(void) -+{ -+ fclose(fp_vkrs); -+ fclose(fp_rsvk); -+ printf("VariantKeys: %" PRIu64 "\n", numvar); -+ printf("Non-reversible VariantKeys: %" PRIu64 "\n", nrv); -+} ---- /dev/null -+++ python-pysam/bcftools/plugins/variantkey-hex.c.pysam.c -@@ -0,0 +1,138 @@ -+#include "bcftools.pysam.h" -+ -+/* plugins/variantkey-hex.c -- Generate unsorted VariantKey lookup tables files in hexadecimal format. -+ -+ Copyright (C) 2017-2018 GENOMICS plc. -+ -+ Author: Nicola Asuni -+ -+Permission is hereby granted, free of charge, to any person obtaining a copy -+of this software and associated documentation files (the "Software"), to deal -+in the Software without restriction, including without limitation the rights -+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -+copies of the Software, and to permit persons to whom the Software is -+furnished to do so, subject to the following conditions: -+ -+The above copyright notice and this permission notice shall be included in -+all copies or substantial portions of the Software. 
-+ -+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -+DEALINGS IN THE SOFTWARE. */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include "../variantkey.h" -+ -+const char *FILE_VKRS = "vkrs.unsorted.hex"; -+const char *FILE_RSVK = "rsvk.unsorted.hex"; -+const char *FILE_NRVK = "nrvk.unsorted.tsv"; -+ -+FILE *fp_vkrs; // VariantKey -> rsID -+FILE *fp_rsvk; // rsID -> VariantKey -+FILE *fp_nrvk; // VariantKey non-reversible map (maps VariantKey to REF and ALT) -+ -+static uint64_t numvar; // number of variants -+static uint64_t nrv; // number of non-reversible variants -+ -+bcf_hdr_t *in_hdr; -+ -+const char *about(void) -+{ -+ return "Generate VariantKey index files\n"; -+} -+ -+const char *usage(void) -+{ -+ return -+ "\n" -+ "About: Generate unsorted VariantKey lookup tables files in hexadecimal format.\n" -+ "Usage: bcftools +variantkey-hex [General Options] \n" -+ "Options:\n" -+ " run \"bcftools plugin\" for a list of common options\n" -+ "\n" -+ "Example:\n" -+ " bcftools +variantkey-hex in.vcf\n" -+ "\n"; -+} -+ -+// Called once at startup, allows to initialize local variables. -+// Return 1 to suppress VCF/BCF header from printing, 0 otherwise. 
-+int init(int argc, char **argv, bcf_hdr_t *in, bcf_hdr_t *out) -+{ -+ in_hdr = in; -+ numvar = 0; -+ char path[1024]; -+ char dir[1024] = "./"; -+ if (argc > 1) -+ { -+ strcpy(dir, argv[1]); -+ } -+ strcpy(path, dir); -+ strcat(path, FILE_VKRS); -+ fp_vkrs = fopen(path, "w"); -+ if (!fp_vkrs) -+ { -+ fprintf(bcftools_stderr, "%s: %s\n", path, strerror(errno)); -+ } -+ strcpy(path, dir); -+ strcat(path, FILE_RSVK); -+ fp_rsvk = fopen(path, "w"); -+ if (!fp_rsvk) -+ { -+ fprintf(bcftools_stderr, "%s: %s\n", path, strerror(errno)); -+ } -+ strcpy(path, dir); -+ strcat(path, FILE_NRVK); -+ fp_nrvk = fopen(path, "w"); -+ if (!fp_nrvk) -+ { -+ fprintf(bcftools_stderr, "%s: %s\n", path, strerror(errno)); -+ } -+ return 1; -+} -+ -+// Called for each VCF record. Return rec to output the line or NULL to suppress output. -+bcf1_t *process(bcf1_t *rec) -+{ -+ int len_ref = strlen(rec->d.allele[0]); -+ int len_alt = strlen(rec->d.allele[1]); -+ uint64_t vk = variantkey( -+ in_hdr->id[BCF_DT_CTG][rec->rid].key, -+ strlen(in_hdr->id[BCF_DT_CTG][rec->rid].key), -+ rec->pos, -+ rec->d.allele[0], -+ len_ref, -+ rec->d.allele[1], -+ len_alt); -+ char *ptr = rec->d.id; -+ ptr += 2; // remove 'rs' -+ uint32_t rs = (uint32_t)strtoul(ptr, NULL, 10); -+ fprintf(fp_vkrs, "%016" PRIx64 "\t%08" PRIx32 "\n", vk, rs); // map VariantKey to rsID -+ fprintf(fp_rsvk, "%08" PRIx32 "\t%016" PRIx64 "\n", rs, vk); // map rsID to VariantKey -+ if (vk & 1) -+ { -+ // map VariantKey to REF and ALT -+ fprintf(fp_nrvk, "%016" PRIx64 "\t%s\t%s\n", vk, rec->d.allele[0], rec->d.allele[1]); -+ nrv++; -+ } -+ numvar++; -+ return NULL; -+} -+ -+void destroy(void) -+{ -+ fclose(fp_vkrs); -+ fclose(fp_rsvk); -+ fprintf(bcftools_stdout, "VariantKeys: %" PRIu64 "\n", numvar); -+ fprintf(bcftools_stdout, "Non-reversible VariantKeys: %" PRIu64 "\n", nrv); -+} ---- python-pysam.orig/bcftools/regidx.c -+++ python-pysam/bcftools/regidx.c -@@ -262,7 +262,11 @@ - } - - free(str.s); -- hts_close(fp); -+ if ( 
hts_close(fp)!=0 ) -+ { -+ fprintf(stderr,"[%s] Error: close failed .. %s\n", __func__,fname); -+ goto error; -+ } - return idx; - - error: -@@ -392,12 +396,11 @@ - { - int iend = iBIN(end); - if ( iend > list->nidx ) iend = list->nidx; -- for (i=ibeg; iidx[i] ) break; -- if ( i==iend ) return 0; -+ if ( i>iend ) return 0; - i = list->idx[i]; - } -- - for (ireg=i-1; iregnreg; ireg++) - { - if ( list->reg[ireg].beg > end ) return 0; // no match, past the query region ---- python-pysam.orig/bcftools/regidx.c.pysam.c -+++ python-pysam/bcftools/regidx.c.pysam.c -@@ -264,7 +264,11 @@ - } - - free(str.s); -- hts_close(fp); -+ if ( hts_close(fp)!=0 ) -+ { -+ fprintf(bcftools_stderr,"[%s] Error: close failed .. %s\n", __func__,fname); -+ goto error; -+ } - return idx; - - error: -@@ -394,12 +398,11 @@ - { - int iend = iBIN(end); - if ( iend > list->nidx ) iend = list->nidx; -- for (i=ibeg; iidx[i] ) break; -- if ( i==iend ) return 0; -+ if ( i>iend ) return 0; - i = list->idx[i]; - } -- - for (ireg=i-1; iregnreg; ireg++) - { - if ( list->reg[ireg].beg > end ) return 0; // no match, past the query region ---- python-pysam.orig/bcftools/regidx.h -+++ python-pysam/bcftools/regidx.h -@@ -33,14 +33,14 @@ - // and for working example see test/test-regidx.c. 
- regidx_t *idx = regidx_init(in_fname,parse_custom,free_custom,sizeof(char*),NULL); - -- // Query overlap with chr:from-to -+ // Query overlap with chr:beg-end (beg,end are 1-based coordinates) - regitr_t *itr = regitr_init(idx); -- if ( regidx_overlap(idx, chr,from,to, itr) ) printf("There is an overlap!\n"); -+ if ( regidx_overlap(idx, chr,beg-1,end-1, itr) ) printf("There is an overlap!\n"); - - while ( regitr_overlap(itr) ) - { -- printf("[%d,%d] overlaps with [%d,%d], payload=%s\n", from,to, -- itr->beg, itr->end, regitr_payload(itr,char*)); -+ printf("[%d,%d] overlaps with [%d,%d], payload=%s\n", beg,end, -+ itr->beg+1, itr->end+1, regitr_payload(itr,char*)); - } - - regidx_destroy(idx); -@@ -53,7 +53,7 @@ - regitr_t *itr = regitr_init(idx); - - while ( regitr_loop(itr) ) -- printf("chr=%s beg=%d end=%d\n", itr->seq, itr->beg, itr->end); -+ printf("chr=%s beg=%d end=%d\n", itr->seq, itr->beg+1, itr->end+1); - - regidx_destroy(idx); - regitr_destroy(itr); ---- python-pysam.orig/bcftools/reheader.c -+++ python-pysam/bcftools/reheader.c -@@ -33,17 +33,23 @@ - #include - #include - #include -+#ifdef _WIN32 -+#include -+#endif - #include - #include - #include // for hts_get_bgzfp() - #include - #include -+#include -+#include - #include "bcftools.h" - #include "khash_str2str.h" - - typedef struct _args_t - { - char **argv, *fname, *samples_fname, *header_fname, *output_fname; -+ char *fai_fname, *rm_tmpfile; - htsFile *fp; - htsFormat type; - htsThreadPool *threads; -@@ -51,6 +57,158 @@ - } - args_t; - -+static inline int is_escaped(const char *min, const char *str) -+{ -+ int n = 0; -+ while ( --str>=min && *str=='\\' ) n++; -+ return n%2; -+} -+static char *copy_and_update_contig_line(faidx_t *fai, char *line, void *chr_seen, kstring_t *dst) -+{ -+ kstring_t key = {0,0,0}, val = {0,0,0}, tmp = {0,0,0}; -+ char *chr_name = NULL, *p, *q = line + 9; // skip ##contig= -+ char *end = q; -+ int nopen = 1, chr_len = 0; -+ while ( *end && *end!='\n' ) end++; -+ while ( 
*q && *q!='\n' && nopen>0 ) -+ { -+ p = ++q; -+ while ( *q && (*q==' ' || *q=='\t') ) { p++; q++; } -+ // ^[A-Za-z_][0-9A-Za-z_.]*$ -+ if (p==q && *q && (isalpha(*q) || *q=='_')) -+ { -+ q++; -+ while ( *q && (isalnum(*q) || *q=='_' || *q=='.') ) q++; -+ } -+ int n = q-p; -+ int m = 0; -+ while ( *q && (*q==' ' || *q=='\t') ) { q++; m++; } -+ if ( *q!='=' || !n ) -+ { -+ char *x = q; -+ while ( *x && *x!='\n' ) x++; -+ *x = '\0'; -+ error("Could not parse the line: %s [%s][%s]\n", line,p,q); -+ } -+ key.l = 0; -+ kputsn(p,q-p-m,&key); -+ p = ++q; -+ while ( *q && (*q==' ' || *q=='\t') ) { p++; q++; } -+ int quoted = *p=='"' ? 1 : 0; -+ if ( quoted ) p++, q++; -+ while ( *q && *q != '\n' ) -+ { -+ if ( quoted ) { if ( *q=='"' && !is_escaped(p,q) ) break; } -+ else -+ { -+ if ( *q=='<' ) nopen++; -+ if ( *q=='>' ) nopen--; -+ if ( !nopen ) break; -+ if ( *q==',' && nopen==1 ) break; -+ } -+ q++; -+ } -+ char *r = q; -+ while ( r > p && r[-1] == ' ' ) r--; -+ val.l = 0; -+ kputsn(p,r-p,&val); -+ if ( quoted && *q=='"' ) q++; -+ if ( *q=='>' ) { nopen--; q++; } -+ if ( !strcmp("length",key.s) ) continue; -+ if ( !strcmp("ID",key.s) ) -+ { -+ if ( khash_str2int_has_key(chr_seen,val.s) ) continue; -+ chr_len = faidx_seq_len(fai, val.s); -+ if ( chr_len==-1 ) -+ { -+ free(val.s); free(key.s); free(tmp.s); -+ return end; // the sequence is not in fai, remove -+ } -+ chr_name = strdup(val.s); -+ khash_str2int_inc(chr_seen, chr_name); -+ continue; -+ } -+ kputc(',',&tmp); -+ kputs(key.s,&tmp); -+ kputc('=',&tmp); -+ if ( quoted ) kputc('"',&tmp); -+ kputs(val.s,&tmp); -+ if ( quoted ) kputc('"',&tmp); -+ } -+ if ( !chr_name ) return end; -+ ksprintf(dst,"##contig=",chr_name,chr_len,tmp.l ? 
tmp.s : ""); -+ free(key.s); free(val.s); free(tmp.s); -+ return q; -+} -+static void update_from_fai(args_t *args) -+{ -+ if ( !strcmp("-",args->fname) ) -+ error("Cannot use the --fai option when reading from standard input.\n"); -+ -+ faidx_t *fai = fai_load3(args->fai_fname,args->fai_fname,NULL,FAI_FASTA); -+ if ( !fai ) error("Could not parse %s\n", args->fai_fname); -+#ifdef _WIN32 -+ char tmp_path[MAX_PATH]; -+ int ret = GetTempPath(MAX_PATH, tmp_path); -+ if (!ret || ret > MAX_PATH) -+ error("Could not get the path to the temporary folder\n"); -+ if (strlen(tmp_path) + strlen("/bcftools-fai-header-XXXXXX") >= MAX_PATH) -+ error("Full path to the temporary folder is too long\n"); -+ strcat(tmp_path, "/bcftools-fai-header-XXXXXX"); -+ args->rm_tmpfile = strdup(tmp_path); -+#else -+ args->rm_tmpfile = strdup("/tmp/bcftools-fai-header-XXXXXX"); -+#endif -+ int fd = mkstemp(args->rm_tmpfile); -+ if ( fd<0 ) error("Could not open a temporary file for writing: %s\n", args->rm_tmpfile); -+ -+ // get a template header: either from the original VCF or from --header -+ char *ori_hdr_fname = args->header_fname ? 
args->header_fname : args->fname; -+ htsFile *fp = hts_open(ori_hdr_fname,"r"); -+ if ( !fp ) error("Failed to open: %s\n", ori_hdr_fname); -+ bcf_hdr_t *hdr = bcf_hdr_read(fp); -+ if ( !hdr ) error("Failed to read the header: %s\n", ori_hdr_fname); -+ hts_close(fp); // no need to check the return status here -+ -+ // put the header in a text buffer -+ kstring_t hdr_txt_ori = {0,0,0}, hdr_txt_new = {0,0,0}; -+ bcf_hdr_format(hdr, 0, &hdr_txt_ori); -+ bcf_hdr_destroy(hdr); -+ -+ // update the existing contig lines and remove lines not present in the fai file -+ void *chr_seen = khash_str2int_init(); -+ char *tmp, *beg = hdr_txt_ori.s; -+ while ( beg && *beg ) -+ { -+ tmp = strstr(beg, "\n##contig=<"); -+ if ( !tmp ) break; -+ kputsn(beg, tmp-beg+1, &hdr_txt_new); -+ size_t l_prev = hdr_txt_new.l; -+ beg = copy_and_update_contig_line(fai,tmp+1,chr_seen, &hdr_txt_new); -+ if ( l_prev==hdr_txt_new.l ) hdr_txt_new.l--; // nothing was added, remove the newline -+ } -+ if ( !beg || !(tmp=strstr(beg,"\n#CHROM")) ) error("Failed to parse the header, #CHROM not found\n"); -+ kputsn(beg, tmp-beg+1, &hdr_txt_new); -+ -+ // add any new contig lines -+ int i, n = faidx_nseq(fai); -+ for (i=0; i\n",faidx_iseq(fai,i),faidx_seq_len(fai,faidx_iseq(fai,i))); -+ } -+ kputs(tmp+1,&hdr_txt_new); -+ -+ if ( write(fd, hdr_txt_new.s, hdr_txt_new.l)!=hdr_txt_new.l ) error("Failed to write %zu bytes to %s\n", hdr_txt_new.l,args->rm_tmpfile); -+ if ( close(fd)!=0 ) error("Failed to close %s\n", args->rm_tmpfile); -+ args->header_fname = args->rm_tmpfile; -+ -+ free(hdr_txt_ori.s); -+ free(hdr_txt_new.s); -+ fai_destroy(fai); -+ khash_str2int_destroy_free(chr_seen); -+} -+ - static void read_header_file(char *fname, kstring_t *hdr) - { - kstring_t tmp = {0,0,0}; -@@ -313,8 +471,8 @@ - kputc('\n',&fp->line); - if ( write(out, fp->line.s, fp->line.l)!=fp->line.l ) error("Failed to write %"PRIu64" bytes\n", (uint64_t)fp->line.l); - } -- hts_close(fp); -- close(out); -+ if ( hts_close(fp)!=0 ) 
error("[%s] Error: close failed .. %s\n", __func__,args->fname); -+ if ( close(out)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname); - } - - static bcf_hdr_t *strip_header(bcf_hdr_t *src, bcf_hdr_t *dst) -@@ -346,12 +504,14 @@ - if ( j>=0 ) - { - j = atoi(src_hrec->vals[j]); -- hrec_add_idx(tmp, j); -+ if (hrec_add_idx(tmp, j) < 0) -+ error_errno("[%s] Failed to add IDX header", __func__); - } - bcf_hdr_add_hrec(out, tmp); - } - } -- bcf_hdr_sync(out); -+ if (bcf_hdr_sync(out) < 0) -+ error_errno("[%s] Failed to update header", __func__); - for (i=0; inhrec; i++) - { - // finally add new structured fields -@@ -375,11 +535,10 @@ - - if ( args->n_threads > 0 ) - { -- args->threads = calloc(1, sizeof(*args->threads)); -+ args->threads = (htsThreadPool *) calloc(1, sizeof(htsThreadPool)); - if ( !args->threads ) error("Could not allocate memory\n"); - if ( !(args->threads->pool = hts_tpool_init(args->n_threads)) ) error("Could not initialize threading\n"); -- BGZF *bgzf = hts_get_bgzfp(fp); -- if ( bgzf ) bgzf_thread_pool(bgzf, args->threads->pool, args->threads->qsize); -+ hts_set_thread_pool(fp, args->threads); - } - - bcf_hdr_t *hdr = bcf_hdr_read(fp); if ( !hdr ) error("Failed to read the header: %s\n", args->fname); -@@ -410,11 +569,8 @@ - htsFile *fp_out = hts_open(args->output_fname ? args->output_fname : "-",is_compressed ? "wb" : "wbu"); - if ( !fp_out ) error("%s: %s\n", args->output_fname ? args->output_fname : "-", strerror(errno)); - if ( args->threads ) -- { -- BGZF *bgzf = hts_get_bgzfp(fp_out); -- if ( bgzf ) bgzf_thread_pool(bgzf, args->threads->pool, args->threads->qsize); -- } -- bcf_hdr_write(fp_out, hdr_out); -+ hts_set_thread_pool(fp_out, args->threads); -+ if ( bcf_hdr_write(fp_out, hdr_out)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->output_fname ? 
args->output_fname : "standard output"); - - bcf1_t *rec = bcf_init(); - while ( bcf_read(fp, hdr, rec)==0 ) -@@ -459,13 +615,13 @@ - if ( i!=rec->n_fmt ) - error("The FORMAT tag is not defined: \"%s\"\n", bcf_hdr_int2id(hdr,BCF_DT_ID,rec->d.fmt[i].id)); - -- bcf_write(fp_out,hdr_out,rec); -+ if ( bcf_write(fp_out,hdr_out,rec)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname ? args->output_fname : "standard output"); - } - bcf_destroy(rec); - - free(htxt.s); -- hts_close(fp_out); -- hts_close(fp); -+ if ( hts_close(fp_out)!=0 ) error("[%s] Error: failed to close the file %s\n",__func__,args->output_fname ? args->output_fname : "standard output"); -+ if ( hts_close(fp)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->fname); - bcf_hdr_destroy(hdr_out); - bcf_hdr_destroy(hdr); - if ( args->threads ) -@@ -483,10 +639,21 @@ - fprintf(stderr, "Usage: bcftools reheader [OPTIONS] \n"); - fprintf(stderr, "\n"); - fprintf(stderr, "Options:\n"); -+ fprintf(stderr, " -f, --fai update sequences and their lengths from the .fai file\n"); - fprintf(stderr, " -h, --header new header\n"); - fprintf(stderr, " -o, --output write output to a file [standard output]\n"); - fprintf(stderr, " -s, --samples new sample names\n"); -- fprintf(stderr, " --threads number of extra compression threads (BCF only) [0]\n"); -+ fprintf(stderr, " --threads use multithreading with worker threads (BCF only) [0]\n"); -+ fprintf(stderr, "\n"); -+ fprintf(stderr, "Example:\n"); -+ fprintf(stderr, " # Write out the header to be modified\n"); -+ fprintf(stderr, " bcftools view -h old.bcf > header.txt\n"); -+ fprintf(stderr, "\n"); -+ fprintf(stderr, " # Edit the header using your favorite text editor\n"); -+ fprintf(stderr, " vi header.txt\n"); -+ fprintf(stderr, "\n"); -+ fprintf(stderr, " # Reheader the file\n"); -+ fprintf(stderr, " bcftools reheader -h header.txt -o new.bcf old.bcf\n"); - fprintf(stderr, "\n"); - exit(1); - } -@@ -499,21 +666,23 @@ - - static struct 
option loptions[] = - { -+ {"fai",1,0,'f'}, - {"output",1,0,'o'}, - {"header",1,0,'h'}, - {"samples",1,0,'s'}, - {"threads",1,NULL,1}, - {0,0,0,0} - }; -- while ((c = getopt_long(argc, argv, "s:h:o:",loptions,NULL)) >= 0) -+ while ((c = getopt_long(argc, argv, "s:h:o:f:",loptions,NULL)) >= 0) - { - switch (c) - { - case 1 : args->n_threads = strtol(optarg, 0, 0); break; -+ case 'f': args->fai_fname = optarg; break; - case 'o': args->output_fname = optarg; break; - case 's': args->samples_fname = optarg; break; - case 'h': args->header_fname = optarg; break; -- case '?': usage(args); -+ case '?': usage(args); break; - default: error("Unknown argument: %s\n", optarg); - } - } -@@ -525,11 +694,12 @@ - } - else args->fname = argv[optind]; - -+ if ( args->fai_fname ) update_from_fai(args); - if ( !args->samples_fname && !args->header_fname ) usage(args); - if ( !args->fname ) usage(args); - - args->fp = hts_open(args->fname,"r"); -- if ( !args->fp ) error("Failed to open: %s\n", args->fname); -+ if ( !args->fp ) error("Failed to read from %s\n", !strcmp("-",args->fname)?"standard input":args->fname); - args->type = *hts_get_format(args->fp); - - if ( args->type.format==vcf ) -@@ -542,6 +712,11 @@ - else - reheader_bcf(args, args->type.compression==bgzf || args->type.compression==gzip); - -+ if ( args->rm_tmpfile ) -+ { -+ unlink(args->rm_tmpfile); -+ free(args->rm_tmpfile); -+ } - free(args); - return 0; - } ---- python-pysam.orig/bcftools/reheader.c.pysam.c -+++ python-pysam/bcftools/reheader.c.pysam.c -@@ -35,17 +35,23 @@ - #include - #include - #include -+#ifdef _WIN32 -+#include -+#endif - #include - #include - #include // for hts_get_bgzfp() - #include - #include -+#include -+#include - #include "bcftools.h" - #include "khash_str2str.h" - - typedef struct _args_t - { - char **argv, *fname, *samples_fname, *header_fname, *output_fname; -+ char *fai_fname, *rm_tmpfile; - htsFile *fp; - htsFormat type; - htsThreadPool *threads; -@@ -53,6 +59,158 @@ - } - args_t; - 
-+static inline int is_escaped(const char *min, const char *str) -+{ -+ int n = 0; -+ while ( --str>=min && *str=='\\' ) n++; -+ return n%2; -+} -+static char *copy_and_update_contig_line(faidx_t *fai, char *line, void *chr_seen, kstring_t *dst) -+{ -+ kstring_t key = {0,0,0}, val = {0,0,0}, tmp = {0,0,0}; -+ char *chr_name = NULL, *p, *q = line + 9; // skip ##contig= -+ char *end = q; -+ int nopen = 1, chr_len = 0; -+ while ( *end && *end!='\n' ) end++; -+ while ( *q && *q!='\n' && nopen>0 ) -+ { -+ p = ++q; -+ while ( *q && (*q==' ' || *q=='\t') ) { p++; q++; } -+ // ^[A-Za-z_][0-9A-Za-z_.]*$ -+ if (p==q && *q && (isalpha(*q) || *q=='_')) -+ { -+ q++; -+ while ( *q && (isalnum(*q) || *q=='_' || *q=='.') ) q++; -+ } -+ int n = q-p; -+ int m = 0; -+ while ( *q && (*q==' ' || *q=='\t') ) { q++; m++; } -+ if ( *q!='=' || !n ) -+ { -+ char *x = q; -+ while ( *x && *x!='\n' ) x++; -+ *x = '\0'; -+ error("Could not parse the line: %s [%s][%s]\n", line,p,q); -+ } -+ key.l = 0; -+ kputsn(p,q-p-m,&key); -+ p = ++q; -+ while ( *q && (*q==' ' || *q=='\t') ) { p++; q++; } -+ int quoted = *p=='"' ? 
1 : 0; -+ if ( quoted ) p++, q++; -+ while ( *q && *q != '\n' ) -+ { -+ if ( quoted ) { if ( *q=='"' && !is_escaped(p,q) ) break; } -+ else -+ { -+ if ( *q=='<' ) nopen++; -+ if ( *q=='>' ) nopen--; -+ if ( !nopen ) break; -+ if ( *q==',' && nopen==1 ) break; -+ } -+ q++; -+ } -+ char *r = q; -+ while ( r > p && r[-1] == ' ' ) r--; -+ val.l = 0; -+ kputsn(p,r-p,&val); -+ if ( quoted && *q=='"' ) q++; -+ if ( *q=='>' ) { nopen--; q++; } -+ if ( !strcmp("length",key.s) ) continue; -+ if ( !strcmp("ID",key.s) ) -+ { -+ if ( khash_str2int_has_key(chr_seen,val.s) ) continue; -+ chr_len = faidx_seq_len(fai, val.s); -+ if ( chr_len==-1 ) -+ { -+ free(val.s); free(key.s); free(tmp.s); -+ return end; // the sequence is not in fai, remove -+ } -+ chr_name = strdup(val.s); -+ khash_str2int_inc(chr_seen, chr_name); -+ continue; -+ } -+ kputc(',',&tmp); -+ kputs(key.s,&tmp); -+ kputc('=',&tmp); -+ if ( quoted ) kputc('"',&tmp); -+ kputs(val.s,&tmp); -+ if ( quoted ) kputc('"',&tmp); -+ } -+ if ( !chr_name ) return end; -+ ksprintf(dst,"##contig=",chr_name,chr_len,tmp.l ? 
tmp.s : ""); -+ free(key.s); free(val.s); free(tmp.s); -+ return q; -+} -+static void update_from_fai(args_t *args) -+{ -+ if ( !strcmp("-",args->fname) ) -+ error("Cannot use the --fai option when reading from standard input.\n"); -+ -+ faidx_t *fai = fai_load3(args->fai_fname,args->fai_fname,NULL,FAI_FASTA); -+ if ( !fai ) error("Could not parse %s\n", args->fai_fname); -+#ifdef _WIN32 -+ char tmp_path[MAX_PATH]; -+ int ret = GetTempPath(MAX_PATH, tmp_path); -+ if (!ret || ret > MAX_PATH) -+ error("Could not get the path to the temporary folder\n"); -+ if (strlen(tmp_path) + strlen("/bcftools-fai-header-XXXXXX") >= MAX_PATH) -+ error("Full path to the temporary folder is too long\n"); -+ strcat(tmp_path, "/bcftools-fai-header-XXXXXX"); -+ args->rm_tmpfile = strdup(tmp_path); -+#else -+ args->rm_tmpfile = strdup("/tmp/bcftools-fai-header-XXXXXX"); -+#endif -+ int fd = mkstemp(args->rm_tmpfile); -+ if ( fd<0 ) error("Could not open a temporary file for writing: %s\n", args->rm_tmpfile); -+ -+ // get a template header: either from the original VCF or from --header -+ char *ori_hdr_fname = args->header_fname ? 
args->header_fname : args->fname; -+ htsFile *fp = hts_open(ori_hdr_fname,"r"); -+ if ( !fp ) error("Failed to open: %s\n", ori_hdr_fname); -+ bcf_hdr_t *hdr = bcf_hdr_read(fp); -+ if ( !hdr ) error("Failed to read the header: %s\n", ori_hdr_fname); -+ hts_close(fp); // no need to check the return status here -+ -+ // put the header in a text buffer -+ kstring_t hdr_txt_ori = {0,0,0}, hdr_txt_new = {0,0,0}; -+ bcf_hdr_format(hdr, 0, &hdr_txt_ori); -+ bcf_hdr_destroy(hdr); -+ -+ // update the existing contig lines and remove lines not present in the fai file -+ void *chr_seen = khash_str2int_init(); -+ char *tmp, *beg = hdr_txt_ori.s; -+ while ( beg && *beg ) -+ { -+ tmp = strstr(beg, "\n##contig=<"); -+ if ( !tmp ) break; -+ kputsn(beg, tmp-beg+1, &hdr_txt_new); -+ size_t l_prev = hdr_txt_new.l; -+ beg = copy_and_update_contig_line(fai,tmp+1,chr_seen, &hdr_txt_new); -+ if ( l_prev==hdr_txt_new.l ) hdr_txt_new.l--; // nothing was added, remove the newline -+ } -+ if ( !beg || !(tmp=strstr(beg,"\n#CHROM")) ) error("Failed to parse the header, #CHROM not found\n"); -+ kputsn(beg, tmp-beg+1, &hdr_txt_new); -+ -+ // add any new contig lines -+ int i, n = faidx_nseq(fai); -+ for (i=0; i\n",faidx_iseq(fai,i),faidx_seq_len(fai,faidx_iseq(fai,i))); -+ } -+ kputs(tmp+1,&hdr_txt_new); -+ -+ if ( write(fd, hdr_txt_new.s, hdr_txt_new.l)!=hdr_txt_new.l ) error("Failed to write %zu bytes to %s\n", hdr_txt_new.l,args->rm_tmpfile); -+ if ( close(fd)!=0 ) error("Failed to close %s\n", args->rm_tmpfile); -+ args->header_fname = args->rm_tmpfile; -+ -+ free(hdr_txt_ori.s); -+ free(hdr_txt_new.s); -+ fai_destroy(fai); -+ khash_str2int_destroy_free(chr_seen); -+} -+ - static void read_header_file(char *fname, kstring_t *hdr) - { - kstring_t tmp = {0,0,0}; -@@ -315,8 +473,8 @@ - kputc('\n',&fp->line); - if ( write(out, fp->line.s, fp->line.l)!=fp->line.l ) error("Failed to write %"PRIu64" bytes\n", (uint64_t)fp->line.l); - } -- hts_close(fp); -- close(out); -+ if ( hts_close(fp)!=0 ) 
error("[%s] Error: close failed .. %s\n", __func__,args->fname); -+ if ( close(out)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname); - } - - static bcf_hdr_t *strip_header(bcf_hdr_t *src, bcf_hdr_t *dst) -@@ -348,12 +506,14 @@ - if ( j>=0 ) - { - j = atoi(src_hrec->vals[j]); -- hrec_add_idx(tmp, j); -+ if (hrec_add_idx(tmp, j) < 0) -+ error_errno("[%s] Failed to add IDX header", __func__); - } - bcf_hdr_add_hrec(out, tmp); - } - } -- bcf_hdr_sync(out); -+ if (bcf_hdr_sync(out) < 0) -+ error_errno("[%s] Failed to update header", __func__); - for (i=0; inhrec; i++) - { - // finally add new structured fields -@@ -377,11 +537,10 @@ - - if ( args->n_threads > 0 ) - { -- args->threads = calloc(1, sizeof(*args->threads)); -+ args->threads = (htsThreadPool *) calloc(1, sizeof(htsThreadPool)); - if ( !args->threads ) error("Could not allocate memory\n"); - if ( !(args->threads->pool = hts_tpool_init(args->n_threads)) ) error("Could not initialize threading\n"); -- BGZF *bgzf = hts_get_bgzfp(fp); -- if ( bgzf ) bgzf_thread_pool(bgzf, args->threads->pool, args->threads->qsize); -+ hts_set_thread_pool(fp, args->threads); - } - - bcf_hdr_t *hdr = bcf_hdr_read(fp); if ( !hdr ) error("Failed to read the header: %s\n", args->fname); -@@ -412,11 +571,8 @@ - htsFile *fp_out = hts_open(args->output_fname ? args->output_fname : "-",is_compressed ? "wb" : "wbu"); - if ( !fp_out ) error("%s: %s\n", args->output_fname ? args->output_fname : "-", strerror(errno)); - if ( args->threads ) -- { -- BGZF *bgzf = hts_get_bgzfp(fp_out); -- if ( bgzf ) bgzf_thread_pool(bgzf, args->threads->pool, args->threads->qsize); -- } -- bcf_hdr_write(fp_out, hdr_out); -+ hts_set_thread_pool(fp_out, args->threads); -+ if ( bcf_hdr_write(fp_out, hdr_out)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->output_fname ? 
args->output_fname : "standard output"); - - bcf1_t *rec = bcf_init(); - while ( bcf_read(fp, hdr, rec)==0 ) -@@ -461,13 +617,13 @@ - if ( i!=rec->n_fmt ) - error("The FORMAT tag is not defined: \"%s\"\n", bcf_hdr_int2id(hdr,BCF_DT_ID,rec->d.fmt[i].id)); - -- bcf_write(fp_out,hdr_out,rec); -+ if ( bcf_write(fp_out,hdr_out,rec)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname ? args->output_fname : "standard output"); - } - bcf_destroy(rec); - - free(htxt.s); -- hts_close(fp_out); -- hts_close(fp); -+ if ( hts_close(fp_out)!=0 ) error("[%s] Error: failed to close the file %s\n",__func__,args->output_fname ? args->output_fname : "standard output"); -+ if ( hts_close(fp)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->fname); - bcf_hdr_destroy(hdr_out); - bcf_hdr_destroy(hdr); - if ( args->threads ) -@@ -485,10 +641,21 @@ - fprintf(bcftools_stderr, "Usage: bcftools reheader [OPTIONS] \n"); - fprintf(bcftools_stderr, "\n"); - fprintf(bcftools_stderr, "Options:\n"); -+ fprintf(bcftools_stderr, " -f, --fai update sequences and their lengths from the .fai file\n"); - fprintf(bcftools_stderr, " -h, --header new header\n"); - fprintf(bcftools_stderr, " -o, --output write output to a file [standard output]\n"); - fprintf(bcftools_stderr, " -s, --samples new sample names\n"); -- fprintf(bcftools_stderr, " --threads number of extra compression threads (BCF only) [0]\n"); -+ fprintf(bcftools_stderr, " --threads use multithreading with worker threads (BCF only) [0]\n"); -+ fprintf(bcftools_stderr, "\n"); -+ fprintf(bcftools_stderr, "Example:\n"); -+ fprintf(bcftools_stderr, " # Write out the header to be modified\n"); -+ fprintf(bcftools_stderr, " bcftools view -h old.bcf > header.txt\n"); -+ fprintf(bcftools_stderr, "\n"); -+ fprintf(bcftools_stderr, " # Edit the header using your favorite text editor\n"); -+ fprintf(bcftools_stderr, " vi header.txt\n"); -+ fprintf(bcftools_stderr, "\n"); -+ fprintf(bcftools_stderr, " # Reheader the 
file\n"); -+ fprintf(bcftools_stderr, " bcftools reheader -h header.txt -o new.bcf old.bcf\n"); - fprintf(bcftools_stderr, "\n"); - exit(1); - } -@@ -501,21 +668,23 @@ - - static struct option loptions[] = - { -+ {"fai",1,0,'f'}, - {"output",1,0,'o'}, - {"header",1,0,'h'}, - {"samples",1,0,'s'}, - {"threads",1,NULL,1}, - {0,0,0,0} - }; -- while ((c = getopt_long(argc, argv, "s:h:o:",loptions,NULL)) >= 0) -+ while ((c = getopt_long(argc, argv, "s:h:o:f:",loptions,NULL)) >= 0) - { - switch (c) - { - case 1 : args->n_threads = strtol(optarg, 0, 0); break; -+ case 'f': args->fai_fname = optarg; break; - case 'o': args->output_fname = optarg; break; - case 's': args->samples_fname = optarg; break; - case 'h': args->header_fname = optarg; break; -- case '?': usage(args); -+ case '?': usage(args); break; - default: error("Unknown argument: %s\n", optarg); - } - } -@@ -527,11 +696,12 @@ - } - else args->fname = argv[optind]; - -+ if ( args->fai_fname ) update_from_fai(args); - if ( !args->samples_fname && !args->header_fname ) usage(args); - if ( !args->fname ) usage(args); - - args->fp = hts_open(args->fname,"r"); -- if ( !args->fp ) error("Failed to open: %s\n", args->fname); -+ if ( !args->fp ) error("Failed to read from %s\n", !strcmp("-",args->fname)?"standard input":args->fname); - args->type = *hts_get_format(args->fp); - - if ( args->type.format==vcf ) -@@ -544,6 +714,11 @@ - else - reheader_bcf(args, args->type.compression==bgzf || args->type.compression==gzip); - -+ if ( args->rm_tmpfile ) -+ { -+ unlink(args->rm_tmpfile); -+ free(args->rm_tmpfile); -+ } - free(args); - return 0; - } ---- python-pysam.orig/bcftools/smpl_ilist.c -+++ python-pysam/bcftools/smpl_ilist.c -@@ -22,15 +22,29 @@ - THE SOFTWARE. 
- */ - -+#include - #include "bcftools.h" - #include "smpl_ilist.h" - - void smpl_ilist_destroy(smpl_ilist_t *smpl) - { -+ int i; -+ if ( smpl->pair ) -+ { -+ for (i=0; in; i++) free(smpl->pair[i]); -+ free(smpl->pair); -+ } - free(smpl->idx); - free(smpl); - } - -+static inline int is_space_or_escaped(const char *min, const char *str) -+{ -+ if ( !isspace(*str) ) return 0; -+ int n = 0; -+ while ( --str>=min && *str=='\\' ) n++; -+ return n%2 ? 0 : 1; -+} - smpl_ilist_t *smpl_ilist_init(bcf_hdr_t *hdr, char *sample_list, int is_file, int flags) - { - smpl_ilist_t *smpl = (smpl_ilist_t*) calloc(1,sizeof(smpl_ilist_t)); -@@ -44,32 +58,63 @@ - return smpl; - } - -+ int negate = sample_list[0]=='^' ? 1 : 0; - int nlist; -- char **list = hts_readlist(sample_list[0]=='^'?sample_list+1:sample_list, is_file, &nlist); -+ char **list = hts_readlist(negate?sample_list+1:sample_list, is_file, &nlist); - if ( !list ) error("Could not parse %s\n", sample_list); - - // preserve the VCF order - int *tmp = (int*)calloc(bcf_hdr_nsamples(hdr),sizeof(int)); -+ char **pair = NULL; - for (i=0; i=0 ) -+ char *smpl1 = list[i]; -+ char *smpl2 = NULL; -+ -+ char *ptr = list[i]; -+ while ( *ptr && !is_space_or_escaped(list[i], ptr) ) ptr++; -+ if ( *ptr ) -+ { -+ *ptr = 0; -+ smpl2 = ptr+1; -+ } -+ -+ char *smpl_name = flags&SMPL_PAIR2 && smpl2 ? 
smpl2 : smpl1; -+ int idx = bcf_hdr_id2int(hdr, BCF_DT_SAMPLE, smpl_name); -+ if ( idx<0 ) - { -- tmp[idx] = 1; -- smpl->n++; -+ if ( !(flags&SMPL_STRICT) ) -+ { -+ if ( flags&SMPL_VERBOSE ) fprintf(stderr,"No such sample: \"%s\"\n",smpl_name); -+ continue; -+ } -+ error("No such sample: \"%s\"\n", smpl_name); - } -- else if ( flags&SMPL_STRICT ) -- error("No such sample: %s\n", list[i]); -+ -+ tmp[idx] = 1; -+ if ( smpl2 ) -+ { -+ if ( !pair ) pair = (char**)calloc(bcf_hdr_nsamples(hdr),sizeof(char*)); -+ if ( flags&SMPL_PAIR2 ) pair[idx] = strdup(smpl1); -+ else if ( flags&SMPL_PAIR1 ) pair[idx] = strdup(smpl2); -+ } -+ smpl->n++; - } - -- if ( sample_list[0]=='^' ) smpl->n = bcf_hdr_nsamples(hdr) - smpl->n; -+ if ( negate ) smpl->n = bcf_hdr_nsamples(hdr) - smpl->n; - smpl->idx = (int*) malloc(sizeof(int)*smpl->n); - - int j = 0; -- if ( sample_list[0]!='^' ) -+ if ( !negate ) - { -+ if ( pair ) smpl->pair = (char**) calloc(smpl->n,sizeof(char*)); - for (i=0; iidx[j++] = i; -+ { -+ if ( !tmp[i] ) continue; -+ smpl->idx[j] = i; -+ if ( pair && pair[i] ) smpl->pair[j] = pair[i]; -+ j++; -+ } - } - else - { -@@ -78,6 +123,7 @@ - } - - free(tmp); -+ free(pair); - for (i=0; i - #include "bcftools.h" - #include "smpl_ilist.h" - - void smpl_ilist_destroy(smpl_ilist_t *smpl) - { -+ int i; -+ if ( smpl->pair ) -+ { -+ for (i=0; in; i++) free(smpl->pair[i]); -+ free(smpl->pair); -+ } - free(smpl->idx); - free(smpl); - } - -+static inline int is_space_or_escaped(const char *min, const char *str) -+{ -+ if ( !isspace(*str) ) return 0; -+ int n = 0; -+ while ( --str>=min && *str=='\\' ) n++; -+ return n%2 ? 0 : 1; -+} - smpl_ilist_t *smpl_ilist_init(bcf_hdr_t *hdr, char *sample_list, int is_file, int flags) - { - smpl_ilist_t *smpl = (smpl_ilist_t*) calloc(1,sizeof(smpl_ilist_t)); -@@ -46,32 +60,63 @@ - return smpl; - } - -+ int negate = sample_list[0]=='^' ? 
1 : 0; - int nlist; -- char **list = hts_readlist(sample_list[0]=='^'?sample_list+1:sample_list, is_file, &nlist); -+ char **list = hts_readlist(negate?sample_list+1:sample_list, is_file, &nlist); - if ( !list ) error("Could not parse %s\n", sample_list); - - // preserve the VCF order - int *tmp = (int*)calloc(bcf_hdr_nsamples(hdr),sizeof(int)); -+ char **pair = NULL; - for (i=0; i=0 ) -+ char *smpl1 = list[i]; -+ char *smpl2 = NULL; -+ -+ char *ptr = list[i]; -+ while ( *ptr && !is_space_or_escaped(list[i], ptr) ) ptr++; -+ if ( *ptr ) -+ { -+ *ptr = 0; -+ smpl2 = ptr+1; -+ } -+ -+ char *smpl_name = flags&SMPL_PAIR2 && smpl2 ? smpl2 : smpl1; -+ int idx = bcf_hdr_id2int(hdr, BCF_DT_SAMPLE, smpl_name); -+ if ( idx<0 ) - { -- tmp[idx] = 1; -- smpl->n++; -+ if ( !(flags&SMPL_STRICT) ) -+ { -+ if ( flags&SMPL_VERBOSE ) fprintf(bcftools_stderr,"No such sample: \"%s\"\n",smpl_name); -+ continue; -+ } -+ error("No such sample: \"%s\"\n", smpl_name); - } -- else if ( flags&SMPL_STRICT ) -- error("No such sample: %s\n", list[i]); -+ -+ tmp[idx] = 1; -+ if ( smpl2 ) -+ { -+ if ( !pair ) pair = (char**)calloc(bcf_hdr_nsamples(hdr),sizeof(char*)); -+ if ( flags&SMPL_PAIR2 ) pair[idx] = strdup(smpl1); -+ else if ( flags&SMPL_PAIR1 ) pair[idx] = strdup(smpl2); -+ } -+ smpl->n++; - } - -- if ( sample_list[0]=='^' ) smpl->n = bcf_hdr_nsamples(hdr) - smpl->n; -+ if ( negate ) smpl->n = bcf_hdr_nsamples(hdr) - smpl->n; - smpl->idx = (int*) malloc(sizeof(int)*smpl->n); - - int j = 0; -- if ( sample_list[0]!='^' ) -+ if ( !negate ) - { -+ if ( pair ) smpl->pair = (char**) calloc(smpl->n,sizeof(char*)); - for (i=0; iidx[j++] = i; -+ { -+ if ( !tmp[i] ) continue; -+ smpl->idx[j] = i; -+ if ( pair && pair[i] ) smpl->pair[j] = pair[i]; -+ j++; -+ } - } - else - { -@@ -80,6 +125,7 @@ - } - - free(tmp); -+ free(pair); - for (i=0; i - --#define SMPL_NONE 0 // flexible error recovery --#define SMPL_STRICT 1 // samples must exist -+#define SMPL_NONE 0 // flexible error recovery -+#define 
SMPL_STRICT 1 // samples must exist -+#define SMPL_SINGLE 2 // single sample expected -+#define SMPL_PAIR1 4 // two samples expected, the first is from the bcf hdr -+#define SMPL_PAIR2 8 // two samples expected, the second is from the bcf hdr -+#define SMPL_VERBOSE 16 // print warnings - - typedef struct - { -- int *idx; // index to bcf_hdr_t.samples -+ char **pair; // the other sample in the pair -+ int *idx; // index to bcf_hdr_t.samples; the first (SMPL_SINGLE|SMPL_PAIR1) or second sample (SMPL_PAIR2) - int n; - } - smpl_ilist_t; ---- python-pysam.orig/bcftools/tabix.c -+++ python-pysam/bcftools/tabix.c -@@ -27,6 +27,7 @@ - #include - #include - #include -+#include - #include - #include - -@@ -84,7 +85,6 @@ - { - // auto-detect file type by file name - int l = strlen(argv[optind]); -- int strcasecmp(const char *s1, const char *s2); - if (l>=7 && strcasecmp(argv[optind]+l-7, ".gff.gz") == 0) conf = tbx_conf_gff; - else if (l>=7 && strcasecmp(argv[optind]+l-7, ".bed.gz") == 0) conf = tbx_conf_bed; - else if (l>=7 && strcasecmp(argv[optind]+l-7, ".sam.gz") == 0) conf = tbx_conf_sam; ---- python-pysam.orig/bcftools/tabix.c.pysam.c -+++ python-pysam/bcftools/tabix.c.pysam.c -@@ -29,6 +29,7 @@ - #include - #include - #include -+#include - #include - #include - -@@ -86,7 +87,6 @@ - { - // auto-detect file type by file name - int l = strlen(argv[optind]); -- int strcasecmp(const char *s1, const char *s2); - if (l>=7 && strcasecmp(argv[optind]+l-7, ".gff.gz") == 0) conf = tbx_conf_gff; - else if (l>=7 && strcasecmp(argv[optind]+l-7, ".bed.gz") == 0) conf = tbx_conf_bed; - else if (l>=7 && strcasecmp(argv[optind]+l-7, ".sam.gz") == 0) conf = tbx_conf_sam; ---- python-pysam.orig/bcftools/test/test-regidx.c -+++ python-pysam/bcftools/test/test-regidx.c -@@ -32,6 +32,7 @@ - #include - #include - #include -+#include - #include - #include "regidx.h" - -@@ -225,6 +226,54 @@ - regidx_destroy(idx); - free(str.s); - } -+void test_explicit(char *tgt, char *qry, char *exp) -+{ -+ 
regidx_t *idx = regidx_init(NULL,regidx_parse_reg,NULL,0,NULL); -+ -+ char *beg = tgt, *end, *exp_ori = exp; -+ kstring_t str = {0,0,0}; -+ while ( *beg ) -+ { -+ end = tgt; -+ while ( *end && *end!=';' ) end++; -+ str.l = 0; -+ kputsn(beg, end-beg, &str); -+ debug("insert: %s\n", str.s); -+ if ( regidx_insert(idx,str.s)!=0 ) error("insert failed: %s\n", str.s); -+ beg = *end ? end + 1 : end; -+ } -+ -+ beg = qry; -+ while ( *beg ) -+ { -+ end = qry; -+ while ( *end && *end!=';' ) end++; -+ str.l = 0; -+ kputsn(beg, end-beg, &str); -+ beg = *end ? end + 1 : end; -+ -+ char *chr_beg, *chr_end; -+ uint32_t reg_beg, reg_end; -+ if ( regidx_parse_reg(str.s, &chr_beg, &chr_end, ®_beg, ®_end, NULL, NULL)!=0 ) error("could not parse: %s in %s\n", str.s, qry); -+ chr_end[1] = 0; -+ int hit = regidx_overlap(idx,chr_beg,reg_beg,reg_end,NULL); -+ if ( *exp=='1' ) -+ { -+ if ( !hit ) error("query failed, there should be a hit .. %s:%d-%d\n",chr_beg,reg_beg+1,reg_end+1); -+ debug("ok: overlap found for %s:%d-%d\n",chr_beg,reg_beg+1,reg_end+1); -+ } -+ else if ( *exp=='0' ) -+ { -+ if ( hit ) error("query failed, there should be no hit .. 
%s:%d-%d\n",chr_beg,reg_beg+1,reg_end+1); -+ debug("ok: no overlap found for %s:%d-%d\n",chr_beg,reg_beg+1,reg_end+1); -+ } -+ else error("could not parse: %s\n", exp_ori); -+ exp++; -+ } -+ -+ free(str.s); -+ regidx_destroy(idx); -+} - - void create_line_bed(char *line, char *chr, int start, int end) - { -@@ -259,6 +308,11 @@ - set_line(line,chr,start,end); - debug("insert: %s", line); - if ( regidx_insert(idx,line)!=0 ) error("insert failed: %s\n", line); -+ -+ start = 20000*i; end = start + 2000; -+ set_line(line,chr,start,end); -+ debug("insert: %s", line); -+ if ( regidx_insert(idx,line)!=0 ) error("insert failed: %s\n", line); - } - - regitr_t *itr = regitr_init(idx); -@@ -311,6 +365,19 @@ - } - if ( nhit!=2 ) error("query failed, expected two hits, found %d: %s:%d-%d\n",nhit,chr,start,end); - -+ // fully contained interval, one hit -+ start = 20000*i - 5000; end = 20000*i + 3000; -+ set_line(line,chr,start,end); -+ if ( !regidx_overlap(idx,chr,start-1,end-1,itr) ) error("query failed, there should be a hit: %s:%d-%d\n",chr,start,end); -+ debug("ok: overlap(s) found for %s:%d-%d\n",chr,start,end); -+ nhit = 0; -+ while ( regitr_overlap(itr) ) -+ { -+ if ( itr->beg > end-1 || itr->end < start-1 ) error("query failed, incorrect region: %d-%d for %d-%d\n",itr->beg+1,itr->end+1,start,end); -+ debug("\t %d-%d\n",itr->beg+1,itr->end+1); -+ nhit++; -+ } -+ if ( nhit!=1 ) error("query failed, expected one hit, found %d: %s:%d-%d\n",nhit,chr,start,end); - } - regitr_destroy(itr); - regidx_destroy(idx); -@@ -363,6 +430,9 @@ - info("Testing custom payload\n"); - test_custom_payload(); - -+ info("Testing cases encountered in past\n"); -+ test_explicit("12:2064519-2064763","12:2064488-2067434","1"); -+ - int i, ntest = 1000, nreg = 50; - srandom(seed); - info("%d randomized tests, %d regions per test. 
Random seed is %d\n", ntest,nreg,seed); ---- python-pysam.orig/bcftools/test/test-regidx.c.pysam.c -+++ python-pysam/bcftools/test/test-regidx.c.pysam.c -@@ -34,6 +34,7 @@ - #include - #include - #include -+#include - #include - #include "regidx.h" - -@@ -227,6 +228,54 @@ - regidx_destroy(idx); - free(str.s); - } -+void test_explicit(char *tgt, char *qry, char *exp) -+{ -+ regidx_t *idx = regidx_init(NULL,regidx_parse_reg,NULL,0,NULL); -+ -+ char *beg = tgt, *end, *exp_ori = exp; -+ kstring_t str = {0,0,0}; -+ while ( *beg ) -+ { -+ end = tgt; -+ while ( *end && *end!=';' ) end++; -+ str.l = 0; -+ kputsn(beg, end-beg, &str); -+ debug("insert: %s\n", str.s); -+ if ( regidx_insert(idx,str.s)!=0 ) error("insert failed: %s\n", str.s); -+ beg = *end ? end + 1 : end; -+ } -+ -+ beg = qry; -+ while ( *beg ) -+ { -+ end = qry; -+ while ( *end && *end!=';' ) end++; -+ str.l = 0; -+ kputsn(beg, end-beg, &str); -+ beg = *end ? end + 1 : end; -+ -+ char *chr_beg, *chr_end; -+ uint32_t reg_beg, reg_end; -+ if ( regidx_parse_reg(str.s, &chr_beg, &chr_end, ®_beg, ®_end, NULL, NULL)!=0 ) error("could not parse: %s in %s\n", str.s, qry); -+ chr_end[1] = 0; -+ int hit = regidx_overlap(idx,chr_beg,reg_beg,reg_end,NULL); -+ if ( *exp=='1' ) -+ { -+ if ( !hit ) error("query failed, there should be a hit .. %s:%d-%d\n",chr_beg,reg_beg+1,reg_end+1); -+ debug("ok: overlap found for %s:%d-%d\n",chr_beg,reg_beg+1,reg_end+1); -+ } -+ else if ( *exp=='0' ) -+ { -+ if ( hit ) error("query failed, there should be no hit .. 
%s:%d-%d\n",chr_beg,reg_beg+1,reg_end+1); -+ debug("ok: no overlap found for %s:%d-%d\n",chr_beg,reg_beg+1,reg_end+1); -+ } -+ else error("could not parse: %s\n", exp_ori); -+ exp++; -+ } -+ -+ free(str.s); -+ regidx_destroy(idx); -+} - - void create_line_bed(char *line, char *chr, int start, int end) - { -@@ -261,6 +310,11 @@ - set_line(line,chr,start,end); - debug("insert: %s", line); - if ( regidx_insert(idx,line)!=0 ) error("insert failed: %s\n", line); -+ -+ start = 20000*i; end = start + 2000; -+ set_line(line,chr,start,end); -+ debug("insert: %s", line); -+ if ( regidx_insert(idx,line)!=0 ) error("insert failed: %s\n", line); - } - - regitr_t *itr = regitr_init(idx); -@@ -313,6 +367,19 @@ - } - if ( nhit!=2 ) error("query failed, expected two hits, found %d: %s:%d-%d\n",nhit,chr,start,end); - -+ // fully contained interval, one hit -+ start = 20000*i - 5000; end = 20000*i + 3000; -+ set_line(line,chr,start,end); -+ if ( !regidx_overlap(idx,chr,start-1,end-1,itr) ) error("query failed, there should be a hit: %s:%d-%d\n",chr,start,end); -+ debug("ok: overlap(s) found for %s:%d-%d\n",chr,start,end); -+ nhit = 0; -+ while ( regitr_overlap(itr) ) -+ { -+ if ( itr->beg > end-1 || itr->end < start-1 ) error("query failed, incorrect region: %d-%d for %d-%d\n",itr->beg+1,itr->end+1,start,end); -+ debug("\t %d-%d\n",itr->beg+1,itr->end+1); -+ nhit++; -+ } -+ if ( nhit!=1 ) error("query failed, expected one hit, found %d: %s:%d-%d\n",nhit,chr,start,end); - } - regitr_destroy(itr); - regidx_destroy(idx); -@@ -365,6 +432,9 @@ - info("Testing custom payload\n"); - test_custom_payload(); - -+ info("Testing cases encountered in past\n"); -+ test_explicit("12:2064519-2064763","12:2064488-2067434","1"); -+ - int i, ntest = 1000, nreg = 50; - srandom(seed); - info("%d randomized tests, %d regions per test. 
Random seed is %d\n", ntest,nreg,seed); ---- /dev/null -+++ python-pysam/bcftools/variantkey.h -@@ -0,0 +1,583 @@ -+// VariantKey -+// -+// variantkey.h -+// -+// @category Libraries -+// @author Nicola Asuni -+// @copyright 2017-2018 GENOMICS plc -+// @license MIT (see LICENSE) -+// @link https://github.com/genomicsplc/variantkey -+// -+// LICENSE -+// -+// Copyright (c) 2017-2018 GENOMICS plc -+// -+// Permission is hereby granted, free of charge, to any person obtaining a copy -+// of this software and associated documentation files (the "Software"), to deal -+// in the Software without restriction, including without limitation the rights -+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -+// copies of the Software, and to permit persons to whom the Software is -+// furnished to do so, subject to the following conditions: -+// -+// The above copyright notice and this permission notice shall be included in -+// all copies or substantial portions of the Software. -+// -+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -+// THE SOFTWARE. -+ -+/** -+ * @file variantkey.h -+ * @brief VariantKey main functions. -+ * -+ * The functions provided here allows to generate and process a 64 bit Unsigned Integer Keys for Human Genetic Variants. -+ * The VariantKey is sortable for chromosome and position, -+ * and it is also fully reversible for variants with up to 11 bases between Reference and Alternate alleles. -+ * It can be used to sort, search and match variant-based data easily and very quickly. 
-+ */ -+ -+#ifndef VARIANTKEY_H -+#define VARIANTKEY_H -+ -+#include -+#include -+#include -+#include "hex.h" -+ -+#define VKMASK_CHROM 0xF800000000000000 //!< VariantKey binary mask for CHROM [ 11111000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 ] -+#define VKMASK_POS 0x07FFFFFF80000000 //!< VariantKey binary mask for POS [ 00000111 11111111 11111111 11111111 10000000 00000000 00000000 00000000 ] -+#define VKMASK_CHROMPOS 0xFFFFFFFF80000000 //!< VariantKey binary mask for CHROM+POS [ 11111111 11111111 11111111 11111111 10000000 00000000 00000000 00000000 ] -+#define VKMASK_REFALT 0x000000007FFFFFFF //!< VariantKey binary mask for REF+ALT [ 00000000 00000000 00000000 00000000 01111111 11111111 11111111 11111111 ] -+#define VKSHIFT_CHROM 59 //!< CHROM LSB position from the VariantKey LSB -+#define VKSHIFT_POS 31 //!< POS LSB position from the VariantKey LSB -+ -+/** -+ * VariantKey struct. -+ * Contains the numerically encoded VariantKey components (CHROM, POS, REF+ALT). -+ */ -+typedef struct variantkey_t -+{ -+ uint8_t chrom; //!< Chromosome encoded number (only the LSB 5 bit are used) -+ uint32_t pos; //!< Reference position, with the first base having position 0 (only the LSB 28 bit are used) -+ uint32_t refalt; //!< Code for Reference and Alternate allele (only the LSB 31 bits are used) -+} variantkey_t; -+ -+/** -+ * Struct containing the minimum and maximum VariantKey values for range searches. -+ */ -+typedef struct vkrange_t -+{ -+ uint64_t min; //!< Minimum VariantKey value for any given REF+ALT encoding -+ uint64_t max; //!< Maximum VariantKey value for any given REF+ALT encoding -+} vkrange_t; -+ -+/** @brief Returns chromosome numerical encoding. -+ * -+ * @param chrom Chromosome. An identifier from the reference genome, no white-space permitted. -+ * @param size Length of the chrom string, excluding the terminating null byte. 
-+ * -+ * @return CHROM code -+ */ -+static inline uint8_t encode_chrom(const char *chrom, size_t size) -+{ -+ // X > 23 ; Y > 24 ; M > 25 -+ static const uint8_t onecharmap[] = -+ { -+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -+ /* M X Y */ -+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,25, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,23,24, 0, 0, 0, 0, 0, 0, -+ /* m x y */ -+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,25, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,23,24, 0, 0, 0, 0, 0, 0, -+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -+ }; -+ // remove "chr" prefix -+ if ((size > 3) -+ && ((chrom[0] == 'c') || (chrom[0] == 'C')) -+ && ((chrom[1] == 'h') || (chrom[1] == 'H')) -+ && ((chrom[2] == 'r') || (chrom[2] == 'R'))) -+ { -+ chrom += 3; -+ size -= 3; -+ } -+ if (size == 0) -+ { -+ return 0; -+ } -+ if ((chrom[0] <= '9') && (chrom[0] >= '0')) // Number -+ { -+ size_t i; -+ uint8_t v = (chrom[0] - '0'); -+ for (i = 1; i < size; i++) -+ { -+ if ((chrom[i] > '9') || (chrom[i] < '0')) -+ { -+ return 0; // NA -+ } -+ v = ((v * 10) + (chrom[i] - '0')); -+ } -+ return v; -+ } -+ if ((size == 1) || ((size == 2) && ((chrom[1] == 'T') || (chrom[1] == 't')))) -+ { -+ return onecharmap[((uint8_t)chrom[0])]; -+ } -+ return 0; // NA -+} -+ -+/** @brief Decode the chromosome numerical code. -+ * -+ * @param code CHROM code. -+ * @param chrom CHROM string buffer to be returned. Its size should be enough to contain the results (max 4 bytes). 
-+ * -+ * @return If successful, the total number of characters written is returned, -+ * excluding the null-character appended at the end of the string, -+ * otherwise a negative number is returned in case of failure. -+ */ -+static inline size_t decode_chrom(uint8_t code, char *chrom) -+{ -+ if ((code < 1) || (code > 25)) -+ { -+ return sprintf(chrom, "NA"); -+ } -+ if (code < 23) -+ { -+ return sprintf(chrom, "%" PRIu8, code); -+ } -+ static const char *map[] = {"X", "Y", "MT"}; -+ return sprintf(chrom, "%s", map[(code - 23)]); -+} -+ -+static inline uint32_t encode_base(const uint8_t c) -+{ -+ /* -+ Encode base: -+ A > 0 -+ C > 1 -+ G > 2 -+ T > 3 -+ */ -+ static const uint32_t map[] = -+ { -+ 4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4, -+ 4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4, -+ /*A C G T*/ -+ 4,0,4,1,4,4,4,2,4,4,4,4,4,4,4,4,4,4,4,4,3,4,4,4,4,4,4,4,4,4,4,4, -+ /*a c g t*/ -+ 4,0,4,1,4,4,4,2,4,4,4,4,4,4,4,4,4,4,4,4,3,4,4,4,4,4,4,4,4,4,4,4, -+ 4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4, -+ 4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4, -+ 4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4, -+ 4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4, -+ }; -+ return map[c]; -+} -+ -+static inline int encode_allele(uint32_t *h, uint8_t *bitpos, const char *str, size_t size) -+{ -+ uint32_t v; -+ while (size--) -+ { -+ v = encode_base(*str++); -+ if (v > 3) -+ { -+ return -1; -+ } -+ *bitpos -= 2; -+ *h |= (v << *bitpos); -+ } -+ return 0; -+} -+ -+static inline uint32_t encode_refalt_rev(const char *ref, size_t sizeref, const char *alt, size_t sizealt) -+{ -+ //[******** ******** ******** ******** *RRRRAAA A1122334 45566778 8990011*] -+ uint32_t h = 0; -+ h |= ((uint32_t)(sizeref) << 27); // RRRR: length of (REF - 1) -+ h |= ((uint32_t)(sizealt) << 23); // AAAA: length of (ALT - 1) -+ uint8_t bitpos = 23; -+ if ((encode_allele(&h, &bitpos, ref, sizeref) < 0) 
|| (encode_allele(&h, &bitpos, alt, sizealt) < 0)) -+ { -+ return 0; // error code -+ } -+ return h; -+} -+ -+// Mix two 32 bit hash numbers using a MurmurHash3-like algorithm -+static inline uint32_t muxhash(uint32_t k, uint32_t h) -+{ -+ k *= 0xcc9e2d51; -+ k = (k >> 17) | (k << 15); -+ k *= 0x1b873593; -+ h ^= k; -+ h = (h >> 19) | (h << 13); -+ return ((h * 5) + 0xe6546b64); -+} -+ -+static inline uint32_t encode_packchar(int c) -+{ -+ if (c < 'A') -+ { -+ return 27; -+ } -+ if (c >= 'a') -+ { -+ return (uint32_t)(c - 'a' + 1); -+ } -+ return (uint32_t)(c - 'A' + 1); -+} -+ -+// pack blocks of 6 characters in 32 bit (6 x 5 bit + 2 spare bit) [ 01111122 22233333 44444555 55666660 ] -+static inline uint32_t pack_chars_tail(const char *str, size_t size) -+{ -+ uint32_t h = 0; -+ const char *pos = (str + size - 1); -+ switch (size) -+ { -+ case 5: -+ h ^= encode_packchar(*pos--) << (1 + (5 * 1)); -+ // fall through -+ case 4: -+ h ^= encode_packchar(*pos--) << (1 + (5 * 2)); -+ // fall through -+ case 3: -+ h ^= encode_packchar(*pos--) << (1 + (5 * 3)); -+ // fall through -+ case 2: -+ h ^= encode_packchar(*pos--) << (1 + (5 * 4)); -+ // fall through -+ case 1: -+ h ^= encode_packchar(*pos) << (1 + (5 * 5)); -+ } -+ return h; -+} -+ -+static inline uint32_t pack_chars(const char *str) -+{ -+ const char *pos = (str + 5); -+ return ((encode_packchar(*pos) << 1) -+ ^ (encode_packchar(*(pos-1)) << (1 + (5 * 1))) -+ ^ (encode_packchar(*(pos-2)) << (1 + (5 * 2))) -+ ^ (encode_packchar(*(pos-3)) << (1 + (5 * 3))) -+ ^ (encode_packchar(*(pos-4)) << (1 + (5 * 4))) -+ ^ (encode_packchar(*(pos-5)) << (1 + (5 * 5)))); -+} -+ -+// Return a 32 bit hash of a nucleotide string -+static inline uint32_t hash32(const char *str, size_t size) -+{ -+ uint32_t h = 0; -+ size_t len = 6; -+ while (size >= len) -+ { -+ h = muxhash(pack_chars(str), h); -+ str += len; -+ size -= len; -+ } -+ if (size > 0) -+ { -+ h = muxhash(pack_chars_tail(str, size), h); -+ } -+ return h; -+} -+ -+static 
inline uint32_t encode_refalt_hash(const char *ref, size_t sizeref, const char *alt, size_t sizealt) -+{ -+ // 0x3 is the separator character between REF and ALT [00000000 00000000 00000000 00000011] -+ uint32_t h = muxhash(hash32(alt, sizealt), muxhash(0x3, hash32(ref, sizeref))); -+ // MurmurHash3 finalization mix - force all bits of a hash block to avalanche -+ h ^= h >> 16; -+ h *= 0x85ebca6b; -+ h ^= h >> 13; -+ h *= 0xc2b2ae35; -+ h ^= h >> 16; -+ return ((h >> 1) | 0x1); // 0x1 is the set bit to indicate HASH mode [00000000 00000000 00000000 00000001] -+} -+ -+/** @brief Returns reference+alternate numerical encoding. -+ * -+ * @param ref Reference allele. String containing a sequence of nucleotide letters. -+ * The value in the pos field refers to the position of the first nucleotide in the String. -+ * Characters must be A-Z, a-z or * -+ * @param sizeref Length of the ref string, excluding the terminating null byte. -+ * @param alt Alternate non-reference allele string. -+ * Characters must be A-Z, a-z or * -+ * @param sizealt Length of the alt string, excluding the terminating null byte. 
-+ * -+ * @return REF+ALT code -+ */ -+static inline uint32_t encode_refalt(const char *ref, size_t sizeref, const char *alt, size_t sizealt) -+{ -+ if ((sizeref + sizealt) <= 11) -+ { -+ uint32_t h = encode_refalt_rev(ref, sizeref, alt, sizealt); -+ if (h != 0) -+ { -+ return h; -+ } -+ } -+ return encode_refalt_hash(ref, sizeref, alt, sizealt); -+} -+ -+static inline char decode_base(uint32_t code, int bitpos) -+{ -+ static const char base[4] = {'A', 'C', 'G', 'T'}; -+ return base[((code >> bitpos) & 0x3)]; // 0x3 is the 2 bit mask [00000011] -+} -+ -+static inline size_t decode_refalt_rev(uint32_t code, char *ref, size_t *sizeref, char *alt, size_t *sizealt) -+{ -+ *sizeref = (size_t)((code & 0x78000000) >> 27); // [01111000 00000000 00000000 00000000] -+ *sizealt = (size_t)((code & 0x07800000) >> 23); // [00000111 10000000 00000000 00000000] -+ switch (*sizeref) -+ { -+ case 10: -+ ref[9] = decode_base(code, (3 + (2 * 0))); -+ // fall through -+ case 9: -+ ref[8] = decode_base(code, (3 + (2 * 1))); -+ // fall through -+ case 8: -+ ref[7] = decode_base(code, (3 + (2 * 2))); -+ // fall through -+ case 7: -+ ref[6] = decode_base(code, (3 + (2 * 3))); -+ // fall through -+ case 6: -+ ref[5] = decode_base(code, (3 + (2 * 4))); -+ // fall through -+ case 5: -+ ref[4] = decode_base(code, (3 + (2 * 5))); -+ // fall through -+ case 4: -+ ref[3] = decode_base(code, (3 + (2 * 6))); -+ // fall through -+ case 3: -+ ref[2] = decode_base(code, (3 + (2 * 7))); -+ // fall through -+ case 2: -+ ref[1] = decode_base(code, (3 + (2 * 8))); -+ // fall through -+ case 1: -+ ref[0] = decode_base(code, (3 + (2 * 9))); -+ } -+ ref[*sizeref] = 0; -+ uint8_t bitpos = (23 - ((*sizeref) << 1)); -+ switch (*sizealt) -+ { -+ case 10: -+ alt[9] = decode_base(code, bitpos - (2 * 10)); -+ // fall through -+ case 9: -+ alt[8] = decode_base(code, bitpos - (2 * 9)); -+ // fall through -+ case 8: -+ alt[7] = decode_base(code, bitpos - (2 * 8)); -+ // fall through -+ case 7: -+ alt[6] = 
decode_base(code, bitpos - (2 * 7)); -+ // fall through -+ case 6: -+ alt[5] = decode_base(code, bitpos - (2 * 6)); -+ // fall through -+ case 5: -+ alt[4] = decode_base(code, bitpos - (2 * 5)); -+ // fall through -+ case 4: -+ alt[3] = decode_base(code, bitpos - (2 * 4)); -+ // fall through -+ case 3: -+ alt[2] = decode_base(code, bitpos - (2 * 3)); -+ // fall through -+ case 2: -+ alt[1] = decode_base(code, bitpos - (2 * 2)); -+ // fall through -+ case 1: -+ alt[0] = decode_base(code, bitpos - (2 * 1)); -+ } -+ alt[*sizealt] = 0; -+ return (*sizeref + *sizealt); -+} -+ -+/** @brief Decode the 32 bit REF+ALT code if reversible (if it has 11 or less bases in total and only contains ACGT letters). -+ * -+ * @param code REF+ALT code -+ * @param ref REF string buffer to be returned. -+ * @param sizeref Pointer to the size of the ref buffer, excluding the terminating null byte. -+ * This will contain the final ref size. -+ * @param alt ALT string buffer to be returned. -+ * @param sizealt Pointer to the size of the alt buffer, excluding the terminating null byte. -+ * This will contain the final alt size. -+ * -+ * @return If the code is reversible, then the total number of characters of REF+ALT is returned. -+ * Otherwise 0 is returned. -+ */ -+static inline size_t decode_refalt(uint32_t code, char *ref, size_t *sizeref, char *alt, size_t *sizealt) -+{ -+ if (code & 0x1) // check last bit -+ { -+ return 0; // non-reversible encoding -+ } -+ return decode_refalt_rev(code, ref, sizeref, alt, sizealt); -+} -+ -+/** @brief Returns a 64 bit variant key based on the pre-encoded CHROM, POS (0-based) and REF+ALT. -+ * -+ * @param chrom Encoded Chromosome (see encode_chrom). -+ * @param pos Position. The reference position, with the first base having position 0. -+ * @param refalt Encoded Reference + Alternate (see encode_refalt). -+ * -+ * @return VariantKey 64 bit code. 
-+ */ -+static inline uint64_t encode_variantkey(uint8_t chrom, uint32_t pos, uint32_t refalt) -+{ -+ return (((uint64_t)chrom << VKSHIFT_CHROM) | ((uint64_t)pos << VKSHIFT_POS) | (uint64_t)refalt); -+} -+ -+/** @brief Extract the CHROM code from VariantKey. -+ * -+ * @param vk VariantKey code. -+ * -+ * @return CHROM code. -+ */ -+static inline uint8_t extract_variantkey_chrom(uint64_t vk) -+{ -+ return (uint8_t)((vk & VKMASK_CHROM) >> VKSHIFT_CHROM); -+} -+ -+/** @brief Extract the POS code from VariantKey. -+ * -+ * @param vk VariantKey code. -+ * -+ * @return POS. -+ */ -+static inline uint32_t extract_variantkey_pos(uint64_t vk) -+{ -+ return (uint32_t)((vk & VKMASK_POS) >> VKSHIFT_POS); -+} -+ -+/** @brief Extract the REF+ALT code from VariantKey. -+ * -+ * @param vk VariantKey code. -+ * -+ * @return REF+ALT code. -+ */ -+static inline uint32_t extract_variantkey_refalt(uint64_t vk) -+{ -+ return (uint32_t)(vk & VKMASK_REFALT); -+} -+ -+/** @brief Decode a VariantKey code and returns the components as variantkey_t structure. -+ * -+ * @param code VariantKey code. -+ * @param vk Decoded variantkey structure. -+ */ -+static inline void decode_variantkey(uint64_t code, variantkey_t *vk) -+{ -+ vk->chrom = extract_variantkey_chrom(code); -+ vk->pos = extract_variantkey_pos(code); -+ vk->refalt = extract_variantkey_refalt(code); -+} -+ -+/** @brief Returns a 64 bit variant key based on CHROM, POS (0-based), REF, ALT. -+ * -+ * @param chrom Chromosome. An identifier from the reference genome, no white-space or leading zeros permitted. -+ * @param sizechrom Length of the chrom string, excluding the terminating null byte. -+ * @param pos Position. The reference position, with the first base having position 0. -+ * @param ref Reference allele. String containing a sequence of nucleotide letters. -+ * The value in the pos field refers to the position of the first nucleotide in the String. 
-+ * Characters must be A-Z, a-z or * -+ * @param sizeref Length of the ref string, excluding the terminating null byte. -+ * @param alt Alternate non-reference allele string. -+ * Characters must be A-Z, a-z or * -+ * @param sizealt Length of the alt string, excluding the terminating null byte. -+ * -+ * @return VariantKey 64 bit code. -+ */ -+static inline uint64_t variantkey(const char *chrom, size_t sizechrom, uint32_t pos, const char *ref, size_t sizeref, const char *alt, size_t sizealt) -+{ -+ return encode_variantkey(encode_chrom(chrom, sizechrom), pos, encode_refalt(ref, sizeref, alt, sizealt)); -+} -+ -+/** @brief Returns minimum and maximum VariantKeys for range searches. -+ * -+ * @param chrom Chromosome encoded number. -+ * @param pos_min Start reference position, with the first base having position 0. -+ * @param pos_max End reference position, with the first base having position 0. -+ * @param range VariantKey range values. -+ */ -+static inline void variantkey_range(uint8_t chrom, uint32_t pos_min, uint32_t pos_max, vkrange_t *range) -+{ -+ uint64_t c = ((uint64_t)chrom << VKSHIFT_CHROM); -+ range->min = (c | ((uint64_t)pos_min << VKSHIFT_POS)); -+ range->max = (c | ((uint64_t)pos_max << VKSHIFT_POS) | VKMASK_REFALT); -+} -+ -+static inline int8_t compare_uint64_t(uint64_t a, uint64_t b) -+{ -+ return (a < b) ? -1 : (a > b); -+} -+ -+/** @brief Compares two VariantKeys by chromosome only. -+ * -+ * @param vka The first VariantKey to be compared. -+ * @param vkb The second VariantKey to be compared. -+ * -+ * @return -1 if the first chromosome is smaller than the second, 0 if they are equal and 1 if the first is greater than the second. -+ */ -+static inline int8_t compare_variantkey_chrom(uint64_t vka, uint64_t vkb) -+{ -+ return compare_uint64_t((vka >> VKSHIFT_CHROM), (vkb >> VKSHIFT_CHROM)); -+} -+ -+/** @brief Compares two VariantKeys by chromosome and position. -+ * -+ * @param vka The first VariantKey to be compared. 
-+ * @param vkb The second VariantKey to be compared. -+ * -+ * @return -1 if the first CHROM+POS is smaller than the second, 0 if they are equal and 1 if the first is greater than the second. -+ */ -+static inline int8_t compare_variantkey_chrom_pos(uint64_t vka, uint64_t vkb) -+{ -+ return compare_uint64_t((vka >> VKSHIFT_POS), (vkb >> VKSHIFT_POS)); -+} -+ -+/** @brief Returns VariantKey hexadecimal string (16 characters). -+ * -+ * The string represent a 64 bit number or: -+ * - 5 bit for CHROM -+ * - 28 bit for POS -+ * - 31 bit for REF+ALT -+ * -+ * @param vk VariantKey code. -+ * @param str String buffer to be returned (it must be sized 17 bytes at least). -+ * -+ * @return Upon successful return, these function returns the number of characters processed -+ * (excluding the null byte used to end output to strings). -+ * If the buffer size is not sufficient, then the return value is the number of characters required for -+ * buffer string, including the terminating null byte. -+ */ -+static inline size_t variantkey_hex(uint64_t vk, char *str) -+{ -+ return hex_uint64_t(vk, str); -+} -+ -+/** @brief Parses a VariantKey hexadecimal string and returns the code. -+ * -+ * @param vs VariantKey hexadecimal string (it must contain 16 hexadecimal characters). -+ * -+ * @return A VariantKey code. -+ */ -+static inline uint64_t parse_variantkey_hex(const char *vs) -+{ -+ return parse_hex_uint64_t(vs); -+} -+ -+#endif // VARIANTKEY_H ---- python-pysam.orig/bcftools/vcfannotate.c -+++ python-pysam/bcftools/vcfannotate.c -@@ -1,6 +1,6 @@ - /* vcfannotate.c -- Annotate and edit VCF/BCF files. - -- Copyright (C) 2013-2018 Genome Research Ltd. -+ Copyright (C) 2013-2019 Genome Research Ltd. 
- - Author: Petr Danecek - -@@ -33,16 +33,17 @@ - #include - #include - #include -+#include - #include - #include - #include - #include --#include - #include "bcftools.h" - #include "vcmp.h" - #include "filter.h" - #include "convert.h" - #include "smpl_ilist.h" -+#include "regidx.h" - - struct _args_t; - -@@ -65,15 +66,30 @@ - } - annot_line_t; - --#define REPLACE_MISSING 0 // replace only missing values --#define REPLACE_ALL 1 // replace both missing and existing values --#define REPLACE_NON_MISSING 2 // replace only if tgt is not missing --#define SET_OR_APPEND 3 // set new value if missing or non-existent, append otherwise -+#define REPLACE_MISSING 0 // replace only missing values -+#define REPLACE_ALL 1 // replace both missing and existing values -+#define REPLACE_NON_MISSING 2 // replace only if tgt is not missing -+#define SET_OR_APPEND 3 // set new value if missing or non-existent, append otherwise -+#define MM_FIRST 0 // if multiple annotation lines overlap a VCF record, use the first, discarding the rest -+#define MM_APPEND 1 // append, possibly multiple times -+#define MM_UNIQUE 2 // append, only unique values -+#define MM_SUM 3 -+#define MM_AVG 4 -+#define MM_MIN 5 -+#define MM_MAX 6 - typedef struct _annot_col_t - { - int icol, replace, number; // number: one of BCF_VL_* types - char *hdr_key_src, *hdr_key_dst; - int (*setter)(struct _args_t *, bcf1_t *, struct _annot_col_t *, void*); -+ int merge_method; // one of the MM_* defines -+ khash_t(str2int) *mm_str_hash; // lookup table to ensure uniqueness of added string values -+ kstring_t mm_kstr; -+ double -+ mm_dbl_nalloc, // the allocated size --merge-logic values array -+ mm_dbl_nused, // the number of used elements in the mm_dbl array -+ mm_dbl_ndat, // the number of merged rows (for calculating the average) -+ *mm_dbl; - } - annot_col_t; - -@@ -92,6 +108,10 @@ - int output_type, n_threads; - bcf_sr_regions_t *tgts; - -+ regidx_t *tgt_idx; -+ regitr_t *tgt_itr; -+ int tgt_is_bed; -+ - filter_t 
*filter; - char *filter_str; - int filter_logic; // include or exclude sites which match the filters? One of FLT_INCLUDE/FLT_EXCLUDE -@@ -104,7 +124,7 @@ - vcmp_t *vcmp; // for matching annotation and VCF lines by allele - annot_line_t *alines; // buffered annotation lines - int nalines, malines; -- int ref_idx, alt_idx, chr_idx, from_idx, to_idx; // -1 if not present -+ int ref_idx, alt_idx, chr_idx, beg_idx, end_idx; // -1 if not present - annot_col_t *cols; // column indexes and setters - int ncols; - -@@ -125,18 +145,40 @@ - - char **argv, *output_fname, *targets_fname, *regions_list, *header_fname; - char *remove_annots, *columns, *rename_chrs, *sample_names, *mark_sites; -- int argc, drop_header, record_cmd_line, tgts_is_vcf, mark_sites_logic; -+ char *merge_method_str; -+ int argc, drop_header, record_cmd_line, tgts_is_vcf, mark_sites_logic, force, single_overlaps; - } - args_t; - - char *msprintf(const char *fmt, ...); - -+int parse_with_payload(const char *line, char **chr_beg, char **chr_end, uint32_t *beg, uint32_t *end, void *payload, void *usr) -+{ -+ args_t *args = (args_t*) usr; -+ int ret = args->tgt_is_bed ? 
regidx_parse_bed(line, chr_beg, chr_end, beg, end, NULL, NULL) : regidx_parse_tab(line, chr_beg, chr_end, beg, end, NULL, NULL); -+ if ( ret<0 ) return ret; -+ *((char **)payload) = strdup(line); -+ return 0; -+} -+void free_payload(void *payload) -+{ -+ char *str = *((char**)payload); -+ free(str); -+} -+ - void remove_id(args_t *args, bcf1_t *line, rm_tag_t *tag) - { - bcf_update_id(args->hdr,line,NULL); - } - void remove_filter(args_t *args, bcf1_t *line, rm_tag_t *tag) - { -+ if ( tag->key && tag->hdr_id<0 ) -+ { -+ error("Error: Cannot proceed, not even with the --force option, bad things could happen.\n" -+ " Note that \"bcftools annotate -x FILTER\" can be used to remove ALL filters.\n" -+ " Even better, use \"bcftools view -h\" and \"bcftools reheader\" to fix the header!\n" -+ ); -+ } - if ( !tag->key ) bcf_update_filter(args->hdr, line, NULL, args->flt_keep_pass); - else bcf_remove_filter(args->hdr, line, tag->hdr_id, args->flt_keep_pass); - } -@@ -223,7 +265,10 @@ - memmove(&hdr->hrec[i],&hdr->hrec[i+1],(hdr->nhrec-i)*sizeof(bcf_hrec_t*)); - bcf_hrec_destroy(hrec); - } -- if ( nrm ) bcf_hdr_sync(hdr); -+ if ( nrm ) { -+ if (bcf_hdr_sync(hdr) < 0) -+ error_errno("[%s] Failed to update header", __func__); -+ } - } - - static void init_remove_annots(args_t *args) -@@ -264,8 +309,14 @@ - tag->handler = remove_filter; - tag->key = strdup(str.s); - tag->hdr_id = bcf_hdr_id2int(args->hdr, BCF_DT_ID, tag->key); -- if ( !bcf_hdr_idinfo_exists(args->hdr,BCF_HL_FLT,tag->hdr_id) ) error("Cannot remove %s, not defined in the header.\n", str.s); -- if ( !args->keep_sites ) bcf_hdr_remove(args->hdr_out,BCF_HL_FLT,tag->key); -+ if ( !bcf_hdr_idinfo_exists(args->hdr,BCF_HL_FLT,tag->hdr_id) ) -+ { -+ if ( args->keep_sites ) -+ error("Error: The filter \"%s\" is not defined in the header, cannot use the -k option\n", str.s); -+ else -+ fprintf(stderr,"Warning: The filter \"%s\" is not defined in the header\n", str.s); -+ } -+ else if ( !args->keep_sites ) 
bcf_hdr_remove(args->hdr_out,BCF_HL_FLT,tag->key); - } - else - { -@@ -280,8 +331,14 @@ - int id = bcf_hdr_id2int(args->hdr,BCF_DT_ID,str.s); - if ( !bcf_hdr_idinfo_exists(args->hdr,type,id) ) - { -- fprintf(stderr,"Warning: The tag \"%s\" not defined in the header\n", str.s); -- args->nrm--; -+ if ( args->keep_sites ) -+ error("Error: The tag \"%s\" is not defined in the header, cannot use the -k option\n", str.s); -+ else -+ fprintf(stderr,"Warning: The tag \"%s\" not defined in the header\n", str.s); -+ -+ tag->key = strdup(str.s); -+ if ( type==BCF_HL_INFO ) tag->handler = remove_info_tag; -+ else if ( type==BCF_HL_FMT ) tag->handler = remove_format_tag; - } - else if ( (type==BCF_HL_FMT && keep_fmt) || (type==BCF_HL_INFO && keep_info) ) - { -@@ -364,7 +421,8 @@ - } - khash_str2int_destroy_free(keep); - if ( !args->nrm ) error("No matching tag in -x %s\n", args->remove_annots); -- bcf_hdr_sync(args->hdr_out); -+ if (bcf_hdr_sync(args->hdr_out) < 0) -+ error_errno("[%s] Failed to update header", __func__); - } - static void init_header_lines(args_t *args) - { -@@ -376,13 +434,17 @@ - if ( bcf_hdr_append(args->hdr_out,str.s) ) error("Could not parse %s: %s\n", args->header_fname, str.s); - bcf_hdr_append(args->hdr,str.s); // the input file may not have the header line if run with -h (and nothing else) - } -- hts_close(file); -+ if ( hts_close(file)!=0 ) error("[%s] Error: close failed .. 
%s\n", __func__,args->header_fname); - free(str.s); -- bcf_hdr_sync(args->hdr_out); -- bcf_hdr_sync(args->hdr); -+ if (bcf_hdr_sync(args->hdr_out) < 0) -+ error_errno("[%s] Failed to update output header", __func__); -+ if (bcf_hdr_sync(args->hdr) < 0) -+ error_errno("[%s] Failed to update input header", __func__); - } - static int setter_filter(args_t *args, bcf1_t *line, annot_col_t *col, void *data) - { -+ if ( !data ) error("Error: the --merge-logic option cannot be used with FILTER (yet?)\n"); -+ - // note: so far this works only with one filter, not a list of filters - annot_line_t *tab = (annot_line_t*) data; - if ( tab->cols[col->icol] && tab->cols[col->icol][0]=='.' && !tab->cols[col->icol][1] ) return 0; // don't replace with "." -@@ -432,6 +494,8 @@ - } - static int setter_id(args_t *args, bcf1_t *line, annot_col_t *col, void *data) - { -+ if ( !data ) error("Error: the --merge-logic option cannot be used with ID (yet?)\n"); -+ - // possible cases: - // IN ANNOT OUT ACHIEVED_BY - // x y x -c +ID -@@ -493,6 +557,8 @@ - } - static int setter_qual(args_t *args, bcf1_t *line, annot_col_t *col, void *data) - { -+ if ( !data ) error("Error: the --merge-logic option cannot be used with QUAL (yet?)\n"); -+ - annot_line_t *tab = (annot_line_t*) data; - char *str = tab->cols[col->icol]; - if ( str[0]=='.' && str[1]==0 ) return 0; // empty -@@ -501,7 +567,7 @@ - - line->qual = strtod(str, &str); - if ( str == tab->cols[col->icol] ) -- error("Could not parse %s at %s:%d .. [%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1,tab->cols[col->icol]); -+ error("Could not parse %s at %s:%"PRId64" .. 
[%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1,tab->cols[col->icol]); - return 0; - } - static int vcf_setter_qual(args_t *args, bcf1_t *line, annot_col_t *col, void *data) -@@ -514,13 +580,15 @@ - } - static int setter_info_flag(args_t *args, bcf1_t *line, annot_col_t *col, void *data) - { -+ if ( !data ) error("Error: the --merge-logic option cannot be used with INFO type=Flag (yet?)\n"); -+ - annot_line_t *tab = (annot_line_t*) data; - char *str = tab->cols[col->icol]; - if ( str[0]=='.' && str[1]==0 ) return 0; - - if ( str[0]=='1' && str[1]==0 ) return bcf_update_info_flag(args->hdr_out,line,col->hdr_key_dst,NULL,1); - if ( str[0]=='0' && str[1]==0 ) return bcf_update_info_flag(args->hdr_out,line,col->hdr_key_dst,NULL,0); -- error("Could not parse %s at %s:%d .. [%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1,tab->cols[col->icol]); -+ error("Could not parse %s at %s:%"PRId64" .. [%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1,tab->cols[col->icol]); - return -1; - } - static int vcf_setter_info_flag(args_t *args, bcf1_t *line, annot_col_t *col, void *data) -@@ -533,13 +601,13 @@ - static int setter_ARinfo_int32(args_t *args, bcf1_t *line, annot_col_t *col, int nals, char **als, int ntmpi) - { - if ( col->number==BCF_VL_A && ntmpi!=nals-1 && (ntmpi!=1 || args->tmpi[0]!=bcf_int32_missing || args->tmpi[1]!=bcf_int32_vector_end) ) -- error("Incorrect number of values (%d) for the %s tag at %s:%d\n", ntmpi,col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1); -+ error("Incorrect number of values (%d) for the %s tag at %s:%"PRId64"\n", ntmpi,col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); - else if ( col->number==BCF_VL_R && ntmpi!=nals && (ntmpi!=1 || args->tmpi[0]!=bcf_int32_missing || args->tmpi[1]!=bcf_int32_vector_end) ) -- error("Incorrect number of values (%d) for the %s tag at %s:%d\n", ntmpi,col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1); -+ 
error("Incorrect number of values (%d) for the %s tag at %s:%"PRId64"\n", ntmpi,col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); - - int ndst = col->number==BCF_VL_A ? line->n_allele - 1 : line->n_allele; - int *map = vcmp_map_ARvalues(args->vcmp,ndst,nals,als,line->n_allele,line->d.allele); -- if ( !map ) error("REF alleles not compatible at %s:%d\n", bcf_seqname(args->hdr,line),line->pos+1); -+ if ( !map ) error("REF alleles not compatible at %s:%"PRId64"\n", bcf_seqname(args->hdr,line),(int64_t) line->pos+1); - - // fill in any missing values in the target VCF (or all, if not present) - int ntmpi2 = bcf_get_info_float(args->hdr, line, col->hdr_key_dst, &args->tmpi2, &args->mtmpi2); -@@ -565,19 +633,75 @@ - static int setter_info_int(args_t *args, bcf1_t *line, annot_col_t *col, void *data) - { - annot_line_t *tab = (annot_line_t*) data; -- char *str = tab->cols[col->icol], *end = str; -- if ( str[0]=='.' && str[1]==0 ) return 0; - -- int ntmpi = 0; -- while ( *end ) -+ if ( !tab ) -+ { -+ if ( col->merge_method!=MM_SUM && col->merge_method!=MM_AVG && col->merge_method!=MM_MIN && col->merge_method!=MM_MAX && col->merge_method!=MM_APPEND ) -+ error("Error: at the moment only the sum,avg,min,max,append options are supported with --merge-logic for INFO type=Integer\n"); -+ } -+ -+ int i,ntmpi = 0; -+ if ( tab ) -+ { -+ char *str = tab->cols[col->icol], *end = str; -+ if ( str[0]=='.' && str[1]==0 ) return 0; -+ -+ while ( *end ) -+ { -+ int val = strtol(str, &end, 10); -+ if ( end==str ) -+ error("Could not parse %s at %s:%"PRId64" .. 
[%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1,tab->cols[col->icol]); -+ ntmpi++; -+ hts_expand(int32_t,ntmpi,args->mtmpi,args->tmpi); -+ args->tmpi[ntmpi-1] = val; -+ str = end+1; -+ } -+ if ( col->merge_method!=MM_FIRST ) -+ { -+ if ( !col->mm_dbl_nused ) -+ { -+ col->mm_dbl_nused = ntmpi; -+ hts_expand(double,col->mm_dbl_nused,col->mm_dbl_nalloc,col->mm_dbl); -+ for (i=0; imm_dbl[i] = args->tmpi[i]; -+ } -+ else -+ { -+ if ( col->merge_method==MM_APPEND ) -+ { -+ int nori = col->mm_dbl_nused; -+ col->mm_dbl_nused += ntmpi; -+ hts_expand(double,col->mm_dbl_nused,col->mm_dbl_nalloc,col->mm_dbl); -+ for (i=0; imm_dbl[i+nori] = args->tmpi[i]; -+ } -+ else -+ { -+ if ( ntmpi!=col->mm_dbl_nused ) error("Error: cannot merge fields of unequal length\n"); -+ if ( col->merge_method==MM_SUM || col->merge_method==MM_AVG ) -+ for (i=0; imm_dbl[i] += args->tmpi[i]; -+ else if ( col->merge_method==MM_MIN ) -+ for (i=0; imm_dbl[i] > args->tmpi[i] ) col->mm_dbl[i] = args->tmpi[i]; } -+ else if ( col->merge_method==MM_MAX ) -+ for (i=0; imm_dbl[i] < args->tmpi[i] ) col->mm_dbl[i] = args->tmpi[i]; } -+ } -+ } -+ col->mm_dbl_ndat++; -+ } -+ } -+ else if ( col->merge_method==MM_SUM || col->merge_method==MM_MIN || col->merge_method==MM_MAX || col->merge_method==MM_APPEND ) -+ { -+ ntmpi = col->mm_dbl_nused; -+ hts_expand(int32_t,ntmpi,args->mtmpi,args->tmpi); -+ for (i=0; itmpi[i] = col->mm_dbl[i]; -+ col->mm_dbl_nused = col->mm_dbl_ndat = 0; -+ } -+ else if ( col->merge_method==MM_AVG ) - { -- int val = strtol(str, &end, 10); -- if ( end==str ) -- error("Could not parse %s at %s:%d .. 
[%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1,tab->cols[col->icol]); -- ntmpi++; -+ ntmpi = col->mm_dbl_nused; - hts_expand(int32_t,ntmpi,args->mtmpi,args->tmpi); -- args->tmpi[ntmpi-1] = val; -- str = end+1; -+ for (i=0; itmpi[i] = col->mm_dbl[i]/col->mm_dbl_ndat; -+ col->mm_dbl_nused = col->mm_dbl_ndat = 0; - } - - if ( col->number==BCF_VL_A || col->number==BCF_VL_R ) -@@ -613,13 +737,13 @@ - static int setter_ARinfo_real(args_t *args, bcf1_t *line, annot_col_t *col, int nals, char **als, int ntmpf) - { - if ( col->number==BCF_VL_A && ntmpf!=nals-1 && (ntmpf!=1 || !bcf_float_is_missing(args->tmpf[0]) || !bcf_float_is_vector_end(args->tmpf[0])) ) -- error("Incorrect number of values (%d) for the %s tag at %s:%d\n", ntmpf,col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1); -+ error("Incorrect number of values (%d) for the %s tag at %s:%"PRId64"\n", ntmpf,col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); - else if ( col->number==BCF_VL_R && ntmpf!=nals && (ntmpf!=1 || !bcf_float_is_missing(args->tmpf[0]) || !bcf_float_is_vector_end(args->tmpf[0])) ) -- error("Incorrect number of values (%d) for the %s tag at %s:%d\n", ntmpf,col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1); -+ error("Incorrect number of values (%d) for the %s tag at %s:%"PRId64"\n", ntmpf,col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); - - int ndst = col->number==BCF_VL_A ? 
line->n_allele - 1 : line->n_allele; - int *map = vcmp_map_ARvalues(args->vcmp,ndst,nals,als,line->n_allele,line->d.allele); -- if ( !map ) error("REF alleles not compatible at %s:%d\n", bcf_seqname(args->hdr,line),line->pos+1); -+ if ( !map ) error("REF alleles not compatible at %s:%"PRId64"\n", bcf_seqname(args->hdr,line),(int64_t) line->pos+1); - - // fill in any missing values in the target VCF (or all, if not present) - int ntmpf2 = bcf_get_info_float(args->hdr, line, col->hdr_key_dst, &args->tmpf2, &args->mtmpf2); -@@ -645,19 +769,75 @@ - static int setter_info_real(args_t *args, bcf1_t *line, annot_col_t *col, void *data) - { - annot_line_t *tab = (annot_line_t*) data; -- char *str = tab->cols[col->icol], *end = str; -- if ( str[0]=='.' && str[1]==0 ) return 0; - -- int ntmpf = 0; -- while ( *end ) -+ if ( !tab ) - { -- double val = strtod(str, &end); -- if ( end==str ) -- error("Could not parse %s at %s:%d .. [%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1,tab->cols[col->icol]); -- ntmpf++; -- hts_expand(float,ntmpf,args->mtmpf,args->tmpf); -- args->tmpf[ntmpf-1] = val; -- str = end+1; -+ if ( col->merge_method!=MM_SUM && col->merge_method!=MM_AVG && col->merge_method!=MM_MIN && col->merge_method!=MM_MAX && col->merge_method!=MM_APPEND ) -+ error("Error: at the moment only the sum,avg,min,max,append options are supported with --merge-logic for INFO type=Float\n"); -+ } -+ -+ int i,ntmpf = 0; -+ if ( tab ) -+ { -+ char *str = tab->cols[col->icol], *end = str; -+ if ( str[0]=='.' && str[1]==0 ) return 0; -+ -+ while ( *end ) -+ { -+ double val = strtod(str, &end); -+ if ( end==str ) -+ error("Could not parse %s at %s:%"PRId64" .. 
[%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1,tab->cols[col->icol]); -+ ntmpf++; -+ hts_expand(float,ntmpf,args->mtmpf,args->tmpf); -+ args->tmpf[ntmpf-1] = val; -+ str = end+1; -+ } -+ if ( col->merge_method!=MM_FIRST ) -+ { -+ if ( !col->mm_dbl_nused ) -+ { -+ col->mm_dbl_nused = ntmpf; -+ hts_expand(double,col->mm_dbl_nused,col->mm_dbl_nalloc,col->mm_dbl); -+ for (i=0; imm_dbl[i] = args->tmpf[i]; -+ } -+ else -+ { -+ if ( col->merge_method==MM_APPEND ) -+ { -+ int nori = col->mm_dbl_nused; -+ col->mm_dbl_nused += ntmpf; -+ hts_expand(double,col->mm_dbl_nused,col->mm_dbl_nalloc,col->mm_dbl); -+ for (i=0; imm_dbl[i+nori] = args->tmpf[i]; -+ } -+ else -+ { -+ if ( ntmpf!=col->mm_dbl_nused ) error("Error: cannot merge fields of unequal length\n"); -+ if ( col->merge_method==MM_SUM || col->merge_method==MM_AVG ) -+ for (i=0; imm_dbl[i] += args->tmpf[i]; -+ else if ( col->merge_method==MM_MIN ) -+ for (i=0; imm_dbl[i] > args->tmpf[i] ) col->mm_dbl[i] = args->tmpf[i]; } -+ else if ( col->merge_method==MM_MAX ) -+ for (i=0; imm_dbl[i] < args->tmpf[i] ) col->mm_dbl[i] = args->tmpf[i]; } -+ } -+ } -+ col->mm_dbl_ndat++; -+ } -+ } -+ else if ( col->merge_method==MM_SUM || col->merge_method==MM_MIN || col->merge_method==MM_MAX || col->merge_method==MM_APPEND ) -+ { -+ ntmpf = col->mm_dbl_nused; -+ hts_expand(int32_t,ntmpf,args->mtmpf,args->tmpf); -+ for (i=0; itmpf[i] = col->mm_dbl[i]; -+ col->mm_dbl_nused = col->mm_dbl_ndat = 0; -+ } -+ else if ( col->merge_method==MM_AVG ) -+ { -+ ntmpf = col->mm_dbl_nused; -+ hts_expand(int32_t,ntmpf,args->mtmpf,args->tmpf); -+ for (i=0; itmpf[i] = col->mm_dbl[i]/col->mm_dbl_ndat; -+ col->mm_dbl_nused = col->mm_dbl_ndat = 0; - } - - if ( col->number==BCF_VL_A || col->number==BCF_VL_R ) -@@ -693,6 +873,8 @@ - int copy_string_field(char *src, int isrc, int src_len, kstring_t *dst, int idst); // see vcfmerge.c - static int setter_ARinfo_string(args_t *args, bcf1_t *line, annot_col_t *col, int nals, char **als) 
- { -+ assert( col->merge_method==MM_FIRST ); -+ - int nsrc = 1, lsrc = 0; - while ( args->tmps[lsrc] ) - { -@@ -700,13 +882,13 @@ - lsrc++; - } - if ( col->number==BCF_VL_A && nsrc!=nals-1 && (nsrc!=1 || args->tmps[0]!='.' || args->tmps[1]!=0 ) ) -- error("Incorrect number of values (%d) for the %s tag at %s:%d\n", nsrc,col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1); -+ error("Incorrect number of values (%d) for the %s tag at %s:%"PRId64"\n", nsrc,col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); - else if ( col->number==BCF_VL_R && nsrc!=nals && (nsrc!=1 || args->tmps[0]!='.' || args->tmps[1]!=0 ) ) -- error("Incorrect number of values (%d) for the %s tag at %s:%d\n", nsrc,col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1); -+ error("Incorrect number of values (%d) for the %s tag at %s:%"PRId64"\n", nsrc,col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); - - int ndst = col->number==BCF_VL_A ? line->n_allele - 1 : line->n_allele; - int *map = vcmp_map_ARvalues(args->vcmp,ndst,nals,als,line->n_allele,line->d.allele); -- if ( !map ) error("REF alleles not compatible at %s:%d\n", bcf_seqname(args->hdr,line),line->pos+1); -+ if ( !map ) error("REF alleles not compatible at %s:%"PRId64"\n", bcf_seqname(args->hdr,line),(int64_t) line->pos+1); - - // fill in any missing values in the target VCF (or all, if not present) - int i, empty = 0, nstr, mstr = args->tmpks.m; -@@ -746,22 +928,76 @@ - bcf_update_info_string(args->hdr_out,line,col->hdr_key_dst,args->tmpks.s); - return 0; - } -+void khash_str2int_clear_free(void *_hash) -+{ -+ khash_t(str2int) *hash = (khash_t(str2int)*)_hash; -+ khint_t k; -+ if (hash == 0) return; -+ for (k = 0; k < kh_end(hash); ++k) -+ if (kh_exist(hash, k)) free((char*)kh_key(hash, k)); -+ kh_clear(str2int, hash); -+} - static int setter_info_str(args_t *args, bcf1_t *line, annot_col_t *col, void *data) - { -+ if ( col->replace==REPLACE_MISSING && col->number!=BCF_VL_A && 
col->number!=BCF_VL_R ) -+ { -+ int ret = bcf_get_info_string(args->hdr, line, col->hdr_key_dst, &args->tmps2, &args->mtmps2); -+ if ( ret>0 && (args->tmps2[0]!='.' || args->tmps2[1]!=0) ) return 0; -+ } -+ - annot_line_t *tab = (annot_line_t*) data; -- int len = strlen(tab->cols[col->icol]); -- if ( !len ) return 0; -- hts_expand(char,len+1,args->mtmps,args->tmps); -- memcpy(args->tmps,tab->cols[col->icol],len+1); -- if ( args->tmps[0]=='.' && args->tmps[1]==0 ) return 0; -+ -+ int len = 0; -+ if ( tab ) -+ { -+ len = strlen(tab->cols[col->icol]); -+ if ( !len ) return 0; -+ if ( len==1 && tab->cols[col->icol][0]=='.' ) return 0; -+ } - -- if ( col->number==BCF_VL_A || col->number==BCF_VL_R ) -- return setter_ARinfo_string(args,line,col,tab->nals,tab->als); -+ if ( col->merge_method!=MM_FIRST ) -+ { -+ if ( col->number==BCF_VL_A || col->number==BCF_VL_R ) -+ error("Error: the --merge-logic option cannot be used with INFO tags Type=String,Number={A,R,G}\n"); - -- if ( col->replace==REPLACE_MISSING ) -+ if ( data ) -+ { -+ assert( col->merge_method==MM_APPEND || col->merge_method==MM_UNIQUE ); -+ if ( col->merge_method==MM_UNIQUE ) -+ { -+ if ( !col->mm_str_hash ) col->mm_str_hash = (khash_t(str2int)*)khash_str2int_init(); -+ if ( khash_str2int_has_key(col->mm_str_hash, tab->cols[col->icol]) ) return 0; -+ khash_str2int_inc(col->mm_str_hash, strdup(tab->cols[col->icol])); -+ } -+ -+ if ( col->mm_kstr.l ) kputc(',',&col->mm_kstr); -+ kputs(tab->cols[col->icol], &col->mm_kstr); -+ return 0; -+ } -+ -+ if ( col->mm_kstr.l ) -+ { -+ hts_expand(char,col->mm_kstr.l+1,args->mtmps,args->tmps); -+ memcpy(args->tmps,col->mm_kstr.s,col->mm_kstr.l+1); -+ } -+ else -+ return 0; -+ -+ if ( !data ) // flush the line -+ { -+ if ( col->merge_method==MM_UNIQUE ) -+ khash_str2int_clear_free(col->mm_str_hash); -+ col->mm_kstr.l = 0; -+ } -+ } -+ else - { -- int ret = bcf_get_info_string(args->hdr, line, col->hdr_key_dst, &args->tmps2, &args->mtmps2); -- if ( ret>0 && 
(args->tmps2[0]!='.' || args->tmps2[1]!=0) ) return 0; -+ assert(tab); -+ hts_expand(char,len+1,args->mtmps,args->tmps); -+ memcpy(args->tmps,tab->cols[col->icol],len+1); -+ -+ if ( col->number==BCF_VL_A || col->number==BCF_VL_R ) -+ return setter_ARinfo_string(args,line,col,tab->nals,tab->als); - } - - bcf_update_info_string(args->hdr_out,line,col->hdr_key_dst,args->tmps); -@@ -785,6 +1021,48 @@ - bcf_update_info_string(args->hdr_out,line,col->hdr_key_dst,args->tmps); - return 0; - } -+static int genotypes_to_string(args_t *args, int nsrc1, int32_t *src, int nsmpl_dst, kstring_t *str) -+{ -+ int i, isrc, idst; -+ int blen = nsrc1 > 1 ? nsrc1 + 1 : 1; // typically the genotypes take three bytes 0/1, no 0-termination is needed -+ -+gt_length_too_big: -+ str->l = 0; -+ for (idst=0; idstsample_map ? args->sample_map[idst] : idst; -+ if ( isrc==-1 ) -+ { -+ kputc_('.', str); -+ for (i=1; i < blen; i++) kputc_(0, str); -+ continue; -+ } -+ -+ size_t plen = str->l; -+ int32_t *ptr = src + isrc*nsrc1; -+ for (i=0; il - plen > blen ) -+ { -+ // too many alternate alleles or ploidy is too large, the genotype does not fit -+ // three characters ("0/0" vs "10/10"). -+ blen *= 2; -+ goto gt_length_too_big; -+ } -+ plen = str->l - plen; -+ while ( plen < blen ) -+ { -+ kputc_(0, str); -+ plen++; -+ } -+ } -+ return 0; -+} - static int vcf_setter_format_gt(args_t *args, bcf1_t *line, annot_col_t *col, void *data) - { - bcf1_t *rec = (bcf1_t*) data; -@@ -792,6 +1070,16 @@ - if ( nsrc==-3 ) return 0; // the tag is not present - if ( nsrc<=0 ) return 1; // error - -+ // Genotypes are internally represented as integers. 
This is a complication when -+ // adding as a different Type=String field, such as FMT/newGT:=GT -+ if ( strcmp(col->hdr_key_src,col->hdr_key_dst) ) -+ { -+ int nsmpl_dst = bcf_hdr_nsamples(args->hdr_out); -+ int nsmpl_src = bcf_hdr_nsamples(args->files->readers[1].header); -+ genotypes_to_string(args,nsrc/nsmpl_src,args->tmpi,nsmpl_dst,&args->tmpks); -+ return bcf_update_format_char(args->hdr_out,line,col->hdr_key_dst,args->tmpks.s,args->tmpks.l); -+ } -+ - if ( !args->sample_map ) - return bcf_update_genotypes(args->hdr_out,line,args->tmpi,nsrc); - -@@ -1057,9 +1345,11 @@ - } - static int setter_format_int(args_t *args, bcf1_t *line, annot_col_t *col, void *data) - { -+ if ( !data ) error("Error: the --merge-logic option cannot be used with FORMAT tags (yet?)\n"); -+ - annot_line_t *tab = (annot_line_t*) data; - if ( col->icol+args->nsmpl_annot > tab->ncols ) -- error("Incorrect number of values for %s at %s:%d\n",col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1); -+ error("Incorrect number of values for %s at %s:%"PRId64"\n",col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); - int nvals = count_vals(tab,col->icol,col->icol+args->nsmpl_annot); - hts_expand(int32_t,nvals*args->nsmpl_annot,args->mtmpi,args->tmpi); - -@@ -1082,7 +1372,7 @@ - char *end = str; - ptr[ival] = strtol(str, &end, 10); - if ( end==str ) -- error("Could not parse %s at %s:%d .. [%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1,tab->cols[col->icol]); -+ error("Could not parse %s at %s:%"PRId64" .. [%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1,tab->cols[col->icol]); - - ival++; - str = *end ? 
end+1 : end; -@@ -1094,9 +1384,11 @@ - } - static int setter_format_real(args_t *args, bcf1_t *line, annot_col_t *col, void *data) - { -+ if ( !data ) error("Error: the --merge-logic option cannot be used with FORMAT tags (yet?)\n"); -+ - annot_line_t *tab = (annot_line_t*) data; - if ( col->icol+args->nsmpl_annot > tab->ncols ) -- error("Incorrect number of values for %s at %s:%d\n",col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1); -+ error("Incorrect number of values for %s at %s:%"PRId64"\n",col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); - int nvals = count_vals(tab,col->icol,col->icol+args->nsmpl_annot); - hts_expand(float,nvals*args->nsmpl_annot,args->mtmpf,args->tmpf); - -@@ -1120,7 +1412,7 @@ - char *end = str; - ptr[ival] = strtod(str, &end); - if ( end==str ) -- error("Could not parse %s at %s:%d .. [%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1,tab->cols[col->icol]); -+ error("Could not parse %s at %s:%"PRId64" .. [%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1,tab->cols[col->icol]); - - ival++; - str = *end ? end+1 : end; -@@ -1132,9 +1424,11 @@ - } - static int setter_format_str(args_t *args, bcf1_t *line, annot_col_t *col, void *data) - { -+ if ( !data ) error("Error: the --merge-logic option cannot be used with FORMAT tags (yet?)\n"); -+ - annot_line_t *tab = (annot_line_t*) data; - if ( col->icol+args->nsmpl_annot > tab->ncols ) -- error("Incorrect number of values for %s at %s:%d\n",col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1); -+ error("Incorrect number of values for %s at %s:%"PRId64"\n",col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); - - int ismpl; - for (ismpl=0; ismplnsmpl_annot; ismpl++) -@@ -1186,7 +1480,7 @@ - // create mapping from src to dst genotypes, haploid and diploid version - int nmap_hap = col->number==BCF_VL_G || col->number==BCF_VL_R ? 
rec->n_allele : rec->n_allele - 1; - int *map_hap = vcmp_map_ARvalues(args->vcmp,nmap_hap,line->n_allele,line->d.allele,rec->n_allele,rec->d.allele); -- if ( !map_hap ) error("REF alleles not compatible at %s:%d\n", bcf_seqname(args->hdr,line),line->pos+1); -+ if ( !map_hap ) error("REF alleles not compatible at %s:%"PRId64"\n", bcf_seqname(args->hdr,line),(int64_t) line->pos+1); - - int i, j; - if ( rec->n_allele==line->n_allele ) -@@ -1226,15 +1520,15 @@ - } - int pld_src = determine_ploidy(rec->n_allele, args->tmpi, nsrc1, args->src_smpl_pld, nsmpl_src); - if ( pld_src<0 ) -- error("Unexpected number of %s values (%d) for %d alleles at %s:%d\n", col->hdr_key_src,-pld_src, rec->n_allele, bcf_seqname(bcf_sr_get_header(args->files,1),rec),rec->pos+1); -+ error("Unexpected number of %s values (%d) for %d alleles at %s:%"PRId64"\n", col->hdr_key_src,-pld_src, rec->n_allele, bcf_seqname(bcf_sr_get_header(args->files,1),rec),(int64_t) rec->pos+1); - int pld_dst = determine_ploidy(line->n_allele, args->tmpi2, ndst1, args->dst_smpl_pld, nsmpl_dst); - if ( pld_dst<0 ) -- error("Unexpected number of %s values (%d) for %d alleles at %s:%d\n", col->hdr_key_src,-pld_dst, line->n_allele, bcf_seqname(args->hdr,line),line->pos+1); -+ error("Unexpected number of %s values (%d) for %d alleles at %s:%"PRId64"\n", col->hdr_key_src,-pld_dst, line->n_allele, bcf_seqname(args->hdr,line),(int64_t) line->pos+1); - - int ndst1_new = pld_dst==1 ? line->n_allele : line->n_allele*(line->n_allele+1)/2; - if ( ndst1_new != ndst1 ) - { -- if ( ndst1 ) error("todo: %s ndst1!=ndst .. %d %d at %s:%d\n",col->hdr_key_src,ndst1_new,ndst1,bcf_seqname(args->hdr,line),line->pos+1); -+ if ( ndst1 ) error("todo: %s ndst1!=ndst .. 
%d %d at %s:%"PRId64"\n",col->hdr_key_src,ndst1_new,ndst1,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); - ndst1 = ndst1_new; - hts_expand(int32_t, ndst1*nsmpl_dst, args->mtmpi2, args->tmpi2); - } -@@ -1254,7 +1548,7 @@ - if ( col->number==BCF_VL_G ) - { - if ( args->src_smpl_pld[ii] > 0 && args->dst_smpl_pld[i] > 0 && args->src_smpl_pld[ii]!=args->dst_smpl_pld[i] ) -- error("Sample ploidy differs at %s:%d\n", bcf_seqname(args->hdr,line),line->pos+1); -+ error("Sample ploidy differs at %s:%"PRId64"\n", bcf_seqname(args->hdr,line),(int64_t) line->pos+1); - if ( !args->dst_smpl_pld[i] ) - for (j=0; jfiles->readers[1].header,rec,col->hdr_key_src,&args->tmpf,&args->mtmpf); - if ( nsrc==-3 ) return 0; // the tag is not present -@@ -1294,7 +1587,7 @@ - // create mapping from src to dst genotypes, haploid and diploid version - int nmap_hap = col->number==BCF_VL_G || col->number==BCF_VL_R ? rec->n_allele : rec->n_allele - 1; - int *map_hap = vcmp_map_ARvalues(args->vcmp,nmap_hap,line->n_allele,line->d.allele,rec->n_allele,rec->d.allele); -- if ( !map_hap ) error("REF alleles not compatible at %s:%d\n", bcf_seqname(args->hdr,line),line->pos+1); -+ if ( !map_hap ) error("REF alleles not compatible at %s:%"PRId64"\n", bcf_seqname(args->hdr,line),(int64_t) line->pos+1); - - int i, j; - if ( rec->n_allele==line->n_allele ) -@@ -1334,15 +1627,15 @@ - } - int pld_src = determine_ploidy(rec->n_allele, args->tmpi, nsrc1, args->src_smpl_pld, nsmpl_src); - if ( pld_src<0 ) -- error("Unexpected number of %s values (%d) for %d alleles at %s:%d\n", col->hdr_key_src,-pld_src, rec->n_allele, bcf_seqname(bcf_sr_get_header(args->files,1),rec),rec->pos+1); -+ error("Unexpected number of %s values (%d) for %d alleles at %s:%"PRId64"\n", col->hdr_key_src,-pld_src, rec->n_allele, bcf_seqname(bcf_sr_get_header(args->files,1),rec),(int64_t) rec->pos+1); - int pld_dst = determine_ploidy(line->n_allele, args->tmpi2, ndst1, args->dst_smpl_pld, nsmpl_dst); - if ( pld_dst<0 ) -- error("Unexpected 
number of %s values (%d) for %d alleles at %s:%d\n", col->hdr_key_src,-pld_dst, line->n_allele, bcf_seqname(args->hdr,line),line->pos+1); -+ error("Unexpected number of %s values (%d) for %d alleles at %s:%"PRId64"\n", col->hdr_key_src,-pld_dst, line->n_allele, bcf_seqname(args->hdr,line),(int64_t) line->pos+1); - - int ndst1_new = pld_dst==1 ? line->n_allele : line->n_allele*(line->n_allele+1)/2; - if ( ndst1_new != ndst1 ) - { -- if ( ndst1 ) error("todo: %s ndst1!=ndst .. %d %d at %s:%d\n",col->hdr_key_src,ndst1_new,ndst1,bcf_seqname(args->hdr,line),line->pos+1); -+ if ( ndst1 ) error("todo: %s ndst1!=ndst .. %d %d at %s:%"PRId64"\n",col->hdr_key_src,ndst1_new,ndst1,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); - ndst1 = ndst1_new; - hts_expand(float, ndst1*nsmpl_dst, args->mtmpf2, args->tmpf2); - } -@@ -1362,7 +1655,7 @@ - if ( col->number==BCF_VL_G ) - { - if ( args->src_smpl_pld[ii] > 0 && args->dst_smpl_pld[i] > 0 && args->src_smpl_pld[ii]!=args->dst_smpl_pld[i] ) -- error("Sample ploidy differs at %s:%d\n", bcf_seqname(args->hdr,line),line->pos+1); -+ error("Sample ploidy differs at %s:%"PRId64"\n", bcf_seqname(args->hdr,line),(int64_t) line->pos+1); - if ( !args->dst_smpl_pld[i] ) - for (j=0; jtmps = args->tmpp[0]; // tmps might be realloced - if ( ret==-3 ) return 0; // the tag is not present - if ( ret<=0 ) return 1; // error -- return core_setter_format_str(args,line,col,args->tmpp); -+ if ( strcmp("GT",col->hdr_key_dst) ) -+ return core_setter_format_str(args,line,col,args->tmpp); -+ -+ // Genotypes are internally represented as integers. 
This is a complication for FMT/GT:=oldGT -+ // First determine the maximum number of alleles per-sample ndst1 -+ int nsmpl_src = bcf_hdr_nsamples(args->files->readers[1].header); -+ int nsmpl_dst = bcf_hdr_nsamples(args->hdr_out); -+ int isrc,idst, ndst1 = 0, nsrc1 = ret / nsmpl_src; -+ char *ptr = args->tmps, *ptr_end = ptr + ret; -+ while ( ptr < ptr_end ) -+ { -+ char *smpl_end = ptr + nsrc1; -+ int n = 1; -+ while ( ptr < smpl_end ) -+ { -+ if ( *ptr=='/' || *ptr=='|' ) n++; -+ ptr++; -+ } -+ if ( ndst1 < n ) ndst1 = n; -+ } -+ assert( ndst1 ); -+ -+ int ndst = ndst1*nsmpl_dst; -+ hts_expand(int32_t,ndst,args->mtmpi,args->tmpi); -+ hts_expand(char,ret+1,args->mtmps,args->tmps); args->tmps[ret] = 0; // the FORMAT string may not be 0-terminated -+ for (idst=0; idsttmpi + idst*ndst1; -+ isrc = args->sample_map ? args->sample_map[idst] : idst; -+ if ( isrc==-1 ) -+ { -+ dst[0] = bcf_gt_missing; -+ for (i=1; itmps + isrc*nsrc1, *tmp; -+ char *keep_ptr = beg+nsrc1, keep = *keep_ptr; *keep_ptr = 0; -+ while ( *beg ) -+ { -+ char *end = beg; -+ while ( *end && *end!='/' && *end!='|' ) end++; -+ if ( *beg=='.' && end-beg==1 ) dst[i] = bcf_gt_missing; -+ else -+ { -+ if ( *end=='|' ) is_phased = 1; -+ dst[i] = strtol(beg, &tmp, 10); -+ if ( tmp!=end ) -+ error("Could not parse the %s field at %s:%"PRId64" in %s\n", col->hdr_key_src,bcf_seqname(args->files->readers[1].header,rec),(int64_t) rec->pos+1,args->targets_fname); -+ if ( dst[i] >= line->n_allele ) -+ error("The source allele index is bigger than the number of destination alleles at %s:%"PRId64"\n", bcf_seqname(args->files->readers[1].header,rec),(int64_t) rec->pos+1); -+ dst[i] = is_phased ? bcf_gt_phased(dst[i]) : bcf_gt_unphased(dst[i]); -+ } -+ beg = *end ? 
end+1 : end; -+ i++; -+ } -+ *keep_ptr = keep; -+ for (; ihdr_out,line,args->tmpi,ndst); - } - static int init_sample_map(args_t *args, bcf_hdr_t *src, bcf_hdr_t *dst) - { -@@ -1446,62 +1798,25 @@ - args->sample_map = (int*) malloc(sizeof(int)*args->nsample_map); - for (i=0; insample_map; i++) args->sample_map[i] = -1; - -- // possible todo: could do with smpl_ilist only -- smpl_ilist_t *ilist = smpl_ilist_init(dst, args->sample_names, args->sample_is_file, SMPL_STRICT); -- if ( !ilist || !ilist->n ) error("Could not parse: %s\n", args->sample_names); -- char **samples = (char**) malloc(sizeof(char*)*ilist->n); -- for (i=0; in; i++) samples[i] = strdup(dst->samples[i]); -+ int flags = !src ? SMPL_STRICT|SMPL_SINGLE : SMPL_STRICT|SMPL_SINGLE|SMPL_PAIR2; // is vcf vs tab annotation file -+ smpl_ilist_t *ilist = smpl_ilist_init(dst, args->sample_names, args->sample_is_file, flags); // gives mapping dst->src -+ if ( !ilist || !ilist->n ) error("Could not parse the samples: %s\n", args->sample_names); - args->nsmpl_annot = ilist->n; -- smpl_ilist_destroy(ilist); - int need_sample_map = args->nsmpl_annot==bcf_hdr_nsamples(dst) ? 0 : 1; -- if ( !src ) -+ for (i=0; insmpl_annot; i++) - { -- // tab annotation file -- for (i=0; insmpl_annot; i++) -+ int idst = ilist->idx[i]; -+ const char *src_name = ilist->pair && ilist->pair[i] ? 
ilist->pair[i] : bcf_hdr_int2id(dst, BCF_DT_SAMPLE, idst); -+ int isrc = i; -+ if ( src ) // the annotation file is a VCF, not a tab-delimited file - { -- int idst = bcf_hdr_id2int(dst, BCF_DT_SAMPLE, samples[i]); -- if ( idst==-1 ) error("Sample \"%s\" not found in the destination file\n", samples[i]); -- args->sample_map[idst] = i; -- if ( idst!=i ) need_sample_map = 1; -- } -- } -- else -- { -- // vcf annotation file -- for (i=0; insmpl_annot; i++) -- { -- int isrc, idst; -- char *ss = samples[i], *se = samples[i]; -- while ( *se && !isspace(*se) ) se++; -- if ( !*se ) -- { -- // only one sample name -- isrc = bcf_hdr_id2int(src, BCF_DT_SAMPLE,ss); -- if ( isrc==-1 ) error("Sample \"%s\" not found in the source file\n", ss); -- idst = bcf_hdr_id2int(dst, BCF_DT_SAMPLE,ss); -- if ( idst==-1 ) error("Sample \"%s\" not found in the destination file\n", ss); -- args->sample_map[idst] = isrc; -- if ( idst!=isrc ) need_sample_map = 1; -- continue; -- } -- *se = 0; -- isrc = bcf_hdr_id2int(src, BCF_DT_SAMPLE,ss); -- if ( isrc==-1 ) error("Sample \"%s\" not found in the source file\n", ss); -- -- ss = se+1; -- while ( isspace(*ss) ) ss++; -- se = ss; -- while ( *se && !isspace(*se) ) se++; -- -- idst = bcf_hdr_id2int(dst, BCF_DT_SAMPLE,ss); -- if ( idst==-1 ) error("Sample \"%s\" not found in the destination file\n", ss); -- -- args->sample_map[idst] = isrc; -- if ( idst!=isrc ) need_sample_map = 1; -+ isrc = bcf_hdr_id2int(src, BCF_DT_SAMPLE, src_name); -+ if ( isrc==-1 ) error("Sample \"%s\" not found in the annotation file\n", src_name); - } -+ if ( isrc!=idst ) need_sample_map = 1; -+ args->sample_map[idst] = isrc; - } -- for (i=0; insmpl_annot; i++) free(samples[i]); -- free(samples); -+ smpl_ilist_destroy(ilist); - return need_sample_map; - } - static char *columns_complement(char *columns, void **skip_info, void **skip_fmt) -@@ -1605,9 +1920,9 @@ - kputsn(ss, se-ss, &str); - if ( !str.s[0] || !strcasecmp("-",str.s) ) ; - else if ( !strcasecmp("CHROM",str.s) ) 
args->chr_idx = icol; -- else if ( !strcasecmp("POS",str.s) ) args->from_idx = icol; -- else if ( !strcasecmp("FROM",str.s) ) args->from_idx = icol; -- else if ( !strcasecmp("TO",str.s) ) args->to_idx = icol; -+ else if ( !strcasecmp("POS",str.s) ) args->beg_idx = icol; -+ else if ( !strcasecmp("FROM",str.s) || !strcasecmp("BEG",str.s) ) args->beg_idx = icol; -+ else if ( !strcasecmp("TO",str.s) || !strcasecmp("END",str.s) ) args->end_idx = icol; - else if ( !strcasecmp("REF",str.s) ) - { - if ( args->tgts_is_vcf ) -@@ -1667,7 +1982,8 @@ - bcf_hrec_format(hrec, &tmp); - bcf_hdr_append(args->hdr_out, tmp.s); - } -- bcf_hdr_sync(args->hdr_out); -+ if (bcf_hdr_sync(args->hdr_out) < 0) -+ error_errno("[%s] Failed to update header", __func__); - } - } - else if ( !strcasecmp("QUAL",str.s) ) -@@ -1698,7 +2014,8 @@ - tmp.l = 0; - bcf_hrec_format(hrec, &tmp); - bcf_hdr_append(args->hdr_out, tmp.s); -- bcf_hdr_sync(args->hdr_out); -+ if (bcf_hdr_sync(args->hdr_out) < 0) -+ error_errno("[%s] Failed to update header", __func__); - int hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, hrec->vals[k]); - args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols); - annot_col_t *col = &args->cols[args->ncols-1]; -@@ -1732,7 +2049,8 @@ - tmp.l = 0; - bcf_hrec_format(hrec, &tmp); - bcf_hdr_append(args->hdr_out, tmp.s); -- bcf_hdr_sync(args->hdr_out); -+ if (bcf_hdr_sync(args->hdr_out) < 0) -+ error_errno("[%s] Failed to update header", __func__); - int hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, hrec->vals[k]); - args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols); - annot_col_t *col = &args->cols[args->ncols-1]; -@@ -1774,7 +2092,8 @@ - tmp.l = 0; - bcf_hrec_format_rename(hrec, key_dst, &tmp); - bcf_hdr_append(args->hdr_out, tmp.s); -- bcf_hdr_sync(args->hdr_out); -+ if (bcf_hdr_sync(args->hdr_out) < 0) -+ error_errno("[%s] Failed to update header", __func__); - } - int hdr_id = 
bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, key_dst); - if ( !bcf_hdr_idinfo_exists(args->hdr_out,BCF_HL_FMT,hdr_id) ) -@@ -1811,13 +2130,30 @@ - { - if ( replace==REPLACE_NON_MISSING ) error("Apologies, the -INFO/TAG feature has not been implemented yet.\n"); - if ( replace==SET_OR_APPEND ) error("Apologies, the =INFO/TAG feature has not been implemented yet.\n"); -- char *key_dst = !strncasecmp("INFO/",str.s,5) ? str.s + 5 : str.s; -+ int explicit_info = 0; -+ char *key_dst; -+ if ( !strncasecmp("INFO/",str.s,5) ) -+ { -+ key_dst = str.s + 5; -+ explicit_info = 1; -+ } -+ else -+ key_dst = str.s; - char *key_src = strstr(key_dst,":="); - if ( key_src ) - { - *key_src = 0; - key_src += 2; -- if ( !strncasecmp("INFO/",key_src,5) ) key_src += 5; -+ if ( !strncasecmp("INFO/",key_src,5) ) -+ { -+ key_src += 5; -+ explicit_info = 1; -+ } -+ else if ( !strncasecmp("FMT/",key_src,4) || !strncasecmp("FORMAT/",key_src,5) ) -+ { -+ key_src[-2] = ':'; -+ error("Did you mean \"FMT/%s\" rather than \"%s\"?\n",str.s,str.s); -+ } - } - else - key_src = key_dst; -@@ -1827,11 +2163,18 @@ - if ( args->tgts_is_vcf ) // reading annotations from a VCF, add a new header line - { - bcf_hrec_t *hrec = bcf_hdr_get_hrec(args->files->readers[1].header, BCF_HL_INFO, "ID", key_src, NULL); -- if ( !hrec ) error("The tag \"%s\" is not defined in %s\n", str.s,args->files->readers[1].fname); -+ if ( !hrec ) -+ { -+ if ( !explicit_info && bcf_hdr_get_hrec(args->files->readers[1].header, BCF_HL_FMT, "ID", key_src, NULL) ) -+ error("Did you mean \"FMT/%s\" rather than \"%s\"?\n",str.s,str.s); -+ fprintf(stderr,"[%s] %d\n",key_src,explicit_info); -+ error("The tag \"%s\" is not defined in %s\n", key_src,args->files->readers[1].fname); -+ } - tmp.l = 0; - bcf_hrec_format_rename(hrec, key_dst, &tmp); - bcf_hdr_append(args->hdr_out, tmp.s); -- bcf_hdr_sync(args->hdr_out); -+ if (bcf_hdr_sync(args->hdr_out) < 0) -+ error_errno("[%s] Failed to update header", __func__); - hdr_id = 
bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, key_dst); - } - else -@@ -1860,7 +2203,6 @@ - } - free(str.s); - free(tmp.s); -- if ( args->to_idx==-1 ) args->to_idx = args->from_idx; - free(args->columns); - if ( skip_info ) khash_str2int_destroy_free(skip_info); - if ( skip_fmt ) khash_str2int_destroy_free(skip_fmt); -@@ -1879,6 +2221,54 @@ - else if ( sample_map_ok<0 ) - error("No matching samples in source and destination file?\n"); - } -+static void init_merge_method(args_t *args) -+{ -+ int i; -+ for (i=0; incols; i++) -+ { -+ args->cols[i].merge_method = MM_FIRST; -+ args->cols[i].mm_str_hash = NULL; -+ args->cols[i].mm_dbl = NULL; -+ args->cols[i].mm_dbl_nalloc = args->cols[i].mm_dbl_nused = args->cols[i].mm_dbl_ndat = 0; -+ memset(&args->cols[i].mm_kstr, 0, sizeof(args->cols[i].mm_kstr)); -+ } -+ if ( !args->merge_method_str ) return; -+ if ( args->tgts_is_vcf ) error("Error: the --merge-logic is intended for use with BED or TAB-delimited files only.\n"); -+ if ( !args->tgt_idx ) error("Error: BEG,END (or FROM,TO) columns are expected with the --merge-logic option.\n"); -+ char *sb = args->merge_method_str; -+ while ( *sb ) -+ { -+ char *se = sb; -+ while ( *se && *se!=',' ) se++; -+ args->tmpks.l = 0; -+ kputsn(sb, se-sb, &args->tmpks); -+ kputc(0, &args->tmpks); -+ char *mm_type_str = args->tmpks.s + args->tmpks.l; -+ while ( *mm_type_str!=':' && mm_type_str > args->tmpks.s ) mm_type_str--; -+ if ( *mm_type_str!=':' ) -+ error("Error: could not parse the argument to --merge-logic: %s\n", args->merge_method_str); -+ *mm_type_str = 0; -+ mm_type_str++; -+ int mm_type = MM_FIRST; -+ if ( !strcasecmp("unique",mm_type_str) ) mm_type = MM_UNIQUE; -+ else if ( !strcasecmp("append",mm_type_str) ) mm_type = MM_APPEND; -+ else if ( !strcasecmp("sum",mm_type_str) ) mm_type = MM_SUM; -+ else if ( !strcasecmp("avg",mm_type_str) ) mm_type = MM_AVG; -+ else if ( !strcasecmp("min",mm_type_str) ) mm_type = MM_MIN; -+ else if ( !strcasecmp("max",mm_type_str) ) mm_type = MM_MAX; 
-+ else error("Error: could not parse --merge-logic %s, the logic \"%s\" is not recognised\n", args->merge_method_str,mm_type_str); -+ for (i=0; incols; i++) -+ { -+ if ( strcmp(args->cols[i].hdr_key_dst,args->tmpks.s) ) continue; -+ if ( mm_type==MM_APPEND && args->cols[i].number!=BCF_VL_VAR ) -+ error("Error: --merge-logic append can be requested only for tags of variable length (Number=.)\n"); -+ args->cols[i].merge_method = mm_type; -+ break; -+ } -+ if ( i==args->ncols ) error("No such tag in the destination file: %s\n", args->tmpks.s); -+ sb = *se ? se + 1 : se; -+ } -+} - - static void rename_chrs(args_t *args, char *fname) - { -@@ -1927,13 +2317,30 @@ - { - if ( !args->columns ) error("The -c option not given\n"); - if ( args->chr_idx==-1 ) error("The -c CHROM option not given\n"); -- if ( args->from_idx==-1 ) error("The -c POS option not given\n"); -- if ( args->to_idx==-1 ) args->to_idx = -args->from_idx - 1; -- -- args->tgts = bcf_sr_regions_init(args->targets_fname,1,args->chr_idx,args->from_idx,args->to_idx); -- if ( !args->tgts ) error("Could not initialize the annotation file: %s\n", args->targets_fname); -- if ( !args->tgts->tbx ) error("Expected tabix-indexed annotation file: %s\n", args->targets_fname); -+ if ( args->beg_idx==-1 ) error("The -c POS option not given\n"); -+ if ( args->single_overlaps && args->merge_method_str ) error("The options --merge-logic and --single-overlaps cannot be combined\n"); -+ if ( args->end_idx==-1 || (args->single_overlaps && !args->merge_method_str) ) -+ { -+ args->end_idx = -args->beg_idx - 1; -+ args->tgts = bcf_sr_regions_init(args->targets_fname,1,args->chr_idx,args->beg_idx,args->end_idx); -+ if ( !args->tgts ) error("Could not initialize the annotation file: %s\n", args->targets_fname); -+ if ( !args->tgts->tbx ) error("Expected tabix-indexed annotation file: %s\n", args->targets_fname); -+ } -+ else -+ { -+ if ( args->ref_idx!=-1 ) error("Error: the REF columns will be ignored when BEG,END (or FROM,TO) is 
present. Replace END (or TO) with \"-\".\n"); -+ int len = strlen(args->targets_fname); -+ if ( len>=7 && !strcasecmp(".bed.gz",args->targets_fname+len-7) ) args->tgt_is_bed = 1; -+ else if ( len>=8 && !strcasecmp(".bed.bgz",args->targets_fname+len-8) ) args->tgt_is_bed = 1; -+ else if ( len>=4 && !strcasecmp(".bed",args->targets_fname+len-4) ) args->tgt_is_bed = 1; -+ args->tgt_idx = regidx_init(args->targets_fname,parse_with_payload,free_payload,sizeof(char*),args); -+ if ( !args->tgt_idx ) error("Failed to parse: %s\n", args->targets_fname); -+ args->tgt_itr = regitr_init(args->tgt_idx); -+ args->nalines++; -+ hts_expand0(annot_line_t,args->nalines,args->malines,args->alines); -+ } - } -+ init_merge_method(args); - args->vcmp = vcmp_init(); - - if ( args->filter_str ) -@@ -1958,10 +2365,10 @@ - if ( args->rename_chrs ) rename_chrs(args, args->rename_chrs); - - args->out_fh = hts_open(args->output_fname,hts_bcf_wmode(args->output_type)); -- if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno)); -+ if ( args->out_fh == NULL ) error("[%s] Error: cannot write to \"%s\": %s\n", __func__,args->output_fname, strerror(errno)); - if ( args->n_threads ) - hts_set_opt(args->out_fh, HTS_OPT_THREAD_POOL, args->files->p); -- bcf_hdr_write(args->out_fh, args->hdr_out); -+ if ( bcf_hdr_write(args->out_fh, args->hdr_out)!=0 ) error("[%s] Error: failed to write the header to %s\n", __func__,args->output_fname); - } - } - -@@ -1976,6 +2383,9 @@ - { - free(args->cols[i].hdr_key_src); - free(args->cols[i].hdr_key_dst); -+ free(args->cols[i].mm_kstr.s); -+ if ( args->cols[i].mm_str_hash ) khash_str2int_destroy_free(args->cols[i].mm_str_hash); -+ free(args->cols[i].mm_dbl); - } - free(args->cols); - for (i=0; imalines; i++) -@@ -1985,6 +2395,11 @@ - free(args->alines[i].line.s); - } - free(args->alines); -+ if ( args->tgt_idx ) -+ { -+ regidx_destroy(args->tgt_idx); -+ regitr_destroy(args->tgt_itr); -+ } - if ( args->tgts ) 
bcf_sr_regions_destroy(args->tgts); - free(args->tmpks.s); - free(args->tmpi); -@@ -2007,6 +2422,48 @@ - free(args->sample_map); - } - -+static void parse_annot_line(args_t *args, char *str, annot_line_t *tmp) -+{ -+ tmp->line.l = 0; -+ kputs(str, &tmp->line); -+ char *s = tmp->line.s; -+ tmp->ncols = 1; -+ hts_expand(char*,tmp->ncols,tmp->mcols,tmp->cols); -+ tmp->cols[0] = s; -+ while ( *s ) -+ { -+ if ( *s=='\t' ) -+ { -+ tmp->ncols++; -+ hts_expand(char*,tmp->ncols,tmp->mcols,tmp->cols); -+ tmp->cols[tmp->ncols-1] = s+1; -+ *s = 0; -+ } -+ s++; -+ } -+ if ( args->ref_idx != -1 ) -+ { -+ if ( args->ref_idx >= tmp->ncols ) -+ error("Could not parse the line, expected %d+ columns, found %d:\n\t%s\n",args->ref_idx+1,tmp->ncols,str); -+ if ( args->alt_idx >= tmp->ncols ) -+ error("Could not parse the line, expected %d+ columns, found %d:\n\t%s\n",args->alt_idx+1,tmp->ncols,str); -+ tmp->nals = 2; -+ hts_expand(char*,tmp->nals,tmp->mals,tmp->als); -+ tmp->als[0] = tmp->cols[args->ref_idx]; -+ tmp->als[1] = s = tmp->cols[args->alt_idx]; -+ while ( *s ) -+ { -+ if ( *s==',' ) -+ { -+ tmp->nals++; -+ hts_expand(char*,tmp->nals,tmp->mals,tmp->als); -+ tmp->als[tmp->nals-1] = s+1; -+ *s = 0; -+ } -+ s++; -+ } -+ } -+} - static void buffer_annot_lines(args_t *args, bcf1_t *line, int start_pos, int end_pos) - { - if ( args->nalines && args->alines[0].rid != line->rid ) args->nalines = 0; -@@ -2037,44 +2494,9 @@ - tmp->rid = line->rid; - tmp->start = args->tgts->start; - tmp->end = args->tgts->end; -- tmp->line.l = 0; -- kputs(args->tgts->line.s, &tmp->line); -- char *s = tmp->line.s; -- tmp->ncols = 1; -- hts_expand(char*,tmp->ncols,tmp->mcols,tmp->cols); -- tmp->cols[0] = s; -- while ( *s ) -- { -- if ( *s=='\t' ) -- { -- tmp->ncols++; -- hts_expand(char*,tmp->ncols,tmp->mcols,tmp->cols); -- tmp->cols[tmp->ncols-1] = s+1; -- *s = 0; -- } -- s++; -- } -+ parse_annot_line(args, args->tgts->line.s, tmp); - if ( args->ref_idx != -1 ) - { -- if ( args->ref_idx >= tmp->ncols ) 
-- error("Could not parse the line, expected %d+ columns, found %d:\n\t%s\n",args->ref_idx+1,tmp->ncols,args->tgts->line.s); -- if ( args->alt_idx >= tmp->ncols ) -- error("Could not parse the line, expected %d+ columns, found %d:\n\t%s\n",args->alt_idx+1,tmp->ncols,args->tgts->line.s); -- tmp->nals = 2; -- hts_expand(char*,tmp->nals,tmp->mals,tmp->als); -- tmp->als[0] = tmp->cols[args->ref_idx]; -- tmp->als[1] = s = tmp->cols[args->alt_idx]; -- while ( *s ) -- { -- if ( *s==',' ) -- { -- tmp->nals++; -- hts_expand(char*,tmp->nals,tmp->mals,tmp->als); -- tmp->als[tmp->nals-1] = s+1; -- *s = 0; -- } -- s++; -- } - int iseq = args->tgts->iseq; - if ( bcf_sr_regions_next(args->tgts)<0 || args->tgts->iseq!=iseq ) break; - } -@@ -2088,7 +2510,30 @@ - for (i=0; inrm; i++) - args->rm[i].handler(args, line, &args->rm[i]); - -- if ( args->tgts ) -+ int has_overlap = 0; -+ -+ if ( args->tgt_idx ) -+ { -+ if ( regidx_overlap(args->tgt_idx, bcf_seqname(args->hdr,line),line->pos,line->pos+line->rlen-1, args->tgt_itr) ) -+ { -+ while ( regitr_overlap(args->tgt_itr) ) -+ { -+ annot_line_t *tmp = &args->alines[0]; -+ tmp->rid = line->rid; -+ tmp->start = args->tgt_itr->beg; -+ tmp->end = args->tgt_itr->end; -+ parse_annot_line(args, regitr_payload(args->tgt_itr,char*), tmp); -+ for (j=0; jncols; j++) -+ if ( args->cols[j].setter(args,line,&args->cols[j],tmp) ) -+ error("fixme: Could not set %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); -+ } -+ has_overlap = 1; -+ } -+ for (j=0; jncols; j++) -+ if ( args->cols[j].merge_method != MM_FIRST ) -+ args->cols[j].setter(args,line,&args->cols[j],NULL); -+ } -+ else if ( args->tgts ) - { - // Buffer annotation lines. When multiple ALT alleles are present in the - // annotation file, at least one must match one of the VCF alleles. 
-@@ -2119,18 +2564,9 @@ - // there is a matching line - for (j=0; jncols; j++) - if ( args->cols[j].setter(args,line,&args->cols[j],&args->alines[i]) ) -- error("fixme: Could not set %s at %s:%d\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1); -- -- } -- -- if ( args->mark_sites ) -- { -- // ideally, we'd like to be far more general than this in future, see https://github.com/samtools/bcftools/issues/87 -- if ( args->mark_sites_logic==MARK_LISTED ) -- bcf_update_info_flag(args->hdr_out,line,args->mark_sites,NULL,inalines?1:0); -- else -- bcf_update_info_flag(args->hdr_out,line,args->mark_sites,NULL,inalines?0:1); -+ error("fixme: Could not set %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); - } -+ has_overlap = inalines ? 1 : 0; - } - else if ( args->files->nreaders == 2 ) - { -@@ -2139,13 +2575,10 @@ - bcf1_t *aline = bcf_sr_get_line(args->files,1); - for (j=0; jncols; j++) - if ( args->cols[j].setter(args,line,&args->cols[j],aline) ) -- error("fixme: Could not set %s at %s:%d\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1); -+ error("fixme: Could not set %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); - -- if ( args->mark_sites ) -- bcf_update_info_flag(args->hdr_out,line,args->mark_sites,NULL,args->mark_sites_logic==MARK_LISTED ? 1 : 0); -+ has_overlap = 1; - } -- else if ( args->mark_sites ) -- bcf_update_info_flag(args->hdr_out,line,args->mark_sites,NULL, args->mark_sites_logic==MARK_UNLISTED ? 
1 : 0); - } - if ( args->set_ids ) - { -@@ -2160,6 +2593,15 @@ - bcf_update_id(args->hdr_out,line,args->tmpks.s); - } - } -+ -+ if ( args->mark_sites ) -+ { -+ // ideally, we'd like to be far more general than this in future, see https://github.com/samtools/bcftools/issues/87 -+ if ( args->mark_sites_logic==MARK_LISTED ) -+ bcf_update_info_flag(args->hdr_out,line,args->mark_sites,NULL,has_overlap?1:0); -+ else -+ bcf_update_info_flag(args->hdr_out,line,args->mark_sites,NULL,has_overlap?0:1); -+ } - } - - static void usage(args_t *args) -@@ -2173,10 +2615,12 @@ - fprintf(stderr, " --collapse matching records by , see man page for details [some]\n"); - fprintf(stderr, " -c, --columns list of columns in the annotation file, e.g. CHROM,POS,REF,ALT,-,INFO/TAG. See man page for details\n"); - fprintf(stderr, " -e, --exclude exclude sites for which the expression is true (see man page for details)\n"); -+ fprintf(stderr, " --force continue despite parsing error (at your own risk!)\n"); - fprintf(stderr, " -h, --header-lines lines which should be appended to the VCF header\n"); - fprintf(stderr, " -I, --set-id [+] set ID column, see man page for details\n"); - fprintf(stderr, " -i, --include select sites for which the expression is true (see man page for details)\n"); - fprintf(stderr, " -k, --keep-sites leave -i/-e sites unchanged instead of discarding them\n"); -+ fprintf(stderr, " -l, --merge-logic merge logic for multiple overlapping regions (see man page for details), EXPERIMENTAL\n"); - fprintf(stderr, " -m, --mark-sites [+-] add INFO/tag flag to sites which are (\"+\") or are not (\"-\") listed in the -a file\n"); - fprintf(stderr, " --no-version do not append version and command line to the header\n"); - fprintf(stderr, " -o, --output write output to a file [standard output]\n"); -@@ -2186,6 +2630,7 @@ - fprintf(stderr, " --rename-chrs rename sequences according to map file: from\\tto\n"); - fprintf(stderr, " -s, --samples [^] comma separated list of samples to 
annotate (or exclude with \"^\" prefix)\n"); - fprintf(stderr, " -S, --samples-file [^] file of samples to annotate (or exclude with \"^\" prefix)\n"); -+ fprintf(stderr, " --single-overlaps keep memory low by avoiding complexities arising from handling multiple overlapping intervals\n"); - fprintf(stderr, " -x, --remove list of annotations (e.g. ID,INFO/DP,FORMAT/DP,FILTER) to remove (or keep with \"^\" prefix). See man page for details\n"); - fprintf(stderr, " --threads number of extra output compression threads [0]\n"); - fprintf(stderr, "\n"); -@@ -2202,19 +2647,20 @@ - args->output_type = FT_VCF; - args->n_threads = 0; - args->record_cmd_line = 1; -- args->ref_idx = args->alt_idx = args->chr_idx = args->from_idx = args->to_idx = -1; -+ args->ref_idx = args->alt_idx = args->chr_idx = args->beg_idx = args->end_idx = -1; - args->set_ids_replace = 1; - int regions_is_file = 0, collapse = 0; - - static struct option loptions[] = - { -- {"keep-sites",required_argument,NULL,'k'}, -+ {"keep-sites",no_argument,NULL,'k'}, - {"mark-sites",required_argument,NULL,'m'}, - {"set-id",required_argument,NULL,'I'}, - {"output",required_argument,NULL,'o'}, - {"output-type",required_argument,NULL,'O'}, - {"threads",required_argument,NULL,9}, - {"annotations",required_argument,NULL,'a'}, -+ {"merge-logic",required_argument,NULL,'l'}, - {"collapse",required_argument,NULL,2}, - {"include",required_argument,NULL,'i'}, - {"exclude",required_argument,NULL,'e'}, -@@ -2226,12 +2672,15 @@ - {"header-lines",required_argument,NULL,'h'}, - {"samples",required_argument,NULL,'s'}, - {"samples-file",required_argument,NULL,'S'}, -+ {"single-overlaps",no_argument,NULL,10}, - {"no-version",no_argument,NULL,8}, -+ {"force",no_argument,NULL,'f'}, - {NULL,0,NULL,0} - }; -- while ((c = getopt_long(argc, argv, "h:?o:O:r:R:a:x:c:i:e:S:s:I:m:k",loptions,NULL)) >= 0) -+ while ((c = getopt_long(argc, argv, "h:?o:O:r:R:a:x:c:i:e:S:s:I:m:kl:f",loptions,NULL)) >= 0) - { - switch (c) { -+ case 'f': args->force 
= 1; break; - case 'k': args->keep_sites = 1; break; - case 'm': - args->mark_sites_logic = MARK_LISTED; -@@ -2239,6 +2688,7 @@ - else if ( optarg[0]=='-' ) { args->mark_sites = optarg+1; args->mark_sites_logic = MARK_UNLISTED; } - else args->mark_sites = optarg; - break; -+ case 'l': args->merge_method_str = optarg; break; - case 'I': args->set_ids_fmt = optarg; break; - case 's': args->sample_names = optarg; break; - case 'S': args->sample_names = optarg; args->sample_is_file = 1; break; -@@ -2273,6 +2723,7 @@ - break; - case 9 : args->n_threads = strtol(optarg, 0, 0); break; - case 8 : args->record_cmd_line = 0; break; -+ case 10 : args->single_overlaps = 1; break; - case '?': usage(args); break; - default: error("Unknown argument: %s\n", optarg); - } -@@ -2294,6 +2745,7 @@ - if ( args->targets_fname ) - { - htsFile *fp = hts_open(args->targets_fname,"r"); -+ if ( !fp ) error("Failed to open %s\n", args->targets_fname); - htsFormat type = *hts_get_format(fp); - hts_close(fp); - -@@ -2305,26 +2757,40 @@ - } - } - if ( bcf_sr_set_threads(args->files, args->n_threads)<0 ) error("Failed to create threads\n"); -- if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->files->errnum)); -+ if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to read from %s: %s\n", !strcmp("-",fname)?"standard input":fname,bcf_sr_strerror(args->files->errnum)); - -+ static int line_errcode_warned = 0; - init_data(args); - while ( bcf_sr_next_line(args->files) ) - { - if ( !bcf_sr_has_line(args->files,0) ) continue; - bcf1_t *line = bcf_sr_get_line(args->files,0); -- if ( line->errcode ) error("Encountered error, cannot proceed. Please check the error output above.\n"); -+ if ( line->errcode ) -+ { -+ if ( !args->force ) -+ error("Encountered an error, cannot proceed. Please check the error output above.\n" -+ "If feeling adventurous, use the --force option. 
(At your own risk!)\n"); -+ else if ( !line_errcode_warned ) -+ { -+ fprintf(stderr, -+ "Warning: Encountered an error, proceeding only because --force was given.\n" -+ " Note that this can result in a segfault or a silent corruption of the output file!\n"); -+ line_errcode_warned = 1; -+ line->errcode = 0; -+ } -+ } - if ( args->filter ) - { - int pass = filter_test(args->filter, line, NULL); - if ( args->filter_logic & FLT_EXCLUDE ) pass = pass ? 0 : 1; - if ( !pass ) - { -- if ( args->keep_sites ) bcf_write1(args->out_fh, args->hdr_out, line); -+ if ( args->keep_sites && bcf_write1(args->out_fh, args->hdr_out, line)!=0 ) error("[%s] Error: failed to write to %s\n", __func__,args->output_fname); - continue; - } - } - annotate(args, line); -- bcf_write1(args->out_fh, args->hdr_out, line); -+ if ( bcf_write1(args->out_fh, args->hdr_out, line)!=0 ) error("[%s] Error: failed to write to %s\n", __func__,args->output_fname); - } - destroy_data(args); - bcf_sr_destroy(args->files); ---- python-pysam.orig/bcftools/vcfannotate.c.pysam.c -+++ python-pysam/bcftools/vcfannotate.c.pysam.c -@@ -2,7 +2,7 @@ - - /* vcfannotate.c -- Annotate and edit VCF/BCF files. - -- Copyright (C) 2013-2018 Genome Research Ltd. -+ Copyright (C) 2013-2019 Genome Research Ltd. 
- - Author: Petr Danecek - -@@ -35,16 +35,17 @@ - #include - #include - #include -+#include - #include - #include - #include - #include --#include - #include "bcftools.h" - #include "vcmp.h" - #include "filter.h" - #include "convert.h" - #include "smpl_ilist.h" -+#include "regidx.h" - - struct _args_t; - -@@ -67,15 +68,30 @@ - } - annot_line_t; - --#define REPLACE_MISSING 0 // replace only missing values --#define REPLACE_ALL 1 // replace both missing and existing values --#define REPLACE_NON_MISSING 2 // replace only if tgt is not missing --#define SET_OR_APPEND 3 // set new value if missing or non-existent, append otherwise -+#define REPLACE_MISSING 0 // replace only missing values -+#define REPLACE_ALL 1 // replace both missing and existing values -+#define REPLACE_NON_MISSING 2 // replace only if tgt is not missing -+#define SET_OR_APPEND 3 // set new value if missing or non-existent, append otherwise -+#define MM_FIRST 0 // if multiple annotation lines overlap a VCF record, use the first, discarding the rest -+#define MM_APPEND 1 // append, possibly multiple times -+#define MM_UNIQUE 2 // append, only unique values -+#define MM_SUM 3 -+#define MM_AVG 4 -+#define MM_MIN 5 -+#define MM_MAX 6 - typedef struct _annot_col_t - { - int icol, replace, number; // number: one of BCF_VL_* types - char *hdr_key_src, *hdr_key_dst; - int (*setter)(struct _args_t *, bcf1_t *, struct _annot_col_t *, void*); -+ int merge_method; // one of the MM_* defines -+ khash_t(str2int) *mm_str_hash; // lookup table to ensure uniqueness of added string values -+ kstring_t mm_kstr; -+ double -+ mm_dbl_nalloc, // the allocated size --merge-logic values array -+ mm_dbl_nused, // the number of used elements in the mm_dbl array -+ mm_dbl_ndat, // the number of merged rows (for calculating the average) -+ *mm_dbl; - } - annot_col_t; - -@@ -94,6 +110,10 @@ - int output_type, n_threads; - bcf_sr_regions_t *tgts; - -+ regidx_t *tgt_idx; -+ regitr_t *tgt_itr; -+ int tgt_is_bed; -+ - filter_t 
*filter; - char *filter_str; - int filter_logic; // include or exclude sites which match the filters? One of FLT_INCLUDE/FLT_EXCLUDE -@@ -106,7 +126,7 @@ - vcmp_t *vcmp; // for matching annotation and VCF lines by allele - annot_line_t *alines; // buffered annotation lines - int nalines, malines; -- int ref_idx, alt_idx, chr_idx, from_idx, to_idx; // -1 if not present -+ int ref_idx, alt_idx, chr_idx, beg_idx, end_idx; // -1 if not present - annot_col_t *cols; // column indexes and setters - int ncols; - -@@ -127,18 +147,40 @@ - - char **argv, *output_fname, *targets_fname, *regions_list, *header_fname; - char *remove_annots, *columns, *rename_chrs, *sample_names, *mark_sites; -- int argc, drop_header, record_cmd_line, tgts_is_vcf, mark_sites_logic; -+ char *merge_method_str; -+ int argc, drop_header, record_cmd_line, tgts_is_vcf, mark_sites_logic, force, single_overlaps; - } - args_t; - - char *msprintf(const char *fmt, ...); - -+int parse_with_payload(const char *line, char **chr_beg, char **chr_end, uint32_t *beg, uint32_t *end, void *payload, void *usr) -+{ -+ args_t *args = (args_t*) usr; -+ int ret = args->tgt_is_bed ? 
regidx_parse_bed(line, chr_beg, chr_end, beg, end, NULL, NULL) : regidx_parse_tab(line, chr_beg, chr_end, beg, end, NULL, NULL); -+ if ( ret<0 ) return ret; -+ *((char **)payload) = strdup(line); -+ return 0; -+} -+void free_payload(void *payload) -+{ -+ char *str = *((char**)payload); -+ free(str); -+} -+ - void remove_id(args_t *args, bcf1_t *line, rm_tag_t *tag) - { - bcf_update_id(args->hdr,line,NULL); - } - void remove_filter(args_t *args, bcf1_t *line, rm_tag_t *tag) - { -+ if ( tag->key && tag->hdr_id<0 ) -+ { -+ error("Error: Cannot proceed, not even with the --force option, bad things could happen.\n" -+ " Note that \"bcftools annotate -x FILTER\" can be used to remove ALL filters.\n" -+ " Even better, use \"bcftools view -h\" and \"bcftools reheader\" to fix the header!\n" -+ ); -+ } - if ( !tag->key ) bcf_update_filter(args->hdr, line, NULL, args->flt_keep_pass); - else bcf_remove_filter(args->hdr, line, tag->hdr_id, args->flt_keep_pass); - } -@@ -225,7 +267,10 @@ - memmove(&hdr->hrec[i],&hdr->hrec[i+1],(hdr->nhrec-i)*sizeof(bcf_hrec_t*)); - bcf_hrec_destroy(hrec); - } -- if ( nrm ) bcf_hdr_sync(hdr); -+ if ( nrm ) { -+ if (bcf_hdr_sync(hdr) < 0) -+ error_errno("[%s] Failed to update header", __func__); -+ } - } - - static void init_remove_annots(args_t *args) -@@ -266,8 +311,14 @@ - tag->handler = remove_filter; - tag->key = strdup(str.s); - tag->hdr_id = bcf_hdr_id2int(args->hdr, BCF_DT_ID, tag->key); -- if ( !bcf_hdr_idinfo_exists(args->hdr,BCF_HL_FLT,tag->hdr_id) ) error("Cannot remove %s, not defined in the header.\n", str.s); -- if ( !args->keep_sites ) bcf_hdr_remove(args->hdr_out,BCF_HL_FLT,tag->key); -+ if ( !bcf_hdr_idinfo_exists(args->hdr,BCF_HL_FLT,tag->hdr_id) ) -+ { -+ if ( args->keep_sites ) -+ error("Error: The filter \"%s\" is not defined in the header, cannot use the -k option\n", str.s); -+ else -+ fprintf(bcftools_stderr,"Warning: The filter \"%s\" is not defined in the header\n", str.s); -+ } -+ else if ( !args->keep_sites ) 
bcf_hdr_remove(args->hdr_out,BCF_HL_FLT,tag->key); - } - else - { -@@ -282,8 +333,14 @@ - int id = bcf_hdr_id2int(args->hdr,BCF_DT_ID,str.s); - if ( !bcf_hdr_idinfo_exists(args->hdr,type,id) ) - { -- fprintf(bcftools_stderr,"Warning: The tag \"%s\" not defined in the header\n", str.s); -- args->nrm--; -+ if ( args->keep_sites ) -+ error("Error: The tag \"%s\" is not defined in the header, cannot use the -k option\n", str.s); -+ else -+ fprintf(bcftools_stderr,"Warning: The tag \"%s\" not defined in the header\n", str.s); -+ -+ tag->key = strdup(str.s); -+ if ( type==BCF_HL_INFO ) tag->handler = remove_info_tag; -+ else if ( type==BCF_HL_FMT ) tag->handler = remove_format_tag; - } - else if ( (type==BCF_HL_FMT && keep_fmt) || (type==BCF_HL_INFO && keep_info) ) - { -@@ -366,7 +423,8 @@ - } - khash_str2int_destroy_free(keep); - if ( !args->nrm ) error("No matching tag in -x %s\n", args->remove_annots); -- bcf_hdr_sync(args->hdr_out); -+ if (bcf_hdr_sync(args->hdr_out) < 0) -+ error_errno("[%s] Failed to update header", __func__); - } - static void init_header_lines(args_t *args) - { -@@ -378,13 +436,17 @@ - if ( bcf_hdr_append(args->hdr_out,str.s) ) error("Could not parse %s: %s\n", args->header_fname, str.s); - bcf_hdr_append(args->hdr,str.s); // the input file may not have the header line if run with -h (and nothing else) - } -- hts_close(file); -+ if ( hts_close(file)!=0 ) error("[%s] Error: close failed .. 
%s\n", __func__,args->header_fname); - free(str.s); -- bcf_hdr_sync(args->hdr_out); -- bcf_hdr_sync(args->hdr); -+ if (bcf_hdr_sync(args->hdr_out) < 0) -+ error_errno("[%s] Failed to update output header", __func__); -+ if (bcf_hdr_sync(args->hdr) < 0) -+ error_errno("[%s] Failed to update input header", __func__); - } - static int setter_filter(args_t *args, bcf1_t *line, annot_col_t *col, void *data) - { -+ if ( !data ) error("Error: the --merge-logic option cannot be used with FILTER (yet?)\n"); -+ - // note: so far this works only with one filter, not a list of filters - annot_line_t *tab = (annot_line_t*) data; - if ( tab->cols[col->icol] && tab->cols[col->icol][0]=='.' && !tab->cols[col->icol][1] ) return 0; // don't replace with "." -@@ -434,6 +496,8 @@ - } - static int setter_id(args_t *args, bcf1_t *line, annot_col_t *col, void *data) - { -+ if ( !data ) error("Error: the --merge-logic option cannot be used with ID (yet?)\n"); -+ - // possible cases: - // IN ANNOT OUT ACHIEVED_BY - // x y x -c +ID -@@ -495,6 +559,8 @@ - } - static int setter_qual(args_t *args, bcf1_t *line, annot_col_t *col, void *data) - { -+ if ( !data ) error("Error: the --merge-logic option cannot be used with QUAL (yet?)\n"); -+ - annot_line_t *tab = (annot_line_t*) data; - char *str = tab->cols[col->icol]; - if ( str[0]=='.' && str[1]==0 ) return 0; // empty -@@ -503,7 +569,7 @@ - - line->qual = strtod(str, &str); - if ( str == tab->cols[col->icol] ) -- error("Could not parse %s at %s:%d .. [%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1,tab->cols[col->icol]); -+ error("Could not parse %s at %s:%"PRId64" .. 
[%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1,tab->cols[col->icol]); - return 0; - } - static int vcf_setter_qual(args_t *args, bcf1_t *line, annot_col_t *col, void *data) -@@ -516,13 +582,15 @@ - } - static int setter_info_flag(args_t *args, bcf1_t *line, annot_col_t *col, void *data) - { -+ if ( !data ) error("Error: the --merge-logic option cannot be used with INFO type=Flag (yet?)\n"); -+ - annot_line_t *tab = (annot_line_t*) data; - char *str = tab->cols[col->icol]; - if ( str[0]=='.' && str[1]==0 ) return 0; - - if ( str[0]=='1' && str[1]==0 ) return bcf_update_info_flag(args->hdr_out,line,col->hdr_key_dst,NULL,1); - if ( str[0]=='0' && str[1]==0 ) return bcf_update_info_flag(args->hdr_out,line,col->hdr_key_dst,NULL,0); -- error("Could not parse %s at %s:%d .. [%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1,tab->cols[col->icol]); -+ error("Could not parse %s at %s:%"PRId64" .. [%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1,tab->cols[col->icol]); - return -1; - } - static int vcf_setter_info_flag(args_t *args, bcf1_t *line, annot_col_t *col, void *data) -@@ -535,13 +603,13 @@ - static int setter_ARinfo_int32(args_t *args, bcf1_t *line, annot_col_t *col, int nals, char **als, int ntmpi) - { - if ( col->number==BCF_VL_A && ntmpi!=nals-1 && (ntmpi!=1 || args->tmpi[0]!=bcf_int32_missing || args->tmpi[1]!=bcf_int32_vector_end) ) -- error("Incorrect number of values (%d) for the %s tag at %s:%d\n", ntmpi,col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1); -+ error("Incorrect number of values (%d) for the %s tag at %s:%"PRId64"\n", ntmpi,col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); - else if ( col->number==BCF_VL_R && ntmpi!=nals && (ntmpi!=1 || args->tmpi[0]!=bcf_int32_missing || args->tmpi[1]!=bcf_int32_vector_end) ) -- error("Incorrect number of values (%d) for the %s tag at %s:%d\n", ntmpi,col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1); -+ 
error("Incorrect number of values (%d) for the %s tag at %s:%"PRId64"\n", ntmpi,col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); - - int ndst = col->number==BCF_VL_A ? line->n_allele - 1 : line->n_allele; - int *map = vcmp_map_ARvalues(args->vcmp,ndst,nals,als,line->n_allele,line->d.allele); -- if ( !map ) error("REF alleles not compatible at %s:%d\n", bcf_seqname(args->hdr,line),line->pos+1); -+ if ( !map ) error("REF alleles not compatible at %s:%"PRId64"\n", bcf_seqname(args->hdr,line),(int64_t) line->pos+1); - - // fill in any missing values in the target VCF (or all, if not present) - int ntmpi2 = bcf_get_info_float(args->hdr, line, col->hdr_key_dst, &args->tmpi2, &args->mtmpi2); -@@ -567,19 +635,75 @@ - static int setter_info_int(args_t *args, bcf1_t *line, annot_col_t *col, void *data) - { - annot_line_t *tab = (annot_line_t*) data; -- char *str = tab->cols[col->icol], *end = str; -- if ( str[0]=='.' && str[1]==0 ) return 0; - -- int ntmpi = 0; -- while ( *end ) -+ if ( !tab ) -+ { -+ if ( col->merge_method!=MM_SUM && col->merge_method!=MM_AVG && col->merge_method!=MM_MIN && col->merge_method!=MM_MAX && col->merge_method!=MM_APPEND ) -+ error("Error: at the moment only the sum,avg,min,max,append options are supported with --merge-logic for INFO type=Integer\n"); -+ } -+ -+ int i,ntmpi = 0; -+ if ( tab ) -+ { -+ char *str = tab->cols[col->icol], *end = str; -+ if ( str[0]=='.' && str[1]==0 ) return 0; -+ -+ while ( *end ) -+ { -+ int val = strtol(str, &end, 10); -+ if ( end==str ) -+ error("Could not parse %s at %s:%"PRId64" .. 
[%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1,tab->cols[col->icol]); -+ ntmpi++; -+ hts_expand(int32_t,ntmpi,args->mtmpi,args->tmpi); -+ args->tmpi[ntmpi-1] = val; -+ str = end+1; -+ } -+ if ( col->merge_method!=MM_FIRST ) -+ { -+ if ( !col->mm_dbl_nused ) -+ { -+ col->mm_dbl_nused = ntmpi; -+ hts_expand(double,col->mm_dbl_nused,col->mm_dbl_nalloc,col->mm_dbl); -+ for (i=0; imm_dbl[i] = args->tmpi[i]; -+ } -+ else -+ { -+ if ( col->merge_method==MM_APPEND ) -+ { -+ int nori = col->mm_dbl_nused; -+ col->mm_dbl_nused += ntmpi; -+ hts_expand(double,col->mm_dbl_nused,col->mm_dbl_nalloc,col->mm_dbl); -+ for (i=0; imm_dbl[i+nori] = args->tmpi[i]; -+ } -+ else -+ { -+ if ( ntmpi!=col->mm_dbl_nused ) error("Error: cannot merge fields of unequal length\n"); -+ if ( col->merge_method==MM_SUM || col->merge_method==MM_AVG ) -+ for (i=0; imm_dbl[i] += args->tmpi[i]; -+ else if ( col->merge_method==MM_MIN ) -+ for (i=0; imm_dbl[i] > args->tmpi[i] ) col->mm_dbl[i] = args->tmpi[i]; } -+ else if ( col->merge_method==MM_MAX ) -+ for (i=0; imm_dbl[i] < args->tmpi[i] ) col->mm_dbl[i] = args->tmpi[i]; } -+ } -+ } -+ col->mm_dbl_ndat++; -+ } -+ } -+ else if ( col->merge_method==MM_SUM || col->merge_method==MM_MIN || col->merge_method==MM_MAX || col->merge_method==MM_APPEND ) -+ { -+ ntmpi = col->mm_dbl_nused; -+ hts_expand(int32_t,ntmpi,args->mtmpi,args->tmpi); -+ for (i=0; itmpi[i] = col->mm_dbl[i]; -+ col->mm_dbl_nused = col->mm_dbl_ndat = 0; -+ } -+ else if ( col->merge_method==MM_AVG ) - { -- int val = strtol(str, &end, 10); -- if ( end==str ) -- error("Could not parse %s at %s:%d .. 
[%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1,tab->cols[col->icol]); -- ntmpi++; -+ ntmpi = col->mm_dbl_nused; - hts_expand(int32_t,ntmpi,args->mtmpi,args->tmpi); -- args->tmpi[ntmpi-1] = val; -- str = end+1; -+ for (i=0; itmpi[i] = col->mm_dbl[i]/col->mm_dbl_ndat; -+ col->mm_dbl_nused = col->mm_dbl_ndat = 0; - } - - if ( col->number==BCF_VL_A || col->number==BCF_VL_R ) -@@ -615,13 +739,13 @@ - static int setter_ARinfo_real(args_t *args, bcf1_t *line, annot_col_t *col, int nals, char **als, int ntmpf) - { - if ( col->number==BCF_VL_A && ntmpf!=nals-1 && (ntmpf!=1 || !bcf_float_is_missing(args->tmpf[0]) || !bcf_float_is_vector_end(args->tmpf[0])) ) -- error("Incorrect number of values (%d) for the %s tag at %s:%d\n", ntmpf,col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1); -+ error("Incorrect number of values (%d) for the %s tag at %s:%"PRId64"\n", ntmpf,col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); - else if ( col->number==BCF_VL_R && ntmpf!=nals && (ntmpf!=1 || !bcf_float_is_missing(args->tmpf[0]) || !bcf_float_is_vector_end(args->tmpf[0])) ) -- error("Incorrect number of values (%d) for the %s tag at %s:%d\n", ntmpf,col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1); -+ error("Incorrect number of values (%d) for the %s tag at %s:%"PRId64"\n", ntmpf,col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); - - int ndst = col->number==BCF_VL_A ? 
line->n_allele - 1 : line->n_allele; - int *map = vcmp_map_ARvalues(args->vcmp,ndst,nals,als,line->n_allele,line->d.allele); -- if ( !map ) error("REF alleles not compatible at %s:%d\n", bcf_seqname(args->hdr,line),line->pos+1); -+ if ( !map ) error("REF alleles not compatible at %s:%"PRId64"\n", bcf_seqname(args->hdr,line),(int64_t) line->pos+1); - - // fill in any missing values in the target VCF (or all, if not present) - int ntmpf2 = bcf_get_info_float(args->hdr, line, col->hdr_key_dst, &args->tmpf2, &args->mtmpf2); -@@ -647,19 +771,75 @@ - static int setter_info_real(args_t *args, bcf1_t *line, annot_col_t *col, void *data) - { - annot_line_t *tab = (annot_line_t*) data; -- char *str = tab->cols[col->icol], *end = str; -- if ( str[0]=='.' && str[1]==0 ) return 0; - -- int ntmpf = 0; -- while ( *end ) -+ if ( !tab ) - { -- double val = strtod(str, &end); -- if ( end==str ) -- error("Could not parse %s at %s:%d .. [%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1,tab->cols[col->icol]); -- ntmpf++; -- hts_expand(float,ntmpf,args->mtmpf,args->tmpf); -- args->tmpf[ntmpf-1] = val; -- str = end+1; -+ if ( col->merge_method!=MM_SUM && col->merge_method!=MM_AVG && col->merge_method!=MM_MIN && col->merge_method!=MM_MAX && col->merge_method!=MM_APPEND ) -+ error("Error: at the moment only the sum,avg,min,max,append options are supported with --merge-logic for INFO type=Float\n"); -+ } -+ -+ int i,ntmpf = 0; -+ if ( tab ) -+ { -+ char *str = tab->cols[col->icol], *end = str; -+ if ( str[0]=='.' && str[1]==0 ) return 0; -+ -+ while ( *end ) -+ { -+ double val = strtod(str, &end); -+ if ( end==str ) -+ error("Could not parse %s at %s:%"PRId64" .. 
[%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1,tab->cols[col->icol]); -+ ntmpf++; -+ hts_expand(float,ntmpf,args->mtmpf,args->tmpf); -+ args->tmpf[ntmpf-1] = val; -+ str = end+1; -+ } -+ if ( col->merge_method!=MM_FIRST ) -+ { -+ if ( !col->mm_dbl_nused ) -+ { -+ col->mm_dbl_nused = ntmpf; -+ hts_expand(double,col->mm_dbl_nused,col->mm_dbl_nalloc,col->mm_dbl); -+ for (i=0; imm_dbl[i] = args->tmpf[i]; -+ } -+ else -+ { -+ if ( col->merge_method==MM_APPEND ) -+ { -+ int nori = col->mm_dbl_nused; -+ col->mm_dbl_nused += ntmpf; -+ hts_expand(double,col->mm_dbl_nused,col->mm_dbl_nalloc,col->mm_dbl); -+ for (i=0; imm_dbl[i+nori] = args->tmpf[i]; -+ } -+ else -+ { -+ if ( ntmpf!=col->mm_dbl_nused ) error("Error: cannot merge fields of unequal length\n"); -+ if ( col->merge_method==MM_SUM || col->merge_method==MM_AVG ) -+ for (i=0; imm_dbl[i] += args->tmpf[i]; -+ else if ( col->merge_method==MM_MIN ) -+ for (i=0; imm_dbl[i] > args->tmpf[i] ) col->mm_dbl[i] = args->tmpf[i]; } -+ else if ( col->merge_method==MM_MAX ) -+ for (i=0; imm_dbl[i] < args->tmpf[i] ) col->mm_dbl[i] = args->tmpf[i]; } -+ } -+ } -+ col->mm_dbl_ndat++; -+ } -+ } -+ else if ( col->merge_method==MM_SUM || col->merge_method==MM_MIN || col->merge_method==MM_MAX || col->merge_method==MM_APPEND ) -+ { -+ ntmpf = col->mm_dbl_nused; -+ hts_expand(int32_t,ntmpf,args->mtmpf,args->tmpf); -+ for (i=0; itmpf[i] = col->mm_dbl[i]; -+ col->mm_dbl_nused = col->mm_dbl_ndat = 0; -+ } -+ else if ( col->merge_method==MM_AVG ) -+ { -+ ntmpf = col->mm_dbl_nused; -+ hts_expand(int32_t,ntmpf,args->mtmpf,args->tmpf); -+ for (i=0; itmpf[i] = col->mm_dbl[i]/col->mm_dbl_ndat; -+ col->mm_dbl_nused = col->mm_dbl_ndat = 0; - } - - if ( col->number==BCF_VL_A || col->number==BCF_VL_R ) -@@ -695,6 +875,8 @@ - int copy_string_field(char *src, int isrc, int src_len, kstring_t *dst, int idst); // see vcfmerge.c - static int setter_ARinfo_string(args_t *args, bcf1_t *line, annot_col_t *col, int nals, char **als) 
- { -+ assert( col->merge_method==MM_FIRST ); -+ - int nsrc = 1, lsrc = 0; - while ( args->tmps[lsrc] ) - { -@@ -702,13 +884,13 @@ - lsrc++; - } - if ( col->number==BCF_VL_A && nsrc!=nals-1 && (nsrc!=1 || args->tmps[0]!='.' || args->tmps[1]!=0 ) ) -- error("Incorrect number of values (%d) for the %s tag at %s:%d\n", nsrc,col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1); -+ error("Incorrect number of values (%d) for the %s tag at %s:%"PRId64"\n", nsrc,col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); - else if ( col->number==BCF_VL_R && nsrc!=nals && (nsrc!=1 || args->tmps[0]!='.' || args->tmps[1]!=0 ) ) -- error("Incorrect number of values (%d) for the %s tag at %s:%d\n", nsrc,col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1); -+ error("Incorrect number of values (%d) for the %s tag at %s:%"PRId64"\n", nsrc,col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); - - int ndst = col->number==BCF_VL_A ? line->n_allele - 1 : line->n_allele; - int *map = vcmp_map_ARvalues(args->vcmp,ndst,nals,als,line->n_allele,line->d.allele); -- if ( !map ) error("REF alleles not compatible at %s:%d\n", bcf_seqname(args->hdr,line),line->pos+1); -+ if ( !map ) error("REF alleles not compatible at %s:%"PRId64"\n", bcf_seqname(args->hdr,line),(int64_t) line->pos+1); - - // fill in any missing values in the target VCF (or all, if not present) - int i, empty = 0, nstr, mstr = args->tmpks.m; -@@ -748,22 +930,76 @@ - bcf_update_info_string(args->hdr_out,line,col->hdr_key_dst,args->tmpks.s); - return 0; - } -+void khash_str2int_clear_free(void *_hash) -+{ -+ khash_t(str2int) *hash = (khash_t(str2int)*)_hash; -+ khint_t k; -+ if (hash == 0) return; -+ for (k = 0; k < kh_end(hash); ++k) -+ if (kh_exist(hash, k)) free((char*)kh_key(hash, k)); -+ kh_clear(str2int, hash); -+} - static int setter_info_str(args_t *args, bcf1_t *line, annot_col_t *col, void *data) - { -+ if ( col->replace==REPLACE_MISSING && col->number!=BCF_VL_A && 
col->number!=BCF_VL_R ) -+ { -+ int ret = bcf_get_info_string(args->hdr, line, col->hdr_key_dst, &args->tmps2, &args->mtmps2); -+ if ( ret>0 && (args->tmps2[0]!='.' || args->tmps2[1]!=0) ) return 0; -+ } -+ - annot_line_t *tab = (annot_line_t*) data; -- int len = strlen(tab->cols[col->icol]); -- if ( !len ) return 0; -- hts_expand(char,len+1,args->mtmps,args->tmps); -- memcpy(args->tmps,tab->cols[col->icol],len+1); -- if ( args->tmps[0]=='.' && args->tmps[1]==0 ) return 0; -+ -+ int len = 0; -+ if ( tab ) -+ { -+ len = strlen(tab->cols[col->icol]); -+ if ( !len ) return 0; -+ if ( len==1 && tab->cols[col->icol][0]=='.' ) return 0; -+ } - -- if ( col->number==BCF_VL_A || col->number==BCF_VL_R ) -- return setter_ARinfo_string(args,line,col,tab->nals,tab->als); -+ if ( col->merge_method!=MM_FIRST ) -+ { -+ if ( col->number==BCF_VL_A || col->number==BCF_VL_R ) -+ error("Error: the --merge-logic option cannot be used with INFO tags Type=String,Number={A,R,G}\n"); - -- if ( col->replace==REPLACE_MISSING ) -+ if ( data ) -+ { -+ assert( col->merge_method==MM_APPEND || col->merge_method==MM_UNIQUE ); -+ if ( col->merge_method==MM_UNIQUE ) -+ { -+ if ( !col->mm_str_hash ) col->mm_str_hash = (khash_t(str2int)*)khash_str2int_init(); -+ if ( khash_str2int_has_key(col->mm_str_hash, tab->cols[col->icol]) ) return 0; -+ khash_str2int_inc(col->mm_str_hash, strdup(tab->cols[col->icol])); -+ } -+ -+ if ( col->mm_kstr.l ) kputc(',',&col->mm_kstr); -+ kputs(tab->cols[col->icol], &col->mm_kstr); -+ return 0; -+ } -+ -+ if ( col->mm_kstr.l ) -+ { -+ hts_expand(char,col->mm_kstr.l+1,args->mtmps,args->tmps); -+ memcpy(args->tmps,col->mm_kstr.s,col->mm_kstr.l+1); -+ } -+ else -+ return 0; -+ -+ if ( !data ) // flush the line -+ { -+ if ( col->merge_method==MM_UNIQUE ) -+ khash_str2int_clear_free(col->mm_str_hash); -+ col->mm_kstr.l = 0; -+ } -+ } -+ else - { -- int ret = bcf_get_info_string(args->hdr, line, col->hdr_key_dst, &args->tmps2, &args->mtmps2); -- if ( ret>0 && 
(args->tmps2[0]!='.' || args->tmps2[1]!=0) ) return 0; -+ assert(tab); -+ hts_expand(char,len+1,args->mtmps,args->tmps); -+ memcpy(args->tmps,tab->cols[col->icol],len+1); -+ -+ if ( col->number==BCF_VL_A || col->number==BCF_VL_R ) -+ return setter_ARinfo_string(args,line,col,tab->nals,tab->als); - } - - bcf_update_info_string(args->hdr_out,line,col->hdr_key_dst,args->tmps); -@@ -787,6 +1023,48 @@ - bcf_update_info_string(args->hdr_out,line,col->hdr_key_dst,args->tmps); - return 0; - } -+static int genotypes_to_string(args_t *args, int nsrc1, int32_t *src, int nsmpl_dst, kstring_t *str) -+{ -+ int i, isrc, idst; -+ int blen = nsrc1 > 1 ? nsrc1 + 1 : 1; // typically the genotypes take three bytes 0/1, no 0-termination is needed -+ -+gt_length_too_big: -+ str->l = 0; -+ for (idst=0; idstsample_map ? args->sample_map[idst] : idst; -+ if ( isrc==-1 ) -+ { -+ kputc_('.', str); -+ for (i=1; i < blen; i++) kputc_(0, str); -+ continue; -+ } -+ -+ size_t plen = str->l; -+ int32_t *ptr = src + isrc*nsrc1; -+ for (i=0; il - plen > blen ) -+ { -+ // too many alternate alleles or ploidy is too large, the genotype does not fit -+ // three characters ("0/0" vs "10/10"). -+ blen *= 2; -+ goto gt_length_too_big; -+ } -+ plen = str->l - plen; -+ while ( plen < blen ) -+ { -+ kputc_(0, str); -+ plen++; -+ } -+ } -+ return 0; -+} - static int vcf_setter_format_gt(args_t *args, bcf1_t *line, annot_col_t *col, void *data) - { - bcf1_t *rec = (bcf1_t*) data; -@@ -794,6 +1072,16 @@ - if ( nsrc==-3 ) return 0; // the tag is not present - if ( nsrc<=0 ) return 1; // error - -+ // Genotypes are internally represented as integers. 
This is a complication when -+ // adding as a different Type=String field, such as FMT/newGT:=GT -+ if ( strcmp(col->hdr_key_src,col->hdr_key_dst) ) -+ { -+ int nsmpl_dst = bcf_hdr_nsamples(args->hdr_out); -+ int nsmpl_src = bcf_hdr_nsamples(args->files->readers[1].header); -+ genotypes_to_string(args,nsrc/nsmpl_src,args->tmpi,nsmpl_dst,&args->tmpks); -+ return bcf_update_format_char(args->hdr_out,line,col->hdr_key_dst,args->tmpks.s,args->tmpks.l); -+ } -+ - if ( !args->sample_map ) - return bcf_update_genotypes(args->hdr_out,line,args->tmpi,nsrc); - -@@ -1059,9 +1347,11 @@ - } - static int setter_format_int(args_t *args, bcf1_t *line, annot_col_t *col, void *data) - { -+ if ( !data ) error("Error: the --merge-logic option cannot be used with FORMAT tags (yet?)\n"); -+ - annot_line_t *tab = (annot_line_t*) data; - if ( col->icol+args->nsmpl_annot > tab->ncols ) -- error("Incorrect number of values for %s at %s:%d\n",col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1); -+ error("Incorrect number of values for %s at %s:%"PRId64"\n",col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); - int nvals = count_vals(tab,col->icol,col->icol+args->nsmpl_annot); - hts_expand(int32_t,nvals*args->nsmpl_annot,args->mtmpi,args->tmpi); - -@@ -1084,7 +1374,7 @@ - char *end = str; - ptr[ival] = strtol(str, &end, 10); - if ( end==str ) -- error("Could not parse %s at %s:%d .. [%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1,tab->cols[col->icol]); -+ error("Could not parse %s at %s:%"PRId64" .. [%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1,tab->cols[col->icol]); - - ival++; - str = *end ? 
end+1 : end; -@@ -1096,9 +1386,11 @@ - } - static int setter_format_real(args_t *args, bcf1_t *line, annot_col_t *col, void *data) - { -+ if ( !data ) error("Error: the --merge-logic option cannot be used with FORMAT tags (yet?)\n"); -+ - annot_line_t *tab = (annot_line_t*) data; - if ( col->icol+args->nsmpl_annot > tab->ncols ) -- error("Incorrect number of values for %s at %s:%d\n",col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1); -+ error("Incorrect number of values for %s at %s:%"PRId64"\n",col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); - int nvals = count_vals(tab,col->icol,col->icol+args->nsmpl_annot); - hts_expand(float,nvals*args->nsmpl_annot,args->mtmpf,args->tmpf); - -@@ -1122,7 +1414,7 @@ - char *end = str; - ptr[ival] = strtod(str, &end); - if ( end==str ) -- error("Could not parse %s at %s:%d .. [%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1,tab->cols[col->icol]); -+ error("Could not parse %s at %s:%"PRId64" .. [%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1,tab->cols[col->icol]); - - ival++; - str = *end ? end+1 : end; -@@ -1134,9 +1426,11 @@ - } - static int setter_format_str(args_t *args, bcf1_t *line, annot_col_t *col, void *data) - { -+ if ( !data ) error("Error: the --merge-logic option cannot be used with FORMAT tags (yet?)\n"); -+ - annot_line_t *tab = (annot_line_t*) data; - if ( col->icol+args->nsmpl_annot > tab->ncols ) -- error("Incorrect number of values for %s at %s:%d\n",col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1); -+ error("Incorrect number of values for %s at %s:%"PRId64"\n",col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); - - int ismpl; - for (ismpl=0; ismplnsmpl_annot; ismpl++) -@@ -1188,7 +1482,7 @@ - // create mapping from src to dst genotypes, haploid and diploid version - int nmap_hap = col->number==BCF_VL_G || col->number==BCF_VL_R ? 
rec->n_allele : rec->n_allele - 1; - int *map_hap = vcmp_map_ARvalues(args->vcmp,nmap_hap,line->n_allele,line->d.allele,rec->n_allele,rec->d.allele); -- if ( !map_hap ) error("REF alleles not compatible at %s:%d\n", bcf_seqname(args->hdr,line),line->pos+1); -+ if ( !map_hap ) error("REF alleles not compatible at %s:%"PRId64"\n", bcf_seqname(args->hdr,line),(int64_t) line->pos+1); - - int i, j; - if ( rec->n_allele==line->n_allele ) -@@ -1228,15 +1522,15 @@ - } - int pld_src = determine_ploidy(rec->n_allele, args->tmpi, nsrc1, args->src_smpl_pld, nsmpl_src); - if ( pld_src<0 ) -- error("Unexpected number of %s values (%d) for %d alleles at %s:%d\n", col->hdr_key_src,-pld_src, rec->n_allele, bcf_seqname(bcf_sr_get_header(args->files,1),rec),rec->pos+1); -+ error("Unexpected number of %s values (%d) for %d alleles at %s:%"PRId64"\n", col->hdr_key_src,-pld_src, rec->n_allele, bcf_seqname(bcf_sr_get_header(args->files,1),rec),(int64_t) rec->pos+1); - int pld_dst = determine_ploidy(line->n_allele, args->tmpi2, ndst1, args->dst_smpl_pld, nsmpl_dst); - if ( pld_dst<0 ) -- error("Unexpected number of %s values (%d) for %d alleles at %s:%d\n", col->hdr_key_src,-pld_dst, line->n_allele, bcf_seqname(args->hdr,line),line->pos+1); -+ error("Unexpected number of %s values (%d) for %d alleles at %s:%"PRId64"\n", col->hdr_key_src,-pld_dst, line->n_allele, bcf_seqname(args->hdr,line),(int64_t) line->pos+1); - - int ndst1_new = pld_dst==1 ? line->n_allele : line->n_allele*(line->n_allele+1)/2; - if ( ndst1_new != ndst1 ) - { -- if ( ndst1 ) error("todo: %s ndst1!=ndst .. %d %d at %s:%d\n",col->hdr_key_src,ndst1_new,ndst1,bcf_seqname(args->hdr,line),line->pos+1); -+ if ( ndst1 ) error("todo: %s ndst1!=ndst .. 
%d %d at %s:%"PRId64"\n",col->hdr_key_src,ndst1_new,ndst1,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); - ndst1 = ndst1_new; - hts_expand(int32_t, ndst1*nsmpl_dst, args->mtmpi2, args->tmpi2); - } -@@ -1256,7 +1550,7 @@ - if ( col->number==BCF_VL_G ) - { - if ( args->src_smpl_pld[ii] > 0 && args->dst_smpl_pld[i] > 0 && args->src_smpl_pld[ii]!=args->dst_smpl_pld[i] ) -- error("Sample ploidy differs at %s:%d\n", bcf_seqname(args->hdr,line),line->pos+1); -+ error("Sample ploidy differs at %s:%"PRId64"\n", bcf_seqname(args->hdr,line),(int64_t) line->pos+1); - if ( !args->dst_smpl_pld[i] ) - for (j=0; jfiles->readers[1].header,rec,col->hdr_key_src,&args->tmpf,&args->mtmpf); - if ( nsrc==-3 ) return 0; // the tag is not present -@@ -1296,7 +1589,7 @@ - // create mapping from src to dst genotypes, haploid and diploid version - int nmap_hap = col->number==BCF_VL_G || col->number==BCF_VL_R ? rec->n_allele : rec->n_allele - 1; - int *map_hap = vcmp_map_ARvalues(args->vcmp,nmap_hap,line->n_allele,line->d.allele,rec->n_allele,rec->d.allele); -- if ( !map_hap ) error("REF alleles not compatible at %s:%d\n", bcf_seqname(args->hdr,line),line->pos+1); -+ if ( !map_hap ) error("REF alleles not compatible at %s:%"PRId64"\n", bcf_seqname(args->hdr,line),(int64_t) line->pos+1); - - int i, j; - if ( rec->n_allele==line->n_allele ) -@@ -1336,15 +1629,15 @@ - } - int pld_src = determine_ploidy(rec->n_allele, args->tmpi, nsrc1, args->src_smpl_pld, nsmpl_src); - if ( pld_src<0 ) -- error("Unexpected number of %s values (%d) for %d alleles at %s:%d\n", col->hdr_key_src,-pld_src, rec->n_allele, bcf_seqname(bcf_sr_get_header(args->files,1),rec),rec->pos+1); -+ error("Unexpected number of %s values (%d) for %d alleles at %s:%"PRId64"\n", col->hdr_key_src,-pld_src, rec->n_allele, bcf_seqname(bcf_sr_get_header(args->files,1),rec),(int64_t) rec->pos+1); - int pld_dst = determine_ploidy(line->n_allele, args->tmpi2, ndst1, args->dst_smpl_pld, nsmpl_dst); - if ( pld_dst<0 ) -- error("Unexpected 
number of %s values (%d) for %d alleles at %s:%d\n", col->hdr_key_src,-pld_dst, line->n_allele, bcf_seqname(args->hdr,line),line->pos+1); -+ error("Unexpected number of %s values (%d) for %d alleles at %s:%"PRId64"\n", col->hdr_key_src,-pld_dst, line->n_allele, bcf_seqname(args->hdr,line),(int64_t) line->pos+1); - - int ndst1_new = pld_dst==1 ? line->n_allele : line->n_allele*(line->n_allele+1)/2; - if ( ndst1_new != ndst1 ) - { -- if ( ndst1 ) error("todo: %s ndst1!=ndst .. %d %d at %s:%d\n",col->hdr_key_src,ndst1_new,ndst1,bcf_seqname(args->hdr,line),line->pos+1); -+ if ( ndst1 ) error("todo: %s ndst1!=ndst .. %d %d at %s:%"PRId64"\n",col->hdr_key_src,ndst1_new,ndst1,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); - ndst1 = ndst1_new; - hts_expand(float, ndst1*nsmpl_dst, args->mtmpf2, args->tmpf2); - } -@@ -1364,7 +1657,7 @@ - if ( col->number==BCF_VL_G ) - { - if ( args->src_smpl_pld[ii] > 0 && args->dst_smpl_pld[i] > 0 && args->src_smpl_pld[ii]!=args->dst_smpl_pld[i] ) -- error("Sample ploidy differs at %s:%d\n", bcf_seqname(args->hdr,line),line->pos+1); -+ error("Sample ploidy differs at %s:%"PRId64"\n", bcf_seqname(args->hdr,line),(int64_t) line->pos+1); - if ( !args->dst_smpl_pld[i] ) - for (j=0; jtmps = args->tmpp[0]; // tmps might be realloced - if ( ret==-3 ) return 0; // the tag is not present - if ( ret<=0 ) return 1; // error -- return core_setter_format_str(args,line,col,args->tmpp); -+ if ( strcmp("GT",col->hdr_key_dst) ) -+ return core_setter_format_str(args,line,col,args->tmpp); -+ -+ // Genotypes are internally represented as integers. 
This is a complication for FMT/GT:=oldGT -+ // First determine the maximum number of alleles per-sample ndst1 -+ int nsmpl_src = bcf_hdr_nsamples(args->files->readers[1].header); -+ int nsmpl_dst = bcf_hdr_nsamples(args->hdr_out); -+ int isrc,idst, ndst1 = 0, nsrc1 = ret / nsmpl_src; -+ char *ptr = args->tmps, *ptr_end = ptr + ret; -+ while ( ptr < ptr_end ) -+ { -+ char *smpl_end = ptr + nsrc1; -+ int n = 1; -+ while ( ptr < smpl_end ) -+ { -+ if ( *ptr=='/' || *ptr=='|' ) n++; -+ ptr++; -+ } -+ if ( ndst1 < n ) ndst1 = n; -+ } -+ assert( ndst1 ); -+ -+ int ndst = ndst1*nsmpl_dst; -+ hts_expand(int32_t,ndst,args->mtmpi,args->tmpi); -+ hts_expand(char,ret+1,args->mtmps,args->tmps); args->tmps[ret] = 0; // the FORMAT string may not be 0-terminated -+ for (idst=0; idsttmpi + idst*ndst1; -+ isrc = args->sample_map ? args->sample_map[idst] : idst; -+ if ( isrc==-1 ) -+ { -+ dst[0] = bcf_gt_missing; -+ for (i=1; itmps + isrc*nsrc1, *tmp; -+ char *keep_ptr = beg+nsrc1, keep = *keep_ptr; *keep_ptr = 0; -+ while ( *beg ) -+ { -+ char *end = beg; -+ while ( *end && *end!='/' && *end!='|' ) end++; -+ if ( *beg=='.' && end-beg==1 ) dst[i] = bcf_gt_missing; -+ else -+ { -+ if ( *end=='|' ) is_phased = 1; -+ dst[i] = strtol(beg, &tmp, 10); -+ if ( tmp!=end ) -+ error("Could not parse the %s field at %s:%"PRId64" in %s\n", col->hdr_key_src,bcf_seqname(args->files->readers[1].header,rec),(int64_t) rec->pos+1,args->targets_fname); -+ if ( dst[i] >= line->n_allele ) -+ error("The source allele index is bigger than the number of destination alleles at %s:%"PRId64"\n", bcf_seqname(args->files->readers[1].header,rec),(int64_t) rec->pos+1); -+ dst[i] = is_phased ? bcf_gt_phased(dst[i]) : bcf_gt_unphased(dst[i]); -+ } -+ beg = *end ? 
end+1 : end; -+ i++; -+ } -+ *keep_ptr = keep; -+ for (; ihdr_out,line,args->tmpi,ndst); - } - static int init_sample_map(args_t *args, bcf_hdr_t *src, bcf_hdr_t *dst) - { -@@ -1448,62 +1800,25 @@ - args->sample_map = (int*) malloc(sizeof(int)*args->nsample_map); - for (i=0; insample_map; i++) args->sample_map[i] = -1; - -- // possible todo: could do with smpl_ilist only -- smpl_ilist_t *ilist = smpl_ilist_init(dst, args->sample_names, args->sample_is_file, SMPL_STRICT); -- if ( !ilist || !ilist->n ) error("Could not parse: %s\n", args->sample_names); -- char **samples = (char**) malloc(sizeof(char*)*ilist->n); -- for (i=0; in; i++) samples[i] = strdup(dst->samples[i]); -+ int flags = !src ? SMPL_STRICT|SMPL_SINGLE : SMPL_STRICT|SMPL_SINGLE|SMPL_PAIR2; // is vcf vs tab annotation file -+ smpl_ilist_t *ilist = smpl_ilist_init(dst, args->sample_names, args->sample_is_file, flags); // gives mapping dst->src -+ if ( !ilist || !ilist->n ) error("Could not parse the samples: %s\n", args->sample_names); - args->nsmpl_annot = ilist->n; -- smpl_ilist_destroy(ilist); - int need_sample_map = args->nsmpl_annot==bcf_hdr_nsamples(dst) ? 0 : 1; -- if ( !src ) -+ for (i=0; insmpl_annot; i++) - { -- // tab annotation file -- for (i=0; insmpl_annot; i++) -+ int idst = ilist->idx[i]; -+ const char *src_name = ilist->pair && ilist->pair[i] ? 
ilist->pair[i] : bcf_hdr_int2id(dst, BCF_DT_SAMPLE, idst); -+ int isrc = i; -+ if ( src ) // the annotation file is a VCF, not a tab-delimited file - { -- int idst = bcf_hdr_id2int(dst, BCF_DT_SAMPLE, samples[i]); -- if ( idst==-1 ) error("Sample \"%s\" not found in the destination file\n", samples[i]); -- args->sample_map[idst] = i; -- if ( idst!=i ) need_sample_map = 1; -- } -- } -- else -- { -- // vcf annotation file -- for (i=0; insmpl_annot; i++) -- { -- int isrc, idst; -- char *ss = samples[i], *se = samples[i]; -- while ( *se && !isspace(*se) ) se++; -- if ( !*se ) -- { -- // only one sample name -- isrc = bcf_hdr_id2int(src, BCF_DT_SAMPLE,ss); -- if ( isrc==-1 ) error("Sample \"%s\" not found in the source file\n", ss); -- idst = bcf_hdr_id2int(dst, BCF_DT_SAMPLE,ss); -- if ( idst==-1 ) error("Sample \"%s\" not found in the destination file\n", ss); -- args->sample_map[idst] = isrc; -- if ( idst!=isrc ) need_sample_map = 1; -- continue; -- } -- *se = 0; -- isrc = bcf_hdr_id2int(src, BCF_DT_SAMPLE,ss); -- if ( isrc==-1 ) error("Sample \"%s\" not found in the source file\n", ss); -- -- ss = se+1; -- while ( isspace(*ss) ) ss++; -- se = ss; -- while ( *se && !isspace(*se) ) se++; -- -- idst = bcf_hdr_id2int(dst, BCF_DT_SAMPLE,ss); -- if ( idst==-1 ) error("Sample \"%s\" not found in the destination file\n", ss); -- -- args->sample_map[idst] = isrc; -- if ( idst!=isrc ) need_sample_map = 1; -+ isrc = bcf_hdr_id2int(src, BCF_DT_SAMPLE, src_name); -+ if ( isrc==-1 ) error("Sample \"%s\" not found in the annotation file\n", src_name); - } -+ if ( isrc!=idst ) need_sample_map = 1; -+ args->sample_map[idst] = isrc; - } -- for (i=0; insmpl_annot; i++) free(samples[i]); -- free(samples); -+ smpl_ilist_destroy(ilist); - return need_sample_map; - } - static char *columns_complement(char *columns, void **skip_info, void **skip_fmt) -@@ -1607,9 +1922,9 @@ - kputsn(ss, se-ss, &str); - if ( !str.s[0] || !strcasecmp("-",str.s) ) ; - else if ( !strcasecmp("CHROM",str.s) ) 
args->chr_idx = icol; -- else if ( !strcasecmp("POS",str.s) ) args->from_idx = icol; -- else if ( !strcasecmp("FROM",str.s) ) args->from_idx = icol; -- else if ( !strcasecmp("TO",str.s) ) args->to_idx = icol; -+ else if ( !strcasecmp("POS",str.s) ) args->beg_idx = icol; -+ else if ( !strcasecmp("FROM",str.s) || !strcasecmp("BEG",str.s) ) args->beg_idx = icol; -+ else if ( !strcasecmp("TO",str.s) || !strcasecmp("END",str.s) ) args->end_idx = icol; - else if ( !strcasecmp("REF",str.s) ) - { - if ( args->tgts_is_vcf ) -@@ -1669,7 +1984,8 @@ - bcf_hrec_format(hrec, &tmp); - bcf_hdr_append(args->hdr_out, tmp.s); - } -- bcf_hdr_sync(args->hdr_out); -+ if (bcf_hdr_sync(args->hdr_out) < 0) -+ error_errno("[%s] Failed to update header", __func__); - } - } - else if ( !strcasecmp("QUAL",str.s) ) -@@ -1700,7 +2016,8 @@ - tmp.l = 0; - bcf_hrec_format(hrec, &tmp); - bcf_hdr_append(args->hdr_out, tmp.s); -- bcf_hdr_sync(args->hdr_out); -+ if (bcf_hdr_sync(args->hdr_out) < 0) -+ error_errno("[%s] Failed to update header", __func__); - int hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, hrec->vals[k]); - args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols); - annot_col_t *col = &args->cols[args->ncols-1]; -@@ -1734,7 +2051,8 @@ - tmp.l = 0; - bcf_hrec_format(hrec, &tmp); - bcf_hdr_append(args->hdr_out, tmp.s); -- bcf_hdr_sync(args->hdr_out); -+ if (bcf_hdr_sync(args->hdr_out) < 0) -+ error_errno("[%s] Failed to update header", __func__); - int hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, hrec->vals[k]); - args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols); - annot_col_t *col = &args->cols[args->ncols-1]; -@@ -1776,7 +2094,8 @@ - tmp.l = 0; - bcf_hrec_format_rename(hrec, key_dst, &tmp); - bcf_hdr_append(args->hdr_out, tmp.s); -- bcf_hdr_sync(args->hdr_out); -+ if (bcf_hdr_sync(args->hdr_out) < 0) -+ error_errno("[%s] Failed to update header", __func__); - } - int hdr_id = 
bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, key_dst); - if ( !bcf_hdr_idinfo_exists(args->hdr_out,BCF_HL_FMT,hdr_id) ) -@@ -1813,13 +2132,30 @@ - { - if ( replace==REPLACE_NON_MISSING ) error("Apologies, the -INFO/TAG feature has not been implemented yet.\n"); - if ( replace==SET_OR_APPEND ) error("Apologies, the =INFO/TAG feature has not been implemented yet.\n"); -- char *key_dst = !strncasecmp("INFO/",str.s,5) ? str.s + 5 : str.s; -+ int explicit_info = 0; -+ char *key_dst; -+ if ( !strncasecmp("INFO/",str.s,5) ) -+ { -+ key_dst = str.s + 5; -+ explicit_info = 1; -+ } -+ else -+ key_dst = str.s; - char *key_src = strstr(key_dst,":="); - if ( key_src ) - { - *key_src = 0; - key_src += 2; -- if ( !strncasecmp("INFO/",key_src,5) ) key_src += 5; -+ if ( !strncasecmp("INFO/",key_src,5) ) -+ { -+ key_src += 5; -+ explicit_info = 1; -+ } -+ else if ( !strncasecmp("FMT/",key_src,4) || !strncasecmp("FORMAT/",key_src,5) ) -+ { -+ key_src[-2] = ':'; -+ error("Did you mean \"FMT/%s\" rather than \"%s\"?\n",str.s,str.s); -+ } - } - else - key_src = key_dst; -@@ -1829,11 +2165,18 @@ - if ( args->tgts_is_vcf ) // reading annotations from a VCF, add a new header line - { - bcf_hrec_t *hrec = bcf_hdr_get_hrec(args->files->readers[1].header, BCF_HL_INFO, "ID", key_src, NULL); -- if ( !hrec ) error("The tag \"%s\" is not defined in %s\n", str.s,args->files->readers[1].fname); -+ if ( !hrec ) -+ { -+ if ( !explicit_info && bcf_hdr_get_hrec(args->files->readers[1].header, BCF_HL_FMT, "ID", key_src, NULL) ) -+ error("Did you mean \"FMT/%s\" rather than \"%s\"?\n",str.s,str.s); -+ fprintf(bcftools_stderr,"[%s] %d\n",key_src,explicit_info); -+ error("The tag \"%s\" is not defined in %s\n", key_src,args->files->readers[1].fname); -+ } - tmp.l = 0; - bcf_hrec_format_rename(hrec, key_dst, &tmp); - bcf_hdr_append(args->hdr_out, tmp.s); -- bcf_hdr_sync(args->hdr_out); -+ if (bcf_hdr_sync(args->hdr_out) < 0) -+ error_errno("[%s] Failed to update header", __func__); - hdr_id = 
bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, key_dst); - } - else -@@ -1862,7 +2205,6 @@ - } - free(str.s); - free(tmp.s); -- if ( args->to_idx==-1 ) args->to_idx = args->from_idx; - free(args->columns); - if ( skip_info ) khash_str2int_destroy_free(skip_info); - if ( skip_fmt ) khash_str2int_destroy_free(skip_fmt); -@@ -1881,6 +2223,54 @@ - else if ( sample_map_ok<0 ) - error("No matching samples in source and destination file?\n"); - } -+static void init_merge_method(args_t *args) -+{ -+ int i; -+ for (i=0; incols; i++) -+ { -+ args->cols[i].merge_method = MM_FIRST; -+ args->cols[i].mm_str_hash = NULL; -+ args->cols[i].mm_dbl = NULL; -+ args->cols[i].mm_dbl_nalloc = args->cols[i].mm_dbl_nused = args->cols[i].mm_dbl_ndat = 0; -+ memset(&args->cols[i].mm_kstr, 0, sizeof(args->cols[i].mm_kstr)); -+ } -+ if ( !args->merge_method_str ) return; -+ if ( args->tgts_is_vcf ) error("Error: the --merge-logic is intended for use with BED or TAB-delimited files only.\n"); -+ if ( !args->tgt_idx ) error("Error: BEG,END (or FROM,TO) columns are expected with the --merge-logic option.\n"); -+ char *sb = args->merge_method_str; -+ while ( *sb ) -+ { -+ char *se = sb; -+ while ( *se && *se!=',' ) se++; -+ args->tmpks.l = 0; -+ kputsn(sb, se-sb, &args->tmpks); -+ kputc(0, &args->tmpks); -+ char *mm_type_str = args->tmpks.s + args->tmpks.l; -+ while ( *mm_type_str!=':' && mm_type_str > args->tmpks.s ) mm_type_str--; -+ if ( *mm_type_str!=':' ) -+ error("Error: could not parse the argument to --merge-logic: %s\n", args->merge_method_str); -+ *mm_type_str = 0; -+ mm_type_str++; -+ int mm_type = MM_FIRST; -+ if ( !strcasecmp("unique",mm_type_str) ) mm_type = MM_UNIQUE; -+ else if ( !strcasecmp("append",mm_type_str) ) mm_type = MM_APPEND; -+ else if ( !strcasecmp("sum",mm_type_str) ) mm_type = MM_SUM; -+ else if ( !strcasecmp("avg",mm_type_str) ) mm_type = MM_AVG; -+ else if ( !strcasecmp("min",mm_type_str) ) mm_type = MM_MIN; -+ else if ( !strcasecmp("max",mm_type_str) ) mm_type = MM_MAX; 
-+ else error("Error: could not parse --merge-logic %s, the logic \"%s\" is not recognised\n", args->merge_method_str,mm_type_str); -+ for (i=0; incols; i++) -+ { -+ if ( strcmp(args->cols[i].hdr_key_dst,args->tmpks.s) ) continue; -+ if ( mm_type==MM_APPEND && args->cols[i].number!=BCF_VL_VAR ) -+ error("Error: --merge-logic append can be requested only for tags of variable length (Number=.)\n"); -+ args->cols[i].merge_method = mm_type; -+ break; -+ } -+ if ( i==args->ncols ) error("No such tag in the destination file: %s\n", args->tmpks.s); -+ sb = *se ? se + 1 : se; -+ } -+} - - static void rename_chrs(args_t *args, char *fname) - { -@@ -1929,13 +2319,30 @@ - { - if ( !args->columns ) error("The -c option not given\n"); - if ( args->chr_idx==-1 ) error("The -c CHROM option not given\n"); -- if ( args->from_idx==-1 ) error("The -c POS option not given\n"); -- if ( args->to_idx==-1 ) args->to_idx = -args->from_idx - 1; -- -- args->tgts = bcf_sr_regions_init(args->targets_fname,1,args->chr_idx,args->from_idx,args->to_idx); -- if ( !args->tgts ) error("Could not initialize the annotation file: %s\n", args->targets_fname); -- if ( !args->tgts->tbx ) error("Expected tabix-indexed annotation file: %s\n", args->targets_fname); -+ if ( args->beg_idx==-1 ) error("The -c POS option not given\n"); -+ if ( args->single_overlaps && args->merge_method_str ) error("The options --merge-logic and --single-overlaps cannot be combined\n"); -+ if ( args->end_idx==-1 || (args->single_overlaps && !args->merge_method_str) ) -+ { -+ args->end_idx = -args->beg_idx - 1; -+ args->tgts = bcf_sr_regions_init(args->targets_fname,1,args->chr_idx,args->beg_idx,args->end_idx); -+ if ( !args->tgts ) error("Could not initialize the annotation file: %s\n", args->targets_fname); -+ if ( !args->tgts->tbx ) error("Expected tabix-indexed annotation file: %s\n", args->targets_fname); -+ } -+ else -+ { -+ if ( args->ref_idx!=-1 ) error("Error: the REF columns will be ignored when BEG,END (or FROM,TO) is 
present. Replace END (or TO) with \"-\".\n"); -+ int len = strlen(args->targets_fname); -+ if ( len>=7 && !strcasecmp(".bed.gz",args->targets_fname+len-7) ) args->tgt_is_bed = 1; -+ else if ( len>=8 && !strcasecmp(".bed.bgz",args->targets_fname+len-8) ) args->tgt_is_bed = 1; -+ else if ( len>=4 && !strcasecmp(".bed",args->targets_fname+len-4) ) args->tgt_is_bed = 1; -+ args->tgt_idx = regidx_init(args->targets_fname,parse_with_payload,free_payload,sizeof(char*),args); -+ if ( !args->tgt_idx ) error("Failed to parse: %s\n", args->targets_fname); -+ args->tgt_itr = regitr_init(args->tgt_idx); -+ args->nalines++; -+ hts_expand0(annot_line_t,args->nalines,args->malines,args->alines); -+ } - } -+ init_merge_method(args); - args->vcmp = vcmp_init(); - - if ( args->filter_str ) -@@ -1960,10 +2367,10 @@ - if ( args->rename_chrs ) rename_chrs(args, args->rename_chrs); - - args->out_fh = hts_open(args->output_fname,hts_bcf_wmode(args->output_type)); -- if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno)); -+ if ( args->out_fh == NULL ) error("[%s] Error: cannot write to \"%s\": %s\n", __func__,args->output_fname, strerror(errno)); - if ( args->n_threads ) - hts_set_opt(args->out_fh, HTS_OPT_THREAD_POOL, args->files->p); -- bcf_hdr_write(args->out_fh, args->hdr_out); -+ if ( bcf_hdr_write(args->out_fh, args->hdr_out)!=0 ) error("[%s] Error: failed to write the header to %s\n", __func__,args->output_fname); - } - } - -@@ -1978,6 +2385,9 @@ - { - free(args->cols[i].hdr_key_src); - free(args->cols[i].hdr_key_dst); -+ free(args->cols[i].mm_kstr.s); -+ if ( args->cols[i].mm_str_hash ) khash_str2int_destroy_free(args->cols[i].mm_str_hash); -+ free(args->cols[i].mm_dbl); - } - free(args->cols); - for (i=0; imalines; i++) -@@ -1987,6 +2397,11 @@ - free(args->alines[i].line.s); - } - free(args->alines); -+ if ( args->tgt_idx ) -+ { -+ regidx_destroy(args->tgt_idx); -+ regitr_destroy(args->tgt_itr); -+ } - if ( args->tgts ) 
bcf_sr_regions_destroy(args->tgts); - free(args->tmpks.s); - free(args->tmpi); -@@ -2009,6 +2424,48 @@ - free(args->sample_map); - } - -+static void parse_annot_line(args_t *args, char *str, annot_line_t *tmp) -+{ -+ tmp->line.l = 0; -+ kputs(str, &tmp->line); -+ char *s = tmp->line.s; -+ tmp->ncols = 1; -+ hts_expand(char*,tmp->ncols,tmp->mcols,tmp->cols); -+ tmp->cols[0] = s; -+ while ( *s ) -+ { -+ if ( *s=='\t' ) -+ { -+ tmp->ncols++; -+ hts_expand(char*,tmp->ncols,tmp->mcols,tmp->cols); -+ tmp->cols[tmp->ncols-1] = s+1; -+ *s = 0; -+ } -+ s++; -+ } -+ if ( args->ref_idx != -1 ) -+ { -+ if ( args->ref_idx >= tmp->ncols ) -+ error("Could not parse the line, expected %d+ columns, found %d:\n\t%s\n",args->ref_idx+1,tmp->ncols,str); -+ if ( args->alt_idx >= tmp->ncols ) -+ error("Could not parse the line, expected %d+ columns, found %d:\n\t%s\n",args->alt_idx+1,tmp->ncols,str); -+ tmp->nals = 2; -+ hts_expand(char*,tmp->nals,tmp->mals,tmp->als); -+ tmp->als[0] = tmp->cols[args->ref_idx]; -+ tmp->als[1] = s = tmp->cols[args->alt_idx]; -+ while ( *s ) -+ { -+ if ( *s==',' ) -+ { -+ tmp->nals++; -+ hts_expand(char*,tmp->nals,tmp->mals,tmp->als); -+ tmp->als[tmp->nals-1] = s+1; -+ *s = 0; -+ } -+ s++; -+ } -+ } -+} - static void buffer_annot_lines(args_t *args, bcf1_t *line, int start_pos, int end_pos) - { - if ( args->nalines && args->alines[0].rid != line->rid ) args->nalines = 0; -@@ -2039,44 +2496,9 @@ - tmp->rid = line->rid; - tmp->start = args->tgts->start; - tmp->end = args->tgts->end; -- tmp->line.l = 0; -- kputs(args->tgts->line.s, &tmp->line); -- char *s = tmp->line.s; -- tmp->ncols = 1; -- hts_expand(char*,tmp->ncols,tmp->mcols,tmp->cols); -- tmp->cols[0] = s; -- while ( *s ) -- { -- if ( *s=='\t' ) -- { -- tmp->ncols++; -- hts_expand(char*,tmp->ncols,tmp->mcols,tmp->cols); -- tmp->cols[tmp->ncols-1] = s+1; -- *s = 0; -- } -- s++; -- } -+ parse_annot_line(args, args->tgts->line.s, tmp); - if ( args->ref_idx != -1 ) - { -- if ( args->ref_idx >= tmp->ncols ) 
-- error("Could not parse the line, expected %d+ columns, found %d:\n\t%s\n",args->ref_idx+1,tmp->ncols,args->tgts->line.s); -- if ( args->alt_idx >= tmp->ncols ) -- error("Could not parse the line, expected %d+ columns, found %d:\n\t%s\n",args->alt_idx+1,tmp->ncols,args->tgts->line.s); -- tmp->nals = 2; -- hts_expand(char*,tmp->nals,tmp->mals,tmp->als); -- tmp->als[0] = tmp->cols[args->ref_idx]; -- tmp->als[1] = s = tmp->cols[args->alt_idx]; -- while ( *s ) -- { -- if ( *s==',' ) -- { -- tmp->nals++; -- hts_expand(char*,tmp->nals,tmp->mals,tmp->als); -- tmp->als[tmp->nals-1] = s+1; -- *s = 0; -- } -- s++; -- } - int iseq = args->tgts->iseq; - if ( bcf_sr_regions_next(args->tgts)<0 || args->tgts->iseq!=iseq ) break; - } -@@ -2090,7 +2512,30 @@ - for (i=0; inrm; i++) - args->rm[i].handler(args, line, &args->rm[i]); - -- if ( args->tgts ) -+ int has_overlap = 0; -+ -+ if ( args->tgt_idx ) -+ { -+ if ( regidx_overlap(args->tgt_idx, bcf_seqname(args->hdr,line),line->pos,line->pos+line->rlen-1, args->tgt_itr) ) -+ { -+ while ( regitr_overlap(args->tgt_itr) ) -+ { -+ annot_line_t *tmp = &args->alines[0]; -+ tmp->rid = line->rid; -+ tmp->start = args->tgt_itr->beg; -+ tmp->end = args->tgt_itr->end; -+ parse_annot_line(args, regitr_payload(args->tgt_itr,char*), tmp); -+ for (j=0; jncols; j++) -+ if ( args->cols[j].setter(args,line,&args->cols[j],tmp) ) -+ error("fixme: Could not set %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); -+ } -+ has_overlap = 1; -+ } -+ for (j=0; jncols; j++) -+ if ( args->cols[j].merge_method != MM_FIRST ) -+ args->cols[j].setter(args,line,&args->cols[j],NULL); -+ } -+ else if ( args->tgts ) - { - // Buffer annotation lines. When multiple ALT alleles are present in the - // annotation file, at least one must match one of the VCF alleles. 
-@@ -2121,18 +2566,9 @@ - // there is a matching line - for (j=0; jncols; j++) - if ( args->cols[j].setter(args,line,&args->cols[j],&args->alines[i]) ) -- error("fixme: Could not set %s at %s:%d\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1); -- -- } -- -- if ( args->mark_sites ) -- { -- // ideally, we'd like to be far more general than this in future, see https://github.com/samtools/bcftools/issues/87 -- if ( args->mark_sites_logic==MARK_LISTED ) -- bcf_update_info_flag(args->hdr_out,line,args->mark_sites,NULL,inalines?1:0); -- else -- bcf_update_info_flag(args->hdr_out,line,args->mark_sites,NULL,inalines?0:1); -+ error("fixme: Could not set %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); - } -+ has_overlap = inalines ? 1 : 0; - } - else if ( args->files->nreaders == 2 ) - { -@@ -2141,13 +2577,10 @@ - bcf1_t *aline = bcf_sr_get_line(args->files,1); - for (j=0; jncols; j++) - if ( args->cols[j].setter(args,line,&args->cols[j],aline) ) -- error("fixme: Could not set %s at %s:%d\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1); -+ error("fixme: Could not set %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); - -- if ( args->mark_sites ) -- bcf_update_info_flag(args->hdr_out,line,args->mark_sites,NULL,args->mark_sites_logic==MARK_LISTED ? 1 : 0); -+ has_overlap = 1; - } -- else if ( args->mark_sites ) -- bcf_update_info_flag(args->hdr_out,line,args->mark_sites,NULL, args->mark_sites_logic==MARK_UNLISTED ? 
1 : 0); - } - if ( args->set_ids ) - { -@@ -2162,6 +2595,15 @@ - bcf_update_id(args->hdr_out,line,args->tmpks.s); - } - } -+ -+ if ( args->mark_sites ) -+ { -+ // ideally, we'd like to be far more general than this in future, see https://github.com/samtools/bcftools/issues/87 -+ if ( args->mark_sites_logic==MARK_LISTED ) -+ bcf_update_info_flag(args->hdr_out,line,args->mark_sites,NULL,has_overlap?1:0); -+ else -+ bcf_update_info_flag(args->hdr_out,line,args->mark_sites,NULL,has_overlap?0:1); -+ } - } - - static void usage(args_t *args) -@@ -2175,10 +2617,12 @@ - fprintf(bcftools_stderr, " --collapse matching records by , see man page for details [some]\n"); - fprintf(bcftools_stderr, " -c, --columns list of columns in the annotation file, e.g. CHROM,POS,REF,ALT,-,INFO/TAG. See man page for details\n"); - fprintf(bcftools_stderr, " -e, --exclude exclude sites for which the expression is true (see man page for details)\n"); -+ fprintf(bcftools_stderr, " --force continue despite parsing error (at your own risk!)\n"); - fprintf(bcftools_stderr, " -h, --header-lines lines which should be appended to the VCF header\n"); - fprintf(bcftools_stderr, " -I, --set-id [+] set ID column, see man page for details\n"); - fprintf(bcftools_stderr, " -i, --include select sites for which the expression is true (see man page for details)\n"); - fprintf(bcftools_stderr, " -k, --keep-sites leave -i/-e sites unchanged instead of discarding them\n"); -+ fprintf(bcftools_stderr, " -l, --merge-logic merge logic for multiple overlapping regions (see man page for details), EXPERIMENTAL\n"); - fprintf(bcftools_stderr, " -m, --mark-sites [+-] add INFO/tag flag to sites which are (\"+\") or are not (\"-\") listed in the -a file\n"); - fprintf(bcftools_stderr, " --no-version do not append version and command line to the header\n"); - fprintf(bcftools_stderr, " -o, --output write output to a file [standard output]\n"); -@@ -2188,6 +2632,7 @@ - fprintf(bcftools_stderr, " --rename-chrs rename 
sequences according to map file: from\\tto\n"); - fprintf(bcftools_stderr, " -s, --samples [^] comma separated list of samples to annotate (or exclude with \"^\" prefix)\n"); - fprintf(bcftools_stderr, " -S, --samples-file [^] file of samples to annotate (or exclude with \"^\" prefix)\n"); -+ fprintf(bcftools_stderr, " --single-overlaps keep memory low by avoiding complexities arising from handling multiple overlapping intervals\n"); - fprintf(bcftools_stderr, " -x, --remove list of annotations (e.g. ID,INFO/DP,FORMAT/DP,FILTER) to remove (or keep with \"^\" prefix). See man page for details\n"); - fprintf(bcftools_stderr, " --threads number of extra output compression threads [0]\n"); - fprintf(bcftools_stderr, "\n"); -@@ -2204,19 +2649,20 @@ - args->output_type = FT_VCF; - args->n_threads = 0; - args->record_cmd_line = 1; -- args->ref_idx = args->alt_idx = args->chr_idx = args->from_idx = args->to_idx = -1; -+ args->ref_idx = args->alt_idx = args->chr_idx = args->beg_idx = args->end_idx = -1; - args->set_ids_replace = 1; - int regions_is_file = 0, collapse = 0; - - static struct option loptions[] = - { -- {"keep-sites",required_argument,NULL,'k'}, -+ {"keep-sites",no_argument,NULL,'k'}, - {"mark-sites",required_argument,NULL,'m'}, - {"set-id",required_argument,NULL,'I'}, - {"output",required_argument,NULL,'o'}, - {"output-type",required_argument,NULL,'O'}, - {"threads",required_argument,NULL,9}, - {"annotations",required_argument,NULL,'a'}, -+ {"merge-logic",required_argument,NULL,'l'}, - {"collapse",required_argument,NULL,2}, - {"include",required_argument,NULL,'i'}, - {"exclude",required_argument,NULL,'e'}, -@@ -2228,12 +2674,15 @@ - {"header-lines",required_argument,NULL,'h'}, - {"samples",required_argument,NULL,'s'}, - {"samples-file",required_argument,NULL,'S'}, -+ {"single-overlaps",no_argument,NULL,10}, - {"no-version",no_argument,NULL,8}, -+ {"force",no_argument,NULL,'f'}, - {NULL,0,NULL,0} - }; -- while ((c = getopt_long(argc, argv, 
"h:?o:O:r:R:a:x:c:i:e:S:s:I:m:k",loptions,NULL)) >= 0) -+ while ((c = getopt_long(argc, argv, "h:?o:O:r:R:a:x:c:i:e:S:s:I:m:kl:f",loptions,NULL)) >= 0) - { - switch (c) { -+ case 'f': args->force = 1; break; - case 'k': args->keep_sites = 1; break; - case 'm': - args->mark_sites_logic = MARK_LISTED; -@@ -2241,6 +2690,7 @@ - else if ( optarg[0]=='-' ) { args->mark_sites = optarg+1; args->mark_sites_logic = MARK_UNLISTED; } - else args->mark_sites = optarg; - break; -+ case 'l': args->merge_method_str = optarg; break; - case 'I': args->set_ids_fmt = optarg; break; - case 's': args->sample_names = optarg; break; - case 'S': args->sample_names = optarg; args->sample_is_file = 1; break; -@@ -2275,6 +2725,7 @@ - break; - case 9 : args->n_threads = strtol(optarg, 0, 0); break; - case 8 : args->record_cmd_line = 0; break; -+ case 10 : args->single_overlaps = 1; break; - case '?': usage(args); break; - default: error("Unknown argument: %s\n", optarg); - } -@@ -2296,6 +2747,7 @@ - if ( args->targets_fname ) - { - htsFile *fp = hts_open(args->targets_fname,"r"); -+ if ( !fp ) error("Failed to open %s\n", args->targets_fname); - htsFormat type = *hts_get_format(fp); - hts_close(fp); - -@@ -2307,26 +2759,40 @@ - } - } - if ( bcf_sr_set_threads(args->files, args->n_threads)<0 ) error("Failed to create threads\n"); -- if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->files->errnum)); -+ if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to read from %s: %s\n", !strcmp("-",fname)?"standard input":fname,bcf_sr_strerror(args->files->errnum)); - -+ static int line_errcode_warned = 0; - init_data(args); - while ( bcf_sr_next_line(args->files) ) - { - if ( !bcf_sr_has_line(args->files,0) ) continue; - bcf1_t *line = bcf_sr_get_line(args->files,0); -- if ( line->errcode ) error("Encountered error, cannot proceed. 
Please check the error output above.\n"); -+ if ( line->errcode ) -+ { -+ if ( !args->force ) -+ error("Encountered an error, cannot proceed. Please check the error output above.\n" -+ "If feeling adventurous, use the --force option. (At your own risk!)\n"); -+ else if ( !line_errcode_warned ) -+ { -+ fprintf(bcftools_stderr, -+ "Warning: Encountered an error, proceeding only because --force was given.\n" -+ " Note that this can result in a segfault or a silent corruption of the output file!\n"); -+ line_errcode_warned = 1; -+ line->errcode = 0; -+ } -+ } - if ( args->filter ) - { - int pass = filter_test(args->filter, line, NULL); - if ( args->filter_logic & FLT_EXCLUDE ) pass = pass ? 0 : 1; - if ( !pass ) - { -- if ( args->keep_sites ) bcf_write1(args->out_fh, args->hdr_out, line); -+ if ( args->keep_sites && bcf_write1(args->out_fh, args->hdr_out, line)!=0 ) error("[%s] Error: failed to write to %s\n", __func__,args->output_fname); - continue; - } - } - annotate(args, line); -- bcf_write1(args->out_fh, args->hdr_out, line); -+ if ( bcf_write1(args->out_fh, args->hdr_out, line)!=0 ) error("[%s] Error: failed to write to %s\n", __func__,args->output_fname); - } - destroy_data(args); - bcf_sr_destroy(args->files); ---- python-pysam.orig/bcftools/vcfbuf.c -+++ python-pysam/bcftools/vcfbuf.c -@@ -1,6 +1,6 @@ - /* The MIT License - -- Copyright (c) 2016 Genome Research Ltd. -+ Copyright (c) 2016-2019 Genome Research Ltd. 
- - Author: Petr Danecek - -@@ -57,6 +57,12 @@ - - typedef struct - { -+ int active; -+} -+rmdup_t; -+ -+typedef struct -+{ - int active, rid, end; - } - overlap_t; -@@ -70,6 +76,7 @@ - ld_t ld; - prune_t prune; - overlap_t overlap; -+ rmdup_t rmdup; - }; - - vcfbuf_t *vcfbuf_init(bcf_hdr_t *hdr, int win) -@@ -103,6 +110,7 @@ - if ( key==VCFBUF_NSITES ) { buf->prune.max_sites = *((int*)value); return; } - if ( key==VCFBUF_AF_TAG ) { buf->prune.af_tag = *((char**)value); return; } - if ( key==VCFBUF_OVERLAP_WIN ) { buf->overlap.active = *((int*)value); return; } -+ if ( key==VCFBUF_RMDUP) { buf->rmdup.active = *((int*)value); return; } - } - - int vcfbuf_nsites(vcfbuf_t *buf) -@@ -126,6 +134,21 @@ - return ret; - } - -+bcf1_t *vcfbuf_peek(vcfbuf_t *buf, int idx) -+{ -+ int i = rbuf_kth(&buf->rbuf, idx); -+ return i<0 ? NULL : buf->vcf[i].rec; -+} -+ -+bcf1_t *vcfbuf_remove(vcfbuf_t *buf, int idx) -+{ -+ int i = rbuf_kth(&buf->rbuf, idx); -+ if ( i<0 ) return NULL; -+ bcf1_t *rec = buf->vcf[i].rec; -+ rbuf_remove_kth(&buf->rbuf, vcfrec_t, idx, buf->vcf); -+ return rec; -+} -+ - static int cmpvrec(const void *_a, const void *_b) - { - vcfrec_t *a = *((vcfrec_t**) _a); -@@ -198,6 +221,24 @@ - rbuf_remove_kth(&buf->rbuf, vcfrec_t, buf->prune.idx[i], buf->vcf); - } - -+static int _rmdup_can_flush(vcfbuf_t *buf, int flush_all) -+{ -+ if ( flush_all ) return 1; -+ -+ if ( buf->rbuf.n==1 ) return 0; -+ -+ int k1 = rbuf_kth(&buf->rbuf, -1); -+ int k2 = rbuf_kth(&buf->rbuf, -2); -+ -+ vcfrec_t *rec1 = &buf->vcf[k1]; -+ vcfrec_t *rec2 = &buf->vcf[k2]; -+ -+ if ( rec1->rec->rid!=rec2->rec->rid ) return 1; -+ if ( rec1->rec->pos!=rec2->rec->pos ) return 1; -+ -+ return 0; -+} -+ - static int _overlap_can_flush(vcfbuf_t *buf, int flush_all) - { - if ( flush_all ) { buf->overlap.rid = -1; return 1; } -@@ -252,13 +293,8 @@ - j = rbuf_last(&buf->rbuf); // last - - if ( buf->vcf[i].rec->rid != buf->vcf[j].rec->rid ) goto ret; -- if ( buf->overlap.active ) -- { -- int ret = 
_overlap_can_flush(buf, flush_all); -- //printf("can_flush: %d %d - %d\n", ret, buf->vcf[i].rec->pos+1, buf->vcf[j].rec->pos+1); -- if ( ret ) goto ret; -- } -- //if ( buf->overlap.active && _overlap_can_flush(buf, flush_all) ) goto ret; -+ if ( buf->overlap.active && _overlap_can_flush(buf, flush_all) ) goto ret; -+ if ( buf->rmdup.active && _rmdup_can_flush(buf, flush_all) ) goto ret; - - if ( buf->win > 0 ) - { ---- python-pysam.orig/bcftools/vcfbuf.c.pysam.c -+++ python-pysam/bcftools/vcfbuf.c.pysam.c -@@ -2,7 +2,7 @@ - - /* The MIT License - -- Copyright (c) 2016 Genome Research Ltd. -+ Copyright (c) 2016-2019 Genome Research Ltd. - - Author: Petr Danecek - -@@ -59,6 +59,12 @@ - - typedef struct - { -+ int active; -+} -+rmdup_t; -+ -+typedef struct -+{ - int active, rid, end; - } - overlap_t; -@@ -72,6 +78,7 @@ - ld_t ld; - prune_t prune; - overlap_t overlap; -+ rmdup_t rmdup; - }; - - vcfbuf_t *vcfbuf_init(bcf_hdr_t *hdr, int win) -@@ -105,6 +112,7 @@ - if ( key==VCFBUF_NSITES ) { buf->prune.max_sites = *((int*)value); return; } - if ( key==VCFBUF_AF_TAG ) { buf->prune.af_tag = *((char**)value); return; } - if ( key==VCFBUF_OVERLAP_WIN ) { buf->overlap.active = *((int*)value); return; } -+ if ( key==VCFBUF_RMDUP) { buf->rmdup.active = *((int*)value); return; } - } - - int vcfbuf_nsites(vcfbuf_t *buf) -@@ -128,6 +136,21 @@ - return ret; - } - -+bcf1_t *vcfbuf_peek(vcfbuf_t *buf, int idx) -+{ -+ int i = rbuf_kth(&buf->rbuf, idx); -+ return i<0 ? 
NULL : buf->vcf[i].rec; -+} -+ -+bcf1_t *vcfbuf_remove(vcfbuf_t *buf, int idx) -+{ -+ int i = rbuf_kth(&buf->rbuf, idx); -+ if ( i<0 ) return NULL; -+ bcf1_t *rec = buf->vcf[i].rec; -+ rbuf_remove_kth(&buf->rbuf, vcfrec_t, idx, buf->vcf); -+ return rec; -+} -+ - static int cmpvrec(const void *_a, const void *_b) - { - vcfrec_t *a = *((vcfrec_t**) _a); -@@ -200,6 +223,24 @@ - rbuf_remove_kth(&buf->rbuf, vcfrec_t, buf->prune.idx[i], buf->vcf); - } - -+static int _rmdup_can_flush(vcfbuf_t *buf, int flush_all) -+{ -+ if ( flush_all ) return 1; -+ -+ if ( buf->rbuf.n==1 ) return 0; -+ -+ int k1 = rbuf_kth(&buf->rbuf, -1); -+ int k2 = rbuf_kth(&buf->rbuf, -2); -+ -+ vcfrec_t *rec1 = &buf->vcf[k1]; -+ vcfrec_t *rec2 = &buf->vcf[k2]; -+ -+ if ( rec1->rec->rid!=rec2->rec->rid ) return 1; -+ if ( rec1->rec->pos!=rec2->rec->pos ) return 1; -+ -+ return 0; -+} -+ - static int _overlap_can_flush(vcfbuf_t *buf, int flush_all) - { - if ( flush_all ) { buf->overlap.rid = -1; return 1; } -@@ -254,13 +295,8 @@ - j = rbuf_last(&buf->rbuf); // last - - if ( buf->vcf[i].rec->rid != buf->vcf[j].rec->rid ) goto ret; -- if ( buf->overlap.active ) -- { -- int ret = _overlap_can_flush(buf, flush_all); -- //printf("can_flush: %d %d - %d\n", ret, buf->vcf[i].rec->pos+1, buf->vcf[j].rec->pos+1); -- if ( ret ) goto ret; -- } -- //if ( buf->overlap.active && _overlap_can_flush(buf, flush_all) ) goto ret; -+ if ( buf->overlap.active && _overlap_can_flush(buf, flush_all) ) goto ret; -+ if ( buf->rmdup.active && _rmdup_can_flush(buf, flush_all) ) goto ret; - - if ( buf->win > 0 ) - { ---- python-pysam.orig/bcftools/vcfbuf.h -+++ python-pysam/bcftools/vcfbuf.h -@@ -1,6 +1,6 @@ - /* The MIT License - -- Copyright (c) 2017 Genome Research Ltd. -+ Copyright (c) 2017-2019 Genome Research Ltd. 
- - Author: Petr Danecek - -@@ -44,6 +44,7 @@ - VCFBUF_NSITES, // leave at max this many sites in the window - VCFBUF_AF_TAG, // use this INFO tag with LD_NSITES - VCFBUF_OVERLAP_WIN, // keep only overlapping variants in the window -+ VCFBUF_RMDUP, // remove duplicate sites (completely) - } - vcfbuf_opt_t; - -@@ -64,6 +65,18 @@ - */ - bcf1_t *vcfbuf_push(vcfbuf_t *buf, bcf1_t *rec, int swap); - -+/* -+ * vcfbuf_peek() - return pointer to i-th record in the buffer but do not remove it from the buffer -+ * @idx: 0-based index to buffered lines -+ */ -+bcf1_t *vcfbuf_peek(vcfbuf_t *buf, int idx); -+ -+/* -+ * vcfbuf_remove() - return pointer to i-th record in the buffer and remove it from the buffer -+ * @idx: 0-based index to buffered lines -+ */ -+bcf1_t *vcfbuf_remove(vcfbuf_t *buf, int idx); -+ - bcf1_t *vcfbuf_flush(vcfbuf_t *buf, int flush_all); - - /* ---- python-pysam.orig/bcftools/vcfcall.c -+++ python-pysam/bcftools/vcfcall.c -@@ -42,14 +42,11 @@ - #include "prob1.h" - #include "ploidy.h" - #include "gvcf.h" -+#include "regidx.h" -+#include "vcfbuf.h" - - void error(const char *format, ...); - --#ifdef _WIN32 --#define srand48(x) srand(x) --#define lrand48() rand() --#endif -- - #define CF_NO_GENO 1 - #define CF_INS_MISSED (1<<1) - #define CF_CCALL (1<<2) -@@ -68,6 +65,13 @@ - - typedef struct - { -+ tgt_als_t *als; -+ int nmatch_als, ibuf; -+} -+rec_tgt_t; -+ -+typedef struct -+{ - int flag; // combination of CF_* flags above - int output_type, n_threads, record_cmd_line; - htsFile *bcf_in, *out_fh; -@@ -76,6 +80,9 @@ - int nsamples, *samples_map; // mapping from output sample names to original VCF - char *regions, *targets; // regions to process - int regions_is_file, targets_is_file; -+ regidx_t *tgt_idx; -+ regitr_t *tgt_itr, *tgt_itr_prev, *tgt_itr_tmp; -+ vcfbuf_t *vcfbuf; - - char *samples_fname; - int samples_is_file; -@@ -86,6 +93,7 @@ - - bcf1_t *missed_line; - call_t aux; // parameters and temporary data -+ kstring_t str; - - int argc; - char 
**argv; -@@ -297,7 +305,7 @@ - if ( ismpl < 0 ) { fprintf(stderr,"Warning: No such sample in the VCF: %s\n",ss); continue; } - if ( old2new[ismpl] != -1 ) { fprintf(stderr,"Warning: The sample is listed multiple times: %s\n",ss); continue; } - -- ss = se+1; -+ ss = se+(x != '\0'); - while ( *ss && isspace(*ss) ) ss++; - if ( !*ss ) ss = "2"; // default ploidy - se = ss; -@@ -347,26 +355,253 @@ - bcf_float_set_missing(args->missed_line->qual); - } - --static void print_missed_line(bcf_sr_regions_t *regs, void *data) -+static int tgt_parse(const char *line, char **chr_beg, char **chr_end, uint32_t *beg, uint32_t *end, void *payload, void *usr) -+{ -+ char *ss = (char*) line; -+ while ( *ss && isspace(*ss) ) ss++; -+ if ( !*ss ) { fprintf(stderr,"Could not parse the line: %s\n", line); return -2; } -+ if ( *ss=='#' ) return -1; // skip comments -+ -+ char *se = ss; -+ while ( *se && !isspace(*se) ) se++; -+ -+ *chr_beg = ss; -+ *chr_end = se-1; -+ -+ if ( !*se ) { fprintf(stderr,"Could not parse the line: %s\n", line); return -2; } -+ -+ ss = se+1; -+ *beg = strtod(ss, &se); -+ if ( ss==se ) { fprintf(stderr,"Could not parse tab line: %s\n", line); return -2; } -+ if ( *beg==0 ) { fprintf(stderr,"Could not parse tab line, expected 1-based coordinate: %s\n", line); return -2; } -+ (*beg)--; -+ *end = *beg; -+ -+ if ( !usr ) return 0; // allele information not required -+ -+ ss = se+1; -+ tgt_als_t *als = (tgt_als_t*)payload; -+ als->used = 0; -+ als->n = 0; -+ als->allele = NULL; -+ while ( *ss ) -+ { -+ se = ss; -+ while ( *se && *se!=',' ) se++; -+ als->n++; -+ als->allele = (char**)realloc(als->allele,als->n*sizeof(*als->allele)); -+ als->allele[als->n-1] = (char*)malloc(se-ss+1); -+ memcpy(als->allele[als->n-1],ss,se-ss); -+ als->allele[als->n-1][se-ss] = 0; -+ ss = se+1; -+ if ( !*se ) break; -+ } -+ return 0; -+} -+static void tgt_free(void *payload) -+{ -+ tgt_als_t *als = (tgt_als_t*)payload; -+ int i; -+ for (i=0; in; i++) free(als->allele[i]); -+ 
free(als->allele); -+} -+static void tgt_flush_region(args_t *args, char *chr, uint32_t beg, uint32_t end) -+{ -+ if ( !regidx_overlap(args->tgt_idx, chr,beg,end,args->tgt_itr_tmp) ) return; -+ while ( regitr_overlap(args->tgt_itr_tmp) ) -+ { -+ if ( args->tgt_itr_tmp->beg < beg ) continue; -+ -+ tgt_als_t *tgt_als = ®itr_payload(args->tgt_itr_tmp,tgt_als_t); -+ if ( tgt_als->used ) continue; -+ -+ args->missed_line->rid = bcf_hdr_name2id(args->aux.hdr,chr); -+ args->missed_line->pos = args->tgt_itr_tmp->beg; -+ bcf_unpack(args->missed_line,BCF_UN_ALL); -+ bcf_update_alleles(args->aux.hdr, args->missed_line, (const char**)tgt_als->allele, tgt_als->n); -+ tgt_als->used = 1; -+ if ( bcf_write1(args->out_fh, args->aux.hdr, args->missed_line)!=0 ) error("[%s] Error: failed to write to %s\n", __func__,args->output_fname); -+ } -+} -+static void tgt_flush(args_t *args, bcf1_t *rec) -+{ -+ if ( rec ) -+ { -+ char *chr = (char*)bcf_seqname(args->aux.hdr,rec); -+ -+ if ( !args->tgt_itr_prev ) // first record -+ tgt_flush_region(args,chr,0,rec->pos-1); -+ -+ else if ( strcmp(chr,args->tgt_itr_prev->seq) ) // first record on a new chromosome -+ { -+ tgt_flush_region(args,args->tgt_itr_prev->seq,args->tgt_itr_prev->beg+1,REGIDX_MAX); -+ tgt_flush_region(args,chr,0,rec->pos-1); -+ } -+ else // another record on the same chromosome -+ tgt_flush_region(args,args->tgt_itr_prev->seq,args->tgt_itr_prev->beg,rec->pos-1); -+ } -+ else -+ { -+ // flush everything -+ if ( args->tgt_itr_prev ) -+ tgt_flush_region(args,args->tgt_itr_prev->seq,args->tgt_itr_prev->beg,REGIDX_MAX); -+ -+ int i, nchr = 0; -+ char **chr = regidx_seq_names(args->tgt_idx, &nchr); -+ for (i=0; i" is not present at indels sites and there are no other symbolic alleles than <*> -+ if ( als[1][0]=='<' ) return 0; -+ -+ int i; -+ for (i=0; iaux; -- bcf1_t *missed = args->missed_line; -+ bcf1_t *rec = NULL; -+ if ( !args->vcfbuf ) -+ { -+ while ( bcf_sr_next_line(args->aux.srs) ) -+ { -+ rec = 
args->aux.srs->readers[0].buffer[0]; -+ if ( args->aux.srs->errnum || rec->errcode ) error("Error: could not parse the input VCF\n"); -+ if ( args->tgt_idx ) -+ { -+ if ( !regidx_overlap(args->tgt_idx, bcf_seqname(args->aux.hdr,rec),rec->pos,rec->pos,args->tgt_itr) ) continue; -+ -+ // For backward compatibility: require the exact position, not an interval overlap -+ int pos_match = 0; -+ while ( regitr_overlap(args->tgt_itr) ) -+ { -+ if ( args->tgt_itr->beg != rec->pos ) continue; -+ pos_match = 1; -+ break; -+ } -+ if ( !pos_match ) continue; -+ } -+ if ( args->samples_map ) bcf_subset(args->aux.hdr, rec, args->nsamples, args->samples_map); -+ bcf_unpack(rec, BCF_UN_STR); -+ return rec; -+ } -+ return NULL; -+ } -+ -+ // If we are here,-C alleles was given and vcfbuf and tgt_idx are set -+ -+ // Fill the buffer with duplicate lines -+ int vcfbuf_full = 1; -+ int nbuf = vcfbuf_nsites(args->vcfbuf); -+ bcf1_t *rec0 = NULL, *recN = NULL; -+ if ( nbuf==0 ) vcfbuf_full = 0; -+ else if ( nbuf==1 ) -+ { -+ vcfbuf_full = 0; -+ rec0 = vcfbuf_peek(args->vcfbuf, 0); -+ } -+ else -+ { -+ rec0 = vcfbuf_peek(args->vcfbuf, 0); -+ recN = vcfbuf_peek(args->vcfbuf, nbuf-1); -+ if ( rec0->rid == recN->rid && rec0->pos == recN->pos ) vcfbuf_full = 0; -+ } -+ if ( !vcfbuf_full ) -+ { -+ while ( bcf_sr_next_line(args->aux.srs) ) -+ { -+ rec = args->aux.srs->readers[0].buffer[0]; -+ if ( args->aux.srs->errnum || rec->errcode ) error("Error: could not parse the input VCF\n"); -+ if ( !regidx_overlap(args->tgt_idx, bcf_seqname(args->aux.hdr,rec),rec->pos,rec->pos,args->tgt_itr) ) continue; -+ // as above: require the exact position, not an interval overlap -+ int exact_match = 0; -+ while ( regitr_overlap(args->tgt_itr) ) -+ { -+ if ( args->tgt_itr->beg != rec->pos ) continue; -+ exact_match = 1; -+ break; -+ } -+ if ( !exact_match ) continue; -+ -+ if ( args->samples_map ) bcf_subset(args->aux.hdr, rec, args->nsamples, args->samples_map); -+ bcf_unpack(rec, BCF_UN_STR); -+ if ( !rec0 ) 
rec0 = rec; -+ recN = rec; -+ args->aux.srs->readers[0].buffer[0] = vcfbuf_push(args->vcfbuf, rec, 1); -+ if ( rec0->rid!=recN->rid || rec0->pos!=recN->pos ) break; -+ } -+ } - -- char *ss = regs->line.s; -- int i = 0; -- while ( iaux.srs->targets_als-1 && *ss ) -+ nbuf = vcfbuf_nsites(args->vcfbuf); -+ int n, i,j; -+ for (n=nbuf; n>1; n--) - { -- if ( *ss=='\t' ) i++; -- ss++; -+ recN = vcfbuf_peek(args->vcfbuf, n-1); -+ if ( rec0->rid==recN->rid && rec0->pos==recN->pos ) break; - } -- if ( !*ss ) error("Could not parse: [%s] (%d)\n", regs->line.s,args->aux.srs->targets_als); -+ if ( n==0 ) -+ { -+ assert( !nbuf ); -+ return NULL; -+ } -+ -+ // Find the VCF and tab record with the best matching combination of alleles, prioritize -+ // records of the same type (snp vs indel) -+ rec_tgt_t rec_tgt; -+ memset(&rec_tgt,0,sizeof(rec_tgt)); -+ regidx_overlap(args->tgt_idx, bcf_seqname(args->aux.hdr,rec0),rec0->pos,rec0->pos,args->tgt_itr); -+ regitr_t *tmp_itr = regitr_init(args->tgt_idx); -+ regitr_copy(tmp_itr, args->tgt_itr); -+ for (i=0; ivcfbuf, i); -+ int rec_indel = is_indel(rec->n_allele, rec->d.allele) ? 1 : -1; -+ while ( regitr_overlap(tmp_itr) ) -+ { -+ if ( tmp_itr->beg != rec->pos ) continue; -+ tgt_als_t *als = ®itr_payload(tmp_itr,tgt_als_t); -+ if ( als->used ) continue; -+ int nmatch_als = 0; -+ vcmp_t *vcmp = vcmp_init(); -+ int ret = vcmp_set_ref(vcmp, rec->d.allele[0], als->allele[0]); -+ if ( ret==0 ) -+ { -+ nmatch_als++; -+ if ( rec->n_allele > 1 && als->n > 1 ) -+ { -+ for (j=1; jn; j++) -+ { -+ if ( vcmp_find_allele(vcmp, rec->d.allele+1, rec->n_allele-1, als->allele[j])>=0 ) nmatch_als++; -+ } -+ } -+ } -+ int als_indel = is_indel(als->n, als->allele) ? 
1 : -1; -+ nmatch_als *= rec_indel*als_indel; -+ if ( nmatch_als > rec_tgt.nmatch_als || !rec_tgt.als ) -+ { -+ rec_tgt.nmatch_als = nmatch_als; -+ rec_tgt.als = als; -+ rec_tgt.ibuf = i; -+ } -+ vcmp_destroy(vcmp); -+ } -+ } -+ regitr_destroy(tmp_itr); - -- missed->rid = bcf_hdr_name2id(call->hdr,regs->seq_names[regs->prev_seq]); -- missed->pos = regs->start; -- bcf_update_alleles_str(call->hdr, missed,ss); -+ args->aux.tgt_als = rec_tgt.als; -+ if ( rec_tgt.als ) rec_tgt.als->used = 1; - -- bcf_write1(args->out_fh, call->hdr, missed); -+ rec = vcfbuf_remove(args->vcfbuf, rec_tgt.ibuf); -+ return rec; - } - - static void init_data(args_t *args) -@@ -376,22 +611,19 @@ - // Open files for input and output, initialize structures - if ( args->targets ) - { -- if ( bcf_sr_set_targets(args->aux.srs, args->targets, args->targets_is_file, args->aux.flag&CALL_CONSTR_ALLELES ? 3 : 0)<0 ) -- error("Failed to read the targets: %s\n", args->targets); -- -- if ( args->aux.flag&CALL_CONSTR_ALLELES && args->flag&CF_INS_MISSED ) -- { -- args->aux.srs->targets->missed_reg_handler = print_missed_line; -- args->aux.srs->targets->missed_reg_data = args; -- } -+ args->tgt_idx = regidx_init(args->targets, tgt_parse, args->aux.flag&CALL_CONSTR_ALLELES ? tgt_free : NULL, sizeof(tgt_als_t), args->aux.flag&CALL_CONSTR_ALLELES ? 
args : NULL); -+ args->tgt_itr = regitr_init(args->tgt_idx); -+ args->tgt_itr_tmp = regitr_init(args->tgt_idx); - } -+ - if ( args->regions ) - { - if ( bcf_sr_set_regions(args->aux.srs, args->regions, args->regions_is_file)<0 ) - error("Failed to read the regions: %s\n", args->regions); - } - -- if ( !bcf_sr_add_reader(args->aux.srs, args->bcf_fname) ) error("Failed to open %s: %s\n", args->bcf_fname,bcf_sr_strerror(args->aux.srs->errnum)); -+ if ( !bcf_sr_add_reader(args->aux.srs, args->bcf_fname) ) -+ error("Failed to read from %s: %s\n", !strcmp("-",args->bcf_fname)?"standard input":args->bcf_fname,bcf_sr_strerror(args->aux.srs->errnum)); - args->aux.hdr = bcf_sr_get_header(args->aux.srs,0); - - int i; -@@ -451,8 +683,11 @@ - } - } - -+ if ( args->aux.flag & CALL_CONSTR_ALLELES ) -+ args->vcfbuf = vcfbuf_init(args->aux.hdr, 0); -+ - args->out_fh = hts_open(args->output_fname, hts_bcf_wmode(args->output_type)); -- if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno)); -+ if ( args->out_fh == NULL ) error("Error: cannot write to \"%s\": %s\n", args->output_fname, strerror(errno)); - if ( args->n_threads ) hts_set_threads(args->out_fh, args->n_threads); - - if ( args->flag & CF_QCALL ) -@@ -468,13 +703,21 @@ - bcf_hdr_remove(args->aux.hdr, BCF_HL_INFO, "I16"); - - if (args->record_cmd_line) bcf_hdr_append_version(args->aux.hdr, args->argc, args->argv, "bcftools_call"); -- bcf_hdr_write(args->out_fh, args->aux.hdr); -+ if ( bcf_hdr_write(args->out_fh, args->aux.hdr)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->output_fname); - - if ( args->flag&CF_INS_MISSED ) init_missed_line(args); - } - - static void destroy_data(args_t *args) - { -+ if ( args->vcfbuf ) vcfbuf_destroy(args->vcfbuf); -+ if ( args->tgt_idx ) -+ { -+ regidx_destroy(args->tgt_idx); -+ regitr_destroy(args->tgt_itr); -+ regitr_destroy(args->tgt_itr_tmp); -+ if ( args->tgt_itr_prev ) regitr_destroy(args->tgt_itr_prev); -+ } - 
if ( args->flag & CF_CCALL ) ccall_destroy(&args->aux); - else if ( args->flag & CF_MCALL ) mcall_destroy(&args->aux); - else if ( args->flag & CF_QCALL ) qcall_destroy(&args->aux); -@@ -496,9 +739,10 @@ - free(args->samples_map); - free(args->sample2sex); - free(args->aux.ploidy); -+ free(args->str.s); - if ( args->gvcf ) gvcf_destroy(args->gvcf); - bcf_hdr_destroy(args->aux.hdr); -- hts_close(args->out_fh); -+ if ( hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname); - bcf_sr_destroy(args->aux.srs); - } - -@@ -604,7 +848,7 @@ - static void usage(args_t *args) - { - fprintf(stderr, "\n"); -- fprintf(stderr, "About: SNP/indel variant calling from VCF/BCF. To be used in conjunction with samtools mpileup.\n"); -+ fprintf(stderr, "About: SNP/indel variant calling from VCF/BCF. To be used in conjunction with bcftools mpileup.\n"); - fprintf(stderr, " This command replaces the former \"bcftools view\" caller. Some of the original\n"); - fprintf(stderr, " functionality has been temporarily lost in the process of transition to htslib,\n"); - fprintf(stderr, " but will be added back on popular demand. 
The original calling model can be\n"); -@@ -623,12 +867,13 @@ - fprintf(stderr, " -S, --samples-file PED file or a file with an optional column with sex (see man page for details) [all samples]\n"); - fprintf(stderr, " -t, --targets similar to -r but streams rather than index-jumps\n"); - fprintf(stderr, " -T, --targets-file similar to -R but streams rather than index-jumps\n"); -- fprintf(stderr, " --threads number of extra output compression threads [0]\n"); -+ fprintf(stderr, " --threads use multithreading with worker threads [0]\n"); - fprintf(stderr, "\n"); - fprintf(stderr, "Input/output options:\n"); - fprintf(stderr, " -A, --keep-alts keep all possible alternate alleles at variant sites\n"); - fprintf(stderr, " -f, --format-fields output format fields: GQ,GP (lowercase allowed) []\n"); - fprintf(stderr, " -F, --prior-freqs use prior allele frequencies\n"); -+ fprintf(stderr, " -G, --group-samples group samples by population (file with \"sample\\tgroup\") or \"-\" for single-sample calling\n"); - fprintf(stderr, " -g, --gvcf ,[...] group non-variant sites into gVCF blocks by minimum per-sample DP\n"); - fprintf(stderr, " -i, --insert-missed output also sites missed by mpileup but present in -T\n"); - fprintf(stderr, " -M, --keep-masked-ref keep sites with masked reference allele (REF=N)\n"); -@@ -642,6 +887,10 @@ - fprintf(stderr, " -n, --novel-rate ,[...] 
likelihood of novel mutation for constrained trio calling, see man page for details [1e-8,1e-9,1e-9]\n"); - fprintf(stderr, " -p, --pval-threshold variant if P(ref|D) mutation rate (use bigger for greater sensitivity), use with -m [1.1e-3]\n"); -+ fprintf(stderr, "\n"); -+ fprintf(stderr, "Example:\n"); -+ fprintf(stderr, " # See also http://samtools.github.io/bcftools/howtos/variant-calling.html\n"); -+ fprintf(stderr, " bcftools mpileup -f reference.fa alignments.bam | bcftools call -mv -Ob -o calls.bcf\n"); - - // todo (and more) - // fprintf(stderr, "\nContrast calling and association test options:\n"); -@@ -680,6 +929,7 @@ - {"format-fields",required_argument,NULL,'f'}, - {"prior-freqs",required_argument,NULL,'F'}, - {"gvcf",required_argument,NULL,'g'}, -+ {"group-samples",required_argument,NULL,'G'}, - {"output",required_argument,NULL,'o'}, - {"output-type",required_argument,NULL,'O'}, - {"regions",required_argument,NULL,'r'}, -@@ -710,7 +960,7 @@ - }; - - char *tmp = NULL; -- while ((c = getopt_long(argc, argv, "h?o:O:r:R:s:S:t:T:ANMV:vcmp:C:n:P:f:ig:XYF:", loptions, NULL)) >= 0) -+ while ((c = getopt_long(argc, argv, "h?o:O:r:R:s:S:t:T:ANMV:vcmp:C:n:P:f:ig:XYF:G:", loptions, NULL)) >= 0) - { - switch (c) - { -@@ -718,6 +968,7 @@ - case 1 : ploidy = optarg; break; - case 'X': ploidy = "X"; fprintf(stderr,"Warning: -X will be deprecated, please use --ploidy instead.\n"); break; - case 'Y': ploidy = "Y"; fprintf(stderr,"Warning: -Y will be deprecated, please use --ploidy instead.\n"); break; -+ case 'G': args.aux.sample_groups = optarg; break; - case 'f': args.aux.output_tags |= parse_format_flag(optarg); break; - case 'M': args.flag &= ~CF_ACGT_ONLY; break; // keep sites where REF is N - case 'N': args.flag |= CF_ACGT_ONLY; break; // omit sites where first base in REF is N (the new default) -@@ -805,13 +1056,14 @@ - } - if ( args.flag & CF_INS_MISSED && !(args.aux.flag&CALL_CONSTR_ALLELES) ) error("The -i option requires -C alleles\n"); - if ( 
args.aux.flag&CALL_VARONLY && args.gvcf ) error("The two options cannot be combined: --variants-only and --gvcf\n"); -+ if ( args.aux.sample_groups && !(args.flag & CF_MCALL) ) error("The -G feature is supported only with the -m calling mode\n"); - init_data(&args); - -- while ( bcf_sr_next_line(args.aux.srs) ) -+ bcf1_t *bcf_rec; -+ while ( (bcf_rec = next_line(&args)) ) - { -- bcf1_t *bcf_rec = args.aux.srs->readers[0].buffer[0]; -- if ( args.samples_map ) bcf_subset(args.aux.hdr, bcf_rec, args.nsamples, args.samples_map); -- bcf_unpack(bcf_rec, BCF_UN_STR); -+ // Skip duplicate positions with all matching `-C alleles -T` used up -+ if ( args.aux.flag&CALL_CONSTR_ALLELES && !args.aux.tgt_als ) continue; - - // Skip unwanted sites - int i, is_indel = bcf_is_snp(bcf_rec) ? 0 : 1; -@@ -845,6 +1097,13 @@ - continue; - } - -+ if ( args.flag & CF_INS_MISSED ) -+ { -+ tgt_flush(&args,bcf_rec); -+ if ( !args.tgt_itr_prev ) args.tgt_itr_prev = regitr_init(args.tgt_idx); -+ regitr_copy(args.tgt_itr_prev, args.tgt_itr); -+ } -+ - // Calling modes which output VCFs - int ret; - if ( args.flag & CF_MCALL ) -@@ -858,11 +1117,10 @@ - if ( (args.aux.flag & CALL_VARONLY) && ret==0 && !args.gvcf ) continue; // not a variant - if ( args.gvcf ) - bcf_rec = gvcf_write(args.gvcf, args.out_fh, args.aux.hdr, bcf_rec, ret==1?1:0); -- if ( bcf_rec ) -- bcf_write1(args.out_fh, args.aux.hdr, bcf_rec); -+ if ( bcf_rec && bcf_write1(args.out_fh, args.aux.hdr, bcf_rec)!=0 ) error("[%s] Error: failed to write to %s\n", __func__,args.output_fname); - } - if ( args.gvcf ) gvcf_write(args.gvcf, args.out_fh, args.aux.hdr, NULL, 0); -- if ( args.flag & CF_INS_MISSED ) bcf_sr_regions_flush(args.aux.srs->targets); -+ if ( args.flag & CF_INS_MISSED ) tgt_flush(&args,NULL); - destroy_data(&args); - return 0; - } ---- python-pysam.orig/bcftools/vcfcall.c.pysam.c -+++ python-pysam/bcftools/vcfcall.c.pysam.c -@@ -44,14 +44,11 @@ - #include "prob1.h" - #include "ploidy.h" - #include "gvcf.h" -+#include 
"regidx.h" -+#include "vcfbuf.h" - - void error(const char *format, ...); - --#ifdef _WIN32 --#define srand48(x) srand(x) --#define lrand48() rand() --#endif -- - #define CF_NO_GENO 1 - #define CF_INS_MISSED (1<<1) - #define CF_CCALL (1<<2) -@@ -70,6 +67,13 @@ - - typedef struct - { -+ tgt_als_t *als; -+ int nmatch_als, ibuf; -+} -+rec_tgt_t; -+ -+typedef struct -+{ - int flag; // combination of CF_* flags above - int output_type, n_threads, record_cmd_line; - htsFile *bcf_in, *out_fh; -@@ -78,6 +82,9 @@ - int nsamples, *samples_map; // mapping from output sample names to original VCF - char *regions, *targets; // regions to process - int regions_is_file, targets_is_file; -+ regidx_t *tgt_idx; -+ regitr_t *tgt_itr, *tgt_itr_prev, *tgt_itr_tmp; -+ vcfbuf_t *vcfbuf; - - char *samples_fname; - int samples_is_file; -@@ -88,6 +95,7 @@ - - bcf1_t *missed_line; - call_t aux; // parameters and temporary data -+ kstring_t str; - - int argc; - char **argv; -@@ -299,7 +307,7 @@ - if ( ismpl < 0 ) { fprintf(bcftools_stderr,"Warning: No such sample in the VCF: %s\n",ss); continue; } - if ( old2new[ismpl] != -1 ) { fprintf(bcftools_stderr,"Warning: The sample is listed multiple times: %s\n",ss); continue; } - -- ss = se+1; -+ ss = se+(x != '\0'); - while ( *ss && isspace(*ss) ) ss++; - if ( !*ss ) ss = "2"; // default ploidy - se = ss; -@@ -349,26 +357,253 @@ - bcf_float_set_missing(args->missed_line->qual); - } - --static void print_missed_line(bcf_sr_regions_t *regs, void *data) -+static int tgt_parse(const char *line, char **chr_beg, char **chr_end, uint32_t *beg, uint32_t *end, void *payload, void *usr) -+{ -+ char *ss = (char*) line; -+ while ( *ss && isspace(*ss) ) ss++; -+ if ( !*ss ) { fprintf(bcftools_stderr,"Could not parse the line: %s\n", line); return -2; } -+ if ( *ss=='#' ) return -1; // skip comments -+ -+ char *se = ss; -+ while ( *se && !isspace(*se) ) se++; -+ -+ *chr_beg = ss; -+ *chr_end = se-1; -+ -+ if ( !*se ) { fprintf(bcftools_stderr,"Could not parse 
the line: %s\n", line); return -2; } -+ -+ ss = se+1; -+ *beg = strtod(ss, &se); -+ if ( ss==se ) { fprintf(bcftools_stderr,"Could not parse tab line: %s\n", line); return -2; } -+ if ( *beg==0 ) { fprintf(bcftools_stderr,"Could not parse tab line, expected 1-based coordinate: %s\n", line); return -2; } -+ (*beg)--; -+ *end = *beg; -+ -+ if ( !usr ) return 0; // allele information not required -+ -+ ss = se+1; -+ tgt_als_t *als = (tgt_als_t*)payload; -+ als->used = 0; -+ als->n = 0; -+ als->allele = NULL; -+ while ( *ss ) -+ { -+ se = ss; -+ while ( *se && *se!=',' ) se++; -+ als->n++; -+ als->allele = (char**)realloc(als->allele,als->n*sizeof(*als->allele)); -+ als->allele[als->n-1] = (char*)malloc(se-ss+1); -+ memcpy(als->allele[als->n-1],ss,se-ss); -+ als->allele[als->n-1][se-ss] = 0; -+ ss = se+1; -+ if ( !*se ) break; -+ } -+ return 0; -+} -+static void tgt_free(void *payload) -+{ -+ tgt_als_t *als = (tgt_als_t*)payload; -+ int i; -+ for (i=0; in; i++) free(als->allele[i]); -+ free(als->allele); -+} -+static void tgt_flush_region(args_t *args, char *chr, uint32_t beg, uint32_t end) -+{ -+ if ( !regidx_overlap(args->tgt_idx, chr,beg,end,args->tgt_itr_tmp) ) return; -+ while ( regitr_overlap(args->tgt_itr_tmp) ) -+ { -+ if ( args->tgt_itr_tmp->beg < beg ) continue; -+ -+ tgt_als_t *tgt_als = ®itr_payload(args->tgt_itr_tmp,tgt_als_t); -+ if ( tgt_als->used ) continue; -+ -+ args->missed_line->rid = bcf_hdr_name2id(args->aux.hdr,chr); -+ args->missed_line->pos = args->tgt_itr_tmp->beg; -+ bcf_unpack(args->missed_line,BCF_UN_ALL); -+ bcf_update_alleles(args->aux.hdr, args->missed_line, (const char**)tgt_als->allele, tgt_als->n); -+ tgt_als->used = 1; -+ if ( bcf_write1(args->out_fh, args->aux.hdr, args->missed_line)!=0 ) error("[%s] Error: failed to write to %s\n", __func__,args->output_fname); -+ } -+} -+static void tgt_flush(args_t *args, bcf1_t *rec) -+{ -+ if ( rec ) -+ { -+ char *chr = (char*)bcf_seqname(args->aux.hdr,rec); -+ -+ if ( !args->tgt_itr_prev ) // 
first record -+ tgt_flush_region(args,chr,0,rec->pos-1); -+ -+ else if ( strcmp(chr,args->tgt_itr_prev->seq) ) // first record on a new chromosome -+ { -+ tgt_flush_region(args,args->tgt_itr_prev->seq,args->tgt_itr_prev->beg+1,REGIDX_MAX); -+ tgt_flush_region(args,chr,0,rec->pos-1); -+ } -+ else // another record on the same chromosome -+ tgt_flush_region(args,args->tgt_itr_prev->seq,args->tgt_itr_prev->beg,rec->pos-1); -+ } -+ else -+ { -+ // flush everything -+ if ( args->tgt_itr_prev ) -+ tgt_flush_region(args,args->tgt_itr_prev->seq,args->tgt_itr_prev->beg,REGIDX_MAX); -+ -+ int i, nchr = 0; -+ char **chr = regidx_seq_names(args->tgt_idx, &nchr); -+ for (i=0; i" is not present at indels sites and there are no other symbolic alleles than <*> -+ if ( als[1][0]=='<' ) return 0; -+ -+ int i; -+ for (i=0; iaux; -- bcf1_t *missed = args->missed_line; -+ bcf1_t *rec = NULL; -+ if ( !args->vcfbuf ) -+ { -+ while ( bcf_sr_next_line(args->aux.srs) ) -+ { -+ rec = args->aux.srs->readers[0].buffer[0]; -+ if ( args->aux.srs->errnum || rec->errcode ) error("Error: could not parse the input VCF\n"); -+ if ( args->tgt_idx ) -+ { -+ if ( !regidx_overlap(args->tgt_idx, bcf_seqname(args->aux.hdr,rec),rec->pos,rec->pos,args->tgt_itr) ) continue; -+ -+ // For backward compatibility: require the exact position, not an interval overlap -+ int pos_match = 0; -+ while ( regitr_overlap(args->tgt_itr) ) -+ { -+ if ( args->tgt_itr->beg != rec->pos ) continue; -+ pos_match = 1; -+ break; -+ } -+ if ( !pos_match ) continue; -+ } -+ if ( args->samples_map ) bcf_subset(args->aux.hdr, rec, args->nsamples, args->samples_map); -+ bcf_unpack(rec, BCF_UN_STR); -+ return rec; -+ } -+ return NULL; -+ } -+ -+ // If we are here,-C alleles was given and vcfbuf and tgt_idx are set -+ -+ // Fill the buffer with duplicate lines -+ int vcfbuf_full = 1; -+ int nbuf = vcfbuf_nsites(args->vcfbuf); -+ bcf1_t *rec0 = NULL, *recN = NULL; -+ if ( nbuf==0 ) vcfbuf_full = 0; -+ else if ( nbuf==1 ) -+ { -+ 
vcfbuf_full = 0; -+ rec0 = vcfbuf_peek(args->vcfbuf, 0); -+ } -+ else -+ { -+ rec0 = vcfbuf_peek(args->vcfbuf, 0); -+ recN = vcfbuf_peek(args->vcfbuf, nbuf-1); -+ if ( rec0->rid == recN->rid && rec0->pos == recN->pos ) vcfbuf_full = 0; -+ } -+ if ( !vcfbuf_full ) -+ { -+ while ( bcf_sr_next_line(args->aux.srs) ) -+ { -+ rec = args->aux.srs->readers[0].buffer[0]; -+ if ( args->aux.srs->errnum || rec->errcode ) error("Error: could not parse the input VCF\n"); -+ if ( !regidx_overlap(args->tgt_idx, bcf_seqname(args->aux.hdr,rec),rec->pos,rec->pos,args->tgt_itr) ) continue; -+ // as above: require the exact position, not an interval overlap -+ int exact_match = 0; -+ while ( regitr_overlap(args->tgt_itr) ) -+ { -+ if ( args->tgt_itr->beg != rec->pos ) continue; -+ exact_match = 1; -+ break; -+ } -+ if ( !exact_match ) continue; -+ -+ if ( args->samples_map ) bcf_subset(args->aux.hdr, rec, args->nsamples, args->samples_map); -+ bcf_unpack(rec, BCF_UN_STR); -+ if ( !rec0 ) rec0 = rec; -+ recN = rec; -+ args->aux.srs->readers[0].buffer[0] = vcfbuf_push(args->vcfbuf, rec, 1); -+ if ( rec0->rid!=recN->rid || rec0->pos!=recN->pos ) break; -+ } -+ } - -- char *ss = regs->line.s; -- int i = 0; -- while ( iaux.srs->targets_als-1 && *ss ) -+ nbuf = vcfbuf_nsites(args->vcfbuf); -+ int n, i,j; -+ for (n=nbuf; n>1; n--) - { -- if ( *ss=='\t' ) i++; -- ss++; -+ recN = vcfbuf_peek(args->vcfbuf, n-1); -+ if ( rec0->rid==recN->rid && rec0->pos==recN->pos ) break; - } -- if ( !*ss ) error("Could not parse: [%s] (%d)\n", regs->line.s,args->aux.srs->targets_als); -+ if ( n==0 ) -+ { -+ assert( !nbuf ); -+ return NULL; -+ } -+ -+ // Find the VCF and tab record with the best matching combination of alleles, prioritize -+ // records of the same type (snp vs indel) -+ rec_tgt_t rec_tgt; -+ memset(&rec_tgt,0,sizeof(rec_tgt)); -+ regidx_overlap(args->tgt_idx, bcf_seqname(args->aux.hdr,rec0),rec0->pos,rec0->pos,args->tgt_itr); -+ regitr_t *tmp_itr = regitr_init(args->tgt_idx); -+ 
regitr_copy(tmp_itr, args->tgt_itr); -+ for (i=0; ivcfbuf, i); -+ int rec_indel = is_indel(rec->n_allele, rec->d.allele) ? 1 : -1; -+ while ( regitr_overlap(tmp_itr) ) -+ { -+ if ( tmp_itr->beg != rec->pos ) continue; -+ tgt_als_t *als = ®itr_payload(tmp_itr,tgt_als_t); -+ if ( als->used ) continue; -+ int nmatch_als = 0; -+ vcmp_t *vcmp = vcmp_init(); -+ int ret = vcmp_set_ref(vcmp, rec->d.allele[0], als->allele[0]); -+ if ( ret==0 ) -+ { -+ nmatch_als++; -+ if ( rec->n_allele > 1 && als->n > 1 ) -+ { -+ for (j=1; jn; j++) -+ { -+ if ( vcmp_find_allele(vcmp, rec->d.allele+1, rec->n_allele-1, als->allele[j])>=0 ) nmatch_als++; -+ } -+ } -+ } -+ int als_indel = is_indel(als->n, als->allele) ? 1 : -1; -+ nmatch_als *= rec_indel*als_indel; -+ if ( nmatch_als > rec_tgt.nmatch_als || !rec_tgt.als ) -+ { -+ rec_tgt.nmatch_als = nmatch_als; -+ rec_tgt.als = als; -+ rec_tgt.ibuf = i; -+ } -+ vcmp_destroy(vcmp); -+ } -+ } -+ regitr_destroy(tmp_itr); - -- missed->rid = bcf_hdr_name2id(call->hdr,regs->seq_names[regs->prev_seq]); -- missed->pos = regs->start; -- bcf_update_alleles_str(call->hdr, missed,ss); -+ args->aux.tgt_als = rec_tgt.als; -+ if ( rec_tgt.als ) rec_tgt.als->used = 1; - -- bcf_write1(args->out_fh, call->hdr, missed); -+ rec = vcfbuf_remove(args->vcfbuf, rec_tgt.ibuf); -+ return rec; - } - - static void init_data(args_t *args) -@@ -378,22 +613,19 @@ - // Open files for input and output, initialize structures - if ( args->targets ) - { -- if ( bcf_sr_set_targets(args->aux.srs, args->targets, args->targets_is_file, args->aux.flag&CALL_CONSTR_ALLELES ? 3 : 0)<0 ) -- error("Failed to read the targets: %s\n", args->targets); -- -- if ( args->aux.flag&CALL_CONSTR_ALLELES && args->flag&CF_INS_MISSED ) -- { -- args->aux.srs->targets->missed_reg_handler = print_missed_line; -- args->aux.srs->targets->missed_reg_data = args; -- } -+ args->tgt_idx = regidx_init(args->targets, tgt_parse, args->aux.flag&CALL_CONSTR_ALLELES ? 
tgt_free : NULL, sizeof(tgt_als_t), args->aux.flag&CALL_CONSTR_ALLELES ? args : NULL); -+ args->tgt_itr = regitr_init(args->tgt_idx); -+ args->tgt_itr_tmp = regitr_init(args->tgt_idx); - } -+ - if ( args->regions ) - { - if ( bcf_sr_set_regions(args->aux.srs, args->regions, args->regions_is_file)<0 ) - error("Failed to read the regions: %s\n", args->regions); - } - -- if ( !bcf_sr_add_reader(args->aux.srs, args->bcf_fname) ) error("Failed to open %s: %s\n", args->bcf_fname,bcf_sr_strerror(args->aux.srs->errnum)); -+ if ( !bcf_sr_add_reader(args->aux.srs, args->bcf_fname) ) -+ error("Failed to read from %s: %s\n", !strcmp("-",args->bcf_fname)?"standard input":args->bcf_fname,bcf_sr_strerror(args->aux.srs->errnum)); - args->aux.hdr = bcf_sr_get_header(args->aux.srs,0); - - int i; -@@ -453,8 +685,11 @@ - } - } - -+ if ( args->aux.flag & CALL_CONSTR_ALLELES ) -+ args->vcfbuf = vcfbuf_init(args->aux.hdr, 0); -+ - args->out_fh = hts_open(args->output_fname, hts_bcf_wmode(args->output_type)); -- if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno)); -+ if ( args->out_fh == NULL ) error("Error: cannot write to \"%s\": %s\n", args->output_fname, strerror(errno)); - if ( args->n_threads ) hts_set_threads(args->out_fh, args->n_threads); - - if ( args->flag & CF_QCALL ) -@@ -470,13 +705,21 @@ - bcf_hdr_remove(args->aux.hdr, BCF_HL_INFO, "I16"); - - if (args->record_cmd_line) bcf_hdr_append_version(args->aux.hdr, args->argc, args->argv, "bcftools_call"); -- bcf_hdr_write(args->out_fh, args->aux.hdr); -+ if ( bcf_hdr_write(args->out_fh, args->aux.hdr)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->output_fname); - - if ( args->flag&CF_INS_MISSED ) init_missed_line(args); - } - - static void destroy_data(args_t *args) - { -+ if ( args->vcfbuf ) vcfbuf_destroy(args->vcfbuf); -+ if ( args->tgt_idx ) -+ { -+ regidx_destroy(args->tgt_idx); -+ regitr_destroy(args->tgt_itr); -+ regitr_destroy(args->tgt_itr_tmp); 
-+ if ( args->tgt_itr_prev ) regitr_destroy(args->tgt_itr_prev); -+ } - if ( args->flag & CF_CCALL ) ccall_destroy(&args->aux); - else if ( args->flag & CF_MCALL ) mcall_destroy(&args->aux); - else if ( args->flag & CF_QCALL ) qcall_destroy(&args->aux); -@@ -498,9 +741,10 @@ - free(args->samples_map); - free(args->sample2sex); - free(args->aux.ploidy); -+ free(args->str.s); - if ( args->gvcf ) gvcf_destroy(args->gvcf); - bcf_hdr_destroy(args->aux.hdr); -- hts_close(args->out_fh); -+ if ( hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname); - bcf_sr_destroy(args->aux.srs); - } - -@@ -606,7 +850,7 @@ - static void usage(args_t *args) - { - fprintf(bcftools_stderr, "\n"); -- fprintf(bcftools_stderr, "About: SNP/indel variant calling from VCF/BCF. To be used in conjunction with samtools mpileup.\n"); -+ fprintf(bcftools_stderr, "About: SNP/indel variant calling from VCF/BCF. To be used in conjunction with bcftools mpileup.\n"); - fprintf(bcftools_stderr, " This command replaces the former \"bcftools view\" caller. Some of the original\n"); - fprintf(bcftools_stderr, " functionality has been temporarily lost in the process of transition to htslib,\n"); - fprintf(bcftools_stderr, " but will be added back on popular demand. 
The original calling model can be\n"); -@@ -625,12 +869,13 @@ - fprintf(bcftools_stderr, " -S, --samples-file PED file or a file with an optional column with sex (see man page for details) [all samples]\n"); - fprintf(bcftools_stderr, " -t, --targets similar to -r but streams rather than index-jumps\n"); - fprintf(bcftools_stderr, " -T, --targets-file similar to -R but streams rather than index-jumps\n"); -- fprintf(bcftools_stderr, " --threads number of extra output compression threads [0]\n"); -+ fprintf(bcftools_stderr, " --threads use multithreading with worker threads [0]\n"); - fprintf(bcftools_stderr, "\n"); - fprintf(bcftools_stderr, "Input/output options:\n"); - fprintf(bcftools_stderr, " -A, --keep-alts keep all possible alternate alleles at variant sites\n"); - fprintf(bcftools_stderr, " -f, --format-fields output format fields: GQ,GP (lowercase allowed) []\n"); - fprintf(bcftools_stderr, " -F, --prior-freqs use prior allele frequencies\n"); -+ fprintf(bcftools_stderr, " -G, --group-samples group samples by population (file with \"sample\\tgroup\") or \"-\" for single-sample calling\n"); - fprintf(bcftools_stderr, " -g, --gvcf ,[...] group non-variant sites into gVCF blocks by minimum per-sample DP\n"); - fprintf(bcftools_stderr, " -i, --insert-missed output also sites missed by mpileup but present in -T\n"); - fprintf(bcftools_stderr, " -M, --keep-masked-ref keep sites with masked reference allele (REF=N)\n"); -@@ -644,6 +889,10 @@ - fprintf(bcftools_stderr, " -n, --novel-rate ,[...] 
likelihood of novel mutation for constrained trio calling, see man page for details [1e-8,1e-9,1e-9]\n"); - fprintf(bcftools_stderr, " -p, --pval-threshold variant if P(ref|D) mutation rate (use bigger for greater sensitivity), use with -m [1.1e-3]\n"); -+ fprintf(bcftools_stderr, "\n"); -+ fprintf(bcftools_stderr, "Example:\n"); -+ fprintf(bcftools_stderr, " # See also http://samtools.github.io/bcftools/howtos/variant-calling.html\n"); -+ fprintf(bcftools_stderr, " bcftools mpileup -f reference.fa alignments.bam | bcftools call -mv -Ob -o calls.bcf\n"); - - // todo (and more) - // fprintf(bcftools_stderr, "\nContrast calling and association test options:\n"); -@@ -682,6 +931,7 @@ - {"format-fields",required_argument,NULL,'f'}, - {"prior-freqs",required_argument,NULL,'F'}, - {"gvcf",required_argument,NULL,'g'}, -+ {"group-samples",required_argument,NULL,'G'}, - {"output",required_argument,NULL,'o'}, - {"output-type",required_argument,NULL,'O'}, - {"regions",required_argument,NULL,'r'}, -@@ -712,7 +962,7 @@ - }; - - char *tmp = NULL; -- while ((c = getopt_long(argc, argv, "h?o:O:r:R:s:S:t:T:ANMV:vcmp:C:n:P:f:ig:XYF:", loptions, NULL)) >= 0) -+ while ((c = getopt_long(argc, argv, "h?o:O:r:R:s:S:t:T:ANMV:vcmp:C:n:P:f:ig:XYF:G:", loptions, NULL)) >= 0) - { - switch (c) - { -@@ -720,6 +970,7 @@ - case 1 : ploidy = optarg; break; - case 'X': ploidy = "X"; fprintf(bcftools_stderr,"Warning: -X will be deprecated, please use --ploidy instead.\n"); break; - case 'Y': ploidy = "Y"; fprintf(bcftools_stderr,"Warning: -Y will be deprecated, please use --ploidy instead.\n"); break; -+ case 'G': args.aux.sample_groups = optarg; break; - case 'f': args.aux.output_tags |= parse_format_flag(optarg); break; - case 'M': args.flag &= ~CF_ACGT_ONLY; break; // keep sites where REF is N - case 'N': args.flag |= CF_ACGT_ONLY; break; // omit sites where first base in REF is N (the new default) -@@ -807,13 +1058,14 @@ - } - if ( args.flag & CF_INS_MISSED && 
!(args.aux.flag&CALL_CONSTR_ALLELES) ) error("The -i option requires -C alleles\n"); - if ( args.aux.flag&CALL_VARONLY && args.gvcf ) error("The two options cannot be combined: --variants-only and --gvcf\n"); -+ if ( args.aux.sample_groups && !(args.flag & CF_MCALL) ) error("The -G feature is supported only with the -m calling mode\n"); - init_data(&args); - -- while ( bcf_sr_next_line(args.aux.srs) ) -+ bcf1_t *bcf_rec; -+ while ( (bcf_rec = next_line(&args)) ) - { -- bcf1_t *bcf_rec = args.aux.srs->readers[0].buffer[0]; -- if ( args.samples_map ) bcf_subset(args.aux.hdr, bcf_rec, args.nsamples, args.samples_map); -- bcf_unpack(bcf_rec, BCF_UN_STR); -+ // Skip duplicate positions with all matching `-C alleles -T` used up -+ if ( args.aux.flag&CALL_CONSTR_ALLELES && !args.aux.tgt_als ) continue; - - // Skip unwanted sites - int i, is_indel = bcf_is_snp(bcf_rec) ? 0 : 1; -@@ -847,6 +1099,13 @@ - continue; - } - -+ if ( args.flag & CF_INS_MISSED ) -+ { -+ tgt_flush(&args,bcf_rec); -+ if ( !args.tgt_itr_prev ) args.tgt_itr_prev = regitr_init(args.tgt_idx); -+ regitr_copy(args.tgt_itr_prev, args.tgt_itr); -+ } -+ - // Calling modes which output VCFs - int ret; - if ( args.flag & CF_MCALL ) -@@ -860,11 +1119,10 @@ - if ( (args.aux.flag & CALL_VARONLY) && ret==0 && !args.gvcf ) continue; // not a variant - if ( args.gvcf ) - bcf_rec = gvcf_write(args.gvcf, args.out_fh, args.aux.hdr, bcf_rec, ret==1?1:0); -- if ( bcf_rec ) -- bcf_write1(args.out_fh, args.aux.hdr, bcf_rec); -+ if ( bcf_rec && bcf_write1(args.out_fh, args.aux.hdr, bcf_rec)!=0 ) error("[%s] Error: failed to write to %s\n", __func__,args.output_fname); - } - if ( args.gvcf ) gvcf_write(args.gvcf, args.out_fh, args.aux.hdr, NULL, 0); -- if ( args.flag & CF_INS_MISSED ) bcf_sr_regions_flush(args.aux.srs->targets); -+ if ( args.flag & CF_INS_MISSED ) tgt_flush(&args,NULL); - destroy_data(&args); - return 0; - } ---- python-pysam.orig/bcftools/vcfcnv.c -+++ python-pysam/bcftools/vcfcnv.c -@@ -34,6 +34,7 @@ - 
#include - #include - #include -+#include - #include - #include - #include -@@ -226,9 +227,9 @@ - } - static void close_sample_files(sample_t *smpl) - { -- fclose(smpl->dat_fh); -- fclose(smpl->cn_fh); -- fclose(smpl->summary_fh); -+ if ( fclose(smpl->dat_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,smpl->dat_fname); -+ if ( fclose(smpl->cn_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,smpl->cn_fname); -+ if ( fclose(smpl->summary_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,smpl->summary_fname); - } - - static double norm_cdf(double mean, double dev); -@@ -1190,10 +1191,10 @@ - args->control_sample.lrr[args->nsites-1] = lrr2; - args->control_sample.baf[args->nsites-1] = baf2; - if ( baf2>=0 ) // skip missing values -- fprintf(args->control_sample.dat_fh,"%s\t%d\t%.3f\t%.3f\n",bcf_hdr_id2name(args->hdr,args->prev_rid), line->pos+1,baf2,lrr2); -+ fprintf(args->control_sample.dat_fh,"%s\t%"PRId64"\t%.3f\t%.3f\n",bcf_hdr_id2name(args->hdr,args->prev_rid), (int64_t) line->pos+1,baf2,lrr2); - } - if ( baf1>=0 ) // skip missing values -- fprintf(args->query_sample.dat_fh,"%s\t%d\t%.3f\t%.3f\n",bcf_hdr_id2name(args->hdr,args->prev_rid), line->pos+1,baf1,lrr1); -+ fprintf(args->query_sample.dat_fh,"%s\t%"PRId64"\t%.3f\t%.3f\n",bcf_hdr_id2name(args->hdr,args->prev_rid), (int64_t) line->pos+1,baf1,lrr1); - - if ( baf1>=0 ) - { -@@ -1277,13 +1278,13 @@ - {"LRR-weight",1,0,'l'}, - {"same-prob",1,0,'P'}, - {"xy-prob",1,0,'x'}, -- {"sample",1,0,'s'}, -- {"control",1,0,'c'}, -+ {"query-sample",1,0,'s'}, -+ {"control-sample",1,0,'c'}, - {"targets",1,0,'t'}, - {"targets-file",1,0,'T'}, - {"regions",1,0,'r'}, - {"regions-file",1,0,'R'}, -- {"plot",1,0,'p'}, -+ {"plot-threshold",1,0,'p'}, - {"output-dir",1,0,'o'}, - {0,0,0,0} - }; -@@ -1399,7 +1400,8 @@ - if ( bcf_sr_set_targets(args->files, args->af_fname, 1, 3)<0 ) - error("Failed to read the targets: %s\n", args->af_fname); - } -- if ( !bcf_sr_add_reader(args->files, fname) ) 
error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->files->errnum)); -+ if ( !bcf_sr_add_reader(args->files, fname) ) -+ error("Failed to read from %s: %s\n", !strcmp("-",fname)?"standard input":fname,bcf_sr_strerror(args->files->errnum)); - - init_data(args); - while ( bcf_sr_next_line(args->files) ) ---- python-pysam.orig/bcftools/vcfcnv.c.pysam.c -+++ python-pysam/bcftools/vcfcnv.c.pysam.c -@@ -36,6 +36,7 @@ - #include - #include - #include -+#include - #include - #include - #include -@@ -228,9 +229,9 @@ - } - static void close_sample_files(sample_t *smpl) - { -- fclose(smpl->dat_fh); -- fclose(smpl->cn_fh); -- fclose(smpl->summary_fh); -+ if ( fclose(smpl->dat_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,smpl->dat_fname); -+ if ( fclose(smpl->cn_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,smpl->cn_fname); -+ if ( fclose(smpl->summary_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,smpl->summary_fname); - } - - static double norm_cdf(double mean, double dev); -@@ -1192,10 +1193,10 @@ - args->control_sample.lrr[args->nsites-1] = lrr2; - args->control_sample.baf[args->nsites-1] = baf2; - if ( baf2>=0 ) // skip missing values -- fprintf(args->control_sample.dat_fh,"%s\t%d\t%.3f\t%.3f\n",bcf_hdr_id2name(args->hdr,args->prev_rid), line->pos+1,baf2,lrr2); -+ fprintf(args->control_sample.dat_fh,"%s\t%"PRId64"\t%.3f\t%.3f\n",bcf_hdr_id2name(args->hdr,args->prev_rid), (int64_t) line->pos+1,baf2,lrr2); - } - if ( baf1>=0 ) // skip missing values -- fprintf(args->query_sample.dat_fh,"%s\t%d\t%.3f\t%.3f\n",bcf_hdr_id2name(args->hdr,args->prev_rid), line->pos+1,baf1,lrr1); -+ fprintf(args->query_sample.dat_fh,"%s\t%"PRId64"\t%.3f\t%.3f\n",bcf_hdr_id2name(args->hdr,args->prev_rid), (int64_t) line->pos+1,baf1,lrr1); - - if ( baf1>=0 ) - { -@@ -1279,13 +1280,13 @@ - {"LRR-weight",1,0,'l'}, - {"same-prob",1,0,'P'}, - {"xy-prob",1,0,'x'}, -- {"sample",1,0,'s'}, -- {"control",1,0,'c'}, -+ {"query-sample",1,0,'s'}, -+ 
{"control-sample",1,0,'c'}, - {"targets",1,0,'t'}, - {"targets-file",1,0,'T'}, - {"regions",1,0,'r'}, - {"regions-file",1,0,'R'}, -- {"plot",1,0,'p'}, -+ {"plot-threshold",1,0,'p'}, - {"output-dir",1,0,'o'}, - {0,0,0,0} - }; -@@ -1401,7 +1402,8 @@ - if ( bcf_sr_set_targets(args->files, args->af_fname, 1, 3)<0 ) - error("Failed to read the targets: %s\n", args->af_fname); - } -- if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->files->errnum)); -+ if ( !bcf_sr_add_reader(args->files, fname) ) -+ error("Failed to read from %s: %s\n", !strcmp("-",fname)?"standard input":fname,bcf_sr_strerror(args->files->errnum)); - - init_data(args); - while ( bcf_sr_next_line(args->files) ) ---- python-pysam.orig/bcftools/vcfconcat.c -+++ python-pysam/bcftools/vcfconcat.c -@@ -1,6 +1,6 @@ - /* vcfconcat.c -- Concatenate or combine VCF/BCF files. - -- Copyright (C) 2013-2015 Genome Research Ltd. -+ Copyright (C) 2013-2019 Genome Research Ltd. - - Author: Petr Danecek - -@@ -34,6 +34,8 @@ - #include - #include - #include // for hts_get_bgzfp() -+#include -+#include - #include "bcftools.h" - - typedef struct _args_t -@@ -53,7 +55,9 @@ - - char **argv, *output_fname, *file_list, **fnames, *remove_dups, *regions_list; - int argc, nfnames, allow_overlaps, phased_concat, regions_is_file; -- int compact_PS, phase_set_changed, naive_concat; -+ int compact_PS, phase_set_changed, naive_concat, naive_concat_trust_headers; -+ int verbose; -+ htsThreadPool *tpool; - } - args_t; - -@@ -70,6 +74,7 @@ - line = bcf_init(); - } - -+ if ( args->verbose ) fprintf(stderr,"Checking the headers and starting positions of %d files\n", args->nfnames); - kstring_t str = {0,0,0}; - int i, prev_chrid = -1; - for (i=0; infnames; i++) -@@ -97,7 +102,7 @@ - } - } - bcf_hdr_destroy(hdr); -- hts_close(fp); -+ if ( hts_close(fp)!=0 ) error("[%s] Error: close failed .. 
%s\n", __func__,args->fnames[i]); - } - free(str.s); - if ( line ) bcf_destroy(line); -@@ -112,14 +117,30 @@ - if (args->record_cmd_line) bcf_hdr_append_version(args->out_hdr, args->argc, args->argv, "bcftools_concat"); - args->out_fh = hts_open(args->output_fname,hts_bcf_wmode(args->output_type)); - if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno)); -- if ( args->n_threads ) hts_set_threads(args->out_fh, args->n_threads); -- -- bcf_hdr_write(args->out_fh, args->out_hdr); -- -- if ( args->allow_overlaps ) -+ if ( args->allow_overlaps || args->phased_concat ) - { - args->files = bcf_sr_init(); - args->files->require_index = 1; -+ } -+ if ( args->n_threads ) -+ { -+ if ( args->files ) -+ { -+ if ( bcf_sr_set_threads(args->files, args->n_threads)<0 ) error("Failed to create threads\n"); -+ args->tpool = args->files->p; -+ } -+ else -+ { -+ args->tpool = (htsThreadPool*) calloc(1, sizeof(htsThreadPool)); -+ if ( !args->tpool ) error("Failed to allocate memory\n"); -+ if ( !(args->tpool->pool = hts_tpool_init(args->n_threads)) ) error("Failed to initialize %d threads\n",args->n_threads); -+ } -+ hts_set_opt(args->out_fh, HTS_OPT_THREAD_POOL, args->tpool); -+ } -+ if ( bcf_hdr_write(args->out_fh, args->out_hdr)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->output_fname); -+ -+ if ( args->allow_overlaps ) -+ { - if ( args->regions_list ) - { - if ( bcf_sr_set_regions(args->files, args->regions_list, args->regions_is_file)<0 ) -@@ -167,8 +188,6 @@ - args->nmism = (int*) calloc(bcf_hdr_nsamples(args->out_hdr),sizeof(int)); - args->phase_qual = (int32_t*) malloc(bcf_hdr_nsamples(args->out_hdr)*sizeof(int32_t)); - args->phase_set = (int32_t*) malloc(bcf_hdr_nsamples(args->out_hdr)*sizeof(int32_t)); -- args->files = bcf_sr_init(); -- args->files->require_index = 1; - args->ifname = 0; - } - } -@@ -176,13 +195,16 @@ - static void destroy_data(args_t *args) - { - int i; -- for (i=0; infnames; i++) 
free(args->fnames[i]); -- free(args->fnames); -- if ( args->files ) bcf_sr_destroy(args->files); - if ( args->out_fh ) - { - if ( hts_close(args->out_fh)!=0 ) error("hts_close error\n"); - } -+ if ( args->tpool && !args->files ) -+ { -+ hts_tpool_destroy(args->tpool->pool); -+ free(args->tpool); -+ } -+ if ( args->files ) bcf_sr_destroy(args->files); - if ( args->out_hdr ) bcf_hdr_destroy(args->out_hdr); - free(args->seen_seq); - free(args->start_pos); -@@ -195,6 +217,8 @@ - free(args->nmism); - free(args->phase_qual); - free(args->phase_set); -+ for (i=0; infnames; i++) free(args->fnames[i]); -+ free(args->fnames); - } - - int vcf_write_line(htsFile *fp, kstring_t *line); -@@ -235,7 +259,7 @@ - { - if ( !gt_absent_warned ) - { -- fprintf(stderr,"GT is not present at %s:%d. (This warning is printed only once.)\n", bcf_seqname(ahdr,arec), arec->pos+1); -+ fprintf(stderr,"GT is not present at %s:%"PRId64". (This warning is printed only once.)\n", bcf_seqname(ahdr,arec), (int64_t) arec->pos+1); - gt_absent_warned = 1; - } - continue; -@@ -246,7 +270,7 @@ - { - if ( !gt_absent_warned ) - { -- fprintf(stderr,"GT is not present at %s:%d. (This warning is printed only once.)\n", bcf_seqname(bhdr,brec), brec->pos+1); -+ fprintf(stderr,"GT is not present at %s:%"PRId64". 
(This warning is printed only once.)\n", bcf_seqname(bhdr,brec), (int64_t) brec->pos+1); - gt_absent_warned = 1; - } - continue; -@@ -282,9 +306,9 @@ - bcf_update_format_int32(args->out_hdr,arec,"PS",args->phase_set,nsmpl); - args->phase_set_changed = 0; - } -- bcf_write(args->out_fh, args->out_hdr, arec); -+ if ( bcf_write(args->out_fh, args->out_hdr, arec)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); - -- if ( arec->pos < args->prev_pos_check ) error("FIXME, disorder: %s:%d vs %d [1]\n", bcf_seqname(args->files->readers[0].header,arec),arec->pos+1,args->prev_pos_check+1); -+ if ( arec->pos < args->prev_pos_check ) error("FIXME, disorder: %s:%"PRId64" vs %d [1]\n", bcf_seqname(args->files->readers[0].header,arec),(int64_t) arec->pos+1,args->prev_pos_check+1); - args->prev_pos_check = arec->pos; - } - args->nswap = 0; -@@ -332,9 +356,9 @@ - bcf_update_format_int32(args->out_hdr,brec,"PS",args->phase_set,nsmpl); - args->phase_set_changed = 0; - } -- bcf_write(args->out_fh, args->out_hdr, brec); -+ if ( bcf_write(args->out_fh, args->out_hdr, brec)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); - -- if ( brec->pos < args->prev_pos_check ) error("FIXME, disorder: %s:%d vs %d [2]\n", bcf_seqname(args->files->readers[1].header,brec),brec->pos+1,args->prev_pos_check+1); -+ if ( brec->pos < args->prev_pos_check ) error("FIXME, disorder: %s:%"PRId64" vs %d [2]\n", bcf_seqname(args->files->readers[1].header,brec),(int64_t) brec->pos+1,args->prev_pos_check+1); - args->prev_pos_check = brec->pos; - } - args->nbuf = 0; -@@ -343,9 +367,9 @@ - static void phased_push(args_t *args, bcf1_t *arec, bcf1_t *brec) - { - if ( arec && arec->errcode ) -- error("Parse error at %s:%d, cannot proceed: %s\n", bcf_seqname(args->files->readers[0].header,arec),arec->pos+1, args->files->readers[0].fname); -+ error("Parse error at %s:%"PRId64", cannot proceed: %s\n", bcf_seqname(args->files->readers[0].header,arec),(int64_t) 
arec->pos+1, args->files->readers[0].fname); - if ( brec && brec->errcode ) -- error("Parse error at %s:%d, cannot proceed: %s\n", bcf_seqname(args->files->readers[1].header,brec),brec->pos+1, args->files->readers[1].fname); -+ error("Parse error at %s:%"PRId64", cannot proceed: %s\n", bcf_seqname(args->files->readers[1].header,brec),(int64_t) brec->pos+1, args->files->readers[1].fname); - - int i, nsmpl = bcf_hdr_nsamples(args->out_hdr); - int chr_id = bcf_hdr_name2id(args->out_hdr, bcf_seqname(args->files->readers[0].header,arec)); -@@ -373,10 +397,10 @@ - bcf_update_format_int32(args->out_hdr,arec,"PS",args->phase_set,nsmpl); - args->phase_set_changed = 0; - } -- bcf_write(args->out_fh, args->out_hdr, arec); -+ if ( bcf_write(args->out_fh, args->out_hdr, arec)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); - - if ( arec->pos < args->prev_pos_check ) -- error("FIXME, disorder: %s:%d in %s vs %d written [3]\n", bcf_seqname(args->files->readers[0].header,arec), arec->pos+1,args->files->readers[0].fname, args->prev_pos_check+1); -+ error("FIXME, disorder: %s:%"PRId64" in %s vs %d written [3]\n", bcf_seqname(args->files->readers[0].header,arec), (int64_t) arec->pos+1,args->files->readers[0].fname, args->prev_pos_check+1); - args->prev_pos_check = arec->pos; - return; - } -@@ -393,6 +417,7 @@ - - static void concat(args_t *args) - { -+ static int site_drop_warned = 0; - int i; - if ( args->phased_concat ) // phased concat - { -@@ -429,8 +454,20 @@ - if ( !bcf_sr_has_line(args->files,0) ) // no input from the first reader - { - // We are assuming that there is a perfect overlap, sites which are not present in both files are dropped -- if ( ! bcf_sr_region_done(args->files,0) ) continue; -- -+ if ( ! bcf_sr_region_done(args->files,0) ) -+ { -+ if ( !site_drop_warned ) -+ { -+ fprintf(stderr, -+ "Warning: Dropping the site %s:%"PRId64". 
The --ligate option is intended for VCFs with perfect\n" -+ " overlap, sites in overlapping regions present in one but missing in other are dropped.\n" -+ " This warning is printed only once.\n", -+ bcf_seqname(bcf_sr_get_header(args->files,1),bcf_sr_get_line(args->files,1)), (int64_t) bcf_sr_get_line(args->files,1)->pos+1 -+ ); -+ site_drop_warned = 1; -+ } -+ continue; -+ } - phased_flush(args); - bcf_sr_remove_reader(args->files, 0); - } -@@ -483,20 +520,27 @@ - bcf1_t *line = bcf_sr_get_line(args->files,i); - if ( !line ) continue; - bcf_translate(args->out_hdr, args->files->readers[i].header, line); -- bcf_write1(args->out_fh, args->out_hdr, line); -+ if ( bcf_write1(args->out_fh, args->out_hdr, line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); - if ( args->remove_dups ) break; - } - } - } - else // concatenating - { -+ struct timeval t0, t1; - kstring_t tmp = {0,0,0}; - int prev_chr_id = -1, prev_pos; - bcf1_t *line = bcf_init(); - for (i=0; infnames; i++) - { -- htsFile *fp = hts_open(args->fnames[i], "r"); if ( !fp ) error("Failed to open: %s\n", args->fnames[i]); -- bcf_hdr_t *hdr = bcf_hdr_read(fp); if ( !hdr ) error("Failed to parse header: %s\n", args->fnames[i]); -+ if ( args->verbose ) -+ { -+ fprintf(stderr,"Concatenating %s", args->fnames[i]); -+ gettimeofday(&t0, NULL); -+ } -+ htsFile *fp = hts_open(args->fnames[i], "r"); if ( !fp ) error("\nFailed to open: %s\n", args->fnames[i]); -+ if ( args->n_threads ) hts_set_opt(fp, HTS_OPT_THREAD_POOL, args->tpool); -+ bcf_hdr_t *hdr = bcf_hdr_read(fp); if ( !hdr ) error("\nFailed to parse header: %s\n", args->fnames[i]); - if ( !fp->is_bin && args->output_type&FT_VCF ) - { - line->max_unpack = BCF_UN_STR; -@@ -508,7 +552,7 @@ - tmp.l = 0; - kputsn(fp->line.s,str-fp->line.s,&tmp); - int chr_id = bcf_hdr_name2id(args->out_hdr, tmp.s); -- if ( chr_id<0 ) error("The sequence \"%s\" not defined in the header: %s\n(Quick workaround: index the file.)\n", tmp.s, args->fnames[i]); 
-+ if ( chr_id<0 ) error("\nThe sequence \"%s\" not defined in the header: %s\n(Quick workaround: index the file.)\n", tmp.s, args->fnames[i]); - if ( prev_chr_id!=chr_id ) - { - prev_pos = -1; -@@ -519,11 +563,11 @@ - int pos = strtol(str+1,&end,10) - 1; - if ( end==str+1 ) error("Could not parse line: %s\n", fp->line.s); - if ( prev_pos > pos ) -- error("The chromosome block %s is not sorted, consider running with -a.\n", tmp.s); -+ error("\nThe chromosome block %s is not sorted, consider running with -a.\n", tmp.s); - args->seen_seq[chr_id] = 1; - prev_chr_id = chr_id; - -- if ( vcf_write_line(args->out_fh, &fp->line)!=0 ) error("Failed to write %"PRIu64" bytes\n", (uint64_t)fp->line.l); -+ if ( vcf_write_line(args->out_fh, &fp->line)!=0 ) error("\nFailed to write %"PRIu64" bytes\n", (uint64_t)fp->line.l); - } - } - else -@@ -541,15 +585,21 @@ - error("\nThe chromosome block %s is not contiguous, consider running with -a.\n", bcf_seqname(args->out_hdr, line)); - } - if ( prev_pos > line->pos ) -- error("The chromosome block %s is not sorted, consider running with -a.\n", bcf_seqname(args->out_hdr, line)); -+ error("\nThe chromosome block %s is not sorted, consider running with -a.\n", bcf_seqname(args->out_hdr, line)); - args->seen_seq[line->rid] = 1; - prev_chr_id = line->rid; - -- if ( bcf_write(args->out_fh, args->out_hdr, line)!=0 ) error("Failed to write\n"); -+ if ( bcf_write(args->out_fh, args->out_hdr, line)!=0 ) error("\nFailed to write\n"); - } - } - bcf_hdr_destroy(hdr); - hts_close(fp); -+ if ( args->verbose ) -+ { -+ gettimeofday(&t1, NULL); -+ double delta = (t1.tv_sec - t0.tv_sec) * 1e6 + (t1.tv_usec - t0.tv_usec); -+ fprintf(stderr,"\t%f seconds\n",delta/1e6); -+ } - } - bcf_destroy(line); - free(tmp.s); -@@ -612,63 +662,141 @@ - && header[12] == 'B' && header[13] == 'C' - && unpackInt16((uint8_t*)&header[14]) == 2) ? 
0 : -1; - } -+static void _check_hrecs(const bcf_hdr_t *hdr0, const bcf_hdr_t *hdr, char *fname0, char *fname) -+{ -+ int j; -+ for (j=0; jnhrec; j++) -+ { -+ bcf_hrec_t *hrec0 = hdr0->hrec[j]; -+ if ( hrec0->type!=BCF_HL_FLT && hrec0->type!=BCF_HL_INFO && hrec0->type!=BCF_HL_FMT && hrec0->type!=BCF_HL_CTG ) continue; // skip fiels w/o IDX -+ int itag = bcf_hrec_find_key(hrec0, "ID"); -+ bcf_hrec_t *hrec = bcf_hdr_get_hrec(hdr, hrec0->type, "ID", hrec0->vals[itag], NULL); -+ -+ char *type = NULL; -+ if ( hrec0->type==BCF_HL_FLT ) type = "FILTER"; -+ if ( hrec0->type==BCF_HL_INFO ) type = "INFO"; -+ if ( hrec0->type==BCF_HL_FMT ) type = "FORMAT"; -+ if ( hrec0->type==BCF_HL_CTG ) type = "contig"; -+ -+ if ( !hrec ) -+ error("Cannot use --naive, incompatible headers, the tag %s/%s not present in %s\n",type,hrec0->vals[itag],fname); -+ -+ int idx0 = bcf_hrec_find_key(hrec0, "IDX"); -+ int idx = bcf_hrec_find_key(hrec, "IDX"); -+ if ( idx0<0 || idx<0 ) -+ error("fixme: unexpected IDX<0 for %s/%s in %s or %s\n",type,hrec0->vals[itag],fname0,fname); -+ if ( strcmp(hrec0->vals[idx0],hrec->vals[idx]) ) -+ error("Cannot use --naive, use --naive-force instead: different order the tag %s/%s in %s vs %s\n",type,hrec0->vals[itag],fname0,fname); -+ } -+} -+static void naive_concat_check_headers(args_t *args) -+{ -+ fprintf(stderr,"Checking the headers of %d files.\n",args->nfnames); -+ bcf_hdr_t *hdr0 = NULL; -+ int i,j; -+ for (i=0; infnames; i++) -+ { -+ htsFile *fp = hts_open(args->fnames[i], "r"); if ( !fp ) error("Failed to open: %s\n", args->fnames[i]); -+ bcf_hdr_t *hdr = bcf_hdr_read(fp); if ( !hdr ) error("Failed to parse header: %s\n", args->fnames[i]); -+ htsFormat type = *hts_get_format(fp); -+ hts_close(fp); -+ -+ if ( i==0 ) -+ { -+ hdr0 = hdr; -+ continue; -+ } -+ -+ // check the samples -+ if ( bcf_hdr_nsamples(hdr0)!=bcf_hdr_nsamples(hdr) ) -+ error("Cannot concatenate, different number of samples: %d vs %d in %s vs 
%s\n",bcf_hdr_nsamples(hdr0),bcf_hdr_nsamples(hdr),args->fnames[0],args->fnames[i]); -+ for (j=0; jsamples[j],hdr->samples[j]) ) -+ error("Cannot concatenate, different samples in %s vs %s\n",args->fnames[0],args->fnames[i]); -+ -+ // if BCF, check if tag IDs are consistent in the dictionary of strings -+ if ( type.compression!=bgzf ) -+ error("The --naive option works only for compressed BCFs or VCFs, sorry :-/\n"); -+ if ( type.format==vcf ) -+ { -+ bcf_hdr_destroy(hdr); -+ continue; -+ } -+ -+ _check_hrecs(hdr0,hdr,args->fnames[0],args->fnames[i]); -+ _check_hrecs(hdr,hdr0,args->fnames[i],args->fnames[0]); -+ -+ bcf_hdr_destroy(hdr); -+ } -+ if ( hdr0 ) bcf_hdr_destroy(hdr0); -+ fprintf(stderr,"Done, the headers are compatible.\n"); -+} - static void naive_concat(args_t *args) - { -+ if ( !args->naive_concat_trust_headers ) -+ naive_concat_check_headers(args); -+ - // only compressed BCF atm - BGZF *bgzf_out = bgzf_open(args->output_fname,"w");; - -+ struct timeval t0, t1; - const size_t page_size = BGZF_MAX_BLOCK_SIZE; - uint8_t *buf = (uint8_t*) malloc(page_size); - kstring_t tmp = {0,0,0}; - int i, file_types = 0; - for (i=0; infnames; i++) - { -+ if ( args->verbose ) -+ { -+ fprintf(stderr,"Concatenating %s", args->fnames[i]); -+ gettimeofday(&t0, NULL); -+ } - htsFile *hts_fp = hts_open(args->fnames[i],"r"); -- if ( !hts_fp ) error("Failed to open: %s\n", args->fnames[i]); -+ if ( !hts_fp ) error("\nFailed to open: %s\n", args->fnames[i]); - htsFormat type = *hts_get_format(hts_fp); - - if ( type.compression!=bgzf ) -- error("The --naive option works only for compressed BCFs or VCFs, sorry :-/\n"); -+ error("\nThe --naive option works only for compressed BCFs or VCFs, sorry :-/\n"); - file_types |= type.format==vcf ? 
1 : 2; - if ( file_types==3 ) -- error("The --naive option works only for compressed files of the same type, all BCFs or all VCFs :-/\n"); -+ error("\nThe --naive option works only for compressed files of the same type, all BCFs or all VCFs :-/\n"); - - BGZF *fp = hts_get_bgzfp(hts_fp); - if ( !fp || bgzf_read_block(fp) != 0 || !fp->block_length ) -- error("Failed to read %s: %s\n", args->fnames[i], strerror(errno)); -+ error("\nFailed to read %s: %s\n", args->fnames[i], strerror(errno)); - - int nskip; - if ( type.format==bcf ) - { - uint8_t magic[5]; -- if ( bgzf_read(fp, magic, 5) != 5 ) error("Failed to read the BCF header in %s\n", args->fnames[i]); -- if (strncmp((char*)magic, "BCF\2\2", 5) != 0) error("Invalid BCF magic string in %s\n", args->fnames[i]); -+ if ( bgzf_read(fp, magic, 5) != 5 ) error("\nFailed to read the BCF header in %s\n", args->fnames[i]); -+ if (strncmp((char*)magic, "BCF\2\2", 5) != 0) error("\nInvalid BCF magic string in %s\n", args->fnames[i]); - -- if ( bgzf_read(fp, &tmp.l, 4) != 4 ) error("Failed to read the BCF header in %s\n", args->fnames[i]); -+ if ( bgzf_read(fp, &tmp.l, 4) != 4 ) error("\nFailed to read the BCF header in %s\n", args->fnames[i]); - hts_expand(char,tmp.l,tmp.m,tmp.s); -- if ( bgzf_read(fp, tmp.s, tmp.l) != tmp.l ) error("Failed to read the BCF header in %s\n", args->fnames[i]); -+ if ( bgzf_read(fp, tmp.s, tmp.l) != tmp.l ) error("\nFailed to read the BCF header in %s\n", args->fnames[i]); - - // write only the first header - if ( i==0 ) - { -- if ( bgzf_write(bgzf_out, "BCF\2\2", 5) !=5 ) error("Failed to write %d bytes to %s\n", 5,args->output_fname); -- if ( bgzf_write(bgzf_out, &tmp.l, 4) !=4 ) error("Failed to write %d bytes to %s\n", 4,args->output_fname); -- if ( bgzf_write(bgzf_out, tmp.s, tmp.l) != tmp.l) error("Failed to write %"PRId64" bytes to %s\n", (uint64_t)tmp.l,args->output_fname); -+ if ( bgzf_write(bgzf_out, "BCF\2\2", 5) !=5 ) error("\nFailed to write %d bytes to %s\n", 5,args->output_fname); 
-+ if ( bgzf_write(bgzf_out, &tmp.l, 4) !=4 ) error("\nFailed to write %d bytes to %s\n", 4,args->output_fname); -+ if ( bgzf_write(bgzf_out, tmp.s, tmp.l) != tmp.l) error("\nFailed to write %"PRId64" bytes to %s\n", (uint64_t)tmp.l,args->output_fname); - } - nskip = fp->block_offset; - } - else - { - nskip = print_vcf_gz_header(fp, bgzf_out, i==0?1:0, &tmp); -- if ( nskip==-1 ) error("Error reading %s\n", args->fnames[i]); -+ if ( nskip==-1 ) error("\nError reading %s\n", args->fnames[i]); - } - - // Output all non-header data that were read together with the header block - if ( fp->block_length - nskip > 0 ) - { -- if ( bgzf_write(bgzf_out, (char *)fp->uncompressed_block+nskip, fp->block_length-nskip)<0 ) error("Error: %d\n",fp->errcode); -+ if ( bgzf_write(bgzf_out, (char *)fp->uncompressed_block+nskip, fp->block_length-nskip)<0 ) error("\nError: %d\n",fp->errcode); - } -- if ( bgzf_flush(bgzf_out)<0 ) error("Error: %d\n",bgzf_out->errcode); -+ if ( bgzf_flush(bgzf_out)<0 ) error("\nError: %d\n",bgzf_out->errcode); - - - // Stream the rest of the file as it is, without recompressing, but remove BGZF EOF blocks -@@ -680,16 +808,22 @@ - { - nread = bgzf_raw_read(fp, buf, nheader); - if ( !nread ) break; -- if ( nread != nheader || check_header(buf)!=0 ) error("Could not parse the header of a bgzf block: %s\n",args->fnames[i]); -+ if ( nread != nheader || check_header(buf)!=0 ) error("\nCould not parse the header of a bgzf block: %s\n",args->fnames[i]); - nblock = unpackInt16(buf+16) + 1; - assert( nblock <= page_size && nblock >= nheader ); - nread += bgzf_raw_read(fp, buf+nheader, nblock - nheader); -- if ( nread!=nblock ) error("Could not read %"PRId64" bytes: %s\n",(uint64_t)nblock,args->fnames[i]); -+ if ( nread!=nblock ) error("\nCould not read %"PRId64" bytes: %s\n",(uint64_t)nblock,args->fnames[i]); - if ( nread==neof && !memcmp(buf,eof,neof) ) continue; - nwr = bgzf_raw_write(bgzf_out, buf, nread); -- if ( nwr != nread ) error("Write failed, wrote 
%"PRId64" instead of %d bytes.\n", (uint64_t)nwr,(int)nread); -+ if ( nwr != nread ) error("\nWrite failed, wrote %"PRId64" instead of %d bytes.\n", (uint64_t)nwr,(int)nread); -+ } -+ if (hts_close(hts_fp)) error("\nClose failed: %s\n",args->fnames[i]); -+ if ( args->verbose ) -+ { -+ gettimeofday(&t1, NULL); -+ double delta = (t1.tv_sec - t0.tv_sec) * 1e6 + (t1.tv_usec - t0.tv_usec); -+ fprintf(stderr,"\t%f seconds\n",delta/1e6); - } -- if (hts_close(hts_fp)) error("Close failed: %s\n",args->fnames[i]); - } - free(buf); - free(tmp.s); -@@ -705,8 +839,7 @@ - fprintf(stderr, " VCF into one. The input files must be sorted by chr and position. The files\n"); - fprintf(stderr, " must be given in the correct order to produce sorted VCF on output unless\n"); - fprintf(stderr, " the -a, --allow-overlaps option is specified. With the --naive option, the files\n"); -- fprintf(stderr, " are concatenated without being recompressed, which is very fast but dangerous\n"); -- fprintf(stderr, " if the BCF headers differ.\n"); -+ fprintf(stderr, " are concatenated without being recompressed, which is very fast.\n"); - fprintf(stderr, "Usage: bcftools concat [options] [ [...]]\n"); - fprintf(stderr, "\n"); - fprintf(stderr, "Options:\n"); -@@ -717,13 +850,15 @@ - fprintf(stderr, " -f, --file-list Read the list of files from a file.\n"); - fprintf(stderr, " -l, --ligate Ligate phased VCFs by matching phase at overlapping haplotypes\n"); - fprintf(stderr, " --no-version Do not append version and command line to the header\n"); -- fprintf(stderr, " -n, --naive Concatenate files without recompression (dangerous, use with caution)\n"); -+ fprintf(stderr, " -n, --naive Concatenate files without recompression, a header check compatibility is performed\n"); -+ fprintf(stderr, " --naive-force Same as --naive, but header compatibility is not checked. 
Dangerous, use with caution.\n"); - fprintf(stderr, " -o, --output Write output to a file [standard output]\n"); - fprintf(stderr, " -O, --output-type b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n"); - fprintf(stderr, " -q, --min-PQ Break phase set if phasing quality is lower than [30]\n"); - fprintf(stderr, " -r, --regions Restrict to comma-separated list of regions\n"); - fprintf(stderr, " -R, --regions-file Restrict to regions listed in a file\n"); -- fprintf(stderr, " --threads Number of extra output compression threads [0]\n"); -+ fprintf(stderr, " --threads Use multithreading with worker threads [0]\n"); -+ fprintf(stderr, " -v, --verbose <0|1> Set verbosity level [1]\n"); - fprintf(stderr, "\n"); - exit(1); - } -@@ -738,10 +873,13 @@ - args->n_threads = 0; - args->record_cmd_line = 1; - args->min_PQ = 30; -+ args->verbose = 1; - - static struct option loptions[] = - { -+ {"verbose",required_argument,NULL,'v'}, - {"naive",no_argument,NULL,'n'}, -+ {"naive-force",no_argument,NULL,7}, - {"compact-PS",no_argument,NULL,'c'}, - {"regions",required_argument,NULL,'r'}, - {"regions-file",required_argument,NULL,'R'}, -@@ -758,7 +896,7 @@ - {NULL,0,NULL,0} - }; - char *tmp; -- while ((c = getopt_long(argc, argv, "h:?o:O:f:alq:Dd:r:R:cn",loptions,NULL)) >= 0) -+ while ((c = getopt_long(argc, argv, "h:?o:O:f:alq:Dd:r:R:cnv:",loptions,NULL)) >= 0) - { - switch (c) { - case 'c': args->compact_PS = 1; break; -@@ -786,6 +924,11 @@ - break; - case 9 : args->n_threads = strtol(optarg, 0, 0); break; - case 8 : args->record_cmd_line = 0; break; -+ case 7 : args->naive_concat = 1; args->naive_concat_trust_headers = 1; break; -+ case 'v': -+ args->verbose = strtol(optarg, 0, 0); -+ error("Error: currently only --verbose 0 or --verbose 1 is supported\n"); -+ break; - case 'h': - case '?': usage(args); break; - default: error("Unknown argument: %s\n", optarg); -@@ -798,7 +941,7 @@ - args->fnames[args->nfnames-1] = strdup(argv[optind]); - 
optind++; - } -- if ( args->allow_overlaps && args->phased_concat ) args->allow_overlaps = 0; -+ if ( args->allow_overlaps && args->phased_concat ) error("The options -a and -l should not be combined. Please run with -l only.\n"); - if ( args->compact_PS && !args->phased_concat ) error("The -c option is intended only with -l\n"); - if ( args->file_list ) - { ---- python-pysam.orig/bcftools/vcfconcat.c.pysam.c -+++ python-pysam/bcftools/vcfconcat.c.pysam.c -@@ -2,7 +2,7 @@ - - /* vcfconcat.c -- Concatenate or combine VCF/BCF files. - -- Copyright (C) 2013-2015 Genome Research Ltd. -+ Copyright (C) 2013-2019 Genome Research Ltd. - - Author: Petr Danecek - -@@ -36,6 +36,8 @@ - #include - #include - #include // for hts_get_bgzfp() -+#include -+#include - #include "bcftools.h" - - typedef struct _args_t -@@ -55,7 +57,9 @@ - - char **argv, *output_fname, *file_list, **fnames, *remove_dups, *regions_list; - int argc, nfnames, allow_overlaps, phased_concat, regions_is_file; -- int compact_PS, phase_set_changed, naive_concat; -+ int compact_PS, phase_set_changed, naive_concat, naive_concat_trust_headers; -+ int verbose; -+ htsThreadPool *tpool; - } - args_t; - -@@ -72,6 +76,7 @@ - line = bcf_init(); - } - -+ if ( args->verbose ) fprintf(bcftools_stderr,"Checking the headers and starting positions of %d files\n", args->nfnames); - kstring_t str = {0,0,0}; - int i, prev_chrid = -1; - for (i=0; infnames; i++) -@@ -99,7 +104,7 @@ - } - } - bcf_hdr_destroy(hdr); -- hts_close(fp); -+ if ( hts_close(fp)!=0 ) error("[%s] Error: close failed .. 
%s\n", __func__,args->fnames[i]); - } - free(str.s); - if ( line ) bcf_destroy(line); -@@ -114,14 +119,30 @@ - if (args->record_cmd_line) bcf_hdr_append_version(args->out_hdr, args->argc, args->argv, "bcftools_concat"); - args->out_fh = hts_open(args->output_fname,hts_bcf_wmode(args->output_type)); - if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno)); -- if ( args->n_threads ) hts_set_threads(args->out_fh, args->n_threads); -- -- bcf_hdr_write(args->out_fh, args->out_hdr); -- -- if ( args->allow_overlaps ) -+ if ( args->allow_overlaps || args->phased_concat ) - { - args->files = bcf_sr_init(); - args->files->require_index = 1; -+ } -+ if ( args->n_threads ) -+ { -+ if ( args->files ) -+ { -+ if ( bcf_sr_set_threads(args->files, args->n_threads)<0 ) error("Failed to create threads\n"); -+ args->tpool = args->files->p; -+ } -+ else -+ { -+ args->tpool = (htsThreadPool*) calloc(1, sizeof(htsThreadPool)); -+ if ( !args->tpool ) error("Failed to allocate memory\n"); -+ if ( !(args->tpool->pool = hts_tpool_init(args->n_threads)) ) error("Failed to initialize %d threads\n",args->n_threads); -+ } -+ hts_set_opt(args->out_fh, HTS_OPT_THREAD_POOL, args->tpool); -+ } -+ if ( bcf_hdr_write(args->out_fh, args->out_hdr)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->output_fname); -+ -+ if ( args->allow_overlaps ) -+ { - if ( args->regions_list ) - { - if ( bcf_sr_set_regions(args->files, args->regions_list, args->regions_is_file)<0 ) -@@ -169,8 +190,6 @@ - args->nmism = (int*) calloc(bcf_hdr_nsamples(args->out_hdr),sizeof(int)); - args->phase_qual = (int32_t*) malloc(bcf_hdr_nsamples(args->out_hdr)*sizeof(int32_t)); - args->phase_set = (int32_t*) malloc(bcf_hdr_nsamples(args->out_hdr)*sizeof(int32_t)); -- args->files = bcf_sr_init(); -- args->files->require_index = 1; - args->ifname = 0; - } - } -@@ -178,13 +197,16 @@ - static void destroy_data(args_t *args) - { - int i; -- for (i=0; infnames; i++) 
free(args->fnames[i]); -- free(args->fnames); -- if ( args->files ) bcf_sr_destroy(args->files); - if ( args->out_fh ) - { - if ( hts_close(args->out_fh)!=0 ) error("hts_close error\n"); - } -+ if ( args->tpool && !args->files ) -+ { -+ hts_tpool_destroy(args->tpool->pool); -+ free(args->tpool); -+ } -+ if ( args->files ) bcf_sr_destroy(args->files); - if ( args->out_hdr ) bcf_hdr_destroy(args->out_hdr); - free(args->seen_seq); - free(args->start_pos); -@@ -197,6 +219,8 @@ - free(args->nmism); - free(args->phase_qual); - free(args->phase_set); -+ for (i=0; infnames; i++) free(args->fnames[i]); -+ free(args->fnames); - } - - int vcf_write_line(htsFile *fp, kstring_t *line); -@@ -237,7 +261,7 @@ - { - if ( !gt_absent_warned ) - { -- fprintf(bcftools_stderr,"GT is not present at %s:%d. (This warning is printed only once.)\n", bcf_seqname(ahdr,arec), arec->pos+1); -+ fprintf(bcftools_stderr,"GT is not present at %s:%"PRId64". (This warning is printed only once.)\n", bcf_seqname(ahdr,arec), (int64_t) arec->pos+1); - gt_absent_warned = 1; - } - continue; -@@ -248,7 +272,7 @@ - { - if ( !gt_absent_warned ) - { -- fprintf(bcftools_stderr,"GT is not present at %s:%d. (This warning is printed only once.)\n", bcf_seqname(bhdr,brec), brec->pos+1); -+ fprintf(bcftools_stderr,"GT is not present at %s:%"PRId64". 
(This warning is printed only once.)\n", bcf_seqname(bhdr,brec), (int64_t) brec->pos+1); - gt_absent_warned = 1; - } - continue; -@@ -284,9 +308,9 @@ - bcf_update_format_int32(args->out_hdr,arec,"PS",args->phase_set,nsmpl); - args->phase_set_changed = 0; - } -- bcf_write(args->out_fh, args->out_hdr, arec); -+ if ( bcf_write(args->out_fh, args->out_hdr, arec)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); - -- if ( arec->pos < args->prev_pos_check ) error("FIXME, disorder: %s:%d vs %d [1]\n", bcf_seqname(args->files->readers[0].header,arec),arec->pos+1,args->prev_pos_check+1); -+ if ( arec->pos < args->prev_pos_check ) error("FIXME, disorder: %s:%"PRId64" vs %d [1]\n", bcf_seqname(args->files->readers[0].header,arec),(int64_t) arec->pos+1,args->prev_pos_check+1); - args->prev_pos_check = arec->pos; - } - args->nswap = 0; -@@ -334,9 +358,9 @@ - bcf_update_format_int32(args->out_hdr,brec,"PS",args->phase_set,nsmpl); - args->phase_set_changed = 0; - } -- bcf_write(args->out_fh, args->out_hdr, brec); -+ if ( bcf_write(args->out_fh, args->out_hdr, brec)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); - -- if ( brec->pos < args->prev_pos_check ) error("FIXME, disorder: %s:%d vs %d [2]\n", bcf_seqname(args->files->readers[1].header,brec),brec->pos+1,args->prev_pos_check+1); -+ if ( brec->pos < args->prev_pos_check ) error("FIXME, disorder: %s:%"PRId64" vs %d [2]\n", bcf_seqname(args->files->readers[1].header,brec),(int64_t) brec->pos+1,args->prev_pos_check+1); - args->prev_pos_check = brec->pos; - } - args->nbuf = 0; -@@ -345,9 +369,9 @@ - static void phased_push(args_t *args, bcf1_t *arec, bcf1_t *brec) - { - if ( arec && arec->errcode ) -- error("Parse error at %s:%d, cannot proceed: %s\n", bcf_seqname(args->files->readers[0].header,arec),arec->pos+1, args->files->readers[0].fname); -+ error("Parse error at %s:%"PRId64", cannot proceed: %s\n", bcf_seqname(args->files->readers[0].header,arec),(int64_t) 
arec->pos+1, args->files->readers[0].fname); - if ( brec && brec->errcode ) -- error("Parse error at %s:%d, cannot proceed: %s\n", bcf_seqname(args->files->readers[1].header,brec),brec->pos+1, args->files->readers[1].fname); -+ error("Parse error at %s:%"PRId64", cannot proceed: %s\n", bcf_seqname(args->files->readers[1].header,brec),(int64_t) brec->pos+1, args->files->readers[1].fname); - - int i, nsmpl = bcf_hdr_nsamples(args->out_hdr); - int chr_id = bcf_hdr_name2id(args->out_hdr, bcf_seqname(args->files->readers[0].header,arec)); -@@ -375,10 +399,10 @@ - bcf_update_format_int32(args->out_hdr,arec,"PS",args->phase_set,nsmpl); - args->phase_set_changed = 0; - } -- bcf_write(args->out_fh, args->out_hdr, arec); -+ if ( bcf_write(args->out_fh, args->out_hdr, arec)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); - - if ( arec->pos < args->prev_pos_check ) -- error("FIXME, disorder: %s:%d in %s vs %d written [3]\n", bcf_seqname(args->files->readers[0].header,arec), arec->pos+1,args->files->readers[0].fname, args->prev_pos_check+1); -+ error("FIXME, disorder: %s:%"PRId64" in %s vs %d written [3]\n", bcf_seqname(args->files->readers[0].header,arec), (int64_t) arec->pos+1,args->files->readers[0].fname, args->prev_pos_check+1); - args->prev_pos_check = arec->pos; - return; - } -@@ -395,6 +419,7 @@ - - static void concat(args_t *args) - { -+ static int site_drop_warned = 0; - int i; - if ( args->phased_concat ) // phased concat - { -@@ -431,8 +456,20 @@ - if ( !bcf_sr_has_line(args->files,0) ) // no input from the first reader - { - // We are assuming that there is a perfect overlap, sites which are not present in both files are dropped -- if ( ! bcf_sr_region_done(args->files,0) ) continue; -- -+ if ( ! bcf_sr_region_done(args->files,0) ) -+ { -+ if ( !site_drop_warned ) -+ { -+ fprintf(bcftools_stderr, -+ "Warning: Dropping the site %s:%"PRId64". 
The --ligate option is intended for VCFs with perfect\n" -+ " overlap, sites in overlapping regions present in one but missing in other are dropped.\n" -+ " This warning is printed only once.\n", -+ bcf_seqname(bcf_sr_get_header(args->files,1),bcf_sr_get_line(args->files,1)), (int64_t) bcf_sr_get_line(args->files,1)->pos+1 -+ ); -+ site_drop_warned = 1; -+ } -+ continue; -+ } - phased_flush(args); - bcf_sr_remove_reader(args->files, 0); - } -@@ -485,20 +522,27 @@ - bcf1_t *line = bcf_sr_get_line(args->files,i); - if ( !line ) continue; - bcf_translate(args->out_hdr, args->files->readers[i].header, line); -- bcf_write1(args->out_fh, args->out_hdr, line); -+ if ( bcf_write1(args->out_fh, args->out_hdr, line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); - if ( args->remove_dups ) break; - } - } - } - else // concatenating - { -+ struct timeval t0, t1; - kstring_t tmp = {0,0,0}; - int prev_chr_id = -1, prev_pos; - bcf1_t *line = bcf_init(); - for (i=0; infnames; i++) - { -- htsFile *fp = hts_open(args->fnames[i], "r"); if ( !fp ) error("Failed to open: %s\n", args->fnames[i]); -- bcf_hdr_t *hdr = bcf_hdr_read(fp); if ( !hdr ) error("Failed to parse header: %s\n", args->fnames[i]); -+ if ( args->verbose ) -+ { -+ fprintf(bcftools_stderr,"Concatenating %s", args->fnames[i]); -+ gettimeofday(&t0, NULL); -+ } -+ htsFile *fp = hts_open(args->fnames[i], "r"); if ( !fp ) error("\nFailed to open: %s\n", args->fnames[i]); -+ if ( args->n_threads ) hts_set_opt(fp, HTS_OPT_THREAD_POOL, args->tpool); -+ bcf_hdr_t *hdr = bcf_hdr_read(fp); if ( !hdr ) error("\nFailed to parse header: %s\n", args->fnames[i]); - if ( !fp->is_bin && args->output_type&FT_VCF ) - { - line->max_unpack = BCF_UN_STR; -@@ -510,7 +554,7 @@ - tmp.l = 0; - kputsn(fp->line.s,str-fp->line.s,&tmp); - int chr_id = bcf_hdr_name2id(args->out_hdr, tmp.s); -- if ( chr_id<0 ) error("The sequence \"%s\" not defined in the header: %s\n(Quick workaround: index the file.)\n", tmp.s, 
args->fnames[i]); -+ if ( chr_id<0 ) error("\nThe sequence \"%s\" not defined in the header: %s\n(Quick workaround: index the file.)\n", tmp.s, args->fnames[i]); - if ( prev_chr_id!=chr_id ) - { - prev_pos = -1; -@@ -521,11 +565,11 @@ - int pos = strtol(str+1,&end,10) - 1; - if ( end==str+1 ) error("Could not parse line: %s\n", fp->line.s); - if ( prev_pos > pos ) -- error("The chromosome block %s is not sorted, consider running with -a.\n", tmp.s); -+ error("\nThe chromosome block %s is not sorted, consider running with -a.\n", tmp.s); - args->seen_seq[chr_id] = 1; - prev_chr_id = chr_id; - -- if ( vcf_write_line(args->out_fh, &fp->line)!=0 ) error("Failed to write %"PRIu64" bytes\n", (uint64_t)fp->line.l); -+ if ( vcf_write_line(args->out_fh, &fp->line)!=0 ) error("\nFailed to write %"PRIu64" bytes\n", (uint64_t)fp->line.l); - } - } - else -@@ -543,15 +587,21 @@ - error("\nThe chromosome block %s is not contiguous, consider running with -a.\n", bcf_seqname(args->out_hdr, line)); - } - if ( prev_pos > line->pos ) -- error("The chromosome block %s is not sorted, consider running with -a.\n", bcf_seqname(args->out_hdr, line)); -+ error("\nThe chromosome block %s is not sorted, consider running with -a.\n", bcf_seqname(args->out_hdr, line)); - args->seen_seq[line->rid] = 1; - prev_chr_id = line->rid; - -- if ( bcf_write(args->out_fh, args->out_hdr, line)!=0 ) error("Failed to write\n"); -+ if ( bcf_write(args->out_fh, args->out_hdr, line)!=0 ) error("\nFailed to write\n"); - } - } - bcf_hdr_destroy(hdr); - hts_close(fp); -+ if ( args->verbose ) -+ { -+ gettimeofday(&t1, NULL); -+ double delta = (t1.tv_sec - t0.tv_sec) * 1e6 + (t1.tv_usec - t0.tv_usec); -+ fprintf(bcftools_stderr,"\t%f seconds\n",delta/1e6); -+ } - } - bcf_destroy(line); - free(tmp.s); -@@ -614,63 +664,141 @@ - && header[12] == 'B' && header[13] == 'C' - && unpackInt16((uint8_t*)&header[14]) == 2) ? 
0 : -1; - } -+static void _check_hrecs(const bcf_hdr_t *hdr0, const bcf_hdr_t *hdr, char *fname0, char *fname) -+{ -+ int j; -+ for (j=0; jnhrec; j++) -+ { -+ bcf_hrec_t *hrec0 = hdr0->hrec[j]; -+ if ( hrec0->type!=BCF_HL_FLT && hrec0->type!=BCF_HL_INFO && hrec0->type!=BCF_HL_FMT && hrec0->type!=BCF_HL_CTG ) continue; // skip fiels w/o IDX -+ int itag = bcf_hrec_find_key(hrec0, "ID"); -+ bcf_hrec_t *hrec = bcf_hdr_get_hrec(hdr, hrec0->type, "ID", hrec0->vals[itag], NULL); -+ -+ char *type = NULL; -+ if ( hrec0->type==BCF_HL_FLT ) type = "FILTER"; -+ if ( hrec0->type==BCF_HL_INFO ) type = "INFO"; -+ if ( hrec0->type==BCF_HL_FMT ) type = "FORMAT"; -+ if ( hrec0->type==BCF_HL_CTG ) type = "contig"; -+ -+ if ( !hrec ) -+ error("Cannot use --naive, incompatible headers, the tag %s/%s not present in %s\n",type,hrec0->vals[itag],fname); -+ -+ int idx0 = bcf_hrec_find_key(hrec0, "IDX"); -+ int idx = bcf_hrec_find_key(hrec, "IDX"); -+ if ( idx0<0 || idx<0 ) -+ error("fixme: unexpected IDX<0 for %s/%s in %s or %s\n",type,hrec0->vals[itag],fname0,fname); -+ if ( strcmp(hrec0->vals[idx0],hrec->vals[idx]) ) -+ error("Cannot use --naive, use --naive-force instead: different order the tag %s/%s in %s vs %s\n",type,hrec0->vals[itag],fname0,fname); -+ } -+} -+static void naive_concat_check_headers(args_t *args) -+{ -+ fprintf(bcftools_stderr,"Checking the headers of %d files.\n",args->nfnames); -+ bcf_hdr_t *hdr0 = NULL; -+ int i,j; -+ for (i=0; infnames; i++) -+ { -+ htsFile *fp = hts_open(args->fnames[i], "r"); if ( !fp ) error("Failed to open: %s\n", args->fnames[i]); -+ bcf_hdr_t *hdr = bcf_hdr_read(fp); if ( !hdr ) error("Failed to parse header: %s\n", args->fnames[i]); -+ htsFormat type = *hts_get_format(fp); -+ hts_close(fp); -+ -+ if ( i==0 ) -+ { -+ hdr0 = hdr; -+ continue; -+ } -+ -+ // check the samples -+ if ( bcf_hdr_nsamples(hdr0)!=bcf_hdr_nsamples(hdr) ) -+ error("Cannot concatenate, different number of samples: %d vs %d in %s vs 
%s\n",bcf_hdr_nsamples(hdr0),bcf_hdr_nsamples(hdr),args->fnames[0],args->fnames[i]); -+ for (j=0; jsamples[j],hdr->samples[j]) ) -+ error("Cannot concatenate, different samples in %s vs %s\n",args->fnames[0],args->fnames[i]); -+ -+ // if BCF, check if tag IDs are consistent in the dictionary of strings -+ if ( type.compression!=bgzf ) -+ error("The --naive option works only for compressed BCFs or VCFs, sorry :-/\n"); -+ if ( type.format==vcf ) -+ { -+ bcf_hdr_destroy(hdr); -+ continue; -+ } -+ -+ _check_hrecs(hdr0,hdr,args->fnames[0],args->fnames[i]); -+ _check_hrecs(hdr,hdr0,args->fnames[i],args->fnames[0]); -+ -+ bcf_hdr_destroy(hdr); -+ } -+ if ( hdr0 ) bcf_hdr_destroy(hdr0); -+ fprintf(bcftools_stderr,"Done, the headers are compatible.\n"); -+} - static void naive_concat(args_t *args) - { -+ if ( !args->naive_concat_trust_headers ) -+ naive_concat_check_headers(args); -+ - // only compressed BCF atm - BGZF *bgzf_out = bgzf_open(args->output_fname,"w");; - -+ struct timeval t0, t1; - const size_t page_size = BGZF_MAX_BLOCK_SIZE; - uint8_t *buf = (uint8_t*) malloc(page_size); - kstring_t tmp = {0,0,0}; - int i, file_types = 0; - for (i=0; infnames; i++) - { -+ if ( args->verbose ) -+ { -+ fprintf(bcftools_stderr,"Concatenating %s", args->fnames[i]); -+ gettimeofday(&t0, NULL); -+ } - htsFile *hts_fp = hts_open(args->fnames[i],"r"); -- if ( !hts_fp ) error("Failed to open: %s\n", args->fnames[i]); -+ if ( !hts_fp ) error("\nFailed to open: %s\n", args->fnames[i]); - htsFormat type = *hts_get_format(hts_fp); - - if ( type.compression!=bgzf ) -- error("The --naive option works only for compressed BCFs or VCFs, sorry :-/\n"); -+ error("\nThe --naive option works only for compressed BCFs or VCFs, sorry :-/\n"); - file_types |= type.format==vcf ? 
1 : 2; - if ( file_types==3 ) -- error("The --naive option works only for compressed files of the same type, all BCFs or all VCFs :-/\n"); -+ error("\nThe --naive option works only for compressed files of the same type, all BCFs or all VCFs :-/\n"); - - BGZF *fp = hts_get_bgzfp(hts_fp); - if ( !fp || bgzf_read_block(fp) != 0 || !fp->block_length ) -- error("Failed to read %s: %s\n", args->fnames[i], strerror(errno)); -+ error("\nFailed to read %s: %s\n", args->fnames[i], strerror(errno)); - - int nskip; - if ( type.format==bcf ) - { - uint8_t magic[5]; -- if ( bgzf_read(fp, magic, 5) != 5 ) error("Failed to read the BCF header in %s\n", args->fnames[i]); -- if (strncmp((char*)magic, "BCF\2\2", 5) != 0) error("Invalid BCF magic string in %s\n", args->fnames[i]); -+ if ( bgzf_read(fp, magic, 5) != 5 ) error("\nFailed to read the BCF header in %s\n", args->fnames[i]); -+ if (strncmp((char*)magic, "BCF\2\2", 5) != 0) error("\nInvalid BCF magic string in %s\n", args->fnames[i]); - -- if ( bgzf_read(fp, &tmp.l, 4) != 4 ) error("Failed to read the BCF header in %s\n", args->fnames[i]); -+ if ( bgzf_read(fp, &tmp.l, 4) != 4 ) error("\nFailed to read the BCF header in %s\n", args->fnames[i]); - hts_expand(char,tmp.l,tmp.m,tmp.s); -- if ( bgzf_read(fp, tmp.s, tmp.l) != tmp.l ) error("Failed to read the BCF header in %s\n", args->fnames[i]); -+ if ( bgzf_read(fp, tmp.s, tmp.l) != tmp.l ) error("\nFailed to read the BCF header in %s\n", args->fnames[i]); - - // write only the first header - if ( i==0 ) - { -- if ( bgzf_write(bgzf_out, "BCF\2\2", 5) !=5 ) error("Failed to write %d bytes to %s\n", 5,args->output_fname); -- if ( bgzf_write(bgzf_out, &tmp.l, 4) !=4 ) error("Failed to write %d bytes to %s\n", 4,args->output_fname); -- if ( bgzf_write(bgzf_out, tmp.s, tmp.l) != tmp.l) error("Failed to write %"PRId64" bytes to %s\n", (uint64_t)tmp.l,args->output_fname); -+ if ( bgzf_write(bgzf_out, "BCF\2\2", 5) !=5 ) error("\nFailed to write %d bytes to %s\n", 5,args->output_fname); 
-+ if ( bgzf_write(bgzf_out, &tmp.l, 4) !=4 ) error("\nFailed to write %d bytes to %s\n", 4,args->output_fname); -+ if ( bgzf_write(bgzf_out, tmp.s, tmp.l) != tmp.l) error("\nFailed to write %"PRId64" bytes to %s\n", (uint64_t)tmp.l,args->output_fname); - } - nskip = fp->block_offset; - } - else - { - nskip = print_vcf_gz_header(fp, bgzf_out, i==0?1:0, &tmp); -- if ( nskip==-1 ) error("Error reading %s\n", args->fnames[i]); -+ if ( nskip==-1 ) error("\nError reading %s\n", args->fnames[i]); - } - - // Output all non-header data that were read together with the header block - if ( fp->block_length - nskip > 0 ) - { -- if ( bgzf_write(bgzf_out, (char *)fp->uncompressed_block+nskip, fp->block_length-nskip)<0 ) error("Error: %d\n",fp->errcode); -+ if ( bgzf_write(bgzf_out, (char *)fp->uncompressed_block+nskip, fp->block_length-nskip)<0 ) error("\nError: %d\n",fp->errcode); - } -- if ( bgzf_flush(bgzf_out)<0 ) error("Error: %d\n",bgzf_out->errcode); -+ if ( bgzf_flush(bgzf_out)<0 ) error("\nError: %d\n",bgzf_out->errcode); - - - // Stream the rest of the file as it is, without recompressing, but remove BGZF EOF blocks -@@ -682,16 +810,22 @@ - { - nread = bgzf_raw_read(fp, buf, nheader); - if ( !nread ) break; -- if ( nread != nheader || check_header(buf)!=0 ) error("Could not parse the header of a bgzf block: %s\n",args->fnames[i]); -+ if ( nread != nheader || check_header(buf)!=0 ) error("\nCould not parse the header of a bgzf block: %s\n",args->fnames[i]); - nblock = unpackInt16(buf+16) + 1; - assert( nblock <= page_size && nblock >= nheader ); - nread += bgzf_raw_read(fp, buf+nheader, nblock - nheader); -- if ( nread!=nblock ) error("Could not read %"PRId64" bytes: %s\n",(uint64_t)nblock,args->fnames[i]); -+ if ( nread!=nblock ) error("\nCould not read %"PRId64" bytes: %s\n",(uint64_t)nblock,args->fnames[i]); - if ( nread==neof && !memcmp(buf,eof,neof) ) continue; - nwr = bgzf_raw_write(bgzf_out, buf, nread); -- if ( nwr != nread ) error("Write failed, wrote 
%"PRId64" instead of %d bytes.\n", (uint64_t)nwr,(int)nread); -+ if ( nwr != nread ) error("\nWrite failed, wrote %"PRId64" instead of %d bytes.\n", (uint64_t)nwr,(int)nread); -+ } -+ if (hts_close(hts_fp)) error("\nClose failed: %s\n",args->fnames[i]); -+ if ( args->verbose ) -+ { -+ gettimeofday(&t1, NULL); -+ double delta = (t1.tv_sec - t0.tv_sec) * 1e6 + (t1.tv_usec - t0.tv_usec); -+ fprintf(bcftools_stderr,"\t%f seconds\n",delta/1e6); - } -- if (hts_close(hts_fp)) error("Close failed: %s\n",args->fnames[i]); - } - free(buf); - free(tmp.s); -@@ -707,8 +841,7 @@ - fprintf(bcftools_stderr, " VCF into one. The input files must be sorted by chr and position. The files\n"); - fprintf(bcftools_stderr, " must be given in the correct order to produce sorted VCF on output unless\n"); - fprintf(bcftools_stderr, " the -a, --allow-overlaps option is specified. With the --naive option, the files\n"); -- fprintf(bcftools_stderr, " are concatenated without being recompressed, which is very fast but dangerous\n"); -- fprintf(bcftools_stderr, " if the BCF headers differ.\n"); -+ fprintf(bcftools_stderr, " are concatenated without being recompressed, which is very fast.\n"); - fprintf(bcftools_stderr, "Usage: bcftools concat [options] [ [...]]\n"); - fprintf(bcftools_stderr, "\n"); - fprintf(bcftools_stderr, "Options:\n"); -@@ -719,13 +852,15 @@ - fprintf(bcftools_stderr, " -f, --file-list Read the list of files from a file.\n"); - fprintf(bcftools_stderr, " -l, --ligate Ligate phased VCFs by matching phase at overlapping haplotypes\n"); - fprintf(bcftools_stderr, " --no-version Do not append version and command line to the header\n"); -- fprintf(bcftools_stderr, " -n, --naive Concatenate files without recompression (dangerous, use with caution)\n"); -+ fprintf(bcftools_stderr, " -n, --naive Concatenate files without recompression, a header check compatibility is performed\n"); -+ fprintf(bcftools_stderr, " --naive-force Same as --naive, but header compatibility is not checked. 
Dangerous, use with caution.\n"); - fprintf(bcftools_stderr, " -o, --output Write output to a file [standard output]\n"); - fprintf(bcftools_stderr, " -O, --output-type b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n"); - fprintf(bcftools_stderr, " -q, --min-PQ Break phase set if phasing quality is lower than [30]\n"); - fprintf(bcftools_stderr, " -r, --regions Restrict to comma-separated list of regions\n"); - fprintf(bcftools_stderr, " -R, --regions-file Restrict to regions listed in a file\n"); -- fprintf(bcftools_stderr, " --threads Number of extra output compression threads [0]\n"); -+ fprintf(bcftools_stderr, " --threads Use multithreading with worker threads [0]\n"); -+ fprintf(bcftools_stderr, " -v, --verbose <0|1> Set verbosity level [1]\n"); - fprintf(bcftools_stderr, "\n"); - exit(1); - } -@@ -740,10 +875,13 @@ - args->n_threads = 0; - args->record_cmd_line = 1; - args->min_PQ = 30; -+ args->verbose = 1; - - static struct option loptions[] = - { -+ {"verbose",required_argument,NULL,'v'}, - {"naive",no_argument,NULL,'n'}, -+ {"naive-force",no_argument,NULL,7}, - {"compact-PS",no_argument,NULL,'c'}, - {"regions",required_argument,NULL,'r'}, - {"regions-file",required_argument,NULL,'R'}, -@@ -760,7 +898,7 @@ - {NULL,0,NULL,0} - }; - char *tmp; -- while ((c = getopt_long(argc, argv, "h:?o:O:f:alq:Dd:r:R:cn",loptions,NULL)) >= 0) -+ while ((c = getopt_long(argc, argv, "h:?o:O:f:alq:Dd:r:R:cnv:",loptions,NULL)) >= 0) - { - switch (c) { - case 'c': args->compact_PS = 1; break; -@@ -788,6 +926,11 @@ - break; - case 9 : args->n_threads = strtol(optarg, 0, 0); break; - case 8 : args->record_cmd_line = 0; break; -+ case 7 : args->naive_concat = 1; args->naive_concat_trust_headers = 1; break; -+ case 'v': -+ args->verbose = strtol(optarg, 0, 0); -+ error("Error: currently only --verbose 0 or --verbose 1 is supported\n"); -+ break; - case 'h': - case '?': usage(args); break; - default: error("Unknown argument: %s\n", optarg); -@@ 
-800,7 +943,7 @@ - args->fnames[args->nfnames-1] = strdup(argv[optind]); - optind++; - } -- if ( args->allow_overlaps && args->phased_concat ) args->allow_overlaps = 0; -+ if ( args->allow_overlaps && args->phased_concat ) error("The options -a and -l should not be combined. Please run with -l only.\n"); - if ( args->compact_PS && !args->phased_concat ) error("The -c option is intended only with -l\n"); - if ( args->file_list ) - { ---- python-pysam.orig/bcftools/vcfconvert.c -+++ python-pysam/bcftools/vcfconvert.c -@@ -31,6 +31,7 @@ - #include - #include - #include -+#include - #include - #include - #include -@@ -387,7 +388,7 @@ - htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode(args->output_type)); - if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno)); - if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads); -- bcf_hdr_write(out_fh,args->header); -+ if ( bcf_hdr_write(out_fh,args->header)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->outfname); - bcf1_t *rec = bcf_init(); - - nsamples -= 2; -@@ -399,7 +400,9 @@ - bcf_clear(rec); - args->n.total++; - if ( !tsv_parse(tsv, rec, line.s) ) -- bcf_write(out_fh, args->header, rec); -+ { -+ if ( bcf_write(out_fh, args->header, rec)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname); -+ } - else - error("Error occurred while parsing: %s\n", line.s); - } -@@ -513,7 +516,7 @@ - htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode(args->output_type)); - if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno)); - if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads); -- bcf_hdr_write(out_fh,args->header); -+ if ( bcf_hdr_write(out_fh,args->header)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->outfname); - bcf1_t *rec = bcf_init(); - - args->gts = (int32_t *) malloc(sizeof(int32_t)*nsamples*2); -@@ -531,7 +534,7 @@ - if ( tsv_parse(hap_tsv, rec, 
line.s) ) - error("Error occurred while parsing %s: %s\n", hap_fname,line.s); - -- bcf_write(out_fh, args->header, rec); -+ if ( bcf_write(out_fh, args->header, rec)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname); - - if ( hts_getline(leg_fh, KS_SEP_LINE, &line)<=0 ) - { -@@ -627,7 +630,7 @@ - htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode(args->output_type)); - if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno)); - if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads); -- bcf_hdr_write(out_fh,args->header); -+ if ( bcf_hdr_write(out_fh,args->header)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname); - bcf1_t *rec = bcf_init(); - - nsamples -= 2; -@@ -638,7 +641,9 @@ - bcf_clear(rec); - args->n.total++; - if ( !tsv_parse(tsv, rec, line.s) ) -- bcf_write(out_fh, args->header, rec); -+ { -+ if ( bcf_write(out_fh, args->header, rec)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname); -+ } - else - error("Error occurred while parsing: %s\n", line.s); - } -@@ -938,9 +943,9 @@ - if (legend_fname) { - str.l = 0; - if ( args->output_vcf_ids && (line->d.id[0]!='.' 
|| line->d.id[1]!=0) ) -- ksprintf(&str, "%s %d %s %s\n", line->d.id, line->pos+1, line->d.allele[0], line->d.allele[1]); -+ ksprintf(&str, "%s %"PRId64" %s %s\n", line->d.id, (int64_t) line->pos+1, line->d.allele[0], line->d.allele[1]); - else -- ksprintf(&str, "%s:%d_%s_%s %d %s %s\n", bcf_seqname(args->header, line), line->pos+1, line->d.allele[0], line->d.allele[1], line->pos+1, line->d.allele[0], line->d.allele[1]); -+ ksprintf(&str, "%s:%"PRId64"_%s_%s %"PRId64" %s %s\n", bcf_seqname(args->header, line), (int64_t) line->pos+1, line->d.allele[0], line->d.allele[1], (int64_t) line->pos+1, line->d.allele[0], line->d.allele[1]); - - // write legend file - ret = bgzf_write(lout, str.s, str.l); -@@ -1141,7 +1146,7 @@ - - int len; - char *ref = faidx_fetch_seq(args->ref, (char*)bcf_hdr_id2name(args->header,rec->rid), rec->pos, rec->pos, &len); -- if ( !ref ) error("faidx_fetch_seq failed at %s:%d\n", bcf_hdr_id2name(args->header,rec->rid), rec->pos+1); -+ if ( !ref ) error("faidx_fetch_seq failed at %s:%"PRId64"\n", bcf_hdr_id2name(args->header,rec->rid),(int64_t) rec->pos+1); - - int nals = 1, alleles[5] = { -1, -1, -1, -1, -1 }; // a,c,g,t,n - ref[0] = toupper(ref[0]); -@@ -1156,10 +1161,10 @@ - if ( i>0 ) - { - ret = tsv_next(tsv); -- if ( ret==-1 ) error("Too few columns for %d samples at %s:%d\n", rec->n_sample,bcf_hdr_id2name(args->header,rec->rid), rec->pos+1); -+ if ( ret==-1 ) error("Too few columns for %d samples at %s:%"PRId64"\n", rec->n_sample,bcf_hdr_id2name(args->header,rec->rid),(int64_t) rec->pos+1); - } - ret = tsv_setter_aa1(args, tsv->ss, tsv->se, alleles, &nals, iref, args->gts+i*2); -- if ( ret==-1 ) error("Error parsing the site %s:%d, expected two characters\n", bcf_hdr_id2name(args->header,rec->rid), rec->pos+1); -+ if ( ret==-1 ) error("Error parsing the site %s:%"PRId64", expected two characters\n", bcf_hdr_id2name(args->header,rec->rid),(int64_t) rec->pos+1); - if ( ret==-2 ) - { - // something else than a SNP -@@ -1213,7 +1218,7 @@ - 
htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode(args->output_type)); - if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno)); - if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads); -- bcf_hdr_write(out_fh,args->header); -+ if ( bcf_hdr_write(out_fh,args->header)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname); - - tsv_t *tsv = tsv_init(args->columns ? args->columns : "ID,CHROM,POS,AA"); - if ( tsv_register(tsv, "CHROM", tsv_setter_chrom, args->header) < 0 ) error("Expected CHROM column\n"); -@@ -1234,7 +1239,9 @@ - - args->n.total++; - if ( !tsv_parse(tsv, rec, line.s) ) -- bcf_write(out_fh, args->header, rec); -+ { -+ if ( bcf_write(out_fh, args->header, rec)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname); -+ } - else - args->n.skipped++; - } -@@ -1242,7 +1249,7 @@ - free(line.s); - - bcf_hdr_destroy(args->header); -- hts_close(out_fh); -+ if ( hts_close(out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->outfname); - tsv_destroy(tsv); - bcf_destroy(rec); - free(args->str.s); -@@ -1265,7 +1272,7 @@ - if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads); - - bcf_hdr_t *hdr = bcf_sr_get_header(args->files,0); -- bcf_hdr_write(out_fh,hdr); -+ if ( bcf_hdr_write(out_fh,hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname); - - while ( bcf_sr_next_line(args->files) ) - { -@@ -1276,9 +1283,9 @@ - if ( args->filter_logic & FLT_EXCLUDE ) pass = pass ? 0 : 1; - if ( !pass ) continue; - } -- bcf_write(out_fh,hdr,line); -+ if ( bcf_write(out_fh,hdr,line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname); - } -- hts_close(out_fh); -+ if ( hts_close(out_fh)!=0 ) error("[%s] Error: close failed .. 
%s\n", __func__,args->outfname); - } - - static void gvcf_to_vcf(args_t *args) -@@ -1295,7 +1302,7 @@ - - bcf_hdr_t *hdr = bcf_sr_get_header(args->files,0); - if (args->record_cmd_line) bcf_hdr_append_version(hdr, args->argc, args->argv, "bcftools_convert"); -- bcf_hdr_write(out_fh,hdr); -+ if ( bcf_hdr_write(out_fh,hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname); - - int32_t *itmp = NULL, nitmp = 0; - -@@ -1308,7 +1315,7 @@ - if ( args->filter_logic & FLT_EXCLUDE ) pass = pass ? 0 : 1; - if ( !pass ) - { -- bcf_write(out_fh,hdr,line); -+ if ( bcf_write(out_fh,hdr,line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname); - continue; - } - } -@@ -1332,7 +1339,7 @@ - // no gVCF compatible alleles - if (gallele<0) - { -- bcf_write(out_fh,hdr,line); -+ if ( bcf_write(out_fh,hdr,line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname); - continue; - } - -@@ -1340,7 +1347,7 @@ - if ( nend!=1 ) - { - // No INFO/END => not gVCF record -- bcf_write(out_fh,hdr,line); -+ if ( bcf_write(out_fh,hdr,line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname); - continue; - } - bcf_update_info_int32(hdr,line,"END",NULL,0); -@@ -1349,14 +1356,14 @@ - { - line->pos = pos; - char *ref = faidx_fetch_seq(args->ref, (char*)bcf_hdr_id2name(hdr,line->rid), line->pos, line->pos, &len); -- if ( !ref ) error("faidx_fetch_seq failed at %s:%d\n", bcf_hdr_id2name(hdr,line->rid), line->pos+1); -+ if ( !ref ) error("faidx_fetch_seq failed at %s:%"PRId64"\n", bcf_hdr_id2name(hdr,line->rid),(int64_t) line->pos+1); - strncpy(line->d.allele[0],ref,len); -- bcf_write(out_fh,hdr,line); -+ if ( bcf_write(out_fh,hdr,line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname); - free(ref); - } - } - free(itmp); -- hts_close(out_fh); -+ if ( hts_close(out_fh)!=0 ) error("[%s] Error: close failed .. 
%s\n", __func__,args->outfname); - } - - static void usage(void) -@@ -1381,7 +1388,7 @@ - fprintf(stderr, " --no-version do not append version and command line to the header\n"); - fprintf(stderr, " -o, --output output file name [stdout]\n"); - fprintf(stderr, " -O, --output-type b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n"); -- fprintf(stderr, " --threads number of extra output compression threads [0]\n"); -+ fprintf(stderr, " --threads use multithreading with worker threads [0]\n"); - fprintf(stderr, "\n"); - fprintf(stderr, "GEN/SAMPLE conversion (input/output from IMPUTE2):\n"); - fprintf(stderr, " -G, --gensample2vcf <...> |,\n"); -@@ -1505,7 +1512,7 @@ - case 9 : args->n_threads = strtol(optarg, 0, 0); break; - case 10 : args->record_cmd_line = 0; break; - case 11 : args->sex_fname = optarg; break; -- case '?': usage(); -+ case '?': usage(); break; - default: error("Unknown argument: %s\n", optarg); - } - } ---- python-pysam.orig/bcftools/vcfconvert.c.pysam.c -+++ python-pysam/bcftools/vcfconvert.c.pysam.c -@@ -33,6 +33,7 @@ - #include - #include - #include -+#include - #include - #include - #include -@@ -389,7 +390,7 @@ - htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode(args->output_type)); - if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno)); - if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads); -- bcf_hdr_write(out_fh,args->header); -+ if ( bcf_hdr_write(out_fh,args->header)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->outfname); - bcf1_t *rec = bcf_init(); - - nsamples -= 2; -@@ -401,7 +402,9 @@ - bcf_clear(rec); - args->n.total++; - if ( !tsv_parse(tsv, rec, line.s) ) -- bcf_write(out_fh, args->header, rec); -+ { -+ if ( bcf_write(out_fh, args->header, rec)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname); -+ } - else - error("Error occurred while parsing: %s\n", line.s); - } -@@ -515,7 +518,7 @@ - 
htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode(args->output_type)); - if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno)); - if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads); -- bcf_hdr_write(out_fh,args->header); -+ if ( bcf_hdr_write(out_fh,args->header)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->outfname); - bcf1_t *rec = bcf_init(); - - args->gts = (int32_t *) malloc(sizeof(int32_t)*nsamples*2); -@@ -533,7 +536,7 @@ - if ( tsv_parse(hap_tsv, rec, line.s) ) - error("Error occurred while parsing %s: %s\n", hap_fname,line.s); - -- bcf_write(out_fh, args->header, rec); -+ if ( bcf_write(out_fh, args->header, rec)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname); - - if ( hts_getline(leg_fh, KS_SEP_LINE, &line)<=0 ) - { -@@ -629,7 +632,7 @@ - htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode(args->output_type)); - if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno)); - if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads); -- bcf_hdr_write(out_fh,args->header); -+ if ( bcf_hdr_write(out_fh,args->header)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname); - bcf1_t *rec = bcf_init(); - - nsamples -= 2; -@@ -640,7 +643,9 @@ - bcf_clear(rec); - args->n.total++; - if ( !tsv_parse(tsv, rec, line.s) ) -- bcf_write(out_fh, args->header, rec); -+ { -+ if ( bcf_write(out_fh, args->header, rec)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname); -+ } - else - error("Error occurred while parsing: %s\n", line.s); - } -@@ -940,9 +945,9 @@ - if (legend_fname) { - str.l = 0; - if ( args->output_vcf_ids && (line->d.id[0]!='.' 
|| line->d.id[1]!=0) ) -- ksprintf(&str, "%s %d %s %s\n", line->d.id, line->pos+1, line->d.allele[0], line->d.allele[1]); -+ ksprintf(&str, "%s %"PRId64" %s %s\n", line->d.id, (int64_t) line->pos+1, line->d.allele[0], line->d.allele[1]); - else -- ksprintf(&str, "%s:%d_%s_%s %d %s %s\n", bcf_seqname(args->header, line), line->pos+1, line->d.allele[0], line->d.allele[1], line->pos+1, line->d.allele[0], line->d.allele[1]); -+ ksprintf(&str, "%s:%"PRId64"_%s_%s %"PRId64" %s %s\n", bcf_seqname(args->header, line), (int64_t) line->pos+1, line->d.allele[0], line->d.allele[1], (int64_t) line->pos+1, line->d.allele[0], line->d.allele[1]); - - // write legend file - ret = bgzf_write(lout, str.s, str.l); -@@ -1143,7 +1148,7 @@ - - int len; - char *ref = faidx_fetch_seq(args->ref, (char*)bcf_hdr_id2name(args->header,rec->rid), rec->pos, rec->pos, &len); -- if ( !ref ) error("faidx_fetch_seq failed at %s:%d\n", bcf_hdr_id2name(args->header,rec->rid), rec->pos+1); -+ if ( !ref ) error("faidx_fetch_seq failed at %s:%"PRId64"\n", bcf_hdr_id2name(args->header,rec->rid),(int64_t) rec->pos+1); - - int nals = 1, alleles[5] = { -1, -1, -1, -1, -1 }; // a,c,g,t,n - ref[0] = toupper(ref[0]); -@@ -1158,10 +1163,10 @@ - if ( i>0 ) - { - ret = tsv_next(tsv); -- if ( ret==-1 ) error("Too few columns for %d samples at %s:%d\n", rec->n_sample,bcf_hdr_id2name(args->header,rec->rid), rec->pos+1); -+ if ( ret==-1 ) error("Too few columns for %d samples at %s:%"PRId64"\n", rec->n_sample,bcf_hdr_id2name(args->header,rec->rid),(int64_t) rec->pos+1); - } - ret = tsv_setter_aa1(args, tsv->ss, tsv->se, alleles, &nals, iref, args->gts+i*2); -- if ( ret==-1 ) error("Error parsing the site %s:%d, expected two characters\n", bcf_hdr_id2name(args->header,rec->rid), rec->pos+1); -+ if ( ret==-1 ) error("Error parsing the site %s:%"PRId64", expected two characters\n", bcf_hdr_id2name(args->header,rec->rid),(int64_t) rec->pos+1); - if ( ret==-2 ) - { - // something else than a SNP -@@ -1215,7 +1220,7 @@ - 
htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode(args->output_type)); - if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno)); - if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads); -- bcf_hdr_write(out_fh,args->header); -+ if ( bcf_hdr_write(out_fh,args->header)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname); - - tsv_t *tsv = tsv_init(args->columns ? args->columns : "ID,CHROM,POS,AA"); - if ( tsv_register(tsv, "CHROM", tsv_setter_chrom, args->header) < 0 ) error("Expected CHROM column\n"); -@@ -1236,7 +1241,9 @@ - - args->n.total++; - if ( !tsv_parse(tsv, rec, line.s) ) -- bcf_write(out_fh, args->header, rec); -+ { -+ if ( bcf_write(out_fh, args->header, rec)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname); -+ } - else - args->n.skipped++; - } -@@ -1244,7 +1251,7 @@ - free(line.s); - - bcf_hdr_destroy(args->header); -- hts_close(out_fh); -+ if ( hts_close(out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->outfname); - tsv_destroy(tsv); - bcf_destroy(rec); - free(args->str.s); -@@ -1267,7 +1274,7 @@ - if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads); - - bcf_hdr_t *hdr = bcf_sr_get_header(args->files,0); -- bcf_hdr_write(out_fh,hdr); -+ if ( bcf_hdr_write(out_fh,hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname); - - while ( bcf_sr_next_line(args->files) ) - { -@@ -1278,9 +1285,9 @@ - if ( args->filter_logic & FLT_EXCLUDE ) pass = pass ? 0 : 1; - if ( !pass ) continue; - } -- bcf_write(out_fh,hdr,line); -+ if ( bcf_write(out_fh,hdr,line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname); - } -- hts_close(out_fh); -+ if ( hts_close(out_fh)!=0 ) error("[%s] Error: close failed .. 
%s\n", __func__,args->outfname); - } - - static void gvcf_to_vcf(args_t *args) -@@ -1297,7 +1304,7 @@ - - bcf_hdr_t *hdr = bcf_sr_get_header(args->files,0); - if (args->record_cmd_line) bcf_hdr_append_version(hdr, args->argc, args->argv, "bcftools_convert"); -- bcf_hdr_write(out_fh,hdr); -+ if ( bcf_hdr_write(out_fh,hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname); - - int32_t *itmp = NULL, nitmp = 0; - -@@ -1310,7 +1317,7 @@ - if ( args->filter_logic & FLT_EXCLUDE ) pass = pass ? 0 : 1; - if ( !pass ) - { -- bcf_write(out_fh,hdr,line); -+ if ( bcf_write(out_fh,hdr,line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname); - continue; - } - } -@@ -1334,7 +1341,7 @@ - // no gVCF compatible alleles - if (gallele<0) - { -- bcf_write(out_fh,hdr,line); -+ if ( bcf_write(out_fh,hdr,line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname); - continue; - } - -@@ -1342,7 +1349,7 @@ - if ( nend!=1 ) - { - // No INFO/END => not gVCF record -- bcf_write(out_fh,hdr,line); -+ if ( bcf_write(out_fh,hdr,line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname); - continue; - } - bcf_update_info_int32(hdr,line,"END",NULL,0); -@@ -1351,14 +1358,14 @@ - { - line->pos = pos; - char *ref = faidx_fetch_seq(args->ref, (char*)bcf_hdr_id2name(hdr,line->rid), line->pos, line->pos, &len); -- if ( !ref ) error("faidx_fetch_seq failed at %s:%d\n", bcf_hdr_id2name(hdr,line->rid), line->pos+1); -+ if ( !ref ) error("faidx_fetch_seq failed at %s:%"PRId64"\n", bcf_hdr_id2name(hdr,line->rid),(int64_t) line->pos+1); - strncpy(line->d.allele[0],ref,len); -- bcf_write(out_fh,hdr,line); -+ if ( bcf_write(out_fh,hdr,line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname); - free(ref); - } - } - free(itmp); -- hts_close(out_fh); -+ if ( hts_close(out_fh)!=0 ) error("[%s] Error: close failed .. 
%s\n", __func__,args->outfname); - } - - static void usage(void) -@@ -1383,7 +1390,7 @@ - fprintf(bcftools_stderr, " --no-version do not append version and command line to the header\n"); - fprintf(bcftools_stderr, " -o, --output output file name [bcftools_stdout]\n"); - fprintf(bcftools_stderr, " -O, --output-type b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n"); -- fprintf(bcftools_stderr, " --threads number of extra output compression threads [0]\n"); -+ fprintf(bcftools_stderr, " --threads use multithreading with worker threads [0]\n"); - fprintf(bcftools_stderr, "\n"); - fprintf(bcftools_stderr, "GEN/SAMPLE conversion (input/output from IMPUTE2):\n"); - fprintf(bcftools_stderr, " -G, --gensample2vcf <...> |,\n"); -@@ -1507,7 +1514,7 @@ - case 9 : args->n_threads = strtol(optarg, 0, 0); break; - case 10 : args->record_cmd_line = 0; break; - case 11 : args->sex_fname = optarg; break; -- case '?': usage(); -+ case '?': usage(); break; - default: error("Unknown argument: %s\n", optarg); - } - } ---- python-pysam.orig/bcftools/vcffilter.c -+++ python-pysam/bcftools/vcffilter.c -@@ -188,7 +188,7 @@ - if ( args->snp_gap && rec->d.flt[j]==args->SnpGap_id ) { pass = 0; break; } - } - } -- if ( pass ) bcf_write1(args->out_fh, args->hdr, rec); -+ if ( pass && bcf_write1(args->out_fh, args->hdr, rec)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); - } - } - -@@ -278,7 +278,7 @@ - if ( k_flush || !line ) - { - // Select the best indel from the cluster of k_flush indels -- int k = 0, max_ac = -1, imax_ac = -1; -+ int k = 0, max_ac = -1, imax_ac = -1, max_qual = -1, imax_qual = -1; - for (i=-1; rbuf_next(&args->rbuf,&i) && kn_allele, args->ntmpi, args->tmpi); - int ret = bcf_calc_ac(args->hdr, rec, args->tmpi, BCF_UN_ALL); - if ( imax_ac==-1 || (ret && max_ac < args->tmpi[1]) ) { max_ac = args->tmpi[1]; imax_ac = i; } -+ if ( imax_qual==-1 || max_qual < rec->qual ) { max_qual = rec->qual; imax_qual = i; } - } 
- -- // Filter all but the best indel (with max AF or first if AF not available) -+ // Filter all but the best indel (with the best QUAL, bigger AC, or take the first if neither QUAL nor AC are available) - k = 0; - for (i=-1; rbuf_next(&args->rbuf,&i) && krbuf_lines[i]; - if ( !(rec->d.var_type & IndelGap_set) ) continue; - rec->d.var_type |= IndelGap_flush; -- if ( i!=imax_ac ) bcf_add_filter(args->hdr, args->rbuf_lines[i], args->IndelGap_id); -+ -+ int do_filter = 0; -+ if ( max_qual>0 ) -+ { -+ if ( i!=imax_qual ) do_filter = 1; -+ } -+ else if ( i!=imax_ac ) do_filter = 1; -+ if ( do_filter ) bcf_add_filter(args->hdr, args->rbuf_lines[i], args->IndelGap_id); - } - } - } -@@ -418,7 +426,7 @@ - fprintf(stderr, " -S, --set-GTs <.|0> set genotypes of failed samples to missing (.) or ref (0)\n"); - fprintf(stderr, " -t, --targets similar to -r but streams rather than index-jumps\n"); - fprintf(stderr, " -T, --targets-file similar to -R but streams rather than index-jumps\n"); -- fprintf(stderr, " --threads number of extra output compression threads [0]\n"); -+ fprintf(stderr, " --threads use multithreading with worker threads [0]\n"); - fprintf(stderr, "\n"); - exit(1); - } -@@ -494,7 +502,7 @@ - case 9 : args->n_threads = strtol(optarg, 0, 0); break; - case 8 : args->record_cmd_line = 0; break; - case 'h': -- case '?': usage(args); -+ case '?': usage(args); break; - default: error("Unknown argument: %s\n", optarg); - } - } -@@ -531,10 +539,10 @@ - if ( bcf_sr_set_targets(args->files, args->targets_list,targets_is_file, 0)<0 ) - error("Failed to read the targets: %s\n", args->targets_list); - } -- if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->files->errnum)); -+ if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to read from %s: %s\n", !strcmp("-",fname)?"standard input":fname,bcf_sr_strerror(args->files->errnum)); - - init_data(args); -- bcf_hdr_write(args->out_fh, args->hdr); -+ if ( 
bcf_hdr_write(args->out_fh, args->hdr)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->output_fname); - while ( bcf_sr_next_line(args->files) ) - { - bcf1_t *line = bcf_sr_get_line(args->files, 0); -@@ -558,14 +566,16 @@ - } - if ( args->set_gts ) set_genotypes(args, line, pass); - if ( !args->rbuf_lines ) -- bcf_write1(args->out_fh, args->hdr, line); -+ { -+ if ( bcf_write1(args->out_fh, args->hdr, line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); -+ } - else - buffered_filters(args, line); - } - } - buffered_filters(args, NULL); - -- hts_close(args->out_fh); -+ if ( hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname); - destroy_data(args); - bcf_sr_destroy(args->files); - free(args); ---- python-pysam.orig/bcftools/vcffilter.c.pysam.c -+++ python-pysam/bcftools/vcffilter.c.pysam.c -@@ -190,7 +190,7 @@ - if ( args->snp_gap && rec->d.flt[j]==args->SnpGap_id ) { pass = 0; break; } - } - } -- if ( pass ) bcf_write1(args->out_fh, args->hdr, rec); -+ if ( pass && bcf_write1(args->out_fh, args->hdr, rec)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); - } - } - -@@ -280,7 +280,7 @@ - if ( k_flush || !line ) - { - // Select the best indel from the cluster of k_flush indels -- int k = 0, max_ac = -1, imax_ac = -1; -+ int k = 0, max_ac = -1, imax_ac = -1, max_qual = -1, imax_qual = -1; - for (i=-1; rbuf_next(&args->rbuf,&i) && kn_allele, args->ntmpi, args->tmpi); - int ret = bcf_calc_ac(args->hdr, rec, args->tmpi, BCF_UN_ALL); - if ( imax_ac==-1 || (ret && max_ac < args->tmpi[1]) ) { max_ac = args->tmpi[1]; imax_ac = i; } -+ if ( imax_qual==-1 || max_qual < rec->qual ) { max_qual = rec->qual; imax_qual = i; } - } - -- // Filter all but the best indel (with max AF or first if AF not available) -+ // Filter all but the best indel (with the best QUAL, bigger AC, or take the first if neither QUAL nor AC are available) - k = 0; - for (i=-1; 
rbuf_next(&args->rbuf,&i) && krbuf_lines[i]; - if ( !(rec->d.var_type & IndelGap_set) ) continue; - rec->d.var_type |= IndelGap_flush; -- if ( i!=imax_ac ) bcf_add_filter(args->hdr, args->rbuf_lines[i], args->IndelGap_id); -+ -+ int do_filter = 0; -+ if ( max_qual>0 ) -+ { -+ if ( i!=imax_qual ) do_filter = 1; -+ } -+ else if ( i!=imax_ac ) do_filter = 1; -+ if ( do_filter ) bcf_add_filter(args->hdr, args->rbuf_lines[i], args->IndelGap_id); - } - } - } -@@ -420,7 +428,7 @@ - fprintf(bcftools_stderr, " -S, --set-GTs <.|0> set genotypes of failed samples to missing (.) or ref (0)\n"); - fprintf(bcftools_stderr, " -t, --targets similar to -r but streams rather than index-jumps\n"); - fprintf(bcftools_stderr, " -T, --targets-file similar to -R but streams rather than index-jumps\n"); -- fprintf(bcftools_stderr, " --threads number of extra output compression threads [0]\n"); -+ fprintf(bcftools_stderr, " --threads use multithreading with worker threads [0]\n"); - fprintf(bcftools_stderr, "\n"); - exit(1); - } -@@ -496,7 +504,7 @@ - case 9 : args->n_threads = strtol(optarg, 0, 0); break; - case 8 : args->record_cmd_line = 0; break; - case 'h': -- case '?': usage(args); -+ case '?': usage(args); break; - default: error("Unknown argument: %s\n", optarg); - } - } -@@ -533,10 +541,10 @@ - if ( bcf_sr_set_targets(args->files, args->targets_list,targets_is_file, 0)<0 ) - error("Failed to read the targets: %s\n", args->targets_list); - } -- if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->files->errnum)); -+ if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to read from %s: %s\n", !strcmp("-",fname)?"standard input":fname,bcf_sr_strerror(args->files->errnum)); - - init_data(args); -- bcf_hdr_write(args->out_fh, args->hdr); -+ if ( bcf_hdr_write(args->out_fh, args->hdr)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->output_fname); - while ( bcf_sr_next_line(args->files) ) - { - bcf1_t 
*line = bcf_sr_get_line(args->files, 0); -@@ -560,14 +568,16 @@ - } - if ( args->set_gts ) set_genotypes(args, line, pass); - if ( !args->rbuf_lines ) -- bcf_write1(args->out_fh, args->hdr, line); -+ { -+ if ( bcf_write1(args->out_fh, args->hdr, line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); -+ } - else - buffered_filters(args, line); - } - } - buffered_filters(args, NULL); - -- hts_close(args->out_fh); -+ if ( hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname); - destroy_data(args); - bcf_sr_destroy(args->files); - free(args); ---- python-pysam.orig/bcftools/vcfgtcheck.c -+++ python-pysam/bcftools/vcfgtcheck.c -@@ -302,7 +302,7 @@ - int fake_PL = args->no_PLs ? args->no_PLs : 99; // with 1, discordance is the number of non-matching GTs - int nsm_gt, i; - if ( (nsm_gt=bcf_get_genotypes(hdr, line, &args->tmp_arr, &args->ntmp_arr)) <= 0 ) -- error("GT not present at %s:%d?\n", hdr->id[BCF_DT_CTG][line->rid].key, line->pos+1); -+ error("GT not present at %s:%"PRId64"?\n", hdr->id[BCF_DT_CTG][line->rid].key, (int64_t) line->pos+1); - nsm_gt /= bcf_hdr_nsamples(hdr); - int npl = line->n_allele*(line->n_allele+1)/2; - hts_expand(int,npl*bcf_hdr_nsamples(hdr),args->npl_arr,args->pl_arr); -@@ -399,7 +399,7 @@ - // Target genotypes - int ngt, npl; - if ( (ngt=bcf_get_genotypes(args->gt_hdr, gt_line, >_arr, &ngt_arr)) <= 0 ) -- error("GT not present at %s:%d?", args->gt_hdr->id[BCF_DT_CTG][gt_line->rid].key, gt_line->pos+1); -+ error("GT not present at %s:%"PRId64"?", args->gt_hdr->id[BCF_DT_CTG][gt_line->rid].key, (int64_t) gt_line->pos+1); - ngt /= bcf_hdr_nsamples(args->gt_hdr); - if ( ngt!=2 ) continue; // checking only diploid genotypes - -@@ -415,7 +415,7 @@ - npl = fake_PLs(args, args->sm_hdr, sm_line); - } - else -- error("PL not present at %s:%d?\n", args->sm_hdr->id[BCF_DT_CTG][sm_line->rid].key, sm_line->pos+1); -+ error("PL not present at %s:%"PRId64"?\n", 
args->sm_hdr->id[BCF_DT_CTG][sm_line->rid].key, (int64_t) sm_line->pos+1); - } - else - npl /= bcf_hdr_nsamples(args->sm_hdr); -@@ -460,7 +460,7 @@ - int a = bcf_gt_allele(gt_ptr[0]); - int b = bcf_gt_allele(gt_ptr[1]); - if ( args->hom_only && a!=b ) continue; // heterozygous genotype -- fprintf(fp, "SC\t%s\t%d", args->gt_hdr->id[BCF_DT_CTG][gt_line->rid].key, gt_line->pos+1); -+ fprintf(fp, "SC\t%s\t%"PRId64, args->gt_hdr->id[BCF_DT_CTG][gt_line->rid].key, (int64_t) gt_line->pos+1); - for (i=0; in_allele; i++) fprintf(fp, "%c%s", i==0?'\t':',', gt_line->d.allele[i]); - fprintf(fp, "\t%s/%s", a>=0 ? gt_line->d.allele[a] : ".", b>=0 ? gt_line->d.allele[b] : "."); - fprintf(fp, "\t%f", args->lks[query_isample]-prev_lk); -@@ -515,7 +515,7 @@ - - if ( args->plot ) - { -- fclose(fp); -+ if ( fclose(fp)!=0 ) error("[%s] Error: close failed\n", __func__); - plot_check(args, args->target_sample ? args->target_sample : "", args->sm_hdr->samples[query_isample]); - } - } -@@ -788,7 +788,7 @@ - case 't': targets = optarg; break; - case 'T': targets = optarg; targets_is_file = 1; break; - case 'h': -- case '?': usage(); -+ case '?': usage(); break; - default: error("Unknown argument: %s\n", optarg); - } - } -@@ -805,7 +805,8 @@ - if ( regions && bcf_sr_set_regions(args->files, regions, regions_is_file)<0 ) error("Failed to read the regions: %s\n", regions); - if ( targets && bcf_sr_set_targets(args->files, targets, targets_is_file, 0)<0 ) error("Failed to read the targets: %s\n", targets); - if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->files->errnum)); -- if ( args->gt_fname && !bcf_sr_add_reader(args->files, args->gt_fname) ) error("Failed to open %s: %s\n", args->gt_fname,bcf_sr_strerror(args->files->errnum)); -+ if ( args->gt_fname && !bcf_sr_add_reader(args->files, args->gt_fname) ) -+ error("Failed to read from %s: %s\n", !strcmp("-",args->gt_fname)?"standard 
input":args->gt_fname,bcf_sr_strerror(args->files->errnum)); - args->files->collapse = COLLAPSE_SNPS|COLLAPSE_INDELS; - if ( args->plot ) args->plot = init_prefix(args->plot); - init_data(args); ---- python-pysam.orig/bcftools/vcfgtcheck.c.pysam.c -+++ python-pysam/bcftools/vcfgtcheck.c.pysam.c -@@ -304,7 +304,7 @@ - int fake_PL = args->no_PLs ? args->no_PLs : 99; // with 1, discordance is the number of non-matching GTs - int nsm_gt, i; - if ( (nsm_gt=bcf_get_genotypes(hdr, line, &args->tmp_arr, &args->ntmp_arr)) <= 0 ) -- error("GT not present at %s:%d?\n", hdr->id[BCF_DT_CTG][line->rid].key, line->pos+1); -+ error("GT not present at %s:%"PRId64"?\n", hdr->id[BCF_DT_CTG][line->rid].key, (int64_t) line->pos+1); - nsm_gt /= bcf_hdr_nsamples(hdr); - int npl = line->n_allele*(line->n_allele+1)/2; - hts_expand(int,npl*bcf_hdr_nsamples(hdr),args->npl_arr,args->pl_arr); -@@ -401,7 +401,7 @@ - // Target genotypes - int ngt, npl; - if ( (ngt=bcf_get_genotypes(args->gt_hdr, gt_line, >_arr, &ngt_arr)) <= 0 ) -- error("GT not present at %s:%d?", args->gt_hdr->id[BCF_DT_CTG][gt_line->rid].key, gt_line->pos+1); -+ error("GT not present at %s:%"PRId64"?", args->gt_hdr->id[BCF_DT_CTG][gt_line->rid].key, (int64_t) gt_line->pos+1); - ngt /= bcf_hdr_nsamples(args->gt_hdr); - if ( ngt!=2 ) continue; // checking only diploid genotypes - -@@ -417,7 +417,7 @@ - npl = fake_PLs(args, args->sm_hdr, sm_line); - } - else -- error("PL not present at %s:%d?\n", args->sm_hdr->id[BCF_DT_CTG][sm_line->rid].key, sm_line->pos+1); -+ error("PL not present at %s:%"PRId64"?\n", args->sm_hdr->id[BCF_DT_CTG][sm_line->rid].key, (int64_t) sm_line->pos+1); - } - else - npl /= bcf_hdr_nsamples(args->sm_hdr); -@@ -462,7 +462,7 @@ - int a = bcf_gt_allele(gt_ptr[0]); - int b = bcf_gt_allele(gt_ptr[1]); - if ( args->hom_only && a!=b ) continue; // heterozygous genotype -- fprintf(fp, "SC\t%s\t%d", args->gt_hdr->id[BCF_DT_CTG][gt_line->rid].key, gt_line->pos+1); -+ fprintf(fp, "SC\t%s\t%"PRId64, 
args->gt_hdr->id[BCF_DT_CTG][gt_line->rid].key, (int64_t) gt_line->pos+1); - for (i=0; in_allele; i++) fprintf(fp, "%c%s", i==0?'\t':',', gt_line->d.allele[i]); - fprintf(fp, "\t%s/%s", a>=0 ? gt_line->d.allele[a] : ".", b>=0 ? gt_line->d.allele[b] : "."); - fprintf(fp, "\t%f", args->lks[query_isample]-prev_lk); -@@ -517,7 +517,7 @@ - - if ( args->plot ) - { -- fclose(fp); -+ if ( fclose(fp)!=0 ) error("[%s] Error: close failed\n", __func__); - plot_check(args, args->target_sample ? args->target_sample : "", args->sm_hdr->samples[query_isample]); - } - } -@@ -790,7 +790,7 @@ - case 't': targets = optarg; break; - case 'T': targets = optarg; targets_is_file = 1; break; - case 'h': -- case '?': usage(); -+ case '?': usage(); break; - default: error("Unknown argument: %s\n", optarg); - } - } -@@ -807,7 +807,8 @@ - if ( regions && bcf_sr_set_regions(args->files, regions, regions_is_file)<0 ) error("Failed to read the regions: %s\n", regions); - if ( targets && bcf_sr_set_targets(args->files, targets, targets_is_file, 0)<0 ) error("Failed to read the targets: %s\n", targets); - if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->files->errnum)); -- if ( args->gt_fname && !bcf_sr_add_reader(args->files, args->gt_fname) ) error("Failed to open %s: %s\n", args->gt_fname,bcf_sr_strerror(args->files->errnum)); -+ if ( args->gt_fname && !bcf_sr_add_reader(args->files, args->gt_fname) ) -+ error("Failed to read from %s: %s\n", !strcmp("-",args->gt_fname)?"standard input":args->gt_fname,bcf_sr_strerror(args->files->errnum)); - args->files->collapse = COLLAPSE_SNPS|COLLAPSE_INDELS; - if ( args->plot ) args->plot = init_prefix(args->plot); - init_data(args); ---- python-pysam.orig/bcftools/vcfindex.c -+++ python-pysam/bcftools/vcfindex.c -@@ -49,7 +49,7 @@ - fprintf(stderr, " -m, --min-shift INT set minimal interval size for CSI indices to 2^INT [14]\n"); - fprintf(stderr, " -o, --output-file FILE optional output index file 
name\n"); - fprintf(stderr, " -t, --tbi generate TBI-format index for VCF files\n"); -- fprintf(stderr, " --threads sets the number of threads [0]\n"); -+ fprintf(stderr, " --threads INT use multithreading with INT worker threads [0]\n"); - fprintf(stderr, "\n"); - fprintf(stderr, "Stats options:\n"); - fprintf(stderr, " -n, --nrecords print number of records based on existing index file\n"); -@@ -112,7 +112,7 @@ - } - if (stats&2) printf("%" PRIu64 "\n", sum); - free(seq); -- hts_close(fp); -+ if ( hts_close(fp)!=0 ) error("[%s] Error: close failed\n", __func__); - bcf_hdr_destroy(hdr); - if (tbx) - tbx_destroy(tbx); ---- python-pysam.orig/bcftools/vcfindex.c.pysam.c -+++ python-pysam/bcftools/vcfindex.c.pysam.c -@@ -51,7 +51,7 @@ - fprintf(bcftools_stderr, " -m, --min-shift INT set minimal interval size for CSI indices to 2^INT [14]\n"); - fprintf(bcftools_stderr, " -o, --output-file FILE optional output index file name\n"); - fprintf(bcftools_stderr, " -t, --tbi generate TBI-format index for VCF files\n"); -- fprintf(bcftools_stderr, " --threads sets the number of threads [0]\n"); -+ fprintf(bcftools_stderr, " --threads INT use multithreading with INT worker threads [0]\n"); - fprintf(bcftools_stderr, "\n"); - fprintf(bcftools_stderr, "Stats options:\n"); - fprintf(bcftools_stderr, " -n, --nrecords print number of records based on existing index file\n"); -@@ -114,7 +114,7 @@ - } - if (stats&2) fprintf(bcftools_stdout, "%" PRIu64 "\n", sum); - free(seq); -- hts_close(fp); -+ if ( hts_close(fp)!=0 ) error("[%s] Error: close failed\n", __func__); - bcf_hdr_destroy(hdr); - if (tbx) - tbx_destroy(tbx); ---- python-pysam.orig/bcftools/vcfisec.c -+++ python-pysam/bcftools/vcfisec.c -@@ -1,6 +1,6 @@ - /* vcfisec.c -- Create intersections, unions and complements of VCF files. - -- Copyright (C) 2012-2014 Genome Research Ltd. -+ Copyright (C) 2012-2019 Genome Research Ltd. 
- - Author: Petr Danecek - -@@ -33,6 +33,7 @@ - #include - #include - #include -+#include - #include "bcftools.h" - #include "filter.h" - -@@ -144,7 +145,7 @@ - if ( out_fh == NULL ) error("Can't write to %s: %s\n", args->output_fname? args->output_fname : "standard output", strerror(errno)); - if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads); - if (args->record_cmd_line) bcf_hdr_append_version(files->readers[args->iwrite].header,args->argc,args->argv,"bcftools_isec"); -- bcf_hdr_write(out_fh, files->readers[args->iwrite].header); -+ if ( bcf_hdr_write(out_fh, files->readers[args->iwrite].header)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname?args->output_fname:"standard output"); - } - if ( !args->nwrite && !out_std && !args->prefix ) - fprintf(stderr,"Note: -w option not given, printing list of sites...\n"); -@@ -195,8 +196,8 @@ - - if ( out_std ) - { -- if ( bcf_sr_has_line(files,args->iwrite) ) -- bcf_write1(out_fh, files->readers[args->iwrite].header, files->readers[args->iwrite].buffer[0]); -+ if ( bcf_sr_has_line(files,args->iwrite) && bcf_write1(out_fh, files->readers[args->iwrite].header, files->readers[args->iwrite].buffer[0])!=0 ) -+ error("[%s] Error: cannot write to %s\n", __func__, args->output_fname ? args->output_fname : "standard output"); - continue; - } - else if ( args->fh_sites ) -@@ -218,7 +219,8 @@ - for (i=0; inreaders; i++) - kputc(bcf_sr_has_line(files,i)?'1':'0', &str); - kputc('\n', &str); -- fwrite(str.s,sizeof(char),str.l,args->fh_sites); -+ if ( fwrite(str.s,sizeof(char),str.l,args->fh_sites)!=str.l ) -+ error("[%s] Error: failed to write %d bytes to %s\n", __func__,(int)str.l,args->output_fname ? 
args->output_fname : "standard output"); - } - - if ( args->prefix ) -@@ -226,9 +228,15 @@ - if ( args->isec_op==OP_VENN && ret==3 ) - { - if ( !args->nwrite || args->write[0] ) -- bcf_write1(args->fh_out[2], bcf_sr_get_header(files,0), bcf_sr_get_line(files,0)); -+ { -+ if ( bcf_write1(args->fh_out[2], bcf_sr_get_header(files,0), bcf_sr_get_line(files,0))!=0 ) -+ error("[%s] Error: cannot write\n", __func__); -+ } - if ( !args->nwrite || args->write[1] ) -- bcf_write1(args->fh_out[3], bcf_sr_get_header(files,1), bcf_sr_get_line(files,1)); -+ { -+ if ( bcf_write1(args->fh_out[3], bcf_sr_get_header(files,1), bcf_sr_get_line(files,1))!=0 ) -+ error("[%s] Error: cannot write\n", __func__); -+ } - } - else - { -@@ -236,13 +244,13 @@ - { - if ( !bcf_sr_has_line(files,i) ) continue; - if ( args->write && !args->write[i] ) continue; -- bcf_write1(args->fh_out[i], files->readers[i].header, files->readers[i].buffer[0]); -+ if ( bcf_write1(args->fh_out[i], files->readers[i].header, files->readers[i].buffer[0])!=0 ) error("[%s] Error: cannot write\n", __func__); - } - } - } - } - if ( str.s ) free(str.s); -- if ( out_fh ) hts_close(out_fh); -+ if ( out_fh && hts_close(out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname? 
args->output_fname : "-"); - } - - static void add_filter(args_t *args, char *expr, int logic) -@@ -352,7 +360,7 @@ - if ( !args->fh_out[i] ) error("Could not open %s\n", args->fnames[i]); \ - if ( args->n_threads ) hts_set_threads(args->fh_out[i], args->n_threads); \ - if (args->record_cmd_line) bcf_hdr_append_version(args->files->readers[j].header,args->argc,args->argv,"bcftools_isec"); \ -- bcf_hdr_write(args->fh_out[i], args->files->readers[j].header); \ -+ if ( bcf_hdr_write(args->fh_out[i], args->files->readers[j].header)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->fnames[i]); \ - } - if ( !args->nwrite || args->write[0] ) - { -@@ -425,7 +433,7 @@ - for (i=0; ifnames[i] ) continue; -- hts_close(args->fh_out[i]); -+ if ( hts_close(args->fh_out[i])!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->fnames[i]); - if ( args->output_type==FT_VCF_GZ ) - { - tbx_conf_t conf = tbx_conf_vcf; -@@ -465,7 +473,7 @@ - fprintf(stderr, " -R, --regions-file restrict to regions listed in a file\n"); - fprintf(stderr, " -t, --targets similar to -r but streams rather than index-jumps\n"); - fprintf(stderr, " -T, --targets-file similar to -R but streams rather than index-jumps\n"); -- fprintf(stderr, " --threads number of extra output compression threads [0]\n"); -+ fprintf(stderr, " --threads use multithreading with worker threads [0]\n"); - fprintf(stderr, " -w, --write list of files to write with -p given as 1-based indexes. 
By default, all files are written\n"); - fprintf(stderr, "\n"); - fprintf(stderr, "Examples:\n"); -@@ -478,6 +486,9 @@ - fprintf(stderr, " # Extract and write records from A shared by both A and B using exact allele match\n"); - fprintf(stderr, " bcftools isec A.vcf.gz B.vcf.gz -p dir -n =2 -w 1\n"); - fprintf(stderr, "\n"); -+ fprintf(stderr, " # Extract and write records from C found in A and C but not in B\n"); -+ fprintf(stderr, " bcftools isec A.vcf.gz B.vcf.gz C.vcf.gz -p dir -n~101 -w 3\n"); -+ fprintf(stderr, "\n"); - fprintf(stderr, " # Extract records private to A or B comparing by position only\n"); - fprintf(stderr, " bcftools isec A.vcf.gz B.vcf.gz -p dir -n -1 -c all\n"); - fprintf(stderr, "\n"); -@@ -540,7 +551,9 @@ - else error("The --collapse string \"%s\" not recognised.\n", optarg); - break; - case 'f': args->files->apply_filters = optarg; break; -- case 'C': args->isec_op = OP_COMPLEMENT; break; -+ case 'C': -+ if ( args->isec_op!=0 && args->isec_op!=OP_COMPLEMENT ) error("Error: either -C or -n should be given, not both.\n"); -+ args->isec_op = OP_COMPLEMENT; break; - case 'r': args->regions_list = optarg; break; - case 'R': args->regions_list = optarg; regions_is_file = 1; break; - case 't': args->targets_list = optarg; break; -@@ -551,6 +564,8 @@ - case 'e': add_filter(args, optarg, FLT_EXCLUDE); break; - case 'n': - { -+ if ( args->isec_op!=0 && args->isec_op==OP_COMPLEMENT ) error("Error: either -C or -n should be given, not both.\n"); -+ if ( args->isec_op!=0 ) error("Error: -n should be given only once.\n"); - char *p = optarg; - if ( *p=='-' ) { args->isec_op = OP_MINUS; p++; } - else if ( *p=='+' ) { args->isec_op = OP_PLUS; p++; } -@@ -565,7 +580,7 @@ - case 9 : args->n_threads = strtol(optarg, 0, 0); break; - case 8 : args->record_cmd_line = 0; break; - case 'h': -- case '?': usage(); -+ case '?': usage(); break; - default: error("Unknown argument: %s\n", optarg); - } - } ---- python-pysam.orig/bcftools/vcfisec.c.pysam.c -+++ 
python-pysam/bcftools/vcfisec.c.pysam.c -@@ -2,7 +2,7 @@ - - /* vcfisec.c -- Create intersections, unions and complements of VCF files. - -- Copyright (C) 2012-2014 Genome Research Ltd. -+ Copyright (C) 2012-2019 Genome Research Ltd. - - Author: Petr Danecek - -@@ -35,6 +35,7 @@ - #include - #include - #include -+#include - #include "bcftools.h" - #include "filter.h" - -@@ -146,7 +147,7 @@ - if ( out_fh == NULL ) error("Can't write to %s: %s\n", args->output_fname? args->output_fname : "standard output", strerror(errno)); - if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads); - if (args->record_cmd_line) bcf_hdr_append_version(files->readers[args->iwrite].header,args->argc,args->argv,"bcftools_isec"); -- bcf_hdr_write(out_fh, files->readers[args->iwrite].header); -+ if ( bcf_hdr_write(out_fh, files->readers[args->iwrite].header)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname?args->output_fname:"standard output"); - } - if ( !args->nwrite && !out_std && !args->prefix ) - fprintf(bcftools_stderr,"Note: -w option not given, printing list of sites...\n"); -@@ -197,8 +198,8 @@ - - if ( out_std ) - { -- if ( bcf_sr_has_line(files,args->iwrite) ) -- bcf_write1(out_fh, files->readers[args->iwrite].header, files->readers[args->iwrite].buffer[0]); -+ if ( bcf_sr_has_line(files,args->iwrite) && bcf_write1(out_fh, files->readers[args->iwrite].header, files->readers[args->iwrite].buffer[0])!=0 ) -+ error("[%s] Error: cannot write to %s\n", __func__, args->output_fname ? args->output_fname : "standard output"); - continue; - } - else if ( args->fh_sites ) -@@ -220,7 +221,8 @@ - for (i=0; inreaders; i++) - kputc(bcf_sr_has_line(files,i)?'1':'0', &str); - kputc('\n', &str); -- fwrite(str.s,sizeof(char),str.l,args->fh_sites); -+ if ( fwrite(str.s,sizeof(char),str.l,args->fh_sites)!=str.l ) -+ error("[%s] Error: failed to write %d bytes to %s\n", __func__,(int)str.l,args->output_fname ? 
args->output_fname : "standard output"); - } - - if ( args->prefix ) -@@ -228,9 +230,15 @@ - if ( args->isec_op==OP_VENN && ret==3 ) - { - if ( !args->nwrite || args->write[0] ) -- bcf_write1(args->fh_out[2], bcf_sr_get_header(files,0), bcf_sr_get_line(files,0)); -+ { -+ if ( bcf_write1(args->fh_out[2], bcf_sr_get_header(files,0), bcf_sr_get_line(files,0))!=0 ) -+ error("[%s] Error: cannot write\n", __func__); -+ } - if ( !args->nwrite || args->write[1] ) -- bcf_write1(args->fh_out[3], bcf_sr_get_header(files,1), bcf_sr_get_line(files,1)); -+ { -+ if ( bcf_write1(args->fh_out[3], bcf_sr_get_header(files,1), bcf_sr_get_line(files,1))!=0 ) -+ error("[%s] Error: cannot write\n", __func__); -+ } - } - else - { -@@ -238,13 +246,13 @@ - { - if ( !bcf_sr_has_line(files,i) ) continue; - if ( args->write && !args->write[i] ) continue; -- bcf_write1(args->fh_out[i], files->readers[i].header, files->readers[i].buffer[0]); -+ if ( bcf_write1(args->fh_out[i], files->readers[i].header, files->readers[i].buffer[0])!=0 ) error("[%s] Error: cannot write\n", __func__); - } - } - } - } - if ( str.s ) free(str.s); -- if ( out_fh ) hts_close(out_fh); -+ if ( out_fh && hts_close(out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname? 
args->output_fname : "-"); - } - - static void add_filter(args_t *args, char *expr, int logic) -@@ -354,7 +362,7 @@ - if ( !args->fh_out[i] ) error("Could not open %s\n", args->fnames[i]); \ - if ( args->n_threads ) hts_set_threads(args->fh_out[i], args->n_threads); \ - if (args->record_cmd_line) bcf_hdr_append_version(args->files->readers[j].header,args->argc,args->argv,"bcftools_isec"); \ -- bcf_hdr_write(args->fh_out[i], args->files->readers[j].header); \ -+ if ( bcf_hdr_write(args->fh_out[i], args->files->readers[j].header)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->fnames[i]); \ - } - if ( !args->nwrite || args->write[0] ) - { -@@ -427,7 +435,7 @@ - for (i=0; ifnames[i] ) continue; -- hts_close(args->fh_out[i]); -+ if ( hts_close(args->fh_out[i])!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->fnames[i]); - if ( args->output_type==FT_VCF_GZ ) - { - tbx_conf_t conf = tbx_conf_vcf; -@@ -467,7 +475,7 @@ - fprintf(bcftools_stderr, " -R, --regions-file restrict to regions listed in a file\n"); - fprintf(bcftools_stderr, " -t, --targets similar to -r but streams rather than index-jumps\n"); - fprintf(bcftools_stderr, " -T, --targets-file similar to -R but streams rather than index-jumps\n"); -- fprintf(bcftools_stderr, " --threads number of extra output compression threads [0]\n"); -+ fprintf(bcftools_stderr, " --threads use multithreading with worker threads [0]\n"); - fprintf(bcftools_stderr, " -w, --write list of files to write with -p given as 1-based indexes. 
By default, all files are written\n"); - fprintf(bcftools_stderr, "\n"); - fprintf(bcftools_stderr, "Examples:\n"); -@@ -480,6 +488,9 @@ - fprintf(bcftools_stderr, " # Extract and write records from A shared by both A and B using exact allele match\n"); - fprintf(bcftools_stderr, " bcftools isec A.vcf.gz B.vcf.gz -p dir -n =2 -w 1\n"); - fprintf(bcftools_stderr, "\n"); -+ fprintf(bcftools_stderr, " # Extract and write records from C found in A and C but not in B\n"); -+ fprintf(bcftools_stderr, " bcftools isec A.vcf.gz B.vcf.gz C.vcf.gz -p dir -n~101 -w 3\n"); -+ fprintf(bcftools_stderr, "\n"); - fprintf(bcftools_stderr, " # Extract records private to A or B comparing by position only\n"); - fprintf(bcftools_stderr, " bcftools isec A.vcf.gz B.vcf.gz -p dir -n -1 -c all\n"); - fprintf(bcftools_stderr, "\n"); -@@ -542,7 +553,9 @@ - else error("The --collapse string \"%s\" not recognised.\n", optarg); - break; - case 'f': args->files->apply_filters = optarg; break; -- case 'C': args->isec_op = OP_COMPLEMENT; break; -+ case 'C': -+ if ( args->isec_op!=0 && args->isec_op!=OP_COMPLEMENT ) error("Error: either -C or -n should be given, not both.\n"); -+ args->isec_op = OP_COMPLEMENT; break; - case 'r': args->regions_list = optarg; break; - case 'R': args->regions_list = optarg; regions_is_file = 1; break; - case 't': args->targets_list = optarg; break; -@@ -553,6 +566,8 @@ - case 'e': add_filter(args, optarg, FLT_EXCLUDE); break; - case 'n': - { -+ if ( args->isec_op!=0 && args->isec_op==OP_COMPLEMENT ) error("Error: either -C or -n should be given, not both.\n"); -+ if ( args->isec_op!=0 ) error("Error: -n should be given only once.\n"); - char *p = optarg; - if ( *p=='-' ) { args->isec_op = OP_MINUS; p++; } - else if ( *p=='+' ) { args->isec_op = OP_PLUS; p++; } -@@ -567,7 +582,7 @@ - case 9 : args->n_threads = strtol(optarg, 0, 0); break; - case 8 : args->record_cmd_line = 0; break; - case 'h': -- case '?': usage(); -+ case '?': usage(); break; - default: 
error("Unknown argument: %s\n", optarg); - } - } ---- python-pysam.orig/bcftools/vcfmerge.c -+++ python-pysam/bcftools/vcfmerge.c -@@ -1,6 +1,6 @@ - /* vcfmerge.c -- Merge multiple VCF/BCF files to create one multi-sample file. - -- Copyright (C) 2012-2016 Genome Research Ltd. -+ Copyright (C) 2012-2019 Genome Research Ltd. - - Author: Petr Danecek - -@@ -28,6 +28,7 @@ - #include - #include - #include -+#include - #include - #include - #include -@@ -84,7 +85,7 @@ - typedef struct - { - bcf1_t *line; -- int end, active; -+ int end, active; // end: 0-based INFO/END - } - gvcf_aux_t; - -@@ -121,13 +122,16 @@ - int nfmt_map; // number of rows in the fmt_map array - int *agr_map, nagr_map, magr_map; // mapping between Number=AGR element indexes - void *tmp_arr; -- int ntmp_arr; -+ size_t ntmp_arr; - buffer_t *buf; - AGR_info_t *AGR_info; - int nAGR_info, mAGR_info; - bcf_srs_t *files; -- int gvcf_min, gvcf_break; // min buffered gvcf END position (NB: gvcf_min is 1-based) or 0 if no active lines are present -- gvcf_aux_t *gvcf; // buffer of gVCF lines -+ int gvcf_min, // min buffered gvcf END position (NB: gvcf_min is 1-based) or 0 if no active lines are present -+ gvcf_break; // 0-based position of a next record which breaks a gVCF block -+ gvcf_aux_t *gvcf; // buffer of gVCF lines, for each reader one line -+ int nout_smpl; -+ kstring_t *str; - } - maux_t; - -@@ -397,7 +401,7 @@ - { - int msize = args->maux->ntmp_arr / rule->type_size; - int ret = bcf_get_info_values(hdr, line, rule->hdr_tag, &args->maux->tmp_arr, &msize, rule->type); -- if ( ret<=0 ) error("FIXME: error parsing %s at %s:%d .. %d\n", rule->hdr_tag,bcf_seqname(hdr,line),line->pos+1,ret); -+ if ( ret<=0 ) error("FIXME: error parsing %s at %s:%"PRId64" .. 
%d\n", rule->hdr_tag,bcf_seqname(hdr,line),(int64_t) line->pos+1,ret); - args->maux->ntmp_arr = msize * rule->type_size; - - rule->nblocks++; -@@ -416,7 +420,7 @@ - int i, j; - if ( var_len==BCF_VL_A ) - { -- if ( ret!=line->n_allele-1 ) error("Wrong number of %s fields at %s:%d\n",rule->hdr_tag,bcf_seqname(hdr,line),line->pos+1); -+ if ( ret!=line->n_allele-1 ) error("Wrong number of %s fields at %s:%"PRId64"\n",rule->hdr_tag,bcf_seqname(hdr,line),(int64_t) line->pos+1); - args->maux->nagr_map = ret; - hts_expand(int,args->maux->nagr_map,args->maux->magr_map,args->maux->agr_map); - // create mapping from source file ALT indexes to dst file indexes -@@ -425,7 +429,7 @@ - } - else if ( var_len==BCF_VL_R ) - { -- if ( ret!=line->n_allele ) error("Wrong number of %s fields at %s:%d\n",rule->hdr_tag,bcf_seqname(hdr,line),line->pos+1); -+ if ( ret!=line->n_allele ) error("Wrong number of %s fields at %s:%"PRId64"\n",rule->hdr_tag,bcf_seqname(hdr,line),(int64_t) line->pos+1); - args->maux->nagr_map = ret; - hts_expand(int,args->maux->nagr_map,args->maux->magr_map,args->maux->agr_map); - for (i=0; imaux->agr_map[i] = als->map[i]; -@@ -460,7 +464,7 @@ - else - { - if ( rule->nblocks>1 && ret!=rule->block_size ) -- error("Mismatch in number of values for INFO/%s at %s:%d\n", rule->hdr_tag,bcf_seqname(hdr,line),line->pos+1); -+ error("Mismatch in number of values for INFO/%s at %s:%"PRId64"\n", rule->hdr_tag,bcf_seqname(hdr,line),(int64_t) line->pos+1); - rule->block_size = ret; - args->maux->nagr_map = 0; - } -@@ -501,20 +505,24 @@ - int i; - for (i=0; isamples[i]; -- if ( bcf_hdr_id2int(hw, BCF_DT_SAMPLE, name)!=-1 ) -+ char *rmme = NULL, *name = hr->samples[i]; -+ while ( bcf_hdr_id2int(hw, BCF_DT_SAMPLE, name)!=-1 ) - { - // there is a sample with the same name - if ( !force_samples ) error("Error: Duplicate sample names (%s), use --force-samples to proceed anyway.\n", name); - -- int len = strlen(hr->samples[i]) + strlen(clash_prefix) + 1; -- name = (char*) 
malloc(sizeof(char)*(len+1)); -- sprintf(name,"%s:%s",clash_prefix,hr->samples[i]); -- bcf_hdr_add_sample(hw,name); -- free(name); -+ // Resolve conflicting samples names. For example, replace: -+ // A + A with A,2:A -+ // A,2:A + A with A,2:A,2:2:A -+ -+ int len = strlen(name) + strlen(clash_prefix) + 1; -+ char *tmp = (char*) malloc(sizeof(char)*(len+1)); -+ sprintf(tmp,"%s:%s",clash_prefix,name); -+ free(rmme); -+ rmme = name = tmp; - } -- else -- bcf_hdr_add_sample(hw,name); -+ bcf_hdr_add_sample(hw,name); -+ free(rmme); - } - } - -@@ -677,6 +685,8 @@ - int i, n_smpl = 0; - for (i=0; in; i++) - n_smpl += bcf_hdr_nsamples(files->readers[i].header); -+ ma->nout_smpl = n_smpl; -+ assert( n_smpl==bcf_hdr_nsamples(args->out_hdr) ); - if ( args->do_gvcf ) - { - ma->gvcf = (gvcf_aux_t*) calloc(ma->n,sizeof(gvcf_aux_t)); -@@ -688,11 +698,14 @@ - ma->buf = (buffer_t*) calloc(ma->n,sizeof(buffer_t)); - for (i=0; in; i++) - ma->buf[i].rid = -1; -+ ma->str = (kstring_t*) calloc(n_smpl,sizeof(kstring_t)); - return ma; - } - void maux_destroy(maux_t *ma) - { - int i,j; -+ for (i=0; inout_smpl; i++) free(ma->str[i].s); -+ free(ma->str); - for (i=0; imals; i++) - { - free(ma->als[i]); -@@ -776,7 +789,7 @@ - } - ma->buf[i].end = j; - ma->buf[i].cur = -1; -- if ( ma->buf[i].beg < ma->buf[i].end ) -+ if ( ma->buf[i].beg < ma->buf[i].end ) - { - ma->buf[i].lines = ma->files->readers[i].buffer; - if ( ma->gvcf ) ma->gvcf[i].active = 0; // gvcf block cannot overlap with the next record -@@ -1008,7 +1021,7 @@ - int end_src = start_src; - while ( end_srcmbuf = tmp.m; agr->nbuf = tmp.l; agr->buf = (uint8_t*)tmp.s; - } - else -- error("Not ready for type [%d]: %s at %d\n", info->type,agr->hdr_tag,line->pos+1); -+ error("Not ready for type [%d]: %s at %"PRId64"\n", info->type,agr->hdr_tag,(int64_t) line->pos+1); - } - - if ( info->type==BCF_BT_INT8 || info->type==BCF_BT_INT16 || info->type==BCF_BT_INT32 || info->type==BCF_BT_FLOAT ) -@@ -1137,7 +1150,7 @@ - { - int ret = 
copy_string_field((char*)info->vptr, iori-ifrom, info->len, &tmp, als->map[iori]-ifrom); - if ( ret ) -- error("Error at %s:%d: wrong number of fields in %s?\n", bcf_seqname(hdr,line),line->pos+1,agr->hdr_tag); -+ error("Error at %s:%"PRId64": wrong number of fields in %s?\n", bcf_seqname(hdr,line),(int64_t) line->pos+1,agr->hdr_tag); - } - } - else -@@ -1153,7 +1166,7 @@ - int knew = bcf_alleles2gt(inew,jnew); - int ret = copy_string_field((char*)info->vptr, kori, info->len, &tmp, knew); - if ( ret ) -- error("Error at %s:%d: wrong number of fields in %s?\n", bcf_seqname(hdr,line),line->pos+1,agr->hdr_tag); -+ error("Error at %s:%"PRId64": wrong number of fields in %s?\n", bcf_seqname(hdr,line),(int64_t) line->pos+1,agr->hdr_tag); - } - } - } -@@ -1227,7 +1240,7 @@ - } - kitr = kh_get(strdict, tmph, key); - int idx = kh_val(tmph, kitr); -- if ( idx<0 ) error("Error occurred while processing INFO tag \"%s\" at %s:%d\n", key,bcf_seqname(hdr,line),line->pos+1); -+ if ( idx<0 ) error("Error occurred while processing INFO tag \"%s\" at %s:%"PRId64"\n", key,bcf_seqname(hdr,line),(int64_t) line->pos+1); - merge_AGR_info_tag(hdr, line,inf,len,&ma->buf[i].rec[irec],&ma->AGR_info[idx]); - continue; - } -@@ -1318,6 +1331,7 @@ - bcf_hdr_t *out_hdr = args->out_hdr; - maux_t *ma = args->maux; - int i, ismpl = 0, nsamples = bcf_hdr_nsamples(out_hdr); -+ static int warned = 0; - - int nsize = 0, msize = sizeof(int32_t); - for (i=0; inreaders; i++) -@@ -1333,6 +1347,13 @@ - { - ma->ntmp_arr = nsamples*nsize*msize; - ma->tmp_arr = realloc(ma->tmp_arr, ma->ntmp_arr); -+ if ( !ma->tmp_arr ) error("Could not allocate %zu bytes\n",ma->ntmp_arr); -+ if ( ma->ntmp_arr > 2147483647 ) -+ { -+ if ( !warned ) fprintf(stderr,"Warning: Too many genotypes at %s:%"PRId64", requires %zu bytes, skipping.\n", bcf_seqname(out_hdr,out),(int64_t) out->pos+1,ma->ntmp_arr); -+ warned = 1; -+ return; -+ } - } - memset(ma->smpl_ploidy,0,nsamples*sizeof(int)); - -@@ -1412,15 +1433,126 @@ - 
bcf_update_format_int32(out_hdr, out, "GT", (int32_t*)ma->tmp_arr, nsamples*nsize); - } - -+void merge_format_string(args_t *args, const char *key, bcf_fmt_t **fmt_map, bcf1_t *out, int length, int nsize) -+{ -+ bcf_srs_t *files = args->files; -+ bcf_hdr_t *out_hdr = args->out_hdr; -+ maux_t *ma = args->maux; -+ int i,j, nsamples = bcf_hdr_nsamples(out_hdr); -+ static int warned = 0; -+ -+ // initialize empty strings, a dot for each value, e.g. ".,.,." -+ int nmax = 0; -+ for (i=0; istr[i]; -+ if ( length==BCF_VL_FIXED || length==BCF_VL_VAR ) -+ { -+ str->l = 1; -+ ks_resize(str, str->l+1); -+ str->s[0] = '.'; -+ } -+ else -+ { -+ str->l = nsize*2 - 1; -+ ks_resize(str, str->l+1); -+ str->s[0] = '.'; -+ for (j=1; js[j*2-1] = ',', str->s[j*2] = '.'; -+ } -+ str->s[str->l] = 0; -+ if ( nmax < str->l ) nmax = str->l; -+ } -+ -+ // fill in values for each sample -+ int ismpl = 0; -+ for (i=0; inreaders; i++) -+ { -+ bcf_sr_t *reader = &files->readers[i]; -+ bcf_hdr_t *hdr = reader->header; -+ bcf_fmt_t *fmt_ori = fmt_map[i]; -+ if ( !fmt_ori ) -+ { -+ // the field is not present in this file -+ ismpl += bcf_hdr_nsamples(hdr); -+ continue; -+ } -+ -+ bcf1_t *line = maux_get_line(args, i); -+ int irec = ma->buf[i].cur; -+ char *src = (char*) fmt_ori->p; -+ -+ if ( length==BCF_VL_FIXED || length==BCF_VL_VAR || (line->n_allele==out->n_allele && !ma->buf[i].rec[irec].als_differ) ) -+ { -+ // alleles unchanged, copy over -+ for (j=0; jstr[ismpl++]; -+ str->l = 0; -+ kputsn(src, fmt_ori->n, str); -+ if ( nmax < str->l ) nmax = str->l; -+ src += fmt_ori->n; -+ } -+ continue; -+ } -+ // NB, what is below is not the fastest way, copy_string_field() keeps -+ // finding the indexes repeatedly at multiallelic sites -+ if ( length==BCF_VL_A || length==BCF_VL_R ) -+ { -+ int ifrom = length==BCF_VL_A ? 
1 : 0; -+ for (j=0; jstr[ismpl++]; -+ int iori,inew; -+ for (iori=ifrom; iorin_allele; iori++) -+ { -+ inew = ma->buf[i].rec[irec].map[iori] - ifrom; -+ int ret = copy_string_field(src, iori - ifrom, fmt_ori->size, str, inew); -+ if ( ret<-1 ) error("[E::%s] fixme: internal error at %s:%"PRId64" .. %d\n",__func__,bcf_seqname(hdr,line),(int64_t) line->pos+1,ret); -+ } -+ src += fmt_ori->size; -+ } -+ continue; -+ } -+ assert( length==BCF_VL_G ); -+ error("[E::%s] Merging of Number=G FORMAT strings (in your case FORMAT/%s) is not supported yet, sorry!\n" -+ "Please open an issue on github if this feature is essential for you. However, note that using FORMAT strings is not\n" -+ "a good idea in general - it is slow to parse and does not compress well, it is better to use integer codes instead.\n" -+ "If you don't really need it, use `bcftools annotate -x` to remove the annotation before merging.\n", __func__,key); -+ } -+ // update the record -+ if ( ma->ntmp_arr < nsamples*nmax ) -+ { -+ ma->ntmp_arr = nsamples*nmax; -+ ma->tmp_arr = realloc(ma->tmp_arr, ma->ntmp_arr); -+ if ( !ma->tmp_arr ) error("Could not allocate %zu bytes\n",ma->ntmp_arr); -+ if ( ma->ntmp_arr > 2147483647 ) -+ { -+ if ( !warned ) fprintf(stderr,"Warning: The row size is too big for FORMAT/%s at %s:%"PRId64", requires %zu bytes, skipping.\n", key,bcf_seqname(out_hdr,out),(int64_t) out->pos+1,ma->ntmp_arr); -+ warned = 1; -+ return; -+ } -+ } -+ char *tgt = (char*) ma->tmp_arr; -+ for (i=0; istr[i].s, ma->str[i].l); -+ if ( ma->str[i].l < nmax ) memset(tgt + ma->str[i].l, 0, nmax - ma->str[i].l); -+ tgt += nmax; -+ } -+ bcf_update_format_char(out_hdr, out, key, (float*)ma->tmp_arr, nsamples*nmax); -+} -+ - void merge_format_field(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out) - { - bcf_srs_t *files = args->files; - bcf_hdr_t *out_hdr = args->out_hdr; - maux_t *ma = args->maux; - int i, ismpl = 0, nsamples = bcf_hdr_nsamples(out_hdr); -+ static int warned = 0; - - const char *key = NULL; -- int 
nsize = 0, length = BCF_VL_FIXED, type = -1; -+ size_t nsize = 0, length = BCF_VL_FIXED; -+ int type = -1; - for (i=0; inreaders; i++) - { - if ( !maux_get_line(args,i) ) continue; -@@ -1447,12 +1579,24 @@ - } - if ( fmt_map[i]->n > nsize ) nsize = fmt_map[i]->n; - } -+ if ( type==BCF_BT_CHAR ) -+ { -+ merge_format_string(args, key, fmt_map, out, length, nsize); -+ return; -+ } - -- int msize = sizeof(float)>sizeof(int32_t) ? sizeof(float) : sizeof(int32_t); -+ size_t msize = sizeof(float)>sizeof(int32_t) ? sizeof(float) : sizeof(int32_t); - if ( ma->ntmp_arr < nsamples*nsize*msize ) - { - ma->ntmp_arr = nsamples*nsize*msize; - ma->tmp_arr = realloc(ma->tmp_arr, ma->ntmp_arr); -+ if ( !ma->tmp_arr ) error("Failed to allocate %zu bytes at %s:%"PRId64" for FORMAT/%s\n", ma->ntmp_arr,bcf_seqname(args->out_hdr,out),(int64_t) out->pos+1,key); -+ if ( ma->ntmp_arr > 2147483647 ) -+ { -+ if ( !warned ) fprintf(stderr,"Warning: The row size is too big for FORMAT/%s at %s:%"PRId64", requires %zu bytes, skipping.\n", key,bcf_seqname(out_hdr,out),(int64_t) out->pos+1,ma->ntmp_arr); -+ warned = 1; -+ return; -+ } - } - - // Fill the temp array for all samples by collecting values from all files -@@ -1463,6 +1607,7 @@ - bcf_fmt_t *fmt_ori = fmt_map[i]; - bcf1_t *line = maux_get_line(args, i); - int irec = ma->buf[i].cur; -+ - if ( fmt_ori ) - { - type = fmt_ori->type; -@@ -1471,23 +1616,23 @@ - { - // if all fields are missing then n==1 is valid - if ( fmt_ori->n!=1 && fmt_ori->n != nals_ori*(nals_ori+1)/2 && fmt_map[i]->n != nals_ori ) -- error("Incorrect number of FORMAT/%s values at %s:%d, cannot merge. The tag is defined as Number=G, but found\n" -+ error("Incorrect number of FORMAT/%s values at %s:%"PRId64", cannot merge. The tag is defined as Number=G, but found\n" - "%d values and %d alleles. 
See also http://samtools.github.io/bcftools/howtos/FAQ.html#incorrect-nfields\n", -- key,bcf_seqname(args->out_hdr,out),out->pos+1,fmt_ori->n,nals_ori); -+ key,bcf_seqname(args->out_hdr,out),(int64_t) out->pos+1,fmt_ori->n,nals_ori); - } - else if ( length==BCF_VL_A ) - { - if ( fmt_ori->n!=1 && fmt_ori->n != nals_ori-1 ) -- error("Incorrect number of FORMAT/%s values at %s:%d, cannot merge. The tag is defined as Number=A, but found\n" -+ error("Incorrect number of FORMAT/%s values at %s:%"PRId64", cannot merge. The tag is defined as Number=A, but found\n" - "%d values and %d alleles. See also http://samtools.github.io/bcftools/howtos/FAQ.html#incorrect-nfields\n", -- key,bcf_seqname(args->out_hdr,out),out->pos+1,fmt_ori->n,nals_ori); -+ key,bcf_seqname(args->out_hdr,out),(int64_t) out->pos+1,fmt_ori->n,nals_ori); - } - else if ( length==BCF_VL_R ) - { - if ( fmt_ori->n!=1 && fmt_ori->n != nals_ori ) -- error("Incorrect number of FORMAT/%s values at %s:%d, cannot merge. The tag is defined as Number=R, but found\n" -+ error("Incorrect number of FORMAT/%s values at %s:%"PRId64", cannot merge. The tag is defined as Number=R, but found\n" - "%d values and %d alleles. 
See also http://samtools.github.io/bcftools/howtos/FAQ.html#incorrect-nfields\n", -- key,bcf_seqname(args->out_hdr,out),out->pos+1,fmt_ori->n,nals_ori); -+ key,bcf_seqname(args->out_hdr,out),(int64_t) out->pos+1,fmt_ori->n,nals_ori); - } - } - -@@ -1619,15 +1764,12 @@ - case BCF_BT_INT16: BRANCH(int32_t, int16_t, *src==bcf_int16_missing, *src==bcf_int16_vector_end, *tgt=bcf_int32_missing, *tgt=bcf_int32_vector_end); break; - case BCF_BT_INT32: BRANCH(int32_t, int32_t, *src==bcf_int32_missing, *src==bcf_int32_vector_end, *tgt=bcf_int32_missing, *tgt=bcf_int32_vector_end); break; - case BCF_BT_FLOAT: BRANCH(float, float, bcf_float_is_missing(*src), bcf_float_is_vector_end(*src), bcf_float_set_missing(*tgt), bcf_float_set_vector_end(*tgt)); break; -- case BCF_BT_CHAR: BRANCH(uint8_t, uint8_t, *src==bcf_str_missing, *src==bcf_str_vector_end, *tgt=bcf_str_missing, *tgt=bcf_str_vector_end); break; - default: error("Unexpected case: %d, %s\n", type, key); - } - #undef BRANCH - } - if ( type==BCF_BT_FLOAT ) - bcf_update_format_float(out_hdr, out, key, (float*)ma->tmp_arr, nsamples*nsize); -- else if ( type==BCF_BT_CHAR ) -- bcf_update_format_char(out_hdr, out, key, (float*)ma->tmp_arr, nsamples*nsize); - else - bcf_update_format_int32(out_hdr, out, key, (int32_t*)ma->tmp_arr, nsamples*nsize); - } -@@ -1718,6 +1860,7 @@ - { - if ( !gaux[i].active ) continue; - bcf1_t *line = maux_get_line(args, i); -+ if ( !line ) continue; - int irec = maux->buf[i].cur; - - hts_expand(int, line->n_allele, maux->buf[i].rec[irec].mmap, maux->buf[i].rec[irec].map); -@@ -1739,7 +1882,7 @@ - if ( !maux->als ) - { - bcf_hdr_t *hdr = bcf_sr_get_header(args->files,i); -- error("Failed to merge alleles at %s:%d\n",bcf_seqname(hdr,line),line->pos+1); -+ error("Failed to merge alleles at %s:%"PRId64"\n",bcf_seqname(hdr,line),(int64_t) line->pos+1); - } - } - } -@@ -1748,6 +1891,7 @@ - /* - Output staged gVCF blocks, end is the last position of the block. 
Assuming - gaux[i].active flags are set and maux_get_line returns correct lines. -+ Both start,end coordinates are 0-based. - */ - void gvcf_write_block(args_t *args, int start, int end) - { -@@ -1757,7 +1901,7 @@ - assert(gaux); - - // Update POS -- int min = INT_MAX; -+ int min = INT_MAX; // the minimum active gVCF INFO/END (0-based) - char ref = 'N'; - for (i=0; ifiles->nreaders; i++) - { -@@ -1778,7 +1922,7 @@ - if ( min > gaux[i].end ) min = gaux[i].end; - } - // Check for valid gVCF blocks in this region -- if ( min==INT_MAX ) -+ if ( min==INT_MAX ) // this probably should not happen - { - assert(0); - maux->gvcf_min = 0; -@@ -1814,7 +1958,7 @@ - } - else - bcf_update_info_int32(args->out_hdr, out, "END", NULL, 0); -- bcf_write1(args->out_fh, args->out_hdr, out); -+ if ( bcf_write1(args->out_fh, args->out_hdr, out)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); - bcf_clear1(out); - - -@@ -1872,7 +2016,7 @@ - } - - // When called on a region, trim the blocks accordingly -- int start = maux->gvcf_break>=0 ? maux->gvcf_break + 1 : maux->pos; -+ int start = maux->gvcf_break>=0 ? maux->gvcf_break + 1 : maux->pos; // the start of a new gvcf block to output - if ( args->regs ) - { - int rstart = -1, rend = -1; -@@ -1892,7 +2036,7 @@ - // does the block end before the new line or is it interrupted? - int tmp = maux->gvcf_min < flush_until ? maux->gvcf_min : flush_until; - if ( start > tmp-1 ) break; -- gvcf_write_block(args,start,tmp-1); // gvcf_min is 1-based -+ gvcf_write_block(args,start,tmp-1); // gvcf_min is 1-based, passing 0-based coordinates - start = tmp; - } - } -@@ -1901,6 +2045,7 @@ - Check incoming lines for new gVCF blocks, set pointer to the current source - buffer (gvcf or readers). In contrast to gvcf_flush, this function can be - called only after maux_reset as it relies on updated maux buffers. 
-+ The coordinate is 0-based - */ - void gvcf_stage(args_t *args, int pos) - { -@@ -1935,8 +2080,16 @@ - int ret = bcf_get_info_int32(hdr,line,"END",&end,&nend); - if ( ret==1 ) - { -+ if ( end[0] == line->pos + 1 ) // POS and INFO/END are identical, treat as if a normal w/o INFO/END -+ { -+ maux->gvcf_break = line->pos; -+ continue; -+ } -+ if ( end[0] <= line->pos ) error("Error: Incorrect END at %s:%"PRId64" .. END=%d\n", bcf_seqname(hdr,line),(int64_t) line->pos+1,end[0]); -+ - // END is set, this is a new gVCF block. Cache this line in gaux[i] and swap with - // an empty record: the gaux line must be kept until we reach its END. -+ - gaux[i].active = 1; - gaux[i].end = end[0] - 1; - SWAP(bcf1_t*,args->files->readers[i].buffer[irec],gaux[i].line); -@@ -1982,7 +2135,15 @@ - { - // Invalidate pointer to reader's buffer or else gvcf_flush will attempt - // to use the old lines via maux_get_line() -- if ( ma->gvcf && !ma->gvcf[ir].active ) ma->buf[ir].cur = -1; -+ if ( ma->gvcf ) -+ { -+ if ( ma->gvcf[ir].active ) -+ { -+ if ( ma->pos >= ma->gvcf[ir].end ) ma->gvcf[ir].active = 0; -+ else if ( ma->buf[ir].cur==-1 ) ma->buf[ir].cur = ma->buf[ir].beg; // re-activate interrupted gVCF block -+ } -+ if ( !ma->gvcf[ir].active ) ma->buf[ir].cur = -1; -+ } - - bcf_sr_t *reader = bcf_sr_get_reader(args->files,ir); - if ( !reader->nbuffer ) continue; // nothing to clean -@@ -2043,14 +2204,15 @@ - bcf_hdr_t *hdr = bcf_sr_get_header(args->files,i); - const char *chr = bcf_hdr_id2name(hdr, maux->buf[i].rid); - fprintf(stderr,"\t"); -- for (j=maux->buf[i].beg; jbuf[i].end; j++) fprintf(stderr," %s:%d",chr,maux->buf[i].lines[j]->pos+1); -+ for (j=maux->buf[i].beg; jbuf[i].end; j++) fprintf(stderr," %s:%"PRId64,chr,(int64_t) maux->buf[i].lines[j]->pos+1); - } - fprintf(stderr,"\n"); - } -+ fprintf(stderr,"gvcf_min=%d\n", args->maux->gvcf_min); - for (i=0; ifiles->nreaders; i++) - { - fprintf(stderr,"reader %d:\tgvcf_active=%d", i,maux->gvcf[i].active); -- if ( maux->gvcf[i].active 
) fprintf(stderr,"\tpos,end=%d,%d", maux->gvcf[i].line->pos+1,maux->gvcf[i].end+1); -+ if ( maux->gvcf[i].active ) fprintf(stderr,"\tpos,end=%"PRId64",%"PRId64, (int64_t) maux->gvcf[i].line->pos+1,(int64_t) maux->gvcf[i].end+1); - fprintf(stderr,"\n"); - } - fprintf(stderr,"\n"); -@@ -2185,7 +2347,7 @@ - } - // normalize alleles - maux->als = merge_alleles(line->d.allele, line->n_allele, buf->rec[j].map, maux->als, &maux->nals, &maux->mals); -- if ( !maux->als ) error("Failed to merge alleles at %s:%d in %s\n",maux->chr,line->pos+1,reader->fname); -+ if ( !maux->als ) error("Failed to merge alleles at %s:%"PRId64" in %s\n",maux->chr,(int64_t) line->pos+1,reader->fname); - hts_expand0(int, maux->nals, maux->ncnt, maux->cnt); - for (k=1; kn_allele; k++) - maux->cnt[ buf->rec[j].map[k] ]++; // how many times an allele appears in the files -@@ -2286,33 +2448,46 @@ - if ( args->do_gvcf ) - bcf_update_info_int32(args->out_hdr, out, "END", NULL, 0); - merge_format(args, out); -- bcf_write1(args->out_fh, args->out_hdr, out); -+ if ( bcf_write1(args->out_fh, args->out_hdr, out)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); - bcf_clear1(out); - } - - void bcf_hdr_append_version(bcf_hdr_t *hdr, int argc, char **argv, const char *cmd) - { - kstring_t str = {0,0,0}; -- ksprintf(&str,"##%sVersion=%s+htslib-%s\n", cmd, bcftools_version(), hts_version()); -- bcf_hdr_append(hdr,str.s); -+ int e = 0; -+ if (ksprintf(&str,"##%sVersion=%s+htslib-%s\n", cmd, bcftools_version(), hts_version()) < 0) -+ goto fail; -+ if (bcf_hdr_append(hdr,str.s) < 0) -+ goto fail; - - str.l = 0; -- ksprintf(&str,"##%sCommand=%s", cmd, argv[0]); -+ e |= ksprintf(&str,"##%sCommand=%s", cmd, argv[0]) < 0; - int i; - for (i=1; ifiles->nreaders; i++) - { -- char buf[10]; snprintf(buf,10,"%d",i+1); -+ char buf[24]; snprintf(buf,sizeof buf,"%d",i+1); - merge_headers(args->out_hdr, args->files->readers[i].header,buf,args->force_samples); - } - if (args->record_cmd_line) 
bcf_hdr_append_version(args->out_hdr, args->argc, args->argv, "bcftools_merge"); -- bcf_hdr_sync(args->out_hdr); -+ if (bcf_hdr_sync(args->out_hdr) < 0) -+ error_errno("[%s] Failed to update header", __func__); - } - info_rules_init(args); - - bcf_hdr_set_version(args->out_hdr, bcf_hdr_get_version(args->files->readers[0].header)); -- bcf_hdr_write(args->out_fh, args->out_hdr); -+ if ( bcf_hdr_write(args->out_fh, args->out_hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); - if ( args->header_only ) - { - bcf_hdr_destroy(args->out_hdr); -- hts_close(args->out_fh); -+ if ( hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname); - return; - } - -@@ -2379,7 +2555,7 @@ - info_rules_destroy(args); - maux_destroy(args->maux); - bcf_hdr_destroy(args->out_hdr); -- hts_close(args->out_fh); -+ if ( hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname); - bcf_destroy1(args->out_line); - kh_destroy(strdict, args->tmph); - if ( args->tmps.m ) free(args->tmps.s); -@@ -2410,7 +2586,7 @@ - fprintf(stderr, " -O, --output-type 'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]\n"); - fprintf(stderr, " -r, --regions restrict to comma-separated list of regions\n"); - fprintf(stderr, " -R, --regions-file restrict to regions listed in a file\n"); -- fprintf(stderr, " --threads number of extra output compression threads [0]\n"); -+ fprintf(stderr, " --threads use multithreading with worker threads [0]\n"); - fprintf(stderr, "\n"); - exit(1); - } -@@ -2497,7 +2673,7 @@ - case 9 : args->n_threads = strtol(optarg, 0, 0); break; - case 8 : args->record_cmd_line = 0; break; - case 'h': -- case '?': usage(); -+ case '?': usage(); break; - default: error("Unknown argument: %s\n", optarg); - } - } ---- python-pysam.orig/bcftools/vcfmerge.c.pysam.c -+++ python-pysam/bcftools/vcfmerge.c.pysam.c -@@ -2,7 +2,7 @@ - - /* vcfmerge.c -- Merge 
multiple VCF/BCF files to create one multi-sample file. - -- Copyright (C) 2012-2016 Genome Research Ltd. -+ Copyright (C) 2012-2019 Genome Research Ltd. - - Author: Petr Danecek - -@@ -30,6 +30,7 @@ - #include - #include - #include -+#include - #include - #include - #include -@@ -86,7 +87,7 @@ - typedef struct - { - bcf1_t *line; -- int end, active; -+ int end, active; // end: 0-based INFO/END - } - gvcf_aux_t; - -@@ -123,13 +124,16 @@ - int nfmt_map; // number of rows in the fmt_map array - int *agr_map, nagr_map, magr_map; // mapping between Number=AGR element indexes - void *tmp_arr; -- int ntmp_arr; -+ size_t ntmp_arr; - buffer_t *buf; - AGR_info_t *AGR_info; - int nAGR_info, mAGR_info; - bcf_srs_t *files; -- int gvcf_min, gvcf_break; // min buffered gvcf END position (NB: gvcf_min is 1-based) or 0 if no active lines are present -- gvcf_aux_t *gvcf; // buffer of gVCF lines -+ int gvcf_min, // min buffered gvcf END position (NB: gvcf_min is 1-based) or 0 if no active lines are present -+ gvcf_break; // 0-based position of a next record which breaks a gVCF block -+ gvcf_aux_t *gvcf; // buffer of gVCF lines, for each reader one line -+ int nout_smpl; -+ kstring_t *str; - } - maux_t; - -@@ -399,7 +403,7 @@ - { - int msize = args->maux->ntmp_arr / rule->type_size; - int ret = bcf_get_info_values(hdr, line, rule->hdr_tag, &args->maux->tmp_arr, &msize, rule->type); -- if ( ret<=0 ) error("FIXME: error parsing %s at %s:%d .. %d\n", rule->hdr_tag,bcf_seqname(hdr,line),line->pos+1,ret); -+ if ( ret<=0 ) error("FIXME: error parsing %s at %s:%"PRId64" .. 
%d\n", rule->hdr_tag,bcf_seqname(hdr,line),(int64_t) line->pos+1,ret); - args->maux->ntmp_arr = msize * rule->type_size; - - rule->nblocks++; -@@ -418,7 +422,7 @@ - int i, j; - if ( var_len==BCF_VL_A ) - { -- if ( ret!=line->n_allele-1 ) error("Wrong number of %s fields at %s:%d\n",rule->hdr_tag,bcf_seqname(hdr,line),line->pos+1); -+ if ( ret!=line->n_allele-1 ) error("Wrong number of %s fields at %s:%"PRId64"\n",rule->hdr_tag,bcf_seqname(hdr,line),(int64_t) line->pos+1); - args->maux->nagr_map = ret; - hts_expand(int,args->maux->nagr_map,args->maux->magr_map,args->maux->agr_map); - // create mapping from source file ALT indexes to dst file indexes -@@ -427,7 +431,7 @@ - } - else if ( var_len==BCF_VL_R ) - { -- if ( ret!=line->n_allele ) error("Wrong number of %s fields at %s:%d\n",rule->hdr_tag,bcf_seqname(hdr,line),line->pos+1); -+ if ( ret!=line->n_allele ) error("Wrong number of %s fields at %s:%"PRId64"\n",rule->hdr_tag,bcf_seqname(hdr,line),(int64_t) line->pos+1); - args->maux->nagr_map = ret; - hts_expand(int,args->maux->nagr_map,args->maux->magr_map,args->maux->agr_map); - for (i=0; imaux->agr_map[i] = als->map[i]; -@@ -462,7 +466,7 @@ - else - { - if ( rule->nblocks>1 && ret!=rule->block_size ) -- error("Mismatch in number of values for INFO/%s at %s:%d\n", rule->hdr_tag,bcf_seqname(hdr,line),line->pos+1); -+ error("Mismatch in number of values for INFO/%s at %s:%"PRId64"\n", rule->hdr_tag,bcf_seqname(hdr,line),(int64_t) line->pos+1); - rule->block_size = ret; - args->maux->nagr_map = 0; - } -@@ -503,20 +507,24 @@ - int i; - for (i=0; isamples[i]; -- if ( bcf_hdr_id2int(hw, BCF_DT_SAMPLE, name)!=-1 ) -+ char *rmme = NULL, *name = hr->samples[i]; -+ while ( bcf_hdr_id2int(hw, BCF_DT_SAMPLE, name)!=-1 ) - { - // there is a sample with the same name - if ( !force_samples ) error("Error: Duplicate sample names (%s), use --force-samples to proceed anyway.\n", name); - -- int len = strlen(hr->samples[i]) + strlen(clash_prefix) + 1; -- name = (char*) 
malloc(sizeof(char)*(len+1)); -- sprintf(name,"%s:%s",clash_prefix,hr->samples[i]); -- bcf_hdr_add_sample(hw,name); -- free(name); -+ // Resolve conflicting samples names. For example, replace: -+ // A + A with A,2:A -+ // A,2:A + A with A,2:A,2:2:A -+ -+ int len = strlen(name) + strlen(clash_prefix) + 1; -+ char *tmp = (char*) malloc(sizeof(char)*(len+1)); -+ sprintf(tmp,"%s:%s",clash_prefix,name); -+ free(rmme); -+ rmme = name = tmp; - } -- else -- bcf_hdr_add_sample(hw,name); -+ bcf_hdr_add_sample(hw,name); -+ free(rmme); - } - } - -@@ -679,6 +687,8 @@ - int i, n_smpl = 0; - for (i=0; in; i++) - n_smpl += bcf_hdr_nsamples(files->readers[i].header); -+ ma->nout_smpl = n_smpl; -+ assert( n_smpl==bcf_hdr_nsamples(args->out_hdr) ); - if ( args->do_gvcf ) - { - ma->gvcf = (gvcf_aux_t*) calloc(ma->n,sizeof(gvcf_aux_t)); -@@ -690,11 +700,14 @@ - ma->buf = (buffer_t*) calloc(ma->n,sizeof(buffer_t)); - for (i=0; in; i++) - ma->buf[i].rid = -1; -+ ma->str = (kstring_t*) calloc(n_smpl,sizeof(kstring_t)); - return ma; - } - void maux_destroy(maux_t *ma) - { - int i,j; -+ for (i=0; inout_smpl; i++) free(ma->str[i].s); -+ free(ma->str); - for (i=0; imals; i++) - { - free(ma->als[i]); -@@ -778,7 +791,7 @@ - } - ma->buf[i].end = j; - ma->buf[i].cur = -1; -- if ( ma->buf[i].beg < ma->buf[i].end ) -+ if ( ma->buf[i].beg < ma->buf[i].end ) - { - ma->buf[i].lines = ma->files->readers[i].buffer; - if ( ma->gvcf ) ma->gvcf[i].active = 0; // gvcf block cannot overlap with the next record -@@ -1010,7 +1023,7 @@ - int end_src = start_src; - while ( end_srcmbuf = tmp.m; agr->nbuf = tmp.l; agr->buf = (uint8_t*)tmp.s; - } - else -- error("Not ready for type [%d]: %s at %d\n", info->type,agr->hdr_tag,line->pos+1); -+ error("Not ready for type [%d]: %s at %"PRId64"\n", info->type,agr->hdr_tag,(int64_t) line->pos+1); - } - - if ( info->type==BCF_BT_INT8 || info->type==BCF_BT_INT16 || info->type==BCF_BT_INT32 || info->type==BCF_BT_FLOAT ) -@@ -1139,7 +1152,7 @@ - { - int ret = 
copy_string_field((char*)info->vptr, iori-ifrom, info->len, &tmp, als->map[iori]-ifrom); - if ( ret ) -- error("Error at %s:%d: wrong number of fields in %s?\n", bcf_seqname(hdr,line),line->pos+1,agr->hdr_tag); -+ error("Error at %s:%"PRId64": wrong number of fields in %s?\n", bcf_seqname(hdr,line),(int64_t) line->pos+1,agr->hdr_tag); - } - } - else -@@ -1155,7 +1168,7 @@ - int knew = bcf_alleles2gt(inew,jnew); - int ret = copy_string_field((char*)info->vptr, kori, info->len, &tmp, knew); - if ( ret ) -- error("Error at %s:%d: wrong number of fields in %s?\n", bcf_seqname(hdr,line),line->pos+1,agr->hdr_tag); -+ error("Error at %s:%"PRId64": wrong number of fields in %s?\n", bcf_seqname(hdr,line),(int64_t) line->pos+1,agr->hdr_tag); - } - } - } -@@ -1229,7 +1242,7 @@ - } - kitr = kh_get(strdict, tmph, key); - int idx = kh_val(tmph, kitr); -- if ( idx<0 ) error("Error occurred while processing INFO tag \"%s\" at %s:%d\n", key,bcf_seqname(hdr,line),line->pos+1); -+ if ( idx<0 ) error("Error occurred while processing INFO tag \"%s\" at %s:%"PRId64"\n", key,bcf_seqname(hdr,line),(int64_t) line->pos+1); - merge_AGR_info_tag(hdr, line,inf,len,&ma->buf[i].rec[irec],&ma->AGR_info[idx]); - continue; - } -@@ -1320,6 +1333,7 @@ - bcf_hdr_t *out_hdr = args->out_hdr; - maux_t *ma = args->maux; - int i, ismpl = 0, nsamples = bcf_hdr_nsamples(out_hdr); -+ static int warned = 0; - - int nsize = 0, msize = sizeof(int32_t); - for (i=0; inreaders; i++) -@@ -1335,6 +1349,13 @@ - { - ma->ntmp_arr = nsamples*nsize*msize; - ma->tmp_arr = realloc(ma->tmp_arr, ma->ntmp_arr); -+ if ( !ma->tmp_arr ) error("Could not allocate %zu bytes\n",ma->ntmp_arr); -+ if ( ma->ntmp_arr > 2147483647 ) -+ { -+ if ( !warned ) fprintf(bcftools_stderr,"Warning: Too many genotypes at %s:%"PRId64", requires %zu bytes, skipping.\n", bcf_seqname(out_hdr,out),(int64_t) out->pos+1,ma->ntmp_arr); -+ warned = 1; -+ return; -+ } - } - memset(ma->smpl_ploidy,0,nsamples*sizeof(int)); - -@@ -1414,15 +1435,126 @@ - 
bcf_update_format_int32(out_hdr, out, "GT", (int32_t*)ma->tmp_arr, nsamples*nsize); - } - -+void merge_format_string(args_t *args, const char *key, bcf_fmt_t **fmt_map, bcf1_t *out, int length, int nsize) -+{ -+ bcf_srs_t *files = args->files; -+ bcf_hdr_t *out_hdr = args->out_hdr; -+ maux_t *ma = args->maux; -+ int i,j, nsamples = bcf_hdr_nsamples(out_hdr); -+ static int warned = 0; -+ -+ // initialize empty strings, a dot for each value, e.g. ".,.,." -+ int nmax = 0; -+ for (i=0; istr[i]; -+ if ( length==BCF_VL_FIXED || length==BCF_VL_VAR ) -+ { -+ str->l = 1; -+ ks_resize(str, str->l+1); -+ str->s[0] = '.'; -+ } -+ else -+ { -+ str->l = nsize*2 - 1; -+ ks_resize(str, str->l+1); -+ str->s[0] = '.'; -+ for (j=1; js[j*2-1] = ',', str->s[j*2] = '.'; -+ } -+ str->s[str->l] = 0; -+ if ( nmax < str->l ) nmax = str->l; -+ } -+ -+ // fill in values for each sample -+ int ismpl = 0; -+ for (i=0; inreaders; i++) -+ { -+ bcf_sr_t *reader = &files->readers[i]; -+ bcf_hdr_t *hdr = reader->header; -+ bcf_fmt_t *fmt_ori = fmt_map[i]; -+ if ( !fmt_ori ) -+ { -+ // the field is not present in this file -+ ismpl += bcf_hdr_nsamples(hdr); -+ continue; -+ } -+ -+ bcf1_t *line = maux_get_line(args, i); -+ int irec = ma->buf[i].cur; -+ char *src = (char*) fmt_ori->p; -+ -+ if ( length==BCF_VL_FIXED || length==BCF_VL_VAR || (line->n_allele==out->n_allele && !ma->buf[i].rec[irec].als_differ) ) -+ { -+ // alleles unchanged, copy over -+ for (j=0; jstr[ismpl++]; -+ str->l = 0; -+ kputsn(src, fmt_ori->n, str); -+ if ( nmax < str->l ) nmax = str->l; -+ src += fmt_ori->n; -+ } -+ continue; -+ } -+ // NB, what is below is not the fastest way, copy_string_field() keeps -+ // finding the indexes repeatedly at multiallelic sites -+ if ( length==BCF_VL_A || length==BCF_VL_R ) -+ { -+ int ifrom = length==BCF_VL_A ? 
1 : 0; -+ for (j=0; jstr[ismpl++]; -+ int iori,inew; -+ for (iori=ifrom; iorin_allele; iori++) -+ { -+ inew = ma->buf[i].rec[irec].map[iori] - ifrom; -+ int ret = copy_string_field(src, iori - ifrom, fmt_ori->size, str, inew); -+ if ( ret<-1 ) error("[E::%s] fixme: internal error at %s:%"PRId64" .. %d\n",__func__,bcf_seqname(hdr,line),(int64_t) line->pos+1,ret); -+ } -+ src += fmt_ori->size; -+ } -+ continue; -+ } -+ assert( length==BCF_VL_G ); -+ error("[E::%s] Merging of Number=G FORMAT strings (in your case FORMAT/%s) is not supported yet, sorry!\n" -+ "Please open an issue on github if this feature is essential for you. However, note that using FORMAT strings is not\n" -+ "a good idea in general - it is slow to parse and does not compress well, it is better to use integer codes instead.\n" -+ "If you don't really need it, use `bcftools annotate -x` to remove the annotation before merging.\n", __func__,key); -+ } -+ // update the record -+ if ( ma->ntmp_arr < nsamples*nmax ) -+ { -+ ma->ntmp_arr = nsamples*nmax; -+ ma->tmp_arr = realloc(ma->tmp_arr, ma->ntmp_arr); -+ if ( !ma->tmp_arr ) error("Could not allocate %zu bytes\n",ma->ntmp_arr); -+ if ( ma->ntmp_arr > 2147483647 ) -+ { -+ if ( !warned ) fprintf(bcftools_stderr,"Warning: The row size is too big for FORMAT/%s at %s:%"PRId64", requires %zu bytes, skipping.\n", key,bcf_seqname(out_hdr,out),(int64_t) out->pos+1,ma->ntmp_arr); -+ warned = 1; -+ return; -+ } -+ } -+ char *tgt = (char*) ma->tmp_arr; -+ for (i=0; istr[i].s, ma->str[i].l); -+ if ( ma->str[i].l < nmax ) memset(tgt + ma->str[i].l, 0, nmax - ma->str[i].l); -+ tgt += nmax; -+ } -+ bcf_update_format_char(out_hdr, out, key, (float*)ma->tmp_arr, nsamples*nmax); -+} -+ - void merge_format_field(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out) - { - bcf_srs_t *files = args->files; - bcf_hdr_t *out_hdr = args->out_hdr; - maux_t *ma = args->maux; - int i, ismpl = 0, nsamples = bcf_hdr_nsamples(out_hdr); -+ static int warned = 0; - - const char *key = NULL; 
-- int nsize = 0, length = BCF_VL_FIXED, type = -1; -+ size_t nsize = 0, length = BCF_VL_FIXED; -+ int type = -1; - for (i=0; inreaders; i++) - { - if ( !maux_get_line(args,i) ) continue; -@@ -1449,12 +1581,24 @@ - } - if ( fmt_map[i]->n > nsize ) nsize = fmt_map[i]->n; - } -+ if ( type==BCF_BT_CHAR ) -+ { -+ merge_format_string(args, key, fmt_map, out, length, nsize); -+ return; -+ } - -- int msize = sizeof(float)>sizeof(int32_t) ? sizeof(float) : sizeof(int32_t); -+ size_t msize = sizeof(float)>sizeof(int32_t) ? sizeof(float) : sizeof(int32_t); - if ( ma->ntmp_arr < nsamples*nsize*msize ) - { - ma->ntmp_arr = nsamples*nsize*msize; - ma->tmp_arr = realloc(ma->tmp_arr, ma->ntmp_arr); -+ if ( !ma->tmp_arr ) error("Failed to allocate %zu bytes at %s:%"PRId64" for FORMAT/%s\n", ma->ntmp_arr,bcf_seqname(args->out_hdr,out),(int64_t) out->pos+1,key); -+ if ( ma->ntmp_arr > 2147483647 ) -+ { -+ if ( !warned ) fprintf(bcftools_stderr,"Warning: The row size is too big for FORMAT/%s at %s:%"PRId64", requires %zu bytes, skipping.\n", key,bcf_seqname(out_hdr,out),(int64_t) out->pos+1,ma->ntmp_arr); -+ warned = 1; -+ return; -+ } - } - - // Fill the temp array for all samples by collecting values from all files -@@ -1465,6 +1609,7 @@ - bcf_fmt_t *fmt_ori = fmt_map[i]; - bcf1_t *line = maux_get_line(args, i); - int irec = ma->buf[i].cur; -+ - if ( fmt_ori ) - { - type = fmt_ori->type; -@@ -1473,23 +1618,23 @@ - { - // if all fields are missing then n==1 is valid - if ( fmt_ori->n!=1 && fmt_ori->n != nals_ori*(nals_ori+1)/2 && fmt_map[i]->n != nals_ori ) -- error("Incorrect number of FORMAT/%s values at %s:%d, cannot merge. The tag is defined as Number=G, but found\n" -+ error("Incorrect number of FORMAT/%s values at %s:%"PRId64", cannot merge. The tag is defined as Number=G, but found\n" - "%d values and %d alleles. 
See also http://samtools.github.io/bcftools/howtos/FAQ.html#incorrect-nfields\n", -- key,bcf_seqname(args->out_hdr,out),out->pos+1,fmt_ori->n,nals_ori); -+ key,bcf_seqname(args->out_hdr,out),(int64_t) out->pos+1,fmt_ori->n,nals_ori); - } - else if ( length==BCF_VL_A ) - { - if ( fmt_ori->n!=1 && fmt_ori->n != nals_ori-1 ) -- error("Incorrect number of FORMAT/%s values at %s:%d, cannot merge. The tag is defined as Number=A, but found\n" -+ error("Incorrect number of FORMAT/%s values at %s:%"PRId64", cannot merge. The tag is defined as Number=A, but found\n" - "%d values and %d alleles. See also http://samtools.github.io/bcftools/howtos/FAQ.html#incorrect-nfields\n", -- key,bcf_seqname(args->out_hdr,out),out->pos+1,fmt_ori->n,nals_ori); -+ key,bcf_seqname(args->out_hdr,out),(int64_t) out->pos+1,fmt_ori->n,nals_ori); - } - else if ( length==BCF_VL_R ) - { - if ( fmt_ori->n!=1 && fmt_ori->n != nals_ori ) -- error("Incorrect number of FORMAT/%s values at %s:%d, cannot merge. The tag is defined as Number=R, but found\n" -+ error("Incorrect number of FORMAT/%s values at %s:%"PRId64", cannot merge. The tag is defined as Number=R, but found\n" - "%d values and %d alleles. 
See also http://samtools.github.io/bcftools/howtos/FAQ.html#incorrect-nfields\n", -- key,bcf_seqname(args->out_hdr,out),out->pos+1,fmt_ori->n,nals_ori); -+ key,bcf_seqname(args->out_hdr,out),(int64_t) out->pos+1,fmt_ori->n,nals_ori); - } - } - -@@ -1621,15 +1766,12 @@ - case BCF_BT_INT16: BRANCH(int32_t, int16_t, *src==bcf_int16_missing, *src==bcf_int16_vector_end, *tgt=bcf_int32_missing, *tgt=bcf_int32_vector_end); break; - case BCF_BT_INT32: BRANCH(int32_t, int32_t, *src==bcf_int32_missing, *src==bcf_int32_vector_end, *tgt=bcf_int32_missing, *tgt=bcf_int32_vector_end); break; - case BCF_BT_FLOAT: BRANCH(float, float, bcf_float_is_missing(*src), bcf_float_is_vector_end(*src), bcf_float_set_missing(*tgt), bcf_float_set_vector_end(*tgt)); break; -- case BCF_BT_CHAR: BRANCH(uint8_t, uint8_t, *src==bcf_str_missing, *src==bcf_str_vector_end, *tgt=bcf_str_missing, *tgt=bcf_str_vector_end); break; - default: error("Unexpected case: %d, %s\n", type, key); - } - #undef BRANCH - } - if ( type==BCF_BT_FLOAT ) - bcf_update_format_float(out_hdr, out, key, (float*)ma->tmp_arr, nsamples*nsize); -- else if ( type==BCF_BT_CHAR ) -- bcf_update_format_char(out_hdr, out, key, (float*)ma->tmp_arr, nsamples*nsize); - else - bcf_update_format_int32(out_hdr, out, key, (int32_t*)ma->tmp_arr, nsamples*nsize); - } -@@ -1720,6 +1862,7 @@ - { - if ( !gaux[i].active ) continue; - bcf1_t *line = maux_get_line(args, i); -+ if ( !line ) continue; - int irec = maux->buf[i].cur; - - hts_expand(int, line->n_allele, maux->buf[i].rec[irec].mmap, maux->buf[i].rec[irec].map); -@@ -1741,7 +1884,7 @@ - if ( !maux->als ) - { - bcf_hdr_t *hdr = bcf_sr_get_header(args->files,i); -- error("Failed to merge alleles at %s:%d\n",bcf_seqname(hdr,line),line->pos+1); -+ error("Failed to merge alleles at %s:%"PRId64"\n",bcf_seqname(hdr,line),(int64_t) line->pos+1); - } - } - } -@@ -1750,6 +1893,7 @@ - /* - Output staged gVCF blocks, end is the last position of the block. 
Assuming - gaux[i].active flags are set and maux_get_line returns correct lines. -+ Both start,end coordinates are 0-based. - */ - void gvcf_write_block(args_t *args, int start, int end) - { -@@ -1759,7 +1903,7 @@ - assert(gaux); - - // Update POS -- int min = INT_MAX; -+ int min = INT_MAX; // the minimum active gVCF INFO/END (0-based) - char ref = 'N'; - for (i=0; ifiles->nreaders; i++) - { -@@ -1780,7 +1924,7 @@ - if ( min > gaux[i].end ) min = gaux[i].end; - } - // Check for valid gVCF blocks in this region -- if ( min==INT_MAX ) -+ if ( min==INT_MAX ) // this probably should not happen - { - assert(0); - maux->gvcf_min = 0; -@@ -1816,7 +1960,7 @@ - } - else - bcf_update_info_int32(args->out_hdr, out, "END", NULL, 0); -- bcf_write1(args->out_fh, args->out_hdr, out); -+ if ( bcf_write1(args->out_fh, args->out_hdr, out)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); - bcf_clear1(out); - - -@@ -1874,7 +2018,7 @@ - } - - // When called on a region, trim the blocks accordingly -- int start = maux->gvcf_break>=0 ? maux->gvcf_break + 1 : maux->pos; -+ int start = maux->gvcf_break>=0 ? maux->gvcf_break + 1 : maux->pos; // the start of a new gvcf block to output - if ( args->regs ) - { - int rstart = -1, rend = -1; -@@ -1894,7 +2038,7 @@ - // does the block end before the new line or is it interrupted? - int tmp = maux->gvcf_min < flush_until ? maux->gvcf_min : flush_until; - if ( start > tmp-1 ) break; -- gvcf_write_block(args,start,tmp-1); // gvcf_min is 1-based -+ gvcf_write_block(args,start,tmp-1); // gvcf_min is 1-based, passing 0-based coordinates - start = tmp; - } - } -@@ -1903,6 +2047,7 @@ - Check incoming lines for new gVCF blocks, set pointer to the current source - buffer (gvcf or readers). In contrast to gvcf_flush, this function can be - called only after maux_reset as it relies on updated maux buffers. 
-+ The coordinate is 0-based - */ - void gvcf_stage(args_t *args, int pos) - { -@@ -1937,8 +2082,16 @@ - int ret = bcf_get_info_int32(hdr,line,"END",&end,&nend); - if ( ret==1 ) - { -+ if ( end[0] == line->pos + 1 ) // POS and INFO/END are identical, treat as if a normal w/o INFO/END -+ { -+ maux->gvcf_break = line->pos; -+ continue; -+ } -+ if ( end[0] <= line->pos ) error("Error: Incorrect END at %s:%"PRId64" .. END=%d\n", bcf_seqname(hdr,line),(int64_t) line->pos+1,end[0]); -+ - // END is set, this is a new gVCF block. Cache this line in gaux[i] and swap with - // an empty record: the gaux line must be kept until we reach its END. -+ - gaux[i].active = 1; - gaux[i].end = end[0] - 1; - SWAP(bcf1_t*,args->files->readers[i].buffer[irec],gaux[i].line); -@@ -1984,7 +2137,15 @@ - { - // Invalidate pointer to reader's buffer or else gvcf_flush will attempt - // to use the old lines via maux_get_line() -- if ( ma->gvcf && !ma->gvcf[ir].active ) ma->buf[ir].cur = -1; -+ if ( ma->gvcf ) -+ { -+ if ( ma->gvcf[ir].active ) -+ { -+ if ( ma->pos >= ma->gvcf[ir].end ) ma->gvcf[ir].active = 0; -+ else if ( ma->buf[ir].cur==-1 ) ma->buf[ir].cur = ma->buf[ir].beg; // re-activate interrupted gVCF block -+ } -+ if ( !ma->gvcf[ir].active ) ma->buf[ir].cur = -1; -+ } - - bcf_sr_t *reader = bcf_sr_get_reader(args->files,ir); - if ( !reader->nbuffer ) continue; // nothing to clean -@@ -2045,14 +2206,15 @@ - bcf_hdr_t *hdr = bcf_sr_get_header(args->files,i); - const char *chr = bcf_hdr_id2name(hdr, maux->buf[i].rid); - fprintf(bcftools_stderr,"\t"); -- for (j=maux->buf[i].beg; jbuf[i].end; j++) fprintf(bcftools_stderr," %s:%d",chr,maux->buf[i].lines[j]->pos+1); -+ for (j=maux->buf[i].beg; jbuf[i].end; j++) fprintf(bcftools_stderr," %s:%"PRId64,chr,(int64_t) maux->buf[i].lines[j]->pos+1); - } - fprintf(bcftools_stderr,"\n"); - } -+ fprintf(bcftools_stderr,"gvcf_min=%d\n", args->maux->gvcf_min); - for (i=0; ifiles->nreaders; i++) - { - fprintf(bcftools_stderr,"reader %d:\tgvcf_active=%d", 
i,maux->gvcf[i].active); -- if ( maux->gvcf[i].active ) fprintf(bcftools_stderr,"\tpos,end=%d,%d", maux->gvcf[i].line->pos+1,maux->gvcf[i].end+1); -+ if ( maux->gvcf[i].active ) fprintf(bcftools_stderr,"\tpos,end=%"PRId64",%"PRId64, (int64_t) maux->gvcf[i].line->pos+1,(int64_t) maux->gvcf[i].end+1); - fprintf(bcftools_stderr,"\n"); - } - fprintf(bcftools_stderr,"\n"); -@@ -2187,7 +2349,7 @@ - } - // normalize alleles - maux->als = merge_alleles(line->d.allele, line->n_allele, buf->rec[j].map, maux->als, &maux->nals, &maux->mals); -- if ( !maux->als ) error("Failed to merge alleles at %s:%d in %s\n",maux->chr,line->pos+1,reader->fname); -+ if ( !maux->als ) error("Failed to merge alleles at %s:%"PRId64" in %s\n",maux->chr,(int64_t) line->pos+1,reader->fname); - hts_expand0(int, maux->nals, maux->ncnt, maux->cnt); - for (k=1; kn_allele; k++) - maux->cnt[ buf->rec[j].map[k] ]++; // how many times an allele appears in the files -@@ -2288,33 +2450,46 @@ - if ( args->do_gvcf ) - bcf_update_info_int32(args->out_hdr, out, "END", NULL, 0); - merge_format(args, out); -- bcf_write1(args->out_fh, args->out_hdr, out); -+ if ( bcf_write1(args->out_fh, args->out_hdr, out)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); - bcf_clear1(out); - } - - void bcf_hdr_append_version(bcf_hdr_t *hdr, int argc, char **argv, const char *cmd) - { - kstring_t str = {0,0,0}; -- ksprintf(&str,"##%sVersion=%s+htslib-%s\n", cmd, bcftools_version(), hts_version()); -- bcf_hdr_append(hdr,str.s); -+ int e = 0; -+ if (ksprintf(&str,"##%sVersion=%s+htslib-%s\n", cmd, bcftools_version(), hts_version()) < 0) -+ goto fail; -+ if (bcf_hdr_append(hdr,str.s) < 0) -+ goto fail; - - str.l = 0; -- ksprintf(&str,"##%sCommand=%s", cmd, argv[0]); -+ e |= ksprintf(&str,"##%sCommand=%s", cmd, argv[0]) < 0; - int i; - for (i=1; ifiles->nreaders; i++) - { -- char buf[10]; snprintf(buf,10,"%d",i+1); -+ char buf[24]; snprintf(buf,sizeof buf,"%d",i+1); - merge_headers(args->out_hdr, 
args->files->readers[i].header,buf,args->force_samples); - } - if (args->record_cmd_line) bcf_hdr_append_version(args->out_hdr, args->argc, args->argv, "bcftools_merge"); -- bcf_hdr_sync(args->out_hdr); -+ if (bcf_hdr_sync(args->out_hdr) < 0) -+ error_errno("[%s] Failed to update header", __func__); - } - info_rules_init(args); - - bcf_hdr_set_version(args->out_hdr, bcf_hdr_get_version(args->files->readers[0].header)); -- bcf_hdr_write(args->out_fh, args->out_hdr); -+ if ( bcf_hdr_write(args->out_fh, args->out_hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); - if ( args->header_only ) - { - bcf_hdr_destroy(args->out_hdr); -- hts_close(args->out_fh); -+ if ( hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname); - return; - } - -@@ -2381,7 +2557,7 @@ - info_rules_destroy(args); - maux_destroy(args->maux); - bcf_hdr_destroy(args->out_hdr); -- hts_close(args->out_fh); -+ if ( hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. 
%s\n", __func__,args->output_fname); - bcf_destroy1(args->out_line); - kh_destroy(strdict, args->tmph); - if ( args->tmps.m ) free(args->tmps.s); -@@ -2412,7 +2588,7 @@ - fprintf(bcftools_stderr, " -O, --output-type 'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]\n"); - fprintf(bcftools_stderr, " -r, --regions restrict to comma-separated list of regions\n"); - fprintf(bcftools_stderr, " -R, --regions-file restrict to regions listed in a file\n"); -- fprintf(bcftools_stderr, " --threads number of extra output compression threads [0]\n"); -+ fprintf(bcftools_stderr, " --threads use multithreading with worker threads [0]\n"); - fprintf(bcftools_stderr, "\n"); - exit(1); - } -@@ -2499,7 +2675,7 @@ - case 9 : args->n_threads = strtol(optarg, 0, 0); break; - case 8 : args->record_cmd_line = 0; break; - case 'h': -- case '?': usage(); -+ case '?': usage(); break; - default: error("Unknown argument: %s\n", optarg); - } - } ---- python-pysam.orig/bcftools/vcfnorm.c -+++ python-pysam/bcftools/vcfnorm.c -@@ -1,6 +1,6 @@ - /* vcfnorm.c -- Left-align and normalize indels. - -- Copyright (C) 2013-2017 Genome Research Ltd. -+ Copyright (C) 2013-2019 Genome Research Ltd. 
- - Author: Petr Danecek - -@@ -31,6 +31,7 @@ - #include - #include - #include -+#include - #include - #include - #include -@@ -38,10 +39,10 @@ - #include "bcftools.h" - #include "rbuf.h" - --#define CHECK_REF_EXIT 0 --#define CHECK_REF_WARN 1 --#define CHECK_REF_SKIP 2 --#define CHECK_REF_FIX 4 -+#define CHECK_REF_EXIT 1 -+#define CHECK_REF_WARN 2 -+#define CHECK_REF_SKIP 4 -+#define CHECK_REF_FIX 8 - - #define MROWS_SPLIT 1 - #define MROWS_MERGE 2 -@@ -61,6 +62,13 @@ - char *ref, *alt; - void *hash; - } -+cmpals1_t; -+ -+typedef struct -+{ -+ cmpals1_t *cmpals; -+ int ncmpals, mcmpals; -+} - cmpals_t; - - typedef struct -@@ -83,14 +91,13 @@ - int aln_win; // the realignment window size (maximum repeat size) - bcf_srs_t *files; // using the synced reader only for -r option - bcf_hdr_t *hdr; -- cmpals_t *cmpals; -- int ncmpals, mcmpals; -+ cmpals_t cmpals_in, cmpals_out; - faidx_t *fai; - struct { int tot, set, swap; } nref; - char **argv, *output_fname, *ref_fname, *vcf_fname, *region, *targets; - int argc, rmdup, output_type, n_threads, check_ref, strict_filter, do_indels; - int nchanged, nskipped, nsplit, ntotal, mrows_op, mrows_collapse, parsimonious; -- int record_cmd_line; -+ int record_cmd_line, force, force_warned; - } - args_t; - -@@ -137,7 +144,7 @@ - } - - char *ref = faidx_fetch_seq(args->fai, (char*)bcf_seqname(args->hdr,line), line->pos, line->pos+maxlen-1, &len); -- if ( !ref ) error("faidx_fetch_seq failed at %s:%d\n", bcf_seqname(args->hdr,line),line->pos+1); -+ if ( !ref ) error("faidx_fetch_seq failed at %s:%"PRId64"\n", bcf_seqname(args->hdr,line),(int64_t) line->pos+1); - replace_iupac_codes(ref,len); - - args->nref.tot++; -@@ -248,7 +255,7 @@ - int i, j, nals = line->n_allele, nals_ori = line->n_allele; - for (i=1, j=1; in_allele; i++) - { -- if ( strcmp(line->d.allele[0],line->d.allele[i]) ) -+ if ( strcasecmp(line->d.allele[0],line->d.allele[i]) ) - { - args->tmp_arr1[i] = j++; - continue; -@@ -295,7 +302,7 @@ - // Sanity check REF - int i, 
nref, reflen = strlen(line->d.allele[0]); - char *ref = faidx_fetch_seq(args->fai, (char*)args->hdr->id[BCF_DT_CTG][line->rid].key, line->pos, line->pos+reflen-1, &nref); -- if ( !ref ) error("faidx_fetch_seq failed at %s:%d\n", args->hdr->id[BCF_DT_CTG][line->rid].key, line->pos+1); -+ if ( !ref ) error("faidx_fetch_seq failed at %s:%"PRId64"\n", args->hdr->id[BCF_DT_CTG][line->rid].key, (int64_t) line->pos+1); - seq_to_upper(ref,0); - replace_iupac_codes(ref,nref); // any non-ACGT character in fasta ref is replaced with N - -@@ -303,18 +310,18 @@ - if ( has_non_acgtn(line->d.allele[0],reflen) ) - { - if ( args->check_ref==CHECK_REF_EXIT ) -- error("Non-ACGTN reference allele at %s:%d .. REF_SEQ:'%s' vs VCF:'%s'\n", bcf_seqname(args->hdr,line),line->pos+1,ref,line->d.allele[0]); -+ error("Non-ACGTN reference allele at %s:%"PRId64" .. REF_SEQ:'%s' vs VCF:'%s'\n", bcf_seqname(args->hdr,line),(int64_t) line->pos+1,ref,line->d.allele[0]); - if ( args->check_ref & CHECK_REF_WARN ) -- fprintf(stderr,"NON_ACGTN_REF\t%s\t%d\t%s\n", bcf_seqname(args->hdr,line),line->pos+1,line->d.allele[0]); -+ fprintf(stderr,"NON_ACGTN_REF\t%s\t%"PRId64"\t%s\n", bcf_seqname(args->hdr,line),(int64_t) line->pos+1,line->d.allele[0]); - free(ref); - return ERR_REF_MISMATCH; - } - if ( strcasecmp(ref,line->d.allele[0]) ) - { - if ( args->check_ref==CHECK_REF_EXIT ) -- error("Reference allele mismatch at %s:%d .. REF_SEQ:'%s' vs VCF:'%s'\n", bcf_seqname(args->hdr,line),line->pos+1,ref,line->d.allele[0]); -+ error("Reference allele mismatch at %s:%"PRId64" .. 
REF_SEQ:'%s' vs VCF:'%s'\n", bcf_seqname(args->hdr,line),(int64_t) line->pos+1,ref,line->d.allele[0]); - if ( args->check_ref & CHECK_REF_WARN ) -- fprintf(stderr,"REF_MISMATCH\t%s\t%d\t%s\n", bcf_seqname(args->hdr,line),line->pos+1,line->d.allele[0]); -+ fprintf(stderr,"REF_MISMATCH\t%s\t%"PRId64"\t%s\t%s\n", bcf_seqname(args->hdr,line),(int64_t) line->pos+1,line->d.allele[0],ref); - free(ref); - return ERR_REF_MISMATCH; - } -@@ -342,9 +349,9 @@ - if ( has_non_acgtn(line->d.allele[i],line->shared.l) ) - { - if ( args->check_ref==CHECK_REF_EXIT ) -- error("Non-ACGTN alternate allele at %s:%d .. REF_SEQ:'%s' vs VCF:'%s'\n", bcf_seqname(args->hdr,line),line->pos+1,ref,line->d.allele[i]); -+ error("Non-ACGTN alternate allele at %s:%"PRId64" .. VCF:'%s'\n", bcf_seqname(args->hdr,line),(int64_t) line->pos+1,line->d.allele[i]); - if ( args->check_ref & CHECK_REF_WARN ) -- fprintf(stderr,"NON_ACGTN_ALT\t%s\t%d\t%s\n", bcf_seqname(args->hdr,line),line->pos+1,line->d.allele[i]); -+ fprintf(stderr,"NON_ACGTN_ALT\t%s\t%"PRId64"\t%s\n", bcf_seqname(args->hdr,line),(int64_t) line->pos+1,line->d.allele[i]); - return ERR_REF_MISMATCH; - } - -@@ -352,7 +359,7 @@ - kputs(line->d.allele[i], &als[i]); - seq_to_upper(als[i].s,0); - -- if ( i>0 && als[i].l==als[0].l && !strcmp(als[0].s,als[i].s) ) return ERR_DUP_ALLELE; -+ if ( i>0 && als[i].l==als[0].l && !strcasecmp(als[0].s,als[i].s) ) return ERR_DUP_ALLELE; - } - - // trim from right -@@ -363,7 +370,7 @@ - int min_len = als[0].l; - for (i=1; in_allele; i++) - { -- if ( als[0].s[ als[0].l-1 ]!=als[i].s[ als[i].l-1 ] ) break; -+ if ( toupper(als[0].s[ als[0].l-1 ])!=toupper(als[i].s[ als[i].l-1 ]) ) break; - if ( als[i].l < min_len ) min_len = als[i].l; - } - if ( i!=line->n_allele ) break; // there are differences, cannot be trimmed -@@ -380,7 +387,7 @@ - int npad = line->pos >= args->aln_win ? 
args->aln_win : line->pos; - free(ref); - ref = faidx_fetch_seq(args->fai, (char*)args->hdr->id[BCF_DT_CTG][line->rid].key, line->pos-npad, line->pos-1, &nref); -- if ( !ref ) error("faidx_fetch_seq failed at %s:%d\n", args->hdr->id[BCF_DT_CTG][line->rid].key, line->pos-npad+1); -+ if ( !ref ) error("faidx_fetch_seq failed at %s:%"PRId64"\n", args->hdr->id[BCF_DT_CTG][line->rid].key, (int64_t) line->pos-npad+1); - replace_iupac_codes(ref,nref); - for (i=0; in_allele; i++) - { -@@ -420,7 +427,7 @@ - - // Have the alleles changed? - als[0].s[ als[0].l ] = 0; // in order for strcmp to work -- if ( ori_pos==line->pos && !strcmp(line->d.allele[0],als[0].s) ) return ERR_OK; -+ if ( ori_pos==line->pos && !strcasecmp(line->d.allele[0],als[0].s) ) return ERR_OK; - - // Create new block of alleles and update - args->tmp_als_str.l = 0; -@@ -459,23 +466,68 @@ - if ( len==BCF_VL_A ) \ - { \ - if ( ret!=src->n_allele-1 ) \ -- error("Error: wrong number of fields in INFO/%s at %s:%d, expected %d, found %d\n", \ -- tag,bcf_seqname(args->hdr,src),src->pos+1,src->n_allele-1,ret); \ -+ { \ -+ if ( args->force && !args->force_warned ) \ -+ { \ -+ fprintf(stderr, \ -+ "Warning: wrong number of fields in INFO/%s at %s:%"PRId64", expected %d, found %d\n" \ -+ " (This warning is printed only once.)\n", \ -+ tag,bcf_seqname(args->hdr,src),(int64_t) src->pos+1,src->n_allele-1,ret); \ -+ args->force_warned = 1; \ -+ } \ -+ if ( args->force ) \ -+ { \ -+ bcf_update_info_##type(args->hdr,dst,tag,NULL,0); \ -+ return; \ -+ } \ -+ error("Error: wrong number of fields in INFO/%s at %s:%"PRId64", expected %d, found %d\n", \ -+ tag,bcf_seqname(args->hdr,src),(int64_t) src->pos+1,src->n_allele-1,ret); \ -+ } \ - bcf_update_info_##type(args->hdr,dst,tag,vals+ialt,1); \ - } \ - else if ( len==BCF_VL_R ) \ - { \ - if ( ret!=src->n_allele ) \ -- error("Error: wrong number of fields in INFO/%s at %s:%d, expected %d, found %d\n", \ -- tag,bcf_seqname(args->hdr,src),src->pos+1,src->n_allele,ret); \ -+ { \ 
-+ if ( args->force && !args->force_warned ) \ -+ { \ -+ fprintf(stderr, \ -+ "Warning: wrong number of fields in INFO/%s at %s:%"PRId64", expected %d, found %d\n" \ -+ " (This warning is printed only once.)\n", \ -+ tag,bcf_seqname(args->hdr,src),(int64_t) src->pos+1,src->n_allele,ret); \ -+ args->force_warned = 1; \ -+ } \ -+ if ( args->force ) \ -+ { \ -+ bcf_update_info_##type(args->hdr,dst,tag,NULL,0); \ -+ return; \ -+ } \ -+ error("Error: wrong number of fields in INFO/%s at %s:%"PRId64", expected %d, found %d\n", \ -+ tag,bcf_seqname(args->hdr,src),(int64_t) src->pos+1,src->n_allele,ret); \ -+ } \ - if ( ialt!=0 ) vals[1] = vals[ialt+1]; \ - bcf_update_info_##type(args->hdr,dst,tag,vals,2); \ - } \ - else if ( len==BCF_VL_G ) \ - { \ - if ( ret!=src->n_allele*(src->n_allele+1)/2 ) \ -- error("Error: wrong number of fields in INFO/%s at %s:%d, expected %d, found %d\n", \ -- tag,bcf_seqname(args->hdr,src),src->pos+1,src->n_allele*(src->n_allele+1)/2,ret); \ -+ { \ -+ if ( args->force && !args->force_warned ) \ -+ { \ -+ fprintf(stderr, \ -+ "Warning: wrong number of fields in INFO/%s at %s:%"PRId64", expected %d, found %d\n" \ -+ " (This warning is printed only once.)\n", \ -+ tag,bcf_seqname(args->hdr,src),(int64_t) src->pos+1,src->n_allele*(src->n_allele+1)/2,ret); \ -+ args->force_warned = 1; \ -+ } \ -+ if ( args->force ) \ -+ { \ -+ bcf_update_info_##type(args->hdr,dst,tag,NULL,0); \ -+ return; \ -+ } \ -+ error("Error: wrong number of fields in INFO/%s at %s:%"PRId64", expected %d, found %d\n", \ -+ tag,bcf_seqname(args->hdr,src),(int64_t) src->pos+1,src->n_allele*(src->n_allele+1)/2,ret); \ -+ } \ - if ( ialt!=0 ) \ - { \ - vals[1] = vals[bcf_alleles2gt(0,ialt+1)]; \ -@@ -620,8 +672,23 @@ - if ( len==BCF_VL_A ) \ - { \ - if ( nvals!=(src->n_allele-1)*nsmpl ) \ -- error("Error: wrong number of fields in FMT/%s at %s:%d, expected %d, found %d\n", \ -- tag,bcf_seqname(args->hdr,src),src->pos+1,(src->n_allele-1)*nsmpl,nvals); \ -+ { \ -+ if ( args->force 
&& !args->force_warned ) \ -+ { \ -+ fprintf(stderr, \ -+ "Warning: wrong number of fields in FMT/%s at %s:%"PRId64", expected %d, found %d. Removing the field.\n" \ -+ " (This warning is printed only once.)\n", \ -+ tag,bcf_seqname(args->hdr,src),(int64_t) src->pos+1,(src->n_allele-1)*nsmpl,nvals); \ -+ args->force_warned = 1; \ -+ } \ -+ if ( args->force ) \ -+ { \ -+ bcf_update_format_##type(args->hdr,dst,tag,NULL,0); \ -+ return; \ -+ } \ -+ error("Error: wrong number of fields in FMT/%s at %s:%"PRId64", expected %d, found %d\n", \ -+ tag,bcf_seqname(args->hdr,src),(int64_t) src->pos+1,(src->n_allele-1)*nsmpl,nvals); \ -+ } \ - nvals /= nsmpl; \ - type_t *src_vals = vals, *dst_vals = vals; \ - for (i=0; in_allele*nsmpl ) \ -- error("Error: wrong number of fields in FMT/%s at %s:%d, expected %d, found %d\n", \ -- tag,bcf_seqname(args->hdr,src),src->pos+1,src->n_allele*nsmpl,nvals); \ -+ { \ -+ if ( args->force && !args->force_warned ) \ -+ { \ -+ fprintf(stderr, \ -+ "Warning: wrong number of fields in FMT/%s at %s:%"PRId64", expected %d, found %d. 
Removing the field.\n" \ -+ " (This warning is printed only once.)\n", \ -+ tag,bcf_seqname(args->hdr,src),(int64_t) src->pos+1,(src->n_allele-1)*nsmpl,nvals); \ -+ args->force_warned = 1; \ -+ } \ -+ if ( args->force ) \ -+ { \ -+ bcf_update_format_##type(args->hdr,dst,tag,NULL,0); \ -+ return; \ -+ } \ -+ error("Error: wrong number of fields in FMT/%s at %s:%"PRId64", expected %d, found %d\n", \ -+ tag,bcf_seqname(args->hdr,src),(int64_t) src->pos+1,src->n_allele*nsmpl,nvals); \ -+ } \ - nvals /= nsmpl; \ - type_t *src_vals = vals, *dst_vals = vals; \ - for (i=0; in_allele*(src->n_allele+1)/2*nsmpl && nvals!=src->n_allele*nsmpl ) \ -- error("Error at %s:%d, the tag %s has wrong number of fields\n", bcf_seqname(args->hdr,src),src->pos+1,bcf_hdr_int2id(args->hdr,BCF_DT_ID,fmt->id)); \ -+ { \ -+ if ( args->force && !args->force_warned ) \ -+ { \ -+ fprintf(stderr, \ -+ "Warning: wrong number of fields in FMT/%s at %s:%"PRId64", expected %d, found %d. Removing the field.\n" \ -+ " (This warning is printed only once.)\n", \ -+ tag,bcf_seqname(args->hdr,src),(int64_t) src->pos+1,(src->n_allele-1)*nsmpl,nvals); \ -+ args->force_warned = 1; \ -+ } \ -+ if ( args->force ) \ -+ { \ -+ bcf_update_format_##type(args->hdr,dst,tag,NULL,0); \ -+ return; \ -+ } \ -+ error("Error at %s:%"PRId64", the tag %s has wrong number of fields\n", bcf_seqname(args->hdr,src),(int64_t) src->pos+1,bcf_hdr_int2id(args->hdr,BCF_DT_ID,fmt->id)); \ -+ } \ - nvals /= nsmpl; \ - int all_haploid = nvals==src->n_allele ? 1 : 0; \ - type_t *src_vals = vals, *dst_vals = vals; \ -@@ -704,6 +801,7 @@ - { - const char *tag = bcf_hdr_int2id(args->hdr,BCF_DT_ID,fmt->id); - int ret = bcf_get_format_char(args->hdr,src,tag,&args->tmp_arr1,&args->ntmp_arr1); -+ if ( !ret ) return; // all values can be empty, leave out the tag, no need to panic - assert( ret>0 ); - - kstring_t str; -@@ -760,9 +858,25 @@ - if ( *se==',' ) nfields++; - se++; - } -+ if ( nfields==1 && se-ptr==1 && *ptr=='.' 
) continue; // missing value - if ( nfields!=src->n_allele*(src->n_allele+1)/2 && nfields!=src->n_allele ) -- error("Error: wrong number of fields in FMT/%s at %s:%d, expected %d or %d, found %d\n", -- tag,bcf_seqname(args->hdr,src),src->pos+1,src->n_allele*(src->n_allele+1)/2,src->n_allele,nfields); -+ { -+ if ( args->force && !args->force_warned ) -+ { -+ fprintf(stderr, -+ "Warning: wrong number of fields in FMT/%s at %s:%"PRId64", expected %d or %d, found %d. Removing the field.\n" -+ " (This warning is printed only once.)\n", -+ tag,bcf_seqname(args->hdr,src),(int64_t)src->pos+1,src->n_allele*(src->n_allele+1)/2,src->n_allele,nfields); -+ args->force_warned = 1; -+ } -+ if ( args->force ) -+ { -+ bcf_update_format_char(args->hdr,dst,tag,NULL,0); -+ return; -+ } -+ error("Error: wrong number of fields in FMT/%s at %s:%"PRId64", expected %d or %d, found %d\n", -+ tag,bcf_seqname(args->hdr,src),(int64_t) src->pos+1,src->n_allele*(src->n_allele+1)/2,src->n_allele,nfields); -+ } - - int len = 0; - if ( nfields==src->n_allele ) // haploid -@@ -888,7 +1002,7 @@ - if ( len==BCF_VL_A ) \ - { \ - if (nvals_ori!=lines[0]->n_allele - 1) \ -- error("vcfnorm: number of fields in first record at position %s:%d for INFO tag %s not as expected [found: %d vs expected:%d]\n", bcf_seqname(args->hdr,lines[0]),lines[0]->pos+1, tag, nvals_ori, lines[0]->n_allele-1); \ -+ error("vcfnorm: number of fields in first record at position %s:%"PRId64" for INFO tag %s not as expected [found: %d vs expected:%d]\n", bcf_seqname(args->hdr,lines[0]),(int64_t) lines[0]->pos+1, tag, nvals_ori, lines[0]->n_allele-1); \ - int nvals = dst->n_allele - 1; \ - ENLARGE_ARRAY(type_t,set_missing,args->tmp_arr1,args->ntmp_arr1,1,nvals_ori,nvals); \ - vals = (type_t*) args->tmp_arr1; \ -@@ -899,7 +1013,7 @@ - if (nvals2<0) continue; /* info tag does not exist in this record, skip */ \ - args->ntmp_arr2 = ntmp2 * sizeof(type_t); \ - if (nvals2!=lines[i]->n_allele-1) \ -- error("vcfnorm: could not merge INFO 
tag %s at position %s:%d\n", tag, bcf_seqname(args->hdr,lines[i]),lines[i]->pos+1); \ -+ error("vcfnorm: could not merge INFO tag %s at position %s:%"PRId64"\n", tag, bcf_seqname(args->hdr,lines[i]),(int64_t) lines[i]->pos+1); \ - vals2 = (type_t*) args->tmp_arr2; \ - for (k=0; kn_allele) \ -- error("vcfnorm: number of fields in first record at position %s:%d for INFO tag %s not as expected [found: %d vs expected:%d]\n", bcf_seqname(args->hdr,lines[0]),lines[0]->pos+1, tag, nvals_ori, lines[0]->n_allele); \ -+ error("vcfnorm: number of fields in first record at position %s:%"PRId64" for INFO tag %s not as expected [found: %d vs expected:%d]\n", bcf_seqname(args->hdr,lines[0]),(int64_t) lines[0]->pos+1, tag, nvals_ori, lines[0]->n_allele); \ - int nvals = dst->n_allele; \ - ENLARGE_ARRAY(type_t,set_missing,args->tmp_arr1,args->ntmp_arr1,1,nvals_ori,nvals); \ - vals = (type_t*) args->tmp_arr1; \ -@@ -923,7 +1037,7 @@ - if (nvals2<0) continue; /* info tag does not exist in this record, skip */ \ - args->ntmp_arr2 = ntmp2 * sizeof(type_t); \ - if (nvals2!=lines[i]->n_allele) \ -- error("vcfnorm: could not merge INFO tag %s at position %s:%d\n", tag, bcf_seqname(args->hdr,lines[i]),lines[i]->pos+1); \ -+ error("vcfnorm: could not merge INFO tag %s at position %s:%"PRId64"\n", tag, bcf_seqname(args->hdr,lines[i]),(int64_t) lines[i]->pos+1); \ - vals2 = (type_t*) args->tmp_arr2; \ - for (k=0; kn_allele*(lines[0]->n_allele+1)/2) { \ - fprintf(stderr, "todo: merge Number=G INFO fields for haploid sites\n"); \ -- error("vcfnorm: number of fields in first record at position %s:%d for INFO tag %s not as expected [found: %d vs expected:%d]\n", bcf_seqname(args->hdr,lines[0]),lines[0]->pos+1, tag, nvals_ori, lines[0]->n_allele*(lines[0]->n_allele+1)/2); \ -+ error("vcfnorm: number of fields in first record at position %s:%"PRId64" for INFO tag %s not as expected [found: %d vs expected:%d]\n", bcf_seqname(args->hdr,lines[0]),(int64_t) lines[0]->pos+1, tag, nvals_ori, 
lines[0]->n_allele*(lines[0]->n_allele+1)/2); \ - } \ - int nvals = dst->n_allele*(dst->n_allele+1)/2; \ - ENLARGE_ARRAY(type_t,set_missing,args->tmp_arr1,args->ntmp_arr1,1,nvals_ori,nvals); \ -@@ -950,7 +1064,7 @@ - if (nvals2<0) continue; /* info tag does not exist in this record, skip */ \ - args->ntmp_arr2 = ntmp2 * sizeof(type_t); \ - if (nvals2!=lines[i]->n_allele*(lines[i]->n_allele+1)/2) \ -- error("vcfnorm: could not merge INFO tag %s at position %s:%d\n", tag, bcf_seqname(args->hdr,lines[i]),lines[i]->pos+1); \ -+ error("vcfnorm: could not merge INFO tag %s at position %s:%"PRId64"\n", tag, bcf_seqname(args->hdr,lines[i]),(int64_t) lines[i]->pos+1); \ - vals2 = (type_t*) args->tmp_arr2; \ - int ia,ib; \ - k = 0; \ -@@ -1062,7 +1176,7 @@ - int ngts2 = bcf_get_genotypes(args->hdr,lines[i],&args->tmp_arr2,&ntmp2); - args->ntmp_arr2 = ntmp2 * 4; - ngts2 /= nsmpl; -- if ( ngts!=ngts2 ) error("Error at %s:%d: cannot combine diploid with haploid genotype\n", bcf_seqname(args->hdr,lines[i]),lines[i]->pos+1); -+ if ( ngts!=ngts2 ) error("Error at %s:%"PRId64": cannot combine diploid with haploid genotype\n", bcf_seqname(args->hdr,lines[i]),(int64_t) lines[i]->pos+1); - - int32_t *gt = (int32_t*) args->tmp_arr1; - int32_t *gt2 = (int32_t*) args->tmp_arr2; -@@ -1076,7 +1190,7 @@ - else - { - int ial = bcf_gt_allele(gt2[k]); -- if ( ial>=args->maps[i].nals ) error("Error at %s:%d: incorrect allele index %d\n",bcf_seqname(args->hdr,lines[i]),lines[i]->pos+1,ial); -+ if ( ial>=args->maps[i].nals ) error("Error at %s:%"PRId64": incorrect allele index %d\n",bcf_seqname(args->hdr,lines[i]),(int64_t) lines[i]->pos+1,ial); - gt[k] = bcf_gt_unphased( args->maps[i].map[ial] ) | bcf_gt_is_phased(gt[k]); - } - } -@@ -1123,7 +1237,7 @@ - args->ntmp_arr2 = ntmp2 * sizeof(type_t); \ - nvals2 /= nsmpl; \ - if (nvals2!=lines[i]->n_allele-1) \ -- error("vcfnorm: could not merge FORMAT tag %s at position %s:%d\n", tag, bcf_seqname(args->hdr,lines[i]),lines[i]->pos+1); \ -+ 
error("vcfnorm: could not merge FORMAT tag %s at position %s:%"PRId64"\n", tag, bcf_seqname(args->hdr,lines[i]),(int64_t) lines[i]->pos+1); \ - vals = (type_t*) args->tmp_arr1; \ - vals2 = (type_t*) args->tmp_arr2; \ - for (j=0; jntmp_arr2 = ntmp2 * sizeof(type_t); \ - nvals2 /= nsmpl; \ - if (nvals2!=lines[i]->n_allele) \ -- error("vcfnorm: could not merge FORMAT tag %s at position %s:%d\n", tag, bcf_seqname(args->hdr,lines[i]),lines[i]->pos+1); \ -+ error("vcfnorm: could not merge FORMAT tag %s at position %s:%"PRId64"\n", tag, bcf_seqname(args->hdr,lines[i]),(int64_t) lines[i]->pos+1); \ - vals = (type_t*) args->tmp_arr1; \ - vals2 = (type_t*) args->tmp_arr2; \ - for (j=0; jn_allele*(lines[i]->n_allele+1)/2; \ - int line_diploid = nvals2==ndiploid ? 1 : 0; \ - if (!(nvals2==1 || nvals2==lines[i]->n_allele || nvals2==lines[i]->n_allele*(lines[i]->n_allele+1)/2)) \ -- error("vcfnorm: could not merge FORMAT tag %s at position %s:%d\n", tag, bcf_seqname(args->hdr,lines[i]),lines[i]->pos+1); \ -+ error("vcfnorm: could not merge FORMAT tag %s at position %s:%"PRId64"\n", tag, bcf_seqname(args->hdr,lines[i]),(int64_t) lines[i]->pos+1); \ - vals = (type_t*) args->tmp_arr1; \ - vals2 = (type_t*) args->tmp_arr2; \ - for (j=0; jn_allele*(dst->n_allele+1)/2; - } -- else error("The field %s at %s:%d neither diploid nor haploid?\n", tag,bcf_seqname(args->hdr,dst),dst->pos+1); -+ else error("The field %s at %s:%"PRId64" neither diploid nor haploid?\n", tag,bcf_seqname(args->hdr,dst),(int64_t) dst->pos+1); - - kstring_t *tmp = &args->tmp_str[i]; - kputc('.',tmp); -@@ -1415,7 +1529,7 @@ - args->maps[i].nals = lines[i]->n_allele; - hts_expand(int,args->maps[i].nals,args->maps[i].mals,args->maps[i].map); - args->als = merge_alleles(lines[i]->d.allele, lines[i]->n_allele, args->maps[i].map, args->als, &args->nals, &args->mals); -- if ( !args->als ) error("Failed to merge alleles at %s:%d\n", bcf_seqname(args->hdr,dst),dst->pos+1); -+ if ( !args->als ) error("Failed to merge alleles 
at %s:%"PRId64"\n", bcf_seqname(args->hdr,dst),(int64_t) dst->pos+1); - } - bcf_update_alleles(args->hdr, dst, (const char**)args->als, args->nals); - for (i=0; inals; i++) -@@ -1533,11 +1647,11 @@ - } - return NULL; - } --static void cmpals_add(args_t *args, bcf1_t *rec) -+static void cmpals_add(cmpals_t *ca, bcf1_t *rec) - { -- args->ncmpals++; -- hts_expand0(cmpals_t, args->ncmpals, args->mcmpals, args->cmpals); -- cmpals_t *cmpals = args->cmpals + args->ncmpals - 1; -+ ca->ncmpals++; -+ hts_expand0(cmpals1_t, ca->ncmpals, ca->mcmpals, ca->cmpals); -+ cmpals1_t *cmpals = ca->cmpals + ca->ncmpals - 1; - free(cmpals->ref); - cmpals->ref = strdup(rec->d.allele[0]); - cmpals->n = rec->n_allele; -@@ -1555,21 +1669,21 @@ - khash_str2int_inc(cmpals->hash, strdup(rec->d.allele[i])); - } - } --static int cmpals_match(args_t *args, bcf1_t *rec) -+static int cmpals_match(cmpals_t *ca, bcf1_t *rec) - { - int i, j; -- for (i=0; incmpals; i++) -+ for (i=0; incmpals; i++) - { -- cmpals_t *cmpals = args->cmpals + i; -+ cmpals1_t *cmpals = ca->cmpals + i; - if ( rec->n_allele != cmpals->n ) continue; - - // NB. 
assuming both are normalized -- if ( strcmp(rec->d.allele[0], cmpals->ref) ) continue; -+ if ( strcasecmp(rec->d.allele[0], cmpals->ref) ) continue; - - // the most frequent case - if ( rec->n_allele==2 ) - { -- if ( strcmp(rec->d.allele[1], cmpals->alt) ) continue; -+ if ( strcasecmp(rec->d.allele[1], cmpals->alt) ) continue; - return 1; - } - -@@ -1579,21 +1693,20 @@ - if ( jn_allele ) continue; - return 1; - } -- cmpals_add(args, rec); - return 0; - } --static void cmpals_reset(args_t *args) { args->ncmpals = 0; } --static void cmpals_destroy(args_t *args) -+static void cmpals_reset(cmpals_t *ca) { ca->ncmpals = 0; } -+static void cmpals_destroy(cmpals_t *ca) - { - int i; -- for (i=0; imcmpals; i++) -+ for (i=0; imcmpals; i++) - { -- cmpals_t *cmpals = args->cmpals + i; -+ cmpals1_t *cmpals = ca->cmpals + i; - free(cmpals->ref); - free(cmpals->alt); - if ( cmpals->hash ) khash_str2int_destroy_free(cmpals->hash); - } -- free(args->cmpals); -+ free(ca->cmpals); - } - - static void flush_buffer(args_t *args, htsFile *file, int n) -@@ -1608,7 +1721,8 @@ - { - if ( mrows_ready_to_flush(args, args->lines[k]) ) - { -- while ( (line=mrows_flush(args)) ) bcf_write1(file, args->hdr, line); -+ while ( (line=mrows_flush(args)) ) -+ if ( bcf_write1(file, args->hdr, line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); - } - int merge = 1; - if ( args->mrows_collapse!=COLLAPSE_BOTH && args->mrows_collapse!=COLLAPSE_ANY ) -@@ -1629,23 +1743,24 @@ - if ( args->rmdup & BCF_SR_PAIR_ANY ) continue; // rmdup by position only - if ( args->rmdup & BCF_SR_PAIR_SNPS && line_type&(VCF_SNP|VCF_MNP) && prev_type&(VCF_SNP|VCF_MNP) ) continue; - if ( args->rmdup & BCF_SR_PAIR_INDELS && line_type&(VCF_INDEL) && prev_type&(VCF_INDEL) ) continue; -- if ( args->rmdup & BCF_SR_PAIR_EXACT && cmpals_match(args, args->lines[k]) ) continue; -+ if ( args->rmdup & BCF_SR_PAIR_EXACT && cmpals_match(&args->cmpals_out, args->lines[k]) ) continue; - } - else - { - prev_rid = 
args->lines[k]->rid; - prev_pos = args->lines[k]->pos; - prev_type = 0; -- if ( args->rmdup & BCF_SR_PAIR_EXACT ) cmpals_reset(args); -+ if ( args->rmdup & BCF_SR_PAIR_EXACT ) cmpals_reset(&args->cmpals_out); - } - prev_type |= line_type; -- if ( args->rmdup & BCF_SR_PAIR_EXACT ) cmpals_add(args, args->lines[k]); -+ if ( args->rmdup & BCF_SR_PAIR_EXACT ) cmpals_add(&args->cmpals_out, args->lines[k]); - } -- bcf_write1(file, args->hdr, args->lines[k]); -+ if ( bcf_write1(file, args->hdr, args->lines[k])!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); - } - if ( args->mrows_op==MROWS_MERGE && !args->rbuf.n ) - { -- while ( (line=mrows_flush(args)) ) bcf_write1(file, args->hdr, line); -+ while ( (line=mrows_flush(args)) ) -+ if ( bcf_write1(file, args->hdr, line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); - } - } - -@@ -1669,7 +1784,8 @@ - - static void destroy_data(args_t *args) - { -- cmpals_destroy(args); -+ cmpals_destroy(&args->cmpals_in); -+ cmpals_destroy(&args->cmpals_out); - int i; - for (i=0; irbuf.m; i++) - if ( args->lines[i] ) bcf_destroy1(args->lines[i]); -@@ -1727,9 +1843,9 @@ - if ( args->check_ref & CHECK_REF_FIX ) - fix_dup_alt(args, line); - else if ( args->check_ref==CHECK_REF_EXIT ) -- error("Duplicate alleles at %s:%d; run with -cw to turn the error into warning or with -cs to fix.\n", bcf_seqname(args->hdr,line),line->pos+1); -+ error("Duplicate alleles at %s:%"PRId64"; run with -cw to turn the error into warning or with -cs to fix.\n", bcf_seqname(args->hdr,line),(int64_t) line->pos+1); - else if ( args->check_ref & CHECK_REF_WARN ) -- fprintf(stderr,"ALT_DUP\t%s\t%d\n", bcf_seqname(args->hdr,line),line->pos+1); -+ fprintf(stderr,"ALT_DUP\t%s\t%"PRId64"\n", bcf_seqname(args->hdr,line),(int64_t) line->pos+1); - } - } - } -@@ -1754,7 +1870,7 @@ - if ( args->n_threads ) - hts_set_opt(out, HTS_OPT_THREAD_POOL, args->files->p); - if (args->record_cmd_line) 
bcf_hdr_append_version(args->hdr, args->argc, args->argv, "bcftools_norm"); -- bcf_hdr_write(out, args->hdr); -+ if ( bcf_hdr_write(out, args->hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); - - int prev_rid = -1, prev_pos = -1, prev_type = 0; - while ( bcf_sr_next_line(args->files) ) -@@ -1770,17 +1886,17 @@ - if ( args->rmdup & BCF_SR_PAIR_ANY ) continue; // rmdup by position only - if ( args->rmdup & BCF_SR_PAIR_SNPS && line_type&(VCF_SNP|VCF_MNP) && prev_type&(VCF_SNP|VCF_MNP) ) continue; - if ( args->rmdup & BCF_SR_PAIR_INDELS && line_type&(VCF_INDEL) && prev_type&(VCF_INDEL) ) continue; -- if ( args->rmdup & BCF_SR_PAIR_EXACT && cmpals_match(args, line) ) continue; -+ if ( args->rmdup & BCF_SR_PAIR_EXACT && cmpals_match(&args->cmpals_in, line) ) continue; - } - else - { - prev_rid = line->rid; - prev_pos = line->pos; - prev_type = 0; -- if ( args->rmdup & BCF_SR_PAIR_EXACT ) cmpals_reset(args); -+ if ( args->rmdup & BCF_SR_PAIR_EXACT ) cmpals_reset(&args->cmpals_in); - } - prev_type |= line_type; -- if ( args->rmdup & BCF_SR_PAIR_EXACT ) cmpals_add(args, line); -+ if ( args->rmdup & BCF_SR_PAIR_EXACT ) cmpals_add(&args->cmpals_in, line); - } - - // still on the same chromosome? -@@ -1819,7 +1935,7 @@ - if ( j>0 ) flush_buffer(args, out, j); - } - flush_buffer(args, out, args->rbuf.n); -- hts_close(out); -+ if ( hts_close(out)!=0 ) error("[%s] Error: close failed .. 
%s\n", __func__,args->output_fname); - - fprintf(stderr,"Lines total/split/realigned/skipped:\t%d/%d/%d/%d\n", args->ntotal,args->nsplit,args->nchanged,args->nskipped); - if ( args->check_ref & CHECK_REF_FIX ) -@@ -1837,8 +1953,9 @@ - fprintf(stderr, "Options:\n"); - fprintf(stderr, " -c, --check-ref check REF alleles and exit (e), warn (w), exclude (x), or set (s) bad sites [e]\n"); - fprintf(stderr, " -D, --remove-duplicates remove duplicate lines of the same type.\n"); -- fprintf(stderr, " -d, --rm-dup remove duplicate snps|indels|both|all|none\n"); -- fprintf(stderr, " -f, --fasta-ref reference sequence (MANDATORY)\n"); -+ fprintf(stderr, " -d, --rm-dup remove duplicate snps|indels|both|all|exact\n"); -+ fprintf(stderr, " -f, --fasta-ref reference sequence\n"); -+ fprintf(stderr, " --force try to proceed even if malformed tags are encountered. Experimental, use at your own risk\n"); - fprintf(stderr, " -m, --multiallelics <-|+>[type] split multiallelics (-) or join biallelics (+), type: snps|indels|both|any [both]\n"); - fprintf(stderr, " --no-version do not append version and command line to the header\n"); - fprintf(stderr, " -N, --do-not-normalize do not normalize indels (with -m or -c s)\n"); -@@ -1849,9 +1966,16 @@ - fprintf(stderr, " -s, --strict-filter when merging (-m+), merged site is PASS only if all sites being merged PASS\n"); - fprintf(stderr, " -t, --targets similar to -r but streams rather than index-jumps\n"); - fprintf(stderr, " -T, --targets-file similar to -R but streams rather than index-jumps\n"); -- fprintf(stderr, " --threads number of extra (de)compression threads [0]\n"); -+ fprintf(stderr, " --threads use multithreading with worker threads [0]\n"); - fprintf(stderr, " -w, --site-win buffer for sorting lines which changed position during realignment [1000]\n"); - fprintf(stderr, "\n"); -+ fprintf(stderr, "Examples:\n"); -+ fprintf(stderr, " # normalize and left-align indels\n"); -+ fprintf(stderr, " bcftools norm -f ref.fa in.vcf\n"); 
-+ fprintf(stderr, "\n"); -+ fprintf(stderr, " # split multi-allelic sites\n"); -+ fprintf(stderr, " bcftools norm -m- in.vcf\n"); -+ fprintf(stderr, "\n"); - exit(1); - } - -@@ -1875,6 +1999,7 @@ - static struct option loptions[] = - { - {"help",no_argument,NULL,'h'}, -+ {"force",no_argument,NULL,7}, - {"fasta-ref",required_argument,NULL,'f'}, - {"do-not-normalize",no_argument,NULL,'N'}, - {"multiallelics",required_argument,NULL,'m'}, -@@ -1904,6 +2029,7 @@ - else if ( !strcmp("all",optarg) ) args->rmdup = BCF_SR_PAIR_ANY; - else if ( !strcmp("any",optarg) ) args->rmdup = BCF_SR_PAIR_ANY; - else if ( !strcmp("none",optarg) ) args->rmdup = BCF_SR_PAIR_EXACT; -+ else if ( !strcmp("exact",optarg) ) args->rmdup = BCF_SR_PAIR_EXACT; - else error("The argument to -d not recognised: %s\n", optarg); - break; - case 'm': -@@ -1951,8 +2077,9 @@ - break; - case 9 : args->n_threads = strtol(optarg, 0, 0); break; - case 8 : args->record_cmd_line = 0; break; -+ case 7 : args->force = 1; break; - case 'h': -- case '?': usage(); -+ case '?': usage(); break; - default: error("Unknown argument: %s\n", optarg); - } - } -@@ -1966,7 +2093,8 @@ - else fname = argv[optind]; - - if ( !args->ref_fname && !args->mrows_op && !args->rmdup ) error("Expected -f, -m, -D or -d option\n"); -- if ( !args->ref_fname && args->check_ref&CHECK_REF_FIX ) error("Expected --fasta-ref with --check-ref s\n"); -+ if ( !args->check_ref && args->ref_fname ) args->check_ref = CHECK_REF_EXIT; -+ if ( args->check_ref && !args->ref_fname ) error("Expected --fasta-ref with --check-ref\n"); - - if ( args->region ) - { -@@ -1980,7 +2108,7 @@ - } - - if ( bcf_sr_set_threads(args->files, args->n_threads)<0 ) error("Failed to create threads\n"); -- if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->files->errnum)); -+ if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to read from %s: %s\n", !strcmp("-",fname)?"standard 
input":fname,bcf_sr_strerror(args->files->errnum)); - if ( args->mrows_op&MROWS_SPLIT && args->rmdup ) error("Cannot combine -D and -m-\n"); - init_data(args); - normalize_vcf(args); ---- python-pysam.orig/bcftools/vcfnorm.c.pysam.c -+++ python-pysam/bcftools/vcfnorm.c.pysam.c -@@ -2,7 +2,7 @@ - - /* vcfnorm.c -- Left-align and normalize indels. - -- Copyright (C) 2013-2017 Genome Research Ltd. -+ Copyright (C) 2013-2019 Genome Research Ltd. - - Author: Petr Danecek - -@@ -33,6 +33,7 @@ - #include - #include - #include -+#include - #include - #include - #include -@@ -40,10 +41,10 @@ - #include "bcftools.h" - #include "rbuf.h" - --#define CHECK_REF_EXIT 0 --#define CHECK_REF_WARN 1 --#define CHECK_REF_SKIP 2 --#define CHECK_REF_FIX 4 -+#define CHECK_REF_EXIT 1 -+#define CHECK_REF_WARN 2 -+#define CHECK_REF_SKIP 4 -+#define CHECK_REF_FIX 8 - - #define MROWS_SPLIT 1 - #define MROWS_MERGE 2 -@@ -63,6 +64,13 @@ - char *ref, *alt; - void *hash; - } -+cmpals1_t; -+ -+typedef struct -+{ -+ cmpals1_t *cmpals; -+ int ncmpals, mcmpals; -+} - cmpals_t; - - typedef struct -@@ -85,14 +93,13 @@ - int aln_win; // the realignment window size (maximum repeat size) - bcf_srs_t *files; // using the synced reader only for -r option - bcf_hdr_t *hdr; -- cmpals_t *cmpals; -- int ncmpals, mcmpals; -+ cmpals_t cmpals_in, cmpals_out; - faidx_t *fai; - struct { int tot, set, swap; } nref; - char **argv, *output_fname, *ref_fname, *vcf_fname, *region, *targets; - int argc, rmdup, output_type, n_threads, check_ref, strict_filter, do_indels; - int nchanged, nskipped, nsplit, ntotal, mrows_op, mrows_collapse, parsimonious; -- int record_cmd_line; -+ int record_cmd_line, force, force_warned; - } - args_t; - -@@ -139,7 +146,7 @@ - } - - char *ref = faidx_fetch_seq(args->fai, (char*)bcf_seqname(args->hdr,line), line->pos, line->pos+maxlen-1, &len); -- if ( !ref ) error("faidx_fetch_seq failed at %s:%d\n", bcf_seqname(args->hdr,line),line->pos+1); -+ if ( !ref ) error("faidx_fetch_seq failed at 
%s:%"PRId64"\n", bcf_seqname(args->hdr,line),(int64_t) line->pos+1); - replace_iupac_codes(ref,len); - - args->nref.tot++; -@@ -250,7 +257,7 @@ - int i, j, nals = line->n_allele, nals_ori = line->n_allele; - for (i=1, j=1; in_allele; i++) - { -- if ( strcmp(line->d.allele[0],line->d.allele[i]) ) -+ if ( strcasecmp(line->d.allele[0],line->d.allele[i]) ) - { - args->tmp_arr1[i] = j++; - continue; -@@ -297,7 +304,7 @@ - // Sanity check REF - int i, nref, reflen = strlen(line->d.allele[0]); - char *ref = faidx_fetch_seq(args->fai, (char*)args->hdr->id[BCF_DT_CTG][line->rid].key, line->pos, line->pos+reflen-1, &nref); -- if ( !ref ) error("faidx_fetch_seq failed at %s:%d\n", args->hdr->id[BCF_DT_CTG][line->rid].key, line->pos+1); -+ if ( !ref ) error("faidx_fetch_seq failed at %s:%"PRId64"\n", args->hdr->id[BCF_DT_CTG][line->rid].key, (int64_t) line->pos+1); - seq_to_upper(ref,0); - replace_iupac_codes(ref,nref); // any non-ACGT character in fasta ref is replaced with N - -@@ -305,18 +312,18 @@ - if ( has_non_acgtn(line->d.allele[0],reflen) ) - { - if ( args->check_ref==CHECK_REF_EXIT ) -- error("Non-ACGTN reference allele at %s:%d .. REF_SEQ:'%s' vs VCF:'%s'\n", bcf_seqname(args->hdr,line),line->pos+1,ref,line->d.allele[0]); -+ error("Non-ACGTN reference allele at %s:%"PRId64" .. REF_SEQ:'%s' vs VCF:'%s'\n", bcf_seqname(args->hdr,line),(int64_t) line->pos+1,ref,line->d.allele[0]); - if ( args->check_ref & CHECK_REF_WARN ) -- fprintf(bcftools_stderr,"NON_ACGTN_REF\t%s\t%d\t%s\n", bcf_seqname(args->hdr,line),line->pos+1,line->d.allele[0]); -+ fprintf(bcftools_stderr,"NON_ACGTN_REF\t%s\t%"PRId64"\t%s\n", bcf_seqname(args->hdr,line),(int64_t) line->pos+1,line->d.allele[0]); - free(ref); - return ERR_REF_MISMATCH; - } - if ( strcasecmp(ref,line->d.allele[0]) ) - { - if ( args->check_ref==CHECK_REF_EXIT ) -- error("Reference allele mismatch at %s:%d .. 
REF_SEQ:'%s' vs VCF:'%s'\n", bcf_seqname(args->hdr,line),line->pos+1,ref,line->d.allele[0]); -+ error("Reference allele mismatch at %s:%"PRId64" .. REF_SEQ:'%s' vs VCF:'%s'\n", bcf_seqname(args->hdr,line),(int64_t) line->pos+1,ref,line->d.allele[0]); - if ( args->check_ref & CHECK_REF_WARN ) -- fprintf(bcftools_stderr,"REF_MISMATCH\t%s\t%d\t%s\n", bcf_seqname(args->hdr,line),line->pos+1,line->d.allele[0]); -+ fprintf(bcftools_stderr,"REF_MISMATCH\t%s\t%"PRId64"\t%s\t%s\n", bcf_seqname(args->hdr,line),(int64_t) line->pos+1,line->d.allele[0],ref); - free(ref); - return ERR_REF_MISMATCH; - } -@@ -344,9 +351,9 @@ - if ( has_non_acgtn(line->d.allele[i],line->shared.l) ) - { - if ( args->check_ref==CHECK_REF_EXIT ) -- error("Non-ACGTN alternate allele at %s:%d .. REF_SEQ:'%s' vs VCF:'%s'\n", bcf_seqname(args->hdr,line),line->pos+1,ref,line->d.allele[i]); -+ error("Non-ACGTN alternate allele at %s:%"PRId64" .. VCF:'%s'\n", bcf_seqname(args->hdr,line),(int64_t) line->pos+1,line->d.allele[i]); - if ( args->check_ref & CHECK_REF_WARN ) -- fprintf(bcftools_stderr,"NON_ACGTN_ALT\t%s\t%d\t%s\n", bcf_seqname(args->hdr,line),line->pos+1,line->d.allele[i]); -+ fprintf(bcftools_stderr,"NON_ACGTN_ALT\t%s\t%"PRId64"\t%s\n", bcf_seqname(args->hdr,line),(int64_t) line->pos+1,line->d.allele[i]); - return ERR_REF_MISMATCH; - } - -@@ -354,7 +361,7 @@ - kputs(line->d.allele[i], &als[i]); - seq_to_upper(als[i].s,0); - -- if ( i>0 && als[i].l==als[0].l && !strcmp(als[0].s,als[i].s) ) return ERR_DUP_ALLELE; -+ if ( i>0 && als[i].l==als[0].l && !strcasecmp(als[0].s,als[i].s) ) return ERR_DUP_ALLELE; - } - - // trim from right -@@ -365,7 +372,7 @@ - int min_len = als[0].l; - for (i=1; in_allele; i++) - { -- if ( als[0].s[ als[0].l-1 ]!=als[i].s[ als[i].l-1 ] ) break; -+ if ( toupper(als[0].s[ als[0].l-1 ])!=toupper(als[i].s[ als[i].l-1 ]) ) break; - if ( als[i].l < min_len ) min_len = als[i].l; - } - if ( i!=line->n_allele ) break; // there are differences, cannot be trimmed -@@ -382,7 +389,7 
@@ - int npad = line->pos >= args->aln_win ? args->aln_win : line->pos; - free(ref); - ref = faidx_fetch_seq(args->fai, (char*)args->hdr->id[BCF_DT_CTG][line->rid].key, line->pos-npad, line->pos-1, &nref); -- if ( !ref ) error("faidx_fetch_seq failed at %s:%d\n", args->hdr->id[BCF_DT_CTG][line->rid].key, line->pos-npad+1); -+ if ( !ref ) error("faidx_fetch_seq failed at %s:%"PRId64"\n", args->hdr->id[BCF_DT_CTG][line->rid].key, (int64_t) line->pos-npad+1); - replace_iupac_codes(ref,nref); - for (i=0; in_allele; i++) - { -@@ -422,7 +429,7 @@ - - // Have the alleles changed? - als[0].s[ als[0].l ] = 0; // in order for strcmp to work -- if ( ori_pos==line->pos && !strcmp(line->d.allele[0],als[0].s) ) return ERR_OK; -+ if ( ori_pos==line->pos && !strcasecmp(line->d.allele[0],als[0].s) ) return ERR_OK; - - // Create new block of alleles and update - args->tmp_als_str.l = 0; -@@ -461,23 +468,68 @@ - if ( len==BCF_VL_A ) \ - { \ - if ( ret!=src->n_allele-1 ) \ -- error("Error: wrong number of fields in INFO/%s at %s:%d, expected %d, found %d\n", \ -- tag,bcf_seqname(args->hdr,src),src->pos+1,src->n_allele-1,ret); \ -+ { \ -+ if ( args->force && !args->force_warned ) \ -+ { \ -+ fprintf(bcftools_stderr, \ -+ "Warning: wrong number of fields in INFO/%s at %s:%"PRId64", expected %d, found %d\n" \ -+ " (This warning is printed only once.)\n", \ -+ tag,bcf_seqname(args->hdr,src),(int64_t) src->pos+1,src->n_allele-1,ret); \ -+ args->force_warned = 1; \ -+ } \ -+ if ( args->force ) \ -+ { \ -+ bcf_update_info_##type(args->hdr,dst,tag,NULL,0); \ -+ return; \ -+ } \ -+ error("Error: wrong number of fields in INFO/%s at %s:%"PRId64", expected %d, found %d\n", \ -+ tag,bcf_seqname(args->hdr,src),(int64_t) src->pos+1,src->n_allele-1,ret); \ -+ } \ - bcf_update_info_##type(args->hdr,dst,tag,vals+ialt,1); \ - } \ - else if ( len==BCF_VL_R ) \ - { \ - if ( ret!=src->n_allele ) \ -- error("Error: wrong number of fields in INFO/%s at %s:%d, expected %d, found %d\n", \ -- 
tag,bcf_seqname(args->hdr,src),src->pos+1,src->n_allele,ret); \ -+ { \ -+ if ( args->force && !args->force_warned ) \ -+ { \ -+ fprintf(bcftools_stderr, \ -+ "Warning: wrong number of fields in INFO/%s at %s:%"PRId64", expected %d, found %d\n" \ -+ " (This warning is printed only once.)\n", \ -+ tag,bcf_seqname(args->hdr,src),(int64_t) src->pos+1,src->n_allele,ret); \ -+ args->force_warned = 1; \ -+ } \ -+ if ( args->force ) \ -+ { \ -+ bcf_update_info_##type(args->hdr,dst,tag,NULL,0); \ -+ return; \ -+ } \ -+ error("Error: wrong number of fields in INFO/%s at %s:%"PRId64", expected %d, found %d\n", \ -+ tag,bcf_seqname(args->hdr,src),(int64_t) src->pos+1,src->n_allele,ret); \ -+ } \ - if ( ialt!=0 ) vals[1] = vals[ialt+1]; \ - bcf_update_info_##type(args->hdr,dst,tag,vals,2); \ - } \ - else if ( len==BCF_VL_G ) \ - { \ - if ( ret!=src->n_allele*(src->n_allele+1)/2 ) \ -- error("Error: wrong number of fields in INFO/%s at %s:%d, expected %d, found %d\n", \ -- tag,bcf_seqname(args->hdr,src),src->pos+1,src->n_allele*(src->n_allele+1)/2,ret); \ -+ { \ -+ if ( args->force && !args->force_warned ) \ -+ { \ -+ fprintf(bcftools_stderr, \ -+ "Warning: wrong number of fields in INFO/%s at %s:%"PRId64", expected %d, found %d\n" \ -+ " (This warning is printed only once.)\n", \ -+ tag,bcf_seqname(args->hdr,src),(int64_t) src->pos+1,src->n_allele*(src->n_allele+1)/2,ret); \ -+ args->force_warned = 1; \ -+ } \ -+ if ( args->force ) \ -+ { \ -+ bcf_update_info_##type(args->hdr,dst,tag,NULL,0); \ -+ return; \ -+ } \ -+ error("Error: wrong number of fields in INFO/%s at %s:%"PRId64", expected %d, found %d\n", \ -+ tag,bcf_seqname(args->hdr,src),(int64_t) src->pos+1,src->n_allele*(src->n_allele+1)/2,ret); \ -+ } \ - if ( ialt!=0 ) \ - { \ - vals[1] = vals[bcf_alleles2gt(0,ialt+1)]; \ -@@ -622,8 +674,23 @@ - if ( len==BCF_VL_A ) \ - { \ - if ( nvals!=(src->n_allele-1)*nsmpl ) \ -- error("Error: wrong number of fields in FMT/%s at %s:%d, expected %d, found %d\n", \ -- 
tag,bcf_seqname(args->hdr,src),src->pos+1,(src->n_allele-1)*nsmpl,nvals); \ -+ { \ -+ if ( args->force && !args->force_warned ) \ -+ { \ -+ fprintf(bcftools_stderr, \ -+ "Warning: wrong number of fields in FMT/%s at %s:%"PRId64", expected %d, found %d. Removing the field.\n" \ -+ " (This warning is printed only once.)\n", \ -+ tag,bcf_seqname(args->hdr,src),(int64_t) src->pos+1,(src->n_allele-1)*nsmpl,nvals); \ -+ args->force_warned = 1; \ -+ } \ -+ if ( args->force ) \ -+ { \ -+ bcf_update_format_##type(args->hdr,dst,tag,NULL,0); \ -+ return; \ -+ } \ -+ error("Error: wrong number of fields in FMT/%s at %s:%"PRId64", expected %d, found %d\n", \ -+ tag,bcf_seqname(args->hdr,src),(int64_t) src->pos+1,(src->n_allele-1)*nsmpl,nvals); \ -+ } \ - nvals /= nsmpl; \ - type_t *src_vals = vals, *dst_vals = vals; \ - for (i=0; in_allele*nsmpl ) \ -- error("Error: wrong number of fields in FMT/%s at %s:%d, expected %d, found %d\n", \ -- tag,bcf_seqname(args->hdr,src),src->pos+1,src->n_allele*nsmpl,nvals); \ -+ { \ -+ if ( args->force && !args->force_warned ) \ -+ { \ -+ fprintf(bcftools_stderr, \ -+ "Warning: wrong number of fields in FMT/%s at %s:%"PRId64", expected %d, found %d. 
Removing the field.\n" \ -+ " (This warning is printed only once.)\n", \ -+ tag,bcf_seqname(args->hdr,src),(int64_t) src->pos+1,(src->n_allele-1)*nsmpl,nvals); \ -+ args->force_warned = 1; \ -+ } \ -+ if ( args->force ) \ -+ { \ -+ bcf_update_format_##type(args->hdr,dst,tag,NULL,0); \ -+ return; \ -+ } \ -+ error("Error: wrong number of fields in FMT/%s at %s:%"PRId64", expected %d, found %d\n", \ -+ tag,bcf_seqname(args->hdr,src),(int64_t) src->pos+1,src->n_allele*nsmpl,nvals); \ -+ } \ - nvals /= nsmpl; \ - type_t *src_vals = vals, *dst_vals = vals; \ - for (i=0; in_allele*(src->n_allele+1)/2*nsmpl && nvals!=src->n_allele*nsmpl ) \ -- error("Error at %s:%d, the tag %s has wrong number of fields\n", bcf_seqname(args->hdr,src),src->pos+1,bcf_hdr_int2id(args->hdr,BCF_DT_ID,fmt->id)); \ -+ { \ -+ if ( args->force && !args->force_warned ) \ -+ { \ -+ fprintf(bcftools_stderr, \ -+ "Warning: wrong number of fields in FMT/%s at %s:%"PRId64", expected %d, found %d. Removing the field.\n" \ -+ " (This warning is printed only once.)\n", \ -+ tag,bcf_seqname(args->hdr,src),(int64_t) src->pos+1,(src->n_allele-1)*nsmpl,nvals); \ -+ args->force_warned = 1; \ -+ } \ -+ if ( args->force ) \ -+ { \ -+ bcf_update_format_##type(args->hdr,dst,tag,NULL,0); \ -+ return; \ -+ } \ -+ error("Error at %s:%"PRId64", the tag %s has wrong number of fields\n", bcf_seqname(args->hdr,src),(int64_t) src->pos+1,bcf_hdr_int2id(args->hdr,BCF_DT_ID,fmt->id)); \ -+ } \ - nvals /= nsmpl; \ - int all_haploid = nvals==src->n_allele ? 1 : 0; \ - type_t *src_vals = vals, *dst_vals = vals; \ -@@ -706,6 +803,7 @@ - { - const char *tag = bcf_hdr_int2id(args->hdr,BCF_DT_ID,fmt->id); - int ret = bcf_get_format_char(args->hdr,src,tag,&args->tmp_arr1,&args->ntmp_arr1); -+ if ( !ret ) return; // all values can be empty, leave out the tag, no need to panic - assert( ret>0 ); - - kstring_t str; -@@ -762,9 +860,25 @@ - if ( *se==',' ) nfields++; - se++; - } -+ if ( nfields==1 && se-ptr==1 && *ptr=='.' 
) continue; // missing value - if ( nfields!=src->n_allele*(src->n_allele+1)/2 && nfields!=src->n_allele ) -- error("Error: wrong number of fields in FMT/%s at %s:%d, expected %d or %d, found %d\n", -- tag,bcf_seqname(args->hdr,src),src->pos+1,src->n_allele*(src->n_allele+1)/2,src->n_allele,nfields); -+ { -+ if ( args->force && !args->force_warned ) -+ { -+ fprintf(bcftools_stderr, -+ "Warning: wrong number of fields in FMT/%s at %s:%"PRId64", expected %d or %d, found %d. Removing the field.\n" -+ " (This warning is printed only once.)\n", -+ tag,bcf_seqname(args->hdr,src),(int64_t)src->pos+1,src->n_allele*(src->n_allele+1)/2,src->n_allele,nfields); -+ args->force_warned = 1; -+ } -+ if ( args->force ) -+ { -+ bcf_update_format_char(args->hdr,dst,tag,NULL,0); -+ return; -+ } -+ error("Error: wrong number of fields in FMT/%s at %s:%"PRId64", expected %d or %d, found %d\n", -+ tag,bcf_seqname(args->hdr,src),(int64_t) src->pos+1,src->n_allele*(src->n_allele+1)/2,src->n_allele,nfields); -+ } - - int len = 0; - if ( nfields==src->n_allele ) // haploid -@@ -890,7 +1004,7 @@ - if ( len==BCF_VL_A ) \ - { \ - if (nvals_ori!=lines[0]->n_allele - 1) \ -- error("vcfnorm: number of fields in first record at position %s:%d for INFO tag %s not as expected [found: %d vs expected:%d]\n", bcf_seqname(args->hdr,lines[0]),lines[0]->pos+1, tag, nvals_ori, lines[0]->n_allele-1); \ -+ error("vcfnorm: number of fields in first record at position %s:%"PRId64" for INFO tag %s not as expected [found: %d vs expected:%d]\n", bcf_seqname(args->hdr,lines[0]),(int64_t) lines[0]->pos+1, tag, nvals_ori, lines[0]->n_allele-1); \ - int nvals = dst->n_allele - 1; \ - ENLARGE_ARRAY(type_t,set_missing,args->tmp_arr1,args->ntmp_arr1,1,nvals_ori,nvals); \ - vals = (type_t*) args->tmp_arr1; \ -@@ -901,7 +1015,7 @@ - if (nvals2<0) continue; /* info tag does not exist in this record, skip */ \ - args->ntmp_arr2 = ntmp2 * sizeof(type_t); \ - if (nvals2!=lines[i]->n_allele-1) \ -- error("vcfnorm: could not 
merge INFO tag %s at position %s:%d\n", tag, bcf_seqname(args->hdr,lines[i]),lines[i]->pos+1); \ -+ error("vcfnorm: could not merge INFO tag %s at position %s:%"PRId64"\n", tag, bcf_seqname(args->hdr,lines[i]),(int64_t) lines[i]->pos+1); \ - vals2 = (type_t*) args->tmp_arr2; \ - for (k=0; kn_allele) \ -- error("vcfnorm: number of fields in first record at position %s:%d for INFO tag %s not as expected [found: %d vs expected:%d]\n", bcf_seqname(args->hdr,lines[0]),lines[0]->pos+1, tag, nvals_ori, lines[0]->n_allele); \ -+ error("vcfnorm: number of fields in first record at position %s:%"PRId64" for INFO tag %s not as expected [found: %d vs expected:%d]\n", bcf_seqname(args->hdr,lines[0]),(int64_t) lines[0]->pos+1, tag, nvals_ori, lines[0]->n_allele); \ - int nvals = dst->n_allele; \ - ENLARGE_ARRAY(type_t,set_missing,args->tmp_arr1,args->ntmp_arr1,1,nvals_ori,nvals); \ - vals = (type_t*) args->tmp_arr1; \ -@@ -925,7 +1039,7 @@ - if (nvals2<0) continue; /* info tag does not exist in this record, skip */ \ - args->ntmp_arr2 = ntmp2 * sizeof(type_t); \ - if (nvals2!=lines[i]->n_allele) \ -- error("vcfnorm: could not merge INFO tag %s at position %s:%d\n", tag, bcf_seqname(args->hdr,lines[i]),lines[i]->pos+1); \ -+ error("vcfnorm: could not merge INFO tag %s at position %s:%"PRId64"\n", tag, bcf_seqname(args->hdr,lines[i]),(int64_t) lines[i]->pos+1); \ - vals2 = (type_t*) args->tmp_arr2; \ - for (k=0; kn_allele*(lines[0]->n_allele+1)/2) { \ - fprintf(bcftools_stderr, "todo: merge Number=G INFO fields for haploid sites\n"); \ -- error("vcfnorm: number of fields in first record at position %s:%d for INFO tag %s not as expected [found: %d vs expected:%d]\n", bcf_seqname(args->hdr,lines[0]),lines[0]->pos+1, tag, nvals_ori, lines[0]->n_allele*(lines[0]->n_allele+1)/2); \ -+ error("vcfnorm: number of fields in first record at position %s:%"PRId64" for INFO tag %s not as expected [found: %d vs expected:%d]\n", bcf_seqname(args->hdr,lines[0]),(int64_t) lines[0]->pos+1, tag, 
nvals_ori, lines[0]->n_allele*(lines[0]->n_allele+1)/2); \ - } \ - int nvals = dst->n_allele*(dst->n_allele+1)/2; \ - ENLARGE_ARRAY(type_t,set_missing,args->tmp_arr1,args->ntmp_arr1,1,nvals_ori,nvals); \ -@@ -952,7 +1066,7 @@ - if (nvals2<0) continue; /* info tag does not exist in this record, skip */ \ - args->ntmp_arr2 = ntmp2 * sizeof(type_t); \ - if (nvals2!=lines[i]->n_allele*(lines[i]->n_allele+1)/2) \ -- error("vcfnorm: could not merge INFO tag %s at position %s:%d\n", tag, bcf_seqname(args->hdr,lines[i]),lines[i]->pos+1); \ -+ error("vcfnorm: could not merge INFO tag %s at position %s:%"PRId64"\n", tag, bcf_seqname(args->hdr,lines[i]),(int64_t) lines[i]->pos+1); \ - vals2 = (type_t*) args->tmp_arr2; \ - int ia,ib; \ - k = 0; \ -@@ -1064,7 +1178,7 @@ - int ngts2 = bcf_get_genotypes(args->hdr,lines[i],&args->tmp_arr2,&ntmp2); - args->ntmp_arr2 = ntmp2 * 4; - ngts2 /= nsmpl; -- if ( ngts!=ngts2 ) error("Error at %s:%d: cannot combine diploid with haploid genotype\n", bcf_seqname(args->hdr,lines[i]),lines[i]->pos+1); -+ if ( ngts!=ngts2 ) error("Error at %s:%"PRId64": cannot combine diploid with haploid genotype\n", bcf_seqname(args->hdr,lines[i]),(int64_t) lines[i]->pos+1); - - int32_t *gt = (int32_t*) args->tmp_arr1; - int32_t *gt2 = (int32_t*) args->tmp_arr2; -@@ -1078,7 +1192,7 @@ - else - { - int ial = bcf_gt_allele(gt2[k]); -- if ( ial>=args->maps[i].nals ) error("Error at %s:%d: incorrect allele index %d\n",bcf_seqname(args->hdr,lines[i]),lines[i]->pos+1,ial); -+ if ( ial>=args->maps[i].nals ) error("Error at %s:%"PRId64": incorrect allele index %d\n",bcf_seqname(args->hdr,lines[i]),(int64_t) lines[i]->pos+1,ial); - gt[k] = bcf_gt_unphased( args->maps[i].map[ial] ) | bcf_gt_is_phased(gt[k]); - } - } -@@ -1125,7 +1239,7 @@ - args->ntmp_arr2 = ntmp2 * sizeof(type_t); \ - nvals2 /= nsmpl; \ - if (nvals2!=lines[i]->n_allele-1) \ -- error("vcfnorm: could not merge FORMAT tag %s at position %s:%d\n", tag, bcf_seqname(args->hdr,lines[i]),lines[i]->pos+1); \ -+ 
error("vcfnorm: could not merge FORMAT tag %s at position %s:%"PRId64"\n", tag, bcf_seqname(args->hdr,lines[i]),(int64_t) lines[i]->pos+1); \ - vals = (type_t*) args->tmp_arr1; \ - vals2 = (type_t*) args->tmp_arr2; \ - for (j=0; jntmp_arr2 = ntmp2 * sizeof(type_t); \ - nvals2 /= nsmpl; \ - if (nvals2!=lines[i]->n_allele) \ -- error("vcfnorm: could not merge FORMAT tag %s at position %s:%d\n", tag, bcf_seqname(args->hdr,lines[i]),lines[i]->pos+1); \ -+ error("vcfnorm: could not merge FORMAT tag %s at position %s:%"PRId64"\n", tag, bcf_seqname(args->hdr,lines[i]),(int64_t) lines[i]->pos+1); \ - vals = (type_t*) args->tmp_arr1; \ - vals2 = (type_t*) args->tmp_arr2; \ - for (j=0; jn_allele*(lines[i]->n_allele+1)/2; \ - int line_diploid = nvals2==ndiploid ? 1 : 0; \ - if (!(nvals2==1 || nvals2==lines[i]->n_allele || nvals2==lines[i]->n_allele*(lines[i]->n_allele+1)/2)) \ -- error("vcfnorm: could not merge FORMAT tag %s at position %s:%d\n", tag, bcf_seqname(args->hdr,lines[i]),lines[i]->pos+1); \ -+ error("vcfnorm: could not merge FORMAT tag %s at position %s:%"PRId64"\n", tag, bcf_seqname(args->hdr,lines[i]),(int64_t) lines[i]->pos+1); \ - vals = (type_t*) args->tmp_arr1; \ - vals2 = (type_t*) args->tmp_arr2; \ - for (j=0; jn_allele*(dst->n_allele+1)/2; - } -- else error("The field %s at %s:%d neither diploid nor haploid?\n", tag,bcf_seqname(args->hdr,dst),dst->pos+1); -+ else error("The field %s at %s:%"PRId64" neither diploid nor haploid?\n", tag,bcf_seqname(args->hdr,dst),(int64_t) dst->pos+1); - - kstring_t *tmp = &args->tmp_str[i]; - kputc('.',tmp); -@@ -1417,7 +1531,7 @@ - args->maps[i].nals = lines[i]->n_allele; - hts_expand(int,args->maps[i].nals,args->maps[i].mals,args->maps[i].map); - args->als = merge_alleles(lines[i]->d.allele, lines[i]->n_allele, args->maps[i].map, args->als, &args->nals, &args->mals); -- if ( !args->als ) error("Failed to merge alleles at %s:%d\n", bcf_seqname(args->hdr,dst),dst->pos+1); -+ if ( !args->als ) error("Failed to merge alleles 
at %s:%"PRId64"\n", bcf_seqname(args->hdr,dst),(int64_t) dst->pos+1); - } - bcf_update_alleles(args->hdr, dst, (const char**)args->als, args->nals); - for (i=0; inals; i++) -@@ -1535,11 +1649,11 @@ - } - return NULL; - } --static void cmpals_add(args_t *args, bcf1_t *rec) -+static void cmpals_add(cmpals_t *ca, bcf1_t *rec) - { -- args->ncmpals++; -- hts_expand0(cmpals_t, args->ncmpals, args->mcmpals, args->cmpals); -- cmpals_t *cmpals = args->cmpals + args->ncmpals - 1; -+ ca->ncmpals++; -+ hts_expand0(cmpals1_t, ca->ncmpals, ca->mcmpals, ca->cmpals); -+ cmpals1_t *cmpals = ca->cmpals + ca->ncmpals - 1; - free(cmpals->ref); - cmpals->ref = strdup(rec->d.allele[0]); - cmpals->n = rec->n_allele; -@@ -1557,21 +1671,21 @@ - khash_str2int_inc(cmpals->hash, strdup(rec->d.allele[i])); - } - } --static int cmpals_match(args_t *args, bcf1_t *rec) -+static int cmpals_match(cmpals_t *ca, bcf1_t *rec) - { - int i, j; -- for (i=0; incmpals; i++) -+ for (i=0; incmpals; i++) - { -- cmpals_t *cmpals = args->cmpals + i; -+ cmpals1_t *cmpals = ca->cmpals + i; - if ( rec->n_allele != cmpals->n ) continue; - - // NB. 
assuming both are normalized -- if ( strcmp(rec->d.allele[0], cmpals->ref) ) continue; -+ if ( strcasecmp(rec->d.allele[0], cmpals->ref) ) continue; - - // the most frequent case - if ( rec->n_allele==2 ) - { -- if ( strcmp(rec->d.allele[1], cmpals->alt) ) continue; -+ if ( strcasecmp(rec->d.allele[1], cmpals->alt) ) continue; - return 1; - } - -@@ -1581,21 +1695,20 @@ - if ( jn_allele ) continue; - return 1; - } -- cmpals_add(args, rec); - return 0; - } --static void cmpals_reset(args_t *args) { args->ncmpals = 0; } --static void cmpals_destroy(args_t *args) -+static void cmpals_reset(cmpals_t *ca) { ca->ncmpals = 0; } -+static void cmpals_destroy(cmpals_t *ca) - { - int i; -- for (i=0; imcmpals; i++) -+ for (i=0; imcmpals; i++) - { -- cmpals_t *cmpals = args->cmpals + i; -+ cmpals1_t *cmpals = ca->cmpals + i; - free(cmpals->ref); - free(cmpals->alt); - if ( cmpals->hash ) khash_str2int_destroy_free(cmpals->hash); - } -- free(args->cmpals); -+ free(ca->cmpals); - } - - static void flush_buffer(args_t *args, htsFile *file, int n) -@@ -1610,7 +1723,8 @@ - { - if ( mrows_ready_to_flush(args, args->lines[k]) ) - { -- while ( (line=mrows_flush(args)) ) bcf_write1(file, args->hdr, line); -+ while ( (line=mrows_flush(args)) ) -+ if ( bcf_write1(file, args->hdr, line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); - } - int merge = 1; - if ( args->mrows_collapse!=COLLAPSE_BOTH && args->mrows_collapse!=COLLAPSE_ANY ) -@@ -1631,23 +1745,24 @@ - if ( args->rmdup & BCF_SR_PAIR_ANY ) continue; // rmdup by position only - if ( args->rmdup & BCF_SR_PAIR_SNPS && line_type&(VCF_SNP|VCF_MNP) && prev_type&(VCF_SNP|VCF_MNP) ) continue; - if ( args->rmdup & BCF_SR_PAIR_INDELS && line_type&(VCF_INDEL) && prev_type&(VCF_INDEL) ) continue; -- if ( args->rmdup & BCF_SR_PAIR_EXACT && cmpals_match(args, args->lines[k]) ) continue; -+ if ( args->rmdup & BCF_SR_PAIR_EXACT && cmpals_match(&args->cmpals_out, args->lines[k]) ) continue; - } - else - { - prev_rid = 
args->lines[k]->rid; - prev_pos = args->lines[k]->pos; - prev_type = 0; -- if ( args->rmdup & BCF_SR_PAIR_EXACT ) cmpals_reset(args); -+ if ( args->rmdup & BCF_SR_PAIR_EXACT ) cmpals_reset(&args->cmpals_out); - } - prev_type |= line_type; -- if ( args->rmdup & BCF_SR_PAIR_EXACT ) cmpals_add(args, args->lines[k]); -+ if ( args->rmdup & BCF_SR_PAIR_EXACT ) cmpals_add(&args->cmpals_out, args->lines[k]); - } -- bcf_write1(file, args->hdr, args->lines[k]); -+ if ( bcf_write1(file, args->hdr, args->lines[k])!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); - } - if ( args->mrows_op==MROWS_MERGE && !args->rbuf.n ) - { -- while ( (line=mrows_flush(args)) ) bcf_write1(file, args->hdr, line); -+ while ( (line=mrows_flush(args)) ) -+ if ( bcf_write1(file, args->hdr, line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); - } - } - -@@ -1671,7 +1786,8 @@ - - static void destroy_data(args_t *args) - { -- cmpals_destroy(args); -+ cmpals_destroy(&args->cmpals_in); -+ cmpals_destroy(&args->cmpals_out); - int i; - for (i=0; irbuf.m; i++) - if ( args->lines[i] ) bcf_destroy1(args->lines[i]); -@@ -1729,9 +1845,9 @@ - if ( args->check_ref & CHECK_REF_FIX ) - fix_dup_alt(args, line); - else if ( args->check_ref==CHECK_REF_EXIT ) -- error("Duplicate alleles at %s:%d; run with -cw to turn the error into warning or with -cs to fix.\n", bcf_seqname(args->hdr,line),line->pos+1); -+ error("Duplicate alleles at %s:%"PRId64"; run with -cw to turn the error into warning or with -cs to fix.\n", bcf_seqname(args->hdr,line),(int64_t) line->pos+1); - else if ( args->check_ref & CHECK_REF_WARN ) -- fprintf(bcftools_stderr,"ALT_DUP\t%s\t%d\n", bcf_seqname(args->hdr,line),line->pos+1); -+ fprintf(bcftools_stderr,"ALT_DUP\t%s\t%"PRId64"\n", bcf_seqname(args->hdr,line),(int64_t) line->pos+1); - } - } - } -@@ -1756,7 +1872,7 @@ - if ( args->n_threads ) - hts_set_opt(out, HTS_OPT_THREAD_POOL, args->files->p); - if (args->record_cmd_line) 
bcf_hdr_append_version(args->hdr, args->argc, args->argv, "bcftools_norm"); -- bcf_hdr_write(out, args->hdr); -+ if ( bcf_hdr_write(out, args->hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); - - int prev_rid = -1, prev_pos = -1, prev_type = 0; - while ( bcf_sr_next_line(args->files) ) -@@ -1772,17 +1888,17 @@ - if ( args->rmdup & BCF_SR_PAIR_ANY ) continue; // rmdup by position only - if ( args->rmdup & BCF_SR_PAIR_SNPS && line_type&(VCF_SNP|VCF_MNP) && prev_type&(VCF_SNP|VCF_MNP) ) continue; - if ( args->rmdup & BCF_SR_PAIR_INDELS && line_type&(VCF_INDEL) && prev_type&(VCF_INDEL) ) continue; -- if ( args->rmdup & BCF_SR_PAIR_EXACT && cmpals_match(args, line) ) continue; -+ if ( args->rmdup & BCF_SR_PAIR_EXACT && cmpals_match(&args->cmpals_in, line) ) continue; - } - else - { - prev_rid = line->rid; - prev_pos = line->pos; - prev_type = 0; -- if ( args->rmdup & BCF_SR_PAIR_EXACT ) cmpals_reset(args); -+ if ( args->rmdup & BCF_SR_PAIR_EXACT ) cmpals_reset(&args->cmpals_in); - } - prev_type |= line_type; -- if ( args->rmdup & BCF_SR_PAIR_EXACT ) cmpals_add(args, line); -+ if ( args->rmdup & BCF_SR_PAIR_EXACT ) cmpals_add(&args->cmpals_in, line); - } - - // still on the same chromosome? -@@ -1821,7 +1937,7 @@ - if ( j>0 ) flush_buffer(args, out, j); - } - flush_buffer(args, out, args->rbuf.n); -- hts_close(out); -+ if ( hts_close(out)!=0 ) error("[%s] Error: close failed .. 
%s\n", __func__,args->output_fname); - - fprintf(bcftools_stderr,"Lines total/split/realigned/skipped:\t%d/%d/%d/%d\n", args->ntotal,args->nsplit,args->nchanged,args->nskipped); - if ( args->check_ref & CHECK_REF_FIX ) -@@ -1839,8 +1955,9 @@ - fprintf(bcftools_stderr, "Options:\n"); - fprintf(bcftools_stderr, " -c, --check-ref check REF alleles and exit (e), warn (w), exclude (x), or set (s) bad sites [e]\n"); - fprintf(bcftools_stderr, " -D, --remove-duplicates remove duplicate lines of the same type.\n"); -- fprintf(bcftools_stderr, " -d, --rm-dup remove duplicate snps|indels|both|all|none\n"); -- fprintf(bcftools_stderr, " -f, --fasta-ref reference sequence (MANDATORY)\n"); -+ fprintf(bcftools_stderr, " -d, --rm-dup remove duplicate snps|indels|both|all|exact\n"); -+ fprintf(bcftools_stderr, " -f, --fasta-ref reference sequence\n"); -+ fprintf(bcftools_stderr, " --force try to proceed even if malformed tags are encountered. Experimental, use at your own risk\n"); - fprintf(bcftools_stderr, " -m, --multiallelics <-|+>[type] split multiallelics (-) or join biallelics (+), type: snps|indels|both|any [both]\n"); - fprintf(bcftools_stderr, " --no-version do not append version and command line to the header\n"); - fprintf(bcftools_stderr, " -N, --do-not-normalize do not normalize indels (with -m or -c s)\n"); -@@ -1851,9 +1968,16 @@ - fprintf(bcftools_stderr, " -s, --strict-filter when merging (-m+), merged site is PASS only if all sites being merged PASS\n"); - fprintf(bcftools_stderr, " -t, --targets similar to -r but streams rather than index-jumps\n"); - fprintf(bcftools_stderr, " -T, --targets-file similar to -R but streams rather than index-jumps\n"); -- fprintf(bcftools_stderr, " --threads number of extra (de)compression threads [0]\n"); -+ fprintf(bcftools_stderr, " --threads use multithreading with worker threads [0]\n"); - fprintf(bcftools_stderr, " -w, --site-win buffer for sorting lines which changed position during realignment [1000]\n"); - 
fprintf(bcftools_stderr, "\n"); -+ fprintf(bcftools_stderr, "Examples:\n"); -+ fprintf(bcftools_stderr, " # normalize and left-align indels\n"); -+ fprintf(bcftools_stderr, " bcftools norm -f ref.fa in.vcf\n"); -+ fprintf(bcftools_stderr, "\n"); -+ fprintf(bcftools_stderr, " # split multi-allelic sites\n"); -+ fprintf(bcftools_stderr, " bcftools norm -m- in.vcf\n"); -+ fprintf(bcftools_stderr, "\n"); - exit(1); - } - -@@ -1877,6 +2001,7 @@ - static struct option loptions[] = - { - {"help",no_argument,NULL,'h'}, -+ {"force",no_argument,NULL,7}, - {"fasta-ref",required_argument,NULL,'f'}, - {"do-not-normalize",no_argument,NULL,'N'}, - {"multiallelics",required_argument,NULL,'m'}, -@@ -1906,6 +2031,7 @@ - else if ( !strcmp("all",optarg) ) args->rmdup = BCF_SR_PAIR_ANY; - else if ( !strcmp("any",optarg) ) args->rmdup = BCF_SR_PAIR_ANY; - else if ( !strcmp("none",optarg) ) args->rmdup = BCF_SR_PAIR_EXACT; -+ else if ( !strcmp("exact",optarg) ) args->rmdup = BCF_SR_PAIR_EXACT; - else error("The argument to -d not recognised: %s\n", optarg); - break; - case 'm': -@@ -1953,8 +2079,9 @@ - break; - case 9 : args->n_threads = strtol(optarg, 0, 0); break; - case 8 : args->record_cmd_line = 0; break; -+ case 7 : args->force = 1; break; - case 'h': -- case '?': usage(); -+ case '?': usage(); break; - default: error("Unknown argument: %s\n", optarg); - } - } -@@ -1968,7 +2095,8 @@ - else fname = argv[optind]; - - if ( !args->ref_fname && !args->mrows_op && !args->rmdup ) error("Expected -f, -m, -D or -d option\n"); -- if ( !args->ref_fname && args->check_ref&CHECK_REF_FIX ) error("Expected --fasta-ref with --check-ref s\n"); -+ if ( !args->check_ref && args->ref_fname ) args->check_ref = CHECK_REF_EXIT; -+ if ( args->check_ref && !args->ref_fname ) error("Expected --fasta-ref with --check-ref\n"); - - if ( args->region ) - { -@@ -1982,7 +2110,7 @@ - } - - if ( bcf_sr_set_threads(args->files, args->n_threads)<0 ) error("Failed to create threads\n"); -- if ( 
!bcf_sr_add_reader(args->files, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->files->errnum)); -+ if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to read from %s: %s\n", !strcmp("-",fname)?"standard input":fname,bcf_sr_strerror(args->files->errnum)); - if ( args->mrows_op&MROWS_SPLIT && args->rmdup ) error("Cannot combine -D and -m-\n"); - init_data(args); - normalize_vcf(args); ---- python-pysam.orig/bcftools/vcfplugin.c -+++ python-pysam/bcftools/vcfplugin.c -@@ -38,7 +38,11 @@ - #include - #include - #include -+#ifdef _WIN32 -+#include -+#else - #include -+#endif - #include "bcftools.h" - #include "vcmp.h" - #include "filter.h" -@@ -154,7 +158,7 @@ - { - while (1) - { -- size_t len = strcspn(path, ":"); -+ size_t len = strcspn(path, HTS_PATH_SEPARATOR_STR); - - if ( len == 0 ) - { -@@ -185,7 +189,7 @@ - } - - path += len; -- if ( *path == ':' ) path++; -+ if ( *path == HTS_PATH_SEPARATOR_CHAR ) path++; - else break; - } - } -@@ -207,28 +211,55 @@ - - void *handle; - char *tmp; -- if ( fname[0]!='/' ) // not an absolute path -+ int is_absolute_path = 0; -+#ifdef _WIN32 -+ // Windows accepts both forward slash (/) and backslash (\) as folder separator -+ // and can have any path prefixed by the drive letter and a colon (:). -+ if ( fname[0]=='/' || fname[0]=='\\') is_absolute_path = 1; -+ else if ( fname[0] && fname[1]==':' && (fname[2]=='/' || fname[2]=='\\') ) is_absolute_path = 1; -+#else -+ if ( fname[0]=='/' ) is_absolute_path = 1; -+#endif -+ if ( !is_absolute_path ) - { - int i; - for (i=0; inplugin_paths; i++) - { -- tmp = msprintf("%s/%s%s", args->plugin_paths[i], fname, PLUGIN_EXT); -+ tmp = msprintf("%s/%s%s", args->plugin_paths[i], fname, PLUGIN_EXT); -+#ifdef _WIN32 -+ handle = LoadLibraryA(tmp); -+#else - handle = dlopen(tmp, RTLD_NOW); // valgrind complains about unfreed memory, not our problem though -+#endif - if ( args->verbose > 1 ) - { -- if ( !handle ) fprintf(stderr,"%s:\n\tdlopen .. 
%s\n", tmp,dlerror()); -- else fprintf(stderr,"%s:\n\tdlopen .. ok\n", tmp); -+ if ( !handle ) -+#ifdef _WIN32 -+ fprintf(stderr,"%s:\n\tLoadLibraryA .. %lu\n", tmp, GetLastError()); -+#else -+ fprintf(stderr,"%s:\n\tdlopen .. %s\n", tmp, dlerror()); -+#endif -+ else fprintf(stderr,"%s:\n\tplugin open .. ok\n", tmp); - } - free(tmp); - if ( handle ) return handle; - } - } - -+#ifdef _WIN32 -+ handle = LoadLibraryA(fname); -+#else - handle = dlopen(fname, RTLD_NOW); -+#endif - if ( args->verbose > 1 ) - { -- if ( !handle ) fprintf(stderr,"%s:\n\tdlopen .. %s\n", fname,dlerror()); -- else fprintf(stderr,"%s:\n\tdlopen .. ok\n", fname); -+ if ( !handle ) -+#ifdef _WIN32 -+ fprintf(stderr,"%s:\n\tLoadLibraryA .. %lu\n", fname, GetLastError()); -+#else -+ fprintf(stderr,"%s:\n\tdlopen .. %s\n", fname, dlerror()); -+#endif -+ else fprintf(stderr,"%s:\n\tplugin open .. ok\n", fname); - } - - return handle; -@@ -264,6 +295,55 @@ - return -1; - } - -+#ifdef _WIN32 -+ plugin->init = (dl_init_f) GetProcAddress(plugin->handle, "init"); -+ if ( plugin->init && args->verbose > 1 ) fprintf(stderr,"\tinit .. ok\n"); -+ -+ plugin->run = (dl_run_f) GetProcAddress(plugin->handle, "run"); -+ if ( plugin->run && args->verbose > 1 ) fprintf(stderr,"\trun .. ok\n"); -+ -+ if ( !plugin->init && !plugin->run ) -+ { -+ if ( exit_on_error ) error("Could not initialize %s, neither run or init found \n", plugin->name); -+ else if ( args->verbose > 1 ) fprintf(stderr,"\tinit/run .. not found\n"); -+ return -1; -+ } -+ -+ plugin->version = (dl_version_f) GetProcAddress(plugin->handle, "version"); -+ if ( !plugin->version ) -+ { -+ if ( exit_on_error ) error("Could not initialize %s: version string not found\n", plugin->name); -+ else if ( args->verbose > 1 ) fprintf(stderr,"\tversion .. 
not found\n"); -+ return -1; -+ } -+ -+ plugin->about = (dl_about_f) GetProcAddress(plugin->handle, "about"); -+ if ( !plugin->about ) -+ { -+ if ( exit_on_error ) error("Could not initialize %s: about string not found\n", plugin->name); -+ return -1; -+ } -+ -+ plugin->usage = (dl_about_f) GetProcAddress(plugin->handle, "usage"); -+ if ( !plugin->usage ) -+ plugin->usage = plugin->about; -+ -+ if ( plugin->run ) return 0; -+ -+ plugin->process = (dl_process_f) GetProcAddress(plugin->handle, "process"); -+ if ( !plugin->process ) -+ { -+ if ( exit_on_error ) error("Could not initialize %s: process method not found\n", plugin->name); -+ return -1; -+ } -+ -+ plugin->destroy = (dl_destroy_f) GetProcAddress(plugin->handle, "destroy"); -+ if ( !plugin->destroy ) -+ { -+ if ( exit_on_error ) error("Could not initialize %s: destroy method not found\n", plugin->name); -+ return -1; -+ } -+#else - dlerror(); - plugin->init = (dl_init_f) dlsym(plugin->handle, "init"); - char *ret = dlerror(); -@@ -325,6 +405,7 @@ - if ( exit_on_error ) error("Could not initialize %s: %s\n", plugin->name, ret); - return -1; - } -+#endif - - return 0; - } -@@ -427,7 +508,7 @@ - args->out_fh = hts_open(args->output_fname,hts_bcf_wmode(args->output_type)); - if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno)); - if ( args->n_threads ) hts_set_threads(args->out_fh, args->n_threads); -- bcf_hdr_write(args->out_fh, args->hdr_out); -+ if ( bcf_hdr_write(args->out_fh, args->hdr_out)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); - } - } - -@@ -435,7 +516,11 @@ - { - free(args->plugin.name); - if ( args->plugin.destroy ) args->plugin.destroy(); -+#ifdef _WIN32 -+ FreeLibrary(args->plugin.handle); -+#else - dlclose(args->plugin.handle); -+#endif - if ( args->hdr_out ) bcf_hdr_destroy(args->hdr_out); - if ( args->nplugin_paths>0 ) - { -@@ -445,7 +530,7 @@ - } - if ( args->filter ) - filter_destroy(args->filter); -- if 
(args->out_fh) hts_close(args->out_fh); -+ if (args->out_fh && hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname); - } - - static void usage(args_t *args) -@@ -466,7 +551,7 @@ - fprintf(stderr, " --no-version do not append version and command line to the header\n"); - fprintf(stderr, " -o, --output write output to a file [standard output]\n"); - fprintf(stderr, " -O, --output-type 'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]\n"); -- fprintf(stderr, " --threads number of extra output compression threads [0]\n"); -+ fprintf(stderr, " --threads use multithreading with worker threads [0]\n"); - fprintf(stderr, "Plugin options:\n"); - fprintf(stderr, " -h, --help list plugin's options\n"); - fprintf(stderr, " -l, --list-plugins list available plugins. See BCFTOOLS_PLUGINS environment variable and man page for details\n"); -@@ -599,10 +684,16 @@ - char *fname = NULL; - if ( optind>=argc || argv[optind][0]=='-' ) - { -- if ( !isatty(fileno((FILE *)stdin)) ) fname = "-"; // reading from stdin -- else usage(args); - args->plugin.argc = argc - optind + 1; - args->plugin.argv = argv + optind - 1; -+ -+ if ( !isatty(fileno((FILE *)stdin)) ) fname = "-"; // reading from stdin -+ else if ( optind>=argc ) usage(args); -+ else -+ { -+ optind = 1; -+ init_plugin(args); -+ } - } - else - { -@@ -624,7 +715,7 @@ - error("Failed to read the targets: %s\n", args->targets_list); - args->files->collapse |= COLLAPSE_SOME; - } -- if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->files->errnum)); -+ if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to read from %s: %s\n", !strcmp("-",fname)?"standard input":fname,bcf_sr_strerror(args->files->errnum)); - - init_data(args); - while ( bcf_sr_next_line(args->files) ) -@@ -640,7 +731,7 @@ - if ( line ) - { - if ( line->errcode ) error("[E::main_plugin] Unchecked error (%d), 
exiting\n",line->errcode); -- bcf_write1(args->out_fh, args->hdr_out, line); -+ if ( bcf_write1(args->out_fh, args->hdr_out, line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); - } - } - destroy_data(args); ---- python-pysam.orig/bcftools/vcfplugin.c.pysam.c -+++ python-pysam/bcftools/vcfplugin.c.pysam.c -@@ -40,7 +40,11 @@ - #include - #include - #include -+#ifdef _WIN32 -+#include -+#else - #include -+#endif - #include "bcftools.h" - #include "vcmp.h" - #include "filter.h" -@@ -156,7 +160,7 @@ - { - while (1) - { -- size_t len = strcspn(path, ":"); -+ size_t len = strcspn(path, HTS_PATH_SEPARATOR_STR); - - if ( len == 0 ) - { -@@ -187,7 +191,7 @@ - } - - path += len; -- if ( *path == ':' ) path++; -+ if ( *path == HTS_PATH_SEPARATOR_CHAR ) path++; - else break; - } - } -@@ -209,28 +213,55 @@ - - void *handle; - char *tmp; -- if ( fname[0]!='/' ) // not an absolute path -+ int is_absolute_path = 0; -+#ifdef _WIN32 -+ // Windows accepts both forward slash (/) and backslash (\) as folder separator -+ // and can have any path prefixed by the drive letter and a colon (:). -+ if ( fname[0]=='/' || fname[0]=='\\') is_absolute_path = 1; -+ else if ( fname[0] && fname[1]==':' && (fname[2]=='/' || fname[2]=='\\') ) is_absolute_path = 1; -+#else -+ if ( fname[0]=='/' ) is_absolute_path = 1; -+#endif -+ if ( !is_absolute_path ) - { - int i; - for (i=0; inplugin_paths; i++) - { -- tmp = msprintf("%s/%s%s", args->plugin_paths[i], fname, PLUGIN_EXT); -+ tmp = msprintf("%s/%s%s", args->plugin_paths[i], fname, PLUGIN_EXT); -+#ifdef _WIN32 -+ handle = LoadLibraryA(tmp); -+#else - handle = dlopen(tmp, RTLD_NOW); // valgrind complains about unfreed memory, not our problem though -+#endif - if ( args->verbose > 1 ) - { -- if ( !handle ) fprintf(bcftools_stderr,"%s:\n\tdlopen .. %s\n", tmp,dlerror()); -- else fprintf(bcftools_stderr,"%s:\n\tdlopen .. ok\n", tmp); -+ if ( !handle ) -+#ifdef _WIN32 -+ fprintf(bcftools_stderr,"%s:\n\tLoadLibraryA .. 
%lu\n", tmp, GetLastError()); -+#else -+ fprintf(bcftools_stderr,"%s:\n\tdlopen .. %s\n", tmp, dlerror()); -+#endif -+ else fprintf(bcftools_stderr,"%s:\n\tplugin open .. ok\n", tmp); - } - free(tmp); - if ( handle ) return handle; - } - } - -+#ifdef _WIN32 -+ handle = LoadLibraryA(fname); -+#else - handle = dlopen(fname, RTLD_NOW); -+#endif - if ( args->verbose > 1 ) - { -- if ( !handle ) fprintf(bcftools_stderr,"%s:\n\tdlopen .. %s\n", fname,dlerror()); -- else fprintf(bcftools_stderr,"%s:\n\tdlopen .. ok\n", fname); -+ if ( !handle ) -+#ifdef _WIN32 -+ fprintf(bcftools_stderr,"%s:\n\tLoadLibraryA .. %lu\n", fname, GetLastError()); -+#else -+ fprintf(bcftools_stderr,"%s:\n\tdlopen .. %s\n", fname, dlerror()); -+#endif -+ else fprintf(bcftools_stderr,"%s:\n\tplugin open .. ok\n", fname); - } - - return handle; -@@ -266,6 +297,55 @@ - return -1; - } - -+#ifdef _WIN32 -+ plugin->init = (dl_init_f) GetProcAddress(plugin->handle, "init"); -+ if ( plugin->init && args->verbose > 1 ) fprintf(bcftools_stderr,"\tinit .. ok\n"); -+ -+ plugin->run = (dl_run_f) GetProcAddress(plugin->handle, "run"); -+ if ( plugin->run && args->verbose > 1 ) fprintf(bcftools_stderr,"\trun .. ok\n"); -+ -+ if ( !plugin->init && !plugin->run ) -+ { -+ if ( exit_on_error ) error("Could not initialize %s, neither run or init found \n", plugin->name); -+ else if ( args->verbose > 1 ) fprintf(bcftools_stderr,"\tinit/run .. not found\n"); -+ return -1; -+ } -+ -+ plugin->version = (dl_version_f) GetProcAddress(plugin->handle, "version"); -+ if ( !plugin->version ) -+ { -+ if ( exit_on_error ) error("Could not initialize %s: version string not found\n", plugin->name); -+ else if ( args->verbose > 1 ) fprintf(bcftools_stderr,"\tversion .. 
not found\n"); -+ return -1; -+ } -+ -+ plugin->about = (dl_about_f) GetProcAddress(plugin->handle, "about"); -+ if ( !plugin->about ) -+ { -+ if ( exit_on_error ) error("Could not initialize %s: about string not found\n", plugin->name); -+ return -1; -+ } -+ -+ plugin->usage = (dl_about_f) GetProcAddress(plugin->handle, "usage"); -+ if ( !plugin->usage ) -+ plugin->usage = plugin->about; -+ -+ if ( plugin->run ) return 0; -+ -+ plugin->process = (dl_process_f) GetProcAddress(plugin->handle, "process"); -+ if ( !plugin->process ) -+ { -+ if ( exit_on_error ) error("Could not initialize %s: process method not found\n", plugin->name); -+ return -1; -+ } -+ -+ plugin->destroy = (dl_destroy_f) GetProcAddress(plugin->handle, "destroy"); -+ if ( !plugin->destroy ) -+ { -+ if ( exit_on_error ) error("Could not initialize %s: destroy method not found\n", plugin->name); -+ return -1; -+ } -+#else - dlerror(); - plugin->init = (dl_init_f) dlsym(plugin->handle, "init"); - char *ret = dlerror(); -@@ -327,6 +407,7 @@ - if ( exit_on_error ) error("Could not initialize %s: %s\n", plugin->name, ret); - return -1; - } -+#endif - - return 0; - } -@@ -429,7 +510,7 @@ - args->out_fh = hts_open(args->output_fname,hts_bcf_wmode(args->output_type)); - if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno)); - if ( args->n_threads ) hts_set_threads(args->out_fh, args->n_threads); -- bcf_hdr_write(args->out_fh, args->hdr_out); -+ if ( bcf_hdr_write(args->out_fh, args->hdr_out)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); - } - } - -@@ -437,7 +518,11 @@ - { - free(args->plugin.name); - if ( args->plugin.destroy ) args->plugin.destroy(); -+#ifdef _WIN32 -+ FreeLibrary(args->plugin.handle); -+#else - dlclose(args->plugin.handle); -+#endif - if ( args->hdr_out ) bcf_hdr_destroy(args->hdr_out); - if ( args->nplugin_paths>0 ) - { -@@ -447,7 +532,7 @@ - } - if ( args->filter ) - filter_destroy(args->filter); -- if 
(args->out_fh) hts_close(args->out_fh); -+ if (args->out_fh && hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname); - } - - static void usage(args_t *args) -@@ -468,7 +553,7 @@ - fprintf(bcftools_stderr, " --no-version do not append version and command line to the header\n"); - fprintf(bcftools_stderr, " -o, --output write output to a file [standard output]\n"); - fprintf(bcftools_stderr, " -O, --output-type 'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]\n"); -- fprintf(bcftools_stderr, " --threads number of extra output compression threads [0]\n"); -+ fprintf(bcftools_stderr, " --threads use multithreading with worker threads [0]\n"); - fprintf(bcftools_stderr, "Plugin options:\n"); - fprintf(bcftools_stderr, " -h, --help list plugin's options\n"); - fprintf(bcftools_stderr, " -l, --list-plugins list available plugins. See BCFTOOLS_PLUGINS environment variable and man page for details\n"); -@@ -601,10 +686,16 @@ - char *fname = NULL; - if ( optind>=argc || argv[optind][0]=='-' ) - { -- if ( !isatty(fileno((FILE *)stdin)) ) fname = "-"; // reading from stdin -- else usage(args); - args->plugin.argc = argc - optind + 1; - args->plugin.argv = argv + optind - 1; -+ -+ if ( !isatty(fileno((FILE *)stdin)) ) fname = "-"; // reading from stdin -+ else if ( optind>=argc ) usage(args); -+ else -+ { -+ optind = 1; -+ init_plugin(args); -+ } - } - else - { -@@ -626,7 +717,7 @@ - error("Failed to read the targets: %s\n", args->targets_list); - args->files->collapse |= COLLAPSE_SOME; - } -- if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->files->errnum)); -+ if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to read from %s: %s\n", !strcmp("-",fname)?"standard input":fname,bcf_sr_strerror(args->files->errnum)); - - init_data(args); - while ( bcf_sr_next_line(args->files) ) -@@ -642,7 +733,7 @@ - if ( line ) - { - if ( 
line->errcode ) error("[E::main_plugin] Unchecked error (%d), exiting\n",line->errcode); -- bcf_write1(args->out_fh, args->hdr_out, line); -+ if ( bcf_write1(args->out_fh, args->hdr_out, line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); - } - } - destroy_data(args); ---- python-pysam.orig/bcftools/vcfquery.c -+++ python-pysam/bcftools/vcfquery.c -@@ -128,7 +128,7 @@ - if ( args->print_header ) - { - convert_header(args->convert,&str); -- fwrite(str.s, str.l, 1, args->out); -+ if ( fwrite(str.s, str.l, 1, args->out)!=1 ) error("[%s] Error: cannot write to %s\n", __func__,args->fn_out?args->fn_out:"standard output"); - } - - int i,max_convert_unpack = convert_max_unpack(args->convert); -@@ -168,8 +168,7 @@ - - str.l = 0; - convert_line(args->convert, line, &str); -- if ( str.l ) -- fwrite(str.s, str.l, 1, args->out); -+ if ( str.l && fwrite(str.s, str.l, 1, args->out)!=1 ) error("[%s] Error: cannot write to %s\n", __func__,args->fn_out?args->fn_out:"standard output"); - } - if ( str.m ) free(str.s); - } -@@ -308,7 +307,7 @@ - case 's': args->sample_list = optarg; break; - case 'S': args->sample_list = optarg; args->sample_is_file = 1; break; - case 'h': -- case '?': usage(); -+ case '?': usage(); break; - default: error("Unknown argument: %s\n", optarg); - } - } -@@ -324,14 +323,18 @@ - { - if ( !fname ) error("Missing the VCF file name\n"); - args->files = bcf_sr_init(); -- if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->files->errnum)); -+ if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to read from %s: %s\n", !strcmp("-",fname)?"standard input":fname,bcf_sr_strerror(args->files->errnum)); - list_columns(args); - bcf_sr_destroy(args->files); - free(args); - return 0; - } - -- if ( !args->format_str ) usage(); -+ if ( !args->format_str ) -+ { -+ if ( argc==1 && !fname ) usage(); -+ error("Error: Missing the --format option\n"); -+ } - args->out = args->fn_out ? 
fopen(args->fn_out, "w") : stdout; - if ( !args->out ) error("%s: %s\n", args->fn_out,strerror(errno)); - -@@ -349,7 +352,7 @@ - } - while ( fname ) - { -- if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->files->errnum)); -+ if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to read from %s: %s\n", !strcmp("-",fname)?"standard input":fname,bcf_sr_strerror(args->files->errnum)); - fname = ++optind < argc ? argv[optind] : NULL; - } - init_data(args); -@@ -357,7 +360,7 @@ - free(args->format_str); - destroy_data(args); - bcf_sr_destroy(args->files); -- fclose(args->out); -+ if ( fclose(args->out)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->fn_out); - free(args); - return 0; - } -@@ -384,7 +387,10 @@ - if ( !bcf_sr_add_reader(args->files, argv[k]) ) error("Failed to open %s: %s\n", argv[k],bcf_sr_strerror(args->files->errnum)); - init_data(args); - if ( i==0 ) -+ { - prev_samples = copy_header(args->header, args->files->readers[0].header->samples, bcf_hdr_nsamples(args->files->readers[0].header)); -+ prev_nsamples = bcf_hdr_nsamples(args->files->readers[0].header); -+ } - else - { - args->print_header = 0; -@@ -395,7 +401,7 @@ - destroy_data(args); - bcf_sr_destroy(args->files); - } -- fclose(args->out); -+ if ( fclose(args->out)!=0 ) error("[%s] Error: close failed .. 
%s\n", __func__,args->fn_out);; - destroy_list(fnames, nfiles); - destroy_list(prev_samples, prev_nsamples); - free(args->format_str); ---- python-pysam.orig/bcftools/vcfquery.c.pysam.c -+++ python-pysam/bcftools/vcfquery.c.pysam.c -@@ -130,7 +130,7 @@ - if ( args->print_header ) - { - convert_header(args->convert,&str); -- fwrite(str.s, str.l, 1, args->out); -+ if ( fwrite(str.s, str.l, 1, args->out)!=1 ) error("[%s] Error: cannot write to %s\n", __func__,args->fn_out?args->fn_out:"standard output"); - } - - int i,max_convert_unpack = convert_max_unpack(args->convert); -@@ -170,8 +170,7 @@ - - str.l = 0; - convert_line(args->convert, line, &str); -- if ( str.l ) -- fwrite(str.s, str.l, 1, args->out); -+ if ( str.l && fwrite(str.s, str.l, 1, args->out)!=1 ) error("[%s] Error: cannot write to %s\n", __func__,args->fn_out?args->fn_out:"standard output"); - } - if ( str.m ) free(str.s); - } -@@ -310,7 +309,7 @@ - case 's': args->sample_list = optarg; break; - case 'S': args->sample_list = optarg; args->sample_is_file = 1; break; - case 'h': -- case '?': usage(); -+ case '?': usage(); break; - default: error("Unknown argument: %s\n", optarg); - } - } -@@ -326,14 +325,18 @@ - { - if ( !fname ) error("Missing the VCF file name\n"); - args->files = bcf_sr_init(); -- if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->files->errnum)); -+ if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to read from %s: %s\n", !strcmp("-",fname)?"standard input":fname,bcf_sr_strerror(args->files->errnum)); - list_columns(args); - bcf_sr_destroy(args->files); - free(args); - return 0; - } - -- if ( !args->format_str ) usage(); -+ if ( !args->format_str ) -+ { -+ if ( argc==1 && !fname ) usage(); -+ error("Error: Missing the --format option\n"); -+ } - args->out = args->fn_out ? 
fopen(args->fn_out, "w") : bcftools_stdout; - if ( !args->out ) error("%s: %s\n", args->fn_out,strerror(errno)); - -@@ -351,7 +354,7 @@ - } - while ( fname ) - { -- if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->files->errnum)); -+ if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to read from %s: %s\n", !strcmp("-",fname)?"standard input":fname,bcf_sr_strerror(args->files->errnum)); - fname = ++optind < argc ? argv[optind] : NULL; - } - init_data(args); -@@ -359,7 +362,7 @@ - free(args->format_str); - destroy_data(args); - bcf_sr_destroy(args->files); -- fclose(args->out); -+ if ( fclose(args->out)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->fn_out); - free(args); - return 0; - } -@@ -386,7 +389,10 @@ - if ( !bcf_sr_add_reader(args->files, argv[k]) ) error("Failed to open %s: %s\n", argv[k],bcf_sr_strerror(args->files->errnum)); - init_data(args); - if ( i==0 ) -+ { - prev_samples = copy_header(args->header, args->files->readers[0].header->samples, bcf_hdr_nsamples(args->files->readers[0].header)); -+ prev_nsamples = bcf_hdr_nsamples(args->files->readers[0].header); -+ } - else - { - args->print_header = 0; -@@ -397,7 +403,7 @@ - destroy_data(args); - bcf_sr_destroy(args->files); - } -- fclose(args->out); -+ if ( fclose(args->out)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->fn_out);; - destroy_list(fnames, nfiles); - destroy_list(prev_samples, prev_nsamples); - free(args->format_str); ---- python-pysam.orig/bcftools/vcfroh.c -+++ python-pysam/bcftools/vcfroh.c -@@ -130,6 +130,11 @@ - return mem; - } - -+static inline int max255(int i) -+{ -+ return i < 256 ? 
i : 255; -+} -+ - static void init_data(args_t *args) - { - int i; -@@ -156,7 +161,7 @@ - if ( !strncmp("GT,",args->estimate_AF,3) ) args->estimate_AF += 3; - else if ( !strncmp("PL,",args->estimate_AF,3) ) { args->estimate_AF += 3; args->af_from_PL = 1; } - if ( strcmp("-",args->estimate_AF) ) -- args->af_smpl = smpl_ilist_init(args->hdr, args->estimate_AF, 1, SMPL_NONE); -+ args->af_smpl = smpl_ilist_init(args->hdr, args->estimate_AF, 1, SMPL_NONE|SMPL_VERBOSE); - } - - if ( args->estimate_AF || args->fake_PLs ) -@@ -181,7 +186,7 @@ - error("Error: The FORMAT/GT tag not found in the header\n"); - } - -- args->roh_smpl = smpl_ilist_init(args->hdr, args->samples, args->samples_is_file, SMPL_NONE); -+ args->roh_smpl = smpl_ilist_init(args->hdr, args->samples, args->samples_is_file, SMPL_NONE|SMPL_VERBOSE); - if ( args->samples ) - { - // we may be able to subset to a few samples, for a text VCF this can be a major speedup -@@ -749,9 +754,9 @@ - if ( p[irr]<0 || p[ira]<0 || p[iaa]<0 ) continue; /* missing value */ \ - if ( p[irr]==p[ira] && p[irr]==p[iaa] ) continue; /* all values are the same */ \ - double prob[3], norm = 0; \ -- prob[0] = p[irr] < 256 ? args->pl2p[ p[irr] ] : args->pl2p[255]; \ -- prob[1] = p[ira] < 256 ? args->pl2p[ p[ira] ] : args->pl2p[255]; \ -- prob[2] = p[iaa] < 256 ? args->pl2p[ p[iaa] ] : args->pl2p[255]; \ -+ prob[0] = args->pl2p[ max255(p[irr]) ]; \ -+ prob[1] = args->pl2p[ max255(p[ira]) ]; \ -+ prob[2] = args->pl2p[ max255(p[iaa]) ]; \ - for (j=0; j<3; j++) norm += prob[j]; \ - for (j=0; j<3; j++) prob[j] /= norm; \ - af += 0.5*prob[1] + prob[2]; \ -@@ -779,9 +784,9 @@ - if ( p[irr]<0 || p[ira]<0 || p[iaa]<0 ) continue; /* missing value */ \ - if ( p[irr]==p[ira] && p[irr]==p[iaa] ) continue; /* all values are the same */ \ - double prob[3], norm = 0; \ -- prob[0] = p[irr] < 256 ? args->pl2p[ p[irr] ] : args->pl2p[255]; \ -- prob[1] = p[ira] < 256 ? args->pl2p[ p[ira] ] : args->pl2p[255]; \ -- prob[2] = p[iaa] < 256 ? 
args->pl2p[ p[iaa] ] : args->pl2p[255]; \ -+ prob[0] = args->pl2p[ max255(p[irr]) ]; \ -+ prob[1] = args->pl2p[ max255(p[ira]) ]; \ -+ prob[2] = args->pl2p[ max255(p[iaa]) ]; \ - for (j=0; j<3; j++) norm += prob[j]; \ - for (j=0; j<3; j++) prob[j] /= norm; \ - af += 0.5*prob[1] + prob[2]; \ -@@ -827,7 +832,7 @@ - if ( ret>0 ) - alt_freq = args->AFs[ial-1]; - if ( ret==-2 ) -- error("Type mismatch for INFO/%s tag at %s:%d\n", args->af_tag, bcf_seqname(args->hdr,line), line->pos+1); -+ error("Type mismatch for INFO/%s tag at %s:%"PRId64"\n", args->af_tag, bcf_seqname(args->hdr,line), (int64_t) line->pos+1); - } - else if ( args->af_fname ) - { -@@ -926,9 +931,9 @@ - type_t *p = (type_t*)fmt_pl->p + fmt_pl->n*ismpl; \ - if ( p[irr]<0 || p[ira]<0 || p[iaa]<0 ) continue; /* missing value */ \ - if ( p[irr]==p[ira] && p[irr]==p[iaa] ) continue; /* all values are the same */ \ -- pdg[0] = p[irr] < 256 ? args->pl2p[ p[irr] ] : args->pl2p[255]; \ -- pdg[1] = p[ira] < 256 ? args->pl2p[ p[ira] ] : args->pl2p[255]; \ -- pdg[2] = p[iaa] < 256 ? 
args->pl2p[ p[iaa] ] : args->pl2p[255]; \ -+ pdg[0] = args->pl2p[ max255(p[irr]) ]; \ -+ pdg[1] = args->pl2p[ max255(p[ira]) ]; \ -+ pdg[2] = args->pl2p[ max255(p[iaa]) ]; \ - } - switch (fmt_pl->type) { - case BCF_BT_INT8: BRANCH(int8_t); break; -@@ -1089,7 +1094,7 @@ - fprintf(stderr, " -S, --samples-file file of samples to analyze [all samples]\n"); - fprintf(stderr, " -t, --targets similar to -r but streams rather than index-jumps\n"); - fprintf(stderr, " -T, --targets-file similar to -R but streams rather than index-jumps\n"); -- fprintf(stderr, " --threads number of extra decompression threads [0]\n"); -+ fprintf(stderr, " --threads use multithreading with worker threads [0]\n"); - fprintf(stderr, "\n"); - fprintf(stderr, "HMM Options:\n"); - fprintf(stderr, " -a, --hw-to-az P(AZ|HW) transition probability from HW (Hardy-Weinberg) to AZ (autozygous) state [6.7e-8]\n"); -@@ -1198,7 +1203,7 @@ - } - } - if ( !args->output_fname ) args->output_fname = "stdout"; -- if ( !args->output_type ) args->output_type = OUTPUT_ST|OUTPUT_RG; -+ if ( !args->output_type || args->output_type==OUTPUT_GZ ) args->output_type |= OUTPUT_ST|OUTPUT_RG; - char *fname = NULL; - if ( optind==argc ) - { -@@ -1229,7 +1234,7 @@ - } - if ( args->n_threads && bcf_sr_set_threads(args->files, args->n_threads)<0) - error("Failed to create threads\n"); -- if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->files->errnum)); -+ if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to read from %s: %s\n", !strcmp("-",fname)?"standard input":fname,bcf_sr_strerror(args->files->errnum)); - - init_data(args); - while ( bcf_sr_next_line(args->files) ) ---- python-pysam.orig/bcftools/vcfroh.c.pysam.c -+++ python-pysam/bcftools/vcfroh.c.pysam.c -@@ -132,6 +132,11 @@ - return mem; - } - -+static inline int max255(int i) -+{ -+ return i < 256 ? 
i : 255; -+} -+ - static void init_data(args_t *args) - { - int i; -@@ -158,7 +163,7 @@ - if ( !strncmp("GT,",args->estimate_AF,3) ) args->estimate_AF += 3; - else if ( !strncmp("PL,",args->estimate_AF,3) ) { args->estimate_AF += 3; args->af_from_PL = 1; } - if ( strcmp("-",args->estimate_AF) ) -- args->af_smpl = smpl_ilist_init(args->hdr, args->estimate_AF, 1, SMPL_NONE); -+ args->af_smpl = smpl_ilist_init(args->hdr, args->estimate_AF, 1, SMPL_NONE|SMPL_VERBOSE); - } - - if ( args->estimate_AF || args->fake_PLs ) -@@ -183,7 +188,7 @@ - error("Error: The FORMAT/GT tag not found in the header\n"); - } - -- args->roh_smpl = smpl_ilist_init(args->hdr, args->samples, args->samples_is_file, SMPL_NONE); -+ args->roh_smpl = smpl_ilist_init(args->hdr, args->samples, args->samples_is_file, SMPL_NONE|SMPL_VERBOSE); - if ( args->samples ) - { - // we may be able to subset to a few samples, for a text VCF this can be a major speedup -@@ -751,9 +756,9 @@ - if ( p[irr]<0 || p[ira]<0 || p[iaa]<0 ) continue; /* missing value */ \ - if ( p[irr]==p[ira] && p[irr]==p[iaa] ) continue; /* all values are the same */ \ - double prob[3], norm = 0; \ -- prob[0] = p[irr] < 256 ? args->pl2p[ p[irr] ] : args->pl2p[255]; \ -- prob[1] = p[ira] < 256 ? args->pl2p[ p[ira] ] : args->pl2p[255]; \ -- prob[2] = p[iaa] < 256 ? args->pl2p[ p[iaa] ] : args->pl2p[255]; \ -+ prob[0] = args->pl2p[ max255(p[irr]) ]; \ -+ prob[1] = args->pl2p[ max255(p[ira]) ]; \ -+ prob[2] = args->pl2p[ max255(p[iaa]) ]; \ - for (j=0; j<3; j++) norm += prob[j]; \ - for (j=0; j<3; j++) prob[j] /= norm; \ - af += 0.5*prob[1] + prob[2]; \ -@@ -781,9 +786,9 @@ - if ( p[irr]<0 || p[ira]<0 || p[iaa]<0 ) continue; /* missing value */ \ - if ( p[irr]==p[ira] && p[irr]==p[iaa] ) continue; /* all values are the same */ \ - double prob[3], norm = 0; \ -- prob[0] = p[irr] < 256 ? args->pl2p[ p[irr] ] : args->pl2p[255]; \ -- prob[1] = p[ira] < 256 ? args->pl2p[ p[ira] ] : args->pl2p[255]; \ -- prob[2] = p[iaa] < 256 ? 
args->pl2p[ p[iaa] ] : args->pl2p[255]; \ -+ prob[0] = args->pl2p[ max255(p[irr]) ]; \ -+ prob[1] = args->pl2p[ max255(p[ira]) ]; \ -+ prob[2] = args->pl2p[ max255(p[iaa]) ]; \ - for (j=0; j<3; j++) norm += prob[j]; \ - for (j=0; j<3; j++) prob[j] /= norm; \ - af += 0.5*prob[1] + prob[2]; \ -@@ -829,7 +834,7 @@ - if ( ret>0 ) - alt_freq = args->AFs[ial-1]; - if ( ret==-2 ) -- error("Type mismatch for INFO/%s tag at %s:%d\n", args->af_tag, bcf_seqname(args->hdr,line), line->pos+1); -+ error("Type mismatch for INFO/%s tag at %s:%"PRId64"\n", args->af_tag, bcf_seqname(args->hdr,line), (int64_t) line->pos+1); - } - else if ( args->af_fname ) - { -@@ -928,9 +933,9 @@ - type_t *p = (type_t*)fmt_pl->p + fmt_pl->n*ismpl; \ - if ( p[irr]<0 || p[ira]<0 || p[iaa]<0 ) continue; /* missing value */ \ - if ( p[irr]==p[ira] && p[irr]==p[iaa] ) continue; /* all values are the same */ \ -- pdg[0] = p[irr] < 256 ? args->pl2p[ p[irr] ] : args->pl2p[255]; \ -- pdg[1] = p[ira] < 256 ? args->pl2p[ p[ira] ] : args->pl2p[255]; \ -- pdg[2] = p[iaa] < 256 ? 
args->pl2p[ p[iaa] ] : args->pl2p[255]; \ -+ pdg[0] = args->pl2p[ max255(p[irr]) ]; \ -+ pdg[1] = args->pl2p[ max255(p[ira]) ]; \ -+ pdg[2] = args->pl2p[ max255(p[iaa]) ]; \ - } - switch (fmt_pl->type) { - case BCF_BT_INT8: BRANCH(int8_t); break; -@@ -1091,7 +1096,7 @@ - fprintf(bcftools_stderr, " -S, --samples-file file of samples to analyze [all samples]\n"); - fprintf(bcftools_stderr, " -t, --targets similar to -r but streams rather than index-jumps\n"); - fprintf(bcftools_stderr, " -T, --targets-file similar to -R but streams rather than index-jumps\n"); -- fprintf(bcftools_stderr, " --threads number of extra decompression threads [0]\n"); -+ fprintf(bcftools_stderr, " --threads use multithreading with worker threads [0]\n"); - fprintf(bcftools_stderr, "\n"); - fprintf(bcftools_stderr, "HMM Options:\n"); - fprintf(bcftools_stderr, " -a, --hw-to-az P(AZ|HW) transition probability from HW (Hardy-Weinberg) to AZ (autozygous) state [6.7e-8]\n"); -@@ -1200,7 +1205,7 @@ - } - } - if ( !args->output_fname ) args->output_fname = "bcftools_stdout"; -- if ( !args->output_type ) args->output_type = OUTPUT_ST|OUTPUT_RG; -+ if ( !args->output_type || args->output_type==OUTPUT_GZ ) args->output_type |= OUTPUT_ST|OUTPUT_RG; - char *fname = NULL; - if ( optind==argc ) - { -@@ -1231,7 +1236,7 @@ - } - if ( args->n_threads && bcf_sr_set_threads(args->files, args->n_threads)<0) - error("Failed to create threads\n"); -- if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->files->errnum)); -+ if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to read from %s: %s\n", !strcmp("-",fname)?"standard input":fname,bcf_sr_strerror(args->files->errnum)); - - init_data(args); - while ( bcf_sr_next_line(args->files) ) ---- python-pysam.orig/bcftools/vcfsom.c -+++ python-pysam/bcftools/vcfsom.c -@@ -35,6 +35,7 @@ - #include - #include - #include -+#include - #include - #include "bcftools.h" - -@@ -356,7 +357,7 @@ - if ( !som->w ) 
error("Could not alloc %"PRIu64" bytes [nbin=%d ndim=%d]\n", (uint64_t)(sizeof(double)*som->size),som->nbin,som->ndim); - int i; - for (i=0; isize*som->kdim; i++) -- som->w[i] = (double)random()/RAND_MAX; -+ som->w[i] = random(); - som->a_idx = (int*) malloc(sizeof(int)*som->ndim); - som->b_idx = (int*) malloc(sizeof(int)*som->ndim); - som->div = (double*) malloc(sizeof(double)*som->ndim); -@@ -695,7 +696,7 @@ - case 't': args->action = SOM_TRAIN; break; - case 'c': args->action = SOM_CLASSIFY; break; - case 'h': -- case '?': usage(); -+ case '?': usage(); break; - default: error("Unknown argument: %s\n", optarg); - } - } ---- python-pysam.orig/bcftools/vcfsom.c.pysam.c -+++ python-pysam/bcftools/vcfsom.c.pysam.c -@@ -37,6 +37,7 @@ - #include - #include - #include -+#include - #include - #include "bcftools.h" - -@@ -358,7 +359,7 @@ - if ( !som->w ) error("Could not alloc %"PRIu64" bytes [nbin=%d ndim=%d]\n", (uint64_t)(sizeof(double)*som->size),som->nbin,som->ndim); - int i; - for (i=0; isize*som->kdim; i++) -- som->w[i] = (double)random()/RAND_MAX; -+ som->w[i] = random(); - som->a_idx = (int*) malloc(sizeof(int)*som->ndim); - som->b_idx = (int*) malloc(sizeof(int)*som->ndim); - som->div = (double*) malloc(sizeof(double)*som->ndim); -@@ -697,7 +698,7 @@ - case 't': args->action = SOM_TRAIN; break; - case 'c': args->action = SOM_CLASSIFY; break; - case 'h': -- case '?': usage(); -+ case '?': usage(); break; - default: error("Unknown argument: %s\n", optarg); - } - } ---- python-pysam.orig/bcftools/vcfsort.c -+++ python-pysam/bcftools/vcfsort.c -@@ -29,13 +29,18 @@ - #include - #include - #include -+#include - #include - #include - #include - #include - #include -+#ifdef _WIN32 -+#include -+#endif - #include - #include -+#include - #include "kheap.h" - #include "bcftools.h" - -@@ -59,6 +64,33 @@ - } - args_t; - -+void clean_files(args_t *args) -+{ -+ int i; -+ fprintf(stderr,"Cleaning\n"); -+ for (i=0; inblk; i++) -+ { -+ blk_t *blk = args->blk + i; -+ if ( 
blk->fname ) -+ { -+ unlink(blk->fname); -+ free(blk->fname); -+ } -+ if ( blk->rec ) -+ bcf_destroy(blk->rec); -+ } -+ rmdir(args->tmp_dir); -+} -+void clean_files_and_throw(args_t *args, const char *format, ...) -+{ -+ va_list ap; -+ va_start(ap, format); -+ vfprintf(stderr, format, ap); -+ va_end(ap); -+ clean_files(args); -+ exit(-1); -+} -+ - int cmp_bcf_pos(const void *aptr, const void *bptr) - { - bcf1_t *a = *((bcf1_t**)aptr); -@@ -98,18 +130,20 @@ - kstring_t str = {0,0,0}; - ksprintf(&str, "%s/%05d.bcf", args->tmp_dir, (int)args->nblk); - blk->fname = str.s; -+ blk->rec = NULL; -+ blk->fh = NULL; - - htsFile *fh = hts_open(blk->fname, "wbu"); -- if ( fh == NULL ) error("Cannot write %s: %s\n", blk->fname, strerror(errno)); -- bcf_hdr_write(fh, args->hdr); -+ if ( fh == NULL ) clean_files_and_throw(args, "Cannot write %s: %s\n", blk->fname, strerror(errno)); -+ if ( bcf_hdr_write(fh, args->hdr)!=0 ) clean_files_and_throw(args, "[%s] Error: cannot write to %s\n", __func__,blk->fname); - - int i; - for (i=0; inbuf; i++) - { -- bcf_write(fh, args->hdr, args->buf[i]); -+ if ( bcf_write(fh, args->hdr, args->buf[i])!=0 ) clean_files_and_throw(args, "[%s] Error: cannot write to %s\n", __func__,blk->fname); - bcf_destroy(args->buf[i]); - } -- hts_close(fh); -+ if ( hts_close(fh)!=0 ) clean_files_and_throw(args, "[%s] Error: close failed .. 
%s\n", __func__,blk->fname); - - args->nbuf = 0; - args->mem = 0; -@@ -128,25 +162,27 @@ - void sort_blocks(args_t *args) - { - htsFile *in = hts_open(args->fname, "r"); -- if ( !in ) error("Could not read %s\n", args->fname); -+ if ( !in ) clean_files_and_throw(args, "Could not read %s\n", args->fname); - args->hdr = bcf_hdr_read(in); -+ if ( !args->hdr) clean_files_and_throw(args, "Could not read VCF/BCF headers from %s\n", args->fname); - - while ( 1 ) - { - bcf1_t *rec = bcf_init(); - int ret = bcf_read1(in, args->hdr, rec); -- if ( ret < -1 ) error("Error encountered while parsing the input\n"); -+ if ( ret < -1 ) clean_files_and_throw(args,"Error encountered while parsing the input\n"); - if ( ret == -1 ) - { - bcf_destroy(rec); - break; - } -+ if ( rec->errcode ) clean_files_and_throw(args,"Error encountered while parsing the input at %s:%d\n",bcf_seqname(args->hdr,rec),rec->pos+1); - buf_push(args, rec); - } - buf_flush(args); - free(args->buf); - -- if ( hts_close(in)!=0 ) error("Close failed: %s\n", args->fname); -+ if ( hts_close(in)!=0 ) clean_files_and_throw(args,"Close failed: %s\n", args->fname); - } - - static inline int blk_is_smaller(blk_t **aptr, blk_t **bptr) -@@ -159,14 +195,14 @@ - } - KHEAP_INIT(blk, blk_t*, blk_is_smaller) - --void blk_read(khp_blk_t *bhp, bcf_hdr_t *hdr, blk_t *blk) -+void blk_read(args_t *args, khp_blk_t *bhp, bcf_hdr_t *hdr, blk_t *blk) - { - if ( !blk->fh ) return; - int ret = bcf_read(blk->fh, hdr, blk->rec); -- if ( ret < -1 ) error("Error reading %s\n", blk->fname); -+ if ( ret < -1 ) clean_files_and_throw(args, "Error reading %s\n", blk->fname); - if ( ret == -1 ) - { -- if ( hts_close(blk->fh)!=0 ) error("Close failed: %s\n", blk->fname); -+ if ( hts_close(blk->fh)!=0 ) clean_files_and_throw(args, "Close failed: %s\n", blk->fname); - blk->fh = 0; - return; - } -@@ -184,33 +220,26 @@ - { - blk_t *blk = args->blk + i; - blk->fh = hts_open(blk->fname, "r"); -- if ( !blk->fh ) error("Could not read %s: %s\n", 
blk->fname, strerror(errno)); -+ if ( !blk->fh ) clean_files_and_throw(args, "Could not read %s: %s\n", blk->fname, strerror(errno)); - bcf_hdr_t *hdr = bcf_hdr_read(blk->fh); - bcf_hdr_destroy(hdr); - blk->rec = bcf_init(); -- blk_read(bhp, args->hdr, blk); -+ blk_read(args, bhp, args->hdr, blk); - } - - htsFile *out = hts_open(args->output_fname, hts_bcf_wmode(args->output_type)); -- bcf_hdr_write(out, args->hdr); -+ if ( bcf_hdr_write(out, args->hdr)!=0 ) clean_files_and_throw(args, "[%s] Error: cannot write to %s\n", __func__,args->output_fname); - while ( bhp->ndat ) - { - blk_t *blk = bhp->dat[0]; -- bcf_write(out, args->hdr, blk->rec); -+ if ( bcf_write(out, args->hdr, blk->rec)!=0 ) clean_files_and_throw(args, "[%s] Error: cannot write to %s\n", __func__,args->output_fname); - khp_delete(blk, bhp); -- blk_read(bhp, args->hdr, blk); -+ blk_read(args, bhp, args->hdr, blk); - } -- if ( hts_close(out)!=0 ) error("Close failed: %s\n", args->output_fname); -+ if ( hts_close(out)!=0 ) clean_files_and_throw(args, "Close failed: %s\n", args->output_fname); -+ -+ clean_files(args); - -- fprintf(stderr,"Cleaning\n"); -- for (i=0; inblk; i++) -- { -- blk_t *blk = args->blk + i; -- unlink(blk->fname); -- free(blk->fname); -- bcf_destroy(blk->rec); -- } -- rmdir(args->tmp_dir); - free(args->blk); - khp_destroy(blk, bhp); - fprintf(stderr,"Done\n"); -@@ -226,7 +255,7 @@ - fprintf(stderr, " -m, --max-mem [kMG] maximum memory to use [768M]\n"); // using metric units, 1M=1e6 - fprintf(stderr, " -o, --output-file output file name [stdout]\n"); - fprintf(stderr, " -O, --output-type b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n"); -- fprintf(stderr, " -T, --temp-dir temporary files [/tmp/bcftools-sort.XXXXXX/]\n"); -+ fprintf(stderr, " -T, --temp-dir temporary files [/tmp/bcftools-sort.XXXXXX]\n"); - fprintf(stderr, "\n"); - exit(1); - } -@@ -243,22 +272,40 @@ - } - - void mkdir_p(const char *fmt, ...); --void init(args_t *args) -+static 
void init(args_t *args) - { -- if ( !args->tmp_dir ) -+#ifdef _WIN32 -+ char tmp_path[MAX_PATH]; -+ int ret = GetTempPath(MAX_PATH, tmp_path); -+ if (!ret || ret > MAX_PATH) -+ error("Could not get the path to the temporary folder\n"); -+ if (strlen(tmp_path) + strlen("/bcftools-sort.XXXXXX") >= MAX_PATH) -+ error("Full path to the temporary folder is too long\n"); -+ strcat(tmp_path, "/bcftools-sort.XXXXXX"); -+ args->tmp_dir = strdup(tmp_path); -+#else -+ args->tmp_dir = args->tmp_dir ? strdup(args->tmp_dir) : strdup("/tmp/bcftools-sort.XXXXXX"); -+#endif -+ size_t len = strlen(args->tmp_dir); -+ if ( !strcmp("XXXXXX",args->tmp_dir+len-6) ) - { -- args->tmp_dir = strdup("/tmp/bcftools-sort.XXXXXX"); -- char *tmp_dir = mkdtemp(args->tmp_dir); -- if ( !tmp_dir ) error("mkdtemp(%s) failed: %s\n", args->tmp_dir,strerror(errno)); -+#ifdef _WIN32 -+ int ret = mkdir(mktemp(args->tmp_dir), 0700); -+ if ( ret ) error("mkdir(%s) failed: %s\n", args->tmp_dir,strerror(errno)); -+#else -+ char *tmp = mkdtemp(args->tmp_dir); -+ if ( !tmp ) error("mkdtemp(%s) failed: %s\n", args->tmp_dir,strerror(errno)); -+ int ret = chmod(tmp, S_IRUSR|S_IWUSR|S_IXUSR); -+ if ( ret ) error("chmod(%s,S_IRUSR|S_IWUSR|S_IXUSR) failed: %s\n", args->tmp_dir,strerror(errno)); -+#endif - } -- else -- { -- args->tmp_dir = strdup(args->tmp_dir); -- mkdir_p(args->tmp_dir); -+ else { -+ mkdir_p("%s/",args->tmp_dir); - } -+ - fprintf(stderr,"Writing to %s\n", args->tmp_dir); - } --void destroy(args_t *args) -+static void destroy(args_t *args) - { - bcf_hdr_destroy(args->hdr); - free(args->tmp_dir); -@@ -298,8 +345,8 @@ - default: error("The output type \"%s\" not recognised\n", optarg); - }; - break; -- case 'h': usage(args); -- case '?': usage(args); -+ case 'h': -+ case '?': usage(args); break; - default: error("Unknown argument: %s\n", optarg); - } - } ---- python-pysam.orig/bcftools/vcfsort.c.pysam.c -+++ python-pysam/bcftools/vcfsort.c.pysam.c -@@ -31,13 +31,18 @@ - #include - #include - #include 
-+#include - #include - #include - #include - #include - #include -+#ifdef _WIN32 -+#include -+#endif - #include - #include -+#include - #include "kheap.h" - #include "bcftools.h" - -@@ -61,6 +66,33 @@ - } - args_t; - -+void clean_files(args_t *args) -+{ -+ int i; -+ fprintf(bcftools_stderr,"Cleaning\n"); -+ for (i=0; inblk; i++) -+ { -+ blk_t *blk = args->blk + i; -+ if ( blk->fname ) -+ { -+ unlink(blk->fname); -+ free(blk->fname); -+ } -+ if ( blk->rec ) -+ bcf_destroy(blk->rec); -+ } -+ rmdir(args->tmp_dir); -+} -+void clean_files_and_throw(args_t *args, const char *format, ...) -+{ -+ va_list ap; -+ va_start(ap, format); -+ vfprintf(bcftools_stderr, format, ap); -+ va_end(ap); -+ clean_files(args); -+ exit(-1); -+} -+ - int cmp_bcf_pos(const void *aptr, const void *bptr) - { - bcf1_t *a = *((bcf1_t**)aptr); -@@ -100,18 +132,20 @@ - kstring_t str = {0,0,0}; - ksprintf(&str, "%s/%05d.bcf", args->tmp_dir, (int)args->nblk); - blk->fname = str.s; -+ blk->rec = NULL; -+ blk->fh = NULL; - - htsFile *fh = hts_open(blk->fname, "wbu"); -- if ( fh == NULL ) error("Cannot write %s: %s\n", blk->fname, strerror(errno)); -- bcf_hdr_write(fh, args->hdr); -+ if ( fh == NULL ) clean_files_and_throw(args, "Cannot write %s: %s\n", blk->fname, strerror(errno)); -+ if ( bcf_hdr_write(fh, args->hdr)!=0 ) clean_files_and_throw(args, "[%s] Error: cannot write to %s\n", __func__,blk->fname); - - int i; - for (i=0; inbuf; i++) - { -- bcf_write(fh, args->hdr, args->buf[i]); -+ if ( bcf_write(fh, args->hdr, args->buf[i])!=0 ) clean_files_and_throw(args, "[%s] Error: cannot write to %s\n", __func__,blk->fname); - bcf_destroy(args->buf[i]); - } -- hts_close(fh); -+ if ( hts_close(fh)!=0 ) clean_files_and_throw(args, "[%s] Error: close failed .. 
%s\n", __func__,blk->fname); - - args->nbuf = 0; - args->mem = 0; -@@ -130,25 +164,27 @@ - void sort_blocks(args_t *args) - { - htsFile *in = hts_open(args->fname, "r"); -- if ( !in ) error("Could not read %s\n", args->fname); -+ if ( !in ) clean_files_and_throw(args, "Could not read %s\n", args->fname); - args->hdr = bcf_hdr_read(in); -+ if ( !args->hdr) clean_files_and_throw(args, "Could not read VCF/BCF headers from %s\n", args->fname); - - while ( 1 ) - { - bcf1_t *rec = bcf_init(); - int ret = bcf_read1(in, args->hdr, rec); -- if ( ret < -1 ) error("Error encountered while parsing the input\n"); -+ if ( ret < -1 ) clean_files_and_throw(args,"Error encountered while parsing the input\n"); - if ( ret == -1 ) - { - bcf_destroy(rec); - break; - } -+ if ( rec->errcode ) clean_files_and_throw(args,"Error encountered while parsing the input at %s:%d\n",bcf_seqname(args->hdr,rec),rec->pos+1); - buf_push(args, rec); - } - buf_flush(args); - free(args->buf); - -- if ( hts_close(in)!=0 ) error("Close failed: %s\n", args->fname); -+ if ( hts_close(in)!=0 ) clean_files_and_throw(args,"Close failed: %s\n", args->fname); - } - - static inline int blk_is_smaller(blk_t **aptr, blk_t **bptr) -@@ -161,14 +197,14 @@ - } - KHEAP_INIT(blk, blk_t*, blk_is_smaller) - --void blk_read(khp_blk_t *bhp, bcf_hdr_t *hdr, blk_t *blk) -+void blk_read(args_t *args, khp_blk_t *bhp, bcf_hdr_t *hdr, blk_t *blk) - { - if ( !blk->fh ) return; - int ret = bcf_read(blk->fh, hdr, blk->rec); -- if ( ret < -1 ) error("Error reading %s\n", blk->fname); -+ if ( ret < -1 ) clean_files_and_throw(args, "Error reading %s\n", blk->fname); - if ( ret == -1 ) - { -- if ( hts_close(blk->fh)!=0 ) error("Close failed: %s\n", blk->fname); -+ if ( hts_close(blk->fh)!=0 ) clean_files_and_throw(args, "Close failed: %s\n", blk->fname); - blk->fh = 0; - return; - } -@@ -186,33 +222,26 @@ - { - blk_t *blk = args->blk + i; - blk->fh = hts_open(blk->fname, "r"); -- if ( !blk->fh ) error("Could not read %s: %s\n", 
blk->fname, strerror(errno)); -+ if ( !blk->fh ) clean_files_and_throw(args, "Could not read %s: %s\n", blk->fname, strerror(errno)); - bcf_hdr_t *hdr = bcf_hdr_read(blk->fh); - bcf_hdr_destroy(hdr); - blk->rec = bcf_init(); -- blk_read(bhp, args->hdr, blk); -+ blk_read(args, bhp, args->hdr, blk); - } - - htsFile *out = hts_open(args->output_fname, hts_bcf_wmode(args->output_type)); -- bcf_hdr_write(out, args->hdr); -+ if ( bcf_hdr_write(out, args->hdr)!=0 ) clean_files_and_throw(args, "[%s] Error: cannot write to %s\n", __func__,args->output_fname); - while ( bhp->ndat ) - { - blk_t *blk = bhp->dat[0]; -- bcf_write(out, args->hdr, blk->rec); -+ if ( bcf_write(out, args->hdr, blk->rec)!=0 ) clean_files_and_throw(args, "[%s] Error: cannot write to %s\n", __func__,args->output_fname); - khp_delete(blk, bhp); -- blk_read(bhp, args->hdr, blk); -+ blk_read(args, bhp, args->hdr, blk); - } -- if ( hts_close(out)!=0 ) error("Close failed: %s\n", args->output_fname); -+ if ( hts_close(out)!=0 ) clean_files_and_throw(args, "Close failed: %s\n", args->output_fname); -+ -+ clean_files(args); - -- fprintf(bcftools_stderr,"Cleaning\n"); -- for (i=0; inblk; i++) -- { -- blk_t *blk = args->blk + i; -- unlink(blk->fname); -- free(blk->fname); -- bcf_destroy(blk->rec); -- } -- rmdir(args->tmp_dir); - free(args->blk); - khp_destroy(blk, bhp); - fprintf(bcftools_stderr,"Done\n"); -@@ -228,7 +257,7 @@ - fprintf(bcftools_stderr, " -m, --max-mem [kMG] maximum memory to use [768M]\n"); // using metric units, 1M=1e6 - fprintf(bcftools_stderr, " -o, --output-file output file name [bcftools_stdout]\n"); - fprintf(bcftools_stderr, " -O, --output-type b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n"); -- fprintf(bcftools_stderr, " -T, --temp-dir temporary files [/tmp/bcftools-sort.XXXXXX/]\n"); -+ fprintf(bcftools_stderr, " -T, --temp-dir temporary files [/tmp/bcftools-sort.XXXXXX]\n"); - fprintf(bcftools_stderr, "\n"); - exit(1); - } -@@ -245,22 +274,40 @@ 
- } - - void mkdir_p(const char *fmt, ...); --void init(args_t *args) -+static void init(args_t *args) - { -- if ( !args->tmp_dir ) -+#ifdef _WIN32 -+ char tmp_path[MAX_PATH]; -+ int ret = GetTempPath(MAX_PATH, tmp_path); -+ if (!ret || ret > MAX_PATH) -+ error("Could not get the path to the temporary folder\n"); -+ if (strlen(tmp_path) + strlen("/bcftools-sort.XXXXXX") >= MAX_PATH) -+ error("Full path to the temporary folder is too long\n"); -+ strcat(tmp_path, "/bcftools-sort.XXXXXX"); -+ args->tmp_dir = strdup(tmp_path); -+#else -+ args->tmp_dir = args->tmp_dir ? strdup(args->tmp_dir) : strdup("/tmp/bcftools-sort.XXXXXX"); -+#endif -+ size_t len = strlen(args->tmp_dir); -+ if ( !strcmp("XXXXXX",args->tmp_dir+len-6) ) - { -- args->tmp_dir = strdup("/tmp/bcftools-sort.XXXXXX"); -- char *tmp_dir = mkdtemp(args->tmp_dir); -- if ( !tmp_dir ) error("mkdtemp(%s) failed: %s\n", args->tmp_dir,strerror(errno)); -+#ifdef _WIN32 -+ int ret = mkdir(mktemp(args->tmp_dir), 0700); -+ if ( ret ) error("mkdir(%s) failed: %s\n", args->tmp_dir,strerror(errno)); -+#else -+ char *tmp = mkdtemp(args->tmp_dir); -+ if ( !tmp ) error("mkdtemp(%s) failed: %s\n", args->tmp_dir,strerror(errno)); -+ int ret = chmod(tmp, S_IRUSR|S_IWUSR|S_IXUSR); -+ if ( ret ) error("chmod(%s,S_IRUSR|S_IWUSR|S_IXUSR) failed: %s\n", args->tmp_dir,strerror(errno)); -+#endif - } -- else -- { -- args->tmp_dir = strdup(args->tmp_dir); -- mkdir_p(args->tmp_dir); -+ else { -+ mkdir_p("%s/",args->tmp_dir); - } -+ - fprintf(bcftools_stderr,"Writing to %s\n", args->tmp_dir); - } --void destroy(args_t *args) -+static void destroy(args_t *args) - { - bcf_hdr_destroy(args->hdr); - free(args->tmp_dir); -@@ -300,8 +347,8 @@ - default: error("The output type \"%s\" not recognised\n", optarg); - }; - break; -- case 'h': usage(args); -- case '?': usage(args); -+ case 'h': -+ case '?': usage(args); break; - default: error("Unknown argument: %s\n", optarg); - } - } ---- python-pysam.orig/bcftools/vcfstats.c -+++ 
python-pysam/bcftools/vcfstats.c -@@ -70,7 +70,7 @@ - - typedef struct - { -- int n_snps, n_indels, n_mnps, n_others, n_mals, n_snp_mals, n_records, n_noalts; -+ uint32_t n_snps, n_indels, n_mnps, n_others, n_mals, n_snp_mals, n_records, n_noalts; - int *af_ts, *af_tv, *af_snps; // first bin of af_* stats are singletons - #if HWE_STATS - int *af_hwe; -@@ -88,12 +88,14 @@ - int subst[15]; - int *smpl_hets, *smpl_homRR, *smpl_homAA, *smpl_ts, *smpl_tv, *smpl_indels, *smpl_ndp, *smpl_sngl; - int *smpl_hapRef, *smpl_hapAlt, *smpl_missing; -- int *smpl_indel_hets, *smpl_indel_homs; -+ int *smpl_ins_hets, *smpl_del_hets, *smpl_ins_homs, *smpl_del_homs; - int *smpl_frm_shifts; // not-applicable, in-frame, out-frame - unsigned long int *smpl_dp; - idist_t dp, dp_sites; - int nusr; - user_stats_t *usr; -+ double *dvaf; // distribution of the mean indel-allele frequency by length: -m_indel,-(m_indel-1),...-1,0,1,..,m_indel -+ uint32_t *nvaf; - } - stats_t; - -@@ -476,8 +478,10 @@ - stats->smpl_homRR = (int *) calloc(args->files->n_smpl,sizeof(int)); - stats->smpl_hapRef = (int *) calloc(args->files->n_smpl,sizeof(int)); - stats->smpl_hapAlt = (int *) calloc(args->files->n_smpl,sizeof(int)); -- stats->smpl_indel_hets = (int *) calloc(args->files->n_smpl,sizeof(int)); -- stats->smpl_indel_homs = (int *) calloc(args->files->n_smpl,sizeof(int)); -+ stats->smpl_ins_hets = (int *) calloc(args->files->n_smpl,sizeof(int)); -+ stats->smpl_del_hets = (int *) calloc(args->files->n_smpl,sizeof(int)); -+ stats->smpl_ins_homs = (int *) calloc(args->files->n_smpl,sizeof(int)); -+ stats->smpl_del_homs = (int *) calloc(args->files->n_smpl,sizeof(int)); - stats->smpl_ts = (int *) calloc(args->files->n_smpl,sizeof(int)); - stats->smpl_tv = (int *) calloc(args->files->n_smpl,sizeof(int)); - stats->smpl_indels = (int *) calloc(args->files->n_smpl,sizeof(int)); -@@ -489,6 +493,8 @@ - #endif - if ( args->exons_fname ) - stats->smpl_frm_shifts = (int*) calloc(args->files->n_smpl*3,sizeof(int)); -+ 
stats->nvaf = (uint32_t*) calloc(stats->m_indel*2+1,sizeof(*stats->nvaf)); -+ stats->dvaf = (double*) calloc(stats->m_indel*2+1,sizeof(*stats->dvaf)); - } - idist_init(&stats->dp, args->dp_min,args->dp_max,args->dp_step); - idist_init(&stats->dp_sites, args->dp_min,args->dp_max,args->dp_step); -@@ -558,8 +564,10 @@ - free(stats->smpl_homRR); - free(stats->smpl_hapRef); - free(stats->smpl_hapAlt); -- free(stats->smpl_indel_homs); -- free(stats->smpl_indel_hets); -+ free(stats->smpl_ins_homs); -+ free(stats->smpl_del_homs); -+ free(stats->smpl_ins_hets); -+ free(stats->smpl_del_hets); - free(stats->smpl_ts); - free(stats->smpl_tv); - free(stats->smpl_indels); -@@ -576,6 +584,8 @@ - } - free(stats->usr); - if ( args->exons ) free(stats->smpl_frm_shifts); -+ free(stats->nvaf); -+ free(stats->dvaf); - } - for (j=0; jnusr; j++) free(args->usr[j].tag); - if ( args->af_bins ) bin_destroy(args->af_bins); -@@ -844,6 +854,34 @@ - } - } - -+static inline void update_dvaf(stats_t *stats, bcf1_t *line, bcf_fmt_t *fmt, int ismpl, int ial, int jal) -+{ -+ if ( !fmt ) return; -+ -+ float dvaf; -+ #define BRANCH_INT(type_t,missing,vector_end) { \ -+ type_t *p = (type_t *) (fmt->p + fmt->size*ismpl); \ -+ if ( p[ial]==vector_end || p[jal]==vector_end ) return; \ -+ if ( p[ial]==missing || p[jal]==missing ) return; \ -+ if ( !p[ial] && !p[jal] ) return; \ -+ dvaf = (float)p[ial]/(p[ial]+p[jal]); \ -+ } -+ switch (fmt->type) { -+ case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_missing, bcf_int8_vector_end); break; -+ case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_missing, bcf_int16_vector_end); break; -+ case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_missing, bcf_int32_vector_end); break; -+ default: fprintf(stderr, "[E::%s] todo: %d\n", __func__, fmt->type); exit(1); break; -+ } -+ #undef BRANCH_INT -+ -+ int len = line->d.var[ial].n; -+ if ( len < -stats->m_indel ) len = -stats->m_indel; -+ else if ( len > stats->m_indel ) len = stats->m_indel; -+ int bin = stats->m_indel + len; -+ 
stats->nvaf[bin]++; -+ stats->dvaf[bin] += dvaf; -+} -+ - static void do_sample_stats(args_t *args, stats_t *stats, bcf_sr_t *reader, int matched) - { - bcf_srs_t *files = args->files; -@@ -854,6 +892,8 @@ - - if ( (fmt_ptr = bcf_get_fmt(reader->header,reader->buffer[0],"GT")) ) - { -+ bcf_fmt_t *ad_fmt_ptr = bcf_get_variant_types(line)&VCF_INDEL ? bcf_get_fmt(reader->header,reader->buffer[0],"AD") : NULL; -+ - int ref = bcf_acgt2int(*line->d.allele[0]); - int is, n_nref = 0, i_nref = 0; - for (is=0; isfiles->n_smpl; is++) -@@ -910,8 +950,31 @@ - if ( gt != GT_HOM_RR ) - { - stats->smpl_indels[is]++; -- if ( gt==GT_HET_RA || gt==GT_HET_AA ) stats->smpl_indel_hets[is]++; -- else if ( gt==GT_HOM_AA ) stats->smpl_indel_homs[is]++; -+ -+ if ( gt==GT_HET_RA || gt==GT_HET_AA ) -+ { -+ int is_ins = 0, is_del = 0; -+ if ( bcf_get_variant_type(line,ial)&VCF_INDEL ) -+ { -+ if ( line->d.var[ial].n < 0 ) is_del = 1; -+ else is_ins = 1; -+ update_dvaf(stats,line,ad_fmt_ptr,is,ial,jal); -+ } -+ if ( bcf_get_variant_type(line,jal)&VCF_INDEL ) -+ { -+ if ( line->d.var[jal].n < 0 ) is_del = 1; -+ else is_ins = 1; -+ update_dvaf(stats,line,ad_fmt_ptr,is,jal,ial); -+ } -+ // Note that alt-het genotypes with both ins and del allele are counted twice!! 
-+ if ( is_del ) stats->smpl_del_hets[is]++; -+ if ( is_ins ) stats->smpl_ins_hets[is]++; -+ } -+ else if ( gt==GT_HOM_AA ) -+ { -+ if ( line->d.var[ial].n < 0 ) stats->smpl_del_homs[is]++; -+ else stats->smpl_ins_homs[is]++; -+ } - } - if ( stats->smpl_frm_shifts ) - { -@@ -959,6 +1022,37 @@ - } - #undef BRANCH_INT - } -+ else if ( (fmt_ptr = bcf_get_fmt(reader->header,reader->buffer[0],"AD")) ) -+ { -+ #define BRANCH_INT(type_t,missing,vector_end) { \ -+ int is,iv; \ -+ for (is=0; isfiles->n_smpl; is++) \ -+ { \ -+ type_t *p = (type_t *) (fmt_ptr->p + fmt_ptr->size*is); \ -+ int dp = 0, has_value = 0; \ -+ for (iv=0; ivn; iv++) \ -+ { \ -+ if ( p[iv]==vector_end ) break; \ -+ if ( p[iv]==missing ) continue; \ -+ has_value = 1; \ -+ dp += p[iv]; \ -+ } \ -+ if ( has_value ) \ -+ { \ -+ (*idist(&stats->dp, dp))++; \ -+ stats->smpl_ndp[is]++; \ -+ stats->smpl_dp[is] += dp; \ -+ } \ -+ } \ -+ } -+ switch (fmt_ptr->type) { -+ case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_missing, bcf_int8_vector_end); break; -+ case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_missing, bcf_int16_vector_end); break; -+ case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_missing, bcf_int32_vector_end); break; -+ default: fprintf(stderr, "[E::%s] todo: %d\n", __func__, fmt_ptr->type); exit(1); break; -+ } -+ #undef BRANCH_INT -+ } - - if ( matched==3 ) - { -@@ -968,6 +1062,7 @@ - fmt1 = bcf_get_fmt(files->readers[1].header,files->readers[1].buffer[0],"GT"); if ( !fmt1 ) return; - - // only the first ALT allele is considered -+ if (args->ntmp_iaf <= 1) return; // Do not consider invariate sites - int iaf = args->tmp_iaf[1]; - int line_type = bcf_get_variant_types(files->readers[0].buffer[0]); - gtcmp_t *af_stats = line_type&VCF_SNP ? 
args->af_gts_snps : args->af_gts_indels; -@@ -1019,7 +1114,7 @@ - { - nmm++; - bcf_sr_t *reader = &files->readers[0]; -- printf("DBG\t%s\t%d\t%s\t%d\t%d\n",reader->header->id[BCF_DT_CTG][reader->buffer[0]->rid].key,reader->buffer[0]->pos+1,files->samples[is],gt,gt2); -+ printf("DBG\t%s\t%"PRId64"\t%s\t%d\t%d\n",reader->header->id[BCF_DT_CTG][reader->buffer[0]->rid].key,(int64_t) reader->buffer[0]->pos+1,files->samples[is],gt,gt2); - } - else - { -@@ -1028,7 +1123,7 @@ - } - } - float nrd = nrefm+nmm ? 100.*nmm/(nrefm+nmm) : 0; -- printf("PSD\t%s\t%d\t%d\t%d\t%f\n", reader->header->id[BCF_DT_CTG][reader->buffer[0]->rid].key,reader->buffer[0]->pos+1,nm,nmm,nrd); -+ printf("PSD\t%s\t%"PRId64"\t%d\t%d\t%f\n", reader->header->id[BCF_DT_CTG][reader->buffer[0]->rid].key,(int64_t) reader->buffer[0]->pos+1,nm,nmm,nrd); - } - } - } -@@ -1162,14 +1257,14 @@ - for (id=0; idnstats; id++) - { - stats_t *stats = &args->stats[id]; -- printf("SN\t%d\tnumber of records:\t%d\n", id, stats->n_records); -- printf("SN\t%d\tnumber of no-ALTs:\t%d\n", id, stats->n_noalts); -- printf("SN\t%d\tnumber of SNPs:\t%d\n", id, stats->n_snps); -- printf("SN\t%d\tnumber of MNPs:\t%d\n", id, stats->n_mnps); -- printf("SN\t%d\tnumber of indels:\t%d\n", id, stats->n_indels); -- printf("SN\t%d\tnumber of others:\t%d\n", id, stats->n_others); -- printf("SN\t%d\tnumber of multiallelic sites:\t%d\n", id, stats->n_mals); -- printf("SN\t%d\tnumber of multiallelic SNP sites:\t%d\n", id, stats->n_snp_mals); -+ printf("SN\t%d\tnumber of records:\t%u\n", id, stats->n_records); -+ printf("SN\t%d\tnumber of no-ALTs:\t%u\n", id, stats->n_noalts); -+ printf("SN\t%d\tnumber of SNPs:\t%u\n", id, stats->n_snps); -+ printf("SN\t%d\tnumber of MNPs:\t%u\n", id, stats->n_mnps); -+ printf("SN\t%d\tnumber of indels:\t%u\n", id, stats->n_indels); -+ printf("SN\t%d\tnumber of others:\t%u\n", id, stats->n_others); -+ printf("SN\t%d\tnumber of multiallelic sites:\t%u\n", id, stats->n_mals); -+ printf("SN\t%d\tnumber of 
multiallelic SNP sites:\t%u\n", id, stats->n_snp_mals); - } - printf("# TSTV, transitions/transversions:\n# TSTV\t[2]id\t[3]ts\t[4]tv\t[5]ts/tv\t[6]ts (1st ALT)\t[7]tv (1st ALT)\t[8]ts/tv (1st ALT)\n"); - for (id=0; idnstats; id++) -@@ -1287,14 +1382,33 @@ - } - } - } -- printf("# IDD, InDel distribution:\n# IDD\t[2]id\t[3]length (deletions negative)\t[4]count\n"); -+ printf("# IDD, InDel distribution:\n# IDD\t[2]id\t[3]length (deletions negative)\t[4]number of sites\t[5]number of genotypes\t[6]mean VAF\n"); - for (id=0; idnstats; id++) - { - stats_t *stats = &args->stats[id]; - for (i=stats->m_indel-1; i>=0; i--) -- if ( stats->deletions[i] ) printf("IDD\t%d\t%d\t%d\n", id,-i-1,stats->deletions[i]); -+ { -+ if ( !stats->deletions[i] ) continue; -+ // whops, differently organized arrow, dels are together with ins -+ int bin = stats->m_indel - i - 1; -+ printf("IDD\t%d\t%d\t%d\t", id,-i-1,stats->deletions[i]); -+ if ( stats->nvaf && stats->nvaf[bin] ) -+ printf("%u\t%.2f",stats->nvaf[bin],stats->dvaf[bin]/stats->nvaf[bin]); -+ else -+ printf("0\t."); -+ printf("\n"); -+ } - for (i=0; im_indel; i++) -- if ( stats->insertions[i] ) printf("IDD\t%d\t%d\t%d\n", id,i+1,stats->insertions[i]); -+ { -+ if ( !stats->insertions[i] ) continue; -+ int bin = stats->m_indel + i + 1; -+ printf("IDD\t%d\t%d\t%d\t", id,i+1,stats->insertions[i]); -+ if ( stats->nvaf && stats->nvaf[bin] ) -+ printf("%u\t%.2f",stats->nvaf[bin],stats->dvaf[bin]/stats->nvaf[bin]); -+ else -+ printf("0\t."); -+ printf("\n"); -+ } - } - printf("# ST, Substitution types:\n# ST\t[2]id\t[3]type\t[4]count\n"); - for (id=0; idnstats; id++) -@@ -1517,8 +1631,8 @@ - } - } - -- -- printf("# PSI, Per-Sample Indels\n# PSI\t[2]id\t[3]sample\t[4]in-frame\t[5]out-frame\t[6]not applicable\t[7]out/(in+out) ratio\t[8]nHets\t[9]nAA\n"); -+ printf("# PSI, Per-Sample Indels. 
Note that alt-het genotypes with both ins and del allele are counted twice, in both nInsHets and nDelHets.\n"); -+ printf("# PSI\t[2]id\t[3]sample\t[4]in-frame\t[5]out-frame\t[6]not applicable\t[7]out/(in+out) ratio\t[8]nInsHets\t[9]nDelHets\t[10]nInsAltHoms\t[11]nDelAltHoms\n"); - for (id=0; idnstats; id++) - { - stats_t *stats = &args->stats[id]; -@@ -1531,9 +1645,8 @@ - in = stats->smpl_frm_shifts[i*3 + 1]; - out = stats->smpl_frm_shifts[i*3 + 2]; - } -- int nhom = stats->smpl_indel_homs[i]; -- int nhet = stats->smpl_indel_hets[i]; -- printf("PSI\t%d\t%s\t%d\t%d\t%d\t%.2f\t%d\t%d\n", id,args->files->samples[i], in,out,na,in+out?1.0*out/(in+out):0,nhet,nhom); -+ printf("PSI\t%d\t%s\t%d\t%d\t%d\t%.2f\t%d\t%d\t%d\t%d\n", id,args->files->samples[i], in,out,na,in+out?1.0*out/(in+out):0, -+ stats->smpl_ins_hets[i],stats->smpl_del_hets[i],stats->smpl_ins_homs[i],stats->smpl_del_homs[i]); - } - } - -@@ -1609,7 +1722,7 @@ - fprintf(stderr, " -t, --targets similar to -r but streams rather than index-jumps\n"); - fprintf(stderr, " -T, --targets-file similar to -R but streams rather than index-jumps\n"); - fprintf(stderr, " -u, --user-tstv collect Ts/Tv stats for any tag using the given binning [0:1:100]\n"); -- fprintf(stderr, " --threads number of extra decompression threads [0]\n"); -+ fprintf(stderr, " --threads use multithreading with worker threads [0]\n"); - fprintf(stderr, " -v, --verbose produce verbose per-site and per-sample output\n"); - fprintf(stderr, "\n"); - exit(1); -@@ -1686,7 +1799,7 @@ - case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; - case 9 : args->n_threads = strtol(optarg, 0, 0); break; - case 'h': -- case '?': usage(); -+ case '?': usage(); break; - default: error("Unknown argument: %s\n", optarg); - } - } -@@ -1715,7 +1828,7 @@ - while (fname) - { - if ( !bcf_sr_add_reader(args->files, fname) ) -- error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->files->errnum)); -+ error("Failed to read from %s: %s\n", 
!strcmp("-",fname)?"standard input":fname,bcf_sr_strerror(args->files->errnum)); - fname = ++optind < argc ? argv[optind] : NULL; - } - ---- python-pysam.orig/bcftools/vcfstats.c.pysam.c -+++ python-pysam/bcftools/vcfstats.c.pysam.c -@@ -72,7 +72,7 @@ - - typedef struct - { -- int n_snps, n_indels, n_mnps, n_others, n_mals, n_snp_mals, n_records, n_noalts; -+ uint32_t n_snps, n_indels, n_mnps, n_others, n_mals, n_snp_mals, n_records, n_noalts; - int *af_ts, *af_tv, *af_snps; // first bin of af_* stats are singletons - #if HWE_STATS - int *af_hwe; -@@ -90,12 +90,14 @@ - int subst[15]; - int *smpl_hets, *smpl_homRR, *smpl_homAA, *smpl_ts, *smpl_tv, *smpl_indels, *smpl_ndp, *smpl_sngl; - int *smpl_hapRef, *smpl_hapAlt, *smpl_missing; -- int *smpl_indel_hets, *smpl_indel_homs; -+ int *smpl_ins_hets, *smpl_del_hets, *smpl_ins_homs, *smpl_del_homs; - int *smpl_frm_shifts; // not-applicable, in-frame, out-frame - unsigned long int *smpl_dp; - idist_t dp, dp_sites; - int nusr; - user_stats_t *usr; -+ double *dvaf; // distribution of the mean indel-allele frequency by length: -m_indel,-(m_indel-1),...-1,0,1,..,m_indel -+ uint32_t *nvaf; - } - stats_t; - -@@ -478,8 +480,10 @@ - stats->smpl_homRR = (int *) calloc(args->files->n_smpl,sizeof(int)); - stats->smpl_hapRef = (int *) calloc(args->files->n_smpl,sizeof(int)); - stats->smpl_hapAlt = (int *) calloc(args->files->n_smpl,sizeof(int)); -- stats->smpl_indel_hets = (int *) calloc(args->files->n_smpl,sizeof(int)); -- stats->smpl_indel_homs = (int *) calloc(args->files->n_smpl,sizeof(int)); -+ stats->smpl_ins_hets = (int *) calloc(args->files->n_smpl,sizeof(int)); -+ stats->smpl_del_hets = (int *) calloc(args->files->n_smpl,sizeof(int)); -+ stats->smpl_ins_homs = (int *) calloc(args->files->n_smpl,sizeof(int)); -+ stats->smpl_del_homs = (int *) calloc(args->files->n_smpl,sizeof(int)); - stats->smpl_ts = (int *) calloc(args->files->n_smpl,sizeof(int)); - stats->smpl_tv = (int *) calloc(args->files->n_smpl,sizeof(int)); - 
stats->smpl_indels = (int *) calloc(args->files->n_smpl,sizeof(int)); -@@ -491,6 +495,8 @@ - #endif - if ( args->exons_fname ) - stats->smpl_frm_shifts = (int*) calloc(args->files->n_smpl*3,sizeof(int)); -+ stats->nvaf = (uint32_t*) calloc(stats->m_indel*2+1,sizeof(*stats->nvaf)); -+ stats->dvaf = (double*) calloc(stats->m_indel*2+1,sizeof(*stats->dvaf)); - } - idist_init(&stats->dp, args->dp_min,args->dp_max,args->dp_step); - idist_init(&stats->dp_sites, args->dp_min,args->dp_max,args->dp_step); -@@ -560,8 +566,10 @@ - free(stats->smpl_homRR); - free(stats->smpl_hapRef); - free(stats->smpl_hapAlt); -- free(stats->smpl_indel_homs); -- free(stats->smpl_indel_hets); -+ free(stats->smpl_ins_homs); -+ free(stats->smpl_del_homs); -+ free(stats->smpl_ins_hets); -+ free(stats->smpl_del_hets); - free(stats->smpl_ts); - free(stats->smpl_tv); - free(stats->smpl_indels); -@@ -578,6 +586,8 @@ - } - free(stats->usr); - if ( args->exons ) free(stats->smpl_frm_shifts); -+ free(stats->nvaf); -+ free(stats->dvaf); - } - for (j=0; jnusr; j++) free(args->usr[j].tag); - if ( args->af_bins ) bin_destroy(args->af_bins); -@@ -846,6 +856,34 @@ - } - } - -+static inline void update_dvaf(stats_t *stats, bcf1_t *line, bcf_fmt_t *fmt, int ismpl, int ial, int jal) -+{ -+ if ( !fmt ) return; -+ -+ float dvaf; -+ #define BRANCH_INT(type_t,missing,vector_end) { \ -+ type_t *p = (type_t *) (fmt->p + fmt->size*ismpl); \ -+ if ( p[ial]==vector_end || p[jal]==vector_end ) return; \ -+ if ( p[ial]==missing || p[jal]==missing ) return; \ -+ if ( !p[ial] && !p[jal] ) return; \ -+ dvaf = (float)p[ial]/(p[ial]+p[jal]); \ -+ } -+ switch (fmt->type) { -+ case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_missing, bcf_int8_vector_end); break; -+ case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_missing, bcf_int16_vector_end); break; -+ case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_missing, bcf_int32_vector_end); break; -+ default: fprintf(bcftools_stderr, "[E::%s] todo: %d\n", __func__, fmt->type); exit(1); 
break; -+ } -+ #undef BRANCH_INT -+ -+ int len = line->d.var[ial].n; -+ if ( len < -stats->m_indel ) len = -stats->m_indel; -+ else if ( len > stats->m_indel ) len = stats->m_indel; -+ int bin = stats->m_indel + len; -+ stats->nvaf[bin]++; -+ stats->dvaf[bin] += dvaf; -+} -+ - static void do_sample_stats(args_t *args, stats_t *stats, bcf_sr_t *reader, int matched) - { - bcf_srs_t *files = args->files; -@@ -856,6 +894,8 @@ - - if ( (fmt_ptr = bcf_get_fmt(reader->header,reader->buffer[0],"GT")) ) - { -+ bcf_fmt_t *ad_fmt_ptr = bcf_get_variant_types(line)&VCF_INDEL ? bcf_get_fmt(reader->header,reader->buffer[0],"AD") : NULL; -+ - int ref = bcf_acgt2int(*line->d.allele[0]); - int is, n_nref = 0, i_nref = 0; - for (is=0; isfiles->n_smpl; is++) -@@ -912,8 +952,31 @@ - if ( gt != GT_HOM_RR ) - { - stats->smpl_indels[is]++; -- if ( gt==GT_HET_RA || gt==GT_HET_AA ) stats->smpl_indel_hets[is]++; -- else if ( gt==GT_HOM_AA ) stats->smpl_indel_homs[is]++; -+ -+ if ( gt==GT_HET_RA || gt==GT_HET_AA ) -+ { -+ int is_ins = 0, is_del = 0; -+ if ( bcf_get_variant_type(line,ial)&VCF_INDEL ) -+ { -+ if ( line->d.var[ial].n < 0 ) is_del = 1; -+ else is_ins = 1; -+ update_dvaf(stats,line,ad_fmt_ptr,is,ial,jal); -+ } -+ if ( bcf_get_variant_type(line,jal)&VCF_INDEL ) -+ { -+ if ( line->d.var[jal].n < 0 ) is_del = 1; -+ else is_ins = 1; -+ update_dvaf(stats,line,ad_fmt_ptr,is,jal,ial); -+ } -+ // Note that alt-het genotypes with both ins and del allele are counted twice!! 
-+ if ( is_del ) stats->smpl_del_hets[is]++; -+ if ( is_ins ) stats->smpl_ins_hets[is]++; -+ } -+ else if ( gt==GT_HOM_AA ) -+ { -+ if ( line->d.var[ial].n < 0 ) stats->smpl_del_homs[is]++; -+ else stats->smpl_ins_homs[is]++; -+ } - } - if ( stats->smpl_frm_shifts ) - { -@@ -961,6 +1024,37 @@ - } - #undef BRANCH_INT - } -+ else if ( (fmt_ptr = bcf_get_fmt(reader->header,reader->buffer[0],"AD")) ) -+ { -+ #define BRANCH_INT(type_t,missing,vector_end) { \ -+ int is,iv; \ -+ for (is=0; isfiles->n_smpl; is++) \ -+ { \ -+ type_t *p = (type_t *) (fmt_ptr->p + fmt_ptr->size*is); \ -+ int dp = 0, has_value = 0; \ -+ for (iv=0; ivn; iv++) \ -+ { \ -+ if ( p[iv]==vector_end ) break; \ -+ if ( p[iv]==missing ) continue; \ -+ has_value = 1; \ -+ dp += p[iv]; \ -+ } \ -+ if ( has_value ) \ -+ { \ -+ (*idist(&stats->dp, dp))++; \ -+ stats->smpl_ndp[is]++; \ -+ stats->smpl_dp[is] += dp; \ -+ } \ -+ } \ -+ } -+ switch (fmt_ptr->type) { -+ case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_missing, bcf_int8_vector_end); break; -+ case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_missing, bcf_int16_vector_end); break; -+ case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_missing, bcf_int32_vector_end); break; -+ default: fprintf(bcftools_stderr, "[E::%s] todo: %d\n", __func__, fmt_ptr->type); exit(1); break; -+ } -+ #undef BRANCH_INT -+ } - - if ( matched==3 ) - { -@@ -970,6 +1064,7 @@ - fmt1 = bcf_get_fmt(files->readers[1].header,files->readers[1].buffer[0],"GT"); if ( !fmt1 ) return; - - // only the first ALT allele is considered -+ if (args->ntmp_iaf <= 1) return; // Do not consider invariate sites - int iaf = args->tmp_iaf[1]; - int line_type = bcf_get_variant_types(files->readers[0].buffer[0]); - gtcmp_t *af_stats = line_type&VCF_SNP ? 
args->af_gts_snps : args->af_gts_indels; -@@ -1021,7 +1116,7 @@ - { - nmm++; - bcf_sr_t *reader = &files->readers[0]; -- fprintf(bcftools_stdout, "DBG\t%s\t%d\t%s\t%d\t%d\n",reader->header->id[BCF_DT_CTG][reader->buffer[0]->rid].key,reader->buffer[0]->pos+1,files->samples[is],gt,gt2); -+ fprintf(bcftools_stdout, "DBG\t%s\t%"PRId64"\t%s\t%d\t%d\n",reader->header->id[BCF_DT_CTG][reader->buffer[0]->rid].key,(int64_t) reader->buffer[0]->pos+1,files->samples[is],gt,gt2); - } - else - { -@@ -1030,7 +1125,7 @@ - } - } - float nrd = nrefm+nmm ? 100.*nmm/(nrefm+nmm) : 0; -- fprintf(bcftools_stdout, "PSD\t%s\t%d\t%d\t%d\t%f\n", reader->header->id[BCF_DT_CTG][reader->buffer[0]->rid].key,reader->buffer[0]->pos+1,nm,nmm,nrd); -+ fprintf(bcftools_stdout, "PSD\t%s\t%"PRId64"\t%d\t%d\t%f\n", reader->header->id[BCF_DT_CTG][reader->buffer[0]->rid].key,(int64_t) reader->buffer[0]->pos+1,nm,nmm,nrd); - } - } - } -@@ -1164,14 +1259,14 @@ - for (id=0; idnstats; id++) - { - stats_t *stats = &args->stats[id]; -- fprintf(bcftools_stdout, "SN\t%d\tnumber of records:\t%d\n", id, stats->n_records); -- fprintf(bcftools_stdout, "SN\t%d\tnumber of no-ALTs:\t%d\n", id, stats->n_noalts); -- fprintf(bcftools_stdout, "SN\t%d\tnumber of SNPs:\t%d\n", id, stats->n_snps); -- fprintf(bcftools_stdout, "SN\t%d\tnumber of MNPs:\t%d\n", id, stats->n_mnps); -- fprintf(bcftools_stdout, "SN\t%d\tnumber of indels:\t%d\n", id, stats->n_indels); -- fprintf(bcftools_stdout, "SN\t%d\tnumber of others:\t%d\n", id, stats->n_others); -- fprintf(bcftools_stdout, "SN\t%d\tnumber of multiallelic sites:\t%d\n", id, stats->n_mals); -- fprintf(bcftools_stdout, "SN\t%d\tnumber of multiallelic SNP sites:\t%d\n", id, stats->n_snp_mals); -+ fprintf(bcftools_stdout, "SN\t%d\tnumber of records:\t%u\n", id, stats->n_records); -+ fprintf(bcftools_stdout, "SN\t%d\tnumber of no-ALTs:\t%u\n", id, stats->n_noalts); -+ fprintf(bcftools_stdout, "SN\t%d\tnumber of SNPs:\t%u\n", id, stats->n_snps); -+ fprintf(bcftools_stdout, 
"SN\t%d\tnumber of MNPs:\t%u\n", id, stats->n_mnps); -+ fprintf(bcftools_stdout, "SN\t%d\tnumber of indels:\t%u\n", id, stats->n_indels); -+ fprintf(bcftools_stdout, "SN\t%d\tnumber of others:\t%u\n", id, stats->n_others); -+ fprintf(bcftools_stdout, "SN\t%d\tnumber of multiallelic sites:\t%u\n", id, stats->n_mals); -+ fprintf(bcftools_stdout, "SN\t%d\tnumber of multiallelic SNP sites:\t%u\n", id, stats->n_snp_mals); - } - fprintf(bcftools_stdout, "# TSTV, transitions/transversions:\n# TSTV\t[2]id\t[3]ts\t[4]tv\t[5]ts/tv\t[6]ts (1st ALT)\t[7]tv (1st ALT)\t[8]ts/tv (1st ALT)\n"); - for (id=0; idnstats; id++) -@@ -1289,14 +1384,33 @@ - } - } - } -- fprintf(bcftools_stdout, "# IDD, InDel distribution:\n# IDD\t[2]id\t[3]length (deletions negative)\t[4]count\n"); -+ fprintf(bcftools_stdout, "# IDD, InDel distribution:\n# IDD\t[2]id\t[3]length (deletions negative)\t[4]number of sites\t[5]number of genotypes\t[6]mean VAF\n"); - for (id=0; idnstats; id++) - { - stats_t *stats = &args->stats[id]; - for (i=stats->m_indel-1; i>=0; i--) -- if ( stats->deletions[i] ) fprintf(bcftools_stdout, "IDD\t%d\t%d\t%d\n", id,-i-1,stats->deletions[i]); -+ { -+ if ( !stats->deletions[i] ) continue; -+ // whops, differently organized arrow, dels are together with ins -+ int bin = stats->m_indel - i - 1; -+ fprintf(bcftools_stdout, "IDD\t%d\t%d\t%d\t", id,-i-1,stats->deletions[i]); -+ if ( stats->nvaf && stats->nvaf[bin] ) -+ fprintf(bcftools_stdout, "%u\t%.2f",stats->nvaf[bin],stats->dvaf[bin]/stats->nvaf[bin]); -+ else -+ fprintf(bcftools_stdout, "0\t."); -+ fprintf(bcftools_stdout, "\n"); -+ } - for (i=0; im_indel; i++) -- if ( stats->insertions[i] ) fprintf(bcftools_stdout, "IDD\t%d\t%d\t%d\n", id,i+1,stats->insertions[i]); -+ { -+ if ( !stats->insertions[i] ) continue; -+ int bin = stats->m_indel + i + 1; -+ fprintf(bcftools_stdout, "IDD\t%d\t%d\t%d\t", id,i+1,stats->insertions[i]); -+ if ( stats->nvaf && stats->nvaf[bin] ) -+ fprintf(bcftools_stdout, 
"%u\t%.2f",stats->nvaf[bin],stats->dvaf[bin]/stats->nvaf[bin]); -+ else -+ fprintf(bcftools_stdout, "0\t."); -+ fprintf(bcftools_stdout, "\n"); -+ } - } - fprintf(bcftools_stdout, "# ST, Substitution types:\n# ST\t[2]id\t[3]type\t[4]count\n"); - for (id=0; idnstats; id++) -@@ -1519,8 +1633,8 @@ - } - } - -- -- fprintf(bcftools_stdout, "# PSI, Per-Sample Indels\n# PSI\t[2]id\t[3]sample\t[4]in-frame\t[5]out-frame\t[6]not applicable\t[7]out/(in+out) ratio\t[8]nHets\t[9]nAA\n"); -+ fprintf(bcftools_stdout, "# PSI, Per-Sample Indels. Note that alt-het genotypes with both ins and del allele are counted twice, in both nInsHets and nDelHets.\n"); -+ fprintf(bcftools_stdout, "# PSI\t[2]id\t[3]sample\t[4]in-frame\t[5]out-frame\t[6]not applicable\t[7]out/(in+out) ratio\t[8]nInsHets\t[9]nDelHets\t[10]nInsAltHoms\t[11]nDelAltHoms\n"); - for (id=0; idnstats; id++) - { - stats_t *stats = &args->stats[id]; -@@ -1533,9 +1647,8 @@ - in = stats->smpl_frm_shifts[i*3 + 1]; - out = stats->smpl_frm_shifts[i*3 + 2]; - } -- int nhom = stats->smpl_indel_homs[i]; -- int nhet = stats->smpl_indel_hets[i]; -- fprintf(bcftools_stdout, "PSI\t%d\t%s\t%d\t%d\t%d\t%.2f\t%d\t%d\n", id,args->files->samples[i], in,out,na,in+out?1.0*out/(in+out):0,nhet,nhom); -+ fprintf(bcftools_stdout, "PSI\t%d\t%s\t%d\t%d\t%d\t%.2f\t%d\t%d\t%d\t%d\n", id,args->files->samples[i], in,out,na,in+out?1.0*out/(in+out):0, -+ stats->smpl_ins_hets[i],stats->smpl_del_hets[i],stats->smpl_ins_homs[i],stats->smpl_del_homs[i]); - } - } - -@@ -1611,7 +1724,7 @@ - fprintf(bcftools_stderr, " -t, --targets similar to -r but streams rather than index-jumps\n"); - fprintf(bcftools_stderr, " -T, --targets-file similar to -R but streams rather than index-jumps\n"); - fprintf(bcftools_stderr, " -u, --user-tstv collect Ts/Tv stats for any tag using the given binning [0:1:100]\n"); -- fprintf(bcftools_stderr, " --threads number of extra decompression threads [0]\n"); -+ fprintf(bcftools_stderr, " --threads use multithreading with worker 
threads [0]\n"); - fprintf(bcftools_stderr, " -v, --verbose produce verbose per-site and per-sample output\n"); - fprintf(bcftools_stderr, "\n"); - exit(1); -@@ -1688,7 +1801,7 @@ - case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; - case 9 : args->n_threads = strtol(optarg, 0, 0); break; - case 'h': -- case '?': usage(); -+ case '?': usage(); break; - default: error("Unknown argument: %s\n", optarg); - } - } -@@ -1717,7 +1830,7 @@ - while (fname) - { - if ( !bcf_sr_add_reader(args->files, fname) ) -- error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->files->errnum)); -+ error("Failed to read from %s: %s\n", !strcmp("-",fname)?"standard input":fname,bcf_sr_strerror(args->files->errnum)); - fname = ++optind < argc ? argv[optind] : NULL; - } - ---- python-pysam.orig/bcftools/vcfview.c -+++ python-pysam/bcftools/vcfview.c -@@ -32,6 +32,7 @@ - #include - #include - #include -+#include - #include - #include - #include -@@ -85,11 +86,14 @@ - - if (args->calc_ac && args->update_info) - { -- bcf_hdr_append(args->hdr,"##INFO="); -- bcf_hdr_append(args->hdr,"##INFO="); -+ if (bcf_hdr_append(args->hdr,"##INFO=") < 0) -+ error_errno("[%s] Failed to add \"AC\" INFO header", __func__); -+ if (bcf_hdr_append(args->hdr,"##INFO=") < 0) -+ error_errno("[%s] Failed to add \"AN\" INFO header", __func__); - } - if (args->record_cmd_line) bcf_hdr_append_version(args->hdr, args->argc, args->argv, "bcftools_view"); -- else bcf_hdr_sync(args->hdr); -+ else if (bcf_hdr_sync(args->hdr) < 0) -+ error_errno("[%s] Failed to update header", __func__); - - // setup sample data - if (args->sample_names) -@@ -452,7 +456,7 @@ - if (args->trim_alts) - { - int ret = bcf_trim_alleles(args->hsub ? args->hsub : args->hdr, line); -- if ( ret<0 ) error("Error: Could not trim alleles at %s:%d\n", bcf_seqname(args->hsub ? args->hsub : args->hdr, line), line->pos+1); -+ if ( ret<0 ) error("Error: Could not trim alleles at %s:%"PRId64"\n", bcf_seqname(args->hsub ? 
args->hsub : args->hdr, line), (int64_t) line->pos+1); - } - if (args->phased) { - int phased = bcf_all_phased(args->hdr, line); -@@ -503,10 +507,10 @@ - fprintf(stderr, " -R, --regions-file restrict to regions listed in a file\n"); - fprintf(stderr, " -t, --targets [^] similar to -r but streams rather than index-jumps. Exclude regions with \"^\" prefix\n"); - fprintf(stderr, " -T, --targets-file [^] similar to -R but streams rather than index-jumps. Exclude regions with \"^\" prefix\n"); -- fprintf(stderr, " --threads number of extra (de)compression threads [0]\n"); -+ fprintf(stderr, " --threads use multithreading with worker threads [0]\n"); - fprintf(stderr, "\n"); - fprintf(stderr, "Subset options:\n"); -- fprintf(stderr, " -a, --trim-alt-alleles trim alternate alleles not seen in the subset\n"); -+ fprintf(stderr, " -a, --trim-alt-alleles trim ALT alleles not seen in the genotype fields (or their subset with -s/-S)\n"); - fprintf(stderr, " -I, --no-update do not (re)calculate INFO fields for the subset (currently INFO/AC and INFO/AN)\n"); - fprintf(stderr, " -s, --samples [^] comma separated list of samples to include (or exclude with \"^\" prefix)\n"); - fprintf(stderr, " -S, --samples-file [^] file of samples to include (or exclude with \"^\" prefix)\n"); -@@ -694,7 +698,7 @@ - } - case 9 : args->n_threads = strtol(optarg, 0, 0); break; - case 8 : args->record_cmd_line = 0; break; -- case '?': usage(args); -+ case '?': usage(args); break; - default: error("Unknown argument: %s\n", optarg); - } - } -@@ -737,12 +741,14 @@ - } - - if ( bcf_sr_set_threads(args->files, args->n_threads)<0 ) error("Failed to create threads\n"); -- if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->files->errnum)); -+ if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to read from %s: %s\n", !strcmp("-",fname)?"standard input":fname,bcf_sr_strerror(args->files->errnum)); - - init_data(args); - bcf_hdr_t *out_hdr = 
args->hnull ? args->hnull : (args->hsub ? args->hsub : args->hdr); - if (args->print_header) -- bcf_hdr_write(args->out, out_hdr); -+ { -+ if ( bcf_hdr_write(args->out, out_hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->fn_out); -+ } - else if ( args->output_type & FT_BCF ) - error("BCF output requires header, cannot proceed with -H\n"); - -@@ -753,8 +759,7 @@ - { - bcf1_t *line = args->files->readers[0].buffer[0]; - if ( line->errcode && out_hdr!=args->hdr ) error("Undefined tags in the header, cannot proceed in the sample subset mode.\n"); -- if ( subset_vcf(args, line) ) -- bcf_write1(args->out, out_hdr, line); -+ if ( subset_vcf(args, line) && bcf_write1(args->out, out_hdr, line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->fn_out); - } - ret = args->files->errnum; - if ( ret ) fprintf(stderr,"Error: %s\n", bcf_sr_strerror(args->files->errnum)); ---- python-pysam.orig/bcftools/vcfview.c.pysam.c -+++ python-pysam/bcftools/vcfview.c.pysam.c -@@ -34,6 +34,7 @@ - #include - #include - #include -+#include - #include - #include - #include -@@ -87,11 +88,14 @@ - - if (args->calc_ac && args->update_info) - { -- bcf_hdr_append(args->hdr,"##INFO="); -- bcf_hdr_append(args->hdr,"##INFO="); -+ if (bcf_hdr_append(args->hdr,"##INFO=") < 0) -+ error_errno("[%s] Failed to add \"AC\" INFO header", __func__); -+ if (bcf_hdr_append(args->hdr,"##INFO=") < 0) -+ error_errno("[%s] Failed to add \"AN\" INFO header", __func__); - } - if (args->record_cmd_line) bcf_hdr_append_version(args->hdr, args->argc, args->argv, "bcftools_view"); -- else bcf_hdr_sync(args->hdr); -+ else if (bcf_hdr_sync(args->hdr) < 0) -+ error_errno("[%s] Failed to update header", __func__); - - // setup sample data - if (args->sample_names) -@@ -454,7 +458,7 @@ - if (args->trim_alts) - { - int ret = bcf_trim_alleles(args->hsub ? args->hsub : args->hdr, line); -- if ( ret<0 ) error("Error: Could not trim alleles at %s:%d\n", bcf_seqname(args->hsub ? 
args->hsub : args->hdr, line), line->pos+1); -+ if ( ret<0 ) error("Error: Could not trim alleles at %s:%"PRId64"\n", bcf_seqname(args->hsub ? args->hsub : args->hdr, line), (int64_t) line->pos+1); - } - if (args->phased) { - int phased = bcf_all_phased(args->hdr, line); -@@ -505,10 +509,10 @@ - fprintf(bcftools_stderr, " -R, --regions-file restrict to regions listed in a file\n"); - fprintf(bcftools_stderr, " -t, --targets [^] similar to -r but streams rather than index-jumps. Exclude regions with \"^\" prefix\n"); - fprintf(bcftools_stderr, " -T, --targets-file [^] similar to -R but streams rather than index-jumps. Exclude regions with \"^\" prefix\n"); -- fprintf(bcftools_stderr, " --threads number of extra (de)compression threads [0]\n"); -+ fprintf(bcftools_stderr, " --threads use multithreading with worker threads [0]\n"); - fprintf(bcftools_stderr, "\n"); - fprintf(bcftools_stderr, "Subset options:\n"); -- fprintf(bcftools_stderr, " -a, --trim-alt-alleles trim alternate alleles not seen in the subset\n"); -+ fprintf(bcftools_stderr, " -a, --trim-alt-alleles trim ALT alleles not seen in the genotype fields (or their subset with -s/-S)\n"); - fprintf(bcftools_stderr, " -I, --no-update do not (re)calculate INFO fields for the subset (currently INFO/AC and INFO/AN)\n"); - fprintf(bcftools_stderr, " -s, --samples [^] comma separated list of samples to include (or exclude with \"^\" prefix)\n"); - fprintf(bcftools_stderr, " -S, --samples-file [^] file of samples to include (or exclude with \"^\" prefix)\n"); -@@ -696,7 +700,7 @@ - } - case 9 : args->n_threads = strtol(optarg, 0, 0); break; - case 8 : args->record_cmd_line = 0; break; -- case '?': usage(args); -+ case '?': usage(args); break; - default: error("Unknown argument: %s\n", optarg); - } - } -@@ -739,12 +743,14 @@ - } - - if ( bcf_sr_set_threads(args->files, args->n_threads)<0 ) error("Failed to create threads\n"); -- if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to open %s: %s\n", 
fname,bcf_sr_strerror(args->files->errnum)); -+ if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to read from %s: %s\n", !strcmp("-",fname)?"standard input":fname,bcf_sr_strerror(args->files->errnum)); - - init_data(args); - bcf_hdr_t *out_hdr = args->hnull ? args->hnull : (args->hsub ? args->hsub : args->hdr); - if (args->print_header) -- bcf_hdr_write(args->out, out_hdr); -+ { -+ if ( bcf_hdr_write(args->out, out_hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->fn_out); -+ } - else if ( args->output_type & FT_BCF ) - error("BCF output requires header, cannot proceed with -H\n"); - -@@ -755,8 +761,7 @@ - { - bcf1_t *line = args->files->readers[0].buffer[0]; - if ( line->errcode && out_hdr!=args->hdr ) error("Undefined tags in the header, cannot proceed in the sample subset mode.\n"); -- if ( subset_vcf(args, line) ) -- bcf_write1(args->out, out_hdr, line); -+ if ( subset_vcf(args, line) && bcf_write1(args->out, out_hdr, line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->fn_out); - } - ret = args->files->errnum; - if ( ret ) fprintf(bcftools_stderr,"Error: %s\n", bcf_sr_strerror(args->files->errnum)); ---- python-pysam.orig/bcftools/version.c -+++ python-pysam/bcftools/version.c -@@ -25,6 +25,7 @@ - #include - #include - #include -+#include - #include - #include "bcftools.h" - #include "version.h" -@@ -44,6 +45,22 @@ - exit(-1); - } - -+void error_errno(const char *format, ...) 
-+{ -+ va_list ap; -+ int e = errno; -+ va_start(ap, format); -+ vfprintf(stderr, format, ap); -+ va_end(ap); -+ if (e) { -+ fprintf(stderr, ": %s\n", strerror(e)); -+ } else { -+ fprintf(stderr, "\n"); -+ } -+ exit(-1); -+} -+ -+ - const char *hts_bcf_wmode(int file_type) - { - if ( file_type == FT_BCF ) return "wbu"; // uncompressed BCF ---- python-pysam.orig/bcftools/version.c.pysam.c -+++ python-pysam/bcftools/version.c.pysam.c -@@ -27,6 +27,7 @@ - #include - #include - #include -+#include - #include - #include "bcftools.h" - #include "version.h" -@@ -46,6 +47,22 @@ - exit(-1); - } - -+void error_errno(const char *format, ...) -+{ -+ va_list ap; -+ int e = errno; -+ va_start(ap, format); -+ vfprintf(bcftools_stderr, format, ap); -+ va_end(ap); -+ if (e) { -+ fprintf(bcftools_stderr, ": %s\n", strerror(e)); -+ } else { -+ fprintf(bcftools_stderr, "\n"); -+ } -+ exit(-1); -+} -+ -+ - const char *hts_bcf_wmode(int file_type) - { - if ( file_type == FT_BCF ) return "wbu"; // uncompressed BCF ---- python-pysam.orig/bcftools/version.h -+++ python-pysam/bcftools/version.h -@@ -1 +1 @@ --#define BCFTOOLS_VERSION "1.9" -+#define BCFTOOLS_VERSION "1.10" diff --git a/debian/patches/hts1.10 b/debian/patches/hts1.10 deleted file mode 100644 index 6fbe3ef..0000000 --- a/debian/patches/hts1.10 +++ /dev/null @@ -1,104 +0,0 @@ -Author: Michael R. 
Crusoe -Description: sync with htslib, samtools, and bcftools 1.10 - -- Remove symbols that was removed in libhts3 (hts_useek and uts_utell) -- use devtools/import.py and the contents of the samtools & bcftools 1.10 -Debian packages with their patches fully applied - ---- python-pysam.orig/pysam/htslib_util.h -+++ python-pysam/pysam/htslib_util.h -@@ -5,9 +5,6 @@ - #include "htslib/vcf.h" - #include "htslib/khash.h" - --int hts_useek(htsFile *fp, long uoffset, int where); --long hts_utell(htsFile *fp); -- - int hts_set_verbosity(int verbosity); - int hts_get_verbosity(void); - ---- python-pysam.orig/pysam/libchtslib.pxd -+++ python-pysam/pysam/libchtslib.pxd -@@ -632,8 +632,6 @@ - int8_t HTS_FMT_CRAI - - BGZF *hts_get_bgzfp(htsFile *fp) -- int hts_useek(htsFile *fp, long uoffset, int where) -- long hts_utell(htsFile *fp) - - ctypedef struct hts_idx_t - ---- python-pysam.orig/tests/00README.txt -+++ python-pysam/tests/00README.txt -@@ -15,7 +15,7 @@ - To try samtools, you may run the following commands: - - samtools faidx ex1.fa # index the reference FASTA -- samtools import ex1.fa.fai ex1.sam.gz ex1.bam # SAM->BAM -+ samtools view -bt ex1.fa.fai -o ex1.bam ex1.sam.gz # SAM->BAM - samtools index ex1.bam # index BAM - samtools tview ex1.bam ex1.fa # view alignment - samtools pileup -cf ex1.fa ex1.bam # pileup and consensus ---- python-pysam.orig/tests/pysam_data/Makefile -+++ python-pysam/tests/pysam_data/Makefile -@@ -32,7 +32,7 @@ - samtools calmd --output-fmt BAM $^ > $@ - - #%.bam: %.sam ex1.fa.fai --# samtools import ex1.fa.fai $< $@ -+# samtools view -bt ex1.fa.fai -i $@ $< - - uncompressed.bam: ex2.sam - samtools view -buS $< > $@ -@@ -53,7 +53,7 @@ - samtools faidx ex1.fa - - ex1.bam:ex1.sam.gz ex1.fa.fai -- samtools import ex1.fa.fai ex1.sam.gz ex1.bam -+ samtools view -bt ex1.fa.fai -o ex1.bam ex1.sam.gz - - %.bam.bai:%.bam - samtools index $< -@@ -73,7 +73,7 @@ - - example_unmapped_reads_no_sq.bam: example_unmapped_reads_no_sq.sam - touch tmp.list -- 
samtools import tmp.list $< $@ -+ samtools view -bt tmp.list -o $@ $< - rm -f tmp.list - - example_bai.bam: ex1.bam ---- python-pysam.orig/setup.py -+++ python-pysam/setup.py -@@ -159,8 +159,7 @@ - package_list = ['pysam', - 'pysam.include', - 'pysam.include.samtools', -- 'pysam.include.bcftools', -- 'pysam.include.samtools.win32'] -+ 'pysam.include.bcftools'] - package_dirs = {'pysam': 'pysam', - 'pysam.include.samtools': 'samtools', - 'pysam.include.bcftools': 'bcftools'} ---- python-pysam.orig/pysam/libchtslib.pyx -+++ python-pysam/pysam/libchtslib.pyx -@@ -490,8 +490,7 @@ - with nogil: - ret = bgzf_seek(hts_get_bgzfp(self.htsfile), offset, SEEK_SET) - elif self.htsfile.format.compression == no_compression: -- with nogil: -- ret = hts_useek(self.htsfile, offset, SEEK_SET) -+ ret = 0 if (hseek(self.htsfile.fp.hfile, offset, SEEK_SET) >= 0) else -1 - else: - raise NotImplementedError("seek not implemented in files compressed by method {}".format( - self.htsfile.format.compression)) -@@ -509,8 +508,7 @@ - with nogil: - ret = bgzf_tell(hts_get_bgzfp(self.htsfile)) - elif self.htsfile.format.compression == no_compression: -- with nogil: -- ret = hts_utell(self.htsfile) -+ ret = htell(self.htsfile.fp.hfile) - elif self.htsfile.format.format == cram: - with nogil: - ret = htell(cram_fd_get_fp(self.htsfile.fp.cram)) diff --git a/debian/patches/samtools_v1.10 b/debian/patches/samtools_v1.10 deleted file mode 100644 index 14400c7..0000000 --- a/debian/patches/samtools_v1.10 +++ /dev/null @@ -1,3371 +0,0 @@ -Author: Michael R. 
Crusoe -Description: support samtools 1.10 as it is more strict - ---- python-pysam.orig/tests/pysam_data/example_empty_with_header.sam -+++ /dev/null -@@ -1 +0,0 @@ --@HD VN:1.3 SO:coordinate ---- python-pysam.orig/tests/pysam_data/rg_with_tab.sam -+++ /dev/null -@@ -1,3273 +0,0 @@ --@SQ SN:chr1 LN:1575 --@SQ SN:chr2 LN:1584 --@PG ID:bwa PN:bwa VN:0.7.9a-r786 CL:bwa mem -p -t 8 -M -R @RG ID:None SM:None /mnt/data/hg19.fa /mnt/analysis/default-0.fastq --EAS56_57:6:190:289:82 69 chr1 100 0 * = 100 0 CTCAAGGTTGTTGCAAGGGGGTCTATGTGAACAAA <<<7<<<;<<<<<<<<8;;<7;4<;<;;;;;94<; MF:i:192 --EAS56_57:6:190:289:82 137 chr1 100 73 35M = 100 0 AGGGGTGCAGAGCCGAGTCACGGGGTTGCCAGCAC <<<<<<;<<<<<<<<<<;<<;<<<<;8<6;9;;2; MF:i:64 Aq:i:0 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS51_64:3:190:727:308 99 chr1 103 99 35M = 263 195 GGTGCAGAGCCGAGTCACGGGGTTGCCAGCACAGG <<<<<<<<<<<<<<<<<<<<<<<<<<<::<<<844 MF:i:18 Aq:i:73 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS112_34:7:141:80:875 99 chr1 110 99 35M = 265 190 AGCCGAGTCACGGGGTTGCCAGCACAGGGGCTTAA <<<<<<<<<<<<<<<<<<<<<<:<<8;<<8+7;-7 MF:i:18 Aq:i:69 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS219_FC30151:3:40:1128:1940 163 chr1 112 99 35M = 291 214 CCGAGTCACGGGGTTGCCAGCACAGGGGCTTAACC <<<<<<<<<<<<<<<<<<<;<<5;;<<<9;;;;7: MF:i:18 Aq:i:70 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS51_62:5:290:319:736 69 chr1 113 0 * = 113 0 GTTCTCAAGGTTGTTGCAAGGGGGTCTATGTGAAC <<<<<<:7:<.<<<<7<<.<.<<.9*<4<:<4%74 MF:i:192 --EAS51_62:5:290:319:736 137 chr1 113 73 35M = 113 0 CGAGTCACGGGGTTGCCAGCACAGGGGCTTAACCT ==;=======7====6=;==:;;====66=::27: MF:i:64 Aq:i:0 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --B7_597:2:132:493:921 69 chr1 119 0 * = 119 0 GTTCTCAAGGTTGTTGCAAGGGGGTCTATGTGAAC <<<<<<<<<<<<<<<<<<<;<<<<77;0<;;6777 MF:i:192 --B7_597:2:132:493:921 137 chr1 119 75 35M = 119 0 ACGGGGTTGCCAGCACAGGGGCTTAACCTCTGGTG <<<<<<<<<<<<<<<<;<<<<<<<<;<<<<;;88: MF:i:64 Aq:i:0 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS114_30:7:283:799:560 163 chr1 121 66 35M = 283 197 GGGGTTGCCAGCACAGGGGCTTAACCTCTGGTGAC <<<<+<<<<8<<<+<<<<<;<<:07;8;7402447 MF:i:18 
Aq:i:66 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS192_3:1:225:195:543 99 chr1 123 99 35M = 299 211 GGTTGCCAGCACAGGGGCTTAACCTCTGGTGACTG <<<<<<<<<<<<<<<<<<<<<<<9<<;::388998 MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --B7_589:6:114:714:317 99 chr1 126 99 35M = 311 220 TGCCAGCACAGGGGCTTAACCTCTGGTGACTGCCA <<<<<<<<<<<<<<<<<<<<<<<<<<<;<<<5;<; MF:i:18 Aq:i:75 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS114_39:1:70:147:84 163 chr1 128 73 35M = 285 192 CCAGCACAGGGGCTTAACCTCTGGTGACTGCCAGA <<<<<<<<<<<<<<<<<<<<;<<<<<<<<<;(5<< MF:i:18 Aq:i:73 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS188_7:2:187:227:818 163 chr1 129 99 35M = 290 196 CAGCACAGGGGCTTAACCTCTGGTGACTGCCAGAG <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<3<;<< MF:i:18 Aq:i:76 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS1_97:4:77:29:126 99 chr1 131 99 35M = 315 219 GCACAGGGGCTTAACCTCTGGTGACTGCCAGAGCT <<<<<<<<<<3<<<<<<<;;;7<;<<449<-:977 MF:i:18 Aq:i:69 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS114_30:4:327:795:103 99 chr1 133 99 35M = 302 204 ACAGGGGCTTAACCTCTGGTGACTGCCAGAGCTGC <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<;<<; MF:i:18 Aq:i:75 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS114_30:3:139:117:262 69 chr1 135 0 * = 135 0 GTTCTCAAGGTTGTTGCAAGGGGGTCTATGTGAAC <<<7<<<<<<<<<<<<<<<<<<<;<;<<<<<37;3 MF:i:192 --EAS114_30:3:139:117:262 137 chr1 135 76 35M = 135 0 AGGGGCTTAACCTCTGGTGACTGCCAGAGCTGCTG <<<<;<<<<<<<<<<<<<:<<<<<:<<8<<<<:<: MF:i:64 Aq:i:0 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS219_FC30151:5:29:817:854 73 chr1 135 77 35M = 135 0 AGGGGCTTAACCTCTGGTGACTGCCAGAGCTGCTG <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:64 Aq:i:0 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS219_FC30151:5:29:817:854 133 chr1 135 0 * = 135 0 GTTCTCAAGGTTGTTGCAAGGGGGTTTATGTGAAC <<<<<<<<<<<<<<<1..;:;;;;1%407)07&7. MF:i:192 --EAS192_3:6:170:169:57 163 chr1 138 99 35M = 296 193 GGCTTGACCTCTGGTGACTGCCAGAGCTGCTGGCC <<<<<;<<<<<<;<<<<<<<<<<<<:<<<<<;;+% MF:i:18 Aq:i:45 NM:i:2 UQ:i:30 H0:i:0 H1:i:1 --B7_595:4:84:802:737 99 chr1 140 68 35M = 284 179 CTTAACCTCTGGTGACTGCCAGAGCTGCTGGCAAG <<<<<<<<<<;9<9<<<;<<;73;<<<<<37;1+. 
MF:i:18 Aq:i:68 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS188_4:7:78:583:670 163 chr1 142 99 35M = 316 209 TAACCTCTGGTGACTGCCAGAGCTGCTGGCAAGCT <<<<<<<<<<;;;<;;<<<:7;5;<5;;<2--8-; MF:i:18 Aq:i:66 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS51_64:3:90:435:691 99 chr1 147 99 35M = 318 206 TCTGGTGACTGCCAGAGCTGCTGGCAAGCTAGAGT <<<<<<<<<<;<<<;<<<<:<<<;<81;<<1;784 MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS188_7:3:13:122:187 163 chr1 153 99 35M = 343 225 GACTGCCAGAGCTGCTGGCAAGCTAGAGTCCCCTT <<<<<<<;<;<<<;<<<<:;6<<<<;;;;:<<%%< MF:i:18 Aq:i:69 NM:i:1 UQ:i:4 H0:i:1 H1:i:0 --EAS221_1:6:69:735:1915 99 chr1 154 99 35M = 321 202 ACTGCCAGAGCTGCTGGCAAGCTAGAGTCCCATTT <<<<<<<<;<<<<<<<<<;<<<<;<8<<<<;1:<< MF:i:18 Aq:i:74 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS114_45:5:66:959:1311 163 chr1 159 95 35M = 336 212 CAGAGCTGCTGGCAAGCTAGAGGCCCATCTGGAGC ;;4;;;+;;;-01;;&-;;4;;&;;73)(&**274 MF:i:18 Aq:i:31 NM:i:2 UQ:i:12 H0:i:0 H1:i:1 --EAS56_57:6:325:759:288 99 chr1 163 99 35M = 341 213 GCTGCTGGCAAGCTAGAGTCCCATTTGGAGCCCCT 8<;<<<<81<<<<<;<<;<<<;9<<<<1>><<<< MF:i:18 Aq:i:21 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS51_66:4:240:264:231 121 chr1 213 66 35M = 213 0 TGTAATGAAAACTATATTTATGCTATTCAGTTCTA 9;,;;62<9<)29<<<;96<<<;<<7<<<<<<;<< MF:i:64 Aq:i:0 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS51_66:4:240:264:231 181 chr1 213 0 * = 213 0 CAACAGATCAAGAAGGAGGGGCAATGGACGAGTTA %15+5022))0&<<)0)+7:4+&<0<<:0<<<7<< MF:i:192 --EAS1_93:7:14:426:613 99 chr1 214 99 35M = 379 200 GTAATGAAAACTATATTTATGCTATTCAGTTCTAA ======;=;==========;;==3=;==-=<;<;< MF:i:18 Aq:i:71 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS1_93:2:173:995:93 163 chr1 215 99 35M = 382 202 TAATGAAAACTATATTTATGCTATTCAGTTCTAAA <<<<<<<<<<<<<<<<<<<7:<<<<;:<:<<<<:7 MF:i:18 Aq:i:73 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS51_64:6:195:348:703 163 chr1 215 99 35M = 353 173 TAATGAAAACTATATTTATGCTATTCAGTTCTAAA <<<<<<<;<<<<<;:<<<<<<<<<<<<:<1:<:7< MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS1_108:2:62:879:264 163 chr1 216 99 35M = 396 215 AATGAAAACTATATTTATGCTATTCAGTTCTAAAT <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<2<< 
MF:i:18 Aq:i:74 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS54_61:4:83:452:970 99 chr1 216 99 35M = 379 198 AATGAAAACTATATTTATGCTATTCAGTTCTAAAT ==========================;======== MF:i:18 Aq:i:74 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS218_1:2:64:1318:1711 99 chr1 218 99 35M = 389 206 TGAAAACTATATTTATGCTATTCAGTTCTAAATAT <<<<<<<<<<<<<<<<7<<<<<<<:<<<<<2<<<< MF:i:18 Aq:i:77 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --B7_589:8:113:968:19 83 chr1 219 99 35M = 50 -204 GAAAACTATATTTATGCTATTCAGTTCTAAATATA 8;<;8;9<<<<<<<9<:<<<<<<<<<;<<<<<<<< MF:i:18 Aq:i:63 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS1_93:4:160:896:275 163 chr1 220 99 35M = 387 202 AAAACTATATTTATGCTATTCAGTTCTAAATATAG ============<====<==<====<==<==;=:6 MF:i:18 Aq:i:76 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --B7_591:6:181:191:418 163 chr1 221 99 36M = 387 202 AAACTATATTTATGCTATTCAGTTCTAAATATAGAA <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<988 MF:i:18 Aq:i:77 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS114_28:7:242:354:637 99 chr1 222 99 36M = 417 231 AACTATATTTATGCTATTCAGTTCTAAATATAGAAA <<<<<<<<<<<<<<<<<<<<<<<<<<;<<<<<6<;; MF:i:18 Aq:i:75 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --B7_589:1:122:77:789 163 chr1 223 99 35M = 396 208 ACTATATTTATGCTATTCAGTTCTAAATATAGAAA <<<:<4<<9<:7<<<:<<<7<<<<<<<<<<9<9<< MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --B7_591:5:42:540:501 147 chr1 224 99 36M = 60 -200 CTATATTTATGCTATTCAGTTCTAAATATAGAAATT ;<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:69 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --B7_591:6:155:12:674 83 chr1 224 99 36M = 52 -208 CTATATTTATGCTATTCAGTTCTAAATATAGAAATT ;<<<<<<<<<<;<<<<;<<<<8<<<<<<<<<<<<<< MF:i:18 Aq:i:74 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --B7_593:4:106:316:452 147 chr1 224 99 36M = 49 -211 CTATATTTATGCTATTCAGTTCTAAATATAGAAATT :<<<<<;<<<<:<<:<<<<<<7<<<<<<<<<<<<<< MF:i:18 Aq:i:71 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS139_19:5:89:525:113 163 chr1 227 78 40M = 397 210 TATTTATGCTATTCAGTTATAAATATAGAAATTGAAACAG <1<7<6;+0;7;7'<70;-<7<:<:<<5<<:9<5:7:%:7 MF:i:18 Aq:i:39 NM:i:1 UQ:i:12 H0:i:0 H1:i:1 --EAS54_65:3:321:311:983 147 chr1 228 99 35M = 51 -212 
ATTTATGCTATTCAGTTCTAAATATAGAAATTGAA ;;4;;<7<<<<<<77<<<<<<<<<<17<<<<<<<< MF:i:18 Aq:i:62 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS54_65:8:76:493:708 147 chr1 229 44 35M = 73 -191 TTTATGCTATTCAGTTCTAAATATAGAAATTGAAA 5/)63.&1517(544(055(0454&7706566679 MF:i:18 Aq:i:44 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS54_71:2:125:628:79 163 chr1 229 99 35M = 400 205 TTTATGCTATTCAGTTCTAAATATAGAAATTGAAA ==================<6<====<<:<==7;:: MF:i:18 Aq:i:69 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS114_32:5:78:583:499 83 chr1 229 74 35M = 37 -227 TTTACGCTATTCAGTACTAAATATAGAAATTGAAA &6&9774&<;67<44&-4<;<9<7<<<<<;<<<<< MF:i:18 Aq:i:37 NM:i:2 UQ:i:27 H0:i:0 H1:i:1 --EAS54_67:3:175:730:949 83 chr1 230 99 35M = 70 -195 TTATGCTATTCAGTTCTAAATATAGAAATTGAAAC <<<<;+<<<<7<;<<;<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:65 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS114_45:1:84:275:1572 163 chr1 230 99 35M = 394 199 TTATGCTATTCAGTTCTAAATATAGAAATTGAAAC /6;;;4;;;;;;;;7;;4;.4;;;;;6;;;77077 MF:i:18 Aq:i:62 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS1_108:4:248:753:731 99 chr1 231 99 35M = 402 206 TATGCTATTCAGTTCTAAATATAGAAATTGAAACA <<<<<<<<<<<8<<<<<<<<<<<<:<<<<&<:<.: MF:i:18 Aq:i:70 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS114_45:1:9:1289:215 99 chr1 231 99 35M = 394 198 TATGCTATTCAGTTCTAAATATAGAAATTGAAACA ;;;;;;9;;;67;;;;;99;9;;;;;;;;977747 MF:i:18 Aq:i:59 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --B7_595:7:188:802:71 163 chr1 232 99 35M = 415 218 ATGCTATTCAGTTCTAAATATAGAAATTGAAACAG <<<<<<<<<;<<<<<9<<<:<<<:<<<<<<:<<<; MF:i:18 Aq:i:75 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS1_93:7:252:171:323 83 chr1 234 99 35M = 43 -226 GCTATTCAGTTCTAAATATAGAAATTGAAACAGCT ;8<;<=3=6==:====;;======;========== MF:i:18 Aq:i:62 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS192_3:5:223:142:410 147 chr1 235 99 35M = 60 -210 CTATTCAGTTCTAAATATAGAAATTGAAACAGCTG 8;<<<;<<<<;<<<<<<;<;<<<<<<<<<<<<;<< MF:i:18 Aq:i:39 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS114_30:1:243:10:911 83 chr1 236 99 35M = 63 -208 TATTCAGTTCTAAATATAGAAATTGAAACAGCTGT ;<;;;<4;9:<<<;<<;<<<<<;;<<<<<<<<<<< MF:i:18 Aq:i:64 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 
--EAS114_45:6:5:730:1436 163 chr1 236 99 35M = 403 202 TATTCAGTTCTAAATATAGAAATTGAAACAGCTGT ;;;;;;;;;;;;;;;;;;8;;;;;8;;;;;67777 MF:i:18 Aq:i:67 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS139_19:2:57:1672:1890 121 chr1 236 75 40M = 236 0 TATTCAGTTCTAAATATAGAAATTGAAACAGCTGTGTTTA :;;;9<8;;*<<<<<<:<<<<<<<<1:<<<<<<<<<<<7< MF:i:64 Aq:i:0 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS139_19:2:57:1672:1890 181 chr1 236 0 * = 236 0 CCCCCCCCCCCCCCCCCAGCCACTGCGGCCCCCCCAGCCA -+)%)'-'+,,<066,))090+:&486083:5&&:<<5<0 MF:i:192 --EAS1_105:2:299:360:220 99 chr1 237 99 35M = 403 201 ATTCAGTTCTAAATATAGAAATTGAAACAGCTGTG <<<<<<<9<9<<<<<<<<<<<<<<<<<5<;<0<<< MF:i:18 Aq:i:43 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS221_1:2:24:1037:84 163 chr1 238 99 35M = 415 212 TTCAGTTCTAAATATAGAAATTGAAACAGCTGTGT <<<<<<<<<<<<<<<<<<<<<<;<<<<<<;:<57< MF:i:18 Aq:i:73 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS1_105:3:86:823:683 163 chr1 240 99 35M = 408 203 CAGTTCTAAATATAGAAATTGAAACAGCTGTGTTT <<<<<<<<;<<<<<<<<<<<<<<<<<<<<;;9<<< MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS56_53:4:130:568:978 99 chr1 246 88 35M = 434 223 TAAATATAGAAATTGAAACAGCTGTGTTTAGTGAC 7<<;<<;<7<:;<7<<<<<<<<);4;+<7+3+%;< MF:i:18 Aq:i:24 NM:i:1 UQ:i:26 H0:i:1 H1:i:0 --EAS114_45:4:73:1208:495 163 chr1 246 99 35M = 431 220 TAAATATAGAAATTGAAACAGCTGTGTTTAGTGCC ;;;;;;;;;;;;;;;;;;;;;;;;5;;;;;37377 MF:i:18 Aq:i:67 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS1_97:7:264:642:506 99 chr1 247 99 35M = 420 208 AAATATAGAAATTGAAACAGCTGTGTTTATTGTAT <<;<<<<<<;<<<;:;;:;;<<;<<<<;*+;*&.4 MF:i:18 Aq:i:56 NM:i:3 UQ:i:28 H0:i:1 H1:i:0 --EAS114_28:5:104:350:749 163 chr1 247 99 36M = 415 204 AAATATAGAAATTGAAACAGCTGTGTTTAGTGCCTT <<8<<<<<<<<<<<;<<<<<<<<0;<<<9;<85;;; MF:i:18 Aq:i:73 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS56_61:6:227:259:597 147 chr1 248 99 35M = 61 -222 AATATAGAAATTGAAACAGCTGTGTTTAGTGCCTT <8<;2;9;<;;-92<;;;<;<<<<<<<<<<<<<<< MF:i:18 Aq:i:61 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS112_32:7:113:809:364 99 chr1 250 99 35M = 413 198 TATAGAAATTGAAACAGCTGTGTTTAGTGCCTTTG <<<<<<<<<<<<<<<<<<<<<<<<<<7<;<;<<<4 MF:i:18 
Aq:i:77 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS188_7:2:218:877:489 83 chr1 250 86 35M = 80 -205 TATAGAAATTGAAACAGCTGTGTTTAGTGCCTTTG 9<<<8<<<;<9<<<<<<<<<<;<<<<<<<<<<<<< MF:i:18 Aq:i:10 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS1_97:7:20:979:96 83 chr1 254 99 35M = 79 -210 GAAATTGAAACAGCTGTGTTTAGTGCCTTTGTTCA '9996;(:;-<;1<<<<=<<<<=<<<<<<<<<<<< MF:i:18 Aq:i:37 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS188_7:2:259:219:114 99 chr1 254 99 35M = 411 192 GAAATTGAAACAGCTGTGTTTAGTGCCTTTGTTCA <<<<<<<<<<<<<<<;<<<<<<7<7<<<<<0<<9< MF:i:18 Aq:i:69 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS114_39:6:13:1034:1144 99 chr1 256 99 35M = 429 208 AATTGAAACAGCTGTGTTTAGTGCCTTTGTTCACA <<<<<<<<<<<<<<<<<<<<<<<<3<<<;<<;<++ MF:i:18 Aq:i:69 NM:i:2 UQ:i:48 H0:i:1 H1:i:0 --EAS221_1:2:29:1486:672 147 chr1 256 99 35M = 79 -212 AATTGAAACAGCTGTGTTTAGTGCCTTTGTTCACA <<:<<<<;<<<<<<<<<<<<<<<<<<<<<<<<<++ MF:i:18 Aq:i:29 NM:i:2 UQ:i:54 H0:i:0 H1:i:0 --EAS139_11:7:46:695:738 163 chr1 259 74 35M = 428 204 TGAAACAGCTGAGTTTAGCGCCTGTGTTCACATAG <;<<<<;<<),&4<3<<7&7<0;)).3;79;7<;0 MF:i:130 Aq:i:74 NM:i:3 UQ:i:18 H0:i:0 H1:i:0 --EAS139_11:8:26:1221:222 163 chr1 261 99 35M = 446 220 AAACAGCTGTGTTTAGTGCCTTTGTTCAACCCCCT <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:78 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS51_64:3:190:727:308 147 chr1 263 99 35M = 103 -195 ACAGCTGTGTTTAGTGCCTTTGTTCAACCCCCTTG ;;<;;<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:73 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS114_26:3:284:261:124 83 chr1 263 99 35M = 79 -219 ACAGCTGTGTTTAGTGCCTTTGTTCAACCCCCTTG ===27===.====&===========;;======== MF:i:18 Aq:i:39 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS112_34:7:141:80:875 147 chr1 265 99 35M = 110 -190 AGCTGTGTTTAGTGCCTTTGTTCAACCCCCTTGCA 6/<;84<;<;<<<<<<5<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:69 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS139_19:3:24:1135:563 163 chr1 266 99 40M = 446 220 GCTGTGTTTAGTGCCTTTGTTCAACCCCCTTGCAACAACC <<<<:<<<<:1:<<<<<<.<<<<<<<<;<;;;43+:30:: MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS221_1:2:90:986:1224 83 chr1 267 99 35M = 67 -235 
CTGTGTTTAGTGCCTTTGTTCAACCCCCTTGCAAC <7*37;;;;;;;9<<;<7<<<<<<<<<<<;;<<<< MF:i:18 Aq:i:41 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS114_28:7:287:492:169 99 chr1 269 99 36M = 449 216 GTGTTTAGTGCCTTTGTTCAACCCCCTTGCAACAAC <<<7<<<<<<<<<<<<<<<<<<<<<<<8;;<;6<<; MF:i:18 Aq:i:69 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS218_4:1:48:9:409 99 chr1 271 75 18M5I12M = 464 228 GTTTAGTGCCTTTGTTCACATAGACCCCCTTGCAA <<<<<<<<<<<<<:<<<<<<<<<<<<<<<<<<<<< MF:i:130 Aq:i:75 NM:i:0 UQ:i:0 H0:i:0 H1:i:0 --EAS139_19:1:87:1222:878 163 chr1 272 10 40M = 435 203 TATAGGGCCTTTGTTCAAACCCCTTGCAACAACCTTGAGA &+6<6&<:<<9<1112<<;)9227>>>>>>>>>>>>>;<>>>>><<>>>;<+<>=>>+==>>==<==<=8=><:;8/;7<<<<<<<<;<:<<<< MF:i:18 Aq:i:77 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --B7_591:2:240:603:890 83 chr1 740 99 36M = 590 -186 GCTCCCAAGAGGGAAAGCTTTCAACGCTTCTAGCCA ;+&+//&<<<<<<<<<<9<<<8<<<<9<<<<<<<<< MF:i:18 Aq:i:66 NM:i:1 UQ:i:5 H0:i:1 H1:i:0 --B7_591:7:129:956:115 163 chr1 740 99 36M = 927 223 GCTCCCCAGAGGGAAAGCTTTCAACGCTTCTAGCCA <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<;877- MF:i:18 Aq:i:75 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS56_53:4:168:528:288 83 chr1 740 99 35M = 570 -205 GCTCCCCAGAGGGAAAGCTTTCAACGCTTCTAGCC 8<%<31;<<;<;<<<<<<<;<<<<<<<<<<;<<<< MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS56_65:8:275:851:240 147 chr1 743 99 35M = 561 -217 CCCCAGAGGGAAAGCTTTCAACGTTTCTAGCCATT 66614/&3616630666&66666&66666868666 MF:i:18 Aq:i:31 NM:i:1 UQ:i:5 H0:i:0 H1:i:1 --EAS188_7:6:205:873:464 147 chr1 743 99 35M = 552 -226 CCCCAGAGGGAAAGCTTTCAACGCTTCTAGCCATT <-((+:+;289<--;<;-;<:;;<<<;;<<<<<<< MF:i:18 Aq:i:63 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS56_65:6:37:610:260 163 chr1 745 99 35M = 913 203 CCAGAGGGAAAGCTTTCAACGCTTCTAGCCATTTC <<<;<;<<7<<<<<<<<<<<<<<;6<963;;;3;1 MF:i:18 Aq:i:68 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS192_3:7:93:945:176 147 chr1 745 99 35M = 582 -198 CCAGAGGGAAAGCTTTCAACGCTTCTAGCCATTTC 6;;;8<<3<<8.<;6)<<<<<9<<<<<<<<<<<<< MF:i:18 Aq:i:70 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --B7_593:6:61:628:681 83 chr1 746 99 36M = 586 -196 CAGAGGGAAAGCTTTCAACGCTTCTAGCCATTTCTT 
95<<<<<<<<;<<<<;<<<:<<;;<<<<<<<<<<<< MF:i:18 Aq:i:76 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS56_65:7:288:552:440 83 chr1 747 87 35M = 560 -222 AGAGGGAACGCTTTCAACTCTTCTAGCCATTTCTT 9<<%'%<<.2<<<<<<<<5:<<<<<<<<<<<<<<< MF:i:18 Aq:i:26 NM:i:2 UQ:i:33 H0:i:0 H1:i:0 --EAS56_53:2:170:265:818 163 chr1 748 10 35M = 920 207 GAGGGGAAGCTTTCAACGCTTCTAGCACTTTCTTT <<<<<(5/959<8.<9<8<<<2<&59&&:22:8+( MF:i:18 Aq:i:10 NM:i:3 UQ:i:17 H0:i:0 H1:i:0 --B7_595:2:251:121:479 83 chr1 750 99 35M = 572 -213 GGGAAAGCTTTCAACGCTTCTAGCCATTTCTTTTG <<<<<6'..663;&<<;<<9<<<9<<<<<<<<<<< MF:i:18 Aq:i:65 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS221_1:8:67:1797:1931 147 chr1 750 99 35M = 562 -223 GGGAAAGCTTTCAACGCTTCTAGCCATTTCTTTTG <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:77 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS1_103:2:226:302:758 83 chr1 751 99 35M = 556 -230 GGAAAGCTTTCAACGCTTCTAGCCATTTCTTTTGG ;<<<<9;<<<<<<<<<<7<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:33 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS114_32:2:163:618:570 83 chr1 751 99 35M = 571 -215 GGAAAGCTGTCAACGCTTCTAGCCATTTCTTTTGG <9774<88&:8<:8<8:8<8<<<<<;88<88<<<< MF:i:18 Aq:i:41 NM:i:1 UQ:i:5 H0:i:0 H1:i:1 --EAS1_97:3:73:292:429 99 chr1 752 99 35M = 920 203 GAAAGCTTTCAACGCTTCTAGCCATTTCTTTTTGC <<<<<<<<<<7<<;<<<<<<<2<<<5<<<<<:%)< MF:i:18 Aq:i:69 NM:i:1 UQ:i:4 H0:i:1 H1:i:0 --EAS1_108:3:82:356:253 99 chr1 752 99 35M = 927 210 GAAAGCTTTCAACGCTTCTAGCCATTTCTTTTGGC ===================<========;===39= MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS114_30:6:62:386:959 147 chr1 752 99 35M = 594 -193 AAAAGCTTTCAACGCTTCTAGCCATTTCTTTTGGC %;71131((<<6<92(+<1<<;<-3<8<<;<;;<< MF:i:18 Aq:i:57 NM:i:1 UQ:i:4 H0:i:1 H1:i:0 --EAS51_62:3:263:74:407 83 chr1 754 99 35M = 574 -215 AAGCTTTCAACGCTTCTAGCCATTTCTTTTGGCAT ;;88<::+;<)<5<<:<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:68 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --B7_597:3:67:620:344 99 chr1 755 99 35M = 905 185 AGCTTTCAACGCTTCTAGCCATTTCTTTTGGCATT <<<<2<:2<<<<<<7<<<<:<<*<<<<<<***3<< MF:i:18 Aq:i:33 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --B7_610:6:148:776:486 83 chr1 755 99 35M = 
578 -212 AGCTTTCAACGCTTCTAGCCATTTCTTTTGGCATT ;:<<<;<<;<<<<<;<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:76 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS54_61:3:150:933:810 83 chr1 755 99 35M = 593 -197 AGCTTTCAACGCTTCTAGCCATTTCTTTTGGCATT :89===:=:=;;==;==================== MF:i:18 Aq:i:76 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS51_64:4:102:467:897 99 chr1 756 97 35M = 940 219 GCTTTCAACGCTTCTAGCCATTTCTTTTGTCTTTT <<<<9<<<<9<2<<<&,/=====>=>=>>>=>>==>=>>>>>> MF:i:18 Aq:i:77 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --B7_595:3:297:637:86 83 chr1 869 99 35M = 704 -200 TCTCAGCTAGGGGAACAGGGAGGTGCACTAATGCG <:75<;<;;<<<<<<;;<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:68 NM:i:1 UQ:i:26 H0:i:1 H1:i:0 --EAS54_65:3:290:558:349 147 chr1 869 99 35M = 719 -185 TCTCAGCTAGGGGAACAGGGAGGTGCACTAATGCG 2;2;;'5&;<<5<<;5/<<<<<7<<;+;<<+1<8< MF:i:18 Aq:i:59 NM:i:1 UQ:i:6 H0:i:1 H1:i:0 --EAS1_95:3:308:956:873 99 chr1 870 99 35M = 1068 233 CTCATCTAGGGGAACAGGGAGGTGCACTAATGCGC <<<<<<<<<<<<<;<;<;1<<<<<.<9<;<<<<+; MF:i:18 Aq:i:31 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS51_78:7:147:64:416 147 chr1 870 99 35M = 701 -204 CTCATCTAGGGGAACAGGGAGGTGCACTAATGCGC /;49;:6<<<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:47 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --B7_593:4:30:812:345 163 chr1 871 99 36M = 1036 201 TCATCTAGGGGAACAGGGAGGTGCACTAATGCGCTC <<<<<<<7<;<<7<;77;3<&0-;<5<;6<1'13<: MF:i:18 Aq:i:64 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS54_73:7:134:243:630 163 chr1 871 99 35M = 1052 216 TCATCTAGGGGAACAGGGAGGCGCACTAATGAGCT <<<:<<<<::1:818;;&::<>.; MF:i:18 Aq:i:35 NM:i:1 UQ:i:5 H0:i:0 H1:i:1 --EAS54_81:2:31:98:804 147 chr1 982 99 35M = 805 -212 CTTTACTGTCATAACTATGAAGAGACTATTGCCAG ====;========7===================== MF:i:18 Aq:i:74 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS1_103:2:235:805:373 163 chr1 983 99 35M = 1146 198 TTTACTGTCATAACTATGAAGAGACTATTTCCAGA <<<<<<<<<<<<<<<<<<<<;<;<<<<<<;;<99; MF:i:18 Aq:i:74 NM:i:1 UQ:i:26 H0:i:1 H1:i:0 --EAS114_28:5:11:868:62 99 chr1 983 99 36M = 1154 207 TTTACTGTCATAACTATGAAGAGACTATTGCCAGAT <<<<<<<<<<<<<<<<<<<<:<<<;<<<<(7:7039 MF:i:18 Aq:i:69 NM:i:0 UQ:i:0 H0:i:1 
H1:i:0 --EAS54_81:2:280:512:316 163 chr1 984 99 35M = 1159 210 TTACTGTCATAACTATGAAGAGACTATTGCCAGCT ==<========6==4==6;;==:===;=2/:+8%6 MF:i:18 Aq:i:68 NM:i:1 UQ:i:4 H0:i:1 H1:i:0 --EAS1_93:5:292:122:666 99 chr1 985 99 35M = 1159 209 TACTGTCATAACTATGAAGAGACTATTGTCAGATG <<<<<<6<<<<<<<<8;<<<<<<<<<<3&9+;;(; MF:i:18 Aq:i:68 NM:i:1 UQ:i:5 H0:i:1 H1:i:0 --EAS56_53:1:23:403:981 99 chr1 985 99 35M = 1151 201 TACTGTCATAACTATGAAGAGACTATTGCCAGATG <8<<<;<<<<<<;<<<<<<8;<<<9<9,3;,6(91 MF:i:18 Aq:i:65 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS114_45:7:33:1566:588 99 chr1 985 76 35M = 1166 216 TACTGTCATAACTATGAAGAGCCTATTGCCAGATG <;.;;;;6;;;;6;;29;;;<+9;;;.3;;73797 MF:i:18 Aq:i:37 NM:i:1 UQ:i:10 H0:i:0 H1:i:1 --EAS139_11:7:92:367:1495 83 chr1 987 99 35M = 820 -202 CTGTCATAACTATGAAGAGACTATTGCCAGATGAA <8<88<<<<7<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:78 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS220_1:8:38:1576:1923 83 chr1 987 99 35M = 822 -200 CTGTCATAACTATGAAGAGACTATTGCCAGATGAA 8;<98<<<<<<<;<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:77 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --B7_595:7:190:481:295 163 chr1 990 99 35M = 1161 206 TCATAACTATGAAGAGACTATTGCCAGATGAACCA <<<<<<<<<<<<<<<<<<<<<<9<<<<<9<7<2:: MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS112_32:7:168:117:441 99 chr1 990 99 35M = 1151 196 TCATAACTATGAAGAGACTATTGCCAGATGAACCA <<3<<<<<<<<<<<<<<<<<<<+<<17;<;:<995 MF:i:18 Aq:i:71 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS54_73:3:239:796:221 163 chr1 992 99 35M = 1160 203 ATAACTATGAAGAGACTATTGCCAGCTGACCCCCC <<<7<<7<<7<<7<;<<<<<,;;,+'<+/+99%:' MF:i:18 Aq:i:37 NM:i:4 UQ:i:26 H0:i:0 H1:i:1 --EAS220_1:4:69:88:1154 147 chr1 992 99 35M = 805 -222 ATAACTATGAAGAGACTATTGCCAGATGAACCACA <<<<9<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:47 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS221_3:8:34:956:1309 99 chr1 994 99 35M = 1168 209 AACTATGAAGAGACTATTGCCAGATGAACCACACA <<<<<<7<<<<<<<<<<<6<<<<<<<<<<<:<8<8 MF:i:18 Aq:i:74 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS1_108:5:229:717:121 99 chr1 995 99 35M = 1150 190 ACTATGAAGAGACTATTGCCAGATGAACCACACAC 
=================<)=<4<0=.<<<71;41& MF:i:18 Aq:i:43 NM:i:1 UQ:i:5 H0:i:1 H1:i:0 --EAS219_1:1:67:191:668 99 chr1 995 99 35M = 1134 174 ACTATGAAGAGACTATTGCCAGATGAACCACACCT <<<<<<<<<<<<<<<<<6<<;<;<;<<<<<<6;%2 MF:i:18 Aq:i:74 NM:i:1 UQ:i:4 H0:i:1 H1:i:0 --EAS51_64:3:309:303:278 163 chr1 996 99 35M = 1178 217 CTATGAAGAGACTATTGCCAGATGAACCACACATT <<<<<<<<<<<<<<<<+<<+<<7<<<<<5<<<;;; MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS221_1:8:60:1020:1259 99 chr1 996 99 35M = 1157 196 CTATGAAGAGACTATTGCCAGATGAACCACACATT <;<<<<;<<<<<<<<<;<<<<<<<8<<<<<:<:<< MF:i:18 Aq:i:77 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS218_4:7:89:1487:520 83 chr1 997 99 35M = 805 -227 TATGAAGAGACTATTGCCAGATGAACCACACATTA 4;;/<<<<<:<;<;<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:70 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --B7_610:4:15:805:420 163 chr1 998 35 35M = 1164 201 ATGAAGAGACTATTCACATGTGAACCACACATTTA ;73;;;;67.;1<<+*.;*&<4947<&474&*9*( MF:i:130 Aq:i:35 NM:i:4 UQ:i:33 H0:i:0 H1:i:0 --EAS56_57:3:119:761:239 147 chr1 999 99 35M = 813 -221 TGAAGAGACTATTGCCAGATGAACCACACATTAAT ;;<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:74 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS112_34:7:142:457:584 99 chr1 999 99 35M = 1160 196 TGAAGAGACTATTTCCAGATGAACCACACATTAAT <<<<<<<<<<<<<<< MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS56_63:7:190:95:706 147 chr1 1078 99 35M = 920 -193 TTGTGTCCATGTACACACGCTGTCCTATGTACTTA 9;97437;<;;<<;<;<<<<<<;<<<<<<<<<<<< MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --B7_589:1:101:825:28 83 chr1 1079 99 35M = 879 -235 TGTGTCCATGTACACACGCTGTCCTATGTACTTAT 0;0'0;<<<<<<8<;<<<<;;3<<;;<<<8<<<<< MF:i:18 Aq:i:39 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS51_66:4:188:460:1000 99 chr1 1080 99 35M = 1251 206 GTGTCCATGTACACACGCTGTCCTATGTACTTATC <<<<<<<<<<<<<<<<7<<;:4;44<;;:8;;9;; MF:i:18 Aq:i:70 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS1_95:3:268:523:511 99 chr1 1081 99 35M = 1241 195 TGTCCATGTACACACGCTGTCCTATGTACTTATCA <<<<<<<<<<<<<<<<<<<<;<<<<6<:9<<3<44 MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS114_28:6:54:263:585 99 chr1 1081 99 36M = 1254 
209 TGTCCATGTACACACGCTGTCCTATGTACTTATCAT <<<<<<<<<<<<<<<<<<<:;<<;<:;::<<;;:;4 MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS51_66:7:174:987:334 83 chr1 1082 99 35M = 908 -209 GTCCATGTACACACGCTGTCCTATGTACTTATCAT ,;<;;<<<&<<<1<<<<<<<<<<<<<;<<<<<<<< MF:i:18 Aq:i:41 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS54_71:6:224:932:942 99 chr1 1082 99 34M = 1250 203 GTCCATGTACACACGCTGTCCTATGTACTTATCA <<<<<<<<<<<<<<<<<<<<<<;<<<<7<<(;3, MF:i:18 Aq:i:69 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS114_45:1:12:1296:358 99 chr1 1082 96 35M = 1252 205 GTCCATGTACACACGCTGTCCTATGTACTTATCAT ;;;6;7;7;;;;;7;9;;-*1;9;699/99/7477 MF:i:18 Aq:i:37 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS114_32:2:306:119:56 147 chr1 1083 99 35M = 919 -199 TCCATGTACACACGCTGTCCTATGTACTTATCATG ;;;;;<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:74 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS1_95:4:66:179:118 163 chr1 1084 99 35M = 1262 213 CCATGTACACACGCTGTCCTATGTACTTATCATGA <<<<<<<<<<<<<<<<<<<<<<<<<<:<<;<<6<< MF:i:18 Aq:i:75 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS1_105:2:110:584:649 99 chr1 1084 99 35M = 1266 217 CCATGTACACACGCTGTCCTATGTACTTATCATGA <<<<<<<<<<<<<<<<<<<<<<<<;<<<<<::<38 MF:i:18 Aq:i:43 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS218_1:4:28:315:310 163 chr1 1085 99 35M = 1242 192 CATGTACACACGCTGTCCTATGTACTTATCATGAC <<<<<<<<<<<<<<<<<<<<<:<+.<<.<+7<*17 MF:i:18 Aq:i:68 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --B7_595:7:242:4:593 147 chr1 1086 99 35M = 905 -216 ATATACACACGCTGTCCTATGTACTTATCATGACT 1.%55877+8+88808887+7;7;18:8;;;.&;8 MF:i:18 Aq:i:53 NM:i:1 UQ:i:4 H0:i:1 H1:i:0 --EAS1_93:1:131:946:353 163 chr1 1087 99 35M = 1249 197 TGTACACACGCTGTCCTATGTACTTATCATGACTC <<<<<<<<<<<<<;<<<<;;<<<<<<<;<:52;<2 MF:i:18 Aq:i:73 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS221_1:4:4:1732:88 99 chr1 1087 99 35M = 1265 213 TGTACACACGCTGTCCTATGTACTTATCATGACTC <<<<<<<<<<<<<<<<<2<8;8<;<8;<2;2:<:< MF:i:18 Aq:i:45 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --B7_595:4:58:703:72 83 chr1 1088 99 35M = 905 -218 GTACACACGCTGTCCTATGTACTTATCATGACTCT 5&<<7;+95;7'6<<<<<.<<<<<;<<9<7<<<<< MF:i:18 Aq:i:65 NM:i:0 UQ:i:0 H0:i:1 
H1:i:0 --EAS56_59:5:113:694:725 163 chr1 1088 99 35M = 1266 213 GTACACACGCTGTCCTATGTACTTATCATGACTCT <<<<<<<<<<<<9<<<<<:<<<<<<<<<<:;;<;; MF:i:18 Aq:i:47 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS56_65:5:278:848:765 147 chr1 1088 99 35M = 920 -203 GTACACACGCTGTCCTATGTACTTATCATGACTCT 7;;<;5<55<<;;<;<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:74 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS56_57:6:234:787:12 163 chr1 1092 97 35M = 1257 200 ACACGCTGGCCTATGTACTTATAATGACTCTATCC <;<<<9<<&+9;3;<993;<9<+94;9&41;08%9 MF:i:18 Aq:i:24 NM:i:2 UQ:i:15 H0:i:0 H1:i:0 --EAS218_1:4:15:856:340 147 chr1 1093 99 35M = 936 -192 CACGCTGTCCTATGTACTTATCATGACTCTATCCC <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:47 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS51_62:2:258:266:101 163 chr1 1094 99 35M = 1285 226 ACGCTGTCCTATGTACTTATCATGACTCTATCCCA <<<<<<<<<<<<<<<<<<5<;,<-2<<<<;68<<6 MF:i:18 Aq:i:68 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS56_59:2:177:552:234 147 chr1 1094 99 35M = 903 -226 ACGCTGTCCTATGTACTTATCATGACTCTATCCCA ::;:=;=99=====;;====;==========<=== MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS114_30:1:134:379:893 147 chr1 1095 99 35M = 927 -203 CGCTGTCCTATGTACTTATCATGACTCTATCCCAA 7137::;<<<<<<<;<<<<<<<<<<;<<<<<<<<< MF:i:18 Aq:i:71 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS1_105:8:256:404:584 147 chr1 1096 99 35M = 928 -203 ACTGTCCTATGTACTTATCATGACTCTATCCCAAA &&326+23<3<<<+:<<<<<< MF:i:18 Aq:i:76 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --B7_595:3:57:735:151 99 chr1 1121 94 35M = 1314 228 CTATCCCAAATTCCCAATTACGTCCTATCTTCTTC <<<<<<<<8<<8<:<<*<:<<<4<<<;,<<<<:<: MF:i:18 Aq:i:26 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS54_81:8:142:858:903 147 chr1 1121 99 35M = 943 -213 CTATCCCAAATTCCCAATTACGTCCTATCTTCTTC <<<<<;<<<<9<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:76 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS56_57:7:247:522:670 83 chr1 1121 99 35M = 960 -196 CTATCCCAAATTCCCAATTACGTCCTATCTTCTTC ;;;9;:<<<<<;<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:75 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS139_19:3:75:732:442 99 chr1 1121 99 40M = 1293 212 CTATCCCAAATTCCCAATTACGTCCTATCTTCTTCTTAGG 
<<<<<;<<<<<9<<<;<<;<<<5<<;8<<<<<<<<;:9%% MF:i:18 Aq:i:60 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS1_99:7:183:645:699 99 chr1 1122 86 35M = 1281 194 TATCCCAAATTCCCAATTACGTCCTATCTTCTTCT <<9<9<<<<<<<<<;<<;<<*175;173<;;;<-/ MF:i:18 Aq:i:21 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS192_3:6:175:437:950 163 chr1 1126 99 35M = 1298 207 CCAAATTCCCAATTACGTCCTATCTTCTTCTTAGG <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<:59 MF:i:18 Aq:i:74 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS56_63:3:93:1002:845 83 chr1 1129 99 35M = 954 -210 AATTCCCAATTACGTCCTATCTTCTTCTTAGGGAA <<::;;;<<<<<<<<<<<<<<<<;<<<<<<<<<<< MF:i:18 Aq:i:76 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS51_62:6:50:542:881 163 chr1 1132 99 35M = 1324 227 TCCCAATTACGTCCTATCTTCTTCTTAGGTAAGAA <<<<<4<09<<9<<2<<<<<<<<<<<2/.&2<%<7 MF:i:18 Aq:i:63 NM:i:1 UQ:i:5 H0:i:1 H1:i:0 --EAS1_99:3:118:851:285 83 chr1 1133 99 35M = 953 -215 CCCAATTACGTCCTATCTTCTTCTTAGGGAAGAAC 3+7<<<;<;<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS219_1:1:67:191:668 147 chr1 1134 99 35M = 995 -174 CCAATTACGTCCTATCTTCTTCTTAGGGAAGAACA <<<<<7<<7<<<<<<<;<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:74 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --B7_595:7:166:203:416 83 chr1 1136 99 35M = 963 -208 AATTACGTCCTATCTTCTTCTTAGGGAAGAACAGC <<<<<<<<::<<<<<<<<<;<<<<<<<<<<<<<<< MF:i:18 Aq:i:75 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS114_45:2:15:1497:1530 99 chr1 1136 99 35M = 1314 213 AATTACGTCCTATCTTCTTCTTAGGGAAGAACAGC 0<;;;9;;86<;;;<<&<<.<<;)3;7;654-471 MF:i:18 Aq:i:57 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS56_65:8:206:563:262 83 chr1 1137 99 35M = 971 -201 ATTACGTCCTATCTTCTTCTTAGGGAAGAACAGCT <<<<7<<<<<<<<<<<<<<<<<<<<<<<<<<<<<7 MF:i:18 Aq:i:75 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS114_26:4:40:352:151 99 chr1 1137 99 35M = 1327 225 ATTACGTCCTATCTTCTTCTTAGGGAAGAACAGCT <<<<<<<<<<<<<<<;<<9<<<<:<<<<;<99<3< MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --B7_593:7:67:302:762 99 chr1 1138 99 36M = 1313 211 TTACGTCCTATCTTCTTCTTAGGGAAGAACAGCTTA <<<<<<<<<<<<<<<<<<<<;;65;<-<;<:8<<<3 MF:i:18 Aq:i:73 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS1_97:5:84:927:843 
147 chr1 1138 99 35M = 938 -235 TTACGTCCTATCTTCTTCTTAGGGAAGAACAGCTT 588;<:<<<<<<<6<<<<;<<<:/<<3<:;<*<<< MF:i:18 Aq:i:69 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS1_99:5:147:479:41 163 chr1 1139 99 35M = 1322 218 TACGTCCTATCTTCTTCTTAGGGAAGAACAGCTTA <<<<<<<<<<<<<<<<<<<<::6<<;<<<;;9;;6 MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS1_105:3:329:177:267 83 chr1 1139 99 35M = 962 -212 TACGTCCTATCTTCTTCTTAGGGAAGAACAGCTTA <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:75 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --B7_589:7:72:916:763 163 chr1 1142 99 35M = 1340 233 GTCCTATCTTCTTCTTAGGGAAGAACAGCTTAGGT ==7>==9>=7=>=>>=>> MF:i:18 Aq:i:78 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS54_65:4:91:267:655 147 chr1 1365 99 35M = 1204 -196 TGTTGTTGGTTTTCTGTTTCTTTGTTTGATTTGGT ;,:;5:<6:<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:71 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS221_1:2:91:856:504 99 chr1 1366 99 35M = 1520 189 GTTGTTGGTTTTCTGTTTCTTTGTTTGATTTGGTT <<<<<<<<<<<<<<7<<<<<<<7<<<&;<<<&&<& MF:i:18 Aq:i:68 NM:i:1 UQ:i:5 H0:i:1 H1:i:0 --EAS1_108:2:170:326:433 99 chr1 1367 99 35M = 1535 203 TTGTTGGTTTTCTGTTTCTTTGTTTGATTTGGTGG =====<=9===:=<:==2=======2:===9==/5 MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS1_93:6:132:717:233 99 chr1 1368 99 35M = 1529 196 TGTTGGTTTTCTGTTTCTTTGTTTGATTTGGTGGA <<<<<<<<<<<<;<<<<<<<<<<<7<<<<&-<4<1 MF:i:18 Aq:i:47 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS139_19:1:14:420:712 99 chr1 1368 99 40M = 1525 197 TGTTGGTTTTCTGTTTCTTTGTTTGATTTTTTTGAAGACA <<<<<<<<<<<<;<<<<<<<;<<<-;<<<&,<&*8111:6 MF:i:18 Aq:i:66 NM:i:3 UQ:i:21 H0:i:1 H1:i:0 --EAS114_39:4:43:1047:1626 163 chr1 1369 99 35M = 1523 189 GTTGGTTTTCTGTTTCTTTGTTTGATTTGGTGGAA <<<<<<<<<<<<<<<<<<<:<<<:<<<<:+;-4:( MF:i:18 Aq:i:70 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS114_45:2:20:413:1334 147 chr1 1370 99 35M = 1215 -190 TTGGTTTTCTGTTTCTTTGTTTGATTTGGTGGAAG 88878777;:;:1:;9;;;6;;;6;9;;;;;296; MF:i:18 Aq:i:60 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS51_62:5:154:669:853 83 chr1 1371 99 35M = 1193 -213 TGGTTTTCTGTTTCTTTGTTTGATTTGGTGGAAGA <::<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< 
MF:i:18 Aq:i:77 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --B7_610:7:117:857:942 99 chr1 1372 99 35M = 1527 190 GGTTTTCTGTTTCTTTGTTTGATTTGGTGGAAGAC <<<<<<<<<<<<<<<<<<<<<<<<<:6<;;7;9<; MF:i:18 Aq:i:73 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS56_57:6:145:144:796 147 chr1 1372 99 35M = 1181 -226 GGTTTTCTGTTTCTTTGTTTGATTTGGTGGAAGAC ;<<<;<<<<<<<<<;<<<;<<<<<<<<<<<<<<<< MF:i:18 Aq:i:74 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS114_32:6:88:162:587 147 chr1 1372 99 35M = 1189 -218 GGTTTTCTGTTTCTTTGTTTGATTTGGTGGAAGAC 386;;388-<8;<;68<<;;<;<6<<<8<<<<<<< MF:i:18 Aq:i:63 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS221_1:8:73:108:1621 99 chr1 1373 99 35M = 1532 194 GTTTTCTGTTTCTTTGTTTGATTTGGTGGAAGACA <<<<<<<<71<<<<<<<<<+<<<<70:0<9<<61< MF:i:18 Aq:i:71 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS112_34:6:127:153:861 147 chr1 1374 99 35M = 1202 -207 TTTTCTGTTTCTTTGTTTGATTTGGTGGAAGACAT :;:6;9<<1;<<95<<<9<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS188_7:2:152:765:744 163 chr1 1374 99 35M = 1534 195 TTTTCTGTTTCTTTGTTTGATTTGGTGGAAGACAT <<<<<<<<<<<<<<<<<<:<<<<<<<<&<7293<< MF:i:18 Aq:i:74 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS54_73:3:313:827:992 147 chr1 1379 99 35M = 1197 -217 TGTTTCTTTGTTTGATTTGGTGGAAGACATAATCC '187:1'<75<.*<<:5<..<<*<<917<<7<<17 MF:i:18 Aq:i:57 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS51_64:3:7:268:263 121 chr1 1381 22 35M = 1381 0 TTGCGTTATTTGAGTTGGTGGAAGACATAATCCCA ',)*&2<$7+<<<'<-<7<<<<<<<7<<7><>;>+>>/;>>=>=>=:>><>=<<==;)<=8; MF:i:18 Aq:i:73 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS218_4:7:85:923:726 147 chr2 199 99 35M = 43 -191 GTAAAGTAACTGAACCTATGAGTCACAGGTATTCC <:<<<%3<<1<<86<<-<<<<<<<<<<<<6<<1<< MF:i:18 Aq:i:44 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS1_103:5:285:241:560 83 chr2 200 99 35M = 37 -198 TAAAGTAACTGAACCTATGAGTCACAGGTATTCCT :<<<<;<<,<<<<5<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS114_30:6:41:461:436 163 chr2 200 74 35M = 389 224 TAAAGTAACTGAACCTATGAGTCACAGGTATTCCT <<<<<<<<<<<<<<<<<<<:<<<<<<<<;<;;;:; MF:i:18 Aq:i:0 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS54_61:6:25:949:33 99 chr2 201 99 
35M = 383 217 AAAGTAACTGAACCTATGAGTCACAGGTATTCCTG =;===/8========*==&;6=&=&:=6&:=::67 MF:i:18 Aq:i:63 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS221_3:2:60:590:1760 99 chr2 201 99 35M = 376 210 AAAGTAACTGAACCTATGAGTCACAGGTATTCCTG <:<<<<<2<<<<:<::<<<::<<<<<6<<<<<<<6 MF:i:18 Aq:i:51 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS114_45:6:86:693:234 163 chr2 202 82 35M = 388 221 AAGTAACTGAACCTATGAGTCACAGGTATTCCTGA ;;;;;;;;;;;;;;;;9;;;;;;;;99;;&70777 MF:i:18 Aq:i:18 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS56_57:6:4:223:776 163 chr2 203 93 35M = 387 219 AGTAACTGAACCTATGAGTCACAGGTATTCCTGAG <<<<<<<<<<<<<<<<<<<<<<<<<9<<<:;<;2< MF:i:18 Aq:i:19 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS114_39:3:88:84:1558 99 chr2 203 95 35M = 394 226 AGTAACTGAACCTATGAGTCACAGGTATTCCTGTG <<;<<<<<<<<<<<<;;<<<<<<<::<<<<<<7&< MF:i:18 Aq:i:22 NM:i:1 UQ:i:5 H0:i:1 H1:i:0 --B7_597:2:168:829:88 163 chr2 205 99 35M = 369 199 TAACTGAACCTATGAGTCACAGGTATTCCTGAGGA <<<<<<<<<<<<<<<<<<<<<<<6<<<<<<9;4;2 MF:i:18 Aq:i:68 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS114_28:1:168:389:889 147 chr2 205 99 36M = 37 -204 TAACTGAACCTATGAGTCACAGGTATTCCTGAGGAA ;<<;;56;==================8========8 MF:i:18 Aq:i:76 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS54_71:5:81:685:141 99 chr2 207 85 34M = 382 210 ACTGAACCTATGAGTCACAGGTATTCCTGAGGAA <<<<<<<<<<<<<<<<<<<<<;;<<;<<<',7,7 MF:i:18 Aq:i:17 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS139_19:4:26:1312:1400 99 chr2 207 99 40M = 385 218 ACTGAACCTATGAGTCACAGGTATTCCTGAGGAAAAAGAA <<<<;<<<:<<:<;<:<<<;:;<<<<<<:<8<1;;:::88 MF:i:18 Aq:i:30 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS54_71:4:127:725:381 83 chr2 209 99 34M = 39 -204 TGAACCTATGAGTCACAGGTATTCCTGAGGAAAA +<<.<<;<;<<<3;<;<<<<<<6<8;<<<<<<<1 MF:i:18 Aq:i:54 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS188_7:2:19:736:559 99 chr2 209 99 35M = 370 196 TGAACCTATGAGTCACAGGTATTCCTGAGGAAAAA <<<<<<<<<<<<<<<<<<<<<<<<<<<<;<<<<<< MF:i:18 Aq:i:70 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS114_26:3:117:284:589 83 chr2 210 99 35M = 43 -202 GAACCTATGAGTCACAGGTATTCCTGAGGAAAAAG ==8==;==================;========== MF:i:18 Aq:i:56 NM:i:0 UQ:i:0 
H0:i:1 H1:i:0 --B7_610:5:120:596:847 163 chr2 211 83 35M = 410 234 AACCTATGAGTCACAGGTATTCCTGAGGAAAAAGA <<<<<<<<<<<<<;<<<9<<<<<<<;:<62;58;2 MF:i:18 Aq:i:30 NM:i:0 UQ:i:0 H0:i:1 H1:i:1 --B7_610:5:51:904:391 163 chr2 212 97 35M = 401 224 ACCTATGAGTCACAGGTATTCCTGAGGAAAAAGAA <<<<<<<<<<<<<<<<<;<<<<<;:;<2<6;;;;; MF:i:18 Aq:i:24 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS139_11:8:96:1314:1448 163 chr2 213 93 35M = 388 210 CCTATGAGTCACAGGTATTCCTGAGGAAAAATAAA <<<<<<<<<<<<<<<<<<<<<<<<5<4<<<<-<<< MF:i:18 Aq:i:18 NM:i:1 UQ:i:12 H0:i:1 H1:i:0 --EAS139_19:3:73:1158:535 163 chr2 213 99 40M = 377 204 CCTATGAGTCACAGGTATTCCTGAGGAAAAAGAAAAAGTG <<<<<<<<<<<<<<<<<<<<<<8<;;<<<<<9<<9::8:8 MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --B7_591:2:223:583:968 147 chr2 215 88 36M = 47 -204 TATGAGGCACAGGTATTCCTGAGGAAAAAGAAAAAG 1<';<<&%-:<<<<<:66%<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:30 NM:i:1 UQ:i:5 H0:i:1 H1:i:0 --EAS1_97:3:160:173:889 163 chr2 215 99 35M = 379 199 TATGAGTCACAGGTATTCCTGAGGAAAAAGAAAAA <<<<<<<<<<<<<<<<<<<<;0<7<<;<<<;7<09 MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS114_39:1:28:350:895 83 chr2 215 95 35M = 48 -202 TATGAGTCACAGGTATTCCTGAGGAAAAAGAAAAA :<;<<<:;<-<<<<<4;77<<<<<<<<<<<<<<<< MF:i:18 Aq:i:30 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS56_53:4:45:707:147 163 chr2 216 99 35M = 424 243 ATGAGTCACAGGTATTCCTGAGGAAAAAGAAAAAG <<<<<<<<<<<<&<<<<:<<9<<<9<<<<75;;;< MF:i:18 Aq:i:51 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS220_1:8:18:1757:95 99 chr2 216 45 35M = 374 193 ATGAGTCGCAGGTATTCCTGAGGAAAAAGAAAAAG <<<<<<<<<<<<<<<<<<<1<:<<<<<<:<<<;:< MF:i:18 Aq:i:0 NM:i:1 UQ:i:27 H0:i:0 H1:i:1 --EAS51_66:6:310:747:415 163 chr2 217 99 35M = 387 205 TGAGTCACAGGTATTCCTGAGGAAAAAGAAAAAGT <<<<<<<<<<<<<<<<<<:<<<<<;<<<<<;<;<; MF:i:18 Aq:i:24 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS114_28:2:114:938:216 147 chr2 218 99 36M = 63 -191 GAGTCACAGGTATTCCTGAGGAAAAAGAAAAAGTGA <<<<7<6<<<<<<<6<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:30 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS1_93:1:179:629:513 163 chr2 220 99 35M = 409 224 GTCACAGGTATTCCTGAGGAAAAAGAAAAAGTGAG 
<<<<<<<<<<<<<<<;<;<<<<<<<<<<<<<;<<< MF:i:18 Aq:i:49 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS114_45:4:88:55:1187 99 chr2 220 66 35M = 391 206 GTCACAGGTATTCCTGAGGAAAAAGAAAAAGTGAG ;;<;;;<<99<<;;<;;;;;:;49;:;;;;87898 MF:i:18 Aq:i:0 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS51_62:5:119:38:945 99 chr2 221 99 35M = 428 242 TCACAGGTATTCCTGAGGAAAAAGAAAAAGTGAGA <<<<<<<<<<<<<<<8<<<<<8<<<8<;<<7<:<< MF:i:18 Aq:i:74 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS56_65:6:67:800:450 147 chr2 221 99 35M = 41 -215 TCACAGGTATTCCTGAGGAAAAAGAAAAAGTGAGA 9-<9<;<<<<9;5<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:49 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --B7_610:5:102:915:87 147 chr2 222 99 35M = 65 -192 CACAGGTATTCCTGAGGAAAAAGAAAAAGTGAGAA ;<8<;;<<<<7;<<<;<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:30 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS114_26:1:113:367:659 163 chr2 222 72 35M = 390 203 CACAGGTATTCCTGAGGAAAAAGAAAAAGCGAGAA =9====8==========:=:=====9=:=&====5 MF:i:18 Aq:i:0 NM:i:1 UQ:i:5 H0:i:1 H1:i:0 --EAS218_1:2:26:211:481 147 chr2 222 99 35M = 43 -214 CACAGGTATTCCTGAGGAAAAAGAAAAAGTGAGAA :<:<<<<<<9:5<<<<<<<<<<<<<<:<:<<<<<< MF:i:18 Aq:i:54 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS219_FC30151:3:90:1906:1528 83 chr2 222 99 35M = 41 -216 CACAGGTATTCCTGAGGAAAAAGAAAAAGTGAGAA :<<<<<<<<<3:<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:49 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --B7_591:2:13:100:876 163 chr2 223 73 36M = 397 210 ACAGGGATTCCTGAGGAAAAAGAAAAAGTGAGAAGT <8<<<*<2<7<<<6<<<<<<6<<8<<<<5<<<<4<9 MF:i:18 Aq:i:30 NM:i:1 UQ:i:9 H0:i:0 H1:i:1 --EAS56_63:5:117:570:971 163 chr2 223 99 35M = 413 225 ACAGGTATTCCTGAGGAAAAAGAAAAAGTGAGAAG <<<<<<<<<<<<<;;;<<<<6<7;9;<:;<;<;;< MF:i:18 Aq:i:30 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS221_3:8:50:1203:1094 83 chr2 223 99 35M = 46 -212 ACAGGTATTCCTGAGGAAAAAGAAAAAGTGAGAAG <7<<<<<5:+63<<<<<<<<<<<<<<<<2<<<<<< MF:i:18 Aq:i:30 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS54_67:6:107:395:312 83 chr2 224 99 35M = 44 -215 CAGGTATTCCTGAGGAAAAAGAAAAAGTGAGAAGT ;<;;<<<<;<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:27 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS54_73:3:29:833:612 83 chr2 224 99 35M 
= 58 -201 CAGGTATTCCTGAGGAAAAAGAAAAAGTGAGAAGT <<;<<<;<::<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:27 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --B7_610:7:158:943:467 83 chr2 225 99 35M = 57 -203 AGGTATTCCTGAGGAAAAAGAAAAAGTGAGAAGTT <:<<;;<:5<<<<<<<<<<<<<<<<<<;<<<;<<< MF:i:18 Aq:i:46 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS56_59:2:201:768:529 163 chr2 225 99 35M = 396 206 AGGTATTCCTGAGGAAAAAGAAAAAGTGAGAAGTT ==========================1=======; MF:i:18 Aq:i:54 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS188_7:6:11:994:584 99 chr2 226 97 35M = 417 226 GGTATTCCTGAGGAAAAAGAAAAAGTGAGAAGTTT <<<<<<<<<<<7<<<<<<<<<<<<<6<<<<<<3<6 MF:i:18 Aq:i:24 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS56_57:2:206:873:186 83 chr2 227 99 35M = 66 -196 GTATTCCTGAGGAAAAAGAAAAAGTGAGAAGTTTG ;<<;--7<<<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:30 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS56_63:4:38:28:122 83 chr2 227 99 35M = 46 -216 GTATTCCTGAGGAAAAAGAAAAAGTGAGAAGTTTG ;9;9;-1<<<<<<<<<<<<<<<<<;<<<<<<<<<< MF:i:18 Aq:i:30 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS1_93:5:66:372:343 83 chr2 228 99 35M = 40 -223 TATTCCTGAGGAAAAAGAAAAAGTGAGAAGTTTGG ;<1;89<<<<<;<9<<<<9<<<;8<9<;<<<<<;8 MF:i:18 Aq:i:43 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS1_97:3:277:144:848 83 chr2 228 99 35M = 64 -199 TATTCCTGAGGAAAAAGAAAAAGTGAGAAGTTTGG <<<)63<<<<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:30 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS139_19:6:21:1601:1666 83 chr2 228 99 40M = 56 -212 TATTACTGAGGAAAAAGAAAAAGTGAGAAGTTTGGAAAAA -;;3&1<<<<<<<<<<<<1<<<<<<<<<<<7<<<<<<<<=<<<<<<<<<<<<<< MF:i:32 Aq:i:19 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS56_57:2:23:268:529 153 chr2 329 71 35M * 0 0 TGAAAGAGGCTCAAAGAATGCCAGGAAGATACATT 7;<<<<<<57;-<<<<<<:<77<<<<<<<;<;<<< MF:i:32 Aq:i:28 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS114_26:2:315:219:7 153 chr2 330 69 35M * 0 0 GAAAGAGGCTCAAAGAATGCCAGGAAGATACATTG 7==::<2=8<<<=====>888<=2=>==>,>,>>8 MF:i:32 Aq:i:19 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS192_3:4:63:5:870 83 chr2 330 75 35M = 148 -217 GAAAGAGGCTCAAAGAATGCCAGGAAGATACATTG :<;<;<<<4:;<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:75 NM:i:0 UQ:i:0 H0:i:1 
H1:i:0 --B7_591:5:243:557:560 163 chr2 331 75 36M = 499 204 AAAGAGGCTCAAAGAATGCCAGGAAGATACATTGCA <<<<<<<9<<<<<<<<<<<<<<<<<<;<<89<<9<; MF:i:18 Aq:i:0 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --B7_593:2:270:430:269 163 chr2 331 99 36M = 519 224 AAAGAGGCTCAAAGAATGCCAGGAAGATACATTGCA <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<;;;7;: MF:i:18 Aq:i:76 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS51_66:6:284:442:747 89 chr2 331 75 35M * 0 0 AAAGAGGCTCAAAGAATGCCAGGAAGATACATTGC <;<<<<<:<;<<<<<<<<;<<<<<<<<<<<<<<<< MF:i:32 Aq:i:29 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS218_4:7:71:31:1973 89 chr2 331 76 35M * 0 0 AAAGAGGCTCAAAGAATGCCAGGAAGATACATTGC <<<<<7<:<<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:32 Aq:i:29 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS1_93:2:30:466:652 147 chr2 332 98 35M = 163 -204 AAGAGGCTAAAAGAATGCCAGGAAGATACATTGCA <<<<<;3;&<<<<<<<============= MF:i:18 Aq:i:22 NM:i:0 UQ:i:0 H0:i:4 H1:i:13 --EAS114_39:3:88:84:1558 147 chr2 394 95 35M = 203 -226 ATCAGACTATCTAAAGTCAACATGAAGGAAAAAAA ;;<<;<<;<<5<<<<<<;<<:<<<;<<<<<<;<<< MF:i:18 Aq:i:22 NM:i:0 UQ:i:0 H0:i:2 H1:i:3 --EAS56_59:2:201:768:529 83 chr2 396 99 35M = 225 -206 CAGACTATCTAAAGTCAACATGAAGGAAAAAAATT 3<:<9<<;<<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:54 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --B7_591:2:13:100:876 83 chr2 397 73 36M = 223 -210 AGAATATATAAAGTCAACATGAAGGAAAAAAATTCT ;9<$<<<$<<<<<<<<75<<<<<<<9<9<<<<<<<< MF:i:18 Aq:i:30 NM:i:2 UQ:i:6 H0:i:1 H1:i:1 --EAS139_11:4:26:137:1382 99 chr2 397 99 35M = 579 217 AGACTATCTAAAGTCAACATGAAGGAAAAAAATTC <<<<<<7<<<77<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:49 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS56_59:1:93:490:901 83 chr2 445 99 35M = 280 -200 AGAAAAGCATACAGTCATCTATAAAGGAAATCCCA <<<<<<<;<<<;<<<;<<;<<;<<<<<<<<<<<<< MF:i:18 Aq:i:53 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS112_34:7:96:489:453 99 chr2 445 99 35M = 625 215 AGAAAAGCATACAGTCATCTATAAAGGAAATCCCA <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<;;;;: MF:i:18 Aq:i:54 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS114_26:6:46:13:880 147 chr2 445 99 35M = 290 -190 AGAAAAGCATACAGTCATCTATAAAGGAAATCCCA =&====8==========0================= MF:i:18 
Aq:i:43 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS114_28:2:167:905:852 163 chr2 445 99 36M = 647 238 AGAAAAGCATACAGTCATCTATAAAGAAAATCCCAT <<<7<<<<<<<<<<<<<<:<:<<:::&.<:<66:3< MF:i:18 Aq:i:43 NM:i:1 UQ:i:5 H0:i:0 H1:i:1 --EAS219_FC30151:3:13:674:1717 163 chr2 445 99 35M = 623 213 AGAAAAGCATGCAGTCATCTATAAAGGAAATCCCA <<<<<<<<<<%<<<<<<<<<<<<<<<<<<<;:;;; MF:i:18 Aq:i:45 NM:i:1 UQ:i:4 H0:i:0 H1:i:1 --EAS51_62:7:196:511:896 83 chr2 446 99 35M = 283 -198 GAAAAGCATACAGTCATCTATAAAGGAAATCCCAT 8<<<<<;<<<:<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:52 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS56_53:1:154:118:488 163 chr2 447 99 35M = 624 212 AAAAGCATACAGTCATCTATAAAGGAAATCCCATC <<<<<<<<<<<<<<<<<<<<<<:7<<<<7<:;;:: MF:i:18 Aq:i:54 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS56_57:2:44:153:969 83 chr2 447 95 35M = 245 -237 AAAAGCATACAGTCATCTATAAAGGAAATCCCATC <<5<:7<72<51<7<*79<<<<<5<<<<<<<<<2< MF:i:18 Aq:i:36 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS114_28:4:215:246:640 99 chr2 447 99 36M = 624 213 AAAAGCATACAGTCATCTATAAAGGAAATCCCATCA <<<<<<<<<<9<;<<<<<<<<<<9;<<<<<<3;<;3 MF:i:18 Aq:i:54 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS219_1:7:94:1655:1921 147 chr2 447 85 35M = 258 -224 AAAAGCATACAGTCATCTATAAAGGAAATCCCATC <<<<;:===<==;<==<;================; MF:i:18 Aq:i:30 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS221_1:6:60:1037:1146 147 chr2 447 99 35M = 250 -232 AAAAGCATACAGTCATCTATAAAGGAAATCCCATC <<<<<<<<<;<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:53 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS56_65:1:23:536:229 99 chr2 448 99 35M = 614 201 AAAGCATACAGTCATCTATAAAGGAAATCCCATCA <<<<<<<<<<<<<<<<<:<8<:<<;<<<<<<7<<< MF:i:18 Aq:i:54 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS112_34:6:130:865:838 163 chr2 448 99 35M = 649 236 AAAGCATACAGTCATCTATAAAGGAAATCCCATCA <<<<<<<<<<<<<<<<<<<<<<<<<<<<<;;:<;3 MF:i:18 Aq:i:54 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS56_59:2:239:1001:406 99 chr2 450 99 35M = 634 219 AGCATACAGTCATCTATAAAGGAAATCCCATCAGA <<<<<<7<<<<<<<<8<;<<<7<<<<36<<3<:33 MF:i:18 Aq:i:49 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS1_97:3:147:423:584 147 chr2 451 99 35M = 277 -209 
GCATACAGTCATCTATAAAGGAAATCCCATCAGAA 27<;<3<<<+<<;<<<;;-4<<<<<;<<<<<<<<< MF:i:18 Aq:i:69 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS1_99:1:187:715:521 83 chr2 451 99 35M = 291 -195 GCATACAGTCATCTATAAAGGAAATCCCATCAGAA <7<:<9<<<<<<<<<;<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:74 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS54_67:3:172:196:746 99 chr2 451 99 35M = 620 204 GCATACAGTCATCTATAAAGGAAATCCCATCAGAA <<<<<<<<9<<<<9<<<<<<<<<;<<<<6<<<<;< MF:i:18 Aq:i:71 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS54_71:3:267:821:860 83 chr2 451 99 34M = 296 -189 GCATACAGTCATCTATAAAGGAAATCCCATCAGA $&<<<.<:;6<<;<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:30 NM:i:0 UQ:i:0 H0:i:1 H1:i:3 --EAS56_61:7:7:682:201 83 chr2 452 99 35M = 288 -199 CATACAGTCATCTATAAAGGAAATCCCATCAGAAT 0:8;5<8<1:78<<<<<<<<<<<<:8<<2<<<<:< MF:i:18 Aq:i:66 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --B7_589:3:82:13:897 163 chr2 453 99 35M = 606 188 ATACAGTCATCTATAAAGGAAATCCCAGCAGAATA <<<<;<<<<<<;<;<;5<51;<1<<<<%<<<<,58 MF:i:18 Aq:i:41 NM:i:1 UQ:i:4 H0:i:0 H1:i:1 --EAS56_53:6:180:695:621 99 chr2 453 99 35M = 637 219 ATACAGTCATCTATAAAGGAAATCCCATCAGAATA <<<<<<<<<<<<<<<<<<<<<<<<<<<<<;::<<< MF:i:18 Aq:i:68 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS56_57:2:158:909:321 83 chr2 453 99 35M = 271 -217 ATACAGTCATCTATAAAGGAAATCCCATCAGAATA <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:76 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS114_26:2:237:497:165 99 chr2 454 99 35M = 619 200 TACAGTCATCTATAAAGGAAATCCCATCAGAATAA 8===<8===========37=<===7=;7=8===== MF:i:18 Aq:i:68 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS1_99:2:152:355:962 83 chr2 456 99 35M = 269 -222 CAGTCATCTATAAAGGAAATCCCATCAGAATAACA &<.9.<;+;<;<<<<<<<<<<::<<:<<<<<<<<< MF:i:18 Aq:i:30 NM:i:0 UQ:i:0 H0:i:1 H1:i:2 --EAS192_3:4:255:549:422 83 chr2 456 99 35M = 295 -196 AAGTCATCTATAAAGGAAATCCCATCAGAATAACA &<;;+<;4;<<<<<<<<<<<;<;<<;<<<<<<<<< MF:i:18 Aq:i:30 NM:i:1 UQ:i:5 H0:i:1 H1:i:2 --EAS220_1:4:100:20:1199 163 chr2 456 99 35M = 614 193 CAGTCATCTATAAAGGAAATCCCATCAGAATAACA 7<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<4<< MF:i:18 Aq:i:74 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 
--EAS54_67:5:71:408:741 163 chr2 457 99 35M = 637 215 AGTCATCTATAAAGGAAATCCCATCAGAATAACAA <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<;< MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS51_66:5:285:395:450 147 chr2 458 99 35M = 269 -224 GTCATCTATAAAGGAAATCCCATCAGAATAACAAT 8)3<8+;<)<<<<<<<<97:7<<<<<<<<<<<<<< MF:i:18 Aq:i:65 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS54_73:3:4:854:140 99 chr2 458 72 35M = 638 215 GTCATCTATAAAGGAAATCCCATCAGAATAACAAT <<<6<<<:<6<<<:36:<<<<3<<8:.6<38::4< MF:i:18 Aq:i:0 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS56_59:6:227:657:95 147 chr2 458 99 35M = 280 -213 GTCATCTATAAAGGAAATCCCATCAGAATAACAAT ;3;<);<<<<<<<<<<<<18<<<<<<<<<<<<<<< MF:i:18 Aq:i:59 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS114_28:7:57:324:546 83 chr2 458 99 36M = 281 -213 GTCATCTATAAAGGAAATCCCATCAGAATAACAATG ;;5<;,<<<;;<<<<<<<97<<<<<<<<<<9<<<<< MF:i:18 Aq:i:73 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS139_19:4:26:274:1078 83 chr2 458 99 40M = 282 -216 GTCATCTATAAAGGAAATCCCATCAGAATAACAATGGGCT 9:*:64<<;<<<<<<<<<;8;<<:<<<<<<<<<<<<<<<< MF:i:18 Aq:i:74 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS188_7:6:107:636:642 163 chr2 458 99 35M = 630 207 GTCATCTATAAAGGAAATCCCATCAGAATAACAAT <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:77 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS54_81:7:226:869:36 147 chr2 461 99 35M = 273 -223 ATATATAAAGGAAATCCCATCAGAATAACAATGGG <0/)&<=,==4>4=>>= MF:i:18 Aq:i:68 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS218_1:8:82:1540:77 163 chr2 619 99 35M = 786 202 GAAATAAAGTCAAGTCTTTCCTGACAAGCAAATGC <<<<<<<<<<<<<<<<<<<<<<<<<<<;;<<<<:8 MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS54_67:3:172:196:746 147 chr2 620 99 35M = 451 -204 AAATAAAGTCAAGTCTTTCCTGACAAGCAAATGCT <<<;><<+<<<<:<<<<2<;<<<;<<<<<<<<<<< MF:i:18 Aq:i:71 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS54_73:7:97:892:419 163 chr2 621 99 35M = 800 214 AATAAAGTCAAGTCTTTCCTGACAAGCAAATGCTA <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:76 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS192_3:4:312:915:751 147 chr2 621 99 35M = 461 -195 AATAAAGTCAAGTCTTTCCTGACAAGCAAAAGCTA <:-<<<99:::);:7<4;8<<<<<<<;<2<+8<;< 
MF:i:18 Aq:i:41 NM:i:1 UQ:i:10 H0:i:0 H1:i:1 --EAS1_93:4:325:352:67 163 chr2 622 99 35M = 794 207 ATAAAGTCAAGTCTTTCCTGACAAGCAAATGCTAA ==================<========<=<;-=== MF:i:18 Aq:i:75 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS1_97:4:83:731:540 99 chr2 623 99 35M = 804 216 TAAAGTCAAGTCTTTCCTGACAAGCAAATGCTAAG <<<<<<<<<;<<<<<<<<<<<<<:<7<*;&;<;;9 MF:i:18 Aq:i:69 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS188_7:5:74:329:459 163 chr2 623 99 35M = 795 207 TAAAGTCAAGTCTTTCCTGACAAGCAAATGCTAAG <<<<<<<<<<<<<<<<<<<<<<<;<<;<<;9;599 MF:i:18 Aq:i:73 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS219_FC30151:3:13:674:1717 83 chr2 623 99 35M = 445 -213 TAAAGTCAAGTCTTTCCTGACAAGCAAATGCTAAG <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:45 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS1_105:1:141:415:738 69 chr2 624 0 * = 624 0 TTACCTAGTTGCTCTGTAGTCTCAATTAATTGTTT <<<<<<<<<<<<<<<<<<<<<<<;<<<<<;<8<<< MF:i:192 --EAS1_105:1:141:415:738 137 chr2 624 76 35M = 624 0 AAAGTCAAGTCTTTCCTGACAAGCAAATGCTAAGA <<<<<<<<<<<<<<<<<<<<<<:<<;<<;<<<<6: MF:i:64 Aq:i:0 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS56_53:1:154:118:488 83 chr2 624 99 35M = 447 -212 AAAGTCAAGTCTTTCCTGACAAGCAAATGCTAAGA <<<;58<<95:<<;<;<<<;<<<;;<<<<<<<<<< MF:i:18 Aq:i:54 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS56_59:5:198:929:684 83 chr2 624 99 35M = 471 -188 AAAGTCAAGTCTTTCCTGACAAGCAAATGCTAAGA <<;<<<<<:<<<<<<<<<<<<<<<<<;<<<<<<<< MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS114_28:4:215:246:640 147 chr2 624 99 36M = 447 -213 AAAGTCAAGTCTTTCCTGACAAGCAAATGCTAAGAT ;<<,<<<96<<:<:<9<6<97<<<<<9<<<<9<<9< MF:i:18 Aq:i:54 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS1_103:2:234:167:381 83 chr2 625 99 35M = 443 -217 AAGTCAAGTCTTTCCTGACAAGCAAATGCTAAGAT <<;<;<<<<;<<<<7<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:49 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS112_34:7:96:489:453 147 chr2 625 99 35M = 445 -215 AAGTCAAGTCTTTCCTGACAAGCAAATGCTAAGAT ;<;;;<<<<5:<<:<<<<:<<<<<<<<<<<<<<<< MF:i:18 Aq:i:54 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS1_93:3:79:879:15 99 chr2 626 99 35M = 790 199 AGTCAAGTCTTTCCTGACAAGCAAATGCTAAGATA 
<<<<<<<<<<<<<<<<<<<<<<<<<<<<2<;<<1< MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS56_53:2:59:286:290 147 chr2 628 99 35M = 467 -196 TCAAGTCTTTCCTGACAAGCAAATGCTAAGATAAT 77<<<<7<<<97<<,7<<<;<<<;<9<<<<<<<<< MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS1_95:5:263:511:936 99 chr2 629 99 35M = 801 207 CAAGTCTTTCCTGACAAGCAAATGCTAAGATAATT <<<<<<<<<<<<<<<<<;<<<<<;<<:<:<<<<<< MF:i:18 Aq:i:71 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS114_30:3:181:582:435 147 chr2 629 99 35M = 471 -193 CAAGTCTTTCCTGACAAGCAAATGCTAAGATAATT <<<<<<<<;<<<<<;<<4<<<<<<;<<<<<<<<<< MF:i:18 Aq:i:76 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS188_7:6:107:636:642 83 chr2 630 99 35M = 458 -207 AAGTCTTTCCTGACAAGCAAATGCTAAGATAATTC <<<<<<<;<<<<<;<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:77 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS112_34:4:12:273:89 83 chr2 631 99 35M = 477 -189 AGTCTTTCCTGACAAGCAAATGCTAAGATAATTCA <:737<288<<<7<<<<<<<<<:9<<<<<<<<<<< MF:i:18 Aq:i:71 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS56_59:2:239:1001:406 147 chr2 634 99 35M = 450 -219 CTTTCCTGACAAGCAAATGCTAAGATAATTCATCA 0':.71;;:9==9=;====;=;============= MF:i:18 Aq:i:49 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS188_4:7:96:899:106 147 chr2 636 99 35M = 462 -209 TTCCTGACAAGCAAATGCTAAGATAATTCATCATC ;;;;<<<<<<<;<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:75 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS54_65:6:67:56:806 147 chr2 637 99 35M = 464 -208 TCCTGACAAGCAAATGCTAAGATAATTCATCATCA 844:8;7<88;8<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:70 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS54_67:5:71:408:741 83 chr2 637 99 35M = 457 -215 TCCTGACAAGCAAATGCTAAGATAATTCATCATCA ;7;<;<0<<<<<<<<:;<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS56_53:6:180:695:621 147 chr2 637 99 35M = 453 -219 TACTGAAAAGCAAATGCTAAGATAATTCATCATCA ;&377<&<<;7<<<<<7<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:68 NM:i:2 UQ:i:10 H0:i:1 H1:i:0 --EAS114_30:6:49:656:507 147 chr2 637 99 35M = 468 -204 TCCTGACAAGCAAATGCTAAGATAATTCATCATCA %44;;<:<<;<;<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS54_73:3:4:854:140 147 chr2 638 
72 35M = 458 -215 CCTGACAAGCAAATGCTAAGATAATTCATCATCAC :9':<;<<<;<<<;<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:0 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS139_19:1:85:1521:58 99 chr2 639 99 40M = 813 214 CTGACAAGCAAATGCTAAGATAATTCATCATCACTAAACC <<<<<<<<9<<<<<<<<<<<<<7<<<<<<<<<<<<;;:7: MF:i:18 Aq:i:74 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS114_39:2:57:1064:925 137 chr2 640 76 35M * 0 0 TGACAAGCAAATGCTAAGATAATTCATCATCACTA <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<:<<< MF:i:32 Aq:i:29 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS1_103:3:323:196:855 163 chr2 642 99 35M = 809 202 ACAAGCAAATGCTAAGATAATTCATCATCACTAAA <<<<<<<7<<<<<<:<<<<<<<<<<<<<<<<<;7: MF:i:18 Aq:i:69 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS54_67:5:117:33:262 163 chr2 642 99 35M = 814 207 ACAAGCAAATGCTAAGATAATTCATCATCACTAAA <<<;<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<; MF:i:18 Aq:i:76 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS139_11:1:59:742:549 99 chr2 642 99 35M = 816 209 ACAAGCAAATGCTAAGATAATTCATCATCACTAAA <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<8< MF:i:18 Aq:i:48 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS114_28:2:55:562:403 163 chr2 643 99 36M = 825 218 CAAGCAAATGCTAAGATAATTCATCATCACTAAACC <<<<<<<<<<<<<<<<<<<<<;<<;<<<<<<<;<;: MF:i:18 Aq:i:51 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS54_71:7:97:743:602 163 chr2 644 99 35M = 821 211 AAGCAAATGCTAAGATAATTCATCATCACTAAACC <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<;<<<: MF:i:18 Aq:i:26 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS114_28:2:167:905:852 83 chr2 647 99 36M = 445 -238 CAAATGCTAAGATAATTCATCATCACTAAACCAGTC +<<<9;7;<<+<<<<<39<;9<;9<<7<<<<<<<<< MF:i:18 Aq:i:43 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS112_34:6:130:865:838 83 chr2 649 99 35M = 448 -236 AATGCTAAGATAATTCATCATCACTAAACCAGTCC ;<:84<<<4<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:54 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS219_1:1:60:1420:660 163 chr2 649 99 35M = 808 194 AATGCTAAGATAATTCATCATCACTAAACCAGTCC <<<<<<<<<<<<<<<<<<<<<<<<<<;<<<8<<<< MF:i:18 Aq:i:77 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --B7_593:3:180:89:582 99 chr2 650 99 36M = 809 195 ATGCTAAGATAATTCATCATCACTAAACCAGTCCTA <<<<<<<<<7<<<<<<<<<7<<<:<<<:<<::77:< MF:i:18 Aq:i:74 NM:i:0 
UQ:i:0 H0:i:1 H1:i:0 --EAS1_99:1:86:871:319 147 chr2 651 71 35M = 494 -192 TGCTAAGATAATTCATCATCACTAAACCAGTCCTA 7;+1;<:<<<<<<<<;<<;<<9<<<<<<<<<<<<< MF:i:18 Aq:i:0 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS56_57:2:236:841:20 83 chr2 652 99 35M = 467 -220 GCTAAGATAATTCATCATCACTAAACCAGTCCTAT 7;<<<;<<<<;;<<<<<<<<<<<<<<<<<<<<;<< MF:i:18 Aq:i:75 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS51_62:2:133:8:379 83 chr2 653 99 35M = 470 -218 ATAAGATAATTCATCATCACTAAACCAGTCCTATA &=========='==7==0=2====28===00==== MF:i:18 Aq:i:70 NM:i:1 UQ:i:5 H0:i:1 H1:i:0 --EAS1_105:8:96:720:940 83 chr2 654 99 35M = 467 -222 TAAGATAATTCATCATCACTAAACCAGTCCTATAA *<<<<;<<<9<<;,<;0<;<<<<<<<<<<<<<<<< MF:i:18 Aq:i:69 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS56_57:5:71:994:576 99 chr2 655 99 35M = 805 185 AAGATAATTCATCATCACTAAACCAGTCCTATAAG <<<<<<<<<<<<<<<<<<<<<<<;<<5<<<<<<<< MF:i:18 Aq:i:76 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS1_103:4:164:79:134 147 chr2 656 99 35M = 488 -203 AGATAATTCATCATCACTAAACCAGTCCTATAAGA <;<;<<<;<<;<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:47 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS139_19:6:78:1029:512 83 chr2 656 99 40M = 500 -196 AGATAATTCATCATCACTAAACCAGTCCTATAAGAAATGC ;;;;;<;;<<<.<<6;<<;<;8<<<<::<<<<<<<<;<<< MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS1_93:1:214:784:690 147 chr2 657 99 35M = 472 -220 GATAATTCATCATCACTAAACCAGTCCTATAAGAA -<7<<7<:<<2<<<<;<<<<<;<<<<3<<<<<<<< MF:i:18 Aq:i:30 NM:i:0 UQ:i:0 H0:i:1 H1:i:1 --EAS220_1:4:6:1178:1105 99 chr2 657 93 35M = 830 208 GATAATTCATCATCACTAAACCAGTCCTATAAGAA <<<<<<:<<<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:17 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS1_99:7:171:196:287 83 chr2 658 99 35M = 485 -208 ATAATTCATCATCACTAAACCAGTCCTATAAGAAA <;;;98;<;&<;;<<<<<<<;<<<<<<<<<<<<<< MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS114_28:1:220:801:282 99 chr2 660 99 36M = 837 213 AATTCATCATCACTAAACCAGTCCTATAAGAAATGC <<<<<<<<<<<<<<<<<<<<<;<+<;<<<<<::<<: MF:i:18 Aq:i:30 NM:i:0 UQ:i:0 H0:i:1 H1:i:2 --EAS221_1:2:73:955:728 163 chr2 660 44 35M = 823 198 AATTCATCATCACTAAACCAGTCCTATAAGAAATG 
<<<<<<<<<<<<<<<<<<<<<<<<<<<<<;<<<<< MF:i:18 Aq:i:14 NM:i:0 UQ:i:0 H0:i:1 H1:i:2 --EAS1_105:1:3:903:957 147 chr2 661 99 35M = 516 -180 ATTCATCATCACTAAACCAGTCCTATAAGAAATGC <%12<&<<<;<:<<<<<<<<<7<<<<<<<<<<<<< MF:i:18 Aq:i:30 NM:i:0 UQ:i:0 H0:i:1 H1:i:1 --EAS56_65:2:224:579:433 83 chr2 662 99 35M = 485 -212 TTCATCATCACTAAACCAGTCCTATAAGAAATGCT '<08/8<+<>===> MF:i:18 Aq:i:75 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS1_105:2:146:374:692 99 chr2 690 99 35M = 874 219 AAATGCTCAAAAGAATTGTAAAAGTCAAAATTAAA <<<<<<<<<<<<<<<=>>>==>>===>==> MF:i:130 Aq:i:74 NM:i:0 UQ:i:0 H0:i:0 H1:i:0 --EAS1_108:6:159:493:275 99 chr2 760 72 35M = 939 214 ACAAAACTCACAGGTTTTATAAAACAATTAATTGA =====3============================= MF:i:130 Aq:i:72 NM:i:0 UQ:i:0 H0:i:0 H1:i:0 --EAS139_11:1:81:1019:558 163 chr2 760 77 35M = 926 201 ACAAAACTCACAGGTTTTATAAAACAATTAATTGA <<<<<<<<<<<6<<<<<<<<<<<<<<<<<<<<<7< MF:i:130 Aq:i:77 NM:i:0 UQ:i:0 H0:i:0 H1:i:0 --EAS51_62:7:162:195:761 163 chr2 767 30 18M4I13M = 922 190 TCACAGGTTTTATAAAACAATTAATTGAGACTACA <<<<<<<<<<<<<<<<<<<<<<<<<<;<:<<<<;; MF:i:130 Aq:i:30 NM:i:0 UQ:i:0 H0:i:0 H1:i:0 --B7_597:3:115:646:430 147 chr2 768 45 17M4I14M = 582 -217 CACAGGTTTTATAAAACAATTAATTGAGACTACAG 5;5<;<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:130 Aq:i:45 NM:i:0 UQ:i:0 H0:i:0 H1:i:0 --EAS114_30:6:243:209:110 163 chr2 768 48 17M4I14M = 920 187 CACAGGTTTTATAAAACAATTAATTGAGACTACAG <<<<<;<;<<<;<<<<<<<<<<<;<:;<<:;;+85 MF:i:130 Aq:i:48 NM:i:0 UQ:i:0 H0:i:0 H1:i:0 --EAS1_108:2:266:994:429 147 chr2 769 76 16M4I15M = 612 -188 ACAGGTTTTATAAAACAATTAATTGAGACTACAGA <<<<<<<<<<<<<<<< MF:i:18 Aq:i:54 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS139_19:1:85:1521:58 147 chr2 813 99 40M = 639 -214 AAATTAACATTACAACAGGAACAAAACCTCATATATCAAT :::86<<:<<8<<<<;<<8<<<<<<<<<<<<<<<;<<<<< MF:i:18 Aq:i:74 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS188_7:4:164:719:947 99 chr2 813 99 35M = 1005 227 AAATTAACATTACAACAGGAACAAAACCTCATATA <<<<<<<<<<<<<<<<<<<;<<<<<<<<<;<<<<< MF:i:18 Aq:i:64 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS219_1:1:50:257:341 163 chr2 813 99 35M = 
971 193 AAATTAACATTACAACAGGAACAAAACCTCATATA <<<<<<<<<<<<<<<<<<<<<<<<<<<<7<6<<<< MF:i:18 Aq:i:77 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS54_67:5:117:33:262 83 chr2 814 99 35M = 642 -207 AATTAACATTACAACAGGAACAAAACCTCATATAT <<;;<<;<:8<7<<;<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:76 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS218_1:8:90:706:1276 163 chr2 814 99 35M = 980 201 AATTAACATTACAACAGGAACAAAACCTCATATAT <<<<<<<<<<<<<<<<<<<<<<;<<<<<<<<:<:< MF:i:18 Aq:i:30 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS1_108:2:116:966:193 163 chr2 815 99 35M = 967 187 ATTAACATTACAACAGGAACAAAACCTCATATATC <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<;<<<< MF:i:18 Aq:i:74 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS139_11:1:59:742:549 147 chr2 816 99 35M = 642 -209 TTAACATTACAACAGGAACAAAACCTCATATATCA -<<<3<<<<6<<6<<<<<6<<<<6<<<<<<<<<<< MF:i:18 Aq:i:48 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS139_11:7:74:213:877 99 chr2 816 99 35M = 996 215 TTAACATTACAACAGGAACAAAACCTCATATATCA <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:30 NM:i:0 UQ:i:0 H0:i:1 H1:i:2 --B7_610:3:85:219:371 163 chr2 817 99 35M = 967 185 TAACATTACAACAGGAACAAAACCTCATATATCAA <<<<<<<<<<<<<<<<<<<<<:<<<<<<<<<<;<; MF:i:18 Aq:i:54 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS1_108:2:176:653:957 163 chr2 819 82 35M = 982 198 ACATTACAACAGGAACAAAACCTCATATATCAATA ????????????<==>=>=>=>>>==>>>=>>> MF:i:18 Aq:i:71 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS139_19:5:57:366:844 83 chr2 877 99 40M = 708 -209 AAATTCCCCCACTTAAGAGATATAGATTGGCAGAACAGAT ;;;7:8&555<,;<<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:73 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS114_32:3:236:475:254 163 chr2 880 99 35M = 1051 206 TTCCCCCACTTAAGAGATATAGATTGGCAGAACAG <<<<<<<<<<<<<<<<<<<<<<<<<:::<:;>=>>>>==>=>>>==>=>=:=====;=:=6:::6 MF:i:18 Aq:i:74 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS218_1:2:10:686:1024 163 chr2 947 99 35M = 1103 191 ACAAGAAACTCATTAATAAAGACATGAGTTCAGGT <:<<<<:<<<<<<<<<<:<:<<<<<<<<<<<5<<< MF:i:18 Aq:i:30 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS54_73:5:53:61:31 163 chr2 949 99 35M = 1122 208 AAGAAACTCATTAATAAAGACATGAGTTCAGATAA 
<<<7;<7<<<;7<;;<7<7<7<;5<73<<<;>588>9<>7:<0<9; MF:i:18 Aq:i:30 NM:i:1 UQ:i:5 H0:i:0 H1:i:1 --B7_589:2:30:644:942 99 chr2 1045 83 35M = 1229 219 TATATCAGATAAAGCACACTTTAAATCAACAACAG <<<<<<<<<<<<<<<<<<<<<<<9<<<<<<9;<9< MF:i:18 Aq:i:22 NM:i:0 UQ:i:0 H0:i:1 H1:i:3 --B7_591:2:123:924:645 83 chr2 1045 84 36M = 861 -220 TATATCAGATAAAGCACACTTTAAATCAACAACAGT ;<<<<*<<<<<<<<8<<<<<><<<<<><<<<<<<<<<<<<<< MF:i:18 Aq:i:47 NM:i:1 UQ:i:27 H0:i:0 H1:i:1 --EAS51_62:4:308:614:911 99 chr2 1319 90 35M = 1493 209 TGCGCTTGTACTTCTAAATCTATAACAAAATTAAA <<<<<<<<<<<<<<<<<<<<<<<<<<<;;<<<<8< MF:i:18 Aq:i:43 NM:i:1 UQ:i:27 H0:i:0 H1:i:1 --EAS54_65:3:155:541:234 83 chr2 1319 99 35M = 1151 -203 TGCGCTTGTACTTCTAAATCTATAAAAAAATTAAA 78;<7<<<<<<<<<<<<<<;<<<<<<<<<<;<<<< MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS56_57:6:175:289:351 147 chr2 1319 99 35M = 1144 -210 TGCGCTTGTACTTCTAAATCTATAAAAAAATTAAA 9;;:+<<<<<;<<:<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:73 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --B7_593:7:283:186:707 83 chr2 1321 99 36M = 1154 -203 CGCTTGTACTTCTAAATCTATAACAAAATTAAAATT 889;<7;<7<<7<<<<<7<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:45 NM:i:1 UQ:i:27 H0:i:0 H1:i:1 --EAS1_105:3:308:66:538 147 chr2 1321 99 35M = 1138 -218 CGCTTGTACTTCTAAATCTATAACAAAATTAAAAT 996999;<9;<:<<<<<:<<7<<<<<<<<<<<<<< MF:i:18 Aq:i:45 NM:i:1 UQ:i:27 H0:i:0 H1:i:1 --EAS1_108:5:11:555:330 163 chr2 1321 99 35M = 1492 206 CGCTTGTACTTCTAAATCTATAAAAAAATTAAAAT <<<<<<<<<<<<<<<<<<<<<<<<<<<<;<<4<;< MF:i:18 Aq:i:56 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS51_66:7:84:411:336 73 chr2 1322 75 35M * 0 0 GCTTGTACTTCTAAATCTATAAAAAAATTAAAATT <<<;<<<;<<<<<<<<<<<<:<<;<<<<<<;8<;< MF:i:32 Aq:i:0 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS139_11:5:52:1278:1478 163 chr2 1322 47 35M = 1513 226 GCTTGTACTTCTAAATCTATAACAAAATTAAAATT <<<<<<<<<<<<<<9<<<<<<<<<<<<<<<<9<<< MF:i:18 Aq:i:0 NM:i:1 UQ:i:27 H0:i:0 H1:i:1 --EAS56_53:3:101:809:776 147 chr2 1326 99 35M = 1160 -201 GTACTTCTAAATCTATAAAAAAATTAAAATTTAAC <<<-<;7;<<<<:;<<<7<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 
--EAS192_3:3:221:881:916 147 chr2 1327 96 35M = 1168 -194 TAATTCTAAATCTAGAACAAAATTAAAATTTAACA 44%-4(5<;9/,:<68:1<:8<:<<84;<<<<<;< MF:i:18 Aq:i:24 NM:i:3 UQ:i:41 H0:i:0 H1:i:0 --EAS1_105:1:28:745:352 147 chr2 1329 99 35M = 1159 -205 CTTCTAAATCTATAACAAAATTAAAATTTAACAAA 4;;*;<<<;;<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:45 NM:i:1 UQ:i:27 H0:i:0 H1:i:1 --EAS114_45:2:23:1754:796 99 chr2 1329 99 35M = 1488 194 CTTCTAAATCTATAAAAAAATTAAAATTTAACAAA ;<<;<;<;<;<;<<;;;;;<<<<;;<<<<<97999 MF:i:18 Aq:i:69 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS1_97:2:96:419:327 147 chr2 1331 99 35M = 1149 -217 TCTAAATCTATAACAAAATTAAAATTTAACAAAAG ;1<<<<<9<<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:47 NM:i:1 UQ:i:27 H0:i:0 H1:i:1 --EAS1_97:4:274:287:423 163 chr2 1332 75 35M = 1515 218 CTAAATCTATAAAAAAATTAAAATTTAACAAAAGT <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<;<<< MF:i:18 Aq:i:0 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS219_1:7:35:392:2042 83 chr2 1332 99 35M = 1168 -199 ATAAATCTATAAAAAAATTAAAATTTAACAAAAGT +<<<<4<>>>>;>>&>->9>9;4>->>>>,4>9>,<1> MF:i:18 Aq:i:27 NM:i:1 UQ:i:5 H0:i:0 H1:i:1 --EAS139_19:2:82:154:1333 99 chr2 1349 77 40M = 1511 202 TTAAAATTTAACAAAAGTAAATAAAACACACAGCTAAAAC <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<;;<;;:;: MF:i:18 Aq:i:0 NM:i:1 UQ:i:27 H0:i:1 H1:i:0 --EAS188_7:1:290:286:763 99 chr2 1349 75 35M = 1515 201 TTAAAATTTAACAAAAGTAAATAAAACACATAGCT <<<<<<<<<<<<<<<<7<<<<<<<<<<<<<<<8<< MF:i:18 Aq:i:0 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS221_1:4:3:248:1491 73 chr2 1349 99 35M * 0 0 TTAAAATTTAACAAAAGTAAATAAAACACATAGCT <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<:8:< MF:i:18 Aq:i:76 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS114_39:3:6:1064:1805 99 chr2 1350 99 35M = 1502 187 TAAAATTTAACAAAAGTAAATAAAACACATAGCTA <<<<<<<<<<<<<<<<<<<<<<;<<<<<<<<<<<< MF:i:18 Aq:i:76 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --B7_595:6:137:811:130 83 chr2 1351 99 35M = 1175 -211 AAAATTTAACAAAAGTAAATAAAACACATAGCTAA <<<<<<<<<:<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:47 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS1_108:1:155:809:543 83 chr2 1352 99 35M = 1156 -231 AAATTTAACAAAAGTAAATAAAACACATAGCTAAA 
<<<+0<<<9<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:73 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS56_63:3:41:468:459 99 chr2 1352 75 35M = 1513 196 AAATTTAACAAAAGTAAATAAAACACATAGCTAAA <<<<<<<<<<<<<<<<<<<<<;<<<<<<<<<<;;7 MF:i:18 Aq:i:0 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS1_108:4:31:622:216 73 chr2 1354 99 35M * 0 0 ATTTAACAAAAGTAAATAAAACACATAGCTAAAAC <<<<<<<<<<<<<<<<<<<<<<<<<<<<8<<96<7 MF:i:18 Aq:i:70 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS54_71:8:105:854:975 163 chr2 1354 71 35M = 1523 202 ATTTAACAAAAGTAAATAAAACACATAGCTAAAAC <<<<<<<<<<<<<<<<<<<<<<<<<<<7:<;;;;5 MF:i:18 Aq:i:0 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --B7_610:7:26:749:174 147 chr2 1357 78 35M = 1183 -209 TAACAAAAGTAAATAAAACACATAGCTAAAACTAA (<<)<<<<6<<<<<<<<<<&:<3<<<6<<<)<:<< MF:i:18 Aq:i:11 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS56_57:6:21:553:57 147 chr2 1358 99 35M = 1197 -196 AACAAAAGTAAATAAAACACATAGCTAAAACTAAA <<+<<<<<<<<<;<<<<8<<<<<<8<<<<<;<<<< MF:i:18 Aq:i:71 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS1_97:2:128:629:484 83 chr2 1359 96 35M = 1185 -209 AAAAAAGTAAATAAAACACATAGCTAAAACTAAAA :(::<<<<<<<<<< MF:i:32 Aq:i:0 NM:i:0 UQ:i:0 H0:i:82 H1:i:85 --B7_589:6:33:356:636 73 chr2 1520 0 35M * 0 0 TTTTTTTCTTTTCTCTTTTTTTTTTTTTTTTTTTT <<<<<<<8;<<<<<<<<<<<<<7<<<<<<<;;3&3 MF:i:32 Aq:i:0 NM:i:0 UQ:i:0 H0:i:14 H1:i:85 --EAS114_45:6:86:859:1779 137 chr2 1520 0 35M * 0 0 TTTTTTTCATTTCTCTTTTTTTTTTTTTTTTTTTT ;;;;;;;;;;;;;;;;;;;;;;;;;;;8;;)7699 MF:i:32 Aq:i:0 NM:i:1 UQ:i:26 H0:i:0 H1:i:15 --EAS54_71:8:105:854:975 83 chr2 1523 71 33M = 1354 -202 TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTG <<<<;<:<<;<&<;<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:0 NM:i:0 UQ:i:0 H0:i:85 H1:i:85 --EAS51_62:4:187:907:145 153 chr2 1524 28 35M * 0 0 TTTCTTCTCTCTCTTTTTTTTTTTTTTTATTGCAT <<<+;;,6<<<<6<<<<<<<<<<<<<<<<<<<<<< MF:i:32 Aq:i:28 NM:i:3 UQ:i:59 H0:i:0 H1:i:0 --EAS54_71:4:284:269:882 73 chr2 1524 0 34M * 0 0 TTTCTTTTCTCTTTTTTTTTTTTTTGTTTTTGCA <;<<<<<8<7<8;<<<;<7<<<<<;272;73&&) MF:i:32 Aq:i:0 NM:i:1 UQ:i:17 H0:i:0 H1:i:85 --EAS56_63:4:141:9:811 137 chr2 1524 10 35M * 0 0 
TTTCTTTTCTCCTTTTTTTTTTTTTTTTTCTACAT <<<;<<<<<<<;<;<:<<<;<<<<<<<<..));;. MF:i:32 Aq:i:0 NM:i:3 UQ:i:47 H0:i:2 H1:i:27 --EAS114_30:6:277:397:932 73 chr2 1524 0 35M * 0 0 TTTCTTTTCACTTTTTTTTTTTTTTTTTTTTACTT <<<<<<<<<<<<<<<<<<<<<<<<<<<<:8(,0%( MF:i:32 Aq:i:0 NM:i:3 UQ:i:42 H0:i:2 H1:i:85 --EAS139_11:7:50:1229:1313 83 chr2 1528 77 35M = 1376 -187 TTTTTTCTTTTTTTTTTTTTTTTTTTTGCATGCCA <<<<,<&<7<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:0 NM:i:1 UQ:i:11 H0:i:3 H1:i:7 --EAS54_65:3:320:20:250 147 chr2 1532 77 35M = 1367 -200 TTTTTTTTTTTTTTTTTTTTTTTGCATGCCAGAAA +'''/<<<<7:;+<;::<<<;;<<<<<<<<<<<<< MF:i:18 Aq:i:6 NM:i:2 UQ:i:24 H0:i:1 H1:i:2 --EAS114_26:7:37:79:581 83 chr2 1533 68 35M = 1349 -219 TTTTTTTTTTTTTTTTTTTTTTTCATGCCAGAAAA 3,,,===6===<===<;=====-============ MF:i:18 Aq:i:27 NM:i:2 UQ:i:23 H0:i:0 H1:i:1 ---- python-pysam.orig/tests/pysam_data/example_user_header.sam -+++ /dev/null -@@ -1,8 +0,0 @@ --@HD VN:1.0 --@SQ SN:chr1 LN:1575 --@SQ SN:chr2 LN:1584 --@x1 A:2 B:5 --@x2 A:4 B:5 --@x3 A:6 B:5 --read_28833_29006_6945 99 chr1 33 20 10M1D25M = 200 167 AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG <<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<< NM:i:1 RG:Z:L1 --read_28701_28881_323b 147 chr2 88 30 35M = 500 412 ACCTATATCTTGGCCTTGGCCGATGCGGCCTTGCA <<<<<;<<<<7;:<<<6;<<<<<<<<<<<<7<<<< MF:i:18 RG:Z:L2 ---- python-pysam.orig/tests/pysam_data/Makefile -+++ python-pysam/tests/pysam_data/Makefile -@@ -14,7 +14,6 @@ - $(BAM) $(BAI) \ - $(CRAM) $(CRAI) \ - example_bai.bam \ -- rg_with_tab.bam \ - ex2_truncated.bam \ - empty.bam empty.bam.bai \ - explicit_index.bam explicit_index.cram \ ---- python-pysam.orig/pysam/alternatives.py.obsolete -+++ python-pysam/pysam/alternatives.py.obsolete -@@ -12,7 +12,6 @@ - int bam_merge(int argc, char *argv[]) - int bam_index(int argc, char *argv[]) - int bam_sort(int argc, char *argv[]) -- int bam_tview_main(int argc, char *argv[]) - int bam_mating(int argc, char *argv[]) - int bam_rmdup(int argc, char *argv[]) - int bam_rmdupse(int argc, char *argv[]) ---- 
python-pysam.orig/tests/AlignmentFile_test.py -+++ python-pysam/tests/AlignmentFile_test.py -@@ -1382,19 +1382,19 @@ - os.unlink(tmpfilename) - - --class TestDeNovoConstructionUserTags(TestDeNovoConstruction): -- -- '''test de novo construction with a header that contains lower-case tags.''' -- -- header = {'HD': {'VN': '1.0'}, -- 'SQ': [{'LN': 1575, 'SN': 'chr1'}, -- {'LN': 1584, 'SN': 'chr2'}], -- 'x1': {'A': 2, 'B': 5}, -- 'x3': {'A': 6, 'B': 5}, -- 'x2': {'A': 4, 'B': 5}} -- -- bamfile = os.path.join(BAM_DATADIR, "example_user_header.bam") -- samfile = os.path.join(BAM_DATADIR, "example_user_header.sam") -+# class TestDeNovoConstructionUserTags(TestDeNovoConstruction): -+# -+# '''test de novo construction with a header that contains lower-case tags.''' -+# -+# header = {'HD': {'VN': '1.0'}, -+# 'SQ': [{'LN': 1575, 'SN': 'chr1'}, -+# {'LN': 1584, 'SN': 'chr2'}], -+# 'x1': {'A': 2, 'B': 5}, -+# 'x3': {'A': 6, 'B': 5}, -+# 'x2': {'A': 4, 'B': 5}} -+# -+# bamfile = os.path.join(BAM_DATADIR, "example_user_header.bam") -+# samfile = os.path.join(BAM_DATADIR, "example_user_header.sam") - - - class TestEmptyHeader(unittest.TestCase): ---- python-pysam.orig/tests/samtools_test.py -+++ python-pysam/tests/samtools_test.py -@@ -78,7 +78,7 @@ - # ("view -bT ex1.fa -o %(out)s_ex1.view2 ex1.sam", - "sort ex1.bam -o %(out)s_ex1.sort.bam", - "mpileup ex1.bam > %(out)s_ex1.pileup", -- "depth ex1.bam > %(out)s_ex1.depth", -+ #"depth ex1.bam > %(out)s_ex1.depth", - # TODO: issues with file naming - # "faidx ex1.fa; %(out)s_ex1.fa.fai", - "index ex1.bam %(out)s_ex1.bam.fai", -@@ -100,8 +100,8 @@ - "cat -o %(out)s_ex1.cat.bam ex1.bam ex1.bam", - "targetcut ex1.bam > %(out)s_ex1.targetcut", - "phase ex1.bam > %(out)s_ex1.phase", -- "import ex1.fa.fai ex1.sam.gz %(out)s_ex1.bam", -- "bam2fq ex1.bam > %(out)s_ex1.bam2fq", -+ #"view -bt ex1.fa.fai -o %(out)s_ex1.bam ex1.sam.gz", -+ #"bam2fq ex1.bam > %(out)s_ex1.bam2fq", - # TODO: not the same - # "pad2unpad -T ex1.fa ex2.bam > 
%(out)s_ex2.unpad", - # TODO: command line option problem diff --git a/debian/patches/samtools_v1.10_full b/debian/patches/samtools_v1.10_full deleted file mode 100644 index dce902d..0000000 --- a/debian/patches/samtools_v1.10_full +++ /dev/null @@ -1,39678 +0,0 @@ -Author: Michael R. Crusoe -Description: sync with samtools 1.10 - -use devtools/import.py and the contents of the samtools -Debian package with its patches fully applied - ---- python-pysam.orig/samtools/LICENSE -+++ python-pysam/samtools/LICENSE -@@ -1,6 +1,6 @@ - The MIT/Expat License - --Copyright (C) 2008-2018 Genome Research Ltd. -+Copyright (C) 2008-2019 Genome Research Ltd. - - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal ---- python-pysam.orig/samtools/README -+++ python-pysam/samtools/README -@@ -9,7 +9,7 @@ - The typical simple case of building Samtools using the HTSlib bundled within - this Samtools release tarball is done as follows: - -- cd .../samtools-1.9 # Within the unpacked release directory -+ cd .../samtools-1.10 # Within the unpacked release directory - ./configure - make - -@@ -21,7 +21,7 @@ - installation using the HTSlib bundled within this Samtools release tarball, - and building the various HTSlib utilities such as bgzip is done as follows: - -- cd .../samtools-1.9 # Within the unpacked release directory -+ cd .../samtools-1.10 # Within the unpacked release directory - ./configure --prefix=/path/to/location - make all all-htslib - make install install-htslib -@@ -48,7 +48,7 @@ - To build with plug-ins, you need to use the --enable-plugins configure option - as follows: - -- cd .../samtools-1.9 # Within the unpacked release directory -+ cd .../samtools-1.10 # Within the unpacked release directory - ./configure --enable-plugins --prefix=/path/to/location - make all all-htslib - make install install-htslib -@@ -66,8 +66,8 @@ - the source distribution instead of 
installing the package. In that case - you can use: - -- cd .../samtools-1.9 # Within the unpacked release directory -- ./configure --enable-plugins --with-plugin-path=$PWD/htslib-1.9 -+ cd .../samtools-1.10 # Within the unpacked release directory -+ ./configure --enable-plugins --with-plugin-path=$PWD/htslib-1.10 - make all all-htslib - - It is possible to override the built-in search path using the HTS_PATH ---- python-pysam.orig/samtools/bam.c -+++ python-pysam/samtools/bam.c -@@ -1,6 +1,6 @@ - /* bam.c -- BAM format. - -- Copyright (C) 2008-2013, 2015 Genome Research Ltd. -+ Copyright (C) 2008-2013, 2015, 2019 Genome Research Ltd. - Portions copyright (C) 2009-2012 Broad Institute. - - Author: Heng Li -@@ -30,7 +30,6 @@ - #include - #include "bam.h" - #include "htslib/kstring.h" --#include "sam_header.h" - - char *bam_format1(const bam_header_t *header, const bam1_t *b) - { -@@ -59,7 +58,7 @@ - char *s; - - if (b->core.tid < -1 || b->core.mtid < -1) return 0; -- if (header && (b->core.tid >= header->n_targets || b->core.mtid >= header->n_targets)) return 0; -+ if (header && (b->core.tid >= sam_hdr_nref(header) || b->core.mtid >= sam_hdr_nref(header))) return 0; - - if (b->data_len < b->core.l_qname) return 0; - s = memchr(bam1_qname(b), '\0', b->core.l_qname); -@@ -77,9 +76,8 @@ - // FIXME: we should also check the LB tag associated with each alignment - const char *bam_get_library(bam_header_t *h, const bam1_t *b) - { -- // Slow and inefficient. Rewrite once we get a proper header API. - const char *rg; -- char *cp = h->text; -+ kstring_t lib = { 0, 0, NULL }; - rg = (char *)bam_aux_get(b, "RG"); - - if (!rg) -@@ -87,50 +85,18 @@ - else - rg++; - -- // Header is guaranteed to be nul terminated, so this is valid. 
-- while (*cp) { -- char *ID, *LB; -- char last = '\t'; -- -- // Find a @RG line -- if (strncmp(cp, "@RG", 3) != 0) { -- while (*cp && *cp != '\n') cp++; // skip line -- if (*cp) cp++; -- continue; -- } -- -- // Find ID: and LB: keys -- cp += 4; -- ID = LB = NULL; -- while (*cp && *cp != '\n') { -- if (last == '\t') { -- if (strncmp(cp, "LB:", 3) == 0) -- LB = cp+3; -- else if (strncmp(cp, "ID:", 3) == 0) -- ID = cp+3; -- } -- last = *cp++; -- } -- -- if (!ID || !LB) -- continue; -- -- // Check it's the correct ID -- if (strncmp(rg, ID, strlen(rg)) != 0 || ID[strlen(rg)] != '\t') -- continue; -- -- // Valid until next query -- static char LB_text[1024]; -- for (cp = LB; *cp && *cp != '\t' && *cp != '\n'; cp++) -- ; -- strncpy(LB_text, LB, MIN(cp-LB, 1023)); -- LB_text[MIN(cp-LB, 1023)] = 0; -+ if (sam_hdr_find_tag_id(h, "RG", "ID", rg, "LB", &lib) < 0) -+ return NULL; - -- // Return it; valid until the next query. -- return LB_text; -- } -+ static char LB_text[1024]; -+ int len = lib.l < sizeof(LB_text) - 1 ? lib.l : sizeof(LB_text) - 1; -+ -+ memcpy(LB_text, lib.s, len); -+ LB_text[len] = 0; -+ -+ free(lib.s); - -- return NULL; -+ return LB_text; - } - - int bam_fetch(bamFile fp, const bam_index_t *idx, int tid, int beg, int end, void *data, bam_fetch_f func) ---- python-pysam.orig/samtools/bam.c.pysam.c -+++ python-pysam/samtools/bam.c.pysam.c -@@ -2,7 +2,7 @@ - - /* bam.c -- BAM format. - -- Copyright (C) 2008-2013, 2015 Genome Research Ltd. -+ Copyright (C) 2008-2013, 2015, 2019 Genome Research Ltd. - Portions copyright (C) 2009-2012 Broad Institute. 
- - Author: Heng Li -@@ -32,7 +32,6 @@ - #include - #include "bam.h" - #include "htslib/kstring.h" --#include "sam_header.h" - - char *bam_format1(const bam_header_t *header, const bam1_t *b) - { -@@ -61,7 +60,7 @@ - char *s; - - if (b->core.tid < -1 || b->core.mtid < -1) return 0; -- if (header && (b->core.tid >= header->n_targets || b->core.mtid >= header->n_targets)) return 0; -+ if (header && (b->core.tid >= sam_hdr_nref(header) || b->core.mtid >= sam_hdr_nref(header))) return 0; - - if (b->data_len < b->core.l_qname) return 0; - s = memchr(bam1_qname(b), '\0', b->core.l_qname); -@@ -79,9 +78,8 @@ - // FIXME: we should also check the LB tag associated with each alignment - const char *bam_get_library(bam_header_t *h, const bam1_t *b) - { -- // Slow and inefficient. Rewrite once we get a proper header API. - const char *rg; -- char *cp = h->text; -+ kstring_t lib = { 0, 0, NULL }; - rg = (char *)bam_aux_get(b, "RG"); - - if (!rg) -@@ -89,50 +87,18 @@ - else - rg++; - -- // Header is guaranteed to be nul terminated, so this is valid. -- while (*cp) { -- char *ID, *LB; -- char last = '\t'; -- -- // Find a @RG line -- if (strncmp(cp, "@RG", 3) != 0) { -- while (*cp && *cp != '\n') cp++; // skip line -- if (*cp) cp++; -- continue; -- } -- -- // Find ID: and LB: keys -- cp += 4; -- ID = LB = NULL; -- while (*cp && *cp != '\n') { -- if (last == '\t') { -- if (strncmp(cp, "LB:", 3) == 0) -- LB = cp+3; -- else if (strncmp(cp, "ID:", 3) == 0) -- ID = cp+3; -- } -- last = *cp++; -- } -- -- if (!ID || !LB) -- continue; -- -- // Check it's the correct ID -- if (strncmp(rg, ID, strlen(rg)) != 0 || ID[strlen(rg)] != '\t') -- continue; -- -- // Valid until next query -- static char LB_text[1024]; -- for (cp = LB; *cp && *cp != '\t' && *cp != '\n'; cp++) -- ; -- strncpy(LB_text, LB, MIN(cp-LB, 1023)); -- LB_text[MIN(cp-LB, 1023)] = 0; -+ if (sam_hdr_find_tag_id(h, "RG", "ID", rg, "LB", &lib) < 0) -+ return NULL; - -- // Return it; valid until the next query. 
-- return LB_text; -- } -+ static char LB_text[1024]; -+ int len = lib.l < sizeof(LB_text) - 1 ? lib.l : sizeof(LB_text) - 1; -+ -+ memcpy(LB_text, lib.s, len); -+ LB_text[len] = 0; -+ -+ free(lib.s); - -- return NULL; -+ return LB_text; - } - - int bam_fetch(bamFile fp, const bam_index_t *idx, int tid, int beg, int end, void *data, bam_fetch_f func) ---- python-pysam.orig/samtools/bam.h -+++ python-pysam/samtools/bam.h -@@ -1,6 +1,6 @@ - /* bam.h -- BAM API. - -- Copyright (C) 2008-2014 Genome Research Ltd. -+ Copyright (C) 2008-2014, 2019 Genome Research Ltd. - Portions copyright (C) 2010-2012 Broad Institute. - - Author: Heng Li -@@ -38,7 +38,7 @@ - @copyright Genome Research Ltd. - */ - --#define BAM_VERSION "1.9" -+#define BAM_VERSION "1.10" - - #include - #include -@@ -224,16 +224,6 @@ - // int sam_read1(tamFile fp, bam_header_t *header, bam1_t *b); - - /*! -- @abstract Read header information from a TAB-delimited list file. -- @param fn_list file name for the list -- @return a pointer to the header structure -- -- @discussion Each line in this file consists of chromosome name and -- the length of chromosome. -- */ -- bam_header_t *sam_header_read2(const char *fn_list); -- -- /*! - @abstract Read header from a SAM file (if present) - @param fp SAM file handler - @return pointer to header struct; 0 if no @SQ lines available -@@ -252,13 +242,13 @@ - @abstract Initialize a header structure. - @return the pointer to the header structure - */ -- static inline bam_header_t *bam_header_init(void) { return bam_hdr_init(); } -+ static inline bam_header_t *bam_header_init(void) { return sam_hdr_init(); } - - /*! - @abstract Destroy a header structure. - @param header pointer to the header - */ -- static inline void bam_header_destroy(bam_header_t *header) { bam_hdr_destroy(header); } -+ static inline void bam_header_destroy(bam_header_t *header) { sam_hdr_destroy(header); } - - /*! - @abstract Read a header structure from BAM. 
-@@ -277,7 +267,7 @@ - @param header pointer to the header structure - @return always 0 currently - */ -- static inline int bam_header_write(bamFile fp, const bam_header_t *header) { return bam_hdr_write(fp, header); } -+ static inline int bam_header_write(bamFile fp, bam_header_t *header) { return bam_hdr_write(fp, header); } - - /*! - @abstract Read an alignment from BAM. ---- python-pysam.orig/samtools/bam2bcf.c -+++ python-pysam/samtools/bam2bcf.c -@@ -1,7 +1,7 @@ - /* bam2bcf.c -- variant calling. - - Copyright (C) 2010-2012 Broad Institute. -- Copyright (C) 2012-2014 Genome Research Ltd. -+ Copyright (C) 2012-2015 Genome Research Ltd. - - Author: Heng Li - ---- python-pysam.orig/samtools/bam2bcf.c.pysam.c -+++ python-pysam/samtools/bam2bcf.c.pysam.c -@@ -3,7 +3,7 @@ - /* bam2bcf.c -- variant calling. - - Copyright (C) 2010-2012 Broad Institute. -- Copyright (C) 2012-2014 Genome Research Ltd. -+ Copyright (C) 2012-2015 Genome Research Ltd. - - Author: Heng Li - ---- python-pysam.orig/samtools/bam2bcf.h -+++ python-pysam/samtools/bam2bcf.h -@@ -1,7 +1,7 @@ - /* bam2bcf.h -- variant calling. - - Copyright (C) 2010-2012 Broad Institute. -- Copyright (C) 2012-2014 Genome Research Ltd. -+ Copyright (C) 2012-2014, 2019 Genome Research Ltd. 
- - Author: Heng Li - -@@ -99,7 +99,8 @@ - } bcf_callret1_t; - - typedef struct { -- int tid, pos; -+ int tid; -+ hts_pos_t pos; - bcf_hdr_t *bcf_hdr; - int a[5]; // alleles: ref, alt, alt2, alt3 - float qsum[5]; // for the QS tag -@@ -128,7 +129,7 @@ - int bcf_call_combine(int n, const bcf_callret1_t *calls, bcf_callaux_t *bca, int ref_base /*4-bit*/, bcf_call_t *call); - int bcf_call2bcf(bcf_call_t *bc, bcf1_t *b, bcf_callret1_t *bcr, int fmt_flag, - const bcf_callaux_t *bca, const char *ref); -- int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_callaux_t *bca, const char *ref, -+ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, hts_pos_t pos, bcf_callaux_t *bca, const char *ref, - const void *rghash); - void bcf_callaux_clean(bcf_callaux_t *bca, bcf_call_t *call); - ---- python-pysam.orig/samtools/bam2bcf_indel.c -+++ python-pysam/samtools/bam2bcf_indel.c -@@ -1,7 +1,7 @@ - /* bam2bcf_indel.c -- indel caller. - - Copyright (C) 2010, 2011 Broad Institute. -- Copyright (C) 2012-2014 Genome Research Ltd. -+ Copyright (C) 2012-2014, 2019 Genome Research Ltd. - - Author: Heng Li - -@@ -87,9 +87,10 @@ - kh_destroy(rg, hash); - } - --static int tpos2qpos(const bam1_core_t *c, const uint32_t *cigar, int32_t tpos, int is_left, int32_t *_tpos) -+static int tpos2qpos(const bam1_core_t *c, const uint32_t *cigar, hts_pos_t tpos, hts_pos_t is_left, hts_pos_t *_tpos) - { -- int k, x = c->pos, y = 0, last_y = 0; -+ int k, y = 0, last_y = 0; -+ hts_pos_t x = c->pos; - *_tpos = c->pos; - for (k = 0; k < c->n_cigar; ++k) { - int op = cigar[k] & BAM_CIGAR_MASK; -@@ -124,9 +125,10 @@ - return q < qh? 
q : qh; - } - --static inline int est_indelreg(int pos, const char *ref, int l, char *ins4) -+static inline int est_indelreg(hts_pos_t pos, const char *ref, int l, char *ins4) - { -- int i, j, max = 0, max_i = pos, score = 0; -+ int j, max = 0, score = 0; -+ hts_pos_t i, max_i = pos; - l = abs(l); - for (i = pos + 1, j = 0; ref[i]; ++i, ++j) { - if (ins4) score += (toupper(ref[i]) != "ACGTN"[(int)ins4[j%l]])? -10 : 1; -@@ -146,11 +148,12 @@ - - 8: estimated sequence quality .. (aux>>8)&0xff - - 8: indel quality .. aux&0xff - */ --int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_callaux_t *bca, const char *ref, -+int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, hts_pos_t pos, bcf_callaux_t *bca, const char *ref, - const void *rghash) - { -- int i, s, j, k, t, n_types, *types, max_rd_len, left, right, max_ins, *score1, *score2, max_ref2; -+ int s, k, t, n_types, *types, max_rd_len, max_ins, *score1, *score2, max_ref2; - int N, K, l_run, ref_type, n_alt; -+ hts_pos_t i, j, left, right; - char *inscns = 0, *ref2, *query, **ref_sample; - khash_t(rg) *hash = (khash_t(rg)*)rghash; - if (ref == 0 || bca == 0) return -1; -@@ -225,7 +228,7 @@ - free(aux); - // TODO revisit how/whether to control printing this warning - if (hts_verbose >= 2) -- fprintf(stderr, "[%s] excessive INDEL alleles at position %d. Skip the position.\n", __func__, pos + 1); -+ fprintf(stderr, "[%s] excessive INDEL alleles at position %"PRIhts_pos". 
Skip the position.\n", __func__, pos + 1); - return -1; - } - types = (int*)calloc(n_types, sizeof(int)); -@@ -274,7 +277,7 @@ - bam1_t *b = p->b; - uint32_t *cigar = bam_get_cigar(b); - uint8_t *seq = bam_get_seq(b); -- int x = b->core.pos, y = 0; -+ hts_pos_t x = b->core.pos, y = 0; - for (k = 0; k < b->core.n_cigar; ++k) { - int op = cigar[k]&0xf; - int j, l = cigar[k]>>4; -@@ -382,7 +385,8 @@ - // align each read to ref2 - for (i = 0; i < n_plp[s]; ++i, ++K) { - bam_pileup1_t *p = plp[s] + i; -- int qbeg, qend, tbeg, tend, sc, kk; -+ int qbeg, qend, sc, kk; -+ hts_pos_t tbeg, tend; - uint8_t *seq = bam_get_seq(p->b); - uint32_t *cigar = bam_get_cigar(p->b); - if (p->b->core.flag&4) continue; // unmapped reads ---- python-pysam.orig/samtools/bam2bcf_indel.c.pysam.c -+++ python-pysam/samtools/bam2bcf_indel.c.pysam.c -@@ -3,7 +3,7 @@ - /* bam2bcf_indel.c -- indel caller. - - Copyright (C) 2010, 2011 Broad Institute. -- Copyright (C) 2012-2014 Genome Research Ltd. -+ Copyright (C) 2012-2014, 2019 Genome Research Ltd. - - Author: Heng Li - -@@ -89,9 +89,10 @@ - kh_destroy(rg, hash); - } - --static int tpos2qpos(const bam1_core_t *c, const uint32_t *cigar, int32_t tpos, int is_left, int32_t *_tpos) -+static int tpos2qpos(const bam1_core_t *c, const uint32_t *cigar, hts_pos_t tpos, hts_pos_t is_left, hts_pos_t *_tpos) - { -- int k, x = c->pos, y = 0, last_y = 0; -+ int k, y = 0, last_y = 0; -+ hts_pos_t x = c->pos; - *_tpos = c->pos; - for (k = 0; k < c->n_cigar; ++k) { - int op = cigar[k] & BAM_CIGAR_MASK; -@@ -126,9 +127,10 @@ - return q < qh? q : qh; - } - --static inline int est_indelreg(int pos, const char *ref, int l, char *ins4) -+static inline int est_indelreg(hts_pos_t pos, const char *ref, int l, char *ins4) - { -- int i, j, max = 0, max_i = pos, score = 0; -+ int j, max = 0, score = 0; -+ hts_pos_t i, max_i = pos; - l = abs(l); - for (i = pos + 1, j = 0; ref[i]; ++i, ++j) { - if (ins4) score += (toupper(ref[i]) != "ACGTN"[(int)ins4[j%l]])? 
-10 : 1; -@@ -148,11 +150,12 @@ - - 8: estimated sequence quality .. (aux>>8)&0xff - - 8: indel quality .. aux&0xff - */ --int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_callaux_t *bca, const char *ref, -+int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, hts_pos_t pos, bcf_callaux_t *bca, const char *ref, - const void *rghash) - { -- int i, s, j, k, t, n_types, *types, max_rd_len, left, right, max_ins, *score1, *score2, max_ref2; -+ int s, k, t, n_types, *types, max_rd_len, max_ins, *score1, *score2, max_ref2; - int N, K, l_run, ref_type, n_alt; -+ hts_pos_t i, j, left, right; - char *inscns = 0, *ref2, *query, **ref_sample; - khash_t(rg) *hash = (khash_t(rg)*)rghash; - if (ref == 0 || bca == 0) return -1; -@@ -227,7 +230,7 @@ - free(aux); - // TODO revisit how/whether to control printing this warning - if (hts_verbose >= 2) -- fprintf(samtools_stderr, "[%s] excessive INDEL alleles at position %d. Skip the position.\n", __func__, pos + 1); -+ fprintf(samtools_stderr, "[%s] excessive INDEL alleles at position %"PRIhts_pos". Skip the position.\n", __func__, pos + 1); - return -1; - } - types = (int*)calloc(n_types, sizeof(int)); -@@ -276,7 +279,7 @@ - bam1_t *b = p->b; - uint32_t *cigar = bam_get_cigar(b); - uint8_t *seq = bam_get_seq(b); -- int x = b->core.pos, y = 0; -+ hts_pos_t x = b->core.pos, y = 0; - for (k = 0; k < b->core.n_cigar; ++k) { - int op = cigar[k]&0xf; - int j, l = cigar[k]>>4; -@@ -384,7 +387,8 @@ - // align each read to ref2 - for (i = 0; i < n_plp[s]; ++i, ++K) { - bam_pileup1_t *p = plp[s] + i; -- int qbeg, qend, tbeg, tend, sc, kk; -+ int qbeg, qend, sc, kk; -+ hts_pos_t tbeg, tend; - uint8_t *seq = bam_get_seq(p->b); - uint32_t *cigar = bam_get_cigar(p->b); - if (p->b->core.flag&4) continue; // unmapped reads ---- python-pysam.orig/samtools/bam2depth.c -+++ python-pysam/samtools/bam2depth.c -@@ -1,7 +1,7 @@ - /* bam2depth.c -- depth subcommand. - - Copyright (C) 2011, 2012 Broad Institute. 
-- Copyright (C) 2012-2014 Genome Research Ltd. -+ Copyright (C) 2012-2016, 2018, 2019 Genome Research Ltd. - - Author: Heng Li - -@@ -39,20 +39,19 @@ - #include - #include "htslib/sam.h" - #include "samtools.h" -+#include "bedidx.h" - #include "sam_opts.h" - -+#define BAM_FMAX ((BAM_FSUPPLEMENTARY << 1) - 1) -+ - typedef struct { // auxiliary data structure - samFile *fp; // the file handle -- bam_hdr_t *hdr; // the file header -+ sam_hdr_t *hdr; // the file header - hts_itr_t *iter; // NULL if a region not specified - int min_mapQ, min_len; // mapQ filter; length filter -+ uint32_t flags; // read filtering flags - } aux_t; - --void *bed_read(const char *fn); // read a BED or position list file --void bed_destroy(void *_h); // destroy the BED data structure --int bed_overlap(const void *_h, const char *chr, int beg, int end); // test if chr:beg-end overlaps --int bed_query(const void *_h, const char *chr, int pos, int *beg, int *end); -- - // This function reads a BAM alignment from one BAM file. - static int read_bam(void *data, bam1_t *b) // read level filters better go here to avoid pileup - { -@@ -62,7 +61,7 @@ - { - ret = aux->iter? sam_itr_next(aux->fp, aux->iter, b) : sam_read1(aux->fp, aux->hdr, b); - if ( ret<0 ) break; -- if ( b->core.flag & (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP) ) continue; -+ if ( b->core.flag & aux->flags) continue; - if ( (int)b->core.qual < aux->min_mapQ ) continue; - if ( aux->min_len && bam_cigar2qlen(b->core.n_cigar, bam_get_cigar(b)) < aux->min_len ) continue; - break; -@@ -79,15 +78,21 @@ - fprintf(stderr, " -a output all positions (including zero depth)\n"); - fprintf(stderr, " -a -a (or -aa) output absolutely all positions, including unused ref. 
sequences\n"); - fprintf(stderr, " -b list of positions or regions\n"); -+ fprintf(stderr, " -X use customized index files\n"); - fprintf(stderr, " -f list of input BAM filenames, one per line [null]\n"); -+ fprintf(stderr, " -H print a file header\n"); - fprintf(stderr, " -l read length threshold (ignore reads shorter than ) [0]\n"); - fprintf(stderr, " -d/-m maximum coverage depth [8000]. If 0, depth is set to the maximum\n" - " integer value, effectively removing any depth limit.\n"); // the htslib's default -+ fprintf(stderr, " -o FILE where to write output to [stdout]\n"); - fprintf(stderr, " -q base quality threshold [0]\n"); - fprintf(stderr, " -Q mapping quality threshold [0]\n"); - fprintf(stderr, " -r region\n"); -+ fprintf(stderr, " -g include reads that have any of the specified flags set [0]\n"); -+ fprintf(stderr, " -G filter out reads that have any of the specified flags set" -+ " [UNMAP,SECONDARY,QCFAIL,DUP]\n"); - -- sam_global_opt_help(stderr, "-.--.-"); -+ sam_global_opt_help(stderr, "-.--.--."); - - fprintf(stderr, "\n"); - fprintf(stderr, "The output is a simple tab-separated table with three columns: reference name,\n"); -@@ -95,21 +100,27 @@ - fprintf(stderr, "omitted by default; see the -a option.\n"); - fprintf(stderr, "\n"); - -- return 1; -+ return EXIT_FAILURE; - } - - int main_depth(int argc, char *argv[]) - { -- int i, n, tid, reg_tid, beg, end, pos, *n_plp, baseQ = 0, mapQ = 0, min_len = 0; -+ int i, n, tid, reg_tid, *n_plp, baseQ = 0, mapQ = 0, min_len = 0, has_index_file = 0; -+ hts_pos_t beg, end, pos, last_pos = -1; - int all = 0, status = EXIT_SUCCESS, nfiles, max_depth = -1; - const bam_pileup1_t **plp; - char *reg = 0; // specified region - void *bed = 0; // BED data structure - char *file_list = NULL, **fn = NULL; -- bam_hdr_t *h = NULL; // BAM header of the 1st input -+ sam_hdr_t *h = NULL; // BAM header of the 1st input - aux_t **data; - bam_mplp_t mplp; -- int last_pos = -1, last_tid = -1, ret; -+ int last_tid = -1, ret; -+ 
int print_header = 0; -+ char *output_file = NULL; -+ FILE *file_out = stdout; -+ uint32_t flags = (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP); -+ int tflags = 0; - - sam_global_args ga = SAM_GLOBAL_ARGS_INIT; - static const struct option lopts[] = { -@@ -118,19 +129,41 @@ - }; - - // parse the command line -- while ((n = getopt_long(argc, argv, "r:b:q:Q:l:f:am:d:", lopts, NULL)) >= 0) { -+ while ((n = getopt_long(argc, argv, "r:b:Xq:Q:l:f:am:d:Ho:g:G:", lopts, NULL)) >= 0) { - switch (n) { - case 'l': min_len = atoi(optarg); break; // minimum query length - case 'r': reg = strdup(optarg); break; // parsing a region requires a BAM header - case 'b': - bed = bed_read(optarg); // BED or position list file can be parsed now -- if (!bed) { print_error_errno("depth", "Could not read file \"%s\"", optarg); return 1; } -+ if (!bed) { -+ print_error_errno("depth", "Could not read file \"%s\"", optarg); -+ return EXIT_FAILURE; -+ } - break; -+ case 'X': has_index_file = 1; break; - case 'q': baseQ = atoi(optarg); break; // base quality threshold - case 'Q': mapQ = atoi(optarg); break; // mapping quality threshold - case 'f': file_list = optarg; break; - case 'a': all++; break; - case 'd': case 'm': max_depth = atoi(optarg); break; // maximum coverage depth -+ case 'H': print_header = 1; break; -+ case 'o': output_file = optarg; break; -+ case 'g': -+ tflags = bam_str2flag(optarg); -+ if (tflags < 0 || tflags > BAM_FMAX) { -+ print_error_errno("depth", "Flag value \"%s\" is not supported", optarg); -+ return 1; -+ } -+ flags &= ~tflags; -+ break; -+ case 'G': -+ tflags = bam_str2flag(optarg); -+ if (tflags < 0 || tflags > BAM_FMAX) { -+ print_error_errno("depth", "Flag value \"%s\" is not supported", optarg); -+ return 1; -+ } -+ flags |= tflags; -+ break; - default: if (parse_sam_global_opt(n, optarg, lopts, &ga) == 0) break; - /* else fall-through */ - case '?': return usage(); -@@ -139,18 +172,40 @@ - if (optind == argc && !file_list) - return usage(); - -+ /* 
output file provided by user */ -+ if (output_file != NULL && strcmp(output_file,"-")!=0) { -+ file_out = fopen( output_file, "w" ); -+ if (file_out == NULL) { -+ print_error_errno("depth", "Cannot open \"%s\" for writing.", output_file); -+ return EXIT_FAILURE; -+ } -+ } -+ -+ - // initialize the auxiliary data structures - if (file_list) - { -- if ( read_file_list(file_list,&nfiles,&fn) ) return 1; -+ if (has_index_file) { -+ print_error("depth", "The -f option cannot be combined with -X"); -+ return 1; -+ } -+ if ( read_file_list(file_list,&nfiles,&fn) ) return EXIT_FAILURE; - n = nfiles; - argv = fn; - optind = 0; - } -- else -- n = argc - optind; // the number of BAMs on the command line -+ else if (has_index_file) { // Calculate # of input BAM files -+ if ((argc - optind) % 2 != 0) { -+ fprintf(stderr, "Error: Odd number of filenames detected! Each BAM file should have an index file\n"); -+ return 1; -+ } -+ n = (argc - optind) / 2; -+ } else { -+ n = argc - optind; -+ } - data = calloc(n, sizeof(aux_t*)); // data[i] for the i-th input -- reg_tid = 0; beg = 0; end = INT_MAX; // set the default region -+ reg_tid = 0; beg = 0; end = HTS_POS_MAX; // set the default region -+ - for (i = 0; i < n; ++i) { - int rf; - data[i] = calloc(1, sizeof(aux_t)); -@@ -163,24 +218,32 @@ - rf = SAM_FLAG | SAM_RNAME | SAM_POS | SAM_MAPQ | SAM_CIGAR | SAM_SEQ; - if (baseQ) rf |= SAM_QUAL; - if (hts_set_opt(data[i]->fp, CRAM_OPT_REQUIRED_FIELDS, rf)) { -- fprintf(stderr, "Failed to set CRAM_OPT_REQUIRED_FIELDS value\n"); -- return 1; -+ print_error_errno("depth", "Failed to set CRAM_OPT_REQUIRED_FIELDS value"); -+ status = EXIT_FAILURE; -+ goto depth_end; - } - if (hts_set_opt(data[i]->fp, CRAM_OPT_DECODE_MD, 0)) { -- fprintf(stderr, "Failed to set CRAM_OPT_DECODE_MD value\n"); -- return 1; -+ print_error_errno("depth", "Failed to set CRAM_OPT_DECODE_MD value"); -+ status = EXIT_FAILURE; -+ goto depth_end; - } - data[i]->min_mapQ = mapQ; // set the mapQ filter - data[i]->min_len = 
min_len; // set the qlen filter - data[i]->hdr = sam_hdr_read(data[i]->fp); // read the BAM header - if (data[i]->hdr == NULL) { -- fprintf(stderr, "Couldn't read header for \"%s\"\n", -- argv[optind+i]); -+ print_error_errno("depth", "Couldn't read header for \"%s\"", -+ argv[optind+i]); - status = EXIT_FAILURE; - goto depth_end; - } - if (reg) { // if a region is specified -- hts_idx_t *idx = sam_index_load(data[i]->fp, argv[optind+i]); // load the index -+ hts_idx_t *idx = NULL; -+ // If index filename has not been specfied, look in BAM folder -+ if (has_index_file) { -+ idx = sam_index_load2(data[i]->fp, argv[optind+i], argv[optind+i+n]); // load the index -+ } else { -+ idx = sam_index_load(data[i]->fp, argv[optind+i]); -+ } - if (idx == NULL) { - print_error("depth", "can't load index for \"%s\"", argv[optind+i]); - status = EXIT_FAILURE; -@@ -194,8 +257,16 @@ - goto depth_end; - } - } -+ data[i]->flags = flags; - } -- -+ if (print_header) { -+ fputs("#CHROM\tPOS", file_out); -+ for (i = 0; i < n; ++i) { -+ fputc('\t', file_out); -+ fputs(argv[optind+i], file_out); -+ } -+ fputc('\n', file_out); -+ } - h = data[0]->hdr; // easy access to the header of the 1st BAM - if (reg) { - beg = data[0]->iter->beg; // and to the parsed region coordinates -@@ -211,21 +282,22 @@ - bam_mplp_set_maxcnt(mplp,INT_MAX); - n_plp = calloc(n, sizeof(int)); // n_plp[i] is the number of covering reads from the i-th BAM - plp = calloc(n, sizeof(bam_pileup1_t*)); // plp[i] points to the array of covering reads (internal in mplp) -- while ((ret=bam_mplp_auto(mplp, &tid, &pos, n_plp, plp)) > 0) { // come to the next covered position -+ while ((ret=bam_mplp64_auto(mplp, &tid, &pos, n_plp, plp)) > 0) { // come to the next covered position - if (pos < beg || pos >= end) continue; // out of range; skip -- if (tid >= h->n_targets) continue; // diff number of @SQ lines per file? -+ if (tid >= sam_hdr_nref(h)) continue; // diff number of @SQ lines per file? 
- if (all) { - while (tid > last_tid) { - if (last_tid >= 0 && !reg) { - // Deal with remainder or entirety of last tid. -- while (++last_pos < h->target_len[last_tid]) { -+ while (++last_pos < sam_hdr_tid2len(h, last_tid)) { - // Horribly inefficient, but the bed API is an obfuscated black box. -- if (bed && bed_overlap(bed, h->target_name[last_tid], last_pos, last_pos + 1) == 0) -+ if (bed && bed_overlap(bed, sam_hdr_tid2name(h, last_tid), last_pos, last_pos + 1) == 0) - continue; -- fputs(h->target_name[last_tid], stdout); printf("\t%d", last_pos+1); -+ fputs(sam_hdr_tid2name(h, last_tid), file_out); -+ fprintf(file_out, "\t%"PRIhts_pos, last_pos+1); - for (i = 0; i < n; i++) -- putchar('\t'), putchar('0'); -- putchar('\n'); -+ fputc('\t', file_out), fputc('0', file_out); -+ fputc('\n', file_out); - } - } - last_tid++; -@@ -237,19 +309,21 @@ - // Deal with missing portion of current tid - while (++last_pos < pos) { - if (last_pos < beg) continue; // out of range; skip -- if (bed && bed_overlap(bed, h->target_name[tid], last_pos, last_pos + 1) == 0) -+ if (bed && bed_overlap(bed, sam_hdr_tid2name(h, tid), last_pos, last_pos + 1) == 0) - continue; -- fputs(h->target_name[tid], stdout); printf("\t%d", last_pos+1); -+ fputs(sam_hdr_tid2name(h, tid), file_out); -+ fprintf(file_out, "\t%"PRIhts_pos, last_pos+1); - for (i = 0; i < n; i++) -- putchar('\t'), putchar('0'); -- putchar('\n'); -+ fputc('\t', file_out), fputc('0', file_out); -+ fputc('\n', file_out); - } - - last_tid = tid; - last_pos = pos; - } -- if (bed && bed_overlap(bed, h->target_name[tid], pos, pos + 1) == 0) continue; -- fputs(h->target_name[tid], stdout); printf("\t%d", pos+1); // a customized printf() would be faster -+ if (bed && bed_overlap(bed, sam_hdr_tid2name(h, tid), pos, pos + 1) == 0) continue; -+ fputs(sam_hdr_tid2name(h, tid), file_out); -+ fprintf(file_out, "\t%"PRIhts_pos, pos+1); // a customized printf() would be faster - for (i = 0; i < n; ++i) { // base level filters have to go here - 
int j, m = 0; - for (j = 0; j < n_plp[i]; ++j) { -@@ -258,9 +332,9 @@ - else if (p->qpos < p->b->core.l_qseq && - bam_get_qual(p->b)[p->qpos] < baseQ) ++m; // low base quality - } -- printf("\t%d", n_plp[i] - m); // this the depth to output -+ fprintf(file_out, "\t%d", n_plp[i] - m); // this the depth to output - } -- putchar('\n'); -+ fputc('\n', file_out); - } - if (ret < 0) status = EXIT_FAILURE; - free(n_plp); free(plp); -@@ -268,19 +342,20 @@ - - if (all) { - // Handle terminating region -- if (last_tid < 0 && reg && all > 1) { -+ if (last_tid < 0 && reg) { - last_tid = reg_tid; - last_pos = beg-1; - } -- while (last_tid >= 0 && last_tid < h->n_targets) { -- while (++last_pos < h->target_len[last_tid]) { -+ while (last_tid >= 0 && last_tid < sam_hdr_nref(h)) { -+ while (++last_pos < sam_hdr_tid2len(h, last_tid)) { - if (last_pos >= end) break; -- if (bed && bed_overlap(bed, h->target_name[last_tid], last_pos, last_pos + 1) == 0) -+ if (bed && bed_overlap(bed, sam_hdr_tid2name(h, last_tid), last_pos, last_pos + 1) == 0) - continue; -- fputs(h->target_name[last_tid], stdout); printf("\t%d", last_pos+1); -+ fputs(sam_hdr_tid2name(h, last_tid), file_out); -+ fprintf(file_out, "\t%"PRIhts_pos, last_pos+1); - for (i = 0; i < n; i++) -- putchar('\t'), putchar('0'); -- putchar('\n'); -+ fputc('\t', file_out), fputc('0', file_out); -+ fputc('\n', file_out); - } - last_tid++; - last_pos = -1; -@@ -290,8 +365,17 @@ - } - - depth_end: -+ if (fclose(file_out) != 0) { -+ if (status == EXIT_SUCCESS) { -+ print_error_errno("depth", "error on closing \"%s\"", -+ (output_file && strcmp(output_file, "-") != 0 -+ ? 
output_file : "stdout")); -+ status = EXIT_FAILURE; -+ } -+ } -+ - for (i = 0; i < n && data[i]; ++i) { -- bam_hdr_destroy(data[i]->hdr); -+ sam_hdr_destroy(data[i]->hdr); - if (data[i]->fp) sam_close(data[i]->fp); - hts_itr_destroy(data[i]->iter); - free(data[i]); ---- python-pysam.orig/samtools/bam2depth.c.pysam.c -+++ python-pysam/samtools/bam2depth.c.pysam.c -@@ -3,7 +3,7 @@ - /* bam2depth.c -- depth subcommand. - - Copyright (C) 2011, 2012 Broad Institute. -- Copyright (C) 2012-2014 Genome Research Ltd. -+ Copyright (C) 2012-2016, 2018, 2019 Genome Research Ltd. - - Author: Heng Li - -@@ -41,20 +41,19 @@ - #include - #include "htslib/sam.h" - #include "samtools.h" -+#include "bedidx.h" - #include "sam_opts.h" - -+#define BAM_FMAX ((BAM_FSUPPLEMENTARY << 1) - 1) -+ - typedef struct { // auxiliary data structure - samFile *fp; // the file handle -- bam_hdr_t *hdr; // the file header -+ sam_hdr_t *hdr; // the file header - hts_itr_t *iter; // NULL if a region not specified - int min_mapQ, min_len; // mapQ filter; length filter -+ uint32_t flags; // read filtering flags - } aux_t; - --void *bed_read(const char *fn); // read a BED or position list file --void bed_destroy(void *_h); // destroy the BED data structure --int bed_overlap(const void *_h, const char *chr, int beg, int end); // test if chr:beg-end overlaps --int bed_query(const void *_h, const char *chr, int pos, int *beg, int *end); -- - // This function reads a BAM alignment from one BAM file. - static int read_bam(void *data, bam1_t *b) // read level filters better go here to avoid pileup - { -@@ -64,7 +63,7 @@ - { - ret = aux->iter? 
sam_itr_next(aux->fp, aux->iter, b) : sam_read1(aux->fp, aux->hdr, b); - if ( ret<0 ) break; -- if ( b->core.flag & (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP) ) continue; -+ if ( b->core.flag & aux->flags) continue; - if ( (int)b->core.qual < aux->min_mapQ ) continue; - if ( aux->min_len && bam_cigar2qlen(b->core.n_cigar, bam_get_cigar(b)) < aux->min_len ) continue; - break; -@@ -81,15 +80,21 @@ - fprintf(samtools_stderr, " -a output all positions (including zero depth)\n"); - fprintf(samtools_stderr, " -a -a (or -aa) output absolutely all positions, including unused ref. sequences\n"); - fprintf(samtools_stderr, " -b list of positions or regions\n"); -+ fprintf(samtools_stderr, " -X use customized index files\n"); - fprintf(samtools_stderr, " -f list of input BAM filenames, one per line [null]\n"); -+ fprintf(samtools_stderr, " -H print a file header\n"); - fprintf(samtools_stderr, " -l read length threshold (ignore reads shorter than ) [0]\n"); - fprintf(samtools_stderr, " -d/-m maximum coverage depth [8000]. 
If 0, depth is set to the maximum\n" - " integer value, effectively removing any depth limit.\n"); // the htslib's default -+ fprintf(samtools_stderr, " -o FILE where to write output to [samtools_stdout]\n"); - fprintf(samtools_stderr, " -q base quality threshold [0]\n"); - fprintf(samtools_stderr, " -Q mapping quality threshold [0]\n"); - fprintf(samtools_stderr, " -r region\n"); -+ fprintf(samtools_stderr, " -g include reads that have any of the specified flags set [0]\n"); -+ fprintf(samtools_stderr, " -G filter out reads that have any of the specified flags set" -+ " [UNMAP,SECONDARY,QCFAIL,DUP]\n"); - -- sam_global_opt_help(samtools_stderr, "-.--.-"); -+ sam_global_opt_help(samtools_stderr, "-.--.--."); - - fprintf(samtools_stderr, "\n"); - fprintf(samtools_stderr, "The output is a simple tab-separated table with three columns: reference name,\n"); -@@ -97,21 +102,27 @@ - fprintf(samtools_stderr, "omitted by default; see the -a option.\n"); - fprintf(samtools_stderr, "\n"); - -- return 1; -+ return EXIT_FAILURE; - } - - int main_depth(int argc, char *argv[]) - { -- int i, n, tid, reg_tid, beg, end, pos, *n_plp, baseQ = 0, mapQ = 0, min_len = 0; -+ int i, n, tid, reg_tid, *n_plp, baseQ = 0, mapQ = 0, min_len = 0, has_index_file = 0; -+ hts_pos_t beg, end, pos, last_pos = -1; - int all = 0, status = EXIT_SUCCESS, nfiles, max_depth = -1; - const bam_pileup1_t **plp; - char *reg = 0; // specified region - void *bed = 0; // BED data structure - char *file_list = NULL, **fn = NULL; -- bam_hdr_t *h = NULL; // BAM header of the 1st input -+ sam_hdr_t *h = NULL; // BAM header of the 1st input - aux_t **data; - bam_mplp_t mplp; -- int last_pos = -1, last_tid = -1, ret; -+ int last_tid = -1, ret; -+ int print_header = 0; -+ char *output_file = NULL; -+ FILE *file_out = samtools_stdout; -+ uint32_t flags = (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP); -+ int tflags = 0; - - sam_global_args ga = SAM_GLOBAL_ARGS_INIT; - static const struct option lopts[] = { -@@ 
-120,19 +131,41 @@ - }; - - // parse the command line -- while ((n = getopt_long(argc, argv, "r:b:q:Q:l:f:am:d:", lopts, NULL)) >= 0) { -+ while ((n = getopt_long(argc, argv, "r:b:Xq:Q:l:f:am:d:Ho:g:G:", lopts, NULL)) >= 0) { - switch (n) { - case 'l': min_len = atoi(optarg); break; // minimum query length - case 'r': reg = strdup(optarg); break; // parsing a region requires a BAM header - case 'b': - bed = bed_read(optarg); // BED or position list file can be parsed now -- if (!bed) { print_error_errno("depth", "Could not read file \"%s\"", optarg); return 1; } -+ if (!bed) { -+ print_error_errno("depth", "Could not read file \"%s\"", optarg); -+ return EXIT_FAILURE; -+ } - break; -+ case 'X': has_index_file = 1; break; - case 'q': baseQ = atoi(optarg); break; // base quality threshold - case 'Q': mapQ = atoi(optarg); break; // mapping quality threshold - case 'f': file_list = optarg; break; - case 'a': all++; break; - case 'd': case 'm': max_depth = atoi(optarg); break; // maximum coverage depth -+ case 'H': print_header = 1; break; -+ case 'o': output_file = optarg; break; -+ case 'g': -+ tflags = bam_str2flag(optarg); -+ if (tflags < 0 || tflags > BAM_FMAX) { -+ print_error_errno("depth", "Flag value \"%s\" is not supported", optarg); -+ return 1; -+ } -+ flags &= ~tflags; -+ break; -+ case 'G': -+ tflags = bam_str2flag(optarg); -+ if (tflags < 0 || tflags > BAM_FMAX) { -+ print_error_errno("depth", "Flag value \"%s\" is not supported", optarg); -+ return 1; -+ } -+ flags |= tflags; -+ break; - default: if (parse_sam_global_opt(n, optarg, lopts, &ga) == 0) break; - /* else fall-through */ - case '?': return usage(); -@@ -141,18 +174,40 @@ - if (optind == argc && !file_list) - return usage(); - -+ /* output file provided by user */ -+ if (output_file != NULL && strcmp(output_file,"-")!=0) { -+ file_out = fopen( output_file, "w" ); -+ if (file_out == NULL) { -+ print_error_errno("depth", "Cannot open \"%s\" for writing.", output_file); -+ return EXIT_FAILURE; -+ 
} -+ } -+ -+ - // initialize the auxiliary data structures - if (file_list) - { -- if ( read_file_list(file_list,&nfiles,&fn) ) return 1; -+ if (has_index_file) { -+ print_error("depth", "The -f option cannot be combined with -X"); -+ return 1; -+ } -+ if ( read_file_list(file_list,&nfiles,&fn) ) return EXIT_FAILURE; - n = nfiles; - argv = fn; - optind = 0; - } -- else -- n = argc - optind; // the number of BAMs on the command line -+ else if (has_index_file) { // Calculate # of input BAM files -+ if ((argc - optind) % 2 != 0) { -+ fprintf(samtools_stderr, "Error: Odd number of filenames detected! Each BAM file should have an index file\n"); -+ return 1; -+ } -+ n = (argc - optind) / 2; -+ } else { -+ n = argc - optind; -+ } - data = calloc(n, sizeof(aux_t*)); // data[i] for the i-th input -- reg_tid = 0; beg = 0; end = INT_MAX; // set the default region -+ reg_tid = 0; beg = 0; end = HTS_POS_MAX; // set the default region -+ - for (i = 0; i < n; ++i) { - int rf; - data[i] = calloc(1, sizeof(aux_t)); -@@ -165,24 +220,32 @@ - rf = SAM_FLAG | SAM_RNAME | SAM_POS | SAM_MAPQ | SAM_CIGAR | SAM_SEQ; - if (baseQ) rf |= SAM_QUAL; - if (hts_set_opt(data[i]->fp, CRAM_OPT_REQUIRED_FIELDS, rf)) { -- fprintf(samtools_stderr, "Failed to set CRAM_OPT_REQUIRED_FIELDS value\n"); -- return 1; -+ print_error_errno("depth", "Failed to set CRAM_OPT_REQUIRED_FIELDS value"); -+ status = EXIT_FAILURE; -+ goto depth_end; - } - if (hts_set_opt(data[i]->fp, CRAM_OPT_DECODE_MD, 0)) { -- fprintf(samtools_stderr, "Failed to set CRAM_OPT_DECODE_MD value\n"); -- return 1; -+ print_error_errno("depth", "Failed to set CRAM_OPT_DECODE_MD value"); -+ status = EXIT_FAILURE; -+ goto depth_end; - } - data[i]->min_mapQ = mapQ; // set the mapQ filter - data[i]->min_len = min_len; // set the qlen filter - data[i]->hdr = sam_hdr_read(data[i]->fp); // read the BAM header - if (data[i]->hdr == NULL) { -- fprintf(samtools_stderr, "Couldn't read header for \"%s\"\n", -- argv[optind+i]); -+ 
print_error_errno("depth", "Couldn't read header for \"%s\"", -+ argv[optind+i]); - status = EXIT_FAILURE; - goto depth_end; - } - if (reg) { // if a region is specified -- hts_idx_t *idx = sam_index_load(data[i]->fp, argv[optind+i]); // load the index -+ hts_idx_t *idx = NULL; -+ // If index filename has not been specfied, look in BAM folder -+ if (has_index_file) { -+ idx = sam_index_load2(data[i]->fp, argv[optind+i], argv[optind+i+n]); // load the index -+ } else { -+ idx = sam_index_load(data[i]->fp, argv[optind+i]); -+ } - if (idx == NULL) { - print_error("depth", "can't load index for \"%s\"", argv[optind+i]); - status = EXIT_FAILURE; -@@ -196,8 +259,16 @@ - goto depth_end; - } - } -+ data[i]->flags = flags; - } -- -+ if (print_header) { -+ fputs("#CHROM\tPOS", file_out); -+ for (i = 0; i < n; ++i) { -+ fputc('\t', file_out); -+ fputs(argv[optind+i], file_out); -+ } -+ fputc('\n', file_out); -+ } - h = data[0]->hdr; // easy access to the header of the 1st BAM - if (reg) { - beg = data[0]->iter->beg; // and to the parsed region coordinates -@@ -213,21 +284,22 @@ - bam_mplp_set_maxcnt(mplp,INT_MAX); - n_plp = calloc(n, sizeof(int)); // n_plp[i] is the number of covering reads from the i-th BAM - plp = calloc(n, sizeof(bam_pileup1_t*)); // plp[i] points to the array of covering reads (internal in mplp) -- while ((ret=bam_mplp_auto(mplp, &tid, &pos, n_plp, plp)) > 0) { // come to the next covered position -+ while ((ret=bam_mplp64_auto(mplp, &tid, &pos, n_plp, plp)) > 0) { // come to the next covered position - if (pos < beg || pos >= end) continue; // out of range; skip -- if (tid >= h->n_targets) continue; // diff number of @SQ lines per file? -+ if (tid >= sam_hdr_nref(h)) continue; // diff number of @SQ lines per file? - if (all) { - while (tid > last_tid) { - if (last_tid >= 0 && !reg) { - // Deal with remainder or entirety of last tid. 
-- while (++last_pos < h->target_len[last_tid]) { -+ while (++last_pos < sam_hdr_tid2len(h, last_tid)) { - // Horribly inefficient, but the bed API is an obfuscated black box. -- if (bed && bed_overlap(bed, h->target_name[last_tid], last_pos, last_pos + 1) == 0) -+ if (bed && bed_overlap(bed, sam_hdr_tid2name(h, last_tid), last_pos, last_pos + 1) == 0) - continue; -- fputs(h->target_name[last_tid], samtools_stdout); fprintf(samtools_stdout, "\t%d", last_pos+1); -+ fputs(sam_hdr_tid2name(h, last_tid), file_out); -+ fprintf(file_out, "\t%"PRIhts_pos, last_pos+1); - for (i = 0; i < n; i++) -- fputc('\t', samtools_stdout), fputc('0', samtools_stdout); -- fputc('\n', samtools_stdout); -+ fputc('\t', file_out), fputc('0', file_out); -+ fputc('\n', file_out); - } - } - last_tid++; -@@ -239,19 +311,21 @@ - // Deal with missing portion of current tid - while (++last_pos < pos) { - if (last_pos < beg) continue; // out of range; skip -- if (bed && bed_overlap(bed, h->target_name[tid], last_pos, last_pos + 1) == 0) -+ if (bed && bed_overlap(bed, sam_hdr_tid2name(h, tid), last_pos, last_pos + 1) == 0) - continue; -- fputs(h->target_name[tid], samtools_stdout); fprintf(samtools_stdout, "\t%d", last_pos+1); -+ fputs(sam_hdr_tid2name(h, tid), file_out); -+ fprintf(file_out, "\t%"PRIhts_pos, last_pos+1); - for (i = 0; i < n; i++) -- fputc('\t', samtools_stdout), fputc('0', samtools_stdout); -- fputc('\n', samtools_stdout); -+ fputc('\t', file_out), fputc('0', file_out); -+ fputc('\n', file_out); - } - - last_tid = tid; - last_pos = pos; - } -- if (bed && bed_overlap(bed, h->target_name[tid], pos, pos + 1) == 0) continue; -- fputs(h->target_name[tid], samtools_stdout); fprintf(samtools_stdout, "\t%d", pos+1); // a customized fprintf(samtools_stdout, ) would be faster -+ if (bed && bed_overlap(bed, sam_hdr_tid2name(h, tid), pos, pos + 1) == 0) continue; -+ fputs(sam_hdr_tid2name(h, tid), file_out); -+ fprintf(file_out, "\t%"PRIhts_pos, pos+1); // a customized fprintf(samtools_stdout, 
) would be faster - for (i = 0; i < n; ++i) { // base level filters have to go here - int j, m = 0; - for (j = 0; j < n_plp[i]; ++j) { -@@ -260,9 +334,9 @@ - else if (p->qpos < p->b->core.l_qseq && - bam_get_qual(p->b)[p->qpos] < baseQ) ++m; // low base quality - } -- fprintf(samtools_stdout, "\t%d", n_plp[i] - m); // this the depth to output -+ fprintf(file_out, "\t%d", n_plp[i] - m); // this the depth to output - } -- fputc('\n', samtools_stdout); -+ fputc('\n', file_out); - } - if (ret < 0) status = EXIT_FAILURE; - free(n_plp); free(plp); -@@ -270,19 +344,20 @@ - - if (all) { - // Handle terminating region -- if (last_tid < 0 && reg && all > 1) { -+ if (last_tid < 0 && reg) { - last_tid = reg_tid; - last_pos = beg-1; - } -- while (last_tid >= 0 && last_tid < h->n_targets) { -- while (++last_pos < h->target_len[last_tid]) { -+ while (last_tid >= 0 && last_tid < sam_hdr_nref(h)) { -+ while (++last_pos < sam_hdr_tid2len(h, last_tid)) { - if (last_pos >= end) break; -- if (bed && bed_overlap(bed, h->target_name[last_tid], last_pos, last_pos + 1) == 0) -+ if (bed && bed_overlap(bed, sam_hdr_tid2name(h, last_tid), last_pos, last_pos + 1) == 0) - continue; -- fputs(h->target_name[last_tid], samtools_stdout); fprintf(samtools_stdout, "\t%d", last_pos+1); -+ fputs(sam_hdr_tid2name(h, last_tid), file_out); -+ fprintf(file_out, "\t%"PRIhts_pos, last_pos+1); - for (i = 0; i < n; i++) -- fputc('\t', samtools_stdout), fputc('0', samtools_stdout); -- fputc('\n', samtools_stdout); -+ fputc('\t', file_out), fputc('0', file_out); -+ fputc('\n', file_out); - } - last_tid++; - last_pos = -1; -@@ -292,8 +367,17 @@ - } - - depth_end: -+ if (fclose(file_out) != 0) { -+ if (status == EXIT_SUCCESS) { -+ print_error_errno("depth", "error on closing \"%s\"", -+ (output_file && strcmp(output_file, "-") != 0 -+ ? 
output_file : "samtools_stdout")); -+ status = EXIT_FAILURE; -+ } -+ } -+ - for (i = 0; i < n && data[i]; ++i) { -- bam_hdr_destroy(data[i]->hdr); -+ sam_hdr_destroy(data[i]->hdr); - if (data[i]->fp) sam_close(data[i]->fp); - hts_itr_destroy(data[i]->iter); - free(data[i]); ---- python-pysam.orig/samtools/bam_addrprg.c -+++ python-pysam/samtools/bam_addrprg.c -@@ -1,6 +1,6 @@ - /* bam_addrprg.c -- samtools command to add or replace readgroups. - -- Copyright (c) 2013, 2015, 2016 Genome Research Limited. -+ Copyright (c) 2013, 2015-2017, 2019 Genome Research Limited. - - Author: Martin O. Pollard - -@@ -47,6 +47,7 @@ - char* output_name; - char* rg_id; - char* rg_line; -+ int no_pg; - rg_mode mode; - sam_global_args ga; - htsThreadPool p; -@@ -58,9 +59,9 @@ - - struct state { - samFile* input_file; -- bam_hdr_t* input_header; -+ sam_hdr_t* input_header; - samFile* output_file; -- bam_hdr_t* output_header; -+ sam_hdr_t* output_header; - char* rg_id; - void (*mode_func)(const state_t*, bam1_t*); - }; -@@ -71,6 +72,7 @@ - free(opts->rg_id); - free(opts->output_name); - free(opts->input_name); -+ free(opts->rg_line); - if (opts->p.pool) hts_tpool_destroy(opts->p.pool); - sam_global_args_free(&opts->ga); - free(opts); -@@ -81,9 +83,9 @@ - if (!state) return; - free(state->rg_id); - if (state->output_file) sam_close(state->output_file); -- bam_hdr_destroy(state->output_header); -+ sam_hdr_destroy(state->output_header); - if (state->input_file) sam_close(state->input_file); -- bam_hdr_destroy(state->input_header); -+ sam_hdr_destroy(state->input_header); - free(state); - } - -@@ -147,20 +149,6 @@ - return ns; - } - --// These are to be replaced by samtools header parser --// Extracts the first @RG line from a string. 
--static char* get_rg_line(const char* text, size_t* last) --{ -- const char* rg = text; -- if (rg[0] != '@' || rg[1] != 'R' || rg[2] != 'G' ) { -- if ((rg = (const char*)strstr(text,"\n@RG")) == NULL) { -- return NULL; -- } -- rg++;//skip initial \n -- } -- // duplicate the line for return -- return dup_substring(rg, strchr(rg, '\n'), last); --} - - // Given a @RG line return the id - static char* get_rg_id(const char *line) -@@ -172,44 +160,6 @@ - return dup_substring(id, strchr(id, '\t'), NULL); - } - --// Confirms the existance of an RG line with a given ID in a bam header --static bool confirm_rg( const bam_hdr_t *hdr, const char* rgid ) --{ -- assert( hdr != NULL && rgid != NULL ); -- -- const char *ptr = hdr->text; -- bool found = false; -- while (ptr != NULL && *ptr != '\0' && found == false ) { -- size_t end = 0; -- char* line = get_rg_line(ptr, &end); -- if (line == NULL) break; // No more @RG -- char* id; -- if (((id = get_rg_id(line)) != NULL) && !strcmp(id, rgid)) { -- found = true; -- } -- free(id); -- free(line); -- ptr += end; -- } -- return found; --} -- --static char* get_first_rgid( const bam_hdr_t *hdr ) --{ -- assert( hdr != NULL ); -- const char *ptr = hdr->text; -- char* found = NULL; -- while (ptr != NULL && *ptr != '\0' && found == NULL ) { -- size_t end = 0; -- char* line = get_rg_line(ptr, &end); -- if ( line ) { -- found = get_rg_id(line); -- } else break; -- free(line); -- ptr += end; -- } -- return found; --} - - static void usage(FILE *fp) - { -@@ -221,8 +171,9 @@ - " -o FILE Where to write output to [stdout]\n" - " -r STRING @RG line text\n" - " -R STRING ID of @RG line in existing header to use\n" -+ " --no-PG Do not add a PG line\n" - ); -- sam_global_opt_help(fp, "..O..@"); -+ sam_global_opt_help(fp, "..O..@.."); - } - - static bool parse_args(int argc, char** argv, parsed_opts_t** opts) -@@ -242,6 +193,7 @@ - sam_global_args_init(&retval->ga); - static const struct option lopts[] = { - SAM_OPT_GLOBAL_OPTIONS(0, 0, 'O', 0, 0, 
'@'), -+ {"no-PG", no_argument, NULL, 1}, - { NULL, 0, NULL, 0 } - }; - kstring_t rg_line = {0,0,NULL}; -@@ -280,6 +232,9 @@ - usage(stdout); - free(retval); - return true; -+ case 1: -+ retval->no_pg = 1; -+ break; - case '?': - usage(stderr); - free(retval); -@@ -316,6 +271,7 @@ - cleanup_opts(retval); - return false; - } -+ free(retval->rg_line); - retval->rg_line = tmp; - } - retval->input_name = strdup(argv[optind+0]); -@@ -375,7 +331,7 @@ - } - retval->input_header = sam_hdr_read(retval->input_file); - -- retval->output_header = bam_hdr_dup(retval->input_header); -+ retval->output_header = sam_hdr_dup(retval->input_header); - if (opts->output_name) // File format auto-detection - sam_open_mode(output_mode + 1, opts->output_name, NULL); - retval->output_file = sam_open_format(opts->output_name == NULL?"-":opts->output_name, output_mode, &opts->ga.out); -@@ -393,34 +349,39 @@ - if (opts->rg_line) { - // Append new RG line to header. - // Check does not already exist -- if ( confirm_rg(retval->output_header, opts->rg_id) ) { -+ kstring_t hdr_line = { 0, 0, NULL }; -+ if (sam_hdr_find_line_id(retval->output_header, "RG", "ID", opts->rg_id, &hdr_line) == 0) { - fprintf(stderr, "[init] ID of new RG line specified conflicts with that of an existing header RG line. 
Overwrite not yet implemented.\n"); -+ free(hdr_line.s); - return false; - } -- retval->rg_id = strdup(opts->rg_id); -- size_t new_len = strlen( retval->output_header->text ) + strlen( opts->rg_line ) + 2; -- char* new_header = malloc(new_len); -- if (!new_header) { -- fprintf(stderr, "[init] Out of memory whilst writing new header.\n"); -+ if (-1 == sam_hdr_add_lines(retval->output_header, opts->rg_line, strlen(opts->rg_line))) { -+ fprintf(stderr, "[init] Error adding RG line with ID:%s to the output header.\n", opts->rg_id); -+ return false; -+ } -+ if (opts->mode == overwrite_all && -+ -1 == sam_hdr_remove_except(retval->output_header, "RG", "ID", opts->rg_id)) { -+ fprintf(stderr, "[init] Error removing the old RG lines from the output header.\n"); - return false; - } -- sprintf(new_header,"%s%s\n", retval->output_header->text, opts->rg_line); -- free(retval->output_header->text); -- retval->output_header->text = new_header; -- retval->output_header->l_text = (int)new_len - 1; -+ retval->rg_id = strdup(opts->rg_id); - } else { - if (opts->rg_id) { - // Confirm what has been supplied exists -- if ( !confirm_rg(retval->output_header, opts->rg_id) ) { -+ kstring_t hdr_line = { 0, 0, NULL }; -+ if (sam_hdr_find_line_id(retval->output_header, "RG", "ID", opts->rg_id, &hdr_line) < 0) { - fprintf(stderr, "RG ID supplied does not exist in header. 
Supply full @RG line with -r instead?\n"); - return false; - } - retval->rg_id = strdup(opts->rg_id); -+ free(hdr_line.s); - } else { -- if ((retval->rg_id = get_first_rgid(retval->output_header)) == NULL ) { -+ kstring_t rg_id = { 0, 0, NULL }; -+ if (sam_hdr_find_tag_id(retval->output_header, "RG", NULL, NULL, "ID", &rg_id) < 0) { - fprintf(stderr, "No RG specified on command line or in existing header.\n"); - return false; - } -+ retval->rg_id = ks_release(&rg_id); - } - } - -@@ -436,12 +397,24 @@ - return true; - } - --static bool readgroupise(state_t* state) -+static bool readgroupise(parsed_opts_t *opts, state_t* state, char *arg_list) - { -+ if (!opts->no_pg && sam_hdr_add_pg(state->output_header, "samtools", -+ "VN", samtools_version(), -+ arg_list ? "CL": NULL, -+ arg_list ? arg_list : NULL, -+ NULL)) -+ return false; -+ - if (sam_hdr_write(state->output_file, state->output_header) != 0) { - print_error_errno("addreplacerg", "[%s] Could not write header to output file", __func__); - return false; - } -+ char *idx_fn = NULL; -+ if (opts->ga.write_index) { -+ if (!(idx_fn = auto_index(state->output_file, opts->output_name, state->output_header))) -+ return false; -+ } - - bam1_t* file_read = bam_init1(); - int ret; -@@ -451,14 +424,25 @@ - if (sam_write1(state->output_file, state->output_header, file_read) < 0) { - print_error_errno("addreplacerg", "[%s] Could not write read to output file", __func__); - bam_destroy1(file_read); -+ free(idx_fn); - return false; - } - } - bam_destroy1(file_read); - if (ret != -1) { - print_error_errno("addreplacerg", "[%s] Error reading from input file", __func__); -+ free(idx_fn); - return false; - } else { -+ -+ if (opts->ga.write_index) { -+ if (sam_idx_save(state->output_file) < 0) { -+ print_error_errno("addreplacerg", "[%s] Writing index failed", __func__); -+ free(idx_fn); -+ return false; -+ } -+ } -+ free(idx_fn); - return true; - } - } -@@ -467,20 +451,25 @@ - { - parsed_opts_t* opts = NULL; - state_t* state = NULL; 
-+ char *arg_list = stringify_argv(argc+1, argv-1); -+ if (!arg_list) -+ return EXIT_FAILURE; - - if (!parse_args(argc, argv, &opts)) goto error; -- if (opts == NULL) return EXIT_SUCCESS; // Not an error but user doesn't want us to proceed -- if (!opts || !init(opts, &state)) goto error; -- -- if (!readgroupise(state)) goto error; -+ if (opts) { // Not an error but user doesn't want us to proceed -+ if (!init(opts, &state) || !readgroupise(opts, state, arg_list)) -+ goto error; -+ } - - cleanup_state(state); - cleanup_opts(opts); -+ free(arg_list); - - return EXIT_SUCCESS; - error: - cleanup_state(state); - cleanup_opts(opts); -+ free(arg_list); - - return EXIT_FAILURE; - } ---- python-pysam.orig/samtools/bam_addrprg.c.pysam.c -+++ python-pysam/samtools/bam_addrprg.c.pysam.c -@@ -2,7 +2,7 @@ - - /* bam_addrprg.c -- samtools command to add or replace readgroups. - -- Copyright (c) 2013, 2015, 2016 Genome Research Limited. -+ Copyright (c) 2013, 2015-2017, 2019 Genome Research Limited. - - Author: Martin O. 
Pollard - -@@ -49,6 +49,7 @@ - char* output_name; - char* rg_id; - char* rg_line; -+ int no_pg; - rg_mode mode; - sam_global_args ga; - htsThreadPool p; -@@ -60,9 +61,9 @@ - - struct state { - samFile* input_file; -- bam_hdr_t* input_header; -+ sam_hdr_t* input_header; - samFile* output_file; -- bam_hdr_t* output_header; -+ sam_hdr_t* output_header; - char* rg_id; - void (*mode_func)(const state_t*, bam1_t*); - }; -@@ -73,6 +74,7 @@ - free(opts->rg_id); - free(opts->output_name); - free(opts->input_name); -+ free(opts->rg_line); - if (opts->p.pool) hts_tpool_destroy(opts->p.pool); - sam_global_args_free(&opts->ga); - free(opts); -@@ -83,9 +85,9 @@ - if (!state) return; - free(state->rg_id); - if (state->output_file) sam_close(state->output_file); -- bam_hdr_destroy(state->output_header); -+ sam_hdr_destroy(state->output_header); - if (state->input_file) sam_close(state->input_file); -- bam_hdr_destroy(state->input_header); -+ sam_hdr_destroy(state->input_header); - free(state); - } - -@@ -149,20 +151,6 @@ - return ns; - } - --// These are to be replaced by samtools header parser --// Extracts the first @RG line from a string. 
--static char* get_rg_line(const char* text, size_t* last) --{ -- const char* rg = text; -- if (rg[0] != '@' || rg[1] != 'R' || rg[2] != 'G' ) { -- if ((rg = (const char*)strstr(text,"\n@RG")) == NULL) { -- return NULL; -- } -- rg++;//skip initial \n -- } -- // duplicate the line for return -- return dup_substring(rg, strchr(rg, '\n'), last); --} - - // Given a @RG line return the id - static char* get_rg_id(const char *line) -@@ -174,44 +162,6 @@ - return dup_substring(id, strchr(id, '\t'), NULL); - } - --// Confirms the existance of an RG line with a given ID in a bam header --static bool confirm_rg( const bam_hdr_t *hdr, const char* rgid ) --{ -- assert( hdr != NULL && rgid != NULL ); -- -- const char *ptr = hdr->text; -- bool found = false; -- while (ptr != NULL && *ptr != '\0' && found == false ) { -- size_t end = 0; -- char* line = get_rg_line(ptr, &end); -- if (line == NULL) break; // No more @RG -- char* id; -- if (((id = get_rg_id(line)) != NULL) && !strcmp(id, rgid)) { -- found = true; -- } -- free(id); -- free(line); -- ptr += end; -- } -- return found; --} -- --static char* get_first_rgid( const bam_hdr_t *hdr ) --{ -- assert( hdr != NULL ); -- const char *ptr = hdr->text; -- char* found = NULL; -- while (ptr != NULL && *ptr != '\0' && found == NULL ) { -- size_t end = 0; -- char* line = get_rg_line(ptr, &end); -- if ( line ) { -- found = get_rg_id(line); -- } else break; -- free(line); -- ptr += end; -- } -- return found; --} - - static void usage(FILE *fp) - { -@@ -223,8 +173,9 @@ - " -o FILE Where to write output to [samtools_stdout]\n" - " -r STRING @RG line text\n" - " -R STRING ID of @RG line in existing header to use\n" -+ " --no-PG Do not add a PG line\n" - ); -- sam_global_opt_help(fp, "..O..@"); -+ sam_global_opt_help(fp, "..O..@.."); - } - - static bool parse_args(int argc, char** argv, parsed_opts_t** opts) -@@ -244,6 +195,7 @@ - sam_global_args_init(&retval->ga); - static const struct option lopts[] = { - SAM_OPT_GLOBAL_OPTIONS(0, 0, 'O', 
0, 0, '@'), -+ {"no-PG", no_argument, NULL, 1}, - { NULL, 0, NULL, 0 } - }; - kstring_t rg_line = {0,0,NULL}; -@@ -282,6 +234,9 @@ - usage(samtools_stdout); - free(retval); - return true; -+ case 1: -+ retval->no_pg = 1; -+ break; - case '?': - usage(samtools_stderr); - free(retval); -@@ -318,6 +273,7 @@ - cleanup_opts(retval); - return false; - } -+ free(retval->rg_line); - retval->rg_line = tmp; - } - retval->input_name = strdup(argv[optind+0]); -@@ -377,7 +333,7 @@ - } - retval->input_header = sam_hdr_read(retval->input_file); - -- retval->output_header = bam_hdr_dup(retval->input_header); -+ retval->output_header = sam_hdr_dup(retval->input_header); - if (opts->output_name) // File format auto-detection - sam_open_mode(output_mode + 1, opts->output_name, NULL); - retval->output_file = sam_open_format(opts->output_name == NULL?"-":opts->output_name, output_mode, &opts->ga.out); -@@ -395,34 +351,39 @@ - if (opts->rg_line) { - // Append new RG line to header. - // Check does not already exist -- if ( confirm_rg(retval->output_header, opts->rg_id) ) { -+ kstring_t hdr_line = { 0, 0, NULL }; -+ if (sam_hdr_find_line_id(retval->output_header, "RG", "ID", opts->rg_id, &hdr_line) == 0) { - fprintf(samtools_stderr, "[init] ID of new RG line specified conflicts with that of an existing header RG line. 
Overwrite not yet implemented.\n"); -+ free(hdr_line.s); - return false; - } -- retval->rg_id = strdup(opts->rg_id); -- size_t new_len = strlen( retval->output_header->text ) + strlen( opts->rg_line ) + 2; -- char* new_header = malloc(new_len); -- if (!new_header) { -- fprintf(samtools_stderr, "[init] Out of memory whilst writing new header.\n"); -+ if (-1 == sam_hdr_add_lines(retval->output_header, opts->rg_line, strlen(opts->rg_line))) { -+ fprintf(samtools_stderr, "[init] Error adding RG line with ID:%s to the output header.\n", opts->rg_id); -+ return false; -+ } -+ if (opts->mode == overwrite_all && -+ -1 == sam_hdr_remove_except(retval->output_header, "RG", "ID", opts->rg_id)) { -+ fprintf(samtools_stderr, "[init] Error removing the old RG lines from the output header.\n"); - return false; - } -- sprintf(new_header,"%s%s\n", retval->output_header->text, opts->rg_line); -- free(retval->output_header->text); -- retval->output_header->text = new_header; -- retval->output_header->l_text = (int)new_len - 1; -+ retval->rg_id = strdup(opts->rg_id); - } else { - if (opts->rg_id) { - // Confirm what has been supplied exists -- if ( !confirm_rg(retval->output_header, opts->rg_id) ) { -+ kstring_t hdr_line = { 0, 0, NULL }; -+ if (sam_hdr_find_line_id(retval->output_header, "RG", "ID", opts->rg_id, &hdr_line) < 0) { - fprintf(samtools_stderr, "RG ID supplied does not exist in header. 
Supply full @RG line with -r instead?\n"); - return false; - } - retval->rg_id = strdup(opts->rg_id); -+ free(hdr_line.s); - } else { -- if ((retval->rg_id = get_first_rgid(retval->output_header)) == NULL ) { -+ kstring_t rg_id = { 0, 0, NULL }; -+ if (sam_hdr_find_tag_id(retval->output_header, "RG", NULL, NULL, "ID", &rg_id) < 0) { - fprintf(samtools_stderr, "No RG specified on command line or in existing header.\n"); - return false; - } -+ retval->rg_id = ks_release(&rg_id); - } - } - -@@ -438,12 +399,24 @@ - return true; - } - --static bool readgroupise(state_t* state) -+static bool readgroupise(parsed_opts_t *opts, state_t* state, char *arg_list) - { -+ if (!opts->no_pg && sam_hdr_add_pg(state->output_header, "samtools", -+ "VN", samtools_version(), -+ arg_list ? "CL": NULL, -+ arg_list ? arg_list : NULL, -+ NULL)) -+ return false; -+ - if (sam_hdr_write(state->output_file, state->output_header) != 0) { - print_error_errno("addreplacerg", "[%s] Could not write header to output file", __func__); - return false; - } -+ char *idx_fn = NULL; -+ if (opts->ga.write_index) { -+ if (!(idx_fn = auto_index(state->output_file, opts->output_name, state->output_header))) -+ return false; -+ } - - bam1_t* file_read = bam_init1(); - int ret; -@@ -453,14 +426,25 @@ - if (sam_write1(state->output_file, state->output_header, file_read) < 0) { - print_error_errno("addreplacerg", "[%s] Could not write read to output file", __func__); - bam_destroy1(file_read); -+ free(idx_fn); - return false; - } - } - bam_destroy1(file_read); - if (ret != -1) { - print_error_errno("addreplacerg", "[%s] Error reading from input file", __func__); -+ free(idx_fn); - return false; - } else { -+ -+ if (opts->ga.write_index) { -+ if (sam_idx_save(state->output_file) < 0) { -+ print_error_errno("addreplacerg", "[%s] Writing index failed", __func__); -+ free(idx_fn); -+ return false; -+ } -+ } -+ free(idx_fn); - return true; - } - } -@@ -469,20 +453,25 @@ - { - parsed_opts_t* opts = NULL; - state_t* 
state = NULL; -+ char *arg_list = stringify_argv(argc+1, argv-1); -+ if (!arg_list) -+ return EXIT_FAILURE; - - if (!parse_args(argc, argv, &opts)) goto error; -- if (opts == NULL) return EXIT_SUCCESS; // Not an error but user doesn't want us to proceed -- if (!opts || !init(opts, &state)) goto error; -- -- if (!readgroupise(state)) goto error; -+ if (opts) { // Not an error but user doesn't want us to proceed -+ if (!init(opts, &state) || !readgroupise(opts, state, arg_list)) -+ goto error; -+ } - - cleanup_state(state); - cleanup_opts(opts); -+ free(arg_list); - - return EXIT_SUCCESS; - error: - cleanup_state(state); - cleanup_opts(opts); -+ free(arg_list); - - return EXIT_FAILURE; - } ---- python-pysam.orig/samtools/bam_aux.c -+++ python-pysam/samtools/bam_aux.c -@@ -1,6 +1,6 @@ - /* bam_aux.c -- remaining aux field handling. - -- Copyright (C) 2008-2010, 2013 Genome Research Ltd. -+ Copyright (C) 2008-2010, 2013, 2015, 2019 Genome Research Ltd. - Portions copyright (C) 2011 Broad Institute. - - Author: Heng Li -@@ -61,21 +61,15 @@ - return 0; - } - -+// Only here due to libbam.a being used by some applications. - int bam_parse_region(bam_header_t *header, const char *str, int *ref_id, int *beg, int *end) - { -- const char *name_lim = hts_parse_reg(str, beg, end); -- if (name_lim) { -- char *name = malloc(name_lim - str + 1); -- memcpy(name, str, name_lim - str); -- name[name_lim - str] = '\0'; -- *ref_id = bam_name2id(header, name); -- free(name); -- } -- else { -- // not parsable as a region, but possibly a sequence named "foo:a" -- *ref_id = bam_name2id(header, str); -- *beg = 0; *end = INT_MAX; -- } -- if (*ref_id == -1) return -1; -- return *beg <= *end? 0 : -1; -+ hts_pos_t beg64, end64; -+ int r; -+ r = sam_parse_region(header, str, ref_id, &beg64, &end64, 0) ? 
0 : -1; -+ if (beg64 > INT_MAX || end64 > INT_MAX) -+ return -1; -+ *beg = beg64; -+ *end = end64; -+ return r; - } ---- python-pysam.orig/samtools/bam_aux.c.pysam.c -+++ python-pysam/samtools/bam_aux.c.pysam.c -@@ -2,7 +2,7 @@ - - /* bam_aux.c -- remaining aux field handling. - -- Copyright (C) 2008-2010, 2013 Genome Research Ltd. -+ Copyright (C) 2008-2010, 2013, 2015, 2019 Genome Research Ltd. - Portions copyright (C) 2011 Broad Institute. - - Author: Heng Li -@@ -63,21 +63,15 @@ - return 0; - } - -+// Only here due to libbam.a being used by some applications. - int bam_parse_region(bam_header_t *header, const char *str, int *ref_id, int *beg, int *end) - { -- const char *name_lim = hts_parse_reg(str, beg, end); -- if (name_lim) { -- char *name = malloc(name_lim - str + 1); -- memcpy(name, str, name_lim - str); -- name[name_lim - str] = '\0'; -- *ref_id = bam_name2id(header, name); -- free(name); -- } -- else { -- // not parsable as a region, but possibly a sequence named "foo:a" -- *ref_id = bam_name2id(header, str); -- *beg = 0; *end = INT_MAX; -- } -- if (*ref_id == -1) return -1; -- return *beg <= *end? 0 : -1; -+ hts_pos_t beg64, end64; -+ int r; -+ r = sam_parse_region(header, str, ref_id, &beg64, &end64, 0) ? 0 : -1; -+ if (beg64 > INT_MAX || end64 > INT_MAX) -+ return -1; -+ *beg = beg64; -+ *end = end64; -+ return r; - } ---- python-pysam.orig/samtools/bam_cat.c -+++ python-pysam/samtools/bam_cat.c -@@ -1,6 +1,6 @@ - /* bam_cat.c -- efficiently concatenates bam files. - -- Copyright (C) 2008-2009, 2011-2013, 2015-2016 Genome Research Ltd. -+ Copyright (C) 2008-2009, 2011-2013, 2015-2017, 2019 Genome Research Ltd. - Modified SAMtools work copyright (C) 2010 Illumina, Inc. 
- - Permission is hereby granted, free of charge, to any person obtaining a copy -@@ -45,162 +45,43 @@ - #include "htslib/bgzf.h" - #include "htslib/sam.h" - #include "htslib/cram.h" --#include "htslib/khash.h" -+#include "htslib/kstring.h" - #include "samtools.h" -- --KHASH_MAP_INIT_STR(s2i, int) -- --// Bi-directional lookup. --// We can go from name to ID or ID to name. --typedef struct khash_s2i { -- khash_t(s2i) *h; -- int n_id, a_id; -- const char **id; // map Nth entry back to key -- const char **line; --} khash_s2i; -- --static int hash_s2i_inc(khash_s2i *hash, const char *str, const char *line, int *added) { -- // loosly based on khash_str2int_inc -- khint_t k; -- int n; -- -- if ( !hash ) return -1; -- // inefficient, but works -- char *my_str = strdup(str); -- k = kh_put(s2i, hash->h, my_str, added); -- if (*added == 0) { -- free(my_str); -- return kh_val(hash->h, k); -- } -- n = hash->n_id++; -- kh_val(hash->h, k) = n; -- if (hash->a_id <= n) { -- const char **id; -- hash->a_id = (n+1)*2; -- if (!(id = realloc(hash->id, hash->a_id*sizeof(*hash->id)))) -- return -1; -- hash->id = id; -- if (!(id = realloc(hash->line, hash->a_id*sizeof(*hash->line)))) -- return -1; -- hash->line = id; -- } -- hash->id[n] = my_str; // reverse map -- if (line) -- hash->line[n] = line; -- -- return n; --} -- --khash_s2i *hash_s2i_create(void) { -- khash_s2i *h = calloc(1, sizeof(*h)); -- if (!h) -- return NULL; -- -- h->h = kh_init(s2i); -- if (!h->h) { -- free(h); -- return NULL; -- } -- return h; --} -- --static void hash_s2i_free(khash_s2i *hash) { -- // based on khash_str2int_destroy_free -- khint_t k; -- if (!hash) return; -- if (hash->h) { -- for (k = 0; k < kh_end(hash->h); ++k) -- if (kh_exist(hash->h, k)) free((char*)kh_key(hash->h, k)); -- kh_destroy(s2i, hash->h); -- } -- if (hash->id) -- free(hash->id); -- if (hash->line) -- free(hash->line); -- -- free(hash); --} -- --static khash_s2i *hash_rg(const bam_hdr_t *h) { -- khash_s2i *rg2id = hash_s2i_create(); -- 
char *cp, *line; -- int j, l; -- -- if (!h) -- return rg2id; -- -- if (!rg2id) -- return NULL; -- -- cp = h->text; -- -- for (l = 0; l+3 < h->l_text; l++) { -- line = &cp[l]; -- if (!(cp[l] == '@' && cp[l+1] == 'R' && cp[l+2] == 'G')) { -- while (l < h->l_text && cp[l] != '\n') -- l++; -- continue; -- } -- -- // Found an @RG line; add to hash -- while (cp[l] != '\n') { -- while (l < h->l_text && cp[l] != '\n' && cp[l] != '\t') -- l++; -- if (l+4 < h->l_text && cp[l+1] == 'I' && cp[l+2] == 'D') -- break; -- } -- if (cp[l] == '\n') -- continue; -- l = (j = l+4); -- while (l < h->l_text && cp[l] != '\n' && cp[l] != '\t') -- l++; -- -- // To do: save id and keep realloc as needed, as hash_s2i_inc strdups. -- char *id = malloc(l-j+1); -- strncpy(id, &cp[j], l-j); -- id[l-j] = 0; -- -- int added; -- hash_s2i_inc(rg2id, id, line, &added); -- free(id); -- -- while (l < h->l_text && cp[l] != '\n') -- l++; -- } -- -- return rg2id; --} -+#include "sam_opts.h" - - /* - * Check the files are consistent and capable of being concatenated. -- * Also fills out the rg2id read-group hash and the version numbers -- * and produces a new bam_hdr_t structure with merged RG lines. -- * Note it is only a simple merge, as we lack the niceties of a proper -- * header API. -+ * Also fills out the version numbers and produces a new sam_hdr_t -+ * structure with merged RG lines. -+ * Note it is only a simple merge. - * - * Returns updated header on success; - * NULL on failure. 
- */ --static bam_hdr_t *cram_cat_check_hdr(int nfn, char * const *fn, const bam_hdr_t *h, -- khash_s2i **rg2id, int *vers_maj_p, int *vers_min_p) { -+static sam_hdr_t *cram_cat_check_hdr(int nfn, char * const *fn, const sam_hdr_t *h, -+ int *vers_maj_p, int *vers_min_p) { - int i, vers_maj = -1, vers_min = -1; -- bam_hdr_t *new_h = NULL; -+ sam_hdr_t *new_h = NULL, *old_h = NULL; -+ samFile *in = NULL; -+ kstring_t ks = KS_INITIALIZE; - - if (h) { -- new_h = bam_hdr_dup(h); -- *rg2id = hash_rg(new_h); -+ new_h = sam_hdr_dup(h); -+ if (!new_h) { -+ fprintf(stderr, "[%s] ERROR: header duplication failed.\n", -+ __func__); -+ goto fail; -+ } - } - - for (i = 0; i < nfn; ++i) { -- samFile *in; - cram_fd *in_c; -- khint_t ki; -- int new_rg = -1; -+ int ki; - - in = sam_open(fn[i], "rc"); - if (in == 0) { - print_error_errno("cat", "fail to open file '%s'", fn[i]); -- return NULL; -+ goto fail; - } - in_c = in->fp.cram; - -@@ -210,55 +91,81 @@ - (vers_min != -1 && vers_min != vmin)) { - fprintf(stderr, "[%s] ERROR: input files have differing version numbers.\n", - __func__); -- return NULL; -+ goto fail; - } - vers_maj = vmaj; - vers_min = vmin; - -- bam_hdr_t *old = sam_hdr_read(in); -- khash_s2i *rg2id_in = hash_rg(old); -+ old_h = sam_hdr_read(in); -+ if (!old_h) { -+ fprintf(stderr, "[%s] ERROR: header reading for file '%s' filed.\n", -+ __func__, fn[i]); -+ goto fail; -+ } - - if (!new_h) { -- new_h = bam_hdr_dup(old); -- *rg2id = hash_rg(new_h); -+ new_h = sam_hdr_dup(old_h); -+ if (!new_h) { -+ fprintf(stderr, "[%s] ERROR: header duplication for file '%s' failed.\n", -+ __func__, fn[i]); -+ goto fail; -+ } -+ sam_hdr_destroy(old_h); -+ sam_close(in); -+ continue; - } - -- // Add any existing @RG entries to our global @RG hash. 
-- for (ki = 0; ki < rg2id_in->n_id; ki++) { -- int added; -- -- new_rg = hash_s2i_inc(*rg2id, rg2id_in->id[ki], rg2id_in->line[ki], &added); -- //fprintf(stderr, "RG %s: #%d -> #%d\n", -- // rg2id_in->id[ki], ki, new_rg); -- -- if (added) { -- // Also add to new_h -- const char *line = rg2id_in->line[ki]; -- const char *line_end = line; -- while (*line && *line_end++ != '\n') -- ; -- new_h->l_text += line_end - line; -- new_h->text = realloc(new_h->text, new_h->l_text+1); -- strncat(&new_h->text[new_h->l_text - (line_end - line)], -- line, line_end - line); -+ int old_count = sam_hdr_count_lines(old_h, "RG"); -+ for (ki = 0; ki < old_count; ki++) { -+ const char *old_name = sam_hdr_line_name(old_h, "RG", ki); -+ if (old_name) { -+ int new_i = sam_hdr_line_index(new_h, "RG", old_name); -+ if (-1 == new_i) { // line does not exist in the new header -+ if (sam_hdr_find_line_pos(old_h, "RG", ki, &ks) || -+ !ks.s || sam_hdr_add_lines(new_h, ks.s, ks.l)) { -+ fprintf(stderr, "[%s] ERROR: failed to add @RG line 'ID:%s' from file '%s'\n", -+ __func__, old_name, fn[i]); -+ goto fail; -+ } -+ ks_free(&ks); -+ } -+ } else { -+ fprintf(stderr, "[%s] ERROR: failed to read %d @RG line from file '%s'\n", -+ __func__, ki, fn[i]); -+ goto fail; - } -+ } - -- if (new_rg != ki && rg2id_in->n_id > 1) { -- fprintf(stderr, "[%s] ERROR: Same size @RG lists but differing order / contents\n", -- __func__); -- return NULL; -+ if (old_count > 1 && sam_hdr_count_lines(new_h, "RG") == old_count) { -+ for (ki = 0; ki < old_count; ki++) { -+ const char *old_name = sam_hdr_line_name(old_h, "RG", ki); -+ const char *new_name = sam_hdr_line_name(new_h, "RG", ki); -+ if (!old_name || !new_name || strcmp(old_name, new_name)) { -+ fprintf(stderr, "[%s] ERROR: Same size @RG lists but differing order / contents\n", -+ __func__); -+ goto fail; -+ } - } - } - -- hash_s2i_free(rg2id_in); -- bam_hdr_destroy(old); -+ sam_hdr_destroy(old_h); - sam_close(in); - } - -+ ks_free(&ks); -+ - *vers_maj_p = 
vers_maj; - *vers_min_p = vers_min; - - return new_h; -+ -+fail: -+ ks_free(&ks); -+ if (old_h) sam_hdr_destroy(old_h); -+ if (new_h) sam_hdr_destroy(new_h); -+ if (in) sam_close(in); -+ -+ return NULL; - } - - -@@ -289,22 +196,21 @@ - * huffman code. In this situation we can change the meta-data in the - * compression header to renumber an RG value.. - */ --int cram_cat(int nfn, char * const *fn, const bam_hdr_t *h, const char* outcram) -+int cram_cat(int nfn, char * const *fn, const sam_hdr_t *h, const char* outcram, sam_global_args *ga, char *arg_list, int no_pg) - { - samFile *out; - cram_fd *out_c; - int i, vers_maj, vers_min; -- khash_s2i *rg2id = NULL; -- bam_hdr_t *new_h = NULL; -+ sam_hdr_t *new_h = NULL; - - /* Check consistent versioning and compatible headers */ -- if (!(new_h = cram_cat_check_hdr(nfn, fn, h, &rg2id, &vers_maj, &vers_min))) -+ if (!(new_h = cram_cat_check_hdr(nfn, fn, h, &vers_maj, &vers_min))) - return -1; - - /* Open the file with cram_vers */ - char vers[100]; - sprintf(vers, "%d.%d", vers_maj, vers_min); -- out = sam_open(outcram, "wc"); -+ out = sam_open_format(outcram, "wc", &ga->out); - if (out == 0) { - print_error_errno("cat", "fail to open output file '%s'", outcram); - return -1; -@@ -313,7 +219,13 @@ - cram_set_option(out_c, CRAM_OPT_VERSION, vers); - //fprintf(stderr, "Creating cram vers %s\n", vers); - -- cram_fd_set_header(out_c, sam_hdr_parse_(new_h->text, new_h->l_text)); // needed? -+ if (!no_pg && sam_hdr_add_pg(new_h, "samtools", -+ "VN", samtools_version(), -+ arg_list ? "CL": NULL, -+ arg_list ? 
arg_list : NULL, -+ NULL)) -+ return -1; -+ - if (sam_hdr_write(out, new_h) < 0) { - print_error_errno("cat", "Couldn't write header"); - return -1; -@@ -323,7 +235,7 @@ - samFile *in; - cram_fd *in_c; - cram_container *c; -- bam_hdr_t *old; -+ sam_hdr_t *old_h; - int new_rg = -1; - - in = sam_open(fn[i], "rc"); -@@ -333,20 +245,29 @@ - } - in_c = in->fp.cram; - -- old = sam_hdr_read(in); -- khash_s2i *rg2id_in = hash_rg(old); -+ old_h = sam_hdr_read(in); -+ if (!old_h) { -+ print_error("cat", "fail to read the header of file '%s'", fn[i]); -+ return -1; -+ } - - // Compute RG mapping if suitable for changing. -- if (rg2id_in->n_id == 1) { -- int _; -- new_rg = hash_s2i_inc(rg2id, rg2id_in->id[0], NULL, &_); -+ if (sam_hdr_count_lines(old_h, "RG") == 1) { -+ const char *old_name = sam_hdr_line_name(old_h, "RG", 0); -+ if (old_name) { -+ new_rg = sam_hdr_line_index(new_h, "RG", old_name); -+ if (new_rg < 0) { -+ print_error("cat", "fail to find @RG line '%s' in the new header", old_name); -+ return -1; -+ } -+ } else { -+ print_error("cat", "fail to find @RG line in file '%s'", fn[i]); -+ return -1; -+ } - } else { - new_rg = 0; - } - -- hash_s2i_free(rg2id_in); -- -- - // Copy contains and blocks within them - while ((c = cram_read_container(in_c))) { - cram_block *blk; -@@ -400,13 +321,11 @@ - cram_free_container(c); - } - -- bam_hdr_destroy(old); -+ sam_hdr_destroy(old_h); - sam_close(in); - } - sam_close(out); -- -- hash_s2i_free(rg2id); -- bam_hdr_destroy(new_h); -+ sam_hdr_destroy(new_h); - - return 0; - } -@@ -419,7 +338,7 @@ - - #define BGZF_EMPTY_BLOCK_SIZE 28 - --int bam_cat(int nfn, char * const *fn, const bam_hdr_t *h, const char* outbam) -+int bam_cat(int nfn, char * const *fn, sam_hdr_t *h, const char* outbam, char *arg_list, int no_pg) - { - BGZF *fp, *in = NULL; - uint8_t *buf = NULL; -@@ -433,6 +352,13 @@ - return -1; - } - if (h) { -+ if (!no_pg && sam_hdr_add_pg(h, "samtools", -+ "VN", samtools_version(), -+ arg_list ? "CL": NULL, -+ arg_list ? 
arg_list : NULL, -+ NULL)) -+ goto fail; -+ - if (bam_hdr_write(fp, h) < 0) { - print_error_errno("cat", "Couldn't write header"); - goto fail; -@@ -445,7 +371,7 @@ - goto fail; - } - for(i = 0; i < nfn; ++i){ -- bam_hdr_t *old; -+ sam_hdr_t *old; - int len,j; - - in = strcmp(fn[i], "-")? bgzf_open(fn[i], "r") : bgzf_fdopen(fileno(stdin), "r"); -@@ -462,6 +388,13 @@ - goto fail; - } - if (h == 0 && i == 0) { -+ if (!no_pg && sam_hdr_add_pg(old, "samtools", -+ "VN", samtools_version(), -+ arg_list ? "CL": NULL, -+ arg_list ? arg_list : NULL, -+ NULL)) -+ goto fail; -+ - if (bam_hdr_write(fp, old) < 0) { - print_error_errno("cat", "Couldn't write header"); - goto fail; -@@ -507,7 +440,7 @@ - if (bgzf_raw_write(fp, ebuf, es) < 0) goto write_fail; - } - } -- bam_hdr_destroy(old); -+ sam_hdr_destroy(old); - bgzf_close(in); - in = NULL; - } -@@ -530,14 +463,25 @@ - - int main_cat(int argc, char *argv[]) - { -- bam_hdr_t *h = 0; -+ sam_hdr_t *h = 0; - char *outfn = 0; - char **infns = NULL; // files to concatenate - int infns_size = 0; -- int c, ret = 0; -+ int c, ret = 0, no_pg = 0; - samFile *in; -+ sam_global_args ga; -+ -+ static const struct option lopts[] = { -+ SAM_OPT_GLOBAL_OPTIONS('-', '-', '-', 0, '-', '@'), -+ {"no-PG", no_argument, NULL, 1}, -+ { NULL, 0, NULL, 0 } -+ }; -+ -+ char *arg_list = NULL; - -- while ((c = getopt(argc, argv, "h:o:b:")) >= 0) { -+ sam_global_args_init(&ga); -+ -+ while ((c = getopt_long(argc, argv, "h:o:b:", lopts, NULL)) >= 0) { - switch (c) { - case 'h': { - samFile *fph = sam_open(optarg, "r"); -@@ -573,9 +517,19 @@ - } - break; - } -+ case 1: -+ no_pg = 1; -+ break; -+ default: -+ if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; - } - } - -+ if (!no_pg && !(arg_list = stringify_argv(argc+1, argv-1))) { -+ print_error("cat", "failed to create arg_list"); -+ return 1; -+ } -+ - // Append files specified in argv to the list. 
- int nargv_fns = argc - optind; - if (nargv_fns > 0) { -@@ -592,6 +546,8 @@ - fprintf(stderr, "Options: -b FILE list of input BAM/CRAM file names, one per line\n"); - fprintf(stderr, " -h FILE copy the header from FILE [default is 1st input file]\n"); - fprintf(stderr, " -o FILE output BAM/CRAM\n"); -+ fprintf(stderr, " --no-PG do not add a PG line\n"); -+ sam_global_opt_help(stderr, "--..-@-."); - return 1; - } - -@@ -604,13 +560,13 @@ - switch (hts_get_format(in)->format) { - case bam: - sam_close(in); -- if (bam_cat(infns_size+nargv_fns, infns, h, outfn? outfn : "-") < 0) -+ if (bam_cat(infns_size+nargv_fns, infns, h, outfn? outfn : "-", arg_list, no_pg) < 0) - ret = 1; - break; - - case cram: - sam_close(in); -- if (cram_cat(infns_size+nargv_fns, infns, h, outfn? outfn : "-") < 0) -+ if (cram_cat(infns_size+nargv_fns, infns, h, outfn? outfn : "-", &ga, arg_list, no_pg) < 0) - ret = 1; - break; - -@@ -629,9 +585,9 @@ - - free(outfn); - free(infns); -- -+ free(arg_list); - if (h) -- bam_hdr_destroy(h); -+ sam_hdr_destroy(h); - - return ret; - } ---- python-pysam.orig/samtools/bam_cat.c.pysam.c -+++ python-pysam/samtools/bam_cat.c.pysam.c -@@ -2,7 +2,7 @@ - - /* bam_cat.c -- efficiently concatenates bam files. - -- Copyright (C) 2008-2009, 2011-2013, 2015-2016 Genome Research Ltd. -+ Copyright (C) 2008-2009, 2011-2013, 2015-2017, 2019 Genome Research Ltd. - Modified SAMtools work copyright (C) 2010 Illumina, Inc. - - Permission is hereby granted, free of charge, to any person obtaining a copy -@@ -47,162 +47,43 @@ - #include "htslib/bgzf.h" - #include "htslib/sam.h" - #include "htslib/cram.h" --#include "htslib/khash.h" -+#include "htslib/kstring.h" - #include "samtools.h" -- --KHASH_MAP_INIT_STR(s2i, int) -- --// Bi-directional lookup. --// We can go from name to ID or ID to name. 
--typedef struct khash_s2i { -- khash_t(s2i) *h; -- int n_id, a_id; -- const char **id; // map Nth entry back to key -- const char **line; --} khash_s2i; -- --static int hash_s2i_inc(khash_s2i *hash, const char *str, const char *line, int *added) { -- // loosly based on khash_str2int_inc -- khint_t k; -- int n; -- -- if ( !hash ) return -1; -- // inefficient, but works -- char *my_str = strdup(str); -- k = kh_put(s2i, hash->h, my_str, added); -- if (*added == 0) { -- free(my_str); -- return kh_val(hash->h, k); -- } -- n = hash->n_id++; -- kh_val(hash->h, k) = n; -- if (hash->a_id <= n) { -- const char **id; -- hash->a_id = (n+1)*2; -- if (!(id = realloc(hash->id, hash->a_id*sizeof(*hash->id)))) -- return -1; -- hash->id = id; -- if (!(id = realloc(hash->line, hash->a_id*sizeof(*hash->line)))) -- return -1; -- hash->line = id; -- } -- hash->id[n] = my_str; // reverse map -- if (line) -- hash->line[n] = line; -- -- return n; --} -- --khash_s2i *hash_s2i_create(void) { -- khash_s2i *h = calloc(1, sizeof(*h)); -- if (!h) -- return NULL; -- -- h->h = kh_init(s2i); -- if (!h->h) { -- free(h); -- return NULL; -- } -- return h; --} -- --static void hash_s2i_free(khash_s2i *hash) { -- // based on khash_str2int_destroy_free -- khint_t k; -- if (!hash) return; -- if (hash->h) { -- for (k = 0; k < kh_end(hash->h); ++k) -- if (kh_exist(hash->h, k)) free((char*)kh_key(hash->h, k)); -- kh_destroy(s2i, hash->h); -- } -- if (hash->id) -- free(hash->id); -- if (hash->line) -- free(hash->line); -- -- free(hash); --} -- --static khash_s2i *hash_rg(const bam_hdr_t *h) { -- khash_s2i *rg2id = hash_s2i_create(); -- char *cp, *line; -- int j, l; -- -- if (!h) -- return rg2id; -- -- if (!rg2id) -- return NULL; -- -- cp = h->text; -- -- for (l = 0; l+3 < h->l_text; l++) { -- line = &cp[l]; -- if (!(cp[l] == '@' && cp[l+1] == 'R' && cp[l+2] == 'G')) { -- while (l < h->l_text && cp[l] != '\n') -- l++; -- continue; -- } -- -- // Found an @RG line; add to hash -- while (cp[l] != '\n') { -- 
while (l < h->l_text && cp[l] != '\n' && cp[l] != '\t') -- l++; -- if (l+4 < h->l_text && cp[l+1] == 'I' && cp[l+2] == 'D') -- break; -- } -- if (cp[l] == '\n') -- continue; -- l = (j = l+4); -- while (l < h->l_text && cp[l] != '\n' && cp[l] != '\t') -- l++; -- -- // To do: save id and keep realloc as needed, as hash_s2i_inc strdups. -- char *id = malloc(l-j+1); -- strncpy(id, &cp[j], l-j); -- id[l-j] = 0; -- -- int added; -- hash_s2i_inc(rg2id, id, line, &added); -- free(id); -- -- while (l < h->l_text && cp[l] != '\n') -- l++; -- } -- -- return rg2id; --} -+#include "sam_opts.h" - - /* - * Check the files are consistent and capable of being concatenated. -- * Also fills out the rg2id read-group hash and the version numbers -- * and produces a new bam_hdr_t structure with merged RG lines. -- * Note it is only a simple merge, as we lack the niceties of a proper -- * header API. -+ * Also fills out the version numbers and produces a new sam_hdr_t -+ * structure with merged RG lines. -+ * Note it is only a simple merge. - * - * Returns updated header on success; - * NULL on failure. 
- */ --static bam_hdr_t *cram_cat_check_hdr(int nfn, char * const *fn, const bam_hdr_t *h, -- khash_s2i **rg2id, int *vers_maj_p, int *vers_min_p) { -+static sam_hdr_t *cram_cat_check_hdr(int nfn, char * const *fn, const sam_hdr_t *h, -+ int *vers_maj_p, int *vers_min_p) { - int i, vers_maj = -1, vers_min = -1; -- bam_hdr_t *new_h = NULL; -+ sam_hdr_t *new_h = NULL, *old_h = NULL; -+ samFile *in = NULL; -+ kstring_t ks = KS_INITIALIZE; - - if (h) { -- new_h = bam_hdr_dup(h); -- *rg2id = hash_rg(new_h); -+ new_h = sam_hdr_dup(h); -+ if (!new_h) { -+ fprintf(samtools_stderr, "[%s] ERROR: header duplication failed.\n", -+ __func__); -+ goto fail; -+ } - } - - for (i = 0; i < nfn; ++i) { -- samFile *in; - cram_fd *in_c; -- khint_t ki; -- int new_rg = -1; -+ int ki; - - in = sam_open(fn[i], "rc"); - if (in == 0) { - print_error_errno("cat", "fail to open file '%s'", fn[i]); -- return NULL; -+ goto fail; - } - in_c = in->fp.cram; - -@@ -212,55 +93,81 @@ - (vers_min != -1 && vers_min != vmin)) { - fprintf(samtools_stderr, "[%s] ERROR: input files have differing version numbers.\n", - __func__); -- return NULL; -+ goto fail; - } - vers_maj = vmaj; - vers_min = vmin; - -- bam_hdr_t *old = sam_hdr_read(in); -- khash_s2i *rg2id_in = hash_rg(old); -+ old_h = sam_hdr_read(in); -+ if (!old_h) { -+ fprintf(samtools_stderr, "[%s] ERROR: header reading for file '%s' filed.\n", -+ __func__, fn[i]); -+ goto fail; -+ } - - if (!new_h) { -- new_h = bam_hdr_dup(old); -- *rg2id = hash_rg(new_h); -+ new_h = sam_hdr_dup(old_h); -+ if (!new_h) { -+ fprintf(samtools_stderr, "[%s] ERROR: header duplication for file '%s' failed.\n", -+ __func__, fn[i]); -+ goto fail; -+ } -+ sam_hdr_destroy(old_h); -+ sam_close(in); -+ continue; - } - -- // Add any existing @RG entries to our global @RG hash. 
-- for (ki = 0; ki < rg2id_in->n_id; ki++) { -- int added; -- -- new_rg = hash_s2i_inc(*rg2id, rg2id_in->id[ki], rg2id_in->line[ki], &added); -- //fprintf(samtools_stderr, "RG %s: #%d -> #%d\n", -- // rg2id_in->id[ki], ki, new_rg); -- -- if (added) { -- // Also add to new_h -- const char *line = rg2id_in->line[ki]; -- const char *line_end = line; -- while (*line && *line_end++ != '\n') -- ; -- new_h->l_text += line_end - line; -- new_h->text = realloc(new_h->text, new_h->l_text+1); -- strncat(&new_h->text[new_h->l_text - (line_end - line)], -- line, line_end - line); -+ int old_count = sam_hdr_count_lines(old_h, "RG"); -+ for (ki = 0; ki < old_count; ki++) { -+ const char *old_name = sam_hdr_line_name(old_h, "RG", ki); -+ if (old_name) { -+ int new_i = sam_hdr_line_index(new_h, "RG", old_name); -+ if (-1 == new_i) { // line does not exist in the new header -+ if (sam_hdr_find_line_pos(old_h, "RG", ki, &ks) || -+ !ks.s || sam_hdr_add_lines(new_h, ks.s, ks.l)) { -+ fprintf(samtools_stderr, "[%s] ERROR: failed to add @RG line 'ID:%s' from file '%s'\n", -+ __func__, old_name, fn[i]); -+ goto fail; -+ } -+ ks_free(&ks); -+ } -+ } else { -+ fprintf(samtools_stderr, "[%s] ERROR: failed to read %d @RG line from file '%s'\n", -+ __func__, ki, fn[i]); -+ goto fail; - } -+ } - -- if (new_rg != ki && rg2id_in->n_id > 1) { -- fprintf(samtools_stderr, "[%s] ERROR: Same size @RG lists but differing order / contents\n", -- __func__); -- return NULL; -+ if (old_count > 1 && sam_hdr_count_lines(new_h, "RG") == old_count) { -+ for (ki = 0; ki < old_count; ki++) { -+ const char *old_name = sam_hdr_line_name(old_h, "RG", ki); -+ const char *new_name = sam_hdr_line_name(new_h, "RG", ki); -+ if (!old_name || !new_name || strcmp(old_name, new_name)) { -+ fprintf(samtools_stderr, "[%s] ERROR: Same size @RG lists but differing order / contents\n", -+ __func__); -+ goto fail; -+ } - } - } - -- hash_s2i_free(rg2id_in); -- bam_hdr_destroy(old); -+ sam_hdr_destroy(old_h); - sam_close(in); - } - 
-+ ks_free(&ks); -+ - *vers_maj_p = vers_maj; - *vers_min_p = vers_min; - - return new_h; -+ -+fail: -+ ks_free(&ks); -+ if (old_h) sam_hdr_destroy(old_h); -+ if (new_h) sam_hdr_destroy(new_h); -+ if (in) sam_close(in); -+ -+ return NULL; - } - - -@@ -291,22 +198,21 @@ - * huffman code. In this situation we can change the meta-data in the - * compression header to renumber an RG value.. - */ --int cram_cat(int nfn, char * const *fn, const bam_hdr_t *h, const char* outcram) -+int cram_cat(int nfn, char * const *fn, const sam_hdr_t *h, const char* outcram, sam_global_args *ga, char *arg_list, int no_pg) - { - samFile *out; - cram_fd *out_c; - int i, vers_maj, vers_min; -- khash_s2i *rg2id = NULL; -- bam_hdr_t *new_h = NULL; -+ sam_hdr_t *new_h = NULL; - - /* Check consistent versioning and compatible headers */ -- if (!(new_h = cram_cat_check_hdr(nfn, fn, h, &rg2id, &vers_maj, &vers_min))) -+ if (!(new_h = cram_cat_check_hdr(nfn, fn, h, &vers_maj, &vers_min))) - return -1; - - /* Open the file with cram_vers */ - char vers[100]; - sprintf(vers, "%d.%d", vers_maj, vers_min); -- out = sam_open(outcram, "wc"); -+ out = sam_open_format(outcram, "wc", &ga->out); - if (out == 0) { - print_error_errno("cat", "fail to open output file '%s'", outcram); - return -1; -@@ -315,7 +221,13 @@ - cram_set_option(out_c, CRAM_OPT_VERSION, vers); - //fprintf(samtools_stderr, "Creating cram vers %s\n", vers); - -- cram_fd_set_header(out_c, sam_hdr_parse_(new_h->text, new_h->l_text)); // needed? -+ if (!no_pg && sam_hdr_add_pg(new_h, "samtools", -+ "VN", samtools_version(), -+ arg_list ? "CL": NULL, -+ arg_list ? 
arg_list : NULL, -+ NULL)) -+ return -1; -+ - if (sam_hdr_write(out, new_h) < 0) { - print_error_errno("cat", "Couldn't write header"); - return -1; -@@ -325,7 +237,7 @@ - samFile *in; - cram_fd *in_c; - cram_container *c; -- bam_hdr_t *old; -+ sam_hdr_t *old_h; - int new_rg = -1; - - in = sam_open(fn[i], "rc"); -@@ -335,20 +247,29 @@ - } - in_c = in->fp.cram; - -- old = sam_hdr_read(in); -- khash_s2i *rg2id_in = hash_rg(old); -+ old_h = sam_hdr_read(in); -+ if (!old_h) { -+ print_error("cat", "fail to read the header of file '%s'", fn[i]); -+ return -1; -+ } - - // Compute RG mapping if suitable for changing. -- if (rg2id_in->n_id == 1) { -- int _; -- new_rg = hash_s2i_inc(rg2id, rg2id_in->id[0], NULL, &_); -+ if (sam_hdr_count_lines(old_h, "RG") == 1) { -+ const char *old_name = sam_hdr_line_name(old_h, "RG", 0); -+ if (old_name) { -+ new_rg = sam_hdr_line_index(new_h, "RG", old_name); -+ if (new_rg < 0) { -+ print_error("cat", "fail to find @RG line '%s' in the new header", old_name); -+ return -1; -+ } -+ } else { -+ print_error("cat", "fail to find @RG line in file '%s'", fn[i]); -+ return -1; -+ } - } else { - new_rg = 0; - } - -- hash_s2i_free(rg2id_in); -- -- - // Copy contains and blocks within them - while ((c = cram_read_container(in_c))) { - cram_block *blk; -@@ -402,13 +323,11 @@ - cram_free_container(c); - } - -- bam_hdr_destroy(old); -+ sam_hdr_destroy(old_h); - sam_close(in); - } - sam_close(out); -- -- hash_s2i_free(rg2id); -- bam_hdr_destroy(new_h); -+ sam_hdr_destroy(new_h); - - return 0; - } -@@ -421,7 +340,7 @@ - - #define BGZF_EMPTY_BLOCK_SIZE 28 - --int bam_cat(int nfn, char * const *fn, const bam_hdr_t *h, const char* outbam) -+int bam_cat(int nfn, char * const *fn, sam_hdr_t *h, const char* outbam, char *arg_list, int no_pg) - { - BGZF *fp, *in = NULL; - uint8_t *buf = NULL; -@@ -435,6 +354,13 @@ - return -1; - } - if (h) { -+ if (!no_pg && sam_hdr_add_pg(h, "samtools", -+ "VN", samtools_version(), -+ arg_list ? "CL": NULL, -+ arg_list ? 
arg_list : NULL, -+ NULL)) -+ goto fail; -+ - if (bam_hdr_write(fp, h) < 0) { - print_error_errno("cat", "Couldn't write header"); - goto fail; -@@ -447,7 +373,7 @@ - goto fail; - } - for(i = 0; i < nfn; ++i){ -- bam_hdr_t *old; -+ sam_hdr_t *old; - int len,j; - - in = strcmp(fn[i], "-")? bgzf_open(fn[i], "r") : bgzf_fdopen(fileno(stdin), "r"); -@@ -464,6 +390,13 @@ - goto fail; - } - if (h == 0 && i == 0) { -+ if (!no_pg && sam_hdr_add_pg(old, "samtools", -+ "VN", samtools_version(), -+ arg_list ? "CL": NULL, -+ arg_list ? arg_list : NULL, -+ NULL)) -+ goto fail; -+ - if (bam_hdr_write(fp, old) < 0) { - print_error_errno("cat", "Couldn't write header"); - goto fail; -@@ -509,7 +442,7 @@ - if (bgzf_raw_write(fp, ebuf, es) < 0) goto write_fail; - } - } -- bam_hdr_destroy(old); -+ sam_hdr_destroy(old); - bgzf_close(in); - in = NULL; - } -@@ -532,14 +465,25 @@ - - int main_cat(int argc, char *argv[]) - { -- bam_hdr_t *h = 0; -+ sam_hdr_t *h = 0; - char *outfn = 0; - char **infns = NULL; // files to concatenate - int infns_size = 0; -- int c, ret = 0; -+ int c, ret = 0, no_pg = 0; - samFile *in; -+ sam_global_args ga; -+ -+ static const struct option lopts[] = { -+ SAM_OPT_GLOBAL_OPTIONS('-', '-', '-', 0, '-', '@'), -+ {"no-PG", no_argument, NULL, 1}, -+ { NULL, 0, NULL, 0 } -+ }; -+ -+ char *arg_list = NULL; - -- while ((c = getopt(argc, argv, "h:o:b:")) >= 0) { -+ sam_global_args_init(&ga); -+ -+ while ((c = getopt_long(argc, argv, "h:o:b:", lopts, NULL)) >= 0) { - switch (c) { - case 'h': { - samFile *fph = sam_open(optarg, "r"); -@@ -575,9 +519,19 @@ - } - break; - } -+ case 1: -+ no_pg = 1; -+ break; -+ default: -+ if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; - } - } - -+ if (!no_pg && !(arg_list = stringify_argv(argc+1, argv-1))) { -+ print_error("cat", "failed to create arg_list"); -+ return 1; -+ } -+ - // Append files specified in argv to the list. 
- int nargv_fns = argc - optind; - if (nargv_fns > 0) { -@@ -594,6 +548,8 @@ - fprintf(samtools_stderr, "Options: -b FILE list of input BAM/CRAM file names, one per line\n"); - fprintf(samtools_stderr, " -h FILE copy the header from FILE [default is 1st input file]\n"); - fprintf(samtools_stderr, " -o FILE output BAM/CRAM\n"); -+ fprintf(samtools_stderr, " --no-PG do not add a PG line\n"); -+ sam_global_opt_help(samtools_stderr, "--..-@-."); - return 1; - } - -@@ -606,13 +562,13 @@ - switch (hts_get_format(in)->format) { - case bam: - sam_close(in); -- if (bam_cat(infns_size+nargv_fns, infns, h, outfn? outfn : "-") < 0) -+ if (bam_cat(infns_size+nargv_fns, infns, h, outfn? outfn : "-", arg_list, no_pg) < 0) - ret = 1; - break; - - case cram: - sam_close(in); -- if (cram_cat(infns_size+nargv_fns, infns, h, outfn? outfn : "-") < 0) -+ if (cram_cat(infns_size+nargv_fns, infns, h, outfn? outfn : "-", &ga, arg_list, no_pg) < 0) - ret = 1; - break; - -@@ -631,9 +587,9 @@ - - free(outfn); - free(infns); -- -+ free(arg_list); - if (h) -- bam_hdr_destroy(h); -+ sam_hdr_destroy(h); - - return ret; - } ---- /dev/null -+++ python-pysam/samtools/bam_fastq.c -@@ -0,0 +1,1037 @@ -+/* bam_fastq.c -- FASTA and FASTQ file generation -+ -+ Copyright (C) 2009-2017, 2019 Genome Research Ltd. -+ Portions copyright (C) 2009, 2011, 2012 Broad Institute. -+ -+ Author: Heng Li -+ -+Permission is hereby granted, free of charge, to any person obtaining a copy -+of this software and associated documentation files (the "Software"), to deal -+in the Software without restriction, including without limitation the rights -+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -+copies of the Software, and to permit persons to whom the Software is -+furnished to do so, subject to the following conditions: -+ -+The above copyright notices and this permission notice shall be included in -+all copies or substantial portions of the Software. 
-+ -+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -+DEALINGS IN THE SOFTWARE. */ -+ -+#include -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include "htslib/sam.h" -+#include "htslib/klist.h" -+#include "htslib/kstring.h" -+#include "htslib/bgzf.h" -+#include "htslib/thread_pool.h" -+#include "samtools.h" -+#include "sam_opts.h" -+ -+#define taglist_free(p) -+KLIST_INIT(ktaglist, char*, taglist_free) -+ -+#define DEFAULT_BARCODE_TAG "BC" -+#define DEFAULT_QUALITY_TAG "QT" -+#define INDEX_SEPARATOR "+" -+ -+int8_t seq_comp_table[16] = { 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15 }; -+static const char *copied_tags[] = { "RG", "BC", "QT", NULL }; -+ -+static void bam2fq_usage(FILE *to, const char *command) -+{ -+ int fq = strcasecmp("fastq", command) == 0 || strcasecmp("bam2fq", command) == 0; -+ fprintf(to, -+"Usage: samtools %s [options...] 
\n", command); -+ fprintf(to, -+"\n" -+"Description:\n" -+"Converts a SAM, BAM or CRAM into either FASTQ or FASTA format depending on the command invoked.\n" -+"\n" -+"Options:\n" -+" -0 FILE write reads designated READ_OTHER to FILE\n" -+" -1 FILE write reads designated READ1 to FILE\n" -+" -2 FILE write reads designated READ2 to FILE\n" -+" -o FILE write reads designated READ1 or READ2 to FILE\n" -+" note: if a singleton file is specified with -s, only\n" -+" paired reads will be written to the -1 and -2 files.\n" -+" -f INT only include reads with all of the FLAGs in INT present [0]\n" // F&x == x -+" -F INT only include reads with none of the FLAGS in INT present [0x900]\n" // F&x == 0 -+" -G INT only EXCLUDE reads with all of the FLAGs in INT present [0]\n" // !(F&x == x) -+" -n don't append /1 and /2 to the read name\n" -+" -N always append /1 and /2 to the read name\n"); -+ if (fq) fprintf(to, -+" -O output quality in the OQ tag if present\n"); -+ fprintf(to, -+" -s FILE write singleton reads designated READ1 or READ2 to FILE\n" -+" -t copy RG, BC and QT tags to the %s header line\n", -+ fq ? "FASTQ" : "FASTA"); -+ fprintf(to, -+" -T TAGLIST copy arbitrary tags to the %s header line\n", -+ fq ? 
"FASTQ" : "FASTA"); -+ if (fq) fprintf(to, -+" -v INT default quality score if not given in file [1]\n" -+" -i add Illumina Casava 1.8 format entry to header (eg 1:N:0:ATCACG)\n" -+" -c compression level [0..9] to use when creating gz or bgzf fastq files [1]\n" -+" --i1 FILE write first index reads to FILE\n" -+" --i2 FILE write second index reads to FILE\n" -+" --barcode-tag TAG Barcode tag [default: " DEFAULT_BARCODE_TAG "]\n" -+" --quality-tag TAG Quality tag [default: " DEFAULT_QUALITY_TAG "]\n" -+" --index-format STR How to parse barcode and quality tags\n\n"); -+ sam_global_opt_help(to, "-.--.@-."); -+ fprintf(to, -+"\n" -+"The files will be automatically compressed if the file names have a .gz or .bgzf extension.\n" -+"The input to this program must be collated by name. Run 'samtools collate' or 'samtools sort -n'.\n" -+"\n" -+"Reads are designated READ1 if FLAG READ1 is set and READ2 is not set.\n" -+"Reads are designated READ2 if FLAG READ1 is not set and READ2 is set.\n" -+"Reads are designated READ_OTHER if FLAGs READ1 and READ2 are either both set\n" -+"or both unset.\n" -+"Run 'samtools flags' for more information on flag codes and meanings.\n"); -+ fprintf(to, -+"\n" -+"The index-format string describes how to parse the barcode and quality tags, for example:\n" -+" i14i8 the first 14 characters are index 1, the next 8 characters are index 2\n" -+" n8i14 ignore the first 8 characters, and use the next 14 characters for index 1\n" -+"If the tag contains a separator, then the numeric part can be replaced with '*' to mean\n" -+"'read until the separator or end of tag', for example:\n" -+" n*i* ignore the left part of the tag until the separator, then use the second part\n" -+" of the tag as index 1\n"); -+ fprintf(to, -+"\n" -+"Examples:\n" -+" To get just the paired reads in separate files, use:\n" -+" samtools %s -1 paired1.%s -2 paired2.%s -0 /dev/null -s /dev/null -n in.bam\n" -+"\n To get all non-supplementary/secondary reads in a single file, 
redirect the output:\n" -+" samtools %s in.bam > all_reads.%s\n", -+ command, fq ? "fq" : "fa", fq ? "fq" : "fa", -+ command, fq ? "fq" : "fa"); -+} -+ -+typedef enum { READ_UNKNOWN = 0, READ_1 = 1, READ_2 = 2 } readpart; -+typedef enum { FASTA, FASTQ } fastfile; -+typedef struct bam2fq_opts { -+ char *fnse; -+ char *fnr[3]; -+ char *fn_input; // pointer to input filename in argv do not free -+ bool has12, has12always, use_oq, copy_tags, illumina_tag; -+ int flag_on, flag_off, flag_alloff; -+ sam_global_args ga; -+ fastfile filetype; -+ int def_qual; -+ char *barcode_tag; -+ char *quality_tag; -+ char *index_file[2]; -+ char *index_format; -+ char *extra_tags; -+ char compression_level; -+} bam2fq_opts_t; -+ -+typedef struct bam2fq_state { -+ samFile *fp; -+ BGZF *fpse; -+ BGZF *fpr[3]; -+ BGZF *fpi[2]; -+ BGZF *hstdout; -+ sam_hdr_t *h; -+ bool has12, use_oq, copy_tags, illumina_tag; -+ int flag_on, flag_off, flag_alloff; -+ fastfile filetype; -+ int def_qual; -+ klist_t(ktaglist) *taglist; -+ char *index_sequence; -+ char compression_level; -+ htsThreadPool p; -+} bam2fq_state_t; -+ -+/* -+ * Get and decode the read from a BAM record. -+ * -+ * TODO: htslib really needs an interface for this. Consider this or perhaps -+ * bam_get_seq_str (current vs original orientation) and bam_get_qual_str -+ * functions as string formatted equivalents to bam_get_{seq,qual}? -+ */ -+ -+/* -+ * Reverse a string in place. -+ * From http://stackoverflow.com/questions/8534274/is-the-strrev-function-not-available-in-linux. 
-+ * Author Sumit-naik: http://stackoverflow.com/users/4590926/sumit-naik -+ */ -+static char *reverse(char *str) -+{ -+ int i = strlen(str)-1,j=0; -+ char ch; -+ while (i>j) { -+ ch = str[i]; -+ str[i]= str[j]; -+ str[j] = ch; -+ i--; -+ j++; -+ } -+ return str; -+} -+ -+/* return the read, reverse complemented if necessary */ -+static char *get_read(const bam1_t *rec) -+{ -+ int len = rec->core.l_qseq + 1; -+ char *read = calloc(1, len); -+ char *seq = (char *)bam_get_seq(rec); -+ int n; -+ -+ if (!read) return NULL; -+ -+ for (n=0; n < rec->core.l_qseq; n++) { -+ if (rec->core.flag & BAM_FREVERSE) read[n] = seq_nt16_str[seq_comp_table[bam_seqi(seq,n)]]; -+ else read[n] = seq_nt16_str[bam_seqi(seq,n)]; -+ } -+ if (rec->core.flag & BAM_FREVERSE) reverse(read); -+ return read; -+} -+ -+/* -+ * get and decode the quality from a BAM record -+ */ -+static int get_quality(const bam1_t *rec, char **qual_out) -+{ -+ char *quality = calloc(1, rec->core.l_qseq + 1); -+ char *q = (char *)bam_get_qual(rec); -+ int n; -+ -+ if (!quality) return -1; -+ -+ if (*q == '\xff') { -+ free(quality); -+ *qual_out = NULL; -+ return 0; -+ } -+ -+ for (n=0; n < rec->core.l_qseq; n++) { -+ quality[n] = q[n]+33; -+ } -+ if (rec->core.flag & BAM_FREVERSE) reverse(quality); -+ *qual_out = quality; -+ return 0; -+} -+ -+// -+// End of htslib complaints -+// -+ -+ -+static readpart which_readpart(const bam1_t *b) -+{ -+ if ((b->core.flag & BAM_FREAD1) && !(b->core.flag & BAM_FREAD2)) { -+ return READ_1; -+ } else if ((b->core.flag & BAM_FREAD2) && !(b->core.flag & BAM_FREAD1)) { -+ return READ_2; -+ } else { -+ return READ_UNKNOWN; -+ } -+} -+ -+/* -+ * parse the length part from the index-format string -+ */ -+static int getLength(char **s) -+{ -+ int n = 0; -+ while (**s) { -+ if (**s == '*') { n=-1; (*s)++; break; } -+ if ( !isdigit(**s)) break; -+ n = n*10 + ((**s)-'0'); -+ (*s)++; -+ } -+ return n; -+} -+ -+static bool copy_tag(const char *tag, const bam1_t *rec, kstring_t *linebuf) -+{ 
-+ uint8_t *s = bam_aux_get(rec, tag); -+ if (s) { -+ char aux_type = *s; -+ switch (aux_type) { -+ case 'C': -+ case 'S': aux_type = 'I'; break; -+ case 'c': -+ case 's': aux_type = 'i'; break; -+ case 'd': aux_type = 'f'; break; -+ } -+ -+ // Ensure space. Need 6 chars + length of tag. Max length of -+ // i is 16, A is 21, B currently 26, Z is unknown, so -+ // have to check that one later. -+ if (ks_resize(linebuf, ks_len(linebuf) + 64) < 0) return false; -+ -+ kputc('\t', linebuf); -+ kputsn(tag, 2, linebuf); -+ kputc(':', linebuf); -+ kputc(aux_type=='I'? 'i': aux_type, linebuf); -+ kputc(':', linebuf); -+ switch (aux_type) { -+ case 'H': -+ case 'Z': -+ if (kputs(bam_aux2Z(s), linebuf) < 0) return false; -+ break; -+ case 'i': kputw(bam_aux2i(s), linebuf); break; -+ case 'I': kputuw(bam_aux2i(s), linebuf); break; -+ case 'A': kputc(bam_aux2A(s), linebuf); break; -+ case 'f': kputd(bam_aux2f(s), linebuf); break; -+ case 'B': kputs("*** Unhandled aux type ***", linebuf); return false; -+ default: kputs("*** Unknown aux type ***", linebuf); return false; -+ } -+ } -+ return true; -+} -+ -+static int insert_index_sequence_into_linebuf(char *index_sequence, kstring_t *linebuf, bam1_t *rec) -+{ -+ if (!index_sequence) return 0; -+ -+ kstring_t new = {0,0,NULL}; -+ if (linebuf->s) { -+ char *s = strchr(linebuf->s, '\n'); -+ if (s) { -+ if (ks_resize(&new, linebuf->l + strlen(index_sequence) + 16) < 0) -+ return -1; -+ *s = 0; -+ kputs(linebuf->s, &new); -+ kputc(' ', &new); -+ readpart readpart = which_readpart(rec); -+ if (readpart == READ_1) kputc('1', &new); -+ else if (readpart == READ_2) kputc('2', &new); -+ else kputc('0', &new); -+ -+ kputc(':', &new); -+ if (rec->core.flag & BAM_FQCFAIL) kputc('Y', &new); -+ else kputc('N', &new); -+ -+ kputs(":0:", &new); -+ kputs(index_sequence, &new); -+ kputc('\n', &new); -+ kputs(s+1, &new); -+ free(ks_release(linebuf)); -+ linebuf->s = new.s; linebuf->l = new.l; linebuf->m = new.m; -+ } -+ } -+ return 0; -+} -+ 
-+static bool make_fq_line(const bam1_t *rec, char *seq, char *qual, kstring_t *linebuf, const bam2fq_state_t *state) -+{ -+ int i; -+ -+ linebuf->l = 0; -+ // Write read name -+ if (kputc(state->filetype == FASTA? '>' : '@', linebuf) < 0) return false; -+ if (kputs(bam_get_qname(rec), linebuf) < 0) return false; -+ // Add the /1 /2 if requested -+ if (state->has12) { -+ readpart readpart = which_readpart(rec); -+ if (readpart == READ_1) { -+ if (kputs("/1", linebuf) < 0) return false; -+ } else if (readpart == READ_2) { -+ if (kputs("/2", linebuf) < 0) return false; -+ } -+ } -+ if (state->copy_tags) { -+ for (i = 0; copied_tags[i]; ++i) { -+ if (!copy_tag(copied_tags[i], rec, linebuf)) { -+ fprintf(stderr, "Problem copying aux tags: [%s]\n", linebuf->s); -+ return false; -+ } -+ } -+ } -+ -+ if (state->taglist->size) { -+ kliter_t(ktaglist) *p; -+ for (p = kl_begin(state->taglist); p != kl_end(state->taglist); p = kl_next(p)) { -+ if (!copy_tag(kl_val(p), rec, linebuf)) { -+ fprintf(stderr, "Problem copying aux tags: [%s]\n", linebuf->s); -+ return false; -+ } -+ } -+ } -+ -+ if (kputc('\n', linebuf) < 0) return false; -+ if (kputs(seq, linebuf) < 0) return false; -+ if (kputc('\n', linebuf) < 0) return false; -+ -+ if (state->filetype == FASTQ) { -+ // Write quality -+ if (kputs("+\n", linebuf) < 0) return false; -+ if (qual && *qual) { -+ if (kputs(qual, linebuf) < 0) return false; -+ } else { -+ int len = strlen(seq); -+ if (ks_resize(linebuf, ks_len(linebuf) + len + 1) < 0) return false; -+ for (i = 0; i < len; ++i) { -+ kputc(33 + state->def_qual, linebuf); -+ } -+ } -+ if (kputc('\n', linebuf) < 0) return false; -+ } -+ return true; -+} -+ -+/* -+ * Create FASTQ lines from the barcode tag using the index-format -+ */ -+static bool tags2fq(bam1_t *rec, bam2fq_state_t *state, const bam2fq_opts_t* opts) -+{ -+ uint8_t *p; -+ char *ifmt = opts->index_format; -+ char *tag = NULL; -+ char *qual = NULL; -+ char *sub_tag = NULL; -+ char *sub_qual = NULL; -+ size_t 
tag_len; -+ int file_number = 0; -+ kstring_t linebuf = { 0, 0, NULL }; // Buffer -+ -+ if (!ifmt) return true; -+ -+ // read barcode tag -+ p = bam_aux_get(rec,opts->barcode_tag); -+ if (p) tag = bam_aux2Z(p); -+ -+ if (!tag) return true; // there is no tag -+ -+ tag_len = strlen(tag); -+ sub_tag = calloc(1, tag_len + 1); -+ if (!sub_tag) goto fail; -+ sub_qual = calloc(1, tag_len + 1); -+ if (!sub_qual) goto fail; -+ -+ // read quality tag -+ p = bam_aux_get(rec, opts->quality_tag); -+ if (p) qual = bam_aux2Z(p); -+ -+ // Parse the index-format string -+ while (*ifmt) { -+ if (file_number > 1) break; // shouldn't happen if we've validated paramaters correctly -+ char action = *ifmt; // should be 'i' or 'n' -+ ifmt++; // skip over action -+ int index_len = getLength(&ifmt); -+ int n = 0; -+ -+ if (index_len < 0) { -+ // read until separator -+ while (isalpha(*tag)) { -+ sub_tag[n] = *tag++; -+ if (qual) sub_qual[n] = *qual++; -+ n++; -+ } -+ if (*tag) { // skip separator -+ tag++; -+ if (qual) qual++; -+ } -+ } else { -+ // read index_len characters -+ while (index_len-- && *tag) { -+ sub_tag[n] = *tag++; -+ if (qual) sub_qual[n] = *qual++; -+ n++; -+ } -+ } -+ sub_tag[n] = '\0'; -+ sub_qual[n] = '\0'; -+ -+ if (action=='i' && *sub_tag) { -+ if (state->index_sequence) { -+ char *new_index_sequence = realloc(state->index_sequence, strlen(state->index_sequence) + strlen(sub_tag) + 2); -+ if (!new_index_sequence) goto fail; -+ state->index_sequence = new_index_sequence; -+ strcat(state->index_sequence, INDEX_SEPARATOR); -+ strcat(state->index_sequence, sub_tag); -+ } else { -+ state->index_sequence = strdup(sub_tag); // we're going to need this later... 
-+ } -+ if (!state->index_sequence) goto fail; -+ if (!make_fq_line(rec, sub_tag, sub_qual, &linebuf, state)) goto fail; -+ if (state->illumina_tag) { -+ if (insert_index_sequence_into_linebuf(sub_tag, &linebuf, rec) < 0) { -+ goto fail; -+ } -+ } -+ if (state->fpi[file_number]) { -+ if (bgzf_write(state->fpi[file_number++], linebuf.s, linebuf.l) < 0) -+ goto fail; -+ } -+ } -+ -+ } -+ -+ free(sub_qual); free(sub_tag); -+ free(linebuf.s); -+ return true; -+ -+ fail: -+ perror(__func__); -+ free(sub_qual); free(sub_tag); -+ free(linebuf.s); -+ return false; -+} -+ -+// Transform a bam1_t record into a string with the FASTQ representation of it -+// @returns false for error, true for success -+static bool bam1_to_fq(const bam1_t *b, kstring_t *linebuf, const bam2fq_state_t *state) -+{ -+ int32_t qlen = b->core.l_qseq; -+ assert(qlen >= 0); -+ const uint8_t *oq = NULL; -+ char *qual = NULL; -+ -+ char *seq = get_read(b); -+ if (!seq) return false; -+ -+ if (state->use_oq) oq = bam_aux_get(b, "OQ"); -+ if (oq && *oq=='Z') { -+ qual = strdup(bam_aux2Z(oq)); -+ if (!qual) goto fail; -+ if (b->core.flag & BAM_FREVERSE) { // read is reverse complemented -+ reverse(qual); -+ } -+ } else { -+ if (get_quality(b, &qual) < 0) goto fail; -+ } -+ -+ if (!make_fq_line(b, seq, qual, linebuf, state)) goto fail; -+ -+ free(qual); -+ free(seq); -+ return true; -+ -+ fail: -+ free(seq); -+ free(qual); -+ return false; -+} -+ -+static void free_opts(bam2fq_opts_t *opts) -+{ -+ free(opts->barcode_tag); -+ free(opts->quality_tag); -+ free(opts->index_format); -+ free(opts->extra_tags); -+ free(opts); -+} -+ -+// return true if valid -+static bool parse_opts(int argc, char *argv[], bam2fq_opts_t** opts_out) -+{ -+ // Parse args -+ bam2fq_opts_t* opts = calloc(1, sizeof(bam2fq_opts_t)); -+ opts->has12 = true; -+ opts->has12always = false; -+ opts->filetype = FASTQ; -+ opts->def_qual = 1; -+ opts->barcode_tag = NULL; -+ opts->quality_tag = NULL; -+ opts->index_format = NULL; -+ 
opts->index_file[0] = NULL; -+ opts->index_file[1] = NULL; -+ opts->extra_tags = NULL; -+ opts->compression_level = 1; -+ opts->flag_off = BAM_FSECONDARY|BAM_FSUPPLEMENTARY; -+ int flag_off_set = 0; -+ -+ int c; -+ sam_global_args_init(&opts->ga); -+ static const struct option lopts[] = { -+ SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0, '@'), -+ {"i1", required_argument, NULL, 1}, -+ {"I1", required_argument, NULL, 1}, -+ {"i2", required_argument, NULL, 2}, -+ {"I2", required_argument, NULL, 2}, -+ {"if", required_argument, NULL, 3}, -+ {"IF", required_argument, NULL, 3}, -+ {"index-format", required_argument, NULL, 3}, -+ {"barcode-tag", required_argument, NULL, 'b'}, -+ {"quality-tag", required_argument, NULL, 'q'}, -+ { NULL, 0, NULL, 0 } -+ }; -+ while ((c = getopt_long(argc, argv, "0:1:2:o:f:F:G:niNOs:c:tT:v:@:", lopts, NULL)) > 0) { -+ switch (c) { -+ case 'b': opts->barcode_tag = strdup(optarg); break; -+ case 'q': opts->quality_tag = strdup(optarg); break; -+ case 1 : opts->index_file[0] = optarg; break; -+ case 2 : opts->index_file[1] = optarg; break; -+ case 3 : opts->index_format = strdup(optarg); break; -+ case '0': opts->fnr[0] = optarg; break; -+ case '1': opts->fnr[1] = optarg; break; -+ case '2': opts->fnr[2] = optarg; break; -+ case 'o': opts->fnr[1] = optarg; opts->fnr[2] = optarg; break; -+ case 'f': opts->flag_on |= strtol(optarg, 0, 0); break; -+ case 'F': -+ if (!flag_off_set) { -+ flag_off_set = 1; -+ opts->flag_off = 0; -+ } -+ opts->flag_off |= strtol(optarg, 0, 0); break; -+ case 'G': opts->flag_alloff |= strtol(optarg, 0, 0); break; -+ case 'n': opts->has12 = false; break; -+ case 'N': opts->has12always = true; break; -+ case 'O': opts->use_oq = true; break; -+ case 's': opts->fnse = optarg; break; -+ case 't': opts->copy_tags = true; break; -+ case 'i': opts->illumina_tag = true; break; -+ case 'c': opts->compression_level = atoi(optarg); break; -+ case 'T': opts->extra_tags = strdup(optarg); break; -+ case 'v': opts->def_qual = 
atoi(optarg); break; -+ case '?': bam2fq_usage(stderr, argv[0]); free_opts(opts); return false; -+ default: -+ if (parse_sam_global_opt(c, optarg, lopts, &opts->ga) != 0) { -+ bam2fq_usage(stderr, argv[0]); free_opts(opts); return false; -+ } -+ break; -+ } -+ } -+ -+ if (opts->fnr[1] || opts->fnr[2]) opts->has12 = false; -+ if (opts->has12always) opts->has12 = true; -+ -+ if (!opts->barcode_tag) opts->barcode_tag = strdup(DEFAULT_BARCODE_TAG); -+ if (!opts->quality_tag) opts->quality_tag = strdup(DEFAULT_QUALITY_TAG); -+ -+ int nIndex = 0; -+ if (opts->index_format) { -+ char *s; -+ for (s = opts->index_format; *s; s++) { -+ if (*s == 'i') nIndex++; -+ } -+ } -+ if (nIndex>2) { -+ fprintf(stderr,"Invalid index format: more than 2 indexes\n"); -+ bam2fq_usage(stderr, argv[0]); -+ free_opts(opts); -+ return false; -+ } -+ -+ if (opts->index_file[1] && !opts->index_file[0]) { -+ fprintf(stderr, "Index one specified, but index two not given\n"); -+ bam2fq_usage(stderr, argv[0]); -+ free_opts(opts); -+ return false; -+ } -+ -+ if (opts->illumina_tag && !nIndex) { -+ fprintf(stderr, "You must specify an index format (--index-format) with the Illumina Casava (-i) option\n"); -+ bam2fq_usage(stderr, argv[0]); -+ free_opts(opts); -+ return false; -+ } -+ -+ if (nIndex==0 && opts->index_file[0]) { -+ fprintf(stderr, "index_format not specified, but index file given\n"); -+ bam2fq_usage(stderr, argv[0]); -+ free_opts(opts); -+ return false; -+ } -+ -+ if (opts->def_qual < 0 || 93 < opts->def_qual) { -+ fprintf(stderr, "Invalid -v default quality %i, allowed range 0 to 93\n", opts->def_qual); -+ bam2fq_usage(stderr, argv[0]); -+ free_opts(opts); -+ return false; -+ } -+ -+ const char* type_str = argv[0]; -+ if (strcasecmp("fastq", type_str) == 0 || strcasecmp("bam2fq", type_str) == 0) { -+ opts->filetype = FASTQ; -+ } else if (strcasecmp("fasta", type_str) == 0) { -+ opts->filetype = FASTA; -+ } else { -+ print_error("bam2fq", "Unrecognised type call \"%s\", this should be 
impossible... but you managed it!", type_str); -+ bam2fq_usage(stderr, argv[0]); -+ free_opts(opts); -+ return false; -+ } -+ -+ if (argc == optind && isatty(STDIN_FILENO)) { -+ bam2fq_usage(stdout, argv[0]); -+ free_opts(opts); -+ return true; -+ } -+ -+ if (argc - optind > 1) { -+ fprintf(stderr, "Too many arguments.\n"); -+ bam2fq_usage(stderr, argv[0]); -+ free_opts(opts); -+ return false; -+ } -+ opts->fn_input = argc > optind ? argv[optind] : "-"; -+ *opts_out = opts; -+ return true; -+} -+ -+static BGZF *open_fqfile(char *filename, int c, htsThreadPool *tp) -+{ -+ char mode[4] = "w"; -+ size_t len = strlen(filename); -+ -+ mode[2] = 0; mode[3] = 0; -+ if (len > 3 && strstr(filename + (len - 3),".gz")) { -+ mode[1] = 'g'; mode[2] = c+'0'; -+ } else if ((len > 4 && strstr(filename + (len - 4),".bgz")) -+ || (len > 5 && strstr(filename + (len - 5),".bgzf"))) { -+ mode[1] = c+'0'; -+ } else { -+ mode[1] = 'u'; -+ } -+ -+ BGZF *fp = bgzf_open(filename,mode); -+ if (!fp) -+ return fp; -+ if (tp->pool && bgzf_thread_pool(fp, tp->pool, tp->qsize) < 0) { -+ bgzf_close(fp); -+ return NULL; -+ } -+ return fp; -+} -+ -+static bool init_state(const bam2fq_opts_t* opts, bam2fq_state_t** state_out) -+{ -+ bam2fq_state_t* state = calloc(1, sizeof(bam2fq_state_t)); -+ state->flag_on = opts->flag_on; -+ state->flag_off = opts->flag_off; -+ state->flag_alloff = opts->flag_alloff; -+ state->has12 = opts->has12; -+ state->use_oq = opts->use_oq; -+ state->illumina_tag = opts->illumina_tag; -+ state->copy_tags = opts->copy_tags; -+ state->filetype = opts->filetype; -+ state->def_qual = opts->def_qual; -+ state->index_sequence = NULL; -+ state->hstdout = NULL; -+ state->compression_level = opts->compression_level; -+ -+ state->taglist = kl_init(ktaglist); -+ if (opts->extra_tags) { -+ char *save_p; -+ char *s = strtok_r(opts->extra_tags, ",", &save_p); -+ while (s) { -+ if (strlen(s) != 2) { -+ fprintf(stderr, "Parsing extra tags - '%s' is not two characters\n", s); -+ free(state); 
-+ return false; -+ } -+ char **et = kl_pushp(ktaglist, state->taglist); -+ *et = s; -+ s = strtok_r(NULL, ",", &save_p); -+ } -+ } -+ -+ state->fp = sam_open(opts->fn_input, "r"); -+ if (state->fp == NULL) { -+ print_error_errno("bam2fq","Cannot read file \"%s\"", opts->fn_input); -+ free(state); -+ return false; -+ } -+ -+ state->p.pool = NULL; -+ if (opts->ga.nthreads > 0) { -+ if (!(state->p.pool = hts_tpool_init(opts->ga.nthreads))) { -+ fprintf(stderr, "Failed to create thread pool\n"); -+ free(state); -+ return false; -+ } -+ state->p.qsize = opts->ga.nthreads*2; -+ hts_set_thread_pool(state->fp, &state->p); -+ } -+ -+ uint32_t rf = SAM_QNAME | SAM_FLAG | SAM_SEQ | SAM_QUAL; -+ if (opts->use_oq || opts->extra_tags || opts->index_file[0]) rf |= SAM_AUX; -+ if (hts_set_opt(state->fp, CRAM_OPT_REQUIRED_FIELDS, rf)) { -+ fprintf(stderr, "Failed to set CRAM_OPT_REQUIRED_FIELDS value\n"); -+ free(state); -+ return false; -+ } -+ if (hts_set_opt(state->fp, CRAM_OPT_DECODE_MD, 0)) { -+ fprintf(stderr, "Failed to set CRAM_OPT_DECODE_MD value\n"); -+ free(state); -+ return false; -+ } -+ if (opts->fnse) { -+ state->fpse = open_fqfile(opts->fnse, state->compression_level, &state->p); -+ if (state->fpse == NULL) { -+ print_error_errno("bam2fq", "Cannot write to singleton file \"%s\"", opts->fnse); -+ free(state); -+ return false; -+ } -+ } -+ -+ if (opts->ga.reference) { -+ if (hts_set_fai_filename(state->fp, opts->ga.reference) != 0) { -+ print_error_errno("bam2fq", "cannot load reference \"%s\"", opts->ga.reference); -+ free(state); -+ return false; -+ } -+ } -+ -+ int i, j; -+ for (i = 0; i < 3; ++i) { -+ if (opts->fnr[i]) { -+ for (j = 0; j < i; j++) -+ if (opts->fnr[j] && strcmp(opts->fnr[j], opts->fnr[i]) == 0) -+ break; -+ if (j == i) { -+ state->fpr[i] = open_fqfile(opts->fnr[i], state->compression_level, &state->p); -+ if (state->fpr[i] == NULL) { -+ print_error_errno("bam2fq", "Cannot write to r%d file \"%s\"", -+ i, opts->fnr[i]); -+ free(state); -+ return 
false; -+ } -+ } else { -+ state->fpr[i] = state->fpr[j]; -+ } -+ } else { -+ if (!state->hstdout) { -+ state->hstdout = bgzf_dopen(fileno(stdout), "wu"); -+ if (!state->hstdout) { -+ print_error_errno("bam2fq", "Cannot open STDOUT"); -+ free(state); -+ return false; -+ } -+ } -+ state->fpr[i] = state->hstdout; -+ } -+ } -+ for (i = 0; i < 2; i++) { -+ state->fpi[i] = NULL; -+ if (opts->index_file[i]) { -+ for (j = 0; j < 3; j++) -+ if (opts->fnr[j] && strcmp(opts->fnr[j], opts->index_file[i]) == 0) -+ break; -+ for (j -= 3; j >= 0 && j < i; j++) -+ if (opts->index_file[j] && strcmp(opts->index_file[j], opts->index_file[i]) == 0) -+ break; -+ if (i == j) { -+ state->fpi[i] = open_fqfile(opts->index_file[i], state->compression_level, &state->p); -+ if (state->fpi[i] == NULL) { -+ print_error_errno("bam2fq", "Cannot write to i%d file \"%s\"", -+ i+1, opts->index_file[i]); -+ free(state); -+ return false; -+ } -+ } else if (j < 0) { -+ state->fpi[i] = state->fpr[j+3]; -+ } else { -+ state->fpi[i] = state->fpi[j]; -+ } -+ } -+ } -+ -+ state->h = sam_hdr_read(state->fp); -+ if (state->h == NULL) { -+ fprintf(stderr, "Failed to read header for \"%s\"\n", opts->fn_input); -+ free(state); -+ return false; -+ } -+ -+ *state_out = state; -+ return true; -+} -+ -+static bool destroy_state(const bam2fq_opts_t *opts, bam2fq_state_t *state, int* status) -+{ -+ bool valid = true; -+ sam_hdr_destroy(state->h); -+ check_sam_close("bam2fq", state->fp, opts->fn_input, "file", status); -+ if (state->fpse && bgzf_close(state->fpse)) { print_error_errno("bam2fq", "Error closing singleton file \"%s\"", opts->fnse); valid = false; } -+ int i, j; -+ for (i = 0; i < 3; ++i) { -+ if (state->fpr[i] != state->hstdout) { -+ for (j = 0; j < i; j++) -+ if (state->fpr[i] == state->fpr[j]) -+ break; -+ if (j == i && bgzf_close(state->fpr[i])) { -+ print_error_errno("bam2fq", "Error closing r%d file \"%s\"", i, opts->fnr[i]); -+ valid = false; -+ } -+ } -+ } -+ if (state->hstdout) { -+ if 
(bgzf_close(state->hstdout)) { -+ print_error_errno("bam2fq", "Error closing STDOUT"); -+ valid = false; -+ } -+ } -+ for (i = 0; i < 2; i++) { -+ for (j = 0; j < 3; j++) -+ if (state->fpi[i] == state->fpr[j]) -+ break; -+ for (j -= 3; j >= 0 && j < i; j++) -+ if (state->fpi[i] == state->fpi[j]) -+ break; -+ if (j == i && state->fpi[i] && bgzf_close(state->fpi[i])) { -+ print_error_errno("bam2fq", "Error closing i%d file \"%s\"", i+1, opts->index_file[i]); -+ valid = false; -+ } -+ } -+ kl_destroy(ktaglist,state->taglist); -+ free(state->index_sequence); -+ if (state->p.pool) -+ hts_tpool_destroy(state->p.pool); -+ free(state); -+ return valid; -+} -+ -+static inline bool filter_it_out(const bam1_t *b, const bam2fq_state_t *state) -+{ -+ return ((b->core.flag&(state->flag_on)) != state->flag_on // or reads indicated by filter flags -+ || (b->core.flag&(state->flag_off)) != 0 -+ || (b->core.flag&(state->flag_alloff) && (b->core.flag&(state->flag_alloff)) == state->flag_alloff)); -+ -+} -+ -+static bool bam2fq_mainloop(bam2fq_state_t *state, bam2fq_opts_t* opts) -+{ -+ int n; -+ bam1_t *records[3] = {NULL, NULL, NULL}; -+ char *current_qname = NULL; -+ int64_t n_reads = 0, n_singletons = 0; // Statistics -+ kstring_t linebuf[3] = {{0,0,NULL},{0,0,NULL},{0,0,NULL}}; -+ int score[3]; -+ int at_eof; -+ bool valid = true; -+ bam1_t* b = NULL; -+ -+ while (true) { -+ if (!b) -+ b = bam_init1(); -+ if (b == NULL) { -+ perror("[bam2fq_mainloop] Malloc error for bam record buffer."); -+ valid = false; -+ break; -+ } -+ int res = sam_read1(state->fp, state->h, b); -+ if (res < -1) { -+ fprintf(stderr, "[bam2fq_mainloop] Failed to read bam record.\n"); -+ valid = false; -+ break; -+ } -+ at_eof = res < 0; -+ -+ if (!at_eof && filter_it_out(b, state)) -+ continue; -+ if (!at_eof) ++n_reads; -+ -+ if (at_eof || !current_qname || (strcmp(current_qname, bam_get_qname(b)) != 0)) { -+ if (current_qname) { -+ if (state->illumina_tag) { -+ for (n=0; valid && n<3; n++) { -+ if 
(!records[n]) continue; -+ if (insert_index_sequence_into_linebuf(state->index_sequence, &linebuf[n], records[n]) < 0) valid = false; -+ } -+ if (!valid) break; -+ } -+ free(state->index_sequence); state->index_sequence = NULL; -+ if (score[1] > 0 && score[2] > 0) { -+ // print linebuf[1] to fpr[1], linebuf[2] to fpr[2] -+ if (bgzf_write(state->fpr[1], linebuf[1].s, linebuf[1].l) < 0) { valid = false; break; } -+ if (bgzf_write(state->fpr[2], linebuf[2].s, linebuf[2].l) < 0) { valid = false; break; } -+ } else if (score[1] > 0 || score[2] > 0) { -+ if (state->fpse) { -+ // print whichever one exists to fpse -+ if (score[1] > 0) { -+ if (bgzf_write(state->fpse, linebuf[1].s, linebuf[1].l) < 0) { valid = false; break; } -+ } else { -+ if (bgzf_write(state->fpse, linebuf[2].s, linebuf[2].l) < 0) { valid = false; break; } -+ } -+ ++n_singletons; -+ } else { -+ if (score[1] > 0) { -+ if (bgzf_write(state->fpr[1], linebuf[1].s, linebuf[1].l) < 0) { valid = false; break; } -+ } else { -+ if (bgzf_write(state->fpr[2], linebuf[2].s, linebuf[2].l) < 0) { valid = false; break; } -+ } -+ } -+ } -+ if (score[0]) { // TODO: check this -+ // print linebuf[0] to fpr[0] -+ if (bgzf_write(state->fpr[0], linebuf[0].s, linebuf[0].l) < 0) { valid = false; break; } -+ } -+ } -+ -+ -+ free(current_qname); current_qname = NULL; -+ score[0] = score[1] = score[2] = 0; -+ for (n=0; n < 3; n++) { -+ bam_destroy1(records[n]); records[n]=NULL; -+ } -+ -+ if (at_eof) { break; } -+ -+ current_qname = strdup(bam_get_qname(b)); -+ if (!current_qname) { valid = false; break; } -+ } -+ -+ // Prefer a copy of the read that has base qualities -+ int b_score = bam_get_qual(b)[0] != 0xff? 
2 : 1; -+ readpart rp = which_readpart(b); -+ if (b_score > score[rp]) { -+ if (!tags2fq(b, state, opts)) { valid = false; break; } -+ if (records[rp]) bam_destroy1(records[rp]); -+ records[rp] = b; -+ score[rp] = b_score; -+ b = NULL; -+ if(!bam1_to_fq(records[rp], &linebuf[rp], state)) { -+ fprintf(stderr, "[%s] Error converting read to FASTA/Q\n", __func__); -+ valid = false; break; -+ } -+ } -+ } -+ if (!valid) -+ { -+ perror("[bam2fq_mainloop] Error writing to FASTx files."); -+ } -+ bam_destroy1(b); -+ for (n=0; n < 3; n++) { -+ bam_destroy1(records[n]); -+ } -+ free(current_qname); -+ free(linebuf[0].s); -+ free(linebuf[1].s); -+ free(linebuf[2].s); -+ fprintf(stderr, "[M::%s] discarded %" PRId64 " singletons\n", __func__, n_singletons); -+ fprintf(stderr, "[M::%s] processed %" PRId64 " reads\n", __func__, n_reads); -+ -+ return valid; -+} -+ -+int main_bam2fq(int argc, char *argv[]) -+{ -+ int status = EXIT_SUCCESS; -+ bam2fq_opts_t* opts = NULL; -+ bam2fq_state_t* state = NULL; -+ -+ bool valid = parse_opts(argc, argv, &opts); -+ if (!valid || opts == NULL) return valid ? EXIT_SUCCESS : EXIT_FAILURE; -+ -+ if (!init_state(opts, &state)) return EXIT_FAILURE; -+ -+ if (!bam2fq_mainloop(state,opts)) status = EXIT_FAILURE; -+ -+ if (!destroy_state(opts, state, &status)) return EXIT_FAILURE; -+ sam_global_args_free(&opts->ga); -+ free_opts(opts); -+ -+ return status; -+} ---- /dev/null -+++ python-pysam/samtools/bam_fastq.c.pysam.c -@@ -0,0 +1,1039 @@ -+#include "samtools.pysam.h" -+ -+/* bam_fastq.c -- FASTA and FASTQ file generation -+ -+ Copyright (C) 2009-2017, 2019 Genome Research Ltd. -+ Portions copyright (C) 2009, 2011, 2012 Broad Institute. 
-+ -+ Author: Heng Li -+ -+Permission is hereby granted, free of charge, to any person obtaining a copy -+of this software and associated documentation files (the "Software"), to deal -+in the Software without restriction, including without limitation the rights -+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -+copies of the Software, and to permit persons to whom the Software is -+furnished to do so, subject to the following conditions: -+ -+The above copyright notices and this permission notice shall be included in -+all copies or substantial portions of the Software. -+ -+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -+DEALINGS IN THE SOFTWARE. */ -+ -+#include -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include "htslib/sam.h" -+#include "htslib/klist.h" -+#include "htslib/kstring.h" -+#include "htslib/bgzf.h" -+#include "htslib/thread_pool.h" -+#include "samtools.h" -+#include "sam_opts.h" -+ -+#define taglist_free(p) -+KLIST_INIT(ktaglist, char*, taglist_free) -+ -+#define DEFAULT_BARCODE_TAG "BC" -+#define DEFAULT_QUALITY_TAG "QT" -+#define INDEX_SEPARATOR "+" -+ -+int8_t seq_comp_table[16] = { 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15 }; -+static const char *copied_tags[] = { "RG", "BC", "QT", NULL }; -+ -+static void bam2fq_usage(FILE *to, const char *command) -+{ -+ int fq = strcasecmp("fastq", command) == 0 || strcasecmp("bam2fq", command) == 0; -+ fprintf(to, -+"Usage: samtools %s [options...] 
\n", command); -+ fprintf(to, -+"\n" -+"Description:\n" -+"Converts a SAM, BAM or CRAM into either FASTQ or FASTA format depending on the command invoked.\n" -+"\n" -+"Options:\n" -+" -0 FILE write reads designated READ_OTHER to FILE\n" -+" -1 FILE write reads designated READ1 to FILE\n" -+" -2 FILE write reads designated READ2 to FILE\n" -+" -o FILE write reads designated READ1 or READ2 to FILE\n" -+" note: if a singleton file is specified with -s, only\n" -+" paired reads will be written to the -1 and -2 files.\n" -+" -f INT only include reads with all of the FLAGs in INT present [0]\n" // F&x == x -+" -F INT only include reads with none of the FLAGS in INT present [0x900]\n" // F&x == 0 -+" -G INT only EXCLUDE reads with all of the FLAGs in INT present [0]\n" // !(F&x == x) -+" -n don't append /1 and /2 to the read name\n" -+" -N always append /1 and /2 to the read name\n"); -+ if (fq) fprintf(to, -+" -O output quality in the OQ tag if present\n"); -+ fprintf(to, -+" -s FILE write singleton reads designated READ1 or READ2 to FILE\n" -+" -t copy RG, BC and QT tags to the %s header line\n", -+ fq ? "FASTQ" : "FASTA"); -+ fprintf(to, -+" -T TAGLIST copy arbitrary tags to the %s header line\n", -+ fq ? 
"FASTQ" : "FASTA"); -+ if (fq) fprintf(to, -+" -v INT default quality score if not given in file [1]\n" -+" -i add Illumina Casava 1.8 format entry to header (eg 1:N:0:ATCACG)\n" -+" -c compression level [0..9] to use when creating gz or bgzf fastq files [1]\n" -+" --i1 FILE write first index reads to FILE\n" -+" --i2 FILE write second index reads to FILE\n" -+" --barcode-tag TAG Barcode tag [default: " DEFAULT_BARCODE_TAG "]\n" -+" --quality-tag TAG Quality tag [default: " DEFAULT_QUALITY_TAG "]\n" -+" --index-format STR How to parse barcode and quality tags\n\n"); -+ sam_global_opt_help(to, "-.--.@-."); -+ fprintf(to, -+"\n" -+"The files will be automatically compressed if the file names have a .gz or .bgzf extension.\n" -+"The input to this program must be collated by name. Run 'samtools collate' or 'samtools sort -n'.\n" -+"\n" -+"Reads are designated READ1 if FLAG READ1 is set and READ2 is not set.\n" -+"Reads are designated READ2 if FLAG READ1 is not set and READ2 is set.\n" -+"Reads are designated READ_OTHER if FLAGs READ1 and READ2 are either both set\n" -+"or both unset.\n" -+"Run 'samtools flags' for more information on flag codes and meanings.\n"); -+ fprintf(to, -+"\n" -+"The index-format string describes how to parse the barcode and quality tags, for example:\n" -+" i14i8 the first 14 characters are index 1, the next 8 characters are index 2\n" -+" n8i14 ignore the first 8 characters, and use the next 14 characters for index 1\n" -+"If the tag contains a separator, then the numeric part can be replaced with '*' to mean\n" -+"'read until the separator or end of tag', for example:\n" -+" n*i* ignore the left part of the tag until the separator, then use the second part\n" -+" of the tag as index 1\n"); -+ fprintf(to, -+"\n" -+"Examples:\n" -+" To get just the paired reads in separate files, use:\n" -+" samtools %s -1 paired1.%s -2 paired2.%s -0 /dev/null -s /dev/null -n in.bam\n" -+"\n To get all non-supplementary/secondary reads in a single file, 
redirect the output:\n" -+" samtools %s in.bam > all_reads.%s\n", -+ command, fq ? "fq" : "fa", fq ? "fq" : "fa", -+ command, fq ? "fq" : "fa"); -+} -+ -+typedef enum { READ_UNKNOWN = 0, READ_1 = 1, READ_2 = 2 } readpart; -+typedef enum { FASTA, FASTQ } fastfile; -+typedef struct bam2fq_opts { -+ char *fnse; -+ char *fnr[3]; -+ char *fn_input; // pointer to input filename in argv do not free -+ bool has12, has12always, use_oq, copy_tags, illumina_tag; -+ int flag_on, flag_off, flag_alloff; -+ sam_global_args ga; -+ fastfile filetype; -+ int def_qual; -+ char *barcode_tag; -+ char *quality_tag; -+ char *index_file[2]; -+ char *index_format; -+ char *extra_tags; -+ char compression_level; -+} bam2fq_opts_t; -+ -+typedef struct bam2fq_state { -+ samFile *fp; -+ BGZF *fpse; -+ BGZF *fpr[3]; -+ BGZF *fpi[2]; -+ BGZF *hsamtools_stdout; -+ sam_hdr_t *h; -+ bool has12, use_oq, copy_tags, illumina_tag; -+ int flag_on, flag_off, flag_alloff; -+ fastfile filetype; -+ int def_qual; -+ klist_t(ktaglist) *taglist; -+ char *index_sequence; -+ char compression_level; -+ htsThreadPool p; -+} bam2fq_state_t; -+ -+/* -+ * Get and decode the read from a BAM record. -+ * -+ * TODO: htslib really needs an interface for this. Consider this or perhaps -+ * bam_get_seq_str (current vs original orientation) and bam_get_qual_str -+ * functions as string formatted equivalents to bam_get_{seq,qual}? -+ */ -+ -+/* -+ * Reverse a string in place. -+ * From http://stackoverflow.com/questions/8534274/is-the-strrev-function-not-available-in-linux. 
-+ * Author Sumit-naik: http://stackoverflow.com/users/4590926/sumit-naik -+ */ -+static char *reverse(char *str) -+{ -+ int i = strlen(str)-1,j=0; -+ char ch; -+ while (i>j) { -+ ch = str[i]; -+ str[i]= str[j]; -+ str[j] = ch; -+ i--; -+ j++; -+ } -+ return str; -+} -+ -+/* return the read, reverse complemented if necessary */ -+static char *get_read(const bam1_t *rec) -+{ -+ int len = rec->core.l_qseq + 1; -+ char *read = calloc(1, len); -+ char *seq = (char *)bam_get_seq(rec); -+ int n; -+ -+ if (!read) return NULL; -+ -+ for (n=0; n < rec->core.l_qseq; n++) { -+ if (rec->core.flag & BAM_FREVERSE) read[n] = seq_nt16_str[seq_comp_table[bam_seqi(seq,n)]]; -+ else read[n] = seq_nt16_str[bam_seqi(seq,n)]; -+ } -+ if (rec->core.flag & BAM_FREVERSE) reverse(read); -+ return read; -+} -+ -+/* -+ * get and decode the quality from a BAM record -+ */ -+static int get_quality(const bam1_t *rec, char **qual_out) -+{ -+ char *quality = calloc(1, rec->core.l_qseq + 1); -+ char *q = (char *)bam_get_qual(rec); -+ int n; -+ -+ if (!quality) return -1; -+ -+ if (*q == '\xff') { -+ free(quality); -+ *qual_out = NULL; -+ return 0; -+ } -+ -+ for (n=0; n < rec->core.l_qseq; n++) { -+ quality[n] = q[n]+33; -+ } -+ if (rec->core.flag & BAM_FREVERSE) reverse(quality); -+ *qual_out = quality; -+ return 0; -+} -+ -+// -+// End of htslib complaints -+// -+ -+ -+static readpart which_readpart(const bam1_t *b) -+{ -+ if ((b->core.flag & BAM_FREAD1) && !(b->core.flag & BAM_FREAD2)) { -+ return READ_1; -+ } else if ((b->core.flag & BAM_FREAD2) && !(b->core.flag & BAM_FREAD1)) { -+ return READ_2; -+ } else { -+ return READ_UNKNOWN; -+ } -+} -+ -+/* -+ * parse the length part from the index-format string -+ */ -+static int getLength(char **s) -+{ -+ int n = 0; -+ while (**s) { -+ if (**s == '*') { n=-1; (*s)++; break; } -+ if ( !isdigit(**s)) break; -+ n = n*10 + ((**s)-'0'); -+ (*s)++; -+ } -+ return n; -+} -+ -+static bool copy_tag(const char *tag, const bam1_t *rec, kstring_t *linebuf) -+{ 
-+ uint8_t *s = bam_aux_get(rec, tag); -+ if (s) { -+ char aux_type = *s; -+ switch (aux_type) { -+ case 'C': -+ case 'S': aux_type = 'I'; break; -+ case 'c': -+ case 's': aux_type = 'i'; break; -+ case 'd': aux_type = 'f'; break; -+ } -+ -+ // Ensure space. Need 6 chars + length of tag. Max length of -+ // i is 16, A is 21, B currently 26, Z is unknown, so -+ // have to check that one later. -+ if (ks_resize(linebuf, ks_len(linebuf) + 64) < 0) return false; -+ -+ kputc('\t', linebuf); -+ kputsn(tag, 2, linebuf); -+ kputc(':', linebuf); -+ kputc(aux_type=='I'? 'i': aux_type, linebuf); -+ kputc(':', linebuf); -+ switch (aux_type) { -+ case 'H': -+ case 'Z': -+ if (kputs(bam_aux2Z(s), linebuf) < 0) return false; -+ break; -+ case 'i': kputw(bam_aux2i(s), linebuf); break; -+ case 'I': kputuw(bam_aux2i(s), linebuf); break; -+ case 'A': kputc(bam_aux2A(s), linebuf); break; -+ case 'f': kputd(bam_aux2f(s), linebuf); break; -+ case 'B': kputs("*** Unhandled aux type ***", linebuf); return false; -+ default: kputs("*** Unknown aux type ***", linebuf); return false; -+ } -+ } -+ return true; -+} -+ -+static int insert_index_sequence_into_linebuf(char *index_sequence, kstring_t *linebuf, bam1_t *rec) -+{ -+ if (!index_sequence) return 0; -+ -+ kstring_t new = {0,0,NULL}; -+ if (linebuf->s) { -+ char *s = strchr(linebuf->s, '\n'); -+ if (s) { -+ if (ks_resize(&new, linebuf->l + strlen(index_sequence) + 16) < 0) -+ return -1; -+ *s = 0; -+ kputs(linebuf->s, &new); -+ kputc(' ', &new); -+ readpart readpart = which_readpart(rec); -+ if (readpart == READ_1) kputc('1', &new); -+ else if (readpart == READ_2) kputc('2', &new); -+ else kputc('0', &new); -+ -+ kputc(':', &new); -+ if (rec->core.flag & BAM_FQCFAIL) kputc('Y', &new); -+ else kputc('N', &new); -+ -+ kputs(":0:", &new); -+ kputs(index_sequence, &new); -+ kputc('\n', &new); -+ kputs(s+1, &new); -+ free(ks_release(linebuf)); -+ linebuf->s = new.s; linebuf->l = new.l; linebuf->m = new.m; -+ } -+ } -+ return 0; -+} -+ 
-+static bool make_fq_line(const bam1_t *rec, char *seq, char *qual, kstring_t *linebuf, const bam2fq_state_t *state) -+{ -+ int i; -+ -+ linebuf->l = 0; -+ // Write read name -+ if (kputc(state->filetype == FASTA? '>' : '@', linebuf) < 0) return false; -+ if (kputs(bam_get_qname(rec), linebuf) < 0) return false; -+ // Add the /1 /2 if requested -+ if (state->has12) { -+ readpart readpart = which_readpart(rec); -+ if (readpart == READ_1) { -+ if (kputs("/1", linebuf) < 0) return false; -+ } else if (readpart == READ_2) { -+ if (kputs("/2", linebuf) < 0) return false; -+ } -+ } -+ if (state->copy_tags) { -+ for (i = 0; copied_tags[i]; ++i) { -+ if (!copy_tag(copied_tags[i], rec, linebuf)) { -+ fprintf(samtools_stderr, "Problem copying aux tags: [%s]\n", linebuf->s); -+ return false; -+ } -+ } -+ } -+ -+ if (state->taglist->size) { -+ kliter_t(ktaglist) *p; -+ for (p = kl_begin(state->taglist); p != kl_end(state->taglist); p = kl_next(p)) { -+ if (!copy_tag(kl_val(p), rec, linebuf)) { -+ fprintf(samtools_stderr, "Problem copying aux tags: [%s]\n", linebuf->s); -+ return false; -+ } -+ } -+ } -+ -+ if (kputc('\n', linebuf) < 0) return false; -+ if (kputs(seq, linebuf) < 0) return false; -+ if (kputc('\n', linebuf) < 0) return false; -+ -+ if (state->filetype == FASTQ) { -+ // Write quality -+ if (kputs("+\n", linebuf) < 0) return false; -+ if (qual && *qual) { -+ if (kputs(qual, linebuf) < 0) return false; -+ } else { -+ int len = strlen(seq); -+ if (ks_resize(linebuf, ks_len(linebuf) + len + 1) < 0) return false; -+ for (i = 0; i < len; ++i) { -+ kputc(33 + state->def_qual, linebuf); -+ } -+ } -+ if (kputc('\n', linebuf) < 0) return false; -+ } -+ return true; -+} -+ -+/* -+ * Create FASTQ lines from the barcode tag using the index-format -+ */ -+static bool tags2fq(bam1_t *rec, bam2fq_state_t *state, const bam2fq_opts_t* opts) -+{ -+ uint8_t *p; -+ char *ifmt = opts->index_format; -+ char *tag = NULL; -+ char *qual = NULL; -+ char *sub_tag = NULL; -+ char *sub_qual 
= NULL; -+ size_t tag_len; -+ int file_number = 0; -+ kstring_t linebuf = { 0, 0, NULL }; // Buffer -+ -+ if (!ifmt) return true; -+ -+ // read barcode tag -+ p = bam_aux_get(rec,opts->barcode_tag); -+ if (p) tag = bam_aux2Z(p); -+ -+ if (!tag) return true; // there is no tag -+ -+ tag_len = strlen(tag); -+ sub_tag = calloc(1, tag_len + 1); -+ if (!sub_tag) goto fail; -+ sub_qual = calloc(1, tag_len + 1); -+ if (!sub_qual) goto fail; -+ -+ // read quality tag -+ p = bam_aux_get(rec, opts->quality_tag); -+ if (p) qual = bam_aux2Z(p); -+ -+ // Parse the index-format string -+ while (*ifmt) { -+ if (file_number > 1) break; // shouldn't happen if we've validated paramaters correctly -+ char action = *ifmt; // should be 'i' or 'n' -+ ifmt++; // skip over action -+ int index_len = getLength(&ifmt); -+ int n = 0; -+ -+ if (index_len < 0) { -+ // read until separator -+ while (isalpha(*tag)) { -+ sub_tag[n] = *tag++; -+ if (qual) sub_qual[n] = *qual++; -+ n++; -+ } -+ if (*tag) { // skip separator -+ tag++; -+ if (qual) qual++; -+ } -+ } else { -+ // read index_len characters -+ while (index_len-- && *tag) { -+ sub_tag[n] = *tag++; -+ if (qual) sub_qual[n] = *qual++; -+ n++; -+ } -+ } -+ sub_tag[n] = '\0'; -+ sub_qual[n] = '\0'; -+ -+ if (action=='i' && *sub_tag) { -+ if (state->index_sequence) { -+ char *new_index_sequence = realloc(state->index_sequence, strlen(state->index_sequence) + strlen(sub_tag) + 2); -+ if (!new_index_sequence) goto fail; -+ state->index_sequence = new_index_sequence; -+ strcat(state->index_sequence, INDEX_SEPARATOR); -+ strcat(state->index_sequence, sub_tag); -+ } else { -+ state->index_sequence = strdup(sub_tag); // we're going to need this later... 
-+ } -+ if (!state->index_sequence) goto fail; -+ if (!make_fq_line(rec, sub_tag, sub_qual, &linebuf, state)) goto fail; -+ if (state->illumina_tag) { -+ if (insert_index_sequence_into_linebuf(sub_tag, &linebuf, rec) < 0) { -+ goto fail; -+ } -+ } -+ if (state->fpi[file_number]) { -+ if (bgzf_write(state->fpi[file_number++], linebuf.s, linebuf.l) < 0) -+ goto fail; -+ } -+ } -+ -+ } -+ -+ free(sub_qual); free(sub_tag); -+ free(linebuf.s); -+ return true; -+ -+ fail: -+ perror(__func__); -+ free(sub_qual); free(sub_tag); -+ free(linebuf.s); -+ return false; -+} -+ -+// Transform a bam1_t record into a string with the FASTQ representation of it -+// @returns false for error, true for success -+static bool bam1_to_fq(const bam1_t *b, kstring_t *linebuf, const bam2fq_state_t *state) -+{ -+ int32_t qlen = b->core.l_qseq; -+ assert(qlen >= 0); -+ const uint8_t *oq = NULL; -+ char *qual = NULL; -+ -+ char *seq = get_read(b); -+ if (!seq) return false; -+ -+ if (state->use_oq) oq = bam_aux_get(b, "OQ"); -+ if (oq && *oq=='Z') { -+ qual = strdup(bam_aux2Z(oq)); -+ if (!qual) goto fail; -+ if (b->core.flag & BAM_FREVERSE) { // read is reverse complemented -+ reverse(qual); -+ } -+ } else { -+ if (get_quality(b, &qual) < 0) goto fail; -+ } -+ -+ if (!make_fq_line(b, seq, qual, linebuf, state)) goto fail; -+ -+ free(qual); -+ free(seq); -+ return true; -+ -+ fail: -+ free(seq); -+ free(qual); -+ return false; -+} -+ -+static void free_opts(bam2fq_opts_t *opts) -+{ -+ free(opts->barcode_tag); -+ free(opts->quality_tag); -+ free(opts->index_format); -+ free(opts->extra_tags); -+ free(opts); -+} -+ -+// return true if valid -+static bool parse_opts(int argc, char *argv[], bam2fq_opts_t** opts_out) -+{ -+ // Parse args -+ bam2fq_opts_t* opts = calloc(1, sizeof(bam2fq_opts_t)); -+ opts->has12 = true; -+ opts->has12always = false; -+ opts->filetype = FASTQ; -+ opts->def_qual = 1; -+ opts->barcode_tag = NULL; -+ opts->quality_tag = NULL; -+ opts->index_format = NULL; -+ 
opts->index_file[0] = NULL; -+ opts->index_file[1] = NULL; -+ opts->extra_tags = NULL; -+ opts->compression_level = 1; -+ opts->flag_off = BAM_FSECONDARY|BAM_FSUPPLEMENTARY; -+ int flag_off_set = 0; -+ -+ int c; -+ sam_global_args_init(&opts->ga); -+ static const struct option lopts[] = { -+ SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0, '@'), -+ {"i1", required_argument, NULL, 1}, -+ {"I1", required_argument, NULL, 1}, -+ {"i2", required_argument, NULL, 2}, -+ {"I2", required_argument, NULL, 2}, -+ {"if", required_argument, NULL, 3}, -+ {"IF", required_argument, NULL, 3}, -+ {"index-format", required_argument, NULL, 3}, -+ {"barcode-tag", required_argument, NULL, 'b'}, -+ {"quality-tag", required_argument, NULL, 'q'}, -+ { NULL, 0, NULL, 0 } -+ }; -+ while ((c = getopt_long(argc, argv, "0:1:2:o:f:F:G:niNOs:c:tT:v:@:", lopts, NULL)) > 0) { -+ switch (c) { -+ case 'b': opts->barcode_tag = strdup(optarg); break; -+ case 'q': opts->quality_tag = strdup(optarg); break; -+ case 1 : opts->index_file[0] = optarg; break; -+ case 2 : opts->index_file[1] = optarg; break; -+ case 3 : opts->index_format = strdup(optarg); break; -+ case '0': opts->fnr[0] = optarg; break; -+ case '1': opts->fnr[1] = optarg; break; -+ case '2': opts->fnr[2] = optarg; break; -+ case 'o': opts->fnr[1] = optarg; opts->fnr[2] = optarg; break; -+ case 'f': opts->flag_on |= strtol(optarg, 0, 0); break; -+ case 'F': -+ if (!flag_off_set) { -+ flag_off_set = 1; -+ opts->flag_off = 0; -+ } -+ opts->flag_off |= strtol(optarg, 0, 0); break; -+ case 'G': opts->flag_alloff |= strtol(optarg, 0, 0); break; -+ case 'n': opts->has12 = false; break; -+ case 'N': opts->has12always = true; break; -+ case 'O': opts->use_oq = true; break; -+ case 's': opts->fnse = optarg; break; -+ case 't': opts->copy_tags = true; break; -+ case 'i': opts->illumina_tag = true; break; -+ case 'c': opts->compression_level = atoi(optarg); break; -+ case 'T': opts->extra_tags = strdup(optarg); break; -+ case 'v': opts->def_qual = 
atoi(optarg); break; -+ case '?': bam2fq_usage(samtools_stderr, argv[0]); free_opts(opts); return false; -+ default: -+ if (parse_sam_global_opt(c, optarg, lopts, &opts->ga) != 0) { -+ bam2fq_usage(samtools_stderr, argv[0]); free_opts(opts); return false; -+ } -+ break; -+ } -+ } -+ -+ if (opts->fnr[1] || opts->fnr[2]) opts->has12 = false; -+ if (opts->has12always) opts->has12 = true; -+ -+ if (!opts->barcode_tag) opts->barcode_tag = strdup(DEFAULT_BARCODE_TAG); -+ if (!opts->quality_tag) opts->quality_tag = strdup(DEFAULT_QUALITY_TAG); -+ -+ int nIndex = 0; -+ if (opts->index_format) { -+ char *s; -+ for (s = opts->index_format; *s; s++) { -+ if (*s == 'i') nIndex++; -+ } -+ } -+ if (nIndex>2) { -+ fprintf(samtools_stderr,"Invalid index format: more than 2 indexes\n"); -+ bam2fq_usage(samtools_stderr, argv[0]); -+ free_opts(opts); -+ return false; -+ } -+ -+ if (opts->index_file[1] && !opts->index_file[0]) { -+ fprintf(samtools_stderr, "Index one specified, but index two not given\n"); -+ bam2fq_usage(samtools_stderr, argv[0]); -+ free_opts(opts); -+ return false; -+ } -+ -+ if (opts->illumina_tag && !nIndex) { -+ fprintf(samtools_stderr, "You must specify an index format (--index-format) with the Illumina Casava (-i) option\n"); -+ bam2fq_usage(samtools_stderr, argv[0]); -+ free_opts(opts); -+ return false; -+ } -+ -+ if (nIndex==0 && opts->index_file[0]) { -+ fprintf(samtools_stderr, "index_format not specified, but index file given\n"); -+ bam2fq_usage(samtools_stderr, argv[0]); -+ free_opts(opts); -+ return false; -+ } -+ -+ if (opts->def_qual < 0 || 93 < opts->def_qual) { -+ fprintf(samtools_stderr, "Invalid -v default quality %i, allowed range 0 to 93\n", opts->def_qual); -+ bam2fq_usage(samtools_stderr, argv[0]); -+ free_opts(opts); -+ return false; -+ } -+ -+ const char* type_str = argv[0]; -+ if (strcasecmp("fastq", type_str) == 0 || strcasecmp("bam2fq", type_str) == 0) { -+ opts->filetype = FASTQ; -+ } else if (strcasecmp("fasta", type_str) == 0) { -+ 
opts->filetype = FASTA; -+ } else { -+ print_error("bam2fq", "Unrecognised type call \"%s\", this should be impossible... but you managed it!", type_str); -+ bam2fq_usage(samtools_stderr, argv[0]); -+ free_opts(opts); -+ return false; -+ } -+ -+ if (argc == optind && isatty(STDIN_FILENO)) { -+ bam2fq_usage(samtools_stdout, argv[0]); -+ free_opts(opts); -+ return true; -+ } -+ -+ if (argc - optind > 1) { -+ fprintf(samtools_stderr, "Too many arguments.\n"); -+ bam2fq_usage(samtools_stderr, argv[0]); -+ free_opts(opts); -+ return false; -+ } -+ opts->fn_input = argc > optind ? argv[optind] : "-"; -+ *opts_out = opts; -+ return true; -+} -+ -+static BGZF *open_fqfile(char *filename, int c, htsThreadPool *tp) -+{ -+ char mode[4] = "w"; -+ size_t len = strlen(filename); -+ -+ mode[2] = 0; mode[3] = 0; -+ if (len > 3 && strstr(filename + (len - 3),".gz")) { -+ mode[1] = 'g'; mode[2] = c+'0'; -+ } else if ((len > 4 && strstr(filename + (len - 4),".bgz")) -+ || (len > 5 && strstr(filename + (len - 5),".bgzf"))) { -+ mode[1] = c+'0'; -+ } else { -+ mode[1] = 'u'; -+ } -+ -+ BGZF *fp = bgzf_open(filename,mode); -+ if (!fp) -+ return fp; -+ if (tp->pool && bgzf_thread_pool(fp, tp->pool, tp->qsize) < 0) { -+ bgzf_close(fp); -+ return NULL; -+ } -+ return fp; -+} -+ -+static bool init_state(const bam2fq_opts_t* opts, bam2fq_state_t** state_out) -+{ -+ bam2fq_state_t* state = calloc(1, sizeof(bam2fq_state_t)); -+ state->flag_on = opts->flag_on; -+ state->flag_off = opts->flag_off; -+ state->flag_alloff = opts->flag_alloff; -+ state->has12 = opts->has12; -+ state->use_oq = opts->use_oq; -+ state->illumina_tag = opts->illumina_tag; -+ state->copy_tags = opts->copy_tags; -+ state->filetype = opts->filetype; -+ state->def_qual = opts->def_qual; -+ state->index_sequence = NULL; -+ state->hsamtools_stdout = NULL; -+ state->compression_level = opts->compression_level; -+ -+ state->taglist = kl_init(ktaglist); -+ if (opts->extra_tags) { -+ char *save_p; -+ char *s = 
strtok_r(opts->extra_tags, ",", &save_p); -+ while (s) { -+ if (strlen(s) != 2) { -+ fprintf(samtools_stderr, "Parsing extra tags - '%s' is not two characters\n", s); -+ free(state); -+ return false; -+ } -+ char **et = kl_pushp(ktaglist, state->taglist); -+ *et = s; -+ s = strtok_r(NULL, ",", &save_p); -+ } -+ } -+ -+ state->fp = sam_open(opts->fn_input, "r"); -+ if (state->fp == NULL) { -+ print_error_errno("bam2fq","Cannot read file \"%s\"", opts->fn_input); -+ free(state); -+ return false; -+ } -+ -+ state->p.pool = NULL; -+ if (opts->ga.nthreads > 0) { -+ if (!(state->p.pool = hts_tpool_init(opts->ga.nthreads))) { -+ fprintf(samtools_stderr, "Failed to create thread pool\n"); -+ free(state); -+ return false; -+ } -+ state->p.qsize = opts->ga.nthreads*2; -+ hts_set_thread_pool(state->fp, &state->p); -+ } -+ -+ uint32_t rf = SAM_QNAME | SAM_FLAG | SAM_SEQ | SAM_QUAL; -+ if (opts->use_oq || opts->extra_tags || opts->index_file[0]) rf |= SAM_AUX; -+ if (hts_set_opt(state->fp, CRAM_OPT_REQUIRED_FIELDS, rf)) { -+ fprintf(samtools_stderr, "Failed to set CRAM_OPT_REQUIRED_FIELDS value\n"); -+ free(state); -+ return false; -+ } -+ if (hts_set_opt(state->fp, CRAM_OPT_DECODE_MD, 0)) { -+ fprintf(samtools_stderr, "Failed to set CRAM_OPT_DECODE_MD value\n"); -+ free(state); -+ return false; -+ } -+ if (opts->fnse) { -+ state->fpse = open_fqfile(opts->fnse, state->compression_level, &state->p); -+ if (state->fpse == NULL) { -+ print_error_errno("bam2fq", "Cannot write to singleton file \"%s\"", opts->fnse); -+ free(state); -+ return false; -+ } -+ } -+ -+ if (opts->ga.reference) { -+ if (hts_set_fai_filename(state->fp, opts->ga.reference) != 0) { -+ print_error_errno("bam2fq", "cannot load reference \"%s\"", opts->ga.reference); -+ free(state); -+ return false; -+ } -+ } -+ -+ int i, j; -+ for (i = 0; i < 3; ++i) { -+ if (opts->fnr[i]) { -+ for (j = 0; j < i; j++) -+ if (opts->fnr[j] && strcmp(opts->fnr[j], opts->fnr[i]) == 0) -+ break; -+ if (j == i) { -+ state->fpr[i] = 
open_fqfile(opts->fnr[i], state->compression_level, &state->p); -+ if (state->fpr[i] == NULL) { -+ print_error_errno("bam2fq", "Cannot write to r%d file \"%s\"", -+ i, opts->fnr[i]); -+ free(state); -+ return false; -+ } -+ } else { -+ state->fpr[i] = state->fpr[j]; -+ } -+ } else { -+ if (!state->hsamtools_stdout) { -+ state->hsamtools_stdout = bgzf_dopen(fileno(samtools_stdout), "wu"); -+ if (!state->hsamtools_stdout) { -+ print_error_errno("bam2fq", "Cannot open STDOUT"); -+ free(state); -+ return false; -+ } -+ } -+ state->fpr[i] = state->hsamtools_stdout; -+ } -+ } -+ for (i = 0; i < 2; i++) { -+ state->fpi[i] = NULL; -+ if (opts->index_file[i]) { -+ for (j = 0; j < 3; j++) -+ if (opts->fnr[j] && strcmp(opts->fnr[j], opts->index_file[i]) == 0) -+ break; -+ for (j -= 3; j >= 0 && j < i; j++) -+ if (opts->index_file[j] && strcmp(opts->index_file[j], opts->index_file[i]) == 0) -+ break; -+ if (i == j) { -+ state->fpi[i] = open_fqfile(opts->index_file[i], state->compression_level, &state->p); -+ if (state->fpi[i] == NULL) { -+ print_error_errno("bam2fq", "Cannot write to i%d file \"%s\"", -+ i+1, opts->index_file[i]); -+ free(state); -+ return false; -+ } -+ } else if (j < 0) { -+ state->fpi[i] = state->fpr[j+3]; -+ } else { -+ state->fpi[i] = state->fpi[j]; -+ } -+ } -+ } -+ -+ state->h = sam_hdr_read(state->fp); -+ if (state->h == NULL) { -+ fprintf(samtools_stderr, "Failed to read header for \"%s\"\n", opts->fn_input); -+ free(state); -+ return false; -+ } -+ -+ *state_out = state; -+ return true; -+} -+ -+static bool destroy_state(const bam2fq_opts_t *opts, bam2fq_state_t *state, int* status) -+{ -+ bool valid = true; -+ sam_hdr_destroy(state->h); -+ check_sam_close("bam2fq", state->fp, opts->fn_input, "file", status); -+ if (state->fpse && bgzf_close(state->fpse)) { print_error_errno("bam2fq", "Error closing singleton file \"%s\"", opts->fnse); valid = false; } -+ int i, j; -+ for (i = 0; i < 3; ++i) { -+ if (state->fpr[i] != state->hsamtools_stdout) { -+ for 
(j = 0; j < i; j++) -+ if (state->fpr[i] == state->fpr[j]) -+ break; -+ if (j == i && bgzf_close(state->fpr[i])) { -+ print_error_errno("bam2fq", "Error closing r%d file \"%s\"", i, opts->fnr[i]); -+ valid = false; -+ } -+ } -+ } -+ if (state->hsamtools_stdout) { -+ if (bgzf_close(state->hsamtools_stdout)) { -+ print_error_errno("bam2fq", "Error closing STDOUT"); -+ valid = false; -+ } -+ } -+ for (i = 0; i < 2; i++) { -+ for (j = 0; j < 3; j++) -+ if (state->fpi[i] == state->fpr[j]) -+ break; -+ for (j -= 3; j >= 0 && j < i; j++) -+ if (state->fpi[i] == state->fpi[j]) -+ break; -+ if (j == i && state->fpi[i] && bgzf_close(state->fpi[i])) { -+ print_error_errno("bam2fq", "Error closing i%d file \"%s\"", i+1, opts->index_file[i]); -+ valid = false; -+ } -+ } -+ kl_destroy(ktaglist,state->taglist); -+ free(state->index_sequence); -+ if (state->p.pool) -+ hts_tpool_destroy(state->p.pool); -+ free(state); -+ return valid; -+} -+ -+static inline bool filter_it_out(const bam1_t *b, const bam2fq_state_t *state) -+{ -+ return ((b->core.flag&(state->flag_on)) != state->flag_on // or reads indicated by filter flags -+ || (b->core.flag&(state->flag_off)) != 0 -+ || (b->core.flag&(state->flag_alloff) && (b->core.flag&(state->flag_alloff)) == state->flag_alloff)); -+ -+} -+ -+static bool bam2fq_mainloop(bam2fq_state_t *state, bam2fq_opts_t* opts) -+{ -+ int n; -+ bam1_t *records[3] = {NULL, NULL, NULL}; -+ char *current_qname = NULL; -+ int64_t n_reads = 0, n_singletons = 0; // Statistics -+ kstring_t linebuf[3] = {{0,0,NULL},{0,0,NULL},{0,0,NULL}}; -+ int score[3]; -+ int at_eof; -+ bool valid = true; -+ bam1_t* b = NULL; -+ -+ while (true) { -+ if (!b) -+ b = bam_init1(); -+ if (b == NULL) { -+ perror("[bam2fq_mainloop] Malloc error for bam record buffer."); -+ valid = false; -+ break; -+ } -+ int res = sam_read1(state->fp, state->h, b); -+ if (res < -1) { -+ fprintf(samtools_stderr, "[bam2fq_mainloop] Failed to read bam record.\n"); -+ valid = false; -+ break; -+ } -+ at_eof 
= res < 0; -+ -+ if (!at_eof && filter_it_out(b, state)) -+ continue; -+ if (!at_eof) ++n_reads; -+ -+ if (at_eof || !current_qname || (strcmp(current_qname, bam_get_qname(b)) != 0)) { -+ if (current_qname) { -+ if (state->illumina_tag) { -+ for (n=0; valid && n<3; n++) { -+ if (!records[n]) continue; -+ if (insert_index_sequence_into_linebuf(state->index_sequence, &linebuf[n], records[n]) < 0) valid = false; -+ } -+ if (!valid) break; -+ } -+ free(state->index_sequence); state->index_sequence = NULL; -+ if (score[1] > 0 && score[2] > 0) { -+ // print linebuf[1] to fpr[1], linebuf[2] to fpr[2] -+ if (bgzf_write(state->fpr[1], linebuf[1].s, linebuf[1].l) < 0) { valid = false; break; } -+ if (bgzf_write(state->fpr[2], linebuf[2].s, linebuf[2].l) < 0) { valid = false; break; } -+ } else if (score[1] > 0 || score[2] > 0) { -+ if (state->fpse) { -+ // print whichever one exists to fpse -+ if (score[1] > 0) { -+ if (bgzf_write(state->fpse, linebuf[1].s, linebuf[1].l) < 0) { valid = false; break; } -+ } else { -+ if (bgzf_write(state->fpse, linebuf[2].s, linebuf[2].l) < 0) { valid = false; break; } -+ } -+ ++n_singletons; -+ } else { -+ if (score[1] > 0) { -+ if (bgzf_write(state->fpr[1], linebuf[1].s, linebuf[1].l) < 0) { valid = false; break; } -+ } else { -+ if (bgzf_write(state->fpr[2], linebuf[2].s, linebuf[2].l) < 0) { valid = false; break; } -+ } -+ } -+ } -+ if (score[0]) { // TODO: check this -+ // print linebuf[0] to fpr[0] -+ if (bgzf_write(state->fpr[0], linebuf[0].s, linebuf[0].l) < 0) { valid = false; break; } -+ } -+ } -+ -+ -+ free(current_qname); current_qname = NULL; -+ score[0] = score[1] = score[2] = 0; -+ for (n=0; n < 3; n++) { -+ bam_destroy1(records[n]); records[n]=NULL; -+ } -+ -+ if (at_eof) { break; } -+ -+ current_qname = strdup(bam_get_qname(b)); -+ if (!current_qname) { valid = false; break; } -+ } -+ -+ // Prefer a copy of the read that has base qualities -+ int b_score = bam_get_qual(b)[0] != 0xff? 
2 : 1; -+ readpart rp = which_readpart(b); -+ if (b_score > score[rp]) { -+ if (!tags2fq(b, state, opts)) { valid = false; break; } -+ if (records[rp]) bam_destroy1(records[rp]); -+ records[rp] = b; -+ score[rp] = b_score; -+ b = NULL; -+ if(!bam1_to_fq(records[rp], &linebuf[rp], state)) { -+ fprintf(samtools_stderr, "[%s] Error converting read to FASTA/Q\n", __func__); -+ valid = false; break; -+ } -+ } -+ } -+ if (!valid) -+ { -+ perror("[bam2fq_mainloop] Error writing to FASTx files."); -+ } -+ bam_destroy1(b); -+ for (n=0; n < 3; n++) { -+ bam_destroy1(records[n]); -+ } -+ free(current_qname); -+ free(linebuf[0].s); -+ free(linebuf[1].s); -+ free(linebuf[2].s); -+ fprintf(samtools_stderr, "[M::%s] discarded %" PRId64 " singletons\n", __func__, n_singletons); -+ fprintf(samtools_stderr, "[M::%s] processed %" PRId64 " reads\n", __func__, n_reads); -+ -+ return valid; -+} -+ -+int main_bam2fq(int argc, char *argv[]) -+{ -+ int status = EXIT_SUCCESS; -+ bam2fq_opts_t* opts = NULL; -+ bam2fq_state_t* state = NULL; -+ -+ bool valid = parse_opts(argc, argv, &opts); -+ if (!valid || opts == NULL) return valid ? EXIT_SUCCESS : EXIT_FAILURE; -+ -+ if (!init_state(opts, &state)) return EXIT_FAILURE; -+ -+ if (!bam2fq_mainloop(state,opts)) status = EXIT_FAILURE; -+ -+ if (!destroy_state(opts, state, &status)) return EXIT_FAILURE; -+ sam_global_args_free(&opts->ga); -+ free_opts(opts); -+ -+ return status; -+} ---- python-pysam.orig/samtools/bam_import.c -+++ /dev/null -@@ -1,65 +0,0 @@ --/* bam_import.c -- SAM format parsing. -- -- Copyright (C) 2008-2013 Genome Research Ltd. 
-- -- Author: Heng Li -- --Permission is hereby granted, free of charge, to any person obtaining a copy --of this software and associated documentation files (the "Software"), to deal --in the Software without restriction, including without limitation the rights --to use, copy, modify, merge, publish, distribute, sublicense, and/or sell --copies of the Software, and to permit persons to whom the Software is --furnished to do so, subject to the following conditions: -- --The above copyright notice and this permission notice shall be included in --all copies or substantial portions of the Software. -- --THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR --IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, --FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL --THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER --LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING --FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER --DEALINGS IN THE SOFTWARE. */ -- --#include -- --#include --#include --#include --#include --#include "htslib/kstring.h" --#include "bam.h" --#include "htslib/kseq.h" -- --KSTREAM_INIT(gzFile, gzread, 16384) -- --bam_header_t *sam_header_read2(const char *fn) --{ -- bam_header_t *header; -- int c, dret, n_targets = 0; -- gzFile fp; -- kstream_t *ks; -- kstring_t *str; -- kstring_t samstr = { 0, 0, NULL }; -- if (fn == 0) return 0; -- fp = (strcmp(fn, "-") == 0)? 
gzdopen(fileno(stdin), "r") : gzopen(fn, "r"); -- if (fp == 0) return 0; -- ks = ks_init(fp); -- str = (kstring_t*)calloc(1, sizeof(kstring_t)); -- while (ks_getuntil(ks, 0, str, &dret) > 0) { -- ksprintf(&samstr, "@SQ\tSN:%s", str->s); -- ks_getuntil(ks, 0, str, &dret); -- ksprintf(&samstr, "\tLN:%d\n", atoi(str->s)); -- n_targets++; -- if (dret != '\n') -- while ((c = ks_getc(ks)) != '\n' && c != -1); -- } -- ks_destroy(ks); -- gzclose(fp); -- free(str->s); free(str); -- header = sam_hdr_parse(samstr.l, samstr.s? samstr.s : ""); -- free(samstr.s); -- fprintf(stderr, "[sam_header_read2] %d sequences loaded.\n", n_targets); -- return header; --} ---- python-pysam.orig/samtools/bam_import.c.pysam.c -+++ /dev/null -@@ -1,67 +0,0 @@ --#include "samtools.pysam.h" -- --/* bam_import.c -- SAM format parsing. -- -- Copyright (C) 2008-2013 Genome Research Ltd. -- -- Author: Heng Li -- --Permission is hereby granted, free of charge, to any person obtaining a copy --of this software and associated documentation files (the "Software"), to deal --in the Software without restriction, including without limitation the rights --to use, copy, modify, merge, publish, distribute, sublicense, and/or sell --copies of the Software, and to permit persons to whom the Software is --furnished to do so, subject to the following conditions: -- --The above copyright notice and this permission notice shall be included in --all copies or substantial portions of the Software. -- --THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR --IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, --FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL --THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER --LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING --FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER --DEALINGS IN THE SOFTWARE. 
*/ -- --#include -- --#include --#include --#include --#include --#include "htslib/kstring.h" --#include "bam.h" --#include "htslib/kseq.h" -- --KSTREAM_INIT(gzFile, gzread, 16384) -- --bam_header_t *sam_header_read2(const char *fn) --{ -- bam_header_t *header; -- int c, dret, n_targets = 0; -- gzFile fp; -- kstream_t *ks; -- kstring_t *str; -- kstring_t samstr = { 0, 0, NULL }; -- if (fn == 0) return 0; -- fp = (strcmp(fn, "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(fn, "r"); -- if (fp == 0) return 0; -- ks = ks_init(fp); -- str = (kstring_t*)calloc(1, sizeof(kstring_t)); -- while (ks_getuntil(ks, 0, str, &dret) > 0) { -- ksprintf(&samstr, "@SQ\tSN:%s", str->s); -- ks_getuntil(ks, 0, str, &dret); -- ksprintf(&samstr, "\tLN:%d\n", atoi(str->s)); -- n_targets++; -- if (dret != '\n') -- while ((c = ks_getc(ks)) != '\n' && c != -1); -- } -- ks_destroy(ks); -- gzclose(fp); -- free(str->s); free(str); -- header = sam_hdr_parse(samstr.l, samstr.s? samstr.s : ""); -- free(samstr.s); -- fprintf(samtools_stderr, "[sam_header_read2] %d sequences loaded.\n", n_targets); -- return header; --} ---- python-pysam.orig/samtools/bam_index.c -+++ python-pysam/samtools/bam_index.c -@@ -1,6 +1,6 @@ - /* bam_index.c -- index and idxstats subcommands. - -- Copyright (C) 2008-2011, 2013, 2014 Genome Research Ltd. -+ Copyright (C) 2008-2011, 2013-2016, 2018, 2019 Genome Research Ltd. - Portions copyright (C) 2010 Broad Institute. - Portions copyright (C) 2013 Peter Cock, The James Hutton Institute. - -@@ -114,20 +114,20 @@ - * Returns 0 on success, - * -1 on failure. 
- */ --int slow_idxstats(samFile *fp, bam_hdr_t *header) { -+int slow_idxstats(samFile *fp, sam_hdr_t *header) { - int ret, last_tid = -2; - bam1_t *b = bam_init1(); - - if (hts_set_opt(fp, CRAM_OPT_REQUIRED_FIELDS, SAM_RNAME | SAM_FLAG)) - return -1; - -- uint64_t (*count0)[2] = calloc(header->n_targets+1, sizeof(*count0)); -+ uint64_t (*count0)[2] = calloc(sam_hdr_nref(header)+1, sizeof(*count0)); - uint64_t (*counts)[2] = count0+1; - if (!count0) - return -1; - - while ((ret = sam_read1(fp, header, b)) >= 0) { -- if (b->core.tid >= header->n_targets || b->core.tid < -1) { -+ if (b->core.tid >= sam_hdr_nref(header) || b->core.tid < -1) { - free(count0); - return -1; - } -@@ -148,10 +148,10 @@ - - if (ret == -1) { - int i; -- for (i = 0; i < header->n_targets; i++) { -- printf("%s\t%d\t%"PRIu64"\t%"PRIu64"\n", -- header->target_name[i], -- header->target_len[i], -+ for (i = 0; i < sam_hdr_nref(header); i++) { -+ printf("%s\t%"PRId64"\t%"PRIu64"\t%"PRIu64"\n", -+ sam_hdr_tid2name(header, i), -+ (int64_t) sam_hdr_tid2len(header, i), - counts[i][0], counts[i][1]); - } - printf("*\t0\t%"PRIu64"\t%"PRIu64"\n", counts[-1][0], counts[-1][1]); -@@ -167,14 +167,14 @@ - static void usage_exit(FILE *fp, int exit_status) - { - fprintf(fp, "Usage: samtools idxstats [options] \n"); -- sam_global_opt_help(fp, "-.---@"); -+ sam_global_opt_help(fp, "-.---@-."); - exit(exit_status); - } - - int bam_idxstats(int argc, char *argv[]) - { - hts_idx_t* idx; -- bam_hdr_t* header; -+ sam_hdr_t* header; - samFile* fp; - int c; - -@@ -227,9 +227,9 @@ - } - - int i; -- for (i = 0; i < header->n_targets; ++i) { -+ for (i = 0; i < sam_hdr_nref(header); ++i) { - // Print out contig name and length -- printf("%s\t%d", header->target_name[i], header->target_len[i]); -+ printf("%s\t%"PRId64, sam_hdr_tid2name(header, i), (int64_t) sam_hdr_tid2len(header, i)); - // Now fetch info about it from the meta bin - uint64_t u, v; - hts_idx_get_stat(idx, i, &u, &v); -@@ -240,7 +240,7 @@ - 
hts_idx_destroy(idx); - } - -- bam_hdr_destroy(header); -+ sam_hdr_destroy(header); - sam_close(fp); - return 0; - } ---- python-pysam.orig/samtools/bam_index.c.pysam.c -+++ python-pysam/samtools/bam_index.c.pysam.c -@@ -2,7 +2,7 @@ - - /* bam_index.c -- index and idxstats subcommands. - -- Copyright (C) 2008-2011, 2013, 2014 Genome Research Ltd. -+ Copyright (C) 2008-2011, 2013-2016, 2018, 2019 Genome Research Ltd. - Portions copyright (C) 2010 Broad Institute. - Portions copyright (C) 2013 Peter Cock, The James Hutton Institute. - -@@ -116,20 +116,20 @@ - * Returns 0 on success, - * -1 on failure. - */ --int slow_idxstats(samFile *fp, bam_hdr_t *header) { -+int slow_idxstats(samFile *fp, sam_hdr_t *header) { - int ret, last_tid = -2; - bam1_t *b = bam_init1(); - - if (hts_set_opt(fp, CRAM_OPT_REQUIRED_FIELDS, SAM_RNAME | SAM_FLAG)) - return -1; - -- uint64_t (*count0)[2] = calloc(header->n_targets+1, sizeof(*count0)); -+ uint64_t (*count0)[2] = calloc(sam_hdr_nref(header)+1, sizeof(*count0)); - uint64_t (*counts)[2] = count0+1; - if (!count0) - return -1; - - while ((ret = sam_read1(fp, header, b)) >= 0) { -- if (b->core.tid >= header->n_targets || b->core.tid < -1) { -+ if (b->core.tid >= sam_hdr_nref(header) || b->core.tid < -1) { - free(count0); - return -1; - } -@@ -150,10 +150,10 @@ - - if (ret == -1) { - int i; -- for (i = 0; i < header->n_targets; i++) { -- fprintf(samtools_stdout, "%s\t%d\t%"PRIu64"\t%"PRIu64"\n", -- header->target_name[i], -- header->target_len[i], -+ for (i = 0; i < sam_hdr_nref(header); i++) { -+ fprintf(samtools_stdout, "%s\t%"PRId64"\t%"PRIu64"\t%"PRIu64"\n", -+ sam_hdr_tid2name(header, i), -+ (int64_t) sam_hdr_tid2len(header, i), - counts[i][0], counts[i][1]); - } - fprintf(samtools_stdout, "*\t0\t%"PRIu64"\t%"PRIu64"\n", counts[-1][0], counts[-1][1]); -@@ -169,14 +169,14 @@ - static void usage_exit(FILE *fp, int exit_status) - { - fprintf(fp, "Usage: samtools idxstats [options] \n"); -- sam_global_opt_help(fp, "-.---@"); -+ 
sam_global_opt_help(fp, "-.---@-."); - exit(exit_status); - } - - int bam_idxstats(int argc, char *argv[]) - { - hts_idx_t* idx; -- bam_hdr_t* header; -+ sam_hdr_t* header; - samFile* fp; - int c; - -@@ -229,9 +229,9 @@ - } - - int i; -- for (i = 0; i < header->n_targets; ++i) { -+ for (i = 0; i < sam_hdr_nref(header); ++i) { - // Print out contig name and length -- fprintf(samtools_stdout, "%s\t%d", header->target_name[i], header->target_len[i]); -+ fprintf(samtools_stdout, "%s\t%"PRId64, sam_hdr_tid2name(header, i), (int64_t) sam_hdr_tid2len(header, i)); - // Now fetch info about it from the meta bin - uint64_t u, v; - hts_idx_get_stat(idx, i, &u, &v); -@@ -242,7 +242,7 @@ - hts_idx_destroy(idx); - } - -- bam_hdr_destroy(header); -+ sam_hdr_destroy(header); - sam_close(fp); - return 0; - } ---- python-pysam.orig/samtools/bam_lpileup.c -+++ python-pysam/samtools/bam_lpileup.c -@@ -100,7 +100,7 @@ - buf->n_nodes = 0; - } - --static int tview_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void *data) -+static int tview_func(uint32_t tid, hts_pos_t pos, int n, const bam_pileup1_t *pl, void *data) - { - bam_lplbuf_t *tv = (bam_lplbuf_t*)data; - freenode_t *p; ---- python-pysam.orig/samtools/bam_lpileup.c.pysam.c -+++ python-pysam/samtools/bam_lpileup.c.pysam.c -@@ -102,7 +102,7 @@ - buf->n_nodes = 0; - } - --static int tview_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void *data) -+static int tview_func(uint32_t tid, hts_pos_t pos, int n, const bam_pileup1_t *pl, void *data) - { - bam_lplbuf_t *tv = (bam_lplbuf_t*)data; - freenode_t *p; ---- python-pysam.orig/samtools/bam_lpileup.h -+++ python-pysam/samtools/bam_lpileup.h -@@ -33,7 +33,7 @@ - - #ifndef BAM_PILEUP_F_DEFINED - #define BAM_PILEUP_F_DEFINED --typedef int (*bam_pileup_f)(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void *data); -+typedef int (*bam_pileup_f)(uint32_t tid, hts_pos_t pos, int n, const bam_pileup1_t *pl, void *data); - #endif 
//BAM_PILEUP_F_DEFINED - - ---- python-pysam.orig/samtools/bam_markdup.c -+++ python-pysam/samtools/bam_markdup.c -@@ -1,7 +1,7 @@ - /* bam_markdup.c -- Mark duplicates from a coord sorted file that has gone - through fixmates with the mate scoring option on. - -- Copyright (C) 2017-18 Genome Research Ltd. -+ Copyright (C) 2017-2019 Genome Research Ltd. - - Author: Andrew Whitwham - -@@ -22,6 +22,9 @@ - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER - DEALINGS IN THE SOFTWARE -+ -+Estimate library size derived from Picard DuplicationMetrics.java -+Copyright (c) 2009,2018 The Broad Institute. MIT license. - */ - - #include -@@ -33,6 +36,7 @@ - #include - #include - #include -+#include - #include "htslib/thread_pool.h" - #include "htslib/sam.h" - #include "sam_opts.h" -@@ -42,26 +46,53 @@ - #include "htslib/kstring.h" - #include "tmp_file.h" - -+ -+typedef struct { -+ samFile *in; -+ samFile *out; -+ char *prefix; -+ int remove_dups; -+ int32_t max_length; -+ int do_stats; -+ int supp; -+ int tag; -+ int opt_dist; -+ int no_pg; -+ int clear; -+ int mode; -+ int write_index; -+ int include_fails; -+ char *stats_file; -+ char *arg_list; -+ char *out_fn; -+} md_param_t; -+ - typedef struct { -- int32_t single; -+ hts_pos_t this_coord; -+ hts_pos_t other_coord; - int32_t this_ref; -- int32_t this_coord; - int32_t other_ref; -- int32_t other_coord; -- int32_t leftmost; -- int32_t orientation; -+ int8_t single; -+ int8_t leftmost; -+ int8_t orientation; - } key_data_t; - -+typedef struct read_queue_s { -+ key_data_t pair_key; -+ key_data_t single_key; -+ bam1_t *b; -+ struct read_queue_s *duplicate; -+ hts_pos_t pos; -+} read_queue_t; -+ - typedef struct { -- bam1_t *p; -+ read_queue_t *p; - } in_hash_t; - - typedef struct { -- bam1_t *b; -- int32_t pos; -- key_data_t pair_key; -- key_data_t single_key; --} read_queue_t; -+ char *name; -+ char type; -+} dup_map_t; - - - -@@ 
-72,22 +103,22 @@ - khint_t hash; - - if (key.single) { -- unsigned char sig[12]; -+ unsigned char sig[13]; - - memcpy(sig + i, &key.this_ref, 4); i += 4; -- memcpy(sig + i, &key.this_coord, 4); i += 4; -- memcpy(sig + i, &key.orientation, 4); i += 4; -+ memcpy(sig + i, &key.this_coord, 8); i += 8; -+ memcpy(sig + i, &key.orientation, 1); i += 1; - - hash = do_hash(sig, i); - } else { -- unsigned char sig[24]; -+ unsigned char sig[26]; - - memcpy(sig + i, &key.this_ref, 4); i += 4; -- memcpy(sig + i, &key.this_coord, 4); i += 4; -+ memcpy(sig + i, &key.this_coord, 8); i += 8; - memcpy(sig + i, &key.other_ref, 4); i += 4; -- memcpy(sig + i, &key.other_coord, 4); i += 4; -- memcpy(sig + i, &key.leftmost, 4); i += 4; -- memcpy(sig + i, &key.orientation, 4); i += 4; -+ memcpy(sig + i, &key.other_coord, 8); i += 8; -+ memcpy(sig + i, &key.leftmost, 1); i += 1; -+ memcpy(sig + i, &key.orientation, 1); i += 1; - - hash = do_hash(sig, i); - } -@@ -122,21 +153,35 @@ - - - #define __free_queue_element(p) -+ -+// Orientations (prime numbers to feed to hashing algorithm) - #define O_FF 2 - #define O_RR 3 - #define O_FR 5 - #define O_RF 7 - -+// Left or rightmost -+#define R_LE 11 -+#define R_RI 13 -+ -+#define BMD_WARNING_MAX 10 -+ -+#define MD_MIN_QUALITY 15 -+ -+// Duplicate finding mode -+#define MD_MODE_TEMPLATE 0 -+#define MD_MODE_SEQUENCE 1 -+ - KHASH_INIT(reads, key_data_t, in_hash_t, 1, hash_key, key_equal) // read map hash - KLIST_INIT(read_queue, read_queue_t, __free_queue_element) // the reads buffer --KHASH_MAP_INIT_STR(duplicates, int) // map of duplicates for supplementary dup id -+KHASH_MAP_INIT_STR(duplicates, dup_map_t) // map of duplicates for supplementary dup id - - - /* Calculate the mate's unclipped start based on position and cigar string from MC tag. 
*/ - --static int32_t unclipped_other_start(int32_t op, char *cigar) { -+static hts_pos_t unclipped_other_start(hts_pos_t op, char *cigar) { - char *c = cigar; -- int32_t clipped = 0; -+ int64_t clipped = 0; - - while (*c && *c != '*') { - long num = 0; -@@ -162,9 +207,9 @@ - - /* Calculate the current read's start based on the stored cigar string. */ - --static int32_t unclipped_start(bam1_t *b) { -+static hts_pos_t unclipped_start(bam1_t *b) { - uint32_t *cigar = bam_get_cigar(b); -- int32_t clipped = 0; -+ int64_t clipped = 0; - uint32_t i; - - for (i = 0; i < b->core.n_cigar; i++) { -@@ -183,9 +228,9 @@ - - /* Calculate the mate's unclipped end based on start position and cigar string from MC tag.*/ - --static int32_t unclipped_other_end(int32_t op, char *cigar) { -+static hts_pos_t unclipped_other_end(int64_t op, char *cigar) { - char *c = cigar; -- int32_t refpos = 0; -+ int64_t refpos = 0; - int skip = 1; - - while (*c && *c != '*') { -@@ -224,9 +269,9 @@ - - /* Calculate the current read's end based on the stored cigar string. */ - --static int32_t unclipped_end(bam1_t *b) { -+static hts_pos_t unclipped_end(bam1_t *b) { - uint32_t *cigar = bam_get_cigar(b); -- int32_t end_pos, clipped = 0; -+ hts_pos_t end_pos, clipped = 0; - int32_t i; - - end_pos = bam_endpos(b); -@@ -293,7 +338,7 @@ - int i; - - for (i = 0; i < b->core.l_qseq; i++) { -- if (qual[i] >= 15) score += qual[i]; -+ if (qual[i] >= MD_MIN_QUALITY) score += qual[i]; - } - - return score; -@@ -305,10 +350,10 @@ - the reference id, orientation and whether the current - read is leftmost of the pair. 
*/ - --static int make_pair_key(key_data_t *key, bam1_t *bam) { -- int32_t this_ref, this_coord, this_end; -- int32_t other_ref, other_coord, other_end; -- int32_t orientation, leftmost; -+static int make_pair_key_template(key_data_t *key, bam1_t *bam) { -+ hts_pos_t this_coord, other_coord, this_end, other_end; -+ int32_t this_ref, other_ref; -+ int8_t orientation, leftmost; - uint8_t *data; - char *cig; - -@@ -319,7 +364,11 @@ - this_end = unclipped_end(bam); - - if ((data = bam_aux_get(bam, "MC"))) { -- cig = bam_aux2Z(data); -+ if (!(cig = bam_aux2Z(data))) { -+ fprintf(stderr, "[markdup] error: MC tag wrong type. Please use the MC tag provided by samtools fixmate.\n"); -+ return 1; -+ } -+ - other_end = unclipped_other_end(bam->core.mpos, cig); - other_coord = unclipped_other_start(bam->core.mpos, cig); - } else { -@@ -402,9 +451,9 @@ - } - - if (!leftmost) -- leftmost = 13; -+ leftmost = R_RI; - else -- leftmost = 11; -+ leftmost = R_LE; - - key->single = 0; - key->this_ref = this_ref; -@@ -418,13 +467,140 @@ - } - - -+static int make_pair_key_sequence(key_data_t *key, bam1_t *bam) { -+ hts_pos_t this_coord, this_end, other_coord, other_end, leftmost; -+ int32_t this_ref, other_ref; -+ int8_t orientation, left_read; -+ uint8_t *data; -+ char *cig; -+ -+ this_ref = bam->core.tid + 1; // avoid a 0 being put into the hash -+ other_ref = bam->core.mtid + 1; -+ -+ this_coord = unclipped_start(bam); -+ this_end = unclipped_end(bam); -+ -+ if ((data = bam_aux_get(bam, "MC"))) { -+ if (!(cig = bam_aux2Z(data))) { -+ fprintf(stderr, "[markdup] error: MC tag wrong type. Please use the MC tag provided by samtools fixmate.\n"); -+ return 1; -+ } -+ -+ other_end = unclipped_other_end(bam->core.mpos, cig); -+ other_coord = unclipped_other_start(bam->core.mpos, cig); -+ } else { -+ fprintf(stderr, "[markdup] error: no MC tag. 
Please run samtools fixmate on file first.\n"); -+ return 1; -+ } -+ -+ // work out orientations -+ if (this_ref != other_ref) { -+ leftmost = this_ref - other_ref; -+ } else { -+ if (bam_is_rev(bam) == bam_is_mrev(bam)) { -+ if (!bam_is_rev(bam)) { -+ leftmost = this_coord - other_coord; -+ } else { -+ leftmost = this_end - other_end; -+ } -+ } else { -+ if (bam_is_rev(bam)) { -+ leftmost = this_end - other_coord; -+ } else { -+ leftmost = this_coord - other_end; -+ } -+ } -+ } -+ -+ if (leftmost < 0) { -+ leftmost = 1; -+ } else if (leftmost > 0) { -+ leftmost = 0; -+ } else { -+ // tie breaks -+ -+ if (bam->core.pos == bam->core.mpos) { -+ if (bam->core.flag & BAM_FREAD1) { -+ leftmost = 1; -+ } else { -+ leftmost = 0; -+ } -+ } else if (bam->core.pos < bam->core.mpos) { -+ leftmost = 1; -+ } else { -+ leftmost = 0; -+ } -+ } -+ -+ // pair orientation -+ if (leftmost) { -+ if (bam_is_rev(bam) == bam_is_mrev(bam)) { -+ -+ if (!bam_is_rev(bam)) { -+ orientation = O_FF; -+ } else { -+ orientation = O_RR; -+ } -+ } else { -+ if (!bam_is_rev(bam)) { -+ orientation = O_FR; -+ } else { -+ orientation = O_RF; -+ } -+ } -+ } else { -+ if (bam_is_rev(bam) == bam_is_mrev(bam)) { -+ -+ if (!bam_is_rev(bam)) { -+ orientation = O_RR; -+ } else { -+ orientation = O_FF; -+ } -+ } else { -+ if (!bam_is_rev(bam)) { -+ orientation = O_RF; -+ } else { -+ orientation = O_FR; -+ } -+ } -+ } -+ -+ if (!leftmost) -+ left_read = R_RI; -+ else -+ left_read = R_LE; -+ -+ if (!bam_is_rev(bam)) { -+ this_coord = unclipped_start(bam); -+ } else { -+ this_coord = unclipped_end(bam); -+ } -+ -+ if (!bam_is_mrev(bam)) { -+ other_coord = unclipped_other_start(bam->core.mpos, cig); -+ } else { -+ other_coord = unclipped_other_end(bam->core.mpos, cig); -+ } -+ -+ key->single = 0; -+ key->this_ref = this_ref; -+ key->this_coord = this_coord; -+ key->other_ref = other_ref; -+ key->other_coord = other_coord; -+ key->leftmost = left_read; -+ key->orientation = orientation; -+ -+ return 0; -+} -+ - /* 
Create a signature hash of single read (or read with an unmatched pair). - Uses unclipped start (or end depending on orientation), reference id, - and orientation. */ - - static void make_single_key(key_data_t *key, bam1_t *bam) { -- int32_t this_ref, this_coord; -- int32_t orientation; -+ hts_pos_t this_coord; -+ int32_t this_ref; -+ int8_t orientation; - - this_ref = bam->core.tid + 1; // avoid a 0 being put into the hash - -@@ -442,23 +618,45 @@ - key->orientation = orientation; - } - -+ - /* Add the duplicate name to a hash if it does not exist. */ - --static int add_duplicate(khash_t(duplicates) *d_hash, bam1_t *dupe) { -+static int add_duplicate(khash_t(duplicates) *d_hash, bam1_t *dupe, char *orig_name, char type) { - khiter_t d; - int ret; - - d = kh_get(duplicates, d_hash, bam_get_qname(dupe)); - - if (d == kh_end(d_hash)) { -- d = kh_put(duplicates, d_hash, strdup(bam_get_qname(dupe)), &ret); -+ char *name = strdup(bam_get_qname(dupe)); -+ if (name) { -+ d = kh_put(duplicates, d_hash, name, &ret); -+ } else { -+ ret = -1; -+ } -+ -+ if (ret >= 0) { -+ if (orig_name) { -+ if (ret == 0) { -+ // replace old name -+ free(kh_value(d_hash, d).name); -+ free(name); -+ } - -- if (ret > 0) { -- kh_value(d_hash, d) = 1; -- } else if (ret == 0) { -- kh_value(d_hash, d)++; -+ kh_value(d_hash, d).name = strdup(orig_name); -+ -+ if (kh_value(d_hash, d).name == NULL) { -+ fprintf(stderr, "[markdup] error: unable to allocate memory for duplicate original name.\n"); -+ return 1; -+ } -+ } else { -+ kh_value(d_hash, d).name = NULL; -+ } -+ -+ kh_value(d_hash, d).type = type; - } else { - fprintf(stderr, "[markdup] error: unable to store supplementary duplicates.\n"); -+ free(name); - return 1; - } - } -@@ -467,6 +665,467 @@ - } - - -+static inline int get_coordinate_positions(const char *qname, int *xpos, int *ypos) { -+ int sep = 0; -+ int pos = 0; -+ -+ while (qname[pos]) { -+ if (qname[pos] == ':') { -+ sep++; -+ -+ if (sep == 2) { -+ *xpos = pos + 1; -+ } else if (sep 
== 3) { -+ *ypos = pos + 1; -+ } else if (sep == 4) { // HiSeq style names -+ *xpos = *ypos; -+ *ypos = pos + 1; -+ } else if (sep == 5) { // Newer Illumina format -+ *xpos = pos + 1; -+ } else if (sep == 6) { -+ *ypos = pos + 1; -+ } -+ } -+ -+ pos++; -+ } -+ -+ return sep; -+} -+ -+/* Using the coordinates from the Illumina read name, see whether the duplicated read is -+ close enough (set by max_dist) to the original to be counted as optical.*/ -+ -+static int optical_duplicate(bam1_t *ori, bam1_t *dup, long max_dist, long *warnings) { -+ int ret = 0, seps; -+ char *original, *duplicate; -+ int oxpos = 0, oypos = 0, dxpos = 0, dypos = 0; -+ -+ -+ original = bam_get_qname(ori); -+ duplicate = bam_get_qname(dup); -+ -+ seps = get_coordinate_positions(original, &oxpos, &oypos); -+ -+ if (!(seps == 3 || seps == 4 || seps == 6 || seps == 7)) { -+ (*warnings)++; -+ -+ if (*warnings <= BMD_WARNING_MAX) { -+ fprintf(stderr, "[markdup] warning: cannot decipher read name %s for optical duplicate marking.\n", original); -+ } -+ -+ return ret; -+ } -+ -+ seps = get_coordinate_positions(duplicate, &dxpos, &dypos); -+ -+ if (!(seps == 3 || seps == 4 || seps == 6 || seps == 7)) { -+ -+ (*warnings)++; -+ -+ if (*warnings <= BMD_WARNING_MAX) { -+ fprintf(stderr, "[markdup] warning: cannot decipher read name %s for optical duplicate marking.\n", duplicate); -+ } -+ -+ return ret; -+ } -+ -+ if (strncmp(original, duplicate, oxpos - 1) == 0) { -+ // the initial parts match, look at the numbers -+ long ox, oy, dx, dy, xdiff, ydiff; -+ char *end; -+ -+ ox = strtol(original + oxpos, &end, 10); -+ -+ if ((original + oxpos) == end) { -+ (*warnings)++; -+ -+ if (*warnings <= BMD_WARNING_MAX) { -+ fprintf(stderr, "[markdup] warning: can not decipher X coordinate in %s .\n", original); -+ } -+ -+ return ret; -+ } -+ -+ dx = strtol(duplicate + dxpos, &end, 10); -+ -+ if ((duplicate + dxpos) == end) { -+ (*warnings)++; -+ -+ if (*warnings <= BMD_WARNING_MAX) { -+ fprintf(stderr, "[markdup] 
warning: can not decipher X coordinate in %s.\n", duplicate); -+ } -+ -+ return ret; -+ } -+ -+ if (ox > dx) { -+ xdiff = ox - dx; -+ } else { -+ xdiff = dx - ox; -+ } -+ -+ if (xdiff <= max_dist) { -+ // still might be optical -+ -+ oy = strtol(original + oypos, &end, 10); -+ -+ if ((original + oypos) == end) { -+ (*warnings)++; -+ -+ if (*warnings <= BMD_WARNING_MAX) { -+ fprintf(stderr, "[markdup] warning: can not decipher Y coordinate in %s.\n", original); -+ } -+ -+ return ret; -+ } -+ -+ dy = strtol(duplicate + dypos, &end, 10); -+ -+ if ((duplicate + dypos) == end) { -+ (*warnings)++; -+ -+ if (*warnings <= BMD_WARNING_MAX) { -+ fprintf(stderr, "[markdup] warning: can not decipher Y coordinate in %s.\n", duplicate); -+ } -+ -+ return ret; -+ } -+ -+ if (oy > dy) { -+ ydiff = oy - dy; -+ } else { -+ ydiff = dy - oy; -+ } -+ -+ if (ydiff <= max_dist) ret = 1; -+ } -+ } -+ -+ return ret; -+} -+ -+ -+static int mark_duplicates(md_param_t *param, khash_t(duplicates) *dup_hash, bam1_t *ori, bam1_t *dup, -+ long *optical, long *warn) { -+ char dup_type = 0; -+ long incoming_warnings = *warn; -+ -+ dup->core.flag |= BAM_FDUP; -+ -+ if (param->tag) { -+ if (bam_aux_append(dup, "do", 'Z', strlen(bam_get_qname(ori)) + 1, (uint8_t*)bam_get_qname(ori))) { -+ fprintf(stderr, "[markdup] error: unable to append 'do' tag.\n"); -+ return -1; -+ } -+ } -+ -+ if (param->opt_dist) { // mark optical duplicates -+ if (optical_duplicate(ori, dup, param->opt_dist, warn)) { -+ bam_aux_append(dup, "dt", 'Z', 3, (const uint8_t *)"SQ"); -+ dup_type = 'O'; -+ (*optical)++; -+ } else { -+ // not an optical duplicate -+ bam_aux_append(dup, "dt", 'Z', 3, (const uint8_t *)"LB"); -+ } -+ } -+ -+ if ((*warn == BMD_WARNING_MAX) && (incoming_warnings != *warn)) { -+ fprintf(stderr, "[markdup] warning: %ld decipher read name warnings. 
New warnings will not be reported.\n", -+ *warn); -+ } -+ -+ if (param->supp) { -+ if (bam_aux_get(dup, "SA") || (dup->core.flag & BAM_FMUNMAP) || bam_aux_get(dup, "XA")) { -+ char *original = NULL; -+ -+ if (param->tag) { -+ original = bam_get_qname(ori); -+ } -+ -+ if (add_duplicate(dup_hash, dup, original, dup_type)) -+ return -1; -+ } -+ } -+ -+ return 0; -+} -+ -+ -+static inline int optical_retag(md_param_t *param, khash_t(duplicates) *dup_hash, bam1_t *b, int paired, long *optical_single, long *optical_pair) { -+ int ret = 0; -+ uint8_t *data; -+ -+ // remove any existing dt tag -+ if ((data = bam_aux_get(b, "dt")) != NULL) { -+ bam_aux_del(b, data); -+ } -+ -+ if (bam_aux_append(b, "dt", 'Z', 3, (const uint8_t *)"SQ")) { -+ fprintf(stderr, "[markdup] error: unable to append 'dt' tag.\n"); -+ ret = -1; -+ } -+ -+ if (paired) { -+ (*optical_pair)++; -+ } else { -+ (*optical_single)++; -+ } -+ -+ if (param->supp) { -+ // Change the duplicate type -+ -+ if (bam_aux_get(b, "SA") || (b->core.flag & BAM_FMUNMAP) -+ || bam_aux_get(b, "XA")) { -+ khiter_t d; -+ -+ d = kh_get(duplicates, dup_hash, bam_get_qname(b)); -+ -+ if (d == kh_end(dup_hash)) { -+ // error, name should already be in dup hash -+ fprintf(stderr, "[markdup] error: duplicate name %s not found in hash.\n", -+ bam_get_qname(b)); -+ ret = -1; -+ } else { -+ kh_value(dup_hash, d).type = 'O'; -+ } -+ } -+ } -+ -+ return ret; -+} -+ -+ -+ -+/* -+ Where there is more than one duplicate go down the list and check for optical duplicates and change -+ do tags (where used) to point to original (non-duplicate) read. 
-+*/ -+static int duplicate_chain_check(md_param_t *param, khash_t(duplicates) *dup_hash, read_queue_t *ori, -+ long *warn, long *optical_single, long *optical_pair) { -+ int ret = 0; -+ read_queue_t *current = ori->duplicate; -+ char *ori_name = bam_get_qname(ori->b); -+ int have_original = !(ori->b->core.flag & BAM_FDUP); -+ int ori_paired = (ori->b->core.flag & BAM_FPAIRED) && !(ori->b->core.flag & BAM_FMUNMAP); -+ -+ while (current) { -+ int current_paired = (current->b->core.flag & BAM_FPAIRED) && !(current->b->core.flag & BAM_FMUNMAP); -+ -+ if (param->tag && have_original) { -+ uint8_t *data; -+ -+ // at this stage all duplicates should have a do tag -+ if ((data = bam_aux_get(current->b, "do")) != NULL) { -+ // see if we need to change the tag -+ char *old_name = bam_aux2Z(data); -+ -+ if (old_name) { -+ if (strcmp(old_name, ori_name) != 0) { -+ bam_aux_del(current->b, data); -+ -+ if (bam_aux_append(current->b, "do", 'Z', strlen(ori_name) + 1, (uint8_t*)ori_name)) { -+ fprintf(stderr, "[markdup] error: unable to append 'do' tag.\n"); -+ ret = -1; -+ break; -+ } -+ } -+ } else { -+ fprintf(stderr, "[markdup] error: 'do' tag has wrong type for read %s.\n", bam_get_qname(current->b)); -+ ret = -1; -+ break; -+ } -+ } -+ } -+ -+ if (param->opt_dist) { -+ int is_cur_opt = 0, is_ori_opt = 0; -+ uint8_t *data; -+ char *dup_type; -+ -+ if ((data = bam_aux_get(ori->b, "dt"))) { -+ if ((dup_type = bam_aux2Z(data))) { -+ if (strcmp(dup_type, "SQ") == 0) { -+ is_ori_opt = 1; -+ } -+ } -+ } -+ -+ if ((data = bam_aux_get(current->b, "dt"))) { -+ if ((dup_type = bam_aux2Z(data))) { -+ if (strcmp(dup_type, "SQ") == 0) { -+ is_cur_opt = 1; -+ } -+ } -+ } -+ -+ if (!(is_ori_opt && is_cur_opt)) { -+ // if both are already optical duplicates there is no need to check again, otherwise... 
-+ -+ if (optical_duplicate(ori->b, current->b, param->opt_dist, warn)) { -+ // find out which one is the duplicate -+ int is_cur_dup = 0; -+ -+ if (have_original) { -+ // compared against an original, this is a dup. -+ is_cur_dup = 1; -+ } else if (ori_paired != current_paired) { -+ if (!current_paired) { -+ // current is single vs pair, this is a dup. -+ is_cur_dup = 1; -+ } -+ } else { -+ // do it by scores -+ int64_t ori_score, curr_score; -+ -+ if ((ori->b->core.flag & BAM_FQCFAIL) != (current->b->core.flag & BAM_FQCFAIL)) { -+ if (ori->b->core.flag & BAM_FQCFAIL) { -+ ori_score = 0; -+ curr_score = 1; -+ } else { -+ ori_score = 1; -+ curr_score = 0; -+ } -+ } else { -+ ori_score = calc_score(ori->b); -+ curr_score = calc_score(current->b); -+ -+ if (current_paired) { -+ // they are pairs so add mate scores. -+ int64_t mate_tmp; -+ -+ if ((mate_tmp = get_mate_score(ori->b)) == -1) { -+ fprintf(stderr, "[markdup] error: no ms score tag. Please run samtools fixmate on file first.\n"); -+ ret = -1; -+ break; -+ } else { -+ ori_score += mate_tmp; -+ } -+ -+ if ((mate_tmp = get_mate_score(current->b)) == -1) { -+ fprintf(stderr, "[markdup] error: no ms score tag. 
Please run samtools fixmate on file first.\n"); -+ ret = -1; -+ break; -+ } else { -+ curr_score += mate_tmp; -+ } -+ } -+ } -+ -+ if (ori_score == curr_score) { -+ if (strcmp(bam_get_qname(current->b), ori_name) < 0) { -+ curr_score++; -+ } else { -+ curr_score--; -+ } -+ } -+ -+ if (ori_score > curr_score) { -+ is_cur_dup = 1; -+ } -+ } -+ -+ if (is_cur_dup) { -+ // the current is the optical duplicate -+ if (!is_cur_opt) { // only change if not already an optical duplicate -+ if (optical_retag(param, dup_hash, current->b, current_paired, optical_single, optical_pair)) { -+ ret = -1; -+ break; -+ } -+ } -+ } else { -+ if (!is_ori_opt) { -+ if (optical_retag(param, dup_hash, ori->b, ori_paired, optical_single, optical_pair)) { -+ ret = -1; -+ break; -+ } -+ } -+ } -+ } -+ } -+ } -+ -+ current = current->duplicate; -+ } -+ -+ return ret; -+} -+ -+/* -+ Function to use when estimating library size. -+ -+ This is based on an approximate formula for the coverage of a set -+ obtained after sampling it a given number of times with replacement. -+ -+ x = number of items in the set (the number of unique fragments in the library) -+ -+ c = number of unique items (unique read pairs observed) -+ -+ n = number of items samples (total number of read pairs) -+ -+ c and n are known; x is unknown. -+ -+ As n -> infinity, the coverage (c/x) can be given as: -+ -+ c / x = 1 - exp(-n / x) (see https://math.stackexchange.com/questions/32800) -+ -+ This needs to be solved for x, so it is rearranged to put both terms on the -+ left side and estimate_library_size() finds a value of x which gives a -+ result of zero (or as close as it can get). 
-+ */ -+static inline double coverage_equation(double x, double c, double n) { -+ return c / x - 1 + exp(-n / x); -+} -+ -+ -+/* estimate the library size, based on the Picard code in DuplicationMetrics.java*/ -+static unsigned long estimate_library_size(unsigned long read_pairs, unsigned long duplicate_pairs) { -+ unsigned long estimated_size = 0; -+ -+ read_pairs /= 2; -+ duplicate_pairs /= 2; -+ -+ if ((read_pairs && duplicate_pairs) && (read_pairs > duplicate_pairs)) { -+ unsigned long unique_pairs = read_pairs - duplicate_pairs; -+ double m = 1; -+ double M = 100; -+ int i; -+ -+ if (coverage_equation(m * (double)unique_pairs, (double)unique_pairs, (double)read_pairs) < 0) { -+ fprintf(stderr, "[markdup] warning: unable to calculate estimated library size.\n"); -+ return estimated_size; -+ } -+ -+ while (coverage_equation(M * (double)unique_pairs, (double)unique_pairs, (double)read_pairs) > 0) { -+ M *= 10; -+ } -+ -+ for (i = 0; i < 40; i++) { -+ double r = (m + M) / 2; -+ double u = coverage_equation(r * (double)unique_pairs, (double)unique_pairs, (double)read_pairs); -+ -+ if (u > 0) { -+ m = r; -+ } else if (u < 0) { -+ M = r; -+ } else { -+ break; -+ } -+ } -+ -+ estimated_size = (unsigned long)(unique_pairs * (m + M) / 2); -+ } else { -+ fprintf(stderr, "[markdup] warning: unable to calculate estimated library size." -+ " Read pairs %ld should be greater than duplicate pairs %ld," -+ " which should both be non zero.\n", -+ read_pairs, duplicate_pairs); -+ } -+ -+ return estimated_size; -+} -+ -+ - /* Compare the reads near each other (coordinate sorted) and try to spot the duplicates. - Generally the highest quality scoring is chosen as the original and all others the duplicates. - The score is based on the sum of the quality values (<= 15) of the read and its mate (if any). -@@ -476,44 +1135,59 @@ - Marking the supplementary reads of a duplicate as also duplicates takes an extra file read/write - step. 
This is because the duplicate can occur before the primary read.*/ - --static int bam_mark_duplicates(samFile *in, samFile *out, char *prefix, int remove_dups, int32_t max_length, int do_stats, int supp, int tag) { -- bam_hdr_t *header; -+static int bam_mark_duplicates(md_param_t *param) { -+ bam_hdr_t *header = NULL; - khiter_t k; - khash_t(reads) *pair_hash = kh_init(reads); - khash_t(reads) *single_hash = kh_init(reads); - klist_t(read_queue) *read_buffer = kl_init(read_queue); - kliter_t(read_queue) *rq; - khash_t(duplicates) *dup_hash = kh_init(duplicates); -- int32_t prev_tid, prev_coord; -+ int32_t prev_tid; -+ hts_pos_t prev_coord; - read_queue_t *in_read; - int ret; -- int reading, writing, excluded, duplicate, single, pair, single_dup, examined; -+ long reading, writing, excluded, duplicate, single, pair, single_dup, examined, optical, single_optical; -+ long np_duplicate, np_opt_duplicate; -+ long opt_warnings = 0; - tmp_file_t temp; -+ char *idx_fn = NULL; -+ int exclude = 0; - -- if ((header = sam_hdr_read(in)) == NULL) { -+ if (!pair_hash || !single_hash || !read_buffer || !dup_hash) { -+ fprintf(stderr, "[markdup] out of memory\n"); -+ goto fail; -+ } -+ -+ if ((header = sam_hdr_read(param->in)) == NULL) { - fprintf(stderr, "[markdup] error reading header\n"); -- return 1; -+ goto fail; - } - - // accept unknown, unsorted or coordinate sort order, but error on queryname sorted. - // only really works on coordinate sorted files. -- if ((header->l_text > 3) && (strncmp(header->text, "@HD", 3) == 0)) { -- char *p, *q; -- -- p = strstr(header->text, "\tSO:queryname"); -- q = strchr(header->text, '\n'); -- -- // looking for SO:queryname within @HD only -- // (e.g. 
must ignore in a @CO comment line later in header) -- if ((p != 0) && (p < q)) { -- fprintf(stderr, "[markdup] error: queryname sorted, must be sorted by coordinate.\n"); -- return 1; -- } -+ kstring_t str = KS_INITIALIZE; -+ if (!sam_hdr_find_tag_hd(header, "SO", &str) && str.s && !strcmp(str.s, "queryname")) { -+ fprintf(stderr, "[markdup] error: queryname sorted, must be sorted by coordinate.\n"); -+ ks_free(&str); -+ goto fail; -+ } -+ ks_free(&str); -+ -+ if (!param->no_pg && sam_hdr_add_pg(header, "samtools", "VN", samtools_version(), -+ param->arg_list ? "CL" : NULL, -+ param->arg_list ? param->arg_list : NULL, -+ NULL) != 0) { -+ fprintf(stderr, "[markdup] warning: unable to add @PG line to header.\n"); - } - -- if (sam_hdr_write(out, header) < 0) { -+ if (sam_hdr_write(param->out, header) < 0) { - fprintf(stderr, "[markdup] error writing header.\n"); -- return 1; -+ goto fail; -+ } -+ if (param->write_index) { -+ if (!(idx_fn = auto_index(param->out, param->out_fn, header))) -+ goto fail; - } - - // used for coordinate order checks -@@ -521,30 +1195,35 @@ - - // get the buffer going - in_read = kl_pushp(read_queue, read_buffer); -+ if (!in_read) { -+ fprintf(stderr, "[markdup] out of memory\n"); -+ goto fail; -+ } - - // handling supplementary reads needs a temporary file -- if (supp) { -- if (tmp_file_open_write(&temp, prefix, 1)) { -- fprintf(stderr, "[markdup] error: unable to open tmp file %s.\n", prefix); -- return 1; -+ if (param->supp) { -+ if (tmp_file_open_write(&temp, param->prefix, 1)) { -+ fprintf(stderr, "[markdup] error: unable to open tmp file %s.\n", param->prefix); -+ goto fail; - } - } - - if ((in_read->b = bam_init1()) == NULL) { - fprintf(stderr, "[markdup] error: unable to allocate memory for alignment.\n"); -- return 1; -+ goto fail; - } - -- reading = writing = excluded = single_dup = duplicate = examined = pair = single = 0; -+ reading = writing = excluded = single_dup = duplicate = examined = pair = single = optical = 
single_optical = 0; -+ np_duplicate = np_opt_duplicate = 0; - -- while ((ret = sam_read1(in, header, in_read->b)) >= 0) { -+ while ((ret = sam_read1(param->in, header, in_read->b)) >= 0) { - - // do some basic coordinate order checks - if (in_read->b->core.tid >= 0) { // -1 for unmapped reads - if (in_read->b->core.tid < prev_tid || - ((in_read->b->core.tid == prev_tid) && (in_read->b->core.pos < prev_coord))) { -- fprintf(stderr, "[markdup] error: bad coordinate order.\n"); -- return 1; -+ fprintf(stderr, "[markdup] error: not in coordinate sorted order.\n"); -+ goto fail; - } - } - -@@ -555,10 +1234,30 @@ - - reading++; - -- // read must not be secondary, supplementary, unmapped or failed QC -- if (!(in_read->b->core.flag & (BAM_FSECONDARY | BAM_FSUPPLEMENTARY | BAM_FUNMAP | BAM_FQCFAIL))) { -- examined++; -+ if (param->clear && (in_read->b->core.flag & BAM_FDUP)) { -+ uint8_t *data; -+ -+ in_read->b->core.flag ^= BAM_FDUP; - -+ if ((data = bam_aux_get(in_read->b, "dt")) != NULL) { -+ bam_aux_del(in_read->b, data); -+ } -+ -+ if ((data = bam_aux_get(in_read->b, "do")) != NULL) { -+ bam_aux_del(in_read->b, data); -+ } -+ } -+ -+ if (param->include_fails) { -+ exclude |= (BAM_FSECONDARY | BAM_FSUPPLEMENTARY | BAM_FUNMAP); -+ } else { -+ exclude |= (BAM_FSECONDARY | BAM_FSUPPLEMENTARY | BAM_FUNMAP | BAM_FQCFAIL); -+ } -+ -+ // read must not be secondary, supplementary, unmapped or (possibly) failed QC -+ if (!(in_read->b->core.flag & exclude)) { -+ examined++; -+ in_read->duplicate = NULL; - - // look at the pairs first - if ((in_read->b->core.flag & BAM_FPAIRED) && !(in_read->b->core.flag & BAM_FMUNMAP)) { -@@ -567,9 +1266,16 @@ - key_data_t single_key; - in_hash_t *bp; - -- if (make_pair_key(&pair_key, in_read->b)) { -- fprintf(stderr, "[markdup] error: unable to assign pair hash key.\n"); -- return 1; -+ if (param->mode) { -+ if (make_pair_key_sequence(&pair_key, in_read->b)) { -+ fprintf(stderr, "[markdup] error: unable to assign pair hash key.\n"); -+ goto 
fail; -+ } -+ } else { -+ if (make_pair_key_template(&pair_key, in_read->b)) { -+ fprintf(stderr, "[markdup] error: unable to assign pair hash key.\n"); -+ goto fail; -+ } - } - - make_single_key(&single_key, in_read->b); -@@ -583,40 +1289,32 @@ - if (ret > 0) { // new - // add to single duplicate hash - bp = &kh_val(single_hash, k); -- bp->p = in_read->b; -+ bp->p = in_read; - in_read->single_key = single_key; - } else if (ret == 0) { // exists - // look at singles only for duplication marking - bp = &kh_val(single_hash, k); - -- if (!(bp->p->core.flag & BAM_FPAIRED) || (bp->p->core.flag & BAM_FMUNMAP)) { -- bam1_t *dup = bp->p; -+ if (!(bp->p->b->core.flag & BAM_FPAIRED) || (bp->p->b->core.flag & BAM_FMUNMAP)) { -+ // singleton will always be marked duplicate even if -+ // scores more than one read of the pair -+ bam1_t *dup = bp->p->b; -+ -+ in_read->duplicate = bp->p; -+ bp->p = in_read; - -- // singleton will always be marked duplicate even if -- // scores more than one read of the pair -+ if (mark_duplicates(param, dup_hash, bp->p->b, dup, &single_optical, &opt_warnings)) -+ goto fail; - -- bp->p = in_read->b; -- dup->core.flag |= BAM_FDUP; - single_dup++; - -- if (tag) { -- if (bam_aux_append(dup, "do", 'Z', strlen(bam_get_qname(bp->p)) + 1, (uint8_t*)bam_get_qname(bp->p))) { -- fprintf(stderr, "[markdup] error: unable to append 'do' tag.\n"); -- return 1; -- } -- } -+ if (duplicate_chain_check(param, dup_hash, bp->p, &opt_warnings, &single_optical, &optical)) -+ goto fail; - -- if (supp) { -- if (bam_aux_get(dup, "SA") || (dup->core.flag & BAM_FMUNMAP)) { -- if (add_duplicate(dup_hash, dup)) { -- return 1; -- } -- } -- } - } - } else { - fprintf(stderr, "[markdup] error: single hashing failure.\n"); -- return 1; -+ goto fail; - } - - // now do the pair -@@ -625,33 +1323,44 @@ - if (ret > 0) { // new - // add to the pair hash - bp = &kh_val(pair_hash, k); -- bp->p = in_read->b; -+ bp->p = in_read; - in_read->pair_key = pair_key; - } else if (ret == 0) { - 
int64_t old_score, new_score, tie_add = 0; - bam1_t *dup; -+ int check_chain = 0; - - bp = &kh_val(pair_hash, k); - -- if ((mate_tmp = get_mate_score(bp->p)) == -1) { -- fprintf(stderr, "[markdup] error: no ms score tag. Please run samtools fixmate on file first.\n"); -- return 1; -+ if ((bp->p->b->core.flag & BAM_FQCFAIL) != (in_read->b->core.flag & BAM_FQCFAIL)) { -+ if (bp->p->b->core.flag & BAM_FQCFAIL) { -+ old_score = 0; -+ new_score = 1; -+ } else { -+ old_score = 1; -+ new_score = 0; -+ } - } else { -- old_score = calc_score(bp->p) + mate_tmp; -- } -+ if ((mate_tmp = get_mate_score(bp->p->b)) == -1) { -+ fprintf(stderr, "[markdup] error: no ms score tag. Please run samtools fixmate on file first.\n"); -+ goto fail; -+ } else { -+ old_score = calc_score(bp->p->b) + mate_tmp; -+ } - -- if ((mate_tmp = get_mate_score(in_read->b)) == -1) { -- fprintf(stderr, "[markdup] error: no ms score tag. Please run samtools fixmate on file first.\n"); -- return 1; -- } else { -- new_score = calc_score(in_read->b) + mate_tmp; -+ if ((mate_tmp = get_mate_score(in_read->b)) == -1) { -+ fprintf(stderr, "[markdup] error: no ms score tag. 
Please run samtools fixmate on file first.\n"); -+ goto fail; -+ } else { -+ new_score = calc_score(in_read->b) + mate_tmp; -+ } - } - - // choose the highest score as the original - // and add it to the pair hash, mark the other as duplicate - - if (new_score == old_score) { -- if (strcmp(bam_get_qname(in_read->b), bam_get_qname(bp->p)) < 0) { -+ if (strcmp(bam_get_qname(in_read->b), bam_get_qname(bp->p->b)) < 0) { - tie_add = 1; - } else { - tie_add = -1; -@@ -659,39 +1368,40 @@ - } - - if (new_score + tie_add > old_score) { // swap reads -- dup = bp->p; -- bp->p = in_read->b; -+ dup = bp->p->b; -+ in_read->duplicate = bp->p; -+ bp->p = in_read; - } else { -+ if (bp->p->duplicate) { -+ in_read->duplicate = bp->p->duplicate; -+ check_chain = 1; -+ } -+ -+ bp->p->duplicate = in_read; - dup = in_read->b; - } - -- dup->core.flag |= BAM_FDUP; -- -- if (tag) { -- if (bam_aux_append(dup, "do", 'Z', strlen(bam_get_qname(bp->p)) + 1, (uint8_t*)bam_get_qname(bp->p))) { -- fprintf(stderr, "[markdup] error: unable to append 'do' tag.\n"); -- return 1; -- } -+ if (mark_duplicates(param, dup_hash, bp->p->b, dup, &optical, &opt_warnings)) -+ goto fail; - -+ if (check_chain) { -+ if (duplicate_chain_check(param, dup_hash, bp->p->duplicate, &opt_warnings, &single_optical, &optical)) -+ goto fail; - } - -- if (supp) { -- if (bam_aux_get(dup, "SA") || (dup->core.flag & BAM_FMUNMAP)) { -- if (add_duplicate(dup_hash, dup)) { -- return 1; -- } -- } -- } -+ if (duplicate_chain_check(param, dup_hash, bp->p, &opt_warnings, &single_optical, &optical)) -+ goto fail; - - duplicate++; - } else { - fprintf(stderr, "[markdup] error: pair hashing failure.\n"); -- return 1; -+ goto fail; - } - } else { // do the single (or effectively single) reads - int ret; - key_data_t single_key; - in_hash_t *bp; -+ int check_chain = 0; - - make_single_key(&single_key, in_read->b); - -@@ -702,68 +1412,76 @@ - - if (ret > 0) { // new - bp = &kh_val(single_hash, k); -- bp->p = in_read->b; -+ bp->p = in_read; - 
in_read->single_key = single_key; - } else if (ret == 0) { // exists - bp = &kh_val(single_hash, k); - -- if ((bp->p->core.flag & BAM_FPAIRED) && !(bp->p->core.flag & BAM_FMUNMAP)) { -+ if ((bp->p->b->core.flag & BAM_FPAIRED) && !(bp->p->b->core.flag & BAM_FMUNMAP)) { - // if matched against one of a pair just mark as duplicate - -- if (tag) { -- if (bam_aux_append(in_read->b, "do", 'Z', strlen(bam_get_qname(bp->p)) + 1, (uint8_t*)bam_get_qname(bp->p))) { -- fprintf(stderr, "[markdup] error: unable to append 'do' tag.\n"); -- return 1; -- } -+ if (bp->p->duplicate) { -+ in_read->duplicate = bp->p->duplicate; -+ check_chain = 1; - } - -- if (supp) { -- if (bam_aux_get(in_read->b, "SA") || (in_read->b->core.flag & BAM_FMUNMAP)) { -- if (add_duplicate(dup_hash, in_read->b)) { -- return 1; -- } -- } -+ bp->p->duplicate = in_read; -+ -+ if (mark_duplicates(param, dup_hash, bp->p->b, in_read->b, &single_optical, &opt_warnings)) -+ goto fail; -+ -+ if (check_chain) { -+ // check the new duplicate entry in the chain -+ if (duplicate_chain_check(param, dup_hash, bp->p->duplicate, &opt_warnings, &single_optical, &optical)) -+ goto fail; - } - -- in_read->b->core.flag |= BAM_FDUP; -+ // check against the new original -+ if (duplicate_chain_check(param, dup_hash, bp->p, &opt_warnings, &single_optical, &optical)) -+ goto fail; -+ - } else { - int64_t old_score, new_score; - bam1_t *dup; - -- old_score = calc_score(bp->p); -+ old_score = calc_score(bp->p->b); - new_score = calc_score(in_read->b); - - // choose the highest score as the original, add it - // to the single hash and mark the other as duplicate - if (new_score > old_score) { // swap reads -- dup = bp->p; -- bp->p = in_read->b; -+ dup = bp->p->b; -+ in_read->duplicate = bp->p; -+ bp->p = in_read; - } else { -+ if (bp->p->duplicate) { -+ in_read->duplicate = bp->p->duplicate; -+ check_chain = 1; -+ } -+ -+ bp->p->duplicate = in_read; - dup = in_read->b; - } - -- dup->core.flag |= BAM_FDUP; -+ if (mark_duplicates(param, 
dup_hash, bp->p->b, dup, &single_optical, &opt_warnings)) -+ goto fail; - -- if (tag) { -- if (bam_aux_append(dup, "do", 'Z', strlen(bam_get_qname(bp->p)) + 1, (uint8_t*)bam_get_qname(bp->p))) { -- fprintf(stderr, "[markdup] error: unable to append 'do' tag.\n"); -- return 1; -- } -+ -+ if (check_chain) { -+ if (duplicate_chain_check(param, dup_hash, bp->p->duplicate, &opt_warnings, &single_optical, &optical)) -+ goto fail; - } - -- if (supp) { -- if (bam_aux_get(dup, "SA") || (dup->core.flag & BAM_FMUNMAP)) { -- if (add_duplicate(dup_hash, dup)) { -- return 1; -- } -- } -+ if (duplicate_chain_check(param, dup_hash, bp->p, &opt_warnings, &single_optical, &optical)) -+ goto fail; -+ -+ - } -- } - - single_dup++; - } else { - fprintf(stderr, "[markdup] error: single hashing failure.\n"); -- return 1; -+ goto fail; - } - } - } else { -@@ -778,20 +1496,20 @@ - - /* keep a moving window of reads based on coordinates and max read length. Any unaligned reads - should just be written as they cannot be matched as duplicates. 
*/ -- if (in_read->pos + max_length > prev_coord && in_read->b->core.tid == prev_tid && (prev_tid != -1 || prev_coord != -1)) { -+ if (in_read->pos + param->max_length > prev_coord && in_read->b->core.tid == prev_tid && (prev_tid != -1 || prev_coord != -1)) { - break; - } - -- if (!remove_dups || !(in_read->b->core.flag & BAM_FDUP)) { -- if (supp) { -+ if (!param->remove_dups || !(in_read->b->core.flag & BAM_FDUP)) { -+ if (param->supp) { - if (tmp_file_write(&temp, in_read->b)) { - fprintf(stderr, "[markdup] error: writing temp output failed.\n"); -- return 1; -+ goto fail; - } - } else { -- if (sam_write1(out, header, in_read->b) < 0) { -+ if (sam_write1(param->out, header, in_read->b) < 0) { - fprintf(stderr, "[markdup] error: writing output failed.\n"); -- return 1; -+ goto fail; - } - } - -@@ -816,16 +1534,20 @@ - - // set the next one up for reading - in_read = kl_pushp(read_queue, read_buffer); -+ if (!in_read) { -+ fprintf(stderr, "[markdup] out of memory\n"); -+ goto fail; -+ } - - if ((in_read->b = bam_init1()) == NULL) { - fprintf(stderr, "[markdup] error: unable to allocate memory for alignment.\n"); -- return 1; -+ goto fail; - } - } - - if (ret < -1) { - fprintf(stderr, "[markdup] error: truncated input file.\n"); -- return 1; -+ goto fail; - } - - // write out the end of the list -@@ -834,16 +1556,16 @@ - in_read = &kl_val(rq); - - if (bam_get_qname(in_read->b)) { // last entry will be blank -- if (!remove_dups || !(in_read->b->core.flag & BAM_FDUP)) { -- if (supp) { -+ if (!param->remove_dups || !(in_read->b->core.flag & BAM_FDUP)) { -+ if (param->supp) { - if (tmp_file_write(&temp, in_read->b)) { - fprintf(stderr, "[markdup] error: writing temp output failed.\n"); -- return 1; -+ goto fail; - } - } else { -- if (sam_write1(out, header, in_read->b) < 0) { -+ if (sam_write1(param->out, header, in_read->b) < 0) { - fprintf(stderr, "[markdup] error: writing output failed.\n"); -- return 1; -+ goto fail; - } - } - -@@ -856,71 +1578,155 @@ - rq = 
kl_begin(read_buffer); - } - -- if (supp) { -+ if (param->supp) { - bam1_t *b; - - if (tmp_file_end_write(&temp)) { - fprintf(stderr, "[markdup] error: unable to end tmp writing.\n"); -- return 1; -+ goto fail; - } - - // read data from temp file and mark duplicate supplementary alignments - -- if (tmp_file_begin_read(&temp, NULL)) { -- return 1; -+ if (tmp_file_begin_read(&temp)) { -+ goto fail; - } - - b = bam_init1(); - - while ((ret = tmp_file_read(&temp, b)) > 0) { - -- if ((b->core.flag & BAM_FSUPPLEMENTARY) || (b->core.flag & BAM_FUNMAP)) { -+ if ((b->core.flag & BAM_FSUPPLEMENTARY) || (b->core.flag & BAM_FUNMAP) || (b->core.flag & BAM_FSECONDARY)) { -+ - k = kh_get(duplicates, dup_hash, bam_get_qname(b)); - - if (k != kh_end(dup_hash)) { -+ - b->core.flag |= BAM_FDUP; -+ np_duplicate++; -+ -+ if (param->tag && kh_val(dup_hash, k).name) { -+ if (bam_aux_append(b, "do", 'Z', strlen(kh_val(dup_hash, k).name) + 1, (uint8_t*)kh_val(dup_hash, k).name)) { -+ fprintf(stderr, "[markdup] error: unable to append supplementary 'do' tag.\n"); -+ goto fail; -+ } -+ } -+ -+ if (param->opt_dist) { -+ if (kh_val(dup_hash, k).type) { -+ bam_aux_append(b, "dt", 'Z', 3, (const uint8_t *)"SQ"); -+ np_opt_duplicate++; -+ } else { -+ bam_aux_append(b, "dt", 'Z', 3, (const uint8_t *)"LB"); -+ } -+ } - } - } - -- if (!remove_dups || !(b->core.flag & BAM_FDUP)) { -- if (sam_write1(out, header, b) < 0) { -+ if (!param->remove_dups || !(b->core.flag & BAM_FDUP)) { -+ if (sam_write1(param->out, header, b) < 0) { - fprintf(stderr, "[markdup] error: writing final output failed.\n"); -- return 1; -+ goto fail; - } - } - } - - if (ret == -1) { - fprintf(stderr, "[markdup] error: failed to read tmp file.\n"); -- return 1; -+ goto fail; - } - - for (k = kh_begin(dup_hash); k != kh_end(dup_hash); ++k) { - if (kh_exist(dup_hash, k)) { -+ free(kh_val(dup_hash, k).name); - free((char *)kh_key(dup_hash, k)); -+ kh_key(dup_hash, k) = NULL; - } - } - -- tmp_file_destroy(&temp, b, 0); -- 
kh_destroy(duplicates, dup_hash); -+ tmp_file_destroy(&temp); - bam_destroy1(b); - } - -- if (do_stats) { -- fprintf(stderr, "READ %d WRITTEN %d \n" -- "EXCLUDED %d EXAMINED %d\n" -- "PAIRED %d SINGLE %d\n" -- "DULPICATE PAIR %d DUPLICATE SINGLE %d\n" -- "DUPLICATE TOTAL %d\n", reading, writing, excluded, examined, pair, single, -- duplicate, single_dup, single_dup + duplicate); -+ if (opt_warnings) { -+ fprintf(stderr, "[markdup] warning: number of failed attempts to get coordinates from read names = %ld\n", -+ opt_warnings); -+ } -+ -+ if (param->do_stats) { -+ FILE *fp; -+ int file_open = 0; -+ unsigned long els; -+ -+ if (param->stats_file) { -+ if (NULL == (fp = fopen(param->stats_file, "w"))) { -+ fprintf(stderr, "[markdup] warning: cannot write stats to %s.\n", param->stats_file); -+ fp = stderr; -+ } else { -+ file_open = 1; -+ } -+ } else { -+ fp = stderr; -+ } -+ -+ els = estimate_library_size(pair, duplicate - optical); -+ -+ fprintf(fp, -+ "COMMAND: %s\n" -+ "READ: %ld\n" -+ "WRITTEN: %ld\n" -+ "EXCLUDED: %ld\n" -+ "EXAMINED: %ld\n" -+ "PAIRED: %ld\n" -+ "SINGLE: %ld\n" -+ "DUPLICATE PAIR: %ld\n" -+ "DUPLICATE SINGLE: %ld\n" -+ "DUPLICATE PAIR OPTICAL: %ld\n" -+ "DUPLICATE SINGLE OPTICAL: %ld\n" -+ "DUPLICATE NON PRIMARY: %ld\n" -+ "DUPLICATE NON PRIMARY OPTICAL: %ld\n" -+ "DUPLICATE PRIMARY TOTAL: %ld\n" -+ "DUPLICATE TOTAL: %ld\n" -+ "ESTIMATED_LIBRARY_SIZE: %ld\n", param->arg_list, reading, writing, excluded, examined, pair, single, -+ duplicate, single_dup, optical, single_optical, np_duplicate, np_opt_duplicate, -+ single_dup + duplicate, single_dup + duplicate + np_duplicate, els); -+ -+ if (file_open) { -+ fclose(fp); -+ } -+ } -+ -+ if (param->write_index) { -+ if (sam_idx_save(param->out) < 0) { -+ print_error_errno("markdup", "writing index failed"); -+ goto fail; -+ } - } - - kh_destroy(reads, pair_hash); - kh_destroy(reads, single_hash); - kl_destroy(read_queue, read_buffer); -- bam_hdr_destroy(header); -+ kh_destroy(duplicates, dup_hash); 
-+ sam_hdr_destroy(header); - - return 0; -+ -+ fail: -+ for (rq = kl_begin(read_buffer); rq != kl_end(read_buffer); rq = kl_next(rq)) -+ bam_destroy1(kl_val(rq).b); -+ kl_destroy(read_queue, read_buffer); -+ -+ for (k = kh_begin(dup_hash); k != kh_end(dup_hash); ++k) { -+ if (kh_exist(dup_hash, k)) { -+ free((char *)kh_key(dup_hash, k)); -+ } -+ } -+ kh_destroy(duplicates, dup_hash); -+ -+ kh_destroy(reads, pair_hash); -+ kh_destroy(reads, single_hash); -+ sam_hdr_destroy(header); -+ return 1; - } - - -@@ -928,15 +1734,23 @@ - fprintf(stderr, "\n"); - fprintf(stderr, "Usage: samtools markdup \n\n"); - fprintf(stderr, "Option: \n"); -- fprintf(stderr, " -r Remove duplicate reads\n"); -- fprintf(stderr, " -l INT Max read length (default 300 bases)\n"); -- fprintf(stderr, " -S Mark supplemenary alignments of duplicates as duplicates (slower).\n"); -- fprintf(stderr, " -s Report stats.\n"); -- fprintf(stderr, " -T PREFIX Write temporary files to PREFIX.samtools.nnnn.nnnn.tmp.\n"); -- fprintf(stderr, " -t Mark primary duplicates with the name of the original in a \'do\' tag." -+ fprintf(stderr, " -r Remove duplicate reads\n"); -+ fprintf(stderr, " -l INT Max read length (default 300 bases)\n"); -+ fprintf(stderr, " -S Mark supplementary alignments of duplicates as duplicates (slower).\n"); -+ fprintf(stderr, " -s Report stats.\n"); -+ fprintf(stderr, " -f NAME Write stats to named file. 
Implies -s.\n"); -+ fprintf(stderr, " -T PREFIX Write temporary files to PREFIX.samtools.nnnn.nnnn.tmp.\n"); -+ fprintf(stderr, " -d INT Optical distance (if set, marks with dt tag)\n"); -+ fprintf(stderr, " -c Clear previous duplicate settings and tags.\n"); -+ fprintf(stderr, " -m --mode TYPE Duplicate decision method for paired reads.\n" -+ " TYPE = t measure positions based on template start/end (default).\n" -+ " s measure positions based on sequence start.\n"); -+ fprintf(stderr, " --include-fails Include quality check failed reads.\n"); -+ fprintf(stderr, " --no-PG Do not add a PG line\n"); -+ fprintf(stderr, " -t Mark primary duplicates with the name of the original in a \'do\' tag." - " Mainly for information and debugging.\n"); - -- sam_global_opt_help(stderr, "-.O..@"); -+ sam_global_opt_help(stderr, "-.O..@.."); - - fprintf(stderr, "\nThe input file must be coordinate sorted and must have gone" - " through fixmates with the mate scoring option on.\n"); -@@ -946,29 +1760,47 @@ - - - int bam_markdup(int argc, char **argv) { -- int c, ret, remove_dups = 0, report_stats = 0, include_supplementary = 0, tag_dup = 0; -- int32_t max_length = 300; -- samFile *in = NULL, *out = NULL; -+ int c, ret; - char wmode[3] = {'w', 'b', 0}; - sam_global_args ga = SAM_GLOBAL_ARGS_INIT; - htsThreadPool p = {NULL, 0}; - kstring_t tmpprefix = {0, 0, NULL}; - struct stat st; - unsigned int t; -+ md_param_t param = {NULL, NULL, NULL, 0, 300, 0, 0, 0, 0, 0, 0, 0, 0, 0, NULL, NULL, NULL}; - - static const struct option lopts[] = { - SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0, '@'), -+ {"include-fails", no_argument, NULL, 1001}, -+ {"no-PG", no_argument, NULL, 1002}, -+ {"mode", required_argument, NULL, 'm'}, - {NULL, 0, NULL, 0} - }; - -- while ((c = getopt_long(argc, argv, "rsl:StT:O:@:", lopts, NULL)) >= 0) { -+ while ((c = getopt_long(argc, argv, "rsl:StT:O:@:f:d:ncm:", lopts, NULL)) >= 0) { - switch (c) { -- case 'r': remove_dups = 1; break; -- case 'l': max_length = 
atoi(optarg); break; -- case 's': report_stats = 1; break; -+ case 'r': param.remove_dups = 1; break; -+ case 'l': param.max_length = atoi(optarg); break; -+ case 's': param.do_stats = 1; break; - case 'T': kputs(optarg, &tmpprefix); break; -- case 'S': include_supplementary = 1; break; -- case 't': tag_dup = 1; break; -+ case 'S': param.supp = 1; break; -+ case 't': param.tag = 1; break; -+ case 'f': param.stats_file = optarg; param.do_stats = 1; break; -+ case 'd': param.opt_dist = atoi(optarg); break; -+ case 'c': param.clear = 1; break; -+ case 'm': -+ if (strcmp(optarg, "t") == 0) { -+ param.mode = MD_MODE_TEMPLATE; -+ } else if (strcmp(optarg, "s") == 0) { -+ param.mode = MD_MODE_SEQUENCE; -+ } else { -+ fprintf(stderr, "[markdup] error: unknown mode '%s'.\n", optarg); -+ return markdup_usage(); -+ } -+ -+ break; -+ case 1001: param.include_fails = 1; break; -+ case 1002: param.no_pg = 1; break; - default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; - /* else fall-through */ - case '?': return markdup_usage(); -@@ -978,17 +1810,20 @@ - if (optind + 2 > argc) - return markdup_usage(); - -- in = sam_open_format(argv[optind], "r", &ga.in); -+ if (param.opt_dist < 0) param.opt_dist = 0; -+ if (param.max_length < 0) param.max_length = 300; -+ -+ param.in = sam_open_format(argv[optind], "r", &ga.in); - -- if (!in) { -+ if (!param.in) { - print_error_errno("markdup", "failed to open \"%s\" for input", argv[optind]); - return 1; - } - - sam_open_mode(wmode + 1, argv[optind + 1], NULL); -- out = sam_open_format(argv[optind + 1], wmode, &ga.out); -+ param.out = sam_open_format(argv[optind + 1], wmode, &ga.out); - -- if (!out) { -+ if (!param.out) { - print_error_errno("markdup", "failed to open \"%s\" for output", argv[optind + 1]); - return 1; - } -@@ -999,8 +1834,8 @@ - return 1; - } - -- hts_set_opt(in, HTS_OPT_THREAD_POOL, &p); -- hts_set_opt(out, HTS_OPT_THREAD_POOL, &p); -+ hts_set_opt(param.in, HTS_OPT_THREAD_POOL, &p); -+ hts_set_opt(param.out, 
HTS_OPT_THREAD_POOL, &p); - } - - // actual stuff happens here -@@ -1020,18 +1855,24 @@ - - t = ((unsigned) time(NULL)) ^ ((unsigned) clock()); - ksprintf(&tmpprefix, "samtools.%d.%u.tmp", (int) getpid(), t % 10000); -+ param.prefix = tmpprefix.s; -+ -+ param.arg_list = stringify_argv(argc + 1, argv - 1); -+ param.write_index = ga.write_index; -+ param.out_fn = argv[optind + 1]; - -- ret = bam_mark_duplicates(in, out, tmpprefix.s, remove_dups, max_length, report_stats, include_supplementary, tag_dup); -+ ret = bam_mark_duplicates(¶m); - -- sam_close(in); -+ sam_close(param.in); - -- if (sam_close(out) < 0) { -+ if (sam_close(param.out) < 0) { - fprintf(stderr, "[markdup] error closing output file\n"); - ret = 1; - } - - if (p.pool) hts_tpool_destroy(p.pool); - -+ free(param.arg_list); - free(tmpprefix.s); - sam_global_args_free(&ga); - ---- python-pysam.orig/samtools/bam_markdup.c.pysam.c -+++ python-pysam/samtools/bam_markdup.c.pysam.c -@@ -3,7 +3,7 @@ - /* bam_markdup.c -- Mark duplicates from a coord sorted file that has gone - through fixmates with the mate scoring option on. - -- Copyright (C) 2017-18 Genome Research Ltd. -+ Copyright (C) 2017-2019 Genome Research Ltd. - - Author: Andrew Whitwham - -@@ -24,6 +24,9 @@ - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER - DEALINGS IN THE SOFTWARE -+ -+Estimate library size derived from Picard DuplicationMetrics.java -+Copyright (c) 2009,2018 The Broad Institute. MIT license. 
- */ - - #include -@@ -35,6 +38,7 @@ - #include - #include - #include -+#include - #include "htslib/thread_pool.h" - #include "htslib/sam.h" - #include "sam_opts.h" -@@ -44,26 +48,53 @@ - #include "htslib/kstring.h" - #include "tmp_file.h" - -+ -+typedef struct { -+ samFile *in; -+ samFile *out; -+ char *prefix; -+ int remove_dups; -+ int32_t max_length; -+ int do_stats; -+ int supp; -+ int tag; -+ int opt_dist; -+ int no_pg; -+ int clear; -+ int mode; -+ int write_index; -+ int include_fails; -+ char *stats_file; -+ char *arg_list; -+ char *out_fn; -+} md_param_t; -+ - typedef struct { -- int32_t single; -+ hts_pos_t this_coord; -+ hts_pos_t other_coord; - int32_t this_ref; -- int32_t this_coord; - int32_t other_ref; -- int32_t other_coord; -- int32_t leftmost; -- int32_t orientation; -+ int8_t single; -+ int8_t leftmost; -+ int8_t orientation; - } key_data_t; - -+typedef struct read_queue_s { -+ key_data_t pair_key; -+ key_data_t single_key; -+ bam1_t *b; -+ struct read_queue_s *duplicate; -+ hts_pos_t pos; -+} read_queue_t; -+ - typedef struct { -- bam1_t *p; -+ read_queue_t *p; - } in_hash_t; - - typedef struct { -- bam1_t *b; -- int32_t pos; -- key_data_t pair_key; -- key_data_t single_key; --} read_queue_t; -+ char *name; -+ char type; -+} dup_map_t; - - - -@@ -74,22 +105,22 @@ - khint_t hash; - - if (key.single) { -- unsigned char sig[12]; -+ unsigned char sig[13]; - - memcpy(sig + i, &key.this_ref, 4); i += 4; -- memcpy(sig + i, &key.this_coord, 4); i += 4; -- memcpy(sig + i, &key.orientation, 4); i += 4; -+ memcpy(sig + i, &key.this_coord, 8); i += 8; -+ memcpy(sig + i, &key.orientation, 1); i += 1; - - hash = do_hash(sig, i); - } else { -- unsigned char sig[24]; -+ unsigned char sig[26]; - - memcpy(sig + i, &key.this_ref, 4); i += 4; -- memcpy(sig + i, &key.this_coord, 4); i += 4; -+ memcpy(sig + i, &key.this_coord, 8); i += 8; - memcpy(sig + i, &key.other_ref, 4); i += 4; -- memcpy(sig + i, &key.other_coord, 4); i += 4; -- memcpy(sig + i, &key.leftmost, 
4); i += 4; -- memcpy(sig + i, &key.orientation, 4); i += 4; -+ memcpy(sig + i, &key.other_coord, 8); i += 8; -+ memcpy(sig + i, &key.leftmost, 1); i += 1; -+ memcpy(sig + i, &key.orientation, 1); i += 1; - - hash = do_hash(sig, i); - } -@@ -124,21 +155,35 @@ - - - #define __free_queue_element(p) -+ -+// Orientations (prime numbers to feed to hashing algorithm) - #define O_FF 2 - #define O_RR 3 - #define O_FR 5 - #define O_RF 7 - -+// Left or rightmost -+#define R_LE 11 -+#define R_RI 13 -+ -+#define BMD_WARNING_MAX 10 -+ -+#define MD_MIN_QUALITY 15 -+ -+// Duplicate finding mode -+#define MD_MODE_TEMPLATE 0 -+#define MD_MODE_SEQUENCE 1 -+ - KHASH_INIT(reads, key_data_t, in_hash_t, 1, hash_key, key_equal) // read map hash - KLIST_INIT(read_queue, read_queue_t, __free_queue_element) // the reads buffer --KHASH_MAP_INIT_STR(duplicates, int) // map of duplicates for supplementary dup id -+KHASH_MAP_INIT_STR(duplicates, dup_map_t) // map of duplicates for supplementary dup id - - - /* Calculate the mate's unclipped start based on position and cigar string from MC tag. */ - --static int32_t unclipped_other_start(int32_t op, char *cigar) { -+static hts_pos_t unclipped_other_start(hts_pos_t op, char *cigar) { - char *c = cigar; -- int32_t clipped = 0; -+ int64_t clipped = 0; - - while (*c && *c != '*') { - long num = 0; -@@ -164,9 +209,9 @@ - - /* Calculate the current read's start based on the stored cigar string. 
*/ - --static int32_t unclipped_start(bam1_t *b) { -+static hts_pos_t unclipped_start(bam1_t *b) { - uint32_t *cigar = bam_get_cigar(b); -- int32_t clipped = 0; -+ int64_t clipped = 0; - uint32_t i; - - for (i = 0; i < b->core.n_cigar; i++) { -@@ -185,9 +230,9 @@ - - /* Calculate the mate's unclipped end based on start position and cigar string from MC tag.*/ - --static int32_t unclipped_other_end(int32_t op, char *cigar) { -+static hts_pos_t unclipped_other_end(int64_t op, char *cigar) { - char *c = cigar; -- int32_t refpos = 0; -+ int64_t refpos = 0; - int skip = 1; - - while (*c && *c != '*') { -@@ -226,9 +271,9 @@ - - /* Calculate the current read's end based on the stored cigar string. */ - --static int32_t unclipped_end(bam1_t *b) { -+static hts_pos_t unclipped_end(bam1_t *b) { - uint32_t *cigar = bam_get_cigar(b); -- int32_t end_pos, clipped = 0; -+ hts_pos_t end_pos, clipped = 0; - int32_t i; - - end_pos = bam_endpos(b); -@@ -295,7 +340,7 @@ - int i; - - for (i = 0; i < b->core.l_qseq; i++) { -- if (qual[i] >= 15) score += qual[i]; -+ if (qual[i] >= MD_MIN_QUALITY) score += qual[i]; - } - - return score; -@@ -307,10 +352,10 @@ - the reference id, orientation and whether the current - read is leftmost of the pair. */ - --static int make_pair_key(key_data_t *key, bam1_t *bam) { -- int32_t this_ref, this_coord, this_end; -- int32_t other_ref, other_coord, other_end; -- int32_t orientation, leftmost; -+static int make_pair_key_template(key_data_t *key, bam1_t *bam) { -+ hts_pos_t this_coord, other_coord, this_end, other_end; -+ int32_t this_ref, other_ref; -+ int8_t orientation, leftmost; - uint8_t *data; - char *cig; - -@@ -321,7 +366,11 @@ - this_end = unclipped_end(bam); - - if ((data = bam_aux_get(bam, "MC"))) { -- cig = bam_aux2Z(data); -+ if (!(cig = bam_aux2Z(data))) { -+ fprintf(samtools_stderr, "[markdup] error: MC tag wrong type. 
Please use the MC tag provided by samtools fixmate.\n"); -+ return 1; -+ } -+ - other_end = unclipped_other_end(bam->core.mpos, cig); - other_coord = unclipped_other_start(bam->core.mpos, cig); - } else { -@@ -404,9 +453,9 @@ - } - - if (!leftmost) -- leftmost = 13; -+ leftmost = R_RI; - else -- leftmost = 11; -+ leftmost = R_LE; - - key->single = 0; - key->this_ref = this_ref; -@@ -420,13 +469,140 @@ - } - - -+static int make_pair_key_sequence(key_data_t *key, bam1_t *bam) { -+ hts_pos_t this_coord, this_end, other_coord, other_end, leftmost; -+ int32_t this_ref, other_ref; -+ int8_t orientation, left_read; -+ uint8_t *data; -+ char *cig; -+ -+ this_ref = bam->core.tid + 1; // avoid a 0 being put into the hash -+ other_ref = bam->core.mtid + 1; -+ -+ this_coord = unclipped_start(bam); -+ this_end = unclipped_end(bam); -+ -+ if ((data = bam_aux_get(bam, "MC"))) { -+ if (!(cig = bam_aux2Z(data))) { -+ fprintf(samtools_stderr, "[markdup] error: MC tag wrong type. Please use the MC tag provided by samtools fixmate.\n"); -+ return 1; -+ } -+ -+ other_end = unclipped_other_end(bam->core.mpos, cig); -+ other_coord = unclipped_other_start(bam->core.mpos, cig); -+ } else { -+ fprintf(samtools_stderr, "[markdup] error: no MC tag. 
Please run samtools fixmate on file first.\n"); -+ return 1; -+ } -+ -+ // work out orientations -+ if (this_ref != other_ref) { -+ leftmost = this_ref - other_ref; -+ } else { -+ if (bam_is_rev(bam) == bam_is_mrev(bam)) { -+ if (!bam_is_rev(bam)) { -+ leftmost = this_coord - other_coord; -+ } else { -+ leftmost = this_end - other_end; -+ } -+ } else { -+ if (bam_is_rev(bam)) { -+ leftmost = this_end - other_coord; -+ } else { -+ leftmost = this_coord - other_end; -+ } -+ } -+ } -+ -+ if (leftmost < 0) { -+ leftmost = 1; -+ } else if (leftmost > 0) { -+ leftmost = 0; -+ } else { -+ // tie breaks -+ -+ if (bam->core.pos == bam->core.mpos) { -+ if (bam->core.flag & BAM_FREAD1) { -+ leftmost = 1; -+ } else { -+ leftmost = 0; -+ } -+ } else if (bam->core.pos < bam->core.mpos) { -+ leftmost = 1; -+ } else { -+ leftmost = 0; -+ } -+ } -+ -+ // pair orientation -+ if (leftmost) { -+ if (bam_is_rev(bam) == bam_is_mrev(bam)) { -+ -+ if (!bam_is_rev(bam)) { -+ orientation = O_FF; -+ } else { -+ orientation = O_RR; -+ } -+ } else { -+ if (!bam_is_rev(bam)) { -+ orientation = O_FR; -+ } else { -+ orientation = O_RF; -+ } -+ } -+ } else { -+ if (bam_is_rev(bam) == bam_is_mrev(bam)) { -+ -+ if (!bam_is_rev(bam)) { -+ orientation = O_RR; -+ } else { -+ orientation = O_FF; -+ } -+ } else { -+ if (!bam_is_rev(bam)) { -+ orientation = O_RF; -+ } else { -+ orientation = O_FR; -+ } -+ } -+ } -+ -+ if (!leftmost) -+ left_read = R_RI; -+ else -+ left_read = R_LE; -+ -+ if (!bam_is_rev(bam)) { -+ this_coord = unclipped_start(bam); -+ } else { -+ this_coord = unclipped_end(bam); -+ } -+ -+ if (!bam_is_mrev(bam)) { -+ other_coord = unclipped_other_start(bam->core.mpos, cig); -+ } else { -+ other_coord = unclipped_other_end(bam->core.mpos, cig); -+ } -+ -+ key->single = 0; -+ key->this_ref = this_ref; -+ key->this_coord = this_coord; -+ key->other_ref = other_ref; -+ key->other_coord = other_coord; -+ key->leftmost = left_read; -+ key->orientation = orientation; -+ -+ return 0; -+} -+ - /* 
Create a signature hash of single read (or read with an unmatched pair). - Uses unclipped start (or end depending on orientation), reference id, - and orientation. */ - - static void make_single_key(key_data_t *key, bam1_t *bam) { -- int32_t this_ref, this_coord; -- int32_t orientation; -+ hts_pos_t this_coord; -+ int32_t this_ref; -+ int8_t orientation; - - this_ref = bam->core.tid + 1; // avoid a 0 being put into the hash - -@@ -444,23 +620,45 @@ - key->orientation = orientation; - } - -+ - /* Add the duplicate name to a hash if it does not exist. */ - --static int add_duplicate(khash_t(duplicates) *d_hash, bam1_t *dupe) { -+static int add_duplicate(khash_t(duplicates) *d_hash, bam1_t *dupe, char *orig_name, char type) { - khiter_t d; - int ret; - - d = kh_get(duplicates, d_hash, bam_get_qname(dupe)); - - if (d == kh_end(d_hash)) { -- d = kh_put(duplicates, d_hash, strdup(bam_get_qname(dupe)), &ret); -+ char *name = strdup(bam_get_qname(dupe)); -+ if (name) { -+ d = kh_put(duplicates, d_hash, name, &ret); -+ } else { -+ ret = -1; -+ } -+ -+ if (ret >= 0) { -+ if (orig_name) { -+ if (ret == 0) { -+ // replace old name -+ free(kh_value(d_hash, d).name); -+ free(name); -+ } - -- if (ret > 0) { -- kh_value(d_hash, d) = 1; -- } else if (ret == 0) { -- kh_value(d_hash, d)++; -+ kh_value(d_hash, d).name = strdup(orig_name); -+ -+ if (kh_value(d_hash, d).name == NULL) { -+ fprintf(samtools_stderr, "[markdup] error: unable to allocate memory for duplicate original name.\n"); -+ return 1; -+ } -+ } else { -+ kh_value(d_hash, d).name = NULL; -+ } -+ -+ kh_value(d_hash, d).type = type; - } else { - fprintf(samtools_stderr, "[markdup] error: unable to store supplementary duplicates.\n"); -+ free(name); - return 1; - } - } -@@ -469,6 +667,467 @@ - } - - -+static inline int get_coordinate_positions(const char *qname, int *xpos, int *ypos) { -+ int sep = 0; -+ int pos = 0; -+ -+ while (qname[pos]) { -+ if (qname[pos] == ':') { -+ sep++; -+ -+ if (sep == 2) { -+ *xpos = pos + 1; 
-+ } else if (sep == 3) { -+ *ypos = pos + 1; -+ } else if (sep == 4) { // HiSeq style names -+ *xpos = *ypos; -+ *ypos = pos + 1; -+ } else if (sep == 5) { // Newer Illumina format -+ *xpos = pos + 1; -+ } else if (sep == 6) { -+ *ypos = pos + 1; -+ } -+ } -+ -+ pos++; -+ } -+ -+ return sep; -+} -+ -+/* Using the coordinates from the Illumina read name, see whether the duplicated read is -+ close enough (set by max_dist) to the original to be counted as optical.*/ -+ -+static int optical_duplicate(bam1_t *ori, bam1_t *dup, long max_dist, long *warnings) { -+ int ret = 0, seps; -+ char *original, *duplicate; -+ int oxpos = 0, oypos = 0, dxpos = 0, dypos = 0; -+ -+ -+ original = bam_get_qname(ori); -+ duplicate = bam_get_qname(dup); -+ -+ seps = get_coordinate_positions(original, &oxpos, &oypos); -+ -+ if (!(seps == 3 || seps == 4 || seps == 6 || seps == 7)) { -+ (*warnings)++; -+ -+ if (*warnings <= BMD_WARNING_MAX) { -+ fprintf(samtools_stderr, "[markdup] warning: cannot decipher read name %s for optical duplicate marking.\n", original); -+ } -+ -+ return ret; -+ } -+ -+ seps = get_coordinate_positions(duplicate, &dxpos, &dypos); -+ -+ if (!(seps == 3 || seps == 4 || seps == 6 || seps == 7)) { -+ -+ (*warnings)++; -+ -+ if (*warnings <= BMD_WARNING_MAX) { -+ fprintf(samtools_stderr, "[markdup] warning: cannot decipher read name %s for optical duplicate marking.\n", duplicate); -+ } -+ -+ return ret; -+ } -+ -+ if (strncmp(original, duplicate, oxpos - 1) == 0) { -+ // the initial parts match, look at the numbers -+ long ox, oy, dx, dy, xdiff, ydiff; -+ char *end; -+ -+ ox = strtol(original + oxpos, &end, 10); -+ -+ if ((original + oxpos) == end) { -+ (*warnings)++; -+ -+ if (*warnings <= BMD_WARNING_MAX) { -+ fprintf(samtools_stderr, "[markdup] warning: can not decipher X coordinate in %s .\n", original); -+ } -+ -+ return ret; -+ } -+ -+ dx = strtol(duplicate + dxpos, &end, 10); -+ -+ if ((duplicate + dxpos) == end) { -+ (*warnings)++; -+ -+ if (*warnings <= 
BMD_WARNING_MAX) { -+ fprintf(samtools_stderr, "[markdup] warning: can not decipher X coordinate in %s.\n", duplicate); -+ } -+ -+ return ret; -+ } -+ -+ if (ox > dx) { -+ xdiff = ox - dx; -+ } else { -+ xdiff = dx - ox; -+ } -+ -+ if (xdiff <= max_dist) { -+ // still might be optical -+ -+ oy = strtol(original + oypos, &end, 10); -+ -+ if ((original + oypos) == end) { -+ (*warnings)++; -+ -+ if (*warnings <= BMD_WARNING_MAX) { -+ fprintf(samtools_stderr, "[markdup] warning: can not decipher Y coordinate in %s.\n", original); -+ } -+ -+ return ret; -+ } -+ -+ dy = strtol(duplicate + dypos, &end, 10); -+ -+ if ((duplicate + dypos) == end) { -+ (*warnings)++; -+ -+ if (*warnings <= BMD_WARNING_MAX) { -+ fprintf(samtools_stderr, "[markdup] warning: can not decipher Y coordinate in %s.\n", duplicate); -+ } -+ -+ return ret; -+ } -+ -+ if (oy > dy) { -+ ydiff = oy - dy; -+ } else { -+ ydiff = dy - oy; -+ } -+ -+ if (ydiff <= max_dist) ret = 1; -+ } -+ } -+ -+ return ret; -+} -+ -+ -+static int mark_duplicates(md_param_t *param, khash_t(duplicates) *dup_hash, bam1_t *ori, bam1_t *dup, -+ long *optical, long *warn) { -+ char dup_type = 0; -+ long incoming_warnings = *warn; -+ -+ dup->core.flag |= BAM_FDUP; -+ -+ if (param->tag) { -+ if (bam_aux_append(dup, "do", 'Z', strlen(bam_get_qname(ori)) + 1, (uint8_t*)bam_get_qname(ori))) { -+ fprintf(samtools_stderr, "[markdup] error: unable to append 'do' tag.\n"); -+ return -1; -+ } -+ } -+ -+ if (param->opt_dist) { // mark optical duplicates -+ if (optical_duplicate(ori, dup, param->opt_dist, warn)) { -+ bam_aux_append(dup, "dt", 'Z', 3, (const uint8_t *)"SQ"); -+ dup_type = 'O'; -+ (*optical)++; -+ } else { -+ // not an optical duplicate -+ bam_aux_append(dup, "dt", 'Z', 3, (const uint8_t *)"LB"); -+ } -+ } -+ -+ if ((*warn == BMD_WARNING_MAX) && (incoming_warnings != *warn)) { -+ fprintf(samtools_stderr, "[markdup] warning: %ld decipher read name warnings. 
New warnings will not be reported.\n", -+ *warn); -+ } -+ -+ if (param->supp) { -+ if (bam_aux_get(dup, "SA") || (dup->core.flag & BAM_FMUNMAP) || bam_aux_get(dup, "XA")) { -+ char *original = NULL; -+ -+ if (param->tag) { -+ original = bam_get_qname(ori); -+ } -+ -+ if (add_duplicate(dup_hash, dup, original, dup_type)) -+ return -1; -+ } -+ } -+ -+ return 0; -+} -+ -+ -+static inline int optical_retag(md_param_t *param, khash_t(duplicates) *dup_hash, bam1_t *b, int paired, long *optical_single, long *optical_pair) { -+ int ret = 0; -+ uint8_t *data; -+ -+ // remove any existing dt tag -+ if ((data = bam_aux_get(b, "dt")) != NULL) { -+ bam_aux_del(b, data); -+ } -+ -+ if (bam_aux_append(b, "dt", 'Z', 3, (const uint8_t *)"SQ")) { -+ fprintf(samtools_stderr, "[markdup] error: unable to append 'dt' tag.\n"); -+ ret = -1; -+ } -+ -+ if (paired) { -+ (*optical_pair)++; -+ } else { -+ (*optical_single)++; -+ } -+ -+ if (param->supp) { -+ // Change the duplicate type -+ -+ if (bam_aux_get(b, "SA") || (b->core.flag & BAM_FMUNMAP) -+ || bam_aux_get(b, "XA")) { -+ khiter_t d; -+ -+ d = kh_get(duplicates, dup_hash, bam_get_qname(b)); -+ -+ if (d == kh_end(dup_hash)) { -+ // error, name should already be in dup hash -+ fprintf(samtools_stderr, "[markdup] error: duplicate name %s not found in hash.\n", -+ bam_get_qname(b)); -+ ret = -1; -+ } else { -+ kh_value(dup_hash, d).type = 'O'; -+ } -+ } -+ } -+ -+ return ret; -+} -+ -+ -+ -+/* -+ Where there is more than one duplicate go down the list and check for optical duplicates and change -+ do tags (where used) to point to original (non-duplicate) read. 
-+*/ -+static int duplicate_chain_check(md_param_t *param, khash_t(duplicates) *dup_hash, read_queue_t *ori, -+ long *warn, long *optical_single, long *optical_pair) { -+ int ret = 0; -+ read_queue_t *current = ori->duplicate; -+ char *ori_name = bam_get_qname(ori->b); -+ int have_original = !(ori->b->core.flag & BAM_FDUP); -+ int ori_paired = (ori->b->core.flag & BAM_FPAIRED) && !(ori->b->core.flag & BAM_FMUNMAP); -+ -+ while (current) { -+ int current_paired = (current->b->core.flag & BAM_FPAIRED) && !(current->b->core.flag & BAM_FMUNMAP); -+ -+ if (param->tag && have_original) { -+ uint8_t *data; -+ -+ // at this stage all duplicates should have a do tag -+ if ((data = bam_aux_get(current->b, "do")) != NULL) { -+ // see if we need to change the tag -+ char *old_name = bam_aux2Z(data); -+ -+ if (old_name) { -+ if (strcmp(old_name, ori_name) != 0) { -+ bam_aux_del(current->b, data); -+ -+ if (bam_aux_append(current->b, "do", 'Z', strlen(ori_name) + 1, (uint8_t*)ori_name)) { -+ fprintf(samtools_stderr, "[markdup] error: unable to append 'do' tag.\n"); -+ ret = -1; -+ break; -+ } -+ } -+ } else { -+ fprintf(samtools_stderr, "[markdup] error: 'do' tag has wrong type for read %s.\n", bam_get_qname(current->b)); -+ ret = -1; -+ break; -+ } -+ } -+ } -+ -+ if (param->opt_dist) { -+ int is_cur_opt = 0, is_ori_opt = 0; -+ uint8_t *data; -+ char *dup_type; -+ -+ if ((data = bam_aux_get(ori->b, "dt"))) { -+ if ((dup_type = bam_aux2Z(data))) { -+ if (strcmp(dup_type, "SQ") == 0) { -+ is_ori_opt = 1; -+ } -+ } -+ } -+ -+ if ((data = bam_aux_get(current->b, "dt"))) { -+ if ((dup_type = bam_aux2Z(data))) { -+ if (strcmp(dup_type, "SQ") == 0) { -+ is_cur_opt = 1; -+ } -+ } -+ } -+ -+ if (!(is_ori_opt && is_cur_opt)) { -+ // if both are already optical duplicates there is no need to check again, otherwise... 
-+ -+ if (optical_duplicate(ori->b, current->b, param->opt_dist, warn)) { -+ // find out which one is the duplicate -+ int is_cur_dup = 0; -+ -+ if (have_original) { -+ // compared against an original, this is a dup. -+ is_cur_dup = 1; -+ } else if (ori_paired != current_paired) { -+ if (!current_paired) { -+ // current is single vs pair, this is a dup. -+ is_cur_dup = 1; -+ } -+ } else { -+ // do it by scores -+ int64_t ori_score, curr_score; -+ -+ if ((ori->b->core.flag & BAM_FQCFAIL) != (current->b->core.flag & BAM_FQCFAIL)) { -+ if (ori->b->core.flag & BAM_FQCFAIL) { -+ ori_score = 0; -+ curr_score = 1; -+ } else { -+ ori_score = 1; -+ curr_score = 0; -+ } -+ } else { -+ ori_score = calc_score(ori->b); -+ curr_score = calc_score(current->b); -+ -+ if (current_paired) { -+ // they are pairs so add mate scores. -+ int64_t mate_tmp; -+ -+ if ((mate_tmp = get_mate_score(ori->b)) == -1) { -+ fprintf(samtools_stderr, "[markdup] error: no ms score tag. Please run samtools fixmate on file first.\n"); -+ ret = -1; -+ break; -+ } else { -+ ori_score += mate_tmp; -+ } -+ -+ if ((mate_tmp = get_mate_score(current->b)) == -1) { -+ fprintf(samtools_stderr, "[markdup] error: no ms score tag. 
Please run samtools fixmate on file first.\n"); -+ ret = -1; -+ break; -+ } else { -+ curr_score += mate_tmp; -+ } -+ } -+ } -+ -+ if (ori_score == curr_score) { -+ if (strcmp(bam_get_qname(current->b), ori_name) < 0) { -+ curr_score++; -+ } else { -+ curr_score--; -+ } -+ } -+ -+ if (ori_score > curr_score) { -+ is_cur_dup = 1; -+ } -+ } -+ -+ if (is_cur_dup) { -+ // the current is the optical duplicate -+ if (!is_cur_opt) { // only change if not already an optical duplicate -+ if (optical_retag(param, dup_hash, current->b, current_paired, optical_single, optical_pair)) { -+ ret = -1; -+ break; -+ } -+ } -+ } else { -+ if (!is_ori_opt) { -+ if (optical_retag(param, dup_hash, ori->b, ori_paired, optical_single, optical_pair)) { -+ ret = -1; -+ break; -+ } -+ } -+ } -+ } -+ } -+ } -+ -+ current = current->duplicate; -+ } -+ -+ return ret; -+} -+ -+/* -+ Function to use when estimating library size. -+ -+ This is based on an approximate formula for the coverage of a set -+ obtained after sampling it a given number of times with replacement. -+ -+ x = number of items in the set (the number of unique fragments in the library) -+ -+ c = number of unique items (unique read pairs observed) -+ -+ n = number of items samples (total number of read pairs) -+ -+ c and n are known; x is unknown. -+ -+ As n -> infinity, the coverage (c/x) can be given as: -+ -+ c / x = 1 - exp(-n / x) (see https://math.stackexchange.com/questions/32800) -+ -+ This needs to be solved for x, so it is rearranged to put both terms on the -+ left side and estimate_library_size() finds a value of x which gives a -+ result of zero (or as close as it can get). 
-+ */ -+static inline double coverage_equation(double x, double c, double n) { -+ return c / x - 1 + exp(-n / x); -+} -+ -+ -+/* estimate the library size, based on the Picard code in DuplicationMetrics.java*/ -+static unsigned long estimate_library_size(unsigned long read_pairs, unsigned long duplicate_pairs) { -+ unsigned long estimated_size = 0; -+ -+ read_pairs /= 2; -+ duplicate_pairs /= 2; -+ -+ if ((read_pairs && duplicate_pairs) && (read_pairs > duplicate_pairs)) { -+ unsigned long unique_pairs = read_pairs - duplicate_pairs; -+ double m = 1; -+ double M = 100; -+ int i; -+ -+ if (coverage_equation(m * (double)unique_pairs, (double)unique_pairs, (double)read_pairs) < 0) { -+ fprintf(samtools_stderr, "[markdup] warning: unable to calculate estimated library size.\n"); -+ return estimated_size; -+ } -+ -+ while (coverage_equation(M * (double)unique_pairs, (double)unique_pairs, (double)read_pairs) > 0) { -+ M *= 10; -+ } -+ -+ for (i = 0; i < 40; i++) { -+ double r = (m + M) / 2; -+ double u = coverage_equation(r * (double)unique_pairs, (double)unique_pairs, (double)read_pairs); -+ -+ if (u > 0) { -+ m = r; -+ } else if (u < 0) { -+ M = r; -+ } else { -+ break; -+ } -+ } -+ -+ estimated_size = (unsigned long)(unique_pairs * (m + M) / 2); -+ } else { -+ fprintf(samtools_stderr, "[markdup] warning: unable to calculate estimated library size." -+ " Read pairs %ld should be greater than duplicate pairs %ld," -+ " which should both be non zero.\n", -+ read_pairs, duplicate_pairs); -+ } -+ -+ return estimated_size; -+} -+ -+ - /* Compare the reads near each other (coordinate sorted) and try to spot the duplicates. - Generally the highest quality scoring is chosen as the original and all others the duplicates. - The score is based on the sum of the quality values (<= 15) of the read and its mate (if any). -@@ -478,44 +1137,59 @@ - Marking the supplementary reads of a duplicate as also duplicates takes an extra file read/write - step. 
This is because the duplicate can occur before the primary read.*/ - --static int bam_mark_duplicates(samFile *in, samFile *out, char *prefix, int remove_dups, int32_t max_length, int do_stats, int supp, int tag) { -- bam_hdr_t *header; -+static int bam_mark_duplicates(md_param_t *param) { -+ bam_hdr_t *header = NULL; - khiter_t k; - khash_t(reads) *pair_hash = kh_init(reads); - khash_t(reads) *single_hash = kh_init(reads); - klist_t(read_queue) *read_buffer = kl_init(read_queue); - kliter_t(read_queue) *rq; - khash_t(duplicates) *dup_hash = kh_init(duplicates); -- int32_t prev_tid, prev_coord; -+ int32_t prev_tid; -+ hts_pos_t prev_coord; - read_queue_t *in_read; - int ret; -- int reading, writing, excluded, duplicate, single, pair, single_dup, examined; -+ long reading, writing, excluded, duplicate, single, pair, single_dup, examined, optical, single_optical; -+ long np_duplicate, np_opt_duplicate; -+ long opt_warnings = 0; - tmp_file_t temp; -+ char *idx_fn = NULL; -+ int exclude = 0; - -- if ((header = sam_hdr_read(in)) == NULL) { -+ if (!pair_hash || !single_hash || !read_buffer || !dup_hash) { -+ fprintf(samtools_stderr, "[markdup] out of memory\n"); -+ goto fail; -+ } -+ -+ if ((header = sam_hdr_read(param->in)) == NULL) { - fprintf(samtools_stderr, "[markdup] error reading header\n"); -- return 1; -+ goto fail; - } - - // accept unknown, unsorted or coordinate sort order, but error on queryname sorted. - // only really works on coordinate sorted files. -- if ((header->l_text > 3) && (strncmp(header->text, "@HD", 3) == 0)) { -- char *p, *q; -- -- p = strstr(header->text, "\tSO:queryname"); -- q = strchr(header->text, '\n'); -- -- // looking for SO:queryname within @HD only -- // (e.g. 
must ignore in a @CO comment line later in header) -- if ((p != 0) && (p < q)) { -- fprintf(samtools_stderr, "[markdup] error: queryname sorted, must be sorted by coordinate.\n"); -- return 1; -- } -+ kstring_t str = KS_INITIALIZE; -+ if (!sam_hdr_find_tag_hd(header, "SO", &str) && str.s && !strcmp(str.s, "queryname")) { -+ fprintf(samtools_stderr, "[markdup] error: queryname sorted, must be sorted by coordinate.\n"); -+ ks_free(&str); -+ goto fail; -+ } -+ ks_free(&str); -+ -+ if (!param->no_pg && sam_hdr_add_pg(header, "samtools", "VN", samtools_version(), -+ param->arg_list ? "CL" : NULL, -+ param->arg_list ? param->arg_list : NULL, -+ NULL) != 0) { -+ fprintf(samtools_stderr, "[markdup] warning: unable to add @PG line to header.\n"); - } - -- if (sam_hdr_write(out, header) < 0) { -+ if (sam_hdr_write(param->out, header) < 0) { - fprintf(samtools_stderr, "[markdup] error writing header.\n"); -- return 1; -+ goto fail; -+ } -+ if (param->write_index) { -+ if (!(idx_fn = auto_index(param->out, param->out_fn, header))) -+ goto fail; - } - - // used for coordinate order checks -@@ -523,30 +1197,35 @@ - - // get the buffer going - in_read = kl_pushp(read_queue, read_buffer); -+ if (!in_read) { -+ fprintf(samtools_stderr, "[markdup] out of memory\n"); -+ goto fail; -+ } - - // handling supplementary reads needs a temporary file -- if (supp) { -- if (tmp_file_open_write(&temp, prefix, 1)) { -- fprintf(samtools_stderr, "[markdup] error: unable to open tmp file %s.\n", prefix); -- return 1; -+ if (param->supp) { -+ if (tmp_file_open_write(&temp, param->prefix, 1)) { -+ fprintf(samtools_stderr, "[markdup] error: unable to open tmp file %s.\n", param->prefix); -+ goto fail; - } - } - - if ((in_read->b = bam_init1()) == NULL) { - fprintf(samtools_stderr, "[markdup] error: unable to allocate memory for alignment.\n"); -- return 1; -+ goto fail; - } - -- reading = writing = excluded = single_dup = duplicate = examined = pair = single = 0; -+ reading = writing = excluded = 
single_dup = duplicate = examined = pair = single = optical = single_optical = 0; -+ np_duplicate = np_opt_duplicate = 0; - -- while ((ret = sam_read1(in, header, in_read->b)) >= 0) { -+ while ((ret = sam_read1(param->in, header, in_read->b)) >= 0) { - - // do some basic coordinate order checks - if (in_read->b->core.tid >= 0) { // -1 for unmapped reads - if (in_read->b->core.tid < prev_tid || - ((in_read->b->core.tid == prev_tid) && (in_read->b->core.pos < prev_coord))) { -- fprintf(samtools_stderr, "[markdup] error: bad coordinate order.\n"); -- return 1; -+ fprintf(samtools_stderr, "[markdup] error: not in coordinate sorted order.\n"); -+ goto fail; - } - } - -@@ -557,10 +1236,30 @@ - - reading++; - -- // read must not be secondary, supplementary, unmapped or failed QC -- if (!(in_read->b->core.flag & (BAM_FSECONDARY | BAM_FSUPPLEMENTARY | BAM_FUNMAP | BAM_FQCFAIL))) { -- examined++; -+ if (param->clear && (in_read->b->core.flag & BAM_FDUP)) { -+ uint8_t *data; -+ -+ in_read->b->core.flag ^= BAM_FDUP; - -+ if ((data = bam_aux_get(in_read->b, "dt")) != NULL) { -+ bam_aux_del(in_read->b, data); -+ } -+ -+ if ((data = bam_aux_get(in_read->b, "do")) != NULL) { -+ bam_aux_del(in_read->b, data); -+ } -+ } -+ -+ if (param->include_fails) { -+ exclude |= (BAM_FSECONDARY | BAM_FSUPPLEMENTARY | BAM_FUNMAP); -+ } else { -+ exclude |= (BAM_FSECONDARY | BAM_FSUPPLEMENTARY | BAM_FUNMAP | BAM_FQCFAIL); -+ } -+ -+ // read must not be secondary, supplementary, unmapped or (possibly) failed QC -+ if (!(in_read->b->core.flag & exclude)) { -+ examined++; -+ in_read->duplicate = NULL; - - // look at the pairs first - if ((in_read->b->core.flag & BAM_FPAIRED) && !(in_read->b->core.flag & BAM_FMUNMAP)) { -@@ -569,9 +1268,16 @@ - key_data_t single_key; - in_hash_t *bp; - -- if (make_pair_key(&pair_key, in_read->b)) { -- fprintf(samtools_stderr, "[markdup] error: unable to assign pair hash key.\n"); -- return 1; -+ if (param->mode) { -+ if (make_pair_key_sequence(&pair_key, in_read->b)) 
{ -+ fprintf(samtools_stderr, "[markdup] error: unable to assign pair hash key.\n"); -+ goto fail; -+ } -+ } else { -+ if (make_pair_key_template(&pair_key, in_read->b)) { -+ fprintf(samtools_stderr, "[markdup] error: unable to assign pair hash key.\n"); -+ goto fail; -+ } - } - - make_single_key(&single_key, in_read->b); -@@ -585,40 +1291,32 @@ - if (ret > 0) { // new - // add to single duplicate hash - bp = &kh_val(single_hash, k); -- bp->p = in_read->b; -+ bp->p = in_read; - in_read->single_key = single_key; - } else if (ret == 0) { // exists - // look at singles only for duplication marking - bp = &kh_val(single_hash, k); - -- if (!(bp->p->core.flag & BAM_FPAIRED) || (bp->p->core.flag & BAM_FMUNMAP)) { -- bam1_t *dup = bp->p; -+ if (!(bp->p->b->core.flag & BAM_FPAIRED) || (bp->p->b->core.flag & BAM_FMUNMAP)) { -+ // singleton will always be marked duplicate even if -+ // scores more than one read of the pair -+ bam1_t *dup = bp->p->b; -+ -+ in_read->duplicate = bp->p; -+ bp->p = in_read; - -- // singleton will always be marked duplicate even if -- // scores more than one read of the pair -+ if (mark_duplicates(param, dup_hash, bp->p->b, dup, &single_optical, &opt_warnings)) -+ goto fail; - -- bp->p = in_read->b; -- dup->core.flag |= BAM_FDUP; - single_dup++; - -- if (tag) { -- if (bam_aux_append(dup, "do", 'Z', strlen(bam_get_qname(bp->p)) + 1, (uint8_t*)bam_get_qname(bp->p))) { -- fprintf(samtools_stderr, "[markdup] error: unable to append 'do' tag.\n"); -- return 1; -- } -- } -+ if (duplicate_chain_check(param, dup_hash, bp->p, &opt_warnings, &single_optical, &optical)) -+ goto fail; - -- if (supp) { -- if (bam_aux_get(dup, "SA") || (dup->core.flag & BAM_FMUNMAP)) { -- if (add_duplicate(dup_hash, dup)) { -- return 1; -- } -- } -- } - } - } else { - fprintf(samtools_stderr, "[markdup] error: single hashing failure.\n"); -- return 1; -+ goto fail; - } - - // now do the pair -@@ -627,33 +1325,44 @@ - if (ret > 0) { // new - // add to the pair hash - bp = 
&kh_val(pair_hash, k); -- bp->p = in_read->b; -+ bp->p = in_read; - in_read->pair_key = pair_key; - } else if (ret == 0) { - int64_t old_score, new_score, tie_add = 0; - bam1_t *dup; -+ int check_chain = 0; - - bp = &kh_val(pair_hash, k); - -- if ((mate_tmp = get_mate_score(bp->p)) == -1) { -- fprintf(samtools_stderr, "[markdup] error: no ms score tag. Please run samtools fixmate on file first.\n"); -- return 1; -+ if ((bp->p->b->core.flag & BAM_FQCFAIL) != (in_read->b->core.flag & BAM_FQCFAIL)) { -+ if (bp->p->b->core.flag & BAM_FQCFAIL) { -+ old_score = 0; -+ new_score = 1; -+ } else { -+ old_score = 1; -+ new_score = 0; -+ } - } else { -- old_score = calc_score(bp->p) + mate_tmp; -- } -+ if ((mate_tmp = get_mate_score(bp->p->b)) == -1) { -+ fprintf(samtools_stderr, "[markdup] error: no ms score tag. Please run samtools fixmate on file first.\n"); -+ goto fail; -+ } else { -+ old_score = calc_score(bp->p->b) + mate_tmp; -+ } - -- if ((mate_tmp = get_mate_score(in_read->b)) == -1) { -- fprintf(samtools_stderr, "[markdup] error: no ms score tag. Please run samtools fixmate on file first.\n"); -- return 1; -- } else { -- new_score = calc_score(in_read->b) + mate_tmp; -+ if ((mate_tmp = get_mate_score(in_read->b)) == -1) { -+ fprintf(samtools_stderr, "[markdup] error: no ms score tag. 
Please run samtools fixmate on file first.\n"); -+ goto fail; -+ } else { -+ new_score = calc_score(in_read->b) + mate_tmp; -+ } - } - - // choose the highest score as the original - // and add it to the pair hash, mark the other as duplicate - - if (new_score == old_score) { -- if (strcmp(bam_get_qname(in_read->b), bam_get_qname(bp->p)) < 0) { -+ if (strcmp(bam_get_qname(in_read->b), bam_get_qname(bp->p->b)) < 0) { - tie_add = 1; - } else { - tie_add = -1; -@@ -661,39 +1370,40 @@ - } - - if (new_score + tie_add > old_score) { // swap reads -- dup = bp->p; -- bp->p = in_read->b; -+ dup = bp->p->b; -+ in_read->duplicate = bp->p; -+ bp->p = in_read; - } else { -+ if (bp->p->duplicate) { -+ in_read->duplicate = bp->p->duplicate; -+ check_chain = 1; -+ } -+ -+ bp->p->duplicate = in_read; - dup = in_read->b; - } - -- dup->core.flag |= BAM_FDUP; -- -- if (tag) { -- if (bam_aux_append(dup, "do", 'Z', strlen(bam_get_qname(bp->p)) + 1, (uint8_t*)bam_get_qname(bp->p))) { -- fprintf(samtools_stderr, "[markdup] error: unable to append 'do' tag.\n"); -- return 1; -- } -+ if (mark_duplicates(param, dup_hash, bp->p->b, dup, &optical, &opt_warnings)) -+ goto fail; - -+ if (check_chain) { -+ if (duplicate_chain_check(param, dup_hash, bp->p->duplicate, &opt_warnings, &single_optical, &optical)) -+ goto fail; - } - -- if (supp) { -- if (bam_aux_get(dup, "SA") || (dup->core.flag & BAM_FMUNMAP)) { -- if (add_duplicate(dup_hash, dup)) { -- return 1; -- } -- } -- } -+ if (duplicate_chain_check(param, dup_hash, bp->p, &opt_warnings, &single_optical, &optical)) -+ goto fail; - - duplicate++; - } else { - fprintf(samtools_stderr, "[markdup] error: pair hashing failure.\n"); -- return 1; -+ goto fail; - } - } else { // do the single (or effectively single) reads - int ret; - key_data_t single_key; - in_hash_t *bp; -+ int check_chain = 0; - - make_single_key(&single_key, in_read->b); - -@@ -704,68 +1414,76 @@ - - if (ret > 0) { // new - bp = &kh_val(single_hash, k); -- bp->p = in_read->b; -+ 
bp->p = in_read; - in_read->single_key = single_key; - } else if (ret == 0) { // exists - bp = &kh_val(single_hash, k); - -- if ((bp->p->core.flag & BAM_FPAIRED) && !(bp->p->core.flag & BAM_FMUNMAP)) { -+ if ((bp->p->b->core.flag & BAM_FPAIRED) && !(bp->p->b->core.flag & BAM_FMUNMAP)) { - // if matched against one of a pair just mark as duplicate - -- if (tag) { -- if (bam_aux_append(in_read->b, "do", 'Z', strlen(bam_get_qname(bp->p)) + 1, (uint8_t*)bam_get_qname(bp->p))) { -- fprintf(samtools_stderr, "[markdup] error: unable to append 'do' tag.\n"); -- return 1; -- } -+ if (bp->p->duplicate) { -+ in_read->duplicate = bp->p->duplicate; -+ check_chain = 1; - } - -- if (supp) { -- if (bam_aux_get(in_read->b, "SA") || (in_read->b->core.flag & BAM_FMUNMAP)) { -- if (add_duplicate(dup_hash, in_read->b)) { -- return 1; -- } -- } -+ bp->p->duplicate = in_read; -+ -+ if (mark_duplicates(param, dup_hash, bp->p->b, in_read->b, &single_optical, &opt_warnings)) -+ goto fail; -+ -+ if (check_chain) { -+ // check the new duplicate entry in the chain -+ if (duplicate_chain_check(param, dup_hash, bp->p->duplicate, &opt_warnings, &single_optical, &optical)) -+ goto fail; - } - -- in_read->b->core.flag |= BAM_FDUP; -+ // check against the new original -+ if (duplicate_chain_check(param, dup_hash, bp->p, &opt_warnings, &single_optical, &optical)) -+ goto fail; -+ - } else { - int64_t old_score, new_score; - bam1_t *dup; - -- old_score = calc_score(bp->p); -+ old_score = calc_score(bp->p->b); - new_score = calc_score(in_read->b); - - // choose the highest score as the original, add it - // to the single hash and mark the other as duplicate - if (new_score > old_score) { // swap reads -- dup = bp->p; -- bp->p = in_read->b; -+ dup = bp->p->b; -+ in_read->duplicate = bp->p; -+ bp->p = in_read; - } else { -+ if (bp->p->duplicate) { -+ in_read->duplicate = bp->p->duplicate; -+ check_chain = 1; -+ } -+ -+ bp->p->duplicate = in_read; - dup = in_read->b; - } - -- dup->core.flag |= BAM_FDUP; 
-+ if (mark_duplicates(param, dup_hash, bp->p->b, dup, &single_optical, &opt_warnings)) -+ goto fail; - -- if (tag) { -- if (bam_aux_append(dup, "do", 'Z', strlen(bam_get_qname(bp->p)) + 1, (uint8_t*)bam_get_qname(bp->p))) { -- fprintf(samtools_stderr, "[markdup] error: unable to append 'do' tag.\n"); -- return 1; -- } -+ -+ if (check_chain) { -+ if (duplicate_chain_check(param, dup_hash, bp->p->duplicate, &opt_warnings, &single_optical, &optical)) -+ goto fail; - } - -- if (supp) { -- if (bam_aux_get(dup, "SA") || (dup->core.flag & BAM_FMUNMAP)) { -- if (add_duplicate(dup_hash, dup)) { -- return 1; -- } -- } -+ if (duplicate_chain_check(param, dup_hash, bp->p, &opt_warnings, &single_optical, &optical)) -+ goto fail; -+ -+ - } -- } - - single_dup++; - } else { - fprintf(samtools_stderr, "[markdup] error: single hashing failure.\n"); -- return 1; -+ goto fail; - } - } - } else { -@@ -780,20 +1498,20 @@ - - /* keep a moving window of reads based on coordinates and max read length. Any unaligned reads - should just be written as they cannot be matched as duplicates. 
*/ -- if (in_read->pos + max_length > prev_coord && in_read->b->core.tid == prev_tid && (prev_tid != -1 || prev_coord != -1)) { -+ if (in_read->pos + param->max_length > prev_coord && in_read->b->core.tid == prev_tid && (prev_tid != -1 || prev_coord != -1)) { - break; - } - -- if (!remove_dups || !(in_read->b->core.flag & BAM_FDUP)) { -- if (supp) { -+ if (!param->remove_dups || !(in_read->b->core.flag & BAM_FDUP)) { -+ if (param->supp) { - if (tmp_file_write(&temp, in_read->b)) { - fprintf(samtools_stderr, "[markdup] error: writing temp output failed.\n"); -- return 1; -+ goto fail; - } - } else { -- if (sam_write1(out, header, in_read->b) < 0) { -+ if (sam_write1(param->out, header, in_read->b) < 0) { - fprintf(samtools_stderr, "[markdup] error: writing output failed.\n"); -- return 1; -+ goto fail; - } - } - -@@ -818,16 +1536,20 @@ - - // set the next one up for reading - in_read = kl_pushp(read_queue, read_buffer); -+ if (!in_read) { -+ fprintf(samtools_stderr, "[markdup] out of memory\n"); -+ goto fail; -+ } - - if ((in_read->b = bam_init1()) == NULL) { - fprintf(samtools_stderr, "[markdup] error: unable to allocate memory for alignment.\n"); -- return 1; -+ goto fail; - } - } - - if (ret < -1) { - fprintf(samtools_stderr, "[markdup] error: truncated input file.\n"); -- return 1; -+ goto fail; - } - - // write out the end of the list -@@ -836,16 +1558,16 @@ - in_read = &kl_val(rq); - - if (bam_get_qname(in_read->b)) { // last entry will be blank -- if (!remove_dups || !(in_read->b->core.flag & BAM_FDUP)) { -- if (supp) { -+ if (!param->remove_dups || !(in_read->b->core.flag & BAM_FDUP)) { -+ if (param->supp) { - if (tmp_file_write(&temp, in_read->b)) { - fprintf(samtools_stderr, "[markdup] error: writing temp output failed.\n"); -- return 1; -+ goto fail; - } - } else { -- if (sam_write1(out, header, in_read->b) < 0) { -+ if (sam_write1(param->out, header, in_read->b) < 0) { - fprintf(samtools_stderr, "[markdup] error: writing output failed.\n"); -- return 1; 
-+ goto fail; - } - } - -@@ -858,71 +1580,155 @@ - rq = kl_begin(read_buffer); - } - -- if (supp) { -+ if (param->supp) { - bam1_t *b; - - if (tmp_file_end_write(&temp)) { - fprintf(samtools_stderr, "[markdup] error: unable to end tmp writing.\n"); -- return 1; -+ goto fail; - } - - // read data from temp file and mark duplicate supplementary alignments - -- if (tmp_file_begin_read(&temp, NULL)) { -- return 1; -+ if (tmp_file_begin_read(&temp)) { -+ goto fail; - } - - b = bam_init1(); - - while ((ret = tmp_file_read(&temp, b)) > 0) { - -- if ((b->core.flag & BAM_FSUPPLEMENTARY) || (b->core.flag & BAM_FUNMAP)) { -+ if ((b->core.flag & BAM_FSUPPLEMENTARY) || (b->core.flag & BAM_FUNMAP) || (b->core.flag & BAM_FSECONDARY)) { -+ - k = kh_get(duplicates, dup_hash, bam_get_qname(b)); - - if (k != kh_end(dup_hash)) { -+ - b->core.flag |= BAM_FDUP; -+ np_duplicate++; -+ -+ if (param->tag && kh_val(dup_hash, k).name) { -+ if (bam_aux_append(b, "do", 'Z', strlen(kh_val(dup_hash, k).name) + 1, (uint8_t*)kh_val(dup_hash, k).name)) { -+ fprintf(samtools_stderr, "[markdup] error: unable to append supplementary 'do' tag.\n"); -+ goto fail; -+ } -+ } -+ -+ if (param->opt_dist) { -+ if (kh_val(dup_hash, k).type) { -+ bam_aux_append(b, "dt", 'Z', 3, (const uint8_t *)"SQ"); -+ np_opt_duplicate++; -+ } else { -+ bam_aux_append(b, "dt", 'Z', 3, (const uint8_t *)"LB"); -+ } -+ } - } - } - -- if (!remove_dups || !(b->core.flag & BAM_FDUP)) { -- if (sam_write1(out, header, b) < 0) { -+ if (!param->remove_dups || !(b->core.flag & BAM_FDUP)) { -+ if (sam_write1(param->out, header, b) < 0) { - fprintf(samtools_stderr, "[markdup] error: writing final output failed.\n"); -- return 1; -+ goto fail; - } - } - } - - if (ret == -1) { - fprintf(samtools_stderr, "[markdup] error: failed to read tmp file.\n"); -- return 1; -+ goto fail; - } - - for (k = kh_begin(dup_hash); k != kh_end(dup_hash); ++k) { - if (kh_exist(dup_hash, k)) { -+ free(kh_val(dup_hash, k).name); - free((char *)kh_key(dup_hash, 
k)); -+ kh_key(dup_hash, k) = NULL; - } - } - -- tmp_file_destroy(&temp, b, 0); -- kh_destroy(duplicates, dup_hash); -+ tmp_file_destroy(&temp); - bam_destroy1(b); - } - -- if (do_stats) { -- fprintf(samtools_stderr, "READ %d WRITTEN %d \n" -- "EXCLUDED %d EXAMINED %d\n" -- "PAIRED %d SINGLE %d\n" -- "DULPICATE PAIR %d DUPLICATE SINGLE %d\n" -- "DUPLICATE TOTAL %d\n", reading, writing, excluded, examined, pair, single, -- duplicate, single_dup, single_dup + duplicate); -+ if (opt_warnings) { -+ fprintf(samtools_stderr, "[markdup] warning: number of failed attempts to get coordinates from read names = %ld\n", -+ opt_warnings); -+ } -+ -+ if (param->do_stats) { -+ FILE *fp; -+ int file_open = 0; -+ unsigned long els; -+ -+ if (param->stats_file) { -+ if (NULL == (fp = fopen(param->stats_file, "w"))) { -+ fprintf(samtools_stderr, "[markdup] warning: cannot write stats to %s.\n", param->stats_file); -+ fp = samtools_stderr; -+ } else { -+ file_open = 1; -+ } -+ } else { -+ fp = samtools_stderr; -+ } -+ -+ els = estimate_library_size(pair, duplicate - optical); -+ -+ fprintf(fp, -+ "COMMAND: %s\n" -+ "READ: %ld\n" -+ "WRITTEN: %ld\n" -+ "EXCLUDED: %ld\n" -+ "EXAMINED: %ld\n" -+ "PAIRED: %ld\n" -+ "SINGLE: %ld\n" -+ "DUPLICATE PAIR: %ld\n" -+ "DUPLICATE SINGLE: %ld\n" -+ "DUPLICATE PAIR OPTICAL: %ld\n" -+ "DUPLICATE SINGLE OPTICAL: %ld\n" -+ "DUPLICATE NON PRIMARY: %ld\n" -+ "DUPLICATE NON PRIMARY OPTICAL: %ld\n" -+ "DUPLICATE PRIMARY TOTAL: %ld\n" -+ "DUPLICATE TOTAL: %ld\n" -+ "ESTIMATED_LIBRARY_SIZE: %ld\n", param->arg_list, reading, writing, excluded, examined, pair, single, -+ duplicate, single_dup, optical, single_optical, np_duplicate, np_opt_duplicate, -+ single_dup + duplicate, single_dup + duplicate + np_duplicate, els); -+ -+ if (file_open) { -+ fclose(fp); -+ } -+ } -+ -+ if (param->write_index) { -+ if (sam_idx_save(param->out) < 0) { -+ print_error_errno("markdup", "writing index failed"); -+ goto fail; -+ } - } - - kh_destroy(reads, pair_hash); - 
kh_destroy(reads, single_hash); - kl_destroy(read_queue, read_buffer); -- bam_hdr_destroy(header); -+ kh_destroy(duplicates, dup_hash); -+ sam_hdr_destroy(header); - - return 0; -+ -+ fail: -+ for (rq = kl_begin(read_buffer); rq != kl_end(read_buffer); rq = kl_next(rq)) -+ bam_destroy1(kl_val(rq).b); -+ kl_destroy(read_queue, read_buffer); -+ -+ for (k = kh_begin(dup_hash); k != kh_end(dup_hash); ++k) { -+ if (kh_exist(dup_hash, k)) { -+ free((char *)kh_key(dup_hash, k)); -+ } -+ } -+ kh_destroy(duplicates, dup_hash); -+ -+ kh_destroy(reads, pair_hash); -+ kh_destroy(reads, single_hash); -+ sam_hdr_destroy(header); -+ return 1; - } - - -@@ -930,15 +1736,23 @@ - fprintf(samtools_stderr, "\n"); - fprintf(samtools_stderr, "Usage: samtools markdup \n\n"); - fprintf(samtools_stderr, "Option: \n"); -- fprintf(samtools_stderr, " -r Remove duplicate reads\n"); -- fprintf(samtools_stderr, " -l INT Max read length (default 300 bases)\n"); -- fprintf(samtools_stderr, " -S Mark supplemenary alignments of duplicates as duplicates (slower).\n"); -- fprintf(samtools_stderr, " -s Report stats.\n"); -- fprintf(samtools_stderr, " -T PREFIX Write temporary files to PREFIX.samtools.nnnn.nnnn.tmp.\n"); -- fprintf(samtools_stderr, " -t Mark primary duplicates with the name of the original in a \'do\' tag." -+ fprintf(samtools_stderr, " -r Remove duplicate reads\n"); -+ fprintf(samtools_stderr, " -l INT Max read length (default 300 bases)\n"); -+ fprintf(samtools_stderr, " -S Mark supplementary alignments of duplicates as duplicates (slower).\n"); -+ fprintf(samtools_stderr, " -s Report stats.\n"); -+ fprintf(samtools_stderr, " -f NAME Write stats to named file. 
Implies -s.\n"); -+ fprintf(samtools_stderr, " -T PREFIX Write temporary files to PREFIX.samtools.nnnn.nnnn.tmp.\n"); -+ fprintf(samtools_stderr, " -d INT Optical distance (if set, marks with dt tag)\n"); -+ fprintf(samtools_stderr, " -c Clear previous duplicate settings and tags.\n"); -+ fprintf(samtools_stderr, " -m --mode TYPE Duplicate decision method for paired reads.\n" -+ " TYPE = t measure positions based on template start/end (default).\n" -+ " s measure positions based on sequence start.\n"); -+ fprintf(samtools_stderr, " --include-fails Include quality check failed reads.\n"); -+ fprintf(samtools_stderr, " --no-PG Do not add a PG line\n"); -+ fprintf(samtools_stderr, " -t Mark primary duplicates with the name of the original in a \'do\' tag." - " Mainly for information and debugging.\n"); - -- sam_global_opt_help(samtools_stderr, "-.O..@"); -+ sam_global_opt_help(samtools_stderr, "-.O..@.."); - - fprintf(samtools_stderr, "\nThe input file must be coordinate sorted and must have gone" - " through fixmates with the mate scoring option on.\n"); -@@ -948,29 +1762,47 @@ - - - int bam_markdup(int argc, char **argv) { -- int c, ret, remove_dups = 0, report_stats = 0, include_supplementary = 0, tag_dup = 0; -- int32_t max_length = 300; -- samFile *in = NULL, *out = NULL; -+ int c, ret; - char wmode[3] = {'w', 'b', 0}; - sam_global_args ga = SAM_GLOBAL_ARGS_INIT; - htsThreadPool p = {NULL, 0}; - kstring_t tmpprefix = {0, 0, NULL}; - struct stat st; - unsigned int t; -+ md_param_t param = {NULL, NULL, NULL, 0, 300, 0, 0, 0, 0, 0, 0, 0, 0, 0, NULL, NULL, NULL}; - - static const struct option lopts[] = { - SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0, '@'), -+ {"include-fails", no_argument, NULL, 1001}, -+ {"no-PG", no_argument, NULL, 1002}, -+ {"mode", required_argument, NULL, 'm'}, - {NULL, 0, NULL, 0} - }; - -- while ((c = getopt_long(argc, argv, "rsl:StT:O:@:", lopts, NULL)) >= 0) { -+ while ((c = getopt_long(argc, argv, "rsl:StT:O:@:f:d:ncm:", lopts, NULL)) >= 0) { 
- switch (c) { -- case 'r': remove_dups = 1; break; -- case 'l': max_length = atoi(optarg); break; -- case 's': report_stats = 1; break; -+ case 'r': param.remove_dups = 1; break; -+ case 'l': param.max_length = atoi(optarg); break; -+ case 's': param.do_stats = 1; break; - case 'T': kputs(optarg, &tmpprefix); break; -- case 'S': include_supplementary = 1; break; -- case 't': tag_dup = 1; break; -+ case 'S': param.supp = 1; break; -+ case 't': param.tag = 1; break; -+ case 'f': param.stats_file = optarg; param.do_stats = 1; break; -+ case 'd': param.opt_dist = atoi(optarg); break; -+ case 'c': param.clear = 1; break; -+ case 'm': -+ if (strcmp(optarg, "t") == 0) { -+ param.mode = MD_MODE_TEMPLATE; -+ } else if (strcmp(optarg, "s") == 0) { -+ param.mode = MD_MODE_SEQUENCE; -+ } else { -+ fprintf(samtools_stderr, "[markdup] error: unknown mode '%s'.\n", optarg); -+ return markdup_usage(); -+ } -+ -+ break; -+ case 1001: param.include_fails = 1; break; -+ case 1002: param.no_pg = 1; break; - default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; - /* else fall-through */ - case '?': return markdup_usage(); -@@ -980,17 +1812,20 @@ - if (optind + 2 > argc) - return markdup_usage(); - -- in = sam_open_format(argv[optind], "r", &ga.in); -+ if (param.opt_dist < 0) param.opt_dist = 0; -+ if (param.max_length < 0) param.max_length = 300; -+ -+ param.in = sam_open_format(argv[optind], "r", &ga.in); - -- if (!in) { -+ if (!param.in) { - print_error_errno("markdup", "failed to open \"%s\" for input", argv[optind]); - return 1; - } - - sam_open_mode(wmode + 1, argv[optind + 1], NULL); -- out = sam_open_format(argv[optind + 1], wmode, &ga.out); -+ param.out = sam_open_format(argv[optind + 1], wmode, &ga.out); - -- if (!out) { -+ if (!param.out) { - print_error_errno("markdup", "failed to open \"%s\" for output", argv[optind + 1]); - return 1; - } -@@ -1001,8 +1836,8 @@ - return 1; - } - -- hts_set_opt(in, HTS_OPT_THREAD_POOL, &p); -- hts_set_opt(out, 
HTS_OPT_THREAD_POOL, &p); -+ hts_set_opt(param.in, HTS_OPT_THREAD_POOL, &p); -+ hts_set_opt(param.out, HTS_OPT_THREAD_POOL, &p); - } - - // actual stuff happens here -@@ -1022,18 +1857,24 @@ - - t = ((unsigned) time(NULL)) ^ ((unsigned) clock()); - ksprintf(&tmpprefix, "samtools.%d.%u.tmp", (int) getpid(), t % 10000); -+ param.prefix = tmpprefix.s; -+ -+ param.arg_list = stringify_argv(argc + 1, argv - 1); -+ param.write_index = ga.write_index; -+ param.out_fn = argv[optind + 1]; - -- ret = bam_mark_duplicates(in, out, tmpprefix.s, remove_dups, max_length, report_stats, include_supplementary, tag_dup); -+ ret = bam_mark_duplicates(&param); - -- sam_close(in); -+ sam_close(param.in); - -- if (sam_close(out) < 0) { -+ if (sam_close(param.out) < 0) { - fprintf(samtools_stderr, "[markdup] error closing output file\n"); - ret = 1; - } - - if (p.pool) hts_tpool_destroy(p.pool); - -+ free(param.arg_list); - free(tmpprefix.s); - sam_global_args_free(&ga); - ---- python-pysam.orig/samtools/bam_mate.c -+++ python-pysam/samtools/bam_mate.c -@@ -1,6 +1,6 @@ - /* bam_mate.c -- fix mate pairing information and clean up flags. - -- Copyright (C) 2009, 2011-2017 Genome Research Ltd. -+ Copyright (C) 2009, 2011-2017, 2019 Genome Research Ltd. - Portions copyright (C) 2011 Broad Institute. - Portions copyright (C) 2012 Peter Cock, The James Hutton Institute. - -@@ -37,6 +37,9 @@ - #include "htslib/sam.h" - #include "samtools.h" - -+ -+#define MD_MIN_QUALITY 15 -+ - /* - * This function calculates ct tag for two bams, it assumes they are from the same template and - * writes the tag to the first read in position terms. 
-@@ -44,7 +47,8 @@ - static void bam_template_cigar(bam1_t *b1, bam1_t *b2, kstring_t *str) - { - bam1_t *swap; -- int i, end; -+ int i; -+ hts_pos_t end; - uint32_t *cigar; - str->l = 0; - if (b1->core.tid != b2->core.tid || b1->core.tid < 0 || b1->core.pos < 0 || b2->core.pos < 0 || b1->core.flag&BAM_FUNMAP || b2->core.flag&BAM_FUNMAP) return; // coordinateless or not on the same chr; skip -@@ -140,8 +144,8 @@ - - bam1_t* first = a; - bam1_t* second = b; -- int32_t a_pos = a->core.flag&BAM_FREVERSE ? bam_endpos(a) : a->core.pos; -- int32_t b_pos = b->core.flag&BAM_FREVERSE ? bam_endpos(b) : b->core.pos; -+ hts_pos_t a_pos = a->core.flag&BAM_FREVERSE ? bam_endpos(a) : a->core.pos; -+ hts_pos_t b_pos = b->core.flag&BAM_FREVERSE ? bam_endpos(b) : b->core.pos; - if (a_pos > b_pos) { - first = b; - second = a; -@@ -226,7 +230,7 @@ - int i; - - for (i = 0; i < b->core.l_qseq; i++) { -- if (qual[i] >= 15) score += qual[i]; -+ if (qual[i] >= MD_MIN_QUALITY) score += qual[i]; - } - - return score; -@@ -250,31 +254,34 @@ - } - - // currently, this function ONLY works if each read has one hit --static int bam_mating_core(samFile *in, samFile *out, int remove_reads, int proper_pair_check, int add_ct, int do_mate_scoring) -+static int bam_mating_core(samFile *in, samFile *out, int remove_reads, int proper_pair_check, int add_ct, int do_mate_scoring, char *arg_list, int no_pg) - { -- bam_hdr_t *header; -+ sam_hdr_t *header; - bam1_t *b[2] = { NULL, NULL }; -- int curr, has_prev, pre_end = 0, cur_end = 0, result; -- kstring_t str; -+ int curr, has_prev, result; -+ hts_pos_t pre_end = 0, cur_end = 0; -+ kstring_t str = KS_INITIALIZE; - -- str.l = str.m = 0; str.s = 0; - header = sam_hdr_read(in); - if (header == NULL) { - fprintf(stderr, "[bam_mating_core] ERROR: Couldn't read header\n"); - return 1; - } -+ - // Accept unknown, unsorted, or queryname sort order, but error on coordinate sorted. 
-- if ((header->l_text > 3) && (strncmp(header->text, "@HD", 3) == 0)) { -- char *p, *q; -- p = strstr(header->text, "\tSO:coordinate"); -- q = strchr(header->text, '\n'); -- // Looking for SO:coordinate within the @HD line only -- // (e.g. must ignore in a @CO comment line later in header) -- if ((p != 0) && (p < q)) { -- fprintf(stderr, "[bam_mating_core] ERROR: Coordinate sorted, require grouped/sorted by queryname.\n"); -- goto fail; -- } -+ if (!sam_hdr_find_tag_hd(header, "SO", &str) && str.s && !strcmp(str.s, "coordinate")) { -+ fprintf(stderr, "[bam_mating_core] ERROR: Coordinate sorted, require grouped/sorted by queryname.\n"); -+ goto fail; - } -+ ks_free(&str); -+ -+ if (!no_pg && sam_hdr_add_pg(header, "samtools", -+ "VN", samtools_version(), -+ arg_list ? "CL": NULL, -+ arg_list ? arg_list : NULL, -+ NULL)) -+ goto fail; -+ - if (sam_hdr_write(out, header) < 0) goto write_fail; - - b[0] = bam_init1(); -@@ -303,7 +310,7 @@ - cur_end = bam_endpos(cur); - - // Check cur_end isn't past the end of the contig we're on, if it is set the UNMAP'd flag -- if (cur_end > (int)header->target_len[cur->core.tid]) cur->core.flag |= BAM_FUNMAP; -+ if (cur_end > sam_hdr_tid2len(header, cur->core.tid)) cur->core.flag |= BAM_FUNMAP; - } - if (has_prev) { // do we have a pair of reads to examine? - if (strcmp(bam_get_qname(cur), bam_get_qname(pre)) == 0) { // identical pair name -@@ -314,7 +321,7 @@ - if (pre->core.tid == cur->core.tid && !(cur->core.flag&(BAM_FUNMAP|BAM_FMUNMAP)) - && !(pre->core.flag&(BAM_FUNMAP|BAM_FMUNMAP))) // if safe set TLEN/ISIZE - { -- uint32_t cur5, pre5; -+ hts_pos_t cur5, pre5; - cur5 = (cur->core.flag&BAM_FREVERSE)? cur_end : cur->core.pos; - pre5 = (pre->core.flag&BAM_FREVERSE)? 
pre_end : pre->core.pos; - cur->core.isize = pre5 - cur5; pre->core.isize = cur5 - pre5; -@@ -378,18 +385,19 @@ - - if (sam_write1(out, header, pre) < 0) goto write_fail; - } -- bam_hdr_destroy(header); -+ sam_hdr_destroy(header); - bam_destroy1(b[0]); - bam_destroy1(b[1]); -- free(str.s); -+ ks_free(&str); - return 0; - - write_fail: - print_error_errno("fixmate", "Couldn't write to output file"); - fail: -- bam_hdr_destroy(header); -+ sam_hdr_destroy(header); - bam_destroy1(b[0]); - bam_destroy1(b[1]); -+ ks_free(&str); - return 1; - } - -@@ -401,9 +409,10 @@ - " -r Remove unmapped reads and secondary alignments\n" - " -p Disable FR proper pair check\n" - " -c Add template cigar ct tag\n" --" -m Add mate score tag\n"); -+" -m Add mate score tag\n" -+" --no-PG do not add a PG line\n"); - -- sam_global_opt_help(where, "-.O..@"); -+ sam_global_opt_help(where, "-.O..@-."); - - fprintf(where, - "\n" -@@ -416,13 +425,15 @@ - { - htsThreadPool p = {NULL, 0}; - samFile *in = NULL, *out = NULL; -- int c, remove_reads = 0, proper_pair_check = 1, add_ct = 0, res = 1, mate_score = 0; -+ int c, remove_reads = 0, proper_pair_check = 1, add_ct = 0, res = 1, mate_score = 0, no_pg = 0; - sam_global_args ga = SAM_GLOBAL_ARGS_INIT; - char wmode[3] = {'w', 'b', 0}; - static const struct option lopts[] = { - SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0, '@'), -+ {"no-PG", no_argument, NULL, 1}, - { NULL, 0, NULL, 0 } - }; -+ char *arg_list = NULL; - - // parse args - if (argc == 1) { usage(stdout); return 0; } -@@ -432,6 +443,7 @@ - case 'p': proper_pair_check = 0; break; - case 'c': add_ct = 1; break; - case 'm': mate_score = 1; break; -+ case 1: no_pg = 1; break; - default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; - /* else fall-through */ - case '?': usage(stderr); goto fail; -@@ -439,6 +451,9 @@ - } - if (optind+1 >= argc) { usage(stderr); goto fail; } - -+ if (!no_pg && !(arg_list = stringify_argv(argc+1, argv-1))) -+ goto fail; -+ - // init - if ((in = 
sam_open_format(argv[optind], "rb", &ga.in)) == NULL) { - print_error_errno("fixmate", "cannot open input file"); -@@ -460,7 +475,7 @@ - } - - // run -- res = bam_mating_core(in, out, remove_reads, proper_pair_check, add_ct, mate_score); -+ res = bam_mating_core(in, out, remove_reads, proper_pair_check, add_ct, mate_score, arg_list, no_pg); - - // cleanup - sam_close(in); -@@ -470,6 +485,7 @@ - } - - if (p.pool) hts_tpool_destroy(p.pool); -+ free(arg_list); - sam_global_args_free(&ga); - return res; - -@@ -477,6 +493,7 @@ - if (in) sam_close(in); - if (out) sam_close(out); - if (p.pool) hts_tpool_destroy(p.pool); -+ free(arg_list); - sam_global_args_free(&ga); - return 1; - } ---- python-pysam.orig/samtools/bam_mate.c.pysam.c -+++ python-pysam/samtools/bam_mate.c.pysam.c -@@ -2,7 +2,7 @@ - - /* bam_mate.c -- fix mate pairing information and clean up flags. - -- Copyright (C) 2009, 2011-2017 Genome Research Ltd. -+ Copyright (C) 2009, 2011-2017, 2019 Genome Research Ltd. - Portions copyright (C) 2011 Broad Institute. - Portions copyright (C) 2012 Peter Cock, The James Hutton Institute. - -@@ -39,6 +39,9 @@ - #include "htslib/sam.h" - #include "samtools.h" - -+ -+#define MD_MIN_QUALITY 15 -+ - /* - * This function calculates ct tag for two bams, it assumes they are from the same template and - * writes the tag to the first read in position terms. -@@ -46,7 +49,8 @@ - static void bam_template_cigar(bam1_t *b1, bam1_t *b2, kstring_t *str) - { - bam1_t *swap; -- int i, end; -+ int i; -+ hts_pos_t end; - uint32_t *cigar; - str->l = 0; - if (b1->core.tid != b2->core.tid || b1->core.tid < 0 || b1->core.pos < 0 || b2->core.pos < 0 || b1->core.flag&BAM_FUNMAP || b2->core.flag&BAM_FUNMAP) return; // coordinateless or not on the same chr; skip -@@ -142,8 +146,8 @@ - - bam1_t* first = a; - bam1_t* second = b; -- int32_t a_pos = a->core.flag&BAM_FREVERSE ? bam_endpos(a) : a->core.pos; -- int32_t b_pos = b->core.flag&BAM_FREVERSE ? 
bam_endpos(b) : b->core.pos; -+ hts_pos_t a_pos = a->core.flag&BAM_FREVERSE ? bam_endpos(a) : a->core.pos; -+ hts_pos_t b_pos = b->core.flag&BAM_FREVERSE ? bam_endpos(b) : b->core.pos; - if (a_pos > b_pos) { - first = b; - second = a; -@@ -228,7 +232,7 @@ - int i; - - for (i = 0; i < b->core.l_qseq; i++) { -- if (qual[i] >= 15) score += qual[i]; -+ if (qual[i] >= MD_MIN_QUALITY) score += qual[i]; - } - - return score; -@@ -252,31 +256,34 @@ - } - - // currently, this function ONLY works if each read has one hit --static int bam_mating_core(samFile *in, samFile *out, int remove_reads, int proper_pair_check, int add_ct, int do_mate_scoring) -+static int bam_mating_core(samFile *in, samFile *out, int remove_reads, int proper_pair_check, int add_ct, int do_mate_scoring, char *arg_list, int no_pg) - { -- bam_hdr_t *header; -+ sam_hdr_t *header; - bam1_t *b[2] = { NULL, NULL }; -- int curr, has_prev, pre_end = 0, cur_end = 0, result; -- kstring_t str; -+ int curr, has_prev, result; -+ hts_pos_t pre_end = 0, cur_end = 0; -+ kstring_t str = KS_INITIALIZE; - -- str.l = str.m = 0; str.s = 0; - header = sam_hdr_read(in); - if (header == NULL) { - fprintf(samtools_stderr, "[bam_mating_core] ERROR: Couldn't read header\n"); - return 1; - } -+ - // Accept unknown, unsorted, or queryname sort order, but error on coordinate sorted. -- if ((header->l_text > 3) && (strncmp(header->text, "@HD", 3) == 0)) { -- char *p, *q; -- p = strstr(header->text, "\tSO:coordinate"); -- q = strchr(header->text, '\n'); -- // Looking for SO:coordinate within the @HD line only -- // (e.g. 
must ignore in a @CO comment line later in header) -- if ((p != 0) && (p < q)) { -- fprintf(samtools_stderr, "[bam_mating_core] ERROR: Coordinate sorted, require grouped/sorted by queryname.\n"); -- goto fail; -- } -+ if (!sam_hdr_find_tag_hd(header, "SO", &str) && str.s && !strcmp(str.s, "coordinate")) { -+ fprintf(samtools_stderr, "[bam_mating_core] ERROR: Coordinate sorted, require grouped/sorted by queryname.\n"); -+ goto fail; - } -+ ks_free(&str); -+ -+ if (!no_pg && sam_hdr_add_pg(header, "samtools", -+ "VN", samtools_version(), -+ arg_list ? "CL": NULL, -+ arg_list ? arg_list : NULL, -+ NULL)) -+ goto fail; -+ - if (sam_hdr_write(out, header) < 0) goto write_fail; - - b[0] = bam_init1(); -@@ -305,7 +312,7 @@ - cur_end = bam_endpos(cur); - - // Check cur_end isn't past the end of the contig we're on, if it is set the UNMAP'd flag -- if (cur_end > (int)header->target_len[cur->core.tid]) cur->core.flag |= BAM_FUNMAP; -+ if (cur_end > sam_hdr_tid2len(header, cur->core.tid)) cur->core.flag |= BAM_FUNMAP; - } - if (has_prev) { // do we have a pair of reads to examine? - if (strcmp(bam_get_qname(cur), bam_get_qname(pre)) == 0) { // identical pair name -@@ -316,7 +323,7 @@ - if (pre->core.tid == cur->core.tid && !(cur->core.flag&(BAM_FUNMAP|BAM_FMUNMAP)) - && !(pre->core.flag&(BAM_FUNMAP|BAM_FMUNMAP))) // if safe set TLEN/ISIZE - { -- uint32_t cur5, pre5; -+ hts_pos_t cur5, pre5; - cur5 = (cur->core.flag&BAM_FREVERSE)? cur_end : cur->core.pos; - pre5 = (pre->core.flag&BAM_FREVERSE)? 
pre_end : pre->core.pos; - cur->core.isize = pre5 - cur5; pre->core.isize = cur5 - pre5; -@@ -380,18 +387,19 @@ - - if (sam_write1(out, header, pre) < 0) goto write_fail; - } -- bam_hdr_destroy(header); -+ sam_hdr_destroy(header); - bam_destroy1(b[0]); - bam_destroy1(b[1]); -- free(str.s); -+ ks_free(&str); - return 0; - - write_fail: - print_error_errno("fixmate", "Couldn't write to output file"); - fail: -- bam_hdr_destroy(header); -+ sam_hdr_destroy(header); - bam_destroy1(b[0]); - bam_destroy1(b[1]); -+ ks_free(&str); - return 1; - } - -@@ -403,9 +411,10 @@ - " -r Remove unmapped reads and secondary alignments\n" - " -p Disable FR proper pair check\n" - " -c Add template cigar ct tag\n" --" -m Add mate score tag\n"); -+" -m Add mate score tag\n" -+" --no-PG do not add a PG line\n"); - -- sam_global_opt_help(where, "-.O..@"); -+ sam_global_opt_help(where, "-.O..@-."); - - fprintf(where, - "\n" -@@ -418,13 +427,15 @@ - { - htsThreadPool p = {NULL, 0}; - samFile *in = NULL, *out = NULL; -- int c, remove_reads = 0, proper_pair_check = 1, add_ct = 0, res = 1, mate_score = 0; -+ int c, remove_reads = 0, proper_pair_check = 1, add_ct = 0, res = 1, mate_score = 0, no_pg = 0; - sam_global_args ga = SAM_GLOBAL_ARGS_INIT; - char wmode[3] = {'w', 'b', 0}; - static const struct option lopts[] = { - SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0, '@'), -+ {"no-PG", no_argument, NULL, 1}, - { NULL, 0, NULL, 0 } - }; -+ char *arg_list = NULL; - - // parse args - if (argc == 1) { usage(samtools_stdout); return 0; } -@@ -434,6 +445,7 @@ - case 'p': proper_pair_check = 0; break; - case 'c': add_ct = 1; break; - case 'm': mate_score = 1; break; -+ case 1: no_pg = 1; break; - default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; - /* else fall-through */ - case '?': usage(samtools_stderr); goto fail; -@@ -441,6 +453,9 @@ - } - if (optind+1 >= argc) { usage(samtools_stderr); goto fail; } - -+ if (!no_pg && !(arg_list = stringify_argv(argc+1, argv-1))) -+ goto fail; -+ - // 
init - if ((in = sam_open_format(argv[optind], "rb", &ga.in)) == NULL) { - print_error_errno("fixmate", "cannot open input file"); -@@ -462,7 +477,7 @@ - } - - // run -- res = bam_mating_core(in, out, remove_reads, proper_pair_check, add_ct, mate_score); -+ res = bam_mating_core(in, out, remove_reads, proper_pair_check, add_ct, mate_score, arg_list, no_pg); - - // cleanup - sam_close(in); -@@ -472,6 +487,7 @@ - } - - if (p.pool) hts_tpool_destroy(p.pool); -+ free(arg_list); - sam_global_args_free(&ga); - return res; - -@@ -479,6 +495,7 @@ - if (in) sam_close(in); - if (out) sam_close(out); - if (p.pool) hts_tpool_destroy(p.pool); -+ free(arg_list); - sam_global_args_free(&ga); - return 1; - } ---- python-pysam.orig/samtools/bam_md.c -+++ python-pysam/samtools/bam_md.c -@@ -1,6 +1,6 @@ - /* bam_md.c -- calmd subcommand. - -- Copyright (C) 2009-2011, 2014-2015 Genome Research Ltd. -+ Copyright (C) 2009-2011, 2014-2015, 2019 Genome Research Ltd. - Portions copyright (C) 2009-2011 Broad Institute. - - Author: Heng Li -@@ -46,12 +46,13 @@ - - int bam_aux_drop_other(bam1_t *b, uint8_t *s); - --void bam_fillmd1_core(bam1_t *b, char *ref, int ref_len, int flag, int max_nm, int quiet_mode) -+void bam_fillmd1_core(bam1_t *b, char *ref, hts_pos_t ref_len, int flag, int max_nm, int quiet_mode) - { - uint8_t *seq = bam_get_seq(b); - uint32_t *cigar = bam_get_cigar(b); - bam1_core_t *c = &b->core; -- int i, x, y, u = 0; -+ int i, y, u = 0; -+ hts_pos_t x; - kstring_t *str; - int32_t old_nm_i = -1, nm = 0; - -@@ -67,7 +68,7 @@ - if (flag&USE_EQUAL) seq[z/2] &= (z&1)? 
0xf0 : 0x0f; - ++u; - } else { -- kputw(u, str); kputc(ref[x+j], str); -+ kputw(u, str); kputc(toupper(ref[x+j]), str); - u = 0; ++nm; - } - } -@@ -77,7 +78,7 @@ - kputw(u, str); kputc('^', str); - for (j = 0; j < l; ++j) { - if (x+j >= ref_len || ref[x+j] == '\0') break; -- kputc(ref[x+j], str); -+ kputc(toupper(ref[x+j]), str); - } - u = 0; - x += j; nm += j; -@@ -176,25 +177,28 @@ - " -A modify the quality string\n" - " -Q use quiet mode to output less debug info to stdout\n" - " -r compute the BQ tag (without -A) or cap baseQ by BAQ (with -A)\n" --" -E extended BAQ for better sensitivity but lower specificity\n"); -+" -E extended BAQ for better sensitivity but lower specificity\n" -+" --no-PG do not add a PG line\n"); - -- sam_global_opt_help(stderr, "-....@"); -+ sam_global_opt_help(stderr, "-....@-."); - return 1; - } - - int bam_fillmd(int argc, char *argv[]) - { -- int c, flt_flag, tid = -2, ret, len, is_bam_out, is_uncompressed, max_nm, is_realn, capQ, baq_flag, quiet_mode; -+ int c, flt_flag, tid = -2, ret, is_bam_out, is_uncompressed, max_nm, is_realn, capQ, baq_flag, quiet_mode, no_pg = 0; -+ hts_pos_t len; - htsThreadPool p = {NULL, 0}; - samFile *fp = NULL, *fpout = NULL; -- bam_hdr_t *header = NULL; -+ sam_hdr_t *header = NULL; - faidx_t *fai = NULL; -- char *ref = NULL, mode_w[8], *ref_file; -+ char *ref = NULL, mode_w[8], *ref_file, *arg_list = NULL; - bam1_t *b = NULL; - sam_global_args ga = SAM_GLOBAL_ARGS_INIT; - - static const struct option lopts[] = { - SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0,'@'), -+ {"no-PG", no_argument, NULL, 1}, - { NULL, 0, NULL, 0 } - }; - -@@ -217,6 +221,7 @@ - case 'A': baq_flag |= 1; break; - case 'E': baq_flag |= 2; break; - case 'Q': quiet_mode = 1; break; -+ case 1: no_pg = 1; break; - default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; - fprintf(stderr, "[bam_fillmd] unrecognized option '-%c'\n\n", c); - /* else fall-through */ -@@ -234,8 +239,13 @@ - return 1; - } - -+ if (!no_pg && !(arg_list = 
stringify_argv(argc+1, argv-1))) { -+ print_error("calmd", "failed to create arg_list"); -+ return 1; -+ } -+ - header = sam_hdr_read(fp); -- if (header == NULL || header->n_targets == 0) { -+ if (header == NULL || sam_hdr_nref(header) == 0) { - fprintf(stderr, "[bam_fillmd] input SAM does not have header. Abort!\n"); - goto fail; - } -@@ -245,6 +255,14 @@ - print_error_errno("calmd", "Failed to open output"); - goto fail; - } -+ if (!no_pg && sam_hdr_add_pg(header, "samtools", -+ "VN", samtools_version(), -+ arg_list ? "CL": NULL, -+ arg_list ? arg_list : NULL, -+ NULL)) { -+ print_error("calmd", "failed to add PG line to header"); -+ goto fail; -+ } - if (sam_hdr_write(fpout, header) < 0) { - print_error_errno("calmd", "Failed to write sam header"); - goto fail; -@@ -276,11 +294,11 @@ - if (b->core.tid >= 0) { - if (tid != b->core.tid) { - free(ref); -- ref = fai_fetch(fai, header->target_name[b->core.tid], &len); -+ ref = fai_fetch64(fai, sam_hdr_tid2name(header, b->core.tid), &len); - tid = b->core.tid; - if (ref == 0) { // FIXME: Should this always be fatal? - fprintf(stderr, "[bam_fillmd] fail to find sequence '%s' in the reference.\n", -- header->target_name[tid]); -+ sam_hdr_tid2name(header, tid)); - if (is_realn || capQ > 10) goto fail; // Would otherwise crash - } - } -@@ -301,8 +319,9 @@ - goto fail; - } - bam_destroy1(b); -- bam_hdr_destroy(header); -+ sam_hdr_destroy(header); - -+ free(arg_list); - free(ref); - fai_destroy(fai); - sam_close(fp); -@@ -315,9 +334,10 @@ - return 0; - - fail: -+ free(arg_list); - free(ref); - if (b) bam_destroy1(b); -- if (header) bam_hdr_destroy(header); -+ if (header) sam_hdr_destroy(header); - if (fai) fai_destroy(fai); - if (fp) sam_close(fp); - if (fpout) sam_close(fpout); ---- python-pysam.orig/samtools/bam_md.c.pysam.c -+++ python-pysam/samtools/bam_md.c.pysam.c -@@ -2,7 +2,7 @@ - - /* bam_md.c -- calmd subcommand. - -- Copyright (C) 2009-2011, 2014-2015 Genome Research Ltd. 
-+ Copyright (C) 2009-2011, 2014-2015, 2019 Genome Research Ltd. - Portions copyright (C) 2009-2011 Broad Institute. - - Author: Heng Li -@@ -48,12 +48,13 @@ - - int bam_aux_drop_other(bam1_t *b, uint8_t *s); - --void bam_fillmd1_core(bam1_t *b, char *ref, int ref_len, int flag, int max_nm, int quiet_mode) -+void bam_fillmd1_core(bam1_t *b, char *ref, hts_pos_t ref_len, int flag, int max_nm, int quiet_mode) - { - uint8_t *seq = bam_get_seq(b); - uint32_t *cigar = bam_get_cigar(b); - bam1_core_t *c = &b->core; -- int i, x, y, u = 0; -+ int i, y, u = 0; -+ hts_pos_t x; - kstring_t *str; - int32_t old_nm_i = -1, nm = 0; - -@@ -69,7 +70,7 @@ - if (flag&USE_EQUAL) seq[z/2] &= (z&1)? 0xf0 : 0x0f; - ++u; - } else { -- kputw(u, str); kputc(ref[x+j], str); -+ kputw(u, str); kputc(toupper(ref[x+j]), str); - u = 0; ++nm; - } - } -@@ -79,7 +80,7 @@ - kputw(u, str); kputc('^', str); - for (j = 0; j < l; ++j) { - if (x+j >= ref_len || ref[x+j] == '\0') break; -- kputc(ref[x+j], str); -+ kputc(toupper(ref[x+j]), str); - } - u = 0; - x += j; nm += j; -@@ -178,25 +179,28 @@ - " -A modify the quality string\n" - " -Q use quiet mode to output less debug info to samtools_stdout\n" - " -r compute the BQ tag (without -A) or cap baseQ by BAQ (with -A)\n" --" -E extended BAQ for better sensitivity but lower specificity\n"); -+" -E extended BAQ for better sensitivity but lower specificity\n" -+" --no-PG do not add a PG line\n"); - -- sam_global_opt_help(samtools_stderr, "-....@"); -+ sam_global_opt_help(samtools_stderr, "-....@-."); - return 1; - } - - int bam_fillmd(int argc, char *argv[]) - { -- int c, flt_flag, tid = -2, ret, len, is_bam_out, is_uncompressed, max_nm, is_realn, capQ, baq_flag, quiet_mode; -+ int c, flt_flag, tid = -2, ret, is_bam_out, is_uncompressed, max_nm, is_realn, capQ, baq_flag, quiet_mode, no_pg = 0; -+ hts_pos_t len; - htsThreadPool p = {NULL, 0}; - samFile *fp = NULL, *fpout = NULL; -- bam_hdr_t *header = NULL; -+ sam_hdr_t *header = NULL; - faidx_t *fai = NULL; 
-- char *ref = NULL, mode_w[8], *ref_file; -+ char *ref = NULL, mode_w[8], *ref_file, *arg_list = NULL; - bam1_t *b = NULL; - sam_global_args ga = SAM_GLOBAL_ARGS_INIT; - - static const struct option lopts[] = { - SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0,'@'), -+ {"no-PG", no_argument, NULL, 1}, - { NULL, 0, NULL, 0 } - }; - -@@ -219,6 +223,7 @@ - case 'A': baq_flag |= 1; break; - case 'E': baq_flag |= 2; break; - case 'Q': quiet_mode = 1; break; -+ case 1: no_pg = 1; break; - default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; - fprintf(samtools_stderr, "[bam_fillmd] unrecognized option '-%c'\n\n", c); - /* else fall-through */ -@@ -236,8 +241,13 @@ - return 1; - } - -+ if (!no_pg && !(arg_list = stringify_argv(argc+1, argv-1))) { -+ print_error("calmd", "failed to create arg_list"); -+ return 1; -+ } -+ - header = sam_hdr_read(fp); -- if (header == NULL || header->n_targets == 0) { -+ if (header == NULL || sam_hdr_nref(header) == 0) { - fprintf(samtools_stderr, "[bam_fillmd] input SAM does not have header. Abort!\n"); - goto fail; - } -@@ -247,6 +257,14 @@ - print_error_errno("calmd", "Failed to open output"); - goto fail; - } -+ if (!no_pg && sam_hdr_add_pg(header, "samtools", -+ "VN", samtools_version(), -+ arg_list ? "CL": NULL, -+ arg_list ? arg_list : NULL, -+ NULL)) { -+ print_error("calmd", "failed to add PG line to header"); -+ goto fail; -+ } - if (sam_hdr_write(fpout, header) < 0) { - print_error_errno("calmd", "Failed to write sam header"); - goto fail; -@@ -278,11 +296,11 @@ - if (b->core.tid >= 0) { - if (tid != b->core.tid) { - free(ref); -- ref = fai_fetch(fai, header->target_name[b->core.tid], &len); -+ ref = fai_fetch64(fai, sam_hdr_tid2name(header, b->core.tid), &len); - tid = b->core.tid; - if (ref == 0) { // FIXME: Should this always be fatal? 
- fprintf(samtools_stderr, "[bam_fillmd] fail to find sequence '%s' in the reference.\n", -- header->target_name[tid]); -+ sam_hdr_tid2name(header, tid)); - if (is_realn || capQ > 10) goto fail; // Would otherwise crash - } - } -@@ -303,8 +321,9 @@ - goto fail; - } - bam_destroy1(b); -- bam_hdr_destroy(header); -+ sam_hdr_destroy(header); - -+ free(arg_list); - free(ref); - fai_destroy(fai); - sam_close(fp); -@@ -317,9 +336,10 @@ - return 0; - - fail: -+ free(arg_list); - free(ref); - if (b) bam_destroy1(b); -- if (header) bam_hdr_destroy(header); -+ if (header) sam_hdr_destroy(header); - if (fai) fai_destroy(fai); - if (fp) sam_close(fp); - if (fpout) sam_close(fpout); ---- python-pysam.orig/samtools/bam_plbuf.c -+++ python-pysam/samtools/bam_plbuf.c -@@ -58,11 +58,12 @@ - - int bam_plbuf_push(const bam1_t *b, bam_plbuf_t *buf) - { -- int ret, n_plp, tid, pos; -+ int ret, n_plp, tid; -+ hts_pos_t pos; - const bam_pileup1_t *plp; - ret = bam_plp_push(buf->iter, b); - if (ret < 0) return ret; -- while ((plp = bam_plp_next(buf->iter, &tid, &pos, &n_plp)) != 0) -+ while ((plp = bam_plp64_next(buf->iter, &tid, &pos, &n_plp)) != 0) - buf->func(tid, pos, n_plp, plp, buf->data); - return 0; - } ---- python-pysam.orig/samtools/bam_plbuf.c.pysam.c -+++ python-pysam/samtools/bam_plbuf.c.pysam.c -@@ -60,11 +60,12 @@ - - int bam_plbuf_push(const bam1_t *b, bam_plbuf_t *buf) - { -- int ret, n_plp, tid, pos; -+ int ret, n_plp, tid; -+ hts_pos_t pos; - const bam_pileup1_t *plp; - ret = bam_plp_push(buf->iter, b); - if (ret < 0) return ret; -- while ((plp = bam_plp_next(buf->iter, &tid, &pos, &n_plp)) != 0) -+ while ((plp = bam_plp64_next(buf->iter, &tid, &pos, &n_plp)) != 0) - buf->func(tid, pos, n_plp, plp, buf->data); - return 0; - } ---- python-pysam.orig/samtools/bam_plbuf.h -+++ python-pysam/samtools/bam_plbuf.h -@@ -29,7 +29,7 @@ - - #ifndef BAM_PILEUP_F_DEFINED - #define BAM_PILEUP_F_DEFINED --typedef int (*bam_pileup_f)(uint32_t tid, uint32_t pos, int n, const 
bam_pileup1_t *pl, void *data); -+typedef int (*bam_pileup_f)(uint32_t tid, hts_pos_t pos, int n, const bam_pileup1_t *pl, void *data); - #endif //BAM_PILEUP_F_DEFINED - - typedef struct { ---- python-pysam.orig/samtools/bam_plcmd.c -+++ python-pysam/samtools/bam_plcmd.c -@@ -1,6 +1,6 @@ - /* bam_plcmd.c -- mpileup subcommand. - -- Copyright (C) 2008-2015 Genome Research Ltd. -+ Copyright (C) 2008-2015, 2019 Genome Research Ltd. - Portions copyright (C) 2009-2012 Broad Institute. - - Author: Heng Li -@@ -36,14 +36,19 @@ - #include - #include - #include -+#include - #include - #include - #include -+#include - #include --#include "sam_header.h" - #include "samtools.h" -+#include "bedidx.h" - #include "sam_opts.h" - -+#define dummy_free(p) -+KLIST_INIT(auxlist, char *, dummy_free) -+ - static inline int printw(int c, FILE *fp) - { - char buf[16]; -@@ -59,7 +64,9 @@ - return 0; - } - --static inline void pileup_seq(FILE *fp, const bam_pileup1_t *p, int pos, int ref_len, const char *ref) -+static inline int pileup_seq(FILE *fp, const bam_pileup1_t *p, hts_pos_t pos, -+ hts_pos_t ref_len, const char *ref, kstring_t *ks, -+ int rev_del) - { - int j; - if (p->is_head) { -@@ -79,21 +86,31 @@ - else c = bam_is_rev(p->b)? tolower(c) : toupper(c); - } - putc(c, fp); -- } else putc(p->is_refskip? (bam_is_rev(p->b)? '<' : '>') : '*', fp); -+ } else putc(p->is_refskip? (bam_is_rev(p->b)? '<' : '>') : ((bam_is_rev(p->b) && rev_del) ? '#' : '*'), fp); -+ int del_len = -p->indel; - if (p->indel > 0) { -- putc('+', fp); printw(p->indel, fp); -- for (j = 1; j <= p->indel; ++j) { -- int c = seq_nt16_str[bam_seqi(bam_get_seq(p->b), p->qpos + j)]; -- putc(bam_is_rev(p->b)? tolower(c) : toupper(c), fp); -+ int len = bam_plp_insertion(p, ks, &del_len); -+ if (len < 0) -+ return -1; -+ putc('+', fp); printw(len, fp); -+ if (bam_is_rev(p->b)) { -+ char pad = rev_del ? '#' : '*'; -+ for (j = 0; j < len; j++) -+ putc(ks->s[j] != '*' ? 
tolower(ks->s[j]) : pad, fp); -+ } else { -+ for (j = 0; j < len; j++) -+ putc(toupper(ks->s[j]), fp); - } -- } else if (p->indel < 0) { -- printw(p->indel, fp); -- for (j = 1; j <= -p->indel; ++j) { -+ } -+ if (del_len > 0) { -+ printw(-del_len, fp); -+ for (j = 1; j <= del_len; ++j) { - int c = (ref && (int)pos+j < ref_len)? ref[pos+j] : 'N'; - putc(bam_is_rev(p->b)? tolower(c) : toupper(c), fp); - } - } - if (p->is_tail) putc('$', fp); -+ return 0; - } - - #include -@@ -109,36 +126,43 @@ - #define MPLP_REDO_BAQ (1<<6) - #define MPLP_ILLUMINA13 (1<<7) - #define MPLP_IGNORE_RG (1<<8) --#define MPLP_PRINT_POS (1<<9) --#define MPLP_PRINT_MAPQ (1<<10) -+#define MPLP_PRINT_QPOS (1<<9) - #define MPLP_PER_SAMPLE (1<<11) - #define MPLP_SMART_OVERLAPS (1<<12) -+ - #define MPLP_PRINT_QNAME (1<<13) -+#define MPLP_PRINT_FLAG (1<<14) -+#define MPLP_PRINT_RNAME (1<<15) -+#define MPLP_PRINT_POS (1<<16) -+#define MPLP_PRINT_MAPQ (1<<17) -+#define MPLP_PRINT_CIGAR (1<<18) -+#define MPLP_PRINT_RNEXT (1<<19) -+#define MPLP_PRINT_PNEXT (1<<20) -+#define MPLP_PRINT_TLEN (1<<21) -+#define MPLP_PRINT_SEQ (1<<22) -+#define MPLP_PRINT_QUAL (1<<23) - - #define MPLP_MAX_DEPTH 8000 - #define MPLP_MAX_INDEL_DEPTH 250 - --void *bed_read(const char *fn); --void bed_destroy(void *_h); --int bed_overlap(const void *_h, const char *chr, int beg, int end); -- - typedef struct { -- int min_mq, flag, min_baseQ, capQ_thres, max_depth, max_indel_depth, fmt_flag, all; -+ int min_mq, flag, min_baseQ, capQ_thres, max_depth, max_indel_depth, fmt_flag, all, rev_del; - int rflag_require, rflag_filter; - int openQ, extQ, tandemQ, min_support; // for indels - double min_frac; // for indels - char *reg, *pl_list, *fai_fname, *output_fname; - faidx_t *fai; -- void *bed, *rghash; -+ void *bed, *rghash, *auxlist; - int argc; - char **argv; -+ char sep, empty; - sam_global_args ga; - } mplp_conf_t; - - typedef struct { - char *ref[2]; - int ref_id[2]; -- int ref_len[2]; -+ hts_pos_t ref_len[2]; - } mplp_ref_t; - - 
#define MPLP_REF_INIT {{NULL,NULL},{-1,-1},{0,0}} -@@ -146,7 +170,7 @@ - typedef struct { - samFile *fp; - hts_itr_t *iter; -- bam_hdr_t *h; -+ sam_hdr_t *h; - mplp_ref_t *ref; - const mplp_conf_t *conf; - } mplp_aux_t; -@@ -157,7 +181,54 @@ - bam_pileup1_t **plp; - } mplp_pileup_t; - --static int mplp_get_ref(mplp_aux_t *ma, int tid, char **ref, int *ref_len) { -+static int build_auxlist(mplp_conf_t *conf, char *optstring) { -+ if (!optstring) -+ return 0; -+ -+ void *colhash = khash_str2int_init(); -+ if (!colhash) -+ return 1; -+ -+ struct active_cols { -+ char *name; -+ int supported; -+ }; -+ -+ const struct active_cols colnames[11] = { -+ {"QNAME", 1}, {"FLAG", 1}, {"RNAME", 1}, {"POS", 1}, {"MAPQ", 1}, {"CIGAR", 0}, {"RNEXT", 1}, {"PNEXT", 1}, {"TLEN", 0}, {"SEQ", 0}, {"QUAL", 0} -+ }; -+ -+ int i, f = MPLP_PRINT_QNAME, colno = 11; -+ for (i = 0; i < colno; i++, f <<= 1) -+ if (colnames[i].supported) -+ khash_str2int_set(colhash, colnames[i].name, f); -+ -+ conf->auxlist = kl_init(auxlist); -+ if (!conf->auxlist) -+ return 1; -+ -+ char *save_p; -+ char *tag = strtok_r(optstring, ",", &save_p); -+ while (tag) { -+ if (khash_str2int_get(colhash, tag, &f) == 0) { -+ conf->flag |= f; -+ } else { -+ if (strlen(tag) != 2) { -+ fprintf(stderr, "[%s] tag '%s' has more than two characters or not supported\n", __func__, tag); -+ } else { -+ char **tag_p = kl_pushp(auxlist, conf->auxlist); -+ *tag_p = tag; -+ } -+ } -+ tag = strtok_r(NULL, ",", &save_p); -+ } -+ -+ khash_str2int_destroy(colhash); -+ -+ return 0; -+} -+ -+static int mplp_get_ref(mplp_aux_t *ma, int tid, char **ref, hts_pos_t *ref_len) { - mplp_ref_t *r = ma->ref; - - //printf("get ref %d {%d/%p, %d/%p}\n", tid, r->ref_id[0], r->ref[0], r->ref_id[1], r->ref[1]); -@@ -177,9 +248,10 @@ - } - if (tid == r->ref_id[1]) { - // Last, swap over -- int tmp; -- tmp = r->ref_id[0]; r->ref_id[0] = r->ref_id[1]; r->ref_id[1] = tmp; -- tmp = r->ref_len[0]; r->ref_len[0] = r->ref_len[1]; r->ref_len[1] = tmp; -+ int 
tmp_id; -+ hts_pos_t tmp_len; -+ tmp_id = r->ref_id[0]; r->ref_id[0] = r->ref_id[1]; r->ref_id[1] = tmp_id; -+ tmp_len = r->ref_len[0]; r->ref_len[0] = r->ref_len[1]; r->ref_len[1] = tmp_len; - - char *tc; - tc = r->ref[0]; r->ref[0] = r->ref[1]; r->ref[1] = tc; -@@ -195,10 +267,10 @@ - r->ref_len[1] = r->ref_len[0]; - - r->ref_id[0] = tid; -- r->ref[0] = faidx_fetch_seq(ma->conf->fai, -- ma->h->target_name[r->ref_id[0]], -+ r->ref[0] = faidx_fetch_seq64(ma->conf->fai, -+ sam_hdr_tid2name(ma->h, r->ref_id[0]), - 0, -- INT_MAX, -+ HTS_POS_MAX, - &r->ref_len[0]); - - if (!r->ref[0]) { -@@ -216,15 +288,25 @@ - - static void - print_empty_pileup(FILE *fp, const mplp_conf_t *conf, const char *tname, -- int pos, int n, const char *ref, int ref_len) -+ hts_pos_t pos, int n, const char *ref, hts_pos_t ref_len) - { - int i; -- fprintf(fp, "%s\t%d\t%c", tname, pos+1, (ref && pos < ref_len)? ref[pos] : 'N'); -+ fprintf(fp, "%s\t%"PRIhts_pos"\t%c", tname, pos+1, (ref && pos < ref_len)? ref[pos] : 'N'); - for (i = 0; i < n; ++i) { - fputs("\t0\t*\t*", fp); -- if (conf->flag & MPLP_PRINT_MAPQ) fputs("\t*", fp); -- if (conf->flag & MPLP_PRINT_POS) fputs("\t*", fp); -- if (conf->flag & MPLP_PRINT_QNAME) fputs("\t*", fp); -+ if (conf->flag & MPLP_PRINT_QPOS) -+ fputs("\t*", fp); -+ int flag_value = MPLP_PRINT_QNAME; -+ while(flag_value < MPLP_PRINT_QUAL + 1) { -+ if (conf->flag & flag_value) -+ fputs("\t*", fp); -+ flag_value <<= 1; -+ } -+ if (conf->auxlist) { -+ int t = 0; -+ while(t++ < ((klist_t(auxlist) *)conf->auxlist)->size) -+ fputs("\t*", fp); -+ } - } - putc('\n', fp); - } -@@ -233,7 +315,9 @@ - { - char *ref; - mplp_aux_t *ma = (mplp_aux_t*)data; -- int ret, skip = 0, ref_len; -+ int ret, skip = 0; -+ hts_pos_t ref_len; -+ - do { - int has_ref; - ret = ma->iter? 
sam_itr_next(ma->fp, ma->iter, b) : sam_read1(ma->fp, ma->h, b); -@@ -247,7 +331,7 @@ - if (ma->conf->rflag_require && !(ma->conf->rflag_require&b->core.flag)) { skip = 1; continue; } - if (ma->conf->rflag_filter && ma->conf->rflag_filter&b->core.flag) { skip = 1; continue; } - if (ma->conf->bed && ma->conf->all == 0) { // test overlap -- skip = !bed_overlap(ma->conf->bed, ma->h->target_name[b->core.tid], b->core.pos, bam_endpos(b)); -+ skip = !bed_overlap(ma->conf->bed, sam_hdr_tid2name(ma->h, b->core.tid), b->core.pos, bam_endpos(b)); - if (skip) continue; - } - if (ma->conf->rghash) { // exclude read groups -@@ -265,8 +349,8 @@ - if (ma->conf->fai && b->core.tid >= 0) { - has_ref = mplp_get_ref(ma, b->core.tid, &ref, &ref_len); - if (has_ref && ref_len <= b->core.pos) { // exclude reads outside of the reference sequence -- fprintf(stderr,"[%s] Skipping because %d is outside of %d [ref:%d]\n", -- __func__, b->core.pos, ref_len, b->core.tid); -+ fprintf(stderr,"[%s] Skipping because %"PRIhts_pos" is outside of %"PRIhts_pos" [ref:%d]\n", -+ __func__, (int64_t) b->core.pos, ref_len, b->core.tid); - skip = 1; - continue; - } -@@ -319,17 +403,19 @@ - * @param conf configuration for this pileup - * @param n number of files specified in fn - * @param fn filenames -+ * @param fn_idx index filenames - */ --static int mpileup(mplp_conf_t *conf, int n, char **fn) -+static int mpileup(mplp_conf_t *conf, int n, char **fn, char **fn_idx) - { - extern void *bcf_call_add_rg(void *rghash, const char *hdtext, const char *list); - extern void bcf_call_del_rghash(void *rghash); - mplp_aux_t **data; -- int i, tid, pos, *n_plp, beg0 = 0, end0 = INT_MAX, tid0 = 0, ref_len, max_depth, max_indel_depth; -+ int i, tid, *n_plp, tid0 = 0, max_depth, max_indel_depth; -+ hts_pos_t pos, beg0 = 0, end0 = HTS_POS_MAX, ref_len; - const bam_pileup1_t **plp; - mplp_ref_t mp_ref = MPLP_REF_INIT; - bam_mplp_t iter; -- bam_hdr_t *h = NULL; /* header of first file in input list */ -+ sam_hdr_t *h = 
NULL; /* header of first file in input list */ - char *ref; - void *rghash = NULL; - FILE *pileup_fp = NULL; -@@ -359,7 +445,7 @@ - - // read the header of each file in the list and initialize data - for (i = 0; i < n; ++i) { -- bam_hdr_t *h_tmp; -+ sam_hdr_t *h_tmp; - data[i] = calloc(1, sizeof(mplp_aux_t)); - data[i]->fp = sam_open_format(fn[i], "rb", &conf->ga.in); - if ( !data[i]->fp ) -@@ -383,13 +469,20 @@ - fprintf(stderr,"[%s] fail to read the header of %s\n", __func__, fn[i]); - exit(EXIT_FAILURE); - } -- bam_smpl_add(sm, fn[i], (conf->flag&MPLP_IGNORE_RG)? 0 : h_tmp->text); -+ bam_smpl_add(sm, fn[i], (conf->flag&MPLP_IGNORE_RG)? 0 : sam_hdr_str(h_tmp)); - if (conf->flag & MPLP_BCF) { - // Collect read group IDs with PL (platform) listed in pl_list (note: fragile, strstr search) -- rghash = bcf_call_add_rg(rghash, h_tmp->text, conf->pl_list); -+ rghash = bcf_call_add_rg(rghash, sam_hdr_str(h_tmp), conf->pl_list); - } - if (conf->reg) { -- hts_idx_t *idx = sam_index_load(data[i]->fp, fn[i]); -+ hts_idx_t *idx = NULL; -+ // If index filename has not been specfied, look in BAM folder -+ if (fn_idx != NULL) { -+ idx = sam_index_load2(data[i]->fp, fn[i], fn_idx[i]); -+ } else { -+ idx = sam_index_load(data[i]->fp, fn[i]); -+ } -+ - if (idx == NULL) { - fprintf(stderr, "[%s] fail to load index for %s\n", __func__, fn[i]); - exit(EXIT_FAILURE); -@@ -407,7 +500,7 @@ - if (i == 0) h = data[i]->h = h_tmp; // save the header of the first file - else { - // FIXME: check consistency between h and h_tmp -- bam_hdr_destroy(h_tmp); -+ sam_hdr_destroy(h_tmp); - - // we store only the first file's header; it's (alleged to be) - // compatible with the i-th file's target_name lookup needs -@@ -459,10 +552,10 @@ - - // Translate BAM @SQ tags to BCF ##contig tags - // todo: use/write new BAM header manipulation routines, fill also UR, M5 -- for (i=0; in_targets; i++) -+ for (i=0; i < sam_hdr_nref(h); i++) - { - str.l = 0; -- ksprintf(&str, "##contig=", h->target_name[i], 
h->target_len[i]); -+ ksprintf(&str, "##contig=", sam_hdr_tid2name(h, i), (int64_t) sam_hdr_tid2len(h, i)); - bcf_hdr_append(bcf_hdr, str.s); - } - free(str.s); -@@ -515,7 +608,11 @@ - for (i=0; in; i++) - bcf_hdr_add_sample(bcf_hdr, sm->smpl[i]); - bcf_hdr_add_sample(bcf_hdr, NULL); -- bcf_hdr_write(bcf_fp, bcf_hdr); -+ if (bcf_hdr_write(bcf_fp, bcf_hdr) != 0) { -+ print_error_errno("mpileup", "Failed to write VCF/BCF header to \"%s\"", -+ conf->output_fname? conf->output_fname : "standard output"); -+ exit(EXIT_FAILURE); -+ } - // End of BCF header creation - - // Initialise the calling algorithm -@@ -574,16 +671,17 @@ - bam_mplp_set_maxcnt(iter, max_depth); - bcf1_t *bcf_rec = bcf_init1(); - int ret; -- int last_tid = -1, last_pos = -1; -+ int last_tid = -1; -+ hts_pos_t last_pos = -1; - - // begin pileup -- while ( (ret=bam_mplp_auto(iter, &tid, &pos, n_plp, plp)) > 0) { -+ while ( (ret=bam_mplp64_auto(iter, &tid, &pos, n_plp, plp)) > 0) { - if (conf->reg && (pos < beg0 || pos >= end0)) continue; // out of the region requested - mplp_get_ref(data[0], tid, &ref, &ref_len); - //printf("tid=%d len=%d ref=%p/%s\n", tid, ref_len, ref, ref); - if (conf->flag & MPLP_BCF) { - int total_depth, _ref0, ref16; -- if (conf->bed && tid >= 0 && !bed_overlap(conf->bed, h->target_name[tid], pos, pos+1)) continue; -+ if (conf->bed && tid >= 0 && !bed_overlap(conf->bed, sam_hdr_tid2name(h, tid), pos, pos+1)) continue; - for (i = total_depth = 0; i < n; ++i) total_depth += n_plp[i]; - group_smpl(&gplp, sm, &buf, n, fn, n_plp, plp, conf->flag & MPLP_IGNORE_RG); - _ref0 = (ref && pos < ref_len)? 
ref[pos] : 'N'; -@@ -595,7 +693,11 @@ - bcf_call_combine(gplp.n, bcr, bca, ref16, &bc); - bcf_clear1(bcf_rec); - bcf_call2bcf(&bc, bcf_rec, bcr, conf->fmt_flag, 0, 0); -- bcf_write1(bcf_fp, bcf_hdr, bcf_rec); -+ if (bcf_write1(bcf_fp, bcf_hdr, bcf_rec) != 0) { -+ print_error_errno("mpileup", "Failed to write VCF/BCF record to \"%s\"", -+ conf->output_fname?conf->output_fname:"standard output"); -+ exit(EXIT_FAILURE); -+ } - // call indels; todo: subsampling with total_depth>max_indel_depth instead of ignoring? - if (!(conf->flag&MPLP_NO_INDEL) && total_depth < max_indel_depth && bcf_call_gap_prep(gplp.n, gplp.n_plp, gplp.plp, pos, bca, ref, rghash) >= 0) - { -@@ -605,7 +707,11 @@ - if (bcf_call_combine(gplp.n, bcr, bca, -1, &bc) >= 0) { - bcf_clear1(bcf_rec); - bcf_call2bcf(&bc, bcf_rec, bcr, conf->fmt_flag, bca, ref); -- bcf_write1(bcf_fp, bcf_hdr, bcf_rec); -+ if (bcf_write1(bcf_fp, bcf_hdr, bcf_rec) != 0) { -+ print_error_errno("mpileup", "Failed to write VCF/BCF record to \"%s\"", -+ conf->output_fname?conf->output_fname:"standard output"); -+ exit(EXIT_FAILURE); -+ } - } - } - } else { -@@ -613,10 +719,10 @@ - // Deal with missing portions of previous tids - while (tid > last_tid) { - if (last_tid >= 0 && !conf->reg) { -- while (++last_pos < h->target_len[last_tid]) { -- if (conf->bed && bed_overlap(conf->bed, h->target_name[last_tid], last_pos, last_pos + 1) == 0) -+ while (++last_pos < sam_hdr_tid2len(h, last_tid)) { -+ if (conf->bed && bed_overlap(conf->bed, sam_hdr_tid2name(h, last_tid), last_pos, last_pos + 1) == 0) - continue; -- print_empty_pileup(pileup_fp, conf, h->target_name[last_tid], last_pos, n, ref, ref_len); -+ print_empty_pileup(pileup_fp, conf, sam_hdr_tid2name(h, last_tid), last_pos, n, ref, ref_len); - } - } - last_tid++; -@@ -629,16 +735,16 @@ - // Deal with missing portion of current tid - while (++last_pos < pos) { - if (conf->reg && last_pos < beg0) continue; // out of range; skip -- if (conf->bed && bed_overlap(conf->bed, 
h->target_name[tid], last_pos, last_pos + 1) == 0) -+ if (conf->bed && bed_overlap(conf->bed, sam_hdr_tid2name(h, tid), last_pos, last_pos + 1) == 0) - continue; -- print_empty_pileup(pileup_fp, conf, h->target_name[tid], last_pos, n, ref, ref_len); -+ print_empty_pileup(pileup_fp, conf, sam_hdr_tid2name(h, tid), last_pos, n, ref, ref_len); - } - last_tid = tid; - last_pos = pos; - } -- if (conf->bed && tid >= 0 && !bed_overlap(conf->bed, h->target_name[tid], pos, pos+1)) continue; -+ if (conf->bed && tid >= 0 && !bed_overlap(conf->bed, sam_hdr_tid2name(h, tid), pos, pos+1)) continue; - -- fprintf(pileup_fp, "%s\t%d\t%c", h->target_name[tid], pos + 1, (ref && pos < ref_len)? ref[pos] : 'N'); -+ fprintf(pileup_fp, "%s\t%"PRIhts_pos"\t%c", sam_hdr_tid2name(h, tid), pos + 1, (ref && pos < ref_len)? ref[pos] : 'N'); - for (i = 0; i < n; ++i) { - int j, cnt; - for (j = cnt = 0; j < n_plp[i]; ++j) { -@@ -651,22 +757,40 @@ - fprintf(pileup_fp, "\t%d\t", cnt); - if (n_plp[i] == 0) { - fputs("*\t*", pileup_fp); -- if (conf->flag & MPLP_PRINT_MAPQ) fputs("\t*", pileup_fp); -- if (conf->flag & MPLP_PRINT_POS) fputs("\t*", pileup_fp); -- if (conf->flag & MPLP_PRINT_QNAME) fputs("\t*", pileup_fp); -+ if (conf->flag & MPLP_PRINT_QPOS) -+ fputs("\t*", pileup_fp); -+ int flag_value = MPLP_PRINT_QNAME; -+ while(flag_value < MPLP_PRINT_QUAL + 1) { -+ if (conf->flag & flag_value) -+ fputs("\t*", pileup_fp); -+ flag_value <<= 1; -+ } -+ if (conf->auxlist) { -+ int t = 0; -+ while(t++ < ((klist_t(auxlist) *)conf->auxlist)->size) -+ fputs("\t*", pileup_fp); -+ } - } else { - int n = 0; -+ kstring_t ks = KS_INITIALIZE; - for (j = 0; j < n_plp[i]; ++j) { - const bam_pileup1_t *p = plp[i] + j; - int c = p->qpos < p->b->core.l_qseq - ? 
bam_get_qual(p->b)[p->qpos] - : 0; -- if (c >= conf->min_baseQ) -- n++, pileup_seq(pileup_fp, plp[i] + j, pos, ref_len, ref); -+ if (c >= conf->min_baseQ) { -+ n++; -+ if (pileup_seq(pileup_fp, plp[i] + j, pos, ref_len, ref, &ks, conf->rev_del) < 0) { -+ ret = 1; -+ goto fail; -+ } -+ } - } - if (!n) putc('*', pileup_fp); - -+ /* Print base qualities */ - n = 0; -+ ks_free(&ks); - putc('\t', pileup_fp); - for (j = 0; j < n_plp[i]; ++j) { - const bam_pileup1_t *p = plp[i] + j; -@@ -681,55 +805,124 @@ - } - if (!n) putc('*', pileup_fp); - -- if (conf->flag & MPLP_PRINT_MAPQ) { -+ /* Print mpileup positions */ -+ if (conf->flag & MPLP_PRINT_QPOS) { - n = 0; - putc('\t', pileup_fp); - for (j = 0; j < n_plp[i]; ++j) { - const bam_pileup1_t *p = plp[i] + j; - int c = p->qpos < p->b->core.l_qseq -- ? bam_get_qual(p->b)[p->qpos] -- : 0; -+ ? bam_get_qual(p->b)[p->qpos] -+ : 0; - if ( c < conf->min_baseQ ) continue; -- c = plp[i][j].b->core.qual + 33; -- if (c > 126) c = 126; -- putc(c, pileup_fp); -+ if (n > 0) putc(',', pileup_fp); - n++; -+ fprintf(pileup_fp, "%d", p->qpos + 1); - } - if (!n) putc('*', pileup_fp); - } - -- if (conf->flag & MPLP_PRINT_POS) { -- n = 0; -- putc('\t', pileup_fp); -- for (j = 0; j < n_plp[i]; ++j) { -- const bam_pileup1_t *p = plp[i] + j; -- int c = p->qpos < p->b->core.l_qseq -- ? bam_get_qual(p->b)[p->qpos] -- : 0; -- if ( c < conf->min_baseQ ) continue; -- -- if (n > 0) putc(',', pileup_fp); -- fprintf(pileup_fp, "%d", plp[i][j].qpos + 1); // FIXME: printf() is very slow... -- n++; -+ /* Print selected columns */ -+ int flag_value = MPLP_PRINT_QNAME; -+ while(flag_value < MPLP_PRINT_QUAL + 1) { -+ if (conf->flag & flag_value) { -+ n = 0; -+ putc('\t', pileup_fp); -+ for (j = 0; j < n_plp[i]; ++j) { -+ const bam_pileup1_t *p = &plp[i][j]; -+ int c = p->qpos < p->b->core.l_qseq -+ ? 
bam_get_qual(p->b)[p->qpos] -+ : 0; -+ if ( c < conf->min_baseQ ) continue; -+ if (n > 0 && flag_value != MPLP_PRINT_MAPQ) putc(',', pileup_fp); -+ n++; -+ -+ switch (flag_value) { -+ case MPLP_PRINT_QNAME: -+ fputs(bam_get_qname(p->b), pileup_fp); -+ break; -+ case MPLP_PRINT_FLAG: -+ fprintf(pileup_fp, "%d", p->b->core.flag); -+ break; -+ case MPLP_PRINT_RNAME: -+ if (p->b->core.tid >= 0) -+ fputs(sam_hdr_tid2name(h, p->b->core.tid), pileup_fp); -+ else -+ putc('*', pileup_fp); -+ break; -+ case MPLP_PRINT_POS: -+ fprintf(pileup_fp, "%"PRId64, (int64_t) p->b->core.pos + 1); -+ break; -+ case MPLP_PRINT_MAPQ: -+ c = p->b->core.qual + 33; -+ if (c > 126) c = 126; -+ putc(c, pileup_fp); -+ break; -+ case MPLP_PRINT_RNEXT: -+ if (p->b->core.mtid >= 0) -+ fputs(sam_hdr_tid2name(h, p->b->core.mtid), pileup_fp); -+ else -+ putc('*', pileup_fp); -+ break; -+ case MPLP_PRINT_PNEXT: -+ fprintf(pileup_fp, "%"PRId64, (int64_t) p->b->core.mpos + 1); -+ break; -+ } -+ } -+ if (!n) putc('*', pileup_fp); - } -- if (!n) putc('*', pileup_fp); -+ flag_value <<= 1; - } - -- if (conf->flag & MPLP_PRINT_QNAME) { -- n = 0; -- putc('\t', pileup_fp); -- for (j = 0; j < n_plp[i]; ++j) { -- const bam_pileup1_t *p = &plp[i][j]; -- int c = p->qpos < p->b->core.l_qseq -- ? bam_get_qual(p->b)[p->qpos] -- : 0; -- if ( c < conf->min_baseQ ) continue; -- -- if (n > 0) putc(',', pileup_fp); -- fputs(bam_get_qname(p->b), pileup_fp); -- n++; -+ /* Print selected tags */ -+ klist_t(auxlist) *auxlist_p = ((klist_t(auxlist) *)conf->auxlist); -+ if (auxlist_p && auxlist_p->size) { -+ kliter_t(auxlist) *aux; -+ for (aux = kl_begin(auxlist_p); aux != kl_end(auxlist_p); aux = kl_next(aux)) { -+ n = 0; -+ putc('\t', pileup_fp); -+ for (j = 0; j < n_plp[i]; ++j) { -+ const bam_pileup1_t *p = &plp[i][j]; -+ int c = p->qpos < p->b->core.l_qseq -+ ? 
bam_get_qual(p->b)[p->qpos] -+ : 0; -+ if ( c < conf->min_baseQ ) continue; -+ -+ if (n > 0) putc(conf->sep, pileup_fp); -+ n++; -+ uint8_t* tag_u = bam_aux_get(p->b, kl_val(aux)); -+ if (!tag_u) { -+ putc(conf->empty , pileup_fp); -+ continue; -+ } -+ -+ /* Tag value is string */ -+ if (*tag_u == 'Z' || *tag_u == 'H') { -+ char *tag_s = bam_aux2Z(tag_u); -+ if (!tag_s) continue; -+ fputs(tag_s, pileup_fp); -+ } -+ -+ /* Tag value is integer */ -+ if (*tag_u == 'I' || *tag_u == 'i' || *tag_u == 'C' || *tag_u == 'c' || *tag_u == 'S' || *tag_u == 's') { -+ int64_t tag_i = bam_aux2i(tag_u); -+ fprintf(pileup_fp, "%" PRId64 "", tag_i); -+ } -+ -+ /* Tag value is float */ -+ if (*tag_u == 'd' || *tag_u == 'f') { -+ double tag_f = bam_aux2f(tag_u); -+ fprintf(pileup_fp, "%lf", tag_f); -+ } -+ -+ /* Tag value is character */ -+ if (*tag_u == 'A') { -+ char tag_c = bam_aux2A(tag_u); -+ putc(tag_c, pileup_fp); -+ } -+ } -+ if (!n) putc('*', pileup_fp); - } -- if (!n) putc('*', pileup_fp); - } - } - } -@@ -744,12 +937,12 @@ - last_pos = beg0-1; - mplp_get_ref(data[0], tid0, &ref, &ref_len); - } -- while (last_tid >= 0 && last_tid < h->n_targets) { -- while (++last_pos < h->target_len[last_tid]) { -+ while (last_tid >= 0 && last_tid < sam_hdr_nref(h)) { -+ while (++last_pos < sam_hdr_tid2len(h, last_tid)) { - if (last_pos >= end0) break; -- if (conf->bed && bed_overlap(conf->bed, h->target_name[last_tid], last_pos, last_pos + 1) == 0) -+ if (conf->bed && bed_overlap(conf->bed, sam_hdr_tid2name(h, last_tid), last_pos, last_pos + 1) == 0) - continue; -- print_empty_pileup(pileup_fp, conf, h->target_name[last_tid], last_pos, n, ref, ref_len); -+ print_empty_pileup(pileup_fp, conf, sam_hdr_tid2name(h, last_tid), last_pos, n, ref, ref_len); - } - last_tid++; - last_pos = -1; -@@ -758,6 +951,7 @@ - } - } - -+fail: - // clean up - free(bc.tmp.s); - bcf_destroy1(bcf_rec); -@@ -779,7 +973,7 @@ - free(gplp.plp); free(gplp.n_plp); free(gplp.m_plp); - bcf_call_del_rghash(rghash); - 
bam_mplp_destroy(iter); -- bam_hdr_destroy(h); -+ sam_hdr_destroy(h); - for (i = 0; i < n; ++i) { - sam_close(data[i]->fp); - if (data[i]->iter) hts_itr_destroy(data[i]->iter); -@@ -922,17 +1116,22 @@ - " [%s]\n", tmp_filter); - fprintf(fp, - " -x, --ignore-overlaps disable read-pair overlap detection\n" -+" -X, --customized-index use customized index files\n" // -X flag for index filename - "\n" - "Output options:\n" - " -o, --output FILE write output to FILE [standard output]\n" - " -O, --output-BP output base positions on reads\n" - " -s, --output-MQ output mapping quality\n" - " --output-QNAME output read names\n" -+" --output-extra STR output extra read fields and read tag values\n" -+" --output-sep CHAR set the separator character for tag lists [,]\n" -+" --output-empty CHAR set the no value character for tag lists [*]\n" -+" --reverse-del use '#' character for deletions on the reverse strand\n" - " -a output all positions (including zero depth)\n" - " -a -a (or -aa) output absolutely all positions, including unused ref. 
sequences\n" - "\n" - "Generic options:\n"); -- sam_global_opt_help(fp, "-.--.-"); -+ sam_global_opt_help(fp, "-.--.--."); - - fprintf(fp, "\n" - "Note that using \"samtools mpileup\" to generate BCF or VCF files is now\n" -@@ -952,7 +1151,7 @@ - int c; - const char *file_list = NULL; - char **fn = NULL; -- int nfiles = 0, use_orphan = 0; -+ int nfiles = 0, use_orphan = 0, has_index_file = 0; - mplp_conf_t mplp; - memset(&mplp, 0, sizeof(mplp_conf_t)); - mplp.min_baseQ = 13; -@@ -966,6 +1165,9 @@ - mplp.rflag_filter = BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP; - mplp.output_fname = NULL; - mplp.all = 0; -+ mplp.rev_del = 0; -+ mplp.sep = ','; -+ mplp.empty = '*'; - sam_global_args_init(&mplp.ga); - - static const struct option lopts[] = -@@ -1020,9 +1222,15 @@ - {"per-sample-mF", no_argument, NULL, 'p'}, - {"per-sample-mf", no_argument, NULL, 'p'}, - {"platforms", required_argument, NULL, 'P'}, -+ {"customized-index", no_argument, NULL, 'X'}, -+ {"reverse-del", no_argument, NULL, 6}, -+ {"output-extra", required_argument, NULL, 7}, -+ {"output-sep", required_argument, NULL, 8}, -+ {"output-empty", required_argument, NULL, 9}, - {NULL, 0, NULL, 0} - }; -- while ((c = getopt_long(argc, argv, "Agf:r:l:q:Q:uRC:BDSd:L:b:P:po:e:h:Im:F:EG:6OsVvxt:a",lopts,NULL)) >= 0) { -+ -+ while ((c = getopt_long(argc, argv, "Agf:r:l:q:Q:uRC:BDSd:L:b:P:po:e:h:Im:F:EG:6OsVvxXt:a",lopts,NULL)) >= 0) { - switch (c) { - case 'x': mplp.flag &= ~MPLP_SMART_OVERLAPS; break; - case 1 : -@@ -1036,6 +1244,15 @@ - case 3 : mplp.output_fname = optarg; break; - case 4 : mplp.openQ = atoi(optarg); break; - case 5 : mplp.flag |= MPLP_PRINT_QNAME; break; -+ case 6 : mplp.rev_del = 1; break; -+ case 7 : -+ if (build_auxlist(&mplp, optarg) != 0) { -+ fprintf(stderr,"Could not build aux list using '%s'\n", optarg); -+ return 1; -+ } -+ break; -+ case 8: mplp.sep = optarg[0]; break; -+ case 9: mplp.empty = optarg[0]; break; - case 'f': - mplp.fai = fai_load(optarg); - if (mplp.fai == NULL) 
return 1; -@@ -1056,6 +1273,7 @@ - case 'v': mplp.flag |= MPLP_BCF | MPLP_VCF; deprecated(c); break; - case 'u': mplp.flag |= MPLP_NO_COMP | MPLP_BCF; deprecated(c); break; - case 'B': mplp.flag &= ~MPLP_REALN; break; -+ case 'X': has_index_file = 1; break; - case 'D': mplp.fmt_flag |= B2B_FMT_DP; deprecated(c); break; - case 'S': mplp.fmt_flag |= B2B_FMT_SP; deprecated(c); break; - case 'V': mplp.fmt_flag |= B2B_FMT_DV; deprecated(c); break; -@@ -1064,7 +1282,7 @@ - case '6': mplp.flag |= MPLP_ILLUMINA13; break; - case 'R': mplp.flag |= MPLP_IGNORE_RG; break; - case 's': mplp.flag |= MPLP_PRINT_MAPQ; break; -- case 'O': mplp.flag |= MPLP_PRINT_POS; break; -+ case 'O': mplp.flag |= MPLP_PRINT_QPOS; break; - case 'C': mplp.capQ_thres = atoi(optarg); break; - case 'q': mplp.min_mq = atoi(optarg); break; - case 'Q': mplp.min_baseQ = atoi(optarg); break; -@@ -1129,16 +1347,32 @@ - } - int ret; - if (file_list) { -+ if (has_index_file) { -+ fprintf(stderr,"Error: The -b option cannot be combined with -X\n"); // No customize index loc in file list mode -+ return 1; -+ } - if ( read_file_list(file_list,&nfiles,&fn) ) return 1; -- ret = mpileup(&mplp,nfiles,fn); -+ ret = mpileup(&mplp,nfiles,fn,NULL); - for (c=0; c -@@ -38,14 +38,19 @@ - #include - #include - #include -+#include - #include - #include - #include -+#include - #include --#include "sam_header.h" - #include "samtools.h" -+#include "bedidx.h" - #include "sam_opts.h" - -+#define dummy_free(p) -+KLIST_INIT(auxlist, char *, dummy_free) -+ - static inline int printw(int c, FILE *fp) - { - char buf[16]; -@@ -61,7 +66,9 @@ - return 0; - } - --static inline void pileup_seq(FILE *fp, const bam_pileup1_t *p, int pos, int ref_len, const char *ref) -+static inline int pileup_seq(FILE *fp, const bam_pileup1_t *p, hts_pos_t pos, -+ hts_pos_t ref_len, const char *ref, kstring_t *ks, -+ int rev_del) - { - int j; - if (p->is_head) { -@@ -81,21 +88,31 @@ - else c = bam_is_rev(p->b)? 
tolower(c) : toupper(c); - } - putc(c, fp); -- } else putc(p->is_refskip? (bam_is_rev(p->b)? '<' : '>') : '*', fp); -+ } else putc(p->is_refskip? (bam_is_rev(p->b)? '<' : '>') : ((bam_is_rev(p->b) && rev_del) ? '#' : '*'), fp); -+ int del_len = -p->indel; - if (p->indel > 0) { -- putc('+', fp); printw(p->indel, fp); -- for (j = 1; j <= p->indel; ++j) { -- int c = seq_nt16_str[bam_seqi(bam_get_seq(p->b), p->qpos + j)]; -- putc(bam_is_rev(p->b)? tolower(c) : toupper(c), fp); -+ int len = bam_plp_insertion(p, ks, &del_len); -+ if (len < 0) -+ return -1; -+ putc('+', fp); printw(len, fp); -+ if (bam_is_rev(p->b)) { -+ char pad = rev_del ? '#' : '*'; -+ for (j = 0; j < len; j++) -+ putc(ks->s[j] != '*' ? tolower(ks->s[j]) : pad, fp); -+ } else { -+ for (j = 0; j < len; j++) -+ putc(toupper(ks->s[j]), fp); - } -- } else if (p->indel < 0) { -- printw(p->indel, fp); -- for (j = 1; j <= -p->indel; ++j) { -+ } -+ if (del_len > 0) { -+ printw(-del_len, fp); -+ for (j = 1; j <= del_len; ++j) { - int c = (ref && (int)pos+j < ref_len)? ref[pos+j] : 'N'; - putc(bam_is_rev(p->b)? 
tolower(c) : toupper(c), fp); - } - } - if (p->is_tail) putc('$', fp); -+ return 0; - } - - #include -@@ -111,36 +128,43 @@ - #define MPLP_REDO_BAQ (1<<6) - #define MPLP_ILLUMINA13 (1<<7) - #define MPLP_IGNORE_RG (1<<8) --#define MPLP_PRINT_POS (1<<9) --#define MPLP_PRINT_MAPQ (1<<10) -+#define MPLP_PRINT_QPOS (1<<9) - #define MPLP_PER_SAMPLE (1<<11) - #define MPLP_SMART_OVERLAPS (1<<12) -+ - #define MPLP_PRINT_QNAME (1<<13) -+#define MPLP_PRINT_FLAG (1<<14) -+#define MPLP_PRINT_RNAME (1<<15) -+#define MPLP_PRINT_POS (1<<16) -+#define MPLP_PRINT_MAPQ (1<<17) -+#define MPLP_PRINT_CIGAR (1<<18) -+#define MPLP_PRINT_RNEXT (1<<19) -+#define MPLP_PRINT_PNEXT (1<<20) -+#define MPLP_PRINT_TLEN (1<<21) -+#define MPLP_PRINT_SEQ (1<<22) -+#define MPLP_PRINT_QUAL (1<<23) - - #define MPLP_MAX_DEPTH 8000 - #define MPLP_MAX_INDEL_DEPTH 250 - --void *bed_read(const char *fn); --void bed_destroy(void *_h); --int bed_overlap(const void *_h, const char *chr, int beg, int end); -- - typedef struct { -- int min_mq, flag, min_baseQ, capQ_thres, max_depth, max_indel_depth, fmt_flag, all; -+ int min_mq, flag, min_baseQ, capQ_thres, max_depth, max_indel_depth, fmt_flag, all, rev_del; - int rflag_require, rflag_filter; - int openQ, extQ, tandemQ, min_support; // for indels - double min_frac; // for indels - char *reg, *pl_list, *fai_fname, *output_fname; - faidx_t *fai; -- void *bed, *rghash; -+ void *bed, *rghash, *auxlist; - int argc; - char **argv; -+ char sep, empty; - sam_global_args ga; - } mplp_conf_t; - - typedef struct { - char *ref[2]; - int ref_id[2]; -- int ref_len[2]; -+ hts_pos_t ref_len[2]; - } mplp_ref_t; - - #define MPLP_REF_INIT {{NULL,NULL},{-1,-1},{0,0}} -@@ -148,7 +172,7 @@ - typedef struct { - samFile *fp; - hts_itr_t *iter; -- bam_hdr_t *h; -+ sam_hdr_t *h; - mplp_ref_t *ref; - const mplp_conf_t *conf; - } mplp_aux_t; -@@ -159,7 +183,54 @@ - bam_pileup1_t **plp; - } mplp_pileup_t; - --static int mplp_get_ref(mplp_aux_t *ma, int tid, char **ref, int *ref_len) { 
-+static int build_auxlist(mplp_conf_t *conf, char *optstring) { -+ if (!optstring) -+ return 0; -+ -+ void *colhash = khash_str2int_init(); -+ if (!colhash) -+ return 1; -+ -+ struct active_cols { -+ char *name; -+ int supported; -+ }; -+ -+ const struct active_cols colnames[11] = { -+ {"QNAME", 1}, {"FLAG", 1}, {"RNAME", 1}, {"POS", 1}, {"MAPQ", 1}, {"CIGAR", 0}, {"RNEXT", 1}, {"PNEXT", 1}, {"TLEN", 0}, {"SEQ", 0}, {"QUAL", 0} -+ }; -+ -+ int i, f = MPLP_PRINT_QNAME, colno = 11; -+ for (i = 0; i < colno; i++, f <<= 1) -+ if (colnames[i].supported) -+ khash_str2int_set(colhash, colnames[i].name, f); -+ -+ conf->auxlist = kl_init(auxlist); -+ if (!conf->auxlist) -+ return 1; -+ -+ char *save_p; -+ char *tag = strtok_r(optstring, ",", &save_p); -+ while (tag) { -+ if (khash_str2int_get(colhash, tag, &f) == 0) { -+ conf->flag |= f; -+ } else { -+ if (strlen(tag) != 2) { -+ fprintf(samtools_stderr, "[%s] tag '%s' has more than two characters or not supported\n", __func__, tag); -+ } else { -+ char **tag_p = kl_pushp(auxlist, conf->auxlist); -+ *tag_p = tag; -+ } -+ } -+ tag = strtok_r(NULL, ",", &save_p); -+ } -+ -+ khash_str2int_destroy(colhash); -+ -+ return 0; -+} -+ -+static int mplp_get_ref(mplp_aux_t *ma, int tid, char **ref, hts_pos_t *ref_len) { - mplp_ref_t *r = ma->ref; - - //printf("get ref %d {%d/%p, %d/%p}\n", tid, r->ref_id[0], r->ref[0], r->ref_id[1], r->ref[1]); -@@ -179,9 +250,10 @@ - } - if (tid == r->ref_id[1]) { - // Last, swap over -- int tmp; -- tmp = r->ref_id[0]; r->ref_id[0] = r->ref_id[1]; r->ref_id[1] = tmp; -- tmp = r->ref_len[0]; r->ref_len[0] = r->ref_len[1]; r->ref_len[1] = tmp; -+ int tmp_id; -+ hts_pos_t tmp_len; -+ tmp_id = r->ref_id[0]; r->ref_id[0] = r->ref_id[1]; r->ref_id[1] = tmp_id; -+ tmp_len = r->ref_len[0]; r->ref_len[0] = r->ref_len[1]; r->ref_len[1] = tmp_len; - - char *tc; - tc = r->ref[0]; r->ref[0] = r->ref[1]; r->ref[1] = tc; -@@ -197,10 +269,10 @@ - r->ref_len[1] = r->ref_len[0]; - - r->ref_id[0] = tid; -- r->ref[0] = 
faidx_fetch_seq(ma->conf->fai, -- ma->h->target_name[r->ref_id[0]], -+ r->ref[0] = faidx_fetch_seq64(ma->conf->fai, -+ sam_hdr_tid2name(ma->h, r->ref_id[0]), - 0, -- INT_MAX, -+ HTS_POS_MAX, - &r->ref_len[0]); - - if (!r->ref[0]) { -@@ -218,15 +290,25 @@ - - static void - print_empty_pileup(FILE *fp, const mplp_conf_t *conf, const char *tname, -- int pos, int n, const char *ref, int ref_len) -+ hts_pos_t pos, int n, const char *ref, hts_pos_t ref_len) - { - int i; -- fprintf(fp, "%s\t%d\t%c", tname, pos+1, (ref && pos < ref_len)? ref[pos] : 'N'); -+ fprintf(fp, "%s\t%"PRIhts_pos"\t%c", tname, pos+1, (ref && pos < ref_len)? ref[pos] : 'N'); - for (i = 0; i < n; ++i) { - fputs("\t0\t*\t*", fp); -- if (conf->flag & MPLP_PRINT_MAPQ) fputs("\t*", fp); -- if (conf->flag & MPLP_PRINT_POS) fputs("\t*", fp); -- if (conf->flag & MPLP_PRINT_QNAME) fputs("\t*", fp); -+ if (conf->flag & MPLP_PRINT_QPOS) -+ fputs("\t*", fp); -+ int flag_value = MPLP_PRINT_QNAME; -+ while(flag_value < MPLP_PRINT_QUAL + 1) { -+ if (conf->flag & flag_value) -+ fputs("\t*", fp); -+ flag_value <<= 1; -+ } -+ if (conf->auxlist) { -+ int t = 0; -+ while(t++ < ((klist_t(auxlist) *)conf->auxlist)->size) -+ fputs("\t*", fp); -+ } - } - putc('\n', fp); - } -@@ -235,7 +317,9 @@ - { - char *ref; - mplp_aux_t *ma = (mplp_aux_t*)data; -- int ret, skip = 0, ref_len; -+ int ret, skip = 0; -+ hts_pos_t ref_len; -+ - do { - int has_ref; - ret = ma->iter? 
sam_itr_next(ma->fp, ma->iter, b) : sam_read1(ma->fp, ma->h, b); -@@ -249,7 +333,7 @@ - if (ma->conf->rflag_require && !(ma->conf->rflag_require&b->core.flag)) { skip = 1; continue; } - if (ma->conf->rflag_filter && ma->conf->rflag_filter&b->core.flag) { skip = 1; continue; } - if (ma->conf->bed && ma->conf->all == 0) { // test overlap -- skip = !bed_overlap(ma->conf->bed, ma->h->target_name[b->core.tid], b->core.pos, bam_endpos(b)); -+ skip = !bed_overlap(ma->conf->bed, sam_hdr_tid2name(ma->h, b->core.tid), b->core.pos, bam_endpos(b)); - if (skip) continue; - } - if (ma->conf->rghash) { // exclude read groups -@@ -267,8 +351,8 @@ - if (ma->conf->fai && b->core.tid >= 0) { - has_ref = mplp_get_ref(ma, b->core.tid, &ref, &ref_len); - if (has_ref && ref_len <= b->core.pos) { // exclude reads outside of the reference sequence -- fprintf(samtools_stderr,"[%s] Skipping because %d is outside of %d [ref:%d]\n", -- __func__, b->core.pos, ref_len, b->core.tid); -+ fprintf(samtools_stderr,"[%s] Skipping because %"PRIhts_pos" is outside of %"PRIhts_pos" [ref:%d]\n", -+ __func__, (int64_t) b->core.pos, ref_len, b->core.tid); - skip = 1; - continue; - } -@@ -321,17 +405,19 @@ - * @param conf configuration for this pileup - * @param n number of files specified in fn - * @param fn filenames -+ * @param fn_idx index filenames - */ --static int mpileup(mplp_conf_t *conf, int n, char **fn) -+static int mpileup(mplp_conf_t *conf, int n, char **fn, char **fn_idx) - { - extern void *bcf_call_add_rg(void *rghash, const char *hdtext, const char *list); - extern void bcf_call_del_rghash(void *rghash); - mplp_aux_t **data; -- int i, tid, pos, *n_plp, beg0 = 0, end0 = INT_MAX, tid0 = 0, ref_len, max_depth, max_indel_depth; -+ int i, tid, *n_plp, tid0 = 0, max_depth, max_indel_depth; -+ hts_pos_t pos, beg0 = 0, end0 = HTS_POS_MAX, ref_len; - const bam_pileup1_t **plp; - mplp_ref_t mp_ref = MPLP_REF_INIT; - bam_mplp_t iter; -- bam_hdr_t *h = NULL; /* header of first file in input list */ -+ 
sam_hdr_t *h = NULL; /* header of first file in input list */ - char *ref; - void *rghash = NULL; - FILE *pileup_fp = NULL; -@@ -361,7 +447,7 @@ - - // read the header of each file in the list and initialize data - for (i = 0; i < n; ++i) { -- bam_hdr_t *h_tmp; -+ sam_hdr_t *h_tmp; - data[i] = calloc(1, sizeof(mplp_aux_t)); - data[i]->fp = sam_open_format(fn[i], "rb", &conf->ga.in); - if ( !data[i]->fp ) -@@ -385,13 +471,20 @@ - fprintf(samtools_stderr,"[%s] fail to read the header of %s\n", __func__, fn[i]); - exit(EXIT_FAILURE); - } -- bam_smpl_add(sm, fn[i], (conf->flag&MPLP_IGNORE_RG)? 0 : h_tmp->text); -+ bam_smpl_add(sm, fn[i], (conf->flag&MPLP_IGNORE_RG)? 0 : sam_hdr_str(h_tmp)); - if (conf->flag & MPLP_BCF) { - // Collect read group IDs with PL (platform) listed in pl_list (note: fragile, strstr search) -- rghash = bcf_call_add_rg(rghash, h_tmp->text, conf->pl_list); -+ rghash = bcf_call_add_rg(rghash, sam_hdr_str(h_tmp), conf->pl_list); - } - if (conf->reg) { -- hts_idx_t *idx = sam_index_load(data[i]->fp, fn[i]); -+ hts_idx_t *idx = NULL; -+ // If index filename has not been specfied, look in BAM folder -+ if (fn_idx != NULL) { -+ idx = sam_index_load2(data[i]->fp, fn[i], fn_idx[i]); -+ } else { -+ idx = sam_index_load(data[i]->fp, fn[i]); -+ } -+ - if (idx == NULL) { - fprintf(samtools_stderr, "[%s] fail to load index for %s\n", __func__, fn[i]); - exit(EXIT_FAILURE); -@@ -409,7 +502,7 @@ - if (i == 0) h = data[i]->h = h_tmp; // save the header of the first file - else { - // FIXME: check consistency between h and h_tmp -- bam_hdr_destroy(h_tmp); -+ sam_hdr_destroy(h_tmp); - - // we store only the first file's header; it's (alleged to be) - // compatible with the i-th file's target_name lookup needs -@@ -461,10 +554,10 @@ - - // Translate BAM @SQ tags to BCF ##contig tags - // todo: use/write new BAM header manipulation routines, fill also UR, M5 -- for (i=0; in_targets; i++) -+ for (i=0; i < sam_hdr_nref(h); i++) - { - str.l = 0; -- ksprintf(&str, 
"##contig=", h->target_name[i], h->target_len[i]); -+ ksprintf(&str, "##contig=", sam_hdr_tid2name(h, i), (int64_t) sam_hdr_tid2len(h, i)); - bcf_hdr_append(bcf_hdr, str.s); - } - free(str.s); -@@ -517,7 +610,11 @@ - for (i=0; in; i++) - bcf_hdr_add_sample(bcf_hdr, sm->smpl[i]); - bcf_hdr_add_sample(bcf_hdr, NULL); -- bcf_hdr_write(bcf_fp, bcf_hdr); -+ if (bcf_hdr_write(bcf_fp, bcf_hdr) != 0) { -+ print_error_errno("mpileup", "Failed to write VCF/BCF header to \"%s\"", -+ conf->output_fname? conf->output_fname : "standard output"); -+ exit(EXIT_FAILURE); -+ } - // End of BCF header creation - - // Initialise the calling algorithm -@@ -576,16 +673,17 @@ - bam_mplp_set_maxcnt(iter, max_depth); - bcf1_t *bcf_rec = bcf_init1(); - int ret; -- int last_tid = -1, last_pos = -1; -+ int last_tid = -1; -+ hts_pos_t last_pos = -1; - - // begin pileup -- while ( (ret=bam_mplp_auto(iter, &tid, &pos, n_plp, plp)) > 0) { -+ while ( (ret=bam_mplp64_auto(iter, &tid, &pos, n_plp, plp)) > 0) { - if (conf->reg && (pos < beg0 || pos >= end0)) continue; // out of the region requested - mplp_get_ref(data[0], tid, &ref, &ref_len); - //printf("tid=%d len=%d ref=%p/%s\n", tid, ref_len, ref, ref); - if (conf->flag & MPLP_BCF) { - int total_depth, _ref0, ref16; -- if (conf->bed && tid >= 0 && !bed_overlap(conf->bed, h->target_name[tid], pos, pos+1)) continue; -+ if (conf->bed && tid >= 0 && !bed_overlap(conf->bed, sam_hdr_tid2name(h, tid), pos, pos+1)) continue; - for (i = total_depth = 0; i < n; ++i) total_depth += n_plp[i]; - group_smpl(&gplp, sm, &buf, n, fn, n_plp, plp, conf->flag & MPLP_IGNORE_RG); - _ref0 = (ref && pos < ref_len)? 
ref[pos] : 'N'; -@@ -597,7 +695,11 @@ - bcf_call_combine(gplp.n, bcr, bca, ref16, &bc); - bcf_clear1(bcf_rec); - bcf_call2bcf(&bc, bcf_rec, bcr, conf->fmt_flag, 0, 0); -- bcf_write1(bcf_fp, bcf_hdr, bcf_rec); -+ if (bcf_write1(bcf_fp, bcf_hdr, bcf_rec) != 0) { -+ print_error_errno("mpileup", "Failed to write VCF/BCF record to \"%s\"", -+ conf->output_fname?conf->output_fname:"standard output"); -+ exit(EXIT_FAILURE); -+ } - // call indels; todo: subsampling with total_depth>max_indel_depth instead of ignoring? - if (!(conf->flag&MPLP_NO_INDEL) && total_depth < max_indel_depth && bcf_call_gap_prep(gplp.n, gplp.n_plp, gplp.plp, pos, bca, ref, rghash) >= 0) - { -@@ -607,7 +709,11 @@ - if (bcf_call_combine(gplp.n, bcr, bca, -1, &bc) >= 0) { - bcf_clear1(bcf_rec); - bcf_call2bcf(&bc, bcf_rec, bcr, conf->fmt_flag, bca, ref); -- bcf_write1(bcf_fp, bcf_hdr, bcf_rec); -+ if (bcf_write1(bcf_fp, bcf_hdr, bcf_rec) != 0) { -+ print_error_errno("mpileup", "Failed to write VCF/BCF record to \"%s\"", -+ conf->output_fname?conf->output_fname:"standard output"); -+ exit(EXIT_FAILURE); -+ } - } - } - } else { -@@ -615,10 +721,10 @@ - // Deal with missing portions of previous tids - while (tid > last_tid) { - if (last_tid >= 0 && !conf->reg) { -- while (++last_pos < h->target_len[last_tid]) { -- if (conf->bed && bed_overlap(conf->bed, h->target_name[last_tid], last_pos, last_pos + 1) == 0) -+ while (++last_pos < sam_hdr_tid2len(h, last_tid)) { -+ if (conf->bed && bed_overlap(conf->bed, sam_hdr_tid2name(h, last_tid), last_pos, last_pos + 1) == 0) - continue; -- print_empty_pileup(pileup_fp, conf, h->target_name[last_tid], last_pos, n, ref, ref_len); -+ print_empty_pileup(pileup_fp, conf, sam_hdr_tid2name(h, last_tid), last_pos, n, ref, ref_len); - } - } - last_tid++; -@@ -631,16 +737,16 @@ - // Deal with missing portion of current tid - while (++last_pos < pos) { - if (conf->reg && last_pos < beg0) continue; // out of range; skip -- if (conf->bed && bed_overlap(conf->bed, 
h->target_name[tid], last_pos, last_pos + 1) == 0) -+ if (conf->bed && bed_overlap(conf->bed, sam_hdr_tid2name(h, tid), last_pos, last_pos + 1) == 0) - continue; -- print_empty_pileup(pileup_fp, conf, h->target_name[tid], last_pos, n, ref, ref_len); -+ print_empty_pileup(pileup_fp, conf, sam_hdr_tid2name(h, tid), last_pos, n, ref, ref_len); - } - last_tid = tid; - last_pos = pos; - } -- if (conf->bed && tid >= 0 && !bed_overlap(conf->bed, h->target_name[tid], pos, pos+1)) continue; -+ if (conf->bed && tid >= 0 && !bed_overlap(conf->bed, sam_hdr_tid2name(h, tid), pos, pos+1)) continue; - -- fprintf(pileup_fp, "%s\t%d\t%c", h->target_name[tid], pos + 1, (ref && pos < ref_len)? ref[pos] : 'N'); -+ fprintf(pileup_fp, "%s\t%"PRIhts_pos"\t%c", sam_hdr_tid2name(h, tid), pos + 1, (ref && pos < ref_len)? ref[pos] : 'N'); - for (i = 0; i < n; ++i) { - int j, cnt; - for (j = cnt = 0; j < n_plp[i]; ++j) { -@@ -653,22 +759,40 @@ - fprintf(pileup_fp, "\t%d\t", cnt); - if (n_plp[i] == 0) { - fputs("*\t*", pileup_fp); -- if (conf->flag & MPLP_PRINT_MAPQ) fputs("\t*", pileup_fp); -- if (conf->flag & MPLP_PRINT_POS) fputs("\t*", pileup_fp); -- if (conf->flag & MPLP_PRINT_QNAME) fputs("\t*", pileup_fp); -+ if (conf->flag & MPLP_PRINT_QPOS) -+ fputs("\t*", pileup_fp); -+ int flag_value = MPLP_PRINT_QNAME; -+ while(flag_value < MPLP_PRINT_QUAL + 1) { -+ if (conf->flag & flag_value) -+ fputs("\t*", pileup_fp); -+ flag_value <<= 1; -+ } -+ if (conf->auxlist) { -+ int t = 0; -+ while(t++ < ((klist_t(auxlist) *)conf->auxlist)->size) -+ fputs("\t*", pileup_fp); -+ } - } else { - int n = 0; -+ kstring_t ks = KS_INITIALIZE; - for (j = 0; j < n_plp[i]; ++j) { - const bam_pileup1_t *p = plp[i] + j; - int c = p->qpos < p->b->core.l_qseq - ? 
bam_get_qual(p->b)[p->qpos] - : 0; -- if (c >= conf->min_baseQ) -- n++, pileup_seq(pileup_fp, plp[i] + j, pos, ref_len, ref); -+ if (c >= conf->min_baseQ) { -+ n++; -+ if (pileup_seq(pileup_fp, plp[i] + j, pos, ref_len, ref, &ks, conf->rev_del) < 0) { -+ ret = 1; -+ goto fail; -+ } -+ } - } - if (!n) putc('*', pileup_fp); - -+ /* Print base qualities */ - n = 0; -+ ks_free(&ks); - putc('\t', pileup_fp); - for (j = 0; j < n_plp[i]; ++j) { - const bam_pileup1_t *p = plp[i] + j; -@@ -683,55 +807,124 @@ - } - if (!n) putc('*', pileup_fp); - -- if (conf->flag & MPLP_PRINT_MAPQ) { -+ /* Print mpileup positions */ -+ if (conf->flag & MPLP_PRINT_QPOS) { - n = 0; - putc('\t', pileup_fp); - for (j = 0; j < n_plp[i]; ++j) { - const bam_pileup1_t *p = plp[i] + j; - int c = p->qpos < p->b->core.l_qseq -- ? bam_get_qual(p->b)[p->qpos] -- : 0; -+ ? bam_get_qual(p->b)[p->qpos] -+ : 0; - if ( c < conf->min_baseQ ) continue; -- c = plp[i][j].b->core.qual + 33; -- if (c > 126) c = 126; -- putc(c, pileup_fp); -+ if (n > 0) putc(',', pileup_fp); - n++; -+ fprintf(pileup_fp, "%d", p->qpos + 1); - } - if (!n) putc('*', pileup_fp); - } - -- if (conf->flag & MPLP_PRINT_POS) { -- n = 0; -- putc('\t', pileup_fp); -- for (j = 0; j < n_plp[i]; ++j) { -- const bam_pileup1_t *p = plp[i] + j; -- int c = p->qpos < p->b->core.l_qseq -- ? bam_get_qual(p->b)[p->qpos] -- : 0; -- if ( c < conf->min_baseQ ) continue; -- -- if (n > 0) putc(',', pileup_fp); -- fprintf(pileup_fp, "%d", plp[i][j].qpos + 1); // FIXME: fprintf(samtools_stdout, ) is very slow... -- n++; -+ /* Print selected columns */ -+ int flag_value = MPLP_PRINT_QNAME; -+ while(flag_value < MPLP_PRINT_QUAL + 1) { -+ if (conf->flag & flag_value) { -+ n = 0; -+ putc('\t', pileup_fp); -+ for (j = 0; j < n_plp[i]; ++j) { -+ const bam_pileup1_t *p = &plp[i][j]; -+ int c = p->qpos < p->b->core.l_qseq -+ ? 
bam_get_qual(p->b)[p->qpos] -+ : 0; -+ if ( c < conf->min_baseQ ) continue; -+ if (n > 0 && flag_value != MPLP_PRINT_MAPQ) putc(',', pileup_fp); -+ n++; -+ -+ switch (flag_value) { -+ case MPLP_PRINT_QNAME: -+ fputs(bam_get_qname(p->b), pileup_fp); -+ break; -+ case MPLP_PRINT_FLAG: -+ fprintf(pileup_fp, "%d", p->b->core.flag); -+ break; -+ case MPLP_PRINT_RNAME: -+ if (p->b->core.tid >= 0) -+ fputs(sam_hdr_tid2name(h, p->b->core.tid), pileup_fp); -+ else -+ putc('*', pileup_fp); -+ break; -+ case MPLP_PRINT_POS: -+ fprintf(pileup_fp, "%"PRId64, (int64_t) p->b->core.pos + 1); -+ break; -+ case MPLP_PRINT_MAPQ: -+ c = p->b->core.qual + 33; -+ if (c > 126) c = 126; -+ putc(c, pileup_fp); -+ break; -+ case MPLP_PRINT_RNEXT: -+ if (p->b->core.mtid >= 0) -+ fputs(sam_hdr_tid2name(h, p->b->core.mtid), pileup_fp); -+ else -+ putc('*', pileup_fp); -+ break; -+ case MPLP_PRINT_PNEXT: -+ fprintf(pileup_fp, "%"PRId64, (int64_t) p->b->core.mpos + 1); -+ break; -+ } -+ } -+ if (!n) putc('*', pileup_fp); - } -- if (!n) putc('*', pileup_fp); -+ flag_value <<= 1; - } - -- if (conf->flag & MPLP_PRINT_QNAME) { -- n = 0; -- putc('\t', pileup_fp); -- for (j = 0; j < n_plp[i]; ++j) { -- const bam_pileup1_t *p = &plp[i][j]; -- int c = p->qpos < p->b->core.l_qseq -- ? bam_get_qual(p->b)[p->qpos] -- : 0; -- if ( c < conf->min_baseQ ) continue; -- -- if (n > 0) putc(',', pileup_fp); -- fputs(bam_get_qname(p->b), pileup_fp); -- n++; -+ /* Print selected tags */ -+ klist_t(auxlist) *auxlist_p = ((klist_t(auxlist) *)conf->auxlist); -+ if (auxlist_p && auxlist_p->size) { -+ kliter_t(auxlist) *aux; -+ for (aux = kl_begin(auxlist_p); aux != kl_end(auxlist_p); aux = kl_next(aux)) { -+ n = 0; -+ putc('\t', pileup_fp); -+ for (j = 0; j < n_plp[i]; ++j) { -+ const bam_pileup1_t *p = &plp[i][j]; -+ int c = p->qpos < p->b->core.l_qseq -+ ? 
bam_get_qual(p->b)[p->qpos] -+ : 0; -+ if ( c < conf->min_baseQ ) continue; -+ -+ if (n > 0) putc(conf->sep, pileup_fp); -+ n++; -+ uint8_t* tag_u = bam_aux_get(p->b, kl_val(aux)); -+ if (!tag_u) { -+ putc(conf->empty , pileup_fp); -+ continue; -+ } -+ -+ /* Tag value is string */ -+ if (*tag_u == 'Z' || *tag_u == 'H') { -+ char *tag_s = bam_aux2Z(tag_u); -+ if (!tag_s) continue; -+ fputs(tag_s, pileup_fp); -+ } -+ -+ /* Tag value is integer */ -+ if (*tag_u == 'I' || *tag_u == 'i' || *tag_u == 'C' || *tag_u == 'c' || *tag_u == 'S' || *tag_u == 's') { -+ int64_t tag_i = bam_aux2i(tag_u); -+ fprintf(pileup_fp, "%" PRId64 "", tag_i); -+ } -+ -+ /* Tag value is float */ -+ if (*tag_u == 'd' || *tag_u == 'f') { -+ double tag_f = bam_aux2f(tag_u); -+ fprintf(pileup_fp, "%lf", tag_f); -+ } -+ -+ /* Tag value is character */ -+ if (*tag_u == 'A') { -+ char tag_c = bam_aux2A(tag_u); -+ putc(tag_c, pileup_fp); -+ } -+ } -+ if (!n) putc('*', pileup_fp); - } -- if (!n) putc('*', pileup_fp); - } - } - } -@@ -746,12 +939,12 @@ - last_pos = beg0-1; - mplp_get_ref(data[0], tid0, &ref, &ref_len); - } -- while (last_tid >= 0 && last_tid < h->n_targets) { -- while (++last_pos < h->target_len[last_tid]) { -+ while (last_tid >= 0 && last_tid < sam_hdr_nref(h)) { -+ while (++last_pos < sam_hdr_tid2len(h, last_tid)) { - if (last_pos >= end0) break; -- if (conf->bed && bed_overlap(conf->bed, h->target_name[last_tid], last_pos, last_pos + 1) == 0) -+ if (conf->bed && bed_overlap(conf->bed, sam_hdr_tid2name(h, last_tid), last_pos, last_pos + 1) == 0) - continue; -- print_empty_pileup(pileup_fp, conf, h->target_name[last_tid], last_pos, n, ref, ref_len); -+ print_empty_pileup(pileup_fp, conf, sam_hdr_tid2name(h, last_tid), last_pos, n, ref, ref_len); - } - last_tid++; - last_pos = -1; -@@ -760,6 +953,7 @@ - } - } - -+fail: - // clean up - free(bc.tmp.s); - bcf_destroy1(bcf_rec); -@@ -781,7 +975,7 @@ - free(gplp.plp); free(gplp.n_plp); free(gplp.m_plp); - bcf_call_del_rghash(rghash); - 
bam_mplp_destroy(iter); -- bam_hdr_destroy(h); -+ sam_hdr_destroy(h); - for (i = 0; i < n; ++i) { - sam_close(data[i]->fp); - if (data[i]->iter) hts_itr_destroy(data[i]->iter); -@@ -924,17 +1118,22 @@ - " [%s]\n", tmp_filter); - fprintf(fp, - " -x, --ignore-overlaps disable read-pair overlap detection\n" -+" -X, --customized-index use customized index files\n" // -X flag for index filename - "\n" - "Output options:\n" - " -o, --output FILE write output to FILE [standard output]\n" - " -O, --output-BP output base positions on reads\n" - " -s, --output-MQ output mapping quality\n" - " --output-QNAME output read names\n" -+" --output-extra STR output extra read fields and read tag values\n" -+" --output-sep CHAR set the separator character for tag lists [,]\n" -+" --output-empty CHAR set the no value character for tag lists [*]\n" -+" --reverse-del use '#' character for deletions on the reverse strand\n" - " -a output all positions (including zero depth)\n" - " -a -a (or -aa) output absolutely all positions, including unused ref. 
sequences\n" - "\n" - "Generic options:\n"); -- sam_global_opt_help(fp, "-.--.-"); -+ sam_global_opt_help(fp, "-.--.--."); - - fprintf(fp, "\n" - "Note that using \"samtools mpileup\" to generate BCF or VCF files is now\n" -@@ -954,7 +1153,7 @@ - int c; - const char *file_list = NULL; - char **fn = NULL; -- int nfiles = 0, use_orphan = 0; -+ int nfiles = 0, use_orphan = 0, has_index_file = 0; - mplp_conf_t mplp; - memset(&mplp, 0, sizeof(mplp_conf_t)); - mplp.min_baseQ = 13; -@@ -968,6 +1167,9 @@ - mplp.rflag_filter = BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP; - mplp.output_fname = NULL; - mplp.all = 0; -+ mplp.rev_del = 0; -+ mplp.sep = ','; -+ mplp.empty = '*'; - sam_global_args_init(&mplp.ga); - - static const struct option lopts[] = -@@ -1022,9 +1224,15 @@ - {"per-sample-mF", no_argument, NULL, 'p'}, - {"per-sample-mf", no_argument, NULL, 'p'}, - {"platforms", required_argument, NULL, 'P'}, -+ {"customized-index", no_argument, NULL, 'X'}, -+ {"reverse-del", no_argument, NULL, 6}, -+ {"output-extra", required_argument, NULL, 7}, -+ {"output-sep", required_argument, NULL, 8}, -+ {"output-empty", required_argument, NULL, 9}, - {NULL, 0, NULL, 0} - }; -- while ((c = getopt_long(argc, argv, "Agf:r:l:q:Q:uRC:BDSd:L:b:P:po:e:h:Im:F:EG:6OsVvxt:a",lopts,NULL)) >= 0) { -+ -+ while ((c = getopt_long(argc, argv, "Agf:r:l:q:Q:uRC:BDSd:L:b:P:po:e:h:Im:F:EG:6OsVvxXt:a",lopts,NULL)) >= 0) { - switch (c) { - case 'x': mplp.flag &= ~MPLP_SMART_OVERLAPS; break; - case 1 : -@@ -1038,6 +1246,15 @@ - case 3 : mplp.output_fname = optarg; break; - case 4 : mplp.openQ = atoi(optarg); break; - case 5 : mplp.flag |= MPLP_PRINT_QNAME; break; -+ case 6 : mplp.rev_del = 1; break; -+ case 7 : -+ if (build_auxlist(&mplp, optarg) != 0) { -+ fprintf(samtools_stderr,"Could not build aux list using '%s'\n", optarg); -+ return 1; -+ } -+ break; -+ case 8: mplp.sep = optarg[0]; break; -+ case 9: mplp.empty = optarg[0]; break; - case 'f': - mplp.fai = fai_load(optarg); - if (mplp.fai == 
NULL) return 1; -@@ -1058,6 +1275,7 @@ - case 'v': mplp.flag |= MPLP_BCF | MPLP_VCF; deprecated(c); break; - case 'u': mplp.flag |= MPLP_NO_COMP | MPLP_BCF; deprecated(c); break; - case 'B': mplp.flag &= ~MPLP_REALN; break; -+ case 'X': has_index_file = 1; break; - case 'D': mplp.fmt_flag |= B2B_FMT_DP; deprecated(c); break; - case 'S': mplp.fmt_flag |= B2B_FMT_SP; deprecated(c); break; - case 'V': mplp.fmt_flag |= B2B_FMT_DV; deprecated(c); break; -@@ -1066,7 +1284,7 @@ - case '6': mplp.flag |= MPLP_ILLUMINA13; break; - case 'R': mplp.flag |= MPLP_IGNORE_RG; break; - case 's': mplp.flag |= MPLP_PRINT_MAPQ; break; -- case 'O': mplp.flag |= MPLP_PRINT_POS; break; -+ case 'O': mplp.flag |= MPLP_PRINT_QPOS; break; - case 'C': mplp.capQ_thres = atoi(optarg); break; - case 'q': mplp.min_mq = atoi(optarg); break; - case 'Q': mplp.min_baseQ = atoi(optarg); break; -@@ -1131,16 +1349,32 @@ - } - int ret; - if (file_list) { -+ if (has_index_file) { -+ fprintf(samtools_stderr,"Error: The -b option cannot be combined with -X\n"); // No customize index loc in file list mode -+ return 1; -+ } - if ( read_file_list(file_list,&nfiles,&fn) ) return 1; -- ret = mpileup(&mplp,nfiles,fn); -+ ret = mpileup(&mplp,nfiles,fn,NULL); - for (c=0; c - -@@ -46,6 +46,7 @@ - "Options:\n" - " -v verbose output (repeat for more verbosity)\n" - " -q suppress warning messages\n" -+" -u unmapped input (do not require targets in header)\n" - "\n" - "Notes:\n" - "\n" -@@ -77,13 +78,16 @@ - - int main_quickcheck(int argc, char** argv) - { -- int verbose = 0, quiet = 0; -+ int verbose = 0, quiet = 0, unmapped = 0; - hts_verbose = 0; - -- const char* optstring = "vq"; -+ const char* optstring = "vqu"; - int opt; - while ((opt = getopt(argc, argv, optstring)) != -1) { - switch (opt) { -+ case 'u': -+ unmapped = 1; -+ break; - case 'v': - verbose++; - break; -@@ -136,17 +140,17 @@ - else { - if (verbose >= 3) fprintf(stderr, "%s is sequence data\n", fn); - // check header -- bam_hdr_t *header = 
sam_hdr_read(hts_fp); -+ sam_hdr_t *header = sam_hdr_read(hts_fp); - if (header == NULL) { - QC_ERR(QC_BAD_HEADER, 2, "%s caused an error whilst reading its header.\n", fn); - } else { -- if (header->n_targets <= 0) { -+ if (!unmapped && sam_hdr_nref(header) <= 0) { - QC_ERR(QC_BAD_HEADER, 2, "%s had no targets in header.\n", fn); - } - else { -- if (verbose >= 3) fprintf(stderr, "%s has %d targets in header.\n", fn, header->n_targets); -+ if (verbose >= 3) fprintf(stderr, "%s has %d targets in header.\n", fn, sam_hdr_nref(header)); - } -- bam_hdr_destroy(header); -+ sam_hdr_destroy(header); - } - } - // check EOF on formats that support this ---- python-pysam.orig/samtools/bam_quickcheck.c.pysam.c -+++ python-pysam/samtools/bam_quickcheck.c.pysam.c -@@ -2,7 +2,7 @@ - - /* bam_quickcheck.c -- quickcheck subcommand. - -- Copyright (C) 2015 Genome Research Ltd. -+ Copyright (C) 2015-2017 Genome Research Ltd. - - Author: Joshua C. Randall - -@@ -48,6 +48,7 @@ - "Options:\n" - " -v verbose output (repeat for more verbosity)\n" - " -q suppress warning messages\n" -+" -u unmapped input (do not require targets in header)\n" - "\n" - "Notes:\n" - "\n" -@@ -79,13 +80,16 @@ - - int main_quickcheck(int argc, char** argv) - { -- int verbose = 0, quiet = 0; -+ int verbose = 0, quiet = 0, unmapped = 0; - hts_verbose = 0; - -- const char* optstring = "vq"; -+ const char* optstring = "vqu"; - int opt; - while ((opt = getopt(argc, argv, optstring)) != -1) { - switch (opt) { -+ case 'u': -+ unmapped = 1; -+ break; - case 'v': - verbose++; - break; -@@ -138,17 +142,17 @@ - else { - if (verbose >= 3) fprintf(samtools_stderr, "%s is sequence data\n", fn); - // check header -- bam_hdr_t *header = sam_hdr_read(hts_fp); -+ sam_hdr_t *header = sam_hdr_read(hts_fp); - if (header == NULL) { - QC_ERR(QC_BAD_HEADER, 2, "%s caused an error whilst reading its header.\n", fn); - } else { -- if (header->n_targets <= 0) { -+ if (!unmapped && sam_hdr_nref(header) <= 0) { - QC_ERR(QC_BAD_HEADER, 2, 
"%s had no targets in header.\n", fn); - } - else { -- if (verbose >= 3) fprintf(samtools_stderr, "%s has %d targets in header.\n", fn, header->n_targets); -+ if (verbose >= 3) fprintf(samtools_stderr, "%s has %d targets in header.\n", fn, sam_hdr_nref(header)); - } -- bam_hdr_destroy(header); -+ sam_hdr_destroy(header); - } - } - // check EOF on formats that support this ---- python-pysam.orig/samtools/bam_reheader.c -+++ python-pysam/samtools/bam_reheader.c -@@ -1,7 +1,7 @@ - /* bam_reheader.c -- reheader subcommand. - - Copyright (C) 2010 Broad Institute. -- Copyright (C) 2012-2015 Genome Research Ltd. -+ Copyright (C) 2012-2019 Genome Research Ltd. - - Author: Heng Li - -@@ -29,6 +29,7 @@ - #include - #include - #include -+#include - - #include "htslib/bgzf.h" - #include "htslib/sam.h" -@@ -42,50 +43,44 @@ - * Reads a file and outputs a new BAM file to fd with 'h' replaced as - * the header. No checks are made to the validity. - */ --int bam_reheader(BGZF *in, bam_hdr_t *h, int fd, -- const char *arg_list, int add_PG) -+int bam_reheader(BGZF *in, sam_hdr_t *h, int fd, -+ const char *arg_list, int no_pg, int skip_header) - { - BGZF *fp = NULL; - ssize_t len; - uint8_t *buf = NULL; -- SAM_hdr *sh = NULL; -+ sam_hdr_t *tmp; -+ if (!h) -+ return -1; -+ - if (in->is_write) return -1; - buf = malloc(BUF_SIZE); - if (!buf) { - fprintf(stderr, "Out of memory\n"); - return -1; - } -- if (bam_hdr_read(in) == NULL) { -- fprintf(stderr, "Couldn't read header\n"); -- goto fail; -+ -+ if (!skip_header) { -+ if ((tmp = bam_hdr_read(in)) == NULL) { -+ fprintf(stderr, "Couldn't read header\n"); -+ goto fail; -+ } -+ sam_hdr_destroy(tmp); - } -+ - fp = bgzf_fdopen(fd, "w"); - if (!fp) { - print_error_errno("reheader", "Couldn't open output file"); - goto fail; - } - -- if (add_PG) { -- // Around the houses, but it'll do until we can manipulate bam_hdr_t natively. 
-- sh = sam_hdr_parse_(h->text, h->l_text); -- if (!sh) -- goto fail; -- if (sam_hdr_add_PG(sh, "samtools", -+ if (!no_pg && sam_hdr_add_pg(h, "samtools", - "VN", samtools_version(), - arg_list ? "CL": NULL, - arg_list ? arg_list : NULL, - NULL) != 0) - goto fail; - -- free(h->text); -- h->text = strdup(sam_hdr_str(sh)); -- h->l_text = sam_hdr_length(sh); -- if (!h->text) -- goto fail; -- sam_hdr_free(sh); -- sh = NULL; -- } -- - if (bam_hdr_write(fp, h) < 0) { - print_error_errno("reheader", "Couldn't write header"); - goto fail; -@@ -114,7 +109,6 @@ - fail: - bgzf_close(fp); - free(buf); -- sam_hdr_free(sh); - return -1; - } - -@@ -124,32 +118,28 @@ - * - * FIXME: error checking - */ --int cram_reheader(cram_fd *in, bam_hdr_t *h, const char *arg_list, int add_PG) -+int cram_reheader(cram_fd *in, sam_hdr_t *h, const char *arg_list, int no_pg) - { - htsFile *h_out = hts_open("-", "wc"); - cram_fd *out = h_out->fp.cram; - cram_container *c = NULL; - int ret = -1; -+ if (!h) -+ return ret; - - // Attempt to fill out a cram->refs[] array from @SQ headers -- cram_fd_set_header(out, sam_hdr_parse_(h->text, h->l_text)); -- if (add_PG) { -- if (sam_hdr_add_PG(cram_fd_get_header(out), "samtools", -+ sam_hdr_t *cram_h = sam_hdr_dup(h); -+ if (!cram_h) -+ return -1; -+ cram_fd_set_header(out, cram_h); -+ if (!no_pg && sam_hdr_add_pg(cram_fd_get_header(out), "samtools", - "VN", samtools_version(), - arg_list ? "CL": NULL, - arg_list ? 
arg_list : NULL, -- NULL) != 0) -+ NULL)) - goto err; - -- // Covert back to bam_hdr_t struct -- free(h->text); -- h->text = strdup(sam_hdr_str(cram_fd_get_header(out))); -- h->l_text = sam_hdr_length(cram_fd_get_header(out)); -- if (!h->text) -- goto err; -- } -- -- if (sam_hdr_write(h_out, h) != 0) -+ if (sam_hdr_write(h_out, cram_h) != 0) - goto err; - cram_set_option(out, CRAM_OPT_REFERENCE, NULL); - -@@ -192,14 +182,16 @@ - * -1 on general failure; - * -2 on failure due to insufficient size - */ --int cram_reheader_inplace2(cram_fd *fd, const bam_hdr_t *h, const char *arg_list, -- int add_PG) -+int cram_reheader_inplace2(cram_fd *fd, sam_hdr_t *h, const char *arg_list, -+ int no_pg) - { - cram_container *c = NULL; - cram_block *b = NULL; -- SAM_hdr *hdr = NULL; -+ sam_hdr_t *cram_h = NULL; - off_t start; - int ret = -1; -+ if (!h) -+ goto err; - - if (cram_major_vers(fd) < 2 || - cram_major_vers(fd) > 3) { -@@ -208,16 +200,17 @@ - goto err; - } - -- if (!(hdr = sam_hdr_parse_(h->text, h->l_text))) -+ cram_h = sam_hdr_dup(h); -+ if (!cram_h) - goto err; - -- if (add_PG && sam_hdr_add_PG(hdr, "samtools", "VN", samtools_version(), -+ if (!no_pg && sam_hdr_add_pg(cram_h, "samtools", "VN", samtools_version(), - arg_list ? "CL": NULL, - arg_list ? arg_list : NULL, - NULL)) - goto err; - -- int header_len = sam_hdr_length(hdr); -+ int header_len = sam_hdr_length(cram_h); - /* Fix M5 strings? 
Maybe out of scope for this tool */ - - // Load the existing header -@@ -244,7 +237,7 @@ - - cram_block_set_offset(b, 0); // rewind block - int32_put_blk(b, header_len); -- cram_block_append(b, sam_hdr_str(hdr), header_len); -+ cram_block_append(b, (void *)sam_hdr_str(cram_h), header_len); - // Zero the remaining block - memset((char *)cram_block_get_data(b)+cram_block_get_offset(b), 0, - cram_block_get_uncomp_size(b) - cram_block_get_offset(b)); -@@ -265,7 +258,7 @@ - err: - if (c) cram_free_container(c); - if (b) cram_free_block(b); -- if (hdr) sam_hdr_free(hdr); -+ if (cram_h) sam_hdr_destroy(cram_h); - - return ret; - } -@@ -286,16 +279,18 @@ - * -1 on general failure; - * -2 on failure due to insufficient size - */ --int cram_reheader_inplace3(cram_fd *fd, const bam_hdr_t *h, const char *arg_list, -- int add_PG) -+int cram_reheader_inplace3(cram_fd *fd, sam_hdr_t *h, const char *arg_list, -+ int no_pg) - { - cram_container *c = NULL; - cram_block *b = NULL; -- SAM_hdr *hdr = NULL; -+ sam_hdr_t *cram_h = NULL; - off_t start, sz, end; - int container_sz, max_container_sz; - char *buf = NULL; - int ret = -1; -+ if (!h) -+ goto err; - - if (cram_major_vers(fd) < 2 || - cram_major_vers(fd) > 3) { -@@ -304,16 +299,17 @@ - goto err; - } - -- if (!(hdr = sam_hdr_parse_(h->text, h->l_text))) -+ cram_h = sam_hdr_dup(h); -+ if (!cram_h) - goto err; - -- if (add_PG && sam_hdr_add_PG(hdr, "samtools", "VN", samtools_version(), -+ if (!no_pg && sam_hdr_add_pg(cram_h, "samtools", "VN", samtools_version(), - arg_list ? "CL": NULL, - arg_list ? arg_list : NULL, - NULL)) - goto err; - -- int header_len = sam_hdr_length(hdr); -+ int header_len = sam_hdr_length(cram_h); - /* Fix M5 strings? 
Maybe out of scope for this tool */ - - // Find current size of SAM header block -@@ -381,7 +377,7 @@ - // Version 3.0 supports compressed header - b = cram_new_block(FILE_HEADER, 0); - int32_put_blk(b, header_len); -- cram_block_append(b, sam_hdr_str(hdr), header_len); -+ cram_block_append(b, (void *)sam_hdr_str(cram_h), header_len); - cram_block_update_size(b); - - cram_compress_block(fd, b, NULL, -1, -1); -@@ -416,17 +412,17 @@ - if (c) cram_free_container(c); - if (buf) free(buf); - if (b) cram_free_block(b); -- if (hdr) sam_hdr_free(hdr); -+ if (cram_h) sam_hdr_destroy(cram_h); - - return ret; - } - --int cram_reheader_inplace(cram_fd *fd, const bam_hdr_t *h, const char *arg_list, -- int add_PG) -+int cram_reheader_inplace(cram_fd *fd, sam_hdr_t *h, const char *arg_list, -+ int no_pg) - { - switch (cram_major_vers(fd)) { -- case 2: return cram_reheader_inplace2(fd, h, arg_list, add_PG); -- case 3: return cram_reheader_inplace3(fd, h, arg_list, add_PG); -+ case 2: return cram_reheader_inplace2(fd, h, arg_list, no_pg); -+ case 3: return cram_reheader_inplace3(fd, h, arg_list, no_pg); - default: - fprintf(stderr, "[%s] unsupported CRAM version %d\n", __func__, - cram_major_vers(fd)); -@@ -437,33 +433,124 @@ - static void usage(FILE *fp, int ret) { - fprintf(fp, - "Usage: samtools reheader [-P] in.header.sam in.bam > out.bam\n" -- " or samtools reheader [-P] -i in.header.sam file.bam\n" -+ " or samtools reheader [-P] -i in.header.sam file.cram\n" -+ " or samtools reheader -c CMD in.bam\n" -+ " or samtools reheader -c CMD in.cram\n" - "\n" - "Options:\n" -- " -P, --no-PG Do not generate an @PG header line.\n" -- " -i, --in-place Modify the bam/cram file directly.\n" -- " (Defaults to outputting to stdout.)\n"); -+ " -P, --no-PG Do not generate a @PG header line.\n" -+ " -i, --in-place Modify the CRAM file directly, if possible.\n" -+ " (Defaults to outputting to stdout.)\n" -+ " -c, --command CMD Pass the header in SAM format to external program CMD.\n"); - 
exit(ret); - } - -+static sam_hdr_t* external_reheader(samFile* in, const char* external) { -+ char *command = NULL; -+ sam_hdr_t* h = NULL; -+ sam_hdr_t* ih = sam_hdr_read(in); -+ if (ih == NULL) { -+ fprintf(stderr, "[%s] failed to read the header for '%s'.\n", __func__, in->fn); -+ return NULL; -+ } -+ char tmp_fn[] = "reheaderXXXXXX"; -+ int tmp_fd = mkstemp(tmp_fn); -+ if (tmp_fd < 0) { -+ print_error_errno("reheader", "fail to open temp file '%s'", tmp_fn); -+ return NULL; -+ } -+ hFILE* tmp_hf = hdopen(tmp_fd, "w"); -+ if (!tmp_hf) { -+ fprintf(stderr, "[%s] failed to convert to hFILE.\n", __func__); -+ goto cleanup; -+ } -+ samFile* tmp_sf = hts_hopen(tmp_hf, tmp_fn, "w"); -+ if (!tmp_sf) { -+ fprintf(stderr, "[%s] failed to convert to samFile.\n", __func__); -+ goto cleanup; -+ } -+ if (-1 == sam_hdr_write(tmp_sf, ih)) { -+ fprintf(stderr, "[%s] failed to write the header to the temp file.\n", __func__); -+ goto cleanup; -+ } -+ sam_close(tmp_sf); -+ sam_hdr_destroy(ih); -+ int comm_len = strlen(external) + strlen(tmp_fn) + 8; -+ command = calloc(comm_len, 1); -+ if (!command || snprintf(command, comm_len, "( %s ) < %s", external, tmp_fn) != comm_len - 1) { -+ fprintf(stderr, "[%s] failed to create command string.\n", __func__); -+ goto cleanup; -+ } -+ FILE* nh = popen(command, "r"); -+ if (!nh) { -+ print_error_errno("reheader", "[%s] failed to run external command '%s'.\n", __func__, command); -+ goto cleanup; -+ } -+ -+ int nh_fd = dup(fileno(nh)); -+ if (nh_fd < 0) { -+ fprintf(stderr, "[%s] failed to get the file descriptor.\n", __func__); -+ goto cleanup; -+ } -+ hFILE* nh_hf = hdopen(nh_fd, "r"); -+ if (!nh_hf) { -+ fprintf(stderr, "[%s] failed to convert to hFILE.\n", __func__); -+ goto cleanup; -+ } -+ samFile* nh_sf = hts_hopen(nh_hf, tmp_fn, "r"); -+ if (!nh_sf) { -+ fprintf(stderr, "[%s] failed to convert to samFile.\n", __func__); -+ goto cleanup; -+ } -+ -+ h = sam_hdr_read(nh_sf); -+ sam_close(nh_sf); -+ if (h == NULL) { -+ fprintf(stderr, 
"[%s] failed to read the header from the temp file.\n", __func__); -+ } -+ int res = pclose(nh); -+ if (res != 0) { -+ if (res < 0) { -+ print_error_errno("reheader", -+ "Error on closing pipe from command '%s'.\n", -+ command); -+ } else { -+ print_error("reheader", -+ "Non-zero exit code returned by command '%s'\n", -+ command); -+ } -+ if (h) sam_hdr_destroy(h); -+ h = NULL; -+ } -+cleanup: -+ free(command); -+ if (unlink(tmp_fn) != 0) { -+ print_error_errno("reheader", "failed to remove the temp file '%s'", tmp_fn); -+ } -+ -+ return h; -+} -+ - int main_reheader(int argc, char *argv[]) - { -- int inplace = 0, r, add_PG = 1, c; -- bam_hdr_t *h; -+ int inplace = 0, r, no_pg = 0, c, skip_header = 0; -+ sam_hdr_t *h; - samFile *in; -- char *arg_list = stringify_argv(argc+1, argv-1); -+ char *arg_list = NULL, *external = NULL; - - static const struct option lopts[] = { - {"help", no_argument, NULL, 'h'}, - {"in-place", no_argument, NULL, 'i'}, - {"no-PG", no_argument, NULL, 'P'}, -+ {"command", required_argument, NULL, 'c'}, - {NULL, 0, NULL, 0} - }; - -- while ((c = getopt_long(argc, argv, "hiP", lopts, NULL)) >= 0) { -+ while ((c = getopt_long(argc, argv, "hiPc:", lopts, NULL)) >= 0) { - switch (c) { -- case 'P': add_PG = 0; break; -+ case 'P': no_pg = 1; break; - case 'i': inplace = 1; break; -+ case 'c': external = optarg; break; - case 'h': usage(stdout, 0); break; - default: - fprintf(stderr, "Invalid option '%c'\n", c); -@@ -471,10 +558,29 @@ - } - } - -- if (argc - optind != 2) -+ if ((argc - optind != 2 || external) && (argc - optind != 1 || !external)) - usage(stderr, 1); - -- { // read the header -+ if (!no_pg && !(arg_list = stringify_argv(argc+1, argv-1))) { -+ print_error("reheader", "failed to create arg_list"); -+ return 1; -+ } -+ -+ if (external) { -+ skip_header = 1; -+ in = sam_open(argv[optind], inplace?"r+":"r"); -+ if (in == 0) { -+ print_error_errno("reheader", "fail to open file '%s'", argv[optind]); -+ return 1; -+ } -+ -+ h = 
external_reheader(in, external); -+ if (h == NULL) { -+ fprintf(stderr, "[%s] failed to read the header from '%s'.\n", __func__, external); -+ sam_close(in); -+ return 1; -+ } -+ } else { // read the header from a separate file - samFile *fph = sam_open(argv[optind], "r"); - if (fph == 0) { - print_error_errno("reheader", "fail to read the header from '%s'", argv[optind]); -@@ -487,25 +593,34 @@ - __func__, argv[1]); - return 1; - } -+ in = sam_open(argv[optind+1], inplace?"r+":"r"); -+ if (in == 0) { -+ print_error_errno("reheader", "fail to open file '%s'", argv[optind+1]); -+ return 1; -+ } - } -- in = sam_open(argv[optind+1], inplace?"r+":"r"); -- if (in == 0) { -- print_error_errno("reheader", "fail to open file '%s'", argv[optind+1]); -- return 1; -- } -+ - if (hts_get_format(in)->format == bam) { -- r = bam_reheader(in->fp.bgzf, h, fileno(stdout), arg_list, add_PG); -- } else { -+ if (inplace) { -+ print_error("reheader", "cannot reheader BAM '%s' in-place", argv[optind+1]); -+ r = -1; -+ } else { -+ r = bam_reheader(in->fp.bgzf, h, fileno(stdout), arg_list, no_pg, skip_header); -+ } -+ } else if (hts_get_format(in)->format == cram) { - if (inplace) -- r = cram_reheader_inplace(in->fp.cram, h, arg_list, add_PG); -+ r = cram_reheader_inplace(in->fp.cram, h, arg_list, no_pg); - else -- r = cram_reheader(in->fp.cram, h, arg_list, add_PG); -+ r = cram_reheader(in->fp.cram, h, arg_list, no_pg); -+ } else { -+ print_error("reheader", "input file '%s' must be BAM or CRAM", argv[optind+1]); -+ r = -1; - } - - if (sam_close(in) != 0) - r = -1; - -- bam_hdr_destroy(h); -+ sam_hdr_destroy(h); - - if (arg_list) - free(arg_list); ---- python-pysam.orig/samtools/bam_reheader.c.pysam.c -+++ python-pysam/samtools/bam_reheader.c.pysam.c -@@ -3,7 +3,7 @@ - /* bam_reheader.c -- reheader subcommand. - - Copyright (C) 2010 Broad Institute. -- Copyright (C) 2012-2015 Genome Research Ltd. -+ Copyright (C) 2012-2019 Genome Research Ltd. 
- - Author: Heng Li - -@@ -31,6 +31,7 @@ - #include - #include - #include -+#include - - #include "htslib/bgzf.h" - #include "htslib/sam.h" -@@ -44,50 +45,44 @@ - * Reads a file and outputs a new BAM file to fd with 'h' replaced as - * the header. No checks are made to the validity. - */ --int bam_reheader(BGZF *in, bam_hdr_t *h, int fd, -- const char *arg_list, int add_PG) -+int bam_reheader(BGZF *in, sam_hdr_t *h, int fd, -+ const char *arg_list, int no_pg, int skip_header) - { - BGZF *fp = NULL; - ssize_t len; - uint8_t *buf = NULL; -- SAM_hdr *sh = NULL; -+ sam_hdr_t *tmp; -+ if (!h) -+ return -1; -+ - if (in->is_write) return -1; - buf = malloc(BUF_SIZE); - if (!buf) { - fprintf(samtools_stderr, "Out of memory\n"); - return -1; - } -- if (bam_hdr_read(in) == NULL) { -- fprintf(samtools_stderr, "Couldn't read header\n"); -- goto fail; -+ -+ if (!skip_header) { -+ if ((tmp = bam_hdr_read(in)) == NULL) { -+ fprintf(samtools_stderr, "Couldn't read header\n"); -+ goto fail; -+ } -+ sam_hdr_destroy(tmp); - } -+ - fp = bgzf_fdopen(fd, "w"); - if (!fp) { - print_error_errno("reheader", "Couldn't open output file"); - goto fail; - } - -- if (add_PG) { -- // Around the houses, but it'll do until we can manipulate bam_hdr_t natively. -- sh = sam_hdr_parse_(h->text, h->l_text); -- if (!sh) -- goto fail; -- if (sam_hdr_add_PG(sh, "samtools", -+ if (!no_pg && sam_hdr_add_pg(h, "samtools", - "VN", samtools_version(), - arg_list ? "CL": NULL, - arg_list ? 
arg_list : NULL, - NULL) != 0) - goto fail; - -- free(h->text); -- h->text = strdup(sam_hdr_str(sh)); -- h->l_text = sam_hdr_length(sh); -- if (!h->text) -- goto fail; -- sam_hdr_free(sh); -- sh = NULL; -- } -- - if (bam_hdr_write(fp, h) < 0) { - print_error_errno("reheader", "Couldn't write header"); - goto fail; -@@ -116,7 +111,6 @@ - fail: - bgzf_close(fp); - free(buf); -- sam_hdr_free(sh); - return -1; - } - -@@ -126,32 +120,28 @@ - * - * FIXME: error checking - */ --int cram_reheader(cram_fd *in, bam_hdr_t *h, const char *arg_list, int add_PG) -+int cram_reheader(cram_fd *in, sam_hdr_t *h, const char *arg_list, int no_pg) - { - htsFile *h_out = hts_open("-", "wc"); - cram_fd *out = h_out->fp.cram; - cram_container *c = NULL; - int ret = -1; -+ if (!h) -+ return ret; - - // Attempt to fill out a cram->refs[] array from @SQ headers -- cram_fd_set_header(out, sam_hdr_parse_(h->text, h->l_text)); -- if (add_PG) { -- if (sam_hdr_add_PG(cram_fd_get_header(out), "samtools", -+ sam_hdr_t *cram_h = sam_hdr_dup(h); -+ if (!cram_h) -+ return -1; -+ cram_fd_set_header(out, cram_h); -+ if (!no_pg && sam_hdr_add_pg(cram_fd_get_header(out), "samtools", - "VN", samtools_version(), - arg_list ? "CL": NULL, - arg_list ? 
arg_list : NULL, -- NULL) != 0) -+ NULL)) - goto err; - -- // Covert back to bam_hdr_t struct -- free(h->text); -- h->text = strdup(sam_hdr_str(cram_fd_get_header(out))); -- h->l_text = sam_hdr_length(cram_fd_get_header(out)); -- if (!h->text) -- goto err; -- } -- -- if (sam_hdr_write(h_out, h) != 0) -+ if (sam_hdr_write(h_out, cram_h) != 0) - goto err; - cram_set_option(out, CRAM_OPT_REFERENCE, NULL); - -@@ -194,14 +184,16 @@ - * -1 on general failure; - * -2 on failure due to insufficient size - */ --int cram_reheader_inplace2(cram_fd *fd, const bam_hdr_t *h, const char *arg_list, -- int add_PG) -+int cram_reheader_inplace2(cram_fd *fd, sam_hdr_t *h, const char *arg_list, -+ int no_pg) - { - cram_container *c = NULL; - cram_block *b = NULL; -- SAM_hdr *hdr = NULL; -+ sam_hdr_t *cram_h = NULL; - off_t start; - int ret = -1; -+ if (!h) -+ goto err; - - if (cram_major_vers(fd) < 2 || - cram_major_vers(fd) > 3) { -@@ -210,16 +202,17 @@ - goto err; - } - -- if (!(hdr = sam_hdr_parse_(h->text, h->l_text))) -+ cram_h = sam_hdr_dup(h); -+ if (!cram_h) - goto err; - -- if (add_PG && sam_hdr_add_PG(hdr, "samtools", "VN", samtools_version(), -+ if (!no_pg && sam_hdr_add_pg(cram_h, "samtools", "VN", samtools_version(), - arg_list ? "CL": NULL, - arg_list ? arg_list : NULL, - NULL)) - goto err; - -- int header_len = sam_hdr_length(hdr); -+ int header_len = sam_hdr_length(cram_h); - /* Fix M5 strings? 
Maybe out of scope for this tool */ - - // Load the existing header -@@ -246,7 +239,7 @@ - - cram_block_set_offset(b, 0); // rewind block - int32_put_blk(b, header_len); -- cram_block_append(b, sam_hdr_str(hdr), header_len); -+ cram_block_append(b, (void *)sam_hdr_str(cram_h), header_len); - // Zero the remaining block - memset((char *)cram_block_get_data(b)+cram_block_get_offset(b), 0, - cram_block_get_uncomp_size(b) - cram_block_get_offset(b)); -@@ -267,7 +260,7 @@ - err: - if (c) cram_free_container(c); - if (b) cram_free_block(b); -- if (hdr) sam_hdr_free(hdr); -+ if (cram_h) sam_hdr_destroy(cram_h); - - return ret; - } -@@ -288,16 +281,18 @@ - * -1 on general failure; - * -2 on failure due to insufficient size - */ --int cram_reheader_inplace3(cram_fd *fd, const bam_hdr_t *h, const char *arg_list, -- int add_PG) -+int cram_reheader_inplace3(cram_fd *fd, sam_hdr_t *h, const char *arg_list, -+ int no_pg) - { - cram_container *c = NULL; - cram_block *b = NULL; -- SAM_hdr *hdr = NULL; -+ sam_hdr_t *cram_h = NULL; - off_t start, sz, end; - int container_sz, max_container_sz; - char *buf = NULL; - int ret = -1; -+ if (!h) -+ goto err; - - if (cram_major_vers(fd) < 2 || - cram_major_vers(fd) > 3) { -@@ -306,16 +301,17 @@ - goto err; - } - -- if (!(hdr = sam_hdr_parse_(h->text, h->l_text))) -+ cram_h = sam_hdr_dup(h); -+ if (!cram_h) - goto err; - -- if (add_PG && sam_hdr_add_PG(hdr, "samtools", "VN", samtools_version(), -+ if (!no_pg && sam_hdr_add_pg(cram_h, "samtools", "VN", samtools_version(), - arg_list ? "CL": NULL, - arg_list ? arg_list : NULL, - NULL)) - goto err; - -- int header_len = sam_hdr_length(hdr); -+ int header_len = sam_hdr_length(cram_h); - /* Fix M5 strings? 
Maybe out of scope for this tool */ - - // Find current size of SAM header block -@@ -383,7 +379,7 @@ - // Version 3.0 supports compressed header - b = cram_new_block(FILE_HEADER, 0); - int32_put_blk(b, header_len); -- cram_block_append(b, sam_hdr_str(hdr), header_len); -+ cram_block_append(b, (void *)sam_hdr_str(cram_h), header_len); - cram_block_update_size(b); - - cram_compress_block(fd, b, NULL, -1, -1); -@@ -418,17 +414,17 @@ - if (c) cram_free_container(c); - if (buf) free(buf); - if (b) cram_free_block(b); -- if (hdr) sam_hdr_free(hdr); -+ if (cram_h) sam_hdr_destroy(cram_h); - - return ret; - } - --int cram_reheader_inplace(cram_fd *fd, const bam_hdr_t *h, const char *arg_list, -- int add_PG) -+int cram_reheader_inplace(cram_fd *fd, sam_hdr_t *h, const char *arg_list, -+ int no_pg) - { - switch (cram_major_vers(fd)) { -- case 2: return cram_reheader_inplace2(fd, h, arg_list, add_PG); -- case 3: return cram_reheader_inplace3(fd, h, arg_list, add_PG); -+ case 2: return cram_reheader_inplace2(fd, h, arg_list, no_pg); -+ case 3: return cram_reheader_inplace3(fd, h, arg_list, no_pg); - default: - fprintf(samtools_stderr, "[%s] unsupported CRAM version %d\n", __func__, - cram_major_vers(fd)); -@@ -439,33 +435,124 @@ - static void usage(FILE *fp, int ret) { - fprintf(fp, - "Usage: samtools reheader [-P] in.header.sam in.bam > out.bam\n" -- " or samtools reheader [-P] -i in.header.sam file.bam\n" -+ " or samtools reheader [-P] -i in.header.sam file.cram\n" -+ " or samtools reheader -c CMD in.bam\n" -+ " or samtools reheader -c CMD in.cram\n" - "\n" - "Options:\n" -- " -P, --no-PG Do not generate an @PG header line.\n" -- " -i, --in-place Modify the bam/cram file directly.\n" -- " (Defaults to outputting to samtools_stdout.)\n"); -+ " -P, --no-PG Do not generate a @PG header line.\n" -+ " -i, --in-place Modify the CRAM file directly, if possible.\n" -+ " (Defaults to outputting to samtools_stdout.)\n" -+ " -c, --command CMD Pass the header in SAM format to external 
program CMD.\n"); - exit(ret); - } - -+static sam_hdr_t* external_reheader(samFile* in, const char* external) { -+ char *command = NULL; -+ sam_hdr_t* h = NULL; -+ sam_hdr_t* ih = sam_hdr_read(in); -+ if (ih == NULL) { -+ fprintf(samtools_stderr, "[%s] failed to read the header for '%s'.\n", __func__, in->fn); -+ return NULL; -+ } -+ char tmp_fn[] = "reheaderXXXXXX"; -+ int tmp_fd = mkstemp(tmp_fn); -+ if (tmp_fd < 0) { -+ print_error_errno("reheader", "fail to open temp file '%s'", tmp_fn); -+ return NULL; -+ } -+ hFILE* tmp_hf = hdopen(tmp_fd, "w"); -+ if (!tmp_hf) { -+ fprintf(samtools_stderr, "[%s] failed to convert to hFILE.\n", __func__); -+ goto cleanup; -+ } -+ samFile* tmp_sf = hts_hopen(tmp_hf, tmp_fn, "w"); -+ if (!tmp_sf) { -+ fprintf(samtools_stderr, "[%s] failed to convert to samFile.\n", __func__); -+ goto cleanup; -+ } -+ if (-1 == sam_hdr_write(tmp_sf, ih)) { -+ fprintf(samtools_stderr, "[%s] failed to write the header to the temp file.\n", __func__); -+ goto cleanup; -+ } -+ sam_close(tmp_sf); -+ sam_hdr_destroy(ih); -+ int comm_len = strlen(external) + strlen(tmp_fn) + 8; -+ command = calloc(comm_len, 1); -+ if (!command || snprintf(command, comm_len, "( %s ) < %s", external, tmp_fn) != comm_len - 1) { -+ fprintf(samtools_stderr, "[%s] failed to create command string.\n", __func__); -+ goto cleanup; -+ } -+ FILE* nh = popen(command, "r"); -+ if (!nh) { -+ print_error_errno("reheader", "[%s] failed to run external command '%s'.\n", __func__, command); -+ goto cleanup; -+ } -+ -+ int nh_fd = dup(fileno(nh)); -+ if (nh_fd < 0) { -+ fprintf(samtools_stderr, "[%s] failed to get the file descriptor.\n", __func__); -+ goto cleanup; -+ } -+ hFILE* nh_hf = hdopen(nh_fd, "r"); -+ if (!nh_hf) { -+ fprintf(samtools_stderr, "[%s] failed to convert to hFILE.\n", __func__); -+ goto cleanup; -+ } -+ samFile* nh_sf = hts_hopen(nh_hf, tmp_fn, "r"); -+ if (!nh_sf) { -+ fprintf(samtools_stderr, "[%s] failed to convert to samFile.\n", __func__); -+ goto cleanup; -+ } 
-+ -+ h = sam_hdr_read(nh_sf); -+ sam_close(nh_sf); -+ if (h == NULL) { -+ fprintf(samtools_stderr, "[%s] failed to read the header from the temp file.\n", __func__); -+ } -+ int res = pclose(nh); -+ if (res != 0) { -+ if (res < 0) { -+ print_error_errno("reheader", -+ "Error on closing pipe from command '%s'.\n", -+ command); -+ } else { -+ print_error("reheader", -+ "Non-zero exit code returned by command '%s'\n", -+ command); -+ } -+ if (h) sam_hdr_destroy(h); -+ h = NULL; -+ } -+cleanup: -+ free(command); -+ if (unlink(tmp_fn) != 0) { -+ print_error_errno("reheader", "failed to remove the temp file '%s'", tmp_fn); -+ } -+ -+ return h; -+} -+ - int main_reheader(int argc, char *argv[]) - { -- int inplace = 0, r, add_PG = 1, c; -- bam_hdr_t *h; -+ int inplace = 0, r, no_pg = 0, c, skip_header = 0; -+ sam_hdr_t *h; - samFile *in; -- char *arg_list = stringify_argv(argc+1, argv-1); -+ char *arg_list = NULL, *external = NULL; - - static const struct option lopts[] = { - {"help", no_argument, NULL, 'h'}, - {"in-place", no_argument, NULL, 'i'}, - {"no-PG", no_argument, NULL, 'P'}, -+ {"command", required_argument, NULL, 'c'}, - {NULL, 0, NULL, 0} - }; - -- while ((c = getopt_long(argc, argv, "hiP", lopts, NULL)) >= 0) { -+ while ((c = getopt_long(argc, argv, "hiPc:", lopts, NULL)) >= 0) { - switch (c) { -- case 'P': add_PG = 0; break; -+ case 'P': no_pg = 1; break; - case 'i': inplace = 1; break; -+ case 'c': external = optarg; break; - case 'h': usage(samtools_stdout, 0); break; - default: - fprintf(samtools_stderr, "Invalid option '%c'\n", c); -@@ -473,10 +560,29 @@ - } - } - -- if (argc - optind != 2) -+ if ((argc - optind != 2 || external) && (argc - optind != 1 || !external)) - usage(samtools_stderr, 1); - -- { // read the header -+ if (!no_pg && !(arg_list = stringify_argv(argc+1, argv-1))) { -+ print_error("reheader", "failed to create arg_list"); -+ return 1; -+ } -+ -+ if (external) { -+ skip_header = 1; -+ in = sam_open(argv[optind], inplace?"r+":"r"); -+ if 
(in == 0) { -+ print_error_errno("reheader", "fail to open file '%s'", argv[optind]); -+ return 1; -+ } -+ -+ h = external_reheader(in, external); -+ if (h == NULL) { -+ fprintf(samtools_stderr, "[%s] failed to read the header from '%s'.\n", __func__, external); -+ sam_close(in); -+ return 1; -+ } -+ } else { // read the header from a separate file - samFile *fph = sam_open(argv[optind], "r"); - if (fph == 0) { - print_error_errno("reheader", "fail to read the header from '%s'", argv[optind]); -@@ -489,25 +595,34 @@ - __func__, argv[1]); - return 1; - } -+ in = sam_open(argv[optind+1], inplace?"r+":"r"); -+ if (in == 0) { -+ print_error_errno("reheader", "fail to open file '%s'", argv[optind+1]); -+ return 1; -+ } - } -- in = sam_open(argv[optind+1], inplace?"r+":"r"); -- if (in == 0) { -- print_error_errno("reheader", "fail to open file '%s'", argv[optind+1]); -- return 1; -- } -+ - if (hts_get_format(in)->format == bam) { -- r = bam_reheader(in->fp.bgzf, h, fileno(samtools_stdout), arg_list, add_PG); -- } else { -+ if (inplace) { -+ print_error("reheader", "cannot reheader BAM '%s' in-place", argv[optind+1]); -+ r = -1; -+ } else { -+ r = bam_reheader(in->fp.bgzf, h, fileno(samtools_stdout), arg_list, no_pg, skip_header); -+ } -+ } else if (hts_get_format(in)->format == cram) { - if (inplace) -- r = cram_reheader_inplace(in->fp.cram, h, arg_list, add_PG); -+ r = cram_reheader_inplace(in->fp.cram, h, arg_list, no_pg); - else -- r = cram_reheader(in->fp.cram, h, arg_list, add_PG); -+ r = cram_reheader(in->fp.cram, h, arg_list, no_pg); -+ } else { -+ print_error("reheader", "input file '%s' must be BAM or CRAM", argv[optind+1]); -+ r = -1; - } - - if (sam_close(in) != 0) - r = -1; - -- bam_hdr_destroy(h); -+ sam_hdr_destroy(h); - - if (arg_list) - free(arg_list); ---- python-pysam.orig/samtools/bam_rmdup.c -+++ python-pysam/samtools/bam_rmdup.c -@@ -1,6 +1,6 @@ - /* bam_rmdup.c -- duplicate read detection. - -- Copyright (C) 2009, 2015 Genome Research Ltd. 
-+ Copyright (C) 2009, 2015, 2016, 2019 Genome Research Ltd. - Portions copyright (C) 2009 Broad Institute. - - Author: Heng Li -@@ -63,7 +63,7 @@ - stack->a[stack->n++] = b; - } - --static inline int dump_best(tmp_stack_t *stack, samFile *out, bam_hdr_t *hdr) -+static inline int dump_best(tmp_stack_t *stack, samFile *out, sam_hdr_t *hdr) - { - int i; - for (i = 0; i != stack->n; ++i) { -@@ -127,7 +127,7 @@ - return q; - } - --int bam_rmdup_core(samFile *in, bam_hdr_t *hdr, samFile *out) -+int bam_rmdup_core(samFile *in, sam_hdr_t *hdr, samFile *out) - { - bam1_t *b = NULL; - int last_tid = -1, last_pos = -1, r; -@@ -165,7 +165,7 @@ - break; - } - last_tid = c->tid; -- fprintf(stderr, "[bam_rmdup_core] processing reference %s...\n", hdr->target_name[c->tid]); -+ fprintf(stderr, "[bam_rmdup_core] processing reference %s...\n", sam_hdr_tid2name(hdr, c->tid)); - } - } - if (!(c->flag&BAM_FPAIRED) || (c->flag&(BAM_FUNMAP|BAM_FMUNMAP)) || (c->mtid >= 0 && c->tid != c->mtid)) { -@@ -179,13 +179,16 @@ - q = lib? get_aux(aux, lib) : get_aux(aux, "\t"); - ++q->n_checked; - k = kh_put(pos, q->best_hash, key, &ret); -+ if (ret < 0) goto fail; - if (ret == 0) { // found in best_hash - bam1_t *p = kh_val(q->best_hash, k); - ++q->n_removed; - if (sum_qual(p) < sum_qual(b)) { // the current alignment is better; this can be accelerated in principle - kh_put(name, del_set, strdup(bam_get_qname(p)), &ret); // p will be removed -- bam_copy1(p, b); // replaced as b -+ if (ret < 0) goto fail; -+ if (bam_copy1(p, b) == NULL) goto fail; // replaced as b - } else kh_put(name, del_set, strdup(bam_get_qname(b)), &ret); // b will be removed -+ if (ret < 0) goto fail; - if (ret == 0) - fprintf(stderr, "[bam_rmdup_core] inconsistent BAM file for pair '%s'. 
Continue anyway.\n", bam_get_qname(b)); - } else { // not found in best_hash -@@ -250,7 +253,7 @@ - return 1; - } - --int bam_rmdupse_core(samFile *in, bam_hdr_t *hdr, samFile *out, int force_se); -+int bam_rmdupse_core(samFile *in, sam_hdr_t *hdr, samFile *out, int force_se); - - static int rmdup_usage(void) { - fprintf(stderr, "\n"); -@@ -258,7 +261,7 @@ - fprintf(stderr, "Option: -s rmdup for SE reads\n"); - fprintf(stderr, " -S treat PE reads as SE in rmdup (force -s)\n"); - -- sam_global_opt_help(stderr, "-....-"); -+ sam_global_opt_help(stderr, "-....--."); - return 1; - } - -@@ -266,7 +269,7 @@ - { - int c, ret, is_se = 0, force_se = 0; - samFile *in, *out; -- bam_hdr_t *header; -+ sam_hdr_t *header; - char wmode[3] = {'w', 'b', 0}; - sam_global_args ga = SAM_GLOBAL_ARGS_INIT; - -@@ -293,7 +296,7 @@ - return 1; - } - header = sam_hdr_read(in); -- if (header == NULL || header->n_targets == 0) { -+ if (header == NULL || sam_hdr_nref(header) == 0) { - fprintf(stderr, "[bam_rmdup] input SAM does not have header. Abort!\n"); - return 1; - } -@@ -312,7 +315,7 @@ - if (is_se) ret = bam_rmdupse_core(in, header, out, force_se); - else ret = bam_rmdup_core(in, header, out); - -- bam_hdr_destroy(header); -+ sam_hdr_destroy(header); - sam_close(in); - if (sam_close(out) < 0) { - fprintf(stderr, "[bam_rmdup] error closing output file\n"); ---- python-pysam.orig/samtools/bam_rmdup.c.pysam.c -+++ python-pysam/samtools/bam_rmdup.c.pysam.c -@@ -2,7 +2,7 @@ - - /* bam_rmdup.c -- duplicate read detection. - -- Copyright (C) 2009, 2015 Genome Research Ltd. -+ Copyright (C) 2009, 2015, 2016, 2019 Genome Research Ltd. - Portions copyright (C) 2009 Broad Institute. 
- - Author: Heng Li -@@ -65,7 +65,7 @@ - stack->a[stack->n++] = b; - } - --static inline int dump_best(tmp_stack_t *stack, samFile *out, bam_hdr_t *hdr) -+static inline int dump_best(tmp_stack_t *stack, samFile *out, sam_hdr_t *hdr) - { - int i; - for (i = 0; i != stack->n; ++i) { -@@ -129,7 +129,7 @@ - return q; - } - --int bam_rmdup_core(samFile *in, bam_hdr_t *hdr, samFile *out) -+int bam_rmdup_core(samFile *in, sam_hdr_t *hdr, samFile *out) - { - bam1_t *b = NULL; - int last_tid = -1, last_pos = -1, r; -@@ -167,7 +167,7 @@ - break; - } - last_tid = c->tid; -- fprintf(samtools_stderr, "[bam_rmdup_core] processing reference %s...\n", hdr->target_name[c->tid]); -+ fprintf(samtools_stderr, "[bam_rmdup_core] processing reference %s...\n", sam_hdr_tid2name(hdr, c->tid)); - } - } - if (!(c->flag&BAM_FPAIRED) || (c->flag&(BAM_FUNMAP|BAM_FMUNMAP)) || (c->mtid >= 0 && c->tid != c->mtid)) { -@@ -181,13 +181,16 @@ - q = lib? get_aux(aux, lib) : get_aux(aux, "\t"); - ++q->n_checked; - k = kh_put(pos, q->best_hash, key, &ret); -+ if (ret < 0) goto fail; - if (ret == 0) { // found in best_hash - bam1_t *p = kh_val(q->best_hash, k); - ++q->n_removed; - if (sum_qual(p) < sum_qual(b)) { // the current alignment is better; this can be accelerated in principle - kh_put(name, del_set, strdup(bam_get_qname(p)), &ret); // p will be removed -- bam_copy1(p, b); // replaced as b -+ if (ret < 0) goto fail; -+ if (bam_copy1(p, b) == NULL) goto fail; // replaced as b - } else kh_put(name, del_set, strdup(bam_get_qname(b)), &ret); // b will be removed -+ if (ret < 0) goto fail; - if (ret == 0) - fprintf(samtools_stderr, "[bam_rmdup_core] inconsistent BAM file for pair '%s'. 
Continue anyway.\n", bam_get_qname(b)); - } else { // not found in best_hash -@@ -252,7 +255,7 @@ - return 1; - } - --int bam_rmdupse_core(samFile *in, bam_hdr_t *hdr, samFile *out, int force_se); -+int bam_rmdupse_core(samFile *in, sam_hdr_t *hdr, samFile *out, int force_se); - - static int rmdup_usage(void) { - fprintf(samtools_stderr, "\n"); -@@ -260,7 +263,7 @@ - fprintf(samtools_stderr, "Option: -s rmdup for SE reads\n"); - fprintf(samtools_stderr, " -S treat PE reads as SE in rmdup (force -s)\n"); - -- sam_global_opt_help(samtools_stderr, "-....-"); -+ sam_global_opt_help(samtools_stderr, "-....--."); - return 1; - } - -@@ -268,7 +271,7 @@ - { - int c, ret, is_se = 0, force_se = 0; - samFile *in, *out; -- bam_hdr_t *header; -+ sam_hdr_t *header; - char wmode[3] = {'w', 'b', 0}; - sam_global_args ga = SAM_GLOBAL_ARGS_INIT; - -@@ -295,7 +298,7 @@ - return 1; - } - header = sam_hdr_read(in); -- if (header == NULL || header->n_targets == 0) { -+ if (header == NULL || sam_hdr_nref(header) == 0) { - fprintf(samtools_stderr, "[bam_rmdup] input SAM does not have header. Abort!\n"); - return 1; - } -@@ -314,7 +317,7 @@ - if (is_se) ret = bam_rmdupse_core(in, header, out, force_se); - else ret = bam_rmdup_core(in, header, out); - -- bam_hdr_destroy(header); -+ sam_hdr_destroy(header); - sam_close(in); - if (sam_close(out) < 0) { - fprintf(samtools_stderr, "[bam_rmdup] error closing output file\n"); ---- python-pysam.orig/samtools/bam_rmdupse.c -+++ python-pysam/samtools/bam_rmdupse.c -@@ -1,6 +1,6 @@ - /* bam_rmdupse.c -- duplicate read detection for unpaired reads. - -- Copyright (C) 2009, 2015 Genome Research Ltd. -+ Copyright (C) 2009, 2015, 2016, 2019 Genome Research Ltd. - Portions copyright (C) 2009 Broad Institute. 
- - Author: Heng Li -@@ -84,7 +84,8 @@ - p->discarded = 0; - p->endpos = endpos; p->score = score; - if (p->b == 0) p->b = bam_init1(); -- bam_copy1(p->b, b); -+ if (!p->b) { perror(NULL); exit(EXIT_FAILURE); } -+ if (bam_copy1(p->b, b) == NULL) { perror(NULL); exit(EXIT_FAILURE); } - return p; - } - -@@ -96,7 +97,7 @@ - kh_del(best, h, k); - } - --static int dump_alignment(samFile *out, bam_hdr_t *hdr, -+static int dump_alignment(samFile *out, sam_hdr_t *hdr, - queue_t *queue, int32_t pos, khash_t(lib) *h) - { - if (queue->size > QUEUE_CLEAR_SIZE || pos == MAX_POS) { -@@ -125,7 +126,7 @@ - return 0; - } - --int bam_rmdupse_core(samFile *in, bam_hdr_t *hdr, samFile *out, int force_se) -+int bam_rmdupse_core(samFile *in, sam_hdr_t *hdr, samFile *out, int force_se) - { - bam1_t *b = NULL; - queue_t *queue = NULL; -@@ -179,7 +180,9 @@ - kh_val(h, k) = push_queue(queue, b, endpos, score); - } else { // replace - p->score = score; p->endpos = endpos; -- bam_copy1(p->b, b); -+ if (bam_copy1(p->b, b) == NULL) { -+ perror(NULL); exit(EXIT_FAILURE); -+ } - } - } // otherwise, discard the alignment - } else kh_val(h, k) = push_queue(queue, b, endpos, score); ---- python-pysam.orig/samtools/bam_rmdupse.c.pysam.c -+++ python-pysam/samtools/bam_rmdupse.c.pysam.c -@@ -2,7 +2,7 @@ - - /* bam_rmdupse.c -- duplicate read detection for unpaired reads. - -- Copyright (C) 2009, 2015 Genome Research Ltd. -+ Copyright (C) 2009, 2015, 2016, 2019 Genome Research Ltd. - Portions copyright (C) 2009 Broad Institute. 
- - Author: Heng Li -@@ -86,7 +86,8 @@ - p->discarded = 0; - p->endpos = endpos; p->score = score; - if (p->b == 0) p->b = bam_init1(); -- bam_copy1(p->b, b); -+ if (!p->b) { perror(NULL); exit(EXIT_FAILURE); } -+ if (bam_copy1(p->b, b) == NULL) { perror(NULL); exit(EXIT_FAILURE); } - return p; - } - -@@ -98,7 +99,7 @@ - kh_del(best, h, k); - } - --static int dump_alignment(samFile *out, bam_hdr_t *hdr, -+static int dump_alignment(samFile *out, sam_hdr_t *hdr, - queue_t *queue, int32_t pos, khash_t(lib) *h) - { - if (queue->size > QUEUE_CLEAR_SIZE || pos == MAX_POS) { -@@ -127,7 +128,7 @@ - return 0; - } - --int bam_rmdupse_core(samFile *in, bam_hdr_t *hdr, samFile *out, int force_se) -+int bam_rmdupse_core(samFile *in, sam_hdr_t *hdr, samFile *out, int force_se) - { - bam1_t *b = NULL; - queue_t *queue = NULL; -@@ -181,7 +182,9 @@ - kh_val(h, k) = push_queue(queue, b, endpos, score); - } else { // replace - p->score = score; p->endpos = endpos; -- bam_copy1(p->b, b); -+ if (bam_copy1(p->b, b) == NULL) { -+ perror(NULL); exit(EXIT_FAILURE); -+ } - } - } // otherwise, discard the alignment - } else kh_val(h, k) = push_queue(queue, b, endpos, score); ---- python-pysam.orig/samtools/bam_sort.c -+++ python-pysam/samtools/bam_sort.c -@@ -1,6 +1,6 @@ - /* bam_sort.c -- sorting and merging. - -- Copyright (C) 2008-2016 Genome Research Ltd. -+ Copyright (C) 2008-2019 Genome Research Ltd. - Portions copyright (C) 2009-2012 Broad Institute. - - Author: Heng Li -@@ -44,6 +44,7 @@ - #include "htslib/klist.h" - #include "htslib/kstring.h" - #include "htslib/sam.h" -+#include "htslib/hts_endian.h" - #include "sam_opts.h" - #include "samtools.h" - -@@ -55,7 +56,7 @@ - bam1_t *bam_record; - union { - const uint8_t *tag; -- uint64_t pos; -+ uint8_t pos_tid[12]; - } u; - } bam1_tag; - -@@ -122,12 +123,12 @@ - return *pa? 1 : *pb? 
-1 : 0; - } - --#define HEAP_EMPTY UINT64_MAX -+#define HEAP_EMPTY (UINT64_MAX >> 1) - - typedef struct { - int i; -- uint32_t rev; -- uint64_t pos, idx; -+ uint32_t tid; -+ uint64_t pos:63, rev:1, idx; - bam1_tag entry; - } heap1_t; - -@@ -153,6 +154,7 @@ - fb = b.entry.bam_record->core.flag & 0xc0; - if (fa != fb) return fa > fb; - } else { -+ if (a.tid != b.tid) return a.tid > b.tid; - if (a.pos != b.pos) return a.pos > b.pos; - if (a.rev != b.rev) return a.rev > b.rev; - } -@@ -164,8 +166,7 @@ - KSORT_INIT(heap, heap1_t, heap_lt) - - typedef struct merged_header { -- kstring_t out_hd; -- kstring_t out_sq; -+ sam_hdr_t *hdr; - kstring_t out_rg; - kstring_t out_pg; - kstring_t out_co; -@@ -187,80 +188,6 @@ - bool lost_coord_sort; - } trans_tbl_t; - --/* Something to look like a regmatch_t */ --typedef struct hdr_match { -- ptrdiff_t rm_so; -- ptrdiff_t rm_eo; --} hdr_match_t; -- --/* -- * Search for header lines of a particular record type. -- * -- * This replaces a regex search for something like /^@SQ.*\tSN:([^\t]+).*$/ -- * but is much quicker. The locations found are returned in *matches, -- * which has a signature the same as that of a regmatch_t. -- * -- * rec is the record type to match (i.e. @HD, @SQ, @PG or @RG) -- * tag is a tag type in the record to match (SN for @SQ, ID for @PG or @RG) -- * -- * The location of the record (if found) is returned in matches[0] -- * If tag is not NULL, the record is searched for the presence of the -- * given tag. If found, the location of the value is returned in matches[1]. -- * If the tag isn't found then the record is ignored and the search resumes -- * on the next header line. -- * -- * For simplicity, some assumptions are made about rec and tag: -- * rec should include the leading '@' sign and be three characters long. -- * tag should be exactly two characters long. -- * These are always string constants when this is called below, so we don't -- * bother to check here. 
-- * -- * Returns 0 if a match was found, -1 if not. -- */ -- -- --static int hdr_line_match(const char *text, const char *rec, -- const char *tag, hdr_match_t *matches) { -- const char *line_start, *line_end = text; -- const char *tag_start, *tag_end; -- -- for (;;) { -- // Find record, ensure either at start of text or follows '\n' -- line_start = strstr(line_end, rec); -- while (line_start && line_start > text && *(line_start - 1) != '\n') { -- line_start = strstr(line_start + 3, rec); -- } -- if (!line_start) return -1; -- -- // Find end of header line -- line_end = strchr(line_start, '\n'); -- if (!line_end) line_end = line_start + strlen(line_start); -- -- matches[0].rm_so = line_start - text; -- matches[0].rm_eo = line_end - text; -- if (!tag) return 0; // Match found if not looking for tag. -- -- for (tag_start = line_start + 3; tag_start < line_end; tag_start++) { -- // Find possible tag start. Hacky but quick. -- while (*tag_start > '\n') tag_start++; -- -- // Check it -- if (tag_start[0] == '\t' -- && strncmp(tag_start + 1, tag, 2) == 0 -- && tag_start[3] == ':') { -- // Found tag, record location and return. -- tag_end = tag_start + 4; -- while (*tag_end && *tag_end != '\t' && *tag_end != '\n') -- ++tag_end; -- matches[1].rm_so = tag_start - text + 4; -- matches[1].rm_eo = tag_end - text; -- return 0; -- } -- } -- // Couldn't find tag, try again from end of current record. 
-- } --} -- - static void trans_tbl_destroy(trans_tbl_t *tbl) { - khiter_t iter; - -@@ -299,6 +226,9 @@ - merged_hdr = calloc(1, sizeof(*merged_hdr)); - if (merged_hdr == NULL) return NULL; - -+ merged_hdr->hdr = sam_hdr_init(); -+ if (!merged_hdr->hdr) goto fail; -+ - merged_hdr->targets_sz = 16; - merged_hdr->target_name = malloc(merged_hdr->targets_sz - * sizeof(*merged_hdr->target_name)); -@@ -326,6 +256,7 @@ - kh_destroy(c2i, merged_hdr->sq_tids); - free(merged_hdr->target_name); - free(merged_hdr->target_len); -+ sam_hdr_destroy(merged_hdr->hdr); - free(merged_hdr); - return NULL; - } -@@ -338,12 +269,6 @@ - return kputsn(src + from, to - from, dest) != to - from; - } - --// Append a header line match to kstring --static inline int match_to_ks(const char *src, const hdr_match_t *match, -- kstring_t *dest) { -- return range_to_ks(src, match->rm_so, match->rm_eo, dest); --} -- - // Append a kstring to a kstring - static inline int ks_to_ks(kstring_t *src, kstring_t *dest) { - return kputsn(ks_str(src), ks_len(src), dest) != ks_len(src); -@@ -385,48 +310,32 @@ - */ - - static int trans_tbl_add_hd(merged_header_t* merged_hdr, -- bam_hdr_t *translate) { -- hdr_match_t match = {0, 0}; -+ sam_hdr_t *translate) { -+ kstring_t hd_line = { 0, 0, NULL }; -+ int res; - - // TODO: handle case when @HD needs merging. 
- if (merged_hdr->have_hd) return 0; - -- if (hdr_line_match(translate->text, "@HD", NULL, &match) != 0) { -- return 0; -+ res = sam_hdr_find_hd(translate, &hd_line); -+ if (res < -1) { -+ print_error("merge", "failed to get @HD line from header"); -+ return -1; - } - -- if (match_to_ks(translate->text, &match, &merged_hdr->out_hd)) goto memfail; -- if (kputc('\n', &merged_hdr->out_hd) == EOF) goto memfail; -- merged_hdr->have_hd = true; -- -- return 0; -- -- memfail: -- perror(__func__); -- return -1; --} -+ if (res < 0) // Not found -+ return 0; - --static inline int grow_target_list(merged_header_t* merged_hdr) { -- size_t new_size; -- char **new_names; -- uint32_t *new_len; -- -- new_size = merged_hdr->targets_sz * 2; -- new_names = realloc(merged_hdr->target_name, sizeof(*new_names) * new_size); -- if (!new_names) goto fail; -- merged_hdr->target_name = new_names; -- -- new_len = realloc(merged_hdr->target_len, sizeof(*new_len) * new_size); -- if (!new_len) goto fail; -- merged_hdr->target_len = new_len; -+ if (sam_hdr_add_lines(merged_hdr->hdr, hd_line.s, hd_line.l) < 0) { -+ print_error("merge", "failed to add @HD line to new header"); -+ free(hd_line.s); -+ return -1; -+ } - -- merged_hdr->targets_sz = new_size; -+ free(hd_line.s); -+ merged_hdr->have_hd = true; - - return 0; -- -- fail: -- perror(__func__); -- return -1; - } - - /* -@@ -444,54 +353,48 @@ - * Returns 0 on success, -1 on failure. 
- */ - --static int trans_tbl_add_sq(merged_header_t* merged_hdr, bam_hdr_t *translate, -+static int trans_tbl_add_sq(merged_header_t* merged_hdr, sam_hdr_t *translate, - trans_tbl_t* tbl) { -- -- kstring_t *out_text = &merged_hdr->out_sq; -- khash_t(c2i)* sq_tids = merged_hdr->sq_tids; -- hdr_match_t *new_sq_matches = NULL; -- char *text; -- hdr_match_t matches[2]; - int32_t i; -- int32_t old_n_targets = merged_hdr->n_targets; -- khiter_t iter; -- int min_tid = -1; -+ int min_tid = -1, res; -+ kstring_t sq_line = { 0, 0, NULL }, sq_sn = { 0, 0, NULL }; - - // Fill in the tid part of the translation table, adding new targets - // to the merged header as we go. - -- for (i = 0; i < translate->n_targets; ++i) { -+ for (i = 0; i < sam_hdr_nref(translate); ++i) { -+ int trans_tid; -+ sq_sn.l = 0; -+ res = sam_hdr_find_tag_pos(translate, "SQ", i, "SN", &sq_sn); -+ if (res < 0) { -+ print_error("merge", "failed to get @SQ SN #%d from header", i + 1); -+ goto fail; -+ } - -- // Check if it's a new target. -- iter = kh_get(c2i, sq_tids, translate->target_name[i]); -+ trans_tid = sam_hdr_name2tid(merged_hdr->hdr, sq_sn.s); -+ if (trans_tid < -1) { -+ print_error("merge", "failed to lookup ref"); -+ goto fail; -+ } - -- if (iter == kh_end(sq_tids)) { -- int ret; -+ if (trans_tid < 0) { - // Append missing entries to out_hdr -- -- if (merged_hdr->n_targets == merged_hdr->targets_sz) { -- if (grow_target_list(merged_hdr)) goto fail; -+ sq_line.l = 0; -+ res = sam_hdr_find_line_id(translate, "SQ", "SN", sq_sn.s, &sq_line); -+ if (res < 0) { -+ print_error("merge", "failed to get @SQ SN:%s from header", sq_sn.s); -+ goto fail; - } - -- merged_hdr->target_name[merged_hdr->n_targets] = strdup(translate->target_name[i]); -- if (merged_hdr->target_name[merged_hdr->n_targets] == NULL) goto memfail; -- merged_hdr->target_len[merged_hdr->n_targets] = translate->target_len[i]; -- -- // Record the new identifier for reference below, -- // and when building the ttable for other inputs. 
-- iter = kh_put(c2i, sq_tids, -- merged_hdr->target_name[merged_hdr->n_targets], &ret); -- if (ret < 0) { -- free(merged_hdr->target_name[merged_hdr->n_targets]); -- goto memfail; -- } -- assert(ret > 0); // Should not be in hash already. -+ trans_tid = sam_hdr_nref(merged_hdr->hdr); - -- kh_value(sq_tids, iter) = merged_hdr->n_targets; -- tbl->tid_trans[i] = merged_hdr->n_targets++; -- } else { -- tbl->tid_trans[i] = kh_value(sq_tids, iter); -+ res = sam_hdr_add_lines(merged_hdr->hdr, sq_line.s, sq_line.l); -+ if (res < 0) { -+ print_error("merge", "failed to add @SQ SN:%s to new header", sq_sn.s); -+ goto fail; -+ } - } -+ tbl->tid_trans[i] = trans_tid; - - if (tbl->tid_trans[i] > min_tid) { - min_tid = tbl->tid_trans[i]; -@@ -500,78 +403,14 @@ - } - } - -- if (merged_hdr->n_targets == old_n_targets) -- return 0; // Everything done if no new targets. -- -- // Otherwise, find @SQ lines in translate->text for all newly added targets. -- -- new_sq_matches = malloc((merged_hdr->n_targets - old_n_targets) -- * sizeof(*new_sq_matches)); -- if (new_sq_matches == NULL) goto memfail; -- -- for (i = 0; i < merged_hdr->n_targets - old_n_targets; i++) { -- new_sq_matches[i].rm_so = new_sq_matches[i].rm_eo = -1; -- } -- -- text = translate->text; -- while (hdr_line_match(text, "@SQ", "SN", matches) == 0) { -- // matches[0] is whole line, matches[1] is SN value. -- -- // This is a bit disgusting, but avoids a copy... -- char c = text[matches[1].rm_eo]; -- int idx; -- -- text[matches[1].rm_eo] = '\0'; -- -- // Look up the SN value in the sq_tids hash. -- iter = kh_get(c2i, sq_tids, text + matches[1].rm_so); -- text[matches[1].rm_eo] = c; // restore text -- -- if (iter == kh_end(sq_tids)) { -- // Warn about this, but it's not really fatal. 
-- fprintf(stderr, "[W::%s] @SQ SN (%.*s) found in text header but not binary header.\n", -- __func__, -- (int) (matches[1].rm_eo - matches[1].rm_so), -- text + matches[1].rm_so); -- text += matches[0].rm_eo; -- continue; // Skip to next -- } -- -- idx = kh_value(sq_tids, iter); -- if (idx >= old_n_targets) { -- // is a new SQ, so record position so we can add it to out_text. -- assert(idx < merged_hdr->n_targets); -- ptrdiff_t off = text - translate->text; -- new_sq_matches[idx - old_n_targets].rm_so = matches[0].rm_so + off; -- new_sq_matches[idx - old_n_targets].rm_eo = matches[0].rm_eo + off; -- } -- -- // Carry on searching from end of current match -- text += matches[0].rm_eo; -- } -- -- // Copy the @SQ headers found and recreate any missing from binary header. -- for (i = 0; i < merged_hdr->n_targets - old_n_targets; i++) { -- if (new_sq_matches[i].rm_so >= 0) { -- if (match_to_ks(translate->text, &new_sq_matches[i], out_text)) -- goto memfail; -- if (kputc('\n', out_text) == EOF) goto memfail; -- } else { -- if (kputs("@SQ\tSN:", out_text) == EOF || -- kputs(merged_hdr->target_name[i + old_n_targets], out_text) == EOF || -- kputs("\tLN:", out_text) == EOF || -- kputuw(merged_hdr->target_len[i + old_n_targets], out_text) == EOF || -- kputc('\n', out_text) == EOF) goto memfail; -- } -- } -+ free(sq_line.s); -+ free(sq_sn.s); - -- free(new_sq_matches); - return 0; - -- memfail: -- perror(__func__); - fail: -- free(new_sq_matches); -+ free(sq_line.s); -+ free(sq_sn.s); - return -1; - } - -@@ -592,29 +431,30 @@ - * - */ - --static klist_t(hdrln) * trans_rg_pg(bool is_rg, bam_hdr_t *translate, -+static klist_t(hdrln) * trans_rg_pg(bool is_rg, sam_hdr_t *translate, - bool merge, khash_t(cset)* known_ids, - khash_t(c2c)* id_map, char *override) { -- hdr_match_t matches[2]; - khiter_t iter; -- const char *text = translate->text; -- const char *rec_type = is_rg ? "@RG" : "@PG"; -+ int num_ids, i; -+ const char *rec_type = is_rg ? 
"RG" : "PG"; - klist_t(hdrln) *hdr_lines; - - hdr_lines = kl_init(hdrln); - - // Search through translate's header -- while (hdr_line_match(text, rec_type, "ID", matches) == 0) { -- // matches[0] is the whole @RG/PG line; matches[1] is the ID field value -+ num_ids = sam_hdr_count_lines(translate, rec_type); -+ if (num_ids < 0) -+ goto fail; - -+ for (i = 0; i < num_ids; i++) { - kstring_t orig_id = { 0, 0, NULL }; // ID in original header - kstring_t transformed_id = { 0, 0, NULL }; // ID in output header - char *map_value; // Value to store in id_map - bool id_changed; // Have we changed the ID? - bool not_found_in_output; // ID isn't in the output header (yet) - -- // Take a copy of the ID as we'll need it for a hash key. -- if (match_to_ks(text, &matches[1], &orig_id)) goto memfail; -+ if (sam_hdr_find_tag_pos(translate, rec_type, i, "ID", &orig_id) < 0) -+ goto fail; - - // is our matched ID in our output ID set already? - iter = kh_get(cset, known_ids, ks_str(&orig_id)); -@@ -651,18 +491,38 @@ - - // Does this line need to go into our output header? 
- if (not_found_in_output) { -- - // Take matched line and replace ID with transformed_id - kstring_t new_hdr_line = { 0, 0, NULL }; -+ if (sam_hdr_find_line_id(translate, rec_type, -+ "ID", ks_str(&orig_id), &new_hdr_line) < 0){ -+ goto fail; -+ } -+ -+ if (id_changed) { -+ char *idp = strstr(ks_str(&new_hdr_line), "\tID:"), *id_end; -+ ptrdiff_t id_offset, id_len; -+ if (!idp) { -+ print_error("merge", "failed to find ID in \"%s\"\n", -+ ks_str(&new_hdr_line)); -+ goto fail; -+ } -+ idp += 4; -+ for (id_end = idp; *id_end >= '\n'; id_end++) {} -+ -+ id_offset = idp - new_hdr_line.s; -+ id_len = id_end - idp; - -- if (!id_changed) { // Can just copy -- if (match_to_ks(text, &matches[0], &new_hdr_line)) goto memfail; -- } else { // Substitute new name for original -- if (range_to_ks(text, matches[0].rm_so, matches[1].rm_so, -- &new_hdr_line)) goto memfail; -- if (ks_to_ks(&transformed_id, &new_hdr_line)) goto memfail; -- if (range_to_ks(text, matches[1].rm_eo, matches[0].rm_eo, -- &new_hdr_line)) goto memfail; -+ if (id_len < transformed_id.l) { -+ if (ks_resize(&new_hdr_line, new_hdr_line.l + transformed_id.l - id_len)) -+ goto fail; -+ } -+ if (id_len != transformed_id.l) { -+ memmove(new_hdr_line.s + id_offset + transformed_id.l, -+ new_hdr_line.s + id_offset + id_len, -+ new_hdr_line.l - id_offset - id_len + 1); -+ } -+ memcpy(new_hdr_line.s + id_offset, transformed_id.s, -+ transformed_id.l); - } - - // append line to output linked list -@@ -686,8 +546,6 @@ - int in_there = 0; - iter = kh_put(c2c, id_map, ks_release(&orig_id), &in_there); - kh_value(id_map, iter) = map_value; -- -- text += matches[0].rm_eo; // next! - } - - // If there are no RG lines in the file and we are overriding add one -@@ -724,6 +582,7 @@ - - memfail: - perror(__func__); -+ fail: - if (hdr_lines) kl_destroy(hdrln, hdr_lines); - return NULL; - } -@@ -821,16 +680,18 @@ - * Returns 0 on success, -1 on failure. 
- */ - --static int trans_tbl_init(merged_header_t* merged_hdr, bam_hdr_t* translate, -+static int trans_tbl_init(merged_header_t* merged_hdr, sam_hdr_t* translate, - trans_tbl_t* tbl, bool merge_rg, bool merge_pg, - bool copy_co, char* rg_override) - { -+ kstring_t lines = { 0, 0, NULL }; - klist_t(hdrln) *rg_list = NULL; - klist_t(hdrln) *pg_list = NULL; - -- tbl->n_targets = translate->n_targets; -+ tbl->n_targets = sam_hdr_nref(translate); - tbl->rg_trans = tbl->pg_trans = NULL; -- tbl->tid_trans = (int*)calloc(translate->n_targets, sizeof(int)); -+ tbl->tid_trans = (int*)calloc(tbl->n_targets ? tbl->n_targets : 1, -+ sizeof(int)); - if (tbl->tid_trans == NULL) goto memfail; - tbl->rg_trans = kh_init(c2c); - if (tbl->rg_trans == NULL) goto memfail; -@@ -859,6 +720,7 @@ - goto fail; - - // Fix-up PP: tags in the new @PG records and add to output -+ lines.l = 0; - if (finish_rg_pg(false, pg_list, tbl->pg_trans, &merged_hdr->out_pg)) - goto fail; - -@@ -867,22 +729,22 @@ - - if (copy_co) { - // Just append @CO headers without translation -- const char *line, *end_pointer; -- for (line = translate->text; *line; line = end_pointer + 1) { -- end_pointer = strchr(line, '\n'); -- if (strncmp(line, "@CO", 3) == 0) { -- if (end_pointer) { -- if (kputsn(line, end_pointer - line + 1, &merged_hdr->out_co) == EOF) -- goto memfail; -- } else { // Last line with no trailing '\n' -- if (kputs(line, &merged_hdr->out_co) == EOF) goto memfail; -- if (kputc('\n', &merged_hdr->out_co) == EOF) goto memfail; -- } -- } -- if (end_pointer == NULL) break; -+ int num_co = sam_hdr_count_lines(translate, "CO"), i; -+ if (num_co < 0) -+ goto fail; -+ -+ for (i = 0; i < num_co; i++) { -+ if (sam_hdr_find_line_pos(translate, "CO", i, &lines) < 0) -+ goto fail; -+ if (ks_to_ks(&lines, &merged_hdr->out_co)) -+ goto fail; -+ if (kputc('\n', &merged_hdr->out_co) < 0) -+ goto fail; - } - } - -+ free(lines.s); -+ - return 0; - - memfail: -@@ -891,80 +753,22 @@ - trans_tbl_destroy(tbl); - if 
(rg_list) kl_destroy(hdrln, rg_list); - if (pg_list) kl_destroy(hdrln, pg_list); -+ free(lines.s); - return -1; - } - --static inline void move_kstr_to_text(char **text, kstring_t *ks) { -- memcpy(*text, ks_str(ks), ks_len(ks)); -- *text += ks_len(ks); -- **text = '\0'; -- free(ks_release(ks)); --} -- --/* -- * Populate a bam_hdr_t struct from data in a merged_header_t. -- */ -- --static bam_hdr_t * finish_merged_header(merged_header_t *merged_hdr) { -- size_t txt_sz; -- char *text; -- bam_hdr_t *hdr; -- -- // Check output text size -- txt_sz = (ks_len(&merged_hdr->out_hd) -- + ks_len(&merged_hdr->out_sq) -- + ks_len(&merged_hdr->out_rg) -- + ks_len(&merged_hdr->out_pg) -- + ks_len(&merged_hdr->out_co)); -- if (txt_sz >= INT32_MAX) { -- fprintf(stderr, "[%s] Output header text too long\n", __func__); -- return NULL; -- } -- -- // Allocate new header -- hdr = bam_hdr_init(); -- if (hdr == NULL) goto memfail; -- -- // Transfer targets arrays to new header -- hdr->n_targets = merged_hdr->n_targets; -- if (hdr->n_targets > 0) { -- // Try to shrink targets arrays to correct size -- hdr->target_name = realloc(merged_hdr->target_name, -- hdr->n_targets * sizeof(char*)); -- if (!hdr->target_name) hdr->target_name = merged_hdr->target_name; -- -- hdr->target_len = realloc(merged_hdr->target_len, -- hdr->n_targets * sizeof(uint32_t)); -- if (!hdr->target_len) hdr->target_len = merged_hdr->target_len; -- -- // These have either been freed by realloc() or, in the unlikely -- // event that failed, have had their ownership transferred to hdr -- merged_hdr->target_name = NULL; -- merged_hdr->target_len = NULL; -- } -- else { -- hdr->target_name = NULL; -- hdr->target_len = NULL; -- } -- -- // Allocate text -- text = hdr->text = malloc(txt_sz + 1); -- if (!text) goto memfail; -- -- // Put header text in order @HD, @SQ, @RG, @PG, @CO -- move_kstr_to_text(&text, &merged_hdr->out_hd); -- move_kstr_to_text(&text, &merged_hdr->out_sq); -- move_kstr_to_text(&text, &merged_hdr->out_rg); 
-- move_kstr_to_text(&text, &merged_hdr->out_pg); -- move_kstr_to_text(&text, &merged_hdr->out_co); -- hdr->l_text = txt_sz; -- -- return hdr; -+static int finish_merged_header(merged_header_t *merged_hdr) { -+ if (sam_hdr_add_lines(merged_hdr->hdr, ks_c_str(&merged_hdr->out_rg), -+ ks_len(&merged_hdr->out_rg)) < 0) -+ return -1; -+ if (sam_hdr_add_lines(merged_hdr->hdr, ks_c_str(&merged_hdr->out_pg), -+ ks_len(&merged_hdr->out_pg)) < 0) -+ return -1; -+ if (sam_hdr_add_lines(merged_hdr->hdr, ks_c_str(&merged_hdr->out_co), -+ ks_len(&merged_hdr->out_co)) < 0) -+ return -1; - -- memfail: -- perror(__func__); -- bam_hdr_destroy(hdr); -- return NULL; -+ return 0; - } - - /* -@@ -979,8 +783,6 @@ - size_t i; - khiter_t iter; - if (!merged_hdr) return; -- free(ks_release(&merged_hdr->out_hd)); -- free(ks_release(&merged_hdr->out_sq)); - free(ks_release(&merged_hdr->out_rg)); - free(ks_release(&merged_hdr->out_pg)); - free(ks_release(&merged_hdr->out_co)); -@@ -1147,25 +949,30 @@ - @param cmd command name (used in print_error() etc) - @param in_fmt format options for input files - @param out_fmt output file format and options -+ @param write_index create the index, together with the output file -+ @param arg_list command string for PG line -+ @param no_pg if 1, do not add a new PG line - @discussion Padding information may NOT correctly maintained. This - function is NOT thread safe. 
- */ - int bam_merge_core2(int by_qname, char* sort_tag, const char *out, const char *mode, -- const char *headers, int n, char * const *fn, int flag, -- const char *reg, int n_threads, const char *cmd, -- const htsFormat *in_fmt, const htsFormat *out_fmt) -+ const char *headers, int n, char * const *fn, char * const *fn_idx, -+ int flag, const char *reg, int n_threads, const char *cmd, -+ const htsFormat *in_fmt, const htsFormat *out_fmt, int write_index, -+ char *arg_list, int no_pg) - { - samFile *fpout, **fp = NULL; - heap1_t *heap = NULL; -- bam_hdr_t *hout = NULL; -- bam_hdr_t *hin = NULL; -+ sam_hdr_t *hout = NULL; -+ sam_hdr_t *hin = NULL; - int i, j, *RG_len = NULL; - uint64_t idx = 0; - char **RG = NULL; - hts_itr_t **iter = NULL; -- bam_hdr_t **hdr = NULL; -+ sam_hdr_t **hdr = NULL; - trans_tbl_t *translation_tbl = NULL; - int *rtrans = NULL; -+ char *out_idx_fn = NULL; - merged_header_t *merged_hdr = init_merged_header(); - if (!merged_hdr) return -1; - -@@ -1188,7 +995,7 @@ - if (sort_tag) { - g_is_by_tag = 1; - g_sort_tag[0] = sort_tag[0]; -- g_sort_tag[1] = sort_tag[1]; -+ g_sort_tag[1] = sort_tag[0] ? 
sort_tag[1] : '\0'; - } - - fp = (samFile**)calloc(n, sizeof(samFile*)); -@@ -1197,7 +1004,7 @@ - if (!heap) goto mem_fail; - iter = (hts_itr_t**)calloc(n, sizeof(hts_itr_t*)); - if (!iter) goto mem_fail; -- hdr = (bam_hdr_t**)calloc(n, sizeof(bam_hdr_t*)); -+ hdr = (sam_hdr_t**)calloc(n, sizeof(sam_hdr_t*)); - if (!hdr) goto mem_fail; - translation_tbl = (trans_tbl_t*)calloc(n, sizeof(trans_tbl_t)); - if (!translation_tbl) goto mem_fail; -@@ -1234,7 +1041,7 @@ - - // open and read the header from each file - for (i = 0; i < n; ++i) { -- bam_hdr_t *hin; -+ sam_hdr_t *hin; - fp[i] = sam_open_format(fn[i], "r", in_fmt); - if (fp[i] == NULL) { - print_error_errno(cmd, "fail to open \"%s\"", fn[i]); -@@ -1255,7 +1062,7 @@ - // TODO sam_itr_next() doesn't yet work for SAM files, - // so for those keep the headers around for use with sam_read1() - if (hts_get_format(fp[i])->format == sam) hdr[i] = hin; -- else { bam_hdr_destroy(hin); hdr[i] = NULL; } -+ else { sam_hdr_destroy(hin); hdr[i] = NULL; } - - if ((translation_tbl+i)->lost_coord_sort && !by_qname) { - fprintf(stderr, "[bam_merge_core] Order of targets in file %s caused coordinate sort to be lost\n", fn[i]); -@@ -1284,41 +1091,34 @@ - } - - // Transform the header into standard form -- hout = finish_merged_header(merged_hdr); -+ if (finish_merged_header(merged_hdr) < 0) -+ goto fail; -+ -+ hout = merged_hdr->hdr; - if (!hout) return -1; // FIXME: memory leak - - // If we're only merging a specified region move our iters to start at that point - if (reg) { -- int tid, beg, end; -- const char *name_lim; -+ int tid; -+ hts_pos_t beg, end; - -- rtrans = rtrans_build(n, hout->n_targets, translation_tbl); -+ rtrans = rtrans_build(n, sam_hdr_nref(hout), translation_tbl); - if (!rtrans) goto mem_fail; - -- name_lim = hts_parse_reg(reg, &beg, &end); -- if (name_lim) { -- char *name = malloc(name_lim - reg + 1); -- if (!name) goto mem_fail; -- memcpy(name, reg, name_lim - reg); -- name[name_lim - reg] = '\0'; -- tid = 
bam_name2id(hout, name); -- free(name); -- } -- else { -- // not parsable as a region, but possibly a sequence named "foo:a" -- tid = bam_name2id(hout, reg); -- beg = 0; -- end = INT_MAX; -- } -- if (tid < 0) { -- if (name_lim) fprintf(stderr, "[%s] Region \"%s\" specifies an unknown reference name\n", __func__, reg); -- else fprintf(stderr, "[%s] Badly formatted region: \"%s\"\n", __func__, reg); -+ if (!sam_parse_region(hout, reg, &tid, &beg, &end, 0)) { -+ fprintf(stderr, "[%s] Badly formatted region or unknown reference name: \"%s\"\n", __func__, reg); - goto fail; - } - for (i = 0; i < n; ++i) { -- hts_idx_t *idx = sam_index_load(fp[i], fn[i]); -+ hts_idx_t *idx = NULL; -+ // If index filename has not been specfied, look in BAM folder -+ if (fn_idx != NULL) { -+ idx = sam_index_load2(fp[i], fn[i], fn_idx[i]); -+ } else { -+ idx = sam_index_load(fp[i], fn[i]); -+ } - // (rtrans[i*n+tid]) Look up what hout tid translates to in input tid space -- int mapped_tid = rtrans[i*hout->n_targets+tid]; -+ int mapped_tid = rtrans[i*sam_hdr_nref(hout)+tid]; - if (idx == NULL) { - fprintf(stderr, "[%s] failed to load index for %s. Random alignment retrieval only works for indexed BAM or CRAM files.\n", - __func__, fn[i]); -@@ -1334,7 +1134,7 @@ - if (mapped_tid != INT32_MIN) { - fprintf(stderr, - "[%s] failed to get iterator over " -- "{%s, %d, %d, %d}\n", -+ "{%s, %d, %"PRIhts_pos", %"PRIhts_pos"}\n", - __func__, fn[i], mapped_tid, beg, end); - } else { - fprintf(stderr, -@@ -1371,7 +1171,8 @@ - res = iter[i] ? 
sam_itr_next(fp[i], iter[i], h->entry.bam_record) : sam_read1(fp[i], hdr[i], h->entry.bam_record); - if (res >= 0) { - bam_translate(h->entry.bam_record, translation_tbl + i); -- h->pos = ((uint64_t)h->entry.bam_record->core.tid<<32) | (uint32_t)((int32_t)h->entry.bam_record->core.pos+1); -+ h->tid = h->entry.bam_record->core.tid; -+ h->pos = (uint64_t)(h->entry.bam_record->core.pos + 1); - h->rev = bam_is_rev(h->entry.bam_record); - h->idx = idx++; - if (g_is_by_tag) { -@@ -1396,11 +1197,26 @@ - print_error_errno(cmd, "failed to create \"%s\"", out); - return -1; - } -+ if (!no_pg && sam_hdr_add_pg(hout, "samtools", -+ "VN", samtools_version(), -+ arg_list ? "CL": NULL, -+ arg_list ? arg_list : NULL, -+ NULL)) { -+ print_error(cmd, "failed to add PG line to the header of \"%s\"", out); -+ sam_close(fpout); -+ return -1; -+ } - if (sam_hdr_write(fpout, hout) != 0) { - print_error_errno(cmd, "failed to write header to \"%s\"", out); - sam_close(fpout); - return -1; - } -+ if (write_index) { -+ if (!(out_idx_fn = auto_index(fpout, out, hout))){ -+ sam_close(fpout); -+ return -1; -+ } -+ } - if (!(flag & MERGE_UNCOMP)) hts_set_threads(fpout, n_threads); - - // Begin the actual merge -@@ -1415,11 +1231,13 @@ - if (sam_write1(fpout, hout, b) < 0) { - print_error_errno(cmd, "failed writing to \"%s\"", out); - sam_close(fpout); -+ free(out_idx_fn); - return -1; - } - if ((j = (iter[heap->i]? 
sam_itr_next(fp[heap->i], iter[heap->i], b) : sam_read1(fp[heap->i], hdr[heap->i], b))) >= 0) { - bam_translate(b, translation_tbl + heap->i); -- heap->pos = ((uint64_t)b->core.tid<<32) | (uint32_t)((int)b->core.pos+1); -+ heap->tid = b->core.tid; -+ heap->pos = (uint64_t)(b->core.pos + 1); - heap->rev = bam_is_rev(b); - heap->idx = idx++; - if (g_is_by_tag) { -@@ -1439,6 +1257,14 @@ - ks_heapadjust(heap, 0, n, heap); - } - -+ if (write_index) { -+ if (sam_idx_save(fpout) < 0) { -+ print_error_errno("merge", "writing index failed"); -+ goto fail; -+ } -+ } -+ free(out_idx_fn); -+ - // Clean up and close - if (flag & MERGE_RG) { - for (i = 0; i != n; ++i) free(RG[i]); -@@ -1447,11 +1273,11 @@ - for (i = 0; i < n; ++i) { - trans_tbl_destroy(translation_tbl + i); - hts_itr_destroy(iter[i]); -- bam_hdr_destroy(hdr[i]); -+ sam_hdr_destroy(hdr[i]); - sam_close(fp[i]); - } -- bam_hdr_destroy(hin); -- bam_hdr_destroy(hout); -+ sam_hdr_destroy(hin); -+ sam_hdr_destroy(hout); - free_merged_header(merged_hdr); - free(RG); free(translation_tbl); free(fp); free(heap); free(iter); free(hdr); - if (sam_close(fpout) < 0) { -@@ -1473,11 +1299,11 @@ - for (i = 0; i < n; ++i) { - if (translation_tbl && translation_tbl[i].tid_trans) trans_tbl_destroy(translation_tbl + i); - if (iter && iter[i]) hts_itr_destroy(iter[i]); -- if (hdr && hdr[i]) bam_hdr_destroy(hdr[i]); -+ if (hdr && hdr[i]) sam_hdr_destroy(hdr[i]); - if (fp && fp[i]) sam_close(fp[i]); - if (heap && heap[i].entry.bam_record) bam_destroy1(heap[i].entry.bam_record); - } -- if (hout) bam_hdr_destroy(hout); -+ if (hout) sam_hdr_destroy(hout); - free(RG); - free(translation_tbl); - free(hdr); -@@ -1485,6 +1311,7 @@ - free(heap); - free(fp); - free(rtrans); -+ free(out_idx_fn); - return -1; - } - -@@ -1495,7 +1322,7 @@ - strcpy(mode, "wb"); - if (flag & MERGE_UNCOMP) strcat(mode, "0"); - else if (flag & MERGE_LEVEL1) strcat(mode, "1"); -- return bam_merge_core2(by_qname, NULL, out, mode, headers, n, fn, flag, reg, 0, "merge", 
NULL, NULL); -+ return bam_merge_core2(by_qname, NULL, out, mode, headers, n, fn, NULL, flag, reg, 0, "merge", NULL, NULL, 0, NULL, 1); - } - - static void merge_usage(FILE *to) -@@ -1516,23 +1343,27 @@ - " -c Combine @RG headers with colliding IDs [alter IDs to be distinct]\n" - " -p Combine @PG headers with colliding IDs [alter IDs to be distinct]\n" - " -s VALUE Override random seed\n" --" -b FILE List of input BAM filenames, one per line [null]\n"); -- sam_global_opt_help(to, "-.O..@"); -+" -b FILE List of input BAM filenames, one per line [null]\n" -+" -X Use customized index files\n" -+" --no-PG do not add a PG line\n"); -+ sam_global_opt_help(to, "-.O..@.."); - } - - int bam_merge(int argc, char *argv[]) - { -- int c, is_by_qname = 0, flag = 0, ret = 0, level = -1; -+ int c, is_by_qname = 0, flag = 0, ret = 0, level = -1, has_index_file = 0; - char *fn_headers = NULL, *reg = NULL, mode[12]; -- char *sort_tag = NULL; -+ char *sort_tag = NULL, *arg_list = NULL; - long random_seed = (long)time(NULL); - char** fn = NULL; -- int fn_size = 0; -+ char** fn_idx = NULL; -+ int fn_size = 0, no_pg = 0; - - sam_global_args ga = SAM_GLOBAL_ARGS_INIT; - static const struct option lopts[] = { - SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0, '@'), - { "threads", required_argument, NULL, '@' }, -+ {"no-PG", no_argument, NULL, 1}, - { NULL, 0, NULL, 0 } - }; - -@@ -1541,13 +1372,13 @@ - return 0; - } - -- while ((c = getopt_long(argc, argv, "h:nru1R:f@:l:cps:b:O:t:", lopts, NULL)) >= 0) { -+ while ((c = getopt_long(argc, argv, "h:nru1R:f@:l:cps:b:O:t:X", lopts, NULL)) >= 0) { - switch (c) { - case 'r': flag |= MERGE_RG; break; - case 'f': flag |= MERGE_FORCE; break; -- case 'h': fn_headers = strdup(optarg); break; -+ case 'h': fn_headers = optarg; break; - case 'n': is_by_qname = 1; break; -- case 't': sort_tag = strdup(optarg); break; -+ case 't': sort_tag = optarg; break; - case '1': flag |= MERGE_LEVEL1; level = 1; break; - case 'u': flag |= MERGE_UNCOMP; level = 0; break; - 
case 'R': reg = strdup(optarg); break; -@@ -1555,8 +1386,13 @@ - case 'c': flag |= MERGE_COMBINE_RG; break; - case 'p': flag |= MERGE_COMBINE_PG; break; - case 's': random_seed = atol(optarg); break; -+ case 'X': has_index_file = 1; break; // -X flag for index filename - case 'b': { - // load the list of files to read -+ if (has_index_file) { -+ fprintf(stderr,"Error: The -b option cannot be combined with -X\n"); -+ ret = 1; goto end; -+ } - int nfiles; - char **fn_read = hts_readlines(optarg, &nfiles); - if (fn_read) { -@@ -1573,7 +1409,7 @@ - } - break; - } -- -+ case 1: no_pg = 1; break; - default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; - /* else fall-through */ - case '?': merge_usage(stderr); return 1; -@@ -1585,6 +1421,11 @@ - return 1; - } - -+ if (!no_pg && !(arg_list = stringify_argv(argc+1, argv-1))) { -+ print_error("merge", "failed to create arg_list"); -+ return 1; -+ } -+ - srand48(random_seed); - if (!(flag & MERGE_FORCE) && strcmp(argv[optind], "-")) { - FILE *fp = fopen(argv[optind], "rb"); -@@ -1595,24 +1436,41 @@ - } - } - -- int nargcfiles = argc - (optind+1); -+ int nargcfiles = 0; -+ if (has_index_file) { // Calculate # of input BAM files -+ if ((argc - optind - 1) % 2 != 0) { -+ fprintf(stderr, "Odd number of filenames detected! 
Each BAM file should have an index file\n"); -+ return 1; -+ } -+ nargcfiles = (argc - optind - 1) / 2; -+ } else { -+ nargcfiles = argc - optind - 1; -+ } -+ - if (nargcfiles > 0) { - // Add argc files to end of array - fn = realloc(fn, (fn_size+nargcfiles) * sizeof(char*)); - if (fn == NULL) { ret = 1; goto end; } - memcpy(fn+fn_size, argv + (optind+1), nargcfiles * sizeof(char*)); -+ -+ if(has_index_file) { -+ fn_idx = realloc(fn_idx, nargcfiles * sizeof(char*)); -+ if (fn_idx == NULL) { ret = 1; goto end; } -+ memcpy(fn_idx+fn_size, argv + nargcfiles + (optind+1), nargcfiles * sizeof(char*)); -+ } - } - if (fn_size+nargcfiles < 1) { - print_error("merge", "You must specify at least one (and usually two or more) input files"); - merge_usage(stderr); -+ free(fn_idx); - return 1; - } - strcpy(mode, "wb"); - sam_open_mode(mode+1, argv[optind], NULL); - if (level >= 0) sprintf(strchr(mode, '\0'), "%d", level < 9? level : 9); - if (bam_merge_core2(is_by_qname, sort_tag, argv[optind], mode, fn_headers, -- fn_size+nargcfiles, fn, flag, reg, ga.nthreads, -- "merge", &ga.in, &ga.out) < 0) -+ fn_size+nargcfiles, fn, fn_idx, flag, reg, ga.nthreads, -+ "merge", &ga.in, &ga.out, ga.write_index, arg_list, no_pg) < 0) - ret = 1; - - end: -@@ -1621,8 +1479,9 @@ - for (i=0; ii, res; - if (i < nfiles) { // read from file - res = sam_read1(fp[i], hout, heap->entry.bam_record); -@@ -1655,8 +1514,8 @@ - } - } - if (res >= 0) { -- heap->pos = (((uint64_t)heap->entry.bam_record->core.tid<<32) -- | (uint32_t)((int32_t)heap->entry.bam_record->core.pos+1)); -+ heap->tid = heap->entry.bam_record->core.tid; -+ heap->pos = (uint64_t)(heap->entry.bam_record->core.pos + 1); - heap->rev = bam_is_rev(heap->entry.bam_record); - heap->idx = (*idx)++; - if (g_is_by_tag) { -@@ -1676,21 +1535,23 @@ - } - - static int bam_merge_simple(int by_qname, char *sort_tag, const char *out, -- const char *mode, bam_hdr_t *hout, -+ const char *mode, sam_hdr_t *hout, - int n, char * const *fn, int num_in_mem, - 
buf_region *in_mem, bam1_tag *buf, int n_threads, - const char *cmd, const htsFormat *in_fmt, -- const htsFormat *out_fmt) { -+ const htsFormat *out_fmt, char *arg_list, int no_pg, -+ int write_index) { - samFile *fpout = NULL, **fp = NULL; - heap1_t *heap = NULL; - uint64_t idx = 0; - int i, heap_size = n + num_in_mem; -+ char *out_idx_fn = NULL; - - g_is_by_qname = by_qname; - if (sort_tag) { - g_is_by_tag = 1; - g_sort_tag[0] = sort_tag[0]; -- g_sort_tag[1] = sort_tag[1]; -+ g_sort_tag[1] = sort_tag[0] ? sort_tag[1] : '\0'; - } - if (n > 0) { - fp = (samFile**)calloc(n, sizeof(samFile*)); -@@ -1701,7 +1562,7 @@ - - // Open each file, read the header and put the first read into the heap - for (i = 0; i < heap_size; i++) { -- bam_hdr_t *hin; -+ sam_hdr_t *hin; - heap1_t *h = &heap[i]; - - if (i < n) { -@@ -1718,7 +1579,7 @@ - goto fail; - } - // ... and throw it away as we don't really need it -- bam_hdr_destroy(hin); -+ sam_hdr_destroy(hin); - } - - // Get a read into the heap -@@ -1741,6 +1602,16 @@ - return -1; - } - -+ if (!no_pg && sam_hdr_add_pg(hout, "samtools", -+ "VN", samtools_version(), -+ arg_list ? "CL": NULL, -+ arg_list ? 
arg_list : NULL, -+ NULL)) { -+ print_error(cmd, "failed to add PG line to the header of \"%s\"", out); -+ sam_close(fpout); -+ return -1; -+ } -+ - if (n_threads > 1) hts_set_threads(fpout, n_threads); - - if (sam_hdr_write(fpout, hout) != 0) { -@@ -1749,14 +1620,20 @@ - return -1; - } - -+ if (write_index) { -+ if (!(out_idx_fn = auto_index(fpout, out, hout))){ -+ sam_close(fpout); -+ return -1; -+ } -+ } -+ - // Now do the merge - ks_heapmake(heap, heap_size, heap); - while (heap->pos != HEAP_EMPTY) { - bam1_t *b = heap->entry.bam_record; - if (sam_write1(fpout, hout, b) < 0) { - print_error_errno(cmd, "failed writing to \"%s\"", out); -- sam_close(fpout); -- return -1; -+ goto fail; - } - if (heap_add_read(heap, n, fp, num_in_mem, in_mem, buf, &idx, hout) < 0) { - assert(heap->i < n); -@@ -1775,6 +1652,15 @@ - } - free(fp); - free(heap); -+ -+ if (write_index) { -+ if (sam_idx_save(fpout) < 0) { -+ print_error_errno("merge", "writing index failed"); -+ goto fail; -+ } -+ free(out_idx_fn); -+ } -+ - if (sam_close(fpout) < 0) { - print_error(cmd, "error closing output file"); - return -1; -@@ -1786,11 +1672,15 @@ - fail: - for (i = 0; i < n; i++) { - if (fp && fp[i]) sam_close(fp[i]); -- if (heap && heap[i].entry.bam_record) bam_destroy1(heap[i].entry.bam_record); -+ } -+ for (i = 0; i < heap_size; i++) { -+ if (heap && heap[i].i < n && heap[i].entry.bam_record) -+ bam_destroy1(heap[i].entry.bam_record); - } - free(fp); - free(heap); - if (fpout) sam_close(fpout); -+ free(out_idx_fn); - return -1; - } - -@@ -1811,8 +1701,13 @@ - if (t != 0) return t; - return (int) (a.bam_record->core.flag&0xc0) - (int) (b.bam_record->core.flag&0xc0); - } else { -- pa = (uint64_t)a.bam_record->core.tid<<32|(a.bam_record->core.pos+1); -- pb = (uint64_t)b.bam_record->core.tid<<32|(b.bam_record->core.pos+1); -+ pa = a.bam_record->core.tid; -+ pb = b.bam_record->core.tid; -+ -+ if (pa == pb) { -+ pa = (uint64_t)(a.bam_record->core.pos+1); -+ pb = (uint64_t)(b.bam_record->core.pos+1); 
-+ } - - if (pa == pb) { - pa = bam_is_rev(a.bam_record); -@@ -1913,7 +1808,7 @@ - size_t buf_len; - const char *prefix; - bam1_tag *buf; -- const bam_hdr_t *h; -+ const sam_hdr_t *h; - int index; - int error; - int no_save; -@@ -1921,45 +1816,99 @@ - - // Returns 0 for success - // -1 for failure --static int write_buffer(const char *fn, const char *mode, size_t l, bam1_tag *buf, const bam_hdr_t *h, int n_threads, const htsFormat *fmt) -+static int write_buffer(const char *fn, const char *mode, size_t l, bam1_tag *buf, -+ const sam_hdr_t *h, int n_threads, const htsFormat *fmt, -+ char *arg_list, int no_pg, int write_index) - { - size_t i; - samFile* fp; -+ char *out_idx_fn = NULL; -+ - fp = sam_open_format(fn, mode, fmt); - if (fp == NULL) return -1; -- if (sam_hdr_write(fp, h) != 0) goto fail; -+ if (!no_pg && sam_hdr_add_pg((sam_hdr_t *)h, "samtools", -+ "VN", samtools_version(), -+ arg_list ? "CL": NULL, -+ arg_list ? arg_list : NULL, -+ NULL)) { -+ goto fail; -+ } -+ if (sam_hdr_write(fp, (sam_hdr_t *)h) != 0) goto fail; -+ -+ if (write_index) { -+ if (!(out_idx_fn = auto_index(fp, fn, (sam_hdr_t *)h))) goto fail; -+ } -+ - if (n_threads > 1) hts_set_threads(fp, n_threads); - for (i = 0; i < l; ++i) { -- if (sam_write1(fp, h, buf[i].bam_record) < 0) goto fail; -+ if (sam_write1(fp, (sam_hdr_t *)h, buf[i].bam_record) < 0) goto fail; - } -+ -+ if (write_index) { -+ if (sam_idx_save(fp) < 0) { -+ print_error_errno("merge", "writing index failed"); -+ goto fail; -+ } -+ free(out_idx_fn); -+ } -+ -+ - if (sam_close(fp) < 0) return -1; - return 0; - fail: - sam_close(fp); -+ free(out_idx_fn); - return -1; - } - - #define NUMBASE 256 --#define STEP 8 - --static int ks_radixsort(size_t n, bam1_tag *buf, const bam_hdr_t *h) -+static int ks_radixsort(size_t n, bam1_tag *buf, const sam_hdr_t *h) - { - int curr = 0, ret = -1; - ssize_t i; - bam1_tag *buf_ar2[2], *bam_a, *bam_b; -- uint64_t max_pos = 0, max_digit = 0, shift = 0; -- -+ uint64_t max_pos = 1; -+ uint32_t 
max_tid = 1, tid_bytes = 0, pos_bytes = 0, byte = 0; -+ uint32_t tid_shift_l, tid_shift_r; -+ int nref = sam_hdr_nref(h); -+ -+ // Count number of bytes needed for biggest tid and pos -+ // Notes: Add 1 to core.pos so always positive. -+ // Convert unmapped tid (-1) to number of references so unmapped -+ // sort to the end. - for (i = 0; i < n; i++) { - bam1_t *b = buf[i].bam_record; -- int32_t tid = b->core.tid == -1 ? h->n_targets : b->core.tid; -- buf[i].u.pos = (uint64_t)tid<<32 | (b->core.pos+1)<<1 | bam_is_rev(b); -- if (max_pos < buf[i].u.pos) -- max_pos = buf[i].u.pos; -- } -- -- while (max_pos) { -- ++max_digit; -- max_pos = max_pos >> 1; -+ uint32_t tid = b->core.tid == -1 ? nref : b->core.tid; -+ uint64_t pos = ((uint64_t)(b->core.pos + 1) << 1) | bam_is_rev(b); -+ if (max_tid < tid) -+ max_tid = tid; -+ if (max_pos < pos) -+ max_pos = pos; -+ } -+ -+ for (; max_pos > 0; max_pos >>= 8) pos_bytes++; -+ for (; max_tid > 0; max_tid >>= 8) tid_bytes++; -+ assert(pos_bytes + tid_bytes < sizeof(buf[0].u.pos_tid)); -+ -+ tid_shift_l = pos_bytes * 8; -+ tid_shift_r = 64 - tid_shift_l; -+ -+ // Write position and tid into bam1_tag::u::pos_tid using minimum number -+ // of bytes required. Values are stored little-endian so that we -+ // get a least-significant digit (byte) radix sort. -+ for (i = 0; i < n; i++) { -+ bam1_t *b = buf[i].bam_record; -+ uint32_t tid = b->core.tid == -1 ? nref : b->core.tid; -+ // 'pos' here includes as many bytes of tid as will fit -+ // in the space remaining above pos_bytes. The rest of tid -+ // is written out separately. -+ uint64_t pos = (bam_is_rev(b) | -+ ((uint64_t)(b->core.pos + 1) << 1) | -+ (tid_shift_l < 64 ? (uint64_t) tid << tid_shift_l : 0)); -+ u64_to_le(pos, buf[i].u.pos_tid); -+ u32_to_le(tid_shift_r < 32 ? 
tid >> tid_shift_r : 0, -+ &buf[i].u.pos_tid[8]); - } - - buf_ar2[0] = buf; -@@ -1969,18 +1918,18 @@ - goto err; - } - -- while (shift < max_digit){ -+ // Least-significant digit radix sort (where "digits" are bytes) -+ for (byte = 0; byte < pos_bytes + tid_bytes; byte++) { - size_t remainders[NUMBASE] = { 0 }; - bam_a = buf_ar2[curr]; bam_b = buf_ar2[1-curr]; - for (i = 0; i < n; ++i) -- remainders[(bam_a[i].u.pos >> shift) % NUMBASE]++; -+ remainders[bam_a[i].u.pos_tid[byte]]++; - for (i = 1; i < NUMBASE; ++i) - remainders[i] += remainders[i - 1]; - for (i = n - 1; i >= 0; i--) { -- size_t j = --remainders[(bam_a[i].u.pos >> shift) % NUMBASE]; -+ size_t j = --remainders[bam_a[i].u.pos_tid[byte]]; - bam_b[j] = bam_a[i]; - } -- shift += STEP; - curr = 1 - curr; - } - if (curr == 1) { -@@ -2034,10 +1983,10 @@ - return 0; - } - -- if (write_buffer(name, "wcx1", w->buf_len, w->buf, w->h, 0, &fmt) < 0) -+ if (write_buffer(name, "wcx1", w->buf_len, w->buf, w->h, 0, &fmt, NULL, 1, 0) < 0) - w->error = errno; - } else { -- if (write_buffer(name, "wbx1", w->buf_len, w->buf, w->h, 0, NULL) < 0) -+ if (write_buffer(name, "wbx1", w->buf_len, w->buf, w->h, 0, NULL, NULL, 1, 0) < 0) - w->error = errno; - } - -@@ -2046,7 +1995,7 @@ - } - - static int sort_blocks(int n_files, size_t k, bam1_tag *buf, const char *prefix, -- const bam_hdr_t *h, int n_threads, buf_region *in_mem) -+ const sam_hdr_t *h, int n_threads, buf_region *in_mem) - { - int i; - size_t pos, rest; -@@ -2107,6 +2056,9 @@ - @param max_mem approxiate maximum memory (very inaccurate) - @param in_fmt input file format options - @param out_fmt output file format and options -+ @param arg_list command string for PG line -+ @param no_pg if 1, do not add a new PG line -+ @paran write_index create index for the output file - @return 0 for successful sorting, negative on errors - - @discussion It may create multiple temporary subalignment files -@@ -2116,11 +2068,12 @@ - int bam_sort_core_ext(int is_by_qname, char* 
sort_by_tag, const char *fn, const char *prefix, - const char *fnout, const char *modeout, - size_t _max_mem, int n_threads, -- const htsFormat *in_fmt, const htsFormat *out_fmt) -+ const htsFormat *in_fmt, const htsFormat *out_fmt, -+ char *arg_list, int no_pg, int write_index) - { - int ret = -1, res, i, n_files = 0; - size_t max_k, k, max_mem, bam_mem_offset; -- bam_hdr_t *header = NULL; -+ sam_hdr_t *header = NULL; - samFile *fp; - bam1_tag *buf = NULL; - bam1_t *b = bam_init1(); -@@ -2139,7 +2092,8 @@ - g_is_by_qname = is_by_qname; - if (sort_by_tag) { - g_is_by_tag = 1; -- strncpy(g_sort_tag, sort_by_tag, 2); -+ g_sort_tag[0] = sort_by_tag[0]; -+ g_sort_tag[1] = sort_by_tag[0] ? sort_by_tag[1] : '\0'; - } - - max_mem = _max_mem * n_threads; -@@ -2162,14 +2116,15 @@ - else - new_so = "coordinate"; - -- if (sam_hdr_change_HD(header, "SO", new_so) != 0) { -- print_error("sort", -- "failed to change sort order header to '%s'\n", new_so); -+ if ((-1 == sam_hdr_update_hd(header, "SO", new_so)) -+ && (-1 == sam_hdr_add_line(header, "HD", "VN", SAM_FORMAT_VERSION, "SO", new_so, NULL)) -+ ) { -+ print_error("sort", "failed to change sort order header to '%s'\n", new_so); - goto err; - } -- if (sam_hdr_change_HD(header, "GO", NULL) != 0) { -- print_error("sort", -- "failed to delete group order header\n"); -+ -+ if (-1 == sam_hdr_remove_tag_hd(header, "GO")) { -+ print_error("sort", "failed to delete group order header\n"); - goto err; - } - -@@ -2252,7 +2207,7 @@ - - // write the final output - if (n_files == 0 && num_in_mem < 2) { // a single block -- if (write_buffer(fnout, modeout, k, buf, header, n_threads, out_fmt) != 0) { -+ if (write_buffer(fnout, modeout, k, buf, header, n_threads, out_fmt, arg_list, no_pg, write_index) != 0) { - print_error_errno("sort", "failed to create \"%s\"", fnout); - goto err; - } -@@ -2269,7 +2224,8 @@ - } - if (bam_merge_simple(is_by_qname, sort_by_tag, fnout, modeout, header, - n_files, fns, num_in_mem, in_mem, buf, -- n_threads, 
"sort", in_fmt, out_fmt) < 0) { -+ n_threads, "sort", in_fmt, out_fmt, arg_list, -+ no_pg, write_index) < 0) { - // Propagate bam_merge_simple() failure; it has already emitted a - // message explaining the failure, so no further message is needed. - goto err; -@@ -2293,7 +2249,7 @@ - free(buf); - free(bam_mem); - free(in_mem); -- bam_hdr_destroy(header); -+ sam_hdr_destroy(header); - if (fp) sam_close(fp); - return ret; - } -@@ -2305,7 +2261,7 @@ - char *fnout = calloc(strlen(prefix) + 4 + 1, 1); - if (!fnout) return -1; - sprintf(fnout, "%s.bam", prefix); -- ret = bam_sort_core_ext(is_by_qname, NULL, fn, prefix, fnout, "wb", max_mem, 0, NULL, NULL); -+ ret = bam_sort_core_ext(is_by_qname, NULL, fn, prefix, fnout, "wb", max_mem, 0, NULL, NULL, NULL, 1, 0); - free(fnout); - return ret; - } -@@ -2320,8 +2276,9 @@ - " -n Sort by read name\n" - " -t TAG Sort by value of TAG. Uses position as secondary index (or read name if -n is set)\n" - " -o FILE Write final output to FILE rather than standard output\n" --" -T PREFIX Write temporary files to PREFIX.nnnn.bam\n"); -- sam_global_opt_help(fp, "-.O..@"); -+" -T PREFIX Write temporary files to PREFIX.nnnn.bam\n" -+" --no-PG do not add a PG line\n"); -+ sam_global_opt_help(fp, "-.O..@-."); - } - - static void complain_about_memory_setting(size_t max_mem) { -@@ -2344,8 +2301,8 @@ - int bam_sort(int argc, char *argv[]) - { - size_t max_mem = SORT_DEFAULT_MEGS_PER_THREAD << 20; -- int c, nargs, is_by_qname = 0, ret, o_seen = 0, level = -1; -- char* sort_tag = NULL; -+ int c, nargs, is_by_qname = 0, ret, o_seen = 0, level = -1, no_pg = 0; -+ char* sort_tag = NULL, *arg_list = NULL; - char *fnout = "-", modeout[12]; - kstring_t tmpprefix = { 0, 0, NULL }; - struct stat st; -@@ -2354,6 +2311,7 @@ - static const struct option lopts[] = { - SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0, '@'), - { "threads", required_argument, NULL, '@' }, -+ {"no-PG", no_argument, NULL, 1}, - { NULL, 0, NULL, 0 } - }; - -@@ -2361,7 +2319,7 @@ - switch 
(c) { - case 'o': fnout = optarg; o_seen = 1; break; - case 'n': is_by_qname = 1; break; -- case 't': sort_tag = strdup(optarg); break; -+ case 't': sort_tag = optarg; break; - case 'm': { - char *q; - max_mem = strtol(optarg, &q, 0); -@@ -2372,6 +2330,7 @@ - } - case 'T': kputs(optarg, &tmpprefix); break; - case 'l': level = atoi(optarg); break; -+ case 1: no_pg = 1; break; - - default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; - /* else fall-through */ -@@ -2395,6 +2354,16 @@ - goto sort_end; - } - -+ if (ga.write_index && (is_by_qname || sort_tag)) { -+ fprintf(stderr, "[W::bam_sort] Ignoring --write-index as it only works for position sorted files.\n"); -+ ga.write_index = 0; -+ } -+ -+ if (!no_pg && !(arg_list = stringify_argv(argc+1, argv-1))) { -+ print_error("sort", "failed to create arg_list"); -+ return 1; -+ } -+ - if (max_mem < (SORT_MIN_MEGS_PER_THREAD << 20)) { - complain_about_memory_setting(max_mem); - ret = EXIT_FAILURE; -@@ -2417,7 +2386,7 @@ - - ret = bam_sort_core_ext(is_by_qname, sort_tag, (nargs > 0)? argv[optind] : "-", - tmpprefix.s, fnout, modeout, max_mem, ga.nthreads, -- &ga.in, &ga.out); -+ &ga.in, &ga.out, arg_list, no_pg, ga.write_index); - if (ret >= 0) - ret = EXIT_SUCCESS; - else { -@@ -2432,6 +2401,7 @@ - - sort_end: - free(tmpprefix.s); -+ free(arg_list); - sam_global_args_free(&ga); - - return ret; ---- python-pysam.orig/samtools/bam_sort.c.pysam.c -+++ python-pysam/samtools/bam_sort.c.pysam.c -@@ -2,7 +2,7 @@ - - /* bam_sort.c -- sorting and merging. - -- Copyright (C) 2008-2016 Genome Research Ltd. -+ Copyright (C) 2008-2019 Genome Research Ltd. - Portions copyright (C) 2009-2012 Broad Institute. 
- - Author: Heng Li -@@ -46,6 +46,7 @@ - #include "htslib/klist.h" - #include "htslib/kstring.h" - #include "htslib/sam.h" -+#include "htslib/hts_endian.h" - #include "sam_opts.h" - #include "samtools.h" - -@@ -57,7 +58,7 @@ - bam1_t *bam_record; - union { - const uint8_t *tag; -- uint64_t pos; -+ uint8_t pos_tid[12]; - } u; - } bam1_tag; - -@@ -124,12 +125,12 @@ - return *pa? 1 : *pb? -1 : 0; - } - --#define HEAP_EMPTY UINT64_MAX -+#define HEAP_EMPTY (UINT64_MAX >> 1) - - typedef struct { - int i; -- uint32_t rev; -- uint64_t pos, idx; -+ uint32_t tid; -+ uint64_t pos:63, rev:1, idx; - bam1_tag entry; - } heap1_t; - -@@ -155,6 +156,7 @@ - fb = b.entry.bam_record->core.flag & 0xc0; - if (fa != fb) return fa > fb; - } else { -+ if (a.tid != b.tid) return a.tid > b.tid; - if (a.pos != b.pos) return a.pos > b.pos; - if (a.rev != b.rev) return a.rev > b.rev; - } -@@ -166,8 +168,7 @@ - KSORT_INIT(heap, heap1_t, heap_lt) - - typedef struct merged_header { -- kstring_t out_hd; -- kstring_t out_sq; -+ sam_hdr_t *hdr; - kstring_t out_rg; - kstring_t out_pg; - kstring_t out_co; -@@ -189,80 +190,6 @@ - bool lost_coord_sort; - } trans_tbl_t; - --/* Something to look like a regmatch_t */ --typedef struct hdr_match { -- ptrdiff_t rm_so; -- ptrdiff_t rm_eo; --} hdr_match_t; -- --/* -- * Search for header lines of a particular record type. -- * -- * This replaces a regex search for something like /^@SQ.*\tSN:([^\t]+).*$/ -- * but is much quicker. The locations found are returned in *matches, -- * which has a signature the same as that of a regmatch_t. -- * -- * rec is the record type to match (i.e. @HD, @SQ, @PG or @RG) -- * tag is a tag type in the record to match (SN for @SQ, ID for @PG or @RG) -- * -- * The location of the record (if found) is returned in matches[0] -- * If tag is not NULL, the record is searched for the presence of the -- * given tag. If found, the location of the value is returned in matches[1]. 
-- * If the tag isn't found then the record is ignored and the search resumes -- * on the next header line. -- * -- * For simplicity, some assumptions are made about rec and tag: -- * rec should include the leading '@' sign and be three characters long. -- * tag should be exactly two characters long. -- * These are always string constants when this is called below, so we don't -- * bother to check here. -- * -- * Returns 0 if a match was found, -1 if not. -- */ -- -- --static int hdr_line_match(const char *text, const char *rec, -- const char *tag, hdr_match_t *matches) { -- const char *line_start, *line_end = text; -- const char *tag_start, *tag_end; -- -- for (;;) { -- // Find record, ensure either at start of text or follows '\n' -- line_start = strstr(line_end, rec); -- while (line_start && line_start > text && *(line_start - 1) != '\n') { -- line_start = strstr(line_start + 3, rec); -- } -- if (!line_start) return -1; -- -- // Find end of header line -- line_end = strchr(line_start, '\n'); -- if (!line_end) line_end = line_start + strlen(line_start); -- -- matches[0].rm_so = line_start - text; -- matches[0].rm_eo = line_end - text; -- if (!tag) return 0; // Match found if not looking for tag. -- -- for (tag_start = line_start + 3; tag_start < line_end; tag_start++) { -- // Find possible tag start. Hacky but quick. -- while (*tag_start > '\n') tag_start++; -- -- // Check it -- if (tag_start[0] == '\t' -- && strncmp(tag_start + 1, tag, 2) == 0 -- && tag_start[3] == ':') { -- // Found tag, record location and return. -- tag_end = tag_start + 4; -- while (*tag_end && *tag_end != '\t' && *tag_end != '\n') -- ++tag_end; -- matches[1].rm_so = tag_start - text + 4; -- matches[1].rm_eo = tag_end - text; -- return 0; -- } -- } -- // Couldn't find tag, try again from end of current record. 
-- } --} -- - static void trans_tbl_destroy(trans_tbl_t *tbl) { - khiter_t iter; - -@@ -301,6 +228,9 @@ - merged_hdr = calloc(1, sizeof(*merged_hdr)); - if (merged_hdr == NULL) return NULL; - -+ merged_hdr->hdr = sam_hdr_init(); -+ if (!merged_hdr->hdr) goto fail; -+ - merged_hdr->targets_sz = 16; - merged_hdr->target_name = malloc(merged_hdr->targets_sz - * sizeof(*merged_hdr->target_name)); -@@ -328,6 +258,7 @@ - kh_destroy(c2i, merged_hdr->sq_tids); - free(merged_hdr->target_name); - free(merged_hdr->target_len); -+ sam_hdr_destroy(merged_hdr->hdr); - free(merged_hdr); - return NULL; - } -@@ -340,12 +271,6 @@ - return kputsn(src + from, to - from, dest) != to - from; - } - --// Append a header line match to kstring --static inline int match_to_ks(const char *src, const hdr_match_t *match, -- kstring_t *dest) { -- return range_to_ks(src, match->rm_so, match->rm_eo, dest); --} -- - // Append a kstring to a kstring - static inline int ks_to_ks(kstring_t *src, kstring_t *dest) { - return kputsn(ks_str(src), ks_len(src), dest) != ks_len(src); -@@ -387,48 +312,32 @@ - */ - - static int trans_tbl_add_hd(merged_header_t* merged_hdr, -- bam_hdr_t *translate) { -- hdr_match_t match = {0, 0}; -+ sam_hdr_t *translate) { -+ kstring_t hd_line = { 0, 0, NULL }; -+ int res; - - // TODO: handle case when @HD needs merging. 
- if (merged_hdr->have_hd) return 0; - -- if (hdr_line_match(translate->text, "@HD", NULL, &match) != 0) { -- return 0; -+ res = sam_hdr_find_hd(translate, &hd_line); -+ if (res < -1) { -+ print_error("merge", "failed to get @HD line from header"); -+ return -1; - } - -- if (match_to_ks(translate->text, &match, &merged_hdr->out_hd)) goto memfail; -- if (kputc('\n', &merged_hdr->out_hd) == EOF) goto memfail; -- merged_hdr->have_hd = true; -- -- return 0; -- -- memfail: -- perror(__func__); -- return -1; --} -+ if (res < 0) // Not found -+ return 0; - --static inline int grow_target_list(merged_header_t* merged_hdr) { -- size_t new_size; -- char **new_names; -- uint32_t *new_len; -- -- new_size = merged_hdr->targets_sz * 2; -- new_names = realloc(merged_hdr->target_name, sizeof(*new_names) * new_size); -- if (!new_names) goto fail; -- merged_hdr->target_name = new_names; -- -- new_len = realloc(merged_hdr->target_len, sizeof(*new_len) * new_size); -- if (!new_len) goto fail; -- merged_hdr->target_len = new_len; -+ if (sam_hdr_add_lines(merged_hdr->hdr, hd_line.s, hd_line.l) < 0) { -+ print_error("merge", "failed to add @HD line to new header"); -+ free(hd_line.s); -+ return -1; -+ } - -- merged_hdr->targets_sz = new_size; -+ free(hd_line.s); -+ merged_hdr->have_hd = true; - - return 0; -- -- fail: -- perror(__func__); -- return -1; - } - - /* -@@ -446,54 +355,48 @@ - * Returns 0 on success, -1 on failure. 
- */ - --static int trans_tbl_add_sq(merged_header_t* merged_hdr, bam_hdr_t *translate, -+static int trans_tbl_add_sq(merged_header_t* merged_hdr, sam_hdr_t *translate, - trans_tbl_t* tbl) { -- -- kstring_t *out_text = &merged_hdr->out_sq; -- khash_t(c2i)* sq_tids = merged_hdr->sq_tids; -- hdr_match_t *new_sq_matches = NULL; -- char *text; -- hdr_match_t matches[2]; - int32_t i; -- int32_t old_n_targets = merged_hdr->n_targets; -- khiter_t iter; -- int min_tid = -1; -+ int min_tid = -1, res; -+ kstring_t sq_line = { 0, 0, NULL }, sq_sn = { 0, 0, NULL }; - - // Fill in the tid part of the translation table, adding new targets - // to the merged header as we go. - -- for (i = 0; i < translate->n_targets; ++i) { -+ for (i = 0; i < sam_hdr_nref(translate); ++i) { -+ int trans_tid; -+ sq_sn.l = 0; -+ res = sam_hdr_find_tag_pos(translate, "SQ", i, "SN", &sq_sn); -+ if (res < 0) { -+ print_error("merge", "failed to get @SQ SN #%d from header", i + 1); -+ goto fail; -+ } - -- // Check if it's a new target. -- iter = kh_get(c2i, sq_tids, translate->target_name[i]); -+ trans_tid = sam_hdr_name2tid(merged_hdr->hdr, sq_sn.s); -+ if (trans_tid < -1) { -+ print_error("merge", "failed to lookup ref"); -+ goto fail; -+ } - -- if (iter == kh_end(sq_tids)) { -- int ret; -+ if (trans_tid < 0) { - // Append missing entries to out_hdr -- -- if (merged_hdr->n_targets == merged_hdr->targets_sz) { -- if (grow_target_list(merged_hdr)) goto fail; -+ sq_line.l = 0; -+ res = sam_hdr_find_line_id(translate, "SQ", "SN", sq_sn.s, &sq_line); -+ if (res < 0) { -+ print_error("merge", "failed to get @SQ SN:%s from header", sq_sn.s); -+ goto fail; - } - -- merged_hdr->target_name[merged_hdr->n_targets] = strdup(translate->target_name[i]); -- if (merged_hdr->target_name[merged_hdr->n_targets] == NULL) goto memfail; -- merged_hdr->target_len[merged_hdr->n_targets] = translate->target_len[i]; -- -- // Record the new identifier for reference below, -- // and when building the ttable for other inputs. 
-- iter = kh_put(c2i, sq_tids, -- merged_hdr->target_name[merged_hdr->n_targets], &ret); -- if (ret < 0) { -- free(merged_hdr->target_name[merged_hdr->n_targets]); -- goto memfail; -- } -- assert(ret > 0); // Should not be in hash already. -+ trans_tid = sam_hdr_nref(merged_hdr->hdr); - -- kh_value(sq_tids, iter) = merged_hdr->n_targets; -- tbl->tid_trans[i] = merged_hdr->n_targets++; -- } else { -- tbl->tid_trans[i] = kh_value(sq_tids, iter); -+ res = sam_hdr_add_lines(merged_hdr->hdr, sq_line.s, sq_line.l); -+ if (res < 0) { -+ print_error("merge", "failed to add @SQ SN:%s to new header", sq_sn.s); -+ goto fail; -+ } - } -+ tbl->tid_trans[i] = trans_tid; - - if (tbl->tid_trans[i] > min_tid) { - min_tid = tbl->tid_trans[i]; -@@ -502,78 +405,14 @@ - } - } - -- if (merged_hdr->n_targets == old_n_targets) -- return 0; // Everything done if no new targets. -- -- // Otherwise, find @SQ lines in translate->text for all newly added targets. -- -- new_sq_matches = malloc((merged_hdr->n_targets - old_n_targets) -- * sizeof(*new_sq_matches)); -- if (new_sq_matches == NULL) goto memfail; -- -- for (i = 0; i < merged_hdr->n_targets - old_n_targets; i++) { -- new_sq_matches[i].rm_so = new_sq_matches[i].rm_eo = -1; -- } -- -- text = translate->text; -- while (hdr_line_match(text, "@SQ", "SN", matches) == 0) { -- // matches[0] is whole line, matches[1] is SN value. -- -- // This is a bit disgusting, but avoids a copy... -- char c = text[matches[1].rm_eo]; -- int idx; -- -- text[matches[1].rm_eo] = '\0'; -- -- // Look up the SN value in the sq_tids hash. -- iter = kh_get(c2i, sq_tids, text + matches[1].rm_so); -- text[matches[1].rm_eo] = c; // restore text -- -- if (iter == kh_end(sq_tids)) { -- // Warn about this, but it's not really fatal. 
-- fprintf(samtools_stderr, "[W::%s] @SQ SN (%.*s) found in text header but not binary header.\n", -- __func__, -- (int) (matches[1].rm_eo - matches[1].rm_so), -- text + matches[1].rm_so); -- text += matches[0].rm_eo; -- continue; // Skip to next -- } -- -- idx = kh_value(sq_tids, iter); -- if (idx >= old_n_targets) { -- // is a new SQ, so record position so we can add it to out_text. -- assert(idx < merged_hdr->n_targets); -- ptrdiff_t off = text - translate->text; -- new_sq_matches[idx - old_n_targets].rm_so = matches[0].rm_so + off; -- new_sq_matches[idx - old_n_targets].rm_eo = matches[0].rm_eo + off; -- } -- -- // Carry on searching from end of current match -- text += matches[0].rm_eo; -- } -- -- // Copy the @SQ headers found and recreate any missing from binary header. -- for (i = 0; i < merged_hdr->n_targets - old_n_targets; i++) { -- if (new_sq_matches[i].rm_so >= 0) { -- if (match_to_ks(translate->text, &new_sq_matches[i], out_text)) -- goto memfail; -- if (kputc('\n', out_text) == EOF) goto memfail; -- } else { -- if (kputs("@SQ\tSN:", out_text) == EOF || -- kputs(merged_hdr->target_name[i + old_n_targets], out_text) == EOF || -- kputs("\tLN:", out_text) == EOF || -- kputuw(merged_hdr->target_len[i + old_n_targets], out_text) == EOF || -- kputc('\n', out_text) == EOF) goto memfail; -- } -- } -+ free(sq_line.s); -+ free(sq_sn.s); - -- free(new_sq_matches); - return 0; - -- memfail: -- perror(__func__); - fail: -- free(new_sq_matches); -+ free(sq_line.s); -+ free(sq_sn.s); - return -1; - } - -@@ -594,29 +433,30 @@ - * - */ - --static klist_t(hdrln) * trans_rg_pg(bool is_rg, bam_hdr_t *translate, -+static klist_t(hdrln) * trans_rg_pg(bool is_rg, sam_hdr_t *translate, - bool merge, khash_t(cset)* known_ids, - khash_t(c2c)* id_map, char *override) { -- hdr_match_t matches[2]; - khiter_t iter; -- const char *text = translate->text; -- const char *rec_type = is_rg ? "@RG" : "@PG"; -+ int num_ids, i; -+ const char *rec_type = is_rg ? 
"RG" : "PG"; - klist_t(hdrln) *hdr_lines; - - hdr_lines = kl_init(hdrln); - - // Search through translate's header -- while (hdr_line_match(text, rec_type, "ID", matches) == 0) { -- // matches[0] is the whole @RG/PG line; matches[1] is the ID field value -+ num_ids = sam_hdr_count_lines(translate, rec_type); -+ if (num_ids < 0) -+ goto fail; - -+ for (i = 0; i < num_ids; i++) { - kstring_t orig_id = { 0, 0, NULL }; // ID in original header - kstring_t transformed_id = { 0, 0, NULL }; // ID in output header - char *map_value; // Value to store in id_map - bool id_changed; // Have we changed the ID? - bool not_found_in_output; // ID isn't in the output header (yet) - -- // Take a copy of the ID as we'll need it for a hash key. -- if (match_to_ks(text, &matches[1], &orig_id)) goto memfail; -+ if (sam_hdr_find_tag_pos(translate, rec_type, i, "ID", &orig_id) < 0) -+ goto fail; - - // is our matched ID in our output ID set already? - iter = kh_get(cset, known_ids, ks_str(&orig_id)); -@@ -653,18 +493,38 @@ - - // Does this line need to go into our output header? 
- if (not_found_in_output) { -- - // Take matched line and replace ID with transformed_id - kstring_t new_hdr_line = { 0, 0, NULL }; -+ if (sam_hdr_find_line_id(translate, rec_type, -+ "ID", ks_str(&orig_id), &new_hdr_line) < 0){ -+ goto fail; -+ } -+ -+ if (id_changed) { -+ char *idp = strstr(ks_str(&new_hdr_line), "\tID:"), *id_end; -+ ptrdiff_t id_offset, id_len; -+ if (!idp) { -+ print_error("merge", "failed to find ID in \"%s\"\n", -+ ks_str(&new_hdr_line)); -+ goto fail; -+ } -+ idp += 4; -+ for (id_end = idp; *id_end >= '\n'; id_end++) {} -+ -+ id_offset = idp - new_hdr_line.s; -+ id_len = id_end - idp; - -- if (!id_changed) { // Can just copy -- if (match_to_ks(text, &matches[0], &new_hdr_line)) goto memfail; -- } else { // Substitute new name for original -- if (range_to_ks(text, matches[0].rm_so, matches[1].rm_so, -- &new_hdr_line)) goto memfail; -- if (ks_to_ks(&transformed_id, &new_hdr_line)) goto memfail; -- if (range_to_ks(text, matches[1].rm_eo, matches[0].rm_eo, -- &new_hdr_line)) goto memfail; -+ if (id_len < transformed_id.l) { -+ if (ks_resize(&new_hdr_line, new_hdr_line.l + transformed_id.l - id_len)) -+ goto fail; -+ } -+ if (id_len != transformed_id.l) { -+ memmove(new_hdr_line.s + id_offset + transformed_id.l, -+ new_hdr_line.s + id_offset + id_len, -+ new_hdr_line.l - id_offset - id_len + 1); -+ } -+ memcpy(new_hdr_line.s + id_offset, transformed_id.s, -+ transformed_id.l); - } - - // append line to output linked list -@@ -688,8 +548,6 @@ - int in_there = 0; - iter = kh_put(c2c, id_map, ks_release(&orig_id), &in_there); - kh_value(id_map, iter) = map_value; -- -- text += matches[0].rm_eo; // next! - } - - // If there are no RG lines in the file and we are overriding add one -@@ -726,6 +584,7 @@ - - memfail: - perror(__func__); -+ fail: - if (hdr_lines) kl_destroy(hdrln, hdr_lines); - return NULL; - } -@@ -823,16 +682,18 @@ - * Returns 0 on success, -1 on failure. 
- */ - --static int trans_tbl_init(merged_header_t* merged_hdr, bam_hdr_t* translate, -+static int trans_tbl_init(merged_header_t* merged_hdr, sam_hdr_t* translate, - trans_tbl_t* tbl, bool merge_rg, bool merge_pg, - bool copy_co, char* rg_override) - { -+ kstring_t lines = { 0, 0, NULL }; - klist_t(hdrln) *rg_list = NULL; - klist_t(hdrln) *pg_list = NULL; - -- tbl->n_targets = translate->n_targets; -+ tbl->n_targets = sam_hdr_nref(translate); - tbl->rg_trans = tbl->pg_trans = NULL; -- tbl->tid_trans = (int*)calloc(translate->n_targets, sizeof(int)); -+ tbl->tid_trans = (int*)calloc(tbl->n_targets ? tbl->n_targets : 1, -+ sizeof(int)); - if (tbl->tid_trans == NULL) goto memfail; - tbl->rg_trans = kh_init(c2c); - if (tbl->rg_trans == NULL) goto memfail; -@@ -861,6 +722,7 @@ - goto fail; - - // Fix-up PP: tags in the new @PG records and add to output -+ lines.l = 0; - if (finish_rg_pg(false, pg_list, tbl->pg_trans, &merged_hdr->out_pg)) - goto fail; - -@@ -869,22 +731,22 @@ - - if (copy_co) { - // Just append @CO headers without translation -- const char *line, *end_pointer; -- for (line = translate->text; *line; line = end_pointer + 1) { -- end_pointer = strchr(line, '\n'); -- if (strncmp(line, "@CO", 3) == 0) { -- if (end_pointer) { -- if (kputsn(line, end_pointer - line + 1, &merged_hdr->out_co) == EOF) -- goto memfail; -- } else { // Last line with no trailing '\n' -- if (kputs(line, &merged_hdr->out_co) == EOF) goto memfail; -- if (kputc('\n', &merged_hdr->out_co) == EOF) goto memfail; -- } -- } -- if (end_pointer == NULL) break; -+ int num_co = sam_hdr_count_lines(translate, "CO"), i; -+ if (num_co < 0) -+ goto fail; -+ -+ for (i = 0; i < num_co; i++) { -+ if (sam_hdr_find_line_pos(translate, "CO", i, &lines) < 0) -+ goto fail; -+ if (ks_to_ks(&lines, &merged_hdr->out_co)) -+ goto fail; -+ if (kputc('\n', &merged_hdr->out_co) < 0) -+ goto fail; - } - } - -+ free(lines.s); -+ - return 0; - - memfail: -@@ -893,80 +755,22 @@ - trans_tbl_destroy(tbl); - if 
(rg_list) kl_destroy(hdrln, rg_list); - if (pg_list) kl_destroy(hdrln, pg_list); -+ free(lines.s); - return -1; - } - --static inline void move_kstr_to_text(char **text, kstring_t *ks) { -- memcpy(*text, ks_str(ks), ks_len(ks)); -- *text += ks_len(ks); -- **text = '\0'; -- free(ks_release(ks)); --} -- --/* -- * Populate a bam_hdr_t struct from data in a merged_header_t. -- */ -- --static bam_hdr_t * finish_merged_header(merged_header_t *merged_hdr) { -- size_t txt_sz; -- char *text; -- bam_hdr_t *hdr; -- -- // Check output text size -- txt_sz = (ks_len(&merged_hdr->out_hd) -- + ks_len(&merged_hdr->out_sq) -- + ks_len(&merged_hdr->out_rg) -- + ks_len(&merged_hdr->out_pg) -- + ks_len(&merged_hdr->out_co)); -- if (txt_sz >= INT32_MAX) { -- fprintf(samtools_stderr, "[%s] Output header text too long\n", __func__); -- return NULL; -- } -- -- // Allocate new header -- hdr = bam_hdr_init(); -- if (hdr == NULL) goto memfail; -- -- // Transfer targets arrays to new header -- hdr->n_targets = merged_hdr->n_targets; -- if (hdr->n_targets > 0) { -- // Try to shrink targets arrays to correct size -- hdr->target_name = realloc(merged_hdr->target_name, -- hdr->n_targets * sizeof(char*)); -- if (!hdr->target_name) hdr->target_name = merged_hdr->target_name; -- -- hdr->target_len = realloc(merged_hdr->target_len, -- hdr->n_targets * sizeof(uint32_t)); -- if (!hdr->target_len) hdr->target_len = merged_hdr->target_len; -- -- // These have either been freed by realloc() or, in the unlikely -- // event that failed, have had their ownership transferred to hdr -- merged_hdr->target_name = NULL; -- merged_hdr->target_len = NULL; -- } -- else { -- hdr->target_name = NULL; -- hdr->target_len = NULL; -- } -- -- // Allocate text -- text = hdr->text = malloc(txt_sz + 1); -- if (!text) goto memfail; -- -- // Put header text in order @HD, @SQ, @RG, @PG, @CO -- move_kstr_to_text(&text, &merged_hdr->out_hd); -- move_kstr_to_text(&text, &merged_hdr->out_sq); -- move_kstr_to_text(&text, 
&merged_hdr->out_rg); -- move_kstr_to_text(&text, &merged_hdr->out_pg); -- move_kstr_to_text(&text, &merged_hdr->out_co); -- hdr->l_text = txt_sz; -- -- return hdr; -+static int finish_merged_header(merged_header_t *merged_hdr) { -+ if (sam_hdr_add_lines(merged_hdr->hdr, ks_c_str(&merged_hdr->out_rg), -+ ks_len(&merged_hdr->out_rg)) < 0) -+ return -1; -+ if (sam_hdr_add_lines(merged_hdr->hdr, ks_c_str(&merged_hdr->out_pg), -+ ks_len(&merged_hdr->out_pg)) < 0) -+ return -1; -+ if (sam_hdr_add_lines(merged_hdr->hdr, ks_c_str(&merged_hdr->out_co), -+ ks_len(&merged_hdr->out_co)) < 0) -+ return -1; - -- memfail: -- perror(__func__); -- bam_hdr_destroy(hdr); -- return NULL; -+ return 0; - } - - /* -@@ -981,8 +785,6 @@ - size_t i; - khiter_t iter; - if (!merged_hdr) return; -- free(ks_release(&merged_hdr->out_hd)); -- free(ks_release(&merged_hdr->out_sq)); - free(ks_release(&merged_hdr->out_rg)); - free(ks_release(&merged_hdr->out_pg)); - free(ks_release(&merged_hdr->out_co)); -@@ -1149,25 +951,30 @@ - @param cmd command name (used in print_error() etc) - @param in_fmt format options for input files - @param out_fmt output file format and options -+ @param write_index create the index, together with the output file -+ @param arg_list command string for PG line -+ @param no_pg if 1, do not add a new PG line - @discussion Padding information may NOT correctly maintained. This - function is NOT thread safe. 
- */ - int bam_merge_core2(int by_qname, char* sort_tag, const char *out, const char *mode, -- const char *headers, int n, char * const *fn, int flag, -- const char *reg, int n_threads, const char *cmd, -- const htsFormat *in_fmt, const htsFormat *out_fmt) -+ const char *headers, int n, char * const *fn, char * const *fn_idx, -+ int flag, const char *reg, int n_threads, const char *cmd, -+ const htsFormat *in_fmt, const htsFormat *out_fmt, int write_index, -+ char *arg_list, int no_pg) - { - samFile *fpout, **fp = NULL; - heap1_t *heap = NULL; -- bam_hdr_t *hout = NULL; -- bam_hdr_t *hin = NULL; -+ sam_hdr_t *hout = NULL; -+ sam_hdr_t *hin = NULL; - int i, j, *RG_len = NULL; - uint64_t idx = 0; - char **RG = NULL; - hts_itr_t **iter = NULL; -- bam_hdr_t **hdr = NULL; -+ sam_hdr_t **hdr = NULL; - trans_tbl_t *translation_tbl = NULL; - int *rtrans = NULL; -+ char *out_idx_fn = NULL; - merged_header_t *merged_hdr = init_merged_header(); - if (!merged_hdr) return -1; - -@@ -1190,7 +997,7 @@ - if (sort_tag) { - g_is_by_tag = 1; - g_sort_tag[0] = sort_tag[0]; -- g_sort_tag[1] = sort_tag[1]; -+ g_sort_tag[1] = sort_tag[0] ? 
sort_tag[1] : '\0'; - } - - fp = (samFile**)calloc(n, sizeof(samFile*)); -@@ -1199,7 +1006,7 @@ - if (!heap) goto mem_fail; - iter = (hts_itr_t**)calloc(n, sizeof(hts_itr_t*)); - if (!iter) goto mem_fail; -- hdr = (bam_hdr_t**)calloc(n, sizeof(bam_hdr_t*)); -+ hdr = (sam_hdr_t**)calloc(n, sizeof(sam_hdr_t*)); - if (!hdr) goto mem_fail; - translation_tbl = (trans_tbl_t*)calloc(n, sizeof(trans_tbl_t)); - if (!translation_tbl) goto mem_fail; -@@ -1236,7 +1043,7 @@ - - // open and read the header from each file - for (i = 0; i < n; ++i) { -- bam_hdr_t *hin; -+ sam_hdr_t *hin; - fp[i] = sam_open_format(fn[i], "r", in_fmt); - if (fp[i] == NULL) { - print_error_errno(cmd, "fail to open \"%s\"", fn[i]); -@@ -1257,7 +1064,7 @@ - // TODO sam_itr_next() doesn't yet work for SAM files, - // so for those keep the headers around for use with sam_read1() - if (hts_get_format(fp[i])->format == sam) hdr[i] = hin; -- else { bam_hdr_destroy(hin); hdr[i] = NULL; } -+ else { sam_hdr_destroy(hin); hdr[i] = NULL; } - - if ((translation_tbl+i)->lost_coord_sort && !by_qname) { - fprintf(samtools_stderr, "[bam_merge_core] Order of targets in file %s caused coordinate sort to be lost\n", fn[i]); -@@ -1286,41 +1093,34 @@ - } - - // Transform the header into standard form -- hout = finish_merged_header(merged_hdr); -+ if (finish_merged_header(merged_hdr) < 0) -+ goto fail; -+ -+ hout = merged_hdr->hdr; - if (!hout) return -1; // FIXME: memory leak - - // If we're only merging a specified region move our iters to start at that point - if (reg) { -- int tid, beg, end; -- const char *name_lim; -+ int tid; -+ hts_pos_t beg, end; - -- rtrans = rtrans_build(n, hout->n_targets, translation_tbl); -+ rtrans = rtrans_build(n, sam_hdr_nref(hout), translation_tbl); - if (!rtrans) goto mem_fail; - -- name_lim = hts_parse_reg(reg, &beg, &end); -- if (name_lim) { -- char *name = malloc(name_lim - reg + 1); -- if (!name) goto mem_fail; -- memcpy(name, reg, name_lim - reg); -- name[name_lim - reg] = '\0'; -- 
tid = bam_name2id(hout, name); -- free(name); -- } -- else { -- // not parsable as a region, but possibly a sequence named "foo:a" -- tid = bam_name2id(hout, reg); -- beg = 0; -- end = INT_MAX; -- } -- if (tid < 0) { -- if (name_lim) fprintf(samtools_stderr, "[%s] Region \"%s\" specifies an unknown reference name\n", __func__, reg); -- else fprintf(samtools_stderr, "[%s] Badly formatted region: \"%s\"\n", __func__, reg); -+ if (!sam_parse_region(hout, reg, &tid, &beg, &end, 0)) { -+ fprintf(samtools_stderr, "[%s] Badly formatted region or unknown reference name: \"%s\"\n", __func__, reg); - goto fail; - } - for (i = 0; i < n; ++i) { -- hts_idx_t *idx = sam_index_load(fp[i], fn[i]); -+ hts_idx_t *idx = NULL; -+ // If index filename has not been specfied, look in BAM folder -+ if (fn_idx != NULL) { -+ idx = sam_index_load2(fp[i], fn[i], fn_idx[i]); -+ } else { -+ idx = sam_index_load(fp[i], fn[i]); -+ } - // (rtrans[i*n+tid]) Look up what hout tid translates to in input tid space -- int mapped_tid = rtrans[i*hout->n_targets+tid]; -+ int mapped_tid = rtrans[i*sam_hdr_nref(hout)+tid]; - if (idx == NULL) { - fprintf(samtools_stderr, "[%s] failed to load index for %s. Random alignment retrieval only works for indexed BAM or CRAM files.\n", - __func__, fn[i]); -@@ -1336,7 +1136,7 @@ - if (mapped_tid != INT32_MIN) { - fprintf(samtools_stderr, - "[%s] failed to get iterator over " -- "{%s, %d, %d, %d}\n", -+ "{%s, %d, %"PRIhts_pos", %"PRIhts_pos"}\n", - __func__, fn[i], mapped_tid, beg, end); - } else { - fprintf(samtools_stderr, -@@ -1373,7 +1173,8 @@ - res = iter[i] ? 
sam_itr_next(fp[i], iter[i], h->entry.bam_record) : sam_read1(fp[i], hdr[i], h->entry.bam_record); - if (res >= 0) { - bam_translate(h->entry.bam_record, translation_tbl + i); -- h->pos = ((uint64_t)h->entry.bam_record->core.tid<<32) | (uint32_t)((int32_t)h->entry.bam_record->core.pos+1); -+ h->tid = h->entry.bam_record->core.tid; -+ h->pos = (uint64_t)(h->entry.bam_record->core.pos + 1); - h->rev = bam_is_rev(h->entry.bam_record); - h->idx = idx++; - if (g_is_by_tag) { -@@ -1398,11 +1199,26 @@ - print_error_errno(cmd, "failed to create \"%s\"", out); - return -1; - } -+ if (!no_pg && sam_hdr_add_pg(hout, "samtools", -+ "VN", samtools_version(), -+ arg_list ? "CL": NULL, -+ arg_list ? arg_list : NULL, -+ NULL)) { -+ print_error(cmd, "failed to add PG line to the header of \"%s\"", out); -+ sam_close(fpout); -+ return -1; -+ } - if (sam_hdr_write(fpout, hout) != 0) { - print_error_errno(cmd, "failed to write header to \"%s\"", out); - sam_close(fpout); - return -1; - } -+ if (write_index) { -+ if (!(out_idx_fn = auto_index(fpout, out, hout))){ -+ sam_close(fpout); -+ return -1; -+ } -+ } - if (!(flag & MERGE_UNCOMP)) hts_set_threads(fpout, n_threads); - - // Begin the actual merge -@@ -1417,11 +1233,13 @@ - if (sam_write1(fpout, hout, b) < 0) { - print_error_errno(cmd, "failed writing to \"%s\"", out); - sam_close(fpout); -+ free(out_idx_fn); - return -1; - } - if ((j = (iter[heap->i]? 
sam_itr_next(fp[heap->i], iter[heap->i], b) : sam_read1(fp[heap->i], hdr[heap->i], b))) >= 0) { - bam_translate(b, translation_tbl + heap->i); -- heap->pos = ((uint64_t)b->core.tid<<32) | (uint32_t)((int)b->core.pos+1); -+ heap->tid = b->core.tid; -+ heap->pos = (uint64_t)(b->core.pos + 1); - heap->rev = bam_is_rev(b); - heap->idx = idx++; - if (g_is_by_tag) { -@@ -1441,6 +1259,14 @@ - ks_heapadjust(heap, 0, n, heap); - } - -+ if (write_index) { -+ if (sam_idx_save(fpout) < 0) { -+ print_error_errno("merge", "writing index failed"); -+ goto fail; -+ } -+ } -+ free(out_idx_fn); -+ - // Clean up and close - if (flag & MERGE_RG) { - for (i = 0; i != n; ++i) free(RG[i]); -@@ -1449,11 +1275,11 @@ - for (i = 0; i < n; ++i) { - trans_tbl_destroy(translation_tbl + i); - hts_itr_destroy(iter[i]); -- bam_hdr_destroy(hdr[i]); -+ sam_hdr_destroy(hdr[i]); - sam_close(fp[i]); - } -- bam_hdr_destroy(hin); -- bam_hdr_destroy(hout); -+ sam_hdr_destroy(hin); -+ sam_hdr_destroy(hout); - free_merged_header(merged_hdr); - free(RG); free(translation_tbl); free(fp); free(heap); free(iter); free(hdr); - if (sam_close(fpout) < 0) { -@@ -1475,11 +1301,11 @@ - for (i = 0; i < n; ++i) { - if (translation_tbl && translation_tbl[i].tid_trans) trans_tbl_destroy(translation_tbl + i); - if (iter && iter[i]) hts_itr_destroy(iter[i]); -- if (hdr && hdr[i]) bam_hdr_destroy(hdr[i]); -+ if (hdr && hdr[i]) sam_hdr_destroy(hdr[i]); - if (fp && fp[i]) sam_close(fp[i]); - if (heap && heap[i].entry.bam_record) bam_destroy1(heap[i].entry.bam_record); - } -- if (hout) bam_hdr_destroy(hout); -+ if (hout) sam_hdr_destroy(hout); - free(RG); - free(translation_tbl); - free(hdr); -@@ -1487,6 +1313,7 @@ - free(heap); - free(fp); - free(rtrans); -+ free(out_idx_fn); - return -1; - } - -@@ -1497,7 +1324,7 @@ - strcpy(mode, "wb"); - if (flag & MERGE_UNCOMP) strcat(mode, "0"); - else if (flag & MERGE_LEVEL1) strcat(mode, "1"); -- return bam_merge_core2(by_qname, NULL, out, mode, headers, n, fn, flag, reg, 0, "merge", 
NULL, NULL); -+ return bam_merge_core2(by_qname, NULL, out, mode, headers, n, fn, NULL, flag, reg, 0, "merge", NULL, NULL, 0, NULL, 1); - } - - static void merge_usage(FILE *to) -@@ -1518,23 +1345,27 @@ - " -c Combine @RG headers with colliding IDs [alter IDs to be distinct]\n" - " -p Combine @PG headers with colliding IDs [alter IDs to be distinct]\n" - " -s VALUE Override random seed\n" --" -b FILE List of input BAM filenames, one per line [null]\n"); -- sam_global_opt_help(to, "-.O..@"); -+" -b FILE List of input BAM filenames, one per line [null]\n" -+" -X Use customized index files\n" -+" --no-PG do not add a PG line\n"); -+ sam_global_opt_help(to, "-.O..@.."); - } - - int bam_merge(int argc, char *argv[]) - { -- int c, is_by_qname = 0, flag = 0, ret = 0, level = -1; -+ int c, is_by_qname = 0, flag = 0, ret = 0, level = -1, has_index_file = 0; - char *fn_headers = NULL, *reg = NULL, mode[12]; -- char *sort_tag = NULL; -+ char *sort_tag = NULL, *arg_list = NULL; - long random_seed = (long)time(NULL); - char** fn = NULL; -- int fn_size = 0; -+ char** fn_idx = NULL; -+ int fn_size = 0, no_pg = 0; - - sam_global_args ga = SAM_GLOBAL_ARGS_INIT; - static const struct option lopts[] = { - SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0, '@'), - { "threads", required_argument, NULL, '@' }, -+ {"no-PG", no_argument, NULL, 1}, - { NULL, 0, NULL, 0 } - }; - -@@ -1543,13 +1374,13 @@ - return 0; - } - -- while ((c = getopt_long(argc, argv, "h:nru1R:f@:l:cps:b:O:t:", lopts, NULL)) >= 0) { -+ while ((c = getopt_long(argc, argv, "h:nru1R:f@:l:cps:b:O:t:X", lopts, NULL)) >= 0) { - switch (c) { - case 'r': flag |= MERGE_RG; break; - case 'f': flag |= MERGE_FORCE; break; -- case 'h': fn_headers = strdup(optarg); break; -+ case 'h': fn_headers = optarg; break; - case 'n': is_by_qname = 1; break; -- case 't': sort_tag = strdup(optarg); break; -+ case 't': sort_tag = optarg; break; - case '1': flag |= MERGE_LEVEL1; level = 1; break; - case 'u': flag |= MERGE_UNCOMP; level = 0; break; - 
case 'R': reg = strdup(optarg); break; -@@ -1557,8 +1388,13 @@ - case 'c': flag |= MERGE_COMBINE_RG; break; - case 'p': flag |= MERGE_COMBINE_PG; break; - case 's': random_seed = atol(optarg); break; -+ case 'X': has_index_file = 1; break; // -X flag for index filename - case 'b': { - // load the list of files to read -+ if (has_index_file) { -+ fprintf(samtools_stderr,"Error: The -b option cannot be combined with -X\n"); -+ ret = 1; goto end; -+ } - int nfiles; - char **fn_read = hts_readlines(optarg, &nfiles); - if (fn_read) { -@@ -1575,7 +1411,7 @@ - } - break; - } -- -+ case 1: no_pg = 1; break; - default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; - /* else fall-through */ - case '?': merge_usage(samtools_stderr); return 1; -@@ -1587,6 +1423,11 @@ - return 1; - } - -+ if (!no_pg && !(arg_list = stringify_argv(argc+1, argv-1))) { -+ print_error("merge", "failed to create arg_list"); -+ return 1; -+ } -+ - srand48(random_seed); - if (!(flag & MERGE_FORCE) && strcmp(argv[optind], "-")) { - FILE *fp = fopen(argv[optind], "rb"); -@@ -1597,24 +1438,41 @@ - } - } - -- int nargcfiles = argc - (optind+1); -+ int nargcfiles = 0; -+ if (has_index_file) { // Calculate # of input BAM files -+ if ((argc - optind - 1) % 2 != 0) { -+ fprintf(samtools_stderr, "Odd number of filenames detected! 
Each BAM file should have an index file\n"); -+ return 1; -+ } -+ nargcfiles = (argc - optind - 1) / 2; -+ } else { -+ nargcfiles = argc - optind - 1; -+ } -+ - if (nargcfiles > 0) { - // Add argc files to end of array - fn = realloc(fn, (fn_size+nargcfiles) * sizeof(char*)); - if (fn == NULL) { ret = 1; goto end; } - memcpy(fn+fn_size, argv + (optind+1), nargcfiles * sizeof(char*)); -+ -+ if(has_index_file) { -+ fn_idx = realloc(fn_idx, nargcfiles * sizeof(char*)); -+ if (fn_idx == NULL) { ret = 1; goto end; } -+ memcpy(fn_idx+fn_size, argv + nargcfiles + (optind+1), nargcfiles * sizeof(char*)); -+ } - } - if (fn_size+nargcfiles < 1) { - print_error("merge", "You must specify at least one (and usually two or more) input files"); - merge_usage(samtools_stderr); -+ free(fn_idx); - return 1; - } - strcpy(mode, "wb"); - sam_open_mode(mode+1, argv[optind], NULL); - if (level >= 0) sprintf(strchr(mode, '\0'), "%d", level < 9? level : 9); - if (bam_merge_core2(is_by_qname, sort_tag, argv[optind], mode, fn_headers, -- fn_size+nargcfiles, fn, flag, reg, ga.nthreads, -- "merge", &ga.in, &ga.out) < 0) -+ fn_size+nargcfiles, fn, fn_idx, flag, reg, ga.nthreads, -+ "merge", &ga.in, &ga.out, ga.write_index, arg_list, no_pg) < 0) - ret = 1; - - end: -@@ -1623,8 +1481,9 @@ - for (i=0; ii, res; - if (i < nfiles) { // read from file - res = sam_read1(fp[i], hout, heap->entry.bam_record); -@@ -1657,8 +1516,8 @@ - } - } - if (res >= 0) { -- heap->pos = (((uint64_t)heap->entry.bam_record->core.tid<<32) -- | (uint32_t)((int32_t)heap->entry.bam_record->core.pos+1)); -+ heap->tid = heap->entry.bam_record->core.tid; -+ heap->pos = (uint64_t)(heap->entry.bam_record->core.pos + 1); - heap->rev = bam_is_rev(heap->entry.bam_record); - heap->idx = (*idx)++; - if (g_is_by_tag) { -@@ -1678,21 +1537,23 @@ - } - - static int bam_merge_simple(int by_qname, char *sort_tag, const char *out, -- const char *mode, bam_hdr_t *hout, -+ const char *mode, sam_hdr_t *hout, - int n, char * const *fn, int 
num_in_mem, - buf_region *in_mem, bam1_tag *buf, int n_threads, - const char *cmd, const htsFormat *in_fmt, -- const htsFormat *out_fmt) { -+ const htsFormat *out_fmt, char *arg_list, int no_pg, -+ int write_index) { - samFile *fpout = NULL, **fp = NULL; - heap1_t *heap = NULL; - uint64_t idx = 0; - int i, heap_size = n + num_in_mem; -+ char *out_idx_fn = NULL; - - g_is_by_qname = by_qname; - if (sort_tag) { - g_is_by_tag = 1; - g_sort_tag[0] = sort_tag[0]; -- g_sort_tag[1] = sort_tag[1]; -+ g_sort_tag[1] = sort_tag[0] ? sort_tag[1] : '\0'; - } - if (n > 0) { - fp = (samFile**)calloc(n, sizeof(samFile*)); -@@ -1703,7 +1564,7 @@ - - // Open each file, read the header and put the first read into the heap - for (i = 0; i < heap_size; i++) { -- bam_hdr_t *hin; -+ sam_hdr_t *hin; - heap1_t *h = &heap[i]; - - if (i < n) { -@@ -1720,7 +1581,7 @@ - goto fail; - } - // ... and throw it away as we don't really need it -- bam_hdr_destroy(hin); -+ sam_hdr_destroy(hin); - } - - // Get a read into the heap -@@ -1743,6 +1604,16 @@ - return -1; - } - -+ if (!no_pg && sam_hdr_add_pg(hout, "samtools", -+ "VN", samtools_version(), -+ arg_list ? "CL": NULL, -+ arg_list ? 
arg_list : NULL, -+ NULL)) { -+ print_error(cmd, "failed to add PG line to the header of \"%s\"", out); -+ sam_close(fpout); -+ return -1; -+ } -+ - if (n_threads > 1) hts_set_threads(fpout, n_threads); - - if (sam_hdr_write(fpout, hout) != 0) { -@@ -1751,14 +1622,20 @@ - return -1; - } - -+ if (write_index) { -+ if (!(out_idx_fn = auto_index(fpout, out, hout))){ -+ sam_close(fpout); -+ return -1; -+ } -+ } -+ - // Now do the merge - ks_heapmake(heap, heap_size, heap); - while (heap->pos != HEAP_EMPTY) { - bam1_t *b = heap->entry.bam_record; - if (sam_write1(fpout, hout, b) < 0) { - print_error_errno(cmd, "failed writing to \"%s\"", out); -- sam_close(fpout); -- return -1; -+ goto fail; - } - if (heap_add_read(heap, n, fp, num_in_mem, in_mem, buf, &idx, hout) < 0) { - assert(heap->i < n); -@@ -1777,6 +1654,15 @@ - } - free(fp); - free(heap); -+ -+ if (write_index) { -+ if (sam_idx_save(fpout) < 0) { -+ print_error_errno("merge", "writing index failed"); -+ goto fail; -+ } -+ free(out_idx_fn); -+ } -+ - if (sam_close(fpout) < 0) { - print_error(cmd, "error closing output file"); - return -1; -@@ -1788,11 +1674,15 @@ - fail: - for (i = 0; i < n; i++) { - if (fp && fp[i]) sam_close(fp[i]); -- if (heap && heap[i].entry.bam_record) bam_destroy1(heap[i].entry.bam_record); -+ } -+ for (i = 0; i < heap_size; i++) { -+ if (heap && heap[i].i < n && heap[i].entry.bam_record) -+ bam_destroy1(heap[i].entry.bam_record); - } - free(fp); - free(heap); - if (fpout) sam_close(fpout); -+ free(out_idx_fn); - return -1; - } - -@@ -1813,8 +1703,13 @@ - if (t != 0) return t; - return (int) (a.bam_record->core.flag&0xc0) - (int) (b.bam_record->core.flag&0xc0); - } else { -- pa = (uint64_t)a.bam_record->core.tid<<32|(a.bam_record->core.pos+1); -- pb = (uint64_t)b.bam_record->core.tid<<32|(b.bam_record->core.pos+1); -+ pa = a.bam_record->core.tid; -+ pb = b.bam_record->core.tid; -+ -+ if (pa == pb) { -+ pa = (uint64_t)(a.bam_record->core.pos+1); -+ pb = (uint64_t)(b.bam_record->core.pos+1); 
-+ } - - if (pa == pb) { - pa = bam_is_rev(a.bam_record); -@@ -1915,7 +1810,7 @@ - size_t buf_len; - const char *prefix; - bam1_tag *buf; -- const bam_hdr_t *h; -+ const sam_hdr_t *h; - int index; - int error; - int no_save; -@@ -1923,45 +1818,99 @@ - - // Returns 0 for success - // -1 for failure --static int write_buffer(const char *fn, const char *mode, size_t l, bam1_tag *buf, const bam_hdr_t *h, int n_threads, const htsFormat *fmt) -+static int write_buffer(const char *fn, const char *mode, size_t l, bam1_tag *buf, -+ const sam_hdr_t *h, int n_threads, const htsFormat *fmt, -+ char *arg_list, int no_pg, int write_index) - { - size_t i; - samFile* fp; -+ char *out_idx_fn = NULL; -+ - fp = sam_open_format(fn, mode, fmt); - if (fp == NULL) return -1; -- if (sam_hdr_write(fp, h) != 0) goto fail; -+ if (!no_pg && sam_hdr_add_pg((sam_hdr_t *)h, "samtools", -+ "VN", samtools_version(), -+ arg_list ? "CL": NULL, -+ arg_list ? arg_list : NULL, -+ NULL)) { -+ goto fail; -+ } -+ if (sam_hdr_write(fp, (sam_hdr_t *)h) != 0) goto fail; -+ -+ if (write_index) { -+ if (!(out_idx_fn = auto_index(fp, fn, (sam_hdr_t *)h))) goto fail; -+ } -+ - if (n_threads > 1) hts_set_threads(fp, n_threads); - for (i = 0; i < l; ++i) { -- if (sam_write1(fp, h, buf[i].bam_record) < 0) goto fail; -+ if (sam_write1(fp, (sam_hdr_t *)h, buf[i].bam_record) < 0) goto fail; - } -+ -+ if (write_index) { -+ if (sam_idx_save(fp) < 0) { -+ print_error_errno("merge", "writing index failed"); -+ goto fail; -+ } -+ free(out_idx_fn); -+ } -+ -+ - if (sam_close(fp) < 0) return -1; - return 0; - fail: - sam_close(fp); -+ free(out_idx_fn); - return -1; - } - - #define NUMBASE 256 --#define STEP 8 - --static int ks_radixsort(size_t n, bam1_tag *buf, const bam_hdr_t *h) -+static int ks_radixsort(size_t n, bam1_tag *buf, const sam_hdr_t *h) - { - int curr = 0, ret = -1; - ssize_t i; - bam1_tag *buf_ar2[2], *bam_a, *bam_b; -- uint64_t max_pos = 0, max_digit = 0, shift = 0; -- -+ uint64_t max_pos = 1; -+ uint32_t 
max_tid = 1, tid_bytes = 0, pos_bytes = 0, byte = 0; -+ uint32_t tid_shift_l, tid_shift_r; -+ int nref = sam_hdr_nref(h); -+ -+ // Count number of bytes needed for biggest tid and pos -+ // Notes: Add 1 to core.pos so always positive. -+ // Convert unmapped tid (-1) to number of references so unmapped -+ // sort to the end. - for (i = 0; i < n; i++) { - bam1_t *b = buf[i].bam_record; -- int32_t tid = b->core.tid == -1 ? h->n_targets : b->core.tid; -- buf[i].u.pos = (uint64_t)tid<<32 | (b->core.pos+1)<<1 | bam_is_rev(b); -- if (max_pos < buf[i].u.pos) -- max_pos = buf[i].u.pos; -- } -- -- while (max_pos) { -- ++max_digit; -- max_pos = max_pos >> 1; -+ uint32_t tid = b->core.tid == -1 ? nref : b->core.tid; -+ uint64_t pos = ((uint64_t)(b->core.pos + 1) << 1) | bam_is_rev(b); -+ if (max_tid < tid) -+ max_tid = tid; -+ if (max_pos < pos) -+ max_pos = pos; -+ } -+ -+ for (; max_pos > 0; max_pos >>= 8) pos_bytes++; -+ for (; max_tid > 0; max_tid >>= 8) tid_bytes++; -+ assert(pos_bytes + tid_bytes < sizeof(buf[0].u.pos_tid)); -+ -+ tid_shift_l = pos_bytes * 8; -+ tid_shift_r = 64 - tid_shift_l; -+ -+ // Write position and tid into bam1_tag::u::pos_tid using minimum number -+ // of bytes required. Values are stored little-endian so that we -+ // get a least-significant digit (byte) radix sort. -+ for (i = 0; i < n; i++) { -+ bam1_t *b = buf[i].bam_record; -+ uint32_t tid = b->core.tid == -1 ? nref : b->core.tid; -+ // 'pos' here includes as many bytes of tid as will fit -+ // in the space remaining above pos_bytes. The rest of tid -+ // is written out separately. -+ uint64_t pos = (bam_is_rev(b) | -+ ((uint64_t)(b->core.pos + 1) << 1) | -+ (tid_shift_l < 64 ? (uint64_t) tid << tid_shift_l : 0)); -+ u64_to_le(pos, buf[i].u.pos_tid); -+ u32_to_le(tid_shift_r < 32 ? 
tid >> tid_shift_r : 0, -+ &buf[i].u.pos_tid[8]); - } - - buf_ar2[0] = buf; -@@ -1971,18 +1920,18 @@ - goto err; - } - -- while (shift < max_digit){ -+ // Least-significant digit radix sort (where "digits" are bytes) -+ for (byte = 0; byte < pos_bytes + tid_bytes; byte++) { - size_t remainders[NUMBASE] = { 0 }; - bam_a = buf_ar2[curr]; bam_b = buf_ar2[1-curr]; - for (i = 0; i < n; ++i) -- remainders[(bam_a[i].u.pos >> shift) % NUMBASE]++; -+ remainders[bam_a[i].u.pos_tid[byte]]++; - for (i = 1; i < NUMBASE; ++i) - remainders[i] += remainders[i - 1]; - for (i = n - 1; i >= 0; i--) { -- size_t j = --remainders[(bam_a[i].u.pos >> shift) % NUMBASE]; -+ size_t j = --remainders[bam_a[i].u.pos_tid[byte]]; - bam_b[j] = bam_a[i]; - } -- shift += STEP; - curr = 1 - curr; - } - if (curr == 1) { -@@ -2036,10 +1985,10 @@ - return 0; - } - -- if (write_buffer(name, "wcx1", w->buf_len, w->buf, w->h, 0, &fmt) < 0) -+ if (write_buffer(name, "wcx1", w->buf_len, w->buf, w->h, 0, &fmt, NULL, 1, 0) < 0) - w->error = errno; - } else { -- if (write_buffer(name, "wbx1", w->buf_len, w->buf, w->h, 0, NULL) < 0) -+ if (write_buffer(name, "wbx1", w->buf_len, w->buf, w->h, 0, NULL, NULL, 1, 0) < 0) - w->error = errno; - } - -@@ -2048,7 +1997,7 @@ - } - - static int sort_blocks(int n_files, size_t k, bam1_tag *buf, const char *prefix, -- const bam_hdr_t *h, int n_threads, buf_region *in_mem) -+ const sam_hdr_t *h, int n_threads, buf_region *in_mem) - { - int i; - size_t pos, rest; -@@ -2109,6 +2058,9 @@ - @param max_mem approxiate maximum memory (very inaccurate) - @param in_fmt input file format options - @param out_fmt output file format and options -+ @param arg_list command string for PG line -+ @param no_pg if 1, do not add a new PG line -+ @paran write_index create index for the output file - @return 0 for successful sorting, negative on errors - - @discussion It may create multiple temporary subalignment files -@@ -2118,11 +2070,12 @@ - int bam_sort_core_ext(int is_by_qname, char* 
sort_by_tag, const char *fn, const char *prefix, - const char *fnout, const char *modeout, - size_t _max_mem, int n_threads, -- const htsFormat *in_fmt, const htsFormat *out_fmt) -+ const htsFormat *in_fmt, const htsFormat *out_fmt, -+ char *arg_list, int no_pg, int write_index) - { - int ret = -1, res, i, n_files = 0; - size_t max_k, k, max_mem, bam_mem_offset; -- bam_hdr_t *header = NULL; -+ sam_hdr_t *header = NULL; - samFile *fp; - bam1_tag *buf = NULL; - bam1_t *b = bam_init1(); -@@ -2141,7 +2094,8 @@ - g_is_by_qname = is_by_qname; - if (sort_by_tag) { - g_is_by_tag = 1; -- strncpy(g_sort_tag, sort_by_tag, 2); -+ g_sort_tag[0] = sort_by_tag[0]; -+ g_sort_tag[1] = sort_by_tag[0] ? sort_by_tag[1] : '\0'; - } - - max_mem = _max_mem * n_threads; -@@ -2164,14 +2118,15 @@ - else - new_so = "coordinate"; - -- if (sam_hdr_change_HD(header, "SO", new_so) != 0) { -- print_error("sort", -- "failed to change sort order header to '%s'\n", new_so); -+ if ((-1 == sam_hdr_update_hd(header, "SO", new_so)) -+ && (-1 == sam_hdr_add_line(header, "HD", "VN", SAM_FORMAT_VERSION, "SO", new_so, NULL)) -+ ) { -+ print_error("sort", "failed to change sort order header to '%s'\n", new_so); - goto err; - } -- if (sam_hdr_change_HD(header, "GO", NULL) != 0) { -- print_error("sort", -- "failed to delete group order header\n"); -+ -+ if (-1 == sam_hdr_remove_tag_hd(header, "GO")) { -+ print_error("sort", "failed to delete group order header\n"); - goto err; - } - -@@ -2254,7 +2209,7 @@ - - // write the final output - if (n_files == 0 && num_in_mem < 2) { // a single block -- if (write_buffer(fnout, modeout, k, buf, header, n_threads, out_fmt) != 0) { -+ if (write_buffer(fnout, modeout, k, buf, header, n_threads, out_fmt, arg_list, no_pg, write_index) != 0) { - print_error_errno("sort", "failed to create \"%s\"", fnout); - goto err; - } -@@ -2271,7 +2226,8 @@ - } - if (bam_merge_simple(is_by_qname, sort_by_tag, fnout, modeout, header, - n_files, fns, num_in_mem, in_mem, buf, -- n_threads, 
"sort", in_fmt, out_fmt) < 0) { -+ n_threads, "sort", in_fmt, out_fmt, arg_list, -+ no_pg, write_index) < 0) { - // Propagate bam_merge_simple() failure; it has already emitted a - // message explaining the failure, so no further message is needed. - goto err; -@@ -2295,7 +2251,7 @@ - free(buf); - free(bam_mem); - free(in_mem); -- bam_hdr_destroy(header); -+ sam_hdr_destroy(header); - if (fp) sam_close(fp); - return ret; - } -@@ -2307,7 +2263,7 @@ - char *fnout = calloc(strlen(prefix) + 4 + 1, 1); - if (!fnout) return -1; - sprintf(fnout, "%s.bam", prefix); -- ret = bam_sort_core_ext(is_by_qname, NULL, fn, prefix, fnout, "wb", max_mem, 0, NULL, NULL); -+ ret = bam_sort_core_ext(is_by_qname, NULL, fn, prefix, fnout, "wb", max_mem, 0, NULL, NULL, NULL, 1, 0); - free(fnout); - return ret; - } -@@ -2322,8 +2278,9 @@ - " -n Sort by read name\n" - " -t TAG Sort by value of TAG. Uses position as secondary index (or read name if -n is set)\n" - " -o FILE Write final output to FILE rather than standard output\n" --" -T PREFIX Write temporary files to PREFIX.nnnn.bam\n"); -- sam_global_opt_help(fp, "-.O..@"); -+" -T PREFIX Write temporary files to PREFIX.nnnn.bam\n" -+" --no-PG do not add a PG line\n"); -+ sam_global_opt_help(fp, "-.O..@-."); - } - - static void complain_about_memory_setting(size_t max_mem) { -@@ -2346,8 +2303,8 @@ - int bam_sort(int argc, char *argv[]) - { - size_t max_mem = SORT_DEFAULT_MEGS_PER_THREAD << 20; -- int c, nargs, is_by_qname = 0, ret, o_seen = 0, level = -1; -- char* sort_tag = NULL; -+ int c, nargs, is_by_qname = 0, ret, o_seen = 0, level = -1, no_pg = 0; -+ char* sort_tag = NULL, *arg_list = NULL; - char *fnout = "-", modeout[12]; - kstring_t tmpprefix = { 0, 0, NULL }; - struct stat st; -@@ -2356,6 +2313,7 @@ - static const struct option lopts[] = { - SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0, '@'), - { "threads", required_argument, NULL, '@' }, -+ {"no-PG", no_argument, NULL, 1}, - { NULL, 0, NULL, 0 } - }; - -@@ -2363,7 +2321,7 @@ - switch 
(c) { - case 'o': fnout = optarg; o_seen = 1; break; - case 'n': is_by_qname = 1; break; -- case 't': sort_tag = strdup(optarg); break; -+ case 't': sort_tag = optarg; break; - case 'm': { - char *q; - max_mem = strtol(optarg, &q, 0); -@@ -2374,6 +2332,7 @@ - } - case 'T': kputs(optarg, &tmpprefix); break; - case 'l': level = atoi(optarg); break; -+ case 1: no_pg = 1; break; - - default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; - /* else fall-through */ -@@ -2397,6 +2356,16 @@ - goto sort_end; - } - -+ if (ga.write_index && (is_by_qname || sort_tag)) { -+ fprintf(samtools_stderr, "[W::bam_sort] Ignoring --write-index as it only works for position sorted files.\n"); -+ ga.write_index = 0; -+ } -+ -+ if (!no_pg && !(arg_list = stringify_argv(argc+1, argv-1))) { -+ print_error("sort", "failed to create arg_list"); -+ return 1; -+ } -+ - if (max_mem < (SORT_MIN_MEGS_PER_THREAD << 20)) { - complain_about_memory_setting(max_mem); - ret = EXIT_FAILURE; -@@ -2419,7 +2388,7 @@ - - ret = bam_sort_core_ext(is_by_qname, sort_tag, (nargs > 0)? argv[optind] : "-", - tmpprefix.s, fnout, modeout, max_mem, ga.nthreads, -- &ga.in, &ga.out); -+ &ga.in, &ga.out, arg_list, no_pg, ga.write_index); - if (ret >= 0) - ret = EXIT_SUCCESS; - else { -@@ -2434,6 +2403,7 @@ - - sort_end: - free(tmpprefix.s); -+ free(arg_list); - sam_global_args_free(&ga); - - return ret; ---- python-pysam.orig/samtools/bam_split.c -+++ python-pysam/samtools/bam_split.c -@@ -1,6 +1,6 @@ - /* bam_split.c -- split subcommand. - -- Copyright (C) 2013-2016 Genome Research Ltd. -+ Copyright (C) 2013-2016,2018-2019 Genome Research Ltd. 
- - Author: Martin Pollard - -@@ -24,7 +24,6 @@ - - #include - --#include - #include - #include - #include -@@ -32,6 +31,8 @@ - #include - #include - #include -+#include -+#include - #include - #include - #include -@@ -43,11 +44,12 @@ - KHASH_MAP_INIT_STR(c2i, int) - - struct parsed_opts { -- char* merged_input_name; -- char* unaccounted_header_name; -- char* unaccounted_name; -- char* output_format_string; -+ const char *merged_input_name; -+ const char *unaccounted_header_name; -+ const char *unaccounted_name; -+ const char *output_format_string; - bool verbose; -+ int no_pg; - sam_global_args ga; - }; - -@@ -55,16 +57,18 @@ - - struct state { - samFile* merged_input_file; -- bam_hdr_t* merged_input_header; -+ sam_hdr_t* merged_input_header; - samFile* unaccounted_file; -- bam_hdr_t* unaccounted_header; -+ sam_hdr_t* unaccounted_header; - size_t output_count; - char** rg_id; -+ char **rg_index_file_name; - char **rg_output_file_name; - samFile** rg_output_file; -- bam_hdr_t** rg_output_header; -+ sam_hdr_t** rg_output_header; - kh_c2i_t* rg_hash; - htsThreadPool p; -+ int write_index; - }; - - typedef struct state state_t; -@@ -75,14 +79,15 @@ - static void usage(FILE *write_to) - { - fprintf(write_to, --"Usage: samtools split [-u [:]]\n" -+"Usage: samtools split [-u ] [-h ]\n" - " [-f ] [-v] \n" - "Options:\n" - " -f STRING output filename format string [\"%%*_%%#.%%.\"]\n" - " -u FILE1 put reads with no RG tag or an unrecognised RG tag in FILE1\n" --" -u FILE1:FILE2 ...and override the header with FILE2\n" --" -v verbose output\n"); -- sam_global_opt_help(write_to, "-....@"); -+" -h FILE2 ... 
and override the header with FILE2 (-u file only)\n" -+" -v verbose output\n" -+" --no-PG do not add a PG line\n"); -+ sam_global_opt_help(write_to, "-....@.."); - fprintf(write_to, - "\n" - "Format string expansions:\n" -@@ -99,11 +104,11 @@ - { - if (argc == 1) { usage(stdout); return NULL; } - -- const char* optstring = "vf:u:@:"; -- char* delim; -+ const char *optstring = "vf:h:u:@:"; - - static const struct option lopts[] = { - SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0, '@'), -+ {"no-PG", no_argument, NULL, 1}, - { NULL, 0, NULL, 0 } - }; - -@@ -116,20 +121,19 @@ - while ((opt = getopt_long(argc, argv, optstring, lopts, NULL)) != -1) { - switch (opt) { - case 'f': -- retval->output_format_string = strdup(optarg); -- if (! retval->output_format_string ) { perror("cannot allocate output format string memory"); return NULL; } -+ retval->output_format_string = optarg; -+ break; -+ case 'h': -+ retval->unaccounted_header_name = optarg; - break; - case 'v': - retval->verbose = true; - break; - case 'u': -- retval->unaccounted_name = strdup(optarg); -- if (! retval->unaccounted_name ) { perror("cannot allocate string memory"); return NULL; } -- if ((delim = strchr(retval->unaccounted_name, ':')) != NULL) { -- *delim = '\0'; -- retval->unaccounted_header_name = strdup(delim+1); -- if (! retval->unaccounted_header_name ) { perror("cannot allocate string memory"); return NULL; } -- } -+ retval->unaccounted_name = optarg; -+ break; -+ case 1: -+ retval->no_pg = 1; - break; - default: - if (parse_sam_global_opt(opt, optarg, lopts, &retval->ga) == 0) break; -@@ -141,7 +145,7 @@ - } - } - -- if (retval->output_format_string == NULL) retval->output_format_string = strdup("%*_%#.%."); -+ if (retval->output_format_string == NULL) retval->output_format_string = "%*_%#.%."; - - argc -= optind; - argv += optind; -@@ -153,8 +157,7 @@ - return NULL; - } - -- retval->merged_input_name = strdup(argv[0]); -- if (! 
retval->merged_input_name ) { perror("cannot allocate string memory"); return NULL; } -+ retval->merged_input_name = argv[0]; - - return retval; - } -@@ -166,176 +169,110 @@ - const char* pointer = format_string; - const char* next; - while ((next = strchr(pointer, '%')) != NULL) { -- kputsn(pointer, next-pointer, &str); -+ if (kputsn(pointer, next-pointer, &str) < 0) goto memfail; - ++next; - switch (*next) { - case '%': -- kputc('%', &str); -+ if (kputc('%', &str) < 0) goto memfail; - break; - case '*': -- kputs(basename, &str); -+ if (kputs(basename, &str) < 0) goto memfail; - break; - case '#': -- kputl(rg_idx, &str); -+ if (kputl(rg_idx, &str) < 0) goto memfail; - break; - case '!': -- kputs(rg_id, &str); -+ if (kputs(rg_id, &str) < 0) goto memfail; - break; - case '.': - // Only really need to cope with sam, bam, cram -- if (format->format != unknown_format) -- kputs(hts_format_file_extension(format), &str); -- else -- kputs("bam", &str); -+ if (format->format != unknown_format) { -+ if (kputs(hts_format_file_extension(format), &str) < 0) -+ goto memfail; -+ } else { -+ if (kputs("bam", &str) < 0) goto memfail; -+ } - break; - case '\0': -- // Error is: fprintf(stderr, "bad format string, trailing %%\n"); -- free(str.s); -- return NULL; -+ print_error("split", "Trailing %% in filename format string"); -+ goto fail; - default: - // Error is: fprintf(stderr, "bad format string, unknown format specifier\n"); -- free(str.s); -- return NULL; -+ print_error("split", "Unknown specifier %%%c in filename format string", *next); -+ goto fail; - } - pointer = next + 1; - } -- kputs(pointer, &str); -+ if (kputs(pointer, &str) < 0) goto memfail; - return ks_release(&str); -+ -+ memfail: -+ print_error_errno("split", "Couldn't build output filename"); -+ fail: -+ free(str.s); -+ return NULL; - } - - // Parse the header, count the number of RG tags and return a list of their names --static bool count_RG(bam_hdr_t* hdr, size_t* count, char*** output_name) -+static bool 
count_RG(sam_hdr_t* hdr, size_t* count, char*** output_name) - { -- if (hdr->l_text < 3 ) { -+ char **names = NULL; -+ kstring_t id_val = KS_INITIALIZE; -+ int i, n_rg = sam_hdr_count_lines(hdr, "RG"); -+ -+ if (n_rg < 0) { -+ print_error("split", "Failed to get @RG IDs"); - *count = 0; - *output_name = NULL; -- return true; -+ return false; - } -- kstring_t input = { 0, 0, NULL }; -- kputsn(hdr->text, hdr->l_text, &input); - -- ////////////////////////////////////////// -- // First stage count number of @RG tags // -- ////////////////////////////////////////// -- char* pointer = ks_str(&input); -- size_t n_rg = 0; -- // Guard against rare case where @RG is first header line -- // This shouldn't happen but could where @HD is omitted -- if (pointer[0] == '@' && pointer[1] == 'R' && pointer[2] == 'G' ) { -- ++n_rg; -- pointer += 3; -- } -- char* line; -- while ((line = strstr(pointer, "\n@RG")) != NULL) { -- ++n_rg; -- pointer = line + 1; -- } -- -- ////////////////////////////////// -- // Second stage locate @RG ID's // -- ////////////////////////////////// -- char** names = (char**)calloc(sizeof(char*), n_rg); -- size_t next = 0; -- -- regex_t rg_finder; -- if (regcomp(&rg_finder, "^@RG.*\tID:([!-)+-<>-~][ !-~]*)(\t.*$|$)", REG_EXTENDED|REG_NEWLINE) != 0) { -- free(input.s); -- free(names); -- return false; -+ if (n_rg == 0) { -+ *count = 0; -+ *output_name = NULL; -+ return true; - } -- regmatch_t* matches = (regmatch_t*)calloc(sizeof(regmatch_t),2); -- int error; -- char* begin = ks_str(&input); -- -- while ((error = regexec(&rg_finder, begin, 2, matches, 0)) == 0) { -- kstring_t str = { 0, 0, NULL }; -- kputsn(begin+matches[1].rm_so, matches[1].rm_eo-matches[1].rm_so, &str); -- names[next++] = ks_release(&str); -- begin += matches[0].rm_eo; -- } -- -- if (error != REG_NOMATCH) { -- // cleanup -- regfree(&rg_finder); -- free(matches); -- free(names); -- free(input.s); -- return false; -+ -+ names = calloc(n_rg, sizeof(names[0])); -+ if (!names) goto memfail; -+ 
-+ for (i = 0; i < n_rg; i++) { -+ if (sam_hdr_find_tag_pos(hdr, "RG", i, "ID", &id_val) < 0) goto memfail; -+ names[i] = ks_release(&id_val); - } -- free(matches); - -- // return results - *count = n_rg; - *output_name = names; -- regfree(&rg_finder); -- free(input.s); - return true; -+ -+ memfail: -+ print_error_errno("split", "Failed to get @RG IDs"); -+ *count = 0; -+ *output_name = NULL; -+ ks_free(&id_val); -+ free(names); -+ return false; - } - --// Filters a header of @RG lines where ID != id_keep --// TODO: strip @PG's descended from other RGs and their descendants --static bool filter_header_rg(bam_hdr_t* hdr, const char* id_keep, const char *arg_list) -+static int header_compatible(sam_hdr_t *hdr1, sam_hdr_t *hdr2) - { -- kstring_t str = {0, 0, NULL}; -- -- regex_t rg_finder; -- -- if (regcomp(&rg_finder, "^@RG.*\tID:([!-)+-<>-~][ !-~]*)(\t.*$|$)", REG_EXTENDED|REG_NEWLINE) != 0) { -- return false; -+ size_t n; -+ if (sam_hdr_nref(hdr1) != sam_hdr_nref(hdr2)) { -+ print_error("split", -+ "Unaccounted header contains wrong number of references"); -+ return -1; - } -- -- // regex vars -- char* header = hdr->text; -- regmatch_t* matches = (regmatch_t*)calloc(sizeof(regmatch_t),2); -- kstring_t found_id = { 0, 0, NULL }; -- int error; -- -- while ((error = regexec(&rg_finder, header, 2, matches, 0)) == 0) { -- kputsn(header, matches[0].rm_so, &str); // copy header up until the found RG line -- -- found_id.l = 0; -- kputsn(header+matches[1].rm_so, matches[1].rm_eo-matches[1].rm_so, &found_id); // extract ID -- // if it matches keep keep it, else we can just ignore it -- if (strcmp(ks_str(&found_id), id_keep) == 0) { -- kputsn(header+matches[0].rm_so, (matches[0].rm_eo+1)-matches[0].rm_so, &str); -- } -- // move pointer forward -- header += matches[0].rm_eo+1; -- } -- // cleanup -- free(found_id.s); -- free(matches); -- regfree(&rg_finder); -- // Did we leave loop because of an error? 
-- if (error != REG_NOMATCH) { -- return false; -+ for (n = 0; n < sam_hdr_nref(hdr1); n++) { -+ hts_pos_t h1_len = sam_hdr_tid2len(hdr1, n); -+ hts_pos_t h2_len = sam_hdr_tid2len(hdr2, n); -+ if (h1_len != h2_len) { -+ print_error("split", -+ "Unaccounted header reference %zu \"%s\" is not the same length as in the input file", -+ n + 1, sam_hdr_tid2name(hdr2, n)); -+ return -1; -+ } - } -- -- // Write remainder of string -- kputs(header, &str); -- -- // Modify header -- hdr->l_text = ks_len(&str); -- free(hdr->text); -- hdr->text = ks_release(&str); -- -- // Add the PG line -- SAM_hdr *sh = sam_hdr_parse_(hdr->text, hdr->l_text); -- if (sam_hdr_add_PG(sh, "samtools", -- "VN", samtools_version(), -- arg_list ? "CL": NULL, -- arg_list ? arg_list : NULL, -- NULL) != 0) -- return -1; -- -- free(hdr->text); -- hdr->text = strdup(sam_hdr_str(sh)); -- hdr->l_text = sam_hdr_length(sh); -- if (!hdr->text) -- return false; -- sam_hdr_free(sh); -- -- return true; -+ return 0; - } - - // Set the initial state -@@ -350,6 +287,7 @@ - if (opts->ga.nthreads > 0) { - if (!(retval->p.pool = hts_tpool_init(opts->ga.nthreads))) { - fprintf(stderr, "Error creating thread pool\n"); -+ cleanup_state(retval, false); - return NULL; - } - } -@@ -357,7 +295,7 @@ - retval->merged_input_file = sam_open_format(opts->merged_input_name, "rb", &opts->ga.in); - if (!retval->merged_input_file) { - print_error_errno("split", "Could not open \"%s\"", opts->merged_input_name); -- free(retval); -+ cleanup_state(retval, false); - return NULL; - } - if (retval->p.pool) -@@ -381,11 +319,26 @@ - if (retval->unaccounted_header == NULL) { - print_error("split", "Could not read header from \"%s\"", opts->unaccounted_header_name); - cleanup_state(retval, false); -+ sam_close(hdr_load); - return NULL; - } - sam_close(hdr_load); -+ if (header_compatible(retval->merged_input_header, -+ retval->unaccounted_header) != 0) { -+ cleanup_state(retval, false); -+ return NULL; -+ } - } else { -- 
retval->unaccounted_header = bam_hdr_dup(retval->merged_input_header); -+ retval->unaccounted_header = sam_hdr_dup(retval->merged_input_header); -+ if (!opts->no_pg && sam_hdr_add_pg(retval->unaccounted_header, "samtools", -+ "VN", samtools_version(), -+ arg_list ? "CL": NULL, -+ arg_list ? arg_list : NULL, -+ NULL)) { -+ print_error("split", "Could not rewrite header for \"%s\"", opts->unaccounted_name); -+ cleanup_state(retval, false); -+ return NULL; -+ } - } - - retval->unaccounted_file = sam_open_format(opts->unaccounted_name, "wb", &opts->ga.out); -@@ -401,12 +354,15 @@ - // Open output files for RGs - if (!count_RG(retval->merged_input_header, &retval->output_count, &retval->rg_id)) return NULL; - if (opts->verbose) fprintf(stderr, "@RG's found %zu\n",retval->output_count); -- -- retval->rg_output_file_name = (char **)calloc(retval->output_count, sizeof(char *)); -- retval->rg_output_file = (samFile**)calloc(retval->output_count, sizeof(samFile*)); -- retval->rg_output_header = (bam_hdr_t**)calloc(retval->output_count, sizeof(bam_hdr_t*)); -+ // Prevent calloc(0, size); -+ size_t num = retval->output_count ? 
retval->output_count : 1; -+ retval->rg_index_file_name = (char **)calloc(num, sizeof(char *)); -+ retval->rg_output_file_name = (char **)calloc(num, sizeof(char *)); -+ retval->rg_output_file = (samFile**)calloc(num, sizeof(samFile*)); -+ retval->rg_output_header = (sam_hdr_t**)calloc(num, sizeof(sam_hdr_t*)); - retval->rg_hash = kh_init_c2i(); -- if (!retval->rg_output_file_name || !retval->rg_output_file || !retval->rg_output_header || !retval->rg_hash) { -+ if (!retval->rg_output_file_name || !retval->rg_output_file || !retval->rg_output_header || -+ !retval->rg_hash || !retval->rg_index_file_name) { - print_error_errno("split", "Could not initialise output file array"); - cleanup_state(retval, false); - return NULL; -@@ -432,7 +388,6 @@ - &opts->ga.out); - - if ( output_filename == NULL ) { -- print_error("split", "Error expanding output filename format string"); - cleanup_state(retval, false); - free(input_base_name); - return NULL; -@@ -452,11 +407,23 @@ - // Record index in hash - int ret; - khiter_t iter = kh_put_c2i(retval->rg_hash, retval->rg_id[i], &ret); -+ if (ret < 0) { -+ print_error_errno("split", "Couldn't add @RG ID to look-up table"); -+ cleanup_state(retval, false); -+ free(input_base_name); -+ return NULL; -+ } - kh_val(retval->rg_hash,iter) = i; - - // Set and edit header -- retval->rg_output_header[i] = bam_hdr_dup(retval->merged_input_header); -- if ( !filter_header_rg(retval->rg_output_header[i], retval->rg_id[i], arg_list) ) { -+ retval->rg_output_header[i] = sam_hdr_dup(retval->merged_input_header); -+ if (sam_hdr_remove_except(retval->rg_output_header[i], "RG", "ID", retval->rg_id[i]) || -+ (!opts->no_pg && -+ sam_hdr_add_pg(retval->rg_output_header[i], "samtools", -+ "VN", samtools_version(), -+ arg_list ? "CL": NULL, -+ arg_list ? 
arg_list : NULL, -+ NULL))) { - print_error("split", "Could not rewrite header for \"%s\"", output_filename); - cleanup_state(retval, false); - free(input_base_name); -@@ -465,6 +432,7 @@ - } - - free(input_base_name); -+ retval->write_index = opts->ga.write_index; - - return retval; - } -@@ -481,6 +449,15 @@ - print_error_errno("split", "Could not write file header to \"%s\"", state->rg_output_file_name[i]); - return false; - } -+ if (state->write_index) { -+ state->rg_index_file_name[i] = auto_index(state->rg_output_file[i], -+ state->rg_output_file_name[i], -+ state->rg_output_header[i]); -+ if (!state->rg_index_file_name[i]) { -+ print_error_errno("split", "Could not create index for file \"%s\"", state->rg_output_file_name[i]); -+ return false; -+ } -+ } - } - - bam1_t* file_read = bam_init1(); -@@ -547,6 +524,16 @@ - } - } - -+ if (state->write_index) { -+ for (i = 0; i < state->output_count; i++) { -+ if (sam_idx_save(state->rg_output_file[i]) < 0) { -+ print_error_errno("split", "writing index failed"); -+ return false; -+ } -+ free(state->rg_index_file_name[i]); -+ } -+ } -+ - return true; - } - -@@ -555,7 +542,7 @@ - int ret = 0; - - if (!status) return 0; -- if (status->unaccounted_header) bam_hdr_destroy(status->unaccounted_header); -+ if (status->unaccounted_header) sam_hdr_destroy(status->unaccounted_header); - if (status->unaccounted_file) { - if (sam_close(status->unaccounted_file) < 0 && check_close) { - print_error("split", "Error on closing unaccounted file"); -@@ -566,7 +553,7 @@ - size_t i; - for (i = 0; i < status->output_count; i++) { - if (status->rg_output_header && status->rg_output_header[i]) -- bam_hdr_destroy(status->rg_output_header[i]); -+ sam_hdr_destroy(status->rg_output_header[i]); - if (status->rg_output_file && status->rg_output_file[i]) { - if (sam_close(status->rg_output_file[i]) < 0 && check_close) { - print_error("split", "Error on closing output file \"%s\"", status->rg_output_file_name[i]); -@@ -577,16 +564,16 @@ - if 
(status->rg_output_file_name) free(status->rg_output_file_name[i]); - } - if (status->merged_input_header) -- bam_hdr_destroy(status->merged_input_header); -+ sam_hdr_destroy(status->merged_input_header); - free(status->rg_output_header); - free(status->rg_output_file); - free(status->rg_output_file_name); -+ free(status->rg_index_file_name); - kh_destroy_c2i(status->rg_hash); - free(status->rg_id); -- free(status); -- - if (status->p.pool) - hts_tpool_destroy(status->p.pool); -+ free(status); - - return ret; - } -@@ -594,10 +581,6 @@ - static void cleanup_opts(parsed_opts_t* opts) - { - if (!opts) return; -- free(opts->merged_input_name); -- free(opts->unaccounted_header_name); -- free(opts->unaccounted_name); -- free(opts->output_format_string); - sam_global_args_free(&opts->ga); - free(opts); - } -@@ -605,9 +588,11 @@ - int main_split(int argc, char** argv) - { - int ret = 1; -- char *arg_list = stringify_argv(argc+1, argv-1); -+ char *arg_list = NULL; - parsed_opts_t* opts = parse_args(argc, argv); - if (!opts) goto cleanup_opts; -+ if (!opts->no_pg && !(arg_list = stringify_argv(argc+1, argv-1))) -+ goto cleanup_opts; - state_t* status = init(opts, arg_list); - if (!status) goto cleanup_opts; - ---- python-pysam.orig/samtools/bam_split.c.pysam.c -+++ python-pysam/samtools/bam_split.c.pysam.c -@@ -2,7 +2,7 @@ - - /* bam_split.c -- split subcommand. - -- Copyright (C) 2013-2016 Genome Research Ltd. -+ Copyright (C) 2013-2016,2018-2019 Genome Research Ltd. 
- - Author: Martin Pollard - -@@ -26,7 +26,6 @@ - - #include - --#include - #include - #include - #include -@@ -34,6 +33,8 @@ - #include - #include - #include -+#include -+#include - #include - #include - #include -@@ -45,11 +46,12 @@ - KHASH_MAP_INIT_STR(c2i, int) - - struct parsed_opts { -- char* merged_input_name; -- char* unaccounted_header_name; -- char* unaccounted_name; -- char* output_format_string; -+ const char *merged_input_name; -+ const char *unaccounted_header_name; -+ const char *unaccounted_name; -+ const char *output_format_string; - bool verbose; -+ int no_pg; - sam_global_args ga; - }; - -@@ -57,16 +59,18 @@ - - struct state { - samFile* merged_input_file; -- bam_hdr_t* merged_input_header; -+ sam_hdr_t* merged_input_header; - samFile* unaccounted_file; -- bam_hdr_t* unaccounted_header; -+ sam_hdr_t* unaccounted_header; - size_t output_count; - char** rg_id; -+ char **rg_index_file_name; - char **rg_output_file_name; - samFile** rg_output_file; -- bam_hdr_t** rg_output_header; -+ sam_hdr_t** rg_output_header; - kh_c2i_t* rg_hash; - htsThreadPool p; -+ int write_index; - }; - - typedef struct state state_t; -@@ -77,14 +81,15 @@ - static void usage(FILE *write_to) - { - fprintf(write_to, --"Usage: samtools split [-u [:]]\n" -+"Usage: samtools split [-u ] [-h ]\n" - " [-f ] [-v] \n" - "Options:\n" - " -f STRING output filename format string [\"%%*_%%#.%%.\"]\n" - " -u FILE1 put reads with no RG tag or an unrecognised RG tag in FILE1\n" --" -u FILE1:FILE2 ...and override the header with FILE2\n" --" -v verbose output\n"); -- sam_global_opt_help(write_to, "-....@"); -+" -h FILE2 ... 
and override the header with FILE2 (-u file only)\n" -+" -v verbose output\n" -+" --no-PG do not add a PG line\n"); -+ sam_global_opt_help(write_to, "-....@.."); - fprintf(write_to, - "\n" - "Format string expansions:\n" -@@ -101,11 +106,11 @@ - { - if (argc == 1) { usage(samtools_stdout); return NULL; } - -- const char* optstring = "vf:u:@:"; -- char* delim; -+ const char *optstring = "vf:h:u:@:"; - - static const struct option lopts[] = { - SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0, '@'), -+ {"no-PG", no_argument, NULL, 1}, - { NULL, 0, NULL, 0 } - }; - -@@ -118,20 +123,19 @@ - while ((opt = getopt_long(argc, argv, optstring, lopts, NULL)) != -1) { - switch (opt) { - case 'f': -- retval->output_format_string = strdup(optarg); -- if (! retval->output_format_string ) { perror("cannot allocate output format string memory"); return NULL; } -+ retval->output_format_string = optarg; -+ break; -+ case 'h': -+ retval->unaccounted_header_name = optarg; - break; - case 'v': - retval->verbose = true; - break; - case 'u': -- retval->unaccounted_name = strdup(optarg); -- if (! retval->unaccounted_name ) { perror("cannot allocate string memory"); return NULL; } -- if ((delim = strchr(retval->unaccounted_name, ':')) != NULL) { -- *delim = '\0'; -- retval->unaccounted_header_name = strdup(delim+1); -- if (! retval->unaccounted_header_name ) { perror("cannot allocate string memory"); return NULL; } -- } -+ retval->unaccounted_name = optarg; -+ break; -+ case 1: -+ retval->no_pg = 1; - break; - default: - if (parse_sam_global_opt(opt, optarg, lopts, &retval->ga) == 0) break; -@@ -143,7 +147,7 @@ - } - } - -- if (retval->output_format_string == NULL) retval->output_format_string = strdup("%*_%#.%."); -+ if (retval->output_format_string == NULL) retval->output_format_string = "%*_%#.%."; - - argc -= optind; - argv += optind; -@@ -155,8 +159,7 @@ - return NULL; - } - -- retval->merged_input_name = strdup(argv[0]); -- if (! 
retval->merged_input_name ) { perror("cannot allocate string memory"); return NULL; } -+ retval->merged_input_name = argv[0]; - - return retval; - } -@@ -168,176 +171,110 @@ - const char* pointer = format_string; - const char* next; - while ((next = strchr(pointer, '%')) != NULL) { -- kputsn(pointer, next-pointer, &str); -+ if (kputsn(pointer, next-pointer, &str) < 0) goto memfail; - ++next; - switch (*next) { - case '%': -- kputc('%', &str); -+ if (kputc('%', &str) < 0) goto memfail; - break; - case '*': -- kputs(basename, &str); -+ if (kputs(basename, &str) < 0) goto memfail; - break; - case '#': -- kputl(rg_idx, &str); -+ if (kputl(rg_idx, &str) < 0) goto memfail; - break; - case '!': -- kputs(rg_id, &str); -+ if (kputs(rg_id, &str) < 0) goto memfail; - break; - case '.': - // Only really need to cope with sam, bam, cram -- if (format->format != unknown_format) -- kputs(hts_format_file_extension(format), &str); -- else -- kputs("bam", &str); -+ if (format->format != unknown_format) { -+ if (kputs(hts_format_file_extension(format), &str) < 0) -+ goto memfail; -+ } else { -+ if (kputs("bam", &str) < 0) goto memfail; -+ } - break; - case '\0': -- // Error is: fprintf(samtools_stderr, "bad format string, trailing %%\n"); -- free(str.s); -- return NULL; -+ print_error("split", "Trailing %% in filename format string"); -+ goto fail; - default: - // Error is: fprintf(samtools_stderr, "bad format string, unknown format specifier\n"); -- free(str.s); -- return NULL; -+ print_error("split", "Unknown specifier %%%c in filename format string", *next); -+ goto fail; - } - pointer = next + 1; - } -- kputs(pointer, &str); -+ if (kputs(pointer, &str) < 0) goto memfail; - return ks_release(&str); -+ -+ memfail: -+ print_error_errno("split", "Couldn't build output filename"); -+ fail: -+ free(str.s); -+ return NULL; - } - - // Parse the header, count the number of RG tags and return a list of their names --static bool count_RG(bam_hdr_t* hdr, size_t* count, char*** output_name) 
-+static bool count_RG(sam_hdr_t* hdr, size_t* count, char*** output_name) - { -- if (hdr->l_text < 3 ) { -+ char **names = NULL; -+ kstring_t id_val = KS_INITIALIZE; -+ int i, n_rg = sam_hdr_count_lines(hdr, "RG"); -+ -+ if (n_rg < 0) { -+ print_error("split", "Failed to get @RG IDs"); - *count = 0; - *output_name = NULL; -- return true; -+ return false; - } -- kstring_t input = { 0, 0, NULL }; -- kputsn(hdr->text, hdr->l_text, &input); - -- ////////////////////////////////////////// -- // First stage count number of @RG tags // -- ////////////////////////////////////////// -- char* pointer = ks_str(&input); -- size_t n_rg = 0; -- // Guard against rare case where @RG is first header line -- // This shouldn't happen but could where @HD is omitted -- if (pointer[0] == '@' && pointer[1] == 'R' && pointer[2] == 'G' ) { -- ++n_rg; -- pointer += 3; -- } -- char* line; -- while ((line = strstr(pointer, "\n@RG")) != NULL) { -- ++n_rg; -- pointer = line + 1; -- } -- -- ////////////////////////////////// -- // Second stage locate @RG ID's // -- ////////////////////////////////// -- char** names = (char**)calloc(sizeof(char*), n_rg); -- size_t next = 0; -- -- regex_t rg_finder; -- if (regcomp(&rg_finder, "^@RG.*\tID:([!-)+-<>-~][ !-~]*)(\t.*$|$)", REG_EXTENDED|REG_NEWLINE) != 0) { -- free(input.s); -- free(names); -- return false; -+ if (n_rg == 0) { -+ *count = 0; -+ *output_name = NULL; -+ return true; - } -- regmatch_t* matches = (regmatch_t*)calloc(sizeof(regmatch_t),2); -- int error; -- char* begin = ks_str(&input); -- -- while ((error = regexec(&rg_finder, begin, 2, matches, 0)) == 0) { -- kstring_t str = { 0, 0, NULL }; -- kputsn(begin+matches[1].rm_so, matches[1].rm_eo-matches[1].rm_so, &str); -- names[next++] = ks_release(&str); -- begin += matches[0].rm_eo; -- } -- -- if (error != REG_NOMATCH) { -- // cleanup -- regfree(&rg_finder); -- free(matches); -- free(names); -- free(input.s); -- return false; -+ -+ names = calloc(n_rg, sizeof(names[0])); -+ if (!names) goto 
memfail; -+ -+ for (i = 0; i < n_rg; i++) { -+ if (sam_hdr_find_tag_pos(hdr, "RG", i, "ID", &id_val) < 0) goto memfail; -+ names[i] = ks_release(&id_val); - } -- free(matches); - -- // return results - *count = n_rg; - *output_name = names; -- regfree(&rg_finder); -- free(input.s); - return true; -+ -+ memfail: -+ print_error_errno("split", "Failed to get @RG IDs"); -+ *count = 0; -+ *output_name = NULL; -+ ks_free(&id_val); -+ free(names); -+ return false; - } - --// Filters a header of @RG lines where ID != id_keep --// TODO: strip @PG's descended from other RGs and their descendants --static bool filter_header_rg(bam_hdr_t* hdr, const char* id_keep, const char *arg_list) -+static int header_compatible(sam_hdr_t *hdr1, sam_hdr_t *hdr2) - { -- kstring_t str = {0, 0, NULL}; -- -- regex_t rg_finder; -- -- if (regcomp(&rg_finder, "^@RG.*\tID:([!-)+-<>-~][ !-~]*)(\t.*$|$)", REG_EXTENDED|REG_NEWLINE) != 0) { -- return false; -+ size_t n; -+ if (sam_hdr_nref(hdr1) != sam_hdr_nref(hdr2)) { -+ print_error("split", -+ "Unaccounted header contains wrong number of references"); -+ return -1; - } -- -- // regex vars -- char* header = hdr->text; -- regmatch_t* matches = (regmatch_t*)calloc(sizeof(regmatch_t),2); -- kstring_t found_id = { 0, 0, NULL }; -- int error; -- -- while ((error = regexec(&rg_finder, header, 2, matches, 0)) == 0) { -- kputsn(header, matches[0].rm_so, &str); // copy header up until the found RG line -- -- found_id.l = 0; -- kputsn(header+matches[1].rm_so, matches[1].rm_eo-matches[1].rm_so, &found_id); // extract ID -- // if it matches keep keep it, else we can just ignore it -- if (strcmp(ks_str(&found_id), id_keep) == 0) { -- kputsn(header+matches[0].rm_so, (matches[0].rm_eo+1)-matches[0].rm_so, &str); -- } -- // move pointer forward -- header += matches[0].rm_eo+1; -- } -- // cleanup -- free(found_id.s); -- free(matches); -- regfree(&rg_finder); -- // Did we leave loop because of an error? 
-- if (error != REG_NOMATCH) { -- return false; -+ for (n = 0; n < sam_hdr_nref(hdr1); n++) { -+ hts_pos_t h1_len = sam_hdr_tid2len(hdr1, n); -+ hts_pos_t h2_len = sam_hdr_tid2len(hdr2, n); -+ if (h1_len != h2_len) { -+ print_error("split", -+ "Unaccounted header reference %zu \"%s\" is not the same length as in the input file", -+ n + 1, sam_hdr_tid2name(hdr2, n)); -+ return -1; -+ } - } -- -- // Write remainder of string -- kputs(header, &str); -- -- // Modify header -- hdr->l_text = ks_len(&str); -- free(hdr->text); -- hdr->text = ks_release(&str); -- -- // Add the PG line -- SAM_hdr *sh = sam_hdr_parse_(hdr->text, hdr->l_text); -- if (sam_hdr_add_PG(sh, "samtools", -- "VN", samtools_version(), -- arg_list ? "CL": NULL, -- arg_list ? arg_list : NULL, -- NULL) != 0) -- return -1; -- -- free(hdr->text); -- hdr->text = strdup(sam_hdr_str(sh)); -- hdr->l_text = sam_hdr_length(sh); -- if (!hdr->text) -- return false; -- sam_hdr_free(sh); -- -- return true; -+ return 0; - } - - // Set the initial state -@@ -352,6 +289,7 @@ - if (opts->ga.nthreads > 0) { - if (!(retval->p.pool = hts_tpool_init(opts->ga.nthreads))) { - fprintf(samtools_stderr, "Error creating thread pool\n"); -+ cleanup_state(retval, false); - return NULL; - } - } -@@ -359,7 +297,7 @@ - retval->merged_input_file = sam_open_format(opts->merged_input_name, "rb", &opts->ga.in); - if (!retval->merged_input_file) { - print_error_errno("split", "Could not open \"%s\"", opts->merged_input_name); -- free(retval); -+ cleanup_state(retval, false); - return NULL; - } - if (retval->p.pool) -@@ -383,11 +321,26 @@ - if (retval->unaccounted_header == NULL) { - print_error("split", "Could not read header from \"%s\"", opts->unaccounted_header_name); - cleanup_state(retval, false); -+ sam_close(hdr_load); - return NULL; - } - sam_close(hdr_load); -+ if (header_compatible(retval->merged_input_header, -+ retval->unaccounted_header) != 0) { -+ cleanup_state(retval, false); -+ return NULL; -+ } - } else { -- 
retval->unaccounted_header = bam_hdr_dup(retval->merged_input_header); -+ retval->unaccounted_header = sam_hdr_dup(retval->merged_input_header); -+ if (!opts->no_pg && sam_hdr_add_pg(retval->unaccounted_header, "samtools", -+ "VN", samtools_version(), -+ arg_list ? "CL": NULL, -+ arg_list ? arg_list : NULL, -+ NULL)) { -+ print_error("split", "Could not rewrite header for \"%s\"", opts->unaccounted_name); -+ cleanup_state(retval, false); -+ return NULL; -+ } - } - - retval->unaccounted_file = sam_open_format(opts->unaccounted_name, "wb", &opts->ga.out); -@@ -403,12 +356,15 @@ - // Open output files for RGs - if (!count_RG(retval->merged_input_header, &retval->output_count, &retval->rg_id)) return NULL; - if (opts->verbose) fprintf(samtools_stderr, "@RG's found %zu\n",retval->output_count); -- -- retval->rg_output_file_name = (char **)calloc(retval->output_count, sizeof(char *)); -- retval->rg_output_file = (samFile**)calloc(retval->output_count, sizeof(samFile*)); -- retval->rg_output_header = (bam_hdr_t**)calloc(retval->output_count, sizeof(bam_hdr_t*)); -+ // Prevent calloc(0, size); -+ size_t num = retval->output_count ? 
retval->output_count : 1; -+ retval->rg_index_file_name = (char **)calloc(num, sizeof(char *)); -+ retval->rg_output_file_name = (char **)calloc(num, sizeof(char *)); -+ retval->rg_output_file = (samFile**)calloc(num, sizeof(samFile*)); -+ retval->rg_output_header = (sam_hdr_t**)calloc(num, sizeof(sam_hdr_t*)); - retval->rg_hash = kh_init_c2i(); -- if (!retval->rg_output_file_name || !retval->rg_output_file || !retval->rg_output_header || !retval->rg_hash) { -+ if (!retval->rg_output_file_name || !retval->rg_output_file || !retval->rg_output_header || -+ !retval->rg_hash || !retval->rg_index_file_name) { - print_error_errno("split", "Could not initialise output file array"); - cleanup_state(retval, false); - return NULL; -@@ -434,7 +390,6 @@ - &opts->ga.out); - - if ( output_filename == NULL ) { -- print_error("split", "Error expanding output filename format string"); - cleanup_state(retval, false); - free(input_base_name); - return NULL; -@@ -454,11 +409,23 @@ - // Record index in hash - int ret; - khiter_t iter = kh_put_c2i(retval->rg_hash, retval->rg_id[i], &ret); -+ if (ret < 0) { -+ print_error_errno("split", "Couldn't add @RG ID to look-up table"); -+ cleanup_state(retval, false); -+ free(input_base_name); -+ return NULL; -+ } - kh_val(retval->rg_hash,iter) = i; - - // Set and edit header -- retval->rg_output_header[i] = bam_hdr_dup(retval->merged_input_header); -- if ( !filter_header_rg(retval->rg_output_header[i], retval->rg_id[i], arg_list) ) { -+ retval->rg_output_header[i] = sam_hdr_dup(retval->merged_input_header); -+ if (sam_hdr_remove_except(retval->rg_output_header[i], "RG", "ID", retval->rg_id[i]) || -+ (!opts->no_pg && -+ sam_hdr_add_pg(retval->rg_output_header[i], "samtools", -+ "VN", samtools_version(), -+ arg_list ? "CL": NULL, -+ arg_list ? 
arg_list : NULL, -+ NULL))) { - print_error("split", "Could not rewrite header for \"%s\"", output_filename); - cleanup_state(retval, false); - free(input_base_name); -@@ -467,6 +434,7 @@ - } - - free(input_base_name); -+ retval->write_index = opts->ga.write_index; - - return retval; - } -@@ -483,6 +451,15 @@ - print_error_errno("split", "Could not write file header to \"%s\"", state->rg_output_file_name[i]); - return false; - } -+ if (state->write_index) { -+ state->rg_index_file_name[i] = auto_index(state->rg_output_file[i], -+ state->rg_output_file_name[i], -+ state->rg_output_header[i]); -+ if (!state->rg_index_file_name[i]) { -+ print_error_errno("split", "Could not create index for file \"%s\"", state->rg_output_file_name[i]); -+ return false; -+ } -+ } - } - - bam1_t* file_read = bam_init1(); -@@ -549,6 +526,16 @@ - } - } - -+ if (state->write_index) { -+ for (i = 0; i < state->output_count; i++) { -+ if (sam_idx_save(state->rg_output_file[i]) < 0) { -+ print_error_errno("split", "writing index failed"); -+ return false; -+ } -+ free(state->rg_index_file_name[i]); -+ } -+ } -+ - return true; - } - -@@ -557,7 +544,7 @@ - int ret = 0; - - if (!status) return 0; -- if (status->unaccounted_header) bam_hdr_destroy(status->unaccounted_header); -+ if (status->unaccounted_header) sam_hdr_destroy(status->unaccounted_header); - if (status->unaccounted_file) { - if (sam_close(status->unaccounted_file) < 0 && check_close) { - print_error("split", "Error on closing unaccounted file"); -@@ -568,7 +555,7 @@ - size_t i; - for (i = 0; i < status->output_count; i++) { - if (status->rg_output_header && status->rg_output_header[i]) -- bam_hdr_destroy(status->rg_output_header[i]); -+ sam_hdr_destroy(status->rg_output_header[i]); - if (status->rg_output_file && status->rg_output_file[i]) { - if (sam_close(status->rg_output_file[i]) < 0 && check_close) { - print_error("split", "Error on closing output file \"%s\"", status->rg_output_file_name[i]); -@@ -579,16 +566,16 @@ - if 
(status->rg_output_file_name) free(status->rg_output_file_name[i]); - } - if (status->merged_input_header) -- bam_hdr_destroy(status->merged_input_header); -+ sam_hdr_destroy(status->merged_input_header); - free(status->rg_output_header); - free(status->rg_output_file); - free(status->rg_output_file_name); -+ free(status->rg_index_file_name); - kh_destroy_c2i(status->rg_hash); - free(status->rg_id); -- free(status); -- - if (status->p.pool) - hts_tpool_destroy(status->p.pool); -+ free(status); - - return ret; - } -@@ -596,10 +583,6 @@ - static void cleanup_opts(parsed_opts_t* opts) - { - if (!opts) return; -- free(opts->merged_input_name); -- free(opts->unaccounted_header_name); -- free(opts->unaccounted_name); -- free(opts->output_format_string); - sam_global_args_free(&opts->ga); - free(opts); - } -@@ -607,9 +590,11 @@ - int main_split(int argc, char** argv) - { - int ret = 1; -- char *arg_list = stringify_argv(argc+1, argv-1); -+ char *arg_list = NULL; - parsed_opts_t* opts = parse_args(argc, argv); - if (!opts) goto cleanup_opts; -+ if (!opts->no_pg && !(arg_list = stringify_argv(argc+1, argv-1))) -+ goto cleanup_opts; - state_t* status = init(opts, arg_list); - if (!status) goto cleanup_opts; - ---- python-pysam.orig/samtools/bam_stat.c -+++ python-pysam/samtools/bam_stat.c -@@ -1,6 +1,6 @@ - /* bam_stat.c -- flagstat subcommand. - -- Copyright (C) 2009, 2011, 2013-2015 Genome Research Ltd. -+ Copyright (C) 2009, 2011, 2013-2015, 2019 Genome Research Ltd. 
- - Author: Heng Li - -@@ -69,7 +69,7 @@ - if ((c)->flag & BAM_FDUP) ++(s)->n_dup[w]; \ - } while (0) - --bam_flagstat_t *bam_flagstat_core(samFile *fp, bam_hdr_t *h) -+bam_flagstat_t *bam_flagstat_core(samFile *fp, sam_hdr_t *h) - { - bam_flagstat_t *s; - bam1_t *b; -@@ -93,19 +93,155 @@ - return buffer; - } - -+static const char *percent_json(char *buffer, long long n, long long total) -+{ -+ if (total != 0) sprintf(buffer, "%.2f", (float)n / total * 100.0); -+ else strcpy(buffer, "null"); -+ return buffer; -+} -+ - static void usage_exit(FILE *fp, int exit_status) - { - fprintf(fp, "Usage: samtools flagstat [options] \n"); -- sam_global_opt_help(fp, "-.---@"); -+ sam_global_opt_help(fp, "-.---@-."); -+ fprintf(fp, " -O, --"); -+ fprintf(fp, "output-fmt FORMAT[,OPT[=VAL]]...\n" -+ " Specify output format (json, tsv)\n"); - exit(exit_status); - } - -+static void out_fmt_default(bam_flagstat_t *s) -+{ -+ char b0[16], b1[16]; -+ printf("%lld + %lld in total (QC-passed reads + QC-failed reads)\n", s->n_reads[0], s->n_reads[1]); -+ printf("%lld + %lld secondary\n", s->n_secondary[0], s->n_secondary[1]); -+ printf("%lld + %lld supplementary\n", s->n_supp[0], s->n_supp[1]); -+ printf("%lld + %lld duplicates\n", s->n_dup[0], s->n_dup[1]); -+ printf("%lld + %lld mapped (%s : %s)\n", s->n_mapped[0], s->n_mapped[1], percent(b0, s->n_mapped[0], s->n_reads[0]), percent(b1, s->n_mapped[1], s->n_reads[1])); -+ printf("%lld + %lld paired in sequencing\n", s->n_pair_all[0], s->n_pair_all[1]); -+ printf("%lld + %lld read1\n", s->n_read1[0], s->n_read1[1]); -+ printf("%lld + %lld read2\n", s->n_read2[0], s->n_read2[1]); -+ printf("%lld + %lld properly paired (%s : %s)\n", s->n_pair_good[0], s->n_pair_good[1], percent(b0, s->n_pair_good[0], s->n_pair_all[0]), percent(b1, s->n_pair_good[1], s->n_pair_all[1])); -+ printf("%lld + %lld with itself and mate mapped\n", s->n_pair_map[0], s->n_pair_map[1]); -+ printf("%lld + %lld singletons (%s : %s)\n", s->n_sgltn[0], s->n_sgltn[1], 
percent(b0, s->n_sgltn[0], s->n_pair_all[0]), percent(b1, s->n_sgltn[1], s->n_pair_all[1])); -+ printf("%lld + %lld with mate mapped to a different chr\n", s->n_diffchr[0], s->n_diffchr[1]); -+ printf("%lld + %lld with mate mapped to a different chr (mapQ>=5)\n", s->n_diffhigh[0], s->n_diffhigh[1]); -+} -+ -+static void out_fmt_json(bam_flagstat_t *s) { -+ char b0[16], b1[16]; -+ printf("{\n \"QC-passed reads\": { \n" -+ " \"total\": %lld, \n" -+ " \"secondary\": %lld, \n" -+ " \"supplementary\": %lld, \n" -+ " \"duplicates\": %lld, \n" -+ " \"mapped\": %lld, \n" -+ " \"mapped %%\": %s, \n" -+ " \"paired in sequencing\": %lld, \n" -+ " \"read1\": %lld, \n" -+ " \"read2\": %lld, \n" -+ " \"properly paired\": %lld, \n" -+ " \"properly paired %%\": %s, \n" -+ " \"with itself and mate mapped\": %lld, \n" -+ " \"singletons\": %lld, \n" -+ " \"singletons %%\": %s, \n" -+ " \"with mate mapped to a different chr\": %lld, \n" -+ " \"with mate mapped to a different chr (mapQ >= 5)\": %lld \n" -+ " }," -+ "\n \"QC-failed reads\": { \n" -+ " \"total\": %lld, \n" -+ " \"secondary\": %lld, \n" -+ " \"supplementary\": %lld, \n" -+ " \"duplicates\": %lld, \n" -+ " \"mapped\": %lld, \n" -+ " \"mapped %%\": %s, \n" -+ " \"paired in sequencing\": %lld, \n" -+ " \"read1\": %lld, \n" -+ " \"read2\": %lld, \n" -+ " \"properly paired\": %lld, \n" -+ " \"properly paired %%\": %s, \n" -+ " \"with itself and mate mapped\": %lld, \n" -+ " \"singletons\": %lld, \n" -+ " \"singletons %%\": %s, \n" -+ " \"with mate mapped to a different chr\": %lld, \n" -+ " \"with mate mapped to a different chr (mapQ >= 5)\": %lld \n" -+ " }\n" -+ "}\n", -+ s->n_reads[0], -+ s->n_secondary[0], -+ s->n_supp[0], -+ s->n_dup[0], -+ s->n_mapped[0], -+ percent_json(b0, s->n_mapped[0], s->n_reads[0]), -+ s->n_pair_all[0], -+ s->n_read1[0], -+ s->n_read2[0], -+ s->n_pair_good[0], -+ percent_json(b0, s->n_pair_good[0], s->n_pair_all[0]), -+ s->n_pair_map[0], -+ s->n_sgltn[0], -+ percent_json(b0, s->n_sgltn[0], 
s->n_pair_all[0]), -+ s->n_diffchr[0], -+ s->n_diffhigh[0], -+ s->n_reads[1], -+ s->n_secondary[1], -+ s->n_supp[1], -+ s->n_dup[1], -+ s->n_mapped[1], -+ percent_json(b1, s->n_mapped[1], s->n_reads[1]), -+ s->n_pair_all[1], -+ s->n_read1[1], -+ s->n_read2[1], -+ s->n_pair_good[1], -+ percent_json(b1, s->n_pair_good[1], s->n_pair_all[1]), -+ s->n_pair_map[1], -+ s->n_sgltn[1], -+ percent_json(b1, s->n_sgltn[1], s->n_pair_all[1]), -+ s->n_diffchr[1], -+ s->n_diffhigh[1] -+ ); -+} -+ -+static void out_fmt_tsv(bam_flagstat_t *s) { -+ char b0[16], b1[16]; -+ printf("%lld\t%lld\ttotal (QC-passed reads + QC-failed reads)\n", s->n_reads[0], s->n_reads[1]); -+ printf("%lld\t%lld\tsecondary\n", s->n_secondary[0], s->n_secondary[1]); -+ printf("%lld\t%lld\tsupplementary\n", s->n_supp[0], s->n_supp[1]); -+ printf("%lld\t%lld\tduplicates\n", s->n_dup[0], s->n_dup[1]); -+ printf("%lld\t%lld\tmapped\n", s->n_mapped[0], s->n_mapped[1]); -+ printf("%s\t%s\tmapped %%\n", percent(b0, s->n_mapped[0], s->n_reads[0]), percent(b1, s->n_mapped[1], s->n_reads[1])); -+ printf("%lld\t%lld\tpaired in sequencing\n", s->n_pair_all[0], s->n_pair_all[1]); -+ printf("%lld\t%lld\tread1\n", s->n_read1[0], s->n_read1[1]); -+ printf("%lld\t%lld\tread2\n", s->n_read2[0], s->n_read2[1]); -+ printf("%lld\t%lld\tproperly paired\n", s->n_pair_good[0], s->n_pair_good[1]); -+ printf("%s\t%s\tproperly paired %%\n", percent(b0, s->n_pair_good[0], s->n_pair_all[0]), percent(b1, s->n_pair_good[1], s->n_pair_all[1])); -+ printf("%lld\t%lld\twith itself and mate mapped\n", s->n_pair_map[0], s->n_pair_map[1]); -+ printf("%lld\t%lld\tsingletons\n", s->n_sgltn[0], s->n_sgltn[1]); -+ printf("%s\t%s\tsingletons %%\n", percent(b0, s->n_sgltn[0], s->n_pair_all[0]), percent(b1, s->n_sgltn[1], s->n_pair_all[1])); -+ printf("%lld\t%lld\twith mate mapped to a different chr\n", s->n_diffchr[0], s->n_diffchr[1]); -+ printf("%lld\t%lld\twith mate mapped to a different chr (mapQ>=5)\n", s->n_diffhigh[0], s->n_diffhigh[1]); -+} 
-+ -+/* -+ * Select flagstats output format to print. -+ */ -+static void output_fmt(bam_flagstat_t *s, const char *out_fmt) -+{ -+ if (strcmp(out_fmt, "json") == 0 || strcmp(out_fmt, "JSON") == 0) { -+ out_fmt_json(s); -+ } else if (strcmp(out_fmt, "tsv") == 0 || strcmp(out_fmt, "TSV") == 0) { -+ out_fmt_tsv(s); -+ } else { -+ out_fmt_default(s); -+ } -+} -+ - int bam_flagstat(int argc, char *argv[]) - { - samFile *fp; -- bam_hdr_t *header; -+ sam_hdr_t *header; - bam_flagstat_t *s; -- char b0[16], b1[16]; -+ const char *out_fmt = "default"; - int c; - - enum { -@@ -114,12 +250,15 @@ - - sam_global_args ga = SAM_GLOBAL_ARGS_INIT; - static const struct option lopts[] = { -- SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', '-', '@'), -+ SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', '-', '-', '@'), - {NULL, 0, NULL, 0} - }; - -- while ((c = getopt_long(argc, argv, "@:", lopts, NULL)) >= 0) { -+ while ((c = getopt_long(argc, argv, "@:O:", lopts, NULL)) >= 0) { - switch (c) { -+ case 'O': -+ out_fmt = optarg; -+ break; - default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; - /* else fall-through */ - case '?': -@@ -155,22 +294,11 @@ - fprintf(stderr, "Failed to read header for \"%s\"\n", argv[optind]); - return 1; - } -+ - s = bam_flagstat_core(fp, header); -- printf("%lld + %lld in total (QC-passed reads + QC-failed reads)\n", s->n_reads[0], s->n_reads[1]); -- printf("%lld + %lld secondary\n", s->n_secondary[0], s->n_secondary[1]); -- printf("%lld + %lld supplementary\n", s->n_supp[0], s->n_supp[1]); -- printf("%lld + %lld duplicates\n", s->n_dup[0], s->n_dup[1]); -- printf("%lld + %lld mapped (%s : %s)\n", s->n_mapped[0], s->n_mapped[1], percent(b0, s->n_mapped[0], s->n_reads[0]), percent(b1, s->n_mapped[1], s->n_reads[1])); -- printf("%lld + %lld paired in sequencing\n", s->n_pair_all[0], s->n_pair_all[1]); -- printf("%lld + %lld read1\n", s->n_read1[0], s->n_read1[1]); -- printf("%lld + %lld read2\n", s->n_read2[0], s->n_read2[1]); -- printf("%lld + %lld properly 
paired (%s : %s)\n", s->n_pair_good[0], s->n_pair_good[1], percent(b0, s->n_pair_good[0], s->n_pair_all[0]), percent(b1, s->n_pair_good[1], s->n_pair_all[1])); -- printf("%lld + %lld with itself and mate mapped\n", s->n_pair_map[0], s->n_pair_map[1]); -- printf("%lld + %lld singletons (%s : %s)\n", s->n_sgltn[0], s->n_sgltn[1], percent(b0, s->n_sgltn[0], s->n_pair_all[0]), percent(b1, s->n_sgltn[1], s->n_pair_all[1])); -- printf("%lld + %lld with mate mapped to a different chr\n", s->n_diffchr[0], s->n_diffchr[1]); -- printf("%lld + %lld with mate mapped to a different chr (mapQ>=5)\n", s->n_diffhigh[0], s->n_diffhigh[1]); -+ output_fmt(s, out_fmt); - free(s); -- bam_hdr_destroy(header); -+ sam_hdr_destroy(header); - sam_close(fp); - sam_global_args_free(&ga); - return 0; ---- python-pysam.orig/samtools/bam_stat.c.pysam.c -+++ python-pysam/samtools/bam_stat.c.pysam.c -@@ -2,7 +2,7 @@ - - /* bam_stat.c -- flagstat subcommand. - -- Copyright (C) 2009, 2011, 2013-2015 Genome Research Ltd. -+ Copyright (C) 2009, 2011, 2013-2015, 2019 Genome Research Ltd. 
- - Author: Heng Li - -@@ -71,7 +71,7 @@ - if ((c)->flag & BAM_FDUP) ++(s)->n_dup[w]; \ - } while (0) - --bam_flagstat_t *bam_flagstat_core(samFile *fp, bam_hdr_t *h) -+bam_flagstat_t *bam_flagstat_core(samFile *fp, sam_hdr_t *h) - { - bam_flagstat_t *s; - bam1_t *b; -@@ -95,19 +95,155 @@ - return buffer; - } - -+static const char *percent_json(char *buffer, long long n, long long total) -+{ -+ if (total != 0) sprintf(buffer, "%.2f", (float)n / total * 100.0); -+ else strcpy(buffer, "null"); -+ return buffer; -+} -+ - static void usage_exit(FILE *fp, int exit_status) - { - fprintf(fp, "Usage: samtools flagstat [options] \n"); -- sam_global_opt_help(fp, "-.---@"); -+ sam_global_opt_help(fp, "-.---@-."); -+ fprintf(fp, " -O, --"); -+ fprintf(fp, "output-fmt FORMAT[,OPT[=VAL]]...\n" -+ " Specify output format (json, tsv)\n"); - exit(exit_status); - } - -+static void out_fmt_default(bam_flagstat_t *s) -+{ -+ char b0[16], b1[16]; -+ fprintf(samtools_stdout, "%lld + %lld in total (QC-passed reads + QC-failed reads)\n", s->n_reads[0], s->n_reads[1]); -+ fprintf(samtools_stdout, "%lld + %lld secondary\n", s->n_secondary[0], s->n_secondary[1]); -+ fprintf(samtools_stdout, "%lld + %lld supplementary\n", s->n_supp[0], s->n_supp[1]); -+ fprintf(samtools_stdout, "%lld + %lld duplicates\n", s->n_dup[0], s->n_dup[1]); -+ fprintf(samtools_stdout, "%lld + %lld mapped (%s : %s)\n", s->n_mapped[0], s->n_mapped[1], percent(b0, s->n_mapped[0], s->n_reads[0]), percent(b1, s->n_mapped[1], s->n_reads[1])); -+ fprintf(samtools_stdout, "%lld + %lld paired in sequencing\n", s->n_pair_all[0], s->n_pair_all[1]); -+ fprintf(samtools_stdout, "%lld + %lld read1\n", s->n_read1[0], s->n_read1[1]); -+ fprintf(samtools_stdout, "%lld + %lld read2\n", s->n_read2[0], s->n_read2[1]); -+ fprintf(samtools_stdout, "%lld + %lld properly paired (%s : %s)\n", s->n_pair_good[0], s->n_pair_good[1], percent(b0, s->n_pair_good[0], s->n_pair_all[0]), percent(b1, s->n_pair_good[1], s->n_pair_all[1])); -+ 
fprintf(samtools_stdout, "%lld + %lld with itself and mate mapped\n", s->n_pair_map[0], s->n_pair_map[1]); -+ fprintf(samtools_stdout, "%lld + %lld singletons (%s : %s)\n", s->n_sgltn[0], s->n_sgltn[1], percent(b0, s->n_sgltn[0], s->n_pair_all[0]), percent(b1, s->n_sgltn[1], s->n_pair_all[1])); -+ fprintf(samtools_stdout, "%lld + %lld with mate mapped to a different chr\n", s->n_diffchr[0], s->n_diffchr[1]); -+ fprintf(samtools_stdout, "%lld + %lld with mate mapped to a different chr (mapQ>=5)\n", s->n_diffhigh[0], s->n_diffhigh[1]); -+} -+ -+static void out_fmt_json(bam_flagstat_t *s) { -+ char b0[16], b1[16]; -+ fprintf(samtools_stdout, "{\n \"QC-passed reads\": { \n" -+ " \"total\": %lld, \n" -+ " \"secondary\": %lld, \n" -+ " \"supplementary\": %lld, \n" -+ " \"duplicates\": %lld, \n" -+ " \"mapped\": %lld, \n" -+ " \"mapped %%\": %s, \n" -+ " \"paired in sequencing\": %lld, \n" -+ " \"read1\": %lld, \n" -+ " \"read2\": %lld, \n" -+ " \"properly paired\": %lld, \n" -+ " \"properly paired %%\": %s, \n" -+ " \"with itself and mate mapped\": %lld, \n" -+ " \"singletons\": %lld, \n" -+ " \"singletons %%\": %s, \n" -+ " \"with mate mapped to a different chr\": %lld, \n" -+ " \"with mate mapped to a different chr (mapQ >= 5)\": %lld \n" -+ " }," -+ "\n \"QC-failed reads\": { \n" -+ " \"total\": %lld, \n" -+ " \"secondary\": %lld, \n" -+ " \"supplementary\": %lld, \n" -+ " \"duplicates\": %lld, \n" -+ " \"mapped\": %lld, \n" -+ " \"mapped %%\": %s, \n" -+ " \"paired in sequencing\": %lld, \n" -+ " \"read1\": %lld, \n" -+ " \"read2\": %lld, \n" -+ " \"properly paired\": %lld, \n" -+ " \"properly paired %%\": %s, \n" -+ " \"with itself and mate mapped\": %lld, \n" -+ " \"singletons\": %lld, \n" -+ " \"singletons %%\": %s, \n" -+ " \"with mate mapped to a different chr\": %lld, \n" -+ " \"with mate mapped to a different chr (mapQ >= 5)\": %lld \n" -+ " }\n" -+ "}\n", -+ s->n_reads[0], -+ s->n_secondary[0], -+ s->n_supp[0], -+ s->n_dup[0], -+ s->n_mapped[0], -+ 
percent_json(b0, s->n_mapped[0], s->n_reads[0]), -+ s->n_pair_all[0], -+ s->n_read1[0], -+ s->n_read2[0], -+ s->n_pair_good[0], -+ percent_json(b0, s->n_pair_good[0], s->n_pair_all[0]), -+ s->n_pair_map[0], -+ s->n_sgltn[0], -+ percent_json(b0, s->n_sgltn[0], s->n_pair_all[0]), -+ s->n_diffchr[0], -+ s->n_diffhigh[0], -+ s->n_reads[1], -+ s->n_secondary[1], -+ s->n_supp[1], -+ s->n_dup[1], -+ s->n_mapped[1], -+ percent_json(b1, s->n_mapped[1], s->n_reads[1]), -+ s->n_pair_all[1], -+ s->n_read1[1], -+ s->n_read2[1], -+ s->n_pair_good[1], -+ percent_json(b1, s->n_pair_good[1], s->n_pair_all[1]), -+ s->n_pair_map[1], -+ s->n_sgltn[1], -+ percent_json(b1, s->n_sgltn[1], s->n_pair_all[1]), -+ s->n_diffchr[1], -+ s->n_diffhigh[1] -+ ); -+} -+ -+static void out_fmt_tsv(bam_flagstat_t *s) { -+ char b0[16], b1[16]; -+ fprintf(samtools_stdout, "%lld\t%lld\ttotal (QC-passed reads + QC-failed reads)\n", s->n_reads[0], s->n_reads[1]); -+ fprintf(samtools_stdout, "%lld\t%lld\tsecondary\n", s->n_secondary[0], s->n_secondary[1]); -+ fprintf(samtools_stdout, "%lld\t%lld\tsupplementary\n", s->n_supp[0], s->n_supp[1]); -+ fprintf(samtools_stdout, "%lld\t%lld\tduplicates\n", s->n_dup[0], s->n_dup[1]); -+ fprintf(samtools_stdout, "%lld\t%lld\tmapped\n", s->n_mapped[0], s->n_mapped[1]); -+ fprintf(samtools_stdout, "%s\t%s\tmapped %%\n", percent(b0, s->n_mapped[0], s->n_reads[0]), percent(b1, s->n_mapped[1], s->n_reads[1])); -+ fprintf(samtools_stdout, "%lld\t%lld\tpaired in sequencing\n", s->n_pair_all[0], s->n_pair_all[1]); -+ fprintf(samtools_stdout, "%lld\t%lld\tread1\n", s->n_read1[0], s->n_read1[1]); -+ fprintf(samtools_stdout, "%lld\t%lld\tread2\n", s->n_read2[0], s->n_read2[1]); -+ fprintf(samtools_stdout, "%lld\t%lld\tproperly paired\n", s->n_pair_good[0], s->n_pair_good[1]); -+ fprintf(samtools_stdout, "%s\t%s\tproperly paired %%\n", percent(b0, s->n_pair_good[0], s->n_pair_all[0]), percent(b1, s->n_pair_good[1], s->n_pair_all[1])); -+ fprintf(samtools_stdout, "%lld\t%lld\twith 
itself and mate mapped\n", s->n_pair_map[0], s->n_pair_map[1]); -+ fprintf(samtools_stdout, "%lld\t%lld\tsingletons\n", s->n_sgltn[0], s->n_sgltn[1]); -+ fprintf(samtools_stdout, "%s\t%s\tsingletons %%\n", percent(b0, s->n_sgltn[0], s->n_pair_all[0]), percent(b1, s->n_sgltn[1], s->n_pair_all[1])); -+ fprintf(samtools_stdout, "%lld\t%lld\twith mate mapped to a different chr\n", s->n_diffchr[0], s->n_diffchr[1]); -+ fprintf(samtools_stdout, "%lld\t%lld\twith mate mapped to a different chr (mapQ>=5)\n", s->n_diffhigh[0], s->n_diffhigh[1]); -+} -+ -+/* -+ * Select flagstats output format to print. -+ */ -+static void output_fmt(bam_flagstat_t *s, const char *out_fmt) -+{ -+ if (strcmp(out_fmt, "json") == 0 || strcmp(out_fmt, "JSON") == 0) { -+ out_fmt_json(s); -+ } else if (strcmp(out_fmt, "tsv") == 0 || strcmp(out_fmt, "TSV") == 0) { -+ out_fmt_tsv(s); -+ } else { -+ out_fmt_default(s); -+ } -+} -+ - int bam_flagstat(int argc, char *argv[]) - { - samFile *fp; -- bam_hdr_t *header; -+ sam_hdr_t *header; - bam_flagstat_t *s; -- char b0[16], b1[16]; -+ const char *out_fmt = "default"; - int c; - - enum { -@@ -116,12 +252,15 @@ - - sam_global_args ga = SAM_GLOBAL_ARGS_INIT; - static const struct option lopts[] = { -- SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', '-', '@'), -+ SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', '-', '-', '@'), - {NULL, 0, NULL, 0} - }; - -- while ((c = getopt_long(argc, argv, "@:", lopts, NULL)) >= 0) { -+ while ((c = getopt_long(argc, argv, "@:O:", lopts, NULL)) >= 0) { - switch (c) { -+ case 'O': -+ out_fmt = optarg; -+ break; - default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; - /* else fall-through */ - case '?': -@@ -157,22 +296,11 @@ - fprintf(samtools_stderr, "Failed to read header for \"%s\"\n", argv[optind]); - return 1; - } -+ - s = bam_flagstat_core(fp, header); -- fprintf(samtools_stdout, "%lld + %lld in total (QC-passed reads + QC-failed reads)\n", s->n_reads[0], s->n_reads[1]); -- fprintf(samtools_stdout, "%lld + %lld 
secondary\n", s->n_secondary[0], s->n_secondary[1]); -- fprintf(samtools_stdout, "%lld + %lld supplementary\n", s->n_supp[0], s->n_supp[1]); -- fprintf(samtools_stdout, "%lld + %lld duplicates\n", s->n_dup[0], s->n_dup[1]); -- fprintf(samtools_stdout, "%lld + %lld mapped (%s : %s)\n", s->n_mapped[0], s->n_mapped[1], percent(b0, s->n_mapped[0], s->n_reads[0]), percent(b1, s->n_mapped[1], s->n_reads[1])); -- fprintf(samtools_stdout, "%lld + %lld paired in sequencing\n", s->n_pair_all[0], s->n_pair_all[1]); -- fprintf(samtools_stdout, "%lld + %lld read1\n", s->n_read1[0], s->n_read1[1]); -- fprintf(samtools_stdout, "%lld + %lld read2\n", s->n_read2[0], s->n_read2[1]); -- fprintf(samtools_stdout, "%lld + %lld properly paired (%s : %s)\n", s->n_pair_good[0], s->n_pair_good[1], percent(b0, s->n_pair_good[0], s->n_pair_all[0]), percent(b1, s->n_pair_good[1], s->n_pair_all[1])); -- fprintf(samtools_stdout, "%lld + %lld with itself and mate mapped\n", s->n_pair_map[0], s->n_pair_map[1]); -- fprintf(samtools_stdout, "%lld + %lld singletons (%s : %s)\n", s->n_sgltn[0], s->n_sgltn[1], percent(b0, s->n_sgltn[0], s->n_pair_all[0]), percent(b1, s->n_sgltn[1], s->n_pair_all[1])); -- fprintf(samtools_stdout, "%lld + %lld with mate mapped to a different chr\n", s->n_diffchr[0], s->n_diffchr[1]); -- fprintf(samtools_stdout, "%lld + %lld with mate mapped to a different chr (mapQ>=5)\n", s->n_diffhigh[0], s->n_diffhigh[1]); -+ output_fmt(s, out_fmt); - free(s); -- bam_hdr_destroy(header); -+ sam_hdr_destroy(header); - sam_close(fp); - sam_global_args_free(&ga); - return 0; ---- python-pysam.orig/samtools/bamshuf.c -+++ python-pysam/samtools/bamshuf.c -@@ -1,7 +1,7 @@ - /* bamshuf.c -- collate subcommand. - - Copyright (C) 2012 Broad Institute. -- Copyright (C) 2013, 2015, 2018 Genome Research Ltd. -+ Copyright (C) 2013, 2015-2019 Genome Research Ltd. 
- - Author: Heng Li - -@@ -164,7 +164,7 @@ - } - - --static inline int write_to_bin_file(bam1_t *bam, int64_t *count, samFile **bin_files, char **names, bam_hdr_t *header, int files) { -+static inline int write_to_bin_file(bam1_t *bam, int64_t *count, samFile **bin_files, char **names, sam_hdr_t *header, int files) { - uint32_t x; - - x = hash_X31_Wang(bam_get_qname(bam)) % files; -@@ -181,13 +181,13 @@ - - - static int bamshuf(const char *fn, int n_files, const char *pre, int clevel, -- int is_stdout, const char *output_file, int fast, int store_max, sam_global_args *ga) -+ int is_stdout, const char *output_file, int fast, int store_max, sam_global_args *ga, char *arg_list, int no_pg) - { - samFile *fp, *fpw = NULL, **fpt = NULL; - char **fnt = NULL, modew[8]; - bam1_t *b = NULL; - int i, counter, l, r; -- bam_hdr_t *h = NULL; -+ sam_hdr_t *h = NULL; - int64_t j, max_cnt = 0, *cnt = NULL; - elem_t *a = NULL; - htsThreadPool p = {NULL, 0}; -@@ -214,14 +214,10 @@ - goto fail; - } - -- if (sam_hdr_change_HD(h, "SO", "unsorted") != 0) { -- print_error("collate", -- "failed to change sort order header to 'unsorted'\n"); -- goto fail; -- } -- if (sam_hdr_change_HD(h, "GO", "query") != 0) { -- print_error("collate", -- "failed to change group order header to 'query'\n"); -+ if ((-1 == sam_hdr_update_hd(h, "SO", "unsorted", "GO", "query")) -+ && (-1 == sam_hdr_add_line(h, "HD", "VN", SAM_FORMAT_VERSION, "SO", "unsorted", "GO", "query", NULL)) -+ ) { -+ print_error("collate", "failed to update HD line\n"); - goto fail; - } - -@@ -254,6 +250,15 @@ - } - if (p.pool) hts_set_opt(fpw, HTS_OPT_THREAD_POOL, &p); - -+ if (!no_pg && sam_hdr_add_pg(h, "samtools", -+ "VN", samtools_version(), -+ arg_list ? "CL": NULL, -+ arg_list ? 
arg_list : NULL, -+ NULL)) { -+ print_error("collate", "failed to add PG line to header of \"%s\"", output_file); -+ goto fail; -+ } -+ - if (sam_hdr_write(fpw, h) < 0) { - print_error_errno("collate", "Couldn't write header"); - goto fail; -@@ -459,7 +464,7 @@ - goto fail; - } - if (p.pool) hts_set_opt(fp, HTS_OPT_THREAD_POOL, &p); -- bam_hdr_destroy(sam_hdr_read(fp)); // Skip over header -+ sam_hdr_destroy(sam_hdr_read(fp)); // Skip over header - - // Slurp in one of the split files - for (j = 0; j < c; ++j) { -@@ -485,7 +490,7 @@ - } - } - -- bam_hdr_destroy(h); -+ sam_hdr_destroy(h); - for (j = 0; j < max_cnt; ++j) bam_destroy1(a[j].b); - free(a); free(fnt); free(cnt); - sam_global_args_free(ga); -@@ -503,7 +508,7 @@ - fail: - if (fp) sam_close(fp); - if (fpw) sam_close(fpw); -- if (h) bam_hdr_destroy(h); -+ if (h) sam_hdr_destroy(h); - for (i = 0; i < n_files; ++i) { - if (fnt) free(fnt[i]); - if (fpt && fpt[i]) sam_close(fpt[i]); -@@ -530,10 +535,11 @@ - " -f fast (only primary alignments)\n" - " -r working reads stored (with -f) [%d]\n" // reads_store - " -l INT compression level [%d]\n" // DEF_CLEVEL -- " -n INT number of temporary files [%d]\n", // n_files -+ " -n INT number of temporary files [%d]\n" // n_files -+ " --no-PG do not add a PG line\n", - reads_store, DEF_CLEVEL, n_files); - -- sam_global_opt_help(fp, "-....@"); -+ sam_global_opt_help(fp, "-....@-."); - fprintf(fp, - " is required unless the -o or -O options are used.\n"); - -@@ -574,12 +580,13 @@ - - int main_bamshuf(int argc, char *argv[]) - { -- int c, n_files = 64, clevel = DEF_CLEVEL, is_stdout = 0, is_un = 0, fast_coll = 0, reads_store = 10000, ret, pre_mem = 0; -+ int c, n_files = 64, clevel = DEF_CLEVEL, is_stdout = 0, is_un = 0, fast_coll = 0, reads_store = 10000, ret, pre_mem = 0, no_pg = 0; - const char *output_file = NULL; -- char *prefix = NULL; -+ char *prefix = NULL, *arg_list = NULL; - sam_global_args ga = SAM_GLOBAL_ARGS_INIT; - static const struct option lopts[] = { - 
SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0, '@'), -+ {"no-PG", no_argument, NULL, 1}, - { NULL, 0, NULL, 0 } - }; - -@@ -592,6 +599,7 @@ - case 'o': output_file = optarg; break; - case 'f': fast_coll = 1; break; - case 'r': reads_store = atoi(optarg); break; -+ case 1: no_pg = 1; break; - default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; - /* else fall-through */ - case '?': return usage(stderr, n_files, reads_store); -@@ -612,10 +620,16 @@ - - if (!prefix) return EXIT_FAILURE; - -+ if (!no_pg && !(arg_list = stringify_argv(argc+1, argv-1))) { -+ print_error("collate", "failed to create arg_list"); -+ return 1; -+ } -+ - ret = bamshuf(argv[optind], n_files, prefix, clevel, is_stdout, -- output_file, fast_coll, reads_store, &ga); -+ output_file, fast_coll, reads_store, &ga, arg_list, no_pg); - - if (pre_mem) free(prefix); -+ free(arg_list); - - return ret; - } ---- python-pysam.orig/samtools/bamshuf.c.pysam.c -+++ python-pysam/samtools/bamshuf.c.pysam.c -@@ -3,7 +3,7 @@ - /* bamshuf.c -- collate subcommand. - - Copyright (C) 2012 Broad Institute. -- Copyright (C) 2013, 2015, 2018 Genome Research Ltd. -+ Copyright (C) 2013, 2015-2019 Genome Research Ltd. 
- - Author: Heng Li - -@@ -166,7 +166,7 @@ - } - - --static inline int write_to_bin_file(bam1_t *bam, int64_t *count, samFile **bin_files, char **names, bam_hdr_t *header, int files) { -+static inline int write_to_bin_file(bam1_t *bam, int64_t *count, samFile **bin_files, char **names, sam_hdr_t *header, int files) { - uint32_t x; - - x = hash_X31_Wang(bam_get_qname(bam)) % files; -@@ -183,13 +183,13 @@ - - - static int bamshuf(const char *fn, int n_files, const char *pre, int clevel, -- int is_samtools_stdout, const char *output_file, int fast, int store_max, sam_global_args *ga) -+ int is_samtools_stdout, const char *output_file, int fast, int store_max, sam_global_args *ga, char *arg_list, int no_pg) - { - samFile *fp, *fpw = NULL, **fpt = NULL; - char **fnt = NULL, modew[8]; - bam1_t *b = NULL; - int i, counter, l, r; -- bam_hdr_t *h = NULL; -+ sam_hdr_t *h = NULL; - int64_t j, max_cnt = 0, *cnt = NULL; - elem_t *a = NULL; - htsThreadPool p = {NULL, 0}; -@@ -216,14 +216,10 @@ - goto fail; - } - -- if (sam_hdr_change_HD(h, "SO", "unsorted") != 0) { -- print_error("collate", -- "failed to change sort order header to 'unsorted'\n"); -- goto fail; -- } -- if (sam_hdr_change_HD(h, "GO", "query") != 0) { -- print_error("collate", -- "failed to change group order header to 'query'\n"); -+ if ((-1 == sam_hdr_update_hd(h, "SO", "unsorted", "GO", "query")) -+ && (-1 == sam_hdr_add_line(h, "HD", "VN", SAM_FORMAT_VERSION, "SO", "unsorted", "GO", "query", NULL)) -+ ) { -+ print_error("collate", "failed to update HD line\n"); - goto fail; - } - -@@ -256,6 +252,15 @@ - } - if (p.pool) hts_set_opt(fpw, HTS_OPT_THREAD_POOL, &p); - -+ if (!no_pg && sam_hdr_add_pg(h, "samtools", -+ "VN", samtools_version(), -+ arg_list ? "CL": NULL, -+ arg_list ? 
arg_list : NULL, -+ NULL)) { -+ print_error("collate", "failed to add PG line to header of \"%s\"", output_file); -+ goto fail; -+ } -+ - if (sam_hdr_write(fpw, h) < 0) { - print_error_errno("collate", "Couldn't write header"); - goto fail; -@@ -461,7 +466,7 @@ - goto fail; - } - if (p.pool) hts_set_opt(fp, HTS_OPT_THREAD_POOL, &p); -- bam_hdr_destroy(sam_hdr_read(fp)); // Skip over header -+ sam_hdr_destroy(sam_hdr_read(fp)); // Skip over header - - // Slurp in one of the split files - for (j = 0; j < c; ++j) { -@@ -487,7 +492,7 @@ - } - } - -- bam_hdr_destroy(h); -+ sam_hdr_destroy(h); - for (j = 0; j < max_cnt; ++j) bam_destroy1(a[j].b); - free(a); free(fnt); free(cnt); - sam_global_args_free(ga); -@@ -505,7 +510,7 @@ - fail: - if (fp) sam_close(fp); - if (fpw) sam_close(fpw); -- if (h) bam_hdr_destroy(h); -+ if (h) sam_hdr_destroy(h); - for (i = 0; i < n_files; ++i) { - if (fnt) free(fnt[i]); - if (fpt && fpt[i]) sam_close(fpt[i]); -@@ -532,10 +537,11 @@ - " -f fast (only primary alignments)\n" - " -r working reads stored (with -f) [%d]\n" // reads_store - " -l INT compression level [%d]\n" // DEF_CLEVEL -- " -n INT number of temporary files [%d]\n", // n_files -+ " -n INT number of temporary files [%d]\n" // n_files -+ " --no-PG do not add a PG line\n", - reads_store, DEF_CLEVEL, n_files); - -- sam_global_opt_help(fp, "-....@"); -+ sam_global_opt_help(fp, "-....@-."); - fprintf(fp, - " is required unless the -o or -O options are used.\n"); - -@@ -576,12 +582,13 @@ - - int main_bamshuf(int argc, char *argv[]) - { -- int c, n_files = 64, clevel = DEF_CLEVEL, is_samtools_stdout = 0, is_un = 0, fast_coll = 0, reads_store = 10000, ret, pre_mem = 0; -+ int c, n_files = 64, clevel = DEF_CLEVEL, is_samtools_stdout = 0, is_un = 0, fast_coll = 0, reads_store = 10000, ret, pre_mem = 0, no_pg = 0; - const char *output_file = NULL; -- char *prefix = NULL; -+ char *prefix = NULL, *arg_list = NULL; - sam_global_args ga = SAM_GLOBAL_ARGS_INIT; - static const struct option 
lopts[] = { - SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0, '@'), -+ {"no-PG", no_argument, NULL, 1}, - { NULL, 0, NULL, 0 } - }; - -@@ -594,6 +601,7 @@ - case 'o': output_file = optarg; break; - case 'f': fast_coll = 1; break; - case 'r': reads_store = atoi(optarg); break; -+ case 1: no_pg = 1; break; - default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; - /* else fall-through */ - case '?': return usage(samtools_stderr, n_files, reads_store); -@@ -614,10 +622,16 @@ - - if (!prefix) return EXIT_FAILURE; - -+ if (!no_pg && !(arg_list = stringify_argv(argc+1, argv-1))) { -+ print_error("collate", "failed to create arg_list"); -+ return 1; -+ } -+ - ret = bamshuf(argv[optind], n_files, prefix, clevel, is_samtools_stdout, -- output_file, fast_coll, reads_store, &ga); -+ output_file, fast_coll, reads_store, &ga, arg_list, no_pg); - - if (pre_mem) free(prefix); -+ free(arg_list); - - return ret; - } ---- python-pysam.orig/samtools/bamtk.c -+++ python-pysam/samtools/bamtk.c -@@ -1,6 +1,6 @@ - /* bamtk.c -- main samtools command front-end. - -- Copyright (C) 2008-2018 Genome Research Ltd. -+ Copyright (C) 2008-2019 Genome Research Ltd. 
- - Author: Heng Li - -@@ -38,7 +38,7 @@ - int bam_merge(int argc, char *argv[]); - int bam_index(int argc, char *argv[]); - int bam_sort(int argc, char *argv[]); --int bam_tview_main(int argc, char *argv[]); -+//int bam_tview_main(int argc, char *argv[]); - int bam_mating(int argc, char *argv[]); - int bam_rmdup(int argc, char *argv[]); - int bam_flagstat(int argc, char *argv[]); -@@ -52,6 +52,7 @@ - int main_phase(int argc, char *argv[]); - int main_cat(int argc, char *argv[]); - int main_depth(int argc, char *argv[]); -+int main_coverage(int argc, char *argv[]); - int main_bam2fq(int argc, char *argv[]); - int main_pad2unpad(int argc, char *argv[]); - int main_bedcov(int argc, char *argv[]); -@@ -109,6 +110,7 @@ - "\n" - " -- Statistics\n" - " bedcov read depth per BED region\n" -+" coverage alignment depth and percent coverage\n" - " depth compute the depth\n" - " flagstat simple stats\n" - " idxstats BAM index stats\n" -@@ -166,14 +168,16 @@ - else if (strcmp(argv[1], "merge") == 0) ret = bam_merge(argc-1, argv+1); - else if (strcmp(argv[1], "sort") == 0) ret = bam_sort(argc-1, argv+1); - else if (strcmp(argv[1], "index") == 0) ret = bam_index(argc-1, argv+1); -- else if (strcmp(argv[1], "idxstats") == 0) ret = bam_idxstats(argc-1, argv+1); -+ else if (strcmp(argv[1], "idxstat") == 0 || -+ strcmp(argv[1], "idxstats") == 0) ret = bam_idxstats(argc-1, argv+1); - else if (strcmp(argv[1], "faidx") == 0) ret = faidx_main(argc-1, argv+1); - else if (strcmp(argv[1], "fqidx") == 0) ret = fqidx_main(argc-1, argv+1); - else if (strcmp(argv[1], "dict") == 0) ret = dict_main(argc-1, argv+1); - else if (strcmp(argv[1], "fixmate") == 0) ret = bam_mating(argc-1, argv+1); - else if (strcmp(argv[1], "rmdup") == 0) ret = bam_rmdup(argc-1, argv+1); - else if (strcmp(argv[1], "markdup") == 0) ret = bam_markdup(argc-1, argv+1); -- else if (strcmp(argv[1], "flagstat") == 0) ret = bam_flagstat(argc-1, argv+1); -+ else if (strcmp(argv[1], "flagstat") == 0 || -+ strcmp(argv[1], 
"flagstats") == 0) ret = bam_flagstat(argc-1, argv+1); - else if (strcmp(argv[1], "calmd") == 0) ret = bam_fillmd(argc-1, argv+1); - else if (strcmp(argv[1], "fillmd") == 0) ret = bam_fillmd(argc-1, argv+1); - else if (strcmp(argv[1], "reheader") == 0) ret = main_reheader(argc-1, argv+1); -@@ -181,6 +185,7 @@ - else if (strcmp(argv[1], "targetcut") == 0) ret = main_cut_target(argc-1, argv+1); - else if (strcmp(argv[1], "phase") == 0) ret = main_phase(argc-1, argv+1); - else if (strcmp(argv[1], "depth") == 0) ret = main_depth(argc-1, argv+1); -+ else if (strcmp(argv[1], "coverage") == 0) ret = main_coverage(argc-1, argv+1); - else if (strcmp(argv[1], "bam2fq") == 0 || - strcmp(argv[1], "fastq") == 0 || - strcmp(argv[1], "fasta") == 0) ret = main_bam2fq(argc-1, argv+1); -@@ -189,8 +194,10 @@ - else if (strcmp(argv[1], "bedcov") == 0) ret = main_bedcov(argc-1, argv+1); - else if (strcmp(argv[1], "bamshuf") == 0) ret = main_bamshuf(argc-1, argv+1); - else if (strcmp(argv[1], "collate") == 0) ret = main_bamshuf(argc-1, argv+1); -- else if (strcmp(argv[1], "stats") == 0) ret = main_stats(argc-1, argv+1); -- else if (strcmp(argv[1], "flags") == 0) ret = main_flags(argc-1, argv+1); -+ else if (strcmp(argv[1], "stat") == 0 || -+ strcmp(argv[1], "stats") == 0) ret = main_stats(argc-1, argv+1); -+ else if (strcmp(argv[1], "flag") == 0 || -+ strcmp(argv[1], "flags") == 0) ret = main_flags(argc-1, argv+1); - else if (strcmp(argv[1], "split") == 0) ret = main_split(argc-1, argv+1); - else if (strcmp(argv[1], "quickcheck") == 0) ret = main_quickcheck(argc-1, argv+1); - else if (strcmp(argv[1], "addreplacerg") == 0) ret = main_addreplacerg(argc-1, argv+1); -@@ -198,12 +205,12 @@ - fprintf(stderr, "[main] The `pileup' command has been removed. 
Please use `mpileup' instead.\n"); - return 1; - } -- else if (strcmp(argv[1], "tview") == 0) ret = bam_tview_main(argc-1, argv+1); -+ //else if (strcmp(argv[1], "tview") == 0) ret = bam_tview_main(argc-1, argv+1); - else if (strcmp(argv[1], "--version") == 0) { - printf( - "samtools %s\n" - "Using htslib %s\n" --"Copyright (C) 2018 Genome Research Ltd.\n", -+"Copyright (C) 2019 Genome Research Ltd.\n", - samtools_version(), hts_version()); - } - else if (strcmp(argv[1], "--version-only") == 0) { ---- python-pysam.orig/samtools/bamtk.c.pysam.c -+++ python-pysam/samtools/bamtk.c.pysam.c -@@ -2,7 +2,7 @@ - - /* bamtk.c -- main samtools command front-end. - -- Copyright (C) 2008-2018 Genome Research Ltd. -+ Copyright (C) 2008-2019 Genome Research Ltd. - - Author: Heng Li - -@@ -54,6 +54,7 @@ - int main_phase(int argc, char *argv[]); - int main_cat(int argc, char *argv[]); - int main_depth(int argc, char *argv[]); -+int main_coverage(int argc, char *argv[]); - int main_bam2fq(int argc, char *argv[]); - int main_pad2unpad(int argc, char *argv[]); - int main_bedcov(int argc, char *argv[]); -@@ -111,6 +112,7 @@ - "\n" - " -- Statistics\n" - " bedcov read depth per BED region\n" -+" coverage alignment depth and percent coverage\n" - " depth compute the depth\n" - " flagstat simple stats\n" - " idxstats BAM index stats\n" -@@ -168,14 +170,16 @@ - else if (strcmp(argv[1], "merge") == 0) ret = bam_merge(argc-1, argv+1); - else if (strcmp(argv[1], "sort") == 0) ret = bam_sort(argc-1, argv+1); - else if (strcmp(argv[1], "index") == 0) ret = bam_index(argc-1, argv+1); -- else if (strcmp(argv[1], "idxstats") == 0) ret = bam_idxstats(argc-1, argv+1); -+ else if (strcmp(argv[1], "idxstat") == 0 || -+ strcmp(argv[1], "idxstats") == 0) ret = bam_idxstats(argc-1, argv+1); - else if (strcmp(argv[1], "faidx") == 0) ret = faidx_main(argc-1, argv+1); - else if (strcmp(argv[1], "fqidx") == 0) ret = fqidx_main(argc-1, argv+1); - else if (strcmp(argv[1], "dict") == 0) ret = dict_main(argc-1, 
argv+1); - else if (strcmp(argv[1], "fixmate") == 0) ret = bam_mating(argc-1, argv+1); - else if (strcmp(argv[1], "rmdup") == 0) ret = bam_rmdup(argc-1, argv+1); - else if (strcmp(argv[1], "markdup") == 0) ret = bam_markdup(argc-1, argv+1); -- else if (strcmp(argv[1], "flagstat") == 0) ret = bam_flagstat(argc-1, argv+1); -+ else if (strcmp(argv[1], "flagstat") == 0 || -+ strcmp(argv[1], "flagstats") == 0) ret = bam_flagstat(argc-1, argv+1); - else if (strcmp(argv[1], "calmd") == 0) ret = bam_fillmd(argc-1, argv+1); - else if (strcmp(argv[1], "fillmd") == 0) ret = bam_fillmd(argc-1, argv+1); - else if (strcmp(argv[1], "reheader") == 0) ret = main_reheader(argc-1, argv+1); -@@ -183,6 +187,7 @@ - else if (strcmp(argv[1], "targetcut") == 0) ret = main_cut_target(argc-1, argv+1); - else if (strcmp(argv[1], "phase") == 0) ret = main_phase(argc-1, argv+1); - else if (strcmp(argv[1], "depth") == 0) ret = main_depth(argc-1, argv+1); -+ else if (strcmp(argv[1], "coverage") == 0) ret = main_coverage(argc-1, argv+1); - else if (strcmp(argv[1], "bam2fq") == 0 || - strcmp(argv[1], "fastq") == 0 || - strcmp(argv[1], "fasta") == 0) ret = main_bam2fq(argc-1, argv+1); -@@ -191,8 +196,10 @@ - else if (strcmp(argv[1], "bedcov") == 0) ret = main_bedcov(argc-1, argv+1); - else if (strcmp(argv[1], "bamshuf") == 0) ret = main_bamshuf(argc-1, argv+1); - else if (strcmp(argv[1], "collate") == 0) ret = main_bamshuf(argc-1, argv+1); -- else if (strcmp(argv[1], "stats") == 0) ret = main_stats(argc-1, argv+1); -- else if (strcmp(argv[1], "flags") == 0) ret = main_flags(argc-1, argv+1); -+ else if (strcmp(argv[1], "stat") == 0 || -+ strcmp(argv[1], "stats") == 0) ret = main_stats(argc-1, argv+1); -+ else if (strcmp(argv[1], "flag") == 0 || -+ strcmp(argv[1], "flags") == 0) ret = main_flags(argc-1, argv+1); - else if (strcmp(argv[1], "split") == 0) ret = main_split(argc-1, argv+1); - else if (strcmp(argv[1], "quickcheck") == 0) ret = main_quickcheck(argc-1, argv+1); - else if (strcmp(argv[1], 
"addreplacerg") == 0) ret = main_addreplacerg(argc-1, argv+1); -@@ -202,10 +209,10 @@ - } - //else if (strcmp(argv[1], "tview") == 0) ret = bam_tview_main(argc-1, argv+1); - else if (strcmp(argv[1], "--version") == 0) { -- fprintf(samtools_stdout, -+ fprintf(samtools_stdout, - "samtools %s\n" - "Using htslib %s\n" --"Copyright (C) 2018 Genome Research Ltd.\n", -+"Copyright (C) 2019 Genome Research Ltd.\n", - samtools_version(), hts_version()); - } - else if (strcmp(argv[1], "--version-only") == 0) { ---- python-pysam.orig/samtools/bedcov.c -+++ python-pysam/samtools/bedcov.c -@@ -1,7 +1,7 @@ - /* bedcov.c -- bedcov subcommand. - - Copyright (C) 2012 Broad Institute. -- Copyright (C) 2013-2014 Genome Research Ltd. -+ Copyright (C) 2013-2014, 2018, 2019 Genome Research Ltd. - - Author: Heng Li - -@@ -34,6 +34,7 @@ - #include "htslib/kstring.h" - #include "htslib/sam.h" - #include "htslib/thread_pool.h" -+#include "samtools.h" - #include "sam_opts.h" - - #include "htslib/kseq.h" -@@ -41,7 +42,7 @@ - - typedef struct { - htsFile *fp; -- bam_hdr_t *header; -+ sam_hdr_t *header; - hts_itr_t *iter; - int min_mapQ; - } aux_t; -@@ -71,7 +72,7 @@ - int *n_plp, dret, i, j, m, n, c, min_mapQ = 0, skip_DN = 0; - int64_t *cnt; - const bam_pileup1_t **plp; -- int usage = 0; -+ int usage = 0, has_index_file = 0; - - sam_global_args ga = SAM_GLOBAL_ARGS_INIT; - static const struct option lopts[] = { -@@ -79,9 +80,10 @@ - { NULL, 0, NULL, 0 } - }; - -- while ((c = getopt_long(argc, argv, "Q:j", lopts, NULL)) >= 0) { -+ while ((c = getopt_long(argc, argv, "Q:Xj", lopts, NULL)) >= 0) { - switch (c) { - case 'Q': min_mapQ = atoi(optarg); break; -+ case 'X': has_index_file = 1; break; - case 'j': skip_DN = 1; break; - default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; - /* else fall-through */ -@@ -93,20 +95,36 @@ - fprintf(stderr, "Usage: samtools bedcov [options] [...]\n\n"); - fprintf(stderr, "Options:\n"); - fprintf(stderr, " -Q mapping quality threshold [0]\n"); 
-+ fprintf(stderr, " -X use customized index files\n"); - fprintf(stderr, " -j do not include deletions (D) and ref skips (N) in bedcov computation\n"); -- sam_global_opt_help(stderr, "-.--.-"); -+ sam_global_opt_help(stderr, "-.--.--."); - return 1; - } -+ if (has_index_file) { -+ if ((argc - optind - 1) % 2 != 0) { // Calculate # of input BAM files -+ fprintf(stderr, "ERROR: odd number of filenames detected! Each BAM file should have an index file\n"); -+ return 1; -+ } -+ n = (argc - optind - 1) / 2; -+ } else { -+ n = argc - optind - 1; -+ } -+ - memset(&str, 0, sizeof(kstring_t)); -- n = argc - optind - 1; - aux = calloc(n, sizeof(aux_t*)); - idx = calloc(n, sizeof(hts_idx_t*)); - for (i = 0; i < n; ++i) { - aux[i] = calloc(1, sizeof(aux_t)); - aux[i]->min_mapQ = min_mapQ; - aux[i]->fp = sam_open_format(argv[i+optind+1], "r", &ga.in); -- if (aux[i]->fp) -- idx[i] = sam_index_load(aux[i]->fp, argv[i+optind+1]); -+ if (aux[i]->fp) { -+ // If index filename has not been specfied, look in BAM folder -+ if (has_index_file) { -+ idx[i] = sam_index_load2(aux[i]->fp, argv[i+optind+1], argv[i+optind+n+1]); -+ } else { -+ idx[i] = sam_index_load(aux[i]->fp, argv[i+optind+1]); -+ } -+ } - if (aux[i]->fp == 0 || idx[i] == 0) { - fprintf(stderr, "ERROR: fail to open index BAM file '%s'\n", argv[i+optind+1]); - return 2; -@@ -122,6 +140,10 @@ - cnt = calloc(n, 8); - - fp = gzopen(argv[optind], "rb"); -+ if (fp == NULL) { -+ print_error_errno("bedcov", "can't open BED file '%s'", argv[optind]); -+ return 2; -+ } - ks = ks_init(fp); - n_plp = calloc(n, sizeof(int)); - plp = calloc(n, sizeof(bam_pileup1_t*)); -@@ -186,7 +208,7 @@ - for (i = 0; i < n; ++i) { - if (aux[i]->iter) hts_itr_destroy(aux[i]->iter); - hts_idx_destroy(idx[i]); -- bam_hdr_destroy(aux[i]->header); -+ sam_hdr_destroy(aux[i]->header); - sam_close(aux[i]->fp); - free(aux[i]); - } ---- python-pysam.orig/samtools/bedcov.c.pysam.c -+++ python-pysam/samtools/bedcov.c.pysam.c -@@ -3,7 +3,7 @@ - /* bedcov.c -- 
bedcov subcommand. - - Copyright (C) 2012 Broad Institute. -- Copyright (C) 2013-2014 Genome Research Ltd. -+ Copyright (C) 2013-2014, 2018, 2019 Genome Research Ltd. - - Author: Heng Li - -@@ -36,6 +36,7 @@ - #include "htslib/kstring.h" - #include "htslib/sam.h" - #include "htslib/thread_pool.h" -+#include "samtools.h" - #include "sam_opts.h" - - #include "htslib/kseq.h" -@@ -43,7 +44,7 @@ - - typedef struct { - htsFile *fp; -- bam_hdr_t *header; -+ sam_hdr_t *header; - hts_itr_t *iter; - int min_mapQ; - } aux_t; -@@ -73,7 +74,7 @@ - int *n_plp, dret, i, j, m, n, c, min_mapQ = 0, skip_DN = 0; - int64_t *cnt; - const bam_pileup1_t **plp; -- int usage = 0; -+ int usage = 0, has_index_file = 0; - - sam_global_args ga = SAM_GLOBAL_ARGS_INIT; - static const struct option lopts[] = { -@@ -81,9 +82,10 @@ - { NULL, 0, NULL, 0 } - }; - -- while ((c = getopt_long(argc, argv, "Q:j", lopts, NULL)) >= 0) { -+ while ((c = getopt_long(argc, argv, "Q:Xj", lopts, NULL)) >= 0) { - switch (c) { - case 'Q': min_mapQ = atoi(optarg); break; -+ case 'X': has_index_file = 1; break; - case 'j': skip_DN = 1; break; - default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; - /* else fall-through */ -@@ -95,20 +97,36 @@ - fprintf(samtools_stderr, "Usage: samtools bedcov [options] [...]\n\n"); - fprintf(samtools_stderr, "Options:\n"); - fprintf(samtools_stderr, " -Q mapping quality threshold [0]\n"); -+ fprintf(samtools_stderr, " -X use customized index files\n"); - fprintf(samtools_stderr, " -j do not include deletions (D) and ref skips (N) in bedcov computation\n"); -- sam_global_opt_help(samtools_stderr, "-.--.-"); -+ sam_global_opt_help(samtools_stderr, "-.--.--."); - return 1; - } -+ if (has_index_file) { -+ if ((argc - optind - 1) % 2 != 0) { // Calculate # of input BAM files -+ fprintf(samtools_stderr, "ERROR: odd number of filenames detected! 
Each BAM file should have an index file\n"); -+ return 1; -+ } -+ n = (argc - optind - 1) / 2; -+ } else { -+ n = argc - optind - 1; -+ } -+ - memset(&str, 0, sizeof(kstring_t)); -- n = argc - optind - 1; - aux = calloc(n, sizeof(aux_t*)); - idx = calloc(n, sizeof(hts_idx_t*)); - for (i = 0; i < n; ++i) { - aux[i] = calloc(1, sizeof(aux_t)); - aux[i]->min_mapQ = min_mapQ; - aux[i]->fp = sam_open_format(argv[i+optind+1], "r", &ga.in); -- if (aux[i]->fp) -- idx[i] = sam_index_load(aux[i]->fp, argv[i+optind+1]); -+ if (aux[i]->fp) { -+ // If index filename has not been specfied, look in BAM folder -+ if (has_index_file) { -+ idx[i] = sam_index_load2(aux[i]->fp, argv[i+optind+1], argv[i+optind+n+1]); -+ } else { -+ idx[i] = sam_index_load(aux[i]->fp, argv[i+optind+1]); -+ } -+ } - if (aux[i]->fp == 0 || idx[i] == 0) { - fprintf(samtools_stderr, "ERROR: fail to open index BAM file '%s'\n", argv[i+optind+1]); - return 2; -@@ -124,6 +142,10 @@ - cnt = calloc(n, 8); - - fp = gzopen(argv[optind], "rb"); -+ if (fp == NULL) { -+ print_error_errno("bedcov", "can't open BED file '%s'", argv[optind]); -+ return 2; -+ } - ks = ks_init(fp); - n_plp = calloc(n, sizeof(int)); - plp = calloc(n, sizeof(bam_pileup1_t*)); -@@ -188,7 +210,7 @@ - for (i = 0; i < n; ++i) { - if (aux[i]->iter) hts_itr_destroy(aux[i]->iter); - hts_idx_destroy(idx[i]); -- bam_hdr_destroy(aux[i]->header); -+ sam_hdr_destroy(aux[i]->header); - sam_close(aux[i]->fp); - free(aux[i]); - } ---- python-pysam.orig/samtools/bedidx.c -+++ python-pysam/samtools/bedidx.c -@@ -1,7 +1,7 @@ - /* bedidx.c -- BED file indexing. - - Copyright (C) 2011 Broad Institute. -- Copyright (C) 2014,2017 Genome Research Ltd. -+ Copyright (C) 2014, 2017-2019 Genome Research Ltd. 
- - Author: Heng Li - -@@ -34,26 +34,28 @@ - #include "bedidx.h" - - #include "htslib/ksort.h" --KSORT_INIT_GENERIC(uint64_t) - - #include "htslib/kseq.h" - KSTREAM_INIT(gzFile, gzread, 8192) - -+static inline int lt_pair_pos(hts_pair_pos_t a, hts_pair_pos_t b) { -+ if (a.beg == b.beg) return a.end < b.end; -+ return a.beg < b.beg; -+} -+KSORT_INIT_STATIC(hts_pair_pos_t, hts_pair_pos_t, lt_pair_pos) -+ - /*! @typedef - * @abstract bed_reglist_t - value type of the BED hash table - * This structure encodes the list of intervals (ranges) for the regions provided via BED file or - * command line arguments. -- * @field *a pointer to the array of intervals (kept as 64 bit integers). The upper 32 bits -- * encode the beginning of the interval, while the lower 32 bits encode the end, for easy sorting. -- * |-- 32 bits --|-- 32 bits --| -- * |---- beg ----|---- end ----| -+ * @field *a pointer to the array of intervals. - * @field n actual number of elements contained by a - * @field m number of allocated elements to a (n <= m) - * @field *idx index array for computing the minimum offset - */ - typedef struct { - int n, m; -- uint64_t *a; -+ hts_pair_pos_t *a; - int *idx; - int filter; - } bed_reglist_t; -@@ -71,7 +73,6 @@ - khint_t k; - int i; - const char *reg; -- uint32_t beg, end; - - if (!h) { - printf("Hash table is empty!\n"); -@@ -84,10 +85,8 @@ - if ((p = &kh_val(h,k)) != NULL && p->n > 0) { - printf("Filter: %d\n", p->filter); - for (i=0; in; i++) { -- beg = (uint32_t)(p->a[i]>>32); -- end = (uint32_t)(p->a[i]); -- -- printf("\tinterval[%d]: %d-%d\n",i,beg,end); -+ printf("\tinterval[%d]: %"PRIhts_pos"-%"PRIhts_pos"\n", -+ i,p->a[i].beg,p->a[i].end); - } - } else { - printf("Region '%s' has no intervals!\n", reg); -@@ -97,20 +96,23 @@ - } - #endif - --static int *bed_index_core(int n, uint64_t *a) -+static int *bed_index_core(int n, hts_pair_pos_t *a) - { -- int i, j, l, *idx; -+ int i, j, l, *idx, *new_idx; - l = 0; idx = 0; - for (i = 0; i < n; ++i) { -- int 
beg, end; -- beg = a[i]>>32 >> LIDX_SHIFT; end = ((uint32_t)a[i]) >> LIDX_SHIFT; -+ hts_pos_t beg, end; -+ beg = a[i].beg >> LIDX_SHIFT; end = a[i].end >> LIDX_SHIFT; - if (l < end + 1) { - int old_l = l; - l = end + 1; - kroundup32(l); -- idx = realloc(idx, l * sizeof(int)); -- if (!idx) -+ new_idx = realloc(idx, l * sizeof(*idx)); -+ if (!new_idx) { -+ free(idx); - return NULL; -+ } -+ idx = new_idx; - - for (j = old_l; j < l; ++j) - idx[j] = -1; -@@ -131,19 +133,19 @@ - if (kh_exist(h, k)) { - bed_reglist_t *p = &kh_val(h, k); - if (p->idx) free(p->idx); -- ks_introsort(uint64_t, p->n, p->a); -+ ks_introsort(hts_pair_pos_t, p->n, p->a); - p->idx = bed_index_core(p->n, p->a); - } - } - } - --static int bed_minoff(const bed_reglist_t *p, unsigned int beg, unsigned int end) { -+static int bed_minoff(const bed_reglist_t *p, hts_pos_t beg, hts_pos_t end) { - int i, min_off=0; - - if (p && p->idx) { - min_off = (beg>>LIDX_SHIFT >= p->n)? p->idx[p->n-1] : p->idx[beg>>LIDX_SHIFT]; - if (min_off < 0) { // TODO: this block can be improved, but speed should not matter too much here -- int n = beg>>LIDX_SHIFT; -+ hts_pos_t n = beg>>LIDX_SHIFT; - if (n > p->n) - n = p->n; - for (i = n - 1; i >= 0; --i) -@@ -156,21 +158,21 @@ - return min_off; - } - --static int bed_overlap_core(const bed_reglist_t *p, int beg, int end) -+static int bed_overlap_core(const bed_reglist_t *p, hts_pos_t beg, hts_pos_t end) - { - int i, min_off; - if (p->n == 0) return 0; - min_off = bed_minoff(p, beg, end); - - for (i = min_off; i < p->n; ++i) { -- if ((int)(p->a[i]>>32) >= end) break; // out of range; no need to proceed -- if ((int32_t)p->a[i] > beg && (int32_t)(p->a[i]>>32) < end) -+ if (p->a[i].beg >= end) break; // out of range; no need to proceed -+ if (p->a[i].end > beg && p->a[i].beg < end) - return 1; // find the overlap; return - } - return 0; - } - --int bed_overlap(const void *_h, const char *chr, int beg, int end) -+int bed_overlap(const void *_h, const char *chr, hts_pos_t beg, 
hts_pos_t end) - { - const reghash_t *h = (const reghash_t*)_h; - khint_t k; -@@ -202,11 +204,11 @@ - continue; - - for (new_n = 0, j = 1; j < p->n; j++) { -- if ((uint32_t)p->a[new_n] < (uint32_t)(p->a[j]>>32)) { -+ if (p->a[new_n].end < p->a[j].beg) { - p->a[++new_n] = p->a[j]; - } else { -- if ((uint32_t)p->a[new_n] < (uint32_t)p->a[j]) -- p->a[new_n] = (p->a[new_n] & 0xFFFFFFFF00000000) | (uint32_t)(p->a[j]); -+ if (p->a[new_n].end < p->a[j].end) -+ p->a[new_n].end = p->a[j].end; - } - } - -@@ -260,13 +262,17 @@ - if (fp == 0) return 0; - ks = ks_init(fp); - if (NULL == ks) goto fail; // In case ks_init ever gets error checking... -- while (ks_getuntil(ks, KS_SEP_LINE, &str, &dret) > 0) { // read a line -+ int ks_len; -+ while ((ks_len = ks_getuntil(ks, KS_SEP_LINE, &str, &dret)) >= 0) { // read a line - char *ref = str.s, *ref_end; -- unsigned int beg = 0, end = 0; -+ uint64_t beg = 0, end = 0; - int num = 0; - khint_t k; - bed_reglist_t *p; - -+ if (ks_len == 0) -+ continue; // skip blank lines -+ - line++; - while (*ref && isspace(*ref)) ref++; - if ('\0' == *ref) continue; // Skip blank lines -@@ -275,7 +281,7 @@ - while (*ref_end && !isspace(*ref_end)) ref_end++; - if ('\0' != *ref_end) { - *ref_end = '\0'; // terminate ref and look for start, end -- num = sscanf(ref_end + 1, "%u %u", &beg, &end); -+ num = sscanf(ref_end + 1, "%"SCNu64" %"SCNu64, &beg, &end); - } - if (1 == num) { // VCF-style format - end = beg--; // Counts from 1 instead of 0 for BED files -@@ -293,7 +299,8 @@ - } else { - fprintf(stderr, - "[bed_read] Parse error reading \"%s\" at line %u : " -- "end (%u) must not be less than start (%u)\n", -+ "end (%"PRIu64") must not be less " -+ "than start (%"PRIu64")\n", - fn, line, end, beg); - } - errno = 0; // Prevent caller from printing misleading error messages -@@ -318,16 +325,21 @@ - // Add begin,end to the list - if (p->n == p->m) { - p->m = p->m ? 
p->m<<1 : 4; -- p->a = realloc(p->a, p->m * sizeof(uint64_t)); -- if (NULL == p->a) goto fail; -+ hts_pair_pos_t *new_a = realloc(p->a, p->m * sizeof(p->a[0])); -+ if (NULL == new_a) goto fail; -+ p->a = new_a; - } -- p->a[p->n++] = (uint64_t)beg<<32 | end; -+ p->a[p->n].beg = beg; -+ p->a[p->n++].end = end; - } - // FIXME: Need to check for errors in ks_getuntil. At the moment it - // doesn't look like it can return one. Possibly use gzgets instead? - -+ if (gzclose(fp) != Z_OK) { -+ fp = NULL; -+ goto fail; -+ } - ks_destroy(ks); -- gzclose(fp); - free(str.s); - bed_index(h); - //bed_unify(h); -@@ -361,7 +373,7 @@ - kh_destroy(reg, h); - } - --static void *bed_insert(void *reg_hash, char *reg, unsigned int beg, unsigned int end) { -+static void *bed_insert(void *reg_hash, char *reg, hts_pos_t beg, hts_pos_t end) { - - reghash_t *h; - khint_t k; -@@ -390,10 +402,12 @@ - // Add beg and end to the list - if (p->n == p->m) { - p->m = p->m ? p->m<<1 : 4; -- p->a = realloc(p->a, p->m * sizeof(uint64_t)); -- if (NULL == p->a) goto fail; -+ hts_pair_pos_t *new_a = realloc(p->a, p->m * sizeof(p->a[0])); -+ if (NULL == new_a) goto fail; -+ p->a = new_a; - } -- p->a[p->n++] = (uint64_t)beg<<32 | end; -+ p->a[p->n].beg = beg; -+ p->a[p->n++].end = end; - - fail: - return h; -@@ -413,10 +427,10 @@ - reghash_t *t; - bed_reglist_t *p, *q; - khint_t l, k; -- uint64_t *new_a; -+ hts_pair_pos_t *new_a; - int i, j, new_n, min_off; - const char *reg; -- uint32_t beg, end; -+ hts_pos_t beg, end; - - h = (reghash_t *)reg_hash; - t = (reghash_t *)tmp_hash; -@@ -434,20 +448,21 @@ - if (k == kh_end(h) || !(p = &kh_val(h, k)) || !(p->n)) - continue; - -- new_a = (uint64_t *)calloc(q->n + p->n, sizeof(uint64_t)); -+ new_a = calloc(q->n + p->n, sizeof(new_a[0])); - if (!new_a) - return NULL; - new_n = 0; - - for (i = 0; i < q->n; i++) { -- beg = (uint32_t)(q->a[i]>>32); -- end = (uint32_t)(q->a[i]); -+ beg = q->a[i].beg; -+ end = q->a[i].end; - - min_off = bed_minoff(p, beg, end); - for (j 
= min_off; j < p->n; ++j) { -- if ((uint32_t)(p->a[j]>>32) >= end) break; // out of range; no need to proceed -- if ((uint32_t)(p->a[j]) > beg && (uint32_t)(p->a[j]>>32) < end) { -- new_a[new_n++] = ((uint64_t)MAX((uint32_t)(p->a[j]>>32), beg) << 32) | MIN((uint32_t)p->a[j], end); -+ if (p->a[j].beg >= end) break; // out of range; no need to proceed -+ if (p->a[j].end > beg && p->a[j].beg < end) { -+ new_a[new_n].beg = MAX(p->a[j].beg, beg); -+ new_a[new_n++].end = MIN(p->a[j].end, end); - } - } - } -@@ -494,6 +509,11 @@ - - for (i=first; i 1024) { -@@ -596,8 +616,8 @@ - reglist[count].max_end = 0; - - for (j = 0; j < p->n; j++) { -- reglist[count].intervals[j].beg = (uint32_t)(p->a[j]>>32); -- reglist[count].intervals[j].end = (uint32_t)(p->a[j]); -+ reglist[count].intervals[j].beg = p->a[j].beg; -+ reglist[count].intervals[j].end = p->a[j].end; - - if (reglist[count].intervals[j].end > reglist[count].max_end) - reglist[count].max_end = reglist[count].intervals[j].end; ---- python-pysam.orig/samtools/bedidx.c.pysam.c -+++ python-pysam/samtools/bedidx.c.pysam.c -@@ -3,7 +3,7 @@ - /* bedidx.c -- BED file indexing. - - Copyright (C) 2011 Broad Institute. -- Copyright (C) 2014,2017 Genome Research Ltd. -+ Copyright (C) 2014, 2017-2019 Genome Research Ltd. - - Author: Heng Li - -@@ -36,26 +36,28 @@ - #include "bedidx.h" - - #include "htslib/ksort.h" --KSORT_INIT_GENERIC(uint64_t) - - #include "htslib/kseq.h" - KSTREAM_INIT(gzFile, gzread, 8192) - -+static inline int lt_pair_pos(hts_pair_pos_t a, hts_pair_pos_t b) { -+ if (a.beg == b.beg) return a.end < b.end; -+ return a.beg < b.beg; -+} -+KSORT_INIT_STATIC(hts_pair_pos_t, hts_pair_pos_t, lt_pair_pos) -+ - /*! @typedef - * @abstract bed_reglist_t - value type of the BED hash table - * This structure encodes the list of intervals (ranges) for the regions provided via BED file or - * command line arguments. -- * @field *a pointer to the array of intervals (kept as 64 bit integers). 
The upper 32 bits -- * encode the beginning of the interval, while the lower 32 bits encode the end, for easy sorting. -- * |-- 32 bits --|-- 32 bits --| -- * |---- beg ----|---- end ----| -+ * @field *a pointer to the array of intervals. - * @field n actual number of elements contained by a - * @field m number of allocated elements to a (n <= m) - * @field *idx index array for computing the minimum offset - */ - typedef struct { - int n, m; -- uint64_t *a; -+ hts_pair_pos_t *a; - int *idx; - int filter; - } bed_reglist_t; -@@ -73,7 +75,6 @@ - khint_t k; - int i; - const char *reg; -- uint32_t beg, end; - - if (!h) { - fprintf(samtools_stdout, "Hash table is empty!\n"); -@@ -86,10 +87,8 @@ - if ((p = &kh_val(h,k)) != NULL && p->n > 0) { - fprintf(samtools_stdout, "Filter: %d\n", p->filter); - for (i=0; in; i++) { -- beg = (uint32_t)(p->a[i]>>32); -- end = (uint32_t)(p->a[i]); -- -- fprintf(samtools_stdout, "\tinterval[%d]: %d-%d\n",i,beg,end); -+ fprintf(samtools_stdout, "\tinterval[%d]: %"PRIhts_pos"-%"PRIhts_pos"\n", -+ i,p->a[i].beg,p->a[i].end); - } - } else { - fprintf(samtools_stdout, "Region '%s' has no intervals!\n", reg); -@@ -99,20 +98,23 @@ - } - #endif - --static int *bed_index_core(int n, uint64_t *a) -+static int *bed_index_core(int n, hts_pair_pos_t *a) - { -- int i, j, l, *idx; -+ int i, j, l, *idx, *new_idx; - l = 0; idx = 0; - for (i = 0; i < n; ++i) { -- int beg, end; -- beg = a[i]>>32 >> LIDX_SHIFT; end = ((uint32_t)a[i]) >> LIDX_SHIFT; -+ hts_pos_t beg, end; -+ beg = a[i].beg >> LIDX_SHIFT; end = a[i].end >> LIDX_SHIFT; - if (l < end + 1) { - int old_l = l; - l = end + 1; - kroundup32(l); -- idx = realloc(idx, l * sizeof(int)); -- if (!idx) -+ new_idx = realloc(idx, l * sizeof(*idx)); -+ if (!new_idx) { -+ free(idx); - return NULL; -+ } -+ idx = new_idx; - - for (j = old_l; j < l; ++j) - idx[j] = -1; -@@ -133,19 +135,19 @@ - if (kh_exist(h, k)) { - bed_reglist_t *p = &kh_val(h, k); - if (p->idx) free(p->idx); -- ks_introsort(uint64_t, p->n, 
p->a); -+ ks_introsort(hts_pair_pos_t, p->n, p->a); - p->idx = bed_index_core(p->n, p->a); - } - } - } - --static int bed_minoff(const bed_reglist_t *p, unsigned int beg, unsigned int end) { -+static int bed_minoff(const bed_reglist_t *p, hts_pos_t beg, hts_pos_t end) { - int i, min_off=0; - - if (p && p->idx) { - min_off = (beg>>LIDX_SHIFT >= p->n)? p->idx[p->n-1] : p->idx[beg>>LIDX_SHIFT]; - if (min_off < 0) { // TODO: this block can be improved, but speed should not matter too much here -- int n = beg>>LIDX_SHIFT; -+ hts_pos_t n = beg>>LIDX_SHIFT; - if (n > p->n) - n = p->n; - for (i = n - 1; i >= 0; --i) -@@ -158,21 +160,21 @@ - return min_off; - } - --static int bed_overlap_core(const bed_reglist_t *p, int beg, int end) -+static int bed_overlap_core(const bed_reglist_t *p, hts_pos_t beg, hts_pos_t end) - { - int i, min_off; - if (p->n == 0) return 0; - min_off = bed_minoff(p, beg, end); - - for (i = min_off; i < p->n; ++i) { -- if ((int)(p->a[i]>>32) >= end) break; // out of range; no need to proceed -- if ((int32_t)p->a[i] > beg && (int32_t)(p->a[i]>>32) < end) -+ if (p->a[i].beg >= end) break; // out of range; no need to proceed -+ if (p->a[i].end > beg && p->a[i].beg < end) - return 1; // find the overlap; return - } - return 0; - } - --int bed_overlap(const void *_h, const char *chr, int beg, int end) -+int bed_overlap(const void *_h, const char *chr, hts_pos_t beg, hts_pos_t end) - { - const reghash_t *h = (const reghash_t*)_h; - khint_t k; -@@ -204,11 +206,11 @@ - continue; - - for (new_n = 0, j = 1; j < p->n; j++) { -- if ((uint32_t)p->a[new_n] < (uint32_t)(p->a[j]>>32)) { -+ if (p->a[new_n].end < p->a[j].beg) { - p->a[++new_n] = p->a[j]; - } else { -- if ((uint32_t)p->a[new_n] < (uint32_t)p->a[j]) -- p->a[new_n] = (p->a[new_n] & 0xFFFFFFFF00000000) | (uint32_t)(p->a[j]); -+ if (p->a[new_n].end < p->a[j].end) -+ p->a[new_n].end = p->a[j].end; - } - } - -@@ -262,13 +264,17 @@ - if (fp == 0) return 0; - ks = ks_init(fp); - if (NULL == ks) goto fail; // In 
case ks_init ever gets error checking... -- while (ks_getuntil(ks, KS_SEP_LINE, &str, &dret) > 0) { // read a line -+ int ks_len; -+ while ((ks_len = ks_getuntil(ks, KS_SEP_LINE, &str, &dret)) >= 0) { // read a line - char *ref = str.s, *ref_end; -- unsigned int beg = 0, end = 0; -+ uint64_t beg = 0, end = 0; - int num = 0; - khint_t k; - bed_reglist_t *p; - -+ if (ks_len == 0) -+ continue; // skip blank lines -+ - line++; - while (*ref && isspace(*ref)) ref++; - if ('\0' == *ref) continue; // Skip blank lines -@@ -277,7 +283,7 @@ - while (*ref_end && !isspace(*ref_end)) ref_end++; - if ('\0' != *ref_end) { - *ref_end = '\0'; // terminate ref and look for start, end -- num = sscanf(ref_end + 1, "%u %u", &beg, &end); -+ num = sscanf(ref_end + 1, "%"SCNu64" %"SCNu64, &beg, &end); - } - if (1 == num) { // VCF-style format - end = beg--; // Counts from 1 instead of 0 for BED files -@@ -295,7 +301,8 @@ - } else { - fprintf(samtools_stderr, - "[bed_read] Parse error reading \"%s\" at line %u : " -- "end (%u) must not be less than start (%u)\n", -+ "end (%"PRIu64") must not be less " -+ "than start (%"PRIu64")\n", - fn, line, end, beg); - } - errno = 0; // Prevent caller from printing misleading error messages -@@ -320,16 +327,21 @@ - // Add begin,end to the list - if (p->n == p->m) { - p->m = p->m ? p->m<<1 : 4; -- p->a = realloc(p->a, p->m * sizeof(uint64_t)); -- if (NULL == p->a) goto fail; -+ hts_pair_pos_t *new_a = realloc(p->a, p->m * sizeof(p->a[0])); -+ if (NULL == new_a) goto fail; -+ p->a = new_a; - } -- p->a[p->n++] = (uint64_t)beg<<32 | end; -+ p->a[p->n].beg = beg; -+ p->a[p->n++].end = end; - } - // FIXME: Need to check for errors in ks_getuntil. At the moment it - // doesn't look like it can return one. Possibly use gzgets instead? 
- -+ if (gzclose(fp) != Z_OK) { -+ fp = NULL; -+ goto fail; -+ } - ks_destroy(ks); -- gzclose(fp); - free(str.s); - bed_index(h); - //bed_unify(h); -@@ -363,7 +375,7 @@ - kh_destroy(reg, h); - } - --static void *bed_insert(void *reg_hash, char *reg, unsigned int beg, unsigned int end) { -+static void *bed_insert(void *reg_hash, char *reg, hts_pos_t beg, hts_pos_t end) { - - reghash_t *h; - khint_t k; -@@ -392,10 +404,12 @@ - // Add beg and end to the list - if (p->n == p->m) { - p->m = p->m ? p->m<<1 : 4; -- p->a = realloc(p->a, p->m * sizeof(uint64_t)); -- if (NULL == p->a) goto fail; -+ hts_pair_pos_t *new_a = realloc(p->a, p->m * sizeof(p->a[0])); -+ if (NULL == new_a) goto fail; -+ p->a = new_a; - } -- p->a[p->n++] = (uint64_t)beg<<32 | end; -+ p->a[p->n].beg = beg; -+ p->a[p->n++].end = end; - - fail: - return h; -@@ -415,10 +429,10 @@ - reghash_t *t; - bed_reglist_t *p, *q; - khint_t l, k; -- uint64_t *new_a; -+ hts_pair_pos_t *new_a; - int i, j, new_n, min_off; - const char *reg; -- uint32_t beg, end; -+ hts_pos_t beg, end; - - h = (reghash_t *)reg_hash; - t = (reghash_t *)tmp_hash; -@@ -436,20 +450,21 @@ - if (k == kh_end(h) || !(p = &kh_val(h, k)) || !(p->n)) - continue; - -- new_a = (uint64_t *)calloc(q->n + p->n, sizeof(uint64_t)); -+ new_a = calloc(q->n + p->n, sizeof(new_a[0])); - if (!new_a) - return NULL; - new_n = 0; - - for (i = 0; i < q->n; i++) { -- beg = (uint32_t)(q->a[i]>>32); -- end = (uint32_t)(q->a[i]); -+ beg = q->a[i].beg; -+ end = q->a[i].end; - - min_off = bed_minoff(p, beg, end); - for (j = min_off; j < p->n; ++j) { -- if ((uint32_t)(p->a[j]>>32) >= end) break; // out of range; no need to proceed -- if ((uint32_t)(p->a[j]) > beg && (uint32_t)(p->a[j]>>32) < end) { -- new_a[new_n++] = ((uint64_t)MAX((uint32_t)(p->a[j]>>32), beg) << 32) | MIN((uint32_t)p->a[j], end); -+ if (p->a[j].beg >= end) break; // out of range; no need to proceed -+ if (p->a[j].end > beg && p->a[j].beg < end) { -+ new_a[new_n].beg = MAX(p->a[j].beg, beg); -+ 
new_a[new_n++].end = MIN(p->a[j].end, end); - } - } - } -@@ -496,6 +511,11 @@ - - for (i=first; i 1024) { -@@ -598,8 +618,8 @@ - reglist[count].max_end = 0; - - for (j = 0; j < p->n; j++) { -- reglist[count].intervals[j].beg = (uint32_t)(p->a[j]>>32); -- reglist[count].intervals[j].end = (uint32_t)(p->a[j]); -+ reglist[count].intervals[j].beg = p->a[j].beg; -+ reglist[count].intervals[j].end = p->a[j].end; - - if (reglist[count].intervals[j].end > reglist[count].max_end) - reglist[count].max_end = reglist[count].intervals[j].end; ---- python-pysam.orig/samtools/bedidx.h -+++ python-pysam/samtools/bedidx.h -@@ -36,7 +36,7 @@ - - void *bed_read(const char *fn); - void bed_destroy(void *_h); --int bed_overlap(const void *_h, const char *chr, int beg, int end); -+int bed_overlap(const void *_h, const char *chr, hts_pos_t beg, hts_pos_t end); - void *bed_hash_regions(void *reg_hash, char **regs, int first, int last, int *op); - const char* bed_get(void *reg_hash, int index, int filter); - hts_reglist_t *bed_reglist(void *reg_hash, int filter, int *count_regs); ---- /dev/null -+++ python-pysam/samtools/coverage.c -@@ -0,0 +1,702 @@ -+/* coverage.c -- samtools coverage subcommand -+ -+ Copyright (C) 2018,2019 Florian Breitwieser -+ Portions copyright (C) 2019 Genome Research Ltd. -+ -+ Author: Florian P Breitwieser -+ -+Permission is hereby granted, free of charge, to any person obtaining a copy -+of this software and associated documentation files (the "Software"), to deal -+in the Software without restriction, including without limitation the rights -+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -+copies of the Software, and to permit persons to whom the Software is -+furnished to do so, subject to the following conditions: -+ -+The above copyright notice and this permission notice shall be included in -+all copies or substantial portions of the Software. 
-+ -+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -+DEALINGS IN THE SOFTWARE. */ -+ -+/* This program calculates coverage from multiple BAMs -+ * simutaneously, to achieve random access and to use the BED interface. -+ * To compile this program separately, you may: -+ * -+ * gcc -g -O2 -Wall -o bamcov -D_MAIN_BAMCOV coverage.c -lhts -lz -+ */ -+ -+// C headers -+#include -+ -+#include -+#include -+#include -+#include // variadic functions -+#include // INT_MAX -+#include // round -+#include -+#include -+#include -+#include -+ -+#ifdef _WIN32 -+#include -+#else -+#include -+#endif -+ -+#include "htslib/sam.h" -+#include "htslib/hts.h" -+#include "samtools.h" -+#include "sam_opts.h" -+ -+const char *VERSION = "0.1"; -+ -+typedef struct { // auxiliary data structure to hold a BAM file -+ samFile *fp; // file handle -+ sam_hdr_t *hdr; // file header -+ hts_itr_t *iter; // iterator to a region - NULL for us by default -+ int min_mapQ; // mapQ filter -+ int min_len; // length filter -+ unsigned int n_reads; // records the number of reads seen in file -+ unsigned int n_selected_reads; // records the number of reads passing filter -+ unsigned long summed_mapQ; // summed mapQ of all reads passing filter -+ int fail_flags; -+ int required_flags; -+} bam_aux_t; -+ -+typedef struct { // auxiliary data structure to hold stats on coverage -+ unsigned long long n_covered_bases; -+ unsigned long long summed_coverage; -+ unsigned long long summed_baseQ; -+ unsigned long long summed_mapQ; -+ unsigned int n_reads; -+ unsigned int n_selected_reads; -+ int32_t tid; // chromosome ID, defined by 
header -+ hts_pos_t beg; -+ hts_pos_t end; -+ int64_t bin_width; -+} stats_aux_t; -+ -+#if __STDC_VERSION__ >= 199901L -+#define VERTICAL_LINE "\u2502" // BOX DRAWINGS LIGHT VERTICAL -+ -+// UTF8 specifies block characters in eights going from \u2581 (lower one eight block) to \u2588 (full block) -+// https://en.wikipedia.org/wiki/Block_Elements -+// LOWER ONE EIGHTH BLOCK … FULL BLOCK -+static const char *const BLOCK_CHARS8[8] = {"\u2581", "\u2582", "\u2583", "\u2584", "\u2585", "\u2586", "\u2587", "\u2588"}; -+// In some terminals / with some fonts not all UTF8 block characters are supported (e.g. Putty). Use only half and full block for those -+static const char *const BLOCK_CHARS2[2] = {"\u2584", "\u2588"}; -+ -+#else -+ -+// Fall back to explicit UTF-8 encodings of the same characters -+#define VERTICAL_LINE "\xE2\x94\x82" -+ -+static const char *const BLOCK_CHARS8[8] = { -+ "\xE2\x96\x81", "\xE2\x96\x82", "\xE2\x96\x83", "\xE2\x96\x84", -+ "\xE2\x96\x85", "\xE2\x96\x86", "\xE2\x96\x87", "\xE2\x96\x88" }; -+ -+static const char *const BLOCK_CHARS2[2] = {"\xE2\x96\x84", "\xE2\x96\x88"}; -+ -+#endif -+ -+// in bam_plcmd.c -+int read_file_list(const char *file_list, int *n, char **argv[]); -+ -+static int usage() { -+ fprintf(stdout, "Usage: samtools coverage [options] in1.bam [in2.bam [...]]\n\n" -+ "Input options:\n" -+ " -b, --bam-list FILE list of input BAM filenames, one per line\n" -+ " -l, --min-read-len INT ignore reads shorter than INT bp [0]\n" -+ " -q, --min-MQ INT base quality threshold [0]\n" -+ " -Q, --min-BQ INT mapping quality threshold [0]\n" -+ " --rf required flags: skip reads with mask bits unset []\n" -+ " --ff filter flags: skip reads with mask bits set \n" -+ " [UNMAP,SECONDARY,QCFAIL,DUP]\n" -+ "Output options:\n" -+ " -m, --histogram show histogram instead of tabular output\n" -+ " -A, --ascii show only ASCII characters in histogram\n" -+ " -o, --output FILE write output to FILE [stdout]\n" -+ " -H, --no-header don't print a header in 
tabular mode\n" -+ " -w, --n-bins INT number of bins in histogram [terminal width - 40]\n" -+ " -r, --region REG show specified region. Format: chr:start-end. \n" -+ " -h, --help help (this page)\n"); -+ -+ fprintf(stdout, "\nGeneric options:\n"); -+ sam_global_opt_help(stdout, "-.--.--."); -+ -+ fprintf(stdout, -+ "\nSee manpage for additional details.\n" -+ " rname Reference name / chromosome\n" -+ " startpos Start position\n" -+ " endpos End position (or sequence length)\n" -+ " numreads Number reads aligned to the region (after filtering)\n" -+ " covbases Number of covered bases with depth >= 1\n" -+ " coverage Proportion of covered bases [0..1]\n" -+ " meandepth Mean depth of coverage\n" -+ " meanbaseq Mean baseQ in covered region\n" -+ " meanmapq Mean mapQ of selected reads\n" -+ ); -+ -+ return EXIT_SUCCESS; -+} -+ -+static char* center_text(char *text, char *buf, int width) { -+ int len = strlen(text); -+ assert(len <= width); -+ int padding = (width - len) / 2; -+ int padding_ex = (width - len) % 2; -+ if (padding >= 1) -+ sprintf(buf, " %*s%*s", len+padding, text, padding-1+padding_ex, " "); -+ else -+ sprintf(buf, "%s", text); -+ -+ return buf; -+} -+ -+static char* readable_bps(double base_pairs, char *buf) { -+ const char* units[] = {"", "K", "M", "G", "T"}; -+ int i = 0; -+ while (base_pairs >= 1000 && i < (sizeof(units)/sizeof(units[0]) - 1)) { -+ base_pairs /= 1000; -+ i++; -+ } -+ sprintf(buf, "%.*f%s", i, base_pairs, units[i]); -+ return buf; -+} -+ -+static void set_read_counts(bam_aux_t **data, stats_aux_t *stats, int n_bam_files) { -+ int i; -+ stats->n_reads = 0; -+ stats->n_selected_reads = 0; -+ stats->summed_mapQ = 0; -+ for (i = 0; i < n_bam_files && data[i]; ++i) { -+ stats->n_reads += data[i]->n_reads; -+ stats->n_selected_reads += data[i]->n_selected_reads; -+ stats->summed_mapQ += data[i]->summed_mapQ; -+ data[i]->n_reads = 0; -+ data[i]->n_selected_reads = 0; -+ data[i]->summed_mapQ = 0; -+ } -+} -+ -+// read one alignment from one 
BAM file -+static int read_bam(void *data, bam1_t *b) { -+ bam_aux_t *aux = (bam_aux_t*)data; // data in fact is a pointer to an auxiliary structure -+ int ret; -+ while (1) { -+ if((ret = aux->iter? sam_itr_next(aux->fp, aux->iter, b) : sam_read1(aux->fp, aux->hdr, b)) < 0) break; -+ ++aux->n_reads; -+ -+ if ( aux->fail_flags && (b->core.flag & aux->fail_flags) ) continue; -+ if ( aux->required_flags && !(b->core.flag & aux->required_flags) ) continue; -+ if ( b->core.qual < aux->min_mapQ ) continue; -+ if ( aux->min_len && bam_cigar2qlen(b->core.n_cigar, bam_get_cigar(b)) < aux->min_len ) continue; -+ ++aux->n_selected_reads; -+ aux->summed_mapQ += b->core.qual; -+ break; -+ } -+ return ret; -+} -+ -+void print_tabular_line(FILE *file_out, const sam_hdr_t *h, const stats_aux_t *stats) { -+ fputs(sam_hdr_tid2name(h, stats->tid), file_out); -+ double region_len = (double) stats->end - stats->beg; -+ fprintf(file_out, "\t%"PRId64"\t%"PRId64"\t%u\t%llu\t%g\t%g\t%.3g\t%.3g\n", -+ stats->beg+1, -+ stats->end, -+ stats->n_selected_reads, -+ stats->n_covered_bases, -+ 100.0 * stats->n_covered_bases / region_len, -+ stats->summed_coverage / region_len, -+ stats->summed_coverage > 0? stats->summed_baseQ/(double) stats->summed_coverage : 0, -+ stats->n_selected_reads > 0? stats->summed_mapQ/(double) stats->n_selected_reads : 0 -+ ); -+} -+ -+void print_hist(FILE *file_out, const sam_hdr_t *h, const stats_aux_t *stats, const uint32_t *hist, -+ const int hist_size, const bool full_utf) { -+ int i, col; -+ bool show_percentiles = false; -+ const int n_rows = 10; -+ const char * const * BLOCK_CHARS = full_utf? BLOCK_CHARS8 : BLOCK_CHARS2; -+ const int blockchar_len = full_utf? 
8 : 2; -+ /* -+ if (stats->beg == 0) { -+ stats->end = h->target_len[stats->tid]; -+ } -+ */ -+ double region_len = stats->end - stats->beg; -+ -+ // Calculate histogram that contains percent covered -+ double hist_data[hist_size]; -+ double max_val = 0.0; -+ for (i = 0; i < hist_size; ++i) { -+ hist_data[i] = 100 * hist[i] / (double) stats->bin_width; -+ if (hist_data[i] > max_val) max_val = hist_data[i]; -+ } -+ -+ char buf[30]; -+ fprintf(file_out, "%s (%sbp)\n", sam_hdr_tid2name(h, stats->tid), readable_bps(sam_hdr_tid2len(h, stats->tid), buf)); -+ -+ double row_bin_size = max_val / (double) n_rows; -+ for (i = n_rows-1; i >= 0; --i) { -+ double current_bin = row_bin_size * i; -+ if (show_percentiles) { -+ fprintf(file_out, ">%3i%% ", i*10); -+ } else { -+ fprintf(file_out, ">%7.2f%% ", current_bin); -+ } -+ fprintf(file_out, VERTICAL_LINE); -+ for (col = 0; col < hist_size; ++col) { -+ // get the difference in eights, or halfs when full UTF8 is not supported -+ int cur_val_diff = round(blockchar_len * (hist_data[col] - current_bin) / row_bin_size) - 1; -+ if (cur_val_diff < 0) { -+ fputc(' ', file_out); -+ } else { -+ if (cur_val_diff >= blockchar_len) -+ cur_val_diff = blockchar_len - 1; -+ -+ fprintf(file_out, "%s", BLOCK_CHARS[cur_val_diff]); -+ } -+ } -+ fprintf(file_out, VERTICAL_LINE); -+ fputc(' ', file_out); -+ switch (i) { -+ case 9: fprintf(file_out, "Number of reads: %i", stats->n_selected_reads); break; -+ case 8: if (stats->n_reads - stats->n_selected_reads > 0) fprintf(file_out, " (%i filtered)", stats->n_reads - stats->n_selected_reads); break; -+ case 7: fprintf(file_out, "Covered bases: %sbp", readable_bps(stats->n_covered_bases, buf)); break; -+ case 6: fprintf(file_out, "Percent covered: %.4g%%", -+ 100.0 * stats->n_covered_bases / region_len); break; -+ case 5: fprintf(file_out, "Mean coverage: %.3gx", -+ stats->summed_coverage / region_len); break; -+ case 4: fprintf(file_out, "Mean baseQ: %.3g", -+ stats->summed_baseQ/(double) 
stats->summed_coverage); break; -+ case 3: fprintf(file_out, "Mean mapQ: %.3g", -+ stats->summed_mapQ/(double) stats->n_selected_reads); break; -+ case 1: fprintf(file_out, "Histo bin width: %sbp", -+ readable_bps(stats->bin_width, buf)); break; -+ case 0: fprintf(file_out, "Histo max bin: %.5g%%", max_val); break; -+ }; -+ fputc('\n', file_out); -+ } -+ -+ // print x axis. Could be made pretty for widths that are not divisible -+ // by 10 by variable spacing of the labels, instead of placing a label every 10 characters -+ char buf2[50]; -+ fprintf(file_out, " %s", center_text(readable_bps(stats->beg + 1, buf), buf2, 10)); -+ int rest; -+ for (rest = 10; rest < 10*(hist_size/10); rest += 10) { -+ fprintf(file_out, "%s", center_text(readable_bps(stats->beg + stats->bin_width*rest, buf), buf2, 10)); -+ } -+ int last_padding = hist_size%10; -+ fprintf(file_out, "%*s%s", last_padding, " ", center_text(readable_bps(stats->end, buf), buf2, 10)); -+ fprintf(file_out, "\n"); -+} -+ -+int main_coverage(int argc, char *argv[]) { -+ int status = EXIT_SUCCESS; -+ -+ int ret, tid, pos, i, j; -+ -+ int max_depth = 0; -+ int opt_min_baseQ = 0; -+ int opt_min_mapQ = 0; -+ int opt_min_len = 0; -+ int opt_n_bins = 50; -+ bool opt_full_width = true; -+ char *opt_output_file = NULL; -+ bam_aux_t **data = NULL; -+ bam_mplp_t mplp = NULL; -+ const bam_pileup1_t **plp = NULL; -+ uint32_t *hist = NULL; -+ stats_aux_t *stats = NULL; -+ char *opt_reg = 0; // specified region -+ char *opt_file_list = NULL; -+ int n_bam_files = 0; -+ char **fn = NULL; -+ int fail_flags = (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP); // Default fail flags -+ int required_flags = 0; -+ -+ int *n_plp = NULL; -+ sam_hdr_t *h = NULL; // BAM header of the 1st input -+ -+ bool opt_print_header = true; -+ bool opt_print_tabular = true; -+ bool opt_print_histogram = false; -+ bool *covered_tids = NULL; -+ bool opt_full_utf = true; -+ -+ FILE *file_out = stdout; -+ -+ sam_global_args ga = 
SAM_GLOBAL_ARGS_INIT; -+ static const struct option lopts[] = { -+ SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0, '-'), -+ {"rf", required_argument, NULL, 1}, // require flag -+ {"ff", required_argument, NULL, 2}, // filter flag -+ {"incl-flags", required_argument, NULL, 1}, // require flag -+ {"excl-flags", required_argument, NULL, 2}, // filter flag -+ {"bam-list", required_argument, NULL, 'b'}, -+ {"min-read-len", required_argument, NULL, 'L'}, -+ {"min-MQ", required_argument, NULL, 'q'}, -+ {"min-mq", required_argument, NULL, 'q'}, -+ {"min-BQ", required_argument, NULL, 'Q'}, -+ {"min-bq", required_argument, NULL, 'Q'}, -+ {"histogram", no_argument, NULL, 'm'}, -+ {"ascii", no_argument, NULL, 'A'}, -+ {"output", required_argument, NULL, 'o'}, -+ {"no-header", no_argument, NULL, 'H'}, -+ {"n-bins", required_argument, NULL, 'w'}, -+ {"region", required_argument, NULL, 'r'}, -+ {"help", no_argument, NULL, 'h'}, -+ { NULL, 0, NULL, 0 } -+ }; -+ -+ // parse the command line -+ int c; -+ opterr = 0; -+ while ((c = getopt_long(argc, argv, "Ao:L:q:Q:hHw:r:b:m", lopts, NULL)) != -1) { -+ switch (c) { -+ case 1: -+ if ((required_flags = bam_str2flag(optarg)) < 0) { -+ fprintf(stderr,"Could not parse --rf %s\n", optarg); return EXIT_FAILURE; -+ }; break; -+ case 2: -+ if ((fail_flags = bam_str2flag(optarg)) < 0) { -+ fprintf(stderr,"Could not parse --ff %s\n", optarg); return EXIT_FAILURE; -+ }; break; -+ case 'o': opt_output_file = optarg; opt_full_width = false; break; -+ case 'L': opt_min_len = atoi(optarg); break; -+ case 'q': opt_min_baseQ = atoi(optarg); break; -+ case 'Q': opt_min_mapQ = atoi(optarg); break; -+ case 'w': opt_n_bins = atoi(optarg); opt_full_width = false; -+ opt_print_histogram = true; opt_print_tabular = false; -+ break; -+ case 'r': opt_reg = optarg; break; // parsing a region requires a BAM header (strdup unnecessary) -+ case 'b': opt_file_list = optarg; break; -+ case 'm': opt_print_histogram = true; opt_print_tabular = false; break; -+ case 'A': 
opt_full_utf = false; -+ opt_print_histogram = true; opt_print_tabular = false; -+ break; -+ case 'H': opt_print_header = false; break; -+ case 'h': return usage(); -+ default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; -+ /* else fall-through */ -+ case '?': -+ if (optopt != '?') { // '-?' appeared on command line -+ if (optopt) { // Bad short option -+ print_error("coverage", "invalid option -- '%c'", optopt); -+ } else { // Bad long option -+ // Do our best. There is no good solution to finding -+ // out what the bad option was. -+ // See, e.g. https://stackoverflow.com/questions/2723888/where-does-getopt-long-store-an-unrecognized-option -+ if (optind > 0 && strncmp(argv[optind - 1], "--", 2) == 0) { -+ print_error("coverage", "unrecognised option '%s'", -+ argv[optind - 1]); -+ } -+ } -+ } -+ return usage(); -+ } -+ } -+ if (optind == argc && !opt_file_list) -+ return usage(); -+ -+ // output file provided by user -+ if (opt_output_file != NULL && strcmp(opt_output_file,"-")!=0) { -+ file_out = fopen( opt_output_file, "w" ); -+ if (file_out == NULL) { -+ print_error_errno("coverage", "Cannot open \"%s\" for writing.", opt_output_file); -+ return EXIT_FAILURE; -+ } -+ } -+ -+ if (opt_n_bins <= 0 || opt_full_width) { -+ // get number of columns of terminal -+ const char* env_columns = getenv("COLUMNS"); -+ int columns = 0; -+ if (env_columns == NULL) { -+#ifdef _WIN32 -+ CONSOLE_SCREEN_BUFFER_INFO csbi; -+ if (GetConsoleScreenBufferInfo(GetStdHandle(STD_OUTPUT_HANDLE), &csbi)) { -+ columns = csbi.srWindow.Right - csbi.srWindow.Left + 1; -+ } -+#else -+ struct winsize w; -+ if (ioctl(2, TIOCGWINSZ, &w) == 0) -+ columns = w.ws_col; -+#endif -+ } else { -+ columns = atoi(env_columns); // atoi(NULL) returns 0 -+ } -+ -+ if (columns > 60) { -+ opt_n_bins = columns - 40; -+ } else { -+ opt_n_bins = 40; -+ } -+ } -+ -+ // setvbuf(file_out, NULL, _IONBF, 0); //turn off buffering -+ -+ // Open all BAM files -+ if (opt_file_list) { -+ // Read file names 
from opt_file_list into argv, and record the number of files in n_bam_files -+ if (read_file_list(opt_file_list, &n_bam_files, &fn)) { -+ print_error_errno("coverage", "Cannot open file list \"%s\".", opt_file_list); -+ return EXIT_FAILURE; -+ } -+ argv = fn; -+ optind = 0; -+ } else { -+ n_bam_files = argc - optind; // the number of BAMs on the command line -+ } -+ -+ data = (bam_aux_t **)calloc(n_bam_files, sizeof(bam_aux_t*)); // data[i] for the i-th BAM file -+ if (!data) { -+ print_error("coverage", "Failed to allocate memory"); -+ status = EXIT_FAILURE; -+ goto coverage_end; -+ } -+ -+ for (i = 0; i < n_bam_files; ++i) { -+ int rf; -+ data[i] = (bam_aux_t *) calloc(1, sizeof(bam_aux_t)); -+ if (!data[i]) { -+ print_error("coverage", "Failed to allocate memory"); -+ status = EXIT_FAILURE; -+ goto coverage_end; -+ } -+ data[i]->fp = sam_open_format(argv[optind+i], "r", &ga.in); // open BAM -+ -+ if (data[i]->fp == NULL) { -+ print_error_errno("coverage", "Could not open \"%s\"", argv[optind+i]); -+ status = EXIT_FAILURE; -+ goto coverage_end; -+ } -+ rf = SAM_FLAG | SAM_RNAME | SAM_POS | SAM_MAPQ | SAM_CIGAR | SAM_SEQ; -+ if (opt_min_baseQ) rf |= SAM_QUAL; -+ -+ // Set CRAM options on file handle - returns 0 on success -+ if (hts_set_opt(data[i]->fp, CRAM_OPT_REQUIRED_FIELDS, rf)) { -+ print_error_errno("coverage", "Failed to set CRAM_OPT_REQUIRED_FIELDS value"); -+ status = EXIT_FAILURE; -+ goto coverage_end; -+ } -+ if (hts_set_opt(data[i]->fp, CRAM_OPT_DECODE_MD, 0)) { -+ print_error_errno("coverage", "Failed to set CRAM_OPT_DECODE_MD value"); -+ status = EXIT_FAILURE; -+ goto coverage_end; -+ } -+ data[i]->min_mapQ = opt_min_mapQ; // set the mapQ filter -+ data[i]->min_len = opt_min_len; // set the qlen filter -+ data[i]->hdr = sam_hdr_read(data[i]->fp); // read the BAM header -+ data[i]->fail_flags = fail_flags; -+ data[i]->required_flags = required_flags; -+ if (data[i]->hdr == NULL) { -+ print_error_errno("coverage", "Could not read header for \"%s\"", 
argv[optind+i]); -+ status = EXIT_FAILURE; -+ goto coverage_end; -+ } -+ -+ // Lookup region if specified -+ if (opt_reg) { // if a region is specified -+ hts_idx_t *idx = sam_index_load(data[i]->fp, argv[optind+i]); // load the index -+ if (idx == NULL) { -+ print_error_errno("coverage", "Failed to load index for \"%s\"", argv[optind+i]); -+ status = EXIT_FAILURE; -+ goto coverage_end; -+ } -+ data[i]->iter = sam_itr_querys(idx, data[i]->hdr, opt_reg); // set the iterator -+ hts_idx_destroy(idx); // the index is not needed any more; free the memory -+ if (data[i]->iter == NULL) { -+ print_error_errno("coverage", "Failed to parse region \"%s\"", opt_reg); -+ status = EXIT_FAILURE; -+ goto coverage_end; -+ } -+ } -+ } -+ -+ if (opt_print_tabular && opt_print_header) -+ fputs("#rname\tstartpos\tendpos\tnumreads\tcovbases\tcoverage\tmeandepth\tmeanbaseq\tmeanmapq\n", file_out); -+ -+ h = data[0]->hdr; // easy access to the header of the 1st BAM -+ int n_targets = sam_hdr_nref(h); -+ covered_tids = calloc(n_targets, sizeof(bool)); -+ stats = calloc(1, sizeof(stats_aux_t)); -+ if (!covered_tids || !stats) { -+ print_error("coverage", "Failed to allocate memory"); -+ status = EXIT_FAILURE; -+ goto coverage_end; -+ } -+ -+ int64_t n_bins = opt_n_bins; -+ if (opt_reg) { -+ stats->tid = data[0]->iter->tid; -+ stats->beg = data[0]->iter->beg; // and to the parsed region coordinates -+ stats->end = data[0]->iter->end; -+ if (stats->end == HTS_POS_MAX) { -+ stats->end = sam_hdr_tid2len(h, stats->tid); -+ } -+ if (opt_n_bins > stats->end - stats->beg) { -+ n_bins = stats->end - stats->beg; -+ } -+ stats->bin_width = (stats->end-stats->beg) / n_bins; -+ } else { -+ stats->tid = -1; -+ } -+ -+ int64_t current_bin = 0; -+ -+ // the core multi-pileup loop -+ mplp = bam_mplp_init(n_bam_files, read_bam, (void**)data); // initialization -+ if (max_depth > 0) -+ bam_mplp_set_maxcnt(mplp, max_depth); // set maximum coverage depth -+ else if (!max_depth) -+ bam_mplp_set_maxcnt(mplp, 
INT_MAX); -+ -+ -+ // Extra info for histogram and coverage counting -+ hist = (uint32_t*) calloc(opt_n_bins, sizeof(uint32_t)); -+ n_plp = (int*) calloc(n_bam_files, sizeof(int*)); // n_plp[i] is the number of covering reads from the i-th BAM -+ plp = (const bam_pileup1_t**) calloc(n_bam_files, sizeof(bam_pileup1_t*)); // plp[i] points to the array of covering reads (internal in mplp) -+ if (!hist || !n_plp || !plp) { -+ print_error("coverage", "Failed to allocate memory"); -+ status = EXIT_FAILURE; -+ goto coverage_end; -+ } -+ while ((ret=bam_mplp_auto(mplp, &tid, &pos, n_plp, plp)) > 0) { // come to the next covered position -+ -+ if (tid != stats->tid) { // Next target sequence -+ if (stats->tid >= 0) { // It's not the first sequence, print results -+ set_read_counts(data, stats, n_bam_files); -+ if (opt_print_histogram) { -+ print_hist(file_out, h, stats, hist, n_bins, opt_full_utf); -+ fputc('\n', file_out); -+ } else if (opt_print_tabular) { -+ print_tabular_line(file_out, h, stats); -+ } -+ -+ // reset data -+ memset(stats, 0, sizeof(stats_aux_t)); -+ if (opt_print_histogram) -+ memset(hist, 0, n_bins*sizeof(uint32_t)); -+ } -+ -+ stats->tid = tid; -+ covered_tids[tid] = true; -+ if (!opt_reg) -+ stats->end = sam_hdr_tid2len(h, tid); -+ -+ if (opt_print_histogram) { -+ n_bins = opt_n_bins > stats->end-stats->beg? stats->end-stats->beg : opt_n_bins; -+ stats->bin_width = (stats->end-stats->beg) / n_bins; -+ } -+ } -+ if (pos < stats->beg || pos >= stats->end) continue; // out of range; skip -+ if (tid >= n_targets) continue; // diff number of @SQ lines per file? 
-+ -+ if (opt_print_histogram) { -+ current_bin = (pos - stats->beg) / stats->bin_width; -+ } -+ -+ bool count_base = false; -+ for (i = 0; i < n_bam_files; ++i) { // base level filters have to go here -+ int depth_at_pos = n_plp[i]; -+ for (j = 0; j < n_plp[i]; ++j) { -+ const bam_pileup1_t *p = plp[i] + j; // DON'T modify plp[][] unless you really know -+ -+ if (p->is_del || p->is_refskip) --depth_at_pos; // having dels or refskips at tid:pos -+ else if (p->qpos < p->b->core.l_qseq && -+ bam_get_qual(p->b)[p->qpos] < opt_min_baseQ) --depth_at_pos; // low base quality -+ else -+ stats->summed_baseQ += bam_get_qual(p->b)[p->qpos]; -+ } -+ if (depth_at_pos > 0) { -+ count_base = true; -+ stats->summed_coverage += depth_at_pos; -+ } -+ // hist[current_bin] += depth_at_pos; // Add counts to the histogram here to have one based on coverage -+ //fprintf(file_out, "\t%d", n_plp[i] - m); // this the depth to output -+ } -+ if (count_base) { -+ ++(stats->n_covered_bases); -+ if (opt_print_histogram && current_bin < n_bins) -+ ++(hist[current_bin]); // Histogram based on breadth of coverage -+ } -+ } -+ -+ if (stats->tid != -1) { -+ set_read_counts(data, stats, n_bam_files); -+ if (opt_print_histogram) { -+ print_hist(file_out, h, stats, hist, n_bins, opt_full_utf); -+ } else if (opt_print_tabular) { -+ print_tabular_line(file_out, h, stats); -+ } -+ } -+ -+ -+ if (!opt_reg && opt_print_tabular) { -+ memset(stats, 0, sizeof(stats_aux_t)); -+ for (i = 0; i < n_targets; ++i) { -+ if (!covered_tids[i]) { -+ stats->tid = i; -+ stats->end = sam_hdr_tid2len(h, i); -+ print_tabular_line(file_out, h, stats); -+ } -+ } -+ } -+ -+ if (ret < 0) status = EXIT_FAILURE; -+ -+coverage_end: -+ if (n_plp) free(n_plp); -+ if (plp) free(plp); -+ bam_mplp_destroy(mplp); -+ -+ if (covered_tids) free(covered_tids); -+ if (hist) free(hist); -+ if (stats) free(stats); -+ -+ -+ // Close files and free data structures -+ if (!(file_out == stdout || fclose(file_out) == 0)) { -+ if (status == 
EXIT_SUCCESS) { -+ print_error_errno("coverage", "error on closing \"%s\"", -+ (opt_output_file && strcmp(opt_output_file, "-") != 0? -+ opt_output_file : "stdout")); -+ status = EXIT_FAILURE; -+ } -+ } -+ -+ if (data) { -+ for (i = 0; i < n_bam_files && data[i]; ++i) { -+ sam_hdr_destroy(data[i]->hdr); -+ if (data[i]->fp) sam_close(data[i]->fp); -+ hts_itr_destroy(data[i]->iter); -+ free(data[i]); -+ } -+ free(data); -+ } -+ -+ if (opt_file_list && fn) { -+ for (i = 0; i < n_bam_files; ++i) -+ free(fn[i]); -+ free(fn); -+ } -+ sam_global_args_free(&ga); -+ -+ return status; -+} -+ -+#ifdef _MAIN_BAMCOV -+int main(int argc, char *argv[]) { -+ return main_coverage(argc, argv); -+} -+#endif ---- /dev/null -+++ python-pysam/samtools/coverage.c.pysam.c -@@ -0,0 +1,704 @@ -+#include "samtools.pysam.h" -+ -+/* coverage.c -- samtools coverage subcommand -+ -+ Copyright (C) 2018,2019 Florian Breitwieser -+ Portions copyright (C) 2019 Genome Research Ltd. -+ -+ Author: Florian P Breitwieser -+ -+Permission is hereby granted, free of charge, to any person obtaining a copy -+of this software and associated documentation files (the "Software"), to deal -+in the Software without restriction, including without limitation the rights -+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -+copies of the Software, and to permit persons to whom the Software is -+furnished to do so, subject to the following conditions: -+ -+The above copyright notice and this permission notice shall be included in -+all copies or substantial portions of the Software. -+ -+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL -+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -+DEALINGS IN THE SOFTWARE. */ -+ -+/* This program calculates coverage from multiple BAMs -+ * simutaneously, to achieve random access and to use the BED interface. -+ * To compile this program separately, you may: -+ * -+ * gcc -g -O2 -Wall -o bamcov -D_MAIN_BAMCOV coverage.c -lhts -lz -+ */ -+ -+// C headers -+#include -+ -+#include -+#include -+#include -+#include // variadic functions -+#include // INT_MAX -+#include // round -+#include -+#include -+#include -+#include -+ -+#ifdef _WIN32 -+#include -+#else -+#include -+#endif -+ -+#include "htslib/sam.h" -+#include "htslib/hts.h" -+#include "samtools.h" -+#include "sam_opts.h" -+ -+const char *VERSION = "0.1"; -+ -+typedef struct { // auxiliary data structure to hold a BAM file -+ samFile *fp; // file handle -+ sam_hdr_t *hdr; // file header -+ hts_itr_t *iter; // iterator to a region - NULL for us by default -+ int min_mapQ; // mapQ filter -+ int min_len; // length filter -+ unsigned int n_reads; // records the number of reads seen in file -+ unsigned int n_selected_reads; // records the number of reads passing filter -+ unsigned long summed_mapQ; // summed mapQ of all reads passing filter -+ int fail_flags; -+ int required_flags; -+} bam_aux_t; -+ -+typedef struct { // auxiliary data structure to hold stats on coverage -+ unsigned long long n_covered_bases; -+ unsigned long long summed_coverage; -+ unsigned long long summed_baseQ; -+ unsigned long long summed_mapQ; -+ unsigned int n_reads; -+ unsigned int n_selected_reads; -+ int32_t tid; // chromosome ID, defined by header -+ hts_pos_t beg; -+ hts_pos_t end; -+ int64_t bin_width; -+} stats_aux_t; -+ -+#if __STDC_VERSION__ >= 199901L -+#define VERTICAL_LINE "\u2502" // BOX DRAWINGS LIGHT VERTICAL -+ -+// UTF8 specifies 
block characters in eights going from \u2581 (lower one eight block) to \u2588 (full block) -+// https://en.wikipedia.org/wiki/Block_Elements -+// LOWER ONE EIGHTH BLOCK … FULL BLOCK -+static const char *const BLOCK_CHARS8[8] = {"\u2581", "\u2582", "\u2583", "\u2584", "\u2585", "\u2586", "\u2587", "\u2588"}; -+// In some terminals / with some fonts not all UTF8 block characters are supported (e.g. Putty). Use only half and full block for those -+static const char *const BLOCK_CHARS2[2] = {"\u2584", "\u2588"}; -+ -+#else -+ -+// Fall back to explicit UTF-8 encodings of the same characters -+#define VERTICAL_LINE "\xE2\x94\x82" -+ -+static const char *const BLOCK_CHARS8[8] = { -+ "\xE2\x96\x81", "\xE2\x96\x82", "\xE2\x96\x83", "\xE2\x96\x84", -+ "\xE2\x96\x85", "\xE2\x96\x86", "\xE2\x96\x87", "\xE2\x96\x88" }; -+ -+static const char *const BLOCK_CHARS2[2] = {"\xE2\x96\x84", "\xE2\x96\x88"}; -+ -+#endif -+ -+// in bam_plcmd.c -+int read_file_list(const char *file_list, int *n, char **argv[]); -+ -+static int usage() { -+ fprintf(samtools_stdout, "Usage: samtools coverage [options] in1.bam [in2.bam [...]]\n\n" -+ "Input options:\n" -+ " -b, --bam-list FILE list of input BAM filenames, one per line\n" -+ " -l, --min-read-len INT ignore reads shorter than INT bp [0]\n" -+ " -q, --min-MQ INT base quality threshold [0]\n" -+ " -Q, --min-BQ INT mapping quality threshold [0]\n" -+ " --rf required flags: skip reads with mask bits unset []\n" -+ " --ff filter flags: skip reads with mask bits set \n" -+ " [UNMAP,SECONDARY,QCFAIL,DUP]\n" -+ "Output options:\n" -+ " -m, --histogram show histogram instead of tabular output\n" -+ " -A, --ascii show only ASCII characters in histogram\n" -+ " -o, --output FILE write output to FILE [samtools_stdout]\n" -+ " -H, --no-header don't print a header in tabular mode\n" -+ " -w, --n-bins INT number of bins in histogram [terminal width - 40]\n" -+ " -r, --region REG show specified region. Format: chr:start-end. 
\n" -+ " -h, --help help (this page)\n"); -+ -+ fprintf(samtools_stdout, "\nGeneric options:\n"); -+ sam_global_opt_help(samtools_stdout, "-.--.--."); -+ -+ fprintf(samtools_stdout, -+ "\nSee manpage for additional details.\n" -+ " rname Reference name / chromosome\n" -+ " startpos Start position\n" -+ " endpos End position (or sequence length)\n" -+ " numreads Number reads aligned to the region (after filtering)\n" -+ " covbases Number of covered bases with depth >= 1\n" -+ " coverage Proportion of covered bases [0..1]\n" -+ " meandepth Mean depth of coverage\n" -+ " meanbaseq Mean baseQ in covered region\n" -+ " meanmapq Mean mapQ of selected reads\n" -+ ); -+ -+ return EXIT_SUCCESS; -+} -+ -+static char* center_text(char *text, char *buf, int width) { -+ int len = strlen(text); -+ assert(len <= width); -+ int padding = (width - len) / 2; -+ int padding_ex = (width - len) % 2; -+ if (padding >= 1) -+ sprintf(buf, " %*s%*s", len+padding, text, padding-1+padding_ex, " "); -+ else -+ sprintf(buf, "%s", text); -+ -+ return buf; -+} -+ -+static char* readable_bps(double base_pairs, char *buf) { -+ const char* units[] = {"", "K", "M", "G", "T"}; -+ int i = 0; -+ while (base_pairs >= 1000 && i < (sizeof(units)/sizeof(units[0]) - 1)) { -+ base_pairs /= 1000; -+ i++; -+ } -+ sprintf(buf, "%.*f%s", i, base_pairs, units[i]); -+ return buf; -+} -+ -+static void set_read_counts(bam_aux_t **data, stats_aux_t *stats, int n_bam_files) { -+ int i; -+ stats->n_reads = 0; -+ stats->n_selected_reads = 0; -+ stats->summed_mapQ = 0; -+ for (i = 0; i < n_bam_files && data[i]; ++i) { -+ stats->n_reads += data[i]->n_reads; -+ stats->n_selected_reads += data[i]->n_selected_reads; -+ stats->summed_mapQ += data[i]->summed_mapQ; -+ data[i]->n_reads = 0; -+ data[i]->n_selected_reads = 0; -+ data[i]->summed_mapQ = 0; -+ } -+} -+ -+// read one alignment from one BAM file -+static int read_bam(void *data, bam1_t *b) { -+ bam_aux_t *aux = (bam_aux_t*)data; // data in fact is a pointer to an 
auxiliary structure -+ int ret; -+ while (1) { -+ if((ret = aux->iter? sam_itr_next(aux->fp, aux->iter, b) : sam_read1(aux->fp, aux->hdr, b)) < 0) break; -+ ++aux->n_reads; -+ -+ if ( aux->fail_flags && (b->core.flag & aux->fail_flags) ) continue; -+ if ( aux->required_flags && !(b->core.flag & aux->required_flags) ) continue; -+ if ( b->core.qual < aux->min_mapQ ) continue; -+ if ( aux->min_len && bam_cigar2qlen(b->core.n_cigar, bam_get_cigar(b)) < aux->min_len ) continue; -+ ++aux->n_selected_reads; -+ aux->summed_mapQ += b->core.qual; -+ break; -+ } -+ return ret; -+} -+ -+void print_tabular_line(FILE *file_out, const sam_hdr_t *h, const stats_aux_t *stats) { -+ fputs(sam_hdr_tid2name(h, stats->tid), file_out); -+ double region_len = (double) stats->end - stats->beg; -+ fprintf(file_out, "\t%"PRId64"\t%"PRId64"\t%u\t%llu\t%g\t%g\t%.3g\t%.3g\n", -+ stats->beg+1, -+ stats->end, -+ stats->n_selected_reads, -+ stats->n_covered_bases, -+ 100.0 * stats->n_covered_bases / region_len, -+ stats->summed_coverage / region_len, -+ stats->summed_coverage > 0? stats->summed_baseQ/(double) stats->summed_coverage : 0, -+ stats->n_selected_reads > 0? stats->summed_mapQ/(double) stats->n_selected_reads : 0 -+ ); -+} -+ -+void print_hist(FILE *file_out, const sam_hdr_t *h, const stats_aux_t *stats, const uint32_t *hist, -+ const int hist_size, const bool full_utf) { -+ int i, col; -+ bool show_percentiles = false; -+ const int n_rows = 10; -+ const char * const * BLOCK_CHARS = full_utf? BLOCK_CHARS8 : BLOCK_CHARS2; -+ const int blockchar_len = full_utf? 
8 : 2; -+ /* -+ if (stats->beg == 0) { -+ stats->end = h->target_len[stats->tid]; -+ } -+ */ -+ double region_len = stats->end - stats->beg; -+ -+ // Calculate histogram that contains percent covered -+ double hist_data[hist_size]; -+ double max_val = 0.0; -+ for (i = 0; i < hist_size; ++i) { -+ hist_data[i] = 100 * hist[i] / (double) stats->bin_width; -+ if (hist_data[i] > max_val) max_val = hist_data[i]; -+ } -+ -+ char buf[30]; -+ fprintf(file_out, "%s (%sbp)\n", sam_hdr_tid2name(h, stats->tid), readable_bps(sam_hdr_tid2len(h, stats->tid), buf)); -+ -+ double row_bin_size = max_val / (double) n_rows; -+ for (i = n_rows-1; i >= 0; --i) { -+ double current_bin = row_bin_size * i; -+ if (show_percentiles) { -+ fprintf(file_out, ">%3i%% ", i*10); -+ } else { -+ fprintf(file_out, ">%7.2f%% ", current_bin); -+ } -+ fprintf(file_out, VERTICAL_LINE); -+ for (col = 0; col < hist_size; ++col) { -+ // get the difference in eights, or halfs when full UTF8 is not supported -+ int cur_val_diff = round(blockchar_len * (hist_data[col] - current_bin) / row_bin_size) - 1; -+ if (cur_val_diff < 0) { -+ fputc(' ', file_out); -+ } else { -+ if (cur_val_diff >= blockchar_len) -+ cur_val_diff = blockchar_len - 1; -+ -+ fprintf(file_out, "%s", BLOCK_CHARS[cur_val_diff]); -+ } -+ } -+ fprintf(file_out, VERTICAL_LINE); -+ fputc(' ', file_out); -+ switch (i) { -+ case 9: fprintf(file_out, "Number of reads: %i", stats->n_selected_reads); break; -+ case 8: if (stats->n_reads - stats->n_selected_reads > 0) fprintf(file_out, " (%i filtered)", stats->n_reads - stats->n_selected_reads); break; -+ case 7: fprintf(file_out, "Covered bases: %sbp", readable_bps(stats->n_covered_bases, buf)); break; -+ case 6: fprintf(file_out, "Percent covered: %.4g%%", -+ 100.0 * stats->n_covered_bases / region_len); break; -+ case 5: fprintf(file_out, "Mean coverage: %.3gx", -+ stats->summed_coverage / region_len); break; -+ case 4: fprintf(file_out, "Mean baseQ: %.3g", -+ stats->summed_baseQ/(double) 
stats->summed_coverage); break; -+ case 3: fprintf(file_out, "Mean mapQ: %.3g", -+ stats->summed_mapQ/(double) stats->n_selected_reads); break; -+ case 1: fprintf(file_out, "Histo bin width: %sbp", -+ readable_bps(stats->bin_width, buf)); break; -+ case 0: fprintf(file_out, "Histo max bin: %.5g%%", max_val); break; -+ }; -+ fputc('\n', file_out); -+ } -+ -+ // print x axis. Could be made pretty for widths that are not divisible -+ // by 10 by variable spacing of the labels, instead of placing a label every 10 characters -+ char buf2[50]; -+ fprintf(file_out, " %s", center_text(readable_bps(stats->beg + 1, buf), buf2, 10)); -+ int rest; -+ for (rest = 10; rest < 10*(hist_size/10); rest += 10) { -+ fprintf(file_out, "%s", center_text(readable_bps(stats->beg + stats->bin_width*rest, buf), buf2, 10)); -+ } -+ int last_padding = hist_size%10; -+ fprintf(file_out, "%*s%s", last_padding, " ", center_text(readable_bps(stats->end, buf), buf2, 10)); -+ fprintf(file_out, "\n"); -+} -+ -+int main_coverage(int argc, char *argv[]) { -+ int status = EXIT_SUCCESS; -+ -+ int ret, tid, pos, i, j; -+ -+ int max_depth = 0; -+ int opt_min_baseQ = 0; -+ int opt_min_mapQ = 0; -+ int opt_min_len = 0; -+ int opt_n_bins = 50; -+ bool opt_full_width = true; -+ char *opt_output_file = NULL; -+ bam_aux_t **data = NULL; -+ bam_mplp_t mplp = NULL; -+ const bam_pileup1_t **plp = NULL; -+ uint32_t *hist = NULL; -+ stats_aux_t *stats = NULL; -+ char *opt_reg = 0; // specified region -+ char *opt_file_list = NULL; -+ int n_bam_files = 0; -+ char **fn = NULL; -+ int fail_flags = (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP); // Default fail flags -+ int required_flags = 0; -+ -+ int *n_plp = NULL; -+ sam_hdr_t *h = NULL; // BAM header of the 1st input -+ -+ bool opt_print_header = true; -+ bool opt_print_tabular = true; -+ bool opt_print_histogram = false; -+ bool *covered_tids = NULL; -+ bool opt_full_utf = true; -+ -+ FILE *file_out = samtools_stdout; -+ -+ sam_global_args ga = 
SAM_GLOBAL_ARGS_INIT; -+ static const struct option lopts[] = { -+ SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0, '-'), -+ {"rf", required_argument, NULL, 1}, // require flag -+ {"ff", required_argument, NULL, 2}, // filter flag -+ {"incl-flags", required_argument, NULL, 1}, // require flag -+ {"excl-flags", required_argument, NULL, 2}, // filter flag -+ {"bam-list", required_argument, NULL, 'b'}, -+ {"min-read-len", required_argument, NULL, 'L'}, -+ {"min-MQ", required_argument, NULL, 'q'}, -+ {"min-mq", required_argument, NULL, 'q'}, -+ {"min-BQ", required_argument, NULL, 'Q'}, -+ {"min-bq", required_argument, NULL, 'Q'}, -+ {"histogram", no_argument, NULL, 'm'}, -+ {"ascii", no_argument, NULL, 'A'}, -+ {"output", required_argument, NULL, 'o'}, -+ {"no-header", no_argument, NULL, 'H'}, -+ {"n-bins", required_argument, NULL, 'w'}, -+ {"region", required_argument, NULL, 'r'}, -+ {"help", no_argument, NULL, 'h'}, -+ { NULL, 0, NULL, 0 } -+ }; -+ -+ // parse the command line -+ int c; -+ opterr = 0; -+ while ((c = getopt_long(argc, argv, "Ao:L:q:Q:hHw:r:b:m", lopts, NULL)) != -1) { -+ switch (c) { -+ case 1: -+ if ((required_flags = bam_str2flag(optarg)) < 0) { -+ fprintf(samtools_stderr,"Could not parse --rf %s\n", optarg); return EXIT_FAILURE; -+ }; break; -+ case 2: -+ if ((fail_flags = bam_str2flag(optarg)) < 0) { -+ fprintf(samtools_stderr,"Could not parse --ff %s\n", optarg); return EXIT_FAILURE; -+ }; break; -+ case 'o': opt_output_file = optarg; opt_full_width = false; break; -+ case 'L': opt_min_len = atoi(optarg); break; -+ case 'q': opt_min_baseQ = atoi(optarg); break; -+ case 'Q': opt_min_mapQ = atoi(optarg); break; -+ case 'w': opt_n_bins = atoi(optarg); opt_full_width = false; -+ opt_print_histogram = true; opt_print_tabular = false; -+ break; -+ case 'r': opt_reg = optarg; break; // parsing a region requires a BAM header (strdup unnecessary) -+ case 'b': opt_file_list = optarg; break; -+ case 'm': opt_print_histogram = true; opt_print_tabular = false; 
break; -+ case 'A': opt_full_utf = false; -+ opt_print_histogram = true; opt_print_tabular = false; -+ break; -+ case 'H': opt_print_header = false; break; -+ case 'h': return usage(); -+ default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; -+ /* else fall-through */ -+ case '?': -+ if (optopt != '?') { // '-?' appeared on command line -+ if (optopt) { // Bad short option -+ print_error("coverage", "invalid option -- '%c'", optopt); -+ } else { // Bad long option -+ // Do our best. There is no good solution to finding -+ // out what the bad option was. -+ // See, e.g. https://stackoverflow.com/questions/2723888/where-does-getopt-long-store-an-unrecognized-option -+ if (optind > 0 && strncmp(argv[optind - 1], "--", 2) == 0) { -+ print_error("coverage", "unrecognised option '%s'", -+ argv[optind - 1]); -+ } -+ } -+ } -+ return usage(); -+ } -+ } -+ if (optind == argc && !opt_file_list) -+ return usage(); -+ -+ // output file provided by user -+ if (opt_output_file != NULL && strcmp(opt_output_file,"-")!=0) { -+ file_out = fopen( opt_output_file, "w" ); -+ if (file_out == NULL) { -+ print_error_errno("coverage", "Cannot open \"%s\" for writing.", opt_output_file); -+ return EXIT_FAILURE; -+ } -+ } -+ -+ if (opt_n_bins <= 0 || opt_full_width) { -+ // get number of columns of terminal -+ const char* env_columns = getenv("COLUMNS"); -+ int columns = 0; -+ if (env_columns == NULL) { -+#ifdef _WIN32 -+ CONSOLE_SCREEN_BUFFER_INFO csbi; -+ if (GetConsoleScreenBufferInfo(GetStdHandle(STD_OUTPUT_HANDLE), &csbi)) { -+ columns = csbi.srWindow.Right - csbi.srWindow.Left + 1; -+ } -+#else -+ struct winsize w; -+ if (ioctl(2, TIOCGWINSZ, &w) == 0) -+ columns = w.ws_col; -+#endif -+ } else { -+ columns = atoi(env_columns); // atoi(NULL) returns 0 -+ } -+ -+ if (columns > 60) { -+ opt_n_bins = columns - 40; -+ } else { -+ opt_n_bins = 40; -+ } -+ } -+ -+ // setvbuf(file_out, NULL, _IONBF, 0); //turn off buffering -+ -+ // Open all BAM files -+ if (opt_file_list) { -+ 
// Read file names from opt_file_list into argv, and record the number of files in n_bam_files -+ if (read_file_list(opt_file_list, &n_bam_files, &fn)) { -+ print_error_errno("coverage", "Cannot open file list \"%s\".", opt_file_list); -+ return EXIT_FAILURE; -+ } -+ argv = fn; -+ optind = 0; -+ } else { -+ n_bam_files = argc - optind; // the number of BAMs on the command line -+ } -+ -+ data = (bam_aux_t **)calloc(n_bam_files, sizeof(bam_aux_t*)); // data[i] for the i-th BAM file -+ if (!data) { -+ print_error("coverage", "Failed to allocate memory"); -+ status = EXIT_FAILURE; -+ goto coverage_end; -+ } -+ -+ for (i = 0; i < n_bam_files; ++i) { -+ int rf; -+ data[i] = (bam_aux_t *) calloc(1, sizeof(bam_aux_t)); -+ if (!data[i]) { -+ print_error("coverage", "Failed to allocate memory"); -+ status = EXIT_FAILURE; -+ goto coverage_end; -+ } -+ data[i]->fp = sam_open_format(argv[optind+i], "r", &ga.in); // open BAM -+ -+ if (data[i]->fp == NULL) { -+ print_error_errno("coverage", "Could not open \"%s\"", argv[optind+i]); -+ status = EXIT_FAILURE; -+ goto coverage_end; -+ } -+ rf = SAM_FLAG | SAM_RNAME | SAM_POS | SAM_MAPQ | SAM_CIGAR | SAM_SEQ; -+ if (opt_min_baseQ) rf |= SAM_QUAL; -+ -+ // Set CRAM options on file handle - returns 0 on success -+ if (hts_set_opt(data[i]->fp, CRAM_OPT_REQUIRED_FIELDS, rf)) { -+ print_error_errno("coverage", "Failed to set CRAM_OPT_REQUIRED_FIELDS value"); -+ status = EXIT_FAILURE; -+ goto coverage_end; -+ } -+ if (hts_set_opt(data[i]->fp, CRAM_OPT_DECODE_MD, 0)) { -+ print_error_errno("coverage", "Failed to set CRAM_OPT_DECODE_MD value"); -+ status = EXIT_FAILURE; -+ goto coverage_end; -+ } -+ data[i]->min_mapQ = opt_min_mapQ; // set the mapQ filter -+ data[i]->min_len = opt_min_len; // set the qlen filter -+ data[i]->hdr = sam_hdr_read(data[i]->fp); // read the BAM header -+ data[i]->fail_flags = fail_flags; -+ data[i]->required_flags = required_flags; -+ if (data[i]->hdr == NULL) { -+ print_error_errno("coverage", "Could not read 
header for \"%s\"", argv[optind+i]); -+ status = EXIT_FAILURE; -+ goto coverage_end; -+ } -+ -+ // Lookup region if specified -+ if (opt_reg) { // if a region is specified -+ hts_idx_t *idx = sam_index_load(data[i]->fp, argv[optind+i]); // load the index -+ if (idx == NULL) { -+ print_error_errno("coverage", "Failed to load index for \"%s\"", argv[optind+i]); -+ status = EXIT_FAILURE; -+ goto coverage_end; -+ } -+ data[i]->iter = sam_itr_querys(idx, data[i]->hdr, opt_reg); // set the iterator -+ hts_idx_destroy(idx); // the index is not needed any more; free the memory -+ if (data[i]->iter == NULL) { -+ print_error_errno("coverage", "Failed to parse region \"%s\"", opt_reg); -+ status = EXIT_FAILURE; -+ goto coverage_end; -+ } -+ } -+ } -+ -+ if (opt_print_tabular && opt_print_header) -+ fputs("#rname\tstartpos\tendpos\tnumreads\tcovbases\tcoverage\tmeandepth\tmeanbaseq\tmeanmapq\n", file_out); -+ -+ h = data[0]->hdr; // easy access to the header of the 1st BAM -+ int n_targets = sam_hdr_nref(h); -+ covered_tids = calloc(n_targets, sizeof(bool)); -+ stats = calloc(1, sizeof(stats_aux_t)); -+ if (!covered_tids || !stats) { -+ print_error("coverage", "Failed to allocate memory"); -+ status = EXIT_FAILURE; -+ goto coverage_end; -+ } -+ -+ int64_t n_bins = opt_n_bins; -+ if (opt_reg) { -+ stats->tid = data[0]->iter->tid; -+ stats->beg = data[0]->iter->beg; // and to the parsed region coordinates -+ stats->end = data[0]->iter->end; -+ if (stats->end == HTS_POS_MAX) { -+ stats->end = sam_hdr_tid2len(h, stats->tid); -+ } -+ if (opt_n_bins > stats->end - stats->beg) { -+ n_bins = stats->end - stats->beg; -+ } -+ stats->bin_width = (stats->end-stats->beg) / n_bins; -+ } else { -+ stats->tid = -1; -+ } -+ -+ int64_t current_bin = 0; -+ -+ // the core multi-pileup loop -+ mplp = bam_mplp_init(n_bam_files, read_bam, (void**)data); // initialization -+ if (max_depth > 0) -+ bam_mplp_set_maxcnt(mplp, max_depth); // set maximum coverage depth -+ else if (!max_depth) -+ 
bam_mplp_set_maxcnt(mplp, INT_MAX); -+ -+ -+ // Extra info for histogram and coverage counting -+ hist = (uint32_t*) calloc(opt_n_bins, sizeof(uint32_t)); -+ n_plp = (int*) calloc(n_bam_files, sizeof(int*)); // n_plp[i] is the number of covering reads from the i-th BAM -+ plp = (const bam_pileup1_t**) calloc(n_bam_files, sizeof(bam_pileup1_t*)); // plp[i] points to the array of covering reads (internal in mplp) -+ if (!hist || !n_plp || !plp) { -+ print_error("coverage", "Failed to allocate memory"); -+ status = EXIT_FAILURE; -+ goto coverage_end; -+ } -+ while ((ret=bam_mplp_auto(mplp, &tid, &pos, n_plp, plp)) > 0) { // come to the next covered position -+ -+ if (tid != stats->tid) { // Next target sequence -+ if (stats->tid >= 0) { // It's not the first sequence, print results -+ set_read_counts(data, stats, n_bam_files); -+ if (opt_print_histogram) { -+ print_hist(file_out, h, stats, hist, n_bins, opt_full_utf); -+ fputc('\n', file_out); -+ } else if (opt_print_tabular) { -+ print_tabular_line(file_out, h, stats); -+ } -+ -+ // reset data -+ memset(stats, 0, sizeof(stats_aux_t)); -+ if (opt_print_histogram) -+ memset(hist, 0, n_bins*sizeof(uint32_t)); -+ } -+ -+ stats->tid = tid; -+ covered_tids[tid] = true; -+ if (!opt_reg) -+ stats->end = sam_hdr_tid2len(h, tid); -+ -+ if (opt_print_histogram) { -+ n_bins = opt_n_bins > stats->end-stats->beg? stats->end-stats->beg : opt_n_bins; -+ stats->bin_width = (stats->end-stats->beg) / n_bins; -+ } -+ } -+ if (pos < stats->beg || pos >= stats->end) continue; // out of range; skip -+ if (tid >= n_targets) continue; // diff number of @SQ lines per file? 
-+ -+ if (opt_print_histogram) { -+ current_bin = (pos - stats->beg) / stats->bin_width; -+ } -+ -+ bool count_base = false; -+ for (i = 0; i < n_bam_files; ++i) { // base level filters have to go here -+ int depth_at_pos = n_plp[i]; -+ for (j = 0; j < n_plp[i]; ++j) { -+ const bam_pileup1_t *p = plp[i] + j; // DON'T modify plp[][] unless you really know -+ -+ if (p->is_del || p->is_refskip) --depth_at_pos; // having dels or refskips at tid:pos -+ else if (p->qpos < p->b->core.l_qseq && -+ bam_get_qual(p->b)[p->qpos] < opt_min_baseQ) --depth_at_pos; // low base quality -+ else -+ stats->summed_baseQ += bam_get_qual(p->b)[p->qpos]; -+ } -+ if (depth_at_pos > 0) { -+ count_base = true; -+ stats->summed_coverage += depth_at_pos; -+ } -+ // hist[current_bin] += depth_at_pos; // Add counts to the histogram here to have one based on coverage -+ //fprintf(file_out, "\t%d", n_plp[i] - m); // this the depth to output -+ } -+ if (count_base) { -+ ++(stats->n_covered_bases); -+ if (opt_print_histogram && current_bin < n_bins) -+ ++(hist[current_bin]); // Histogram based on breadth of coverage -+ } -+ } -+ -+ if (stats->tid != -1) { -+ set_read_counts(data, stats, n_bam_files); -+ if (opt_print_histogram) { -+ print_hist(file_out, h, stats, hist, n_bins, opt_full_utf); -+ } else if (opt_print_tabular) { -+ print_tabular_line(file_out, h, stats); -+ } -+ } -+ -+ -+ if (!opt_reg && opt_print_tabular) { -+ memset(stats, 0, sizeof(stats_aux_t)); -+ for (i = 0; i < n_targets; ++i) { -+ if (!covered_tids[i]) { -+ stats->tid = i; -+ stats->end = sam_hdr_tid2len(h, i); -+ print_tabular_line(file_out, h, stats); -+ } -+ } -+ } -+ -+ if (ret < 0) status = EXIT_FAILURE; -+ -+coverage_end: -+ if (n_plp) free(n_plp); -+ if (plp) free(plp); -+ bam_mplp_destroy(mplp); -+ -+ if (covered_tids) free(covered_tids); -+ if (hist) free(hist); -+ if (stats) free(stats); -+ -+ -+ // Close files and free data structures -+ if (!(file_out == samtools_stdout || fclose(file_out) == 0)) { -+ if (status == 
EXIT_SUCCESS) { -+ print_error_errno("coverage", "error on closing \"%s\"", -+ (opt_output_file && strcmp(opt_output_file, "-") != 0? -+ opt_output_file : "samtools_stdout")); -+ status = EXIT_FAILURE; -+ } -+ } -+ -+ if (data) { -+ for (i = 0; i < n_bam_files && data[i]; ++i) { -+ sam_hdr_destroy(data[i]->hdr); -+ if (data[i]->fp) sam_close(data[i]->fp); -+ hts_itr_destroy(data[i]->iter); -+ free(data[i]); -+ } -+ free(data); -+ } -+ -+ if (opt_file_list && fn) { -+ for (i = 0; i < n_bam_files; ++i) -+ free(fn[i]); -+ free(fn); -+ } -+ sam_global_args_free(&ga); -+ -+ return status; -+} -+ -+#ifdef _MAIN_BAMCOV -+int samtools_coverage_main(int argc, char *argv[]) { -+ return main_coverage(argc, argv); -+} -+#endif ---- python-pysam.orig/samtools/cut_target.c -+++ python-pysam/samtools/cut_target.c -@@ -1,7 +1,7 @@ - /* cut_target.c -- targetcut subcommand. - - Copyright (C) 2011 Broad Institute. -- Copyright (C) 2012-2013, 2015, 2016 Genome Research Ltd. -+ Copyright (C) 2012-2013, 2015, 2016, 2019 Genome Research Ltd. 
- - Author: Heng Li - -@@ -49,9 +49,9 @@ - int min_baseQ, tid, max_bases; - uint16_t *bases; - samFile *fp; -- bam_hdr_t *h; -+ sam_hdr_t *h; - char *ref; -- int len; -+ hts_pos_t len; - faidx_t *fai; - errmod_t *em; - } ct_t; -@@ -92,9 +92,10 @@ - return ret<<8|k; - } - --static void process_cns(bam_hdr_t *h, int tid, int l, uint16_t *cns) -+static void process_cns(sam_hdr_t *h, int tid, hts_pos_t l, uint16_t *cns) - { -- int i, f[2][2], *prev, *curr, *swap_tmp, s; -+ int64_t i, s; -+ int f[2][2], *prev, *curr, *swap_tmp; - uint8_t *b; // backtrack array - b = calloc(l, 1); - f[0][0] = f[0][1] = 0; -@@ -123,11 +124,11 @@ - s = b[i]>>s&1; - } - // print -- for (i = 0, s = -1; i < INT_MAX && i <= l; ++i) { -+ for (i = 0, s = -1; i < INT64_MAX && i <= l; ++i) { - if (i == l || ((b[i]>>2&3) == 0 && s >= 0)) { - if (s >= 0) { -- int j; -- printf("%s:%d-%d\t0\t%s\t%d\t60\t%dM\t*\t0\t0\t", h->target_name[tid], s+1, i, h->target_name[tid], s+1, i-s); -+ int64_t j; -+ printf("%s:%"PRId64"-%"PRId64"\t0\t%s\t%"PRId64"\t60\t%"PRId64"M\t*\t0\t0\t", sam_hdr_tid2name(h, tid), s+1, i, sam_hdr_tid2name(h, tid), s+1, i-s); - for (j = s; j < i; ++j) { - int c = cns[j]>>8; - if (c == 0) putchar('N'); -@@ -157,7 +158,7 @@ - if ( g->fai && b->core.tid >= 0 ) { - if (b->core.tid != g->tid) { // then load the sequence - free(g->ref); -- g->ref = fai_fetch(g->fai, g->h->target_name[b->core.tid], &g->len); -+ g->ref = fai_fetch64(g->fai, sam_hdr_tid2name(g->h, b->core.tid), &g->len); - g->tid = b->core.tid; - } - sam_prob_realn(b, g->ref, g->len, 1<<1|1); -@@ -169,7 +170,8 @@ - - int main_cut_target(int argc, char *argv[]) - { -- int c, tid, pos, n, lasttid = -1, l, max_l, usage = 0; -+ int c, tid, pos, n, lasttid = -1, usage = 0; -+ hts_pos_t l, max_l; - const bam_pileup1_t *p; - bam_plp_t plp; - uint16_t *cns; -@@ -201,7 +203,7 @@ - } - if (usage || argc == optind) { - fprintf(stderr, "Usage: samtools targetcut [-Q minQ] [-i inPen] [-0 em0] [-1 em1] [-2 em2] \n"); -- 
sam_global_opt_help(stderr, "-.--f-"); -+ sam_global_opt_help(stderr, "-.--f--."); - return 1; - } - l = max_l = 0; cns = 0; -@@ -223,12 +225,12 @@ - if (tid < 0) break; - if (tid != lasttid) { // change of chromosome - if (cns) process_cns(g.h, lasttid, l, cns); -- if (max_l < g.h->target_len[tid]) { -- max_l = g.h->target_len[tid]; -+ if (max_l < sam_hdr_tid2len(g.h, tid)) { -+ max_l = sam_hdr_tid2len(g.h, tid); - kroundup32(max_l); - cns = realloc(cns, max_l * 2); - } -- l = g.h->target_len[tid]; -+ l = sam_hdr_tid2len(g.h, tid); - memset(cns, 0, max_l * 2); - lasttid = tid; - } -@@ -236,7 +238,7 @@ - } - process_cns(g.h, lasttid, l, cns); - free(cns); -- bam_hdr_destroy(g.h); -+ sam_hdr_destroy(g.h); - bam_plp_destroy(plp); - sam_close(g.fp); - if (g.fai) { ---- python-pysam.orig/samtools/cut_target.c.pysam.c -+++ python-pysam/samtools/cut_target.c.pysam.c -@@ -3,7 +3,7 @@ - /* cut_target.c -- targetcut subcommand. - - Copyright (C) 2011 Broad Institute. -- Copyright (C) 2012-2013, 2015, 2016 Genome Research Ltd. -+ Copyright (C) 2012-2013, 2015, 2016, 2019 Genome Research Ltd. 
- - Author: Heng Li - -@@ -51,9 +51,9 @@ - int min_baseQ, tid, max_bases; - uint16_t *bases; - samFile *fp; -- bam_hdr_t *h; -+ sam_hdr_t *h; - char *ref; -- int len; -+ hts_pos_t len; - faidx_t *fai; - errmod_t *em; - } ct_t; -@@ -94,9 +94,10 @@ - return ret<<8|k; - } - --static void process_cns(bam_hdr_t *h, int tid, int l, uint16_t *cns) -+static void process_cns(sam_hdr_t *h, int tid, hts_pos_t l, uint16_t *cns) - { -- int i, f[2][2], *prev, *curr, *swap_tmp, s; -+ int64_t i, s; -+ int f[2][2], *prev, *curr, *swap_tmp; - uint8_t *b; // backtrack array - b = calloc(l, 1); - f[0][0] = f[0][1] = 0; -@@ -125,11 +126,11 @@ - s = b[i]>>s&1; - } - // print -- for (i = 0, s = -1; i < INT_MAX && i <= l; ++i) { -+ for (i = 0, s = -1; i < INT64_MAX && i <= l; ++i) { - if (i == l || ((b[i]>>2&3) == 0 && s >= 0)) { - if (s >= 0) { -- int j; -- fprintf(samtools_stdout, "%s:%d-%d\t0\t%s\t%d\t60\t%dM\t*\t0\t0\t", h->target_name[tid], s+1, i, h->target_name[tid], s+1, i-s); -+ int64_t j; -+ fprintf(samtools_stdout, "%s:%"PRId64"-%"PRId64"\t0\t%s\t%"PRId64"\t60\t%"PRId64"M\t*\t0\t0\t", sam_hdr_tid2name(h, tid), s+1, i, sam_hdr_tid2name(h, tid), s+1, i-s); - for (j = s; j < i; ++j) { - int c = cns[j]>>8; - if (c == 0) fputc('N', samtools_stdout); -@@ -159,7 +160,7 @@ - if ( g->fai && b->core.tid >= 0 ) { - if (b->core.tid != g->tid) { // then load the sequence - free(g->ref); -- g->ref = fai_fetch(g->fai, g->h->target_name[b->core.tid], &g->len); -+ g->ref = fai_fetch64(g->fai, sam_hdr_tid2name(g->h, b->core.tid), &g->len); - g->tid = b->core.tid; - } - sam_prob_realn(b, g->ref, g->len, 1<<1|1); -@@ -171,7 +172,8 @@ - - int main_cut_target(int argc, char *argv[]) - { -- int c, tid, pos, n, lasttid = -1, l, max_l, usage = 0; -+ int c, tid, pos, n, lasttid = -1, usage = 0; -+ hts_pos_t l, max_l; - const bam_pileup1_t *p; - bam_plp_t plp; - uint16_t *cns; -@@ -203,7 +205,7 @@ - } - if (usage || argc == optind) { - fprintf(samtools_stderr, "Usage: samtools targetcut [-Q minQ] [-i 
inPen] [-0 em0] [-1 em1] [-2 em2] \n"); -- sam_global_opt_help(samtools_stderr, "-.--f-"); -+ sam_global_opt_help(samtools_stderr, "-.--f--."); - return 1; - } - l = max_l = 0; cns = 0; -@@ -225,12 +227,12 @@ - if (tid < 0) break; - if (tid != lasttid) { // change of chromosome - if (cns) process_cns(g.h, lasttid, l, cns); -- if (max_l < g.h->target_len[tid]) { -- max_l = g.h->target_len[tid]; -+ if (max_l < sam_hdr_tid2len(g.h, tid)) { -+ max_l = sam_hdr_tid2len(g.h, tid); - kroundup32(max_l); - cns = realloc(cns, max_l * 2); - } -- l = g.h->target_len[tid]; -+ l = sam_hdr_tid2len(g.h, tid); - memset(cns, 0, max_l * 2); - lasttid = tid; - } -@@ -238,7 +240,7 @@ - } - process_cns(g.h, lasttid, l, cns); - free(cns); -- bam_hdr_destroy(g.h); -+ sam_hdr_destroy(g.h); - bam_plp_destroy(plp); - sam_close(g.fp); - if (g.fai) { ---- python-pysam.orig/samtools/dict.c -+++ python-pysam/samtools/dict.c -@@ -98,6 +98,7 @@ - hts_md5_destroy(md5); - - if (args->output_fname) fclose(out); -+ gzclose(fp); - } - - static int dict_usage(void) ---- python-pysam.orig/samtools/dict.c.pysam.c -+++ python-pysam/samtools/dict.c.pysam.c -@@ -100,6 +100,7 @@ - hts_md5_destroy(md5); - - if (args->output_fname) fclose(out); -+ gzclose(fp); - } - - static int dict_usage(void) ---- python-pysam.orig/samtools/faidx.c -+++ python-pysam/samtools/faidx.c -@@ -1,6 +1,6 @@ - /* faidx.c -- faidx subcommand. - -- Copyright (C) 2008, 2009, 2013, 2016, 2018 Genome Research Ltd. -+ Copyright (C) 2008, 2009, 2013, 2016, 2018-2019 Genome Research Ltd. - Portions copyright (C) 2011 Broad Institute. 
- - Author: Heng Li -@@ -67,9 +67,9 @@ - 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, - }; - --static void reverse_complement(char *str, int len) { -+static void reverse_complement(char *str, const hts_pos_t len) { - char c; -- int i = 0, j = len - 1; -+ hts_pos_t i = 0, j = len - 1; - - while (i <= j) { - c = str[i]; -@@ -80,10 +80,9 @@ - } - } - -- --static void reverse(char *str, int len) { -+static void reverse(char *str, const hts_pos_t len) { - char c; -- int i = 0, j = len - 1; -+ hts_pos_t i = 0, j = len - 1; - - while (i < j) { - c = str[i]; -@@ -95,9 +94,10 @@ - } - - --static int write_line(FILE *file, const char *line, const char *name, const int ignore, -- const int length, const int seq_len) { -- int beg, end; -+static int write_line(faidx_t *faid, FILE *file, const char *line, const char *name, -+ const int ignore, const int length, const hts_pos_t seq_len) { -+ int id; -+ hts_pos_t beg, end; - - if (seq_len < 0) { - fprintf(stderr, "[faidx] Failed to fetch sequence in %s\n", name); -@@ -109,15 +109,16 @@ - } - } else if (seq_len == 0) { - fprintf(stderr, "[faidx] Zero length sequence: %s\n", name); -- } else if (hts_parse_reg(name, &beg, &end) && (end < INT_MAX) && (seq_len != end - beg)) { -+ } else if (fai_parse_region(faid, name, &id, &beg, &end, 0) -+ && (end < INT_MAX) && (seq_len != end - beg)) { - fprintf(stderr, "[faidx] Truncated sequence: %s\n", name); - } - -- size_t i, seq_sz = seq_len; -+ hts_pos_t i, seq_sz = seq_len; - - for (i = 0; i < seq_sz; i += length) - { -- size_t len = i + length < seq_sz ? length : seq_sz - i; -+ hts_pos_t len = i + length < seq_sz ? 
length : seq_sz - i; - if (fwrite(line + i, 1, len, file) < len || - fputc('\n', file) == EOF) { - print_error_errno("faidx", "failed to write output"); -@@ -133,8 +134,8 @@ - const int length, const int rev, - const char *pos_strand_name, const char *neg_strand_name, - enum fai_format_options format) { -- int seq_len; -- char *seq = fai_fetch(faid, name, &seq_len); -+ hts_pos_t seq_len; -+ char *seq = fai_fetch64(faid, name, &seq_len); - - if (format == FAI_FASTA) { - fprintf(file, ">%s%s\n", name, rev ? neg_strand_name : pos_strand_name); -@@ -146,7 +147,8 @@ - reverse_complement(seq, seq_len); - } - -- if (write_line(file, seq, name, ignore, length, seq_len) == EXIT_FAILURE) { -+ if (write_line(faid, file, seq, name, ignore, length, seq_len) -+ == EXIT_FAILURE) { - free(seq); - return EXIT_FAILURE; - } -@@ -156,14 +158,15 @@ - if (format == FAI_FASTQ) { - fprintf(file, "+\n"); - -- char *qual = fai_fetchqual(faid, name, &seq_len); -+ char *qual = fai_fetchqual64(faid, name, &seq_len); - - if (rev && seq_len > 0) { - reverse(qual, seq_len); - } - -- if (write_line(file, qual, name, ignore, length, seq_len) == EXIT_FAILURE) { -- free(seq); -+ if (write_line(faid, file, qual, name, ignore, length, seq_len) -+ == EXIT_FAILURE) { -+ free(qual); - return EXIT_FAILURE; - } - ---- python-pysam.orig/samtools/faidx.c.pysam.c -+++ python-pysam/samtools/faidx.c.pysam.c -@@ -2,7 +2,7 @@ - - /* faidx.c -- faidx subcommand. - -- Copyright (C) 2008, 2009, 2013, 2016, 2018 Genome Research Ltd. -+ Copyright (C) 2008, 2009, 2013, 2016, 2018-2019 Genome Research Ltd. - Portions copyright (C) 2011 Broad Institute. 
- - Author: Heng Li -@@ -69,9 +69,9 @@ - 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, - }; - --static void reverse_complement(char *str, int len) { -+static void reverse_complement(char *str, const hts_pos_t len) { - char c; -- int i = 0, j = len - 1; -+ hts_pos_t i = 0, j = len - 1; - - while (i <= j) { - c = str[i]; -@@ -82,10 +82,9 @@ - } - } - -- --static void reverse(char *str, int len) { -+static void reverse(char *str, const hts_pos_t len) { - char c; -- int i = 0, j = len - 1; -+ hts_pos_t i = 0, j = len - 1; - - while (i < j) { - c = str[i]; -@@ -97,9 +96,10 @@ - } - - --static int write_line(FILE *file, const char *line, const char *name, const int ignore, -- const int length, const int seq_len) { -- int beg, end; -+static int write_line(faidx_t *faid, FILE *file, const char *line, const char *name, -+ const int ignore, const int length, const hts_pos_t seq_len) { -+ int id; -+ hts_pos_t beg, end; - - if (seq_len < 0) { - fprintf(samtools_stderr, "[faidx] Failed to fetch sequence in %s\n", name); -@@ -111,15 +111,16 @@ - } - } else if (seq_len == 0) { - fprintf(samtools_stderr, "[faidx] Zero length sequence: %s\n", name); -- } else if (hts_parse_reg(name, &beg, &end) && (end < INT_MAX) && (seq_len != end - beg)) { -+ } else if (fai_parse_region(faid, name, &id, &beg, &end, 0) -+ && (end < INT_MAX) && (seq_len != end - beg)) { - fprintf(samtools_stderr, "[faidx] Truncated sequence: %s\n", name); - } - -- size_t i, seq_sz = seq_len; -+ hts_pos_t i, seq_sz = seq_len; - - for (i = 0; i < seq_sz; i += length) - { -- size_t len = i + length < seq_sz ? length : seq_sz - i; -+ hts_pos_t len = i + length < seq_sz ? 
length : seq_sz - i; - if (fwrite(line + i, 1, len, file) < len || - fputc('\n', file) == EOF) { - print_error_errno("faidx", "failed to write output"); -@@ -135,8 +136,8 @@ - const int length, const int rev, - const char *pos_strand_name, const char *neg_strand_name, - enum fai_format_options format) { -- int seq_len; -- char *seq = fai_fetch(faid, name, &seq_len); -+ hts_pos_t seq_len; -+ char *seq = fai_fetch64(faid, name, &seq_len); - - if (format == FAI_FASTA) { - fprintf(file, ">%s%s\n", name, rev ? neg_strand_name : pos_strand_name); -@@ -148,7 +149,8 @@ - reverse_complement(seq, seq_len); - } - -- if (write_line(file, seq, name, ignore, length, seq_len) == EXIT_FAILURE) { -+ if (write_line(faid, file, seq, name, ignore, length, seq_len) -+ == EXIT_FAILURE) { - free(seq); - return EXIT_FAILURE; - } -@@ -158,14 +160,15 @@ - if (format == FAI_FASTQ) { - fprintf(file, "+\n"); - -- char *qual = fai_fetchqual(faid, name, &seq_len); -+ char *qual = fai_fetchqual64(faid, name, &seq_len); - - if (rev && seq_len > 0) { - reverse(qual, seq_len); - } - -- if (write_line(file, qual, name, ignore, length, seq_len) == EXIT_FAILURE) { -- free(seq); -+ if (write_line(faid, file, qual, name, ignore, length, seq_len) -+ == EXIT_FAILURE) { -+ free(qual); - return EXIT_FAILURE; - } - ---- python-pysam.orig/samtools/htslib-1.9/LICENSE -+++ /dev/null -@@ -1,69 +0,0 @@ --[Files in this distribution outwith the cram/ subdirectory are distributed --according to the terms of the following MIT/Expat license.] -- --The MIT/Expat License -- --Copyright (C) 2012-2018 Genome Research Ltd. 
-- --Permission is hereby granted, free of charge, to any person obtaining a copy --of this software and associated documentation files (the "Software"), to deal --in the Software without restriction, including without limitation the rights --to use, copy, modify, merge, publish, distribute, sublicense, and/or sell --copies of the Software, and to permit persons to whom the Software is --furnished to do so, subject to the following conditions: -- --The above copyright notice and this permission notice shall be included in --all copies or substantial portions of the Software. -- --THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR --IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, --FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL --THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER --LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING --FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER --DEALINGS IN THE SOFTWARE. -- -- --[Files within the cram/ subdirectory in this distribution are distributed --according to the terms of the following Modified 3-Clause BSD license.] -- --The Modified-BSD License -- --Copyright (C) 2012-2018 Genome Research Ltd. -- --Redistribution and use in source and binary forms, with or without --modification, are permitted provided that the following conditions are met: -- --1. Redistributions of source code must retain the above copyright notice, -- this list of conditions and the following disclaimer. -- --2. Redistributions in binary form must reproduce the above copyright notice, -- this list of conditions and the following disclaimer in the documentation -- and/or other materials provided with the distribution. -- --3. 
Neither the names Genome Research Ltd and Wellcome Trust Sanger Institute -- nor the names of its contributors may be used to endorse or promote products -- derived from this software without specific prior written permission. -- --THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" --AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE --IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE --DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR ITS CONTRIBUTORS BE LIABLE --FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL --DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR --SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER --CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, --OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE --OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -- -- --[The use of a range of years within a copyright notice in this distribution --should be interpreted as being equivalent to a list of years including the --first and last year specified and all consecutive years between them. -- --For example, a copyright notice that reads "Copyright (C) 2005, 2007-2009, --2011-2012" should be interpreted as being identical to a notice that reads --"Copyright (C) 2005, 2007, 2008, 2009, 2011, 2012" and a copyright notice --that reads "Copyright (C) 2005-2012" should be interpreted as being identical --to a notice that reads "Copyright (C) 2005, 2006, 2007, 2008, 2009, 2010, --2011, 2012".] ---- python-pysam.orig/samtools/htslib-1.9/README -+++ /dev/null -@@ -1,5 +0,0 @@ --HTSlib is an implementation of a unified C library for accessing common file --formats, such as SAM, CRAM, VCF, and BCF, used for high-throughput sequencing --data. It is the core library used by samtools and bcftools. 
-- --See INSTALL for building and installation instructions. ---- python-pysam.orig/samtools/misc/ace2sam.c -+++ python-pysam/samtools/misc/ace2sam.c -@@ -93,7 +93,8 @@ - s.l = s.m = 0; s.s = 0; - af_n = af_max = af_i = 0; af = 0; - for (i = 0; i < N_TMPSTR; ++i) t[i].l = t[i].m = 0, t[i].s = 0; -- fp = strcmp(argv[1], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r"); -+ fp = strcmp(argv[optind], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r"); -+ if (fp == NULL) fatal("can't open input file"); - ks = ks_init(fp); - while (ks_getuntil(ks, 0, &s, &dret) >= 0) { - if (strcmp(s.s, "CO") == 0) { // contig sequence ---- python-pysam.orig/samtools/misc/ace2sam.c.pysam.c -+++ python-pysam/samtools/misc/ace2sam.c.pysam.c -@@ -95,7 +95,8 @@ - s.l = s.m = 0; s.s = 0; - af_n = af_max = af_i = 0; af = 0; - for (i = 0; i < N_TMPSTR; ++i) t[i].l = t[i].m = 0, t[i].s = 0; -- fp = strcmp(argv[1], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r"); -+ fp = strcmp(argv[optind], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r"); -+ if (fp == NULL) fatal("can't open input file"); - ks = ks_init(fp); - while (ks_getuntil(ks, 0, &s, &dret) >= 0) { - if (strcmp(s.s, "CO") == 0) { // contig sequence ---- python-pysam.orig/samtools/padding.c -+++ python-pysam/samtools/padding.c -@@ -1,7 +1,7 @@ - /* padding.c -- depad subcommand. - - Copyright (C) 2011, 2012 Broad Institute. -- Copyright (C) 2014-2016 Genome Research Ltd. -+ Copyright (C) 2014-2016, 2019 Genome Research Ltd. - Portions copyright (C) 2012, 2013 Peter Cock, The James Hutton Institute. - - Author: Heng Li -@@ -29,10 +29,10 @@ - #include - #include - #include -+#include - #include - #include - #include --#include "sam_header.h" - #include "sam_opts.h" - #include "samtools.h" - -@@ -62,6 +62,10 @@ - if (_n == _m) { \ - _m = _m? 
_m<<1 : 4; \ - _c = (uint32_t*)realloc(_c, _m * 4); \ -+ if (!(_c)) { \ -+ fprintf(stderr, "[depad] ERROR: Memory allocation failure.\n"); \ -+ return -1; \ -+ } \ - } \ - _c[_n++] = (_v); \ - } while (0) -@@ -107,15 +111,15 @@ - return length != s->l; - } - --int load_unpadded_ref(faidx_t *fai, char *ref_name, int ref_len, kstring_t *seq) -+int load_unpadded_ref(faidx_t *fai, const char *ref_name, hts_pos_t ref_len, kstring_t *seq) - { - char base; - char *fai_ref = 0; -- int fai_ref_len = 0, k; -+ hts_pos_t fai_ref_len = 0, k; - -- fai_ref = fai_fetch(fai, ref_name, &fai_ref_len); -+ fai_ref = fai_fetch64(fai, ref_name, &fai_ref_len); - if (fai_ref_len != ref_len) { -- fprintf(stderr, "[depad] ERROR: FASTA sequence %s length %i, expected %i\n", ref_name, fai_ref_len, ref_len); -+ fprintf(stderr, "[depad] ERROR: FASTA sequence %s length %"PRIhts_pos", expected %"PRIhts_pos"\n", ref_name, fai_ref_len, ref_len); - free(fai_ref); - return -1; - } -@@ -141,16 +145,16 @@ - return 0; - } - --int get_unpadded_len(faidx_t *fai, char *ref_name, int padded_len) -+hts_pos_t get_unpadded_len(faidx_t *fai, const char *ref_name, hts_pos_t padded_len) - { - char base; - char *fai_ref = 0; -- int fai_ref_len = 0, k; -- int bases=0, gaps=0; -+ hts_pos_t fai_ref_len = 0, k; -+ hts_pos_t bases=0, gaps=0; - -- fai_ref = fai_fetch(fai, ref_name, &fai_ref_len); -+ fai_ref = fai_fetch64(fai, ref_name, &fai_ref_len); - if (fai_ref_len != padded_len) { -- fprintf(stderr, "[depad] ERROR: FASTA sequence '%s' length %i, expected %i\n", ref_name, fai_ref_len, padded_len); -+ fprintf(stderr, "[depad] ERROR: FASTA sequence '%s' length %"PRIhts_pos", expected %"PRIhts_pos"\n", ref_name, fai_ref_len, padded_len); - free(fai_ref); - return -1; - } -@@ -185,7 +189,7 @@ - return posmap; - } - --int bam_pad2unpad(samFile *in, samFile *out, bam_hdr_t *h, faidx_t *fai) -+int bam_pad2unpad(samFile *in, samFile *out, sam_hdr_t *h, faidx_t *fai) - { - bam1_t *b = 0; - kstring_t r, q; -@@ -207,21 +211,21 
@@ - - uint32_t *cigar = bam_get_cigar(b); - n2 = 0; -- if (b->core.pos == 0 && b->core.tid >= 0 && strcmp(bam_get_qname(b), h->target_name[b->core.tid]) == 0) { -+ if (b->core.pos == 0 && b->core.tid >= 0 && strcmp(bam_get_qname(b), sam_hdr_tid2name(h, b->core.tid)) == 0) { - // fprintf(stderr, "[depad] Found embedded reference '%s'\n", bam_get_qname(b)); - r_tid = b->core.tid; - if (0!=unpad_seq(b, &r)) { - fprintf(stderr, "[depad] ERROR: Problem parsing SEQ and/or CIGAR in reference %s\n", bam_get_qname(b)); - return -1; - }; -- if (h->target_len[r_tid] != r.l) { -- fprintf(stderr, "[depad] ERROR: (Padded) length of '%s' is %u in BAM header, but %llu in embedded reference\n", bam_get_qname(b), h->target_len[r_tid], (unsigned long long)(r.l)); -+ if (sam_hdr_tid2len(h, r_tid) != r.l) { -+ fprintf(stderr, "[depad] ERROR: (Padded) length of '%s' is %"PRId64" in BAM header, but %zu in embedded reference\n", bam_get_qname(b), (int64_t) sam_hdr_tid2len(h, r_tid), r.l); - return -1; - } - if (fai) { - // Check the embedded reference matches the FASTA file -- if (load_unpadded_ref(fai, h->target_name[b->core.tid], h->target_len[b->core.tid], &q)) { -- fprintf(stderr, "[depad] ERROR: Failed to load embedded reference '%s' from FASTA\n", h->target_name[b->core.tid]); -+ if (load_unpadded_ref(fai, sam_hdr_tid2name(h, b->core.tid), sam_hdr_tid2len(h, b->core.tid), &q)) { -+ fprintf(stderr, "[depad] ERROR: Failed to load embedded reference '%s' from FASTA\n", sam_hdr_tid2name(h, b->core.tid)); - return -1; - } - assert(r.l == q.l); -@@ -230,7 +234,7 @@ - if (r.s[i] != q.s[i]) { - // Show gaps as ASCII 45 - fprintf(stderr, "[depad] ERROR: Embedded sequence and reference FASTA don't match for %s base %i, '%c' vs '%c'\n", -- h->target_name[b->core.tid], i+1, -+ sam_hdr_tid2name(h, b->core.tid), i+1, - r.s[i] ? seq_nt16_str[(int)r.s[i]] : 45, - q.s[i] ? 
seq_nt16_str[(int)q.s[i]] : 45); - return -1; -@@ -249,15 +253,15 @@ - ; // good case, reference available - //fprintf(stderr, "[depad] Have ref '%s' for read '%s'\n", h->target_name[b->core.tid], bam_get_qname(b)); - } else if (fai) { -- if (load_unpadded_ref(fai, h->target_name[b->core.tid], h->target_len[b->core.tid], &r)) { -- fprintf(stderr, "[depad] ERROR: Failed to load '%s' from reference FASTA\n", h->target_name[b->core.tid]); -+ if (load_unpadded_ref(fai, sam_hdr_tid2name(h, b->core.tid), sam_hdr_tid2len(h, b->core.tid), &r)) { -+ fprintf(stderr, "[depad] ERROR: Failed to load '%s' from reference FASTA\n", sam_hdr_tid2name(h, b->core.tid)); - return -1; - } - posmap = update_posmap(posmap, r); - r_tid = b->core.tid; - // fprintf(stderr, "[depad] Loaded %s from FASTA file\n", h->target_name[b->core.tid]); - } else { -- fprintf(stderr, "[depad] ERROR: Missing %s embedded reference sequence (and no FASTA file)\n", h->target_name[b->core.tid]); -+ fprintf(stderr, "[depad] ERROR: Missing %s embedded reference sequence (and no FASTA file)\n", sam_hdr_tid2name(h, b->core.tid)); - return -1; - } - if (0!=unpad_seq(b, &q)) { -@@ -343,19 +347,19 @@ - /* Nasty case, Must load alternative posmap */ - // fprintf(stderr, "[depad] Loading reference '%s' temporarily\n", h->target_name[b->core.mtid]); - if (!fai) { -- fprintf(stderr, "[depad] ERROR: Needed reference %s sequence for mate (and no FASTA file)\n", h->target_name[b->core.mtid]); -+ fprintf(stderr, "[depad] ERROR: Needed reference %s sequence for mate (and no FASTA file)\n", sam_hdr_tid2name(h, b->core.mtid)); - return -1; - } - /* Temporarily load the other reference sequence */ -- if (load_unpadded_ref(fai, h->target_name[b->core.mtid], h->target_len[b->core.mtid], &r)) { -- fprintf(stderr, "[depad] ERROR: Failed to load '%s' from reference FASTA\n", h->target_name[b->core.mtid]); -+ if (load_unpadded_ref(fai, sam_hdr_tid2name(h, b->core.mtid), sam_hdr_tid2len(h, b->core.mtid), &r)) { -+ fprintf(stderr, 
"[depad] ERROR: Failed to load '%s' from reference FASTA\n", sam_hdr_tid2name(h, b->core.mtid)); - return -1; - } - posmap = update_posmap(posmap, r); - b->core.mpos = posmap[b->core.mpos]; - /* Restore the reference and posmap*/ -- if (load_unpadded_ref(fai, h->target_name[b->core.tid], h->target_len[b->core.tid], &r)) { -- fprintf(stderr, "[depad] ERROR: Failed to load '%s' from reference FASTA\n", h->target_name[b->core.tid]); -+ if (load_unpadded_ref(fai, sam_hdr_tid2name(h, b->core.tid), sam_hdr_tid2len(h, b->core.tid), &r)) { -+ fprintf(stderr, "[depad] ERROR: Failed to load '%s' from reference FASTA\n", sam_hdr_tid2name(h, b->core.tid)); - return -1; - } - posmap = update_posmap(posmap, r); -@@ -374,126 +378,47 @@ - ret = 1; - } - free(r.s); free(q.s); free(posmap); -+ free(cigar2); - bam_destroy1(b); - return ret; - } - --bam_hdr_t * fix_header(bam_hdr_t *old, faidx_t *fai) -+sam_hdr_t * fix_header(sam_hdr_t *old, faidx_t *fai) - { -- int i = 0, unpadded_len = 0; -- bam_hdr_t *header = 0 ; -- unsigned short ln_found; -- -- header = bam_hdr_dup(old); -- for (i = 0; i < old->n_targets; ++i) { -- unpadded_len = get_unpadded_len(fai, old->target_name[i], old->target_len[i]); -+ int i = 0, ret = 0; -+ hts_pos_t unpadded_len = 0; -+ sam_hdr_t *header = sam_hdr_dup(old); -+ if (!header) -+ return NULL; -+ -+ int nref = sam_hdr_nref(old); -+ char len_buf[64]; -+ -+ for (i = 0; i < nref; ++i) { -+ unpadded_len = get_unpadded_len(fai, sam_hdr_tid2name(old, i), sam_hdr_tid2len(old, i)); - if (unpadded_len < 0) { -- fprintf(stderr, "[depad] ERROR getting unpadded length of '%s', padded length %i\n", old->target_name[i], old->target_len[i]); -+ fprintf(stderr, "[depad] ERROR getting unpadded length of '%s', padded length %"PRIhts_pos"\n", sam_hdr_tid2name(old, i), (hts_pos_t) sam_hdr_tid2len(old, i)); -+ } else if (unpadded_len > sam_hdr_tid2len(old, i)) { -+ fprintf(stderr, "[depad] New unpadded length of '%s' is larger than the padded length (%"PRIhts_pos" > 
%"PRIhts_pos")\n", -+ sam_hdr_tid2name(old, i), unpadded_len, -+ (hts_pos_t) sam_hdr_tid2len(old, i)); -+ ret = 1; - } else { -- header->target_len[i] = unpadded_len; -+ sprintf(len_buf, "%"PRIhts_pos"", unpadded_len); -+ if ((ret |= sam_hdr_update_line(header, "SQ", "SN", sam_hdr_tid2name(header, i), "LN", len_buf, NULL))) -+ fprintf(stderr, "[depad] Error updating length of '%s' from %"PRIhts_pos" to %"PRIhts_pos"\n", -+ sam_hdr_tid2name(header, i), -+ (hts_pos_t) sam_hdr_tid2len(header, i), -+ unpadded_len); - //fprintf(stderr, "[depad] Recalculating '%s' length %i -> %i\n", old->target_name[i], old->target_len[i], header->target_len[i]); - } - } -- /* Duplicating the header allocated new buffer for header string */ -- /* After modifying the @SQ lines it will only get smaller, since */ -- /* the LN entries will be the same or shorter, and we'll remove */ -- /* any MD entries (MD5 checksums). */ -- assert(strlen(old->text) == strlen(header->text)); -- assert (0==strcmp(old->text, header->text)); -- const char *text; -- text = old->text; -- header->text[0] = '\0'; /* Resuse the allocated buffer */ -- char * newtext = header->text; -- char * end=NULL; -- while (text[0]=='@') { -- end = strchr(text, '\n'); -- assert(end != 0); -- if (text[1]=='S' && text[2]=='Q' && text[3]=='\t') { -- const char *cp = text+3; -- char *name = strstr(text, "\tSN:"); -- char *name_end; -- if (!name) { -- fprintf(stderr, "Unable to find SN: header field\n"); -- return NULL; -- } -- name += 4; -- for (name_end = name; name_end != end && *name_end != '\t'; name_end++); -- strcat(newtext, "@SQ"); -- ln_found = 0; -- -- /* Parse the @SQ lines */ -- while (cp != end) { -- if (!ln_found && end-cp >= 2 && strncmp(cp, "LN", 2) == 0) { -- // Rewrite the length -- char len_buf[100]; -- int tid; -- unsigned int old_length, new_length; -- const char *old_cp = cp; -- -- ln_found = 1; -- -- while (cp != end && *cp++ != '\t'); -- old_length = (int)(cp - old_cp); -- -- for (tid = 0; tid < 
header->n_targets; tid++) { -- // may want to hash this, but new header API incoming. -- if (strncmp(name, header->target_name[tid], name_end - name) == 0) { -- new_length = sprintf(len_buf, "LN:%d", header->target_len[tid]); -- if (new_length <= old_length) { -- strcat(newtext, len_buf); -- } -- else { -- fprintf(stderr, "LN value of the reference is larger than the original!\n"); -- exit(1); -- } -- break; -- } -- } - -- if (cp != end) -- strcat(newtext, "\t"); -- } else if (end-cp >= 2 && -- ((ln_found && strncmp(cp, "LN", 2) == 0) || -- strncmp(cp, "M5", 2) == 0 || -- strncmp(cp, "UR", 2) == 0)) -- { -- // skip secondary LNs -- // MD5 changed during depadding; ditch it. -- // URLs are also invalid. -- while (cp != end && *cp++ != '\t'); -- } else { -- // Otherwise copy this sub-field verbatim -- const char *cp_start = cp; -- while (cp != end && *cp++ != '\t'); -- strncat(newtext, cp_start, cp-cp_start); -- } -- } -- -- // Add newline, replacing trailing '\t' if last on line was the LN: -- char *text_end = newtext + strlen(newtext); -- if (text_end[-1] == '\t') -- text_end[-1] = '\n'; -- else -- *text_end++ = '\n', *text_end = '\0'; -- } else { -- /* Copy this line to the new header */ -- strncat(newtext, text, end - text + 1); -- } -- text = end + 1; -+ if (ret) { -+ sam_hdr_destroy(header); -+ return NULL; - } -- assert (text[0]=='\0'); -- /* Check we didn't overflow the buffer */ -- assert (strlen(header->text) <= strlen(old->text)); -- if (strlen(header->text) < header->l_text) { -- //fprintf(stderr, "[depad] Reallocating header buffer\n"); -- assert (newtext == header->text); -- newtext = malloc(strlen(header->text) + 1); -- strcpy(newtext, header->text); -- free(header->text); -- header->text = newtext; -- header->l_text = strlen(newtext); -- } -- //fprintf(stderr, "[depad] Here is the new header (pending @SQ lines),\n\n%s\n(end)\n", header->text); -+ - return header; - } - -@@ -502,15 +427,17 @@ - int main_pad2unpad(int argc, char *argv[]) - { - samFile 
*in = 0, *out = 0; -- bam_hdr_t *h = 0, *h_fix = 0; -+ sam_hdr_t *h = 0, *h_fix = 0; - faidx_t *fai = 0; -- int c, compress_level = -1, is_long_help = 0; -- char in_mode[5], out_mode[6], *fn_out = 0, *fn_list = 0; -+ int c, compress_level = -1, is_long_help = 0, no_pg = 0; -+ char in_mode[5], out_mode[6], *fn_out = 0, *fn_list = 0, *fn_out_idx = NULL; - int ret=0; -+ char *arg_list = NULL; - sam_global_args ga = SAM_GLOBAL_ARGS_INIT; - - static const struct option lopts[] = { - SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 'T', '-'), -+ {"no-PG", no_argument, NULL, 1}, - { NULL, 0, NULL, 0 } - }; - -@@ -532,6 +459,7 @@ - if (ga.out.format == unknown_format) - hts_parse_format(&ga.out, "bam"); - break; -+ case 1: no_pg = 1; break; - case '?': is_long_help = 1; break; - default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; - fprintf(stderr, "[bam_fillmd] unrecognized option '-%c'\n\n", c); -@@ -569,7 +497,11 @@ - goto depad_end; - } - if (fai) { -- h_fix = fix_header(h, fai); -+ if (!(h_fix = fix_header(h, fai))){ -+ fprintf(stderr, "[depad] failed to fix the header from\n"); -+ ret = 1; -+ goto depad_end; -+ } - } else { - fprintf(stderr, "[depad] Warning - reference lengths will not be corrected without FASTA reference\n"); - h_fix = h; -@@ -587,25 +519,61 @@ - if (ga.out.format == cram) - hts_set_opt(out, CRAM_OPT_NO_REF, 1); - -+ if (!no_pg) { -+ if(!(arg_list = stringify_argv(argc+1, argv-1))) { -+ fprintf(stderr, "[depad] failed to create arg_list\n"); -+ ret = 1; -+ goto depad_end; -+ } -+ -+ if (sam_hdr_add_pg(h_fix, "samtools", -+ "VN", samtools_version(), -+ arg_list ? "CL": NULL, -+ arg_list ? 
arg_list : NULL, -+ NULL)) { -+ fprintf(stderr, "[depad] failed to add PG line to header\n"); -+ ret = 1; -+ goto depad_end; -+ } -+ } -+ - if (sam_hdr_write(out, h_fix) != 0) { - fprintf(stderr, "[depad] failed to write header.\n"); - ret = 1; - goto depad_end; - } -+ if (ga.write_index) { -+ if (!(fn_out_idx = auto_index(out, fn_out, h_fix))) { -+ ret = 1; -+ goto depad_end; -+ } -+ } - - // Do the depad - if (bam_pad2unpad(in, out, h, fai) != 0) ret = 1; - -+ if (ga.write_index) { -+ if (sam_idx_save(out) < 0) { -+ print_error_errno("depad", "writing index failed"); -+ ret = 1; -+ } -+ } -+ - depad_end: - // close files, free and return -+ free(arg_list); - if (fai) fai_destroy(fai); -- if (h) bam_hdr_destroy(h); -+ if (h) sam_hdr_destroy(h); -+ if (h_fix && h_fix != h) sam_hdr_destroy(h_fix); - if (in) sam_close(in); - if (out && sam_close(out) < 0) { - fprintf(stderr, "[depad] error on closing output file.\n"); - ret = 1; - } - free(fn_list); free(fn_out); -+ if (fn_out_idx) -+ free(fn_out_idx); -+ sam_global_args_free(&ga); - return ret; - } - -@@ -621,8 +589,9 @@ - fprintf(stderr, " -T, --reference FILE\n"); - fprintf(stderr, " Padded reference sequence file [null]\n"); - fprintf(stderr, " -o FILE Output file name [stdout]\n"); -+ fprintf(stderr, " --no-PG do not add a PG line\n"); - fprintf(stderr, " -? Longer help\n"); -- sam_global_opt_help(stderr, "-...--"); -+ sam_global_opt_help(stderr, "-...--.."); - - if (is_long_help) - fprintf(stderr, ---- python-pysam.orig/samtools/padding.c.pysam.c -+++ python-pysam/samtools/padding.c.pysam.c -@@ -3,7 +3,7 @@ - /* padding.c -- depad subcommand. - - Copyright (C) 2011, 2012 Broad Institute. -- Copyright (C) 2014-2016 Genome Research Ltd. -+ Copyright (C) 2014-2016, 2019 Genome Research Ltd. - Portions copyright (C) 2012, 2013 Peter Cock, The James Hutton Institute. 
- - Author: Heng Li -@@ -31,10 +31,10 @@ - #include - #include - #include -+#include - #include - #include - #include --#include "sam_header.h" - #include "sam_opts.h" - #include "samtools.h" - -@@ -64,6 +64,10 @@ - if (_n == _m) { \ - _m = _m? _m<<1 : 4; \ - _c = (uint32_t*)realloc(_c, _m * 4); \ -+ if (!(_c)) { \ -+ fprintf(samtools_stderr, "[depad] ERROR: Memory allocation failure.\n"); \ -+ return -1; \ -+ } \ - } \ - _c[_n++] = (_v); \ - } while (0) -@@ -109,15 +113,15 @@ - return length != s->l; - } - --int load_unpadded_ref(faidx_t *fai, char *ref_name, int ref_len, kstring_t *seq) -+int load_unpadded_ref(faidx_t *fai, const char *ref_name, hts_pos_t ref_len, kstring_t *seq) - { - char base; - char *fai_ref = 0; -- int fai_ref_len = 0, k; -+ hts_pos_t fai_ref_len = 0, k; - -- fai_ref = fai_fetch(fai, ref_name, &fai_ref_len); -+ fai_ref = fai_fetch64(fai, ref_name, &fai_ref_len); - if (fai_ref_len != ref_len) { -- fprintf(samtools_stderr, "[depad] ERROR: FASTA sequence %s length %i, expected %i\n", ref_name, fai_ref_len, ref_len); -+ fprintf(samtools_stderr, "[depad] ERROR: FASTA sequence %s length %"PRIhts_pos", expected %"PRIhts_pos"\n", ref_name, fai_ref_len, ref_len); - free(fai_ref); - return -1; - } -@@ -143,16 +147,16 @@ - return 0; - } - --int get_unpadded_len(faidx_t *fai, char *ref_name, int padded_len) -+hts_pos_t get_unpadded_len(faidx_t *fai, const char *ref_name, hts_pos_t padded_len) - { - char base; - char *fai_ref = 0; -- int fai_ref_len = 0, k; -- int bases=0, gaps=0; -+ hts_pos_t fai_ref_len = 0, k; -+ hts_pos_t bases=0, gaps=0; - -- fai_ref = fai_fetch(fai, ref_name, &fai_ref_len); -+ fai_ref = fai_fetch64(fai, ref_name, &fai_ref_len); - if (fai_ref_len != padded_len) { -- fprintf(samtools_stderr, "[depad] ERROR: FASTA sequence '%s' length %i, expected %i\n", ref_name, fai_ref_len, padded_len); -+ fprintf(samtools_stderr, "[depad] ERROR: FASTA sequence '%s' length %"PRIhts_pos", expected %"PRIhts_pos"\n", ref_name, fai_ref_len, 
padded_len); - free(fai_ref); - return -1; - } -@@ -187,7 +191,7 @@ - return posmap; - } - --int bam_pad2unpad(samFile *in, samFile *out, bam_hdr_t *h, faidx_t *fai) -+int bam_pad2unpad(samFile *in, samFile *out, sam_hdr_t *h, faidx_t *fai) - { - bam1_t *b = 0; - kstring_t r, q; -@@ -209,21 +213,21 @@ - - uint32_t *cigar = bam_get_cigar(b); - n2 = 0; -- if (b->core.pos == 0 && b->core.tid >= 0 && strcmp(bam_get_qname(b), h->target_name[b->core.tid]) == 0) { -+ if (b->core.pos == 0 && b->core.tid >= 0 && strcmp(bam_get_qname(b), sam_hdr_tid2name(h, b->core.tid)) == 0) { - // fprintf(samtools_stderr, "[depad] Found embedded reference '%s'\n", bam_get_qname(b)); - r_tid = b->core.tid; - if (0!=unpad_seq(b, &r)) { - fprintf(samtools_stderr, "[depad] ERROR: Problem parsing SEQ and/or CIGAR in reference %s\n", bam_get_qname(b)); - return -1; - }; -- if (h->target_len[r_tid] != r.l) { -- fprintf(samtools_stderr, "[depad] ERROR: (Padded) length of '%s' is %u in BAM header, but %llu in embedded reference\n", bam_get_qname(b), h->target_len[r_tid], (unsigned long long)(r.l)); -+ if (sam_hdr_tid2len(h, r_tid) != r.l) { -+ fprintf(samtools_stderr, "[depad] ERROR: (Padded) length of '%s' is %"PRId64" in BAM header, but %zu in embedded reference\n", bam_get_qname(b), (int64_t) sam_hdr_tid2len(h, r_tid), r.l); - return -1; - } - if (fai) { - // Check the embedded reference matches the FASTA file -- if (load_unpadded_ref(fai, h->target_name[b->core.tid], h->target_len[b->core.tid], &q)) { -- fprintf(samtools_stderr, "[depad] ERROR: Failed to load embedded reference '%s' from FASTA\n", h->target_name[b->core.tid]); -+ if (load_unpadded_ref(fai, sam_hdr_tid2name(h, b->core.tid), sam_hdr_tid2len(h, b->core.tid), &q)) { -+ fprintf(samtools_stderr, "[depad] ERROR: Failed to load embedded reference '%s' from FASTA\n", sam_hdr_tid2name(h, b->core.tid)); - return -1; - } - assert(r.l == q.l); -@@ -232,7 +236,7 @@ - if (r.s[i] != q.s[i]) { - // Show gaps as ASCII 45 - 
fprintf(samtools_stderr, "[depad] ERROR: Embedded sequence and reference FASTA don't match for %s base %i, '%c' vs '%c'\n", -- h->target_name[b->core.tid], i+1, -+ sam_hdr_tid2name(h, b->core.tid), i+1, - r.s[i] ? seq_nt16_str[(int)r.s[i]] : 45, - q.s[i] ? seq_nt16_str[(int)q.s[i]] : 45); - return -1; -@@ -251,15 +255,15 @@ - ; // good case, reference available - //fprintf(samtools_stderr, "[depad] Have ref '%s' for read '%s'\n", h->target_name[b->core.tid], bam_get_qname(b)); - } else if (fai) { -- if (load_unpadded_ref(fai, h->target_name[b->core.tid], h->target_len[b->core.tid], &r)) { -- fprintf(samtools_stderr, "[depad] ERROR: Failed to load '%s' from reference FASTA\n", h->target_name[b->core.tid]); -+ if (load_unpadded_ref(fai, sam_hdr_tid2name(h, b->core.tid), sam_hdr_tid2len(h, b->core.tid), &r)) { -+ fprintf(samtools_stderr, "[depad] ERROR: Failed to load '%s' from reference FASTA\n", sam_hdr_tid2name(h, b->core.tid)); - return -1; - } - posmap = update_posmap(posmap, r); - r_tid = b->core.tid; - // fprintf(samtools_stderr, "[depad] Loaded %s from FASTA file\n", h->target_name[b->core.tid]); - } else { -- fprintf(samtools_stderr, "[depad] ERROR: Missing %s embedded reference sequence (and no FASTA file)\n", h->target_name[b->core.tid]); -+ fprintf(samtools_stderr, "[depad] ERROR: Missing %s embedded reference sequence (and no FASTA file)\n", sam_hdr_tid2name(h, b->core.tid)); - return -1; - } - if (0!=unpad_seq(b, &q)) { -@@ -345,19 +349,19 @@ - /* Nasty case, Must load alternative posmap */ - // fprintf(samtools_stderr, "[depad] Loading reference '%s' temporarily\n", h->target_name[b->core.mtid]); - if (!fai) { -- fprintf(samtools_stderr, "[depad] ERROR: Needed reference %s sequence for mate (and no FASTA file)\n", h->target_name[b->core.mtid]); -+ fprintf(samtools_stderr, "[depad] ERROR: Needed reference %s sequence for mate (and no FASTA file)\n", sam_hdr_tid2name(h, b->core.mtid)); - return -1; - } - /* Temporarily load the other reference sequence */ 
-- if (load_unpadded_ref(fai, h->target_name[b->core.mtid], h->target_len[b->core.mtid], &r)) { -- fprintf(samtools_stderr, "[depad] ERROR: Failed to load '%s' from reference FASTA\n", h->target_name[b->core.mtid]); -+ if (load_unpadded_ref(fai, sam_hdr_tid2name(h, b->core.mtid), sam_hdr_tid2len(h, b->core.mtid), &r)) { -+ fprintf(samtools_stderr, "[depad] ERROR: Failed to load '%s' from reference FASTA\n", sam_hdr_tid2name(h, b->core.mtid)); - return -1; - } - posmap = update_posmap(posmap, r); - b->core.mpos = posmap[b->core.mpos]; - /* Restore the reference and posmap*/ -- if (load_unpadded_ref(fai, h->target_name[b->core.tid], h->target_len[b->core.tid], &r)) { -- fprintf(samtools_stderr, "[depad] ERROR: Failed to load '%s' from reference FASTA\n", h->target_name[b->core.tid]); -+ if (load_unpadded_ref(fai, sam_hdr_tid2name(h, b->core.tid), sam_hdr_tid2len(h, b->core.tid), &r)) { -+ fprintf(samtools_stderr, "[depad] ERROR: Failed to load '%s' from reference FASTA\n", sam_hdr_tid2name(h, b->core.tid)); - return -1; - } - posmap = update_posmap(posmap, r); -@@ -376,126 +380,47 @@ - ret = 1; - } - free(r.s); free(q.s); free(posmap); -+ free(cigar2); - bam_destroy1(b); - return ret; - } - --bam_hdr_t * fix_header(bam_hdr_t *old, faidx_t *fai) -+sam_hdr_t * fix_header(sam_hdr_t *old, faidx_t *fai) - { -- int i = 0, unpadded_len = 0; -- bam_hdr_t *header = 0 ; -- unsigned short ln_found; -- -- header = bam_hdr_dup(old); -- for (i = 0; i < old->n_targets; ++i) { -- unpadded_len = get_unpadded_len(fai, old->target_name[i], old->target_len[i]); -+ int i = 0, ret = 0; -+ hts_pos_t unpadded_len = 0; -+ sam_hdr_t *header = sam_hdr_dup(old); -+ if (!header) -+ return NULL; -+ -+ int nref = sam_hdr_nref(old); -+ char len_buf[64]; -+ -+ for (i = 0; i < nref; ++i) { -+ unpadded_len = get_unpadded_len(fai, sam_hdr_tid2name(old, i), sam_hdr_tid2len(old, i)); - if (unpadded_len < 0) { -- fprintf(samtools_stderr, "[depad] ERROR getting unpadded length of '%s', padded length %i\n", 
old->target_name[i], old->target_len[i]); -+ fprintf(samtools_stderr, "[depad] ERROR getting unpadded length of '%s', padded length %"PRIhts_pos"\n", sam_hdr_tid2name(old, i), (hts_pos_t) sam_hdr_tid2len(old, i)); -+ } else if (unpadded_len > sam_hdr_tid2len(old, i)) { -+ fprintf(samtools_stderr, "[depad] New unpadded length of '%s' is larger than the padded length (%"PRIhts_pos" > %"PRIhts_pos")\n", -+ sam_hdr_tid2name(old, i), unpadded_len, -+ (hts_pos_t) sam_hdr_tid2len(old, i)); -+ ret = 1; - } else { -- header->target_len[i] = unpadded_len; -+ sprintf(len_buf, "%"PRIhts_pos"", unpadded_len); -+ if ((ret |= sam_hdr_update_line(header, "SQ", "SN", sam_hdr_tid2name(header, i), "LN", len_buf, NULL))) -+ fprintf(samtools_stderr, "[depad] Error updating length of '%s' from %"PRIhts_pos" to %"PRIhts_pos"\n", -+ sam_hdr_tid2name(header, i), -+ (hts_pos_t) sam_hdr_tid2len(header, i), -+ unpadded_len); - //fprintf(samtools_stderr, "[depad] Recalculating '%s' length %i -> %i\n", old->target_name[i], old->target_len[i], header->target_len[i]); - } - } -- /* Duplicating the header allocated new buffer for header string */ -- /* After modifying the @SQ lines it will only get smaller, since */ -- /* the LN entries will be the same or shorter, and we'll remove */ -- /* any MD entries (MD5 checksums). 
*/ -- assert(strlen(old->text) == strlen(header->text)); -- assert (0==strcmp(old->text, header->text)); -- const char *text; -- text = old->text; -- header->text[0] = '\0'; /* Resuse the allocated buffer */ -- char * newtext = header->text; -- char * end=NULL; -- while (text[0]=='@') { -- end = strchr(text, '\n'); -- assert(end != 0); -- if (text[1]=='S' && text[2]=='Q' && text[3]=='\t') { -- const char *cp = text+3; -- char *name = strstr(text, "\tSN:"); -- char *name_end; -- if (!name) { -- fprintf(samtools_stderr, "Unable to find SN: header field\n"); -- return NULL; -- } -- name += 4; -- for (name_end = name; name_end != end && *name_end != '\t'; name_end++); -- strcat(newtext, "@SQ"); -- ln_found = 0; -- -- /* Parse the @SQ lines */ -- while (cp != end) { -- if (!ln_found && end-cp >= 2 && strncmp(cp, "LN", 2) == 0) { -- // Rewrite the length -- char len_buf[100]; -- int tid; -- unsigned int old_length, new_length; -- const char *old_cp = cp; -- -- ln_found = 1; -- -- while (cp != end && *cp++ != '\t'); -- old_length = (int)(cp - old_cp); -- -- for (tid = 0; tid < header->n_targets; tid++) { -- // may want to hash this, but new header API incoming. -- if (strncmp(name, header->target_name[tid], name_end - name) == 0) { -- new_length = sprintf(len_buf, "LN:%d", header->target_len[tid]); -- if (new_length <= old_length) { -- strcat(newtext, len_buf); -- } -- else { -- fprintf(samtools_stderr, "LN value of the reference is larger than the original!\n"); -- exit(1); -- } -- break; -- } -- } - -- if (cp != end) -- strcat(newtext, "\t"); -- } else if (end-cp >= 2 && -- ((ln_found && strncmp(cp, "LN", 2) == 0) || -- strncmp(cp, "M5", 2) == 0 || -- strncmp(cp, "UR", 2) == 0)) -- { -- // skip secondary LNs -- // MD5 changed during depadding; ditch it. -- // URLs are also invalid. 
-- while (cp != end && *cp++ != '\t'); -- } else { -- // Otherwise copy this sub-field verbatim -- const char *cp_start = cp; -- while (cp != end && *cp++ != '\t'); -- strncat(newtext, cp_start, cp-cp_start); -- } -- } -- -- // Add newline, replacing trailing '\t' if last on line was the LN: -- char *text_end = newtext + strlen(newtext); -- if (text_end[-1] == '\t') -- text_end[-1] = '\n'; -- else -- *text_end++ = '\n', *text_end = '\0'; -- } else { -- /* Copy this line to the new header */ -- strncat(newtext, text, end - text + 1); -- } -- text = end + 1; -+ if (ret) { -+ sam_hdr_destroy(header); -+ return NULL; - } -- assert (text[0]=='\0'); -- /* Check we didn't overflow the buffer */ -- assert (strlen(header->text) <= strlen(old->text)); -- if (strlen(header->text) < header->l_text) { -- //fprintf(samtools_stderr, "[depad] Reallocating header buffer\n"); -- assert (newtext == header->text); -- newtext = malloc(strlen(header->text) + 1); -- strcpy(newtext, header->text); -- free(header->text); -- header->text = newtext; -- header->l_text = strlen(newtext); -- } -- //fprintf(samtools_stderr, "[depad] Here is the new header (pending @SQ lines),\n\n%s\n(end)\n", header->text); -+ - return header; - } - -@@ -504,15 +429,17 @@ - int main_pad2unpad(int argc, char *argv[]) - { - samFile *in = 0, *out = 0; -- bam_hdr_t *h = 0, *h_fix = 0; -+ sam_hdr_t *h = 0, *h_fix = 0; - faidx_t *fai = 0; -- int c, compress_level = -1, is_long_help = 0; -- char in_mode[5], out_mode[6], *fn_out = 0, *fn_list = 0; -+ int c, compress_level = -1, is_long_help = 0, no_pg = 0; -+ char in_mode[5], out_mode[6], *fn_out = 0, *fn_list = 0, *fn_out_idx = NULL; - int ret=0; -+ char *arg_list = NULL; - sam_global_args ga = SAM_GLOBAL_ARGS_INIT; - - static const struct option lopts[] = { - SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 'T', '-'), -+ {"no-PG", no_argument, NULL, 1}, - { NULL, 0, NULL, 0 } - }; - -@@ -534,6 +461,7 @@ - if (ga.out.format == unknown_format) - hts_parse_format(&ga.out, "bam"); - 
break; -+ case 1: no_pg = 1; break; - case '?': is_long_help = 1; break; - default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; - fprintf(samtools_stderr, "[bam_fillmd] unrecognized option '-%c'\n\n", c); -@@ -571,7 +499,11 @@ - goto depad_end; - } - if (fai) { -- h_fix = fix_header(h, fai); -+ if (!(h_fix = fix_header(h, fai))){ -+ fprintf(samtools_stderr, "[depad] failed to fix the header from\n"); -+ ret = 1; -+ goto depad_end; -+ } - } else { - fprintf(samtools_stderr, "[depad] Warning - reference lengths will not be corrected without FASTA reference\n"); - h_fix = h; -@@ -589,25 +521,61 @@ - if (ga.out.format == cram) - hts_set_opt(out, CRAM_OPT_NO_REF, 1); - -+ if (!no_pg) { -+ if(!(arg_list = stringify_argv(argc+1, argv-1))) { -+ fprintf(samtools_stderr, "[depad] failed to create arg_list\n"); -+ ret = 1; -+ goto depad_end; -+ } -+ -+ if (sam_hdr_add_pg(h_fix, "samtools", -+ "VN", samtools_version(), -+ arg_list ? "CL": NULL, -+ arg_list ? arg_list : NULL, -+ NULL)) { -+ fprintf(samtools_stderr, "[depad] failed to add PG line to header\n"); -+ ret = 1; -+ goto depad_end; -+ } -+ } -+ - if (sam_hdr_write(out, h_fix) != 0) { - fprintf(samtools_stderr, "[depad] failed to write header.\n"); - ret = 1; - goto depad_end; - } -+ if (ga.write_index) { -+ if (!(fn_out_idx = auto_index(out, fn_out, h_fix))) { -+ ret = 1; -+ goto depad_end; -+ } -+ } - - // Do the depad - if (bam_pad2unpad(in, out, h, fai) != 0) ret = 1; - -+ if (ga.write_index) { -+ if (sam_idx_save(out) < 0) { -+ print_error_errno("depad", "writing index failed"); -+ ret = 1; -+ } -+ } -+ - depad_end: - // close files, free and return -+ free(arg_list); - if (fai) fai_destroy(fai); -- if (h) bam_hdr_destroy(h); -+ if (h) sam_hdr_destroy(h); -+ if (h_fix && h_fix != h) sam_hdr_destroy(h_fix); - if (in) sam_close(in); - if (out && sam_close(out) < 0) { - fprintf(samtools_stderr, "[depad] error on closing output file.\n"); - ret = 1; - } - free(fn_list); free(fn_out); -+ if (fn_out_idx) 
-+ free(fn_out_idx); -+ sam_global_args_free(&ga); - return ret; - } - -@@ -623,8 +591,9 @@ - fprintf(samtools_stderr, " -T, --reference FILE\n"); - fprintf(samtools_stderr, " Padded reference sequence file [null]\n"); - fprintf(samtools_stderr, " -o FILE Output file name [samtools_stdout]\n"); -+ fprintf(samtools_stderr, " --no-PG do not add a PG line\n"); - fprintf(samtools_stderr, " -? Longer help\n"); -- sam_global_opt_help(samtools_stderr, "-...--"); -+ sam_global_opt_help(samtools_stderr, "-...--.."); - - if (is_long_help) - fprintf(samtools_stderr, ---- python-pysam.orig/samtools/phase.c -+++ python-pysam/samtools/phase.c -@@ -1,7 +1,7 @@ - /* phase.c -- phase subcommand. - - Copyright (C) 2011 Broad Institute. -- Copyright (C) 2013-2016 Genome Research Ltd. -+ Copyright (C) 2013-2016, 2019 Genome Research Ltd. - - Author: Heng Li - -@@ -52,15 +52,15 @@ - - typedef struct { - // configurations, initialized in the main function -- int flag, k, min_baseQ, min_varLOD, max_depth; -+ int flag, k, min_baseQ, min_varLOD, max_depth, no_pg; - // other global variables - int vpos_shift; - samFile* fp; -- bam_hdr_t* fp_hdr; -- char *pre; -+ sam_hdr_t* fp_hdr; -+ char *pre, *arg_list; - char *out_name[3]; - samFile* out[3]; -- bam_hdr_t* out_hdr[3]; -+ sam_hdr_t* out_hdr[3]; - // alignment queue - int n, m; - bam1_t **b; -@@ -503,7 +503,7 @@ - return ret; - } - --static khash_t(set64) *loadpos(const char *fn, bam_hdr_t *h) -+static khash_t(set64) *loadpos(const char *fn, sam_hdr_t *h) - { - gzFile fp; - kstream_t *ks; -@@ -511,9 +511,15 @@ - kstring_t *str; - khash_t(set64) *hash; - -+ fp = strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r"); -+ if (fp == NULL) { -+ print_error_errno("phase", "Couldn't open site file '%s'", fn); -+ return NULL; -+ } -+ - hash = kh_init(set64); - str = calloc(1, sizeof(kstring_t)); -- fp = strcmp(fn, "-")? 
gzopen(fn, "r") : gzdopen(fileno(stdin), "r"); -+ - ks = ks_init(fp); - while (ks_getuntil(ks, 0, str, &dret) >= 0) { - int tid = bam_name2id(h, str->s); -@@ -557,7 +563,15 @@ - return -1; - } - -- g->out_hdr[c] = bam_hdr_dup(g->fp_hdr); -+ g->out_hdr[c] = sam_hdr_dup(g->fp_hdr); -+ if (!g->no_pg && sam_hdr_add_pg(g->out_hdr[c], "samtools", -+ "VN", samtools_version(), -+ g->arg_list ? "CL": NULL, -+ g->arg_list ? g->arg_list : NULL, -+ NULL)) { -+ print_error("phase", "failed to add PG line to header"); -+ return -1; -+ } - if (sam_hdr_write(g->out[c], g->out_hdr[c]) < 0) { - print_error_errno("phase", "Failed to write header for '%s'", g->out_name[c]); - return -1; -@@ -582,6 +596,7 @@ - sam_global_args ga = SAM_GLOBAL_ARGS_INIT; - static const struct option lopts[] = { - SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0, '-'), -+ {"no-PG", no_argument, NULL, 1}, - { NULL, 0, NULL, 0 } - }; - -@@ -601,6 +616,7 @@ - case 'A': g.flag |= FLAG_DROP_AMBI; break; - case 'b': g.pre = strdup(optarg); break; - case 'l': fn_list = strdup(optarg); break; -+ case 1: g.no_pg = 1; break; - default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; - /* else fall-through */ - case '?': usage=1; break; -@@ -618,10 +634,11 @@ - // fprintf(stderr, " -l FILE list of sites to phase [null]\n"); - fprintf(stderr, " -F do not attempt to fix chimeras\n"); - fprintf(stderr, " -A drop reads with ambiguous phase\n"); -+ fprintf(stderr, " --no-PG do not add a PG line\n"); - // fprintf(stderr, " -e do not discover SNPs (effective with -l)\n"); - fprintf(stderr, "\n"); - -- sam_global_opt_help(stderr, "-....-"); -+ sam_global_opt_help(stderr, "-....--."); - - return 1; - } -@@ -636,8 +653,13 @@ - __func__, argv[optind]); - return 1; - } -+ if (!g.no_pg && !(g.arg_list = stringify_argv(argc+1, argv-1))) { -+ print_error("phase", "failed to create arg_list"); -+ return 1; -+ } - if (fn_list) { // read the list of sites to phase - set = loadpos(fn_list, g.fp_hdr); -+ if (set == NULL) return 1; - 
free(fn_list); - } else g.flag &= ~FLAG_LIST_EXCL; - if (g.pre) { // open BAMs to write -@@ -677,7 +699,7 @@ - g.vpos_shift = 0; - if (lasttid >= 0) { - seqs = shrink_hash(seqs); -- if (phase(&g, g.fp_hdr->target_name[lasttid], -+ if (phase(&g, sam_hdr_tid2name(g.fp_hdr, lasttid), - vpos, cns, seqs) < 0) { - return 1; - } -@@ -749,7 +771,7 @@ - } - if (dophase) { - seqs = shrink_hash(seqs); -- if (phase(&g, g.fp_hdr->target_name[tid], vpos, cns, seqs) < 0) { -+ if (phase(&g, sam_hdr_tid2name(g.fp_hdr, tid), vpos, cns, seqs) < 0) { - return 1; - } - update_vpos(vpos, seqs); -@@ -759,11 +781,11 @@ - ++vpos; - } - if (tid >= 0) { -- if (phase(&g, g.fp_hdr->target_name[tid], vpos, cns, seqs) < 0) { -+ if (phase(&g, sam_hdr_tid2name(g.fp_hdr, tid), vpos, cns, seqs) < 0) { - return 1; - } - } -- bam_hdr_destroy(g.fp_hdr); -+ sam_hdr_destroy(g.fp_hdr); - bam_plp_destroy(iter); - sam_close(g.fp); - kh_destroy(64, seqs); -@@ -779,12 +801,13 @@ - __func__, g.out_name[c]); - res = 1; - } -- bam_hdr_destroy(g.out_hdr[c]); -+ sam_hdr_destroy(g.out_hdr[c]); - free(g.out_name[c]); - } - free(g.pre); free(g.b); - if (res) return 1; - } -+ free(g.arg_list); - sam_global_args_free(&ga); - return 0; - } ---- python-pysam.orig/samtools/phase.c.pysam.c -+++ python-pysam/samtools/phase.c.pysam.c -@@ -3,7 +3,7 @@ - /* phase.c -- phase subcommand. - - Copyright (C) 2011 Broad Institute. -- Copyright (C) 2013-2016 Genome Research Ltd. -+ Copyright (C) 2013-2016, 2019 Genome Research Ltd. 
- - Author: Heng Li - -@@ -54,15 +54,15 @@ - - typedef struct { - // configurations, initialized in the main function -- int flag, k, min_baseQ, min_varLOD, max_depth; -+ int flag, k, min_baseQ, min_varLOD, max_depth, no_pg; - // other global variables - int vpos_shift; - samFile* fp; -- bam_hdr_t* fp_hdr; -- char *pre; -+ sam_hdr_t* fp_hdr; -+ char *pre, *arg_list; - char *out_name[3]; - samFile* out[3]; -- bam_hdr_t* out_hdr[3]; -+ sam_hdr_t* out_hdr[3]; - // alignment queue - int n, m; - bam1_t **b; -@@ -505,7 +505,7 @@ - return ret; - } - --static khash_t(set64) *loadpos(const char *fn, bam_hdr_t *h) -+static khash_t(set64) *loadpos(const char *fn, sam_hdr_t *h) - { - gzFile fp; - kstream_t *ks; -@@ -513,9 +513,15 @@ - kstring_t *str; - khash_t(set64) *hash; - -+ fp = strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r"); -+ if (fp == NULL) { -+ print_error_errno("phase", "Couldn't open site file '%s'", fn); -+ return NULL; -+ } -+ - hash = kh_init(set64); - str = calloc(1, sizeof(kstring_t)); -- fp = strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r"); -+ - ks = ks_init(fp); - while (ks_getuntil(ks, 0, str, &dret) >= 0) { - int tid = bam_name2id(h, str->s); -@@ -559,7 +565,15 @@ - return -1; - } - -- g->out_hdr[c] = bam_hdr_dup(g->fp_hdr); -+ g->out_hdr[c] = sam_hdr_dup(g->fp_hdr); -+ if (!g->no_pg && sam_hdr_add_pg(g->out_hdr[c], "samtools", -+ "VN", samtools_version(), -+ g->arg_list ? "CL": NULL, -+ g->arg_list ? 
g->arg_list : NULL, -+ NULL)) { -+ print_error("phase", "failed to add PG line to header"); -+ return -1; -+ } - if (sam_hdr_write(g->out[c], g->out_hdr[c]) < 0) { - print_error_errno("phase", "Failed to write header for '%s'", g->out_name[c]); - return -1; -@@ -584,6 +598,7 @@ - sam_global_args ga = SAM_GLOBAL_ARGS_INIT; - static const struct option lopts[] = { - SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0, '-'), -+ {"no-PG", no_argument, NULL, 1}, - { NULL, 0, NULL, 0 } - }; - -@@ -603,6 +618,7 @@ - case 'A': g.flag |= FLAG_DROP_AMBI; break; - case 'b': g.pre = strdup(optarg); break; - case 'l': fn_list = strdup(optarg); break; -+ case 1: g.no_pg = 1; break; - default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; - /* else fall-through */ - case '?': usage=1; break; -@@ -620,10 +636,11 @@ - // fprintf(samtools_stderr, " -l FILE list of sites to phase [null]\n"); - fprintf(samtools_stderr, " -F do not attempt to fix chimeras\n"); - fprintf(samtools_stderr, " -A drop reads with ambiguous phase\n"); -+ fprintf(samtools_stderr, " --no-PG do not add a PG line\n"); - // fprintf(samtools_stderr, " -e do not discover SNPs (effective with -l)\n"); - fprintf(samtools_stderr, "\n"); - -- sam_global_opt_help(samtools_stderr, "-....-"); -+ sam_global_opt_help(samtools_stderr, "-....--."); - - return 1; - } -@@ -638,8 +655,13 @@ - __func__, argv[optind]); - return 1; - } -+ if (!g.no_pg && !(g.arg_list = stringify_argv(argc+1, argv-1))) { -+ print_error("phase", "failed to create arg_list"); -+ return 1; -+ } - if (fn_list) { // read the list of sites to phase - set = loadpos(fn_list, g.fp_hdr); -+ if (set == NULL) return 1; - free(fn_list); - } else g.flag &= ~FLAG_LIST_EXCL; - if (g.pre) { // open BAMs to write -@@ -679,7 +701,7 @@ - g.vpos_shift = 0; - if (lasttid >= 0) { - seqs = shrink_hash(seqs); -- if (phase(&g, g.fp_hdr->target_name[lasttid], -+ if (phase(&g, sam_hdr_tid2name(g.fp_hdr, lasttid), - vpos, cns, seqs) < 0) { - return 1; - } -@@ -751,7 +773,7 @@ 
- } - if (dophase) { - seqs = shrink_hash(seqs); -- if (phase(&g, g.fp_hdr->target_name[tid], vpos, cns, seqs) < 0) { -+ if (phase(&g, sam_hdr_tid2name(g.fp_hdr, tid), vpos, cns, seqs) < 0) { - return 1; - } - update_vpos(vpos, seqs); -@@ -761,11 +783,11 @@ - ++vpos; - } - if (tid >= 0) { -- if (phase(&g, g.fp_hdr->target_name[tid], vpos, cns, seqs) < 0) { -+ if (phase(&g, sam_hdr_tid2name(g.fp_hdr, tid), vpos, cns, seqs) < 0) { - return 1; - } - } -- bam_hdr_destroy(g.fp_hdr); -+ sam_hdr_destroy(g.fp_hdr); - bam_plp_destroy(iter); - sam_close(g.fp); - kh_destroy(64, seqs); -@@ -781,12 +803,13 @@ - __func__, g.out_name[c]); - res = 1; - } -- bam_hdr_destroy(g.out_hdr[c]); -+ sam_hdr_destroy(g.out_hdr[c]); - free(g.out_name[c]); - } - free(g.pre); free(g.b); - if (res) return 1; - } -+ free(g.arg_list); - sam_global_args_free(&ga); - return 0; - } ---- python-pysam.orig/samtools/sam.c -+++ python-pysam/samtools/sam.c -@@ -1,6 +1,6 @@ - /* sam.c -- format-neutral SAM/BAM API. - -- Copyright (C) 2009, 2012-2015 Genome Research Ltd. -+ Copyright (C) 2009, 2012-2016 Genome Research Ltd. - Portions copyright (C) 2011 Broad Institute. 
- - Author: Heng Li -@@ -65,12 +65,12 @@ - return NULL; - } - fp->is_write = 0; -- if (fp->header->n_targets == 0 && bam_verbose >= 1) -+ if (sam_hdr_nref(fp->header) == 0 && bam_verbose >= 1) - fprintf(stderr, "[samopen] no @SQ lines in the header.\n"); - } - else { - enum htsExactFormat fmt = hts_get_format(fp->file)->format; -- fp->header = (bam_hdr_t *)aux; // For writing, we won't free it -+ fp->header = (sam_hdr_t *)aux; // For writing, we won't free it - fp->is_write = 1; - if (!(fmt == text_format || fmt == sam) || strchr(mode, 'h')) { - if (sam_hdr_write(fp->file, fp->header) < 0) { -@@ -89,7 +89,7 @@ - void samclose(samfile_t *fp) - { - if (fp) { -- if (!fp->is_write && fp->header) bam_hdr_destroy(fp->header); -+ if (!fp->is_write && fp->header) sam_hdr_destroy(fp->header); - sam_close(fp->file); - free(fp); - } ---- python-pysam.orig/samtools/sam.c.pysam.c -+++ python-pysam/samtools/sam.c.pysam.c -@@ -2,7 +2,7 @@ - - /* sam.c -- format-neutral SAM/BAM API. - -- Copyright (C) 2009, 2012-2015 Genome Research Ltd. -+ Copyright (C) 2009, 2012-2016 Genome Research Ltd. - Portions copyright (C) 2011 Broad Institute. 
- - Author: Heng Li -@@ -67,12 +67,12 @@ - return NULL; - } - fp->is_write = 0; -- if (fp->header->n_targets == 0 && bam_verbose >= 1) -+ if (sam_hdr_nref(fp->header) == 0 && bam_verbose >= 1) - fprintf(samtools_stderr, "[samopen] no @SQ lines in the header.\n"); - } - else { - enum htsExactFormat fmt = hts_get_format(fp->file)->format; -- fp->header = (bam_hdr_t *)aux; // For writing, we won't free it -+ fp->header = (sam_hdr_t *)aux; // For writing, we won't free it - fp->is_write = 1; - if (!(fmt == text_format || fmt == sam) || strchr(mode, 'h')) { - if (sam_hdr_write(fp->file, fp->header) < 0) { -@@ -91,7 +91,7 @@ - void samclose(samfile_t *fp) - { - if (fp) { -- if (!fp->is_write && fp->header) bam_hdr_destroy(fp->header); -+ if (!fp->is_write && fp->header) sam_hdr_destroy(fp->header); - sam_close(fp->file); - free(fp); - } ---- python-pysam.orig/samtools/sam.h -+++ python-pysam/samtools/sam.h -@@ -1,6 +1,6 @@ - /* sam.h -- format-neutral SAM/BAM API. - -- Copyright (C) 2009, 2013-2015 Genome Research Ltd. -+ Copyright (C) 2009, 2013-2015, 2019 Genome Research Ltd. - - Author: Heng Li - -@@ -49,7 +49,7 @@ - typedef struct { - samFile *file; - struct { BGZF *bam; } x; // Hack so that fp->x.bam still works -- bam_hdr_t *header; -+ sam_hdr_t *header; - unsigned short is_write:1; - } samfile_t; - -@@ -103,14 +103,20 @@ - static inline int samwrite(samfile_t *fp, const bam1_t *b) { return sam_write1(fp->file, fp->header, b); } - - /*! 
-- @abstract Load BAM/CRAM index for use with samfetch() -+ @abstract Load BAM/CRAM index for use with samfetch() with supporting the use of index file - @param fp file handler - @param fn name of the BAM or CRAM file (NOT the index file) -+ @param fnidx name of the index file - @return pointer to the index structure - */ -- static inline bam_index_t *samtools_sam_index_load(samfile_t *fp, const char *fn) { return sam_index_load(fp->file, fn); } -+ static inline bam_index_t *samtools_sam_index_load(samfile_t *fp, const char *fn, const char *fnidx) { -+ if (fnidx != NULL) { -+ return sam_index_load2(fp->file, fn, fnidx); -+ } -+ return sam_index_load(fp->file, fn); -+ } - #undef sam_index_load -- #define sam_index_load(fp,fn) (samtools_sam_index_load((fp), (fn))) -+ #define sam_index_load(fp,fn,fnidx) (samtools_sam_index_load((fp), (fn), (fnidx))) - - /*! - @abstract Retrieve the alignments overlapping the specified region. ---- python-pysam.orig/samtools/sam_header.c -+++ /dev/null -@@ -1,836 +0,0 @@ --/* sam_header.c -- basic SAM/BAM header API. -- -- Copyright (C) 2009-2013 Genome Research Ltd. -- -- Author: Petr Danecek -- --Permission is hereby granted, free of charge, to any person obtaining a copy --of this software and associated documentation files (the "Software"), to deal --in the Software without restriction, including without limitation the rights --to use, copy, modify, merge, publish, distribute, sublicense, and/or sell --copies of the Software, and to permit persons to whom the Software is --furnished to do so, subject to the following conditions: -- --The above copyright notice and this permission notice shall be included in --all copies or substantial portions of the Software. -- --THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR --IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, --FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL --THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER --LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING --FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER --DEALINGS IN THE SOFTWARE. */ -- --#include -- --#include "sam_header.h" --#include --#include --#include --#include --#include -- --#include "htslib/khash.h" --KHASH_MAP_INIT_STR(str, const char *) -- --struct _HeaderList --{ -- struct _HeaderList *last; // Hack: Used and maintained only by list_append_to_end. Maintained in the root node only. -- struct _HeaderList *next; -- void *data; --}; --typedef struct _HeaderList list_t; --typedef list_t HeaderDict; -- --typedef struct --{ -- char key[2]; -- char *value; --} --HeaderTag; -- --typedef struct --{ -- char type[2]; -- list_t *tags; --} --HeaderLine; -- --const char *o_hd_tags[] = {"SO","GO",NULL}; --const char *r_hd_tags[] = {"VN",NULL}; -- --const char *o_sq_tags[] = {"AS","M5","UR","SP",NULL}; --const char *r_sq_tags[] = {"SN","LN",NULL}; --const char *u_sq_tags[] = {"SN",NULL}; -- --const char *o_rg_tags[] = {"CN","DS","DT","FO","KS","LB","PG","PI","PL","PU","SM",NULL}; --const char *r_rg_tags[] = {"ID",NULL}; --const char *u_rg_tags[] = {"ID",NULL}; -- --const char *o_pg_tags[] = {"VN","CL",NULL}; --const char *r_pg_tags[] = {"ID",NULL}; -- --const char *types[] = {"HD","SQ","RG","PG","CO",NULL}; --const char **optional_tags[] = {o_hd_tags,o_sq_tags,o_rg_tags,o_pg_tags,NULL,NULL}; --const char **required_tags[] = {r_hd_tags,r_sq_tags,r_rg_tags,r_pg_tags,NULL,NULL}; --const char **unique_tags[] = {NULL, u_sq_tags,u_rg_tags,NULL,NULL,NULL}; -- -- --static void debug(const char *format, ...) 
--{ -- va_list ap; -- va_start(ap, format); -- vfprintf(stderr, format, ap); -- va_end(ap); --} -- --#if 0 --// Replaced by list_append_to_end --static list_t *list_prepend(list_t *root, void *data) --{ -- list_t *l = malloc(sizeof(list_t)); -- l->next = root; -- l->data = data; -- return l; --} --#endif -- --// Relies on the root->last being correct. Do not use with the other list_* --// routines unless they are fixed to modify root->last as well. --static list_t *list_append_to_end(list_t *root, void *data) --{ -- list_t *l = malloc(sizeof(list_t)); -- l->last = l; -- l->next = NULL; -- l->data = data; -- -- if ( !root ) -- return l; -- -- root->last->next = l; -- root->last = l; -- return root; --} -- --static list_t *list_append(list_t *root, void *data) --{ -- list_t *l = root; -- while (l && l->next) -- l = l->next; -- if ( l ) -- { -- l->next = malloc(sizeof(list_t)); -- l = l->next; -- } -- else -- { -- l = malloc(sizeof(list_t)); -- root = l; -- } -- l->data = data; -- l->next = NULL; -- return root; --} -- --static void list_free(list_t *root) --{ -- list_t *l = root; -- while (root) -- { -- l = root; -- root = root->next; -- free(l); -- } --} -- -- -- --// Look for a tag "XY" in a predefined const char *[] array. --static int tag_exists(const char *tag, const char **tags) --{ -- int itag=0; -- if ( !tags ) return -1; -- while ( tags[itag] ) -- { -- if ( tags[itag][0]==tag[0] && tags[itag][1]==tag[1] ) return itag; -- itag++; -- } -- return -1; --} -- -- -- --// Mimics the behaviour of getline, except it returns pointer to the next chunk of the text --// or NULL if everything has been read. The lineptr should be freed by the caller. The --// newline character is stripped. 
--static const char *nextline(char **lineptr, size_t *n, const char *text) --{ -- int len; -- const char *to = text; -- -- if ( !*to ) return NULL; -- -- while ( *to && *to!='\n' && *to!='\r' ) to++; -- len = to - text + 1; -- -- if ( *to ) -- { -- // Advance the pointer for the next call -- if ( *to=='\n' ) to++; -- else if ( *to=='\r' && *(to+1)=='\n' ) to+=2; -- } -- if ( !len ) -- return to; -- -- if ( !*lineptr ) -- { -- *lineptr = malloc(len); -- *n = len; -- } -- else if ( *nkey[0] = name[0]; -- tag->key[1] = name[1]; -- tag->value = malloc(len+1); -- memcpy(tag->value,value_from,len+1); -- tag->value[len] = 0; -- return tag; --} -- --static HeaderTag *header_line_has_tag(HeaderLine *hline, const char *key) --{ -- list_t *tags = hline->tags; -- while (tags) -- { -- HeaderTag *tag = tags->data; -- if ( tag->key[0]==key[0] && tag->key[1]==key[1] ) return tag; -- tags = tags->next; -- } -- return NULL; --} -- -- --// Return codes: --// 0 .. different types or unique tags differ or conflicting tags, cannot be merged --// 1 .. all tags identical -> no need to merge, drop one --// 2 .. the unique tags match and there are some conflicting tags (same tag, different value) -> error, cannot be merged nor duplicated --// 3 .. 
there are some missing complementary tags and no unique conflict -> can be merged into a single line --static int sam_header_compare_lines(HeaderLine *hline1, HeaderLine *hline2) --{ -- HeaderTag *t1, *t2; -- -- if ( hline1->type[0]!=hline2->type[0] || hline1->type[1]!=hline2->type[1] ) -- return 0; -- -- int itype = tag_exists(hline1->type,types); -- if ( itype==-1 ) { -- debug("[sam_header_compare_lines] Unknown type [%c%c]\n", hline1->type[0],hline1->type[1]); -- return -1; // FIXME (lh3): error; I do not know how this will be handled in Petr's code -- } -- -- if ( unique_tags[itype] ) -- { -- t1 = header_line_has_tag(hline1,unique_tags[itype][0]); -- t2 = header_line_has_tag(hline2,unique_tags[itype][0]); -- if ( !t1 || !t2 ) // this should never happen, the unique tags are required -- return 2; -- -- if ( strcmp(t1->value,t2->value) ) -- return 0; // the unique tags differ, cannot be merged -- } -- if ( !required_tags[itype] && !optional_tags[itype] ) -- { -- t1 = hline1->tags->data; -- t2 = hline2->tags->data; -- if ( !strcmp(t1->value,t2->value) ) return 1; // identical comments -- return 0; -- } -- -- int missing=0, itag=0; -- while ( required_tags[itype] && required_tags[itype][itag] ) -- { -- t1 = header_line_has_tag(hline1,required_tags[itype][itag]); -- t2 = header_line_has_tag(hline2,required_tags[itype][itag]); -- if ( !t1 && !t2 ) -- return 2; // this should never happen -- else if ( !t1 || !t2 ) -- missing = 1; // there is some tag missing in one of the hlines -- else if ( strcmp(t1->value,t2->value) ) -- { -- if ( unique_tags[itype] ) -- return 2; // the lines have a matching unique tag but have a conflicting tag -- -- return 0; // the lines contain conflicting tags, cannot be merged -- } -- itag++; -- } -- itag = 0; -- while ( optional_tags[itype] && optional_tags[itype][itag] ) -- { -- t1 = header_line_has_tag(hline1,optional_tags[itype][itag]); -- t2 = header_line_has_tag(hline2,optional_tags[itype][itag]); -- if ( !t1 && !t2 ) -- { -- itag++; 
-- continue; -- } -- if ( !t1 || !t2 ) -- missing = 1; // there is some tag missing in one of the hlines -- else if ( strcmp(t1->value,t2->value) ) -- { -- if ( unique_tags[itype] ) -- return 2; // the lines have a matching unique tag but have a conflicting tag -- -- return 0; // the lines contain conflicting tags, cannot be merged -- } -- itag++; -- } -- if ( missing ) return 3; // there are some missing complementary tags with no conflicts, can be merged -- return 1; --} -- -- --static HeaderLine *sam_header_line_clone(const HeaderLine *hline) --{ -- list_t *tags; -- HeaderLine *out = malloc(sizeof(HeaderLine)); -- out->type[0] = hline->type[0]; -- out->type[1] = hline->type[1]; -- out->tags = NULL; -- -- tags = hline->tags; -- while (tags) -- { -- HeaderTag *old = tags->data; -- -- HeaderTag *new = malloc(sizeof(HeaderTag)); -- new->key[0] = old->key[0]; -- new->key[1] = old->key[1]; -- new->value = strdup(old->value); -- out->tags = list_append(out->tags, new); -- -- tags = tags->next; -- } -- return out; --} -- --static int sam_header_line_merge_with(HeaderLine *out_hline, const HeaderLine *tmpl_hline) --{ -- list_t *tmpl_tags; -- -- if ( out_hline->type[0]!=tmpl_hline->type[0] || out_hline->type[1]!=tmpl_hline->type[1] ) -- return 0; -- -- tmpl_tags = tmpl_hline->tags; -- while (tmpl_tags) -- { -- HeaderTag *tmpl_tag = tmpl_tags->data; -- HeaderTag *out_tag = header_line_has_tag(out_hline, tmpl_tag->key); -- if ( !out_tag ) -- { -- HeaderTag *tag = malloc(sizeof(HeaderTag)); -- tag->key[0] = tmpl_tag->key[0]; -- tag->key[1] = tmpl_tag->key[1]; -- tag->value = strdup(tmpl_tag->value); -- out_hline->tags = list_append(out_hline->tags,tag); -- } -- tmpl_tags = tmpl_tags->next; -- } -- return 1; --} -- -- --static HeaderLine *sam_header_line_parse(const char *headerLine) --{ -- HeaderLine *hline; -- HeaderTag *tag; -- const char *from, *to; -- from = headerLine; -- -- if ( *from != '@' ) { -- debug("[sam_header_line_parse] expected '@', got [%s]\n", headerLine); 
-- return 0; -- } -- to = ++from; -- -- while (*to && *to!='\t') to++; -- if ( to-from != 2 ) { -- debug("[sam_header_line_parse] expected '@XY', got [%s]\nHint: The header tags must be tab-separated.\n", headerLine); -- return 0; -- } -- -- hline = malloc(sizeof(HeaderLine)); -- hline->type[0] = from[0]; -- hline->type[1] = from[1]; -- hline->tags = NULL; -- -- int itype = tag_exists(hline->type, types); -- -- from = to; -- while (*to && *to=='\t') to++; -- if ( to-from != 1 ) { -- debug("[sam_header_line_parse] multiple tabs on line [%s] (%d)\n", headerLine,(int)(to-from)); -- free(hline); -- return 0; -- } -- from = to; -- while (*from) -- { -- while (*to && *to!='\t') to++; -- -- if ( !required_tags[itype] && !optional_tags[itype] ) -- { -- // CO is a special case, it can contain anything, including tabs -- if ( *to ) { to++; continue; } -- tag = new_tag(" ",from,to-1); -- } -- else -- tag = new_tag(from,from+3,to-1); -- -- if ( header_line_has_tag(hline,tag->key) ) -- debug("The tag '%c%c' present (at least) twice on line [%s]\n", tag->key[0],tag->key[1], headerLine); -- hline->tags = list_append(hline->tags, tag); -- -- from = to; -- while (*to && *to=='\t') to++; -- if ( *to && to-from != 1 ) { -- debug("[sam_header_line_parse] multiple tabs on line [%s] (%d)\n", headerLine,(int)(to-from)); -- return 0; -- } -- -- from = to; -- } -- return hline; --} -- -- --// Must be of an existing type, all tags must be recognised and all required tags must be present --static int sam_header_line_validate(HeaderLine *hline) --{ -- list_t *tags; -- HeaderTag *tag; -- int itype, itag; -- -- // Is the type correct? -- itype = tag_exists(hline->type, types); -- if ( itype==-1 ) -- { -- debug("The type [%c%c] not recognised.\n", hline->type[0],hline->type[1]); -- return 0; -- } -- -- // Has all required tags? 
-- itag = 0; -- while ( required_tags[itype] && required_tags[itype][itag] ) -- { -- if ( !header_line_has_tag(hline,required_tags[itype][itag]) ) -- { -- debug("The tag [%c%c] required for [%c%c] not present.\n", required_tags[itype][itag][0],required_tags[itype][itag][1], -- hline->type[0],hline->type[1]); -- return 0; -- } -- itag++; -- } -- -- // Are all tags recognised? -- tags = hline->tags; -- while ( tags ) -- { -- tag = tags->data; -- if ( !tag_exists(tag->key,required_tags[itype]) && !tag_exists(tag->key,optional_tags[itype]) ) -- { -- // Lower case tags are user-defined values. -- if( !(islower(tag->key[0]) || islower(tag->key[1])) ) -- { -- // Neither is lower case, but tag was not recognized. -- debug("Unknown tag [%c%c] for [%c%c].\n", tag->key[0],tag->key[1], hline->type[0],hline->type[1]); -- // return 0; // Even unknown tags are allowed - for forward compatibility with new attributes -- } -- // else - allow user defined tag -- } -- tags = tags->next; -- } -- -- return 1; --} -- -- --static void print_header_line(FILE *fp, HeaderLine *hline) --{ -- list_t *tags = hline->tags; -- HeaderTag *tag; -- -- fprintf(fp, "@%c%c", hline->type[0],hline->type[1]); -- while (tags) -- { -- tag = tags->data; -- -- fprintf(fp, "\t"); -- if ( tag->key[0]!=' ' || tag->key[1]!=' ' ) -- fprintf(fp, "%c%c:", tag->key[0],tag->key[1]); -- fprintf(fp, "%s", tag->value); -- -- tags = tags->next; -- } -- fprintf(fp,"\n"); --} -- -- --static void sam_header_line_free(HeaderLine *hline) --{ -- list_t *tags = hline->tags; -- while (tags) -- { -- HeaderTag *tag = tags->data; -- free(tag->value); -- free(tag); -- tags = tags->next; -- } -- list_free(hline->tags); -- free(hline); --} -- --void sam_header_free(void *_header) --{ -- HeaderDict *header = (HeaderDict*)_header; -- list_t *hlines = header; -- while (hlines) -- { -- sam_header_line_free(hlines->data); -- hlines = hlines->next; -- } -- list_free(header); --} -- --HeaderDict *sam_header_clone(const HeaderDict *dict) --{ -- 
HeaderDict *out = NULL; -- while (dict) -- { -- HeaderLine *hline = dict->data; -- out = list_append(out, sam_header_line_clone(hline)); -- dict = dict->next; -- } -- return out; --} -- --// Returns a newly allocated string --char *sam_header_write(const void *_header) --{ -- const HeaderDict *header = (const HeaderDict*)_header; -- char *out = NULL; -- int len=0, nout=0; -- const list_t *hlines; -- -- // Calculate the length of the string to allocate -- hlines = header; -- while (hlines) -- { -- len += 4; // @XY and \n -- -- HeaderLine *hline = hlines->data; -- list_t *tags = hline->tags; -- while (tags) -- { -- HeaderTag *tag = tags->data; -- len += strlen(tag->value) + 1; // \t -- if ( tag->key[0]!=' ' || tag->key[1]!=' ' ) -- len += strlen(tag->value) + 3; // XY: -- tags = tags->next; -- } -- hlines = hlines->next; -- } -- -- nout = 0; -- out = malloc(len+1); -- hlines = header; -- while (hlines) -- { -- HeaderLine *hline = hlines->data; -- -- nout += sprintf(out+nout,"@%c%c",hline->type[0],hline->type[1]); -- -- list_t *tags = hline->tags; -- while (tags) -- { -- HeaderTag *tag = tags->data; -- nout += sprintf(out+nout,"\t"); -- if ( tag->key[0]!=' ' || tag->key[1]!=' ' ) -- nout += sprintf(out+nout,"%c%c:", tag->key[0],tag->key[1]); -- nout += sprintf(out+nout,"%s", tag->value); -- tags = tags->next; -- } -- hlines = hlines->next; -- nout += sprintf(out+nout,"\n"); -- } -- out[len] = 0; -- return out; --} -- --void *sam_header_parse2(const char *headerText) --{ -- list_t *hlines = NULL; -- HeaderLine *hline; -- const char *text; -- char *buf=NULL; -- size_t nbuf = 0; -- int tovalidate = 0; -- -- if ( !headerText ) -- return 0; -- -- text = headerText; -- while ( (text=nextline(&buf, &nbuf, text)) ) -- { -- hline = sam_header_line_parse(buf); -- if ( hline && (!tovalidate || sam_header_line_validate(hline)) ) -- // With too many (~250,000) reference sequences the header parsing was too slow with list_append. 
-- hlines = list_append_to_end(hlines, hline); -- else -- { -- if (hline) sam_header_line_free(hline); -- sam_header_free(hlines); -- if ( buf ) free(buf); -- return NULL; -- } -- } -- if ( buf ) free(buf); -- -- return hlines; --} -- --void *sam_header2tbl(const void *_dict, char type[2], char key_tag[2], char value_tag[2]) --{ -- const HeaderDict *dict = (const HeaderDict*)_dict; -- const list_t *l = dict; -- khash_t(str) *tbl = kh_init(str); -- khiter_t k; -- int ret; -- -- if (_dict == 0) return tbl; // return an empty (not null) hash table -- while (l) -- { -- HeaderLine *hline = l->data; -- if ( hline->type[0]!=type[0] || hline->type[1]!=type[1] ) -- { -- l = l->next; -- continue; -- } -- -- HeaderTag *key, *value; -- key = header_line_has_tag(hline,key_tag); -- value = header_line_has_tag(hline,value_tag); -- if ( !key || !value ) -- { -- l = l->next; -- continue; -- } -- -- k = kh_get(str, tbl, key->value); -- if ( k != kh_end(tbl) ) -- debug("[sam_header_lookup_table] They key %s not unique.\n", key->value); -- k = kh_put(str, tbl, key->value, &ret); -- kh_value(tbl, k) = value->value; -- -- l = l->next; -- } -- return tbl; --} -- --char **sam_header2list(const void *_dict, char type[2], char key_tag[2], int *_n) --{ -- const HeaderDict *dict = (const HeaderDict*)_dict; -- const list_t *l = dict; -- int max, n; -- char **ret; -- -- ret = 0; *_n = max = n = 0; -- while (l) -- { -- HeaderLine *hline = l->data; -- if ( hline->type[0]!=type[0] || hline->type[1]!=type[1] ) -- { -- l = l->next; -- continue; -- } -- -- HeaderTag *key; -- key = header_line_has_tag(hline,key_tag); -- if ( !key ) -- { -- l = l->next; -- continue; -- } -- -- if (n == max) { -- max = max? 
max<<1 : 4; -- ret = realloc(ret, max * sizeof(char*)); -- } -- ret[n++] = key->value; -- -- l = l->next; -- } -- *_n = n; -- return ret; --} -- --void *sam_header2key_val(void *iter, const char type[2], const char key_tag[2], const char value_tag[2], const char **_key, const char **_value) --{ -- list_t *l = iter; -- if ( !l ) return NULL; -- -- while (l) -- { -- HeaderLine *hline = l->data; -- if ( hline->type[0]!=type[0] || hline->type[1]!=type[1] ) -- { -- l = l->next; -- continue; -- } -- -- HeaderTag *key, *value; -- key = header_line_has_tag(hline,key_tag); -- value = header_line_has_tag(hline,value_tag); -- if ( !key || !value ) -- { -- l = l->next; -- continue; -- } -- -- *_key = key->value; -- *_value = value->value; -- return l->next; -- } -- return l; --} -- --const char *sam_tbl_get(void *h, const char *key) --{ -- khash_t(str) *tbl = (khash_t(str)*)h; -- khint_t k; -- k = kh_get(str, tbl, key); -- return k == kh_end(tbl)? 0 : kh_val(tbl, k); --} -- --int sam_tbl_size(void *h) --{ -- khash_t(str) *tbl = (khash_t(str)*)h; -- return h? 
kh_size(tbl) : 0; --} -- --void sam_tbl_destroy(void *h) --{ -- khash_t(str) *tbl = (khash_t(str)*)h; -- kh_destroy(str, tbl); --} -- --void *sam_header_merge(int n, const void **_dicts) --{ -- const HeaderDict **dicts = (const HeaderDict**)_dicts; -- HeaderDict *out_dict; -- int idict, status; -- -- if ( n<2 ) return NULL; -- -- out_dict = sam_header_clone(dicts[0]); -- -- for (idict=1; idictdata, out_hlines->data); -- if ( status==0 ) -- { -- out_hlines = out_hlines->next; -- continue; -- } -- -- if ( status==2 ) -- { -- print_header_line(stderr,tmpl_hlines->data); -- print_header_line(stderr,out_hlines->data); -- debug("Conflicting lines, cannot merge the headers.\n"); -- return 0; -- } -- if ( status==3 ) -- sam_header_line_merge_with(out_hlines->data, tmpl_hlines->data); -- -- inserted = 1; -- break; -- } -- if ( !inserted ) -- out_dict = list_append(out_dict, sam_header_line_clone(tmpl_hlines->data)); -- -- tmpl_hlines = tmpl_hlines->next; -- } -- } -- -- return out_dict; --} -- --char **sam_header2tbl_n(const void *dict, const char type[2], const char *tags[], int *n) --{ -- int nout = 0; -- char **out = NULL; -- -- *n = 0; -- list_t *l = (list_t *)dict; -- if ( !l ) return NULL; -- -- int i, ntags = 0; -- while ( tags[ntags] ) ntags++; -- -- while (l) -- { -- HeaderLine *hline = l->data; -- if ( hline->type[0]!=type[0] || hline->type[1]!=type[1] ) -- { -- l = l->next; -- continue; -- } -- out = (char**) realloc(out, sizeof(char*)*(nout+1)*ntags); -- for (i=0; ivalue; -- } -- nout++; -- l = l->next; -- } -- *n = nout; -- return out; --} -- ---- python-pysam.orig/samtools/sam_header.c.pysam.c -+++ /dev/null -@@ -1,838 +0,0 @@ --#include "samtools.pysam.h" -- --/* sam_header.c -- basic SAM/BAM header API. -- -- Copyright (C) 2009-2013 Genome Research Ltd. 
-- -- Author: Petr Danecek -- --Permission is hereby granted, free of charge, to any person obtaining a copy --of this software and associated documentation files (the "Software"), to deal --in the Software without restriction, including without limitation the rights --to use, copy, modify, merge, publish, distribute, sublicense, and/or sell --copies of the Software, and to permit persons to whom the Software is --furnished to do so, subject to the following conditions: -- --The above copyright notice and this permission notice shall be included in --all copies or substantial portions of the Software. -- --THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR --IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, --FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL --THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER --LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING --FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER --DEALINGS IN THE SOFTWARE. */ -- --#include -- --#include "sam_header.h" --#include --#include --#include --#include --#include -- --#include "htslib/khash.h" --KHASH_MAP_INIT_STR(str, const char *) -- --struct _HeaderList --{ -- struct _HeaderList *last; // Hack: Used and maintained only by list_append_to_end. Maintained in the root node only. 
-- struct _HeaderList *next; -- void *data; --}; --typedef struct _HeaderList list_t; --typedef list_t HeaderDict; -- --typedef struct --{ -- char key[2]; -- char *value; --} --HeaderTag; -- --typedef struct --{ -- char type[2]; -- list_t *tags; --} --HeaderLine; -- --const char *o_hd_tags[] = {"SO","GO",NULL}; --const char *r_hd_tags[] = {"VN",NULL}; -- --const char *o_sq_tags[] = {"AS","M5","UR","SP",NULL}; --const char *r_sq_tags[] = {"SN","LN",NULL}; --const char *u_sq_tags[] = {"SN",NULL}; -- --const char *o_rg_tags[] = {"CN","DS","DT","FO","KS","LB","PG","PI","PL","PU","SM",NULL}; --const char *r_rg_tags[] = {"ID",NULL}; --const char *u_rg_tags[] = {"ID",NULL}; -- --const char *o_pg_tags[] = {"VN","CL",NULL}; --const char *r_pg_tags[] = {"ID",NULL}; -- --const char *types[] = {"HD","SQ","RG","PG","CO",NULL}; --const char **optional_tags[] = {o_hd_tags,o_sq_tags,o_rg_tags,o_pg_tags,NULL,NULL}; --const char **required_tags[] = {r_hd_tags,r_sq_tags,r_rg_tags,r_pg_tags,NULL,NULL}; --const char **unique_tags[] = {NULL, u_sq_tags,u_rg_tags,NULL,NULL,NULL}; -- -- --static void debug(const char *format, ...) --{ -- va_list ap; -- va_start(ap, format); -- vfprintf(samtools_stderr, format, ap); -- va_end(ap); --} -- --#if 0 --// Replaced by list_append_to_end --static list_t *list_prepend(list_t *root, void *data) --{ -- list_t *l = malloc(sizeof(list_t)); -- l->next = root; -- l->data = data; -- return l; --} --#endif -- --// Relies on the root->last being correct. Do not use with the other list_* --// routines unless they are fixed to modify root->last as well. 
--static list_t *list_append_to_end(list_t *root, void *data) --{ -- list_t *l = malloc(sizeof(list_t)); -- l->last = l; -- l->next = NULL; -- l->data = data; -- -- if ( !root ) -- return l; -- -- root->last->next = l; -- root->last = l; -- return root; --} -- --static list_t *list_append(list_t *root, void *data) --{ -- list_t *l = root; -- while (l && l->next) -- l = l->next; -- if ( l ) -- { -- l->next = malloc(sizeof(list_t)); -- l = l->next; -- } -- else -- { -- l = malloc(sizeof(list_t)); -- root = l; -- } -- l->data = data; -- l->next = NULL; -- return root; --} -- --static void list_free(list_t *root) --{ -- list_t *l = root; -- while (root) -- { -- l = root; -- root = root->next; -- free(l); -- } --} -- -- -- --// Look for a tag "XY" in a predefined const char *[] array. --static int tag_exists(const char *tag, const char **tags) --{ -- int itag=0; -- if ( !tags ) return -1; -- while ( tags[itag] ) -- { -- if ( tags[itag][0]==tag[0] && tags[itag][1]==tag[1] ) return itag; -- itag++; -- } -- return -1; --} -- -- -- --// Mimics the behaviour of getline, except it returns pointer to the next chunk of the text --// or NULL if everything has been read. The lineptr should be freed by the caller. The --// newline character is stripped. 
--static const char *nextline(char **lineptr, size_t *n, const char *text) --{ -- int len; -- const char *to = text; -- -- if ( !*to ) return NULL; -- -- while ( *to && *to!='\n' && *to!='\r' ) to++; -- len = to - text + 1; -- -- if ( *to ) -- { -- // Advance the pointer for the next call -- if ( *to=='\n' ) to++; -- else if ( *to=='\r' && *(to+1)=='\n' ) to+=2; -- } -- if ( !len ) -- return to; -- -- if ( !*lineptr ) -- { -- *lineptr = malloc(len); -- *n = len; -- } -- else if ( *nkey[0] = name[0]; -- tag->key[1] = name[1]; -- tag->value = malloc(len+1); -- memcpy(tag->value,value_from,len+1); -- tag->value[len] = 0; -- return tag; --} -- --static HeaderTag *header_line_has_tag(HeaderLine *hline, const char *key) --{ -- list_t *tags = hline->tags; -- while (tags) -- { -- HeaderTag *tag = tags->data; -- if ( tag->key[0]==key[0] && tag->key[1]==key[1] ) return tag; -- tags = tags->next; -- } -- return NULL; --} -- -- --// Return codes: --// 0 .. different types or unique tags differ or conflicting tags, cannot be merged --// 1 .. all tags identical -> no need to merge, drop one --// 2 .. the unique tags match and there are some conflicting tags (same tag, different value) -> error, cannot be merged nor duplicated --// 3 .. 
there are some missing complementary tags and no unique conflict -> can be merged into a single line --static int sam_header_compare_lines(HeaderLine *hline1, HeaderLine *hline2) --{ -- HeaderTag *t1, *t2; -- -- if ( hline1->type[0]!=hline2->type[0] || hline1->type[1]!=hline2->type[1] ) -- return 0; -- -- int itype = tag_exists(hline1->type,types); -- if ( itype==-1 ) { -- debug("[sam_header_compare_lines] Unknown type [%c%c]\n", hline1->type[0],hline1->type[1]); -- return -1; // FIXME (lh3): error; I do not know how this will be handled in Petr's code -- } -- -- if ( unique_tags[itype] ) -- { -- t1 = header_line_has_tag(hline1,unique_tags[itype][0]); -- t2 = header_line_has_tag(hline2,unique_tags[itype][0]); -- if ( !t1 || !t2 ) // this should never happen, the unique tags are required -- return 2; -- -- if ( strcmp(t1->value,t2->value) ) -- return 0; // the unique tags differ, cannot be merged -- } -- if ( !required_tags[itype] && !optional_tags[itype] ) -- { -- t1 = hline1->tags->data; -- t2 = hline2->tags->data; -- if ( !strcmp(t1->value,t2->value) ) return 1; // identical comments -- return 0; -- } -- -- int missing=0, itag=0; -- while ( required_tags[itype] && required_tags[itype][itag] ) -- { -- t1 = header_line_has_tag(hline1,required_tags[itype][itag]); -- t2 = header_line_has_tag(hline2,required_tags[itype][itag]); -- if ( !t1 && !t2 ) -- return 2; // this should never happen -- else if ( !t1 || !t2 ) -- missing = 1; // there is some tag missing in one of the hlines -- else if ( strcmp(t1->value,t2->value) ) -- { -- if ( unique_tags[itype] ) -- return 2; // the lines have a matching unique tag but have a conflicting tag -- -- return 0; // the lines contain conflicting tags, cannot be merged -- } -- itag++; -- } -- itag = 0; -- while ( optional_tags[itype] && optional_tags[itype][itag] ) -- { -- t1 = header_line_has_tag(hline1,optional_tags[itype][itag]); -- t2 = header_line_has_tag(hline2,optional_tags[itype][itag]); -- if ( !t1 && !t2 ) -- { -- itag++; 
-- continue; -- } -- if ( !t1 || !t2 ) -- missing = 1; // there is some tag missing in one of the hlines -- else if ( strcmp(t1->value,t2->value) ) -- { -- if ( unique_tags[itype] ) -- return 2; // the lines have a matching unique tag but have a conflicting tag -- -- return 0; // the lines contain conflicting tags, cannot be merged -- } -- itag++; -- } -- if ( missing ) return 3; // there are some missing complementary tags with no conflicts, can be merged -- return 1; --} -- -- --static HeaderLine *sam_header_line_clone(const HeaderLine *hline) --{ -- list_t *tags; -- HeaderLine *out = malloc(sizeof(HeaderLine)); -- out->type[0] = hline->type[0]; -- out->type[1] = hline->type[1]; -- out->tags = NULL; -- -- tags = hline->tags; -- while (tags) -- { -- HeaderTag *old = tags->data; -- -- HeaderTag *new = malloc(sizeof(HeaderTag)); -- new->key[0] = old->key[0]; -- new->key[1] = old->key[1]; -- new->value = strdup(old->value); -- out->tags = list_append(out->tags, new); -- -- tags = tags->next; -- } -- return out; --} -- --static int sam_header_line_merge_with(HeaderLine *out_hline, const HeaderLine *tmpl_hline) --{ -- list_t *tmpl_tags; -- -- if ( out_hline->type[0]!=tmpl_hline->type[0] || out_hline->type[1]!=tmpl_hline->type[1] ) -- return 0; -- -- tmpl_tags = tmpl_hline->tags; -- while (tmpl_tags) -- { -- HeaderTag *tmpl_tag = tmpl_tags->data; -- HeaderTag *out_tag = header_line_has_tag(out_hline, tmpl_tag->key); -- if ( !out_tag ) -- { -- HeaderTag *tag = malloc(sizeof(HeaderTag)); -- tag->key[0] = tmpl_tag->key[0]; -- tag->key[1] = tmpl_tag->key[1]; -- tag->value = strdup(tmpl_tag->value); -- out_hline->tags = list_append(out_hline->tags,tag); -- } -- tmpl_tags = tmpl_tags->next; -- } -- return 1; --} -- -- --static HeaderLine *sam_header_line_parse(const char *headerLine) --{ -- HeaderLine *hline; -- HeaderTag *tag; -- const char *from, *to; -- from = headerLine; -- -- if ( *from != '@' ) { -- debug("[sam_header_line_parse] expected '@', got [%s]\n", headerLine); 
-- return 0; -- } -- to = ++from; -- -- while (*to && *to!='\t') to++; -- if ( to-from != 2 ) { -- debug("[sam_header_line_parse] expected '@XY', got [%s]\nHint: The header tags must be tab-separated.\n", headerLine); -- return 0; -- } -- -- hline = malloc(sizeof(HeaderLine)); -- hline->type[0] = from[0]; -- hline->type[1] = from[1]; -- hline->tags = NULL; -- -- int itype = tag_exists(hline->type, types); -- -- from = to; -- while (*to && *to=='\t') to++; -- if ( to-from != 1 ) { -- debug("[sam_header_line_parse] multiple tabs on line [%s] (%d)\n", headerLine,(int)(to-from)); -- free(hline); -- return 0; -- } -- from = to; -- while (*from) -- { -- while (*to && *to!='\t') to++; -- -- if ( !required_tags[itype] && !optional_tags[itype] ) -- { -- // CO is a special case, it can contain anything, including tabs -- if ( *to ) { to++; continue; } -- tag = new_tag(" ",from,to-1); -- } -- else -- tag = new_tag(from,from+3,to-1); -- -- if ( header_line_has_tag(hline,tag->key) ) -- debug("The tag '%c%c' present (at least) twice on line [%s]\n", tag->key[0],tag->key[1], headerLine); -- hline->tags = list_append(hline->tags, tag); -- -- from = to; -- while (*to && *to=='\t') to++; -- if ( *to && to-from != 1 ) { -- debug("[sam_header_line_parse] multiple tabs on line [%s] (%d)\n", headerLine,(int)(to-from)); -- return 0; -- } -- -- from = to; -- } -- return hline; --} -- -- --// Must be of an existing type, all tags must be recognised and all required tags must be present --static int sam_header_line_validate(HeaderLine *hline) --{ -- list_t *tags; -- HeaderTag *tag; -- int itype, itag; -- -- // Is the type correct? -- itype = tag_exists(hline->type, types); -- if ( itype==-1 ) -- { -- debug("The type [%c%c] not recognised.\n", hline->type[0],hline->type[1]); -- return 0; -- } -- -- // Has all required tags? 
-- itag = 0; -- while ( required_tags[itype] && required_tags[itype][itag] ) -- { -- if ( !header_line_has_tag(hline,required_tags[itype][itag]) ) -- { -- debug("The tag [%c%c] required for [%c%c] not present.\n", required_tags[itype][itag][0],required_tags[itype][itag][1], -- hline->type[0],hline->type[1]); -- return 0; -- } -- itag++; -- } -- -- // Are all tags recognised? -- tags = hline->tags; -- while ( tags ) -- { -- tag = tags->data; -- if ( !tag_exists(tag->key,required_tags[itype]) && !tag_exists(tag->key,optional_tags[itype]) ) -- { -- // Lower case tags are user-defined values. -- if( !(islower(tag->key[0]) || islower(tag->key[1])) ) -- { -- // Neither is lower case, but tag was not recognized. -- debug("Unknown tag [%c%c] for [%c%c].\n", tag->key[0],tag->key[1], hline->type[0],hline->type[1]); -- // return 0; // Even unknown tags are allowed - for forward compatibility with new attributes -- } -- // else - allow user defined tag -- } -- tags = tags->next; -- } -- -- return 1; --} -- -- --static void print_header_line(FILE *fp, HeaderLine *hline) --{ -- list_t *tags = hline->tags; -- HeaderTag *tag; -- -- fprintf(fp, "@%c%c", hline->type[0],hline->type[1]); -- while (tags) -- { -- tag = tags->data; -- -- fprintf(fp, "\t"); -- if ( tag->key[0]!=' ' || tag->key[1]!=' ' ) -- fprintf(fp, "%c%c:", tag->key[0],tag->key[1]); -- fprintf(fp, "%s", tag->value); -- -- tags = tags->next; -- } -- fprintf(fp,"\n"); --} -- -- --static void sam_header_line_free(HeaderLine *hline) --{ -- list_t *tags = hline->tags; -- while (tags) -- { -- HeaderTag *tag = tags->data; -- free(tag->value); -- free(tag); -- tags = tags->next; -- } -- list_free(hline->tags); -- free(hline); --} -- --void sam_header_free(void *_header) --{ -- HeaderDict *header = (HeaderDict*)_header; -- list_t *hlines = header; -- while (hlines) -- { -- sam_header_line_free(hlines->data); -- hlines = hlines->next; -- } -- list_free(header); --} -- --HeaderDict *sam_header_clone(const HeaderDict *dict) --{ -- 
HeaderDict *out = NULL; -- while (dict) -- { -- HeaderLine *hline = dict->data; -- out = list_append(out, sam_header_line_clone(hline)); -- dict = dict->next; -- } -- return out; --} -- --// Returns a newly allocated string --char *sam_header_write(const void *_header) --{ -- const HeaderDict *header = (const HeaderDict*)_header; -- char *out = NULL; -- int len=0, nout=0; -- const list_t *hlines; -- -- // Calculate the length of the string to allocate -- hlines = header; -- while (hlines) -- { -- len += 4; // @XY and \n -- -- HeaderLine *hline = hlines->data; -- list_t *tags = hline->tags; -- while (tags) -- { -- HeaderTag *tag = tags->data; -- len += strlen(tag->value) + 1; // \t -- if ( tag->key[0]!=' ' || tag->key[1]!=' ' ) -- len += strlen(tag->value) + 3; // XY: -- tags = tags->next; -- } -- hlines = hlines->next; -- } -- -- nout = 0; -- out = malloc(len+1); -- hlines = header; -- while (hlines) -- { -- HeaderLine *hline = hlines->data; -- -- nout += sprintf(out+nout,"@%c%c",hline->type[0],hline->type[1]); -- -- list_t *tags = hline->tags; -- while (tags) -- { -- HeaderTag *tag = tags->data; -- nout += sprintf(out+nout,"\t"); -- if ( tag->key[0]!=' ' || tag->key[1]!=' ' ) -- nout += sprintf(out+nout,"%c%c:", tag->key[0],tag->key[1]); -- nout += sprintf(out+nout,"%s", tag->value); -- tags = tags->next; -- } -- hlines = hlines->next; -- nout += sprintf(out+nout,"\n"); -- } -- out[len] = 0; -- return out; --} -- --void *sam_header_parse2(const char *headerText) --{ -- list_t *hlines = NULL; -- HeaderLine *hline; -- const char *text; -- char *buf=NULL; -- size_t nbuf = 0; -- int tovalidate = 0; -- -- if ( !headerText ) -- return 0; -- -- text = headerText; -- while ( (text=nextline(&buf, &nbuf, text)) ) -- { -- hline = sam_header_line_parse(buf); -- if ( hline && (!tovalidate || sam_header_line_validate(hline)) ) -- // With too many (~250,000) reference sequences the header parsing was too slow with list_append. 
-- hlines = list_append_to_end(hlines, hline); -- else -- { -- if (hline) sam_header_line_free(hline); -- sam_header_free(hlines); -- if ( buf ) free(buf); -- return NULL; -- } -- } -- if ( buf ) free(buf); -- -- return hlines; --} -- --void *sam_header2tbl(const void *_dict, char type[2], char key_tag[2], char value_tag[2]) --{ -- const HeaderDict *dict = (const HeaderDict*)_dict; -- const list_t *l = dict; -- khash_t(str) *tbl = kh_init(str); -- khiter_t k; -- int ret; -- -- if (_dict == 0) return tbl; // return an empty (not null) hash table -- while (l) -- { -- HeaderLine *hline = l->data; -- if ( hline->type[0]!=type[0] || hline->type[1]!=type[1] ) -- { -- l = l->next; -- continue; -- } -- -- HeaderTag *key, *value; -- key = header_line_has_tag(hline,key_tag); -- value = header_line_has_tag(hline,value_tag); -- if ( !key || !value ) -- { -- l = l->next; -- continue; -- } -- -- k = kh_get(str, tbl, key->value); -- if ( k != kh_end(tbl) ) -- debug("[sam_header_lookup_table] They key %s not unique.\n", key->value); -- k = kh_put(str, tbl, key->value, &ret); -- kh_value(tbl, k) = value->value; -- -- l = l->next; -- } -- return tbl; --} -- --char **sam_header2list(const void *_dict, char type[2], char key_tag[2], int *_n) --{ -- const HeaderDict *dict = (const HeaderDict*)_dict; -- const list_t *l = dict; -- int max, n; -- char **ret; -- -- ret = 0; *_n = max = n = 0; -- while (l) -- { -- HeaderLine *hline = l->data; -- if ( hline->type[0]!=type[0] || hline->type[1]!=type[1] ) -- { -- l = l->next; -- continue; -- } -- -- HeaderTag *key; -- key = header_line_has_tag(hline,key_tag); -- if ( !key ) -- { -- l = l->next; -- continue; -- } -- -- if (n == max) { -- max = max? 
max<<1 : 4; -- ret = realloc(ret, max * sizeof(char*)); -- } -- ret[n++] = key->value; -- -- l = l->next; -- } -- *_n = n; -- return ret; --} -- --void *sam_header2key_val(void *iter, const char type[2], const char key_tag[2], const char value_tag[2], const char **_key, const char **_value) --{ -- list_t *l = iter; -- if ( !l ) return NULL; -- -- while (l) -- { -- HeaderLine *hline = l->data; -- if ( hline->type[0]!=type[0] || hline->type[1]!=type[1] ) -- { -- l = l->next; -- continue; -- } -- -- HeaderTag *key, *value; -- key = header_line_has_tag(hline,key_tag); -- value = header_line_has_tag(hline,value_tag); -- if ( !key || !value ) -- { -- l = l->next; -- continue; -- } -- -- *_key = key->value; -- *_value = value->value; -- return l->next; -- } -- return l; --} -- --const char *sam_tbl_get(void *h, const char *key) --{ -- khash_t(str) *tbl = (khash_t(str)*)h; -- khint_t k; -- k = kh_get(str, tbl, key); -- return k == kh_end(tbl)? 0 : kh_val(tbl, k); --} -- --int sam_tbl_size(void *h) --{ -- khash_t(str) *tbl = (khash_t(str)*)h; -- return h? 
kh_size(tbl) : 0; --} -- --void sam_tbl_destroy(void *h) --{ -- khash_t(str) *tbl = (khash_t(str)*)h; -- kh_destroy(str, tbl); --} -- --void *sam_header_merge(int n, const void **_dicts) --{ -- const HeaderDict **dicts = (const HeaderDict**)_dicts; -- HeaderDict *out_dict; -- int idict, status; -- -- if ( n<2 ) return NULL; -- -- out_dict = sam_header_clone(dicts[0]); -- -- for (idict=1; idictdata, out_hlines->data); -- if ( status==0 ) -- { -- out_hlines = out_hlines->next; -- continue; -- } -- -- if ( status==2 ) -- { -- print_header_line(samtools_stderr,tmpl_hlines->data); -- print_header_line(samtools_stderr,out_hlines->data); -- debug("Conflicting lines, cannot merge the headers.\n"); -- return 0; -- } -- if ( status==3 ) -- sam_header_line_merge_with(out_hlines->data, tmpl_hlines->data); -- -- inserted = 1; -- break; -- } -- if ( !inserted ) -- out_dict = list_append(out_dict, sam_header_line_clone(tmpl_hlines->data)); -- -- tmpl_hlines = tmpl_hlines->next; -- } -- } -- -- return out_dict; --} -- --char **sam_header2tbl_n(const void *dict, const char type[2], const char *tags[], int *n) --{ -- int nout = 0; -- char **out = NULL; -- -- *n = 0; -- list_t *l = (list_t *)dict; -- if ( !l ) return NULL; -- -- int i, ntags = 0; -- while ( tags[ntags] ) ntags++; -- -- while (l) -- { -- HeaderLine *hline = l->data; -- if ( hline->type[0]!=type[0] || hline->type[1]!=type[1] ) -- { -- l = l->next; -- continue; -- } -- out = (char**) realloc(out, sizeof(char*)*(nout+1)*ntags); -- for (i=0; ivalue; -- } -- nout++; -- l = l->next; -- } -- *n = nout; -- return out; --} -- ---- python-pysam.orig/samtools/sam_header.h -+++ /dev/null -@@ -1,72 +0,0 @@ --/* sam_header.h -- basic SAM/BAM header API. -- -- Copyright (C) 2009, 2012, 2013 Genome Research Ltd. 
-- -- Author: Petr Danecek -- --Permission is hereby granted, free of charge, to any person obtaining a copy --of this software and associated documentation files (the "Software"), to deal --in the Software without restriction, including without limitation the rights --to use, copy, modify, merge, publish, distribute, sublicense, and/or sell --copies of the Software, and to permit persons to whom the Software is --furnished to do so, subject to the following conditions: -- --The above copyright notice and this permission notice shall be included in --all copies or substantial portions of the Software. -- --THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR --IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, --FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL --THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER --LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING --FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER --DEALINGS IN THE SOFTWARE. 
*/ -- --#ifndef __SAM_HEADER_H__ --#define __SAM_HEADER_H__ -- --#ifdef __cplusplus --extern "C" { --#endif -- -- void *sam_header_parse2(const char *headerText); -- void *sam_header_merge(int n, const void **dicts); -- void sam_header_free(void *header); -- char *sam_header_write(const void *headerDict); // returns a newly allocated string -- -- /* -- // Usage example -- const char *key, *val; -- void *iter = sam_header_parse2(bam->header->text); -- while ( iter = sam_header_key_val(iter, "RG","ID","SM" &key,&val) ) printf("%s\t%s\n", key,val); -- */ -- void *sam_header2key_val(void *iter, const char type[2], const char key_tag[2], const char value_tag[2], const char **key, const char **value); -- char **sam_header2list(const void *_dict, char type[2], char key_tag[2], int *_n); -- -- /* -- // Usage example -- int i, j, n; -- const char *tags[] = {"SN","LN","UR","M5",NULL}; -- void *dict = sam_header_parse2(bam->header->text); -- char **tbl = sam_header2tbl_n(h->dict, "SQ", tags, &n); -- for (i=0; i - -@@ -66,8 +66,23 @@ - break; - } else if (strcmp(lopt->name, "reference") == 0) { - char *ref = malloc(10 + strlen(optarg) + 1); -+ -+ if (!ref) { -+ fprintf(stderr, "Unable to allocate memory in " -+ "parse_sam_global_opt.\n"); -+ -+ return -1; -+ } -+ - sprintf(ref, "reference=%s", optarg); -- ga->reference = strdup(optarg); -+ -+ if (!(ga->reference = strdup(optarg))) { -+ fprintf(stderr, "Unable to allocate memory in " -+ "parse_sam_global_opt.\n"); -+ -+ return -1; -+ } -+ - r = hts_opt_add((hts_opt **)&ga->in.specific, ref); - r |= hts_opt_add((hts_opt **)&ga->out.specific, ref); - free(ref); -@@ -75,17 +90,32 @@ - } else if (strcmp(lopt->name, "threads") == 0) { - ga->nthreads = atoi(optarg); - break; --// } else if (strcmp(lopt->name, "verbose") == 0) { --// ga->verbosity++; --// break; -+ } else if (strcmp(lopt->name, "write-index") == 0) { -+ ga->write_index = 1; -+ break; -+ } else if (strcmp(lopt->name, "verbosity") == 0) { -+ hts_verbose = atoi(optarg); 
-+ break; - } - } - - if (!lopt->name) { -- fprintf(stderr, "Unexpected global option: %s\n", lopt->name); -+ fprintf(stderr, "Unexpected global option.\n"); - return -1; - } - -+ /* -+ * SAM format with compression enabled implies SAM.bgzf -+ */ -+ if (ga->out.format == sam) { -+ hts_opt *opts = (hts_opt *)ga->out.specific; -+ while (opts) { -+ if (opts->opt == HTS_OPT_COMPRESSION_LEVEL) -+ ga->out.compression = bgzf; -+ opts = opts->next; -+ } -+ } -+ - return r; - } - -@@ -136,9 +166,12 @@ - else if (strcmp(lopts[i].name, "threads") == 0) - fprintf(fp,"threads INT\n" - " Number of additional threads to use [0]\n"); --// else if (strcmp(lopts[i].name, "verbose") == 0) --// fprintf(fp,"verbose\n" --// " Increment level of verbosity\n"); -+ else if (strcmp(lopts[i].name, "write-index") == 0) -+ fprintf(fp,"write-index\n" -+ " Automatically index the output files [off]\n"); -+ else if (strcmp(lopts[i].name, "verbosity") == 0) -+ fprintf(fp,"verbosity INT\n" -+ " Set level of verbosity\n"); - } - } - ---- python-pysam.orig/samtools/sam_opts.c.pysam.c -+++ python-pysam/samtools/sam_opts.c.pysam.c -@@ -2,7 +2,7 @@ - - /* sam_opts.c -- utilities to aid parsing common command line options. - -- Copyright (C) 2015 Genome Research Ltd. -+ Copyright (C) 2015, 2019 Genome Research Ltd. 
- - Author: James Bonfield - -@@ -68,8 +68,23 @@ - break; - } else if (strcmp(lopt->name, "reference") == 0) { - char *ref = malloc(10 + strlen(optarg) + 1); -+ -+ if (!ref) { -+ fprintf(samtools_stderr, "Unable to allocate memory in " -+ "parse_sam_global_opt.\n"); -+ -+ return -1; -+ } -+ - sprintf(ref, "reference=%s", optarg); -- ga->reference = strdup(optarg); -+ -+ if (!(ga->reference = strdup(optarg))) { -+ fprintf(samtools_stderr, "Unable to allocate memory in " -+ "parse_sam_global_opt.\n"); -+ -+ return -1; -+ } -+ - r = hts_opt_add((hts_opt **)&ga->in.specific, ref); - r |= hts_opt_add((hts_opt **)&ga->out.specific, ref); - free(ref); -@@ -77,17 +92,32 @@ - } else if (strcmp(lopt->name, "threads") == 0) { - ga->nthreads = atoi(optarg); - break; --// } else if (strcmp(lopt->name, "verbose") == 0) { --// ga->verbosity++; --// break; -+ } else if (strcmp(lopt->name, "write-index") == 0) { -+ ga->write_index = 1; -+ break; -+ } else if (strcmp(lopt->name, "verbosity") == 0) { -+ hts_verbose = atoi(optarg); -+ break; - } - } - - if (!lopt->name) { -- fprintf(samtools_stderr, "Unexpected global option: %s\n", lopt->name); -+ fprintf(samtools_stderr, "Unexpected global option.\n"); - return -1; - } - -+ /* -+ * SAM format with compression enabled implies SAM.bgzf -+ */ -+ if (ga->out.format == sam) { -+ hts_opt *opts = (hts_opt *)ga->out.specific; -+ while (opts) { -+ if (opts->opt == HTS_OPT_COMPRESSION_LEVEL) -+ ga->out.compression = bgzf; -+ opts = opts->next; -+ } -+ } -+ - return r; - } - -@@ -138,9 +168,12 @@ - else if (strcmp(lopts[i].name, "threads") == 0) - fprintf(fp,"threads INT\n" - " Number of additional threads to use [0]\n"); --// else if (strcmp(lopts[i].name, "verbose") == 0) --// fprintf(fp,"verbose\n" --// " Increment level of verbosity\n"); -+ else if (strcmp(lopts[i].name, "write-index") == 0) -+ fprintf(fp,"write-index\n" -+ " Automatically index the output files [off]\n"); -+ else if (strcmp(lopts[i].name, "verbosity") == 0) -+ 
fprintf(fp,"verbosity INT\n" -+ " Set level of verbosity\n"); - } - } - ---- python-pysam.orig/samtools/sam_opts.h -+++ python-pysam/samtools/sam_opts.h -@@ -1,6 +1,6 @@ - /* sam_opts.h -- utilities to aid parsing common command line options. - -- Copyright (C) 2015 Genome Research Ltd. -+ Copyright (C) 2015, 2019 Genome Research Ltd. - - Author: James Bonfield - -@@ -35,7 +35,7 @@ - htsFormat out; - char *reference; - int nthreads; -- //int verbosity; -+ int write_index; - } sam_global_args; - - #define SAM_GLOBAL_ARGS_INIT {{0},{0}} -@@ -47,7 +47,8 @@ - SAM_OPT_OUTPUT_FMT_OPTION, - SAM_OPT_REFERENCE, - SAM_OPT_NTHREADS, -- //SAM_OPT_VERBOSE -+ SAM_OPT_WRITE_INDEX, -+ SAM_OPT_VERBOSITY, - }; - - #define SAM_OPT_VAL(val, defval) ((val) == '-')? '?' : (val)? (val) : (defval) -@@ -64,8 +65,9 @@ - {"output-fmt", required_argument, NULL, SAM_OPT_VAL(o3, SAM_OPT_OUTPUT_FMT)}, \ - {"output-fmt-option", required_argument, NULL, SAM_OPT_VAL(o4, SAM_OPT_OUTPUT_FMT_OPTION)}, \ - {"reference", required_argument, NULL, SAM_OPT_VAL(o5, SAM_OPT_REFERENCE)}, \ -- {"threads", required_argument, NULL, SAM_OPT_VAL(o6, SAM_OPT_NTHREADS)} -- //{"verbose", no_argument, NULL, SAM_OPT_VERBOSE} -+ {"threads", required_argument, NULL, SAM_OPT_VAL(o6, SAM_OPT_NTHREADS)}, \ -+ {"write-index", no_argument, NULL, SAM_OPT_WRITE_INDEX}, \ -+ {"verbosity", required_argument, NULL, SAM_OPT_VERBOSITY} - - /* - * Processes a standard "global" samtools long option. ---- python-pysam.orig/samtools/sam_utils.c -+++ python-pysam/samtools/sam_utils.c -@@ -1,6 +1,6 @@ - /* sam_utils.c -- various utilities internal to samtools. - -- Copyright (C) 2014-2016 Genome Research Ltd. -+ Copyright (C) 2014-2016, 2018, 2019 Genome Research Ltd. - - Author: John Marshall - -@@ -23,6 +23,7 @@ - DEALINGS IN THE SOFTWARE. */ - - #include -+#include - - #include - #include -@@ -58,3 +59,80 @@ - vprint_error_core(subcommand, format, args, err? 
strerror(err) : NULL); - va_end(args); - } -+ -+void check_sam_close(const char *subcmd, samFile *fp, const char *fname, const char *null_fname, int *retp) -+{ -+ int r = sam_close(fp); -+ if (r >= 0) return; -+ -+ // TODO Need error infrastructure so we can print a message instead of r -+ if (fname) print_error(subcmd, "error closing \"%s\": %d", fname, r); -+ else print_error(subcmd, "error closing %s: %d", null_fname, r); -+ -+ *retp = EXIT_FAILURE; -+} -+ -+/* Pick an index suffix based on the output file descriptor type. */ -+static char *idx_suffix(htsFile *fp) { -+ switch (fp->format.format) { -+ case sam: -+ case bam: -+ // Tough cheese if you wanted bai! -+ // New feature => mandatory new index too, for simplicity of CLI. -+ return "csi"; -+ -+ case cram: -+ return "crai"; -+ -+ default: -+ return NULL; -+ } -+} -+ -+/* -+ * Utility function to add an index to a file we've opened for write. -+ * NB: Call this after writing the header and before writing sequences. -+ * -+ * The returned index filename should be freed by the caller, but only -+ * after sam_idx_save has been called. -+ * -+ * Returns index filename on success, -+ * NULL on failure. 
-+ */ -+char *auto_index(htsFile *fp, const char *fn, bam_hdr_t *header) { -+ char *fn_idx; -+ int min_shift = 14; /* CSI */ -+ if (!fn || !*fn || strcmp(fn, "-") == 0) -+ return NULL; -+ -+ char *delim = strstr(fn, HTS_IDX_DELIM); -+ if (delim != NULL) { -+ delim += strlen(HTS_IDX_DELIM); -+ -+ fn_idx = strdup(delim); -+ if (!fn_idx) -+ return NULL; -+ -+ size_t l = strlen(fn_idx); -+ if (l >= 4 && strcmp(fn_idx + l - 4, ".bai") == 0) -+ min_shift = 0; -+ } else { -+ char *suffix = idx_suffix(fp); -+ if (!suffix) -+ return NULL; -+ -+ fn_idx = malloc(strlen(fn)+6); -+ if (!fn_idx) -+ return NULL; -+ -+ sprintf(fn_idx, "%s.%s", fn, suffix); -+ } -+ -+ if (sam_idx_init(fp, header, min_shift, fn_idx) < 0) { -+ print_error_errno("auto_index", "failed to open index \"%s\" for writing", fn_idx); -+ free(fn_idx); -+ return NULL; -+ } -+ -+ return fn_idx; -+} ---- python-pysam.orig/samtools/sam_utils.c.pysam.c -+++ python-pysam/samtools/sam_utils.c.pysam.c -@@ -2,7 +2,7 @@ - - /* sam_utils.c -- various utilities internal to samtools. - -- Copyright (C) 2014-2016 Genome Research Ltd. -+ Copyright (C) 2014-2016, 2018, 2019 Genome Research Ltd. - - Author: John Marshall - -@@ -25,6 +25,7 @@ - DEALINGS IN THE SOFTWARE. */ - - #include -+#include - - #include - #include -@@ -60,3 +61,80 @@ - vprint_error_core(subcommand, format, args, err? strerror(err) : NULL); - va_end(args); - } -+ -+void check_sam_close(const char *subcmd, samFile *fp, const char *fname, const char *null_fname, int *retp) -+{ -+ int r = sam_close(fp); -+ if (r >= 0) return; -+ -+ // TODO Need error infrastructure so we can print a message instead of r -+ if (fname) print_error(subcmd, "error closing \"%s\": %d", fname, r); -+ else print_error(subcmd, "error closing %s: %d", null_fname, r); -+ -+ *retp = EXIT_FAILURE; -+} -+ -+/* Pick an index suffix based on the output file descriptor type. 
*/ -+static char *idx_suffix(htsFile *fp) { -+ switch (fp->format.format) { -+ case sam: -+ case bam: -+ // Tough cheese if you wanted bai! -+ // New feature => mandatory new index too, for simplicity of CLI. -+ return "csi"; -+ -+ case cram: -+ return "crai"; -+ -+ default: -+ return NULL; -+ } -+} -+ -+/* -+ * Utility function to add an index to a file we've opened for write. -+ * NB: Call this after writing the header and before writing sequences. -+ * -+ * The returned index filename should be freed by the caller, but only -+ * after sam_idx_save has been called. -+ * -+ * Returns index filename on success, -+ * NULL on failure. -+ */ -+char *auto_index(htsFile *fp, const char *fn, bam_hdr_t *header) { -+ char *fn_idx; -+ int min_shift = 14; /* CSI */ -+ if (!fn || !*fn || strcmp(fn, "-") == 0) -+ return NULL; -+ -+ char *delim = strstr(fn, HTS_IDX_DELIM); -+ if (delim != NULL) { -+ delim += strlen(HTS_IDX_DELIM); -+ -+ fn_idx = strdup(delim); -+ if (!fn_idx) -+ return NULL; -+ -+ size_t l = strlen(fn_idx); -+ if (l >= 4 && strcmp(fn_idx + l - 4, ".bai") == 0) -+ min_shift = 0; -+ } else { -+ char *suffix = idx_suffix(fp); -+ if (!suffix) -+ return NULL; -+ -+ fn_idx = malloc(strlen(fn)+6); -+ if (!fn_idx) -+ return NULL; -+ -+ sprintf(fn_idx, "%s.%s", fn, suffix); -+ } -+ -+ if (sam_idx_init(fp, header, min_shift, fn_idx) < 0) { -+ print_error_errno("auto_index", "failed to open index \"%s\" for writing", fn_idx); -+ free(fn_idx); -+ return NULL; -+ } -+ -+ return fn_idx; -+} ---- python-pysam.orig/samtools/sam_view.c -+++ python-pysam/samtools/sam_view.c -@@ -1,6 +1,6 @@ - /* sam_view.c -- SAM<->BAM<->CRAM conversion. - -- Copyright (C) 2009-2017 Genome Research Ltd. -+ Copyright (C) 2009-2019 Genome Research Ltd. - Portions copyright (C) 2009, 2011, 2012 Broad Institute. 
- - Author: Heng Li -@@ -32,33 +32,25 @@ - #include - #include - #include --#include --#include - #include --#include - #include "htslib/sam.h" - #include "htslib/faidx.h" --#include "htslib/kstring.h" - #include "htslib/khash.h" --#include "htslib/klist.h" - #include "htslib/thread_pool.h" --#include "htslib/bgzf.h" - #include "samtools.h" - #include "sam_opts.h" - #include "bedidx.h" - --#define DEFAULT_BARCODE_TAG "BC" --#define DEFAULT_QUALITY_TAG "QT" -- - KHASH_SET_INIT_STR(rg) --#define taglist_free(p) --KLIST_INIT(ktaglist, char*, taglist_free) -+KHASH_SET_INIT_STR(tv) - - typedef khash_t(rg) *rghash_t; -+typedef khash_t(tv) *tvhash_t; - - // This structure contains the settings for a samview run - typedef struct samview_settings { - rghash_t rghash; -+ tvhash_t tvhash; - int min_mapQ; - int flag_on; - int flag_off; -@@ -72,16 +64,17 @@ - size_t remove_aux_len; - char** remove_aux; - int multi_region; -+ char* tag; - } samview_settings_t; - - - // TODO Add declarations of these to a viable htslib or samtools header --extern const char *bam_get_library(bam_hdr_t *header, const bam1_t *b); -+extern const char *bam_get_library(sam_hdr_t *header, const bam1_t *b); - extern int bam_remove_B(bam1_t *b); - extern char *samfaipath(const char *fn_ref); - - // Returns 0 to indicate read should be output 1 otherwise --static int process_aln(const bam_hdr_t *h, bam1_t *b, samview_settings_t* settings) -+static int process_aln(const sam_hdr_t *h, bam1_t *b, samview_settings_t* settings) - { - if (settings->remove_B) bam_remove_B(b); - if (settings->min_qlen > 0) { -@@ -96,7 +89,7 @@ - return 1; - if (settings->flag_alloff && ((b->core.flag & settings->flag_alloff) == settings->flag_alloff)) - return 1; -- if (!settings->multi_region && settings->bed && (b->core.tid < 0 || !bed_overlap(settings->bed, h->target_name[b->core.tid], b->core.pos, bam_endpos(b)))) -+ if (!settings->multi_region && settings->bed && (b->core.tid < 0 || !bed_overlap(settings->bed, 
sam_hdr_tid2name(h, b->core.tid), b->core.pos, bam_endpos(b)))) - return 1; - if (settings->subsam_frac > 0.) { - uint32_t k = __ac_Wang_hash(__ac_X31_hash_string(bam_get_qname(b)) ^ settings->subsam_seed); -@@ -109,8 +102,17 @@ - if (k == kh_end(settings->rghash)) return 1; - } - } -+ if (settings->tvhash && settings->tag) { -+ uint8_t *s = bam_aux_get(b, settings->tag); -+ if (s) { -+ khint_t k = kh_get(tv, settings->tvhash, (char*)(s + 1)); -+ if (k == kh_end(settings->tvhash)) return 1; -+ } else { -+ return 1; -+ } -+ } - if (settings->library) { -- const char *p = bam_get_library((bam_hdr_t*)h, b); -+ const char *p = bam_get_library((sam_hdr_t*)h, b); - if (!p || strcmp(p, settings->library) != 0) return 1; - } - if (settings->remove_aux_len) { -@@ -125,37 +127,6 @@ - return 0; - } - --static char *drop_rg(char *hdtxt, rghash_t h, int *len) --{ -- char *p = hdtxt, *q, *r, *s; -- kstring_t str; -- memset(&str, 0, sizeof(kstring_t)); -- while (1) { -- int toprint = 0; -- q = strchr(p, '\n'); -- if (q == 0) q = p + strlen(p); -- if (q - p < 3) break; // the line is too short; then stop -- if (strncmp(p, "@RG\t", 4) == 0) { -- int c; -- khint_t k; -- if ((r = strstr(p, "\tID:")) != 0) { -- r += 4; -- for (s = r; *s != '\0' && *s != '\n' && *s != '\t'; ++s); -- c = *s; *s = '\0'; -- k = kh_get(rg, h, r); -- *s = c; -- if (k != kh_end(h)) toprint = 1; -- } -- } else toprint = 1; -- if (toprint) { -- kputsn(p, q - p, &str); kputc('\n', &str); -- } -- p = q + 1; -- } -- *len = str.l; -- return str.s; --} -- - static int usage(FILE *fp, int exit_status, int is_long_help); - - static int add_read_group_single(const char *subcmd, samview_settings_t *settings, char *name) -@@ -217,39 +188,87 @@ - return (ret != -1) ? 
0 : -1; - } - --static inline int check_sam_write1(samFile *fp, const bam_hdr_t *h, const bam1_t *b, const char *fname, int *retp) -+static int add_tag_value_single(const char *subcmd, samview_settings_t *settings, char *name) - { -- int r = sam_write1(fp, h, b); -- if (r >= 0) return r; -+ char *d = strdup(name); -+ int ret = 0; - -- if (fname) print_error_errno("view", "writing to \"%s\" failed", fname); -- else print_error_errno("view", "writing to standard output failed"); -+ if (d == NULL) goto err; - -- *retp = EXIT_FAILURE; -- return r; -+ if (settings->tvhash == NULL) { -+ settings->tvhash = kh_init(tv); -+ if (settings->tvhash == NULL) goto err; -+ } -+ -+ kh_put(tv, settings->tvhash, d, &ret); -+ if (ret == -1) goto err; -+ if (ret == 0) free(d); /* Duplicate */ -+ return 0; -+ -+ err: -+ print_error(subcmd, "Couldn't add \"%s\" to tag values list: memory exhausted?", name); -+ free(d); -+ return -1; -+} -+ -+static int add_tag_values_file(const char *subcmd, samview_settings_t *settings, char *fn) -+{ -+ FILE *fp; -+ char buf[1024]; -+ int ret = 0; -+ if (settings->tvhash == NULL) { -+ settings->tvhash = kh_init(tv); -+ if (settings->tvhash == NULL) { -+ perror(NULL); -+ return -1; -+ } -+ } -+ -+ fp = fopen(fn, "r"); -+ if (fp == NULL) { -+ print_error_errno(subcmd, "failed to open \"%s\" for reading", fn); -+ return -1; -+ } -+ -+ while (ret != -1 && !feof(fp) && fscanf(fp, "%1023s", buf) > 0) { -+ char *d = strdup(buf); -+ if (d != NULL) { -+ kh_put(tv, settings->tvhash, d, &ret); -+ if (ret == 0) free(d); /* Duplicate */ -+ } else { -+ ret = -1; -+ } -+ } -+ if (ferror(fp)) ret = -1; -+ if (ret == -1) { -+ print_error_errno(subcmd, "failed to read \"%s\"", fn); -+ } -+ fclose(fp); -+ return (ret != -1) ? 
0 : -1; - } - --static void check_sam_close(const char *subcmd, samFile *fp, const char *fname, const char *null_fname, int *retp) -+static inline int check_sam_write1(samFile *fp, const sam_hdr_t *h, const bam1_t *b, const char *fname, int *retp) - { -- int r = sam_close(fp); -- if (r >= 0) return; -+ int r = sam_write1(fp, h, b); -+ if (r >= 0) return r; - -- // TODO Need error infrastructure so we can print a message instead of r -- if (fname) print_error(subcmd, "error closing \"%s\": %d", fname, r); -- else print_error(subcmd, "error closing %s: %d", null_fname, r); -+ if (fname) print_error_errno("view", "writing to \"%s\" failed", fname); -+ else print_error_errno("view", "writing to standard output failed"); - - *retp = EXIT_FAILURE; -+ return r; - } - - int main_samview(int argc, char *argv[]) - { -- int c, is_header = 0, is_header_only = 0, ret = 0, compress_level = -1, is_count = 0; -+ int c, is_header = 0, is_header_only = 0, ret = 0, compress_level = -1, is_count = 0, has_index_file = 0, no_pg = 0; - int64_t count = 0; - samFile *in = 0, *out = 0, *un_out=0; - FILE *fp_out = NULL; -- bam_hdr_t *header = NULL; -+ sam_hdr_t *header = NULL; - char out_mode[5], out_un_mode[5], *out_format = ""; -- char *fn_in = 0, *fn_out = 0, *fn_list = 0, *q, *fn_un_out = 0; -+ char *fn_in = 0, *fn_idx_in = 0, *fn_out = 0, *fn_list = 0, *q, *fn_un_out = 0; -+ char *fn_out_idx = NULL, *fn_un_out_idx = NULL, *arg_list = NULL; - sam_global_args ga = SAM_GLOBAL_ARGS_INIT; - htsThreadPool p = {NULL, 0}; - int filter_state = ALL, filter_op = 0; -@@ -257,6 +276,7 @@ - - samview_settings_t settings = { - .rghash = NULL, -+ .tvhash = NULL, - .min_mapQ = 0, - .flag_on = 0, - .flag_off = 0, -@@ -267,11 +287,13 @@ - .subsam_frac = -1., - .library = NULL, - .bed = NULL, -- .multi_region = 0 -+ .multi_region = 0, -+ .tag = NULL - }; - - static const struct option lopts[] = { - SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 'T', '@'), -+ {"no-PG", no_argument, NULL, 1}, - { NULL, 0, NULL, 0 } 
- }; - -@@ -288,7 +310,7 @@ - opterr = 0; - - while ((c = getopt_long(argc, argv, -- "SbBcCt:h1Ho:O:q:f:F:G:ul:r:T:R:L:s:@:m:x:U:M", -+ "SbBcCt:h1Ho:O:q:f:F:G:ul:r:T:R:d:D:L:s:@:m:x:U:MX", - lopts, NULL)) >= 0) { - switch (c) { - case 's': -@@ -298,7 +320,6 @@ - srand(settings.subsam_seed); - settings.subsam_seed = rand(); - } -- - if (q && *q == '.') { - settings.subsam_frac = strtod(q, &q); - if (*q) ret = 1; -@@ -321,6 +342,7 @@ - case 'H': is_header_only = 1; break; - case 'o': fn_out = strdup(optarg); break; - case 'U': fn_un_out = strdup(optarg); break; -+ case 'X': has_index_file = 1; break; - case 'f': settings.flag_on |= strtol(optarg, 0, 0); break; - case 'F': settings.flag_off |= strtol(optarg, 0, 0); break; - case 'G': settings.flag_alloff |= strtol(optarg, 0, 0); break; -@@ -347,6 +369,63 @@ - goto view_end; - } - break; -+ case 'd': -+ if (strlen(optarg) < 4 || optarg[2] != ':') { -+ print_error_errno("view", "Invalid \"tag:value\" option: \"%s\"", optarg); -+ ret = 1; -+ goto view_end; -+ } -+ -+ if (settings.tag) { -+ if (settings.tag[0] != optarg[0] || settings.tag[1] != optarg[1]) { -+ print_error("view", "Different tag \"%s\" was specified before: \"%s\"", settings.tag, optarg); -+ ret = 1; -+ goto view_end; -+ } -+ } else { -+ if (!(settings.tag = calloc(3, 1))) { -+ print_error("view", "Could not allocate memory for tag: \"%s\"", optarg); -+ ret = 1; -+ goto view_end; -+ } -+ memcpy(settings.tag, optarg, 2); -+ } -+ -+ if (add_tag_value_single("view", &settings, optarg+3) != 0) { -+ ret = 1; -+ goto view_end; -+ } -+ break; -+ case 'D': -+ // Allow ";" as delimiter besides ":" to support MinGW CLI POSIX -+ // path translation as described at: -+ // http://www.mingw.org/wiki/Posix_path_conversion -+ if (strlen(optarg) < 4 || (optarg[2] != ':' && optarg[2] != ';')) { -+ print_error_errno("view", "Invalid \"tag:file\" option: \"%s\"", optarg); -+ ret = 1; -+ goto view_end; -+ } -+ -+ if (settings.tag) { -+ if (settings.tag[0] != optarg[0] || 
settings.tag[1] != optarg[1]) { -+ print_error("view", "Different tag \"%s\" was specified before: \"%s\"", settings.tag, optarg); -+ ret = 1; -+ goto view_end; -+ } -+ } else { -+ if (!(settings.tag = calloc(3, 1))) { -+ print_error("view", "Could not allocate memory for tag: \"%s\"", optarg); -+ ret = 1; -+ goto view_end; -+ } -+ memcpy(settings.tag, optarg, 2); -+ } -+ -+ if (add_tag_values_file("view", &settings, optarg+3) != 0) { -+ ret = 1; -+ goto view_end; -+ } -+ break; - /* REMOVED as htslib doesn't support this - //case 'x': out_format = "x"; break; - //case 'X': out_format = "X"; break; -@@ -380,6 +459,7 @@ - } - break; - case 'M': settings.multi_region = 1; break; -+ case 1: no_pg = 1; break; - default: - if (parse_sam_global_opt(c, optarg, lopts, &ga) != 0) - return usage(stderr, EXIT_FAILURE, 0); -@@ -429,13 +509,8 @@ - ret = 1; - goto view_end; - } -- if (settings.rghash) { // FIXME: I do not know what "bam_header_t::n_text" is for... -- char *tmp; -- int l; -- tmp = drop_rg(header->text, settings.rghash, &l); -- free(header->text); -- header->text = tmp; -- header->l_text = l; -+ if (settings.rghash) { -+ sam_hdr_remove_lines(header, "RG", "ID", settings.rghash); - } - if (!is_count) { - if ((out = sam_open_format(fn_out? fn_out : "-", out_mode, &ga.out)) == 0) { -@@ -450,7 +525,25 @@ - goto view_end; - } - } -- if (*out_format || is_header || -+ -+ if (!no_pg) { -+ if (!(arg_list = stringify_argv(argc+1, argv-1))) { -+ print_error("view", "failed to create arg_list"); -+ ret = 1; -+ goto view_end; -+ } -+ if (sam_hdr_add_pg(header, "samtools", -+ "VN", samtools_version(), -+ arg_list ? "CL": NULL, -+ arg_list ? 
arg_list : NULL, -+ NULL)) { -+ print_error("view", "failed to add PG line to the header"); -+ ret = 1; -+ goto view_end; -+ } -+ } -+ -+ if (*out_format || ga.write_index || is_header || - out_mode[1] == 'b' || out_mode[1] == 'c' || - (ga.out.format != sam && ga.out.format != unknown_format)) { - if (sam_hdr_write(out, header) != 0) { -@@ -459,6 +552,13 @@ - goto view_end; - } - } -+ if (ga.write_index) { -+ if (!(fn_out_idx = auto_index(out, fn_out, header))) { -+ ret = 1; -+ goto view_end; -+ } -+ } -+ - if (fn_un_out) { - if ((un_out = sam_open_format(fn_un_out, out_un_mode, &ga.out)) == 0) { - print_error_errno("view", "failed to open \"%s\" for writing", fn_un_out); -@@ -481,6 +581,12 @@ - goto view_end; - } - } -+ if (ga.write_index) { -+ if (!(fn_un_out_idx = auto_index(un_out, fn_un_out, header))) { -+ ret = 1; -+ goto view_end; -+ } -+ } - } - } - else { -@@ -505,11 +611,23 @@ - } - if (is_header_only) goto view_end; // no need to print alignments - -+ if (has_index_file) { -+ fn_idx_in = (optind+1 < argc)? argv[optind+1] : 0; -+ if (fn_idx_in == 0) { -+ fprintf(stderr, "[main_samview] incorrect number of arguments for -X option. 
Aborting.\n"); -+ return 1; -+ } -+ } -+ - if (settings.multi_region) { -- if (optind < argc - 1) { //regions have been specified in the command line -+ if (!has_index_file && optind < argc - 1) { //regions have been specified in the command line - settings.bed = bed_hash_regions(settings.bed, argv, optind+1, argc, &filter_op); //insert(1) or filter out(0) the regions from the command line in the same hash table as the bed file - if (!filter_op) - filter_state = FILTERED; -+ } else if (has_index_file && optind < argc - 2) { -+ settings.bed = bed_hash_regions(settings.bed, argv, optind+2, argc, &filter_op); //insert(1) or filter out(0) the regions from the command line in the same hash table as the bed file -+ if (!filter_op) -+ filter_state = FILTERED; - } else { - bed_unify(settings.bed); - } -@@ -518,7 +636,13 @@ - if (settings.bed == NULL) { // index is unavailable or no regions have been specified - fprintf(stderr, "[main_samview] no regions or BED file have been provided. Aborting.\n"); - } else { -- hts_idx_t *idx = sam_index_load(in, fn_in); // load index -+ hts_idx_t *idx = NULL; -+ // If index filename has not been specfied, look in BAM folder -+ if (fn_idx_in != 0) { -+ idx = sam_index_load2(in, fn_in, fn_idx_in); // load index -+ } else { -+ idx = sam_index_load(in, fn_in); -+ } - if (idx != NULL) { - - int regcount = 0; -@@ -555,7 +679,7 @@ - } - bam_destroy1(b); - } else { -- if (optind + 1 >= argc) { // convert/print the entire file -+ if ((has_index_file && optind >= argc - 2) || (!has_index_file && optind >= argc - 1)) { // convert/print the entire file - bam1_t *b = bam_init1(); - int r; - while ((r = sam_read1(in, header, b)) >= 0) { // read one alignment from `in' -@@ -574,22 +698,25 @@ - } else { // retrieve alignments in specified regions - int i; - bam1_t *b; -- hts_idx_t *idx = sam_index_load(in, fn_in); // load index -+ hts_idx_t *idx = NULL; -+ // If index filename has not been specfied, look in BAM folder -+ if (fn_idx_in != NULL) { -+ idx 
= sam_index_load2(in, fn_in, fn_idx_in); // load index -+ } else { -+ idx = sam_index_load(in, fn_in); -+ } - if (idx == 0) { // index is unavailable - fprintf(stderr, "[main_samview] random alignment retrieval only works for indexed BAM or CRAM files.\n"); - ret = 1; - goto view_end; - } - b = bam_init1(); -- for (i = optind + 1; i < argc; ++i) { -+ -+ for (i = (has_index_file)? optind+2 : optind+1; i < argc; ++i) { - int result; - hts_itr_t *iter = sam_itr_querys(idx, header, argv[i]); // parse a region in the format like `chr2:100-200' - if (iter == NULL) { // region invalid or reference name not found -- int beg, end; -- if (hts_parse_reg(argv[i], &beg, &end)) -- fprintf(stderr, "[main_samview] region \"%s\" specifies an unknown reference name. Continue anyway.\n", argv[i]); -- else -- fprintf(stderr, "[main_samview] region \"%s\" could not be parsed. Continue anyway.\n", argv[i]); -+ fprintf(stderr, "[main_samview] region \"%s\" specifies an invalid region or unknown reference. Continue anyway.\n", argv[i]); - continue; - } - // fetch alignments -@@ -613,6 +740,17 @@ - } - } - -+ if (ga.write_index) { -+ if (sam_idx_save(out) < 0) { -+ print_error_errno("view", "writing index failed"); -+ ret = 1; -+ } -+ if (un_out && sam_idx_save(un_out) < 0) { -+ print_error_errno("view", "writing index failed"); -+ ret = 1; -+ } -+ } -+ - view_end: - if (is_count && ret == 0) { - if (fprintf(fn_out? 
fp_out : stdout, "%" PRId64 "\n", count) < 0) { -@@ -630,7 +768,7 @@ - - free(fn_list); free(fn_out); free(settings.library); free(fn_un_out); - sam_global_args_free(&ga); -- if ( header ) bam_hdr_destroy(header); -+ if ( header ) sam_hdr_destroy(header); - if (settings.bed) bed_destroy(settings.bed); - if (settings.rghash) { - khint_t k; -@@ -638,13 +776,28 @@ - if (kh_exist(settings.rghash, k)) free((char*)kh_key(settings.rghash, k)); - kh_destroy(rg, settings.rghash); - } -+ if (settings.tvhash) { -+ khint_t k; -+ for (k = 0; k < kh_end(settings.tvhash); ++k) -+ if (kh_exist(settings.tvhash, k)) free((char*)kh_key(settings.tvhash, k)); -+ kh_destroy(tv, settings.tvhash); -+ } - if (settings.remove_aux_len) { - free(settings.remove_aux); - } -+ if (settings.tag) { -+ free(settings.tag); -+ } - - if (p.pool) - hts_tpool_destroy(p.pool); - -+ if (fn_out_idx) -+ free(fn_out_idx); -+ if (fn_un_out_idx) -+ free(fn_un_out_idx); -+ free(arg_list); -+ - return ret; - } - -@@ -667,10 +820,16 @@ - " -U FILE output reads not selected by filters to FILE [null]\n" - // extra input - " -t FILE FILE listing reference names and lengths (see long help) [null]\n" -+" -X include customized index file\n" - // read filters - " -L FILE only include reads overlapping this BED FILE [null]\n" - " -r STR only include reads in read group STR [null]\n" - " -R FILE only include reads with read group listed in FILE [null]\n" -+" -d STR:STR\n" -+" only include reads with tag STR and associated value STR [null]\n" -+" -D STR:FILE\n" -+" only include reads with tag STR and associated values listed in\n" -+" FILE [null]\n" - " -q INT only include reads with mapping quality >= INT [0]\n" - " -l STR only include reads in library STR [null]\n" - " -m INT only include reads with number of CIGAR operations consuming\n" -@@ -687,9 +846,10 @@ - " -B collapse the backward CIGAR operation\n" - // general options - " -? 
print long help, including note about region specification\n" --" -S ignored (input format is auto-detected)\n"); -+" -S ignored (input format is auto-detected)\n" -+" --no-PG do not add a PG line\n"); - -- sam_global_opt_help(fp, "-.O.T@"); -+ sam_global_opt_help(fp, "-.O.T@.."); - fprintf(fp, "\n"); - - if (is_long_help) -@@ -747,903 +907,3 @@ - free(argv2); - return ret; - } -- --int8_t seq_comp_table[16] = { 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15 }; --static const char *copied_tags[] = { "RG", "BC", "QT", NULL }; -- --static void bam2fq_usage(FILE *to, const char *command) --{ -- int fq = strcasecmp("fastq", command) == 0 || strcasecmp("bam2fq", command) == 0; -- fprintf(to, --"Usage: samtools %s [options...] \n", command); -- fprintf(to, --"Options:\n" --" -0 FILE write reads designated READ_OTHER to FILE\n" --" -1 FILE write reads designated READ1 to FILE\n" --" -2 FILE write reads designated READ2 to FILE\n" --" note: if a singleton file is specified with -s, only\n" --" paired reads will be written to the -1 and -2 files.\n" --" -f INT only include reads with all of the FLAGs in INT present [0]\n" // F&x == x --" -F INT only include reads with none of the FLAGS in INT present [0]\n" // F&x == 0 --" -G INT only EXCLUDE reads with all of the FLAGs in INT present [0]\n" // !(F&x == x) --" -n don't append /1 and /2 to the read name\n" --" -N always append /1 and /2 to the read name\n"); -- if (fq) fprintf(to, --" -O output quality in the OQ tag if present\n"); -- fprintf(to, --" -s FILE write singleton reads designated READ1 or READ2 to FILE\n" --" -t copy RG, BC and QT tags to the %s header line\n", -- fq ? "FASTQ" : "FASTA"); -- fprintf(to, --" -T TAGLIST copy arbitrary tags to the %s header line\n", -- fq ? 
"FASTQ" : "FASTA"); -- if (fq) fprintf(to, --" -v INT default quality score if not given in file [1]\n" --" -i add Illumina Casava 1.8 format entry to header (eg 1:N:0:ATCACG)\n" --" -c compression level [0..9] to use when creating gz or bgzf fastq files\n" --" --i1 FILE write first index reads to FILE\n" --" --i2 FILE write second index reads to FILE\n" --" --barcode-tag TAG Barcode tag [default: " DEFAULT_BARCODE_TAG "]\n" --" --quality-tag TAG Quality tag [default: " DEFAULT_QUALITY_TAG "]\n" --" --index-format STR How to parse barcode and quality tags\n\n"); -- sam_global_opt_help(to, "-.--.@"); -- fprintf(to, --"\n" --"Reads are designated READ1 if FLAG READ1 is set and READ2 is not set.\n" --"Reads are designated READ2 if FLAG READ1 is not set and READ2 is set.\n" --"Reads are designated READ_OTHER if FLAGs READ1 and READ2 are either both set\n" --"or both unset.\n" --"Run 'samtools flags' for more information on flag codes and meanings.\n"); -- fprintf(to, --"\n" --"The index-format string describes how to parse the barcode and quality tags, for example:\n" --" i14i8 the first 14 characters are index 1, the next 8 characters are index 2\n" --" n8i14 ignore the first 8 characters, and use the next 14 characters for index 1\n" --"If the tag contains a separator, then the numeric part can be replaced with '*' to mean\n" --"'read until the separator or end of tag', for example:\n" --" n*i* ignore the left part of the tag until the separator, then use the second part\n" --" of the tag as index 1\n"); -- fprintf(to, --"\n" --"Examples:\n" --" To get just the paired reads in separate files, use:\n" --" samtools %s -1 paired1.%s -2 paired2.%s -0 /dev/null -s /dev/null -n -F 0x900 in.bam\n" --"\n To get all non-supplementary/secondary reads in a single file, redirect the output:\n" --" samtools %s -F 0x900 in.bam > all_reads.%s\n", -- command, fq ? "fq" : "fa", fq ? "fq" : "fa", -- command, fq ? 
"fq" : "fa"); --} -- --typedef enum { READ_UNKNOWN = 0, READ_1 = 1, READ_2 = 2 } readpart; --typedef enum { FASTA, FASTQ } fastfile; --typedef struct bam2fq_opts { -- char *fnse; -- char *fnr[3]; -- char *fn_input; // pointer to input filename in argv do not free -- bool has12, has12always, use_oq, copy_tags, illumina_tag; -- int flag_on, flag_off, flag_alloff; -- sam_global_args ga; -- fastfile filetype; -- int def_qual; -- char *barcode_tag; -- char *quality_tag; -- char *index_file[2]; -- char *index_format; -- char *extra_tags; -- char compression_level; --} bam2fq_opts_t; -- --typedef struct bam2fq_state { -- samFile *fp; -- BGZF *fpse; -- BGZF *fpr[3]; -- BGZF *fpi[2]; -- BGZF *hstdout; -- bam_hdr_t *h; -- bool has12, use_oq, copy_tags, illumina_tag; -- int flag_on, flag_off, flag_alloff; -- fastfile filetype; -- int def_qual; -- klist_t(ktaglist) *taglist; -- char *index_sequence; -- char compression_level; --} bam2fq_state_t; -- --/* -- * Get and decode the read from a BAM record. -- * -- * TODO: htslib really needs an interface for this. Consider this or perhaps -- * bam_get_seq_str (current vs original orientation) and bam_get_qual_str -- * functions as string formatted equivalents to bam_get_{seq,qual}? -- */ -- --/* -- * Reverse a string in place. -- * From http://stackoverflow.com/questions/8534274/is-the-strrev-function-not-available-in-linux. 
-- * Author Sumit-naik: http://stackoverflow.com/users/4590926/sumit-naik -- */ --static char *reverse(char *str) --{ -- int i = strlen(str)-1,j=0; -- char ch; -- while (i>j) { -- ch = str[i]; -- str[i]= str[j]; -- str[j] = ch; -- i--; -- j++; -- } -- return str; --} -- --/* return the read, reverse complemented if necessary */ --static char *get_read(const bam1_t *rec) --{ -- int len = rec->core.l_qseq + 1; -- char *read = calloc(1, len); -- char *seq = (char *)bam_get_seq(rec); -- int n; -- -- if (!read) return NULL; -- -- for (n=0; n < rec->core.l_qseq; n++) { -- if (rec->core.flag & BAM_FREVERSE) read[n] = seq_nt16_str[seq_comp_table[bam_seqi(seq,n)]]; -- else read[n] = seq_nt16_str[bam_seqi(seq,n)]; -- } -- if (rec->core.flag & BAM_FREVERSE) reverse(read); -- return read; --} -- --/* -- * get and decode the quality from a BAM record -- */ --static int get_quality(const bam1_t *rec, char **qual_out) --{ -- char *quality = calloc(1, rec->core.l_qseq + 1); -- char *q = (char *)bam_get_qual(rec); -- int n; -- -- if (!quality) return -1; -- -- if (*q == '\xff') { -- free(quality); -- *qual_out = NULL; -- return 0; -- } -- -- for (n=0; n < rec->core.l_qseq; n++) { -- quality[n] = q[n]+33; -- } -- if (rec->core.flag & BAM_FREVERSE) reverse(quality); -- *qual_out = quality; -- return 0; --} -- --// --// End of htslib complaints --// -- -- --static readpart which_readpart(const bam1_t *b) --{ -- if ((b->core.flag & BAM_FREAD1) && !(b->core.flag & BAM_FREAD2)) { -- return READ_1; -- } else if ((b->core.flag & BAM_FREAD2) && !(b->core.flag & BAM_FREAD1)) { -- return READ_2; -- } else { -- return READ_UNKNOWN; -- } --} -- --/* -- * parse the length part from the index-format string -- */ --static int getLength(char **s) --{ -- int n = 0; -- while (**s) { -- if (**s == '*') { n=-1; (*s)++; break; } -- if ( !isdigit(**s)) break; -- n = n*10 + ((**s)-'0'); -- (*s)++; -- } -- return n; --} -- --static bool copy_tag(const char *tag, const bam1_t *rec, kstring_t *linebuf) --{ 
-- uint8_t *s = bam_aux_get(rec, tag); -- if (s) { -- char aux_type = *s; -- switch (aux_type) { -- case 'C': -- case 'S': aux_type = 'I'; break; -- case 'c': -- case 's': aux_type = 'i'; break; -- case 'd': aux_type = 'f'; break; -- } -- -- // Ensure space. Need 6 chars + length of tag. Max length of -- // i is 16, A is 21, B currently 26, Z is unknown, so -- // have to check that one later. -- if (ks_resize(linebuf, ks_len(linebuf) + 64) < 0) return false; -- -- kputc('\t', linebuf); -- kputsn(tag, 2, linebuf); -- kputc(':', linebuf); -- kputc(aux_type=='I'? 'i': aux_type, linebuf); -- kputc(':', linebuf); -- switch (aux_type) { -- case 'H': -- case 'Z': -- if (kputs(bam_aux2Z(s), linebuf) < 0) return false; -- break; -- case 'i': kputw(bam_aux2i(s), linebuf); break; -- case 'I': kputuw(bam_aux2i(s), linebuf); break; -- case 'A': kputc(bam_aux2A(s), linebuf); break; -- case 'f': kputd(bam_aux2f(s), linebuf); break; -- case 'B': kputs("*** Unhandled aux type ***", linebuf); return false; -- default: kputs("*** Unknown aux type ***", linebuf); return false; -- } -- } -- return true; --} -- --static int insert_index_sequence_into_linebuf(char *index_sequence, kstring_t *linebuf, bam1_t *rec) --{ -- if (!index_sequence) return 0; -- -- kstring_t new = {0,0,NULL}; -- if (linebuf->s) { -- char *s = strchr(linebuf->s, '\n'); -- if (s) { -- if (ks_resize(&new, linebuf->l + strlen(index_sequence) + 16) < 0) -- return -1; -- *s = 0; -- kputs(linebuf->s, &new); -- kputc(' ', &new); -- readpart readpart = which_readpart(rec); -- if (readpart == READ_1) kputc('1', &new); -- else if (readpart == READ_2) kputc('2', &new); -- else kputc('0', &new); -- -- kputc(':', &new); -- if (rec->core.flag & BAM_FQCFAIL) kputc('Y', &new); -- else kputc('N', &new); -- -- kputs(":0:", &new); -- kputs(index_sequence, &new); -- kputc('\n', &new); -- kputs(s+1, &new); -- free(ks_release(linebuf)); -- linebuf->s = new.s; linebuf->l = new.l; linebuf->m = new.m; -- } -- } -- return 0; --} -- 
--static bool make_fq_line(const bam1_t *rec, char *seq, char *qual, kstring_t *linebuf, const bam2fq_state_t *state) --{ -- int i; -- -- linebuf->l = 0; -- // Write read name -- if (kputc(state->filetype == FASTA? '>' : '@', linebuf) < 0) return false; -- if (kputs(bam_get_qname(rec), linebuf) < 0) return false; -- // Add the /1 /2 if requested -- if (state->has12) { -- readpart readpart = which_readpart(rec); -- if (readpart == READ_1) { -- if (kputs("/1", linebuf) < 0) return false; -- } else if (readpart == READ_2) { -- if (kputs("/2", linebuf) < 0) return false; -- } -- } -- if (state->copy_tags) { -- for (i = 0; copied_tags[i]; ++i) { -- if (!copy_tag(copied_tags[i], rec, linebuf)) { -- fprintf(stderr, "Problem copying aux tags: [%s]\n", linebuf->s); -- return false; -- } -- } -- } -- -- if (state->taglist->size) { -- kliter_t(ktaglist) *p; -- for (p = kl_begin(state->taglist); p != kl_end(state->taglist); p = kl_next(p)) { -- if (!copy_tag(kl_val(p), rec, linebuf)) { -- fprintf(stderr, "Problem copying aux tags: [%s]\n", linebuf->s); -- return false; -- } -- } -- } -- -- if (kputc('\n', linebuf) < 0) return false; -- if (kputs(seq, linebuf) < 0) return false; -- if (kputc('\n', linebuf) < 0) return false; -- -- if (state->filetype == FASTQ) { -- // Write quality -- if (kputs("+\n", linebuf) < 0) return false; -- if (qual && *qual) { -- if (kputs(qual, linebuf) < 0) return false; -- } else { -- int len = strlen(seq); -- if (ks_resize(linebuf, ks_len(linebuf) + len + 1) < 0) return false; -- for (i = 0; i < len; ++i) { -- kputc(33 + state->def_qual, linebuf); -- } -- } -- if (kputc('\n', linebuf) < 0) return false; -- } -- return true; --} -- --/* -- * Create FASTQ lines from the barcode tag using the index-format -- */ --static bool tags2fq(bam1_t *rec, bam2fq_state_t *state, const bam2fq_opts_t* opts) --{ -- uint8_t *p; -- char *ifmt = opts->index_format; -- char *tag = NULL; -- char *qual = NULL; -- char *sub_tag = NULL; -- char *sub_qual = NULL; -- size_t 
tag_len; -- int file_number = 0; -- kstring_t linebuf = { 0, 0, NULL }; // Buffer -- -- -- // read barcode tag -- p = bam_aux_get(rec,opts->barcode_tag); -- if (p) tag = bam_aux2Z(p); -- -- if (!tag) return true; // there is no tag -- -- tag_len = strlen(tag); -- sub_tag = calloc(1, tag_len + 1); -- if (!sub_tag) goto fail; -- sub_qual = calloc(1, tag_len + 1); -- if (!sub_qual) goto fail; -- -- // read quality tag -- p = bam_aux_get(rec, opts->quality_tag); -- if (p) qual = bam_aux2Z(p); -- -- // Parse the index-format string -- while (*ifmt) { -- if (file_number > 1) break; // shouldn't happen if we've validated paramaters correctly -- char action = *ifmt; // should be 'i' or 'n' -- ifmt++; // skip over action -- int index_len = getLength(&ifmt); -- int n = 0; -- -- if (index_len < 0) { -- // read until separator -- while (isalpha(*tag)) { -- sub_tag[n] = *tag++; -- if (qual) sub_qual[n] = *qual++; -- n++; -- } -- if (*tag) { // skip separator -- tag++; -- if (qual) qual++; -- } -- } else { -- // read index_len characters -- while (index_len-- && *tag) { -- sub_tag[n] = *tag++; -- if (qual) sub_qual[n] = *qual++; -- n++; -- } -- } -- sub_tag[n] = '\0'; -- sub_qual[n] = '\0'; -- -- if (action=='i' && *sub_tag && state->fpi[file_number]) { -- //if (file_number==0) state->index_sequence = strdup(sub_tag); // we're going to need this later... -- state->index_sequence = strdup(sub_tag); // we're going to need this later... 
-- if (!state->index_sequence) goto fail; -- if (!make_fq_line(rec, sub_tag, sub_qual, &linebuf, state)) goto fail; -- if (state->illumina_tag) { -- if (insert_index_sequence_into_linebuf(state->index_sequence, &linebuf, rec) < 0) { -- goto fail; -- } -- } -- if (bgzf_write(state->fpi[file_number++], linebuf.s, linebuf.l) < 0) -- goto fail; -- } -- -- } -- -- free(sub_qual); free(sub_tag); -- free(linebuf.s); -- return true; -- -- fail: -- perror(__func__); -- free(sub_qual); free(sub_tag); -- free(linebuf.s); -- return true; --} -- --// Transform a bam1_t record into a string with the FASTQ representation of it --// @returns false for error, true for success --static bool bam1_to_fq(const bam1_t *b, kstring_t *linebuf, const bam2fq_state_t *state) --{ -- int32_t qlen = b->core.l_qseq; -- assert(qlen >= 0); -- const uint8_t *oq = NULL; -- char *qual = NULL; -- -- char *seq = get_read(b); -- if (!seq) return false; -- -- if (state->use_oq) oq = bam_aux_get(b, "OQ"); -- if (oq && *oq=='Z') { -- qual = strdup(bam_aux2Z(oq)); -- if (!qual) goto fail; -- if (b->core.flag & BAM_FREVERSE) { // read is reverse complemented -- reverse(qual); -- } -- } else { -- if (get_quality(b, &qual) < 0) goto fail; -- } -- -- if (!make_fq_line(b, seq, qual, linebuf, state)) goto fail; -- -- free(qual); -- free(seq); -- return true; -- -- fail: -- free(seq); -- free(qual); -- return false; --} -- --static void free_opts(bam2fq_opts_t *opts) --{ -- free(opts->barcode_tag); -- free(opts->quality_tag); -- free(opts->index_format); -- free(opts->extra_tags); -- free(opts); --} -- --// return true if valid --static bool parse_opts(int argc, char *argv[], bam2fq_opts_t** opts_out) --{ -- // Parse args -- bam2fq_opts_t* opts = calloc(1, sizeof(bam2fq_opts_t)); -- opts->has12 = true; -- opts->has12always = false; -- opts->filetype = FASTQ; -- opts->def_qual = 1; -- opts->barcode_tag = NULL; -- opts->quality_tag = NULL; -- opts->index_format = NULL; -- opts->index_file[0] = NULL; -- 
opts->index_file[1] = NULL; -- opts->extra_tags = NULL; -- opts->compression_level = 1; -- -- int c; -- sam_global_args_init(&opts->ga); -- static const struct option lopts[] = { -- SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0, '@'), -- {"i1", required_argument, NULL, 1}, -- {"I1", required_argument, NULL, 1}, -- {"i2", required_argument, NULL, 2}, -- {"I2", required_argument, NULL, 2}, -- {"if", required_argument, NULL, 3}, -- {"IF", required_argument, NULL, 3}, -- {"index-format", required_argument, NULL, 3}, -- {"barcode-tag", required_argument, NULL, 'b'}, -- {"quality-tag", required_argument, NULL, 'q'}, -- { NULL, 0, NULL, 0 } -- }; -- while ((c = getopt_long(argc, argv, "0:1:2:f:F:G:niNOs:c:tT:v:@:", lopts, NULL)) > 0) { -- switch (c) { -- case 'b': opts->barcode_tag = strdup(optarg); break; -- case 'q': opts->quality_tag = strdup(optarg); break; -- case 1 : opts->index_file[0] = optarg; break; -- case 2 : opts->index_file[1] = optarg; break; -- case 3 : opts->index_format = strdup(optarg); break; -- case '0': opts->fnr[0] = optarg; break; -- case '1': opts->fnr[1] = optarg; break; -- case '2': opts->fnr[2] = optarg; break; -- case 'f': opts->flag_on |= strtol(optarg, 0, 0); break; -- case 'F': opts->flag_off |= strtol(optarg, 0, 0); break; -- case 'G': opts->flag_alloff |= strtol(optarg, 0, 0); break; -- case 'n': opts->has12 = false; break; -- case 'N': opts->has12always = true; break; -- case 'O': opts->use_oq = true; break; -- case 's': opts->fnse = optarg; break; -- case 't': opts->copy_tags = true; break; -- case 'i': opts->illumina_tag = true; break; -- case 'c': opts->compression_level = atoi(optarg); break; -- case 'T': opts->extra_tags = strdup(optarg); break; -- case 'v': opts->def_qual = atoi(optarg); break; -- case '?': bam2fq_usage(stderr, argv[0]); free_opts(opts); return false; -- default: -- if (parse_sam_global_opt(c, optarg, lopts, &opts->ga) != 0) { -- bam2fq_usage(stderr, argv[0]); free_opts(opts); return false; -- } -- break; -- } -- } -- 
-- if (opts->fnr[1] || opts->fnr[2]) opts->has12 = false; -- if (opts->has12always) opts->has12 = true; -- -- if (!opts->barcode_tag) opts->barcode_tag = strdup(DEFAULT_BARCODE_TAG); -- if (!opts->quality_tag) opts->quality_tag = strdup(DEFAULT_QUALITY_TAG); -- -- int nIndex = 0; -- if (opts->index_format) { -- char *s; -- for (s = opts->index_format; *s; s++) { -- if (*s == 'i') nIndex++; -- } -- } -- if (nIndex>2) { -- fprintf(stderr,"Invalid index format: more than 2 indexes\n"); -- bam2fq_usage(stderr, argv[0]); -- free_opts(opts); -- return false; -- } -- -- if (opts->index_file[1] && !opts->index_file[0]) { -- fprintf(stderr, "Index one specified, but index two not given\n"); -- bam2fq_usage(stderr, argv[0]); -- free_opts(opts); -- return false; -- } -- -- if (nIndex==2 && !opts->index_file[1]) { -- fprintf(stderr, "index_format specifies two indexes, but only one index file given\n"); -- bam2fq_usage(stderr, argv[0]); -- free_opts(opts); -- return false; -- } -- -- if (nIndex==1 && !opts->index_file[0]) { -- fprintf(stderr, "index_format specifies an index, but no index file given\n"); -- bam2fq_usage(stderr, argv[0]); -- free_opts(opts); -- return false; -- } -- -- if (nIndex==0 && opts->index_file[0]) { -- fprintf(stderr, "index_format not specified, but index file given\n"); -- bam2fq_usage(stderr, argv[0]); -- free_opts(opts); -- return false; -- } -- -- if (opts->def_qual < 0 || 93 < opts->def_qual) { -- fprintf(stderr, "Invalid -v default quality %i, allowed range 0 to 93\n", opts->def_qual); -- bam2fq_usage(stderr, argv[0]); -- free_opts(opts); -- return false; -- } -- -- const char* type_str = argv[0]; -- if (strcasecmp("fastq", type_str) == 0 || strcasecmp("bam2fq", type_str) == 0) { -- opts->filetype = FASTQ; -- } else if (strcasecmp("fasta", type_str) == 0) { -- opts->filetype = FASTA; -- } else { -- print_error("bam2fq", "Unrecognised type call \"%s\", this should be impossible... 
but you managed it!", type_str); -- bam2fq_usage(stderr, argv[0]); -- free_opts(opts); -- return false; -- } -- -- if ((argc - (optind)) == 0) { -- fprintf(stderr, "No input file specified.\n"); -- bam2fq_usage(stdout, argv[0]); -- free_opts(opts); -- return false; -- } -- -- if ((argc - (optind)) != 1) { -- fprintf(stderr, "Too many arguments.\n"); -- bam2fq_usage(stderr, argv[0]); -- free_opts(opts); -- return false; -- } -- opts->fn_input = argv[optind]; -- *opts_out = opts; -- return true; --} -- --static BGZF *open_fqfile(char *filename, int c) --{ -- char mode[4] = "w"; -- size_t len = strlen(filename); -- -- mode[2] = 0; mode[3] = 0; -- if (len > 3 && strstr(filename + (len - 3),".gz")) { -- mode[1] = 'g'; mode[2] = c+'0'; -- } else if ((len > 4 && strstr(filename + (len - 4),".bgz")) -- || (len > 5 && strstr(filename + (len - 5),".bgzf"))) { -- mode[1] = c+'0'; -- } else { -- mode[1] = 'u'; -- } -- -- return bgzf_open(filename,mode); --} -- --static bool init_state(const bam2fq_opts_t* opts, bam2fq_state_t** state_out) --{ -- bam2fq_state_t* state = calloc(1, sizeof(bam2fq_state_t)); -- state->flag_on = opts->flag_on; -- state->flag_off = opts->flag_off; -- state->flag_alloff = opts->flag_alloff; -- state->has12 = opts->has12; -- state->use_oq = opts->use_oq; -- state->illumina_tag = opts->illumina_tag; -- state->copy_tags = opts->copy_tags; -- state->filetype = opts->filetype; -- state->def_qual = opts->def_qual; -- state->index_sequence = NULL; -- state->hstdout = NULL; -- state->compression_level = opts->compression_level; -- -- state->taglist = kl_init(ktaglist); -- if (opts->extra_tags) { -- char *save_p; -- char *s = strtok_r(opts->extra_tags, ",", &save_p); -- while (s) { -- if (strlen(s) != 2) { -- fprintf(stderr, "Parsing extra tags - '%s' is not two characters\n", s); -- free(state); -- return false; -- } -- char **et = kl_pushp(ktaglist, state->taglist); -- *et = s; -- s = strtok_r(NULL, ",", &save_p); -- } -- } -- -- state->fp = 
sam_open(opts->fn_input, "r"); -- if (state->fp == NULL) { -- print_error_errno("bam2fq","Cannot read file \"%s\"", opts->fn_input); -- free(state); -- return false; -- } -- if (opts->ga.nthreads > 0) -- hts_set_threads(state->fp, opts->ga.nthreads); -- uint32_t rf = SAM_QNAME | SAM_FLAG | SAM_SEQ | SAM_QUAL; -- if (opts->use_oq || opts->extra_tags || opts->index_file[0]) rf |= SAM_AUX; -- if (hts_set_opt(state->fp, CRAM_OPT_REQUIRED_FIELDS, rf)) { -- fprintf(stderr, "Failed to set CRAM_OPT_REQUIRED_FIELDS value\n"); -- free(state); -- return false; -- } -- if (hts_set_opt(state->fp, CRAM_OPT_DECODE_MD, 0)) { -- fprintf(stderr, "Failed to set CRAM_OPT_DECODE_MD value\n"); -- free(state); -- return false; -- } -- if (opts->fnse) { -- state->fpse = open_fqfile(opts->fnse, state->compression_level); -- if (state->fpse == NULL) { -- print_error_errno("bam2fq", "Cannot write to singleton file \"%s\"", opts->fnse); -- free(state); -- return false; -- } -- } -- -- if (opts->ga.reference) { -- if (hts_set_fai_filename(state->fp, opts->ga.reference) != 0) { -- print_error_errno("bam2fq", "cannot load reference \"%s\"", opts->ga.reference); -- free(state); -- return false; -- } -- } -- -- int i; -- for (i = 0; i < 3; ++i) { -- if (opts->fnr[i]) { -- state->fpr[i] = open_fqfile(opts->fnr[i], state->compression_level); -- if (state->fpr[i] == NULL) { -- print_error_errno("bam2fq", "Cannot write to r%d file \"%s\"", i, opts->fnr[i]); -- free(state); -- return false; -- } -- } else { -- if (!state->hstdout) { -- state->hstdout = bgzf_dopen(fileno(stdout), "wu"); -- if (!state->hstdout) { -- print_error_errno("bam2fq", "Cannot open STDOUT"); -- free(state); -- return false; -- } -- } -- state->fpr[i] = state->hstdout; -- } -- } -- for (i = 0; i < 2; i++) { -- state->fpi[i] = NULL; -- if (opts->index_file[i]) { -- state->fpi[i] = open_fqfile(opts->index_file[i], state->compression_level); -- if (state->fpi[i] == NULL) { -- print_error_errno("bam2fq", "Cannot write to i%d file 
\"%s\"", i+1, opts->index_file[i]); -- free(state); -- return false; -- } -- } -- } -- -- state->h = sam_hdr_read(state->fp); -- if (state->h == NULL) { -- fprintf(stderr, "Failed to read header for \"%s\"\n", opts->fn_input); -- free(state); -- return false; -- } -- -- *state_out = state; -- return true; --} -- --static bool destroy_state(const bam2fq_opts_t *opts, bam2fq_state_t *state, int* status) --{ -- bool valid = true; -- bam_hdr_destroy(state->h); -- check_sam_close("bam2fq", state->fp, opts->fn_input, "file", status); -- if (state->fpse && bgzf_close(state->fpse)) { print_error_errno("bam2fq", "Error closing singleton file \"%s\"", opts->fnse); valid = false; } -- int i; -- for (i = 0; i < 3; ++i) { -- if (state->fpr[i] != state->hstdout) { -- if (bgzf_close(state->fpr[i])) { print_error_errno("bam2fq", "Error closing r%d file \"%s\"", i, opts->fnr[i]); valid = false; } -- } -- } -- if (state->hstdout) { -- if (bgzf_close(state->hstdout)) { -- print_error_errno("bam2fq", "Error closing STDOUT"); -- valid = false; -- } -- } -- for (i = 0; i < 2; i++) { -- if (state->fpi[i] && bgzf_close(state->fpi[i])) { -- print_error_errno("bam2fq", "Error closing i%d file \"%s\"", i+1, opts->index_file[i]); -- valid = false; -- } -- } -- kl_destroy(ktaglist,state->taglist); -- free(state->index_sequence); -- free(state); -- return valid; --} -- --static inline bool filter_it_out(const bam1_t *b, const bam2fq_state_t *state) --{ -- return (b->core.flag&(BAM_FSECONDARY|BAM_FSUPPLEMENTARY) // skip secondary and supplementary alignments -- || (b->core.flag&(state->flag_on)) != state->flag_on // or reads indicated by filter flags -- || (b->core.flag&(state->flag_off)) != 0 -- || (b->core.flag&(state->flag_alloff) && (b->core.flag&(state->flag_alloff)) == state->flag_alloff)); -- --} -- --static bool bam2fq_mainloop(bam2fq_state_t *state, bam2fq_opts_t* opts) --{ -- int n; -- bam1_t *records[3]; -- bam1_t* b = bam_init1(); -- char *current_qname = NULL; -- int64_t n_reads = 
0, n_singletons = 0; // Statistics -- kstring_t linebuf[3] = {{0,0,NULL},{0,0,NULL},{0,0,NULL}}; -- int score[3]; -- int at_eof; -- if (b == NULL ) { -- perror("[bam2fq_mainloop] Malloc error for bam record buffer."); -- return false; -- } -- -- bool valid = true; -- while (true) { -- int res = sam_read1(state->fp, state->h, b); -- if (res < -1) { -- fprintf(stderr, "[bam2fq_mainloop] Failed to read bam record.\n"); -- return false; -- } -- at_eof = res < 0; -- -- if (!at_eof && filter_it_out(b, state)) continue; -- if (!at_eof) ++n_reads; -- -- if (at_eof || !current_qname || (strcmp(current_qname, bam_get_qname(b)) != 0)) { -- if (current_qname) { -- if (state->illumina_tag) { -- for (n=0; valid && n<3; n++) { -- if (insert_index_sequence_into_linebuf(state->index_sequence, &linebuf[n], records[n]) < 0) valid = false; -- } -- if (!valid) break; -- } -- free(state->index_sequence); state->index_sequence = NULL; -- if (score[1] > 0 && score[2] > 0) { -- // print linebuf[1] to fpr[1], linebuf[2] to fpr[2] -- if (bgzf_write(state->fpr[1], linebuf[1].s, linebuf[1].l) < 0) { valid = false; break; } -- if (bgzf_write(state->fpr[2], linebuf[2].s, linebuf[2].l) < 0) { valid = false; break; } -- } else if (score[1] > 0 || score[2] > 0) { -- if (state->fpse) { -- // print whichever one exists to fpse -- if (score[1] > 0) { -- if (bgzf_write(state->fpse, linebuf[1].s, linebuf[1].l) < 0) { valid = false; break; } -- } else { -- if (bgzf_write(state->fpse, linebuf[2].s, linebuf[2].l) < 0) { valid = false; break; } -- } -- ++n_singletons; -- } else { -- if (score[1] > 0) { -- if (bgzf_write(state->fpr[1], linebuf[1].s, linebuf[1].l) < 0) { valid = false; break; } -- } else { -- if (bgzf_write(state->fpr[2], linebuf[2].s, linebuf[2].l) < 0) { valid = false; break; } -- } -- } -- } -- if (score[0]) { // TODO: check this -- // print linebuf[0] to fpr[0] -- if (bgzf_write(state->fpr[0], linebuf[0].s, linebuf[0].l) < 0) { valid = false; break; } -- } -- } -- -- if (at_eof) break; -- 
-- free(current_qname); -- current_qname = strdup(bam_get_qname(b)); -- if (!current_qname) { valid = false; break; } -- score[0] = score[1] = score[2] = 0; -- } -- -- // Prefer a copy of the read that has base qualities -- int b_score = bam_get_qual(b)[0] != 0xff? 2 : 1; -- if (b_score > score[which_readpart(b)]) { -- if (state->fpi[0]) if (!tags2fq(b, state, opts)) return false; -- records[which_readpart(b)] = b; -- if(!bam1_to_fq(b, &linebuf[which_readpart(b)], state)) { -- fprintf(stderr, "[%s] Error converting read to FASTA/Q\n", __func__); -- return false; -- } -- score[which_readpart(b)] = b_score; -- } -- } -- if (!valid) -- { -- perror("[bam2fq_mainloop] Error writing to FASTx files."); -- } -- bam_destroy1(b); -- free(current_qname); -- free(linebuf[0].s); -- free(linebuf[1].s); -- free(linebuf[2].s); -- fprintf(stderr, "[M::%s] discarded %" PRId64 " singletons\n", __func__, n_singletons); -- fprintf(stderr, "[M::%s] processed %" PRId64 " reads\n", __func__, n_reads); -- -- return valid; --} -- --int main_bam2fq(int argc, char *argv[]) --{ -- int status = EXIT_SUCCESS; -- bam2fq_opts_t* opts = NULL; -- bam2fq_state_t* state = NULL; -- -- bool valid = parse_opts(argc, argv, &opts); -- if (!valid || opts == NULL) return valid ? EXIT_SUCCESS : EXIT_FAILURE; -- -- if (!init_state(opts, &state)) return EXIT_FAILURE; -- -- if (!bam2fq_mainloop(state,opts)) status = EXIT_FAILURE; -- -- if (!destroy_state(opts, state, &status)) return EXIT_FAILURE; -- sam_global_args_free(&opts->ga); -- free_opts(opts); -- -- return status; --} ---- python-pysam.orig/samtools/sam_view.c.pysam.c -+++ python-pysam/samtools/sam_view.c.pysam.c -@@ -2,7 +2,7 @@ - - /* sam_view.c -- SAM<->BAM<->CRAM conversion. - -- Copyright (C) 2009-2017 Genome Research Ltd. -+ Copyright (C) 2009-2019 Genome Research Ltd. - Portions copyright (C) 2009, 2011, 2012 Broad Institute. 
- - Author: Heng Li -@@ -34,33 +34,25 @@ - #include - #include - #include --#include --#include - #include --#include - #include "htslib/sam.h" - #include "htslib/faidx.h" --#include "htslib/kstring.h" - #include "htslib/khash.h" --#include "htslib/klist.h" - #include "htslib/thread_pool.h" --#include "htslib/bgzf.h" - #include "samtools.h" - #include "sam_opts.h" - #include "bedidx.h" - --#define DEFAULT_BARCODE_TAG "BC" --#define DEFAULT_QUALITY_TAG "QT" -- - KHASH_SET_INIT_STR(rg) --#define taglist_free(p) --KLIST_INIT(ktaglist, char*, taglist_free) -+KHASH_SET_INIT_STR(tv) - - typedef khash_t(rg) *rghash_t; -+typedef khash_t(tv) *tvhash_t; - - // This structure contains the settings for a samview run - typedef struct samview_settings { - rghash_t rghash; -+ tvhash_t tvhash; - int min_mapQ; - int flag_on; - int flag_off; -@@ -74,16 +66,17 @@ - size_t remove_aux_len; - char** remove_aux; - int multi_region; -+ char* tag; - } samview_settings_t; - - - // TODO Add declarations of these to a viable htslib or samtools header --extern const char *bam_get_library(bam_hdr_t *header, const bam1_t *b); -+extern const char *bam_get_library(sam_hdr_t *header, const bam1_t *b); - extern int bam_remove_B(bam1_t *b); - extern char *samfaipath(const char *fn_ref); - - // Returns 0 to indicate read should be output 1 otherwise --static int process_aln(const bam_hdr_t *h, bam1_t *b, samview_settings_t* settings) -+static int process_aln(const sam_hdr_t *h, bam1_t *b, samview_settings_t* settings) - { - if (settings->remove_B) bam_remove_B(b); - if (settings->min_qlen > 0) { -@@ -98,7 +91,7 @@ - return 1; - if (settings->flag_alloff && ((b->core.flag & settings->flag_alloff) == settings->flag_alloff)) - return 1; -- if (!settings->multi_region && settings->bed && (b->core.tid < 0 || !bed_overlap(settings->bed, h->target_name[b->core.tid], b->core.pos, bam_endpos(b)))) -+ if (!settings->multi_region && settings->bed && (b->core.tid < 0 || !bed_overlap(settings->bed, 
sam_hdr_tid2name(h, b->core.tid), b->core.pos, bam_endpos(b)))) - return 1; - if (settings->subsam_frac > 0.) { - uint32_t k = __ac_Wang_hash(__ac_X31_hash_string(bam_get_qname(b)) ^ settings->subsam_seed); -@@ -111,8 +104,17 @@ - if (k == kh_end(settings->rghash)) return 1; - } - } -+ if (settings->tvhash && settings->tag) { -+ uint8_t *s = bam_aux_get(b, settings->tag); -+ if (s) { -+ khint_t k = kh_get(tv, settings->tvhash, (char*)(s + 1)); -+ if (k == kh_end(settings->tvhash)) return 1; -+ } else { -+ return 1; -+ } -+ } - if (settings->library) { -- const char *p = bam_get_library((bam_hdr_t*)h, b); -+ const char *p = bam_get_library((sam_hdr_t*)h, b); - if (!p || strcmp(p, settings->library) != 0) return 1; - } - if (settings->remove_aux_len) { -@@ -127,37 +129,6 @@ - return 0; - } - --static char *drop_rg(char *hdtxt, rghash_t h, int *len) --{ -- char *p = hdtxt, *q, *r, *s; -- kstring_t str; -- memset(&str, 0, sizeof(kstring_t)); -- while (1) { -- int toprint = 0; -- q = strchr(p, '\n'); -- if (q == 0) q = p + strlen(p); -- if (q - p < 3) break; // the line is too short; then stop -- if (strncmp(p, "@RG\t", 4) == 0) { -- int c; -- khint_t k; -- if ((r = strstr(p, "\tID:")) != 0) { -- r += 4; -- for (s = r; *s != '\0' && *s != '\n' && *s != '\t'; ++s); -- c = *s; *s = '\0'; -- k = kh_get(rg, h, r); -- *s = c; -- if (k != kh_end(h)) toprint = 1; -- } -- } else toprint = 1; -- if (toprint) { -- kputsn(p, q - p, &str); kputc('\n', &str); -- } -- p = q + 1; -- } -- *len = str.l; -- return str.s; --} -- - static int usage(FILE *fp, int exit_status, int is_long_help); - - static int add_read_group_single(const char *subcmd, samview_settings_t *settings, char *name) -@@ -219,39 +190,87 @@ - return (ret != -1) ? 
0 : -1; - } - --static inline int check_sam_write1(samFile *fp, const bam_hdr_t *h, const bam1_t *b, const char *fname, int *retp) -+static int add_tag_value_single(const char *subcmd, samview_settings_t *settings, char *name) - { -- int r = sam_write1(fp, h, b); -- if (r >= 0) return r; -+ char *d = strdup(name); -+ int ret = 0; - -- if (fname) print_error_errno("view", "writing to \"%s\" failed", fname); -- else print_error_errno("view", "writing to standard output failed"); -+ if (d == NULL) goto err; - -- *retp = EXIT_FAILURE; -- return r; -+ if (settings->tvhash == NULL) { -+ settings->tvhash = kh_init(tv); -+ if (settings->tvhash == NULL) goto err; -+ } -+ -+ kh_put(tv, settings->tvhash, d, &ret); -+ if (ret == -1) goto err; -+ if (ret == 0) free(d); /* Duplicate */ -+ return 0; -+ -+ err: -+ print_error(subcmd, "Couldn't add \"%s\" to tag values list: memory exhausted?", name); -+ free(d); -+ return -1; -+} -+ -+static int add_tag_values_file(const char *subcmd, samview_settings_t *settings, char *fn) -+{ -+ FILE *fp; -+ char buf[1024]; -+ int ret = 0; -+ if (settings->tvhash == NULL) { -+ settings->tvhash = kh_init(tv); -+ if (settings->tvhash == NULL) { -+ perror(NULL); -+ return -1; -+ } -+ } -+ -+ fp = fopen(fn, "r"); -+ if (fp == NULL) { -+ print_error_errno(subcmd, "failed to open \"%s\" for reading", fn); -+ return -1; -+ } -+ -+ while (ret != -1 && !feof(fp) && fscanf(fp, "%1023s", buf) > 0) { -+ char *d = strdup(buf); -+ if (d != NULL) { -+ kh_put(tv, settings->tvhash, d, &ret); -+ if (ret == 0) free(d); /* Duplicate */ -+ } else { -+ ret = -1; -+ } -+ } -+ if (ferror(fp)) ret = -1; -+ if (ret == -1) { -+ print_error_errno(subcmd, "failed to read \"%s\"", fn); -+ } -+ fclose(fp); -+ return (ret != -1) ? 
0 : -1; - } - --static void check_sam_close(const char *subcmd, samFile *fp, const char *fname, const char *null_fname, int *retp) -+static inline int check_sam_write1(samFile *fp, const sam_hdr_t *h, const bam1_t *b, const char *fname, int *retp) - { -- int r = sam_close(fp); -- if (r >= 0) return; -+ int r = sam_write1(fp, h, b); -+ if (r >= 0) return r; - -- // TODO Need error infrastructure so we can print a message instead of r -- if (fname) print_error(subcmd, "error closing \"%s\": %d", fname, r); -- else print_error(subcmd, "error closing %s: %d", null_fname, r); -+ if (fname) print_error_errno("view", "writing to \"%s\" failed", fname); -+ else print_error_errno("view", "writing to standard output failed"); - - *retp = EXIT_FAILURE; -+ return r; - } - - int main_samview(int argc, char *argv[]) - { -- int c, is_header = 0, is_header_only = 0, ret = 0, compress_level = -1, is_count = 0; -+ int c, is_header = 0, is_header_only = 0, ret = 0, compress_level = -1, is_count = 0, has_index_file = 0, no_pg = 0; - int64_t count = 0; - samFile *in = 0, *out = 0, *un_out=0; - FILE *fp_out = NULL; -- bam_hdr_t *header = NULL; -+ sam_hdr_t *header = NULL; - char out_mode[5], out_un_mode[5], *out_format = ""; -- char *fn_in = 0, *fn_out = 0, *fn_list = 0, *q, *fn_un_out = 0; -+ char *fn_in = 0, *fn_idx_in = 0, *fn_out = 0, *fn_list = 0, *q, *fn_un_out = 0; -+ char *fn_out_idx = NULL, *fn_un_out_idx = NULL, *arg_list = NULL; - sam_global_args ga = SAM_GLOBAL_ARGS_INIT; - htsThreadPool p = {NULL, 0}; - int filter_state = ALL, filter_op = 0; -@@ -259,6 +278,7 @@ - - samview_settings_t settings = { - .rghash = NULL, -+ .tvhash = NULL, - .min_mapQ = 0, - .flag_on = 0, - .flag_off = 0, -@@ -269,11 +289,13 @@ - .subsam_frac = -1., - .library = NULL, - .bed = NULL, -- .multi_region = 0 -+ .multi_region = 0, -+ .tag = NULL - }; - - static const struct option lopts[] = { - SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 'T', '@'), -+ {"no-PG", no_argument, NULL, 1}, - { NULL, 0, NULL, 0 } 
- }; - -@@ -290,7 +312,7 @@ - opterr = 0; - - while ((c = getopt_long(argc, argv, -- "SbBcCt:h1Ho:O:q:f:F:G:ul:r:T:R:L:s:@:m:x:U:M", -+ "SbBcCt:h1Ho:O:q:f:F:G:ul:r:T:R:d:D:L:s:@:m:x:U:MX", - lopts, NULL)) >= 0) { - switch (c) { - case 's': -@@ -300,7 +322,6 @@ - srand(settings.subsam_seed); - settings.subsam_seed = rand(); - } -- - if (q && *q == '.') { - settings.subsam_frac = strtod(q, &q); - if (*q) ret = 1; -@@ -323,6 +344,7 @@ - case 'H': is_header_only = 1; break; - case 'o': fn_out = strdup(optarg); break; - case 'U': fn_un_out = strdup(optarg); break; -+ case 'X': has_index_file = 1; break; - case 'f': settings.flag_on |= strtol(optarg, 0, 0); break; - case 'F': settings.flag_off |= strtol(optarg, 0, 0); break; - case 'G': settings.flag_alloff |= strtol(optarg, 0, 0); break; -@@ -349,6 +371,63 @@ - goto view_end; - } - break; -+ case 'd': -+ if (strlen(optarg) < 4 || optarg[2] != ':') { -+ print_error_errno("view", "Invalid \"tag:value\" option: \"%s\"", optarg); -+ ret = 1; -+ goto view_end; -+ } -+ -+ if (settings.tag) { -+ if (settings.tag[0] != optarg[0] || settings.tag[1] != optarg[1]) { -+ print_error("view", "Different tag \"%s\" was specified before: \"%s\"", settings.tag, optarg); -+ ret = 1; -+ goto view_end; -+ } -+ } else { -+ if (!(settings.tag = calloc(3, 1))) { -+ print_error("view", "Could not allocate memory for tag: \"%s\"", optarg); -+ ret = 1; -+ goto view_end; -+ } -+ memcpy(settings.tag, optarg, 2); -+ } -+ -+ if (add_tag_value_single("view", &settings, optarg+3) != 0) { -+ ret = 1; -+ goto view_end; -+ } -+ break; -+ case 'D': -+ // Allow ";" as delimiter besides ":" to support MinGW CLI POSIX -+ // path translation as described at: -+ // http://www.mingw.org/wiki/Posix_path_conversion -+ if (strlen(optarg) < 4 || (optarg[2] != ':' && optarg[2] != ';')) { -+ print_error_errno("view", "Invalid \"tag:file\" option: \"%s\"", optarg); -+ ret = 1; -+ goto view_end; -+ } -+ -+ if (settings.tag) { -+ if (settings.tag[0] != optarg[0] || 
settings.tag[1] != optarg[1]) { -+ print_error("view", "Different tag \"%s\" was specified before: \"%s\"", settings.tag, optarg); -+ ret = 1; -+ goto view_end; -+ } -+ } else { -+ if (!(settings.tag = calloc(3, 1))) { -+ print_error("view", "Could not allocate memory for tag: \"%s\"", optarg); -+ ret = 1; -+ goto view_end; -+ } -+ memcpy(settings.tag, optarg, 2); -+ } -+ -+ if (add_tag_values_file("view", &settings, optarg+3) != 0) { -+ ret = 1; -+ goto view_end; -+ } -+ break; - /* REMOVED as htslib doesn't support this - //case 'x': out_format = "x"; break; - //case 'X': out_format = "X"; break; -@@ -382,6 +461,7 @@ - } - break; - case 'M': settings.multi_region = 1; break; -+ case 1: no_pg = 1; break; - default: - if (parse_sam_global_opt(c, optarg, lopts, &ga) != 0) - return usage(samtools_stderr, EXIT_FAILURE, 0); -@@ -431,13 +511,8 @@ - ret = 1; - goto view_end; - } -- if (settings.rghash) { // FIXME: I do not know what "bam_header_t::n_text" is for... -- char *tmp; -- int l; -- tmp = drop_rg(header->text, settings.rghash, &l); -- free(header->text); -- header->text = tmp; -- header->l_text = l; -+ if (settings.rghash) { -+ sam_hdr_remove_lines(header, "RG", "ID", settings.rghash); - } - if (!is_count) { - if ((out = sam_open_format(fn_out? fn_out : "-", out_mode, &ga.out)) == 0) { -@@ -452,7 +527,25 @@ - goto view_end; - } - } -- if (*out_format || is_header || -+ -+ if (!no_pg) { -+ if (!(arg_list = stringify_argv(argc+1, argv-1))) { -+ print_error("view", "failed to create arg_list"); -+ ret = 1; -+ goto view_end; -+ } -+ if (sam_hdr_add_pg(header, "samtools", -+ "VN", samtools_version(), -+ arg_list ? "CL": NULL, -+ arg_list ? 
arg_list : NULL, -+ NULL)) { -+ print_error("view", "failed to add PG line to the header"); -+ ret = 1; -+ goto view_end; -+ } -+ } -+ -+ if (*out_format || ga.write_index || is_header || - out_mode[1] == 'b' || out_mode[1] == 'c' || - (ga.out.format != sam && ga.out.format != unknown_format)) { - if (sam_hdr_write(out, header) != 0) { -@@ -461,6 +554,13 @@ - goto view_end; - } - } -+ if (ga.write_index) { -+ if (!(fn_out_idx = auto_index(out, fn_out, header))) { -+ ret = 1; -+ goto view_end; -+ } -+ } -+ - if (fn_un_out) { - if ((un_out = sam_open_format(fn_un_out, out_un_mode, &ga.out)) == 0) { - print_error_errno("view", "failed to open \"%s\" for writing", fn_un_out); -@@ -483,6 +583,12 @@ - goto view_end; - } - } -+ if (ga.write_index) { -+ if (!(fn_un_out_idx = auto_index(un_out, fn_un_out, header))) { -+ ret = 1; -+ goto view_end; -+ } -+ } - } - } - else { -@@ -507,11 +613,23 @@ - } - if (is_header_only) goto view_end; // no need to print alignments - -+ if (has_index_file) { -+ fn_idx_in = (optind+1 < argc)? argv[optind+1] : 0; -+ if (fn_idx_in == 0) { -+ fprintf(samtools_stderr, "[main_samview] incorrect number of arguments for -X option. 
Aborting.\n"); -+ return 1; -+ } -+ } -+ - if (settings.multi_region) { -- if (optind < argc - 1) { //regions have been specified in the command line -+ if (!has_index_file && optind < argc - 1) { //regions have been specified in the command line - settings.bed = bed_hash_regions(settings.bed, argv, optind+1, argc, &filter_op); //insert(1) or filter out(0) the regions from the command line in the same hash table as the bed file - if (!filter_op) - filter_state = FILTERED; -+ } else if (has_index_file && optind < argc - 2) { -+ settings.bed = bed_hash_regions(settings.bed, argv, optind+2, argc, &filter_op); //insert(1) or filter out(0) the regions from the command line in the same hash table as the bed file -+ if (!filter_op) -+ filter_state = FILTERED; - } else { - bed_unify(settings.bed); - } -@@ -520,7 +638,13 @@ - if (settings.bed == NULL) { // index is unavailable or no regions have been specified - fprintf(samtools_stderr, "[main_samview] no regions or BED file have been provided. Aborting.\n"); - } else { -- hts_idx_t *idx = sam_index_load(in, fn_in); // load index -+ hts_idx_t *idx = NULL; -+ // If index filename has not been specfied, look in BAM folder -+ if (fn_idx_in != 0) { -+ idx = sam_index_load2(in, fn_in, fn_idx_in); // load index -+ } else { -+ idx = sam_index_load(in, fn_in); -+ } - if (idx != NULL) { - - int regcount = 0; -@@ -557,7 +681,7 @@ - } - bam_destroy1(b); - } else { -- if (optind + 1 >= argc) { // convert/print the entire file -+ if ((has_index_file && optind >= argc - 2) || (!has_index_file && optind >= argc - 1)) { // convert/print the entire file - bam1_t *b = bam_init1(); - int r; - while ((r = sam_read1(in, header, b)) >= 0) { // read one alignment from `in' -@@ -576,22 +700,25 @@ - } else { // retrieve alignments in specified regions - int i; - bam1_t *b; -- hts_idx_t *idx = sam_index_load(in, fn_in); // load index -+ hts_idx_t *idx = NULL; -+ // If index filename has not been specfied, look in BAM folder -+ if (fn_idx_in != NULL) 
{ -+ idx = sam_index_load2(in, fn_in, fn_idx_in); // load index -+ } else { -+ idx = sam_index_load(in, fn_in); -+ } - if (idx == 0) { // index is unavailable - fprintf(samtools_stderr, "[main_samview] random alignment retrieval only works for indexed BAM or CRAM files.\n"); - ret = 1; - goto view_end; - } - b = bam_init1(); -- for (i = optind + 1; i < argc; ++i) { -+ -+ for (i = (has_index_file)? optind+2 : optind+1; i < argc; ++i) { - int result; - hts_itr_t *iter = sam_itr_querys(idx, header, argv[i]); // parse a region in the format like `chr2:100-200' - if (iter == NULL) { // region invalid or reference name not found -- int beg, end; -- if (hts_parse_reg(argv[i], &beg, &end)) -- fprintf(samtools_stderr, "[main_samview] region \"%s\" specifies an unknown reference name. Continue anyway.\n", argv[i]); -- else -- fprintf(samtools_stderr, "[main_samview] region \"%s\" could not be parsed. Continue anyway.\n", argv[i]); -+ fprintf(samtools_stderr, "[main_samview] region \"%s\" specifies an invalid region or unknown reference. Continue anyway.\n", argv[i]); - continue; - } - // fetch alignments -@@ -615,6 +742,17 @@ - } - } - -+ if (ga.write_index) { -+ if (sam_idx_save(out) < 0) { -+ print_error_errno("view", "writing index failed"); -+ ret = 1; -+ } -+ if (un_out && sam_idx_save(un_out) < 0) { -+ print_error_errno("view", "writing index failed"); -+ ret = 1; -+ } -+ } -+ - view_end: - if (is_count && ret == 0) { - if (fprintf(fn_out? 
fp_out : samtools_stdout, "%" PRId64 "\n", count) < 0) { -@@ -632,7 +770,7 @@ - - free(fn_list); free(fn_out); free(settings.library); free(fn_un_out); - sam_global_args_free(&ga); -- if ( header ) bam_hdr_destroy(header); -+ if ( header ) sam_hdr_destroy(header); - if (settings.bed) bed_destroy(settings.bed); - if (settings.rghash) { - khint_t k; -@@ -640,13 +778,28 @@ - if (kh_exist(settings.rghash, k)) free((char*)kh_key(settings.rghash, k)); - kh_destroy(rg, settings.rghash); - } -+ if (settings.tvhash) { -+ khint_t k; -+ for (k = 0; k < kh_end(settings.tvhash); ++k) -+ if (kh_exist(settings.tvhash, k)) free((char*)kh_key(settings.tvhash, k)); -+ kh_destroy(tv, settings.tvhash); -+ } - if (settings.remove_aux_len) { - free(settings.remove_aux); - } -+ if (settings.tag) { -+ free(settings.tag); -+ } - - if (p.pool) - hts_tpool_destroy(p.pool); - -+ if (fn_out_idx) -+ free(fn_out_idx); -+ if (fn_un_out_idx) -+ free(fn_un_out_idx); -+ free(arg_list); -+ - return ret; - } - -@@ -669,10 +822,16 @@ - " -U FILE output reads not selected by filters to FILE [null]\n" - // extra input - " -t FILE FILE listing reference names and lengths (see long help) [null]\n" -+" -X include customized index file\n" - // read filters - " -L FILE only include reads overlapping this BED FILE [null]\n" - " -r STR only include reads in read group STR [null]\n" - " -R FILE only include reads with read group listed in FILE [null]\n" -+" -d STR:STR\n" -+" only include reads with tag STR and associated value STR [null]\n" -+" -D STR:FILE\n" -+" only include reads with tag STR and associated values listed in\n" -+" FILE [null]\n" - " -q INT only include reads with mapping quality >= INT [0]\n" - " -l STR only include reads in library STR [null]\n" - " -m INT only include reads with number of CIGAR operations consuming\n" -@@ -689,9 +848,10 @@ - " -B collapse the backward CIGAR operation\n" - // general options - " -? 
print long help, including note about region specification\n" --" -S ignored (input format is auto-detected)\n"); -+" -S ignored (input format is auto-detected)\n" -+" --no-PG do not add a PG line\n"); - -- sam_global_opt_help(fp, "-.O.T@"); -+ sam_global_opt_help(fp, "-.O.T@.."); - fprintf(fp, "\n"); - - if (is_long_help) -@@ -749,903 +909,3 @@ - free(argv2); - return ret; - } -- --int8_t seq_comp_table[16] = { 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15 }; --static const char *copied_tags[] = { "RG", "BC", "QT", NULL }; -- --static void bam2fq_usage(FILE *to, const char *command) --{ -- int fq = strcasecmp("fastq", command) == 0 || strcasecmp("bam2fq", command) == 0; -- fprintf(to, --"Usage: samtools %s [options...] \n", command); -- fprintf(to, --"Options:\n" --" -0 FILE write reads designated READ_OTHER to FILE\n" --" -1 FILE write reads designated READ1 to FILE\n" --" -2 FILE write reads designated READ2 to FILE\n" --" note: if a singleton file is specified with -s, only\n" --" paired reads will be written to the -1 and -2 files.\n" --" -f INT only include reads with all of the FLAGs in INT present [0]\n" // F&x == x --" -F INT only include reads with none of the FLAGS in INT present [0]\n" // F&x == 0 --" -G INT only EXCLUDE reads with all of the FLAGs in INT present [0]\n" // !(F&x == x) --" -n don't append /1 and /2 to the read name\n" --" -N always append /1 and /2 to the read name\n"); -- if (fq) fprintf(to, --" -O output quality in the OQ tag if present\n"); -- fprintf(to, --" -s FILE write singleton reads designated READ1 or READ2 to FILE\n" --" -t copy RG, BC and QT tags to the %s header line\n", -- fq ? "FASTQ" : "FASTA"); -- fprintf(to, --" -T TAGLIST copy arbitrary tags to the %s header line\n", -- fq ? 
"FASTQ" : "FASTA"); -- if (fq) fprintf(to, --" -v INT default quality score if not given in file [1]\n" --" -i add Illumina Casava 1.8 format entry to header (eg 1:N:0:ATCACG)\n" --" -c compression level [0..9] to use when creating gz or bgzf fastq files\n" --" --i1 FILE write first index reads to FILE\n" --" --i2 FILE write second index reads to FILE\n" --" --barcode-tag TAG Barcode tag [default: " DEFAULT_BARCODE_TAG "]\n" --" --quality-tag TAG Quality tag [default: " DEFAULT_QUALITY_TAG "]\n" --" --index-format STR How to parse barcode and quality tags\n\n"); -- sam_global_opt_help(to, "-.--.@"); -- fprintf(to, --"\n" --"Reads are designated READ1 if FLAG READ1 is set and READ2 is not set.\n" --"Reads are designated READ2 if FLAG READ1 is not set and READ2 is set.\n" --"Reads are designated READ_OTHER if FLAGs READ1 and READ2 are either both set\n" --"or both unset.\n" --"Run 'samtools flags' for more information on flag codes and meanings.\n"); -- fprintf(to, --"\n" --"The index-format string describes how to parse the barcode and quality tags, for example:\n" --" i14i8 the first 14 characters are index 1, the next 8 characters are index 2\n" --" n8i14 ignore the first 8 characters, and use the next 14 characters for index 1\n" --"If the tag contains a separator, then the numeric part can be replaced with '*' to mean\n" --"'read until the separator or end of tag', for example:\n" --" n*i* ignore the left part of the tag until the separator, then use the second part\n" --" of the tag as index 1\n"); -- fprintf(to, --"\n" --"Examples:\n" --" To get just the paired reads in separate files, use:\n" --" samtools %s -1 paired1.%s -2 paired2.%s -0 /dev/null -s /dev/null -n -F 0x900 in.bam\n" --"\n To get all non-supplementary/secondary reads in a single file, redirect the output:\n" --" samtools %s -F 0x900 in.bam > all_reads.%s\n", -- command, fq ? "fq" : "fa", fq ? "fq" : "fa", -- command, fq ? 
"fq" : "fa"); --} -- --typedef enum { READ_UNKNOWN = 0, READ_1 = 1, READ_2 = 2 } readpart; --typedef enum { FASTA, FASTQ } fastfile; --typedef struct bam2fq_opts { -- char *fnse; -- char *fnr[3]; -- char *fn_input; // pointer to input filename in argv do not free -- bool has12, has12always, use_oq, copy_tags, illumina_tag; -- int flag_on, flag_off, flag_alloff; -- sam_global_args ga; -- fastfile filetype; -- int def_qual; -- char *barcode_tag; -- char *quality_tag; -- char *index_file[2]; -- char *index_format; -- char *extra_tags; -- char compression_level; --} bam2fq_opts_t; -- --typedef struct bam2fq_state { -- samFile *fp; -- BGZF *fpse; -- BGZF *fpr[3]; -- BGZF *fpi[2]; -- BGZF *hsamtools_stdout; -- bam_hdr_t *h; -- bool has12, use_oq, copy_tags, illumina_tag; -- int flag_on, flag_off, flag_alloff; -- fastfile filetype; -- int def_qual; -- klist_t(ktaglist) *taglist; -- char *index_sequence; -- char compression_level; --} bam2fq_state_t; -- --/* -- * Get and decode the read from a BAM record. -- * -- * TODO: htslib really needs an interface for this. Consider this or perhaps -- * bam_get_seq_str (current vs original orientation) and bam_get_qual_str -- * functions as string formatted equivalents to bam_get_{seq,qual}? -- */ -- --/* -- * Reverse a string in place. -- * From http://stackoverflow.com/questions/8534274/is-the-strrev-function-not-available-in-linux. 
-- * Author Sumit-naik: http://stackoverflow.com/users/4590926/sumit-naik -- */ --static char *reverse(char *str) --{ -- int i = strlen(str)-1,j=0; -- char ch; -- while (i>j) { -- ch = str[i]; -- str[i]= str[j]; -- str[j] = ch; -- i--; -- j++; -- } -- return str; --} -- --/* return the read, reverse complemented if necessary */ --static char *get_read(const bam1_t *rec) --{ -- int len = rec->core.l_qseq + 1; -- char *read = calloc(1, len); -- char *seq = (char *)bam_get_seq(rec); -- int n; -- -- if (!read) return NULL; -- -- for (n=0; n < rec->core.l_qseq; n++) { -- if (rec->core.flag & BAM_FREVERSE) read[n] = seq_nt16_str[seq_comp_table[bam_seqi(seq,n)]]; -- else read[n] = seq_nt16_str[bam_seqi(seq,n)]; -- } -- if (rec->core.flag & BAM_FREVERSE) reverse(read); -- return read; --} -- --/* -- * get and decode the quality from a BAM record -- */ --static int get_quality(const bam1_t *rec, char **qual_out) --{ -- char *quality = calloc(1, rec->core.l_qseq + 1); -- char *q = (char *)bam_get_qual(rec); -- int n; -- -- if (!quality) return -1; -- -- if (*q == '\xff') { -- free(quality); -- *qual_out = NULL; -- return 0; -- } -- -- for (n=0; n < rec->core.l_qseq; n++) { -- quality[n] = q[n]+33; -- } -- if (rec->core.flag & BAM_FREVERSE) reverse(quality); -- *qual_out = quality; -- return 0; --} -- --// --// End of htslib complaints --// -- -- --static readpart which_readpart(const bam1_t *b) --{ -- if ((b->core.flag & BAM_FREAD1) && !(b->core.flag & BAM_FREAD2)) { -- return READ_1; -- } else if ((b->core.flag & BAM_FREAD2) && !(b->core.flag & BAM_FREAD1)) { -- return READ_2; -- } else { -- return READ_UNKNOWN; -- } --} -- --/* -- * parse the length part from the index-format string -- */ --static int getLength(char **s) --{ -- int n = 0; -- while (**s) { -- if (**s == '*') { n=-1; (*s)++; break; } -- if ( !isdigit(**s)) break; -- n = n*10 + ((**s)-'0'); -- (*s)++; -- } -- return n; --} -- --static bool copy_tag(const char *tag, const bam1_t *rec, kstring_t *linebuf) --{ 
-- uint8_t *s = bam_aux_get(rec, tag); -- if (s) { -- char aux_type = *s; -- switch (aux_type) { -- case 'C': -- case 'S': aux_type = 'I'; break; -- case 'c': -- case 's': aux_type = 'i'; break; -- case 'd': aux_type = 'f'; break; -- } -- -- // Ensure space. Need 6 chars + length of tag. Max length of -- // i is 16, A is 21, B currently 26, Z is unknown, so -- // have to check that one later. -- if (ks_resize(linebuf, ks_len(linebuf) + 64) < 0) return false; -- -- kputc('\t', linebuf); -- kputsn(tag, 2, linebuf); -- kputc(':', linebuf); -- kputc(aux_type=='I'? 'i': aux_type, linebuf); -- kputc(':', linebuf); -- switch (aux_type) { -- case 'H': -- case 'Z': -- if (kputs(bam_aux2Z(s), linebuf) < 0) return false; -- break; -- case 'i': kputw(bam_aux2i(s), linebuf); break; -- case 'I': kputuw(bam_aux2i(s), linebuf); break; -- case 'A': kputc(bam_aux2A(s), linebuf); break; -- case 'f': kputd(bam_aux2f(s), linebuf); break; -- case 'B': kputs("*** Unhandled aux type ***", linebuf); return false; -- default: kputs("*** Unknown aux type ***", linebuf); return false; -- } -- } -- return true; --} -- --static int insert_index_sequence_into_linebuf(char *index_sequence, kstring_t *linebuf, bam1_t *rec) --{ -- if (!index_sequence) return 0; -- -- kstring_t new = {0,0,NULL}; -- if (linebuf->s) { -- char *s = strchr(linebuf->s, '\n'); -- if (s) { -- if (ks_resize(&new, linebuf->l + strlen(index_sequence) + 16) < 0) -- return -1; -- *s = 0; -- kputs(linebuf->s, &new); -- kputc(' ', &new); -- readpart readpart = which_readpart(rec); -- if (readpart == READ_1) kputc('1', &new); -- else if (readpart == READ_2) kputc('2', &new); -- else kputc('0', &new); -- -- kputc(':', &new); -- if (rec->core.flag & BAM_FQCFAIL) kputc('Y', &new); -- else kputc('N', &new); -- -- kputs(":0:", &new); -- kputs(index_sequence, &new); -- kputc('\n', &new); -- kputs(s+1, &new); -- free(ks_release(linebuf)); -- linebuf->s = new.s; linebuf->l = new.l; linebuf->m = new.m; -- } -- } -- return 0; --} -- 
--static bool make_fq_line(const bam1_t *rec, char *seq, char *qual, kstring_t *linebuf, const bam2fq_state_t *state) --{ -- int i; -- -- linebuf->l = 0; -- // Write read name -- if (kputc(state->filetype == FASTA? '>' : '@', linebuf) < 0) return false; -- if (kputs(bam_get_qname(rec), linebuf) < 0) return false; -- // Add the /1 /2 if requested -- if (state->has12) { -- readpart readpart = which_readpart(rec); -- if (readpart == READ_1) { -- if (kputs("/1", linebuf) < 0) return false; -- } else if (readpart == READ_2) { -- if (kputs("/2", linebuf) < 0) return false; -- } -- } -- if (state->copy_tags) { -- for (i = 0; copied_tags[i]; ++i) { -- if (!copy_tag(copied_tags[i], rec, linebuf)) { -- fprintf(samtools_stderr, "Problem copying aux tags: [%s]\n", linebuf->s); -- return false; -- } -- } -- } -- -- if (state->taglist->size) { -- kliter_t(ktaglist) *p; -- for (p = kl_begin(state->taglist); p != kl_end(state->taglist); p = kl_next(p)) { -- if (!copy_tag(kl_val(p), rec, linebuf)) { -- fprintf(samtools_stderr, "Problem copying aux tags: [%s]\n", linebuf->s); -- return false; -- } -- } -- } -- -- if (kputc('\n', linebuf) < 0) return false; -- if (kputs(seq, linebuf) < 0) return false; -- if (kputc('\n', linebuf) < 0) return false; -- -- if (state->filetype == FASTQ) { -- // Write quality -- if (kputs("+\n", linebuf) < 0) return false; -- if (qual && *qual) { -- if (kputs(qual, linebuf) < 0) return false; -- } else { -- int len = strlen(seq); -- if (ks_resize(linebuf, ks_len(linebuf) + len + 1) < 0) return false; -- for (i = 0; i < len; ++i) { -- kputc(33 + state->def_qual, linebuf); -- } -- } -- if (kputc('\n', linebuf) < 0) return false; -- } -- return true; --} -- --/* -- * Create FASTQ lines from the barcode tag using the index-format -- */ --static bool tags2fq(bam1_t *rec, bam2fq_state_t *state, const bam2fq_opts_t* opts) --{ -- uint8_t *p; -- char *ifmt = opts->index_format; -- char *tag = NULL; -- char *qual = NULL; -- char *sub_tag = NULL; -- char *sub_qual 
= NULL; -- size_t tag_len; -- int file_number = 0; -- kstring_t linebuf = { 0, 0, NULL }; // Buffer -- -- -- // read barcode tag -- p = bam_aux_get(rec,opts->barcode_tag); -- if (p) tag = bam_aux2Z(p); -- -- if (!tag) return true; // there is no tag -- -- tag_len = strlen(tag); -- sub_tag = calloc(1, tag_len + 1); -- if (!sub_tag) goto fail; -- sub_qual = calloc(1, tag_len + 1); -- if (!sub_qual) goto fail; -- -- // read quality tag -- p = bam_aux_get(rec, opts->quality_tag); -- if (p) qual = bam_aux2Z(p); -- -- // Parse the index-format string -- while (*ifmt) { -- if (file_number > 1) break; // shouldn't happen if we've validated paramaters correctly -- char action = *ifmt; // should be 'i' or 'n' -- ifmt++; // skip over action -- int index_len = getLength(&ifmt); -- int n = 0; -- -- if (index_len < 0) { -- // read until separator -- while (isalpha(*tag)) { -- sub_tag[n] = *tag++; -- if (qual) sub_qual[n] = *qual++; -- n++; -- } -- if (*tag) { // skip separator -- tag++; -- if (qual) qual++; -- } -- } else { -- // read index_len characters -- while (index_len-- && *tag) { -- sub_tag[n] = *tag++; -- if (qual) sub_qual[n] = *qual++; -- n++; -- } -- } -- sub_tag[n] = '\0'; -- sub_qual[n] = '\0'; -- -- if (action=='i' && *sub_tag && state->fpi[file_number]) { -- //if (file_number==0) state->index_sequence = strdup(sub_tag); // we're going to need this later... -- state->index_sequence = strdup(sub_tag); // we're going to need this later... 
-- if (!state->index_sequence) goto fail; -- if (!make_fq_line(rec, sub_tag, sub_qual, &linebuf, state)) goto fail; -- if (state->illumina_tag) { -- if (insert_index_sequence_into_linebuf(state->index_sequence, &linebuf, rec) < 0) { -- goto fail; -- } -- } -- if (bgzf_write(state->fpi[file_number++], linebuf.s, linebuf.l) < 0) -- goto fail; -- } -- -- } -- -- free(sub_qual); free(sub_tag); -- free(linebuf.s); -- return true; -- -- fail: -- perror(__func__); -- free(sub_qual); free(sub_tag); -- free(linebuf.s); -- return true; --} -- --// Transform a bam1_t record into a string with the FASTQ representation of it --// @returns false for error, true for success --static bool bam1_to_fq(const bam1_t *b, kstring_t *linebuf, const bam2fq_state_t *state) --{ -- int32_t qlen = b->core.l_qseq; -- assert(qlen >= 0); -- const uint8_t *oq = NULL; -- char *qual = NULL; -- -- char *seq = get_read(b); -- if (!seq) return false; -- -- if (state->use_oq) oq = bam_aux_get(b, "OQ"); -- if (oq && *oq=='Z') { -- qual = strdup(bam_aux2Z(oq)); -- if (!qual) goto fail; -- if (b->core.flag & BAM_FREVERSE) { // read is reverse complemented -- reverse(qual); -- } -- } else { -- if (get_quality(b, &qual) < 0) goto fail; -- } -- -- if (!make_fq_line(b, seq, qual, linebuf, state)) goto fail; -- -- free(qual); -- free(seq); -- return true; -- -- fail: -- free(seq); -- free(qual); -- return false; --} -- --static void free_opts(bam2fq_opts_t *opts) --{ -- free(opts->barcode_tag); -- free(opts->quality_tag); -- free(opts->index_format); -- free(opts->extra_tags); -- free(opts); --} -- --// return true if valid --static bool parse_opts(int argc, char *argv[], bam2fq_opts_t** opts_out) --{ -- // Parse args -- bam2fq_opts_t* opts = calloc(1, sizeof(bam2fq_opts_t)); -- opts->has12 = true; -- opts->has12always = false; -- opts->filetype = FASTQ; -- opts->def_qual = 1; -- opts->barcode_tag = NULL; -- opts->quality_tag = NULL; -- opts->index_format = NULL; -- opts->index_file[0] = NULL; -- 
opts->index_file[1] = NULL; -- opts->extra_tags = NULL; -- opts->compression_level = 1; -- -- int c; -- sam_global_args_init(&opts->ga); -- static const struct option lopts[] = { -- SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0, '@'), -- {"i1", required_argument, NULL, 1}, -- {"I1", required_argument, NULL, 1}, -- {"i2", required_argument, NULL, 2}, -- {"I2", required_argument, NULL, 2}, -- {"if", required_argument, NULL, 3}, -- {"IF", required_argument, NULL, 3}, -- {"index-format", required_argument, NULL, 3}, -- {"barcode-tag", required_argument, NULL, 'b'}, -- {"quality-tag", required_argument, NULL, 'q'}, -- { NULL, 0, NULL, 0 } -- }; -- while ((c = getopt_long(argc, argv, "0:1:2:f:F:G:niNOs:c:tT:v:@:", lopts, NULL)) > 0) { -- switch (c) { -- case 'b': opts->barcode_tag = strdup(optarg); break; -- case 'q': opts->quality_tag = strdup(optarg); break; -- case 1 : opts->index_file[0] = optarg; break; -- case 2 : opts->index_file[1] = optarg; break; -- case 3 : opts->index_format = strdup(optarg); break; -- case '0': opts->fnr[0] = optarg; break; -- case '1': opts->fnr[1] = optarg; break; -- case '2': opts->fnr[2] = optarg; break; -- case 'f': opts->flag_on |= strtol(optarg, 0, 0); break; -- case 'F': opts->flag_off |= strtol(optarg, 0, 0); break; -- case 'G': opts->flag_alloff |= strtol(optarg, 0, 0); break; -- case 'n': opts->has12 = false; break; -- case 'N': opts->has12always = true; break; -- case 'O': opts->use_oq = true; break; -- case 's': opts->fnse = optarg; break; -- case 't': opts->copy_tags = true; break; -- case 'i': opts->illumina_tag = true; break; -- case 'c': opts->compression_level = atoi(optarg); break; -- case 'T': opts->extra_tags = strdup(optarg); break; -- case 'v': opts->def_qual = atoi(optarg); break; -- case '?': bam2fq_usage(samtools_stderr, argv[0]); free_opts(opts); return false; -- default: -- if (parse_sam_global_opt(c, optarg, lopts, &opts->ga) != 0) { -- bam2fq_usage(samtools_stderr, argv[0]); free_opts(opts); return false; -- } -- 
break; -- } -- } -- -- if (opts->fnr[1] || opts->fnr[2]) opts->has12 = false; -- if (opts->has12always) opts->has12 = true; -- -- if (!opts->barcode_tag) opts->barcode_tag = strdup(DEFAULT_BARCODE_TAG); -- if (!opts->quality_tag) opts->quality_tag = strdup(DEFAULT_QUALITY_TAG); -- -- int nIndex = 0; -- if (opts->index_format) { -- char *s; -- for (s = opts->index_format; *s; s++) { -- if (*s == 'i') nIndex++; -- } -- } -- if (nIndex>2) { -- fprintf(samtools_stderr,"Invalid index format: more than 2 indexes\n"); -- bam2fq_usage(samtools_stderr, argv[0]); -- free_opts(opts); -- return false; -- } -- -- if (opts->index_file[1] && !opts->index_file[0]) { -- fprintf(samtools_stderr, "Index one specified, but index two not given\n"); -- bam2fq_usage(samtools_stderr, argv[0]); -- free_opts(opts); -- return false; -- } -- -- if (nIndex==2 && !opts->index_file[1]) { -- fprintf(samtools_stderr, "index_format specifies two indexes, but only one index file given\n"); -- bam2fq_usage(samtools_stderr, argv[0]); -- free_opts(opts); -- return false; -- } -- -- if (nIndex==1 && !opts->index_file[0]) { -- fprintf(samtools_stderr, "index_format specifies an index, but no index file given\n"); -- bam2fq_usage(samtools_stderr, argv[0]); -- free_opts(opts); -- return false; -- } -- -- if (nIndex==0 && opts->index_file[0]) { -- fprintf(samtools_stderr, "index_format not specified, but index file given\n"); -- bam2fq_usage(samtools_stderr, argv[0]); -- free_opts(opts); -- return false; -- } -- -- if (opts->def_qual < 0 || 93 < opts->def_qual) { -- fprintf(samtools_stderr, "Invalid -v default quality %i, allowed range 0 to 93\n", opts->def_qual); -- bam2fq_usage(samtools_stderr, argv[0]); -- free_opts(opts); -- return false; -- } -- -- const char* type_str = argv[0]; -- if (strcasecmp("fastq", type_str) == 0 || strcasecmp("bam2fq", type_str) == 0) { -- opts->filetype = FASTQ; -- } else if (strcasecmp("fasta", type_str) == 0) { -- opts->filetype = FASTA; -- } else { -- print_error("bam2fq", 
"Unrecognised type call \"%s\", this should be impossible... but you managed it!", type_str); -- bam2fq_usage(samtools_stderr, argv[0]); -- free_opts(opts); -- return false; -- } -- -- if ((argc - (optind)) == 0) { -- fprintf(samtools_stderr, "No input file specified.\n"); -- bam2fq_usage(samtools_stdout, argv[0]); -- free_opts(opts); -- return false; -- } -- -- if ((argc - (optind)) != 1) { -- fprintf(samtools_stderr, "Too many arguments.\n"); -- bam2fq_usage(samtools_stderr, argv[0]); -- free_opts(opts); -- return false; -- } -- opts->fn_input = argv[optind]; -- *opts_out = opts; -- return true; --} -- --static BGZF *open_fqfile(char *filename, int c) --{ -- char mode[4] = "w"; -- size_t len = strlen(filename); -- -- mode[2] = 0; mode[3] = 0; -- if (len > 3 && strstr(filename + (len - 3),".gz")) { -- mode[1] = 'g'; mode[2] = c+'0'; -- } else if ((len > 4 && strstr(filename + (len - 4),".bgz")) -- || (len > 5 && strstr(filename + (len - 5),".bgzf"))) { -- mode[1] = c+'0'; -- } else { -- mode[1] = 'u'; -- } -- -- return bgzf_open(filename,mode); --} -- --static bool init_state(const bam2fq_opts_t* opts, bam2fq_state_t** state_out) --{ -- bam2fq_state_t* state = calloc(1, sizeof(bam2fq_state_t)); -- state->flag_on = opts->flag_on; -- state->flag_off = opts->flag_off; -- state->flag_alloff = opts->flag_alloff; -- state->has12 = opts->has12; -- state->use_oq = opts->use_oq; -- state->illumina_tag = opts->illumina_tag; -- state->copy_tags = opts->copy_tags; -- state->filetype = opts->filetype; -- state->def_qual = opts->def_qual; -- state->index_sequence = NULL; -- state->hsamtools_stdout = NULL; -- state->compression_level = opts->compression_level; -- -- state->taglist = kl_init(ktaglist); -- if (opts->extra_tags) { -- char *save_p; -- char *s = strtok_r(opts->extra_tags, ",", &save_p); -- while (s) { -- if (strlen(s) != 2) { -- fprintf(samtools_stderr, "Parsing extra tags - '%s' is not two characters\n", s); -- free(state); -- return false; -- } -- char **et = 
kl_pushp(ktaglist, state->taglist); -- *et = s; -- s = strtok_r(NULL, ",", &save_p); -- } -- } -- -- state->fp = sam_open(opts->fn_input, "r"); -- if (state->fp == NULL) { -- print_error_errno("bam2fq","Cannot read file \"%s\"", opts->fn_input); -- free(state); -- return false; -- } -- if (opts->ga.nthreads > 0) -- hts_set_threads(state->fp, opts->ga.nthreads); -- uint32_t rf = SAM_QNAME | SAM_FLAG | SAM_SEQ | SAM_QUAL; -- if (opts->use_oq || opts->extra_tags || opts->index_file[0]) rf |= SAM_AUX; -- if (hts_set_opt(state->fp, CRAM_OPT_REQUIRED_FIELDS, rf)) { -- fprintf(samtools_stderr, "Failed to set CRAM_OPT_REQUIRED_FIELDS value\n"); -- free(state); -- return false; -- } -- if (hts_set_opt(state->fp, CRAM_OPT_DECODE_MD, 0)) { -- fprintf(samtools_stderr, "Failed to set CRAM_OPT_DECODE_MD value\n"); -- free(state); -- return false; -- } -- if (opts->fnse) { -- state->fpse = open_fqfile(opts->fnse, state->compression_level); -- if (state->fpse == NULL) { -- print_error_errno("bam2fq", "Cannot write to singleton file \"%s\"", opts->fnse); -- free(state); -- return false; -- } -- } -- -- if (opts->ga.reference) { -- if (hts_set_fai_filename(state->fp, opts->ga.reference) != 0) { -- print_error_errno("bam2fq", "cannot load reference \"%s\"", opts->ga.reference); -- free(state); -- return false; -- } -- } -- -- int i; -- for (i = 0; i < 3; ++i) { -- if (opts->fnr[i]) { -- state->fpr[i] = open_fqfile(opts->fnr[i], state->compression_level); -- if (state->fpr[i] == NULL) { -- print_error_errno("bam2fq", "Cannot write to r%d file \"%s\"", i, opts->fnr[i]); -- free(state); -- return false; -- } -- } else { -- if (!state->hsamtools_stdout) { -- state->hsamtools_stdout = bgzf_dopen(fileno(samtools_stdout), "wu"); -- if (!state->hsamtools_stdout) { -- print_error_errno("bam2fq", "Cannot open STDOUT"); -- free(state); -- return false; -- } -- } -- state->fpr[i] = state->hsamtools_stdout; -- } -- } -- for (i = 0; i < 2; i++) { -- state->fpi[i] = NULL; -- if 
(opts->index_file[i]) { -- state->fpi[i] = open_fqfile(opts->index_file[i], state->compression_level); -- if (state->fpi[i] == NULL) { -- print_error_errno("bam2fq", "Cannot write to i%d file \"%s\"", i+1, opts->index_file[i]); -- free(state); -- return false; -- } -- } -- } -- -- state->h = sam_hdr_read(state->fp); -- if (state->h == NULL) { -- fprintf(samtools_stderr, "Failed to read header for \"%s\"\n", opts->fn_input); -- free(state); -- return false; -- } -- -- *state_out = state; -- return true; --} -- --static bool destroy_state(const bam2fq_opts_t *opts, bam2fq_state_t *state, int* status) --{ -- bool valid = true; -- bam_hdr_destroy(state->h); -- check_sam_close("bam2fq", state->fp, opts->fn_input, "file", status); -- if (state->fpse && bgzf_close(state->fpse)) { print_error_errno("bam2fq", "Error closing singleton file \"%s\"", opts->fnse); valid = false; } -- int i; -- for (i = 0; i < 3; ++i) { -- if (state->fpr[i] != state->hsamtools_stdout) { -- if (bgzf_close(state->fpr[i])) { print_error_errno("bam2fq", "Error closing r%d file \"%s\"", i, opts->fnr[i]); valid = false; } -- } -- } -- if (state->hsamtools_stdout) { -- if (bgzf_close(state->hsamtools_stdout)) { -- print_error_errno("bam2fq", "Error closing STDOUT"); -- valid = false; -- } -- } -- for (i = 0; i < 2; i++) { -- if (state->fpi[i] && bgzf_close(state->fpi[i])) { -- print_error_errno("bam2fq", "Error closing i%d file \"%s\"", i+1, opts->index_file[i]); -- valid = false; -- } -- } -- kl_destroy(ktaglist,state->taglist); -- free(state->index_sequence); -- free(state); -- return valid; --} -- --static inline bool filter_it_out(const bam1_t *b, const bam2fq_state_t *state) --{ -- return (b->core.flag&(BAM_FSECONDARY|BAM_FSUPPLEMENTARY) // skip secondary and supplementary alignments -- || (b->core.flag&(state->flag_on)) != state->flag_on // or reads indicated by filter flags -- || (b->core.flag&(state->flag_off)) != 0 -- || (b->core.flag&(state->flag_alloff) && (b->core.flag&(state->flag_alloff)) 
== state->flag_alloff)); -- --} -- --static bool bam2fq_mainloop(bam2fq_state_t *state, bam2fq_opts_t* opts) --{ -- int n; -- bam1_t *records[3]; -- bam1_t* b = bam_init1(); -- char *current_qname = NULL; -- int64_t n_reads = 0, n_singletons = 0; // Statistics -- kstring_t linebuf[3] = {{0,0,NULL},{0,0,NULL},{0,0,NULL}}; -- int score[3]; -- int at_eof; -- if (b == NULL ) { -- perror("[bam2fq_mainloop] Malloc error for bam record buffer."); -- return false; -- } -- -- bool valid = true; -- while (true) { -- int res = sam_read1(state->fp, state->h, b); -- if (res < -1) { -- fprintf(samtools_stderr, "[bam2fq_mainloop] Failed to read bam record.\n"); -- return false; -- } -- at_eof = res < 0; -- -- if (!at_eof && filter_it_out(b, state)) continue; -- if (!at_eof) ++n_reads; -- -- if (at_eof || !current_qname || (strcmp(current_qname, bam_get_qname(b)) != 0)) { -- if (current_qname) { -- if (state->illumina_tag) { -- for (n=0; valid && n<3; n++) { -- if (insert_index_sequence_into_linebuf(state->index_sequence, &linebuf[n], records[n]) < 0) valid = false; -- } -- if (!valid) break; -- } -- free(state->index_sequence); state->index_sequence = NULL; -- if (score[1] > 0 && score[2] > 0) { -- // print linebuf[1] to fpr[1], linebuf[2] to fpr[2] -- if (bgzf_write(state->fpr[1], linebuf[1].s, linebuf[1].l) < 0) { valid = false; break; } -- if (bgzf_write(state->fpr[2], linebuf[2].s, linebuf[2].l) < 0) { valid = false; break; } -- } else if (score[1] > 0 || score[2] > 0) { -- if (state->fpse) { -- // print whichever one exists to fpse -- if (score[1] > 0) { -- if (bgzf_write(state->fpse, linebuf[1].s, linebuf[1].l) < 0) { valid = false; break; } -- } else { -- if (bgzf_write(state->fpse, linebuf[2].s, linebuf[2].l) < 0) { valid = false; break; } -- } -- ++n_singletons; -- } else { -- if (score[1] > 0) { -- if (bgzf_write(state->fpr[1], linebuf[1].s, linebuf[1].l) < 0) { valid = false; break; } -- } else { -- if (bgzf_write(state->fpr[2], linebuf[2].s, linebuf[2].l) < 0) { valid 
= false; break; } -- } -- } -- } -- if (score[0]) { // TODO: check this -- // print linebuf[0] to fpr[0] -- if (bgzf_write(state->fpr[0], linebuf[0].s, linebuf[0].l) < 0) { valid = false; break; } -- } -- } -- -- if (at_eof) break; -- -- free(current_qname); -- current_qname = strdup(bam_get_qname(b)); -- if (!current_qname) { valid = false; break; } -- score[0] = score[1] = score[2] = 0; -- } -- -- // Prefer a copy of the read that has base qualities -- int b_score = bam_get_qual(b)[0] != 0xff? 2 : 1; -- if (b_score > score[which_readpart(b)]) { -- if (state->fpi[0]) if (!tags2fq(b, state, opts)) return false; -- records[which_readpart(b)] = b; -- if(!bam1_to_fq(b, &linebuf[which_readpart(b)], state)) { -- fprintf(samtools_stderr, "[%s] Error converting read to FASTA/Q\n", __func__); -- return false; -- } -- score[which_readpart(b)] = b_score; -- } -- } -- if (!valid) -- { -- perror("[bam2fq_mainloop] Error writing to FASTx files."); -- } -- bam_destroy1(b); -- free(current_qname); -- free(linebuf[0].s); -- free(linebuf[1].s); -- free(linebuf[2].s); -- fprintf(samtools_stderr, "[M::%s] discarded %" PRId64 " singletons\n", __func__, n_singletons); -- fprintf(samtools_stderr, "[M::%s] processed %" PRId64 " reads\n", __func__, n_reads); -- -- return valid; --} -- --int main_bam2fq(int argc, char *argv[]) --{ -- int status = EXIT_SUCCESS; -- bam2fq_opts_t* opts = NULL; -- bam2fq_state_t* state = NULL; -- -- bool valid = parse_opts(argc, argv, &opts); -- if (!valid || opts == NULL) return valid ? EXIT_SUCCESS : EXIT_FAILURE; -- -- if (!init_state(opts, &state)) return EXIT_FAILURE; -- -- if (!bam2fq_mainloop(state,opts)) status = EXIT_FAILURE; -- -- if (!destroy_state(opts, state, &status)) return EXIT_FAILURE; -- sam_global_args_free(&opts->ga); -- free_opts(opts); -- -- return status; --} ---- python-pysam.orig/samtools/samtools.h -+++ python-pysam/samtools/samtools.h -@@ -1,6 +1,6 @@ - /* samtools.h -- utility routines. 
- -- Copyright (C) 2013-2015 Genome Research Ltd. -+ Copyright (C) 2013-2015, 2019 Genome Research Ltd. - - Author: Petr Danecek - -@@ -25,15 +25,28 @@ - #ifndef SAMTOOLS_H - #define SAMTOOLS_H - -+#include "htslib/hts_defs.h" -+#include "htslib/sam.h" -+ - const char *samtools_version(void); - --#if defined __GNUC__ && __GNUC__ >= 2 --#define CHECK_PRINTF(fmt,args) __attribute__ ((format (printf, fmt, args))) --#else --#define CHECK_PRINTF(fmt,args) --#endif -+#define CHECK_PRINTF(fmt,args) HTS_FORMAT(HTS_PRINTF_FMT, (fmt), (args)) - - void print_error(const char *subcommand, const char *format, ...) CHECK_PRINTF(2, 3); - void print_error_errno(const char *subcommand, const char *format, ...) CHECK_PRINTF(2, 3); - -+void check_sam_close(const char *subcmd, samFile *fp, const char *fname, const char *null_fname, int *retp); -+ -+/* -+ * Utility function to add an index to a file we've opened for write. -+ * NB: Call this after writing the header and before writing sequences. -+ * -+ * The returned index filename should be freed by the caller, but only -+ * after sam_idx_save has been called. -+ * -+ * Returns index filename on success, -+ * NULL on failure. -+ */ -+char *auto_index(htsFile *fp, const char *fn, bam_hdr_t *header); -+ - #endif ---- python-pysam.orig/samtools/stats.c -+++ python-pysam/samtools/stats.c -@@ -1,6 +1,6 @@ - /* stats.c -- This is the former bamcheck integrated into samtools/htslib. - -- Copyright (C) 2012-2015 Genome Research Ltd. -+ Copyright (C) 2012-2019 Genome Research Ltd. 
- - Author: Petr Danecek - Author: Sam Nicholls -@@ -46,6 +46,7 @@ - #include - #include - #include -+#include - #include - #include - #include -@@ -53,7 +54,7 @@ - #include - #include - #include --#include "sam_header.h" -+#include - #include - #include "samtools.h" - #include -@@ -65,8 +66,10 @@ - #define BWA_MIN_RDLEN 35 - #define DEFAULT_CHUNK_NO 8 - #define DEFAULT_PAIR_MAX 10000 -+#define ERROR_LIMIT 200 - // From the spec - // If 0x4 is set, no assumptions can be made about RNAME, POS, CIGAR, MAPQ, bits 0x2, 0x10, 0x100 and 0x800, and the bit 0x20 of the previous read in the template. -+#define IS_PAIRED(bam) ((bam)->core.flag&BAM_FPAIRED) - #define IS_PAIRED_AND_MAPPED(bam) (((bam)->core.flag&BAM_FPAIRED) && !((bam)->core.flag&BAM_FUNMAP) && !((bam)->core.flag&BAM_FMUNMAP)) - #define IS_PROPERLYPAIRED(bam) (((bam)->core.flag&(BAM_FPAIRED|BAM_FPROPER_PAIR)) == (BAM_FPAIRED|BAM_FPROPER_PAIR) && !((bam)->core.flag&BAM_FUNMAP)) - #define IS_UNMAPPED(bam) ((bam)->core.flag&BAM_FUNMAP) -@@ -77,6 +80,14 @@ - #define IS_DUP(bam) ((bam)->core.flag&BAM_FDUP) - #define IS_ORIGINAL(bam) (((bam)->core.flag&(BAM_FSECONDARY|BAM_FSUPPLEMENTARY)) == 0) - -+#define READ_ORDER_NONE 0 -+#define READ_ORDER_FIRST 1 -+#define READ_ORDER_LAST 2 -+#define READ_ORDER_MIDDLE 3 -+ -+#define REG_INC 100 -+#define POS_INC 1000 -+ - // The GC-depth graph works as follows: split the reference sequence into - // segments and calculate GC content and depth in each bin. 
Then sort - // these segments by their GC and plot the depth distribution by means -@@ -91,17 +102,16 @@ - // For coverage distribution, a simple pileup - typedef struct - { -- int64_t pos; -+ hts_pos_t pos; - int size, start; - int *buffer; - } - round_buffer_t; - --typedef struct { uint32_t from, to; } pos_t; - typedef struct - { -- int npos,mpos,cpos; -- pos_t *pos; -+ int npos, mpos, cpos; -+ hts_pair_pos_t *pos; - } - regions_t; - -@@ -118,6 +128,17 @@ - - typedef struct - { -+ char tag_name[3]; -+ char qual_name[3]; -+ uint32_t nbases; -+ int32_t tag_sep; // Index of the separator (if present) -+ int32_t max_qual; -+ uint32_t offset; // Where the tag stats info is located in the allocated memory -+} -+barcode_info_t; -+ -+typedef struct -+{ - // Auxiliary data - int flag_require, flag_filter; - faidx_t *fai; // Reference sequence for GC-depth graph -@@ -129,7 +150,7 @@ - float isize_main_bulk; // There are always some unrealistically big insert sizes, report only the main part - int cov_min,cov_max,cov_step; // Minimum, maximum coverage and size of the coverage bins - samFile* sam; -- bam_hdr_t* sam_header; -+ sam_hdr_t* sam_header; - - // Filters - int filter_readlen; -@@ -175,6 +196,7 @@ - uint64_t total_len_dup; - uint64_t nreads_1st; - uint64_t nreads_2nd; -+ uint64_t nreads_other; - uint64_t nreads_filtered; - uint64_t nreads_dup; - uint64_t nreads_unmapped; -@@ -196,8 +218,8 @@ - // GC-depth related data - uint32_t ngcd, igcd; // The maximum number of GC depth bins and index of the current bin - gc_depth_t *gcd; // The GC-depth bins holder -- int32_t tid, gcd_pos; // Position of the current bin -- int32_t pos; // Position of the last read -+ int32_t tid; // Position of the current bin -+ hts_pos_t gcd_pos, pos; // Position of the last read - - // Coverage distribution related data - int ncov; // The number of coverage bins -@@ -207,12 +229,13 @@ - // Mismatches by read cycle - uint8_t *rseq_buf; // A buffer for reference sequence to check the mismatches 
against - int mrseq_buf; // The size of the buffer -- int32_t rseq_pos; // The coordinate of the first base in the buffer -- int32_t nrseq_buf; // The used part of the buffer -+ hts_pos_t rseq_pos; // The coordinate of the first base in the buffer -+ int64_t nrseq_buf; // The used part of the buffer - uint64_t *mpc_buf; // Mismatches per cycle - - // Target regions -- int nregions, reg_from, reg_to; -+ int nregions; -+ hts_pos_t reg_from, reg_to; - regions_t *regions; - - // Auxiliary data -@@ -223,13 +246,20 @@ - char* split_name; - - stats_info_t* info; // Pointer to options and settings struct -- pos_t *chunks; -+ hts_pair_pos_t *chunks; - uint32_t nchunks; - - uint32_t pair_count; // Number of active pairs in the pairing hash table - uint32_t target_count; // Number of bases covered by the target file - uint32_t last_pair_tid; - uint32_t last_read_flush; -+ -+ // Barcode statistics -+ acgtno_count_t *acgtno_barcode; -+ uint64_t *quals_barcode; -+ barcode_info_t *tags_barcode; -+ uint32_t ntags; -+ uint32_t error_number; - } - stats_t; - KHASH_MAP_INIT_STR(c2stats, stats_t*) -@@ -237,18 +267,18 @@ - typedef struct { - uint32_t first; // 1 - first read, 2 - second read - uint32_t n, m; // number of chunks, allocated chunks -- pos_t *chunks; // chunk array of size m -+ hts_pair_pos_t *chunks; // chunk array of size m - } pair_t; - KHASH_MAP_INIT_STR(qn2pair, pair_t*) - - --static void error(const char *format, ...); -+static void HTS_NORETURN error(const char *format, ...); - int is_in_regions(bam1_t *bam_line, stats_t *stats); - void realloc_buffers(stats_t *stats, int seq_len); - - static int regions_lt(const void *r1, const void *r2) { -- int64_t from_diff = (int64_t)((pos_t *)r1)->from - (int64_t)((pos_t *)r2)->from; -- int64_t to_diff = (int64_t)((pos_t *)r1)->to - (int64_t)((pos_t *)r2)->to; -+ int64_t from_diff = ((hts_pair_pos_t *)r1)->beg - ((hts_pair_pos_t *)r2)->beg; -+ int64_t to_diff = ((hts_pair_pos_t *)r1)->end - ((hts_pair_pos_t *)r2)->end; - - 
return from_diff > 0 ? 1 : from_diff < 0 ? -1 : to_diff > 0 ? 1 : to_diff < 0 ? -1 : 0; - } -@@ -265,19 +295,19 @@ - return 1 + (depth - min) / step; - } - --static inline int round_buffer_lidx2ridx(int offset, int size, int64_t refpos, int64_t pos) -+static inline int round_buffer_lidx2ridx(int offset, int size, hts_pos_t refpos, hts_pos_t pos) - { - return (offset + (pos-refpos) % size) % size; - } - --void round_buffer_flush(stats_t *stats, int64_t pos) -+void round_buffer_flush(stats_t *stats, hts_pos_t pos) - { - int ibuf,idp; - - if ( pos==stats->cov_rbuf.pos ) - return; - -- int64_t new_pos = pos; -+ hts_pos_t new_pos = pos; - if ( pos==-1 || pos - stats->cov_rbuf.pos >= stats->cov_rbuf.size ) - { - // Flush the whole buffer, but in sequential order, -@@ -285,10 +315,10 @@ - } - - if ( pos < stats->cov_rbuf.pos ) -- error("Expected coordinates in ascending order, got %ld after %ld\n", pos,stats->cov_rbuf.pos); -+ error("Expected coordinates in ascending order, got %"PRIhts_pos" after %"PRIhts_pos"\n", pos, stats->cov_rbuf.pos); - - int ifrom = stats->cov_rbuf.start; -- int ito = round_buffer_lidx2ridx(stats->cov_rbuf.start,stats->cov_rbuf.size,stats->cov_rbuf.pos,pos-1); -+ int ito = round_buffer_lidx2ridx(stats->cov_rbuf.start, stats->cov_rbuf.size, stats->cov_rbuf.pos, pos-1); - if ( ifrom>ito ) - { - for (ibuf=ifrom; ibufcov_rbuf.size; ibuf++) -@@ -309,27 +339,30 @@ - stats->cov[idp]++; - stats->cov_rbuf.buffer[ibuf] = 0; - } -- stats->cov_rbuf.start = (new_pos==-1) ? 0 : round_buffer_lidx2ridx(stats->cov_rbuf.start,stats->cov_rbuf.size,stats->cov_rbuf.pos,pos); -+ stats->cov_rbuf.start = (new_pos==-1) ? 
0 : round_buffer_lidx2ridx(stats->cov_rbuf.start, stats->cov_rbuf.size, stats->cov_rbuf.pos, pos); - stats->cov_rbuf.pos = new_pos; - } - --void round_buffer_insert_read(round_buffer_t *rbuf, int64_t from, int64_t to) -+/** -+ * [from, to) - 0 based half-open -+ */ -+static void round_buffer_insert_read(round_buffer_t *rbuf, hts_pos_t from, hts_pos_t to) - { -- if ( to-from >= rbuf->size ) -- error("The read length too big (%d), please increase the buffer length (currently %d)\n", to-from+1,rbuf->size); -+ if ( to-from > rbuf->size ) -+ error("The read length too big (%"PRIhts_pos"), please increase the buffer length (currently %d)\n", to-from, rbuf->size); - if ( from < rbuf->pos ) -- error("The reads are not sorted (%ld comes after %ld).\n", from,rbuf->pos); -+ error("The reads are not sorted (%"PRIhts_pos" comes after %"PRIhts_pos").\n", from, rbuf->pos); - -- int ifrom,ito,ibuf; -- ifrom = round_buffer_lidx2ridx(rbuf->start,rbuf->size,rbuf->pos,from); -- ito = round_buffer_lidx2ridx(rbuf->start,rbuf->size,rbuf->pos,to); -+ int ifrom, ito, ibuf; -+ ifrom = round_buffer_lidx2ridx(rbuf->start, rbuf->size, rbuf->pos, from); -+ ito = round_buffer_lidx2ridx(rbuf->start, rbuf->size, rbuf->pos, to); - if ( ifrom>ito ) - { - for (ibuf=ifrom; ibufsize; ibuf++) - rbuf->buffer[ibuf]++; - ifrom = 0; - } -- for (ibuf=ifrom; ibuf<=ito; ibuf++) -+ for (ibuf=ifrom; ibufbuffer[ibuf]++; - } - -@@ -362,7 +395,7 @@ - void count_indels(stats_t *stats,bam1_t *bam_line) - { - int is_fwd = IS_REVERSE(bam_line) ? 0 : 1; -- int is_1st = IS_READ1(bam_line) ? 1 : 0; -+ uint32_t order = IS_PAIRED(bam_line) ? (IS_READ1(bam_line) ? READ_ORDER_FIRST : 0) + (IS_READ2(bam_line) ? READ_ORDER_LAST : 0) : READ_ORDER_FIRST; - int icig; - int icycle = 0; - int read_len = bam_line->core.l_qseq; -@@ -377,10 +410,10 @@ - int idx = is_fwd ? 
icycle : read_len-icycle-ncig; - if ( idx<0 ) - error("FIXME: read_len=%d vs icycle=%d\n", read_len,icycle); -- if ( idx >= stats->nbases || idx<0 ) error("FIXME: %d vs %d, %s:%d %s\n", idx,stats->nbases, stats->info->sam_header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam_get_qname(bam_line)); -- if ( is_1st ) -+ if ( idx >= stats->nbases || idx<0 ) error("FIXME: %d vs %d, %s:%"PRIhts_pos" %s\n", idx, stats->nbases, sam_hdr_tid2name(stats->info->sam_header, bam_line->core.tid), bam_line->core.pos+1, bam_get_qname(bam_line)); -+ if ( order == READ_ORDER_FIRST ) - stats->ins_cycles_1st[idx]++; -- else -+ if ( order == READ_ORDER_LAST ) - stats->ins_cycles_2nd[idx]++; - icycle += ncig; - if ( ncig<=stats->nindels ) -@@ -392,9 +425,9 @@ - int idx = is_fwd ? icycle-1 : read_len-icycle-1; - if ( idx<0 ) continue; // discard meaningless deletions - if ( idx >= stats->nbases ) error("FIXME: %d vs %d\n", idx,stats->nbases); -- if ( is_1st ) -+ if ( order == READ_ORDER_FIRST ) - stats->del_cycles_1st[idx]++; -- else -+ if ( order == READ_ORDER_LAST ) - stats->del_cycles_2nd[idx]++; - if ( ncig<=stats->nindels ) - stats->deletions[ncig-1]++; -@@ -420,8 +453,8 @@ - void count_mismatches_per_cycle(stats_t *stats, bam1_t *bam_line, int read_len) - { - int is_fwd = IS_REVERSE(bam_line) ? 0 : 1; -- int icig,iread=0,icycle=0; -- int iref = bam_line->core.pos - stats->rseq_pos; -+ int icig, iread=0, icycle=0; -+ hts_pos_t iref = bam_line->core.pos - stats->rseq_pos; - uint8_t *read = bam_get_seq(bam_line); - uint8_t *quals = bam_get_qual(bam_line); - uint64_t *mpc_buf = stats->mpc_buf; -@@ -454,13 +487,13 @@ - continue; - } - // Ignore H and N CIGARs. The letter are inserted e.g. by TopHat and often require very large -- // chunk of refseq in memory. Not very frequent and not noticable in the stats. -+ // chunk of refseq in memory. Not very frequent and not noticeable in the stats. 
- if ( cig==BAM_CREF_SKIP || cig==BAM_CHARD_CLIP || cig==BAM_CPAD ) continue; - if ( cig!=BAM_CMATCH && cig!=BAM_CEQUAL && cig!=BAM_CDIFF ) // not relying on precalculated diffs -- error("TODO: cigar %d, %s:%d %s\n", cig,stats->info->sam_header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam_get_qname(bam_line)); -+ error("TODO: cigar %d, %s:%"PRIhts_pos" %s\n", cig, sam_hdr_tid2name(stats->info->sam_header, bam_line->core.tid), bam_line->core.pos+1, bam_get_qname(bam_line)); - - if ( ncig+iref > stats->nrseq_buf ) -- error("FIXME: %d+%d > %d, %s, %s:%d\n",ncig,iref,stats->nrseq_buf, bam_get_qname(bam_line),stats->info->sam_header->target_name[bam_line->core.tid],bam_line->core.pos+1); -+ error("FIXME: %d+%"PRIhts_pos" > %"PRId64", %s, %s:%"PRIhts_pos"\n", ncig, iref, stats->nrseq_buf, bam_get_qname(bam_line), sam_hdr_tid2name(stats->info->sam_header, bam_line->core.tid), bam_line->core.pos+1); - - int im; - for (im=0; im=stats->nquals ) -- error("TODO: quality too high %d>=%d (%s %d %s)\n", qual,stats->nquals, stats->info->sam_header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam_get_qname(bam_line)); -+ error("TODO: quality too high %d>=%d (%s %"PRIhts_pos" %s)\n", qual, stats->nquals, sam_hdr_tid2name(stats->info->sam_header, bam_line->core.tid), bam_line->core.pos+1, bam_get_qname(bam_line)); - - int idx = is_fwd ? 
icycle : read_len-icycle-1; - if ( idx>stats->max_len ) -- error("mpc: %d>%d (%s %d %s)\n",idx,stats->max_len,stats->info->sam_header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam_get_qname(bam_line)); -+ error("mpc: %d>%d (%s %"PRIhts_pos" %s)\n", idx, stats->max_len, sam_hdr_tid2name(stats->info->sam_header, bam_line->core.tid), bam_line->core.pos+1, bam_get_qname(bam_line)); - - idx = idx*stats->nquals + qual; - if ( idx>=stats->nquals*stats->nbases ) -@@ -503,11 +536,12 @@ - } - } - --void read_ref_seq(stats_t *stats, int32_t tid, int32_t pos) -+void read_ref_seq(stats_t *stats, int32_t tid, hts_pos_t pos) - { -- int i, fai_ref_len; -- char *fai_ref = faidx_fetch_seq(stats->info->fai, stats->info->sam_header->target_name[tid], pos, pos+stats->mrseq_buf-1, &fai_ref_len); -- if ( fai_ref_len<0 ) error("Failed to fetch the sequence \"%s\"\n", stats->info->sam_header->target_name[tid]); -+ int i; -+ hts_pos_t fai_ref_len; -+ char *fai_ref = faidx_fetch_seq64(stats->info->fai, sam_hdr_tid2name(stats->info->sam_header, tid), pos, pos+stats->mrseq_buf-1, &fai_ref_len); -+ if ( fai_ref_len < 0 ) error("Failed to fetch the sequence \"%s\"\n", sam_hdr_tid2name(stats->info->sam_header, tid)); - - uint8_t *ptr = stats->rseq_buf; - for (i=0; itid = tid; - } - --float fai_gc_content(stats_t *stats, int pos, int len) -+float fai_gc_content(stats_t *stats, hts_pos_t pos, int len) - { - uint32_t gc,count,c; -- int i = pos - stats->rseq_pos, ito = i + len; -+ hts_pos_t i = pos - stats->rseq_pos, ito = i + len; - assert( i>=0 ); - - if ( ito > stats->nrseq_buf ) ito = stats->nrseq_buf; -@@ -568,6 +602,9 @@ - if ( stats->mrseq_bufrseq_buf = realloc(stats->rseq_buf,sizeof(uint8_t)*n); -+ if (!stats->rseq_buf) { -+ error("Could not reallocate reference sequence buffer"); -+ } - stats->mrseq_buf = n; - } - } -@@ -659,6 +696,9 @@ - - // Realloc the coverage distribution buffer - int *rbuffer = calloc(sizeof(int),seq_len*5); -+ if (!rbuffer) { -+ error("Could not allocate 
coverage distribution buffer"); -+ } - n = stats->cov_rbuf.size-stats->cov_rbuf.start; - memcpy(rbuffer,stats->cov_rbuf.buffer+stats->cov_rbuf.start,n); - if ( stats->cov_rbuf.start>1 ) -@@ -688,6 +728,119 @@ - stats->checksum.quals += crc32(0L, qual, (seq_len+1)/2); - } - -+// Collect statistics about the barcode tags specified by init_barcode_tags method -+static void collect_barcode_stats(bam1_t* bam_line, stats_t* stats) { -+ uint32_t nbases, tag, i; -+ acgtno_count_t *acgtno; -+ uint64_t *quals; -+ int32_t *separator, *maxqual; -+ -+ for (tag = 0; tag < stats->ntags; tag++) { -+ const char *barcode_tag = stats->tags_barcode[tag].tag_name, *qual_tag = stats->tags_barcode[tag].qual_name; -+ uint8_t* bc = bam_aux_get(bam_line, barcode_tag); -+ if (!bc) -+ continue; -+ -+ char* barcode = bam_aux2Z(bc); -+ if (!barcode) -+ continue; -+ -+ uint32_t barcode_len = strlen(barcode); -+ if (!stats->tags_barcode[tag].nbases) { // tag seen for the first time -+ uint32_t offset = 0; -+ for (i = 0; i < stats->ntags; i++) -+ offset += stats->tags_barcode[i].nbases; -+ -+ stats->tags_barcode[tag].offset = offset; -+ stats->tags_barcode[tag].nbases = barcode_len; -+ stats->acgtno_barcode = realloc(stats->acgtno_barcode, (offset + barcode_len) * sizeof(acgtno_count_t)); -+ stats->quals_barcode = realloc(stats->quals_barcode, (offset + barcode_len) * stats->nquals * sizeof(uint64_t)); -+ -+ if (!stats->acgtno_barcode || !stats->quals_barcode) -+ error("Error allocating memory. 
Aborting!\n"); -+ -+ memset(stats->acgtno_barcode + offset, 0, barcode_len*sizeof(acgtno_count_t)); -+ memset(stats->quals_barcode + offset*stats->nquals, 0, barcode_len*stats->nquals*sizeof(uint64_t)); -+ } -+ -+ nbases = stats->tags_barcode[tag].nbases; -+ if (barcode_len > nbases) { -+ fprintf(stderr, "Barcodes with tag %s differ in length at sequence '%s'\n", barcode_tag, bam_get_qname(bam_line)); -+ continue; -+ } -+ -+ acgtno = stats->acgtno_barcode + stats->tags_barcode[tag].offset; -+ quals = stats->quals_barcode + stats->tags_barcode[tag].offset*stats->nquals; -+ maxqual = &stats->tags_barcode[tag].max_qual; -+ separator = &stats->tags_barcode[tag].tag_sep; -+ int error_flag = 0; -+ -+ for (i = 0; i < barcode_len; i++) { -+ switch (barcode[i]) { -+ case 'A': -+ acgtno[i].a++; -+ break; -+ case 'C': -+ acgtno[i].c++; -+ break; -+ case 'G': -+ acgtno[i].g++; -+ break; -+ case 'T': -+ acgtno[i].t++; -+ break; -+ case 'N': -+ acgtno[i].n++; -+ break; -+ default: -+ if (*separator >= 0) { -+ if (*separator != i) { -+ if (stats->error_number < ERROR_LIMIT) { -+ fprintf(stderr, "Barcode separator for tag %s is in a different position or wrong barcode content('%s') at sequence '%s'\n", barcode_tag, barcode, bam_get_qname(bam_line)); -+ stats->error_number++; -+ } -+ error_flag = 1; -+ } -+ } else { -+ *separator = i; -+ } -+ } -+ -+ /* don't process the rest of the tag bases */ -+ if (error_flag) -+ break; -+ } -+ -+ /* skip to the next tag */ -+ if (error_flag) -+ continue; -+ -+ uint8_t* qt = bam_aux_get(bam_line, qual_tag); -+ if (!qt) -+ continue; -+ -+ char* barqual = bam_aux2Z(qt); -+ if (!barqual) -+ continue; -+ -+ uint32_t barqual_len = strlen(barqual); -+ if (barqual_len == barcode_len) { -+ for (i = 0; i < barcode_len; i++) { -+ int32_t qual = (int32_t)barqual[i] - '!'; // Phred + 33 -+ if (qual >= 0 && qual < stats->nquals) { -+ quals[i * stats->nquals + qual]++; -+ if (qual > *maxqual) -+ *maxqual = qual; -+ } -+ } -+ } else { -+ if 
(stats->error_number++ < ERROR_LIMIT) { -+ fprintf(stderr, "%s length and %s length don't match for sequence '%s'\n", barcode_tag, qual_tag, bam_get_qname(bam_line)); -+ } -+ } -+ } -+} -+ - // These stats should only be calculated for the original reads ignoring - // supplementary artificial reads otherwise we'll accidentally double count - void collect_orig_read_stats(bam1_t *bam_line, stats_t *stats, int* gc_count_out) -@@ -698,42 +851,48 @@ - if ( bam_line->core.flag & BAM_FQCFAIL ) stats->nreads_QCfailed++; - if ( bam_line->core.flag & BAM_FPAIRED ) stats->nreads_paired_tech++; - -+ uint32_t order = IS_PAIRED(bam_line) ? (IS_READ1(bam_line) ? READ_ORDER_FIRST : 0) + (IS_READ2(bam_line) ? READ_ORDER_LAST : 0) : READ_ORDER_FIRST; -+ - // Count GC and ACGT per cycle. Note that cycle is approximate, clipping is ignored - uint8_t *seq = bam_get_seq(bam_line); -- int i, read_cycle, gc_count = 0, reverse = IS_REVERSE(bam_line), is_first = IS_READ1(bam_line); -- for (i=0; iacgtno_cycles_1st[ read_cycle ].a++ : stats->acgtno_cycles_2nd[ read_cycle ].a++; -- break; -- case 2: -- is_first ? stats->acgtno_cycles_1st[ read_cycle ].c++ : stats->acgtno_cycles_2nd[ read_cycle ].c++; -- gc_count++; -- break; -- case 4: -- is_first ? stats->acgtno_cycles_1st[ read_cycle ].g++ : stats->acgtno_cycles_2nd[ read_cycle ].g++; -- gc_count++; -- break; -- case 8: -- is_first ? stats->acgtno_cycles_1st[ read_cycle ].t++ : stats->acgtno_cycles_2nd[ read_cycle ].t++; -- break; -- case 15: -- is_first ? stats->acgtno_cycles_1st[ read_cycle ].n++ : stats->acgtno_cycles_2nd[ read_cycle ].n++; -- break; -- default: -- /* -- * count "=" sequences in "other" along -- * with MRSVWYHKDB ambiguity codes -- */ -- is_first ? stats->acgtno_cycles_1st[ read_cycle ].other++ : stats->acgtno_cycles_2nd[ read_cycle ].other++; -- break; -+ acgtno_count_t *acgtno_cycles = (order == READ_ORDER_FIRST) ? stats->acgtno_cycles_1st : (order == READ_ORDER_LAST) ? 
stats->acgtno_cycles_2nd : NULL ; -+ if (acgtno_cycles) { -+ for (i=0; ingc-1)/seq_len; -@@ -743,38 +902,48 @@ - // Determine which array (1st or 2nd read) will these stats go to, - // trim low quality bases from end the same way BWA does, - // fill GC histogram -- uint64_t *quals; -+ uint64_t *quals = NULL; - uint8_t *bam_quals = bam_get_qual(bam_line); -- if ( IS_READ2(bam_line) ) -- { -- quals = stats->quals_2nd; -- stats->nreads_2nd++; -- stats->total_len_2nd += seq_len; -- for (i=gc_idx_min; igc_2nd[i]++; -- } -- else -- { -+ -+ switch (order) { -+ case READ_ORDER_FIRST: - quals = stats->quals_1st; - stats->nreads_1st++; - stats->total_len_1st += seq_len; - for (i=gc_idx_min; igc_1st[i]++; -+ break; -+ case READ_ORDER_LAST: -+ quals = stats->quals_2nd; -+ stats->nreads_2nd++; -+ stats->total_len_2nd += seq_len; -+ for (i=gc_idx_min; igc_2nd[i]++; -+ break; -+ default: -+ stats->nreads_other++; - } - if ( stats->info->trim_qual>0 ) - stats->nbases_trimmed += bwa_trim_read(stats->info->trim_qual, bam_quals, seq_len, reverse); - - // Quality histogram and average quality. Clipping is neglected. 
-- for (i=0; i=stats->nquals ) -- error("TODO: quality too high %d>=%d (%s %d %s)\n", qual,stats->nquals,stats->info->sam_header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam_get_qname(bam_line)); -- if ( qual>stats->max_qual ) -- stats->max_qual = qual; -+ if (quals) { -+ for (i=0; i=stats->nquals ) -+ error("TODO: quality too high %d>=%d (%s %"PRIhts_pos" %s)\n", qual, stats->nquals, sam_hdr_tid2name(stats->info->sam_header, bam_line->core.tid), bam_line->core.pos+1, bam_get_qname(bam_line)); -+ if ( qual>stats->max_qual ) -+ stats->max_qual = qual; -+ -+ quals[ i*stats->nquals+qual ]++; -+ stats->sum_qual += qual; -+ } -+ } - -- quals[ i*stats->nquals+qual ]++; -- stats->sum_qual += qual; -+ // Barcode statistics -+ if (order == READ_ORDER_FIRST) { -+ collect_barcode_stats(bam_line, stats); - } - - // Look at the flags and increment appropriate counters (mapped, paired, etc) -@@ -803,7 +972,7 @@ - *gc_count_out = gc_count; - } - --static int cleanup_overlaps(khash_t(qn2pair) *read_pairs, int max) { -+static int cleanup_overlaps(khash_t(qn2pair) *read_pairs, hts_pos_t max) { - if ( !read_pairs ) - return 0; - -@@ -814,7 +983,7 @@ - char *key = (char *)kh_key(read_pairs, k); - pair_t *val = kh_val(read_pairs, k); - if ( val && val->chunks ) { -- if ( val->chunks[val->n-1].to < max ) { -+ if ( val->chunks[val->n-1].end < max ) { - free(val->chunks); - free(val); - free(key); -@@ -828,29 +997,32 @@ - } - } - } -- if ( max == INT_MAX ) -+ if ( max == INT64_MAX ) - kh_destroy(qn2pair, read_pairs); - - return count; - } - --static void remove_overlaps(bam1_t *bam_line, khash_t(qn2pair) *read_pairs, stats_t *stats, int pmin, int pmax) { -+/** -+ * [pmin, pmax) - 0 based half-open -+ */ -+static void remove_overlaps(bam1_t *bam_line, khash_t(qn2pair) *read_pairs, stats_t *stats, hts_pos_t pmin, hts_pos_t pmax) { - if ( !bam_line || !read_pairs || !stats ) - return; - -- uint32_t first = (IS_READ1(bam_line) > 0 ? 1 : 0) + (IS_READ2(bam_line) > 0 ? 
2 : 0) ; -+ uint32_t order = (IS_READ1(bam_line) ? READ_ORDER_FIRST : 0) + (IS_READ2(bam_line) ? READ_ORDER_LAST : 0); - if ( !(bam_line->core.flag & BAM_FPAIRED) || - (bam_line->core.flag & BAM_FMUNMAP) || -- (abs(bam_line->core.isize) >= 2*bam_line->core.l_qseq) || -- (first != 1 && first != 2) ) { -+ (llabs(bam_line->core.isize) >= 2*bam_line->core.l_qseq) || -+ (order != READ_ORDER_FIRST && order != READ_ORDER_LAST) ) { - if ( pmin >= 0 ) -- round_buffer_insert_read(&(stats->cov_rbuf), pmin, pmax-1); -+ round_buffer_insert_read(&(stats->cov_rbuf), pmin, pmax); - return; - } - - char *qname = bam_get_qname(bam_line); - if ( !qname ) { -- fprintf(stderr, "Error retrieving qname for line starting at pos %d\n", bam_line->core.pos); -+ fprintf(stderr, "Error retrieving qname for line starting at pos %"PRIhts_pos"\n", bam_line->core.pos); - return; - } - -@@ -868,8 +1040,7 @@ - - k = kh_put(qn2pair, read_pairs, s, &ret); - if ( -1 == ret ) { -- fprintf(stderr, "Error inserting read '%s' in pair hash table\n", qname); -- return; -+ error("Error inserting read '%s' in pair hash table\n", qname); - } - - pair_t *pc = calloc(1, sizeof(pair_t)); -@@ -879,16 +1050,16 @@ - } - - pc->m = DEFAULT_CHUNK_NO; -- pc->chunks = calloc(pc->m, sizeof(pos_t)); -+ pc->chunks = calloc(pc->m, sizeof(hts_pair_pos_t)); - if ( !pc->chunks ) { - fprintf(stderr, "Error allocating memory\n"); - return; - } - -- pc->chunks[0].from = pmin; -- pc->chunks[0].to = pmax; -+ pc->chunks[0].beg = pmin; -+ pc->chunks[0].end = pmax; - pc->n = 1; -- pc->first = first; -+ pc->first = order; - - kh_val(read_pairs, k) = pc; - stats->pair_count++; -@@ -899,12 +1070,12 @@ - return; - } - -- if ( first == pc->first ) { //chunk from an existing line -+ if ( order == pc->first ) { //chunk from an existing line - if ( pmin == -1 ) - return; - - if ( pc->n == pc->m ) { -- pos_t *tmp = realloc(pc->chunks, (pc->m<<1)*sizeof(pos_t)); -+ hts_pair_pos_t *tmp = realloc(pc->chunks, (pc->m<<1)*sizeof(hts_pair_pos_t)); - if 
( !tmp ) { - fprintf(stderr, "Error allocating memory\n"); - return; -@@ -913,8 +1084,8 @@ - pc->m<<=1; - } - -- pc->chunks[pc->n].from = pmin; -- pc->chunks[pc->n].to = pmax; -+ pc->chunks[pc->n].beg = pmin; -+ pc->chunks[pc->n].end = pmax; - pc->n++; - } else { //the other line, check for overlapping - if ( pmin == -1 && kh_exist(read_pairs, k) ) { //job done, delete entry -@@ -932,28 +1103,28 @@ - - int i; - for (i=0; in; i++) { -- if ( pmin >= pc->chunks[i].to ) -+ if ( pmin >= pc->chunks[i].end ) - continue; - -- if ( pmax <= pc->chunks[i].from ) //no overlap -+ if ( pmax <= pc->chunks[i].beg ) //no overlap - break; - -- if ( pmin < pc->chunks[i].from ) { //overlap at the beginning -- round_buffer_insert_read(&(stats->cov_rbuf), pmin, pc->chunks[i].from-1); -- pmin = pc->chunks[i].from; -+ if ( pmin < pc->chunks[i].beg ) { //overlap at the beginning -+ round_buffer_insert_read(&(stats->cov_rbuf), pmin, pc->chunks[i].beg); -+ pmin = pc->chunks[i].beg; - } - -- if ( pmax <= pc->chunks[i].to ) { //completely contained -+ if ( pmax <= pc->chunks[i].end ) { //completely contained - stats->nbases_mapped_cigar -= (pmax - pmin); - return; - } else { //overlap at the end -- stats->nbases_mapped_cigar -= (pc->chunks[i].to - pmin); -- pmin = pc->chunks[i].to; -+ stats->nbases_mapped_cigar -= (pc->chunks[i].end - pmin); -+ pmin = pc->chunks[i].end; - } - } - } - } -- round_buffer_insert_read(&(stats->cov_rbuf), pmin, pmax-1); -+ round_buffer_insert_read(&(stats->cov_rbuf), pmin, pmax); - } - - void collect_stats(bam1_t *bam_line, stats_t *stats, khash_t(qn2pair) *read_pairs) -@@ -998,15 +1169,17 @@ - stats->nreads_dup++; - } - -+ uint32_t order = IS_PAIRED(bam_line) ? (IS_READ1(bam_line) ? READ_ORDER_FIRST : 0) + (IS_READ2(bam_line) ? 
READ_ORDER_LAST : 0) : READ_ORDER_FIRST; -+ - int read_len = unclipped_length(bam_line); - if ( read_len >= stats->nbases ) - realloc_buffers(stats,read_len); - // Update max_len observed - if ( stats->max_lenmax_len = read_len; -- if ( IS_READ1(bam_line) && stats->max_len_1st < read_len ) -+ if ( order == READ_ORDER_FIRST && stats->max_len_1st < read_len ) - stats->max_len_1st = read_len; -- if ( IS_READ2(bam_line) && stats->max_len_2nd < read_len ) -+ if ( order == READ_ORDER_LAST && stats->max_len_2nd < read_len ) - stats->max_len_2nd = read_len; - - int i; -@@ -1017,8 +1190,8 @@ - if ( IS_ORIGINAL(bam_line) ) - { - stats->read_lengths[read_len]++; -- if ( IS_READ1(bam_line) ) stats->read_lengths_1st[read_len]++; -- if ( IS_READ2(bam_line) ) stats->read_lengths_2nd[read_len]++; -+ if ( order == READ_ORDER_FIRST ) stats->read_lengths_1st[read_len]++; -+ if ( order == READ_ORDER_LAST ) stats->read_lengths_2nd[read_len]++; - collect_orig_read_stats(bam_line, stats, &gc_count); - } - -@@ -1039,7 +1212,7 @@ - isize = stats->info->nisize; - if ( isize>0 || bam_line->core.tid==bam_line->core.mtid ) - { -- int pos_fst = bam_line->core.mpos - bam_line->core.pos; -+ hts_pos_t pos_fst = bam_line->core.mpos - bam_line->core.pos; - int is_fst = IS_READ1(bam_line) ? 1 : -1; - int is_fwd = IS_REVERSE(bam_line) ? -1 : 1; - int is_mfwd = IS_MATE_REVERSE(bam_line) ? 
-1 : 1; -@@ -1075,7 +1248,7 @@ - if ( stats->regions ) - { - // Count only on-target bases -- int iref = bam_line->core.pos + 1; -+ hts_pos_t iref = bam_line->core.pos + 1; - for (i=0; icore.n_cigar; i++) - { - int cig = bam_cigar_op(bam_get_cigar(bam_line)[i]); -@@ -1129,7 +1302,7 @@ - } - - if ( stats->last_pair_tid != bam_line->core.tid) { -- stats->pair_count -= cleanup_overlaps(read_pairs, INT_MAX-1); -+ stats->pair_count -= cleanup_overlaps(read_pairs, INT64_MAX-1); - stats->last_pair_tid = bam_line->core.tid; - stats->last_read_flush = 0; - } -@@ -1181,8 +1354,9 @@ - // Coverage distribution graph - round_buffer_flush(stats,bam_line->core.pos); - if ( stats->regions ) { -- uint32_t p = bam_line->core.pos, pnew, pmin, pmax, j; -- pmin = pmax = i = j = 0; -+ hts_pos_t p = bam_line->core.pos, pnew, pmin = 0, pmax = 0; -+ uint32_t j = 0; -+ i = 0; - while ( j < bam_line->core.n_cigar && i < stats->nchunks ) { - int op = bam_cigar_op(bam_get_cigar(bam_line)[j]); - int oplen = bam_cigar_oplen(bam_get_cigar(bam_line)[j]); -@@ -1190,13 +1364,13 @@ - case BAM_CMATCH: - case BAM_CEQUAL: - case BAM_CDIFF: -- pmin = MAX(p, stats->chunks[i].from-1); -- pmax = MIN(p+oplen, stats->chunks[i].to); -- if ( pmax >= pmin ) { -+ pmin = MAX(p, stats->chunks[i].beg-1); // 0 based -+ pmax = MIN(p+oplen, stats->chunks[i].end); // 1 based -+ if ( pmax > pmin ) { - if ( stats->info->remove_overlaps ) - remove_overlaps(bam_line, read_pairs, stats, pmin, pmax); - else -- round_buffer_insert_read(&(stats->cov_rbuf), pmin, pmax-1); -+ round_buffer_insert_read(&(stats->cov_rbuf), pmin, pmax); - } - break; - case BAM_CDEL: -@@ -1204,7 +1378,7 @@ - } - pnew = p + (bam_cigar_type(op)&2 ? 
oplen : 0); // consumes reference - -- if ( pnew >= stats->chunks[i].to ) { -+ if ( pnew >= stats->chunks[i].end ) { - // go to the next chunk - i++; - } else { -@@ -1214,7 +1388,8 @@ - } - } - } else { -- uint32_t p = bam_line->core.pos, j; -+ hts_pos_t p = bam_line->core.pos; -+ uint32_t j; - for (j = 0; j < bam_line->core.n_cigar; j++) { - int op = bam_cigar_op(bam_get_cigar(bam_line)[j]); - int oplen = bam_cigar_oplen(bam_get_cigar(bam_line)[j]); -@@ -1225,7 +1400,7 @@ - if ( stats->info->remove_overlaps ) - remove_overlaps(bam_line, read_pairs, stats, p, p+oplen); - else -- round_buffer_insert_read(&(stats->cov_rbuf), p, p+oplen-1); -+ round_buffer_insert_read(&(stats->cov_rbuf), p, p+oplen); - break; - case BAM_CDEL: - break; -@@ -1234,7 +1409,7 @@ - } - } - if ( stats->info->remove_overlaps ) -- remove_overlaps(bam_line, read_pairs, stats, -1, -1); //remove the line from the hash table -+ remove_overlaps(bam_line, read_pairs, stats, -1LL, -1LL); //remove the line from the hash table - } - } - -@@ -1255,7 +1430,7 @@ - float n,d; - int k; - -- n = p*(N+1)/100; -+ n = (float)p*(N+1)/100; - k = n; - if ( k<=0 ) - return gcd[0].depth; -@@ -1320,9 +1495,9 @@ - fprintf(to, "# CHK, CRC32 of reads which passed filtering followed by addition (32bit overflow)\n"); - fprintf(to, "CHK\t%08x\t%08x\t%08x\n", stats->checksum.names,stats->checksum.reads,stats->checksum.quals); - fprintf(to, "# Summary Numbers. 
Use `grep ^SN | cut -f 2-` to extract this part.\n"); -- fprintf(to, "SN\traw total sequences:\t%ld\n", (long)(stats->nreads_filtered+stats->nreads_1st+stats->nreads_2nd)); // not counting excluded seqs (and none of the below) -+ fprintf(to, "SN\traw total sequences:\t%ld\n", (long)(stats->nreads_filtered+stats->nreads_1st+stats->nreads_2nd+stats->nreads_other)); // not counting excluded seqs (and none of the below) - fprintf(to, "SN\tfiltered sequences:\t%ld\n", (long)stats->nreads_filtered); -- fprintf(to, "SN\tsequences:\t%ld\n", (long)(stats->nreads_1st+stats->nreads_2nd)); -+ fprintf(to, "SN\tsequences:\t%ld\n", (long)(stats->nreads_1st+stats->nreads_2nd+stats->nreads_other)); - fprintf(to, "SN\tis sorted:\t%d\n", stats->is_sorted ? 1 : 0); - fprintf(to, "SN\t1st fragments:\t%ld\n", (long)stats->nreads_1st); - fprintf(to, "SN\tlast fragments:\t%ld\n", (long)stats->nreads_2nd); -@@ -1344,7 +1519,7 @@ - fprintf(to, "SN\tbases duplicated:\t%ld\n", (long)stats->total_len_dup); - fprintf(to, "SN\tmismatches:\t%ld\t# from NM fields\n", (long)stats->nmismatches); - fprintf(to, "SN\terror rate:\t%e\t# mismatches / bases mapped (cigar)\n", stats->nbases_mapped_cigar ? (float)stats->nmismatches/stats->nbases_mapped_cigar : 0); -- float avg_read_length = (stats->nreads_1st+stats->nreads_2nd)?stats->total_len/(stats->nreads_1st+stats->nreads_2nd):0; -+ float avg_read_length = (stats->nreads_1st+stats->nreads_2nd+stats->nreads_other)?stats->total_len/(stats->nreads_1st+stats->nreads_2nd+stats->nreads_other):0; - fprintf(to, "SN\taverage length:\t%.0f\n", avg_read_length); - fprintf(to, "SN\taverage first fragment length:\t%.0f\n", stats->nreads_1st? (float)stats->total_len_1st/stats->nreads_1st:0); - fprintf(to, "SN\taverage last fragment length:\t%.0f\n", stats->nreads_2nd? 
(float)stats->total_len_2nd/stats->nreads_2nd:0); -@@ -1358,7 +1533,7 @@ - fprintf(to, "SN\toutward oriented pairs:\t%ld\n", (long)nisize_outward); - fprintf(to, "SN\tpairs with other orientation:\t%ld\n", (long)nisize_other); - fprintf(to, "SN\tpairs on different chromosomes:\t%ld\n", (long)stats->nreads_anomalous/2); -- fprintf(to, "SN\tpercentage of properly paired reads (%%):\t%.1f\n", (stats->nreads_1st+stats->nreads_2nd)? (float)(100*stats->nreads_properly_paired)/(stats->nreads_1st+stats->nreads_2nd):0); -+ fprintf(to, "SN\tpercentage of properly paired reads (%%):\t%.1f\n", (stats->nreads_1st+stats->nreads_2nd+stats->nreads_other)? (float)(100*stats->nreads_properly_paired)/(stats->nreads_1st+stats->nreads_2nd+stats->nreads_other):0); - if ( stats->target_count ) { - fprintf(to, "SN\tbases inside the target:\t%u\n", stats->target_count); - for (icov=stats->info->cov_threshold+1; icovncov; icov++) -@@ -1439,11 +1614,18 @@ - 100.*(acgtno_count_1st->other + acgtno_count_2nd->other)/acgt_sum); - - } -+ -+ uint64_t tA=0, tC=0, tG=0, tT=0, tN=0; - fprintf(to, "# ACGT content per cycle for first fragments. Use `grep ^FBC | cut -f 2-` to extract this part. The columns are: cycle; A,C,G,T base counts as a percentage of all A/C/G/T bases [%%]; and N and O counts as a percentage of all A/C/G/T bases [%%]\n"); - for (ibase=0; ibasemax_len; ibase++) - { - acgtno_count_t *acgtno_count_1st = &(stats->acgtno_cycles_1st[ibase]); - uint64_t acgt_sum_1st = acgtno_count_1st->a + acgtno_count_1st->c + acgtno_count_1st->g + acgtno_count_1st->t; -+ tA += acgtno_count_1st->a; -+ tC += acgtno_count_1st->c; -+ tG += acgtno_count_1st->g; -+ tT += acgtno_count_1st->t; -+ tN += acgtno_count_1st->n; - - if ( acgt_sum_1st ) - fprintf(to, "FBC\t%d\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\n", ibase+1, -@@ -1455,11 +1637,19 @@ - 100.*acgtno_count_1st->other/acgt_sum_1st); - - } -+ fprintf(to, "# ACGT raw counters for first fragments. Use `grep ^FTC | cut -f 2-` to extract this part. 
The columns are: A,C,G,T,N base counters\n"); -+ fprintf(to, "FTC\t%" PRId64 "\t%" PRId64 "\t%" PRId64 "\t%" PRId64 "\t%" PRId64 "\n", tA, tC, tG, tT, tN); -+ tA=0, tC=0, tG=0, tT=0, tN=0; - fprintf(to, "# ACGT content per cycle for last fragments. Use `grep ^LBC | cut -f 2-` to extract this part. The columns are: cycle; A,C,G,T base counts as a percentage of all A/C/G/T bases [%%]; and N and O counts as a percentage of all A/C/G/T bases [%%]\n"); - for (ibase=0; ibasemax_len; ibase++) - { - acgtno_count_t *acgtno_count_2nd = &(stats->acgtno_cycles_2nd[ibase]); - uint64_t acgt_sum_2nd = acgtno_count_2nd->a + acgtno_count_2nd->c + acgtno_count_2nd->g + acgtno_count_2nd->t; -+ tA += acgtno_count_2nd->a; -+ tC += acgtno_count_2nd->c; -+ tG += acgtno_count_2nd->g; -+ tT += acgtno_count_2nd->t; -+ tN += acgtno_count_2nd->n; - - if ( acgt_sum_2nd ) - fprintf(to, "LBC\t%d\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\n", ibase+1, -@@ -1471,6 +1661,52 @@ - 100.*acgtno_count_2nd->other/acgt_sum_2nd); - - } -+ fprintf(to, "# ACGT raw counters for last fragments. Use `grep ^LTC | cut -f 2-` to extract this part. The columns are: A,C,G,T,N base counters\n"); -+ fprintf(to, "LTC\t%" PRId64 "\t%" PRId64 "\t%" PRId64 "\t%" PRId64 "\t%" PRId64 "\n", tA, tC, tG, tT, tN); -+ -+ int tag; -+ for (tag=0; tagntags; tag++) { -+ if (stats->tags_barcode[tag].nbases) { -+ fprintf(to, "# ACGT content per cycle for barcodes. Use `grep ^%sC | cut -f 2-` to extract this part. 
The columns are: cycle; A,C,G,T base counts as a percentage of all A/C/G/T bases [%%]; and N counts as a percentage of all A/C/G/T bases [%%]\n", -+ stats->tags_barcode[tag].tag_name); -+ for (ibase=0; ibasetags_barcode[tag].nbases; ibase++) -+ { -+ if (ibase == stats->tags_barcode[tag].tag_sep) -+ continue; -+ -+ acgtno_count_t *acgtno_count = stats->acgtno_barcode + stats->tags_barcode[tag].offset + ibase; -+ uint64_t acgt_sum = acgtno_count->a + acgtno_count->c + acgtno_count->g + acgtno_count->t; -+ -+ if ( acgt_sum ) -+ fprintf(to, "%sC%d\t%d\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\n", stats->tags_barcode[tag].tag_name, -+ stats->tags_barcode[tag].tag_sep < 0 || ibase < stats->tags_barcode[tag].tag_sep ? 1 : 2, -+ stats->tags_barcode[tag].tag_sep < 0 || ibase < stats->tags_barcode[tag].tag_sep ? ibase+1 : ibase-stats->tags_barcode[tag].tag_sep, -+ 100.*acgtno_count->a/acgt_sum, -+ 100.*acgtno_count->c/acgt_sum, -+ 100.*acgtno_count->g/acgt_sum, -+ 100.*acgtno_count->t/acgt_sum, -+ 100.*acgtno_count->n/acgt_sum); -+ } -+ -+ fprintf(to, "# Barcode Qualities. Use `grep ^%sQ | cut -f 2-` to extract this part.\n", stats->tags_barcode[tag].qual_name); -+ fprintf(to, "# Columns correspond to qualities and rows to barcode cycles. First column is the cycle number.\n"); -+ for (ibase=0; ibasetags_barcode[tag].nbases; ibase++) -+ { -+ if (ibase == stats->tags_barcode[tag].tag_sep) -+ continue; -+ -+ fprintf(to, "%sQ%d\t%d", stats->tags_barcode[tag].qual_name, -+ stats->tags_barcode[tag].tag_sep < 0 || ibase < stats->tags_barcode[tag].tag_sep ? 1 : 2, -+ stats->tags_barcode[tag].tag_sep < 0 || ibase < stats->tags_barcode[tag].tag_sep ? ibase+1 : ibase-stats->tags_barcode[tag].tag_sep); -+ for (iqual=0; iqual<=stats->tags_barcode[tag].max_qual; iqual++) -+ { -+ fprintf(to, "\t%ld", (long)stats->quals_barcode[(stats->tags_barcode[tag].offset + ibase)*stats->nquals+iqual]); -+ } -+ fprintf(to, "\n"); -+ } -+ } -+ } -+ - fprintf(to, "# Insert sizes. 
Use `grep ^IS | cut -f 2-` to extract this part. The columns are: insert size, pairs total, inward oriented pairs, outward oriented pairs, other pairs\n"); - for (isize=0; isizeisize->inward(stats->isize->data, isize)); -@@ -1564,14 +1800,15 @@ - } - } - --void init_regions(stats_t *stats, const char *file) -+static void init_regions(stats_t *stats, const char *file) - { - FILE *fp = fopen(file,"r"); - if ( !fp ) error("%s: %s\n",file,strerror(errno)); - - kstring_t line = { 0, 0, NULL }; - int warned = 0, r, p, new_p; -- int prev_tid=-1, prev_pos=-1; -+ int prev_tid=-1; -+ hts_pos_t prev_pos=-1LL; - while (line.l = 0, kgetline(&line, (kgets_func *)fgets, fp) >= 0) - { - if ( line.s[0] == '#' ) continue; -@@ -1592,30 +1829,33 @@ - - if ( tid >= stats->nregions ) - { -- stats->regions = realloc(stats->regions,sizeof(regions_t)*(stats->nregions+100)); -+ if(!(stats->regions = realloc(stats->regions,sizeof(regions_t)*(tid+REG_INC)))) -+ error("Could not allocate memory for region.\n"); -+ - int j; -- for (j=stats->nregions; jnregions+100; j++) -+ for (j=stats->nregions; jregions[j].npos = stats->regions[j].mpos = stats->regions[j].cpos = 0; - stats->regions[j].pos = NULL; - } -- stats->nregions += 100; -+ stats->nregions = tid+REG_INC; - } - int npos = stats->regions[tid].npos; - if ( npos >= stats->regions[tid].mpos ) - { -- stats->regions[tid].mpos += 1000; -- stats->regions[tid].pos = realloc(stats->regions[tid].pos,sizeof(pos_t)*stats->regions[tid].mpos); -+ stats->regions[tid].mpos = npos+POS_INC; -+ if (!(stats->regions[tid].pos = realloc(stats->regions[tid].pos, sizeof(hts_pair_pos_t)*stats->regions[tid].mpos))) -+ error("Could not allocate memory for interval.\n"); - } - -- if ( (sscanf(&line.s[i+1],"%u %u",&stats->regions[tid].pos[npos].from,&stats->regions[tid].pos[npos].to))!=2 ) error("Could not parse the region [%s]\n", &line.s[i+1]); -+ if ( (sscanf(&line.s[i+1],"%"SCNd64" %"SCNd64, &stats->regions[tid].pos[npos].beg, 
&stats->regions[tid].pos[npos].end))!=2 ) error("Could not parse the region [%s]\n", &line.s[i+1]); - if ( prev_tid==-1 || prev_tid!=tid ) - { - prev_tid = tid; -- prev_pos = stats->regions[tid].pos[npos].from; -+ prev_pos = stats->regions[tid].pos[npos].beg; - } -- if ( prev_pos>stats->regions[tid].pos[npos].from ) -- error("The positions are not in chromosomal order (%s:%d comes after %d)\n", line.s,stats->regions[tid].pos[npos].from,prev_pos); -+ if ( prev_pos>stats->regions[tid].pos[npos].beg ) -+ error("The positions are not in chromosomal order (%s:%"PRIhts_pos" comes after %"PRIhts_pos")\n", line.s, stats->regions[tid].pos[npos].beg, prev_pos); - stats->regions[tid].npos++; - if ( stats->regions[tid].npos > stats->nchunks ) - stats->nchunks = stats->regions[tid].npos; -@@ -1628,20 +1868,21 @@ - for (r = 0; r < stats->nregions; r++) { - regions_t *reg = &stats->regions[r]; - if ( reg->npos > 1 ) { -- qsort(reg->pos, reg->npos, sizeof(pos_t), regions_lt); -+ qsort(reg->pos, reg->npos, sizeof(hts_pair_pos_t), regions_lt); - for (new_p = 0, p = 1; p < reg->npos; p++) { -- if ( reg->pos[new_p].to < reg->pos[p].from ) -+ if ( reg->pos[new_p].end < reg->pos[p].beg ) - reg->pos[++new_p] = reg->pos[p]; -- else if ( reg->pos[new_p].to < reg->pos[p].to ) -- reg->pos[new_p].to = reg->pos[p].to; -+ else if ( reg->pos[new_p].end < reg->pos[p].end ) -+ reg->pos[new_p].end = reg->pos[p].end; - } - reg->npos = ++new_p; - } - for (p = 0; p < reg->npos; p++) -- stats->target_count += (reg->pos[p].to - reg->pos[p].from + 1); -+ stats->target_count += (reg->pos[p].end - reg->pos[p].beg + 1); - } - -- stats->chunks = calloc(stats->nchunks, sizeof(pos_t)); -+ if (!(stats->chunks = calloc(stats->nchunks, sizeof(hts_pair_pos_t)))) -+ error("Could not allocate memory for chunk.\n"); - } - - void destroy_regions(stats_t *stats) -@@ -1676,22 +1917,22 @@ - // Find a matching interval or skip this read. 
No splicing of reads is done, no indels or soft clips considered, - // even small overlap is enough to include the read in the stats. - int i = reg->cpos; -- while ( inpos && reg->pos[i].to<=bam_line->core.pos ) i++; -+ while ( inpos && reg->pos[i].end<=bam_line->core.pos ) i++; - if ( i>=reg->npos ) { reg->cpos = reg->npos; return 0; } - int64_t endpos = bam_endpos(bam_line); -- if ( endpos < reg->pos[i].from ) return 0; -+ if ( endpos < reg->pos[i].beg ) return 0; - - //found a read overlapping a region - reg->cpos = i; -- stats->reg_from = reg->pos[i].from; -- stats->reg_to = reg->pos[i].to; -+ stats->reg_from = reg->pos[i].beg; -+ stats->reg_to = reg->pos[i].end; - - //now find all the overlapping chunks - stats->nchunks = 0; - while (i < reg->npos) { -- if (bam_line->core.pos < reg->pos[i].to && endpos >= reg->pos[i].from) { -- stats->chunks[stats->nchunks].from = MAX(bam_line->core.pos+1, reg->pos[i].from); -- stats->chunks[stats->nchunks].to = MIN(endpos, reg->pos[i].to); -+ if (bam_line->core.pos < reg->pos[i].end && endpos >= reg->pos[i].beg) { -+ stats->chunks[stats->nchunks].beg = MAX(bam_line->core.pos+1, reg->pos[i].beg); -+ stats->chunks[stats->nchunks].end = MIN(endpos, reg->pos[i].end); - stats->nchunks++; - } - i++; -@@ -1707,7 +1948,7 @@ - int i, j, tid; - stats->nregions = iter->n_reg; - stats->regions = calloc(stats->nregions, sizeof(regions_t)); -- stats->chunks = calloc(stats->nchunks, sizeof(pos_t)); -+ stats->chunks = calloc(stats->nchunks, sizeof(hts_pair_pos_t)); - if ( !stats->regions || !stats->chunks ) - return 1; - -@@ -1727,15 +1968,15 @@ - } - - stats->regions[tid].mpos = stats->regions[tid].npos = iter->reg_list[i].count; -- stats->regions[tid].pos = calloc(stats->regions[tid].mpos, sizeof(pos_t)); -+ stats->regions[tid].pos = calloc(stats->regions[tid].mpos, sizeof(hts_pair_pos_t)); - if ( !stats->regions[tid].pos ) - return 1; - - for (j = 0; j < stats->regions[tid].npos; j++) { -- stats->regions[tid].pos[j].from = 
iter->reg_list[i].intervals[j].beg+1; -- stats->regions[tid].pos[j].to = iter->reg_list[i].intervals[j].end; -+ stats->regions[tid].pos[j].beg = iter->reg_list[i].intervals[j].beg+1; -+ stats->regions[tid].pos[j].end = iter->reg_list[i].intervals[j].end; - -- stats->target_count += (stats->regions[tid].pos[j].to - stats->regions[tid].pos[j].from + 1); -+ stats->target_count += (stats->regions[tid].pos[j].end - stats->regions[tid].pos[j].beg + 1); - } - } - -@@ -1773,7 +2014,7 @@ - } - - --static void error(const char *format, ...) -+static void HTS_NORETURN error(const char *format, ...) - { - if ( !format ) - { -@@ -1783,13 +2024,14 @@ - printf("Options:\n"); - printf(" -c, --coverage ,, Coverage distribution min,max,step [1,1000,1]\n"); - printf(" -d, --remove-dups Exclude from statistics reads marked as duplicates\n"); -+ printf(" -X, --customized-index-file Use a customized index file\n"); - printf(" -f, --required-flag Required flag, 0 for unset. See also `samtools flags` [0]\n"); - printf(" -F, --filtering-flag Filtering flag, 0 for unset. See also `samtools flags` [0]\n"); - printf(" --GC-depth the size of GC-depth bins (decreasing bin size increases memory requirement) [2e4]\n"); - printf(" -h, --help This help message\n"); - printf(" -i, --insert-size Maximum insert size [8000]\n"); - printf(" -I, --id Include only listed read group or sample name\n"); -- printf(" -l, --read-length Include in the statistics only reads with the given read length []\n"); -+ printf(" -l, --read-length Include in the statistics only reads with the given read length [-1]\n"); - printf(" -m, --most-inserts Report only the main part of inserts [0.99]\n"); - printf(" -P, --split-prefix Path or string prefix for filepaths output by -S (default is input filename)\n"); - printf(" -q, --trim-quality The BWA trimming parameter [0]\n"); -@@ -1799,8 +2041,8 @@ - printf(" -t, --target-regions Do stats in these regions only. 
Tab-delimited file chr,from,to, 1-based, inclusive.\n"); - printf(" -x, --sparse Suppress outputting IS rows where there are no insertions.\n"); - printf(" -p, --remove-overlaps Remove overlaps of paired-end reads from coverage and base count computations.\n"); -- printf(" -g, --cov-threshold Only bases with coverage above this value will be included in the target percentage computation.\n"); -- sam_global_opt_help(stdout, "-.--.@"); -+ printf(" -g, --cov-threshold Only bases with coverage above this value will be included in the target percentage computation [0]\n"); -+ sam_global_opt_help(stdout, "-.--.@-."); - printf("\n"); - } - else -@@ -1840,6 +2082,9 @@ - free(stats->ins_cycles_2nd); - free(stats->del_cycles_1st); - free(stats->del_cycles_2nd); -+ if (stats->acgtno_barcode) free(stats->acgtno_barcode); -+ if (stats->quals_barcode) free(stats->quals_barcode); -+ free(stats->tags_barcode); - destroy_regions(stats); - if ( stats->rg_hash ) khash_str2int_destroy(stats->rg_hash); - free(stats->split_name); -@@ -1878,6 +2123,9 @@ - - void destroy_split_stats(khash_t(c2stats) *split_hash) - { -+ if (!split_hash) -+ return; -+ - int i = 0; - stats_t *curr_stats = NULL; - for(i = kh_begin(split_hash); i != kh_end(split_hash); ++i){ -@@ -1891,6 +2139,10 @@ - stats_info_t* stats_info_init(int argc, char *argv[]) - { - stats_info_t* info = calloc(1, sizeof(stats_info_t)); -+ if (!info) { -+ return NULL; -+ } -+ - info->nisize = 8000; - info->isize_main_bulk = 0.99; // There are always outliers at the far end - info->gcd_bin_size = 20e3; -@@ -1926,11 +2178,15 @@ - stats_t* stats_init() - { - stats_t *stats = calloc(1,sizeof(stats_t)); -+ if (!stats) -+ return NULL; -+ - stats->ngc = 200; - stats->nquals = 256; - stats->nbases = 300; - stats->rseq_pos = -1; -- stats->tid = stats->gcd_pos = -1; -+ stats->tid = -1; -+ stats->gcd_pos = -1LL; - stats->igcd = 0; - stats->is_sorted = 1; - stats->nindels = stats->nbases; -@@ -1944,6 +2200,18 @@ - return stats; - } - -+static int 
init_barcode_tags(stats_t* stats) { -+ stats->ntags = 4; -+ stats->tags_barcode = calloc(stats->ntags, sizeof(barcode_info_t)); -+ if (!stats->tags_barcode) -+ return -1; -+ stats->tags_barcode[0] = (barcode_info_t){"BC", "QT", 0, -1, -1, 0}; -+ stats->tags_barcode[1] = (barcode_info_t){"CR", "CY", 0, -1, -1, 0}; -+ stats->tags_barcode[2] = (barcode_info_t){"OX", "BZ", 0, -1, -1, 0}; -+ stats->tags_barcode[3] = (barcode_info_t){"RX", "QX", 0, -1, -1, 0}; -+ return 0; -+} -+ - static void init_stat_structs(stats_t* stats, stats_info_t* info, const char* group_id, const char* targets) - { - // Give stats_t a pointer to the info struct -@@ -1961,32 +2229,60 @@ - stats->ncov = 3 + (info->cov_max-info->cov_min) / info->cov_step; - info->cov_max = info->cov_min + ((info->cov_max-info->cov_min)/info->cov_step +1)*info->cov_step - 1; - stats->cov = calloc(sizeof(uint64_t),stats->ncov); -+ if (!stats->cov) goto nomem; - stats->cov_rbuf.size = stats->nbases*5; - stats->cov_rbuf.buffer = calloc(sizeof(int32_t),stats->cov_rbuf.size); -- -+ if (!stats->cov_rbuf.buffer) goto nomem; - if ( group_id ) init_group_id(stats, group_id); - // .. arrays - stats->quals_1st = calloc(stats->nquals*stats->nbases,sizeof(uint64_t)); -+ if (!stats->quals_1st) goto nomem; - stats->quals_2nd = calloc(stats->nquals*stats->nbases,sizeof(uint64_t)); -+ if (!stats->quals_2nd) goto nomem; - stats->gc_1st = calloc(stats->ngc,sizeof(uint64_t)); -+ if (!stats->gc_1st) goto nomem; - stats->gc_2nd = calloc(stats->ngc,sizeof(uint64_t)); -+ if (!stats->gc_2nd) goto nomem; - stats->isize = init_isize_t(info->nisize ?info->nisize+1 :0); -+ if (!stats->isize) goto nomem; - stats->gcd = calloc(stats->ngcd,sizeof(gc_depth_t)); -- stats->mpc_buf = info->fai ? 
calloc(stats->nquals*stats->nbases,sizeof(uint64_t)) : NULL; -+ if (!stats->gcd) goto nomem; -+ if (info->fai) { -+ stats->mpc_buf = calloc(stats->nquals*stats->nbases,sizeof(uint64_t)); -+ if (!stats->mpc_buf) goto nomem; -+ } else { -+ stats->mpc_buf = NULL; -+ } - stats->acgtno_cycles_1st = calloc(stats->nbases,sizeof(acgtno_count_t)); -+ if (!stats->acgtno_cycles_1st) goto nomem; - stats->acgtno_cycles_2nd = calloc(stats->nbases,sizeof(acgtno_count_t)); -+ if (!stats->acgtno_cycles_2nd) goto nomem; - stats->read_lengths = calloc(stats->nbases,sizeof(uint64_t)); -+ if (!stats->read_lengths) goto nomem; - stats->read_lengths_1st = calloc(stats->nbases,sizeof(uint64_t)); -+ if (!stats->read_lengths_1st) goto nomem; - stats->read_lengths_2nd = calloc(stats->nbases,sizeof(uint64_t)); -+ if (!stats->read_lengths_2nd) goto nomem; - stats->insertions = calloc(stats->nbases,sizeof(uint64_t)); -+ if (!stats->insertions) goto nomem; - stats->deletions = calloc(stats->nbases,sizeof(uint64_t)); -+ if (!stats->deletions) goto nomem; - stats->ins_cycles_1st = calloc(stats->nbases+1,sizeof(uint64_t)); -+ if (!stats->ins_cycles_1st) goto nomem; - stats->ins_cycles_2nd = calloc(stats->nbases+1,sizeof(uint64_t)); -+ if (!stats->ins_cycles_2nd) goto nomem; - stats->del_cycles_1st = calloc(stats->nbases+1,sizeof(uint64_t)); -+ if (!stats->del_cycles_1st) goto nomem; - stats->del_cycles_2nd = calloc(stats->nbases+1,sizeof(uint64_t)); -+ if (!stats->del_cycles_2nd) goto nomem; -+ if (init_barcode_tags(stats) < 0) -+ goto nomem; - realloc_rseq_buffer(stats); - if ( targets ) - init_regions(stats, targets); -+ return; -+ nomem: -+ error("Out of memory"); - } - - static stats_t* get_curr_split_stats(bam1_t* bam_line, khash_t(c2stats)* split_hash, stats_info_t* info, char* targets) -@@ -2002,6 +2298,9 @@ - khiter_t k = kh_get(c2stats, split_hash, split_name); - if(k == kh_end(split_hash)){ - curr_stats = stats_init(); // mallocs new instance -+ if (!curr_stats) { -+ error("Couldn't 
allocate split stats"); -+ } - init_stat_structs(curr_stats, info, NULL, targets); - curr_stats->split_name = split_name; - -@@ -2024,11 +2323,16 @@ - { - char *targets = NULL; - char *bam_fname = NULL; -+ char *bam_idx_fname = NULL; - char *group_id = NULL; -- int sparse = 0; -+ int sparse = 0, has_index_file = 0, ret = 1; - sam_global_args ga = SAM_GLOBAL_ARGS_INIT; - - stats_info_t *info = stats_info_init(argc, argv); -+ if (!info) { -+ fprintf(stderr, "Could not allocate memory for info.\n"); -+ return 1; -+ } - - static const struct option loptions[] = - { -@@ -2036,6 +2340,7 @@ - {"help", no_argument, NULL, 'h'}, - {"remove-dups", no_argument, NULL, 'd'}, - {"sam", no_argument, NULL, 's'}, -+ {"customized-index-file", required_argument, NULL, 'X'}, - {"ref-seq", required_argument, NULL, 'r'}, - {"coverage", required_argument, NULL, 'c'}, - {"read-length", required_argument, NULL, 'l'}, -@@ -2056,13 +2361,14 @@ - }; - int opt; - -- while ( (opt=getopt_long(argc,argv,"?hdsxpr:c:l:i:t:m:q:f:F:g:I:1:S:P:@:",loptions,NULL))>0 ) -+ while ( (opt=getopt_long(argc,argv,"?hdsXxpr:c:l:i:t:m:q:f:F:g:I:S:P:@:",loptions,NULL))>0 ) - { - switch (opt) - { - case 'f': info->flag_require = bam_str2flag(optarg); break; - case 'F': info->flag_filter |= bam_str2flag(optarg); break; - case 'd': info->flag_filter |= BAM_FDUP; break; -+ case 'X': has_index_file = 1; break; - case 's': break; - case 'r': info->fai = fai_load(optarg); - if (info->fai==NULL) -@@ -2088,15 +2394,15 @@ - break; - case '?': - case 'h': error(NULL); -+ /* no break */ - default: - if (parse_sam_global_opt(opt, optarg, loptions, &ga) != 0) - error("Unknown argument: %s\n", optarg); - break; - } - } -- if ( optind 0) - hts_set_threads(info->sam, ga.nthreads); - - stats_t *all_stats = stats_init(); -+ if (!all_stats) { -+ fprintf(stderr, "Could not allocate memory for stats.\n"); -+ cleanup_stats_info(info); -+ return 1; -+ } - stats_t *curr_stats = NULL; - init_stat_structs(all_stats, info, group_id, targets); 
- // Init - // .. hash - khash_t(c2stats)* split_hash = kh_init(c2stats); -+ if (!split_hash) goto cleanup_all_stats; - - khash_t(qn2pair)* read_pairs = kh_init(qn2pair); -+ if (!read_pairs) goto cleanup_split_hash; - - // Collect statistics - bam1_t *bam_line = bam_init1(); -- if ( optindsam,bam_fname); -- if (bam_idx) { -- -- int regcount = 0; -- hts_reglist_t *reglist = bed_reglist(region_hash, ALL, ®count); -- if (reglist) { -- -- hts_itr_multi_t *iter = sam_itr_regions(bam_idx, info->sam_header, reglist, regcount); -- if (iter) { -- -- if (!targets) { -- all_stats->nchunks = argc-optind; -- if ( replicate_regions(all_stats, iter) ) -- fprintf(stderr, "Replications of the regions failed."); -- } -+ if (!bam_line) goto cleanup_read_pairs; -+ -+ if (optind < argc) { -+ // Region:interval arguments in the command line -+ hts_idx_t *bam_idx = NULL; -+ if (has_index_file) { -+ bam_idx = sam_index_load2(info->sam, bam_fname, bam_idx_fname); -+ } else { -+ // If an index filename has not been specified, look alongside the alignment file -+ bam_idx = sam_index_load(info->sam, bam_fname); -+ } -+ -+ if (bam_idx) { -+ hts_itr_multi_t *iter = sam_itr_regarray(bam_idx, info->sam_header, &argv[optind], argc - optind); -+ if (iter) { -+ if (!targets) { -+ all_stats->nchunks = argc-optind; -+ if (replicate_regions(all_stats, iter)) -+ fprintf(stderr, "Replications of the regions failed\n"); -+ } - -- if ( all_stats->nregions && all_stats->regions ) { -- while (sam_itr_multi_next(info->sam, iter, bam_line) >= 0) { -- if (info->split_tag) { -- curr_stats = get_curr_split_stats(bam_line, split_hash, info, targets); -- collect_stats(bam_line, curr_stats, read_pairs); -- } -- collect_stats(bam_line, all_stats, read_pairs); -- } -+ if ( all_stats->nregions && all_stats->regions ) { -+ while ((ret = sam_itr_next(info->sam, iter, bam_line)) >= 0) { -+ if (info->split_tag) { -+ curr_stats = get_curr_split_stats(bam_line, split_hash, info, targets); -+ collect_stats(bam_line, 
curr_stats, read_pairs); - } -+ collect_stats(bam_line, all_stats, read_pairs); -+ } - -+ if (ret < -1) { -+ fprintf(stderr, "Failure while running the iterator\n"); - hts_itr_multi_destroy(iter); -- } else { -- fprintf(stderr, "Creation of the region iterator failed."); -- hts_reglist_free(reglist, regcount); -+ hts_idx_destroy(bam_idx); -+ goto cleanup; - } -- } else { -- fprintf(stderr, "Creation of the region list failed."); - } -- -- hts_idx_destroy(bam_idx); -+ hts_itr_multi_destroy(iter); - } else { -- fprintf(stderr, "Random alignment retrieval only works for indexed BAM files.\n"); -+ fprintf(stderr, "Multi-region iterator could not be created\n"); -+ hts_idx_destroy(bam_idx); -+ goto cleanup; - } -- -- bed_destroy(region_hash); -+ hts_idx_destroy(bam_idx); - } else { -- fprintf(stderr, "Creation of the region hash table failed.\n"); -+ if (has_index_file) -+ fprintf(stderr, "Invalid index file '%s'\n", bam_idx_fname); -+ fprintf(stderr, "Random alignment retrieval only works for indexed files\n"); -+ goto cleanup; - } -- } -- else -- { -+ } else { - if ( info->cov_threshold > 0 && !targets ) { -- fprintf(stderr, "Coverage percentage calcuation requires a list of target regions\n"); -+ fprintf(stderr, "Coverage percentage calculation requires a list of target regions\n"); - goto cleanup; - } - - // Stream through the entire BAM ignoring off-target regions if -t is given -- int ret; - while ((ret = sam_read1(info->sam, info->sam_header, bam_line)) >= 0) { - if (info->split_tag) { - curr_stats = get_curr_split_stats(bam_line, split_hash, info, targets); -@@ -2194,7 +2509,7 @@ - - if (ret < -1) { - fprintf(stderr, "Failure while decoding file\n"); -- return 1; -+ goto cleanup; - } - } - -@@ -2203,15 +2518,19 @@ - if (info->split_tag) - output_split_stats(split_hash, bam_fname, sparse); - -+ ret = 0; - cleanup: - bam_destroy1(bam_line); -- bam_hdr_destroy(info->sam_header); -+ sam_hdr_destroy(info->sam_header); - sam_global_args_free(&ga); - 
-+cleanup_read_pairs: -+ cleanup_overlaps(read_pairs, INT64_MAX); -+cleanup_split_hash: -+ destroy_split_stats(split_hash); -+cleanup_all_stats: - cleanup_stats(all_stats); - cleanup_stats_info(info); -- destroy_split_stats(split_hash); -- cleanup_overlaps(read_pairs, INT_MAX); - -- return 0; -+ return ret; - } ---- python-pysam.orig/samtools/stats.c.pysam.c -+++ python-pysam/samtools/stats.c.pysam.c -@@ -2,7 +2,7 @@ - - /* stats.c -- This is the former bamcheck integrated into samtools/htslib. - -- Copyright (C) 2012-2015 Genome Research Ltd. -+ Copyright (C) 2012-2019 Genome Research Ltd. - - Author: Petr Danecek - Author: Sam Nicholls -@@ -48,6 +48,7 @@ - #include - #include - #include -+#include - #include - #include - #include -@@ -55,7 +56,7 @@ - #include - #include - #include --#include "sam_header.h" -+#include - #include - #include "samtools.h" - #include -@@ -67,8 +68,10 @@ - #define BWA_MIN_RDLEN 35 - #define DEFAULT_CHUNK_NO 8 - #define DEFAULT_PAIR_MAX 10000 -+#define ERROR_LIMIT 200 - // From the spec - // If 0x4 is set, no assumptions can be made about RNAME, POS, CIGAR, MAPQ, bits 0x2, 0x10, 0x100 and 0x800, and the bit 0x20 of the previous read in the template. 
-+#define IS_PAIRED(bam) ((bam)->core.flag&BAM_FPAIRED) - #define IS_PAIRED_AND_MAPPED(bam) (((bam)->core.flag&BAM_FPAIRED) && !((bam)->core.flag&BAM_FUNMAP) && !((bam)->core.flag&BAM_FMUNMAP)) - #define IS_PROPERLYPAIRED(bam) (((bam)->core.flag&(BAM_FPAIRED|BAM_FPROPER_PAIR)) == (BAM_FPAIRED|BAM_FPROPER_PAIR) && !((bam)->core.flag&BAM_FUNMAP)) - #define IS_UNMAPPED(bam) ((bam)->core.flag&BAM_FUNMAP) -@@ -79,6 +82,14 @@ - #define IS_DUP(bam) ((bam)->core.flag&BAM_FDUP) - #define IS_ORIGINAL(bam) (((bam)->core.flag&(BAM_FSECONDARY|BAM_FSUPPLEMENTARY)) == 0) - -+#define READ_ORDER_NONE 0 -+#define READ_ORDER_FIRST 1 -+#define READ_ORDER_LAST 2 -+#define READ_ORDER_MIDDLE 3 -+ -+#define REG_INC 100 -+#define POS_INC 1000 -+ - // The GC-depth graph works as follows: split the reference sequence into - // segments and calculate GC content and depth in each bin. Then sort - // these segments by their GC and plot the depth distribution by means -@@ -93,17 +104,16 @@ - // For coverage distribution, a simple pileup - typedef struct - { -- int64_t pos; -+ hts_pos_t pos; - int size, start; - int *buffer; - } - round_buffer_t; - --typedef struct { uint32_t from, to; } pos_t; - typedef struct - { -- int npos,mpos,cpos; -- pos_t *pos; -+ int npos, mpos, cpos; -+ hts_pair_pos_t *pos; - } - regions_t; - -@@ -120,6 +130,17 @@ - - typedef struct - { -+ char tag_name[3]; -+ char qual_name[3]; -+ uint32_t nbases; -+ int32_t tag_sep; // Index of the separator (if present) -+ int32_t max_qual; -+ uint32_t offset; // Where the tag stats info is located in the allocated memory -+} -+barcode_info_t; -+ -+typedef struct -+{ - // Auxiliary data - int flag_require, flag_filter; - faidx_t *fai; // Reference sequence for GC-depth graph -@@ -131,7 +152,7 @@ - float isize_main_bulk; // There are always some unrealistically big insert sizes, report only the main part - int cov_min,cov_max,cov_step; // Minimum, maximum coverage and size of the coverage bins - samFile* sam; -- bam_hdr_t* sam_header; 
-+ sam_hdr_t* sam_header; - - // Filters - int filter_readlen; -@@ -177,6 +198,7 @@ - uint64_t total_len_dup; - uint64_t nreads_1st; - uint64_t nreads_2nd; -+ uint64_t nreads_other; - uint64_t nreads_filtered; - uint64_t nreads_dup; - uint64_t nreads_unmapped; -@@ -198,8 +220,8 @@ - // GC-depth related data - uint32_t ngcd, igcd; // The maximum number of GC depth bins and index of the current bin - gc_depth_t *gcd; // The GC-depth bins holder -- int32_t tid, gcd_pos; // Position of the current bin -- int32_t pos; // Position of the last read -+ int32_t tid; // Position of the current bin -+ hts_pos_t gcd_pos, pos; // Position of the last read - - // Coverage distribution related data - int ncov; // The number of coverage bins -@@ -209,12 +231,13 @@ - // Mismatches by read cycle - uint8_t *rseq_buf; // A buffer for reference sequence to check the mismatches against - int mrseq_buf; // The size of the buffer -- int32_t rseq_pos; // The coordinate of the first base in the buffer -- int32_t nrseq_buf; // The used part of the buffer -+ hts_pos_t rseq_pos; // The coordinate of the first base in the buffer -+ int64_t nrseq_buf; // The used part of the buffer - uint64_t *mpc_buf; // Mismatches per cycle - - // Target regions -- int nregions, reg_from, reg_to; -+ int nregions; -+ hts_pos_t reg_from, reg_to; - regions_t *regions; - - // Auxiliary data -@@ -225,13 +248,20 @@ - char* split_name; - - stats_info_t* info; // Pointer to options and settings struct -- pos_t *chunks; -+ hts_pair_pos_t *chunks; - uint32_t nchunks; - - uint32_t pair_count; // Number of active pairs in the pairing hash table - uint32_t target_count; // Number of bases covered by the target file - uint32_t last_pair_tid; - uint32_t last_read_flush; -+ -+ // Barcode statistics -+ acgtno_count_t *acgtno_barcode; -+ uint64_t *quals_barcode; -+ barcode_info_t *tags_barcode; -+ uint32_t ntags; -+ uint32_t error_number; - } - stats_t; - KHASH_MAP_INIT_STR(c2stats, stats_t*) -@@ -239,18 +269,18 @@ - typedef 
struct { - uint32_t first; // 1 - first read, 2 - second read - uint32_t n, m; // number of chunks, allocated chunks -- pos_t *chunks; // chunk array of size m -+ hts_pair_pos_t *chunks; // chunk array of size m - } pair_t; - KHASH_MAP_INIT_STR(qn2pair, pair_t*) - - --static void error(const char *format, ...); -+static void HTS_NORETURN error(const char *format, ...); - int is_in_regions(bam1_t *bam_line, stats_t *stats); - void realloc_buffers(stats_t *stats, int seq_len); - - static int regions_lt(const void *r1, const void *r2) { -- int64_t from_diff = (int64_t)((pos_t *)r1)->from - (int64_t)((pos_t *)r2)->from; -- int64_t to_diff = (int64_t)((pos_t *)r1)->to - (int64_t)((pos_t *)r2)->to; -+ int64_t from_diff = ((hts_pair_pos_t *)r1)->beg - ((hts_pair_pos_t *)r2)->beg; -+ int64_t to_diff = ((hts_pair_pos_t *)r1)->end - ((hts_pair_pos_t *)r2)->end; - - return from_diff > 0 ? 1 : from_diff < 0 ? -1 : to_diff > 0 ? 1 : to_diff < 0 ? -1 : 0; - } -@@ -267,19 +297,19 @@ - return 1 + (depth - min) / step; - } - --static inline int round_buffer_lidx2ridx(int offset, int size, int64_t refpos, int64_t pos) -+static inline int round_buffer_lidx2ridx(int offset, int size, hts_pos_t refpos, hts_pos_t pos) - { - return (offset + (pos-refpos) % size) % size; - } - --void round_buffer_flush(stats_t *stats, int64_t pos) -+void round_buffer_flush(stats_t *stats, hts_pos_t pos) - { - int ibuf,idp; - - if ( pos==stats->cov_rbuf.pos ) - return; - -- int64_t new_pos = pos; -+ hts_pos_t new_pos = pos; - if ( pos==-1 || pos - stats->cov_rbuf.pos >= stats->cov_rbuf.size ) - { - // Flush the whole buffer, but in sequential order, -@@ -287,10 +317,10 @@ - } - - if ( pos < stats->cov_rbuf.pos ) -- error("Expected coordinates in ascending order, got %ld after %ld\n", pos,stats->cov_rbuf.pos); -+ error("Expected coordinates in ascending order, got %"PRIhts_pos" after %"PRIhts_pos"\n", pos, stats->cov_rbuf.pos); - - int ifrom = stats->cov_rbuf.start; -- int ito = 
round_buffer_lidx2ridx(stats->cov_rbuf.start,stats->cov_rbuf.size,stats->cov_rbuf.pos,pos-1); -+ int ito = round_buffer_lidx2ridx(stats->cov_rbuf.start, stats->cov_rbuf.size, stats->cov_rbuf.pos, pos-1); - if ( ifrom>ito ) - { - for (ibuf=ifrom; ibufcov_rbuf.size; ibuf++) -@@ -311,27 +341,30 @@ - stats->cov[idp]++; - stats->cov_rbuf.buffer[ibuf] = 0; - } -- stats->cov_rbuf.start = (new_pos==-1) ? 0 : round_buffer_lidx2ridx(stats->cov_rbuf.start,stats->cov_rbuf.size,stats->cov_rbuf.pos,pos); -+ stats->cov_rbuf.start = (new_pos==-1) ? 0 : round_buffer_lidx2ridx(stats->cov_rbuf.start, stats->cov_rbuf.size, stats->cov_rbuf.pos, pos); - stats->cov_rbuf.pos = new_pos; - } - --void round_buffer_insert_read(round_buffer_t *rbuf, int64_t from, int64_t to) -+/** -+ * [from, to) - 0 based half-open -+ */ -+static void round_buffer_insert_read(round_buffer_t *rbuf, hts_pos_t from, hts_pos_t to) - { -- if ( to-from >= rbuf->size ) -- error("The read length too big (%d), please increase the buffer length (currently %d)\n", to-from+1,rbuf->size); -+ if ( to-from > rbuf->size ) -+ error("The read length too big (%"PRIhts_pos"), please increase the buffer length (currently %d)\n", to-from, rbuf->size); - if ( from < rbuf->pos ) -- error("The reads are not sorted (%ld comes after %ld).\n", from,rbuf->pos); -+ error("The reads are not sorted (%"PRIhts_pos" comes after %"PRIhts_pos").\n", from, rbuf->pos); - -- int ifrom,ito,ibuf; -- ifrom = round_buffer_lidx2ridx(rbuf->start,rbuf->size,rbuf->pos,from); -- ito = round_buffer_lidx2ridx(rbuf->start,rbuf->size,rbuf->pos,to); -+ int ifrom, ito, ibuf; -+ ifrom = round_buffer_lidx2ridx(rbuf->start, rbuf->size, rbuf->pos, from); -+ ito = round_buffer_lidx2ridx(rbuf->start, rbuf->size, rbuf->pos, to); - if ( ifrom>ito ) - { - for (ibuf=ifrom; ibufsize; ibuf++) - rbuf->buffer[ibuf]++; - ifrom = 0; - } -- for (ibuf=ifrom; ibuf<=ito; ibuf++) -+ for (ibuf=ifrom; ibufbuffer[ibuf]++; - } - -@@ -364,7 +397,7 @@ - void count_indels(stats_t 
*stats,bam1_t *bam_line) - { - int is_fwd = IS_REVERSE(bam_line) ? 0 : 1; -- int is_1st = IS_READ1(bam_line) ? 1 : 0; -+ uint32_t order = IS_PAIRED(bam_line) ? (IS_READ1(bam_line) ? READ_ORDER_FIRST : 0) + (IS_READ2(bam_line) ? READ_ORDER_LAST : 0) : READ_ORDER_FIRST; - int icig; - int icycle = 0; - int read_len = bam_line->core.l_qseq; -@@ -379,10 +412,10 @@ - int idx = is_fwd ? icycle : read_len-icycle-ncig; - if ( idx<0 ) - error("FIXME: read_len=%d vs icycle=%d\n", read_len,icycle); -- if ( idx >= stats->nbases || idx<0 ) error("FIXME: %d vs %d, %s:%d %s\n", idx,stats->nbases, stats->info->sam_header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam_get_qname(bam_line)); -- if ( is_1st ) -+ if ( idx >= stats->nbases || idx<0 ) error("FIXME: %d vs %d, %s:%"PRIhts_pos" %s\n", idx, stats->nbases, sam_hdr_tid2name(stats->info->sam_header, bam_line->core.tid), bam_line->core.pos+1, bam_get_qname(bam_line)); -+ if ( order == READ_ORDER_FIRST ) - stats->ins_cycles_1st[idx]++; -- else -+ if ( order == READ_ORDER_LAST ) - stats->ins_cycles_2nd[idx]++; - icycle += ncig; - if ( ncig<=stats->nindels ) -@@ -394,9 +427,9 @@ - int idx = is_fwd ? icycle-1 : read_len-icycle-1; - if ( idx<0 ) continue; // discard meaningless deletions - if ( idx >= stats->nbases ) error("FIXME: %d vs %d\n", idx,stats->nbases); -- if ( is_1st ) -+ if ( order == READ_ORDER_FIRST ) - stats->del_cycles_1st[idx]++; -- else -+ if ( order == READ_ORDER_LAST ) - stats->del_cycles_2nd[idx]++; - if ( ncig<=stats->nindels ) - stats->deletions[ncig-1]++; -@@ -422,8 +455,8 @@ - void count_mismatches_per_cycle(stats_t *stats, bam1_t *bam_line, int read_len) - { - int is_fwd = IS_REVERSE(bam_line) ? 
0 : 1; -- int icig,iread=0,icycle=0; -- int iref = bam_line->core.pos - stats->rseq_pos; -+ int icig, iread=0, icycle=0; -+ hts_pos_t iref = bam_line->core.pos - stats->rseq_pos; - uint8_t *read = bam_get_seq(bam_line); - uint8_t *quals = bam_get_qual(bam_line); - uint64_t *mpc_buf = stats->mpc_buf; -@@ -456,13 +489,13 @@ - continue; - } - // Ignore H and N CIGARs. The letter are inserted e.g. by TopHat and often require very large -- // chunk of refseq in memory. Not very frequent and not noticable in the stats. -+ // chunk of refseq in memory. Not very frequent and not noticeable in the stats. - if ( cig==BAM_CREF_SKIP || cig==BAM_CHARD_CLIP || cig==BAM_CPAD ) continue; - if ( cig!=BAM_CMATCH && cig!=BAM_CEQUAL && cig!=BAM_CDIFF ) // not relying on precalculated diffs -- error("TODO: cigar %d, %s:%d %s\n", cig,stats->info->sam_header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam_get_qname(bam_line)); -+ error("TODO: cigar %d, %s:%"PRIhts_pos" %s\n", cig, sam_hdr_tid2name(stats->info->sam_header, bam_line->core.tid), bam_line->core.pos+1, bam_get_qname(bam_line)); - - if ( ncig+iref > stats->nrseq_buf ) -- error("FIXME: %d+%d > %d, %s, %s:%d\n",ncig,iref,stats->nrseq_buf, bam_get_qname(bam_line),stats->info->sam_header->target_name[bam_line->core.tid],bam_line->core.pos+1); -+ error("FIXME: %d+%"PRIhts_pos" > %"PRId64", %s, %s:%"PRIhts_pos"\n", ncig, iref, stats->nrseq_buf, bam_get_qname(bam_line), sam_hdr_tid2name(stats->info->sam_header, bam_line->core.tid), bam_line->core.pos+1); - - int im; - for (im=0; im=stats->nquals ) -- error("TODO: quality too high %d>=%d (%s %d %s)\n", qual,stats->nquals, stats->info->sam_header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam_get_qname(bam_line)); -+ error("TODO: quality too high %d>=%d (%s %"PRIhts_pos" %s)\n", qual, stats->nquals, sam_hdr_tid2name(stats->info->sam_header, bam_line->core.tid), bam_line->core.pos+1, bam_get_qname(bam_line)); - - int idx = is_fwd ? 
icycle : read_len-icycle-1; - if ( idx>stats->max_len ) -- error("mpc: %d>%d (%s %d %s)\n",idx,stats->max_len,stats->info->sam_header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam_get_qname(bam_line)); -+ error("mpc: %d>%d (%s %"PRIhts_pos" %s)\n", idx, stats->max_len, sam_hdr_tid2name(stats->info->sam_header, bam_line->core.tid), bam_line->core.pos+1, bam_get_qname(bam_line)); - - idx = idx*stats->nquals + qual; - if ( idx>=stats->nquals*stats->nbases ) -@@ -505,11 +538,12 @@ - } - } - --void read_ref_seq(stats_t *stats, int32_t tid, int32_t pos) -+void read_ref_seq(stats_t *stats, int32_t tid, hts_pos_t pos) - { -- int i, fai_ref_len; -- char *fai_ref = faidx_fetch_seq(stats->info->fai, stats->info->sam_header->target_name[tid], pos, pos+stats->mrseq_buf-1, &fai_ref_len); -- if ( fai_ref_len<0 ) error("Failed to fetch the sequence \"%s\"\n", stats->info->sam_header->target_name[tid]); -+ int i; -+ hts_pos_t fai_ref_len; -+ char *fai_ref = faidx_fetch_seq64(stats->info->fai, sam_hdr_tid2name(stats->info->sam_header, tid), pos, pos+stats->mrseq_buf-1, &fai_ref_len); -+ if ( fai_ref_len < 0 ) error("Failed to fetch the sequence \"%s\"\n", sam_hdr_tid2name(stats->info->sam_header, tid)); - - uint8_t *ptr = stats->rseq_buf; - for (i=0; itid = tid; - } - --float fai_gc_content(stats_t *stats, int pos, int len) -+float fai_gc_content(stats_t *stats, hts_pos_t pos, int len) - { - uint32_t gc,count,c; -- int i = pos - stats->rseq_pos, ito = i + len; -+ hts_pos_t i = pos - stats->rseq_pos, ito = i + len; - assert( i>=0 ); - - if ( ito > stats->nrseq_buf ) ito = stats->nrseq_buf; -@@ -570,6 +604,9 @@ - if ( stats->mrseq_bufrseq_buf = realloc(stats->rseq_buf,sizeof(uint8_t)*n); -+ if (!stats->rseq_buf) { -+ error("Could not reallocate reference sequence buffer"); -+ } - stats->mrseq_buf = n; - } - } -@@ -661,6 +698,9 @@ - - // Realloc the coverage distribution buffer - int *rbuffer = calloc(sizeof(int),seq_len*5); -+ if (!rbuffer) { -+ error("Could not allocate 
coverage distribution buffer"); -+ } - n = stats->cov_rbuf.size-stats->cov_rbuf.start; - memcpy(rbuffer,stats->cov_rbuf.buffer+stats->cov_rbuf.start,n); - if ( stats->cov_rbuf.start>1 ) -@@ -690,6 +730,119 @@ - stats->checksum.quals += crc32(0L, qual, (seq_len+1)/2); - } - -+// Collect statistics about the barcode tags specified by init_barcode_tags method -+static void collect_barcode_stats(bam1_t* bam_line, stats_t* stats) { -+ uint32_t nbases, tag, i; -+ acgtno_count_t *acgtno; -+ uint64_t *quals; -+ int32_t *separator, *maxqual; -+ -+ for (tag = 0; tag < stats->ntags; tag++) { -+ const char *barcode_tag = stats->tags_barcode[tag].tag_name, *qual_tag = stats->tags_barcode[tag].qual_name; -+ uint8_t* bc = bam_aux_get(bam_line, barcode_tag); -+ if (!bc) -+ continue; -+ -+ char* barcode = bam_aux2Z(bc); -+ if (!barcode) -+ continue; -+ -+ uint32_t barcode_len = strlen(barcode); -+ if (!stats->tags_barcode[tag].nbases) { // tag seen for the first time -+ uint32_t offset = 0; -+ for (i = 0; i < stats->ntags; i++) -+ offset += stats->tags_barcode[i].nbases; -+ -+ stats->tags_barcode[tag].offset = offset; -+ stats->tags_barcode[tag].nbases = barcode_len; -+ stats->acgtno_barcode = realloc(stats->acgtno_barcode, (offset + barcode_len) * sizeof(acgtno_count_t)); -+ stats->quals_barcode = realloc(stats->quals_barcode, (offset + barcode_len) * stats->nquals * sizeof(uint64_t)); -+ -+ if (!stats->acgtno_barcode || !stats->quals_barcode) -+ error("Error allocating memory. 
Aborting!\n"); -+ -+ memset(stats->acgtno_barcode + offset, 0, barcode_len*sizeof(acgtno_count_t)); -+ memset(stats->quals_barcode + offset*stats->nquals, 0, barcode_len*stats->nquals*sizeof(uint64_t)); -+ } -+ -+ nbases = stats->tags_barcode[tag].nbases; -+ if (barcode_len > nbases) { -+ fprintf(samtools_stderr, "Barcodes with tag %s differ in length at sequence '%s'\n", barcode_tag, bam_get_qname(bam_line)); -+ continue; -+ } -+ -+ acgtno = stats->acgtno_barcode + stats->tags_barcode[tag].offset; -+ quals = stats->quals_barcode + stats->tags_barcode[tag].offset*stats->nquals; -+ maxqual = &stats->tags_barcode[tag].max_qual; -+ separator = &stats->tags_barcode[tag].tag_sep; -+ int error_flag = 0; -+ -+ for (i = 0; i < barcode_len; i++) { -+ switch (barcode[i]) { -+ case 'A': -+ acgtno[i].a++; -+ break; -+ case 'C': -+ acgtno[i].c++; -+ break; -+ case 'G': -+ acgtno[i].g++; -+ break; -+ case 'T': -+ acgtno[i].t++; -+ break; -+ case 'N': -+ acgtno[i].n++; -+ break; -+ default: -+ if (*separator >= 0) { -+ if (*separator != i) { -+ if (stats->error_number < ERROR_LIMIT) { -+ fprintf(samtools_stderr, "Barcode separator for tag %s is in a different position or wrong barcode content('%s') at sequence '%s'\n", barcode_tag, barcode, bam_get_qname(bam_line)); -+ stats->error_number++; -+ } -+ error_flag = 1; -+ } -+ } else { -+ *separator = i; -+ } -+ } -+ -+ /* don't process the rest of the tag bases */ -+ if (error_flag) -+ break; -+ } -+ -+ /* skip to the next tag */ -+ if (error_flag) -+ continue; -+ -+ uint8_t* qt = bam_aux_get(bam_line, qual_tag); -+ if (!qt) -+ continue; -+ -+ char* barqual = bam_aux2Z(qt); -+ if (!barqual) -+ continue; -+ -+ uint32_t barqual_len = strlen(barqual); -+ if (barqual_len == barcode_len) { -+ for (i = 0; i < barcode_len; i++) { -+ int32_t qual = (int32_t)barqual[i] - '!'; // Phred + 33 -+ if (qual >= 0 && qual < stats->nquals) { -+ quals[i * stats->nquals + qual]++; -+ if (qual > *maxqual) -+ *maxqual = qual; -+ } -+ } -+ } else { -+ if 
(stats->error_number++ < ERROR_LIMIT) { -+ fprintf(samtools_stderr, "%s length and %s length don't match for sequence '%s'\n", barcode_tag, qual_tag, bam_get_qname(bam_line)); -+ } -+ } -+ } -+} -+ - // These stats should only be calculated for the original reads ignoring - // supplementary artificial reads otherwise we'll accidentally double count - void collect_orig_read_stats(bam1_t *bam_line, stats_t *stats, int* gc_count_out) -@@ -700,42 +853,48 @@ - if ( bam_line->core.flag & BAM_FQCFAIL ) stats->nreads_QCfailed++; - if ( bam_line->core.flag & BAM_FPAIRED ) stats->nreads_paired_tech++; - -+ uint32_t order = IS_PAIRED(bam_line) ? (IS_READ1(bam_line) ? READ_ORDER_FIRST : 0) + (IS_READ2(bam_line) ? READ_ORDER_LAST : 0) : READ_ORDER_FIRST; -+ - // Count GC and ACGT per cycle. Note that cycle is approximate, clipping is ignored - uint8_t *seq = bam_get_seq(bam_line); -- int i, read_cycle, gc_count = 0, reverse = IS_REVERSE(bam_line), is_first = IS_READ1(bam_line); -- for (i=0; iacgtno_cycles_1st[ read_cycle ].a++ : stats->acgtno_cycles_2nd[ read_cycle ].a++; -- break; -- case 2: -- is_first ? stats->acgtno_cycles_1st[ read_cycle ].c++ : stats->acgtno_cycles_2nd[ read_cycle ].c++; -- gc_count++; -- break; -- case 4: -- is_first ? stats->acgtno_cycles_1st[ read_cycle ].g++ : stats->acgtno_cycles_2nd[ read_cycle ].g++; -- gc_count++; -- break; -- case 8: -- is_first ? stats->acgtno_cycles_1st[ read_cycle ].t++ : stats->acgtno_cycles_2nd[ read_cycle ].t++; -- break; -- case 15: -- is_first ? stats->acgtno_cycles_1st[ read_cycle ].n++ : stats->acgtno_cycles_2nd[ read_cycle ].n++; -- break; -- default: -- /* -- * count "=" sequences in "other" along -- * with MRSVWYHKDB ambiguity codes -- */ -- is_first ? stats->acgtno_cycles_1st[ read_cycle ].other++ : stats->acgtno_cycles_2nd[ read_cycle ].other++; -- break; -+ acgtno_count_t *acgtno_cycles = (order == READ_ORDER_FIRST) ? stats->acgtno_cycles_1st : (order == READ_ORDER_LAST) ? 
stats->acgtno_cycles_2nd : NULL ; -+ if (acgtno_cycles) { -+ for (i=0; ingc-1)/seq_len; -@@ -745,38 +904,48 @@ - // Determine which array (1st or 2nd read) will these stats go to, - // trim low quality bases from end the same way BWA does, - // fill GC histogram -- uint64_t *quals; -+ uint64_t *quals = NULL; - uint8_t *bam_quals = bam_get_qual(bam_line); -- if ( IS_READ2(bam_line) ) -- { -- quals = stats->quals_2nd; -- stats->nreads_2nd++; -- stats->total_len_2nd += seq_len; -- for (i=gc_idx_min; igc_2nd[i]++; -- } -- else -- { -+ -+ switch (order) { -+ case READ_ORDER_FIRST: - quals = stats->quals_1st; - stats->nreads_1st++; - stats->total_len_1st += seq_len; - for (i=gc_idx_min; igc_1st[i]++; -+ break; -+ case READ_ORDER_LAST: -+ quals = stats->quals_2nd; -+ stats->nreads_2nd++; -+ stats->total_len_2nd += seq_len; -+ for (i=gc_idx_min; igc_2nd[i]++; -+ break; -+ default: -+ stats->nreads_other++; - } - if ( stats->info->trim_qual>0 ) - stats->nbases_trimmed += bwa_trim_read(stats->info->trim_qual, bam_quals, seq_len, reverse); - - // Quality histogram and average quality. Clipping is neglected. 
-- for (i=0; i=stats->nquals ) -- error("TODO: quality too high %d>=%d (%s %d %s)\n", qual,stats->nquals,stats->info->sam_header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam_get_qname(bam_line)); -- if ( qual>stats->max_qual ) -- stats->max_qual = qual; -+ if (quals) { -+ for (i=0; i=stats->nquals ) -+ error("TODO: quality too high %d>=%d (%s %"PRIhts_pos" %s)\n", qual, stats->nquals, sam_hdr_tid2name(stats->info->sam_header, bam_line->core.tid), bam_line->core.pos+1, bam_get_qname(bam_line)); -+ if ( qual>stats->max_qual ) -+ stats->max_qual = qual; -+ -+ quals[ i*stats->nquals+qual ]++; -+ stats->sum_qual += qual; -+ } -+ } - -- quals[ i*stats->nquals+qual ]++; -- stats->sum_qual += qual; -+ // Barcode statistics -+ if (order == READ_ORDER_FIRST) { -+ collect_barcode_stats(bam_line, stats); - } - - // Look at the flags and increment appropriate counters (mapped, paired, etc) -@@ -805,7 +974,7 @@ - *gc_count_out = gc_count; - } - --static int cleanup_overlaps(khash_t(qn2pair) *read_pairs, int max) { -+static int cleanup_overlaps(khash_t(qn2pair) *read_pairs, hts_pos_t max) { - if ( !read_pairs ) - return 0; - -@@ -816,7 +985,7 @@ - char *key = (char *)kh_key(read_pairs, k); - pair_t *val = kh_val(read_pairs, k); - if ( val && val->chunks ) { -- if ( val->chunks[val->n-1].to < max ) { -+ if ( val->chunks[val->n-1].end < max ) { - free(val->chunks); - free(val); - free(key); -@@ -830,29 +999,32 @@ - } - } - } -- if ( max == INT_MAX ) -+ if ( max == INT64_MAX ) - kh_destroy(qn2pair, read_pairs); - - return count; - } - --static void remove_overlaps(bam1_t *bam_line, khash_t(qn2pair) *read_pairs, stats_t *stats, int pmin, int pmax) { -+/** -+ * [pmin, pmax) - 0 based half-open -+ */ -+static void remove_overlaps(bam1_t *bam_line, khash_t(qn2pair) *read_pairs, stats_t *stats, hts_pos_t pmin, hts_pos_t pmax) { - if ( !bam_line || !read_pairs || !stats ) - return; - -- uint32_t first = (IS_READ1(bam_line) > 0 ? 1 : 0) + (IS_READ2(bam_line) > 0 ? 
2 : 0) ; -+ uint32_t order = (IS_READ1(bam_line) ? READ_ORDER_FIRST : 0) + (IS_READ2(bam_line) ? READ_ORDER_LAST : 0); - if ( !(bam_line->core.flag & BAM_FPAIRED) || - (bam_line->core.flag & BAM_FMUNMAP) || -- (abs(bam_line->core.isize) >= 2*bam_line->core.l_qseq) || -- (first != 1 && first != 2) ) { -+ (llabs(bam_line->core.isize) >= 2*bam_line->core.l_qseq) || -+ (order != READ_ORDER_FIRST && order != READ_ORDER_LAST) ) { - if ( pmin >= 0 ) -- round_buffer_insert_read(&(stats->cov_rbuf), pmin, pmax-1); -+ round_buffer_insert_read(&(stats->cov_rbuf), pmin, pmax); - return; - } - - char *qname = bam_get_qname(bam_line); - if ( !qname ) { -- fprintf(samtools_stderr, "Error retrieving qname for line starting at pos %d\n", bam_line->core.pos); -+ fprintf(samtools_stderr, "Error retrieving qname for line starting at pos %"PRIhts_pos"\n", bam_line->core.pos); - return; - } - -@@ -870,8 +1042,7 @@ - - k = kh_put(qn2pair, read_pairs, s, &ret); - if ( -1 == ret ) { -- fprintf(samtools_stderr, "Error inserting read '%s' in pair hash table\n", qname); -- return; -+ error("Error inserting read '%s' in pair hash table\n", qname); - } - - pair_t *pc = calloc(1, sizeof(pair_t)); -@@ -881,16 +1052,16 @@ - } - - pc->m = DEFAULT_CHUNK_NO; -- pc->chunks = calloc(pc->m, sizeof(pos_t)); -+ pc->chunks = calloc(pc->m, sizeof(hts_pair_pos_t)); - if ( !pc->chunks ) { - fprintf(samtools_stderr, "Error allocating memory\n"); - return; - } - -- pc->chunks[0].from = pmin; -- pc->chunks[0].to = pmax; -+ pc->chunks[0].beg = pmin; -+ pc->chunks[0].end = pmax; - pc->n = 1; -- pc->first = first; -+ pc->first = order; - - kh_val(read_pairs, k) = pc; - stats->pair_count++; -@@ -901,12 +1072,12 @@ - return; - } - -- if ( first == pc->first ) { //chunk from an existing line -+ if ( order == pc->first ) { //chunk from an existing line - if ( pmin == -1 ) - return; - - if ( pc->n == pc->m ) { -- pos_t *tmp = realloc(pc->chunks, (pc->m<<1)*sizeof(pos_t)); -+ hts_pair_pos_t *tmp = realloc(pc->chunks, 
(pc->m<<1)*sizeof(hts_pair_pos_t)); - if ( !tmp ) { - fprintf(samtools_stderr, "Error allocating memory\n"); - return; -@@ -915,8 +1086,8 @@ - pc->m<<=1; - } - -- pc->chunks[pc->n].from = pmin; -- pc->chunks[pc->n].to = pmax; -+ pc->chunks[pc->n].beg = pmin; -+ pc->chunks[pc->n].end = pmax; - pc->n++; - } else { //the other line, check for overlapping - if ( pmin == -1 && kh_exist(read_pairs, k) ) { //job done, delete entry -@@ -934,28 +1105,28 @@ - - int i; - for (i=0; in; i++) { -- if ( pmin >= pc->chunks[i].to ) -+ if ( pmin >= pc->chunks[i].end ) - continue; - -- if ( pmax <= pc->chunks[i].from ) //no overlap -+ if ( pmax <= pc->chunks[i].beg ) //no overlap - break; - -- if ( pmin < pc->chunks[i].from ) { //overlap at the beginning -- round_buffer_insert_read(&(stats->cov_rbuf), pmin, pc->chunks[i].from-1); -- pmin = pc->chunks[i].from; -+ if ( pmin < pc->chunks[i].beg ) { //overlap at the beginning -+ round_buffer_insert_read(&(stats->cov_rbuf), pmin, pc->chunks[i].beg); -+ pmin = pc->chunks[i].beg; - } - -- if ( pmax <= pc->chunks[i].to ) { //completely contained -+ if ( pmax <= pc->chunks[i].end ) { //completely contained - stats->nbases_mapped_cigar -= (pmax - pmin); - return; - } else { //overlap at the end -- stats->nbases_mapped_cigar -= (pc->chunks[i].to - pmin); -- pmin = pc->chunks[i].to; -+ stats->nbases_mapped_cigar -= (pc->chunks[i].end - pmin); -+ pmin = pc->chunks[i].end; - } - } - } - } -- round_buffer_insert_read(&(stats->cov_rbuf), pmin, pmax-1); -+ round_buffer_insert_read(&(stats->cov_rbuf), pmin, pmax); - } - - void collect_stats(bam1_t *bam_line, stats_t *stats, khash_t(qn2pair) *read_pairs) -@@ -1000,15 +1171,17 @@ - stats->nreads_dup++; - } - -+ uint32_t order = IS_PAIRED(bam_line) ? (IS_READ1(bam_line) ? READ_ORDER_FIRST : 0) + (IS_READ2(bam_line) ? 
READ_ORDER_LAST : 0) : READ_ORDER_FIRST; -+ - int read_len = unclipped_length(bam_line); - if ( read_len >= stats->nbases ) - realloc_buffers(stats,read_len); - // Update max_len observed - if ( stats->max_lenmax_len = read_len; -- if ( IS_READ1(bam_line) && stats->max_len_1st < read_len ) -+ if ( order == READ_ORDER_FIRST && stats->max_len_1st < read_len ) - stats->max_len_1st = read_len; -- if ( IS_READ2(bam_line) && stats->max_len_2nd < read_len ) -+ if ( order == READ_ORDER_LAST && stats->max_len_2nd < read_len ) - stats->max_len_2nd = read_len; - - int i; -@@ -1019,8 +1192,8 @@ - if ( IS_ORIGINAL(bam_line) ) - { - stats->read_lengths[read_len]++; -- if ( IS_READ1(bam_line) ) stats->read_lengths_1st[read_len]++; -- if ( IS_READ2(bam_line) ) stats->read_lengths_2nd[read_len]++; -+ if ( order == READ_ORDER_FIRST ) stats->read_lengths_1st[read_len]++; -+ if ( order == READ_ORDER_LAST ) stats->read_lengths_2nd[read_len]++; - collect_orig_read_stats(bam_line, stats, &gc_count); - } - -@@ -1041,7 +1214,7 @@ - isize = stats->info->nisize; - if ( isize>0 || bam_line->core.tid==bam_line->core.mtid ) - { -- int pos_fst = bam_line->core.mpos - bam_line->core.pos; -+ hts_pos_t pos_fst = bam_line->core.mpos - bam_line->core.pos; - int is_fst = IS_READ1(bam_line) ? 1 : -1; - int is_fwd = IS_REVERSE(bam_line) ? -1 : 1; - int is_mfwd = IS_MATE_REVERSE(bam_line) ? 
-1 : 1; -@@ -1077,7 +1250,7 @@ - if ( stats->regions ) - { - // Count only on-target bases -- int iref = bam_line->core.pos + 1; -+ hts_pos_t iref = bam_line->core.pos + 1; - for (i=0; icore.n_cigar; i++) - { - int cig = bam_cigar_op(bam_get_cigar(bam_line)[i]); -@@ -1131,7 +1304,7 @@ - } - - if ( stats->last_pair_tid != bam_line->core.tid) { -- stats->pair_count -= cleanup_overlaps(read_pairs, INT_MAX-1); -+ stats->pair_count -= cleanup_overlaps(read_pairs, INT64_MAX-1); - stats->last_pair_tid = bam_line->core.tid; - stats->last_read_flush = 0; - } -@@ -1183,8 +1356,9 @@ - // Coverage distribution graph - round_buffer_flush(stats,bam_line->core.pos); - if ( stats->regions ) { -- uint32_t p = bam_line->core.pos, pnew, pmin, pmax, j; -- pmin = pmax = i = j = 0; -+ hts_pos_t p = bam_line->core.pos, pnew, pmin = 0, pmax = 0; -+ uint32_t j = 0; -+ i = 0; - while ( j < bam_line->core.n_cigar && i < stats->nchunks ) { - int op = bam_cigar_op(bam_get_cigar(bam_line)[j]); - int oplen = bam_cigar_oplen(bam_get_cigar(bam_line)[j]); -@@ -1192,13 +1366,13 @@ - case BAM_CMATCH: - case BAM_CEQUAL: - case BAM_CDIFF: -- pmin = MAX(p, stats->chunks[i].from-1); -- pmax = MIN(p+oplen, stats->chunks[i].to); -- if ( pmax >= pmin ) { -+ pmin = MAX(p, stats->chunks[i].beg-1); // 0 based -+ pmax = MIN(p+oplen, stats->chunks[i].end); // 1 based -+ if ( pmax > pmin ) { - if ( stats->info->remove_overlaps ) - remove_overlaps(bam_line, read_pairs, stats, pmin, pmax); - else -- round_buffer_insert_read(&(stats->cov_rbuf), pmin, pmax-1); -+ round_buffer_insert_read(&(stats->cov_rbuf), pmin, pmax); - } - break; - case BAM_CDEL: -@@ -1206,7 +1380,7 @@ - } - pnew = p + (bam_cigar_type(op)&2 ? 
oplen : 0); // consumes reference - -- if ( pnew >= stats->chunks[i].to ) { -+ if ( pnew >= stats->chunks[i].end ) { - // go to the next chunk - i++; - } else { -@@ -1216,7 +1390,8 @@ - } - } - } else { -- uint32_t p = bam_line->core.pos, j; -+ hts_pos_t p = bam_line->core.pos; -+ uint32_t j; - for (j = 0; j < bam_line->core.n_cigar; j++) { - int op = bam_cigar_op(bam_get_cigar(bam_line)[j]); - int oplen = bam_cigar_oplen(bam_get_cigar(bam_line)[j]); -@@ -1227,7 +1402,7 @@ - if ( stats->info->remove_overlaps ) - remove_overlaps(bam_line, read_pairs, stats, p, p+oplen); - else -- round_buffer_insert_read(&(stats->cov_rbuf), p, p+oplen-1); -+ round_buffer_insert_read(&(stats->cov_rbuf), p, p+oplen); - break; - case BAM_CDEL: - break; -@@ -1236,7 +1411,7 @@ - } - } - if ( stats->info->remove_overlaps ) -- remove_overlaps(bam_line, read_pairs, stats, -1, -1); //remove the line from the hash table -+ remove_overlaps(bam_line, read_pairs, stats, -1LL, -1LL); //remove the line from the hash table - } - } - -@@ -1257,7 +1432,7 @@ - float n,d; - int k; - -- n = p*(N+1)/100; -+ n = (float)p*(N+1)/100; - k = n; - if ( k<=0 ) - return gcd[0].depth; -@@ -1322,9 +1497,9 @@ - fprintf(to, "# CHK, CRC32 of reads which passed filtering followed by addition (32bit overflow)\n"); - fprintf(to, "CHK\t%08x\t%08x\t%08x\n", stats->checksum.names,stats->checksum.reads,stats->checksum.quals); - fprintf(to, "# Summary Numbers. 
Use `grep ^SN | cut -f 2-` to extract this part.\n"); -- fprintf(to, "SN\traw total sequences:\t%ld\n", (long)(stats->nreads_filtered+stats->nreads_1st+stats->nreads_2nd)); // not counting excluded seqs (and none of the below) -+ fprintf(to, "SN\traw total sequences:\t%ld\n", (long)(stats->nreads_filtered+stats->nreads_1st+stats->nreads_2nd+stats->nreads_other)); // not counting excluded seqs (and none of the below) - fprintf(to, "SN\tfiltered sequences:\t%ld\n", (long)stats->nreads_filtered); -- fprintf(to, "SN\tsequences:\t%ld\n", (long)(stats->nreads_1st+stats->nreads_2nd)); -+ fprintf(to, "SN\tsequences:\t%ld\n", (long)(stats->nreads_1st+stats->nreads_2nd+stats->nreads_other)); - fprintf(to, "SN\tis sorted:\t%d\n", stats->is_sorted ? 1 : 0); - fprintf(to, "SN\t1st fragments:\t%ld\n", (long)stats->nreads_1st); - fprintf(to, "SN\tlast fragments:\t%ld\n", (long)stats->nreads_2nd); -@@ -1346,7 +1521,7 @@ - fprintf(to, "SN\tbases duplicated:\t%ld\n", (long)stats->total_len_dup); - fprintf(to, "SN\tmismatches:\t%ld\t# from NM fields\n", (long)stats->nmismatches); - fprintf(to, "SN\terror rate:\t%e\t# mismatches / bases mapped (cigar)\n", stats->nbases_mapped_cigar ? (float)stats->nmismatches/stats->nbases_mapped_cigar : 0); -- float avg_read_length = (stats->nreads_1st+stats->nreads_2nd)?stats->total_len/(stats->nreads_1st+stats->nreads_2nd):0; -+ float avg_read_length = (stats->nreads_1st+stats->nreads_2nd+stats->nreads_other)?stats->total_len/(stats->nreads_1st+stats->nreads_2nd+stats->nreads_other):0; - fprintf(to, "SN\taverage length:\t%.0f\n", avg_read_length); - fprintf(to, "SN\taverage first fragment length:\t%.0f\n", stats->nreads_1st? (float)stats->total_len_1st/stats->nreads_1st:0); - fprintf(to, "SN\taverage last fragment length:\t%.0f\n", stats->nreads_2nd? 
(float)stats->total_len_2nd/stats->nreads_2nd:0); -@@ -1360,7 +1535,7 @@ - fprintf(to, "SN\toutward oriented pairs:\t%ld\n", (long)nisize_outward); - fprintf(to, "SN\tpairs with other orientation:\t%ld\n", (long)nisize_other); - fprintf(to, "SN\tpairs on different chromosomes:\t%ld\n", (long)stats->nreads_anomalous/2); -- fprintf(to, "SN\tpercentage of properly paired reads (%%):\t%.1f\n", (stats->nreads_1st+stats->nreads_2nd)? (float)(100*stats->nreads_properly_paired)/(stats->nreads_1st+stats->nreads_2nd):0); -+ fprintf(to, "SN\tpercentage of properly paired reads (%%):\t%.1f\n", (stats->nreads_1st+stats->nreads_2nd+stats->nreads_other)? (float)(100*stats->nreads_properly_paired)/(stats->nreads_1st+stats->nreads_2nd+stats->nreads_other):0); - if ( stats->target_count ) { - fprintf(to, "SN\tbases inside the target:\t%u\n", stats->target_count); - for (icov=stats->info->cov_threshold+1; icovncov; icov++) -@@ -1441,11 +1616,18 @@ - 100.*(acgtno_count_1st->other + acgtno_count_2nd->other)/acgt_sum); - - } -+ -+ uint64_t tA=0, tC=0, tG=0, tT=0, tN=0; - fprintf(to, "# ACGT content per cycle for first fragments. Use `grep ^FBC | cut -f 2-` to extract this part. The columns are: cycle; A,C,G,T base counts as a percentage of all A/C/G/T bases [%%]; and N and O counts as a percentage of all A/C/G/T bases [%%]\n"); - for (ibase=0; ibasemax_len; ibase++) - { - acgtno_count_t *acgtno_count_1st = &(stats->acgtno_cycles_1st[ibase]); - uint64_t acgt_sum_1st = acgtno_count_1st->a + acgtno_count_1st->c + acgtno_count_1st->g + acgtno_count_1st->t; -+ tA += acgtno_count_1st->a; -+ tC += acgtno_count_1st->c; -+ tG += acgtno_count_1st->g; -+ tT += acgtno_count_1st->t; -+ tN += acgtno_count_1st->n; - - if ( acgt_sum_1st ) - fprintf(to, "FBC\t%d\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\n", ibase+1, -@@ -1457,11 +1639,19 @@ - 100.*acgtno_count_1st->other/acgt_sum_1st); - - } -+ fprintf(to, "# ACGT raw counters for first fragments. Use `grep ^FTC | cut -f 2-` to extract this part. 
The columns are: A,C,G,T,N base counters\n"); -+ fprintf(to, "FTC\t%" PRId64 "\t%" PRId64 "\t%" PRId64 "\t%" PRId64 "\t%" PRId64 "\n", tA, tC, tG, tT, tN); -+ tA=0, tC=0, tG=0, tT=0, tN=0; - fprintf(to, "# ACGT content per cycle for last fragments. Use `grep ^LBC | cut -f 2-` to extract this part. The columns are: cycle; A,C,G,T base counts as a percentage of all A/C/G/T bases [%%]; and N and O counts as a percentage of all A/C/G/T bases [%%]\n"); - for (ibase=0; ibasemax_len; ibase++) - { - acgtno_count_t *acgtno_count_2nd = &(stats->acgtno_cycles_2nd[ibase]); - uint64_t acgt_sum_2nd = acgtno_count_2nd->a + acgtno_count_2nd->c + acgtno_count_2nd->g + acgtno_count_2nd->t; -+ tA += acgtno_count_2nd->a; -+ tC += acgtno_count_2nd->c; -+ tG += acgtno_count_2nd->g; -+ tT += acgtno_count_2nd->t; -+ tN += acgtno_count_2nd->n; - - if ( acgt_sum_2nd ) - fprintf(to, "LBC\t%d\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\n", ibase+1, -@@ -1473,6 +1663,52 @@ - 100.*acgtno_count_2nd->other/acgt_sum_2nd); - - } -+ fprintf(to, "# ACGT raw counters for last fragments. Use `grep ^LTC | cut -f 2-` to extract this part. The columns are: A,C,G,T,N base counters\n"); -+ fprintf(to, "LTC\t%" PRId64 "\t%" PRId64 "\t%" PRId64 "\t%" PRId64 "\t%" PRId64 "\n", tA, tC, tG, tT, tN); -+ -+ int tag; -+ for (tag=0; tagntags; tag++) { -+ if (stats->tags_barcode[tag].nbases) { -+ fprintf(to, "# ACGT content per cycle for barcodes. Use `grep ^%sC | cut -f 2-` to extract this part. 
The columns are: cycle; A,C,G,T base counts as a percentage of all A/C/G/T bases [%%]; and N counts as a percentage of all A/C/G/T bases [%%]\n", -+ stats->tags_barcode[tag].tag_name); -+ for (ibase=0; ibasetags_barcode[tag].nbases; ibase++) -+ { -+ if (ibase == stats->tags_barcode[tag].tag_sep) -+ continue; -+ -+ acgtno_count_t *acgtno_count = stats->acgtno_barcode + stats->tags_barcode[tag].offset + ibase; -+ uint64_t acgt_sum = acgtno_count->a + acgtno_count->c + acgtno_count->g + acgtno_count->t; -+ -+ if ( acgt_sum ) -+ fprintf(to, "%sC%d\t%d\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\n", stats->tags_barcode[tag].tag_name, -+ stats->tags_barcode[tag].tag_sep < 0 || ibase < stats->tags_barcode[tag].tag_sep ? 1 : 2, -+ stats->tags_barcode[tag].tag_sep < 0 || ibase < stats->tags_barcode[tag].tag_sep ? ibase+1 : ibase-stats->tags_barcode[tag].tag_sep, -+ 100.*acgtno_count->a/acgt_sum, -+ 100.*acgtno_count->c/acgt_sum, -+ 100.*acgtno_count->g/acgt_sum, -+ 100.*acgtno_count->t/acgt_sum, -+ 100.*acgtno_count->n/acgt_sum); -+ } -+ -+ fprintf(to, "# Barcode Qualities. Use `grep ^%sQ | cut -f 2-` to extract this part.\n", stats->tags_barcode[tag].qual_name); -+ fprintf(to, "# Columns correspond to qualities and rows to barcode cycles. First column is the cycle number.\n"); -+ for (ibase=0; ibasetags_barcode[tag].nbases; ibase++) -+ { -+ if (ibase == stats->tags_barcode[tag].tag_sep) -+ continue; -+ -+ fprintf(to, "%sQ%d\t%d", stats->tags_barcode[tag].qual_name, -+ stats->tags_barcode[tag].tag_sep < 0 || ibase < stats->tags_barcode[tag].tag_sep ? 1 : 2, -+ stats->tags_barcode[tag].tag_sep < 0 || ibase < stats->tags_barcode[tag].tag_sep ? ibase+1 : ibase-stats->tags_barcode[tag].tag_sep); -+ for (iqual=0; iqual<=stats->tags_barcode[tag].max_qual; iqual++) -+ { -+ fprintf(to, "\t%ld", (long)stats->quals_barcode[(stats->tags_barcode[tag].offset + ibase)*stats->nquals+iqual]); -+ } -+ fprintf(to, "\n"); -+ } -+ } -+ } -+ - fprintf(to, "# Insert sizes. 
Use `grep ^IS | cut -f 2-` to extract this part. The columns are: insert size, pairs total, inward oriented pairs, outward oriented pairs, other pairs\n"); - for (isize=0; isizeisize->inward(stats->isize->data, isize)); -@@ -1566,14 +1802,15 @@ - } - } - --void init_regions(stats_t *stats, const char *file) -+static void init_regions(stats_t *stats, const char *file) - { - FILE *fp = fopen(file,"r"); - if ( !fp ) error("%s: %s\n",file,strerror(errno)); - - kstring_t line = { 0, 0, NULL }; - int warned = 0, r, p, new_p; -- int prev_tid=-1, prev_pos=-1; -+ int prev_tid=-1; -+ hts_pos_t prev_pos=-1LL; - while (line.l = 0, kgetline(&line, (kgets_func *)fgets, fp) >= 0) - { - if ( line.s[0] == '#' ) continue; -@@ -1594,30 +1831,33 @@ - - if ( tid >= stats->nregions ) - { -- stats->regions = realloc(stats->regions,sizeof(regions_t)*(stats->nregions+100)); -+ if(!(stats->regions = realloc(stats->regions,sizeof(regions_t)*(tid+REG_INC)))) -+ error("Could not allocate memory for region.\n"); -+ - int j; -- for (j=stats->nregions; jnregions+100; j++) -+ for (j=stats->nregions; jregions[j].npos = stats->regions[j].mpos = stats->regions[j].cpos = 0; - stats->regions[j].pos = NULL; - } -- stats->nregions += 100; -+ stats->nregions = tid+REG_INC; - } - int npos = stats->regions[tid].npos; - if ( npos >= stats->regions[tid].mpos ) - { -- stats->regions[tid].mpos += 1000; -- stats->regions[tid].pos = realloc(stats->regions[tid].pos,sizeof(pos_t)*stats->regions[tid].mpos); -+ stats->regions[tid].mpos = npos+POS_INC; -+ if (!(stats->regions[tid].pos = realloc(stats->regions[tid].pos, sizeof(hts_pair_pos_t)*stats->regions[tid].mpos))) -+ error("Could not allocate memory for interval.\n"); - } - -- if ( (sscanf(&line.s[i+1],"%u %u",&stats->regions[tid].pos[npos].from,&stats->regions[tid].pos[npos].to))!=2 ) error("Could not parse the region [%s]\n", &line.s[i+1]); -+ if ( (sscanf(&line.s[i+1],"%"SCNd64" %"SCNd64, &stats->regions[tid].pos[npos].beg, 
&stats->regions[tid].pos[npos].end))!=2 ) error("Could not parse the region [%s]\n", &line.s[i+1]); - if ( prev_tid==-1 || prev_tid!=tid ) - { - prev_tid = tid; -- prev_pos = stats->regions[tid].pos[npos].from; -+ prev_pos = stats->regions[tid].pos[npos].beg; - } -- if ( prev_pos>stats->regions[tid].pos[npos].from ) -- error("The positions are not in chromosomal order (%s:%d comes after %d)\n", line.s,stats->regions[tid].pos[npos].from,prev_pos); -+ if ( prev_pos>stats->regions[tid].pos[npos].beg ) -+ error("The positions are not in chromosomal order (%s:%"PRIhts_pos" comes after %"PRIhts_pos")\n", line.s, stats->regions[tid].pos[npos].beg, prev_pos); - stats->regions[tid].npos++; - if ( stats->regions[tid].npos > stats->nchunks ) - stats->nchunks = stats->regions[tid].npos; -@@ -1630,20 +1870,21 @@ - for (r = 0; r < stats->nregions; r++) { - regions_t *reg = &stats->regions[r]; - if ( reg->npos > 1 ) { -- qsort(reg->pos, reg->npos, sizeof(pos_t), regions_lt); -+ qsort(reg->pos, reg->npos, sizeof(hts_pair_pos_t), regions_lt); - for (new_p = 0, p = 1; p < reg->npos; p++) { -- if ( reg->pos[new_p].to < reg->pos[p].from ) -+ if ( reg->pos[new_p].end < reg->pos[p].beg ) - reg->pos[++new_p] = reg->pos[p]; -- else if ( reg->pos[new_p].to < reg->pos[p].to ) -- reg->pos[new_p].to = reg->pos[p].to; -+ else if ( reg->pos[new_p].end < reg->pos[p].end ) -+ reg->pos[new_p].end = reg->pos[p].end; - } - reg->npos = ++new_p; - } - for (p = 0; p < reg->npos; p++) -- stats->target_count += (reg->pos[p].to - reg->pos[p].from + 1); -+ stats->target_count += (reg->pos[p].end - reg->pos[p].beg + 1); - } - -- stats->chunks = calloc(stats->nchunks, sizeof(pos_t)); -+ if (!(stats->chunks = calloc(stats->nchunks, sizeof(hts_pair_pos_t)))) -+ error("Could not allocate memory for chunk.\n"); - } - - void destroy_regions(stats_t *stats) -@@ -1678,22 +1919,22 @@ - // Find a matching interval or skip this read. 
No splicing of reads is done, no indels or soft clips considered, - // even small overlap is enough to include the read in the stats. - int i = reg->cpos; -- while ( inpos && reg->pos[i].to<=bam_line->core.pos ) i++; -+ while ( inpos && reg->pos[i].end<=bam_line->core.pos ) i++; - if ( i>=reg->npos ) { reg->cpos = reg->npos; return 0; } - int64_t endpos = bam_endpos(bam_line); -- if ( endpos < reg->pos[i].from ) return 0; -+ if ( endpos < reg->pos[i].beg ) return 0; - - //found a read overlapping a region - reg->cpos = i; -- stats->reg_from = reg->pos[i].from; -- stats->reg_to = reg->pos[i].to; -+ stats->reg_from = reg->pos[i].beg; -+ stats->reg_to = reg->pos[i].end; - - //now find all the overlapping chunks - stats->nchunks = 0; - while (i < reg->npos) { -- if (bam_line->core.pos < reg->pos[i].to && endpos >= reg->pos[i].from) { -- stats->chunks[stats->nchunks].from = MAX(bam_line->core.pos+1, reg->pos[i].from); -- stats->chunks[stats->nchunks].to = MIN(endpos, reg->pos[i].to); -+ if (bam_line->core.pos < reg->pos[i].end && endpos >= reg->pos[i].beg) { -+ stats->chunks[stats->nchunks].beg = MAX(bam_line->core.pos+1, reg->pos[i].beg); -+ stats->chunks[stats->nchunks].end = MIN(endpos, reg->pos[i].end); - stats->nchunks++; - } - i++; -@@ -1709,7 +1950,7 @@ - int i, j, tid; - stats->nregions = iter->n_reg; - stats->regions = calloc(stats->nregions, sizeof(regions_t)); -- stats->chunks = calloc(stats->nchunks, sizeof(pos_t)); -+ stats->chunks = calloc(stats->nchunks, sizeof(hts_pair_pos_t)); - if ( !stats->regions || !stats->chunks ) - return 1; - -@@ -1729,15 +1970,15 @@ - } - - stats->regions[tid].mpos = stats->regions[tid].npos = iter->reg_list[i].count; -- stats->regions[tid].pos = calloc(stats->regions[tid].mpos, sizeof(pos_t)); -+ stats->regions[tid].pos = calloc(stats->regions[tid].mpos, sizeof(hts_pair_pos_t)); - if ( !stats->regions[tid].pos ) - return 1; - - for (j = 0; j < stats->regions[tid].npos; j++) { -- stats->regions[tid].pos[j].from = 
iter->reg_list[i].intervals[j].beg+1; -- stats->regions[tid].pos[j].to = iter->reg_list[i].intervals[j].end; -+ stats->regions[tid].pos[j].beg = iter->reg_list[i].intervals[j].beg+1; -+ stats->regions[tid].pos[j].end = iter->reg_list[i].intervals[j].end; - -- stats->target_count += (stats->regions[tid].pos[j].to - stats->regions[tid].pos[j].from + 1); -+ stats->target_count += (stats->regions[tid].pos[j].end - stats->regions[tid].pos[j].beg + 1); - } - } - -@@ -1775,7 +2016,7 @@ - } - - --static void error(const char *format, ...) -+static void HTS_NORETURN error(const char *format, ...) - { - if ( !format ) - { -@@ -1785,13 +2026,14 @@ - fprintf(samtools_stdout, "Options:\n"); - fprintf(samtools_stdout, " -c, --coverage ,, Coverage distribution min,max,step [1,1000,1]\n"); - fprintf(samtools_stdout, " -d, --remove-dups Exclude from statistics reads marked as duplicates\n"); -+ fprintf(samtools_stdout, " -X, --customized-index-file Use a customized index file\n"); - fprintf(samtools_stdout, " -f, --required-flag Required flag, 0 for unset. See also `samtools flags` [0]\n"); - fprintf(samtools_stdout, " -F, --filtering-flag Filtering flag, 0 for unset. 
See also `samtools flags` [0]\n"); - fprintf(samtools_stdout, " --GC-depth the size of GC-depth bins (decreasing bin size increases memory requirement) [2e4]\n"); - fprintf(samtools_stdout, " -h, --help This help message\n"); - fprintf(samtools_stdout, " -i, --insert-size Maximum insert size [8000]\n"); - fprintf(samtools_stdout, " -I, --id Include only listed read group or sample name\n"); -- fprintf(samtools_stdout, " -l, --read-length Include in the statistics only reads with the given read length []\n"); -+ fprintf(samtools_stdout, " -l, --read-length Include in the statistics only reads with the given read length [-1]\n"); - fprintf(samtools_stdout, " -m, --most-inserts Report only the main part of inserts [0.99]\n"); - fprintf(samtools_stdout, " -P, --split-prefix Path or string prefix for filepaths output by -S (default is input filename)\n"); - fprintf(samtools_stdout, " -q, --trim-quality The BWA trimming parameter [0]\n"); -@@ -1801,8 +2043,8 @@ - fprintf(samtools_stdout, " -t, --target-regions Do stats in these regions only. 
Tab-delimited file chr,from,to, 1-based, inclusive.\n"); - fprintf(samtools_stdout, " -x, --sparse Suppress outputting IS rows where there are no insertions.\n"); - fprintf(samtools_stdout, " -p, --remove-overlaps Remove overlaps of paired-end reads from coverage and base count computations.\n"); -- fprintf(samtools_stdout, " -g, --cov-threshold Only bases with coverage above this value will be included in the target percentage computation.\n"); -- sam_global_opt_help(samtools_stdout, "-.--.@"); -+ fprintf(samtools_stdout, " -g, --cov-threshold Only bases with coverage above this value will be included in the target percentage computation [0]\n"); -+ sam_global_opt_help(samtools_stdout, "-.--.@-."); - fprintf(samtools_stdout, "\n"); - } - else -@@ -1842,6 +2084,9 @@ - free(stats->ins_cycles_2nd); - free(stats->del_cycles_1st); - free(stats->del_cycles_2nd); -+ if (stats->acgtno_barcode) free(stats->acgtno_barcode); -+ if (stats->quals_barcode) free(stats->quals_barcode); -+ free(stats->tags_barcode); - destroy_regions(stats); - if ( stats->rg_hash ) khash_str2int_destroy(stats->rg_hash); - free(stats->split_name); -@@ -1880,6 +2125,9 @@ - - void destroy_split_stats(khash_t(c2stats) *split_hash) - { -+ if (!split_hash) -+ return; -+ - int i = 0; - stats_t *curr_stats = NULL; - for(i = kh_begin(split_hash); i != kh_end(split_hash); ++i){ -@@ -1893,6 +2141,10 @@ - stats_info_t* stats_info_init(int argc, char *argv[]) - { - stats_info_t* info = calloc(1, sizeof(stats_info_t)); -+ if (!info) { -+ return NULL; -+ } -+ - info->nisize = 8000; - info->isize_main_bulk = 0.99; // There are always outliers at the far end - info->gcd_bin_size = 20e3; -@@ -1928,11 +2180,15 @@ - stats_t* stats_init() - { - stats_t *stats = calloc(1,sizeof(stats_t)); -+ if (!stats) -+ return NULL; -+ - stats->ngc = 200; - stats->nquals = 256; - stats->nbases = 300; - stats->rseq_pos = -1; -- stats->tid = stats->gcd_pos = -1; -+ stats->tid = -1; -+ stats->gcd_pos = -1LL; - stats->igcd = 0; - 
stats->is_sorted = 1; - stats->nindels = stats->nbases; -@@ -1946,6 +2202,18 @@ - return stats; - } - -+static int init_barcode_tags(stats_t* stats) { -+ stats->ntags = 4; -+ stats->tags_barcode = calloc(stats->ntags, sizeof(barcode_info_t)); -+ if (!stats->tags_barcode) -+ return -1; -+ stats->tags_barcode[0] = (barcode_info_t){"BC", "QT", 0, -1, -1, 0}; -+ stats->tags_barcode[1] = (barcode_info_t){"CR", "CY", 0, -1, -1, 0}; -+ stats->tags_barcode[2] = (barcode_info_t){"OX", "BZ", 0, -1, -1, 0}; -+ stats->tags_barcode[3] = (barcode_info_t){"RX", "QX", 0, -1, -1, 0}; -+ return 0; -+} -+ - static void init_stat_structs(stats_t* stats, stats_info_t* info, const char* group_id, const char* targets) - { - // Give stats_t a pointer to the info struct -@@ -1963,32 +2231,60 @@ - stats->ncov = 3 + (info->cov_max-info->cov_min) / info->cov_step; - info->cov_max = info->cov_min + ((info->cov_max-info->cov_min)/info->cov_step +1)*info->cov_step - 1; - stats->cov = calloc(sizeof(uint64_t),stats->ncov); -+ if (!stats->cov) goto nomem; - stats->cov_rbuf.size = stats->nbases*5; - stats->cov_rbuf.buffer = calloc(sizeof(int32_t),stats->cov_rbuf.size); -- -+ if (!stats->cov_rbuf.buffer) goto nomem; - if ( group_id ) init_group_id(stats, group_id); - // .. arrays - stats->quals_1st = calloc(stats->nquals*stats->nbases,sizeof(uint64_t)); -+ if (!stats->quals_1st) goto nomem; - stats->quals_2nd = calloc(stats->nquals*stats->nbases,sizeof(uint64_t)); -+ if (!stats->quals_2nd) goto nomem; - stats->gc_1st = calloc(stats->ngc,sizeof(uint64_t)); -+ if (!stats->gc_1st) goto nomem; - stats->gc_2nd = calloc(stats->ngc,sizeof(uint64_t)); -+ if (!stats->gc_2nd) goto nomem; - stats->isize = init_isize_t(info->nisize ?info->nisize+1 :0); -+ if (!stats->isize) goto nomem; - stats->gcd = calloc(stats->ngcd,sizeof(gc_depth_t)); -- stats->mpc_buf = info->fai ? 
calloc(stats->nquals*stats->nbases,sizeof(uint64_t)) : NULL; -+ if (!stats->gcd) goto nomem; -+ if (info->fai) { -+ stats->mpc_buf = calloc(stats->nquals*stats->nbases,sizeof(uint64_t)); -+ if (!stats->mpc_buf) goto nomem; -+ } else { -+ stats->mpc_buf = NULL; -+ } - stats->acgtno_cycles_1st = calloc(stats->nbases,sizeof(acgtno_count_t)); -+ if (!stats->acgtno_cycles_1st) goto nomem; - stats->acgtno_cycles_2nd = calloc(stats->nbases,sizeof(acgtno_count_t)); -+ if (!stats->acgtno_cycles_2nd) goto nomem; - stats->read_lengths = calloc(stats->nbases,sizeof(uint64_t)); -+ if (!stats->read_lengths) goto nomem; - stats->read_lengths_1st = calloc(stats->nbases,sizeof(uint64_t)); -+ if (!stats->read_lengths_1st) goto nomem; - stats->read_lengths_2nd = calloc(stats->nbases,sizeof(uint64_t)); -+ if (!stats->read_lengths_2nd) goto nomem; - stats->insertions = calloc(stats->nbases,sizeof(uint64_t)); -+ if (!stats->insertions) goto nomem; - stats->deletions = calloc(stats->nbases,sizeof(uint64_t)); -+ if (!stats->deletions) goto nomem; - stats->ins_cycles_1st = calloc(stats->nbases+1,sizeof(uint64_t)); -+ if (!stats->ins_cycles_1st) goto nomem; - stats->ins_cycles_2nd = calloc(stats->nbases+1,sizeof(uint64_t)); -+ if (!stats->ins_cycles_2nd) goto nomem; - stats->del_cycles_1st = calloc(stats->nbases+1,sizeof(uint64_t)); -+ if (!stats->del_cycles_1st) goto nomem; - stats->del_cycles_2nd = calloc(stats->nbases+1,sizeof(uint64_t)); -+ if (!stats->del_cycles_2nd) goto nomem; -+ if (init_barcode_tags(stats) < 0) -+ goto nomem; - realloc_rseq_buffer(stats); - if ( targets ) - init_regions(stats, targets); -+ return; -+ nomem: -+ error("Out of memory"); - } - - static stats_t* get_curr_split_stats(bam1_t* bam_line, khash_t(c2stats)* split_hash, stats_info_t* info, char* targets) -@@ -2004,6 +2300,9 @@ - khiter_t k = kh_get(c2stats, split_hash, split_name); - if(k == kh_end(split_hash)){ - curr_stats = stats_init(); // mallocs new instance -+ if (!curr_stats) { -+ error("Couldn't 
allocate split stats"); -+ } - init_stat_structs(curr_stats, info, NULL, targets); - curr_stats->split_name = split_name; - -@@ -2026,11 +2325,16 @@ - { - char *targets = NULL; - char *bam_fname = NULL; -+ char *bam_idx_fname = NULL; - char *group_id = NULL; -- int sparse = 0; -+ int sparse = 0, has_index_file = 0, ret = 1; - sam_global_args ga = SAM_GLOBAL_ARGS_INIT; - - stats_info_t *info = stats_info_init(argc, argv); -+ if (!info) { -+ fprintf(samtools_stderr, "Could not allocate memory for info.\n"); -+ return 1; -+ } - - static const struct option loptions[] = - { -@@ -2038,6 +2342,7 @@ - {"help", no_argument, NULL, 'h'}, - {"remove-dups", no_argument, NULL, 'd'}, - {"sam", no_argument, NULL, 's'}, -+ {"customized-index-file", required_argument, NULL, 'X'}, - {"ref-seq", required_argument, NULL, 'r'}, - {"coverage", required_argument, NULL, 'c'}, - {"read-length", required_argument, NULL, 'l'}, -@@ -2058,13 +2363,14 @@ - }; - int opt; - -- while ( (opt=getopt_long(argc,argv,"?hdsxpr:c:l:i:t:m:q:f:F:g:I:1:S:P:@:",loptions,NULL))>0 ) -+ while ( (opt=getopt_long(argc,argv,"?hdsXxpr:c:l:i:t:m:q:f:F:g:I:S:P:@:",loptions,NULL))>0 ) - { - switch (opt) - { - case 'f': info->flag_require = bam_str2flag(optarg); break; - case 'F': info->flag_filter |= bam_str2flag(optarg); break; - case 'd': info->flag_filter |= BAM_FDUP; break; -+ case 'X': has_index_file = 1; break; - case 's': break; - case 'r': info->fai = fai_load(optarg); - if (info->fai==NULL) -@@ -2090,15 +2396,15 @@ - break; - case '?': - case 'h': error(NULL); -+ /* no break */ - default: - if (parse_sam_global_opt(opt, optarg, loptions, &ga) != 0) - error("Unknown argument: %s\n", optarg); - break; - } - } -- if ( optind 0) - hts_set_threads(info->sam, ga.nthreads); - - stats_t *all_stats = stats_init(); -+ if (!all_stats) { -+ fprintf(samtools_stderr, "Could not allocate memory for stats.\n"); -+ cleanup_stats_info(info); -+ return 1; -+ } - stats_t *curr_stats = NULL; - init_stat_structs(all_stats, info, 
group_id, targets); - // Init - // .. hash - khash_t(c2stats)* split_hash = kh_init(c2stats); -+ if (!split_hash) goto cleanup_all_stats; - - khash_t(qn2pair)* read_pairs = kh_init(qn2pair); -+ if (!read_pairs) goto cleanup_split_hash; - - // Collect statistics - bam1_t *bam_line = bam_init1(); -- if ( optindsam,bam_fname); -- if (bam_idx) { -- -- int regcount = 0; -- hts_reglist_t *reglist = bed_reglist(region_hash, ALL, ®count); -- if (reglist) { -- -- hts_itr_multi_t *iter = sam_itr_regions(bam_idx, info->sam_header, reglist, regcount); -- if (iter) { -- -- if (!targets) { -- all_stats->nchunks = argc-optind; -- if ( replicate_regions(all_stats, iter) ) -- fprintf(samtools_stderr, "Replications of the regions failed."); -- } -+ if (!bam_line) goto cleanup_read_pairs; -+ -+ if (optind < argc) { -+ // Region:interval arguments in the command line -+ hts_idx_t *bam_idx = NULL; -+ if (has_index_file) { -+ bam_idx = sam_index_load2(info->sam, bam_fname, bam_idx_fname); -+ } else { -+ // If an index filename has not been specified, look alongside the alignment file -+ bam_idx = sam_index_load(info->sam, bam_fname); -+ } -+ -+ if (bam_idx) { -+ hts_itr_multi_t *iter = sam_itr_regarray(bam_idx, info->sam_header, &argv[optind], argc - optind); -+ if (iter) { -+ if (!targets) { -+ all_stats->nchunks = argc-optind; -+ if (replicate_regions(all_stats, iter)) -+ fprintf(samtools_stderr, "Replications of the regions failed\n"); -+ } - -- if ( all_stats->nregions && all_stats->regions ) { -- while (sam_itr_multi_next(info->sam, iter, bam_line) >= 0) { -- if (info->split_tag) { -- curr_stats = get_curr_split_stats(bam_line, split_hash, info, targets); -- collect_stats(bam_line, curr_stats, read_pairs); -- } -- collect_stats(bam_line, all_stats, read_pairs); -- } -+ if ( all_stats->nregions && all_stats->regions ) { -+ while ((ret = sam_itr_next(info->sam, iter, bam_line)) >= 0) { -+ if (info->split_tag) { -+ curr_stats = get_curr_split_stats(bam_line, split_hash, info, 
targets); -+ collect_stats(bam_line, curr_stats, read_pairs); - } -+ collect_stats(bam_line, all_stats, read_pairs); -+ } - -+ if (ret < -1) { -+ fprintf(samtools_stderr, "Failure while running the iterator\n"); - hts_itr_multi_destroy(iter); -- } else { -- fprintf(samtools_stderr, "Creation of the region iterator failed."); -- hts_reglist_free(reglist, regcount); -+ hts_idx_destroy(bam_idx); -+ goto cleanup; - } -- } else { -- fprintf(samtools_stderr, "Creation of the region list failed."); - } -- -- hts_idx_destroy(bam_idx); -+ hts_itr_multi_destroy(iter); - } else { -- fprintf(samtools_stderr, "Random alignment retrieval only works for indexed BAM files.\n"); -+ fprintf(samtools_stderr, "Multi-region iterator could not be created\n"); -+ hts_idx_destroy(bam_idx); -+ goto cleanup; - } -- -- bed_destroy(region_hash); -+ hts_idx_destroy(bam_idx); - } else { -- fprintf(samtools_stderr, "Creation of the region hash table failed.\n"); -+ if (has_index_file) -+ fprintf(samtools_stderr, "Invalid index file '%s'\n", bam_idx_fname); -+ fprintf(samtools_stderr, "Random alignment retrieval only works for indexed files\n"); -+ goto cleanup; - } -- } -- else -- { -+ } else { - if ( info->cov_threshold > 0 && !targets ) { -- fprintf(samtools_stderr, "Coverage percentage calcuation requires a list of target regions\n"); -+ fprintf(samtools_stderr, "Coverage percentage calculation requires a list of target regions\n"); - goto cleanup; - } - - // Stream through the entire BAM ignoring off-target regions if -t is given -- int ret; - while ((ret = sam_read1(info->sam, info->sam_header, bam_line)) >= 0) { - if (info->split_tag) { - curr_stats = get_curr_split_stats(bam_line, split_hash, info, targets); -@@ -2196,7 +2511,7 @@ - - if (ret < -1) { - fprintf(samtools_stderr, "Failure while decoding file\n"); -- return 1; -+ goto cleanup; - } - } - -@@ -2205,15 +2520,19 @@ - if (info->split_tag) - output_split_stats(split_hash, bam_fname, sparse); - -+ ret = 0; - cleanup: - 
bam_destroy1(bam_line); -- bam_hdr_destroy(info->sam_header); -+ sam_hdr_destroy(info->sam_header); - sam_global_args_free(&ga); - -+cleanup_read_pairs: -+ cleanup_overlaps(read_pairs, INT64_MAX); -+cleanup_split_hash: -+ destroy_split_stats(split_hash); -+cleanup_all_stats: - cleanup_stats(all_stats); - cleanup_stats_info(info); -- destroy_split_stats(split_hash); -- cleanup_overlaps(read_pairs, INT_MAX); - -- return 0; -+ return ret; - } ---- python-pysam.orig/samtools/stats_isize.c -+++ python-pysam/samtools/stats_isize.c -@@ -1,6 +1,6 @@ - /* stats_isize.c -- generalised insert size calculation for samtools stats. - -- Copyright (C) 2014 Genome Research Ltd. -+ Copyright (C) 2014, 2018 Genome Research Ltd. - - Author: Nicholas Clarke - -@@ -162,12 +162,23 @@ - if (bound <= 0) { - // Use sparse data structure. - isize_sparse_data_t *data = (isize_sparse_data_t *) malloc(sizeof(isize_sparse_data_t)); -+ if (!data) -+ return NULL; - - // Initialise - data->max = 0; - data->array = kh_init(m32); -+ if (!data->array) { -+ free(data); -+ return NULL; -+ } - - isize_t *isize = (isize_t *)malloc(sizeof(isize_t)); -+ if (!isize) { -+ kh_destroy(m32, data->array); -+ free(data); -+ return NULL; -+ } - - isize->data.sparse = data; - isize->nitems = & sparse_nitems; -@@ -192,13 +203,20 @@ - uint64_t* out = calloc(bound,sizeof(uint64_t)); - uint64_t* other = calloc(bound,sizeof(uint64_t)); - isize_dense_data_t *rec = (isize_dense_data_t *)malloc(sizeof(isize_dense_data_t)); -+ isize_t *isize = (isize_t *)malloc(sizeof(isize_t)); -+ if (!in || !out || !other || !rec || !isize) { -+ free(in); -+ free(out); -+ free(other); -+ free(rec); -+ free(isize); -+ return NULL; -+ } - rec->isize_inward = in; - rec->isize_outward = out; - rec->isize_other = other; - rec->total=bound; - -- isize_t *isize = (isize_t *)malloc(sizeof(isize_t)); -- - isize->data.dense = rec; - isize->nitems = & dense_nitems; - ---- python-pysam.orig/samtools/stats_isize.c.pysam.c -+++ 
python-pysam/samtools/stats_isize.c.pysam.c -@@ -2,7 +2,7 @@ - - /* stats_isize.c -- generalised insert size calculation for samtools stats. - -- Copyright (C) 2014 Genome Research Ltd. -+ Copyright (C) 2014, 2018 Genome Research Ltd. - - Author: Nicholas Clarke - -@@ -164,12 +164,23 @@ - if (bound <= 0) { - // Use sparse data structure. - isize_sparse_data_t *data = (isize_sparse_data_t *) malloc(sizeof(isize_sparse_data_t)); -+ if (!data) -+ return NULL; - - // Initialise - data->max = 0; - data->array = kh_init(m32); -+ if (!data->array) { -+ free(data); -+ return NULL; -+ } - - isize_t *isize = (isize_t *)malloc(sizeof(isize_t)); -+ if (!isize) { -+ kh_destroy(m32, data->array); -+ free(data); -+ return NULL; -+ } - - isize->data.sparse = data; - isize->nitems = & sparse_nitems; -@@ -194,13 +205,20 @@ - uint64_t* out = calloc(bound,sizeof(uint64_t)); - uint64_t* other = calloc(bound,sizeof(uint64_t)); - isize_dense_data_t *rec = (isize_dense_data_t *)malloc(sizeof(isize_dense_data_t)); -+ isize_t *isize = (isize_t *)malloc(sizeof(isize_t)); -+ if (!in || !out || !other || !rec || !isize) { -+ free(in); -+ free(out); -+ free(other); -+ free(rec); -+ free(isize); -+ return NULL; -+ } - rec->isize_inward = in; - rec->isize_outward = out; - rec->isize_other = other; - rec->total=bound; - -- isize_t *isize = (isize_t *)malloc(sizeof(isize_t)); -- - isize->data.dense = rec; - isize->nitems = & dense_nitems; - ---- python-pysam.orig/samtools/test/merge/test_bam_translate.c -+++ python-pysam/samtools/test/merge/test_bam_translate.c -@@ -31,10 +31,11 @@ - #include - #include - #include -+#include - - void dump_read(bam1_t* b) { - printf("->core.tid:(%d)\n", b->core.tid); -- printf("->core.pos:(%d)\n", b->core.pos); -+ printf("->core.pos:(%"PRId64")\n", (int64_t) b->core.pos); - printf("->core.bin:(%d)\n", b->core.bin); - printf("->core.qual:(%d)\n", b->core.qual); - printf("->core.l_qname:(%d)\n", b->core.l_qname); -@@ -42,8 +43,8 @@ - printf("->core.n_cigar:(%d)\n", 
b->core.n_cigar); - printf("->core.l_qseq:(%d)\n", b->core.l_qseq); - printf("->core.mtid:(%d)\n", b->core.mtid); -- printf("->core.mpos:(%d)\n", b->core.mpos); -- printf("->core.isize:(%d)\n", b->core.isize); -+ printf("->core.mpos:(%"PRId64")\n", (int64_t) b->core.mpos); -+ printf("->core.isize:(%"PRId64")\n", (int64_t) b->core.isize); - if (b->data) { - printf("->data:"); - int i; -@@ -146,7 +147,7 @@ - tbl->tid_trans[3] = 8; - int in_there = 0; - khiter_t iter = kh_put(c2c, tbl->rg_trans, strdup("hello"), &in_there); -- kh_value(tbl->rg_trans, iter) = strdup("goodbye"); -+ kh_value(tbl->rg_trans, iter) = "goodbye"; - - b->core.tid = 0; - b->core.pos = 1334; -@@ -186,7 +187,7 @@ - tbl->tid_trans[3] = 8; - int in_there = 0; - khiter_t iter = kh_put(c2c, tbl->pg_trans, strdup("hello"), &in_there); -- kh_value(tbl->pg_trans,iter) = strdup("goodbye"); -+ kh_value(tbl->pg_trans,iter) = "goodbye"; - - - b->core.tid = 0; -@@ -302,9 +303,9 @@ - tbl->tid_trans[3] = 8; - int in_there = 0; - khiter_t iter_rg = kh_put(c2c, tbl->rg_trans, strdup("hello"), &in_there); -- kh_value(tbl->rg_trans, iter_rg) = strdup("goodbye"); -+ kh_value(tbl->rg_trans, iter_rg) = "goodbye"; - khiter_t iter_pg = kh_put(c2c, tbl->pg_trans, strdup("quail"), &in_there); -- kh_value(tbl->pg_trans, iter_pg) = strdup("bird"); -+ kh_value(tbl->pg_trans, iter_pg) = "bird"; - - - b->core.tid = 0; ---- python-pysam.orig/samtools/test/merge/test_bam_translate.c.pysam.c -+++ python-pysam/samtools/test/merge/test_bam_translate.c.pysam.c -@@ -33,10 +33,11 @@ - #include - #include - #include -+#include - - void dump_read(bam1_t* b) { - fprintf(samtools_stdout, "->core.tid:(%d)\n", b->core.tid); -- fprintf(samtools_stdout, "->core.pos:(%d)\n", b->core.pos); -+ fprintf(samtools_stdout, "->core.pos:(%"PRId64")\n", (int64_t) b->core.pos); - fprintf(samtools_stdout, "->core.bin:(%d)\n", b->core.bin); - fprintf(samtools_stdout, "->core.qual:(%d)\n", b->core.qual); - fprintf(samtools_stdout, "->core.l_qname:(%d)\n", 
b->core.l_qname); -@@ -44,8 +45,8 @@ - fprintf(samtools_stdout, "->core.n_cigar:(%d)\n", b->core.n_cigar); - fprintf(samtools_stdout, "->core.l_qseq:(%d)\n", b->core.l_qseq); - fprintf(samtools_stdout, "->core.mtid:(%d)\n", b->core.mtid); -- fprintf(samtools_stdout, "->core.mpos:(%d)\n", b->core.mpos); -- fprintf(samtools_stdout, "->core.isize:(%d)\n", b->core.isize); -+ fprintf(samtools_stdout, "->core.mpos:(%"PRId64")\n", (int64_t) b->core.mpos); -+ fprintf(samtools_stdout, "->core.isize:(%"PRId64")\n", (int64_t) b->core.isize); - if (b->data) { - fprintf(samtools_stdout, "->data:"); - int i; -@@ -148,7 +149,7 @@ - tbl->tid_trans[3] = 8; - int in_there = 0; - khiter_t iter = kh_put(c2c, tbl->rg_trans, strdup("hello"), &in_there); -- kh_value(tbl->rg_trans, iter) = strdup("goodbye"); -+ kh_value(tbl->rg_trans, iter) = "goodbye"; - - b->core.tid = 0; - b->core.pos = 1334; -@@ -188,7 +189,7 @@ - tbl->tid_trans[3] = 8; - int in_there = 0; - khiter_t iter = kh_put(c2c, tbl->pg_trans, strdup("hello"), &in_there); -- kh_value(tbl->pg_trans,iter) = strdup("goodbye"); -+ kh_value(tbl->pg_trans,iter) = "goodbye"; - - - b->core.tid = 0; -@@ -304,9 +305,9 @@ - tbl->tid_trans[3] = 8; - int in_there = 0; - khiter_t iter_rg = kh_put(c2c, tbl->rg_trans, strdup("hello"), &in_there); -- kh_value(tbl->rg_trans, iter_rg) = strdup("goodbye"); -+ kh_value(tbl->rg_trans, iter_rg) = "goodbye"; - khiter_t iter_pg = kh_put(c2c, tbl->pg_trans, strdup("quail"), &in_there); -- kh_value(tbl->pg_trans, iter_pg) = strdup("bird"); -+ kh_value(tbl->pg_trans, iter_pg) = "bird"; - - - b->core.tid = 0; ---- python-pysam.orig/samtools/test/merge/test_trans_tbl_init.c -+++ python-pysam/samtools/test/merge/test_trans_tbl_init.c -@@ -1,6 +1,6 @@ - /* test/merge/test_trans_tbl_init.c -- merge test harness. - -- Copyright (C) 2013, 2014 Genome Research Ltd. -+ Copyright (C) 2013-2016, 2019 Genome Research Ltd. - - Author: Martin O. 
Pollard - -@@ -27,18 +27,19 @@ - #include "../../bam_sort.c" - #include - #include -+#include - - typedef struct refseq_info { - const char *name; - uint32_t len; - } refseq_info_t; - --void dump_header(bam_hdr_t* hdr) { -- printf("->n_targets:(%d)\n", hdr->n_targets); -+void dump_header(sam_hdr_t* hdr) { -+ printf("->n_targets:(%d)\n", sam_hdr_nref(hdr)); - int i; -- for (i = 0; i < hdr->n_targets; ++i) { -- printf("->target_name[%d]:(%s)\n",i,hdr->target_name[i]); -- printf("->target_len[%d]:(%d)\n",i,hdr->target_len[i]); -+ for (i = 0; i < sam_hdr_nref(hdr); ++i) { -+ printf("->target_name[%d]:(%s)\n", i, sam_hdr_tid2name(hdr, i)); -+ printf("->target_len[%d]:(%"PRId64")\n", i, (int64_t) sam_hdr_tid2len(hdr, i)); - } - - printf("->text:("); -@@ -46,7 +47,7 @@ - printf(")\n"); - } - --static int populate_merged_header(bam_hdr_t *hdr, merged_header_t *merged_hdr) { -+static int populate_merged_header(sam_hdr_t *hdr, merged_header_t *merged_hdr) { - trans_tbl_t dummy; - int res; - res = trans_tbl_init(merged_hdr, hdr, &dummy, 0, 0, 1, NULL); -@@ -56,55 +57,35 @@ - - /* - * Populate merged_hdr with data from bam0_header_text and bam0_refseqs. -- * Return bam_hdr_t based on the content in bam1_header_text and bam1_refseqs. -+ * Return sam_hdr_t based on the content in bam1_header_text and bam1_refseqs. 
- */ - --bam_hdr_t * setup_test(const char *bam0_header_text, -+sam_hdr_t * setup_test(const char *bam0_header_text, - const refseq_info_t *bam0_refseqs, - int32_t bam0_n_refseqs, - const char *bam1_header_text, - const refseq_info_t *bam1_refseqs, - int32_t bam1_n_refseqs, - merged_header_t *merged_hdr) { -- bam_hdr_t* bam0 = NULL; -- bam_hdr_t* bam1 = NULL; -- int32_t i; -- -- bam0 = bam_hdr_init(); -- bam0->text = strdup(bam0_header_text); -- if (!bam0->text) goto fail; -- bam0->l_text = strlen(bam0_header_text); -- bam0->n_targets = 1; -- bam0->target_name = (char**)calloc(bam0_n_refseqs, sizeof(char*)); -- bam0->target_len = (uint32_t*)calloc(bam0_n_refseqs, sizeof(uint32_t)); -- for (i = 0; i < bam0_n_refseqs; i++) { -- bam0->target_name[i] = strdup(bam0_refseqs[i].name); -- if (!bam0->target_name[i]) goto fail; -- bam0->target_len[i] = bam0_refseqs[i].len; -- } -+ sam_hdr_t* bam0 = NULL; -+ sam_hdr_t* bam1 = NULL; -+ -+ bam0 = sam_hdr_init(); -+ if (!bam0 || -1 == sam_hdr_add_lines(bam0, bam0_header_text, strlen(bam0_header_text))) -+ goto fail; - - if (populate_merged_header(bam0, merged_hdr)) goto fail; - -- bam1 = bam_hdr_init(); -- if (!bam1) goto fail; -- bam1->text = strdup(bam1_header_text); -- if (!bam1->text) goto fail; -- bam1->l_text = strlen(bam1_header_text); -- bam1->n_targets = bam1_n_refseqs; -- bam1->target_name = (char**)calloc(bam1_n_refseqs, sizeof(char*)); -- bam1->target_len = (uint32_t*)calloc(bam1_n_refseqs, sizeof(uint32_t)); -- for (i = 0; i < bam1_n_refseqs; i++) { -- bam1->target_name[i] = strdup(bam1_refseqs[i].name); -- if (!bam1->target_name[i]) goto fail; -- bam1->target_len[i] = bam1_refseqs[i].len; -- } -+ bam1 = sam_hdr_init(); -+ if (!bam1 || -1 == sam_hdr_add_lines(bam1, bam1_header_text, strlen(bam1_header_text))) -+ goto fail; - -- bam_hdr_destroy(bam0); -+ sam_hdr_destroy(bam0); - return bam1; - - fail: -- bam_hdr_destroy(bam1); -- bam_hdr_destroy(bam0); -+ sam_hdr_destroy(bam1); -+ sam_hdr_destroy(bam0); - return 
NULL; - } - -@@ -126,18 +107,18 @@ - { "fish", 133 } - }; - --bam_hdr_t * setup_test_1(merged_header_t *merged_hdr) { -+sam_hdr_t * setup_test_1(merged_header_t *merged_hdr) { - return setup_test(init_text, init_refs, NELE(init_refs), - test_1_trans_text, test_1_refs, NELE(test_1_refs), - merged_hdr); - } - --bool check_test_1(bam_hdr_t* translate, bam_hdr_t* out, trans_tbl_t* tbl) { -+bool check_test_1(sam_hdr_t* translate, sam_hdr_t* out, trans_tbl_t* tbl) { - // Check input is unchanged - if ( -- strncmp(test_1_trans_text, translate->text, translate->l_text) -- || translate->l_text != strlen( test_1_trans_text) -- || translate->n_targets != 1 -+ strncmp(test_1_trans_text, sam_hdr_str(translate), sam_hdr_length(translate)) -+ || sam_hdr_length(translate) != strlen( test_1_trans_text) -+ || sam_hdr_nref(translate) != 1 - ) return false; - - // Check output header -@@ -148,7 +129,7 @@ - regex_t check_regex; - regcomp(&check_regex, out_regex, REG_EXTENDED|REG_NOSUB); - -- if ( regexec(&check_regex, out->text, 0, NULL, 0) != 0 || out->n_targets != 1 ) return false; -+ if ( regexec(&check_regex, sam_hdr_str(out), 0, NULL, 0) != 0 || sam_hdr_nref(out) != 1 ) return false; - - regfree(&check_regex); - -@@ -161,25 +142,24 @@ - static const char test_2_trans_text[] = - "@HD\tVN:1.4\tSO:unknown\n" - "@SQ\tSN:donkey\tLN:133\n" --"@SQ\tSN:fish\tLN:133"; -+"@SQ\tSN:fish\tLN:133\n"; - - static const refseq_info_t test_2_refs[2] = { - { "donkey", 133 }, - { "fish", 133 } - }; - --bam_hdr_t * setup_test_2(merged_header_t *merged_hdr) { -+sam_hdr_t * setup_test_2(merged_header_t *merged_hdr) { - return setup_test(init_text, init_refs, NELE(init_refs), - test_2_trans_text, test_2_refs, NELE(test_2_refs), - merged_hdr); - } - --bool check_test_2(bam_hdr_t* translate, bam_hdr_t* out, trans_tbl_t* tbl) { -+bool check_test_2(sam_hdr_t* translate, sam_hdr_t* out, trans_tbl_t* tbl) { - // Check input is unchanged -- if ( -- strncmp(test_2_trans_text, translate->text, translate->l_text) 
-- || translate->l_text != strlen(test_2_trans_text) -- || translate->n_targets != 2 -+ if (sam_hdr_length(translate) != strlen(test_2_trans_text) -+ || strncmp(test_2_trans_text, sam_hdr_str(translate), sam_hdr_length(translate)) -+ || sam_hdr_nref(translate) != 2 - ) return false; - - // Check output header -@@ -191,7 +171,7 @@ - regex_t check_regex; - regcomp(&check_regex, out_regex, REG_EXTENDED|REG_NOSUB); - -- if ( regexec(&check_regex, out->text, 0, NULL, 0) != 0 || out->n_targets != 2 ) return false; -+ if ( regexec(&check_regex, sam_hdr_str(out), 0, NULL, 0) != 0 || sam_hdr_nref(out) != 2 ) return false; - - regfree(&check_regex); - -@@ -212,18 +192,18 @@ - { "fish", 133 } - }; - --bam_hdr_t * setup_test_3(merged_header_t *merged_hdr) { -+sam_hdr_t * setup_test_3(merged_header_t *merged_hdr) { - return setup_test(init_text, init_refs, NELE(init_refs), - test_3_trans_text, test_3_refs, NELE(test_3_refs), - merged_hdr); - } - --bool check_test_3(bam_hdr_t* translate, bam_hdr_t* out, trans_tbl_t* tbl) { -+bool check_test_3(sam_hdr_t* translate, sam_hdr_t* out, trans_tbl_t* tbl) { - // Check input is unchanged - if ( -- strncmp(test_3_trans_text, translate->text, translate->l_text) -- || translate->l_text != strlen(test_3_trans_text) -- || translate->n_targets != 2 -+ strncmp(test_3_trans_text, sam_hdr_str(translate), sam_hdr_length(translate)) -+ || sam_hdr_length(translate) != strlen(test_3_trans_text) -+ || sam_hdr_nref(translate) != 2 - ) return false; - return true; - } -@@ -239,7 +219,7 @@ - { "fish", 133 } - }; - --bam_hdr_t * setup_test_4(merged_header_t *merged_hdr) { -+sam_hdr_t * setup_test_4(merged_header_t *merged_hdr) { - const char* t4_init_text = - "@HD\tVN:1.4\tSO:unknown\n" - "@SQ\tSN:fish\tLN:133\tSP:frog\n" -@@ -250,12 +230,12 @@ - merged_hdr); - } - --bool check_test_4(bam_hdr_t* translate, bam_hdr_t* out, trans_tbl_t* tbl) { -+bool check_test_4(sam_hdr_t* translate, sam_hdr_t* out, trans_tbl_t* tbl) { - // Check input is unchanged - if ( 
-- strncmp(test_4_trans_text, translate->text, translate->l_text) -- || translate->l_text != strlen(test_4_trans_text) -- || translate->n_targets != 2 -+ strncmp(test_4_trans_text, sam_hdr_str(translate), sam_hdr_length(translate)) -+ || sam_hdr_length(translate) != strlen(test_4_trans_text) -+ || sam_hdr_nref(translate) != 2 - ) return false; - return true; - } -@@ -273,7 +253,7 @@ - { "fish", 133 } - }; - --bam_hdr_t * setup_test_5(merged_header_t *merged_hdr) { -+sam_hdr_t * setup_test_5(merged_header_t *merged_hdr) { - const char* t5_init_text = - "@HD\tVN:1.4\tSO:unknown\n" - "@SQ\tSN:fish\tLN:133\tSP:frog\n" -@@ -286,12 +266,12 @@ - merged_hdr); - } - --bool check_test_5(bam_hdr_t* translate, bam_hdr_t* out, trans_tbl_t* tbl) { -+bool check_test_5(sam_hdr_t* translate, sam_hdr_t* out, trans_tbl_t* tbl) { - // Check input is unchanged - if ( -- strncmp(test_5_trans_text, translate->text, translate->l_text) -- || translate->l_text != strlen(test_5_trans_text) -- || translate->n_targets != 2 -+ strncmp(test_5_trans_text, sam_hdr_str(translate), sam_hdr_length(translate)) -+ || sam_hdr_length(translate) != strlen(test_5_trans_text) -+ || sam_hdr_nref(translate) != 2 - ) return false; - return true; - } -@@ -309,18 +289,18 @@ - { "fish", 133 } - }; - --bam_hdr_t * setup_test_6(merged_header_t *merged_hdr) { -+sam_hdr_t * setup_test_6(merged_header_t *merged_hdr) { - return setup_test(init_text, init_refs, NELE(init_refs), - test_6_trans_text, test_6_refs, NELE(test_6_refs), - merged_hdr); - } - --bool check_test_6(bam_hdr_t* translate, bam_hdr_t* out, trans_tbl_t* tbl) { -+bool check_test_6(sam_hdr_t* translate, sam_hdr_t* out, trans_tbl_t* tbl) { - // Check input is unchanged - if ( -- strncmp(test_6_trans_text, translate->text, translate->l_text) -- || translate->l_text != strlen(test_5_trans_text) -- || translate->n_targets != 2 -+ strncmp(test_6_trans_text, sam_hdr_str(translate), sam_hdr_length(translate)) -+ || sam_hdr_length(translate) != 
strlen(test_5_trans_text) -+ || sam_hdr_nref(translate) != 2 - ) return false; - return true; - } -@@ -346,8 +326,8 @@ - const long GIMMICK_SEED = 0x1234330e; - srand48(GIMMICK_SEED); - -- bam_hdr_t* out; -- bam_hdr_t* translate; -+ sam_hdr_t* out; -+ sam_hdr_t* translate; - - if (verbose) printf("BEGIN test 1\n"); - // setup -@@ -362,7 +342,8 @@ - } - if (verbose) printf("RUN test 1\n"); - trans_tbl_init(merged_hdr, translate, &tbl_1, false, false, true, NULL); -- out = finish_merged_header(merged_hdr); -+ finish_merged_header(merged_hdr); -+ out = merged_hdr->hdr; - free_merged_header(merged_hdr); - if (verbose) printf("END RUN test 1\n"); - if (verbose > 1) { -@@ -380,8 +361,8 @@ - ++failure; - } - // teardown -- bam_hdr_destroy(translate); -- bam_hdr_destroy(out); -+ sam_hdr_destroy(translate); -+ sam_hdr_destroy(out); - trans_tbl_destroy(&tbl_1); - if (verbose) printf("END test 1\n"); - -@@ -399,7 +380,8 @@ - } - if (verbose) printf("RUN test 2\n"); - trans_tbl_init(merged_hdr, translate, &tbl_2, false, false, true, NULL); -- out = finish_merged_header(merged_hdr); -+ finish_merged_header(merged_hdr); -+ out = merged_hdr->hdr; - free_merged_header(merged_hdr); - if (verbose) printf("END RUN test 2\n"); - if (verbose > 1) { -@@ -417,8 +399,8 @@ - ++failure; - } - // teardown -- bam_hdr_destroy(translate); -- bam_hdr_destroy(out); -+ sam_hdr_destroy(translate); -+ sam_hdr_destroy(out); - trans_tbl_destroy(&tbl_2); - if (verbose) printf("END test 2\n"); - -@@ -435,7 +417,8 @@ - } - if (verbose) printf("RUN test 3\n"); - trans_tbl_init(merged_hdr, translate, &tbl_3, false, false, true, NULL); -- out = finish_merged_header(merged_hdr); -+ finish_merged_header(merged_hdr); -+ out = merged_hdr->hdr; - free_merged_header(merged_hdr); - if (verbose) printf("END RUN test 3\n"); - if (verbose > 1) { -@@ -453,8 +436,8 @@ - ++failure; - } - // teardown -- bam_hdr_destroy(translate); -- bam_hdr_destroy(out); -+ sam_hdr_destroy(translate); -+ sam_hdr_destroy(out); - 
trans_tbl_destroy(&tbl_3); - if (verbose) printf("END test 3\n"); - -@@ -471,7 +454,8 @@ - } - if (verbose) printf("RUN test 4\n"); - trans_tbl_init(merged_hdr, translate, &tbl_4, false, false, true, NULL); -- out = finish_merged_header(merged_hdr); -+ finish_merged_header(merged_hdr); -+ out = merged_hdr->hdr; - free_merged_header(merged_hdr); - if (verbose) printf("END RUN test 4\n"); - if (verbose > 1) { -@@ -489,8 +473,8 @@ - ++failure; - } - // teardown -- bam_hdr_destroy(translate); -- bam_hdr_destroy(out); -+ sam_hdr_destroy(translate); -+ sam_hdr_destroy(out); - trans_tbl_destroy(&tbl_4); - if (verbose) printf("END test 4\n"); - -@@ -508,7 +492,8 @@ - } - if (verbose) printf("RUN test 5\n"); - trans_tbl_init(merged_hdr, translate, &tbl_5, false, false, true, NULL); -- out = finish_merged_header(merged_hdr); -+ finish_merged_header(merged_hdr); -+ out = merged_hdr->hdr; - free_merged_header(merged_hdr); - if (verbose) printf("END RUN test 5\n"); - if (verbose > 1) { -@@ -526,8 +511,8 @@ - ++failure; - } - // teardown -- bam_hdr_destroy(translate); -- bam_hdr_destroy(out); -+ sam_hdr_destroy(translate); -+ sam_hdr_destroy(out); - trans_tbl_destroy(&tbl_5); - if (verbose) printf("END test 5\n"); - -@@ -544,7 +529,8 @@ - } - if (verbose) printf("RUN test 6\n"); - trans_tbl_init(merged_hdr, translate, &tbl_6, false, false, true, "filename"); -- out = finish_merged_header(merged_hdr); -+ finish_merged_header(merged_hdr); -+ out = merged_hdr->hdr; - free_merged_header(merged_hdr); - if (verbose) printf("END RUN test 6\n"); - if (verbose > 1) { -@@ -562,8 +548,8 @@ - ++failure; - } - // teardown -- bam_hdr_destroy(translate); -- bam_hdr_destroy(out); -+ sam_hdr_destroy(translate); -+ sam_hdr_destroy(out); - trans_tbl_destroy(&tbl_6); - if (verbose) printf("END test 6\n"); - ---- python-pysam.orig/samtools/test/merge/test_trans_tbl_init.c.pysam.c -+++ python-pysam/samtools/test/merge/test_trans_tbl_init.c.pysam.c -@@ -2,7 +2,7 @@ - - /* 
test/merge/test_trans_tbl_init.c -- merge test harness. - -- Copyright (C) 2013, 2014 Genome Research Ltd. -+ Copyright (C) 2013-2016, 2019 Genome Research Ltd. - - Author: Martin O. Pollard - -@@ -29,18 +29,19 @@ - #include "../../bam_sort.c" - #include - #include -+#include - - typedef struct refseq_info { - const char *name; - uint32_t len; - } refseq_info_t; - --void dump_header(bam_hdr_t* hdr) { -- fprintf(samtools_stdout, "->n_targets:(%d)\n", hdr->n_targets); -+void dump_header(sam_hdr_t* hdr) { -+ fprintf(samtools_stdout, "->n_targets:(%d)\n", sam_hdr_nref(hdr)); - int i; -- for (i = 0; i < hdr->n_targets; ++i) { -- fprintf(samtools_stdout, "->target_name[%d]:(%s)\n",i,hdr->target_name[i]); -- fprintf(samtools_stdout, "->target_len[%d]:(%d)\n",i,hdr->target_len[i]); -+ for (i = 0; i < sam_hdr_nref(hdr); ++i) { -+ fprintf(samtools_stdout, "->target_name[%d]:(%s)\n", i, sam_hdr_tid2name(hdr, i)); -+ fprintf(samtools_stdout, "->target_len[%d]:(%"PRId64")\n", i, (int64_t) sam_hdr_tid2len(hdr, i)); - } - - fprintf(samtools_stdout, "->text:("); -@@ -48,7 +49,7 @@ - fprintf(samtools_stdout, ")\n"); - } - --static int populate_merged_header(bam_hdr_t *hdr, merged_header_t *merged_hdr) { -+static int populate_merged_header(sam_hdr_t *hdr, merged_header_t *merged_hdr) { - trans_tbl_t dummy; - int res; - res = trans_tbl_init(merged_hdr, hdr, &dummy, 0, 0, 1, NULL); -@@ -58,55 +59,35 @@ - - /* - * Populate merged_hdr with data from bam0_header_text and bam0_refseqs. -- * Return bam_hdr_t based on the content in bam1_header_text and bam1_refseqs. -+ * Return sam_hdr_t based on the content in bam1_header_text and bam1_refseqs. 
- */ - --bam_hdr_t * setup_test(const char *bam0_header_text, -+sam_hdr_t * setup_test(const char *bam0_header_text, - const refseq_info_t *bam0_refseqs, - int32_t bam0_n_refseqs, - const char *bam1_header_text, - const refseq_info_t *bam1_refseqs, - int32_t bam1_n_refseqs, - merged_header_t *merged_hdr) { -- bam_hdr_t* bam0 = NULL; -- bam_hdr_t* bam1 = NULL; -- int32_t i; -- -- bam0 = bam_hdr_init(); -- bam0->text = strdup(bam0_header_text); -- if (!bam0->text) goto fail; -- bam0->l_text = strlen(bam0_header_text); -- bam0->n_targets = 1; -- bam0->target_name = (char**)calloc(bam0_n_refseqs, sizeof(char*)); -- bam0->target_len = (uint32_t*)calloc(bam0_n_refseqs, sizeof(uint32_t)); -- for (i = 0; i < bam0_n_refseqs; i++) { -- bam0->target_name[i] = strdup(bam0_refseqs[i].name); -- if (!bam0->target_name[i]) goto fail; -- bam0->target_len[i] = bam0_refseqs[i].len; -- } -+ sam_hdr_t* bam0 = NULL; -+ sam_hdr_t* bam1 = NULL; -+ -+ bam0 = sam_hdr_init(); -+ if (!bam0 || -1 == sam_hdr_add_lines(bam0, bam0_header_text, strlen(bam0_header_text))) -+ goto fail; - - if (populate_merged_header(bam0, merged_hdr)) goto fail; - -- bam1 = bam_hdr_init(); -- if (!bam1) goto fail; -- bam1->text = strdup(bam1_header_text); -- if (!bam1->text) goto fail; -- bam1->l_text = strlen(bam1_header_text); -- bam1->n_targets = bam1_n_refseqs; -- bam1->target_name = (char**)calloc(bam1_n_refseqs, sizeof(char*)); -- bam1->target_len = (uint32_t*)calloc(bam1_n_refseqs, sizeof(uint32_t)); -- for (i = 0; i < bam1_n_refseqs; i++) { -- bam1->target_name[i] = strdup(bam1_refseqs[i].name); -- if (!bam1->target_name[i]) goto fail; -- bam1->target_len[i] = bam1_refseqs[i].len; -- } -+ bam1 = sam_hdr_init(); -+ if (!bam1 || -1 == sam_hdr_add_lines(bam1, bam1_header_text, strlen(bam1_header_text))) -+ goto fail; - -- bam_hdr_destroy(bam0); -+ sam_hdr_destroy(bam0); - return bam1; - - fail: -- bam_hdr_destroy(bam1); -- bam_hdr_destroy(bam0); -+ sam_hdr_destroy(bam1); -+ sam_hdr_destroy(bam0); - return 
NULL; - } - -@@ -128,18 +109,18 @@ - { "fish", 133 } - }; - --bam_hdr_t * setup_test_1(merged_header_t *merged_hdr) { -+sam_hdr_t * setup_test_1(merged_header_t *merged_hdr) { - return setup_test(init_text, init_refs, NELE(init_refs), - test_1_trans_text, test_1_refs, NELE(test_1_refs), - merged_hdr); - } - --bool check_test_1(bam_hdr_t* translate, bam_hdr_t* out, trans_tbl_t* tbl) { -+bool check_test_1(sam_hdr_t* translate, sam_hdr_t* out, trans_tbl_t* tbl) { - // Check input is unchanged - if ( -- strncmp(test_1_trans_text, translate->text, translate->l_text) -- || translate->l_text != strlen( test_1_trans_text) -- || translate->n_targets != 1 -+ strncmp(test_1_trans_text, sam_hdr_str(translate), sam_hdr_length(translate)) -+ || sam_hdr_length(translate) != strlen( test_1_trans_text) -+ || sam_hdr_nref(translate) != 1 - ) return false; - - // Check output header -@@ -150,7 +131,7 @@ - regex_t check_regex; - regcomp(&check_regex, out_regex, REG_EXTENDED|REG_NOSUB); - -- if ( regexec(&check_regex, out->text, 0, NULL, 0) != 0 || out->n_targets != 1 ) return false; -+ if ( regexec(&check_regex, sam_hdr_str(out), 0, NULL, 0) != 0 || sam_hdr_nref(out) != 1 ) return false; - - regfree(&check_regex); - -@@ -163,25 +144,24 @@ - static const char test_2_trans_text[] = - "@HD\tVN:1.4\tSO:unknown\n" - "@SQ\tSN:donkey\tLN:133\n" --"@SQ\tSN:fish\tLN:133"; -+"@SQ\tSN:fish\tLN:133\n"; - - static const refseq_info_t test_2_refs[2] = { - { "donkey", 133 }, - { "fish", 133 } - }; - --bam_hdr_t * setup_test_2(merged_header_t *merged_hdr) { -+sam_hdr_t * setup_test_2(merged_header_t *merged_hdr) { - return setup_test(init_text, init_refs, NELE(init_refs), - test_2_trans_text, test_2_refs, NELE(test_2_refs), - merged_hdr); - } - --bool check_test_2(bam_hdr_t* translate, bam_hdr_t* out, trans_tbl_t* tbl) { -+bool check_test_2(sam_hdr_t* translate, sam_hdr_t* out, trans_tbl_t* tbl) { - // Check input is unchanged -- if ( -- strncmp(test_2_trans_text, translate->text, translate->l_text) 
-- || translate->l_text != strlen(test_2_trans_text) -- || translate->n_targets != 2 -+ if (sam_hdr_length(translate) != strlen(test_2_trans_text) -+ || strncmp(test_2_trans_text, sam_hdr_str(translate), sam_hdr_length(translate)) -+ || sam_hdr_nref(translate) != 2 - ) return false; - - // Check output header -@@ -193,7 +173,7 @@ - regex_t check_regex; - regcomp(&check_regex, out_regex, REG_EXTENDED|REG_NOSUB); - -- if ( regexec(&check_regex, out->text, 0, NULL, 0) != 0 || out->n_targets != 2 ) return false; -+ if ( regexec(&check_regex, sam_hdr_str(out), 0, NULL, 0) != 0 || sam_hdr_nref(out) != 2 ) return false; - - regfree(&check_regex); - -@@ -214,18 +194,18 @@ - { "fish", 133 } - }; - --bam_hdr_t * setup_test_3(merged_header_t *merged_hdr) { -+sam_hdr_t * setup_test_3(merged_header_t *merged_hdr) { - return setup_test(init_text, init_refs, NELE(init_refs), - test_3_trans_text, test_3_refs, NELE(test_3_refs), - merged_hdr); - } - --bool check_test_3(bam_hdr_t* translate, bam_hdr_t* out, trans_tbl_t* tbl) { -+bool check_test_3(sam_hdr_t* translate, sam_hdr_t* out, trans_tbl_t* tbl) { - // Check input is unchanged - if ( -- strncmp(test_3_trans_text, translate->text, translate->l_text) -- || translate->l_text != strlen(test_3_trans_text) -- || translate->n_targets != 2 -+ strncmp(test_3_trans_text, sam_hdr_str(translate), sam_hdr_length(translate)) -+ || sam_hdr_length(translate) != strlen(test_3_trans_text) -+ || sam_hdr_nref(translate) != 2 - ) return false; - return true; - } -@@ -241,7 +221,7 @@ - { "fish", 133 } - }; - --bam_hdr_t * setup_test_4(merged_header_t *merged_hdr) { -+sam_hdr_t * setup_test_4(merged_header_t *merged_hdr) { - const char* t4_init_text = - "@HD\tVN:1.4\tSO:unknown\n" - "@SQ\tSN:fish\tLN:133\tSP:frog\n" -@@ -252,12 +232,12 @@ - merged_hdr); - } - --bool check_test_4(bam_hdr_t* translate, bam_hdr_t* out, trans_tbl_t* tbl) { -+bool check_test_4(sam_hdr_t* translate, sam_hdr_t* out, trans_tbl_t* tbl) { - // Check input is unchanged - if ( 
-- strncmp(test_4_trans_text, translate->text, translate->l_text) -- || translate->l_text != strlen(test_4_trans_text) -- || translate->n_targets != 2 -+ strncmp(test_4_trans_text, sam_hdr_str(translate), sam_hdr_length(translate)) -+ || sam_hdr_length(translate) != strlen(test_4_trans_text) -+ || sam_hdr_nref(translate) != 2 - ) return false; - return true; - } -@@ -275,7 +255,7 @@ - { "fish", 133 } - }; - --bam_hdr_t * setup_test_5(merged_header_t *merged_hdr) { -+sam_hdr_t * setup_test_5(merged_header_t *merged_hdr) { - const char* t5_init_text = - "@HD\tVN:1.4\tSO:unknown\n" - "@SQ\tSN:fish\tLN:133\tSP:frog\n" -@@ -288,12 +268,12 @@ - merged_hdr); - } - --bool check_test_5(bam_hdr_t* translate, bam_hdr_t* out, trans_tbl_t* tbl) { -+bool check_test_5(sam_hdr_t* translate, sam_hdr_t* out, trans_tbl_t* tbl) { - // Check input is unchanged - if ( -- strncmp(test_5_trans_text, translate->text, translate->l_text) -- || translate->l_text != strlen(test_5_trans_text) -- || translate->n_targets != 2 -+ strncmp(test_5_trans_text, sam_hdr_str(translate), sam_hdr_length(translate)) -+ || sam_hdr_length(translate) != strlen(test_5_trans_text) -+ || sam_hdr_nref(translate) != 2 - ) return false; - return true; - } -@@ -311,18 +291,18 @@ - { "fish", 133 } - }; - --bam_hdr_t * setup_test_6(merged_header_t *merged_hdr) { -+sam_hdr_t * setup_test_6(merged_header_t *merged_hdr) { - return setup_test(init_text, init_refs, NELE(init_refs), - test_6_trans_text, test_6_refs, NELE(test_6_refs), - merged_hdr); - } - --bool check_test_6(bam_hdr_t* translate, bam_hdr_t* out, trans_tbl_t* tbl) { -+bool check_test_6(sam_hdr_t* translate, sam_hdr_t* out, trans_tbl_t* tbl) { - // Check input is unchanged - if ( -- strncmp(test_6_trans_text, translate->text, translate->l_text) -- || translate->l_text != strlen(test_5_trans_text) -- || translate->n_targets != 2 -+ strncmp(test_6_trans_text, sam_hdr_str(translate), sam_hdr_length(translate)) -+ || sam_hdr_length(translate) != 
strlen(test_5_trans_text) -+ || sam_hdr_nref(translate) != 2 - ) return false; - return true; - } -@@ -348,8 +328,8 @@ - const long GIMMICK_SEED = 0x1234330e; - srand48(GIMMICK_SEED); - -- bam_hdr_t* out; -- bam_hdr_t* translate; -+ sam_hdr_t* out; -+ sam_hdr_t* translate; - - if (verbose) fprintf(samtools_stdout, "BEGIN test 1\n"); - // setup -@@ -364,7 +344,8 @@ - } - if (verbose) fprintf(samtools_stdout, "RUN test 1\n"); - trans_tbl_init(merged_hdr, translate, &tbl_1, false, false, true, NULL); -- out = finish_merged_header(merged_hdr); -+ finish_merged_header(merged_hdr); -+ out = merged_hdr->hdr; - free_merged_header(merged_hdr); - if (verbose) fprintf(samtools_stdout, "END RUN test 1\n"); - if (verbose > 1) { -@@ -382,8 +363,8 @@ - ++failure; - } - // teardown -- bam_hdr_destroy(translate); -- bam_hdr_destroy(out); -+ sam_hdr_destroy(translate); -+ sam_hdr_destroy(out); - trans_tbl_destroy(&tbl_1); - if (verbose) fprintf(samtools_stdout, "END test 1\n"); - -@@ -401,7 +382,8 @@ - } - if (verbose) fprintf(samtools_stdout, "RUN test 2\n"); - trans_tbl_init(merged_hdr, translate, &tbl_2, false, false, true, NULL); -- out = finish_merged_header(merged_hdr); -+ finish_merged_header(merged_hdr); -+ out = merged_hdr->hdr; - free_merged_header(merged_hdr); - if (verbose) fprintf(samtools_stdout, "END RUN test 2\n"); - if (verbose > 1) { -@@ -419,8 +401,8 @@ - ++failure; - } - // teardown -- bam_hdr_destroy(translate); -- bam_hdr_destroy(out); -+ sam_hdr_destroy(translate); -+ sam_hdr_destroy(out); - trans_tbl_destroy(&tbl_2); - if (verbose) fprintf(samtools_stdout, "END test 2\n"); - -@@ -437,7 +419,8 @@ - } - if (verbose) fprintf(samtools_stdout, "RUN test 3\n"); - trans_tbl_init(merged_hdr, translate, &tbl_3, false, false, true, NULL); -- out = finish_merged_header(merged_hdr); -+ finish_merged_header(merged_hdr); -+ out = merged_hdr->hdr; - free_merged_header(merged_hdr); - if (verbose) fprintf(samtools_stdout, "END RUN test 3\n"); - if (verbose > 1) { -@@ -455,8 
+438,8 @@ - ++failure; - } - // teardown -- bam_hdr_destroy(translate); -- bam_hdr_destroy(out); -+ sam_hdr_destroy(translate); -+ sam_hdr_destroy(out); - trans_tbl_destroy(&tbl_3); - if (verbose) fprintf(samtools_stdout, "END test 3\n"); - -@@ -473,7 +456,8 @@ - } - if (verbose) fprintf(samtools_stdout, "RUN test 4\n"); - trans_tbl_init(merged_hdr, translate, &tbl_4, false, false, true, NULL); -- out = finish_merged_header(merged_hdr); -+ finish_merged_header(merged_hdr); -+ out = merged_hdr->hdr; - free_merged_header(merged_hdr); - if (verbose) fprintf(samtools_stdout, "END RUN test 4\n"); - if (verbose > 1) { -@@ -491,8 +475,8 @@ - ++failure; - } - // teardown -- bam_hdr_destroy(translate); -- bam_hdr_destroy(out); -+ sam_hdr_destroy(translate); -+ sam_hdr_destroy(out); - trans_tbl_destroy(&tbl_4); - if (verbose) fprintf(samtools_stdout, "END test 4\n"); - -@@ -510,7 +494,8 @@ - } - if (verbose) fprintf(samtools_stdout, "RUN test 5\n"); - trans_tbl_init(merged_hdr, translate, &tbl_5, false, false, true, NULL); -- out = finish_merged_header(merged_hdr); -+ finish_merged_header(merged_hdr); -+ out = merged_hdr->hdr; - free_merged_header(merged_hdr); - if (verbose) fprintf(samtools_stdout, "END RUN test 5\n"); - if (verbose > 1) { -@@ -528,8 +513,8 @@ - ++failure; - } - // teardown -- bam_hdr_destroy(translate); -- bam_hdr_destroy(out); -+ sam_hdr_destroy(translate); -+ sam_hdr_destroy(out); - trans_tbl_destroy(&tbl_5); - if (verbose) fprintf(samtools_stdout, "END test 5\n"); - -@@ -546,7 +531,8 @@ - } - if (verbose) fprintf(samtools_stdout, "RUN test 6\n"); - trans_tbl_init(merged_hdr, translate, &tbl_6, false, false, true, "filename"); -- out = finish_merged_header(merged_hdr); -+ finish_merged_header(merged_hdr); -+ out = merged_hdr->hdr; - free_merged_header(merged_hdr); - if (verbose) fprintf(samtools_stdout, "END RUN test 6\n"); - if (verbose > 1) { -@@ -564,8 +550,8 @@ - ++failure; - } - // teardown -- bam_hdr_destroy(translate); -- bam_hdr_destroy(out); -+ 
sam_hdr_destroy(translate); -+ sam_hdr_destroy(out); - trans_tbl_destroy(&tbl_6); - if (verbose) fprintf(samtools_stdout, "END test 6\n"); - ---- python-pysam.orig/samtools/test/split/test_count_rg.c -+++ python-pysam/samtools/test/split/test_count_rg.c -@@ -1,6 +1,6 @@ - /* test/split/test_count_rg.c -- split test cases. - -- Copyright (C) 2014 Genome Research Ltd. -+ Copyright (C) 2014, 2019 Genome Research Ltd. - - Author: Martin O. Pollard - -@@ -29,15 +29,14 @@ - #include - #include - --void setup_test_1(bam_hdr_t** hdr_in) -+void setup_test_1(sam_hdr_t** hdr_in) - { -- *hdr_in = bam_hdr_init(); -+ *hdr_in = sam_hdr_init(); - const char *test1 = - "@HD\tVN:1.4\n" -- "@SQ\tSN:blah\n" -+ "@SQ\tSN:blah\tLN:150\n" - "@RG\tID:fish\n"; -- (*hdr_in)->text = strdup(test1); -- (*hdr_in)->l_text = strlen(test1); -+ sam_hdr_add_lines(*hdr_in, test1, 0); - } - - int main(int argc, char**argv) -@@ -66,13 +65,14 @@ - - // Setup stderr redirect - kstring_t res = { 0, 0, NULL }; -- FILE* orig_stderr = fdopen(dup(STDERR_FILENO), "a"); // Save stderr -+ int orig_stderr = dup(STDERR_FILENO); // Save stderr -+ int redirected_stderr; - char* tempfname = (optind < argc)? 
argv[optind] : "test_count_rg.tmp"; - FILE* check = NULL; - - // setup - if (verbose) printf("BEGIN test 1\n"); // TID test -- bam_hdr_t* hdr1; -+ sam_hdr_t* hdr1; - size_t count; - char** output; - setup_test_1(&hdr1); -@@ -83,9 +83,9 @@ - if (verbose) printf("RUN test 1\n"); - - // test -- xfreopen(tempfname, "w", stderr); // Redirect stderr to pipe -+ redirected_stderr = redirect_stderr(tempfname); - bool result_1 = count_RG(hdr1, &count, &output); -- fclose(stderr); -+ flush_and_restore_stderr(orig_stderr, redirected_stderr); - - if (verbose) printf("END RUN test 1\n"); - if (verbose > 1) { -@@ -111,15 +111,15 @@ - free(output[i]); - } - free(output); -- bam_hdr_destroy(hdr1); -+ sam_hdr_destroy(hdr1); - if (verbose) printf("END test 1\n"); - - // Cleanup - free(res.s); - remove(tempfname); - if (failure > 0) -- fprintf(orig_stderr, "%d failures %d successes\n", failure, success); -- fclose(orig_stderr); -+ fprintf(stderr, "%d failures %d successes\n", failure, success); -+ close(orig_stderr); - - return (success == NUM_TESTS)? EXIT_SUCCESS : EXIT_FAILURE; - } ---- python-pysam.orig/samtools/test/split/test_count_rg.c.pysam.c -+++ python-pysam/samtools/test/split/test_count_rg.c.pysam.c -@@ -2,7 +2,7 @@ - - /* test/split/test_count_rg.c -- split test cases. - -- Copyright (C) 2014 Genome Research Ltd. -+ Copyright (C) 2014, 2019 Genome Research Ltd. - - Author: Martin O. 
Pollard - -@@ -31,15 +31,14 @@ - #include - #include - --void setup_test_1(bam_hdr_t** hdr_in) -+void setup_test_1(sam_hdr_t** hdr_in) - { -- *hdr_in = bam_hdr_init(); -+ *hdr_in = sam_hdr_init(); - const char *test1 = - "@HD\tVN:1.4\n" -- "@SQ\tSN:blah\n" -+ "@SQ\tSN:blah\tLN:150\n" - "@RG\tID:fish\n"; -- (*hdr_in)->text = strdup(test1); -- (*hdr_in)->l_text = strlen(test1); -+ sam_hdr_add_lines(*hdr_in, test1, 0); - } - - int samtools_test_count_rg_main(int argc, char**argv) -@@ -68,13 +67,14 @@ - - // Setup samtools_stderr redirect - kstring_t res = { 0, 0, NULL }; -- FILE* orig_samtools_stderr = fdopen(dup(STDERR_FILENO), "a"); // Save samtools_stderr -+ int orig_samtools_stderr = dup(STDERR_FILENO); // Save samtools_stderr -+ int redirected_samtools_stderr; - char* tempfname = (optind < argc)? argv[optind] : "test_count_rg.tmp"; - FILE* check = NULL; - - // setup - if (verbose) fprintf(samtools_stdout, "BEGIN test 1\n"); // TID test -- bam_hdr_t* hdr1; -+ sam_hdr_t* hdr1; - size_t count; - char** output; - setup_test_1(&hdr1); -@@ -85,9 +85,9 @@ - if (verbose) fprintf(samtools_stdout, "RUN test 1\n"); - - // test -- xfreopen(tempfname, "w", samtools_stderr); // Redirect samtools_stderr to pipe -+ redirected_samtools_stderr = redirect_samtools_stderr(tempfname); - bool result_1 = count_RG(hdr1, &count, &output); -- fclose(samtools_stderr); -+ flush_and_restore_samtools_stderr(orig_samtools_stderr, redirected_samtools_stderr); - - if (verbose) fprintf(samtools_stdout, "END RUN test 1\n"); - if (verbose > 1) { -@@ -113,15 +113,15 @@ - free(output[i]); - } - free(output); -- bam_hdr_destroy(hdr1); -+ sam_hdr_destroy(hdr1); - if (verbose) fprintf(samtools_stdout, "END test 1\n"); - - // Cleanup - free(res.s); - remove(tempfname); - if (failure > 0) -- fprintf(orig_samtools_stderr, "%d failures %d successes\n", failure, success); -- fclose(orig_samtools_stderr); -+ fprintf(samtools_stderr, "%d failures %d successes\n", failure, success); -+ 
close(orig_samtools_stderr); - - return (success == NUM_TESTS)? EXIT_SUCCESS : EXIT_FAILURE; - } ---- python-pysam.orig/samtools/test/split/test_expand_format_string.c -+++ python-pysam/samtools/test/split/test_expand_format_string.c -@@ -29,15 +29,14 @@ - #include - #include - --void setup_test_1(bam_hdr_t** hdr_in) -+void setup_test_1(sam_hdr_t** hdr_in) - { -- *hdr_in = bam_hdr_init(); -+ *hdr_in = sam_hdr_init(); - const char *test1 = - "@HD\tVN:1.4\n" - "@SQ\tSN:blah\n" - "@RG\tID:fish\n"; -- (*hdr_in)->text = strdup(test1); -- (*hdr_in)->l_text = strlen(test1); -+ sam_hdr_add_lines(*hdr_in, test1, 0); - } - - int main(int argc, char**argv) ---- python-pysam.orig/samtools/test/split/test_expand_format_string.c.pysam.c -+++ python-pysam/samtools/test/split/test_expand_format_string.c.pysam.c -@@ -31,15 +31,14 @@ - #include - #include - --void setup_test_1(bam_hdr_t** hdr_in) -+void setup_test_1(sam_hdr_t** hdr_in) - { -- *hdr_in = bam_hdr_init(); -+ *hdr_in = sam_hdr_init(); - const char *test1 = - "@HD\tVN:1.4\n" - "@SQ\tSN:blah\n" - "@RG\tID:fish\n"; -- (*hdr_in)->text = strdup(test1); -- (*hdr_in)->l_text = strlen(test1); -+ sam_hdr_add_lines(*hdr_in, test1, 0); - } - - int samtools_test_expand_format_string_main(int argc, char**argv) ---- python-pysam.orig/samtools/test/split/test_filter_header_rg.c -+++ python-pysam/samtools/test/split/test_filter_header_rg.c -@@ -1,6 +1,6 @@ - /* test/split/test_filter_header_rg.c -- split test cases. - -- Copyright (C) 2014 Genome Research Ltd. -+ Copyright (C) 2014-2016, 2018, 2019 Genome Research Ltd. - - Author: Martin O. 
Pollard - -@@ -24,61 +24,133 @@ - - #include - --#include "../../bam_split.c" - #include "../test.h" - #include -+#include -+#include "samtools.h" -+#include -+#include -+#include "htslib/kstring.h" -+ -+int line_cmp(const void *av, const void *bv) { -+ const char *a = *(const char **) av; -+ const char *b = *(const char **) bv; -+ size_t al = strcspn(a, "\n"); -+ size_t bl = strcspn(b, "\n"); -+ size_t min = al < bl ? al : bl; -+ int m = memcmp(a, b, min); -+ if (m != 0) return m; -+ if (al < bl) return -1; -+ return al == bl ? 0 : 1; -+} -+ -+bool hdrcmp(const char *hdr1, const char *hdr2) { -+ size_t nl1, nl2, count1 = 0, count2 = 0, i; -+ const char *l; -+ const char **lines1, **lines2; -+ int res = 0; -+ -+ // First line should be @HD -+ if (strncmp(hdr1, "@HD\t", 4) != 0) return false; -+ if (strncmp(hdr2, "@HD\t", 4) != 0) return false; -+ nl1 = strcspn(hdr1, "\n"); -+ nl2 = strcspn(hdr2, "\n"); -+ if (nl1 != nl2 || memcmp(hdr1, hdr2, nl1) != 0) return false; -+ -+ // Count lines. 
-+ for (l = hdr1 + nl1; *l != '\0'; l += strcspn(l, "\n")) ++l, ++count1; -+ for (l = hdr2 + nl2; *l != '\0'; l += strcspn(l, "\n")) ++l, ++count2; -+ if (count1 != count2) return false; -+ -+ lines1 = malloc(count1 * sizeof(*lines1)); -+ if (!lines1) return false; -+ lines2 = malloc(count2 * sizeof(*lines2)); -+ if (!lines2) { free(lines1); return false; } -+ -+ for (i = 0, l = hdr1 + nl1; *l != '\0'; l += strcspn(l, "\n")) -+ lines1[i++] = ++l; -+ for (i = 0, l = hdr2 + nl2; *l != '\0'; l += strcspn(l, "\n")) -+ lines2[i++] = ++l; -+ -+ qsort(lines1, count1, sizeof(*lines1), line_cmp); -+ qsort(lines2, count2, sizeof(*lines2), line_cmp); -+ -+ for (i = 0; i < count1; i++) { -+ res = line_cmp(&lines1[i], &lines2[i]); -+ if (res != 0) break; -+ } -+ -+ free(lines1); -+ free(lines2); -+ -+ return res?false:true; -+} - --void setup_test_1(bam_hdr_t** hdr_in) -+void setup_test_1(sam_hdr_t** hdr_in) - { -- *hdr_in = bam_hdr_init(); -+ *hdr_in = sam_hdr_init(); - const char *test1 = - "@HD\tVN:1.4\n" -- "@SQ\tSN:blah\n" -+ "@SQ\tSN:blah\tLN:1\n" - "@RG\tID:fish\n"; -- (*hdr_in)->text = strdup(test1); -- (*hdr_in)->l_text = strlen(test1); -+ sam_hdr_add_lines(*hdr_in, test1, 0); - } - --bool check_test_1(const bam_hdr_t* hdr) { -+bool check_test_1(sam_hdr_t* hdr) { - const char *test1_res = - "@HD\tVN:1.4\n" -- "@SQ\tSN:blah\n" -+ "@SQ\tSN:blah\tLN:1\n" - "@PG\tID:samtools\tPN:samtools\tVN:x.y.test\tCL:test_filter_header_rg foo bar baz\n"; - -- if (strcmp(hdr->text, test1_res)) { -- return false; -- } -- return true; -+ return hdrcmp(sam_hdr_str(hdr), test1_res); - } - --void setup_test_2(bam_hdr_t** hdr_in) -+void setup_test_2(sam_hdr_t** hdr_in) - { -- *hdr_in = bam_hdr_init(); -+ *hdr_in = sam_hdr_init(); - const char *test2 = - "@HD\tVN:1.4\n" -- "@SQ\tSN:blah\n" -+ "@SQ\tSN:blah\tLN:1\n" - "@RG\tID:fish\n"; -- (*hdr_in)->text = strdup(test2); -- (*hdr_in)->l_text = strlen(test2); -+ sam_hdr_add_lines(*hdr_in, test2, 0); - } - --bool check_test_2(const bam_hdr_t* 
hdr) { -+bool check_test_2(sam_hdr_t* hdr) { - const char *test2_res = - "@HD\tVN:1.4\n" -- "@SQ\tSN:blah\n" -+ "@SQ\tSN:blah\tLN:1\n" - "@RG\tID:fish\n" - "@PG\tID:samtools\tPN:samtools\tVN:x.y.test\tCL:test_filter_header_rg foo bar baz\n"; - -- if (strcmp(hdr->text, test2_res)) { -- return false; -- } -- return true; -+ return hdrcmp(sam_hdr_str(hdr), test2_res); -+} -+ -+void setup_test_3(sam_hdr_t** hdr_in) -+{ -+ *hdr_in = sam_hdr_init(); -+ const char *test3 = -+ "@HD\tVN:1.4\n" -+ "@SQ\tSN:blah\tLN:1\n" -+ "@RG\tID:fish1\n" -+ "@RG\tID:fish2\n" -+ "@RG\tID:fish3\n" -+ "@RG\tID:fish4\n"; -+ sam_hdr_add_lines(*hdr_in, test3, 0); -+} -+ -+bool check_test_3(sam_hdr_t* hdr) { -+ const char *test3_res = -+ "@HD\tVN:1.4\n" -+ "@SQ\tSN:blah\tLN:1\n" -+ "@PG\tID:samtools\tPN:samtools\tVN:x.y.test\tCL:test_filter_header_rg foo bar baz\n"; -+ -+ return hdrcmp(sam_hdr_str(hdr), test3_res); - } - - int main(int argc, char *argv[]) - { - // test state -- const int NUM_TESTS = 2; -+ const int NUM_TESTS = 3; - int verbose = 0; - int success = 0; - int failure = 0; -@@ -103,13 +175,14 @@ - - // Setup stderr redirect - kstring_t res = { 0, 0, NULL }; -- FILE* orig_stderr = fdopen(dup(STDERR_FILENO), "a"); // Save stderr -+ int orig_stderr = dup(STDERR_FILENO); // Save stderr -+ int redirected_stderr; - char* tempfname = (optind < argc)? 
argv[optind] : "test_count_rg.tmp"; - FILE* check = NULL; - - // setup - if (verbose) printf("BEGIN test 1\n"); // test eliminating a tag that isn't there -- bam_hdr_t* hdr1; -+ sam_hdr_t* hdr1; - const char* id_to_keep_1 = "1#2.3"; - setup_test_1(&hdr1); - if (verbose > 1) { -@@ -119,9 +192,13 @@ - if (verbose) printf("RUN test 1\n"); - - // test -- xfreopen(tempfname, "w", stderr); // Redirect stderr to pipe -- bool result_1 = filter_header_rg(hdr1, id_to_keep_1, arg_list); -- fclose(stderr); -+ redirected_stderr = redirect_stderr(tempfname); -+ bool result_1 = (!sam_hdr_remove_except(hdr1, "RG", "ID", id_to_keep_1) && -+ !sam_hdr_add_pg(hdr1, "samtools", "VN", samtools_version(), -+ arg_list ? "CL": NULL, -+ arg_list ? arg_list : NULL, -+ NULL)); -+ flush_and_restore_stderr(orig_stderr, redirected_stderr); - - if (verbose) printf("END RUN test 1\n"); - if (verbose > 1) { -@@ -144,11 +221,11 @@ - fclose(check); - - // teardown -- bam_hdr_destroy(hdr1); -+ sam_hdr_destroy(hdr1); - if (verbose) printf("END test 1\n"); - - if (verbose) printf("BEGIN test 2\n"); // test eliminating a tag that is there -- bam_hdr_t* hdr2; -+ sam_hdr_t* hdr2; - const char* id_to_keep_2 = "fish"; - setup_test_2(&hdr2); - if (verbose > 1) { -@@ -158,9 +235,13 @@ - if (verbose) printf("RUN test 2\n"); - - // test -- xfreopen(tempfname, "w", stderr); // Redirect stderr to pipe -- bool result_2 = filter_header_rg(hdr2, id_to_keep_2, arg_list); -- fclose(stderr); -+ redirected_stderr = redirect_stderr(tempfname); -+ bool result_2 = (!sam_hdr_remove_except(hdr2, "RG", "ID", id_to_keep_2) && -+ !sam_hdr_add_pg(hdr2, "samtools", "VN", samtools_version(), -+ arg_list ? "CL": NULL, -+ arg_list ? 
arg_list : NULL, -+ NULL)); -+ flush_and_restore_stderr(orig_stderr, redirected_stderr); - - if (verbose) printf("END RUN test 2\n"); - if (verbose > 1) { -@@ -183,17 +264,58 @@ - fclose(check); - - // teardown -- bam_hdr_destroy(hdr2); -+ sam_hdr_destroy(hdr2); - if (verbose) printf("END test 2\n"); - -+ if (verbose) printf("BEGIN test 3\n"); // test eliminating a tag that is there -+ sam_hdr_t* hdr3; -+ setup_test_3(&hdr3); -+ if (verbose > 1) { -+ printf("hdr3\n"); -+ dump_hdr(hdr3); -+ } -+ if (verbose) printf("RUN test 3\n"); -+ -+ // test -+ redirected_stderr = redirect_stderr(tempfname); -+ bool result_3 = (!sam_hdr_remove_except(hdr3, "RG", NULL, NULL) && -+ !sam_hdr_add_pg(hdr3, "samtools", "VN", samtools_version(), -+ arg_list ? "CL": NULL, -+ arg_list ? arg_list : NULL, -+ NULL)); -+ flush_and_restore_stderr(orig_stderr, redirected_stderr); -+ -+ if (verbose) printf("END RUN test 3\n"); -+ if (verbose > 1) { -+ printf("hdr3\n"); -+ dump_hdr(hdr3); -+ } -+ -+ // check result -+ res.l = 0; -+ check = fopen(tempfname, "r"); -+ if ( result_3 -+ && check_test_3(hdr3) -+ && kgetline(&res, (kgets_func *)fgets, check) < 0 -+ && (feof(check) || res.l == 0)) { -+ ++success; -+ } else { -+ ++failure; -+ if (verbose) printf("FAIL test 3\n"); -+ } -+ fclose(check); -+ -+ // teardown -+ sam_hdr_destroy(hdr3); -+ if (verbose) printf("END test 3\n"); - - // Cleanup - free(res.s); - free(arg_list); - remove(tempfname); - if (failure > 0) -- fprintf(orig_stderr, "%d failures %d successes\n", failure, success); -- fclose(orig_stderr); -+ fprintf(stderr, "%d failures %d successes\n", failure, success); -+ close(orig_stderr); - - return (success == NUM_TESTS)? EXIT_SUCCESS : EXIT_FAILURE; - } ---- python-pysam.orig/samtools/test/split/test_filter_header_rg.c.pysam.c -+++ python-pysam/samtools/test/split/test_filter_header_rg.c.pysam.c -@@ -2,7 +2,7 @@ - - /* test/split/test_filter_header_rg.c -- split test cases. - -- Copyright (C) 2014 Genome Research Ltd. 
-+ Copyright (C) 2014-2016, 2018, 2019 Genome Research Ltd. - - Author: Martin O. Pollard - -@@ -26,61 +26,133 @@ - - #include - --#include "../../bam_split.c" - #include "../test.h" - #include -+#include -+#include "samtools.h" -+#include -+#include -+#include "htslib/kstring.h" -+ -+int line_cmp(const void *av, const void *bv) { -+ const char *a = *(const char **) av; -+ const char *b = *(const char **) bv; -+ size_t al = strcspn(a, "\n"); -+ size_t bl = strcspn(b, "\n"); -+ size_t min = al < bl ? al : bl; -+ int m = memcmp(a, b, min); -+ if (m != 0) return m; -+ if (al < bl) return -1; -+ return al == bl ? 0 : 1; -+} -+ -+bool hdrcmp(const char *hdr1, const char *hdr2) { -+ size_t nl1, nl2, count1 = 0, count2 = 0, i; -+ const char *l; -+ const char **lines1, **lines2; -+ int res = 0; -+ -+ // First line should be @HD -+ if (strncmp(hdr1, "@HD\t", 4) != 0) return false; -+ if (strncmp(hdr2, "@HD\t", 4) != 0) return false; -+ nl1 = strcspn(hdr1, "\n"); -+ nl2 = strcspn(hdr2, "\n"); -+ if (nl1 != nl2 || memcmp(hdr1, hdr2, nl1) != 0) return false; -+ -+ // Count lines. 
-+ for (l = hdr1 + nl1; *l != '\0'; l += strcspn(l, "\n")) ++l, ++count1; -+ for (l = hdr2 + nl2; *l != '\0'; l += strcspn(l, "\n")) ++l, ++count2; -+ if (count1 != count2) return false; -+ -+ lines1 = malloc(count1 * sizeof(*lines1)); -+ if (!lines1) return false; -+ lines2 = malloc(count2 * sizeof(*lines2)); -+ if (!lines2) { free(lines1); return false; } -+ -+ for (i = 0, l = hdr1 + nl1; *l != '\0'; l += strcspn(l, "\n")) -+ lines1[i++] = ++l; -+ for (i = 0, l = hdr2 + nl2; *l != '\0'; l += strcspn(l, "\n")) -+ lines2[i++] = ++l; -+ -+ qsort(lines1, count1, sizeof(*lines1), line_cmp); -+ qsort(lines2, count2, sizeof(*lines2), line_cmp); -+ -+ for (i = 0; i < count1; i++) { -+ res = line_cmp(&lines1[i], &lines2[i]); -+ if (res != 0) break; -+ } -+ -+ free(lines1); -+ free(lines2); -+ -+ return res?false:true; -+} - --void setup_test_1(bam_hdr_t** hdr_in) -+void setup_test_1(sam_hdr_t** hdr_in) - { -- *hdr_in = bam_hdr_init(); -+ *hdr_in = sam_hdr_init(); - const char *test1 = - "@HD\tVN:1.4\n" -- "@SQ\tSN:blah\n" -+ "@SQ\tSN:blah\tLN:1\n" - "@RG\tID:fish\n"; -- (*hdr_in)->text = strdup(test1); -- (*hdr_in)->l_text = strlen(test1); -+ sam_hdr_add_lines(*hdr_in, test1, 0); - } - --bool check_test_1(const bam_hdr_t* hdr) { -+bool check_test_1(sam_hdr_t* hdr) { - const char *test1_res = - "@HD\tVN:1.4\n" -- "@SQ\tSN:blah\n" -+ "@SQ\tSN:blah\tLN:1\n" - "@PG\tID:samtools\tPN:samtools\tVN:x.y.test\tCL:test_filter_header_rg foo bar baz\n"; - -- if (strcmp(hdr->text, test1_res)) { -- return false; -- } -- return true; -+ return hdrcmp(sam_hdr_str(hdr), test1_res); - } - --void setup_test_2(bam_hdr_t** hdr_in) -+void setup_test_2(sam_hdr_t** hdr_in) - { -- *hdr_in = bam_hdr_init(); -+ *hdr_in = sam_hdr_init(); - const char *test2 = - "@HD\tVN:1.4\n" -- "@SQ\tSN:blah\n" -+ "@SQ\tSN:blah\tLN:1\n" - "@RG\tID:fish\n"; -- (*hdr_in)->text = strdup(test2); -- (*hdr_in)->l_text = strlen(test2); -+ sam_hdr_add_lines(*hdr_in, test2, 0); - } - --bool check_test_2(const bam_hdr_t* 
hdr) { -+bool check_test_2(sam_hdr_t* hdr) { - const char *test2_res = - "@HD\tVN:1.4\n" -- "@SQ\tSN:blah\n" -+ "@SQ\tSN:blah\tLN:1\n" - "@RG\tID:fish\n" - "@PG\tID:samtools\tPN:samtools\tVN:x.y.test\tCL:test_filter_header_rg foo bar baz\n"; - -- if (strcmp(hdr->text, test2_res)) { -- return false; -- } -- return true; -+ return hdrcmp(sam_hdr_str(hdr), test2_res); -+} -+ -+void setup_test_3(sam_hdr_t** hdr_in) -+{ -+ *hdr_in = sam_hdr_init(); -+ const char *test3 = -+ "@HD\tVN:1.4\n" -+ "@SQ\tSN:blah\tLN:1\n" -+ "@RG\tID:fish1\n" -+ "@RG\tID:fish2\n" -+ "@RG\tID:fish3\n" -+ "@RG\tID:fish4\n"; -+ sam_hdr_add_lines(*hdr_in, test3, 0); -+} -+ -+bool check_test_3(sam_hdr_t* hdr) { -+ const char *test3_res = -+ "@HD\tVN:1.4\n" -+ "@SQ\tSN:blah\tLN:1\n" -+ "@PG\tID:samtools\tPN:samtools\tVN:x.y.test\tCL:test_filter_header_rg foo bar baz\n"; -+ -+ return hdrcmp(sam_hdr_str(hdr), test3_res); - } - - int samtools_test_filter_header_rg_main(int argc, char *argv[]) - { - // test state -- const int NUM_TESTS = 2; -+ const int NUM_TESTS = 3; - int verbose = 0; - int success = 0; - int failure = 0; -@@ -105,13 +177,14 @@ - - // Setup samtools_stderr redirect - kstring_t res = { 0, 0, NULL }; -- FILE* orig_samtools_stderr = fdopen(dup(STDERR_FILENO), "a"); // Save samtools_stderr -+ int orig_samtools_stderr = dup(STDERR_FILENO); // Save samtools_stderr -+ int redirected_samtools_stderr; - char* tempfname = (optind < argc)? 
argv[optind] : "test_count_rg.tmp"; - FILE* check = NULL; - - // setup - if (verbose) fprintf(samtools_stdout, "BEGIN test 1\n"); // test eliminating a tag that isn't there -- bam_hdr_t* hdr1; -+ sam_hdr_t* hdr1; - const char* id_to_keep_1 = "1#2.3"; - setup_test_1(&hdr1); - if (verbose > 1) { -@@ -121,9 +194,13 @@ - if (verbose) fprintf(samtools_stdout, "RUN test 1\n"); - - // test -- xfreopen(tempfname, "w", samtools_stderr); // Redirect samtools_stderr to pipe -- bool result_1 = filter_header_rg(hdr1, id_to_keep_1, arg_list); -- fclose(samtools_stderr); -+ redirected_samtools_stderr = redirect_samtools_stderr(tempfname); -+ bool result_1 = (!sam_hdr_remove_except(hdr1, "RG", "ID", id_to_keep_1) && -+ !sam_hdr_add_pg(hdr1, "samtools", "VN", samtools_version(), -+ arg_list ? "CL": NULL, -+ arg_list ? arg_list : NULL, -+ NULL)); -+ flush_and_restore_samtools_stderr(orig_samtools_stderr, redirected_samtools_stderr); - - if (verbose) fprintf(samtools_stdout, "END RUN test 1\n"); - if (verbose > 1) { -@@ -146,11 +223,11 @@ - fclose(check); - - // teardown -- bam_hdr_destroy(hdr1); -+ sam_hdr_destroy(hdr1); - if (verbose) fprintf(samtools_stdout, "END test 1\n"); - - if (verbose) fprintf(samtools_stdout, "BEGIN test 2\n"); // test eliminating a tag that is there -- bam_hdr_t* hdr2; -+ sam_hdr_t* hdr2; - const char* id_to_keep_2 = "fish"; - setup_test_2(&hdr2); - if (verbose > 1) { -@@ -160,9 +237,13 @@ - if (verbose) fprintf(samtools_stdout, "RUN test 2\n"); - - // test -- xfreopen(tempfname, "w", samtools_stderr); // Redirect samtools_stderr to pipe -- bool result_2 = filter_header_rg(hdr2, id_to_keep_2, arg_list); -- fclose(samtools_stderr); -+ redirected_samtools_stderr = redirect_samtools_stderr(tempfname); -+ bool result_2 = (!sam_hdr_remove_except(hdr2, "RG", "ID", id_to_keep_2) && -+ !sam_hdr_add_pg(hdr2, "samtools", "VN", samtools_version(), -+ arg_list ? "CL": NULL, -+ arg_list ? 
arg_list : NULL, -+ NULL)); -+ flush_and_restore_samtools_stderr(orig_samtools_stderr, redirected_samtools_stderr); - - if (verbose) fprintf(samtools_stdout, "END RUN test 2\n"); - if (verbose > 1) { -@@ -185,17 +266,58 @@ - fclose(check); - - // teardown -- bam_hdr_destroy(hdr2); -+ sam_hdr_destroy(hdr2); - if (verbose) fprintf(samtools_stdout, "END test 2\n"); - -+ if (verbose) fprintf(samtools_stdout, "BEGIN test 3\n"); // test eliminating a tag that is there -+ sam_hdr_t* hdr3; -+ setup_test_3(&hdr3); -+ if (verbose > 1) { -+ fprintf(samtools_stdout, "hdr3\n"); -+ dump_hdr(hdr3); -+ } -+ if (verbose) fprintf(samtools_stdout, "RUN test 3\n"); -+ -+ // test -+ redirected_samtools_stderr = redirect_samtools_stderr(tempfname); -+ bool result_3 = (!sam_hdr_remove_except(hdr3, "RG", NULL, NULL) && -+ !sam_hdr_add_pg(hdr3, "samtools", "VN", samtools_version(), -+ arg_list ? "CL": NULL, -+ arg_list ? arg_list : NULL, -+ NULL)); -+ flush_and_restore_samtools_stderr(orig_samtools_stderr, redirected_samtools_stderr); -+ -+ if (verbose) fprintf(samtools_stdout, "END RUN test 3\n"); -+ if (verbose > 1) { -+ fprintf(samtools_stdout, "hdr3\n"); -+ dump_hdr(hdr3); -+ } -+ -+ // check result -+ res.l = 0; -+ check = fopen(tempfname, "r"); -+ if ( result_3 -+ && check_test_3(hdr3) -+ && kgetline(&res, (kgets_func *)fgets, check) < 0 -+ && (feof(check) || res.l == 0)) { -+ ++success; -+ } else { -+ ++failure; -+ if (verbose) fprintf(samtools_stdout, "FAIL test 3\n"); -+ } -+ fclose(check); -+ -+ // teardown -+ sam_hdr_destroy(hdr3); -+ if (verbose) fprintf(samtools_stdout, "END test 3\n"); - - // Cleanup - free(res.s); - free(arg_list); - remove(tempfname); - if (failure > 0) -- fprintf(orig_samtools_stderr, "%d failures %d successes\n", failure, success); -- fclose(orig_samtools_stderr); -+ fprintf(samtools_stderr, "%d failures %d successes\n", failure, success); -+ close(orig_samtools_stderr); - - return (success == NUM_TESTS)? 
EXIT_SUCCESS : EXIT_FAILURE; - } ---- python-pysam.orig/samtools/test/test.c -+++ python-pysam/samtools/test/test.c -@@ -1,6 +1,6 @@ - /* test/test.c -- test harness utility routines. - -- Copyright (C) 2014, 2016 Genome Research Ltd. -+ Copyright (C) 2014, 2016, 2019 Genome Research Ltd. - - Author: Martin O. Pollard - -@@ -28,6 +28,12 @@ - #include - #include - #include -+#include -+#include -+#include -+#include -+#include -+#include - #include - - #include "test.h" -@@ -41,17 +47,34 @@ - } - } - --void dump_hdr(const bam_hdr_t* hdr) -+int redirect_stderr(const char *path) { -+ int fd = open(path, O_WRONLY|O_TRUNC|O_CREAT, 0666); -+ if (!fd) { -+ fprintf(stderr, "Couldn't open \"%s\" : %s\n", path, strerror(errno)); -+ exit(2); -+ } -+ fflush(stderr); -+ dup2(fd, STDERR_FILENO); -+ return fd; -+} -+ -+void flush_and_restore_stderr(int orig_stderr, int redirect_fd) { -+ fflush(stderr); -+ dup2(orig_stderr, STDERR_FILENO); -+ close(redirect_fd); -+} -+ -+void dump_hdr(const sam_hdr_t* hdr) - { -- printf("n_targets: %d\n", hdr->n_targets); -+ printf("n_targets: %d\n", sam_hdr_nref(hdr)); - printf("ignore_sam_err: %d\n", hdr->ignore_sam_err); -- printf("l_text: %u\n", hdr->l_text); -+ printf("l_text: %zu\n", (size_t) sam_hdr_length((sam_hdr_t*)hdr)); - printf("idx\ttarget_len\ttarget_name:\n"); - int32_t target; -- for (target = 0; target < hdr->n_targets; ++target) { -- printf("%d\t%u\t\"%s\"\n", target, hdr->target_len[target], hdr->target_name[target]); -+ for (target = 0; target < sam_hdr_nref(hdr); ++target) { -+ printf("%d\t%"PRId64"\t\"%s\"\n", target, (int64_t) sam_hdr_tid2len(hdr, target), sam_hdr_tid2name(hdr, target)); - } -- printf("text: \"%s\"\n", hdr->text); -+ printf("text: \"%s\"\n", sam_hdr_str((sam_hdr_t*)hdr)); - } - - // For tests, just return a constant that can be embedded in expected output. 
---- python-pysam.orig/samtools/test/test.c.pysam.c -+++ python-pysam/samtools/test/test.c.pysam.c -@@ -2,7 +2,7 @@ - - /* test/test.c -- test harness utility routines. - -- Copyright (C) 2014, 2016 Genome Research Ltd. -+ Copyright (C) 2014, 2016, 2019 Genome Research Ltd. - - Author: Martin O. Pollard - -@@ -30,6 +30,12 @@ - #include - #include - #include -+#include -+#include -+#include -+#include -+#include -+#include - #include - - #include "test.h" -@@ -43,17 +49,34 @@ - } - } - --void dump_hdr(const bam_hdr_t* hdr) -+int redirect_samtools_stderr(const char *path) { -+ int fd = open(path, O_WRONLY|O_TRUNC|O_CREAT, 0666); -+ if (!fd) { -+ fprintf(samtools_stderr, "Couldn't open \"%s\" : %s\n", path, strerror(errno)); -+ exit(2); -+ } -+ fflush(samtools_stderr); -+ dup2(fd, STDERR_FILENO); -+ return fd; -+} -+ -+void flush_and_restore_samtools_stderr(int orig_samtools_stderr, int redirect_fd) { -+ fflush(samtools_stderr); -+ dup2(orig_samtools_stderr, STDERR_FILENO); -+ close(redirect_fd); -+} -+ -+void dump_hdr(const sam_hdr_t* hdr) - { -- fprintf(samtools_stdout, "n_targets: %d\n", hdr->n_targets); -+ fprintf(samtools_stdout, "n_targets: %d\n", sam_hdr_nref(hdr)); - fprintf(samtools_stdout, "ignore_sam_err: %d\n", hdr->ignore_sam_err); -- fprintf(samtools_stdout, "l_text: %u\n", hdr->l_text); -+ fprintf(samtools_stdout, "l_text: %zu\n", (size_t) sam_hdr_length((sam_hdr_t*)hdr)); - fprintf(samtools_stdout, "idx\ttarget_len\ttarget_name:\n"); - int32_t target; -- for (target = 0; target < hdr->n_targets; ++target) { -- fprintf(samtools_stdout, "%d\t%u\t\"%s\"\n", target, hdr->target_len[target], hdr->target_name[target]); -+ for (target = 0; target < sam_hdr_nref(hdr); ++target) { -+ fprintf(samtools_stdout, "%d\t%"PRId64"\t\"%s\"\n", target, (int64_t) sam_hdr_tid2len(hdr, target), sam_hdr_tid2name(hdr, target)); - } -- fprintf(samtools_stdout, "text: \"%s\"\n", hdr->text); -+ fprintf(samtools_stdout, "text: \"%s\"\n", sam_hdr_str((sam_hdr_t*)hdr)); - } - - // 
For tests, just return a constant that can be embedded in expected output. ---- python-pysam.orig/samtools/test/test.h -+++ python-pysam/samtools/test/test.h -@@ -30,6 +30,9 @@ - - void xfreopen(const char *path, const char *mode, FILE *stream); - --void dump_hdr(const bam_hdr_t* hdr); -+int redirect_stderr(const char *path); -+void flush_and_restore_stderr(int orig_stderr, int redirect_fd); -+ -+void dump_hdr(const sam_hdr_t* hdr); - - #endif ---- python-pysam.orig/samtools/tmp_file.c -+++ python-pysam/samtools/tmp_file.c -@@ -2,7 +2,7 @@ - tmp_file.c - write to and read from a temporary binary file - for fast storage plus added compression. - -- Copyright (C) 2017 Genome Research Ltd. -+ Copyright (C) 2017, 2018 Genome Research Ltd. - - Author: Andrew Whitwham - -@@ -66,7 +66,6 @@ - tmp->max_data_size = TMP_SAM_MAX_DATA + sizeof(bam1_t); // arbitrary but growable - tmp->ring_buffer_size = TMP_SAM_RING_SIZE; // arbitrary (min 64K) but growable - tmp->comp_buffer_size = LZ4_COMPRESSBOUND(tmp->max_data_size * tmp->group_size); -- tmp->data = NULL; - tmp->ring_buffer = malloc(sizeof(uint8_t) * tmp->ring_buffer_size); - tmp->ring_index = tmp->ring_buffer; - tmp->comp_buffer = malloc(tmp->comp_buffer_size); -@@ -184,7 +183,7 @@ - - - /* -- * This does the actual compression and writing to disk. On disk format consists of a -+ * This does the actual compression and writing to a file. The file format consists of a - * single size_t for the size of the compressed data followed by the data itself. - * Returns 0 on success, a negative number on failure. - */ -@@ -244,16 +243,16 @@ - - /* - * Stores an in memory bam structure for writing and if enough are gathered together writes -- * it to disk. Mulitiple alignments compress better that single ones though after a certain number -+ * it to a file. Multiple alignments compress better that single ones though after a certain number - * there is a law of diminishing returns. 
- * Returns 0 on success, a negative number on failure. - */ - int tmp_file_write(tmp_file_t *tmp, bam1_t *inbam) { - -- if ((tmp->input_size + sizeof(bam1_t) + inbam->l_data) >= tmp->ring_buffer_size) { -+ if ((tmp->offset + tmp->input_size + sizeof(bam1_t) + inbam->l_data) >= tmp->ring_buffer_size) { - int ret; - -- if ((ret = tmp_file_grow_ring_buffer(tmp, (tmp->input_size + sizeof(bam1_t) + inbam->l_data) * 5))) { -+ if ((ret = tmp_file_grow_ring_buffer(tmp, (tmp->offset + tmp->input_size + sizeof(bam1_t) + inbam->l_data) * 2))) { - tmp_print_error(tmp, "[tmp_file] Error: input line too big. (%ld).\n", - (tmp->input_size + inbam->l_data)); - -@@ -283,70 +282,8 @@ - - - /* -- * Closes the file after writing out any remaining alignments. Adds a size_t 0 to -- * mark the end of the file. Companion function to tmp_file_open_read below. -- * Returns 0 on success, a negative number on failure. -- */ --int tmp_file_close_write(tmp_file_t *tmp) { -- size_t terminator = 0; -- -- if (tmp->entry_number) { -- int ret; -- -- if ((ret = tmp_file_write_to_file(tmp))) { -- return ret; -- } -- } -- -- if (fwrite(&terminator, sizeof(size_t), 1, tmp->fp) < 1) { -- tmp_print_error(tmp, "[tmp_file] Error: tmp file write terminator failed.\n"); -- return TMP_SAM_FILE_ERROR; -- } -- -- if (fclose(tmp->fp)) { -- tmp_print_error(tmp, "[tmp_file] Error: closing tmp file %s failed.\n", tmp->name); -- return TMP_SAM_FILE_ERROR; -- } -- -- LZ4_freeStream(tmp->stream); -- -- return TMP_SAM_OK; --} -- -- --/* -- * Opens the file for reading. Optionally, if given a pointer to an existing -- * bam1_t structure, it will free the data entry to prevent memory leaks. -- * Companion function to tmp_file_close_write above. -- * Returns 0 on success, a negative number on failure. 
-- */ --int tmp_file_open_read(tmp_file_t *tmp, bam1_t *inbam) { -- -- if ((tmp->fp = fopen(tmp->name, "rb")) == NULL) { -- tmp_print_error(tmp, "[tmp_file] Error: unable to open read file %s.\n", tmp->name); -- return TMP_SAM_FILE_ERROR; -- } -- -- tmp->dstream = LZ4_createStreamDecode(); -- tmp->offset = 0; -- -- if (inbam) { -- free(inbam->data); -- } -- -- if (!tmp->dstream) { -- tmp_print_error(tmp, "[tmp_file] Error: unable to allocate compression stream.\n"); -- return TMP_SAM_MEM_ERROR; -- } -- -- -- return TMP_SAM_OK; --} -- -- --/* -- * An alternative to tmp_file_close_write that does the same job without actually -- * closing the file. Companion function to tmp_file_begin_read below. -+ * Marks the end of file writing. Adds a size_t 0 to mark the end of -+ * the file. Companion function to tmp_file_begin_read below. - * Returns 0 on success, a negative number on failure. - */ - int tmp_file_end_write(tmp_file_t *tmp) { -@@ -374,11 +311,11 @@ - - - /* -- * An alternative to tmp_file_open_read but works on an open file. -+ * Prepares the file for reading. - * Companion function to tmp_file_end_write above. - * Returns 0 on success, a negative number on failure. - */ --int tmp_file_begin_read(tmp_file_t *tmp, bam1_t *inbam) { -+int tmp_file_begin_read(tmp_file_t *tmp) { - - rewind(tmp->fp); - -@@ -386,10 +323,6 @@ - tmp->offset = 0; - tmp->entry_number = tmp->group_size; - -- if (inbam) { -- free(inbam->data); -- } -- - if (!tmp->dstream) { - tmp_print_error(tmp, "[tmp_file] Error: unable to allocate compression stream.\n"); - return TMP_SAM_MEM_ERROR; -@@ -400,11 +333,19 @@ - - - /* -- * Read the next alignment, either from memory or from disk. -+ * Read the next alignment, either from memory or from a file. - * Returns size of entry on success, 0 on end of file or a negative on error. 
- */ - int tmp_file_read(tmp_file_t *tmp, bam1_t *inbam) { - int entry_size; -+ uint8_t *data = inbam->data; -+ -+ /* while tmp_file_read assumes that the same bam1_t variable -+ is being used in each call, this may not be the case. So -+ default to the lowest memory size for safety. */ -+ if (tmp->data_size > inbam->m_data) { -+ tmp->data_size = inbam->m_data; -+ } - - if (tmp->entry_number == tmp->group_size) { - // read more data -@@ -438,17 +379,22 @@ - - tmp->ring_index = tmp->ring_buffer + tmp->offset; - memcpy(inbam, tmp->ring_index, sizeof(bam1_t)); -+ inbam->data = data; // put the pointer to real bam data back - - if ((unsigned int)inbam->l_data > tmp->data_size) { -- if ((tmp->data = realloc(tmp->data, sizeof(uint8_t) * inbam->l_data)) == NULL) { -- tmp_print_error(tmp, "[tmp_file] Error: unable to allocate tmp data memory.\n"); -+ uint8_t *tmp_data; -+ tmp->data_size = inbam->l_data; kroundup32(tmp->data_size); -+ -+ if ((tmp_data = realloc(inbam->data, sizeof(uint8_t) * tmp->data_size)) == NULL) { -+ tmp_print_error(tmp, "[tmp_file] Error: unable to allocate tmp bam data memory.\n"); - return TMP_SAM_MEM_ERROR; - } - -- tmp->data_size = inbam->l_data; -+ inbam->data = tmp_data; - } - -- inbam->data = tmp->data; -+ inbam->m_data = tmp->data_size; // set to the actual data size -+ - entry_size = sizeof(bam1_t); - - memcpy(inbam->data, tmp->ring_index + entry_size, inbam->l_data); -@@ -474,34 +420,19 @@ - - - /* -- * Frees up memory, closes the file and optionally deletes it. Giving this function -- * pointer to the bam1_t structure used for reading will set its data value to null, -- * preventing bam_destroy1() from trying to free already freed memory. -- * Returns 0 on success, a negative number or EOF on failure. -+ * Frees up memory, closes the file and deletes it. -+ * Returns 0 on success or EOF on failure. 
- */ --int tmp_file_destroy(tmp_file_t *tmp, bam1_t *inbam, int delete) { -+int tmp_file_destroy(tmp_file_t *tmp) { - int ret = 0; - - ret = fclose(tmp->fp); - -- if (delete && ret == 0) { -- if (unlink(tmp->name)) { -- tmp_print_error(tmp, "[tmp_file] Error: unable to delete file %s.\n", tmp->name); -- ret = TMP_SAM_FILE_ERROR; -- } -- } -- - LZ4_freeStreamDecode(tmp->dstream); - free(tmp->ring_buffer); - free(tmp->comp_buffer); - free(tmp->name); -- free(tmp->data); - free(tmp->dict); - -- -- if (inbam) { -- inbam->data = NULL; -- } -- - return ret; - } ---- python-pysam.orig/samtools/tmp_file.c.pysam.c -+++ python-pysam/samtools/tmp_file.c.pysam.c -@@ -4,7 +4,7 @@ - tmp_file.c - write to and read from a temporary binary file - for fast storage plus added compression. - -- Copyright (C) 2017 Genome Research Ltd. -+ Copyright (C) 2017, 2018 Genome Research Ltd. - - Author: Andrew Whitwham - -@@ -68,7 +68,6 @@ - tmp->max_data_size = TMP_SAM_MAX_DATA + sizeof(bam1_t); // arbitrary but growable - tmp->ring_buffer_size = TMP_SAM_RING_SIZE; // arbitrary (min 64K) but growable - tmp->comp_buffer_size = LZ4_COMPRESSBOUND(tmp->max_data_size * tmp->group_size); -- tmp->data = NULL; - tmp->ring_buffer = malloc(sizeof(uint8_t) * tmp->ring_buffer_size); - tmp->ring_index = tmp->ring_buffer; - tmp->comp_buffer = malloc(tmp->comp_buffer_size); -@@ -186,7 +185,7 @@ - - - /* -- * This does the actual compression and writing to disk. On disk format consists of a -+ * This does the actual compression and writing to a file. The file format consists of a - * single size_t for the size of the compressed data followed by the data itself. - * Returns 0 on success, a negative number on failure. - */ -@@ -246,16 +245,16 @@ - - /* - * Stores an in memory bam structure for writing and if enough are gathered together writes -- * it to disk. Mulitiple alignments compress better that single ones though after a certain number -+ * it to a file. 
Multiple alignments compress better that single ones though after a certain number - * there is a law of diminishing returns. - * Returns 0 on success, a negative number on failure. - */ - int tmp_file_write(tmp_file_t *tmp, bam1_t *inbam) { - -- if ((tmp->input_size + sizeof(bam1_t) + inbam->l_data) >= tmp->ring_buffer_size) { -+ if ((tmp->offset + tmp->input_size + sizeof(bam1_t) + inbam->l_data) >= tmp->ring_buffer_size) { - int ret; - -- if ((ret = tmp_file_grow_ring_buffer(tmp, (tmp->input_size + sizeof(bam1_t) + inbam->l_data) * 5))) { -+ if ((ret = tmp_file_grow_ring_buffer(tmp, (tmp->offset + tmp->input_size + sizeof(bam1_t) + inbam->l_data) * 2))) { - tmp_print_error(tmp, "[tmp_file] Error: input line too big. (%ld).\n", - (tmp->input_size + inbam->l_data)); - -@@ -285,70 +284,8 @@ - - - /* -- * Closes the file after writing out any remaining alignments. Adds a size_t 0 to -- * mark the end of the file. Companion function to tmp_file_open_read below. -- * Returns 0 on success, a negative number on failure. -- */ --int tmp_file_close_write(tmp_file_t *tmp) { -- size_t terminator = 0; -- -- if (tmp->entry_number) { -- int ret; -- -- if ((ret = tmp_file_write_to_file(tmp))) { -- return ret; -- } -- } -- -- if (fwrite(&terminator, sizeof(size_t), 1, tmp->fp) < 1) { -- tmp_print_error(tmp, "[tmp_file] Error: tmp file write terminator failed.\n"); -- return TMP_SAM_FILE_ERROR; -- } -- -- if (fclose(tmp->fp)) { -- tmp_print_error(tmp, "[tmp_file] Error: closing tmp file %s failed.\n", tmp->name); -- return TMP_SAM_FILE_ERROR; -- } -- -- LZ4_freeStream(tmp->stream); -- -- return TMP_SAM_OK; --} -- -- --/* -- * Opens the file for reading. Optionally, if given a pointer to an existing -- * bam1_t structure, it will free the data entry to prevent memory leaks. -- * Companion function to tmp_file_close_write above. -- * Returns 0 on success, a negative number on failure. 
-- */ --int tmp_file_open_read(tmp_file_t *tmp, bam1_t *inbam) { -- -- if ((tmp->fp = fopen(tmp->name, "rb")) == NULL) { -- tmp_print_error(tmp, "[tmp_file] Error: unable to open read file %s.\n", tmp->name); -- return TMP_SAM_FILE_ERROR; -- } -- -- tmp->dstream = LZ4_createStreamDecode(); -- tmp->offset = 0; -- -- if (inbam) { -- free(inbam->data); -- } -- -- if (!tmp->dstream) { -- tmp_print_error(tmp, "[tmp_file] Error: unable to allocate compression stream.\n"); -- return TMP_SAM_MEM_ERROR; -- } -- -- -- return TMP_SAM_OK; --} -- -- --/* -- * An alternative to tmp_file_close_write that does the same job without actually -- * closing the file. Companion function to tmp_file_begin_read below. -+ * Marks the end of file writing. Adds a size_t 0 to mark the end of -+ * the file. Companion function to tmp_file_begin_read below. - * Returns 0 on success, a negative number on failure. - */ - int tmp_file_end_write(tmp_file_t *tmp) { -@@ -376,11 +313,11 @@ - - - /* -- * An alternative to tmp_file_open_read but works on an open file. -+ * Prepares the file for reading. - * Companion function to tmp_file_end_write above. - * Returns 0 on success, a negative number on failure. - */ --int tmp_file_begin_read(tmp_file_t *tmp, bam1_t *inbam) { -+int tmp_file_begin_read(tmp_file_t *tmp) { - - rewind(tmp->fp); - -@@ -388,10 +325,6 @@ - tmp->offset = 0; - tmp->entry_number = tmp->group_size; - -- if (inbam) { -- free(inbam->data); -- } -- - if (!tmp->dstream) { - tmp_print_error(tmp, "[tmp_file] Error: unable to allocate compression stream.\n"); - return TMP_SAM_MEM_ERROR; -@@ -402,11 +335,19 @@ - - - /* -- * Read the next alignment, either from memory or from disk. -+ * Read the next alignment, either from memory or from a file. - * Returns size of entry on success, 0 on end of file or a negative on error. 
- */ - int tmp_file_read(tmp_file_t *tmp, bam1_t *inbam) { - int entry_size; -+ uint8_t *data = inbam->data; -+ -+ /* while tmp_file_read assumes that the same bam1_t variable -+ is being used in each call, this may not be the case. So -+ default to the lowest memory size for safety. */ -+ if (tmp->data_size > inbam->m_data) { -+ tmp->data_size = inbam->m_data; -+ } - - if (tmp->entry_number == tmp->group_size) { - // read more data -@@ -440,17 +381,22 @@ - - tmp->ring_index = tmp->ring_buffer + tmp->offset; - memcpy(inbam, tmp->ring_index, sizeof(bam1_t)); -+ inbam->data = data; // put the pointer to real bam data back - - if ((unsigned int)inbam->l_data > tmp->data_size) { -- if ((tmp->data = realloc(tmp->data, sizeof(uint8_t) * inbam->l_data)) == NULL) { -- tmp_print_error(tmp, "[tmp_file] Error: unable to allocate tmp data memory.\n"); -+ uint8_t *tmp_data; -+ tmp->data_size = inbam->l_data; kroundup32(tmp->data_size); -+ -+ if ((tmp_data = realloc(inbam->data, sizeof(uint8_t) * tmp->data_size)) == NULL) { -+ tmp_print_error(tmp, "[tmp_file] Error: unable to allocate tmp bam data memory.\n"); - return TMP_SAM_MEM_ERROR; - } - -- tmp->data_size = inbam->l_data; -+ inbam->data = tmp_data; - } - -- inbam->data = tmp->data; -+ inbam->m_data = tmp->data_size; // set to the actual data size -+ - entry_size = sizeof(bam1_t); - - memcpy(inbam->data, tmp->ring_index + entry_size, inbam->l_data); -@@ -476,34 +422,19 @@ - - - /* -- * Frees up memory, closes the file and optionally deletes it. Giving this function -- * pointer to the bam1_t structure used for reading will set its data value to null, -- * preventing bam_destroy1() from trying to free already freed memory. -- * Returns 0 on success, a negative number or EOF on failure. -+ * Frees up memory, closes the file and deletes it. -+ * Returns 0 on success or EOF on failure. 
- */ --int tmp_file_destroy(tmp_file_t *tmp, bam1_t *inbam, int delete) { -+int tmp_file_destroy(tmp_file_t *tmp) { - int ret = 0; - - ret = fclose(tmp->fp); - -- if (delete && ret == 0) { -- if (unlink(tmp->name)) { -- tmp_print_error(tmp, "[tmp_file] Error: unable to delete file %s.\n", tmp->name); -- ret = TMP_SAM_FILE_ERROR; -- } -- } -- - LZ4_freeStreamDecode(tmp->dstream); - free(tmp->ring_buffer); - free(tmp->comp_buffer); - free(tmp->name); -- free(tmp->data); - free(tmp->dict); - -- -- if (inbam) { -- inbam->data = NULL; -- } -- - return ret; - } ---- python-pysam.orig/samtools/tmp_file.h -+++ python-pysam/samtools/tmp_file.h -@@ -2,7 +2,7 @@ - tmp_file.h - write to and read from a temporary binary file - for fast storage plus added compression. - -- Copyright (C) 2017 Genome Research Ltd. -+ Copyright (C) 2017, 2018 Genome Research Ltd. - - Author: Andrew Whitwham - -@@ -58,7 +58,6 @@ - size_t ring_buffer_size; - size_t comp_buffer_size; - size_t offset; -- uint8_t *data; - uint8_t *ring_buffer; - uint8_t *ring_index; - char *comp_buffer; -@@ -84,7 +83,7 @@ - - /* - * Stores an in memory bam structure for writing and if enough are gathered together writes -- * it to disk. Mulitiple alignments compress better that single ones though after a certain number -+ * it to a file. Multiple alignments compress better that single ones though after a certain number - * there is a law of diminishing returns. - * Returns 0 on success, a negative number on failure. - */ -@@ -92,50 +91,31 @@ - - - /* -- * Closes the file after writing out any remaining alignments. Adds a size_t 0 to -- * mark the end of the file. Companion function to tmp_file_open_read below. -- * Returns 0 on success, a negative number on failure. -- */ --int tmp_file_close_write(tmp_file_t *tmp); -- -- --/* -- * Opens the file for reading. Optionally, if given a pointer to an existing -- * bam1_t structure, it will free the data entry to prevent memory leaks. 
-- * Companion function to tmp_file_close_write above. -- * Returns 0 on success, a negative number on failure. -- */ --int tmp_file_open_read(tmp_file_t *tmp, bam1_t *inbam); -- -- --/* -- * An alternative to tmp_file_close_write that does the same job without actually -- * closing the file. Companion function to tmp_file_begin_read below. -+ * Marks the end of file writing. Adds a size_t 0 to mark the end of -+ * the file. Companion function to tmp_file_begin_read below. - * Returns 0 on success, a negative number on failure. - */ - int tmp_file_end_write(tmp_file_t *tmp); - - /* -- * An alternative to tmp_file_open_read but works on an open file. -+ * Prepares the file for reading. - * Companion function to tmp_file_end_write above. - * Returns 0 on success, a negative number on failure. - */ --int tmp_file_begin_read(tmp_file_t *tmp, bam1_t *inbam); -+int tmp_file_begin_read(tmp_file_t *tmp); - - /* -- * Read the next alignment, either from memory or from disk. -+ * Read the next alignment, either from memory or from a file. - * Returns size of entry on success, 0 on end of file or a negative on error. - */ - int tmp_file_read(tmp_file_t *tmp, bam1_t *inbam); - - - /* -- * Frees up memory, closes the file and optionally deletes it. Giving this function -- * pointer to the bam1_t structure used for reading will set its data value to null, -- * preventing bam_destroy1() from trying to free already freed memory. -- * Returns 0 on success, a negative number or EOF on failure. -+ * Frees up memory, closes the file and deletes it. -+ * Returns 0 on success or EOF on failure. 
- */ --int tmp_file_destroy(tmp_file_t *tmp, bam1_t *inbam, int delete); -+int tmp_file_destroy(tmp_file_t *tmp); - - #ifdef __cplusplus - } ---- python-pysam.orig/samtools/version.h -+++ python-pysam/samtools/version.h -@@ -1 +1 @@ --#define SAMTOOLS_VERSION "1.9" -+#define SAMTOOLS_VERSION "1.10" ---- python-pysam.orig/samtools/win32/xcurses.h -+++ /dev/null -@@ -1,1377 +0,0 @@ --/* Public Domain Curses */ -- --/* $Id: curses.h,v 1.295 2008/07/15 17:13:25 wmcbrine Exp $ */ -- --/*----------------------------------------------------------------------* -- * PDCurses * -- *----------------------------------------------------------------------*/ -- --#ifndef __PDCURSES__ --#define __PDCURSES__ 1 -- --/*man-start************************************************************** -- --PDCurses definitions list: (Only define those needed) -- -- XCURSES True if compiling for X11. -- PDC_RGB True if you want to use RGB color definitions -- (Red = 1, Green = 2, Blue = 4) instead of BGR. -- PDC_WIDE True if building wide-character support. -- PDC_DLL_BUILD True if building a Win32 DLL. -- NCURSES_MOUSE_VERSION Use the ncurses mouse API instead -- of PDCurses' traditional mouse API. -- --PDCurses portable platform definitions list: -- -- PDC_BUILD Defines API build version. -- PDCURSES Enables access to PDCurses-only routines. -- XOPEN Always true. -- SYSVcurses True if you are compiling for SYSV portability. -- BSDcurses True if you are compiling for BSD portability. 
-- --**man-end****************************************************************/ -- --#define PDC_BUILD 3401 --#define PDCURSES 1 /* PDCurses-only routines */ --#define XOPEN 1 /* X/Open Curses routines */ --#define SYSVcurses 1 /* System V Curses routines */ --#define BSDcurses 1 /* BSD Curses routines */ --#define CHTYPE_LONG 1 /* size of chtype; long */ -- --/*----------------------------------------------------------------------*/ -- --#include --#include --#include /* Required by X/Open usage below */ -- --#ifdef PDC_WIDE --# include --#endif -- --#if defined(__cplusplus) || defined(__cplusplus__) || defined(__CPLUSPLUS) --extern "C" --{ --# define bool _bool --#endif -- --/*---------------------------------------------------------------------- -- * -- * PDCurses Manifest Constants -- * -- */ -- --#ifndef FALSE --# define FALSE 0 --#endif --#ifndef TRUE --# define TRUE 1 --#endif --#ifndef NULL --# define NULL (void *)0 --#endif --#ifndef ERR --# define ERR (-1) --#endif --#ifndef OK --# define OK 0 --#endif -- --/*---------------------------------------------------------------------- -- * -- * PDCurses Type Declarations -- * -- */ -- --typedef unsigned char bool; /* PDCurses Boolean type */ -- --#ifdef CHTYPE_LONG --# if _LP64 --typedef unsigned int chtype; --# else --typedef unsigned long chtype; /* 16-bit attr + 16-bit char */ --# endif --#else --typedef unsigned short chtype; /* 8-bit attr + 8-bit char */ --#endif -- --#ifdef PDC_WIDE --typedef chtype cchar_t; --#endif -- --typedef chtype attr_t; -- --/*---------------------------------------------------------------------- -- * -- * PDCurses Mouse Interface -- SYSVR4, with extensions -- * -- */ -- --typedef struct --{ -- int x; /* absolute column, 0 based, measured in characters */ -- int y; /* absolute row, 0 based, measured in characters */ -- short button[3]; /* state of each button */ -- int changes; /* flags indicating what has changed with the mouse */ --} MOUSE_STATUS; -- --#define BUTTON_RELEASED 
0x0000 --#define BUTTON_PRESSED 0x0001 --#define BUTTON_CLICKED 0x0002 --#define BUTTON_DOUBLE_CLICKED 0x0003 --#define BUTTON_TRIPLE_CLICKED 0x0004 --#define BUTTON_MOVED 0x0005 /* PDCurses */ --#define WHEEL_SCROLLED 0x0006 /* PDCurses */ --#define BUTTON_ACTION_MASK 0x0007 /* PDCurses */ -- --#define PDC_BUTTON_SHIFT 0x0008 /* PDCurses */ --#define PDC_BUTTON_CONTROL 0x0010 /* PDCurses */ --#define PDC_BUTTON_ALT 0x0020 /* PDCurses */ --#define BUTTON_MODIFIER_MASK 0x0038 /* PDCurses */ -- --#define MOUSE_X_POS (Mouse_status.x) --#define MOUSE_Y_POS (Mouse_status.y) -- --/* -- * Bits associated with the .changes field: -- * 3 2 1 0 -- * 210987654321098765432109876543210 -- * 1 <- button 1 has changed -- * 10 <- button 2 has changed -- * 100 <- button 3 has changed -- * 1000 <- mouse has moved -- * 10000 <- mouse position report -- * 100000 <- mouse wheel up -- * 1000000 <- mouse wheel down -- */ -- --#define PDC_MOUSE_MOVED 0x0008 --#define PDC_MOUSE_POSITION 0x0010 --#define PDC_MOUSE_WHEEL_UP 0x0020 --#define PDC_MOUSE_WHEEL_DOWN 0x0040 -- --#define A_BUTTON_CHANGED (Mouse_status.changes & 7) --#define MOUSE_MOVED (Mouse_status.changes & PDC_MOUSE_MOVED) --#define MOUSE_POS_REPORT (Mouse_status.changes & PDC_MOUSE_POSITION) --#define BUTTON_CHANGED(x) (Mouse_status.changes & (1 << ((x) - 1))) --#define BUTTON_STATUS(x) (Mouse_status.button[(x) - 1]) --#define MOUSE_WHEEL_UP (Mouse_status.changes & PDC_MOUSE_WHEEL_UP) --#define MOUSE_WHEEL_DOWN (Mouse_status.changes & PDC_MOUSE_WHEEL_DOWN) -- --/* mouse bit-masks */ -- --#define BUTTON1_RELEASED 0x00000001L --#define BUTTON1_PRESSED 0x00000002L --#define BUTTON1_CLICKED 0x00000004L --#define BUTTON1_DOUBLE_CLICKED 0x00000008L --#define BUTTON1_TRIPLE_CLICKED 0x00000010L --#define BUTTON1_MOVED 0x00000010L /* PDCurses */ -- --#define BUTTON2_RELEASED 0x00000020L --#define BUTTON2_PRESSED 0x00000040L --#define BUTTON2_CLICKED 0x00000080L --#define BUTTON2_DOUBLE_CLICKED 0x00000100L --#define 
BUTTON2_TRIPLE_CLICKED 0x00000200L --#define BUTTON2_MOVED 0x00000200L /* PDCurses */ -- --#define BUTTON3_RELEASED 0x00000400L --#define BUTTON3_PRESSED 0x00000800L --#define BUTTON3_CLICKED 0x00001000L --#define BUTTON3_DOUBLE_CLICKED 0x00002000L --#define BUTTON3_TRIPLE_CLICKED 0x00004000L --#define BUTTON3_MOVED 0x00004000L /* PDCurses */ -- --/* For the ncurses-compatible functions only, BUTTON4_PRESSED and -- BUTTON5_PRESSED are returned for mouse scroll wheel up and down; -- otherwise PDCurses doesn't support buttons 4 and 5 */ -- --#define BUTTON4_RELEASED 0x00008000L --#define BUTTON4_PRESSED 0x00010000L --#define BUTTON4_CLICKED 0x00020000L --#define BUTTON4_DOUBLE_CLICKED 0x00040000L --#define BUTTON4_TRIPLE_CLICKED 0x00080000L -- --#define BUTTON5_RELEASED 0x00100000L --#define BUTTON5_PRESSED 0x00200000L --#define BUTTON5_CLICKED 0x00400000L --#define BUTTON5_DOUBLE_CLICKED 0x00800000L --#define BUTTON5_TRIPLE_CLICKED 0x01000000L -- --#define MOUSE_WHEEL_SCROLL 0x02000000L /* PDCurses */ --#define BUTTON_MODIFIER_SHIFT 0x04000000L /* PDCurses */ --#define BUTTON_MODIFIER_CONTROL 0x08000000L /* PDCurses */ --#define BUTTON_MODIFIER_ALT 0x10000000L /* PDCurses */ -- --#define ALL_MOUSE_EVENTS 0x1fffffffL --#define REPORT_MOUSE_POSITION 0x20000000L -- --/* ncurses mouse interface */ -- --typedef unsigned long mmask_t; -- --typedef struct --{ -- short id; /* unused, always 0 */ -- int x, y, z; /* x, y same as MOUSE_STATUS; z unused */ -- mmask_t bstate; /* equivalent to changes + button[], but -- in the same format as used for mousemask() */ --} MEVENT; -- --#ifdef NCURSES_MOUSE_VERSION --# define BUTTON_SHIFT BUTTON_MODIFIER_SHIFT --# define BUTTON_CONTROL BUTTON_MODIFIER_CONTROL --# define BUTTON_CTRL BUTTON_MODIFIER_CONTROL --# define BUTTON_ALT BUTTON_MODIFIER_ALT --#else --# define BUTTON_SHIFT PDC_BUTTON_SHIFT --# define BUTTON_CONTROL PDC_BUTTON_CONTROL --# define BUTTON_ALT PDC_BUTTON_ALT --#endif -- 
--/*---------------------------------------------------------------------- -- * -- * PDCurses Structure Definitions -- * -- */ -- --typedef struct _win /* definition of a window */ --{ -- int _cury; /* current pseudo-cursor */ -- int _curx; -- int _maxy; /* max window coordinates */ -- int _maxx; -- int _begy; /* origin on screen */ -- int _begx; -- int _flags; /* window properties */ -- chtype _attrs; /* standard attributes and colors */ -- chtype _bkgd; /* background, normally blank */ -- bool _clear; /* causes clear at next refresh */ -- bool _leaveit; /* leaves cursor where it is */ -- bool _scroll; /* allows window scrolling */ -- bool _nodelay; /* input character wait flag */ -- bool _immed; /* immediate update flag */ -- bool _sync; /* synchronise window ancestors */ -- bool _use_keypad; /* flags keypad key mode active */ -- chtype **_y; /* pointer to line pointer array */ -- int *_firstch; /* first changed character in line */ -- int *_lastch; /* last changed character in line */ -- int _tmarg; /* top of scrolling region */ -- int _bmarg; /* bottom of scrolling region */ -- int _delayms; /* milliseconds of delay for getch() */ -- int _parx, _pary; /* coords relative to parent (0,0) */ -- struct _win *_parent; /* subwin's pointer to parent win */ --} WINDOW; -- --/* Avoid using the SCREEN struct directly -- use the corresponding -- functions if possible. This struct may eventually be made private. */ -- --typedef struct --{ -- bool alive; /* if initscr() called, and not endwin() */ -- bool autocr; /* if cr -> lf */ -- bool cbreak; /* if terminal unbuffered */ -- bool echo; /* if terminal echo */ -- bool raw_inp; /* raw input mode (v. cooked input) */ -- bool raw_out; /* raw output mode (7 v. 
8 bits) */ -- bool audible; /* FALSE if the bell is visual */ -- bool mono; /* TRUE if current screen is mono */ -- bool resized; /* TRUE if TERM has been resized */ -- bool orig_attr; /* TRUE if we have the original colors */ -- short orig_fore; /* original screen foreground color */ -- short orig_back; /* original screen foreground color */ -- int cursrow; /* position of physical cursor */ -- int curscol; /* position of physical cursor */ -- int visibility; /* visibility of cursor */ -- int orig_cursor; /* original cursor size */ -- int lines; /* new value for LINES */ -- int cols; /* new value for COLS */ -- unsigned long _trap_mbe; /* trap these mouse button events */ -- unsigned long _map_mbe_to_key; /* map mouse buttons to slk */ -- int mouse_wait; /* time to wait (in ms) for a -- button release after a press, in -- order to count it as a click */ -- int slklines; /* lines in use by slk_init() */ -- WINDOW *slk_winptr; /* window for slk */ -- int linesrippedoff; /* lines ripped off via ripoffline() */ -- int linesrippedoffontop; /* lines ripped off on -- top via ripoffline() */ -- int delaytenths; /* 1/10ths second to wait block -- getch() for */ -- bool _preserve; /* TRUE if screen background -- to be preserved */ -- int _restore; /* specifies if screen background -- to be restored, and how */ -- bool save_key_modifiers; /* TRUE if each key modifiers saved -- with each key press */ -- bool return_key_modifiers; /* TRUE if modifier keys are -- returned as "real" keys */ -- bool key_code; /* TRUE if last key is a special key; -- used internally by get_wch() */ --#ifdef XCURSES -- int XcurscrSize; /* size of Xcurscr shared memory block */ -- bool sb_on; -- int sb_viewport_y; -- int sb_viewport_x; -- int sb_total_y; -- int sb_total_x; -- int sb_cur_y; -- int sb_cur_x; --#endif -- short line_color; /* color of line attributes - default -1 */ --} SCREEN; -- --/*---------------------------------------------------------------------- -- * -- * PDCurses External 
Variables -- * -- */ -- --#ifdef PDC_DLL_BUILD --# ifdef CURSES_LIBRARY --# define PDCEX __declspec(dllexport) extern --# else --# define PDCEX __declspec(dllimport) --# endif --#else --# define PDCEX extern --#endif -- --PDCEX int LINES; /* terminal height */ --PDCEX int COLS; /* terminal width */ --PDCEX WINDOW *stdscr; /* the default screen window */ --PDCEX WINDOW *curscr; /* the current screen image */ --PDCEX SCREEN *SP; /* curses variables */ --PDCEX MOUSE_STATUS Mouse_status; --PDCEX int COLORS; --PDCEX int COLOR_PAIRS; --PDCEX int TABSIZE; --PDCEX chtype acs_map[]; /* alternate character set map */ --PDCEX char ttytype[]; /* terminal name/description */ -- --/*man-start************************************************************** -- --PDCurses Text Attributes --======================== -- --Originally, PDCurses used a short (16 bits) for its chtype. To include --color, a number of things had to be sacrificed from the strict Unix and --System V support. The main problem was fitting all character attributes --and color into an unsigned char (all 8 bits!). -- --Today, PDCurses by default uses a long (32 bits) for its chtype, as in --System V. The short chtype is still available, by undefining CHTYPE_LONG --and rebuilding the library. -- --The following is the structure of a win->_attrs chtype: -- --short form: -- --------------------------------------------------- --|15|14|13|12|11|10| 9| 8| 7| 6| 5| 4| 3| 2| 1| 0| --------------------------------------------------- -- color number | attrs | character eg 'a' -- --The available non-color attributes are bold, reverse and blink. Others --have no effect. The high order char is an index into an array of --physical colors (defined in color.c) -- 32 foreground/background color --pairs (5 bits) plus 3 bits for other attributes. 
-- --long form: -- ------------------------------------------------------------------------------ --|31|30|29|28|27|26|25|24|23|22|21|20|19|18|17|16|15|14|13|12|..| 3| 2| 1| 0| ------------------------------------------------------------------------------ -- color number | modifiers | character eg 'a' -- --The available non-color attributes are bold, underline, invisible, --right-line, left-line, protect, reverse and blink. 256 color pairs (8 --bits), 8 bits for other attributes, and 16 bits for character data. -- --**man-end****************************************************************/ -- --/*** Video attribute macros ***/ -- --#define A_NORMAL (chtype)0 -- --#ifdef CHTYPE_LONG --# define A_ALTCHARSET (chtype)0x00010000 --# define A_RIGHTLINE (chtype)0x00020000 --# define A_LEFTLINE (chtype)0x00040000 --# define A_INVIS (chtype)0x00080000 --# define A_UNDERLINE (chtype)0x00100000 --# define A_REVERSE (chtype)0x00200000 --# define A_BLINK (chtype)0x00400000 --# define A_BOLD (chtype)0x00800000 -- --# define A_ATTRIBUTES (chtype)0xffff0000 --# define A_CHARTEXT (chtype)0x0000ffff --# define A_COLOR (chtype)0xff000000 -- --# define A_ITALIC A_INVIS --# define A_PROTECT (A_UNDERLINE | A_LEFTLINE | A_RIGHTLINE) -- --# define PDC_ATTR_SHIFT 19 --# define PDC_COLOR_SHIFT 24 --#else --# define A_BOLD (chtype)0x0100 /* X/Open */ --# define A_REVERSE (chtype)0x0200 /* X/Open */ --# define A_BLINK (chtype)0x0400 /* X/Open */ -- --# define A_ATTRIBUTES (chtype)0xff00 /* X/Open */ --# define A_CHARTEXT (chtype)0x00ff /* X/Open */ --# define A_COLOR (chtype)0xf800 /* System V */ -- --# define A_ALTCHARSET A_NORMAL /* X/Open */ --# define A_PROTECT A_NORMAL /* X/Open */ --# define A_UNDERLINE A_NORMAL /* X/Open */ -- --# define A_LEFTLINE A_NORMAL --# define A_RIGHTLINE A_NORMAL --# define A_ITALIC A_NORMAL --# define A_INVIS A_NORMAL -- --# define PDC_ATTR_SHIFT 8 --# define PDC_COLOR_SHIFT 11 --#endif -- --#define A_STANDOUT (A_REVERSE | A_BOLD) /* X/Open */ --#define A_DIM 
A_NORMAL -- --#define CHR_MSK A_CHARTEXT /* Obsolete */ --#define ATR_MSK A_ATTRIBUTES /* Obsolete */ --#define ATR_NRM A_NORMAL /* Obsolete */ -- --/* For use with attr_t -- X/Open says, "these shall be distinct", so -- this is a non-conforming implementation. */ -- --#define WA_ALTCHARSET A_ALTCHARSET --#define WA_BLINK A_BLINK --#define WA_BOLD A_BOLD --#define WA_DIM A_DIM --#define WA_INVIS A_INVIS --#define WA_LEFT A_LEFTLINE --#define WA_PROTECT A_PROTECT --#define WA_REVERSE A_REVERSE --#define WA_RIGHT A_RIGHTLINE --#define WA_STANDOUT A_STANDOUT --#define WA_UNDERLINE A_UNDERLINE -- --#define WA_HORIZONTAL A_NORMAL --#define WA_LOW A_NORMAL --#define WA_TOP A_NORMAL --#define WA_VERTICAL A_NORMAL -- --/*** Alternate character set macros ***/ -- --/* 'w' = 32-bit chtype; acs_map[] index | A_ALTCHARSET -- 'n' = 16-bit chtype; it gets the fallback set because no bit is -- available for A_ALTCHARSET */ -- --#ifdef CHTYPE_LONG --# define ACS_PICK(w, n) ((chtype)w | A_ALTCHARSET) --#else --# define ACS_PICK(w, n) ((chtype)n) --#endif -- --/* VT100-compatible symbols -- box chars */ -- --#define ACS_ULCORNER ACS_PICK('l', '+') --#define ACS_LLCORNER ACS_PICK('m', '+') --#define ACS_URCORNER ACS_PICK('k', '+') --#define ACS_LRCORNER ACS_PICK('j', '+') --#define ACS_RTEE ACS_PICK('u', '+') --#define ACS_LTEE ACS_PICK('t', '+') --#define ACS_BTEE ACS_PICK('v', '+') --#define ACS_TTEE ACS_PICK('w', '+') --#define ACS_HLINE ACS_PICK('q', '-') --#define ACS_VLINE ACS_PICK('x', '|') --#define ACS_PLUS ACS_PICK('n', '+') -- --/* VT100-compatible symbols -- other */ -- --#define ACS_S1 ACS_PICK('o', '-') --#define ACS_S9 ACS_PICK('s', '_') --#define ACS_DIAMOND ACS_PICK('`', '+') --#define ACS_CKBOARD ACS_PICK('a', ':') --#define ACS_DEGREE ACS_PICK('f', '\'') --#define ACS_PLMINUS ACS_PICK('g', '#') --#define ACS_BULLET ACS_PICK('~', 'o') -- --/* Teletype 5410v1 symbols -- these are defined in SysV curses, but -- are not well-supported by most terminals. 
Stick to VT100 characters -- for optimum portability. */ -- --#define ACS_LARROW ACS_PICK(',', '<') --#define ACS_RARROW ACS_PICK('+', '>') --#define ACS_DARROW ACS_PICK('.', 'v') --#define ACS_UARROW ACS_PICK('-', '^') --#define ACS_BOARD ACS_PICK('h', '#') --#define ACS_LANTERN ACS_PICK('i', '*') --#define ACS_BLOCK ACS_PICK('0', '#') -- --/* That goes double for these -- undocumented SysV symbols. Don't use -- them. */ -- --#define ACS_S3 ACS_PICK('p', '-') --#define ACS_S7 ACS_PICK('r', '-') --#define ACS_LEQUAL ACS_PICK('y', '<') --#define ACS_GEQUAL ACS_PICK('z', '>') --#define ACS_PI ACS_PICK('{', 'n') --#define ACS_NEQUAL ACS_PICK('|', '+') --#define ACS_STERLING ACS_PICK('}', 'L') -- --/* Box char aliases */ -- --#define ACS_BSSB ACS_ULCORNER --#define ACS_SSBB ACS_LLCORNER --#define ACS_BBSS ACS_URCORNER --#define ACS_SBBS ACS_LRCORNER --#define ACS_SBSS ACS_RTEE --#define ACS_SSSB ACS_LTEE --#define ACS_SSBS ACS_BTEE --#define ACS_BSSS ACS_TTEE --#define ACS_BSBS ACS_HLINE --#define ACS_SBSB ACS_VLINE --#define ACS_SSSS ACS_PLUS -- --/* cchar_t aliases */ -- --#ifdef PDC_WIDE --# define WACS_ULCORNER (&(acs_map['l'])) --# define WACS_LLCORNER (&(acs_map['m'])) --# define WACS_URCORNER (&(acs_map['k'])) --# define WACS_LRCORNER (&(acs_map['j'])) --# define WACS_RTEE (&(acs_map['u'])) --# define WACS_LTEE (&(acs_map['t'])) --# define WACS_BTEE (&(acs_map['v'])) --# define WACS_TTEE (&(acs_map['w'])) --# define WACS_HLINE (&(acs_map['q'])) --# define WACS_VLINE (&(acs_map['x'])) --# define WACS_PLUS (&(acs_map['n'])) -- --# define WACS_S1 (&(acs_map['o'])) --# define WACS_S9 (&(acs_map['s'])) --# define WACS_DIAMOND (&(acs_map['`'])) --# define WACS_CKBOARD (&(acs_map['a'])) --# define WACS_DEGREE (&(acs_map['f'])) --# define WACS_PLMINUS (&(acs_map['g'])) --# define WACS_BULLET (&(acs_map['~'])) -- --# define WACS_LARROW (&(acs_map[','])) --# define WACS_RARROW (&(acs_map['+'])) --# define WACS_DARROW (&(acs_map['.'])) --# define WACS_UARROW 
(&(acs_map['-'])) --# define WACS_BOARD (&(acs_map['h'])) --# define WACS_LANTERN (&(acs_map['i'])) --# define WACS_BLOCK (&(acs_map['0'])) -- --# define WACS_S3 (&(acs_map['p'])) --# define WACS_S7 (&(acs_map['r'])) --# define WACS_LEQUAL (&(acs_map['y'])) --# define WACS_GEQUAL (&(acs_map['z'])) --# define WACS_PI (&(acs_map['{'])) --# define WACS_NEQUAL (&(acs_map['|'])) --# define WACS_STERLING (&(acs_map['}'])) -- --# define WACS_BSSB WACS_ULCORNER --# define WACS_SSBB WACS_LLCORNER --# define WACS_BBSS WACS_URCORNER --# define WACS_SBBS WACS_LRCORNER --# define WACS_SBSS WACS_RTEE --# define WACS_SSSB WACS_LTEE --# define WACS_SSBS WACS_BTEE --# define WACS_BSSS WACS_TTEE --# define WACS_BSBS WACS_HLINE --# define WACS_SBSB WACS_VLINE --# define WACS_SSSS WACS_PLUS --#endif -- --/*** Color macros ***/ -- --#define COLOR_BLACK 0 -- --#ifdef PDC_RGB /* RGB */ --# define COLOR_RED 1 --# define COLOR_GREEN 2 --# define COLOR_BLUE 4 --#else /* BGR */ --# define COLOR_BLUE 1 --# define COLOR_GREEN 2 --# define COLOR_RED 4 --#endif -- --#define COLOR_CYAN (COLOR_BLUE | COLOR_GREEN) --#define COLOR_MAGENTA (COLOR_RED | COLOR_BLUE) --#define COLOR_YELLOW (COLOR_RED | COLOR_GREEN) -- --#define COLOR_WHITE 7 -- --/*---------------------------------------------------------------------- -- * -- * Function and Keypad Key Definitions. -- * Many are just for compatibility. 
-- * -- */ -- --#define KEY_CODE_YES 0x100 /* If get_wch() gives a key code */ -- --#define KEY_BREAK 0x101 /* Not on PC KBD */ --#define KEY_DOWN 0x102 /* Down arrow key */ --#define KEY_UP 0x103 /* Up arrow key */ --#define KEY_LEFT 0x104 /* Left arrow key */ --#define KEY_RIGHT 0x105 /* Right arrow key */ --#define KEY_HOME 0x106 /* home key */ --#define KEY_BACKSPACE 0x107 /* not on pc */ --#define KEY_F0 0x108 /* function keys; 64 reserved */ -- --#define KEY_DL 0x148 /* delete line */ --#define KEY_IL 0x149 /* insert line */ --#define KEY_DC 0x14a /* delete character */ --#define KEY_IC 0x14b /* insert char or enter ins mode */ --#define KEY_EIC 0x14c /* exit insert char mode */ --#define KEY_CLEAR 0x14d /* clear screen */ --#define KEY_EOS 0x14e /* clear to end of screen */ --#define KEY_EOL 0x14f /* clear to end of line */ --#define KEY_SF 0x150 /* scroll 1 line forward */ --#define KEY_SR 0x151 /* scroll 1 line back (reverse) */ --#define KEY_NPAGE 0x152 /* next page */ --#define KEY_PPAGE 0x153 /* previous page */ --#define KEY_STAB 0x154 /* set tab */ --#define KEY_CTAB 0x155 /* clear tab */ --#define KEY_CATAB 0x156 /* clear all tabs */ --#define KEY_ENTER 0x157 /* enter or send (unreliable) */ --#define KEY_SRESET 0x158 /* soft/reset (partial/unreliable) */ --#define KEY_RESET 0x159 /* reset/hard reset (unreliable) */ --#define KEY_PRINT 0x15a /* print/copy */ --#define KEY_LL 0x15b /* home down/bottom (lower left) */ --#define KEY_ABORT 0x15c /* abort/terminate key (any) */ --#define KEY_SHELP 0x15d /* short help */ --#define KEY_LHELP 0x15e /* long help */ --#define KEY_BTAB 0x15f /* Back tab key */ --#define KEY_BEG 0x160 /* beg(inning) key */ --#define KEY_CANCEL 0x161 /* cancel key */ --#define KEY_CLOSE 0x162 /* close key */ --#define KEY_COMMAND 0x163 /* cmd (command) key */ --#define KEY_COPY 0x164 /* copy key */ --#define KEY_CREATE 0x165 /* create key */ --#define KEY_END 0x166 /* end key */ --#define KEY_EXIT 0x167 /* exit key */ --#define 
KEY_FIND 0x168 /* find key */ --#define KEY_HELP 0x169 /* help key */ --#define KEY_MARK 0x16a /* mark key */ --#define KEY_MESSAGE 0x16b /* message key */ --#define KEY_MOVE 0x16c /* move key */ --#define KEY_NEXT 0x16d /* next object key */ --#define KEY_OPEN 0x16e /* open key */ --#define KEY_OPTIONS 0x16f /* options key */ --#define KEY_PREVIOUS 0x170 /* previous object key */ --#define KEY_REDO 0x171 /* redo key */ --#define KEY_REFERENCE 0x172 /* ref(erence) key */ --#define KEY_REFRESH 0x173 /* refresh key */ --#define KEY_REPLACE 0x174 /* replace key */ --#define KEY_RESTART 0x175 /* restart key */ --#define KEY_RESUME 0x176 /* resume key */ --#define KEY_SAVE 0x177 /* save key */ --#define KEY_SBEG 0x178 /* shifted beginning key */ --#define KEY_SCANCEL 0x179 /* shifted cancel key */ --#define KEY_SCOMMAND 0x17a /* shifted command key */ --#define KEY_SCOPY 0x17b /* shifted copy key */ --#define KEY_SCREATE 0x17c /* shifted create key */ --#define KEY_SDC 0x17d /* shifted delete char key */ --#define KEY_SDL 0x17e /* shifted delete line key */ --#define KEY_SELECT 0x17f /* select key */ --#define KEY_SEND 0x180 /* shifted end key */ --#define KEY_SEOL 0x181 /* shifted clear line key */ --#define KEY_SEXIT 0x182 /* shifted exit key */ --#define KEY_SFIND 0x183 /* shifted find key */ --#define KEY_SHOME 0x184 /* shifted home key */ --#define KEY_SIC 0x185 /* shifted input key */ -- --#define KEY_SLEFT 0x187 /* shifted left arrow key */ --#define KEY_SMESSAGE 0x188 /* shifted message key */ --#define KEY_SMOVE 0x189 /* shifted move key */ --#define KEY_SNEXT 0x18a /* shifted next key */ --#define KEY_SOPTIONS 0x18b /* shifted options key */ --#define KEY_SPREVIOUS 0x18c /* shifted prev key */ --#define KEY_SPRINT 0x18d /* shifted print key */ --#define KEY_SREDO 0x18e /* shifted redo key */ --#define KEY_SREPLACE 0x18f /* shifted replace key */ --#define KEY_SRIGHT 0x190 /* shifted right arrow */ --#define KEY_SRSUME 0x191 /* shifted resume key */ --#define 
KEY_SSAVE 0x192 /* shifted save key */ --#define KEY_SSUSPEND 0x193 /* shifted suspend key */ --#define KEY_SUNDO 0x194 /* shifted undo key */ --#define KEY_SUSPEND 0x195 /* suspend key */ --#define KEY_UNDO 0x196 /* undo key */ -- --/* PDCurses-specific key definitions -- PC only */ -- --#define ALT_0 0x197 --#define ALT_1 0x198 --#define ALT_2 0x199 --#define ALT_3 0x19a --#define ALT_4 0x19b --#define ALT_5 0x19c --#define ALT_6 0x19d --#define ALT_7 0x19e --#define ALT_8 0x19f --#define ALT_9 0x1a0 --#define ALT_A 0x1a1 --#define ALT_B 0x1a2 --#define ALT_C 0x1a3 --#define ALT_D 0x1a4 --#define ALT_E 0x1a5 --#define ALT_F 0x1a6 --#define ALT_G 0x1a7 --#define ALT_H 0x1a8 --#define ALT_I 0x1a9 --#define ALT_J 0x1aa --#define ALT_K 0x1ab --#define ALT_L 0x1ac --#define ALT_M 0x1ad --#define ALT_N 0x1ae --#define ALT_O 0x1af --#define ALT_P 0x1b0 --#define ALT_Q 0x1b1 --#define ALT_R 0x1b2 --#define ALT_S 0x1b3 --#define ALT_T 0x1b4 --#define ALT_U 0x1b5 --#define ALT_V 0x1b6 --#define ALT_W 0x1b7 --#define ALT_X 0x1b8 --#define ALT_Y 0x1b9 --#define ALT_Z 0x1ba -- --#define CTL_LEFT 0x1bb /* Control-Left-Arrow */ --#define CTL_RIGHT 0x1bc --#define CTL_PGUP 0x1bd --#define CTL_PGDN 0x1be --#define CTL_HOME 0x1bf --#define CTL_END 0x1c0 -- --#define KEY_A1 0x1c1 /* upper left on Virtual keypad */ --#define KEY_A2 0x1c2 /* upper middle on Virt. keypad */ --#define KEY_A3 0x1c3 /* upper right on Vir. keypad */ --#define KEY_B1 0x1c4 /* middle left on Virt. keypad */ --#define KEY_B2 0x1c5 /* center on Virt. keypad */ --#define KEY_B3 0x1c6 /* middle right on Vir. keypad */ --#define KEY_C1 0x1c7 /* lower left on Virt. keypad */ --#define KEY_C2 0x1c8 /* lower middle on Virt. keypad */ --#define KEY_C3 0x1c9 /* lower right on Vir. 
keypad */ -- --#define PADSLASH 0x1ca /* slash on keypad */ --#define PADENTER 0x1cb /* enter on keypad */ --#define CTL_PADENTER 0x1cc /* ctl-enter on keypad */ --#define ALT_PADENTER 0x1cd /* alt-enter on keypad */ --#define PADSTOP 0x1ce /* stop on keypad */ --#define PADSTAR 0x1cf /* star on keypad */ --#define PADMINUS 0x1d0 /* minus on keypad */ --#define PADPLUS 0x1d1 /* plus on keypad */ --#define CTL_PADSTOP 0x1d2 /* ctl-stop on keypad */ --#define CTL_PADCENTER 0x1d3 /* ctl-enter on keypad */ --#define CTL_PADPLUS 0x1d4 /* ctl-plus on keypad */ --#define CTL_PADMINUS 0x1d5 /* ctl-minus on keypad */ --#define CTL_PADSLASH 0x1d6 /* ctl-slash on keypad */ --#define CTL_PADSTAR 0x1d7 /* ctl-star on keypad */ --#define ALT_PADPLUS 0x1d8 /* alt-plus on keypad */ --#define ALT_PADMINUS 0x1d9 /* alt-minus on keypad */ --#define ALT_PADSLASH 0x1da /* alt-slash on keypad */ --#define ALT_PADSTAR 0x1db /* alt-star on keypad */ --#define ALT_PADSTOP 0x1dc /* alt-stop on keypad */ --#define CTL_INS 0x1dd /* ctl-insert */ --#define ALT_DEL 0x1de /* alt-delete */ --#define ALT_INS 0x1df /* alt-insert */ --#define CTL_UP 0x1e0 /* ctl-up arrow */ --#define CTL_DOWN 0x1e1 /* ctl-down arrow */ --#define CTL_TAB 0x1e2 /* ctl-tab */ --#define ALT_TAB 0x1e3 --#define ALT_MINUS 0x1e4 --#define ALT_EQUAL 0x1e5 --#define ALT_HOME 0x1e6 --#define ALT_PGUP 0x1e7 --#define ALT_PGDN 0x1e8 --#define ALT_END 0x1e9 --#define ALT_UP 0x1ea /* alt-up arrow */ --#define ALT_DOWN 0x1eb /* alt-down arrow */ --#define ALT_RIGHT 0x1ec /* alt-right arrow */ --#define ALT_LEFT 0x1ed /* alt-left arrow */ --#define ALT_ENTER 0x1ee /* alt-enter */ --#define ALT_ESC 0x1ef /* alt-escape */ --#define ALT_BQUOTE 0x1f0 /* alt-back quote */ --#define ALT_LBRACKET 0x1f1 /* alt-left bracket */ --#define ALT_RBRACKET 0x1f2 /* alt-right bracket */ --#define ALT_SEMICOLON 0x1f3 /* alt-semi-colon */ --#define ALT_FQUOTE 0x1f4 /* alt-forward quote */ --#define ALT_COMMA 0x1f5 /* alt-comma */ --#define ALT_STOP 
0x1f6 /* alt-stop */ --#define ALT_FSLASH 0x1f7 /* alt-forward slash */ --#define ALT_BKSP 0x1f8 /* alt-backspace */ --#define CTL_BKSP 0x1f9 /* ctl-backspace */ --#define PAD0 0x1fa /* keypad 0 */ -- --#define CTL_PAD0 0x1fb /* ctl-keypad 0 */ --#define CTL_PAD1 0x1fc --#define CTL_PAD2 0x1fd --#define CTL_PAD3 0x1fe --#define CTL_PAD4 0x1ff --#define CTL_PAD5 0x200 --#define CTL_PAD6 0x201 --#define CTL_PAD7 0x202 --#define CTL_PAD8 0x203 --#define CTL_PAD9 0x204 -- --#define ALT_PAD0 0x205 /* alt-keypad 0 */ --#define ALT_PAD1 0x206 --#define ALT_PAD2 0x207 --#define ALT_PAD3 0x208 --#define ALT_PAD4 0x209 --#define ALT_PAD5 0x20a --#define ALT_PAD6 0x20b --#define ALT_PAD7 0x20c --#define ALT_PAD8 0x20d --#define ALT_PAD9 0x20e -- --#define CTL_DEL 0x20f /* clt-delete */ --#define ALT_BSLASH 0x210 /* alt-back slash */ --#define CTL_ENTER 0x211 /* ctl-enter */ -- --#define SHF_PADENTER 0x212 /* shift-enter on keypad */ --#define SHF_PADSLASH 0x213 /* shift-slash on keypad */ --#define SHF_PADSTAR 0x214 /* shift-star on keypad */ --#define SHF_PADPLUS 0x215 /* shift-plus on keypad */ --#define SHF_PADMINUS 0x216 /* shift-minus on keypad */ --#define SHF_UP 0x217 /* shift-up on keypad */ --#define SHF_DOWN 0x218 /* shift-down on keypad */ --#define SHF_IC 0x219 /* shift-insert on keypad */ --#define SHF_DC 0x21a /* shift-delete on keypad */ -- --#define KEY_MOUSE 0x21b /* "mouse" key */ --#define KEY_SHIFT_L 0x21c /* Left-shift */ --#define KEY_SHIFT_R 0x21d /* Right-shift */ --#define KEY_CONTROL_L 0x21e /* Left-control */ --#define KEY_CONTROL_R 0x21f /* Right-control */ --#define KEY_ALT_L 0x220 /* Left-alt */ --#define KEY_ALT_R 0x221 /* Right-alt */ --#define KEY_RESIZE 0x222 /* Window resize */ --#define KEY_SUP 0x223 /* Shifted up arrow */ --#define KEY_SDOWN 0x224 /* Shifted down arrow */ -- --#define KEY_MIN KEY_BREAK /* Minimum curses key value */ --#define KEY_MAX KEY_SDOWN /* Maximum curses key */ -- --#define KEY_F(n) (KEY_F0 + (n)) -- 
--/*---------------------------------------------------------------------- -- * -- * PDCurses Function Declarations -- * -- */ -- --/* Standard */ -- --int addch(const chtype); --int addchnstr(const chtype *, int); --int addchstr(const chtype *); --int addnstr(const char *, int); --int addstr(const char *); --int attroff(chtype); --int attron(chtype); --int attrset(chtype); --int attr_get(attr_t *, short *, void *); --int attr_off(attr_t, void *); --int attr_on(attr_t, void *); --int attr_set(attr_t, short, void *); --int baudrate(void); --int beep(void); --int bkgd(chtype); --void bkgdset(chtype); --int border(chtype, chtype, chtype, chtype, chtype, chtype, chtype, chtype); --int box(WINDOW *, chtype, chtype); --bool can_change_color(void); --int cbreak(void); --int chgat(int, attr_t, short, const void *); --int clearok(WINDOW *, bool); --int clear(void); --int clrtobot(void); --int clrtoeol(void); --int color_content(short, short *, short *, short *); --int color_set(short, void *); --int copywin(const WINDOW *, WINDOW *, int, int, int, int, int, int, int); --int curs_set(int); --int def_prog_mode(void); --int def_shell_mode(void); --int delay_output(int); --int delch(void); --int deleteln(void); --void delscreen(SCREEN *); --int delwin(WINDOW *); --WINDOW *derwin(WINDOW *, int, int, int, int); --int doupdate(void); --WINDOW *dupwin(WINDOW *); --int echochar(const chtype); --int echo(void); --int endwin(void); --char erasechar(void); --int erase(void); --void filter(void); --int flash(void); --int flushinp(void); --chtype getbkgd(WINDOW *); --int getnstr(char *, int); --int getstr(char *); --WINDOW *getwin(FILE *); --int halfdelay(int); --bool has_colors(void); --bool has_ic(void); --bool has_il(void); --int hline(chtype, int); --void idcok(WINDOW *, bool); --int idlok(WINDOW *, bool); --void immedok(WINDOW *, bool); --int inchnstr(chtype *, int); --int inchstr(chtype *); --chtype inch(void); --int init_color(short, short, short, short); --int init_pair(short, 
short, short); --WINDOW *initscr(void); --int innstr(char *, int); --int insch(chtype); --int insdelln(int); --int insertln(void); --int insnstr(const char *, int); --int insstr(const char *); --int instr(char *); --int intrflush(WINDOW *, bool); --bool isendwin(void); --bool is_linetouched(WINDOW *, int); --bool is_wintouched(WINDOW *); --char *keyname(int); --int keypad(WINDOW *, bool); --char killchar(void); --int leaveok(WINDOW *, bool); --char *longname(void); --int meta(WINDOW *, bool); --int move(int, int); --int mvaddch(int, int, const chtype); --int mvaddchnstr(int, int, const chtype *, int); --int mvaddchstr(int, int, const chtype *); --int mvaddnstr(int, int, const char *, int); --int mvaddstr(int, int, const char *); --int mvchgat(int, int, int, attr_t, short, const void *); --int mvcur(int, int, int, int); --int mvdelch(int, int); --int mvderwin(WINDOW *, int, int); --int mvgetch(int, int); --int mvgetnstr(int, int, char *, int); --int mvgetstr(int, int, char *); --int mvhline(int, int, chtype, int); --chtype mvinch(int, int); --int mvinchnstr(int, int, chtype *, int); --int mvinchstr(int, int, chtype *); --int mvinnstr(int, int, char *, int); --int mvinsch(int, int, chtype); --int mvinsnstr(int, int, const char *, int); --int mvinsstr(int, int, const char *); --int mvinstr(int, int, char *); --int mvprintw(int, int, const char *, ...); --int mvscanw(int, int, const char *, ...); --int mvvline(int, int, chtype, int); --int mvwaddchnstr(WINDOW *, int, int, const chtype *, int); --int mvwaddchstr(WINDOW *, int, int, const chtype *); --int mvwaddch(WINDOW *, int, int, const chtype); --int mvwaddnstr(WINDOW *, int, int, const char *, int); --int mvwaddstr(WINDOW *, int, int, const char *); --int mvwchgat(WINDOW *, int, int, int, attr_t, short, const void *); --int mvwdelch(WINDOW *, int, int); --int mvwgetch(WINDOW *, int, int); --int mvwgetnstr(WINDOW *, int, int, char *, int); --int mvwgetstr(WINDOW *, int, int, char *); --int mvwhline(WINDOW *, int, 
int, chtype, int); --int mvwinchnstr(WINDOW *, int, int, chtype *, int); --int mvwinchstr(WINDOW *, int, int, chtype *); --chtype mvwinch(WINDOW *, int, int); --int mvwinnstr(WINDOW *, int, int, char *, int); --int mvwinsch(WINDOW *, int, int, chtype); --int mvwinsnstr(WINDOW *, int, int, const char *, int); --int mvwinsstr(WINDOW *, int, int, const char *); --int mvwinstr(WINDOW *, int, int, char *); --int mvwin(WINDOW *, int, int); --int mvwprintw(WINDOW *, int, int, const char *, ...); --int mvwscanw(WINDOW *, int, int, const char *, ...); --int mvwvline(WINDOW *, int, int, chtype, int); --int napms(int); --WINDOW *newpad(int, int); --SCREEN *newterm(const char *, FILE *, FILE *); --WINDOW *newwin(int, int, int, int); --int nl(void); --int nocbreak(void); --int nodelay(WINDOW *, bool); --int noecho(void); --int nonl(void); --void noqiflush(void); --int noraw(void); --int notimeout(WINDOW *, bool); --int overlay(const WINDOW *, WINDOW *); --int overwrite(const WINDOW *, WINDOW *); --int pair_content(short, short *, short *); --int pechochar(WINDOW *, chtype); --int pnoutrefresh(WINDOW *, int, int, int, int, int, int); --int prefresh(WINDOW *, int, int, int, int, int, int); --int printw(const char *, ...); --int putwin(WINDOW *, FILE *); --void qiflush(void); --int raw(void); --int redrawwin(WINDOW *); --int refresh(void); --int reset_prog_mode(void); --int reset_shell_mode(void); --int resetty(void); --int ripoffline(int, int (*)(WINDOW *, int)); --int savetty(void); --int scanw(const char *, ...); --int scr_dump(const char *); --int scr_init(const char *); --int scr_restore(const char *); --int scr_set(const char *); --int scrl(int); --int scroll(WINDOW *); --int scrollok(WINDOW *, bool); --SCREEN *set_term(SCREEN *); --int setscrreg(int, int); --int slk_attroff(const chtype); --int slk_attr_off(const attr_t, void *); --int slk_attron(const chtype); --int slk_attr_on(const attr_t, void *); --int slk_attrset(const chtype); --int slk_attr_set(const attr_t, short, 
void *); --int slk_clear(void); --int slk_color(short); --int slk_init(int); --char *slk_label(int); --int slk_noutrefresh(void); --int slk_refresh(void); --int slk_restore(void); --int slk_set(int, const char *, int); --int slk_touch(void); --int standend(void); --int standout(void); --int start_color(void); --WINDOW *subpad(WINDOW *, int, int, int, int); --WINDOW *subwin(WINDOW *, int, int, int, int); --int syncok(WINDOW *, bool); --chtype termattrs(void); --attr_t term_attrs(void); --char *termname(void); --void timeout(int); --int touchline(WINDOW *, int, int); --int touchwin(WINDOW *); --int typeahead(int); --int untouchwin(WINDOW *); --void use_env(bool); --int vidattr(chtype); --int vid_attr(attr_t, short, void *); --int vidputs(chtype, int (*)(int)); --int vid_puts(attr_t, short, void *, int (*)(int)); --int vline(chtype, int); --int vw_printw(WINDOW *, const char *, va_list); --int vwprintw(WINDOW *, const char *, va_list); --int vw_scanw(WINDOW *, const char *, va_list); --int vwscanw(WINDOW *, const char *, va_list); --int waddchnstr(WINDOW *, const chtype *, int); --int waddchstr(WINDOW *, const chtype *); --int waddch(WINDOW *, const chtype); --int waddnstr(WINDOW *, const char *, int); --int waddstr(WINDOW *, const char *); --int wattroff(WINDOW *, chtype); --int wattron(WINDOW *, chtype); --int wattrset(WINDOW *, chtype); --int wattr_get(WINDOW *, attr_t *, short *, void *); --int wattr_off(WINDOW *, attr_t, void *); --int wattr_on(WINDOW *, attr_t, void *); --int wattr_set(WINDOW *, attr_t, short, void *); --void wbkgdset(WINDOW *, chtype); --int wbkgd(WINDOW *, chtype); --int wborder(WINDOW *, chtype, chtype, chtype, chtype, -- chtype, chtype, chtype, chtype); --int wchgat(WINDOW *, int, attr_t, short, const void *); --int wclear(WINDOW *); --int wclrtobot(WINDOW *); --int wclrtoeol(WINDOW *); --int wcolor_set(WINDOW *, short, void *); --void wcursyncup(WINDOW *); --int wdelch(WINDOW *); --int wdeleteln(WINDOW *); --int wechochar(WINDOW *, const 
chtype); --int werase(WINDOW *); --int wgetch(WINDOW *); --int wgetnstr(WINDOW *, char *, int); --int wgetstr(WINDOW *, char *); --int whline(WINDOW *, chtype, int); --int winchnstr(WINDOW *, chtype *, int); --int winchstr(WINDOW *, chtype *); --chtype winch(WINDOW *); --int winnstr(WINDOW *, char *, int); --int winsch(WINDOW *, chtype); --int winsdelln(WINDOW *, int); --int winsertln(WINDOW *); --int winsnstr(WINDOW *, const char *, int); --int winsstr(WINDOW *, const char *); --int winstr(WINDOW *, char *); --int wmove(WINDOW *, int, int); --int wnoutrefresh(WINDOW *); --int wprintw(WINDOW *, const char *, ...); --int wredrawln(WINDOW *, int, int); --int wrefresh(WINDOW *); --int wscanw(WINDOW *, const char *, ...); --int wscrl(WINDOW *, int); --int wsetscrreg(WINDOW *, int, int); --int wstandend(WINDOW *); --int wstandout(WINDOW *); --void wsyncdown(WINDOW *); --void wsyncup(WINDOW *); --void wtimeout(WINDOW *, int); --int wtouchln(WINDOW *, int, int, int); --int wvline(WINDOW *, chtype, int); -- --/* Wide-character functions */ -- --#ifdef PDC_WIDE --int addnwstr(const wchar_t *, int); --int addwstr(const wchar_t *); --int add_wch(const cchar_t *); --int add_wchnstr(const cchar_t *, int); --int add_wchstr(const cchar_t *); --int border_set(const cchar_t *, const cchar_t *, const cchar_t *, -- const cchar_t *, const cchar_t *, const cchar_t *, -- const cchar_t *, const cchar_t *); --int box_set(WINDOW *, const cchar_t *, const cchar_t *); --int echo_wchar(const cchar_t *); --int erasewchar(wchar_t *); --int getbkgrnd(cchar_t *); --int getcchar(const cchar_t *, wchar_t *, attr_t *, short *, void *); --int getn_wstr(wint_t *, int); --int get_wch(wint_t *); --int get_wstr(wint_t *); --int hline_set(const cchar_t *, int); --int innwstr(wchar_t *, int); --int ins_nwstr(const wchar_t *, int); --int ins_wch(const cchar_t *); --int ins_wstr(const wchar_t *); --int inwstr(wchar_t *); --int in_wch(cchar_t *); --int in_wchnstr(cchar_t *, int); --int in_wchstr(cchar_t *); 
--char *key_name(wchar_t); --int killwchar(wchar_t *); --int mvaddnwstr(int, int, const wchar_t *, int); --int mvaddwstr(int, int, const wchar_t *); --int mvadd_wch(int, int, const cchar_t *); --int mvadd_wchnstr(int, int, const cchar_t *, int); --int mvadd_wchstr(int, int, const cchar_t *); --int mvgetn_wstr(int, int, wint_t *, int); --int mvget_wch(int, int, wint_t *); --int mvget_wstr(int, int, wint_t *); --int mvhline_set(int, int, const cchar_t *, int); --int mvinnwstr(int, int, wchar_t *, int); --int mvins_nwstr(int, int, const wchar_t *, int); --int mvins_wch(int, int, const cchar_t *); --int mvins_wstr(int, int, const wchar_t *); --int mvinwstr(int, int, wchar_t *); --int mvin_wch(int, int, cchar_t *); --int mvin_wchnstr(int, int, cchar_t *, int); --int mvin_wchstr(int, int, cchar_t *); --int mvvline_set(int, int, const cchar_t *, int); --int mvwaddnwstr(WINDOW *, int, int, const wchar_t *, int); --int mvwaddwstr(WINDOW *, int, int, const wchar_t *); --int mvwadd_wch(WINDOW *, int, int, const cchar_t *); --int mvwadd_wchnstr(WINDOW *, int, int, const cchar_t *, int); --int mvwadd_wchstr(WINDOW *, int, int, const cchar_t *); --int mvwgetn_wstr(WINDOW *, int, int, wint_t *, int); --int mvwget_wch(WINDOW *, int, int, wint_t *); --int mvwget_wstr(WINDOW *, int, int, wint_t *); --int mvwhline_set(WINDOW *, int, int, const cchar_t *, int); --int mvwinnwstr(WINDOW *, int, int, wchar_t *, int); --int mvwins_nwstr(WINDOW *, int, int, const wchar_t *, int); --int mvwins_wch(WINDOW *, int, int, const cchar_t *); --int mvwins_wstr(WINDOW *, int, int, const wchar_t *); --int mvwin_wch(WINDOW *, int, int, cchar_t *); --int mvwin_wchnstr(WINDOW *, int, int, cchar_t *, int); --int mvwin_wchstr(WINDOW *, int, int, cchar_t *); --int mvwinwstr(WINDOW *, int, int, wchar_t *); --int mvwvline_set(WINDOW *, int, int, const cchar_t *, int); --int pecho_wchar(WINDOW *, const cchar_t*); --int setcchar(cchar_t*, const wchar_t*, const attr_t, short, const void*); --int slk_wset(int, 
const wchar_t *, int); --int unget_wch(const wchar_t); --int vline_set(const cchar_t *, int); --int waddnwstr(WINDOW *, const wchar_t *, int); --int waddwstr(WINDOW *, const wchar_t *); --int wadd_wch(WINDOW *, const cchar_t *); --int wadd_wchnstr(WINDOW *, const cchar_t *, int); --int wadd_wchstr(WINDOW *, const cchar_t *); --int wbkgrnd(WINDOW *, const cchar_t *); --void wbkgrndset(WINDOW *, const cchar_t *); --int wborder_set(WINDOW *, const cchar_t *, const cchar_t *, -- const cchar_t *, const cchar_t *, const cchar_t *, -- const cchar_t *, const cchar_t *, const cchar_t *); --int wecho_wchar(WINDOW *, const cchar_t *); --int wgetbkgrnd(WINDOW *, cchar_t *); --int wgetn_wstr(WINDOW *, wint_t *, int); --int wget_wch(WINDOW *, wint_t *); --int wget_wstr(WINDOW *, wint_t *); --int whline_set(WINDOW *, const cchar_t *, int); --int winnwstr(WINDOW *, wchar_t *, int); --int wins_nwstr(WINDOW *, const wchar_t *, int); --int wins_wch(WINDOW *, const cchar_t *); --int wins_wstr(WINDOW *, const wchar_t *); --int winwstr(WINDOW *, wchar_t *); --int win_wch(WINDOW *, cchar_t *); --int win_wchnstr(WINDOW *, cchar_t *, int); --int win_wchstr(WINDOW *, cchar_t *); --wchar_t *wunctrl(cchar_t *); --int wvline_set(WINDOW *, const cchar_t *, int); --#endif -- --/* Quasi-standard */ -- --chtype getattrs(WINDOW *); --int getbegx(WINDOW *); --int getbegy(WINDOW *); --int getmaxx(WINDOW *); --int getmaxy(WINDOW *); --int getparx(WINDOW *); --int getpary(WINDOW *); --int getcurx(WINDOW *); --int getcury(WINDOW *); --void traceoff(void); --void traceon(void); --char *unctrl(chtype); -- --int crmode(void); --int nocrmode(void); --int draino(int); --int resetterm(void); --int fixterm(void); --int saveterm(void); --int setsyx(int, int); -- --int mouse_set(unsigned long); --int mouse_on(unsigned long); --int mouse_off(unsigned long); --int request_mouse_pos(void); --int map_button(unsigned long); --void wmouse_position(WINDOW *, int *, int *); --unsigned long getmouse(void); --unsigned 
long getbmap(void); -- --/* ncurses */ -- --int assume_default_colors(int, int); --const char *curses_version(void); --bool has_key(int); --int use_default_colors(void); --int wresize(WINDOW *, int, int); -- --int mouseinterval(int); --mmask_t mousemask(mmask_t, mmask_t *); --bool mouse_trafo(int *, int *, bool); --int nc_getmouse(MEVENT *); --int ungetmouse(MEVENT *); --bool wenclose(const WINDOW *, int, int); --bool wmouse_trafo(const WINDOW *, int *, int *, bool); -- --/* PDCurses */ -- --int addrawch(chtype); --int insrawch(chtype); --bool is_termresized(void); --int mvaddrawch(int, int, chtype); --int mvdeleteln(int, int); --int mvinsertln(int, int); --int mvinsrawch(int, int, chtype); --int mvwaddrawch(WINDOW *, int, int, chtype); --int mvwdeleteln(WINDOW *, int, int); --int mvwinsertln(WINDOW *, int, int); --int mvwinsrawch(WINDOW *, int, int, chtype); --int raw_output(bool); --int resize_term(int, int); --WINDOW *resize_window(WINDOW *, int, int); --int waddrawch(WINDOW *, chtype); --int winsrawch(WINDOW *, chtype); --char wordchar(void); -- --#ifdef PDC_WIDE --wchar_t *slk_wlabel(int); --#endif -- --void PDC_debug(const char *, ...); --int PDC_ungetch(int); --int PDC_set_blink(bool); --int PDC_set_line_color(short); --void PDC_set_title(const char *); -- --int PDC_clearclipboard(void); --int PDC_freeclipboard(char *); --int PDC_getclipboard(char **, long *); --int PDC_setclipboard(const char *, long); -- --unsigned long PDC_get_input_fd(void); --unsigned long PDC_get_key_modifiers(void); --int PDC_return_key_modifiers(bool); --int PDC_save_key_modifiers(bool); -- --#ifdef XCURSES --WINDOW *Xinitscr(int, char **); --void XCursesExit(void); --int sb_init(void); --int sb_set_horz(int, int, int); --int sb_set_vert(int, int, int); --int sb_get_horz(int *, int *, int *); --int sb_get_vert(int *, int *, int *); --int sb_refresh(void); --#endif -- --/*** Functions defined as macros ***/ -- --/* getch() and ungetch() conflict with some DOS libraries */ -- --#define 
getch() wgetch(stdscr) --#define ungetch(ch) PDC_ungetch(ch) -- --#define COLOR_PAIR(n) (((chtype)(n) << PDC_COLOR_SHIFT) & A_COLOR) --#define PAIR_NUMBER(n) (((n) & A_COLOR) >> PDC_COLOR_SHIFT) -- --/* These will _only_ work as macros */ -- --#define getbegyx(w, y, x) (y = getbegy(w), x = getbegx(w)) --#define getmaxyx(w, y, x) (y = getmaxy(w), x = getmaxx(w)) --#define getparyx(w, y, x) (y = getpary(w), x = getparx(w)) --#define getyx(w, y, x) (y = getcury(w), x = getcurx(w)) -- --#define getsyx(y, x) { if (curscr->_leaveit) (y)=(x)=-1; \ -- else getyx(curscr,(y),(x)); } -- --#ifdef NCURSES_MOUSE_VERSION --# define getmouse(x) nc_getmouse(x) --#endif -- --/* return codes from PDC_getclipboard() and PDC_setclipboard() calls */ -- --#define PDC_CLIP_SUCCESS 0 --#define PDC_CLIP_ACCESS_ERROR 1 --#define PDC_CLIP_EMPTY 2 --#define PDC_CLIP_MEMORY_ERROR 3 -- --/* PDCurses key modifier masks */ -- --#define PDC_KEY_MODIFIER_SHIFT 1 --#define PDC_KEY_MODIFIER_CONTROL 2 --#define PDC_KEY_MODIFIER_ALT 4 --#define PDC_KEY_MODIFIER_NUMLOCK 8 -- --#if defined(__cplusplus) || defined(__cplusplus__) || defined(__CPLUSPLUS) --# undef bool --} --#endif -- --#endif /* __PDCURSES__ */ ---- python-pysam.orig/samtools/win32/zconf.h -+++ /dev/null -@@ -1,332 +0,0 @@ --/* zconf.h -- configuration of the zlib compression library -- * Copyright (C) 1995-2005 Jean-loup Gailly. -- * For conditions of distribution and use, see copyright notice in zlib.h -- */ -- --/* @(#) $Id$ */ -- --#ifndef ZCONF_H --#define ZCONF_H -- --/* -- * If you *really* need a unique prefix for all types and library functions, -- * compile with -DZ_PREFIX. The "standard" zlib should be compiled without it. 
-- */ --#ifdef Z_PREFIX --# define deflateInit_ z_deflateInit_ --# define deflate z_deflate --# define deflateEnd z_deflateEnd --# define inflateInit_ z_inflateInit_ --# define inflate z_inflate --# define inflateEnd z_inflateEnd --# define deflateInit2_ z_deflateInit2_ --# define deflateSetDictionary z_deflateSetDictionary --# define deflateCopy z_deflateCopy --# define deflateReset z_deflateReset --# define deflateParams z_deflateParams --# define deflateBound z_deflateBound --# define deflatePrime z_deflatePrime --# define inflateInit2_ z_inflateInit2_ --# define inflateSetDictionary z_inflateSetDictionary --# define inflateSync z_inflateSync --# define inflateSyncPoint z_inflateSyncPoint --# define inflateCopy z_inflateCopy --# define inflateReset z_inflateReset --# define inflateBack z_inflateBack --# define inflateBackEnd z_inflateBackEnd --# define compress z_compress --# define compress2 z_compress2 --# define compressBound z_compressBound --# define uncompress z_uncompress --# define adler32 z_adler32 --# define crc32 z_crc32 --# define get_crc_table z_get_crc_table --# define zError z_zError -- --# define alloc_func z_alloc_func --# define free_func z_free_func --# define in_func z_in_func --# define out_func z_out_func --# define Byte z_Byte --# define uInt z_uInt --# define uLong z_uLong --# define Bytef z_Bytef --# define charf z_charf --# define intf z_intf --# define uIntf z_uIntf --# define uLongf z_uLongf --# define voidpf z_voidpf --# define voidp z_voidp --#endif -- --#if defined(__MSDOS__) && !defined(MSDOS) --# define MSDOS --#endif --#if (defined(OS_2) || defined(__OS2__)) && !defined(OS2) --# define OS2 --#endif --#if defined(_WINDOWS) && !defined(WINDOWS) --# define WINDOWS --#endif --#if defined(_WIN32) || defined(_WIN32_WCE) || defined(__WIN32__) --# ifndef WIN32 --# define WIN32 --# endif --#endif --#if (defined(MSDOS) || defined(OS2) || defined(WINDOWS)) && !defined(WIN32) --# if !defined(__GNUC__) && !defined(__FLAT__) && 
!defined(__386__) --# ifndef SYS16BIT --# define SYS16BIT --# endif --# endif --#endif -- --/* -- * Compile with -DMAXSEG_64K if the alloc function cannot allocate more -- * than 64k bytes at a time (needed on systems with 16-bit int). -- */ --#ifdef SYS16BIT --# define MAXSEG_64K --#endif --#ifdef MSDOS --# define UNALIGNED_OK --#endif -- --#ifdef __STDC_VERSION__ --# ifndef STDC --# define STDC --# endif --# if __STDC_VERSION__ >= 199901L --# ifndef STDC99 --# define STDC99 --# endif --# endif --#endif --#if !defined(STDC) && (defined(__STDC__) || defined(__cplusplus)) --# define STDC --#endif --#if !defined(STDC) && (defined(__GNUC__) || defined(__BORLANDC__)) --# define STDC --#endif --#if !defined(STDC) && (defined(MSDOS) || defined(WINDOWS) || defined(WIN32)) --# define STDC --#endif --#if !defined(STDC) && (defined(OS2) || defined(__HOS_AIX__)) --# define STDC --#endif -- --#if defined(__OS400__) && !defined(STDC) /* iSeries (formerly AS/400). */ --# define STDC --#endif -- --#ifndef STDC --# ifndef const /* cannot use !defined(STDC) && !defined(const) on Mac */ --# define const /* note: need a more gentle solution here */ --# endif --#endif -- --/* Some Mac compilers merge all .h files incorrectly: */ --#if defined(__MWERKS__)||defined(applec)||defined(THINK_C)||defined(__SC__) --# define NO_DUMMY_DECL --#endif -- --/* Maximum value for memLevel in deflateInit2 */ --#ifndef MAX_MEM_LEVEL --# ifdef MAXSEG_64K --# define MAX_MEM_LEVEL 8 --# else --# define MAX_MEM_LEVEL 9 --# endif --#endif -- --/* Maximum value for windowBits in deflateInit2 and inflateInit2. -- * WARNING: reducing MAX_WBITS makes minigzip unable to extract .gz files -- * created by gzip. (Files created by minigzip can still be extracted by -- * gzip.) 
-- */ --#ifndef MAX_WBITS --# define MAX_WBITS 15 /* 32K LZ77 window */ --#endif -- --/* The memory requirements for deflate are (in bytes): -- (1 << (windowBits+2)) + (1 << (memLevel+9)) -- that is: 128K for windowBits=15 + 128K for memLevel = 8 (default values) -- plus a few kilobytes for small objects. For example, if you want to reduce -- the default memory requirements from 256K to 128K, compile with -- make CFLAGS="-O -DMAX_WBITS=14 -DMAX_MEM_LEVEL=7" -- Of course this will generally degrade compression (there's no free lunch). -- -- The memory requirements for inflate are (in bytes) 1 << windowBits -- that is, 32K for windowBits=15 (default value) plus a few kilobytes -- for small objects. --*/ -- -- /* Type declarations */ -- --#ifndef OF /* function prototypes */ --# ifdef STDC --# define OF(args) args --# else --# define OF(args) () --# endif --#endif -- --/* The following definitions for FAR are needed only for MSDOS mixed -- * model programming (small or medium model with some far allocations). -- * This was tested only with MSC; for other MSDOS compilers you may have -- * to define NO_MEMCPY in zutil.h. If you don't need the mixed model, -- * just define FAR to be empty. -- */ --#ifdef SYS16BIT --# if defined(M_I86SM) || defined(M_I86MM) -- /* MSC small or medium model */ --# define SMALL_MEDIUM --# ifdef _MSC_VER --# define FAR _far --# else --# define FAR far --# endif --# endif --# if (defined(__SMALL__) || defined(__MEDIUM__)) -- /* Turbo C small or medium model */ --# define SMALL_MEDIUM --# ifdef __BORLANDC__ --# define FAR _far --# else --# define FAR far --# endif --# endif --#endif -- --#if defined(WINDOWS) || defined(WIN32) -- /* If building or using zlib as a DLL, define ZLIB_DLL. -- * This is not mandatory, but it offers a little performance increase. 
-- */ --# ifdef ZLIB_DLL --# if defined(WIN32) && (!defined(__BORLANDC__) || (__BORLANDC__ >= 0x500)) --# ifdef ZLIB_INTERNAL --# define ZEXTERN extern __declspec(dllexport) --# else --# define ZEXTERN extern __declspec(dllimport) --# endif --# endif --# endif /* ZLIB_DLL */ -- /* If building or using zlib with the WINAPI/WINAPIV calling convention, -- * define ZLIB_WINAPI. -- * Caution: the standard ZLIB1.DLL is NOT compiled using ZLIB_WINAPI. -- */ --# ifdef ZLIB_WINAPI --# ifdef FAR --# undef FAR --# endif --# include -- /* No need for _export, use ZLIB.DEF instead. */ -- /* For complete Windows compatibility, use WINAPI, not __stdcall. */ --# define ZEXPORT WINAPI --# ifdef WIN32 --# define ZEXPORTVA WINAPIV --# else --# define ZEXPORTVA FAR CDECL --# endif --# endif --#endif -- --#if defined (__BEOS__) --# ifdef ZLIB_DLL --# ifdef ZLIB_INTERNAL --# define ZEXPORT __declspec(dllexport) --# define ZEXPORTVA __declspec(dllexport) --# else --# define ZEXPORT __declspec(dllimport) --# define ZEXPORTVA __declspec(dllimport) --# endif --# endif --#endif -- --#ifndef ZEXTERN --# define ZEXTERN extern --#endif --#ifndef ZEXPORT --# define ZEXPORT --#endif --#ifndef ZEXPORTVA --# define ZEXPORTVA --#endif -- --#ifndef FAR --# define FAR --#endif -- --#if !defined(__MACTYPES__) --typedef unsigned char Byte; /* 8 bits */ --#endif --typedef unsigned int uInt; /* 16 bits or more */ --typedef unsigned long uLong; /* 32 bits or more */ -- --#ifdef SMALL_MEDIUM -- /* Borland C/C++ and some old MSC versions ignore FAR inside typedef */ --# define Bytef Byte FAR --#else -- typedef Byte FAR Bytef; --#endif --typedef char FAR charf; --typedef int FAR intf; --typedef uInt FAR uIntf; --typedef uLong FAR uLongf; -- --#ifdef STDC -- typedef void const *voidpc; -- typedef void FAR *voidpf; -- typedef void *voidp; --#else -- typedef Byte const *voidpc; -- typedef Byte FAR *voidpf; -- typedef Byte *voidp; --#endif -- --#if 0 /* HAVE_UNISTD_H -- this line is updated by ./configure */ --# 
include /* for off_t */ --# include /* for SEEK_* and off_t */ --# ifdef VMS --# include /* for off_t */ --# endif --# define z_off_t off_t --#endif --#ifndef SEEK_SET --# define SEEK_SET 0 /* Seek from beginning of file. */ --# define SEEK_CUR 1 /* Seek from current position. */ --# define SEEK_END 2 /* Set file pointer to EOF plus "offset" */ --#endif --#ifndef z_off_t --# define z_off_t long --#endif -- --#if defined(__OS400__) --# define NO_vsnprintf --#endif -- --#if defined(__MVS__) --# define NO_vsnprintf --# ifdef FAR --# undef FAR --# endif --#endif -- --/* MVS linker does not support external names larger than 8 bytes */ --#if defined(__MVS__) --# pragma map(deflateInit_,"DEIN") --# pragma map(deflateInit2_,"DEIN2") --# pragma map(deflateEnd,"DEEND") --# pragma map(deflateBound,"DEBND") --# pragma map(inflateInit_,"ININ") --# pragma map(inflateInit2_,"ININ2") --# pragma map(inflateEnd,"INEND") --# pragma map(inflateSync,"INSY") --# pragma map(inflateSetDictionary,"INSEDI") --# pragma map(compressBound,"CMBND") --# pragma map(inflate_table,"INTABL") --# pragma map(inflate_fast,"INFA") --# pragma map(inflate_copyright,"INCOPY") --#endif -- --#endif /* ZCONF_H */ ---- python-pysam.orig/samtools/win32/zlib.h -+++ /dev/null -@@ -1,1357 +0,0 @@ --/* zlib.h -- interface of the 'zlib' general purpose compression library -- version 1.2.3, July 18th, 2005 -- -- Copyright (C) 1995-2005 Jean-loup Gailly and Mark Adler -- -- This software is provided 'as-is', without any express or implied -- warranty. In no event will the authors be held liable for any damages -- arising from the use of this software. -- -- Permission is granted to anyone to use this software for any purpose, -- including commercial applications, and to alter it and redistribute it -- freely, subject to the following restrictions: -- -- 1. The origin of this software must not be misrepresented; you must not -- claim that you wrote the original software. 
If you use this software -- in a product, an acknowledgment in the product documentation would be -- appreciated but is not required. -- 2. Altered source versions must be plainly marked as such, and must not be -- misrepresented as being the original software. -- 3. This notice may not be removed or altered from any source distribution. -- -- Jean-loup Gailly Mark Adler -- jloup@gzip.org madler@alumni.caltech.edu -- -- -- The data format used by the zlib library is described by RFCs (Request for -- Comments) 1950 to 1952 in the files http://www.ietf.org/rfc/rfc1950.txt -- (zlib format), rfc1951.txt (deflate format) and rfc1952.txt (gzip format). --*/ -- --#ifndef ZLIB_H --#define ZLIB_H -- --#include "zconf.h" -- --#ifdef __cplusplus --extern "C" { --#endif -- --#define ZLIB_VERSION "1.2.3" --#define ZLIB_VERNUM 0x1230 -- --/* -- The 'zlib' compression library provides in-memory compression and -- decompression functions, including integrity checks of the uncompressed -- data. This version of the library supports only one compression method -- (deflation) but other algorithms will be added later and will have the same -- stream interface. -- -- Compression can be done in a single step if the buffers are large -- enough (for example if an input file is mmap'ed), or can be done by -- repeated calls of the compression function. In the latter case, the -- application must provide more input and/or consume the output -- (providing more output space) before each call. -- -- The compressed data format used by default by the in-memory functions is -- the zlib format, which is a zlib wrapper documented in RFC 1950, wrapped -- around a deflate stream, which is itself documented in RFC 1951. -- -- The library also supports reading and writing files in gzip (.gz) format -- with an interface similar to that of stdio using the functions that start -- with "gz". The gzip format is different from the zlib format. 
gzip is a -- gzip wrapper, documented in RFC 1952, wrapped around a deflate stream. -- -- This library can optionally read and write gzip streams in memory as well. -- -- The zlib format was designed to be compact and fast for use in memory -- and on communications channels. The gzip format was designed for single- -- file compression on file systems, has a larger header than zlib to maintain -- directory information, and uses a different, slower check method than zlib. -- -- The library does not install any signal handler. The decoder checks -- the consistency of the compressed data, so the library should never -- crash even in case of corrupted input. --*/ -- --typedef voidpf (*alloc_func) OF((voidpf opaque, uInt items, uInt size)); --typedef void (*free_func) OF((voidpf opaque, voidpf address)); -- --struct internal_state; -- --typedef struct z_stream_s { -- Bytef *next_in; /* next input byte */ -- uInt avail_in; /* number of bytes available at next_in */ -- uLong total_in; /* total nb of input bytes read so far */ -- -- Bytef *next_out; /* next output byte should be put there */ -- uInt avail_out; /* remaining free space at next_out */ -- uLong total_out; /* total nb of bytes output so far */ -- -- char *msg; /* last error message, NULL if no error */ -- struct internal_state FAR *state; /* not visible by applications */ -- -- alloc_func zalloc; /* used to allocate the internal state */ -- free_func zfree; /* used to free the internal state */ -- voidpf opaque; /* private data object passed to zalloc and zfree */ -- -- int data_type; /* best guess about the data type: binary or text */ -- uLong adler; /* adler32 value of the uncompressed data */ -- uLong reserved; /* reserved for future use */ --} z_stream; -- --typedef z_stream FAR *z_streamp; -- --/* -- gzip header information passed to and from zlib routines. See RFC 1952 -- for more details on the meanings of these fields. 
--*/ --typedef struct gz_header_s { -- int text; /* true if compressed data believed to be text */ -- uLong time; /* modification time */ -- int xflags; /* extra flags (not used when writing a gzip file) */ -- int os; /* operating system */ -- Bytef *extra; /* pointer to extra field or Z_NULL if none */ -- uInt extra_len; /* extra field length (valid if extra != Z_NULL) */ -- uInt extra_max; /* space at extra (only when reading header) */ -- Bytef *name; /* pointer to zero-terminated file name or Z_NULL */ -- uInt name_max; /* space at name (only when reading header) */ -- Bytef *comment; /* pointer to zero-terminated comment or Z_NULL */ -- uInt comm_max; /* space at comment (only when reading header) */ -- int hcrc; /* true if there was or will be a header crc */ -- int done; /* true when done reading gzip header (not used -- when writing a gzip file) */ --} gz_header; -- --typedef gz_header FAR *gz_headerp; -- --/* -- The application must update next_in and avail_in when avail_in has -- dropped to zero. It must update next_out and avail_out when avail_out -- has dropped to zero. The application must initialize zalloc, zfree and -- opaque before calling the init function. All other fields are set by the -- compression library and must not be updated by the application. -- -- The opaque value provided by the application will be passed as the first -- parameter for calls of zalloc and zfree. This can be useful for custom -- memory management. The compression library attaches no meaning to the -- opaque value. -- -- zalloc must return Z_NULL if there is not enough memory for the object. -- If zlib is used in a multi-threaded application, zalloc and zfree must be -- thread safe. -- -- On 16-bit systems, the functions zalloc and zfree must be able to allocate -- exactly 65536 bytes, but will not be required to allocate more than this -- if the symbol MAXSEG_64K is defined (see zconf.h). 
WARNING: On MSDOS, -- pointers returned by zalloc for objects of exactly 65536 bytes *must* -- have their offset normalized to zero. The default allocation function -- provided by this library ensures this (see zutil.c). To reduce memory -- requirements and avoid any allocation of 64K objects, at the expense of -- compression ratio, compile the library with -DMAX_WBITS=14 (see zconf.h). -- -- The fields total_in and total_out can be used for statistics or -- progress reports. After compression, total_in holds the total size of -- the uncompressed data and may be saved for use in the decompressor -- (particularly if the decompressor wants to decompress everything in -- a single step). --*/ -- -- /* constants */ -- --#define Z_NO_FLUSH 0 --#define Z_PARTIAL_FLUSH 1 /* will be removed, use Z_SYNC_FLUSH instead */ --#define Z_SYNC_FLUSH 2 --#define Z_FULL_FLUSH 3 --#define Z_FINISH 4 --#define Z_BLOCK 5 --/* Allowed flush values; see deflate() and inflate() below for details */ -- --#define Z_OK 0 --#define Z_STREAM_END 1 --#define Z_NEED_DICT 2 --#define Z_ERRNO (-1) --#define Z_STREAM_ERROR (-2) --#define Z_DATA_ERROR (-3) --#define Z_MEM_ERROR (-4) --#define Z_BUF_ERROR (-5) --#define Z_VERSION_ERROR (-6) --/* Return codes for the compression/decompression functions. Negative -- * values are errors, positive values are used for special but normal events. 
-- */ -- --#define Z_NO_COMPRESSION 0 --#define Z_BEST_SPEED 1 --#define Z_BEST_COMPRESSION 9 --#define Z_DEFAULT_COMPRESSION (-1) --/* compression levels */ -- --#define Z_FILTERED 1 --#define Z_HUFFMAN_ONLY 2 --#define Z_RLE 3 --#define Z_FIXED 4 --#define Z_DEFAULT_STRATEGY 0 --/* compression strategy; see deflateInit2() below for details */ -- --#define Z_BINARY 0 --#define Z_TEXT 1 --#define Z_ASCII Z_TEXT /* for compatibility with 1.2.2 and earlier */ --#define Z_UNKNOWN 2 --/* Possible values of the data_type field (though see inflate()) */ -- --#define Z_DEFLATED 8 --/* The deflate compression method (the only one supported in this version) */ -- --#define Z_NULL 0 /* for initializing zalloc, zfree, opaque */ -- --#define zlib_version zlibVersion() --/* for compatibility with versions < 1.0.2 */ -- -- /* basic functions */ -- --ZEXTERN const char * ZEXPORT zlibVersion OF((void)); --/* The application can compare zlibVersion and ZLIB_VERSION for consistency. -- If the first character differs, the library code actually used is -- not compatible with the zlib.h header file used by the application. -- This check is automatically made by deflateInit and inflateInit. -- */ -- --/* --ZEXTERN int ZEXPORT deflateInit OF((z_streamp strm, int level)); -- -- Initializes the internal stream state for compression. The fields -- zalloc, zfree and opaque must be initialized before by the caller. -- If zalloc and zfree are set to Z_NULL, deflateInit updates them to -- use default allocation functions. -- -- The compression level must be Z_DEFAULT_COMPRESSION, or between 0 and 9: -- 1 gives best speed, 9 gives best compression, 0 gives no compression at -- all (the input data is simply copied a block at a time). -- Z_DEFAULT_COMPRESSION requests a default compromise between speed and -- compression (currently equivalent to level 6). 
-- -- deflateInit returns Z_OK if success, Z_MEM_ERROR if there was not -- enough memory, Z_STREAM_ERROR if level is not a valid compression level, -- Z_VERSION_ERROR if the zlib library version (zlib_version) is incompatible -- with the version assumed by the caller (ZLIB_VERSION). -- msg is set to null if there is no error message. deflateInit does not -- perform any compression: this will be done by deflate(). --*/ -- -- --ZEXTERN int ZEXPORT deflate OF((z_streamp strm, int flush)); --/* -- deflate compresses as much data as possible, and stops when the input -- buffer becomes empty or the output buffer becomes full. It may introduce some -- output latency (reading input without producing any output) except when -- forced to flush. -- -- The detailed semantics are as follows. deflate performs one or both of the -- following actions: -- -- - Compress more input starting at next_in and update next_in and avail_in -- accordingly. If not all input can be processed (because there is not -- enough room in the output buffer), next_in and avail_in are updated and -- processing will resume at this point for the next call of deflate(). -- -- - Provide more output starting at next_out and update next_out and avail_out -- accordingly. This action is forced if the parameter flush is non zero. -- Forcing flush frequently degrades the compression ratio, so this parameter -- should be set only when necessary (in interactive applications). -- Some output may be provided even if flush is not set. -- -- Before the call of deflate(), the application should ensure that at least -- one of the actions is possible, by providing more input and/or consuming -- more output, and updating avail_in or avail_out accordingly; avail_out -- should never be zero before the call. The application can consume the -- compressed output when it wants, for example when the output buffer is full -- (avail_out == 0), or after each call of deflate(). 
If deflate returns Z_OK -- and with zero avail_out, it must be called again after making room in the -- output buffer because there might be more output pending. -- -- Normally the parameter flush is set to Z_NO_FLUSH, which allows deflate to -- decide how much data to accumualte before producing output, in order to -- maximize compression. -- -- If the parameter flush is set to Z_SYNC_FLUSH, all pending output is -- flushed to the output buffer and the output is aligned on a byte boundary, so -- that the decompressor can get all input data available so far. (In particular -- avail_in is zero after the call if enough output space has been provided -- before the call.) Flushing may degrade compression for some compression -- algorithms and so it should be used only when necessary. -- -- If flush is set to Z_FULL_FLUSH, all output is flushed as with -- Z_SYNC_FLUSH, and the compression state is reset so that decompression can -- restart from this point if previous compressed data has been damaged or if -- random access is desired. Using Z_FULL_FLUSH too often can seriously degrade -- compression. -- -- If deflate returns with avail_out == 0, this function must be called again -- with the same value of the flush parameter and more output space (updated -- avail_out), until the flush is complete (deflate returns with non-zero -- avail_out). In the case of a Z_FULL_FLUSH or Z_SYNC_FLUSH, make sure that -- avail_out is greater than six to avoid repeated flush markers due to -- avail_out == 0 on return. -- -- If the parameter flush is set to Z_FINISH, pending input is processed, -- pending output is flushed and deflate returns with Z_STREAM_END if there -- was enough output space; if deflate returns with Z_OK, this function must be -- called again with Z_FINISH and more output space (updated avail_out) but no -- more input data, until it returns with Z_STREAM_END or an error. 
After -- deflate has returned Z_STREAM_END, the only possible operations on the -- stream are deflateReset or deflateEnd. -- -- Z_FINISH can be used immediately after deflateInit if all the compression -- is to be done in a single step. In this case, avail_out must be at least -- the value returned by deflateBound (see below). If deflate does not return -- Z_STREAM_END, then it must be called again as described above. -- -- deflate() sets strm->adler to the adler32 checksum of all input read -- so far (that is, total_in bytes). -- -- deflate() may update strm->data_type if it can make a good guess about -- the input data type (Z_BINARY or Z_TEXT). In doubt, the data is considered -- binary. This field is only for information purposes and does not affect -- the compression algorithm in any manner. -- -- deflate() returns Z_OK if some progress has been made (more input -- processed or more output produced), Z_STREAM_END if all input has been -- consumed and all output has been produced (only when flush is set to -- Z_FINISH), Z_STREAM_ERROR if the stream state was inconsistent (for example -- if next_in or next_out was NULL), Z_BUF_ERROR if no progress is possible -- (for example avail_in or avail_out was zero). Note that Z_BUF_ERROR is not -- fatal, and deflate() can be called again with more input and more output -- space to continue compressing. --*/ -- -- --ZEXTERN int ZEXPORT deflateEnd OF((z_streamp strm)); --/* -- All dynamically allocated data structures for this stream are freed. -- This function discards any unprocessed input and does not flush any -- pending output. -- -- deflateEnd returns Z_OK if success, Z_STREAM_ERROR if the -- stream state was inconsistent, Z_DATA_ERROR if the stream was freed -- prematurely (some input or output was discarded). In the error case, -- msg may be set but then points to a static string (which must not be -- deallocated). 
--*/ -- -- --/* --ZEXTERN int ZEXPORT inflateInit OF((z_streamp strm)); -- -- Initializes the internal stream state for decompression. The fields -- next_in, avail_in, zalloc, zfree and opaque must be initialized before by -- the caller. If next_in is not Z_NULL and avail_in is large enough (the exact -- value depends on the compression method), inflateInit determines the -- compression method from the zlib header and allocates all data structures -- accordingly; otherwise the allocation will be deferred to the first call of -- inflate. If zalloc and zfree are set to Z_NULL, inflateInit updates them to -- use default allocation functions. -- -- inflateInit returns Z_OK if success, Z_MEM_ERROR if there was not enough -- memory, Z_VERSION_ERROR if the zlib library version is incompatible with the -- version assumed by the caller. msg is set to null if there is no error -- message. inflateInit does not perform any decompression apart from reading -- the zlib header if present: this will be done by inflate(). (So next_in and -- avail_in may be modified, but next_out and avail_out are unchanged.) --*/ -- -- --ZEXTERN int ZEXPORT inflate OF((z_streamp strm, int flush)); --/* -- inflate decompresses as much data as possible, and stops when the input -- buffer becomes empty or the output buffer becomes full. It may introduce -- some output latency (reading input without producing any output) except when -- forced to flush. -- -- The detailed semantics are as follows. inflate performs one or both of the -- following actions: -- -- - Decompress more input starting at next_in and update next_in and avail_in -- accordingly. If not all input can be processed (because there is not -- enough room in the output buffer), next_in is updated and processing -- will resume at this point for the next call of inflate(). -- -- - Provide more output starting at next_out and update next_out and avail_out -- accordingly. 
inflate() provides as much output as possible, until there -- is no more input data or no more space in the output buffer (see below -- about the flush parameter). -- -- Before the call of inflate(), the application should ensure that at least -- one of the actions is possible, by providing more input and/or consuming -- more output, and updating the next_* and avail_* values accordingly. -- The application can consume the uncompressed output when it wants, for -- example when the output buffer is full (avail_out == 0), or after each -- call of inflate(). If inflate returns Z_OK and with zero avail_out, it -- must be called again after making room in the output buffer because there -- might be more output pending. -- -- The flush parameter of inflate() can be Z_NO_FLUSH, Z_SYNC_FLUSH, -- Z_FINISH, or Z_BLOCK. Z_SYNC_FLUSH requests that inflate() flush as much -- output as possible to the output buffer. Z_BLOCK requests that inflate() stop -- if and when it gets to the next deflate block boundary. When decoding the -- zlib or gzip format, this will cause inflate() to return immediately after -- the header and before the first block. When doing a raw inflate, inflate() -- will go ahead and process the first block, and will return when it gets to -- the end of that block, or when it runs out of data. -- -- The Z_BLOCK option assists in appending to or combining deflate streams. -- Also to assist in this, on return inflate() will set strm->data_type to the -- number of unused bits in the last byte taken from strm->next_in, plus 64 -- if inflate() is currently decoding the last block in the deflate stream, -- plus 128 if inflate() returned immediately after decoding an end-of-block -- code or decoding the complete header up to just before the first byte of the -- deflate stream. The end-of-block will not be indicated until all of the -- uncompressed data from that block has been written to strm->next_out. 
The -- number of unused bits may in general be greater than seven, except when -- bit 7 of data_type is set, in which case the number of unused bits will be -- less than eight. -- -- inflate() should normally be called until it returns Z_STREAM_END or an -- error. However if all decompression is to be performed in a single step -- (a single call of inflate), the parameter flush should be set to -- Z_FINISH. In this case all pending input is processed and all pending -- output is flushed; avail_out must be large enough to hold all the -- uncompressed data. (The size of the uncompressed data may have been saved -- by the compressor for this purpose.) The next operation on this stream must -- be inflateEnd to deallocate the decompression state. The use of Z_FINISH -- is never required, but can be used to inform inflate that a faster approach -- may be used for the single inflate() call. -- -- In this implementation, inflate() always flushes as much output as -- possible to the output buffer, and always uses the faster approach on the -- first call. So the only effect of the flush parameter in this implementation -- is on the return value of inflate(), as noted below, or when it returns early -- because Z_BLOCK is used. -- -- If a preset dictionary is needed after this call (see inflateSetDictionary -- below), inflate sets strm->adler to the adler32 checksum of the dictionary -- chosen by the compressor and returns Z_NEED_DICT; otherwise it sets -- strm->adler to the adler32 checksum of all output produced so far (that is, -- total_out bytes) and returns Z_OK, Z_STREAM_END or an error code as described -- below. At the end of the stream, inflate() checks that its computed adler32 -- checksum is equal to that saved by the compressor and returns Z_STREAM_END -- only if the checksum is correct. -- -- inflate() will decompress and check either zlib-wrapped or gzip-wrapped -- deflate data. The header type is detected automatically. 
Any information -- contained in the gzip header is not retained, so applications that need that -- information should instead use raw inflate, see inflateInit2() below, or -- inflateBack() and perform their own processing of the gzip header and -- trailer. -- -- inflate() returns Z_OK if some progress has been made (more input processed -- or more output produced), Z_STREAM_END if the end of the compressed data has -- been reached and all uncompressed output has been produced, Z_NEED_DICT if a -- preset dictionary is needed at this point, Z_DATA_ERROR if the input data was -- corrupted (input stream not conforming to the zlib format or incorrect check -- value), Z_STREAM_ERROR if the stream structure was inconsistent (for example -- if next_in or next_out was NULL), Z_MEM_ERROR if there was not enough memory, -- Z_BUF_ERROR if no progress is possible or if there was not enough room in the -- output buffer when Z_FINISH is used. Note that Z_BUF_ERROR is not fatal, and -- inflate() can be called again with more input and more output space to -- continue decompressing. If Z_DATA_ERROR is returned, the application may then -- call inflateSync() to look for a good compression block if a partial recovery -- of the data is desired. --*/ -- -- --ZEXTERN int ZEXPORT inflateEnd OF((z_streamp strm)); --/* -- All dynamically allocated data structures for this stream are freed. -- This function discards any unprocessed input and does not flush any -- pending output. -- -- inflateEnd returns Z_OK if success, Z_STREAM_ERROR if the stream state -- was inconsistent. In the error case, msg may be set but then points to a -- static string (which must not be deallocated). --*/ -- -- /* Advanced functions */ -- --/* -- The following functions are needed only in some special applications. 
--*/ -- --/* --ZEXTERN int ZEXPORT deflateInit2 OF((z_streamp strm, -- int level, -- int method, -- int windowBits, -- int memLevel, -- int strategy)); -- -- This is another version of deflateInit with more compression options. The -- fields next_in, zalloc, zfree and opaque must be initialized before by -- the caller. -- -- The method parameter is the compression method. It must be Z_DEFLATED in -- this version of the library. -- -- The windowBits parameter is the base two logarithm of the window size -- (the size of the history buffer). It should be in the range 8..15 for this -- version of the library. Larger values of this parameter result in better -- compression at the expense of memory usage. The default value is 15 if -- deflateInit is used instead. -- -- windowBits can also be -8..-15 for raw deflate. In this case, -windowBits -- determines the window size. deflate() will then generate raw deflate data -- with no zlib header or trailer, and will not compute an adler32 check value. -- -- windowBits can also be greater than 15 for optional gzip encoding. Add -- 16 to windowBits to write a simple gzip header and trailer around the -- compressed data instead of a zlib wrapper. The gzip header will have no -- file name, no extra data, no comment, no modification time (set to zero), -- no header crc, and the operating system will be set to 255 (unknown). If a -- gzip stream is being written, strm->adler is a crc32 instead of an adler32. -- -- The memLevel parameter specifies how much memory should be allocated -- for the internal compression state. memLevel=1 uses minimum memory but -- is slow and reduces compression ratio; memLevel=9 uses maximum memory -- for optimal speed. The default value is 8. See zconf.h for total memory -- usage as a function of windowBits and memLevel. -- -- The strategy parameter is used to tune the compression algorithm. 
Use the -- value Z_DEFAULT_STRATEGY for normal data, Z_FILTERED for data produced by a -- filter (or predictor), Z_HUFFMAN_ONLY to force Huffman encoding only (no -- string match), or Z_RLE to limit match distances to one (run-length -- encoding). Filtered data consists mostly of small values with a somewhat -- random distribution. In this case, the compression algorithm is tuned to -- compress them better. The effect of Z_FILTERED is to force more Huffman -- coding and less string matching; it is somewhat intermediate between -- Z_DEFAULT and Z_HUFFMAN_ONLY. Z_RLE is designed to be almost as fast as -- Z_HUFFMAN_ONLY, but give better compression for PNG image data. The strategy -- parameter only affects the compression ratio but not the correctness of the -- compressed output even if it is not set appropriately. Z_FIXED prevents the -- use of dynamic Huffman codes, allowing for a simpler decoder for special -- applications. -- -- deflateInit2 returns Z_OK if success, Z_MEM_ERROR if there was not enough -- memory, Z_STREAM_ERROR if a parameter is invalid (such as an invalid -- method). msg is set to null if there is no error message. deflateInit2 does -- not perform any compression: this will be done by deflate(). --*/ -- --ZEXTERN int ZEXPORT deflateSetDictionary OF((z_streamp strm, -- const Bytef *dictionary, -- uInt dictLength)); --/* -- Initializes the compression dictionary from the given byte sequence -- without producing any compressed output. This function must be called -- immediately after deflateInit, deflateInit2 or deflateReset, before any -- call of deflate. The compressor and decompressor must use exactly the same -- dictionary (see inflateSetDictionary). -- -- The dictionary should consist of strings (byte sequences) that are likely -- to be encountered later in the data to be compressed, with the most commonly -- used strings preferably put towards the end of the dictionary. 
Using a -- dictionary is most useful when the data to be compressed is short and can be -- predicted with good accuracy; the data can then be compressed better than -- with the default empty dictionary. -- -- Depending on the size of the compression data structures selected by -- deflateInit or deflateInit2, a part of the dictionary may in effect be -- discarded, for example if the dictionary is larger than the window size in -- deflate or deflate2. Thus the strings most likely to be useful should be -- put at the end of the dictionary, not at the front. In addition, the -- current implementation of deflate will use at most the window size minus -- 262 bytes of the provided dictionary. -- -- Upon return of this function, strm->adler is set to the adler32 value -- of the dictionary; the decompressor may later use this value to determine -- which dictionary has been used by the compressor. (The adler32 value -- applies to the whole dictionary even if only a subset of the dictionary is -- actually used by the compressor.) If a raw deflate was requested, then the -- adler32 value is not computed and strm->adler is not set. -- -- deflateSetDictionary returns Z_OK if success, or Z_STREAM_ERROR if a -- parameter is invalid (such as NULL dictionary) or the stream state is -- inconsistent (for example if deflate has already been called for this stream -- or if the compression method is bsort). deflateSetDictionary does not -- perform any compression: this will be done by deflate(). --*/ -- --ZEXTERN int ZEXPORT deflateCopy OF((z_streamp dest, -- z_streamp source)); --/* -- Sets the destination stream as a complete copy of the source stream. -- -- This function can be useful when several compression strategies will be -- tried, for example when there are several ways of pre-processing the input -- data with a filter. The streams that will be discarded should then be freed -- by calling deflateEnd. 
Note that deflateCopy duplicates the internal -- compression state which can be quite large, so this strategy is slow and -- can consume lots of memory. -- -- deflateCopy returns Z_OK if success, Z_MEM_ERROR if there was not -- enough memory, Z_STREAM_ERROR if the source stream state was inconsistent -- (such as zalloc being NULL). msg is left unchanged in both source and -- destination. --*/ -- --ZEXTERN int ZEXPORT deflateReset OF((z_streamp strm)); --/* -- This function is equivalent to deflateEnd followed by deflateInit, -- but does not free and reallocate all the internal compression state. -- The stream will keep the same compression level and any other attributes -- that may have been set by deflateInit2. -- -- deflateReset returns Z_OK if success, or Z_STREAM_ERROR if the source -- stream state was inconsistent (such as zalloc or state being NULL). --*/ -- --ZEXTERN int ZEXPORT deflateParams OF((z_streamp strm, -- int level, -- int strategy)); --/* -- Dynamically update the compression level and compression strategy. The -- interpretation of level and strategy is as in deflateInit2. This can be -- used to switch between compression and straight copy of the input data, or -- to switch to a different kind of input data requiring a different -- strategy. If the compression level is changed, the input available so far -- is compressed with the old level (and may be flushed); the new level will -- take effect only at the next call of deflate(). -- -- Before the call of deflateParams, the stream state must be set as for -- a call of deflate(), since the currently available input may have to -- be compressed and flushed. In particular, strm->avail_out must be non-zero. -- -- deflateParams returns Z_OK if success, Z_STREAM_ERROR if the source -- stream state was inconsistent or if a parameter was invalid, Z_BUF_ERROR -- if strm->avail_out was zero. 
--*/ -- --ZEXTERN int ZEXPORT deflateTune OF((z_streamp strm, -- int good_length, -- int max_lazy, -- int nice_length, -- int max_chain)); --/* -- Fine tune deflate's internal compression parameters. This should only be -- used by someone who understands the algorithm used by zlib's deflate for -- searching for the best matching string, and even then only by the most -- fanatic optimizer trying to squeeze out the last compressed bit for their -- specific input data. Read the deflate.c source code for the meaning of the -- max_lazy, good_length, nice_length, and max_chain parameters. -- -- deflateTune() can be called after deflateInit() or deflateInit2(), and -- returns Z_OK on success, or Z_STREAM_ERROR for an invalid deflate stream. -- */ -- --ZEXTERN uLong ZEXPORT deflateBound OF((z_streamp strm, -- uLong sourceLen)); --/* -- deflateBound() returns an upper bound on the compressed size after -- deflation of sourceLen bytes. It must be called after deflateInit() -- or deflateInit2(). This would be used to allocate an output buffer -- for deflation in a single pass, and so would be called before deflate(). --*/ -- --ZEXTERN int ZEXPORT deflatePrime OF((z_streamp strm, -- int bits, -- int value)); --/* -- deflatePrime() inserts bits in the deflate output stream. The intent -- is that this function is used to start off the deflate output with the -- bits leftover from a previous deflate stream when appending to it. As such, -- this function can only be used for raw deflate, and must be used before the -- first deflate() call after a deflateInit2() or deflateReset(). bits must be -- less than or equal to 16, and that many of the least significant bits of -- value will be inserted in the output. -- -- deflatePrime returns Z_OK if success, or Z_STREAM_ERROR if the source -- stream state was inconsistent. 
--*/ -- --ZEXTERN int ZEXPORT deflateSetHeader OF((z_streamp strm, -- gz_headerp head)); --/* -- deflateSetHeader() provides gzip header information for when a gzip -- stream is requested by deflateInit2(). deflateSetHeader() may be called -- after deflateInit2() or deflateReset() and before the first call of -- deflate(). The text, time, os, extra field, name, and comment information -- in the provided gz_header structure are written to the gzip header (xflag is -- ignored -- the extra flags are set according to the compression level). The -- caller must assure that, if not Z_NULL, name and comment are terminated with -- a zero byte, and that if extra is not Z_NULL, that extra_len bytes are -- available there. If hcrc is true, a gzip header crc is included. Note that -- the current versions of the command-line version of gzip (up through version -- 1.3.x) do not support header crc's, and will report that it is a "multi-part -- gzip file" and give up. -- -- If deflateSetHeader is not used, the default gzip header has text false, -- the time set to zero, and os set to 255, with no extra, name, or comment -- fields. The gzip header is returned to the default state by deflateReset(). -- -- deflateSetHeader returns Z_OK if success, or Z_STREAM_ERROR if the source -- stream state was inconsistent. --*/ -- --/* --ZEXTERN int ZEXPORT inflateInit2 OF((z_streamp strm, -- int windowBits)); -- -- This is another version of inflateInit with an extra parameter. The -- fields next_in, avail_in, zalloc, zfree and opaque must be initialized -- before by the caller. -- -- The windowBits parameter is the base two logarithm of the maximum window -- size (the size of the history buffer). It should be in the range 8..15 for -- this version of the library. The default value is 15 if inflateInit is used -- instead. windowBits must be greater than or equal to the windowBits value -- provided to deflateInit2() while compressing, or it must be equal to 15 if -- deflateInit2() was not used. 
If a compressed stream with a larger window -- size is given as input, inflate() will return with the error code -- Z_DATA_ERROR instead of trying to allocate a larger window. -- -- windowBits can also be -8..-15 for raw inflate. In this case, -windowBits -- determines the window size. inflate() will then process raw deflate data, -- not looking for a zlib or gzip header, not generating a check value, and not -- looking for any check values for comparison at the end of the stream. This -- is for use with other formats that use the deflate compressed data format -- such as zip. Those formats provide their own check values. If a custom -- format is developed using the raw deflate format for compressed data, it is -- recommended that a check value such as an adler32 or a crc32 be applied to -- the uncompressed data as is done in the zlib, gzip, and zip formats. For -- most applications, the zlib format should be used as is. Note that comments -- above on the use in deflateInit2() applies to the magnitude of windowBits. -- -- windowBits can also be greater than 15 for optional gzip decoding. Add -- 32 to windowBits to enable zlib and gzip decoding with automatic header -- detection, or add 16 to decode only the gzip format (the zlib format will -- return a Z_DATA_ERROR). If a gzip stream is being decoded, strm->adler is -- a crc32 instead of an adler32. -- -- inflateInit2 returns Z_OK if success, Z_MEM_ERROR if there was not enough -- memory, Z_STREAM_ERROR if a parameter is invalid (such as a null strm). msg -- is set to null if there is no error message. inflateInit2 does not perform -- any decompression apart from reading the zlib header if present: this will -- be done by inflate(). (So next_in and avail_in may be modified, but next_out -- and avail_out are unchanged.) 
--*/ -- --ZEXTERN int ZEXPORT inflateSetDictionary OF((z_streamp strm, -- const Bytef *dictionary, -- uInt dictLength)); --/* -- Initializes the decompression dictionary from the given uncompressed byte -- sequence. This function must be called immediately after a call of inflate, -- if that call returned Z_NEED_DICT. The dictionary chosen by the compressor -- can be determined from the adler32 value returned by that call of inflate. -- The compressor and decompressor must use exactly the same dictionary (see -- deflateSetDictionary). For raw inflate, this function can be called -- immediately after inflateInit2() or inflateReset() and before any call of -- inflate() to set the dictionary. The application must insure that the -- dictionary that was used for compression is provided. -- -- inflateSetDictionary returns Z_OK if success, Z_STREAM_ERROR if a -- parameter is invalid (such as NULL dictionary) or the stream state is -- inconsistent, Z_DATA_ERROR if the given dictionary doesn't match the -- expected one (incorrect adler32 value). inflateSetDictionary does not -- perform any decompression: this will be done by subsequent calls of -- inflate(). --*/ -- --ZEXTERN int ZEXPORT inflateSync OF((z_streamp strm)); --/* -- Skips invalid compressed data until a full flush point (see above the -- description of deflate with Z_FULL_FLUSH) can be found, or until all -- available input is skipped. No output is provided. -- -- inflateSync returns Z_OK if a full flush point has been found, Z_BUF_ERROR -- if no more input was provided, Z_DATA_ERROR if no flush point has been found, -- or Z_STREAM_ERROR if the stream structure was inconsistent. In the success -- case, the application may save the current current value of total_in which -- indicates where valid compressed data was found. In the error case, the -- application may repeatedly call inflateSync, providing more input each time, -- until success or end of the input data. 
--*/ -- --ZEXTERN int ZEXPORT inflateCopy OF((z_streamp dest, -- z_streamp source)); --/* -- Sets the destination stream as a complete copy of the source stream. -- -- This function can be useful when randomly accessing a large stream. The -- first pass through the stream can periodically record the inflate state, -- allowing restarting inflate at those points when randomly accessing the -- stream. -- -- inflateCopy returns Z_OK if success, Z_MEM_ERROR if there was not -- enough memory, Z_STREAM_ERROR if the source stream state was inconsistent -- (such as zalloc being NULL). msg is left unchanged in both source and -- destination. --*/ -- --ZEXTERN int ZEXPORT inflateReset OF((z_streamp strm)); --/* -- This function is equivalent to inflateEnd followed by inflateInit, -- but does not free and reallocate all the internal decompression state. -- The stream will keep attributes that may have been set by inflateInit2. -- -- inflateReset returns Z_OK if success, or Z_STREAM_ERROR if the source -- stream state was inconsistent (such as zalloc or state being NULL). --*/ -- --ZEXTERN int ZEXPORT inflatePrime OF((z_streamp strm, -- int bits, -- int value)); --/* -- This function inserts bits in the inflate input stream. The intent is -- that this function is used to start inflating at a bit position in the -- middle of a byte. The provided bits will be used before any bytes are used -- from next_in. This function should only be used with raw inflate, and -- should be used before the first inflate() call after inflateInit2() or -- inflateReset(). bits must be less than or equal to 16, and that many of the -- least significant bits of value will be inserted in the input. -- -- inflatePrime returns Z_OK if success, or Z_STREAM_ERROR if the source -- stream state was inconsistent. 
--*/ -- --ZEXTERN int ZEXPORT inflateGetHeader OF((z_streamp strm, -- gz_headerp head)); --/* -- inflateGetHeader() requests that gzip header information be stored in the -- provided gz_header structure. inflateGetHeader() may be called after -- inflateInit2() or inflateReset(), and before the first call of inflate(). -- As inflate() processes the gzip stream, head->done is zero until the header -- is completed, at which time head->done is set to one. If a zlib stream is -- being decoded, then head->done is set to -1 to indicate that there will be -- no gzip header information forthcoming. Note that Z_BLOCK can be used to -- force inflate() to return immediately after header processing is complete -- and before any actual data is decompressed. -- -- The text, time, xflags, and os fields are filled in with the gzip header -- contents. hcrc is set to true if there is a header CRC. (The header CRC -- was valid if done is set to one.) If extra is not Z_NULL, then extra_max -- contains the maximum number of bytes to write to extra. Once done is true, -- extra_len contains the actual extra field length, and extra contains the -- extra field, or that field truncated if extra_max is less than extra_len. -- If name is not Z_NULL, then up to name_max characters are written there, -- terminated with a zero unless the length is greater than name_max. If -- comment is not Z_NULL, then up to comm_max characters are written there, -- terminated with a zero unless the length is greater than comm_max. When -- any of extra, name, or comment are not Z_NULL and the respective field is -- not present in the header, then that field is set to Z_NULL to signal its -- absence. This allows the use of deflateSetHeader() with the returned -- structure to duplicate the header. However if those fields are set to -- allocated memory, then the application will need to save those pointers -- elsewhere so that they can be eventually freed. 
-- -- If inflateGetHeader is not used, then the header information is simply -- discarded. The header is always checked for validity, including the header -- CRC if present. inflateReset() will reset the process to discard the header -- information. The application would need to call inflateGetHeader() again to -- retrieve the header from the next gzip stream. -- -- inflateGetHeader returns Z_OK if success, or Z_STREAM_ERROR if the source -- stream state was inconsistent. --*/ -- --/* --ZEXTERN int ZEXPORT inflateBackInit OF((z_streamp strm, int windowBits, -- unsigned char FAR *window)); -- -- Initialize the internal stream state for decompression using inflateBack() -- calls. The fields zalloc, zfree and opaque in strm must be initialized -- before the call. If zalloc and zfree are Z_NULL, then the default library- -- derived memory allocation routines are used. windowBits is the base two -- logarithm of the window size, in the range 8..15. window is a caller -- supplied buffer of that size. Except for special applications where it is -- assured that deflate was used with small window sizes, windowBits must be 15 -- and a 32K byte window must be supplied to be able to decompress general -- deflate streams. -- -- See inflateBack() for the usage of these routines. -- -- inflateBackInit will return Z_OK on success, Z_STREAM_ERROR if any of -- the paramaters are invalid, Z_MEM_ERROR if the internal state could not -- be allocated, or Z_VERSION_ERROR if the version of the library does not -- match the version of the header file. --*/ -- --typedef unsigned (*in_func) OF((void FAR *, unsigned char FAR * FAR *)); --typedef int (*out_func) OF((void FAR *, unsigned char FAR *, unsigned)); -- --ZEXTERN int ZEXPORT inflateBack OF((z_streamp strm, -- in_func in, void FAR *in_desc, -- out_func out, void FAR *out_desc)); --/* -- inflateBack() does a raw inflate with a single call using a call-back -- interface for input and output. 
This is more efficient than inflate() for -- file i/o applications in that it avoids copying between the output and the -- sliding window by simply making the window itself the output buffer. This -- function trusts the application to not change the output buffer passed by -- the output function, at least until inflateBack() returns. -- -- inflateBackInit() must be called first to allocate the internal state -- and to initialize the state with the user-provided window buffer. -- inflateBack() may then be used multiple times to inflate a complete, raw -- deflate stream with each call. inflateBackEnd() is then called to free -- the allocated state. -- -- A raw deflate stream is one with no zlib or gzip header or trailer. -- This routine would normally be used in a utility that reads zip or gzip -- files and writes out uncompressed files. The utility would decode the -- header and process the trailer on its own, hence this routine expects -- only the raw deflate stream to decompress. This is different from the -- normal behavior of inflate(), which expects either a zlib or gzip header and -- trailer around the deflate stream. -- -- inflateBack() uses two subroutines supplied by the caller that are then -- called by inflateBack() for input and output. inflateBack() calls those -- routines until it reads a complete deflate stream and writes out all of the -- uncompressed data, or until it encounters an error. The function's -- parameters and return types are defined above in the in_func and out_func -- typedefs. inflateBack() will call in(in_desc, &buf) which should return the -- number of bytes of provided input, and a pointer to that input in buf. If -- there is no input available, in() must return zero--buf is ignored in that -- case--and inflateBack() will return a buffer error. inflateBack() will call -- out(out_desc, buf, len) to write the uncompressed data buf[0..len-1]. out() -- should return zero on success, or non-zero on failure. 
If out() returns -- non-zero, inflateBack() will return with an error. Neither in() nor out() -- are permitted to change the contents of the window provided to -- inflateBackInit(), which is also the buffer that out() uses to write from. -- The length written by out() will be at most the window size. Any non-zero -- amount of input may be provided by in(). -- -- For convenience, inflateBack() can be provided input on the first call by -- setting strm->next_in and strm->avail_in. If that input is exhausted, then -- in() will be called. Therefore strm->next_in must be initialized before -- calling inflateBack(). If strm->next_in is Z_NULL, then in() will be called -- immediately for input. If strm->next_in is not Z_NULL, then strm->avail_in -- must also be initialized, and then if strm->avail_in is not zero, input will -- initially be taken from strm->next_in[0 .. strm->avail_in - 1]. -- -- The in_desc and out_desc parameters of inflateBack() is passed as the -- first parameter of in() and out() respectively when they are called. These -- descriptors can be optionally used to pass any information that the caller- -- supplied in() and out() functions need to do their job. -- -- On return, inflateBack() will set strm->next_in and strm->avail_in to -- pass back any unused input that was provided by the last in() call. The -- return values of inflateBack() can be Z_STREAM_END on success, Z_BUF_ERROR -- if in() or out() returned an error, Z_DATA_ERROR if there was a format -- error in the deflate stream (in which case strm->msg is set to indicate the -- nature of the error), or Z_STREAM_ERROR if the stream was not properly -- initialized. In the case of Z_BUF_ERROR, an input or output error can be -- distinguished using strm->next_in which will be Z_NULL only if in() returned -- an error. If strm->next is not Z_NULL, then the Z_BUF_ERROR was due to -- out() returning non-zero. 
(in() will always be called before out(), so -- strm->next_in is assured to be defined if out() returns non-zero.) Note -- that inflateBack() cannot return Z_OK. --*/ -- --ZEXTERN int ZEXPORT inflateBackEnd OF((z_streamp strm)); --/* -- All memory allocated by inflateBackInit() is freed. -- -- inflateBackEnd() returns Z_OK on success, or Z_STREAM_ERROR if the stream -- state was inconsistent. --*/ -- --ZEXTERN uLong ZEXPORT zlibCompileFlags OF((void)); --/* Return flags indicating compile-time options. -- -- Type sizes, two bits each, 00 = 16 bits, 01 = 32, 10 = 64, 11 = other: -- 1.0: size of uInt -- 3.2: size of uLong -- 5.4: size of voidpf (pointer) -- 7.6: size of z_off_t -- -- Compiler, assembler, and debug options: -- 8: DEBUG -- 9: ASMV or ASMINF -- use ASM code -- 10: ZLIB_WINAPI -- exported functions use the WINAPI calling convention -- 11: 0 (reserved) -- -- One-time table building (smaller code, but not thread-safe if true): -- 12: BUILDFIXED -- build static block decoding tables when needed -- 13: DYNAMIC_CRC_TABLE -- build CRC calculation tables when needed -- 14,15: 0 (reserved) -- -- Library content (indicates missing functionality): -- 16: NO_GZCOMPRESS -- gz* functions cannot compress (to avoid linking -- deflate code when not needed) -- 17: NO_GZIP -- deflate can't write gzip streams, and inflate can't detect -- and decode gzip streams (to avoid linking crc code) -- 18-19: 0 (reserved) -- -- Operation variations (changes in library functionality): -- 20: PKZIP_BUG_WORKAROUND -- slightly more permissive inflate -- 21: FASTEST -- deflate algorithm with only one, lowest compression level -- 22,23: 0 (reserved) -- -- The sprintf variant used by gzprintf (zero is best): -- 24: 0 = vs*, 1 = s* -- 1 means limited to 20 arguments after the format -- 25: 0 = *nprintf, 1 = *printf -- 1 means gzprintf() not secure! 
-- 26: 0 = returns value, 1 = void -- 1 means inferred string length returned -- -- Remainder: -- 27-31: 0 (reserved) -- */ -- -- -- /* utility functions */ -- --/* -- The following utility functions are implemented on top of the -- basic stream-oriented functions. To simplify the interface, some -- default options are assumed (compression level and memory usage, -- standard memory allocation functions). The source code of these -- utility functions can easily be modified if you need special options. --*/ -- --ZEXTERN int ZEXPORT compress OF((Bytef *dest, uLongf *destLen, -- const Bytef *source, uLong sourceLen)); --/* -- Compresses the source buffer into the destination buffer. sourceLen is -- the byte length of the source buffer. Upon entry, destLen is the total -- size of the destination buffer, which must be at least the value returned -- by compressBound(sourceLen). Upon exit, destLen is the actual size of the -- compressed buffer. -- This function can be used to compress a whole file at once if the -- input file is mmap'ed. -- compress returns Z_OK if success, Z_MEM_ERROR if there was not -- enough memory, Z_BUF_ERROR if there was not enough room in the output -- buffer. --*/ -- --ZEXTERN int ZEXPORT compress2 OF((Bytef *dest, uLongf *destLen, -- const Bytef *source, uLong sourceLen, -- int level)); --/* -- Compresses the source buffer into the destination buffer. The level -- parameter has the same meaning as in deflateInit. sourceLen is the byte -- length of the source buffer. Upon entry, destLen is the total size of the -- destination buffer, which must be at least the value returned by -- compressBound(sourceLen). Upon exit, destLen is the actual size of the -- compressed buffer. -- -- compress2 returns Z_OK if success, Z_MEM_ERROR if there was not enough -- memory, Z_BUF_ERROR if there was not enough room in the output buffer, -- Z_STREAM_ERROR if the level parameter is invalid. 
--*/ -- --ZEXTERN uLong ZEXPORT compressBound OF((uLong sourceLen)); --/* -- compressBound() returns an upper bound on the compressed size after -- compress() or compress2() on sourceLen bytes. It would be used before -- a compress() or compress2() call to allocate the destination buffer. --*/ -- --ZEXTERN int ZEXPORT uncompress OF((Bytef *dest, uLongf *destLen, -- const Bytef *source, uLong sourceLen)); --/* -- Decompresses the source buffer into the destination buffer. sourceLen is -- the byte length of the source buffer. Upon entry, destLen is the total -- size of the destination buffer, which must be large enough to hold the -- entire uncompressed data. (The size of the uncompressed data must have -- been saved previously by the compressor and transmitted to the decompressor -- by some mechanism outside the scope of this compression library.) -- Upon exit, destLen is the actual size of the compressed buffer. -- This function can be used to decompress a whole file at once if the -- input file is mmap'ed. -- -- uncompress returns Z_OK if success, Z_MEM_ERROR if there was not -- enough memory, Z_BUF_ERROR if there was not enough room in the output -- buffer, or Z_DATA_ERROR if the input data was corrupted or incomplete. --*/ -- -- --typedef voidp gzFile; -- --ZEXTERN gzFile ZEXPORT gzopen OF((const char *path, const char *mode)); --/* -- Opens a gzip (.gz) file for reading or writing. The mode parameter -- is as in fopen ("rb" or "wb") but can also include a compression level -- ("wb9") or a strategy: 'f' for filtered data as in "wb6f", 'h' for -- Huffman only compression as in "wb1h", or 'R' for run-length encoding -- as in "wb1R". (See the description of deflateInit2 for more information -- about the strategy parameter.) -- -- gzopen can be used to read a file which is not in gzip format; in this -- case gzread will directly read from the file without decompression. 
-- -- gzopen returns NULL if the file could not be opened or if there was -- insufficient memory to allocate the (de)compression state; errno -- can be checked to distinguish the two cases (if errno is zero, the -- zlib error is Z_MEM_ERROR). */ -- --ZEXTERN gzFile ZEXPORT gzdopen OF((int fd, const char *mode)); --/* -- gzdopen() associates a gzFile with the file descriptor fd. File -- descriptors are obtained from calls like open, dup, creat, pipe or -- fileno (in the file has been previously opened with fopen). -- The mode parameter is as in gzopen. -- The next call of gzclose on the returned gzFile will also close the -- file descriptor fd, just like fclose(fdopen(fd), mode) closes the file -- descriptor fd. If you want to keep fd open, use gzdopen(dup(fd), mode). -- gzdopen returns NULL if there was insufficient memory to allocate -- the (de)compression state. --*/ -- --ZEXTERN int ZEXPORT gzsetparams OF((gzFile file, int level, int strategy)); --/* -- Dynamically update the compression level or strategy. See the description -- of deflateInit2 for the meaning of these parameters. -- gzsetparams returns Z_OK if success, or Z_STREAM_ERROR if the file was not -- opened for writing. --*/ -- --ZEXTERN int ZEXPORT gzread OF((gzFile file, voidp buf, unsigned len)); --/* -- Reads the given number of uncompressed bytes from the compressed file. -- If the input file was not in gzip format, gzread copies the given number -- of bytes into the buffer. -- gzread returns the number of uncompressed bytes actually read (0 for -- end of file, -1 for error). */ -- --ZEXTERN int ZEXPORT gzwrite OF((gzFile file, -- voidpc buf, unsigned len)); --/* -- Writes the given number of uncompressed bytes into the compressed file. -- gzwrite returns the number of uncompressed bytes actually written -- (0 in case of error). 
--*/ -- --ZEXTERN int ZEXPORTVA gzprintf OF((gzFile file, const char *format, ...)); --/* -- Converts, formats, and writes the args to the compressed file under -- control of the format string, as in fprintf. gzprintf returns the number of -- uncompressed bytes actually written (0 in case of error). The number of -- uncompressed bytes written is limited to 4095. The caller should assure that -- this limit is not exceeded. If it is exceeded, then gzprintf() will return -- return an error (0) with nothing written. In this case, there may also be a -- buffer overflow with unpredictable consequences, which is possible only if -- zlib was compiled with the insecure functions sprintf() or vsprintf() -- because the secure snprintf() or vsnprintf() functions were not available. --*/ -- --ZEXTERN int ZEXPORT gzputs OF((gzFile file, const char *s)); --/* -- Writes the given null-terminated string to the compressed file, excluding -- the terminating null character. -- gzputs returns the number of characters written, or -1 in case of error. --*/ -- --ZEXTERN char * ZEXPORT gzgets OF((gzFile file, char *buf, int len)); --/* -- Reads bytes from the compressed file until len-1 characters are read, or -- a newline character is read and transferred to buf, or an end-of-file -- condition is encountered. The string is then terminated with a null -- character. -- gzgets returns buf, or Z_NULL in case of error. --*/ -- --ZEXTERN int ZEXPORT gzputc OF((gzFile file, int c)); --/* -- Writes c, converted to an unsigned char, into the compressed file. -- gzputc returns the value that was written, or -1 in case of error. --*/ -- --ZEXTERN int ZEXPORT gzgetc OF((gzFile file)); --/* -- Reads one byte from the compressed file. gzgetc returns this byte -- or -1 in case of end of file or error. --*/ -- --ZEXTERN int ZEXPORT gzungetc OF((int c, gzFile file)); --/* -- Push one character back onto the stream to be read again later. -- Only one character of push-back is allowed. 
gzungetc() returns the -- character pushed, or -1 on failure. gzungetc() will fail if a -- character has been pushed but not read yet, or if c is -1. The pushed -- character will be discarded if the stream is repositioned with gzseek() -- or gzrewind(). --*/ -- --ZEXTERN int ZEXPORT gzflush OF((gzFile file, int flush)); --/* -- Flushes all pending output into the compressed file. The parameter -- flush is as in the deflate() function. The return value is the zlib -- error number (see function gzerror below). gzflush returns Z_OK if -- the flush parameter is Z_FINISH and all output could be flushed. -- gzflush should be called only when strictly necessary because it can -- degrade compression. --*/ -- --ZEXTERN z_off_t ZEXPORT gzseek OF((gzFile file, -- z_off_t offset, int whence)); --/* -- Sets the starting position for the next gzread or gzwrite on the -- given compressed file. The offset represents a number of bytes in the -- uncompressed data stream. The whence parameter is defined as in lseek(2); -- the value SEEK_END is not supported. -- If the file is opened for reading, this function is emulated but can be -- extremely slow. If the file is opened for writing, only forward seeks are -- supported; gzseek then compresses a sequence of zeroes up to the new -- starting position. -- -- gzseek returns the resulting offset location as measured in bytes from -- the beginning of the uncompressed stream, or -1 in case of error, in -- particular if the file is opened for writing and the new starting position -- would be before the current position. --*/ -- --ZEXTERN int ZEXPORT gzrewind OF((gzFile file)); --/* -- Rewinds the given file. This function is supported only for reading. -- -- gzrewind(file) is equivalent to (int)gzseek(file, 0L, SEEK_SET) --*/ -- --ZEXTERN z_off_t ZEXPORT gztell OF((gzFile file)); --/* -- Returns the starting position for the next gzread or gzwrite on the -- given compressed file. 
This position represents a number of bytes in the -- uncompressed data stream. -- -- gztell(file) is equivalent to gzseek(file, 0L, SEEK_CUR) --*/ -- --ZEXTERN int ZEXPORT gzeof OF((gzFile file)); --/* -- Returns 1 when EOF has previously been detected reading the given -- input stream, otherwise zero. --*/ -- --ZEXTERN int ZEXPORT gzdirect OF((gzFile file)); --/* -- Returns 1 if file is being read directly without decompression, otherwise -- zero. --*/ -- --ZEXTERN int ZEXPORT gzclose OF((gzFile file)); --/* -- Flushes all pending output if necessary, closes the compressed file -- and deallocates all the (de)compression state. The return value is the zlib -- error number (see function gzerror below). --*/ -- --ZEXTERN const char * ZEXPORT gzerror OF((gzFile file, int *errnum)); --/* -- Returns the error message for the last error which occurred on the -- given compressed file. errnum is set to zlib error number. If an -- error occurred in the file system and not in the compression library, -- errnum is set to Z_ERRNO and the application may consult errno -- to get the exact error code. --*/ -- --ZEXTERN void ZEXPORT gzclearerr OF((gzFile file)); --/* -- Clears the error and end-of-file flags for file. This is analogous to the -- clearerr() function in stdio. This is useful for continuing to read a gzip -- file that is being written concurrently. --*/ -- -- /* checksum functions */ -- --/* -- These functions are not related to compression but are exported -- anyway because they might be useful in applications using the -- compression library. --*/ -- --ZEXTERN uLong ZEXPORT adler32 OF((uLong adler, const Bytef *buf, uInt len)); --/* -- Update a running Adler-32 checksum with the bytes buf[0..len-1] and -- return the updated checksum. If buf is NULL, this function returns -- the required initial value for the checksum. -- An Adler-32 checksum is almost as reliable as a CRC32 but can be computed -- much faster. 
Usage example: -- -- uLong adler = adler32(0L, Z_NULL, 0); -- -- while (read_buffer(buffer, length) != EOF) { -- adler = adler32(adler, buffer, length); -- } -- if (adler != original_adler) error(); --*/ -- --ZEXTERN uLong ZEXPORT adler32_combine OF((uLong adler1, uLong adler2, -- z_off_t len2)); --/* -- Combine two Adler-32 checksums into one. For two sequences of bytes, seq1 -- and seq2 with lengths len1 and len2, Adler-32 checksums were calculated for -- each, adler1 and adler2. adler32_combine() returns the Adler-32 checksum of -- seq1 and seq2 concatenated, requiring only adler1, adler2, and len2. --*/ -- --ZEXTERN uLong ZEXPORT crc32 OF((uLong crc, const Bytef *buf, uInt len)); --/* -- Update a running CRC-32 with the bytes buf[0..len-1] and return the -- updated CRC-32. If buf is NULL, this function returns the required initial -- value for the for the crc. Pre- and post-conditioning (one's complement) is -- performed within this function so it shouldn't be done by the application. -- Usage example: -- -- uLong crc = crc32(0L, Z_NULL, 0); -- -- while (read_buffer(buffer, length) != EOF) { -- crc = crc32(crc, buffer, length); -- } -- if (crc != original_crc) error(); --*/ -- --ZEXTERN uLong ZEXPORT crc32_combine OF((uLong crc1, uLong crc2, z_off_t len2)); -- --/* -- Combine two CRC-32 check values into one. For two sequences of bytes, -- seq1 and seq2 with lengths len1 and len2, CRC-32 check values were -- calculated for each, crc1 and crc2. crc32_combine() returns the CRC-32 -- check value of seq1 and seq2 concatenated, requiring only crc1, crc2, and -- len2. 
--*/ -- -- -- /* various hacks, don't look :) */ -- --/* deflateInit and inflateInit are macros to allow checking the zlib version -- * and the compiler's view of z_stream: -- */ --ZEXTERN int ZEXPORT deflateInit_ OF((z_streamp strm, int level, -- const char *version, int stream_size)); --ZEXTERN int ZEXPORT inflateInit_ OF((z_streamp strm, -- const char *version, int stream_size)); --ZEXTERN int ZEXPORT deflateInit2_ OF((z_streamp strm, int level, int method, -- int windowBits, int memLevel, -- int strategy, const char *version, -- int stream_size)); --ZEXTERN int ZEXPORT inflateInit2_ OF((z_streamp strm, int windowBits, -- const char *version, int stream_size)); --ZEXTERN int ZEXPORT inflateBackInit_ OF((z_streamp strm, int windowBits, -- unsigned char FAR *window, -- const char *version, -- int stream_size)); --#define deflateInit(strm, level) \ -- deflateInit_((strm), (level), ZLIB_VERSION, sizeof(z_stream)) --#define inflateInit(strm) \ -- inflateInit_((strm), ZLIB_VERSION, sizeof(z_stream)) --#define deflateInit2(strm, level, method, windowBits, memLevel, strategy) \ -- deflateInit2_((strm),(level),(method),(windowBits),(memLevel),\ -- (strategy), ZLIB_VERSION, sizeof(z_stream)) --#define inflateInit2(strm, windowBits) \ -- inflateInit2_((strm), (windowBits), ZLIB_VERSION, sizeof(z_stream)) --#define inflateBackInit(strm, windowBits, window) \ -- inflateBackInit_((strm), (windowBits), (window), \ -- ZLIB_VERSION, sizeof(z_stream)) -- -- --#if !defined(ZUTIL_H) && !defined(NO_DUMMY_DECL) -- struct internal_state {int dummy;}; /* hack for buggy compilers */ --#endif -- --ZEXTERN const char * ZEXPORT zError OF((int)); --ZEXTERN int ZEXPORT inflateSyncPoint OF((z_streamp z)); --ZEXTERN const uLongf * ZEXPORT get_crc_table OF((void)); -- --#ifdef __cplusplus --} --#endif -- --#endif /* ZLIB_H */ ---- python-pysam.orig/tests/pysam_data/example_empty_with_header.sam -+++ /dev/null -@@ -1 +0,0 @@ --@HD VN:1.3 SO:coordinate ---- 
python-pysam.orig/tests/pysam_data/rg_with_tab.sam -+++ /dev/null -@@ -1,3273 +0,0 @@ --@SQ SN:chr1 LN:1575 --@SQ SN:chr2 LN:1584 --@PG ID:bwa PN:bwa VN:0.7.9a-r786 CL:bwa mem -p -t 8 -M -R @RG ID:None SM:None /mnt/data/hg19.fa /mnt/analysis/default-0.fastq --EAS56_57:6:190:289:82 69 chr1 100 0 * = 100 0 CTCAAGGTTGTTGCAAGGGGGTCTATGTGAACAAA <<<7<<<;<<<<<<<<8;;<7;4<;<;;;;;94<; MF:i:192 --EAS56_57:6:190:289:82 137 chr1 100 73 35M = 100 0 AGGGGTGCAGAGCCGAGTCACGGGGTTGCCAGCAC <<<<<<;<<<<<<<<<<;<<;<<<<;8<6;9;;2; MF:i:64 Aq:i:0 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS51_64:3:190:727:308 99 chr1 103 99 35M = 263 195 GGTGCAGAGCCGAGTCACGGGGTTGCCAGCACAGG <<<<<<<<<<<<<<<<<<<<<<<<<<<::<<<844 MF:i:18 Aq:i:73 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS112_34:7:141:80:875 99 chr1 110 99 35M = 265 190 AGCCGAGTCACGGGGTTGCCAGCACAGGGGCTTAA <<<<<<<<<<<<<<<<<<<<<<:<<8;<<8+7;-7 MF:i:18 Aq:i:69 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS219_FC30151:3:40:1128:1940 163 chr1 112 99 35M = 291 214 CCGAGTCACGGGGTTGCCAGCACAGGGGCTTAACC <<<<<<<<<<<<<<<<<<<;<<5;;<<<9;;;;7: MF:i:18 Aq:i:70 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS51_62:5:290:319:736 69 chr1 113 0 * = 113 0 GTTCTCAAGGTTGTTGCAAGGGGGTCTATGTGAAC <<<<<<:7:<.<<<<7<<.<.<<.9*<4<:<4%74 MF:i:192 --EAS51_62:5:290:319:736 137 chr1 113 73 35M = 113 0 CGAGTCACGGGGTTGCCAGCACAGGGGCTTAACCT ==;=======7====6=;==:;;====66=::27: MF:i:64 Aq:i:0 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --B7_597:2:132:493:921 69 chr1 119 0 * = 119 0 GTTCTCAAGGTTGTTGCAAGGGGGTCTATGTGAAC <<<<<<<<<<<<<<<<<<<;<<<<77;0<;;6777 MF:i:192 --B7_597:2:132:493:921 137 chr1 119 75 35M = 119 0 ACGGGGTTGCCAGCACAGGGGCTTAACCTCTGGTG <<<<<<<<<<<<<<<<;<<<<<<<<;<<<<;;88: MF:i:64 Aq:i:0 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS114_30:7:283:799:560 163 chr1 121 66 35M = 283 197 GGGGTTGCCAGCACAGGGGCTTAACCTCTGGTGAC <<<<+<<<<8<<<+<<<<<;<<:07;8;7402447 MF:i:18 Aq:i:66 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS192_3:1:225:195:543 99 chr1 123 99 35M = 299 211 GGTTGCCAGCACAGGGGCTTAACCTCTGGTGACTG <<<<<<<<<<<<<<<<<<<<<<<9<<;::388998 MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 
H1:i:0 --B7_589:6:114:714:317 99 chr1 126 99 35M = 311 220 TGCCAGCACAGGGGCTTAACCTCTGGTGACTGCCA <<<<<<<<<<<<<<<<<<<<<<<<<<<;<<<5;<; MF:i:18 Aq:i:75 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS114_39:1:70:147:84 163 chr1 128 73 35M = 285 192 CCAGCACAGGGGCTTAACCTCTGGTGACTGCCAGA <<<<<<<<<<<<<<<<<<<<;<<<<<<<<<;(5<< MF:i:18 Aq:i:73 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS188_7:2:187:227:818 163 chr1 129 99 35M = 290 196 CAGCACAGGGGCTTAACCTCTGGTGACTGCCAGAG <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<3<;<< MF:i:18 Aq:i:76 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS1_97:4:77:29:126 99 chr1 131 99 35M = 315 219 GCACAGGGGCTTAACCTCTGGTGACTGCCAGAGCT <<<<<<<<<<3<<<<<<<;;;7<;<<449<-:977 MF:i:18 Aq:i:69 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS114_30:4:327:795:103 99 chr1 133 99 35M = 302 204 ACAGGGGCTTAACCTCTGGTGACTGCCAGAGCTGC <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<;<<; MF:i:18 Aq:i:75 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS114_30:3:139:117:262 69 chr1 135 0 * = 135 0 GTTCTCAAGGTTGTTGCAAGGGGGTCTATGTGAAC <<<7<<<<<<<<<<<<<<<<<<<;<;<<<<<37;3 MF:i:192 --EAS114_30:3:139:117:262 137 chr1 135 76 35M = 135 0 AGGGGCTTAACCTCTGGTGACTGCCAGAGCTGCTG <<<<;<<<<<<<<<<<<<:<<<<<:<<8<<<<:<: MF:i:64 Aq:i:0 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS219_FC30151:5:29:817:854 73 chr1 135 77 35M = 135 0 AGGGGCTTAACCTCTGGTGACTGCCAGAGCTGCTG <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:64 Aq:i:0 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS219_FC30151:5:29:817:854 133 chr1 135 0 * = 135 0 GTTCTCAAGGTTGTTGCAAGGGGGTTTATGTGAAC <<<<<<<<<<<<<<<1..;:;;;;1%407)07&7. MF:i:192 --EAS192_3:6:170:169:57 163 chr1 138 99 35M = 296 193 GGCTTGACCTCTGGTGACTGCCAGAGCTGCTGGCC <<<<<;<<<<<<;<<<<<<<<<<<<:<<<<<;;+% MF:i:18 Aq:i:45 NM:i:2 UQ:i:30 H0:i:0 H1:i:1 --B7_595:4:84:802:737 99 chr1 140 68 35M = 284 179 CTTAACCTCTGGTGACTGCCAGAGCTGCTGGCAAG <<<<<<<<<<;9<9<<<;<<;73;<<<<<37;1+. 
MF:i:18 Aq:i:68 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS188_4:7:78:583:670 163 chr1 142 99 35M = 316 209 TAACCTCTGGTGACTGCCAGAGCTGCTGGCAAGCT <<<<<<<<<<;;;<;;<<<:7;5;<5;;<2--8-; MF:i:18 Aq:i:66 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS51_64:3:90:435:691 99 chr1 147 99 35M = 318 206 TCTGGTGACTGCCAGAGCTGCTGGCAAGCTAGAGT <<<<<<<<<<;<<<;<<<<:<<<;<81;<<1;784 MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS188_7:3:13:122:187 163 chr1 153 99 35M = 343 225 GACTGCCAGAGCTGCTGGCAAGCTAGAGTCCCCTT <<<<<<<;<;<<<;<<<<:;6<<<<;;;;:<<%%< MF:i:18 Aq:i:69 NM:i:1 UQ:i:4 H0:i:1 H1:i:0 --EAS221_1:6:69:735:1915 99 chr1 154 99 35M = 321 202 ACTGCCAGAGCTGCTGGCAAGCTAGAGTCCCATTT <<<<<<<<;<<<<<<<<<;<<<<;<8<<<<;1:<< MF:i:18 Aq:i:74 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS114_45:5:66:959:1311 163 chr1 159 95 35M = 336 212 CAGAGCTGCTGGCAAGCTAGAGGCCCATCTGGAGC ;;4;;;+;;;-01;;&-;;4;;&;;73)(&**274 MF:i:18 Aq:i:31 NM:i:2 UQ:i:12 H0:i:0 H1:i:1 --EAS56_57:6:325:759:288 99 chr1 163 99 35M = 341 213 GCTGCTGGCAAGCTAGAGTCCCATTTGGAGCCCCT 8<;<<<<81<<<<<;<<;<<<;9<<<<1>><<<< MF:i:18 Aq:i:21 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS51_66:4:240:264:231 121 chr1 213 66 35M = 213 0 TGTAATGAAAACTATATTTATGCTATTCAGTTCTA 9;,;;62<9<)29<<<;96<<<;<<7<<<<<<;<< MF:i:64 Aq:i:0 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS51_66:4:240:264:231 181 chr1 213 0 * = 213 0 CAACAGATCAAGAAGGAGGGGCAATGGACGAGTTA %15+5022))0&<<)0)+7:4+&<0<<:0<<<7<< MF:i:192 --EAS1_93:7:14:426:613 99 chr1 214 99 35M = 379 200 GTAATGAAAACTATATTTATGCTATTCAGTTCTAA ======;=;==========;;==3=;==-=<;<;< MF:i:18 Aq:i:71 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS1_93:2:173:995:93 163 chr1 215 99 35M = 382 202 TAATGAAAACTATATTTATGCTATTCAGTTCTAAA <<<<<<<<<<<<<<<<<<<7:<<<<;:<:<<<<:7 MF:i:18 Aq:i:73 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS51_64:6:195:348:703 163 chr1 215 99 35M = 353 173 TAATGAAAACTATATTTATGCTATTCAGTTCTAAA <<<<<<<;<<<<<;:<<<<<<<<<<<<:<1:<:7< MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS1_108:2:62:879:264 163 chr1 216 99 35M = 396 215 AATGAAAACTATATTTATGCTATTCAGTTCTAAAT <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<2<< 
MF:i:18 Aq:i:74 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS54_61:4:83:452:970 99 chr1 216 99 35M = 379 198 AATGAAAACTATATTTATGCTATTCAGTTCTAAAT ==========================;======== MF:i:18 Aq:i:74 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS218_1:2:64:1318:1711 99 chr1 218 99 35M = 389 206 TGAAAACTATATTTATGCTATTCAGTTCTAAATAT <<<<<<<<<<<<<<<<7<<<<<<<:<<<<<2<<<< MF:i:18 Aq:i:77 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --B7_589:8:113:968:19 83 chr1 219 99 35M = 50 -204 GAAAACTATATTTATGCTATTCAGTTCTAAATATA 8;<;8;9<<<<<<<9<:<<<<<<<<<;<<<<<<<< MF:i:18 Aq:i:63 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS1_93:4:160:896:275 163 chr1 220 99 35M = 387 202 AAAACTATATTTATGCTATTCAGTTCTAAATATAG ============<====<==<====<==<==;=:6 MF:i:18 Aq:i:76 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --B7_591:6:181:191:418 163 chr1 221 99 36M = 387 202 AAACTATATTTATGCTATTCAGTTCTAAATATAGAA <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<988 MF:i:18 Aq:i:77 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS114_28:7:242:354:637 99 chr1 222 99 36M = 417 231 AACTATATTTATGCTATTCAGTTCTAAATATAGAAA <<<<<<<<<<<<<<<<<<<<<<<<<<;<<<<<6<;; MF:i:18 Aq:i:75 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --B7_589:1:122:77:789 163 chr1 223 99 35M = 396 208 ACTATATTTATGCTATTCAGTTCTAAATATAGAAA <<<:<4<<9<:7<<<:<<<7<<<<<<<<<<9<9<< MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --B7_591:5:42:540:501 147 chr1 224 99 36M = 60 -200 CTATATTTATGCTATTCAGTTCTAAATATAGAAATT ;<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:69 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --B7_591:6:155:12:674 83 chr1 224 99 36M = 52 -208 CTATATTTATGCTATTCAGTTCTAAATATAGAAATT ;<<<<<<<<<<;<<<<;<<<<8<<<<<<<<<<<<<< MF:i:18 Aq:i:74 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --B7_593:4:106:316:452 147 chr1 224 99 36M = 49 -211 CTATATTTATGCTATTCAGTTCTAAATATAGAAATT :<<<<<;<<<<:<<:<<<<<<7<<<<<<<<<<<<<< MF:i:18 Aq:i:71 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS139_19:5:89:525:113 163 chr1 227 78 40M = 397 210 TATTTATGCTATTCAGTTATAAATATAGAAATTGAAACAG <1<7<6;+0;7;7'<70;-<7<:<:<<5<<:9<5:7:%:7 MF:i:18 Aq:i:39 NM:i:1 UQ:i:12 H0:i:0 H1:i:1 --EAS54_65:3:321:311:983 147 chr1 228 99 35M = 51 -212 
ATTTATGCTATTCAGTTCTAAATATAGAAATTGAA ;;4;;<7<<<<<<77<<<<<<<<<<17<<<<<<<< MF:i:18 Aq:i:62 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS54_65:8:76:493:708 147 chr1 229 44 35M = 73 -191 TTTATGCTATTCAGTTCTAAATATAGAAATTGAAA 5/)63.&1517(544(055(0454&7706566679 MF:i:18 Aq:i:44 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS54_71:2:125:628:79 163 chr1 229 99 35M = 400 205 TTTATGCTATTCAGTTCTAAATATAGAAATTGAAA ==================<6<====<<:<==7;:: MF:i:18 Aq:i:69 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS114_32:5:78:583:499 83 chr1 229 74 35M = 37 -227 TTTACGCTATTCAGTACTAAATATAGAAATTGAAA &6&9774&<;67<44&-4<;<9<7<<<<<;<<<<< MF:i:18 Aq:i:37 NM:i:2 UQ:i:27 H0:i:0 H1:i:1 --EAS54_67:3:175:730:949 83 chr1 230 99 35M = 70 -195 TTATGCTATTCAGTTCTAAATATAGAAATTGAAAC <<<<;+<<<<7<;<<;<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:65 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS114_45:1:84:275:1572 163 chr1 230 99 35M = 394 199 TTATGCTATTCAGTTCTAAATATAGAAATTGAAAC /6;;;4;;;;;;;;7;;4;.4;;;;;6;;;77077 MF:i:18 Aq:i:62 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS1_108:4:248:753:731 99 chr1 231 99 35M = 402 206 TATGCTATTCAGTTCTAAATATAGAAATTGAAACA <<<<<<<<<<<8<<<<<<<<<<<<:<<<<&<:<.: MF:i:18 Aq:i:70 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS114_45:1:9:1289:215 99 chr1 231 99 35M = 394 198 TATGCTATTCAGTTCTAAATATAGAAATTGAAACA ;;;;;;9;;;67;;;;;99;9;;;;;;;;977747 MF:i:18 Aq:i:59 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --B7_595:7:188:802:71 163 chr1 232 99 35M = 415 218 ATGCTATTCAGTTCTAAATATAGAAATTGAAACAG <<<<<<<<<;<<<<<9<<<:<<<:<<<<<<:<<<; MF:i:18 Aq:i:75 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS1_93:7:252:171:323 83 chr1 234 99 35M = 43 -226 GCTATTCAGTTCTAAATATAGAAATTGAAACAGCT ;8<;<=3=6==:====;;======;========== MF:i:18 Aq:i:62 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS192_3:5:223:142:410 147 chr1 235 99 35M = 60 -210 CTATTCAGTTCTAAATATAGAAATTGAAACAGCTG 8;<<<;<<<<;<<<<<<;<;<<<<<<<<<<<<;<< MF:i:18 Aq:i:39 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS114_30:1:243:10:911 83 chr1 236 99 35M = 63 -208 TATTCAGTTCTAAATATAGAAATTGAAACAGCTGT ;<;;;<4;9:<<<;<<;<<<<<;;<<<<<<<<<<< MF:i:18 Aq:i:64 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 
--EAS114_45:6:5:730:1436 163 chr1 236 99 35M = 403 202 TATTCAGTTCTAAATATAGAAATTGAAACAGCTGT ;;;;;;;;;;;;;;;;;;8;;;;;8;;;;;67777 MF:i:18 Aq:i:67 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS139_19:2:57:1672:1890 121 chr1 236 75 40M = 236 0 TATTCAGTTCTAAATATAGAAATTGAAACAGCTGTGTTTA :;;;9<8;;*<<<<<<:<<<<<<<<1:<<<<<<<<<<<7< MF:i:64 Aq:i:0 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS139_19:2:57:1672:1890 181 chr1 236 0 * = 236 0 CCCCCCCCCCCCCCCCCAGCCACTGCGGCCCCCCCAGCCA -+)%)'-'+,,<066,))090+:&486083:5&&:<<5<0 MF:i:192 --EAS1_105:2:299:360:220 99 chr1 237 99 35M = 403 201 ATTCAGTTCTAAATATAGAAATTGAAACAGCTGTG <<<<<<<9<9<<<<<<<<<<<<<<<<<5<;<0<<< MF:i:18 Aq:i:43 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS221_1:2:24:1037:84 163 chr1 238 99 35M = 415 212 TTCAGTTCTAAATATAGAAATTGAAACAGCTGTGT <<<<<<<<<<<<<<<<<<<<<<;<<<<<<;:<57< MF:i:18 Aq:i:73 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS1_105:3:86:823:683 163 chr1 240 99 35M = 408 203 CAGTTCTAAATATAGAAATTGAAACAGCTGTGTTT <<<<<<<<;<<<<<<<<<<<<<<<<<<<<;;9<<< MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS56_53:4:130:568:978 99 chr1 246 88 35M = 434 223 TAAATATAGAAATTGAAACAGCTGTGTTTAGTGAC 7<<;<<;<7<:;<7<<<<<<<<);4;+<7+3+%;< MF:i:18 Aq:i:24 NM:i:1 UQ:i:26 H0:i:1 H1:i:0 --EAS114_45:4:73:1208:495 163 chr1 246 99 35M = 431 220 TAAATATAGAAATTGAAACAGCTGTGTTTAGTGCC ;;;;;;;;;;;;;;;;;;;;;;;;5;;;;;37377 MF:i:18 Aq:i:67 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS1_97:7:264:642:506 99 chr1 247 99 35M = 420 208 AAATATAGAAATTGAAACAGCTGTGTTTATTGTAT <<;<<<<<<;<<<;:;;:;;<<;<<<<;*+;*&.4 MF:i:18 Aq:i:56 NM:i:3 UQ:i:28 H0:i:1 H1:i:0 --EAS114_28:5:104:350:749 163 chr1 247 99 36M = 415 204 AAATATAGAAATTGAAACAGCTGTGTTTAGTGCCTT <<8<<<<<<<<<<<;<<<<<<<<0;<<<9;<85;;; MF:i:18 Aq:i:73 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS56_61:6:227:259:597 147 chr1 248 99 35M = 61 -222 AATATAGAAATTGAAACAGCTGTGTTTAGTGCCTT <8<;2;9;<;;-92<;;;<;<<<<<<<<<<<<<<< MF:i:18 Aq:i:61 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS112_32:7:113:809:364 99 chr1 250 99 35M = 413 198 TATAGAAATTGAAACAGCTGTGTTTAGTGCCTTTG <<<<<<<<<<<<<<<<<<<<<<<<<<7<;<;<<<4 MF:i:18 
Aq:i:77 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS188_7:2:218:877:489 83 chr1 250 86 35M = 80 -205 TATAGAAATTGAAACAGCTGTGTTTAGTGCCTTTG 9<<<8<<<;<9<<<<<<<<<<;<<<<<<<<<<<<< MF:i:18 Aq:i:10 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS1_97:7:20:979:96 83 chr1 254 99 35M = 79 -210 GAAATTGAAACAGCTGTGTTTAGTGCCTTTGTTCA '9996;(:;-<;1<<<<=<<<<=<<<<<<<<<<<< MF:i:18 Aq:i:37 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS188_7:2:259:219:114 99 chr1 254 99 35M = 411 192 GAAATTGAAACAGCTGTGTTTAGTGCCTTTGTTCA <<<<<<<<<<<<<<<;<<<<<<7<7<<<<<0<<9< MF:i:18 Aq:i:69 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS114_39:6:13:1034:1144 99 chr1 256 99 35M = 429 208 AATTGAAACAGCTGTGTTTAGTGCCTTTGTTCACA <<<<<<<<<<<<<<<<<<<<<<<<3<<<;<<;<++ MF:i:18 Aq:i:69 NM:i:2 UQ:i:48 H0:i:1 H1:i:0 --EAS221_1:2:29:1486:672 147 chr1 256 99 35M = 79 -212 AATTGAAACAGCTGTGTTTAGTGCCTTTGTTCACA <<:<<<<;<<<<<<<<<<<<<<<<<<<<<<<<<++ MF:i:18 Aq:i:29 NM:i:2 UQ:i:54 H0:i:0 H1:i:0 --EAS139_11:7:46:695:738 163 chr1 259 74 35M = 428 204 TGAAACAGCTGAGTTTAGCGCCTGTGTTCACATAG <;<<<<;<<),&4<3<<7&7<0;)).3;79;7<;0 MF:i:130 Aq:i:74 NM:i:3 UQ:i:18 H0:i:0 H1:i:0 --EAS139_11:8:26:1221:222 163 chr1 261 99 35M = 446 220 AAACAGCTGTGTTTAGTGCCTTTGTTCAACCCCCT <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:78 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS51_64:3:190:727:308 147 chr1 263 99 35M = 103 -195 ACAGCTGTGTTTAGTGCCTTTGTTCAACCCCCTTG ;;<;;<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:73 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS114_26:3:284:261:124 83 chr1 263 99 35M = 79 -219 ACAGCTGTGTTTAGTGCCTTTGTTCAACCCCCTTG ===27===.====&===========;;======== MF:i:18 Aq:i:39 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS112_34:7:141:80:875 147 chr1 265 99 35M = 110 -190 AGCTGTGTTTAGTGCCTTTGTTCAACCCCCTTGCA 6/<;84<;<;<<<<<<5<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:69 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS139_19:3:24:1135:563 163 chr1 266 99 40M = 446 220 GCTGTGTTTAGTGCCTTTGTTCAACCCCCTTGCAACAACC <<<<:<<<<:1:<<<<<<.<<<<<<<<;<;;;43+:30:: MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS221_1:2:90:986:1224 83 chr1 267 99 35M = 67 -235 
CTGTGTTTAGTGCCTTTGTTCAACCCCCTTGCAAC <7*37;;;;;;;9<<;<7<<<<<<<<<<<;;<<<< MF:i:18 Aq:i:41 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS114_28:7:287:492:169 99 chr1 269 99 36M = 449 216 GTGTTTAGTGCCTTTGTTCAACCCCCTTGCAACAAC <<<7<<<<<<<<<<<<<<<<<<<<<<<8;;<;6<<; MF:i:18 Aq:i:69 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS218_4:1:48:9:409 99 chr1 271 75 18M5I12M = 464 228 GTTTAGTGCCTTTGTTCACATAGACCCCCTTGCAA <<<<<<<<<<<<<:<<<<<<<<<<<<<<<<<<<<< MF:i:130 Aq:i:75 NM:i:0 UQ:i:0 H0:i:0 H1:i:0 --EAS139_19:1:87:1222:878 163 chr1 272 10 40M = 435 203 TATAGGGCCTTTGTTCAAACCCCTTGCAACAACCTTGAGA &+6<6&<:<<9<1112<<;)9227>>>>>>>>>>>>>;<>>>>><<>>>;<+<>=>>+==>>==<==<=8=><:;8/;7<<<<<<<<;<:<<<< MF:i:18 Aq:i:77 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --B7_591:2:240:603:890 83 chr1 740 99 36M = 590 -186 GCTCCCAAGAGGGAAAGCTTTCAACGCTTCTAGCCA ;+&+//&<<<<<<<<<<9<<<8<<<<9<<<<<<<<< MF:i:18 Aq:i:66 NM:i:1 UQ:i:5 H0:i:1 H1:i:0 --B7_591:7:129:956:115 163 chr1 740 99 36M = 927 223 GCTCCCCAGAGGGAAAGCTTTCAACGCTTCTAGCCA <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<;877- MF:i:18 Aq:i:75 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS56_53:4:168:528:288 83 chr1 740 99 35M = 570 -205 GCTCCCCAGAGGGAAAGCTTTCAACGCTTCTAGCC 8<%<31;<<;<;<<<<<<<;<<<<<<<<<<;<<<< MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS56_65:8:275:851:240 147 chr1 743 99 35M = 561 -217 CCCCAGAGGGAAAGCTTTCAACGTTTCTAGCCATT 66614/&3616630666&66666&66666868666 MF:i:18 Aq:i:31 NM:i:1 UQ:i:5 H0:i:0 H1:i:1 --EAS188_7:6:205:873:464 147 chr1 743 99 35M = 552 -226 CCCCAGAGGGAAAGCTTTCAACGCTTCTAGCCATT <-((+:+;289<--;<;-;<:;;<<<;;<<<<<<< MF:i:18 Aq:i:63 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS56_65:6:37:610:260 163 chr1 745 99 35M = 913 203 CCAGAGGGAAAGCTTTCAACGCTTCTAGCCATTTC <<<;<;<<7<<<<<<<<<<<<<<;6<963;;;3;1 MF:i:18 Aq:i:68 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS192_3:7:93:945:176 147 chr1 745 99 35M = 582 -198 CCAGAGGGAAAGCTTTCAACGCTTCTAGCCATTTC 6;;;8<<3<<8.<;6)<<<<<9<<<<<<<<<<<<< MF:i:18 Aq:i:70 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --B7_593:6:61:628:681 83 chr1 746 99 36M = 586 -196 CAGAGGGAAAGCTTTCAACGCTTCTAGCCATTTCTT 
95<<<<<<<<;<<<<;<<<:<<;;<<<<<<<<<<<< MF:i:18 Aq:i:76 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS56_65:7:288:552:440 83 chr1 747 87 35M = 560 -222 AGAGGGAACGCTTTCAACTCTTCTAGCCATTTCTT 9<<%'%<<.2<<<<<<<<5:<<<<<<<<<<<<<<< MF:i:18 Aq:i:26 NM:i:2 UQ:i:33 H0:i:0 H1:i:0 --EAS56_53:2:170:265:818 163 chr1 748 10 35M = 920 207 GAGGGGAAGCTTTCAACGCTTCTAGCACTTTCTTT <<<<<(5/959<8.<9<8<<<2<&59&&:22:8+( MF:i:18 Aq:i:10 NM:i:3 UQ:i:17 H0:i:0 H1:i:0 --B7_595:2:251:121:479 83 chr1 750 99 35M = 572 -213 GGGAAAGCTTTCAACGCTTCTAGCCATTTCTTTTG <<<<<6'..663;&<<;<<9<<<9<<<<<<<<<<< MF:i:18 Aq:i:65 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS221_1:8:67:1797:1931 147 chr1 750 99 35M = 562 -223 GGGAAAGCTTTCAACGCTTCTAGCCATTTCTTTTG <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:77 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS1_103:2:226:302:758 83 chr1 751 99 35M = 556 -230 GGAAAGCTTTCAACGCTTCTAGCCATTTCTTTTGG ;<<<<9;<<<<<<<<<<7<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:33 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS114_32:2:163:618:570 83 chr1 751 99 35M = 571 -215 GGAAAGCTGTCAACGCTTCTAGCCATTTCTTTTGG <9774<88&:8<:8<8:8<8<<<<<;88<88<<<< MF:i:18 Aq:i:41 NM:i:1 UQ:i:5 H0:i:0 H1:i:1 --EAS1_97:3:73:292:429 99 chr1 752 99 35M = 920 203 GAAAGCTTTCAACGCTTCTAGCCATTTCTTTTTGC <<<<<<<<<<7<<;<<<<<<<2<<<5<<<<<:%)< MF:i:18 Aq:i:69 NM:i:1 UQ:i:4 H0:i:1 H1:i:0 --EAS1_108:3:82:356:253 99 chr1 752 99 35M = 927 210 GAAAGCTTTCAACGCTTCTAGCCATTTCTTTTGGC ===================<========;===39= MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS114_30:6:62:386:959 147 chr1 752 99 35M = 594 -193 AAAAGCTTTCAACGCTTCTAGCCATTTCTTTTGGC %;71131((<<6<92(+<1<<;<-3<8<<;<;;<< MF:i:18 Aq:i:57 NM:i:1 UQ:i:4 H0:i:1 H1:i:0 --EAS51_62:3:263:74:407 83 chr1 754 99 35M = 574 -215 AAGCTTTCAACGCTTCTAGCCATTTCTTTTGGCAT ;;88<::+;<)<5<<:<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:68 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --B7_597:3:67:620:344 99 chr1 755 99 35M = 905 185 AGCTTTCAACGCTTCTAGCCATTTCTTTTGGCATT <<<<2<:2<<<<<<7<<<<:<<*<<<<<<***3<< MF:i:18 Aq:i:33 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --B7_610:6:148:776:486 83 chr1 755 99 35M = 
578 -212 AGCTTTCAACGCTTCTAGCCATTTCTTTTGGCATT ;:<<<;<<;<<<<<;<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:76 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS54_61:3:150:933:810 83 chr1 755 99 35M = 593 -197 AGCTTTCAACGCTTCTAGCCATTTCTTTTGGCATT :89===:=:=;;==;==================== MF:i:18 Aq:i:76 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS51_64:4:102:467:897 99 chr1 756 97 35M = 940 219 GCTTTCAACGCTTCTAGCCATTTCTTTTGTCTTTT <<<<9<<<<9<2<<<&,/=====>=>=>>>=>>==>=>>>>>> MF:i:18 Aq:i:77 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --B7_595:3:297:637:86 83 chr1 869 99 35M = 704 -200 TCTCAGCTAGGGGAACAGGGAGGTGCACTAATGCG <:75<;<;;<<<<<<;;<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:68 NM:i:1 UQ:i:26 H0:i:1 H1:i:0 --EAS54_65:3:290:558:349 147 chr1 869 99 35M = 719 -185 TCTCAGCTAGGGGAACAGGGAGGTGCACTAATGCG 2;2;;'5&;<<5<<;5/<<<<<7<<;+;<<+1<8< MF:i:18 Aq:i:59 NM:i:1 UQ:i:6 H0:i:1 H1:i:0 --EAS1_95:3:308:956:873 99 chr1 870 99 35M = 1068 233 CTCATCTAGGGGAACAGGGAGGTGCACTAATGCGC <<<<<<<<<<<<<;<;<;1<<<<<.<9<;<<<<+; MF:i:18 Aq:i:31 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS51_78:7:147:64:416 147 chr1 870 99 35M = 701 -204 CTCATCTAGGGGAACAGGGAGGTGCACTAATGCGC /;49;:6<<<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:47 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --B7_593:4:30:812:345 163 chr1 871 99 36M = 1036 201 TCATCTAGGGGAACAGGGAGGTGCACTAATGCGCTC <<<<<<<7<;<<7<;77;3<&0-;<5<;6<1'13<: MF:i:18 Aq:i:64 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS54_73:7:134:243:630 163 chr1 871 99 35M = 1052 216 TCATCTAGGGGAACAGGGAGGCGCACTAATGAGCT <<<:<<<<::1:818;;&::<>.; MF:i:18 Aq:i:35 NM:i:1 UQ:i:5 H0:i:0 H1:i:1 --EAS54_81:2:31:98:804 147 chr1 982 99 35M = 805 -212 CTTTACTGTCATAACTATGAAGAGACTATTGCCAG ====;========7===================== MF:i:18 Aq:i:74 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS1_103:2:235:805:373 163 chr1 983 99 35M = 1146 198 TTTACTGTCATAACTATGAAGAGACTATTTCCAGA <<<<<<<<<<<<<<<<<<<<;<;<<<<<<;;<99; MF:i:18 Aq:i:74 NM:i:1 UQ:i:26 H0:i:1 H1:i:0 --EAS114_28:5:11:868:62 99 chr1 983 99 36M = 1154 207 TTTACTGTCATAACTATGAAGAGACTATTGCCAGAT <<<<<<<<<<<<<<<<<<<<:<<<;<<<<(7:7039 MF:i:18 Aq:i:69 NM:i:0 UQ:i:0 H0:i:1 
H1:i:0 --EAS54_81:2:280:512:316 163 chr1 984 99 35M = 1159 210 TTACTGTCATAACTATGAAGAGACTATTGCCAGCT ==<========6==4==6;;==:===;=2/:+8%6 MF:i:18 Aq:i:68 NM:i:1 UQ:i:4 H0:i:1 H1:i:0 --EAS1_93:5:292:122:666 99 chr1 985 99 35M = 1159 209 TACTGTCATAACTATGAAGAGACTATTGTCAGATG <<<<<<6<<<<<<<<8;<<<<<<<<<<3&9+;;(; MF:i:18 Aq:i:68 NM:i:1 UQ:i:5 H0:i:1 H1:i:0 --EAS56_53:1:23:403:981 99 chr1 985 99 35M = 1151 201 TACTGTCATAACTATGAAGAGACTATTGCCAGATG <8<<<;<<<<<<;<<<<<<8;<<<9<9,3;,6(91 MF:i:18 Aq:i:65 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS114_45:7:33:1566:588 99 chr1 985 76 35M = 1166 216 TACTGTCATAACTATGAAGAGCCTATTGCCAGATG <;.;;;;6;;;;6;;29;;;<+9;;;.3;;73797 MF:i:18 Aq:i:37 NM:i:1 UQ:i:10 H0:i:0 H1:i:1 --EAS139_11:7:92:367:1495 83 chr1 987 99 35M = 820 -202 CTGTCATAACTATGAAGAGACTATTGCCAGATGAA <8<88<<<<7<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:78 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS220_1:8:38:1576:1923 83 chr1 987 99 35M = 822 -200 CTGTCATAACTATGAAGAGACTATTGCCAGATGAA 8;<98<<<<<<<;<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:77 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --B7_595:7:190:481:295 163 chr1 990 99 35M = 1161 206 TCATAACTATGAAGAGACTATTGCCAGATGAACCA <<<<<<<<<<<<<<<<<<<<<<9<<<<<9<7<2:: MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS112_32:7:168:117:441 99 chr1 990 99 35M = 1151 196 TCATAACTATGAAGAGACTATTGCCAGATGAACCA <<3<<<<<<<<<<<<<<<<<<<+<<17;<;:<995 MF:i:18 Aq:i:71 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS54_73:3:239:796:221 163 chr1 992 99 35M = 1160 203 ATAACTATGAAGAGACTATTGCCAGCTGACCCCCC <<<7<<7<<7<<7<;<<<<<,;;,+'<+/+99%:' MF:i:18 Aq:i:37 NM:i:4 UQ:i:26 H0:i:0 H1:i:1 --EAS220_1:4:69:88:1154 147 chr1 992 99 35M = 805 -222 ATAACTATGAAGAGACTATTGCCAGATGAACCACA <<<<9<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:47 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS221_3:8:34:956:1309 99 chr1 994 99 35M = 1168 209 AACTATGAAGAGACTATTGCCAGATGAACCACACA <<<<<<7<<<<<<<<<<<6<<<<<<<<<<<:<8<8 MF:i:18 Aq:i:74 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS1_108:5:229:717:121 99 chr1 995 99 35M = 1150 190 ACTATGAAGAGACTATTGCCAGATGAACCACACAC 
=================<)=<4<0=.<<<71;41& MF:i:18 Aq:i:43 NM:i:1 UQ:i:5 H0:i:1 H1:i:0 --EAS219_1:1:67:191:668 99 chr1 995 99 35M = 1134 174 ACTATGAAGAGACTATTGCCAGATGAACCACACCT <<<<<<<<<<<<<<<<<6<<;<;<;<<<<<<6;%2 MF:i:18 Aq:i:74 NM:i:1 UQ:i:4 H0:i:1 H1:i:0 --EAS51_64:3:309:303:278 163 chr1 996 99 35M = 1178 217 CTATGAAGAGACTATTGCCAGATGAACCACACATT <<<<<<<<<<<<<<<<+<<+<<7<<<<<5<<<;;; MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS221_1:8:60:1020:1259 99 chr1 996 99 35M = 1157 196 CTATGAAGAGACTATTGCCAGATGAACCACACATT <;<<<<;<<<<<<<<<;<<<<<<<8<<<<<:<:<< MF:i:18 Aq:i:77 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS218_4:7:89:1487:520 83 chr1 997 99 35M = 805 -227 TATGAAGAGACTATTGCCAGATGAACCACACATTA 4;;/<<<<<:<;<;<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:70 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --B7_610:4:15:805:420 163 chr1 998 35 35M = 1164 201 ATGAAGAGACTATTCACATGTGAACCACACATTTA ;73;;;;67.;1<<+*.;*&<4947<&474&*9*( MF:i:130 Aq:i:35 NM:i:4 UQ:i:33 H0:i:0 H1:i:0 --EAS56_57:3:119:761:239 147 chr1 999 99 35M = 813 -221 TGAAGAGACTATTGCCAGATGAACCACACATTAAT ;;<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:74 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS112_34:7:142:457:584 99 chr1 999 99 35M = 1160 196 TGAAGAGACTATTTCCAGATGAACCACACATTAAT <<<<<<<<<<<<<<< MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS56_63:7:190:95:706 147 chr1 1078 99 35M = 920 -193 TTGTGTCCATGTACACACGCTGTCCTATGTACTTA 9;97437;<;;<<;<;<<<<<<;<<<<<<<<<<<< MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --B7_589:1:101:825:28 83 chr1 1079 99 35M = 879 -235 TGTGTCCATGTACACACGCTGTCCTATGTACTTAT 0;0'0;<<<<<<8<;<<<<;;3<<;;<<<8<<<<< MF:i:18 Aq:i:39 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS51_66:4:188:460:1000 99 chr1 1080 99 35M = 1251 206 GTGTCCATGTACACACGCTGTCCTATGTACTTATC <<<<<<<<<<<<<<<<7<<;:4;44<;;:8;;9;; MF:i:18 Aq:i:70 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS1_95:3:268:523:511 99 chr1 1081 99 35M = 1241 195 TGTCCATGTACACACGCTGTCCTATGTACTTATCA <<<<<<<<<<<<<<<<<<<<;<<<<6<:9<<3<44 MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS114_28:6:54:263:585 99 chr1 1081 99 36M = 1254 
209 TGTCCATGTACACACGCTGTCCTATGTACTTATCAT <<<<<<<<<<<<<<<<<<<:;<<;<:;::<<;;:;4 MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS51_66:7:174:987:334 83 chr1 1082 99 35M = 908 -209 GTCCATGTACACACGCTGTCCTATGTACTTATCAT ,;<;;<<<&<<<1<<<<<<<<<<<<<;<<<<<<<< MF:i:18 Aq:i:41 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS54_71:6:224:932:942 99 chr1 1082 99 34M = 1250 203 GTCCATGTACACACGCTGTCCTATGTACTTATCA <<<<<<<<<<<<<<<<<<<<<<;<<<<7<<(;3, MF:i:18 Aq:i:69 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS114_45:1:12:1296:358 99 chr1 1082 96 35M = 1252 205 GTCCATGTACACACGCTGTCCTATGTACTTATCAT ;;;6;7;7;;;;;7;9;;-*1;9;699/99/7477 MF:i:18 Aq:i:37 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS114_32:2:306:119:56 147 chr1 1083 99 35M = 919 -199 TCCATGTACACACGCTGTCCTATGTACTTATCATG ;;;;;<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:74 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS1_95:4:66:179:118 163 chr1 1084 99 35M = 1262 213 CCATGTACACACGCTGTCCTATGTACTTATCATGA <<<<<<<<<<<<<<<<<<<<<<<<<<:<<;<<6<< MF:i:18 Aq:i:75 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS1_105:2:110:584:649 99 chr1 1084 99 35M = 1266 217 CCATGTACACACGCTGTCCTATGTACTTATCATGA <<<<<<<<<<<<<<<<<<<<<<<<;<<<<<::<38 MF:i:18 Aq:i:43 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS218_1:4:28:315:310 163 chr1 1085 99 35M = 1242 192 CATGTACACACGCTGTCCTATGTACTTATCATGAC <<<<<<<<<<<<<<<<<<<<<:<+.<<.<+7<*17 MF:i:18 Aq:i:68 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --B7_595:7:242:4:593 147 chr1 1086 99 35M = 905 -216 ATATACACACGCTGTCCTATGTACTTATCATGACT 1.%55877+8+88808887+7;7;18:8;;;.&;8 MF:i:18 Aq:i:53 NM:i:1 UQ:i:4 H0:i:1 H1:i:0 --EAS1_93:1:131:946:353 163 chr1 1087 99 35M = 1249 197 TGTACACACGCTGTCCTATGTACTTATCATGACTC <<<<<<<<<<<<<;<<<<;;<<<<<<<;<:52;<2 MF:i:18 Aq:i:73 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS221_1:4:4:1732:88 99 chr1 1087 99 35M = 1265 213 TGTACACACGCTGTCCTATGTACTTATCATGACTC <<<<<<<<<<<<<<<<<2<8;8<;<8;<2;2:<:< MF:i:18 Aq:i:45 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --B7_595:4:58:703:72 83 chr1 1088 99 35M = 905 -218 GTACACACGCTGTCCTATGTACTTATCATGACTCT 5&<<7;+95;7'6<<<<<.<<<<<;<<9<7<<<<< MF:i:18 Aq:i:65 NM:i:0 UQ:i:0 H0:i:1 
H1:i:0 --EAS56_59:5:113:694:725 163 chr1 1088 99 35M = 1266 213 GTACACACGCTGTCCTATGTACTTATCATGACTCT <<<<<<<<<<<<9<<<<<:<<<<<<<<<<:;;<;; MF:i:18 Aq:i:47 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS56_65:5:278:848:765 147 chr1 1088 99 35M = 920 -203 GTACACACGCTGTCCTATGTACTTATCATGACTCT 7;;<;5<55<<;;<;<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:74 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS56_57:6:234:787:12 163 chr1 1092 97 35M = 1257 200 ACACGCTGGCCTATGTACTTATAATGACTCTATCC <;<<<9<<&+9;3;<993;<9<+94;9&41;08%9 MF:i:18 Aq:i:24 NM:i:2 UQ:i:15 H0:i:0 H1:i:0 --EAS218_1:4:15:856:340 147 chr1 1093 99 35M = 936 -192 CACGCTGTCCTATGTACTTATCATGACTCTATCCC <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:47 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS51_62:2:258:266:101 163 chr1 1094 99 35M = 1285 226 ACGCTGTCCTATGTACTTATCATGACTCTATCCCA <<<<<<<<<<<<<<<<<<5<;,<-2<<<<;68<<6 MF:i:18 Aq:i:68 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS56_59:2:177:552:234 147 chr1 1094 99 35M = 903 -226 ACGCTGTCCTATGTACTTATCATGACTCTATCCCA ::;:=;=99=====;;====;==========<=== MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS114_30:1:134:379:893 147 chr1 1095 99 35M = 927 -203 CGCTGTCCTATGTACTTATCATGACTCTATCCCAA 7137::;<<<<<<<;<<<<<<<<<<;<<<<<<<<< MF:i:18 Aq:i:71 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS1_105:8:256:404:584 147 chr1 1096 99 35M = 928 -203 ACTGTCCTATGTACTTATCATGACTCTATCCCAAA &&326+23<3<<<+:<<<<<< MF:i:18 Aq:i:76 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --B7_595:3:57:735:151 99 chr1 1121 94 35M = 1314 228 CTATCCCAAATTCCCAATTACGTCCTATCTTCTTC <<<<<<<<8<<8<:<<*<:<<<4<<<;,<<<<:<: MF:i:18 Aq:i:26 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS54_81:8:142:858:903 147 chr1 1121 99 35M = 943 -213 CTATCCCAAATTCCCAATTACGTCCTATCTTCTTC <<<<<;<<<<9<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:76 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS56_57:7:247:522:670 83 chr1 1121 99 35M = 960 -196 CTATCCCAAATTCCCAATTACGTCCTATCTTCTTC ;;;9;:<<<<<;<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:75 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS139_19:3:75:732:442 99 chr1 1121 99 40M = 1293 212 CTATCCCAAATTCCCAATTACGTCCTATCTTCTTCTTAGG 
<<<<<;<<<<<9<<<;<<;<<<5<<;8<<<<<<<<;:9%% MF:i:18 Aq:i:60 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS1_99:7:183:645:699 99 chr1 1122 86 35M = 1281 194 TATCCCAAATTCCCAATTACGTCCTATCTTCTTCT <<9<9<<<<<<<<<;<<;<<*175;173<;;;<-/ MF:i:18 Aq:i:21 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS192_3:6:175:437:950 163 chr1 1126 99 35M = 1298 207 CCAAATTCCCAATTACGTCCTATCTTCTTCTTAGG <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<:59 MF:i:18 Aq:i:74 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS56_63:3:93:1002:845 83 chr1 1129 99 35M = 954 -210 AATTCCCAATTACGTCCTATCTTCTTCTTAGGGAA <<::;;;<<<<<<<<<<<<<<<<;<<<<<<<<<<< MF:i:18 Aq:i:76 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS51_62:6:50:542:881 163 chr1 1132 99 35M = 1324 227 TCCCAATTACGTCCTATCTTCTTCTTAGGTAAGAA <<<<<4<09<<9<<2<<<<<<<<<<<2/.&2<%<7 MF:i:18 Aq:i:63 NM:i:1 UQ:i:5 H0:i:1 H1:i:0 --EAS1_99:3:118:851:285 83 chr1 1133 99 35M = 953 -215 CCCAATTACGTCCTATCTTCTTCTTAGGGAAGAAC 3+7<<<;<;<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS219_1:1:67:191:668 147 chr1 1134 99 35M = 995 -174 CCAATTACGTCCTATCTTCTTCTTAGGGAAGAACA <<<<<7<<7<<<<<<<;<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:74 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --B7_595:7:166:203:416 83 chr1 1136 99 35M = 963 -208 AATTACGTCCTATCTTCTTCTTAGGGAAGAACAGC <<<<<<<<::<<<<<<<<<;<<<<<<<<<<<<<<< MF:i:18 Aq:i:75 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS114_45:2:15:1497:1530 99 chr1 1136 99 35M = 1314 213 AATTACGTCCTATCTTCTTCTTAGGGAAGAACAGC 0<;;;9;;86<;;;<<&<<.<<;)3;7;654-471 MF:i:18 Aq:i:57 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS56_65:8:206:563:262 83 chr1 1137 99 35M = 971 -201 ATTACGTCCTATCTTCTTCTTAGGGAAGAACAGCT <<<<7<<<<<<<<<<<<<<<<<<<<<<<<<<<<<7 MF:i:18 Aq:i:75 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS114_26:4:40:352:151 99 chr1 1137 99 35M = 1327 225 ATTACGTCCTATCTTCTTCTTAGGGAAGAACAGCT <<<<<<<<<<<<<<<;<<9<<<<:<<<<;<99<3< MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --B7_593:7:67:302:762 99 chr1 1138 99 36M = 1313 211 TTACGTCCTATCTTCTTCTTAGGGAAGAACAGCTTA <<<<<<<<<<<<<<<<<<<<;;65;<-<;<:8<<<3 MF:i:18 Aq:i:73 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS1_97:5:84:927:843 
147 chr1 1138 99 35M = 938 -235 TTACGTCCTATCTTCTTCTTAGGGAAGAACAGCTT 588;<:<<<<<<<6<<<<;<<<:/<<3<:;<*<<< MF:i:18 Aq:i:69 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS1_99:5:147:479:41 163 chr1 1139 99 35M = 1322 218 TACGTCCTATCTTCTTCTTAGGGAAGAACAGCTTA <<<<<<<<<<<<<<<<<<<<::6<<;<<<;;9;;6 MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS1_105:3:329:177:267 83 chr1 1139 99 35M = 962 -212 TACGTCCTATCTTCTTCTTAGGGAAGAACAGCTTA <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:75 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --B7_589:7:72:916:763 163 chr1 1142 99 35M = 1340 233 GTCCTATCTTCTTCTTAGGGAAGAACAGCTTAGGT ==7>==9>=7=>=>>=>> MF:i:18 Aq:i:78 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS54_65:4:91:267:655 147 chr1 1365 99 35M = 1204 -196 TGTTGTTGGTTTTCTGTTTCTTTGTTTGATTTGGT ;,:;5:<6:<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:71 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS221_1:2:91:856:504 99 chr1 1366 99 35M = 1520 189 GTTGTTGGTTTTCTGTTTCTTTGTTTGATTTGGTT <<<<<<<<<<<<<<7<<<<<<<7<<<&;<<<&&<& MF:i:18 Aq:i:68 NM:i:1 UQ:i:5 H0:i:1 H1:i:0 --EAS1_108:2:170:326:433 99 chr1 1367 99 35M = 1535 203 TTGTTGGTTTTCTGTTTCTTTGTTTGATTTGGTGG =====<=9===:=<:==2=======2:===9==/5 MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS1_93:6:132:717:233 99 chr1 1368 99 35M = 1529 196 TGTTGGTTTTCTGTTTCTTTGTTTGATTTGGTGGA <<<<<<<<<<<<;<<<<<<<<<<<7<<<<&-<4<1 MF:i:18 Aq:i:47 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS139_19:1:14:420:712 99 chr1 1368 99 40M = 1525 197 TGTTGGTTTTCTGTTTCTTTGTTTGATTTTTTTGAAGACA <<<<<<<<<<<<;<<<<<<<;<<<-;<<<&,<&*8111:6 MF:i:18 Aq:i:66 NM:i:3 UQ:i:21 H0:i:1 H1:i:0 --EAS114_39:4:43:1047:1626 163 chr1 1369 99 35M = 1523 189 GTTGGTTTTCTGTTTCTTTGTTTGATTTGGTGGAA <<<<<<<<<<<<<<<<<<<:<<<:<<<<:+;-4:( MF:i:18 Aq:i:70 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS114_45:2:20:413:1334 147 chr1 1370 99 35M = 1215 -190 TTGGTTTTCTGTTTCTTTGTTTGATTTGGTGGAAG 88878777;:;:1:;9;;;6;;;6;9;;;;;296; MF:i:18 Aq:i:60 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS51_62:5:154:669:853 83 chr1 1371 99 35M = 1193 -213 TGGTTTTCTGTTTCTTTGTTTGATTTGGTGGAAGA <::<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< 
MF:i:18 Aq:i:77 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --B7_610:7:117:857:942 99 chr1 1372 99 35M = 1527 190 GGTTTTCTGTTTCTTTGTTTGATTTGGTGGAAGAC <<<<<<<<<<<<<<<<<<<<<<<<<:6<;;7;9<; MF:i:18 Aq:i:73 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS56_57:6:145:144:796 147 chr1 1372 99 35M = 1181 -226 GGTTTTCTGTTTCTTTGTTTGATTTGGTGGAAGAC ;<<<;<<<<<<<<<;<<<;<<<<<<<<<<<<<<<< MF:i:18 Aq:i:74 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS114_32:6:88:162:587 147 chr1 1372 99 35M = 1189 -218 GGTTTTCTGTTTCTTTGTTTGATTTGGTGGAAGAC 386;;388-<8;<;68<<;;<;<6<<<8<<<<<<< MF:i:18 Aq:i:63 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS221_1:8:73:108:1621 99 chr1 1373 99 35M = 1532 194 GTTTTCTGTTTCTTTGTTTGATTTGGTGGAAGACA <<<<<<<<71<<<<<<<<<+<<<<70:0<9<<61< MF:i:18 Aq:i:71 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS112_34:6:127:153:861 147 chr1 1374 99 35M = 1202 -207 TTTTCTGTTTCTTTGTTTGATTTGGTGGAAGACAT :;:6;9<<1;<<95<<<9<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS188_7:2:152:765:744 163 chr1 1374 99 35M = 1534 195 TTTTCTGTTTCTTTGTTTGATTTGGTGGAAGACAT <<<<<<<<<<<<<<<<<<:<<<<<<<<&<7293<< MF:i:18 Aq:i:74 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS54_73:3:313:827:992 147 chr1 1379 99 35M = 1197 -217 TGTTTCTTTGTTTGATTTGGTGGAAGACATAATCC '187:1'<75<.*<<:5<..<<*<<917<<7<<17 MF:i:18 Aq:i:57 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS51_64:3:7:268:263 121 chr1 1381 22 35M = 1381 0 TTGCGTTATTTGAGTTGGTGGAAGACATAATCCCA ',)*&2<$7+<<<'<-<7<<<<<<<7<<7><>;>+>>/;>>=>=>=:>><>=<<==;)<=8; MF:i:18 Aq:i:73 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS218_4:7:85:923:726 147 chr2 199 99 35M = 43 -191 GTAAAGTAACTGAACCTATGAGTCACAGGTATTCC <:<<<%3<<1<<86<<-<<<<<<<<<<<<6<<1<< MF:i:18 Aq:i:44 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS1_103:5:285:241:560 83 chr2 200 99 35M = 37 -198 TAAAGTAACTGAACCTATGAGTCACAGGTATTCCT :<<<<;<<,<<<<5<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS114_30:6:41:461:436 163 chr2 200 74 35M = 389 224 TAAAGTAACTGAACCTATGAGTCACAGGTATTCCT <<<<<<<<<<<<<<<<<<<:<<<<<<<<;<;;;:; MF:i:18 Aq:i:0 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS54_61:6:25:949:33 99 chr2 201 99 
35M = 383 217 AAAGTAACTGAACCTATGAGTCACAGGTATTCCTG =;===/8========*==&;6=&=&:=6&:=::67 MF:i:18 Aq:i:63 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS221_3:2:60:590:1760 99 chr2 201 99 35M = 376 210 AAAGTAACTGAACCTATGAGTCACAGGTATTCCTG <:<<<<<2<<<<:<::<<<::<<<<<6<<<<<<<6 MF:i:18 Aq:i:51 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS114_45:6:86:693:234 163 chr2 202 82 35M = 388 221 AAGTAACTGAACCTATGAGTCACAGGTATTCCTGA ;;;;;;;;;;;;;;;;9;;;;;;;;99;;&70777 MF:i:18 Aq:i:18 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS56_57:6:4:223:776 163 chr2 203 93 35M = 387 219 AGTAACTGAACCTATGAGTCACAGGTATTCCTGAG <<<<<<<<<<<<<<<<<<<<<<<<<9<<<:;<;2< MF:i:18 Aq:i:19 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS114_39:3:88:84:1558 99 chr2 203 95 35M = 394 226 AGTAACTGAACCTATGAGTCACAGGTATTCCTGTG <<;<<<<<<<<<<<<;;<<<<<<<::<<<<<<7&< MF:i:18 Aq:i:22 NM:i:1 UQ:i:5 H0:i:1 H1:i:0 --B7_597:2:168:829:88 163 chr2 205 99 35M = 369 199 TAACTGAACCTATGAGTCACAGGTATTCCTGAGGA <<<<<<<<<<<<<<<<<<<<<<<6<<<<<<9;4;2 MF:i:18 Aq:i:68 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS114_28:1:168:389:889 147 chr2 205 99 36M = 37 -204 TAACTGAACCTATGAGTCACAGGTATTCCTGAGGAA ;<<;;56;==================8========8 MF:i:18 Aq:i:76 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS54_71:5:81:685:141 99 chr2 207 85 34M = 382 210 ACTGAACCTATGAGTCACAGGTATTCCTGAGGAA <<<<<<<<<<<<<<<<<<<<<;;<<;<<<',7,7 MF:i:18 Aq:i:17 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS139_19:4:26:1312:1400 99 chr2 207 99 40M = 385 218 ACTGAACCTATGAGTCACAGGTATTCCTGAGGAAAAAGAA <<<<;<<<:<<:<;<:<<<;:;<<<<<<:<8<1;;:::88 MF:i:18 Aq:i:30 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS54_71:4:127:725:381 83 chr2 209 99 34M = 39 -204 TGAACCTATGAGTCACAGGTATTCCTGAGGAAAA +<<.<<;<;<<<3;<;<<<<<<6<8;<<<<<<<1 MF:i:18 Aq:i:54 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS188_7:2:19:736:559 99 chr2 209 99 35M = 370 196 TGAACCTATGAGTCACAGGTATTCCTGAGGAAAAA <<<<<<<<<<<<<<<<<<<<<<<<<<<<;<<<<<< MF:i:18 Aq:i:70 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS114_26:3:117:284:589 83 chr2 210 99 35M = 43 -202 GAACCTATGAGTCACAGGTATTCCTGAGGAAAAAG ==8==;==================;========== MF:i:18 Aq:i:56 NM:i:0 UQ:i:0 
H0:i:1 H1:i:0 --B7_610:5:120:596:847 163 chr2 211 83 35M = 410 234 AACCTATGAGTCACAGGTATTCCTGAGGAAAAAGA <<<<<<<<<<<<<;<<<9<<<<<<<;:<62;58;2 MF:i:18 Aq:i:30 NM:i:0 UQ:i:0 H0:i:1 H1:i:1 --B7_610:5:51:904:391 163 chr2 212 97 35M = 401 224 ACCTATGAGTCACAGGTATTCCTGAGGAAAAAGAA <<<<<<<<<<<<<<<<<;<<<<<;:;<2<6;;;;; MF:i:18 Aq:i:24 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS139_11:8:96:1314:1448 163 chr2 213 93 35M = 388 210 CCTATGAGTCACAGGTATTCCTGAGGAAAAATAAA <<<<<<<<<<<<<<<<<<<<<<<<5<4<<<<-<<< MF:i:18 Aq:i:18 NM:i:1 UQ:i:12 H0:i:1 H1:i:0 --EAS139_19:3:73:1158:535 163 chr2 213 99 40M = 377 204 CCTATGAGTCACAGGTATTCCTGAGGAAAAAGAAAAAGTG <<<<<<<<<<<<<<<<<<<<<<8<;;<<<<<9<<9::8:8 MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --B7_591:2:223:583:968 147 chr2 215 88 36M = 47 -204 TATGAGGCACAGGTATTCCTGAGGAAAAAGAAAAAG 1<';<<&%-:<<<<<:66%<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:30 NM:i:1 UQ:i:5 H0:i:1 H1:i:0 --EAS1_97:3:160:173:889 163 chr2 215 99 35M = 379 199 TATGAGTCACAGGTATTCCTGAGGAAAAAGAAAAA <<<<<<<<<<<<<<<<<<<<;0<7<<;<<<;7<09 MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS114_39:1:28:350:895 83 chr2 215 95 35M = 48 -202 TATGAGTCACAGGTATTCCTGAGGAAAAAGAAAAA :<;<<<:;<-<<<<<4;77<<<<<<<<<<<<<<<< MF:i:18 Aq:i:30 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS56_53:4:45:707:147 163 chr2 216 99 35M = 424 243 ATGAGTCACAGGTATTCCTGAGGAAAAAGAAAAAG <<<<<<<<<<<<&<<<<:<<9<<<9<<<<75;;;< MF:i:18 Aq:i:51 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS220_1:8:18:1757:95 99 chr2 216 45 35M = 374 193 ATGAGTCGCAGGTATTCCTGAGGAAAAAGAAAAAG <<<<<<<<<<<<<<<<<<<1<:<<<<<<:<<<;:< MF:i:18 Aq:i:0 NM:i:1 UQ:i:27 H0:i:0 H1:i:1 --EAS51_66:6:310:747:415 163 chr2 217 99 35M = 387 205 TGAGTCACAGGTATTCCTGAGGAAAAAGAAAAAGT <<<<<<<<<<<<<<<<<<:<<<<<;<<<<<;<;<; MF:i:18 Aq:i:24 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS114_28:2:114:938:216 147 chr2 218 99 36M = 63 -191 GAGTCACAGGTATTCCTGAGGAAAAAGAAAAAGTGA <<<<7<6<<<<<<<6<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:30 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS1_93:1:179:629:513 163 chr2 220 99 35M = 409 224 GTCACAGGTATTCCTGAGGAAAAAGAAAAAGTGAG 
<<<<<<<<<<<<<<<;<;<<<<<<<<<<<<<;<<< MF:i:18 Aq:i:49 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS114_45:4:88:55:1187 99 chr2 220 66 35M = 391 206 GTCACAGGTATTCCTGAGGAAAAAGAAAAAGTGAG ;;<;;;<<99<<;;<;;;;;:;49;:;;;;87898 MF:i:18 Aq:i:0 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS51_62:5:119:38:945 99 chr2 221 99 35M = 428 242 TCACAGGTATTCCTGAGGAAAAAGAAAAAGTGAGA <<<<<<<<<<<<<<<8<<<<<8<<<8<;<<7<:<< MF:i:18 Aq:i:74 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS56_65:6:67:800:450 147 chr2 221 99 35M = 41 -215 TCACAGGTATTCCTGAGGAAAAAGAAAAAGTGAGA 9-<9<;<<<<9;5<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:49 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --B7_610:5:102:915:87 147 chr2 222 99 35M = 65 -192 CACAGGTATTCCTGAGGAAAAAGAAAAAGTGAGAA ;<8<;;<<<<7;<<<;<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:30 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS114_26:1:113:367:659 163 chr2 222 72 35M = 390 203 CACAGGTATTCCTGAGGAAAAAGAAAAAGCGAGAA =9====8==========:=:=====9=:=&====5 MF:i:18 Aq:i:0 NM:i:1 UQ:i:5 H0:i:1 H1:i:0 --EAS218_1:2:26:211:481 147 chr2 222 99 35M = 43 -214 CACAGGTATTCCTGAGGAAAAAGAAAAAGTGAGAA :<:<<<<<<9:5<<<<<<<<<<<<<<:<:<<<<<< MF:i:18 Aq:i:54 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS219_FC30151:3:90:1906:1528 83 chr2 222 99 35M = 41 -216 CACAGGTATTCCTGAGGAAAAAGAAAAAGTGAGAA :<<<<<<<<<3:<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:49 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --B7_591:2:13:100:876 163 chr2 223 73 36M = 397 210 ACAGGGATTCCTGAGGAAAAAGAAAAAGTGAGAAGT <8<<<*<2<7<<<6<<<<<<6<<8<<<<5<<<<4<9 MF:i:18 Aq:i:30 NM:i:1 UQ:i:9 H0:i:0 H1:i:1 --EAS56_63:5:117:570:971 163 chr2 223 99 35M = 413 225 ACAGGTATTCCTGAGGAAAAAGAAAAAGTGAGAAG <<<<<<<<<<<<<;;;<<<<6<7;9;<:;<;<;;< MF:i:18 Aq:i:30 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS221_3:8:50:1203:1094 83 chr2 223 99 35M = 46 -212 ACAGGTATTCCTGAGGAAAAAGAAAAAGTGAGAAG <7<<<<<5:+63<<<<<<<<<<<<<<<<2<<<<<< MF:i:18 Aq:i:30 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS54_67:6:107:395:312 83 chr2 224 99 35M = 44 -215 CAGGTATTCCTGAGGAAAAAGAAAAAGTGAGAAGT ;<;;<<<<;<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:27 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS54_73:3:29:833:612 83 chr2 224 99 35M 
= 58 -201 CAGGTATTCCTGAGGAAAAAGAAAAAGTGAGAAGT <<;<<<;<::<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:27 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --B7_610:7:158:943:467 83 chr2 225 99 35M = 57 -203 AGGTATTCCTGAGGAAAAAGAAAAAGTGAGAAGTT <:<<;;<:5<<<<<<<<<<<<<<<<<<;<<<;<<< MF:i:18 Aq:i:46 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS56_59:2:201:768:529 163 chr2 225 99 35M = 396 206 AGGTATTCCTGAGGAAAAAGAAAAAGTGAGAAGTT ==========================1=======; MF:i:18 Aq:i:54 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS188_7:6:11:994:584 99 chr2 226 97 35M = 417 226 GGTATTCCTGAGGAAAAAGAAAAAGTGAGAAGTTT <<<<<<<<<<<7<<<<<<<<<<<<<6<<<<<<3<6 MF:i:18 Aq:i:24 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS56_57:2:206:873:186 83 chr2 227 99 35M = 66 -196 GTATTCCTGAGGAAAAAGAAAAAGTGAGAAGTTTG ;<<;--7<<<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:30 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS56_63:4:38:28:122 83 chr2 227 99 35M = 46 -216 GTATTCCTGAGGAAAAAGAAAAAGTGAGAAGTTTG ;9;9;-1<<<<<<<<<<<<<<<<<;<<<<<<<<<< MF:i:18 Aq:i:30 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS1_93:5:66:372:343 83 chr2 228 99 35M = 40 -223 TATTCCTGAGGAAAAAGAAAAAGTGAGAAGTTTGG ;<1;89<<<<<;<9<<<<9<<<;8<9<;<<<<<;8 MF:i:18 Aq:i:43 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS1_97:3:277:144:848 83 chr2 228 99 35M = 64 -199 TATTCCTGAGGAAAAAGAAAAAGTGAGAAGTTTGG <<<)63<<<<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:30 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS139_19:6:21:1601:1666 83 chr2 228 99 40M = 56 -212 TATTACTGAGGAAAAAGAAAAAGTGAGAAGTTTGGAAAAA -;;3&1<<<<<<<<<<<<1<<<<<<<<<<<7<<<<<<<<=<<<<<<<<<<<<<< MF:i:32 Aq:i:19 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS56_57:2:23:268:529 153 chr2 329 71 35M * 0 0 TGAAAGAGGCTCAAAGAATGCCAGGAAGATACATT 7;<<<<<<57;-<<<<<<:<77<<<<<<<;<;<<< MF:i:32 Aq:i:28 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS114_26:2:315:219:7 153 chr2 330 69 35M * 0 0 GAAAGAGGCTCAAAGAATGCCAGGAAGATACATTG 7==::<2=8<<<=====>888<=2=>==>,>,>>8 MF:i:32 Aq:i:19 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS192_3:4:63:5:870 83 chr2 330 75 35M = 148 -217 GAAAGAGGCTCAAAGAATGCCAGGAAGATACATTG :<;<;<<<4:;<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:75 NM:i:0 UQ:i:0 H0:i:1 
H1:i:0 --B7_591:5:243:557:560 163 chr2 331 75 36M = 499 204 AAAGAGGCTCAAAGAATGCCAGGAAGATACATTGCA <<<<<<<9<<<<<<<<<<<<<<<<<<;<<89<<9<; MF:i:18 Aq:i:0 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --B7_593:2:270:430:269 163 chr2 331 99 36M = 519 224 AAAGAGGCTCAAAGAATGCCAGGAAGATACATTGCA <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<;;;7;: MF:i:18 Aq:i:76 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS51_66:6:284:442:747 89 chr2 331 75 35M * 0 0 AAAGAGGCTCAAAGAATGCCAGGAAGATACATTGC <;<<<<<:<;<<<<<<<<;<<<<<<<<<<<<<<<< MF:i:32 Aq:i:29 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS218_4:7:71:31:1973 89 chr2 331 76 35M * 0 0 AAAGAGGCTCAAAGAATGCCAGGAAGATACATTGC <<<<<7<:<<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:32 Aq:i:29 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS1_93:2:30:466:652 147 chr2 332 98 35M = 163 -204 AAGAGGCTAAAAGAATGCCAGGAAGATACATTGCA <<<<<;3;&<<<<<<<============= MF:i:18 Aq:i:22 NM:i:0 UQ:i:0 H0:i:4 H1:i:13 --EAS114_39:3:88:84:1558 147 chr2 394 95 35M = 203 -226 ATCAGACTATCTAAAGTCAACATGAAGGAAAAAAA ;;<<;<<;<<5<<<<<<;<<:<<<;<<<<<<;<<< MF:i:18 Aq:i:22 NM:i:0 UQ:i:0 H0:i:2 H1:i:3 --EAS56_59:2:201:768:529 83 chr2 396 99 35M = 225 -206 CAGACTATCTAAAGTCAACATGAAGGAAAAAAATT 3<:<9<<;<<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:54 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --B7_591:2:13:100:876 83 chr2 397 73 36M = 223 -210 AGAATATATAAAGTCAACATGAAGGAAAAAAATTCT ;9<$<<<$<<<<<<<<75<<<<<<<9<9<<<<<<<< MF:i:18 Aq:i:30 NM:i:2 UQ:i:6 H0:i:1 H1:i:1 --EAS139_11:4:26:137:1382 99 chr2 397 99 35M = 579 217 AGACTATCTAAAGTCAACATGAAGGAAAAAAATTC <<<<<<7<<<77<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:49 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS56_59:1:93:490:901 83 chr2 445 99 35M = 280 -200 AGAAAAGCATACAGTCATCTATAAAGGAAATCCCA <<<<<<<;<<<;<<<;<<;<<;<<<<<<<<<<<<< MF:i:18 Aq:i:53 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS112_34:7:96:489:453 99 chr2 445 99 35M = 625 215 AGAAAAGCATACAGTCATCTATAAAGGAAATCCCA <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<;;;;: MF:i:18 Aq:i:54 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS114_26:6:46:13:880 147 chr2 445 99 35M = 290 -190 AGAAAAGCATACAGTCATCTATAAAGGAAATCCCA =&====8==========0================= MF:i:18 
Aq:i:43 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS114_28:2:167:905:852 163 chr2 445 99 36M = 647 238 AGAAAAGCATACAGTCATCTATAAAGAAAATCCCAT <<<7<<<<<<<<<<<<<<:<:<<:::&.<:<66:3< MF:i:18 Aq:i:43 NM:i:1 UQ:i:5 H0:i:0 H1:i:1 --EAS219_FC30151:3:13:674:1717 163 chr2 445 99 35M = 623 213 AGAAAAGCATGCAGTCATCTATAAAGGAAATCCCA <<<<<<<<<<%<<<<<<<<<<<<<<<<<<<;:;;; MF:i:18 Aq:i:45 NM:i:1 UQ:i:4 H0:i:0 H1:i:1 --EAS51_62:7:196:511:896 83 chr2 446 99 35M = 283 -198 GAAAAGCATACAGTCATCTATAAAGGAAATCCCAT 8<<<<<;<<<:<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:52 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS56_53:1:154:118:488 163 chr2 447 99 35M = 624 212 AAAAGCATACAGTCATCTATAAAGGAAATCCCATC <<<<<<<<<<<<<<<<<<<<<<:7<<<<7<:;;:: MF:i:18 Aq:i:54 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS56_57:2:44:153:969 83 chr2 447 95 35M = 245 -237 AAAAGCATACAGTCATCTATAAAGGAAATCCCATC <<5<:7<72<51<7<*79<<<<<5<<<<<<<<<2< MF:i:18 Aq:i:36 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS114_28:4:215:246:640 99 chr2 447 99 36M = 624 213 AAAAGCATACAGTCATCTATAAAGGAAATCCCATCA <<<<<<<<<<9<;<<<<<<<<<<9;<<<<<<3;<;3 MF:i:18 Aq:i:54 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS219_1:7:94:1655:1921 147 chr2 447 85 35M = 258 -224 AAAAGCATACAGTCATCTATAAAGGAAATCCCATC <<<<;:===<==;<==<;================; MF:i:18 Aq:i:30 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS221_1:6:60:1037:1146 147 chr2 447 99 35M = 250 -232 AAAAGCATACAGTCATCTATAAAGGAAATCCCATC <<<<<<<<<;<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:53 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS56_65:1:23:536:229 99 chr2 448 99 35M = 614 201 AAAGCATACAGTCATCTATAAAGGAAATCCCATCA <<<<<<<<<<<<<<<<<:<8<:<<;<<<<<<7<<< MF:i:18 Aq:i:54 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS112_34:6:130:865:838 163 chr2 448 99 35M = 649 236 AAAGCATACAGTCATCTATAAAGGAAATCCCATCA <<<<<<<<<<<<<<<<<<<<<<<<<<<<<;;:<;3 MF:i:18 Aq:i:54 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS56_59:2:239:1001:406 99 chr2 450 99 35M = 634 219 AGCATACAGTCATCTATAAAGGAAATCCCATCAGA <<<<<<7<<<<<<<<8<;<<<7<<<<36<<3<:33 MF:i:18 Aq:i:49 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS1_97:3:147:423:584 147 chr2 451 99 35M = 277 -209 
GCATACAGTCATCTATAAAGGAAATCCCATCAGAA 27<;<3<<<+<<;<<<;;-4<<<<<;<<<<<<<<< MF:i:18 Aq:i:69 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS1_99:1:187:715:521 83 chr2 451 99 35M = 291 -195 GCATACAGTCATCTATAAAGGAAATCCCATCAGAA <7<:<9<<<<<<<<<;<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:74 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS54_67:3:172:196:746 99 chr2 451 99 35M = 620 204 GCATACAGTCATCTATAAAGGAAATCCCATCAGAA <<<<<<<<9<<<<9<<<<<<<<<;<<<<6<<<<;< MF:i:18 Aq:i:71 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS54_71:3:267:821:860 83 chr2 451 99 34M = 296 -189 GCATACAGTCATCTATAAAGGAAATCCCATCAGA $&<<<.<:;6<<;<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:30 NM:i:0 UQ:i:0 H0:i:1 H1:i:3 --EAS56_61:7:7:682:201 83 chr2 452 99 35M = 288 -199 CATACAGTCATCTATAAAGGAAATCCCATCAGAAT 0:8;5<8<1:78<<<<<<<<<<<<:8<<2<<<<:< MF:i:18 Aq:i:66 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --B7_589:3:82:13:897 163 chr2 453 99 35M = 606 188 ATACAGTCATCTATAAAGGAAATCCCAGCAGAATA <<<<;<<<<<<;<;<;5<51;<1<<<<%<<<<,58 MF:i:18 Aq:i:41 NM:i:1 UQ:i:4 H0:i:0 H1:i:1 --EAS56_53:6:180:695:621 99 chr2 453 99 35M = 637 219 ATACAGTCATCTATAAAGGAAATCCCATCAGAATA <<<<<<<<<<<<<<<<<<<<<<<<<<<<<;::<<< MF:i:18 Aq:i:68 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS56_57:2:158:909:321 83 chr2 453 99 35M = 271 -217 ATACAGTCATCTATAAAGGAAATCCCATCAGAATA <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:76 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS114_26:2:237:497:165 99 chr2 454 99 35M = 619 200 TACAGTCATCTATAAAGGAAATCCCATCAGAATAA 8===<8===========37=<===7=;7=8===== MF:i:18 Aq:i:68 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS1_99:2:152:355:962 83 chr2 456 99 35M = 269 -222 CAGTCATCTATAAAGGAAATCCCATCAGAATAACA &<.9.<;+;<;<<<<<<<<<<::<<:<<<<<<<<< MF:i:18 Aq:i:30 NM:i:0 UQ:i:0 H0:i:1 H1:i:2 --EAS192_3:4:255:549:422 83 chr2 456 99 35M = 295 -196 AAGTCATCTATAAAGGAAATCCCATCAGAATAACA &<;;+<;4;<<<<<<<<<<<;<;<<;<<<<<<<<< MF:i:18 Aq:i:30 NM:i:1 UQ:i:5 H0:i:1 H1:i:2 --EAS220_1:4:100:20:1199 163 chr2 456 99 35M = 614 193 CAGTCATCTATAAAGGAAATCCCATCAGAATAACA 7<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<4<< MF:i:18 Aq:i:74 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 
--EAS54_67:5:71:408:741 163 chr2 457 99 35M = 637 215 AGTCATCTATAAAGGAAATCCCATCAGAATAACAA <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<;< MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS51_66:5:285:395:450 147 chr2 458 99 35M = 269 -224 GTCATCTATAAAGGAAATCCCATCAGAATAACAAT 8)3<8+;<)<<<<<<<<97:7<<<<<<<<<<<<<< MF:i:18 Aq:i:65 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS54_73:3:4:854:140 99 chr2 458 72 35M = 638 215 GTCATCTATAAAGGAAATCCCATCAGAATAACAAT <<<6<<<:<6<<<:36:<<<<3<<8:.6<38::4< MF:i:18 Aq:i:0 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS56_59:6:227:657:95 147 chr2 458 99 35M = 280 -213 GTCATCTATAAAGGAAATCCCATCAGAATAACAAT ;3;<);<<<<<<<<<<<<18<<<<<<<<<<<<<<< MF:i:18 Aq:i:59 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS114_28:7:57:324:546 83 chr2 458 99 36M = 281 -213 GTCATCTATAAAGGAAATCCCATCAGAATAACAATG ;;5<;,<<<;;<<<<<<<97<<<<<<<<<<9<<<<< MF:i:18 Aq:i:73 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS139_19:4:26:274:1078 83 chr2 458 99 40M = 282 -216 GTCATCTATAAAGGAAATCCCATCAGAATAACAATGGGCT 9:*:64<<;<<<<<<<<<;8;<<:<<<<<<<<<<<<<<<< MF:i:18 Aq:i:74 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS188_7:6:107:636:642 163 chr2 458 99 35M = 630 207 GTCATCTATAAAGGAAATCCCATCAGAATAACAAT <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:77 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS54_81:7:226:869:36 147 chr2 461 99 35M = 273 -223 ATATATAAAGGAAATCCCATCAGAATAACAATGGG <0/)&<=,==4>4=>>= MF:i:18 Aq:i:68 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS218_1:8:82:1540:77 163 chr2 619 99 35M = 786 202 GAAATAAAGTCAAGTCTTTCCTGACAAGCAAATGC <<<<<<<<<<<<<<<<<<<<<<<<<<<;;<<<<:8 MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS54_67:3:172:196:746 147 chr2 620 99 35M = 451 -204 AAATAAAGTCAAGTCTTTCCTGACAAGCAAATGCT <<<;><<+<<<<:<<<<2<;<<<;<<<<<<<<<<< MF:i:18 Aq:i:71 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS54_73:7:97:892:419 163 chr2 621 99 35M = 800 214 AATAAAGTCAAGTCTTTCCTGACAAGCAAATGCTA <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:76 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS192_3:4:312:915:751 147 chr2 621 99 35M = 461 -195 AATAAAGTCAAGTCTTTCCTGACAAGCAAAAGCTA <:-<<<99:::);:7<4;8<<<<<<<;<2<+8<;< 
MF:i:18 Aq:i:41 NM:i:1 UQ:i:10 H0:i:0 H1:i:1 --EAS1_93:4:325:352:67 163 chr2 622 99 35M = 794 207 ATAAAGTCAAGTCTTTCCTGACAAGCAAATGCTAA ==================<========<=<;-=== MF:i:18 Aq:i:75 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS1_97:4:83:731:540 99 chr2 623 99 35M = 804 216 TAAAGTCAAGTCTTTCCTGACAAGCAAATGCTAAG <<<<<<<<<;<<<<<<<<<<<<<:<7<*;&;<;;9 MF:i:18 Aq:i:69 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS188_7:5:74:329:459 163 chr2 623 99 35M = 795 207 TAAAGTCAAGTCTTTCCTGACAAGCAAATGCTAAG <<<<<<<<<<<<<<<<<<<<<<<;<<;<<;9;599 MF:i:18 Aq:i:73 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS219_FC30151:3:13:674:1717 83 chr2 623 99 35M = 445 -213 TAAAGTCAAGTCTTTCCTGACAAGCAAATGCTAAG <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:45 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS1_105:1:141:415:738 69 chr2 624 0 * = 624 0 TTACCTAGTTGCTCTGTAGTCTCAATTAATTGTTT <<<<<<<<<<<<<<<<<<<<<<<;<<<<<;<8<<< MF:i:192 --EAS1_105:1:141:415:738 137 chr2 624 76 35M = 624 0 AAAGTCAAGTCTTTCCTGACAAGCAAATGCTAAGA <<<<<<<<<<<<<<<<<<<<<<:<<;<<;<<<<6: MF:i:64 Aq:i:0 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS56_53:1:154:118:488 83 chr2 624 99 35M = 447 -212 AAAGTCAAGTCTTTCCTGACAAGCAAATGCTAAGA <<<;58<<95:<<;<;<<<;<<<;;<<<<<<<<<< MF:i:18 Aq:i:54 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS56_59:5:198:929:684 83 chr2 624 99 35M = 471 -188 AAAGTCAAGTCTTTCCTGACAAGCAAATGCTAAGA <<;<<<<<:<<<<<<<<<<<<<<<<<;<<<<<<<< MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS114_28:4:215:246:640 147 chr2 624 99 36M = 447 -213 AAAGTCAAGTCTTTCCTGACAAGCAAATGCTAAGAT ;<<,<<<96<<:<:<9<6<97<<<<<9<<<<9<<9< MF:i:18 Aq:i:54 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS1_103:2:234:167:381 83 chr2 625 99 35M = 443 -217 AAGTCAAGTCTTTCCTGACAAGCAAATGCTAAGAT <<;<;<<<<;<<<<7<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:49 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS112_34:7:96:489:453 147 chr2 625 99 35M = 445 -215 AAGTCAAGTCTTTCCTGACAAGCAAATGCTAAGAT ;<;;;<<<<5:<<:<<<<:<<<<<<<<<<<<<<<< MF:i:18 Aq:i:54 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS1_93:3:79:879:15 99 chr2 626 99 35M = 790 199 AGTCAAGTCTTTCCTGACAAGCAAATGCTAAGATA 
<<<<<<<<<<<<<<<<<<<<<<<<<<<<2<;<<1< MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS56_53:2:59:286:290 147 chr2 628 99 35M = 467 -196 TCAAGTCTTTCCTGACAAGCAAATGCTAAGATAAT 77<<<<7<<<97<<,7<<<;<<<;<9<<<<<<<<< MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS1_95:5:263:511:936 99 chr2 629 99 35M = 801 207 CAAGTCTTTCCTGACAAGCAAATGCTAAGATAATT <<<<<<<<<<<<<<<<<;<<<<<;<<:<:<<<<<< MF:i:18 Aq:i:71 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS114_30:3:181:582:435 147 chr2 629 99 35M = 471 -193 CAAGTCTTTCCTGACAAGCAAATGCTAAGATAATT <<<<<<<<;<<<<<;<<4<<<<<<;<<<<<<<<<< MF:i:18 Aq:i:76 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS188_7:6:107:636:642 83 chr2 630 99 35M = 458 -207 AAGTCTTTCCTGACAAGCAAATGCTAAGATAATTC <<<<<<<;<<<<<;<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:77 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS112_34:4:12:273:89 83 chr2 631 99 35M = 477 -189 AGTCTTTCCTGACAAGCAAATGCTAAGATAATTCA <:737<288<<<7<<<<<<<<<:9<<<<<<<<<<< MF:i:18 Aq:i:71 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS56_59:2:239:1001:406 147 chr2 634 99 35M = 450 -219 CTTTCCTGACAAGCAAATGCTAAGATAATTCATCA 0':.71;;:9==9=;====;=;============= MF:i:18 Aq:i:49 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS188_4:7:96:899:106 147 chr2 636 99 35M = 462 -209 TTCCTGACAAGCAAATGCTAAGATAATTCATCATC ;;;;<<<<<<<;<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:75 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS54_65:6:67:56:806 147 chr2 637 99 35M = 464 -208 TCCTGACAAGCAAATGCTAAGATAATTCATCATCA 844:8;7<88;8<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:70 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS54_67:5:71:408:741 83 chr2 637 99 35M = 457 -215 TCCTGACAAGCAAATGCTAAGATAATTCATCATCA ;7;<;<0<<<<<<<<:;<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS56_53:6:180:695:621 147 chr2 637 99 35M = 453 -219 TACTGAAAAGCAAATGCTAAGATAATTCATCATCA ;&377<&<<;7<<<<<7<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:68 NM:i:2 UQ:i:10 H0:i:1 H1:i:0 --EAS114_30:6:49:656:507 147 chr2 637 99 35M = 468 -204 TCCTGACAAGCAAATGCTAAGATAATTCATCATCA %44;;<:<<;<;<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS54_73:3:4:854:140 147 chr2 638 
72 35M = 458 -215 CCTGACAAGCAAATGCTAAGATAATTCATCATCAC :9':<;<<<;<<<;<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:0 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS139_19:1:85:1521:58 99 chr2 639 99 40M = 813 214 CTGACAAGCAAATGCTAAGATAATTCATCATCACTAAACC <<<<<<<<9<<<<<<<<<<<<<7<<<<<<<<<<<<;;:7: MF:i:18 Aq:i:74 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS114_39:2:57:1064:925 137 chr2 640 76 35M * 0 0 TGACAAGCAAATGCTAAGATAATTCATCATCACTA <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<:<<< MF:i:32 Aq:i:29 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS1_103:3:323:196:855 163 chr2 642 99 35M = 809 202 ACAAGCAAATGCTAAGATAATTCATCATCACTAAA <<<<<<<7<<<<<<:<<<<<<<<<<<<<<<<<;7: MF:i:18 Aq:i:69 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS54_67:5:117:33:262 163 chr2 642 99 35M = 814 207 ACAAGCAAATGCTAAGATAATTCATCATCACTAAA <<<;<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<; MF:i:18 Aq:i:76 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS139_11:1:59:742:549 99 chr2 642 99 35M = 816 209 ACAAGCAAATGCTAAGATAATTCATCATCACTAAA <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<8< MF:i:18 Aq:i:48 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS114_28:2:55:562:403 163 chr2 643 99 36M = 825 218 CAAGCAAATGCTAAGATAATTCATCATCACTAAACC <<<<<<<<<<<<<<<<<<<<<;<<;<<<<<<<;<;: MF:i:18 Aq:i:51 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS54_71:7:97:743:602 163 chr2 644 99 35M = 821 211 AAGCAAATGCTAAGATAATTCATCATCACTAAACC <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<;<<<: MF:i:18 Aq:i:26 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS114_28:2:167:905:852 83 chr2 647 99 36M = 445 -238 CAAATGCTAAGATAATTCATCATCACTAAACCAGTC +<<<9;7;<<+<<<<<39<;9<;9<<7<<<<<<<<< MF:i:18 Aq:i:43 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS112_34:6:130:865:838 83 chr2 649 99 35M = 448 -236 AATGCTAAGATAATTCATCATCACTAAACCAGTCC ;<:84<<<4<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:54 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS219_1:1:60:1420:660 163 chr2 649 99 35M = 808 194 AATGCTAAGATAATTCATCATCACTAAACCAGTCC <<<<<<<<<<<<<<<<<<<<<<<<<<;<<<8<<<< MF:i:18 Aq:i:77 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --B7_593:3:180:89:582 99 chr2 650 99 36M = 809 195 ATGCTAAGATAATTCATCATCACTAAACCAGTCCTA <<<<<<<<<7<<<<<<<<<7<<<:<<<:<<::77:< MF:i:18 Aq:i:74 NM:i:0 
UQ:i:0 H0:i:1 H1:i:0 --EAS1_99:1:86:871:319 147 chr2 651 71 35M = 494 -192 TGCTAAGATAATTCATCATCACTAAACCAGTCCTA 7;+1;<:<<<<<<<<;<<;<<9<<<<<<<<<<<<< MF:i:18 Aq:i:0 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS56_57:2:236:841:20 83 chr2 652 99 35M = 467 -220 GCTAAGATAATTCATCATCACTAAACCAGTCCTAT 7;<<<;<<<<;;<<<<<<<<<<<<<<<<<<<<;<< MF:i:18 Aq:i:75 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS51_62:2:133:8:379 83 chr2 653 99 35M = 470 -218 ATAAGATAATTCATCATCACTAAACCAGTCCTATA &=========='==7==0=2====28===00==== MF:i:18 Aq:i:70 NM:i:1 UQ:i:5 H0:i:1 H1:i:0 --EAS1_105:8:96:720:940 83 chr2 654 99 35M = 467 -222 TAAGATAATTCATCATCACTAAACCAGTCCTATAA *<<<<;<<<9<<;,<;0<;<<<<<<<<<<<<<<<< MF:i:18 Aq:i:69 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS56_57:5:71:994:576 99 chr2 655 99 35M = 805 185 AAGATAATTCATCATCACTAAACCAGTCCTATAAG <<<<<<<<<<<<<<<<<<<<<<<;<<5<<<<<<<< MF:i:18 Aq:i:76 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS1_103:4:164:79:134 147 chr2 656 99 35M = 488 -203 AGATAATTCATCATCACTAAACCAGTCCTATAAGA <;<;<<<;<<;<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:47 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS139_19:6:78:1029:512 83 chr2 656 99 40M = 500 -196 AGATAATTCATCATCACTAAACCAGTCCTATAAGAAATGC ;;;;;<;;<<<.<<6;<<;<;8<<<<::<<<<<<<<;<<< MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS1_93:1:214:784:690 147 chr2 657 99 35M = 472 -220 GATAATTCATCATCACTAAACCAGTCCTATAAGAA -<7<<7<:<<2<<<<;<<<<<;<<<<3<<<<<<<< MF:i:18 Aq:i:30 NM:i:0 UQ:i:0 H0:i:1 H1:i:1 --EAS220_1:4:6:1178:1105 99 chr2 657 93 35M = 830 208 GATAATTCATCATCACTAAACCAGTCCTATAAGAA <<<<<<:<<<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:17 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS1_99:7:171:196:287 83 chr2 658 99 35M = 485 -208 ATAATTCATCATCACTAAACCAGTCCTATAAGAAA <;;;98;<;&<;;<<<<<<<;<<<<<<<<<<<<<< MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS114_28:1:220:801:282 99 chr2 660 99 36M = 837 213 AATTCATCATCACTAAACCAGTCCTATAAGAAATGC <<<<<<<<<<<<<<<<<<<<<;<+<;<<<<<::<<: MF:i:18 Aq:i:30 NM:i:0 UQ:i:0 H0:i:1 H1:i:2 --EAS221_1:2:73:955:728 163 chr2 660 44 35M = 823 198 AATTCATCATCACTAAACCAGTCCTATAAGAAATG 
<<<<<<<<<<<<<<<<<<<<<<<<<<<<<;<<<<< MF:i:18 Aq:i:14 NM:i:0 UQ:i:0 H0:i:1 H1:i:2 --EAS1_105:1:3:903:957 147 chr2 661 99 35M = 516 -180 ATTCATCATCACTAAACCAGTCCTATAAGAAATGC <%12<&<<<;<:<<<<<<<<<7<<<<<<<<<<<<< MF:i:18 Aq:i:30 NM:i:0 UQ:i:0 H0:i:1 H1:i:1 --EAS56_65:2:224:579:433 83 chr2 662 99 35M = 485 -212 TTCATCATCACTAAACCAGTCCTATAAGAAATGCT '<08/8<+<>===> MF:i:18 Aq:i:75 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS1_105:2:146:374:692 99 chr2 690 99 35M = 874 219 AAATGCTCAAAAGAATTGTAAAAGTCAAAATTAAA <<<<<<<<<<<<<<<=>>>==>>===>==> MF:i:130 Aq:i:74 NM:i:0 UQ:i:0 H0:i:0 H1:i:0 --EAS1_108:6:159:493:275 99 chr2 760 72 35M = 939 214 ACAAAACTCACAGGTTTTATAAAACAATTAATTGA =====3============================= MF:i:130 Aq:i:72 NM:i:0 UQ:i:0 H0:i:0 H1:i:0 --EAS139_11:1:81:1019:558 163 chr2 760 77 35M = 926 201 ACAAAACTCACAGGTTTTATAAAACAATTAATTGA <<<<<<<<<<<6<<<<<<<<<<<<<<<<<<<<<7< MF:i:130 Aq:i:77 NM:i:0 UQ:i:0 H0:i:0 H1:i:0 --EAS51_62:7:162:195:761 163 chr2 767 30 18M4I13M = 922 190 TCACAGGTTTTATAAAACAATTAATTGAGACTACA <<<<<<<<<<<<<<<<<<<<<<<<<<;<:<<<<;; MF:i:130 Aq:i:30 NM:i:0 UQ:i:0 H0:i:0 H1:i:0 --B7_597:3:115:646:430 147 chr2 768 45 17M4I14M = 582 -217 CACAGGTTTTATAAAACAATTAATTGAGACTACAG 5;5<;<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:130 Aq:i:45 NM:i:0 UQ:i:0 H0:i:0 H1:i:0 --EAS114_30:6:243:209:110 163 chr2 768 48 17M4I14M = 920 187 CACAGGTTTTATAAAACAATTAATTGAGACTACAG <<<<<;<;<<<;<<<<<<<<<<<;<:;<<:;;+85 MF:i:130 Aq:i:48 NM:i:0 UQ:i:0 H0:i:0 H1:i:0 --EAS1_108:2:266:994:429 147 chr2 769 76 16M4I15M = 612 -188 ACAGGTTTTATAAAACAATTAATTGAGACTACAGA <<<<<<<<<<<<<<<< MF:i:18 Aq:i:54 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS139_19:1:85:1521:58 147 chr2 813 99 40M = 639 -214 AAATTAACATTACAACAGGAACAAAACCTCATATATCAAT :::86<<:<<8<<<<;<<8<<<<<<<<<<<<<<<;<<<<< MF:i:18 Aq:i:74 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS188_7:4:164:719:947 99 chr2 813 99 35M = 1005 227 AAATTAACATTACAACAGGAACAAAACCTCATATA <<<<<<<<<<<<<<<<<<<;<<<<<<<<<;<<<<< MF:i:18 Aq:i:64 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS219_1:1:50:257:341 163 chr2 813 99 35M = 
971 193 AAATTAACATTACAACAGGAACAAAACCTCATATA <<<<<<<<<<<<<<<<<<<<<<<<<<<<7<6<<<< MF:i:18 Aq:i:77 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS54_67:5:117:33:262 83 chr2 814 99 35M = 642 -207 AATTAACATTACAACAGGAACAAAACCTCATATAT <<;;<<;<:8<7<<;<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:76 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS218_1:8:90:706:1276 163 chr2 814 99 35M = 980 201 AATTAACATTACAACAGGAACAAAACCTCATATAT <<<<<<<<<<<<<<<<<<<<<<;<<<<<<<<:<:< MF:i:18 Aq:i:30 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS1_108:2:116:966:193 163 chr2 815 99 35M = 967 187 ATTAACATTACAACAGGAACAAAACCTCATATATC <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<;<<<< MF:i:18 Aq:i:74 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS139_11:1:59:742:549 147 chr2 816 99 35M = 642 -209 TTAACATTACAACAGGAACAAAACCTCATATATCA -<<<3<<<<6<<6<<<<<6<<<<6<<<<<<<<<<< MF:i:18 Aq:i:48 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS139_11:7:74:213:877 99 chr2 816 99 35M = 996 215 TTAACATTACAACAGGAACAAAACCTCATATATCA <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:30 NM:i:0 UQ:i:0 H0:i:1 H1:i:2 --B7_610:3:85:219:371 163 chr2 817 99 35M = 967 185 TAACATTACAACAGGAACAAAACCTCATATATCAA <<<<<<<<<<<<<<<<<<<<<:<<<<<<<<<<;<; MF:i:18 Aq:i:54 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS1_108:2:176:653:957 163 chr2 819 82 35M = 982 198 ACATTACAACAGGAACAAAACCTCATATATCAATA ????????????<==>=>=>=>>>==>>>=>>> MF:i:18 Aq:i:71 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS139_19:5:57:366:844 83 chr2 877 99 40M = 708 -209 AAATTCCCCCACTTAAGAGATATAGATTGGCAGAACAGAT ;;;7:8&555<,;<<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:73 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS114_32:3:236:475:254 163 chr2 880 99 35M = 1051 206 TTCCCCCACTTAAGAGATATAGATTGGCAGAACAG <<<<<<<<<<<<<<<<<<<<<<<<<:::<:;>=>>>>==>=>>>==>=>=:=====;=:=6:::6 MF:i:18 Aq:i:74 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS218_1:2:10:686:1024 163 chr2 947 99 35M = 1103 191 ACAAGAAACTCATTAATAAAGACATGAGTTCAGGT <:<<<<:<<<<<<<<<<:<:<<<<<<<<<<<5<<< MF:i:18 Aq:i:30 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS54_73:5:53:61:31 163 chr2 949 99 35M = 1122 208 AAGAAACTCATTAATAAAGACATGAGTTCAGATAA 
<<<7;<7<<<;7<;;<7<7<7<;5<73<<<;>588>9<>7:<0<9; MF:i:18 Aq:i:30 NM:i:1 UQ:i:5 H0:i:0 H1:i:1 --B7_589:2:30:644:942 99 chr2 1045 83 35M = 1229 219 TATATCAGATAAAGCACACTTTAAATCAACAACAG <<<<<<<<<<<<<<<<<<<<<<<9<<<<<<9;<9< MF:i:18 Aq:i:22 NM:i:0 UQ:i:0 H0:i:1 H1:i:3 --B7_591:2:123:924:645 83 chr2 1045 84 36M = 861 -220 TATATCAGATAAAGCACACTTTAAATCAACAACAGT ;<<<<*<<<<<<<<8<<<<<><<<<<><<<<<<<<<<<<<<< MF:i:18 Aq:i:47 NM:i:1 UQ:i:27 H0:i:0 H1:i:1 --EAS51_62:4:308:614:911 99 chr2 1319 90 35M = 1493 209 TGCGCTTGTACTTCTAAATCTATAACAAAATTAAA <<<<<<<<<<<<<<<<<<<<<<<<<<<;;<<<<8< MF:i:18 Aq:i:43 NM:i:1 UQ:i:27 H0:i:0 H1:i:1 --EAS54_65:3:155:541:234 83 chr2 1319 99 35M = 1151 -203 TGCGCTTGTACTTCTAAATCTATAAAAAAATTAAA 78;<7<<<<<<<<<<<<<<;<<<<<<<<<<;<<<< MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS56_57:6:175:289:351 147 chr2 1319 99 35M = 1144 -210 TGCGCTTGTACTTCTAAATCTATAAAAAAATTAAA 9;;:+<<<<<;<<:<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:73 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --B7_593:7:283:186:707 83 chr2 1321 99 36M = 1154 -203 CGCTTGTACTTCTAAATCTATAACAAAATTAAAATT 889;<7;<7<<7<<<<<7<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:45 NM:i:1 UQ:i:27 H0:i:0 H1:i:1 --EAS1_105:3:308:66:538 147 chr2 1321 99 35M = 1138 -218 CGCTTGTACTTCTAAATCTATAACAAAATTAAAAT 996999;<9;<:<<<<<:<<7<<<<<<<<<<<<<< MF:i:18 Aq:i:45 NM:i:1 UQ:i:27 H0:i:0 H1:i:1 --EAS1_108:5:11:555:330 163 chr2 1321 99 35M = 1492 206 CGCTTGTACTTCTAAATCTATAAAAAAATTAAAAT <<<<<<<<<<<<<<<<<<<<<<<<<<<<;<<4<;< MF:i:18 Aq:i:56 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS51_66:7:84:411:336 73 chr2 1322 75 35M * 0 0 GCTTGTACTTCTAAATCTATAAAAAAATTAAAATT <<<;<<<;<<<<<<<<<<<<:<<;<<<<<<;8<;< MF:i:32 Aq:i:0 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS139_11:5:52:1278:1478 163 chr2 1322 47 35M = 1513 226 GCTTGTACTTCTAAATCTATAACAAAATTAAAATT <<<<<<<<<<<<<<9<<<<<<<<<<<<<<<<9<<< MF:i:18 Aq:i:0 NM:i:1 UQ:i:27 H0:i:0 H1:i:1 --EAS56_53:3:101:809:776 147 chr2 1326 99 35M = 1160 -201 GTACTTCTAAATCTATAAAAAAATTAAAATTTAAC <<<-<;7;<<<<:;<<<7<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:72 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 
--EAS192_3:3:221:881:916 147 chr2 1327 96 35M = 1168 -194 TAATTCTAAATCTAGAACAAAATTAAAATTTAACA 44%-4(5<;9/,:<68:1<:8<:<<84;<<<<<;< MF:i:18 Aq:i:24 NM:i:3 UQ:i:41 H0:i:0 H1:i:0 --EAS1_105:1:28:745:352 147 chr2 1329 99 35M = 1159 -205 CTTCTAAATCTATAACAAAATTAAAATTTAACAAA 4;;*;<<<;;<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:45 NM:i:1 UQ:i:27 H0:i:0 H1:i:1 --EAS114_45:2:23:1754:796 99 chr2 1329 99 35M = 1488 194 CTTCTAAATCTATAAAAAAATTAAAATTTAACAAA ;<<;<;<;<;<;<<;;;;;<<<<;;<<<<<97999 MF:i:18 Aq:i:69 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS1_97:2:96:419:327 147 chr2 1331 99 35M = 1149 -217 TCTAAATCTATAACAAAATTAAAATTTAACAAAAG ;1<<<<<9<<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:47 NM:i:1 UQ:i:27 H0:i:0 H1:i:1 --EAS1_97:4:274:287:423 163 chr2 1332 75 35M = 1515 218 CTAAATCTATAAAAAAATTAAAATTTAACAAAAGT <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<;<<< MF:i:18 Aq:i:0 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS219_1:7:35:392:2042 83 chr2 1332 99 35M = 1168 -199 ATAAATCTATAAAAAAATTAAAATTTAACAAAAGT +<<<<4<>>>>;>>&>->9>9;4>->>>>,4>9>,<1> MF:i:18 Aq:i:27 NM:i:1 UQ:i:5 H0:i:0 H1:i:1 --EAS139_19:2:82:154:1333 99 chr2 1349 77 40M = 1511 202 TTAAAATTTAACAAAAGTAAATAAAACACACAGCTAAAAC <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<;;<;;:;: MF:i:18 Aq:i:0 NM:i:1 UQ:i:27 H0:i:1 H1:i:0 --EAS188_7:1:290:286:763 99 chr2 1349 75 35M = 1515 201 TTAAAATTTAACAAAAGTAAATAAAACACATAGCT <<<<<<<<<<<<<<<<7<<<<<<<<<<<<<<<8<< MF:i:18 Aq:i:0 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS221_1:4:3:248:1491 73 chr2 1349 99 35M * 0 0 TTAAAATTTAACAAAAGTAAATAAAACACATAGCT <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<:8:< MF:i:18 Aq:i:76 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS114_39:3:6:1064:1805 99 chr2 1350 99 35M = 1502 187 TAAAATTTAACAAAAGTAAATAAAACACATAGCTA <<<<<<<<<<<<<<<<<<<<<<;<<<<<<<<<<<< MF:i:18 Aq:i:76 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --B7_595:6:137:811:130 83 chr2 1351 99 35M = 1175 -211 AAAATTTAACAAAAGTAAATAAAACACATAGCTAA <<<<<<<<<:<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:47 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS1_108:1:155:809:543 83 chr2 1352 99 35M = 1156 -231 AAATTTAACAAAAGTAAATAAAACACATAGCTAAA 
<<<+0<<<9<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:73 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS56_63:3:41:468:459 99 chr2 1352 75 35M = 1513 196 AAATTTAACAAAAGTAAATAAAACACATAGCTAAA <<<<<<<<<<<<<<<<<<<<<;<<<<<<<<<<;;7 MF:i:18 Aq:i:0 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS1_108:4:31:622:216 73 chr2 1354 99 35M * 0 0 ATTTAACAAAAGTAAATAAAACACATAGCTAAAAC <<<<<<<<<<<<<<<<<<<<<<<<<<<<8<<96<7 MF:i:18 Aq:i:70 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS54_71:8:105:854:975 163 chr2 1354 71 35M = 1523 202 ATTTAACAAAAGTAAATAAAACACATAGCTAAAAC <<<<<<<<<<<<<<<<<<<<<<<<<<<7:<;;;;5 MF:i:18 Aq:i:0 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --B7_610:7:26:749:174 147 chr2 1357 78 35M = 1183 -209 TAACAAAAGTAAATAAAACACATAGCTAAAACTAA (<<)<<<<6<<<<<<<<<<&:<3<<<6<<<)<:<< MF:i:18 Aq:i:11 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS56_57:6:21:553:57 147 chr2 1358 99 35M = 1197 -196 AACAAAAGTAAATAAAACACATAGCTAAAACTAAA <<+<<<<<<<<<;<<<<8<<<<<<8<<<<<;<<<< MF:i:18 Aq:i:71 NM:i:0 UQ:i:0 H0:i:1 H1:i:0 --EAS1_97:2:128:629:484 83 chr2 1359 96 35M = 1185 -209 AAAAAAGTAAATAAAACACATAGCTAAAACTAAAA :(::<<<<<<<<<< MF:i:32 Aq:i:0 NM:i:0 UQ:i:0 H0:i:82 H1:i:85 --B7_589:6:33:356:636 73 chr2 1520 0 35M * 0 0 TTTTTTTCTTTTCTCTTTTTTTTTTTTTTTTTTTT <<<<<<<8;<<<<<<<<<<<<<7<<<<<<<;;3&3 MF:i:32 Aq:i:0 NM:i:0 UQ:i:0 H0:i:14 H1:i:85 --EAS114_45:6:86:859:1779 137 chr2 1520 0 35M * 0 0 TTTTTTTCATTTCTCTTTTTTTTTTTTTTTTTTTT ;;;;;;;;;;;;;;;;;;;;;;;;;;;8;;)7699 MF:i:32 Aq:i:0 NM:i:1 UQ:i:26 H0:i:0 H1:i:15 --EAS54_71:8:105:854:975 83 chr2 1523 71 33M = 1354 -202 TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTG <<<<;<:<<;<&<;<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:0 NM:i:0 UQ:i:0 H0:i:85 H1:i:85 --EAS51_62:4:187:907:145 153 chr2 1524 28 35M * 0 0 TTTCTTCTCTCTCTTTTTTTTTTTTTTTATTGCAT <<<+;;,6<<<<6<<<<<<<<<<<<<<<<<<<<<< MF:i:32 Aq:i:28 NM:i:3 UQ:i:59 H0:i:0 H1:i:0 --EAS54_71:4:284:269:882 73 chr2 1524 0 34M * 0 0 TTTCTTTTCTCTTTTTTTTTTTTTTGTTTTTGCA <;<<<<<8<7<8;<<<;<7<<<<<;272;73&&) MF:i:32 Aq:i:0 NM:i:1 UQ:i:17 H0:i:0 H1:i:85 --EAS56_63:4:141:9:811 137 chr2 1524 10 35M * 0 0 
TTTCTTTTCTCCTTTTTTTTTTTTTTTTTCTACAT <<<;<<<<<<<;<;<:<<<;<<<<<<<<..));;. MF:i:32 Aq:i:0 NM:i:3 UQ:i:47 H0:i:2 H1:i:27 --EAS114_30:6:277:397:932 73 chr2 1524 0 35M * 0 0 TTTCTTTTCACTTTTTTTTTTTTTTTTTTTTACTT <<<<<<<<<<<<<<<<<<<<<<<<<<<<:8(,0%( MF:i:32 Aq:i:0 NM:i:3 UQ:i:42 H0:i:2 H1:i:85 --EAS139_11:7:50:1229:1313 83 chr2 1528 77 35M = 1376 -187 TTTTTTCTTTTTTTTTTTTTTTTTTTTGCATGCCA <<<<,<&<7<<<<<<<<<<<<<<<<<<<<<<<<<< MF:i:18 Aq:i:0 NM:i:1 UQ:i:11 H0:i:3 H1:i:7 --EAS54_65:3:320:20:250 147 chr2 1532 77 35M = 1367 -200 TTTTTTTTTTTTTTTTTTTTTTTGCATGCCAGAAA +'''/<<<<7:;+<;::<<<;;<<<<<<<<<<<<< MF:i:18 Aq:i:6 NM:i:2 UQ:i:24 H0:i:1 H1:i:2 --EAS114_26:7:37:79:581 83 chr2 1533 68 35M = 1349 -219 TTTTTTTTTTTTTTTTTTTTTTTCATGCCAGAAAA 3,,,===6===<===<;=====-============ MF:i:18 Aq:i:27 NM:i:2 UQ:i:23 H0:i:0 H1:i:1 ---- python-pysam.orig/tests/pysam_data/example_user_header.sam -+++ /dev/null -@@ -1,8 +0,0 @@ --@HD VN:1.0 --@SQ SN:chr1 LN:1575 --@SQ SN:chr2 LN:1584 --@x1 A:2 B:5 --@x2 A:4 B:5 --@x3 A:6 B:5 --read_28833_29006_6945 99 chr1 33 20 10M1D25M = 200 167 AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG <<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<< NM:i:1 RG:Z:L1 --read_28701_28881_323b 147 chr2 88 30 35M = 500 412 ACCTATATCTTGGCCTTGGCCGATGCGGCCTTGCA <<<<<;<<<<7;:<<<6;<<<<<<<<<<<<7<<<< MF:i:18 RG:Z:L2 ---- python-pysam.orig/tests/pysam_data/Makefile -+++ python-pysam/tests/pysam_data/Makefile -@@ -14,7 +14,6 @@ - $(BAM) $(BAI) \ - $(CRAM) $(CRAI) \ - example_bai.bam \ -- rg_with_tab.bam \ - ex2_truncated.bam \ - empty.bam empty.bam.bai \ - explicit_index.bam explicit_index.cram \ ---- python-pysam.orig/pysam/alternatives.py.obsolete -+++ python-pysam/pysam/alternatives.py.obsolete -@@ -12,7 +12,6 @@ - int bam_merge(int argc, char *argv[]) - int bam_index(int argc, char *argv[]) - int bam_sort(int argc, char *argv[]) -- int bam_tview_main(int argc, char *argv[]) - int bam_mating(int argc, char *argv[]) - int bam_rmdup(int argc, char *argv[]) - int bam_rmdupse(int argc, char *argv[]) ---- 
python-pysam.orig/tests/AlignmentFile_test.py -+++ python-pysam/tests/AlignmentFile_test.py -@@ -1382,19 +1382,19 @@ - os.unlink(tmpfilename) - - --class TestDeNovoConstructionUserTags(TestDeNovoConstruction): -- -- '''test de novo construction with a header that contains lower-case tags.''' -- -- header = {'HD': {'VN': '1.0'}, -- 'SQ': [{'LN': 1575, 'SN': 'chr1'}, -- {'LN': 1584, 'SN': 'chr2'}], -- 'x1': {'A': 2, 'B': 5}, -- 'x3': {'A': 6, 'B': 5}, -- 'x2': {'A': 4, 'B': 5}} -- -- bamfile = os.path.join(BAM_DATADIR, "example_user_header.bam") -- samfile = os.path.join(BAM_DATADIR, "example_user_header.sam") -+# class TestDeNovoConstructionUserTags(TestDeNovoConstruction): -+# -+# '''test de novo construction with a header that contains lower-case tags.''' -+# -+# header = {'HD': {'VN': '1.0'}, -+# 'SQ': [{'LN': 1575, 'SN': 'chr1'}, -+# {'LN': 1584, 'SN': 'chr2'}], -+# 'x1': {'A': 2, 'B': 5}, -+# 'x3': {'A': 6, 'B': 5}, -+# 'x2': {'A': 4, 'B': 5}} -+# -+# bamfile = os.path.join(BAM_DATADIR, "example_user_header.bam") -+# samfile = os.path.join(BAM_DATADIR, "example_user_header.sam") - - - class TestEmptyHeader(unittest.TestCase): ---- python-pysam.orig/tests/samtools_test.py -+++ python-pysam/tests/samtools_test.py -@@ -78,7 +78,7 @@ - # ("view -bT ex1.fa -o %(out)s_ex1.view2 ex1.sam", - "sort ex1.bam -o %(out)s_ex1.sort.bam", - "mpileup ex1.bam > %(out)s_ex1.pileup", -- "depth ex1.bam > %(out)s_ex1.depth", -+ #"depth ex1.bam > %(out)s_ex1.depth", - # TODO: issues with file naming - # "faidx ex1.fa; %(out)s_ex1.fa.fai", - "index ex1.bam %(out)s_ex1.bam.fai", -@@ -100,8 +100,8 @@ - "cat -o %(out)s_ex1.cat.bam ex1.bam ex1.bam", - "targetcut ex1.bam > %(out)s_ex1.targetcut", - "phase ex1.bam > %(out)s_ex1.phase", -- "import ex1.fa.fai ex1.sam.gz %(out)s_ex1.bam", -- "bam2fq ex1.bam > %(out)s_ex1.bam2fq", -+ #"view -bt ex1.fa.fai -o %(out)s_ex1.bam ex1.sam.gz", -+ #"bam2fq ex1.bam > %(out)s_ex1.bam2fq", - # TODO: not the same - # "pad2unpad -T ex1.fa ex2.bam > 
%(out)s_ex2.unpad", - # TODO: command line option problem diff --git a/debian/patches/series b/debian/patches/series index 6d0edd0..43ee385 100644 --- a/debian/patches/series +++ b/debian/patches/series @@ -1,8 +1,3 @@ skip_test_remote.patch -#spelling -#hts1.10 -#samtools_v1.10_full -# samtools_v1.10 -#bcftools_v1.10_full clean_less skip-test-on-32-bit.patch diff --git a/debian/patches/spelling b/debian/patches/spelling deleted file mode 100644 index 34e82d6..0000000 --- a/debian/patches/spelling +++ /dev/null @@ -1,115 +0,0 @@ -From: Michael R. Crusoe -Subject: Fix spelling typos, courtesy of lintian ---- a/bcftools/filter.c -+++ b/bcftools/filter.c -@@ -1053,7 +1053,7 @@ static void filters_set_nmissing(filter_ - } - static int func_npass(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack) - { -- if ( nstack==0 ) error("Error parsing the expresion\n"); -+ if ( nstack==0 ) error("Error parsing the expression\n"); - token_t *tok = stack[nstack - 1]; - if ( !tok->nsamples ) error("The function %s works with FORMAT fields\n", rtok->tag); - ---- a/bcftools/filter.c.pysam.c -+++ b/bcftools/filter.c.pysam.c -@@ -1055,7 +1055,7 @@ static void filters_set_nmissing(filter_ - } - static int func_npass(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack) - { -- if ( nstack==0 ) error("Error parsing the expresion\n"); -+ if ( nstack==0 ) error("Error parsing the expression\n"); - token_t *tok = stack[nstack - 1]; - if ( !tok->nsamples ) error("The function %s works with FORMAT fields\n", rtok->tag); - ---- a/pysam/libcalignedsegment.pyx -+++ b/pysam/libcalignedsegment.pyx -@@ -2242,7 +2242,7 @@ cdef class AlignedSegment: - *value*. - - An existing value of the same *tag* will be overwritten unless -- *replace* is set to False. This is usually not recommened as a -+ *replace* is set to False. This is usually not recommended as a - tag may only appear once in the optional alignment section. 
- - If *value* is None, the tag will be deleted. ---- a/pysam/libcalignmentfile.pyx -+++ b/pysam/libcalignmentfile.pyx -@@ -1029,7 +1029,7 @@ cdef class AlignmentFile(HTSFile): - - See :meth:`~pysam.HTSFile.parse_region` for more information - on how genomic regions can be specified. :term:`reference` and -- `end` are also accepted for backward compatiblity as synonyms -+ `end` are also accepted for backward compatibility as synonyms - for :term:`contig` and `stop`, respectively. - - Without a `contig` or `region` all mapped reads in the file -@@ -1212,7 +1212,7 @@ cdef class AlignmentFile(HTSFile): - """perform a :term:`pileup` within a :term:`region`. The region is - specified by :term:`contig`, `start` and `stop` (using - 0-based indexing). :term:`reference` and `end` are also accepted for -- backward compatiblity as synonyms for :term:`contig` and `stop`, -+ backward compatibility as synonyms for :term:`contig` and `stop`, - respectively. Alternatively, a samtools 'region' string - can be supplied. - -@@ -1354,7 +1354,7 @@ cdef class AlignmentFile(HTSFile): - - The region is specified by :term:`contig`, `start` and `stop`. - :term:`reference` and `end` are also accepted for backward -- compatiblity as synonyms for :term:`contig` and `stop`, -+ compatibility as synonyms for :term:`contig` and `stop`, - respectively. Alternatively, a :term:`samtools` :term:`region` - string can be supplied. - -@@ -1458,7 +1458,7 @@ cdef class AlignmentFile(HTSFile): - - The region is specified by :term:`contig`, `start` and `stop`. - :term:`reference` and `end` are also accepted for backward -- compatiblity as synonyms for :term:`contig` and `stop`, -+ compatibility as synonyms for :term:`contig` and `stop`, - respectively. Alternatively, a :term:`samtools` :term:`region` - string can be supplied. The coverage is computed per-base [ACGT]. 
- ---- a/pysam/libchtslib.pxd -+++ b/pysam/libchtslib.pxd -@@ -2511,7 +2511,7 @@ cdef extern from "htslib/cram.h" nogil: - # 2 if the file is a stream and thus unseekable - # 1 if the file contains an EOF block - # 0 if the file does not contain an EOF block -- # -1 if an error occured whilst reading the file or we could not seek back to where we were -+ # -1 if an error occurred whilst reading the file or we could not seek back to where we were - # - # - int cram_check_EOF(cram_fd *fd) ---- a/pysam/libchtslib.pyx -+++ b/pysam/libchtslib.pyx -@@ -585,7 +585,7 @@ cdef class HTSFile(object): - rval = hts_opt_apply(self.htsfile, opts) - if rval != 0: - hts_opt_free(opts) -- raise RuntimeError('An error occured while applying the requested format options') -+ raise RuntimeError('An error occurred while applying the requested format options') - hts_opt_free(opts) - - def parse_region(self, contig=None, start=None, stop=None, -@@ -595,7 +595,7 @@ cdef class HTSFile(object): - either be specified by :term:`contig`, `start` and - `stop`. `start` and `stop` denote 0-based, half-open - intervals. :term:`reference` and `end` are also accepted for -- backward compatiblity as synonyms for :term:`contig` and -+ backward compatibility as synonyms for :term:`contig` and - `stop`, respectively. - - Alternatively, a samtools :term:`region` string can be ---- a/pysam/libcutils.pyx -+++ b/pysam/libcutils.pyx -@@ -179,7 +179,7 @@ cpdef parse_region(contig=None, - `end`. `start` and `end` denote 0-based, half-open intervals. - - :term:`reference` and `end` are also accepted for backward -- compatiblity as synonyms for :term:`contig` and `stop`, -+ compatibility as synonyms for :term:`contig` and `stop`, - respectively. - - Alternatively, a samtools :term:`region` string can be supplied.