From: Étienne Mollier Date: Sat, 19 Nov 2022 10:30:14 +0000 (+0100) Subject: New upstream version 0.20.0+ds X-Git-Tag: archive/raspbian/0.22.0+ds-1+rpi1~1^2^2~12^2 X-Git-Url: https://dgit.raspbian.org/?a=commitdiff_plain;h=9ae61ccd7dfcfb7733df826dd6f728711d9084ae;p=python-pysam.git New upstream version 0.20.0+ds --- diff --git a/NEWS b/NEWS index 3af63dd..a0078cb 100644 --- a/NEWS +++ b/NEWS @@ -5,6 +5,19 @@ http://pysam.readthedocs.io/en/latest/release.html Release notes ============= +Release 0.20.0 +============== + +This release wraps htslib/bcftools version 1.16 and samtools version 1.16.1. + +* [#1113] Full compatibility with setuptools v62.1.0's build directory name changes +* [#1121] Build-time symbol check portability improved +* [#1122] Fix setting sample genotype using .alleles property +* [#1128] Fix test suite failure when using a libdeflate-enabled samtools + +Many additional type hints have been provided by the community, +thanks! + Release 0.19.1 ============== diff --git a/README.rst b/README.rst index 06d44bf..9a66049 100644 --- a/README.rst +++ b/README.rst @@ -25,7 +25,7 @@ as it resolves non-python dependencies and uses pre-configured compilation options. Especially for OS X this will potentially save a lot of trouble. -The current version of pysam wraps 3rd-party code from htslib-1.15.1, samtools-1.15.1, and bcftools-1.15.1. +The current version of pysam wraps 3rd-party code from htslib-1.16, samtools-1.16.1, and bcftools-1.16. Pysam is available through `pypi `_. To install, type:: diff --git a/bcftools/bam2bcf.c b/bcftools/bam2bcf.c index 76a0d43..d373e99 100644 --- a/bcftools/bam2bcf.c +++ b/bcftools/bam2bcf.c @@ -1,7 +1,7 @@ /* bam2bcf.c -- variant calling. Copyright (C) 2010-2012 Broad Institute. - Copyright (C) 2012-2021 Genome Research Ltd. + Copyright (C) 2012-2022 Genome Research Ltd. 
Author: Heng Li @@ -89,6 +89,39 @@ void bcf_call_destroy(bcf_callaux_t *bca) free(bca->bases); free(bca->inscns); free(bca); } +static int get_aux_nm(bam1_t *rec, int32_t qpos, int is_ref) +{ + uint8_t *nm_tag = bam_aux_get(rec, "NM"); + if ( !nm_tag ) return -1; + int64_t nm = bam_aux2i(nm_tag); + + // Count indels as single events, not as the number of inserted/deleted + // bases (which is what NM does). Add soft clips as mismatches. + int i; + for (i=0; i < rec->core.n_cigar; i++) + { + int val = bam_get_cigar(rec)[i] & BAM_CIGAR_MASK; + if ( val==BAM_CSOFT_CLIP ) + { + nm += bam_get_cigar(rec)[i] >> BAM_CIGAR_SHIFT; + } + else if ( val==BAM_CINS || val==BAM_CDEL ) + { + val = bam_get_cigar(rec)[i] >> BAM_CIGAR_SHIFT; + if ( val > 1 ) nm -= val - 1; + } + } + + // Take into account MNPs, 2% of de novo SNVs appear within 20bp of another de novo SNV + // http://www.genome.org/cgi/doi/10.1101/gr.239756.118 + nm -= is_ref ? 1 : 2; + + if ( nm < 0 ) nm = 0; + if ( nm >= B2B_N_NM ) nm = B2B_N_NM - 1; + + return nm; +} + // position in the sequence with respect to the aligned part of the read static int get_position(const bam_pileup1_t *p, int *len, int *sc_len, int *sc_dist) { @@ -158,6 +191,17 @@ void bcf_callaux_clean(bcf_callaux_t *bca, bcf_call_t *call) if ( call->ADF ) memset(call->ADF,0,sizeof(int32_t)*(call->n+1)*B2B_MAX_ALLELES); if ( call->ADR ) memset(call->ADR,0,sizeof(int32_t)*(call->n+1)*B2B_MAX_ALLELES); if ( call->SCR ) memset(call->SCR,0,sizeof(*call->SCR)*(call->n+1)); + if ( call->SCR ) memset(call->SCR,0,sizeof(*call->SCR)*(call->n+1)); + if ( bca->fmt_flag&B2B_FMT_NMBZ ) + { + memset(call->ref_nm,0,sizeof(*call->ref_nm)*(call->n+1)*B2B_N_NM); + memset(call->alt_nm,0,sizeof(*call->alt_nm)*(call->n+1)*B2B_N_NM); + } + else + { + memset(call->ref_nm,0,sizeof(*call->ref_nm)*B2B_N_NM); + memset(call->alt_nm,0,sizeof(*call->alt_nm)*B2B_N_NM); + } memset(call->QS,0,sizeof(*call->QS)*call->n*B2B_MAX_ALLELES); memset(bca->ref_scl, 0, 100*sizeof(int)); 
memset(bca->alt_scl, 0, 100*sizeof(int)); @@ -309,21 +353,26 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t if (sc_len > 99) sc_len = 99; } } - int imq = mapQ * nqual_over_60; int ibq = baseQ * nqual_over_60; + int inm = get_aux_nm(p->b,p->qpos,is_diff?0:1); if ( bam_is_rev(p->b) ) bca->rev_mqs[imq]++; else bca->fwd_mqs[imq]++; - if ( bam_seqi(bam_get_seq(p->b),p->qpos) == ref_base ) + if ( !is_diff ) { bca->ref_pos[epos]++; bca->ref_bq[ibq]++; bca->ref_mq[imq]++; bca->ref_scl[sc_len]++; + if ( inm>=0 ) + { + bca->ref_nm[inm]++; + if ( r->ref_nm ) r->ref_nm[inm]++; + } } else { @@ -331,6 +380,11 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t bca->alt_bq[ibq]++; bca->alt_mq[imq]++; bca->alt_scl[sc_len]++; + if ( inm>=0 ) + { + bca->alt_nm[inm]++; + if ( r->alt_nm ) r->alt_nm[inm]++; + } } } @@ -798,6 +852,7 @@ int bcf_call_combine(int n, const bcf_callret1_t *calls, bcf_callaux_t *bca, int call->n_alleles = j; if (call->n_alleles == 1) return -1; // no reliable supporting read. stop doing anything } + int has_alt = (call->n_alleles==2 && call->unseen!=-1) ? 0 : 1; /* * Set the phread likelihood array (call->PL) This array is 15 entries long * for each sample because that is size of an upper or lower triangle of a @@ -914,6 +969,9 @@ int bcf_call_combine(int n, const bcf_callret1_t *calls, bcf_callaux_t *bca, int for (j = 0; j < 16; ++j) call->anno[j] += calls[i].anno[j]; } + // No need to calculate MWU tests when there is no ALT allele, this should speed up things slightly + if ( !has_alt ) return 0; + calc_SegBias(calls, call); // calc_chisq_bias("XPOS", call->bcf_hdr->id[BCF_DT_CTG][call->tid].key, call->pos, bca->ref_pos, bca->alt_pos, bca->npos); @@ -922,7 +980,7 @@ int bcf_call_combine(int n, const bcf_callret1_t *calls, bcf_callaux_t *bca, int if (bca->fmt_flag & B2B_INFO_ZSCORE) { // U z-normalised as +/- number of standard deviations from mean. 
- if (call->ori_ref < 0) { + if (call->ori_ref < 0) { // indel if (bca->fmt_flag & B2B_INFO_RPB) call->mwu_pos = calc_mwu_biasZ(bca->iref_pos, bca->ialt_pos, bca->npos, 0, 1); @@ -945,6 +1003,15 @@ int bcf_call_combine(int n, const bcf_callret1_t *calls, bcf_callaux_t *bca, int call->mwu_sc = calc_mwu_biasZ(bca->ref_scl, bca->alt_scl, 100, 0,1); } + call->mwu_nm[0] = calc_mwu_biasZ(bca->ref_nm, bca->alt_nm, B2B_N_NM,0,1); + if ( bca->fmt_flag & B2B_FMT_NMBZ ) + { + for (i=0; imwu_nm[i+1] = val!=HUGE_VAL ? val : 0; + } + } } else { // Old method; U as probability between 0 and 1 if ( bca->fmt_flag & B2B_INFO_RPB ) @@ -976,7 +1043,7 @@ int bcf_call_combine(int n, const bcf_callret1_t *calls, bcf_callaux_t *bca, int int bcf_call2bcf(bcf_call_t *bc, bcf1_t *rec, bcf_callret1_t *bcr, int fmt_flag, const bcf_callaux_t *bca, const char *ref) { extern double kt_fisher_exact(int n11, int n12, int n21, int n22, double *_left, double *_right, double *two); - int i, j, nals = 1; + int i, j, nals = 1, has_alt = 0; bcf_hdr_t *hdr = bc->bcf_hdr; rec->rid = bc->tid; @@ -1006,6 +1073,7 @@ int bcf_call2bcf(bcf_call_t *bc, bcf1_t *rec, bcf_callret1_t *bcr, int fmt_flag, for (j = 0; j < bca->indelreg; ++j) kputc(ref[bc->pos+1+j], &bc->tmp); } nals++; + has_alt = 1; } } else // SNP @@ -1016,7 +1084,11 @@ int bcf_call2bcf(bcf_call_t *bc, bcf1_t *rec, bcf_callret1_t *bcr, int fmt_flag, if (bc->a[i] < 0) break; kputc(',', &bc->tmp); if ( bc->unseen==i ) kputs("<*>", &bc->tmp); - else kputc("ACGT"[bc->a[i]], &bc->tmp); + else + { + kputc("ACGT"[bc->a[i]], &bc->tmp); + has_alt = 1; + } nals++; } } @@ -1052,40 +1124,46 @@ int bcf_call2bcf(bcf_call_t *bc, bcf1_t *rec, bcf_callret1_t *bcr, int fmt_flag, bcf_update_info_float(hdr, rec, "I16", tmpf, 16); bcf_update_info_float(hdr, rec, "QS", bc->qsum, nals); - if ( bc->vdb != HUGE_VAL ) bcf_update_info_float(hdr, rec, "VDB", &bc->vdb, 1); - if ( bc->seg_bias != HUGE_VAL ) bcf_update_info_float(hdr, rec, "SGB", &bc->seg_bias, 1); - - if 
(bca->fmt_flag & B2B_INFO_ZSCORE) { - if ( bc->mwu_pos != HUGE_VAL ) - bcf_update_info_float(hdr, rec, "RPBZ", &bc->mwu_pos, 1); - if ( bc->mwu_mq != HUGE_VAL ) - bcf_update_info_float(hdr, rec, "MQBZ", &bc->mwu_mq, 1); - if ( bc->mwu_mqs != HUGE_VAL ) - bcf_update_info_float(hdr, rec, "MQSBZ", &bc->mwu_mqs, 1); - if ( bc->mwu_bq != HUGE_VAL ) - bcf_update_info_float(hdr, rec, "BQBZ", &bc->mwu_bq, 1); - if ( bc->mwu_sc != HUGE_VAL ) - bcf_update_info_float(hdr, rec, "SCBZ", &bc->mwu_sc, 1); - } else { - if ( bc->mwu_pos != HUGE_VAL ) - bcf_update_info_float(hdr, rec, "RPB", &bc->mwu_pos, 1); - if ( bc->mwu_mq != HUGE_VAL ) - bcf_update_info_float(hdr, rec, "MQB", &bc->mwu_mq, 1); - if ( bc->mwu_mqs != HUGE_VAL ) - bcf_update_info_float(hdr, rec, "MQSB", &bc->mwu_mqs, 1); - if ( bc->mwu_bq != HUGE_VAL ) - bcf_update_info_float(hdr, rec, "BQB", &bc->mwu_bq, 1); - } + if ( has_alt ) + { + if ( bc->vdb != HUGE_VAL ) bcf_update_info_float(hdr, rec, "VDB", &bc->vdb, 1); + if ( bc->seg_bias != HUGE_VAL ) bcf_update_info_float(hdr, rec, "SGB", &bc->seg_bias, 1); + + if (bca->fmt_flag & B2B_INFO_ZSCORE) { + if ( bc->mwu_pos != HUGE_VAL ) + bcf_update_info_float(hdr, rec, "RPBZ", &bc->mwu_pos, 1); + if ( bc->mwu_mq != HUGE_VAL ) + bcf_update_info_float(hdr, rec, "MQBZ", &bc->mwu_mq, 1); + if ( bc->mwu_mqs != HUGE_VAL ) + bcf_update_info_float(hdr, rec, "MQSBZ", &bc->mwu_mqs, 1); + if ( bc->mwu_bq != HUGE_VAL ) + bcf_update_info_float(hdr, rec, "BQBZ", &bc->mwu_bq, 1); + if ( bc->mwu_nm[0] != HUGE_VAL ) + bcf_update_info_float(hdr, rec, "NMBZ", bc->mwu_nm, 1); + if ( bc->mwu_sc != HUGE_VAL ) + bcf_update_info_float(hdr, rec, "SCBZ", &bc->mwu_sc, 1); + } else { + if ( bc->mwu_pos != HUGE_VAL ) + bcf_update_info_float(hdr, rec, "RPB", &bc->mwu_pos, 1); + if ( bc->mwu_mq != HUGE_VAL ) + bcf_update_info_float(hdr, rec, "MQB", &bc->mwu_mq, 1); + if ( bc->mwu_mqs != HUGE_VAL ) + bcf_update_info_float(hdr, rec, "MQSB", &bc->mwu_mqs, 1); + if ( bc->mwu_bq != HUGE_VAL ) + 
bcf_update_info_float(hdr, rec, "BQB", &bc->mwu_bq, 1); + } - if ( bc->strand_bias != HUGE_VAL ) - bcf_update_info_float(hdr, rec, "FS", &bc->strand_bias, 1); + if ( bc->strand_bias != HUGE_VAL ) + bcf_update_info_float(hdr, rec, "FS", &bc->strand_bias, 1); #if CDF_MWU_TESTS - if ( bc->mwu_pos_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "RPB2", &bc->mwu_pos_cdf, 1); - if ( bc->mwu_mq_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "MQB2", &bc->mwu_mq_cdf, 1); - if ( bc->mwu_mqs_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "MQSB2", &bc->mwu_mqs_cdf, 1); - if ( bc->mwu_bq_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "BQB2", &bc->mwu_bq_cdf, 1); + if ( bc->mwu_pos_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "RPB2", &bc->mwu_pos_cdf, 1); + if ( bc->mwu_mq_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "MQB2", &bc->mwu_mq_cdf, 1); + if ( bc->mwu_mqs_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "MQSB2", &bc->mwu_mqs_cdf, 1); + if ( bc->mwu_bq_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "BQB2", &bc->mwu_bq_cdf, 1); #endif + } + tmpf[0] = bc->ori_depth ? (float)bc->mq0/bc->ori_depth : 0; bcf_update_info_float(hdr, rec, "MQ0F", tmpf, 1); @@ -1144,5 +1222,11 @@ int bcf_call2bcf(bcf_call_t *bc, bcf1_t *rec, bcf_callret1_t *bcr, int fmt_flag, if ( fmt_flag&B2B_FMT_QS ) bcf_update_format_int32(hdr, rec, "QS", bc->QS, rec->n_sample*rec->n_allele); + if ( has_alt ) + { + if ( fmt_flag&B2B_FMT_NMBZ ) + bcf_update_format_float(hdr, rec, "NMBZ", bc->mwu_nm+1, rec->n_sample); + } + return 0; } diff --git a/bcftools/bam2bcf.c.pysam.c b/bcftools/bam2bcf.c.pysam.c index 2d741fa..24c4270 100644 --- a/bcftools/bam2bcf.c.pysam.c +++ b/bcftools/bam2bcf.c.pysam.c @@ -3,7 +3,7 @@ /* bam2bcf.c -- variant calling. Copyright (C) 2010-2012 Broad Institute. - Copyright (C) 2012-2021 Genome Research Ltd. + Copyright (C) 2012-2022 Genome Research Ltd. 
Author: Heng Li @@ -91,6 +91,39 @@ void bcf_call_destroy(bcf_callaux_t *bca) free(bca->bases); free(bca->inscns); free(bca); } +static int get_aux_nm(bam1_t *rec, int32_t qpos, int is_ref) +{ + uint8_t *nm_tag = bam_aux_get(rec, "NM"); + if ( !nm_tag ) return -1; + int64_t nm = bam_aux2i(nm_tag); + + // Count indels as single events, not as the number of inserted/deleted + // bases (which is what NM does). Add soft clips as mismatches. + int i; + for (i=0; i < rec->core.n_cigar; i++) + { + int val = bam_get_cigar(rec)[i] & BAM_CIGAR_MASK; + if ( val==BAM_CSOFT_CLIP ) + { + nm += bam_get_cigar(rec)[i] >> BAM_CIGAR_SHIFT; + } + else if ( val==BAM_CINS || val==BAM_CDEL ) + { + val = bam_get_cigar(rec)[i] >> BAM_CIGAR_SHIFT; + if ( val > 1 ) nm -= val - 1; + } + } + + // Take into account MNPs, 2% of de novo SNVs appear within 20bp of another de novo SNV + // http://www.genome.org/cgi/doi/10.1101/gr.239756.118 + nm -= is_ref ? 1 : 2; + + if ( nm < 0 ) nm = 0; + if ( nm >= B2B_N_NM ) nm = B2B_N_NM - 1; + + return nm; +} + // position in the sequence with respect to the aligned part of the read static int get_position(const bam_pileup1_t *p, int *len, int *sc_len, int *sc_dist) { @@ -160,6 +193,17 @@ void bcf_callaux_clean(bcf_callaux_t *bca, bcf_call_t *call) if ( call->ADF ) memset(call->ADF,0,sizeof(int32_t)*(call->n+1)*B2B_MAX_ALLELES); if ( call->ADR ) memset(call->ADR,0,sizeof(int32_t)*(call->n+1)*B2B_MAX_ALLELES); if ( call->SCR ) memset(call->SCR,0,sizeof(*call->SCR)*(call->n+1)); + if ( call->SCR ) memset(call->SCR,0,sizeof(*call->SCR)*(call->n+1)); + if ( bca->fmt_flag&B2B_FMT_NMBZ ) + { + memset(call->ref_nm,0,sizeof(*call->ref_nm)*(call->n+1)*B2B_N_NM); + memset(call->alt_nm,0,sizeof(*call->alt_nm)*(call->n+1)*B2B_N_NM); + } + else + { + memset(call->ref_nm,0,sizeof(*call->ref_nm)*B2B_N_NM); + memset(call->alt_nm,0,sizeof(*call->alt_nm)*B2B_N_NM); + } memset(call->QS,0,sizeof(*call->QS)*call->n*B2B_MAX_ALLELES); memset(bca->ref_scl, 0, 100*sizeof(int)); 
memset(bca->alt_scl, 0, 100*sizeof(int)); @@ -311,21 +355,26 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t if (sc_len > 99) sc_len = 99; } } - int imq = mapQ * nqual_over_60; int ibq = baseQ * nqual_over_60; + int inm = get_aux_nm(p->b,p->qpos,is_diff?0:1); if ( bam_is_rev(p->b) ) bca->rev_mqs[imq]++; else bca->fwd_mqs[imq]++; - if ( bam_seqi(bam_get_seq(p->b),p->qpos) == ref_base ) + if ( !is_diff ) { bca->ref_pos[epos]++; bca->ref_bq[ibq]++; bca->ref_mq[imq]++; bca->ref_scl[sc_len]++; + if ( inm>=0 ) + { + bca->ref_nm[inm]++; + if ( r->ref_nm ) r->ref_nm[inm]++; + } } else { @@ -333,6 +382,11 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t bca->alt_bq[ibq]++; bca->alt_mq[imq]++; bca->alt_scl[sc_len]++; + if ( inm>=0 ) + { + bca->alt_nm[inm]++; + if ( r->alt_nm ) r->alt_nm[inm]++; + } } } @@ -800,6 +854,7 @@ int bcf_call_combine(int n, const bcf_callret1_t *calls, bcf_callaux_t *bca, int call->n_alleles = j; if (call->n_alleles == 1) return -1; // no reliable supporting read. stop doing anything } + int has_alt = (call->n_alleles==2 && call->unseen!=-1) ? 0 : 1; /* * Set the phread likelihood array (call->PL) This array is 15 entries long * for each sample because that is size of an upper or lower triangle of a @@ -916,6 +971,9 @@ int bcf_call_combine(int n, const bcf_callret1_t *calls, bcf_callaux_t *bca, int for (j = 0; j < 16; ++j) call->anno[j] += calls[i].anno[j]; } + // No need to calculate MWU tests when there is no ALT allele, this should speed up things slightly + if ( !has_alt ) return 0; + calc_SegBias(calls, call); // calc_chisq_bias("XPOS", call->bcf_hdr->id[BCF_DT_CTG][call->tid].key, call->pos, bca->ref_pos, bca->alt_pos, bca->npos); @@ -924,7 +982,7 @@ int bcf_call_combine(int n, const bcf_callret1_t *calls, bcf_callaux_t *bca, int if (bca->fmt_flag & B2B_INFO_ZSCORE) { // U z-normalised as +/- number of standard deviations from mean. 
- if (call->ori_ref < 0) { + if (call->ori_ref < 0) { // indel if (bca->fmt_flag & B2B_INFO_RPB) call->mwu_pos = calc_mwu_biasZ(bca->iref_pos, bca->ialt_pos, bca->npos, 0, 1); @@ -947,6 +1005,15 @@ int bcf_call_combine(int n, const bcf_callret1_t *calls, bcf_callaux_t *bca, int call->mwu_sc = calc_mwu_biasZ(bca->ref_scl, bca->alt_scl, 100, 0,1); } + call->mwu_nm[0] = calc_mwu_biasZ(bca->ref_nm, bca->alt_nm, B2B_N_NM,0,1); + if ( bca->fmt_flag & B2B_FMT_NMBZ ) + { + for (i=0; imwu_nm[i+1] = val!=HUGE_VAL ? val : 0; + } + } } else { // Old method; U as probability between 0 and 1 if ( bca->fmt_flag & B2B_INFO_RPB ) @@ -978,7 +1045,7 @@ int bcf_call_combine(int n, const bcf_callret1_t *calls, bcf_callaux_t *bca, int int bcf_call2bcf(bcf_call_t *bc, bcf1_t *rec, bcf_callret1_t *bcr, int fmt_flag, const bcf_callaux_t *bca, const char *ref) { extern double kt_fisher_exact(int n11, int n12, int n21, int n22, double *_left, double *_right, double *two); - int i, j, nals = 1; + int i, j, nals = 1, has_alt = 0; bcf_hdr_t *hdr = bc->bcf_hdr; rec->rid = bc->tid; @@ -1008,6 +1075,7 @@ int bcf_call2bcf(bcf_call_t *bc, bcf1_t *rec, bcf_callret1_t *bcr, int fmt_flag, for (j = 0; j < bca->indelreg; ++j) kputc(ref[bc->pos+1+j], &bc->tmp); } nals++; + has_alt = 1; } } else // SNP @@ -1018,7 +1086,11 @@ int bcf_call2bcf(bcf_call_t *bc, bcf1_t *rec, bcf_callret1_t *bcr, int fmt_flag, if (bc->a[i] < 0) break; kputc(',', &bc->tmp); if ( bc->unseen==i ) kputs("<*>", &bc->tmp); - else kputc("ACGT"[bc->a[i]], &bc->tmp); + else + { + kputc("ACGT"[bc->a[i]], &bc->tmp); + has_alt = 1; + } nals++; } } @@ -1054,40 +1126,46 @@ int bcf_call2bcf(bcf_call_t *bc, bcf1_t *rec, bcf_callret1_t *bcr, int fmt_flag, bcf_update_info_float(hdr, rec, "I16", tmpf, 16); bcf_update_info_float(hdr, rec, "QS", bc->qsum, nals); - if ( bc->vdb != HUGE_VAL ) bcf_update_info_float(hdr, rec, "VDB", &bc->vdb, 1); - if ( bc->seg_bias != HUGE_VAL ) bcf_update_info_float(hdr, rec, "SGB", &bc->seg_bias, 1); - - if 
(bca->fmt_flag & B2B_INFO_ZSCORE) { - if ( bc->mwu_pos != HUGE_VAL ) - bcf_update_info_float(hdr, rec, "RPBZ", &bc->mwu_pos, 1); - if ( bc->mwu_mq != HUGE_VAL ) - bcf_update_info_float(hdr, rec, "MQBZ", &bc->mwu_mq, 1); - if ( bc->mwu_mqs != HUGE_VAL ) - bcf_update_info_float(hdr, rec, "MQSBZ", &bc->mwu_mqs, 1); - if ( bc->mwu_bq != HUGE_VAL ) - bcf_update_info_float(hdr, rec, "BQBZ", &bc->mwu_bq, 1); - if ( bc->mwu_sc != HUGE_VAL ) - bcf_update_info_float(hdr, rec, "SCBZ", &bc->mwu_sc, 1); - } else { - if ( bc->mwu_pos != HUGE_VAL ) - bcf_update_info_float(hdr, rec, "RPB", &bc->mwu_pos, 1); - if ( bc->mwu_mq != HUGE_VAL ) - bcf_update_info_float(hdr, rec, "MQB", &bc->mwu_mq, 1); - if ( bc->mwu_mqs != HUGE_VAL ) - bcf_update_info_float(hdr, rec, "MQSB", &bc->mwu_mqs, 1); - if ( bc->mwu_bq != HUGE_VAL ) - bcf_update_info_float(hdr, rec, "BQB", &bc->mwu_bq, 1); - } + if ( has_alt ) + { + if ( bc->vdb != HUGE_VAL ) bcf_update_info_float(hdr, rec, "VDB", &bc->vdb, 1); + if ( bc->seg_bias != HUGE_VAL ) bcf_update_info_float(hdr, rec, "SGB", &bc->seg_bias, 1); + + if (bca->fmt_flag & B2B_INFO_ZSCORE) { + if ( bc->mwu_pos != HUGE_VAL ) + bcf_update_info_float(hdr, rec, "RPBZ", &bc->mwu_pos, 1); + if ( bc->mwu_mq != HUGE_VAL ) + bcf_update_info_float(hdr, rec, "MQBZ", &bc->mwu_mq, 1); + if ( bc->mwu_mqs != HUGE_VAL ) + bcf_update_info_float(hdr, rec, "MQSBZ", &bc->mwu_mqs, 1); + if ( bc->mwu_bq != HUGE_VAL ) + bcf_update_info_float(hdr, rec, "BQBZ", &bc->mwu_bq, 1); + if ( bc->mwu_nm[0] != HUGE_VAL ) + bcf_update_info_float(hdr, rec, "NMBZ", bc->mwu_nm, 1); + if ( bc->mwu_sc != HUGE_VAL ) + bcf_update_info_float(hdr, rec, "SCBZ", &bc->mwu_sc, 1); + } else { + if ( bc->mwu_pos != HUGE_VAL ) + bcf_update_info_float(hdr, rec, "RPB", &bc->mwu_pos, 1); + if ( bc->mwu_mq != HUGE_VAL ) + bcf_update_info_float(hdr, rec, "MQB", &bc->mwu_mq, 1); + if ( bc->mwu_mqs != HUGE_VAL ) + bcf_update_info_float(hdr, rec, "MQSB", &bc->mwu_mqs, 1); + if ( bc->mwu_bq != HUGE_VAL ) + 
bcf_update_info_float(hdr, rec, "BQB", &bc->mwu_bq, 1); + } - if ( bc->strand_bias != HUGE_VAL ) - bcf_update_info_float(hdr, rec, "FS", &bc->strand_bias, 1); + if ( bc->strand_bias != HUGE_VAL ) + bcf_update_info_float(hdr, rec, "FS", &bc->strand_bias, 1); #if CDF_MWU_TESTS - if ( bc->mwu_pos_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "RPB2", &bc->mwu_pos_cdf, 1); - if ( bc->mwu_mq_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "MQB2", &bc->mwu_mq_cdf, 1); - if ( bc->mwu_mqs_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "MQSB2", &bc->mwu_mqs_cdf, 1); - if ( bc->mwu_bq_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "BQB2", &bc->mwu_bq_cdf, 1); + if ( bc->mwu_pos_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "RPB2", &bc->mwu_pos_cdf, 1); + if ( bc->mwu_mq_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "MQB2", &bc->mwu_mq_cdf, 1); + if ( bc->mwu_mqs_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "MQSB2", &bc->mwu_mqs_cdf, 1); + if ( bc->mwu_bq_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "BQB2", &bc->mwu_bq_cdf, 1); #endif + } + tmpf[0] = bc->ori_depth ? (float)bc->mq0/bc->ori_depth : 0; bcf_update_info_float(hdr, rec, "MQ0F", tmpf, 1); @@ -1146,5 +1224,11 @@ int bcf_call2bcf(bcf_call_t *bc, bcf1_t *rec, bcf_callret1_t *bcr, int fmt_flag, if ( fmt_flag&B2B_FMT_QS ) bcf_update_format_int32(hdr, rec, "QS", bc->QS, rec->n_sample*rec->n_allele); + if ( has_alt ) + { + if ( fmt_flag&B2B_FMT_NMBZ ) + bcf_update_format_float(hdr, rec, "NMBZ", bc->mwu_nm+1, rec->n_sample); + } + return 0; } diff --git a/bcftools/bam2bcf.h b/bcftools/bam2bcf.h index e778b89..c256b26 100644 --- a/bcftools/bam2bcf.h +++ b/bcftools/bam2bcf.h @@ -1,7 +1,7 @@ /* bam2bcf.h -- variant calling. Copyright (C) 2010-2012 Broad Institute. - Copyright (C) 2012-2021 Genome Research Ltd. + Copyright (C) 2012-2022 Genome Research Ltd. Author: Heng Li @@ -61,9 +61,12 @@ DEALINGS IN THE SOFTWARE. 
*/ #define B2B_INFO_RPB (1<<15) #define B2B_FMT_QS (1<<16) #define B2B_INFO_SCB (1<<17) +#define B2B_FMT_NMBZ (1<<18) // per-sample NMBZ #define B2B_INFO_ZSCORE (1<<30) // MWU as-is or Z-normalised #define B2B_MAX_ALLELES 5 +#define B2B_N_NM 32 // number of NMBZ bins, i.e. max number of mismatches + #define B2B_DROP 0 #define B2B_INC_AD 1 @@ -100,6 +103,7 @@ typedef struct __bcf_callaux_t { errmod_t *e; void *rghash; float indel_bias; // adjusts indel score threshold; lower => call more. + int32_t *ref_nm, *alt_nm; // pointers to bcf_call_t.{ref_nm,alt_nm} } bcf_callaux_t; // per-sample values @@ -107,6 +111,7 @@ typedef struct { uint32_t ori_depth; // ori_depth = anno[0..3] but before --min-BQ is applied unsigned int mq0; int32_t *ADF, *ADR, SCR, *QS; // FMT/QS + int32_t *ref_nm, *alt_nm; // The fields are: // depth fwd .. ref (0) and non-ref (2) // depth rev .. ref (1) and non-ref (3) @@ -133,10 +138,10 @@ typedef struct { int n_supp; // number of supporting non-reference reads double anno[16]; unsigned int depth, ori_depth, mq0; - int32_t *PL, *DP4, *ADR, *ADF, *SCR, *QS; + int32_t *PL, *DP4, *ADR, *ADF, *SCR, *QS, *ref_nm, *alt_nm; uint8_t *fmt_arr; float vdb; // variant distance bias - float mwu_pos, mwu_mq, mwu_bq, mwu_mqs, mwu_sc; + float mwu_pos, mwu_mq, mwu_bq, mwu_mqs, mwu_sc, *mwu_nm; #if CDF_MWU_TESTS float mwu_pos_cdf, mwu_mq_cdf, mwu_bq_cdf, mwu_mqs_cdf; #endif diff --git a/bcftools/consensus.c b/bcftools/consensus.c index 9bd33cd..84ae905 100644 --- a/bcftools/consensus.c +++ b/bcftools/consensus.c @@ -1,19 +1,19 @@ /* The MIT License - Copyright (c) 2014-2021 Genome Research Ltd. + Copyright (c) 2014-2022 Genome Research Ltd. 
Author: Petr Danecek - + Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - + The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE @@ -123,9 +123,17 @@ typedef struct } args_t; +static void destroy_chain(chain_t *chain) +{ + if ( !chain ) return; + free(chain->ref_gaps); + free(chain->alt_gaps); + free(chain->block_lengths); + free(chain); +} static chain_t* init_chain(chain_t *chain, int ref_ori_pos) { -// fprintf(stderr, "init_chain(*chain, ref_ori_pos=%d)\n", ref_ori_pos); + if ( chain ) destroy_chain(chain); chain = (chain_t*) calloc(1,sizeof(chain_t)); chain->num = 0; chain->block_lengths = NULL; @@ -137,18 +145,6 @@ static chain_t* init_chain(chain_t *chain, int ref_ori_pos) return chain; } -static void destroy_chain(args_t *args) -{ - chain_t *chain = args->chain; - free(chain->ref_gaps); - free(chain->alt_gaps); - free(chain->block_lengths); - free(chain); - chain = NULL; - free(args->chr); - args->chr = NULL; -} - static void print_chain(args_t *args) { /* @@ -172,7 +168,7 @@ static void print_chain(args_t *args) - alt_start (same as ref_start, as no edits are recorded/applied before that position) - alt_end (adjusted to match the length of the alt sequence) - chain_num (just an auto-increment id) - + the other (sorted) lines are: - length of the ungapped alignment block - gap on the 
ref sequence between this and the next block (all but the last line) @@ -197,7 +193,7 @@ static void print_chain(args_t *args) static void push_chain_gap(chain_t *chain, int ref_start, int ref_len, int alt_start, int alt_len) { -// fprintf(stderr, "push_chain_gap(*chain, ref_start=%d, ref_len=%d, alt_start=%d, alt_len=%d)\n", ref_start, ref_len, alt_start, alt_len); + // fprintf(stderr, "push_chain_gap(chain=%p, ref_start=%d, ref_len=%d, alt_start=%d, alt_len=%d)\n", chain, ref_start, ref_len, alt_start, alt_len); int num = chain->num; if (num && ref_start <= chain->ref_last_block_ori) { @@ -305,6 +301,7 @@ static void destroy_data(args_t *args) if ( args->chain_fname ) if ( fclose(args->fp_chain) ) error("Close failed: %s\n", args->chain_fname); if ( fclose(args->fp_out) ) error("Close failed: %s\n", args->output_fname); + destroy_chain(args->chain); } static void init_region(args_t *args, char *line) @@ -346,12 +343,8 @@ static void init_region(args_t *args, char *line) bcf_sr_seek(args->files,line,args->fa_ori_pos); if ( tmp_ptr ) *tmp_ptr = tmp; fprintf(args->fp_out,">%s%s\n",args->chr_prefix?args->chr_prefix:"",line); - if (args->chain_fname ) - { + if ( args->chain_fname ) args->chain = init_chain(args->chain, args->fa_ori_pos); - } else { - args->chain = NULL; - } } static bcf1_t **next_vcf_line(args_t *args) @@ -526,7 +519,7 @@ static void apply_variant(args_t *args, bcf1_t *rec) if ( !args->missing_allele ) return; ialt = -1; } - else + else { if ( !warned_haplotype ) { @@ -544,11 +537,11 @@ static void apply_variant(args_t *args, bcf1_t *rec) if ( !args->missing_allele ) return; ialt = -1; } - else + else ialt = bcf_gt_allele(ialt); } } - else if ( action==use_iupac ) + else if ( action==use_iupac ) { ialt = -1; int is_missing = 0, alen = 0, mlen = 0, fallback_alt = -1; @@ -717,7 +710,7 @@ static void apply_variant(args_t *args, bcf1_t *rec) fprintf(stderr,"The site %s:%"PRId64" overlaps with another variant, skipping...\n", 
bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); return; } - + } char *alt_allele = rec->d.allele[ialt]; @@ -743,7 +736,7 @@ static void apply_variant(args_t *args, bcf1_t *rec) } } } - if ( idx>=args->fa_buf.l ) + if ( idx>=args->fa_buf.l ) error("FIXME: %s:%"PRId64" .. idx=%d, ori_pos=%d, len=%"PRIu64", off=%d\n",bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1,idx,args->fa_ori_pos,(uint64_t)args->fa_buf.l,args->fa_mod_off); // sanity check the reference base @@ -803,8 +796,8 @@ static void apply_variant(args_t *args, bcf1_t *rec) if ( fail ) { char tmp = 0; - if ( args->fa_buf.l - idx > rec->rlen ) - { + if ( args->fa_buf.l - idx > rec->rlen ) + { tmp = args->fa_buf.s[idx+rec->rlen]; args->fa_buf.s[idx+rec->rlen] = 0; } @@ -820,7 +813,7 @@ static void apply_variant(args_t *args, bcf1_t *rec) alen = strlen(alt_allele); len_diff = alen - rec->rlen; - if ( args->mark_del && len_diff<0 ) + if ( args->mark_del && len_diff<0 ) { alt_allele = mark_del(rec->d.allele[0], rec->rlen, alt_allele, args->mark_del); alen = rec->rlen; @@ -833,7 +826,7 @@ static void apply_variant(args_t *args, bcf1_t *rec) alen = strlen(alt_allele); len_diff = alen - rec->rlen; - if ( args->mark_del && len_diff<0 ) + if ( args->mark_del && len_diff<0 ) { alt_allele = mark_del(rec->d.allele[0], rec->rlen, alt_allele, args->mark_del); alen = rec->rlen; @@ -949,10 +942,8 @@ static void consensus(args_t *args) if ( str.s[0]=='>' ) { // new sequence encountered - if (args->chain) { - print_chain(args); - destroy_chain(args); - } + if ( args->chain ) print_chain(args); + // apply all cached variants and variants that might have been missed because of short fasta (see test/consensus.9.*) bcf1_t **rec_ptr = NULL; while ( args->rid>=0 && (rec_ptr = next_vcf_line(args)) ) @@ -1026,11 +1017,7 @@ static void consensus(args_t *args) if ( args->fa_ori_pos + args->fa_buf.l - args->fa_mod_off <= rec->pos ) break; apply_variant(args, rec); } - if (args->chain) - { - print_chain(args); - destroy_chain(args); - } 
+ if (args->chain) print_chain(args); if ( args->absent_allele ) apply_absent(args, HTS_POS_MAX); flush_fa_buffer(args, 0); bgzf_close(fasta); @@ -1078,6 +1065,8 @@ static void usage(args_t *args) fprintf(stderr, " # in the form \">chr:from-to\".\n"); fprintf(stderr, " samtools faidx ref.fa 8:11870-11890 | bcftools consensus in.vcf.gz > out.fa\n"); fprintf(stderr, "\n"); + fprintf(stderr, " # See also http://samtools.github.io/bcftools/howtos/consensus-sequence.html\n"); + fprintf(stderr, "\n"); exit(1); } @@ -1086,7 +1075,7 @@ int main_consensus(int argc, char *argv[]) args_t *args = (args_t*) calloc(1,sizeof(args_t)); args->argc = argc; args->argv = argv; - static struct option loptions[] = + static struct option loptions[] = { {"mark-del",required_argument,NULL,1}, {"mark-ins",required_argument,NULL,2}, @@ -1109,7 +1098,7 @@ int main_consensus(int argc, char *argv[]) int c; while ((c = getopt_long(argc, argv, "h?s:1Ii:e:H:f:o:m:c:M:p:a:",loptions,NULL)) >= 0) { - switch (c) + switch (c) { case 1 : args->mark_del = optarg[0]; break; case 2 : @@ -1126,10 +1115,10 @@ int main_consensus(int argc, char *argv[]) case 's': args->sample = optarg; break; case 'o': args->output_fname = optarg; break; case 'I': args->output_iupac = 1; break; - case 'e': + case 'e': if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; - case 'i': + case 'i': if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; case 'f': args->ref_fname = optarg; break; @@ -1139,12 +1128,12 @@ int main_consensus(int argc, char *argv[]) args->absent_allele = optarg[0]; if ( optarg[1]!=0 ) error("Expected single character with -a, got \"%s\"\n", optarg); break; - case 'M': - args->missing_allele = optarg[0]; + case 'M': + args->missing_allele = optarg[0]; 
if ( optarg[1]!=0 ) error("Expected single character with -M, got \"%s\"\n", optarg); break; case 'c': args->chain_fname = optarg; break; - case 'H': + case 'H': if ( !strcasecmp(optarg,"R") ) args->allele |= PICK_REF; else if ( !strcasecmp(optarg,"A") ) args->allele |= PICK_ALT; else if ( !strcasecmp(optarg,"L") ) args->allele |= PICK_LONG|PICK_REF; diff --git a/bcftools/consensus.c.pysam.c b/bcftools/consensus.c.pysam.c index 9c50091..4af9c18 100644 --- a/bcftools/consensus.c.pysam.c +++ b/bcftools/consensus.c.pysam.c @@ -2,20 +2,20 @@ /* The MIT License - Copyright (c) 2014-2021 Genome Research Ltd. + Copyright (c) 2014-2022 Genome Research Ltd. Author: Petr Danecek - + Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - + The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE @@ -125,9 +125,17 @@ typedef struct } args_t; +static void destroy_chain(chain_t *chain) +{ + if ( !chain ) return; + free(chain->ref_gaps); + free(chain->alt_gaps); + free(chain->block_lengths); + free(chain); +} static chain_t* init_chain(chain_t *chain, int ref_ori_pos) { -// fprintf(bcftools_stderr, "init_chain(*chain, ref_ori_pos=%d)\n", ref_ori_pos); + if ( chain ) destroy_chain(chain); chain = (chain_t*) calloc(1,sizeof(chain_t)); chain->num = 0; chain->block_lengths = NULL; @@ -139,18 +147,6 @@ static chain_t* init_chain(chain_t *chain, int ref_ori_pos) return chain; } -static void destroy_chain(args_t *args) -{ - chain_t *chain = args->chain; - free(chain->ref_gaps); - free(chain->alt_gaps); - free(chain->block_lengths); - free(chain); - chain = NULL; - free(args->chr); - args->chr = NULL; -} - static void print_chain(args_t *args) { /* @@ -174,7 +170,7 @@ static void print_chain(args_t *args) - alt_start (same as ref_start, as no edits are recorded/applied before that position) - alt_end (adjusted to match the length of the alt sequence) - chain_num (just an auto-increment id) - + the other (sorted) lines are: - length of the ungapped alignment block - gap on the ref sequence between this and the next block (all but the last line) @@ -199,7 +195,7 @@ static void print_chain(args_t *args) static void push_chain_gap(chain_t *chain, int ref_start, int ref_len, int alt_start, int alt_len) { -// fprintf(bcftools_stderr, "push_chain_gap(*chain, ref_start=%d, ref_len=%d, alt_start=%d, alt_len=%d)\n", ref_start, ref_len, alt_start, alt_len); + // fprintf(bcftools_stderr, "push_chain_gap(chain=%p, ref_start=%d, ref_len=%d, alt_start=%d, alt_len=%d)\n", chain, ref_start, ref_len, alt_start, alt_len); int num = chain->num; if (num && ref_start <= chain->ref_last_block_ori) { @@ -307,6 +303,7 @@ static void destroy_data(args_t *args) if ( args->chain_fname ) if ( fclose(args->fp_chain) ) error("Close failed: %s\n", args->chain_fname); if ( 
fclose(args->fp_out) ) error("Close failed: %s\n", args->output_fname); + destroy_chain(args->chain); } static void init_region(args_t *args, char *line) @@ -348,12 +345,8 @@ static void init_region(args_t *args, char *line) bcf_sr_seek(args->files,line,args->fa_ori_pos); if ( tmp_ptr ) *tmp_ptr = tmp; fprintf(args->fp_out,">%s%s\n",args->chr_prefix?args->chr_prefix:"",line); - if (args->chain_fname ) - { + if ( args->chain_fname ) args->chain = init_chain(args->chain, args->fa_ori_pos); - } else { - args->chain = NULL; - } } static bcf1_t **next_vcf_line(args_t *args) @@ -528,7 +521,7 @@ static void apply_variant(args_t *args, bcf1_t *rec) if ( !args->missing_allele ) return; ialt = -1; } - else + else { if ( !warned_haplotype ) { @@ -546,11 +539,11 @@ static void apply_variant(args_t *args, bcf1_t *rec) if ( !args->missing_allele ) return; ialt = -1; } - else + else ialt = bcf_gt_allele(ialt); } } - else if ( action==use_iupac ) + else if ( action==use_iupac ) { ialt = -1; int is_missing = 0, alen = 0, mlen = 0, fallback_alt = -1; @@ -719,7 +712,7 @@ static void apply_variant(args_t *args, bcf1_t *rec) fprintf(bcftools_stderr,"The site %s:%"PRId64" overlaps with another variant, skipping...\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); return; } - + } char *alt_allele = rec->d.allele[ialt]; @@ -745,7 +738,7 @@ static void apply_variant(args_t *args, bcf1_t *rec) } } } - if ( idx>=args->fa_buf.l ) + if ( idx>=args->fa_buf.l ) error("FIXME: %s:%"PRId64" .. 
idx=%d, ori_pos=%d, len=%"PRIu64", off=%d\n",bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1,idx,args->fa_ori_pos,(uint64_t)args->fa_buf.l,args->fa_mod_off); // sanity check the reference base @@ -805,8 +798,8 @@ static void apply_variant(args_t *args, bcf1_t *rec) if ( fail ) { char tmp = 0; - if ( args->fa_buf.l - idx > rec->rlen ) - { + if ( args->fa_buf.l - idx > rec->rlen ) + { tmp = args->fa_buf.s[idx+rec->rlen]; args->fa_buf.s[idx+rec->rlen] = 0; } @@ -822,7 +815,7 @@ static void apply_variant(args_t *args, bcf1_t *rec) alen = strlen(alt_allele); len_diff = alen - rec->rlen; - if ( args->mark_del && len_diff<0 ) + if ( args->mark_del && len_diff<0 ) { alt_allele = mark_del(rec->d.allele[0], rec->rlen, alt_allele, args->mark_del); alen = rec->rlen; @@ -835,7 +828,7 @@ static void apply_variant(args_t *args, bcf1_t *rec) alen = strlen(alt_allele); len_diff = alen - rec->rlen; - if ( args->mark_del && len_diff<0 ) + if ( args->mark_del && len_diff<0 ) { alt_allele = mark_del(rec->d.allele[0], rec->rlen, alt_allele, args->mark_del); alen = rec->rlen; @@ -951,10 +944,8 @@ static void consensus(args_t *args) if ( str.s[0]=='>' ) { // new sequence encountered - if (args->chain) { - print_chain(args); - destroy_chain(args); - } + if ( args->chain ) print_chain(args); + // apply all cached variants and variants that might have been missed because of short fasta (see test/consensus.9.*) bcf1_t **rec_ptr = NULL; while ( args->rid>=0 && (rec_ptr = next_vcf_line(args)) ) @@ -1028,11 +1019,7 @@ static void consensus(args_t *args) if ( args->fa_ori_pos + args->fa_buf.l - args->fa_mod_off <= rec->pos ) break; apply_variant(args, rec); } - if (args->chain) - { - print_chain(args); - destroy_chain(args); - } + if (args->chain) print_chain(args); if ( args->absent_allele ) apply_absent(args, HTS_POS_MAX); flush_fa_buffer(args, 0); bgzf_close(fasta); @@ -1080,6 +1067,8 @@ static void usage(args_t *args) fprintf(bcftools_stderr, " # in the form \">chr:from-to\".\n"); 
fprintf(bcftools_stderr, " samtools faidx ref.fa 8:11870-11890 | bcftools consensus in.vcf.gz > out.fa\n"); fprintf(bcftools_stderr, "\n"); + fprintf(bcftools_stderr, " # See also http://samtools.github.io/bcftools/howtos/consensus-sequence.html\n"); + fprintf(bcftools_stderr, "\n"); bcftools_exit(1); } @@ -1088,7 +1077,7 @@ int main_consensus(int argc, char *argv[]) args_t *args = (args_t*) calloc(1,sizeof(args_t)); args->argc = argc; args->argv = argv; - static struct option loptions[] = + static struct option loptions[] = { {"mark-del",required_argument,NULL,1}, {"mark-ins",required_argument,NULL,2}, @@ -1111,7 +1100,7 @@ int main_consensus(int argc, char *argv[]) int c; while ((c = getopt_long(argc, argv, "h?s:1Ii:e:H:f:o:m:c:M:p:a:",loptions,NULL)) >= 0) { - switch (c) + switch (c) { case 1 : args->mark_del = optarg[0]; break; case 2 : @@ -1128,10 +1117,10 @@ int main_consensus(int argc, char *argv[]) case 's': args->sample = optarg; break; case 'o': args->output_fname = optarg; break; case 'I': args->output_iupac = 1; break; - case 'e': + case 'e': if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; - case 'i': + case 'i': if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; case 'f': args->ref_fname = optarg; break; @@ -1141,12 +1130,12 @@ int main_consensus(int argc, char *argv[]) args->absent_allele = optarg[0]; if ( optarg[1]!=0 ) error("Expected single character with -a, got \"%s\"\n", optarg); break; - case 'M': - args->missing_allele = optarg[0]; + case 'M': + args->missing_allele = optarg[0]; if ( optarg[1]!=0 ) error("Expected single character with -M, got \"%s\"\n", optarg); break; case 'c': args->chain_fname = optarg; break; - case 'H': + case 'H': if ( !strcasecmp(optarg,"R") ) args->allele 
|= PICK_REF; else if ( !strcasecmp(optarg,"A") ) args->allele |= PICK_ALT; else if ( !strcasecmp(optarg,"L") ) args->allele |= PICK_LONG|PICK_REF; diff --git a/bcftools/convert.c b/bcftools/convert.c index 7fca60b..5317cb8 100644 --- a/bcftools/convert.c +++ b/bcftools/convert.c @@ -1,6 +1,6 @@ /* convert.c -- functions for converting between VCF/BCF and related formats. - Copyright (C) 2013-2021 Genome Research Ltd. + Copyright (C) 2013-2022 Genome Research Ltd. Author: Petr Danecek @@ -955,12 +955,12 @@ static void process_gt_to_hap(convert_t *convert, bcf1_t *line, fmt_t *fmt, int } else if ( bcf_gt_is_missing(ptr[0]) ) { - if ( ptr[1]==bcf_int8_vector_end ) + if ( ptr[1]==bcf_int8_vector_end ) { str->s[str->l++] = '?'; str->s[str->l++] = ' '; str->s[str->l++] = '-'; str->s[str->l++] = ' '; } - else - { + else + { str->s[str->l++] = '?'; str->s[str->l++] = ' '; str->s[str->l++] = '?'; str->s[str->l++] = ' '; } } @@ -1192,11 +1192,10 @@ static void process_pbinom(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isa } if ( n[0]==n[1] ) kputc(n[0]==0 ? '.':'0', str); - else + else { double pval = n[0] < n[1] ? kf_betai(n[1], n[0] + 1, 0.5) : kf_betai(n[0], n[1] + 1, 0.5); pval *= 2; - assert( pval-1 < 1e-10 ); if ( pval>=1 ) pval = 0; // this can happen, machine precision error, eg. 
kf_betai(1,0,0.5) else pval = -4.34294481903*log(pval); @@ -1356,12 +1355,12 @@ static char *parse_tag(convert_t *convert, char *p, int is_gtf) if ( !strcmp(str.s, "SAMPLE") ) register_tag(convert, "SAMPLE", is_gtf, T_SAMPLE); else if ( !strcmp(str.s, "GT") ) register_tag(convert, "GT", is_gtf, T_GT); else if ( !strcmp(str.s, "TGT") ) register_tag(convert, "GT", is_gtf, T_TGT); - else if ( !strcmp(str.s, "TBCSQ") ) + else if ( !strcmp(str.s, "TBCSQ") ) { fmt_t *fmt = register_tag(convert, "BCSQ", is_gtf, T_TBCSQ); fmt->subscript = parse_subscript(&q); if ( fmt->subscript==-1 ) - { + { if ( !strncmp(q,"{*}",3) ) { fmt->subscript = 0; q += 3; } } else fmt->subscript++; @@ -1408,7 +1407,7 @@ static char *parse_tag(convert_t *convert, char *p, int is_gtf) else { _SET_NON_FORMAT_TAGS(register_tag, str.s, convert, str.s, is_gtf) - else if ( !strcmp(str.s, "ALT") ) + else if ( !strcmp(str.s, "ALT") ) { fmt_t *fmt = register_tag(convert, str.s, is_gtf, T_ALT); fmt->subscript = parse_subscript(&q); @@ -1619,7 +1618,7 @@ int convert_line(convert_t *convert, bcf1_t *line, kstring_t *str) str->l = 0; for (i=0; infmt; i++) { - // Genotype fields. + // Genotype fields. if ( convert->fmt[i].is_gt_field ) { int j = i, js, k; @@ -1640,7 +1639,7 @@ int convert_line(convert_t *convert, bcf1_t *line, kstring_t *str) // anything to the string, we trim all genotype fields enclosed in square // brackets here. This may be changed in future, time will show... size_t l_start = str->l; - + for (k=i; kfmt[k].type == T_MASK ) @@ -1678,7 +1677,7 @@ int convert_set_option(convert_t *convert, enum convert_option opt, ...) 
va_list args; va_start(args, opt); - switch (opt) + switch (opt) { case allow_undef_tags: convert->allow_undef_tags = va_arg(args, int); diff --git a/bcftools/convert.c.pysam.c b/bcftools/convert.c.pysam.c index 86cf9e8..6b9e851 100644 --- a/bcftools/convert.c.pysam.c +++ b/bcftools/convert.c.pysam.c @@ -2,7 +2,7 @@ /* convert.c -- functions for converting between VCF/BCF and related formats. - Copyright (C) 2013-2021 Genome Research Ltd. + Copyright (C) 2013-2022 Genome Research Ltd. Author: Petr Danecek @@ -957,12 +957,12 @@ static void process_gt_to_hap(convert_t *convert, bcf1_t *line, fmt_t *fmt, int } else if ( bcf_gt_is_missing(ptr[0]) ) { - if ( ptr[1]==bcf_int8_vector_end ) + if ( ptr[1]==bcf_int8_vector_end ) { str->s[str->l++] = '?'; str->s[str->l++] = ' '; str->s[str->l++] = '-'; str->s[str->l++] = ' '; } - else - { + else + { str->s[str->l++] = '?'; str->s[str->l++] = ' '; str->s[str->l++] = '?'; str->s[str->l++] = ' '; } } @@ -1194,11 +1194,10 @@ static void process_pbinom(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isa } if ( n[0]==n[1] ) kputc(n[0]==0 ? '.':'0', str); - else + else { double pval = n[0] < n[1] ? kf_betai(n[1], n[0] + 1, 0.5) : kf_betai(n[0], n[1] + 1, 0.5); pval *= 2; - assert( pval-1 < 1e-10 ); if ( pval>=1 ) pval = 0; // this can happen, machine precision error, eg. 
kf_betai(1,0,0.5) else pval = -4.34294481903*log(pval); @@ -1358,12 +1357,12 @@ static char *parse_tag(convert_t *convert, char *p, int is_gtf) if ( !strcmp(str.s, "SAMPLE") ) register_tag(convert, "SAMPLE", is_gtf, T_SAMPLE); else if ( !strcmp(str.s, "GT") ) register_tag(convert, "GT", is_gtf, T_GT); else if ( !strcmp(str.s, "TGT") ) register_tag(convert, "GT", is_gtf, T_TGT); - else if ( !strcmp(str.s, "TBCSQ") ) + else if ( !strcmp(str.s, "TBCSQ") ) { fmt_t *fmt = register_tag(convert, "BCSQ", is_gtf, T_TBCSQ); fmt->subscript = parse_subscript(&q); if ( fmt->subscript==-1 ) - { + { if ( !strncmp(q,"{*}",3) ) { fmt->subscript = 0; q += 3; } } else fmt->subscript++; @@ -1410,7 +1409,7 @@ static char *parse_tag(convert_t *convert, char *p, int is_gtf) else { _SET_NON_FORMAT_TAGS(register_tag, str.s, convert, str.s, is_gtf) - else if ( !strcmp(str.s, "ALT") ) + else if ( !strcmp(str.s, "ALT") ) { fmt_t *fmt = register_tag(convert, str.s, is_gtf, T_ALT); fmt->subscript = parse_subscript(&q); @@ -1621,7 +1620,7 @@ int convert_line(convert_t *convert, bcf1_t *line, kstring_t *str) str->l = 0; for (i=0; infmt; i++) { - // Genotype fields. + // Genotype fields. if ( convert->fmt[i].is_gt_field ) { int j = i, js, k; @@ -1642,7 +1641,7 @@ int convert_line(convert_t *convert, bcf1_t *line, kstring_t *str) // anything to the string, we trim all genotype fields enclosed in square // brackets here. This may be changed in future, time will show... size_t l_start = str->l; - + for (k=i; kfmt[k].type == T_MASK ) @@ -1680,7 +1679,7 @@ int convert_set_option(convert_t *convert, enum convert_option opt, ...) 
va_list args; va_start(args, opt); - switch (opt) + switch (opt) { case allow_undef_tags: convert->allow_undef_tags = va_arg(args, int); diff --git a/bcftools/filter.c b/bcftools/filter.c index 7ff006e..d545608 100644 --- a/bcftools/filter.c +++ b/bcftools/filter.c @@ -144,12 +144,13 @@ struct _filter_t #define TOK_sMEDIAN 35 #define TOK_sSTDEV 36 #define TOK_sSUM 37 -#define TOK_IN 38 // contains, e.g. FILTER~"A" -#define TOK_NOT_IN 39 // does not contain, e.g. FILTER!~"A" +#define TOK_IN 38 // contains, e.g. FILTER~"A" +#define TOK_NOT_IN 39 // does not contain, e.g. FILTER!~"A" +#define TOK_MODULO 40 // % -// 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 -// ( ) [ < = > ] ! | & + - * / M m a A O ~ ^ S . l f c p b P i s -static int op_prec[] = {0,1,1,5,5,5,5,5,5,2,3, 6, 6, 7, 7, 8, 8, 8, 3, 2, 5, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8 }; +// 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 +// ( ) [ < = > ] ! | & + - * / M m a A O ~ ^ S . 
l f c p b P i s % +static int op_prec[] = {0,1,1,5,5,5,5,5,5,2,3, 6, 6, 7, 7, 8, 8, 8, 3, 2, 5, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7 }; #define TOKEN_STRING "x()[<=>]!|&+-*/MmaAO~^S.lfcpis" // this is only for debugging, not maintained diligently static void cmp_vector_strings(token_t *atok, token_t *btok, token_t *rtok); @@ -240,9 +241,10 @@ static int filters_next_token(char **str, int *len) if ( tmp[0]=='-' ) break; if ( tmp[0]=='/' ) break; if ( tmp[0]=='~' ) break; + if ( tmp[0]=='%' ) break; } if ( tmp[0]==']' ) { if (square_brackets) tmp++; break; } - if ( tmp[0]=='[' ) square_brackets++; + if ( tmp[0]=='[' ) square_brackets++; tmp++; } if ( tmp > *str ) @@ -290,6 +292,7 @@ static int filters_next_token(char **str, int *len) if ( tmp[0]=='*' ) { (*str) += 1; return TOK_MULT; } if ( tmp[0]=='/' ) { (*str) += 1; return TOK_DIV; } if ( tmp[0]=='~' ) { (*str) += 1; return TOK_LIKE; } + if ( tmp[0]=='%' ) { (*str) += 1; return TOK_MODULO; } *len = tmp - (*str); return TOK_VAL; @@ -298,7 +301,7 @@ static int filters_next_token(char **str, int *len) /* Simple path expansion, expands ~/, ~user, $var. The result must be freed by the caller. - + Based on jkb's staden code with some adjustments. https://sourceforge.net/p/staden/code/HEAD/tree/staden/trunk/src/Misc/getfile.c#l123 */ @@ -420,54 +423,81 @@ static void filters_cmp_bit_and(token_t *atok, token_t *btok, token_t *rtok, bcf } static void filters_cmp_filter(token_t *atok, token_t *btok, token_t *rtok, bcf1_t *line) { - int i; - if ( rtok->tok_type==TOK_NOT_IN ) + // the btok values contain FILTER ids obtained by parsing the user expression + int i,j; + if ( rtok->tok_type==TOK_NOT_IN ) // fail if the query expression is a subset of the VCF FILTER { - if ( !line->d.n_flt ) + if ( !btok->nvalues ) // the query expression is ".", pass everything unless the VCF is also "." 
+ { + if ( line->d.n_flt ) rtok->pass_site = 1; + return; + } + if ( !line->d.n_flt ) // no filters at this VCF line and the query expression has a value { - if ( atok->hdr_id==-1 ) return; // missing value rtok->pass_site = 1; - return; // no filter present, eval to true + return; } - for (i=0; id.n_flt; i++) - if ( atok->hdr_id==line->d.flt[i] ) return; - rtok->pass_site = 1; + for (j=0; jnvalues; j++) // some query expression value must be absent from VCF in order to pass + { + for (i=0; id.n_flt; i++) + if ( btok->values[j]==line->d.flt[i] ) break; + if ( i==line->d.n_flt ) break; // the query is not in the VCF + } + if ( j!=btok->nvalues ) rtok->pass_site = 1; return; } else if ( rtok->tok_type==TOK_IN ) { - if ( !line->d.n_flt ) + if ( !btok->nvalues ) // the query expression is ".", fail everything unless the VCF is also "." { - if ( atok->hdr_id==-1 ) { rtok->pass_site = 1; return; } - return; // no filter present, eval to false + if ( !line->d.n_flt ) rtok->pass_site = 1; + return; + } + if ( !line->d.n_flt ) return; // no filters at this VCF line and the query expression has a value + for (j=0; jnvalues; j++) // all of the query values must be present in the VCF in order to pass + { + for (i=0; id.n_flt; i++) + if ( btok->values[j]==line->d.flt[i] ) break; + if ( i==line->d.n_flt ) break; // the query is not in the VCF } - for (i=0; id.n_flt; i++) - if ( atok->hdr_id==line->d.flt[i] ) { rtok->pass_site = 1; return; } + if ( j==btok->nvalues ) rtok->pass_site = 1; return; } - else if ( rtok->tok_type==TOK_NE ) // exact match + else if ( rtok->tok_type==TOK_NE ) // require anything but exact match { - if ( !line->d.n_flt ) + if ( btok->nvalues != line->d.n_flt ) { - if ( atok->hdr_id==-1 ) return; // missing value rtok->pass_site = 1; - return; // no filter present, eval to true + return; } - if ( line->d.n_flt==1 && atok->hdr_id==line->d.flt[0] ) return; // exact match, fail iff a single matching value is present - rtok->pass_site = 1; + if ( 
!btok->nvalues ) return; + for (j=0; jnvalues; j++) // some of the query values must be absent from the VCF in order to pass + { + for (i=0; id.n_flt; i++) + if ( btok->values[j]==line->d.flt[i] ) break; + if ( i==line->d.n_flt ) break; // the query is not in the VCF + } + if ( j!=btok->nvalues ) rtok->pass_site = 1; return; } - else if ( rtok->tok_type==TOK_EQ ) // exact match, pass iff a single matching value is present + else if ( rtok->tok_type==TOK_EQ ) // require exact match { - if ( !line->d.n_flt ) + if ( btok->nvalues != line->d.n_flt ) return; + if ( !btok->nvalues ) + { + rtok->pass_site = 1; + return; + } + for (j=0; jnvalues; j++) // all of the query values must be present in the VCF in order to pass { - if ( atok->hdr_id==-1 ) { rtok->pass_site = 1; return; } - return; // no filter present, eval to false + for (i=0; id.n_flt; i++) + if ( btok->values[j]==line->d.flt[i] ) break; + if ( i==line->d.n_flt ) break; // the query is not in the VCF } - if ( line->d.n_flt==1 && atok->hdr_id==line->d.flt[0] ) rtok->pass_site = 1; + if ( j==btok->nvalues ) rtok->pass_site = 1; return; } - else + else error("Only ==, !=, ~, and !~ operators are supported for FILTER\n"); return; } @@ -490,7 +520,7 @@ static void filters_cmp_id(token_t *atok, token_t *btok, token_t *rtok, bcf1_t * if ( !btok->str_value.l ) error("Error occurred while evaluating the expression\n"); - if ( rtok->tok_type==TOK_EQ ) + if ( rtok->tok_type==TOK_EQ ) rtok->pass_site = strcmp(btok->str_value.s,line->d.id) ? 0 : 1; else if ( rtok->tok_type==TOK_NE ) rtok->pass_site = strcmp(btok->str_value.s,line->d.id) ? 1 : 0; @@ -886,7 +916,7 @@ static void _filters_set_genotype(filter_t *flt, bcf1_t *line, token_t *tok, int tok->nvalues = tok->str_value.l = 0; return; } - + int i,j, nsmpl = bcf_hdr_nsamples(flt->hdr), nvals1 = type==2 ? 
3 : 4; if ( tok->str_value.m <= nvals1*nsmpl ) { @@ -1072,7 +1102,7 @@ static void filters_set_nmissing(filter_t *flt, bcf1_t *line, token_t *tok) tok->nvalues = 0; return; } - + int j,nmissing = 0; #define BRANCH(type_t, is_vector_end) { \ for (i=0; in_sample; i++) \ @@ -1157,7 +1187,7 @@ static void filters_set_ac(filter_t *flt, bcf1_t *line, token_t *tok) static void filters_set_an(filter_t *flt, bcf1_t *line, token_t *tok) { filters_set_ac(flt,line,tok); - tok->values[0] = tok->nvalues ? flt->tmpi[0] : 0; + tok->values[0] = tok->nvalues ? flt->tmpi[0] : 0; tok->nvalues = 1; } static void filters_set_mac(filter_t *flt, bcf1_t *line, token_t *tok) @@ -1729,7 +1759,6 @@ static inline double calc_binom(int na, int nb) double pval = na < nb ? kf_betai(nb, na + 1, 0.5) : kf_betai(na, nb + 1, 0.5); pval *= 2; - assert( pval-1 < 1e-10 ); if ( pval>1 ) pval = 1; // this can happen, machine precision error, eg. kf_betai(1,0,0.5) return pval; @@ -1928,7 +1957,7 @@ inline static void tok_init_samples(token_t *atok, token_t *btok, token_t *rtok) if ( (atok->nsamples || btok->nsamples) && !rtok->nsamples ) { rtok->nsamples = atok->nsamples ? 
atok->nsamples : btok->nsamples; - rtok->usmpl = (uint8_t*) calloc(rtok->nsamples,1); + rtok->usmpl = (uint8_t*) calloc(rtok->nsamples,1); int i; for (i=0; insamples; i++) rtok->usmpl[i] |= atok->usmpl[i]; for (i=0; insamples; i++) rtok->usmpl[i] |= btok->usmpl[i]; @@ -1937,7 +1966,7 @@ inline static void tok_init_samples(token_t *atok, token_t *btok, token_t *rtok) memset(rtok->pass_samples, 0, rtok->nsamples); } -#define VECTOR_ARITHMETICS(atok,btok,_rtok,AOP) \ +#define VECTOR_ARITHMETICS(atok,btok,_rtok,AOP,TYPE) \ { \ token_t *rtok = _rtok; \ int i, has_values = 0; \ @@ -1956,7 +1985,7 @@ inline static void tok_init_samples(token_t *atok, token_t *btok, token_t *rtok) continue; \ } \ has_values = 1; \ - rtok->values[i] = atok->values[i] AOP btok->values[i]; \ + rtok->values[i] = TYPE atok->values[i] AOP TYPE btok->values[i]; \ } \ } \ else if ( atok->nsamples ) \ @@ -1972,7 +2001,7 @@ inline static void tok_init_samples(token_t *atok, token_t *btok, token_t *rtok) continue; \ } \ has_values = 1; \ - rtok->values[i] = atok->values[i] AOP btok->values[0]; \ + rtok->values[i] = TYPE atok->values[i] AOP TYPE btok->values[0]; \ } \ } \ } \ @@ -1989,7 +2018,7 @@ inline static void tok_init_samples(token_t *atok, token_t *btok, token_t *rtok) continue; \ } \ has_values = 1; \ - rtok->values[i] = atok->values[0] AOP btok->values[i]; \ + rtok->values[i] = TYPE atok->values[0] AOP TYPE btok->values[i]; \ } \ } \ } \ @@ -2077,7 +2106,7 @@ static int vector_logic_and(filter_t *filter, bcf1_t *line, token_t *rtok, token for (i=0; insamples; i++) { if ( !rtok->usmpl[i] ) continue; - rtok->pass_samples[i] = tok->pass_samples[i]; + rtok->pass_samples[i] = tok->pass_samples[i]; } rtok->pass_site = 1; return 2; @@ -2355,7 +2384,7 @@ static void cmp_vector_strings(token_t *atok, token_t *btok, token_t *rtok) return; } if ( !regex ) - rtok->pass_site = _match_vector_strings(atok->str_value.s, atok->str_value.l, btok->str_value.s, btok->str_value.l, logic, missing_logic); + 
rtok->pass_site = _match_vector_strings(atok->str_value.s, atok->str_value.l, btok->str_value.s, btok->str_value.l, logic, missing_logic); else { token_t *tok = atok->regex ? btok : atok; @@ -2370,7 +2399,7 @@ static void cmp_vector_strings(token_t *atok, token_t *btok, token_t *rtok) { if ( missing_logic[2] ) { - for (i=0; insamples; i++) + for (i=0; insamples; i++) if ( rtok->usmpl[i] ) { rtok->pass_samples[i] = missing_logic[2]; rtok->pass_site = 1; } } return; @@ -2395,7 +2424,7 @@ static void cmp_vector_strings(token_t *atok, token_t *btok, token_t *rtok) return; } - // The case of (!atok->nsamples || !btok->nsamples) && (atok->nvalues && btok->nvalues) + // The case of (!atok->nsamples || !btok->nsamples) && (atok->nvalues && btok->nvalues) token_t *xtok = atok->nsamples ? atok : btok; token_t *ytok = atok->nsamples ? btok : atok; assert( regex==ytok->regex ); @@ -2499,7 +2528,7 @@ static void parse_tag_idx(bcf_hdr_t *hdr, int is_fmt, char *tag, char *tag_idx, if ( !list ) error("Could not read: %s\n", fname); free(fname); tok->nsamples = bcf_hdr_nsamples(hdr); - tok->usmpl = (uint8_t*) calloc(tok->nsamples,1); + tok->usmpl = (uint8_t*) calloc(tok->nsamples,1); for (i=0; iidxs = idxs1; + tok->idxs = idxs1; tok->nidxs = nidxs1; tok->idx = idx1; } @@ -2556,7 +2585,7 @@ static void parse_tag_idx(bcf_hdr_t *hdr, int is_fmt, char *tag, char *tag_idx, if ( set_samples ) { tok->nsamples = bcf_hdr_nsamples(hdr); - tok->usmpl = (uint8_t*) calloc(tok->nsamples,1); + tok->usmpl = (uint8_t*) calloc(tok->nsamples,1); if ( idx1>=0 ) { if ( idx1 >= bcf_hdr_nsamples(hdr) ) error("The sample index is too large: %s\n", ori); @@ -2720,6 +2749,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok) { tok->tok_type = TOK_VAL; tok->threshold = bcf_hdr_nsamples(filter->hdr); + tok->is_constant = 1; return 0; } else if ( !strncasecmp(str,"N_MISSING",len) ) @@ -2760,13 +2790,13 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok) } 
if ( is_fmt==-1 ) is_fmt = 0; } - if ( is_array ) + if ( is_array ) parse_tag_idx(filter->hdr, is_fmt, tmp.s, tmp.s+is_array, tok); - else if ( is_fmt && !tok->nsamples ) + else if ( is_fmt && !tok->nsamples ) { int i; tok->nsamples = bcf_hdr_nsamples(filter->hdr); - tok->usmpl = (uint8_t*) malloc(tok->nsamples); + tok->usmpl = (uint8_t*) malloc(tok->nsamples); for (i=0; insamples; i++) tok->usmpl[i] = 1; } @@ -2817,7 +2847,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok) case BCF_HT_STR: tok->setter = &filters_set_info_string; tok->is_str = 1; break; default: error("[%s:%d %s] FIXME\n", __FILE__,__LINE__,__FUNCTION__); } - if (!is_array) + if (!is_array) { tok->idx = -2; tok->idxs = (int*) malloc(sizeof(int)); @@ -3015,7 +3045,7 @@ static void perl_init(filter_t *filter, char **str) char **env = NULL; PERL_SYS_INIT3(&argc, &argv, &env); } - + filter->perl = perl_alloc(); PerlInterpreter *perl = filter->perl; @@ -3114,6 +3144,7 @@ filter_t *filter_init(bcf_hdr_t *hdr, const char *str) tok->hdr_id = -1; tok->pass_site = -1; tok->threshold = -1.0; + tok->is_constant = 1; ret = TOK_MULT; } else if ( ret == -TOK_FUNC ) @@ -3328,6 +3359,7 @@ filter_t *filter_init(bcf_hdr_t *hdr, const char *str) else if ( !strcasecmp(out[ival].key,"overlap") ) { out[ival].threshold = VCF_OVERLAP<<1; out[ival].is_str = 0; } else if ( !strcasecmp(out[ival].key,"ref") ) { out[ival].threshold = 1; out[ival].is_str = 0; } else error("The type \"%s\" not recognised: %s\n", out[ival].key, filter->str); + out[ival].is_constant = 1; if ( out[itok].tok_type==TOK_LIKE || out[itok].tok_type==TOK_NLIKE ) out[itok].comparator = filters_cmp_bit_and; out[ival].tag = out[ival].key; out[ival].key = NULL; i = itok; @@ -3373,16 +3405,34 @@ filter_t *filter_init(bcf_hdr_t *hdr, const char *str) else error("[%s:%d %s] Could not parse the expression: %s\n", __FILE__,__LINE__,__FUNCTION__, filter->str); if ( out[ival].tok_type!=TOK_VAL || !out[ival].key ) error("[%s:%d %s] 
Could not parse the expression, an unquoted string value perhaps? %s\n", __FILE__,__LINE__,__FUNCTION__, filter->str); - if ( strcmp(".",out[ival].key) ) + token_t *tok = &out[ival]; + char *bp = tok->key; + tok->nvalues = 0; + int has_missing = 0; + while ( *bp ) { - out[ival].hdr_id = bcf_hdr_id2int(filter->hdr, BCF_DT_ID, out[ival].key); - if ( !bcf_hdr_idinfo_exists(filter->hdr,BCF_HL_FLT,out[ival].hdr_id) ) - error("The filter \"%s\" not present in the VCF header\n", out[ival].key); + char tmp, *ep = bp; + while ( *ep && *ep!=';' ) ep++; + tmp = *ep; + *ep = 0; + if ( !strcmp(".",bp) ) has_missing = 1; + else + { + tok->nvalues++; + hts_expand(double,tok->nvalues,tok->mvalues,tok->values); + int id = bcf_hdr_id2int(filter->hdr, BCF_DT_ID, bp); + if ( !bcf_hdr_idinfo_exists(filter->hdr,BCF_HL_FLT,id) ) + error("The filter \"%s\" not present in the VCF header\n", bp); + tok->values[tok->nvalues-1] = id; + } + *ep = tmp; + if ( !tmp ) break; + bp = ep + 1; } - else - out[ival].hdr_id = -1; - out[ival].tag = out[ival].key; out[ival].key = NULL; - out[itok].hdr_id = out[ival].hdr_id; + if ( has_missing && tok->nvalues ) error("The FILTER expression cannot contain missing value AND filters: \"%s\" (%d)\n",tok->key,tok->nvalues); + out[ival].tag = tok->key; + tok->key = NULL; + out[itok].hdr_id = tok->hdr_id; continue; } } @@ -3473,7 +3523,7 @@ int filter_test(filter_t *filter, bcf1_t *line, const uint8_t **samples) kputs(filter->filters[i].key, &filter->filters[i].str_value); filter->filters[i].nvalues = filter->filters[i].str_value.l; } - else // numeric constant + else if ( filter->filters[i].is_constant ) // numeric constant { filter->filters[i].values[0] = filter->filters[i].threshold; filter->filters[i].nvalues = 1; @@ -3495,28 +3545,35 @@ int filter_test(filter_t *filter, bcf1_t *line, const uint8_t **samples) if ( filter->filters[i].tok_type == TOK_ADD ) { - VECTOR_ARITHMETICS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],+); + 
VECTOR_ARITHMETICS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],+,(double)); filter->flt_stack[nstack-2] = &filter->filters[i]; nstack--; continue; } else if ( filter->filters[i].tok_type == TOK_SUB ) { - VECTOR_ARITHMETICS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],-); + VECTOR_ARITHMETICS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],-,(double)); filter->flt_stack[nstack-2] = &filter->filters[i]; nstack--; continue; } else if ( filter->filters[i].tok_type == TOK_MULT ) { - VECTOR_ARITHMETICS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],*); + VECTOR_ARITHMETICS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],*,(double)); filter->flt_stack[nstack-2] = &filter->filters[i]; nstack--; continue; } else if ( filter->filters[i].tok_type == TOK_DIV ) { - VECTOR_ARITHMETICS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],/); + VECTOR_ARITHMETICS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],/,(double)); + filter->flt_stack[nstack-2] = &filter->filters[i]; + nstack--; + continue; + } + else if ( filter->filters[i].tok_type == TOK_MODULO ) + { + VECTOR_ARITHMETICS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],%,(int)); filter->flt_stack[nstack-2] = &filter->filters[i]; nstack--; continue; diff --git a/bcftools/filter.c.pysam.c b/bcftools/filter.c.pysam.c index 3335cde..d15586c 100644 --- a/bcftools/filter.c.pysam.c +++ b/bcftools/filter.c.pysam.c @@ -146,12 +146,13 @@ struct _filter_t #define TOK_sMEDIAN 35 #define TOK_sSTDEV 36 #define TOK_sSUM 37 -#define TOK_IN 38 // contains, e.g. FILTER~"A" -#define TOK_NOT_IN 39 // does not contain, e.g. FILTER!~"A" +#define TOK_IN 38 // contains, e.g. FILTER~"A" +#define TOK_NOT_IN 39 // does not contain, e.g. 
FILTER!~"A" +#define TOK_MODULO 40 // % -// 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 -// ( ) [ < = > ] ! | & + - * / M m a A O ~ ^ S . l f c p b P i s -static int op_prec[] = {0,1,1,5,5,5,5,5,5,2,3, 6, 6, 7, 7, 8, 8, 8, 3, 2, 5, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8 }; +// 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 +// ( ) [ < = > ] ! | & + - * / M m a A O ~ ^ S . l f c p b P i s % +static int op_prec[] = {0,1,1,5,5,5,5,5,5,2,3, 6, 6, 7, 7, 8, 8, 8, 3, 2, 5, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7 }; #define TOKEN_STRING "x()[<=>]!|&+-*/MmaAO~^S.lfcpis" // this is only for debugging, not maintained diligently static void cmp_vector_strings(token_t *atok, token_t *btok, token_t *rtok); @@ -242,9 +243,10 @@ static int filters_next_token(char **str, int *len) if ( tmp[0]=='-' ) break; if ( tmp[0]=='/' ) break; if ( tmp[0]=='~' ) break; + if ( tmp[0]=='%' ) break; } if ( tmp[0]==']' ) { if (square_brackets) tmp++; break; } - if ( tmp[0]=='[' ) square_brackets++; + if ( tmp[0]=='[' ) square_brackets++; tmp++; } if ( tmp > *str ) @@ -292,6 +294,7 @@ static int filters_next_token(char **str, int *len) if ( tmp[0]=='*' ) { (*str) += 1; return TOK_MULT; } if ( tmp[0]=='/' ) { (*str) += 1; return TOK_DIV; } if ( tmp[0]=='~' ) { (*str) += 1; return TOK_LIKE; } + if ( tmp[0]=='%' ) { (*str) += 1; return TOK_MODULO; } *len = tmp - (*str); return TOK_VAL; @@ -300,7 +303,7 @@ static int filters_next_token(char **str, int *len) /* Simple path expansion, expands ~/, ~user, $var. The result must be freed by the caller. - + Based on jkb's staden code with some adjustments. 
https://sourceforge.net/p/staden/code/HEAD/tree/staden/trunk/src/Misc/getfile.c#l123 */ @@ -422,54 +425,81 @@ static void filters_cmp_bit_and(token_t *atok, token_t *btok, token_t *rtok, bcf } static void filters_cmp_filter(token_t *atok, token_t *btok, token_t *rtok, bcf1_t *line) { - int i; - if ( rtok->tok_type==TOK_NOT_IN ) + // the btok values contain FILTER ids obtained by parsing the user expression + int i,j; + if ( rtok->tok_type==TOK_NOT_IN ) // fail if the query expression is a subset of the VCF FILTER { - if ( !line->d.n_flt ) + if ( !btok->nvalues ) // the query expression is ".", pass everything unless the VCF is also "." + { + if ( line->d.n_flt ) rtok->pass_site = 1; + return; + } + if ( !line->d.n_flt ) // no filters at this VCF line and the query expression has a value { - if ( atok->hdr_id==-1 ) return; // missing value rtok->pass_site = 1; - return; // no filter present, eval to true + return; } - for (i=0; id.n_flt; i++) - if ( atok->hdr_id==line->d.flt[i] ) return; - rtok->pass_site = 1; + for (j=0; jnvalues; j++) // some query expression value must be absent from VCF in order to pass + { + for (i=0; id.n_flt; i++) + if ( btok->values[j]==line->d.flt[i] ) break; + if ( i==line->d.n_flt ) break; // the query is not in the VCF + } + if ( j!=btok->nvalues ) rtok->pass_site = 1; return; } else if ( rtok->tok_type==TOK_IN ) { - if ( !line->d.n_flt ) + if ( !btok->nvalues ) // the query expression is ".", fail everything unless the VCF is also "." 
{ - if ( atok->hdr_id==-1 ) { rtok->pass_site = 1; return; } - return; // no filter present, eval to false + if ( !line->d.n_flt ) rtok->pass_site = 1; + return; + } + if ( !line->d.n_flt ) return; // no filters at this VCF line and the query expression has a value + for (j=0; jnvalues; j++) // all of the query values must be present in the VCF in order to pass + { + for (i=0; id.n_flt; i++) + if ( btok->values[j]==line->d.flt[i] ) break; + if ( i==line->d.n_flt ) break; // the query is not in the VCF } - for (i=0; id.n_flt; i++) - if ( atok->hdr_id==line->d.flt[i] ) { rtok->pass_site = 1; return; } + if ( j==btok->nvalues ) rtok->pass_site = 1; return; } - else if ( rtok->tok_type==TOK_NE ) // exact match + else if ( rtok->tok_type==TOK_NE ) // require anything but exact match { - if ( !line->d.n_flt ) + if ( btok->nvalues != line->d.n_flt ) { - if ( atok->hdr_id==-1 ) return; // missing value rtok->pass_site = 1; - return; // no filter present, eval to true + return; } - if ( line->d.n_flt==1 && atok->hdr_id==line->d.flt[0] ) return; // exact match, fail iff a single matching value is present - rtok->pass_site = 1; + if ( !btok->nvalues ) return; + for (j=0; jnvalues; j++) // some of the query values must be absent from the VCF in order to pass + { + for (i=0; id.n_flt; i++) + if ( btok->values[j]==line->d.flt[i] ) break; + if ( i==line->d.n_flt ) break; // the query is not in the VCF + } + if ( j!=btok->nvalues ) rtok->pass_site = 1; return; } - else if ( rtok->tok_type==TOK_EQ ) // exact match, pass iff a single matching value is present + else if ( rtok->tok_type==TOK_EQ ) // require exact match { - if ( !line->d.n_flt ) + if ( btok->nvalues != line->d.n_flt ) return; + if ( !btok->nvalues ) + { + rtok->pass_site = 1; + return; + } + for (j=0; jnvalues; j++) // all of the query values must be present in the VCF in order to pass { - if ( atok->hdr_id==-1 ) { rtok->pass_site = 1; return; } - return; // no filter present, eval to false + for (i=0; id.n_flt; i++) 
+ if ( btok->values[j]==line->d.flt[i] ) break; + if ( i==line->d.n_flt ) break; // the query is not in the VCF } - if ( line->d.n_flt==1 && atok->hdr_id==line->d.flt[0] ) rtok->pass_site = 1; + if ( j==btok->nvalues ) rtok->pass_site = 1; return; } - else + else error("Only ==, !=, ~, and !~ operators are supported for FILTER\n"); return; } @@ -492,7 +522,7 @@ static void filters_cmp_id(token_t *atok, token_t *btok, token_t *rtok, bcf1_t * if ( !btok->str_value.l ) error("Error occurred while evaluating the expression\n"); - if ( rtok->tok_type==TOK_EQ ) + if ( rtok->tok_type==TOK_EQ ) rtok->pass_site = strcmp(btok->str_value.s,line->d.id) ? 0 : 1; else if ( rtok->tok_type==TOK_NE ) rtok->pass_site = strcmp(btok->str_value.s,line->d.id) ? 1 : 0; @@ -888,7 +918,7 @@ static void _filters_set_genotype(filter_t *flt, bcf1_t *line, token_t *tok, int tok->nvalues = tok->str_value.l = 0; return; } - + int i,j, nsmpl = bcf_hdr_nsamples(flt->hdr), nvals1 = type==2 ? 3 : 4; if ( tok->str_value.m <= nvals1*nsmpl ) { @@ -1074,7 +1104,7 @@ static void filters_set_nmissing(filter_t *flt, bcf1_t *line, token_t *tok) tok->nvalues = 0; return; } - + int j,nmissing = 0; #define BRANCH(type_t, is_vector_end) { \ for (i=0; in_sample; i++) \ @@ -1159,7 +1189,7 @@ static void filters_set_ac(filter_t *flt, bcf1_t *line, token_t *tok) static void filters_set_an(filter_t *flt, bcf1_t *line, token_t *tok) { filters_set_ac(flt,line,tok); - tok->values[0] = tok->nvalues ? flt->tmpi[0] : 0; + tok->values[0] = tok->nvalues ? flt->tmpi[0] : 0; tok->nvalues = 1; } static void filters_set_mac(filter_t *flt, bcf1_t *line, token_t *tok) @@ -1731,7 +1761,6 @@ static inline double calc_binom(int na, int nb) double pval = na < nb ? kf_betai(nb, na + 1, 0.5) : kf_betai(na, nb + 1, 0.5); pval *= 2; - assert( pval-1 < 1e-10 ); if ( pval>1 ) pval = 1; // this can happen, machine precision error, eg. 
kf_betai(1,0,0.5) return pval; @@ -1930,7 +1959,7 @@ inline static void tok_init_samples(token_t *atok, token_t *btok, token_t *rtok) if ( (atok->nsamples || btok->nsamples) && !rtok->nsamples ) { rtok->nsamples = atok->nsamples ? atok->nsamples : btok->nsamples; - rtok->usmpl = (uint8_t*) calloc(rtok->nsamples,1); + rtok->usmpl = (uint8_t*) calloc(rtok->nsamples,1); int i; for (i=0; insamples; i++) rtok->usmpl[i] |= atok->usmpl[i]; for (i=0; insamples; i++) rtok->usmpl[i] |= btok->usmpl[i]; @@ -1939,7 +1968,7 @@ inline static void tok_init_samples(token_t *atok, token_t *btok, token_t *rtok) memset(rtok->pass_samples, 0, rtok->nsamples); } -#define VECTOR_ARITHMETICS(atok,btok,_rtok,AOP) \ +#define VECTOR_ARITHMETICS(atok,btok,_rtok,AOP,TYPE) \ { \ token_t *rtok = _rtok; \ int i, has_values = 0; \ @@ -1958,7 +1987,7 @@ inline static void tok_init_samples(token_t *atok, token_t *btok, token_t *rtok) continue; \ } \ has_values = 1; \ - rtok->values[i] = atok->values[i] AOP btok->values[i]; \ + rtok->values[i] = TYPE atok->values[i] AOP TYPE btok->values[i]; \ } \ } \ else if ( atok->nsamples ) \ @@ -1974,7 +2003,7 @@ inline static void tok_init_samples(token_t *atok, token_t *btok, token_t *rtok) continue; \ } \ has_values = 1; \ - rtok->values[i] = atok->values[i] AOP btok->values[0]; \ + rtok->values[i] = TYPE atok->values[i] AOP TYPE btok->values[0]; \ } \ } \ } \ @@ -1991,7 +2020,7 @@ inline static void tok_init_samples(token_t *atok, token_t *btok, token_t *rtok) continue; \ } \ has_values = 1; \ - rtok->values[i] = atok->values[0] AOP btok->values[i]; \ + rtok->values[i] = TYPE atok->values[0] AOP TYPE btok->values[i]; \ } \ } \ } \ @@ -2079,7 +2108,7 @@ static int vector_logic_and(filter_t *filter, bcf1_t *line, token_t *rtok, token for (i=0; insamples; i++) { if ( !rtok->usmpl[i] ) continue; - rtok->pass_samples[i] = tok->pass_samples[i]; + rtok->pass_samples[i] = tok->pass_samples[i]; } rtok->pass_site = 1; return 2; @@ -2357,7 +2386,7 @@ static void 
cmp_vector_strings(token_t *atok, token_t *btok, token_t *rtok) return; } if ( !regex ) - rtok->pass_site = _match_vector_strings(atok->str_value.s, atok->str_value.l, btok->str_value.s, btok->str_value.l, logic, missing_logic); + rtok->pass_site = _match_vector_strings(atok->str_value.s, atok->str_value.l, btok->str_value.s, btok->str_value.l, logic, missing_logic); else { token_t *tok = atok->regex ? btok : atok; @@ -2372,7 +2401,7 @@ static void cmp_vector_strings(token_t *atok, token_t *btok, token_t *rtok) { if ( missing_logic[2] ) { - for (i=0; insamples; i++) + for (i=0; insamples; i++) if ( rtok->usmpl[i] ) { rtok->pass_samples[i] = missing_logic[2]; rtok->pass_site = 1; } } return; @@ -2397,7 +2426,7 @@ static void cmp_vector_strings(token_t *atok, token_t *btok, token_t *rtok) return; } - // The case of (!atok->nsamples || !btok->nsamples) && (atok->nvalues && btok->nvalues) + // The case of (!atok->nsamples || !btok->nsamples) && (atok->nvalues && btok->nvalues) token_t *xtok = atok->nsamples ? atok : btok; token_t *ytok = atok->nsamples ? 
btok : atok; assert( regex==ytok->regex ); @@ -2501,7 +2530,7 @@ static void parse_tag_idx(bcf_hdr_t *hdr, int is_fmt, char *tag, char *tag_idx, if ( !list ) error("Could not read: %s\n", fname); free(fname); tok->nsamples = bcf_hdr_nsamples(hdr); - tok->usmpl = (uint8_t*) calloc(tok->nsamples,1); + tok->usmpl = (uint8_t*) calloc(tok->nsamples,1); for (i=0; iidxs = idxs1; + tok->idxs = idxs1; tok->nidxs = nidxs1; tok->idx = idx1; } @@ -2558,7 +2587,7 @@ static void parse_tag_idx(bcf_hdr_t *hdr, int is_fmt, char *tag, char *tag_idx, if ( set_samples ) { tok->nsamples = bcf_hdr_nsamples(hdr); - tok->usmpl = (uint8_t*) calloc(tok->nsamples,1); + tok->usmpl = (uint8_t*) calloc(tok->nsamples,1); if ( idx1>=0 ) { if ( idx1 >= bcf_hdr_nsamples(hdr) ) error("The sample index is too large: %s\n", ori); @@ -2722,6 +2751,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok) { tok->tok_type = TOK_VAL; tok->threshold = bcf_hdr_nsamples(filter->hdr); + tok->is_constant = 1; return 0; } else if ( !strncasecmp(str,"N_MISSING",len) ) @@ -2762,13 +2792,13 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok) } if ( is_fmt==-1 ) is_fmt = 0; } - if ( is_array ) + if ( is_array ) parse_tag_idx(filter->hdr, is_fmt, tmp.s, tmp.s+is_array, tok); - else if ( is_fmt && !tok->nsamples ) + else if ( is_fmt && !tok->nsamples ) { int i; tok->nsamples = bcf_hdr_nsamples(filter->hdr); - tok->usmpl = (uint8_t*) malloc(tok->nsamples); + tok->usmpl = (uint8_t*) malloc(tok->nsamples); for (i=0; insamples; i++) tok->usmpl[i] = 1; } @@ -2819,7 +2849,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok) case BCF_HT_STR: tok->setter = &filters_set_info_string; tok->is_str = 1; break; default: error("[%s:%d %s] FIXME\n", __FILE__,__LINE__,__FUNCTION__); } - if (!is_array) + if (!is_array) { tok->idx = -2; tok->idxs = (int*) malloc(sizeof(int)); @@ -3017,7 +3047,7 @@ static void perl_init(filter_t *filter, char **str) char 
**env = NULL; PERL_SYS_INIT3(&argc, &argv, &env); } - + filter->perl = perl_alloc(); PerlInterpreter *perl = filter->perl; @@ -3116,6 +3146,7 @@ filter_t *filter_init(bcf_hdr_t *hdr, const char *str) tok->hdr_id = -1; tok->pass_site = -1; tok->threshold = -1.0; + tok->is_constant = 1; ret = TOK_MULT; } else if ( ret == -TOK_FUNC ) @@ -3330,6 +3361,7 @@ filter_t *filter_init(bcf_hdr_t *hdr, const char *str) else if ( !strcasecmp(out[ival].key,"overlap") ) { out[ival].threshold = VCF_OVERLAP<<1; out[ival].is_str = 0; } else if ( !strcasecmp(out[ival].key,"ref") ) { out[ival].threshold = 1; out[ival].is_str = 0; } else error("The type \"%s\" not recognised: %s\n", out[ival].key, filter->str); + out[ival].is_constant = 1; if ( out[itok].tok_type==TOK_LIKE || out[itok].tok_type==TOK_NLIKE ) out[itok].comparator = filters_cmp_bit_and; out[ival].tag = out[ival].key; out[ival].key = NULL; i = itok; @@ -3375,16 +3407,34 @@ filter_t *filter_init(bcf_hdr_t *hdr, const char *str) else error("[%s:%d %s] Could not parse the expression: %s\n", __FILE__,__LINE__,__FUNCTION__, filter->str); if ( out[ival].tok_type!=TOK_VAL || !out[ival].key ) error("[%s:%d %s] Could not parse the expression, an unquoted string value perhaps? 
%s\n", __FILE__,__LINE__,__FUNCTION__, filter->str); - if ( strcmp(".",out[ival].key) ) + token_t *tok = &out[ival]; + char *bp = tok->key; + tok->nvalues = 0; + int has_missing = 0; + while ( *bp ) { - out[ival].hdr_id = bcf_hdr_id2int(filter->hdr, BCF_DT_ID, out[ival].key); - if ( !bcf_hdr_idinfo_exists(filter->hdr,BCF_HL_FLT,out[ival].hdr_id) ) - error("The filter \"%s\" not present in the VCF header\n", out[ival].key); + char tmp, *ep = bp; + while ( *ep && *ep!=';' ) ep++; + tmp = *ep; + *ep = 0; + if ( !strcmp(".",bp) ) has_missing = 1; + else + { + tok->nvalues++; + hts_expand(double,tok->nvalues,tok->mvalues,tok->values); + int id = bcf_hdr_id2int(filter->hdr, BCF_DT_ID, bp); + if ( !bcf_hdr_idinfo_exists(filter->hdr,BCF_HL_FLT,id) ) + error("The filter \"%s\" not present in the VCF header\n", bp); + tok->values[tok->nvalues-1] = id; + } + *ep = tmp; + if ( !tmp ) break; + bp = ep + 1; } - else - out[ival].hdr_id = -1; - out[ival].tag = out[ival].key; out[ival].key = NULL; - out[itok].hdr_id = out[ival].hdr_id; + if ( has_missing && tok->nvalues ) error("The FILTER expression cannot contain missing value AND filters: \"%s\" (%d)\n",tok->key,tok->nvalues); + out[ival].tag = tok->key; + tok->key = NULL; + out[itok].hdr_id = tok->hdr_id; continue; } } @@ -3475,7 +3525,7 @@ int filter_test(filter_t *filter, bcf1_t *line, const uint8_t **samples) kputs(filter->filters[i].key, &filter->filters[i].str_value); filter->filters[i].nvalues = filter->filters[i].str_value.l; } - else // numeric constant + else if ( filter->filters[i].is_constant ) // numeric constant { filter->filters[i].values[0] = filter->filters[i].threshold; filter->filters[i].nvalues = 1; @@ -3497,28 +3547,35 @@ int filter_test(filter_t *filter, bcf1_t *line, const uint8_t **samples) if ( filter->filters[i].tok_type == TOK_ADD ) { - VECTOR_ARITHMETICS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],+); + 
VECTOR_ARITHMETICS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],+,(double)); filter->flt_stack[nstack-2] = &filter->filters[i]; nstack--; continue; } else if ( filter->filters[i].tok_type == TOK_SUB ) { - VECTOR_ARITHMETICS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],-); + VECTOR_ARITHMETICS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],-,(double)); filter->flt_stack[nstack-2] = &filter->filters[i]; nstack--; continue; } else if ( filter->filters[i].tok_type == TOK_MULT ) { - VECTOR_ARITHMETICS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],*); + VECTOR_ARITHMETICS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],*,(double)); filter->flt_stack[nstack-2] = &filter->filters[i]; nstack--; continue; } else if ( filter->filters[i].tok_type == TOK_DIV ) { - VECTOR_ARITHMETICS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],/); + VECTOR_ARITHMETICS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],/,(double)); + filter->flt_stack[nstack-2] = &filter->filters[i]; + nstack--; + continue; + } + else if ( filter->filters[i].tok_type == TOK_MODULO ) + { + VECTOR_ARITHMETICS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],%,(int)); filter->flt_stack[nstack-2] = &filter->filters[i]; nstack--; continue; diff --git a/bcftools/mpileup.c b/bcftools/mpileup.c index fd5aa51..fc4f4b1 100644 --- a/bcftools/mpileup.c +++ b/bcftools/mpileup.c @@ -644,7 +644,7 @@ static int mpileup(mplp_conf_t *conf) fprintf(stderr, "[%s] failed to open %s: %s\n", __func__, conf->files[i], strerror(errno)); exit(EXIT_FAILURE); } - if (hts_set_opt(conf->mplp_data[i]->fp, CRAM_OPT_DECODE_MD, 0)) { + if (hts_set_opt(conf->mplp_data[i]->fp, CRAM_OPT_DECODE_MD, 1)) { fprintf(stderr, "Failed to set CRAM_OPT_DECODE_MD value\n"); exit(EXIT_FAILURE); } @@ -777,6 +777,9 @@ static int 
mpileup(mplp_conf_t *conf) bcf_hdr_append(conf->bcf_hdr,"##INFO="); bcf_hdr_append(conf->bcf_hdr,"##INFO="); bcf_hdr_append(conf->bcf_hdr,"##INFO="); + bcf_hdr_append(conf->bcf_hdr,"##INFO="); + if ( conf->fmt_flag&B2B_FMT_NMBZ ) + bcf_hdr_append(conf->bcf_hdr,"##FORMAT="); if ( conf->fmt_flag&B2B_INFO_SCB ) bcf_hdr_append(conf->bcf_hdr,"##INFO="); } else { @@ -875,6 +878,17 @@ static int mpileup(mplp_conf_t *conf) if ( conf->fmt_flag&(B2B_INFO_SCR|B2B_FMT_SCR) ) conf->bc.SCR = (int32_t*) malloc((nsmpl+1)*sizeof(*conf->bc.SCR)); } + int nnmbz = (conf->fmt_flag&B2B_FMT_NMBZ) ? nsmpl + 1 : 1; + conf->bc.ref_nm = (int32_t*) malloc(sizeof(*conf->bc.ref_nm) * nnmbz * B2B_N_NM); + conf->bc.alt_nm = (int32_t*) malloc(sizeof(*conf->bc.alt_nm) * nnmbz * B2B_N_NM); + conf->bc.mwu_nm = (float*) malloc((nsmpl+1)*sizeof(*conf->bc.mwu_nm)); + conf->bca->ref_nm = conf->bc.ref_nm; // this is just to make the arrays available in bcf_call_glfgen() + conf->bca->alt_nm = conf->bc.alt_nm; + if ( conf->fmt_flag&B2B_FMT_NMBZ ) + { + for (i=0; ibcr[i].ref_nm = conf->bc.ref_nm + (i+1)*B2B_N_NM; + for (i=0; ibcr[i].alt_nm = conf->bc.alt_nm + (i+1)*B2B_N_NM; + } // init mpileup conf->iter = bam_mplp_init(conf->nfiles, mplp_func, (void**)conf->mplp_data); @@ -941,7 +955,10 @@ static int mpileup(mplp_conf_t *conf) free(conf->bc.ADF); free(conf->bc.SCR); free(conf->bc.QS); + free(conf->bc.ref_nm); + free(conf->bc.alt_nm); free(conf->bc.fmt_arr); + free(conf->bc.mwu_nm); free(conf->bcr); } if ( conf->gvcf ) gvcf_destroy(conf->gvcf); @@ -1045,6 +1062,7 @@ int parse_format_flag(const char *str) else if ( !strcasecmp(tags[i],"ADR") || !strcasecmp(tags[i],"FORMAT/ADR") || !strcasecmp(tags[i],"FMT/ADR") ) flag |= B2B_FMT_ADR; else if ( !strcasecmp(tags[i],"SCR") || !strcasecmp(tags[i],"FORMAT/SCR") || !strcasecmp(tags[i],"FMT/SCR") ) flag |= B2B_FMT_SCR; else if ( !strcasecmp(tags[i],"QS") || !strcasecmp(tags[i],"FORMAT/QS") || !strcasecmp(tags[i],"FMT/QS") ) flag |= B2B_FMT_QS; + else if ( 
!strcasecmp(tags[i],"NMBZ") || !strcasecmp(tags[i],"FORMAT/NMBZ") || !strcasecmp(tags[i],"FMT/NMBZ") ) flag |= B2B_FMT_NMBZ; else if ( !strcasecmp(tags[i],"INFO/SCR") ) flag |= B2B_INFO_SCR; else if ( !strcasecmp(tags[i],"INFO/AD") ) flag |= B2B_INFO_AD; else if ( !strcasecmp(tags[i],"INFO/ADF") ) flag |= B2B_INFO_ADF; @@ -1070,13 +1088,14 @@ static void list_annotations(FILE *fp) "\n" "FORMAT annotation tags available (\"FORMAT/\" prefix is optional):\n" "\n" -" FORMAT/AD .. Allelic depth (Number=R,Type=Integer)\n" -" FORMAT/ADF .. Allelic depths on the forward strand (Number=R,Type=Integer)\n" -" FORMAT/ADR .. Allelic depths on the reverse strand (Number=R,Type=Integer)\n" -" FORMAT/DP .. Number of high-quality bases (Number=1,Type=Integer)\n" -" FORMAT/QS .. Allele phred-score quality sum for use with `call -mG` and +trio-dnm (Number=R,Type=Integer)\n" -" FORMAT/SP .. Phred-scaled strand bias P-value (Number=1,Type=Integer)\n" -" FORMAT/SCR .. Number of soft-clipped reads (Number=1,Type=Integer)\n" +" FORMAT/AD .. Allelic depth (Number=R,Type=Integer)\n" +" FORMAT/ADF .. Allelic depths on the forward strand (Number=R,Type=Integer)\n" +" FORMAT/ADR .. Allelic depths on the reverse strand (Number=R,Type=Integer)\n" +" FORMAT/DP .. Number of high-quality bases (Number=1,Type=Integer)\n" +" FORMAT/NMBZ .. Mann-Whitney U-z test of Number of Mismatches within supporting reads (Number=1,Type=Float)\n" +" FORMAT/QS .. Allele phred-score quality sum for use with `call -mG` and +trio-dnm (Number=R,Type=Integer)\n" +" FORMAT/SP .. Phred-scaled strand bias P-value (Number=1,Type=Integer)\n" +" FORMAT/SCR .. Number of soft-clipped reads (Number=1,Type=Integer)\n" "\n" "INFO annotation tags available:\n" "\n" @@ -1141,7 +1160,7 @@ static void print_usage(FILE *fp, const mplp_conf_t *mplp) " --seed INT Random number seed used for sampling deep regions [0]\n" "\n" "Output options:\n" - " -a, --annotate LIST Optional tags to output; '?' 
to list available tags []\n" + " -a, --annotate LIST Optional tags to output; '\\?' to list available tags []\n" " -g, --gvcf INT[,...] Group non-variant sites into gVCF blocks according\n" " To minimum per-sample DP\n" " --no-version Do not append version and command line to the header\n" diff --git a/bcftools/mpileup.c.pysam.c b/bcftools/mpileup.c.pysam.c index 159e57c..6e0ae5b 100644 --- a/bcftools/mpileup.c.pysam.c +++ b/bcftools/mpileup.c.pysam.c @@ -646,7 +646,7 @@ static int mpileup(mplp_conf_t *conf) fprintf(bcftools_stderr, "[%s] failed to open %s: %s\n", __func__, conf->files[i], strerror(errno)); bcftools_exit(EXIT_FAILURE); } - if (hts_set_opt(conf->mplp_data[i]->fp, CRAM_OPT_DECODE_MD, 0)) { + if (hts_set_opt(conf->mplp_data[i]->fp, CRAM_OPT_DECODE_MD, 1)) { fprintf(bcftools_stderr, "Failed to set CRAM_OPT_DECODE_MD value\n"); bcftools_exit(EXIT_FAILURE); } @@ -779,6 +779,9 @@ static int mpileup(mplp_conf_t *conf) bcf_hdr_append(conf->bcf_hdr,"##INFO="); bcf_hdr_append(conf->bcf_hdr,"##INFO="); bcf_hdr_append(conf->bcf_hdr,"##INFO="); + bcf_hdr_append(conf->bcf_hdr,"##INFO="); + if ( conf->fmt_flag&B2B_FMT_NMBZ ) + bcf_hdr_append(conf->bcf_hdr,"##FORMAT="); if ( conf->fmt_flag&B2B_INFO_SCB ) bcf_hdr_append(conf->bcf_hdr,"##INFO="); } else { @@ -877,6 +880,17 @@ static int mpileup(mplp_conf_t *conf) if ( conf->fmt_flag&(B2B_INFO_SCR|B2B_FMT_SCR) ) conf->bc.SCR = (int32_t*) malloc((nsmpl+1)*sizeof(*conf->bc.SCR)); } + int nnmbz = (conf->fmt_flag&B2B_FMT_NMBZ) ? 
nsmpl + 1 : 1; + conf->bc.ref_nm = (int32_t*) malloc(sizeof(*conf->bc.ref_nm) * nnmbz * B2B_N_NM); + conf->bc.alt_nm = (int32_t*) malloc(sizeof(*conf->bc.alt_nm) * nnmbz * B2B_N_NM); + conf->bc.mwu_nm = (float*) malloc((nsmpl+1)*sizeof(*conf->bc.mwu_nm)); + conf->bca->ref_nm = conf->bc.ref_nm; // this is just to make the arrays available in bcf_call_glfgen() + conf->bca->alt_nm = conf->bc.alt_nm; + if ( conf->fmt_flag&B2B_FMT_NMBZ ) + { + for (i=0; ibcr[i].ref_nm = conf->bc.ref_nm + (i+1)*B2B_N_NM; + for (i=0; ibcr[i].alt_nm = conf->bc.alt_nm + (i+1)*B2B_N_NM; + } // init mpileup conf->iter = bam_mplp_init(conf->nfiles, mplp_func, (void**)conf->mplp_data); @@ -943,7 +957,10 @@ static int mpileup(mplp_conf_t *conf) free(conf->bc.ADF); free(conf->bc.SCR); free(conf->bc.QS); + free(conf->bc.ref_nm); + free(conf->bc.alt_nm); free(conf->bc.fmt_arr); + free(conf->bc.mwu_nm); free(conf->bcr); } if ( conf->gvcf ) gvcf_destroy(conf->gvcf); @@ -1047,6 +1064,7 @@ int parse_format_flag(const char *str) else if ( !strcasecmp(tags[i],"ADR") || !strcasecmp(tags[i],"FORMAT/ADR") || !strcasecmp(tags[i],"FMT/ADR") ) flag |= B2B_FMT_ADR; else if ( !strcasecmp(tags[i],"SCR") || !strcasecmp(tags[i],"FORMAT/SCR") || !strcasecmp(tags[i],"FMT/SCR") ) flag |= B2B_FMT_SCR; else if ( !strcasecmp(tags[i],"QS") || !strcasecmp(tags[i],"FORMAT/QS") || !strcasecmp(tags[i],"FMT/QS") ) flag |= B2B_FMT_QS; + else if ( !strcasecmp(tags[i],"NMBZ") || !strcasecmp(tags[i],"FORMAT/NMBZ") || !strcasecmp(tags[i],"FMT/NMBZ") ) flag |= B2B_FMT_NMBZ; else if ( !strcasecmp(tags[i],"INFO/SCR") ) flag |= B2B_INFO_SCR; else if ( !strcasecmp(tags[i],"INFO/AD") ) flag |= B2B_INFO_AD; else if ( !strcasecmp(tags[i],"INFO/ADF") ) flag |= B2B_INFO_ADF; @@ -1072,13 +1090,14 @@ static void list_annotations(FILE *fp) "\n" "FORMAT annotation tags available (\"FORMAT/\" prefix is optional):\n" "\n" -" FORMAT/AD .. Allelic depth (Number=R,Type=Integer)\n" -" FORMAT/ADF .. 
Allelic depths on the forward strand (Number=R,Type=Integer)\n" -" FORMAT/ADR .. Allelic depths on the reverse strand (Number=R,Type=Integer)\n" -" FORMAT/DP .. Number of high-quality bases (Number=1,Type=Integer)\n" -" FORMAT/QS .. Allele phred-score quality sum for use with `call -mG` and +trio-dnm (Number=R,Type=Integer)\n" -" FORMAT/SP .. Phred-scaled strand bias P-value (Number=1,Type=Integer)\n" -" FORMAT/SCR .. Number of soft-clipped reads (Number=1,Type=Integer)\n" +" FORMAT/AD .. Allelic depth (Number=R,Type=Integer)\n" +" FORMAT/ADF .. Allelic depths on the forward strand (Number=R,Type=Integer)\n" +" FORMAT/ADR .. Allelic depths on the reverse strand (Number=R,Type=Integer)\n" +" FORMAT/DP .. Number of high-quality bases (Number=1,Type=Integer)\n" +" FORMAT/NMBZ .. Mann-Whitney U-z test of Number of Mismatches within supporting reads (Number=1,Type=Float)\n" +" FORMAT/QS .. Allele phred-score quality sum for use with `call -mG` and +trio-dnm (Number=R,Type=Integer)\n" +" FORMAT/SP .. Phred-scaled strand bias P-value (Number=1,Type=Integer)\n" +" FORMAT/SCR .. Number of soft-clipped reads (Number=1,Type=Integer)\n" "\n" "INFO annotation tags available:\n" "\n" @@ -1143,7 +1162,7 @@ static void print_usage(FILE *fp, const mplp_conf_t *mplp) " --seed INT Random number seed used for sampling deep regions [0]\n" "\n" "Output options:\n" - " -a, --annotate LIST Optional tags to output; '?' to list available tags []\n" + " -a, --annotate LIST Optional tags to output; '\\?' to list available tags []\n" " -g, --gvcf INT[,...] Group non-variant sites into gVCF blocks according\n" " To minimum per-sample DP\n" " --no-version Do not append version and command line to the header\n" diff --git a/bcftools/prob1.c b/bcftools/prob1.c index 3ab7bcb..d298d6a 100644 --- a/bcftools/prob1.c +++ b/bcftools/prob1.c @@ -30,19 +30,12 @@ THE SOFTWARE. 
*/ #include #include #include -#include #include "prob1.h" -// #include "kstring.h" -// #include "kseq.h" -// KSTREAM_INIT(gzFile, gzread, 16384) - #define MC_MAX_EM_ITER 16 #define MC_EM_EPS 1e-5 #define MC_DEF_INDEL 0.15 -gzFile bcf_p1_fp_lk; - void bcf_p1_indel_prior(bcf_p1aux_t *ma, double x) { int i; @@ -304,8 +297,6 @@ static void mc_cal_y_core(bcf_p1aux_t *ma, int beg) } } if (z[0] != ma->z) memcpy(ma->z, z[0], sizeof(double) * (ma->M + 1)); - if (bcf_p1_fp_lk) - gzwrite(bcf_p1_fp_lk, ma->z, sizeof(double) * (ma->M + 1)); } static void mc_cal_y(bcf_p1aux_t *ma) diff --git a/bcftools/prob1.c.pysam.c b/bcftools/prob1.c.pysam.c index 6d2bbd1..822d0c2 100644 --- a/bcftools/prob1.c.pysam.c +++ b/bcftools/prob1.c.pysam.c @@ -32,19 +32,12 @@ THE SOFTWARE. */ #include #include #include -#include #include "prob1.h" -// #include "kstring.h" -// #include "kseq.h" -// KSTREAM_INIT(gzFile, gzread, 16384) - #define MC_MAX_EM_ITER 16 #define MC_EM_EPS 1e-5 #define MC_DEF_INDEL 0.15 -gzFile bcf_p1_fp_lk; - void bcf_p1_indel_prior(bcf_p1aux_t *ma, double x) { int i; @@ -306,8 +299,6 @@ static void mc_cal_y_core(bcf_p1aux_t *ma, int beg) } } if (z[0] != ma->z) memcpy(ma->z, z[0], sizeof(double) * (ma->M + 1)); - if (bcf_p1_fp_lk) - gzwrite(bcf_p1_fp_lk, ma->z, sizeof(double) * (ma->M + 1)); } static void mc_cal_y(bcf_p1aux_t *ma) diff --git a/bcftools/vcfannotate.c b/bcftools/vcfannotate.c index b5e45d4..d33fd90 100644 --- a/bcftools/vcfannotate.c +++ b/bcftools/vcfannotate.c @@ -140,8 +140,8 @@ typedef struct _args_t int ref_idx, alt_idx, chr_idx, beg_idx, end_idx; // -1 if not present annot_col_t *cols; // column indexes and setters int ncols; - int match_id; // set iff `-c ~ID` given - int match_end; // set iff `-c ~INFO/END` is given + int match_id; // set iff `-c ~ID` given, -1 otherwise + int match_end; // set iff `-c ~INFO/END` is given, -1 otherwise char *set_ids_fmt; convert_t *set_ids; @@ -2441,7 +2441,7 @@ static void init_columns(args_t *args) } int hdr_id = 
bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, key_dst); if ( !bcf_hdr_idinfo_exists(args->hdr_out,BCF_HL_FMT,hdr_id) ) - error("The tag \"%s\" is not defined in %s, was the -h option provided?\n", str.s, args->targets_fname); + error("The FORMAT tag \"%s\" is not defined in %s, was the -h option provided?\n", str.s, args->targets_fname); args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols); annot_col_t *col = &args->cols[args->ncols-1]; memset(col,0,sizeof(*col)); @@ -2555,9 +2555,9 @@ static void init_columns(args_t *args) if ( ptr ) { *ptr = 0; tmp.l = 0; ksprintf(&tmp,"%s:=%s",key_src,ptr+1); *ptr = '='; - error("The tag \"%s\" is not defined, is this what you want \"%s\" ?\n",key_src,tmp.s); + error("The INFO tag \"%s\" is not defined, is this what you want \"%s\" ?\n",key_src,tmp.s); } - error("The tag \"%s\" is not defined in %s, was the -h option provided?\n", key_src,args->files->readers[1].fname); + error("The INFO tag \"%s\" is not defined in %s, was the -h option provided?\n", key_src,args->files->readers[1].fname); } tmp.l = 0; bcf_hrec_format_rename(hrec, key_dst, &tmp); @@ -2568,7 +2568,7 @@ static void init_columns(args_t *args) hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, key_dst); } else - error("The tag \"%s\" is not defined in %s, was the -h option provided?\n", key_dst, args->targets_fname); + error("The INFO tag \"%s\" is not defined in %s, was the -h option provided?\n", key_dst, args->targets_fname); assert( bcf_hdr_idinfo_exists(args->hdr_out,BCF_HL_INFO,hdr_id) ); } if ( args->tgts_is_vcf ) @@ -2727,6 +2727,26 @@ static int rename_annots_core(args_t *args, char *ori_tag, char *new_tag) else if ( !strncasecmp("fmt/",ori_tag,4) ) type = BCF_HL_FMT, ori_tag += 4; else if ( !strncasecmp("filter/",ori_tag,7) ) type = BCF_HL_FLT, ori_tag += 7; else return -1; + if ( !strncasecmp("info/",new_tag,5) ) + { + if ( type != BCF_HL_INFO ) error("Cannot transfer %s to INFO\n", ori_tag); + new_tag += 5; + } + else 
if ( !strncasecmp("format/",new_tag,7) ) + { + if ( type != BCF_HL_FMT ) error("Cannot transfer %s to FORMAT\n", ori_tag); + new_tag += 7; + } + else if ( !strncasecmp("fmt/",new_tag,4) ) + { + if ( type != BCF_HL_FMT ) error("Cannot transfer %s to FORMAT\n", ori_tag); + new_tag += 4; + } + else if ( !strncasecmp("filter/",new_tag,7) ) + { + if ( type != BCF_HL_FLT ) error("Cannot transfer %s to FILTER\n", ori_tag); + new_tag += 7; + } int id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, ori_tag); if ( id<0 ) return 1; bcf_hrec_t *hrec = bcf_hdr_get_hrec(args->hdr_out, type, "ID", ori_tag, NULL); @@ -3125,7 +3145,7 @@ static void annotate(args_t *args, bcf1_t *line) ialt++; } if ( args->match_id>=0 && !strstr_match(line->d.id,args->alines[i].cols[args->match_id]) ) continue; - if ( match_end.l && strcmp(match_end.s,args->alines[i].cols[args->match_end]) ) continue; + if ( args->match_end>=0 && match_end.l && strcmp(match_end.s,args->alines[i].cols[args->match_end]) ) continue; args->srt_alines[args->nsrt_alines++] = (ialt<<16) | i; has_overlap = 1; break; @@ -3315,6 +3335,7 @@ int main_vcfannotate(int argc, char *argv[]) args->ref_idx = args->alt_idx = args->chr_idx = args->beg_idx = args->end_idx = -1; args->set_ids_replace = 1; args->match_id = -1; + args->match_end = -1; args->clevel = -1; args->pair_logic = -1; int regions_is_file = 0; diff --git a/bcftools/vcfannotate.c.pysam.c b/bcftools/vcfannotate.c.pysam.c index dfc8df0..e45c305 100644 --- a/bcftools/vcfannotate.c.pysam.c +++ b/bcftools/vcfannotate.c.pysam.c @@ -142,8 +142,8 @@ typedef struct _args_t int ref_idx, alt_idx, chr_idx, beg_idx, end_idx; // -1 if not present annot_col_t *cols; // column indexes and setters int ncols; - int match_id; // set iff `-c ~ID` given - int match_end; // set iff `-c ~INFO/END` is given + int match_id; // set iff `-c ~ID` given, -1 otherwise + int match_end; // set iff `-c ~INFO/END` is given, -1 otherwise char *set_ids_fmt; convert_t *set_ids; @@ -2443,7 +2443,7 @@ static 
void init_columns(args_t *args) } int hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, key_dst); if ( !bcf_hdr_idinfo_exists(args->hdr_out,BCF_HL_FMT,hdr_id) ) - error("The tag \"%s\" is not defined in %s, was the -h option provided?\n", str.s, args->targets_fname); + error("The FORMAT tag \"%s\" is not defined in %s, was the -h option provided?\n", str.s, args->targets_fname); args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols); annot_col_t *col = &args->cols[args->ncols-1]; memset(col,0,sizeof(*col)); @@ -2557,9 +2557,9 @@ static void init_columns(args_t *args) if ( ptr ) { *ptr = 0; tmp.l = 0; ksprintf(&tmp,"%s:=%s",key_src,ptr+1); *ptr = '='; - error("The tag \"%s\" is not defined, is this what you want \"%s\" ?\n",key_src,tmp.s); + error("The INFO tag \"%s\" is not defined, is this what you want \"%s\" ?\n",key_src,tmp.s); } - error("The tag \"%s\" is not defined in %s, was the -h option provided?\n", key_src,args->files->readers[1].fname); + error("The INFO tag \"%s\" is not defined in %s, was the -h option provided?\n", key_src,args->files->readers[1].fname); } tmp.l = 0; bcf_hrec_format_rename(hrec, key_dst, &tmp); @@ -2570,7 +2570,7 @@ static void init_columns(args_t *args) hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, key_dst); } else - error("The tag \"%s\" is not defined in %s, was the -h option provided?\n", key_dst, args->targets_fname); + error("The INFO tag \"%s\" is not defined in %s, was the -h option provided?\n", key_dst, args->targets_fname); assert( bcf_hdr_idinfo_exists(args->hdr_out,BCF_HL_INFO,hdr_id) ); } if ( args->tgts_is_vcf ) @@ -2729,6 +2729,26 @@ static int rename_annots_core(args_t *args, char *ori_tag, char *new_tag) else if ( !strncasecmp("fmt/",ori_tag,4) ) type = BCF_HL_FMT, ori_tag += 4; else if ( !strncasecmp("filter/",ori_tag,7) ) type = BCF_HL_FLT, ori_tag += 7; else return -1; + if ( !strncasecmp("info/",new_tag,5) ) + { + if ( type != BCF_HL_INFO ) error("Cannot transfer %s 
to INFO\n", ori_tag); + new_tag += 5; + } + else if ( !strncasecmp("format/",new_tag,7) ) + { + if ( type != BCF_HL_FMT ) error("Cannot transfer %s to FORMAT\n", ori_tag); + new_tag += 7; + } + else if ( !strncasecmp("fmt/",new_tag,4) ) + { + if ( type != BCF_HL_FMT ) error("Cannot transfer %s to FORMAT\n", ori_tag); + new_tag += 4; + } + else if ( !strncasecmp("filter/",new_tag,7) ) + { + if ( type != BCF_HL_FLT ) error("Cannot transfer %s to FILTER\n", ori_tag); + new_tag += 7; + } int id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, ori_tag); if ( id<0 ) return 1; bcf_hrec_t *hrec = bcf_hdr_get_hrec(args->hdr_out, type, "ID", ori_tag, NULL); @@ -3127,7 +3147,7 @@ static void annotate(args_t *args, bcf1_t *line) ialt++; } if ( args->match_id>=0 && !strstr_match(line->d.id,args->alines[i].cols[args->match_id]) ) continue; - if ( match_end.l && strcmp(match_end.s,args->alines[i].cols[args->match_end]) ) continue; + if ( args->match_end>=0 && match_end.l && strcmp(match_end.s,args->alines[i].cols[args->match_end]) ) continue; args->srt_alines[args->nsrt_alines++] = (ialt<<16) | i; has_overlap = 1; break; @@ -3317,6 +3337,7 @@ int main_vcfannotate(int argc, char *argv[]) args->ref_idx = args->alt_idx = args->chr_idx = args->beg_idx = args->end_idx = -1; args->set_ids_replace = 1; args->match_id = -1; + args->match_end = -1; args->clevel = -1; args->pair_logic = -1; int regions_is_file = 0; diff --git a/bcftools/vcfbuf.c b/bcftools/vcfbuf.c index 71916bb..9d60c49 100644 --- a/bcftools/vcfbuf.c +++ b/bcftools/vcfbuf.c @@ -1,19 +1,19 @@ /* The MIT License - Copyright (c) 2016-2021 Genome Research Ltd. + Copyright (c) 2016-2022 Genome Research Ltd. 
Author: Petr Danecek - + Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - + The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE @@ -75,7 +75,7 @@ overlap_t; struct _vcfbuf_t { - int win; + int win, dummy; bcf_hdr_t *hdr; vcfrec_t *vcf; rbuf_t rbuf; @@ -118,6 +118,7 @@ void vcfbuf_set(vcfbuf_t *buf, vcfbuf_opt_t key, void *value) if ( key==LD_MAX_LD ) { buf->ld.max[VCFBUF_LD_IDX_LD] = *((double*)value); return; } if ( key==LD_MAX_HD ) { buf->ld.max[VCFBUF_LD_IDX_HD] = *((double*)value); return; } + if ( key==VCFBUF_DUMMY ) { buf->dummy = *((int*)value); return; } if ( key==VCFBUF_NSITES ) { buf->prune.max_sites = *((int*)value); @@ -135,6 +136,7 @@ void vcfbuf_set(vcfbuf_t *buf, vcfbuf_opt_t key, void *value) else if ( !strcasecmp(mode,"1st") ) buf->prune.mode = PRUNE_MODE_1ST; else if ( !strcasecmp(mode,"rand") ) buf->prune.mode = PRUNE_MODE_RAND; else error("The mode \"%s\" is not recognised\n",mode); + return; } } @@ -149,7 +151,7 @@ bcf1_t *vcfbuf_push(vcfbuf_t *buf, bcf1_t *rec) int i = rbuf_append(&buf->rbuf); if ( !buf->vcf[i].rec ) buf->vcf[i].rec = bcf_init1(); - + bcf1_t *ret = buf->vcf[i].rec; buf->vcf[i].rec = rec; buf->vcf[i].af_set = 0; @@ -226,7 +228,7 @@ static void _prune_sites(vcfbuf_t *buf, int flush_all) for (i=-1; rbuf_next(&buf->rbuf,&i) && irecvcf[i].rec; - if ( 
line->n_allele > buf->prune.mac ) + if ( line->n_allele > buf->prune.mac ) { buf->prune.ac = (int*) realloc(buf->prune.ac, line->n_allele*sizeof(*buf->prune.ac)); buf->prune.mac = line->n_allele; @@ -240,7 +242,7 @@ static void _prune_sites(vcfbuf_t *buf, int flush_all) } else if ( bcf_calc_ac(buf->hdr, line, buf->prune.ac, BCF_UN_INFO|BCF_UN_FMT) ) { - int ntot = buf->prune.ac[0], nalt = 0; + int ntot = buf->prune.ac[0], nalt = 0; for (k=1; kn_allele; k++) nalt += buf->prune.ac[k]; buf->vcf[i].af = ntot ? (float)nalt/ntot : 0; } @@ -315,7 +317,7 @@ static int _overlap_can_flush(vcfbuf_t *buf, int flush_all) { buf->overlap.rid = last->rec->rid; buf->overlap.end = end_pos; - return 0; + return 0; } if ( beg_pos <= buf->overlap.end ) { @@ -330,7 +332,7 @@ bcf1_t *vcfbuf_flush(vcfbuf_t *buf, int flush_all) int i,j; if ( buf->rbuf.n==0 ) return NULL; - if ( flush_all ) goto ret; + if ( flush_all || buf->dummy ) goto ret; i = rbuf_kth(&buf->rbuf, 0); // first j = rbuf_last(&buf->rbuf); // last @@ -347,9 +349,11 @@ bcf1_t *vcfbuf_flush(vcfbuf_t *buf, int flush_all) else if ( buf->win < 0 ) { if ( buf->vcf[i].rec->pos - buf->vcf[j].rec->pos > buf->win ) return NULL; + goto ret; } - else return NULL; - + else + return NULL; + ret: if ( buf->prune.max_sites && buf->prune.max_sites < buf->rbuf.n ) _prune_sites(buf, flush_all); @@ -380,7 +384,7 @@ static double _estimate_af(int8_t *ptr, int size, int nvals, int nsamples) D =~ (GT correlation) * sqrt(Pa*(1-Pa)*Pb*(1-Pb)) and `hd` as proposed in Ragsdale, A. P., & Gravel, S. (2019). Unbiased estimation of linkage - disequilibrium from unphased data. Molecular Biology and Evolution. doi:10.1093/molbev/msz265 + disequilibrium from unphased data. Molecular Biology and Evolution. 
doi:10.1093/molbev/msz265 \hat{D} = 1/[n*(n+1)]*[ (n1 + n2/2 + n4/2 + n5/4)*(n5/4 + n6/2 + n8/2 + n9) @@ -423,7 +427,7 @@ static int _calc_r2_ld(vcfbuf_t *buf, bcf1_t *arec, bcf1_t *brec, vcfbuf_ld_t *l double nhd[] = {0,0,0,0,0,0,0,0,0}; double ab = 0, aa = 0, bb = 0, a = 0, b = 0; int nab = 0, ndiff = 0; - int an_tot = 0, bn_tot = 0; + int an_tot = 0, bn_tot = 0; for (i=0; in_sample; i++) { int8_t *aptr = (int8_t*) (afmt->p + i*afmt->size); @@ -508,7 +512,7 @@ static int _calc_r2_ld(vcfbuf_t *buf, bcf1_t *arec, bcf1_t *brec, vcfbuf_ld_t *l ld->val[VCFBUF_LD_IDX_LD] = fabs(ld->val[VCFBUF_LD_IDX_LD]); // avoid "-0" on output ld->val[VCFBUF_LD_IDX_HD] = - (nhd[0] + nhd[1]/2. + nhd[3]/2. + nhd[4]/4.)*(nhd[4]/4. + nhd[5]/2. + nhd[7]/2. + nhd[8]) + (nhd[0] + nhd[1]/2. + nhd[3]/2. + nhd[4]/4.)*(nhd[4]/4. + nhd[5]/2. + nhd[7]/2. + nhd[8]) - (nhd[1]/2. + nhd[2] + nhd[4]/4. + nhd[5]/2.)*(nhd[3]/2. + nhd[4]/4. + nhd[6] + nhd[7]/2.); ld->val[VCFBUF_LD_IDX_HD] /= nab; ld->val[VCFBUF_LD_IDX_HD] /= nab+1; @@ -535,7 +539,7 @@ int vcfbuf_ld(vcfbuf_t *buf, bcf1_t *rec, vcfbuf_ld_t *ld) } for (i=-1; rbuf_next(&buf->rbuf,&i); ) - { + { if ( buf->vcf[i].filter ) continue; if ( _calc_r2_ld(buf, buf->vcf[i].rec, rec, &tmp) < 0 ) continue; // missing genotypes diff --git a/bcftools/vcfbuf.c.pysam.c b/bcftools/vcfbuf.c.pysam.c index 50df73d..7b1c40e 100644 --- a/bcftools/vcfbuf.c.pysam.c +++ b/bcftools/vcfbuf.c.pysam.c @@ -2,20 +2,20 @@ /* The MIT License - Copyright (c) 2016-2021 Genome Research Ltd. + Copyright (c) 2016-2022 Genome Research Ltd. 
Author: Petr Danecek - + Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - + The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE @@ -77,7 +77,7 @@ overlap_t; struct _vcfbuf_t { - int win; + int win, dummy; bcf_hdr_t *hdr; vcfrec_t *vcf; rbuf_t rbuf; @@ -120,6 +120,7 @@ void vcfbuf_set(vcfbuf_t *buf, vcfbuf_opt_t key, void *value) if ( key==LD_MAX_LD ) { buf->ld.max[VCFBUF_LD_IDX_LD] = *((double*)value); return; } if ( key==LD_MAX_HD ) { buf->ld.max[VCFBUF_LD_IDX_HD] = *((double*)value); return; } + if ( key==VCFBUF_DUMMY ) { buf->dummy = *((int*)value); return; } if ( key==VCFBUF_NSITES ) { buf->prune.max_sites = *((int*)value); @@ -137,6 +138,7 @@ void vcfbuf_set(vcfbuf_t *buf, vcfbuf_opt_t key, void *value) else if ( !strcasecmp(mode,"1st") ) buf->prune.mode = PRUNE_MODE_1ST; else if ( !strcasecmp(mode,"rand") ) buf->prune.mode = PRUNE_MODE_RAND; else error("The mode \"%s\" is not recognised\n",mode); + return; } } @@ -151,7 +153,7 @@ bcf1_t *vcfbuf_push(vcfbuf_t *buf, bcf1_t *rec) int i = rbuf_append(&buf->rbuf); if ( !buf->vcf[i].rec ) buf->vcf[i].rec = bcf_init1(); - + bcf1_t *ret = buf->vcf[i].rec; buf->vcf[i].rec = rec; buf->vcf[i].af_set = 0; @@ -228,7 +230,7 @@ static void _prune_sites(vcfbuf_t *buf, int flush_all) for (i=-1; rbuf_next(&buf->rbuf,&i) && irecvcf[i].rec; - if ( 
line->n_allele > buf->prune.mac ) + if ( line->n_allele > buf->prune.mac ) { buf->prune.ac = (int*) realloc(buf->prune.ac, line->n_allele*sizeof(*buf->prune.ac)); buf->prune.mac = line->n_allele; @@ -242,7 +244,7 @@ static void _prune_sites(vcfbuf_t *buf, int flush_all) } else if ( bcf_calc_ac(buf->hdr, line, buf->prune.ac, BCF_UN_INFO|BCF_UN_FMT) ) { - int ntot = buf->prune.ac[0], nalt = 0; + int ntot = buf->prune.ac[0], nalt = 0; for (k=1; kn_allele; k++) nalt += buf->prune.ac[k]; buf->vcf[i].af = ntot ? (float)nalt/ntot : 0; } @@ -317,7 +319,7 @@ static int _overlap_can_flush(vcfbuf_t *buf, int flush_all) { buf->overlap.rid = last->rec->rid; buf->overlap.end = end_pos; - return 0; + return 0; } if ( beg_pos <= buf->overlap.end ) { @@ -332,7 +334,7 @@ bcf1_t *vcfbuf_flush(vcfbuf_t *buf, int flush_all) int i,j; if ( buf->rbuf.n==0 ) return NULL; - if ( flush_all ) goto ret; + if ( flush_all || buf->dummy ) goto ret; i = rbuf_kth(&buf->rbuf, 0); // first j = rbuf_last(&buf->rbuf); // last @@ -349,9 +351,11 @@ bcf1_t *vcfbuf_flush(vcfbuf_t *buf, int flush_all) else if ( buf->win < 0 ) { if ( buf->vcf[i].rec->pos - buf->vcf[j].rec->pos > buf->win ) return NULL; + goto ret; } - else return NULL; - + else + return NULL; + ret: if ( buf->prune.max_sites && buf->prune.max_sites < buf->rbuf.n ) _prune_sites(buf, flush_all); @@ -382,7 +386,7 @@ static double _estimate_af(int8_t *ptr, int size, int nvals, int nsamples) D =~ (GT correlation) * sqrt(Pa*(1-Pa)*Pb*(1-Pb)) and `hd` as proposed in Ragsdale, A. P., & Gravel, S. (2019). Unbiased estimation of linkage - disequilibrium from unphased data. Molecular Biology and Evolution. doi:10.1093/molbev/msz265 + disequilibrium from unphased data. Molecular Biology and Evolution. 
doi:10.1093/molbev/msz265 \hat{D} = 1/[n*(n+1)]*[ (n1 + n2/2 + n4/2 + n5/4)*(n5/4 + n6/2 + n8/2 + n9) @@ -425,7 +429,7 @@ static int _calc_r2_ld(vcfbuf_t *buf, bcf1_t *arec, bcf1_t *brec, vcfbuf_ld_t *l double nhd[] = {0,0,0,0,0,0,0,0,0}; double ab = 0, aa = 0, bb = 0, a = 0, b = 0; int nab = 0, ndiff = 0; - int an_tot = 0, bn_tot = 0; + int an_tot = 0, bn_tot = 0; for (i=0; in_sample; i++) { int8_t *aptr = (int8_t*) (afmt->p + i*afmt->size); @@ -510,7 +514,7 @@ static int _calc_r2_ld(vcfbuf_t *buf, bcf1_t *arec, bcf1_t *brec, vcfbuf_ld_t *l ld->val[VCFBUF_LD_IDX_LD] = fabs(ld->val[VCFBUF_LD_IDX_LD]); // avoid "-0" on output ld->val[VCFBUF_LD_IDX_HD] = - (nhd[0] + nhd[1]/2. + nhd[3]/2. + nhd[4]/4.)*(nhd[4]/4. + nhd[5]/2. + nhd[7]/2. + nhd[8]) + (nhd[0] + nhd[1]/2. + nhd[3]/2. + nhd[4]/4.)*(nhd[4]/4. + nhd[5]/2. + nhd[7]/2. + nhd[8]) - (nhd[1]/2. + nhd[2] + nhd[4]/4. + nhd[5]/2.)*(nhd[3]/2. + nhd[4]/4. + nhd[6] + nhd[7]/2.); ld->val[VCFBUF_LD_IDX_HD] /= nab; ld->val[VCFBUF_LD_IDX_HD] /= nab+1; @@ -537,7 +541,7 @@ int vcfbuf_ld(vcfbuf_t *buf, bcf1_t *rec, vcfbuf_ld_t *ld) } for (i=-1; rbuf_next(&buf->rbuf,&i); ) - { + { if ( buf->vcf[i].filter ) continue; if ( _calc_r2_ld(buf, buf->vcf[i].rec, rec, &tmp) < 0 ) continue; // missing genotypes diff --git a/bcftools/vcfbuf.h b/bcftools/vcfbuf.h index d3be6c5..878fd10 100644 --- a/bcftools/vcfbuf.h +++ b/bcftools/vcfbuf.h @@ -1,19 +1,19 @@ /* The MIT License - Copyright (c) 2017-2021 Genome Research Ltd. + Copyright (c) 2017-2022 Genome Research Ltd. 
Author: Petr Danecek - + Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - + The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE @@ -38,6 +38,8 @@ typedef struct _vcfbuf_t vcfbuf_t; // Modes of operation typedef enum { + VCFBUF_DUMMY, // the caller maintains the buffer via push/peek/flush, nothing is removed by vcfbuf + VCFBUF_OVERLAP_WIN, // keep only overlapping variants in the window VCFBUF_RMDUP, // remove duplicate sites (completely) VCFBUF_NSITES, // leave at max this many sites in the window @@ -49,7 +51,7 @@ typedef enum LD_FILTER1, // exclude the next record inserted by vcfbuf_push() from LD analysis LD_MAX_R2, // If set, vcfbuf_ld() will stop at the first record that exceeds the R2, LD_MAX_LD, // LD, or HD threshold. 
When multiple are set, the OR logic is applied - LD_MAX_HD, // + LD_MAX_HD, // } vcfbuf_opt_t; @@ -59,7 +61,7 @@ void vcfbuf_set(vcfbuf_t *buf, vcfbuf_opt_t key, void *value); /* * vcfbuf_init() - init buffer - * @win: number of sites (>0) or bp (<0) + * @win: number of sites (>0), bp (<0) */ vcfbuf_t *vcfbuf_init(bcf_hdr_t *hdr, int win); void vcfbuf_destroy(vcfbuf_t *buf); @@ -81,6 +83,10 @@ bcf1_t *vcfbuf_peek(vcfbuf_t *buf, int idx); */ bcf1_t *vcfbuf_remove(vcfbuf_t *buf, int idx); +/* + * vcfbuf_flush() - returns the next record or NULL, depending on the mode of operation and + * the content of the buffer + */ bcf1_t *vcfbuf_flush(vcfbuf_t *buf, int flush_all); /* diff --git a/bcftools/vcfcall.c b/bcftools/vcfcall.c index 0418d8e..4c9e88c 100644 --- a/bcftools/vcfcall.c +++ b/bcftools/vcfcall.c @@ -1,6 +1,6 @@ /* vcfcall.c -- SNP/indel variant calling from VCF/BCF. - Copyright (C) 2013-2021 Genome Research Ltd. + Copyright (C) 2013-2022 Genome Research Ltd. Author: Petr Danecek @@ -32,7 +32,6 @@ THE SOFTWARE. */ #include #include #include -#include #include #include #include @@ -229,9 +228,13 @@ static char **parse_ped_samples(call_t *call, char **vals, int nvals, int *nsmpl } tmp++; } - if ( j!=5 ) break; + if ( j<4 ) break; + + char sex; + if ( col_ends[3][1]=='1' ) sex = 'M'; + else if ( col_ends[3][1]=='2' ) sex = 'F'; + else break; - char sex = col_ends[3][1]=='1' ? 
'M' : 'F'; lines = add_sample(name2idx, lines, &nlines, &mlines, col_ends[0]+1, sex, &j); if ( strcmp(col_ends[1]+1,"0") && strcmp(col_ends[2]+1,"0") ) // father and mother { @@ -256,11 +259,8 @@ static char **parse_ped_samples(call_t *call, char **vals, int nvals, int *nsmpl free(fam_str.s); khash_str2int_destroy_free(name2idx); - if ( i!=nvals ) // not a ped file - { - if ( i>0 ) error("Could not parse samples, not a PED format.\n"); - return NULL; - } + if ( i!=nvals ) return NULL; // not a ped file + *nsmpl = nlines; return lines; } @@ -287,6 +287,8 @@ static void set_samples(args_t *args, const char *fn, int is_file) lines = smpls; nlines = nsmpls; } + else if ( is_file ) + fprintf(stderr,"Note: could not parse as PED: %s\n",fn); args->samples_map = (int*) malloc(sizeof(int)*bcf_hdr_nsamples(args->aux.hdr)); // for subsetting args->sample2sex = (int*) malloc(sizeof(int)*bcf_hdr_nsamples(args->aux.hdr)); @@ -560,7 +562,7 @@ bcf1_t *next_line(args_t *args) return NULL; } - // Find the VCF and tab record with the best matching combination of alleles, prioritize + // Find the VCF and tab record with the best matching combination of alleles, prioritize // records of the same type (snp vs indel) rec_tgt_t rec_tgt; memset(&rec_tgt,0,sizeof(rec_tgt)); @@ -660,7 +662,7 @@ static void init_data(args_t *args) args->aux.ploidy = (uint8_t*) malloc(args->nsamples); for (i=0; insamples; i++) args->aux.ploidy[i] = ploidy_max(args->ploidy); for (i=0; insex; i++) args->sex2ploidy_prev[i] = ploidy_max(args->ploidy); - for (i=0; insamples; i++) + for (i=0; insamples; i++) if ( args->sample2sex[i] >= args->nsex ) args->sample2sex[i] = args->nsex - 1; } @@ -899,7 +901,7 @@ static void usage(args_t *args) fprintf(stderr, " -a, --annotate LIST Optional tags to output (lowercase allowed); '?' 
to list available tags\n"); fprintf(stderr, " -F, --prior-freqs AN,AC Use prior allele frequencies, determined from these pre-filled tags\n"); fprintf(stderr, " -G, --group-samples FILE|- Group samples by population (file with \"sample\\tgroup\") or \"-\" for single-sample calling.\n"); - fprintf(stderr, " This requires FORMAT/QS or other Number=R,Type=Integer tag such as FORMAT/AD\n"); + fprintf(stderr, " This requires FORMAT/QS or other Number=R,Type=Integer tag such as FORMAT/AD\n"); fprintf(stderr, " --group-samples-tag TAG The tag to use with -G, by default FORMAT/QS and FORMAT/AD are checked automatically\n"); fprintf(stderr, " -g, --gvcf INT,[...] Group non-variant sites into gVCF blocks by minimum per-sample DP\n"); fprintf(stderr, " -i, --insert-missed Output also sites missed by mpileup but present in -T\n"); @@ -1020,7 +1022,7 @@ int main_vcfcall(int argc, char *argv[]) *args.aux.prior_AC = 0; args.aux.prior_AC++; break; - case 'g': + case 'g': args.gvcf = gvcf_init(optarg); if ( !args.gvcf ) error("Could not parse: --gvcf %s\n", optarg); break; diff --git a/bcftools/vcfcall.c.pysam.c b/bcftools/vcfcall.c.pysam.c index 1c1710b..c715c53 100644 --- a/bcftools/vcfcall.c.pysam.c +++ b/bcftools/vcfcall.c.pysam.c @@ -2,7 +2,7 @@ /* vcfcall.c -- SNP/indel variant calling from VCF/BCF. - Copyright (C) 2013-2021 Genome Research Ltd. + Copyright (C) 2013-2022 Genome Research Ltd. Author: Petr Danecek @@ -34,7 +34,6 @@ THE SOFTWARE. */ #include #include #include -#include #include #include #include @@ -231,9 +230,13 @@ static char **parse_ped_samples(call_t *call, char **vals, int nvals, int *nsmpl } tmp++; } - if ( j!=5 ) break; + if ( j<4 ) break; + + char sex; + if ( col_ends[3][1]=='1' ) sex = 'M'; + else if ( col_ends[3][1]=='2' ) sex = 'F'; + else break; - char sex = col_ends[3][1]=='1' ? 
'M' : 'F'; lines = add_sample(name2idx, lines, &nlines, &mlines, col_ends[0]+1, sex, &j); if ( strcmp(col_ends[1]+1,"0") && strcmp(col_ends[2]+1,"0") ) // father and mother { @@ -258,11 +261,8 @@ static char **parse_ped_samples(call_t *call, char **vals, int nvals, int *nsmpl free(fam_str.s); khash_str2int_destroy_free(name2idx); - if ( i!=nvals ) // not a ped file - { - if ( i>0 ) error("Could not parse samples, not a PED format.\n"); - return NULL; - } + if ( i!=nvals ) return NULL; // not a ped file + *nsmpl = nlines; return lines; } @@ -289,6 +289,8 @@ static void set_samples(args_t *args, const char *fn, int is_file) lines = smpls; nlines = nsmpls; } + else if ( is_file ) + fprintf(bcftools_stderr,"Note: could not parse as PED: %s\n",fn); args->samples_map = (int*) malloc(sizeof(int)*bcf_hdr_nsamples(args->aux.hdr)); // for subsetting args->sample2sex = (int*) malloc(sizeof(int)*bcf_hdr_nsamples(args->aux.hdr)); @@ -562,7 +564,7 @@ bcf1_t *next_line(args_t *args) return NULL; } - // Find the VCF and tab record with the best matching combination of alleles, prioritize + // Find the VCF and tab record with the best matching combination of alleles, prioritize // records of the same type (snp vs indel) rec_tgt_t rec_tgt; memset(&rec_tgt,0,sizeof(rec_tgt)); @@ -662,7 +664,7 @@ static void init_data(args_t *args) args->aux.ploidy = (uint8_t*) malloc(args->nsamples); for (i=0; insamples; i++) args->aux.ploidy[i] = ploidy_max(args->ploidy); for (i=0; insex; i++) args->sex2ploidy_prev[i] = ploidy_max(args->ploidy); - for (i=0; insamples; i++) + for (i=0; insamples; i++) if ( args->sample2sex[i] >= args->nsex ) args->sample2sex[i] = args->nsex - 1; } @@ -901,7 +903,7 @@ static void usage(args_t *args) fprintf(bcftools_stderr, " -a, --annotate LIST Optional tags to output (lowercase allowed); '?' 
to list available tags\n"); fprintf(bcftools_stderr, " -F, --prior-freqs AN,AC Use prior allele frequencies, determined from these pre-filled tags\n"); fprintf(bcftools_stderr, " -G, --group-samples FILE|- Group samples by population (file with \"sample\\tgroup\") or \"-\" for single-sample calling.\n"); - fprintf(bcftools_stderr, " This requires FORMAT/QS or other Number=R,Type=Integer tag such as FORMAT/AD\n"); + fprintf(bcftools_stderr, " This requires FORMAT/QS or other Number=R,Type=Integer tag such as FORMAT/AD\n"); fprintf(bcftools_stderr, " --group-samples-tag TAG The tag to use with -G, by default FORMAT/QS and FORMAT/AD are checked automatically\n"); fprintf(bcftools_stderr, " -g, --gvcf INT,[...] Group non-variant sites into gVCF blocks by minimum per-sample DP\n"); fprintf(bcftools_stderr, " -i, --insert-missed Output also sites missed by mpileup but present in -T\n"); @@ -1022,7 +1024,7 @@ int main_vcfcall(int argc, char *argv[]) *args.aux.prior_AC = 0; args.aux.prior_AC++; break; - case 'g': + case 'g': args.gvcf = gvcf_init(optarg); if ( !args.gvcf ) error("Could not parse: --gvcf %s\n", optarg); break; diff --git a/bcftools/vcfconvert.c b/bcftools/vcfconvert.c index 4a5d7ba..ce5ed99 100644 --- a/bcftools/vcfconvert.c +++ b/bcftools/vcfconvert.c @@ -426,7 +426,7 @@ static void gensample_to_vcf(args_t *args) if ( args->gen_3N6 ) { tsv = tsv_init("CHROM,CHROM_POS_REF_ALT,ID,POS,REF_ALT,GT_GP"); - tsv_register(tsv, "CHROM", tsv_setter_chrom, args); + tsv_register(tsv, "CHROM", tsv_setter_chrom, args->header); } else tsv = tsv_init("CHROM_POS_REF_ALT,ID,POS,REF_ALT,GT_GP"); diff --git a/bcftools/vcfconvert.c.pysam.c b/bcftools/vcfconvert.c.pysam.c index 0e64b34..f340171 100644 --- a/bcftools/vcfconvert.c.pysam.c +++ b/bcftools/vcfconvert.c.pysam.c @@ -428,7 +428,7 @@ static void gensample_to_vcf(args_t *args) if ( args->gen_3N6 ) { tsv = tsv_init("CHROM,CHROM_POS_REF_ALT,ID,POS,REF_ALT,GT_GP"); - tsv_register(tsv, "CHROM", tsv_setter_chrom, args); + 
tsv_register(tsv, "CHROM", tsv_setter_chrom, args->header); } else tsv = tsv_init("CHROM_POS_REF_ALT,ID,POS,REF_ALT,GT_GP"); diff --git a/bcftools/vcfindex.c b/bcftools/vcfindex.c index 5fd50c2..1dd960e 100644 --- a/bcftools/vcfindex.c +++ b/bcftools/vcfindex.c @@ -40,7 +40,8 @@ DEALINGS IN THE SOFTWARE. */ enum { per_contig = 1, - total = 2 + all_contigs = 2, + total = 4 }; static void usage(void) @@ -58,6 +59,7 @@ static void usage(void) fprintf(stderr, " --threads INT use multithreading with INT worker threads [0]\n"); fprintf(stderr, "\n"); fprintf(stderr, "Stats options:\n"); + fprintf(stderr, " -a, --all with --stats, print stats for all contigs even when zero\n"); fprintf(stderr, " -n, --nrecords print number of records based on existing index file\n"); fprintf(stderr, " -s, --stats print per contig stats based on existing index file\n"); fprintf(stderr, "\n"); @@ -181,13 +183,15 @@ int vcf_index_stats(char *fname, int stats) for (tid=0; tididx : idx, tid, &records, &v); + int ret = hts_idx_get_stat(tbx ? tbx->idx : idx, tid, &records, &v); sum += records; - if ( (stats&total) || !records ) continue; + if ( (stats&total) || (records == 0 && !(stats&all_contigs)) ) continue; const char *ctg_name = tbx ? seq[tid] : hdr ? bcf_hdr_id2name(hdr, tid) : "n/a"; bcf_hrec_t *hrec = hdr ? bcf_hdr_get_hrec(hdr, BCF_HL_CTG, "ID", ctg_name, NULL) : NULL; int hkey = hrec ? 
bcf_hrec_find_key(hrec, "length") : -1; - printf("%s\t%s\t%" PRIu64 "\n", ctg_name, hkey<0?".":hrec->vals[hkey], records); + printf("%s\t%s\t", ctg_name, hkey<0?".":hrec->vals[hkey]); + if (ret >= 0) printf("%" PRIu64 "\n", records); + else printf(".\n"); } if ( !sum ) { @@ -224,6 +228,7 @@ int main_vcfindex(int argc, char *argv[]) static struct option loptions[] = { + {"all",no_argument,NULL,'a'}, {"csi",no_argument,NULL,'c'}, {"tbi",no_argument,NULL,'t'}, {"force",no_argument,NULL,'f'}, @@ -237,7 +242,7 @@ int main_vcfindex(int argc, char *argv[]) }; char *tmp; - while ((c = getopt_long(argc, argv, "ctfm:sno:", loptions, NULL)) >= 0) + while ((c = getopt_long(argc, argv, "ctfm:snao:", loptions, NULL)) >= 0) { switch (c) { @@ -250,6 +255,7 @@ int main_vcfindex(int argc, char *argv[]) break; case 's': stats |= per_contig; break; case 'n': stats |= total; break; + case 'a': stats |= all_contigs; break; case 9: n_threads = strtol(optarg,&tmp,10); if ( *tmp ) error("Could not parse argument: --threads %s\n", optarg); diff --git a/bcftools/vcfindex.c.pysam.c b/bcftools/vcfindex.c.pysam.c index 43d342d..ac9e3ba 100644 --- a/bcftools/vcfindex.c.pysam.c +++ b/bcftools/vcfindex.c.pysam.c @@ -42,7 +42,8 @@ DEALINGS IN THE SOFTWARE. 
*/ enum { per_contig = 1, - total = 2 + all_contigs = 2, + total = 4 }; static void usage(void) @@ -60,6 +61,7 @@ static void usage(void) fprintf(bcftools_stderr, " --threads INT use multithreading with INT worker threads [0]\n"); fprintf(bcftools_stderr, "\n"); fprintf(bcftools_stderr, "Stats options:\n"); + fprintf(bcftools_stderr, " -a, --all with --stats, print stats for all contigs even when zero\n"); fprintf(bcftools_stderr, " -n, --nrecords print number of records based on existing index file\n"); fprintf(bcftools_stderr, " -s, --stats print per contig stats based on existing index file\n"); fprintf(bcftools_stderr, "\n"); @@ -183,13 +185,15 @@ int vcf_index_stats(char *fname, int stats) for (tid=0; tididx : idx, tid, &records, &v); + int ret = hts_idx_get_stat(tbx ? tbx->idx : idx, tid, &records, &v); sum += records; - if ( (stats&total) || !records ) continue; + if ( (stats&total) || (records == 0 && !(stats&all_contigs)) ) continue; const char *ctg_name = tbx ? seq[tid] : hdr ? bcf_hdr_id2name(hdr, tid) : "n/a"; bcf_hrec_t *hrec = hdr ? bcf_hdr_get_hrec(hdr, BCF_HL_CTG, "ID", ctg_name, NULL) : NULL; int hkey = hrec ? 
bcf_hrec_find_key(hrec, "length") : -1; - fprintf(bcftools_stdout, "%s\t%s\t%" PRIu64 "\n", ctg_name, hkey<0?".":hrec->vals[hkey], records); + fprintf(bcftools_stdout, "%s\t%s\t", ctg_name, hkey<0?".":hrec->vals[hkey]); + if (ret >= 0) fprintf(bcftools_stdout, "%" PRIu64 "\n", records); + else fprintf(bcftools_stdout, ".\n"); } if ( !sum ) { @@ -226,6 +230,7 @@ int main_vcfindex(int argc, char *argv[]) static struct option loptions[] = { + {"all",no_argument,NULL,'a'}, {"csi",no_argument,NULL,'c'}, {"tbi",no_argument,NULL,'t'}, {"force",no_argument,NULL,'f'}, @@ -239,7 +244,7 @@ int main_vcfindex(int argc, char *argv[]) }; char *tmp; - while ((c = getopt_long(argc, argv, "ctfm:sno:", loptions, NULL)) >= 0) + while ((c = getopt_long(argc, argv, "ctfm:snao:", loptions, NULL)) >= 0) { switch (c) { @@ -252,6 +257,7 @@ int main_vcfindex(int argc, char *argv[]) break; case 's': stats |= per_contig; break; case 'n': stats |= total; break; + case 'a': stats |= all_contigs; break; case 9: n_threads = strtol(optarg,&tmp,10); if ( *tmp ) error("Could not parse argument: --threads %s\n", optarg); diff --git a/bcftools/vcfisec.c b/bcftools/vcfisec.c index c4c09f5..a755a85 100644 --- a/bcftools/vcfisec.c +++ b/bcftools/vcfisec.c @@ -321,7 +321,7 @@ static void init_data(args_t *args) { if ( !args->write ) args->write = (int*) calloc(args->files->nreaders,sizeof(int)); if ( sscanf(p,"%d",&i)!=1 ) error("Could not parse --write %s\n", args->write_files); - if ( i<0 || i>args->files->nreaders ) error("The index is out of range: %d (%s)\n", i, args->write_files); + if ( i<=0 || i>args->files->nreaders ) error("The index is out of range: %d (-w %s)\n", i, args->write_files); args->write[i-1] = 1; args->iwrite = i-1; args->nwrite++; @@ -631,10 +631,10 @@ int main_vcfisec(int argc, char *argv[]) args->isec_op = OP_VENN; if ( !args->prefix ) error("Expected the -p option\n"); } - if ( !args->targets_list ) + if ( !args->isec_op ) { - if ( argc-optind<2 ) error("Expected multiple files 
or the --targets option\n"); - if ( !args->isec_op ) error("One of the options --complement, --nfiles or --targets must be given with more than two files\n"); + args->isec_op = OP_PLUS; + args->isec_n = 1; } args->files->require_index = 1; while (optindwrite ) args->write = (int*) calloc(args->files->nreaders,sizeof(int)); if ( sscanf(p,"%d",&i)!=1 ) error("Could not parse --write %s\n", args->write_files); - if ( i<0 || i>args->files->nreaders ) error("The index is out of range: %d (%s)\n", i, args->write_files); + if ( i<=0 || i>args->files->nreaders ) error("The index is out of range: %d (-w %s)\n", i, args->write_files); args->write[i-1] = 1; args->iwrite = i-1; args->nwrite++; @@ -633,10 +633,10 @@ int main_vcfisec(int argc, char *argv[]) args->isec_op = OP_VENN; if ( !args->prefix ) error("Expected the -p option\n"); } - if ( !args->targets_list ) + if ( !args->isec_op ) { - if ( argc-optind<2 ) error("Expected multiple files or the --targets option\n"); - if ( !args->isec_op ) error("One of the options --complement, --nfiles or --targets must be given with more than two files\n"); + args->isec_op = OP_PLUS; + args->isec_n = 1; } args->files->require_index = 1; while (optind KHASH_MAP_INIT_STR(strdict, int) typedef khash_t(strdict) strdict_t; @@ -383,7 +385,7 @@ static void info_rules_init(args_t *args) info_rule_t *rule = &args->rules[n]; rule->hdr_tag = strdup(ss); int id = bcf_hdr_id2int(args->out_hdr, BCF_DT_ID, rule->hdr_tag); - if ( !bcf_hdr_idinfo_exists(args->out_hdr,BCF_HL_INFO,id) ) error("The tag is not defined in the header: \"%s\"\n", rule->hdr_tag); + if ( !bcf_hdr_idinfo_exists(args->out_hdr,BCF_HL_INFO,id) ) error("The INFO tag is not defined in the header: \"%s\"\n", rule->hdr_tag); rule->type = bcf_hdr_id2type(args->out_hdr,BCF_HL_INFO,id); if ( rule->type==BCF_HT_INT ) rule->type_size = sizeof(int32_t); else if ( rule->type==BCF_HT_REAL ) rule->type_size = sizeof(float); @@ -2517,7 +2519,16 @@ static inline int is_gvcf_block(bcf1_t *line) } 
return 0; } -static const int snp_mask = (VCF_SNP<<2)|(VCF_MNP<<2), indel_mask = VCF_INDEL<<2, ref_mask = 2; + +// Lines can come with any combination of variant types. We use a subset of types defined in vcf.h +// but shift by two bits to account for VCF_REF defined as 0 (design flaw in vcf.h, my fault) and +// to accommodate for VCF_GVCF_REF defined below +static const int + snp_mask = (VCF_SNP<<2)|(VCF_MNP<<2), + indel_mask = VCF_INDEL<<2, + ins_mask = VCF_INS<<2, + del_mask = VCF_DEL<<2, + ref_mask = 2; /* Check incoming lines for new gVCF blocks, set pointer to the current source @@ -2742,7 +2753,13 @@ int can_merge(args_t *args) id = line->d.id; else { - int var_type = bcf_get_variant_types(line); + int var_type = bcf_has_variant_types(line, VCF_ANY, bcf_match_overlap); + if (var_type < 0) error("bcf_has_variant_types() failed."); + if ( args->collapse==COLLAPSE_SNP_INS_DEL ) + { + // need to distinguish between ins and del so strip the VCF_INDEL flag + var_type &= ~VCF_INDEL; + } maux->var_types |= var_type ? var_type<<2 : 2; // for the `-m none -g` mode @@ -2778,7 +2795,8 @@ int can_merge(args_t *args) bcf1_t *line = buf->lines[j]; // ptr to reader's buffer or gvcf buffer - int line_type = bcf_get_variant_types(line); + int line_type = bcf_has_variant_types(line, VCF_ANY, bcf_match_overlap); + if (line_type < 0) error("bcf_has_variant_types() failed."); line_type = line_type ? 
line_type<<2 : 2; // select relevant lines @@ -2812,7 +2830,7 @@ int can_merge(args_t *args) // - SNPs+SNPs+MNPs+REF if -m both,snps // - indels+indels+REF if -m both,indels, REF only if SNPs are not present // - SNPs come first - if ( line_type & indel_mask ) + if ( line_type & (indel_mask|ins_mask|del_mask) ) { if ( !(line_type&snp_mask) && maux->var_types&snp_mask ) continue; // SNPs come first if ( args->do_gvcf && maux->var_types&ref_mask ) continue; // never merge indels with gVCF blocks @@ -2895,19 +2913,26 @@ void stage_line(args_t *args) { if ( buf->rec[j].skip ) continue; // done or not compatible if ( args->collapse&COLLAPSE_ANY ) break; // anything can be merged - int line_type = bcf_get_variant_types(buf->lines[j]); + int line_type = bcf_has_variant_types(buf->lines[j], VCF_ANY, bcf_match_overlap); + if (line_type < 0) error("bcf_has_variant_types() failed."); if ( maux->var_types&snp_mask && line_type&VCF_SNP && (args->collapse&COLLAPSE_SNPS) ) break; if ( maux->var_types&indel_mask && line_type&VCF_INDEL && (args->collapse&COLLAPSE_INDELS) ) break; + if ( maux->var_types&ins_mask && line_type&VCF_INS && (args->collapse&COLLAPSE_SNP_INS_DEL) ) break; + if ( maux->var_types&del_mask && line_type&VCF_DEL && (args->collapse&COLLAPSE_SNP_INS_DEL) ) break; if ( line_type==VCF_REF ) { if ( maux->var_types&snp_mask && (args->collapse&COLLAPSE_SNPS) ) break; if ( maux->var_types&indel_mask && (args->collapse&COLLAPSE_INDELS) ) break; + if ( maux->var_types&ins_mask && (args->collapse&COLLAPSE_SNP_INS_DEL) ) break; + if ( maux->var_types&del_mask && (args->collapse&COLLAPSE_SNP_INS_DEL) ) break; if ( maux->var_types&ref_mask ) break; } else if ( maux->var_types&ref_mask ) { if ( line_type&snp_mask && (args->collapse&COLLAPSE_SNPS) ) break; if ( line_type&indel_mask && (args->collapse&COLLAPSE_INDELS) ) break; + if ( line_type&ins_mask && (args->collapse&COLLAPSE_SNP_INS_DEL) ) break; + if ( line_type&del_mask && (args->collapse&COLLAPSE_SNP_INS_DEL) ) break; } 
} } @@ -3125,7 +3150,7 @@ static void usage(void) fprintf(stderr, " -i, --info-rules TAG:METHOD,.. Rules for merging INFO fields (method is one of sum,avg,min,max,join) or \"-\" to turn off the default [DP:sum,DP4:sum]\n"); fprintf(stderr, " -l, --file-list FILE Read file names from the file\n"); fprintf(stderr, " -L, --local-alleles INT EXPERIMENTAL: if more than <int> ALT alleles are encountered, drop FMT/PL and output LAA+LPL instead; 0=unlimited [0]\n"); - fprintf(stderr, " -m, --merge STRING Allow multiallelic records for <snps|indels|both|all|none|id>, see man page for details [both]\n"); + fprintf(stderr, " -m, --merge STRING Allow multiallelic records for <snps|indels|both|snp-ins-del|all|none|id>, see man page for details [both]\n"); fprintf(stderr, " --no-index Merge unindexed files, the same chromosomal order is required and -r/-R are not allowed\n"); fprintf(stderr, " --no-version Do not append version and command line to the header\n"); fprintf(stderr, " -o, --output FILE Write output to a file [standard output]\n"); @@ -3229,6 +3254,7 @@ int main_vcfmerge(int argc, char *argv[]) else if ( !strcmp(optarg,"any") ) args->collapse |= COLLAPSE_ANY; else if ( !strcmp(optarg,"all") ) args->collapse |= COLLAPSE_ANY; else if ( !strcmp(optarg,"none") ) args->collapse = COLLAPSE_NONE; + else if ( !strcmp(optarg,"snp-ins-del") ) args->collapse = COLLAPSE_SNP_INS_DEL; else if ( !strcmp(optarg,"id") ) { args->collapse = COLLAPSE_NONE; args->merge_by_id = 1; } else error("The -m type \"%s\" is not recognised.\n", optarg); break; diff --git a/bcftools/vcfmerge.c.pysam.c b/bcftools/vcfmerge.c.pysam.c index 0a373ef..3a26cae 100644 --- a/bcftools/vcfmerge.c.pysam.c +++ b/bcftools/vcfmerge.c.pysam.c @@ -45,6 +45,8 @@ THE SOFTWARE.
*/ #define DBG 0 +#define COLLAPSE_SNP_INS_DEL (1<<10) + #include <htslib/khash.h> KHASH_MAP_INIT_STR(strdict, int) typedef khash_t(strdict) strdict_t; @@ -385,7 +387,7 @@ static void info_rules_init(args_t *args) info_rule_t *rule = &args->rules[n]; rule->hdr_tag = strdup(ss); int id = bcf_hdr_id2int(args->out_hdr, BCF_DT_ID, rule->hdr_tag); - if ( !bcf_hdr_idinfo_exists(args->out_hdr,BCF_HL_INFO,id) ) error("The tag is not defined in the header: \"%s\"\n", rule->hdr_tag); + if ( !bcf_hdr_idinfo_exists(args->out_hdr,BCF_HL_INFO,id) ) error("The INFO tag is not defined in the header: \"%s\"\n", rule->hdr_tag); rule->type = bcf_hdr_id2type(args->out_hdr,BCF_HL_INFO,id); if ( rule->type==BCF_HT_INT ) rule->type_size = sizeof(int32_t); else if ( rule->type==BCF_HT_REAL ) rule->type_size = sizeof(float); @@ -2519,7 +2521,16 @@ static inline int is_gvcf_block(bcf1_t *line) } return 0; } -static const int snp_mask = (VCF_SNP<<2)|(VCF_MNP<<2), indel_mask = VCF_INDEL<<2, ref_mask = 2; + +// Lines can come with any combination of variant types. We use a subset of types defined in vcf.h +// but shift by two bits to account for VCF_REF defined as 0 (design flaw in vcf.h, my fault) and +// to accommodate for VCF_GVCF_REF defined below +static const int + snp_mask = (VCF_SNP<<2)|(VCF_MNP<<2), + indel_mask = VCF_INDEL<<2, + ins_mask = VCF_INS<<2, + del_mask = VCF_DEL<<2, + ref_mask = 2; /* Check incoming lines for new gVCF blocks, set pointer to the current source @@ -2744,7 +2755,13 @@ int can_merge(args_t *args) id = line->d.id; else { - int var_type = bcf_get_variant_types(line); + int var_type = bcf_has_variant_types(line, VCF_ANY, bcf_match_overlap); + if (var_type < 0) error("bcf_has_variant_types() failed."); + if ( args->collapse==COLLAPSE_SNP_INS_DEL ) + { + // need to distinguish between ins and del so strip the VCF_INDEL flag + var_type &= ~VCF_INDEL; + } maux->var_types |= var_type ?
var_type<<2 : 2; // for the `-m none -g` mode @@ -2780,7 +2797,8 @@ int can_merge(args_t *args) bcf1_t *line = buf->lines[j]; // ptr to reader's buffer or gvcf buffer - int line_type = bcf_get_variant_types(line); + int line_type = bcf_has_variant_types(line, VCF_ANY, bcf_match_overlap); + if (line_type < 0) error("bcf_has_variant_types() failed."); line_type = line_type ? line_type<<2 : 2; // select relevant lines @@ -2814,7 +2832,7 @@ int can_merge(args_t *args) // - SNPs+SNPs+MNPs+REF if -m both,snps // - indels+indels+REF if -m both,indels, REF only if SNPs are not present // - SNPs come first - if ( line_type & indel_mask ) + if ( line_type & (indel_mask|ins_mask|del_mask) ) { if ( !(line_type&snp_mask) && maux->var_types&snp_mask ) continue; // SNPs come first if ( args->do_gvcf && maux->var_types&ref_mask ) continue; // never merge indels with gVCF blocks @@ -2897,19 +2915,26 @@ void stage_line(args_t *args) { if ( buf->rec[j].skip ) continue; // done or not compatible if ( args->collapse&COLLAPSE_ANY ) break; // anything can be merged - int line_type = bcf_get_variant_types(buf->lines[j]); + int line_type = bcf_has_variant_types(buf->lines[j], VCF_ANY, bcf_match_overlap); + if (line_type < 0) error("bcf_has_variant_types() failed."); if ( maux->var_types&snp_mask && line_type&VCF_SNP && (args->collapse&COLLAPSE_SNPS) ) break; if ( maux->var_types&indel_mask && line_type&VCF_INDEL && (args->collapse&COLLAPSE_INDELS) ) break; + if ( maux->var_types&ins_mask && line_type&VCF_INS && (args->collapse&COLLAPSE_SNP_INS_DEL) ) break; + if ( maux->var_types&del_mask && line_type&VCF_DEL && (args->collapse&COLLAPSE_SNP_INS_DEL) ) break; if ( line_type==VCF_REF ) { if ( maux->var_types&snp_mask && (args->collapse&COLLAPSE_SNPS) ) break; if ( maux->var_types&indel_mask && (args->collapse&COLLAPSE_INDELS) ) break; + if ( maux->var_types&ins_mask && (args->collapse&COLLAPSE_SNP_INS_DEL) ) break; + if ( maux->var_types&del_mask && (args->collapse&COLLAPSE_SNP_INS_DEL) ) 
break; if ( maux->var_types&ref_mask ) break; } else if ( maux->var_types&ref_mask ) { if ( line_type&snp_mask && (args->collapse&COLLAPSE_SNPS) ) break; if ( line_type&indel_mask && (args->collapse&COLLAPSE_INDELS) ) break; + if ( line_type&ins_mask && (args->collapse&COLLAPSE_SNP_INS_DEL) ) break; + if ( line_type&del_mask && (args->collapse&COLLAPSE_SNP_INS_DEL) ) break; } } } @@ -3127,7 +3152,7 @@ static void usage(void) fprintf(bcftools_stderr, " -i, --info-rules TAG:METHOD,.. Rules for merging INFO fields (method is one of sum,avg,min,max,join) or \"-\" to turn off the default [DP:sum,DP4:sum]\n"); fprintf(bcftools_stderr, " -l, --file-list FILE Read file names from the file\n"); fprintf(bcftools_stderr, " -L, --local-alleles INT EXPERIMENTAL: if more than <int> ALT alleles are encountered, drop FMT/PL and output LAA+LPL instead; 0=unlimited [0]\n"); - fprintf(bcftools_stderr, " -m, --merge STRING Allow multiallelic records for <snps|indels|both|all|none|id>, see man page for details [both]\n"); + fprintf(bcftools_stderr, " -m, --merge STRING Allow multiallelic records for <snps|indels|both|snp-ins-del|all|none|id>, see man page for details [both]\n"); fprintf(bcftools_stderr, " --no-index Merge unindexed files, the same chromosomal order is required and -r/-R are not allowed\n"); fprintf(bcftools_stderr, " --no-version Do not append version and command line to the header\n"); fprintf(bcftools_stderr, " -o, --output FILE Write output to a file [standard output]\n"); @@ -3231,6 +3256,7 @@ int main_vcfmerge(int argc, char *argv[]) else if ( !strcmp(optarg,"any") ) args->collapse |= COLLAPSE_ANY; else if ( !strcmp(optarg,"all") ) args->collapse |= COLLAPSE_ANY; else if ( !strcmp(optarg,"none") ) args->collapse = COLLAPSE_NONE; + else if ( !strcmp(optarg,"snp-ins-del") ) args->collapse = COLLAPSE_SNP_INS_DEL; else if ( !strcmp(optarg,"id") ) { args->collapse = COLLAPSE_NONE; args->merge_by_id = 1; } else error("The -m type \"%s\" is not recognised.\n", optarg); break; diff --git a/bcftools/vcfstats.c b/bcftools/vcfstats.c index
b286c90..6a7272f 100644 --- a/bcftools/vcfstats.c +++ b/bcftools/vcfstats.c @@ -1,6 +1,6 @@ /* vcfstats.c -- Produces stats which can be plotted using plot-vcfstats. - Copyright (C) 2012-2021 Genome Research Ltd. + Copyright (C) 2012-2022 Genome Research Ltd. Author: Petr Danecek @@ -706,8 +706,10 @@ static void do_indel_stats(args_t *args, stats_t *stats, bcf_sr_t *reader) for (i=1; i<line->n_allele; i++) { if ( args->first_allele_only && i>1 ) break; - if ( bcf_get_variant_type(line,i)!=VCF_INDEL ) continue; - int len = line->d.var[i].n; + int is_indel = bcf_has_variant_type(line,i,VCF_INDEL); + if (is_indel < 0) error("bcf_has_variant_type() failed."); + if ( !is_indel ) continue; + int len = bcf_variant_length(line, i); #if IRC_STATS // Indel repeat consistency diff --git a/bcftools/vcfstats.c.pysam.c b/bcftools/vcfstats.c.pysam.c index ebde82e..7b5c485 100644 --- a/bcftools/vcfstats.c.pysam.c +++ b/bcftools/vcfstats.c.pysam.c @@ -2,7 +2,7 @@ /* vcfstats.c -- Produces stats which can be plotted using plot-vcfstats. - Copyright (C) 2012-2021 Genome Research Ltd. + Copyright (C) 2012-2022 Genome Research Ltd. Author: Petr Danecek @@ -708,8 +708,10 @@ static void do_indel_stats(args_t *args, stats_t *stats, bcf_sr_t *reader) for (i=1; i<line->n_allele; i++) { if ( args->first_allele_only && i>1 ) break; - if ( bcf_get_variant_type(line,i)!=VCF_INDEL ) continue; - int len = line->d.var[i].n; + int is_indel = bcf_has_variant_type(line,i,VCF_INDEL); + if (is_indel < 0) error("bcf_has_variant_type() failed."); + if ( !is_indel ) continue; + int len = bcf_variant_length(line, i); #if IRC_STATS // Indel repeat consistency diff --git a/bcftools/version.sh b/bcftools/version.sh index 0e51fdd..5fbf8df 100755 --- a/bcftools/version.sh +++ b/bcftools/version.sh @@ -24,7 +24,7 @@ # DEALINGS IN THE SOFTWARE.
# Master version, for use in tarballs or non-git source copies -VERSION=1.15.1 +VERSION=1.16 # If we have a git clone, then check against the current tag if [ -e .git ] diff --git a/devtools/import.py b/devtools/import.py index b4d6ffa..90194d0 100644 --- a/devtools/import.py +++ b/devtools/import.py @@ -152,10 +152,10 @@ if len(sys.argv) >= 1: locate("version.sh", srcdir, exclude_htslib=True)) if dest == "htslib": - # Add build files, including *.ac *.in *.mk *.m4 + # Add build files, including *.ac *.in *.mk *.m4 *.sh mfiles = itertools.chain(mfiles, locate("Makefile", srcdir), locate("configure", srcdir), - locate("*.[aim][cnk4]", srcdir)) + locate("*.[aims][cnk4h]", srcdir, exclude)) ncopied = 0 diff --git a/doc/conf.py b/doc/conf.py index 162ea38..aaf1d35 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -11,13 +11,15 @@ # All configuration values have a default; values that are commented out # serve to show the default. -import sys, os, sysconfig +import sys, os, setuptools # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. -_pyversion = sysconfig.get_python_version() -_libdir = "../build/lib.%s-%s" % (sysconfig.get_platform(), _pyversion) +_build_obj = setuptools.dist.Distribution().get_command_obj('build') +_build_obj.ensure_finalized() + +_libdir = os.path.join('..', _build_obj.build_platlib) if os.path.exists(_libdir): sys.path.insert(0, os.path.abspath(_libdir)) @@ -32,7 +34,7 @@ extensions = ['sphinx.ext.autodoc', 'sphinx.ext.intersphinx', 'sphinx.ext.napoleon'] -intersphinx_mapping = {'python': ('https://docs.python.org/%s' % _pyversion, None)} +intersphinx_mapping = {'python': ('https://docs.python.org/%d.%d' % sys.version_info[:2], None)} # Add any paths that contain templates here, relative to this directory. 
templates_path = ['_templates'] diff --git a/doc/index.rst b/doc/index.rst index 6f4e408..6bff551 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -18,7 +18,7 @@ This module provides a low-level wrapper around the htslib_ C-API as using cython and a high-level, pythonic API for convenient access to the data within genomic file formats. -The current version wraps *htslib-1.15.1*, *samtools-1.15.1*, and *bcftools-1.15.1*. +The current version wraps *htslib-1.16*, *samtools-1.16.1*, and *bcftools-1.16*. To install the latest release, type:: diff --git a/pysam/libcalignedsegment.pyi b/pysam/libcalignedsegment.pyi index f53c318..4e4b1d6 100644 --- a/pysam/libcalignedsegment.pyi +++ b/pysam/libcalignedsegment.pyi @@ -144,22 +144,28 @@ class AlignedSegment: ) -> None: ... def has_tag(self, tag: str) -> bool: ... @overload - def get_tag(self, tag: str, with_value_type: Literal[False]) -> TagValue: ... + def get_tag(self, tag: str, with_value_type: Literal[False] = ...) -> TagValue: ... @overload - def get_tag(self, tag, with_value_type: Literal[True]) -> Tuple[TagValue, str]: ... + def get_tag( + self, tag: str, with_value_type: Literal[True] + ) -> Tuple[TagValue, str]: ... @overload def get_tag( - self, tag, with_value_type: bool = ... + self, tag: str, with_value_type: bool ) -> Union[TagValue, Tuple[TagValue, str]]: ... @overload def get_tags( - self, with_value_type: Literal[False] + self, with_value_type: Literal[False] = ... ) -> List[Tuple[str, TagValue]]: ... @overload def get_tags( self, with_value_type: Literal[True] ) -> List[Tuple[str, TagValue, str]]: ... @overload + def get_tags( + self, with_value_type: bool + ) -> Union[List[Tuple[str, TagValue]], List[Tuple[str, TagValue, str]]]: ... + @overload def get_tags( self, with_value_type: bool = ... ) -> Union[List[Tuple[str, TagValue, str]], List[Tuple[str, TagValue]]]: ... 
diff --git a/pysam/libcalignmentfile.pyi b/pysam/libcalignmentfile.pyi index 23631db..75c1fa4 100644 --- a/pysam/libcalignmentfile.pyi +++ b/pysam/libcalignmentfile.pyi @@ -191,7 +191,7 @@ class AlignmentFile(HTSFile): class IteratorRow: def __iter__(self) -> IteratorRow: ... - def __next__(self) -> AlignedSegment: ... + def __next__(self) -> PileupColumn: ... class IteratorRowAll(IteratorRow): ... class IteratorRowAllRefs(IteratorRow): ... @@ -201,7 +201,7 @@ class IteratorRowSelection(IteratorRow): ... class IteratorColumn: def __iter__(self) -> IteratorRow: ... - def __next__(self) -> AlignedSegment: ... + def __next__(self) -> PileupColumn: ... @property def seq_len(self) -> int: ... def add_reference(self, fastafile: FastaFile) -> None: ... diff --git a/pysam/libcbcf.pyi b/pysam/libcbcf.pyi index f896cca..bb875dd 100644 --- a/pysam/libcbcf.pyi +++ b/pysam/libcbcf.pyi @@ -86,7 +86,7 @@ class VariantHeaderMetadata(_Mapping[str, VariantMetadata]): def add( self, id: str, - number: Optional[str], + number: Optional[Union[int, str]], type: Optional[str], description: str, **kwargs @@ -150,12 +150,12 @@ class VariantHeader: contig: Optional[str] = ..., start: int = ..., stop: int = ..., - alleles: Optional[Tuple[str]] = ..., + alleles: Optional[Tuple[str, ...]] = ..., id: Optional[str] = ..., qual: Optional[int] = ..., filter: Optional[Any] = ..., info: Optional[Mapping[str, _InfoValue]] = ..., - samples: Optional[Iterable[str]] = ..., + samples: Optional[Iterable[Optional[Mapping[str, _FormatValue]]]] = ..., **kwargs ) -> VariantRecord: ... def add_record(self, record: VariantHeaderRecord) -> None: ... @@ -221,8 +221,8 @@ class VariantRecord: qual: Optional[int] id: Optional[str] ref: Optional[str] - alleles: Optional[Tuple[str]] - alts: Optional[Tuple[str]] + alleles: Optional[Tuple[str, ...]] + alts: Optional[Tuple[str, ...]] @property def filter(self) -> VariantRecordFilter: ... 
@property @@ -241,8 +241,8 @@ class VariantRecordSample(_Mapping[str, _FormatValue]): def index(self) -> int: ... @property def name(self) -> str: ... - allele_indices: Optional[Tuple[Optional[int]]] - alleles: Optional[Tuple[Optional[str]]] + allele_indices: Optional[Tuple[Optional[int, ...]]] + alleles: Optional[Tuple[Optional[str, ...]]] phased: bool def __setitem__(self, key: str, value: _FormatValue) -> None: ... def __delitem__(self, key: str) -> None: ... diff --git a/pysam/libcbcf.pyx b/pysam/libcbcf.pyx index fbb3a3d..c3cf8cf 100644 --- a/pysam/libcbcf.pyx +++ b/pysam/libcbcf.pyx @@ -3481,8 +3481,24 @@ cdef class VariantRecordSample(object): return bcf_format_get_alleles(self) @alleles.setter - def alleles(self, value): - self['GT'] = value + def alleles(self, value: tuple): + # Sets the genotype, supply a tuple of alleles to set. + # The supplied alleles need to be defined in the correspoding pysam.libcbcf.VariantRecord + # The genotype is reset when an empty tuple, None or (None,) is supplied + + if value==(None,) or value==tuple() or value is None: + self['GT'] = () + return + + if any((type(x) == int for x in value)): + raise ValueError('Use .allele_indices to set integer allele indices') + + # determine and set allele indices: + try: + self['GT'] = tuple( (self.record.alleles.index(allele) for allele in value) ) + except ValueError: + raise ValueError("One or more of the supplied sample alleles are not defined as alleles of the corresponding pysam.libcbcf.VariantRecord." 
+ "First set the .alleles of this record to define the alleles") @alleles.deleter def alleles(self): diff --git a/pysam/samtools.py b/pysam/samtools.py index a359398..a90d32c 100644 --- a/pysam/samtools.py +++ b/pysam/samtools.py @@ -20,6 +20,7 @@ SAMTOOLS_DISPATCH = { "merge": ("merge", None), "markdup": ("markdup", None), "rmdup": ("rmdup", None), + "reference": ("reference", None), "reheader": ("reheader", None), "cat": ("cat", None), "targetcut": ("targetcut", None), diff --git a/pysam/version.h b/pysam/version.h index 5f12054..dddd49c 100644 --- a/pysam/version.h +++ b/pysam/version.h @@ -1,5 +1,5 @@ // Version information used while compiling samtools, bcftools, and htslib -#define SAMTOOLS_VERSION "1.15.1 (pysam)" -#define BCFTOOLS_VERSION "1.15.1 (pysam)" -#define HTS_VERSION_TEXT "1.15.1 (pysam)" +#define SAMTOOLS_VERSION "1.16.1 (pysam)" +#define BCFTOOLS_VERSION "1.16 (pysam)" +#define HTS_VERSION_TEXT "1.16 (pysam)" diff --git a/pysam/version.py b/pysam/version.py index 1251985..b6aede0 100644 --- a/pysam/version.py +++ b/pysam/version.py @@ -1,6 +1,6 @@ # pysam versioning information -__version__ = "0.19.1" +__version__ = "0.20.0" -__samtools_version__ = "1.15.1" -__bcftools_version__ = "1.15.1" -__htslib_version__ = "1.15.1" +__samtools_version__ = "1.16.1" +__bcftools_version__ = "1.16" +__htslib_version__ = "1.16" diff --git a/samtools/README b/samtools/README index b7b08ae..7be5383 100644 --- a/samtools/README +++ b/samtools/README @@ -9,7 +9,7 @@ Building samtools The typical simple case of building Samtools using the HTSlib bundled within this Samtools release tarball is done as follows: - cd .../samtools-1.15.1 # Within the unpacked release directory + cd .../samtools-1.16.1 # Within the unpacked release directory ./configure make @@ -21,7 +21,7 @@ install samtools etc properly into a directory of your choosing. 
Building for installation using the HTSlib bundled within this Samtools release tarball, and building the various HTSlib utilities such as bgzip is done as follows: - cd .../samtools-1.15.1 # Within the unpacked release directory + cd .../samtools-1.16.1 # Within the unpacked release directory ./configure --prefix=/path/to/location make all all-htslib make install install-htslib @@ -48,7 +48,7 @@ There are two advantages to this: To build with plug-ins, you need to use the --enable-plugins configure option as follows: - cd .../samtools-1.15.1 # Within the unpacked release directory + cd .../samtools-1.16.1 # Within the unpacked release directory ./configure --enable-plugins --prefix=/path/to/location make all all-htslib make install install-htslib @@ -66,8 +66,8 @@ Setting --with-plugin-path is useful if you want to run directly from the source distribution instead of installing the package. In that case you can use: - cd .../samtools-1.15.1 # Within the unpacked release directory - ./configure --enable-plugins --with-plugin-path=$PWD/htslib-1.15.1 + cd .../samtools-1.16.1 # Within the unpacked release directory + ./configure --enable-plugins --with-plugin-path=$PWD/htslib-1.16 make all all-htslib It is possible to override the built-in search path using the HTS_PATH diff --git a/samtools/bam.c b/samtools/bam.c index 5a77d66..f847f89 100644 --- a/samtools/bam.c +++ b/samtools/bam.c @@ -1,6 +1,6 @@ /* bam.c -- miscellaneous BAM functions. - Copyright (C) 2008-2013, 2015, 2019-2020 Genome Research Ltd. + Copyright (C) 2008-2013, 2015, 2019-2020, 2022 Genome Research Ltd. Portions copyright (C) 2009-2012 Broad Institute. Author: Heng Li @@ -156,3 +156,112 @@ rmB_err: b->core.flag |= BAM_FUNMAP; return -1; } + +/* Calculate the current read's start based on the stored cigar string. 
*/ +hts_pos_t unclipped_start(bam1_t *b) { + uint32_t *cigar = bam_get_cigar(b); + int64_t clipped = 0; + uint32_t i; + + for (i = 0; i < b->core.n_cigar; i++) { + char c = bam_cigar_opchr(cigar[i]); + + if (c == 'S' || c == 'H') { // clips + clipped += bam_cigar_oplen(cigar[i]); + } else { + break; + } + } + + return b->core.pos - clipped + 1; +} + +/* Calculate the mate's unclipped start based on position and cigar string from MC tag. */ +hts_pos_t unclipped_other_start(hts_pos_t op, char *cigar) { + char *c = cigar; + int64_t clipped = 0; + + while (*c && *c != '*') { + long num = 0; + + if (isdigit((int)*c)) { + num = strtol(c, &c, 10); + } else { + num = 1; + } + + if (*c == 'S' || *c == 'H') { // clips + clipped += num; + } else { + break; + } + + c++; + } + + return op - clipped + 1; +} + +/* Calculate the current read's end based on the stored cigar string. */ +hts_pos_t unclipped_end(bam1_t *b) { + uint32_t *cigar = bam_get_cigar(b); + hts_pos_t end_pos, clipped = 0; + int32_t i; + + end_pos = bam_endpos(b); + + // now get the clipped end bases (if any) + // if we get to the beginning of the cigar string + // without hitting a non-clip then the results are meaningless + for (i = b->core.n_cigar - 1; i >= 0; i--) { + char c = bam_cigar_opchr(cigar[i]); + + if (c == 'S' || c == 'H') { // clips + clipped += bam_cigar_oplen(cigar[i]); + } else { + break; + } + } + + return end_pos + clipped; +} + + +/* Calculate the mate's unclipped end based on start position and cigar string from MC tag.*/ +hts_pos_t unclipped_other_end(int64_t op, char *cigar) { + char *c = cigar; + int64_t refpos = 0; + int skip = 1; + + while (*c && *c != '*') { + long num = 0; + + if (isdigit((int)*c)) { + num = strtol(c, &c, 10); + } else { + num = 1; + } + + switch (*c) { + case 'M': + case 'D': + case 'N': + case '=': + case 'X': + refpos += num; + skip = 0; // ignore initial clips + break; + + case 'S': + case 'H': + if (!skip) { + refpos += num; + } + break; + } + + c++; + } + + 
return op + refpos; +} diff --git a/samtools/bam.c.pysam.c b/samtools/bam.c.pysam.c index 1fdd279..4f235ed 100644 --- a/samtools/bam.c.pysam.c +++ b/samtools/bam.c.pysam.c @@ -2,7 +2,7 @@ /* bam.c -- miscellaneous BAM functions. - Copyright (C) 2008-2013, 2015, 2019-2020 Genome Research Ltd. + Copyright (C) 2008-2013, 2015, 2019-2020, 2022 Genome Research Ltd. Portions copyright (C) 2009-2012 Broad Institute. Author: Heng Li @@ -158,3 +158,112 @@ rmB_err: b->core.flag |= BAM_FUNMAP; return -1; } + +/* Calculate the current read's start based on the stored cigar string. */ +hts_pos_t unclipped_start(bam1_t *b) { + uint32_t *cigar = bam_get_cigar(b); + int64_t clipped = 0; + uint32_t i; + + for (i = 0; i < b->core.n_cigar; i++) { + char c = bam_cigar_opchr(cigar[i]); + + if (c == 'S' || c == 'H') { // clips + clipped += bam_cigar_oplen(cigar[i]); + } else { + break; + } + } + + return b->core.pos - clipped + 1; +} + +/* Calculate the mate's unclipped start based on position and cigar string from MC tag. */ +hts_pos_t unclipped_other_start(hts_pos_t op, char *cigar) { + char *c = cigar; + int64_t clipped = 0; + + while (*c && *c != '*') { + long num = 0; + + if (isdigit((int)*c)) { + num = strtol(c, &c, 10); + } else { + num = 1; + } + + if (*c == 'S' || *c == 'H') { // clips + clipped += num; + } else { + break; + } + + c++; + } + + return op - clipped + 1; +} + +/* Calculate the current read's end based on the stored cigar string. 
*/ +hts_pos_t unclipped_end(bam1_t *b) { + uint32_t *cigar = bam_get_cigar(b); + hts_pos_t end_pos, clipped = 0; + int32_t i; + + end_pos = bam_endpos(b); + + // now get the clipped end bases (if any) + // if we get to the beginning of the cigar string + // without hitting a non-clip then the results are meaningless + for (i = b->core.n_cigar - 1; i >= 0; i--) { + char c = bam_cigar_opchr(cigar[i]); + + if (c == 'S' || c == 'H') { // clips + clipped += bam_cigar_oplen(cigar[i]); + } else { + break; + } + } + + return end_pos + clipped; +} + + +/* Calculate the mate's unclipped end based on start position and cigar string from MC tag.*/ +hts_pos_t unclipped_other_end(int64_t op, char *cigar) { + char *c = cigar; + int64_t refpos = 0; + int skip = 1; + + while (*c && *c != '*') { + long num = 0; + + if (isdigit((int)*c)) { + num = strtol(c, &c, 10); + } else { + num = 1; + } + + switch (*c) { + case 'M': + case 'D': + case 'N': + case '=': + case 'X': + refpos += num; + skip = 0; // ignore initial clips + break; + + case 'S': + case 'H': + if (!skip) { + refpos += num; + } + break; + } + + c++; + } + + return op + refpos; +} diff --git a/samtools/bam.h b/samtools/bam.h index 6e1c0d5..c0b43d5 100644 --- a/samtools/bam.h +++ b/samtools/bam.h @@ -31,4 +31,9 @@ int bam_remove_B(bam1_t *b); const char *bam_get_library(sam_hdr_t *header, const bam1_t *b); +hts_pos_t unclipped_start(bam1_t *b); +hts_pos_t unclipped_other_start(hts_pos_t op, char *cigar); +hts_pos_t unclipped_end(bam1_t *b); +hts_pos_t unclipped_other_end(int64_t op, char *cigar); + #endif diff --git a/samtools/bam2depth.c b/samtools/bam2depth.c index 5941f55..098d3ae 100644 --- a/samtools/bam2depth.c +++ b/samtools/bam2depth.c @@ -1,7 +1,7 @@ /* bam2depth.c -- depth subcommand. Copyright (C) 2011, 2012 Broad Institute. - Copyright (C) 2012-2016, 2018, 2019-2021 Genome Research Ltd. + Copyright (C) 2012-2016, 2018, 2019-2022 Genome Research Ltd. 
Author: Heng Li (to 2020) Author: James Bonfield (2021 rewrite) @@ -724,7 +724,7 @@ static void usage_exit(FILE *fp, int exit_status) fprintf(fp, " -H Print a file header\n"); fprintf(fp, " -J Include reads with deletions in depth computation\n"); fprintf(fp, " -s Do not count overlapping reads within a template\n"); - sam_global_opt_help(fp, "-.---@-."); + sam_global_opt_help(fp, "-.--.@-."); exit(exit_status); } @@ -756,7 +756,7 @@ int main_depth(int argc, char *argv[]) {"min-mq", required_argument, NULL, 'Q'}, {"min-BQ", required_argument, NULL, 'q'}, {"min-bq", required_argument, NULL, 'q'}, - SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', '-', '@'), + SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0, '@'), {NULL, 0, NULL, 0} }; diff --git a/samtools/bam2depth.c.pysam.c b/samtools/bam2depth.c.pysam.c index edf4281..e713822 100644 --- a/samtools/bam2depth.c.pysam.c +++ b/samtools/bam2depth.c.pysam.c @@ -3,7 +3,7 @@ /* bam2depth.c -- depth subcommand. Copyright (C) 2011, 2012 Broad Institute. - Copyright (C) 2012-2016, 2018, 2019-2021 Genome Research Ltd. + Copyright (C) 2012-2016, 2018, 2019-2022 Genome Research Ltd. 
Author: Heng Li (to 2020) Author: James Bonfield (2021 rewrite) @@ -726,7 +726,7 @@ static void usage_exit(FILE *fp, int exit_status) fprintf(fp, " -H Print a file header\n"); fprintf(fp, " -J Include reads with deletions in depth computation\n"); fprintf(fp, " -s Do not count overlapping reads within a template\n"); - sam_global_opt_help(fp, "-.---@-."); + sam_global_opt_help(fp, "-.--.@-."); samtools_exit(exit_status); } @@ -758,7 +758,7 @@ int main_depth(int argc, char *argv[]) {"min-mq", required_argument, NULL, 'Q'}, {"min-BQ", required_argument, NULL, 'q'}, {"min-bq", required_argument, NULL, 'q'}, - SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', '-', '@'), + SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0, '@'), {NULL, 0, NULL, 0} }; diff --git a/samtools/bam_fastq.c b/samtools/bam_fastq.c index ccc1f17..c17821d 100644 --- a/samtools/bam_fastq.c +++ b/samtools/bam_fastq.c @@ -77,7 +77,7 @@ static void bam2fq_usage(FILE *to, const char *command) " -t copy RG, BC and QT tags to the %s header line\n", fq ? "FASTQ" : "FASTA"); fprintf(to, -" -T TAGLIST copy arbitrary tags to the %s header line\n", +" -T TAGLIST copy arbitrary tags to the %s header line, '*' for all\n", fq ? 
"FASTQ" : "FASTA"); if (fq) fprintf(to, " -v INT default quality score if not given in file [1]\n" @@ -349,17 +349,21 @@ void set_sam_opts(samFile *fp, bam2fq_state_t *state, hts_set_opt(fp, FASTQ_OPT_BARCODE, opts->barcode_tag); - kstring_t tag_list = {0,0}; - if (state->copy_tags) - kputs("RG,BC,QT", &tag_list); - if (opts->extra_tags) { + if (opts->extra_tags && (*opts->extra_tags == '*' || *opts->extra_tags == '\0')) + hts_set_opt(fp, FASTQ_OPT_AUX, NULL); + else { + kstring_t tag_list = {0,0}; + if (state->copy_tags) + kputs("RG,BC,QT", &tag_list); + if (opts->extra_tags) { + if (tag_list.l) + kputc(',', &tag_list); + kputs(opts->extra_tags, &tag_list); + } if (tag_list.l) - kputc(',', &tag_list); - kputs(opts->extra_tags, &tag_list); + hts_set_opt(fp, FASTQ_OPT_AUX, tag_list.s); + ks_free(&tag_list); } - if (tag_list.l) - hts_set_opt(fp, FASTQ_OPT_AUX, tag_list.s); - ks_free(&tag_list); } // Open a file as normal or gzipped based on filename. diff --git a/samtools/bam_fastq.c.pysam.c b/samtools/bam_fastq.c.pysam.c index 55013ed..fbe65fb 100644 --- a/samtools/bam_fastq.c.pysam.c +++ b/samtools/bam_fastq.c.pysam.c @@ -79,7 +79,7 @@ static void bam2fq_usage(FILE *to, const char *command) " -t copy RG, BC and QT tags to the %s header line\n", fq ? "FASTQ" : "FASTA"); fprintf(to, -" -T TAGLIST copy arbitrary tags to the %s header line\n", +" -T TAGLIST copy arbitrary tags to the %s header line, '*' for all\n", fq ? 
"FASTQ" : "FASTA"); if (fq) fprintf(to, " -v INT default quality score if not given in file [1]\n" @@ -351,17 +351,21 @@ void set_sam_opts(samFile *fp, bam2fq_state_t *state, hts_set_opt(fp, FASTQ_OPT_BARCODE, opts->barcode_tag); - kstring_t tag_list = {0,0}; - if (state->copy_tags) - kputs("RG,BC,QT", &tag_list); - if (opts->extra_tags) { + if (opts->extra_tags && (*opts->extra_tags == '*' || *opts->extra_tags == '\0')) + hts_set_opt(fp, FASTQ_OPT_AUX, NULL); + else { + kstring_t tag_list = {0,0}; + if (state->copy_tags) + kputs("RG,BC,QT", &tag_list); + if (opts->extra_tags) { + if (tag_list.l) + kputc(',', &tag_list); + kputs(opts->extra_tags, &tag_list); + } if (tag_list.l) - kputc(',', &tag_list); - kputs(opts->extra_tags, &tag_list); + hts_set_opt(fp, FASTQ_OPT_AUX, tag_list.s); + ks_free(&tag_list); } - if (tag_list.l) - hts_set_opt(fp, FASTQ_OPT_AUX, tag_list.s); - ks_free(&tag_list); } // Open a file as normal or gzipped based on filename. diff --git a/samtools/bam_import.c b/samtools/bam_import.c index 6a25914..47cb125 100644 --- a/samtools/bam_import.c +++ b/samtools/bam_import.c @@ -252,6 +252,12 @@ static int import_fastq(int argc, char **argv, opts_t *opts) { hdr_out = sam_hdr_init(); } + // Add a version line with the sort order to the output header + if (sam_hdr_add_line(hdr_out, "HD", "VN", SAM_FORMAT_VERSION, "SO", "unsorted", "GO", "query", NULL) < 0) { + fprintf(stderr, "Could not set SO and GO in the header.\n"); + goto err; + } + // Read group if (opts->rg_line) { if (*opts->rg_line != '@') diff --git a/samtools/bam_import.c.pysam.c b/samtools/bam_import.c.pysam.c index c66f7c8..76b61a4 100644 --- a/samtools/bam_import.c.pysam.c +++ b/samtools/bam_import.c.pysam.c @@ -254,6 +254,12 @@ static int import_fastq(int argc, char **argv, opts_t *opts) { hdr_out = sam_hdr_init(); } + // Add a version line with the sort order to the output header + if (sam_hdr_add_line(hdr_out, "HD", "VN", SAM_FORMAT_VERSION, "SO", "unsorted", "GO", "query", NULL) < 0) 
{ + fprintf(samtools_stderr, "Could not set SO and GO in the header.\n"); + goto err; + } + // Read group if (opts->rg_line) { if (*opts->rg_line != '@') diff --git a/samtools/bam_index.c b/samtools/bam_index.c index 84527c4..f7c3358 100644 --- a/samtools/bam_index.c +++ b/samtools/bam_index.c @@ -28,10 +28,10 @@ DEALINGS IN THE SOFTWARE. */ #include #include +#include #include #include #include -#define __STDC_FORMAT_MACROS #include #include #include @@ -44,63 +44,97 @@ DEALINGS IN THE SOFTWARE. */ static void index_usage(FILE *fp) { fprintf(fp, -"Usage: samtools index [-bc] [-m INT] [out.index]\n" +"Usage: samtools index -M [-bc] [-m INT] ...\n" +" or: samtools index [-bc] [-m INT] [out.index]\n" "Options:\n" " -b Generate BAI-format index for BAM files [default]\n" " -c Generate CSI-format index for BAM files\n" " -m INT Set minimum interval size for CSI indices to 2^INT [%d]\n" +" -M Interpret all filename arguments as files to be indexed\n" +" -o FILE Write index to FILE [alternative to as an argument]\n" " -@ INT Sets the number of threads [none]\n", BAM_LIDX_SHIFT); } +// Returns 1 if the file does not exist or can be positively +// identified as an index file. 
+static int nonexistent_or_index(const char *fn) +{ + int ret1, ret2; + htsFormat fmt; + hFILE *fp = hopen(fn, "r"); + if (fp == NULL) return 1; + + ret1 = hts_detect_format2(fp, fn, &fmt); + ret2 = hclose(fp); + if (ret1 < 0 || ret2 < 0) return 0; + + return fmt.category == index_file; +} + int bam_index(int argc, char *argv[]) { int csi = 0; int min_shift = BAM_LIDX_SHIFT; + int multiple = 0; int n_threads = 0; - int c, ret; + int n_files, c, i, ret; + const char *fn_idx = NULL; - while ((c = getopt(argc, argv, "bcm:@:")) >= 0) + while ((c = getopt(argc, argv, "bcm:Mo:@:")) >= 0) switch (c) { case 'b': csi = 0; break; case 'c': csi = 1; break; case 'm': csi = 1; min_shift = atoi(optarg); break; + case 'M': multiple = 1; break; + case 'o': fn_idx = optarg; break; case '@': n_threads = atoi(optarg); break; default: index_usage(stderr); return 1; } - if (optind == argc) { - index_usage(stdout); - return 1; - } + n_files = argc - optind; - ret = sam_index_build3(argv[optind], argv[optind+1], csi? 
min_shift : 0, n_threads); - switch (ret) { - case 0: + if (n_files == 0) { + index_usage(stdout); return 0; + } - case -2: - print_error_errno("index", "failed to open \"%s\"", argv[optind]); - break; + // Handle legacy synopsis + if (n_files == 2 && !fn_idx && nonexistent_or_index(argv[optind+1])) { + n_files = 1; + fn_idx = argv[optind+1]; + } - case -3: - print_error("index", "\"%s\" is in a format that cannot be usefully indexed", argv[optind]); - break; + if (n_files > 1 && !multiple) { + print_error("index", "use -M to enable indexing more than one alignment file"); + return EXIT_FAILURE; + } - case -4: - if (argv[optind+1]) - print_error("index", "failed to create or write index \"%s\"", argv[optind+1]); - else - print_error("index", "failed to create or write index"); - break; + if (fn_idx && n_files > 1) { + // TODO In future we may allow %* placeholders or similar + print_error("index", "can't use -o with multiple input alignment files"); + return EXIT_FAILURE; + } - default: - print_error_errno("index", "failed to create index for \"%s\"", argv[optind]); - break; + for (i = optind; i < optind + n_files; i++) { + ret = sam_index_build3(argv[i], fn_idx, csi? min_shift : 0, n_threads); + if (ret < 0) { + if (ret == -2) + print_error_errno("index", "failed to open \"%s\"", argv[i]); + else if (ret == -3) + print_error("index", "\"%s\" is in a format that cannot be usefully indexed", argv[i]); + else if (ret == -4 && fn_idx) + print_error("index", "failed to create or write index \"%s\"", fn_idx); + else if (ret == -4) + print_error("index", "failed to create or write index"); + else + print_error_errno("index", "failed to create index for \"%s\"", argv[i]); + return EXIT_FAILURE; + } } - return EXIT_FAILURE; + return EXIT_SUCCESS; } /* diff --git a/samtools/bam_index.c.pysam.c b/samtools/bam_index.c.pysam.c index 7b2ee3e..6627cfa 100644 --- a/samtools/bam_index.c.pysam.c +++ b/samtools/bam_index.c.pysam.c @@ -30,10 +30,10 @@ DEALINGS IN THE SOFTWARE. 
*/ #include #include +#include #include #include #include -#define __STDC_FORMAT_MACROS #include #include #include @@ -46,63 +46,97 @@ DEALINGS IN THE SOFTWARE. */ static void index_usage(FILE *fp) { fprintf(fp, -"Usage: samtools index [-bc] [-m INT] [out.index]\n" +"Usage: samtools index -M [-bc] [-m INT] ...\n" +" or: samtools index [-bc] [-m INT] [out.index]\n" "Options:\n" " -b Generate BAI-format index for BAM files [default]\n" " -c Generate CSI-format index for BAM files\n" " -m INT Set minimum interval size for CSI indices to 2^INT [%d]\n" +" -M Interpret all filename arguments as files to be indexed\n" +" -o FILE Write index to FILE [alternative to as an argument]\n" " -@ INT Sets the number of threads [none]\n", BAM_LIDX_SHIFT); } +// Returns 1 if the file does not exist or can be positively +// identified as an index file. +static int nonexistent_or_index(const char *fn) +{ + int ret1, ret2; + htsFormat fmt; + hFILE *fp = hopen(fn, "r"); + if (fp == NULL) return 1; + + ret1 = hts_detect_format2(fp, fn, &fmt); + ret2 = hclose(fp); + if (ret1 < 0 || ret2 < 0) return 0; + + return fmt.category == index_file; +} + int bam_index(int argc, char *argv[]) { int csi = 0; int min_shift = BAM_LIDX_SHIFT; + int multiple = 0; int n_threads = 0; - int c, ret; + int n_files, c, i, ret; + const char *fn_idx = NULL; - while ((c = getopt(argc, argv, "bcm:@:")) >= 0) + while ((c = getopt(argc, argv, "bcm:Mo:@:")) >= 0) switch (c) { case 'b': csi = 0; break; case 'c': csi = 1; break; case 'm': csi = 1; min_shift = atoi(optarg); break; + case 'M': multiple = 1; break; + case 'o': fn_idx = optarg; break; case '@': n_threads = atoi(optarg); break; default: index_usage(samtools_stderr); return 1; } - if (optind == argc) { - index_usage(samtools_stdout); - return 1; - } + n_files = argc - optind; - ret = sam_index_build3(argv[optind], argv[optind+1], csi? 
min_shift : 0, n_threads); - switch (ret) { - case 0: + if (n_files == 0) { + index_usage(samtools_stdout); return 0; + } - case -2: - print_error_errno("index", "failed to open \"%s\"", argv[optind]); - break; + // Handle legacy synopsis + if (n_files == 2 && !fn_idx && nonexistent_or_index(argv[optind+1])) { + n_files = 1; + fn_idx = argv[optind+1]; + } - case -3: - print_error("index", "\"%s\" is in a format that cannot be usefully indexed", argv[optind]); - break; + if (n_files > 1 && !multiple) { + print_error("index", "use -M to enable indexing more than one alignment file"); + return EXIT_FAILURE; + } - case -4: - if (argv[optind+1]) - print_error("index", "failed to create or write index \"%s\"", argv[optind+1]); - else - print_error("index", "failed to create or write index"); - break; + if (fn_idx && n_files > 1) { + // TODO In future we may allow %* placeholders or similar + print_error("index", "can't use -o with multiple input alignment files"); + return EXIT_FAILURE; + } - default: - print_error_errno("index", "failed to create index for \"%s\"", argv[optind]); - break; + for (i = optind; i < optind + n_files; i++) { + ret = sam_index_build3(argv[i], fn_idx, csi? 
min_shift : 0, n_threads); + if (ret < 0) { + if (ret == -2) + print_error_errno("index", "failed to open \"%s\"", argv[i]); + else if (ret == -3) + print_error("index", "\"%s\" is in a format that cannot be usefully indexed", argv[i]); + else if (ret == -4 && fn_idx) + print_error("index", "failed to create or write index \"%s\"", fn_idx); + else if (ret == -4) + print_error("index", "failed to create or write index"); + else + print_error_errno("index", "failed to create index for \"%s\"", argv[i]); + return EXIT_FAILURE; + } } - return EXIT_FAILURE; + return EXIT_SUCCESS; } /* diff --git a/samtools/bam_markdup.c b/samtools/bam_markdup.c index 83e8f73..be9b195 100644 --- a/samtools/bam_markdup.c +++ b/samtools/bam_markdup.c @@ -1,7 +1,7 @@ /* bam_markdup.c -- Mark duplicates from a coord sorted file that has gone through fixmates with the mate scoring option on. - Copyright (C) 2017-2021 Genome Research Ltd. + Copyright (C) 2017-2022 Genome Research Ltd. Author: Andrew Whitwham @@ -46,6 +46,7 @@ Copyright (c) 2009,2018 The Broad Institute. MIT license. 
#include "htslib/klist.h" #include "htslib/kstring.h" #include "tmp_file.h" +#include "bam.h" typedef struct { @@ -71,6 +72,8 @@ typedef struct { int rgx_x; int rgx_y; int rgx_t; + char *barcode; + regex_t *bc_rgx; } md_param_t; typedef struct { @@ -78,6 +81,7 @@ typedef struct { hts_pos_t other_coord; int32_t this_ref; int32_t other_ref; + int32_t barcode; int8_t single; int8_t leftmost; int8_t orientation; @@ -126,15 +130,16 @@ static khint_t hash_key(key_data_t key) { khint_t hash; if (key.single) { - unsigned char sig[13]; + unsigned char sig[17]; memcpy(sig + i, &key.this_ref, 4); i += 4; memcpy(sig + i, &key.this_coord, 8); i += 8; memcpy(sig + i, &key.orientation, 1); i += 1; + memcpy(sig + i, &key.barcode, 4); i += 4; hash = do_hash(sig, i); } else { - unsigned char sig[26]; + unsigned char sig[30]; memcpy(sig + i, &key.this_ref, 4); i += 4; memcpy(sig + i, &key.this_coord, 8); i += 8; @@ -142,6 +147,7 @@ static khint_t hash_key(key_data_t key) { memcpy(sig + i, &key.other_coord, 8); i += 8; memcpy(sig + i, &key.leftmost, 1); i += 1; memcpy(sig + i, &key.orientation, 1); i += 1; + memcpy(sig + i, &key.barcode, 4); i += 4; hash = do_hash(sig, i); } @@ -161,6 +167,8 @@ static int key_equal(key_data_t a, key_data_t b) { match = 0; else if (a.single != b.single) match = 0; + else if (a.barcode != b.barcode) + match = 0; if (!a.single) { if (a.other_coord != b.other_coord) @@ -199,123 +207,6 @@ KHASH_INIT(reads, key_data_t, in_hash_t, 1, hash_key, key_equal) // read map has KLIST_INIT(read_queue, read_queue_t, __free_queue_element) // the reads buffer KHASH_MAP_INIT_STR(duplicates, dup_map_t) // map of duplicates for supplementary dup id - -/* Calculate the mate's unclipped start based on position and cigar string from MC tag. 
*/ - -static hts_pos_t unclipped_other_start(hts_pos_t op, char *cigar) { - char *c = cigar; - int64_t clipped = 0; - - while (*c && *c != '*') { - long num = 0; - - if (isdigit((int)*c)) { - num = strtol(c, &c, 10); - } else { - num = 1; - } - - if (*c == 'S' || *c == 'H') { // clips - clipped += num; - } else { - break; - } - - c++; - } - - return op - clipped + 1; -} - - -/* Calculate the current read's start based on the stored cigar string. */ - -static hts_pos_t unclipped_start(bam1_t *b) { - uint32_t *cigar = bam_get_cigar(b); - int64_t clipped = 0; - uint32_t i; - - for (i = 0; i < b->core.n_cigar; i++) { - char c = bam_cigar_opchr(cigar[i]); - - if (c == 'S' || c == 'H') { // clips - clipped += bam_cigar_oplen(cigar[i]); - } else { - break; - } - } - - return b->core.pos - clipped + 1; -} - - -/* Calculate the mate's unclipped end based on start position and cigar string from MC tag.*/ - -static hts_pos_t unclipped_other_end(int64_t op, char *cigar) { - char *c = cigar; - int64_t refpos = 0; - int skip = 1; - - while (*c && *c != '*') { - long num = 0; - - if (isdigit((int)*c)) { - num = strtol(c, &c, 10); - } else { - num = 1; - } - - switch (*c) { - case 'M': - case 'D': - case 'N': - case '=': - case 'X': - refpos += num; - skip = 0; // ignore initial clips - break; - - case 'S': - case 'H': - if (!skip) { - refpos += num; - } - break; - } - - c++; - } - - return op + refpos; -} - - -/* Calculate the current read's end based on the stored cigar string. 
*/ - -static hts_pos_t unclipped_end(bam1_t *b) { - uint32_t *cigar = bam_get_cigar(b); - hts_pos_t end_pos, clipped = 0; - int32_t i; - - end_pos = bam_endpos(b); - - // now get the clipped end bases (if any) - // if we get to the beginning of the cigar string - // without hitting a non-clip then the results are meaningless - for (i = b->core.n_cigar - 1; i >= 0; i--) { - char c = bam_cigar_opchr(cigar[i]); - - if (c == 'S' || c == 'H') { // clips - clipped += bam_cigar_oplen(cigar[i]); - } else { - break; - } - } - - return end_pos + clipped; -} - - /* The Bob Jenkins one_at_a_time hash to reduce the key to a 32 bit value. */ static khint32_t do_hash(unsigned char *key, khint32_t len) { @@ -373,12 +264,14 @@ static int64_t calc_score(bam1_t *b) the reference id, orientation and whether the current read is leftmost of the pair. */ -static int make_pair_key_template(key_data_t *key, bam1_t *bam) { - hts_pos_t this_coord, other_coord, this_end, other_end; - int32_t this_ref, other_ref; - int8_t orientation, leftmost; + +static int make_pair_key(md_param_t *param, key_data_t *key, bam1_t *bam, long *warnings) { + hts_pos_t this_coord, this_end, other_coord, other_end, leftmost; + int32_t this_ref, other_ref, barcode = 0; + int8_t orientation, left_read; uint8_t *data; - char *cig; + char *cig, *bar; + long incoming_warnings = *warnings; this_ref = bam->core.tid + 1; // avoid a 0 being put into the hash other_ref = bam->core.mtid + 1; @@ -400,191 +293,163 @@ static int make_pair_key_template(key_data_t *key, bam1_t *bam) { } // work out orientations - if (this_ref != other_ref) { - leftmost = this_ref < other_ref; - } else { - if (bam_is_rev(bam) == bam_is_mrev(bam)) { - if (!bam_is_rev(bam)) { - leftmost = this_coord <= other_coord; - } else { - leftmost = this_end <= other_end; - } + if (param->mode == MD_MODE_TEMPLATE) { + + if (this_ref != other_ref) { + leftmost = this_ref < other_ref; } else { - if (bam_is_rev(bam)) { - leftmost = this_end <= other_coord; + if 
(bam_is_rev(bam) == bam_is_mrev(bam)) { + if (!bam_is_rev(bam)) { + leftmost = this_coord <= other_coord; + } else { + leftmost = this_end <= other_end; + } } else { - leftmost = this_coord <= other_end; + if (bam_is_rev(bam)) { + leftmost = this_end <= other_coord; + } else { + leftmost = this_coord <= other_end; + } } } - } - // pair orientation - if (leftmost) { - if (bam_is_rev(bam) == bam_is_mrev(bam)) { - other_coord = other_end; + // pair orientation + if (leftmost) { + if (bam_is_rev(bam) == bam_is_mrev(bam)) { + other_coord = other_end; - if (!bam_is_rev(bam)) { - if (bam->core.flag & BAM_FREAD1) { - orientation = O_FF; + if (!bam_is_rev(bam)) { + if (bam->core.flag & BAM_FREAD1) { + orientation = O_FF; + } else { + orientation = O_RR; + } } else { - orientation = O_RR; + if (bam->core.flag & BAM_FREAD1) { + orientation = O_RR; + } else { + orientation = O_FF; + } } } else { - if (bam->core.flag & BAM_FREAD1) { - orientation = O_RR; + if (!bam_is_rev(bam)) { + orientation = O_FR; + other_coord = other_end; } else { - orientation = O_FF; + orientation = O_RF; + this_coord = this_end; } } } else { - if (!bam_is_rev(bam)) { - orientation = O_FR; - other_coord = other_end; - } else { - orientation = O_RF; + if (bam_is_rev(bam) == bam_is_mrev(bam)) { this_coord = this_end; - } - } - } else { - if (bam_is_rev(bam) == bam_is_mrev(bam)) { - this_coord = this_end; - if (!bam_is_rev(bam)) { - if (bam->core.flag & BAM_FREAD1) { - orientation = O_RR; + if (!bam_is_rev(bam)) { + if (bam->core.flag & BAM_FREAD1) { + orientation = O_RR; + } else { + orientation = O_FF; + } } else { - orientation = O_FF; + if (bam->core.flag & BAM_FREAD1) { + orientation = O_FF; + } else { + orientation = O_RR; + } } } else { - if (bam->core.flag & BAM_FREAD1) { - orientation = O_FF; + if (!bam_is_rev(bam)) { + orientation = O_RF; + other_coord = other_end; } else { - orientation = O_RR; + orientation = O_FR; + this_coord = this_end; } } - } else { - if (!bam_is_rev(bam)) { - orientation 
= O_RF; - other_coord = other_end; - } else { - orientation = O_FR; - this_coord = this_end; - } } - } - - if (!leftmost) - leftmost = R_RI; - else - leftmost = R_LE; + } else { // MD_MODE_SEQUENCE - key->single = 0; - key->this_ref = this_ref; - key->this_coord = this_coord; - key->other_ref = other_ref; - key->other_coord = other_coord; - key->leftmost = leftmost; - key->orientation = orientation; - - return 0; -} - - -static int make_pair_key_sequence(key_data_t *key, bam1_t *bam) { - hts_pos_t this_coord, this_end, other_coord, other_end, leftmost; - int32_t this_ref, other_ref; - int8_t orientation, left_read; - uint8_t *data; - char *cig; - - this_ref = bam->core.tid + 1; // avoid a 0 being put into the hash - other_ref = bam->core.mtid + 1; - - this_coord = unclipped_start(bam); - this_end = unclipped_end(bam); - - if ((data = bam_aux_get(bam, "MC"))) { - if (!(cig = bam_aux2Z(data))) { - fprintf(stderr, "[markdup] error: MC tag wrong type. Please use the MC tag provided by samtools fixmate.\n"); - return 1; - } - - other_end = unclipped_other_end(bam->core.mpos, cig); - other_coord = unclipped_other_start(bam->core.mpos, cig); - } else { - fprintf(stderr, "[markdup] error: no MC tag. 
Please run samtools fixmate on file first.\n"); - return 1; - } - - // work out orientations - if (this_ref != other_ref) { - leftmost = this_ref - other_ref; - } else { - if (bam_is_rev(bam) == bam_is_mrev(bam)) { - if (!bam_is_rev(bam)) { - leftmost = this_coord - other_coord; - } else { - leftmost = this_end - other_end; - } + if (this_ref != other_ref) { + leftmost = this_ref - other_ref; } else { - if (bam_is_rev(bam)) { - leftmost = this_end - other_coord; + if (bam_is_rev(bam) == bam_is_mrev(bam)) { + if (!bam_is_rev(bam)) { + leftmost = this_coord - other_coord; + } else { + leftmost = this_end - other_end; + } } else { - leftmost = this_coord - other_end; + if (bam_is_rev(bam)) { + leftmost = this_end - other_coord; + } else { + leftmost = this_coord - other_end; + } } } - } - if (leftmost < 0) { - leftmost = 1; - } else if (leftmost > 0) { - leftmost = 0; - } else { - // tie breaks + if (leftmost < 0) { + leftmost = 1; + } else if (leftmost > 0) { + leftmost = 0; + } else { + // tie breaks - if (bam->core.pos == bam->core.mpos) { - if (bam->core.flag & BAM_FREAD1) { + if (bam->core.pos == bam->core.mpos) { + if (bam->core.flag & BAM_FREAD1) { + leftmost = 1; + } else { + leftmost = 0; + } + } else if (bam->core.pos < bam->core.mpos) { leftmost = 1; } else { leftmost = 0; } - } else if (bam->core.pos < bam->core.mpos) { - leftmost = 1; - } else { - leftmost = 0; } - } - // pair orientation - if (leftmost) { - if (bam_is_rev(bam) == bam_is_mrev(bam)) { + // pair orientation + if (leftmost) { + if (bam_is_rev(bam) == bam_is_mrev(bam)) { - if (!bam_is_rev(bam)) { - orientation = O_FF; + if (!bam_is_rev(bam)) { + orientation = O_FF; + } else { + orientation = O_RR; + } } else { - orientation = O_RR; + if (!bam_is_rev(bam)) { + orientation = O_FR; + } else { + orientation = O_RF; + } } } else { - if (!bam_is_rev(bam)) { - orientation = O_FR; + if (bam_is_rev(bam) == bam_is_mrev(bam)) { + + if (!bam_is_rev(bam)) { + orientation = O_RR; + } else { + orientation = 
O_FF; + } } else { - orientation = O_RF; + if (!bam_is_rev(bam)) { + orientation = O_RF; + } else { + orientation = O_FR; + } } } - } else { - if (bam_is_rev(bam) == bam_is_mrev(bam)) { - if (!bam_is_rev(bam)) { - orientation = O_RR; - } else { - orientation = O_FF; - } + if (!bam_is_rev(bam)) { + this_coord = unclipped_start(bam); } else { - if (!bam_is_rev(bam)) { - orientation = O_RF; - } else { - orientation = O_FR; - } + this_coord = unclipped_end(bam); + } + + if (!bam_is_mrev(bam)) { + other_coord = unclipped_other_start(bam->core.mpos, cig); + } else { + other_coord = unclipped_other_end(bam->core.mpos, cig); } } @@ -593,16 +458,54 @@ static int make_pair_key_sequence(key_data_t *key, bam1_t *bam) { else left_read = R_LE; - if (!bam_is_rev(bam)) { - this_coord = unclipped_start(bam); - } else { - this_coord = unclipped_end(bam); + if (param->barcode) { + if ((data = bam_aux_get(bam, param->barcode))) { + if (!(bar = bam_aux2Z(data))) { + (*warnings)++; + + if (*warnings <= BMD_WARNING_MAX) { + fprintf(stderr, "[markdup] warning: %s tag wrong type. 
Aux tag needs to be a string type.\n", param->barcode); + } + } else { + barcode = do_hash((unsigned char *)bar, strlen(bar)); + } + } + } else if (param->bc_rgx) { + int result; + regmatch_t matches[3]; + size_t max_matches = 2; + char *qname = bam_get_qname(bam); + + if ((result = regexec(param->bc_rgx, qname, max_matches, matches, 0)) == 0) { + int bc_start, bc_end; + + bc_start = matches[1].rm_so; + bc_end = matches[1].rm_eo; + + if (bc_start != -1) { + barcode = do_hash((unsigned char *)qname + bc_start, bc_end - bc_start); + } else { + (*warnings)++; + + if (*warnings <= BMD_WARNING_MAX) { + fprintf(stderr, "[markdup] warning: barcode regex unable to match substring on %s.\n", qname); + } + } + } else { + (*warnings)++; + + if (*warnings <= BMD_WARNING_MAX) { + char warn_msg[256]; + + regerror(result, param->bc_rgx, warn_msg, 256); + fprintf(stderr, "[markdup] warning: barcode regex match error \"%s\" on %s.\n", warn_msg, qname); + } + } } - if (!bam_is_mrev(bam)) { - other_coord = unclipped_other_start(bam->core.mpos, cig); - } else { - other_coord = unclipped_other_end(bam->core.mpos, cig); + if ((*warnings == BMD_WARNING_MAX) && (incoming_warnings != *warnings)) { + fprintf(stderr, "[markdup] warning: %ld barcode read warnings. New warnings will not be reported.\n", + *warnings); } key->single = 0; @@ -612,18 +515,23 @@ static int make_pair_key_sequence(key_data_t *key, bam1_t *bam) { key->other_coord = other_coord; key->leftmost = left_read; key->orientation = orientation; + key->barcode = barcode; return 0; } + /* Create a signature hash of single read (or read with an unmatched pair). Uses unclipped start (or end depending on orientation), reference id, and orientation. 
*/ -static void make_single_key(key_data_t *key, bam1_t *bam) { +static void make_single_key(md_param_t *param, key_data_t *key, bam1_t *bam, long *warnings) { hts_pos_t this_coord; - int32_t this_ref; + int32_t this_ref, barcode = 0; int8_t orientation; + uint8_t *data; + char *bar; + long incoming_warnings = *warnings; this_ref = bam->core.tid + 1; // avoid a 0 being put into the hash @@ -635,10 +543,61 @@ static void make_single_key(key_data_t *key, bam1_t *bam) { orientation = O_FF; } + if (param->barcode) { + if ((data = bam_aux_get(bam, param->barcode))) { + if (!(bar = bam_aux2Z(data))) { + (*warnings)++; + + if (*warnings <= BMD_WARNING_MAX) { + fprintf(stderr, "[markdup] warning: %s tag wrong type. Aux tag needs to be a string type.\n", param->barcode); + } + } else { + barcode = do_hash((unsigned char *)bar, strlen(bar)); + } + } + } else if (param->bc_rgx) { + int result; + regmatch_t matches[3]; + size_t max_matches = 2; + char *qname = bam_get_qname(bam); + + if ((result = regexec(param->bc_rgx, qname, max_matches, matches, 0)) == 0) { + int bc_start, bc_end; + + bc_start = matches[1].rm_so; + bc_end = matches[1].rm_eo; + + if (bc_start != -1) { + barcode = do_hash((unsigned char *)qname + bc_start, bc_end - bc_start); + } else { + (*warnings)++; + + if (*warnings <= BMD_WARNING_MAX) { + fprintf(stderr, "[markdup] warning: barcode regex unable to match substring on %s.\n", qname); + } + } + } else { + (*warnings)++; + + if (*warnings <= BMD_WARNING_MAX) { + char warn_msg[256]; + + regerror(result, param->bc_rgx, warn_msg, 256); + fprintf(stderr, "[markdup] warning: barcode regex match error \"%s\" on %s.\n", warn_msg, qname); + } + } + } + + if ((*warnings == BMD_WARNING_MAX) && (incoming_warnings != *warnings)) { + fprintf(stderr, "[markdup] warning: %ld barcode read warnings. 
New warnings will not be reported.\n", + *warnings); + } + key->single = 1; key->this_ref = this_ref; key->this_coord = this_coord; key->orientation = orientation; + key->barcode = barcode; } @@ -688,41 +647,95 @@ static int add_duplicate(khash_t(duplicates) *d_hash, bam1_t *dupe, char *orig_n } -/* Get the position of the coordinates from the read name. */ -static inline int get_coordinate_positions_colons(const char *qname, int *xpos, int *ypos) { +/* Get coordinates from the standard Illumina style read names. + Returned values are of the x and y coordinates and a section of + the read name to test (t) for string equality e.g. lane and tile part. */ + +static int get_coordinates_colons(md_param_t *param, const char *qname, int *t_beg, int *t_end, long *x_coord, long *y_coord, long *warnings) { int sep = 0; int pos = 0; + int xpos = 0, ypos = 0; + char *end; while (qname[pos]) { if (qname[pos] == ':') { sep++; if (sep == 2) { - *xpos = pos + 1; + xpos = pos + 1; } else if (sep == 3) { - *ypos = pos + 1; + ypos = pos + 1; } else if (sep == 4) { // HiSeq style names - *xpos = *ypos; - *ypos = pos + 1; + xpos = ypos; + ypos = pos + 1; } else if (sep == 5) { // Newer Illumina format - *xpos = pos + 1; + xpos = pos + 1; } else if (sep == 6) { - *ypos = pos + 1; + ypos = pos + 1; } } pos++; } - return sep; + /* The most current Illumina read format at time of writing is: + @machine:run:flowcell:lane:tile:x:y:UMI or + @machine:run:flowcell:lane:tile:x:y + + Counting the separating colons gives us a quick format check. + Older name formats have fewer elements. 
+ */ + + if (!(sep == 3 || sep == 4 || sep == 6 || sep == 7)) { + (*warnings)++; + + if (*warnings <= BMD_WARNING_MAX) { + fprintf(stderr, "[markdup] warning: cannot decipher read name %s for optical duplicate marking.\n", qname); + } + + return 1; + } else { + *x_coord = strtol(qname + xpos, &end, 10); + + if ((qname + xpos) == end) { + (*warnings)++; + + if (*warnings <= BMD_WARNING_MAX) { + fprintf(stderr, "[markdup] warning: cannot decipher x coordinate in %s .\n", qname); + } + + return 1; + } + + *y_coord = strtol(qname + ypos, &end, 10); + + if ((qname + ypos) == end) { + (*warnings)++; + + if (*warnings <= BMD_WARNING_MAX) { + fprintf(stderr, "[markdup] warning: cannot decipher y coordinate in %s .\n", qname); + } + + return 1; + } + + *t_beg = 0; + *t_end = xpos; + } + + return 0; } -/* Get the position of the coordinates from the read name. - Positions returned are of the x and y coordinate and an optional section of +/* Get the coordinates from the read name. + Returned values are of the x and y coordinates and an optional section of the read name to test (t) for string equality e.g. lane and tile part. 
*/ -static inline int get_coordinate_positions_regex(md_param_t *param, const char *qname, int *t_beg, int *t_end, int *xpos, int *ypos) { + +static inline int get_coordinates_regex(md_param_t *param, const char *qname, int *t_beg, int *t_end, long *x_coord, long *y_coord, long *warnings) { regmatch_t matches[5]; size_t max_matches = 5; + int xpos, ypos, xend, yend, xlen, ylen; + char coord[255]; + char *end; if (!param->rgx_t) max_matches = 4; @@ -730,8 +743,8 @@ static inline int get_coordinate_positions_regex(md_param_t *param, const char * if (regexec(param->rgx, qname, max_matches, matches, 0)) return -1; - *xpos = matches[param->rgx_x].rm_so; - *ypos = matches[param->rgx_y].rm_so; + xpos = matches[param->rgx_x].rm_so; + ypos = matches[param->rgx_y].rm_so; if (param->rgx_t) { *t_beg = matches[param->rgx_t].rm_so; @@ -740,138 +753,99 @@ static inline int get_coordinate_positions_regex(md_param_t *param, const char * *t_beg = *t_end = 0; } - if (*xpos == -1 || *ypos == -1 || *t_beg == -1) + if (xpos == -1 || ypos == -1 || *t_beg == -1) return -1; - return 7; // 3, 4, 6 and 7 are successes in the previous function -} + xend = matches[param->rgx_x].rm_eo; + yend = matches[param->rgx_y].rm_eo; + if ((xlen = xend - xpos) > 254) { + (*warnings)++; -static int get_coordinate_positions(md_param_t *param, const char *qname, int *beg, int *end, int *xpos, int *ypos, long *warnings) { - int ret = 0; - int seps; + if (*warnings <= BMD_WARNING_MAX) { + fprintf(stderr, "[markdup] warning: x coordinate string longer than allowed qname length in %s (%d long).\n", qname, xlen); + } - if (param->rgx == NULL) { - seps = get_coordinate_positions_colons(qname, xpos, ypos); - *beg = 0; - *end = *xpos; - } else { - seps = get_coordinate_positions_regex(param, qname, beg, end, xpos, ypos); + return 1; } - /* The most current Illumina read format at time of writing is: - @machine:run:flowcell:lane:tile:x:y:UMI or - @machine:run:flowcell:lane:tile:x:y + strncpy(coord, qname + xpos, 
xlen); + coord[xlen] = '\0'; + *x_coord = strtol(coord, &end, 10); - Counting the separating colons gives us a quick format check. - Older name formats have fewer elements. - */ - - if (!(seps == 3 || seps == 4 || seps == 6 || seps == 7)) { + if (coord == end) { (*warnings)++; if (*warnings <= BMD_WARNING_MAX) { - fprintf(stderr, "[markdup] warning: cannot decipher read name %s for optical duplicate marking.\n", qname); + fprintf(stderr, "[markdup] warning: cannot decipher x coordinate in %s (%s).\n", qname, coord); } - ret = 1; + return 1; } - return ret; -} - + if ((ylen = yend - ypos) > 254) { + (*warnings)++; -static int get_coordinates(md_param_t *param, const char *name, int *t_beg, int *t_end, long *x_coord, long *y_coord, long *warnings) { - int ret = 1; - int xpos = 0, ypos = 0; - long x = 0, y = 0; - char *end; + if (*warnings <= BMD_WARNING_MAX) { + fprintf(stderr, "[markdup] warning: y coordinate string longer than allowed qname length in %s (%d long).\n", qname, ylen); + } - if (get_coordinate_positions(param, name, t_beg, t_end, &xpos, &ypos, warnings)) { - return ret; + return 1; } - x = strtol(name + xpos, &end, 10); + strncpy(coord, qname + ypos, ylen); + coord[ylen] = '\0'; + *y_coord = strtol(coord, &end, 10); - if ((name + xpos) == end) { + if (coord == end) { (*warnings)++; if (*warnings <= BMD_WARNING_MAX) { - fprintf(stderr, "[markdup] warning: can not decipher X coordinate in %s .\n", name); + fprintf(stderr, "[markdup] warning: cannot decipher y coordinate in %s (%s).\n", qname, coord); } - return ret; + return 1; } - y = strtol(name + ypos, &end, 10); + return 0; +} - if ((name + ypos) == end) { - (*warnings)++; - if (*warnings <= BMD_WARNING_MAX) { - fprintf(stderr, "[markdup] warning: can not decipher y coordinate in %s .\n", name); - } +static int get_coordinates(md_param_t *param, const char *name, int *t_beg, int *t_end, long *x_coord, long *y_coord, long *warnings) { + int ret = 1; - return ret; + if (param->rgx == NULL) { + ret = 
get_coordinates_colons(param, name, t_beg, t_end, x_coord, y_coord, warnings); + } else { + ret = get_coordinates_regex(param, name, t_beg, t_end, x_coord, y_coord, warnings); } - *x_coord = x; - *y_coord = y; - ret = 0; - return ret; } -/* Using the coordinates from the Illumina read name, see whether the duplicated read is +/* Using the coordinates from the read name, see whether the duplicated read is close enough (set by max_dist) to the original to be counted as optical.*/ -static int optical_duplicate(md_param_t *param, bam1_t *ori, bam1_t *dup, long max_dist, long *warnings) { +static int is_optical_duplicate(md_param_t *param, bam1_t *ori, bam1_t *dup, long max_dist, long *warnings) { int ret = 0; char *original, *duplicate; - int oxpos = 0, oypos = 0, dxpos = 0, dypos = 0; + long ox, oy, dx, dy; int o_beg = 0, o_end = 0, d_beg = 0, d_end = 0; - original = bam_get_qname(ori); duplicate = bam_get_qname(dup); - if (get_coordinate_positions(param, original, &o_beg, &o_end, &oxpos, &oypos, warnings)) { + if (get_coordinates(param, original, &o_beg, &o_end, &ox, &oy, warnings)) { return ret; } - if (get_coordinate_positions(param, duplicate, &d_beg, &d_end, &dxpos, &dypos, warnings)) { + if (get_coordinates(param, duplicate, &d_beg, &d_end, &dx, &dy, warnings)) { return ret; } if (strncmp(original + o_beg, duplicate + d_beg, o_end - o_beg) == 0) { - // the initial parts match, look at the numbers - long ox, oy, dx, dy, xdiff, ydiff; - char *end; - - ox = strtol(original + oxpos, &end, 10); - - if ((original + oxpos) == end) { - (*warnings)++; - - if (*warnings <= BMD_WARNING_MAX) { - fprintf(stderr, "[markdup] warning: can not decipher X coordinate in %s .\n", original); - } - - return ret; - } - - dx = strtol(duplicate + dxpos, &end, 10); - - if ((duplicate + dxpos) == end) { - (*warnings)++; - - if (*warnings <= BMD_WARNING_MAX) { - fprintf(stderr, "[markdup] warning: can not decipher X coordinate in %s.\n", duplicate); - } - - return ret; - } + long xdiff, 
ydiff; if (ox > dx) { xdiff = ox - dx; @@ -882,30 +856,6 @@ static int optical_duplicate(md_param_t *param, bam1_t *ori, bam1_t *dup, long m if (xdiff <= max_dist) { // still might be optical - oy = strtol(original + oypos, &end, 10); - - if ((original + oypos) == end) { - (*warnings)++; - - if (*warnings <= BMD_WARNING_MAX) { - fprintf(stderr, "[markdup] warning: can not decipher Y coordinate in %s.\n", original); - } - - return ret; - } - - dy = strtol(duplicate + dypos, &end, 10); - - if ((duplicate + dypos) == end) { - (*warnings)++; - - if (*warnings <= BMD_WARNING_MAX) { - fprintf(stderr, "[markdup] warning: can not decipher Y coordinate in %s.\n", duplicate); - } - - return ret; - } - if (oy > dy) { ydiff = oy - dy; } else { @@ -985,7 +935,7 @@ static int mark_duplicates(md_param_t *param, khash_t(duplicates) *dup_hash, bam } if (param->opt_dist) { // mark optical duplicates - if (optical_duplicate(param, ori, dup, param->opt_dist, warn)) { + if (is_optical_duplicate(param, ori, dup, param->opt_dist, warn)) { bam_aux_update_str(dup, "dt", 3, "SQ"); dup_type = 'O'; (*optical)++; @@ -1445,7 +1395,7 @@ static int bam_mark_duplicates(md_param_t *param) { int ret; long reading, writing, excluded, duplicate, single, pair, single_dup, examined, optical, single_optical; long np_duplicate, np_opt_duplicate; - long opt_warnings = 0; + long opt_warnings = 0, bc_warnings = 0; tmp_file_t temp; char *idx_fn = NULL; int exclude = 0; @@ -1579,19 +1529,12 @@ static int bam_mark_duplicates(md_param_t *param) { key_data_t single_key; in_hash_t *bp; - if (param->mode) { - if (make_pair_key_sequence(&pair_key, in_read->b)) { - fprintf(stderr, "[markdup] error: unable to assign pair hash key.\n"); - goto fail; - } - } else { - if (make_pair_key_template(&pair_key, in_read->b)) { - fprintf(stderr, "[markdup] error: unable to assign pair hash key.\n"); - goto fail; - } + if (make_pair_key(param, &pair_key, in_read->b, &bc_warnings)) { + fprintf(stderr, "[markdup] error: unable to 
assign pair hash key.\n"); + goto fail; } - make_single_key(&single_key, in_read->b); + make_single_key(param, &single_key, in_read->b, &bc_warnings); pair++; in_read->pos = single_key.this_coord; // cigar/orientation modified pos @@ -1731,7 +1674,7 @@ static int bam_mark_duplicates(md_param_t *param) { key_data_t single_key; in_hash_t *bp; - make_single_key(&single_key, in_read->b); + make_single_key(param, &single_key, in_read->b, &bc_warnings); single++; in_read->pos = single_key.this_coord; // cigar/orientation modified pos @@ -1990,6 +1933,10 @@ static int bam_mark_duplicates(md_param_t *param) { opt_warnings); } + if (bc_warnings) { + fprintf(stderr, "[markdup] warning: number of failed attempts to get barcodes = %ld\n", bc_warnings); + } + if (param->do_stats) { FILE *fp; int file_open = 0; @@ -2095,6 +2042,9 @@ static int markdup_usage(void) { fprintf(stderr, " --read-coords STR Regex for coords from read name.\n"); fprintf(stderr, " --coords-order STR Order of regex elements. txy (default). With t being a part of\n" " the read names that must be equal and x/y being coordinates.\n"); + fprintf(stderr, " --barcode-tag STR Use barcode a tag that duplicates much match.\n"); + fprintf(stderr, " --barcode-name Use the UMI/barcode in the read name (eigth colon delimited part).\n"); + fprintf(stderr, " --barcode-rgx STR Regex for barcode in the readname (alternative to --barcode-name).\n"); fprintf(stderr, " -t Mark primary duplicates with the name of the original in a \'do\' tag." 
" Mainly for information and debugging.\n"); @@ -2108,16 +2058,17 @@ static int markdup_usage(void) { int bam_markdup(int argc, char **argv) { - int c, ret; + int c, ret, bc_name = 0; char wmode[4] = {'w', 'b', 0, 0}; sam_global_args ga = SAM_GLOBAL_ARGS_INIT; htsThreadPool p = {NULL, 0}; kstring_t tmpprefix = {0, 0, NULL}; struct stat st; unsigned int t; - char *regex = NULL; + char *regex = NULL, *bc_regex = NULL; char *regex_order = "txy"; - md_param_t param = {NULL, NULL, NULL, 0, 300, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, NULL, NULL, NULL, NULL, 0, 0, 0}; + md_param_t param = {NULL, NULL, NULL, 0, 300, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 1, NULL, NULL, NULL, NULL, 0, 0, 0, NULL, NULL}; static const struct option lopts[] = { SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0, '@'), @@ -2127,6 +2078,9 @@ int bam_markdup(int argc, char **argv) { {"no-multi-dup", no_argument, NULL, 1003}, {"read-coords", required_argument, NULL, 1004}, {"coords-order", required_argument, NULL, 1005}, + {"barcode-tag", required_argument, NULL, 1006}, + {"barcode-name", no_argument, NULL, 1007}, + {"barcode-rgx", required_argument, NULL, 1008}, {NULL, 0, NULL, 0} }; @@ -2158,6 +2112,9 @@ int bam_markdup(int argc, char **argv) { case 1003: param.check_chain = 0; break; case 1004: regex = optarg; break; case 1005: regex_order = optarg; break; + case 1006: param.barcode = optarg; break; + case 1007: bc_name = 1; break; + case 1008: bc_name = 1, bc_regex = optarg; break; default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; /* else fall-through */ case '?': return markdup_usage(); @@ -2167,6 +2124,12 @@ int bam_markdup(int argc, char **argv) { if (optind + 2 > argc) return markdup_usage(); + if (param.barcode && bc_name) { + fprintf(stderr, "[markdup] Error: cannot specify --barcode-tag and " + "--barcode-name (or --barcode-rgx) at same time.\n"); + return 1; + } + if (param.opt_dist < 0) param.opt_dist = 0; if (param.max_length < 0) param.max_length = 300; @@ -2195,7 +2158,7 @@ int 
bam_markdup(int argc, char **argv) { param.rgx_y = 2; param.rgx_t = 0; } else { - fprintf(stderr, "[markdup] error: could not recognise regex coorindate order \"%s\".\n", regex_order); + fprintf(stderr, "[markdup] error: could not recognise regex coordinate order \"%s\".\n", regex_order); return 1; } @@ -2214,6 +2177,32 @@ int bam_markdup(int argc, char **argv) { } } + if (bc_name) { + int result; + + /* From Illumina UMI documentation: "The UMI sequence is located in the + eighth colon-delimited field of the read name (QNAME)". */ + char *rgx = "[0-9A-Za-z]+:[0-9]+:[0-9]+:[0-9]+:[0-9]+:[0-9]+:[0-9]+:([!-?A-~]+)"; + + if ((param.bc_rgx = malloc(sizeof(regex_t))) == NULL) { + fprintf(stderr, "[markdup] error: could not allocate memory for barcode regex.\n"); + return 1; + } + + if (bc_regex) { + rgx = bc_regex; + } + + if ((result = regcomp(param.bc_rgx, rgx, REG_EXTENDED))) { + char err_msg[256]; + + regerror(result, param.bc_rgx, err_msg, 256); + fprintf(stderr, "[markdup] error: barcode regex error \"%s\"\n", err_msg); + free(param.bc_rgx); + return 1; + } + } + param.in = sam_open_format(argv[optind], "r", &ga.in); if (!param.in) { @@ -2278,6 +2267,11 @@ int bam_markdup(int argc, char **argv) { free(param.rgx); } + if (param.bc_rgx) { + regfree(param.bc_rgx); + free(param.bc_rgx); + } + free(param.arg_list); free(tmpprefix.s); sam_global_args_free(&ga); diff --git a/samtools/bam_markdup.c.pysam.c b/samtools/bam_markdup.c.pysam.c index 06fb361..3c14d8b 100644 --- a/samtools/bam_markdup.c.pysam.c +++ b/samtools/bam_markdup.c.pysam.c @@ -3,7 +3,7 @@ /* bam_markdup.c -- Mark duplicates from a coord sorted file that has gone through fixmates with the mate scoring option on. - Copyright (C) 2017-2021 Genome Research Ltd. + Copyright (C) 2017-2022 Genome Research Ltd. Author: Andrew Whitwham @@ -48,6 +48,7 @@ Copyright (c) 2009,2018 The Broad Institute. MIT license. 
#include "htslib/klist.h" #include "htslib/kstring.h" #include "tmp_file.h" +#include "bam.h" typedef struct { @@ -73,6 +74,8 @@ typedef struct { int rgx_x; int rgx_y; int rgx_t; + char *barcode; + regex_t *bc_rgx; } md_param_t; typedef struct { @@ -80,6 +83,7 @@ typedef struct { hts_pos_t other_coord; int32_t this_ref; int32_t other_ref; + int32_t barcode; int8_t single; int8_t leftmost; int8_t orientation; @@ -128,15 +132,16 @@ static khint_t hash_key(key_data_t key) { khint_t hash; if (key.single) { - unsigned char sig[13]; + unsigned char sig[17]; memcpy(sig + i, &key.this_ref, 4); i += 4; memcpy(sig + i, &key.this_coord, 8); i += 8; memcpy(sig + i, &key.orientation, 1); i += 1; + memcpy(sig + i, &key.barcode, 4); i += 4; hash = do_hash(sig, i); } else { - unsigned char sig[26]; + unsigned char sig[30]; memcpy(sig + i, &key.this_ref, 4); i += 4; memcpy(sig + i, &key.this_coord, 8); i += 8; @@ -144,6 +149,7 @@ static khint_t hash_key(key_data_t key) { memcpy(sig + i, &key.other_coord, 8); i += 8; memcpy(sig + i, &key.leftmost, 1); i += 1; memcpy(sig + i, &key.orientation, 1); i += 1; + memcpy(sig + i, &key.barcode, 4); i += 4; hash = do_hash(sig, i); } @@ -163,6 +169,8 @@ static int key_equal(key_data_t a, key_data_t b) { match = 0; else if (a.single != b.single) match = 0; + else if (a.barcode != b.barcode) + match = 0; if (!a.single) { if (a.other_coord != b.other_coord) @@ -201,123 +209,6 @@ KHASH_INIT(reads, key_data_t, in_hash_t, 1, hash_key, key_equal) // read map has KLIST_INIT(read_queue, read_queue_t, __free_queue_element) // the reads buffer KHASH_MAP_INIT_STR(duplicates, dup_map_t) // map of duplicates for supplementary dup id - -/* Calculate the mate's unclipped start based on position and cigar string from MC tag. 
*/ - -static hts_pos_t unclipped_other_start(hts_pos_t op, char *cigar) { - char *c = cigar; - int64_t clipped = 0; - - while (*c && *c != '*') { - long num = 0; - - if (isdigit((int)*c)) { - num = strtol(c, &c, 10); - } else { - num = 1; - } - - if (*c == 'S' || *c == 'H') { // clips - clipped += num; - } else { - break; - } - - c++; - } - - return op - clipped + 1; -} - - -/* Calculate the current read's start based on the stored cigar string. */ - -static hts_pos_t unclipped_start(bam1_t *b) { - uint32_t *cigar = bam_get_cigar(b); - int64_t clipped = 0; - uint32_t i; - - for (i = 0; i < b->core.n_cigar; i++) { - char c = bam_cigar_opchr(cigar[i]); - - if (c == 'S' || c == 'H') { // clips - clipped += bam_cigar_oplen(cigar[i]); - } else { - break; - } - } - - return b->core.pos - clipped + 1; -} - - -/* Calculate the mate's unclipped end based on start position and cigar string from MC tag.*/ - -static hts_pos_t unclipped_other_end(int64_t op, char *cigar) { - char *c = cigar; - int64_t refpos = 0; - int skip = 1; - - while (*c && *c != '*') { - long num = 0; - - if (isdigit((int)*c)) { - num = strtol(c, &c, 10); - } else { - num = 1; - } - - switch (*c) { - case 'M': - case 'D': - case 'N': - case '=': - case 'X': - refpos += num; - skip = 0; // ignore initial clips - break; - - case 'S': - case 'H': - if (!skip) { - refpos += num; - } - break; - } - - c++; - } - - return op + refpos; -} - - -/* Calculate the current read's end based on the stored cigar string. 
*/ - -static hts_pos_t unclipped_end(bam1_t *b) { - uint32_t *cigar = bam_get_cigar(b); - hts_pos_t end_pos, clipped = 0; - int32_t i; - - end_pos = bam_endpos(b); - - // now get the clipped end bases (if any) - // if we get to the beginning of the cigar string - // without hitting a non-clip then the results are meaningless - for (i = b->core.n_cigar - 1; i >= 0; i--) { - char c = bam_cigar_opchr(cigar[i]); - - if (c == 'S' || c == 'H') { // clips - clipped += bam_cigar_oplen(cigar[i]); - } else { - break; - } - } - - return end_pos + clipped; -} - - /* The Bob Jenkins one_at_a_time hash to reduce the key to a 32 bit value. */ static khint32_t do_hash(unsigned char *key, khint32_t len) { @@ -375,12 +266,14 @@ static int64_t calc_score(bam1_t *b) the reference id, orientation and whether the current read is leftmost of the pair. */ -static int make_pair_key_template(key_data_t *key, bam1_t *bam) { - hts_pos_t this_coord, other_coord, this_end, other_end; - int32_t this_ref, other_ref; - int8_t orientation, leftmost; + +static int make_pair_key(md_param_t *param, key_data_t *key, bam1_t *bam, long *warnings) { + hts_pos_t this_coord, this_end, other_coord, other_end, leftmost; + int32_t this_ref, other_ref, barcode = 0; + int8_t orientation, left_read; uint8_t *data; - char *cig; + char *cig, *bar; + long incoming_warnings = *warnings; this_ref = bam->core.tid + 1; // avoid a 0 being put into the hash other_ref = bam->core.mtid + 1; @@ -402,191 +295,163 @@ static int make_pair_key_template(key_data_t *key, bam1_t *bam) { } // work out orientations - if (this_ref != other_ref) { - leftmost = this_ref < other_ref; - } else { - if (bam_is_rev(bam) == bam_is_mrev(bam)) { - if (!bam_is_rev(bam)) { - leftmost = this_coord <= other_coord; - } else { - leftmost = this_end <= other_end; - } + if (param->mode == MD_MODE_TEMPLATE) { + + if (this_ref != other_ref) { + leftmost = this_ref < other_ref; } else { - if (bam_is_rev(bam)) { - leftmost = this_end <= other_coord; + if 
(bam_is_rev(bam) == bam_is_mrev(bam)) { + if (!bam_is_rev(bam)) { + leftmost = this_coord <= other_coord; + } else { + leftmost = this_end <= other_end; + } } else { - leftmost = this_coord <= other_end; + if (bam_is_rev(bam)) { + leftmost = this_end <= other_coord; + } else { + leftmost = this_coord <= other_end; + } } } - } - // pair orientation - if (leftmost) { - if (bam_is_rev(bam) == bam_is_mrev(bam)) { - other_coord = other_end; + // pair orientation + if (leftmost) { + if (bam_is_rev(bam) == bam_is_mrev(bam)) { + other_coord = other_end; - if (!bam_is_rev(bam)) { - if (bam->core.flag & BAM_FREAD1) { - orientation = O_FF; + if (!bam_is_rev(bam)) { + if (bam->core.flag & BAM_FREAD1) { + orientation = O_FF; + } else { + orientation = O_RR; + } } else { - orientation = O_RR; + if (bam->core.flag & BAM_FREAD1) { + orientation = O_RR; + } else { + orientation = O_FF; + } } } else { - if (bam->core.flag & BAM_FREAD1) { - orientation = O_RR; + if (!bam_is_rev(bam)) { + orientation = O_FR; + other_coord = other_end; } else { - orientation = O_FF; + orientation = O_RF; + this_coord = this_end; } } } else { - if (!bam_is_rev(bam)) { - orientation = O_FR; - other_coord = other_end; - } else { - orientation = O_RF; + if (bam_is_rev(bam) == bam_is_mrev(bam)) { this_coord = this_end; - } - } - } else { - if (bam_is_rev(bam) == bam_is_mrev(bam)) { - this_coord = this_end; - if (!bam_is_rev(bam)) { - if (bam->core.flag & BAM_FREAD1) { - orientation = O_RR; + if (!bam_is_rev(bam)) { + if (bam->core.flag & BAM_FREAD1) { + orientation = O_RR; + } else { + orientation = O_FF; + } } else { - orientation = O_FF; + if (bam->core.flag & BAM_FREAD1) { + orientation = O_FF; + } else { + orientation = O_RR; + } } } else { - if (bam->core.flag & BAM_FREAD1) { - orientation = O_FF; + if (!bam_is_rev(bam)) { + orientation = O_RF; + other_coord = other_end; } else { - orientation = O_RR; + orientation = O_FR; + this_coord = this_end; } } - } else { - if (!bam_is_rev(bam)) { - orientation 
= O_RF; - other_coord = other_end; - } else { - orientation = O_FR; - this_coord = this_end; - } } - } - - if (!leftmost) - leftmost = R_RI; - else - leftmost = R_LE; + } else { // MD_MODE_SEQUENCE - key->single = 0; - key->this_ref = this_ref; - key->this_coord = this_coord; - key->other_ref = other_ref; - key->other_coord = other_coord; - key->leftmost = leftmost; - key->orientation = orientation; - - return 0; -} - - -static int make_pair_key_sequence(key_data_t *key, bam1_t *bam) { - hts_pos_t this_coord, this_end, other_coord, other_end, leftmost; - int32_t this_ref, other_ref; - int8_t orientation, left_read; - uint8_t *data; - char *cig; - - this_ref = bam->core.tid + 1; // avoid a 0 being put into the hash - other_ref = bam->core.mtid + 1; - - this_coord = unclipped_start(bam); - this_end = unclipped_end(bam); - - if ((data = bam_aux_get(bam, "MC"))) { - if (!(cig = bam_aux2Z(data))) { - fprintf(samtools_stderr, "[markdup] error: MC tag wrong type. Please use the MC tag provided by samtools fixmate.\n"); - return 1; - } - - other_end = unclipped_other_end(bam->core.mpos, cig); - other_coord = unclipped_other_start(bam->core.mpos, cig); - } else { - fprintf(samtools_stderr, "[markdup] error: no MC tag. 
Please run samtools fixmate on file first.\n"); - return 1; - } - - // work out orientations - if (this_ref != other_ref) { - leftmost = this_ref - other_ref; - } else { - if (bam_is_rev(bam) == bam_is_mrev(bam)) { - if (!bam_is_rev(bam)) { - leftmost = this_coord - other_coord; - } else { - leftmost = this_end - other_end; - } + if (this_ref != other_ref) { + leftmost = this_ref - other_ref; } else { - if (bam_is_rev(bam)) { - leftmost = this_end - other_coord; + if (bam_is_rev(bam) == bam_is_mrev(bam)) { + if (!bam_is_rev(bam)) { + leftmost = this_coord - other_coord; + } else { + leftmost = this_end - other_end; + } } else { - leftmost = this_coord - other_end; + if (bam_is_rev(bam)) { + leftmost = this_end - other_coord; + } else { + leftmost = this_coord - other_end; + } } } - } - if (leftmost < 0) { - leftmost = 1; - } else if (leftmost > 0) { - leftmost = 0; - } else { - // tie breaks + if (leftmost < 0) { + leftmost = 1; + } else if (leftmost > 0) { + leftmost = 0; + } else { + // tie breaks - if (bam->core.pos == bam->core.mpos) { - if (bam->core.flag & BAM_FREAD1) { + if (bam->core.pos == bam->core.mpos) { + if (bam->core.flag & BAM_FREAD1) { + leftmost = 1; + } else { + leftmost = 0; + } + } else if (bam->core.pos < bam->core.mpos) { leftmost = 1; } else { leftmost = 0; } - } else if (bam->core.pos < bam->core.mpos) { - leftmost = 1; - } else { - leftmost = 0; } - } - // pair orientation - if (leftmost) { - if (bam_is_rev(bam) == bam_is_mrev(bam)) { + // pair orientation + if (leftmost) { + if (bam_is_rev(bam) == bam_is_mrev(bam)) { - if (!bam_is_rev(bam)) { - orientation = O_FF; + if (!bam_is_rev(bam)) { + orientation = O_FF; + } else { + orientation = O_RR; + } } else { - orientation = O_RR; + if (!bam_is_rev(bam)) { + orientation = O_FR; + } else { + orientation = O_RF; + } } } else { - if (!bam_is_rev(bam)) { - orientation = O_FR; + if (bam_is_rev(bam) == bam_is_mrev(bam)) { + + if (!bam_is_rev(bam)) { + orientation = O_RR; + } else { + orientation = 
O_FF; + } } else { - orientation = O_RF; + if (!bam_is_rev(bam)) { + orientation = O_RF; + } else { + orientation = O_FR; + } } } - } else { - if (bam_is_rev(bam) == bam_is_mrev(bam)) { - if (!bam_is_rev(bam)) { - orientation = O_RR; - } else { - orientation = O_FF; - } + if (!bam_is_rev(bam)) { + this_coord = unclipped_start(bam); } else { - if (!bam_is_rev(bam)) { - orientation = O_RF; - } else { - orientation = O_FR; - } + this_coord = unclipped_end(bam); + } + + if (!bam_is_mrev(bam)) { + other_coord = unclipped_other_start(bam->core.mpos, cig); + } else { + other_coord = unclipped_other_end(bam->core.mpos, cig); } } @@ -595,16 +460,54 @@ static int make_pair_key_sequence(key_data_t *key, bam1_t *bam) { else left_read = R_LE; - if (!bam_is_rev(bam)) { - this_coord = unclipped_start(bam); - } else { - this_coord = unclipped_end(bam); + if (param->barcode) { + if ((data = bam_aux_get(bam, param->barcode))) { + if (!(bar = bam_aux2Z(data))) { + (*warnings)++; + + if (*warnings <= BMD_WARNING_MAX) { + fprintf(samtools_stderr, "[markdup] warning: %s tag wrong type. 
Aux tag needs to be a string type.\n", param->barcode); + } + } else { + barcode = do_hash((unsigned char *)bar, strlen(bar)); + } + } + } else if (param->bc_rgx) { + int result; + regmatch_t matches[3]; + size_t max_matches = 2; + char *qname = bam_get_qname(bam); + + if ((result = regexec(param->bc_rgx, qname, max_matches, matches, 0)) == 0) { + int bc_start, bc_end; + + bc_start = matches[1].rm_so; + bc_end = matches[1].rm_eo; + + if (bc_start != -1) { + barcode = do_hash((unsigned char *)qname + bc_start, bc_end - bc_start); + } else { + (*warnings)++; + + if (*warnings <= BMD_WARNING_MAX) { + fprintf(samtools_stderr, "[markdup] warning: barcode regex unable to match substring on %s.\n", qname); + } + } + } else { + (*warnings)++; + + if (*warnings <= BMD_WARNING_MAX) { + char warn_msg[256]; + + regerror(result, param->bc_rgx, warn_msg, 256); + fprintf(samtools_stderr, "[markdup] warning: barcode regex match error \"%s\" on %s.\n", warn_msg, qname); + } + } } - if (!bam_is_mrev(bam)) { - other_coord = unclipped_other_start(bam->core.mpos, cig); - } else { - other_coord = unclipped_other_end(bam->core.mpos, cig); + if ((*warnings == BMD_WARNING_MAX) && (incoming_warnings != *warnings)) { + fprintf(samtools_stderr, "[markdup] warning: %ld barcode read warnings. New warnings will not be reported.\n", + *warnings); } key->single = 0; @@ -614,18 +517,23 @@ static int make_pair_key_sequence(key_data_t *key, bam1_t *bam) { key->other_coord = other_coord; key->leftmost = left_read; key->orientation = orientation; + key->barcode = barcode; return 0; } + /* Create a signature hash of single read (or read with an unmatched pair). Uses unclipped start (or end depending on orientation), reference id, and orientation. 
*/ -static void make_single_key(key_data_t *key, bam1_t *bam) { +static void make_single_key(md_param_t *param, key_data_t *key, bam1_t *bam, long *warnings) { hts_pos_t this_coord; - int32_t this_ref; + int32_t this_ref, barcode = 0; int8_t orientation; + uint8_t *data; + char *bar; + long incoming_warnings = *warnings; this_ref = bam->core.tid + 1; // avoid a 0 being put into the hash @@ -637,10 +545,61 @@ static void make_single_key(key_data_t *key, bam1_t *bam) { orientation = O_FF; } + if (param->barcode) { + if ((data = bam_aux_get(bam, param->barcode))) { + if (!(bar = bam_aux2Z(data))) { + (*warnings)++; + + if (*warnings <= BMD_WARNING_MAX) { + fprintf(samtools_stderr, "[markdup] warning: %s tag wrong type. Aux tag needs to be a string type.\n", param->barcode); + } + } else { + barcode = do_hash((unsigned char *)bar, strlen(bar)); + } + } + } else if (param->bc_rgx) { + int result; + regmatch_t matches[3]; + size_t max_matches = 2; + char *qname = bam_get_qname(bam); + + if ((result = regexec(param->bc_rgx, qname, max_matches, matches, 0)) == 0) { + int bc_start, bc_end; + + bc_start = matches[1].rm_so; + bc_end = matches[1].rm_eo; + + if (bc_start != -1) { + barcode = do_hash((unsigned char *)qname + bc_start, bc_end - bc_start); + } else { + (*warnings)++; + + if (*warnings <= BMD_WARNING_MAX) { + fprintf(samtools_stderr, "[markdup] warning: barcode regex unable to match substring on %s.\n", qname); + } + } + } else { + (*warnings)++; + + if (*warnings <= BMD_WARNING_MAX) { + char warn_msg[256]; + + regerror(result, param->bc_rgx, warn_msg, 256); + fprintf(samtools_stderr, "[markdup] warning: barcode regex match error \"%s\" on %s.\n", warn_msg, qname); + } + } + } + + if ((*warnings == BMD_WARNING_MAX) && (incoming_warnings != *warnings)) { + fprintf(samtools_stderr, "[markdup] warning: %ld barcode read warnings. 
New warnings will not be reported.\n", + *warnings); + } + key->single = 1; key->this_ref = this_ref; key->this_coord = this_coord; key->orientation = orientation; + key->barcode = barcode; } @@ -690,41 +649,95 @@ static int add_duplicate(khash_t(duplicates) *d_hash, bam1_t *dupe, char *orig_n } -/* Get the position of the coordinates from the read name. */ -static inline int get_coordinate_positions_colons(const char *qname, int *xpos, int *ypos) { +/* Get coordinates from the standard Illumina style read names. + Returned values are of the x and y coordinates and a section of + the read name to test (t) for string equality e.g. lane and tile part. */ + +static int get_coordinates_colons(md_param_t *param, const char *qname, int *t_beg, int *t_end, long *x_coord, long *y_coord, long *warnings) { int sep = 0; int pos = 0; + int xpos = 0, ypos = 0; + char *end; while (qname[pos]) { if (qname[pos] == ':') { sep++; if (sep == 2) { - *xpos = pos + 1; + xpos = pos + 1; } else if (sep == 3) { - *ypos = pos + 1; + ypos = pos + 1; } else if (sep == 4) { // HiSeq style names - *xpos = *ypos; - *ypos = pos + 1; + xpos = ypos; + ypos = pos + 1; } else if (sep == 5) { // Newer Illumina format - *xpos = pos + 1; + xpos = pos + 1; } else if (sep == 6) { - *ypos = pos + 1; + ypos = pos + 1; } } pos++; } - return sep; + /* The most current Illumina read format at time of writing is: + @machine:run:flowcell:lane:tile:x:y:UMI or + @machine:run:flowcell:lane:tile:x:y + + Counting the separating colons gives us a quick format check. + Older name formats have fewer elements. 
+ */ + + if (!(sep == 3 || sep == 4 || sep == 6 || sep == 7)) { + (*warnings)++; + + if (*warnings <= BMD_WARNING_MAX) { + fprintf(samtools_stderr, "[markdup] warning: cannot decipher read name %s for optical duplicate marking.\n", qname); + } + + return 1; + } else { + *x_coord = strtol(qname + xpos, &end, 10); + + if ((qname + xpos) == end) { + (*warnings)++; + + if (*warnings <= BMD_WARNING_MAX) { + fprintf(samtools_stderr, "[markdup] warning: cannot decipher x coordinate in %s .\n", qname); + } + + return 1; + } + + *y_coord = strtol(qname + ypos, &end, 10); + + if ((qname + ypos) == end) { + (*warnings)++; + + if (*warnings <= BMD_WARNING_MAX) { + fprintf(samtools_stderr, "[markdup] warning: cannot decipher y coordinate in %s .\n", qname); + } + + return 1; + } + + *t_beg = 0; + *t_end = xpos; + } + + return 0; } -/* Get the position of the coordinates from the read name. - Positions returned are of the x and y coordinate and an optional section of +/* Get the coordinates from the read name. + Returned values are of the x and y coordinates and an optional section of the read name to test (t) for string equality e.g. lane and tile part. 
*/ -static inline int get_coordinate_positions_regex(md_param_t *param, const char *qname, int *t_beg, int *t_end, int *xpos, int *ypos) { + +static inline int get_coordinates_regex(md_param_t *param, const char *qname, int *t_beg, int *t_end, long *x_coord, long *y_coord, long *warnings) { regmatch_t matches[5]; size_t max_matches = 5; + int xpos, ypos, xend, yend, xlen, ylen; + char coord[255]; + char *end; if (!param->rgx_t) max_matches = 4; @@ -732,8 +745,8 @@ static inline int get_coordinate_positions_regex(md_param_t *param, const char * if (regexec(param->rgx, qname, max_matches, matches, 0)) return -1; - *xpos = matches[param->rgx_x].rm_so; - *ypos = matches[param->rgx_y].rm_so; + xpos = matches[param->rgx_x].rm_so; + ypos = matches[param->rgx_y].rm_so; if (param->rgx_t) { *t_beg = matches[param->rgx_t].rm_so; @@ -742,138 +755,99 @@ static inline int get_coordinate_positions_regex(md_param_t *param, const char * *t_beg = *t_end = 0; } - if (*xpos == -1 || *ypos == -1 || *t_beg == -1) + if (xpos == -1 || ypos == -1 || *t_beg == -1) return -1; - return 7; // 3, 4, 6 and 7 are successes in the previous function -} + xend = matches[param->rgx_x].rm_eo; + yend = matches[param->rgx_y].rm_eo; + if ((xlen = xend - xpos) > 254) { + (*warnings)++; -static int get_coordinate_positions(md_param_t *param, const char *qname, int *beg, int *end, int *xpos, int *ypos, long *warnings) { - int ret = 0; - int seps; + if (*warnings <= BMD_WARNING_MAX) { + fprintf(samtools_stderr, "[markdup] warning: x coordinate string longer than allowed qname length in %s (%d long).\n", qname, xlen); + } - if (param->rgx == NULL) { - seps = get_coordinate_positions_colons(qname, xpos, ypos); - *beg = 0; - *end = *xpos; - } else { - seps = get_coordinate_positions_regex(param, qname, beg, end, xpos, ypos); + return 1; } - /* The most current Illumina read format at time of writing is: - @machine:run:flowcell:lane:tile:x:y:UMI or - @machine:run:flowcell:lane:tile:x:y + strncpy(coord, qname + 
xpos, xlen); + coord[xlen] = '\0'; + *x_coord = strtol(coord, &end, 10); - Counting the separating colons gives us a quick format check. - Older name formats have fewer elements. - */ - - if (!(seps == 3 || seps == 4 || seps == 6 || seps == 7)) { + if (coord == end) { (*warnings)++; if (*warnings <= BMD_WARNING_MAX) { - fprintf(samtools_stderr, "[markdup] warning: cannot decipher read name %s for optical duplicate marking.\n", qname); + fprintf(samtools_stderr, "[markdup] warning: cannot decipher x coordinate in %s (%s).\n", qname, coord); } - ret = 1; + return 1; } - return ret; -} - + if ((ylen = yend - ypos) > 254) { + (*warnings)++; -static int get_coordinates(md_param_t *param, const char *name, int *t_beg, int *t_end, long *x_coord, long *y_coord, long *warnings) { - int ret = 1; - int xpos = 0, ypos = 0; - long x = 0, y = 0; - char *end; + if (*warnings <= BMD_WARNING_MAX) { + fprintf(samtools_stderr, "[markdup] warning: y coordinate string longer than allowed qname length in %s (%d long).\n", qname, ylen); + } - if (get_coordinate_positions(param, name, t_beg, t_end, &xpos, &ypos, warnings)) { - return ret; + return 1; } - x = strtol(name + xpos, &end, 10); + strncpy(coord, qname + ypos, ylen); + coord[ylen] = '\0'; + *y_coord = strtol(coord, &end, 10); - if ((name + xpos) == end) { + if (coord == end) { (*warnings)++; if (*warnings <= BMD_WARNING_MAX) { - fprintf(samtools_stderr, "[markdup] warning: can not decipher X coordinate in %s .\n", name); + fprintf(samtools_stderr, "[markdup] warning: cannot decipher y coordinate in %s (%s).\n", qname, coord); } - return ret; + return 1; } - y = strtol(name + ypos, &end, 10); + return 0; +} - if ((name + ypos) == end) { - (*warnings)++; - if (*warnings <= BMD_WARNING_MAX) { - fprintf(samtools_stderr, "[markdup] warning: can not decipher y coordinate in %s .\n", name); - } +static int get_coordinates(md_param_t *param, const char *name, int *t_beg, int *t_end, long *x_coord, long *y_coord, long *warnings) { + int 
ret = 1; - return ret; + if (param->rgx == NULL) { + ret = get_coordinates_colons(param, name, t_beg, t_end, x_coord, y_coord, warnings); + } else { + ret = get_coordinates_regex(param, name, t_beg, t_end, x_coord, y_coord, warnings); } - *x_coord = x; - *y_coord = y; - ret = 0; - return ret; } -/* Using the coordinates from the Illumina read name, see whether the duplicated read is +/* Using the coordinates from the read name, see whether the duplicated read is close enough (set by max_dist) to the original to be counted as optical.*/ -static int optical_duplicate(md_param_t *param, bam1_t *ori, bam1_t *dup, long max_dist, long *warnings) { +static int is_optical_duplicate(md_param_t *param, bam1_t *ori, bam1_t *dup, long max_dist, long *warnings) { int ret = 0; char *original, *duplicate; - int oxpos = 0, oypos = 0, dxpos = 0, dypos = 0; + long ox, oy, dx, dy; int o_beg = 0, o_end = 0, d_beg = 0, d_end = 0; - original = bam_get_qname(ori); duplicate = bam_get_qname(dup); - if (get_coordinate_positions(param, original, &o_beg, &o_end, &oxpos, &oypos, warnings)) { + if (get_coordinates(param, original, &o_beg, &o_end, &ox, &oy, warnings)) { return ret; } - if (get_coordinate_positions(param, duplicate, &d_beg, &d_end, &dxpos, &dypos, warnings)) { + if (get_coordinates(param, duplicate, &d_beg, &d_end, &dx, &dy, warnings)) { return ret; } if (strncmp(original + o_beg, duplicate + d_beg, o_end - o_beg) == 0) { - // the initial parts match, look at the numbers - long ox, oy, dx, dy, xdiff, ydiff; - char *end; - - ox = strtol(original + oxpos, &end, 10); - - if ((original + oxpos) == end) { - (*warnings)++; - - if (*warnings <= BMD_WARNING_MAX) { - fprintf(samtools_stderr, "[markdup] warning: can not decipher X coordinate in %s .\n", original); - } - - return ret; - } - - dx = strtol(duplicate + dxpos, &end, 10); - - if ((duplicate + dxpos) == end) { - (*warnings)++; - - if (*warnings <= BMD_WARNING_MAX) { - fprintf(samtools_stderr, "[markdup] warning: can not decipher 
X coordinate in %s.\n", duplicate); - } - - return ret; - } + long xdiff, ydiff; if (ox > dx) { xdiff = ox - dx; @@ -884,30 +858,6 @@ static int optical_duplicate(md_param_t *param, bam1_t *ori, bam1_t *dup, long m if (xdiff <= max_dist) { // still might be optical - oy = strtol(original + oypos, &end, 10); - - if ((original + oypos) == end) { - (*warnings)++; - - if (*warnings <= BMD_WARNING_MAX) { - fprintf(samtools_stderr, "[markdup] warning: can not decipher Y coordinate in %s.\n", original); - } - - return ret; - } - - dy = strtol(duplicate + dypos, &end, 10); - - if ((duplicate + dypos) == end) { - (*warnings)++; - - if (*warnings <= BMD_WARNING_MAX) { - fprintf(samtools_stderr, "[markdup] warning: can not decipher Y coordinate in %s.\n", duplicate); - } - - return ret; - } - if (oy > dy) { ydiff = oy - dy; } else { @@ -987,7 +937,7 @@ static int mark_duplicates(md_param_t *param, khash_t(duplicates) *dup_hash, bam } if (param->opt_dist) { // mark optical duplicates - if (optical_duplicate(param, ori, dup, param->opt_dist, warn)) { + if (is_optical_duplicate(param, ori, dup, param->opt_dist, warn)) { bam_aux_update_str(dup, "dt", 3, "SQ"); dup_type = 'O'; (*optical)++; @@ -1447,7 +1397,7 @@ static int bam_mark_duplicates(md_param_t *param) { int ret; long reading, writing, excluded, duplicate, single, pair, single_dup, examined, optical, single_optical; long np_duplicate, np_opt_duplicate; - long opt_warnings = 0; + long opt_warnings = 0, bc_warnings = 0; tmp_file_t temp; char *idx_fn = NULL; int exclude = 0; @@ -1581,19 +1531,12 @@ static int bam_mark_duplicates(md_param_t *param) { key_data_t single_key; in_hash_t *bp; - if (param->mode) { - if (make_pair_key_sequence(&pair_key, in_read->b)) { - fprintf(samtools_stderr, "[markdup] error: unable to assign pair hash key.\n"); - goto fail; - } - } else { - if (make_pair_key_template(&pair_key, in_read->b)) { - fprintf(samtools_stderr, "[markdup] error: unable to assign pair hash key.\n"); - goto fail; - } + if 
(make_pair_key(param, &pair_key, in_read->b, &bc_warnings)) { + fprintf(samtools_stderr, "[markdup] error: unable to assign pair hash key.\n"); + goto fail; } - make_single_key(&single_key, in_read->b); + make_single_key(param, &single_key, in_read->b, &bc_warnings); pair++; in_read->pos = single_key.this_coord; // cigar/orientation modified pos @@ -1733,7 +1676,7 @@ static int bam_mark_duplicates(md_param_t *param) { key_data_t single_key; in_hash_t *bp; - make_single_key(&single_key, in_read->b); + make_single_key(param, &single_key, in_read->b, &bc_warnings); single++; in_read->pos = single_key.this_coord; // cigar/orientation modified pos @@ -1992,6 +1935,10 @@ static int bam_mark_duplicates(md_param_t *param) { opt_warnings); } + if (bc_warnings) { + fprintf(samtools_stderr, "[markdup] warning: number of failed attempts to get barcodes = %ld\n", bc_warnings); + } + if (param->do_stats) { FILE *fp; int file_open = 0; @@ -2097,6 +2044,9 @@ static int markdup_usage(void) { fprintf(samtools_stderr, " --read-coords STR Regex for coords from read name.\n"); fprintf(samtools_stderr, " --coords-order STR Order of regex elements. txy (default). With t being a part of\n" " the read names that must be equal and x/y being coordinates.\n"); + fprintf(samtools_stderr, " --barcode-tag STR Use a barcode tag that duplicates must match.\n"); + fprintf(samtools_stderr, " --barcode-name Use the UMI/barcode in the read name (eighth colon delimited part).\n"); + fprintf(samtools_stderr, " --barcode-rgx STR Regex for barcode in the readname (alternative to --barcode-name).\n"); fprintf(samtools_stderr, " -t Mark primary duplicates with the name of the original in a \'do\' tag."
" Mainly for information and debugging.\n"); @@ -2110,16 +2060,17 @@ static int markdup_usage(void) { int bam_markdup(int argc, char **argv) { - int c, ret; + int c, ret, bc_name = 0; char wmode[4] = {'w', 'b', 0, 0}; sam_global_args ga = SAM_GLOBAL_ARGS_INIT; htsThreadPool p = {NULL, 0}; kstring_t tmpprefix = {0, 0, NULL}; struct stat st; unsigned int t; - char *regex = NULL; + char *regex = NULL, *bc_regex = NULL; char *regex_order = "txy"; - md_param_t param = {NULL, NULL, NULL, 0, 300, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, NULL, NULL, NULL, NULL, 0, 0, 0}; + md_param_t param = {NULL, NULL, NULL, 0, 300, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 1, NULL, NULL, NULL, NULL, 0, 0, 0, NULL, NULL}; static const struct option lopts[] = { SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0, '@'), @@ -2129,6 +2080,9 @@ int bam_markdup(int argc, char **argv) { {"no-multi-dup", no_argument, NULL, 1003}, {"read-coords", required_argument, NULL, 1004}, {"coords-order", required_argument, NULL, 1005}, + {"barcode-tag", required_argument, NULL, 1006}, + {"barcode-name", no_argument, NULL, 1007}, + {"barcode-rgx", required_argument, NULL, 1008}, {NULL, 0, NULL, 0} }; @@ -2160,6 +2114,9 @@ int bam_markdup(int argc, char **argv) { case 1003: param.check_chain = 0; break; case 1004: regex = optarg; break; case 1005: regex_order = optarg; break; + case 1006: param.barcode = optarg; break; + case 1007: bc_name = 1; break; + case 1008: bc_name = 1, bc_regex = optarg; break; default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; /* else fall-through */ case '?': return markdup_usage(); @@ -2169,6 +2126,12 @@ int bam_markdup(int argc, char **argv) { if (optind + 2 > argc) return markdup_usage(); + if (param.barcode && bc_name) { + fprintf(samtools_stderr, "[markdup] Error: cannot specify --barcode-tag and " + "--barcode-name (or --barcode-rgx) at same time.\n"); + return 1; + } + if (param.opt_dist < 0) param.opt_dist = 0; if (param.max_length < 0) param.max_length = 300; @@ -2197,7 +2160,7 @@ int 
bam_markdup(int argc, char **argv) { param.rgx_y = 2; param.rgx_t = 0; } else { - fprintf(samtools_stderr, "[markdup] error: could not recognise regex coorindate order \"%s\".\n", regex_order); + fprintf(samtools_stderr, "[markdup] error: could not recognise regex coordinate order \"%s\".\n", regex_order); return 1; } @@ -2216,6 +2179,32 @@ int bam_markdup(int argc, char **argv) { } } + if (bc_name) { + int result; + + /* From Illumina UMI documentation: "The UMI sequence is located in the + eighth colon-delimited field of the read name (QNAME)". */ + char *rgx = "[0-9A-Za-z]+:[0-9]+:[0-9]+:[0-9]+:[0-9]+:[0-9]+:[0-9]+:([!-?A-~]+)"; + + if ((param.bc_rgx = malloc(sizeof(regex_t))) == NULL) { + fprintf(samtools_stderr, "[markdup] error: could not allocate memory for barcode regex.\n"); + return 1; + } + + if (bc_regex) { + rgx = bc_regex; + } + + if ((result = regcomp(param.bc_rgx, rgx, REG_EXTENDED))) { + char err_msg[256]; + + regerror(result, param.bc_rgx, err_msg, 256); + fprintf(samtools_stderr, "[markdup] error: barcode regex error \"%s\"\n", err_msg); + free(param.bc_rgx); + return 1; + } + } + param.in = sam_open_format(argv[optind], "r", &ga.in); if (!param.in) { @@ -2280,6 +2269,11 @@ int bam_markdup(int argc, char **argv) { free(param.rgx); } + if (param.bc_rgx) { + regfree(param.bc_rgx); + free(param.bc_rgx); + } + free(param.arg_list); free(tmpprefix.s); sam_global_args_free(&ga); diff --git a/samtools/bam_plcmd.c b/samtools/bam_plcmd.c index 9b49500..c73bf89 100644 --- a/samtools/bam_plcmd.c +++ b/samtools/bam_plcmd.c @@ -946,7 +946,8 @@ static void print_usage(FILE *fp, const mplp_conf_t *mplp) " --ff, --excl-flags STR|INT filter flags: skip reads with any of the mask bits set\n" " [%s]\n", tmp_filter); fprintf(fp, -" -x, --ignore-overlaps disable read-pair overlap detection\n" +" -x, --ignore-overlaps-removal, --disable-overlap-removal\n" +" disable read-pair overlap detection and removal\n" " -X, --customized-index use customized index files\n" // -X 
flag for index filename "\n" "Output options:\n" @@ -1032,7 +1033,9 @@ int bam_mpileup(int argc, char *argv[]) {"min-mq", required_argument, NULL, 'q'}, {"min-BQ", required_argument, NULL, 'Q'}, {"min-bq", required_argument, NULL, 'Q'}, - {"ignore-overlaps", no_argument, NULL, 'x'}, + // NB: old "--ignore-overlaps" auto-completes to this + {"ignore-overlaps-removal", no_argument, NULL, 'x'}, + {"disable-overlap-removal", no_argument, NULL, 'x'}, {"output-mods", no_argument, NULL, 'M'}, {"output-BP", no_argument, NULL, 'O'}, {"output-bp", no_argument, NULL, 'O'}, diff --git a/samtools/bam_plcmd.c.pysam.c b/samtools/bam_plcmd.c.pysam.c index c8252cd..8147e85 100644 --- a/samtools/bam_plcmd.c.pysam.c +++ b/samtools/bam_plcmd.c.pysam.c @@ -948,7 +948,8 @@ static void print_usage(FILE *fp, const mplp_conf_t *mplp) " --ff, --excl-flags STR|INT filter flags: skip reads with any of the mask bits set\n" " [%s]\n", tmp_filter); fprintf(fp, -" -x, --ignore-overlaps disable read-pair overlap detection\n" +" -x, --ignore-overlaps-removal, --disable-overlap-removal\n" +" disable read-pair overlap detection and removal\n" " -X, --customized-index use customized index files\n" // -X flag for index filename "\n" "Output options:\n" @@ -1034,7 +1035,9 @@ int bam_mpileup(int argc, char *argv[]) {"min-mq", required_argument, NULL, 'q'}, {"min-BQ", required_argument, NULL, 'Q'}, {"min-bq", required_argument, NULL, 'Q'}, - {"ignore-overlaps", no_argument, NULL, 'x'}, + // NB: old "--ignore-overlaps" auto-completes to this + {"ignore-overlaps-removal", no_argument, NULL, 'x'}, + {"disable-overlap-removal", no_argument, NULL, 'x'}, {"output-mods", no_argument, NULL, 'M'}, {"output-BP", no_argument, NULL, 'O'}, {"output-bp", no_argument, NULL, 'O'}, diff --git a/samtools/bam_sort.c b/samtools/bam_sort.c index 0971c3f..58ecdfd 100644 --- a/samtools/bam_sort.c +++ b/samtools/bam_sort.c @@ -1,6 +1,6 @@ /* bam_sort.c -- sorting and merging. - Copyright (C) 2008-2021 Genome Research Ltd. 
+ Copyright (C) 2008-2022 Genome Research Ltd. Portions copyright (C) 2009-2012 Broad Institute. Author: Heng Li @@ -51,16 +51,71 @@ DEALINGS IN THE SOFTWARE. */ #include "sam_opts.h" #include "samtools.h" #include "bedidx.h" +#include "bam.h" + + +// Struct which contains the sorting key for TemplateCoordinate sort. +typedef struct { + int tid1; + int tid2; + hts_pos_t pos1; + hts_pos_t pos2; + bool neg1; + bool neg2; + const char *library; + char *mid; + char *name; + bool is_upper_of_pair; +} template_coordinate_key_t; + +// Struct to store fixed buffers of template coordinate keys +typedef struct { + size_t n; // the # of keys stored + size_t m; // the # of buffers allocated + size_t buffer_size; // the fixed size (in keys) of each buffer + template_coordinate_key_t **buffers; // the list of buffers +} template_coordinate_keys_t; + +// Gets the idx'th key; does not OOB check +static template_coordinate_key_t* template_coordinate_keys_get(template_coordinate_keys_t *keys, size_t idx) { + size_t buffer_idx = idx / keys->buffer_size; // the index of the buffer to retrieve in buffer + size_t buffer_offset = idx % keys->buffer_size; // the offset into the given buffer to retrieve + //assert(buffer_idx < keys->m); + //assert(buffer_offset < keys->buffer_size); + return &keys->buffers[buffer_idx][buffer_offset]; +} + +// Reallocates the buffer list, adding 0x100 more buffers per call; max_k (the wanted key capacity) is not checked here +static int template_coordinate_keys_realloc(template_coordinate_keys_t *keys, int max_k) { + size_t cur_m = keys->m; + keys->m += 0x100; + //assert(keys->m > cur_m); + //assert(keys->m * keys->buffer_size >= max_k); + if ((keys->buffers = realloc(keys->buffers, keys->m * sizeof(template_coordinate_key_t*))) == NULL) { + print_error("sort", "couldn't reallocate memory for template coordinate key buffers"); + return -1; + } + // allocate space for new buffers + int j; + for (j = cur_m; j < keys->m; ++j) { + if ((keys->buffers[j]= malloc(sizeof(template_coordinate_key_t) * keys->buffer_size)) == NULL) { +
print_error("sort", "couldn't allocate memory for template coordinate key buffer"); + return -1; + } + } + return 0; +} // Struct which contains the a record, and the pointer to the sort tag (if any) or // a combined ref / position / strand. -// Used to speed up tag and position sorts. +// Used to speed up sorts (coordinate, by-tag, and template-coordinate). typedef struct bam1_tag { bam1_t *bam_record; union { const uint8_t *tag; uint8_t pos_tid[12]; + template_coordinate_key_t *key; } u; } bam1_tag; @@ -95,13 +150,15 @@ void memset_pattern4(void *target, const void *pattern, size_t size) { KHASH_INIT(c2c, char*, char*, 1, kh_str_hash_func, kh_str_hash_equal) KHASH_INIT(cset, char*, char, 0, kh_str_hash_func, kh_str_hash_equal) KHASH_MAP_INIT_STR(c2i, int) +KHASH_MAP_INIT_STR(const_c2c, char *) #define hdrln_free_char(p) KLIST_INIT(hdrln, char*, hdrln_free_char) -static int g_is_by_qname = 0; -static int g_is_by_tag = 0; -static int g_is_by_minhash = 0; +static template_coordinate_key_t* template_coordinate_key(bam1_t *b, template_coordinate_key_t *key, sam_hdr_t *hdr, khash_t(const_c2c) *lib_lookup); + +typedef enum {Coordinate, QueryName, TagCoordinate, TagQueryName, MinHash, TemplateCoordinate} SamOrder; +static SamOrder g_sam_order = Coordinate; static char g_sort_tag[2] = {0,0}; static int strnum_cmp(const char *_a, const char *_b) @@ -139,6 +196,9 @@ typedef struct { static inline int bam1_cmp_by_tag(const bam1_tag a, const bam1_tag b); static inline int bam1_cmp_by_minhash(const bam1_tag a, const bam1_tag b); +static inline int bam1_cmp_template_coordinate(const bam1_tag a, const bam1_tag b); +static khash_t(const_c2c) * lookup_libraries(sam_hdr_t *header); +static void lib_lookup_destroy(khash_t(const_c2c) *lib_lookup); // Function to compare reads in the heap and determine which one is < the other // Note, unlike the bam1_cmp_by_X functions which return <0, 0, >0 this @@ -150,25 +210,38 @@ static inline int heap_lt(const heap1_t a, const heap1_t b) if 
(!b.entry.bam_record) return 0; - if (g_is_by_tag) { - int t; - t = bam1_cmp_by_tag(a.entry, b.entry); - if (t != 0) return t > 0; - } else if (g_is_by_minhash) { - int t = bam1_cmp_by_minhash(a.entry, b.entry); - if (t != 0) return t > 0; - } else if (g_is_by_qname) { - int t, fa, fb; - t = strnum_cmp(bam_get_qname(a.entry.bam_record), bam_get_qname(b.entry.bam_record)); - if (t != 0) return t > 0; - fa = a.entry.bam_record->core.flag & 0xc0; - fb = b.entry.bam_record->core.flag & 0xc0; - if (fa != fb) return fa > fb; - } else { - if (a.tid != b.tid) return a.tid > b.tid; - if (a.pos != b.pos) return a.pos > b.pos; - if (a.rev != b.rev) return a.rev > b.rev; + int t, fa, fb; + switch (g_sam_order) { + case Coordinate: + if (a.tid != b.tid) return a.tid > b.tid; + if (a.pos != b.pos) return a.pos > b.pos; + if (a.rev != b.rev) return a.rev > b.rev; + break; + case QueryName: + t = strnum_cmp(bam_get_qname(a.entry.bam_record), bam_get_qname(b.entry.bam_record)); + if (t != 0) return t > 0; + fa = a.entry.bam_record->core.flag & 0xc0; + fb = b.entry.bam_record->core.flag & 0xc0; + if (fa != fb) return fa > fb; + break; + case TagQueryName: + case TagCoordinate: + t = bam1_cmp_by_tag(a.entry, b.entry); + if (t != 0) return t > 0; + break; + case MinHash: + t = bam1_cmp_by_minhash(a.entry, b.entry); + if (t != 0) return t > 0; + break; + case TemplateCoordinate: + t = bam1_cmp_template_coordinate(a.entry, b.entry); + if (t != 0) return t > 0; + break; + default: + print_error("heap_lt", "unknown sort order: %d", g_sam_order); + break; } + // This compares by position in the input file(s) if (a.i != b.i) return a.i > b.i; return a.idx > b.idx; @@ -561,7 +634,7 @@ static klist_t(hdrln) * trans_rg_pg(bool is_rg, sam_hdr_t *translate, } // If there are no RG lines in the file and we are overriding add one - if (is_rg && override && kl_begin(hdr_lines) == NULL) { + if (is_rg && override && hdr_lines->size == 0) { kstring_t new_id = {0, 0, NULL}; kstring_t line = {0, 0, 
NULL}; kstring_t empty = {0, 0, NULL}; @@ -975,8 +1048,8 @@ static hts_reglist_t *duplicate_reglist(const hts_reglist_t *rl, int rn) { /*! @abstract Merge multiple sorted BAM. - @param by_qname whether to sort by query name - @param sort_tag if non-null, sort by the given tag + @param sam_order the order in which the data was sorted + @param sort_tag if non-null, the tag that data was sorted by @param out output BAM file name @param mode sam_open() mode to be used to create the final output file (overrides level settings from UNCOMP and LEVEL1 flags) @@ -996,7 +1069,7 @@ static hts_reglist_t *duplicate_reglist(const hts_reglist_t *rl, int rn) { @discussion Padding information may NOT correctly maintained. This function is NOT thread safe. */ -int bam_merge_core2(int by_qname, char* sort_tag, const char *out, const char *mode, +int bam_merge_core2(SamOrder sam_order, char* sort_tag, const char *out, const char *mode, const char *headers, int n, char * const *fn, char * const *fn_idx, const char *fn_bed, int flag, const char *reg, int n_threads, const char *cmd, const htsFormat *in_fmt, const htsFormat *out_fmt, @@ -1019,6 +1092,8 @@ int bam_merge_core2(int by_qname, char* sort_tag, const char *out, const char *m merged_header_t *merged_hdr = init_merged_header(); if (!merged_hdr) return -1; refs_t *refs = NULL; + template_coordinate_keys_t *keys = NULL; + khash_t(const_c2c) *lib_lookup = NULL; // Is there a specified pre-prepared header to use for output? if (headers) { @@ -1035,9 +1110,8 @@ int bam_merge_core2(int by_qname, char* sort_tag, const char *out, const char *m } } - g_is_by_qname = by_qname; - if (sort_tag) { - g_is_by_tag = 1; + g_sam_order = sam_order; + if (sam_order == TagQueryName || sam_order == TagCoordinate) { g_sort_tag[0] = sort_tag[0]; g_sort_tag[1] = sort_tag[0] ? 
sort_tag[1] : '\0'; } @@ -1106,7 +1180,7 @@ int bam_merge_core2(int by_qname, char* sort_tag, const char *out, const char *m hdr[i] = hin; int order_ok = 1; - if ((translation_tbl+i)->lost_coord_sort && !by_qname) { + if ((translation_tbl+i)->lost_coord_sort && (sam_order == Coordinate || sam_order == MinHash)) { fprintf(stderr, "[bam_merge_core] Order of targets in file %s caused coordinate sort to be lost\n", fn[i]); order_ok = 0; } @@ -1226,6 +1300,26 @@ int bam_merge_core2(int by_qname, char* sort_tag, const char *out, const char *m rtrans = NULL; } + // Make sure that there's enough memory for template coordinate keys, one per file to read + if (sam_order == TemplateCoordinate) { + if ((keys = malloc(sizeof(template_coordinate_keys_t))) == NULL) { + print_error("sort", "could not allocate memory for the top-level keys"); + goto mem_fail; + } + keys->n = 0; + keys->m = 0; + keys->buffer_size = 0x10000; + keys->buffers = NULL; + // Make sure that there's enough memory for template coordinate keys, one per file to read + if (keys->n + n >= keys->m * keys->buffer_size) { + if (template_coordinate_keys_realloc(keys, keys->n + n) < 0) goto mem_fail; + } + lib_lookup = lookup_libraries(hout); + if (!lib_lookup) { + goto mem_fail; + } + } + // Load the first read from each file into the heap for (i = 0; i < n; ++i) { heap1_t *h = heap + i; @@ -1241,8 +1335,12 @@ int bam_merge_core2(int by_qname, char* sort_tag, const char *out, const char *m h->pos = (uint64_t)(h->entry.bam_record->core.pos + 1); h->rev = bam_is_rev(h->entry.bam_record); h->idx = idx++; - if (g_is_by_tag) { + if (g_sam_order == TagQueryName || g_sam_order == TagCoordinate) { h->entry.u.tag = bam_aux_get(h->entry.bam_record, g_sort_tag); + } else if (g_sam_order == TemplateCoordinate) { + template_coordinate_key_t *key = template_coordinate_keys_get(keys, i); // get the next key to use + h->entry.u.key = template_coordinate_key(h->entry.bam_record, key, hout, lib_lookup); // key for the record just read into slot i (h), not the heap root + if
(h->entry.u.key == NULL) goto mem_fail; // key could not be created, error out } else { h->entry.u.tag = NULL; } @@ -1252,6 +1350,7 @@ int bam_merge_core2(int by_qname, char* sort_tag, const char *out, const char *m bam_destroy1(h->entry.bam_record); h->entry.bam_record = NULL; h->entry.u.tag = NULL; + h->entry.u.key = NULL; } else { print_error(cmd, "failed to read first record from \"%s\"", fn[i]); goto fail; @@ -1309,8 +1408,12 @@ int bam_merge_core2(int by_qname, char* sort_tag, const char *out, const char *m heap->pos = (uint64_t)(b->core.pos + 1); heap->rev = bam_is_rev(b); heap->idx = idx++; - if (g_is_by_tag) { + if (g_sam_order == TagQueryName || g_sam_order == TagCoordinate) { heap->entry.u.tag = bam_aux_get(heap->entry.bam_record, g_sort_tag); + } else if (g_sam_order == TemplateCoordinate) { + template_coordinate_key_t *key = template_coordinate_keys_get(keys, heap->i); // get the next key to use + heap->entry.u.key = template_coordinate_key(heap->entry.bam_record, key, hout, lib_lookup); // update the key + if (heap->entry.u.key == NULL) goto mem_fail; // key could not be created, error out } else { heap->entry.u.tag = NULL; } @@ -1385,6 +1488,14 @@ int bam_merge_core2(int by_qname, char* sort_tag, const char *out, const char *m free(fp); free(rtrans); free(out_idx_fn); + if (keys != NULL) { + for (i = 0; i < keys->m; ++i) { + free(keys->buffers[i]); + } + free(keys->buffers); + free(keys); + } + lib_lookup_destroy(lib_lookup); return -1; } @@ -1395,7 +1506,8 @@ int bam_merge_core(int by_qname, const char *out, const char *headers, int n, ch strcpy(mode, "wb"); if (flag & MERGE_UNCOMP) strcat(mode, "0"); else if (flag & MERGE_LEVEL1) strcat(mode, "1"); - return bam_merge_core2(by_qname, NULL, out, mode, headers, n, fn, NULL, NULL, flag, reg, 0, "merge", NULL, NULL, 0, NULL, 1); + SamOrder sam_order = by_qname ?
QueryName : Coordinate; + return bam_merge_core2(sam_order, NULL, out, mode, headers, n, fn, NULL, NULL, flag, reg, 0, "merge", NULL, NULL, 0, NULL, 1); } static void merge_usage(FILE *to) @@ -1421,25 +1533,28 @@ static void merge_usage(FILE *to) " -b FILE List of input BAM filenames, one per line [null]\n" " -X Use customized index files\n" " -L FILE Specify a BED file for multiple region filtering [null]\n" -" --no-PG do not add a PG line\n"); +" --no-PG do not add a PG line\n" +" --template-coordinate Input files are sorted by template-coordinate\n"); sam_global_opt_help(to, "-.O..@.."); } int bam_merge(int argc, char *argv[]) { - int c, is_by_qname = 0, flag = 0, ret = 0, level = -1, has_index_file = 0; + int c, flag = 0, ret = 0, level = -1, has_index_file = 0; char *fn_headers = NULL, *reg = NULL, mode[12]; char *sort_tag = NULL, *fnout = NULL, *arg_list = NULL; long random_seed = (long)time(NULL); char** fn = NULL; char** fn_idx = NULL, *fn_bed = NULL; int fn_size = 0, no_pg = 0; + SamOrder sam_order = Coordinate; sam_global_args ga = SAM_GLOBAL_ARGS_INIT; static const struct option lopts[] = { SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0, '@'), { "threads", required_argument, NULL, '@' }, {"no-PG", no_argument, NULL, 1}, + { "template-coordinate", no_argument, NULL, 2}, { NULL, 0, NULL, 0 } }; @@ -1453,7 +1568,7 @@ int bam_merge(int argc, char *argv[]) case 'r': flag |= MERGE_RG; break; case 'f': flag |= MERGE_FORCE; break; case 'h': fn_headers = optarg; break; - case 'n': is_by_qname = 1; break; + case 'n': sam_order = QueryName; break; case 'o': fnout = optarg; break; case 't': sort_tag = optarg; break; case '1': flag |= MERGE_LEVEL1; level = 1; break; @@ -1488,12 +1603,17 @@ int bam_merge(int argc, char *argv[]) break; } case 1: no_pg = 1; break; + case 2: sam_order = TemplateCoordinate; break; default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; /* else fall-through */ case '?': merge_usage(stderr); return 1; } } + if (sort_tag != NULL) { + 
sam_order = sam_order == QueryName ? TagQueryName : TagCoordinate; + } + if (fnout == NULL && argc - optind >= 1) { fnout = argv[optind]; optind++; @@ -1558,7 +1678,7 @@ int bam_merge(int argc, char *argv[]) strcpy(mode, "wb"); sam_open_mode(mode+1, fnout, NULL); if (level >= 0) sprintf(strchr(mode, '\0'), "%d", level < 9? level : 9); - if (bam_merge_core2(is_by_qname, sort_tag, fnout, mode, fn_headers, + if (bam_merge_core2(sam_order, sort_tag, fnout, mode, fn_headers, fn_size+nargcfiles, fn, fn_idx, fn_bed, flag, reg, ga.nthreads, "merge", &ga.in, &ga.out, ga.write_index, arg_list, no_pg) < 0) ret = 1; @@ -1580,6 +1700,7 @@ end: * BAM sorting * ***************/ + typedef struct { size_t from; size_t to; @@ -1591,13 +1712,24 @@ typedef struct { static inline int heap_add_read(heap1_t *heap, int nfiles, samFile **fp, int num_in_mem, buf_region *in_mem, - bam1_tag *buf, uint64_t *idx, sam_hdr_t *hout) { + bam1_tag *buf, template_coordinate_keys_t *keys, + uint64_t *idx, sam_hdr_t *hout, + khash_t(const_c2c) *lib_lookup) { int i = heap->i, res; if (i < nfiles) { // read from file res = sam_read1(fp[i], hout, heap->entry.bam_record); + if (res >= 0 && g_sam_order == TemplateCoordinate) { // file read OK and TemplateCoordinate order + // It is assumed that there are nfiles more keys allocated than keys->n; see allocation in bam_merge_simple + template_coordinate_key_t *key = template_coordinate_keys_get(keys, keys->n + i); // get the next key to use + heap->entry.u.key = template_coordinate_key(heap->entry.bam_record, key, hout, lib_lookup); // update the key + if (heap->entry.u.key == NULL) res = -1; // key could not be created, error out + } } else { // read from memory if (in_mem[i - nfiles].from < in_mem[i - nfiles].to) { - heap->entry.bam_record = buf[in_mem[i - nfiles].from++].bam_record; + size_t from = in_mem[i - nfiles].from; + heap->entry.bam_record = buf[from].bam_record; + if (g_sam_order == TemplateCoordinate) heap->entry.u.key = buf[from].u.key; + 
in_mem[i - nfiles].from++; res = 0; } else { res = -1; @@ -1608,26 +1740,30 @@ static inline int heap_add_read(heap1_t *heap, int nfiles, samFile **fp, heap->pos = (uint64_t)(heap->entry.bam_record->core.pos + 1); heap->rev = bam_is_rev(heap->entry.bam_record); heap->idx = (*idx)++; - if (g_is_by_tag) { + if (g_sam_order == TagQueryName || g_sam_order == TagCoordinate) { heap->entry.u.tag = bam_aux_get(heap->entry.bam_record, g_sort_tag); - } else { + } else if (g_sam_order != TemplateCoordinate) { heap->entry.u.tag = NULL; + heap->entry.u.key = NULL; } } else if (res == -1) { heap->pos = HEAP_EMPTY; if (i < nfiles) bam_destroy1(heap->entry.bam_record); heap->entry.bam_record = NULL; heap->entry.u.tag = NULL; + heap->entry.u.key = NULL; } else { return -1; } return 0; } -static int bam_merge_simple(int by_qname, char *sort_tag, const char *out, +static int bam_merge_simple(SamOrder sam_order, char *sort_tag, const char *out, const char *mode, sam_hdr_t *hout, int n, char * const *fn, int num_in_mem, - buf_region *in_mem, bam1_tag *buf, int n_threads, + buf_region *in_mem, bam1_tag *buf, + template_coordinate_keys_t *keys, + khash_t(const_c2c) *lib_lookup, int n_threads, const char *cmd, const htsFormat *in_fmt, const htsFormat *out_fmt, char *arg_list, int no_pg, int write_index) { @@ -1637,9 +1773,7 @@ static int bam_merge_simple(int by_qname, char *sort_tag, const char *out, int i, heap_size = n + num_in_mem; char *out_idx_fn = NULL; - g_is_by_qname = by_qname; - if (sort_tag) { - g_is_by_tag = 1; + if (sam_order == TagQueryName || sam_order == TagCoordinate) { g_sort_tag[0] = sort_tag[0]; g_sort_tag[1] = sort_tag[0] ? 
sort_tag[1] : '\0'; } @@ -1650,6 +1784,11 @@ static int bam_merge_simple(int by_qname, char *sort_tag, const char *out, heap = (heap1_t*)calloc(heap_size, sizeof(heap1_t)); if (!heap) goto mem_fail; + // Make sure that there's enough memory for template coordinate keys, one per file to read + if (keys && keys->n + n >= keys->m * keys->buffer_size) { + if (template_coordinate_keys_realloc(keys, keys->n + n) < 0) goto mem_fail; + } + // Open each file, read the header and put the first read into the heap for (i = 0; i < heap_size; i++) { sam_hdr_t *hin; @@ -1675,11 +1814,13 @@ static int bam_merge_simple(int by_qname, char *sort_tag, const char *out, // Get a read into the heap h->i = i; h->entry.u.tag = NULL; + h->entry.u.key = NULL; if (i < n) { h->entry.bam_record = bam_init1(); if (!h->entry.bam_record) goto mem_fail; } - if (heap_add_read(h, n, fp, num_in_mem, in_mem, buf, &idx, hout) < 0) { + if (heap_add_read(h, n, fp, num_in_mem, in_mem, buf, keys, &idx, hout, + lib_lookup) < 0) { assert(i < n); print_error(cmd, "failed to read first record from \"%s\"", fn[i]); goto fail; @@ -1721,7 +1862,7 @@ static int bam_merge_simple(int by_qname, char *sort_tag, const char *out, ks_heapmake(heap, heap_size, heap); while (heap->pos != HEAP_EMPTY) { bam1_t *b = heap->entry.bam_record; - if (g_is_by_minhash && b->core.tid == -1) { + if (g_sam_order == MinHash && b->core.tid == -1) { // Remove the cached minhash value b->core.pos = -1; b->core.mpos = -1; @@ -1731,7 +1872,8 @@ static int bam_merge_simple(int by_qname, char *sort_tag, const char *out, print_error_errno(cmd, "failed writing to \"%s\"", out); goto fail; } - if (heap_add_read(heap, n, fp, num_in_mem, in_mem, buf, &idx, hout) < 0) { + if (heap_add_read(heap, n, fp, num_in_mem, in_mem, buf, keys, &idx, + hout, lib_lookup) < 0) { assert(heap->i < n); print_error(cmd, "Error reading \"%s\" : %s", fn[heap->i], strerror(errno)); @@ -1787,12 +1929,10 @@ static int bam_merge_simple(int by_qname, char *sort_tag, const 
char *out, static inline int bam1_cmp_core(const bam1_tag a, const bam1_tag b) { uint64_t pa, pb; - if (!a.bam_record) - return 1; - if (!b.bam_record) - return 0; + if (!a.bam_record) return 1; + if (!b.bam_record) return 0; - if (g_is_by_qname) { + if (g_sam_order == QueryName || g_sam_order == TagQueryName) { int t = strnum_cmp(bam_get_qname(a.bam_record), bam_get_qname(b.bam_record)); if (t != 0) return t; return (int) (a.bam_record->core.flag&0xc0) - (int) (b.bam_record->core.flag&0xc0); @@ -1898,8 +2038,7 @@ static inline int bam1_cmp_by_minhash(const bam1_tag a, const bam1_tag b) if (!A) return 1; if (!B) return 0; - if (A->core.tid != -1 || B->core.tid != -1) - return bam1_cmp_core(a,b); + if (A->core.tid != -1 || B->core.tid != -1) return bam1_cmp_core(a,b); const uint64_t m_a = (((uint64_t)A->core.pos)<<32)|(uint32_t)A->core.mpos; const uint64_t m_b = (((uint64_t)B->core.pos)<<32)|(uint32_t)B->core.mpos; @@ -1916,16 +2055,158 @@ static inline int bam1_cmp_by_minhash(const bam1_tag a, const bam1_tag b) return bam1_cmp_core(a,b); } +// compares to molecular identifiers, ignoring any trailing slash and subsequent single-character +// * if mid1 is less than mid2, then -1 will be returned +// * if mid1 is greater than mid2, then 1 will be returned +static inline int template_coordinate_key_compare_mid(const char* mid1, const char* mid2) { + size_t i = 0; + size_t len1 = strlen(mid1); + size_t len2 = strlen(mid2); + size_t shortest; + + // Snip off trailing slash followed by a single character, if present + if (len1 >= 2 && mid1[len1-2] == '/') len1 -= 2; + if (len2 >= 2 && mid2[len2-2] == '/') len2 -= 2; + shortest = len1 < len2 ? 
len1 : len2; + + // find first mismatching character + while (i < shortest && mid1[i] == mid2[i]) i++; + + // compare last characters + if (i == len1 && i < len2) return -1; // mid1 shorter + if (i == len2 && i < len1) return 1; // mid2 shorter + if (i == len1 && i == len2) return 0; // all characters match + if (mid1[i] < mid2[i]) return -1; // mid1 earlier + else return 1; +} + + +// Builds a key use to sort in TemplateCoordinate order. Returns NULL if the key could not be created (e.g. MC +// tag is missing), otherwise the pointer to the provided key. +static template_coordinate_key_t* template_coordinate_key(bam1_t *b, template_coordinate_key_t *key, sam_hdr_t *hdr, khash_t(const_c2c) *lib_lookup) { + uint8_t *data; + char *rg; + khiter_t k; + + // defaults + key->tid1 = key->tid2 = INT32_MAX; + key->pos1 = key->pos2 = HTS_POS_MAX; + key->neg1 = key->neg2 = false; + key->mid = ""; + + // update values + rg = (char *)bam_aux_get(b, "RG"); + if (rg && rg[0] == 'Z' + &&(k = kh_get(const_c2c, lib_lookup, rg + 1)) < kh_end(lib_lookup)) { + key->library = kh_value(lib_lookup, k); + } else { + key->library = ""; + } + key->name = bam_get_qname(b); + if (!(b->core.flag & BAM_FUNMAP)) { // read is mapped, update coordinates + key->tid1 = b->core.tid; + key->neg1 = bam_is_rev(b); + key->pos1 = (key->neg1) ? unclipped_end(b) : unclipped_start(b); + } + if (b->core.flag & BAM_FPAIRED && !(b->core.flag & BAM_FMUNMAP)) { // mate is mapped, update coordinates + char *cigar; + if ((data = bam_aux_get(b, "MC"))) { + if (!(cigar = bam_aux2Z(data))) { + fprintf(stderr, "[bam_sort] error: MC tag wrong type. Please use the MC tag provided by samtools fixmate.\n"); + return NULL; + } + } else { + fprintf(stderr, "[bam_sort] error: no MC tag. Please run samtools fixmate on file first.\n"); + return NULL; + } + key->tid2 = b->core.mtid; + key->neg2 = bam_is_mrev(b); + key->pos2 = (key->neg2) ? 
unclipped_other_end(b->core.mpos, cigar) : unclipped_other_start(b->core.mpos, cigar); + } + + if ((data = bam_aux_get(b, "MI"))) { + if (!(key->mid=bam_aux2Z(data))) { + fprintf(stderr, "[bam_sort] error: MI tag wrong type (not a string).\n"); + return NULL; + } + } + + // set is_upper_of_pair, and swap if we get the same key regardless of which end + // of the pair it is + if (key->tid1 < key->tid2 + || (key->tid1 == key->tid2 && key->pos1 < key->pos2) + || (key->tid1 == key->tid2 && key->pos1 == key->pos2 && !key->neg1)) { + key->is_upper_of_pair = false; + } else { + key->is_upper_of_pair = true; + // swap + int tmp_tid; + hts_pos_t tmp_pos; + bool tmp_neg; + tmp_tid = key->tid1; + key->tid1 = key->tid2; + key->tid2 = tmp_tid; + tmp_pos = key->pos1; + key->pos1 = key->pos2; + key->pos2 = tmp_pos; + tmp_neg = key->neg1; + key->neg1 = key->neg2; + key->neg2 = tmp_neg; + } + + return key; +} + +// Function to compare reads and determine which one is < or > the other +// Handles template-coordinate, which sorts by: +// 1. the earlier unclipped 5' coordinate of the read pair +// 2. the higher unclipped 5' coordinate of the read pair +// 3. library (from read group) +// 4. the molecular identifier (if present) +// 5. read name +// 6. if unpaired, or if R1 has the lower coordinates of the pair +// Returns a value less than, equal to or greater than zero if a is less than, +// equal to or greater than b, respectively. +static inline int bam1_cmp_template_coordinate(const bam1_tag a, const bam1_tag b) +{ + if (!a.bam_record) return 1; + if (!b.bam_record) return 0; + + const template_coordinate_key_t* key_a = a.u.key; + const template_coordinate_key_t* key_b = b.u.key; + + int retval = 0; + if (0 == retval) retval = key_a->tid1 - key_b->tid1; + if (0 == retval) retval = key_a->tid2 - key_b->tid2; + if (0 == retval) retval = key_a->pos1 < key_b->pos1 ? -1 : (key_a->pos1 > key_b->pos1 ? 1 : 0); + if (0 == retval) retval = key_a->pos2 < key_b->pos2 ? 
-1 : (key_a->pos2 > key_b->pos2 ? 1 : 0); + if (0 == retval) retval = key_a->neg1 == key_b->neg1 ? 0 : (key_a->neg1 ? -1 : 1); + if (0 == retval) retval = key_a->neg2 == key_b->neg2 ? 0 : (key_a->neg2 ? -1 : 1); + if (0 == retval) retval = strcmp(key_a->library, key_b->library); + if (0 == retval) retval = template_coordinate_key_compare_mid(key_a->mid, key_b->mid); + if (0 == retval) retval = strcmp(key_a->name, key_b->name); + if (0 == retval) retval = key_a->is_upper_of_pair == key_b->is_upper_of_pair ? 0 : (key_a->is_upper_of_pair ? 1 : -1); + return retval < 0 ? -1 : (retval > 0 ? 1 : 0); +} + + // Function to compare reads and determine which one is < the other -// Handle sort-by-pos, sort-by-name, or sort-by-tag +// Handle sort-by-pos, sort-by-name, sort-by-tag, or sort-by-template-coordinate. static inline int bam1_lt(const bam1_tag a, const bam1_tag b) { - if (g_is_by_tag) { - return bam1_cmp_by_tag(a, b) < 0; - } else if (g_is_by_minhash) { - return bam1_cmp_by_minhash(a, b) < 0; - } else { - return bam1_cmp_core(a,b) < 0; + switch (g_sam_order) { + case Coordinate: + case QueryName: + return bam1_cmp_core(a, b) < 0; + case TagQueryName: + case TagCoordinate: + return bam1_cmp_by_tag(a, b) < 0; + case MinHash: + return bam1_cmp_by_minhash(a, b) < 0; + case TemplateCoordinate: + return bam1_cmp_template_coordinate(a, b) < 0; + default: + return bam1_cmp_core(a,b) < 0; } } @@ -1943,6 +2224,7 @@ typedef struct { int error; int no_save; int large_pos; + int minimiser_kmer; } worker_t; // Returns 0 for success @@ -2246,6 +2528,30 @@ static int reverse_complement(bam1_t *b) { } //--- End of candidates to punt to htslib + +static inline void worker_minhash(worker_t *w) { + int i; + for (i = 0; i < w->buf_len; i++) { + bam1_t *b = w->buf[i].bam_record; + if (b->core.tid != -1) + continue; + + int pos = 0, rev = 0; + uint64_t mh = minhash(b, w->minimiser_kmer, &pos, &rev); + if (rev) + reverse_complement(b); + + // Store 64-bit hash in unmapped pos and mpos 
fields. + // The position of hash is in isize, which we use for + // resolving ties when sorting by hash key. + // These are unused for completely unmapped data and + // will be reset during final output. + b->core.pos = mh>>31; + b->core.mpos = mh&0x7fffffff; + b->core.isize = 65535-pos >=0 ? 65535-pos : 0; + } +} + static void *worker(void *data) { worker_t *w = (worker_t*)data; @@ -2254,35 +2560,18 @@ static void *worker(void *data) w->error = 0; w->tmpfile_name = NULL; - if (!g_is_by_qname && !g_is_by_tag && !g_is_by_minhash) { - if (ks_radixsort(w->buf_len, w->buf, w->h) < 0) { - w->error = errno; - return NULL; - } - } else { - if (g_is_by_minhash) { - int i; - for (i = 0; i < w->buf_len; i++) { - bam1_t *b = w->buf[i].bam_record; - if (b->core.tid != -1) - continue; - - int pos = 0, rev = 0; - uint64_t mh = minhash(b, g_is_by_minhash, &pos, &rev); - if (rev) - reverse_complement(b); - - // Store 64-bit hash in unmapped pos and mpos fields. - // The position of hash is in isize, which we use for - // resolving ties when sorting by hash key. - // These are unused for completely unmapped data and - // will be reset during final output. - b->core.pos = mh>>31; - b->core.mpos = mh&0x7fffffff; - b->core.isize = 65535-pos >=0 ? 
65535-pos : 0; + switch (g_sam_order) { + case Coordinate: + if (ks_radixsort(w->buf_len, w->buf, w->h) < 0) { + w->error = errno; + return NULL; } - } - ks_mergesort(sort, w->buf_len, w->buf, 0); + break; + case MinHash: + worker_minhash(w); + // no break, go to merge sort + default: + ks_mergesort(sort, w->buf_len, w->buf, 0); } if (w->no_save) @@ -2323,7 +2612,7 @@ static void *worker(void *data) static int sort_blocks(int n_files, size_t k, bam1_tag *buf, const char *prefix, const sam_hdr_t *h, int n_threads, buf_region *in_mem, - int large_pos, char **fns, size_t fns_size) + int large_pos, int minimiser_kmer, char **fns, size_t fns_size) { int i; size_t pos, rest; @@ -2349,6 +2638,7 @@ static int sort_blocks(int n_files, size_t k, bam1_tag *buf, const char *prefix, w[i].index = n_files + i; w[i].tmpfile_name = NULL; w[i].large_pos = large_pos; + w[i].minimiser_kmer = minimiser_kmer; if (in_mem) { w[i].no_save = 1; in_mem[i].from = pos; @@ -2388,13 +2678,68 @@ static int sort_blocks(int n_files, size_t k, bam1_tag *buf, const char *prefix, return n_files + n_threads; } +static void lib_lookup_destroy(khash_t(const_c2c) *lib_lookup) { + khiter_t k; + if (lib_lookup == NULL) + return; + for (k = kh_begin(lib_lookup); k < kh_end(lib_lookup); k++) { + if (kh_exist(lib_lookup, k)) + free(kh_value(lib_lookup, k)); + } + kh_destroy(const_c2c, lib_lookup); +} + +// Build an RG to LB lookup table, for the template coordinate sort. +// Returns a populated hash table (which may be empty) on success; +// NULL on failure. 
+static khash_t(const_c2c) * lookup_libraries(sam_hdr_t *header) +{ + khash_t(const_c2c) *lib_lookup = kh_init(const_c2c); + kstring_t lib_name = KS_INITIALIZE; + int num_rg, i, res; + if (!lib_lookup) + return NULL; + + // Iterate through any RG lines and look for library information + num_rg = sam_hdr_count_lines(header, "RG"); + if (num_rg < 0) + goto fail; + + for (i = 0; i < num_rg; i++) { + const char *rg_id = sam_hdr_line_name(header, "RG", i); + khiter_t k; + if (!rg_id) + goto fail; + res = sam_hdr_find_tag_pos(header, "RG", i, "LB", &lib_name); + if (res < -1) // Error + goto fail; + if (res < 0 || !lib_name.s) // No LB tag + continue; + // Add to lookup table + k = kh_put(const_c2c, lib_lookup, rg_id, &res); + if (res < 0) // Error + goto fail; + if (res > 0) { // Inserted + kh_value(lib_lookup, k) = ks_release(&lib_name); + } + } + + free(lib_name.s); + + return lib_lookup; + + fail: + lib_lookup_destroy(lib_lookup); + free(lib_name.s); + return NULL; +} /*! - @abstract Sort an unsorted BAM file based on the chromosome order - and the leftmost position of an alignment + @abstract Sort an unsorted BAM file based on the provided sort order - @param is_by_qname whether to sort by query name - @param sort_by_tag if non-null, sort by the given tag + @param sam_order the order in which the sort should occur + @param sort_tag the tag to use if sorting by Tag + @param minimiser_kmer the kmer size when sorting by MinHash @param fn name of the file to be sorted @param prefix prefix of the temporary files (prefix.NNNN.bam are written) @param fnout name of the final output file to be written @@ -2411,23 +2756,28 @@ static int sort_blocks(int n_files, size_t k, bam1_tag *buf, const char *prefix, and then merge them by calling bam_merge_simple(). This function is NOT thread safe. 
*/ -int bam_sort_core_ext(int is_by_qname, char* sort_by_tag, const char *fn, const char *prefix, +int bam_sort_core_ext(SamOrder sam_order, char* sort_tag, int minimiser_kmer, + const char *fn, const char *prefix, const char *fnout, const char *modeout, - size_t _max_mem, int by_minimiser, int n_threads, + size_t _max_mem, int n_threads, const htsFormat *in_fmt, const htsFormat *out_fmt, char *arg_list, int no_pg, int write_index) { int ret = -1, res, i, nref, n_files = 0; size_t max_k, k, max_mem, bam_mem_offset; sam_hdr_t *header = NULL; - samFile *fp; + samFile *fp = NULL; bam1_tag *buf = NULL; + template_coordinate_keys_t *keys = NULL; bam1_t *b = bam_init1(); uint8_t *bam_mem = NULL; char **fns = NULL; size_t fns_size = 0; - const char *new_so; + const char *new_so = NULL; + const char *new_go = NULL; + const char *new_ss = NULL; buf_region *in_mem = NULL; + khash_t(const_c2c) *lib_lookup = NULL; int num_in_mem = 0; int large_pos = 0; @@ -2437,12 +2787,21 @@ int bam_sort_core_ext(int is_by_qname, char* sort_by_tag, const char *fn, const } if (n_threads < 2) n_threads = 1; - g_is_by_qname = is_by_qname; - g_is_by_minhash = by_minimiser; - if (sort_by_tag) { - g_is_by_tag = 1; - g_sort_tag[0] = sort_by_tag[0]; - g_sort_tag[1] = sort_by_tag[0] ? sort_by_tag[1] : '\0'; + g_sam_order = sam_order; + if (g_sam_order == TagQueryName || g_sam_order == TagCoordinate) { + g_sort_tag[0] = sort_tag[0]; + g_sort_tag[1] = sort_tag[0] ? 
sort_tag[1] : '\0'; + } + + if (sam_order == TemplateCoordinate) { + if ((keys = malloc(sizeof(template_coordinate_keys_t))) == NULL) { + print_error("sort", "could not allocate memory for the top-level keys"); + goto err; + } + keys->n = 0; + keys->m = 0; + keys->buffer_size = 0x10000; + keys->buffers = NULL; } max_mem = _max_mem * n_threads; @@ -2480,15 +2839,45 @@ int bam_sort_core_ext(int is_by_qname, char* sort_by_tag, const char *fn, const } } - if (sort_by_tag != NULL) - new_so = "unknown"; - else if (is_by_qname) - new_so = "queryname"; - else - new_so = "coordinate"; + if (g_sam_order == TemplateCoordinate) { + lib_lookup = lookup_libraries(header); + if (!lib_lookup) + goto err; + } - if (by_minimiser) { - const char *new_ss = "coordinate:minhash"; + switch (g_sam_order) { + case Coordinate: + new_so = "coordinate"; + break; + case QueryName: + new_so = "queryname"; + break; + case MinHash: + new_so = "coordinate"; + new_ss = "coordinate:minhash"; + break; + case TagQueryName: + case TagCoordinate: + new_so = "unknown"; + break; + case TemplateCoordinate: + new_so = "unsorted"; + new_go = "query"; + new_ss = "unsorted:template-coordinate"; + break; + default: + new_so = "unknown"; + break; + } + + if (new_ss == NULL && new_go == NULL) { // just SO + if ((-1 == sam_hdr_update_hd(header, "SO", new_so)) + && (-1 == sam_hdr_add_line(header, "HD", "VN", SAM_FORMAT_VERSION, "SO", new_so, NULL)) + ) { + print_error("sort", "failed to change sort order header to 'SO:%s'\n", new_so); + goto err; + } + } else if (new_ss != NULL && new_go == NULL) { // update SO and SS, but not GO if ((-1 == sam_hdr_update_hd(header, "SO", new_so, "SS", new_ss)) && (-1 == sam_hdr_add_line(header, "HD", "VN", SAM_FORMAT_VERSION, "SO", new_so, "SS", new_ss, NULL)) @@ -2497,18 +2886,37 @@ int bam_sort_core_ext(int is_by_qname, char* sort_by_tag, const char *fn, const new_so, new_ss); goto err; } - } else { - if ((-1 == sam_hdr_update_hd(header, "SO", new_so)) - && (-1 == 
sam_hdr_add_line(header, "HD", "VN", SAM_FORMAT_VERSION, "SO", new_so, NULL)) + } else if (new_ss == NULL && new_go != NULL) { // update SO and GO, but not SS + if ((-1 == sam_hdr_update_hd(header, "SO", new_so, "GO", new_go)) + && (-1 == sam_hdr_add_line(header, "HD", "VN", SAM_FORMAT_VERSION, + "SO", new_so, "GO", new_go, NULL)) ) { - print_error("sort", "failed to change sort order header to 'SO:%s'\n", new_so); + print_error("sort", "failed to change sort order header to 'SO:%s GO:%s'\n", + new_so, new_go); + goto err; + } + } else { // update SO, GO, and SS + if ((-1 == sam_hdr_update_hd(header, "SO", new_so, "GO", new_go, "SS", new_ss)) + && (-1 == sam_hdr_add_line(header, "HD", "VN", SAM_FORMAT_VERSION, + "SO", new_so, "GO", new_go, "SS", new_ss, NULL)) + ) { + print_error("sort", "failed to change sort order header to 'SO:%s GO:%s SS:%s'\n", + new_so, new_go, new_ss); goto err; } } - if (-1 == sam_hdr_remove_tag_hd(header, "GO")) { - print_error("sort", "failed to delete group order header\n"); - goto err; + if (new_go == NULL) { + if (-1 == sam_hdr_remove_tag_hd(header, "GO")) { + print_error("sort", "failed to delete group order in header\n"); + goto err; + } + } + if (new_ss == NULL) { + if (-1 == sam_hdr_remove_tag_hd(header, "SS")) { + print_error("sort", "failed to delete sub sort in header\n"); + goto err; + } } // No gain to using the thread pool here as the flow of this code @@ -2536,6 +2944,11 @@ int bam_sort_core_ext(int is_by_qname, char* sort_by_tag, const char *fn, const } buf = new_buf; } + if (sam_order == TemplateCoordinate && k >= keys->m * keys->buffer_size) { + if (template_coordinate_keys_realloc(keys, k + 1) == -1) { + goto err; + } + } // Check if the BAM record will fit in the memory limit if (bam_mem_offset + sizeof(*b) + b->l_data < max_mem) { @@ -2553,12 +2966,21 @@ int bam_sort_core_ext(int is_by_qname, char* sort_by_tag, const char *fn, const mem_full = 1; } - // Pull out the value of the position - // or the pointer to the sort 
tag if applicable - if (g_is_by_tag) { - buf[k].u.tag = bam_aux_get(buf[k].bam_record, g_sort_tag); - } else { - buf[k].u.tag = NULL; + // Set the tag if sorting by tag, or the key for template cooridinate sorting + switch (g_sam_order) { + case TagQueryName: + case TagCoordinate: + buf[k].u.tag = bam_aux_get(buf[k].bam_record, g_sort_tag); + break; + case TemplateCoordinate: + ++keys->n; + template_coordinate_key_t *key = template_coordinate_keys_get(keys, k); + buf[k].u.key = template_coordinate_key(buf[k].bam_record, key, header, lib_lookup); + if (buf[k].u.key == NULL) goto err; + break; + default: + buf[k].u.tag = NULL; + buf[k].u.key = NULL; } ++k; @@ -2567,13 +2989,14 @@ int bam_sort_core_ext(int is_by_qname, char* sort_by_tag, const char *fn, const &fns_size, &fns, 0) < 0) goto err; int new_n = sort_blocks(n_files, k, buf, prefix, header, n_threads, - NULL, large_pos, fns, fns_size); + NULL, large_pos, minimiser_kmer, fns, fns_size); if (new_n < 0) { goto err; } else { n_files = new_n; } k = 0; + if (keys != NULL) keys->n = 0; bam_mem_offset = 0; } } @@ -2587,7 +3010,7 @@ int bam_sort_core_ext(int is_by_qname, char* sort_by_tag, const char *fn, const in_mem = calloc(n_threads > 0 ? 
n_threads : 1, sizeof(in_mem[0])); if (!in_mem) goto err; num_in_mem = sort_blocks(n_files, k, buf, prefix, header, n_threads, - in_mem, large_pos, fns, fns_size); + in_mem, large_pos, minimiser_kmer, fns, fns_size); if (num_in_mem < 0) goto err; } else { num_in_mem = 0; @@ -2596,7 +3019,7 @@ int bam_sort_core_ext(int is_by_qname, char* sort_by_tag, const char *fn, const // write the final output if (n_files == 0 && num_in_mem < 2) { // a single block if (write_buffer(fnout, modeout, k, buf, header, n_threads, out_fmt, - g_is_by_minhash, arg_list, no_pg, write_index) != 0) { + minimiser_kmer, arg_list, no_pg, write_index) != 0) { print_error_errno("sort", "failed to create \"%s\"", fnout); goto err; } @@ -2612,10 +3035,11 @@ int bam_sort_core_ext(int is_by_qname, char* sort_by_tag, const char *fn, const abort(); } } - if (bam_merge_simple(is_by_qname, sort_by_tag, fnout, modeout, header, - n_files, fns, num_in_mem, in_mem, buf, - n_threads, "sort", in_fmt, out_fmt, arg_list, - no_pg, write_index) < 0) { + char *sort_by_tag = (sam_order == TagQueryName || sam_order == TagCoordinate) ? sort_tag : NULL; + if (bam_merge_simple(sam_order, sort_by_tag, fnout, modeout, header, + n_files, fns, num_in_mem, in_mem, buf, keys, + lib_lookup, n_threads, "sort", in_fmt, out_fmt, + arg_list, no_pg, write_index) < 0) { // Propagate bam_merge_simple() failure; it has already emitted a // message explaining the failure, so no further message is needed. 
goto err; @@ -2637,8 +3061,16 @@ int bam_sort_core_ext(int is_by_qname, char* sort_by_tag, const char *fn, const } bam_destroy1(b); free(buf); + if (keys != NULL) { + for (i = 0; i < keys->m; ++i) { + free(keys->buffers[i]); + } + free(keys->buffers); + free(keys); + } free(bam_mem); free(in_mem); + lib_lookup_destroy(lib_lookup); sam_hdr_destroy(header); if (fp) sam_close(fp); return ret; @@ -2651,7 +3083,9 @@ int bam_sort_core(int is_by_qname, const char *fn, const char *prefix, size_t ma char *fnout = calloc(strlen(prefix) + 4 + 1, 1); if (!fnout) return -1; sprintf(fnout, "%s.bam", prefix); - ret = bam_sort_core_ext(is_by_qname, NULL, fn, prefix, fnout, "wb", max_mem, 0, 0, NULL, NULL, NULL, 1, 0); + SamOrder sam_order = is_by_qname ? QueryName : Coordinate; + g_sam_order = sam_order; + ret = bam_sort_core_ext(sam_order, NULL, 0, fn, prefix, fnout, "wb", max_mem, 0, NULL, NULL, NULL, 1, 0); free(fnout); return ret; } @@ -2670,7 +3104,10 @@ static void sort_usage(FILE *fp) " -t TAG Sort by value of TAG. 
Uses position as secondary index (or read name if -n is set)\n" " -o FILE Write final output to FILE rather than standard output\n" " -T PREFIX Write temporary files to PREFIX.nnnn.bam\n" -" --no-PG do not add a PG line\n"); +" --no-PG\n" +" Do not add a PG line\n" +" --template-coordinate\n" +" Sort by template-coordinate\n"); sam_global_opt_help(fp, "-.O..@.."); } @@ -2694,8 +3131,10 @@ static void complain_about_memory_setting(size_t max_mem) { int bam_sort(int argc, char *argv[]) { size_t max_mem = SORT_DEFAULT_MEGS_PER_THREAD << 20; - int c, nargs, is_by_qname = 0, ret, o_seen = 0, level = -1, no_pg = 0; - int by_minimiser = 0, minimiser_kmer = 20; + int c, nargs, ret, o_seen = 0, level = -1, no_pg = 0; + SamOrder sam_order = Coordinate; + bool by_tag = false; + int minimiser_kmer = 20; char* sort_tag = NULL, *arg_list = NULL; char *fnout = "-", modeout[12]; kstring_t tmpprefix = { 0, 0, NULL }; @@ -2706,14 +3145,15 @@ int bam_sort(int argc, char *argv[]) SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0, '@'), { "threads", required_argument, NULL, '@' }, {"no-PG", no_argument, NULL, 1}, + { "template-coordinate", no_argument, NULL, 2}, { NULL, 0, NULL, 0 } }; while ((c = getopt_long(argc, argv, "l:m:no:O:T:@:t:MK:u", lopts, NULL)) >= 0) { switch (c) { case 'o': fnout = optarg; o_seen = 1; break; - case 'n': is_by_qname = 1; break; - case 't': sort_tag = optarg; break; + case 'n': sam_order = QueryName; break; + case 't': by_tag = true; sort_tag = optarg; break; case 'm': { char *q; max_mem = strtol(optarg, &q, 0); @@ -2726,7 +3166,8 @@ int bam_sort(int argc, char *argv[]) case 'l': level = atoi(optarg); break; case 'u': level = 0; break; case 1: no_pg = 1; break; - case 'M': by_minimiser = 1; break; + case 2: sam_order = TemplateCoordinate; break; + case 'M': sam_order = MinHash; break; case 'K': minimiser_kmer = atoi(optarg); if (minimiser_kmer < 1) @@ -2741,6 +3182,11 @@ int bam_sort(int argc, char *argv[]) } } + // Change sort order if tag sorting is requested. 
Must update based on secondary index + if (by_tag) { + sam_order = sam_order == QueryName ? TagQueryName : TagCoordinate; + } + nargs = argc - optind; if (nargs == 0 && isatty(STDIN_FILENO)) { sort_usage(stdout); @@ -2757,7 +3203,7 @@ int bam_sort(int argc, char *argv[]) goto sort_end; } - if (ga.write_index && (is_by_qname || sort_tag)) { + if (ga.write_index && (sam_order == QueryName || sam_order == TagQueryName || sam_order == TagCoordinate || sam_order == TemplateCoordinate)) { fprintf(stderr, "[W::bam_sort] Ignoring --write-index as it only works for position sorted files.\n"); ga.write_index = 0; } @@ -2792,9 +3238,9 @@ int bam_sort(int argc, char *argv[]) ksprintf(&tmpprefix, "samtools.%d.%u.tmp", (int) getpid(), t % 10000); } - ret = bam_sort_core_ext(is_by_qname, sort_tag, (nargs > 0)? argv[optind] : "-", - tmpprefix.s, fnout, modeout, max_mem, - by_minimiser * minimiser_kmer, ga.nthreads, + ret = bam_sort_core_ext(sam_order, sort_tag, (sam_order == MinHash) ? minimiser_kmer : 0, + (nargs > 0) ? argv[optind] : "-", + tmpprefix.s, fnout, modeout, max_mem, ga.nthreads, &ga.in, &ga.out, arg_list, no_pg, ga.write_index); if (ret >= 0) ret = EXIT_SUCCESS; diff --git a/samtools/bam_sort.c.pysam.c b/samtools/bam_sort.c.pysam.c index 1385b29..3489044 100644 --- a/samtools/bam_sort.c.pysam.c +++ b/samtools/bam_sort.c.pysam.c @@ -2,7 +2,7 @@ /* bam_sort.c -- sorting and merging. - Copyright (C) 2008-2021 Genome Research Ltd. + Copyright (C) 2008-2022 Genome Research Ltd. Portions copyright (C) 2009-2012 Broad Institute. Author: Heng Li @@ -53,16 +53,71 @@ DEALINGS IN THE SOFTWARE. */ #include "sam_opts.h" #include "samtools.h" #include "bedidx.h" +#include "bam.h" + + +// Struct which contains the sorting key for TemplateCoordinate sort. 
+typedef struct { + int tid1; + int tid2; + hts_pos_t pos1; + hts_pos_t pos2; + bool neg1; + bool neg2; + const char *library; + char *mid; + char *name; + bool is_upper_of_pair; +} template_coordinate_key_t; + +// Struct to store fixed buffers of template coordinate keys +typedef struct { + size_t n; // the # of keys stored + size_t m; // the # of buffers allocated + size_t buffer_size; // # the fixed size of each buffer + template_coordinate_key_t **buffers; // the list of buffers +} template_coordinate_keys_t; + +// Gets the idx'th key; does not OOB check +static template_coordinate_key_t* template_coordinate_keys_get(template_coordinate_keys_t *keys, size_t idx) { + size_t buffer_idx = idx / keys->buffer_size; // the index of the buffer to retrieve in buffer + size_t buffer_offset = idx % keys->buffer_size; // the offset into the given buffer to retrieve + //assert(buffer_idx < keys->m); + //assert(buffer_offset < keys->buffer_size); + return &keys->buffers[buffer_idx][buffer_offset]; +} + +// Rellocates the buffers to hold at least max_k entries +static int template_coordinate_keys_realloc(template_coordinate_keys_t *keys, int max_k) { + size_t cur_m = keys->m; + keys->m += 0x100; + //assert(keys->m > cur_m); + //assert(keys->m * keys->buffer_size >= max_k); + if ((keys->buffers = realloc(keys->buffers, keys->m * sizeof(template_coordinate_key_t*))) == NULL) { + print_error("sort", "couldn't reallocate memory for template coordinate key buffers"); + return -1; + } + // allocate space for new buffers + int j; + for (j = cur_m; j < keys->m; ++j) { + if ((keys->buffers[j]= malloc(sizeof(template_coordinate_key_t) * keys->buffer_size)) == NULL) { + print_error("sort", "couldn't allocate memory for template coordinate key buffer"); + return -1; + } + } + return 0; +} // Struct which contains the a record, and the pointer to the sort tag (if any) or // a combined ref / position / strand. -// Used to speed up tag and position sorts. 
+// Used to speed up sorts (coordinate, by-tag, and template-coordinate). typedef struct bam1_tag { bam1_t *bam_record; union { const uint8_t *tag; uint8_t pos_tid[12]; + template_coordinate_key_t *key; } u; } bam1_tag; @@ -97,13 +152,15 @@ void memset_pattern4(void *target, const void *pattern, size_t size) { KHASH_INIT(c2c, char*, char*, 1, kh_str_hash_func, kh_str_hash_equal) KHASH_INIT(cset, char*, char, 0, kh_str_hash_func, kh_str_hash_equal) KHASH_MAP_INIT_STR(c2i, int) +KHASH_MAP_INIT_STR(const_c2c, char *) #define hdrln_free_char(p) KLIST_INIT(hdrln, char*, hdrln_free_char) -static int g_is_by_qname = 0; -static int g_is_by_tag = 0; -static int g_is_by_minhash = 0; +static template_coordinate_key_t* template_coordinate_key(bam1_t *b, template_coordinate_key_t *key, sam_hdr_t *hdr, khash_t(const_c2c) *lib_lookup); + +typedef enum {Coordinate, QueryName, TagCoordinate, TagQueryName, MinHash, TemplateCoordinate} SamOrder; +static SamOrder g_sam_order = Coordinate; static char g_sort_tag[2] = {0,0}; static int strnum_cmp(const char *_a, const char *_b) @@ -141,6 +198,9 @@ typedef struct { static inline int bam1_cmp_by_tag(const bam1_tag a, const bam1_tag b); static inline int bam1_cmp_by_minhash(const bam1_tag a, const bam1_tag b); +static inline int bam1_cmp_template_coordinate(const bam1_tag a, const bam1_tag b); +static khash_t(const_c2c) * lookup_libraries(sam_hdr_t *header); +static void lib_lookup_destroy(khash_t(const_c2c) *lib_lookup); // Function to compare reads in the heap and determine which one is < the other // Note, unlike the bam1_cmp_by_X functions which return <0, 0, >0 this @@ -152,25 +212,38 @@ static inline int heap_lt(const heap1_t a, const heap1_t b) if (!b.entry.bam_record) return 0; - if (g_is_by_tag) { - int t; - t = bam1_cmp_by_tag(a.entry, b.entry); - if (t != 0) return t > 0; - } else if (g_is_by_minhash) { - int t = bam1_cmp_by_minhash(a.entry, b.entry); - if (t != 0) return t > 0; - } else if (g_is_by_qname) { - int t, fa, fb; - t 
= strnum_cmp(bam_get_qname(a.entry.bam_record), bam_get_qname(b.entry.bam_record)); - if (t != 0) return t > 0; - fa = a.entry.bam_record->core.flag & 0xc0; - fb = b.entry.bam_record->core.flag & 0xc0; - if (fa != fb) return fa > fb; - } else { - if (a.tid != b.tid) return a.tid > b.tid; - if (a.pos != b.pos) return a.pos > b.pos; - if (a.rev != b.rev) return a.rev > b.rev; + int t, fa, fb; + switch (g_sam_order) { + case Coordinate: + if (a.tid != b.tid) return a.tid > b.tid; + if (a.pos != b.pos) return a.pos > b.pos; + if (a.rev != b.rev) return a.rev > b.rev; + break; + case QueryName: + t = strnum_cmp(bam_get_qname(a.entry.bam_record), bam_get_qname(b.entry.bam_record)); + if (t != 0) return t > 0; + fa = a.entry.bam_record->core.flag & 0xc0; + fb = b.entry.bam_record->core.flag & 0xc0; + if (fa != fb) return fa > fb; + break; + case TagQueryName: + case TagCoordinate: + t = bam1_cmp_by_tag(a.entry, b.entry); + if (t != 0) return t > 0; + break; + case MinHash: + t = bam1_cmp_by_minhash(a.entry, b.entry); + if (t != 0) return t > 0; + break; + case TemplateCoordinate: + t = bam1_cmp_template_coordinate(a.entry, b.entry); + if (t != 0) return t > 0; + break; + default: + print_error("heap_lt", "unknown sort order: %d", g_sam_order); + break; } + // This compares by position in the input file(s) if (a.i != b.i) return a.i > b.i; return a.idx > b.idx; @@ -563,7 +636,7 @@ static klist_t(hdrln) * trans_rg_pg(bool is_rg, sam_hdr_t *translate, } // If there are no RG lines in the file and we are overriding add one - if (is_rg && override && kl_begin(hdr_lines) == NULL) { + if (is_rg && override && hdr_lines->size == 0) { kstring_t new_id = {0, 0, NULL}; kstring_t line = {0, 0, NULL}; kstring_t empty = {0, 0, NULL}; @@ -977,8 +1050,8 @@ static hts_reglist_t *duplicate_reglist(const hts_reglist_t *rl, int rn) { /*! @abstract Merge multiple sorted BAM. 
- @param by_qname whether to sort by query name - @param sort_tag if non-null, sort by the given tag + @param sam_order the order in which the data was sorted + @param sort_tag if non-null, the tag that data was sorted by @param out output BAM file name @param mode sam_open() mode to be used to create the final output file (overrides level settings from UNCOMP and LEVEL1 flags) @@ -998,7 +1071,7 @@ static hts_reglist_t *duplicate_reglist(const hts_reglist_t *rl, int rn) { @discussion Padding information may NOT correctly maintained. This function is NOT thread safe. */ -int bam_merge_core2(int by_qname, char* sort_tag, const char *out, const char *mode, +int bam_merge_core2(SamOrder sam_order, char* sort_tag, const char *out, const char *mode, const char *headers, int n, char * const *fn, char * const *fn_idx, const char *fn_bed, int flag, const char *reg, int n_threads, const char *cmd, const htsFormat *in_fmt, const htsFormat *out_fmt, @@ -1021,6 +1094,8 @@ int bam_merge_core2(int by_qname, char* sort_tag, const char *out, const char *m merged_header_t *merged_hdr = init_merged_header(); if (!merged_hdr) return -1; refs_t *refs = NULL; + template_coordinate_keys_t *keys = NULL; + khash_t(const_c2c) *lib_lookup = NULL; // Is there a specified pre-prepared header to use for output? if (headers) { @@ -1037,9 +1112,8 @@ int bam_merge_core2(int by_qname, char* sort_tag, const char *out, const char *m } } - g_is_by_qname = by_qname; - if (sort_tag) { - g_is_by_tag = 1; + g_sam_order = sam_order; + if (sam_order == TagQueryName || sam_order == TagCoordinate) { g_sort_tag[0] = sort_tag[0]; g_sort_tag[1] = sort_tag[0] ? 
sort_tag[1] : '\0'; } @@ -1108,7 +1182,7 @@ int bam_merge_core2(int by_qname, char* sort_tag, const char *out, const char *m hdr[i] = hin; int order_ok = 1; - if ((translation_tbl+i)->lost_coord_sort && !by_qname) { + if ((translation_tbl+i)->lost_coord_sort && (sam_order == Coordinate || sam_order == MinHash)) { fprintf(samtools_stderr, "[bam_merge_core] Order of targets in file %s caused coordinate sort to be lost\n", fn[i]); order_ok = 0; } @@ -1228,6 +1302,26 @@ int bam_merge_core2(int by_qname, char* sort_tag, const char *out, const char *m rtrans = NULL; } + // Make sure that there's enough memory for template coordinate keys, one per file to read + if (sam_order == TemplateCoordinate) { + if ((keys = malloc(sizeof(template_coordinate_keys_t))) == NULL) { + print_error("sort", "could not allocate memory for the top-level keys"); + goto mem_fail; + } + keys->n = 0; + keys->m = 0; + keys->buffer_size = 0x10000; + keys->buffers = NULL; + // Make sure that there's enough memory for template coordinate keys, one per file to read + if (keys->n + n >= keys->m * keys->buffer_size) { + if (template_coordinate_keys_realloc(keys, keys->n + n) < 0) goto mem_fail; + } + lib_lookup = lookup_libraries(hout); + if (!lib_lookup) { + goto mem_fail; + } + } + // Load the first read from each file into the heap for (i = 0; i < n; ++i) { heap1_t *h = heap + i; @@ -1243,8 +1337,12 @@ int bam_merge_core2(int by_qname, char* sort_tag, const char *out, const char *m h->pos = (uint64_t)(h->entry.bam_record->core.pos + 1); h->rev = bam_is_rev(h->entry.bam_record); h->idx = idx++; - if (g_is_by_tag) { + if (g_sam_order == TagQueryName || g_sam_order == TagCoordinate) { h->entry.u.tag = bam_aux_get(h->entry.bam_record, g_sort_tag); + } else if (g_sam_order == TemplateCoordinate) { + template_coordinate_key_t *key = template_coordinate_keys_get(keys, i); // get the next key to use + h->entry.u.key = template_coordinate_key(heap->entry.bam_record, key, hout, lib_lookup); // update the key 
+ if (heap->entry.u.key == NULL) goto mem_fail; // key could not be created, error out } else { h->entry.u.tag = NULL; } @@ -1254,6 +1352,7 @@ int bam_merge_core2(int by_qname, char* sort_tag, const char *out, const char *m bam_destroy1(h->entry.bam_record); h->entry.bam_record = NULL; h->entry.u.tag = NULL; + h->entry.u.key = NULL; } else { print_error(cmd, "failed to read first record from \"%s\"", fn[i]); goto fail; @@ -1311,8 +1410,12 @@ int bam_merge_core2(int by_qname, char* sort_tag, const char *out, const char *m heap->pos = (uint64_t)(b->core.pos + 1); heap->rev = bam_is_rev(b); heap->idx = idx++; - if (g_is_by_tag) { + if (g_sam_order == TagQueryName || g_sam_order == TagCoordinate) { heap->entry.u.tag = bam_aux_get(heap->entry.bam_record, g_sort_tag); + } else if (g_sam_order == TemplateCoordinate) { + template_coordinate_key_t *key = template_coordinate_keys_get(keys, heap->i); // get the next key to use + heap->entry.u.key = template_coordinate_key(heap->entry.bam_record, key, hout, lib_lookup); // update the key + if (heap->entry.u.key == NULL) goto mem_fail; // key could not be created, error out } else { heap->entry.u.tag = NULL; } @@ -1387,6 +1490,14 @@ int bam_merge_core2(int by_qname, char* sort_tag, const char *out, const char *m free(fp); free(rtrans); free(out_idx_fn); + if (keys != NULL) { + for (i = 0; i < keys->m; ++i) { + free(keys->buffers[i]); + } + free(keys->buffers); + free(keys); + } + lib_lookup_destroy(lib_lookup); return -1; } @@ -1397,7 +1508,8 @@ int bam_merge_core(int by_qname, const char *out, const char *headers, int n, ch strcpy(mode, "wb"); if (flag & MERGE_UNCOMP) strcat(mode, "0"); else if (flag & MERGE_LEVEL1) strcat(mode, "1"); - return bam_merge_core2(by_qname, NULL, out, mode, headers, n, fn, NULL, NULL, flag, reg, 0, "merge", NULL, NULL, 0, NULL, 1); + SamOrder sam_order = by_qname ? 
QueryName : Coordinate; + return bam_merge_core2(sam_order, NULL, out, mode, headers, n, fn, NULL, NULL, flag, reg, 0, "merge", NULL, NULL, 0, NULL, 1); } static void merge_usage(FILE *to) @@ -1423,25 +1535,28 @@ static void merge_usage(FILE *to) " -b FILE List of input BAM filenames, one per line [null]\n" " -X Use customized index files\n" " -L FILE Specify a BED file for multiple region filtering [null]\n" -" --no-PG do not add a PG line\n"); +" --no-PG do not add a PG line\n" +" --template-coordinate Input files are sorted by template-coordinate\n"); sam_global_opt_help(to, "-.O..@.."); } int bam_merge(int argc, char *argv[]) { - int c, is_by_qname = 0, flag = 0, ret = 0, level = -1, has_index_file = 0; + int c, flag = 0, ret = 0, level = -1, has_index_file = 0; char *fn_headers = NULL, *reg = NULL, mode[12]; char *sort_tag = NULL, *fnout = NULL, *arg_list = NULL; long random_seed = (long)time(NULL); char** fn = NULL; char** fn_idx = NULL, *fn_bed = NULL; int fn_size = 0, no_pg = 0; + SamOrder sam_order = Coordinate; sam_global_args ga = SAM_GLOBAL_ARGS_INIT; static const struct option lopts[] = { SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0, '@'), { "threads", required_argument, NULL, '@' }, {"no-PG", no_argument, NULL, 1}, + { "template-coordinate", no_argument, NULL, 2}, { NULL, 0, NULL, 0 } }; @@ -1455,7 +1570,7 @@ int bam_merge(int argc, char *argv[]) case 'r': flag |= MERGE_RG; break; case 'f': flag |= MERGE_FORCE; break; case 'h': fn_headers = optarg; break; - case 'n': is_by_qname = 1; break; + case 'n': sam_order = QueryName; break; case 'o': fnout = optarg; break; case 't': sort_tag = optarg; break; case '1': flag |= MERGE_LEVEL1; level = 1; break; @@ -1490,12 +1605,17 @@ int bam_merge(int argc, char *argv[]) break; } case 1: no_pg = 1; break; + case 2: sam_order = TemplateCoordinate; break; default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; /* else fall-through */ case '?': merge_usage(samtools_stderr); return 1; } } + if (sort_tag != 
NULL) { + sam_order = sam_order == QueryName ? TagQueryName : TagCoordinate; + } + if (fnout == NULL && argc - optind >= 1) { fnout = argv[optind]; optind++; @@ -1560,7 +1680,7 @@ int bam_merge(int argc, char *argv[]) strcpy(mode, "wb"); sam_open_mode(mode+1, fnout, NULL); if (level >= 0) sprintf(strchr(mode, '\0'), "%d", level < 9? level : 9); - if (bam_merge_core2(is_by_qname, sort_tag, fnout, mode, fn_headers, + if (bam_merge_core2(sam_order, sort_tag, fnout, mode, fn_headers, fn_size+nargcfiles, fn, fn_idx, fn_bed, flag, reg, ga.nthreads, "merge", &ga.in, &ga.out, ga.write_index, arg_list, no_pg) < 0) ret = 1; @@ -1582,6 +1702,7 @@ end: * BAM sorting * ***************/ + typedef struct { size_t from; size_t to; @@ -1593,13 +1714,24 @@ typedef struct { static inline int heap_add_read(heap1_t *heap, int nfiles, samFile **fp, int num_in_mem, buf_region *in_mem, - bam1_tag *buf, uint64_t *idx, sam_hdr_t *hout) { + bam1_tag *buf, template_coordinate_keys_t *keys, + uint64_t *idx, sam_hdr_t *hout, + khash_t(const_c2c) *lib_lookup) { int i = heap->i, res; if (i < nfiles) { // read from file res = sam_read1(fp[i], hout, heap->entry.bam_record); + if (res >= 0 && g_sam_order == TemplateCoordinate) { // file read OK and TemplateCoordinate order + // It is assumed that there are nfiles more keys allocated than keys->n; see allocation in bam_merge_simple + template_coordinate_key_t *key = template_coordinate_keys_get(keys, keys->n + i); // get the next key to use + heap->entry.u.key = template_coordinate_key(heap->entry.bam_record, key, hout, lib_lookup); // update the key + if (heap->entry.u.key == NULL) res = -1; // key could not be created, error out + } } else { // read from memory if (in_mem[i - nfiles].from < in_mem[i - nfiles].to) { - heap->entry.bam_record = buf[in_mem[i - nfiles].from++].bam_record; + size_t from = in_mem[i - nfiles].from; + heap->entry.bam_record = buf[from].bam_record; + if (g_sam_order == TemplateCoordinate) heap->entry.u.key = buf[from].u.key; 
+ in_mem[i - nfiles].from++; res = 0; } else { res = -1; @@ -1610,26 +1742,30 @@ static inline int heap_add_read(heap1_t *heap, int nfiles, samFile **fp, heap->pos = (uint64_t)(heap->entry.bam_record->core.pos + 1); heap->rev = bam_is_rev(heap->entry.bam_record); heap->idx = (*idx)++; - if (g_is_by_tag) { + if (g_sam_order == TagQueryName || g_sam_order == TagCoordinate) { heap->entry.u.tag = bam_aux_get(heap->entry.bam_record, g_sort_tag); - } else { + } else if (g_sam_order != TemplateCoordinate) { heap->entry.u.tag = NULL; + heap->entry.u.key = NULL; } } else if (res == -1) { heap->pos = HEAP_EMPTY; if (i < nfiles) bam_destroy1(heap->entry.bam_record); heap->entry.bam_record = NULL; heap->entry.u.tag = NULL; + heap->entry.u.key = NULL; } else { return -1; } return 0; } -static int bam_merge_simple(int by_qname, char *sort_tag, const char *out, +static int bam_merge_simple(SamOrder sam_order, char *sort_tag, const char *out, const char *mode, sam_hdr_t *hout, int n, char * const *fn, int num_in_mem, - buf_region *in_mem, bam1_tag *buf, int n_threads, + buf_region *in_mem, bam1_tag *buf, + template_coordinate_keys_t *keys, + khash_t(const_c2c) *lib_lookup, int n_threads, const char *cmd, const htsFormat *in_fmt, const htsFormat *out_fmt, char *arg_list, int no_pg, int write_index) { @@ -1639,9 +1775,7 @@ static int bam_merge_simple(int by_qname, char *sort_tag, const char *out, int i, heap_size = n + num_in_mem; char *out_idx_fn = NULL; - g_is_by_qname = by_qname; - if (sort_tag) { - g_is_by_tag = 1; + if (sam_order == TagQueryName || sam_order == TagCoordinate) { g_sort_tag[0] = sort_tag[0]; g_sort_tag[1] = sort_tag[0] ? 
sort_tag[1] : '\0'; } @@ -1652,6 +1786,11 @@ static int bam_merge_simple(int by_qname, char *sort_tag, const char *out, heap = (heap1_t*)calloc(heap_size, sizeof(heap1_t)); if (!heap) goto mem_fail; + // Make sure that there's enough memory for template coordinate keys, one per file to read + if (keys && keys->n + n >= keys->m * keys->buffer_size) { + if (template_coordinate_keys_realloc(keys, keys->n + n) < 0) goto mem_fail; + } + // Open each file, read the header and put the first read into the heap for (i = 0; i < heap_size; i++) { sam_hdr_t *hin; @@ -1677,11 +1816,13 @@ static int bam_merge_simple(int by_qname, char *sort_tag, const char *out, // Get a read into the heap h->i = i; h->entry.u.tag = NULL; + h->entry.u.key = NULL; if (i < n) { h->entry.bam_record = bam_init1(); if (!h->entry.bam_record) goto mem_fail; } - if (heap_add_read(h, n, fp, num_in_mem, in_mem, buf, &idx, hout) < 0) { + if (heap_add_read(h, n, fp, num_in_mem, in_mem, buf, keys, &idx, hout, + lib_lookup) < 0) { assert(i < n); print_error(cmd, "failed to read first record from \"%s\"", fn[i]); goto fail; @@ -1723,7 +1864,7 @@ static int bam_merge_simple(int by_qname, char *sort_tag, const char *out, ks_heapmake(heap, heap_size, heap); while (heap->pos != HEAP_EMPTY) { bam1_t *b = heap->entry.bam_record; - if (g_is_by_minhash && b->core.tid == -1) { + if (g_sam_order == MinHash && b->core.tid == -1) { // Remove the cached minhash value b->core.pos = -1; b->core.mpos = -1; @@ -1733,7 +1874,8 @@ static int bam_merge_simple(int by_qname, char *sort_tag, const char *out, print_error_errno(cmd, "failed writing to \"%s\"", out); goto fail; } - if (heap_add_read(heap, n, fp, num_in_mem, in_mem, buf, &idx, hout) < 0) { + if (heap_add_read(heap, n, fp, num_in_mem, in_mem, buf, keys, &idx, + hout, lib_lookup) < 0) { assert(heap->i < n); print_error(cmd, "Error reading \"%s\" : %s", fn[heap->i], strerror(errno)); @@ -1789,12 +1931,10 @@ static int bam_merge_simple(int by_qname, char *sort_tag, const 
char *out, static inline int bam1_cmp_core(const bam1_tag a, const bam1_tag b) { uint64_t pa, pb; - if (!a.bam_record) - return 1; - if (!b.bam_record) - return 0; + if (!a.bam_record) return 1; + if (!b.bam_record) return 0; - if (g_is_by_qname) { + if (g_sam_order == QueryName || g_sam_order == TagQueryName) { int t = strnum_cmp(bam_get_qname(a.bam_record), bam_get_qname(b.bam_record)); if (t != 0) return t; return (int) (a.bam_record->core.flag&0xc0) - (int) (b.bam_record->core.flag&0xc0); @@ -1900,8 +2040,7 @@ static inline int bam1_cmp_by_minhash(const bam1_tag a, const bam1_tag b) if (!A) return 1; if (!B) return 0; - if (A->core.tid != -1 || B->core.tid != -1) - return bam1_cmp_core(a,b); + if (A->core.tid != -1 || B->core.tid != -1) return bam1_cmp_core(a,b); const uint64_t m_a = (((uint64_t)A->core.pos)<<32)|(uint32_t)A->core.mpos; const uint64_t m_b = (((uint64_t)B->core.pos)<<32)|(uint32_t)B->core.mpos; @@ -1918,16 +2057,158 @@ static inline int bam1_cmp_by_minhash(const bam1_tag a, const bam1_tag b) return bam1_cmp_core(a,b); } +// compares to molecular identifiers, ignoring any trailing slash and subsequent single-character +// * if mid1 is less than mid2, then -1 will be returned +// * if mid1 is greater than mid2, then 1 will be returned +static inline int template_coordinate_key_compare_mid(const char* mid1, const char* mid2) { + size_t i = 0; + size_t len1 = strlen(mid1); + size_t len2 = strlen(mid2); + size_t shortest; + + // Snip off trailing slash followed by a single character, if present + if (len1 >= 2 && mid1[len1-2] == '/') len1 -= 2; + if (len2 >= 2 && mid2[len2-2] == '/') len2 -= 2; + shortest = len1 < len2 ? 
len1 : len2; + + // find first mismatching character + while (i < shortest && mid1[i] == mid2[i]) i++; + + // compare last characters + if (i == len1 && i < len2) return -1; // mid1 shorter + if (i == len2 && i < len1) return 1; // mid2 shorter + if (i == len1 && i == len2) return 0; // all characters match + if (mid1[i] < mid2[i]) return -1; // mid1 earlier + else return 1; +} + + +// Builds a key use to sort in TemplateCoordinate order. Returns NULL if the key could not be created (e.g. MC +// tag is missing), otherwise the pointer to the provided key. +static template_coordinate_key_t* template_coordinate_key(bam1_t *b, template_coordinate_key_t *key, sam_hdr_t *hdr, khash_t(const_c2c) *lib_lookup) { + uint8_t *data; + char *rg; + khiter_t k; + + // defaults + key->tid1 = key->tid2 = INT32_MAX; + key->pos1 = key->pos2 = HTS_POS_MAX; + key->neg1 = key->neg2 = false; + key->mid = ""; + + // update values + rg = (char *)bam_aux_get(b, "RG"); + if (rg && rg[0] == 'Z' + &&(k = kh_get(const_c2c, lib_lookup, rg + 1)) < kh_end(lib_lookup)) { + key->library = kh_value(lib_lookup, k); + } else { + key->library = ""; + } + key->name = bam_get_qname(b); + if (!(b->core.flag & BAM_FUNMAP)) { // read is mapped, update coordinates + key->tid1 = b->core.tid; + key->neg1 = bam_is_rev(b); + key->pos1 = (key->neg1) ? unclipped_end(b) : unclipped_start(b); + } + if (b->core.flag & BAM_FPAIRED && !(b->core.flag & BAM_FMUNMAP)) { // mate is mapped, update coordinates + char *cigar; + if ((data = bam_aux_get(b, "MC"))) { + if (!(cigar = bam_aux2Z(data))) { + fprintf(samtools_stderr, "[bam_sort] error: MC tag wrong type. Please use the MC tag provided by samtools fixmate.\n"); + return NULL; + } + } else { + fprintf(samtools_stderr, "[bam_sort] error: no MC tag. Please run samtools fixmate on file first.\n"); + return NULL; + } + key->tid2 = b->core.mtid; + key->neg2 = bam_is_mrev(b); + key->pos2 = (key->neg2) ? 
unclipped_other_end(b->core.mpos, cigar) : unclipped_other_start(b->core.mpos, cigar); + } + + if ((data = bam_aux_get(b, "MI"))) { + if (!(key->mid=bam_aux2Z(data))) { + fprintf(samtools_stderr, "[bam_sort] error: MI tag wrong type (not a string).\n"); + return NULL; + } + } + + // set is_upper_of_pair, and swap if we get the same key regardless of which end + // of the pair it is + if (key->tid1 < key->tid2 + || (key->tid1 == key->tid2 && key->pos1 < key->pos2) + || (key->tid1 == key->tid2 && key->pos1 == key->pos2 && !key->neg1)) { + key->is_upper_of_pair = false; + } else { + key->is_upper_of_pair = true; + // swap + int tmp_tid; + hts_pos_t tmp_pos; + bool tmp_neg; + tmp_tid = key->tid1; + key->tid1 = key->tid2; + key->tid2 = tmp_tid; + tmp_pos = key->pos1; + key->pos1 = key->pos2; + key->pos2 = tmp_pos; + tmp_neg = key->neg1; + key->neg1 = key->neg2; + key->neg2 = tmp_neg; + } + + return key; +} + +// Function to compare reads and determine which one is < or > the other +// Handles template-coordinate, which sorts by: +// 1. the earlier unclipped 5' coordinate of the read pair +// 2. the higher unclipped 5' coordinate of the read pair +// 3. library (from read group) +// 4. the molecular identifier (if present) +// 5. read name +// 6. if unpaired, or if R1 has the lower coordinates of the pair +// Returns a value less than, equal to or greater than zero if a is less than, +// equal to or greater than b, respectively. +static inline int bam1_cmp_template_coordinate(const bam1_tag a, const bam1_tag b) +{ + if (!a.bam_record) return 1; + if (!b.bam_record) return 0; + + const template_coordinate_key_t* key_a = a.u.key; + const template_coordinate_key_t* key_b = b.u.key; + + int retval = 0; + if (0 == retval) retval = key_a->tid1 - key_b->tid1; + if (0 == retval) retval = key_a->tid2 - key_b->tid2; + if (0 == retval) retval = key_a->pos1 < key_b->pos1 ? -1 : (key_a->pos1 > key_b->pos1 ? 1 : 0); + if (0 == retval) retval = key_a->pos2 < key_b->pos2 ? 
-1 : (key_a->pos2 > key_b->pos2 ? 1 : 0); + if (0 == retval) retval = key_a->neg1 == key_b->neg1 ? 0 : (key_a->neg1 ? -1 : 1); + if (0 == retval) retval = key_a->neg2 == key_b->neg2 ? 0 : (key_a->neg2 ? -1 : 1); + if (0 == retval) retval = strcmp(key_a->library, key_b->library); + if (0 == retval) retval = template_coordinate_key_compare_mid(key_a->mid, key_b->mid); + if (0 == retval) retval = strcmp(key_a->name, key_b->name); + if (0 == retval) retval = key_a->is_upper_of_pair == key_b->is_upper_of_pair ? 0 : (key_a->is_upper_of_pair ? 1 : -1); + return retval < 0 ? -1 : (retval > 0 ? 1 : 0); +} + + // Function to compare reads and determine which one is < the other -// Handle sort-by-pos, sort-by-name, or sort-by-tag +// Handle sort-by-pos, sort-by-name, sort-by-tag, or sort-by-template-coordinate. static inline int bam1_lt(const bam1_tag a, const bam1_tag b) { - if (g_is_by_tag) { - return bam1_cmp_by_tag(a, b) < 0; - } else if (g_is_by_minhash) { - return bam1_cmp_by_minhash(a, b) < 0; - } else { - return bam1_cmp_core(a,b) < 0; + switch (g_sam_order) { + case Coordinate: + case QueryName: + return bam1_cmp_core(a, b) < 0; + case TagQueryName: + case TagCoordinate: + return bam1_cmp_by_tag(a, b) < 0; + case MinHash: + return bam1_cmp_by_minhash(a, b) < 0; + case TemplateCoordinate: + return bam1_cmp_template_coordinate(a, b) < 0; + default: + return bam1_cmp_core(a,b) < 0; } } @@ -1945,6 +2226,7 @@ typedef struct { int error; int no_save; int large_pos; + int minimiser_kmer; } worker_t; // Returns 0 for success @@ -2248,6 +2530,30 @@ static int reverse_complement(bam1_t *b) { } //--- End of candidates to punt to htslib + +static inline void worker_minhash(worker_t *w) { + int i; + for (i = 0; i < w->buf_len; i++) { + bam1_t *b = w->buf[i].bam_record; + if (b->core.tid != -1) + continue; + + int pos = 0, rev = 0; + uint64_t mh = minhash(b, w->minimiser_kmer, &pos, &rev); + if (rev) + reverse_complement(b); + + // Store 64-bit hash in unmapped pos and mpos 
fields. + // The position of hash is in isize, which we use for + // resolving ties when sorting by hash key. + // These are unused for completely unmapped data and + // will be reset during final output. + b->core.pos = mh>>31; + b->core.mpos = mh&0x7fffffff; + b->core.isize = 65535-pos >=0 ? 65535-pos : 0; + } +} + static void *worker(void *data) { worker_t *w = (worker_t*)data; @@ -2256,35 +2562,18 @@ static void *worker(void *data) w->error = 0; w->tmpfile_name = NULL; - if (!g_is_by_qname && !g_is_by_tag && !g_is_by_minhash) { - if (ks_radixsort(w->buf_len, w->buf, w->h) < 0) { - w->error = errno; - return NULL; - } - } else { - if (g_is_by_minhash) { - int i; - for (i = 0; i < w->buf_len; i++) { - bam1_t *b = w->buf[i].bam_record; - if (b->core.tid != -1) - continue; - - int pos = 0, rev = 0; - uint64_t mh = minhash(b, g_is_by_minhash, &pos, &rev); - if (rev) - reverse_complement(b); - - // Store 64-bit hash in unmapped pos and mpos fields. - // The position of hash is in isize, which we use for - // resolving ties when sorting by hash key. - // These are unused for completely unmapped data and - // will be reset during final output. - b->core.pos = mh>>31; - b->core.mpos = mh&0x7fffffff; - b->core.isize = 65535-pos >=0 ? 
65535-pos : 0; + switch (g_sam_order) { + case Coordinate: + if (ks_radixsort(w->buf_len, w->buf, w->h) < 0) { + w->error = errno; + return NULL; } - } - ks_mergesort(sort, w->buf_len, w->buf, 0); + break; + case MinHash: + worker_minhash(w); + // no break, go to merge sort + default: + ks_mergesort(sort, w->buf_len, w->buf, 0); } if (w->no_save) @@ -2325,7 +2614,7 @@ static void *worker(void *data) static int sort_blocks(int n_files, size_t k, bam1_tag *buf, const char *prefix, const sam_hdr_t *h, int n_threads, buf_region *in_mem, - int large_pos, char **fns, size_t fns_size) + int large_pos, int minimiser_kmer, char **fns, size_t fns_size) { int i; size_t pos, rest; @@ -2351,6 +2640,7 @@ static int sort_blocks(int n_files, size_t k, bam1_tag *buf, const char *prefix, w[i].index = n_files + i; w[i].tmpfile_name = NULL; w[i].large_pos = large_pos; + w[i].minimiser_kmer = minimiser_kmer; if (in_mem) { w[i].no_save = 1; in_mem[i].from = pos; @@ -2390,13 +2680,68 @@ static int sort_blocks(int n_files, size_t k, bam1_tag *buf, const char *prefix, return n_files + n_threads; } +static void lib_lookup_destroy(khash_t(const_c2c) *lib_lookup) { + khiter_t k; + if (lib_lookup == NULL) + return; + for (k = kh_begin(lib_lookup); k < kh_end(lib_lookup); k++) { + if (kh_exist(lib_lookup, k)) + free(kh_value(lib_lookup, k)); + } + kh_destroy(const_c2c, lib_lookup); +} + +// Build an RG to LB lookup table, for the template coordinate sort. +// Returns a populated hash table (which may be empty) on success; +// NULL on failure. 
+static khash_t(const_c2c) * lookup_libraries(sam_hdr_t *header) +{ + khash_t(const_c2c) *lib_lookup = kh_init(const_c2c); + kstring_t lib_name = KS_INITIALIZE; + int num_rg, i, res; + if (!lib_lookup) + return NULL; + + // Iterate through any RG lines and look for library information + num_rg = sam_hdr_count_lines(header, "RG"); + if (num_rg < 0) + goto fail; + + for (i = 0; i < num_rg; i++) { + const char *rg_id = sam_hdr_line_name(header, "RG", i); + khiter_t k; + if (!rg_id) + goto fail; + res = sam_hdr_find_tag_pos(header, "RG", i, "LB", &lib_name); + if (res < -1) // Error + goto fail; + if (res < 0 || !lib_name.s) // No LB tag + continue; + // Add to lookup table + k = kh_put(const_c2c, lib_lookup, rg_id, &res); + if (res < 0) // Error + goto fail; + if (res > 0) { // Inserted + kh_value(lib_lookup, k) = ks_release(&lib_name); + } + } + + free(lib_name.s); + + return lib_lookup; + + fail: + lib_lookup_destroy(lib_lookup); + free(lib_name.s); + return NULL; +} /*! - @abstract Sort an unsorted BAM file based on the chromosome order - and the leftmost position of an alignment + @abstract Sort an unsorted BAM file based on the provided sort order - @param is_by_qname whether to sort by query name - @param sort_by_tag if non-null, sort by the given tag + @param sam_order the order in which the sort should occur + @param sort_tag the tag to use if sorting by Tag + @param minimiser_kmer the kmer size when sorting by MinHash @param fn name of the file to be sorted @param prefix prefix of the temporary files (prefix.NNNN.bam are written) @param fnout name of the final output file to be written @@ -2413,23 +2758,28 @@ static int sort_blocks(int n_files, size_t k, bam1_tag *buf, const char *prefix, and then merge them by calling bam_merge_simple(). This function is NOT thread safe. 
*/ -int bam_sort_core_ext(int is_by_qname, char* sort_by_tag, const char *fn, const char *prefix, +int bam_sort_core_ext(SamOrder sam_order, char* sort_tag, int minimiser_kmer, + const char *fn, const char *prefix, const char *fnout, const char *modeout, - size_t _max_mem, int by_minimiser, int n_threads, + size_t _max_mem, int n_threads, const htsFormat *in_fmt, const htsFormat *out_fmt, char *arg_list, int no_pg, int write_index) { int ret = -1, res, i, nref, n_files = 0; size_t max_k, k, max_mem, bam_mem_offset; sam_hdr_t *header = NULL; - samFile *fp; + samFile *fp = NULL; bam1_tag *buf = NULL; + template_coordinate_keys_t *keys = NULL; bam1_t *b = bam_init1(); uint8_t *bam_mem = NULL; char **fns = NULL; size_t fns_size = 0; - const char *new_so; + const char *new_so = NULL; + const char *new_go = NULL; + const char *new_ss = NULL; buf_region *in_mem = NULL; + khash_t(const_c2c) *lib_lookup = NULL; int num_in_mem = 0; int large_pos = 0; @@ -2439,12 +2789,21 @@ int bam_sort_core_ext(int is_by_qname, char* sort_by_tag, const char *fn, const } if (n_threads < 2) n_threads = 1; - g_is_by_qname = is_by_qname; - g_is_by_minhash = by_minimiser; - if (sort_by_tag) { - g_is_by_tag = 1; - g_sort_tag[0] = sort_by_tag[0]; - g_sort_tag[1] = sort_by_tag[0] ? sort_by_tag[1] : '\0'; + g_sam_order = sam_order; + if (g_sam_order == TagQueryName || g_sam_order == TagCoordinate) { + g_sort_tag[0] = sort_tag[0]; + g_sort_tag[1] = sort_tag[0] ? 
sort_tag[1] : '\0'; + } + + if (sam_order == TemplateCoordinate) { + if ((keys = malloc(sizeof(template_coordinate_keys_t))) == NULL) { + print_error("sort", "could not allocate memory for the top-level keys"); + goto err; + } + keys->n = 0; + keys->m = 0; + keys->buffer_size = 0x10000; + keys->buffers = NULL; } max_mem = _max_mem * n_threads; @@ -2482,15 +2841,45 @@ int bam_sort_core_ext(int is_by_qname, char* sort_by_tag, const char *fn, const } } - if (sort_by_tag != NULL) - new_so = "unknown"; - else if (is_by_qname) - new_so = "queryname"; - else - new_so = "coordinate"; + if (g_sam_order == TemplateCoordinate) { + lib_lookup = lookup_libraries(header); + if (!lib_lookup) + goto err; + } - if (by_minimiser) { - const char *new_ss = "coordinate:minhash"; + switch (g_sam_order) { + case Coordinate: + new_so = "coordinate"; + break; + case QueryName: + new_so = "queryname"; + break; + case MinHash: + new_so = "coordinate"; + new_ss = "coordinate:minhash"; + break; + case TagQueryName: + case TagCoordinate: + new_so = "unknown"; + break; + case TemplateCoordinate: + new_so = "unsorted"; + new_go = "query"; + new_ss = "unsorted:template-coordinate"; + break; + default: + new_so = "unknown"; + break; + } + + if (new_ss == NULL && new_go == NULL) { // just SO + if ((-1 == sam_hdr_update_hd(header, "SO", new_so)) + && (-1 == sam_hdr_add_line(header, "HD", "VN", SAM_FORMAT_VERSION, "SO", new_so, NULL)) + ) { + print_error("sort", "failed to change sort order header to 'SO:%s'\n", new_so); + goto err; + } + } else if (new_ss != NULL && new_go == NULL) { // update SO and SS, but not GO if ((-1 == sam_hdr_update_hd(header, "SO", new_so, "SS", new_ss)) && (-1 == sam_hdr_add_line(header, "HD", "VN", SAM_FORMAT_VERSION, "SO", new_so, "SS", new_ss, NULL)) @@ -2499,18 +2888,37 @@ int bam_sort_core_ext(int is_by_qname, char* sort_by_tag, const char *fn, const new_so, new_ss); goto err; } - } else { - if ((-1 == sam_hdr_update_hd(header, "SO", new_so)) - && (-1 == 
sam_hdr_add_line(header, "HD", "VN", SAM_FORMAT_VERSION, "SO", new_so, NULL)) + } else if (new_ss == NULL && new_go != NULL) { // update SO and GO, but not SS + if ((-1 == sam_hdr_update_hd(header, "SO", new_so, "GO", new_go)) + && (-1 == sam_hdr_add_line(header, "HD", "VN", SAM_FORMAT_VERSION, + "SO", new_so, "GO", new_go, NULL)) ) { - print_error("sort", "failed to change sort order header to 'SO:%s'\n", new_so); + print_error("sort", "failed to change sort order header to 'SO:%s GO:%s'\n", + new_so, new_go); + goto err; + } + } else { // update SO, GO, and SS + if ((-1 == sam_hdr_update_hd(header, "SO", new_so, "GO", new_go, "SS", new_ss)) + && (-1 == sam_hdr_add_line(header, "HD", "VN", SAM_FORMAT_VERSION, + "SO", new_so, "GO", new_go, "SS", new_ss, NULL)) + ) { + print_error("sort", "failed to change sort order header to 'SO:%s GO:%s SS:%s'\n", + new_so, new_go, new_ss); goto err; } } - if (-1 == sam_hdr_remove_tag_hd(header, "GO")) { - print_error("sort", "failed to delete group order header\n"); - goto err; + if (new_go == NULL) { + if (-1 == sam_hdr_remove_tag_hd(header, "GO")) { + print_error("sort", "failed to delete group order in header\n"); + goto err; + } + } + if (new_ss == NULL) { + if (-1 == sam_hdr_remove_tag_hd(header, "SS")) { + print_error("sort", "failed to delete sub sort in header\n"); + goto err; + } } // No gain to using the thread pool here as the flow of this code @@ -2538,6 +2946,11 @@ int bam_sort_core_ext(int is_by_qname, char* sort_by_tag, const char *fn, const } buf = new_buf; } + if (sam_order == TemplateCoordinate && k >= keys->m * keys->buffer_size) { + if (template_coordinate_keys_realloc(keys, k + 1) == -1) { + goto err; + } + } // Check if the BAM record will fit in the memory limit if (bam_mem_offset + sizeof(*b) + b->l_data < max_mem) { @@ -2555,12 +2968,21 @@ int bam_sort_core_ext(int is_by_qname, char* sort_by_tag, const char *fn, const mem_full = 1; } - // Pull out the value of the position - // or the pointer to the sort 
tag if applicable - if (g_is_by_tag) { - buf[k].u.tag = bam_aux_get(buf[k].bam_record, g_sort_tag); - } else { - buf[k].u.tag = NULL; + // Set the tag if sorting by tag, or the key for template cooridinate sorting + switch (g_sam_order) { + case TagQueryName: + case TagCoordinate: + buf[k].u.tag = bam_aux_get(buf[k].bam_record, g_sort_tag); + break; + case TemplateCoordinate: + ++keys->n; + template_coordinate_key_t *key = template_coordinate_keys_get(keys, k); + buf[k].u.key = template_coordinate_key(buf[k].bam_record, key, header, lib_lookup); + if (buf[k].u.key == NULL) goto err; + break; + default: + buf[k].u.tag = NULL; + buf[k].u.key = NULL; } ++k; @@ -2569,13 +2991,14 @@ int bam_sort_core_ext(int is_by_qname, char* sort_by_tag, const char *fn, const &fns_size, &fns, 0) < 0) goto err; int new_n = sort_blocks(n_files, k, buf, prefix, header, n_threads, - NULL, large_pos, fns, fns_size); + NULL, large_pos, minimiser_kmer, fns, fns_size); if (new_n < 0) { goto err; } else { n_files = new_n; } k = 0; + if (keys != NULL) keys->n = 0; bam_mem_offset = 0; } } @@ -2589,7 +3012,7 @@ int bam_sort_core_ext(int is_by_qname, char* sort_by_tag, const char *fn, const in_mem = calloc(n_threads > 0 ? 
n_threads : 1, sizeof(in_mem[0])); if (!in_mem) goto err; num_in_mem = sort_blocks(n_files, k, buf, prefix, header, n_threads, - in_mem, large_pos, fns, fns_size); + in_mem, large_pos, minimiser_kmer, fns, fns_size); if (num_in_mem < 0) goto err; } else { num_in_mem = 0; @@ -2598,7 +3021,7 @@ int bam_sort_core_ext(int is_by_qname, char* sort_by_tag, const char *fn, const // write the final output if (n_files == 0 && num_in_mem < 2) { // a single block if (write_buffer(fnout, modeout, k, buf, header, n_threads, out_fmt, - g_is_by_minhash, arg_list, no_pg, write_index) != 0) { + minimiser_kmer, arg_list, no_pg, write_index) != 0) { print_error_errno("sort", "failed to create \"%s\"", fnout); goto err; } @@ -2614,10 +3037,11 @@ int bam_sort_core_ext(int is_by_qname, char* sort_by_tag, const char *fn, const abort(); } } - if (bam_merge_simple(is_by_qname, sort_by_tag, fnout, modeout, header, - n_files, fns, num_in_mem, in_mem, buf, - n_threads, "sort", in_fmt, out_fmt, arg_list, - no_pg, write_index) < 0) { + char *sort_by_tag = (sam_order == TagQueryName || sam_order == TagCoordinate) ? sort_tag : NULL; + if (bam_merge_simple(sam_order, sort_by_tag, fnout, modeout, header, + n_files, fns, num_in_mem, in_mem, buf, keys, + lib_lookup, n_threads, "sort", in_fmt, out_fmt, + arg_list, no_pg, write_index) < 0) { // Propagate bam_merge_simple() failure; it has already emitted a // message explaining the failure, so no further message is needed. 
goto err; @@ -2639,8 +3063,16 @@ int bam_sort_core_ext(int is_by_qname, char* sort_by_tag, const char *fn, const } bam_destroy1(b); free(buf); + if (keys != NULL) { + for (i = 0; i < keys->m; ++i) { + free(keys->buffers[i]); + } + free(keys->buffers); + free(keys); + } free(bam_mem); free(in_mem); + lib_lookup_destroy(lib_lookup); sam_hdr_destroy(header); if (fp) sam_close(fp); return ret; @@ -2653,7 +3085,9 @@ int bam_sort_core(int is_by_qname, const char *fn, const char *prefix, size_t ma char *fnout = calloc(strlen(prefix) + 4 + 1, 1); if (!fnout) return -1; sprintf(fnout, "%s.bam", prefix); - ret = bam_sort_core_ext(is_by_qname, NULL, fn, prefix, fnout, "wb", max_mem, 0, 0, NULL, NULL, NULL, 1, 0); + SamOrder sam_order = is_by_qname ? QueryName : Coordinate; + g_sam_order = sam_order; + ret = bam_sort_core_ext(sam_order, NULL, 0, fn, prefix, fnout, "wb", max_mem, 0, NULL, NULL, NULL, 1, 0); free(fnout); return ret; } @@ -2672,7 +3106,10 @@ static void sort_usage(FILE *fp) " -t TAG Sort by value of TAG. 
Uses position as secondary index (or read name if -n is set)\n" " -o FILE Write final output to FILE rather than standard output\n" " -T PREFIX Write temporary files to PREFIX.nnnn.bam\n" -" --no-PG do not add a PG line\n"); +" --no-PG\n" +" Do not add a PG line\n" +" --template-coordinate\n" +" Sort by template-coordinate\n"); sam_global_opt_help(fp, "-.O..@.."); } @@ -2696,8 +3133,10 @@ static void complain_about_memory_setting(size_t max_mem) { int bam_sort(int argc, char *argv[]) { size_t max_mem = SORT_DEFAULT_MEGS_PER_THREAD << 20; - int c, nargs, is_by_qname = 0, ret, o_seen = 0, level = -1, no_pg = 0; - int by_minimiser = 0, minimiser_kmer = 20; + int c, nargs, ret, o_seen = 0, level = -1, no_pg = 0; + SamOrder sam_order = Coordinate; + bool by_tag = false; + int minimiser_kmer = 20; char* sort_tag = NULL, *arg_list = NULL; char *fnout = "-", modeout[12]; kstring_t tmpprefix = { 0, 0, NULL }; @@ -2708,14 +3147,15 @@ int bam_sort(int argc, char *argv[]) SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0, '@'), { "threads", required_argument, NULL, '@' }, {"no-PG", no_argument, NULL, 1}, + { "template-coordinate", no_argument, NULL, 2}, { NULL, 0, NULL, 0 } }; while ((c = getopt_long(argc, argv, "l:m:no:O:T:@:t:MK:u", lopts, NULL)) >= 0) { switch (c) { case 'o': fnout = optarg; o_seen = 1; break; - case 'n': is_by_qname = 1; break; - case 't': sort_tag = optarg; break; + case 'n': sam_order = QueryName; break; + case 't': by_tag = true; sort_tag = optarg; break; case 'm': { char *q; max_mem = strtol(optarg, &q, 0); @@ -2728,7 +3168,8 @@ int bam_sort(int argc, char *argv[]) case 'l': level = atoi(optarg); break; case 'u': level = 0; break; case 1: no_pg = 1; break; - case 'M': by_minimiser = 1; break; + case 2: sam_order = TemplateCoordinate; break; + case 'M': sam_order = MinHash; break; case 'K': minimiser_kmer = atoi(optarg); if (minimiser_kmer < 1) @@ -2743,6 +3184,11 @@ int bam_sort(int argc, char *argv[]) } } + // Change sort order if tag sorting is requested. 
Must update based on secondary index + if (by_tag) { + sam_order = sam_order == QueryName ? TagQueryName : TagCoordinate; + } + nargs = argc - optind; if (nargs == 0 && isatty(STDIN_FILENO)) { sort_usage(samtools_stdout); @@ -2759,7 +3205,7 @@ int bam_sort(int argc, char *argv[]) goto sort_end; } - if (ga.write_index && (is_by_qname || sort_tag)) { + if (ga.write_index && (sam_order == QueryName || sam_order == TagQueryName || sam_order == TagCoordinate || sam_order == TemplateCoordinate)) { fprintf(samtools_stderr, "[W::bam_sort] Ignoring --write-index as it only works for position sorted files.\n"); ga.write_index = 0; } @@ -2794,9 +3240,9 @@ int bam_sort(int argc, char *argv[]) ksprintf(&tmpprefix, "samtools.%d.%u.tmp", (int) getpid(), t % 10000); } - ret = bam_sort_core_ext(is_by_qname, sort_tag, (nargs > 0)? argv[optind] : "-", - tmpprefix.s, fnout, modeout, max_mem, - by_minimiser * minimiser_kmer, ga.nthreads, + ret = bam_sort_core_ext(sam_order, sort_tag, (sam_order == MinHash) ? minimiser_kmer : 0, + (nargs > 0) ? 
argv[optind] : "-", + tmpprefix.s, fnout, modeout, max_mem, ga.nthreads, &ga.in, &ga.out, arg_list, no_pg, ga.write_index); if (ret >= 0) ret = EXIT_SUCCESS; diff --git a/samtools/bamtk.c b/samtools/bamtk.c index ffec347..e690c1d 100644 --- a/samtools/bamtk.c +++ b/samtools/bamtk.c @@ -71,6 +71,7 @@ int main_ampliconstats(int argc, char *argv[]); int main_import(int argc, char *argv[]); int main_samples(int argc, char *argv[]); int main_consensus(int argc, char *argv[]); +int main_reference(int argc, char *argv[]); const char *samtools_version() { @@ -179,6 +180,7 @@ static void usage(FILE *fp) " fastq converts a BAM to a FASTQ\n" " fasta converts a BAM to a FASTA\n" " import Converts FASTA or FASTQ files to SAM/BAM/CRAM\n" +" reference Generates a reference from aligned data\n" "\n" " -- Statistics\n" " bedcov read depth per BED region\n" @@ -284,6 +286,7 @@ int main(int argc, char *argv[]) else if (strcmp(argv[1], "ampliconstats") == 0) ret = main_ampliconstats(argc-1, argv+1); else if (strcmp(argv[1], "samples") == 0) ret = main_samples(argc-1, argv+1); else if (strcmp(argv[1], "consensus") == 0) ret = main_consensus(argc-1, argv+1); + else if (strcmp(argv[1], "reference") == 0) ret = main_reference(argc-1, argv+1); else if (strcmp(argv[1], "version") == 0 || \ strcmp(argv[1], "--version") == 0) long_version(); diff --git a/samtools/bamtk.c.pysam.c b/samtools/bamtk.c.pysam.c index 11d21a3..b798658 100644 --- a/samtools/bamtk.c.pysam.c +++ b/samtools/bamtk.c.pysam.c @@ -74,6 +74,7 @@ int main_ampliconstats(int argc, char *argv[]); int main_import(int argc, char *argv[]); int main_samples(int argc, char *argv[]); int main_consensus(int argc, char *argv[]); +int main_reference(int argc, char *argv[]); const char *samtools_version() { @@ -182,6 +183,7 @@ static void usage(FILE *fp) " fastq converts a BAM to a FASTQ\n" " fasta converts a BAM to a FASTA\n" " import Converts FASTA or FASTQ files to SAM/BAM/CRAM\n" +" reference Generates a reference from aligned data\n" 
"\n" " -- Statistics\n" " bedcov read depth per BED region\n" @@ -287,6 +289,7 @@ int samtools_main(int argc, char *argv[]) else if (strcmp(argv[1], "ampliconstats") == 0) ret = main_ampliconstats(argc-1, argv+1); else if (strcmp(argv[1], "samples") == 0) ret = main_samples(argc-1, argv+1); else if (strcmp(argv[1], "consensus") == 0) ret = main_consensus(argc-1, argv+1); + else if (strcmp(argv[1], "reference") == 0) ret = main_reference(argc-1, argv+1); else if (strcmp(argv[1], "version") == 0 || \ strcmp(argv[1], "--version") == 0) long_version(); diff --git a/samtools/bedcov.c b/samtools/bedcov.c index 07bd9ce..1bd46a0 100644 --- a/samtools/bedcov.c +++ b/samtools/bedcov.c @@ -1,7 +1,7 @@ /* bedcov.c -- bedcov subcommand. Copyright (C) 2012 Broad Institute. - Copyright (C) 2013-2014, 2018-2021 Genome Research Ltd. + Copyright (C) 2013-2014, 2018-2022 Genome Research Ltd. Author: Heng Li @@ -48,6 +48,7 @@ typedef struct { hts_itr_t *iter; int min_mapQ; uint32_t flags; // read filtering flags + int64_t rcnt; } aux_t; static int read_bam(void *data, bam1_t *b) @@ -65,6 +66,12 @@ static int read_bam(void *data, bam1_t *b) return ret; } +static int incr_rcnt(void *data, const bam1_t *b, bam_pileup_cd *cd) { + aux_t *aux = (aux_t *)data; + aux->rcnt++; + return 0; +} + int main_bedcov(int argc, char *argv[]) { gzFile fp; @@ -72,8 +79,9 @@ int main_bedcov(int argc, char *argv[]) kstream_t *ks; hts_idx_t **idx; aux_t **aux; - int *n_plp, dret, i, j, m, n, c, ret, status = 0, min_mapQ = 0, skip_DN = 0; - int64_t *cnt, *pcov = NULL;; + int *n_plp, dret, i, j, m, n, c, ret, status = 0, min_mapQ = 0; + int skip_DN = 0, do_rcount = 0; + int64_t *cnt, *pcov = NULL; const bam_pileup1_t **plp; int usage = 0, has_index_file = 0; uint32_t flags = (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP); @@ -87,10 +95,11 @@ int main_bedcov(int argc, char *argv[]) { NULL, 0, NULL, 0 } }; - while ((c = getopt_long(argc, argv, "Q:Xg:G:jd:", lopts, NULL)) >= 0) { + while ((c = 
getopt_long(argc, argv, "Q:Xg:G:jd:c", lopts, NULL)) >= 0) { switch (c) { case 'Q': min_mapQ = atoi(optarg); break; case 'X': has_index_file = 1; break; + case 'c': do_rcount = 1; break; case 'g': tflags = bam_str2flag(optarg); if (tflags < 0 || tflags > ((BAM_FSUPPLEMENTARY << 1) - 1)) { @@ -122,10 +131,11 @@ int main_bedcov(int argc, char *argv[]) fprintf(stderr, " -X use customized index files\n"); fprintf(stderr, " -g remove the specified flags from the set used to filter out reads\n"); fprintf(stderr, " -G add the specified flags to the set used to filter out reads\n" - " The default set is UNMAP,SECONDARY,QCFAIL,DUP or 0x704"); + " The default set is UNMAP,SECONDARY,QCFAIL,DUP or 0x704\n"); fprintf(stderr, " -j do not include deletions (D) and ref skips (N) in bedcov computation\n"); fprintf(stderr, " -d depth threshold. Number of reference bases with coverage above and" " including this value will be displayed in a separate column\n"); + fprintf(stderr, " -c add an additional column showing read count\n"); sam_global_opt_help(stderr, "-.--.--."); return 1; } @@ -168,8 +178,12 @@ int main_bedcov(int argc, char *argv[]) aux[i]->flags = flags; } cnt = calloc(n, sizeof(*cnt)); + if (min_depth >= 0) pcov = calloc(n, sizeof(*pcov)); - if (!cnt || (min_depth >= 0 && !pcov)) return 2; + if (!cnt || (min_depth >= 0 && !pcov)) { + print_error_errno("bedcov", "failed to allocate memory"); + return 2; + } fp = gzopen(argv[optind], "rb"); if (fp == NULL) { @@ -202,6 +216,7 @@ int main_bedcov(int argc, char *argv[]) for (i = 0; i < n; ++i) { if (aux[i]->iter) hts_itr_destroy(aux[i]->iter); aux[i]->iter = sam_itr_queryi(idx[i], tid, beg, end); + aux[i]->rcnt = 0; } mplp = bam_mplp_init(n, read_bam, (void**)aux); @@ -213,6 +228,9 @@ int main_bedcov(int argc, char *argv[]) memset(cnt, 0, sizeof(*cnt) * n); if (min_depth >= 0) memset(pcov, 0, sizeof(*pcov) * n); + if (do_rcount) + bam_mplp_constructor(mplp, incr_rcnt); + while ((ret = bam_mplp_auto(mplp, &tid, &pos, n_plp, 
plp)) > 0) if (pos >= beg && pos < end) { for (i = 0; i < n; ++i) { @@ -246,6 +264,12 @@ int main_bedcov(int argc, char *argv[]) kputl(pcov[i], &str); } } + if (do_rcount) { + for (i = 0; i < n; ++i) { + kputc('\t', &str); + kputl(aux[i]->rcnt, &str); + } + } puts(str.s); bam_mplp_destroy(mplp); continue; diff --git a/samtools/bedcov.c.pysam.c b/samtools/bedcov.c.pysam.c index 162630f..f259cb1 100644 --- a/samtools/bedcov.c.pysam.c +++ b/samtools/bedcov.c.pysam.c @@ -3,7 +3,7 @@ /* bedcov.c -- bedcov subcommand. Copyright (C) 2012 Broad Institute. - Copyright (C) 2013-2014, 2018-2021 Genome Research Ltd. + Copyright (C) 2013-2014, 2018-2022 Genome Research Ltd. Author: Heng Li @@ -50,6 +50,7 @@ typedef struct { hts_itr_t *iter; int min_mapQ; uint32_t flags; // read filtering flags + int64_t rcnt; } aux_t; static int read_bam(void *data, bam1_t *b) @@ -67,6 +68,12 @@ static int read_bam(void *data, bam1_t *b) return ret; } +static int incr_rcnt(void *data, const bam1_t *b, bam_pileup_cd *cd) { + aux_t *aux = (aux_t *)data; + aux->rcnt++; + return 0; +} + int main_bedcov(int argc, char *argv[]) { gzFile fp; @@ -74,8 +81,9 @@ int main_bedcov(int argc, char *argv[]) kstream_t *ks; hts_idx_t **idx; aux_t **aux; - int *n_plp, dret, i, j, m, n, c, ret, status = 0, min_mapQ = 0, skip_DN = 0; - int64_t *cnt, *pcov = NULL;; + int *n_plp, dret, i, j, m, n, c, ret, status = 0, min_mapQ = 0; + int skip_DN = 0, do_rcount = 0; + int64_t *cnt, *pcov = NULL; const bam_pileup1_t **plp; int usage = 0, has_index_file = 0; uint32_t flags = (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP); @@ -89,10 +97,11 @@ int main_bedcov(int argc, char *argv[]) { NULL, 0, NULL, 0 } }; - while ((c = getopt_long(argc, argv, "Q:Xg:G:jd:", lopts, NULL)) >= 0) { + while ((c = getopt_long(argc, argv, "Q:Xg:G:jd:c", lopts, NULL)) >= 0) { switch (c) { case 'Q': min_mapQ = atoi(optarg); break; case 'X': has_index_file = 1; break; + case 'c': do_rcount = 1; break; case 'g': tflags = 
bam_str2flag(optarg); if (tflags < 0 || tflags > ((BAM_FSUPPLEMENTARY << 1) - 1)) { @@ -124,10 +133,11 @@ int main_bedcov(int argc, char *argv[]) fprintf(samtools_stderr, " -X use customized index files\n"); fprintf(samtools_stderr, " -g remove the specified flags from the set used to filter out reads\n"); fprintf(samtools_stderr, " -G add the specified flags to the set used to filter out reads\n" - " The default set is UNMAP,SECONDARY,QCFAIL,DUP or 0x704"); + " The default set is UNMAP,SECONDARY,QCFAIL,DUP or 0x704\n"); fprintf(samtools_stderr, " -j do not include deletions (D) and ref skips (N) in bedcov computation\n"); fprintf(samtools_stderr, " -d depth threshold. Number of reference bases with coverage above and" " including this value will be displayed in a separate column\n"); + fprintf(samtools_stderr, " -c add an additional column showing read count\n"); sam_global_opt_help(samtools_stderr, "-.--.--."); return 1; } @@ -170,8 +180,12 @@ int main_bedcov(int argc, char *argv[]) aux[i]->flags = flags; } cnt = calloc(n, sizeof(*cnt)); + if (min_depth >= 0) pcov = calloc(n, sizeof(*pcov)); - if (!cnt || (min_depth >= 0 && !pcov)) return 2; + if (!cnt || (min_depth >= 0 && !pcov)) { + print_error_errno("bedcov", "failed to allocate memory"); + return 2; + } fp = gzopen(argv[optind], "rb"); if (fp == NULL) { @@ -204,6 +218,7 @@ int main_bedcov(int argc, char *argv[]) for (i = 0; i < n; ++i) { if (aux[i]->iter) hts_itr_destroy(aux[i]->iter); aux[i]->iter = sam_itr_queryi(idx[i], tid, beg, end); + aux[i]->rcnt = 0; } mplp = bam_mplp_init(n, read_bam, (void**)aux); @@ -215,6 +230,9 @@ int main_bedcov(int argc, char *argv[]) memset(cnt, 0, sizeof(*cnt) * n); if (min_depth >= 0) memset(pcov, 0, sizeof(*pcov) * n); + if (do_rcount) + bam_mplp_constructor(mplp, incr_rcnt); + while ((ret = bam_mplp_auto(mplp, &tid, &pos, n_plp, plp)) > 0) if (pos >= beg && pos < end) { for (i = 0; i < n; ++i) { @@ -248,6 +266,12 @@ int main_bedcov(int argc, char *argv[]) kputl(pcov[i], 
&str); } } + if (do_rcount) { + for (i = 0; i < n; ++i) { + kputc('\t', &str); + kputl(aux[i]->rcnt, &str); + } + } samtools_puts(str.s); bam_mplp_destroy(mplp); continue; diff --git a/samtools/coverage.c b/samtools/coverage.c index 5204cd4..dedaa8e 100644 --- a/samtools/coverage.c +++ b/samtools/coverage.c @@ -55,8 +55,6 @@ DEALINGS IN THE SOFTWARE. */ #include "samtools.h" #include "sam_opts.h" -const char *VERSION = "0.1"; - typedef struct { // auxiliary data structure to hold stats on coverage unsigned long long n_covered_bases; unsigned long long summed_coverage; @@ -254,7 +252,7 @@ void print_hist(FILE *file_out, const sam_hdr_t *h, const stats_aux_t *stats, in fprintf(file_out, full_utf ? VERTICAL_LINE : "|"); fputc(' ', file_out); switch (i) { - case 9: fprintf(file_out, "Number of reads: %i", stats[tid].n_selected_reads); break; + case 9: fprintf(file_out, "Number of reads: %u", stats[tid].n_selected_reads); break; case 8: if (stats[tid].n_reads - stats[tid].n_selected_reads > 0) fprintf(file_out, " (%i filtered)", stats[tid].n_reads - stats[tid].n_selected_reads); break; case 7: fprintf(file_out, "Covered bases: %sbp", readable_bps(stats[tid].n_covered_bases, buf)); break; case 6: fprintf(file_out, "Percent covered: %.4g%%", diff --git a/samtools/coverage.c.pysam.c b/samtools/coverage.c.pysam.c index d5affdf..894f4ac 100644 --- a/samtools/coverage.c.pysam.c +++ b/samtools/coverage.c.pysam.c @@ -57,8 +57,6 @@ DEALINGS IN THE SOFTWARE. */ #include "samtools.h" #include "sam_opts.h" -const char *VERSION = "0.1"; - typedef struct { // auxiliary data structure to hold stats on coverage unsigned long long n_covered_bases; unsigned long long summed_coverage; @@ -256,7 +254,7 @@ void print_hist(FILE *file_out, const sam_hdr_t *h, const stats_aux_t *stats, in fprintf(file_out, full_utf ? 
VERTICAL_LINE : "|"); fputc(' ', file_out); switch (i) { - case 9: fprintf(file_out, "Number of reads: %i", stats[tid].n_selected_reads); break; + case 9: fprintf(file_out, "Number of reads: %u", stats[tid].n_selected_reads); break; case 8: if (stats[tid].n_reads - stats[tid].n_selected_reads > 0) fprintf(file_out, " (%i filtered)", stats[tid].n_reads - stats[tid].n_selected_reads); break; case 7: fprintf(file_out, "Covered bases: %sbp", readable_bps(stats[tid].n_covered_bases, buf)); break; case 6: fprintf(file_out, "Percent covered: %.4g%%", diff --git a/samtools/dict.c b/samtools/dict.c index 029d548..47cb842 100644 --- a/samtools/dict.c +++ b/samtools/dict.c @@ -29,16 +29,20 @@ DEALINGS IN THE SOFTWARE. */ #include #include #include +#include "htslib/khash.h" #include "htslib/kseq.h" #include "htslib/hts.h" +#include "samtools.h" +KHASH_SET_INIT_STR(str) KSEQ_INIT(gzFile, gzread) typedef struct _args_t { - char *output_fname, *fname; + char *output_fname, *alt_fname; char *assembly, *species, *uri; int alias, header; + khash_t(str) *is_alt; } args_t; @@ -53,14 +57,14 @@ static void write_dict(const char *fn, args_t *args) fp = strcmp(fn, "-") ? 
gzopen(fn, "r") : gzdopen(fileno(stdin), "r"); if (fp == 0) { - fprintf(stderr, "dict: %s: No such file or directory\n", fn); + print_error_errno("dict", "Cannot open %s", fn); exit(1); } FILE *out = stdout; if (args->output_fname) { out = fopen(args->output_fname, "w"); if (out == NULL) { - fprintf(stderr, "dict: %s: Cannot open file for writing\n", args->output_fname); + print_error_errno("dict", "Cannot open %s for writing", args->output_fname); exit(1); } } @@ -80,6 +84,8 @@ static void write_dict(const char *fn, args_t *args) hts_md5_final(digest, md5); hts_md5_hex(hex, digest); fprintf(out, "@SQ\tSN:%s\tLN:%d\tM5:%s", seq->name.s, k, hex); + if (args->is_alt && kh_get(str, args->is_alt, seq->name.s) != kh_end(args->is_alt)) + fprintf(out, "\tAH:*"); if (args->alias) { const char *name = seq->name.s; if (strncmp(name, "chr", 3) == 0) { @@ -116,6 +122,34 @@ static void write_dict(const char *fn, args_t *args) gzclose(fp); } +static void read_alt_file(khash_t(str) *is_alt, const char *fname) +{ + htsFile *fp = hts_open(fname, "r"); + if (fp == NULL) { + print_error_errno("dict", "Cannot open %s", fname); + exit(1); + } + + // .alt files are in a SAM-like format, but we don't use sam_read1() + // as these files may not have a complete set of @SQ headers. 
+ + kstring_t str = KS_INITIALIZE; + while (hts_getline(fp, KS_SEP_LINE, &str) >= 0) { + if (str.l == 0 || str.s[0] == '@') continue; + + char *tab = strchr(str.s, '\t'); + if (tab) *tab = '\0'; + + int ret; + char *seqname = strdup(str.s); + kh_put(str, is_alt, seqname, &ret); + if (ret == 0) free(seqname); // Already present + } + + ks_free(&str); + hts_close(fp); +} + static int dict_usage(void) { fprintf(stderr, "\n"); @@ -125,6 +159,7 @@ static int dict_usage(void) fprintf(stderr, " -A, --alias, --alternative-name\n"); fprintf(stderr, " add AN tag by adding/removing 'chr'\n"); fprintf(stderr, " -H, --no-header do not print @HD line\n"); + fprintf(stderr, " -l, --alt FILE add AH:* tag to alternate locus sequences\n"); fprintf(stderr, " -o, --output FILE file to write out dict file [stdout]\n"); fprintf(stderr, " -s, --species STR species\n"); fprintf(stderr, " -u, --uri STR URI [file:///abs/path/to/file.fa]\n"); @@ -142,6 +177,7 @@ int dict_main(int argc, char *argv[]) {"help", no_argument, NULL, 'h'}, {"no-header", no_argument, NULL, 'H'}, {"alias", no_argument, NULL, 'A'}, + {"alt", required_argument, NULL, 'l'}, {"alternative-name", no_argument, NULL, 'A'}, {"assembly", required_argument, NULL, 'a'}, {"species", required_argument, NULL, 's'}, @@ -150,12 +186,13 @@ int dict_main(int argc, char *argv[]) {NULL, 0, NULL, 0} }; int c; - while ( (c=getopt_long(argc,argv,"?AhHa:s:u:o:",loptions,NULL))>0 ) + while ( (c=getopt_long(argc,argv,"?AhHa:l:s:u:o:",loptions,NULL))>0 ) { switch (c) { case 'A': args->alias = 1; break; case 'a': args->assembly = optarg; break; + case 'l': args->alt_fname = optarg; break; case 's': args->species = optarg; break; case 'u': args->uri = optarg; break; case 'o': args->output_fname = optarg; break; @@ -173,7 +210,20 @@ int dict_main(int argc, char *argv[]) } else fname = argv[optind]; + if (args->alt_fname) { + args->is_alt = kh_init(str); + read_alt_file(args->is_alt, args->alt_fname); + } + write_dict(fname, args); + + if 
(args->is_alt) { + khint_t k; + for (k = 0; k < kh_end(args->is_alt); ++k) + if (kh_exist(args->is_alt, k)) free((char *) kh_key(args->is_alt, k)); + kh_destroy(str, args->is_alt); + } + free(args); return 0; } diff --git a/samtools/dict.c.pysam.c b/samtools/dict.c.pysam.c index ca54c48..6b2f2ba 100644 --- a/samtools/dict.c.pysam.c +++ b/samtools/dict.c.pysam.c @@ -31,16 +31,20 @@ DEALINGS IN THE SOFTWARE. */ #include #include #include +#include "htslib/khash.h" #include "htslib/kseq.h" #include "htslib/hts.h" +#include "samtools.h" +KHASH_SET_INIT_STR(str) KSEQ_INIT(gzFile, gzread) typedef struct _args_t { - char *output_fname, *fname; + char *output_fname, *alt_fname; char *assembly, *species, *uri; int alias, header; + khash_t(str) *is_alt; } args_t; @@ -55,14 +59,14 @@ static void write_dict(const char *fn, args_t *args) fp = strcmp(fn, "-") ? gzopen(fn, "r") : gzdopen(fileno(stdin), "r"); if (fp == 0) { - fprintf(samtools_stderr, "dict: %s: No such file or directory\n", fn); + print_error_errno("dict", "Cannot open %s", fn); samtools_exit(1); } FILE *out = samtools_stdout; if (args->output_fname) { out = fopen(args->output_fname, "w"); if (out == NULL) { - fprintf(samtools_stderr, "dict: %s: Cannot open file for writing\n", args->output_fname); + print_error_errno("dict", "Cannot open %s for writing", args->output_fname); samtools_exit(1); } } @@ -82,6 +86,8 @@ static void write_dict(const char *fn, args_t *args) hts_md5_final(digest, md5); hts_md5_hex(hex, digest); fprintf(out, "@SQ\tSN:%s\tLN:%d\tM5:%s", seq->name.s, k, hex); + if (args->is_alt && kh_get(str, args->is_alt, seq->name.s) != kh_end(args->is_alt)) + fprintf(out, "\tAH:*"); if (args->alias) { const char *name = seq->name.s; if (strncmp(name, "chr", 3) == 0) { @@ -118,6 +124,34 @@ static void write_dict(const char *fn, args_t *args) gzclose(fp); } +static void read_alt_file(khash_t(str) *is_alt, const char *fname) +{ + htsFile *fp = hts_open(fname, "r"); + if (fp == NULL) { + 
print_error_errno("dict", "Cannot open %s", fname); + samtools_exit(1); + } + + // .alt files are in a SAM-like format, but we don't use sam_read1() + // as these files may not have a complete set of @SQ headers. + + kstring_t str = KS_INITIALIZE; + while (hts_getline(fp, KS_SEP_LINE, &str) >= 0) { + if (str.l == 0 || str.s[0] == '@') continue; + + char *tab = strchr(str.s, '\t'); + if (tab) *tab = '\0'; + + int ret; + char *seqname = strdup(str.s); + kh_put(str, is_alt, seqname, &ret); + if (ret == 0) free(seqname); // Already present + } + + ks_free(&str); + hts_close(fp); +} + static int dict_usage(void) { fprintf(samtools_stderr, "\n"); @@ -127,6 +161,7 @@ static int dict_usage(void) fprintf(samtools_stderr, " -A, --alias, --alternative-name\n"); fprintf(samtools_stderr, " add AN tag by adding/removing 'chr'\n"); fprintf(samtools_stderr, " -H, --no-header do not print @HD line\n"); + fprintf(samtools_stderr, " -l, --alt FILE add AH:* tag to alternate locus sequences\n"); fprintf(samtools_stderr, " -o, --output FILE file to write out dict file [samtools_stdout]\n"); fprintf(samtools_stderr, " -s, --species STR species\n"); fprintf(samtools_stderr, " -u, --uri STR URI [file:///abs/path/to/file.fa]\n"); @@ -144,6 +179,7 @@ int dict_main(int argc, char *argv[]) {"help", no_argument, NULL, 'h'}, {"no-header", no_argument, NULL, 'H'}, {"alias", no_argument, NULL, 'A'}, + {"alt", required_argument, NULL, 'l'}, {"alternative-name", no_argument, NULL, 'A'}, {"assembly", required_argument, NULL, 'a'}, {"species", required_argument, NULL, 's'}, @@ -152,12 +188,13 @@ int dict_main(int argc, char *argv[]) {NULL, 0, NULL, 0} }; int c; - while ( (c=getopt_long(argc,argv,"?AhHa:s:u:o:",loptions,NULL))>0 ) + while ( (c=getopt_long(argc,argv,"?AhHa:l:s:u:o:",loptions,NULL))>0 ) { switch (c) { case 'A': args->alias = 1; break; case 'a': args->assembly = optarg; break; + case 'l': args->alt_fname = optarg; break; case 's': args->species = optarg; break; case 'u': args->uri = 
optarg; break; case 'o': args->output_fname = optarg; break; @@ -175,7 +212,20 @@ int dict_main(int argc, char *argv[]) } else fname = argv[optind]; + if (args->alt_fname) { + args->is_alt = kh_init(str); + read_alt_file(args->is_alt, args->alt_fname); + } + write_dict(fname, args); + + if (args->is_alt) { + khint_t k; + for (k = 0; k < kh_end(args->is_alt); ++k) + if (kh_exist(args->is_alt, k)) free((char *) kh_key(args->is_alt, k)); + kh_destroy(str, args->is_alt); + } + free(args); return 0; } diff --git a/samtools/reference.c b/samtools/reference.c new file mode 100644 index 0000000..2d95d64 --- /dev/null +++ b/samtools/reference.c @@ -0,0 +1,598 @@ +/* bam_reference.c -- extracts an embedded reference from a CRAM file, + or creates it from alignments plus MD:Z tags. + + Copyright (C) 2022 Genome Research Ltd. + + Author: James Bonfield + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. 
*/ + +#include + +#include +#include +#include +#include +#include + +#include "htslib/sam.h" +#include "htslib/cram.h" +#include "samtools.h" +#include "sam_opts.h" + + +/* + * There are two main modes of operation. + * + * 1. Extracting the reference from the CRAM file embed_ref blocks. + * 2. Generation of reference by analysing consensus plus patches applied + * via MD tags. + * + * The first is very rapid, but only applies to a CRAM files generated with + * the specific options (not commonly used and not the default). The second + * is a slow operation, but applies to any data type. + * + * This is also a testing ground for a future CRAM auto-embed-ref option that + * permits the use of an embedded reference without having to first extract + * the reference. (Note this may require the creation of MD tags during + * decode by use of an existing embedded reference, if the records don't + * have an MD tag themselves, but that's an issue for htslib when we get + * there.) + */ + +/* + * --------------------------------------------------------------------------- + * Shared utility functions by both methods. + */ + +#define haszero(x) (((x)-0x0101010101010101UL)&~(x)&0x8080808080808080UL) +#define MIN(a,b) ((a)<(b)?(a):(b)) +static int dump_ref(sam_hdr_t *h, hts_itr_t *iter, int ref_id, + char *ref, uint64_t ref_len, FILE *fp, int verbose) { + int N = 0; + if (iter && iter->end >= HTS_POS_MAX) + iter->end = ref_len; + if (iter && (iter->beg > 0 || iter->end < ref_len)) { + fprintf(fp, ">%s:%"PRIhts_pos"-%"PRIhts_pos"\n", + sam_hdr_tid2name(h, ref_id), iter->beg+1, iter->end); + ref += iter->beg; + ref_len = MIN(ref_len, iter->end) - iter->beg; + } else { + fprintf(fp, ">%s\n", sam_hdr_tid2name(h, ref_id)); + } + + int i, j; + uint64_t rem = ref_len; + + // Count coverage, purely for information purposes. + // About 90% of dump_ref CPU is here, so maybe this isn't useful, + // but this is still 3-4x faster than the obvious naive loop. 
+ // + // Overall though it's only about 5% overhead of the entire process + // (was ~20%). + if (verbose) { + int n4[8] = {0}; + for (j = 0; j < ref_len && (((uintptr_t) &ref[j] & 7) != 0); j++) + N += ref[j] == 'N'; + uint64_t fast_end = ((ref_len - j) & ~7) + j; + for (; j < fast_end; j+=8) { + uint64_t i64 = *(uint64_t *)&ref[j]; + if (!haszero(i64 ^ 0x4e4e4e4e4e4e4e4eUL)) // 'N' <-> 0 + continue; + + n4[0] += ref[j+0] == 'N'; + n4[1] += ref[j+1] == 'N'; + n4[2] += ref[j+2] == 'N'; + n4[3] += ref[j+3] == 'N'; + n4[4] += ref[j+4] == 'N'; + n4[5] += ref[j+5] == 'N'; + n4[6] += ref[j+6] == 'N'; + n4[7] += ref[j+7] == 'N'; + } + for (; j < ref_len; j++) + N += ref[j] == 'N'; + N += n4[0]+n4[1]+n4[2]+n4[3]+ + n4[4]+n4[5]+n4[6]+n4[7]; + } + + // Format reference + for (i = 0; i < ref_len; i += 60, rem -= 60) { + int len = (int)(rem < 60 ? rem : 60); + if (fwrite(ref, 1, len, fp) != len) + return -1; + putc('\n', fp); + ref += 60; + } + + if (verbose) + fprintf(stderr, "Dump ref %d len %"PRId64", coverage %.2f%%\n", + ref_id, ref_len, 100 - N*100.0 / ref_len); + + return 0; +} + +/* + * --------------------------------------------------------------------------- + * CRAM embedded reference method of reference construction + */ + +/* + * Extracts an embedded reference from a sorted CRAM file. + * Modelled on the CRAM container copy loop from bam_cat.c. + */ +static int cram2ref(samFile *in, sam_hdr_t *h, hts_idx_t *idx, char *reg, + FILE *outfp, int verbose) { + cram_fd *in_c; + cram_container *c = NULL; + cram_block *blk = NULL; + cram_block_slice_hdr *shdr = NULL; + + int curr_ref_id = -99; + char *ref = NULL; + uint64_t ref_len = 0; + + // We have no direct public API for seeking in CRAM to a specific + // location by genome coordinates. The sam_itr_query API is + // designed for fetching records, rather than seeks to specific + // file locations. + // + // TODO: consider exposing cram_range and cram_seek_to_refpos API. 
+ // After a sam_index_load which will add the index to infp, these + // functions should seek direct to the start of a container. + // Or use cram_index *e =cram_index_query(cram, tid, beg, NULL); + // + // However, fortuitously(?) sam_itr_querys calls cram_seek_to_refpos + // so we can do a region query and let that do the initial seek. + // We still need to do our own end-range detection though. + + hts_itr_t *iter = NULL; + if (reg) { + iter = sam_itr_querys(idx, h, reg); + if (!iter) { + print_error("reference", "failed to parse region '%s'", reg); + goto err; + } + } + + in_c = in->fp.cram; // low level htslib abuse? + int eor = 0; + while (!eor && (c = cram_read_container(in_c))) { + if (cram_container_is_empty(in_c)) { + cram_block *blk; + // Container compression header + if (!(blk = cram_read_block(in_c))) + goto err; + cram_free_block(blk); + cram_free_container(c); + c = NULL; blk = NULL; + continue; + } + + // Container compression header; read and discard + int32_t num_slices; + if (!(blk = cram_read_block(in_c))) + goto err; + cram_free_block(blk); + blk = NULL; + + // Container num_blocks can be invalid, due to a bug. + // Instead we iterate in slice context instead. + (void)cram_container_get_landmarks(c, &num_slices); + int i, j; + for (i = 0; i < num_slices; i++) { + // Slice header + if (!(blk = cram_read_block(in_c))) + goto err; + if (!(shdr = cram_decode_slice_header(in_c, blk))) + goto err; + cram_free_block(blk); + blk = NULL; + + int num_blocks = cram_slice_hdr_get_num_blocks(shdr); + int embed_id = cram_slice_hdr_get_embed_ref_id(shdr); + int ref_id; + hts_pos_t ref_start, ref_span; + cram_slice_hdr_get_coords(shdr, &ref_id, &ref_start, &ref_span); + + if (iter) { + if (iter->tid != ref_id || ref_start > iter->end) { + // Beyond end of specified region. 
+ cram_free_slice_header(shdr); + eor = 1; + break; + } + } + + if (embed_id < 0 && ref_id != -1) { + fprintf(stderr, "CRAM file has slice without embedded " + "reference\n"); + goto err; + } + + if (ref_id != curr_ref_id) { + if (curr_ref_id >= 0) { + if (dump_ref(h, iter, curr_ref_id, ref, ref_len, + outfp, verbose) < 0) + goto err; + } + + ref_len = sam_hdr_tid2len(h, ref_id); + if (ref_len) { + char *ref2 = realloc(ref, ref_len); + if (!ref2) + goto err; + else + ref = ref2; + memset(ref, 'N', ref_len); + } + curr_ref_id = ref_id; + } + + // Slice data blocks + for (j = 0; j < num_blocks; j++) { + // read and discard, unless it's the ref-ID block + if (!(blk = cram_read_block(in_c))) + goto err; + if (cram_block_get_content_id(blk) == embed_id) { + cram_uncompress_block(blk); + //printf("%.*s\n", blk->uncomp_size, blk->data); + + int32_t usize = cram_block_get_uncomp_size(blk); + int ref_end = ref_start + usize; + if (ref_end > ref_len+1) + ref_end = ref_len+1; + if (ref_end > ref_start) + memcpy(ref + ref_start-1, cram_block_get_data(blk), + ref_end - ref_start); + } + cram_free_block(blk); + blk = NULL; + } + cram_free_slice_header(shdr); + shdr = NULL; + } + + cram_free_container(c); + c = NULL; + } + + int ret = 0; + if (curr_ref_id >= 0) { + ret = dump_ref(h, iter, curr_ref_id, ref, ref_len, outfp, verbose); + } else if (reg) { + // no data present + // no data present, but we explicitly asked for the reference so + // report it still as Ns. 
+ ref_len = MIN(iter->end, sam_hdr_tid2len(h, iter->tid)); + ref = malloc(ref_len); + memset(ref, 'N', ref_len); + if (!ref) + goto err; + ret = dump_ref(h, iter, iter->tid, ref, ref_len, outfp, verbose); + } + + free(ref); + if (iter) + hts_itr_destroy(iter); + + return ret; + + err: + free(ref); + if (blk) + cram_free_block(blk); + if (shdr) + cram_free_slice_header(shdr); + if (c) + cram_free_container(c); + if (iter) + hts_itr_destroy(iter); + + return -1; +} + +/* + * --------------------------------------------------------------------------- + * MD method of reference construction + */ + +// Returns the next cigar op code: one of the BAM_C* codes, +// or -1 if no more are present. +static inline +int next_cigar_op(uint32_t *cigar, int *ncigar, int *skip, int *spos, + uint32_t *cig_ind, uint32_t *cig_op, uint32_t *cig_len) { + for(;;) { + while (*cig_len == 0) { + if (*cig_ind < *ncigar) { + *cig_op = cigar[*cig_ind] & BAM_CIGAR_MASK; + *cig_len = cigar[*cig_ind] >> BAM_CIGAR_SHIFT; + (*cig_ind)++; + } else { + return -1; + } + } + + if (skip[*cig_op]) { + *spos += (bam_cigar_type(*cig_op)&1) * *cig_len; + *cig_len = 0; + continue; + } + + (*cig_len)--; + break; + } + + return *cig_op; +} + +// Converts a bam object with SEQ, POS/CIGAR and MD:Z to a reference. +// Updates ref[] array. 
+// +// Returns >0 on success, +// 0 on no-MD found, +// -1 on failure (eg inconsistent data) +static int build_ref(bam1_t *b, char *ref, size_t ref_len) { + uint8_t *seq = bam_get_seq(b); + uint32_t *cigar = bam_get_cigar(b); + int ncigar = b->core.n_cigar; + uint32_t cig_op = 0, cig_len = 0, cig_ind = 0; + + const uint8_t *MD = bam_aux_get(b, "MD"); + if (!MD || *MD != 'Z') + return 0; + MD++; + + // Walk through MD + seq to generate ref + int iseq = 0, iref = b->core.pos, next_op; + int cig_skip[16] = {0,1,0,1,1,1,1,0,0,1,1,1,1,1,1,1}; + while (iseq < b->core.l_qseq && *MD) { + if (isdigit(*MD)) { + // match + int len = strtol((char *)MD, (char **)&MD, 10); + while (iseq < b->core.l_qseq && len) { + if ((next_op = next_cigar_op(cigar, &ncigar, cig_skip, + &iseq, &cig_ind, &cig_op, + &cig_len)) < 0) + return -1; + + if (next_op != BAM_CMATCH && + next_op != BAM_CEQUAL) { + print_error("MD2ref", + "MD:Z and CIGAR are incompatible"); + return -1; + } + + if (iref < ref_len) + ref[iref] = seq_nt16_str[bam_seqi(seq, iseq)]; + iseq++; + iref++; + len--; + } + } else if (*MD == '^') { + // deletion + MD++; + while (*MD && isalpha(*MD)) { + if ((next_op = next_cigar_op(cigar, &ncigar, cig_skip, + &iseq, &cig_ind, &cig_op, + &cig_len)) < 0) + return -1; + + if (next_op != BAM_CDEL) { + print_error("MD2ref", + "MD:Z and CIGAR are incompatible"); + return -1; + } + + if (iref < ref_len) + ref[iref] = *MD; + + MD++; + iref++; + } + } else { + // substitution + if ((next_op = next_cigar_op(cigar, &ncigar, cig_skip, + &iseq, &cig_ind, &cig_op, + &cig_len)) < 0) + return -1; + + if (next_op != BAM_CMATCH && next_op != BAM_CDIFF) { + print_error("MD2ref", "MD:Z and CIGAR are incompatible"); + return -1; + } + if (iref < ref_len) + ref[iref] = *MD; + + MD++; + iref++; + iseq++; + } + } + + return 1; +} + +static int MD2ref(samFile *in, sam_hdr_t *h, hts_idx_t *idx, char *reg, + FILE *outfp, int verbose) { + bam1_t *b = bam_init1(); + int r, last_tid = -99; + size_t ref_len = 0; 
+ char *ref = NULL; + int ret = -1; + + hts_itr_t *iter = NULL; + if (idx && reg) { + iter = sam_itr_querys(idx, h, reg); + if (!iter) { + print_error("reference", "failed to parse region '%s'", reg); + goto err; + } + } + + while ((r = iter + ? sam_itr_next(in, iter, b) + : sam_read1(in, h, b)) >= 0) { + // check b->core.tid and flush old seq. + if (b->core.tid != last_tid) { + if (last_tid >= 0) + if (dump_ref(h, iter, last_tid, ref, ref_len, outfp, + verbose) < 0) + goto err; + + last_tid = b->core.tid; + ref_len = sam_hdr_tid2len(h, last_tid); + if (ref_len) { + char *ref2 = realloc(ref, ref_len); + if (!ref2) + goto err; + else + ref = ref2; + memset(ref, 'N', ref_len); + } + } + + if (build_ref(b, ref, ref_len) < 0) + goto err; + } + + if (last_tid >= 0) { + if (dump_ref(h, iter, last_tid, ref, ref_len, outfp, verbose) < 0) + goto err; + } else if (reg) { + // no data present, but we explicitly asked for the reference so + // report it still as Ns. + ref_len = MIN(iter->end, sam_hdr_tid2len(h, iter->tid)); + ref = malloc(ref_len); + memset(ref, 'N', ref_len); + if (!ref) + goto err; + if (dump_ref(h, iter, iter->tid, ref, ref_len, outfp, verbose) < 0) + goto err; + } + + if (r < -1) + goto err; + + ret = 0; + + err: + if (iter) + hts_itr_destroy(iter); + bam_destroy1(b); + free(ref); + return ret; +} + +int main_reference(int argc, char *argv[]) +{ + int c, usage = 0, verbose = 1, use_embedded = 0; + sam_hdr_t *h = 0; + samFile *in = NULL; + hts_idx_t *idx = NULL; + sam_global_args ga; + FILE *outfp = stdout; + char *reg = NULL; + + static const struct option lopts[] = { + {"output", required_argument, NULL, 'o'}, + {"quiet", no_argument, NULL, 'q'}, + {"embedded", no_argument, NULL, 'e'}, + {"region", required_argument, NULL, 'r'}, + SAM_OPT_GLOBAL_OPTIONS('-', '-', '-', '-', '-', '@'), + { NULL, 0, NULL, 0 } + }; + + sam_global_args_init(&ga); + + while ((c = getopt_long(argc, argv, "@:qo:er:", lopts, NULL)) >= 0) { + switch (c) { + case 'o': + if (!(outfp 
= fopen(optarg, "w"))) { + perror(optarg); + goto err; + } + break; + + case 'q': + verbose = 0; + break; + + case 'e': + use_embedded = 1; + break; + + case 'r': + reg = optarg; + break; + + default: + if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; + /* else fall-through */ + case '?': usage=1; break; + } + } + + if ((optind == argc && isatty(0)) || usage) { + printf("Usage: samtools reference [-@ N] [-r region] [-e] [-q] [-o out.fa] [in.cram]\n"); + return 0; + } + + char *fn = optind < argc ? argv[optind] : "-"; + if (!(in = sam_open(fn, "r"))) { + print_error_errno("reference", "failed to open file '%s'", fn); + return 1; + } + + if (ga.nthreads > 0) + hts_set_threads(in, ga.nthreads); + + if (!(h = sam_hdr_read(in))) + goto err; + + if (reg) { + idx = sam_index_load(in, fn); + if (!idx) { + print_error_errno("reference", "Failed to load the index"); + goto err; + } + } + + int ret = use_embedded + ? cram2ref(in, h, idx, reg, outfp, verbose) + : MD2ref(in, h, idx, reg, outfp, verbose); + + sam_hdr_destroy(h); + if (outfp != stdout) + fclose(outfp); + if (idx) + hts_idx_destroy(idx); + sam_close(in); + + return ret; + + err: + if (idx) + hts_idx_destroy(idx); + if (in) + sam_close(in); + if (h) + sam_hdr_destroy(h); + + return 1; +} diff --git a/samtools/reference.c.pysam.c b/samtools/reference.c.pysam.c new file mode 100644 index 0000000..fe8025f --- /dev/null +++ b/samtools/reference.c.pysam.c @@ -0,0 +1,600 @@ +#include "samtools.pysam.h" + +/* bam_reference.c -- extracts an embedded reference from a CRAM file, + or creates it from alignments plus MD:Z tags. + + Copyright (C) 2022 Genome Research Ltd. 
+ + Author: James Bonfield + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. */ + +#include + +#include +#include +#include +#include +#include + +#include "htslib/sam.h" +#include "htslib/cram.h" +#include "samtools.h" +#include "sam_opts.h" + + +/* + * There are two main modes of operation. + * + * 1. Extracting the reference from the CRAM file embed_ref blocks. + * 2. Generation of reference by analysing consensus plus patches applied + * via MD tags. + * + * The first is very rapid, but only applies to a CRAM files generated with + * the specific options (not commonly used and not the default). The second + * is a slow operation, but applies to any data type. + * + * This is also a testing ground for a future CRAM auto-embed-ref option that + * permits the use of an embedded reference without having to first extract + * the reference. 
(Note this may require the creation of MD tags during + * decode by use of an existing embedded reference, if the records don't + * have an MD tag themselves, but that's an issue for htslib when we get + * there.) + */ + +/* + * --------------------------------------------------------------------------- + * Shared utility functions by both methods. + */ + +#define haszero(x) (((x)-0x0101010101010101UL)&~(x)&0x8080808080808080UL) +#define MIN(a,b) ((a)<(b)?(a):(b)) +static int dump_ref(sam_hdr_t *h, hts_itr_t *iter, int ref_id, + char *ref, uint64_t ref_len, FILE *fp, int verbose) { + int N = 0; + if (iter && iter->end >= HTS_POS_MAX) + iter->end = ref_len; + if (iter && (iter->beg > 0 || iter->end < ref_len)) { + fprintf(fp, ">%s:%"PRIhts_pos"-%"PRIhts_pos"\n", + sam_hdr_tid2name(h, ref_id), iter->beg+1, iter->end); + ref += iter->beg; + ref_len = MIN(ref_len, iter->end) - iter->beg; + } else { + fprintf(fp, ">%s\n", sam_hdr_tid2name(h, ref_id)); + } + + int i, j; + uint64_t rem = ref_len; + + // Count coverage, purely for information purposes. + // About 90% of dump_ref CPU is here, so maybe this isn't useful, + // but this is still 3-4x faster than the obvious naive loop. + // + // Overall though it's only about 5% overhead of the entire process + // (was ~20%). 
+ if (verbose) { + int n4[8] = {0}; + for (j = 0; j < ref_len && (((uintptr_t) &ref[j] & 7) != 0); j++) + N += ref[j] == 'N'; + uint64_t fast_end = ((ref_len - j) & ~7) + j; + for (; j < fast_end; j+=8) { + uint64_t i64 = *(uint64_t *)&ref[j]; + if (!haszero(i64 ^ 0x4e4e4e4e4e4e4e4eUL)) // 'N' <-> 0 + continue; + + n4[0] += ref[j+0] == 'N'; + n4[1] += ref[j+1] == 'N'; + n4[2] += ref[j+2] == 'N'; + n4[3] += ref[j+3] == 'N'; + n4[4] += ref[j+4] == 'N'; + n4[5] += ref[j+5] == 'N'; + n4[6] += ref[j+6] == 'N'; + n4[7] += ref[j+7] == 'N'; + } + for (; j < ref_len; j++) + N += ref[j] == 'N'; + N += n4[0]+n4[1]+n4[2]+n4[3]+ + n4[4]+n4[5]+n4[6]+n4[7]; + } + + // Format reference + for (i = 0; i < ref_len; i += 60, rem -= 60) { + int len = (int)(rem < 60 ? rem : 60); + if (fwrite(ref, 1, len, fp) != len) + return -1; + putc('\n', fp); + ref += 60; + } + + if (verbose) + fprintf(samtools_stderr, "Dump ref %d len %"PRId64", coverage %.2f%%\n", + ref_id, ref_len, 100 - N*100.0 / ref_len); + + return 0; +} + +/* + * --------------------------------------------------------------------------- + * CRAM embedded reference method of reference construction + */ + +/* + * Extracts an embedded reference from a sorted CRAM file. + * Modelled on the CRAM container copy loop from bam_cat.c. + */ +static int cram2ref(samFile *in, sam_hdr_t *h, hts_idx_t *idx, char *reg, + FILE *outfp, int verbose) { + cram_fd *in_c; + cram_container *c = NULL; + cram_block *blk = NULL; + cram_block_slice_hdr *shdr = NULL; + + int curr_ref_id = -99; + char *ref = NULL; + uint64_t ref_len = 0; + + // We have no direct public API for seeking in CRAM to a specific + // location by genome coordinates. The sam_itr_query API is + // designed for fetching records, rather than seeks to specific + // file locations. + // + // TODO: consider exposing cram_range and cram_seek_to_refpos API. + // After a sam_index_load which will add the index to infp, these + // functions should seek direct to the start of a container. 
+ // Or use cram_index *e =cram_index_query(cram, tid, beg, NULL); + // + // However, fortuitously(?) sam_itr_querys calls cram_seek_to_refpos + // so we can do a region query and let that do the initial seek. + // We still need to do our own end-range detection though. + + hts_itr_t *iter = NULL; + if (reg) { + iter = sam_itr_querys(idx, h, reg); + if (!iter) { + print_error("reference", "failed to parse region '%s'", reg); + goto err; + } + } + + in_c = in->fp.cram; // low level htslib abuse? + int eor = 0; + while (!eor && (c = cram_read_container(in_c))) { + if (cram_container_is_empty(in_c)) { + cram_block *blk; + // Container compression header + if (!(blk = cram_read_block(in_c))) + goto err; + cram_free_block(blk); + cram_free_container(c); + c = NULL; blk = NULL; + continue; + } + + // Container compression header; read and discard + int32_t num_slices; + if (!(blk = cram_read_block(in_c))) + goto err; + cram_free_block(blk); + blk = NULL; + + // Container num_blocks can be invalid, due to a bug. + // Instead we iterate in slice context instead. + (void)cram_container_get_landmarks(c, &num_slices); + int i, j; + for (i = 0; i < num_slices; i++) { + // Slice header + if (!(blk = cram_read_block(in_c))) + goto err; + if (!(shdr = cram_decode_slice_header(in_c, blk))) + goto err; + cram_free_block(blk); + blk = NULL; + + int num_blocks = cram_slice_hdr_get_num_blocks(shdr); + int embed_id = cram_slice_hdr_get_embed_ref_id(shdr); + int ref_id; + hts_pos_t ref_start, ref_span; + cram_slice_hdr_get_coords(shdr, &ref_id, &ref_start, &ref_span); + + if (iter) { + if (iter->tid != ref_id || ref_start > iter->end) { + // Beyond end of specified region. 
+ cram_free_slice_header(shdr); + eor = 1; + break; + } + } + + if (embed_id < 0 && ref_id != -1) { + fprintf(samtools_stderr, "CRAM file has slice without embedded " + "reference\n"); + goto err; + } + + if (ref_id != curr_ref_id) { + if (curr_ref_id >= 0) { + if (dump_ref(h, iter, curr_ref_id, ref, ref_len, + outfp, verbose) < 0) + goto err; + } + + ref_len = sam_hdr_tid2len(h, ref_id); + if (ref_len) { + char *ref2 = realloc(ref, ref_len); + if (!ref2) + goto err; + else + ref = ref2; + memset(ref, 'N', ref_len); + } + curr_ref_id = ref_id; + } + + // Slice data blocks + for (j = 0; j < num_blocks; j++) { + // read and discard, unless it's the ref-ID block + if (!(blk = cram_read_block(in_c))) + goto err; + if (cram_block_get_content_id(blk) == embed_id) { + cram_uncompress_block(blk); + //printf("%.*s\n", blk->uncomp_size, blk->data); + + int32_t usize = cram_block_get_uncomp_size(blk); + int ref_end = ref_start + usize; + if (ref_end > ref_len+1) + ref_end = ref_len+1; + if (ref_end > ref_start) + memcpy(ref + ref_start-1, cram_block_get_data(blk), + ref_end - ref_start); + } + cram_free_block(blk); + blk = NULL; + } + cram_free_slice_header(shdr); + shdr = NULL; + } + + cram_free_container(c); + c = NULL; + } + + int ret = 0; + if (curr_ref_id >= 0) { + ret = dump_ref(h, iter, curr_ref_id, ref, ref_len, outfp, verbose); + } else if (reg) { + // no data present + // no data present, but we explicitly asked for the reference so + // report it still as Ns. 
+ ref_len = MIN(iter->end, sam_hdr_tid2len(h, iter->tid)); + ref = malloc(ref_len); + memset(ref, 'N', ref_len); + if (!ref) + goto err; + ret = dump_ref(h, iter, iter->tid, ref, ref_len, outfp, verbose); + } + + free(ref); + if (iter) + hts_itr_destroy(iter); + + return ret; + + err: + free(ref); + if (blk) + cram_free_block(blk); + if (shdr) + cram_free_slice_header(shdr); + if (c) + cram_free_container(c); + if (iter) + hts_itr_destroy(iter); + + return -1; +} + +/* + * --------------------------------------------------------------------------- + * MD method of reference construction + */ + +// Returns the next cigar op code: one of the BAM_C* codes, +// or -1 if no more are present. +static inline +int next_cigar_op(uint32_t *cigar, int *ncigar, int *skip, int *spos, + uint32_t *cig_ind, uint32_t *cig_op, uint32_t *cig_len) { + for(;;) { + while (*cig_len == 0) { + if (*cig_ind < *ncigar) { + *cig_op = cigar[*cig_ind] & BAM_CIGAR_MASK; + *cig_len = cigar[*cig_ind] >> BAM_CIGAR_SHIFT; + (*cig_ind)++; + } else { + return -1; + } + } + + if (skip[*cig_op]) { + *spos += (bam_cigar_type(*cig_op)&1) * *cig_len; + *cig_len = 0; + continue; + } + + (*cig_len)--; + break; + } + + return *cig_op; +} + +// Converts a bam object with SEQ, POS/CIGAR and MD:Z to a reference. +// Updates ref[] array. 
+// +// Returns >0 on success, +// 0 on no-MD found, +// -1 on failure (eg inconsistent data) +static int build_ref(bam1_t *b, char *ref, size_t ref_len) { + uint8_t *seq = bam_get_seq(b); + uint32_t *cigar = bam_get_cigar(b); + int ncigar = b->core.n_cigar; + uint32_t cig_op = 0, cig_len = 0, cig_ind = 0; + + const uint8_t *MD = bam_aux_get(b, "MD"); + if (!MD || *MD != 'Z') + return 0; + MD++; + + // Walk through MD + seq to generate ref + int iseq = 0, iref = b->core.pos, next_op; + int cig_skip[16] = {0,1,0,1,1,1,1,0,0,1,1,1,1,1,1,1}; + while (iseq < b->core.l_qseq && *MD) { + if (isdigit(*MD)) { + // match + int len = strtol((char *)MD, (char **)&MD, 10); + while (iseq < b->core.l_qseq && len) { + if ((next_op = next_cigar_op(cigar, &ncigar, cig_skip, + &iseq, &cig_ind, &cig_op, + &cig_len)) < 0) + return -1; + + if (next_op != BAM_CMATCH && + next_op != BAM_CEQUAL) { + print_error("MD2ref", + "MD:Z and CIGAR are incompatible"); + return -1; + } + + if (iref < ref_len) + ref[iref] = seq_nt16_str[bam_seqi(seq, iseq)]; + iseq++; + iref++; + len--; + } + } else if (*MD == '^') { + // deletion + MD++; + while (*MD && isalpha(*MD)) { + if ((next_op = next_cigar_op(cigar, &ncigar, cig_skip, + &iseq, &cig_ind, &cig_op, + &cig_len)) < 0) + return -1; + + if (next_op != BAM_CDEL) { + print_error("MD2ref", + "MD:Z and CIGAR are incompatible"); + return -1; + } + + if (iref < ref_len) + ref[iref] = *MD; + + MD++; + iref++; + } + } else { + // substitution + if ((next_op = next_cigar_op(cigar, &ncigar, cig_skip, + &iseq, &cig_ind, &cig_op, + &cig_len)) < 0) + return -1; + + if (next_op != BAM_CMATCH && next_op != BAM_CDIFF) { + print_error("MD2ref", "MD:Z and CIGAR are incompatible"); + return -1; + } + if (iref < ref_len) + ref[iref] = *MD; + + MD++; + iref++; + iseq++; + } + } + + return 1; +} + +static int MD2ref(samFile *in, sam_hdr_t *h, hts_idx_t *idx, char *reg, + FILE *outfp, int verbose) { + bam1_t *b = bam_init1(); + int r, last_tid = -99; + size_t ref_len = 0; 
+ char *ref = NULL; + int ret = -1; + + hts_itr_t *iter = NULL; + if (idx && reg) { + iter = sam_itr_querys(idx, h, reg); + if (!iter) { + print_error("reference", "failed to parse region '%s'", reg); + goto err; + } + } + + while ((r = iter + ? sam_itr_next(in, iter, b) + : sam_read1(in, h, b)) >= 0) { + // check b->core.tid and flush old seq. + if (b->core.tid != last_tid) { + if (last_tid >= 0) + if (dump_ref(h, iter, last_tid, ref, ref_len, outfp, + verbose) < 0) + goto err; + + last_tid = b->core.tid; + ref_len = sam_hdr_tid2len(h, last_tid); + if (ref_len) { + char *ref2 = realloc(ref, ref_len); + if (!ref2) + goto err; + else + ref = ref2; + memset(ref, 'N', ref_len); + } + } + + if (build_ref(b, ref, ref_len) < 0) + goto err; + } + + if (last_tid >= 0) { + if (dump_ref(h, iter, last_tid, ref, ref_len, outfp, verbose) < 0) + goto err; + } else if (reg) { + // no data present, but we explicitly asked for the reference so + // report it still as Ns. + ref_len = MIN(iter->end, sam_hdr_tid2len(h, iter->tid)); + ref = malloc(ref_len); + memset(ref, 'N', ref_len); + if (!ref) + goto err; + if (dump_ref(h, iter, iter->tid, ref, ref_len, outfp, verbose) < 0) + goto err; + } + + if (r < -1) + goto err; + + ret = 0; + + err: + if (iter) + hts_itr_destroy(iter); + bam_destroy1(b); + free(ref); + return ret; +} + +int main_reference(int argc, char *argv[]) +{ + int c, usage = 0, verbose = 1, use_embedded = 0; + sam_hdr_t *h = 0; + samFile *in = NULL; + hts_idx_t *idx = NULL; + sam_global_args ga; + FILE *outfp = samtools_stdout; + char *reg = NULL; + + static const struct option lopts[] = { + {"output", required_argument, NULL, 'o'}, + {"quiet", no_argument, NULL, 'q'}, + {"embedded", no_argument, NULL, 'e'}, + {"region", required_argument, NULL, 'r'}, + SAM_OPT_GLOBAL_OPTIONS('-', '-', '-', '-', '-', '@'), + { NULL, 0, NULL, 0 } + }; + + sam_global_args_init(&ga); + + while ((c = getopt_long(argc, argv, "@:qo:er:", lopts, NULL)) >= 0) { + switch (c) { + case 'o': + if 
(!(outfp = fopen(optarg, "w"))) { + perror(optarg); + goto err; + } + break; + + case 'q': + verbose = 0; + break; + + case 'e': + use_embedded = 1; + break; + + case 'r': + reg = optarg; + break; + + default: + if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; + /* else fall-through */ + case '?': usage=1; break; + } + } + + if ((optind == argc && isatty(0)) || usage) { + fprintf(samtools_stdout, "Usage: samtools reference [-@ N] [-r region] [-e] [-q] [-o out.fa] [in.cram]\n"); + return 0; + } + + char *fn = optind < argc ? argv[optind] : "-"; + if (!(in = sam_open(fn, "r"))) { + print_error_errno("reference", "failed to open file '%s'", fn); + return 1; + } + + if (ga.nthreads > 0) + hts_set_threads(in, ga.nthreads); + + if (!(h = sam_hdr_read(in))) + goto err; + + if (reg) { + idx = sam_index_load(in, fn); + if (!idx) { + print_error_errno("reference", "Failed to load the index"); + goto err; + } + } + + int ret = use_embedded + ? cram2ref(in, h, idx, reg, outfp, verbose) + : MD2ref(in, h, idx, reg, outfp, verbose); + + sam_hdr_destroy(h); + if (outfp != samtools_stdout) + fclose(outfp); + if (idx) + hts_idx_destroy(idx); + sam_close(in); + + return ret; + + err: + if (idx) + hts_idx_destroy(idx); + if (in) + sam_close(in); + if (h) + sam_hdr_destroy(h); + + return 1; +} diff --git a/samtools/sam_view.c b/samtools/sam_view.c index c4d65d2..d60d8f7 100644 --- a/samtools/sam_view.c +++ b/samtools/sam_view.c @@ -705,6 +705,17 @@ static inline int process_one_record(samview_settings_t *conf, bam1_t *b, conf->count++; } else if (conf->unmap) { b->core.flag |= BAM_FUNMAP; + b->core.qual = 0; + b->core.isize = 0; + + // remove CIGAR + if (b->core.n_cigar) { + memmove(bam_get_cigar(b), bam_get_seq(b), + b->data + b->l_data - bam_get_seq(b)); + b->l_data -= 4*b->core.n_cigar; + b->core.n_cigar = 0; + } + if (check_sam_write1(conf->out, conf->header, b, conf->fn_out, write_error) < 0) { return -1; @@ -727,6 +738,7 @@ static int stream_view(samview_settings_t 
*conf) { print_error_errno("view", "could not allocate bam record"); return 1; } + errno = 0; // prevent false error messages. while ((r = sam_read1(conf->in, conf->header, b)) >= 0) { if (process_one_record(conf, b, &write_error) < 0) break; } @@ -754,7 +766,7 @@ static int multi_region_view(samview_settings_t *conf, hts_itr_multi_t *iter) bam_destroy1(b); if (result < -1) { - print_error("view", "retrieval of region %d failed due to truncated file or corrupt BAM index file", iter->curr_tid); + print_error("view", "retrieval of region #%d failed", iter->curr_tid); return 1; } return write_error; diff --git a/samtools/sam_view.c.pysam.c b/samtools/sam_view.c.pysam.c index e768ec4..bb61059 100644 --- a/samtools/sam_view.c.pysam.c +++ b/samtools/sam_view.c.pysam.c @@ -707,6 +707,17 @@ static inline int process_one_record(samview_settings_t *conf, bam1_t *b, conf->count++; } else if (conf->unmap) { b->core.flag |= BAM_FUNMAP; + b->core.qual = 0; + b->core.isize = 0; + + // remove CIGAR + if (b->core.n_cigar) { + memmove(bam_get_cigar(b), bam_get_seq(b), + b->data + b->l_data - bam_get_seq(b)); + b->l_data -= 4*b->core.n_cigar; + b->core.n_cigar = 0; + } + if (check_sam_write1(conf->out, conf->header, b, conf->fn_out, write_error) < 0) { return -1; @@ -729,6 +740,7 @@ static int stream_view(samview_settings_t *conf) { print_error_errno("view", "could not allocate bam record"); return 1; } + errno = 0; // prevent false error messages. 
while ((r = sam_read1(conf->in, conf->header, b)) >= 0) { if (process_one_record(conf, b, &write_error) < 0) break; } @@ -756,7 +768,7 @@ static int multi_region_view(samview_settings_t *conf, hts_itr_multi_t *iter) bam_destroy1(b); if (result < -1) { - print_error("view", "retrieval of region %d failed due to truncated file or corrupt BAM index file", iter->curr_tid); + print_error("view", "retrieval of region #%d failed", iter->curr_tid); return 1; } return write_error; diff --git a/samtools/version.sh b/samtools/version.sh index e943440..5327353 100755 --- a/samtools/version.sh +++ b/samtools/version.sh @@ -24,7 +24,7 @@ # DEALINGS IN THE SOFTWARE. # Master version, for use in tarballs or non-git source copies -VERSION=1.15.1 +VERSION=1.16.1 # If we have a git clone, then check against the current tag if [ -e .git ] diff --git a/setup.py b/setup.py index 2a3a386..890b90a 100644 --- a/setup.py +++ b/setup.py @@ -70,6 +70,11 @@ def run_configure(option): return False +def run_make(targets): + sys.stdout.flush() + subprocess.check_call([os.environ.get("MAKE", "make")] + targets) + + def run_make_print_config(): stdout = subprocess.check_output(["make", "-s", "print-config"]) if IS_PYTHON3: @@ -93,13 +98,13 @@ def run_nm_defined_symbols(objfile): symbols = set() for line in stdout.splitlines(): (sym, symtype) = line.split()[:2] - if symtype not in "UFWw": + if symtype not in "UFNWw": if IS_DARWIN: # On macOS, all symbols have a leading underscore symbols.add(sym.lstrip('_')) else: # Ignore symbols such as _edata (present in all shared objects) - if not sym.startswith('_'): symbols.add(sym) + if sym[0] not in "_$.@": symbols.add(sym) return symbols @@ -132,7 +137,8 @@ def build_config_dict(ext): optionise('-D', kvtuples(ext.define_macros)) + optionise('-U', ext.undef_macros)) - cflags = " ".join(sc('CFLAGS') + env('CFLAGS') + ext.extra_compile_args) + cflags = " ".join(sc('CFLAGS') + env('CFLAGS') + sc('CCSHARED') + + ext.extra_compile_args) # distutils actually 
includes $CPPFLAGS here too, but that's weird and # unnecessary for us as we know the output LDFLAGS will be used correctly @@ -167,6 +173,8 @@ def set_compiler_envvars(): print("# pysam: (env) {}={}".format(var, os.environ[var])) elif var in sysconfig.get_config_vars(): value = sysconfig.get_config_var(var) + if var == 'CFLAGS' and 'CCSHARED' in sysconfig.get_config_vars(): + value += ' ' + sysconfig.get_config_var('CCSHARED') print("# pysam: (sysconfig) {}={}".format(var, value)) os.environ[var] = value tmp_vars += [var] @@ -203,17 +211,6 @@ def configure_library(library_dir, env_options=None, options=[]): return None -def distutils_dir_name(dname): - """Returns the name of a distutils build directory - see: http://stackoverflow.com/questions/14320220/ - testing-python-c-libraries-get-build-path - """ - f = "{dirname}.{platform}-{version[0]}.{version[1]}" - return f.format(dirname=dname, - platform=sysconfig.get_platform(), - version=sys.version_info) - - def get_pysam_version(): sys.path.insert(0, "pysam") import version @@ -260,7 +257,8 @@ class extra_build(build): def run(self): build.run(self) try: - self.check_ext_symbol_conflicts() + if HTSLIB_MODE != 'separate': + self.check_ext_symbol_conflicts() except OSError as e: log.warn("skipping symbol collision check (invoking nm failed: %s)", e) except subprocess.CalledProcessError: @@ -292,6 +290,14 @@ class clean_ext(Command): for header in headers: os.remove(header) + objects = (glob.glob(os.path.join("htslib", "*.[oa]")) + + glob.glob(os.path.join("htslib", "cram", "*.o")) + + glob.glob(os.path.join("htslib", "htscodecs", "htscodecs", "*.o"))) + if objects: + log.info("removing 'htslib/**/*.o' and libhts.a (%s objects)", len(objects)) + for obj in objects: + os.remove(obj) + # How to link against HTSLIB # shared: build shared chtslib from builtin htslib code. 
@@ -382,17 +388,11 @@ if HTSLIB_MODE in ['shared', 'separate']: external_htslib_libraries.extend( [re.sub("^-l", "", x) for x in htslib_make_options["LIBS"].split(" ") if x.strip()]) - shared_htslib_sources = [re.sub("\.o", ".c", os.path.join("htslib", x)) - for x in - htslib_make_options["LIBHTS_OBJS"].split(" ")] - - htslib_sources = [] - if HTSLIB_LIBRARY_DIR: - # linking against a shared, externally installed htslib version, no - # sources required for htslib - htslib_sources = [] - shared_htslib_sources = [] + # linking against a shared, externally installed htslib version, + # no sources or built libhts.a required for htslib + htslib_objects = [] + separate_htslib_objects = [] chtslib_sources = [] htslib_library_dirs = [HTSLIB_LIBRARY_DIR] htslib_include_dirs = [HTSLIB_INCLUDE_DIR] @@ -400,19 +400,22 @@ if HTSLIB_LIBRARY_DIR: elif HTSLIB_MODE == 'separate': # add to each pysam component a separately compiled # htslib - htslib_sources = shared_htslib_sources - shared_htslib_sources = htslib_sources + htslib_objects = ['htslib/libhts.a'] + separate_htslib_objects = ['htslib/libhts.a'] htslib_library_dirs = [] htslib_include_dirs = ['htslib'] elif HTSLIB_MODE == 'shared': # link each pysam component against the same # htslib built from sources included in the pysam # package. - htslib_library_dirs = [ - "pysam", # when using setup.py develop? - ".", # when using setup.py develop? - os.path.join("build", distutils_dir_name("lib"), "pysam")] + # Link with the object files rather than the final htslib/libhts.a, to ensure that + # all object files are pulled into the link, even those not used by htslib itself. + htslib_objects = [os.path.join("htslib", x) + for x in htslib_make_options["LIBHTS_OBJS"].split(" ")] + separate_htslib_objects = [] + + htslib_library_dirs = ["."] # when using setup.py develop? 
htslib_include_dirs = ['htslib'] else: raise ValueError("unknown HTSLIB value '%s'" % HTSLIB_MODE) @@ -501,53 +504,79 @@ libraries_for_pysam_module = external_htslib_libraries + internal_htslib_librari def prebuild_libchtslib(ext, force): if HTSLIB_MODE not in ['shared', 'separate']: return + write_configvars_header("htslib/config_vars.h", ext, "HTS") + if force or not os.path.exists("htslib/libhts.a"): + log.info("building 'libhts.a'") + with changedir("htslib"): + # TODO Eventually by running configure here, we can set these + # extra flags for configure instead of hacking on ALL_CPPFLAGS. + args = " ".join(ext.extra_compile_args) + run_make(["ALL_CPPFLAGS=-I. " + args + " $(CPPFLAGS)", "lib-static"]) + else: + log.warn("skipping 'libhts.a' (already built)") + + def prebuild_libcsamtools(ext, force): write_configvars_header("samtools/samtools_config_vars.h", ext, "SAMTOOLS") + modules = [ dict(name="pysam.libchtslib", prebuild_func=prebuild_libchtslib, - sources=[source_pattern % "htslib", "pysam/htslib_util.c"] + shared_htslib_sources + os_c_files, + sources=[source_pattern % "htslib", "pysam/htslib_util.c"] + os_c_files, + extra_objects=htslib_objects, libraries=external_htslib_libraries), dict(name="pysam.libcsamtools", prebuild_func=prebuild_libcsamtools, sources=[source_pattern % "samtools"] + glob.glob(os.path.join("samtools", "*.pysam.c")) + - [os.path.join("samtools", "lz4", "lz4.c")] + htslib_sources + os_c_files, + [os.path.join("samtools", "lz4", "lz4.c")] + os_c_files, + extra_objects=separate_htslib_objects, libraries=external_htslib_libraries + internal_htslib_libraries), dict(name="pysam.libcbcftools", - sources=[source_pattern % "bcftools"] + glob.glob(os.path.join("bcftools", "*.pysam.c")) + htslib_sources + os_c_files, + sources=[source_pattern % "bcftools"] + glob.glob(os.path.join("bcftools", "*.pysam.c")) + os_c_files, + extra_objects=separate_htslib_objects, libraries=external_htslib_libraries + internal_htslib_libraries), 
dict(name="pysam.libcutils", - sources=[source_pattern % "utils", "pysam/pysam_util.c"] + htslib_sources + os_c_files, + sources=[source_pattern % "utils", "pysam/pysam_util.c"] + os_c_files, + extra_objects=separate_htslib_objects, libraries=external_htslib_libraries + internal_htslib_libraries + internal_samtools_libraries), dict(name="pysam.libcalignmentfile", - sources=[source_pattern % "alignmentfile"] + htslib_sources + os_c_files, + sources=[source_pattern % "alignmentfile"] + os_c_files, + extra_objects=separate_htslib_objects, libraries=libraries_for_pysam_module), dict(name="pysam.libcsamfile", - sources=[source_pattern % "samfile"] + htslib_sources + os_c_files, + sources=[source_pattern % "samfile"] + os_c_files, + extra_objects=separate_htslib_objects, libraries=libraries_for_pysam_module), dict(name="pysam.libcalignedsegment", - sources=[source_pattern % "alignedsegment"] + htslib_sources + os_c_files, + sources=[source_pattern % "alignedsegment"] + os_c_files, + extra_objects=separate_htslib_objects, libraries=libraries_for_pysam_module), dict(name="pysam.libctabix", - sources=[source_pattern % "tabix"] + htslib_sources + os_c_files, + sources=[source_pattern % "tabix"] + os_c_files, + extra_objects=separate_htslib_objects, libraries=libraries_for_pysam_module), dict(name="pysam.libcfaidx", - sources=[source_pattern % "faidx"] + htslib_sources + os_c_files, + sources=[source_pattern % "faidx"] + os_c_files, + extra_objects=separate_htslib_objects, libraries=libraries_for_pysam_module), dict(name="pysam.libcbcf", - sources=[source_pattern % "bcf"] + htslib_sources + os_c_files, + sources=[source_pattern % "bcf"] + os_c_files, + extra_objects=separate_htslib_objects, libraries=libraries_for_pysam_module), dict(name="pysam.libcbgzf", - sources=[source_pattern % "bgzf"] + htslib_sources + os_c_files, + sources=[source_pattern % "bgzf"] + os_c_files, + extra_objects=separate_htslib_objects, libraries=libraries_for_pysam_module), 
dict(name="pysam.libctabixproxies", - sources=[source_pattern % "tabixproxies"] + htslib_sources + os_c_files, + sources=[source_pattern % "tabixproxies"] + os_c_files, + extra_objects=separate_htslib_objects, libraries=libraries_for_pysam_module), dict(name="pysam.libcvcf", - sources=[source_pattern % "vcf"] + htslib_sources + os_c_files, + sources=[source_pattern % "vcf"] + os_c_files, + extra_objects=separate_htslib_objects, libraries=libraries_for_pysam_module), ] @@ -557,8 +586,8 @@ common_options = dict( define_macros=define_macros, # for out-of-tree compilation, use absolute paths library_dirs=[os.path.abspath(x) for x in ["pysam"] + htslib_library_dirs], - include_dirs=[os.path.abspath(x) for x in htslib_include_dirs + \ - ["samtools", "samtools/lz4", "bcftools", "pysam", "."] + include_os]) + include_dirs=[os.path.abspath(x) for x in ["pysam"] + htslib_include_dirs + \ + ["samtools", "samtools/lz4", "bcftools", "."] + include_os]) # add common options (in python >3.5, could use n = {**a, **b} for module in modules: diff --git a/tests/AlignmentFile_test.py b/tests/AlignmentFile_test.py index 3a6cafc..61531f4 100644 --- a/tests/AlignmentFile_test.py +++ b/tests/AlignmentFile_test.py @@ -6,6 +6,7 @@ and data files located there. 
''' import unittest +import pytest import os import shutil import sys @@ -1456,6 +1457,7 @@ class TestTruncatedBAM(unittest.TestCase): pysam.AlignmentFile, os.path.join(BAM_DATADIR, 'ex2_truncated.bam')) + @pytest.mark.filterwarnings('ignore:no BGZF EOF marker') def testTruncatedBamIterator(self): s = pysam.AlignmentFile(os.path.join(BAM_DATADIR, 'ex2_truncated.bam'), ignore_truncation=True) diff --git a/tests/VariantRecord_test.py b/tests/VariantRecord_test.py index b045b98..c44d9a6 100644 --- a/tests/VariantRecord_test.py +++ b/tests/VariantRecord_test.py @@ -66,3 +66,30 @@ def test_unicode_annotation_can_be_added(vcf_header): assert str(record)[:-1].split("\t")[-2:] == [ "anno1", "Friedrich-Alexander-Universit\u00E4t_Erlangen-N\u00FCrnberg"] + +def test_set_sample_alleles(vcf_header): + vcf_header.formats.add('GT',1,'String',"Genotype") # id, number, type, description + record = vcf_header.new_record( + contig="1", + start=20, + stop=21, + alleles=('A','T') + ) + + record.samples['sample1'].alleles = ('T', 'A') + assert record.samples['sample1'].alleles == ('T','A') + + # Empty record: + record.samples['sample1'].alleles = (None, ) + assert record.samples['sample1'].alleles == tuple() + record.samples['sample1'].alleles = None + assert record.samples['sample1'].alleles == tuple() + record.samples['sample1'].alleles = tuple() + assert record.samples['sample1'].alleles == tuple() + + # check error conditions: + with pytest.raises(ValueError, match='One or more of the supplied sample alleles are not defined'): + record.samples['sample1'].alleles = ('C', 'A') + + with pytest.raises(ValueError, match='Use .allele_indices to set integer allele indices'): + record.samples['sample1'].alleles = (1, 0) diff --git a/tests/pysam_data/Makefile b/tests/pysam_data/Makefile index c6ad884..d870440 100644 --- a/tests/pysam_data/Makefile +++ b/tests/pysam_data/Makefile @@ -70,7 +70,7 @@ ex1.pileup.gz:ex1.bam ex1.fa samtools mpileup -f ex1.fa ex1.bam | gzip > ex1.pileup.gz 
ex2_truncated.bam: ex2.bam - head -c 124000 ex2.bam > ex2_truncated.bam + dd if=ex2.bam of=ex2_truncated.bam bs=$$((`wc -c < ex2.bam`-512)) count=1 # Append a corrupt read with block_size < sizeof(bam_core_t fields) ex2_corrupt.bam: ex2.bam diff --git a/tests/tabix_test.py b/tests/tabix_test.py index 3f1f716..12ab4d2 100644 --- a/tests/tabix_test.py +++ b/tests/tabix_test.py @@ -1024,7 +1024,7 @@ class TestRemoteFileHTTP(unittest.TestCase): local = os.path.join(TABIX_DATADIR, "example.gtf.gz") def setUp(self): - if not pysam.config.HAVE_LIBCURL or not check_url(self.url): + if not getattr(pysam.config, "HAVE_LIBCURL", 0) or not check_url(self.url): self.remote_file = None else: self.remote_file = pysam.TabixFile(self.url, "r") @@ -1063,7 +1063,7 @@ class TestRemoteFileHTTPWithHeader(TestRemoteFileHTTP): local = os.path.join(TABIX_DATADIR, "example_comments.gtf.gz") def setUp(self): - if not pysam.config.HAVE_LIBCURL or not check_url(self.url): + if not getattr(pysam.config, "HAVE_LIBCURL", 0) or not check_url(self.url): self.remote_file = None else: self.remote_file = pysam.TabixFile(self.url, "r")